diff options
author | Dave Airlie <airlied@redhat.com> | 2020-07-02 15:17:31 +1000 |
---|---|---|
committer | Dave Airlie <airlied@redhat.com> | 2020-07-02 15:17:31 +1000 |
commit | 9555152beb1143c85c03f9b9de59863cbbe89f4b (patch) | |
tree | 3d43b98bf373e72fe84562adafe3bcbb45d21054 /drivers/gpu/drm/amd/amdkfd | |
parent | f75020fcb97a54c0d2ade1f4918db82f44d225ad (diff) | |
parent | 7808363154d622f9446bf4db97ff0f041dafa30b (diff) |
Merge tag 'amd-drm-next-5.9-2020-07-01' of git://people.freedesktop.org/~agd5f/linux into drm-next
amd-drm-next-5.9-2020-07-01:
amdgpu:
- DC DMUB updates
- HDCP fixes
- Thermal interrupt fixes
- Add initial support for Sienna Cichlid GPU
- Add support for unique id on Arcturus
- Major swSMU code cleanup
- Skip BAR resizing if the bios already did it
- Fixes for DCN bandwidth calculations
- Runtime PM reference count fixes
- Add initial UVD support for SI
- Add support for ASSR on eDP links
- Lots of misc fixes and cleanups
- Enable runtime PM on vega10 boards that support BACO
- RAS fixes
- SR-IOV fixes
- Use IP discovery table on renoir
- DC stream synchronization fixes
amdkfd:
- Track SDMA usage per process
- Fix GCC10 compiler warnings
- Locking fix
radeon:
- Default to on chip GART for AGP boards on all arches
- Runtime PM reference count fixes
UAPI:
- Update comments to clarify MTYPE
From: Alex Deucher <alexdeucher@gmail.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200701155041.1102829-1-alexander.deucher@amd.com
Signed-off-by: Dave Airlie <airlied@redhat.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd')
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h | 877 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm | 301 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 1 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_device.c | 28 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 63 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h | 1 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c | 1 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 1 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 21 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_process.c | 246 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 295 |
11 files changed, 1352 insertions, 483 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h index d3400da6ab64..577d901fdb63 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h @@ -679,169 +679,175 @@ static const uint32_t cwsr_trap_gfx9_hex[] = { 0xbf810000, 0x00000000, }; -static const uint32_t cwsr_trap_gfx10_hex[] = { - 0xbf820001, 0xbf8201c1, +static const uint32_t cwsr_trap_nv1x_hex[] = { + 0xbf820001, 0xbf8201cd, 0xb0804004, 0xb978f802, - 0x8a788678, 0xb971f803, - 0x876eff71, 0x00000400, - 0xbf850033, 0x876eff71, - 0x00000100, 0xbf840002, - 0x8878ff78, 0x00002000, - 0x8a77ff77, 0xff000000, - 0xb96ef807, 0x876fff6e, - 0x02000000, 0x8f6f866f, - 0x88776f77, 0x876fff6e, - 0x003f8000, 0x8f6f896f, - 0x88776f77, 0x8a6eff6e, - 0x023f8000, 0xb9eef807, - 0xb97af812, 0xb97bf813, - 0x8ffa887a, 0xf4051bbd, - 0xfa000000, 0xbf8cc07f, - 0xf4051ebd, 0xfa000008, - 0xbf8cc07f, 0x87ee6e6e, - 0xbf840001, 0xbe80206e, - 0xb971f803, 0x8771ff71, - 0x000001ff, 0xbf850002, - 0x806c846c, 0x826d806d, - 0x876dff6d, 0x0000ffff, - 0x906e8977, 0x876fff6e, - 0x003f8000, 0x906e8677, - 0x876eff6e, 0x02000000, - 0x886e6f6e, 0xb9eef807, - 0x87fe7e7e, 0x87ea6a6a, - 0xb9f8f802, 0xbe80226c, - 0xb971f803, 0x8771ff71, - 0x00000100, 0xbf840006, - 0xbef60380, 0xb9f60203, - 0x876dff6d, 0x0000ffff, - 0x80ec886c, 0x82ed806d, - 0xbef60380, 0xb9f60283, - 0xb972f816, 0xb9762c07, - 0x8f769a76, 0x886d766d, - 0xb97603c7, 0x8f769976, - 0x886d766d, 0xb9760647, - 0x8f769876, 0x886d766d, - 0xb976f807, 0x8776ff76, - 0x00007fff, 0xb9f6f807, + 0x8a788678, 0xb96ef801, + 0x876eff6e, 0x00000800, + 0xbf840003, 0x876eff78, + 0x00002000, 0xbf840009, + 0xb97bf803, 0x876eff7b, + 0x00000400, 0xbf850033, + 0x876eff7b, 0x00000100, + 0xbf840002, 0x8878ff78, + 0x00002000, 0x8a77ff77, + 0xff000000, 0xb96ef807, + 0x876fff6e, 0x02000000, + 0x8f6f866f, 0x88776f77, + 0x876fff6e, 0x003f8000, + 0x8f6f896f, 0x88776f77, + 0x8a6eff6e, 0x023f8000, + 0xb9eef807, 
0xb97af812, + 0xb97bf813, 0x8ffa887a, + 0xf4051bbd, 0xfa000000, + 0xbf8cc07f, 0xf4051ebd, + 0xfa000008, 0xbf8cc07f, + 0x87ee6e6e, 0xbf840001, + 0xbe80206e, 0xb97bf803, + 0x877bff7b, 0x000001ff, + 0xbf850002, 0x806c846c, + 0x826d806d, 0x876dff6d, + 0x0000ffff, 0x906e8977, + 0x876fff6e, 0x003f8000, + 0x906e8677, 0x876eff6e, + 0x02000000, 0x886e6f6e, + 0xb9eef807, 0x87fe7e7e, + 0x87ea6a6a, 0xb9f8f802, + 0xbe80226c, 0x876dff6d, + 0x0000ffff, 0xbefa0380, + 0xb9fa0283, 0xb97a2c07, + 0x8f7a9a7a, 0x886d7a6d, + 0xb97a03c7, 0x8f7a997a, + 0x886d7a6d, 0xb97a0647, + 0x8f7a987a, 0x886d7a6d, + 0xb97af807, 0x877aff7a, + 0x00007fff, 0xb9faf807, 0xbeee037e, 0xbeef037f, 0xbefe0480, 0xbf900004, 0xbf8e0002, 0xbf88fffe, + 0xb97b02dc, 0x8f7b997b, + 0x887b7b7f, 0xb97a2a05, + 0x807a817a, 0xbf0d997b, + 0xbf850002, 0x8f7a897a, + 0xbf820001, 0x8f7a8a7a, + 0x877bff7f, 0x0000ffff, + 0x807aff7a, 0x00000200, + 0x807a7e7a, 0x827b807b, + 0xf4491c3d, 0xfa000050, + 0xf4491d3d, 0xfa000060, + 0xf4411e7d, 0xfa000074, 0xbef4037e, 0x8775ff7f, 0x0000ffff, 0x8875ff75, 0x00040000, 0xbef60380, 0xbef703ff, 0x10807fac, - 0x8776ff7f, 0x08000000, - 0x90768376, 0x88777677, - 0x8776ff7f, 0x70000000, - 0x90768176, 0x88777677, - 0xbefb037c, 0xbefa0380, + 0x877aff7f, 0x08000000, + 0x907a837a, 0x88777a77, + 0x877aff7f, 0x70000000, + 0x907a817a, 0x88777a77, + 0xbef1037c, 0xbef00380, 0xb97302dc, 0x8f739973, - 0x8873737f, 0xb97a2a05, - 0x807a817a, 0x907c9973, - 0x877c817c, 0xbf06817c, - 0xbf850002, 0x8f7a897a, - 0xbf820001, 0x8f7a8a7a, - 0xb9761e06, 0x8f768a76, - 0x807a767a, 0x807aff7a, - 0x00000200, 0xbef603ff, - 0x01000000, 0xbefe037c, - 0xbefc037a, 0xf4611efa, - 0xf8000000, 0x807a847a, - 0xbefc037e, 0xbefe037c, - 0xbefc037a, 0xf4611b3a, - 0xf8000000, 0x807a847a, - 0xbefc037e, 0xbefe037c, - 0xbefc037a, 0xf4611b7a, - 0xf8000000, 0x807a847a, - 0xbefc037e, 0xbefe037c, - 0xbefc037a, 0xf4611bba, - 0xf8000000, 0x807a847a, - 0xbefc037e, 0xbefe037c, - 0xbefc037a, 0xf4611bfa, - 0xf8000000, 0x807a847a, - 0xbefc037e, 0xbefe037c, - 
0xbefc037a, 0xf4611e3a, - 0xf8000000, 0x807a847a, - 0xbefc037e, 0xb971f803, - 0xbefe037c, 0xbefc037a, - 0xf4611c7a, 0xf8000000, - 0x807a847a, 0xbefc037e, - 0xbefe037c, 0xbefc037a, - 0xf4611cba, 0xf8000000, - 0x807a847a, 0xbefc037e, - 0xb97bf801, 0xbefe037c, - 0xbefc037a, 0xf4611efa, - 0xf8000000, 0x807a847a, - 0xbefc037e, 0xb97bf814, - 0xbefe037c, 0xbefc037a, - 0xf4611efa, 0xf8000000, - 0x807a847a, 0xbefc037e, - 0xb97bf815, 0xbefe037c, - 0xbefc037a, 0xf4611efa, - 0xf8000000, 0x807a847a, - 0xbefc037e, 0x8776ff7f, - 0x04000000, 0xbeef0380, - 0x886f6f76, 0xb97a2a05, - 0x807a817a, 0x907c9973, - 0x877c817c, 0xbf06817c, - 0xbf850002, 0x8f7a897a, - 0xbf820001, 0x8f7a8a7a, - 0xb9761e06, 0x8f768a76, - 0x807a767a, 0xbef603ff, - 0x01000000, 0xbef20374, - 0x80747a74, 0x82758075, - 0xbefc0380, 0xbf800000, - 0xbe802f00, 0xbe822f02, - 0xbe842f04, 0xbe862f06, - 0xbe882f08, 0xbe8a2f0a, - 0xbe8c2f0c, 0xbe8e2f0e, - 0xf469003a, 0xfa000000, - 0xf469013a, 0xfa000010, - 0xf469023a, 0xfa000020, - 0xf469033a, 0xfa000030, - 0x8074c074, 0x82758075, - 0x807c907c, 0xbf0aff7c, - 0x00000060, 0xbf85ffea, - 0xbe802f00, 0xbe822f02, - 0xbe842f04, 0xbe862f06, - 0xbe882f08, 0xbe8a2f0a, - 0xf469003a, 0xfa000000, - 0xf469013a, 0xfa000010, - 0xf469023a, 0xfa000020, - 0x8074b074, 0x82758075, - 0xbef40372, 0xbefa0380, + 0x8873737f, 0xb97bf816, + 0xba80f816, 0x00000000, 0xbefe03c1, 0x907c9973, 0x877c817c, 0xbf06817c, 0xbf850002, 0xbeff0380, 0xbf820002, 0xbeff03c1, 0xbf82000b, 0xbef603ff, 0x01000000, 0xe0704000, - 0x7a5d0000, 0xe0704080, - 0x7a5d0100, 0xe0704100, - 0x7a5d0200, 0xe0704180, - 0x7a5d0300, 0xbf82000a, + 0x705d0000, 0xe0704080, + 0x705d0100, 0xe0704100, + 0x705d0200, 0xe0704180, + 0x705d0300, 0xbf82000a, 0xbef603ff, 0x01000000, - 0xe0704000, 0x7a5d0000, - 0xe0704100, 0x7a5d0100, - 0xe0704200, 0x7a5d0200, - 0xe0704300, 0x7a5d0300, + 0xe0704000, 0x705d0000, + 0xe0704100, 0x705d0100, + 0xe0704200, 0x705d0200, + 0xe0704300, 0x705d0300, + 0xb9702a05, 0x80708170, + 0xbf0d9973, 0xbf850002, + 0x8f708970, 
0xbf820001, + 0x8f708a70, 0xb97a1e06, + 0x8f7a8a7a, 0x80707a70, + 0x8070ff70, 0x00000200, + 0xbef603ff, 0x01000000, + 0xbefe037c, 0xbefc0370, + 0xf4611c7a, 0xf8000000, + 0x80708470, 0xbefc037e, + 0xbefe037c, 0xbefc0370, + 0xf4611b3a, 0xf8000000, + 0x80708470, 0xbefc037e, + 0xbefe037c, 0xbefc0370, + 0xf4611b7a, 0xf8000000, + 0x80708470, 0xbefc037e, + 0xbefe037c, 0xbefc0370, + 0xf4611bba, 0xf8000000, + 0x80708470, 0xbefc037e, + 0xbefe037c, 0xbefc0370, + 0xf4611bfa, 0xf8000000, + 0x80708470, 0xbefc037e, + 0xbefe037c, 0xbefc0370, + 0xf4611e3a, 0xf8000000, + 0x80708470, 0xbefc037e, + 0xb97af803, 0xbefe037c, + 0xbefc0370, 0xf4611eba, + 0xf8000000, 0x80708470, + 0xbefc037e, 0xbefe037c, + 0xbefc0370, 0xf4611efa, + 0xf8000000, 0x80708470, + 0xbefc037e, 0xb971f801, + 0xbefe037c, 0xbefc0370, + 0xf4611c7a, 0xf8000000, + 0x80708470, 0xbefc037e, + 0xb971f814, 0xbefe037c, + 0xbefc0370, 0xf4611c7a, + 0xf8000000, 0x80708470, + 0xbefc037e, 0xb971f815, + 0xbefe037c, 0xbefc0370, + 0xf4611c7a, 0xf8000000, + 0x80708470, 0xbefc037e, + 0xb9702a05, 0x80708170, + 0xbf0d9973, 0xbf850002, + 0x8f708970, 0xbf820001, + 0x8f708a70, 0xb97a1e06, + 0x8f7a8a7a, 0x80707a70, + 0xbef603ff, 0x01000000, + 0xbefb0374, 0x80747074, + 0x82758075, 0xbefc0380, + 0xbf800000, 0xbe802f00, + 0xbe822f02, 0xbe842f04, + 0xbe862f06, 0xbe882f08, + 0xbe8a2f0a, 0xbe8c2f0c, + 0xbe8e2f0e, 0xf469003a, + 0xfa000000, 0xf469013a, + 0xfa000010, 0xf469023a, + 0xfa000020, 0xf469033a, + 0xfa000030, 0x8074c074, + 0x82758075, 0x807c907c, + 0xbf0aff7c, 0x00000060, + 0xbf85ffea, 0xbe802f00, + 0xbe822f02, 0xbe842f04, + 0xbe862f06, 0xbe882f08, + 0xbe8a2f0a, 0xf469003a, + 0xfa000000, 0xf469013a, + 0xfa000010, 0xf469023a, + 0xfa000020, 0x8074b074, + 0x82758075, 0xbef4037b, 0xbefe03c1, 0x907c9973, 0x877c817c, 0xbf06817c, 0xbf850002, 0xbeff0380, 0xbf820001, 0xbeff03c1, - 0xb9714306, 0x8771c171, - 0xbf840046, 0xbf8a0000, - 0x8776ff6f, 0x04000000, - 0xbf840042, 0x8f718671, - 0x8f718271, 0xbef60371, - 0xb97a2a05, 0x807a817a, - 0x907c9973, 
0x877c817c, - 0xbf06817c, 0xbf850002, - 0x8f7a897a, 0xbf820001, - 0x8f7a8a7a, 0xb9761e06, - 0x8f768a76, 0x807a767a, - 0x807aff7a, 0x00000200, - 0x807aff7a, 0x00000080, + 0xb97b4306, 0x877bc17b, + 0xbf840044, 0xbf8a0000, + 0x877aff73, 0x04000000, + 0xbf840040, 0x8f7b867b, + 0x8f7b827b, 0xbef6037b, + 0xb9702a05, 0x80708170, + 0xbf0d9973, 0xbf850002, + 0x8f708970, 0xbf820001, + 0x8f708a70, 0xb97a1e06, + 0x8f7a8a7a, 0x80707a70, + 0x8070ff70, 0x00000200, + 0x8070ff70, 0x00000080, 0xbef603ff, 0x01000000, 0xd7650000, 0x000100c1, 0xd7660000, 0x000200c1, @@ -852,87 +858,86 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xbf800000, 0xbf800000, 0xbf800000, 0xd8d80000, 0x01000000, 0xbf8c0000, - 0xe0704000, 0x7a5d0100, - 0x807c037c, 0x807a037a, + 0xe0704000, 0x705d0100, + 0x807c037c, 0x80700370, 0xd5250000, 0x0001ff00, - 0x00000080, 0xbf0a717c, + 0x00000080, 0xbf0a7b7c, 0xbf85fff4, 0xbf820011, 0xbe8303ff, 0x00000100, 0xbf800000, 0xbf800000, 0xbf800000, 0xd8d80000, 0x01000000, 0xbf8c0000, - 0xe0704000, 0x7a5d0100, - 0x807c037c, 0x807a037a, + 0xe0704000, 0x705d0100, + 0x807c037c, 0x80700370, 0xd5250000, 0x0001ff00, - 0x00000100, 0xbf0a717c, + 0x00000100, 0xbf0a7b7c, 0xbf85fff4, 0xbefe03c1, 0x907c9973, 0x877c817c, 0xbf06817c, 0xbf850004, - 0xbefa03ff, 0x00000200, + 0xbef003ff, 0x00000200, 0xbeff0380, 0xbf820003, - 0xbefa03ff, 0x00000400, - 0xbeff03c1, 0xb9712a05, - 0x80718171, 0x8f718271, + 0xbef003ff, 0x00000400, + 0xbeff03c1, 0xb97b2a05, + 0x807b817b, 0x8f7b827b, 0x907c9973, 0x877c817c, 0xbf06817c, 0xbf850017, 0xbef603ff, 0x01000000, - 0xbefc0384, 0xbf0a717c, + 0xbefc0384, 0xbf0a7b7c, 0xbf840037, 0x7e008700, 0x7e028701, 0x7e048702, 0x7e068703, 0xe0704000, - 0x7a5d0000, 0xe0704080, - 0x7a5d0100, 0xe0704100, - 0x7a5d0200, 0xe0704180, - 0x7a5d0300, 0x807c847c, - 0x807aff7a, 0x00000200, - 0xbf0a717c, 0xbf85ffef, + 0x705d0000, 0xe0704080, + 0x705d0100, 0xe0704100, + 0x705d0200, 0xe0704180, + 0x705d0300, 0x807c847c, + 0x8070ff70, 0x00000200, + 0xbf0a7b7c, 0xbf85ffef, 0xbf820025, 
0xbef603ff, 0x01000000, 0xbefc0384, - 0xbf0a717c, 0xbf840020, + 0xbf0a7b7c, 0xbf840020, 0x7e008700, 0x7e028701, 0x7e048702, 0x7e068703, - 0xe0704000, 0x7a5d0000, - 0xe0704100, 0x7a5d0100, - 0xe0704200, 0x7a5d0200, - 0xe0704300, 0x7a5d0300, - 0x807c847c, 0x807aff7a, - 0x00000400, 0xbf0a717c, - 0xbf85ffef, 0xb9711e06, - 0x8771c171, 0xbf84000c, - 0x8f718371, 0x80717c71, + 0xe0704000, 0x705d0000, + 0xe0704100, 0x705d0100, + 0xe0704200, 0x705d0200, + 0xe0704300, 0x705d0300, + 0x807c847c, 0x8070ff70, + 0x00000400, 0xbf0a7b7c, + 0xbf85ffef, 0xb97b1e06, + 0x877bc17b, 0xbf84000c, + 0x8f7b837b, 0x807b7c7b, 0xbefe03c1, 0xbeff0380, 0x7e008700, 0xe0704000, - 0x7a5d0000, 0x807c817c, - 0x807aff7a, 0x00000080, - 0xbf0a717c, 0xbf85fff8, - 0xbf820142, 0xbef4037e, + 0x705d0000, 0x807c817c, + 0x8070ff70, 0x00000080, + 0xbf0a7b7c, 0xbf85fff8, + 0xbf82014f, 0xbef4037e, 0x8775ff7f, 0x0000ffff, 0x8875ff75, 0x00040000, 0xbef60380, 0xbef703ff, - 0x10807fac, 0x8772ff7f, - 0x08000000, 0x90728372, - 0x88777277, 0x8772ff7f, - 0x70000000, 0x90728172, - 0x88777277, 0xb97302dc, - 0x8f739973, 0x8873737f, - 0x8772ff7f, 0x04000000, - 0xbf840036, 0xbefe03c1, - 0x907c9973, 0x877c817c, + 0x10807fac, 0x876eff7f, + 0x08000000, 0x906e836e, + 0x88776e77, 0x876eff7f, + 0x70000000, 0x906e816e, + 0x88776e77, 0xb97202dc, + 0x8f729972, 0x8872727f, + 0x876eff7f, 0x04000000, + 0xbf840034, 0xbefe03c1, + 0x907c9972, 0x877c817c, 0xbf06817c, 0xbf850002, 0xbeff0380, 0xbf820001, 0xbeff03c1, 0xb96f4306, - 0x876fc16f, 0xbf84002b, + 0x876fc16f, 0xbf840029, 0x8f6f866f, 0x8f6f826f, 0xbef6036f, 0xb9782a05, - 0x80788178, 0x907c9973, - 0x877c817c, 0xbf06817c, + 0x80788178, 0xbf0d9972, 0xbf850002, 0x8f788978, 0xbf820001, 0x8f788a78, - 0xb9721e06, 0x8f728a72, - 0x80787278, 0x8078ff78, + 0xb96e1e06, 0x8f6e8a6e, + 0x80786e78, 0x8078ff78, 0x00000200, 0x8078ff78, 0x00000080, 0xbef603ff, - 0x01000000, 0x907c9973, + 0x01000000, 0x907c9972, 0x877c817c, 0xbf06817c, 0xbefc0380, 0xbf850009, 0xe0310000, 0x781d0000, @@ -944,15 +949,15 @@ 
static const uint32_t cwsr_trap_gfx10_hex[] = { 0x00000100, 0x8078ff78, 0x00000100, 0xbf0a6f7c, 0xbf85fff8, 0xbef80380, - 0xbefe03c1, 0x907c9973, + 0xbefe03c1, 0x907c9972, 0x877c817c, 0xbf06817c, 0xbf850002, 0xbeff0380, 0xbf820001, 0xbeff03c1, 0xb96f2a05, 0x806f816f, - 0x8f6f826f, 0x907c9973, + 0x8f6f826f, 0x907c9972, 0x877c817c, 0xbf06817c, 0xbf850021, 0xbef603ff, - 0x01000000, 0xbef20378, + 0x01000000, 0xbeee0378, 0x8078ff78, 0x00000200, 0xbefc0384, 0xe0304000, 0x785d0000, 0xe0304080, @@ -964,12 +969,12 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0x807c847c, 0x8078ff78, 0x00000200, 0xbf0a6f7c, 0xbf85ffee, 0xe0304000, - 0x725d0000, 0xe0304080, - 0x725d0100, 0xe0304100, - 0x725d0200, 0xe0304180, - 0x725d0300, 0xbf820032, + 0x6e5d0000, 0xe0304080, + 0x6e5d0100, 0xe0304100, + 0x6e5d0200, 0xe0304180, + 0x6e5d0300, 0xbf820032, 0xbef603ff, 0x01000000, - 0xbef20378, 0x8078ff78, + 0xbeee0378, 0x8078ff78, 0x00000400, 0xbefc0384, 0xe0304000, 0x785d0000, 0xe0304100, 0x785d0100, @@ -989,16 +994,15 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0x8078ff78, 0x00000080, 0xbf0a6f7c, 0xbf85fff7, 0xbeff03c1, 0xe0304000, - 0x725d0000, 0xe0304100, - 0x725d0100, 0xe0304200, - 0x725d0200, 0xe0304300, - 0x725d0300, 0xbf8c3f70, + 0x6e5d0000, 0xe0304100, + 0x6e5d0100, 0xe0304200, + 0x6e5d0200, 0xe0304300, + 0x6e5d0300, 0xbf8c3f70, 0xb9782a05, 0x80788178, - 0x907c9973, 0x877c817c, - 0xbf06817c, 0xbf850002, + 0xbf0d9972, 0xbf850002, 0x8f788978, 0xbf820001, - 0x8f788a78, 0xb9721e06, - 0x8f728a72, 0x80787278, + 0x8f788a78, 0xb96e1e06, + 0x8f6e8a6e, 0x80786e78, 0x8078ff78, 0x00000200, 0x80f8ff78, 0x00000050, 0xbef603ff, 0x01000000, @@ -1021,23 +1025,22 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xbe8c310c, 0xbe8e310e, 0xbf06807c, 0xbf84fff0, 0xb9782a05, 0x80788178, - 0x907c9973, 0x877c817c, - 0xbf06817c, 0xbf850002, + 0xbf0d9972, 0xbf850002, 0x8f788978, 0xbf820001, - 0x8f788a78, 0xb9721e06, - 0x8f728a72, 0x80787278, + 0x8f788a78, 0xb96e1e06, + 0x8f6e8a6e, 0x80786e78, 
0x8078ff78, 0x00000200, 0xbef603ff, 0x01000000, 0xf4211bfa, 0xf0000000, 0x80788478, 0xf4211b3a, 0xf0000000, 0x80788478, 0xf4211b7a, 0xf0000000, - 0x80788478, 0xf4211eba, - 0xf0000000, 0x80788478, - 0xf4211efa, 0xf0000000, 0x80788478, 0xf4211c3a, 0xf0000000, 0x80788478, 0xf4211c7a, 0xf0000000, + 0x80788478, 0xf4211eba, + 0xf0000000, 0x80788478, + 0xf4211efa, 0xf0000000, 0x80788478, 0xf4211e7a, 0xf0000000, 0x80788478, 0xf4211cfa, 0xf0000000, @@ -1046,31 +1049,41 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xbf8cc07f, 0xb9eef814, 0xf4211bba, 0xf0000000, 0x80788478, 0xbf8cc07f, - 0xb9eef815, 0xbef2036d, - 0x876dff72, 0x0000ffff, - 0xbefc036f, 0xbefe037a, - 0xbeff037b, 0x876f71ff, - 0x000003ff, 0xb9ef4803, - 0xb9f9f816, 0x876f71ff, - 0xfffff800, 0x906f8b6f, - 0xb9efa2c3, 0xb9f3f801, - 0x876fff72, 0xfc000000, - 0x906f9a6f, 0x8f6f906f, - 0xbef30380, 0x88736f73, - 0x876fff72, 0x02000000, - 0x906f996f, 0x8f6f8f6f, - 0x88736f73, 0x876fff72, - 0x01000000, 0x906f986f, - 0x8f6f996f, 0x88736f73, - 0x876fff70, 0x00800000, - 0x906f976f, 0xb9f3f807, - 0x87fe7e7e, 0x87ea6a6a, - 0xb9f0f802, 0xbf8a0000, - 0xbe80226c, 0xbf810000, + 0xb9eef815, 0xbefc036f, + 0xbefe0370, 0xbeff0371, + 0x876f7bff, 0x000003ff, + 0xb9ef4803, 0xb9f9f816, + 0x876f7bff, 0xfffff800, + 0x906f8b6f, 0xb9efa2c3, + 0xb9f3f801, 0xb96e2a05, + 0x806e816e, 0xbf0d9972, + 0xbf850002, 0x8f6e896e, + 0xbf820001, 0x8f6e8a6e, + 0x806eff6e, 0x00000200, + 0x806e746e, 0x826f8075, + 0x876fff6f, 0x0000ffff, + 0xf4091c37, 0xfa000050, + 0xf4091d37, 0xfa000060, + 0xf4011e77, 0xfa000074, + 0xbf8cc07f, 0x876fff6d, + 0xfc000000, 0x906f9a6f, + 0x8f6f906f, 0xbeee0380, + 0x886e6f6e, 0x876fff6d, + 0x02000000, 0x906f996f, + 0x8f6f8f6f, 0x886e6f6e, + 0x876fff6d, 0x01000000, + 0x906f986f, 0x8f6f996f, + 0x886e6f6e, 0x876fff7a, + 0x00800000, 0x906f976f, + 0xb9eef807, 0x876dff6d, + 0x0000ffff, 0x87fe7e7e, + 0x87ea6a6a, 0xb9faf802, + 0xbf8a0000, 0xbe80226c, + 0xbf810000, 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, - 
0xbf9f0000, 0x00000000, }; + static const uint32_t cwsr_trap_arcturus_hex[] = { 0xbf820001, 0xbf8202c4, 0xb8f8f802, 0x89788678, @@ -1560,3 +1573,399 @@ static const uint32_t cwsr_trap_arcturus_hex[] = { 0xbf8a0000, 0x95806f6c, 0xbf810000, 0x00000000, }; + +static const uint32_t cwsr_trap_gfx10_hex[] = { + 0xbf820001, 0xbf8201cf, + 0xb0804004, 0xb978f802, + 0x8a788678, 0xb96ef801, + 0x876eff6e, 0x00000800, + 0xbf840003, 0x876eff78, + 0x00002000, 0xbf840009, + 0xb97bf803, 0x876eff7b, + 0x00000400, 0xbf85001d, + 0x876eff7b, 0x00000100, + 0xbf840002, 0x8878ff78, + 0x00002000, 0xb97af812, + 0xb97bf813, 0x8ffa887a, + 0xf4051bbd, 0xfa000000, + 0xbf8cc07f, 0xf4051ebd, + 0xfa000008, 0xbf8cc07f, + 0x87ee6e6e, 0xbf840001, + 0xbe80206e, 0xb97bf803, + 0x877bff7b, 0x000001ff, + 0xbf850002, 0x806c846c, + 0x826d806d, 0x876dff6d, + 0x0000ffff, 0x87fe7e7e, + 0x87ea6a6a, 0xb9f8f802, + 0xbe80226c, 0x876dff6d, + 0x0000ffff, 0xbefa0380, + 0xb9fa0283, 0xbeee037e, + 0xbeef037f, 0xbefe0480, + 0xbf900004, 0xbf8cc07f, + 0xb97b02dc, 0x8f7b997b, + 0x887b7b7f, 0xb97a2a05, + 0x807a817a, 0xbf0d997b, + 0xbf850002, 0x8f7a897a, + 0xbf820001, 0x8f7a8a7a, + 0x877bff7f, 0x0000ffff, + 0x807aff7a, 0x00000200, + 0x807a7e7a, 0x827b807b, + 0xbef4037e, 0x8775ff7f, + 0x0000ffff, 0x8875ff75, + 0x00040000, 0xbef60380, + 0xbef703ff, 0x10807fac, + 0x877aff7f, 0x08000000, + 0x907a837a, 0x88777a77, + 0x877aff7f, 0x70000000, + 0x907a817a, 0x88777a77, + 0xbef1037c, 0xbef00380, + 0xb97302dc, 0x8f739973, + 0x8873737f, 0xbefe03c1, + 0x907c9973, 0x877c817c, + 0xbf06817c, 0xbf850002, + 0xbeff0380, 0xbf820002, + 0xbeff03c1, 0xbf82000b, + 0xbef603ff, 0x01000000, + 0xe0704000, 0x705d0000, + 0xe0704080, 0x705d0100, + 0xe0704100, 0x705d0200, + 0xe0704180, 0x705d0300, + 0xbf82000a, 0xbef603ff, + 0x01000000, 0xe0704000, + 0x705d0000, 0xe0704100, + 0x705d0100, 0xe0704200, + 0x705d0200, 0xe0704300, + 0x705d0300, 0xb9702a05, + 0x80708170, 0xbf0d9973, + 0xbf850002, 0x8f708970, + 0xbf820001, 0x8f708a70, + 0xb97a1e06, 0x8f7a8a7a, + 
0x80707a70, 0x8070ff70, + 0x00000200, 0xbef603ff, + 0x01000000, 0x7e000280, + 0x7e020280, 0x7e040280, + 0xbefc0380, 0xd7610002, + 0x0000f871, 0x807c817c, + 0xd7610002, 0x0000f86c, + 0x807c817c, 0xd7610002, + 0x0000f86d, 0x807c817c, + 0xd7610002, 0x0000f86e, + 0x807c817c, 0xd7610002, + 0x0000f86f, 0x807c817c, + 0xd7610002, 0x0000f878, + 0x807c817c, 0xb97af803, + 0xd7610002, 0x0000f87a, + 0x807c817c, 0xd7610002, + 0x0000f87b, 0x807c817c, + 0xb971f801, 0xd7610002, + 0x0000f871, 0x807c817c, + 0xb971f814, 0xd7610002, + 0x0000f871, 0x807c817c, + 0xb971f815, 0xd7610002, + 0x0000f871, 0x807c817c, + 0xbeff0380, 0xe0704000, + 0x705d0200, 0xb9702a05, + 0x80708170, 0xbf0d9973, + 0xbf850002, 0x8f708970, + 0xbf820001, 0x8f708a70, + 0xb97a1e06, 0x8f7a8a7a, + 0x80707a70, 0xbef603ff, + 0x01000000, 0xbef90380, + 0xbefc0380, 0xbf800000, + 0xbe802f00, 0xbe822f02, + 0xbe842f04, 0xbe862f06, + 0xbe882f08, 0xbe8a2f0a, + 0xbe8c2f0c, 0xbe8e2f0e, + 0xd7610002, 0x0000f200, + 0x80798179, 0xd7610002, + 0x0000f201, 0x80798179, + 0xd7610002, 0x0000f202, + 0x80798179, 0xd7610002, + 0x0000f203, 0x80798179, + 0xd7610002, 0x0000f204, + 0x80798179, 0xd7610002, + 0x0000f205, 0x80798179, + 0xd7610002, 0x0000f206, + 0x80798179, 0xd7610002, + 0x0000f207, 0x80798179, + 0xd7610002, 0x0000f208, + 0x80798179, 0xd7610002, + 0x0000f209, 0x80798179, + 0xd7610002, 0x0000f20a, + 0x80798179, 0xd7610002, + 0x0000f20b, 0x80798179, + 0xd7610002, 0x0000f20c, + 0x80798179, 0xd7610002, + 0x0000f20d, 0x80798179, + 0xd7610002, 0x0000f20e, + 0x80798179, 0xd7610002, + 0x0000f20f, 0x80798179, + 0xbf06a079, 0xbf840006, + 0xe0704000, 0x705d0200, + 0x8070ff70, 0x00000080, + 0xbef90380, 0x7e040280, + 0x807c907c, 0xbf0aff7c, + 0x00000060, 0xbf85ffbc, + 0xbe802f00, 0xbe822f02, + 0xbe842f04, 0xbe862f06, + 0xbe882f08, 0xbe8a2f0a, + 0xd7610002, 0x0000f200, + 0x80798179, 0xd7610002, + 0x0000f201, 0x80798179, + 0xd7610002, 0x0000f202, + 0x80798179, 0xd7610002, + 0x0000f203, 0x80798179, + 0xd7610002, 0x0000f204, + 0x80798179, 0xd7610002, 
+ 0x0000f205, 0x80798179, + 0xd7610002, 0x0000f206, + 0x80798179, 0xd7610002, + 0x0000f207, 0x80798179, + 0xd7610002, 0x0000f208, + 0x80798179, 0xd7610002, + 0x0000f209, 0x80798179, + 0xd7610002, 0x0000f20a, + 0x80798179, 0xd7610002, + 0x0000f20b, 0x80798179, + 0xe0704000, 0x705d0200, + 0xbefe03c1, 0x907c9973, + 0x877c817c, 0xbf06817c, + 0xbf850002, 0xbeff0380, + 0xbf820001, 0xbeff03c1, + 0xb97b4306, 0x877bc17b, + 0xbf840044, 0xbf8a0000, + 0x877aff73, 0x04000000, + 0xbf840040, 0x8f7b867b, + 0x8f7b827b, 0xbef6037b, + 0xb9702a05, 0x80708170, + 0xbf0d9973, 0xbf850002, + 0x8f708970, 0xbf820001, + 0x8f708a70, 0xb97a1e06, + 0x8f7a8a7a, 0x80707a70, + 0x8070ff70, 0x00000200, + 0x8070ff70, 0x00000080, + 0xbef603ff, 0x01000000, + 0xd7650000, 0x000100c1, + 0xd7660000, 0x000200c1, + 0x16000084, 0x907c9973, + 0x877c817c, 0xbf06817c, + 0xbefc0380, 0xbf850012, + 0xbe8303ff, 0x00000080, + 0xbf800000, 0xbf800000, + 0xbf800000, 0xd8d80000, + 0x01000000, 0xbf8c0000, + 0xe0704000, 0x705d0100, + 0x807c037c, 0x80700370, + 0xd5250000, 0x0001ff00, + 0x00000080, 0xbf0a7b7c, + 0xbf85fff4, 0xbf820011, + 0xbe8303ff, 0x00000100, + 0xbf800000, 0xbf800000, + 0xbf800000, 0xd8d80000, + 0x01000000, 0xbf8c0000, + 0xe0704000, 0x705d0100, + 0x807c037c, 0x80700370, + 0xd5250000, 0x0001ff00, + 0x00000100, 0xbf0a7b7c, + 0xbf85fff4, 0xbefe03c1, + 0x907c9973, 0x877c817c, + 0xbf06817c, 0xbf850004, + 0xbef003ff, 0x00000200, + 0xbeff0380, 0xbf820003, + 0xbef003ff, 0x00000400, + 0xbeff03c1, 0xb97b2a05, + 0x807b817b, 0x8f7b827b, + 0x907c9973, 0x877c817c, + 0xbf06817c, 0xbf850017, + 0xbef603ff, 0x01000000, + 0xbefc0384, 0xbf0a7b7c, + 0xbf840037, 0x7e008700, + 0x7e028701, 0x7e048702, + 0x7e068703, 0xe0704000, + 0x705d0000, 0xe0704080, + 0x705d0100, 0xe0704100, + 0x705d0200, 0xe0704180, + 0x705d0300, 0x807c847c, + 0x8070ff70, 0x00000200, + 0xbf0a7b7c, 0xbf85ffef, + 0xbf820025, 0xbef603ff, + 0x01000000, 0xbefc0384, + 0xbf0a7b7c, 0xbf840020, + 0x7e008700, 0x7e028701, + 0x7e048702, 0x7e068703, + 0xe0704000, 
0x705d0000, + 0xe0704100, 0x705d0100, + 0xe0704200, 0x705d0200, + 0xe0704300, 0x705d0300, + 0x807c847c, 0x8070ff70, + 0x00000400, 0xbf0a7b7c, + 0xbf85ffef, 0xb97b1e06, + 0x877bc17b, 0xbf84000c, + 0x8f7b837b, 0x807b7c7b, + 0xbefe03c1, 0xbeff0380, + 0x7e008700, 0xe0704000, + 0x705d0000, 0x807c817c, + 0x8070ff70, 0x00000080, + 0xbf0a7b7c, 0xbf85fff8, + 0xbf82013a, 0xbef4037e, + 0x8775ff7f, 0x0000ffff, + 0x8875ff75, 0x00040000, + 0xbef60380, 0xbef703ff, + 0x10807fac, 0x876eff7f, + 0x08000000, 0x906e836e, + 0x88776e77, 0x876eff7f, + 0x70000000, 0x906e816e, + 0x88776e77, 0xb97202dc, + 0x8f729972, 0x8872727f, + 0x876eff7f, 0x04000000, + 0xbf840034, 0xbefe03c1, + 0x907c9972, 0x877c817c, + 0xbf06817c, 0xbf850002, + 0xbeff0380, 0xbf820001, + 0xbeff03c1, 0xb96f4306, + 0x876fc16f, 0xbf840029, + 0x8f6f866f, 0x8f6f826f, + 0xbef6036f, 0xb9782a05, + 0x80788178, 0xbf0d9972, + 0xbf850002, 0x8f788978, + 0xbf820001, 0x8f788a78, + 0xb96e1e06, 0x8f6e8a6e, + 0x80786e78, 0x8078ff78, + 0x00000200, 0x8078ff78, + 0x00000080, 0xbef603ff, + 0x01000000, 0x907c9972, + 0x877c817c, 0xbf06817c, + 0xbefc0380, 0xbf850009, + 0xe0310000, 0x781d0000, + 0x807cff7c, 0x00000080, + 0x8078ff78, 0x00000080, + 0xbf0a6f7c, 0xbf85fff8, + 0xbf820008, 0xe0310000, + 0x781d0000, 0x807cff7c, + 0x00000100, 0x8078ff78, + 0x00000100, 0xbf0a6f7c, + 0xbf85fff8, 0xbef80380, + 0xbefe03c1, 0x907c9972, + 0x877c817c, 0xbf06817c, + 0xbf850002, 0xbeff0380, + 0xbf820001, 0xbeff03c1, + 0xb96f2a05, 0x806f816f, + 0x8f6f826f, 0x907c9972, + 0x877c817c, 0xbf06817c, + 0xbf850021, 0xbef603ff, + 0x01000000, 0xbeee0378, + 0x8078ff78, 0x00000200, + 0xbefc0384, 0xe0304000, + 0x785d0000, 0xe0304080, + 0x785d0100, 0xe0304100, + 0x785d0200, 0xe0304180, + 0x785d0300, 0xbf8c3f70, + 0x7e008500, 0x7e028501, + 0x7e048502, 0x7e068503, + 0x807c847c, 0x8078ff78, + 0x00000200, 0xbf0a6f7c, + 0xbf85ffee, 0xe0304000, + 0x6e5d0000, 0xe0304080, + 0x6e5d0100, 0xe0304100, + 0x6e5d0200, 0xe0304180, + 0x6e5d0300, 0xbf820032, + 0xbef603ff, 0x01000000, + 
0xbeee0378, 0x8078ff78, + 0x00000400, 0xbefc0384, + 0xe0304000, 0x785d0000, + 0xe0304100, 0x785d0100, + 0xe0304200, 0x785d0200, + 0xe0304300, 0x785d0300, + 0xbf8c3f70, 0x7e008500, + 0x7e028501, 0x7e048502, + 0x7e068503, 0x807c847c, + 0x8078ff78, 0x00000400, + 0xbf0a6f7c, 0xbf85ffee, + 0xb96f1e06, 0x876fc16f, + 0xbf84000e, 0x8f6f836f, + 0x806f7c6f, 0xbefe03c1, + 0xbeff0380, 0xe0304000, + 0x785d0000, 0xbf8c3f70, + 0x7e008500, 0x807c817c, + 0x8078ff78, 0x00000080, + 0xbf0a6f7c, 0xbf85fff7, + 0xbeff03c1, 0xe0304000, + 0x6e5d0000, 0xe0304100, + 0x6e5d0100, 0xe0304200, + 0x6e5d0200, 0xe0304300, + 0x6e5d0300, 0xbf8c3f70, + 0xb9782a05, 0x80788178, + 0xbf0d9972, 0xbf850002, + 0x8f788978, 0xbf820001, + 0x8f788a78, 0xb96e1e06, + 0x8f6e8a6e, 0x80786e78, + 0x8078ff78, 0x00000200, + 0x80f8ff78, 0x00000050, + 0xbef603ff, 0x01000000, + 0xbefc03ff, 0x0000006c, + 0x80f89078, 0xf429003a, + 0xf0000000, 0xbf8cc07f, + 0x80fc847c, 0xbf800000, + 0xbe803100, 0xbe823102, + 0x80f8a078, 0xf42d003a, + 0xf0000000, 0xbf8cc07f, + 0x80fc887c, 0xbf800000, + 0xbe803100, 0xbe823102, + 0xbe843104, 0xbe863106, + 0x80f8c078, 0xf431003a, + 0xf0000000, 0xbf8cc07f, + 0x80fc907c, 0xbf800000, + 0xbe803100, 0xbe823102, + 0xbe843104, 0xbe863106, + 0xbe883108, 0xbe8a310a, + 0xbe8c310c, 0xbe8e310e, + 0xbf06807c, 0xbf84fff0, + 0xb9782a05, 0x80788178, + 0xbf0d9972, 0xbf850002, + 0x8f788978, 0xbf820001, + 0x8f788a78, 0xb96e1e06, + 0x8f6e8a6e, 0x80786e78, + 0x8078ff78, 0x00000200, + 0xbef603ff, 0x01000000, + 0xf4211bfa, 0xf0000000, + 0x80788478, 0xf4211b3a, + 0xf0000000, 0x80788478, + 0xf4211b7a, 0xf0000000, + 0x80788478, 0xf4211c3a, + 0xf0000000, 0x80788478, + 0xf4211c7a, 0xf0000000, + 0x80788478, 0xf4211eba, + 0xf0000000, 0x80788478, + 0xf4211efa, 0xf0000000, + 0x80788478, 0xf4211e7a, + 0xf0000000, 0x80788478, + 0xf4211cfa, 0xf0000000, + 0x80788478, 0xf4211bba, + 0xf0000000, 0x80788478, + 0xbf8cc07f, 0xb9eef814, + 0xf4211bba, 0xf0000000, + 0x80788478, 0xbf8cc07f, + 0xb9eef815, 0xbefc036f, + 0xbefe0370, 0xbeff0371, 
+ 0x876f7bff, 0x000003ff, + 0xb9ef4803, 0x876f7bff, + 0xfffff800, 0x906f8b6f, + 0xb9efa2c3, 0xb9f3f801, + 0xb96e2a05, 0x806e816e, + 0xbf0d9972, 0xbf850002, + 0x8f6e896e, 0xbf820001, + 0x8f6e8a6e, 0x806eff6e, + 0x00000200, 0x806e746e, + 0x826f8075, 0x876fff6f, + 0x0000ffff, 0xf4091c37, + 0xfa000050, 0xf4091d37, + 0xfa000060, 0xf4011e77, + 0xfa000074, 0xbf8cc07f, + 0x876dff6d, 0x0000ffff, + 0x87fe7e7e, 0x87ea6a6a, + 0xb9faf802, 0xbf8a0000, + 0xbe80226c, 0xbf810000, + 0xbf9f0000, 0xbf9f0000, + 0xbf9f0000, 0xbf9f0000, + 0xbf9f0000, 0x00000000, +}; diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm index 4433bda2ce25..5b220f2a7501 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm @@ -20,6 +20,21 @@ * OTHER DEALINGS IN THE SOFTWARE. */ +/* To compile this assembly code: + * + * Navi1x: + * cpp -DASIC_TARGET_NAVI1X=1 cwsr_trap_handler_gfx10.asm -P -o nv1x.sp3 + * sp3-nv1x nv1x.sp3 -hex nv1x.hex + * + * Others: + * cpp -DASIC_TARGET_NAVI1X=0 cwsr_trap_handler_gfx10.asm -P -o gfx10.sp3 + * sp3-gfx10 gfx10.sp3 -hex gfx10.hex + */ + +#define NO_SQC_STORE !ASIC_TARGET_NAVI1X + +var SINGLE_STEP_MISSED_WORKAROUND = 1 //workaround for lost MODE.DEBUG_EN exception when SAVECTX raised + var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 @@ -59,6 +74,8 @@ var SQ_WAVE_IB_STS_RCNT_SIZE = 6 var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x003F8000 var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF +var SQ_WAVE_MODE_DEBUG_EN_MASK = 0x800 + var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24 var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27 @@ -96,17 +113,19 @@ var s_save_pc_hi = ttmp1 var s_save_exec_lo = ttmp2 var s_save_exec_hi = ttmp3 var s_save_status = ttmp12 -var s_save_trapsts = ttmp5 -var s_save_xnack_mask = ttmp6 +var s_save_trapsts = ttmp15 +var s_save_xnack_mask = 
s_save_trapsts var s_wave_size = ttmp7 var s_save_buf_rsrc0 = ttmp8 var s_save_buf_rsrc1 = ttmp9 var s_save_buf_rsrc2 = ttmp10 var s_save_buf_rsrc3 = ttmp11 -var s_save_mem_offset = ttmp14 +var s_save_mem_offset = ttmp4 var s_save_alloc_size = s_save_trapsts -var s_save_tmp = s_save_buf_rsrc2 -var s_save_m0 = ttmp15 +var s_save_tmp = ttmp14 +var s_save_m0 = ttmp5 +var s_save_ttmps_lo = s_save_tmp +var s_save_ttmps_hi = s_save_trapsts var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC @@ -128,23 +147,25 @@ var s_restore_spi_init_lo = exec_lo var s_restore_spi_init_hi = exec_hi var s_restore_mem_offset = ttmp12 var s_restore_alloc_size = ttmp3 -var s_restore_tmp = ttmp6 +var s_restore_tmp = ttmp2 var s_restore_mem_offset_save = s_restore_tmp var s_restore_m0 = s_restore_alloc_size var s_restore_mode = ttmp7 -var s_restore_flat_scratch = ttmp2 +var s_restore_flat_scratch = s_restore_tmp var s_restore_pc_lo = ttmp0 var s_restore_pc_hi = ttmp1 -var s_restore_exec_lo = ttmp14 -var s_restore_exec_hi = ttmp15 -var s_restore_status = ttmp4 -var s_restore_trapsts = ttmp5 +var s_restore_exec_lo = ttmp4 +var s_restore_exec_hi = ttmp5 +var s_restore_status = ttmp14 +var s_restore_trapsts = ttmp15 var s_restore_xnack_mask = ttmp13 var s_restore_buf_rsrc0 = ttmp8 var s_restore_buf_rsrc1 = ttmp9 var s_restore_buf_rsrc2 = ttmp10 var s_restore_buf_rsrc3 = ttmp11 -var s_restore_size = ttmp7 +var s_restore_size = ttmp6 +var s_restore_ttmps_lo = s_restore_tmp +var s_restore_ttmps_hi = s_restore_alloc_size shader main asic(DEFAULT) @@ -159,6 +180,24 @@ L_JUMP_TO_RESTORE: L_SKIP_RESTORE: s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK + +if SINGLE_STEP_MISSED_WORKAROUND + // No single step exceptions if MODE.DEBUG_EN=0. 
+ s_getreg_b32 ttmp2, hwreg(HW_REG_MODE) + s_and_b32 ttmp2, ttmp2, SQ_WAVE_MODE_DEBUG_EN_MASK + s_cbranch_scc0 L_NO_SINGLE_STEP_WORKAROUND + + // Second-level trap already handled exception if STATUS.HALT=1. + s_and_b32 ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK + + // Prioritize single step exception over context save. + // Second-level trap will halt wave and RFE, re-entering for SAVECTX. + s_cbranch_scc0 L_FETCH_2ND_TRAP + +L_NO_SINGLE_STEP_WORKAROUND: +end + + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save s_cbranch_scc1 L_SAVE @@ -170,6 +209,8 @@ L_SKIP_RESTORE: s_or_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK L_FETCH_2ND_TRAP: + +#if ASIC_TARGET_NAVI1X // Preserve and clear scalar XNACK state before issuing scalar loads. // Save IB_STS.REPLAY_W64H[25], RCNT[21:16], FIRST_REPLAY[15] into // unused space ttmp11[31:24]. @@ -183,6 +224,7 @@ L_FETCH_2ND_TRAP: s_or_b32 ttmp11, ttmp11, ttmp3 s_andn2_b32 ttmp2, ttmp2, (SQ_WAVE_IB_STS_REPLAY_W64H_MASK | SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK) s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 +#endif // Read second-level TBA/TMA from first-level TMA and jump if available. // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data) @@ -207,6 +249,7 @@ L_NO_NEXT_TRAP: L_EXCP_CASE: s_and_b32 ttmp1, ttmp1, 0xFFFF +#if ASIC_TARGET_NAVI1X // Restore SQ_WAVE_IB_STS. s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK @@ -214,6 +257,7 @@ L_EXCP_CASE: s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK s_or_b32 ttmp2, ttmp2, ttmp3 s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 +#endif // Restore SQ_WAVE_STATUS. 
s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 @@ -223,23 +267,11 @@ L_EXCP_CASE: s_rfe_b64 [ttmp0, ttmp1] L_SAVE: - //check whether there is mem_viol - s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) - s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK - s_cbranch_scc0 L_NO_PC_REWIND - - //if so, need rewind PC assuming GDS operation gets NACKed - s_mov_b32 s_save_tmp, 0 - s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] - s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8 - s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 - -L_NO_PC_REWIND: s_mov_b32 s_save_tmp, 0 s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit - s_getreg_b32 s_save_xnack_mask, hwreg(HW_REG_SHADER_XNACK_MASK) +#if ASIC_TARGET_NAVI1X s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp @@ -253,6 +285,7 @@ L_NO_PC_REWIND: s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp +#endif /* inform SPI the readiness and wait for SPI's go signal */ s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI @@ -261,12 +294,31 @@ L_NO_PC_REWIND: s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC +#if ASIC_TARGET_NAVI1X L_SLEEP: // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause // SQ hang, since the 7,8th wave could not get arbit to exec inst, while // other waves are stuck into the sleep-loop and waiting for wrexec!=0 s_sleep 0x2 s_cbranch_execz L_SLEEP +#else + s_waitcnt lgkmcnt(0) +#endif + + // Save trap temporaries 4-11, 13 initialized by SPI debug dispatch logic + // ttmp SR memory 
offset : size(VGPR)+size(SGPR)+0x40 + get_wave_size(s_save_ttmps_hi) + get_vgpr_size_bytes(s_save_ttmps_lo, s_save_ttmps_hi) + s_and_b32 s_save_ttmps_hi, s_save_spi_init_hi, 0xFFFF + s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, get_sgpr_size_bytes() + s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo + s_addc_u32 s_save_ttmps_hi, s_save_ttmps_hi, 0x0 + +#if ASIC_TARGET_NAVI1X + s_store_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x50 glc:1 + s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x60 glc:1 + s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x74 glc:1 +#endif /* setup Resource Contants */ s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo @@ -285,9 +337,45 @@ L_SLEEP: /* global mem offset */ s_mov_b32 s_save_mem_offset, 0x0 - s_getreg_b32 s_wave_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) - s_lshl_b32 s_wave_size, s_wave_size, S_WAVE_SIZE - s_or_b32 s_wave_size, s_save_spi_init_hi, s_wave_size //share s_wave_size with exec_hi, it's at bit25 + get_wave_size(s_wave_size) + +#if ASIC_TARGET_NAVI1X + // Save and clear vector XNACK state late to free up SGPRs. 
+ s_getreg_b32 s_save_xnack_mask, hwreg(HW_REG_SHADER_XNACK_MASK) + s_setreg_imm32_b32 hwreg(HW_REG_SHADER_XNACK_MASK), 0x0 +#endif + + /* save first 4 VGPRs, needed for SGPR save */ + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE + s_and_b32 m0, m0, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_ENABLE_SAVE_4VGPR_EXEC_HI + s_mov_b32 exec_hi, 0x00000000 + s_branch L_SAVE_4VGPR_WAVE32 +L_ENABLE_SAVE_4VGPR_EXEC_HI: + s_mov_b32 exec_hi, 0xFFFFFFFF + s_branch L_SAVE_4VGPR_WAVE64 +L_SAVE_4VGPR_WAVE32: + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + // VGPR Allocated in 4-GPR granularity + + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3 + s_branch L_SAVE_HWREG + +L_SAVE_4VGPR_WAVE64: + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + // VGPR Allocated in 4-GPR granularity + + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 /* save HW registers */ @@ -300,6 +388,13 @@ L_SAVE_HWREG: s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes +#if NO_SQC_STORE + v_mov_b32 v0, 0x0 //Offset[31:0] from buffer resource + v_mov_b32 v1, 0x0 //Offset[63:32] from buffer resource + v_mov_b32 v2, 0x0 //Set of SGPRs for TCP store + s_mov_b32 m0, 0x0 //Next lane of v2 to write to +#endif + write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, 
s_save_mem_offset) write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset) @@ -307,8 +402,10 @@ L_SAVE_HWREG: write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) - s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) - write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) + s_getreg_b32 s_save_tmp, hwreg(HW_REG_TRAPSTS) + write_hwreg_to_mem(s_save_tmp, s_save_buf_rsrc0, s_save_mem_offset) + + // Not used on Sienna_Cichlid but keep layout same for debugger. write_hwreg_to_mem(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset) s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) @@ -320,10 +417,11 @@ L_SAVE_HWREG: s_getreg_b32 s_save_m0, hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI) write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) - /* the first wave in the threadgroup */ - s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK - s_mov_b32 s_save_exec_hi, 0x0 - s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26] +#if NO_SQC_STORE + // Write HWREG/SGPRs with 32 VGPR lanes, wave32 is common case. + s_mov_b32 exec_hi, 0x0 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 +#endif /* save SGPRs */ // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save... 
@@ -334,10 +432,14 @@ L_SAVE_HWREG: s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes +#if NO_SQC_STORE + s_mov_b32 ttmp13, 0x0 //next VGPR lane to copy SGPR into +#else // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0 s_mov_b32 s_save_xnack_mask, s_save_buf_rsrc0 s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0 +#endif s_mov_b32 m0, 0x0 //SGPR initial index value =0 s_nop 0x0 //Manually inserted wait states @@ -353,6 +455,18 @@ L_SAVE_SGPR_LOOP: s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0] write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) + +#if NO_SQC_STORE + s_cmp_eq_u32 ttmp13, 0x20 //have 32 VGPR lanes filled? + s_cbranch_scc0 L_SAVE_SGPR_SKIP_TCP_STORE + + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + s_add_u32 s_save_mem_offset, s_save_mem_offset, 0x80 + s_mov_b32 ttmp13, 0x0 + v_mov_b32 v2, 0x0 +L_SAVE_SGPR_SKIP_TCP_STORE: +#endif + s_add_u32 m0, m0, 16 //next sgpr index s_cmp_lt_u32 m0, 96 //scc = (m0 < first 96 SGPR) ? 1 : 0 s_cbranch_scc1 L_SAVE_SGPR_LOOP //first 96 SGPR save is complete? @@ -366,43 +480,12 @@ L_SAVE_SGPR_LOOP: s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] write_12sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) +#if NO_SQC_STORE + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 +#else // restore s_save_buf_rsrc0,1 s_mov_b32 s_save_buf_rsrc0, s_save_xnack_mask - - /* save first 4 VGPR, then LDS save could use */ - // each wave will alloc 4 vgprs at least... 
- - s_mov_b32 s_save_mem_offset, 0 - s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on - s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE - s_and_b32 m0, m0, 1 - s_cmp_eq_u32 m0, 1 - s_cbranch_scc1 L_ENABLE_SAVE_4VGPR_EXEC_HI - s_mov_b32 exec_hi, 0x00000000 - s_branch L_SAVE_4VGPR_WAVE32 -L_ENABLE_SAVE_4VGPR_EXEC_HI: - s_mov_b32 exec_hi, 0xFFFFFFFF - s_branch L_SAVE_4VGPR_WAVE64 -L_SAVE_4VGPR_WAVE32: - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - - // VGPR Allocated in 4-GPR granularity - - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2 - buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3 - s_branch L_SAVE_LDS - -L_SAVE_4VGPR_WAVE64: - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - - // VGPR Allocated in 4-GPR granularity - - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 - buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 +#endif /* save LDS */ @@ -423,7 +506,7 @@ L_SAVE_LDS_NORMAL: s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE s_barrier //LDS is used? 
wait for other waves in the same TG - s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK + s_and_b32 s_save_tmp, s_wave_size, S_SAVE_SPI_INIT_FIRST_WAVE_MASK s_cbranch_scc0 L_SAVE_LDS_DONE // first wave do LDS save; @@ -598,9 +681,7 @@ L_RESTORE: s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE //determine it is wave32 or wave64 - s_getreg_b32 s_restore_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) - s_lshl_b32 s_restore_size, s_restore_size, S_WAVE_SIZE - s_or_b32 s_restore_size, s_restore_spi_init_hi, s_restore_size + get_wave_size(s_restore_size) s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK s_cbranch_scc0 L_RESTORE_VGPR @@ -634,7 +715,7 @@ L_RESTORE_LDS_NORMAL: s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE + s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE s_and_b32 m0, m0, 1 s_cmp_eq_u32 m0, 1 s_mov_b32 m0, 0x0 @@ -842,38 +923,55 @@ L_RESTORE_HWREG: s_setreg_b32 hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI), s_restore_flat_scratch - s_mov_b32 s_restore_tmp, s_restore_pc_hi - s_and_b32 s_restore_pc_hi, s_restore_tmp, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS - s_mov_b32 m0, s_restore_m0 s_mov_b32 exec_lo, s_restore_exec_lo s_mov_b32 exec_hi, s_restore_exec_hi s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 + +#if ASIC_TARGET_NAVI1X s_setreg_b32 hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask +#endif + s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, 
SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode - s_and_b32 s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_RCNT_MASK + + // Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic + // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40 + get_vgpr_size_bytes(s_restore_ttmps_lo, s_restore_size) + s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, get_sgpr_size_bytes() + s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0 + s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0 + s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF + s_load_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 glc:1 + s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 glc:1 + s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 glc:1 + s_waitcnt lgkmcnt(0) + +#if ASIC_TARGET_NAVI1X + s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT - s_mov_b32 s_restore_mode, 0x0 - s_or_b32 s_restore_mode, s_restore_mode, s_restore_m0 - s_and_b32 s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_FIRST_REPLAY_MASK + s_mov_b32 s_restore_tmp, 0x0 + s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 + s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT - s_or_b32 s_restore_mode, s_restore_mode, s_restore_m0 - s_and_b32 s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_REPLAY_W64H_MASK + s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 + s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_REPLAY_W64H_MASK s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_REPLAY_W64H_SHIFT s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT 
- s_or_b32 s_restore_mode, s_restore_mode, s_restore_m0 + s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT - s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_mode + s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp +#endif + s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu @@ -887,32 +985,53 @@ L_END_PGM: end function write_hwreg_to_mem(s, s_rsrc, s_mem_offset) +#if NO_SQC_STORE + // Copy into VGPR for later TCP store. + v_writelane_b32 v2, s, m0 + s_add_u32 m0, m0, 0x1 +#else s_mov_b32 exec_lo, m0 s_mov_b32 m0, s_mem_offset s_buffer_store_dword s, s_rsrc, m0 glc:1 s_add_u32 s_mem_offset, s_mem_offset, 4 s_mov_b32 m0, exec_lo +#endif end function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) +#if NO_SQC_STORE + // Copy into VGPR for later TCP store. + for var sgpr_idx = 0; sgpr_idx < 16; sgpr_idx ++ + v_writelane_b32 v2, s[sgpr_idx], ttmp13 + s_add_u32 ttmp13, ttmp13, 0x1 + end +#else s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1 s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 +#endif end function write_12sgpr_to_mem(s, s_rsrc, s_mem_offset) +#if NO_SQC_STORE + // Copy into VGPR for later TCP store. 
+ for var sgpr_idx = 0; sgpr_idx < 12; sgpr_idx ++ + v_writelane_b32 v2, s[sgpr_idx], ttmp13 + s_add_u32 ttmp13, ttmp13, 0x1 + end +#else s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 s_add_u32 s_rsrc[0], s_rsrc[0], 4*12 s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 +#endif end - function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 s_add_u32 s_mem_offset, s_mem_offset, 4 @@ -942,9 +1061,7 @@ end function get_vgpr_size_bytes(s_vgpr_size_byte, s_size) s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 - s_lshr_b32 m0, s_size, S_WAVE_SIZE - s_and_b32 m0, m0, 1 - s_cmp_eq_u32 m0, 1 + s_bitcmp1_b32 s_size, S_WAVE_SIZE s_cbranch_scc1 L_ENABLE_SHIFT_W64 s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+7) //Number of VGPRs = (vgpr_size + 1) * 4 * 32 * 4 (non-zero value) s_branch L_SHIFT_DONE @@ -965,3 +1082,9 @@ end function get_hwreg_size_bytes return 128 end + +function get_wave_size(s_reg) + s_getreg_b32 s_reg, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) + s_lshl_b32 s_reg, s_reg, S_WAVE_SIZE + s_or_b32 s_reg, s_save_spi_init_hi, s_reg //share with exec_hi, it's at bit25 +end diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c index 1009a3b8dcc2..9deadfd8f929 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c @@ -678,6 +678,7 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev, case CHIP_NAVI10: case CHIP_NAVI12: case CHIP_NAVI14: + case CHIP_SIENNA_CICHLID: pcache_info = navi10_cache_info; num_of_cache_types = ARRAY_SIZE(navi10_cache_info); break; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 0491ab2b4a9b..7f6d0958ed62 100644 --- 
a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -46,6 +46,7 @@ extern const struct kfd2kgd_calls gfx_v8_kfd2kgd; extern const struct kfd2kgd_calls gfx_v9_kfd2kgd; extern const struct kfd2kgd_calls arcturus_kfd2kgd; extern const struct kfd2kgd_calls gfx_v10_kfd2kgd; +extern const struct kfd2kgd_calls gfx_v10_3_kfd2kgd; static const struct kfd2kgd_calls *kfd2kgd_funcs[] = { #ifdef KFD_SUPPORT_IOMMU_V2 @@ -72,6 +73,7 @@ static const struct kfd2kgd_calls *kfd2kgd_funcs[] = { [CHIP_NAVI10] = &gfx_v10_kfd2kgd, [CHIP_NAVI12] = &gfx_v10_kfd2kgd, [CHIP_NAVI14] = &gfx_v10_kfd2kgd, + [CHIP_SIENNA_CICHLID] = &gfx_v10_3_kfd2kgd, }; #ifdef KFD_SUPPORT_IOMMU_V2 @@ -458,6 +460,24 @@ static const struct kfd_device_info navi14_device_info = { .num_sdma_queues_per_engine = 8, }; +static const struct kfd_device_info sienna_cichlid_device_info = { + .asic_family = CHIP_SIENNA_CICHLID, + .asic_name = "sienna_cichlid", + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 8, + .ih_ring_entry_size = 8 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_v9, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .needs_iommu_device = false, + .supports_cwsr = true, + .needs_pci_atomics = false, + .num_sdma_engines = 4, + .num_xgmi_sdma_engines = 0, + .num_sdma_queues_per_engine = 8, +}; + /* For each entry, [0] is regular and [1] is virtualisation device. 
*/ static const struct kfd_device_info *kfd_supported_devices[][2] = { #ifdef KFD_SUPPORT_IOMMU_V2 @@ -480,6 +500,7 @@ static const struct kfd_device_info *kfd_supported_devices[][2] = { [CHIP_NAVI10] = {&navi10_device_info, NULL}, [CHIP_NAVI12] = {&navi12_device_info, &navi12_device_info}, [CHIP_NAVI14] = {&navi14_device_info, NULL}, + [CHIP_SIENNA_CICHLID] = {&sienna_cichlid_device_info, &sienna_cichlid_device_info}, }; static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, @@ -559,6 +580,10 @@ static void kfd_cwsr_init(struct kfd_dev *kfd) BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_hex) > PAGE_SIZE); kfd->cwsr_isa = cwsr_trap_gfx9_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx9_hex); + } else if (kfd->device_info->asic_family < CHIP_SIENNA_CICHLID) { + BUILD_BUG_ON(sizeof(cwsr_trap_nv1x_hex) > PAGE_SIZE); + kfd->cwsr_isa = cwsr_trap_nv1x_hex; + kfd->cwsr_isa_size = sizeof(cwsr_trap_nv1x_hex); } else { BUILD_BUG_ON(sizeof(cwsr_trap_gfx10_hex) > PAGE_SIZE); kfd->cwsr_isa = cwsr_trap_gfx10_hex; @@ -910,6 +935,7 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm) if (!p) return -ESRCH; + WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid); r = kfd_process_evict_queues(p); kfd_unref_process(p); @@ -977,6 +1003,8 @@ int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, /* During process initialization eviction_work.dwork is initialized * to kfd_evict_bo_worker */ + WARN(debug_evictions, "Scheduling eviction of pid %d in %ld jiffies", + p->lead_thread->pid, delay_jiffies); schedule_delayed_work(&p->eviction_work, delay_jiffies); out: kfd_unref_process(p); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index e9c4867abeff..dd550025d1c1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -137,7 +137,7 @@ void program_sh_mem_settings(struct device_queue_manager *dqm, qpd->sh_mem_bases); } 
-void increment_queue_count(struct device_queue_manager *dqm, +static void increment_queue_count(struct device_queue_manager *dqm, enum kfd_queue_type type) { dqm->active_queue_count++; @@ -145,7 +145,7 @@ void increment_queue_count(struct device_queue_manager *dqm, dqm->active_cp_queue_count++; } -void decrement_queue_count(struct device_queue_manager *dqm, +static void decrement_queue_count(struct device_queue_manager *dqm, enum kfd_queue_type type) { dqm->active_queue_count--; @@ -153,6 +153,30 @@ void decrement_queue_count(struct device_queue_manager *dqm, dqm->active_cp_queue_count--; } +int read_sdma_queue_counter(uint64_t q_rptr, uint64_t *val) +{ + int ret; + uint64_t tmp = 0; + + if (!val) + return -EINVAL; + /* + * SDMA activity counter is stored at queue's RPTR + 0x8 location. + */ + if (!access_ok((const void __user *)(q_rptr + + sizeof(uint64_t)), sizeof(uint64_t))) { + pr_err("Can't access sdma queue activity counter\n"); + return -EFAULT; + } + + ret = get_user(tmp, (uint64_t *)(q_rptr + sizeof(uint64_t))); + if (!ret) { + *val = tmp; + } + + return ret; +} + static int allocate_doorbell(struct qcm_process_device *qpd, struct queue *q) { struct kfd_dev *dev = qpd->dqm->dev; @@ -487,6 +511,7 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm, if (retval == -ETIME) qpd->reset_wavefronts = true; + mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); list_del(&q->list); @@ -521,9 +546,23 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm, struct queue *q) { int retval; + uint64_t sdma_val = 0; + struct kfd_process_device *pdd = qpd_to_pdd(qpd); + + /* Get the SDMA queue stats */ + if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) || + (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) { + retval = read_sdma_queue_counter((uint64_t)q->properties.read_ptr, + &sdma_val); + if (retval) + pr_err("Failed to read SDMA queue counter for queue: %d\n", + q->properties.queue_id); + } dqm_lock(dqm); retval = 
destroy_queue_nocpsch_locked(dqm, qpd, q); + if (!retval) + pdd->sdma_past_activity_counter += sdma_val; dqm_unlock(dqm); return retval; @@ -1428,6 +1467,18 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, { int retval; struct mqd_manager *mqd_mgr; + uint64_t sdma_val = 0; + struct kfd_process_device *pdd = qpd_to_pdd(qpd); + + /* Get the SDMA queue stats */ + if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) || + (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) { + retval = read_sdma_queue_counter((uint64_t)q->properties.read_ptr, + &sdma_val); + if (retval) + pr_err("Failed to read SDMA queue counter for queue: %d\n", + q->properties.queue_id); + } retval = 0; @@ -1449,10 +1500,11 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, deallocate_doorbell(qpd, q); - if (q->properties.type == KFD_QUEUE_TYPE_SDMA) - deallocate_sdma_queue(dqm, q); - else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) + if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) || + (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) { deallocate_sdma_queue(dqm, q); + pdd->sdma_past_activity_counter += sdma_val; + } list_del(&q->list); qpd->queue_count--; @@ -1886,6 +1938,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) case CHIP_NAVI10: case CHIP_NAVI12: case CHIP_NAVI14: + case CHIP_SIENNA_CICHLID: device_queue_manager_init_v10_navi10(&dqm->asic_ops); break; default: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h index 4afa015c69b1..49d8e324c636 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -251,4 +251,5 @@ static inline void dqm_unlock(struct device_queue_manager *dqm) mutex_unlock(&dqm->lock_hidden); } +int read_sdma_queue_counter(uint64_t q_rptr, uint64_t *val); #endif /* KFD_DEVICE_QUEUE_MANAGER_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c index 78714f9a8b11..b4674cf73132 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c @@ -415,6 +415,7 @@ int kfd_init_apertures(struct kfd_process *process) case CHIP_NAVI10: case CHIP_NAVI12: case CHIP_NAVI14: + case CHIP_SIENNA_CICHLID: kfd_init_apertures_v9(pdd, id); break; default: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c index 685ca82d42fe..89d7f08d749f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c @@ -245,6 +245,7 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) case CHIP_NAVI10: case CHIP_NAVI12: case CHIP_NAVI14: + case CHIP_SIENNA_CICHLID: pm->pmf = &kfd_v9_pm_funcs; break; default: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index fee60921fccf..58de109d2909 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -177,6 +177,11 @@ extern bool hws_gws_support; */ extern int queue_preemption_timeout_ms; +/* + * Enable eviction debug messages + */ +extern bool debug_evictions; + enum cache_policy { cache_policy_coherent, cache_policy_noncoherent @@ -630,7 +635,14 @@ enum kfd_pdd_bound { PDD_BOUND_SUSPENDED, }; -#define MAX_VRAM_FILENAME_LEN 11 +#define MAX_SYSFS_FILENAME_LEN 11 + +/* + * SDMA counter runs at 100MHz frequency. + * We display SDMA activity in microsecond granularity in sysfs. + * As a result, the divisor is 100. + */ +#define SDMA_ACTIVITY_DIVISOR 100 /* Data that is per-process-per device. 
*/ struct kfd_process_device { @@ -678,7 +690,12 @@ struct kfd_process_device { /* VRAM usage */ uint64_t vram_usage; struct attribute attr_vram; - char vram_filename[MAX_VRAM_FILENAME_LEN]; + char vram_filename[MAX_SYSFS_FILENAME_LEN]; + + /* SDMA activity tracking */ + uint64_t sdma_past_activity_counter; + struct attribute attr_sdma; + char sdma_filename[MAX_SYSFS_FILENAME_LEN]; }; #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index d27221ddcdeb..40695d52e9a8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -25,6 +25,7 @@ #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> +#include <linux/mmu_context.h> #include <linux/slab.h> #include <linux/amd-iommu.h> #include <linux/notifier.h> @@ -76,6 +77,178 @@ struct kfd_procfs_tree { static struct kfd_procfs_tree procfs; +/* + * Structure for SDMA activity tracking + */ +struct kfd_sdma_activity_handler_workarea { + struct work_struct sdma_activity_work; + struct kfd_process_device *pdd; + uint64_t sdma_activity_counter; +}; + +struct temp_sdma_queue_list { + uint64_t rptr; + uint64_t sdma_val; + unsigned int queue_id; + struct list_head list; +}; + +static void kfd_sdma_activity_worker(struct work_struct *work) +{ + struct kfd_sdma_activity_handler_workarea *workarea; + struct kfd_process_device *pdd; + uint64_t val; + struct mm_struct *mm; + struct queue *q; + struct qcm_process_device *qpd; + struct device_queue_manager *dqm; + int ret = 0; + struct temp_sdma_queue_list sdma_q_list; + struct temp_sdma_queue_list *sdma_q, *next; + + workarea = container_of(work, struct kfd_sdma_activity_handler_workarea, + sdma_activity_work); + if (!workarea) + return; + + pdd = workarea->pdd; + if (!pdd) + return; + dqm = pdd->dev->dqm; + qpd = &pdd->qpd; + if (!dqm || !qpd) + return; + /* + * Total SDMA activity is current 
SDMA activity + past SDMA activity + * Past SDMA count is stored in pdd. + * To get the current activity counters for all active SDMA queues, + * we loop over all SDMA queues and get their counts from user-space. + * + * We cannot call get_user() with dqm_lock held as it can cause + * a circular lock dependency situation. To read the SDMA stats, + * we need to do the following: + * + * 1. Create a temporary list of SDMA queue nodes from the qpd->queues_list, + * with dqm_lock/dqm_unlock(). + * 2. Call get_user() for each node in temporary list without dqm_lock. + * Save the SDMA count for each node and also add the count to the total + * SDMA count counter. + * Its possible, during this step, a few SDMA queue nodes got deleted + * from the qpd->queues_list. + * 3. Do a second pass over qpd->queues_list to check if any nodes got deleted. + * If any node got deleted, its SDMA count would be captured in the sdma + * past activity counter. So subtract the SDMA counter stored in step 2 + * for this node from the total SDMA count. + */ + INIT_LIST_HEAD(&sdma_q_list.list); + + /* + * Create the temp list of all SDMA queues + */ + dqm_lock(dqm); + + list_for_each_entry(q, &qpd->queues_list, list) { + if ((q->properties.type != KFD_QUEUE_TYPE_SDMA) && + (q->properties.type != KFD_QUEUE_TYPE_SDMA_XGMI)) + continue; + + sdma_q = kzalloc(sizeof(struct temp_sdma_queue_list), GFP_KERNEL); + if (!sdma_q) { + dqm_unlock(dqm); + goto cleanup; + } + + INIT_LIST_HEAD(&sdma_q->list); + sdma_q->rptr = (uint64_t)q->properties.read_ptr; + sdma_q->queue_id = q->properties.queue_id; + list_add_tail(&sdma_q->list, &sdma_q_list.list); + } + + /* + * If the temp list is empty, then no SDMA queues nodes were found in + * qpd->queues_list. 
Return the past activity count as the total sdma + * count + */ + if (list_empty(&sdma_q_list.list)) { + workarea->sdma_activity_counter = pdd->sdma_past_activity_counter; + dqm_unlock(dqm); + return; + } + + dqm_unlock(dqm); + + /* + * Get the usage count for each SDMA queue in temp_list. + */ + mm = get_task_mm(pdd->process->lead_thread); + if (!mm) + goto cleanup; + + kthread_use_mm(mm); + + list_for_each_entry(sdma_q, &sdma_q_list.list, list) { + val = 0; + ret = read_sdma_queue_counter(sdma_q->rptr, &val); + if (ret) { + pr_debug("Failed to read SDMA queue active counter for queue id: %d", + sdma_q->queue_id); + } else { + sdma_q->sdma_val = val; + workarea->sdma_activity_counter += val; + } + } + + kthread_unuse_mm(mm); + mmput(mm); + + /* + * Do a second iteration over qpd_queues_list to check if any SDMA + * nodes got deleted while fetching SDMA counter. + */ + dqm_lock(dqm); + + workarea->sdma_activity_counter += pdd->sdma_past_activity_counter; + + list_for_each_entry(q, &qpd->queues_list, list) { + if (list_empty(&sdma_q_list.list)) + break; + + if ((q->properties.type != KFD_QUEUE_TYPE_SDMA) && + (q->properties.type != KFD_QUEUE_TYPE_SDMA_XGMI)) + continue; + + list_for_each_entry_safe(sdma_q, next, &sdma_q_list.list, list) { + if (((uint64_t)q->properties.read_ptr == sdma_q->rptr) && + (sdma_q->queue_id == q->properties.queue_id)) { + list_del(&sdma_q->list); + kfree(sdma_q); + break; + } + } + } + + dqm_unlock(dqm); + + /* + * If temp list is not empty, it implies some queues got deleted + * from qpd->queues_list during SDMA usage read. Subtract the SDMA + * count for each node from the total SDMA count. 
+ */ + list_for_each_entry_safe(sdma_q, next, &sdma_q_list.list, list) { + workarea->sdma_activity_counter -= sdma_q->sdma_val; + list_del(&sdma_q->list); + kfree(sdma_q); + } + + return; + +cleanup: + list_for_each_entry_safe(sdma_q, next, &sdma_q_list.list, list) { + list_del(&sdma_q->list); + kfree(sdma_q); + } +} + static ssize_t kfd_procfs_show(struct kobject *kobj, struct attribute *attr, char *buffer) { @@ -87,8 +260,24 @@ static ssize_t kfd_procfs_show(struct kobject *kobj, struct attribute *attr, } else if (strncmp(attr->name, "vram_", 5) == 0) { struct kfd_process_device *pdd = container_of(attr, struct kfd_process_device, attr_vram); - if (pdd) - return snprintf(buffer, PAGE_SIZE, "%llu\n", READ_ONCE(pdd->vram_usage)); + return snprintf(buffer, PAGE_SIZE, "%llu\n", READ_ONCE(pdd->vram_usage)); + } else if (strncmp(attr->name, "sdma_", 5) == 0) { + struct kfd_process_device *pdd = container_of(attr, struct kfd_process_device, + attr_sdma); + struct kfd_sdma_activity_handler_workarea sdma_activity_work_handler; + + INIT_WORK(&sdma_activity_work_handler.sdma_activity_work, + kfd_sdma_activity_worker); + + sdma_activity_work_handler.pdd = pdd; + + schedule_work(&sdma_activity_work_handler.sdma_activity_work); + + flush_work(&sdma_activity_work_handler.sdma_activity_work); + + return snprintf(buffer, PAGE_SIZE, "%llu\n", + (sdma_activity_work_handler.sdma_activity_counter)/ + SDMA_ACTIVITY_DIVISOR); } else { pr_err("Invalid attribute"); return -EINVAL; @@ -210,7 +399,24 @@ int kfd_procfs_add_queue(struct queue *q) return 0; } -int kfd_procfs_add_vram_usage(struct kfd_process *p) +static int kfd_sysfs_create_file(struct kfd_process *p, struct attribute *attr, + char *name) +{ + int ret = 0; + + if (!p || !attr || !name) + return -EINVAL; + + attr->name = name; + attr->mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(attr); + + ret = sysfs_create_file(p->kobj, attr); + + return ret; +} + +static int kfd_procfs_add_sysfs_files(struct kfd_process *p) { int ret = 0; 
struct kfd_process_device *pdd; @@ -221,17 +427,25 @@ int kfd_procfs_add_vram_usage(struct kfd_process *p) if (!p->kobj) return -EFAULT; - /* Create proc/<pid>/vram_<gpuid> file for each GPU */ + /* + * Create sysfs files for each GPU: + * - proc/<pid>/vram_<gpuid> + * - proc/<pid>/sdma_<gpuid> + */ list_for_each_entry(pdd, &p->per_device_data, per_device_list) { - snprintf(pdd->vram_filename, MAX_VRAM_FILENAME_LEN, "vram_%u", + snprintf(pdd->vram_filename, MAX_SYSFS_FILENAME_LEN, "vram_%u", pdd->dev->id); - pdd->attr_vram.name = pdd->vram_filename; - pdd->attr_vram.mode = KFD_SYSFS_FILE_MODE; - sysfs_attr_init(&pdd->attr_vram); - ret = sysfs_create_file(p->kobj, &pdd->attr_vram); + ret = kfd_sysfs_create_file(p, &pdd->attr_vram, pdd->vram_filename); if (ret) pr_warn("Creating vram usage for gpu id %d failed", (int)pdd->dev->id); + + snprintf(pdd->sdma_filename, MAX_SYSFS_FILENAME_LEN, "sdma_%u", + pdd->dev->id); + ret = kfd_sysfs_create_file(p, &pdd->attr_sdma, pdd->sdma_filename); + if (ret) + pr_warn("Creating sdma usage for gpu id %d failed", + (int)pdd->dev->id); } return ret; @@ -428,6 +642,7 @@ struct kfd_process *kfd_create_process(struct file *filep) (int)process->lead_thread->pid); if (ret) { pr_warn("Creating procfs pid directory failed"); + kobject_put(process->kobj); goto out; } @@ -444,9 +659,9 @@ struct kfd_process *kfd_create_process(struct file *filep) if (!process->kobj_queues) pr_warn("Creating KFD proc/queues folder failed"); - ret = kfd_procfs_add_vram_usage(process); + ret = kfd_procfs_add_sysfs_files(process); if (ret) - pr_warn("Creating vram usage file for pid %d failed", + pr_warn("Creating sysfs usage file for pid %d failed", (int)process->lead_thread->pid); } out: @@ -597,8 +812,10 @@ static void kfd_process_wq_release(struct work_struct *work) kobject_put(p->kobj_queues); p->kobj_queues = NULL; - list_for_each_entry(pdd, &p->per_device_data, per_device_list) + list_for_each_entry(pdd, &p->per_device_data, per_device_list) { 
sysfs_remove_file(p->kobj, &pdd->attr_vram); + sysfs_remove_file(p->kobj, &pdd->attr_sdma); + } kobject_del(p->kobj); kobject_put(p->kobj); @@ -906,6 +1123,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, pdd->already_dequeued = false; pdd->runtime_inuse = false; pdd->vram_usage = 0; + pdd->sdma_past_activity_counter = 0; list_add(&pdd->per_device_list, &p->per_device_data); /* Init idr used for memory handle translation */ @@ -1002,8 +1220,10 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, */ if (!pdd->runtime_inuse) { err = pm_runtime_get_sync(dev->ddev->dev); - if (err < 0) + if (err < 0) { + pm_runtime_put_autosuspend(dev->ddev->dev); return ERR_PTR(err); + } } err = kfd_iommu_bind_process_to_device(pdd); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index bb77f7af2b6d..cd18baf62727 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -210,39 +210,41 @@ struct kfd_topology_device *kfd_create_topology_device( } -#define sysfs_show_gen_prop(buffer, fmt, ...) \ - snprintf(buffer, PAGE_SIZE, "%s"fmt, buffer, __VA_ARGS__) -#define sysfs_show_32bit_prop(buffer, name, value) \ - sysfs_show_gen_prop(buffer, "%s %u\n", name, value) -#define sysfs_show_64bit_prop(buffer, name, value) \ - sysfs_show_gen_prop(buffer, "%s %llu\n", name, value) -#define sysfs_show_32bit_val(buffer, value) \ - sysfs_show_gen_prop(buffer, "%u\n", value) -#define sysfs_show_str_val(buffer, value) \ - sysfs_show_gen_prop(buffer, "%s\n", value) +#define sysfs_show_gen_prop(buffer, offs, fmt, ...) 
\ + (offs += snprintf(buffer+offs, PAGE_SIZE-offs, \ + fmt, __VA_ARGS__)) +#define sysfs_show_32bit_prop(buffer, offs, name, value) \ + sysfs_show_gen_prop(buffer, offs, "%s %u\n", name, value) +#define sysfs_show_64bit_prop(buffer, offs, name, value) \ + sysfs_show_gen_prop(buffer, offs, "%s %llu\n", name, value) +#define sysfs_show_32bit_val(buffer, offs, value) \ + sysfs_show_gen_prop(buffer, offs, "%u\n", value) +#define sysfs_show_str_val(buffer, offs, value) \ + sysfs_show_gen_prop(buffer, offs, "%s\n", value) static ssize_t sysprops_show(struct kobject *kobj, struct attribute *attr, char *buffer) { - ssize_t ret; + int offs = 0; /* Making sure that the buffer is an empty string */ buffer[0] = 0; if (attr == &sys_props.attr_genid) { - ret = sysfs_show_32bit_val(buffer, sys_props.generation_count); + sysfs_show_32bit_val(buffer, offs, + sys_props.generation_count); } else if (attr == &sys_props.attr_props) { - sysfs_show_64bit_prop(buffer, "platform_oem", - sys_props.platform_oem); - sysfs_show_64bit_prop(buffer, "platform_id", - sys_props.platform_id); - ret = sysfs_show_64bit_prop(buffer, "platform_rev", - sys_props.platform_rev); + sysfs_show_64bit_prop(buffer, offs, "platform_oem", + sys_props.platform_oem); + sysfs_show_64bit_prop(buffer, offs, "platform_id", + sys_props.platform_id); + sysfs_show_64bit_prop(buffer, offs, "platform_rev", + sys_props.platform_rev); } else { - ret = -EINVAL; + offs = -EINVAL; } - return ret; + return offs; } static void kfd_topology_kobj_release(struct kobject *kobj) @@ -262,7 +264,7 @@ static struct kobj_type sysprops_type = { static ssize_t iolink_show(struct kobject *kobj, struct attribute *attr, char *buffer) { - ssize_t ret; + int offs = 0; struct kfd_iolink_properties *iolink; /* Making sure that the buffer is an empty string */ @@ -271,21 +273,23 @@ static ssize_t iolink_show(struct kobject *kobj, struct attribute *attr, iolink = container_of(attr, struct kfd_iolink_properties, attr); if (iolink->gpu && 
kfd_devcgroup_check_permission(iolink->gpu)) return -EPERM; - sysfs_show_32bit_prop(buffer, "type", iolink->iolink_type); - sysfs_show_32bit_prop(buffer, "version_major", iolink->ver_maj); - sysfs_show_32bit_prop(buffer, "version_minor", iolink->ver_min); - sysfs_show_32bit_prop(buffer, "node_from", iolink->node_from); - sysfs_show_32bit_prop(buffer, "node_to", iolink->node_to); - sysfs_show_32bit_prop(buffer, "weight", iolink->weight); - sysfs_show_32bit_prop(buffer, "min_latency", iolink->min_latency); - sysfs_show_32bit_prop(buffer, "max_latency", iolink->max_latency); - sysfs_show_32bit_prop(buffer, "min_bandwidth", iolink->min_bandwidth); - sysfs_show_32bit_prop(buffer, "max_bandwidth", iolink->max_bandwidth); - sysfs_show_32bit_prop(buffer, "recommended_transfer_size", - iolink->rec_transfer_size); - ret = sysfs_show_32bit_prop(buffer, "flags", iolink->flags); - - return ret; + sysfs_show_32bit_prop(buffer, offs, "type", iolink->iolink_type); + sysfs_show_32bit_prop(buffer, offs, "version_major", iolink->ver_maj); + sysfs_show_32bit_prop(buffer, offs, "version_minor", iolink->ver_min); + sysfs_show_32bit_prop(buffer, offs, "node_from", iolink->node_from); + sysfs_show_32bit_prop(buffer, offs, "node_to", iolink->node_to); + sysfs_show_32bit_prop(buffer, offs, "weight", iolink->weight); + sysfs_show_32bit_prop(buffer, offs, "min_latency", iolink->min_latency); + sysfs_show_32bit_prop(buffer, offs, "max_latency", iolink->max_latency); + sysfs_show_32bit_prop(buffer, offs, "min_bandwidth", + iolink->min_bandwidth); + sysfs_show_32bit_prop(buffer, offs, "max_bandwidth", + iolink->max_bandwidth); + sysfs_show_32bit_prop(buffer, offs, "recommended_transfer_size", + iolink->rec_transfer_size); + sysfs_show_32bit_prop(buffer, offs, "flags", iolink->flags); + + return offs; } static const struct sysfs_ops iolink_ops = { @@ -300,7 +304,7 @@ static struct kobj_type iolink_type = { static ssize_t mem_show(struct kobject *kobj, struct attribute *attr, char *buffer) { - 
ssize_t ret; + int offs = 0; struct kfd_mem_properties *mem; /* Making sure that the buffer is an empty string */ @@ -309,13 +313,15 @@ static ssize_t mem_show(struct kobject *kobj, struct attribute *attr, mem = container_of(attr, struct kfd_mem_properties, attr); if (mem->gpu && kfd_devcgroup_check_permission(mem->gpu)) return -EPERM; - sysfs_show_32bit_prop(buffer, "heap_type", mem->heap_type); - sysfs_show_64bit_prop(buffer, "size_in_bytes", mem->size_in_bytes); - sysfs_show_32bit_prop(buffer, "flags", mem->flags); - sysfs_show_32bit_prop(buffer, "width", mem->width); - ret = sysfs_show_32bit_prop(buffer, "mem_clk_max", mem->mem_clk_max); - - return ret; + sysfs_show_32bit_prop(buffer, offs, "heap_type", mem->heap_type); + sysfs_show_64bit_prop(buffer, offs, "size_in_bytes", + mem->size_in_bytes); + sysfs_show_32bit_prop(buffer, offs, "flags", mem->flags); + sysfs_show_32bit_prop(buffer, offs, "width", mem->width); + sysfs_show_32bit_prop(buffer, offs, "mem_clk_max", + mem->mem_clk_max); + + return offs; } static const struct sysfs_ops mem_ops = { @@ -330,7 +336,7 @@ static struct kobj_type mem_type = { static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr, char *buffer) { - ssize_t ret; + int offs = 0; uint32_t i, j; struct kfd_cache_properties *cache; @@ -340,30 +346,27 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr, cache = container_of(attr, struct kfd_cache_properties, attr); if (cache->gpu && kfd_devcgroup_check_permission(cache->gpu)) return -EPERM; - sysfs_show_32bit_prop(buffer, "processor_id_low", + sysfs_show_32bit_prop(buffer, offs, "processor_id_low", cache->processor_id_low); - sysfs_show_32bit_prop(buffer, "level", cache->cache_level); - sysfs_show_32bit_prop(buffer, "size", cache->cache_size); - sysfs_show_32bit_prop(buffer, "cache_line_size", cache->cacheline_size); - sysfs_show_32bit_prop(buffer, "cache_lines_per_tag", - cache->cachelines_per_tag); - sysfs_show_32bit_prop(buffer, 
"association", cache->cache_assoc); - sysfs_show_32bit_prop(buffer, "latency", cache->cache_latency); - sysfs_show_32bit_prop(buffer, "type", cache->cache_type); - snprintf(buffer, PAGE_SIZE, "%ssibling_map ", buffer); + sysfs_show_32bit_prop(buffer, offs, "level", cache->cache_level); + sysfs_show_32bit_prop(buffer, offs, "size", cache->cache_size); + sysfs_show_32bit_prop(buffer, offs, "cache_line_size", + cache->cacheline_size); + sysfs_show_32bit_prop(buffer, offs, "cache_lines_per_tag", + cache->cachelines_per_tag); + sysfs_show_32bit_prop(buffer, offs, "association", cache->cache_assoc); + sysfs_show_32bit_prop(buffer, offs, "latency", cache->cache_latency); + sysfs_show_32bit_prop(buffer, offs, "type", cache->cache_type); + offs += snprintf(buffer+offs, PAGE_SIZE-offs, "sibling_map "); for (i = 0; i < CRAT_SIBLINGMAP_SIZE; i++) - for (j = 0; j < sizeof(cache->sibling_map[0])*8; j++) { + for (j = 0; j < sizeof(cache->sibling_map[0])*8; j++) /* Check each bit */ - if (cache->sibling_map[i] & (1 << j)) - ret = snprintf(buffer, PAGE_SIZE, - "%s%d%s", buffer, 1, ","); - else - ret = snprintf(buffer, PAGE_SIZE, - "%s%d%s", buffer, 0, ","); - } + offs += snprintf(buffer+offs, PAGE_SIZE-offs, "%d,", + (cache->sibling_map[i] >> j) & 1); + /* Replace the last "," with end of line */ - *(buffer + strlen(buffer) - 1) = 0xA; - return ret; + buffer[offs-1] = '\n'; + return offs; } static const struct sysfs_ops cache_ops = { @@ -385,6 +388,7 @@ struct kfd_perf_attr { static ssize_t perf_show(struct kobject *kobj, struct kobj_attribute *attrs, char *buf) { + int offs = 0; struct kfd_perf_attr *attr; buf[0] = 0; @@ -392,7 +396,7 @@ static ssize_t perf_show(struct kobject *kobj, struct kobj_attribute *attrs, if (!attr->data) /* invalid data for PMC */ return 0; else - return sysfs_show_32bit_val(buf, attr->data); + return sysfs_show_32bit_val(buf, offs, attr->data); } #define KFD_PERF_DESC(_name, _data) \ @@ -411,6 +415,7 @@ static struct kfd_perf_attr perf_attr_iommu[] = { 
static ssize_t node_show(struct kobject *kobj, struct attribute *attr, char *buffer) { + int offs = 0; struct kfd_topology_device *dev; uint32_t log_max_watch_addr; @@ -422,7 +427,7 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, attr_gpuid); if (dev->gpu && kfd_devcgroup_check_permission(dev->gpu)) return -EPERM; - return sysfs_show_32bit_val(buffer, dev->gpu_id); + return sysfs_show_32bit_val(buffer, offs, dev->gpu_id); } if (strcmp(attr->name, "name") == 0) { @@ -431,69 +436,69 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, if (dev->gpu && kfd_devcgroup_check_permission(dev->gpu)) return -EPERM; - return sysfs_show_str_val(buffer, dev->node_props.name); + return sysfs_show_str_val(buffer, offs, dev->node_props.name); } dev = container_of(attr, struct kfd_topology_device, attr_props); if (dev->gpu && kfd_devcgroup_check_permission(dev->gpu)) return -EPERM; - sysfs_show_32bit_prop(buffer, "cpu_cores_count", - dev->node_props.cpu_cores_count); - sysfs_show_32bit_prop(buffer, "simd_count", - dev->node_props.simd_count); - sysfs_show_32bit_prop(buffer, "mem_banks_count", - dev->node_props.mem_banks_count); - sysfs_show_32bit_prop(buffer, "caches_count", - dev->node_props.caches_count); - sysfs_show_32bit_prop(buffer, "io_links_count", - dev->node_props.io_links_count); - sysfs_show_32bit_prop(buffer, "cpu_core_id_base", - dev->node_props.cpu_core_id_base); - sysfs_show_32bit_prop(buffer, "simd_id_base", - dev->node_props.simd_id_base); - sysfs_show_32bit_prop(buffer, "max_waves_per_simd", - dev->node_props.max_waves_per_simd); - sysfs_show_32bit_prop(buffer, "lds_size_in_kb", - dev->node_props.lds_size_in_kb); - sysfs_show_32bit_prop(buffer, "gds_size_in_kb", - dev->node_props.gds_size_in_kb); - sysfs_show_32bit_prop(buffer, "num_gws", - dev->node_props.num_gws); - sysfs_show_32bit_prop(buffer, "wave_front_size", - dev->node_props.wave_front_size); - sysfs_show_32bit_prop(buffer, "array_count", - 
dev->node_props.array_count); - sysfs_show_32bit_prop(buffer, "simd_arrays_per_engine", - dev->node_props.simd_arrays_per_engine); - sysfs_show_32bit_prop(buffer, "cu_per_simd_array", - dev->node_props.cu_per_simd_array); - sysfs_show_32bit_prop(buffer, "simd_per_cu", - dev->node_props.simd_per_cu); - sysfs_show_32bit_prop(buffer, "max_slots_scratch_cu", - dev->node_props.max_slots_scratch_cu); - sysfs_show_32bit_prop(buffer, "vendor_id", - dev->node_props.vendor_id); - sysfs_show_32bit_prop(buffer, "device_id", - dev->node_props.device_id); - sysfs_show_32bit_prop(buffer, "location_id", - dev->node_props.location_id); - sysfs_show_32bit_prop(buffer, "domain", - dev->node_props.domain); - sysfs_show_32bit_prop(buffer, "drm_render_minor", - dev->node_props.drm_render_minor); - sysfs_show_64bit_prop(buffer, "hive_id", - dev->node_props.hive_id); - sysfs_show_32bit_prop(buffer, "num_sdma_engines", - dev->node_props.num_sdma_engines); - sysfs_show_32bit_prop(buffer, "num_sdma_xgmi_engines", - dev->node_props.num_sdma_xgmi_engines); - sysfs_show_32bit_prop(buffer, "num_sdma_queues_per_engine", - dev->node_props.num_sdma_queues_per_engine); - sysfs_show_32bit_prop(buffer, "num_cp_queues", - dev->node_props.num_cp_queues); - sysfs_show_64bit_prop(buffer, "unique_id", - dev->node_props.unique_id); + sysfs_show_32bit_prop(buffer, offs, "cpu_cores_count", + dev->node_props.cpu_cores_count); + sysfs_show_32bit_prop(buffer, offs, "simd_count", + dev->node_props.simd_count); + sysfs_show_32bit_prop(buffer, offs, "mem_banks_count", + dev->node_props.mem_banks_count); + sysfs_show_32bit_prop(buffer, offs, "caches_count", + dev->node_props.caches_count); + sysfs_show_32bit_prop(buffer, offs, "io_links_count", + dev->node_props.io_links_count); + sysfs_show_32bit_prop(buffer, offs, "cpu_core_id_base", + dev->node_props.cpu_core_id_base); + sysfs_show_32bit_prop(buffer, offs, "simd_id_base", + dev->node_props.simd_id_base); + sysfs_show_32bit_prop(buffer, offs, "max_waves_per_simd", 
+ dev->node_props.max_waves_per_simd); + sysfs_show_32bit_prop(buffer, offs, "lds_size_in_kb", + dev->node_props.lds_size_in_kb); + sysfs_show_32bit_prop(buffer, offs, "gds_size_in_kb", + dev->node_props.gds_size_in_kb); + sysfs_show_32bit_prop(buffer, offs, "num_gws", + dev->node_props.num_gws); + sysfs_show_32bit_prop(buffer, offs, "wave_front_size", + dev->node_props.wave_front_size); + sysfs_show_32bit_prop(buffer, offs, "array_count", + dev->node_props.array_count); + sysfs_show_32bit_prop(buffer, offs, "simd_arrays_per_engine", + dev->node_props.simd_arrays_per_engine); + sysfs_show_32bit_prop(buffer, offs, "cu_per_simd_array", + dev->node_props.cu_per_simd_array); + sysfs_show_32bit_prop(buffer, offs, "simd_per_cu", + dev->node_props.simd_per_cu); + sysfs_show_32bit_prop(buffer, offs, "max_slots_scratch_cu", + dev->node_props.max_slots_scratch_cu); + sysfs_show_32bit_prop(buffer, offs, "vendor_id", + dev->node_props.vendor_id); + sysfs_show_32bit_prop(buffer, offs, "device_id", + dev->node_props.device_id); + sysfs_show_32bit_prop(buffer, offs, "location_id", + dev->node_props.location_id); + sysfs_show_32bit_prop(buffer, offs, "domain", + dev->node_props.domain); + sysfs_show_32bit_prop(buffer, offs, "drm_render_minor", + dev->node_props.drm_render_minor); + sysfs_show_64bit_prop(buffer, offs, "hive_id", + dev->node_props.hive_id); + sysfs_show_32bit_prop(buffer, offs, "num_sdma_engines", + dev->node_props.num_sdma_engines); + sysfs_show_32bit_prop(buffer, offs, "num_sdma_xgmi_engines", + dev->node_props.num_sdma_xgmi_engines); + sysfs_show_32bit_prop(buffer, offs, "num_sdma_queues_per_engine", + dev->node_props.num_sdma_queues_per_engine); + sysfs_show_32bit_prop(buffer, offs, "num_cp_queues", + dev->node_props.num_cp_queues); + sysfs_show_64bit_prop(buffer, offs, "unique_id", + dev->node_props.unique_id); if (dev->gpu) { log_max_watch_addr = @@ -513,22 +518,21 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, 
dev->node_props.capability |= HSA_CAP_AQL_QUEUE_DOUBLE_MAP; - sysfs_show_32bit_prop(buffer, "max_engine_clk_fcompute", + sysfs_show_32bit_prop(buffer, offs, "max_engine_clk_fcompute", dev->node_props.max_engine_clk_fcompute); - sysfs_show_64bit_prop(buffer, "local_mem_size", - (unsigned long long int) 0); + sysfs_show_64bit_prop(buffer, offs, "local_mem_size", 0ULL); - sysfs_show_32bit_prop(buffer, "fw_version", - dev->gpu->mec_fw_version); - sysfs_show_32bit_prop(buffer, "capability", - dev->node_props.capability); - sysfs_show_32bit_prop(buffer, "sdma_fw_version", - dev->gpu->sdma_fw_version); + sysfs_show_32bit_prop(buffer, offs, "fw_version", + dev->gpu->mec_fw_version); + sysfs_show_32bit_prop(buffer, offs, "capability", + dev->node_props.capability); + sysfs_show_32bit_prop(buffer, offs, "sdma_fw_version", + dev->gpu->sdma_fw_version); } - return sysfs_show_32bit_prop(buffer, "max_engine_clk_ccompute", - cpufreq_quick_get_max(0)/1000); + return sysfs_show_32bit_prop(buffer, offs, "max_engine_clk_ccompute", + cpufreq_quick_get_max(0)/1000); } static const struct sysfs_ops node_ops = { @@ -632,8 +636,10 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, ret = kobject_init_and_add(dev->kobj_node, &node_type, sys_props.kobj_nodes, "%d", id); - if (ret < 0) + if (ret < 0) { + kobject_put(dev->kobj_node); return ret; + } dev->kobj_mem = kobject_create_and_add("mem_banks", dev->kobj_node); if (!dev->kobj_mem) @@ -680,8 +686,10 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, return -ENOMEM; ret = kobject_init_and_add(mem->kobj, &mem_type, dev->kobj_mem, "%d", i); - if (ret < 0) + if (ret < 0) { + kobject_put(mem->kobj); return ret; + } mem->attr.name = "properties"; mem->attr.mode = KFD_SYSFS_FILE_MODE; @@ -699,8 +707,10 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, return -ENOMEM; ret = kobject_init_and_add(cache->kobj, &cache_type, dev->kobj_cache, "%d", i); - if (ret < 0) + if (ret < 0) { 
+ kobject_put(cache->kobj); return ret; + } cache->attr.name = "properties"; cache->attr.mode = KFD_SYSFS_FILE_MODE; @@ -718,8 +728,10 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, return -ENOMEM; ret = kobject_init_and_add(iolink->kobj, &iolink_type, dev->kobj_iolink, "%d", i); - if (ret < 0) + if (ret < 0) { + kobject_put(iolink->kobj); return ret; + } iolink->attr.name = "properties"; iolink->attr.mode = KFD_SYSFS_FILE_MODE; @@ -798,8 +810,10 @@ static int kfd_topology_update_sysfs(void) ret = kobject_init_and_add(sys_props.kobj_topology, &sysprops_type, &kfd_device->kobj, "topology"); - if (ret < 0) + if (ret < 0) { + kobject_put(sys_props.kobj_topology); return ret; + } sys_props.kobj_nodes = kobject_create_and_add("nodes", sys_props.kobj_topology); @@ -1359,6 +1373,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu) case CHIP_NAVI10: case CHIP_NAVI12: case CHIP_NAVI14: + case CHIP_SIENNA_CICHLID: dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 << HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); |