diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd')
45 files changed, 5041 insertions, 2006 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/Kconfig b/drivers/gpu/drm/amd/amdkfd/Kconfig index c3613604a4f8..a1a35d4d594b 100644 --- a/drivers/gpu/drm/amd/amdkfd/Kconfig +++ b/drivers/gpu/drm/amd/amdkfd/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # Heterogenous system architecture configuration # diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index 69ec96998bb9..48155060a57c 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -36,16 +36,19 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ $(AMDKFD_PATH)/kfd_mqd_manager_cik.o \ $(AMDKFD_PATH)/kfd_mqd_manager_vi.o \ $(AMDKFD_PATH)/kfd_mqd_manager_v9.o \ + $(AMDKFD_PATH)/kfd_mqd_manager_v10.o \ $(AMDKFD_PATH)/kfd_kernel_queue.o \ $(AMDKFD_PATH)/kfd_kernel_queue_cik.o \ $(AMDKFD_PATH)/kfd_kernel_queue_vi.o \ $(AMDKFD_PATH)/kfd_kernel_queue_v9.o \ + $(AMDKFD_PATH)/kfd_kernel_queue_v10.o \ $(AMDKFD_PATH)/kfd_packet_manager.o \ $(AMDKFD_PATH)/kfd_process_queue_manager.o \ $(AMDKFD_PATH)/kfd_device_queue_manager.o \ $(AMDKFD_PATH)/kfd_device_queue_manager_cik.o \ $(AMDKFD_PATH)/kfd_device_queue_manager_vi.o \ $(AMDKFD_PATH)/kfd_device_queue_manager_v9.o \ + $(AMDKFD_PATH)/kfd_device_queue_manager_v10.o \ $(AMDKFD_PATH)/kfd_interrupt.o \ $(AMDKFD_PATH)/kfd_events.o \ $(AMDKFD_PATH)/cik_event_interrupt.o \ diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 177d1e5329a5..9f59ba93cfe0 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -33,7 +33,9 @@ static bool cik_event_interrupt_isr(struct kfd_dev *dev, const struct cik_ih_ring_entry *ihre = (const struct cik_ih_ring_entry *)ih_ring_entry; const struct kfd2kgd_calls *f2g = dev->kfd2kgd; - unsigned int vmid, pasid; + unsigned int vmid; + uint16_t pasid; + bool ret; /* This workaround is due to HW/FW limitation on Hawaii that * VMID and PASID are not written into ih_ring_entry @@ -48,13 +50,13 @@ static bool cik_event_interrupt_isr(struct kfd_dev *dev, *tmp_ihre = *ihre; vmid = f2g->read_vmid_from_vmfault_reg(dev->kgd); - pasid = f2g->get_atc_vmid_pasid_mapping_pasid(dev->kgd, vmid); + ret = f2g->get_atc_vmid_pasid_mapping_info(dev->kgd, vmid, &pasid); tmp_ihre->ring_id &= 0x000000ff; tmp_ihre->ring_id |= vmid << 8; tmp_ihre->ring_id |= pasid << 16; - return (pasid != 0) && + return ret && (pasid != 0) && vmid >= dev->vm_info.first_vmid_kfd && vmid <= dev->vm_info.last_vmid_kfd; } diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h index 3621efbd5759..d3400da6ab64 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h @@ -21,7 +21,7 @@ */ static const uint32_t cwsr_trap_gfx8_hex[] = { - 0xbf820001, 0xbf82012b, + 0xbf820001, 0xbf820121, 0xb8f4f802, 0x89748674, 0xb8f5f803, 0x8675ff75, 0x00000400, 0xbf850017, @@ -36,12 +36,7 @@ static const uint32_t cwsr_trap_gfx8_hex[] = { 0x8671ff71, 0x0000ffff, 0x8f728374, 0xb972e0c2, 0xbf800002, 0xb9740002, - 0xbe801f70, 0xb8f5f803, - 0x8675ff75, 0x00000100, - 0xbf840006, 0xbefa0080, - 0xb97a0203, 0x8671ff71, - 0x0000ffff, 0x80f08870, - 0x82f18071, 0xbefa0080, + 0xbe801f70, 0xbefa0080, 0xb97a0283, 0xbef60068, 0xbef70069, 0xb8fa1c07, 0x8e7a9c7a, 0x87717a71, @@ -279,118 +274,122 @@ static const uint32_t cwsr_trap_gfx8_hex[] = { static const uint32_t cwsr_trap_gfx9_hex[] = { - 0xbf820001, 0xbf82015d, + 0xbf820001, 0xbf820248, 0xb8f8f802, 0x89788678, - 0xb8f1f803, 0x866eff71, - 0x00000400, 0xbf850037, - 0x866eff71, 0x00000800, - 0xbf850003, 0x866eff71, - 0x00000100, 0xbf840008, + 0xb8eef801, 0x866eff6e, + 0x00000800, 0xbf840003, 0x866eff78, 0x00002000, - 0xbf840001, 0xbf810000, - 0x8778ff78, 0x00002000, - 0x80ec886c, 0x82ed806d, - 0xb8eef807, 0x866fff6e, - 0x001f8000, 0x8e6f8b6f, - 0x8977ff77, 0xfc000000, - 0x87776f77, 0x896eff6e, - 0x001f8000, 0xb96ef807, - 0xb8f0f812, 0xb8f1f813, - 0x8ef08870, 0xc0071bb8, - 0x00000000, 0xbf8cc07f, - 0xc0071c38, 0x00000008, - 0xbf8cc07f, 0x86ee6e6e, - 0xbf840001, 0xbe801d6e, - 0xb8f1f803, 0x8671ff71, - 0x000001ff, 0xbf850002, - 0x806c846c, 0x826d806d, + 0xbf840016, 0xb8fbf803, + 0x866eff7b, 0x00000400, + 0xbf85003b, 0x866eff7b, + 0x00000800, 0xbf850003, + 0x866eff7b, 0x00000100, + 0xbf84000c, 0x866eff78, + 0x00002000, 0xbf840005, + 0xbf8e0010, 0xb8eef803, + 0x866eff6e, 0x00000400, + 0xbf84fffb, 0x8778ff78, + 0x00002000, 0x80ec886c, + 0x82ed806d, 0xb8eef807, + 0x866fff6e, 0x001f8000, + 0x8e6f8b6f, 0x8977ff77, + 0xfc000000, 0x87776f77, + 0x896eff6e, 0x001f8000, + 0xb96ef807, 0xb8faf812, + 0xb8fbf813, 0x8efa887a, + 0xc0071bbd, 0x00000000, + 0xbf8cc07f, 0xc0071ebd, + 0x00000008, 0xbf8cc07f, + 0x86ee6e6e, 0xbf840001, + 0xbe801d6e, 0xb8fbf803, + 0x867bff7b, 0x000001ff, + 0xbf850002, 0x806c846c, + 0x826d806d, 0x866dff6d, + 0x0000ffff, 0x8f6e8b77, + 0x866eff6e, 0x001f8000, + 0xb96ef807, 0x86fe7e7e, + 0x86ea6a6a, 0x8f6e8378, + 0xb96ee0c2, 0xbf800002, + 0xb9780002, 0xbe801f6c, 0x866dff6d, 0x0000ffff, - 0x8f6e8b77, 0x866eff6e, - 0x001f8000, 0xb96ef807, - 0x86fe7e7e, 0x86ea6a6a, - 0x8f6e8378, 0xb96ee0c2, - 0xbf800002, 0xb9780002, - 0xbe801f6c, 0x866dff6d, - 0x0000ffff, 0xbef00080, - 0xb9700283, 0xb8f02407, - 0x8e709c70, 0x876d706d, - 0xb8f003c7, 0x8e709b70, - 0x876d706d, 0xb8f0f807, - 0x8670ff70, 0x00007fff, - 0xb970f807, 0xbeee007e, - 0xbeef007f, 0xbefe0180, - 0xbf900004, 0x87708478, - 0xb970f802, 0xbf8e0002, - 0xbf88fffe, 0xb8f02a05, - 0x80708170, 0x8e708a70, - 0xb8f11605, 0x80718171, - 0x8e718671, 0x80707170, - 0x80707e70, 0x8271807f, - 0x8671ff71, 0x0000ffff, - 0xc0471cb8, 0x00000040, - 0xbf8cc07f, 0xc04b1d38, - 0x00000048, 0xbf8cc07f, - 0xc0431e78, 0x00000058, - 0xbf8cc07f, 0xc0471eb8, - 0x0000005c, 0xbf8cc07f, + 0xbefa0080, 0xb97a0283, + 0xb8fa2407, 0x8e7a9b7a, + 0x876d7a6d, 0xb8fa03c7, + 0x8e7a9a7a, 0x876d7a6d, + 0xb8faf807, 0x867aff7a, + 0x00007fff, 0xb97af807, + 0xbeee007e, 0xbeef007f, + 0xbefe0180, 0xbf900004, + 0x877a8478, 0xb97af802, + 0xbf8e0002, 0xbf88fffe, + 0xb8fa2a05, 0x807a817a, + 0x8e7a8a7a, 0xb8fb1605, + 0x807b817b, 0x8e7b867b, + 0x807a7b7a, 0x807a7e7a, + 0x827b807f, 0x867bff7b, + 0x0000ffff, 0xc04b1c3d, + 0x00000050, 0xbf8cc07f, + 0xc04b1d3d, 0x00000060, + 0xbf8cc07f, 0xc0431e7d, + 0x00000074, 0xbf8cc07f, 0xbef4007e, 0x8675ff7f, 0x0000ffff, 0x8775ff75, 0x00040000, 0xbef60080, 0xbef700ff, 0x00807fac, - 0x8670ff7f, 0x08000000, - 0x8f708370, 0x87777077, - 0x8670ff7f, 0x70000000, - 0x8f708170, 0x87777077, - 0xbefb007c, 0xbefa0080, - 0xb8fa2a05, 0x807a817a, - 0x8e7a8a7a, 0xb8f01605, - 0x80708170, 0x8e708670, - 0x807a707a, 0xbef60084, + 0x867aff7f, 0x08000000, + 0x8f7a837a, 0x87777a77, + 0x867aff7f, 0x70000000, + 0x8f7a817a, 0x87777a77, + 0xbef1007c, 0xbef00080, + 0xb8f02a05, 0x80708170, + 0x8e708a70, 0xb8fa1605, + 0x807a817a, 0x8e7a867a, + 0x80707a70, 0xbef60084, 0xbef600ff, 0x01000000, - 0xbefe007c, 0xbefc007a, - 0xc0611efa, 0x0000007c, - 0xbf8cc07f, 0x807a847a, + 0xbefe007c, 0xbefc0070, + 0xc0611c7a, 0x0000007c, + 0xbf8cc07f, 0x80708470, 0xbefc007e, 0xbefe007c, - 0xbefc007a, 0xc0611b3a, + 0xbefc0070, 0xc0611b3a, 0x0000007c, 0xbf8cc07f, - 0x807a847a, 0xbefc007e, - 0xbefe007c, 0xbefc007a, + 0x80708470, 0xbefc007e, + 0xbefe007c, 0xbefc0070, 0xc0611b7a, 0x0000007c, - 0xbf8cc07f, 0x807a847a, + 0xbf8cc07f, 0x80708470, 0xbefc007e, 0xbefe007c, - 0xbefc007a, 0xc0611bba, + 0xbefc0070, 0xc0611bba, 0x0000007c, 0xbf8cc07f, - 0x807a847a, 0xbefc007e, - 0xbefe007c, 0xbefc007a, + 0x80708470, 0xbefc007e, + 0xbefe007c, 0xbefc0070, 0xc0611bfa, 0x0000007c, - 0xbf8cc07f, 0x807a847a, + 0xbf8cc07f, 0x80708470, 0xbefc007e, 0xbefe007c, - 0xbefc007a, 0xc0611e3a, + 0xbefc0070, 0xc0611e3a, 0x0000007c, 0xbf8cc07f, - 0x807a847a, 0xbefc007e, - 0xb8f1f803, 0xbefe007c, - 0xbefc007a, 0xc0611c7a, + 0x80708470, 0xbefc007e, + 0xb8fbf803, 0xbefe007c, + 0xbefc0070, 0xc0611efa, 0x0000007c, 0xbf8cc07f, - 0x807a847a, 0xbefc007e, - 0xbefe007c, 0xbefc007a, + 0x80708470, 0xbefc007e, + 0xbefe007c, 0xbefc0070, 0xc0611a3a, 0x0000007c, - 0xbf8cc07f, 0x807a847a, + 0xbf8cc07f, 0x80708470, 0xbefc007e, 0xbefe007c, - 0xbefc007a, 0xc0611a7a, + 0xbefc0070, 0xc0611a7a, 0x0000007c, 0xbf8cc07f, - 0x807a847a, 0xbefc007e, - 0xb8fbf801, 0xbefe007c, - 0xbefc007a, 0xc0611efa, + 0x80708470, 0xbefc007e, + 0xb8f1f801, 0xbefe007c, + 0xbefc0070, 0xc0611c7a, 0x0000007c, 0xbf8cc07f, - 0x807a847a, 0xbefc007e, - 0x8670ff7f, 0x04000000, - 0xbeef0080, 0x876f6f70, - 0xb8fa2a05, 0x807a817a, - 0x8e7a8a7a, 0xb8f11605, - 0x80718171, 0x8e718471, - 0x8e768271, 0xbef600ff, + 0x80708470, 0xbefc007e, + 0x867aff7f, 0x04000000, + 0xbeef0080, 0x876f6f7a, + 0xb8f02a05, 0x80708170, + 0x8e708a70, 0xb8fb1605, + 0x807b817b, 0x8e7b847b, + 0x8e76827b, 0xbef600ff, 0x01000000, 0xbef20174, - 0x80747a74, 0x82758075, + 0x80747074, 0x82758075, 0xbefc0080, 0xbf800000, 0xbe802b00, 0xbe822b02, 0xbe842b04, 0xbe862b06, @@ -403,73 +402,1047 @@ static const uint32_t cwsr_trap_gfx9_hex[] = { 0xbf8cc07f, 0xc06b033a, 0x00000030, 0xbf8cc07f, 0x8074c074, 0x82758075, - 0x807c907c, 0xbf0a717c, + 0x807c907c, 0xbf0a7b7c, 0xbf85ffe7, 0xbef40172, - 0xbefa0080, 0xbefe00c1, + 0xbef00080, 0xbefe00c1, 0xbeff00c1, 0xbee80080, 0xbee90080, 0xbef600ff, - 0x01000000, 0xe0724000, - 0x7a1d0000, 0xe0724100, - 0x7a1d0100, 0xe0724200, - 0x7a1d0200, 0xe0724300, - 0x7a1d0300, 0xbefe00c1, - 0xbeff00c1, 0xb8f14306, - 0x8671c171, 0xbf84002c, - 0xbf8a0000, 0x8670ff6f, - 0x04000000, 0xbf840028, - 0x8e718671, 0x8e718271, - 0xbef60071, 0xb8fa2a05, - 0x807a817a, 0x8e7a8a7a, - 0xb8f01605, 0x80708170, - 0x8e708670, 0x807a707a, - 0x807aff7a, 0x00000080, + 0x01000000, 0x867aff78, + 0x00400000, 0xbf850003, + 0xb8faf803, 0x897a7aff, + 0x10000000, 0xbf85004d, + 0xbe840080, 0xd2890000, + 0x00000900, 0x80048104, + 0xd2890001, 0x00000900, + 0x80048104, 0xd2890002, + 0x00000900, 0x80048104, + 0xd2890003, 0x00000900, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbe840080, + 0xd2890000, 0x00000901, + 0x80048104, 0xd2890001, + 0x00000901, 0x80048104, + 0xd2890002, 0x00000901, + 0x80048104, 0xd2890003, + 0x00000901, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0xbe840080, 0xd2890000, + 0x00000902, 0x80048104, + 0xd2890001, 0x00000902, + 0x80048104, 0xd2890002, + 0x00000902, 0x80048104, + 0xd2890003, 0x00000902, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbe840080, + 0xd2890000, 0x00000903, + 0x80048104, 0xd2890001, + 0x00000903, 0x80048104, + 0xd2890002, 0x00000903, + 0x80048104, 0xd2890003, + 0x00000903, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0xbf820008, 0xe0724000, + 0x701d0000, 0xe0724100, + 0x701d0100, 0xe0724200, + 0x701d0200, 0xe0724300, + 0x701d0300, 0xbefe00c1, + 0xbeff00c1, 0xb8fb4306, + 0x867bc17b, 0xbf840063, + 0xbf8a0000, 0x867aff6f, + 0x04000000, 0xbf84005f, + 0x8e7b867b, 0x8e7b827b, + 0xbef6007b, 0xb8f02a05, + 0x80708170, 0x8e708a70, + 0xb8fa1605, 0x807a817a, + 0x8e7a867a, 0x80707a70, + 0x8070ff70, 0x00000080, 0xbef600ff, 0x01000000, 0xbefc0080, 0xd28c0002, 0x000100c1, 0xd28d0003, - 0x000204c1, 0xd1060002, - 0x00011103, 0x7e0602ff, - 0x00000200, 0xbefc00ff, - 0x00010000, 0xbe800077, - 0x8677ff77, 0xff7fffff, - 0x8777ff77, 0x00058000, - 0xd8ec0000, 0x00000002, - 0xbf8cc07f, 0xe0765000, - 0x7a1d0002, 0x68040702, - 0xd0c9006a, 0x0000e302, - 0xbf87fff7, 0xbef70000, - 0xbefa00ff, 0x00000400, + 0x000204c1, 0x867aff78, + 0x00400000, 0xbf850003, + 0xb8faf803, 0x897a7aff, + 0x10000000, 0xbf850030, + 0x24040682, 0xd86e4000, + 0x00000002, 0xbf8cc07f, + 0xbe840080, 0xd2890000, + 0x00000900, 0x80048104, + 0xd2890001, 0x00000900, + 0x80048104, 0xd2890002, + 0x00000900, 0x80048104, + 0xd2890003, 0x00000900, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbe840080, + 0xd2890000, 0x00000901, + 0x80048104, 0xd2890001, + 0x00000901, 0x80048104, + 0xd2890002, 0x00000901, + 0x80048104, 0xd2890003, + 0x00000901, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0x680404ff, 0x00000200, + 0xd0c9006a, 0x0000f702, + 0xbf87ffd2, 0xbf820015, + 0xd1060002, 0x00011103, + 0x7e0602ff, 0x00000200, + 0xbefc00ff, 0x00010000, + 0xbe800077, 0x8677ff77, + 0xff7fffff, 0x8777ff77, + 0x00058000, 0xd8ec0000, + 0x00000002, 0xbf8cc07f, + 0xe0765000, 0x701d0002, + 0x68040702, 0xd0c9006a, + 0x0000f702, 0xbf87fff7, + 0xbef70000, 0xbef000ff, + 0x00000400, 0xbefe00c1, + 0xbeff00c1, 0xb8fb2a05, + 0x807b817b, 0x8e7b827b, + 0x8e76887b, 0xbef600ff, + 0x01000000, 0xbefc0084, + 0xbf0a7b7c, 0xbf84006d, + 0xbf11017c, 0x807bff7b, + 0x00001000, 0x867aff78, + 0x00400000, 0xbf850003, + 0xb8faf803, 0x897a7aff, + 0x10000000, 0xbf850051, + 0xbe840080, 0xd2890000, + 0x00000900, 0x80048104, + 0xd2890001, 0x00000900, + 0x80048104, 0xd2890002, + 0x00000900, 0x80048104, + 0xd2890003, 0x00000900, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbe840080, + 0xd2890000, 0x00000901, + 0x80048104, 0xd2890001, + 0x00000901, 0x80048104, + 0xd2890002, 0x00000901, + 0x80048104, 0xd2890003, + 0x00000901, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0xbe840080, 0xd2890000, + 0x00000902, 0x80048104, + 0xd2890001, 0x00000902, + 0x80048104, 0xd2890002, + 0x00000902, 0x80048104, + 0xd2890003, 0x00000902, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbe840080, + 0xd2890000, 0x00000903, + 0x80048104, 0xd2890001, + 0x00000903, 0x80048104, + 0xd2890002, 0x00000903, + 0x80048104, 0xd2890003, + 0x00000903, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0x807c847c, 0xbf0a7b7c, + 0xbf85ffb1, 0xbf9c0000, + 0xbf820012, 0x7e000300, + 0x7e020301, 0x7e040302, + 0x7e060303, 0xe0724000, + 0x701d0000, 0xe0724100, + 0x701d0100, 0xe0724200, + 0x701d0200, 0xe0724300, + 0x701d0300, 0x807c847c, + 0x8070ff70, 0x00000400, + 0xbf0a7b7c, 0xbf85ffef, + 0xbf9c0000, 0xbf8200da, + 0xbef4007e, 0x8675ff7f, + 0x0000ffff, 0x8775ff75, + 0x00040000, 0xbef60080, + 0xbef700ff, 0x00807fac, + 0x866eff7f, 0x08000000, + 0x8f6e836e, 0x87776e77, + 0x866eff7f, 0x70000000, + 0x8f6e816e, 0x87776e77, + 0x866eff7f, 0x04000000, + 0xbf84001e, 0xbefe00c1, + 0xbeff00c1, 0xb8ef4306, + 0x866fc16f, 0xbf840019, + 0x8e6f866f, 0x8e6f826f, + 0xbef6006f, 0xb8f82a05, + 0x80788178, 0x8e788a78, + 0xb8ee1605, 0x806e816e, + 0x8e6e866e, 0x80786e78, + 0x8078ff78, 0x00000080, + 0xbef600ff, 0x01000000, + 0xbefc0080, 0xe0510000, + 0x781d0000, 0xe0510100, + 0x781d0000, 0x807cff7c, + 0x00000200, 0x8078ff78, + 0x00000200, 0xbf0a6f7c, + 0xbf85fff6, 0xbef80080, 0xbefe00c1, 0xbeff00c1, - 0xb8f12a05, 0x80718171, - 0x8e718271, 0x8e768871, + 0xb8ef2a05, 0x806f816f, + 0x8e6f826f, 0x8e76886f, 0xbef600ff, 0x01000000, - 0xbefc0084, 0xbf0a717c, - 0xbf840015, 0xbf11017c, - 0x8071ff71, 0x00001000, + 0xbeee0078, 0x8078ff78, + 0x00000400, 0xbefc0084, + 0xbf11087c, 0x806fff6f, + 0x00008000, 0xe0524000, + 0x781d0000, 0xe0524100, + 0x781d0100, 0xe0524200, + 0x781d0200, 0xe0524300, + 0x781d0300, 0xbf8c0f70, 0x7e000300, 0x7e020301, 0x7e040302, 0x7e060303, - 0xe0724000, 0x7a1d0000, - 0xe0724100, 0x7a1d0100, - 0xe0724200, 0x7a1d0200, - 0xe0724300, 0x7a1d0300, + 0x807c847c, 0x8078ff78, + 0x00000400, 0xbf0a6f7c, + 0xbf85ffee, 0xbf9c0000, + 0xe0524000, 0x6e1d0000, + 0xe0524100, 0x6e1d0100, + 0xe0524200, 0x6e1d0200, + 0xe0524300, 0x6e1d0300, + 0xb8f82a05, 0x80788178, + 0x8e788a78, 0xb8ee1605, + 0x806e816e, 0x8e6e866e, + 0x80786e78, 0x80f8c078, + 0xb8ef1605, 0x806f816f, + 0x8e6f846f, 0x8e76826f, + 0xbef600ff, 0x01000000, + 0xbefc006f, 0xc031003a, + 0x00000078, 0x80f8c078, + 0xbf8cc07f, 0x80fc907c, + 0xbf800000, 0xbe802d00, + 0xbe822d02, 0xbe842d04, + 0xbe862d06, 0xbe882d08, + 0xbe8a2d0a, 0xbe8c2d0c, + 0xbe8e2d0e, 0xbf06807c, + 0xbf84fff0, 0xb8f82a05, + 0x80788178, 0x8e788a78, + 0xb8ee1605, 0x806e816e, + 0x8e6e866e, 0x80786e78, + 0xbef60084, 0xbef600ff, + 0x01000000, 0xc0211bfa, + 0x00000078, 0x80788478, + 0xc0211b3a, 0x00000078, + 0x80788478, 0xc0211b7a, + 0x00000078, 0x80788478, + 0xc0211c3a, 0x00000078, + 0x80788478, 0xc0211c7a, + 0x00000078, 0x80788478, + 0xc0211eba, 0x00000078, + 0x80788478, 0xc0211efa, + 0x00000078, 0x80788478, + 0xc0211a3a, 0x00000078, + 0x80788478, 0xc0211a7a, + 0x00000078, 0x80788478, + 0xc0211cfa, 0x00000078, + 0x80788478, 0xbf8cc07f, + 0xbefc006f, 0xbefe0070, + 0xbeff0071, 0x866f7bff, + 0x000003ff, 0xb96f4803, + 0x866f7bff, 0xfffff800, + 0x8f6f8b6f, 0xb96fa2c3, + 0xb973f801, 0xb8ee2a05, + 0x806e816e, 0x8e6e8a6e, + 0xb8ef1605, 0x806f816f, + 0x8e6f866f, 0x806e6f6e, + 0x806e746e, 0x826f8075, + 0x866fff6f, 0x0000ffff, + 0xc00b1c37, 0x00000050, + 0xc00b1d37, 0x00000060, + 0xc0031e77, 0x00000074, + 0xbf8cc07f, 0x866fff6d, + 0xf8000000, 0x8f6f9b6f, + 0x8e6f906f, 0xbeee0080, + 0x876e6f6e, 0x866fff6d, + 0x04000000, 0x8f6f9a6f, + 0x8e6f8f6f, 0x876e6f6e, + 0x866fff7a, 0x00800000, + 0x8f6f976f, 0xb96ef807, + 0x866dff6d, 0x0000ffff, + 0x86fe7e7e, 0x86ea6a6a, + 0x8f6e837a, 0xb96ee0c2, + 0xbf800002, 0xb97a0002, + 0xbf8a0000, 0x95806f6c, + 0xbf810000, 0x00000000, +}; + +static const uint32_t cwsr_trap_gfx10_hex[] = { + 0xbf820001, 0xbf8201c1, + 0xb0804004, 0xb978f802, + 0x8a788678, 0xb971f803, + 0x876eff71, 0x00000400, + 0xbf850033, 0x876eff71, + 0x00000100, 0xbf840002, + 0x8878ff78, 0x00002000, + 0x8a77ff77, 0xff000000, + 0xb96ef807, 0x876fff6e, + 0x02000000, 0x8f6f866f, + 0x88776f77, 0x876fff6e, + 0x003f8000, 0x8f6f896f, + 0x88776f77, 0x8a6eff6e, + 0x023f8000, 0xb9eef807, + 0xb97af812, 0xb97bf813, + 0x8ffa887a, 0xf4051bbd, + 0xfa000000, 0xbf8cc07f, + 0xf4051ebd, 0xfa000008, + 0xbf8cc07f, 0x87ee6e6e, + 0xbf840001, 0xbe80206e, + 0xb971f803, 0x8771ff71, + 0x000001ff, 0xbf850002, + 0x806c846c, 0x826d806d, + 0x876dff6d, 0x0000ffff, + 0x906e8977, 0x876fff6e, + 0x003f8000, 0x906e8677, + 0x876eff6e, 0x02000000, + 0x886e6f6e, 0xb9eef807, + 0x87fe7e7e, 0x87ea6a6a, + 0xb9f8f802, 0xbe80226c, + 0xb971f803, 0x8771ff71, + 0x00000100, 0xbf840006, + 0xbef60380, 0xb9f60203, + 0x876dff6d, 0x0000ffff, + 0x80ec886c, 0x82ed806d, + 0xbef60380, 0xb9f60283, + 0xb972f816, 0xb9762c07, + 0x8f769a76, 0x886d766d, + 0xb97603c7, 0x8f769976, + 0x886d766d, 0xb9760647, + 0x8f769876, 0x886d766d, + 0xb976f807, 0x8776ff76, + 0x00007fff, 0xb9f6f807, + 0xbeee037e, 0xbeef037f, + 0xbefe0480, 0xbf900004, + 0xbf8e0002, 0xbf88fffe, + 0xbef4037e, 0x8775ff7f, + 0x0000ffff, 0x8875ff75, + 0x00040000, 0xbef60380, + 0xbef703ff, 0x10807fac, + 0x8776ff7f, 0x08000000, + 0x90768376, 0x88777677, + 0x8776ff7f, 0x70000000, + 0x90768176, 0x88777677, + 0xbefb037c, 0xbefa0380, + 0xb97302dc, 0x8f739973, + 0x8873737f, 0xb97a2a05, + 0x807a817a, 0x907c9973, + 0x877c817c, 0xbf06817c, + 0xbf850002, 0x8f7a897a, + 0xbf820001, 0x8f7a8a7a, + 0xb9761e06, 0x8f768a76, + 0x807a767a, 0x807aff7a, + 0x00000200, 0xbef603ff, + 0x01000000, 0xbefe037c, + 0xbefc037a, 0xf4611efa, + 0xf8000000, 0x807a847a, + 0xbefc037e, 0xbefe037c, + 0xbefc037a, 0xf4611b3a, + 0xf8000000, 0x807a847a, + 0xbefc037e, 0xbefe037c, + 0xbefc037a, 0xf4611b7a, + 0xf8000000, 0x807a847a, + 0xbefc037e, 0xbefe037c, + 0xbefc037a, 0xf4611bba, + 0xf8000000, 0x807a847a, + 0xbefc037e, 0xbefe037c, + 0xbefc037a, 0xf4611bfa, + 0xf8000000, 0x807a847a, + 0xbefc037e, 0xbefe037c, + 0xbefc037a, 0xf4611e3a, + 0xf8000000, 0x807a847a, + 0xbefc037e, 0xb971f803, + 0xbefe037c, 0xbefc037a, + 0xf4611c7a, 0xf8000000, + 0x807a847a, 0xbefc037e, + 0xbefe037c, 0xbefc037a, + 0xf4611cba, 0xf8000000, + 0x807a847a, 0xbefc037e, + 0xb97bf801, 0xbefe037c, + 0xbefc037a, 0xf4611efa, + 0xf8000000, 0x807a847a, + 0xbefc037e, 0xb97bf814, + 0xbefe037c, 0xbefc037a, + 0xf4611efa, 0xf8000000, + 0x807a847a, 0xbefc037e, + 0xb97bf815, 0xbefe037c, + 0xbefc037a, 0xf4611efa, + 0xf8000000, 0x807a847a, + 0xbefc037e, 0x8776ff7f, + 0x04000000, 0xbeef0380, + 0x886f6f76, 0xb97a2a05, + 0x807a817a, 0x907c9973, + 0x877c817c, 0xbf06817c, + 0xbf850002, 0x8f7a897a, + 0xbf820001, 0x8f7a8a7a, + 0xb9761e06, 0x8f768a76, + 0x807a767a, 0xbef603ff, + 0x01000000, 0xbef20374, + 0x80747a74, 0x82758075, + 0xbefc0380, 0xbf800000, + 0xbe802f00, 0xbe822f02, + 0xbe842f04, 0xbe862f06, + 0xbe882f08, 0xbe8a2f0a, + 0xbe8c2f0c, 0xbe8e2f0e, + 0xf469003a, 0xfa000000, + 0xf469013a, 0xfa000010, + 0xf469023a, 0xfa000020, + 0xf469033a, 0xfa000030, + 0x8074c074, 0x82758075, + 0x807c907c, 0xbf0aff7c, + 0x00000060, 0xbf85ffea, + 0xbe802f00, 0xbe822f02, + 0xbe842f04, 0xbe862f06, + 0xbe882f08, 0xbe8a2f0a, + 0xf469003a, 0xfa000000, + 0xf469013a, 0xfa000010, + 0xf469023a, 0xfa000020, + 0x8074b074, 0x82758075, + 0xbef40372, 0xbefa0380, + 0xbefe03c1, 0x907c9973, + 0x877c817c, 0xbf06817c, + 0xbf850002, 0xbeff0380, + 0xbf820002, 0xbeff03c1, + 0xbf82000b, 0xbef603ff, + 0x01000000, 0xe0704000, + 0x7a5d0000, 0xe0704080, + 0x7a5d0100, 0xe0704100, + 0x7a5d0200, 0xe0704180, + 0x7a5d0300, 0xbf82000a, + 0xbef603ff, 0x01000000, + 0xe0704000, 0x7a5d0000, + 0xe0704100, 0x7a5d0100, + 0xe0704200, 0x7a5d0200, + 0xe0704300, 0x7a5d0300, + 0xbefe03c1, 0x907c9973, + 0x877c817c, 0xbf06817c, + 0xbf850002, 0xbeff0380, + 0xbf820001, 0xbeff03c1, + 0xb9714306, 0x8771c171, + 0xbf840046, 0xbf8a0000, + 0x8776ff6f, 0x04000000, + 0xbf840042, 0x8f718671, + 0x8f718271, 0xbef60371, + 0xb97a2a05, 0x807a817a, + 0x907c9973, 0x877c817c, + 0xbf06817c, 0xbf850002, + 0x8f7a897a, 0xbf820001, + 0x8f7a8a7a, 0xb9761e06, + 0x8f768a76, 0x807a767a, + 0x807aff7a, 0x00000200, + 0x807aff7a, 0x00000080, + 0xbef603ff, 0x01000000, + 0xd7650000, 0x000100c1, + 0xd7660000, 0x000200c1, + 0x16000084, 0x907c9973, + 0x877c817c, 0xbf06817c, + 0xbefc0380, 0xbf850012, + 0xbe8303ff, 0x00000080, + 0xbf800000, 0xbf800000, + 0xbf800000, 0xd8d80000, + 0x01000000, 0xbf8c0000, + 0xe0704000, 0x7a5d0100, + 0x807c037c, 0x807a037a, + 0xd5250000, 0x0001ff00, + 0x00000080, 0xbf0a717c, + 0xbf85fff4, 0xbf820011, + 0xbe8303ff, 0x00000100, + 0xbf800000, 0xbf800000, + 0xbf800000, 0xd8d80000, + 0x01000000, 0xbf8c0000, + 0xe0704000, 0x7a5d0100, + 0x807c037c, 0x807a037a, + 0xd5250000, 0x0001ff00, + 0x00000100, 0xbf0a717c, + 0xbf85fff4, 0xbefe03c1, + 0x907c9973, 0x877c817c, + 0xbf06817c, 0xbf850004, + 0xbefa03ff, 0x00000200, + 0xbeff0380, 0xbf820003, + 0xbefa03ff, 0x00000400, + 0xbeff03c1, 0xb9712a05, + 0x80718171, 0x8f718271, + 0x907c9973, 0x877c817c, + 0xbf06817c, 0xbf850017, + 0xbef603ff, 0x01000000, + 0xbefc0384, 0xbf0a717c, + 0xbf840037, 0x7e008700, + 0x7e028701, 0x7e048702, + 0x7e068703, 0xe0704000, + 0x7a5d0000, 0xe0704080, + 0x7a5d0100, 0xe0704100, + 0x7a5d0200, 0xe0704180, + 0x7a5d0300, 0x807c847c, + 0x807aff7a, 0x00000200, + 0xbf0a717c, 0xbf85ffef, + 0xbf820025, 0xbef603ff, + 0x01000000, 0xbefc0384, + 0xbf0a717c, 0xbf840020, + 0x7e008700, 0x7e028701, + 0x7e048702, 0x7e068703, + 0xe0704000, 0x7a5d0000, + 0xe0704100, 0x7a5d0100, + 0xe0704200, 0x7a5d0200, + 0xe0704300, 0x7a5d0300, 0x807c847c, 0x807aff7a, 0x00000400, 0xbf0a717c, - 0xbf85ffef, 0xbf9c0000, - 0xbf8200dc, 0xbef4007e, + 0xbf85ffef, 0xb9711e06, + 0x8771c171, 0xbf84000c, + 0x8f718371, 0x80717c71, + 0xbefe03c1, 0xbeff0380, + 0x7e008700, 0xe0704000, + 0x7a5d0000, 0x807c817c, + 0x807aff7a, 0x00000080, + 0xbf0a717c, 0xbf85fff8, + 0xbf820142, 0xbef4037e, + 0x8775ff7f, 0x0000ffff, + 0x8875ff75, 0x00040000, + 0xbef60380, 0xbef703ff, + 0x10807fac, 0x8772ff7f, + 0x08000000, 0x90728372, + 0x88777277, 0x8772ff7f, + 0x70000000, 0x90728172, + 0x88777277, 0xb97302dc, + 0x8f739973, 0x8873737f, + 0x8772ff7f, 0x04000000, + 0xbf840036, 0xbefe03c1, + 0x907c9973, 0x877c817c, + 0xbf06817c, 0xbf850002, + 0xbeff0380, 0xbf820001, + 0xbeff03c1, 0xb96f4306, + 0x876fc16f, 0xbf84002b, + 0x8f6f866f, 0x8f6f826f, + 0xbef6036f, 0xb9782a05, + 0x80788178, 0x907c9973, + 0x877c817c, 0xbf06817c, + 0xbf850002, 0x8f788978, + 0xbf820001, 0x8f788a78, + 0xb9721e06, 0x8f728a72, + 0x80787278, 0x8078ff78, + 0x00000200, 0x8078ff78, + 0x00000080, 0xbef603ff, + 0x01000000, 0x907c9973, + 0x877c817c, 0xbf06817c, + 0xbefc0380, 0xbf850009, + 0xe0310000, 0x781d0000, + 0x807cff7c, 0x00000080, + 0x8078ff78, 0x00000080, + 0xbf0a6f7c, 0xbf85fff8, + 0xbf820008, 0xe0310000, + 0x781d0000, 0x807cff7c, + 0x00000100, 0x8078ff78, + 0x00000100, 0xbf0a6f7c, + 0xbf85fff8, 0xbef80380, + 0xbefe03c1, 0x907c9973, + 0x877c817c, 0xbf06817c, + 0xbf850002, 0xbeff0380, + 0xbf820001, 0xbeff03c1, + 0xb96f2a05, 0x806f816f, + 0x8f6f826f, 0x907c9973, + 0x877c817c, 0xbf06817c, + 0xbf850021, 0xbef603ff, + 0x01000000, 0xbef20378, + 0x8078ff78, 0x00000200, + 0xbefc0384, 0xe0304000, + 0x785d0000, 0xe0304080, + 0x785d0100, 0xe0304100, + 0x785d0200, 0xe0304180, + 0x785d0300, 0xbf8c3f70, + 0x7e008500, 0x7e028501, + 0x7e048502, 0x7e068503, + 0x807c847c, 0x8078ff78, + 0x00000200, 0xbf0a6f7c, + 0xbf85ffee, 0xe0304000, + 0x725d0000, 0xe0304080, + 0x725d0100, 0xe0304100, + 0x725d0200, 0xe0304180, + 0x725d0300, 0xbf820032, + 0xbef603ff, 0x01000000, + 0xbef20378, 0x8078ff78, + 0x00000400, 0xbefc0384, + 0xe0304000, 0x785d0000, + 0xe0304100, 0x785d0100, + 0xe0304200, 0x785d0200, + 0xe0304300, 0x785d0300, + 0xbf8c3f70, 0x7e008500, + 0x7e028501, 0x7e048502, + 0x7e068503, 0x807c847c, + 0x8078ff78, 0x00000400, + 0xbf0a6f7c, 0xbf85ffee, + 0xb96f1e06, 0x876fc16f, + 0xbf84000e, 0x8f6f836f, + 0x806f7c6f, 0xbefe03c1, + 0xbeff0380, 0xe0304000, + 0x785d0000, 0xbf8c3f70, + 0x7e008500, 0x807c817c, + 0x8078ff78, 0x00000080, + 0xbf0a6f7c, 0xbf85fff7, + 0xbeff03c1, 0xe0304000, + 0x725d0000, 0xe0304100, + 0x725d0100, 0xe0304200, + 0x725d0200, 0xe0304300, + 0x725d0300, 0xbf8c3f70, + 0xb9782a05, 0x80788178, + 0x907c9973, 0x877c817c, + 0xbf06817c, 0xbf850002, + 0x8f788978, 0xbf820001, + 0x8f788a78, 0xb9721e06, + 0x8f728a72, 0x80787278, + 0x8078ff78, 0x00000200, + 0x80f8ff78, 0x00000050, + 0xbef603ff, 0x01000000, + 0xbefc03ff, 0x0000006c, + 0x80f89078, 0xf429003a, + 0xf0000000, 0xbf8cc07f, + 0x80fc847c, 0xbf800000, + 0xbe803100, 0xbe823102, + 0x80f8a078, 0xf42d003a, + 0xf0000000, 0xbf8cc07f, + 0x80fc887c, 0xbf800000, + 0xbe803100, 0xbe823102, + 0xbe843104, 0xbe863106, + 0x80f8c078, 0xf431003a, + 0xf0000000, 0xbf8cc07f, + 0x80fc907c, 0xbf800000, + 0xbe803100, 0xbe823102, + 0xbe843104, 0xbe863106, + 0xbe883108, 0xbe8a310a, + 0xbe8c310c, 0xbe8e310e, + 0xbf06807c, 0xbf84fff0, + 0xb9782a05, 0x80788178, + 0x907c9973, 0x877c817c, + 0xbf06817c, 0xbf850002, + 0x8f788978, 0xbf820001, + 0x8f788a78, 0xb9721e06, + 0x8f728a72, 0x80787278, + 0x8078ff78, 0x00000200, + 0xbef603ff, 0x01000000, + 0xf4211bfa, 0xf0000000, + 0x80788478, 0xf4211b3a, + 0xf0000000, 0x80788478, + 0xf4211b7a, 0xf0000000, + 0x80788478, 0xf4211eba, + 0xf0000000, 0x80788478, + 0xf4211efa, 0xf0000000, + 0x80788478, 0xf4211c3a, + 0xf0000000, 0x80788478, + 0xf4211c7a, 0xf0000000, + 0x80788478, 0xf4211e7a, + 0xf0000000, 0x80788478, + 0xf4211cfa, 0xf0000000, + 0x80788478, 0xf4211bba, + 0xf0000000, 0x80788478, + 0xbf8cc07f, 0xb9eef814, + 0xf4211bba, 0xf0000000, + 0x80788478, 0xbf8cc07f, + 0xb9eef815, 0xbef2036d, + 0x876dff72, 0x0000ffff, + 0xbefc036f, 0xbefe037a, + 0xbeff037b, 0x876f71ff, + 0x000003ff, 0xb9ef4803, + 0xb9f9f816, 0x876f71ff, + 0xfffff800, 0x906f8b6f, + 0xb9efa2c3, 0xb9f3f801, + 0x876fff72, 0xfc000000, + 0x906f9a6f, 0x8f6f906f, + 0xbef30380, 0x88736f73, + 0x876fff72, 0x02000000, + 0x906f996f, 0x8f6f8f6f, + 0x88736f73, 0x876fff72, + 0x01000000, 0x906f986f, + 0x8f6f996f, 0x88736f73, + 0x876fff70, 0x00800000, + 0x906f976f, 0xb9f3f807, + 0x87fe7e7e, 0x87ea6a6a, + 0xb9f0f802, 0xbf8a0000, + 0xbe80226c, 0xbf810000, + 0xbf9f0000, 0xbf9f0000, + 0xbf9f0000, 0xbf9f0000, + 0xbf9f0000, 0x00000000, +}; +static const uint32_t cwsr_trap_arcturus_hex[] = { + 0xbf820001, 0xbf8202c4, + 0xb8f8f802, 0x89788678, + 0xb8eef801, 0x866eff6e, + 0x00000800, 0xbf840003, + 0x866eff78, 0x00002000, + 0xbf840016, 0xb8fbf803, + 0x866eff7b, 0x00000400, + 0xbf85003b, 0x866eff7b, + 0x00000800, 0xbf850003, + 0x866eff7b, 0x00000100, + 0xbf84000c, 0x866eff78, + 0x00002000, 0xbf840005, + 0xbf8e0010, 0xb8eef803, + 0x866eff6e, 0x00000400, + 0xbf84fffb, 0x8778ff78, + 0x00002000, 0x80ec886c, + 0x82ed806d, 0xb8eef807, + 0x866fff6e, 0x001f8000, + 0x8e6f8b6f, 0x8977ff77, + 0xfc000000, 0x87776f77, + 0x896eff6e, 0x001f8000, + 0xb96ef807, 0xb8faf812, + 0xb8fbf813, 0x8efa887a, + 0xc0071bbd, 0x00000000, + 0xbf8cc07f, 0xc0071ebd, + 0x00000008, 0xbf8cc07f, + 0x86ee6e6e, 0xbf840001, + 0xbe801d6e, 0xb8fbf803, + 0x867bff7b, 0x000001ff, + 0xbf850002, 0x806c846c, + 0x826d806d, 0x866dff6d, + 0x0000ffff, 0x8f6e8b77, + 0x866eff6e, 0x001f8000, + 0xb96ef807, 0x86fe7e7e, + 0x86ea6a6a, 0x8f6e8378, + 0xb96ee0c2, 0xbf800002, + 0xb9780002, 0xbe801f6c, + 0x866dff6d, 0x0000ffff, + 0xbefa0080, 0xb97a0283, + 0xb8fa2407, 0x8e7a9b7a, + 0x876d7a6d, 0xb8fa03c7, + 0x8e7a9a7a, 0x876d7a6d, + 0xb8faf807, 0x867aff7a, + 0x00007fff, 0xb97af807, + 0xbeee007e, 0xbeef007f, + 0xbefe0180, 0xbf900004, + 0x877a8478, 0xb97af802, + 0xbf8e0002, 0xbf88fffe, + 0xb8fa2a05, 0x807a817a, + 0x8e7a8a7a, 0x8e7a817a, + 0xb8fb1605, 0x807b817b, + 0x8e7b867b, 0x807a7b7a, + 0x807a7e7a, 0x827b807f, + 0x867bff7b, 0x0000ffff, + 0xc04b1c3d, 0x00000050, + 0xbf8cc07f, 0xc04b1d3d, + 0x00000060, 0xbf8cc07f, + 0xc0431e7d, 0x00000074, + 0xbf8cc07f, 0xbef4007e, 0x8675ff7f, 0x0000ffff, 0x8775ff75, 0x00040000, 0xbef60080, 0xbef700ff, - 0x00807fac, 0x866eff7f, - 0x08000000, 0x8f6e836e, - 0x87776e77, 0x866eff7f, - 0x70000000, 0x8f6e816e, - 0x87776e77, 0x866eff7f, - 0x04000000, 0xbf84001e, + 0x00807fac, 0x867aff7f, + 0x08000000, 0x8f7a837a, + 0x87777a77, 0x867aff7f, + 0x70000000, 0x8f7a817a, + 0x87777a77, 0xbef1007c, + 0xbef00080, 0xb8f02a05, + 0x80708170, 0x8e708a70, + 0x8e708170, 0xb8fa1605, + 0x807a817a, 0x8e7a867a, + 0x80707a70, 0xbef60084, + 0xbef600ff, 0x01000000, + 0xbefe007c, 0xbefc0070, + 0xc0611c7a, 0x0000007c, + 0xbf8cc07f, 0x80708470, + 0xbefc007e, 0xbefe007c, + 0xbefc0070, 0xc0611b3a, + 0x0000007c, 0xbf8cc07f, + 0x80708470, 0xbefc007e, + 0xbefe007c, 0xbefc0070, + 0xc0611b7a, 0x0000007c, + 0xbf8cc07f, 0x80708470, + 0xbefc007e, 0xbefe007c, + 0xbefc0070, 0xc0611bba, + 0x0000007c, 0xbf8cc07f, + 0x80708470, 0xbefc007e, + 0xbefe007c, 0xbefc0070, + 0xc0611bfa, 0x0000007c, + 0xbf8cc07f, 0x80708470, + 0xbefc007e, 0xbefe007c, + 0xbefc0070, 0xc0611e3a, + 0x0000007c, 0xbf8cc07f, + 0x80708470, 0xbefc007e, + 0xb8fbf803, 0xbefe007c, + 0xbefc0070, 0xc0611efa, + 0x0000007c, 0xbf8cc07f, + 0x80708470, 0xbefc007e, + 0xbefe007c, 0xbefc0070, + 0xc0611a3a, 0x0000007c, + 0xbf8cc07f, 0x80708470, + 0xbefc007e, 0xbefe007c, + 0xbefc0070, 0xc0611a7a, + 0x0000007c, 0xbf8cc07f, + 0x80708470, 0xbefc007e, + 0xb8f1f801, 0xbefe007c, + 0xbefc0070, 0xc0611c7a, + 0x0000007c, 0xbf8cc07f, + 0x80708470, 0xbefc007e, + 0x867aff7f, 0x04000000, + 0xbeef0080, 0x876f6f7a, + 0xb8f02a05, 0x80708170, + 0x8e708a70, 0x8e708170, + 0xb8fb1605, 0x807b817b, + 0x8e7b847b, 0x8e76827b, + 0xbef600ff, 0x01000000, + 0xbef20174, 0x80747074, + 0x82758075, 0xbefc0080, + 0xbf800000, 0xbe802b00, + 0xbe822b02, 0xbe842b04, + 0xbe862b06, 0xbe882b08, + 0xbe8a2b0a, 0xbe8c2b0c, + 0xbe8e2b0e, 0xc06b003a, + 0x00000000, 0xbf8cc07f, + 0xc06b013a, 0x00000010, + 0xbf8cc07f, 0xc06b023a, + 0x00000020, 0xbf8cc07f, + 0xc06b033a, 0x00000030, + 0xbf8cc07f, 0x8074c074, + 0x82758075, 0x807c907c, + 0xbf0a7b7c, 0xbf85ffe7, + 0xbef40172, 0xbef00080, 0xbefe00c1, 0xbeff00c1, - 0xb8ef4306, 0x866fc16f, - 0xbf840019, 0x8e6f866f, - 0x8e6f826f, 0xbef6006f, - 0xb8f82a05, 0x80788178, - 0x8e788a78, 0xb8ee1605, + 0xbee80080, 0xbee90080, + 0xbef600ff, 0x01000000, + 0x867aff78, 0x00400000, + 0xbf850003, 0xb8faf803, + 0x897a7aff, 0x10000000, + 0xbf85004d, 0xbe840080, + 0xd2890000, 0x00000900, + 0x80048104, 0xd2890001, + 0x00000900, 0x80048104, + 0xd2890002, 0x00000900, + 0x80048104, 0xd2890003, + 0x00000900, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0xbe840080, 0xd2890000, + 0x00000901, 0x80048104, + 0xd2890001, 0x00000901, + 0x80048104, 0xd2890002, + 0x00000901, 0x80048104, + 0xd2890003, 0x00000901, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbe840080, + 0xd2890000, 0x00000902, + 0x80048104, 0xd2890001, + 0x00000902, 0x80048104, + 0xd2890002, 0x00000902, + 0x80048104, 0xd2890003, + 0x00000902, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0xbe840080, 0xd2890000, + 0x00000903, 0x80048104, + 0xd2890001, 0x00000903, + 0x80048104, 0xd2890002, + 0x00000903, 0x80048104, + 0xd2890003, 0x00000903, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbf820008, + 0xe0724000, 0x701d0000, + 0xe0724100, 0x701d0100, + 0xe0724200, 0x701d0200, + 0xe0724300, 0x701d0300, + 0xbefe00c1, 0xbeff00c1, + 0xb8fb4306, 0x867bc17b, + 0xbf840064, 0xbf8a0000, + 0x867aff6f, 0x04000000, + 0xbf840060, 0x8e7b867b, + 0x8e7b827b, 0xbef6007b, + 0xb8f02a05, 0x80708170, + 0x8e708a70, 0x8e708170, + 0xb8fa1605, 0x807a817a, + 0x8e7a867a, 0x80707a70, + 0x8070ff70, 0x00000080, + 0xbef600ff, 0x01000000, + 0xbefc0080, 0xd28c0002, + 0x000100c1, 0xd28d0003, + 0x000204c1, 0x867aff78, + 0x00400000, 0xbf850003, + 0xb8faf803, 0x897a7aff, + 0x10000000, 0xbf850030, + 0x24040682, 0xd86e4000, + 0x00000002, 0xbf8cc07f, + 0xbe840080, 0xd2890000, + 0x00000900, 0x80048104, + 0xd2890001, 0x00000900, + 0x80048104, 0xd2890002, + 0x00000900, 0x80048104, + 0xd2890003, 0x00000900, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbe840080, + 0xd2890000, 0x00000901, + 0x80048104, 0xd2890001, + 0x00000901, 0x80048104, + 0xd2890002, 0x00000901, + 0x80048104, 0xd2890003, + 0x00000901, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0x680404ff, 0x00000200, + 0xd0c9006a, 0x0000f702, + 0xbf87ffd2, 0xbf820015, + 0xd1060002, 0x00011103, + 0x7e0602ff, 0x00000200, + 0xbefc00ff, 0x00010000, + 0xbe800077, 0x8677ff77, + 0xff7fffff, 0x8777ff77, + 0x00058000, 0xd8ec0000, + 0x00000002, 0xbf8cc07f, + 0xe0765000, 0x701d0002, + 0x68040702, 0xd0c9006a, + 0x0000f702, 0xbf87fff7, + 0xbef70000, 0xbef000ff, + 0x00000400, 0xbefe00c1, + 0xbeff00c1, 0xb8fb2a05, + 0x807b817b, 0x8e7b827b, + 0x8e76887b, 0xbef600ff, + 0x01000000, 0xbefc0084, + 0xbf0a7b7c, 0xbf84006d, + 0xbf11017c, 0x807bff7b, + 0x00001000, 0x867aff78, + 0x00400000, 0xbf850003, + 0xb8faf803, 0x897a7aff, + 0x10000000, 0xbf850051, + 0xbe840080, 0xd2890000, + 0x00000900, 0x80048104, + 0xd2890001, 0x00000900, + 0x80048104, 0xd2890002, + 0x00000900, 0x80048104, + 0xd2890003, 0x00000900, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbe840080, + 0xd2890000, 0x00000901, + 0x80048104, 0xd2890001, + 0x00000901, 0x80048104, + 0xd2890002, 0x00000901, + 0x80048104, 0xd2890003, + 0x00000901, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0xbe840080, 0xd2890000, + 0x00000902, 0x80048104, + 0xd2890001, 0x00000902, + 0x80048104, 0xd2890002, + 0x00000902, 0x80048104, + 0xd2890003, 0x00000902, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbe840080, + 0xd2890000, 0x00000903, + 0x80048104, 0xd2890001, + 0x00000903, 0x80048104, + 0xd2890002, 0x00000903, + 0x80048104, 0xd2890003, + 0x00000903, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0x807c847c, 0xbf0a7b7c, + 0xbf85ffb1, 0xbf9c0000, + 0xbf820012, 0x7e000300, + 0x7e020301, 0x7e040302, + 0x7e060303, 0xe0724000, + 0x701d0000, 0xe0724100, + 0x701d0100, 0xe0724200, + 0x701d0200, 0xe0724300, + 0x701d0300, 0x807c847c, + 0x8070ff70, 0x00000400, + 0xbf0a7b7c, 0xbf85ffef, + 0xbf9c0000, 0xbefc0080, + 0xbf11017c, 0x867aff78, + 0x00400000, 0xbf850003, + 0xb8faf803, 0x897a7aff, + 0x10000000, 0xbf850059, + 0xd3d84000, 0x18000100, + 0xd3d84001, 0x18000101, + 0xd3d84002, 0x18000102, + 0xd3d84003, 0x18000103, + 0xbe840080, 0xd2890000, + 0x00000900, 0x80048104, + 0xd2890001, 0x00000900, + 0x80048104, 0xd2890002, + 0x00000900, 0x80048104, + 0xd2890003, 0x00000900, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbe840080, + 0xd2890000, 0x00000901, + 0x80048104, 0xd2890001, + 0x00000901, 0x80048104, + 0xd2890002, 0x00000901, + 0x80048104, 0xd2890003, + 0x00000901, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0xbe840080, 0xd2890000, + 0x00000902, 0x80048104, + 0xd2890001, 0x00000902, + 0x80048104, 0xd2890002, + 0x00000902, 0x80048104, + 0xd2890003, 0x00000902, + 0x80048104, 0xc069003a, + 0x00000070, 0xbf8cc07f, + 0x80709070, 0xbf06c004, + 0xbf84ffee, 0xbe840080, + 0xd2890000, 0x00000903, + 0x80048104, 0xd2890001, + 0x00000903, 0x80048104, + 0xd2890002, 0x00000903, + 0x80048104, 0xd2890003, + 0x00000903, 0x80048104, + 0xc069003a, 0x00000070, + 0xbf8cc07f, 0x80709070, + 0xbf06c004, 0xbf84ffee, + 0x807c847c, 0xbf0a7b7c, + 0xbf85ffa9, 0xbf9c0000, + 0xbf820016, 0xd3d84000, + 0x18000100, 0xd3d84001, + 0x18000101, 0xd3d84002, + 0x18000102, 0xd3d84003, + 0x18000103, 0xe0724000, + 0x701d0000, 0xe0724100, + 0x701d0100, 0xe0724200, + 0x701d0200, 0xe0724300, + 0x701d0300, 0x807c847c, + 0x8070ff70, 0x00000400, + 0xbf0a7b7c, 0xbf85ffeb, + 0xbf9c0000, 0xbf820106, + 0xbef4007e, 0x8675ff7f, + 0x0000ffff, 0x8775ff75, + 0x00040000, 0xbef60080, + 0xbef700ff, 0x00807fac, + 0x866eff7f, 0x08000000, + 0x8f6e836e, 0x87776e77, + 0x866eff7f, 0x70000000, + 0x8f6e816e, 0x87776e77, + 0x866eff7f, 0x04000000, + 0xbf84001f, 0xbefe00c1, + 0xbeff00c1, 0xb8ef4306, + 0x866fc16f, 0xbf84001a, + 0x8e6f866f, 0x8e6f826f, + 0xbef6006f, 0xb8f82a05, + 0x80788178, 0x8e788a78, + 0x8e788178, 0xb8ee1605, 0x806e816e, 0x8e6e866e, 0x80786e78, 0x8078ff78, 0x00000080, 0xbef600ff, @@ -482,42 +1455,63 @@ static const uint32_t cwsr_trap_gfx9_hex[] = { 0xbef80080, 0xbefe00c1, 0xbeff00c1, 0xb8ef2a05, 0x806f816f, 0x8e6f826f, - 0x8e76886f, 0xbef600ff, - 0x01000000, 0xbeee0078, - 0x8078ff78, 0x00000400, + 0x8e76886f, 0xbef90076, + 0xbef600ff, 0x01000000, + 0xbeee0078, 0x8078ff78, + 0x00000400, 0xbef30079, + 0x8079ff79, 0x00000400, 0xbefc0084, 0xbf11087c, 0x806fff6f, 0x00008000, - 0xe0524000, 0x781d0000, - 0xe0524100, 0x781d0100, - 0xe0524200, 0x781d0200, - 0xe0524300, 0x781d0300, - 0xbf8c0f70, 0x7e000300, - 0x7e020301, 0x7e040302, - 0x7e060303, 0x807c847c, - 0x8078ff78, 0x00000400, - 0xbf0a6f7c, 0xbf85ffee, - 0xbf9c0000, 0xe0524000, + 0xe0524000, 0x791d0000, + 0xe0524100, 0x791d0100, + 0xe0524200, 0x791d0200, + 0xe0524300, 0x791d0300, + 0x8079ff79, 0x00000400, + 0xbf8c0f70, 0xd3d94000, + 0x18000100, 0xd3d94001, + 0x18000101, 0xd3d94002, + 0x18000102, 0xd3d94003, + 0x18000103, 0xe0524000, + 0x781d0000, 0xe0524100, + 0x781d0100, 0xe0524200, + 0x781d0200, 0xe0524300, + 0x781d0300, 0xbf8c0f70, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, + 0x807c847c, 0x8078ff78, + 0x00000400, 0xbf0a6f7c, + 0xbf85ffdb, 0xbf9c0000, + 0xe0524000, 0x731d0000, + 0xe0524100, 0x731d0100, + 0xe0524200, 0x731d0200, + 0xe0524300, 0x731d0300, + 0xbf8c0f70, 0xd3d94000, + 0x18000100, 0xd3d94001, + 0x18000101, 0xd3d94002, + 0x18000102, 0xd3d94003, + 0x18000103, 0xe0524000, 0x6e1d0000, 0xe0524100, 0x6e1d0100, 0xe0524200, 0x6e1d0200, 0xe0524300, 0x6e1d0300, 0xb8f82a05, 0x80788178, 0x8e788a78, - 0xb8ee1605, 0x806e816e, - 0x8e6e866e, 0x80786e78, - 0x80f8c078, 0xb8ef1605, - 0x806f816f, 0x8e6f846f, - 0x8e76826f, 0xbef600ff, - 0x01000000, 0xbefc006f, - 0xc031003a, 0x00000078, - 0x80f8c078, 0xbf8cc07f, - 0x80fc907c, 0xbf800000, - 0xbe802d00, 0xbe822d02, - 0xbe842d04, 0xbe862d06, - 0xbe882d08, 0xbe8a2d0a, - 0xbe8c2d0c, 0xbe8e2d0e, - 0xbf06807c, 0xbf84fff0, - 0xb8f82a05, 0x80788178, - 0x8e788a78, 0xb8ee1605, + 0x8e788178, 0xb8ee1605, + 0x806e816e, 0x8e6e866e, + 0x80786e78, 0x80f8c078, + 0xb8ef1605, 0x806f816f, + 0x8e6f846f, 0x8e76826f, + 0xbef600ff, 0x01000000, + 0xbefc006f, 0xc031003a, + 0x00000078, 0x80f8c078, + 0xbf8cc07f, 0x80fc907c, + 0xbf800000, 0xbe802d00, + 0xbe822d02, 0xbe842d04, + 0xbe862d06, 0xbe882d08, + 0xbe8a2d0a, 0xbe8c2d0c, + 0xbe8e2d0e, 0xbf06807c, + 0xbf84fff0, 0xb8f82a05, + 0x80788178, 0x8e788a78, + 0x8e788178, 0xb8ee1605, 0x806e816e, 0x8e6e866e, 0x80786e78, 0xbef60084, 0xbef600ff, 0x01000000, @@ -525,44 +1519,44 @@ static const uint32_t cwsr_trap_gfx9_hex[] = { 0x80788478, 0xc0211b3a, 0x00000078, 0x80788478, 0xc0211b7a, 0x00000078, - 0x80788478, 0xc0211eba, - 0x00000078, 0x80788478, - 0xc0211efa, 0x00000078, 0x80788478, 0xc0211c3a, 0x00000078, 0x80788478, 0xc0211c7a, 0x00000078, + 0x80788478, 0xc0211eba, + 0x00000078, 0x80788478, + 0xc0211efa, 0x00000078, 0x80788478, 0xc0211a3a, 0x00000078, 0x80788478, 0xc0211a7a, 0x00000078, 0x80788478, 0xc0211cfa, 0x00000078, 0x80788478, 0xbf8cc07f, 0xbefc006f, - 0xbefe007a, 0xbeff007b, - 0x866f71ff, 0x000003ff, - 0xb96f4803, 0x866f71ff, + 0xbefe0070, 0xbeff0071, + 0x866f7bff, 0x000003ff, + 0xb96f4803, 0x866f7bff, 0xfffff800, 0x8f6f8b6f, 0xb96fa2c3, 0xb973f801, 0xb8ee2a05, 0x806e816e, - 0x8e6e8a6e, 0xb8ef1605, - 0x806f816f, 0x8e6f866f, - 0x806e6f6e, 0x806e746e, - 0x826f8075, 0x866fff6f, - 0x0000ffff, 0xc0071cb7, - 0x00000040, 0xc00b1d37, - 0x00000048, 0xc0031e77, - 0x00000058, 0xc0071eb7, - 0x0000005c, 0xbf8cc07f, - 0x866fff6d, 0xf0000000, - 0x8f6f9c6f, 0x8e6f906f, - 0xbeee0080, 0x876e6f6e, - 0x866fff6d, 0x08000000, - 0x8f6f9b6f, 0x8e6f8f6f, - 0x876e6f6e, 0x866fff70, - 0x00800000, 0x8f6f976f, - 0xb96ef807, 0x866dff6d, - 0x0000ffff, 0x86fe7e7e, - 0x86ea6a6a, 0x8f6e8370, - 0xb96ee0c2, 0xbf800002, - 0xb9700002, 0xbf8a0000, - 0x95806f6c, 0xbf810000, + 0x8e6e8a6e, 0x8e6e816e, + 0xb8ef1605, 0x806f816f, + 0x8e6f866f, 0x806e6f6e, + 0x806e746e, 0x826f8075, + 0x866fff6f, 0x0000ffff, + 0xc00b1c37, 0x00000050, + 0xc00b1d37, 0x00000060, + 0xc0031e77, 0x00000074, + 0xbf8cc07f, 0x866fff6d, + 0xf8000000, 0x8f6f9b6f, + 0x8e6f906f, 0xbeee0080, + 0x876e6f6e, 0x866fff6d, + 0x04000000, 0x8f6f9a6f, + 0x8e6f8f6f, 0x876e6f6e, + 0x866fff7a, 0x00800000, + 0x8f6f976f, 0xb96ef807, + 0x866dff6d, 0x0000ffff, + 0x86fe7e7e, 0x86ea6a6a, + 0x8f6e837a, 0xb96ee0c2, + 0xbf800002, 0xb97a0002, + 0xbf8a0000, 0x95806f6c, + 0xbf810000, 0x00000000, }; diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm new file mode 100644 index 000000000000..4433bda2ce25 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm @@ -0,0 +1,967 @@ +/* + * Copyright 2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 +var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 +var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 +var SQ_WAVE_STATUS_HALT_MASK = 0x2000 + +var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 +var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 +var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8 +var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6 +var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24 +var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 4 +var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT = 24 +var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE = 4 +var SQ_WAVE_IB_STS2_WAVE64_SHIFT = 11 +var SQ_WAVE_IB_STS2_WAVE64_SIZE = 1 + +var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400 +var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF +var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10 +var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100 +var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8 +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0 +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21 +var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800 + +var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 +var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 +var SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT = 25 +var SQ_WAVE_IB_STS_REPLAY_W64H_SIZE = 1 +var SQ_WAVE_IB_STS_REPLAY_W64H_MASK = 0x02000000 +var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE = 1 +var SQ_WAVE_IB_STS_RCNT_SIZE = 6 +var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x003F8000 +var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF + +var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24 +var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27 + +// bits [31:24] unused by SPI debug data +var TTMP11_SAVE_REPLAY_W64H_SHIFT = 31 +var TTMP11_SAVE_REPLAY_W64H_MASK = 0x80000000 +var TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT = 24 +var TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK = 0x7F000000 + +// SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] +// when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE +var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 +var S_SAVE_BUF_RSRC_WORD3_MISC = 0x10807FAC + +var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 +var S_SAVE_SPI_INIT_ATC_SHIFT = 27 +var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 +var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28 +var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 +var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 + +var S_SAVE_PC_HI_RCNT_SHIFT = 26 +var S_SAVE_PC_HI_RCNT_MASK = 0xFC000000 +var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 25 +var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x02000000 +var S_SAVE_PC_HI_REPLAY_W64H_SHIFT = 24 +var S_SAVE_PC_HI_REPLAY_W64H_MASK = 0x01000000 + +var s_sgpr_save_num = 108 + +var s_save_spi_init_lo = exec_lo +var s_save_spi_init_hi = exec_hi +var s_save_pc_lo = ttmp0 +var s_save_pc_hi = ttmp1 +var s_save_exec_lo = ttmp2 +var s_save_exec_hi = ttmp3 +var s_save_status = ttmp12 +var s_save_trapsts = ttmp5 +var s_save_xnack_mask = ttmp6 +var s_wave_size = ttmp7 +var s_save_buf_rsrc0 = ttmp8 +var s_save_buf_rsrc1 = ttmp9 +var s_save_buf_rsrc2 = ttmp10 +var s_save_buf_rsrc3 = ttmp11 +var s_save_mem_offset = ttmp14 +var s_save_alloc_size = s_save_trapsts +var s_save_tmp = s_save_buf_rsrc2 +var s_save_m0 = ttmp15 + +var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE +var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC + +var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 +var S_RESTORE_SPI_INIT_ATC_SHIFT = 27 +var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 +var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28 +var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 +var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26 +var S_WAVE_SIZE = 25 + +var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT +var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK +var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT +var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK + +var s_restore_spi_init_lo = exec_lo +var s_restore_spi_init_hi = exec_hi +var s_restore_mem_offset = ttmp12 +var s_restore_alloc_size = ttmp3 +var s_restore_tmp = ttmp6 +var s_restore_mem_offset_save = s_restore_tmp +var s_restore_m0 = s_restore_alloc_size +var s_restore_mode = ttmp7 +var s_restore_flat_scratch = ttmp2 +var s_restore_pc_lo = ttmp0 +var s_restore_pc_hi = ttmp1 +var s_restore_exec_lo = ttmp14 +var s_restore_exec_hi = ttmp15 +var s_restore_status = ttmp4 +var s_restore_trapsts = ttmp5 +var s_restore_xnack_mask = ttmp13 +var s_restore_buf_rsrc0 = ttmp8 +var s_restore_buf_rsrc1 = ttmp9 +var s_restore_buf_rsrc2 = ttmp10 +var s_restore_buf_rsrc3 = ttmp11 +var s_restore_size = ttmp7 + +shader main + asic(DEFAULT) + type(CS) + wave_size(32) + + s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save + +L_JUMP_TO_RESTORE: + s_branch L_RESTORE + +L_SKIP_RESTORE: + s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC + s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save + s_cbranch_scc1 L_SAVE + + // If STATUS.MEM_VIOL is asserted then halt the wave to prevent + // the exception raising again and blocking context save. + s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK + s_cbranch_scc0 L_FETCH_2ND_TRAP + s_or_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK + +L_FETCH_2ND_TRAP: + // Preserve and clear scalar XNACK state before issuing scalar loads. + // Save IB_STS.REPLAY_W64H[25], RCNT[21:16], FIRST_REPLAY[15] into + // unused space ttmp11[31:24]. + s_andn2_b32 ttmp11, ttmp11, (TTMP11_SAVE_REPLAY_W64H_MASK | TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK) + s_getreg_b32 ttmp2, hwreg(HW_REG_IB_STS) + s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK + s_lshl_b32 ttmp3, ttmp3, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT) + s_or_b32 ttmp11, ttmp11, ttmp3 + s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK + s_lshl_b32 ttmp3, ttmp3, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) + s_or_b32 ttmp11, ttmp11, ttmp3 + s_andn2_b32 ttmp2, ttmp2, (SQ_WAVE_IB_STS_REPLAY_W64H_MASK | SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK) + s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 + + // Read second-level TBA/TMA from first-level TMA and jump if available. + // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data) + // ttmp12 holds SQ_WAVE_STATUS + s_getreg_b32 ttmp14, hwreg(HW_REG_SHADER_TMA_LO) + s_getreg_b32 ttmp15, hwreg(HW_REG_SHADER_TMA_HI) + s_lshl_b64 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 + s_load_dwordx2 [ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 glc:1 // second-level TBA + s_waitcnt lgkmcnt(0) + s_load_dwordx2 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 glc:1 // second-level TMA + s_waitcnt lgkmcnt(0) + s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3] + s_cbranch_scc0 L_NO_NEXT_TRAP // second-level trap handler not been set + s_setpc_b64 [ttmp2, ttmp3] // jump to second-level trap handler + +L_NO_NEXT_TRAP: + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK + s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly. + s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0 + s_addc_u32 ttmp1, ttmp1, 0 +L_EXCP_CASE: + s_and_b32 ttmp1, ttmp1, 0xFFFF + + // Restore SQ_WAVE_IB_STS. + s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) + s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK + s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT) + s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK + s_or_b32 ttmp2, ttmp2, ttmp3 + s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 + + // Restore SQ_WAVE_STATUS. + s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 + s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 + s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status + + s_rfe_b64 [ttmp0, ttmp1] + +L_SAVE: + //check whether there is mem_viol + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK + s_cbranch_scc0 L_NO_PC_REWIND + + //if so, need rewind PC assuming GDS operation gets NACKed + s_mov_b32 s_save_tmp, 0 + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit + s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] + s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8 + s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 + +L_NO_PC_REWIND: + s_mov_b32 s_save_tmp, 0 + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit + + s_getreg_b32 s_save_xnack_mask, hwreg(HW_REG_SHADER_XNACK_MASK) + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) + s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT + s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) + s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT + s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT, SQ_WAVE_IB_STS_REPLAY_W64H_SIZE) + s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_REPLAY_W64H_SHIFT + s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY and REPLAY_W64H in IB_STS + s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG + + s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp + + /* inform SPI the readiness and wait for SPI's go signal */ + s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI + s_mov_b32 s_save_exec_hi, exec_hi + s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive + + s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC + +L_SLEEP: + // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause + // SQ hang, since the 7,8th wave could not get arbit to exec inst, while + // other waves are stuck into the sleep-loop and waiting for wrexec!=0 + s_sleep 0x2 + s_cbranch_execz L_SLEEP + + /* setup Resource Contants */ + s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo + s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE + s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited + s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK + s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK + s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE + + s_mov_b32 s_save_m0, m0 + + /* global mem offset */ + s_mov_b32 s_save_mem_offset, 0x0 + s_getreg_b32 s_wave_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) + s_lshl_b32 s_wave_size, s_wave_size, S_WAVE_SIZE + s_or_b32 s_wave_size, s_save_spi_init_hi, s_wave_size //share s_wave_size with exec_hi, it's at bit25 + + /* save HW registers */ + +L_SAVE_HWREG: + // HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR) + get_vgpr_size_bytes(s_save_mem_offset, s_wave_size) + get_svgpr_size_bytes(s_save_tmp) + s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp + s_add_u32 s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes() + + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) + write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) + write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset) + write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) + write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) + write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) + + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) + write_hwreg_to_mem(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset) + + s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) + write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) + + s_getreg_b32 s_save_m0, hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO) + write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) + + s_getreg_b32 s_save_m0, hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI) + write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) + + /* the first wave in the threadgroup */ + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK + s_mov_b32 s_save_exec_hi, 0x0 + s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26] + + /* save SGPRs */ + // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save... + + // SGPR SR memory offset : size(VGPR)+size(SVGPR) + get_vgpr_size_bytes(s_save_mem_offset, s_wave_size) + get_svgpr_size_bytes(s_save_tmp) + s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0 + s_mov_b32 s_save_xnack_mask, s_save_buf_rsrc0 + s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset + s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0 + + s_mov_b32 m0, 0x0 //SGPR initial index value =0 + s_nop 0x0 //Manually inserted wait states +L_SAVE_SGPR_LOOP: + // SGPR is allocated in 16 SGPR granularity + s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0] + s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0] + s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0] + s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0] + s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0] + s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] + s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0] + s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0] + + write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) + s_add_u32 m0, m0, 16 //next sgpr index + s_cmp_lt_u32 m0, 96 //scc = (m0 < first 96 SGPR) ? 1 : 0 + s_cbranch_scc1 L_SAVE_SGPR_LOOP //first 96 SGPR save is complete? + + //save the rest 12 SGPR + s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0] + s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0] + s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0] + s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0] + s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0] + s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] + write_12sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) + + // restore s_save_buf_rsrc0,1 + s_mov_b32 s_save_buf_rsrc0, s_save_xnack_mask + + /* save first 4 VGPR, then LDS save could use */ + // each wave will alloc 4 vgprs at least... + + s_mov_b32 s_save_mem_offset, 0 + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE + s_and_b32 m0, m0, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_ENABLE_SAVE_4VGPR_EXEC_HI + s_mov_b32 exec_hi, 0x00000000 + s_branch L_SAVE_4VGPR_WAVE32 +L_ENABLE_SAVE_4VGPR_EXEC_HI: + s_mov_b32 exec_hi, 0xFFFFFFFF + s_branch L_SAVE_4VGPR_WAVE64 +L_SAVE_4VGPR_WAVE32: + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + // VGPR Allocated in 4-GPR granularity + + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3 + s_branch L_SAVE_LDS + +L_SAVE_4VGPR_WAVE64: + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + // VGPR Allocated in 4-GPR granularity + + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 + + /* save LDS */ + +L_SAVE_LDS: + // Change EXEC to all threads... + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE + s_and_b32 m0, m0, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_ENABLE_SAVE_LDS_EXEC_HI + s_mov_b32 exec_hi, 0x00000000 + s_branch L_SAVE_LDS_NORMAL +L_ENABLE_SAVE_LDS_EXEC_HI: + s_mov_b32 exec_hi, 0xFFFFFFFF +L_SAVE_LDS_NORMAL: + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) + s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero? + s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE + + s_barrier //LDS is used? wait for other waves in the same TG + s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK + s_cbranch_scc0 L_SAVE_LDS_DONE + + // first wave do LDS save; + + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes + s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes + + // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG) + // + get_vgpr_size_bytes(s_save_mem_offset, s_wave_size) + get_svgpr_size_bytes(s_save_tmp) + s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp + s_add_u32 s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes() + s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes() + + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + //load 0~63*4(byte address) to vgpr v0 + v_mbcnt_lo_u32_b32 v0, -1, 0 + v_mbcnt_hi_u32_b32 v0, -1, v0 + v_mul_u32_u24 v0, 4, v0 + + s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE + s_and_b32 m0, m0, 1 + s_cmp_eq_u32 m0, 1 + s_mov_b32 m0, 0x0 + s_cbranch_scc1 L_SAVE_LDS_W64 + +L_SAVE_LDS_W32: + s_mov_b32 s3, 128 + s_nop 0 + s_nop 0 + s_nop 0 +L_SAVE_LDS_LOOP_W32: + ds_read_b32 v1, v0 + s_waitcnt 0 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + + s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes + s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 + v_add_nc_u32 v0, v0, 128 //mem offset increased by 128 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_LDS_LOOP_W32 //LDS save is complete? + + s_branch L_SAVE_LDS_DONE + +L_SAVE_LDS_W64: + s_mov_b32 s3, 256 + s_nop 0 + s_nop 0 + s_nop 0 +L_SAVE_LDS_LOOP_W64: + ds_read_b32 v1, v0 + s_waitcnt 0 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + + s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes + s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 + v_add_nc_u32 v0, v0, 256 //mem offset increased by 256 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_LDS_LOOP_W64 //LDS save is complete? + +L_SAVE_LDS_DONE: + /* save VGPRs - set the Rest VGPRs */ +L_SAVE_VGPR: + // VGPR SR memory offset: 0 + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE + s_and_b32 m0, m0, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_ENABLE_SAVE_VGPR_EXEC_HI + s_mov_b32 s_save_mem_offset, (0+128*4) // for the rest VGPRs + s_mov_b32 exec_hi, 0x00000000 + s_branch L_SAVE_VGPR_NORMAL +L_ENABLE_SAVE_VGPR_EXEC_HI: + s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest VGPRs + s_mov_b32 exec_hi, 0xFFFFFFFF +L_SAVE_VGPR_NORMAL: + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) + s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) + //determine it is wave32 or wave64 + s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE + s_and_b32 m0, m0, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_SAVE_VGPR_WAVE64 + + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + // VGPR Allocated in 4-GPR granularity + + // VGPR store using dw burst + s_mov_b32 m0, 0x4 //VGPR initial index value =4 + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc0 L_SAVE_VGPR_END + +L_SAVE_VGPR_W32_LOOP: + v_movrels_b32 v0, v0 //v0 = v[0+m0] + v_movrels_b32 v1, v1 //v1 = v[1+m0] + v_movrels_b32 v2, v2 //v2 = v[2+m0] + v_movrels_b32 v3, v3 //v3 = v[3+m0] + + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3 + + s_add_u32 m0, m0, 4 //next vgpr index + s_add_u32 s_save_mem_offset, s_save_mem_offset, 128*4 //every buffer_store_dword does 128 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_VGPR_W32_LOOP //VGPR save is complete? + + s_branch L_SAVE_VGPR_END + +L_SAVE_VGPR_WAVE64: + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + // VGPR store using dw burst + s_mov_b32 m0, 0x4 //VGPR initial index value =4 + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc0 L_SAVE_VGPR_END + +L_SAVE_VGPR_W64_LOOP: + v_movrels_b32 v0, v0 //v0 = v[0+m0] + v_movrels_b32 v1, v1 //v1 = v[1+m0] + v_movrels_b32 v2, v2 //v2 = v[2+m0] + v_movrels_b32 v3, v3 //v3 = v[3+m0] + + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 + + s_add_u32 m0, m0, 4 //next vgpr index + s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_VGPR_W64_LOOP //VGPR save is complete? + + //Below part will be the save shared vgpr part (new for gfx10) + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) + s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //shared_vgpr_size is zero? + s_cbranch_scc0 L_SAVE_VGPR_END //no shared_vgpr used? jump to L_SAVE_LDS + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value) + //m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count. + //save shared_vgpr will start from the index of m0 + s_add_u32 s_save_alloc_size, s_save_alloc_size, m0 + s_mov_b32 exec_lo, 0xFFFFFFFF + s_mov_b32 exec_hi, 0x00000000 +L_SAVE_SHARED_VGPR_WAVE64_LOOP: + v_movrels_b32 v0, v0 //v0 = v[0+m0] + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + s_add_u32 m0, m0, 1 //next vgpr index + s_add_u32 s_save_mem_offset, s_save_mem_offset, 128 + s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_SHARED_VGPR_WAVE64_LOOP //SHARED_VGPR save is complete? + +L_SAVE_VGPR_END: + s_branch L_END_PGM + +L_RESTORE: + /* Setup Resource Contants */ + s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo + s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi + s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE + s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) + s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK + s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) + s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK + s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) + s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE + //determine it is wave32 or wave64 + s_getreg_b32 s_restore_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) + s_lshl_b32 s_restore_size, s_restore_size, S_WAVE_SIZE + s_or_b32 s_restore_size, s_restore_spi_init_hi, s_restore_size + + s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK + s_cbranch_scc0 L_RESTORE_VGPR + + /* restore LDS */ +L_RESTORE_LDS: + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE + s_and_b32 m0, m0, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_ENABLE_RESTORE_LDS_EXEC_HI + s_mov_b32 exec_hi, 0x00000000 + s_branch L_RESTORE_LDS_NORMAL +L_ENABLE_RESTORE_LDS_EXEC_HI: + s_mov_b32 exec_hi, 0xFFFFFFFF +L_RESTORE_LDS_NORMAL: + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) + s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero? + s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes + s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes + + // LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG) + // + get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size) + get_svgpr_size_bytes(s_restore_tmp) + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes() + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() + + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE + s_and_b32 m0, m0, 1 + s_cmp_eq_u32 m0, 1 + s_mov_b32 m0, 0x0 + s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64 + +L_RESTORE_LDS_LOOP_W32: + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW + s_add_u32 m0, m0, 128 // 128 DW + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 //mem offset increased by 128DW + s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_LDS_LOOP_W32 //LDS restore is complete? + s_branch L_RESTORE_VGPR + +L_RESTORE_LDS_LOOP_W64: + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW + s_add_u32 m0, m0, 256 // 256 DW + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //mem offset increased by 256DW + s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_LDS_LOOP_W64 //LDS restore is complete? + + /* restore VGPRs */ +L_RESTORE_VGPR: + // VGPR SR memory offset : 0 + s_mov_b32 s_restore_mem_offset, 0x0 + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE + s_and_b32 m0, m0, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_ENABLE_RESTORE_VGPR_EXEC_HI + s_mov_b32 exec_hi, 0x00000000 + s_branch L_RESTORE_VGPR_NORMAL +L_ENABLE_RESTORE_VGPR_EXEC_HI: + s_mov_b32 exec_hi, 0xFFFFFFFF +L_RESTORE_VGPR_NORMAL: + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) + //determine it is wave32 or wave64 + s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE + s_and_b32 m0, m0, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_RESTORE_VGPR_WAVE64 + + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + // VGPR load using dw burst + s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128*4 + s_mov_b32 m0, 4 //VGPR initial index value = 4 + +L_RESTORE_VGPR_WAVE32_LOOP: + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128*3 + s_waitcnt vmcnt(0) + v_movreld_b32 v0, v0 //v[0+m0] = v0 + v_movreld_b32 v1, v1 + v_movreld_b32 v2, v2 + v_movreld_b32 v3, v3 + s_add_u32 m0, m0, 4 //next vgpr index + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128*4 //every buffer_load_dword does 128 bytes + s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_VGPR_WAVE32_LOOP //VGPR restore (except v0) is complete? + + /* VGPR restore on v0 */ + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*3 + + s_branch L_RESTORE_SGPR + +L_RESTORE_VGPR_WAVE64: + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + // VGPR load using dw burst + s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v4, v0 will be the last + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 + s_mov_b32 m0, 4 //VGPR initial index value = 4 + +L_RESTORE_VGPR_WAVE64_LOOP: + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3 + s_waitcnt vmcnt(0) + v_movreld_b32 v0, v0 //v[0+m0] = v0 + v_movreld_b32 v1, v1 + v_movreld_b32 v2, v2 + v_movreld_b32 v3, v3 + s_add_u32 m0, m0, 4 //next vgpr index + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes + s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_VGPR_WAVE64_LOOP //VGPR restore (except v0) is complete? + + //Below part will be the restore shared vgpr part (new for gfx10) + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) //shared_vgpr_size + s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //shared_vgpr_size is zero? + s_cbranch_scc0 L_RESTORE_V0 //no shared_vgpr used? + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 3 //Number of SHARED_VGPRs = shared_vgpr_size * 8 (non-zero value) + //m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count. + //restore shared_vgpr will start from the index of m0 + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, m0 + s_mov_b32 exec_lo, 0xFFFFFFFF + s_mov_b32 exec_hi, 0x00000000 +L_RESTORE_SHARED_VGPR_WAVE64_LOOP: + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 + s_waitcnt vmcnt(0) + v_movreld_b32 v0, v0 //v[0+m0] = v0 + s_add_u32 m0, m0, 1 //next vgpr index + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 128 + s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_SHARED_VGPR_WAVE64_LOOP //VGPR restore (except v0) is complete? + + s_mov_b32 exec_hi, 0xFFFFFFFF //restore back exec_hi before restoring V0!! + + /* VGPR restore on v0 */ +L_RESTORE_V0: + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3 + s_waitcnt vmcnt(0) + + /* restore SGPRs */ + //will be 2+8+16*6 + // SGPR SR memory offset : size(VGPR)+size(SVGPR) +L_RESTORE_SGPR: + get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size) + get_svgpr_size_bytes(s_restore_tmp) + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes() + s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 20*4 //s108~s127 is not saved + + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + s_mov_b32 m0, s_sgpr_save_num + + read_4sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) + s_waitcnt lgkmcnt(0) + + s_sub_u32 m0, m0, 4 // Restore from S[0] to S[104] + s_nop 0 // hazard SALU M0=> S_MOVREL + + s_movreld_b64 s0, s0 //s[0+m0] = s0 + s_movreld_b64 s2, s2 + + read_8sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) + s_waitcnt lgkmcnt(0) + + s_sub_u32 m0, m0, 8 // Restore from S[0] to S[96] + s_nop 0 // hazard SALU M0=> S_MOVREL + + s_movreld_b64 s0, s0 //s[0+m0] = s0 + s_movreld_b64 s2, s2 + s_movreld_b64 s4, s4 + s_movreld_b64 s6, s6 + + L_RESTORE_SGPR_LOOP: + read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) + s_waitcnt lgkmcnt(0) + + s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0] + s_nop 0 // hazard SALU M0=> S_MOVREL + + s_movreld_b64 s0, s0 //s[0+m0] = s0 + s_movreld_b64 s2, s2 + s_movreld_b64 s4, s4 + s_movreld_b64 s6, s6 + s_movreld_b64 s8, s8 + s_movreld_b64 s10, s10 + s_movreld_b64 s12, s12 + s_movreld_b64 s14, s14 + + s_cmp_eq_u32 m0, 0 //scc = (m0 < s_sgpr_save_num) ? 1 : 0 + s_cbranch_scc0 L_RESTORE_SGPR_LOOP + + /* restore HW registers */ +L_RESTORE_HWREG: + // HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR) + get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size) + get_svgpr_size_bytes(s_restore_tmp) + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes() + + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset) + s_waitcnt lgkmcnt(0) + + s_setreg_b32 hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO), s_restore_flat_scratch + + read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset) + s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS + + s_setreg_b32 hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI), s_restore_flat_scratch + + s_mov_b32 s_restore_tmp, s_restore_pc_hi + s_and_b32 s_restore_pc_hi, s_restore_tmp, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS + + s_mov_b32 m0, s_restore_m0 + s_mov_b32 exec_lo, s_restore_exec_lo + s_mov_b32 exec_hi, s_restore_exec_hi + + s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 + s_setreg_b32 hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask + s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts + s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 + s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode + s_and_b32 s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_RCNT_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT + s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT + s_mov_b32 s_restore_mode, 0x0 + s_or_b32 s_restore_mode, s_restore_mode, s_restore_m0 + s_and_b32 s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_FIRST_REPLAY_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT + s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT + s_or_b32 s_restore_mode, s_restore_mode, s_restore_m0 + s_and_b32 s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_REPLAY_W64H_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_REPLAY_W64H_SHIFT + s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT + s_or_b32 s_restore_mode, s_restore_mode, s_restore_m0 + + s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT + s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_mode + + s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 + s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 + s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu + + s_barrier //barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG + + s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution + +L_END_PGM: + s_endpgm +end + +function write_hwreg_to_mem(s, s_rsrc, s_mem_offset) + s_mov_b32 exec_lo, m0 + s_mov_b32 m0, s_mem_offset + s_buffer_store_dword s, s_rsrc, m0 glc:1 + s_add_u32 s_mem_offset, s_mem_offset, 4 + s_mov_b32 m0, exec_lo +end + + +function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) + s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 + s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 + s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 + s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1 + s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 + s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 +end + +function write_12sgpr_to_mem(s, s_rsrc, s_mem_offset) + s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 + s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 + s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 + s_add_u32 s_rsrc[0], s_rsrc[0], 4*12 + s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 +end + + +function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) + s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 + s_add_u32 s_mem_offset, s_mem_offset, 4 +end + +function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) + s_sub_u32 s_mem_offset, s_mem_offset, 4*16 + s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1 +end + +function read_8sgpr_from_mem(s, s_rsrc, s_mem_offset) + s_sub_u32 s_mem_offset, s_mem_offset, 4*8 + s_buffer_load_dwordx8 s, s_rsrc, s_mem_offset glc:1 +end + +function read_4sgpr_from_mem(s, s_rsrc, s_mem_offset) + s_sub_u32 s_mem_offset, s_mem_offset, 4*4 + s_buffer_load_dwordx4 s, s_rsrc, s_mem_offset glc:1 +end + + +function get_lds_size_bytes(s_lds_size_byte) + s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) + s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW +end + +function get_vgpr_size_bytes(s_vgpr_size_byte, s_size) + s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) + s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 + s_lshr_b32 m0, s_size, S_WAVE_SIZE + s_and_b32 m0, m0, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_ENABLE_SHIFT_W64 + s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+7) //Number of VGPRs = (vgpr_size + 1) * 4 * 32 * 4 (non-zero value) + s_branch L_SHIFT_DONE +L_ENABLE_SHIFT_W64: + s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) +L_SHIFT_DONE: +end + +function get_svgpr_size_bytes(s_svgpr_size_byte) + s_getreg_b32 s_svgpr_size_byte, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE) + s_lshl_b32 s_svgpr_size_byte, s_svgpr_size_byte, (3+7) +end + +function get_sgpr_size_bytes + return 512 +end + +function get_hwreg_size_bytes + return 128 +end diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm index abe1a5da29fb..b195b7cd8a17 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm @@ -24,78 +24,6 @@ * PROJECT=vi ./sp3 cwsr_trap_handler_gfx8.asm -hex tmp.hex */ -/* HW (VI) source code for CWSR trap handler */ -/* Version 18 + multiple trap handler */ - -// this performance-optimal version was originally from Seven Xu at SRDC - -// Revison #18 --... -/* Rev History -** #1. Branch from gc dv. //gfxip/gfx8/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV) -** #4. SR Memory Layout: -** 1. VGPR-SGPR-HWREG-{LDS} -** 2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern.. -** #5. Update: 1. Accurate g8sr_ts_save_d timestamp -** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation) -** #7. Update: 1. don't barrier if noLDS -** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version -** 2. Fix SQ issue by s_sleep 2 -** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last -** 2. optimize s_buffer save by burst 16sgprs... -** #10. Update 1. Optimize restore sgpr by busrt 16 sgprs. -** #11. Update 1. Add 2 more timestamp for debug version -** #12. Update 1. Add VGPR SR using DWx4, some case improve and some case drop performance -** #13. Integ 1. Always use MUBUF for PV trap shader... -** #14. Update 1. s_buffer_store soft clause... -** #15. Update 1. PERF - sclar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot. -** #16. Update 1. PRRF - UNROLL LDS_DMA got 2500cycle save in IP tree -** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part] -** 2. PERF - Save LDS before save VGPR to cover LDS save long latency... -** #18. Update 1. FUNC - Implicitly estore STATUS.VCCZ, which is not writable by s_setreg_b32 -** 2. FUNC - Handle non-CWSR traps -*/ - -var G8SR_WDMEM_HWREG_OFFSET = 0 -var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes - -// Keep definition same as the app shader, These 2 time stamps are part of the app shader... Should before any Save and after restore. - -var G8SR_DEBUG_TIMESTAMP = 0 -var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset -var s_g8sr_ts_save_s = s[34:35] // save start -var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader send SAVEWAVE msg to spi -var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI write the sr address to SQ -var s_g8sr_ts_save_d = s[40:41] // save end -var s_g8sr_ts_restore_s = s[42:43] // restore start -var s_g8sr_ts_restore_d = s[44:45] // restore end - -var G8SR_VGPR_SR_IN_DWX4 = 0 -var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes -var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 - - -/*************************************************************************/ -/* control on how to run the shader */ -/*************************************************************************/ -//any hack that needs to be made to run this code in EMU (either because various EMU code are not ready or no compute save & restore in EMU run) -var EMU_RUN_HACK = 0 -var EMU_RUN_HACK_RESTORE_NORMAL = 0 -var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0 -var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0 -var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK -var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK -var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK -var SAVE_LDS = 1 -var WG_BASE_ADDR_LO = 0x9000a000 -var WG_BASE_ADDR_HI = 0x0 -var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem -var CTX_SAVE_CONTROL = 0x0 -var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL -var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code are not ready or no compute save & restore in RTL run) -var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write -var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes -var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing - /**************************************************************************/ /* variables */ /**************************************************************************/ @@ -226,16 +154,7 @@ shader main type(CS) - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore - //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC - s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC - s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f. - s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE - //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE - s_branch L_SKIP_RESTORE //NOT restore, SAVE actually - else s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save - end L_JUMP_TO_RESTORE: s_branch L_RESTORE //restore @@ -249,7 +168,7 @@ L_SKIP_RESTORE: s_cbranch_scc1 L_SAVE //this is the operation for save // ********* Handle non-CWSR traps ******************* -if (!EMU_RUN_HACK) + /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */ s_load_dwordx4 [ttmp8,ttmp9,ttmp10, ttmp11], [tma_lo,tma_hi], 0 s_waitcnt lgkmcnt(0) @@ -268,7 +187,7 @@ L_EXCP_CASE: s_and_b32 ttmp1, ttmp1, 0xFFFF set_status_without_spi_prio(s_save_status, ttmp2) //restore HW status(SCC) s_rfe_b64 [ttmp0, ttmp1] -end + // ********* End handling of non-CWSR traps ******************* /**************************************************************************/ @@ -276,25 +195,6 @@ end /**************************************************************************/ L_SAVE: - -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_save_s - s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? -end - - //check whether there is mem_viol - s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) - s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK - s_cbranch_scc0 L_NO_PC_REWIND - - //if so, need rewind PC assuming GDS operation gets NACKed - s_mov_b32 s_save_tmp, 0 //clear mem_viol bit - s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit - s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] - s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8 - s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc - -L_NO_PC_REWIND: s_mov_b32 s_save_tmp, 0 //clear saveCtx bit s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit @@ -316,16 +216,7 @@ L_NO_PC_REWIND: s_mov_b32 s_save_exec_hi, exec_hi s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_sq_save_msg - s_waitcnt lgkmcnt(0) -end - - if (EMU_RUN_HACK) - - else s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC - end // Set SPI_PRIO=2 to avoid starving instruction fetch in the waves we're waiting for. s_or_b32 s_save_tmp, s_save_status, (2 << SQ_WAVE_STATUS_SPI_PRIO_SHIFT) @@ -334,36 +225,9 @@ end L_SLEEP: s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0 - if (EMU_RUN_HACK) - - else s_cbranch_execz L_SLEEP - end - -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_spi_wrexec - s_waitcnt lgkmcnt(0) -end /* setup Resource Contants */ - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE)) - //calculate wd_addr using absolute thread id - v_readlane_b32 s_save_tmp, v9, 0 - s_lshr_b32 s_save_tmp, s_save_tmp, 6 - s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE - s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO - s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI - s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL - else - end - if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE)) - s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO - s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI - s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL - else - end - - s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE @@ -396,22 +260,10 @@ end s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes - if (SWIZZLE_EN) - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0 - - if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME)) - s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 - s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over - s_mov_b32 tba_lo, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO - s_mov_b32 tba_hi, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI - end - write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset) write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC @@ -453,18 +305,8 @@ end s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) - if (SGPR_SAVE_USE_SQC) s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes - else - s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) - end - - if (SWIZZLE_EN) - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0 //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0 @@ -503,30 +345,14 @@ end s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on s_mov_b32 exec_hi, 0xFFFFFFFF - if (SWIZZLE_EN) - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - // VGPR Allocated in 4-GPR granularity -if G8SR_VGPR_SR_IN_DWX4 - // the const stride for DWx4 is 4*4 bytes - s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes - - buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 - - s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes -else buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 -end @@ -562,64 +388,10 @@ end s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes() - if (SWIZZLE_EN) - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - s_mov_b32 m0, 0x0 //lds_offset initial value = 0 -var LDS_DMA_ENABLE = 0 -var UNROLL = 0 -if UNROLL==0 && LDS_DMA_ENABLE==1 - s_mov_b32 s3, 256*2 - s_nop 0 - s_nop 0 - s_nop 0 - L_SAVE_LDS_LOOP: - //TODO: looks the 2 buffer_store/load clause for s/r will hurt performance.??? - if (SAVE_LDS) //SPI always alloc LDS space in 128DW granularity - buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW - buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW - end - - s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes - s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes - s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 - s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete? - -elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROOL , has ichace miss - // store from higest LDS address to lowest - s_mov_b32 s3, 256*2 - s_sub_u32 m0, s_save_alloc_size, s3 - s_add_u32 s_save_mem_offset, s_save_mem_offset, m0 - s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128 trunks... - s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from higheset addr to lowest - s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block cost 6*4 Bytes instruction - s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //2is the below 2 inst...//s_addc and s_setpc - s_nop 0 - s_nop 0 - s_nop 0 //pad 3 dw to let LDS_DMA align with 64Bytes - s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved - s_add_u32 s0, s0,s_save_alloc_size - s_addc_u32 s1, s1, 0 - s_setpc_b64 s[0:1] - - - for var i =0; i< 128; i++ - // be careful to make here a 64Byte aligned address, which could improve performance... - buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW - buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW - - if i!=127 - s_sub_u32 m0, m0, s3 // use a sgpr to shrink 2DW-inst to 1DW inst to improve performance , i.e. pack more LDS_DMA inst to one Cacheline - s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3 - end - end - -else // BUFFER_STORE v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0 v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid v_mul_i32_i24 v2, v3, 8 // tid*8 @@ -641,8 +413,6 @@ L_SAVE_LDS_LOOP_VECTOR: // restore rsrc3 s_mov_b32 s_save_buf_rsrc3, s0 -end - L_SAVE_LDS_DONE: @@ -660,44 +430,8 @@ L_SAVE_LDS_DONE: s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) - if (SWIZZLE_EN) - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - - - // VGPR Allocated in 4-GPR granularity -if G8SR_VGPR_SR_IN_DWX4 - // the const stride for DWx4 is 4*4 bytes - s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes - - s_mov_b32 m0, 4 // skip first 4 VGPRs - s_cmp_lt_u32 m0, s_save_alloc_size - s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs - - s_set_gpr_idx_on m0, 0x1 // This will change M0 - s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0 -L_SAVE_VGPR_LOOP: - v_mov_b32 v0, v0 // v0 = v[0+m0] - v_mov_b32 v1, v1 - v_mov_b32 v2, v2 - v_mov_b32 v3, v3 - - - buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 - s_add_u32 m0, m0, 4 - s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 - s_cmp_lt_u32 m0, s_save_alloc_size - s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? - s_set_gpr_idx_off -L_SAVE_VGPR_LOOP_END: - - s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes -else // VGPR store using dw burst s_mov_b32 m0, 0x4 //VGPR initial index value =0 s_cmp_lt_u32 m0, s_save_alloc_size @@ -713,52 +447,18 @@ else v_mov_b32 v2, v2 //v0 = v[0+m0] v_mov_b32 v3, v3 //v0 = v[0+m0] - if(USE_MTBUF_INSTEAD_OF_MUBUF) - tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 - else buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 - end s_add_u32 m0, m0, 4 //next vgpr index s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? s_set_gpr_idx_off -end L_SAVE_VGPR_END: - - - - - - - /* S_PGM_END_SAVED */ //FIXME graphics ONLY - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) - s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] - s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 - s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over - s_rfe_b64 s_save_pc_lo //Return to the main shader program - else - end - -// Save Done timestamp -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_save_d - // SGPR SR memory offset : size(VGPR) - get_vgpr_size_bytes(s_save_mem_offset) - s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET - s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? - // Need reset rsrc2?? - s_mov_b32 m0, s_save_mem_offset - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1 -end - - s_branch L_END_PGM @@ -769,27 +469,6 @@ end L_RESTORE: /* Setup Resource Contants */ - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) - //calculate wd_addr using absolute thread id - v_readlane_b32 s_restore_tmp, v9, 0 - s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 - s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE - s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO - s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI - s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL - else - end - -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_restore_s - s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? - // tma_lo/hi are sgpr 110, 111, which will not used for 112 SGPR allocated case... - s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0] - s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //backup ts to ttmp0/1, sicne exec will be finally restored.. -end - - - s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE @@ -831,18 +510,12 @@ end s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow??? - if (SWIZZLE_EN) - s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end s_mov_b32 m0, 0x0 //lds_offset initial value = 0 L_RESTORE_LDS_LOOP: - if (SAVE_LDS) buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW - end s_add_u32 m0, m0, 256*2 // 128 DW s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 @@ -861,40 +534,8 @@ end s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) - if (SWIZZLE_EN) - s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - -if G8SR_VGPR_SR_IN_DWX4 - get_vgpr_size_bytes(s_restore_mem_offset) - s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 - - // the const stride for DWx4 is 4*4 bytes - s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes - - s_mov_b32 m0, s_restore_alloc_size - s_set_gpr_idx_on m0, 0x8 // Note.. This will change m0 - -L_RESTORE_VGPR_LOOP: - buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 - s_waitcnt vmcnt(0) - s_sub_u32 m0, m0, 4 - v_mov_b32 v0, v0 // v[0+m0] = v0 - v_mov_b32 v1, v1 - v_mov_b32 v2, v2 - v_mov_b32 v3, v3 - s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 - s_cmp_eq_u32 m0, 0x8000 - s_cbranch_scc0 L_RESTORE_VGPR_LOOP - s_set_gpr_idx_off - - s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // const stride to 4*4 bytes - -else + // VGPR load using dw burst s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 @@ -903,14 +544,10 @@ else s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later L_RESTORE_VGPR_LOOP: - if(USE_MTBUF_INSTEAD_OF_MUBUF) - tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 - else buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3 - end s_waitcnt vmcnt(0) //ensure data ready v_mov_b32 v0, v0 //v[0+m0] = v0 v_mov_b32 v1, v1 @@ -922,16 +559,10 @@ else s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete? s_set_gpr_idx_off /* VGPR restore on v0 */ - if(USE_MTBUF_INSTEAD_OF_MUBUF) - tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 - else buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256 buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2 buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3 - end - -end /* restore SGPRs */ ////////////////////////////// @@ -947,16 +578,8 @@ end s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) - if (SGPR_SAVE_USE_SQC) s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes - else - s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) - end - if (SWIZZLE_EN) - s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end /* If 112 SGPRs ar allocated, 4 sgprs are not used TBA(108,109),TMA(110,111), However, we are safe to restore these 4 SGPRs anyway, since TBA,TMA will later be restored by HWREG @@ -985,12 +608,6 @@ end ////////////////////////////// L_RESTORE_HWREG: - -if G8SR_DEBUG_TIMESTAMP - s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo - s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi -end - // HWREG SR memory offset : size(VGPR)+size(SGPR) get_vgpr_size_bytes(s_restore_mem_offset) get_sgpr_size_bytes(s_restore_tmp) @@ -998,11 +615,7 @@ end s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes - if (SWIZZLE_EN) - s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0 read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC @@ -1019,16 +632,6 @@ end s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS - //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) - s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) - s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over - end - if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL)) - s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal - s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over - end - s_mov_b32 m0, s_restore_m0 s_mov_b32 exec_lo, s_restore_exec_lo s_mov_b32 exec_hi, s_restore_exec_hi @@ -1061,11 +664,6 @@ end s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_restore_d - s_waitcnt lgkmcnt(0) -end - // s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm index 0bb9c577b3a2..75f29d13c90f 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm @@ -24,76 +24,9 @@ * PROJECT=greenland ./sp3 cwsr_trap_handler_gfx9.asm -hex tmp.hex */ -/* HW (GFX9) source code for CWSR trap handler */ -/* Version 18 + multiple trap handler */ - -// this performance-optimal version was originally from Seven Xu at SRDC - -// Revison #18 --... -/* Rev History -** #1. Branch from gc dv. //gfxip/gfx9/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV) -** #4. SR Memory Layout: -** 1. VGPR-SGPR-HWREG-{LDS} -** 2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern.. -** #5. Update: 1. Accurate g8sr_ts_save_d timestamp -** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation) -** #7. Update: 1. don't barrier if noLDS -** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version -** 2. Fix SQ issue by s_sleep 2 -** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last -** 2. optimize s_buffer save by burst 16sgprs... -** #10. Update 1. Optimize restore sgpr by busrt 16 sgprs. -** #11. Update 1. Add 2 more timestamp for debug version -** #12. Update 1. Add VGPR SR using DWx4, some case improve and some case drop performance -** #13. Integ 1. Always use MUBUF for PV trap shader... -** #14. Update 1. s_buffer_store soft clause... -** #15. Update 1. PERF - sclar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot. -** #16. Update 1. PRRF - UNROLL LDS_DMA got 2500cycle save in IP tree -** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part] -** 2. PERF - Save LDS before save VGPR to cover LDS save long latency... -** #18. Update 1. FUNC - Implicitly estore STATUS.VCCZ, which is not writable by s_setreg_b32 -** 2. FUNC - Handle non-CWSR traps -*/ - -var G8SR_WDMEM_HWREG_OFFSET = 0 -var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes - -// Keep definition same as the app shader, These 2 time stamps are part of the app shader... Should before any Save and after restore. - -var G8SR_DEBUG_TIMESTAMP = 0 -var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset -var s_g8sr_ts_save_s = s[34:35] // save start -var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader send SAVEWAVE msg to spi -var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI write the sr address to SQ -var s_g8sr_ts_save_d = s[40:41] // save end -var s_g8sr_ts_restore_s = s[42:43] // restore start -var s_g8sr_ts_restore_d = s[44:45] // restore end - -var G8SR_VGPR_SR_IN_DWX4 = 0 -var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes -var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 - - -/*************************************************************************/ -/* control on how to run the shader */ -/*************************************************************************/ -//any hack that needs to be made to run this code in EMU (either because various EMU code are not ready or no compute save & restore in EMU run) -var EMU_RUN_HACK = 0 -var EMU_RUN_HACK_RESTORE_NORMAL = 0 -var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0 -var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0 -var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK -var SAVE_LDS = 1 -var WG_BASE_ADDR_LO = 0x9000a000 -var WG_BASE_ADDR_HI = 0x0 -var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem -var CTX_SAVE_CONTROL = 0x0 -var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL -var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code are not ready or no compute save & restore in RTL run) -var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write -var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes -var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing var ACK_SQC_STORE = 1 //workaround for suspected SQC store bug causing incorrect stores under concurrency +var SAVE_AFTER_XNACK_ERROR = 1 //workaround for TCP store failure after XNACK error when ALLOW_REPLAY=0, for debugger +var SINGLE_STEP_MISSED_WORKAROUND = 1 //workaround for lost MODE.DEBUG_EN exception when SAVECTX raised /**************************************************************************/ /* variables */ @@ -107,6 +40,7 @@ var SQ_WAVE_STATUS_PRE_SPI_PRIO_SHIFT = 0 var SQ_WAVE_STATUS_PRE_SPI_PRIO_SIZE = 1 var SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT = 3 var SQ_WAVE_STATUS_POST_SPI_PRIO_SIZE = 29 +var SQ_WAVE_STATUS_ALLOW_REPLAY_MASK = 0x400000 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 @@ -127,12 +61,15 @@ var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800 var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11 var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21 var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK = 0x800 +var SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK = 0x10000000 var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x1F8000 var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME +var SQ_WAVE_MODE_DEBUG_EN_MASK = 0x800 + var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24 var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27 @@ -150,10 +87,10 @@ var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28 var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 -var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used -var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME -var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME -var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME +var S_SAVE_PC_HI_RCNT_SHIFT = 27 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used +var S_SAVE_PC_HI_RCNT_MASK = 0xF8000000 //FIXME +var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 26 //FIXME +var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x04000000 //FIXME var s_save_spi_init_lo = exec_lo var s_save_spi_init_hi = exec_hi @@ -162,8 +99,8 @@ var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],tra var s_save_pc_hi = ttmp1 var s_save_exec_lo = ttmp2 var s_save_exec_hi = ttmp3 -var s_save_tmp = ttmp4 -var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine +var s_save_tmp = ttmp14 +var s_save_trapsts = ttmp15 //not really used until the end of the SAVE routine var s_save_xnack_mask_lo = ttmp6 var s_save_xnack_mask_hi = ttmp7 var s_save_buf_rsrc0 = ttmp8 @@ -171,9 +108,9 @@ var s_save_buf_rsrc1 = ttmp9 var s_save_buf_rsrc2 = ttmp10 var s_save_buf_rsrc3 = ttmp11 var s_save_status = ttmp12 -var s_save_mem_offset = ttmp14 +var s_save_mem_offset = ttmp4 var s_save_alloc_size = s_save_trapsts //conflict -var s_save_m0 = ttmp15 +var s_save_m0 = ttmp5 var s_save_ttmps_lo = s_save_tmp //no conflict var s_save_ttmps_hi = s_save_trapsts //no conflict @@ -197,20 +134,22 @@ var s_restore_spi_init_lo = exec_lo var s_restore_spi_init_hi = exec_hi var s_restore_mem_offset = ttmp12 +var s_restore_accvgpr_offset = ttmp13 var s_restore_alloc_size = ttmp3 var s_restore_tmp = ttmp2 var s_restore_mem_offset_save = s_restore_tmp //no conflict +var s_restore_accvgpr_offset_save = ttmp7 var s_restore_m0 = s_restore_alloc_size //no conflict -var s_restore_mode = ttmp7 +var s_restore_mode = s_restore_accvgpr_offset_save var s_restore_pc_lo = ttmp0 var s_restore_pc_hi = ttmp1 -var s_restore_exec_lo = ttmp14 -var s_restore_exec_hi = ttmp15 -var s_restore_status = ttmp4 -var s_restore_trapsts = ttmp5 +var s_restore_exec_lo = ttmp4 +var s_restore_exec_hi = ttmp5 +var s_restore_status = ttmp14 +var s_restore_trapsts = ttmp15 var s_restore_xnack_mask_lo = xnack_mask_lo var s_restore_xnack_mask_hi = xnack_mask_hi var s_restore_buf_rsrc0 = ttmp8 @@ -226,20 +165,11 @@ var s_restore_ttmps_hi = s_restore_alloc_size //no conflict /* Shader Main*/ shader main - asic(GFX9) + asic(DEFAULT) type(CS) - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore - //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC - s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC - s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f. - s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE - //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE - s_branch L_SKIP_RESTORE //NOT restore, SAVE actually - else s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save - end L_JUMP_TO_RESTORE: s_branch L_RESTORE //restore @@ -248,12 +178,29 @@ L_SKIP_RESTORE: s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save + +if SINGLE_STEP_MISSED_WORKAROUND + // No single step exceptions if MODE.DEBUG_EN=0. + s_getreg_b32 ttmp2, hwreg(HW_REG_MODE) + s_and_b32 ttmp2, ttmp2, SQ_WAVE_MODE_DEBUG_EN_MASK + s_cbranch_scc0 L_NO_SINGLE_STEP_WORKAROUND + + // Second-level trap already handled exception if STATUS.HALT=1. + s_and_b32 ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK + + // Prioritize single step exception over context save. + // Second-level trap will halt wave and RFE, re-entering for SAVECTX. + s_cbranch_scc0 L_FETCH_2ND_TRAP + +L_NO_SINGLE_STEP_WORKAROUND: +end + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save s_cbranch_scc1 L_SAVE //this is the operation for save // ********* Handle non-CWSR traps ******************* -if (!EMU_RUN_HACK) + // Illegal instruction is a non-maskable exception which blocks context save. // Halt the wavefront and return from the trap. s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK @@ -266,10 +213,16 @@ if (!EMU_RUN_HACK) L_HALT_WAVE: // If STATUS.HALT is set then this fault must come from SQC instruction fetch. - // We cannot prevent further faults so just terminate the wavefront. + // We cannot prevent further faults. Spin wait until context saved. s_and_b32 ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK s_cbranch_scc0 L_NOT_ALREADY_HALTED - s_endpgm + +L_WAIT_CTX_SAVE: + s_sleep 0x10 + s_getreg_b32 ttmp2, hwreg(HW_REG_TRAPSTS) + s_and_b32 ttmp2, ttmp2, SQ_WAVE_TRAPSTS_SAVECTX_MASK + s_cbranch_scc0 L_WAIT_CTX_SAVE + L_NOT_ALREADY_HALTED: s_or_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK @@ -293,12 +246,12 @@ L_FETCH_2ND_TRAP: // Read second-level TBA/TMA from first-level TMA and jump if available. // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data) // ttmp12 holds SQ_WAVE_STATUS - s_getreg_b32 ttmp4, hwreg(HW_REG_SQ_SHADER_TMA_LO) - s_getreg_b32 ttmp5, hwreg(HW_REG_SQ_SHADER_TMA_HI) - s_lshl_b64 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8 - s_load_dwordx2 [ttmp2, ttmp3], [ttmp4, ttmp5], 0x0 glc:1 // second-level TBA + s_getreg_b32 ttmp14, hwreg(HW_REG_SQ_SHADER_TMA_LO) + s_getreg_b32 ttmp15, hwreg(HW_REG_SQ_SHADER_TMA_HI) + s_lshl_b64 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 + s_load_dwordx2 [ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 glc:1 // second-level TBA s_waitcnt lgkmcnt(0) - s_load_dwordx2 [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8 glc:1 // second-level TMA + s_load_dwordx2 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 glc:1 // second-level TMA s_waitcnt lgkmcnt(0) s_and_b64 [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3] s_cbranch_scc0 L_NO_NEXT_TRAP // second-level trap handler not been set @@ -324,7 +277,7 @@ L_EXCP_CASE: set_status_without_spi_prio(s_save_status, ttmp2) s_rfe_b64 [ttmp0, ttmp1] -end + // ********* End handling of non-CWSR traps ******************* /**************************************************************************/ @@ -332,12 +285,6 @@ end /**************************************************************************/ L_SAVE: - -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_save_s - s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? -end - s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] s_mov_b32 s_save_tmp, 0 //clear saveCtx bit @@ -359,16 +306,7 @@ end s_mov_b32 s_save_exec_hi, exec_hi s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_sq_save_msg - s_waitcnt lgkmcnt(0) -end - - if (EMU_RUN_HACK) - - else s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC - end // Set SPI_PRIO=2 to avoid starving instruction fetch in the waves we're waiting for. s_or_b32 s_save_tmp, s_save_status, (2 << SQ_WAVE_STATUS_SPI_PRIO_SHIFT) @@ -377,35 +315,9 @@ end L_SLEEP: s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0 - if (EMU_RUN_HACK) - - else s_cbranch_execz L_SLEEP - end - -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_spi_wrexec - s_waitcnt lgkmcnt(0) -end - - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE)) - //calculate wd_addr using absolute thread id - v_readlane_b32 s_save_tmp, v9, 0 - s_lshr_b32 s_save_tmp, s_save_tmp, 6 - s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE - s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO - s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI - s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL - else - end - if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE)) - s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO - s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI - s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL - else - end - // Save trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic + // Save trap temporaries 4-11, 13 initialized by SPI debug dispatch logic // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40 get_vgpr_size_bytes(s_save_ttmps_lo) get_sgpr_size_bytes(s_save_ttmps_hi) @@ -413,13 +325,11 @@ end s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo s_addc_u32 s_save_ttmps_hi, s_save_spi_init_hi, 0x0 s_and_b32 s_save_ttmps_hi, s_save_ttmps_hi, 0xFFFF - s_store_dwordx2 [ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x40 glc:1 + s_store_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x50 glc:1 ack_sqc_store_workaround() - s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x48 glc:1 + s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x60 glc:1 ack_sqc_store_workaround() - s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x58 glc:1 - ack_sqc_store_workaround() - s_store_dwordx2 [ttmp14, ttmp15], [s_save_ttmps_lo, s_save_ttmps_hi], 0x5C glc:1 + s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x74 glc:1 ack_sqc_store_workaround() /* setup Resource Contants */ @@ -455,20 +365,10 @@ end s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes - if (SWIZZLE_EN) - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0 - - if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME)) - s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 - s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over - end - write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset) write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC @@ -506,17 +406,9 @@ end s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) - if (SGPR_SAVE_USE_SQC) s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes - else - s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) - end - if (SWIZZLE_EN) - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0 @@ -559,30 +451,25 @@ end s_mov_b32 xnack_mask_lo, 0x0 s_mov_b32 xnack_mask_hi, 0x0 - if (SWIZZLE_EN) - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end // VGPR Allocated in 4-GPR granularity -if G8SR_VGPR_SR_IN_DWX4 - // the const stride for DWx4 is 4*4 bytes - s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes +if SAVE_AFTER_XNACK_ERROR + check_if_tcp_store_ok() + s_cbranch_scc1 L_SAVE_FIRST_VGPRS_WITH_TCP - buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + write_vgprs_to_mem_with_sqc(v0, 4, s_save_buf_rsrc0, s_save_mem_offset) + s_branch L_SAVE_LDS + +L_SAVE_FIRST_VGPRS_WITH_TCP: +end - s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes -else buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 -end @@ -617,66 +504,34 @@ end s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes() - if (SWIZZLE_EN) - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end s_mov_b32 m0, 0x0 //lds_offset initial value = 0 -var LDS_DMA_ENABLE = 0 -var UNROLL = 0 -if UNROLL==0 && LDS_DMA_ENABLE==1 - s_mov_b32 s3, 256*2 - s_nop 0 - s_nop 0 - s_nop 0 - L_SAVE_LDS_LOOP: - //TODO: looks the 2 buffer_store/load clause for s/r will hurt performance.??? - if (SAVE_LDS) //SPI always alloc LDS space in 128DW granularity - buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW - buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW - end - - s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes - s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes - s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 - s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete? - -elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROOL , has ichace miss - // store from higest LDS address to lowest - s_mov_b32 s3, 256*2 - s_sub_u32 m0, s_save_alloc_size, s3 - s_add_u32 s_save_mem_offset, s_save_mem_offset, m0 - s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128 trunks... - s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from higheset addr to lowest - s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block cost 6*4 Bytes instruction - s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //2is the below 2 inst...//s_addc and s_setpc - s_nop 0 - s_nop 0 - s_nop 0 //pad 3 dw to let LDS_DMA align with 64Bytes - s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved - s_add_u32 s0, s0,s_save_alloc_size - s_addc_u32 s1, s1, 0 - s_setpc_b64 s[0:1] - - - for var i =0; i< 128; i++ - // be careful to make here a 64Byte aligned address, which could improve performance... - buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW - buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW - - if i!=127 - s_sub_u32 m0, m0, s3 // use a sgpr to shrink 2DW-inst to 1DW inst to improve performance , i.e. pack more LDS_DMA inst to one Cacheline - s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3 - end - end - -else // BUFFER_STORE v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0 v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid + +if SAVE_AFTER_XNACK_ERROR + check_if_tcp_store_ok() + s_cbranch_scc1 L_SAVE_LDS_WITH_TCP + + v_lshlrev_b32 v2, 2, v3 +L_SAVE_LDS_LOOP_SQC: + ds_read2_b32 v[0:1], v2 offset0:0 offset1:0x40 + s_waitcnt lgkmcnt(0) + + write_vgprs_to_mem_with_sqc(v0, 2, s_save_buf_rsrc0, s_save_mem_offset) + + v_add_u32 v2, 0x200, v2 + v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size + s_cbranch_vccnz L_SAVE_LDS_LOOP_SQC + + s_branch L_SAVE_LDS_DONE + +L_SAVE_LDS_WITH_TCP: +end + v_mul_i32_i24 v2, v3, 8 // tid*8 v_mov_b32 v3, 256*2 s_mov_b32 m0, 0x10000 @@ -697,8 +552,6 @@ L_SAVE_LDS_LOOP_VECTOR: // restore rsrc3 s_mov_b32 s_save_buf_rsrc3, s0 -end - L_SAVE_LDS_DONE: @@ -716,44 +569,9 @@ L_SAVE_LDS_DONE: s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) - if (SWIZZLE_EN) - s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end - - - // VGPR Allocated in 4-GPR granularity - -if G8SR_VGPR_SR_IN_DWX4 - // the const stride for DWx4 is 4*4 bytes - s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes - - s_mov_b32 m0, 4 // skip first 4 VGPRs - s_cmp_lt_u32 m0, s_save_alloc_size - s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs - s_set_gpr_idx_on m0, 0x1 // This will change M0 - s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0 -L_SAVE_VGPR_LOOP: - v_mov_b32 v0, v0 // v0 = v[0+m0] - v_mov_b32 v1, v1 - v_mov_b32 v2, v2 - v_mov_b32 v3, v3 - - buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 - s_add_u32 m0, m0, 4 - s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 - s_cmp_lt_u32 m0, s_save_alloc_size - s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? - s_set_gpr_idx_off -L_SAVE_VGPR_LOOP_END: - - s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes -else // VGPR store using dw burst s_mov_b32 m0, 0x4 //VGPR initial index value =0 s_cmp_lt_u32 m0, s_save_alloc_size @@ -763,57 +581,82 @@ else s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later +if SAVE_AFTER_XNACK_ERROR + check_if_tcp_store_ok() + s_cbranch_scc1 L_SAVE_VGPR_LOOP + +L_SAVE_VGPR_LOOP_SQC: + write_vgprs_to_mem_with_sqc(v0, 4, s_save_buf_rsrc0, s_save_mem_offset) + + s_add_u32 m0, m0, 4 + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc1 L_SAVE_VGPR_LOOP_SQC + + s_set_gpr_idx_off + s_branch L_SAVE_VGPR_END +end + L_SAVE_VGPR_LOOP: v_mov_b32 v0, v0 //v0 = v[0+m0] v_mov_b32 v1, v1 //v0 = v[0+m0] v_mov_b32 v2, v2 //v0 = v[0+m0] v_mov_b32 v3, v3 //v0 = v[0+m0] - if(USE_MTBUF_INSTEAD_OF_MUBUF) - tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 - else buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 - end s_add_u32 m0, m0, 4 //next vgpr index s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? s_set_gpr_idx_off -end L_SAVE_VGPR_END: +if ASIC_TARGET_ARCTURUS + // Save ACC VGPRs + s_mov_b32 m0, 0x0 //VGPR initial index value =0 + s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 + +if SAVE_AFTER_XNACK_ERROR + check_if_tcp_store_ok() + s_cbranch_scc1 L_SAVE_ACCVGPR_LOOP +L_SAVE_ACCVGPR_LOOP_SQC: + for var vgpr = 0; vgpr < 4; ++ vgpr + v_accvgpr_read v[vgpr], acc[vgpr] // v[N] = acc[N+m0] + end + write_vgprs_to_mem_with_sqc(v0, 4, s_save_buf_rsrc0, s_save_mem_offset) + s_add_u32 m0, m0, 4 + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc1 L_SAVE_ACCVGPR_LOOP_SQC + s_set_gpr_idx_off + s_branch L_SAVE_ACCVGPR_END +end - /* S_PGM_END_SAVED */ //FIXME graphics ONLY - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) - s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] - s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 - s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over - s_rfe_b64 s_save_pc_lo //Return to the main shader program - else +L_SAVE_ACCVGPR_LOOP: + for var vgpr = 0; vgpr < 4; ++ vgpr + v_accvgpr_read v[vgpr], acc[vgpr] // v[N] = acc[N+m0] end -// Save Done timestamp -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_save_d - // SGPR SR memory offset : size(VGPR) - get_vgpr_size_bytes(s_save_mem_offset) - s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET - s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? - // Need reset rsrc2?? - s_mov_b32 m0, s_save_mem_offset - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1 -end + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 + s_add_u32 m0, m0, 4 + s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc1 L_SAVE_ACCVGPR_LOOP + s_set_gpr_idx_off + +L_SAVE_ACCVGPR_END: +end s_branch L_END_PGM @@ -825,27 +668,6 @@ end L_RESTORE: /* Setup Resource Contants */ - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) - //calculate wd_addr using absolute thread id - v_readlane_b32 s_restore_tmp, v9, 0 - s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 - s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE - s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO - s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI - s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL - else - end - -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_restore_s - s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? - // tma_lo/hi are sgpr 110, 111, which will not used for 112 SGPR allocated case... - s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0] - s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //backup ts to ttmp0/1, sicne exec will be finally restored.. -end - - - s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE @@ -887,18 +709,12 @@ end s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow??? - if (SWIZZLE_EN) - s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end s_mov_b32 m0, 0x0 //lds_offset initial value = 0 L_RESTORE_LDS_LOOP: - if (SAVE_LDS) buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW - end s_add_u32 m0, m0, 256*2 // 128 DW s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 @@ -917,56 +733,43 @@ end s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) - if (SWIZZLE_EN) - s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else + +if ASIC_TARGET_ARCTURUS + s_mov_b32 s_restore_accvgpr_offset, s_restore_buf_rsrc2 //ACC VGPRs at end of VGPRs +end + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end -if G8SR_VGPR_SR_IN_DWX4 - get_vgpr_size_bytes(s_restore_mem_offset) - s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 - - // the const stride for DWx4 is 4*4 bytes - s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes - - s_mov_b32 m0, s_restore_alloc_size - s_set_gpr_idx_on m0, 0x8 // Note.. This will change m0 - -L_RESTORE_VGPR_LOOP: - buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 - s_waitcnt vmcnt(0) - s_sub_u32 m0, m0, 4 - v_mov_b32 v0, v0 // v[0+m0] = v0 - v_mov_b32 v1, v1 - v_mov_b32 v2, v2 - v_mov_b32 v3, v3 - s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 - s_cmp_eq_u32 m0, 0x8000 - s_cbranch_scc0 L_RESTORE_VGPR_LOOP - s_set_gpr_idx_off - - s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 - s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // const stride to 4*4 bytes - -else // VGPR load using dw burst s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 +if ASIC_TARGET_ARCTURUS + s_mov_b32 s_restore_accvgpr_offset_save, s_restore_accvgpr_offset + s_add_u32 s_restore_accvgpr_offset, s_restore_accvgpr_offset, 256*4 +end s_mov_b32 m0, 4 //VGPR initial index value = 1 s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later L_RESTORE_VGPR_LOOP: - if(USE_MTBUF_INSTEAD_OF_MUBUF) - tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 - else + +if ASIC_TARGET_ARCTURUS + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset slc:1 glc:1 + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset slc:1 glc:1 offset:256 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset slc:1 glc:1 offset:256*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset slc:1 glc:1 offset:256*3 + s_add_u32 s_restore_accvgpr_offset, s_restore_accvgpr_offset, 256*4 + s_waitcnt vmcnt(0) + + for var vgpr = 0; vgpr < 4; ++ vgpr + v_accvgpr_write acc[vgpr], v[vgpr] + end +end + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3 - end s_waitcnt vmcnt(0) //ensure data ready v_mov_b32 v0, v0 //v[0+m0] = v0 v_mov_b32 v1, v1 @@ -978,16 +781,22 @@ else s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete? s_set_gpr_idx_off /* VGPR restore on v0 */ - if(USE_MTBUF_INSTEAD_OF_MUBUF) - tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 - else +if ASIC_TARGET_ARCTURUS + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset_save slc:1 glc:1 + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset_save slc:1 glc:1 offset:256 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset_save slc:1 glc:1 offset:256*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset_save slc:1 glc:1 offset:256*3 + s_waitcnt vmcnt(0) + + for var vgpr = 0; vgpr < 4; ++ vgpr + v_accvgpr_write acc[vgpr], v[vgpr] + end +end + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256 buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2 buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3 - end - -end /* restore SGPRs */ ////////////////////////////// @@ -1003,16 +812,8 @@ end s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) - if (SGPR_SAVE_USE_SQC) s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes - else - s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) - end - if (SWIZZLE_EN) - s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end s_mov_b32 m0, s_restore_alloc_size @@ -1040,11 +841,6 @@ end L_RESTORE_HWREG: -if G8SR_DEBUG_TIMESTAMP - s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo - s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi -end - // HWREG SR memory offset : size(VGPR)+size(SGPR) get_vgpr_size_bytes(s_restore_mem_offset) get_sgpr_size_bytes(s_restore_tmp) @@ -1052,11 +848,7 @@ end s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes - if (SWIZZLE_EN) - s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? - else s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - end read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0 read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC @@ -1071,16 +863,6 @@ end s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS - //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: - if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) - s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) - s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over - end - if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL)) - s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal - s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over - end - s_mov_b32 m0, s_restore_m0 s_mov_b32 exec_lo, s_restore_exec_lo s_mov_b32 exec_hi, s_restore_exec_hi @@ -1093,7 +875,7 @@ end //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode - // Restore trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic + // Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40 get_vgpr_size_bytes(s_restore_ttmps_lo) get_sgpr_size_bytes(s_restore_ttmps_hi) @@ -1101,10 +883,9 @@ end s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0 s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0 s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF - s_load_dwordx2 [ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x40 glc:1 - s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x48 glc:1 - s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x58 glc:1 - s_load_dwordx2 [ttmp14, ttmp15], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x5C glc:1 + s_load_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 glc:1 + s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 glc:1 + s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 glc:1 s_waitcnt lgkmcnt(0) //reuse s_restore_m0 as a temp register @@ -1128,11 +909,6 @@ end s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time -if G8SR_DEBUG_TIMESTAMP - s_memrealtime s_g8sr_ts_restore_d - s_waitcnt lgkmcnt(0) -end - // s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc @@ -1187,7 +963,39 @@ function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) s_sub_u32 s_mem_offset, s_mem_offset, 4*16 end +function check_if_tcp_store_ok + // If STATUS.ALLOW_REPLAY=0 and TRAPSTS.XNACK_ERROR=1 then TCP stores will fail. + s_and_b32 s_save_tmp, s_save_status, SQ_WAVE_STATUS_ALLOW_REPLAY_MASK + s_cbranch_scc1 L_TCP_STORE_CHECK_DONE + + s_getreg_b32 s_save_tmp, hwreg(HW_REG_TRAPSTS) + s_andn2_b32 s_save_tmp, SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK, s_save_tmp + +L_TCP_STORE_CHECK_DONE: +end + +function write_vgpr_to_mem_with_sqc(v, s_rsrc, s_mem_offset) + s_mov_b32 s4, 0 +L_WRITE_VGPR_LANE_LOOP: + for var lane = 0; lane < 4; ++ lane + v_readlane_b32 s[lane], v, s4 + s_add_u32 s4, s4, 1 + end + + s_buffer_store_dwordx4 s[0:3], s_rsrc, s_mem_offset glc:1 + ack_sqc_store_workaround() + + s_add_u32 s_mem_offset, s_mem_offset, 0x10 + s_cmp_eq_u32 s4, 0x40 + s_cbranch_scc0 L_WRITE_VGPR_LANE_LOOP +end + +function write_vgprs_to_mem_with_sqc(v, n_vgprs, s_rsrc, s_mem_offset) + for var vgpr = 0; vgpr < n_vgprs; ++ vgpr + write_vgpr_to_mem_with_sqc(v[vgpr], s_rsrc, s_mem_offset) + end +end function get_lds_size_bytes(s_lds_size_byte) // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW @@ -1199,6 +1007,10 @@ function get_vgpr_size_bytes(s_vgpr_size_byte) s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible + +if ASIC_TARGET_ARCTURUS + s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, 1 // Double size for ACC VGPRs +end end function get_sgpr_size_bytes(s_sgpr_size_byte) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 083bd8114db1..1544007af34a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -49,7 +49,7 @@ static const char kfd_dev_name[] = "kfd"; static const struct file_operations kfd_fops = { .owner = THIS_MODULE, .unlocked_ioctl = kfd_ioctl, - .compat_ioctl = kfd_ioctl, + .compat_ioctl = compat_ptr_ioctl, .open = kfd_open, .mmap = kfd_mmap, }; @@ -213,6 +213,8 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, q_properties->type = KFD_QUEUE_TYPE_COMPUTE; else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA) q_properties->type = KFD_QUEUE_TYPE_SDMA; + else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA_XGMI) + q_properties->type = KFD_QUEUE_TYPE_SDMA_XGMI; else return -ENOTSUPP; @@ -280,7 +282,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, goto err_bind_process; } - pr_debug("Creating queue for PASID %d on gpu 0x%x\n", + pr_debug("Creating queue for PASID 0x%x on gpu 0x%x\n", p->pasid, dev->id); @@ -330,7 +332,7 @@ static int kfd_ioctl_destroy_queue(struct file *filp, struct kfd_process *p, int retval; struct kfd_ioctl_destroy_queue_args *args = data; - pr_debug("Destroying queue id %d for pasid %d\n", + pr_debug("Destroying queue id %d for pasid 0x%x\n", args->queue_id, p->pasid); @@ -376,7 +378,7 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p, properties.queue_percent = args->queue_percentage; properties.priority = args->queue_priority; - pr_debug("Updating queue id %d for pasid %d\n", + pr_debug("Updating queue id %d for pasid 0x%x\n", args->queue_id, p->pasid); mutex_lock(&p->mutex); @@ -522,7 +524,7 @@ static int kfd_ioctl_set_trap_handler(struct file *filep, struct kfd_process_device *pdd; dev = kfd_device_by_id(args->gpu_id); - if (dev == NULL) + if (!dev) return -EINVAL; mutex_lock(&p->mutex); @@ -837,7 +839,7 @@ static int kfd_ioctl_get_clock_counters(struct file *filep, /* No access to rdtsc. Using raw monotonic time */ args->cpu_clock_counter = ktime_get_raw_ns(); - args->system_clock_counter = ktime_get_boot_ns(); + args->system_clock_counter = ktime_get_boottime_ns(); /* Since the counter is in nano-seconds we use 1GHz frequency */ args->system_clock_freq = 1000000000; @@ -853,7 +855,7 @@ static int kfd_ioctl_get_process_apertures(struct file *filp, struct kfd_process_device_apertures *pAperture; struct kfd_process_device *pdd; - dev_dbg(kfd_device, "get apertures for PASID %d", p->pasid); + dev_dbg(kfd_device, "get apertures for PASID 0x%x", p->pasid); args->num_of_nodes = 0; @@ -911,7 +913,7 @@ static int kfd_ioctl_get_process_apertures_new(struct file *filp, uint32_t nodes = 0; int ret; - dev_dbg(kfd_device, "get apertures for PASID %d", p->pasid); + dev_dbg(kfd_device, "get apertures for PASID 0x%x", p->pasid); if (args->num_of_nodes == 0) { /* Return number of nodes, so that user space can alloacate @@ -1126,7 +1128,7 @@ static int kfd_ioctl_set_scratch_backing_va(struct file *filep, mutex_unlock(&p->mutex); if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS && - pdd->qpd.vmid != 0) + pdd->qpd.vmid != 0 && dev->kfd2kgd->set_scratch_backing_va) dev->kfd2kgd->set_scratch_backing_va( dev->kgd, args->va_addr, pdd->qpd.vmid); @@ -1272,6 +1274,12 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, if (args->size != kfd_doorbell_process_slice(dev)) return -EINVAL; offset = kfd_get_process_doorbells(dev, p); + } else if (flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) { + if (args->size != PAGE_SIZE) + return -EINVAL; + offset = amdgpu_amdkfd_get_mmio_remap_phys_addr(dev->kgd); + if (!offset) + return -ENOMEM; } mutex_lock(&p->mutex); @@ -1301,6 +1309,14 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, args->handle = MAKE_HANDLE(args->gpu_id, idr_handle); args->mmap_offset = offset; + /* MMIO is mapped through kfd device + * Generate a kfd mmap offset + */ + if (flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) { + args->mmap_offset = KFD_MMAP_TYPE_MMIO | KFD_MMAP_GPU_ID(args->gpu_id); + args->mmap_offset <<= PAGE_SHIFT; + } + return 0; err_free: @@ -1785,7 +1801,7 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) } else goto err_i1; - dev_dbg(kfd_device, "ioctl cmd 0x%x (#%d), arg 0x%lx\n", cmd, nr, arg); + dev_dbg(kfd_device, "ioctl cmd 0x%x (#0x%x), arg 0x%lx\n", cmd, nr, arg); process = kfd_get_process(current); if (IS_ERR(process)) { @@ -1840,11 +1856,45 @@ err_i1: kfree(kdata); if (retcode) - dev_dbg(kfd_device, "ret = %d\n", retcode); + dev_dbg(kfd_device, "ioctl cmd (#0x%x), arg 0x%lx, ret = %d\n", + nr, arg, retcode); return retcode; } +static int kfd_mmio_mmap(struct kfd_dev *dev, struct kfd_process *process, + struct vm_area_struct *vma) +{ + phys_addr_t address; + int ret; + + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + address = amdgpu_amdkfd_get_mmio_remap_phys_addr(dev->kgd); + + vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE | + VM_DONTDUMP | VM_PFNMAP; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + pr_debug("pasid 0x%x mapping mmio page\n" + " target user address == 0x%08llX\n" + " physical address == 0x%08llX\n" + " vm_flags == 0x%04lX\n" + " size == 0x%04lX\n", + process->pasid, (unsigned long long) vma->vm_start, + address, vma->vm_flags, PAGE_SIZE); + + ret = io_remap_pfn_range(vma, + vma->vm_start, + address >> PAGE_SHIFT, + PAGE_SIZE, + vma->vm_page_prot); + return ret; +} + + static int kfd_mmap(struct file *filp, struct vm_area_struct *vma) { struct kfd_process *process; @@ -1875,6 +1925,10 @@ static int kfd_mmap(struct file *filp, struct vm_area_struct *vma) if (!dev) return -ENODEV; return kfd_reserved_mem_mmap(dev, process, vma); + case KFD_MMAP_TYPE_MMIO: + if (!dev) + return -ENODEV; + return kfd_mmio_mmap(dev, process, vma); } return -EFAULT; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c index 2e7c44955f43..de9f68d5c312 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c @@ -134,9 +134,13 @@ static struct kfd_gpu_cache_info carrizo_cache_info[] = { #define polaris10_cache_info carrizo_cache_info #define polaris11_cache_info carrizo_cache_info #define polaris12_cache_info carrizo_cache_info +#define vegam_cache_info carrizo_cache_info /* TODO - check & update Vega10 cache details */ #define vega10_cache_info carrizo_cache_info #define raven_cache_info carrizo_cache_info +#define renoir_cache_info carrizo_cache_info +/* TODO - check & update Navi10 cache details */ +#define navi10_cache_info carrizo_cache_info static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, struct crat_subtype_computeunit *cu) @@ -372,7 +376,7 @@ static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink, if (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS) props->weight = 20; else if (props->iolink_type == CRAT_IOLINK_TYPE_XGMI) - props->weight = 15; + props->weight = 15 * iolink->num_hops_xgmi; else props->weight = node_distance(id_from, id_to); @@ -652,9 +656,14 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev, pcache_info = polaris12_cache_info; num_of_cache_types = ARRAY_SIZE(polaris12_cache_info); break; + case CHIP_VEGAM: + pcache_info = vegam_cache_info; + num_of_cache_types = ARRAY_SIZE(vegam_cache_info); + break; case CHIP_VEGA10: case CHIP_VEGA12: case CHIP_VEGA20: + case CHIP_ARCTURUS: pcache_info = vega10_cache_info; num_of_cache_types = ARRAY_SIZE(vega10_cache_info); break; @@ -662,6 +671,16 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev, pcache_info = raven_cache_info; num_of_cache_types = ARRAY_SIZE(raven_cache_info); break; + case CHIP_RENOIR: + pcache_info = renoir_cache_info; + num_of_cache_types = ARRAY_SIZE(renoir_cache_info); + break; + case CHIP_NAVI10: + case CHIP_NAVI12: + case CHIP_NAVI14: + pcache_info = navi10_cache_info; + num_of_cache_types = ARRAY_SIZE(navi10_cache_info); + break; default: return -EINVAL; } @@ -691,7 +710,7 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev, pcache_info, cu_info, mem_available, - cu_info->cu_bitmap[i][j], + cu_info->cu_bitmap[i % 4][j + i / 4], ct, cu_processor_id, k); @@ -777,7 +796,7 @@ int kfd_create_crat_image_acpi(void **crat_image, size_t *size) * is put in the code to ensure we don't overwrite. */ #define VCRAT_SIZE_FOR_CPU (2 * PAGE_SIZE) -#define VCRAT_SIZE_FOR_GPU (3 * PAGE_SIZE) +#define VCRAT_SIZE_FOR_GPU (4 * PAGE_SIZE) /* kfd_fill_cu_for_cpu - Fill in Compute info for the given CPU NUMA node * @@ -1092,6 +1111,7 @@ static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size, static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size, struct kfd_dev *kdev, + struct kfd_dev *peer_kdev, struct crat_subtype_iolink *sub_type_hdr, uint32_t proximity_domain_from, uint32_t proximity_domain_to) @@ -1110,6 +1130,8 @@ static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size, sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI; sub_type_hdr->proximity_domain_from = proximity_domain_from; sub_type_hdr->proximity_domain_to = proximity_domain_to; + sub_type_hdr->num_hops_xgmi = + amdgpu_amdkfd_get_xgmi_hops_count(kdev->kgd, peer_kdev->kgd); return 0; } @@ -1287,7 +1309,7 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image, (char *)sub_type_hdr + sizeof(struct crat_subtype_iolink)); ret = kfd_fill_gpu_xgmi_link_to_gpu( - &avail_size, kdev, + &avail_size, kdev, peer_dev->gpu, (struct crat_subtype_iolink *)sub_type_hdr, proximity_domain, nid); if (ret < 0) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h index 7c3f192fe25f..d54ceebd346b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h @@ -274,7 +274,8 @@ struct crat_subtype_iolink { uint32_t minimum_bandwidth_mbs; uint32_t maximum_bandwidth_mbs; uint32_t recommended_transfer_size; - uint8_t reserved2[CRAT_IOLINK_RESERVED_LENGTH]; + uint8_t reserved2[CRAT_IOLINK_RESERVED_LENGTH - 1]; + uint8_t num_hops_xgmi; }; /* diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c index a3441b0e385b..d59f2cd056c6 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c @@ -761,6 +761,7 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) { int status = 0; unsigned int vmid; + uint16_t queried_pasid; union SQ_CMD_BITS reg_sq_cmd; union GRBM_GFX_INDEX_BITS reg_gfx_index; struct kfd_process_device *pdd; @@ -782,19 +783,18 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p) */ for (vmid = first_vmid_to_scan; vmid <= last_vmid_to_scan; vmid++) { - if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_valid - (dev->kgd, vmid)) { - if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_pasid - (dev->kgd, vmid) == p->pasid) { - pr_debug("Killing wave fronts of vmid %d and pasid %d\n", - vmid, p->pasid); - break; - } + status = dev->kfd2kgd->get_atc_vmid_pasid_mapping_info + (dev->kgd, vmid, &queried_pasid); + + if (status && queried_pasid == p->pasid) { + pr_debug("Killing wave fronts of vmid %d and pasid 0x%x\n", + vmid, p->pasid); + break; } } if (vmid > last_vmid_to_scan) { - pr_err("Didn't find vmid for pasid %d\n", p->pasid); + pr_err("Didn't find vmid for pasid 0x%x\n", p->pasid); return -EFAULT; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c index 9d4af961c5d1..9bfa50633654 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c @@ -96,7 +96,7 @@ bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev) long kfd_dbgmgr_register(struct kfd_dbgmgr *pmgr, struct kfd_process *p) { if (pmgr->pasid != 0) { - pr_debug("H/W debugger is already active using pasid %d\n", + pr_debug("H/W debugger is already active using pasid 0x%x\n", pmgr->pasid); return -EBUSY; } @@ -117,7 +117,7 @@ long kfd_dbgmgr_unregister(struct kfd_dbgmgr *pmgr, struct kfd_process *p) { /* Is the requests coming from the already registered process? */ if (pmgr->pasid != p->pasid) { - pr_debug("H/W debugger is not registered by calling pasid %d\n", + pr_debug("H/W debugger is not registered by calling pasid 0x%x\n", p->pasid); return -EINVAL; } @@ -134,7 +134,7 @@ long kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr, { /* Is the requests coming from the already registered process? */ if (pmgr->pasid != wac_info->process->pasid) { - pr_debug("H/W debugger support was not registered for requester pasid %d\n", + pr_debug("H/W debugger support was not registered for requester pasid 0x%x\n", wac_info->process->pasid); return -EINVAL; } @@ -147,7 +147,7 @@ long kfd_dbgmgr_address_watch(struct kfd_dbgmgr *pmgr, { /* Is the requests coming from the already registered process? */ if (pmgr->pasid != adw_info->process->pasid) { - pr_debug("H/W debugger support was not registered for requester pasid %d\n", + pr_debug("H/W debugger support was not registered for requester pasid 0x%x\n", adw_info->process->pasid); return -EINVAL; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c index ab37d36d9cd6..15c523027285 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c @@ -85,36 +85,16 @@ static const struct file_operations kfd_debugfs_hang_hws_fops = { void kfd_debugfs_init(void) { - struct dentry *ent; - debugfs_root = debugfs_create_dir("kfd", NULL); - if (!debugfs_root || debugfs_root == ERR_PTR(-ENODEV)) { - pr_warn("Failed to create kfd debugfs dir\n"); - return; - } - - ent = debugfs_create_file("mqds", S_IFREG | 0444, debugfs_root, - kfd_debugfs_mqds_by_process, - &kfd_debugfs_fops); - if (!ent) - pr_warn("Failed to create mqds in kfd debugfs\n"); - - ent = debugfs_create_file("hqds", S_IFREG | 0444, debugfs_root, - kfd_debugfs_hqds_by_device, - &kfd_debugfs_fops); - if (!ent) - pr_warn("Failed to create hqds in kfd debugfs\n"); - - ent = debugfs_create_file("rls", S_IFREG | 0444, debugfs_root, - kfd_debugfs_rls_by_device, - &kfd_debugfs_fops); - - ent = debugfs_create_file("hang_hws", S_IFREG | 0644, debugfs_root, - NULL, - &kfd_debugfs_hang_hws_fops); - if (!ent) - pr_warn("Failed to create rls in kfd debugfs\n"); + debugfs_create_file("mqds", S_IFREG | 0444, debugfs_root, + kfd_debugfs_mqds_by_process, &kfd_debugfs_fops); + debugfs_create_file("hqds", S_IFREG | 0444, debugfs_root, + kfd_debugfs_hqds_by_device, &kfd_debugfs_fops); + debugfs_create_file("rls", S_IFREG | 0444, debugfs_root, + kfd_debugfs_rls_by_device, &kfd_debugfs_fops); + debugfs_create_file("hang_hws", S_IFREG | 0644, debugfs_root, + NULL, &kfd_debugfs_hang_hws_fops); } void kfd_debugfs_fini(void) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 8be9677c0c07..4fa8834ce7cb 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -39,9 +39,45 @@ */ static atomic_t kfd_locked = ATOMIC_INIT(0); +#ifdef CONFIG_DRM_AMDGPU_CIK +extern const struct kfd2kgd_calls gfx_v7_kfd2kgd; +#endif +extern const struct kfd2kgd_calls gfx_v8_kfd2kgd; +extern const struct kfd2kgd_calls gfx_v9_kfd2kgd; +extern const struct kfd2kgd_calls arcturus_kfd2kgd; +extern const struct kfd2kgd_calls gfx_v10_kfd2kgd; + +static const struct kfd2kgd_calls *kfd2kgd_funcs[] = { +#ifdef KFD_SUPPORT_IOMMU_V2 +#ifdef CONFIG_DRM_AMDGPU_CIK + [CHIP_KAVERI] = &gfx_v7_kfd2kgd, +#endif + [CHIP_CARRIZO] = &gfx_v8_kfd2kgd, + [CHIP_RAVEN] = &gfx_v9_kfd2kgd, +#endif +#ifdef CONFIG_DRM_AMDGPU_CIK + [CHIP_HAWAII] = &gfx_v7_kfd2kgd, +#endif + [CHIP_TONGA] = &gfx_v8_kfd2kgd, + [CHIP_FIJI] = &gfx_v8_kfd2kgd, + [CHIP_POLARIS10] = &gfx_v8_kfd2kgd, + [CHIP_POLARIS11] = &gfx_v8_kfd2kgd, + [CHIP_POLARIS12] = &gfx_v8_kfd2kgd, + [CHIP_VEGAM] = &gfx_v8_kfd2kgd, + [CHIP_VEGA10] = &gfx_v9_kfd2kgd, + [CHIP_VEGA12] = &gfx_v9_kfd2kgd, + [CHIP_VEGA20] = &gfx_v9_kfd2kgd, + [CHIP_RENOIR] = &gfx_v9_kfd2kgd, + [CHIP_ARCTURUS] = &arcturus_kfd2kgd, + [CHIP_NAVI10] = &gfx_v10_kfd2kgd, + [CHIP_NAVI12] = &gfx_v10_kfd2kgd, + [CHIP_NAVI14] = &gfx_v10_kfd2kgd, +}; + #ifdef KFD_SUPPORT_IOMMU_V2 static const struct kfd_device_info kaveri_device_info = { .asic_family = CHIP_KAVERI, + .asic_name = "kaveri", .max_pasid_bits = 16, /* max num of queues for KV.TODO should be a dynamic value */ .max_no_of_hqd = 24, @@ -54,11 +90,13 @@ static const struct kfd_device_info kaveri_device_info = { .needs_iommu_device = true, .needs_pci_atomics = false, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; static const struct kfd_device_info carrizo_device_info = { .asic_family = CHIP_CARRIZO, + .asic_name = "carrizo", .max_pasid_bits = 16, /* max num of queues for CZ.TODO should be a dynamic value */ .max_no_of_hqd = 24, @@ -71,11 +109,13 @@ static const struct kfd_device_info carrizo_device_info = { .needs_iommu_device = true, .needs_pci_atomics = false, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; static const struct kfd_device_info raven_device_info = { .asic_family = CHIP_RAVEN, + .asic_name = "raven", .max_pasid_bits = 16, .max_no_of_hqd = 24, .doorbell_size = 8, @@ -87,12 +127,14 @@ static const struct kfd_device_info raven_device_info = { .needs_iommu_device = true, .needs_pci_atomics = true, .num_sdma_engines = 1, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; #endif static const struct kfd_device_info hawaii_device_info = { .asic_family = CHIP_HAWAII, + .asic_name = "hawaii", .max_pasid_bits = 16, /* max num of queues for KV.TODO should be a dynamic value */ .max_no_of_hqd = 24, @@ -105,11 +147,13 @@ static const struct kfd_device_info hawaii_device_info = { .needs_iommu_device = false, .needs_pci_atomics = false, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; static const struct kfd_device_info tonga_device_info = { .asic_family = CHIP_TONGA, + .asic_name = "tonga", .max_pasid_bits = 16, .max_no_of_hqd = 24, .doorbell_size = 4, @@ -121,11 +165,13 @@ static const struct kfd_device_info tonga_device_info = { .needs_iommu_device = false, .needs_pci_atomics = true, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; static const struct kfd_device_info fiji_device_info = { .asic_family = CHIP_FIJI, + .asic_name = "fiji", .max_pasid_bits = 16, .max_no_of_hqd = 24, .doorbell_size = 4, @@ -137,11 +183,13 @@ static const struct kfd_device_info fiji_device_info = { .needs_iommu_device = false, .needs_pci_atomics = true, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; static const struct kfd_device_info fiji_vf_device_info = { .asic_family = CHIP_FIJI, + .asic_name = "fiji", .max_pasid_bits = 16, .max_no_of_hqd = 24, .doorbell_size = 4, @@ -153,12 +201,14 @@ static const struct kfd_device_info fiji_vf_device_info = { .needs_iommu_device = false, .needs_pci_atomics = false, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; static const struct kfd_device_info polaris10_device_info = { .asic_family = CHIP_POLARIS10, + .asic_name = "polaris10", .max_pasid_bits = 16, .max_no_of_hqd = 24, .doorbell_size = 4, @@ -170,11 +220,13 @@ static const struct kfd_device_info polaris10_device_info = { .needs_iommu_device = false, .needs_pci_atomics = true, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; static const struct kfd_device_info polaris10_vf_device_info = { .asic_family = CHIP_POLARIS10, + .asic_name = "polaris10", .max_pasid_bits = 16, .max_no_of_hqd = 24, .doorbell_size = 4, @@ -186,11 +238,13 @@ static const struct kfd_device_info polaris10_vf_device_info = { .needs_iommu_device = false, .needs_pci_atomics = false, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; static const struct kfd_device_info polaris11_device_info = { .asic_family = CHIP_POLARIS11, + .asic_name = "polaris11", .max_pasid_bits = 16, .max_no_of_hqd = 24, .doorbell_size = 4, @@ -202,11 +256,31 @@ static const struct kfd_device_info polaris11_device_info = { .needs_iommu_device = false, .needs_pci_atomics = true, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; static const struct kfd_device_info polaris12_device_info = { .asic_family = CHIP_POLARIS12, + .asic_name = "polaris12", + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 4, + .ih_ring_entry_size = 4 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_cik, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = true, + .needs_iommu_device = false, + .needs_pci_atomics = true, + .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, + .num_sdma_queues_per_engine = 2, +}; + +static const struct kfd_device_info vegam_device_info = { + .asic_family = CHIP_VEGAM, + .asic_name = "vegam", .max_pasid_bits = 16, .max_no_of_hqd = 24, .doorbell_size = 4, @@ -218,11 +292,13 @@ static const struct kfd_device_info polaris12_device_info = { .needs_iommu_device = false, .needs_pci_atomics = true, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; static const struct kfd_device_info vega10_device_info = { .asic_family = CHIP_VEGA10, + .asic_name = "vega10", .max_pasid_bits = 16, .max_no_of_hqd = 24, .doorbell_size = 8, @@ -234,11 +310,13 @@ static const struct kfd_device_info vega10_device_info = { .needs_iommu_device = false, .needs_pci_atomics = false, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; static const struct kfd_device_info vega10_vf_device_info = { .asic_family = CHIP_VEGA10, + .asic_name = "vega10", .max_pasid_bits = 16, .max_no_of_hqd = 24, .doorbell_size = 8, @@ -250,11 +328,13 @@ static const struct kfd_device_info vega10_vf_device_info = { .needs_iommu_device = false, .needs_pci_atomics = false, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; static const struct kfd_device_info vega12_device_info = { .asic_family = CHIP_VEGA12, + .asic_name = "vega12", .max_pasid_bits = 16, .max_no_of_hqd = 24, .doorbell_size = 8, @@ -266,11 +346,13 @@ static const struct kfd_device_info vega12_device_info = { .needs_iommu_device = false, .needs_pci_atomics = false, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 2, }; static const struct kfd_device_info vega20_device_info = { .asic_family = CHIP_VEGA20, + .asic_name = "vega20", .max_pasid_bits = 16, .max_no_of_hqd = 24, .doorbell_size = 8, @@ -282,122 +364,122 @@ static const struct kfd_device_info vega20_device_info = { .needs_iommu_device = false, .needs_pci_atomics = false, .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, .num_sdma_queues_per_engine = 8, }; -struct kfd_deviceid { - unsigned short did; - const struct kfd_device_info *device_info; +static const struct kfd_device_info arcturus_device_info = { + .asic_family = CHIP_ARCTURUS, + .asic_name = "arcturus", + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 8, + .ih_ring_entry_size = 8 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_v9, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = true, + .needs_iommu_device = false, + .needs_pci_atomics = false, + .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 6, + .num_sdma_queues_per_engine = 8, +}; + +static const struct kfd_device_info renoir_device_info = { + .asic_family = CHIP_RENOIR, + .asic_name = "renoir", + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 8, + .ih_ring_entry_size = 8 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_v9, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = true, + .needs_iommu_device = false, + .needs_pci_atomics = false, + .num_sdma_engines = 1, + .num_xgmi_sdma_engines = 0, + .num_sdma_queues_per_engine = 2, +}; + +static const struct kfd_device_info navi10_device_info = { + .asic_family = CHIP_NAVI10, + .asic_name = "navi10", + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 8, + .ih_ring_entry_size = 8 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_v9, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .needs_iommu_device = false, + .supports_cwsr = true, + .needs_pci_atomics = false, + .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, + .num_sdma_queues_per_engine = 8, }; -static const struct kfd_deviceid supported_devices[] = { +static const struct kfd_device_info navi12_device_info = { + .asic_family = CHIP_NAVI12, + .asic_name = "navi12", + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 8, + .ih_ring_entry_size = 8 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_v9, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .needs_iommu_device = false, + .supports_cwsr = true, + .needs_pci_atomics = false, + .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, + .num_sdma_queues_per_engine = 8, +}; + +static const struct kfd_device_info navi14_device_info = { + .asic_family = CHIP_NAVI14, + .asic_name = "navi14", + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 8, + .ih_ring_entry_size = 8 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_v9, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .needs_iommu_device = false, + .supports_cwsr = true, + .needs_pci_atomics = false, + .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, + .num_sdma_queues_per_engine = 8, +}; + +/* For each entry, [0] is regular and [1] is virtualisation device. */ +static const struct kfd_device_info *kfd_supported_devices[][2] = { #ifdef KFD_SUPPORT_IOMMU_V2 - { 0x1304, &kaveri_device_info }, /* Kaveri */ - { 0x1305, &kaveri_device_info }, /* Kaveri */ - { 0x1306, &kaveri_device_info }, /* Kaveri */ - { 0x1307, &kaveri_device_info }, /* Kaveri */ - { 0x1309, &kaveri_device_info }, /* Kaveri */ - { 0x130A, &kaveri_device_info }, /* Kaveri */ - { 0x130B, &kaveri_device_info }, /* Kaveri */ - { 0x130C, &kaveri_device_info }, /* Kaveri */ - { 0x130D, &kaveri_device_info }, /* Kaveri */ - { 0x130E, &kaveri_device_info }, /* Kaveri */ - { 0x130F, &kaveri_device_info }, /* Kaveri */ - { 0x1310, &kaveri_device_info }, /* Kaveri */ - { 0x1311, &kaveri_device_info }, /* Kaveri */ - { 0x1312, &kaveri_device_info }, /* Kaveri */ - { 0x1313, &kaveri_device_info }, /* Kaveri */ - { 0x1315, &kaveri_device_info }, /* Kaveri */ - { 0x1316, &kaveri_device_info }, /* Kaveri */ - { 0x1317, &kaveri_device_info }, /* Kaveri */ - { 0x1318, &kaveri_device_info }, /* Kaveri */ - { 0x131B, &kaveri_device_info }, /* Kaveri */ - { 0x131C, &kaveri_device_info }, /* Kaveri */ - { 0x131D, &kaveri_device_info }, /* Kaveri */ - { 0x9870, &carrizo_device_info }, /* Carrizo */ - { 0x9874, &carrizo_device_info }, /* Carrizo */ - { 0x9875, &carrizo_device_info }, /* Carrizo */ - { 0x9876, &carrizo_device_info }, /* Carrizo */ - { 0x9877, &carrizo_device_info }, /* Carrizo */ - { 0x15DD, &raven_device_info }, /* Raven */ + [CHIP_KAVERI] = {&kaveri_device_info, NULL}, + [CHIP_CARRIZO] = {&carrizo_device_info, NULL}, + [CHIP_RAVEN] = {&raven_device_info, NULL}, #endif - { 0x67A0, &hawaii_device_info }, /* Hawaii */ - { 0x67A1, &hawaii_device_info }, /* Hawaii */ - { 0x67A2, &hawaii_device_info }, /* Hawaii */ - { 0x67A8, &hawaii_device_info }, /* Hawaii */ - { 0x67A9, &hawaii_device_info }, /* Hawaii */ - { 0x67AA, &hawaii_device_info }, /* Hawaii */ - { 0x67B0, &hawaii_device_info }, /* Hawaii */ - { 0x67B1, &hawaii_device_info }, /* Hawaii */ - { 0x67B8, &hawaii_device_info }, /* Hawaii */ - { 0x67B9, &hawaii_device_info }, /* Hawaii */ - { 0x67BA, &hawaii_device_info }, /* Hawaii */ - { 0x67BE, &hawaii_device_info }, /* Hawaii */ - { 0x6920, &tonga_device_info }, /* Tonga */ - { 0x6921, &tonga_device_info }, /* Tonga */ - { 0x6928, &tonga_device_info }, /* Tonga */ - { 0x6929, &tonga_device_info }, /* Tonga */ - { 0x692B, &tonga_device_info }, /* Tonga */ - { 0x6938, &tonga_device_info }, /* Tonga */ - { 0x6939, &tonga_device_info }, /* Tonga */ - { 0x7300, &fiji_device_info }, /* Fiji */ - { 0x730F, &fiji_vf_device_info }, /* Fiji vf*/ - { 0x67C0, &polaris10_device_info }, /* Polaris10 */ - { 0x67C1, &polaris10_device_info }, /* Polaris10 */ - { 0x67C2, &polaris10_device_info }, /* Polaris10 */ - { 0x67C4, &polaris10_device_info }, /* Polaris10 */ - { 0x67C7, &polaris10_device_info }, /* Polaris10 */ - { 0x67C8, &polaris10_device_info }, /* Polaris10 */ - { 0x67C9, &polaris10_device_info }, /* Polaris10 */ - { 0x67CA, &polaris10_device_info }, /* Polaris10 */ - { 0x67CC, &polaris10_device_info }, /* Polaris10 */ - { 0x67CF, &polaris10_device_info }, /* Polaris10 */ - { 0x67D0, &polaris10_vf_device_info }, /* Polaris10 vf*/ - { 0x67DF, &polaris10_device_info }, /* Polaris10 */ - { 0x67E0, &polaris11_device_info }, /* Polaris11 */ - { 0x67E1, &polaris11_device_info }, /* Polaris11 */ - { 0x67E3, &polaris11_device_info }, /* Polaris11 */ - { 0x67E7, &polaris11_device_info }, /* Polaris11 */ - { 0x67E8, &polaris11_device_info }, /* Polaris11 */ - { 0x67E9, &polaris11_device_info }, /* Polaris11 */ - { 0x67EB, &polaris11_device_info }, /* Polaris11 */ - { 0x67EF, &polaris11_device_info }, /* Polaris11 */ - { 0x67FF, &polaris11_device_info }, /* Polaris11 */ - { 0x6980, &polaris12_device_info }, /* Polaris12 */ - { 0x6981, &polaris12_device_info }, /* Polaris12 */ - { 0x6985, &polaris12_device_info }, /* Polaris12 */ - { 0x6986, &polaris12_device_info }, /* Polaris12 */ - { 0x6987, &polaris12_device_info }, /* Polaris12 */ - { 0x6995, &polaris12_device_info }, /* Polaris12 */ - { 0x6997, &polaris12_device_info }, /* Polaris12 */ - { 0x699F, &polaris12_device_info }, /* Polaris12 */ - { 0x6860, &vega10_device_info }, /* Vega10 */ - { 0x6861, &vega10_device_info }, /* Vega10 */ - { 0x6862, &vega10_device_info }, /* Vega10 */ - { 0x6863, &vega10_device_info }, /* Vega10 */ - { 0x6864, &vega10_device_info }, /* Vega10 */ - { 0x6867, &vega10_device_info }, /* Vega10 */ - { 0x6868, &vega10_device_info }, /* Vega10 */ - { 0x6869, &vega10_device_info }, /* Vega10 */ - { 0x686A, &vega10_device_info }, /* Vega10 */ - { 0x686B, &vega10_device_info }, /* Vega10 */ - { 0x686C, &vega10_vf_device_info }, /* Vega10 vf*/ - { 0x686D, &vega10_device_info }, /* Vega10 */ - { 0x686E, &vega10_device_info }, /* Vega10 */ - { 0x686F, &vega10_device_info }, /* Vega10 */ - { 0x687F, &vega10_device_info }, /* Vega10 */ - { 0x69A0, &vega12_device_info }, /* Vega12 */ - { 0x69A1, &vega12_device_info }, /* Vega12 */ - { 0x69A2, &vega12_device_info }, /* Vega12 */ - { 0x69A3, &vega12_device_info }, /* Vega12 */ - { 0x69AF, &vega12_device_info }, /* Vega12 */ - { 0x66a0, &vega20_device_info }, /* Vega20 */ - { 0x66a1, &vega20_device_info }, /* Vega20 */ - { 0x66a2, &vega20_device_info }, /* Vega20 */ - { 0x66a3, &vega20_device_info }, /* Vega20 */ - { 0x66a4, &vega20_device_info }, /* Vega20 */ - { 0x66a7, &vega20_device_info }, /* Vega20 */ - { 0x66af, &vega20_device_info } /* Vega20 */ + [CHIP_HAWAII] = {&hawaii_device_info, NULL}, + [CHIP_TONGA] = {&tonga_device_info, NULL}, + [CHIP_FIJI] = {&fiji_device_info, &fiji_vf_device_info}, + [CHIP_POLARIS10] = {&polaris10_device_info, &polaris10_vf_device_info}, + [CHIP_POLARIS11] = {&polaris11_device_info, NULL}, + [CHIP_POLARIS12] = {&polaris12_device_info, NULL}, + [CHIP_VEGAM] = {&vegam_device_info, NULL}, + [CHIP_VEGA10] = {&vega10_device_info, &vega10_vf_device_info}, + [CHIP_VEGA12] = {&vega12_device_info, NULL}, + [CHIP_VEGA20] = {&vega20_device_info, NULL}, + [CHIP_RENOIR] = {&renoir_device_info, NULL}, + [CHIP_ARCTURUS] = {&arcturus_device_info, &arcturus_device_info}, + [CHIP_NAVI10] = {&navi10_device_info, NULL}, + [CHIP_NAVI12] = {&navi12_device_info, &navi12_device_info}, + [CHIP_NAVI14] = {&navi14_device_info, NULL}, }; static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, @@ -406,33 +488,25 @@ static void kfd_gtt_sa_fini(struct kfd_dev *kfd); static int kfd_resume(struct kfd_dev *kfd); -static const struct kfd_device_info *lookup_device_info(unsigned short did) +struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, + struct pci_dev *pdev, unsigned int asic_type, bool vf) { - size_t i; + struct kfd_dev *kfd; + const struct kfd_device_info *device_info; + const struct kfd2kgd_calls *f2g; - for (i = 0; i < ARRAY_SIZE(supported_devices); i++) { - if (supported_devices[i].did == did) { - WARN_ON(!supported_devices[i].device_info); - return supported_devices[i].device_info; - } + if (asic_type >= sizeof(kfd_supported_devices) / (sizeof(void *) * 2) + || asic_type >= sizeof(kfd2kgd_funcs) / sizeof(void *)) { + dev_err(kfd_device, "asic_type %d out of range\n", asic_type); + return NULL; /* asic_type out of range */ } - dev_warn(kfd_device, "DID %04x is missing in supported_devices\n", - did); - - return NULL; -} - -struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, - struct pci_dev *pdev, const struct kfd2kgd_calls *f2g) -{ - struct kfd_dev *kfd; - int ret; - const struct kfd_device_info *device_info = - lookup_device_info(pdev->device); + device_info = kfd_supported_devices[asic_type][vf]; + f2g = kfd2kgd_funcs[asic_type]; - if (!device_info) { - dev_err(kfd_device, "kgd2kfd_probe failed\n"); + if (!device_info || !f2g) { + dev_err(kfd_device, "%s %s not supported in kfd\n", + amdgpu_asic_name[asic_type], vf ? "VF" : ""); return NULL; } @@ -444,28 +518,29 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, * 32 and 64-bit requests are possible and must be * supported. */ - ret = pci_enable_atomic_ops_to_root(pdev, - PCI_EXP_DEVCAP2_ATOMIC_COMP32 | - PCI_EXP_DEVCAP2_ATOMIC_COMP64); - if (device_info->needs_pci_atomics && ret < 0) { + kfd->pci_atomic_requested = amdgpu_amdkfd_have_atomics_support(kgd); + if (device_info->needs_pci_atomics && + !kfd->pci_atomic_requested) { dev_info(kfd_device, "skipped device %x:%x, PCI rejects atomics\n", pdev->vendor, pdev->device); kfree(kfd); return NULL; - } else if (!ret) - kfd->pci_atomic_requested = true; + } kfd->kgd = kgd; kfd->device_info = device_info; kfd->pdev = pdev; kfd->init_complete = false; kfd->kfd2kgd = f2g; + atomic_set(&kfd->compute_profile, 0); mutex_init(&kfd->doorbell_mutex); memset(&kfd->doorbell_available_index, 0, sizeof(kfd->doorbell_available_index)); + atomic_set(&kfd->sram_ecc_flag, 0); + return kfd; } @@ -476,10 +551,18 @@ static void kfd_cwsr_init(struct kfd_dev *kfd) BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE); kfd->cwsr_isa = cwsr_trap_gfx8_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex); - } else { + } else if (kfd->device_info->asic_family == CHIP_ARCTURUS) { + BUILD_BUG_ON(sizeof(cwsr_trap_arcturus_hex) > PAGE_SIZE); + kfd->cwsr_isa = cwsr_trap_arcturus_hex; + kfd->cwsr_isa_size = sizeof(cwsr_trap_arcturus_hex); + } else if (kfd->device_info->asic_family < CHIP_NAVI10) { BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_hex) > PAGE_SIZE); kfd->cwsr_isa = cwsr_trap_gfx9_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx9_hex); + } else { + BUILD_BUG_ON(sizeof(cwsr_trap_gfx10_hex) > PAGE_SIZE); + kfd->cwsr_isa = cwsr_trap_gfx10_hex; + kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx10_hex); } kfd->cwsr_enabled = true; @@ -487,13 +570,15 @@ static void kfd_cwsr_init(struct kfd_dev *kfd) } bool kgd2kfd_device_init(struct kfd_dev *kfd, + struct drm_device *ddev, const struct kgd2kfd_shared_resources *gpu_resources) { unsigned int size; - kfd->mec_fw_version = kfd->kfd2kgd->get_fw_version(kfd->kgd, + kfd->ddev = ddev; + kfd->mec_fw_version = amdgpu_amdkfd_get_fw_version(kfd->kgd, KGD_ENGINE_MEC1); - kfd->sdma_fw_version = kfd->kfd2kgd->get_fw_version(kfd->kgd, + kfd->sdma_fw_version = amdgpu_amdkfd_get_fw_version(kfd->kgd, KGD_ENGINE_SDMA1); kfd->shared_resources = *gpu_resources; @@ -513,6 +598,13 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, } else kfd->max_proc_per_quantum = hws_max_conc_proc; + /* Allocate global GWS that is shared by all KFD processes */ + if (hws_gws_support && amdgpu_amdkfd_alloc_gws(kfd->kgd, + amdgpu_amdkfd_get_num_gws(kfd->kgd), &kfd->gws)) { + dev_err(kfd_device, "Could not allocate %d gws\n", + amdgpu_amdkfd_get_num_gws(kfd->kgd)); + goto out; + } /* calculate max size of mqds needed for queues */ size = max_num_of_queues_per_device * kfd->device_info->mqd_size_aligned; @@ -536,7 +628,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, &kfd->gtt_start_gpu_addr, &kfd->gtt_start_cpu_ptr, false)) { dev_err(kfd_device, "Could not allocate %d bytes\n", size); - goto out; + goto alloc_gtt_mem_failure; } dev_info(kfd_device, "Allocated %d bytes on gart\n", size); @@ -556,11 +648,6 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, if (kfd->kfd2kgd->get_hive_id) kfd->hive_id = kfd->kfd2kgd->get_hive_id(kfd->kgd); - if (kfd_topology_add_device(kfd)) { - dev_err(kfd_device, "Error adding device to topology\n"); - goto kfd_topology_add_device_error; - } - if (kfd_interrupt_init(kfd)) { dev_err(kfd_device, "Error initializing interrupts\n"); goto kfd_interrupt_error; @@ -584,6 +671,11 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, kfd->dbgmgr = NULL; + if (kfd_topology_add_device(kfd)) { + dev_err(kfd_device, "Error adding device to topology\n"); + goto kfd_topology_add_device_error; + } + kfd->init_complete = true; dev_info(kfd_device, "added device %x:%x\n", kfd->pdev->vendor, kfd->pdev->device); @@ -593,19 +685,21 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, goto out; +kfd_topology_add_device_error: kfd_resume_error: device_iommu_error: device_queue_manager_uninit(kfd->dqm); device_queue_manager_error: kfd_interrupt_exit(kfd); kfd_interrupt_error: - kfd_topology_remove_device(kfd); -kfd_topology_add_device_error: kfd_doorbell_fini(kfd); kfd_doorbell_error: kfd_gtt_sa_fini(kfd); kfd_gtt_sa_init_error: amdgpu_amdkfd_free_gtt_mem(kfd->kgd, kfd->gtt_mem); +alloc_gtt_mem_failure: + if (hws_gws_support) + amdgpu_amdkfd_free_gws(kfd->kgd, kfd->gws); dev_err(kfd_device, "device %x:%x NOT added due to errors\n", kfd->pdev->vendor, kfd->pdev->device); @@ -623,6 +717,8 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd) kfd_doorbell_fini(kfd); kfd_gtt_sa_fini(kfd); amdgpu_amdkfd_free_gtt_mem(kfd->kgd, kfd->gtt_mem); + if (hws_gws_support) + amdgpu_amdkfd_free_gws(kfd->kgd, kfd->gws); } kfree(kfd); @@ -634,9 +730,6 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd) return 0; kgd2kfd_suspend(kfd); - /* hold dqm->lock to prevent further execution*/ - dqm_lock(kfd->dqm); - kfd_signal_reset_event(kfd); return 0; } @@ -654,13 +747,13 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd) if (!kfd->init_complete) return 0; - dqm_unlock(kfd->dqm); - ret = kfd_resume(kfd); if (ret) return ret; count = atomic_dec_return(&kfd_locked); - WARN_ONCE(count != 0, "KFD reset ref. error"); + + atomic_set(&kfd->sram_ecc_flag, 0); + return 0; } @@ -1024,6 +1117,27 @@ int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj) return 0; } +void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd) +{ + if (kfd) + atomic_inc(&kfd->sram_ecc_flag); +} + +void kfd_inc_compute_active(struct kfd_dev *kfd) +{ + if (atomic_inc_return(&kfd->compute_profile) == 1) + amdgpu_amdkfd_set_compute_idle(kfd->kgd, false); +} + +void kfd_dec_compute_active(struct kfd_dev *kfd) +{ + int count = atomic_dec_return(&kfd->compute_profile); + + if (count == 0) + amdgpu_amdkfd_set_compute_idle(kfd->kgd, true); + WARN_ONCE(count < 0, "Compute profile ref. count error"); +} + #if defined(CONFIG_DEBUG_FS) /* This function will send a package to HIQ to hang the HWS diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index c6c9530e704e..984c2f2b24b6 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -42,10 +42,6 @@ static int set_pasid_vmid_mapping(struct device_queue_manager *dqm, unsigned int pasid, unsigned int vmid); -static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, - struct queue *q, - struct qcm_process_device *qpd); - static int execute_queues_cpsch(struct device_queue_manager *dqm, enum kfd_unmap_queues_filter filter, uint32_t filter_param); @@ -55,19 +51,20 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, static int map_queues_cpsch(struct device_queue_manager *dqm); -static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, - struct queue *q, - struct qcm_process_device *qpd); - static void deallocate_sdma_queue(struct device_queue_manager *dqm, - unsigned int sdma_queue_id); + struct queue *q); +static inline void deallocate_hqd(struct device_queue_manager *dqm, + struct queue *q); +static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q); +static int allocate_sdma_queue(struct device_queue_manager *dqm, + struct queue *q); static void kfd_process_hw_exception(struct work_struct *work); static inline enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type) { - if (type == KFD_QUEUE_TYPE_SDMA) + if (type == KFD_QUEUE_TYPE_SDMA || type == KFD_QUEUE_TYPE_SDMA_XGMI) return KFD_MQD_TYPE_SDMA; return KFD_MQD_TYPE_CP; } @@ -107,12 +104,23 @@ static unsigned int get_num_sdma_engines(struct device_queue_manager *dqm) return dqm->dev->device_info->num_sdma_engines; } +static unsigned int get_num_xgmi_sdma_engines(struct device_queue_manager *dqm) +{ + return dqm->dev->device_info->num_xgmi_sdma_engines; +} + unsigned int get_num_sdma_queues(struct device_queue_manager *dqm) { return dqm->dev->device_info->num_sdma_engines * dqm->dev->device_info->num_sdma_queues_per_engine; } +unsigned int get_num_xgmi_sdma_queues(struct device_queue_manager *dqm) +{ + return dqm->dev->device_info->num_xgmi_sdma_engines + * dqm->dev->device_info->num_sdma_queues_per_engine; +} + void program_sh_mem_settings(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { @@ -133,7 +141,8 @@ static int allocate_doorbell(struct qcm_process_device *qpd, struct queue *q) * preserve the user mode ABI. */ q->doorbell_id = q->properties.queue_id; - } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA || + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { /* For SDMA queues on SOC15 with 8-byte doorbell, use static * doorbell assignments based on the engine and queue id. * The doobell index distance between RLC (2*i) and (2*i+1) @@ -174,7 +183,8 @@ static void deallocate_doorbell(struct qcm_process_device *qpd, struct kfd_dev *dev = qpd->dqm->dev; if (!KFD_IS_SOC15(dev->device_info->asic_family) || - q->properties.type == KFD_QUEUE_TYPE_SDMA) + q->properties.type == KFD_QUEUE_TYPE_SDMA || + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) return; old = test_and_clear_bit(q->doorbell_id, qpd->doorbell_bitmap); @@ -185,20 +195,30 @@ static int allocate_vmid(struct device_queue_manager *dqm, struct qcm_process_device *qpd, struct queue *q) { - int bit, allocated_vmid; + int allocated_vmid = -1, i; - if (dqm->vmid_bitmap == 0) - return -ENOMEM; + for (i = dqm->dev->vm_info.first_vmid_kfd; + i <= dqm->dev->vm_info.last_vmid_kfd; i++) { + if (!dqm->vmid_pasid[i]) { + allocated_vmid = i; + break; + } + } + + if (allocated_vmid < 0) { + pr_err("no more vmid to allocate\n"); + return -ENOSPC; + } + + pr_debug("vmid allocated: %d\n", allocated_vmid); - bit = ffs(dqm->vmid_bitmap) - 1; - dqm->vmid_bitmap &= ~(1 << bit); + dqm->vmid_pasid[allocated_vmid] = q->process->pasid; + + set_pasid_vmid_mapping(dqm, q->process->pasid, allocated_vmid); - allocated_vmid = bit + dqm->dev->vm_info.first_vmid_kfd; - pr_debug("vmid allocation %d\n", allocated_vmid); qpd->vmid = allocated_vmid; q->properties.vmid = allocated_vmid; - set_pasid_vmid_mapping(dqm, q->process->pasid, q->properties.vmid); program_sh_mem_settings(dqm, qpd); /* qpd->page_table_base is set earlier when register_process() @@ -210,6 +230,10 @@ static int allocate_vmid(struct device_queue_manager *dqm, /* invalidate the VM context after pasid and vmid mapping is set up */ kfd_flush_tlb(qpd_to_pdd(qpd)); + if (dqm->dev->kfd2kgd->set_scratch_backing_va) + dqm->dev->kfd2kgd->set_scratch_backing_va(dqm->dev->kgd, + qpd->sh_hidden_private_base, qpd->vmid); + return 0; } @@ -235,8 +259,6 @@ static void deallocate_vmid(struct device_queue_manager *dqm, struct qcm_process_device *qpd, struct queue *q) { - int bit = qpd->vmid - dqm->dev->vm_info.first_vmid_kfd; - /* On GFX v7, CP doesn't flush TC at dequeue */ if (q->device->device_info->asic_family == CHIP_HAWAII) if (flush_texture_cache_nocpsch(q->device, qpd)) @@ -246,8 +268,8 @@ static void deallocate_vmid(struct device_queue_manager *dqm, /* Release the vmid mapping */ set_pasid_vmid_mapping(dqm, 0, qpd->vmid); + dqm->vmid_pasid[qpd->vmid] = 0; - dqm->vmid_bitmap |= (1 << bit); qpd->vmid = 0; q->properties.vmid = 0; } @@ -256,6 +278,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd) { + struct mqd_manager *mqd_mgr; int retval; print_queue(q); @@ -276,30 +299,63 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, } q->properties.vmid = qpd->vmid; /* - * Eviction state logic: we only mark active queues as evicted - * to avoid the overhead of restoring inactive queues later + * Eviction state logic: mark all queues as evicted, even ones + * not currently active. Restoring inactive queues later only + * updates the is_evicted flag but is a no-op otherwise. */ - if (qpd->evicted) - q->properties.is_evicted = (q->properties.queue_size > 0 && - q->properties.queue_percent > 0 && - q->properties.queue_address != 0); + q->properties.is_evicted = !!qpd->evicted; q->properties.tba_addr = qpd->tba_addr; q->properties.tma_addr = qpd->tma_addr; - if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) - retval = create_compute_queue_nocpsch(dqm, q, qpd); - else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) - retval = create_sdma_queue_nocpsch(dqm, q, qpd); - else - retval = -EINVAL; + mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type( + q->properties.type)]; + if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) { + retval = allocate_hqd(dqm, q); + if (retval) + goto deallocate_vmid; + pr_debug("Loading mqd to hqd on pipe %d, queue %d\n", + q->pipe, q->queue); + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA || + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { + retval = allocate_sdma_queue(dqm, q); + if (retval) + goto deallocate_vmid; + dqm->asic_ops.init_sdma_vm(dqm, q, qpd); + } - if (retval) { - if (list_empty(&qpd->queues_list)) - deallocate_vmid(dqm, qpd, q); - goto out_unlock; + retval = allocate_doorbell(qpd, q); + if (retval) + goto out_deallocate_hqd; + + /* Temporarily release dqm lock to avoid a circular lock dependency */ + dqm_unlock(dqm); + q->mqd_mem_obj = mqd_mgr->allocate_mqd(mqd_mgr->dev, &q->properties); + dqm_lock(dqm); + + if (!q->mqd_mem_obj) { + retval = -ENOMEM; + goto out_deallocate_doorbell; } + mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj, + &q->gart_mqd_addr, &q->properties); + if (q->properties.is_active) { + if (!dqm->sched_running) { + WARN_ONCE(1, "Load non-HWS mqd while stopped\n"); + goto add_queue_to_list; + } + if (WARN(q->process->mm != current->mm, + "should only run in user thread")) + retval = -EFAULT; + else + retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, + q->queue, &q->properties, current->mm); + if (retval) + goto out_free_mqd; + } + +add_queue_to_list: list_add(&q->list, &qpd->queues_list); qpd->queue_count++; if (q->properties.is_active) @@ -307,6 +363,8 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, if (q->properties.type == KFD_QUEUE_TYPE_SDMA) dqm->sdma_queue_count++; + else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) + dqm->xgmi_sdma_queue_count++; /* * Unconditionally increment this counter, regardless of the queue's @@ -315,7 +373,21 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, dqm->total_queue_count++; pr_debug("Total of %d queues are accountable so far\n", dqm->total_queue_count); + goto out_unlock; +out_free_mqd: + mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); +out_deallocate_doorbell: + deallocate_doorbell(qpd, q); +out_deallocate_hqd: + if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) + deallocate_hqd(dqm, q); + else if (q->properties.type == KFD_QUEUE_TYPE_SDMA || + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) + deallocate_sdma_queue(dqm, q); +deallocate_vmid: + if (list_empty(&qpd->queues_list)) + deallocate_vmid(dqm, qpd, q); out_unlock: dqm_unlock(dqm); return retval; @@ -361,60 +433,6 @@ static inline void deallocate_hqd(struct device_queue_manager *dqm, dqm->allocated_queues[q->pipe] |= (1 << q->queue); } -static int create_compute_queue_nocpsch(struct device_queue_manager *dqm, - struct queue *q, - struct qcm_process_device *qpd) -{ - struct mqd_manager *mqd_mgr; - int retval; - - mqd_mgr = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE); - if (!mqd_mgr) - return -ENOMEM; - - retval = allocate_hqd(dqm, q); - if (retval) - return retval; - - retval = allocate_doorbell(qpd, q); - if (retval) - goto out_deallocate_hqd; - - retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj, - &q->gart_mqd_addr, &q->properties); - if (retval) - goto out_deallocate_doorbell; - - pr_debug("Loading mqd to hqd on pipe %d, queue %d\n", - q->pipe, q->queue); - - dqm->dev->kfd2kgd->set_scratch_backing_va( - dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid); - - if (!q->properties.is_active) - return 0; - - if (WARN(q->process->mm != current->mm, - "should only run in user thread")) - retval = -EFAULT; - else - retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, q->queue, - &q->properties, current->mm); - if (retval) - goto out_uninit_mqd; - - return 0; - -out_uninit_mqd: - mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); -out_deallocate_doorbell: - deallocate_doorbell(qpd, q); -out_deallocate_hqd: - deallocate_hqd(dqm, q); - - return retval; -} - /* Access to DQM has to be locked before calling destroy_queue_nocpsch_locked * to avoid asynchronized access */ @@ -425,16 +443,17 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm, int retval; struct mqd_manager *mqd_mgr; - mqd_mgr = dqm->ops.get_mqd_manager(dqm, - get_mqd_type_from_queue_type(q->properties.type)); - if (!mqd_mgr) - return -ENOMEM; + mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type( + q->properties.type)]; if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) { deallocate_hqd(dqm, q); } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { dqm->sdma_queue_count--; - deallocate_sdma_queue(dqm, q->sdma_id); + deallocate_sdma_queue(dqm, q); + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { + dqm->xgmi_sdma_queue_count--; + deallocate_sdma_queue(dqm, q); } else { pr_debug("q->properties.type %d is invalid\n", q->properties.type); @@ -444,6 +463,11 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm, deallocate_doorbell(qpd, q); + if (!dqm->sched_running) { + WARN_ONCE(1, "Destroy non-HWS queue while stopped\n"); + return 0; + } + retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd, KFD_PREEMPT_TYPE_WAVEFRONT_RESET, KFD_UNMAP_LATENCY_MS, @@ -451,7 +475,7 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm, if (retval == -ETIME) qpd->reset_wavefronts = true; - mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); + mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); list_del(&q->list); if (list_empty(&qpd->queues_list)) { @@ -490,7 +514,7 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm, static int update_queue(struct device_queue_manager *dqm, struct queue *q) { - int retval; + int retval = 0; struct mqd_manager *mqd_mgr; struct kfd_process_device *pdd; bool prev_active = false; @@ -501,20 +525,8 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) retval = -ENODEV; goto out_unlock; } - mqd_mgr = dqm->ops.get_mqd_manager(dqm, - get_mqd_type_from_queue_type(q->properties.type)); - if (!mqd_mgr) { - retval = -ENOMEM; - goto out_unlock; - } - /* - * Eviction state logic: we only mark active queues as evicted - * to avoid the overhead of restoring inactive queues later - */ - if (pdd->qpd.evicted) - q->properties.is_evicted = (q->properties.queue_size > 0 && - q->properties.queue_percent > 0 && - q->properties.queue_address != 0); + mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type( + q->properties.type)]; /* Save previous activity state for counters */ prev_active = q->properties.is_active; @@ -529,7 +541,14 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) } } else if (prev_active && (q->properties.type == KFD_QUEUE_TYPE_COMPUTE || - q->properties.type == KFD_QUEUE_TYPE_SDMA)) { + q->properties.type == KFD_QUEUE_TYPE_SDMA || + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) { + + if (!dqm->sched_running) { + WARN_ONCE(1, "Update non-HWS queue while stopped\n"); + goto out_unlock; + } + retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd, KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN, KFD_UNMAP_LATENCY_MS, q->pipe, q->queue); @@ -539,7 +558,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) } } - retval = mqd_mgr->update_mqd(mqd_mgr, q->mqd, &q->properties); + mqd_mgr->update_mqd(mqd_mgr, q->mqd, &q->properties); /* * check active state vs. the previous state and modify @@ -556,7 +575,8 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q) retval = map_queues_cpsch(dqm); else if (q->properties.is_active && (q->properties.type == KFD_QUEUE_TYPE_COMPUTE || - q->properties.type == KFD_QUEUE_TYPE_SDMA)) { + q->properties.type == KFD_QUEUE_TYPE_SDMA || + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) { if (WARN(q->process->mm != current->mm, "should only run in user thread")) retval = -EFAULT; @@ -571,67 +591,51 @@ out_unlock: return retval; } -static struct mqd_manager *get_mqd_manager( - struct device_queue_manager *dqm, enum KFD_MQD_TYPE type) -{ - struct mqd_manager *mqd_mgr; - - if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) - return NULL; - - pr_debug("mqd type %d\n", type); - - mqd_mgr = dqm->mqd_mgrs[type]; - if (!mqd_mgr) { - mqd_mgr = mqd_manager_init(type, dqm->dev); - if (!mqd_mgr) - pr_err("mqd manager is NULL"); - dqm->mqd_mgrs[type] = mqd_mgr; - } - - return mqd_mgr; -} - static int evict_process_queues_nocpsch(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { struct queue *q; struct mqd_manager *mqd_mgr; struct kfd_process_device *pdd; - int retval = 0; + int retval, ret = 0; dqm_lock(dqm); if (qpd->evicted++ > 0) /* already evicted, do nothing */ goto out; pdd = qpd_to_pdd(qpd); - pr_info_ratelimited("Evicting PASID %u queues\n", + pr_info_ratelimited("Evicting PASID 0x%x queues\n", pdd->process->pasid); - /* unactivate all active queues on the qpd */ + /* Mark all queues as evicted. Deactivate all active queues on + * the qpd. + */ list_for_each_entry(q, &qpd->queues_list, list) { + q->properties.is_evicted = true; if (!q->properties.is_active) continue; - mqd_mgr = dqm->ops.get_mqd_manager(dqm, - get_mqd_type_from_queue_type(q->properties.type)); - if (!mqd_mgr) { /* should not be here */ - pr_err("Cannot evict queue, mqd mgr is NULL\n"); - retval = -ENOMEM; - goto out; - } - q->properties.is_evicted = true; + + mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type( + q->properties.type)]; q->properties.is_active = false; + dqm->queue_count--; + + if (WARN_ONCE(!dqm->sched_running, "Evict when stopped\n")) + continue; + retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd, KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN, KFD_UNMAP_LATENCY_MS, q->pipe, q->queue); - if (retval) - goto out; - dqm->queue_count--; + if (retval && !ret) + /* Return the first error, but keep going to + * maintain a consistent eviction state + */ + ret = retval; } out: dqm_unlock(dqm); - return retval; + return ret; } static int evict_process_queues_cpsch(struct device_queue_manager *dqm, @@ -646,14 +650,17 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm, goto out; pdd = qpd_to_pdd(qpd); - pr_info_ratelimited("Evicting PASID %u queues\n", + pr_info_ratelimited("Evicting PASID 0x%x queues\n", pdd->process->pasid); - /* unactivate all active queues on the qpd */ + /* Mark all queues as evicted. Deactivate all active queues on + * the qpd. + */ list_for_each_entry(q, &qpd->queues_list, list) { + q->properties.is_evicted = true; if (!q->properties.is_active) continue; - q->properties.is_evicted = true; + q->properties.is_active = false; dqm->queue_count--; } @@ -675,7 +682,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm, struct mqd_manager *mqd_mgr; struct kfd_process_device *pdd; uint64_t pd_base; - int retval = 0; + int retval, ret = 0; pdd = qpd_to_pdd(qpd); /* Retrieve PD base */ @@ -689,7 +696,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm, goto out; } - pr_info_ratelimited("Restoring PASID %u queues\n", + pr_info_ratelimited("Restoring PASID 0x%x queues\n", pdd->process->pasid); /* Update PD Base in QPD */ @@ -709,35 +716,40 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm, */ mm = get_task_mm(pdd->process->lead_thread); if (!mm) { - retval = -EFAULT; + ret = -EFAULT; goto out; } - /* activate all active queues on the qpd */ + /* Remove the eviction flags. Activate queues that are not + * inactive for other reasons. + */ list_for_each_entry(q, &qpd->queues_list, list) { - if (!q->properties.is_evicted) - continue; - mqd_mgr = dqm->ops.get_mqd_manager(dqm, - get_mqd_type_from_queue_type(q->properties.type)); - if (!mqd_mgr) { /* should not be here */ - pr_err("Cannot restore queue, mqd mgr is NULL\n"); - retval = -ENOMEM; - goto out; - } q->properties.is_evicted = false; + if (!QUEUE_IS_ACTIVE(q->properties)) + continue; + + mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type( + q->properties.type)]; q->properties.is_active = true; + dqm->queue_count++; + + if (WARN_ONCE(!dqm->sched_running, "Restore when stopped\n")) + continue; + retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, q->queue, &q->properties, mm); - if (retval) - goto out; - dqm->queue_count++; + if (retval && !ret) + /* Return the first error, but keep going to + * maintain a consistent eviction state + */ + ret = retval; } qpd->evicted = 0; out: if (mm) mmput(mm); dqm_unlock(dqm); - return retval; + return ret; } static int restore_process_queues_cpsch(struct device_queue_manager *dqm, @@ -760,7 +772,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm, goto out; } - pr_info_ratelimited("Restoring PASID %u queues\n", + pr_info_ratelimited("Restoring PASID 0x%x queues\n", pdd->process->pasid); /* Update PD Base in QPD */ @@ -769,16 +781,16 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm, /* activate all active queues on the qpd */ list_for_each_entry(q, &qpd->queues_list, list) { - if (!q->properties.is_evicted) - continue; q->properties.is_evicted = false; + if (!QUEUE_IS_ACTIVE(q->properties)) + continue; + q->properties.is_active = true; dqm->queue_count++; } retval = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); - if (!retval) - qpd->evicted = 0; + qpd->evicted = 0; out: dqm_unlock(dqm); return retval; @@ -811,11 +823,15 @@ static int register_process(struct device_queue_manager *dqm, retval = dqm->asic_ops.update_qpd(dqm, qpd); - if (dqm->processes_count++ == 0) - amdgpu_amdkfd_set_compute_idle(dqm->dev->kgd, false); + dqm->processes_count++; dqm_unlock(dqm); + /* Outside the DQM lock because under the DQM lock we can't do + * reclaim or take other locks that others hold while reclaiming. + */ + kfd_inc_compute_active(dqm->dev); + return retval; } @@ -835,9 +851,7 @@ static int unregister_process(struct device_queue_manager *dqm, if (qpd == cur->qpd) { list_del(&cur->list); kfree(cur); - if (--dqm->processes_count == 0) - amdgpu_amdkfd_set_compute_idle( - dqm->dev->kgd, true); + dqm->processes_count--; goto out; } } @@ -845,6 +859,13 @@ static int unregister_process(struct device_queue_manager *dqm, retval = 1; out: dqm_unlock(dqm); + + /* Outside the DQM lock because under the DQM lock we can't do + * reclaim or take other locks that others hold while reclaiming. + */ + if (!retval) + kfd_dec_compute_active(dqm->dev); + return retval; } @@ -880,6 +901,7 @@ static int initialize_nocpsch(struct device_queue_manager *dqm) INIT_LIST_HEAD(&dqm->queues); dqm->queue_count = dqm->next_pipe_to_allocate = 0; dqm->sdma_queue_count = 0; + dqm->xgmi_sdma_queue_count = 0; for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) { int pipe_offset = pipe * get_queues_per_pipe(dqm); @@ -890,8 +912,10 @@ static int initialize_nocpsch(struct device_queue_manager *dqm) dqm->allocated_queues[pipe] |= 1 << queue; } - dqm->vmid_bitmap = (1 << dqm->dev->vm_info.vmid_num_kfd) - 1; - dqm->sdma_bitmap = (1 << get_num_sdma_queues(dqm)) - 1; + memset(dqm->vmid_pasid, 0, sizeof(dqm->vmid_pasid)); + + dqm->sdma_bitmap = ~0ULL >> (64 - get_num_sdma_queues(dqm)); + dqm->xgmi_sdma_bitmap = ~0ULL >> (64 - get_num_xgmi_sdma_queues(dqm)); return 0; } @@ -912,85 +936,74 @@ static void uninitialize(struct device_queue_manager *dqm) static int start_nocpsch(struct device_queue_manager *dqm) { init_interrupts(dqm); - return pm_init(&dqm->packets, dqm); + + if (dqm->dev->device_info->asic_family == CHIP_HAWAII) + return pm_init(&dqm->packets, dqm); + dqm->sched_running = true; + + return 0; } static int stop_nocpsch(struct device_queue_manager *dqm) { - pm_uninit(&dqm->packets); + if (dqm->dev->device_info->asic_family == CHIP_HAWAII) + pm_uninit(&dqm->packets); + dqm->sched_running = false; + return 0; } static int allocate_sdma_queue(struct device_queue_manager *dqm, - unsigned int *sdma_queue_id) + struct queue *q) { int bit; - if (dqm->sdma_bitmap == 0) - return -ENOMEM; + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + if (dqm->sdma_bitmap == 0) + return -ENOMEM; + bit = __ffs64(dqm->sdma_bitmap); + dqm->sdma_bitmap &= ~(1ULL << bit); + q->sdma_id = bit; + q->properties.sdma_engine_id = q->sdma_id % + get_num_sdma_engines(dqm); + q->properties.sdma_queue_id = q->sdma_id / + get_num_sdma_engines(dqm); + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { + if (dqm->xgmi_sdma_bitmap == 0) + return -ENOMEM; + bit = __ffs64(dqm->xgmi_sdma_bitmap); + dqm->xgmi_sdma_bitmap &= ~(1ULL << bit); + q->sdma_id = bit; + /* sdma_engine_id is sdma id including + * both PCIe-optimized SDMAs and XGMI- + * optimized SDMAs. The calculation below + * assumes the first N engines are always + * PCIe-optimized ones + */ + q->properties.sdma_engine_id = get_num_sdma_engines(dqm) + + q->sdma_id % get_num_xgmi_sdma_engines(dqm); + q->properties.sdma_queue_id = q->sdma_id / + get_num_xgmi_sdma_engines(dqm); + } - bit = ffs(dqm->sdma_bitmap) - 1; - dqm->sdma_bitmap &= ~(1 << bit); - *sdma_queue_id = bit; + pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id); + pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id); return 0; } static void deallocate_sdma_queue(struct device_queue_manager *dqm, - unsigned int sdma_queue_id) -{ - if (sdma_queue_id >= get_num_sdma_queues(dqm)) - return; - dqm->sdma_bitmap |= (1 << sdma_queue_id); -} - -static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm, - struct queue *q, - struct qcm_process_device *qpd) + struct queue *q) { - struct mqd_manager *mqd_mgr; - int retval; - - mqd_mgr = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_SDMA); - if (!mqd_mgr) - return -ENOMEM; - - retval = allocate_sdma_queue(dqm, &q->sdma_id); - if (retval) - return retval; - - q->properties.sdma_queue_id = q->sdma_id / get_num_sdma_engines(dqm); - q->properties.sdma_engine_id = q->sdma_id % get_num_sdma_engines(dqm); - - retval = allocate_doorbell(qpd, q); - if (retval) - goto out_deallocate_sdma_queue; - - pr_debug("SDMA id is: %d\n", q->sdma_id); - pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id); - pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id); - - dqm->asic_ops.init_sdma_vm(dqm, q, qpd); - retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj, - &q->gart_mqd_addr, &q->properties); - if (retval) - goto out_deallocate_doorbell; - - retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, 0, 0, &q->properties, - NULL); - if (retval) - goto out_uninit_mqd; - - return 0; - -out_uninit_mqd: - mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); -out_deallocate_doorbell: - deallocate_doorbell(qpd, q); -out_deallocate_sdma_queue: - deallocate_sdma_queue(dqm, q->sdma_id); - - return retval; + if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { + if (q->sdma_id >= get_num_sdma_queues(dqm)) + return; + dqm->sdma_bitmap |= (1ULL << q->sdma_id); + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { + if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm)) + return; + dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id); + } } /* @@ -1027,8 +1040,8 @@ static int set_sched_resources(struct device_queue_manager *dqm) res.queue_mask |= (1ull << i); } - res.gws_mask = res.oac_mask = res.gds_heap_base = - res.gds_heap_size = 0; + res.gws_mask = ~0ull; + res.oac_mask = res.gds_heap_base = res.gds_heap_size = 0; pr_debug("Scheduling resources:\n" "vmid mask: 0x%8X\n" @@ -1046,8 +1059,10 @@ static int initialize_cpsch(struct device_queue_manager *dqm) INIT_LIST_HEAD(&dqm->queues); dqm->queue_count = dqm->processes_count = 0; dqm->sdma_queue_count = 0; + dqm->xgmi_sdma_queue_count = 0; dqm->active_runlist = false; - dqm->sdma_bitmap = (1 << get_num_sdma_queues(dqm)) - 1; + dqm->sdma_bitmap = ~0ULL >> (64 - get_num_sdma_queues(dqm)); + dqm->xgmi_sdma_bitmap = ~0ULL >> (64 - get_num_xgmi_sdma_queues(dqm)); INIT_WORK(&dqm->hw_exception_work, kfd_process_hw_exception); @@ -1085,6 +1100,7 @@ static int start_cpsch(struct device_queue_manager *dqm) dqm_lock(dqm); /* clear hang status when driver try to start the hw scheduler */ dqm->is_hws_hang = false; + dqm->sched_running = true; execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); dqm_unlock(dqm); @@ -1100,6 +1116,7 @@ static int stop_cpsch(struct device_queue_manager *dqm) { dqm_lock(dqm); unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); + dqm->sched_running = false; dqm_unlock(dqm); kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); @@ -1162,55 +1179,49 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, int retval; struct mqd_manager *mqd_mgr; - retval = 0; - - dqm_lock(dqm); - if (dqm->total_queue_count >= max_num_of_queues_per_device) { pr_warn("Can't create new usermode queue because %d queues were already created\n", dqm->total_queue_count); retval = -EPERM; - goto out_unlock; + goto out; } - if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { - retval = allocate_sdma_queue(dqm, &q->sdma_id); + if (q->properties.type == KFD_QUEUE_TYPE_SDMA || + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { + dqm_lock(dqm); + retval = allocate_sdma_queue(dqm, q); + dqm_unlock(dqm); if (retval) - goto out_unlock; - q->properties.sdma_queue_id = - q->sdma_id / get_num_sdma_engines(dqm); - q->properties.sdma_engine_id = - q->sdma_id % get_num_sdma_engines(dqm); + goto out; } retval = allocate_doorbell(qpd, q); if (retval) goto out_deallocate_sdma_queue; - mqd_mgr = dqm->ops.get_mqd_manager(dqm, - get_mqd_type_from_queue_type(q->properties.type)); + mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type( + q->properties.type)]; - if (!mqd_mgr) { + if (q->properties.type == KFD_QUEUE_TYPE_SDMA || + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) + dqm->asic_ops.init_sdma_vm(dqm, q, qpd); + q->properties.tba_addr = qpd->tba_addr; + q->properties.tma_addr = qpd->tma_addr; + q->mqd_mem_obj = mqd_mgr->allocate_mqd(mqd_mgr->dev, &q->properties); + if (!q->mqd_mem_obj) { retval = -ENOMEM; goto out_deallocate_doorbell; } + + dqm_lock(dqm); /* - * Eviction state logic: we only mark active queues as evicted - * to avoid the overhead of restoring inactive queues later + * Eviction state logic: mark all queues as evicted, even ones + * not currently active. Restoring inactive queues later only + * updates the is_evicted flag but is a no-op otherwise. */ - if (qpd->evicted) - q->properties.is_evicted = (q->properties.queue_size > 0 && - q->properties.queue_percent > 0 && - q->properties.queue_address != 0); - - dqm->asic_ops.init_sdma_vm(dqm, q, qpd); - - q->properties.tba_addr = qpd->tba_addr; - q->properties.tma_addr = qpd->tma_addr; - retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj, + q->properties.is_evicted = !!qpd->evicted; + mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj, &q->gart_mqd_addr, &q->properties); - if (retval) - goto out_deallocate_doorbell; list_add(&q->list, &qpd->queues_list); qpd->queue_count++; @@ -1222,6 +1233,8 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, if (q->properties.type == KFD_QUEUE_TYPE_SDMA) dqm->sdma_queue_count++; + else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) + dqm->xgmi_sdma_queue_count++; /* * Unconditionally increment this counter, regardless of the queue's * type or whether the queue is active. @@ -1237,11 +1250,13 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, out_deallocate_doorbell: deallocate_doorbell(qpd, q); out_deallocate_sdma_queue: - if (q->properties.type == KFD_QUEUE_TYPE_SDMA) - deallocate_sdma_queue(dqm, q->sdma_id); -out_unlock: - dqm_unlock(dqm); - + if (q->properties.type == KFD_QUEUE_TYPE_SDMA || + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { + dqm_lock(dqm); + deallocate_sdma_queue(dqm, q); + dqm_unlock(dqm); + } +out: return retval; } @@ -1269,12 +1284,18 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr, return 0; } -static int unmap_sdma_queues(struct device_queue_manager *dqm, - unsigned int sdma_engine) +static int unmap_sdma_queues(struct device_queue_manager *dqm) { - return pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_SDMA, - KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false, - sdma_engine); + int i, retval = 0; + + for (i = 0; i < dqm->dev->device_info->num_sdma_engines + + dqm->dev->device_info->num_xgmi_sdma_engines; i++) { + retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_SDMA, + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false, i); + if (retval) + return retval; + } + return retval; } /* dqm->lock mutex has to be locked before calling this function */ @@ -1282,13 +1303,15 @@ static int map_queues_cpsch(struct device_queue_manager *dqm) { int retval; + if (!dqm->sched_running) + return 0; if (dqm->queue_count <= 0 || dqm->processes_count <= 0) return 0; - if (dqm->active_runlist) return 0; retval = pm_send_runlist(&dqm->packets, &dqm->queues); + pr_debug("%s sent runlist\n", __func__); if (retval) { pr_err("failed to execute runlist\n"); return retval; @@ -1305,18 +1328,18 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, { int retval = 0; + if (!dqm->sched_running) + return 0; if (dqm->is_hws_hang) return -EIO; if (!dqm->active_runlist) return retval; - pr_debug("Before destroying queues, sdma queue count is : %u\n", - dqm->sdma_queue_count); + pr_debug("Before destroying queues, sdma queue count is : %u, xgmi sdma queue count is : %u\n", + dqm->sdma_queue_count, dqm->xgmi_sdma_queue_count); - if (dqm->sdma_queue_count > 0) { - unmap_sdma_queues(dqm, 0); - unmap_sdma_queues(dqm, 1); - } + if (dqm->sdma_queue_count > 0 || dqm->xgmi_sdma_queue_count) + unmap_sdma_queues(dqm); retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_COMPUTE, filter, filter_param, false, 0); @@ -1328,7 +1351,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, KFD_FENCE_COMPLETED); /* should be timed out */ retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED, - QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS); + queue_preemption_timeout_ms); if (retval) return retval; @@ -1380,18 +1403,17 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, } - mqd_mgr = dqm->ops.get_mqd_manager(dqm, - get_mqd_type_from_queue_type(q->properties.type)); - if (!mqd_mgr) { - retval = -ENOMEM; - goto failed; - } + mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type( + q->properties.type)]; deallocate_doorbell(qpd, q); if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { dqm->sdma_queue_count--; - deallocate_sdma_queue(dqm, q->sdma_id); + deallocate_sdma_queue(dqm, q); + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { + dqm->xgmi_sdma_queue_count--; + deallocate_sdma_queue(dqm, q); } list_del(&q->list); @@ -1404,8 +1426,6 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, qpd->reset_wavefronts = true; } - mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); - /* * Unconditionally decrement this counter, regardless of the queue's * type @@ -1416,9 +1436,11 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, dqm_unlock(dqm); + /* Do free_mqd after dqm_unlock(dqm) to avoid circular locking */ + mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); + return retval; -failed: failed_try_destroy_debugged_queue: dqm_unlock(dqm); @@ -1521,6 +1543,7 @@ static int process_termination_nocpsch(struct device_queue_manager *dqm, struct queue *q, *next; struct device_process_node *cur, *next_dpn; int retval = 0; + bool found = false; dqm_lock(dqm); @@ -1539,11 +1562,19 @@ static int process_termination_nocpsch(struct device_queue_manager *dqm, list_del(&cur->list); kfree(cur); dqm->processes_count--; + found = true; break; } } dqm_unlock(dqm); + + /* Outside the DQM lock because under the DQM lock we can't do + * reclaim or take other locks that others hold while reclaiming. + */ + if (found) + kfd_dec_compute_active(dqm->dev); + return retval; } @@ -1564,11 +1595,7 @@ static int get_wave_state(struct device_queue_manager *dqm, goto dqm_unlock; } - mqd_mgr = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE); - if (!mqd_mgr) { - r = -ENOMEM; - goto dqm_unlock; - } + mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_COMPUTE]; if (!mqd_mgr->get_wave_state) { r = -EINVAL; @@ -1593,6 +1620,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm, struct device_process_node *cur, *next_dpn; enum kfd_unmap_queues_filter filter = KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES; + bool found = false; retval = 0; @@ -1611,7 +1639,10 @@ static int process_termination_cpsch(struct device_queue_manager *dqm, list_for_each_entry(q, &qpd->queues_list, list) { if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { dqm->sdma_queue_count--; - deallocate_sdma_queue(dqm, q->sdma_id); + deallocate_sdma_queue(dqm, q); + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { + dqm->xgmi_sdma_queue_count--; + deallocate_sdma_queue(dqm, q); } if (q->properties.is_active) @@ -1626,6 +1657,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm, list_del(&cur->list); kfree(cur); dqm->processes_count--; + found = true; break; } } @@ -1637,21 +1669,69 @@ static int process_termination_cpsch(struct device_queue_manager *dqm, qpd->reset_wavefronts = false; } - /* lastly, free mqd resources */ + dqm_unlock(dqm); + + /* Outside the DQM lock because under the DQM lock we can't do + * reclaim or take other locks that others hold while reclaiming. + */ + if (found) + kfd_dec_compute_active(dqm->dev); + + /* Lastly, free mqd resources. + * Do free_mqd() after dqm_unlock to avoid circular locking. + */ list_for_each_entry_safe(q, next, &qpd->queues_list, list) { - mqd_mgr = dqm->ops.get_mqd_manager(dqm, - get_mqd_type_from_queue_type(q->properties.type)); - if (!mqd_mgr) { - retval = -ENOMEM; - goto out; - } + mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type( + q->properties.type)]; list_del(&q->list); qpd->queue_count--; - mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); + mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); } -out: - dqm_unlock(dqm); + return retval; +} + +static int init_mqd_managers(struct device_queue_manager *dqm) +{ + int i, j; + struct mqd_manager *mqd_mgr; + + for (i = 0; i < KFD_MQD_TYPE_MAX; i++) { + mqd_mgr = dqm->asic_ops.mqd_manager_init(i, dqm->dev); + if (!mqd_mgr) { + pr_err("mqd manager [%d] initialization failed\n", i); + goto out_free; + } + dqm->mqd_mgrs[i] = mqd_mgr; + } + + return 0; + +out_free: + for (j = 0; j < i; j++) { + kfree(dqm->mqd_mgrs[j]); + dqm->mqd_mgrs[j] = NULL; + } + + return -ENOMEM; +} + +/* Allocate one hiq mqd (HWS) and all SDMA mqd in a continuous trunk*/ +static int allocate_hiq_sdma_mqd(struct device_queue_manager *dqm) +{ + int retval; + struct kfd_dev *dev = dqm->dev; + struct kfd_mem_obj *mem_obj = &dqm->hiq_sdma_mqd; + uint32_t size = dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size * + (dev->device_info->num_sdma_engines + + dev->device_info->num_xgmi_sdma_engines) * + dev->device_info->num_sdma_queues_per_engine + + dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size; + + retval = amdgpu_amdkfd_alloc_gtt_mem(dev->kgd, size, + &(mem_obj->gtt_mem), &(mem_obj->gpu_addr), + (void *)&(mem_obj->cpu_ptr), true); + return retval; } @@ -1692,7 +1772,6 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) dqm->ops.stop = stop_cpsch; dqm->ops.destroy_queue = destroy_queue_cpsch; dqm->ops.update_queue = update_queue; - dqm->ops.get_mqd_manager = get_mqd_manager; dqm->ops.register_process = register_process; dqm->ops.unregister_process = unregister_process; dqm->ops.uninitialize = uninitialize; @@ -1712,7 +1791,6 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) dqm->ops.create_queue = create_queue_nocpsch; dqm->ops.destroy_queue = destroy_queue_nocpsch; dqm->ops.update_queue = update_queue; - dqm->ops.get_mqd_manager = get_mqd_manager; dqm->ops.register_process = register_process; dqm->ops.unregister_process = unregister_process; dqm->ops.initialize = initialize_nocpsch; @@ -1748,6 +1826,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) case CHIP_POLARIS10: case CHIP_POLARIS11: case CHIP_POLARIS12: + case CHIP_VEGAM: device_queue_manager_init_vi_tonga(&dqm->asic_ops); break; @@ -1755,14 +1834,29 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) case CHIP_VEGA12: case CHIP_VEGA20: case CHIP_RAVEN: + case CHIP_RENOIR: + case CHIP_ARCTURUS: device_queue_manager_init_v9(&dqm->asic_ops); break; + case CHIP_NAVI10: + case CHIP_NAVI12: + case CHIP_NAVI14: + device_queue_manager_init_v10_navi10(&dqm->asic_ops); + break; default: WARN(1, "Unexpected ASIC family %u", dev->device_info->asic_family); goto out_free; } + if (init_mqd_managers(dqm)) + goto out_free; + + if (allocate_hiq_sdma_mqd(dqm)) { + pr_err("Failed to allocate hiq sdma mqd trunk buffer\n"); + goto out_free; + } + if (!dqm->ops.initialize(dqm)) return dqm; @@ -1771,9 +1865,18 @@ out_free: return NULL; } +static void deallocate_hiq_sdma_mqd(struct kfd_dev *dev, + struct kfd_mem_obj *mqd) +{ + WARN(!mqd, "No hiq sdma mqd trunk to free"); + + amdgpu_amdkfd_free_gtt_mem(dev->kgd, mqd->gtt_mem); +} + void device_queue_manager_uninit(struct device_queue_manager *dqm) { dqm->ops.uninitialize(dqm); + deallocate_hiq_sdma_mqd(dqm->dev, &dqm->hiq_sdma_mqd); kfree(dqm); } @@ -1831,13 +1934,20 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data) int pipe, queue; int r = 0; + if (!dqm->sched_running) { + seq_printf(m, " Device is stopped\n"); + + return 0; + } + r = dqm->dev->kfd2kgd->hqd_dump(dqm->dev->kgd, - KFD_CIK_HIQ_PIPE, KFD_CIK_HIQ_QUEUE, &dump, &n_regs); + KFD_CIK_HIQ_PIPE, KFD_CIK_HIQ_QUEUE, + &dump, &n_regs); if (!r) { seq_printf(m, " HIQ on MEC %d Pipe %d Queue %d\n", - KFD_CIK_HIQ_PIPE/get_pipes_per_mec(dqm)+1, - KFD_CIK_HIQ_PIPE%get_pipes_per_mec(dqm), - KFD_CIK_HIQ_QUEUE); + KFD_CIK_HIQ_PIPE/get_pipes_per_mec(dqm)+1, + KFD_CIK_HIQ_PIPE%get_pipes_per_mec(dqm), + KFD_CIK_HIQ_QUEUE); seq_reg_dump(m, dump, n_regs); kfree(dump); @@ -1864,7 +1974,8 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data) } } - for (pipe = 0; pipe < get_num_sdma_engines(dqm); pipe++) { + for (pipe = 0; pipe < get_num_sdma_engines(dqm) + + get_num_xgmi_sdma_engines(dqm); pipe++) { for (queue = 0; queue < dqm->dev->device_info->num_sdma_queues_per_engine; queue++) { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h index 70e38a2e23b9..a8c37e6da027 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -31,8 +31,8 @@ #include "kfd_priv.h" #include "kfd_mqd_manager.h" -#define KFD_UNMAP_LATENCY_MS (4000) -#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (2 * KFD_UNMAP_LATENCY_MS + 1000) + +#define VMID_NUM 16 struct device_process_node { struct qcm_process_device *qpd; @@ -48,8 +48,6 @@ struct device_process_node { * * @update_queue: Queue update routine. * - * @get_mqd_manager: Returns the mqd manager according to the mqd type. - * * @exeute_queues: Dispatches the queues list to the H/W. * * @register_process: This routine associates a specific process with device. @@ -97,10 +95,6 @@ struct device_queue_manager_ops { int (*update_queue)(struct device_queue_manager *dqm, struct queue *q); - struct mqd_manager * (*get_mqd_manager) - (struct device_queue_manager *dqm, - enum KFD_MQD_TYPE type); - int (*register_process)(struct device_queue_manager *dqm, struct qcm_process_device *qpd); @@ -158,6 +152,8 @@ struct device_queue_manager_asic_ops { void (*init_sdma_vm)(struct device_queue_manager *dqm, struct queue *q, struct qcm_process_device *qpd); + struct mqd_manager * (*mqd_manager_init)(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); }; /** @@ -185,11 +181,14 @@ struct device_queue_manager { unsigned int processes_count; unsigned int queue_count; unsigned int sdma_queue_count; + unsigned int xgmi_sdma_queue_count; unsigned int total_queue_count; unsigned int next_pipe_to_allocate; unsigned int *allocated_queues; - unsigned int sdma_bitmap; - unsigned int vmid_bitmap; + uint64_t sdma_bitmap; + uint64_t xgmi_sdma_bitmap; + /* the pasid mapping for each kfd vmid */ + uint16_t vmid_pasid[VMID_NUM]; uint64_t pipelines_addr; struct kfd_mem_obj *pipeline_mem; uint64_t fence_gpu_addr; @@ -201,6 +200,8 @@ struct device_queue_manager { /* hw exception */ bool is_hws_hang; struct work_struct hw_exception_work; + struct kfd_mem_obj hiq_sdma_mqd; + bool sched_running; }; void device_queue_manager_init_cik( @@ -213,12 +214,15 @@ void device_queue_manager_init_vi_tonga( struct device_queue_manager_asic_ops *asic_ops); void device_queue_manager_init_v9( struct device_queue_manager_asic_ops *asic_ops); +void device_queue_manager_init_v10_navi10( + struct device_queue_manager_asic_ops *asic_ops); void program_sh_mem_settings(struct device_queue_manager *dqm, struct qcm_process_device *qpd); unsigned int get_queues_num(struct device_queue_manager *dqm); unsigned int get_queues_per_pipe(struct device_queue_manager *dqm); unsigned int get_pipes_per_mec(struct device_queue_manager *dqm); unsigned int get_num_sdma_queues(struct device_queue_manager *dqm); +unsigned int get_num_xgmi_sdma_queues(struct device_queue_manager *dqm); static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd) { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c index aed4c21417bf..0d26506798cf 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c @@ -48,6 +48,7 @@ void device_queue_manager_init_cik( asic_ops->set_cache_memory_policy = set_cache_memory_policy_cik; asic_ops->update_qpd = update_qpd_cik; asic_ops->init_sdma_vm = init_sdma_vm; + asic_ops->mqd_manager_init = mqd_manager_init_cik; } void device_queue_manager_init_cik_hawaii( @@ -56,6 +57,7 @@ void device_queue_manager_init_cik_hawaii( asic_ops->set_cache_memory_policy = set_cache_memory_policy_cik; asic_ops->update_qpd = update_qpd_cik_hawaii; asic_ops->init_sdma_vm = init_sdma_vm_hawaii; + asic_ops->mqd_manager_init = mqd_manager_init_cik_hawaii; } static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c new file mode 100644 index 000000000000..72e4d61ac752 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c @@ -0,0 +1,88 @@ +/* + * Copyright 2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include "kfd_device_queue_manager.h" +#include "navi10_enum.h" +#include "gc/gc_10_1_0_offset.h" +#include "gc/gc_10_1_0_sh_mask.h" + +static int update_qpd_v10(struct device_queue_manager *dqm, + struct qcm_process_device *qpd); +static void init_sdma_vm_v10(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd); + +void device_queue_manager_init_v10_navi10( + struct device_queue_manager_asic_ops *asic_ops) +{ + asic_ops->update_qpd = update_qpd_v10; + asic_ops->init_sdma_vm = init_sdma_vm_v10; + asic_ops->mqd_manager_init = mqd_manager_init_v10; +} + +static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd) +{ + uint32_t shared_base = pdd->lds_base >> 48; + uint32_t private_base = pdd->scratch_base >> 48; + + return (shared_base << SH_MEM_BASES__SHARED_BASE__SHIFT) | + private_base; +} + +static int update_qpd_v10(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct kfd_process_device *pdd; + + pdd = qpd_to_pdd(qpd); + + /* check if sh_mem_config register already configured */ + if (qpd->sh_mem_config == 0) { + qpd->sh_mem_config = + SH_MEM_ALIGNMENT_MODE_UNALIGNED << + SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT; +#if 0 + /* TODO: + * This shouldn't be an issue with Navi10. Verify. + */ + if (vega10_noretry) + qpd->sh_mem_config |= + 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT; +#endif + + qpd->sh_mem_ape1_limit = 0; + qpd->sh_mem_ape1_base = 0; + } + + qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd); + + pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases); + + return 0; +} + +static void init_sdma_vm_v10(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd) +{ + /* Not needed on SDMAv4 onwards any more */ + q->properties.sdma_vm_addr = 0; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c index 417515332c35..95a82ac455f2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c @@ -37,6 +37,7 @@ void device_queue_manager_init_v9( { asic_ops->update_qpd = update_qpd_v9; asic_ops->init_sdma_vm = init_sdma_vm_v9; + asic_ops->mqd_manager_init = mqd_manager_init_v9; } static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd) @@ -60,7 +61,7 @@ static int update_qpd_v9(struct device_queue_manager *dqm, qpd->sh_mem_config = SH_MEM_ALIGNMENT_MODE_UNALIGNED << SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT; - if (noretry && + if (amdgpu_noretry && !dqm->dev->device_info->needs_iommu_device) qpd->sh_mem_config |= 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c index c3a5dcfe877a..3a7cb2f88366 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c @@ -54,6 +54,7 @@ void device_queue_manager_init_vi( asic_ops->set_cache_memory_policy = set_cache_memory_policy_vi; asic_ops->update_qpd = update_qpd_vi; asic_ops->init_sdma_vm = init_sdma_vm; + asic_ops->mqd_manager_init = mqd_manager_init_vi; } void device_queue_manager_init_vi_tonga( @@ -62,6 +63,7 @@ void device_queue_manager_init_vi_tonga( asic_ops->set_cache_memory_policy = set_cache_memory_policy_vi_tonga; asic_ops->update_qpd = update_qpd_vi_tonga; asic_ops->init_sdma_vm = init_sdma_vm_tonga; + asic_ops->mqd_manager_init = mqd_manager_init_vi_tonga; } static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index e9f0e0a1b41c..908081c85de1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -852,8 +852,8 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p, if (type == KFD_EVENT_TYPE_MEMORY) { dev_warn(kfd_device, - "Sending SIGSEGV to HSA Process with PID %d ", - p->lead_thread->pid); + "Sending SIGSEGV to process %d (pasid 0x%x)", + p->lead_thread->pid, p->pasid); send_sig(SIGSEGV, p->lead_thread, 0); } @@ -861,13 +861,13 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p, if (send_signal) { if (send_sigterm) { dev_warn(kfd_device, - "Sending SIGTERM to HSA Process with PID %d ", - p->lead_thread->pid); + "Sending SIGTERM to process %d (pasid 0x%x)", + p->lead_thread->pid, p->pasid); send_sig(SIGTERM, p->lead_thread, 0); } else { dev_err(kfd_device, - "HSA Process (PID %d) got unhandled exception", - p->lead_thread->pid); + "Process %d (pasid 0x%x) got unhandled exception", + p->lead_thread->pid, p->pasid); } } } @@ -936,7 +936,8 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, /* Workaround on Raven to not kill the process when memory is freed * before IOMMU is able to finish processing all the excessive PPRs */ - if (dev->device_info->asic_family != CHIP_RAVEN) { + if (dev->device_info->asic_family != CHIP_RAVEN && + dev->device_info->asic_family != CHIP_RENOIR) { mutex_lock(&p->event_mutex); /* Lookup events by type and signal them */ @@ -983,7 +984,7 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, return; /* Presumably process exited. */ memset(&memory_exception_data, 0, sizeof(memory_exception_data)); memory_exception_data.gpu_id = dev->id; - memory_exception_data.failure.imprecise = 1; + memory_exception_data.failure.imprecise = true; /* Set failure reason */ if (info) { memory_exception_data.va = (info->page_addr) << PAGE_SHIFT; @@ -1011,25 +1012,41 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, void kfd_signal_reset_event(struct kfd_dev *dev) { struct kfd_hsa_hw_exception_data hw_exception_data; + struct kfd_hsa_memory_exception_data memory_exception_data; struct kfd_process *p; struct kfd_event *ev; unsigned int temp; uint32_t id, idx; + int reset_cause = atomic_read(&dev->sram_ecc_flag) ? + KFD_HW_EXCEPTION_ECC : + KFD_HW_EXCEPTION_GPU_HANG; /* Whole gpu reset caused by GPU hang and memory is lost */ memset(&hw_exception_data, 0, sizeof(hw_exception_data)); hw_exception_data.gpu_id = dev->id; hw_exception_data.memory_lost = 1; + hw_exception_data.reset_cause = reset_cause; + + memset(&memory_exception_data, 0, sizeof(memory_exception_data)); + memory_exception_data.ErrorType = KFD_MEM_ERR_SRAM_ECC; + memory_exception_data.gpu_id = dev->id; + memory_exception_data.failure.imprecise = true; idx = srcu_read_lock(&kfd_processes_srcu); hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { mutex_lock(&p->event_mutex); id = KFD_FIRST_NONSIGNAL_EVENT_ID; - idr_for_each_entry_continue(&p->event_idr, ev, id) + idr_for_each_entry_continue(&p->event_idr, ev, id) { if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) { ev->hw_exception_data = hw_exception_data; set_event(ev); } + if (ev->type == KFD_EVENT_TYPE_MEMORY && + reset_cause == KFD_HW_EXCEPTION_ECC) { + ev->memory_exception_data = memory_exception_data; + set_event(ev); + } + } mutex_unlock(&p->event_mutex); } srcu_read_unlock(&kfd_processes_srcu, idx); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c index 213ea5454d11..bb77b8890e77 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c @@ -369,8 +369,13 @@ int kfd_init_apertures(struct kfd_process *process) /*Iterating over all devices*/ while (kfd_topology_enum_kfd_devices(id, &dev) == 0) { - if (!dev) { - id++; /* Skip non GPU devices */ + if (!dev || kfd_devcgroup_check_permission(dev)) { + /* Skip non GPU devices and devices to which the + * current process have no access to. Access can be + * limited by placing the process in a specific + * cgroup hierarchy + */ + id++; continue; } @@ -398,12 +403,18 @@ int kfd_init_apertures(struct kfd_process *process) case CHIP_POLARIS10: case CHIP_POLARIS11: case CHIP_POLARIS12: + case CHIP_VEGAM: kfd_init_apertures_vi(pdd, id); break; case CHIP_VEGA10: case CHIP_VEGA12: case CHIP_VEGA20: case CHIP_RAVEN: + case CHIP_RENOIR: + case CHIP_ARCTURUS: + case CHIP_NAVI10: + case CHIP_NAVI12: + case CHIP_NAVI14: kfd_init_apertures_v9(pdd, id); break; default: @@ -435,5 +446,3 @@ int kfd_init_apertures(struct kfd_process *process) return 0; } - - diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c index a85904ad0d5f..e05d75ecda21 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -54,8 +54,7 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev, memcpy(patched_ihre, ih_ring_entry, dev->device_info->ih_ring_entry_size); - pasid = dev->kfd2kgd->get_atc_vmid_pasid_mapping_pasid( - dev->kgd, vmid); + pasid = dev->dqm->vmid_pasid[vmid]; /* Patch the pasid field */ patched_ihre[3] = cpu_to_le32((le32_to_cpu(patched_ihre[3]) @@ -80,6 +79,7 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev, source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG || source_id == SOC15_INTSRC_CP_BAD_OPCODE || client_id == SOC15_IH_CLIENTID_VMC || + client_id == SOC15_IH_CLIENTID_VMC1 || client_id == SOC15_IH_CLIENTID_UTCL2; } @@ -104,6 +104,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev, else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) kfd_signal_hw_exception_event(pasid); else if (client_id == SOC15_IH_CLIENTID_VMC || + client_id == SOC15_IH_CLIENTID_VMC1 || client_id == SOC15_IH_CLIENTID_UTCL2) { struct kfd_vm_fault_info info = {0}; uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c index c56ac47cd318..bc47f6a44456 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c @@ -62,6 +62,11 @@ int kfd_interrupt_init(struct kfd_dev *kfd) } kfd->ih_wq = alloc_workqueue("KFD IH", WQ_HIGHPRI, 1); + if (unlikely(!kfd->ih_wq)) { + kfifo_free(&kfd->ih_fifo); + dev_err(kfd_chardev(), "Failed to allocate KFD IH workqueue\n"); + return -ENOMEM; + } spin_lock_init(&kfd->interrupt_lock); INIT_WORK(&kfd->interrupt_work, interrupt_wq); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c index 01494752c36a..193e2835bd4d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c @@ -66,16 +66,8 @@ int kfd_iommu_device_init(struct kfd_dev *kfd) top_dev = kfd_topology_device_by_id(kfd->id); - /* - * Overwrite ATS capability according to needs_iommu_device to fix - * potential missing corresponding bit in CRAT of BIOS. - */ - if (!kfd->device_info->needs_iommu_device) { - top_dev->node_props.capability &= ~HSA_CAP_ATS_PRESENT; + if (!kfd->device_info->needs_iommu_device) return 0; - } - - top_dev->node_props.capability |= HSA_CAP_ATS_PRESENT; iommu_info.flags = 0; err = amd_iommu_device_info(kfd->pdev, &iommu_info); @@ -168,7 +160,7 @@ static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid) if (!p) return; - pr_debug("Unbinding process %d from IOMMU\n", pasid); + pr_debug("Unbinding process 0x%x from IOMMU\n", pasid); mutex_lock(kfd_get_dbgmgr_mutex()); @@ -202,7 +194,7 @@ static int iommu_invalid_ppr_cb(struct pci_dev *pdev, int pasid, struct kfd_dev *dev; dev_warn_ratelimited(kfd_device, - "Invalid PPR device %x:%x.%x pasid %d address 0x%lX flags 0x%X", + "Invalid PPR device %x:%x.%x pasid 0x%x address 0x%lX flags 0x%X", PCI_BUS_NUM(pdev->devfn), PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn), @@ -243,7 +235,7 @@ static int kfd_bind_processes_to_device(struct kfd_dev *kfd) err = amd_iommu_bind_pasid(kfd->pdev, p->pasid, p->lead_thread); if (err < 0) { - pr_err("Unexpected pasid %d binding failure\n", + pr_err("Unexpected pasid 0x%x binding failure\n", p->pasid); mutex_unlock(&p->mutex); break; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c index f1596881f20a..11d244891393 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c @@ -58,9 +58,10 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, kq->nop_packet = nop.u32all; switch (type) { case KFD_QUEUE_TYPE_DIQ: + kq->mqd_mgr = dev->dqm->mqd_mgrs[KFD_MQD_TYPE_DIQ]; + break; case KFD_QUEUE_TYPE_HIQ: - kq->mqd_mgr = dev->dqm->ops.get_mqd_manager(dev->dqm, - KFD_MQD_TYPE_HIQ); + kq->mqd_mgr = dev->dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]; break; default: pr_err("Invalid queue type %d\n", type); @@ -131,13 +132,14 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, kq->queue->device = dev; kq->queue->process = kfd_get_process(current); - retval = kq->mqd_mgr->init_mqd(kq->mqd_mgr, &kq->queue->mqd, - &kq->queue->mqd_mem_obj, + kq->queue->mqd_mem_obj = kq->mqd_mgr->allocate_mqd(kq->mqd_mgr->dev, + &kq->queue->properties); + if (!kq->queue->mqd_mem_obj) + goto err_allocate_mqd; + kq->mqd_mgr->init_mqd(kq->mqd_mgr, &kq->queue->mqd, + kq->queue->mqd_mem_obj, &kq->queue->gart_mqd_addr, &kq->queue->properties); - if (retval != 0) - goto err_init_mqd; - /* assign HIQ to HQD */ if (type == KFD_QUEUE_TYPE_HIQ) { pr_debug("Assigning hiq to hqd\n"); @@ -163,7 +165,8 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev, return true; err_alloc_fence: -err_init_mqd: + kq->mqd_mgr->free_mqd(kq->mqd_mgr, kq->queue->mqd, kq->queue->mqd_mem_obj); +err_allocate_mqd: uninit_queue(kq->queue); err_init_queue: kfd_gtt_sa_free(dev, kq->wptr_mem); @@ -192,7 +195,7 @@ static void uninitialize(struct kernel_queue *kq) else if (kq->queue->properties.type == KFD_QUEUE_TYPE_DIQ) kfd_gtt_sa_free(kq->dev, kq->fence_mem_obj); - kq->mqd_mgr->uninit_mqd(kq->mqd_mgr, kq->queue->mqd, + kq->mqd_mgr->free_mqd(kq->mqd_mgr, kq->queue->mqd, kq->queue->mqd_mem_obj); kfd_gtt_sa_free(kq->dev, kq->rptr_mem); @@ -314,6 +317,7 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, case CHIP_POLARIS10: case CHIP_POLARIS11: case CHIP_POLARIS12: + case CHIP_VEGAM: kernel_queue_init_vi(&kq->ops_asic_specific); break; @@ -326,8 +330,15 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, case CHIP_VEGA12: case CHIP_VEGA20: case CHIP_RAVEN: + case CHIP_RENOIR: + case CHIP_ARCTURUS: kernel_queue_init_v9(&kq->ops_asic_specific); break; + case CHIP_NAVI10: + case CHIP_NAVI12: + case CHIP_NAVI14: + kernel_queue_init_v10(&kq->ops_asic_specific); + break; default: WARN(1, "Unexpected ASIC family %u", dev->device_info->asic_family); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h index a7116a939029..365fc674fea4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h @@ -102,5 +102,6 @@ struct kernel_queue { void kernel_queue_init_cik(struct kernel_queue_ops *ops); void kernel_queue_init_vi(struct kernel_queue_ops *ops); void kernel_queue_init_v9(struct kernel_queue_ops *ops); +void kernel_queue_init_v10(struct kernel_queue_ops *ops); #endif /* KFD_KERNEL_QUEUE_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v10.c new file mode 100644 index 000000000000..aed32ab7102e --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v10.c @@ -0,0 +1,348 @@ +/* + * Copyright 2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include "kfd_kernel_queue.h" +#include "kfd_device_queue_manager.h" +#include "kfd_pm4_headers_ai.h" +#include "kfd_pm4_opcodes.h" +#include "gc/gc_10_1_0_sh_mask.h" + +static bool initialize_v10(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size); +static void uninitialize_v10(struct kernel_queue *kq); +static void submit_packet_v10(struct kernel_queue *kq); + +void kernel_queue_init_v10(struct kernel_queue_ops *ops) +{ + ops->initialize = initialize_v10; + ops->uninitialize = uninitialize_v10; + ops->submit_packet = submit_packet_v10; +} + +static bool initialize_v10(struct kernel_queue *kq, struct kfd_dev *dev, + enum kfd_queue_type type, unsigned int queue_size) +{ + int retval; + + retval = kfd_gtt_sa_allocate(dev, PAGE_SIZE, &kq->eop_mem); + if (retval != 0) + return false; + + kq->eop_gpu_addr = kq->eop_mem->gpu_addr; + kq->eop_kernel_addr = kq->eop_mem->cpu_ptr; + + memset(kq->eop_kernel_addr, 0, PAGE_SIZE); + + return true; +} + +static void uninitialize_v10(struct kernel_queue *kq) +{ + kfd_gtt_sa_free(kq->dev, kq->eop_mem); +} + +static void submit_packet_v10(struct kernel_queue *kq) +{ + *kq->wptr64_kernel = kq->pending_wptr64; + write_kernel_doorbell64(kq->queue->properties.doorbell_ptr, + kq->pending_wptr64); +} + +static int pm_map_process_v10(struct packet_manager *pm, + uint32_t *buffer, struct qcm_process_device *qpd) +{ + struct pm4_mes_map_process *packet; + uint64_t vm_page_table_base_addr = qpd->page_table_base; + + packet = (struct pm4_mes_map_process *)buffer; + memset(buffer, 0, sizeof(struct pm4_mes_map_process)); + + packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, + sizeof(struct pm4_mes_map_process)); + packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; + packet->bitfields2.process_quantum = 1; + packet->bitfields2.pasid = qpd->pqm->process->pasid; + packet->bitfields14.gds_size = qpd->gds_size; + packet->bitfields14.num_gws = qpd->num_gws; + packet->bitfields14.num_oac = qpd->num_oac; + packet->bitfields14.sdma_enable = 1; + + packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count; + + packet->sh_mem_config = qpd->sh_mem_config; + packet->sh_mem_bases = qpd->sh_mem_bases; + if (qpd->tba_addr) { + packet->sq_shader_tba_lo = lower_32_bits(qpd->tba_addr >> 8); + packet->sq_shader_tba_hi = (1 << SQ_SHADER_TBA_HI__TRAP_EN__SHIFT) | + upper_32_bits(qpd->tba_addr >> 8); + packet->sq_shader_tma_lo = lower_32_bits(qpd->tma_addr >> 8); + packet->sq_shader_tma_hi = upper_32_bits(qpd->tma_addr >> 8); + } + + packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area); + packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area); + + packet->vm_context_page_table_base_addr_lo32 = + lower_32_bits(vm_page_table_base_addr); + packet->vm_context_page_table_base_addr_hi32 = + upper_32_bits(vm_page_table_base_addr); + + return 0; +} + +static int pm_runlist_v10(struct packet_manager *pm, uint32_t *buffer, + uint64_t ib, size_t ib_size_in_dwords, bool chain) +{ + struct pm4_mes_runlist *packet; + + int concurrent_proc_cnt = 0; + struct kfd_dev *kfd = pm->dqm->dev; + + /* Determine the number of processes to map together to HW: + * it can not exceed the number of VMIDs available to the + * scheduler, and it is determined by the smaller of the number + * of processes in the runlist and kfd module parameter + * hws_max_conc_proc. + * Note: the arbitration between the number of VMIDs and + * hws_max_conc_proc has been done in + * kgd2kfd_device_init(). + */ + concurrent_proc_cnt = min(pm->dqm->processes_count, + kfd->max_proc_per_quantum); + + + packet = (struct pm4_mes_runlist *)buffer; + + memset(buffer, 0, sizeof(struct pm4_mes_runlist)); + packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST, + sizeof(struct pm4_mes_runlist)); + + packet->bitfields4.ib_size = ib_size_in_dwords; + packet->bitfields4.chain = chain ? 1 : 0; + packet->bitfields4.offload_polling = 0; + packet->bitfields4.valid = 1; + packet->bitfields4.process_cnt = concurrent_proc_cnt; + packet->ordinal2 = lower_32_bits(ib); + packet->ib_base_hi = upper_32_bits(ib); + + return 0; +} + +static int pm_map_queues_v10(struct packet_manager *pm, uint32_t *buffer, + struct queue *q, bool is_static) +{ + struct pm4_mes_map_queues *packet; + bool use_static = is_static; + + packet = (struct pm4_mes_map_queues *)buffer; + memset(buffer, 0, sizeof(struct pm4_mes_map_queues)); + + packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES, + sizeof(struct pm4_mes_map_queues)); + packet->bitfields2.num_queues = 1; + packet->bitfields2.queue_sel = + queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; + + packet->bitfields2.engine_sel = + engine_sel__mes_map_queues__compute_vi; + packet->bitfields2.queue_type = + queue_type__mes_map_queues__normal_compute_vi; + + switch (q->properties.type) { + case KFD_QUEUE_TYPE_COMPUTE: + if (use_static) + packet->bitfields2.queue_type = + queue_type__mes_map_queues__normal_latency_static_queue_vi; + break; + case KFD_QUEUE_TYPE_DIQ: + packet->bitfields2.queue_type = + queue_type__mes_map_queues__debug_interface_queue_vi; + break; + case KFD_QUEUE_TYPE_SDMA: + case KFD_QUEUE_TYPE_SDMA_XGMI: + packet->bitfields2.engine_sel = q->properties.sdma_engine_id + + engine_sel__mes_map_queues__sdma0_vi; + use_static = false; /* no static queues under SDMA */ + break; + default: + WARN(1, "queue type %d\n", q->properties.type); + return -EINVAL; + } + packet->bitfields3.doorbell_offset = + q->properties.doorbell_off; + + packet->mqd_addr_lo = + lower_32_bits(q->gart_mqd_addr); + + packet->mqd_addr_hi = + upper_32_bits(q->gart_mqd_addr); + + packet->wptr_addr_lo = + lower_32_bits((uint64_t)q->properties.write_ptr); + + packet->wptr_addr_hi = + upper_32_bits((uint64_t)q->properties.write_ptr); + + return 0; +} + +static int pm_unmap_queues_v10(struct packet_manager *pm, uint32_t *buffer, + enum kfd_queue_type type, + enum kfd_unmap_queues_filter filter, + uint32_t filter_param, bool reset, + unsigned int sdma_engine) +{ + struct pm4_mes_unmap_queues *packet; + + packet = (struct pm4_mes_unmap_queues *)buffer; + memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues)); + + packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES, + sizeof(struct pm4_mes_unmap_queues)); + switch (type) { + case KFD_QUEUE_TYPE_COMPUTE: + case KFD_QUEUE_TYPE_DIQ: + packet->bitfields2.engine_sel = + engine_sel__mes_unmap_queues__compute; + break; + case KFD_QUEUE_TYPE_SDMA: + case KFD_QUEUE_TYPE_SDMA_XGMI: + packet->bitfields2.engine_sel = + engine_sel__mes_unmap_queues__sdma0 + sdma_engine; + break; + default: + WARN(1, "queue type %d\n", type); + break; + } + + if (reset) + packet->bitfields2.action = + action__mes_unmap_queues__reset_queues; + else + packet->bitfields2.action = + action__mes_unmap_queues__preempt_queues; + + switch (filter) { + case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__perform_request_on_specified_queues; + packet->bitfields2.num_queues = 1; + packet->bitfields3b.doorbell_offset0 = filter_param; + break; + case KFD_UNMAP_QUEUES_FILTER_BY_PASID: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__perform_request_on_pasid_queues; + packet->bitfields3a.pasid = filter_param; + break; + case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES: + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__unmap_all_queues; + break; + case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES: + /* in this case, we do not preempt static queues */ + packet->bitfields2.queue_sel = + queue_sel__mes_unmap_queues__unmap_all_non_static_queues; + break; + default: + WARN(1, "filter %d\n", filter); + break; + } + + return 0; + +} + +static int pm_query_status_v10(struct packet_manager *pm, uint32_t *buffer, + uint64_t fence_address, uint32_t fence_value) +{ + struct pm4_mes_query_status *packet; + + packet = (struct pm4_mes_query_status *)buffer; + memset(buffer, 0, sizeof(struct pm4_mes_query_status)); + + + packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS, + sizeof(struct pm4_mes_query_status)); + + packet->bitfields2.context_id = 0; + packet->bitfields2.interrupt_sel = + interrupt_sel__mes_query_status__completion_status; + packet->bitfields2.command = + command__mes_query_status__fence_only_after_write_ack; + + packet->addr_hi = upper_32_bits((uint64_t)fence_address); + packet->addr_lo = lower_32_bits((uint64_t)fence_address); + packet->data_hi = upper_32_bits((uint64_t)fence_value); + packet->data_lo = lower_32_bits((uint64_t)fence_value); + + return 0; +} + + +static int pm_release_mem_v10(uint64_t gpu_addr, uint32_t *buffer) +{ + struct pm4_mec_release_mem *packet; + + WARN_ON(!buffer); + + packet = (struct pm4_mec_release_mem *)buffer; + memset(buffer, 0, sizeof(struct pm4_mec_release_mem)); + + packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM, + sizeof(struct pm4_mec_release_mem)); + + packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; + packet->bitfields2.event_index = event_index__mec_release_mem__end_of_pipe; + packet->bitfields2.tcl1_action_ena = 1; + packet->bitfields2.tc_action_ena = 1; + packet->bitfields2.cache_policy = cache_policy__mec_release_mem__lru; + + packet->bitfields3.data_sel = data_sel__mec_release_mem__send_32_bit_low; + packet->bitfields3.int_sel = + int_sel__mec_release_mem__send_interrupt_after_write_confirm; + + packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2; + packet->address_hi = upper_32_bits(gpu_addr); + + packet->data_lo = 0; + + return sizeof(struct pm4_mec_release_mem) / sizeof(unsigned int); +} + +const struct packet_manager_funcs kfd_v10_pm_funcs = { + .map_process = pm_map_process_v10, + .runlist = pm_runlist_v10, + .set_resources = pm_set_resources_vi, + .map_queues = pm_map_queues_v10, + .unmap_queues = pm_unmap_queues_v10, + .query_status = pm_query_status_v10, + .release_mem = pm_release_mem_v10, + .map_process_size = sizeof(struct pm4_mes_map_process), + .runlist_size = sizeof(struct pm4_mes_runlist), + .set_resources_size = sizeof(struct pm4_mes_set_resources), + .map_queues_size = sizeof(struct pm4_mes_map_queues), + .unmap_queues_size = sizeof(struct pm4_mes_unmap_queues), + .query_status_size = sizeof(struct pm4_mes_query_status), + .release_mem_size = sizeof(struct pm4_mec_release_mem) +}; + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c index 33830b1a5a54..9a4bafb2e175 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c @@ -81,7 +81,8 @@ static int pm_map_process_v9(struct packet_manager *pm, packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; packet->bitfields2.process_quantum = 1; packet->bitfields2.pasid = qpd->pqm->process->pasid; - packet->bitfields14.gds_size = qpd->gds_size; + packet->bitfields14.gds_size = qpd->gds_size & 0x3F; + packet->bitfields14.gds_size_hi = (qpd->gds_size >> 6) & 0xF; packet->bitfields14.num_gws = qpd->num_gws; packet->bitfields14.num_oac = qpd->num_oac; packet->bitfields14.sdma_enable = 1; @@ -134,6 +135,7 @@ static int pm_runlist_v9(struct packet_manager *pm, uint32_t *buffer, packet->bitfields4.ib_size = ib_size_in_dwords; packet->bitfields4.chain = chain ? 1 : 0; packet->bitfields4.offload_polling = 0; + packet->bitfields4.chained_runlist_idle_disable = chain ? 1 : 0; packet->bitfields4.valid = 1; packet->bitfields4.process_cnt = concurrent_proc_cnt; packet->ordinal2 = lower_32_bits(ib); @@ -142,6 +144,34 @@ static int pm_runlist_v9(struct packet_manager *pm, uint32_t *buffer, return 0; } +static int pm_set_resources_v9(struct packet_manager *pm, uint32_t *buffer, + struct scheduling_resources *res) +{ + struct pm4_mes_set_resources *packet; + + packet = (struct pm4_mes_set_resources *)buffer; + memset(buffer, 0, sizeof(struct pm4_mes_set_resources)); + + packet->header.u32All = pm_build_pm4_header(IT_SET_RESOURCES, + sizeof(struct pm4_mes_set_resources)); + + packet->bitfields2.queue_type = + queue_type__mes_set_resources__hsa_interface_queue_hiq; + packet->bitfields2.vmid_mask = res->vmid_mask; + packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100; + packet->bitfields7.oac_mask = res->oac_mask; + packet->bitfields8.gds_heap_base = res->gds_heap_base; + packet->bitfields8.gds_heap_size = res->gds_heap_size; + + packet->gws_mask_lo = lower_32_bits(res->gws_mask); + packet->gws_mask_hi = upper_32_bits(res->gws_mask); + + packet->queue_mask_lo = lower_32_bits(res->queue_mask); + packet->queue_mask_hi = upper_32_bits(res->queue_mask); + + return 0; +} + static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, struct queue *q, bool is_static) { @@ -153,14 +183,15 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES, sizeof(struct pm4_mes_map_queues)); - packet->bitfields2.alloc_format = - alloc_format__mes_map_queues__one_per_pipe_vi; packet->bitfields2.num_queues = 1; packet->bitfields2.queue_sel = queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; packet->bitfields2.engine_sel = engine_sel__mes_map_queues__compute_vi; + packet->bitfields2.gws_control_queue = q->gws ? 1 : 0; + packet->bitfields2.extended_engine_sel = + extended_engine_sel__mes_map_queues__legacy_engine_sel; packet->bitfields2.queue_type = queue_type__mes_map_queues__normal_compute_vi; @@ -175,9 +206,16 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, queue_type__mes_map_queues__debug_interface_queue_vi; break; case KFD_QUEUE_TYPE_SDMA: - packet->bitfields2.engine_sel = q->properties.sdma_engine_id + - engine_sel__mes_map_queues__sdma0_vi; + case KFD_QUEUE_TYPE_SDMA_XGMI: use_static = false; /* no static queues under SDMA */ + if (q->properties.sdma_engine_id < 2) + packet->bitfields2.engine_sel = q->properties.sdma_engine_id + + engine_sel__mes_map_queues__sdma0_vi; + else { + packet->bitfields2.extended_engine_sel = + extended_engine_sel__mes_map_queues__sdma0_to_7_sel; + packet->bitfields2.engine_sel = q->properties.sdma_engine_id; + } break; default: WARN(1, "queue type %d", q->properties.type); @@ -217,12 +255,23 @@ static int pm_unmap_queues_v9(struct packet_manager *pm, uint32_t *buffer, switch (type) { case KFD_QUEUE_TYPE_COMPUTE: case KFD_QUEUE_TYPE_DIQ: + packet->bitfields2.extended_engine_sel = + extended_engine_sel__mes_unmap_queues__legacy_engine_sel; packet->bitfields2.engine_sel = engine_sel__mes_unmap_queues__compute; break; case KFD_QUEUE_TYPE_SDMA: - packet->bitfields2.engine_sel = - engine_sel__mes_unmap_queues__sdma0 + sdma_engine; + case KFD_QUEUE_TYPE_SDMA_XGMI: + if (sdma_engine < 2) { + packet->bitfields2.extended_engine_sel = + extended_engine_sel__mes_unmap_queues__legacy_engine_sel; + packet->bitfields2.engine_sel = + engine_sel__mes_unmap_queues__sdma0 + sdma_engine; + } else { + packet->bitfields2.extended_engine_sel = + extended_engine_sel__mes_unmap_queues__sdma0_to_7_sel; + packet->bitfields2.engine_sel = sdma_engine; + } break; default: WARN(1, "queue type %d", type); @@ -324,7 +373,7 @@ static int pm_release_mem_v9(uint64_t gpu_addr, uint32_t *buffer) const struct packet_manager_funcs kfd_v9_pm_funcs = { .map_process = pm_map_process_v9, .runlist = pm_runlist_v9, - .set_resources = pm_set_resources_vi, + .set_resources = pm_set_resources_v9, .map_queues = pm_map_queues_v9, .unmap_queues = pm_unmap_queues_v9, .query_status = pm_query_status_v9, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c index bf20c6d32ef3..2adaf40027eb 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c @@ -190,8 +190,6 @@ static int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer, packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES, sizeof(struct pm4_mes_map_queues)); - packet->bitfields2.alloc_format = - alloc_format__mes_map_queues__one_per_pipe_vi; packet->bitfields2.num_queues = 1; packet->bitfields2.queue_sel = queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi; @@ -212,6 +210,7 @@ static int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer, queue_type__mes_map_queues__debug_interface_queue_vi; break; case KFD_QUEUE_TYPE_SDMA: + case KFD_QUEUE_TYPE_SDMA_XGMI: packet->bitfields2.engine_sel = q->properties.sdma_engine_id + engine_sel__mes_map_queues__sdma0_vi; use_static = false; /* no static queues under SDMA */ @@ -258,6 +257,7 @@ static int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer, engine_sel__mes_unmap_queues__compute; break; case KFD_QUEUE_TYPE_SDMA: + case KFD_QUEUE_TYPE_SDMA_XGMI: packet->bitfields2.engine_sel = engine_sel__mes_unmap_queues__sdma0 + sdma_engine; break; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c index 932007eb9168..f4b7f7e6c40e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c @@ -56,6 +56,11 @@ static int kfd_init(void) if (err < 0) goto err_create_wq; + /* Ignore the return value, so that we can continue + * to init the KFD, even if procfs isn't craated + */ + kfd_procfs_init(); + kfd_debugfs_init(); return 0; @@ -72,11 +77,12 @@ static void kfd_exit(void) { kfd_debugfs_fini(); kfd_process_destroy_wq(); + kfd_procfs_shutdown(); kfd_topology_shutdown(); kfd_chardev_exit(); } -int kgd2kfd_init() +int kgd2kfd_init(void) { return kfd_init(); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c index aed9b9b82213..88813dad731f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c @@ -23,34 +23,74 @@ #include "kfd_mqd_manager.h" #include "amdgpu_amdkfd.h" +#include "kfd_device_queue_manager.h" -struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, - struct kfd_dev *dev) +/* Mapping queue priority to pipe priority, indexed by queue priority */ +int pipe_priority_map[] = { + KFD_PIPE_PRIORITY_CS_LOW, + KFD_PIPE_PRIORITY_CS_LOW, + KFD_PIPE_PRIORITY_CS_LOW, + KFD_PIPE_PRIORITY_CS_LOW, + KFD_PIPE_PRIORITY_CS_LOW, + KFD_PIPE_PRIORITY_CS_LOW, + KFD_PIPE_PRIORITY_CS_LOW, + KFD_PIPE_PRIORITY_CS_MEDIUM, + KFD_PIPE_PRIORITY_CS_MEDIUM, + KFD_PIPE_PRIORITY_CS_MEDIUM, + KFD_PIPE_PRIORITY_CS_MEDIUM, + KFD_PIPE_PRIORITY_CS_HIGH, + KFD_PIPE_PRIORITY_CS_HIGH, + KFD_PIPE_PRIORITY_CS_HIGH, + KFD_PIPE_PRIORITY_CS_HIGH, + KFD_PIPE_PRIORITY_CS_HIGH +}; + +struct kfd_mem_obj *allocate_hiq_mqd(struct kfd_dev *dev, struct queue_properties *q) { - switch (dev->device_info->asic_family) { - case CHIP_KAVERI: - return mqd_manager_init_cik(type, dev); - case CHIP_HAWAII: - return mqd_manager_init_cik_hawaii(type, dev); - case CHIP_CARRIZO: - return mqd_manager_init_vi(type, dev); - case CHIP_TONGA: - case CHIP_FIJI: - case CHIP_POLARIS10: - case CHIP_POLARIS11: - case CHIP_POLARIS12: - return mqd_manager_init_vi_tonga(type, dev); - case CHIP_VEGA10: - case CHIP_VEGA12: - case CHIP_VEGA20: - case CHIP_RAVEN: - return mqd_manager_init_v9(type, dev); - default: - WARN(1, "Unexpected ASIC family %u", - dev->device_info->asic_family); - } + struct kfd_mem_obj *mqd_mem_obj = NULL; + + mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); + if (!mqd_mem_obj) + return NULL; + + mqd_mem_obj->gtt_mem = dev->dqm->hiq_sdma_mqd.gtt_mem; + mqd_mem_obj->gpu_addr = dev->dqm->hiq_sdma_mqd.gpu_addr; + mqd_mem_obj->cpu_ptr = dev->dqm->hiq_sdma_mqd.cpu_ptr; + + return mqd_mem_obj; +} + +struct kfd_mem_obj *allocate_sdma_mqd(struct kfd_dev *dev, + struct queue_properties *q) +{ + struct kfd_mem_obj *mqd_mem_obj = NULL; + uint64_t offset; - return NULL; + mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); + if (!mqd_mem_obj) + return NULL; + + offset = (q->sdma_engine_id * + dev->device_info->num_sdma_queues_per_engine + + q->sdma_queue_id) * + dev->dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size; + + offset += dev->dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size; + + mqd_mem_obj->gtt_mem = (void *)((uint64_t)dev->dqm->hiq_sdma_mqd.gtt_mem + + offset); + mqd_mem_obj->gpu_addr = dev->dqm->hiq_sdma_mqd.gpu_addr + offset; + mqd_mem_obj->cpu_ptr = (uint32_t *)((uint64_t) + dev->dqm->hiq_sdma_mqd.cpu_ptr + offset); + + return mqd_mem_obj; +} + +void free_mqd_hiq_sdma(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj) +{ + WARN_ON(!mqd_mem_obj->gtt_mem); + kfree(mqd_mem_obj); } void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm, @@ -58,8 +98,8 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm, uint32_t *se_mask) { struct kfd_cu_info cu_info; - uint32_t cu_per_sh[4] = {0}; - int i, se, cu = 0; + uint32_t cu_per_se[KFD_MAX_NUM_SE] = {0}; + int i, se, sh, cu = 0; amdgpu_amdkfd_get_cu_info(mm->dev->kgd, &cu_info); @@ -67,8 +107,8 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm, cu_mask_count = cu_info.cu_active_number; for (se = 0; se < cu_info.num_shader_engines; se++) - for (i = 0; i < 4; i++) - cu_per_sh[se] += hweight32(cu_info.cu_bitmap[se][i]); + for (sh = 0; sh < cu_info.num_shader_arrays_per_engine; sh++) + cu_per_se[se] += hweight32(cu_info.cu_bitmap[se % 4][sh + (se / 4)]); /* Symmetrically map cu_mask to all SEs: * cu_mask[0] bit0 -> se_mask[0] bit0; @@ -88,6 +128,6 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm, se = 0; cu++; } - } while (cu >= cu_per_sh[se] && cu < 32); + } while (cu >= cu_per_se[se] && cu < 32); } } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h index f8261313ae7b..fbdb16418847 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h @@ -26,6 +26,8 @@ #include "kfd_priv.h" +#define KFD_MAX_NUM_SE 8 + /** * struct mqd_manager * @@ -39,7 +41,7 @@ * @destroy_mqd: Destroys the HQD slot and by that preempt the relevant queue. * Used only for no cp scheduling. * - * @uninit_mqd: Releases the mqd buffer from local gpu memory. + * @free_mqd: Releases the mqd buffer from local gpu memory. * * @is_occupied: Checks if the relevant HQD slot is occupied. * @@ -62,10 +64,13 @@ * per KFD_MQD_TYPE for each device. * */ - +extern int pipe_priority_map[]; struct mqd_manager { - int (*init_mqd)(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct kfd_mem_obj* (*allocate_mqd)(struct kfd_dev *kfd, + struct queue_properties *q); + + void (*init_mqd)(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *q); int (*load_mqd)(struct mqd_manager *mm, void *mqd, @@ -73,7 +78,7 @@ struct mqd_manager { struct queue_properties *p, struct mm_struct *mms); - int (*update_mqd)(struct mqd_manager *mm, void *mqd, + void (*update_mqd)(struct mqd_manager *mm, void *mqd, struct queue_properties *q); int (*destroy_mqd)(struct mqd_manager *mm, void *mqd, @@ -81,7 +86,7 @@ struct mqd_manager { unsigned int timeout, uint32_t pipe_id, uint32_t queue_id); - void (*uninit_mqd)(struct mqd_manager *mm, void *mqd, + void (*free_mqd)(struct mqd_manager *mm, void *mqd, struct kfd_mem_obj *mqd_mem_obj); bool (*is_occupied)(struct mqd_manager *mm, void *mqd, @@ -99,8 +104,17 @@ struct mqd_manager { struct mutex mqd_mutex; struct kfd_dev *dev; + uint32_t mqd_size; }; +struct kfd_mem_obj *allocate_hiq_mqd(struct kfd_dev *dev, + struct queue_properties *q); + +struct kfd_mem_obj *allocate_sdma_mqd(struct kfd_dev *dev, + struct queue_properties *q); +void free_mqd_hiq_sdma(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj); + void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm, const uint32_t *cu_mask, uint32_t cu_mask_count, uint32_t *se_mask); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c index ae90a99909ef..28876aceb14b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c @@ -66,22 +66,33 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd, m->compute_static_thread_mgmt_se3); } -static int init_mqd(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, +static void set_priority(struct cik_mqd *m, struct queue_properties *q) +{ + m->cp_hqd_pipe_priority = pipe_priority_map[q->priority]; + m->cp_hqd_queue_priority = q->priority; +} + +static struct kfd_mem_obj *allocate_mqd(struct kfd_dev *kfd, + struct queue_properties *q) +{ + struct kfd_mem_obj *mqd_mem_obj; + + if (kfd_gtt_sa_allocate(kfd, sizeof(struct cik_mqd), + &mqd_mem_obj)) + return NULL; + + return mqd_mem_obj; +} + +static void init_mqd(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *q) { uint64_t addr; struct cik_mqd *m; - int retval; - - retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct cik_mqd), - mqd_mem_obj); - if (retval != 0) - return -ENOMEM; - - m = (struct cik_mqd *) (*mqd_mem_obj)->cpu_ptr; - addr = (*mqd_mem_obj)->gpu_addr; + m = (struct cik_mqd *) mqd_mem_obj->cpu_ptr; + addr = mqd_mem_obj->gpu_addr; memset(m, 0, ALIGN(sizeof(struct cik_mqd), 256)); @@ -116,8 +127,7 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, * 1 = CS_MEDIUM (typically between HP3D and GFX * 2 = CS_HIGH (typically above HP3D) */ - m->cp_hqd_pipe_priority = 1; - m->cp_hqd_queue_priority = 15; + set_priority(m, q); if (q->format == KFD_QUEUE_FORMAT_AQL) m->cp_hqd_iq_rptr = AQL_ENABLE; @@ -125,49 +135,32 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, *mqd = m; if (gart_addr) *gart_addr = addr; - retval = mm->update_mqd(mm, m, q); - - return retval; + mm->update_mqd(mm, m, q); } -static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, +static void init_mqd_sdma(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *q) { - int retval; struct cik_sdma_rlc_registers *m; - retval = kfd_gtt_sa_allocate(mm->dev, - sizeof(struct cik_sdma_rlc_registers), - mqd_mem_obj); - - if (retval != 0) - return -ENOMEM; - - m = (struct cik_sdma_rlc_registers *) (*mqd_mem_obj)->cpu_ptr; + m = (struct cik_sdma_rlc_registers *) mqd_mem_obj->cpu_ptr; memset(m, 0, sizeof(struct cik_sdma_rlc_registers)); *mqd = m; if (gart_addr) - *gart_addr = (*mqd_mem_obj)->gpu_addr; + *gart_addr = mqd_mem_obj->gpu_addr; - retval = mm->update_mqd(mm, m, q); - - return retval; + mm->update_mqd(mm, m, q); } -static void uninit_mqd(struct mqd_manager *mm, void *mqd, +static void free_mqd(struct mqd_manager *mm, void *mqd, struct kfd_mem_obj *mqd_mem_obj) { kfd_gtt_sa_free(mm->dev, mqd_mem_obj); } -static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, - struct kfd_mem_obj *mqd_mem_obj) -{ - kfd_gtt_sa_free(mm->dev, mqd_mem_obj); -} static int load_mqd(struct mqd_manager *mm, void *mqd, uint32_t pipe_id, uint32_t queue_id, struct queue_properties *p, @@ -191,7 +184,7 @@ static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, mms); } -static int __update_mqd(struct mqd_manager *mm, void *mqd, +static void __update_mqd(struct mqd_manager *mm, void *mqd, struct queue_properties *q, unsigned int atc_bit) { struct cik_mqd *m; @@ -222,28 +215,24 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, m->cp_hqd_pq_control |= NO_UPDATE_RPTR; update_cu_mask(mm, mqd, q); + set_priority(m, q); - q->is_active = (q->queue_size > 0 && - q->queue_address != 0 && - q->queue_percent > 0 && - !q->is_evicted); - - return 0; + q->is_active = QUEUE_IS_ACTIVE(*q); } -static int update_mqd(struct mqd_manager *mm, void *mqd, +static void update_mqd(struct mqd_manager *mm, void *mqd, struct queue_properties *q) { - return __update_mqd(mm, mqd, q, 1); + __update_mqd(mm, mqd, q, 1); } -static int update_mqd_hawaii(struct mqd_manager *mm, void *mqd, +static void update_mqd_hawaii(struct mqd_manager *mm, void *mqd, struct queue_properties *q) { - return __update_mqd(mm, mqd, q, 0); + __update_mqd(mm, mqd, q, 0); } -static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, +static void update_mqd_sdma(struct mqd_manager *mm, void *mqd, struct queue_properties *q) { struct cik_sdma_rlc_registers *m; @@ -267,12 +256,7 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, m->sdma_engine_id = q->sdma_engine_id; m->sdma_queue_id = q->sdma_queue_id; - q->is_active = (q->queue_size > 0 && - q->queue_address != 0 && - q->queue_percent > 0 && - !q->is_evicted); - - return 0; + q->is_active = QUEUE_IS_ACTIVE(*q); } static int destroy_mqd(struct mqd_manager *mm, void *mqd, @@ -319,14 +303,14 @@ static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, * queues but with different initial values. */ -static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, +static void init_mqd_hiq(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *q) { - return init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); + init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); } -static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, +static void update_mqd_hiq(struct mqd_manager *mm, void *mqd, struct queue_properties *q) { struct cik_mqd *m; @@ -350,12 +334,9 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, m->cp_hqd_vmid = q->vmid; - q->is_active = (q->queue_size > 0 && - q->queue_address != 0 && - q->queue_percent > 0 && - !q->is_evicted); + q->is_active = QUEUE_IS_ACTIVE(*q); - return 0; + set_priority(m, q); } #if defined(CONFIG_DEBUG_FS) @@ -394,34 +375,53 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, switch (type) { case KFD_MQD_TYPE_CP: case KFD_MQD_TYPE_COMPUTE: + mqd->allocate_mqd = allocate_mqd; mqd->init_mqd = init_mqd; - mqd->uninit_mqd = uninit_mqd; + mqd->free_mqd = free_mqd; mqd->load_mqd = load_mqd; mqd->update_mqd = update_mqd; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; + mqd->mqd_size = sizeof(struct cik_mqd); #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif break; case KFD_MQD_TYPE_HIQ: + mqd->allocate_mqd = allocate_hiq_mqd; + mqd->init_mqd = init_mqd_hiq; + mqd->free_mqd = free_mqd_hiq_sdma; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd_hiq; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; + mqd->mqd_size = sizeof(struct cik_mqd); +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + break; + case KFD_MQD_TYPE_DIQ: + mqd->allocate_mqd = allocate_hiq_mqd; mqd->init_mqd = init_mqd_hiq; - mqd->uninit_mqd = uninit_mqd; + mqd->free_mqd = free_mqd; mqd->load_mqd = load_mqd; mqd->update_mqd = update_mqd_hiq; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; + mqd->mqd_size = sizeof(struct cik_mqd); #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif break; case KFD_MQD_TYPE_SDMA: + mqd->allocate_mqd = allocate_sdma_mqd; mqd->init_mqd = init_mqd_sdma; - mqd->uninit_mqd = uninit_mqd_sdma; + mqd->free_mqd = free_mqd_hiq_sdma; mqd->load_mqd = load_mqd_sdma; mqd->update_mqd = update_mqd_sdma; mqd->destroy_mqd = destroy_mqd_sdma; mqd->is_occupied = is_occupied_sdma; + mqd->mqd_size = sizeof(struct cik_sdma_rlc_registers); #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c new file mode 100644 index 000000000000..4a236b2c2354 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c @@ -0,0 +1,468 @@ +/* + * Copyright 2018 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include "kfd_priv.h" +#include "kfd_mqd_manager.h" +#include "v10_structs.h" +#include "gc/gc_10_1_0_offset.h" +#include "gc/gc_10_1_0_sh_mask.h" +#include "amdgpu_amdkfd.h" + +static inline struct v10_compute_mqd *get_mqd(void *mqd) +{ + return (struct v10_compute_mqd *)mqd; +} + +static inline struct v10_sdma_mqd *get_sdma_mqd(void *mqd) +{ + return (struct v10_sdma_mqd *)mqd; +} + +static void update_cu_mask(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct v10_compute_mqd *m; + uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */ + + if (q->cu_mask_count == 0) + return; + + mqd_symmetrically_map_cu_mask(mm, + q->cu_mask, q->cu_mask_count, se_mask); + + m = get_mqd(mqd); + m->compute_static_thread_mgmt_se0 = se_mask[0]; + m->compute_static_thread_mgmt_se1 = se_mask[1]; + m->compute_static_thread_mgmt_se2 = se_mask[2]; + m->compute_static_thread_mgmt_se3 = se_mask[3]; + + pr_debug("update cu mask to %#x %#x %#x %#x\n", + m->compute_static_thread_mgmt_se0, + m->compute_static_thread_mgmt_se1, + m->compute_static_thread_mgmt_se2, + m->compute_static_thread_mgmt_se3); +} + +static struct kfd_mem_obj *allocate_mqd(struct kfd_dev *kfd, + struct queue_properties *q) +{ + struct kfd_mem_obj *mqd_mem_obj; + + if (kfd_gtt_sa_allocate(kfd, sizeof(struct v10_compute_mqd), + &mqd_mem_obj)) + return NULL; + + return mqd_mem_obj; +} + +static void init_mqd(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + uint64_t addr; + struct v10_compute_mqd *m; + + m = (struct v10_compute_mqd *) mqd_mem_obj->cpu_ptr; + addr = mqd_mem_obj->gpu_addr; + + memset(m, 0, sizeof(struct v10_compute_mqd)); + + m->header = 0xC0310800; + m->compute_pipelinestat_enable = 1; + m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF; + + m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | + 0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; + + m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT; + + m->cp_mqd_base_addr_lo = lower_32_bits(addr); + m->cp_mqd_base_addr_hi = upper_32_bits(addr); + + m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT | + 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | + 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; + + m->cp_hqd_pipe_priority = 1; + m->cp_hqd_queue_priority = 15; + + if (q->format == KFD_QUEUE_FORMAT_AQL) { + m->cp_hqd_aql_control = + 1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT; + } + + if (mm->dev->cwsr_enabled) { + m->cp_hqd_persistent_state |= + (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); + m->cp_hqd_ctx_save_base_addr_lo = + lower_32_bits(q->ctx_save_restore_area_address); + m->cp_hqd_ctx_save_base_addr_hi = + upper_32_bits(q->ctx_save_restore_area_address); + m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; + m->cp_hqd_cntl_stack_size = q->ctl_stack_size; + m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; + m->cp_hqd_wg_state_offset = q->ctl_stack_size; + } + + *mqd = m; + if (gart_addr) + *gart_addr = addr; + mm->update_mqd(mm, m, q); +} + +static int load_mqd(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + struct queue_properties *p, struct mm_struct *mms) +{ + int r = 0; + /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ + uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0); + + r = mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, + (uint32_t __user *)p->write_ptr, + wptr_shift, 0, mms); + return r; +} + +static void update_mqd(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct v10_compute_mqd *m; + + m = get_mqd(mqd); + + m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT; + m->cp_hqd_pq_control |= + ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1; + pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); + + m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); + m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); + + m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr); + m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr); + + m->cp_hqd_pq_doorbell_control = + q->doorbell_off << + CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT; + pr_debug("cp_hqd_pq_doorbell_control 0x%x\n", + m->cp_hqd_pq_doorbell_control); + + m->cp_hqd_ib_control = 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT; + + /* + * HW does not clamp this field correctly. Maximum EOP queue size + * is constrained by per-SE EOP done signal count, which is 8-bit. + * Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit + * more than (EOP entry count - 1) so a queue size of 0x800 dwords + * is safe, giving a maximum field value of 0xA. + */ + m->cp_hqd_eop_control = min(0xA, + ffs(q->eop_ring_buffer_size / sizeof(unsigned int)) - 1 - 1); + m->cp_hqd_eop_base_addr_lo = + lower_32_bits(q->eop_ring_buffer_address >> 8); + m->cp_hqd_eop_base_addr_hi = + upper_32_bits(q->eop_ring_buffer_address >> 8); + + m->cp_hqd_iq_timer = 0; + + m->cp_hqd_vmid = q->vmid; + + if (q->format == KFD_QUEUE_FORMAT_AQL) { + /* GC 10 removed WPP_CLAMP from PQ Control */ + m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK | + 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT | + 1 << CP_HQD_PQ_CONTROL__QUEUE_FULL_EN__SHIFT ; + m->cp_hqd_pq_doorbell_control |= + 1 << CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT; + } + if (mm->dev->cwsr_enabled) + m->cp_hqd_ctx_save_control = 0; + + update_cu_mask(mm, mqd, q); + + q->is_active = (q->queue_size > 0 && + q->queue_address != 0 && + q->queue_percent > 0 && + !q->is_evicted); +} + +static int destroy_mqd(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_destroy + (mm->dev->kgd, mqd, type, timeout, + pipe_id, queue_id); +} + +static void free_mqd(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj) +{ + kfd_gtt_sa_free(mm->dev, mqd_mem_obj); +} + +static bool is_occupied(struct mqd_manager *mm, void *mqd, + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_is_occupied( + mm->dev->kgd, queue_address, + pipe_id, queue_id); +} + +static int get_wave_state(struct mqd_manager *mm, void *mqd, + void __user *ctl_stack, + u32 *ctl_stack_used_size, + u32 *save_area_used_size) +{ + struct v10_compute_mqd *m; + + /* Control stack is located one page after MQD. */ + void *mqd_ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE); + + m = get_mqd(mqd); + + *ctl_stack_used_size = m->cp_hqd_cntl_stack_size - + m->cp_hqd_cntl_stack_offset; + *save_area_used_size = m->cp_hqd_wg_state_offset - + m->cp_hqd_cntl_stack_size; + + if (copy_to_user(ctl_stack, mqd_ctl_stack, m->cp_hqd_cntl_stack_size)) + return -EFAULT; + + return 0; +} + +static void init_mqd_hiq(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + struct v10_compute_mqd *m; + + init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); + + m = get_mqd(*mqd); + + m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT | + 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT; +} + +static void update_mqd_hiq(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct v10_compute_mqd *m; + + update_mqd(mm, mqd, q); + + /* TODO: what's the point? update_mqd already does this. */ + m = get_mqd(mqd); + m->cp_hqd_vmid = q->vmid; +} + +static void init_mqd_sdma(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + struct v10_sdma_mqd *m; + + m = (struct v10_sdma_mqd *) mqd_mem_obj->cpu_ptr; + + memset(m, 0, sizeof(struct v10_sdma_mqd)); + + *mqd = m; + if (gart_addr) + *gart_addr = mqd_mem_obj->gpu_addr; + + mm->update_mqd(mm, m, q); +} + +static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + struct queue_properties *p, struct mm_struct *mms) +{ + return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd, + (uint32_t __user *)p->write_ptr, + mms); +} + +#define SDMA_RLC_DUMMY_DEFAULT 0xf + +static void update_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct v10_sdma_mqd *m; + + m = get_sdma_mqd(mqd); + m->sdmax_rlcx_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1) + << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | + q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | + 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | + 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; + + m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8); + m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); + m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->sdmax_rlcx_doorbell_offset = + q->doorbell_off << SDMA0_RLC0_DOORBELL_OFFSET__OFFSET__SHIFT; + + m->sdma_engine_id = q->sdma_engine_id; + m->sdma_queue_id = q->sdma_queue_id; + m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT; + + + q->is_active = (q->queue_size > 0 && + q->queue_address != 0 && + q->queue_percent > 0 && + !q->is_evicted); +} + +/* + * * preempt type here is ignored because there is only one way + * * to preempt sdma queue + */ +static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout); +} + +static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd); +} + +#if defined(CONFIG_DEBUG_FS) + +static int debugfs_show_mqd(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct v10_compute_mqd), false); + return 0; +} + +static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct v10_sdma_mqd), false); + return 0; +} + +#endif + +struct mqd_manager *mqd_manager_init_v10(enum KFD_MQD_TYPE type, + struct kfd_dev *dev) +{ + struct mqd_manager *mqd; + + if (WARN_ON(type >= KFD_MQD_TYPE_MAX)) + return NULL; + + mqd = kzalloc(sizeof(*mqd), GFP_NOIO); + if (!mqd) + return NULL; + + mqd->dev = dev; + + switch (type) { + case KFD_MQD_TYPE_CP: + case KFD_MQD_TYPE_COMPUTE: + pr_debug("%s@%i\n", __func__, __LINE__); + mqd->allocate_mqd = allocate_mqd; + mqd->init_mqd = init_mqd; + mqd->free_mqd = free_mqd; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; + mqd->mqd_size = sizeof(struct v10_compute_mqd); + mqd->get_wave_state = get_wave_state; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + pr_debug("%s@%i\n", __func__, __LINE__); + break; + case KFD_MQD_TYPE_HIQ: + pr_debug("%s@%i\n", __func__, __LINE__); + mqd->allocate_mqd = allocate_hiq_mqd; + mqd->init_mqd = init_mqd_hiq; + mqd->free_mqd = free_mqd_hiq_sdma; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd_hiq; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; + mqd->mqd_size = sizeof(struct v10_compute_mqd); +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + pr_debug("%s@%i\n", __func__, __LINE__); + break; + case KFD_MQD_TYPE_DIQ: + mqd->allocate_mqd = allocate_hiq_mqd; + mqd->init_mqd = init_mqd_hiq; + mqd->free_mqd = free_mqd; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd_hiq; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; + mqd->mqd_size = sizeof(struct v10_compute_mqd); +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + break; + case KFD_MQD_TYPE_SDMA: + pr_debug("%s@%i\n", __func__, __LINE__); + mqd->allocate_mqd = allocate_sdma_mqd; + mqd->init_mqd = init_mqd_sdma; + mqd->free_mqd = free_mqd_hiq_sdma; + mqd->load_mqd = load_mqd_sdma; + mqd->update_mqd = update_mqd_sdma; + mqd->destroy_mqd = destroy_mqd_sdma; + mqd->is_occupied = is_occupied_sdma; + mqd->mqd_size = sizeof(struct v10_sdma_mqd); +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; +#endif + pr_debug("%s@%i\n", __func__, __LINE__); + break; + default: + kfree(mqd); + return NULL; + } + + return mqd; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c index 9dbba609450e..d3380c5bdbde 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c @@ -46,7 +46,7 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd, struct queue_properties *q) { struct v9_mqd *m; - uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */ + uint32_t se_mask[KFD_MAX_NUM_SE] = {0}; if (q->cu_mask_count == 0) return; @@ -59,45 +59,71 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd, m->compute_static_thread_mgmt_se1 = se_mask[1]; m->compute_static_thread_mgmt_se2 = se_mask[2]; m->compute_static_thread_mgmt_se3 = se_mask[3]; + m->compute_static_thread_mgmt_se4 = se_mask[4]; + m->compute_static_thread_mgmt_se5 = se_mask[5]; + m->compute_static_thread_mgmt_se6 = se_mask[6]; + m->compute_static_thread_mgmt_se7 = se_mask[7]; - pr_debug("update cu mask to %#x %#x %#x %#x\n", + pr_debug("update cu mask to %#x %#x %#x %#x %#x %#x %#x %#x\n", m->compute_static_thread_mgmt_se0, m->compute_static_thread_mgmt_se1, m->compute_static_thread_mgmt_se2, - m->compute_static_thread_mgmt_se3); + m->compute_static_thread_mgmt_se3, + m->compute_static_thread_mgmt_se4, + m->compute_static_thread_mgmt_se5, + m->compute_static_thread_mgmt_se6, + m->compute_static_thread_mgmt_se7); } -static int init_mqd(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, - struct queue_properties *q) +static void set_priority(struct v9_mqd *m, struct queue_properties *q) +{ + m->cp_hqd_pipe_priority = pipe_priority_map[q->priority]; + m->cp_hqd_queue_priority = q->priority; +} + +static struct kfd_mem_obj *allocate_mqd(struct kfd_dev *kfd, + struct queue_properties *q) { int retval; - uint64_t addr; - struct v9_mqd *m; - struct kfd_dev *kfd = mm->dev; + struct kfd_mem_obj *mqd_mem_obj = NULL; /* From V9, for CWSR, the control stack is located on the next page * boundary after the mqd, we will use the gtt allocation function * instead of sub-allocation function. */ if (kfd->cwsr_enabled && (q->type == KFD_QUEUE_TYPE_COMPUTE)) { - *mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL); - if (!*mqd_mem_obj) - return -ENOMEM; + mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO); + if (!mqd_mem_obj) + return NULL; retval = amdgpu_amdkfd_alloc_gtt_mem(kfd->kgd, ALIGN(q->ctl_stack_size, PAGE_SIZE) + ALIGN(sizeof(struct v9_mqd), PAGE_SIZE), - &((*mqd_mem_obj)->gtt_mem), - &((*mqd_mem_obj)->gpu_addr), - (void *)&((*mqd_mem_obj)->cpu_ptr), true); - } else - retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct v9_mqd), - mqd_mem_obj); - if (retval != 0) - return -ENOMEM; - - m = (struct v9_mqd *) (*mqd_mem_obj)->cpu_ptr; - addr = (*mqd_mem_obj)->gpu_addr; + &(mqd_mem_obj->gtt_mem), + &(mqd_mem_obj->gpu_addr), + (void *)&(mqd_mem_obj->cpu_ptr), true); + } else { + retval = kfd_gtt_sa_allocate(kfd, sizeof(struct v9_mqd), + &mqd_mem_obj); + } + + if (retval) { + kfree(mqd_mem_obj); + return NULL; + } + + return mqd_mem_obj; + +} + +static void init_mqd(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + uint64_t addr; + struct v9_mqd *m; + + m = (struct v9_mqd *) mqd_mem_obj->cpu_ptr; + addr = mqd_mem_obj->gpu_addr; memset(m, 0, sizeof(struct v9_mqd)); @@ -107,6 +133,10 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF; m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF; m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se4 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se5 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se6 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se7 = 0xFFFFFFFF; m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | 0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; @@ -120,9 +150,6 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; - m->cp_hqd_pipe_priority = 1; - m->cp_hqd_queue_priority = 15; - if (q->format == KFD_QUEUE_FORMAT_AQL) { m->cp_hqd_aql_control = 1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT; @@ -149,9 +176,7 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, *mqd = m; if (gart_addr) *gart_addr = addr; - retval = mm->update_mqd(mm, m, q); - - return retval; + mm->update_mqd(mm, m, q); } static int load_mqd(struct mqd_manager *mm, void *mqd, @@ -166,7 +191,7 @@ static int load_mqd(struct mqd_manager *mm, void *mqd, wptr_shift, 0, mms); } -static int update_mqd(struct mqd_manager *mm, void *mqd, +static void update_mqd(struct mqd_manager *mm, void *mqd, struct queue_properties *q) { struct v9_mqd *m; @@ -225,13 +250,9 @@ static int update_mqd(struct mqd_manager *mm, void *mqd, m->cp_hqd_ctx_save_control = 0; update_cu_mask(mm, mqd, q); + set_priority(m, q); - q->is_active = (q->queue_size > 0 && - q->queue_address != 0 && - q->queue_percent > 0 && - !q->is_evicted); - - return 0; + q->is_active = QUEUE_IS_ACTIVE(*q); } @@ -245,7 +266,7 @@ static int destroy_mqd(struct mqd_manager *mm, void *mqd, pipe_id, queue_id); } -static void uninit_mqd(struct mqd_manager *mm, void *mqd, +static void free_mqd(struct mqd_manager *mm, void *mqd, struct kfd_mem_obj *mqd_mem_obj) { struct kfd_dev *kfd = mm->dev; @@ -289,71 +310,47 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd, return 0; } -static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, +static void init_mqd_hiq(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *q) { struct v9_mqd *m; - int retval = init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); - if (retval != 0) - return retval; + init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); m = get_mqd(*mqd); m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT | 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT; - - return retval; } -static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, +static void update_mqd_hiq(struct mqd_manager *mm, void *mqd, struct queue_properties *q) { struct v9_mqd *m; - int retval = update_mqd(mm, mqd, q); - if (retval != 0) - return retval; + update_mqd(mm, mqd, q); /* TODO: what's the point? update_mqd already does this. */ m = get_mqd(mqd); m->cp_hqd_vmid = q->vmid; - return retval; } -static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, +static void init_mqd_sdma(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *q) { - int retval; struct v9_sdma_mqd *m; - - retval = kfd_gtt_sa_allocate(mm->dev, - sizeof(struct v9_sdma_mqd), - mqd_mem_obj); - - if (retval != 0) - return -ENOMEM; - - m = (struct v9_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr; + m = (struct v9_sdma_mqd *) mqd_mem_obj->cpu_ptr; memset(m, 0, sizeof(struct v9_sdma_mqd)); *mqd = m; if (gart_addr) - *gart_addr = (*mqd_mem_obj)->gpu_addr; - - retval = mm->update_mqd(mm, m, q); + *gart_addr = mqd_mem_obj->gpu_addr; - return retval; -} - -static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, - struct kfd_mem_obj *mqd_mem_obj) -{ - kfd_gtt_sa_free(mm->dev, mqd_mem_obj); + mm->update_mqd(mm, m, q); } static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, @@ -367,7 +364,7 @@ static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, #define SDMA_RLC_DUMMY_DEFAULT 0xf -static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, +static void update_mqd_sdma(struct mqd_manager *mm, void *mqd, struct queue_properties *q) { struct v9_sdma_mqd *m; @@ -390,12 +387,7 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, m->sdma_queue_id = q->sdma_queue_id; m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT; - q->is_active = (q->queue_size > 0 && - q->queue_address != 0 && - q->queue_percent > 0 && - !q->is_evicted); - - return 0; + q->is_active = QUEUE_IS_ACTIVE(*q); } /* @@ -452,35 +444,54 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, switch (type) { case KFD_MQD_TYPE_CP: case KFD_MQD_TYPE_COMPUTE: + mqd->allocate_mqd = allocate_mqd; mqd->init_mqd = init_mqd; - mqd->uninit_mqd = uninit_mqd; + mqd->free_mqd = free_mqd; mqd->load_mqd = load_mqd; mqd->update_mqd = update_mqd; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; mqd->get_wave_state = get_wave_state; + mqd->mqd_size = sizeof(struct v9_mqd); #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif break; case KFD_MQD_TYPE_HIQ: + mqd->allocate_mqd = allocate_hiq_mqd; + mqd->init_mqd = init_mqd_hiq; + mqd->free_mqd = free_mqd_hiq_sdma; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd_hiq; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; + mqd->mqd_size = sizeof(struct v9_mqd); +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + break; + case KFD_MQD_TYPE_DIQ: + mqd->allocate_mqd = allocate_hiq_mqd; mqd->init_mqd = init_mqd_hiq; - mqd->uninit_mqd = uninit_mqd; + mqd->free_mqd = free_mqd; mqd->load_mqd = load_mqd; mqd->update_mqd = update_mqd_hiq; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; + mqd->mqd_size = sizeof(struct v9_mqd); #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif break; case KFD_MQD_TYPE_SDMA: + mqd->allocate_mqd = allocate_sdma_mqd; mqd->init_mqd = init_mqd_sdma; - mqd->uninit_mqd = uninit_mqd_sdma; + mqd->free_mqd = free_mqd_hiq_sdma; mqd->load_mqd = load_mqd_sdma; mqd->update_mqd = update_mqd_sdma; mqd->destroy_mqd = destroy_mqd_sdma; mqd->is_occupied = is_occupied_sdma; + mqd->mqd_size = sizeof(struct v9_sdma_mqd); #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c index 6469b3456f00..7d144f56f421 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c @@ -31,6 +31,7 @@ #include "gca/gfx_8_0_sh_mask.h" #include "gca/gfx_8_0_enum.h" #include "oss/oss_3_0_sh_mask.h" + #define CP_MQD_CONTROL__PRIV_STATE__SHIFT 0x8 static inline struct vi_mqd *get_mqd(void *mqd) @@ -68,21 +69,33 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd, m->compute_static_thread_mgmt_se3); } -static int init_mqd(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, +static void set_priority(struct vi_mqd *m, struct queue_properties *q) +{ + m->cp_hqd_pipe_priority = pipe_priority_map[q->priority]; + m->cp_hqd_queue_priority = q->priority; +} + +static struct kfd_mem_obj *allocate_mqd(struct kfd_dev *kfd, + struct queue_properties *q) +{ + struct kfd_mem_obj *mqd_mem_obj; + + if (kfd_gtt_sa_allocate(kfd, sizeof(struct vi_mqd), + &mqd_mem_obj)) + return NULL; + + return mqd_mem_obj; +} + +static void init_mqd(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *q) { - int retval; uint64_t addr; struct vi_mqd *m; - retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct vi_mqd), - mqd_mem_obj); - if (retval != 0) - return -ENOMEM; - - m = (struct vi_mqd *) (*mqd_mem_obj)->cpu_ptr; - addr = (*mqd_mem_obj)->gpu_addr; + m = (struct vi_mqd *) mqd_mem_obj->cpu_ptr; + addr = mqd_mem_obj->gpu_addr; memset(m, 0, sizeof(struct vi_mqd)); @@ -106,9 +119,7 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; - m->cp_hqd_pipe_priority = 1; - m->cp_hqd_queue_priority = 15; - + set_priority(m, q); m->cp_hqd_eop_rptr = 1 << CP_HQD_EOP_RPTR__INIT_FETCHER__SHIFT; if (q->format == KFD_QUEUE_FORMAT_AQL) @@ -139,9 +150,7 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, *mqd = m; if (gart_addr) *gart_addr = addr; - retval = mm->update_mqd(mm, m, q); - - return retval; + mm->update_mqd(mm, m, q); } static int load_mqd(struct mqd_manager *mm, void *mqd, @@ -157,7 +166,7 @@ static int load_mqd(struct mqd_manager *mm, void *mqd, wptr_shift, wptr_mask, mms); } -static int __update_mqd(struct mqd_manager *mm, void *mqd, +static void __update_mqd(struct mqd_manager *mm, void *mqd, struct queue_properties *q, unsigned int mtype, unsigned int atc_bit) { @@ -222,26 +231,22 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, mtype << CP_HQD_CTX_SAVE_CONTROL__MTYPE__SHIFT; update_cu_mask(mm, mqd, q); + set_priority(m, q); - q->is_active = (q->queue_size > 0 && - q->queue_address != 0 && - q->queue_percent > 0 && - !q->is_evicted); - - return 0; + q->is_active = QUEUE_IS_ACTIVE(*q); } -static int update_mqd(struct mqd_manager *mm, void *mqd, +static void update_mqd(struct mqd_manager *mm, void *mqd, struct queue_properties *q) { - return __update_mqd(mm, mqd, q, MTYPE_CC, 1); + __update_mqd(mm, mqd, q, MTYPE_CC, 1); } -static int update_mqd_tonga(struct mqd_manager *mm, void *mqd, +static void update_mqd_tonga(struct mqd_manager *mm, void *mqd, struct queue_properties *q) { - return __update_mqd(mm, mqd, q, MTYPE_UC, 0); + __update_mqd(mm, mqd, q, MTYPE_UC, 0); } static int destroy_mqd(struct mqd_manager *mm, void *mqd, @@ -254,7 +259,7 @@ static int destroy_mqd(struct mqd_manager *mm, void *mqd, pipe_id, queue_id); } -static void uninit_mqd(struct mqd_manager *mm, void *mqd, +static void free_mqd(struct mqd_manager *mm, void *mqd, struct kfd_mem_obj *mqd_mem_obj) { kfd_gtt_sa_free(mm->dev, mqd_mem_obj); @@ -291,70 +296,44 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd, return 0; } -static int init_mqd_hiq(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, +static void init_mqd_hiq(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *q) { struct vi_mqd *m; - int retval = init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); - - if (retval != 0) - return retval; + init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q); m = get_mqd(*mqd); m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT | 1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT; - - return retval; } -static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, +static void update_mqd_hiq(struct mqd_manager *mm, void *mqd, struct queue_properties *q) { struct vi_mqd *m; - int retval = __update_mqd(mm, mqd, q, MTYPE_UC, 0); - - if (retval != 0) - return retval; + __update_mqd(mm, mqd, q, MTYPE_UC, 0); m = get_mqd(mqd); m->cp_hqd_vmid = q->vmid; - return retval; } -static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, - struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, +static void init_mqd_sdma(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *q) { - int retval; struct vi_sdma_mqd *m; - - retval = kfd_gtt_sa_allocate(mm->dev, - sizeof(struct vi_sdma_mqd), - mqd_mem_obj); - - if (retval != 0) - return -ENOMEM; - - m = (struct vi_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr; + m = (struct vi_sdma_mqd *) mqd_mem_obj->cpu_ptr; memset(m, 0, sizeof(struct vi_sdma_mqd)); *mqd = m; - if (gart_addr != NULL) - *gart_addr = (*mqd_mem_obj)->gpu_addr; - - retval = mm->update_mqd(mm, m, q); - - return retval; -} + if (gart_addr) + *gart_addr = mqd_mem_obj->gpu_addr; -static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, - struct kfd_mem_obj *mqd_mem_obj) -{ - kfd_gtt_sa_free(mm->dev, mqd_mem_obj); + mm->update_mqd(mm, m, q); } static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, @@ -366,7 +345,7 @@ static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, mms); } -static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, +static void update_mqd_sdma(struct mqd_manager *mm, void *mqd, struct queue_properties *q) { struct vi_sdma_mqd *m; @@ -390,12 +369,7 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, m->sdma_engine_id = q->sdma_engine_id; m->sdma_queue_id = q->sdma_queue_id; - q->is_active = (q->queue_size > 0 && - q->queue_address != 0 && - q->queue_percent > 0 && - !q->is_evicted); - - return 0; + q->is_active = QUEUE_IS_ACTIVE(*q); } /* @@ -452,35 +426,54 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, switch (type) { case KFD_MQD_TYPE_CP: case KFD_MQD_TYPE_COMPUTE: + mqd->allocate_mqd = allocate_mqd; mqd->init_mqd = init_mqd; - mqd->uninit_mqd = uninit_mqd; + mqd->free_mqd = free_mqd; mqd->load_mqd = load_mqd; mqd->update_mqd = update_mqd; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; mqd->get_wave_state = get_wave_state; + mqd->mqd_size = sizeof(struct vi_mqd); #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif break; case KFD_MQD_TYPE_HIQ: + mqd->allocate_mqd = allocate_hiq_mqd; + mqd->init_mqd = init_mqd_hiq; + mqd->free_mqd = free_mqd_hiq_sdma; + mqd->load_mqd = load_mqd; + mqd->update_mqd = update_mqd_hiq; + mqd->destroy_mqd = destroy_mqd; + mqd->is_occupied = is_occupied; + mqd->mqd_size = sizeof(struct vi_mqd); +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif + break; + case KFD_MQD_TYPE_DIQ: + mqd->allocate_mqd = allocate_hiq_mqd; mqd->init_mqd = init_mqd_hiq; - mqd->uninit_mqd = uninit_mqd; + mqd->free_mqd = free_mqd; mqd->load_mqd = load_mqd; mqd->update_mqd = update_mqd_hiq; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; + mqd->mqd_size = sizeof(struct vi_mqd); #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; #endif break; case KFD_MQD_TYPE_SDMA: + mqd->allocate_mqd = allocate_sdma_mqd; mqd->init_mqd = init_mqd_sdma; - mqd->uninit_mqd = uninit_mqd_sdma; + mqd->free_mqd = free_mqd_hiq_sdma; mqd->load_mqd = load_mqd_sdma; mqd->update_mqd = update_mqd_sdma; mqd->destroy_mqd = destroy_mqd_sdma; mqd->is_occupied = is_occupied_sdma; + mqd->mqd_size = sizeof(struct vi_sdma_mqd); #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c index 045a229436a0..83ef4b3dd2fb 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c @@ -48,7 +48,8 @@ static void pm_calc_rlib_size(struct packet_manager *pm, process_count = pm->dqm->processes_count; queue_count = pm->dqm->queue_count; - compute_queue_count = queue_count - pm->dqm->sdma_queue_count; + compute_queue_count = queue_count - pm->dqm->sdma_queue_count - + pm->dqm->xgmi_sdma_queue_count; /* check if there is over subscription * Note: the arbitration between the number of VMIDs and @@ -202,11 +203,15 @@ static int pm_create_runlist_ib(struct packet_manager *pm, pr_debug("Finished map process and queues to runlist\n"); - if (is_over_subscription) + if (is_over_subscription) { + if (!pm->is_over_subscription) + pr_warn("Runlist is getting oversubscribed. Expect reduced ROCm performance.\n"); retval = pm->pmf->runlist(pm, &rl_buffer[rl_wptr], *rl_gpu_addr, alloc_size_bytes / sizeof(uint32_t), true); + } + pm->is_over_subscription = is_over_subscription; for (i = 0; i < alloc_size_bytes / sizeof(uint32_t); i++) pr_debug("0x%2X ", rl_buffer[i]); @@ -227,14 +232,22 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) case CHIP_POLARIS10: case CHIP_POLARIS11: case CHIP_POLARIS12: + case CHIP_VEGAM: pm->pmf = &kfd_vi_pm_funcs; break; case CHIP_VEGA10: case CHIP_VEGA12: case CHIP_VEGA20: case CHIP_RAVEN: + case CHIP_RENOIR: + case CHIP_ARCTURUS: pm->pmf = &kfd_v9_pm_funcs; break; + case CHIP_NAVI10: + case CHIP_NAVI12: + case CHIP_NAVI14: + pm->pmf = &kfd_v10_pm_funcs; + break; default: WARN(1, "Unexpected ASIC family %u", dqm->dev->device_info->asic_family); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h index f2bcf5c092ea..4d7add843746 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h @@ -83,10 +83,10 @@ struct pm4_mes_set_resources { union { struct { - uint32_t gds_heap_base:6; - uint32_t reserved3:5; - uint32_t gds_heap_size:6; - uint32_t reserved4:15; + uint32_t gds_heap_base:10; + uint32_t reserved3:1; + uint32_t gds_heap_size:10; + uint32_t reserved4:11; } bitfields8; uint32_t ordinal8; }; @@ -120,7 +120,7 @@ struct pm4_mes_runlist { uint32_t ib_size:20; uint32_t chain:1; uint32_t offload_polling:1; - uint32_t reserved2:1; + uint32_t chained_runlist_idle_disable:1; uint32_t valid:1; uint32_t process_cnt:4; uint32_t reserved3:4; @@ -176,11 +176,10 @@ struct pm4_mes_map_process { union { struct { - uint32_t num_gws:6; - uint32_t reserved7:1; + uint32_t num_gws:7; uint32_t sdma_enable:1; uint32_t num_oac:4; - uint32_t reserved8:4; + uint32_t gds_size_hi:4; uint32_t gds_size:6; uint32_t num_queues:10; } bitfields14; @@ -255,17 +254,16 @@ enum mes_map_queues_queue_type_enum { queue_type__mes_map_queues__low_latency_static_queue_vi = 3 }; -enum mes_map_queues_alloc_format_enum { - alloc_format__mes_map_queues__one_per_pipe_vi = 0, -alloc_format__mes_map_queues__all_on_one_pipe_vi = 1 -}; - enum mes_map_queues_engine_sel_enum { engine_sel__mes_map_queues__compute_vi = 0, engine_sel__mes_map_queues__sdma0_vi = 2, engine_sel__mes_map_queues__sdma1_vi = 3 }; +enum mes_map_queues_extended_engine_sel_enum { + extended_engine_sel__mes_map_queues__legacy_engine_sel = 0, + extended_engine_sel__mes_map_queues__sdma0_to_7_sel = 1 +}; struct pm4_mes_map_queues { union { @@ -275,11 +273,14 @@ struct pm4_mes_map_queues { union { struct { - uint32_t reserved1:4; + uint32_t reserved1:2; + enum mes_map_queues_extended_engine_sel_enum extended_engine_sel:2; enum mes_map_queues_queue_sel_enum queue_sel:2; - uint32_t reserved2:15; + uint32_t reserved5:6; + uint32_t gws_control_queue:1; + uint32_t reserved2:8; enum mes_map_queues_queue_type_enum queue_type:3; - enum mes_map_queues_alloc_format_enum alloc_format:2; + uint32_t reserved3:2; enum mes_map_queues_engine_sel_enum engine_sel:3; uint32_t num_queues:3; } bitfields2; @@ -386,6 +387,11 @@ enum mes_unmap_queues_engine_sel_enum { engine_sel__mes_unmap_queues__sdmal = 3 }; +enum mes_unmap_queues_extended_engine_sel_enum { + extended_engine_sel__mes_unmap_queues__legacy_engine_sel = 0, + extended_engine_sel__mes_unmap_queues__sdma0_to_7_sel = 1 +}; + struct pm4_mes_unmap_queues { union { union PM4_MES_TYPE_3_HEADER header; /* header */ @@ -395,7 +401,7 @@ struct pm4_mes_unmap_queues { union { struct { enum mes_unmap_queues_action_enum action:2; - uint32_t reserved1:2; + enum mes_unmap_queues_extended_engine_sel_enum extended_engine_sel:2; enum mes_unmap_queues_queue_sel_enum queue_sel:2; uint32_t reserved2:20; enum mes_unmap_queues_engine_sel_enum engine_sel:3; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h index 7c8d9b357749..5466cfe1c3cc 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h @@ -216,11 +216,6 @@ enum mes_map_queues_queue_type_vi_enum { queue_type__mes_map_queues__low_latency_static_queue_vi = 3 }; -enum mes_map_queues_alloc_format_vi_enum { - alloc_format__mes_map_queues__one_per_pipe_vi = 0, -alloc_format__mes_map_queues__all_on_one_pipe_vi = 1 -}; - enum mes_map_queues_engine_sel_vi_enum { engine_sel__mes_map_queues__compute_vi = 0, engine_sel__mes_map_queues__sdma0_vi = 2, @@ -240,7 +235,7 @@ struct pm4_mes_map_queues { enum mes_map_queues_queue_sel_vi_enum queue_sel:2; uint32_t reserved2:15; enum mes_map_queues_queue_type_vi_enum queue_type:3; - enum mes_map_queues_alloc_format_vi_enum alloc_format:2; + uint32_t reserved3:2; enum mes_map_queues_engine_sel_vi_enum engine_sel:3; uint32_t num_queues:3; } bitfields2; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 0eeee3c6d6dc..060a9e8b301e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -35,6 +35,11 @@ #include <linux/kfifo.h> #include <linux/seq_file.h> #include <linux/kref.h> +#include <linux/sysfs.h> +#include <linux/device_cgroup.h> +#include <drm/drm_file.h> +#include <drm/drm_drv.h> +#include <drm/drm_device.h> #include <kgd_kfd_interface.h> #include "amd_shared.h" @@ -59,6 +64,7 @@ #define KFD_MMAP_TYPE_DOORBELL (0x3ULL << KFD_MMAP_TYPE_SHIFT) #define KFD_MMAP_TYPE_EVENTS (0x2ULL << KFD_MMAP_TYPE_SHIFT) #define KFD_MMAP_TYPE_RESERVED_MEM (0x1ULL << KFD_MMAP_TYPE_SHIFT) +#define KFD_MMAP_TYPE_MMIO (0x0ULL << KFD_MMAP_TYPE_SHIFT) #define KFD_MMAP_GPU_ID_SHIFT (46 - PAGE_SHIFT) #define KFD_MMAP_GPU_ID_MASK (((1ULL << KFD_GPU_ID_HASH_WIDTH) - 1) \ @@ -103,6 +109,8 @@ #define KFD_KERNEL_QUEUE_SIZE 2048 +#define KFD_UNMAP_LATENCY_MS (4000) + /* * 512 = 0x200 * The doorbell index distance between SDMA RLC (2*i) and (2*i+1) in the @@ -153,13 +161,23 @@ extern int ignore_crat; /* * Set sh_mem_config.retry_disable on Vega10 */ -extern int noretry; +extern int amdgpu_noretry; /* * Halt if HWS hang is detected */ extern int halt_if_hws_hang; +/* + * Whether MEC FW support GWS barriers + */ +extern bool hws_gws_support; + +/* + * Queue preemption timeout in ms + */ +extern int queue_preemption_timeout_ms; + enum cache_policy { cache_policy_coherent, cache_policy_noncoherent @@ -177,6 +195,7 @@ struct kfd_event_interrupt_class { struct kfd_device_info { enum amd_asic_type asic_family; + const char *asic_name; const struct kfd_event_interrupt_class *event_interrupt_class; unsigned int max_pasid_bits; unsigned int max_no_of_hqd; @@ -188,6 +207,7 @@ struct kfd_device_info { bool needs_iommu_device; bool needs_pci_atomics; unsigned int num_sdma_engines; + unsigned int num_xgmi_sdma_engines; unsigned int num_sdma_queues_per_engine; }; @@ -210,6 +230,7 @@ struct kfd_dev { const struct kfd_device_info *device_info; struct pci_dev *pdev; + struct drm_device *ddev; unsigned int id; /* topology stub index */ @@ -258,7 +279,7 @@ struct kfd_dev { bool interrupts_active; /* Debug manager */ - struct kfd_dbgmgr *dbgmgr; + struct kfd_dbgmgr *dbgmgr; /* Firmware versions */ uint16_t mec_fw_version; @@ -276,6 +297,15 @@ struct kfd_dev { uint64_t hive_id; bool pci_atomic_requested; + + /* SRAM ECC flag */ + atomic_t sram_ecc_flag; + + /* Compute Profile ref. count */ + atomic_t compute_profile; + + /* Global GWS resource shared b/t processes*/ + void *gws; }; enum kfd_mempool { @@ -323,7 +353,8 @@ enum kfd_queue_type { KFD_QUEUE_TYPE_COMPUTE, KFD_QUEUE_TYPE_SDMA, KFD_QUEUE_TYPE_HIQ, - KFD_QUEUE_TYPE_DIQ + KFD_QUEUE_TYPE_DIQ, + KFD_QUEUE_TYPE_SDMA_XGMI }; enum kfd_queue_format { @@ -331,6 +362,11 @@ enum kfd_queue_format { KFD_QUEUE_FORMAT_AQL }; +enum KFD_QUEUE_PRIORITY { + KFD_QUEUE_PRIORITY_MINIMUM = 0, + KFD_QUEUE_PRIORITY_MAXIMUM = 15 +}; + /** * struct queue_properties * @@ -413,6 +449,11 @@ struct queue_properties { uint32_t *cu_mask; }; +#define QUEUE_IS_ACTIVE(q) ((q).queue_size > 0 && \ + (q).queue_address != 0 && \ + (q).queue_percent > 0 && \ + !(q).is_evicted) + /** * struct queue * @@ -438,6 +479,9 @@ struct queue_properties { * * @device: The kfd device that created this queue. * + * @gws: Pointing to gws kgd_mem if this is a gws control queue; NULL + * otherwise. + * * This structure represents user mode compute queues. * It contains all the necessary data to handle such queues. * @@ -459,6 +503,7 @@ struct queue { struct kfd_process *process; struct kfd_dev *device; + void *gws; }; /* @@ -469,9 +514,16 @@ enum KFD_MQD_TYPE { KFD_MQD_TYPE_HIQ, /* for hiq */ KFD_MQD_TYPE_CP, /* for cp queues and diq */ KFD_MQD_TYPE_SDMA, /* for sdma queues */ + KFD_MQD_TYPE_DIQ, /* for diq */ KFD_MQD_TYPE_MAX }; +enum KFD_PIPE_PRIORITY { + KFD_PIPE_PRIORITY_CS_LOW = 0, + KFD_PIPE_PRIORITY_CS_MEDIUM, + KFD_PIPE_PRIORITY_CS_HIGH +}; + struct scheduling_resources { unsigned int vmid_mask; enum kfd_queue_type type; @@ -636,10 +688,7 @@ struct kfd_process { /* We want to receive a notification when the mm_struct is destroyed */ struct mmu_notifier mmu_notifier; - /* Use for delayed freeing of kfd_process structure */ - struct rcu_head rcu; - - unsigned int pasid; + uint16_t pasid; unsigned int doorbell_index; /* @@ -680,6 +729,10 @@ struct kfd_process { * restored after an eviction */ unsigned long last_restore_timestamp; + + /* Kobj for our procfs */ + struct kobject *kobj; + struct attribute attr_pasid; }; #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */ @@ -782,6 +835,10 @@ int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj); extern struct device *kfd_device; +/* KFD's procfs */ +void kfd_procfs_init(void); +void kfd_procfs_shutdown(void); + /* Topology */ int kfd_topology_init(void); void kfd_topology_shutdown(void); @@ -813,8 +870,6 @@ void uninit_queue(struct queue *q); void print_queue_properties(struct queue_properties *q); void print_queue(struct queue *q); -struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type, - struct kfd_dev *dev); struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, struct kfd_dev *dev); struct mqd_manager *mqd_manager_init_cik_hawaii(enum KFD_MQD_TYPE type, @@ -825,6 +880,8 @@ struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type, struct kfd_dev *dev); struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type, struct kfd_dev *dev); +struct mqd_manager *mqd_manager_init_v10(enum KFD_MQD_TYPE type, + struct kfd_dev *dev); struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev); void device_queue_manager_uninit(struct device_queue_manager *dqm); struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, @@ -853,6 +910,8 @@ int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid, struct queue_properties *p); int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid, struct queue_properties *p); +int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid, + void *gws); struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm, unsigned int qid); int pqm_get_wave_state(struct process_queue_manager *pqm, @@ -862,8 +921,8 @@ int pqm_get_wave_state(struct process_queue_manager *pqm, u32 *save_area_used_size); int amdkfd_fence_wait_timeout(unsigned int *fence_addr, - unsigned int fence_value, - unsigned int timeout_ms); + unsigned int fence_value, + unsigned int timeout_ms); /* Packet Manager */ @@ -877,6 +936,7 @@ struct packet_manager { bool allocated; struct kfd_mem_obj *ib_buffer_obj; unsigned int ib_size_bytes; + bool is_over_subscription; const struct packet_manager_funcs *pmf; }; @@ -912,6 +972,7 @@ struct packet_manager_funcs { extern const struct packet_manager_funcs kfd_vi_pm_funcs; extern const struct packet_manager_funcs kfd_v9_pm_funcs; +extern const struct packet_manager_funcs kfd_v10_pm_funcs; int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm); void pm_uninit(struct packet_manager *pm); @@ -931,7 +992,8 @@ void pm_release_ib(struct packet_manager *pm); /* Following PM funcs can be shared among VI and AI */ unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size); int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer, - struct scheduling_resources *res); + struct scheduling_resources *res); + uint64_t kfd_get_number_elems(struct kfd_dev *kfd); @@ -975,6 +1037,25 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p); bool kfd_is_locked(void); +/* Compute profile */ +void kfd_inc_compute_active(struct kfd_dev *dev); +void kfd_dec_compute_active(struct kfd_dev *dev); + +/* Cgroup Support */ +/* Check with device cgroup if @kfd device is accessible */ +static inline int kfd_devcgroup_check_permission(struct kfd_dev *kfd) +{ +#if defined(CONFIG_CGROUP_DEVICE) + struct drm_device *ddev = kfd->ddev; + + return devcgroup_check_permission(DEVCG_DEV_CHAR, ddev->driver->major, + ddev->render->index, + DEVCG_ACC_WRITE | DEVCG_ACC_READ); +#else + return 0; +#endif +} + /* Debugfs */ #if defined(CONFIG_DEBUG_FS) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 4bdae78bab8e..10f9af5784f2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -62,12 +62,74 @@ static struct workqueue_struct *kfd_restore_wq; static struct kfd_process *find_process(const struct task_struct *thread); static void kfd_process_ref_release(struct kref *ref); -static struct kfd_process *create_process(const struct task_struct *thread, - struct file *filep); +static struct kfd_process *create_process(const struct task_struct *thread); +static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep); static void evict_process_worker(struct work_struct *work); static void restore_process_worker(struct work_struct *work); +struct kfd_procfs_tree { + struct kobject *kobj; +}; + +static struct kfd_procfs_tree procfs; + +static ssize_t kfd_procfs_show(struct kobject *kobj, struct attribute *attr, + char *buffer) +{ + int val = 0; + + if (strcmp(attr->name, "pasid") == 0) { + struct kfd_process *p = container_of(attr, struct kfd_process, + attr_pasid); + val = p->pasid; + } else { + pr_err("Invalid attribute"); + return -EINVAL; + } + + return snprintf(buffer, PAGE_SIZE, "%d\n", val); +} + +static void kfd_procfs_kobj_release(struct kobject *kobj) +{ + kfree(kobj); +} + +static const struct sysfs_ops kfd_procfs_ops = { + .show = kfd_procfs_show, +}; + +static struct kobj_type procfs_type = { + .release = kfd_procfs_kobj_release, + .sysfs_ops = &kfd_procfs_ops, +}; + +void kfd_procfs_init(void) +{ + int ret = 0; + + procfs.kobj = kfd_alloc_struct(procfs.kobj); + if (!procfs.kobj) + return; + + ret = kobject_init_and_add(procfs.kobj, &procfs_type, + &kfd_device->kobj, "proc"); + if (ret) { + pr_warn("Could not create procfs proc folder"); + /* If we fail to create the procfs, clean up */ + kfd_procfs_shutdown(); + } +} + +void kfd_procfs_shutdown(void) +{ + if (procfs.kobj) { + kobject_del(procfs.kobj); + kobject_put(procfs.kobj); + procfs.kobj = NULL; + } +} int kfd_process_create_wq(void) { @@ -206,6 +268,7 @@ struct kfd_process *kfd_create_process(struct file *filep) { struct kfd_process *process; struct task_struct *thread = current; + int ret; if (!thread->mm) return ERR_PTR(-EINVAL); @@ -223,11 +286,44 @@ struct kfd_process *kfd_create_process(struct file *filep) /* A prior open of /dev/kfd could have already created the process. */ process = find_process(thread); - if (process) + if (process) { pr_debug("Process already found\n"); - else - process = create_process(thread, filep); + } else { + process = create_process(thread); + if (IS_ERR(process)) + goto out; + + ret = kfd_process_init_cwsr_apu(process, filep); + if (ret) { + process = ERR_PTR(ret); + goto out; + } + + if (!procfs.kobj) + goto out; + + process->kobj = kfd_alloc_struct(process->kobj); + if (!process->kobj) { + pr_warn("Creating procfs kobject failed"); + goto out; + } + ret = kobject_init_and_add(process->kobj, &procfs_type, + procfs.kobj, "%d", + (int)process->lead_thread->pid); + if (ret) { + pr_warn("Creating procfs pid directory failed"); + goto out; + } + process->attr_pasid.name = "pasid"; + process->attr_pasid.mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(&process->attr_pasid); + ret = sysfs_create_file(process->kobj, &process->attr_pasid); + if (ret) + pr_warn("Creating pasid for pid %d failed", + (int)process->lead_thread->pid); + } +out: mutex_unlock(&kfd_processes_mutex); return process; @@ -320,7 +416,7 @@ static void kfd_process_destroy_pdds(struct kfd_process *p) list_for_each_entry_safe(pdd, temp, &p->per_device_data, per_device_list) { - pr_debug("Releasing pdd (topology id %d) for process (pasid %d)\n", + pr_debug("Releasing pdd (topology id %d) for process (pasid 0x%x)\n", pdd->dev->id, p->pasid); if (pdd->drm_file) { @@ -355,6 +451,14 @@ static void kfd_process_wq_release(struct work_struct *work) struct kfd_process *p = container_of(work, struct kfd_process, release_work); + /* Remove the procfs files */ + if (p->kobj) { + sysfs_remove_file(p->kobj, &p->attr_pasid); + kobject_del(p->kobj); + kobject_put(p->kobj); + p->kobj = NULL; + } + kfd_iommu_unbind_process(p); kfd_process_free_outstanding_kfd_bos(p); @@ -382,11 +486,9 @@ static void kfd_process_ref_release(struct kref *ref) queue_work(kfd_process_wq, &p->release_work); } -static void kfd_process_destroy_delayed(struct rcu_head *rcu) +static void kfd_process_free_notifier(struct mmu_notifier *mn) { - struct kfd_process *p = container_of(rcu, struct kfd_process, rcu); - - kfd_unref_process(p); + kfd_unref_process(container_of(mn, struct kfd_process, mmu_notifier)); } static void kfd_process_notifier_release(struct mmu_notifier *mn, @@ -438,12 +540,12 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, mutex_unlock(&p->mutex); - mmu_notifier_unregister_no_release(&p->mmu_notifier, mm); - mmu_notifier_call_srcu(&p->rcu, &kfd_process_destroy_delayed); + mmu_notifier_put(&p->mmu_notifier); } static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = { .release = kfd_process_notifier_release, + .free_notifier = kfd_process_free_notifier, }; static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep) @@ -513,81 +615,69 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd) return 0; } -static struct kfd_process *create_process(const struct task_struct *thread, - struct file *filep) +/* + * On return the kfd_process is fully operational and will be freed when the + * mm is released + */ +static struct kfd_process *create_process(const struct task_struct *thread) { struct kfd_process *process; int err = -ENOMEM; process = kzalloc(sizeof(*process), GFP_KERNEL); - if (!process) goto err_alloc_process; - process->pasid = kfd_pasid_alloc(); - if (process->pasid == 0) - goto err_alloc_pasid; - - if (kfd_alloc_process_doorbells(process) < 0) - goto err_alloc_doorbells; - kref_init(&process->ref); - mutex_init(&process->mutex); - process->mm = thread->mm; - - /* register notifier */ - process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops; - err = mmu_notifier_register(&process->mmu_notifier, process->mm); - if (err) - goto err_mmu_notifier; - - hash_add_rcu(kfd_processes_table, &process->kfd_processes, - (uintptr_t)process->mm); - process->lead_thread = thread->group_leader; - get_task_struct(process->lead_thread); - INIT_LIST_HEAD(&process->per_device_data); - + INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker); + INIT_DELAYED_WORK(&process->restore_work, restore_process_worker); + process->last_restore_timestamp = get_jiffies_64(); kfd_event_init_process(process); + process->is_32bit_user_mode = in_compat_syscall(); + + process->pasid = kfd_pasid_alloc(); + if (process->pasid == 0) + goto err_alloc_pasid; + + if (kfd_alloc_process_doorbells(process) < 0) + goto err_alloc_doorbells; err = pqm_init(&process->pqm, process); if (err != 0) goto err_process_pqm_init; /* init process apertures*/ - process->is_32bit_user_mode = in_compat_syscall(); err = kfd_init_apertures(process); if (err != 0) goto err_init_apertures; - INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker); - INIT_DELAYED_WORK(&process->restore_work, restore_process_worker); - process->last_restore_timestamp = get_jiffies_64(); - - err = kfd_process_init_cwsr_apu(process, filep); + /* Must be last, have to use release destruction after this */ + process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops; + err = mmu_notifier_register(&process->mmu_notifier, process->mm); if (err) - goto err_init_cwsr; + goto err_register_notifier; + + get_task_struct(process->lead_thread); + hash_add_rcu(kfd_processes_table, &process->kfd_processes, + (uintptr_t)process->mm); return process; -err_init_cwsr: +err_register_notifier: kfd_process_free_outstanding_kfd_bos(process); kfd_process_destroy_pdds(process); err_init_apertures: pqm_uninit(&process->pqm); err_process_pqm_init: - hash_del_rcu(&process->kfd_processes); - synchronize_rcu(); - mmu_notifier_unregister_no_release(&process->mmu_notifier, process->mm); -err_mmu_notifier: - mutex_destroy(&process->mutex); kfd_free_process_doorbells(process); err_alloc_doorbells: kfd_pasid_free(process->pasid); err_alloc_pasid: + mutex_destroy(&process->mutex); kfree(process); err_alloc_process: return ERR_PTR(err); @@ -597,6 +687,8 @@ static int init_doorbell_bitmap(struct qcm_process_device *qpd, struct kfd_dev *dev) { unsigned int i; + int range_start = dev->shared_resources.non_cp_doorbells_start; + int range_end = dev->shared_resources.non_cp_doorbells_end; if (!KFD_IS_SOC15(dev->device_info->asic_family)) return 0; @@ -608,14 +700,16 @@ static int init_doorbell_bitmap(struct qcm_process_device *qpd, return -ENOMEM; /* Mask out doorbells reserved for SDMA, IH, and VCN on SOC15. */ + pr_debug("reserved doorbell 0x%03x - 0x%03x\n", range_start, range_end); + pr_debug("reserved doorbell 0x%03x - 0x%03x\n", + range_start + KFD_QUEUE_DOORBELL_MIRROR_OFFSET, + range_end + KFD_QUEUE_DOORBELL_MIRROR_OFFSET); + for (i = 0; i < KFD_MAX_NUM_OF_QUEUES_PER_PROCESS / 2; i++) { - if (i >= dev->shared_resources.non_cp_doorbells_start - && i <= dev->shared_resources.non_cp_doorbells_end) { + if (i >= range_start && i <= range_end) { set_bit(i, qpd->doorbell_bitmap); set_bit(i + KFD_QUEUE_DOORBELL_MIRROR_OFFSET, qpd->doorbell_bitmap); - pr_debug("reserved doorbell 0x%03x and 0x%03x\n", i, - i + KFD_QUEUE_DOORBELL_MIRROR_OFFSET); } } @@ -705,6 +799,8 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd, return ret; } + amdgpu_vm_set_task_info(pdd->vm); + ret = kfd_process_device_reserve_ib_mem(pdd); if (ret) goto err_reserve_ib_mem; @@ -928,7 +1024,7 @@ static void evict_process_worker(struct work_struct *work) */ flush_delayed_work(&p->restore_work); - pr_debug("Started evicting pasid %d\n", p->pasid); + pr_debug("Started evicting pasid 0x%x\n", p->pasid); ret = kfd_process_evict_queues(p); if (!ret) { dma_fence_signal(p->ef); @@ -937,16 +1033,15 @@ static void evict_process_worker(struct work_struct *work) queue_delayed_work(kfd_restore_wq, &p->restore_work, msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)); - pr_debug("Finished evicting pasid %d\n", p->pasid); + pr_debug("Finished evicting pasid 0x%x\n", p->pasid); } else - pr_err("Failed to evict queues of pasid %d\n", p->pasid); + pr_err("Failed to evict queues of pasid 0x%x\n", p->pasid); } static void restore_process_worker(struct work_struct *work) { struct delayed_work *dwork; struct kfd_process *p; - struct kfd_process_device *pdd; int ret = 0; dwork = to_delayed_work(work); @@ -955,17 +1050,7 @@ static void restore_process_worker(struct work_struct *work) * lifetime of this thread, kfd_process p will be valid */ p = container_of(dwork, struct kfd_process, restore_work); - - /* Call restore_process_bos on the first KGD device. This function - * takes care of restoring the whole process including other devices. - * Restore can fail if enough memory is not available. If so, - * reschedule again. - */ - pdd = list_first_entry(&p->per_device_data, - struct kfd_process_device, - per_device_list); - - pr_debug("Started restoring pasid %d\n", p->pasid); + pr_debug("Started restoring pasid 0x%x\n", p->pasid); /* Setting last_restore_timestamp before successful restoration. * Otherwise this would have to be set by KGD (restore_process_bos) @@ -981,7 +1066,7 @@ static void restore_process_worker(struct work_struct *work) ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p->kgd_process_info, &p->ef); if (ret) { - pr_debug("Failed to restore BOs of pasid %d, retry after %d ms\n", + pr_debug("Failed to restore BOs of pasid 0x%x, retry after %d ms\n", p->pasid, PROCESS_BACK_OFF_TIME_MS); ret = queue_delayed_work(kfd_restore_wq, &p->restore_work, msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS)); @@ -991,9 +1076,9 @@ static void restore_process_worker(struct work_struct *work) ret = kfd_process_restore_queues(p); if (!ret) - pr_debug("Finished restoring pasid %d\n", p->pasid); + pr_debug("Finished restoring pasid 0x%x\n", p->pasid); else - pr_err("Failed to restore queues of pasid %d\n", p->pasid); + pr_err("Failed to restore queues of pasid 0x%x\n", p->pasid); } void kfd_suspend_all_processes(void) @@ -1007,7 +1092,7 @@ void kfd_suspend_all_processes(void) cancel_delayed_work_sync(&p->restore_work); if (kfd_process_evict_queues(p)) - pr_err("Failed to suspend process %d\n", p->pasid); + pr_err("Failed to suspend process 0x%x\n", p->pasid); dma_fence_signal(p->ef); dma_fence_put(p->ef); p->ef = NULL; @@ -1090,7 +1175,7 @@ int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data) int idx = srcu_read_lock(&kfd_processes_srcu); hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { - seq_printf(m, "Process %d PASID %d:\n", + seq_printf(m, "Process %d PASID 0x%x:\n", p->lead_thread->tgid, p->pasid); mutex_lock(&p->mutex); @@ -1107,3 +1192,4 @@ int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data) } #endif + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index fcaaf93681ac..2659d226c056 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -26,6 +26,7 @@ #include "kfd_device_queue_manager.h" #include "kfd_priv.h" #include "kfd_kernel_queue.h" +#include "amdgpu_amdkfd.h" static inline struct process_queue_node *get_queue_by_qid( struct process_queue_manager *pqm, unsigned int qid) @@ -52,7 +53,7 @@ static int find_available_queue_slot(struct process_queue_manager *pqm, pr_debug("The new slot id %lu\n", found); if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) { - pr_info("Cannot open more queues for process with pasid %d\n", + pr_info("Cannot open more queues for process with pasid 0x%x\n", pqm->process->pasid); return -ENOMEM; } @@ -74,6 +75,55 @@ void kfd_process_dequeue_from_device(struct kfd_process_device *pdd) pdd->already_dequeued = true; } +int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid, + void *gws) +{ + struct kfd_dev *dev = NULL; + struct process_queue_node *pqn; + struct kfd_process_device *pdd; + struct kgd_mem *mem = NULL; + int ret; + + pqn = get_queue_by_qid(pqm, qid); + if (!pqn) { + pr_err("Queue id does not match any known queue\n"); + return -EINVAL; + } + + if (pqn->q) + dev = pqn->q->device; + if (WARN_ON(!dev)) + return -ENODEV; + + pdd = kfd_get_process_device_data(dev, pqm->process); + if (!pdd) { + pr_err("Process device data doesn't exist\n"); + return -EINVAL; + } + + /* Only allow one queue per process can have GWS assigned */ + if (gws && pdd->qpd.num_gws) + return -EBUSY; + + if (!gws && pdd->qpd.num_gws == 0) + return -EINVAL; + + if (gws) + ret = amdgpu_amdkfd_add_gws_to_process(pdd->process->kgd_process_info, + gws, &mem); + else + ret = amdgpu_amdkfd_remove_gws_from_process(pdd->process->kgd_process_info, + pqn->q->gws); + if (unlikely(ret)) + return ret; + + pqn->q->gws = mem; + pdd->qpd.num_gws = gws ? amdgpu_amdkfd_get_num_gws(dev->kgd) : 0; + + return pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm, + pqn->q); +} + void kfd_process_dequeue_from_all_devices(struct kfd_process *p) { struct kfd_process_device *pdd; @@ -100,6 +150,9 @@ void pqm_uninit(struct process_queue_manager *pqm) struct process_queue_node *pqn, *next; list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) { + if (pqn->q && pqn->q->gws) + amdgpu_amdkfd_remove_gws_from_process(pqm->process->kgd_process_info, + pqn->q->gws); uninit_queue(pqn->q); list_del(&pqn->process_queue_list); kfree(pqn); @@ -186,8 +239,13 @@ int pqm_create_queue(struct process_queue_manager *pqm, switch (type) { case KFD_QUEUE_TYPE_SDMA: - if (dev->dqm->queue_count >= get_num_sdma_queues(dev->dqm)) { - pr_err("Over-subscription is not allowed for SDMA.\n"); + case KFD_QUEUE_TYPE_SDMA_XGMI: + if ((type == KFD_QUEUE_TYPE_SDMA && dev->dqm->sdma_queue_count + >= get_num_sdma_queues(dev->dqm)) || + (type == KFD_QUEUE_TYPE_SDMA_XGMI && + dev->dqm->xgmi_sdma_queue_count + >= get_num_xgmi_sdma_queues(dev->dqm))) { + pr_debug("Over-subscription is not allowed for SDMA.\n"); retval = -EPERM; goto err_create_queue; } @@ -240,7 +298,7 @@ int pqm_create_queue(struct process_queue_manager *pqm, } if (retval != 0) { - pr_err("Pasid %d DQM create queue %d failed. ret %d\n", + pr_err("Pasid 0x%x DQM create queue %d failed. ret %d\n", pqm->process->pasid, type, retval); goto err_create_queue; } @@ -319,12 +377,19 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) dqm = pqn->q->device->dqm; retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q); if (retval) { - pr_err("Pasid %d destroy queue %d failed, ret %d\n", + pr_err("Pasid 0x%x destroy queue %d failed, ret %d\n", pqm->process->pasid, pqn->q->properties.queue_id, retval); if (retval != -ETIME) goto err_destroy_queue; } + + if (pqn->q->gws) { + amdgpu_amdkfd_remove_gws_from_process(pqm->process->kgd_process_info, + pqn->q->gws); + pdd->qpd.num_gws = 0; + } + kfree(pqn->q->properties.cu_mask); pqn->q->properties.cu_mask = NULL; uninit_queue(pqn->q); @@ -446,6 +511,7 @@ int pqm_debugfs_mqds(struct seq_file *m, void *data) q = pqn->q; switch (q->properties.type) { case KFD_QUEUE_TYPE_SDMA: + case KFD_QUEUE_TYPE_SDMA_XGMI: seq_printf(m, " SDMA queue on device %x\n", q->device->id); mqd_type = KFD_MQD_TYPE_SDMA; @@ -461,8 +527,7 @@ int pqm_debugfs_mqds(struct seq_file *m, void *data) q->properties.type, q->device->id); continue; } - mqd_mgr = q->device->dqm->ops.get_mqd_manager( - q->device->dqm, mqd_type); + mqd_mgr = q->device->dqm->mqd_mgrs[mqd_type]; } else if (pqn->kq) { q = pqn->kq->queue; mqd_mgr = pqn->kq->mqd_mgr; @@ -470,7 +535,6 @@ int pqm_debugfs_mqds(struct seq_file *m, void *data) case KFD_QUEUE_TYPE_DIQ: seq_printf(m, " DIQ on device %x\n", pqn->kq->dev->id); - mqd_type = KFD_MQD_TYPE_HIQ; break; default: seq_printf(m, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index 09da91644f9f..69bd0628fdc6 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -37,6 +37,7 @@ #include "kfd_device_queue_manager.h" #include "kfd_iommu.h" #include "amdgpu_amdkfd.h" +#include "amdgpu_ras.h" /* topology_device_list - Master list of all topology devices */ static struct list_head topology_device_list; @@ -268,6 +269,8 @@ static ssize_t iolink_show(struct kobject *kobj, struct attribute *attr, buffer[0] = 0; iolink = container_of(attr, struct kfd_iolink_properties, attr); + if (iolink->gpu && kfd_devcgroup_check_permission(iolink->gpu)) + return -EPERM; sysfs_show_32bit_prop(buffer, "type", iolink->iolink_type); sysfs_show_32bit_prop(buffer, "version_major", iolink->ver_maj); sysfs_show_32bit_prop(buffer, "version_minor", iolink->ver_min); @@ -304,6 +307,8 @@ static ssize_t mem_show(struct kobject *kobj, struct attribute *attr, buffer[0] = 0; mem = container_of(attr, struct kfd_mem_properties, attr); + if (mem->gpu && kfd_devcgroup_check_permission(mem->gpu)) + return -EPERM; sysfs_show_32bit_prop(buffer, "heap_type", mem->heap_type); sysfs_show_64bit_prop(buffer, "size_in_bytes", mem->size_in_bytes); sysfs_show_32bit_prop(buffer, "flags", mem->flags); @@ -333,6 +338,8 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr, buffer[0] = 0; cache = container_of(attr, struct kfd_cache_properties, attr); + if (cache->gpu && kfd_devcgroup_check_permission(cache->gpu)) + return -EPERM; sysfs_show_32bit_prop(buffer, "processor_id_low", cache->processor_id_low); sysfs_show_32bit_prop(buffer, "level", cache->cache_level); @@ -405,8 +412,6 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, char *buffer) { struct kfd_topology_device *dev; - char public_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE]; - uint32_t i; uint32_t log_max_watch_addr; /* Making sure that the buffer is an empty string */ @@ -415,24 +420,24 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, if (strcmp(attr->name, "gpu_id") == 0) { dev = container_of(attr, struct kfd_topology_device, attr_gpuid); + if (dev->gpu && kfd_devcgroup_check_permission(dev->gpu)) + return -EPERM; return sysfs_show_32bit_val(buffer, dev->gpu_id); } if (strcmp(attr->name, "name") == 0) { dev = container_of(attr, struct kfd_topology_device, attr_name); - for (i = 0; i < KFD_TOPOLOGY_PUBLIC_NAME_SIZE; i++) { - public_name[i] = - (char)dev->node_props.marketing_name[i]; - if (dev->node_props.marketing_name[i] == 0) - break; - } - public_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE-1] = 0x0; - return sysfs_show_str_val(buffer, public_name); + + if (dev->gpu && kfd_devcgroup_check_permission(dev->gpu)) + return -EPERM; + return sysfs_show_str_val(buffer, dev->node_props.name); } dev = container_of(attr, struct kfd_topology_device, attr_props); + if (dev->gpu && kfd_devcgroup_check_permission(dev->gpu)) + return -EPERM; sysfs_show_32bit_prop(buffer, "cpu_cores_count", dev->node_props.cpu_cores_count); sysfs_show_32bit_prop(buffer, "simd_count", @@ -453,6 +458,8 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, dev->node_props.lds_size_in_kb); sysfs_show_32bit_prop(buffer, "gds_size_in_kb", dev->node_props.gds_size_in_kb); + sysfs_show_32bit_prop(buffer, "num_gws", + dev->node_props.num_gws); sysfs_show_32bit_prop(buffer, "wave_front_size", dev->node_props.wave_front_size); sysfs_show_32bit_prop(buffer, "array_count", @@ -475,6 +482,10 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, dev->node_props.drm_render_minor); sysfs_show_64bit_prop(buffer, "hive_id", dev->node_props.hive_id); + sysfs_show_32bit_prop(buffer, "num_sdma_engines", + dev->node_props.num_sdma_engines); + sysfs_show_32bit_prop(buffer, "num_sdma_xgmi_engines", + dev->node_props.num_sdma_xgmi_engines); if (dev->gpu) { log_max_watch_addr = @@ -1077,8 +1088,9 @@ static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu) local_mem_info.local_mem_size_public; buf[0] = gpu->pdev->devfn; - buf[1] = gpu->pdev->subsystem_vendor; - buf[2] = gpu->pdev->subsystem_device; + buf[1] = gpu->pdev->subsystem_vendor | + (gpu->pdev->subsystem_device << 16); + buf[2] = pci_domain_nr(gpu->pdev->bus); buf[3] = gpu->pdev->device; buf[4] = gpu->pdev->bus->number; buf[5] = lower_32_bits(local_mem_size); @@ -1098,6 +1110,9 @@ static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu) { struct kfd_topology_device *dev; struct kfd_topology_device *out_dev = NULL; + struct kfd_mem_properties *mem; + struct kfd_cache_properties *cache; + struct kfd_iolink_properties *iolink; down_write(&topology_lock); list_for_each_entry(dev, &topology_device_list, list) { @@ -1111,6 +1126,13 @@ static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu) if (!dev->gpu && (dev->node_props.simd_count > 0)) { dev->gpu = gpu; out_dev = dev; + + list_for_each_entry(mem, &dev->mem_props, list) + mem->gpu = dev->gpu; + list_for_each_entry(cache, &dev->cache_props, list) + cache->gpu = dev->gpu; + list_for_each_entry(iolink, &dev->io_link_props, list) + iolink->gpu = dev->gpu; break; } } @@ -1197,6 +1219,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu) void *crat_image = NULL; size_t image_size = 0; int proximity_domain; + struct amdgpu_ras *ctx; INIT_LIST_HEAD(&temp_topology_device_list); @@ -1265,13 +1288,16 @@ int kfd_topology_add_device(struct kfd_dev *gpu) */ amdgpu_amdkfd_get_cu_info(dev->gpu->kgd, &cu_info); + + strncpy(dev->node_props.name, gpu->device_info->asic_name, + KFD_TOPOLOGY_PUBLIC_NAME_SIZE); + dev->node_props.simd_arrays_per_engine = cu_info.num_shader_arrays_per_engine; dev->node_props.vendor_id = gpu->pdev->vendor; dev->node_props.device_id = gpu->pdev->device; - dev->node_props.location_id = PCI_DEVID(gpu->pdev->bus->number, - gpu->pdev->devfn); + dev->node_props.location_id = pci_dev_id(gpu->pdev); dev->node_props.max_engine_clk_fcompute = amdgpu_amdkfd_get_max_engine_clock_in_mhz(dev->gpu->kgd); dev->node_props.max_engine_clk_ccompute = @@ -1280,6 +1306,12 @@ int kfd_topology_add_device(struct kfd_dev *gpu) gpu->shared_resources.drm_render_minor; dev->node_props.hive_id = gpu->hive_id; + dev->node_props.num_sdma_engines = gpu->device_info->num_sdma_engines; + dev->node_props.num_sdma_xgmi_engines = + gpu->device_info->num_xgmi_sdma_engines; + dev->node_props.num_gws = (hws_gws_support && + dev->gpu->dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) ? + amdgpu_amdkfd_get_num_gws(dev->gpu->kgd) : 0; kfd_fill_mem_clk_max_info(dev); kfd_fill_iolink_non_crat_info(dev); @@ -1297,6 +1329,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu) case CHIP_POLARIS10: case CHIP_POLARIS11: case CHIP_POLARIS12: + case CHIP_VEGAM: pr_debug("Adding doorbell packet type capability\n"); dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_1_0 << HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & @@ -1306,6 +1339,11 @@ int kfd_topology_add_device(struct kfd_dev *gpu) case CHIP_VEGA12: case CHIP_VEGA20: case CHIP_RAVEN: + case CHIP_RENOIR: + case CHIP_ARCTURUS: + case CHIP_NAVI10: + case CHIP_NAVI12: + case CHIP_NAVI14: dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 << HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); @@ -1315,17 +1353,38 @@ int kfd_topology_add_device(struct kfd_dev *gpu) dev->gpu->device_info->asic_family); } + /* + * Overwrite ATS capability according to needs_iommu_device to fix + * potential missing corresponding bit in CRAT of BIOS. + */ + if (dev->gpu->device_info->needs_iommu_device) + dev->node_props.capability |= HSA_CAP_ATS_PRESENT; + else + dev->node_props.capability &= ~HSA_CAP_ATS_PRESENT; + /* Fix errors in CZ CRAT. * simd_count: Carrizo CRAT reports wrong simd_count, probably * because it doesn't consider masked out CUs * max_waves_per_simd: Carrizo reports wrong max_waves_per_simd - * capability flag: Carrizo CRAT doesn't report IOMMU flags */ if (dev->gpu->device_info->asic_family == CHIP_CARRIZO) { dev->node_props.simd_count = cu_info.simd_per_cu * cu_info.cu_active_number; dev->node_props.max_waves_per_simd = 10; - dev->node_props.capability |= HSA_CAP_ATS_PRESENT; + } + + ctx = amdgpu_ras_get_context((struct amdgpu_device *)(dev->gpu->kgd)); + if (ctx) { + /* kfd only concerns sram ecc on GFX/SDMA and HBM ecc on UMC */ + dev->node_props.capability |= + (((ctx->features & BIT(AMDGPU_RAS_BLOCK__SDMA)) != 0) || + ((ctx->features & BIT(AMDGPU_RAS_BLOCK__GFX)) != 0)) ? + HSA_CAP_SRAM_EDCSUPPORTED : 0; + dev->node_props.capability |= ((ctx->features & BIT(AMDGPU_RAS_BLOCK__UMC)) != 0) ? + HSA_CAP_MEM_EDCSUPPORTED : 0; + + dev->node_props.capability |= (ctx->features != 0) ? + HSA_CAP_RASEVENTNOTIFY : 0; } kfd_debug_print_topology(); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h index 92a19be07344..15843e0fc756 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h @@ -27,7 +27,7 @@ #include <linux/list.h> #include "kfd_crat.h" -#define KFD_TOPOLOGY_PUBLIC_NAME_SIZE 128 +#define KFD_TOPOLOGY_PUBLIC_NAME_SIZE 32 #define HSA_CAP_HOT_PLUGGABLE 0x00000001 #define HSA_CAP_ATS_PRESENT 0x00000002 @@ -48,6 +48,10 @@ #define HSA_CAP_DOORBELL_TYPE_2_0 0x2 #define HSA_CAP_AQL_QUEUE_DOUBLE_MAP 0x00004000 +#define HSA_CAP_SRAM_EDCSUPPORTED 0x00080000 +#define HSA_CAP_MEM_EDCSUPPORTED 0x00100000 +#define HSA_CAP_RASEVENTNOTIFY 0x00200000 + struct kfd_node_properties { uint64_t hive_id; uint32_t cpu_cores_count; @@ -61,6 +65,7 @@ struct kfd_node_properties { uint32_t max_waves_per_simd; uint32_t lds_size_in_kb; uint32_t gds_size_in_kb; + uint32_t num_gws; uint32_t wave_front_size; uint32_t array_count; uint32_t simd_arrays_per_engine; @@ -74,7 +79,9 @@ struct kfd_node_properties { uint32_t max_engine_clk_fcompute; uint32_t max_engine_clk_ccompute; int32_t drm_render_minor; - uint16_t marketing_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE]; + uint32_t num_sdma_engines; + uint32_t num_sdma_xgmi_engines; + char name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE]; }; #define HSA_MEM_HEAP_TYPE_SYSTEM 0 @@ -95,6 +102,7 @@ struct kfd_mem_properties { uint32_t flags; uint32_t width; uint32_t mem_clk_max; + struct kfd_dev *gpu; struct kobject *kobj; struct attribute attr; }; @@ -116,6 +124,7 @@ struct kfd_cache_properties { uint32_t cache_latency; uint32_t cache_type; uint8_t sibling_map[CRAT_SIBLINGMAP_SIZE]; + struct kfd_dev *gpu; struct kobject *kobj; struct attribute attr; }; @@ -134,6 +143,7 @@ struct kfd_iolink_properties { uint32_t max_bandwidth; uint32_t rec_transfer_size; uint32_t flags; + struct kfd_dev *gpu; struct kobject *kobj; struct attribute attr; }; |