45 files changed, 5041 insertions, 2006 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/Kconfig b/drivers/gpu/drm/amd/amdkfd/Kconfig
index c3613604a4f8..a1a35d4d594b 100644
--- a/drivers/gpu/drm/amd/amdkfd/Kconfig
+++ b/drivers/gpu/drm/amd/amdkfd/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
 #
 # Heterogenous system architecture configuration
 #
diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile
index 69ec96998bb9..48155060a57c 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -36,16 +36,19 @@ AMDKFD_FILES	:= $(AMDKFD_PATH)/kfd_module.o \
 		$(AMDKFD_PATH)/kfd_mqd_manager_cik.o \
 		$(AMDKFD_PATH)/kfd_mqd_manager_vi.o \
 		$(AMDKFD_PATH)/kfd_mqd_manager_v9.o \
+		$(AMDKFD_PATH)/kfd_mqd_manager_v10.o \
 		$(AMDKFD_PATH)/kfd_kernel_queue.o \
 		$(AMDKFD_PATH)/kfd_kernel_queue_cik.o \
 		$(AMDKFD_PATH)/kfd_kernel_queue_vi.o \
 		$(AMDKFD_PATH)/kfd_kernel_queue_v9.o \
+		$(AMDKFD_PATH)/kfd_kernel_queue_v10.o \
 		$(AMDKFD_PATH)/kfd_packet_manager.o \
 		$(AMDKFD_PATH)/kfd_process_queue_manager.o \
 		$(AMDKFD_PATH)/kfd_device_queue_manager.o \
 		$(AMDKFD_PATH)/kfd_device_queue_manager_cik.o \
 		$(AMDKFD_PATH)/kfd_device_queue_manager_vi.o \
 		$(AMDKFD_PATH)/kfd_device_queue_manager_v9.o \
+		$(AMDKFD_PATH)/kfd_device_queue_manager_v10.o \
 		$(AMDKFD_PATH)/kfd_interrupt.o \
 		$(AMDKFD_PATH)/kfd_events.o \
 		$(AMDKFD_PATH)/cik_event_interrupt.o \
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 177d1e5329a5..9f59ba93cfe0 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -33,7 +33,9 @@ static bool cik_event_interrupt_isr(struct kfd_dev *dev,
 	const struct cik_ih_ring_entry *ihre =
 			(const struct cik_ih_ring_entry *)ih_ring_entry;
 	const struct kfd2kgd_calls *f2g = dev->kfd2kgd;
-	unsigned int vmid, pasid;
+	unsigned int vmid;
+	uint16_t pasid;
+	bool ret;
 
 	/* This workaround is due to HW/FW limitation on Hawaii that
 	 * VMID and PASID are not written into ih_ring_entry
@@ -48,13 +50,13 @@ static bool cik_event_interrupt_isr(struct kfd_dev *dev,
 		*tmp_ihre = *ihre;
 
 		vmid = f2g->read_vmid_from_vmfault_reg(dev->kgd);
-		pasid = f2g->get_atc_vmid_pasid_mapping_pasid(dev->kgd, vmid);
+		ret = f2g->get_atc_vmid_pasid_mapping_info(dev->kgd, vmid, &pasid);
 
 		tmp_ihre->ring_id &= 0x000000ff;
 		tmp_ihre->ring_id |= vmid << 8;
 		tmp_ihre->ring_id |= pasid << 16;
 
-		return (pasid != 0) &&
+		return ret && (pasid != 0) &&
 			vmid >= dev->vm_info.first_vmid_kfd &&
 			vmid <= dev->vm_info.last_vmid_kfd;
 	}
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
index 3621efbd5759..d3400da6ab64 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
@@ -21,7 +21,7 @@
  */
 
 static const uint32_t cwsr_trap_gfx8_hex[] = {
-	0xbf820001, 0xbf82012b,
+	0xbf820001, 0xbf820121,
 	0xb8f4f802, 0x89748674,
 	0xb8f5f803, 0x8675ff75,
 	0x00000400, 0xbf850017,
@@ -36,12 +36,7 @@ static const uint32_t cwsr_trap_gfx8_hex[] = {
 	0x8671ff71, 0x0000ffff,
 	0x8f728374, 0xb972e0c2,
 	0xbf800002, 0xb9740002,
-	0xbe801f70, 0xb8f5f803,
-	0x8675ff75, 0x00000100,
-	0xbf840006, 0xbefa0080,
-	0xb97a0203, 0x8671ff71,
-	0x0000ffff, 0x80f08870,
-	0x82f18071, 0xbefa0080,
+	0xbe801f70, 0xbefa0080,
 	0xb97a0283, 0xbef60068,
 	0xbef70069, 0xb8fa1c07,
 	0x8e7a9c7a, 0x87717a71,
@@ -279,118 +274,122 @@ static const uint32_t cwsr_trap_gfx8_hex[] = {
 
 
 static const uint32_t cwsr_trap_gfx9_hex[] = {
-	0xbf820001, 0xbf82015d,
+	0xbf820001, 0xbf820248,
 	0xb8f8f802, 0x89788678,
-	0xb8f1f803, 0x866eff71,
-	0x00000400, 0xbf850037,
-	0x866eff71, 0x00000800,
-	0xbf850003, 0x866eff71,
-	0x00000100, 0xbf840008,
+	0xb8eef801, 0x866eff6e,
+	0x00000800, 0xbf840003,
 	0x866eff78, 0x00002000,
-	0xbf840001, 0xbf810000,
-	0x8778ff78, 0x00002000,
-	0x80ec886c, 0x82ed806d,
-	0xb8eef807, 0x866fff6e,
-	0x001f8000, 0x8e6f8b6f,
-	0x8977ff77, 0xfc000000,
-	0x87776f77, 0x896eff6e,
-	0x001f8000, 0xb96ef807,
-	0xb8f0f812, 0xb8f1f813,
-	0x8ef08870, 0xc0071bb8,
-	0x00000000, 0xbf8cc07f,
-	0xc0071c38, 0x00000008,
-	0xbf8cc07f, 0x86ee6e6e,
-	0xbf840001, 0xbe801d6e,
-	0xb8f1f803, 0x8671ff71,
-	0x000001ff, 0xbf850002,
-	0x806c846c, 0x826d806d,
+	0xbf840016, 0xb8fbf803,
+	0x866eff7b, 0x00000400,
+	0xbf85003b, 0x866eff7b,
+	0x00000800, 0xbf850003,
+	0x866eff7b, 0x00000100,
+	0xbf84000c, 0x866eff78,
+	0x00002000, 0xbf840005,
+	0xbf8e0010, 0xb8eef803,
+	0x866eff6e, 0x00000400,
+	0xbf84fffb, 0x8778ff78,
+	0x00002000, 0x80ec886c,
+	0x82ed806d, 0xb8eef807,
+	0x866fff6e, 0x001f8000,
+	0x8e6f8b6f, 0x8977ff77,
+	0xfc000000, 0x87776f77,
+	0x896eff6e, 0x001f8000,
+	0xb96ef807, 0xb8faf812,
+	0xb8fbf813, 0x8efa887a,
+	0xc0071bbd, 0x00000000,
+	0xbf8cc07f, 0xc0071ebd,
+	0x00000008, 0xbf8cc07f,
+	0x86ee6e6e, 0xbf840001,
+	0xbe801d6e, 0xb8fbf803,
+	0x867bff7b, 0x000001ff,
+	0xbf850002, 0x806c846c,
+	0x826d806d, 0x866dff6d,
+	0x0000ffff, 0x8f6e8b77,
+	0x866eff6e, 0x001f8000,
+	0xb96ef807, 0x86fe7e7e,
+	0x86ea6a6a, 0x8f6e8378,
+	0xb96ee0c2, 0xbf800002,
+	0xb9780002, 0xbe801f6c,
 	0x866dff6d, 0x0000ffff,
-	0x8f6e8b77, 0x866eff6e,
-	0x001f8000, 0xb96ef807,
-	0x86fe7e7e, 0x86ea6a6a,
-	0x8f6e8378, 0xb96ee0c2,
-	0xbf800002, 0xb9780002,
-	0xbe801f6c, 0x866dff6d,
-	0x0000ffff, 0xbef00080,
-	0xb9700283, 0xb8f02407,
-	0x8e709c70, 0x876d706d,
-	0xb8f003c7, 0x8e709b70,
-	0x876d706d, 0xb8f0f807,
-	0x8670ff70, 0x00007fff,
-	0xb970f807, 0xbeee007e,
-	0xbeef007f, 0xbefe0180,
-	0xbf900004, 0x87708478,
-	0xb970f802, 0xbf8e0002,
-	0xbf88fffe, 0xb8f02a05,
-	0x80708170, 0x8e708a70,
-	0xb8f11605, 0x80718171,
-	0x8e718671, 0x80707170,
-	0x80707e70, 0x8271807f,
-	0x8671ff71, 0x0000ffff,
-	0xc0471cb8, 0x00000040,
-	0xbf8cc07f, 0xc04b1d38,
-	0x00000048, 0xbf8cc07f,
-	0xc0431e78, 0x00000058,
-	0xbf8cc07f, 0xc0471eb8,
-	0x0000005c, 0xbf8cc07f,
+	0xbefa0080, 0xb97a0283,
+	0xb8fa2407, 0x8e7a9b7a,
+	0x876d7a6d, 0xb8fa03c7,
+	0x8e7a9a7a, 0x876d7a6d,
+	0xb8faf807, 0x867aff7a,
+	0x00007fff, 0xb97af807,
+	0xbeee007e, 0xbeef007f,
+	0xbefe0180, 0xbf900004,
+	0x877a8478, 0xb97af802,
+	0xbf8e0002, 0xbf88fffe,
+	0xb8fa2a05, 0x807a817a,
+	0x8e7a8a7a, 0xb8fb1605,
+	0x807b817b, 0x8e7b867b,
+	0x807a7b7a, 0x807a7e7a,
+	0x827b807f, 0x867bff7b,
+	0x0000ffff, 0xc04b1c3d,
+	0x00000050, 0xbf8cc07f,
+	0xc04b1d3d, 0x00000060,
+	0xbf8cc07f, 0xc0431e7d,
+	0x00000074, 0xbf8cc07f,
 	0xbef4007e, 0x8675ff7f,
 	0x0000ffff, 0x8775ff75,
 	0x00040000, 0xbef60080,
 	0xbef700ff, 0x00807fac,
-	0x8670ff7f, 0x08000000,
-	0x8f708370, 0x87777077,
-	0x8670ff7f, 0x70000000,
-	0x8f708170, 0x87777077,
-	0xbefb007c, 0xbefa0080,
-	0xb8fa2a05, 0x807a817a,
-	0x8e7a8a7a, 0xb8f01605,
-	0x80708170, 0x8e708670,
-	0x807a707a, 0xbef60084,
+	0x867aff7f, 0x08000000,
+	0x8f7a837a, 0x87777a77,
+	0x867aff7f, 0x70000000,
+	0x8f7a817a, 0x87777a77,
+	0xbef1007c, 0xbef00080,
+	0xb8f02a05, 0x80708170,
+	0x8e708a70, 0xb8fa1605,
+	0x807a817a, 0x8e7a867a,
+	0x80707a70, 0xbef60084,
 	0xbef600ff, 0x01000000,
-	0xbefe007c, 0xbefc007a,
-	0xc0611efa, 0x0000007c,
-	0xbf8cc07f, 0x807a847a,
+	0xbefe007c, 0xbefc0070,
+	0xc0611c7a, 0x0000007c,
+	0xbf8cc07f, 0x80708470,
 	0xbefc007e, 0xbefe007c,
-	0xbefc007a, 0xc0611b3a,
+	0xbefc0070, 0xc0611b3a,
 	0x0000007c, 0xbf8cc07f,
-	0x807a847a, 0xbefc007e,
-	0xbefe007c, 0xbefc007a,
+	0x80708470, 0xbefc007e,
+	0xbefe007c, 0xbefc0070,
 	0xc0611b7a, 0x0000007c,
-	0xbf8cc07f, 0x807a847a,
+	0xbf8cc07f, 0x80708470,
 	0xbefc007e, 0xbefe007c,
-	0xbefc007a, 0xc0611bba,
+	0xbefc0070, 0xc0611bba,
 	0x0000007c, 0xbf8cc07f,
-	0x807a847a, 0xbefc007e,
-	0xbefe007c, 0xbefc007a,
+	0x80708470, 0xbefc007e,
+	0xbefe007c, 0xbefc0070,
 	0xc0611bfa, 0x0000007c,
-	0xbf8cc07f, 0x807a847a,
+	0xbf8cc07f, 0x80708470,
 	0xbefc007e, 0xbefe007c,
-	0xbefc007a, 0xc0611e3a,
+	0xbefc0070, 0xc0611e3a,
 	0x0000007c, 0xbf8cc07f,
-	0x807a847a, 0xbefc007e,
-	0xb8f1f803, 0xbefe007c,
-	0xbefc007a, 0xc0611c7a,
+	0x80708470, 0xbefc007e,
+	0xb8fbf803, 0xbefe007c,
+	0xbefc0070, 0xc0611efa,
 	0x0000007c, 0xbf8cc07f,
-	0x807a847a, 0xbefc007e,
-	0xbefe007c, 0xbefc007a,
+	0x80708470, 0xbefc007e,
+	0xbefe007c, 0xbefc0070,
 	0xc0611a3a, 0x0000007c,
-	0xbf8cc07f, 0x807a847a,
+	0xbf8cc07f, 0x80708470,
 	0xbefc007e, 0xbefe007c,
-	0xbefc007a, 0xc0611a7a,
+	0xbefc0070, 0xc0611a7a,
 	0x0000007c, 0xbf8cc07f,
-	0x807a847a, 0xbefc007e,
-	0xb8fbf801, 0xbefe007c,
-	0xbefc007a, 0xc0611efa,
+	0x80708470, 0xbefc007e,
+	0xb8f1f801, 0xbefe007c,
+	0xbefc0070, 0xc0611c7a,
 	0x0000007c, 0xbf8cc07f,
-	0x807a847a, 0xbefc007e,
-	0x8670ff7f, 0x04000000,
-	0xbeef0080, 0x876f6f70,
-	0xb8fa2a05, 0x807a817a,
-	0x8e7a8a7a, 0xb8f11605,
-	0x80718171, 0x8e718471,
-	0x8e768271, 0xbef600ff,
+	0x80708470, 0xbefc007e,
+	0x867aff7f, 0x04000000,
+	0xbeef0080, 0x876f6f7a,
+	0xb8f02a05, 0x80708170,
+	0x8e708a70, 0xb8fb1605,
+	0x807b817b, 0x8e7b847b,
+	0x8e76827b, 0xbef600ff,
 	0x01000000, 0xbef20174,
-	0x80747a74, 0x82758075,
+	0x80747074, 0x82758075,
 	0xbefc0080, 0xbf800000,
 	0xbe802b00, 0xbe822b02,
 	0xbe842b04, 0xbe862b06,
@@ -403,73 +402,1047 @@ static const uint32_t cwsr_trap_gfx9_hex[] = {
 	0xbf8cc07f, 0xc06b033a,
 	0x00000030, 0xbf8cc07f,
 	0x8074c074, 0x82758075,
-	0x807c907c, 0xbf0a717c,
+	0x807c907c, 0xbf0a7b7c,
 	0xbf85ffe7, 0xbef40172,
-	0xbefa0080, 0xbefe00c1,
+	0xbef00080, 0xbefe00c1,
 	0xbeff00c1, 0xbee80080,
 	0xbee90080, 0xbef600ff,
-	0x01000000, 0xe0724000,
-	0x7a1d0000, 0xe0724100,
-	0x7a1d0100, 0xe0724200,
-	0x7a1d0200, 0xe0724300,
-	0x7a1d0300, 0xbefe00c1,
-	0xbeff00c1, 0xb8f14306,
-	0x8671c171, 0xbf84002c,
-	0xbf8a0000, 0x8670ff6f,
-	0x04000000, 0xbf840028,
-	0x8e718671, 0x8e718271,
-	0xbef60071, 0xb8fa2a05,
-	0x807a817a, 0x8e7a8a7a,
-	0xb8f01605, 0x80708170,
-	0x8e708670, 0x807a707a,
-	0x807aff7a, 0x00000080,
+	0x01000000, 0x867aff78,
+	0x00400000, 0xbf850003,
+	0xb8faf803, 0x897a7aff,
+	0x10000000, 0xbf85004d,
+	0xbe840080, 0xd2890000,
+	0x00000900, 0x80048104,
+	0xd2890001, 0x00000900,
+	0x80048104, 0xd2890002,
+	0x00000900, 0x80048104,
+	0xd2890003, 0x00000900,
+	0x80048104, 0xc069003a,
+	0x00000070, 0xbf8cc07f,
+	0x80709070, 0xbf06c004,
+	0xbf84ffee, 0xbe840080,
+	0xd2890000, 0x00000901,
+	0x80048104, 0xd2890001,
+	0x00000901, 0x80048104,
+	0xd2890002, 0x00000901,
+	0x80048104, 0xd2890003,
+	0x00000901, 0x80048104,
+	0xc069003a, 0x00000070,
+	0xbf8cc07f, 0x80709070,
+	0xbf06c004, 0xbf84ffee,
+	0xbe840080, 0xd2890000,
+	0x00000902, 0x80048104,
+	0xd2890001, 0x00000902,
+	0x80048104, 0xd2890002,
+	0x00000902, 0x80048104,
+	0xd2890003, 0x00000902,
+	0x80048104, 0xc069003a,
+	0x00000070, 0xbf8cc07f,
+	0x80709070, 0xbf06c004,
+	0xbf84ffee, 0xbe840080,
+	0xd2890000, 0x00000903,
+	0x80048104, 0xd2890001,
+	0x00000903, 0x80048104,
+	0xd2890002, 0x00000903,
+	0x80048104, 0xd2890003,
+	0x00000903, 0x80048104,
+	0xc069003a, 0x00000070,
+	0xbf8cc07f, 0x80709070,
+	0xbf06c004, 0xbf84ffee,
+	0xbf820008, 0xe0724000,
+	0x701d0000, 0xe0724100,
+	0x701d0100, 0xe0724200,
+	0x701d0200, 0xe0724300,
+	0x701d0300, 0xbefe00c1,
+	0xbeff00c1, 0xb8fb4306,
+	0x867bc17b, 0xbf840063,
+	0xbf8a0000, 0x867aff6f,
+	0x04000000, 0xbf84005f,
+	0x8e7b867b, 0x8e7b827b,
+	0xbef6007b, 0xb8f02a05,
+	0x80708170, 0x8e708a70,
+	0xb8fa1605, 0x807a817a,
+	0x8e7a867a, 0x80707a70,
+	0x8070ff70, 0x00000080,
 	0xbef600ff, 0x01000000,
 	0xbefc0080, 0xd28c0002,
 	0x000100c1, 0xd28d0003,
-	0x000204c1, 0xd1060002,
-	0x00011103, 0x7e0602ff,
-	0x00000200, 0xbefc00ff,
-	0x00010000, 0xbe800077,
-	0x8677ff77, 0xff7fffff,
-	0x8777ff77, 0x00058000,
-	0xd8ec0000, 0x00000002,
-	0xbf8cc07f, 0xe0765000,
-	0x7a1d0002, 0x68040702,
-	0xd0c9006a, 0x0000e302,
-	0xbf87fff7, 0xbef70000,
-	0xbefa00ff, 0x00000400,
+	0x000204c1, 0x867aff78,
+	0x00400000, 0xbf850003,
+	0xb8faf803, 0x897a7aff,
+	0x10000000, 0xbf850030,
+	0x24040682, 0xd86e4000,
+	0x00000002, 0xbf8cc07f,
+	0xbe840080, 0xd2890000,
+	0x00000900, 0x80048104,
+	0xd2890001, 0x00000900,
+	0x80048104, 0xd2890002,
+	0x00000900, 0x80048104,
+	0xd2890003, 0x00000900,
+	0x80048104, 0xc069003a,
+	0x00000070, 0xbf8cc07f,
+	0x80709070, 0xbf06c004,
+	0xbf84ffee, 0xbe840080,
+	0xd2890000, 0x00000901,
+	0x80048104, 0xd2890001,
+	0x00000901, 0x80048104,
+	0xd2890002, 0x00000901,
+	0x80048104, 0xd2890003,
+	0x00000901, 0x80048104,
+	0xc069003a, 0x00000070,
+	0xbf8cc07f, 0x80709070,
+	0xbf06c004, 0xbf84ffee,
+	0x680404ff, 0x00000200,
+	0xd0c9006a, 0x0000f702,
+	0xbf87ffd2, 0xbf820015,
+	0xd1060002, 0x00011103,
+	0x7e0602ff, 0x00000200,
+	0xbefc00ff, 0x00010000,
+	0xbe800077, 0x8677ff77,
+	0xff7fffff, 0x8777ff77,
+	0x00058000, 0xd8ec0000,
+	0x00000002, 0xbf8cc07f,
+	0xe0765000, 0x701d0002,
+	0x68040702, 0xd0c9006a,
+	0x0000f702, 0xbf87fff7,
+	0xbef70000, 0xbef000ff,
+	0x00000400, 0xbefe00c1,
+	0xbeff00c1, 0xb8fb2a05,
+	0x807b817b, 0x8e7b827b,
+	0x8e76887b, 0xbef600ff,
+	0x01000000, 0xbefc0084,
+	0xbf0a7b7c, 0xbf84006d,
+	0xbf11017c, 0x807bff7b,
+	0x00001000, 0x867aff78,
+	0x00400000, 0xbf850003,
+	0xb8faf803, 0x897a7aff,
+	0x10000000, 0xbf850051,
+	0xbe840080, 0xd2890000,
+	0x00000900, 0x80048104,
+	0xd2890001, 0x00000900,
+	0x80048104, 0xd2890002,
+	0x00000900, 0x80048104,
+	0xd2890003, 0x00000900,
+	0x80048104, 0xc069003a,
+	0x00000070, 0xbf8cc07f,
+	0x80709070, 0xbf06c004,
+	0xbf84ffee, 0xbe840080,
+	0xd2890000, 0x00000901,
+	0x80048104, 0xd2890001,
+	0x00000901, 0x80048104,
+	0xd2890002, 0x00000901,
+	0x80048104, 0xd2890003,
+	0x00000901, 0x80048104,
+	0xc069003a, 0x00000070,
+	0xbf8cc07f, 0x80709070,
+	0xbf06c004, 0xbf84ffee,
+	0xbe840080, 0xd2890000,
+	0x00000902, 0x80048104,
+	0xd2890001, 0x00000902,
+	0x80048104, 0xd2890002,
+	0x00000902, 0x80048104,
+	0xd2890003, 0x00000902,
+	0x80048104, 0xc069003a,
+	0x00000070, 0xbf8cc07f,
+	0x80709070, 0xbf06c004,
+	0xbf84ffee, 0xbe840080,
+	0xd2890000, 0x00000903,
+	0x80048104, 0xd2890001,
+	0x00000903, 0x80048104,
+	0xd2890002, 0x00000903,
+	0x80048104, 0xd2890003,
+	0x00000903, 0x80048104,
+	0xc069003a, 0x00000070,
+	0xbf8cc07f, 0x80709070,
+	0xbf06c004, 0xbf84ffee,
+	0x807c847c, 0xbf0a7b7c,
+	0xbf85ffb1, 0xbf9c0000,
+	0xbf820012, 0x7e000300,
+	0x7e020301, 0x7e040302,
+	0x7e060303, 0xe0724000,
+	0x701d0000, 0xe0724100,
+	0x701d0100, 0xe0724200,
+	0x701d0200, 0xe0724300,
+	0x701d0300, 0x807c847c,
+	0x8070ff70, 0x00000400,
+	0xbf0a7b7c, 0xbf85ffef,
+	0xbf9c0000, 0xbf8200da,
+	0xbef4007e, 0x8675ff7f,
+	0x0000ffff, 0x8775ff75,
+	0x00040000, 0xbef60080,
+	0xbef700ff, 0x00807fac,
+	0x866eff7f, 0x08000000,
+	0x8f6e836e, 0x87776e77,
+	0x866eff7f, 0x70000000,
+	0x8f6e816e, 0x87776e77,
+	0x866eff7f, 0x04000000,
+	0xbf84001e, 0xbefe00c1,
+	0xbeff00c1, 0xb8ef4306,
+	0x866fc16f, 0xbf840019,
+	0x8e6f866f, 0x8e6f826f,
+	0xbef6006f, 0xb8f82a05,
+	0x80788178, 0x8e788a78,
+	0xb8ee1605, 0x806e816e,
+	0x8e6e866e, 0x80786e78,
+	0x8078ff78, 0x00000080,
+	0xbef600ff, 0x01000000,
+	0xbefc0080, 0xe0510000,
+	0x781d0000, 0xe0510100,
+	0x781d0000, 0x807cff7c,
+	0x00000200, 0x8078ff78,
+	0x00000200, 0xbf0a6f7c,
+	0xbf85fff6, 0xbef80080,
 	0xbefe00c1, 0xbeff00c1,
-	0xb8f12a05, 0x80718171,
-	0x8e718271, 0x8e768871,
+	0xb8ef2a05, 0x806f816f,
+	0x8e6f826f, 0x8e76886f,
 	0xbef600ff, 0x01000000,
-	0xbefc0084, 0xbf0a717c,
-	0xbf840015, 0xbf11017c,
-	0x8071ff71, 0x00001000,
+	0xbeee0078, 0x8078ff78,
+	0x00000400, 0xbefc0084,
+	0xbf11087c, 0x806fff6f,
+	0x00008000, 0xe0524000,
+	0x781d0000, 0xe0524100,
+	0x781d0100, 0xe0524200,
+	0x781d0200, 0xe0524300,
+	0x781d0300, 0xbf8c0f70,
 	0x7e000300, 0x7e020301,
 	0x7e040302, 0x7e060303,
-	0xe0724000, 0x7a1d0000,
-	0xe0724100, 0x7a1d0100,
-	0xe0724200, 0x7a1d0200,
-	0xe0724300, 0x7a1d0300,
+	0x807c847c, 0x8078ff78,
+	0x00000400, 0xbf0a6f7c,
+	0xbf85ffee, 0xbf9c0000,
+	0xe0524000, 0x6e1d0000,
+	0xe0524100, 0x6e1d0100,
+	0xe0524200, 0x6e1d0200,
+	0xe0524300, 0x6e1d0300,
+	0xb8f82a05, 0x80788178,
+	0x8e788a78, 0xb8ee1605,
+	0x806e816e, 0x8e6e866e,
+	0x80786e78, 0x80f8c078,
+	0xb8ef1605, 0x806f816f,
+	0x8e6f846f, 0x8e76826f,
+	0xbef600ff, 0x01000000,
+	0xbefc006f, 0xc031003a,
+	0x00000078, 0x80f8c078,
+	0xbf8cc07f, 0x80fc907c,
+	0xbf800000, 0xbe802d00,
+	0xbe822d02, 0xbe842d04,
+	0xbe862d06, 0xbe882d08,
+	0xbe8a2d0a, 0xbe8c2d0c,
+	0xbe8e2d0e, 0xbf06807c,
+	0xbf84fff0, 0xb8f82a05,
+	0x80788178, 0x8e788a78,
+	0xb8ee1605, 0x806e816e,
+	0x8e6e866e, 0x80786e78,
+	0xbef60084, 0xbef600ff,
+	0x01000000, 0xc0211bfa,
+	0x00000078, 0x80788478,
+	0xc0211b3a, 0x00000078,
+	0x80788478, 0xc0211b7a,
+	0x00000078, 0x80788478,
+	0xc0211c3a, 0x00000078,
+	0x80788478, 0xc0211c7a,
+	0x00000078, 0x80788478,
+	0xc0211eba, 0x00000078,
+	0x80788478, 0xc0211efa,
+	0x00000078, 0x80788478,
+	0xc0211a3a, 0x00000078,
+	0x80788478, 0xc0211a7a,
+	0x00000078, 0x80788478,
+	0xc0211cfa, 0x00000078,
+	0x80788478, 0xbf8cc07f,
+	0xbefc006f, 0xbefe0070,
+	0xbeff0071, 0x866f7bff,
+	0x000003ff, 0xb96f4803,
+	0x866f7bff, 0xfffff800,
+	0x8f6f8b6f, 0xb96fa2c3,
+	0xb973f801, 0xb8ee2a05,
+	0x806e816e, 0x8e6e8a6e,
+	0xb8ef1605, 0x806f816f,
+	0x8e6f866f, 0x806e6f6e,
+	0x806e746e, 0x826f8075,
+	0x866fff6f, 0x0000ffff,
+	0xc00b1c37, 0x00000050,
+	0xc00b1d37, 0x00000060,
+	0xc0031e77, 0x00000074,
+	0xbf8cc07f, 0x866fff6d,
+	0xf8000000, 0x8f6f9b6f,
+	0x8e6f906f, 0xbeee0080,
+	0x876e6f6e, 0x866fff6d,
+	0x04000000, 0x8f6f9a6f,
+	0x8e6f8f6f, 0x876e6f6e,
+	0x866fff7a, 0x00800000,
+	0x8f6f976f, 0xb96ef807,
+	0x866dff6d, 0x0000ffff,
+	0x86fe7e7e, 0x86ea6a6a,
+	0x8f6e837a, 0xb96ee0c2,
+	0xbf800002, 0xb97a0002,
+	0xbf8a0000, 0x95806f6c,
+	0xbf810000, 0x00000000,
+};
+
+static const uint32_t cwsr_trap_gfx10_hex[] = {
+	0xbf820001, 0xbf8201c1,
+	0xb0804004, 0xb978f802,
+	0x8a788678, 0xb971f803,
+	0x876eff71, 0x00000400,
+	0xbf850033, 0x876eff71,
+	0x00000100, 0xbf840002,
+	0x8878ff78, 0x00002000,
+	0x8a77ff77, 0xff000000,
+	0xb96ef807, 0x876fff6e,
+	0x02000000, 0x8f6f866f,
+	0x88776f77, 0x876fff6e,
+	0x003f8000, 0x8f6f896f,
+	0x88776f77, 0x8a6eff6e,
+	0x023f8000, 0xb9eef807,
+	0xb97af812, 0xb97bf813,
+	0x8ffa887a, 0xf4051bbd,
+	0xfa000000, 0xbf8cc07f,
+	0xf4051ebd, 0xfa000008,
+	0xbf8cc07f, 0x87ee6e6e,
+	0xbf840001, 0xbe80206e,
+	0xb971f803, 0x8771ff71,
+	0x000001ff, 0xbf850002,
+	0x806c846c, 0x826d806d,
+	0x876dff6d, 0x0000ffff,
+	0x906e8977, 0x876fff6e,
+	0x003f8000, 0x906e8677,
+	0x876eff6e, 0x02000000,
+	0x886e6f6e, 0xb9eef807,
+	0x87fe7e7e, 0x87ea6a6a,
+	0xb9f8f802, 0xbe80226c,
+	0xb971f803, 0x8771ff71,
+	0x00000100, 0xbf840006,
+	0xbef60380, 0xb9f60203,
+	0x876dff6d, 0x0000ffff,
+	0x80ec886c, 0x82ed806d,
+	0xbef60380, 0xb9f60283,
+	0xb972f816, 0xb9762c07,
+	0x8f769a76, 0x886d766d,
+	0xb97603c7, 0x8f769976,
+	0x886d766d, 0xb9760647,
+	0x8f769876, 0x886d766d,
+	0xb976f807, 0x8776ff76,
+	0x00007fff, 0xb9f6f807,
+	0xbeee037e, 0xbeef037f,
+	0xbefe0480, 0xbf900004,
+	0xbf8e0002, 0xbf88fffe,
+	0xbef4037e, 0x8775ff7f,
+	0x0000ffff, 0x8875ff75,
+	0x00040000, 0xbef60380,
+	0xbef703ff, 0x10807fac,
+	0x8776ff7f, 0x08000000,
+	0x90768376, 0x88777677,
+	0x8776ff7f, 0x70000000,
+	0x90768176, 0x88777677,
+	0xbefb037c, 0xbefa0380,
+	0xb97302dc, 0x8f739973,
+	0x8873737f, 0xb97a2a05,
+	0x807a817a, 0x907c9973,
+	0x877c817c, 0xbf06817c,
+	0xbf850002, 0x8f7a897a,
+	0xbf820001, 0x8f7a8a7a,
+	0xb9761e06, 0x8f768a76,
+	0x807a767a, 0x807aff7a,
+	0x00000200, 0xbef603ff,
+	0x01000000, 0xbefe037c,
+	0xbefc037a, 0xf4611efa,
+	0xf8000000, 0x807a847a,
+	0xbefc037e, 0xbefe037c,
+	0xbefc037a, 0xf4611b3a,
+	0xf8000000, 0x807a847a,
+	0xbefc037e, 0xbefe037c,
+	0xbefc037a, 0xf4611b7a,
+	0xf8000000, 0x807a847a,
+	0xbefc037e, 0xbefe037c,
+	0xbefc037a, 0xf4611bba,
+	0xf8000000, 0x807a847a,
+	0xbefc037e, 0xbefe037c,
+	0xbefc037a, 0xf4611bfa,
+	0xf8000000, 0x807a847a,
+	0xbefc037e, 0xbefe037c,
+	0xbefc037a, 0xf4611e3a,
+	0xf8000000, 0x807a847a,
+	0xbefc037e, 0xb971f803,
+	0xbefe037c, 0xbefc037a,
+	0xf4611c7a, 0xf8000000,
+	0x807a847a, 0xbefc037e,
+	0xbefe037c, 0xbefc037a,
+	0xf4611cba, 0xf8000000,
+	0x807a847a, 0xbefc037e,
+	0xb97bf801, 0xbefe037c,
+	0xbefc037a, 0xf4611efa,
+	0xf8000000, 0x807a847a,
+	0xbefc037e, 0xb97bf814,
+	0xbefe037c, 0xbefc037a,
+	0xf4611efa, 0xf8000000,
+	0x807a847a, 0xbefc037e,
+	0xb97bf815, 0xbefe037c,
+	0xbefc037a, 0xf4611efa,
+	0xf8000000, 0x807a847a,
+	0xbefc037e, 0x8776ff7f,
+	0x04000000, 0xbeef0380,
+	0x886f6f76, 0xb97a2a05,
+	0x807a817a, 0x907c9973,
+	0x877c817c, 0xbf06817c,
+	0xbf850002, 0x8f7a897a,
+	0xbf820001, 0x8f7a8a7a,
+	0xb9761e06, 0x8f768a76,
+	0x807a767a, 0xbef603ff,
+	0x01000000, 0xbef20374,
+	0x80747a74, 0x82758075,
+	0xbefc0380, 0xbf800000,
+	0xbe802f00, 0xbe822f02,
+	0xbe842f04, 0xbe862f06,
+	0xbe882f08, 0xbe8a2f0a,
+	0xbe8c2f0c, 0xbe8e2f0e,
+	0xf469003a, 0xfa000000,
+	0xf469013a, 0xfa000010,
+	0xf469023a, 0xfa000020,
+	0xf469033a, 0xfa000030,
+	0x8074c074, 0x82758075,
+	0x807c907c, 0xbf0aff7c,
+	0x00000060, 0xbf85ffea,
+	0xbe802f00, 0xbe822f02,
+	0xbe842f04, 0xbe862f06,
+	0xbe882f08, 0xbe8a2f0a,
+	0xf469003a, 0xfa000000,
+	0xf469013a, 0xfa000010,
+	0xf469023a, 0xfa000020,
+	0x8074b074, 0x82758075,
+	0xbef40372, 0xbefa0380,
+	0xbefe03c1, 0x907c9973,
+	0x877c817c, 0xbf06817c,
+	0xbf850002, 0xbeff0380,
+	0xbf820002, 0xbeff03c1,
+	0xbf82000b, 0xbef603ff,
+	0x01000000, 0xe0704000,
+	0x7a5d0000, 0xe0704080,
+	0x7a5d0100, 0xe0704100,
+	0x7a5d0200, 0xe0704180,
+	0x7a5d0300, 0xbf82000a,
+	0xbef603ff, 0x01000000,
+	0xe0704000, 0x7a5d0000,
+	0xe0704100, 0x7a5d0100,
+	0xe0704200, 0x7a5d0200,
+	0xe0704300, 0x7a5d0300,
+	0xbefe03c1, 0x907c9973,
+	0x877c817c, 0xbf06817c,
+	0xbf850002, 0xbeff0380,
+	0xbf820001, 0xbeff03c1,
+	0xb9714306, 0x8771c171,
+	0xbf840046, 0xbf8a0000,
+	0x8776ff6f, 0x04000000,
+	0xbf840042, 0x8f718671,
+	0x8f718271, 0xbef60371,
+	0xb97a2a05, 0x807a817a,
+	0x907c9973, 0x877c817c,
+	0xbf06817c, 0xbf850002,
+	0x8f7a897a, 0xbf820001,
+	0x8f7a8a7a, 0xb9761e06,
+	0x8f768a76, 0x807a767a,
+	0x807aff7a, 0x00000200,
+	0x807aff7a, 0x00000080,
+	0xbef603ff, 0x01000000,
+	0xd7650000, 0x000100c1,
+	0xd7660000, 0x000200c1,
+	0x16000084, 0x907c9973,
+	0x877c817c, 0xbf06817c,
+	0xbefc0380, 0xbf850012,
+	0xbe8303ff, 0x00000080,
+	0xbf800000, 0xbf800000,
+	0xbf800000, 0xd8d80000,
+	0x01000000, 0xbf8c0000,
+	0xe0704000, 0x7a5d0100,
+	0x807c037c, 0x807a037a,
+	0xd5250000, 0x0001ff00,
+	0x00000080, 0xbf0a717c,
+	0xbf85fff4, 0xbf820011,
+	0xbe8303ff, 0x00000100,
+	0xbf800000, 0xbf800000,
+	0xbf800000, 0xd8d80000,
+	0x01000000, 0xbf8c0000,
+	0xe0704000, 0x7a5d0100,
+	0x807c037c, 0x807a037a,
+	0xd5250000, 0x0001ff00,
+	0x00000100, 0xbf0a717c,
+	0xbf85fff4, 0xbefe03c1,
+	0x907c9973, 0x877c817c,
+	0xbf06817c, 0xbf850004,
+	0xbefa03ff, 0x00000200,
+	0xbeff0380, 0xbf820003,
+	0xbefa03ff, 0x00000400,
+	0xbeff03c1, 0xb9712a05,
+	0x80718171, 0x8f718271,
+	0x907c9973, 0x877c817c,
+	0xbf06817c, 0xbf850017,
+	0xbef603ff, 0x01000000,
+	0xbefc0384, 0xbf0a717c,
+	0xbf840037, 0x7e008700,
+	0x7e028701, 0x7e048702,
+	0x7e068703, 0xe0704000,
+	0x7a5d0000, 0xe0704080,
+	0x7a5d0100, 0xe0704100,
+	0x7a5d0200, 0xe0704180,
+	0x7a5d0300, 0x807c847c,
+	0x807aff7a, 0x00000200,
+	0xbf0a717c, 0xbf85ffef,
+	0xbf820025, 0xbef603ff,
+	0x01000000, 0xbefc0384,
+	0xbf0a717c, 0xbf840020,
+	0x7e008700, 0x7e028701,
+	0x7e048702, 0x7e068703,
+	0xe0704000, 0x7a5d0000,
+	0xe0704100, 0x7a5d0100,
+	0xe0704200, 0x7a5d0200,
+	0xe0704300, 0x7a5d0300,
 	0x807c847c, 0x807aff7a,
 	0x00000400, 0xbf0a717c,
-	0xbf85ffef, 0xbf9c0000,
-	0xbf8200dc, 0xbef4007e,
+	0xbf85ffef, 0xb9711e06,
+	0x8771c171, 0xbf84000c,
+	0x8f718371, 0x80717c71,
+	0xbefe03c1, 0xbeff0380,
+	0x7e008700, 0xe0704000,
+	0x7a5d0000, 0x807c817c,
+	0x807aff7a, 0x00000080,
+	0xbf0a717c, 0xbf85fff8,
+	0xbf820142, 0xbef4037e,
+	0x8775ff7f, 0x0000ffff,
+	0x8875ff75, 0x00040000,
+	0xbef60380, 0xbef703ff,
+	0x10807fac, 0x8772ff7f,
+	0x08000000, 0x90728372,
+	0x88777277, 0x8772ff7f,
+	0x70000000, 0x90728172,
+	0x88777277, 0xb97302dc,
+	0x8f739973, 0x8873737f,
+	0x8772ff7f, 0x04000000,
+	0xbf840036, 0xbefe03c1,
+	0x907c9973, 0x877c817c,
+	0xbf06817c, 0xbf850002,
+	0xbeff0380, 0xbf820001,
+	0xbeff03c1, 0xb96f4306,
+	0x876fc16f, 0xbf84002b,
+	0x8f6f866f, 0x8f6f826f,
+	0xbef6036f, 0xb9782a05,
+	0x80788178, 0x907c9973,
+	0x877c817c, 0xbf06817c,
+	0xbf850002, 0x8f788978,
+	0xbf820001, 0x8f788a78,
+	0xb9721e06, 0x8f728a72,
+	0x80787278, 0x8078ff78,
+	0x00000200, 0x8078ff78,
+	0x00000080, 0xbef603ff,
+	0x01000000, 0x907c9973,
+	0x877c817c, 0xbf06817c,
+	0xbefc0380, 0xbf850009,
+	0xe0310000, 0x781d0000,
+	0x807cff7c, 0x00000080,
+	0x8078ff78, 0x00000080,
+	0xbf0a6f7c, 0xbf85fff8,
+	0xbf820008, 0xe0310000,
+	0x781d0000, 0x807cff7c,
+	0x00000100, 0x8078ff78,
+	0x00000100, 0xbf0a6f7c,
+	0xbf85fff8, 0xbef80380,
+	0xbefe03c1, 0x907c9973,
+	0x877c817c, 0xbf06817c,
+	0xbf850002, 0xbeff0380,
+	0xbf820001, 0xbeff03c1,
+	0xb96f2a05, 0x806f816f,
+	0x8f6f826f, 0x907c9973,
+	0x877c817c, 0xbf06817c,
+	0xbf850021, 0xbef603ff,
+	0x01000000, 0xbef20378,
+	0x8078ff78, 0x00000200,
+	0xbefc0384, 0xe0304000,
+	0x785d0000, 0xe0304080,
+	0x785d0100, 0xe0304100,
+	0x785d0200, 0xe0304180,
+	0x785d0300, 0xbf8c3f70,
+	0x7e008500, 0x7e028501,
+	0x7e048502, 0x7e068503,
+	0x807c847c, 0x8078ff78,
+	0x00000200, 0xbf0a6f7c,
+	0xbf85ffee, 0xe0304000,
+	0x725d0000, 0xe0304080,
+	0x725d0100, 0xe0304100,
+	0x725d0200, 0xe0304180,
+	0x725d0300, 0xbf820032,
+	0xbef603ff, 0x01000000,
+	0xbef20378, 0x8078ff78,
+	0x00000400, 0xbefc0384,
+	0xe0304000, 0x785d0000,
+	0xe0304100, 0x785d0100,
+	0xe0304200, 0x785d0200,
+	0xe0304300, 0x785d0300,
+	0xbf8c3f70, 0x7e008500,
+	0x7e028501, 0x7e048502,
+	0x7e068503, 0x807c847c,
+	0x8078ff78, 0x00000400,
+	0xbf0a6f7c, 0xbf85ffee,
+	0xb96f1e06, 0x876fc16f,
+	0xbf84000e, 0x8f6f836f,
+	0x806f7c6f, 0xbefe03c1,
+	0xbeff0380, 0xe0304000,
+	0x785d0000, 0xbf8c3f70,
+	0x7e008500, 0x807c817c,
+	0x8078ff78, 0x00000080,
+	0xbf0a6f7c, 0xbf85fff7,
+	0xbeff03c1, 0xe0304000,
+	0x725d0000, 0xe0304100,
+	0x725d0100, 0xe0304200,
+	0x725d0200, 0xe0304300,
+	0x725d0300, 0xbf8c3f70,
+	0xb9782a05, 0x80788178,
+	0x907c9973, 0x877c817c,
+	0xbf06817c, 0xbf850002,
+	0x8f788978, 0xbf820001,
+	0x8f788a78, 0xb9721e06,
+	0x8f728a72, 0x80787278,
+	0x8078ff78, 0x00000200,
+	0x80f8ff78, 0x00000050,
+	0xbef603ff, 0x01000000,
+	0xbefc03ff, 0x0000006c,
+	0x80f89078, 0xf429003a,
+	0xf0000000, 0xbf8cc07f,
+	0x80fc847c, 0xbf800000,
+	0xbe803100, 0xbe823102,
+	0x80f8a078, 0xf42d003a,
+	0xf0000000, 0xbf8cc07f,
+	0x80fc887c, 0xbf800000,
+	0xbe803100, 0xbe823102,
+	0xbe843104, 0xbe863106,
+	0x80f8c078, 0xf431003a,
+	0xf0000000, 0xbf8cc07f,
+	0x80fc907c, 0xbf800000,
+	0xbe803100, 0xbe823102,
+	0xbe843104, 0xbe863106,
+	0xbe883108, 0xbe8a310a,
+	0xbe8c310c, 0xbe8e310e,
+	0xbf06807c, 0xbf84fff0,
+	0xb9782a05, 0x80788178,
+	0x907c9973, 0x877c817c,
+	0xbf06817c, 0xbf850002,
+	0x8f788978, 0xbf820001,
+	0x8f788a78, 0xb9721e06,
+	0x8f728a72, 0x80787278,
+	0x8078ff78, 0x00000200,
+	0xbef603ff, 0x01000000,
+	0xf4211bfa, 0xf0000000,
+	0x80788478, 0xf4211b3a,
+	0xf0000000, 0x80788478,
+	0xf4211b7a, 0xf0000000,
+	0x80788478, 0xf4211eba,
+	0xf0000000, 0x80788478,
+	0xf4211efa, 0xf0000000,
+	0x80788478, 0xf4211c3a,
+	0xf0000000, 0x80788478,
+	0xf4211c7a, 0xf0000000,
+	0x80788478, 0xf4211e7a,
+	0xf0000000, 0x80788478,
+	0xf4211cfa, 0xf0000000,
+	0x80788478, 0xf4211bba,
+	0xf0000000, 0x80788478,
+	0xbf8cc07f, 0xb9eef814,
+	0xf4211bba, 0xf0000000,
+	0x80788478, 0xbf8cc07f,
+	0xb9eef815, 0xbef2036d,
+	0x876dff72, 0x0000ffff,
+	0xbefc036f, 0xbefe037a,
+	0xbeff037b, 0x876f71ff,
+	0x000003ff, 0xb9ef4803,
+	0xb9f9f816, 0x876f71ff,
+	0xfffff800, 0x906f8b6f,
+	0xb9efa2c3, 0xb9f3f801,
+	0x876fff72, 0xfc000000,
+	0x906f9a6f, 0x8f6f906f,
+	0xbef30380, 0x88736f73,
+	0x876fff72, 0x02000000,
+	0x906f996f, 0x8f6f8f6f,
+	0x88736f73, 0x876fff72,
+	0x01000000, 0x906f986f,
+	0x8f6f996f, 0x88736f73,
+	0x876fff70, 0x00800000,
+	0x906f976f, 0xb9f3f807,
+	0x87fe7e7e, 0x87ea6a6a,
+	0xb9f0f802, 0xbf8a0000,
+	0xbe80226c, 0xbf810000,
+	0xbf9f0000, 0xbf9f0000,
+	0xbf9f0000, 0xbf9f0000,
+	0xbf9f0000, 0x00000000,
+};
+static const uint32_t cwsr_trap_arcturus_hex[] = {
+	0xbf820001, 0xbf8202c4,
+	0xb8f8f802, 0x89788678,
+	0xb8eef801, 0x866eff6e,
+	0x00000800, 0xbf840003,
+	0x866eff78, 0x00002000,
+	0xbf840016, 0xb8fbf803,
+	0x866eff7b, 0x00000400,
+	0xbf85003b, 0x866eff7b,
+	0x00000800, 0xbf850003,
+	0x866eff7b, 0x00000100,
+	0xbf84000c, 0x866eff78,
+	0x00002000, 0xbf840005,
+	0xbf8e0010, 0xb8eef803,
+	0x866eff6e, 0x00000400,
+	0xbf84fffb, 0x8778ff78,
+	0x00002000, 0x80ec886c,
+	0x82ed806d, 0xb8eef807,
+	0x866fff6e, 0x001f8000,
+	0x8e6f8b6f, 0x8977ff77,
+	0xfc000000, 0x87776f77,
+	0x896eff6e, 0x001f8000,
+	0xb96ef807, 0xb8faf812,
+	0xb8fbf813, 0x8efa887a,
+	0xc0071bbd, 0x00000000,
+	0xbf8cc07f, 0xc0071ebd,
+	0x00000008, 0xbf8cc07f,
+	0x86ee6e6e, 0xbf840001,
+	0xbe801d6e, 0xb8fbf803,
+	0x867bff7b, 0x000001ff,
+	0xbf850002, 0x806c846c,
+	0x826d806d, 0x866dff6d,
+	0x0000ffff, 0x8f6e8b77,
+	0x866eff6e, 0x001f8000,
+	0xb96ef807, 0x86fe7e7e,
+	0x86ea6a6a, 0x8f6e8378,
+	0xb96ee0c2, 0xbf800002,
+	0xb9780002, 0xbe801f6c,
+	0x866dff6d, 0x0000ffff,
+	0xbefa0080, 0xb97a0283,
+	0xb8fa2407, 0x8e7a9b7a,
+	0x876d7a6d, 0xb8fa03c7,
+	0x8e7a9a7a, 0x876d7a6d,
+	0xb8faf807, 0x867aff7a,
+	0x00007fff, 0xb97af807,
+	0xbeee007e, 0xbeef007f,
+	0xbefe0180, 0xbf900004,
+	0x877a8478, 0xb97af802,
+	0xbf8e0002, 0xbf88fffe,
+	0xb8fa2a05, 0x807a817a,
+	0x8e7a8a7a, 0x8e7a817a,
+	0xb8fb1605, 0x807b817b,
+	0x8e7b867b, 0x807a7b7a,
+	0x807a7e7a, 0x827b807f,
+	0x867bff7b, 0x0000ffff,
+	0xc04b1c3d, 0x00000050,
+	0xbf8cc07f, 0xc04b1d3d,
+	0x00000060, 0xbf8cc07f,
+	0xc0431e7d, 0x00000074,
+	0xbf8cc07f, 0xbef4007e,
 	0x8675ff7f, 0x0000ffff,
 	0x8775ff75, 0x00040000,
 	0xbef60080, 0xbef700ff,
-	0x00807fac, 0x866eff7f,
-	0x08000000, 0x8f6e836e,
-	0x87776e77, 0x866eff7f,
-	0x70000000, 0x8f6e816e,
-	0x87776e77, 0x866eff7f,
-	0x04000000, 0xbf84001e,
+	0x00807fac, 0x867aff7f,
+	0x08000000, 0x8f7a837a,
+	0x87777a77, 0x867aff7f,
+	0x70000000, 0x8f7a817a,
+	0x87777a77, 0xbef1007c,
+	0xbef00080, 0xb8f02a05,
+	0x80708170, 0x8e708a70,
+	0x8e708170, 0xb8fa1605,
+	0x807a817a, 0x8e7a867a,
+	0x80707a70, 0xbef60084,
+	0xbef600ff, 0x01000000,
+	0xbefe007c, 0xbefc0070,
+	0xc0611c7a, 0x0000007c,
+	0xbf8cc07f, 0x80708470,
+	0xbefc007e, 0xbefe007c,
+	0xbefc0070, 0xc0611b3a,
+	0x0000007c, 0xbf8cc07f,
+	0x80708470, 0xbefc007e,
+	0xbefe007c, 0xbefc0070,
+	0xc0611b7a, 0x0000007c,
+	0xbf8cc07f, 0x80708470,
+	0xbefc007e, 0xbefe007c,
+	0xbefc0070, 0xc0611bba,
+	0x0000007c, 0xbf8cc07f,
+	0x80708470, 0xbefc007e,
+	0xbefe007c, 0xbefc0070,
+	0xc0611bfa, 0x0000007c,
+	0xbf8cc07f, 0x80708470,
+	0xbefc007e, 0xbefe007c,
+	0xbefc0070, 0xc0611e3a,
+	0x0000007c, 0xbf8cc07f,
+	0x80708470, 0xbefc007e,
+	0xb8fbf803, 0xbefe007c,
+	0xbefc0070, 0xc0611efa,
+	0x0000007c, 0xbf8cc07f,
+	0x80708470, 0xbefc007e,
+	0xbefe007c, 0xbefc0070,
+	0xc0611a3a, 0x0000007c,
+	0xbf8cc07f, 0x80708470,
+	0xbefc007e, 0xbefe007c,
+	0xbefc0070, 0xc0611a7a,
+	0x0000007c, 0xbf8cc07f,
+	0x80708470, 0xbefc007e,
+	0xb8f1f801, 0xbefe007c,
+	0xbefc0070, 0xc0611c7a,
+	0x0000007c, 0xbf8cc07f,
+	0x80708470, 0xbefc007e,
+	0x867aff7f, 0x04000000,
+	0xbeef0080, 0x876f6f7a,
+	0xb8f02a05, 0x80708170,
+	0x8e708a70, 0x8e708170,
+	0xb8fb1605, 0x807b817b,
+	0x8e7b847b, 0x8e76827b,
+	0xbef600ff, 0x01000000,
+	0xbef20174, 0x80747074,
+	0x82758075, 0xbefc0080,
+	0xbf800000, 0xbe802b00,
+	0xbe822b02, 0xbe842b04,
+	0xbe862b06, 0xbe882b08,
+	0xbe8a2b0a, 0xbe8c2b0c,
+	0xbe8e2b0e, 0xc06b003a,
+	0x00000000, 0xbf8cc07f,
+	0xc06b013a, 0x00000010,
+	0xbf8cc07f, 0xc06b023a,
+	0x00000020, 0xbf8cc07f,
+	0xc06b033a, 0x00000030,
+	0xbf8cc07f, 0x8074c074,
+	0x82758075, 0x807c907c,
+	0xbf0a7b7c, 0xbf85ffe7,
+	0xbef40172, 0xbef00080,
 	0xbefe00c1, 0xbeff00c1,
-	0xb8ef4306, 0x866fc16f,
-	0xbf840019, 0x8e6f866f,
-	0x8e6f826f, 0xbef6006f,
-	0xb8f82a05, 0x80788178,
-	0x8e788a78, 0xb8ee1605,
+	0xbee80080, 0xbee90080,
+	0xbef600ff, 0x01000000,
+	0x867aff78, 0x00400000,
+	0xbf850003, 0xb8faf803,
+	0x897a7aff, 0x10000000,
+	0xbf85004d, 0xbe840080,
+	0xd2890000, 0x00000900,
+	0x80048104, 0xd2890001,
+	0x00000900, 0x80048104,
+	0xd2890002, 0x00000900,
+	0x80048104, 0xd2890003,
+	0x00000900, 0x80048104,
+	0xc069003a, 0x00000070,
+	0xbf8cc07f, 0x80709070,
+	0xbf06c004, 0xbf84ffee,
+	0xbe840080, 0xd2890000,
+	0x00000901, 0x80048104,
+	0xd2890001, 0x00000901,
+	0x80048104, 0xd2890002,
+	0x00000901, 0x80048104,
+	0xd2890003, 0x00000901,
+	0x80048104, 0xc069003a,
+	0x00000070, 0xbf8cc07f,
+	0x80709070, 0xbf06c004,
+	0xbf84ffee, 0xbe840080,
+	0xd2890000, 0x00000902,
+	0x80048104, 0xd2890001,
+	0x00000902, 0x80048104,
+	0xd2890002, 0x00000902,
+	0x80048104, 0xd2890003,
+	0x00000902, 0x80048104,
+	0xc069003a, 0x00000070,
+	0xbf8cc07f, 0x80709070,
+	0xbf06c004, 0xbf84ffee,
+	0xbe840080, 0xd2890000,
+	0x00000903, 0x80048104,
+	0xd2890001, 0x00000903,
+	0x80048104, 0xd2890002,
+	0x00000903, 0x80048104,
+	0xd2890003, 0x00000903,
+	0x80048104, 0xc069003a,
+	0x00000070, 0xbf8cc07f,
+	0x80709070, 0xbf06c004,
+	0xbf84ffee, 0xbf820008,
+	0xe0724000, 0x701d0000,
+	0xe0724100, 0x701d0100,
+	0xe0724200, 0x701d0200,
+	0xe0724300, 0x701d0300,
+	0xbefe00c1, 0xbeff00c1,
+	0xb8fb4306, 0x867bc17b,
+	0xbf840064, 0xbf8a0000,
+	0x867aff6f, 0x04000000,
+	0xbf840060, 0x8e7b867b,
+	0x8e7b827b, 0xbef6007b,
+	0xb8f02a05, 0x80708170,
+	0x8e708a70, 0x8e708170,
+	0xb8fa1605, 0x807a817a,
+	0x8e7a867a, 0x80707a70,
+	0x8070ff70, 0x00000080,
+	0xbef600ff, 0x01000000,
+	0xbefc0080, 0xd28c0002,
+	0x000100c1, 0xd28d0003,
+	0x000204c1, 0x867aff78,
+	0x00400000, 0xbf850003,
+	0xb8faf803, 0x897a7aff,
+	0x10000000, 0xbf850030,
+	0x24040682, 0xd86e4000,
+	0x00000002, 0xbf8cc07f,
+	0xbe840080, 0xd2890000,
+	0x00000900, 0x80048104,
+	0xd2890001, 0x00000900,
+	0x80048104, 0xd2890002,
+	0x00000900, 0x80048104,
+	0xd2890003, 0x00000900,
+	0x80048104, 0xc069003a,
+	0x00000070, 0xbf8cc07f,
+	0x80709070, 0xbf06c004,
+	0xbf84ffee, 0xbe840080,
+	0xd2890000, 0x00000901,
+	0x80048104, 0xd2890001,
+	0x00000901, 0x80048104,
+	0xd2890002, 0x00000901,
+	0x80048104, 0xd2890003,
+	0x00000901, 0x80048104,
+	0xc069003a, 0x00000070,
+	0xbf8cc07f, 0x80709070,
+	0xbf06c004, 0xbf84ffee,
+	0x680404ff, 0x00000200,
+	0xd0c9006a, 0x0000f702,
+	0xbf87ffd2, 0xbf820015,
+	0xd1060002, 0x00011103,
+	0x7e0602ff, 0x00000200,
+	0xbefc00ff, 0x00010000,
+	0xbe800077, 0x8677ff77,
+	0xff7fffff, 0x8777ff77,
+	0x00058000, 0xd8ec0000,
+	0x00000002, 0xbf8cc07f,
+	0xe0765000, 0x701d0002,
+	0x68040702, 0xd0c9006a,
+	0x0000f702, 0xbf87fff7,
+	0xbef70000, 0xbef000ff,
+	0x00000400, 0xbefe00c1,
+	0xbeff00c1, 0xb8fb2a05,
+	0x807b817b, 0x8e7b827b,
+	0x8e76887b, 0xbef600ff,
+	0x01000000, 0xbefc0084,
+	0xbf0a7b7c, 0xbf84006d,
+	0xbf11017c, 0x807bff7b,
+	0x00001000, 0x867aff78,
+	0x00400000, 0xbf850003,
+	0xb8faf803, 0x897a7aff,
+	0x10000000, 0xbf850051,
+	0xbe840080, 0xd2890000,
+	0x00000900, 0x80048104,
+	0xd2890001, 0x00000900,
+	0x80048104, 0xd2890002,
+	0x00000900, 0x80048104,
+	0xd2890003, 0x00000900,
+	0x80048104, 0xc069003a,
+	0x00000070, 0xbf8cc07f,
+	0x80709070, 0xbf06c004,
+	0xbf84ffee, 0xbe840080,
+	0xd2890000, 0x00000901,
+	0x80048104, 0xd2890001,
+	0x00000901, 0x80048104,
+	0xd2890002, 0x00000901,
+	0x80048104, 0xd2890003,
+	0x00000901, 0x80048104,
+	0xc069003a, 0x00000070,
+	0xbf8cc07f, 0x80709070,
+	0xbf06c004, 0xbf84ffee,
+	0xbe840080, 0xd2890000,
+	0x00000902, 0x80048104,
+	0xd2890001, 0x00000902,
+	0x80048104, 0xd2890002,
+	0x00000902, 0x80048104,
+	0xd2890003, 0x00000902,
+	0x80048104, 0xc069003a,
+	0x00000070, 0xbf8cc07f,
+	0x80709070, 0xbf06c004,
+	0xbf84ffee, 0xbe840080,
+	0xd2890000, 0x00000903,
+	0x80048104, 0xd2890001,
+	0x00000903, 0x80048104,
+	0xd2890002, 0x00000903,
+	0x80048104, 0xd2890003,
+	0x00000903, 0x80048104,
+	0xc069003a, 0x00000070,
+	0xbf8cc07f, 0x80709070,
+	0xbf06c004, 0xbf84ffee,
+	0x807c847c, 0xbf0a7b7c,
+	0xbf85ffb1, 0xbf9c0000,
+	0xbf820012, 0x7e000300,
+	0x7e020301, 0x7e040302,
+	0x7e060303, 0xe0724000,
+	0x701d0000, 0xe0724100,
+	0x701d0100, 0xe0724200,
+	0x701d0200, 0xe0724300,
+	0x701d0300, 0x807c847c,
+	0x8070ff70, 0x00000400,
+	0xbf0a7b7c, 0xbf85ffef,
+	0xbf9c0000, 0xbefc0080,
+	0xbf11017c, 0x867aff78,
+	0x00400000, 0xbf850003,
+	0xb8faf803, 0x897a7aff,
+	0x10000000, 0xbf850059,
+	0xd3d84000, 0x18000100,
+	0xd3d84001, 0x18000101,
+	0xd3d84002, 0x18000102,
+	0xd3d84003, 0x18000103,
+	0xbe840080, 0xd2890000,
+	0x00000900, 0x80048104,
+	0xd2890001, 0x00000900,
+	0x80048104, 0xd2890002,
+	0x00000900, 0x80048104,
+	0xd2890003, 0x00000900,
+	0x80048104, 0xc069003a,
+	0x00000070, 0xbf8cc07f,
+	0x80709070, 0xbf06c004,
+	0xbf84ffee, 0xbe840080,
+	0xd2890000, 0x00000901,
+	0x80048104, 0xd2890001,
+	0x00000901, 0x80048104,
+	0xd2890002, 0x00000901,
+	0x80048104, 0xd2890003,
+	0x00000901, 0x80048104,
+	0xc069003a, 0x00000070,
+	0xbf8cc07f, 0x80709070,
+	0xbf06c004, 0xbf84ffee,
+	0xbe840080, 0xd2890000,
+	0x00000902, 0x80048104,
+	0xd2890001, 0x00000902,
+	0x80048104, 0xd2890002,
+	0x00000902, 0x80048104,
+	0xd2890003, 0x00000902,
+	0x80048104, 0xc069003a,
+	0x00000070, 0xbf8cc07f,
+	0x80709070, 0xbf06c004,
+	0xbf84ffee, 0xbe840080,
+	0xd2890000, 0x00000903,
+	0x80048104, 0xd2890001,
+	0x00000903, 0x80048104,
+	0xd2890002, 0x00000903,
+	0x80048104, 0xd2890003,
+	0x00000903, 0x80048104,
+	0xc069003a, 0x00000070,
+	0xbf8cc07f, 0x80709070,
+	0xbf06c004, 0xbf84ffee,
+	0x807c847c, 0xbf0a7b7c,
+	0xbf85ffa9, 0xbf9c0000,
+	0xbf820016, 0xd3d84000,
+	0x18000100, 0xd3d84001,
+	0x18000101, 0xd3d84002,
+	0x18000102, 0xd3d84003,
+	0x18000103, 0xe0724000,
+	0x701d0000, 0xe0724100,
+	0x701d0100, 0xe0724200,
+	0x701d0200, 0xe0724300,
+	0x701d0300, 0x807c847c,
+	0x8070ff70, 0x00000400,
+	0xbf0a7b7c, 0xbf85ffeb,
+	0xbf9c0000, 0xbf820106,
+	0xbef4007e, 0x8675ff7f,
+	0x0000ffff, 0x8775ff75,
+	0x00040000, 0xbef60080,
+	0xbef700ff, 0x00807fac,
+	0x866eff7f, 0x08000000,
+	0x8f6e836e, 0x87776e77,
+	0x866eff7f, 0x70000000,
+	0x8f6e816e, 0x87776e77,
+	0x866eff7f, 0x04000000,
+	0xbf84001f, 0xbefe00c1,
+	0xbeff00c1, 0xb8ef4306,
+	0x866fc16f, 0xbf84001a,
+	0x8e6f866f, 0x8e6f826f,
+	0xbef6006f, 0xb8f82a05,
+	0x80788178, 0x8e788a78,
+	0x8e788178, 0xb8ee1605,
 	0x806e816e, 0x8e6e866e,
 	0x80786e78, 0x8078ff78,
 	0x00000080, 0xbef600ff,
@@ -482,42 +1455,63 @@ static const uint32_t cwsr_trap_gfx9_hex[] = {
 	0xbef80080, 0xbefe00c1,
 	0xbeff00c1, 0xb8ef2a05,
 	0x806f816f, 0x8e6f826f,
-	0x8e76886f, 0xbef600ff,
-	0x01000000, 0xbeee0078,
-	0x8078ff78, 0x00000400,
+	0x8e76886f, 0xbef90076,
+	0xbef600ff, 0x01000000,
+	0xbeee0078, 0x8078ff78,
+	0x00000400, 0xbef30079,
+	0x8079ff79, 0x00000400,
 	0xbefc0084, 0xbf11087c,
 	0x806fff6f, 0x00008000,
-	0xe0524000, 0x781d0000,
-	0xe0524100, 0x781d0100,
-	0xe0524200, 0x781d0200,
-	0xe0524300, 0x781d0300,
-	0xbf8c0f70, 0x7e000300,
-	0x7e020301, 0x7e040302,
-	0x7e060303, 0x807c847c,
-	0x8078ff78, 0x00000400,
-	0xbf0a6f7c, 0xbf85ffee,
-	0xbf9c0000, 0xe0524000,
+	0xe0524000, 0x791d0000,
+	0xe0524100, 0x791d0100,
+	0xe0524200, 0x791d0200,
+	0xe0524300, 0x791d0300,
+	0x8079ff79, 0x00000400,
+	0xbf8c0f70, 0xd3d94000,
+	0x18000100, 0xd3d94001,
+	0x18000101, 0xd3d94002,
+	0x18000102, 0xd3d94003,
+	0x18000103, 0xe0524000,
+	0x781d0000, 0xe0524100,
+	0x781d0100, 0xe0524200,
+	0x781d0200, 0xe0524300,
+	0x781d0300, 0xbf8c0f70,
+	0x7e000300, 0x7e020301,
+	0x7e040302, 0x7e060303,
+	0x807c847c, 0x8078ff78,
+	0x00000400, 0xbf0a6f7c,
+	0xbf85ffdb, 0xbf9c0000,
+	0xe0524000, 0x731d0000,
+	0xe0524100, 0x731d0100,
+	0xe0524200, 0x731d0200,
+	0xe0524300, 0x731d0300,
+	0xbf8c0f70, 0xd3d94000,
+	0x18000100, 0xd3d94001,
+	0x18000101, 0xd3d94002,
+	0x18000102, 0xd3d94003,
+	0x18000103, 0xe0524000,
 	0x6e1d0000, 0xe0524100,
 	0x6e1d0100, 0xe0524200,
 	0x6e1d0200, 0xe0524300,
 	0x6e1d0300, 0xb8f82a05,
 	0x80788178, 0x8e788a78,
-	0xb8ee1605, 0x806e816e,
-	0x8e6e866e, 0x80786e78,
-	0x80f8c078, 0xb8ef1605,
-	0x806f816f, 0x8e6f846f,
-	0x8e76826f, 0xbef600ff,
-	0x01000000, 0xbefc006f,
-	0xc031003a, 0x00000078,
-	0x80f8c078, 0xbf8cc07f,
-	0x80fc907c, 0xbf800000,
-	0xbe802d00, 0xbe822d02,
-	0xbe842d04, 0xbe862d06,
-	0xbe882d08, 0xbe8a2d0a,
-	0xbe8c2d0c, 0xbe8e2d0e,
-	0xbf06807c, 0xbf84fff0,
-	0xb8f82a05, 0x80788178,
-	0x8e788a78, 0xb8ee1605,
+	0x8e788178, 0xb8ee1605,
+	0x806e816e, 0x8e6e866e,
+	0x80786e78, 0x80f8c078,
+	0xb8ef1605, 0x806f816f,
+	0x8e6f846f, 0x8e76826f,
+	0xbef600ff, 0x01000000,
+	0xbefc006f, 0xc031003a,
+	0x00000078, 0x80f8c078,
+	0xbf8cc07f, 0x80fc907c,
+	0xbf800000, 0xbe802d00,
+	0xbe822d02, 0xbe842d04,
+	0xbe862d06, 0xbe882d08,
+	0xbe8a2d0a, 0xbe8c2d0c,
+	0xbe8e2d0e, 0xbf06807c,
+	0xbf84fff0, 0xb8f82a05,
+	0x80788178, 0x8e788a78,
+	0x8e788178, 0xb8ee1605,
 	0x806e816e, 0x8e6e866e,
 	0x80786e78, 0xbef60084,
 	0xbef600ff, 0x01000000,
@@ -525,44 +1519,44 @@ static const uint32_t cwsr_trap_gfx9_hex[] = {
 	0x80788478, 0xc0211b3a,
 	0x00000078, 0x80788478,
 	0xc0211b7a, 0x00000078,
-	0x80788478, 0xc0211eba,
-	0x00000078, 0x80788478,
-	0xc0211efa, 0x00000078,
 	0x80788478, 0xc0211c3a,
 	0x00000078, 0x80788478,
 	0xc0211c7a, 0x00000078,
+	0x80788478, 0xc0211eba,
+	0x00000078, 0x80788478,
+	0xc0211efa, 0x00000078,
 	0x80788478, 0xc0211a3a,
 	0x00000078, 0x80788478,
 	0xc0211a7a, 0x00000078,
 	0x80788478, 0xc0211cfa,
 	0x00000078, 0x80788478,
 	0xbf8cc07f, 0xbefc006f,
-	0xbefe007a, 0xbeff007b,
-	0x866f71ff, 0x000003ff,
-	0xb96f4803, 0x866f71ff,
+	0xbefe0070, 0xbeff0071,
+	0x866f7bff, 0x000003ff,
+	0xb96f4803, 0x866f7bff,
 	0xfffff800, 0x8f6f8b6f,
 	0xb96fa2c3, 0xb973f801,
 	0xb8ee2a05, 0x806e816e,
-	0x8e6e8a6e, 0xb8ef1605,
-	0x806f816f, 0x8e6f866f,
-	0x806e6f6e, 0x806e746e,
-	0x826f8075, 0x866fff6f,
-	0x0000ffff, 0xc0071cb7,
-	0x00000040, 0xc00b1d37,
-	0x00000048, 0xc0031e77,
-	0x00000058, 0xc0071eb7,
-	0x0000005c, 0xbf8cc07f,
-	0x866fff6d, 0xf0000000,
-	0x8f6f9c6f, 0x8e6f906f,
-	0xbeee0080, 0x876e6f6e,
-	0x866fff6d, 0x08000000,
-	0x8f6f9b6f, 0x8e6f8f6f,
-	0x876e6f6e, 0x866fff70,
-	0x00800000, 0x8f6f976f,
-	0xb96ef807, 0x866dff6d,
-	0x0000ffff, 0x86fe7e7e,
-	0x86ea6a6a, 0x8f6e8370,
-	0xb96ee0c2, 0xbf800002,
-	0xb9700002, 0xbf8a0000,
-	0x95806f6c, 0xbf810000,
+	0x8e6e8a6e, 0x8e6e816e,
+	0xb8ef1605, 0x806f816f,
+	0x8e6f866f, 0x806e6f6e,
+	0x806e746e, 0x826f8075,
+	0x866fff6f, 0x0000ffff,
+	0xc00b1c37, 0x00000050,
+	0xc00b1d37, 0x00000060,
+	0xc0031e77, 0x00000074,
+	0xbf8cc07f, 0x866fff6d,
+	0xf8000000, 0x8f6f9b6f,
+	0x8e6f906f, 0xbeee0080,
+	0x876e6f6e, 0x866fff6d,
+	0x04000000, 0x8f6f9a6f,
+	0x8e6f8f6f, 0x876e6f6e,
+	0x866fff7a, 0x00800000,
+	0x8f6f976f, 0xb96ef807,
+	0x866dff6d, 0x0000ffff,
+	0x86fe7e7e, 0x86ea6a6a,
+	0x8f6e837a, 0xb96ee0c2,
+	0xbf800002, 0xb97a0002,
+	0xbf8a0000, 0x95806f6c,
+	0xbf810000, 0x00000000,
 };
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
new file mode 100644
index 000000000000..4433bda2ce25
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
@@ -0,0 +1,967 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+var SQ_WAVE_STATUS_INST_ATC_SHIFT		= 23
+var SQ_WAVE_STATUS_INST_ATC_MASK		= 0x00800000
+var SQ_WAVE_STATUS_SPI_PRIO_MASK		= 0x00000006
+var SQ_WAVE_STATUS_HALT_MASK			= 0x2000
+
+var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT		= 12
+var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE		= 9
+var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT		= 8
+var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE		= 6
+var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT		= 24
+var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE		= 4
+var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT	= 24
+var SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE	= 4
+var SQ_WAVE_IB_STS2_WAVE64_SHIFT		= 11
+var SQ_WAVE_IB_STS2_WAVE64_SIZE			= 1
+
+var SQ_WAVE_TRAPSTS_SAVECTX_MASK		= 0x400
+var SQ_WAVE_TRAPSTS_EXCE_MASK			= 0x1FF
+var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT		= 10
+var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK		= 0x100
+var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT		= 8
+var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK		= 0x3FF
+var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT		= 0x0
+var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE		= 10
+var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK		= 0xFFFFF800
+var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT		= 11
+var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE		= 21
+var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK		= 0x800
+
+var SQ_WAVE_IB_STS_RCNT_SHIFT			= 16
+var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT		= 15
+var SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT		= 25
+var SQ_WAVE_IB_STS_REPLAY_W64H_SIZE		= 1
+var SQ_WAVE_IB_STS_REPLAY_W64H_MASK		= 0x02000000
+var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE		= 1
+var SQ_WAVE_IB_STS_RCNT_SIZE			= 6
+var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK	= 0x003F8000
+var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG	= 0x00007FFF
+
+var SQ_BUF_RSRC_WORD1_ATC_SHIFT			= 24
+var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT		= 27
+
+// bits [31:24] unused by SPI debug data
+var TTMP11_SAVE_REPLAY_W64H_SHIFT		= 31
+var TTMP11_SAVE_REPLAY_W64H_MASK		= 0x80000000
+var TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT		= 24
+var TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK		= 0x7F000000
+
+// SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14]
+// when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE
+var S_SAVE_BUF_RSRC_WORD1_STRIDE		= 0x00040000
+var S_SAVE_BUF_RSRC_WORD3_MISC			= 0x10807FAC
+
+var S_SAVE_SPI_INIT_ATC_MASK			= 0x08000000
+var S_SAVE_SPI_INIT_ATC_SHIFT			= 27
+var S_SAVE_SPI_INIT_MTYPE_MASK			= 0x70000000
+var S_SAVE_SPI_INIT_MTYPE_SHIFT			= 28
+var S_SAVE_SPI_INIT_FIRST_WAVE_MASK		= 0x04000000
+var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT		= 26
+
+var S_SAVE_PC_HI_RCNT_SHIFT			= 26
+var S_SAVE_PC_HI_RCNT_MASK			= 0xFC000000
+var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT		= 25
+var S_SAVE_PC_HI_FIRST_REPLAY_MASK		= 0x02000000
+var S_SAVE_PC_HI_REPLAY_W64H_SHIFT		= 24
+var S_SAVE_PC_HI_REPLAY_W64H_MASK		= 0x01000000
+
+var s_sgpr_save_num				= 108
+
+var s_save_spi_init_lo				= exec_lo
+var s_save_spi_init_hi				= exec_hi
+var s_save_pc_lo				= ttmp0
+var s_save_pc_hi				= ttmp1
+var s_save_exec_lo				= ttmp2
+var s_save_exec_hi				= ttmp3
+var s_save_status				= ttmp12
+var s_save_trapsts				= ttmp5
+var s_save_xnack_mask				= ttmp6
+var s_wave_size					= ttmp7
+var s_save_buf_rsrc0				= ttmp8
+var s_save_buf_rsrc1				= ttmp9
+var s_save_buf_rsrc2				= ttmp10
+var s_save_buf_rsrc3				= ttmp11
+var s_save_mem_offset				= ttmp14
+var s_save_alloc_size				= s_save_trapsts
+var s_save_tmp					= s_save_buf_rsrc2
+var s_save_m0					= ttmp15
+
+var S_RESTORE_BUF_RSRC_WORD1_STRIDE		= S_SAVE_BUF_RSRC_WORD1_STRIDE
+var S_RESTORE_BUF_RSRC_WORD3_MISC		= S_SAVE_BUF_RSRC_WORD3_MISC
+
+var S_RESTORE_SPI_INIT_ATC_MASK			= 0x08000000
+var S_RESTORE_SPI_INIT_ATC_SHIFT		= 27
+var S_RESTORE_SPI_INIT_MTYPE_MASK		= 0x70000000
+var S_RESTORE_SPI_INIT_MTYPE_SHIFT		= 28
+var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK		= 0x04000000
+var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT		= 26
+var S_WAVE_SIZE					= 25
+
+var S_RESTORE_PC_HI_RCNT_SHIFT			= S_SAVE_PC_HI_RCNT_SHIFT
+var S_RESTORE_PC_HI_RCNT_MASK			= S_SAVE_PC_HI_RCNT_MASK
+var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT		= S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
+var S_RESTORE_PC_HI_FIRST_REPLAY_MASK		= S_SAVE_PC_HI_FIRST_REPLAY_MASK
+
+var s_restore_spi_init_lo			= exec_lo
+var s_restore_spi_init_hi			= exec_hi
+var s_restore_mem_offset			= ttmp12
+var s_restore_alloc_size			= ttmp3
+var s_restore_tmp				= ttmp6
+var s_restore_mem_offset_save			= s_restore_tmp
+var s_restore_m0				= s_restore_alloc_size
+var s_restore_mode				= ttmp7
+var s_restore_flat_scratch			= ttmp2
+var s_restore_pc_lo				= ttmp0
+var s_restore_pc_hi				= ttmp1
+var s_restore_exec_lo				= ttmp14
+var s_restore_exec_hi				= ttmp15
+var s_restore_status				= ttmp4
+var s_restore_trapsts				= ttmp5
+var s_restore_xnack_mask			= ttmp13
+var s_restore_buf_rsrc0				= ttmp8
+var s_restore_buf_rsrc1				= ttmp9
+var s_restore_buf_rsrc2				= ttmp10
+var s_restore_buf_rsrc3				= ttmp11
+var s_restore_size				= ttmp7
+
+shader main
+	asic(DEFAULT)
+	type(CS)
+	wave_size(32)
+
+	s_branch	L_SKIP_RESTORE						//NOT restore. might be a regular trap or save
+
+L_JUMP_TO_RESTORE:
+	s_branch	L_RESTORE
+
+L_SKIP_RESTORE:
+	s_getreg_b32	s_save_status, hwreg(HW_REG_STATUS)			//save STATUS since we will change SCC
+	s_andn2_b32	s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK
+	s_getreg_b32	s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+	s_and_b32	ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK	//check whether this is for save
+	s_cbranch_scc1	L_SAVE
+
+	// If STATUS.MEM_VIOL is asserted then halt the wave to prevent
+	// the exception raising again and blocking context save.
+	s_and_b32	ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
+	s_cbranch_scc0	L_FETCH_2ND_TRAP
+	s_or_b32	s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK
+
+L_FETCH_2ND_TRAP:
+	// Preserve and clear scalar XNACK state before issuing scalar loads.
+	// Save IB_STS.REPLAY_W64H[25], RCNT[21:16], FIRST_REPLAY[15] into
+	// unused space ttmp11[31:24].
+	s_andn2_b32	ttmp11, ttmp11, (TTMP11_SAVE_REPLAY_W64H_MASK | TTMP11_SAVE_RCNT_FIRST_REPLAY_MASK)
+	s_getreg_b32	ttmp2, hwreg(HW_REG_IB_STS)
+	s_and_b32	ttmp3, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK
+	s_lshl_b32	ttmp3, ttmp3, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT)
+	s_or_b32	ttmp11, ttmp11, ttmp3
+	s_and_b32	ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
+	s_lshl_b32	ttmp3, ttmp3, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
+	s_or_b32	ttmp11, ttmp11, ttmp3
+	s_andn2_b32	ttmp2, ttmp2, (SQ_WAVE_IB_STS_REPLAY_W64H_MASK | SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK)
+	s_setreg_b32	hwreg(HW_REG_IB_STS), ttmp2
+
+	// Read second-level TBA/TMA from first-level TMA and jump if available.
+	// ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data)
+	// ttmp12 holds SQ_WAVE_STATUS
+	s_getreg_b32	ttmp14, hwreg(HW_REG_SHADER_TMA_LO)
+	s_getreg_b32	ttmp15, hwreg(HW_REG_SHADER_TMA_HI)
+	s_lshl_b64	[ttmp14, ttmp15], [ttmp14, ttmp15], 0x8
+	s_load_dwordx2	[ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 glc:1		// second-level TBA
+	s_waitcnt	lgkmcnt(0)
+	s_load_dwordx2	[ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 glc:1		// second-level TMA
+	s_waitcnt	lgkmcnt(0)
+	s_and_b64	[ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3]
+	s_cbranch_scc0	L_NO_NEXT_TRAP						// second-level trap handler not been set
+	s_setpc_b64	[ttmp2, ttmp3]						// jump to second-level trap handler
+
+L_NO_NEXT_TRAP:
+	s_getreg_b32	s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+	s_and_b32	s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK
+	s_cbranch_scc1	L_EXCP_CASE						// Exception, jump back to the shader program directly.
+	s_add_u32	ttmp0, ttmp0, 4						// S_TRAP case, add 4 to ttmp0
+	s_addc_u32	ttmp1, ttmp1, 0
+L_EXCP_CASE:
+	s_and_b32	ttmp1, ttmp1, 0xFFFF
+
+	// Restore SQ_WAVE_IB_STS.
+	s_lshr_b32	ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT)
+	s_and_b32	ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK
+	s_lshr_b32	ttmp2, ttmp11, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT)
+	s_and_b32	ttmp2, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK
+	s_or_b32	ttmp2, ttmp2, ttmp3
+	s_setreg_b32	hwreg(HW_REG_IB_STS), ttmp2
+
+	// Restore SQ_WAVE_STATUS.
+	s_and_b64	exec, exec, exec					// Restore STATUS.EXECZ, not writable by s_setreg_b32
+	s_and_b64	vcc, vcc, vcc						// Restore STATUS.VCCZ, not writable by s_setreg_b32
+	s_setreg_b32	hwreg(HW_REG_STATUS), s_save_status
+
+	s_rfe_b64	[ttmp0, ttmp1]
+
+L_SAVE:
+	//check whether there is mem_viol
+	s_getreg_b32	s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+	s_and_b32	s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
+	s_cbranch_scc0	L_NO_PC_REWIND
+
+	//if so, need rewind PC assuming GDS operation gets NACKed
+	s_mov_b32	s_save_tmp, 0
+	s_setreg_b32	hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp	//clear mem_viol bit
+	s_and_b32	s_save_pc_hi, s_save_pc_hi, 0x0000ffff			//pc[47:32]
+	s_sub_u32	s_save_pc_lo, s_save_pc_lo, 8				//pc[31:0]-8
+	s_subb_u32	s_save_pc_hi, s_save_pc_hi, 0x0
+
+L_NO_PC_REWIND:
+	s_mov_b32	s_save_tmp, 0
+	s_setreg_b32	hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp	//clear saveCtx bit
+
+	s_getreg_b32	s_save_xnack_mask, hwreg(HW_REG_SHADER_XNACK_MASK)
+	s_getreg_b32	s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE)
+	s_lshl_b32	s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT
+	s_or_b32	s_save_pc_hi, s_save_pc_hi, s_save_tmp
+	s_getreg_b32	s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE)
+	s_lshl_b32	s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
+	s_or_b32	s_save_pc_hi, s_save_pc_hi, s_save_tmp
+	s_getreg_b32	s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT, SQ_WAVE_IB_STS_REPLAY_W64H_SIZE)
+	s_lshl_b32	s_save_tmp, s_save_tmp, S_SAVE_PC_HI_REPLAY_W64H_SHIFT
+	s_or_b32	s_save_pc_hi, s_save_pc_hi, s_save_tmp
+	s_getreg_b32	s_save_tmp, hwreg(HW_REG_IB_STS)			//clear RCNT and FIRST_REPLAY and REPLAY_W64H in IB_STS
+	s_and_b32	s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG
+
+	s_setreg_b32	hwreg(HW_REG_IB_STS), s_save_tmp
+
+	/* inform SPI the readiness and wait for SPI's go signal */
+	s_mov_b32	s_save_exec_lo, exec_lo					//save EXEC and use EXEC for the go signal from SPI
+	s_mov_b32	s_save_exec_hi, exec_hi
+	s_mov_b64	exec, 0x0						//clear EXEC to get ready to receive
+
+	s_sendmsg	sendmsg(MSG_SAVEWAVE)					//send SPI a message and wait for SPI's write to EXEC
+
+L_SLEEP:
+	// sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause
+	// SQ hang, since the 7,8th wave could not get arbit to exec inst, while
+	// other waves are stuck into the sleep-loop and waiting for wrexec!=0
+	s_sleep		0x2
+	s_cbranch_execz	L_SLEEP
+
+	/* setup Resource Contants */
+	s_mov_b32	s_save_buf_rsrc0, s_save_spi_init_lo			//base_addr_lo
+	s_and_b32	s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF	//base_addr_hi
+	s_or_b32	s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE
+	s_mov_b32	s_save_buf_rsrc2, 0					//NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited
+	s_mov_b32	s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC
+	s_and_b32	s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK
+	s_lshr_b32	s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT)
+	s_or_b32	s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp		//or ATC
+	s_and_b32	s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK
+	s_lshr_b32	s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT)
+	s_or_b32	s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp		//or MTYPE
+
+	s_mov_b32	s_save_m0, m0
+
+	/* global mem offset */
+	s_mov_b32	s_save_mem_offset, 0x0
+	s_getreg_b32	s_wave_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE)
+	s_lshl_b32	s_wave_size, s_wave_size, S_WAVE_SIZE
+	s_or_b32	s_wave_size, s_save_spi_init_hi, s_wave_size		//share s_wave_size with exec_hi, it's at bit25
+
+	/* save HW registers */
+
+L_SAVE_HWREG:
+	// HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
+	get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
+	get_svgpr_size_bytes(s_save_tmp)
+	s_add_u32	s_save_mem_offset, s_save_mem_offset, s_save_tmp
+	s_add_u32	s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()
+
+	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes
+
+	write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
+	write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset)
+	write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset)
+	write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset)
+	write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset)
+	write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset)
+
+	s_getreg_b32	s_save_trapsts, hwreg(HW_REG_TRAPSTS)
+	write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset)
+	write_hwreg_to_mem(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset)
+
+	s_getreg_b32	s_save_m0, hwreg(HW_REG_MODE)
+	write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
+
+	s_getreg_b32	s_save_m0, hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO)
+	write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
+
+	s_getreg_b32	s_save_m0, hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI)
+	write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
+
+	/* the first wave in the threadgroup */
+	s_and_b32	s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK
+	s_mov_b32	s_save_exec_hi, 0x0
+	s_or_b32	s_save_exec_hi, s_save_tmp, s_save_exec_hi		// save first wave bit to s_save_exec_hi.bits[26]
+
+	/* save SGPRs */
+	// Save SGPR before LDS save, then the s0 to s4 can be used during LDS save...
+
+	// SGPR SR memory offset : size(VGPR)+size(SVGPR)
+	get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
+	get_svgpr_size_bytes(s_save_tmp)
+	s_add_u32	s_save_mem_offset, s_save_mem_offset, s_save_tmp
+	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes
+
+	// backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0
+	s_mov_b32	s_save_xnack_mask, s_save_buf_rsrc0
+	s_add_u32	s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset
+	s_addc_u32	s_save_buf_rsrc1, s_save_buf_rsrc1, 0
+
+	s_mov_b32	m0, 0x0							//SGPR initial index value =0
+	s_nop		0x0							//Manually inserted wait states
+L_SAVE_SGPR_LOOP:
+	// SGPR is allocated in 16 SGPR granularity
+	s_movrels_b64	s0, s0							//s0 = s[0+m0], s1 = s[1+m0]
+	s_movrels_b64	s2, s2							//s2 = s[2+m0], s3 = s[3+m0]
+	s_movrels_b64	s4, s4							//s4 = s[4+m0], s5 = s[5+m0]
+	s_movrels_b64	s6, s6							//s6 = s[6+m0], s7 = s[7+m0]
+	s_movrels_b64	s8, s8							//s8 = s[8+m0], s9 = s[9+m0]
+	s_movrels_b64	s10, s10						//s10 = s[10+m0], s11 = s[11+m0]
+	s_movrels_b64	s12, s12						//s12 = s[12+m0], s13 = s[13+m0]
+	s_movrels_b64	s14, s14						//s14 = s[14+m0], s15 = s[15+m0]
+
+	write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset)
+	s_add_u32	m0, m0, 16						//next sgpr index
+	s_cmp_lt_u32	m0, 96							//scc = (m0 < first 96 SGPR) ? 1 : 0
+	s_cbranch_scc1	L_SAVE_SGPR_LOOP					//first 96 SGPR save is complete?
+
+	//save the rest 12 SGPR
+	s_movrels_b64	s0, s0							//s0 = s[0+m0], s1 = s[1+m0]
+	s_movrels_b64	s2, s2							//s2 = s[2+m0], s3 = s[3+m0]
+	s_movrels_b64	s4, s4							//s4 = s[4+m0], s5 = s[5+m0]
+	s_movrels_b64	s6, s6							//s6 = s[6+m0], s7 = s[7+m0]
+	s_movrels_b64	s8, s8							//s8 = s[8+m0], s9 = s[9+m0]
+	s_movrels_b64	s10, s10						//s10 = s[10+m0], s11 = s[11+m0]
+	write_12sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset)
+
+	// restore s_save_buf_rsrc0,1
+	s_mov_b32	s_save_buf_rsrc0, s_save_xnack_mask
+
+	/* save first 4 VGPR, then LDS save could use   */
+	// each wave will alloc 4 vgprs at least...
+
+	s_mov_b32	s_save_mem_offset, 0
+ 	s_mov_b32	exec_lo, 0xFFFFFFFF					//need every thread from now on
+	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
+	s_and_b32	m0, m0, 1
+	s_cmp_eq_u32	m0, 1
+	s_cbranch_scc1	L_ENABLE_SAVE_4VGPR_EXEC_HI
+	s_mov_b32	exec_hi, 0x00000000
+	s_branch	L_SAVE_4VGPR_WAVE32
+L_ENABLE_SAVE_4VGPR_EXEC_HI:
+	s_mov_b32	exec_hi, 0xFFFFFFFF
+	s_branch	L_SAVE_4VGPR_WAVE64
+L_SAVE_4VGPR_WAVE32:
+	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes
+
+	// VGPR Allocated in 4-GPR granularity
+
+	buffer_store_dword	v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128
+	buffer_store_dword	v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2
+	buffer_store_dword	v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3
+	s_branch	L_SAVE_LDS
+
+L_SAVE_4VGPR_WAVE64:
+	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes
+
+	// VGPR Allocated in 4-GPR granularity
+
+	buffer_store_dword	v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
+	buffer_store_dword	v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
+	buffer_store_dword	v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
+
+	/* save LDS */
+
+L_SAVE_LDS:
+	// Change EXEC to all threads...
+	s_mov_b32	exec_lo, 0xFFFFFFFF					//need every thread from now on
+	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
+	s_and_b32	m0, m0, 1
+	s_cmp_eq_u32	m0, 1
+	s_cbranch_scc1	L_ENABLE_SAVE_LDS_EXEC_HI
+	s_mov_b32	exec_hi, 0x00000000
+	s_branch	L_SAVE_LDS_NORMAL
+L_ENABLE_SAVE_LDS_EXEC_HI:
+	s_mov_b32	exec_hi, 0xFFFFFFFF
+L_SAVE_LDS_NORMAL:
+	s_getreg_b32	s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
+	s_and_b32	s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF	//lds_size is zero?
+	s_cbranch_scc0	L_SAVE_LDS_DONE						//no lds used? jump to L_SAVE_DONE
+
+	s_barrier								//LDS is used? wait for other waves in the same TG
+	s_and_b32	s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK
+	s_cbranch_scc0	L_SAVE_LDS_DONE
+
+	// first wave do LDS save;
+
+	s_lshl_b32	s_save_alloc_size, s_save_alloc_size, 6			//LDS size in dwords = lds_size * 64dw
+	s_lshl_b32	s_save_alloc_size, s_save_alloc_size, 2			//LDS size in bytes
+	s_mov_b32	s_save_buf_rsrc2, s_save_alloc_size			//NUM_RECORDS in bytes
+
+	// LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
+	//
+	get_vgpr_size_bytes(s_save_mem_offset, s_wave_size)
+	get_svgpr_size_bytes(s_save_tmp)
+	s_add_u32	s_save_mem_offset, s_save_mem_offset, s_save_tmp
+	s_add_u32	s_save_mem_offset, s_save_mem_offset, get_sgpr_size_bytes()
+	s_add_u32	s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()
+
+	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes
+
+	//load 0~63*4(byte address) to vgpr v0
+	v_mbcnt_lo_u32_b32	v0, -1, 0
+	v_mbcnt_hi_u32_b32	v0, -1, v0
+	v_mul_u32_u24	v0, 4, v0
+
+	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
+	s_and_b32	m0, m0, 1
+	s_cmp_eq_u32	m0, 1
+	s_mov_b32	m0, 0x0
+	s_cbranch_scc1	L_SAVE_LDS_W64
+
+L_SAVE_LDS_W32:
+	s_mov_b32	s3, 128
+	s_nop		0
+	s_nop		0
+	s_nop		0
+L_SAVE_LDS_LOOP_W32:
+	ds_read_b32	v1, v0
+	s_waitcnt	0
+	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+
+	s_add_u32	m0, m0, s3						//every buffer_store_lds does 256 bytes
+	s_add_u32	s_save_mem_offset, s_save_mem_offset, s3
+	v_add_nc_u32	v0, v0, 128						//mem offset increased by 128 bytes
+	s_cmp_lt_u32	m0, s_save_alloc_size					//scc=(m0 < s_save_alloc_size) ? 1 : 0
+	s_cbranch_scc1	L_SAVE_LDS_LOOP_W32					//LDS save is complete?
+
+	s_branch	L_SAVE_LDS_DONE
+
+L_SAVE_LDS_W64:
+	s_mov_b32	s3, 256
+	s_nop		0
+	s_nop		0
+	s_nop		0
+L_SAVE_LDS_LOOP_W64:
+	ds_read_b32	v1, v0
+	s_waitcnt	0
+	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+
+	s_add_u32	m0, m0, s3						//every buffer_store_lds does 256 bytes
+	s_add_u32	s_save_mem_offset, s_save_mem_offset, s3
+	v_add_nc_u32	v0, v0, 256						//mem offset increased by 256 bytes
+	s_cmp_lt_u32	m0, s_save_alloc_size					//scc=(m0 < s_save_alloc_size) ? 1 : 0
+	s_cbranch_scc1	L_SAVE_LDS_LOOP_W64					//LDS save is complete?
+
+L_SAVE_LDS_DONE:
+	/* save VGPRs  - set the Rest VGPRs */
+L_SAVE_VGPR:
+	// VGPR SR memory offset: 0
+	s_mov_b32	exec_lo, 0xFFFFFFFF					//need every thread from now on
+	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
+	s_and_b32	m0, m0, 1
+	s_cmp_eq_u32	m0, 1
+	s_cbranch_scc1	L_ENABLE_SAVE_VGPR_EXEC_HI
+	s_mov_b32	s_save_mem_offset, (0+128*4)				// for the rest VGPRs
+	s_mov_b32	exec_hi, 0x00000000
+	s_branch	L_SAVE_VGPR_NORMAL
+L_ENABLE_SAVE_VGPR_EXEC_HI:
+	s_mov_b32	s_save_mem_offset, (0+256*4)				// for the rest VGPRs
+	s_mov_b32	exec_hi, 0xFFFFFFFF
+L_SAVE_VGPR_NORMAL:
+	s_getreg_b32	s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
+	s_add_u32	s_save_alloc_size, s_save_alloc_size, 1
+	s_lshl_b32	s_save_alloc_size, s_save_alloc_size, 2			//Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)
+	//determine it is wave32 or wave64
+	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
+	s_and_b32	m0, m0, 1
+	s_cmp_eq_u32	m0, 1
+	s_cbranch_scc1	L_SAVE_VGPR_WAVE64
+
+	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes
+
+	// VGPR Allocated in 4-GPR granularity
+
+	// VGPR store using dw burst
+	s_mov_b32	m0, 0x4							//VGPR initial index value =4
+	s_cmp_lt_u32	m0, s_save_alloc_size
+	s_cbranch_scc0	L_SAVE_VGPR_END
+
+L_SAVE_VGPR_W32_LOOP:
+	v_movrels_b32	v0, v0							//v0 = v[0+m0]
+	v_movrels_b32	v1, v1							//v1 = v[1+m0]
+	v_movrels_b32	v2, v2							//v2 = v[2+m0]
+	v_movrels_b32	v3, v3							//v3 = v[3+m0]
+
+	buffer_store_dword	v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128
+	buffer_store_dword	v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2
+	buffer_store_dword	v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3
+
+	s_add_u32	m0, m0, 4						//next vgpr index
+	s_add_u32	s_save_mem_offset, s_save_mem_offset, 128*4		//every buffer_store_dword does 128 bytes
+	s_cmp_lt_u32	m0, s_save_alloc_size					//scc = (m0 < s_save_alloc_size) ? 1 : 0
+	s_cbranch_scc1	L_SAVE_VGPR_W32_LOOP					//VGPR save is complete?
+
+	s_branch	L_SAVE_VGPR_END
+
+L_SAVE_VGPR_WAVE64:
+	s_mov_b32	s_save_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes
+
+	// VGPR store using dw burst
+	s_mov_b32	m0, 0x4							//VGPR initial index value =4
+	s_cmp_lt_u32	m0, s_save_alloc_size
+	s_cbranch_scc0	L_SAVE_VGPR_END
+
+L_SAVE_VGPR_W64_LOOP:
+	v_movrels_b32	v0, v0							//v0 = v[0+m0]
+	v_movrels_b32	v1, v1							//v1 = v[1+m0]
+	v_movrels_b32	v2, v2							//v2 = v[2+m0]
+	v_movrels_b32	v3, v3							//v3 = v[3+m0]
+
+	buffer_store_dword	v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+	buffer_store_dword	v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
+	buffer_store_dword	v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
+	buffer_store_dword	v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
+
+	s_add_u32	m0, m0, 4						//next vgpr index
+	s_add_u32	s_save_mem_offset, s_save_mem_offset, 256*4		//every buffer_store_dword does 256 bytes
+	s_cmp_lt_u32	m0, s_save_alloc_size					//scc = (m0 < s_save_alloc_size) ? 1 : 0
+	s_cbranch_scc1	L_SAVE_VGPR_W64_LOOP					//VGPR save is complete?
+
+	//Below part will be the save shared vgpr part (new for gfx10)
+	s_getreg_b32	s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
+	s_and_b32	s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF	//shared_vgpr_size is zero?
+	s_cbranch_scc0	L_SAVE_VGPR_END						//no shared_vgpr used? jump to L_SAVE_LDS
+	s_lshl_b32	s_save_alloc_size, s_save_alloc_size, 3			//Number of SHARED_VGPRs = shared_vgpr_size * 8    (non-zero value)
+	//m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count.
+	//save shared_vgpr will start from the index of m0
+	s_add_u32	s_save_alloc_size, s_save_alloc_size, m0
+	s_mov_b32	exec_lo, 0xFFFFFFFF
+	s_mov_b32	exec_hi, 0x00000000
+L_SAVE_SHARED_VGPR_WAVE64_LOOP:
+	v_movrels_b32	v0, v0							//v0 = v[0+m0]
+	buffer_store_dword	v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+	s_add_u32	m0, m0, 1						//next vgpr index
+	s_add_u32	s_save_mem_offset, s_save_mem_offset, 128
+	s_cmp_lt_u32	m0, s_save_alloc_size					//scc = (m0 < s_save_alloc_size) ? 1 : 0
+	s_cbranch_scc1	L_SAVE_SHARED_VGPR_WAVE64_LOOP				//SHARED_VGPR save is complete?
+
+L_SAVE_VGPR_END:
+	s_branch	L_END_PGM
+
+L_RESTORE:
+	/* Setup Resource Contants */
+	s_mov_b32	s_restore_buf_rsrc0, s_restore_spi_init_lo		//base_addr_lo
+	s_and_b32	s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF	//base_addr_hi
+	s_or_b32	s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
+	s_mov_b32	s_restore_buf_rsrc2, 0					//NUM_RECORDS initial value = 0 (in bytes)
+	s_mov_b32	s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
+	s_and_b32	s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK
+	s_lshr_b32	s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT)
+	s_or_b32	s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp	//or ATC
+	s_and_b32	s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK
+	s_lshr_b32	s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT)
+	s_or_b32	s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp	//or MTYPE
+	//determine it is wave32 or wave64
+	s_getreg_b32	s_restore_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE)
+	s_lshl_b32	s_restore_size, s_restore_size, S_WAVE_SIZE
+	s_or_b32	s_restore_size, s_restore_spi_init_hi, s_restore_size
+
+	s_and_b32	s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
+	s_cbranch_scc0	L_RESTORE_VGPR
+
+	/* restore LDS */
+L_RESTORE_LDS:
+	s_mov_b32	exec_lo, 0xFFFFFFFF					//need every thread from now on
+	s_lshr_b32	m0, s_restore_size, S_WAVE_SIZE
+	s_and_b32	m0, m0, 1
+	s_cmp_eq_u32	m0, 1
+	s_cbranch_scc1	L_ENABLE_RESTORE_LDS_EXEC_HI
+	s_mov_b32	exec_hi, 0x00000000
+	s_branch	L_RESTORE_LDS_NORMAL
+L_ENABLE_RESTORE_LDS_EXEC_HI:
+	s_mov_b32	exec_hi, 0xFFFFFFFF
+L_RESTORE_LDS_NORMAL:
+	s_getreg_b32	s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
+	s_and_b32	s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF	//lds_size is zero?
+	s_cbranch_scc0	L_RESTORE_VGPR						//no lds used? jump to L_RESTORE_VGPR
+	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, 6		//LDS size in dwords = lds_size * 64dw
+	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, 2		//LDS size in bytes
+	s_mov_b32	s_restore_buf_rsrc2, s_restore_alloc_size		//NUM_RECORDS in bytes
+
+	// LDS at offset: size(VGPR)+size(SVGPR)+SIZE(SGPR)+SIZE(HWREG)
+	//
+	get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
+	get_svgpr_size_bytes(s_restore_tmp)
+	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
+	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
+	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes()
+
+	s_mov_b32	s_restore_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes
+
+	s_lshr_b32	m0, s_wave_size, S_WAVE_SIZE
+	s_and_b32	m0, m0, 1
+	s_cmp_eq_u32	m0, 1
+	s_mov_b32	m0, 0x0
+	s_cbranch_scc1	L_RESTORE_LDS_LOOP_W64
+
+L_RESTORE_LDS_LOOP_W32:
+	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1	// first 64DW
+	s_add_u32	m0, m0, 128						// 128 DW
+	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128		//mem offset increased by 128DW
+	s_cmp_lt_u32	m0, s_restore_alloc_size				//scc=(m0 < s_restore_alloc_size) ? 1 : 0
+	s_cbranch_scc1	L_RESTORE_LDS_LOOP_W32					//LDS restore is complete?
+	s_branch	L_RESTORE_VGPR
+
+L_RESTORE_LDS_LOOP_W64:
+	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1	// first 64DW
+	s_add_u32	m0, m0, 256						// 256 DW
+	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 256		//mem offset increased by 256DW
+	s_cmp_lt_u32	m0, s_restore_alloc_size				//scc=(m0 < s_restore_alloc_size) ? 1 : 0
+	s_cbranch_scc1	L_RESTORE_LDS_LOOP_W64					//LDS restore is complete?
+
+	/* restore VGPRs */
+L_RESTORE_VGPR:
+	// VGPR SR memory offset : 0
+	s_mov_b32	s_restore_mem_offset, 0x0
+ 	s_mov_b32	exec_lo, 0xFFFFFFFF					//need every thread from now on
+	s_lshr_b32	m0, s_restore_size, S_WAVE_SIZE
+	s_and_b32	m0, m0, 1
+	s_cmp_eq_u32	m0, 1
+	s_cbranch_scc1	L_ENABLE_RESTORE_VGPR_EXEC_HI
+	s_mov_b32	exec_hi, 0x00000000
+	s_branch	L_RESTORE_VGPR_NORMAL
+L_ENABLE_RESTORE_VGPR_EXEC_HI:
+	s_mov_b32	exec_hi, 0xFFFFFFFF
+L_RESTORE_VGPR_NORMAL:
+	s_getreg_b32	s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
+	s_add_u32	s_restore_alloc_size, s_restore_alloc_size, 1
+	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, 2		//Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)
+	//determine it is wave32 or wave64
+	s_lshr_b32	m0, s_restore_size, S_WAVE_SIZE
+	s_and_b32	m0, m0, 1
+	s_cmp_eq_u32	m0, 1
+	s_cbranch_scc1	L_RESTORE_VGPR_WAVE64
+
+	s_mov_b32	s_restore_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes
+
+	// VGPR load using dw burst
+	s_mov_b32	s_restore_mem_offset_save, s_restore_mem_offset		// restore start with v1, v0 will be the last
+	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128*4
+	s_mov_b32	m0, 4							//VGPR initial index value = 4
+
+L_RESTORE_VGPR_WAVE32_LOOP:
+	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
+	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128
+	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128*2
+	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:128*3
+	s_waitcnt	vmcnt(0)
+	v_movreld_b32	v0, v0							//v[0+m0] = v0
+	v_movreld_b32	v1, v1
+	v_movreld_b32	v2, v2
+	v_movreld_b32	v3, v3
+	s_add_u32	m0, m0, 4						//next vgpr index
+	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128*4	//every buffer_load_dword does 128 bytes
+	s_cmp_lt_u32	m0, s_restore_alloc_size				//scc = (m0 < s_restore_alloc_size) ? 1 : 0
+	s_cbranch_scc1	L_RESTORE_VGPR_WAVE32_LOOP				//VGPR restore (except v0) is complete?
+
+	/* VGPR restore on v0 */
+	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
+	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128
+	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*2
+	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:128*3
+
+	s_branch	L_RESTORE_SGPR
+
+L_RESTORE_VGPR_WAVE64:
+	s_mov_b32	s_restore_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes
+
+	// VGPR load using dw burst
+	s_mov_b32	s_restore_mem_offset_save, s_restore_mem_offset		// restore start with v4, v0 will be the last
+	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 256*4
+	s_mov_b32	m0, 4							//VGPR initial index value = 4
+
+L_RESTORE_VGPR_WAVE64_LOOP:
+	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
+	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256
+	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2
+	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3
+	s_waitcnt	vmcnt(0)
+	v_movreld_b32	v0, v0							//v[0+m0] = v0
+	v_movreld_b32	v1, v1
+	v_movreld_b32	v2, v2
+	v_movreld_b32	v3, v3
+	s_add_u32	m0, m0, 4						//next vgpr index
+	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 256*4	//every buffer_load_dword does 256 bytes
+	s_cmp_lt_u32	m0, s_restore_alloc_size				//scc = (m0 < s_restore_alloc_size) ? 1 : 0
+	s_cbranch_scc1	L_RESTORE_VGPR_WAVE64_LOOP				//VGPR restore (except v0) is complete?
+
+	//Below part will be the restore shared vgpr part (new for gfx10)
+	s_getreg_b32	s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)	//shared_vgpr_size
+	s_and_b32	s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF	//shared_vgpr_size is zero?
+	s_cbranch_scc0	L_RESTORE_V0						//no shared_vgpr used?
+	s_lshl_b32	s_restore_alloc_size, s_restore_alloc_size, 3		//Number of SHARED_VGPRs = shared_vgpr_size * 8    (non-zero value)
+	//m0 now has the value of normal vgpr count, just add the m0 with shared_vgpr count to get the total count.
+	//restore shared_vgpr will start from the index of m0
+	s_add_u32	s_restore_alloc_size, s_restore_alloc_size, m0
+	s_mov_b32	exec_lo, 0xFFFFFFFF
+	s_mov_b32	exec_hi, 0x00000000
+L_RESTORE_SHARED_VGPR_WAVE64_LOOP:
+	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
+	s_waitcnt	vmcnt(0)
+	v_movreld_b32	v0, v0							//v[0+m0] = v0
+	s_add_u32	m0, m0, 1						//next vgpr index
+	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, 128
+	s_cmp_lt_u32	m0, s_restore_alloc_size				//scc = (m0 < s_restore_alloc_size) ? 1 : 0
+	s_cbranch_scc1	L_RESTORE_SHARED_VGPR_WAVE64_LOOP			//VGPR restore (except v0) is complete?
+
+	s_mov_b32	exec_hi, 0xFFFFFFFF					//restore back exec_hi before restoring V0!!
+
+	/* VGPR restore on v0 */
+L_RESTORE_V0:
+	buffer_load_dword	v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
+	buffer_load_dword	v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256
+	buffer_load_dword	v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2
+	buffer_load_dword	v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3
+	s_waitcnt	vmcnt(0)
+
+	/* restore SGPRs */
+	//will be 2+8+16*6
+	// SGPR SR memory offset : size(VGPR)+size(SVGPR)
+L_RESTORE_SGPR:
+	get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
+	get_svgpr_size_bytes(s_restore_tmp)
+	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
+	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
+	s_sub_u32	s_restore_mem_offset, s_restore_mem_offset, 20*4	//s108~s127 is not saved
+
+	s_mov_b32	s_restore_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes
+
+	s_mov_b32	m0, s_sgpr_save_num
+
+	read_4sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
+	s_waitcnt	lgkmcnt(0)
+
+	s_sub_u32	m0, m0, 4						// Restore from S[0] to S[104]
+	s_nop		0							// hazard SALU M0=> S_MOVREL
+
+	s_movreld_b64	s0, s0							//s[0+m0] = s0
+	s_movreld_b64	s2, s2
+
+	read_8sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
+	s_waitcnt	lgkmcnt(0)
+
+	s_sub_u32	m0, m0, 8						// Restore from S[0] to S[96]
+	s_nop		0							// hazard SALU M0=> S_MOVREL
+
+	s_movreld_b64	s0, s0							//s[0+m0] = s0
+	s_movreld_b64	s2, s2
+	s_movreld_b64	s4, s4
+	s_movreld_b64	s6, s6
+
+ L_RESTORE_SGPR_LOOP:
+	read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset)
+	s_waitcnt	lgkmcnt(0)
+
+	s_sub_u32	m0, m0, 16						// Restore from S[n] to S[0]
+	s_nop		0							// hazard SALU M0=> S_MOVREL
+
+	s_movreld_b64	s0, s0							//s[0+m0] = s0
+	s_movreld_b64	s2, s2
+	s_movreld_b64	s4, s4
+	s_movreld_b64	s6, s6
+	s_movreld_b64	s8, s8
+	s_movreld_b64	s10, s10
+	s_movreld_b64	s12, s12
+	s_movreld_b64	s14, s14
+
+	s_cmp_eq_u32	m0, 0							//scc = (m0 < s_sgpr_save_num) ? 1 : 0
+	s_cbranch_scc0	L_RESTORE_SGPR_LOOP
+
+	/* restore HW registers */
+L_RESTORE_HWREG:
+	// HWREG SR memory offset : size(VGPR)+size(SVGPR)+size(SGPR)
+	get_vgpr_size_bytes(s_restore_mem_offset, s_restore_size)
+	get_svgpr_size_bytes(s_restore_tmp)
+	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
+	s_add_u32	s_restore_mem_offset, s_restore_mem_offset, get_sgpr_size_bytes()
+
+	s_mov_b32	s_restore_buf_rsrc2, 0x1000000				//NUM_RECORDS in bytes
+
+	read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset)
+	read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
+	read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
+	read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset)
+	read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset)
+	read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset)
+	read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset)
+	read_hwreg_from_mem(s_restore_xnack_mask, s_restore_buf_rsrc0, s_restore_mem_offset)
+	read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset)
+	read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)
+	s_waitcnt	lgkmcnt(0)
+
+	s_setreg_b32	hwreg(HW_REG_SHADER_FLAT_SCRATCH_LO), s_restore_flat_scratch
+
+	read_hwreg_from_mem(s_restore_flat_scratch, s_restore_buf_rsrc0, s_restore_mem_offset)
+	s_waitcnt	lgkmcnt(0)						//from now on, it is safe to restore STATUS and IB_STS
+
+	s_setreg_b32	hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI), s_restore_flat_scratch
+
+	s_mov_b32	s_restore_tmp, s_restore_pc_hi
+	s_and_b32	s_restore_pc_hi, s_restore_tmp, 0x0000ffff		//pc[47:32] //Do it here in order not to affect STATUS
+
+	s_mov_b32	m0, s_restore_m0
+	s_mov_b32	exec_lo, s_restore_exec_lo
+	s_mov_b32	exec_hi, s_restore_exec_hi
+
+	s_and_b32	s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts
+	s_setreg_b32	hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0
+	s_setreg_b32	hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask
+	s_and_b32	s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts
+	s_lshr_b32	s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT
+	s_setreg_b32	hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0
+	s_setreg_b32	hwreg(HW_REG_MODE), s_restore_mode
+	s_and_b32	s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_RCNT_MASK
+	s_lshr_b32	s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT
+	s_lshl_b32	s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT
+	s_mov_b32	s_restore_mode, 0x0
+	s_or_b32	s_restore_mode, s_restore_mode, s_restore_m0
+	s_and_b32	s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_FIRST_REPLAY_MASK
+	s_lshr_b32	s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT
+	s_lshl_b32	s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT
+	s_or_b32	s_restore_mode, s_restore_mode, s_restore_m0
+	s_and_b32	s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_REPLAY_W64H_MASK
+	s_lshr_b32	s_restore_m0, s_restore_m0, S_SAVE_PC_HI_REPLAY_W64H_SHIFT
+	s_lshl_b32	s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT
+	s_or_b32	s_restore_mode, s_restore_mode, s_restore_m0
+
+	s_and_b32	s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK
+	s_lshr_b32	s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT
+	s_setreg_b32 	hwreg(HW_REG_IB_STS), s_restore_mode
+
+	s_and_b64	exec, exec, exec					// Restore STATUS.EXECZ, not writable by s_setreg_b32
+	s_and_b64	vcc, vcc, vcc						// Restore STATUS.VCCZ, not writable by s_setreg_b32
+	s_setreg_b32	hwreg(HW_REG_STATUS), s_restore_status			// SCC is included, which is changed by previous salu
+
+	s_barrier								//barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG
+
+	s_rfe_b64	s_restore_pc_lo						//Return to the main shader program and resume execution
+
+L_END_PGM:
+	s_endpgm
+end
+
+function write_hwreg_to_mem(s, s_rsrc, s_mem_offset)
+	s_mov_b32	exec_lo, m0
+	s_mov_b32	m0, s_mem_offset
+	s_buffer_store_dword	s, s_rsrc, m0 glc:1
+	s_add_u32	s_mem_offset, s_mem_offset, 4
+	s_mov_b32	m0, exec_lo
+end
+
+
+function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset)
+	s_buffer_store_dwordx4	s[0], s_rsrc, 0 glc:1
+	s_buffer_store_dwordx4	s[4], s_rsrc, 16 glc:1
+	s_buffer_store_dwordx4	s[8], s_rsrc, 32 glc:1
+	s_buffer_store_dwordx4	s[12], s_rsrc, 48 glc:1
+	s_add_u32	s_rsrc[0], s_rsrc[0], 4*16
+	s_addc_u32	s_rsrc[1], s_rsrc[1], 0x0
+end
+
+function write_12sgpr_to_mem(s, s_rsrc, s_mem_offset)
+	s_buffer_store_dwordx4	s[0], s_rsrc, 0 glc:1
+	s_buffer_store_dwordx4	s[4], s_rsrc, 16 glc:1
+	s_buffer_store_dwordx4	s[8], s_rsrc, 32 glc:1
+	s_add_u32	s_rsrc[0], s_rsrc[0], 4*12
+	s_addc_u32	s_rsrc[1], s_rsrc[1], 0x0
+end
+
+
+function read_hwreg_from_mem(s, s_rsrc, s_mem_offset)
+	s_buffer_load_dword	s, s_rsrc, s_mem_offset glc:1
+	s_add_u32	s_mem_offset, s_mem_offset, 4
+end
+
+function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
+	s_sub_u32	s_mem_offset, s_mem_offset, 4*16
+	s_buffer_load_dwordx16	s, s_rsrc, s_mem_offset glc:1
+end
+
+function read_8sgpr_from_mem(s, s_rsrc, s_mem_offset)
+	s_sub_u32	s_mem_offset, s_mem_offset, 4*8
+	s_buffer_load_dwordx8	s, s_rsrc, s_mem_offset glc:1
+end
+
+function read_4sgpr_from_mem(s, s_rsrc, s_mem_offset)
+	s_sub_u32	s_mem_offset, s_mem_offset, 4*4
+	s_buffer_load_dwordx4	s, s_rsrc, s_mem_offset glc:1
+end
+
+
+function get_lds_size_bytes(s_lds_size_byte)
+	s_getreg_b32	s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE)
+	s_lshl_b32	s_lds_size_byte, s_lds_size_byte, 8			//LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW
+end
+
+function get_vgpr_size_bytes(s_vgpr_size_byte, s_size)
+	s_getreg_b32	s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)
+	s_add_u32	s_vgpr_size_byte, s_vgpr_size_byte, 1
+	s_lshr_b32	m0, s_size, S_WAVE_SIZE
+	s_and_b32	m0, m0, 1
+	s_cmp_eq_u32	m0, 1
+	s_cbranch_scc1	L_ENABLE_SHIFT_W64
+	s_lshl_b32	s_vgpr_size_byte, s_vgpr_size_byte, (2+7)		//Number of VGPRs = (vgpr_size + 1) * 4 * 32 * 4   (non-zero value)
+	s_branch	L_SHIFT_DONE
+L_ENABLE_SHIFT_W64:
+	s_lshl_b32	s_vgpr_size_byte, s_vgpr_size_byte, (2+8)		//Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4   (non-zero value)
+L_SHIFT_DONE:
+end
+
+function get_svgpr_size_bytes(s_svgpr_size_byte)
+	s_getreg_b32	s_svgpr_size_byte, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_VGPR_SHARED_SIZE_SIZE)
+	s_lshl_b32	s_svgpr_size_byte, s_svgpr_size_byte, (3+7)
+end
+
+function get_sgpr_size_bytes
+	return 512
+end
+
+function get_hwreg_size_bytes
+	return 128
+end
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm
index abe1a5da29fb..b195b7cd8a17 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm
@@ -24,78 +24,6 @@
  * PROJECT=vi ./sp3 cwsr_trap_handler_gfx8.asm -hex tmp.hex
  */
 
-/* HW (VI) source code for CWSR trap handler */
-/* Version 18 + multiple trap handler */
-
-// this performance-optimal version was originally from Seven Xu at SRDC
-
-// Revison #18   --...
-/* Rev History
-** #1. Branch from gc dv.   //gfxip/gfx8/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV)
-** #4. SR Memory Layout:
-**             1. VGPR-SGPR-HWREG-{LDS}
-**             2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern..
-** #5. Update: 1. Accurate g8sr_ts_save_d timestamp
-** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation)
-** #7. Update: 1. don't barrier if noLDS
-** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version
-**             2. Fix SQ issue by s_sleep 2
-** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last
-**             2. optimize s_buffer save by burst 16sgprs...
-** #10. Update 1. Optimize restore sgpr by busrt 16 sgprs.
-** #11. Update 1. Add 2 more timestamp for debug version
-** #12. Update 1. Add VGPR SR using DWx4, some case improve and some case drop performance
-** #13. Integ  1. Always use MUBUF for PV trap shader...
-** #14. Update 1. s_buffer_store soft clause...
-** #15. Update 1. PERF - sclar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot.
-** #16. Update 1. PRRF - UNROLL LDS_DMA got 2500cycle save in IP tree
-** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part]
-**             2. PERF - Save LDS before save VGPR to cover LDS save long latency...
-** #18. Update 1. FUNC - Implicitly estore STATUS.VCCZ, which is not writable by s_setreg_b32
-**             2. FUNC - Handle non-CWSR traps
-*/
-
-var G8SR_WDMEM_HWREG_OFFSET = 0
-var G8SR_WDMEM_SGPR_OFFSET  = 128  // in bytes
-
-// Keep definition same as the app shader, These 2 time stamps are part of the app shader... Should before any Save and after restore.
-
-var G8SR_DEBUG_TIMESTAMP = 0
-var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4  // ts_save_d timestamp offset relative to SGPR_SR_memory_offset
-var s_g8sr_ts_save_s    = s[34:35]   // save start
-var s_g8sr_ts_sq_save_msg  = s[36:37]   // The save shader send SAVEWAVE msg to spi
-var s_g8sr_ts_spi_wrexec   = s[38:39]   // the SPI write the sr address to SQ
-var s_g8sr_ts_save_d    = s[40:41]   // save end
-var s_g8sr_ts_restore_s = s[42:43]   // restore start
-var s_g8sr_ts_restore_d = s[44:45]   // restore end
-
-var G8SR_VGPR_SR_IN_DWX4 = 0
-var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000    // DWx4 stride is 4*4Bytes
-var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4  = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4
-
-
-/*************************************************************************/
-/*                  control on how to run the shader                     */
-/*************************************************************************/
-//any hack that needs to be made to run this code in EMU (either because various EMU code are not ready or no compute save & restore in EMU run)
-var EMU_RUN_HACK                    =   0
-var EMU_RUN_HACK_RESTORE_NORMAL     =   0
-var EMU_RUN_HACK_SAVE_NORMAL_EXIT   =   0
-var EMU_RUN_HACK_SAVE_SINGLE_WAVE   =   0
-var EMU_RUN_HACK_SAVE_FIRST_TIME    =   0                   //for interrupted restore in which the first save is through EMU_RUN_HACK
-var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO =   0                   //for interrupted restore in which the first save is through EMU_RUN_HACK
-var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI =   0                   //for interrupted restore in which the first save is through EMU_RUN_HACK
-var SAVE_LDS                        =   1
-var WG_BASE_ADDR_LO                 =   0x9000a000
-var WG_BASE_ADDR_HI                 =   0x0
-var WAVE_SPACE                      =   0x5000              //memory size that each wave occupies in workgroup state mem
-var CTX_SAVE_CONTROL                =   0x0
-var CTX_RESTORE_CONTROL             =   CTX_SAVE_CONTROL
-var SIM_RUN_HACK                    =   0                   //any hack that needs to be made to run this code in SIM (either because various RTL code are not ready or no compute save & restore in RTL run)
-var SGPR_SAVE_USE_SQC               =   1                   //use SQC D$ to do the write
-var USE_MTBUF_INSTEAD_OF_MUBUF      =   0                   //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes
-var SWIZZLE_EN                      =   0                   //whether we use swizzled buffer addressing
-
 /**************************************************************************/
 /*                      variables                                         */
 /**************************************************************************/
@@ -226,16 +154,7 @@ shader main
   type(CS)
 
 
-    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))                   //hack to use trap_id for determining save/restore
-        //FIXME VCCZ un-init assertion s_getreg_b32     s_save_status, hwreg(HW_REG_STATUS)         //save STATUS since we will change SCC
-        s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000              //change SCC
-        s_cmp_eq_u32 s_save_tmp, 0x007e0000                         //Save: trap_id = 0x7e. Restore: trap_id = 0x7f.
-        s_cbranch_scc0 L_JUMP_TO_RESTORE                            //do not need to recover STATUS here  since we are going to RESTORE
-        //FIXME  s_setreg_b32   hwreg(HW_REG_STATUS),   s_save_status       //need to recover STATUS since we are going to SAVE
-        s_branch L_SKIP_RESTORE                                     //NOT restore, SAVE actually
-    else
         s_branch L_SKIP_RESTORE                                     //NOT restore. might be a regular trap or save
-    end
 
 L_JUMP_TO_RESTORE:
     s_branch L_RESTORE                                              //restore
@@ -249,7 +168,7 @@ L_SKIP_RESTORE:
     s_cbranch_scc1  L_SAVE                                      //this is the operation for save
 
     // *********    Handle non-CWSR traps       *******************
-if (!EMU_RUN_HACK)
+
     /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */
     s_load_dwordx4  [ttmp8,ttmp9,ttmp10, ttmp11], [tma_lo,tma_hi], 0
     s_waitcnt lgkmcnt(0)
@@ -268,7 +187,7 @@ L_EXCP_CASE:
     s_and_b32   ttmp1, ttmp1, 0xFFFF
     set_status_without_spi_prio(s_save_status, ttmp2) //restore HW status(SCC)
     s_rfe_b64       [ttmp0, ttmp1]
-end
+
     // *********        End handling of non-CWSR traps   *******************
 
 /**************************************************************************/
@@ -276,25 +195,6 @@ end
 /**************************************************************************/
 
 L_SAVE:
-
-if G8SR_DEBUG_TIMESTAMP
-        s_memrealtime   s_g8sr_ts_save_s
-        s_waitcnt lgkmcnt(0)         //FIXME, will cause xnack??
-end
-
-    //check whether there is mem_viol
-    s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
-    s_and_b32   s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK
-    s_cbranch_scc0  L_NO_PC_REWIND
-
-    //if so, need rewind PC assuming GDS operation gets NACKed
-    s_mov_b32       s_save_tmp, 0                                                           //clear mem_viol bit
-    s_setreg_b32    hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp    //clear mem_viol bit
-    s_and_b32       s_save_pc_hi, s_save_pc_hi, 0x0000ffff    //pc[47:32]
-    s_sub_u32       s_save_pc_lo, s_save_pc_lo, 8             //pc[31:0]-8
-    s_subb_u32      s_save_pc_hi, s_save_pc_hi, 0x0           // -scc
-
-L_NO_PC_REWIND:
     s_mov_b32       s_save_tmp, 0                                                           //clear saveCtx bit
     s_setreg_b32    hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp     //clear saveCtx bit
 
@@ -316,16 +216,7 @@ L_NO_PC_REWIND:
     s_mov_b32       s_save_exec_hi, exec_hi
     s_mov_b64       exec,   0x0                                                             //clear EXEC to get ready to receive
 
-if G8SR_DEBUG_TIMESTAMP
-        s_memrealtime  s_g8sr_ts_sq_save_msg
-        s_waitcnt lgkmcnt(0)
-end
-
-    if (EMU_RUN_HACK)
-
-    else
         s_sendmsg   sendmsg(MSG_SAVEWAVE)  //send SPI a message and wait for SPI's write to EXEC
-    end
 
     // Set SPI_PRIO=2 to avoid starving instruction fetch in the waves we're waiting for.
     s_or_b32 s_save_tmp, s_save_status, (2 << SQ_WAVE_STATUS_SPI_PRIO_SHIFT)
@@ -334,36 +225,9 @@ end
   L_SLEEP:
     s_sleep 0x2                // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0
 
-    if (EMU_RUN_HACK)
-
-    else
         s_cbranch_execz L_SLEEP
-    end
-
-if G8SR_DEBUG_TIMESTAMP
-        s_memrealtime  s_g8sr_ts_spi_wrexec
-        s_waitcnt lgkmcnt(0)
-end
 
     /*      setup Resource Contants    */
-    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE))
-        //calculate wd_addr using absolute thread id
-        v_readlane_b32 s_save_tmp, v9, 0
-        s_lshr_b32 s_save_tmp, s_save_tmp, 6
-        s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE
-        s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
-        s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
-        s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
-    else
-    end
-    if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE))
-        s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
-        s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
-        s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
-    else
-    end
-
-
     s_mov_b32       s_save_buf_rsrc0,   s_save_spi_init_lo                                                      //base_addr_lo
     s_and_b32       s_save_buf_rsrc1,   s_save_spi_init_hi, 0x0000FFFF                                          //base_addr_hi
     s_or_b32        s_save_buf_rsrc1,   s_save_buf_rsrc1,  S_SAVE_BUF_RSRC_WORD1_STRIDE
@@ -396,22 +260,10 @@ end
 
 
     s_mov_b32       s_save_buf_rsrc2, 0x4                               //NUM_RECORDS   in bytes
-    if (SWIZZLE_EN)
-        s_add_u32       s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0                     //FIXME need to use swizzle to enable bounds checking?
-    else
         s_mov_b32       s_save_buf_rsrc2,  0x1000000                                //NUM_RECORDS in bytes
-    end
 
 
     write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)                  //M0
-
-    if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))
-        s_add_u32 s_save_pc_lo, s_save_pc_lo, 4             //pc[31:0]+4
-        s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0          //carry bit over
-        s_mov_b32   tba_lo, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO
-        s_mov_b32   tba_hi, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI
-    end
-
     write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset)                   //PC
     write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset)
     write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset)             //EXEC
@@ -453,18 +305,8 @@ end
     s_add_u32       s_save_alloc_size, s_save_alloc_size, 1
     s_lshl_b32      s_save_alloc_size, s_save_alloc_size, 4                         //Number of SGPRs = (sgpr_size + 1) * 16   (non-zero value)
 
-    if (SGPR_SAVE_USE_SQC)
         s_lshl_b32      s_save_buf_rsrc2,   s_save_alloc_size, 2                    //NUM_RECORDS in bytes
-    else
-        s_lshl_b32      s_save_buf_rsrc2,   s_save_alloc_size, 8                    //NUM_RECORDS in bytes (64 threads)
-    end
-
-    if (SWIZZLE_EN)
-        s_add_u32       s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0                     //FIXME need to use swizzle to enable bounds checking?
-    else
         s_mov_b32       s_save_buf_rsrc2,  0x1000000                                //NUM_RECORDS in bytes
-    end
-
 
     // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0
     //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0
@@ -503,30 +345,14 @@ end
     s_mov_b32       exec_lo, 0xFFFFFFFF                                             //need every thread from now on
     s_mov_b32       exec_hi, 0xFFFFFFFF
 
-    if (SWIZZLE_EN)
-        s_add_u32       s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0                     //FIXME need to use swizzle to enable bounds checking?
-    else
         s_mov_b32       s_save_buf_rsrc2,  0x1000000                                //NUM_RECORDS in bytes
-    end
-
 
     // VGPR Allocated in 4-GPR granularity
 
-if G8SR_VGPR_SR_IN_DWX4
-        // the const stride for DWx4 is 4*4 bytes
-        s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF   // reset const stride to 0
-        s_or_b32  s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4  // const stride to 4*4 bytes
-
-        buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
-
-        s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF   // reset const stride to 0
-        s_or_b32  s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE  // reset const stride to 4 bytes
-else
         buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
         buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256
         buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256*2
         buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256*3
-end
 
 
 
@@ -562,64 +388,10 @@ end
     s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()
 
 
-    if (SWIZZLE_EN)
-        s_add_u32       s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0       //FIXME need to use swizzle to enable bounds checking?
-    else
         s_mov_b32       s_save_buf_rsrc2,  0x1000000                  //NUM_RECORDS in bytes
-    end
-
     s_mov_b32       m0, 0x0                                               //lds_offset initial value = 0
 
 
-var LDS_DMA_ENABLE = 0
-var UNROLL = 0
-if UNROLL==0 && LDS_DMA_ENABLE==1
-        s_mov_b32  s3, 256*2
-        s_nop 0
-        s_nop 0
-        s_nop 0
-  L_SAVE_LDS_LOOP:
-        //TODO: looks the 2 buffer_store/load clause for s/r will hurt performance.???
-    if (SAVE_LDS)     //SPI always alloc LDS space in 128DW granularity
-            buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1            // first 64DW
-            buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW
-    end
-
-    s_add_u32       m0, m0, s3                                          //every buffer_store_lds does 256 bytes
-    s_add_u32       s_save_mem_offset, s_save_mem_offset, s3                            //mem offset increased by 256 bytes
-    s_cmp_lt_u32    m0, s_save_alloc_size                                               //scc=(m0 < s_save_alloc_size) ? 1 : 0
-    s_cbranch_scc1  L_SAVE_LDS_LOOP                                                     //LDS save is complete?
-
-elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROOL  , has ichace miss
-      // store from higest LDS address to lowest
-      s_mov_b32  s3, 256*2
-      s_sub_u32  m0, s_save_alloc_size, s3
-      s_add_u32 s_save_mem_offset, s_save_mem_offset, m0
-      s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9   // how many 128 trunks...
-      s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size   // store from higheset addr to lowest
-      s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4   // PC offset increment,  each LDS save block cost 6*4 Bytes instruction
-      s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4   //2is the below 2 inst...//s_addc and s_setpc
-      s_nop 0
-      s_nop 0
-      s_nop 0   //pad 3 dw to let LDS_DMA align with 64Bytes
-      s_getpc_b64 s[0:1]                              // reuse s[0:1], since s[0:1] already saved
-      s_add_u32   s0, s0,s_save_alloc_size
-      s_addc_u32  s1, s1, 0
-      s_setpc_b64 s[0:1]
-
-
-       for var i =0; i< 128; i++
-            // be careful to make here a 64Byte aligned address, which could improve performance...
-            buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0           // first 64DW
-            buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256           // second 64DW
-
-        if i!=127
-        s_sub_u32  m0, m0, s3      // use a sgpr to shrink 2DW-inst to 1DW inst to improve performance , i.e.  pack more LDS_DMA inst to one Cacheline
-            s_sub_u32  s_save_mem_offset, s_save_mem_offset,  s3
-            end
-       end
-
-else   // BUFFER_STORE
       v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0
       v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2     // tid
       v_mul_i32_i24 v2, v3, 8   // tid*8
@@ -641,8 +413,6 @@ L_SAVE_LDS_LOOP_VECTOR:
       // restore rsrc3
       s_mov_b32 s_save_buf_rsrc3, s0
 
-end
-
 L_SAVE_LDS_DONE:
 
 
@@ -660,44 +430,8 @@ L_SAVE_LDS_DONE:
     s_add_u32       s_save_alloc_size, s_save_alloc_size, 1
     s_lshl_b32      s_save_alloc_size, s_save_alloc_size, 2                         //Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)   //FIXME for GFX, zero is possible
     s_lshl_b32      s_save_buf_rsrc2,  s_save_alloc_size, 8                         //NUM_RECORDS in bytes (64 threads*4)
-    if (SWIZZLE_EN)
-        s_add_u32       s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0                     //FIXME need to use swizzle to enable bounds checking?
-    else
         s_mov_b32       s_save_buf_rsrc2,  0x1000000                                //NUM_RECORDS in bytes
-    end
-
-
-    // VGPR Allocated in 4-GPR granularity
 
-if G8SR_VGPR_SR_IN_DWX4
-        // the const stride for DWx4 is 4*4 bytes
-        s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF   // reset const stride to 0
-        s_or_b32  s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4  // const stride to 4*4 bytes
-
-        s_mov_b32         m0, 4     // skip first 4 VGPRs
-        s_cmp_lt_u32      m0, s_save_alloc_size
-        s_cbranch_scc0    L_SAVE_VGPR_LOOP_END      // no more vgprs
-
-        s_set_gpr_idx_on  m0, 0x1   // This will change M0
-        s_add_u32         s_save_alloc_size, s_save_alloc_size, 0x1000  // because above inst change m0
-L_SAVE_VGPR_LOOP:
-        v_mov_b32         v0, v0   // v0 = v[0+m0]
-        v_mov_b32         v1, v1
-        v_mov_b32         v2, v2
-        v_mov_b32         v3, v3
-
-
-        buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
-        s_add_u32         m0, m0, 4
-        s_add_u32         s_save_mem_offset, s_save_mem_offset, 256*4
-        s_cmp_lt_u32      m0, s_save_alloc_size
-    s_cbranch_scc1  L_SAVE_VGPR_LOOP                                                //VGPR save is complete?
-    s_set_gpr_idx_off
-L_SAVE_VGPR_LOOP_END:
-
-        s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF   // reset const stride to 0
-        s_or_b32  s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE  // reset const stride to 4 bytes
-else
     // VGPR store using dw burst
     s_mov_b32         m0, 0x4   //VGPR initial index value =0
     s_cmp_lt_u32      m0, s_save_alloc_size
@@ -713,52 +447,18 @@ else
     v_mov_b32       v2, v2              //v0 = v[0+m0]
     v_mov_b32       v3, v3              //v0 = v[0+m0]
 
-    if(USE_MTBUF_INSTEAD_OF_MUBUF)
-        tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
-    else
         buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
         buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256
         buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256*2
         buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256*3
-    end
 
     s_add_u32       m0, m0, 4                                                       //next vgpr index
     s_add_u32       s_save_mem_offset, s_save_mem_offset, 256*4                     //every buffer_store_dword does 256 bytes
     s_cmp_lt_u32    m0, s_save_alloc_size                                           //scc = (m0 < s_save_alloc_size) ? 1 : 0
     s_cbranch_scc1  L_SAVE_VGPR_LOOP                                                //VGPR save is complete?
     s_set_gpr_idx_off
-end
 
 L_SAVE_VGPR_END:
-
-
-
-
-
-
-    /*     S_PGM_END_SAVED  */                              //FIXME  graphics ONLY
-    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT))
-        s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff    //pc[47:32]
-        s_add_u32 s_save_pc_lo, s_save_pc_lo, 4             //pc[31:0]+4
-        s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0          //carry bit over
-        s_rfe_b64 s_save_pc_lo                              //Return to the main shader program
-    else
-    end
-
-// Save Done timestamp
-if G8SR_DEBUG_TIMESTAMP
-        s_memrealtime   s_g8sr_ts_save_d
-        // SGPR SR memory offset : size(VGPR)
-        get_vgpr_size_bytes(s_save_mem_offset)
-        s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET
-        s_waitcnt lgkmcnt(0)         //FIXME, will cause xnack??
-        // Need reset rsrc2??
-        s_mov_b32 m0, s_save_mem_offset
-        s_mov_b32 s_save_buf_rsrc2,  0x1000000                                  //NUM_RECORDS in bytes
-        s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0       glc:1
-end
-
-
     s_branch    L_END_PGM
 
 
@@ -769,27 +469,6 @@ end
 
 L_RESTORE:
     /*      Setup Resource Contants    */
-    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
-        //calculate wd_addr using absolute thread id
-        v_readlane_b32 s_restore_tmp, v9, 0
-        s_lshr_b32 s_restore_tmp, s_restore_tmp, 6
-        s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE
-        s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO
-        s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI
-        s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL
-    else
-    end
-
-if G8SR_DEBUG_TIMESTAMP
-        s_memrealtime   s_g8sr_ts_restore_s
-        s_waitcnt lgkmcnt(0)         //FIXME, will cause xnack??
-        // tma_lo/hi are sgpr 110, 111, which will not used for 112 SGPR allocated case...
-        s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0]
-        s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1]   //backup ts to ttmp0/1, sicne exec will be finally restored..
-end
-
-
-
     s_mov_b32       s_restore_buf_rsrc0,    s_restore_spi_init_lo                                                           //base_addr_lo
     s_and_b32       s_restore_buf_rsrc1,    s_restore_spi_init_hi, 0x0000FFFF                                               //base_addr_hi
     s_or_b32        s_restore_buf_rsrc1,    s_restore_buf_rsrc1,  S_RESTORE_BUF_RSRC_WORD1_STRIDE
@@ -831,18 +510,12 @@ end
     s_add_u32  s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes()            //FIXME, Check if offset overflow???
 
 
-    if (SWIZZLE_EN)
-        s_add_u32       s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0                       //FIXME need to use swizzle to enable bounds checking?
-    else
         s_mov_b32       s_restore_buf_rsrc2,  0x1000000                                     //NUM_RECORDS in bytes
-    end
     s_mov_b32       m0, 0x0                                                                 //lds_offset initial value = 0
 
   L_RESTORE_LDS_LOOP:
-    if (SAVE_LDS)
         buffer_load_dword   v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1                    // first 64DW
         buffer_load_dword   v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256         // second 64DW
-    end
     s_add_u32       m0, m0, 256*2                                               // 128 DW
     s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 256*2           //mem offset increased by 128DW
     s_cmp_lt_u32    m0, s_restore_alloc_size                                    //scc=(m0 < s_restore_alloc_size) ? 1 : 0
@@ -861,40 +534,8 @@ end
     s_add_u32       s_restore_alloc_size, s_restore_alloc_size, 1
     s_lshl_b32      s_restore_alloc_size, s_restore_alloc_size, 2                           //Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)
     s_lshl_b32      s_restore_buf_rsrc2,  s_restore_alloc_size, 8                           //NUM_RECORDS in bytes (64 threads*4)
-    if (SWIZZLE_EN)
-        s_add_u32       s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0                       //FIXME need to use swizzle to enable bounds checking?
-    else
         s_mov_b32       s_restore_buf_rsrc2,  0x1000000                                     //NUM_RECORDS in bytes
-    end
-
-if G8SR_VGPR_SR_IN_DWX4
-     get_vgpr_size_bytes(s_restore_mem_offset)
-     s_sub_u32         s_restore_mem_offset, s_restore_mem_offset, 256*4
-
-     // the const stride for DWx4 is 4*4 bytes
-     s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF   // reset const stride to 0
-     s_or_b32  s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4  // const stride to 4*4 bytes
-
-     s_mov_b32         m0, s_restore_alloc_size
-     s_set_gpr_idx_on  m0, 0x8    // Note.. This will change m0
-
-L_RESTORE_VGPR_LOOP:
-     buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
-     s_waitcnt vmcnt(0)
-     s_sub_u32         m0, m0, 4
-     v_mov_b32         v0, v0   // v[0+m0] = v0
-     v_mov_b32         v1, v1
-     v_mov_b32         v2, v2
-     v_mov_b32         v3, v3
-     s_sub_u32         s_restore_mem_offset, s_restore_mem_offset, 256*4
-     s_cmp_eq_u32      m0, 0x8000
-     s_cbranch_scc0    L_RESTORE_VGPR_LOOP
-     s_set_gpr_idx_off
-
-     s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF   // reset const stride to 0
-     s_or_b32  s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE  // const stride to 4*4 bytes
-
-else
+
     // VGPR load using dw burst
     s_mov_b32       s_restore_mem_offset_save, s_restore_mem_offset     // restore start with v1, v0 will be the last
     s_add_u32       s_restore_mem_offset, s_restore_mem_offset, 256*4
@@ -903,14 +544,10 @@ else
     s_add_u32       s_restore_alloc_size, s_restore_alloc_size, 0x8000                      //add 0x8000 since we compare m0 against it later
 
   L_RESTORE_VGPR_LOOP:
-    if(USE_MTBUF_INSTEAD_OF_MUBUF)
-        tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
-    else
         buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
         buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256
         buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2
         buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3
-    end
     s_waitcnt       vmcnt(0)                                                                //ensure data ready
     v_mov_b32       v0, v0                                                                  //v[0+m0] = v0
     v_mov_b32       v1, v1
@@ -922,16 +559,10 @@ else
     s_cbranch_scc1  L_RESTORE_VGPR_LOOP                                                     //VGPR restore (except v0) is complete?
     s_set_gpr_idx_off
                                                                                             /* VGPR restore on v0 */
-    if(USE_MTBUF_INSTEAD_OF_MUBUF)
-        tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
-    else
         buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save    slc:1 glc:1
         buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save    slc:1 glc:1 offset:256
         buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save    slc:1 glc:1 offset:256*2
         buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save    slc:1 glc:1 offset:256*3
-    end
-
-end
 
     /*          restore SGPRs       */
     //////////////////////////////
@@ -947,16 +578,8 @@ end
     s_add_u32       s_restore_alloc_size, s_restore_alloc_size, 1
     s_lshl_b32      s_restore_alloc_size, s_restore_alloc_size, 4                           //Number of SGPRs = (sgpr_size + 1) * 16   (non-zero value)
 
-    if (SGPR_SAVE_USE_SQC)
         s_lshl_b32      s_restore_buf_rsrc2,    s_restore_alloc_size, 2                     //NUM_RECORDS in bytes
-    else
-        s_lshl_b32      s_restore_buf_rsrc2,    s_restore_alloc_size, 8                     //NUM_RECORDS in bytes (64 threads)
-    end
-    if (SWIZZLE_EN)
-        s_add_u32       s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0                       //FIXME need to use swizzle to enable bounds checking?
-    else
         s_mov_b32       s_restore_buf_rsrc2,  0x1000000                                     //NUM_RECORDS in bytes
-    end
 
     /* If 112 SGPRs ar allocated, 4 sgprs are not used TBA(108,109),TMA(110,111),
        However, we are safe to restore these 4 SGPRs anyway, since TBA,TMA will later be restored by HWREG
@@ -985,12 +608,6 @@ end
     //////////////////////////////
   L_RESTORE_HWREG:
 
-
-if G8SR_DEBUG_TIMESTAMP
-      s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo
-      s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi
-end
-
     // HWREG SR memory offset : size(VGPR)+size(SGPR)
     get_vgpr_size_bytes(s_restore_mem_offset)
     get_sgpr_size_bytes(s_restore_tmp)
@@ -998,11 +615,7 @@ end
 
 
     s_mov_b32       s_restore_buf_rsrc2, 0x4                                                //NUM_RECORDS   in bytes
-    if (SWIZZLE_EN)
-        s_add_u32       s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0                       //FIXME need to use swizzle to enable bounds checking?
-    else
         s_mov_b32       s_restore_buf_rsrc2,  0x1000000                                     //NUM_RECORDS in bytes
-    end
 
     read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset)                    //M0
     read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset)             //PC
@@ -1019,16 +632,6 @@ end
 
     s_waitcnt       lgkmcnt(0)                                                                                      //from now on, it is safe to restore STATUS and IB_STS
 
-    //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise:
-    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
-        s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8            //pc[31:0]+8     //two back-to-back s_trap are used (first for save and second for restore)
-        s_addc_u32  s_restore_pc_hi, s_restore_pc_hi, 0x0        //carry bit over
-    end
-    if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL))
-        s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4            //pc[31:0]+4     // save is hack through s_trap but restore is normal
-        s_addc_u32  s_restore_pc_hi, s_restore_pc_hi, 0x0        //carry bit over
-    end
-
     s_mov_b32       m0,         s_restore_m0
     s_mov_b32       exec_lo,    s_restore_exec_lo
     s_mov_b32       exec_hi,    s_restore_exec_hi
@@ -1061,11 +664,6 @@ end
 
     s_barrier                                                   //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time
 
-if G8SR_DEBUG_TIMESTAMP
-    s_memrealtime s_g8sr_ts_restore_d
-    s_waitcnt lgkmcnt(0)
-end
-
 //  s_rfe_b64 s_restore_pc_lo                                   //Return to the main shader program and resume execution
     s_rfe_restore_b64  s_restore_pc_lo, s_restore_m0            // s_restore_m0[0] is used to set STATUS.inst_atc
 
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
index 0bb9c577b3a2..75f29d13c90f 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
@@ -24,76 +24,9 @@
  * PROJECT=greenland ./sp3 cwsr_trap_handler_gfx9.asm -hex tmp.hex
  */
 
-/* HW (GFX9) source code for CWSR trap handler */
-/* Version 18 + multiple trap handler */
-
-// this performance-optimal version was originally from Seven Xu at SRDC
-
-// Revison #18	 --...
-/* Rev History
-** #1. Branch from gc dv.   //gfxip/gfx9/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV)
-** #4. SR Memory Layout:
-**			 1. VGPR-SGPR-HWREG-{LDS}
-**			 2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern..
-** #5. Update: 1. Accurate g8sr_ts_save_d timestamp
-** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation)
-** #7. Update: 1. don't barrier if noLDS
-** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version
-**	       2. Fix SQ issue by s_sleep 2
-** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last
-**	       2. optimize s_buffer save by burst 16sgprs...
-** #10. Update 1. Optimize restore sgpr by busrt 16 sgprs.
-** #11. Update 1. Add 2 more timestamp for debug version
-** #12. Update 1. Add VGPR SR using DWx4, some case improve and some case drop performance
-** #13. Integ  1. Always use MUBUF for PV trap shader...
-** #14. Update 1. s_buffer_store soft clause...
-** #15. Update 1. PERF - sclar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot.
-** #16. Update 1. PRRF - UNROLL LDS_DMA got 2500cycle save in IP tree
-** #17. Update 1. FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part]
-**	       2. PERF - Save LDS before save VGPR to cover LDS save long latency...
-** #18. Update 1. FUNC - Implicitly estore STATUS.VCCZ, which is not writable by s_setreg_b32
-**	       2. FUNC - Handle non-CWSR traps
-*/
-
-var G8SR_WDMEM_HWREG_OFFSET = 0
-var G8SR_WDMEM_SGPR_OFFSET  = 128  // in bytes
-
-// Keep definition same as the app shader, These 2 time stamps are part of the app shader... Should before any Save and after restore.
-
-var G8SR_DEBUG_TIMESTAMP = 0
-var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4	// ts_save_d timestamp offset relative to SGPR_SR_memory_offset
-var s_g8sr_ts_save_s	= s[34:35]   // save start
-var s_g8sr_ts_sq_save_msg  = s[36:37]	// The save shader send SAVEWAVE msg to spi
-var s_g8sr_ts_spi_wrexec   = s[38:39]	// the SPI write the sr address to SQ
-var s_g8sr_ts_save_d	= s[40:41]   // save end
-var s_g8sr_ts_restore_s = s[42:43]   // restore start
-var s_g8sr_ts_restore_d = s[44:45]   // restore end
-
-var G8SR_VGPR_SR_IN_DWX4 = 0
-var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000	 // DWx4 stride is 4*4Bytes
-var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4  = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4
-
-
-/*************************************************************************/
-/*		    control on how to run the shader			 */
-/*************************************************************************/
-//any hack that needs to be made to run this code in EMU (either because various EMU code are not ready or no compute save & restore in EMU run)
-var EMU_RUN_HACK		    =	0
-var EMU_RUN_HACK_RESTORE_NORMAL	    =	0
-var EMU_RUN_HACK_SAVE_NORMAL_EXIT   =	0
-var EMU_RUN_HACK_SAVE_SINGLE_WAVE   =	0
-var EMU_RUN_HACK_SAVE_FIRST_TIME    =	0		    //for interrupted restore in which the first save is through EMU_RUN_HACK
-var SAVE_LDS			    =	1
-var WG_BASE_ADDR_LO		    =	0x9000a000
-var WG_BASE_ADDR_HI		    =	0x0
-var WAVE_SPACE			    =	0x5000		    //memory size that each wave occupies in workgroup state mem
-var CTX_SAVE_CONTROL		    =	0x0
-var CTX_RESTORE_CONTROL		    =	CTX_SAVE_CONTROL
-var SIM_RUN_HACK		    =	0		    //any hack that needs to be made to run this code in SIM (either because various RTL code are not ready or no compute save & restore in RTL run)
-var SGPR_SAVE_USE_SQC		    =	1		    //use SQC D$ to do the write
-var USE_MTBUF_INSTEAD_OF_MUBUF	    =	0		    //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes
-var SWIZZLE_EN			    =	0		    //whether we use swizzled buffer addressing
 var ACK_SQC_STORE		    =	1		    //workaround for suspected SQC store bug causing incorrect stores under concurrency
+var SAVE_AFTER_XNACK_ERROR	    =	1		    //workaround for TCP store failure after XNACK error when ALLOW_REPLAY=0, for debugger
+var SINGLE_STEP_MISSED_WORKAROUND   =	1		    //workaround for lost MODE.DEBUG_EN exception when SAVECTX raised
 
 /**************************************************************************/
 /*			variables					  */
@@ -107,6 +40,7 @@ var SQ_WAVE_STATUS_PRE_SPI_PRIO_SHIFT   = 0
 var SQ_WAVE_STATUS_PRE_SPI_PRIO_SIZE    = 1
 var SQ_WAVE_STATUS_POST_SPI_PRIO_SHIFT  = 3
 var SQ_WAVE_STATUS_POST_SPI_PRIO_SIZE   = 29
+var SQ_WAVE_STATUS_ALLOW_REPLAY_MASK    = 0x400000
 
 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT	= 12
 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE	= 9
@@ -127,12 +61,15 @@ var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK	=   0xFFFFF800
 var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT	=   11
 var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE	=   21
 var SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK	=   0x800
+var SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK	=   0x10000000
 
 var SQ_WAVE_IB_STS_RCNT_SHIFT		=   16			//FIXME
 var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT	=   15			//FIXME
 var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK	= 0x1F8000
 var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG	= 0x00007FFF	//FIXME
 
+var SQ_WAVE_MODE_DEBUG_EN_MASK		=   0x800
+
 var SQ_BUF_RSRC_WORD1_ATC_SHIFT	    =	24
 var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT   =	27
 
@@ -150,10 +87,10 @@ var S_SAVE_SPI_INIT_MTYPE_SHIFT		=   28
 var S_SAVE_SPI_INIT_FIRST_WAVE_MASK	=   0x04000000		//bit[26]: FirstWaveInTG
 var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT	=   26
 
-var S_SAVE_PC_HI_RCNT_SHIFT		=   28			//FIXME	 check with Brian to ensure all fields other than PC[47:0] can be used
-var S_SAVE_PC_HI_RCNT_MASK		=   0xF0000000		//FIXME
-var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT	=   27			//FIXME
-var S_SAVE_PC_HI_FIRST_REPLAY_MASK	=   0x08000000		//FIXME
+var S_SAVE_PC_HI_RCNT_SHIFT		=   27			//FIXME	 check with Brian to ensure all fields other than PC[47:0] can be used
+var S_SAVE_PC_HI_RCNT_MASK		=   0xF8000000		//FIXME
+var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT	=   26			//FIXME
+var S_SAVE_PC_HI_FIRST_REPLAY_MASK	=   0x04000000		//FIXME
 
 var s_save_spi_init_lo		    =	exec_lo
 var s_save_spi_init_hi		    =	exec_hi
@@ -162,8 +99,8 @@ var s_save_pc_lo	    =	ttmp0		//{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],tra
 var s_save_pc_hi	    =	ttmp1
 var s_save_exec_lo	    =	ttmp2
 var s_save_exec_hi	    =	ttmp3
-var s_save_tmp		    =	ttmp4
-var s_save_trapsts	    =	ttmp5		//not really used until the end of the SAVE routine
+var s_save_tmp		    =	ttmp14
+var s_save_trapsts	    =	ttmp15		//not really used until the end of the SAVE routine
 var s_save_xnack_mask_lo    =	ttmp6
 var s_save_xnack_mask_hi    =	ttmp7
 var s_save_buf_rsrc0	    =	ttmp8
@@ -171,9 +108,9 @@ var s_save_buf_rsrc1	    =	ttmp9
 var s_save_buf_rsrc2	    =	ttmp10
 var s_save_buf_rsrc3	    =	ttmp11
 var s_save_status	    =	ttmp12
-var s_save_mem_offset	    =	ttmp14
+var s_save_mem_offset	    =	ttmp4
 var s_save_alloc_size	    =	s_save_trapsts		//conflict
-var s_save_m0		    =	ttmp15
+var s_save_m0		    =	ttmp5
 var s_save_ttmps_lo	    =	s_save_tmp		//no conflict
 var s_save_ttmps_hi	    =	s_save_trapsts		//no conflict
 
@@ -197,20 +134,22 @@ var s_restore_spi_init_lo		    =	exec_lo
 var s_restore_spi_init_hi		    =	exec_hi
 
 var s_restore_mem_offset	=   ttmp12
+var s_restore_accvgpr_offset	=   ttmp13
 var s_restore_alloc_size	=   ttmp3
 var s_restore_tmp		=   ttmp2
 var s_restore_mem_offset_save	=   s_restore_tmp	//no conflict
+var s_restore_accvgpr_offset_save = ttmp7
 
 var s_restore_m0	    =	s_restore_alloc_size	//no conflict
 
-var s_restore_mode	    =	ttmp7
+var s_restore_mode	    =	s_restore_accvgpr_offset_save
 
 var s_restore_pc_lo	    =	ttmp0
 var s_restore_pc_hi	    =	ttmp1
-var s_restore_exec_lo	    =	ttmp14
-var s_restore_exec_hi	    = 	ttmp15
-var s_restore_status	    =	ttmp4
-var s_restore_trapsts	    =	ttmp5
+var s_restore_exec_lo	    =	ttmp4
+var s_restore_exec_hi	    = 	ttmp5
+var s_restore_status	    =	ttmp14
+var s_restore_trapsts	    =	ttmp15
 var s_restore_xnack_mask_lo =	xnack_mask_lo
 var s_restore_xnack_mask_hi =	xnack_mask_hi
 var s_restore_buf_rsrc0	    =	ttmp8
@@ -226,20 +165,11 @@ var s_restore_ttmps_hi	    =	s_restore_alloc_size	//no conflict
 /* Shader Main*/
 
 shader main
-  asic(GFX9)
+  asic(DEFAULT)
   type(CS)
 
 
-    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))		    //hack to use trap_id for determining save/restore
-	//FIXME VCCZ un-init assertion s_getreg_b32	s_save_status, hwreg(HW_REG_STATUS)	    //save STATUS since we will change SCC
-	s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000		    //change SCC
-	s_cmp_eq_u32 s_save_tmp, 0x007e0000			    //Save: trap_id = 0x7e. Restore: trap_id = 0x7f.
-	s_cbranch_scc0 L_JUMP_TO_RESTORE			    //do not need to recover STATUS here  since we are going to RESTORE
-	//FIXME	 s_setreg_b32	hwreg(HW_REG_STATUS),	s_save_status	    //need to recover STATUS since we are going to SAVE
-	s_branch L_SKIP_RESTORE					    //NOT restore, SAVE actually
-    else
 	s_branch L_SKIP_RESTORE					    //NOT restore. might be a regular trap or save
-    end
 
 L_JUMP_TO_RESTORE:
     s_branch L_RESTORE						    //restore
@@ -248,12 +178,29 @@ L_SKIP_RESTORE:
 
     s_getreg_b32    s_save_status, hwreg(HW_REG_STATUS)				    //save STATUS since we will change SCC
     s_andn2_b32	    s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK	    //check whether this is for save
+
+if SINGLE_STEP_MISSED_WORKAROUND
+    // No single step exceptions if MODE.DEBUG_EN=0.
+    s_getreg_b32    ttmp2, hwreg(HW_REG_MODE)
+    s_and_b32       ttmp2, ttmp2, SQ_WAVE_MODE_DEBUG_EN_MASK
+    s_cbranch_scc0  L_NO_SINGLE_STEP_WORKAROUND
+
+    // Second-level trap already handled exception if STATUS.HALT=1.
+    s_and_b32       ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK
+
+    // Prioritize single step exception over context save.
+    // Second-level trap will halt wave and RFE, re-entering for SAVECTX.
+    s_cbranch_scc0  L_FETCH_2ND_TRAP
+
+L_NO_SINGLE_STEP_WORKAROUND:
+end
+
     s_getreg_b32    s_save_trapsts, hwreg(HW_REG_TRAPSTS)
     s_and_b32       ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK    //check whether this is for save
     s_cbranch_scc1  L_SAVE					//this is the operation for save
 
     // *********    Handle non-CWSR traps	*******************
-if (!EMU_RUN_HACK)
+
     // Illegal instruction is a non-maskable exception which blocks context save.
     // Halt the wavefront and return from the trap.
     s_and_b32       ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK
@@ -266,10 +213,16 @@ if (!EMU_RUN_HACK)
 
 L_HALT_WAVE:
     // If STATUS.HALT is set then this fault must come from SQC instruction fetch.
-    // We cannot prevent further faults so just terminate the wavefront.
+    // We cannot prevent further faults. Spin wait until context saved.
     s_and_b32       ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK
     s_cbranch_scc0  L_NOT_ALREADY_HALTED
-    s_endpgm
+
+L_WAIT_CTX_SAVE:
+    s_sleep         0x10
+    s_getreg_b32    ttmp2, hwreg(HW_REG_TRAPSTS)
+    s_and_b32       ttmp2, ttmp2, SQ_WAVE_TRAPSTS_SAVECTX_MASK
+    s_cbranch_scc0  L_WAIT_CTX_SAVE
+
 L_NOT_ALREADY_HALTED:
     s_or_b32        s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK
 
@@ -293,12 +246,12 @@ L_FETCH_2ND_TRAP:
     // Read second-level TBA/TMA from first-level TMA and jump if available.
     // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data)
     // ttmp12 holds SQ_WAVE_STATUS
-    s_getreg_b32    ttmp4, hwreg(HW_REG_SQ_SHADER_TMA_LO)
-    s_getreg_b32    ttmp5, hwreg(HW_REG_SQ_SHADER_TMA_HI)
-    s_lshl_b64      [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8
-    s_load_dwordx2  [ttmp2, ttmp3], [ttmp4, ttmp5], 0x0 glc:1 // second-level TBA
+    s_getreg_b32    ttmp14, hwreg(HW_REG_SQ_SHADER_TMA_LO)
+    s_getreg_b32    ttmp15, hwreg(HW_REG_SQ_SHADER_TMA_HI)
+    s_lshl_b64      [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8
+    s_load_dwordx2  [ttmp2, ttmp3], [ttmp14, ttmp15], 0x0 glc:1 // second-level TBA
     s_waitcnt       lgkmcnt(0)
-    s_load_dwordx2  [ttmp4, ttmp5], [ttmp4, ttmp5], 0x8 glc:1 // second-level TMA
+    s_load_dwordx2  [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 glc:1 // second-level TMA
     s_waitcnt       lgkmcnt(0)
     s_and_b64       [ttmp2, ttmp3], [ttmp2, ttmp3], [ttmp2, ttmp3]
     s_cbranch_scc0  L_NO_NEXT_TRAP // second-level trap handler not been set
@@ -324,7 +277,7 @@ L_EXCP_CASE:
     set_status_without_spi_prio(s_save_status, ttmp2)
 
     s_rfe_b64       [ttmp0, ttmp1]
-end
+
     // *********	End handling of non-CWSR traps	 *******************
 
 /**************************************************************************/
@@ -332,12 +285,6 @@ end
 /**************************************************************************/
 
 L_SAVE:
-
-if G8SR_DEBUG_TIMESTAMP
-	s_memrealtime	s_g8sr_ts_save_s
-	s_waitcnt lgkmcnt(0)	     //FIXME, will cause xnack??
-end
-
     s_and_b32	    s_save_pc_hi, s_save_pc_hi, 0x0000ffff    //pc[47:32]
 
     s_mov_b32	    s_save_tmp, 0							    //clear saveCtx bit
@@ -359,16 +306,7 @@ end
     s_mov_b32	    s_save_exec_hi, exec_hi
     s_mov_b64	    exec,   0x0								    //clear EXEC to get ready to receive
 
-if G8SR_DEBUG_TIMESTAMP
-	s_memrealtime  s_g8sr_ts_sq_save_msg
-	s_waitcnt lgkmcnt(0)
-end
-
-    if (EMU_RUN_HACK)
-
-    else
 	s_sendmsg   sendmsg(MSG_SAVEWAVE)  //send SPI a message and wait for SPI's write to EXEC
-    end
 
     // Set SPI_PRIO=2 to avoid starving instruction fetch in the waves we're waiting for.
     s_or_b32 s_save_tmp, s_save_status, (2 << SQ_WAVE_STATUS_SPI_PRIO_SHIFT)
@@ -377,35 +315,9 @@ end
   L_SLEEP:
     s_sleep 0x2		       // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0
 
-    if (EMU_RUN_HACK)
-
-    else
 	s_cbranch_execz L_SLEEP
-    end
-
-if G8SR_DEBUG_TIMESTAMP
-	s_memrealtime  s_g8sr_ts_spi_wrexec
-	s_waitcnt lgkmcnt(0)
-end
-
-    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE))
-	//calculate wd_addr using absolute thread id
-	v_readlane_b32 s_save_tmp, v9, 0
-	s_lshr_b32 s_save_tmp, s_save_tmp, 6
-	s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE
-	s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
-	s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
-	s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
-    else
-    end
-    if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE))
-	s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO
-	s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI
-	s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL
-    else
-    end
 
-    // Save trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic
+    // Save trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
     // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40
     get_vgpr_size_bytes(s_save_ttmps_lo)
     get_sgpr_size_bytes(s_save_ttmps_hi)
@@ -413,13 +325,11 @@ end
     s_add_u32	    s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo
     s_addc_u32	    s_save_ttmps_hi, s_save_spi_init_hi, 0x0
     s_and_b32	    s_save_ttmps_hi, s_save_ttmps_hi, 0xFFFF
-    s_store_dwordx2 [ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x40 glc:1
+    s_store_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x50 glc:1
     ack_sqc_store_workaround()
-    s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x48 glc:1
+    s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x60 glc:1
     ack_sqc_store_workaround()
-    s_store_dword   ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x58 glc:1
-    ack_sqc_store_workaround()
-    s_store_dwordx2 [ttmp14, ttmp15], [s_save_ttmps_lo, s_save_ttmps_hi], 0x5C glc:1
+    s_store_dword   ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x74 glc:1
     ack_sqc_store_workaround()
 
     /*	    setup Resource Contants    */
@@ -455,20 +365,10 @@ end
 
 
     s_mov_b32	    s_save_buf_rsrc2, 0x4				//NUM_RECORDS	in bytes
-    if (SWIZZLE_EN)
-	s_add_u32	s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0			    //FIXME need to use swizzle to enable bounds checking?
-    else
 	s_mov_b32	s_save_buf_rsrc2,  0x1000000				    //NUM_RECORDS in bytes
-    end
 
 
     write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)			//M0
-
-    if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME))
-	s_add_u32 s_save_pc_lo, s_save_pc_lo, 4		    //pc[31:0]+4
-	s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0	    //carry bit over
-    end
-
     write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset)		    //PC
     write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset)
     write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset)		//EXEC
@@ -506,17 +406,9 @@ end
     s_add_u32	    s_save_alloc_size, s_save_alloc_size, 1
     s_lshl_b32	    s_save_alloc_size, s_save_alloc_size, 4			    //Number of SGPRs = (sgpr_size + 1) * 16   (non-zero value)
 
-    if (SGPR_SAVE_USE_SQC)
 	s_lshl_b32	s_save_buf_rsrc2,   s_save_alloc_size, 2		    //NUM_RECORDS in bytes
-    else
-	s_lshl_b32	s_save_buf_rsrc2,   s_save_alloc_size, 8		    //NUM_RECORDS in bytes (64 threads)
-    end
 
-    if (SWIZZLE_EN)
-	s_add_u32	s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0			    //FIXME need to use swizzle to enable bounds checking?
-    else
 	s_mov_b32	s_save_buf_rsrc2,  0x1000000				    //NUM_RECORDS in bytes
-    end
 
 
     // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0
@@ -559,30 +451,25 @@ end
     s_mov_b32	    xnack_mask_lo, 0x0
     s_mov_b32	    xnack_mask_hi, 0x0
 
-    if (SWIZZLE_EN)
-	s_add_u32	s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0			    //FIXME need to use swizzle to enable bounds checking?
-    else
 	s_mov_b32	s_save_buf_rsrc2,  0x1000000				    //NUM_RECORDS in bytes
-    end
 
 
     // VGPR Allocated in 4-GPR granularity
 
-if G8SR_VGPR_SR_IN_DWX4
-	// the const stride for DWx4 is 4*4 bytes
-	s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF   // reset const stride to 0
-	s_or_b32  s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4  // const stride to 4*4 bytes
+if SAVE_AFTER_XNACK_ERROR
+	check_if_tcp_store_ok()
+	s_cbranch_scc1 L_SAVE_FIRST_VGPRS_WITH_TCP
 
-	buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+	write_vgprs_to_mem_with_sqc(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)
+	s_branch L_SAVE_LDS
+
+L_SAVE_FIRST_VGPRS_WITH_TCP:
+end
 
-	s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF   // reset const stride to 0
-	s_or_b32  s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE  // reset const stride to 4 bytes
-else
 	buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
 	buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256
 	buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256*2
 	buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256*3
-end
 
 
 
@@ -617,66 +504,34 @@ end
     s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes()
 
 
-    if (SWIZZLE_EN)
-	s_add_u32	s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0	      //FIXME need to use swizzle to enable bounds checking?
-    else
 	s_mov_b32	s_save_buf_rsrc2,  0x1000000		      //NUM_RECORDS in bytes
-    end
 
     s_mov_b32	    m0, 0x0						  //lds_offset initial value = 0
 
 
-var LDS_DMA_ENABLE = 0
-var UNROLL = 0
-if UNROLL==0 && LDS_DMA_ENABLE==1
-	s_mov_b32  s3, 256*2
-	s_nop 0
-	s_nop 0
-	s_nop 0
-  L_SAVE_LDS_LOOP:
-	//TODO: looks the 2 buffer_store/load clause for s/r will hurt performance.???
-    if (SAVE_LDS)     //SPI always alloc LDS space in 128DW granularity
-	    buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1		// first 64DW
-	    buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW
-    end
-
-    s_add_u32	    m0, m0, s3						//every buffer_store_lds does 256 bytes
-    s_add_u32	    s_save_mem_offset, s_save_mem_offset, s3				//mem offset increased by 256 bytes
-    s_cmp_lt_u32    m0, s_save_alloc_size						//scc=(m0 < s_save_alloc_size) ? 1 : 0
-    s_cbranch_scc1  L_SAVE_LDS_LOOP							//LDS save is complete?
-
-elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROOL	, has ichace miss
-      // store from higest LDS address to lowest
-      s_mov_b32	 s3, 256*2
-      s_sub_u32	 m0, s_save_alloc_size, s3
-      s_add_u32 s_save_mem_offset, s_save_mem_offset, m0
-      s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9   // how many 128 trunks...
-      s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size   // store from higheset addr to lowest
-      s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4   // PC offset increment,  each LDS save block cost 6*4 Bytes instruction
-      s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4   //2is the below 2 inst...//s_addc and s_setpc
-      s_nop 0
-      s_nop 0
-      s_nop 0	//pad 3 dw to let LDS_DMA align with 64Bytes
-      s_getpc_b64 s[0:1]			      // reuse s[0:1], since s[0:1] already saved
-      s_add_u32	  s0, s0,s_save_alloc_size
-      s_addc_u32  s1, s1, 0
-      s_setpc_b64 s[0:1]
-
-
-       for var i =0; i< 128; i++
-	    // be careful to make here a 64Byte aligned address, which could improve performance...
-	    buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0		// first 64DW
-	    buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256		  // second 64DW
-
-	if i!=127
-	s_sub_u32  m0, m0, s3	   // use a sgpr to shrink 2DW-inst to 1DW inst to improve performance , i.e.  pack more LDS_DMA inst to one Cacheline
-	    s_sub_u32  s_save_mem_offset, s_save_mem_offset,  s3
-	    end
-       end
-
-else   // BUFFER_STORE
       v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0
       v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2	// tid
+
+if SAVE_AFTER_XNACK_ERROR
+	check_if_tcp_store_ok()
+	s_cbranch_scc1 L_SAVE_LDS_WITH_TCP
+
+	v_lshlrev_b32 v2, 2, v3
+L_SAVE_LDS_LOOP_SQC:
+	ds_read2_b32 v[0:1], v2 offset0:0 offset1:0x40
+	s_waitcnt lgkmcnt(0)
+
+	write_vgprs_to_mem_with_sqc(v0, 2, s_save_buf_rsrc0, s_save_mem_offset)
+
+	v_add_u32 v2, 0x200, v2
+	v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
+	s_cbranch_vccnz L_SAVE_LDS_LOOP_SQC
+
+	s_branch L_SAVE_LDS_DONE
+
+L_SAVE_LDS_WITH_TCP:
+end
+
       v_mul_i32_i24 v2, v3, 8	// tid*8
       v_mov_b32 v3, 256*2
       s_mov_b32 m0, 0x10000
@@ -697,8 +552,6 @@ L_SAVE_LDS_LOOP_VECTOR:
       // restore rsrc3
       s_mov_b32 s_save_buf_rsrc3, s0
 
-end
-
 L_SAVE_LDS_DONE:
 
 
@@ -716,44 +569,9 @@ L_SAVE_LDS_DONE:
     s_add_u32	    s_save_alloc_size, s_save_alloc_size, 1
     s_lshl_b32	    s_save_alloc_size, s_save_alloc_size, 2			    //Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)	  //FIXME for GFX, zero is possible
     s_lshl_b32	    s_save_buf_rsrc2,  s_save_alloc_size, 8			    //NUM_RECORDS in bytes (64 threads*4)
-    if (SWIZZLE_EN)
-	s_add_u32	s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0			    //FIXME need to use swizzle to enable bounds checking?
-    else
 	s_mov_b32	s_save_buf_rsrc2,  0x1000000				    //NUM_RECORDS in bytes
-    end
-
-
-    // VGPR Allocated in 4-GPR granularity
-
-if G8SR_VGPR_SR_IN_DWX4
-	// the const stride for DWx4 is 4*4 bytes
-	s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF   // reset const stride to 0
-	s_or_b32  s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4  // const stride to 4*4 bytes
-
-	s_mov_b32	  m0, 4	    // skip first 4 VGPRs
-	s_cmp_lt_u32	  m0, s_save_alloc_size
-	s_cbranch_scc0	  L_SAVE_VGPR_LOOP_END	    // no more vgprs
 
-	s_set_gpr_idx_on  m0, 0x1   // This will change M0
-	s_add_u32	  s_save_alloc_size, s_save_alloc_size, 0x1000	// because above inst change m0
-L_SAVE_VGPR_LOOP:
-	v_mov_b32	  v0, v0   // v0 = v[0+m0]
-	v_mov_b32	  v1, v1
-	v_mov_b32	  v2, v2
-	v_mov_b32	  v3, v3
 
-
-	buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
-	s_add_u32	  m0, m0, 4
-	s_add_u32	  s_save_mem_offset, s_save_mem_offset, 256*4
-	s_cmp_lt_u32	  m0, s_save_alloc_size
-    s_cbranch_scc1  L_SAVE_VGPR_LOOP						    //VGPR save is complete?
-    s_set_gpr_idx_off
-L_SAVE_VGPR_LOOP_END:
-
-	s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF   // reset const stride to 0
-	s_or_b32  s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE  // reset const stride to 4 bytes
-else
     // VGPR store using dw burst
     s_mov_b32	      m0, 0x4	//VGPR initial index value =0
     s_cmp_lt_u32      m0, s_save_alloc_size
@@ -763,57 +581,82 @@ else
     s_set_gpr_idx_on	m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1
     s_add_u32	    s_save_alloc_size, s_save_alloc_size, 0x1000		    //add 0x1000 since we compare m0 against it later
 
+if SAVE_AFTER_XNACK_ERROR
+	check_if_tcp_store_ok()
+	s_cbranch_scc1 L_SAVE_VGPR_LOOP
+
+L_SAVE_VGPR_LOOP_SQC:
+	write_vgprs_to_mem_with_sqc(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)
+
+	s_add_u32 m0, m0, 4
+	s_cmp_lt_u32 m0, s_save_alloc_size
+	s_cbranch_scc1 L_SAVE_VGPR_LOOP_SQC
+
+	s_set_gpr_idx_off
+	s_branch L_SAVE_VGPR_END
+end
+
   L_SAVE_VGPR_LOOP:
     v_mov_b32	    v0, v0		//v0 = v[0+m0]
     v_mov_b32	    v1, v1		//v0 = v[0+m0]
     v_mov_b32	    v2, v2		//v0 = v[0+m0]
     v_mov_b32	    v3, v3		//v0 = v[0+m0]
 
-    if(USE_MTBUF_INSTEAD_OF_MUBUF)
-	tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
-    else
 	buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
 	buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256
 	buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256*2
 	buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1  offset:256*3
-    end
 
     s_add_u32	    m0, m0, 4							    //next vgpr index
     s_add_u32	    s_save_mem_offset, s_save_mem_offset, 256*4			    //every buffer_store_dword does 256 bytes
     s_cmp_lt_u32    m0, s_save_alloc_size					    //scc = (m0 < s_save_alloc_size) ? 1 : 0
     s_cbranch_scc1  L_SAVE_VGPR_LOOP						    //VGPR save is complete?
     s_set_gpr_idx_off
-end
 
 L_SAVE_VGPR_END:
 
+if ASIC_TARGET_ARCTURUS
+    // Save ACC VGPRs
+    s_mov_b32 m0, 0x0 //VGPR initial index value =0
+    s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1
+
+if SAVE_AFTER_XNACK_ERROR
+    check_if_tcp_store_ok()
+    s_cbranch_scc1 L_SAVE_ACCVGPR_LOOP
 
+L_SAVE_ACCVGPR_LOOP_SQC:
+    for var vgpr = 0; vgpr < 4; ++ vgpr
+        v_accvgpr_read v[vgpr], acc[vgpr]  // v[N] = acc[N+m0]
+    end
 
+    write_vgprs_to_mem_with_sqc(v0, 4, s_save_buf_rsrc0, s_save_mem_offset)
 
+    s_add_u32 m0, m0, 4
+    s_cmp_lt_u32 m0, s_save_alloc_size
+    s_cbranch_scc1 L_SAVE_ACCVGPR_LOOP_SQC
 
+    s_set_gpr_idx_off
+    s_branch L_SAVE_ACCVGPR_END
+end
 
-    /*	   S_PGM_END_SAVED  */				    //FIXME  graphics ONLY
-    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT))
-	s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff    //pc[47:32]
-	s_add_u32 s_save_pc_lo, s_save_pc_lo, 4		    //pc[31:0]+4
-	s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0	    //carry bit over
-	s_rfe_b64 s_save_pc_lo				    //Return to the main shader program
-    else
+L_SAVE_ACCVGPR_LOOP:
+    for var vgpr = 0; vgpr < 4; ++ vgpr
+        v_accvgpr_read v[vgpr], acc[vgpr]  // v[N] = acc[N+m0]
     end
 
-// Save Done timestamp
-if G8SR_DEBUG_TIMESTAMP
-	s_memrealtime	s_g8sr_ts_save_d
-	// SGPR SR memory offset : size(VGPR)
-	get_vgpr_size_bytes(s_save_mem_offset)
-	s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET
-	s_waitcnt lgkmcnt(0)	     //FIXME, will cause xnack??
-	// Need reset rsrc2??
-	s_mov_b32 m0, s_save_mem_offset
-	s_mov_b32 s_save_buf_rsrc2,  0x1000000					//NUM_RECORDS in bytes
-	s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0	    glc:1
-end
+    buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1
+    buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256
+    buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2
+    buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3
 
+    s_add_u32 m0, m0, 4
+    s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4
+    s_cmp_lt_u32 m0, s_save_alloc_size
+    s_cbranch_scc1 L_SAVE_ACCVGPR_LOOP
+    s_set_gpr_idx_off
+
+L_SAVE_ACCVGPR_END:
+end
 
     s_branch	L_END_PGM
 
@@ -825,27 +668,6 @@ end
 
 L_RESTORE:
     /*	    Setup Resource Contants    */
-    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
-	//calculate wd_addr using absolute thread id
-	v_readlane_b32 s_restore_tmp, v9, 0
-	s_lshr_b32 s_restore_tmp, s_restore_tmp, 6
-	s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE
-	s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO
-	s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI
-	s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL
-    else
-    end
-
-if G8SR_DEBUG_TIMESTAMP
-	s_memrealtime	s_g8sr_ts_restore_s
-	s_waitcnt lgkmcnt(0)	     //FIXME, will cause xnack??
-	// tma_lo/hi are sgpr 110, 111, which will not used for 112 SGPR allocated case...
-	s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0]
-	s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1]   //backup ts to ttmp0/1, sicne exec will be finally restored..
-end
-
-
-
     s_mov_b32	    s_restore_buf_rsrc0,    s_restore_spi_init_lo							    //base_addr_lo
     s_and_b32	    s_restore_buf_rsrc1,    s_restore_spi_init_hi, 0x0000FFFF						    //base_addr_hi
     s_or_b32	    s_restore_buf_rsrc1,    s_restore_buf_rsrc1,  S_RESTORE_BUF_RSRC_WORD1_STRIDE
@@ -887,18 +709,12 @@ end
     s_add_u32  s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes()	     //FIXME, Check if offset overflow???
 
 
-    if (SWIZZLE_EN)
-	s_add_u32	s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0			    //FIXME need to use swizzle to enable bounds checking?
-    else
 	s_mov_b32	s_restore_buf_rsrc2,  0x1000000					    //NUM_RECORDS in bytes
-    end
     s_mov_b32	    m0, 0x0								    //lds_offset initial value = 0
 
   L_RESTORE_LDS_LOOP:
-    if (SAVE_LDS)
 	buffer_load_dword   v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1		       // first 64DW
 	buffer_load_dword   v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256	       // second 64DW
-    end
     s_add_u32	    m0, m0, 256*2						// 128 DW
     s_add_u32	    s_restore_mem_offset, s_restore_mem_offset, 256*2		//mem offset increased by 128DW
     s_cmp_lt_u32    m0, s_restore_alloc_size					//scc=(m0 < s_restore_alloc_size) ? 1 : 0
@@ -917,56 +733,43 @@ end
     s_add_u32	    s_restore_alloc_size, s_restore_alloc_size, 1
     s_lshl_b32	    s_restore_alloc_size, s_restore_alloc_size, 2			    //Number of VGPRs = (vgpr_size + 1) * 4    (non-zero value)
     s_lshl_b32	    s_restore_buf_rsrc2,  s_restore_alloc_size, 8			    //NUM_RECORDS in bytes (64 threads*4)
-    if (SWIZZLE_EN)
-	s_add_u32	s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0			    //FIXME need to use swizzle to enable bounds checking?
-    else
+
+if ASIC_TARGET_ARCTURUS
+    s_mov_b32	    s_restore_accvgpr_offset, s_restore_buf_rsrc2                           //ACC VGPRs at end of VGPRs
+end
+
 	s_mov_b32	s_restore_buf_rsrc2,  0x1000000					    //NUM_RECORDS in bytes
-    end
 
-if G8SR_VGPR_SR_IN_DWX4
-     get_vgpr_size_bytes(s_restore_mem_offset)
-     s_sub_u32	       s_restore_mem_offset, s_restore_mem_offset, 256*4
-
-     // the const stride for DWx4 is 4*4 bytes
-     s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF   // reset const stride to 0
-     s_or_b32  s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4  // const stride to 4*4 bytes
-
-     s_mov_b32	       m0, s_restore_alloc_size
-     s_set_gpr_idx_on  m0, 0x8	  // Note.. This will change m0
-
-L_RESTORE_VGPR_LOOP:
-     buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
-     s_waitcnt vmcnt(0)
-     s_sub_u32	       m0, m0, 4
-     v_mov_b32	       v0, v0	// v[0+m0] = v0
-     v_mov_b32	       v1, v1
-     v_mov_b32	       v2, v2
-     v_mov_b32	       v3, v3
-     s_sub_u32	       s_restore_mem_offset, s_restore_mem_offset, 256*4
-     s_cmp_eq_u32      m0, 0x8000
-     s_cbranch_scc0    L_RESTORE_VGPR_LOOP
-     s_set_gpr_idx_off
-
-     s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF   // reset const stride to 0
-     s_or_b32  s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE  // const stride to 4*4 bytes
-
-else
     // VGPR load using dw burst
     s_mov_b32	    s_restore_mem_offset_save, s_restore_mem_offset	// restore start with v1, v0 will be the last
     s_add_u32	    s_restore_mem_offset, s_restore_mem_offset, 256*4
+if ASIC_TARGET_ARCTURUS
+    s_mov_b32	    s_restore_accvgpr_offset_save, s_restore_accvgpr_offset
+    s_add_u32	    s_restore_accvgpr_offset, s_restore_accvgpr_offset, 256*4
+end
     s_mov_b32	    m0, 4				//VGPR initial index value = 1
     s_set_gpr_idx_on  m0, 0x8			    //M0[7:0] = M0[7:0] and M0[15:12] = 0x8
     s_add_u32	    s_restore_alloc_size, s_restore_alloc_size, 0x8000			    //add 0x8000 since we compare m0 against it later
 
   L_RESTORE_VGPR_LOOP:
-    if(USE_MTBUF_INSTEAD_OF_MUBUF)
-	tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
-    else
+
+if ASIC_TARGET_ARCTURUS
+	buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset slc:1 glc:1
+	buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset slc:1 glc:1 offset:256
+	buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset slc:1 glc:1 offset:256*2
+	buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset slc:1 glc:1 offset:256*3
+	s_add_u32 s_restore_accvgpr_offset, s_restore_accvgpr_offset, 256*4
+	s_waitcnt vmcnt(0)
+
+	for var vgpr = 0; vgpr < 4; ++ vgpr
+		v_accvgpr_write acc[vgpr], v[vgpr]
+	end
+end
+
 	buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1
 	buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256
 	buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2
 	buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3
-    end
     s_waitcnt	    vmcnt(0)								    //ensure data ready
     v_mov_b32	    v0, v0								    //v[0+m0] = v0
     v_mov_b32	    v1, v1
@@ -978,16 +781,22 @@ else
     s_cbranch_scc1  L_RESTORE_VGPR_LOOP							    //VGPR restore (except v0) is complete?
     s_set_gpr_idx_off
 											    /* VGPR restore on v0 */
-    if(USE_MTBUF_INSTEAD_OF_MUBUF)
-	tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
-    else
+if ASIC_TARGET_ARCTURUS
+	buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset_save slc:1 glc:1
+	buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset_save slc:1 glc:1 offset:256
+	buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset_save slc:1 glc:1 offset:256*2
+	buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset_save slc:1 glc:1 offset:256*3
+	s_waitcnt vmcnt(0)
+
+	for var vgpr = 0; vgpr < 4; ++ vgpr
+		v_accvgpr_write acc[vgpr], v[vgpr]
+	end
+end
+
 	buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save    slc:1 glc:1
 	buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save    slc:1 glc:1 offset:256
 	buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save    slc:1 glc:1 offset:256*2
 	buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save    slc:1 glc:1 offset:256*3
-    end
-
-end
 
     /*		restore SGPRs	    */
     //////////////////////////////
@@ -1003,16 +812,8 @@ end
     s_add_u32	    s_restore_alloc_size, s_restore_alloc_size, 1
     s_lshl_b32	    s_restore_alloc_size, s_restore_alloc_size, 4			    //Number of SGPRs = (sgpr_size + 1) * 16   (non-zero value)
 
-    if (SGPR_SAVE_USE_SQC)
 	s_lshl_b32	s_restore_buf_rsrc2,	s_restore_alloc_size, 2			    //NUM_RECORDS in bytes
-    else
-	s_lshl_b32	s_restore_buf_rsrc2,	s_restore_alloc_size, 8			    //NUM_RECORDS in bytes (64 threads)
-    end
-    if (SWIZZLE_EN)
-	s_add_u32	s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0			    //FIXME need to use swizzle to enable bounds checking?
-    else
 	s_mov_b32	s_restore_buf_rsrc2,  0x1000000					    //NUM_RECORDS in bytes
-    end
 
     s_mov_b32 m0, s_restore_alloc_size
 
@@ -1040,11 +841,6 @@ end
   L_RESTORE_HWREG:
 
 
-if G8SR_DEBUG_TIMESTAMP
-      s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo
-      s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi
-end
-
     // HWREG SR memory offset : size(VGPR)+size(SGPR)
     get_vgpr_size_bytes(s_restore_mem_offset)
     get_sgpr_size_bytes(s_restore_tmp)
@@ -1052,11 +848,7 @@ end
 
 
     s_mov_b32	    s_restore_buf_rsrc2, 0x4						    //NUM_RECORDS   in bytes
-    if (SWIZZLE_EN)
-	s_add_u32	s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0			    //FIXME need to use swizzle to enable bounds checking?
-    else
 	s_mov_b32	s_restore_buf_rsrc2,  0x1000000					    //NUM_RECORDS in bytes
-    end
 
     read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset)		    //M0
     read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset)		//PC
@@ -1071,16 +863,6 @@ end
 
     s_waitcnt	    lgkmcnt(0)											    //from now on, it is safe to restore STATUS and IB_STS
 
-    //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise:
-    if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
-	s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8		 //pc[31:0]+8	  //two back-to-back s_trap are used (first for save and second for restore)
-	s_addc_u32  s_restore_pc_hi, s_restore_pc_hi, 0x0	 //carry bit over
-    end
-    if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL))
-	s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4		 //pc[31:0]+4	  // save is hack through s_trap but restore is normal
-	s_addc_u32  s_restore_pc_hi, s_restore_pc_hi, 0x0	 //carry bit over
-    end
-
     s_mov_b32	    m0,		s_restore_m0
     s_mov_b32	    exec_lo,	s_restore_exec_lo
     s_mov_b32	    exec_hi,	s_restore_exec_hi
@@ -1093,7 +875,7 @@ end
     //s_setreg_b32  hwreg(HW_REG_TRAPSTS),  s_restore_trapsts	   //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore
     s_setreg_b32    hwreg(HW_REG_MODE),	    s_restore_mode
 
-    // Restore trap temporaries 6-11, 13-15 initialized by SPI debug dispatch logic
+    // Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic
     // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40
     get_vgpr_size_bytes(s_restore_ttmps_lo)
     get_sgpr_size_bytes(s_restore_ttmps_hi)
@@ -1101,10 +883,9 @@ end
     s_add_u32	    s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0
     s_addc_u32	    s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0
     s_and_b32	    s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF
-    s_load_dwordx2  [ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x40 glc:1
-    s_load_dwordx4  [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x48 glc:1
-    s_load_dword    ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x58 glc:1
-    s_load_dwordx2  [ttmp14, ttmp15], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x5C glc:1
+    s_load_dwordx4  [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 glc:1
+    s_load_dwordx4  [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 glc:1
+    s_load_dword    ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 glc:1
     s_waitcnt	    lgkmcnt(0)
 
     //reuse s_restore_m0 as a temp register
@@ -1128,11 +909,6 @@ end
 
     s_barrier							//barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time
 
-if G8SR_DEBUG_TIMESTAMP
-    s_memrealtime s_g8sr_ts_restore_d
-    s_waitcnt lgkmcnt(0)
-end
-
 //  s_rfe_b64 s_restore_pc_lo					//Return to the main shader program and resume execution
     s_rfe_restore_b64  s_restore_pc_lo, s_restore_m0		// s_restore_m0[0] is used to set STATUS.inst_atc
 
@@ -1187,7 +963,39 @@ function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset)
     s_sub_u32	    s_mem_offset, s_mem_offset, 4*16
 end
 
+function check_if_tcp_store_ok
+	// If STATUS.ALLOW_REPLAY=0 and TRAPSTS.XNACK_ERROR=1 then TCP stores will fail.
+	s_and_b32 s_save_tmp, s_save_status, SQ_WAVE_STATUS_ALLOW_REPLAY_MASK
+	s_cbranch_scc1 L_TCP_STORE_CHECK_DONE
+
+	s_getreg_b32 s_save_tmp, hwreg(HW_REG_TRAPSTS)
+	s_andn2_b32 s_save_tmp, SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK, s_save_tmp
+
+L_TCP_STORE_CHECK_DONE:
+end
+
+function write_vgpr_to_mem_with_sqc(v, s_rsrc, s_mem_offset)
+	s_mov_b32 s4, 0
 
+L_WRITE_VGPR_LANE_LOOP:
+	for var lane = 0; lane < 4; ++ lane
+		v_readlane_b32 s[lane], v, s4
+		s_add_u32 s4, s4, 1
+	end
+
+	s_buffer_store_dwordx4 s[0:3], s_rsrc, s_mem_offset glc:1
+	ack_sqc_store_workaround()
+
+	s_add_u32 s_mem_offset, s_mem_offset, 0x10
+	s_cmp_eq_u32 s4, 0x40
+	s_cbranch_scc0 L_WRITE_VGPR_LANE_LOOP
+end
+
+function write_vgprs_to_mem_with_sqc(v, n_vgprs, s_rsrc, s_mem_offset)
+	for var vgpr = 0; vgpr < n_vgprs; ++ vgpr
+		write_vgpr_to_mem_with_sqc(v[vgpr], s_rsrc, s_mem_offset)
+	end
+end
 
 function get_lds_size_bytes(s_lds_size_byte)
     // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW
@@ -1199,6 +1007,10 @@ function get_vgpr_size_bytes(s_vgpr_size_byte)
     s_getreg_b32   s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE)	 //vpgr_size
     s_add_u32	   s_vgpr_size_byte, s_vgpr_size_byte, 1
     s_lshl_b32	   s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4	(non-zero value)   //FIXME for GFX, zero is possible
+
+if ASIC_TARGET_ARCTURUS
+    s_lshl_b32     s_vgpr_size_byte, s_vgpr_size_byte, 1  // Double size for ACC VGPRs
+end
 end
 
 function get_sgpr_size_bytes(s_sgpr_size_byte)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 083bd8114db1..1544007af34a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -49,7 +49,7 @@ static const char kfd_dev_name[] = "kfd";
 static const struct file_operations kfd_fops = {
 	.owner = THIS_MODULE,
 	.unlocked_ioctl = kfd_ioctl,
-	.compat_ioctl = kfd_ioctl,
+	.compat_ioctl = compat_ptr_ioctl,
 	.open = kfd_open,
 	.mmap = kfd_mmap,
 };
@@ -213,6 +213,8 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties,
 		q_properties->type = KFD_QUEUE_TYPE_COMPUTE;
 	else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA)
 		q_properties->type = KFD_QUEUE_TYPE_SDMA;
+	else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA_XGMI)
+		q_properties->type = KFD_QUEUE_TYPE_SDMA_XGMI;
 	else
 		return -ENOTSUPP;
 
@@ -280,7 +282,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
 		goto err_bind_process;
 	}
 
-	pr_debug("Creating queue for PASID %d on gpu 0x%x\n",
+	pr_debug("Creating queue for PASID 0x%x on gpu 0x%x\n",
 			p->pasid,
 			dev->id);
 
@@ -330,7 +332,7 @@ static int kfd_ioctl_destroy_queue(struct file *filp, struct kfd_process *p,
 	int retval;
 	struct kfd_ioctl_destroy_queue_args *args = data;
 
-	pr_debug("Destroying queue id %d for pasid %d\n",
+	pr_debug("Destroying queue id %d for pasid 0x%x\n",
 				args->queue_id,
 				p->pasid);
 
@@ -376,7 +378,7 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p,
 	properties.queue_percent = args->queue_percentage;
 	properties.priority = args->queue_priority;
 
-	pr_debug("Updating queue id %d for pasid %d\n",
+	pr_debug("Updating queue id %d for pasid 0x%x\n",
 			args->queue_id, p->pasid);
 
 	mutex_lock(&p->mutex);
@@ -522,7 +524,7 @@ static int kfd_ioctl_set_trap_handler(struct file *filep,
 	struct kfd_process_device *pdd;
 
 	dev = kfd_device_by_id(args->gpu_id);
-	if (dev == NULL)
+	if (!dev)
 		return -EINVAL;
 
 	mutex_lock(&p->mutex);
@@ -837,7 +839,7 @@ static int kfd_ioctl_get_clock_counters(struct file *filep,
 
 	/* No access to rdtsc. Using raw monotonic time */
 	args->cpu_clock_counter = ktime_get_raw_ns();
-	args->system_clock_counter = ktime_get_boot_ns();
+	args->system_clock_counter = ktime_get_boottime_ns();
 
 	/* Since the counter is in nano-seconds we use 1GHz frequency */
 	args->system_clock_freq = 1000000000;
@@ -853,7 +855,7 @@ static int kfd_ioctl_get_process_apertures(struct file *filp,
 	struct kfd_process_device_apertures *pAperture;
 	struct kfd_process_device *pdd;
 
-	dev_dbg(kfd_device, "get apertures for PASID %d", p->pasid);
+	dev_dbg(kfd_device, "get apertures for PASID 0x%x", p->pasid);
 
 	args->num_of_nodes = 0;
 
@@ -911,7 +913,7 @@ static int kfd_ioctl_get_process_apertures_new(struct file *filp,
 	uint32_t nodes = 0;
 	int ret;
 
-	dev_dbg(kfd_device, "get apertures for PASID %d", p->pasid);
+	dev_dbg(kfd_device, "get apertures for PASID 0x%x", p->pasid);
 
 	if (args->num_of_nodes == 0) {
 		/* Return number of nodes, so that user space can alloacate
@@ -1126,7 +1128,7 @@ static int kfd_ioctl_set_scratch_backing_va(struct file *filep,
 	mutex_unlock(&p->mutex);
 
 	if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS &&
-	    pdd->qpd.vmid != 0)
+	    pdd->qpd.vmid != 0 && dev->kfd2kgd->set_scratch_backing_va)
 		dev->kfd2kgd->set_scratch_backing_va(
 			dev->kgd, args->va_addr, pdd->qpd.vmid);
 
@@ -1272,6 +1274,12 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
 		if (args->size != kfd_doorbell_process_slice(dev))
 			return -EINVAL;
 		offset = kfd_get_process_doorbells(dev, p);
+	} else if (flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) {
+		if (args->size != PAGE_SIZE)
+			return -EINVAL;
+		offset = amdgpu_amdkfd_get_mmio_remap_phys_addr(dev->kgd);
+		if (!offset)
+			return -ENOMEM;
 	}
 
 	mutex_lock(&p->mutex);
@@ -1301,6 +1309,14 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
 	args->handle = MAKE_HANDLE(args->gpu_id, idr_handle);
 	args->mmap_offset = offset;
 
+	/* MMIO is mapped through kfd device
+	 * Generate a kfd mmap offset
+	 */
+	if (flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) {
+		args->mmap_offset = KFD_MMAP_TYPE_MMIO | KFD_MMAP_GPU_ID(args->gpu_id);
+		args->mmap_offset <<= PAGE_SHIFT;
+	}
+
 	return 0;
 
 err_free:
@@ -1785,7 +1801,7 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
 	} else
 		goto err_i1;
 
-	dev_dbg(kfd_device, "ioctl cmd 0x%x (#%d), arg 0x%lx\n", cmd, nr, arg);
+	dev_dbg(kfd_device, "ioctl cmd 0x%x (#0x%x), arg 0x%lx\n", cmd, nr, arg);
 
 	process = kfd_get_process(current);
 	if (IS_ERR(process)) {
@@ -1840,11 +1856,45 @@ err_i1:
 		kfree(kdata);
 
 	if (retcode)
-		dev_dbg(kfd_device, "ret = %d\n", retcode);
+		dev_dbg(kfd_device, "ioctl cmd (#0x%x), arg 0x%lx, ret = %d\n",
+				nr, arg, retcode);
 
 	return retcode;
 }
 
+static int kfd_mmio_mmap(struct kfd_dev *dev, struct kfd_process *process,
+		      struct vm_area_struct *vma)
+{
+	phys_addr_t address;
+	int ret;
+
+	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+		return -EINVAL;
+
+	address = amdgpu_amdkfd_get_mmio_remap_phys_addr(dev->kgd);
+
+	vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE |
+				VM_DONTDUMP | VM_PFNMAP;
+
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+	pr_debug("pasid 0x%x mapping mmio page\n"
+		 "     target user address == 0x%08llX\n"
+		 "     physical address    == 0x%08llX\n"
+		 "     vm_flags            == 0x%04lX\n"
+		 "     size                == 0x%04lX\n",
+		 process->pasid, (unsigned long long) vma->vm_start,
+		 address, vma->vm_flags, PAGE_SIZE);
+
+	ret = io_remap_pfn_range(vma,
+				vma->vm_start,
+				address >> PAGE_SHIFT,
+				PAGE_SIZE,
+				vma->vm_page_prot);
+	return ret;
+}
+
+
 static int kfd_mmap(struct file *filp, struct vm_area_struct *vma)
 {
 	struct kfd_process *process;
@@ -1875,6 +1925,10 @@ static int kfd_mmap(struct file *filp, struct vm_area_struct *vma)
 		if (!dev)
 			return -ENODEV;
 		return kfd_reserved_mem_mmap(dev, process, vma);
+	case KFD_MMAP_TYPE_MMIO:
+		if (!dev)
+			return -ENODEV;
+		return kfd_mmio_mmap(dev, process, vma);
 	}
 
 	return -EFAULT;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
index 2e7c44955f43..de9f68d5c312 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
@@ -134,9 +134,13 @@ static struct kfd_gpu_cache_info carrizo_cache_info[] = {
 #define polaris10_cache_info carrizo_cache_info
 #define polaris11_cache_info carrizo_cache_info
 #define polaris12_cache_info carrizo_cache_info
+#define vegam_cache_info carrizo_cache_info
 /* TODO - check & update Vega10 cache details */
 #define vega10_cache_info carrizo_cache_info
 #define raven_cache_info carrizo_cache_info
+#define renoir_cache_info carrizo_cache_info
+/* TODO - check & update Navi10 cache details */
+#define navi10_cache_info carrizo_cache_info
 
 static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev,
 		struct crat_subtype_computeunit *cu)
@@ -372,7 +376,7 @@ static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink,
 			if (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS)
 				props->weight = 20;
 			else if (props->iolink_type == CRAT_IOLINK_TYPE_XGMI)
-				props->weight = 15;
+				props->weight = 15 * iolink->num_hops_xgmi;
 			else
 				props->weight = node_distance(id_from, id_to);
 
@@ -652,9 +656,14 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
 		pcache_info = polaris12_cache_info;
 		num_of_cache_types = ARRAY_SIZE(polaris12_cache_info);
 		break;
+	case CHIP_VEGAM:
+		pcache_info = vegam_cache_info;
+		num_of_cache_types = ARRAY_SIZE(vegam_cache_info);
+		break;
 	case CHIP_VEGA10:
 	case CHIP_VEGA12:
 	case CHIP_VEGA20:
+	case CHIP_ARCTURUS:
 		pcache_info = vega10_cache_info;
 		num_of_cache_types = ARRAY_SIZE(vega10_cache_info);
 		break;
@@ -662,6 +671,16 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
 		pcache_info = raven_cache_info;
 		num_of_cache_types = ARRAY_SIZE(raven_cache_info);
 		break;
+	case CHIP_RENOIR:
+		pcache_info = renoir_cache_info;
+		num_of_cache_types = ARRAY_SIZE(renoir_cache_info);
+		break;
+	case CHIP_NAVI10:
+	case CHIP_NAVI12:
+	case CHIP_NAVI14:
+		pcache_info = navi10_cache_info;
+		num_of_cache_types = ARRAY_SIZE(navi10_cache_info);
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -691,7 +710,7 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
 						pcache_info,
 						cu_info,
 						mem_available,
-						cu_info->cu_bitmap[i][j],
+						cu_info->cu_bitmap[i % 4][j + i / 4],
 						ct,
 						cu_processor_id,
 						k);
@@ -777,7 +796,7 @@ int kfd_create_crat_image_acpi(void **crat_image, size_t *size)
  * is put in the code to ensure we don't overwrite.
  */
 #define VCRAT_SIZE_FOR_CPU	(2 * PAGE_SIZE)
-#define VCRAT_SIZE_FOR_GPU	(3 * PAGE_SIZE)
+#define VCRAT_SIZE_FOR_GPU	(4 * PAGE_SIZE)
 
 /* kfd_fill_cu_for_cpu - Fill in Compute info for the given CPU NUMA node
  *
@@ -1092,6 +1111,7 @@ static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size,
 
 static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size,
 			struct kfd_dev *kdev,
+			struct kfd_dev *peer_kdev,
 			struct crat_subtype_iolink *sub_type_hdr,
 			uint32_t proximity_domain_from,
 			uint32_t proximity_domain_to)
@@ -1110,6 +1130,8 @@ static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size,
 	sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;
 	sub_type_hdr->proximity_domain_from = proximity_domain_from;
 	sub_type_hdr->proximity_domain_to = proximity_domain_to;
+	sub_type_hdr->num_hops_xgmi =
+		amdgpu_amdkfd_get_xgmi_hops_count(kdev->kgd, peer_kdev->kgd);
 	return 0;
 }
 
@@ -1287,7 +1309,7 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
 				(char *)sub_type_hdr +
 				sizeof(struct crat_subtype_iolink));
 			ret = kfd_fill_gpu_xgmi_link_to_gpu(
-				&avail_size, kdev,
+				&avail_size, kdev, peer_dev->gpu,
 				(struct crat_subtype_iolink *)sub_type_hdr,
 				proximity_domain, nid);
 			if (ret < 0)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
index 7c3f192fe25f..d54ceebd346b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
@@ -274,7 +274,8 @@ struct crat_subtype_iolink {
 	uint32_t	minimum_bandwidth_mbs;
 	uint32_t	maximum_bandwidth_mbs;
 	uint32_t	recommended_transfer_size;
-	uint8_t		reserved2[CRAT_IOLINK_RESERVED_LENGTH];
+	uint8_t		reserved2[CRAT_IOLINK_RESERVED_LENGTH - 1];
+	uint8_t		num_hops_xgmi;
 };
 
 /*
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
index a3441b0e385b..d59f2cd056c6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
@@ -761,6 +761,7 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p)
 {
 	int status = 0;
 	unsigned int vmid;
+	uint16_t queried_pasid;
 	union SQ_CMD_BITS reg_sq_cmd;
 	union GRBM_GFX_INDEX_BITS reg_gfx_index;
 	struct kfd_process_device *pdd;
@@ -782,19 +783,18 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p)
 	 */
 
 	for (vmid = first_vmid_to_scan; vmid <= last_vmid_to_scan; vmid++) {
-		if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_valid
-				(dev->kgd, vmid)) {
-			if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_pasid
-					(dev->kgd, vmid) == p->pasid) {
-				pr_debug("Killing wave fronts of vmid %d and pasid %d\n",
-						vmid, p->pasid);
-				break;
-			}
+		status = dev->kfd2kgd->get_atc_vmid_pasid_mapping_info
+				(dev->kgd, vmid, &queried_pasid);
+
+		if (status && queried_pasid == p->pasid) {
+			pr_debug("Killing wave fronts of vmid %d and pasid 0x%x\n",
+					vmid, p->pasid);
+			break;
 		}
 	}
 
 	if (vmid > last_vmid_to_scan) {
-		pr_err("Didn't find vmid for pasid %d\n", p->pasid);
+		pr_err("Didn't find vmid for pasid 0x%x\n", p->pasid);
 		return -EFAULT;
 	}
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c
index 9d4af961c5d1..9bfa50633654 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgmgr.c
@@ -96,7 +96,7 @@ bool kfd_dbgmgr_create(struct kfd_dbgmgr **ppmgr, struct kfd_dev *pdev)
 long kfd_dbgmgr_register(struct kfd_dbgmgr *pmgr, struct kfd_process *p)
 {
 	if (pmgr->pasid != 0) {
-		pr_debug("H/W debugger is already active using pasid %d\n",
+		pr_debug("H/W debugger is already active using pasid 0x%x\n",
 				pmgr->pasid);
 		return -EBUSY;
 	}
@@ -117,7 +117,7 @@ long kfd_dbgmgr_unregister(struct kfd_dbgmgr *pmgr, struct kfd_process *p)
 {
 	/* Is the requests coming from the already registered process? */
 	if (pmgr->pasid != p->pasid) {
-		pr_debug("H/W debugger is not registered by calling pasid %d\n",
+		pr_debug("H/W debugger is not registered by calling pasid 0x%x\n",
 				p->pasid);
 		return -EINVAL;
 	}
@@ -134,7 +134,7 @@ long kfd_dbgmgr_wave_control(struct kfd_dbgmgr *pmgr,
 {
 	/* Is the requests coming from the already registered process? */
 	if (pmgr->pasid != wac_info->process->pasid) {
-		pr_debug("H/W debugger support was not registered for requester pasid %d\n",
+		pr_debug("H/W debugger support was not registered for requester pasid 0x%x\n",
 				wac_info->process->pasid);
 		return -EINVAL;
 	}
@@ -147,7 +147,7 @@ long kfd_dbgmgr_address_watch(struct kfd_dbgmgr *pmgr,
 {
 	/* Is the requests coming from the already registered process? */
 	if (pmgr->pasid != adw_info->process->pasid) {
-		pr_debug("H/W debugger support was not registered for requester pasid %d\n",
+		pr_debug("H/W debugger support was not registered for requester pasid 0x%x\n",
 				adw_info->process->pasid);
 		return -EINVAL;
 	}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
index ab37d36d9cd6..15c523027285 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c
@@ -85,36 +85,16 @@ static const struct file_operations kfd_debugfs_hang_hws_fops = {
 
 void kfd_debugfs_init(void)
 {
-	struct dentry *ent;
-
 	debugfs_root = debugfs_create_dir("kfd", NULL);
-	if (!debugfs_root || debugfs_root == ERR_PTR(-ENODEV)) {
-		pr_warn("Failed to create kfd debugfs dir\n");
-		return;
-	}
-
-	ent = debugfs_create_file("mqds", S_IFREG | 0444, debugfs_root,
-				  kfd_debugfs_mqds_by_process,
-				  &kfd_debugfs_fops);
-	if (!ent)
-		pr_warn("Failed to create mqds in kfd debugfs\n");
-
-	ent = debugfs_create_file("hqds", S_IFREG | 0444, debugfs_root,
-				  kfd_debugfs_hqds_by_device,
-				  &kfd_debugfs_fops);
-	if (!ent)
-		pr_warn("Failed to create hqds in kfd debugfs\n");
-
-	ent = debugfs_create_file("rls", S_IFREG | 0444, debugfs_root,
-				  kfd_debugfs_rls_by_device,
-				  &kfd_debugfs_fops);
-
-	ent = debugfs_create_file("hang_hws", S_IFREG | 0644, debugfs_root,
-				  NULL,
-				  &kfd_debugfs_hang_hws_fops);
 
-	if (!ent)
-		pr_warn("Failed to create rls in kfd debugfs\n");
+	debugfs_create_file("mqds", S_IFREG | 0444, debugfs_root,
+			    kfd_debugfs_mqds_by_process, &kfd_debugfs_fops);
+	debugfs_create_file("hqds", S_IFREG | 0444, debugfs_root,
+			    kfd_debugfs_hqds_by_device, &kfd_debugfs_fops);
+	debugfs_create_file("rls", S_IFREG | 0444, debugfs_root,
+			    kfd_debugfs_rls_by_device, &kfd_debugfs_fops);
+	debugfs_create_file("hang_hws", S_IFREG | 0644, debugfs_root,
+			    NULL, &kfd_debugfs_hang_hws_fops);
 }
 
 void kfd_debugfs_fini(void)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 8be9677c0c07..4fa8834ce7cb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -39,9 +39,45 @@
  */
 static atomic_t kfd_locked = ATOMIC_INIT(0);
 
+#ifdef CONFIG_DRM_AMDGPU_CIK
+extern const struct kfd2kgd_calls gfx_v7_kfd2kgd;
+#endif
+extern const struct kfd2kgd_calls gfx_v8_kfd2kgd;
+extern const struct kfd2kgd_calls gfx_v9_kfd2kgd;
+extern const struct kfd2kgd_calls arcturus_kfd2kgd;
+extern const struct kfd2kgd_calls gfx_v10_kfd2kgd;
+
+static const struct kfd2kgd_calls *kfd2kgd_funcs[] = {
+#ifdef KFD_SUPPORT_IOMMU_V2
+#ifdef CONFIG_DRM_AMDGPU_CIK
+	[CHIP_KAVERI] = &gfx_v7_kfd2kgd,
+#endif
+	[CHIP_CARRIZO] = &gfx_v8_kfd2kgd,
+	[CHIP_RAVEN] = &gfx_v9_kfd2kgd,
+#endif
+#ifdef CONFIG_DRM_AMDGPU_CIK
+	[CHIP_HAWAII] = &gfx_v7_kfd2kgd,
+#endif
+	[CHIP_TONGA] = &gfx_v8_kfd2kgd,
+	[CHIP_FIJI] = &gfx_v8_kfd2kgd,
+	[CHIP_POLARIS10] = &gfx_v8_kfd2kgd,
+	[CHIP_POLARIS11] = &gfx_v8_kfd2kgd,
+	[CHIP_POLARIS12] = &gfx_v8_kfd2kgd,
+	[CHIP_VEGAM] = &gfx_v8_kfd2kgd,
+	[CHIP_VEGA10] = &gfx_v9_kfd2kgd,
+	[CHIP_VEGA12] = &gfx_v9_kfd2kgd,
+	[CHIP_VEGA20] = &gfx_v9_kfd2kgd,
+	[CHIP_RENOIR] = &gfx_v9_kfd2kgd,
+	[CHIP_ARCTURUS] = &arcturus_kfd2kgd,
+	[CHIP_NAVI10] = &gfx_v10_kfd2kgd,
+	[CHIP_NAVI12] = &gfx_v10_kfd2kgd,
+	[CHIP_NAVI14] = &gfx_v10_kfd2kgd,
+};
+
 #ifdef KFD_SUPPORT_IOMMU_V2
 static const struct kfd_device_info kaveri_device_info = {
 	.asic_family = CHIP_KAVERI,
+	.asic_name = "kaveri",
 	.max_pasid_bits = 16,
 	/* max num of queues for KV.TODO should be a dynamic value */
 	.max_no_of_hqd	= 24,
@@ -54,11 +90,13 @@ static const struct kfd_device_info kaveri_device_info = {
 	.needs_iommu_device = true,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
 
 static const struct kfd_device_info carrizo_device_info = {
 	.asic_family = CHIP_CARRIZO,
+	.asic_name = "carrizo",
 	.max_pasid_bits = 16,
 	/* max num of queues for CZ.TODO should be a dynamic value */
 	.max_no_of_hqd	= 24,
@@ -71,11 +109,13 @@ static const struct kfd_device_info carrizo_device_info = {
 	.needs_iommu_device = true,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
 
 static const struct kfd_device_info raven_device_info = {
 	.asic_family = CHIP_RAVEN,
+	.asic_name = "raven",
 	.max_pasid_bits = 16,
 	.max_no_of_hqd  = 24,
 	.doorbell_size  = 8,
@@ -87,12 +127,14 @@ static const struct kfd_device_info raven_device_info = {
 	.needs_iommu_device = true,
 	.needs_pci_atomics = true,
 	.num_sdma_engines = 1,
+	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
 #endif
 
 static const struct kfd_device_info hawaii_device_info = {
 	.asic_family = CHIP_HAWAII,
+	.asic_name = "hawaii",
 	.max_pasid_bits = 16,
 	/* max num of queues for KV.TODO should be a dynamic value */
 	.max_no_of_hqd	= 24,
@@ -105,11 +147,13 @@ static const struct kfd_device_info hawaii_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
 
 static const struct kfd_device_info tonga_device_info = {
 	.asic_family = CHIP_TONGA,
+	.asic_name = "tonga",
 	.max_pasid_bits = 16,
 	.max_no_of_hqd  = 24,
 	.doorbell_size  = 4,
@@ -121,11 +165,13 @@ static const struct kfd_device_info tonga_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = true,
 	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
 
 static const struct kfd_device_info fiji_device_info = {
 	.asic_family = CHIP_FIJI,
+	.asic_name = "fiji",
 	.max_pasid_bits = 16,
 	.max_no_of_hqd  = 24,
 	.doorbell_size  = 4,
@@ -137,11 +183,13 @@ static const struct kfd_device_info fiji_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = true,
 	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
 
 static const struct kfd_device_info fiji_vf_device_info = {
 	.asic_family = CHIP_FIJI,
+	.asic_name = "fiji",
 	.max_pasid_bits = 16,
 	.max_no_of_hqd  = 24,
 	.doorbell_size  = 4,
@@ -153,12 +201,14 @@ static const struct kfd_device_info fiji_vf_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
 
 
 static const struct kfd_device_info polaris10_device_info = {
 	.asic_family = CHIP_POLARIS10,
+	.asic_name = "polaris10",
 	.max_pasid_bits = 16,
 	.max_no_of_hqd  = 24,
 	.doorbell_size  = 4,
@@ -170,11 +220,13 @@ static const struct kfd_device_info polaris10_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = true,
 	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
 
 static const struct kfd_device_info polaris10_vf_device_info = {
 	.asic_family = CHIP_POLARIS10,
+	.asic_name = "polaris10",
 	.max_pasid_bits = 16,
 	.max_no_of_hqd  = 24,
 	.doorbell_size  = 4,
@@ -186,11 +238,13 @@ static const struct kfd_device_info polaris10_vf_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
 
 static const struct kfd_device_info polaris11_device_info = {
 	.asic_family = CHIP_POLARIS11,
+	.asic_name = "polaris11",
 	.max_pasid_bits = 16,
 	.max_no_of_hqd  = 24,
 	.doorbell_size  = 4,
@@ -202,11 +256,31 @@ static const struct kfd_device_info polaris11_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = true,
 	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
 
 static const struct kfd_device_info polaris12_device_info = {
 	.asic_family = CHIP_POLARIS12,
+	.asic_name = "polaris12",
+	.max_pasid_bits = 16,
+	.max_no_of_hqd  = 24,
+	.doorbell_size  = 4,
+	.ih_ring_entry_size = 4 * sizeof(uint32_t),
+	.event_interrupt_class = &event_interrupt_class_cik,
+	.num_of_watch_points = 4,
+	.mqd_size_aligned = MQD_SIZE_ALIGNED,
+	.supports_cwsr = true,
+	.needs_iommu_device = false,
+	.needs_pci_atomics = true,
+	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
+	.num_sdma_queues_per_engine = 2,
+};
+
+static const struct kfd_device_info vegam_device_info = {
+	.asic_family = CHIP_VEGAM,
+	.asic_name = "vegam",
 	.max_pasid_bits = 16,
 	.max_no_of_hqd  = 24,
 	.doorbell_size  = 4,
@@ -218,11 +292,13 @@ static const struct kfd_device_info polaris12_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = true,
 	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
 
 static const struct kfd_device_info vega10_device_info = {
 	.asic_family = CHIP_VEGA10,
+	.asic_name = "vega10",
 	.max_pasid_bits = 16,
 	.max_no_of_hqd  = 24,
 	.doorbell_size  = 8,
@@ -234,11 +310,13 @@ static const struct kfd_device_info vega10_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
 
 static const struct kfd_device_info vega10_vf_device_info = {
 	.asic_family = CHIP_VEGA10,
+	.asic_name = "vega10",
 	.max_pasid_bits = 16,
 	.max_no_of_hqd  = 24,
 	.doorbell_size  = 8,
@@ -250,11 +328,13 @@ static const struct kfd_device_info vega10_vf_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
 
 static const struct kfd_device_info vega12_device_info = {
 	.asic_family = CHIP_VEGA12,
+	.asic_name = "vega12",
 	.max_pasid_bits = 16,
 	.max_no_of_hqd  = 24,
 	.doorbell_size  = 8,
@@ -266,11 +346,13 @@ static const struct kfd_device_info vega12_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 2,
 };
 
 static const struct kfd_device_info vega20_device_info = {
 	.asic_family = CHIP_VEGA20,
+	.asic_name = "vega20",
 	.max_pasid_bits = 16,
 	.max_no_of_hqd	= 24,
 	.doorbell_size	= 8,
@@ -282,122 +364,122 @@ static const struct kfd_device_info vega20_device_info = {
 	.needs_iommu_device = false,
 	.needs_pci_atomics = false,
 	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
 	.num_sdma_queues_per_engine = 8,
 };
 
-struct kfd_deviceid {
-	unsigned short did;
-	const struct kfd_device_info *device_info;
+static const struct kfd_device_info arcturus_device_info = {
+	.asic_family = CHIP_ARCTURUS,
+	.asic_name = "arcturus",
+	.max_pasid_bits = 16,
+	.max_no_of_hqd	= 24,
+	.doorbell_size	= 8,
+	.ih_ring_entry_size = 8 * sizeof(uint32_t),
+	.event_interrupt_class = &event_interrupt_class_v9,
+	.num_of_watch_points = 4,
+	.mqd_size_aligned = MQD_SIZE_ALIGNED,
+	.supports_cwsr = true,
+	.needs_iommu_device = false,
+	.needs_pci_atomics = false,
+	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 6,
+	.num_sdma_queues_per_engine = 8,
+};
+
+static const struct kfd_device_info renoir_device_info = {
+	.asic_family = CHIP_RENOIR,
+	.asic_name = "renoir",
+	.max_pasid_bits = 16,
+	.max_no_of_hqd  = 24,
+	.doorbell_size  = 8,
+	.ih_ring_entry_size = 8 * sizeof(uint32_t),
+	.event_interrupt_class = &event_interrupt_class_v9,
+	.num_of_watch_points = 4,
+	.mqd_size_aligned = MQD_SIZE_ALIGNED,
+	.supports_cwsr = true,
+	.needs_iommu_device = false,
+	.needs_pci_atomics = false,
+	.num_sdma_engines = 1,
+	.num_xgmi_sdma_engines = 0,
+	.num_sdma_queues_per_engine = 2,
+};
+
+static const struct kfd_device_info navi10_device_info = {
+	.asic_family = CHIP_NAVI10,
+	.asic_name = "navi10",
+	.max_pasid_bits = 16,
+	.max_no_of_hqd  = 24,
+	.doorbell_size  = 8,
+	.ih_ring_entry_size = 8 * sizeof(uint32_t),
+	.event_interrupt_class = &event_interrupt_class_v9,
+	.num_of_watch_points = 4,
+	.mqd_size_aligned = MQD_SIZE_ALIGNED,
+	.needs_iommu_device = false,
+	.supports_cwsr = true,
+	.needs_pci_atomics = false,
+	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
+	.num_sdma_queues_per_engine = 8,
 };
 
-static const struct kfd_deviceid supported_devices[] = {
+static const struct kfd_device_info navi12_device_info = {
+	.asic_family = CHIP_NAVI12,
+	.asic_name = "navi12",
+	.max_pasid_bits = 16,
+	.max_no_of_hqd  = 24,
+	.doorbell_size  = 8,
+	.ih_ring_entry_size = 8 * sizeof(uint32_t),
+	.event_interrupt_class = &event_interrupt_class_v9,
+	.num_of_watch_points = 4,
+	.mqd_size_aligned = MQD_SIZE_ALIGNED,
+	.needs_iommu_device = false,
+	.supports_cwsr = true,
+	.needs_pci_atomics = false,
+	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
+	.num_sdma_queues_per_engine = 8,
+};
+
+static const struct kfd_device_info navi14_device_info = {
+	.asic_family = CHIP_NAVI14,
+	.asic_name = "navi14",
+	.max_pasid_bits = 16,
+	.max_no_of_hqd  = 24,
+	.doorbell_size  = 8,
+	.ih_ring_entry_size = 8 * sizeof(uint32_t),
+	.event_interrupt_class = &event_interrupt_class_v9,
+	.num_of_watch_points = 4,
+	.mqd_size_aligned = MQD_SIZE_ALIGNED,
+	.needs_iommu_device = false,
+	.supports_cwsr = true,
+	.needs_pci_atomics = false,
+	.num_sdma_engines = 2,
+	.num_xgmi_sdma_engines = 0,
+	.num_sdma_queues_per_engine = 8,
+};
+
+/* For each entry, [0] is regular and [1] is virtualisation device. */
+static const struct kfd_device_info *kfd_supported_devices[][2] = {
 #ifdef KFD_SUPPORT_IOMMU_V2
-	{ 0x1304, &kaveri_device_info },	/* Kaveri */
-	{ 0x1305, &kaveri_device_info },	/* Kaveri */
-	{ 0x1306, &kaveri_device_info },	/* Kaveri */
-	{ 0x1307, &kaveri_device_info },	/* Kaveri */
-	{ 0x1309, &kaveri_device_info },	/* Kaveri */
-	{ 0x130A, &kaveri_device_info },	/* Kaveri */
-	{ 0x130B, &kaveri_device_info },	/* Kaveri */
-	{ 0x130C, &kaveri_device_info },	/* Kaveri */
-	{ 0x130D, &kaveri_device_info },	/* Kaveri */
-	{ 0x130E, &kaveri_device_info },	/* Kaveri */
-	{ 0x130F, &kaveri_device_info },	/* Kaveri */
-	{ 0x1310, &kaveri_device_info },	/* Kaveri */
-	{ 0x1311, &kaveri_device_info },	/* Kaveri */
-	{ 0x1312, &kaveri_device_info },	/* Kaveri */
-	{ 0x1313, &kaveri_device_info },	/* Kaveri */
-	{ 0x1315, &kaveri_device_info },	/* Kaveri */
-	{ 0x1316, &kaveri_device_info },	/* Kaveri */
-	{ 0x1317, &kaveri_device_info },	/* Kaveri */
-	{ 0x1318, &kaveri_device_info },	/* Kaveri */
-	{ 0x131B, &kaveri_device_info },	/* Kaveri */
-	{ 0x131C, &kaveri_device_info },	/* Kaveri */
-	{ 0x131D, &kaveri_device_info },	/* Kaveri */
-	{ 0x9870, &carrizo_device_info },	/* Carrizo */
-	{ 0x9874, &carrizo_device_info },	/* Carrizo */
-	{ 0x9875, &carrizo_device_info },	/* Carrizo */
-	{ 0x9876, &carrizo_device_info },	/* Carrizo */
-	{ 0x9877, &carrizo_device_info },	/* Carrizo */
-	{ 0x15DD, &raven_device_info },		/* Raven */
+	[CHIP_KAVERI] = {&kaveri_device_info, NULL},
+	[CHIP_CARRIZO] = {&carrizo_device_info, NULL},
+	[CHIP_RAVEN] = {&raven_device_info, NULL},
 #endif
-	{ 0x67A0, &hawaii_device_info },	/* Hawaii */
-	{ 0x67A1, &hawaii_device_info },	/* Hawaii */
-	{ 0x67A2, &hawaii_device_info },	/* Hawaii */
-	{ 0x67A8, &hawaii_device_info },	/* Hawaii */
-	{ 0x67A9, &hawaii_device_info },	/* Hawaii */
-	{ 0x67AA, &hawaii_device_info },	/* Hawaii */
-	{ 0x67B0, &hawaii_device_info },	/* Hawaii */
-	{ 0x67B1, &hawaii_device_info },	/* Hawaii */
-	{ 0x67B8, &hawaii_device_info },	/* Hawaii */
-	{ 0x67B9, &hawaii_device_info },	/* Hawaii */
-	{ 0x67BA, &hawaii_device_info },	/* Hawaii */
-	{ 0x67BE, &hawaii_device_info },	/* Hawaii */
-	{ 0x6920, &tonga_device_info },		/* Tonga */
-	{ 0x6921, &tonga_device_info },		/* Tonga */
-	{ 0x6928, &tonga_device_info },		/* Tonga */
-	{ 0x6929, &tonga_device_info },		/* Tonga */
-	{ 0x692B, &tonga_device_info },		/* Tonga */
-	{ 0x6938, &tonga_device_info },		/* Tonga */
-	{ 0x6939, &tonga_device_info },		/* Tonga */
-	{ 0x7300, &fiji_device_info },		/* Fiji */
-	{ 0x730F, &fiji_vf_device_info },	/* Fiji vf*/
-	{ 0x67C0, &polaris10_device_info },	/* Polaris10 */
-	{ 0x67C1, &polaris10_device_info },	/* Polaris10 */
-	{ 0x67C2, &polaris10_device_info },	/* Polaris10 */
-	{ 0x67C4, &polaris10_device_info },	/* Polaris10 */
-	{ 0x67C7, &polaris10_device_info },	/* Polaris10 */
-	{ 0x67C8, &polaris10_device_info },	/* Polaris10 */
-	{ 0x67C9, &polaris10_device_info },	/* Polaris10 */
-	{ 0x67CA, &polaris10_device_info },	/* Polaris10 */
-	{ 0x67CC, &polaris10_device_info },	/* Polaris10 */
-	{ 0x67CF, &polaris10_device_info },	/* Polaris10 */
-	{ 0x67D0, &polaris10_vf_device_info },	/* Polaris10 vf*/
-	{ 0x67DF, &polaris10_device_info },	/* Polaris10 */
-	{ 0x67E0, &polaris11_device_info },	/* Polaris11 */
-	{ 0x67E1, &polaris11_device_info },	/* Polaris11 */
-	{ 0x67E3, &polaris11_device_info },	/* Polaris11 */
-	{ 0x67E7, &polaris11_device_info },	/* Polaris11 */
-	{ 0x67E8, &polaris11_device_info },	/* Polaris11 */
-	{ 0x67E9, &polaris11_device_info },	/* Polaris11 */
-	{ 0x67EB, &polaris11_device_info },	/* Polaris11 */
-	{ 0x67EF, &polaris11_device_info },	/* Polaris11 */
-	{ 0x67FF, &polaris11_device_info },	/* Polaris11 */
-	{ 0x6980, &polaris12_device_info },	/* Polaris12 */
-	{ 0x6981, &polaris12_device_info },	/* Polaris12 */
-	{ 0x6985, &polaris12_device_info },	/* Polaris12 */
-	{ 0x6986, &polaris12_device_info },	/* Polaris12 */
-	{ 0x6987, &polaris12_device_info },	/* Polaris12 */
-	{ 0x6995, &polaris12_device_info },	/* Polaris12 */
-	{ 0x6997, &polaris12_device_info },	/* Polaris12 */
-	{ 0x699F, &polaris12_device_info },	/* Polaris12 */
-	{ 0x6860, &vega10_device_info },	/* Vega10 */
-	{ 0x6861, &vega10_device_info },	/* Vega10 */
-	{ 0x6862, &vega10_device_info },	/* Vega10 */
-	{ 0x6863, &vega10_device_info },	/* Vega10 */
-	{ 0x6864, &vega10_device_info },	/* Vega10 */
-	{ 0x6867, &vega10_device_info },	/* Vega10 */
-	{ 0x6868, &vega10_device_info },	/* Vega10 */
-	{ 0x6869, &vega10_device_info },	/* Vega10 */
-	{ 0x686A, &vega10_device_info },	/* Vega10 */
-	{ 0x686B, &vega10_device_info },	/* Vega10 */
-	{ 0x686C, &vega10_vf_device_info },	/* Vega10  vf*/
-	{ 0x686D, &vega10_device_info },	/* Vega10 */
-	{ 0x686E, &vega10_device_info },	/* Vega10 */
-	{ 0x686F, &vega10_device_info },	/* Vega10 */
-	{ 0x687F, &vega10_device_info },	/* Vega10 */
-	{ 0x69A0, &vega12_device_info },	/* Vega12 */
-	{ 0x69A1, &vega12_device_info },	/* Vega12 */
-	{ 0x69A2, &vega12_device_info },	/* Vega12 */
-	{ 0x69A3, &vega12_device_info },	/* Vega12 */
-	{ 0x69AF, &vega12_device_info },	/* Vega12 */
-	{ 0x66a0, &vega20_device_info },	/* Vega20 */
-	{ 0x66a1, &vega20_device_info },	/* Vega20 */
-	{ 0x66a2, &vega20_device_info },	/* Vega20 */
-	{ 0x66a3, &vega20_device_info },	/* Vega20 */
-	{ 0x66a4, &vega20_device_info },	/* Vega20 */
-	{ 0x66a7, &vega20_device_info },	/* Vega20 */
-	{ 0x66af, &vega20_device_info }		/* Vega20 */
+	[CHIP_HAWAII] = {&hawaii_device_info, NULL},
+	[CHIP_TONGA] = {&tonga_device_info, NULL},
+	[CHIP_FIJI] = {&fiji_device_info, &fiji_vf_device_info},
+	[CHIP_POLARIS10] = {&polaris10_device_info, &polaris10_vf_device_info},
+	[CHIP_POLARIS11] = {&polaris11_device_info, NULL},
+	[CHIP_POLARIS12] = {&polaris12_device_info, NULL},
+	[CHIP_VEGAM] = {&vegam_device_info, NULL},
+	[CHIP_VEGA10] = {&vega10_device_info, &vega10_vf_device_info},
+	[CHIP_VEGA12] = {&vega12_device_info, NULL},
+	[CHIP_VEGA20] = {&vega20_device_info, NULL},
+	[CHIP_RENOIR] = {&renoir_device_info, NULL},
+	[CHIP_ARCTURUS] = {&arcturus_device_info, &arcturus_device_info},
+	[CHIP_NAVI10] = {&navi10_device_info, NULL},
+	[CHIP_NAVI12] = {&navi12_device_info, &navi12_device_info},
+	[CHIP_NAVI14] = {&navi14_device_info, NULL},
 };
 
 static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
@@ -406,33 +488,25 @@ static void kfd_gtt_sa_fini(struct kfd_dev *kfd);
 
 static int kfd_resume(struct kfd_dev *kfd);
 
-static const struct kfd_device_info *lookup_device_info(unsigned short did)
+struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
+	struct pci_dev *pdev, unsigned int asic_type, bool vf)
 {
-	size_t i;
+	struct kfd_dev *kfd;
+	const struct kfd_device_info *device_info;
+	const struct kfd2kgd_calls *f2g;
 
-	for (i = 0; i < ARRAY_SIZE(supported_devices); i++) {
-		if (supported_devices[i].did == did) {
-			WARN_ON(!supported_devices[i].device_info);
-			return supported_devices[i].device_info;
-		}
+	if (asic_type >= sizeof(kfd_supported_devices) / (sizeof(void *) * 2)
+		|| asic_type >= sizeof(kfd2kgd_funcs) / sizeof(void *)) {
+		dev_err(kfd_device, "asic_type %d out of range\n", asic_type);
+		return NULL; /* asic_type out of range */
 	}
 
-	dev_warn(kfd_device, "DID %04x is missing in supported_devices\n",
-		 did);
-
-	return NULL;
-}
-
-struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
-	struct pci_dev *pdev, const struct kfd2kgd_calls *f2g)
-{
-	struct kfd_dev *kfd;
-	int ret;
-	const struct kfd_device_info *device_info =
-					lookup_device_info(pdev->device);
+	device_info = kfd_supported_devices[asic_type][vf];
+	f2g = kfd2kgd_funcs[asic_type];
 
-	if (!device_info) {
-		dev_err(kfd_device, "kgd2kfd_probe failed\n");
+	if (!device_info || !f2g) {
+		dev_err(kfd_device, "%s %s not supported in kfd\n",
+			amdgpu_asic_name[asic_type], vf ? "VF" : "");
 		return NULL;
 	}
 
@@ -444,28 +518,29 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
 	 * 32 and 64-bit requests are possible and must be
 	 * supported.
 	 */
-	ret = pci_enable_atomic_ops_to_root(pdev,
-			PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
-			PCI_EXP_DEVCAP2_ATOMIC_COMP64);
-	if (device_info->needs_pci_atomics && ret < 0) {
+	kfd->pci_atomic_requested = amdgpu_amdkfd_have_atomics_support(kgd);
+	if (device_info->needs_pci_atomics &&
+	    !kfd->pci_atomic_requested) {
 		dev_info(kfd_device,
 			 "skipped device %x:%x, PCI rejects atomics\n",
 			 pdev->vendor, pdev->device);
 		kfree(kfd);
 		return NULL;
-	} else if (!ret)
-		kfd->pci_atomic_requested = true;
+	}
 
 	kfd->kgd = kgd;
 	kfd->device_info = device_info;
 	kfd->pdev = pdev;
 	kfd->init_complete = false;
 	kfd->kfd2kgd = f2g;
+	atomic_set(&kfd->compute_profile, 0);
 
 	mutex_init(&kfd->doorbell_mutex);
 	memset(&kfd->doorbell_available_index, 0,
 		sizeof(kfd->doorbell_available_index));
 
+	atomic_set(&kfd->sram_ecc_flag, 0);
+
 	return kfd;
 }
 
@@ -476,10 +551,18 @@ static void kfd_cwsr_init(struct kfd_dev *kfd)
 			BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE);
 			kfd->cwsr_isa = cwsr_trap_gfx8_hex;
 			kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex);
-		} else {
+		} else if (kfd->device_info->asic_family == CHIP_ARCTURUS) {
+			BUILD_BUG_ON(sizeof(cwsr_trap_arcturus_hex) > PAGE_SIZE);
+			kfd->cwsr_isa = cwsr_trap_arcturus_hex;
+			kfd->cwsr_isa_size = sizeof(cwsr_trap_arcturus_hex);
+		} else if (kfd->device_info->asic_family < CHIP_NAVI10) {
 			BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_hex) > PAGE_SIZE);
 			kfd->cwsr_isa = cwsr_trap_gfx9_hex;
 			kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx9_hex);
+		} else {
+			BUILD_BUG_ON(sizeof(cwsr_trap_gfx10_hex) > PAGE_SIZE);
+			kfd->cwsr_isa = cwsr_trap_gfx10_hex;
+			kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx10_hex);
 		}
 
 		kfd->cwsr_enabled = true;
@@ -487,13 +570,15 @@ static void kfd_cwsr_init(struct kfd_dev *kfd)
 }
 
 bool kgd2kfd_device_init(struct kfd_dev *kfd,
+			 struct drm_device *ddev,
 			 const struct kgd2kfd_shared_resources *gpu_resources)
 {
 	unsigned int size;
 
-	kfd->mec_fw_version = kfd->kfd2kgd->get_fw_version(kfd->kgd,
+	kfd->ddev = ddev;
+	kfd->mec_fw_version = amdgpu_amdkfd_get_fw_version(kfd->kgd,
 			KGD_ENGINE_MEC1);
-	kfd->sdma_fw_version = kfd->kfd2kgd->get_fw_version(kfd->kgd,
+	kfd->sdma_fw_version = amdgpu_amdkfd_get_fw_version(kfd->kgd,
 			KGD_ENGINE_SDMA1);
 	kfd->shared_resources = *gpu_resources;
 
@@ -513,6 +598,13 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 	} else
 		kfd->max_proc_per_quantum = hws_max_conc_proc;
 
+	/* Allocate global GWS that is shared by all KFD processes */
+	if (hws_gws_support && amdgpu_amdkfd_alloc_gws(kfd->kgd,
+			amdgpu_amdkfd_get_num_gws(kfd->kgd), &kfd->gws)) {
+		dev_err(kfd_device, "Could not allocate %d gws\n",
+			amdgpu_amdkfd_get_num_gws(kfd->kgd));
+		goto out;
+	}
 	/* calculate max size of mqds needed for queues */
 	size = max_num_of_queues_per_device *
 			kfd->device_info->mqd_size_aligned;
@@ -536,7 +628,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 			&kfd->gtt_start_gpu_addr, &kfd->gtt_start_cpu_ptr,
 			false)) {
 		dev_err(kfd_device, "Could not allocate %d bytes\n", size);
-		goto out;
+		goto alloc_gtt_mem_failure;
 	}
 
 	dev_info(kfd_device, "Allocated %d bytes on gart\n", size);
@@ -556,11 +648,6 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 	if (kfd->kfd2kgd->get_hive_id)
 		kfd->hive_id = kfd->kfd2kgd->get_hive_id(kfd->kgd);
 
-	if (kfd_topology_add_device(kfd)) {
-		dev_err(kfd_device, "Error adding device to topology\n");
-		goto kfd_topology_add_device_error;
-	}
-
 	if (kfd_interrupt_init(kfd)) {
 		dev_err(kfd_device, "Error initializing interrupts\n");
 		goto kfd_interrupt_error;
@@ -584,6 +671,11 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 
 	kfd->dbgmgr = NULL;
 
+	if (kfd_topology_add_device(kfd)) {
+		dev_err(kfd_device, "Error adding device to topology\n");
+		goto kfd_topology_add_device_error;
+	}
+
 	kfd->init_complete = true;
 	dev_info(kfd_device, "added device %x:%x\n", kfd->pdev->vendor,
 		 kfd->pdev->device);
@@ -593,19 +685,21 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 
 	goto out;
 
+kfd_topology_add_device_error:
 kfd_resume_error:
 device_iommu_error:
 	device_queue_manager_uninit(kfd->dqm);
 device_queue_manager_error:
 	kfd_interrupt_exit(kfd);
 kfd_interrupt_error:
-	kfd_topology_remove_device(kfd);
-kfd_topology_add_device_error:
 	kfd_doorbell_fini(kfd);
 kfd_doorbell_error:
 	kfd_gtt_sa_fini(kfd);
 kfd_gtt_sa_init_error:
 	amdgpu_amdkfd_free_gtt_mem(kfd->kgd, kfd->gtt_mem);
+alloc_gtt_mem_failure:
+	if (hws_gws_support)
+		amdgpu_amdkfd_free_gws(kfd->kgd, kfd->gws);
 	dev_err(kfd_device,
 		"device %x:%x NOT added due to errors\n",
 		kfd->pdev->vendor, kfd->pdev->device);
@@ -623,6 +717,8 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
 		kfd_doorbell_fini(kfd);
 		kfd_gtt_sa_fini(kfd);
 		amdgpu_amdkfd_free_gtt_mem(kfd->kgd, kfd->gtt_mem);
+		if (hws_gws_support)
+			amdgpu_amdkfd_free_gws(kfd->kgd, kfd->gws);
 	}
 
 	kfree(kfd);
@@ -634,9 +730,6 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
 		return 0;
 	kgd2kfd_suspend(kfd);
 
-	/* hold dqm->lock to prevent further execution*/
-	dqm_lock(kfd->dqm);
-
 	kfd_signal_reset_event(kfd);
 	return 0;
 }
@@ -654,13 +747,13 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
 	if (!kfd->init_complete)
 		return 0;
 
-	dqm_unlock(kfd->dqm);
-
 	ret = kfd_resume(kfd);
 	if (ret)
 		return ret;
 	count = atomic_dec_return(&kfd_locked);
-	WARN_ONCE(count != 0, "KFD reset ref. error");
+
+	atomic_set(&kfd->sram_ecc_flag, 0);
+
 	return 0;
 }
 
@@ -1024,6 +1117,27 @@ int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj)
 	return 0;
 }
 
+void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
+{
+	if (kfd)
+		atomic_inc(&kfd->sram_ecc_flag);
+}
+
+void kfd_inc_compute_active(struct kfd_dev *kfd)
+{
+	if (atomic_inc_return(&kfd->compute_profile) == 1)
+		amdgpu_amdkfd_set_compute_idle(kfd->kgd, false);
+}
+
+void kfd_dec_compute_active(struct kfd_dev *kfd)
+{
+	int count = atomic_dec_return(&kfd->compute_profile);
+
+	if (count == 0)
+		amdgpu_amdkfd_set_compute_idle(kfd->kgd, true);
+	WARN_ONCE(count < 0, "Compute profile ref. count error");
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 /* This function will send a package to HIQ to hang the HWS
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index c6c9530e704e..984c2f2b24b6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -42,10 +42,6 @@
 static int set_pasid_vmid_mapping(struct device_queue_manager *dqm,
 					unsigned int pasid, unsigned int vmid);
 
-static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
-					struct queue *q,
-					struct qcm_process_device *qpd);
-
 static int execute_queues_cpsch(struct device_queue_manager *dqm,
 				enum kfd_unmap_queues_filter filter,
 				uint32_t filter_param);
@@ -55,19 +51,20 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 
 static int map_queues_cpsch(struct device_queue_manager *dqm);
 
-static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
-					struct queue *q,
-					struct qcm_process_device *qpd);
-
 static void deallocate_sdma_queue(struct device_queue_manager *dqm,
-				unsigned int sdma_queue_id);
+				struct queue *q);
 
+static inline void deallocate_hqd(struct device_queue_manager *dqm,
+				struct queue *q);
+static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q);
+static int allocate_sdma_queue(struct device_queue_manager *dqm,
+				struct queue *q);
 static void kfd_process_hw_exception(struct work_struct *work);
 
 static inline
 enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type)
 {
-	if (type == KFD_QUEUE_TYPE_SDMA)
+	if (type == KFD_QUEUE_TYPE_SDMA || type == KFD_QUEUE_TYPE_SDMA_XGMI)
 		return KFD_MQD_TYPE_SDMA;
 	return KFD_MQD_TYPE_CP;
 }
@@ -107,12 +104,23 @@ static unsigned int get_num_sdma_engines(struct device_queue_manager *dqm)
 	return dqm->dev->device_info->num_sdma_engines;
 }
 
+static unsigned int get_num_xgmi_sdma_engines(struct device_queue_manager *dqm)
+{
+	return dqm->dev->device_info->num_xgmi_sdma_engines;
+}
+
 unsigned int get_num_sdma_queues(struct device_queue_manager *dqm)
 {
 	return dqm->dev->device_info->num_sdma_engines
 			* dqm->dev->device_info->num_sdma_queues_per_engine;
 }
 
+unsigned int get_num_xgmi_sdma_queues(struct device_queue_manager *dqm)
+{
+	return dqm->dev->device_info->num_xgmi_sdma_engines
+			* dqm->dev->device_info->num_sdma_queues_per_engine;
+}
+
 void program_sh_mem_settings(struct device_queue_manager *dqm,
 					struct qcm_process_device *qpd)
 {
@@ -133,7 +141,8 @@ static int allocate_doorbell(struct qcm_process_device *qpd, struct queue *q)
 		 * preserve the user mode ABI.
 		 */
 		q->doorbell_id = q->properties.queue_id;
-	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
+	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+			q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
 		/* For SDMA queues on SOC15 with 8-byte doorbell, use static
 		 * doorbell assignments based on the engine and queue id.
 		 * The doobell index distance between RLC (2*i) and (2*i+1)
@@ -174,7 +183,8 @@ static void deallocate_doorbell(struct qcm_process_device *qpd,
 	struct kfd_dev *dev = qpd->dqm->dev;
 
 	if (!KFD_IS_SOC15(dev->device_info->asic_family) ||
-	    q->properties.type == KFD_QUEUE_TYPE_SDMA)
+	    q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+	    q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
 		return;
 
 	old = test_and_clear_bit(q->doorbell_id, qpd->doorbell_bitmap);
@@ -185,20 +195,30 @@ static int allocate_vmid(struct device_queue_manager *dqm,
 			struct qcm_process_device *qpd,
 			struct queue *q)
 {
-	int bit, allocated_vmid;
+	int allocated_vmid = -1, i;
 
-	if (dqm->vmid_bitmap == 0)
-		return -ENOMEM;
+	for (i = dqm->dev->vm_info.first_vmid_kfd;
+			i <= dqm->dev->vm_info.last_vmid_kfd; i++) {
+		if (!dqm->vmid_pasid[i]) {
+			allocated_vmid = i;
+			break;
+		}
+	}
+
+	if (allocated_vmid < 0) {
+		pr_err("no more vmid to allocate\n");
+		return -ENOSPC;
+	}
+
+	pr_debug("vmid allocated: %d\n", allocated_vmid);
 
-	bit = ffs(dqm->vmid_bitmap) - 1;
-	dqm->vmid_bitmap &= ~(1 << bit);
+	dqm->vmid_pasid[allocated_vmid] = q->process->pasid;
+
+	set_pasid_vmid_mapping(dqm, q->process->pasid, allocated_vmid);
 
-	allocated_vmid = bit + dqm->dev->vm_info.first_vmid_kfd;
-	pr_debug("vmid allocation %d\n", allocated_vmid);
 	qpd->vmid = allocated_vmid;
 	q->properties.vmid = allocated_vmid;
 
-	set_pasid_vmid_mapping(dqm, q->process->pasid, q->properties.vmid);
 	program_sh_mem_settings(dqm, qpd);
 
 	/* qpd->page_table_base is set earlier when register_process()
@@ -210,6 +230,10 @@ static int allocate_vmid(struct device_queue_manager *dqm,
 	/* invalidate the VM context after pasid and vmid mapping is set up */
 	kfd_flush_tlb(qpd_to_pdd(qpd));
 
+	if (dqm->dev->kfd2kgd->set_scratch_backing_va)
+		dqm->dev->kfd2kgd->set_scratch_backing_va(dqm->dev->kgd,
+				qpd->sh_hidden_private_base, qpd->vmid);
+
 	return 0;
 }
 
@@ -235,8 +259,6 @@ static void deallocate_vmid(struct device_queue_manager *dqm,
 				struct qcm_process_device *qpd,
 				struct queue *q)
 {
-	int bit = qpd->vmid - dqm->dev->vm_info.first_vmid_kfd;
-
 	/* On GFX v7, CP doesn't flush TC at dequeue */
 	if (q->device->device_info->asic_family == CHIP_HAWAII)
 		if (flush_texture_cache_nocpsch(q->device, qpd))
@@ -246,8 +268,8 @@ static void deallocate_vmid(struct device_queue_manager *dqm,
 
 	/* Release the vmid mapping */
 	set_pasid_vmid_mapping(dqm, 0, qpd->vmid);
+	dqm->vmid_pasid[qpd->vmid] = 0;
 
-	dqm->vmid_bitmap |= (1 << bit);
 	qpd->vmid = 0;
 	q->properties.vmid = 0;
 }
@@ -256,6 +278,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 				struct queue *q,
 				struct qcm_process_device *qpd)
 {
+	struct mqd_manager *mqd_mgr;
 	int retval;
 
 	print_queue(q);
@@ -276,30 +299,63 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 	}
 	q->properties.vmid = qpd->vmid;
 	/*
-	 * Eviction state logic: we only mark active queues as evicted
-	 * to avoid the overhead of restoring inactive queues later
+	 * Eviction state logic: mark all queues as evicted, even ones
+	 * not currently active. Restoring inactive queues later only
+	 * updates the is_evicted flag but is a no-op otherwise.
 	 */
-	if (qpd->evicted)
-		q->properties.is_evicted = (q->properties.queue_size > 0 &&
-					    q->properties.queue_percent > 0 &&
-					    q->properties.queue_address != 0);
+	q->properties.is_evicted = !!qpd->evicted;
 
 	q->properties.tba_addr = qpd->tba_addr;
 	q->properties.tma_addr = qpd->tma_addr;
 
-	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
-		retval = create_compute_queue_nocpsch(dqm, q, qpd);
-	else if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
-		retval = create_sdma_queue_nocpsch(dqm, q, qpd);
-	else
-		retval = -EINVAL;
+	mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
+			q->properties.type)];
+	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) {
+		retval = allocate_hqd(dqm, q);
+		if (retval)
+			goto deallocate_vmid;
+		pr_debug("Loading mqd to hqd on pipe %d, queue %d\n",
+			q->pipe, q->queue);
+	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+		retval = allocate_sdma_queue(dqm, q);
+		if (retval)
+			goto deallocate_vmid;
+		dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
+	}
 
-	if (retval) {
-		if (list_empty(&qpd->queues_list))
-			deallocate_vmid(dqm, qpd, q);
-		goto out_unlock;
+	retval = allocate_doorbell(qpd, q);
+	if (retval)
+		goto out_deallocate_hqd;
+
+	/* Temporarily release dqm lock to avoid a circular lock dependency */
+	dqm_unlock(dqm);
+	q->mqd_mem_obj = mqd_mgr->allocate_mqd(mqd_mgr->dev, &q->properties);
+	dqm_lock(dqm);
+
+	if (!q->mqd_mem_obj) {
+		retval = -ENOMEM;
+		goto out_deallocate_doorbell;
 	}
+	mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj,
+				&q->gart_mqd_addr, &q->properties);
+	if (q->properties.is_active) {
+		if (!dqm->sched_running) {
+			WARN_ONCE(1, "Load non-HWS mqd while stopped\n");
+			goto add_queue_to_list;
+		}
 
+		if (WARN(q->process->mm != current->mm,
+					"should only run in user thread"))
+			retval = -EFAULT;
+		else
+			retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe,
+					q->queue, &q->properties, current->mm);
+		if (retval)
+			goto out_free_mqd;
+	}
+
+add_queue_to_list:
 	list_add(&q->list, &qpd->queues_list);
 	qpd->queue_count++;
 	if (q->properties.is_active)
@@ -307,6 +363,8 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 
 	if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
 		dqm->sdma_queue_count++;
+	else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
+		dqm->xgmi_sdma_queue_count++;
 
 	/*
 	 * Unconditionally increment this counter, regardless of the queue's
@@ -315,7 +373,21 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
 	dqm->total_queue_count++;
 	pr_debug("Total of %d queues are accountable so far\n",
 			dqm->total_queue_count);
+	goto out_unlock;
 
+out_free_mqd:
+	mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
+out_deallocate_doorbell:
+	deallocate_doorbell(qpd, q);
+out_deallocate_hqd:
+	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE)
+		deallocate_hqd(dqm, q);
+	else if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
+		deallocate_sdma_queue(dqm, q);
+deallocate_vmid:
+	if (list_empty(&qpd->queues_list))
+		deallocate_vmid(dqm, qpd, q);
 out_unlock:
 	dqm_unlock(dqm);
 	return retval;
@@ -361,60 +433,6 @@ static inline void deallocate_hqd(struct device_queue_manager *dqm,
 	dqm->allocated_queues[q->pipe] |= (1 << q->queue);
 }
 
-static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
-					struct queue *q,
-					struct qcm_process_device *qpd)
-{
-	struct mqd_manager *mqd_mgr;
-	int retval;
-
-	mqd_mgr = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE);
-	if (!mqd_mgr)
-		return -ENOMEM;
-
-	retval = allocate_hqd(dqm, q);
-	if (retval)
-		return retval;
-
-	retval = allocate_doorbell(qpd, q);
-	if (retval)
-		goto out_deallocate_hqd;
-
-	retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
-				&q->gart_mqd_addr, &q->properties);
-	if (retval)
-		goto out_deallocate_doorbell;
-
-	pr_debug("Loading mqd to hqd on pipe %d, queue %d\n",
-			q->pipe, q->queue);
-
-	dqm->dev->kfd2kgd->set_scratch_backing_va(
-			dqm->dev->kgd, qpd->sh_hidden_private_base, qpd->vmid);
-
-	if (!q->properties.is_active)
-		return 0;
-
-	if (WARN(q->process->mm != current->mm,
-		 "should only run in user thread"))
-		retval = -EFAULT;
-	else
-		retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, q->queue,
-					   &q->properties, current->mm);
-	if (retval)
-		goto out_uninit_mqd;
-
-	return 0;
-
-out_uninit_mqd:
-	mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
-out_deallocate_doorbell:
-	deallocate_doorbell(qpd, q);
-out_deallocate_hqd:
-	deallocate_hqd(dqm, q);
-
-	return retval;
-}
-
 /* Access to DQM has to be locked before calling destroy_queue_nocpsch_locked
  * to avoid asynchronized access
  */
@@ -425,16 +443,17 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
 	int retval;
 	struct mqd_manager *mqd_mgr;
 
-	mqd_mgr = dqm->ops.get_mqd_manager(dqm,
-		get_mqd_type_from_queue_type(q->properties.type));
-	if (!mqd_mgr)
-		return -ENOMEM;
+	mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
+			q->properties.type)];
 
 	if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) {
 		deallocate_hqd(dqm, q);
 	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
 		dqm->sdma_queue_count--;
-		deallocate_sdma_queue(dqm, q->sdma_id);
+		deallocate_sdma_queue(dqm, q);
+	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+		dqm->xgmi_sdma_queue_count--;
+		deallocate_sdma_queue(dqm, q);
 	} else {
 		pr_debug("q->properties.type %d is invalid\n",
 				q->properties.type);
@@ -444,6 +463,11 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
 
 	deallocate_doorbell(qpd, q);
 
+	if (!dqm->sched_running) {
+		WARN_ONCE(1, "Destroy non-HWS queue while stopped\n");
+		return 0;
+	}
+
 	retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
 				KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
 				KFD_UNMAP_LATENCY_MS,
@@ -451,7 +475,7 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
 	if (retval == -ETIME)
 		qpd->reset_wavefronts = true;
 
-	mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
+	mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
 
 	list_del(&q->list);
 	if (list_empty(&qpd->queues_list)) {
@@ -490,7 +514,7 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
 
 static int update_queue(struct device_queue_manager *dqm, struct queue *q)
 {
-	int retval;
+	int retval = 0;
 	struct mqd_manager *mqd_mgr;
 	struct kfd_process_device *pdd;
 	bool prev_active = false;
@@ -501,20 +525,8 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
 		retval = -ENODEV;
 		goto out_unlock;
 	}
-	mqd_mgr = dqm->ops.get_mqd_manager(dqm,
-			get_mqd_type_from_queue_type(q->properties.type));
-	if (!mqd_mgr) {
-		retval = -ENOMEM;
-		goto out_unlock;
-	}
-	/*
-	 * Eviction state logic: we only mark active queues as evicted
-	 * to avoid the overhead of restoring inactive queues later
-	 */
-	if (pdd->qpd.evicted)
-		q->properties.is_evicted = (q->properties.queue_size > 0 &&
-					    q->properties.queue_percent > 0 &&
-					    q->properties.queue_address != 0);
+	mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
+			q->properties.type)];
 
 	/* Save previous activity state for counters */
 	prev_active = q->properties.is_active;
@@ -529,7 +541,14 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
 		}
 	} else if (prev_active &&
 		   (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
-		    q->properties.type == KFD_QUEUE_TYPE_SDMA)) {
+		    q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+		    q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
+
+		if (!dqm->sched_running) {
+			WARN_ONCE(1, "Update non-HWS queue while stopped\n");
+			goto out_unlock;
+		}
+
 		retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
 				KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
 				KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
@@ -539,7 +558,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
 		}
 	}
 
-	retval = mqd_mgr->update_mqd(mqd_mgr, q->mqd, &q->properties);
+	mqd_mgr->update_mqd(mqd_mgr, q->mqd, &q->properties);
 
 	/*
 	 * check active state vs. the previous state and modify
@@ -556,7 +575,8 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
 		retval = map_queues_cpsch(dqm);
 	else if (q->properties.is_active &&
 		 (q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
-		  q->properties.type == KFD_QUEUE_TYPE_SDMA)) {
+		  q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+		  q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
 		if (WARN(q->process->mm != current->mm,
 			 "should only run in user thread"))
 			retval = -EFAULT;
@@ -571,67 +591,51 @@ out_unlock:
 	return retval;
 }
 
-static struct mqd_manager *get_mqd_manager(
-		struct device_queue_manager *dqm, enum KFD_MQD_TYPE type)
-{
-	struct mqd_manager *mqd_mgr;
-
-	if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
-		return NULL;
-
-	pr_debug("mqd type %d\n", type);
-
-	mqd_mgr = dqm->mqd_mgrs[type];
-	if (!mqd_mgr) {
-		mqd_mgr = mqd_manager_init(type, dqm->dev);
-		if (!mqd_mgr)
-			pr_err("mqd manager is NULL");
-		dqm->mqd_mgrs[type] = mqd_mgr;
-	}
-
-	return mqd_mgr;
-}
-
 static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
 					struct qcm_process_device *qpd)
 {
 	struct queue *q;
 	struct mqd_manager *mqd_mgr;
 	struct kfd_process_device *pdd;
-	int retval = 0;
+	int retval, ret = 0;
 
 	dqm_lock(dqm);
 	if (qpd->evicted++ > 0) /* already evicted, do nothing */
 		goto out;
 
 	pdd = qpd_to_pdd(qpd);
-	pr_info_ratelimited("Evicting PASID %u queues\n",
+	pr_info_ratelimited("Evicting PASID 0x%x queues\n",
 			    pdd->process->pasid);
 
-	/* unactivate all active queues on the qpd */
+	/* Mark all queues as evicted. Deactivate all active queues on
+	 * the qpd.
+	 */
 	list_for_each_entry(q, &qpd->queues_list, list) {
+		q->properties.is_evicted = true;
 		if (!q->properties.is_active)
 			continue;
-		mqd_mgr = dqm->ops.get_mqd_manager(dqm,
-			get_mqd_type_from_queue_type(q->properties.type));
-		if (!mqd_mgr) { /* should not be here */
-			pr_err("Cannot evict queue, mqd mgr is NULL\n");
-			retval = -ENOMEM;
-			goto out;
-		}
-		q->properties.is_evicted = true;
+
+		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
+				q->properties.type)];
 		q->properties.is_active = false;
+		dqm->queue_count--;
+
+		if (WARN_ONCE(!dqm->sched_running, "Evict when stopped\n"))
+			continue;
+
 		retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
 				KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
 				KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
-		if (retval)
-			goto out;
-		dqm->queue_count--;
+		if (retval && !ret)
+			/* Return the first error, but keep going to
+			 * maintain a consistent eviction state
+			 */
+			ret = retval;
 	}
 
 out:
 	dqm_unlock(dqm);
-	return retval;
+	return ret;
 }
 
 static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
@@ -646,14 +650,17 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
 		goto out;
 
 	pdd = qpd_to_pdd(qpd);
-	pr_info_ratelimited("Evicting PASID %u queues\n",
+	pr_info_ratelimited("Evicting PASID 0x%x queues\n",
 			    pdd->process->pasid);
 
-	/* unactivate all active queues on the qpd */
+	/* Mark all queues as evicted. Deactivate all active queues on
+	 * the qpd.
+	 */
 	list_for_each_entry(q, &qpd->queues_list, list) {
+		q->properties.is_evicted = true;
 		if (!q->properties.is_active)
 			continue;
-		q->properties.is_evicted = true;
+
 		q->properties.is_active = false;
 		dqm->queue_count--;
 	}
@@ -675,7 +682,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
 	struct mqd_manager *mqd_mgr;
 	struct kfd_process_device *pdd;
 	uint64_t pd_base;
-	int retval = 0;
+	int retval, ret = 0;
 
 	pdd = qpd_to_pdd(qpd);
 	/* Retrieve PD base */
@@ -689,7 +696,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
 		goto out;
 	}
 
-	pr_info_ratelimited("Restoring PASID %u queues\n",
+	pr_info_ratelimited("Restoring PASID 0x%x queues\n",
 			    pdd->process->pasid);
 
 	/* Update PD Base in QPD */
@@ -709,35 +716,40 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
 	 */
 	mm = get_task_mm(pdd->process->lead_thread);
 	if (!mm) {
-		retval = -EFAULT;
+		ret = -EFAULT;
 		goto out;
 	}
 
-	/* activate all active queues on the qpd */
+	/* Remove the eviction flags. Activate queues that are not
+	 * inactive for other reasons.
+	 */
 	list_for_each_entry(q, &qpd->queues_list, list) {
-		if (!q->properties.is_evicted)
-			continue;
-		mqd_mgr = dqm->ops.get_mqd_manager(dqm,
-			get_mqd_type_from_queue_type(q->properties.type));
-		if (!mqd_mgr) { /* should not be here */
-			pr_err("Cannot restore queue, mqd mgr is NULL\n");
-			retval = -ENOMEM;
-			goto out;
-		}
 		q->properties.is_evicted = false;
+		if (!QUEUE_IS_ACTIVE(q->properties))
+			continue;
+
+		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
+				q->properties.type)];
 		q->properties.is_active = true;
+		dqm->queue_count++;
+
+		if (WARN_ONCE(!dqm->sched_running, "Restore when stopped\n"))
+			continue;
+
 		retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe,
 				       q->queue, &q->properties, mm);
-		if (retval)
-			goto out;
-		dqm->queue_count++;
+		if (retval && !ret)
+			/* Return the first error, but keep going to
+			 * maintain a consistent eviction state
+			 */
+			ret = retval;
 	}
 	qpd->evicted = 0;
 out:
 	if (mm)
 		mmput(mm);
 	dqm_unlock(dqm);
-	return retval;
+	return ret;
 }
 
 static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
@@ -760,7 +772,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
 		goto out;
 	}
 
-	pr_info_ratelimited("Restoring PASID %u queues\n",
+	pr_info_ratelimited("Restoring PASID 0x%x queues\n",
 			    pdd->process->pasid);
 
 	/* Update PD Base in QPD */
@@ -769,16 +781,16 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
 
 	/* activate all active queues on the qpd */
 	list_for_each_entry(q, &qpd->queues_list, list) {
-		if (!q->properties.is_evicted)
-			continue;
 		q->properties.is_evicted = false;
+		if (!QUEUE_IS_ACTIVE(q->properties))
+			continue;
+
 		q->properties.is_active = true;
 		dqm->queue_count++;
 	}
 	retval = execute_queues_cpsch(dqm,
 				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
-	if (!retval)
-		qpd->evicted = 0;
+	qpd->evicted = 0;
 out:
 	dqm_unlock(dqm);
 	return retval;
@@ -811,11 +823,15 @@ static int register_process(struct device_queue_manager *dqm,
 
 	retval = dqm->asic_ops.update_qpd(dqm, qpd);
 
-	if (dqm->processes_count++ == 0)
-		amdgpu_amdkfd_set_compute_idle(dqm->dev->kgd, false);
+	dqm->processes_count++;
 
 	dqm_unlock(dqm);
 
+	/* Outside the DQM lock because under the DQM lock we can't do
+	 * reclaim or take other locks that others hold while reclaiming.
+	 */
+	kfd_inc_compute_active(dqm->dev);
+
 	return retval;
 }
 
@@ -835,9 +851,7 @@ static int unregister_process(struct device_queue_manager *dqm,
 		if (qpd == cur->qpd) {
 			list_del(&cur->list);
 			kfree(cur);
-			if (--dqm->processes_count == 0)
-				amdgpu_amdkfd_set_compute_idle(
-					dqm->dev->kgd, true);
+			dqm->processes_count--;
 			goto out;
 		}
 	}
@@ -845,6 +859,13 @@ static int unregister_process(struct device_queue_manager *dqm,
 	retval = 1;
 out:
 	dqm_unlock(dqm);
+
+	/* Outside the DQM lock because under the DQM lock we can't do
+	 * reclaim or take other locks that others hold while reclaiming.
+	 */
+	if (!retval)
+		kfd_dec_compute_active(dqm->dev);
+
 	return retval;
 }
 
@@ -880,6 +901,7 @@ static int initialize_nocpsch(struct device_queue_manager *dqm)
 	INIT_LIST_HEAD(&dqm->queues);
 	dqm->queue_count = dqm->next_pipe_to_allocate = 0;
 	dqm->sdma_queue_count = 0;
+	dqm->xgmi_sdma_queue_count = 0;
 
 	for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) {
 		int pipe_offset = pipe * get_queues_per_pipe(dqm);
@@ -890,8 +912,10 @@ static int initialize_nocpsch(struct device_queue_manager *dqm)
 				dqm->allocated_queues[pipe] |= 1 << queue;
 	}
 
-	dqm->vmid_bitmap = (1 << dqm->dev->vm_info.vmid_num_kfd) - 1;
-	dqm->sdma_bitmap = (1 << get_num_sdma_queues(dqm)) - 1;
+	memset(dqm->vmid_pasid, 0, sizeof(dqm->vmid_pasid));
+
+	dqm->sdma_bitmap = ~0ULL >> (64 - get_num_sdma_queues(dqm));
+	dqm->xgmi_sdma_bitmap = ~0ULL >> (64 - get_num_xgmi_sdma_queues(dqm));
 
 	return 0;
 }
@@ -912,85 +936,74 @@ static void uninitialize(struct device_queue_manager *dqm)
 static int start_nocpsch(struct device_queue_manager *dqm)
 {
 	init_interrupts(dqm);
-	return pm_init(&dqm->packets, dqm);
+	
+	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
+		return pm_init(&dqm->packets, dqm);
+	dqm->sched_running = true;
+
+	return 0;
 }
 
 static int stop_nocpsch(struct device_queue_manager *dqm)
 {
-	pm_uninit(&dqm->packets);
+	if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
+		pm_uninit(&dqm->packets);
+	dqm->sched_running = false;
+
 	return 0;
 }
 
 static int allocate_sdma_queue(struct device_queue_manager *dqm,
-				unsigned int *sdma_queue_id)
+				struct queue *q)
 {
 	int bit;
 
-	if (dqm->sdma_bitmap == 0)
-		return -ENOMEM;
+	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
+		if (dqm->sdma_bitmap == 0)
+			return -ENOMEM;
+		bit = __ffs64(dqm->sdma_bitmap);
+		dqm->sdma_bitmap &= ~(1ULL << bit);
+		q->sdma_id = bit;
+		q->properties.sdma_engine_id = q->sdma_id %
+				get_num_sdma_engines(dqm);
+		q->properties.sdma_queue_id = q->sdma_id /
+				get_num_sdma_engines(dqm);
+	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+		if (dqm->xgmi_sdma_bitmap == 0)
+			return -ENOMEM;
+		bit = __ffs64(dqm->xgmi_sdma_bitmap);
+		dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
+		q->sdma_id = bit;
+		/* sdma_engine_id is sdma id including
+		 * both PCIe-optimized SDMAs and XGMI-
+		 * optimized SDMAs. The calculation below
+		 * assumes the first N engines are always
+		 * PCIe-optimized ones
+		 */
+		q->properties.sdma_engine_id = get_num_sdma_engines(dqm) +
+				q->sdma_id % get_num_xgmi_sdma_engines(dqm);
+		q->properties.sdma_queue_id = q->sdma_id /
+				get_num_xgmi_sdma_engines(dqm);
+	}
 
-	bit = ffs(dqm->sdma_bitmap) - 1;
-	dqm->sdma_bitmap &= ~(1 << bit);
-	*sdma_queue_id = bit;
+	pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
+	pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id);
 
 	return 0;
 }
 
 static void deallocate_sdma_queue(struct device_queue_manager *dqm,
-				unsigned int sdma_queue_id)
-{
-	if (sdma_queue_id >= get_num_sdma_queues(dqm))
-		return;
-	dqm->sdma_bitmap |= (1 << sdma_queue_id);
-}
-
-static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
-					struct queue *q,
-					struct qcm_process_device *qpd)
+				struct queue *q)
 {
-	struct mqd_manager *mqd_mgr;
-	int retval;
-
-	mqd_mgr = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_SDMA);
-	if (!mqd_mgr)
-		return -ENOMEM;
-
-	retval = allocate_sdma_queue(dqm, &q->sdma_id);
-	if (retval)
-		return retval;
-
-	q->properties.sdma_queue_id = q->sdma_id / get_num_sdma_engines(dqm);
-	q->properties.sdma_engine_id = q->sdma_id % get_num_sdma_engines(dqm);
-
-	retval = allocate_doorbell(qpd, q);
-	if (retval)
-		goto out_deallocate_sdma_queue;
-
-	pr_debug("SDMA id is:    %d\n", q->sdma_id);
-	pr_debug("SDMA queue id: %d\n", q->properties.sdma_queue_id);
-	pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
-
-	dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
-	retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
-				&q->gart_mqd_addr, &q->properties);
-	if (retval)
-		goto out_deallocate_doorbell;
-
-	retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, 0, 0, &q->properties,
-				NULL);
-	if (retval)
-		goto out_uninit_mqd;
-
-	return 0;
-
-out_uninit_mqd:
-	mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
-out_deallocate_doorbell:
-	deallocate_doorbell(qpd, q);
-out_deallocate_sdma_queue:
-	deallocate_sdma_queue(dqm, q->sdma_id);
-
-	return retval;
+	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
+		if (q->sdma_id >= get_num_sdma_queues(dqm))
+			return;
+		dqm->sdma_bitmap |= (1ULL << q->sdma_id);
+	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+		if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
+			return;
+		dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
+	}
 }
 
 /*
@@ -1027,8 +1040,8 @@ static int set_sched_resources(struct device_queue_manager *dqm)
 
 		res.queue_mask |= (1ull << i);
 	}
-	res.gws_mask = res.oac_mask = res.gds_heap_base =
-						res.gds_heap_size = 0;
+	res.gws_mask = ~0ull;
+	res.oac_mask = res.gds_heap_base = res.gds_heap_size = 0;
 
 	pr_debug("Scheduling resources:\n"
 			"vmid mask: 0x%8X\n"
@@ -1046,8 +1059,10 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
 	INIT_LIST_HEAD(&dqm->queues);
 	dqm->queue_count = dqm->processes_count = 0;
 	dqm->sdma_queue_count = 0;
+	dqm->xgmi_sdma_queue_count = 0;
 	dqm->active_runlist = false;
-	dqm->sdma_bitmap = (1 << get_num_sdma_queues(dqm)) - 1;
+	dqm->sdma_bitmap = ~0ULL >> (64 - get_num_sdma_queues(dqm));
+	dqm->xgmi_sdma_bitmap = ~0ULL >> (64 - get_num_xgmi_sdma_queues(dqm));
 
 	INIT_WORK(&dqm->hw_exception_work, kfd_process_hw_exception);
 
@@ -1085,6 +1100,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
 	dqm_lock(dqm);
 	/* clear hang status when driver try to start the hw scheduler */
 	dqm->is_hws_hang = false;
+	dqm->sched_running = true;
 	execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
 	dqm_unlock(dqm);
 
@@ -1100,6 +1116,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
 {
 	dqm_lock(dqm);
 	unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+	dqm->sched_running = false;
 	dqm_unlock(dqm);
 
 	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
@@ -1162,55 +1179,49 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
 	int retval;
 	struct mqd_manager *mqd_mgr;
 
-	retval = 0;
-
-	dqm_lock(dqm);
-
 	if (dqm->total_queue_count >= max_num_of_queues_per_device) {
 		pr_warn("Can't create new usermode queue because %d queues were already created\n",
 				dqm->total_queue_count);
 		retval = -EPERM;
-		goto out_unlock;
+		goto out;
 	}
 
-	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
-		retval = allocate_sdma_queue(dqm, &q->sdma_id);
+	if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+		dqm_lock(dqm);
+		retval = allocate_sdma_queue(dqm, q);
+		dqm_unlock(dqm);
 		if (retval)
-			goto out_unlock;
-		q->properties.sdma_queue_id =
-			q->sdma_id / get_num_sdma_engines(dqm);
-		q->properties.sdma_engine_id =
-			q->sdma_id % get_num_sdma_engines(dqm);
+			goto out;
 	}
 
 	retval = allocate_doorbell(qpd, q);
 	if (retval)
 		goto out_deallocate_sdma_queue;
 
-	mqd_mgr = dqm->ops.get_mqd_manager(dqm,
-			get_mqd_type_from_queue_type(q->properties.type));
+	mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
+			q->properties.type)];
 
-	if (!mqd_mgr) {
+	if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
+		dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
+	q->properties.tba_addr = qpd->tba_addr;
+	q->properties.tma_addr = qpd->tma_addr;
+	q->mqd_mem_obj = mqd_mgr->allocate_mqd(mqd_mgr->dev, &q->properties);
+	if (!q->mqd_mem_obj) {
 		retval = -ENOMEM;
 		goto out_deallocate_doorbell;
 	}
+
+	dqm_lock(dqm);
 	/*
-	 * Eviction state logic: we only mark active queues as evicted
-	 * to avoid the overhead of restoring inactive queues later
+	 * Eviction state logic: mark all queues as evicted, even ones
+	 * not currently active. Restoring inactive queues later only
+	 * updates the is_evicted flag but is a no-op otherwise.
 	 */
-	if (qpd->evicted)
-		q->properties.is_evicted = (q->properties.queue_size > 0 &&
-					    q->properties.queue_percent > 0 &&
-					    q->properties.queue_address != 0);
-
-	dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
-
-	q->properties.tba_addr = qpd->tba_addr;
-	q->properties.tma_addr = qpd->tma_addr;
-	retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
+	q->properties.is_evicted = !!qpd->evicted;
+	mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj,
 				&q->gart_mqd_addr, &q->properties);
-	if (retval)
-		goto out_deallocate_doorbell;
 
 	list_add(&q->list, &qpd->queues_list);
 	qpd->queue_count++;
@@ -1222,6 +1233,8 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
 
 	if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
 		dqm->sdma_queue_count++;
+	else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
+		dqm->xgmi_sdma_queue_count++;
 	/*
 	 * Unconditionally increment this counter, regardless of the queue's
 	 * type or whether the queue is active.
@@ -1237,11 +1250,13 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
 out_deallocate_doorbell:
 	deallocate_doorbell(qpd, q);
 out_deallocate_sdma_queue:
-	if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
-		deallocate_sdma_queue(dqm, q->sdma_id);
-out_unlock:
-	dqm_unlock(dqm);
-
+	if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
+		q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+		dqm_lock(dqm);
+		deallocate_sdma_queue(dqm, q);
+		dqm_unlock(dqm);
+	}
+out:
 	return retval;
 }
 
@@ -1269,12 +1284,18 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
 	return 0;
 }
 
-static int unmap_sdma_queues(struct device_queue_manager *dqm,
-				unsigned int sdma_engine)
+static int unmap_sdma_queues(struct device_queue_manager *dqm)
 {
-	return pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_SDMA,
-			KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false,
-			sdma_engine);
+	int i, retval = 0;
+
+	for (i = 0; i < dqm->dev->device_info->num_sdma_engines +
+		dqm->dev->device_info->num_xgmi_sdma_engines; i++) {
+		retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_SDMA,
+			KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false, i);
+		if (retval)
+			return retval;
+	}
+	return retval;
 }
 
 /* dqm->lock mutex has to be locked before calling this function */
@@ -1282,13 +1303,15 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
 {
 	int retval;
 
+	if (!dqm->sched_running)
+		return 0;
 	if (dqm->queue_count <= 0 || dqm->processes_count <= 0)
 		return 0;
-
 	if (dqm->active_runlist)
 		return 0;
 
 	retval = pm_send_runlist(&dqm->packets, &dqm->queues);
+	pr_debug("%s sent runlist\n", __func__);
 	if (retval) {
 		pr_err("failed to execute runlist\n");
 		return retval;
@@ -1305,18 +1328,18 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 {
 	int retval = 0;
 
+	if (!dqm->sched_running)
+		return 0;
 	if (dqm->is_hws_hang)
 		return -EIO;
 	if (!dqm->active_runlist)
 		return retval;
 
-	pr_debug("Before destroying queues, sdma queue count is : %u\n",
-		dqm->sdma_queue_count);
+	pr_debug("Before destroying queues, sdma queue count is : %u, xgmi sdma queue count is : %u\n",
+		dqm->sdma_queue_count, dqm->xgmi_sdma_queue_count);
 
-	if (dqm->sdma_queue_count > 0) {
-		unmap_sdma_queues(dqm, 0);
-		unmap_sdma_queues(dqm, 1);
-	}
+	if (dqm->sdma_queue_count > 0 || dqm->xgmi_sdma_queue_count)
+		unmap_sdma_queues(dqm);
 
 	retval = pm_send_unmap_queue(&dqm->packets, KFD_QUEUE_TYPE_COMPUTE,
 			filter, filter_param, false, 0);
@@ -1328,7 +1351,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 				KFD_FENCE_COMPLETED);
 	/* should be timed out */
 	retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED,
-				QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS);
+				queue_preemption_timeout_ms);
 	if (retval)
 		return retval;
 
@@ -1380,18 +1403,17 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 
 	}
 
-	mqd_mgr = dqm->ops.get_mqd_manager(dqm,
-			get_mqd_type_from_queue_type(q->properties.type));
-	if (!mqd_mgr) {
-		retval = -ENOMEM;
-		goto failed;
-	}
+	mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
+			q->properties.type)];
 
 	deallocate_doorbell(qpd, q);
 
 	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
 		dqm->sdma_queue_count--;
-		deallocate_sdma_queue(dqm, q->sdma_id);
+		deallocate_sdma_queue(dqm, q);
+	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+		dqm->xgmi_sdma_queue_count--;
+		deallocate_sdma_queue(dqm, q);
 	}
 
 	list_del(&q->list);
@@ -1404,8 +1426,6 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 			qpd->reset_wavefronts = true;
 	}
 
-	mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
-
 	/*
 	 * Unconditionally decrement this counter, regardless of the queue's
 	 * type
@@ -1416,9 +1436,11 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 
 	dqm_unlock(dqm);
 
+	/* Do free_mqd after dqm_unlock(dqm) to avoid circular locking */
+	mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
+
 	return retval;
 
-failed:
 failed_try_destroy_debugged_queue:
 
 	dqm_unlock(dqm);
@@ -1521,6 +1543,7 @@ static int process_termination_nocpsch(struct device_queue_manager *dqm,
 	struct queue *q, *next;
 	struct device_process_node *cur, *next_dpn;
 	int retval = 0;
+	bool found = false;
 
 	dqm_lock(dqm);
 
@@ -1539,11 +1562,19 @@ static int process_termination_nocpsch(struct device_queue_manager *dqm,
 			list_del(&cur->list);
 			kfree(cur);
 			dqm->processes_count--;
+			found = true;
 			break;
 		}
 	}
 
 	dqm_unlock(dqm);
+
+	/* Outside the DQM lock because under the DQM lock we can't do
+	 * reclaim or take other locks that others hold while reclaiming.
+	 */
+	if (found)
+		kfd_dec_compute_active(dqm->dev);
+
 	return retval;
 }
 
@@ -1564,11 +1595,7 @@ static int get_wave_state(struct device_queue_manager *dqm,
 		goto dqm_unlock;
 	}
 
-	mqd_mgr = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE);
-	if (!mqd_mgr) {
-		r = -ENOMEM;
-		goto dqm_unlock;
-	}
+	mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_COMPUTE];
 
 	if (!mqd_mgr->get_wave_state) {
 		r = -EINVAL;
@@ -1593,6 +1620,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
 	struct device_process_node *cur, *next_dpn;
 	enum kfd_unmap_queues_filter filter =
 		KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES;
+	bool found = false;
 
 	retval = 0;
 
@@ -1611,7 +1639,10 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
 	list_for_each_entry(q, &qpd->queues_list, list) {
 		if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
 			dqm->sdma_queue_count--;
-			deallocate_sdma_queue(dqm, q->sdma_id);
+			deallocate_sdma_queue(dqm, q);
+		} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
+			dqm->xgmi_sdma_queue_count--;
+			deallocate_sdma_queue(dqm, q);
 		}
 
 		if (q->properties.is_active)
@@ -1626,6 +1657,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
 			list_del(&cur->list);
 			kfree(cur);
 			dqm->processes_count--;
+			found = true;
 			break;
 		}
 	}
@@ -1637,21 +1669,69 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
 		qpd->reset_wavefronts = false;
 	}
 
-	/* lastly, free mqd resources */
+	dqm_unlock(dqm);
+
+	/* Outside the DQM lock because under the DQM lock we can't do
+	 * reclaim or take other locks that others hold while reclaiming.
+	 */
+	if (found)
+		kfd_dec_compute_active(dqm->dev);
+
+	/* Lastly, free mqd resources.
+	 * Do free_mqd() after dqm_unlock to avoid circular locking.
+	 */
 	list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
-		mqd_mgr = dqm->ops.get_mqd_manager(dqm,
-			get_mqd_type_from_queue_type(q->properties.type));
-		if (!mqd_mgr) {
-			retval = -ENOMEM;
-			goto out;
-		}
+		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
+				q->properties.type)];
 		list_del(&q->list);
 		qpd->queue_count--;
-		mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
+		mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
 	}
 
-out:
-	dqm_unlock(dqm);
+	return retval;
+}
+
+static int init_mqd_managers(struct device_queue_manager *dqm)
+{
+	int i, j;
+	struct mqd_manager *mqd_mgr;
+
+	for (i = 0; i < KFD_MQD_TYPE_MAX; i++) {
+		mqd_mgr = dqm->asic_ops.mqd_manager_init(i, dqm->dev);
+		if (!mqd_mgr) {
+			pr_err("mqd manager [%d] initialization failed\n", i);
+			goto out_free;
+		}
+		dqm->mqd_mgrs[i] = mqd_mgr;
+	}
+
+	return 0;
+
+out_free:
+	for (j = 0; j < i; j++) {
+		kfree(dqm->mqd_mgrs[j]);
+		dqm->mqd_mgrs[j] = NULL;
+	}
+
+	return -ENOMEM;
+}
+
+/* Allocate one hiq mqd (HWS) and all SDMA mqd in a continuous trunk*/
+static int allocate_hiq_sdma_mqd(struct device_queue_manager *dqm)
+{
+	int retval;
+	struct kfd_dev *dev = dqm->dev;
+	struct kfd_mem_obj *mem_obj = &dqm->hiq_sdma_mqd;
+	uint32_t size = dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size *
+		(dev->device_info->num_sdma_engines +
+		dev->device_info->num_xgmi_sdma_engines) *
+		dev->device_info->num_sdma_queues_per_engine +
+		dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size;
+
+	retval = amdgpu_amdkfd_alloc_gtt_mem(dev->kgd, size,
+		&(mem_obj->gtt_mem), &(mem_obj->gpu_addr),
+		(void *)&(mem_obj->cpu_ptr), true);
+
 	return retval;
 }
 
@@ -1692,7 +1772,6 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 		dqm->ops.stop = stop_cpsch;
 		dqm->ops.destroy_queue = destroy_queue_cpsch;
 		dqm->ops.update_queue = update_queue;
-		dqm->ops.get_mqd_manager = get_mqd_manager;
 		dqm->ops.register_process = register_process;
 		dqm->ops.unregister_process = unregister_process;
 		dqm->ops.uninitialize = uninitialize;
@@ -1712,7 +1791,6 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 		dqm->ops.create_queue = create_queue_nocpsch;
 		dqm->ops.destroy_queue = destroy_queue_nocpsch;
 		dqm->ops.update_queue = update_queue;
-		dqm->ops.get_mqd_manager = get_mqd_manager;
 		dqm->ops.register_process = register_process;
 		dqm->ops.unregister_process = unregister_process;
 		dqm->ops.initialize = initialize_nocpsch;
@@ -1748,6 +1826,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 	case CHIP_POLARIS10:
 	case CHIP_POLARIS11:
 	case CHIP_POLARIS12:
+	case CHIP_VEGAM:
 		device_queue_manager_init_vi_tonga(&dqm->asic_ops);
 		break;
 
@@ -1755,14 +1834,29 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
 	case CHIP_VEGA12:
 	case CHIP_VEGA20:
 	case CHIP_RAVEN:
+	case CHIP_RENOIR:
+	case CHIP_ARCTURUS:
 		device_queue_manager_init_v9(&dqm->asic_ops);
 		break;
+	case CHIP_NAVI10:
+	case CHIP_NAVI12:
+	case CHIP_NAVI14:
+		device_queue_manager_init_v10_navi10(&dqm->asic_ops);
+		break;
 	default:
 		WARN(1, "Unexpected ASIC family %u",
 		     dev->device_info->asic_family);
 		goto out_free;
 	}
 
+	if (init_mqd_managers(dqm))
+		goto out_free;
+
+	if (allocate_hiq_sdma_mqd(dqm)) {
+		pr_err("Failed to allocate hiq sdma mqd trunk buffer\n");
+		goto out_free;
+	}
+
 	if (!dqm->ops.initialize(dqm))
 		return dqm;
 
@@ -1771,9 +1865,18 @@ out_free:
 	return NULL;
 }
 
+static void deallocate_hiq_sdma_mqd(struct kfd_dev *dev,
+				    struct kfd_mem_obj *mqd)
+{
+	WARN(!mqd, "No hiq sdma mqd trunk to free");
+
+	amdgpu_amdkfd_free_gtt_mem(dev->kgd, mqd->gtt_mem);
+}
+
 void device_queue_manager_uninit(struct device_queue_manager *dqm)
 {
 	dqm->ops.uninitialize(dqm);
+	deallocate_hiq_sdma_mqd(dqm->dev, &dqm->hiq_sdma_mqd);
 	kfree(dqm);
 }
 
@@ -1831,13 +1934,20 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data)
 	int pipe, queue;
 	int r = 0;
 
+	if (!dqm->sched_running) {
+		seq_printf(m, " Device is stopped\n");
+
+		return 0;
+	}
+
 	r = dqm->dev->kfd2kgd->hqd_dump(dqm->dev->kgd,
-		KFD_CIK_HIQ_PIPE, KFD_CIK_HIQ_QUEUE, &dump, &n_regs);
+					KFD_CIK_HIQ_PIPE, KFD_CIK_HIQ_QUEUE,
+					&dump, &n_regs);
 	if (!r) {
 		seq_printf(m, "  HIQ on MEC %d Pipe %d Queue %d\n",
-				KFD_CIK_HIQ_PIPE/get_pipes_per_mec(dqm)+1,
-				KFD_CIK_HIQ_PIPE%get_pipes_per_mec(dqm),
-				KFD_CIK_HIQ_QUEUE);
+			   KFD_CIK_HIQ_PIPE/get_pipes_per_mec(dqm)+1,
+			   KFD_CIK_HIQ_PIPE%get_pipes_per_mec(dqm),
+			   KFD_CIK_HIQ_QUEUE);
 		seq_reg_dump(m, dump, n_regs);
 
 		kfree(dump);
@@ -1864,7 +1974,8 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data)
 		}
 	}
 
-	for (pipe = 0; pipe < get_num_sdma_engines(dqm); pipe++) {
+	for (pipe = 0; pipe < get_num_sdma_engines(dqm) +
+			get_num_xgmi_sdma_engines(dqm); pipe++) {
 		for (queue = 0;
 		     queue < dqm->dev->device_info->num_sdma_queues_per_engine;
 		     queue++) {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 70e38a2e23b9..a8c37e6da027 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -31,8 +31,8 @@
 #include "kfd_priv.h"
 #include "kfd_mqd_manager.h"
 
-#define KFD_UNMAP_LATENCY_MS			(4000)
-#define QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS (2 * KFD_UNMAP_LATENCY_MS + 1000)
+
+#define VMID_NUM 16
 
 struct device_process_node {
 	struct qcm_process_device *qpd;
@@ -48,8 +48,6 @@ struct device_process_node {
  *
  * @update_queue: Queue update routine.
  *
- * @get_mqd_manager: Returns the mqd manager according to the mqd type.
- *
  * @exeute_queues: Dispatches the queues list to the H/W.
  *
  * @register_process: This routine associates a specific process with device.
@@ -97,10 +95,6 @@ struct device_queue_manager_ops {
 	int	(*update_queue)(struct device_queue_manager *dqm,
 				struct queue *q);
 
-	struct mqd_manager * (*get_mqd_manager)
-					(struct device_queue_manager *dqm,
-					enum KFD_MQD_TYPE type);
-
 	int	(*register_process)(struct device_queue_manager *dqm,
 					struct qcm_process_device *qpd);
 
@@ -158,6 +152,8 @@ struct device_queue_manager_asic_ops {
 	void	(*init_sdma_vm)(struct device_queue_manager *dqm,
 				struct queue *q,
 				struct qcm_process_device *qpd);
+	struct mqd_manager *	(*mqd_manager_init)(enum KFD_MQD_TYPE type,
+				 struct kfd_dev *dev);
 };
 
 /**
@@ -185,11 +181,14 @@ struct device_queue_manager {
 	unsigned int		processes_count;
 	unsigned int		queue_count;
 	unsigned int		sdma_queue_count;
+	unsigned int		xgmi_sdma_queue_count;
 	unsigned int		total_queue_count;
 	unsigned int		next_pipe_to_allocate;
 	unsigned int		*allocated_queues;
-	unsigned int		sdma_bitmap;
-	unsigned int		vmid_bitmap;
+	uint64_t		sdma_bitmap;
+	uint64_t		xgmi_sdma_bitmap;
+	/* the pasid mapping for each kfd vmid */
+	uint16_t		vmid_pasid[VMID_NUM];
 	uint64_t		pipelines_addr;
 	struct kfd_mem_obj	*pipeline_mem;
 	uint64_t		fence_gpu_addr;
@@ -201,6 +200,8 @@ struct device_queue_manager {
 	/* hw exception  */
 	bool			is_hws_hang;
 	struct work_struct	hw_exception_work;
+	struct kfd_mem_obj	hiq_sdma_mqd;
+	bool			sched_running;
 };
 
 void device_queue_manager_init_cik(
@@ -213,12 +214,15 @@ void device_queue_manager_init_vi_tonga(
 		struct device_queue_manager_asic_ops *asic_ops);
 void device_queue_manager_init_v9(
 		struct device_queue_manager_asic_ops *asic_ops);
+void device_queue_manager_init_v10_navi10(
+		struct device_queue_manager_asic_ops *asic_ops);
 void program_sh_mem_settings(struct device_queue_manager *dqm,
 					struct qcm_process_device *qpd);
 unsigned int get_queues_num(struct device_queue_manager *dqm);
 unsigned int get_queues_per_pipe(struct device_queue_manager *dqm);
 unsigned int get_pipes_per_mec(struct device_queue_manager *dqm);
 unsigned int get_num_sdma_queues(struct device_queue_manager *dqm);
+unsigned int get_num_xgmi_sdma_queues(struct device_queue_manager *dqm);
 
 static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c
index aed4c21417bf..0d26506798cf 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_cik.c
@@ -48,6 +48,7 @@ void device_queue_manager_init_cik(
 	asic_ops->set_cache_memory_policy = set_cache_memory_policy_cik;
 	asic_ops->update_qpd = update_qpd_cik;
 	asic_ops->init_sdma_vm = init_sdma_vm;
+	asic_ops->mqd_manager_init = mqd_manager_init_cik;
 }
 
 void device_queue_manager_init_cik_hawaii(
@@ -56,6 +57,7 @@ void device_queue_manager_init_cik_hawaii(
 	asic_ops->set_cache_memory_policy = set_cache_memory_policy_cik;
 	asic_ops->update_qpd = update_qpd_cik_hawaii;
 	asic_ops->init_sdma_vm = init_sdma_vm_hawaii;
+	asic_ops->mqd_manager_init = mqd_manager_init_cik_hawaii;
 }
 
 static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c
new file mode 100644
index 000000000000..72e4d61ac752
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v10.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "kfd_device_queue_manager.h"
+#include "navi10_enum.h"
+#include "gc/gc_10_1_0_offset.h"
+#include "gc/gc_10_1_0_sh_mask.h"
+
+static int update_qpd_v10(struct device_queue_manager *dqm,
+			 struct qcm_process_device *qpd);
+static void init_sdma_vm_v10(struct device_queue_manager *dqm, struct queue *q,
+			    struct qcm_process_device *qpd);
+
+void device_queue_manager_init_v10_navi10(
+	struct device_queue_manager_asic_ops *asic_ops)
+{
+	asic_ops->update_qpd = update_qpd_v10;
+	asic_ops->init_sdma_vm = init_sdma_vm_v10;
+	asic_ops->mqd_manager_init = mqd_manager_init_v10;
+}
+
+static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd)
+{
+	uint32_t shared_base = pdd->lds_base >> 48;
+	uint32_t private_base = pdd->scratch_base >> 48;
+
+	return (shared_base << SH_MEM_BASES__SHARED_BASE__SHIFT) |
+		private_base;
+}
+
+static int update_qpd_v10(struct device_queue_manager *dqm,
+			 struct qcm_process_device *qpd)
+{
+	struct kfd_process_device *pdd;
+
+	pdd = qpd_to_pdd(qpd);
+
+	/* check if sh_mem_config register already configured */
+	if (qpd->sh_mem_config == 0) {
+		qpd->sh_mem_config =
+				SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
+					SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT;
+#if 0
+		/* TODO:
+		 *    This shouldn't be an issue with Navi10.  Verify.
+		 */
+		if (vega10_noretry)
+			qpd->sh_mem_config |=
+				1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT;
+#endif
+
+		qpd->sh_mem_ape1_limit = 0;
+		qpd->sh_mem_ape1_base = 0;
+	}
+
+	qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd);
+
+	pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases);
+
+	return 0;
+}
+
+static void init_sdma_vm_v10(struct device_queue_manager *dqm, struct queue *q,
+			    struct qcm_process_device *qpd)
+{
+	/* Not needed on SDMAv4 onwards any more */
+	q->properties.sdma_vm_addr = 0;
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
index 417515332c35..95a82ac455f2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
@@ -37,6 +37,7 @@ void device_queue_manager_init_v9(
 {
 	asic_ops->update_qpd = update_qpd_v9;
 	asic_ops->init_sdma_vm = init_sdma_vm_v9;
+	asic_ops->mqd_manager_init = mqd_manager_init_v9;
 }
 
 static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd)
@@ -60,7 +61,7 @@ static int update_qpd_v9(struct device_queue_manager *dqm,
 		qpd->sh_mem_config =
 				SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
 					SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT;
-		if (noretry &&
+		if (amdgpu_noretry &&
 		    !dqm->dev->device_info->needs_iommu_device)
 			qpd->sh_mem_config |=
 				1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c
index c3a5dcfe877a..3a7cb2f88366 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_vi.c
@@ -54,6 +54,7 @@ void device_queue_manager_init_vi(
 	asic_ops->set_cache_memory_policy = set_cache_memory_policy_vi;
 	asic_ops->update_qpd = update_qpd_vi;
 	asic_ops->init_sdma_vm = init_sdma_vm;
+	asic_ops->mqd_manager_init = mqd_manager_init_vi;
 }
 
 void device_queue_manager_init_vi_tonga(
@@ -62,6 +63,7 @@ void device_queue_manager_init_vi_tonga(
 	asic_ops->set_cache_memory_policy = set_cache_memory_policy_vi_tonga;
 	asic_ops->update_qpd = update_qpd_vi_tonga;
 	asic_ops->init_sdma_vm = init_sdma_vm_tonga;
+	asic_ops->mqd_manager_init = mqd_manager_init_vi_tonga;
 }
 
 static uint32_t compute_sh_mem_bases_64bit(unsigned int top_address_nybble)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index e9f0e0a1b41c..908081c85de1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -852,8 +852,8 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p,
 
 	if (type == KFD_EVENT_TYPE_MEMORY) {
 		dev_warn(kfd_device,
-			"Sending SIGSEGV to HSA Process with PID %d ",
-				p->lead_thread->pid);
+			"Sending SIGSEGV to process %d (pasid 0x%x)",
+				p->lead_thread->pid, p->pasid);
 		send_sig(SIGSEGV, p->lead_thread, 0);
 	}
 
@@ -861,13 +861,13 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p,
 	if (send_signal) {
 		if (send_sigterm) {
 			dev_warn(kfd_device,
-				"Sending SIGTERM to HSA Process with PID %d ",
-					p->lead_thread->pid);
+				"Sending SIGTERM to process %d (pasid 0x%x)",
+					p->lead_thread->pid, p->pasid);
 			send_sig(SIGTERM, p->lead_thread, 0);
 		} else {
 			dev_err(kfd_device,
-				"HSA Process (PID %d) got unhandled exception",
-				p->lead_thread->pid);
+				"Process %d (pasid 0x%x) got unhandled exception",
+				p->lead_thread->pid, p->pasid);
 		}
 	}
 }
@@ -936,7 +936,8 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid,
 	/* Workaround on Raven to not kill the process when memory is freed
 	 * before IOMMU is able to finish processing all the excessive PPRs
 	 */
-	if (dev->device_info->asic_family != CHIP_RAVEN) {
+	if (dev->device_info->asic_family != CHIP_RAVEN &&
+	    dev->device_info->asic_family != CHIP_RENOIR) {
 		mutex_lock(&p->event_mutex);
 
 		/* Lookup events by type and signal them */
@@ -983,7 +984,7 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
 		return; /* Presumably process exited. */
 	memset(&memory_exception_data, 0, sizeof(memory_exception_data));
 	memory_exception_data.gpu_id = dev->id;
-	memory_exception_data.failure.imprecise = 1;
+	memory_exception_data.failure.imprecise = true;
 	/* Set failure reason */
 	if (info) {
 		memory_exception_data.va = (info->page_addr) << PAGE_SHIFT;
@@ -1011,25 +1012,41 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
 void kfd_signal_reset_event(struct kfd_dev *dev)
 {
 	struct kfd_hsa_hw_exception_data hw_exception_data;
+	struct kfd_hsa_memory_exception_data memory_exception_data;
 	struct kfd_process *p;
 	struct kfd_event *ev;
 	unsigned int temp;
 	uint32_t id, idx;
+	int reset_cause = atomic_read(&dev->sram_ecc_flag) ?
+			KFD_HW_EXCEPTION_ECC :
+			KFD_HW_EXCEPTION_GPU_HANG;
 
 	/* Whole gpu reset caused by GPU hang and memory is lost */
 	memset(&hw_exception_data, 0, sizeof(hw_exception_data));
 	hw_exception_data.gpu_id = dev->id;
 	hw_exception_data.memory_lost = 1;
+	hw_exception_data.reset_cause = reset_cause;
+
+	memset(&memory_exception_data, 0, sizeof(memory_exception_data));
+	memory_exception_data.ErrorType = KFD_MEM_ERR_SRAM_ECC;
+	memory_exception_data.gpu_id = dev->id;
+	memory_exception_data.failure.imprecise = true;
 
 	idx = srcu_read_lock(&kfd_processes_srcu);
 	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
 		mutex_lock(&p->event_mutex);
 		id = KFD_FIRST_NONSIGNAL_EVENT_ID;
-		idr_for_each_entry_continue(&p->event_idr, ev, id)
+		idr_for_each_entry_continue(&p->event_idr, ev, id) {
 			if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
 				ev->hw_exception_data = hw_exception_data;
 				set_event(ev);
 			}
+			if (ev->type == KFD_EVENT_TYPE_MEMORY &&
+			    reset_cause == KFD_HW_EXCEPTION_ECC) {
+				ev->memory_exception_data = memory_exception_data;
+				set_event(ev);
+			}
+		}
 		mutex_unlock(&p->event_mutex);
 	}
 	srcu_read_unlock(&kfd_processes_srcu, idx);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
index 213ea5454d11..bb77b8890e77 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
@@ -369,8 +369,13 @@ int kfd_init_apertures(struct kfd_process *process)
 
 	/*Iterating over all devices*/
 	while (kfd_topology_enum_kfd_devices(id, &dev) == 0) {
-		if (!dev) {
-			id++; /* Skip non GPU devices */
+		if (!dev || kfd_devcgroup_check_permission(dev)) {
+			/* Skip non GPU devices and devices to which the
+			 * current process have no access to. Access can be
+			 * limited by placing the process in a specific
+			 * cgroup hierarchy
+			 */
+			id++;
 			continue;
 		}
 
@@ -398,12 +403,18 @@ int kfd_init_apertures(struct kfd_process *process)
 			case CHIP_POLARIS10:
 			case CHIP_POLARIS11:
 			case CHIP_POLARIS12:
+			case CHIP_VEGAM:
 				kfd_init_apertures_vi(pdd, id);
 				break;
 			case CHIP_VEGA10:
 			case CHIP_VEGA12:
 			case CHIP_VEGA20:
 			case CHIP_RAVEN:
+			case CHIP_RENOIR:
+			case CHIP_ARCTURUS:
+			case CHIP_NAVI10:
+			case CHIP_NAVI12:
+			case CHIP_NAVI14:
 				kfd_init_apertures_v9(pdd, id);
 				break;
 			default:
@@ -435,5 +446,3 @@ int kfd_init_apertures(struct kfd_process *process)
 
 	return 0;
 }
-
-
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index a85904ad0d5f..e05d75ecda21 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -54,8 +54,7 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev,
 		memcpy(patched_ihre, ih_ring_entry,
 				dev->device_info->ih_ring_entry_size);
 
-		pasid = dev->kfd2kgd->get_atc_vmid_pasid_mapping_pasid(
-				dev->kgd, vmid);
+		pasid = dev->dqm->vmid_pasid[vmid];
 
 		/* Patch the pasid field */
 		patched_ihre[3] = cpu_to_le32((le32_to_cpu(patched_ihre[3])
@@ -80,6 +79,7 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev,
 		source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
 		source_id == SOC15_INTSRC_CP_BAD_OPCODE ||
 		client_id == SOC15_IH_CLIENTID_VMC ||
+		client_id == SOC15_IH_CLIENTID_VMC1 ||
 		client_id == SOC15_IH_CLIENTID_UTCL2;
 }
 
@@ -104,6 +104,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
 	else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE)
 		kfd_signal_hw_exception_event(pasid);
 	else if (client_id == SOC15_IH_CLIENTID_VMC ||
+		client_id == SOC15_IH_CLIENTID_VMC1 ||
 		 client_id == SOC15_IH_CLIENTID_UTCL2) {
 		struct kfd_vm_fault_info info = {0};
 		uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c
index c56ac47cd318..bc47f6a44456 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_interrupt.c
@@ -62,6 +62,11 @@ int kfd_interrupt_init(struct kfd_dev *kfd)
 	}
 
 	kfd->ih_wq = alloc_workqueue("KFD IH", WQ_HIGHPRI, 1);
+	if (unlikely(!kfd->ih_wq)) {
+		kfifo_free(&kfd->ih_fifo);
+		dev_err(kfd_chardev(), "Failed to allocate KFD IH workqueue\n");
+		return -ENOMEM;
+	}
 	spin_lock_init(&kfd->interrupt_lock);
 
 	INIT_WORK(&kfd->interrupt_work, interrupt_wq);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c
index 01494752c36a..193e2835bd4d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_iommu.c
@@ -66,16 +66,8 @@ int kfd_iommu_device_init(struct kfd_dev *kfd)
 
 	top_dev = kfd_topology_device_by_id(kfd->id);
 
-	/*
-	 * Overwrite ATS capability according to needs_iommu_device to fix
-	 * potential missing corresponding bit in CRAT of BIOS.
-	 */
-	if (!kfd->device_info->needs_iommu_device) {
-		top_dev->node_props.capability &= ~HSA_CAP_ATS_PRESENT;
+	if (!kfd->device_info->needs_iommu_device)
 		return 0;
-	}
-
-	top_dev->node_props.capability |= HSA_CAP_ATS_PRESENT;
 
 	iommu_info.flags = 0;
 	err = amd_iommu_device_info(kfd->pdev, &iommu_info);
@@ -168,7 +160,7 @@ static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid)
 	if (!p)
 		return;
 
-	pr_debug("Unbinding process %d from IOMMU\n", pasid);
+	pr_debug("Unbinding process 0x%x from IOMMU\n", pasid);
 
 	mutex_lock(kfd_get_dbgmgr_mutex());
 
@@ -202,7 +194,7 @@ static int iommu_invalid_ppr_cb(struct pci_dev *pdev, int pasid,
 	struct kfd_dev *dev;
 
 	dev_warn_ratelimited(kfd_device,
-			"Invalid PPR device %x:%x.%x pasid %d address 0x%lX flags 0x%X",
+			"Invalid PPR device %x:%x.%x pasid 0x%x address 0x%lX flags 0x%X",
 			PCI_BUS_NUM(pdev->devfn),
 			PCI_SLOT(pdev->devfn),
 			PCI_FUNC(pdev->devfn),
@@ -243,7 +235,7 @@ static int kfd_bind_processes_to_device(struct kfd_dev *kfd)
 		err = amd_iommu_bind_pasid(kfd->pdev, p->pasid,
 				p->lead_thread);
 		if (err < 0) {
-			pr_err("Unexpected pasid %d binding failure\n",
+			pr_err("Unexpected pasid 0x%x binding failure\n",
 					p->pasid);
 			mutex_unlock(&p->mutex);
 			break;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index f1596881f20a..11d244891393 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -58,9 +58,10 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
 	kq->nop_packet = nop.u32all;
 	switch (type) {
 	case KFD_QUEUE_TYPE_DIQ:
+		kq->mqd_mgr = dev->dqm->mqd_mgrs[KFD_MQD_TYPE_DIQ];
+		break;
 	case KFD_QUEUE_TYPE_HIQ:
-		kq->mqd_mgr = dev->dqm->ops.get_mqd_manager(dev->dqm,
-						KFD_MQD_TYPE_HIQ);
+		kq->mqd_mgr = dev->dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ];
 		break;
 	default:
 		pr_err("Invalid queue type %d\n", type);
@@ -131,13 +132,14 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
 	kq->queue->device = dev;
 	kq->queue->process = kfd_get_process(current);
 
-	retval = kq->mqd_mgr->init_mqd(kq->mqd_mgr, &kq->queue->mqd,
-					&kq->queue->mqd_mem_obj,
+	kq->queue->mqd_mem_obj = kq->mqd_mgr->allocate_mqd(kq->mqd_mgr->dev,
+					&kq->queue->properties);
+	if (!kq->queue->mqd_mem_obj)
+		goto err_allocate_mqd;
+	kq->mqd_mgr->init_mqd(kq->mqd_mgr, &kq->queue->mqd,
+					kq->queue->mqd_mem_obj,
 					&kq->queue->gart_mqd_addr,
 					&kq->queue->properties);
-	if (retval != 0)
-		goto err_init_mqd;
-
 	/* assign HIQ to HQD */
 	if (type == KFD_QUEUE_TYPE_HIQ) {
 		pr_debug("Assigning hiq to hqd\n");
@@ -163,7 +165,8 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
 
 	return true;
 err_alloc_fence:
-err_init_mqd:
+	kq->mqd_mgr->free_mqd(kq->mqd_mgr, kq->queue->mqd, kq->queue->mqd_mem_obj);
+err_allocate_mqd:
 	uninit_queue(kq->queue);
 err_init_queue:
 	kfd_gtt_sa_free(dev, kq->wptr_mem);
@@ -192,7 +195,7 @@ static void uninitialize(struct kernel_queue *kq)
 	else if (kq->queue->properties.type == KFD_QUEUE_TYPE_DIQ)
 		kfd_gtt_sa_free(kq->dev, kq->fence_mem_obj);
 
-	kq->mqd_mgr->uninit_mqd(kq->mqd_mgr, kq->queue->mqd,
+	kq->mqd_mgr->free_mqd(kq->mqd_mgr, kq->queue->mqd,
 				kq->queue->mqd_mem_obj);
 
 	kfd_gtt_sa_free(kq->dev, kq->rptr_mem);
@@ -314,6 +317,7 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
 	case CHIP_POLARIS10:
 	case CHIP_POLARIS11:
 	case CHIP_POLARIS12:
+	case CHIP_VEGAM:
 		kernel_queue_init_vi(&kq->ops_asic_specific);
 		break;
 
@@ -326,8 +330,15 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
 	case CHIP_VEGA12:
 	case CHIP_VEGA20:
 	case CHIP_RAVEN:
+	case CHIP_RENOIR:
+	case CHIP_ARCTURUS:
 		kernel_queue_init_v9(&kq->ops_asic_specific);
 		break;
+	case CHIP_NAVI10:
+	case CHIP_NAVI12:
+	case CHIP_NAVI14:
+		kernel_queue_init_v10(&kq->ops_asic_specific);
+		break;
 	default:
 		WARN(1, "Unexpected ASIC family %u",
 		     dev->device_info->asic_family);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h
index a7116a939029..365fc674fea4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h
@@ -102,5 +102,6 @@ struct kernel_queue {
 void kernel_queue_init_cik(struct kernel_queue_ops *ops);
 void kernel_queue_init_vi(struct kernel_queue_ops *ops);
 void kernel_queue_init_v9(struct kernel_queue_ops *ops);
+void kernel_queue_init_v10(struct kernel_queue_ops *ops);
 
 #endif /* KFD_KERNEL_QUEUE_H_ */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v10.c
new file mode 100644
index 000000000000..aed32ab7102e
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v10.c
@@ -0,0 +1,348 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "kfd_kernel_queue.h"
+#include "kfd_device_queue_manager.h"
+#include "kfd_pm4_headers_ai.h"
+#include "kfd_pm4_opcodes.h"
+#include "gc/gc_10_1_0_sh_mask.h"
+
+static bool initialize_v10(struct kernel_queue *kq, struct kfd_dev *dev,
+			enum kfd_queue_type type, unsigned int queue_size);
+static void uninitialize_v10(struct kernel_queue *kq);
+static void submit_packet_v10(struct kernel_queue *kq);
+
+void kernel_queue_init_v10(struct kernel_queue_ops *ops)
+{
+	ops->initialize = initialize_v10;
+	ops->uninitialize = uninitialize_v10;
+	ops->submit_packet = submit_packet_v10;
+}
+
+static bool initialize_v10(struct kernel_queue *kq, struct kfd_dev *dev,
+			enum kfd_queue_type type, unsigned int queue_size)
+{
+	int retval;
+
+	retval = kfd_gtt_sa_allocate(dev, PAGE_SIZE, &kq->eop_mem);
+	if (retval != 0)
+		return false;
+
+	kq->eop_gpu_addr = kq->eop_mem->gpu_addr;
+	kq->eop_kernel_addr = kq->eop_mem->cpu_ptr;
+
+	memset(kq->eop_kernel_addr, 0, PAGE_SIZE);
+
+	return true;
+}
+
+static void uninitialize_v10(struct kernel_queue *kq)
+{
+	kfd_gtt_sa_free(kq->dev, kq->eop_mem);
+}
+
+static void submit_packet_v10(struct kernel_queue *kq)
+{
+	*kq->wptr64_kernel = kq->pending_wptr64;
+	write_kernel_doorbell64(kq->queue->properties.doorbell_ptr,
+				kq->pending_wptr64);
+}
+
+static int pm_map_process_v10(struct packet_manager *pm,
+		uint32_t *buffer, struct qcm_process_device *qpd)
+{
+	struct pm4_mes_map_process *packet;
+	uint64_t vm_page_table_base_addr = qpd->page_table_base;
+
+	packet = (struct pm4_mes_map_process *)buffer;
+	memset(buffer, 0, sizeof(struct pm4_mes_map_process));
+
+	packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS,
+					sizeof(struct pm4_mes_map_process));
+	packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
+	packet->bitfields2.process_quantum = 1;
+	packet->bitfields2.pasid = qpd->pqm->process->pasid;
+	packet->bitfields14.gds_size = qpd->gds_size;
+	packet->bitfields14.num_gws = qpd->num_gws;
+	packet->bitfields14.num_oac = qpd->num_oac;
+	packet->bitfields14.sdma_enable = 1;
+
+	packet->bitfields14.num_queues = (qpd->is_debug) ? 0 : qpd->queue_count;
+
+	packet->sh_mem_config = qpd->sh_mem_config;
+	packet->sh_mem_bases = qpd->sh_mem_bases;
+	if (qpd->tba_addr) {
+		packet->sq_shader_tba_lo = lower_32_bits(qpd->tba_addr >> 8);
+		packet->sq_shader_tba_hi = (1 << SQ_SHADER_TBA_HI__TRAP_EN__SHIFT) |
+			upper_32_bits(qpd->tba_addr >> 8);
+		packet->sq_shader_tma_lo = lower_32_bits(qpd->tma_addr >> 8);
+		packet->sq_shader_tma_hi = upper_32_bits(qpd->tma_addr >> 8);
+	}
+
+	packet->gds_addr_lo = lower_32_bits(qpd->gds_context_area);
+	packet->gds_addr_hi = upper_32_bits(qpd->gds_context_area);
+
+	packet->vm_context_page_table_base_addr_lo32 =
+			lower_32_bits(vm_page_table_base_addr);
+	packet->vm_context_page_table_base_addr_hi32 =
+			upper_32_bits(vm_page_table_base_addr);
+
+	return 0;
+}
+
+static int pm_runlist_v10(struct packet_manager *pm, uint32_t *buffer,
+			uint64_t ib, size_t ib_size_in_dwords, bool chain)
+{
+	struct pm4_mes_runlist *packet;
+
+	int concurrent_proc_cnt = 0;
+	struct kfd_dev *kfd = pm->dqm->dev;
+
+	/* Determine the number of processes to map together to HW:
+	 * it can not exceed the number of VMIDs available to the
+	 * scheduler, and it is determined by the smaller of the number
+	 * of processes in the runlist and kfd module parameter
+	 * hws_max_conc_proc.
+	 * Note: the arbitration between the number of VMIDs and
+	 * hws_max_conc_proc has been done in
+	 * kgd2kfd_device_init().
+	 */
+	concurrent_proc_cnt = min(pm->dqm->processes_count,
+			kfd->max_proc_per_quantum);
+
+
+	packet = (struct pm4_mes_runlist *)buffer;
+
+	memset(buffer, 0, sizeof(struct pm4_mes_runlist));
+	packet->header.u32All = pm_build_pm4_header(IT_RUN_LIST,
+						sizeof(struct pm4_mes_runlist));
+
+	packet->bitfields4.ib_size = ib_size_in_dwords;
+	packet->bitfields4.chain = chain ? 1 : 0;
+	packet->bitfields4.offload_polling = 0;
+	packet->bitfields4.valid = 1;
+	packet->bitfields4.process_cnt = concurrent_proc_cnt;
+	packet->ordinal2 = lower_32_bits(ib);
+	packet->ib_base_hi = upper_32_bits(ib);
+
+	return 0;
+}
+
+static int pm_map_queues_v10(struct packet_manager *pm, uint32_t *buffer,
+		struct queue *q, bool is_static)
+{
+	struct pm4_mes_map_queues *packet;
+	bool use_static = is_static;
+
+	packet = (struct pm4_mes_map_queues *)buffer;
+	memset(buffer, 0, sizeof(struct pm4_mes_map_queues));
+
+	packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES,
+					sizeof(struct pm4_mes_map_queues));
+	packet->bitfields2.num_queues = 1;
+	packet->bitfields2.queue_sel =
+		queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi;
+
+	packet->bitfields2.engine_sel =
+		engine_sel__mes_map_queues__compute_vi;
+	packet->bitfields2.queue_type =
+		queue_type__mes_map_queues__normal_compute_vi;
+
+	switch (q->properties.type) {
+	case KFD_QUEUE_TYPE_COMPUTE:
+		if (use_static)
+			packet->bitfields2.queue_type =
+		queue_type__mes_map_queues__normal_latency_static_queue_vi;
+		break;
+	case KFD_QUEUE_TYPE_DIQ:
+		packet->bitfields2.queue_type =
+			queue_type__mes_map_queues__debug_interface_queue_vi;
+		break;
+	case KFD_QUEUE_TYPE_SDMA:
+	case KFD_QUEUE_TYPE_SDMA_XGMI:
+		packet->bitfields2.engine_sel = q->properties.sdma_engine_id +
+				engine_sel__mes_map_queues__sdma0_vi;
+		use_static = false; /* no static queues under SDMA */
+		break;
+	default:
+		WARN(1, "queue type %d\n", q->properties.type);
+		return -EINVAL;
+	}
+	packet->bitfields3.doorbell_offset =
+			q->properties.doorbell_off;
+
+	packet->mqd_addr_lo =
+			lower_32_bits(q->gart_mqd_addr);
+
+	packet->mqd_addr_hi =
+			upper_32_bits(q->gart_mqd_addr);
+
+	packet->wptr_addr_lo =
+			lower_32_bits((uint64_t)q->properties.write_ptr);
+
+	packet->wptr_addr_hi =
+			upper_32_bits((uint64_t)q->properties.write_ptr);
+
+	return 0;
+}
+
+static int pm_unmap_queues_v10(struct packet_manager *pm, uint32_t *buffer,
+			enum kfd_queue_type type,
+			enum kfd_unmap_queues_filter filter,
+			uint32_t filter_param, bool reset,
+			unsigned int sdma_engine)
+{
+	struct pm4_mes_unmap_queues *packet;
+
+	packet = (struct pm4_mes_unmap_queues *)buffer;
+	memset(buffer, 0, sizeof(struct pm4_mes_unmap_queues));
+
+	packet->header.u32All = pm_build_pm4_header(IT_UNMAP_QUEUES,
+					sizeof(struct pm4_mes_unmap_queues));
+	switch (type) {
+	case KFD_QUEUE_TYPE_COMPUTE:
+	case KFD_QUEUE_TYPE_DIQ:
+		packet->bitfields2.engine_sel =
+			engine_sel__mes_unmap_queues__compute;
+		break;
+	case KFD_QUEUE_TYPE_SDMA:
+	case KFD_QUEUE_TYPE_SDMA_XGMI:
+		packet->bitfields2.engine_sel =
+			engine_sel__mes_unmap_queues__sdma0 + sdma_engine;
+		break;
+	default:
+		WARN(1, "queue type %d\n", type);
+		break;
+	}
+
+	if (reset)
+		packet->bitfields2.action =
+			action__mes_unmap_queues__reset_queues;
+	else
+		packet->bitfields2.action =
+			action__mes_unmap_queues__preempt_queues;
+
+	switch (filter) {
+	case KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE:
+		packet->bitfields2.queue_sel =
+			queue_sel__mes_unmap_queues__perform_request_on_specified_queues;
+		packet->bitfields2.num_queues = 1;
+		packet->bitfields3b.doorbell_offset0 = filter_param;
+		break;
+	case KFD_UNMAP_QUEUES_FILTER_BY_PASID:
+		packet->bitfields2.queue_sel =
+			queue_sel__mes_unmap_queues__perform_request_on_pasid_queues;
+		packet->bitfields3a.pasid = filter_param;
+		break;
+	case KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES:
+		packet->bitfields2.queue_sel =
+			queue_sel__mes_unmap_queues__unmap_all_queues;
+		break;
+	case KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES:
+		/* in this case, we do not preempt static queues */
+		packet->bitfields2.queue_sel =
+			queue_sel__mes_unmap_queues__unmap_all_non_static_queues;
+		break;
+	default:
+		WARN(1, "filter %d\n", filter);
+		break;
+	}
+
+	return 0;
+
+}
+
+static int pm_query_status_v10(struct packet_manager *pm, uint32_t *buffer,
+			uint64_t fence_address,	uint32_t fence_value)
+{
+	struct pm4_mes_query_status *packet;
+
+	packet = (struct pm4_mes_query_status *)buffer;
+	memset(buffer, 0, sizeof(struct pm4_mes_query_status));
+
+
+	packet->header.u32All = pm_build_pm4_header(IT_QUERY_STATUS,
+					sizeof(struct pm4_mes_query_status));
+
+	packet->bitfields2.context_id = 0;
+	packet->bitfields2.interrupt_sel =
+			interrupt_sel__mes_query_status__completion_status;
+	packet->bitfields2.command =
+			command__mes_query_status__fence_only_after_write_ack;
+
+	packet->addr_hi = upper_32_bits((uint64_t)fence_address);
+	packet->addr_lo = lower_32_bits((uint64_t)fence_address);
+	packet->data_hi = upper_32_bits((uint64_t)fence_value);
+	packet->data_lo = lower_32_bits((uint64_t)fence_value);
+
+	return 0;
+}
+
+
+static int pm_release_mem_v10(uint64_t gpu_addr, uint32_t *buffer)
+{
+	struct pm4_mec_release_mem *packet;
+
+	WARN_ON(!buffer);
+
+	packet = (struct pm4_mec_release_mem *)buffer;
+	memset(buffer, 0, sizeof(struct pm4_mec_release_mem));
+
+	packet->header.u32All = pm_build_pm4_header(IT_RELEASE_MEM,
+					sizeof(struct pm4_mec_release_mem));
+
+	packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT;
+	packet->bitfields2.event_index = event_index__mec_release_mem__end_of_pipe;
+	packet->bitfields2.tcl1_action_ena = 1;
+	packet->bitfields2.tc_action_ena = 1;
+	packet->bitfields2.cache_policy = cache_policy__mec_release_mem__lru;
+
+	packet->bitfields3.data_sel = data_sel__mec_release_mem__send_32_bit_low;
+	packet->bitfields3.int_sel =
+		int_sel__mec_release_mem__send_interrupt_after_write_confirm;
+
+	packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2;
+	packet->address_hi = upper_32_bits(gpu_addr);
+
+	packet->data_lo = 0;
+
+	return sizeof(struct pm4_mec_release_mem) / sizeof(unsigned int);
+}
+
+const struct packet_manager_funcs kfd_v10_pm_funcs = {
+	.map_process			= pm_map_process_v10,
+	.runlist			= pm_runlist_v10,
+	.set_resources			= pm_set_resources_vi,
+	.map_queues			= pm_map_queues_v10,
+	.unmap_queues			= pm_unmap_queues_v10,
+	.query_status			= pm_query_status_v10,
+	.release_mem			= pm_release_mem_v10,
+	.map_process_size		= sizeof(struct pm4_mes_map_process),
+	.runlist_size			= sizeof(struct pm4_mes_runlist),
+	.set_resources_size		= sizeof(struct pm4_mes_set_resources),
+	.map_queues_size		= sizeof(struct pm4_mes_map_queues),
+	.unmap_queues_size		= sizeof(struct pm4_mes_unmap_queues),
+	.query_status_size		= sizeof(struct pm4_mes_query_status),
+	.release_mem_size		= sizeof(struct pm4_mec_release_mem)
+};
+
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
index 33830b1a5a54..9a4bafb2e175 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c
@@ -81,7 +81,8 @@ static int pm_map_process_v9(struct packet_manager *pm,
 	packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
 	packet->bitfields2.process_quantum = 1;
 	packet->bitfields2.pasid = qpd->pqm->process->pasid;
-	packet->bitfields14.gds_size = qpd->gds_size;
+	packet->bitfields14.gds_size = qpd->gds_size & 0x3F;
+	packet->bitfields14.gds_size_hi = (qpd->gds_size >> 6) & 0xF;
 	packet->bitfields14.num_gws = qpd->num_gws;
 	packet->bitfields14.num_oac = qpd->num_oac;
 	packet->bitfields14.sdma_enable = 1;
@@ -134,6 +135,7 @@ static int pm_runlist_v9(struct packet_manager *pm, uint32_t *buffer,
 	packet->bitfields4.ib_size = ib_size_in_dwords;
 	packet->bitfields4.chain = chain ? 1 : 0;
 	packet->bitfields4.offload_polling = 0;
+	packet->bitfields4.chained_runlist_idle_disable = chain ? 1 : 0;
 	packet->bitfields4.valid = 1;
 	packet->bitfields4.process_cnt = concurrent_proc_cnt;
 	packet->ordinal2 = lower_32_bits(ib);
@@ -142,6 +144,34 @@ static int pm_runlist_v9(struct packet_manager *pm, uint32_t *buffer,
 	return 0;
 }
 
+static int pm_set_resources_v9(struct packet_manager *pm, uint32_t *buffer,
+				struct scheduling_resources *res)
+{
+	struct pm4_mes_set_resources *packet;
+
+	packet = (struct pm4_mes_set_resources *)buffer;
+	memset(buffer, 0, sizeof(struct pm4_mes_set_resources));
+
+	packet->header.u32All = pm_build_pm4_header(IT_SET_RESOURCES,
+					sizeof(struct pm4_mes_set_resources));
+
+	packet->bitfields2.queue_type =
+			queue_type__mes_set_resources__hsa_interface_queue_hiq;
+	packet->bitfields2.vmid_mask = res->vmid_mask;
+	packet->bitfields2.unmap_latency = KFD_UNMAP_LATENCY_MS / 100;
+	packet->bitfields7.oac_mask = res->oac_mask;
+	packet->bitfields8.gds_heap_base = res->gds_heap_base;
+	packet->bitfields8.gds_heap_size = res->gds_heap_size;
+
+	packet->gws_mask_lo = lower_32_bits(res->gws_mask);
+	packet->gws_mask_hi = upper_32_bits(res->gws_mask);
+
+	packet->queue_mask_lo = lower_32_bits(res->queue_mask);
+	packet->queue_mask_hi = upper_32_bits(res->queue_mask);
+
+	return 0;
+}
+
 static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer,
 		struct queue *q, bool is_static)
 {
@@ -153,14 +183,15 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer,
 
 	packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES,
 					sizeof(struct pm4_mes_map_queues));
-	packet->bitfields2.alloc_format =
-		alloc_format__mes_map_queues__one_per_pipe_vi;
 	packet->bitfields2.num_queues = 1;
 	packet->bitfields2.queue_sel =
 		queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi;
 
 	packet->bitfields2.engine_sel =
 		engine_sel__mes_map_queues__compute_vi;
+	packet->bitfields2.gws_control_queue = q->gws ? 1 : 0;
+	packet->bitfields2.extended_engine_sel =
+		extended_engine_sel__mes_map_queues__legacy_engine_sel;
 	packet->bitfields2.queue_type =
 		queue_type__mes_map_queues__normal_compute_vi;
 
@@ -175,9 +206,16 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer,
 			queue_type__mes_map_queues__debug_interface_queue_vi;
 		break;
 	case KFD_QUEUE_TYPE_SDMA:
-		packet->bitfields2.engine_sel = q->properties.sdma_engine_id +
-				engine_sel__mes_map_queues__sdma0_vi;
+	case KFD_QUEUE_TYPE_SDMA_XGMI:
 		use_static = false; /* no static queues under SDMA */
+		if (q->properties.sdma_engine_id < 2)
+			packet->bitfields2.engine_sel = q->properties.sdma_engine_id +
+				engine_sel__mes_map_queues__sdma0_vi;
+		else {
+			packet->bitfields2.extended_engine_sel =
+				extended_engine_sel__mes_map_queues__sdma0_to_7_sel;
+			packet->bitfields2.engine_sel = q->properties.sdma_engine_id;
+		}
 		break;
 	default:
 		WARN(1, "queue type %d", q->properties.type);
@@ -217,12 +255,23 @@ static int pm_unmap_queues_v9(struct packet_manager *pm, uint32_t *buffer,
 	switch (type) {
 	case KFD_QUEUE_TYPE_COMPUTE:
 	case KFD_QUEUE_TYPE_DIQ:
+		packet->bitfields2.extended_engine_sel =
+			extended_engine_sel__mes_unmap_queues__legacy_engine_sel;
 		packet->bitfields2.engine_sel =
 			engine_sel__mes_unmap_queues__compute;
 		break;
 	case KFD_QUEUE_TYPE_SDMA:
-		packet->bitfields2.engine_sel =
-			engine_sel__mes_unmap_queues__sdma0 + sdma_engine;
+	case KFD_QUEUE_TYPE_SDMA_XGMI:
+		if (sdma_engine < 2) {
+			packet->bitfields2.extended_engine_sel =
+				extended_engine_sel__mes_unmap_queues__legacy_engine_sel;
+			packet->bitfields2.engine_sel =
+				engine_sel__mes_unmap_queues__sdma0 + sdma_engine;
+		} else {
+			packet->bitfields2.extended_engine_sel =
+				extended_engine_sel__mes_unmap_queues__sdma0_to_7_sel;
+			packet->bitfields2.engine_sel = sdma_engine;
+		}
 		break;
 	default:
 		WARN(1, "queue type %d", type);
@@ -324,7 +373,7 @@ static int pm_release_mem_v9(uint64_t gpu_addr, uint32_t *buffer)
 const struct packet_manager_funcs kfd_v9_pm_funcs = {
 	.map_process		= pm_map_process_v9,
 	.runlist		= pm_runlist_v9,
-	.set_resources		= pm_set_resources_vi,
+	.set_resources		= pm_set_resources_v9,
 	.map_queues		= pm_map_queues_v9,
 	.unmap_queues		= pm_unmap_queues_v9,
 	.query_status		= pm_query_status_v9,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c
index bf20c6d32ef3..2adaf40027eb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c
@@ -190,8 +190,6 @@ static int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer,
 
 	packet->header.u32All = pm_build_pm4_header(IT_MAP_QUEUES,
 					sizeof(struct pm4_mes_map_queues));
-	packet->bitfields2.alloc_format =
-		alloc_format__mes_map_queues__one_per_pipe_vi;
 	packet->bitfields2.num_queues = 1;
 	packet->bitfields2.queue_sel =
 		queue_sel__mes_map_queues__map_to_hws_determined_queue_slots_vi;
@@ -212,6 +210,7 @@ static int pm_map_queues_vi(struct packet_manager *pm, uint32_t *buffer,
 			queue_type__mes_map_queues__debug_interface_queue_vi;
 		break;
 	case KFD_QUEUE_TYPE_SDMA:
+	case KFD_QUEUE_TYPE_SDMA_XGMI:
 		packet->bitfields2.engine_sel = q->properties.sdma_engine_id +
 				engine_sel__mes_map_queues__sdma0_vi;
 		use_static = false; /* no static queues under SDMA */
@@ -258,6 +257,7 @@ static int pm_unmap_queues_vi(struct packet_manager *pm, uint32_t *buffer,
 			engine_sel__mes_unmap_queues__compute;
 		break;
 	case KFD_QUEUE_TYPE_SDMA:
+	case KFD_QUEUE_TYPE_SDMA_XGMI:
 		packet->bitfields2.engine_sel =
 			engine_sel__mes_unmap_queues__sdma0 + sdma_engine;
 		break;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
index 932007eb9168..f4b7f7e6c40e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
@@ -56,6 +56,11 @@ static int kfd_init(void)
 	if (err < 0)
 		goto err_create_wq;
 
+	/* Ignore the return value, so that we can continue
+	 * to init the KFD, even if procfs isn't craated
+	 */
+	kfd_procfs_init();
+
 	kfd_debugfs_init();
 
 	return 0;
@@ -72,11 +77,12 @@ static void kfd_exit(void)
 {
 	kfd_debugfs_fini();
 	kfd_process_destroy_wq();
+	kfd_procfs_shutdown();
 	kfd_topology_shutdown();
 	kfd_chardev_exit();
 }
 
-int kgd2kfd_init()
+int kgd2kfd_init(void)
 {
 	return kfd_init();
 }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
index aed9b9b82213..88813dad731f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
@@ -23,34 +23,74 @@
 
 #include "kfd_mqd_manager.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_device_queue_manager.h"
 
-struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type,
-					struct kfd_dev *dev)
+/* Mapping queue priority to pipe priority, indexed by queue priority */
+int pipe_priority_map[] = {
+	KFD_PIPE_PRIORITY_CS_LOW,
+	KFD_PIPE_PRIORITY_CS_LOW,
+	KFD_PIPE_PRIORITY_CS_LOW,
+	KFD_PIPE_PRIORITY_CS_LOW,
+	KFD_PIPE_PRIORITY_CS_LOW,
+	KFD_PIPE_PRIORITY_CS_LOW,
+	KFD_PIPE_PRIORITY_CS_LOW,
+	KFD_PIPE_PRIORITY_CS_MEDIUM,
+	KFD_PIPE_PRIORITY_CS_MEDIUM,
+	KFD_PIPE_PRIORITY_CS_MEDIUM,
+	KFD_PIPE_PRIORITY_CS_MEDIUM,
+	KFD_PIPE_PRIORITY_CS_HIGH,
+	KFD_PIPE_PRIORITY_CS_HIGH,
+	KFD_PIPE_PRIORITY_CS_HIGH,
+	KFD_PIPE_PRIORITY_CS_HIGH,
+	KFD_PIPE_PRIORITY_CS_HIGH
+};
+
+struct kfd_mem_obj *allocate_hiq_mqd(struct kfd_dev *dev, struct queue_properties *q)
 {
-	switch (dev->device_info->asic_family) {
-	case CHIP_KAVERI:
-		return mqd_manager_init_cik(type, dev);
-	case CHIP_HAWAII:
-		return mqd_manager_init_cik_hawaii(type, dev);
-	case CHIP_CARRIZO:
-		return mqd_manager_init_vi(type, dev);
-	case CHIP_TONGA:
-	case CHIP_FIJI:
-	case CHIP_POLARIS10:
-	case CHIP_POLARIS11:
-	case CHIP_POLARIS12:
-		return mqd_manager_init_vi_tonga(type, dev);
-	case CHIP_VEGA10:
-	case CHIP_VEGA12:
-	case CHIP_VEGA20:
-	case CHIP_RAVEN:
-		return mqd_manager_init_v9(type, dev);
-	default:
-		WARN(1, "Unexpected ASIC family %u",
-		     dev->device_info->asic_family);
-	}
+	struct kfd_mem_obj *mqd_mem_obj = NULL;
+
+	mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL);
+	if (!mqd_mem_obj)
+		return NULL;
+
+	mqd_mem_obj->gtt_mem = dev->dqm->hiq_sdma_mqd.gtt_mem;
+	mqd_mem_obj->gpu_addr = dev->dqm->hiq_sdma_mqd.gpu_addr;
+	mqd_mem_obj->cpu_ptr = dev->dqm->hiq_sdma_mqd.cpu_ptr;
+
+	return mqd_mem_obj;
+}
+
+struct kfd_mem_obj *allocate_sdma_mqd(struct kfd_dev *dev,
+					struct queue_properties *q)
+{
+	struct kfd_mem_obj *mqd_mem_obj = NULL;
+	uint64_t offset;
 
-	return NULL;
+	mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL);
+	if (!mqd_mem_obj)
+		return NULL;
+
+	offset = (q->sdma_engine_id *
+		dev->device_info->num_sdma_queues_per_engine +
+		q->sdma_queue_id) *
+		dev->dqm->mqd_mgrs[KFD_MQD_TYPE_SDMA]->mqd_size;
+
+	offset += dev->dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]->mqd_size;
+
+	mqd_mem_obj->gtt_mem = (void *)((uint64_t)dev->dqm->hiq_sdma_mqd.gtt_mem
+				+ offset);
+	mqd_mem_obj->gpu_addr = dev->dqm->hiq_sdma_mqd.gpu_addr + offset;
+	mqd_mem_obj->cpu_ptr = (uint32_t *)((uint64_t)
+				dev->dqm->hiq_sdma_mqd.cpu_ptr + offset);
+
+	return mqd_mem_obj;
+}
+
+void free_mqd_hiq_sdma(struct mqd_manager *mm, void *mqd,
+			struct kfd_mem_obj *mqd_mem_obj)
+{
+	WARN_ON(!mqd_mem_obj->gtt_mem);
+	kfree(mqd_mem_obj);
 }
 
 void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm,
@@ -58,8 +98,8 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm,
 		uint32_t *se_mask)
 {
 	struct kfd_cu_info cu_info;
-	uint32_t cu_per_sh[4] = {0};
-	int i, se, cu = 0;
+	uint32_t cu_per_se[KFD_MAX_NUM_SE] = {0};
+	int i, se, sh, cu = 0;
 
 	amdgpu_amdkfd_get_cu_info(mm->dev->kgd, &cu_info);
 
@@ -67,8 +107,8 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm,
 		cu_mask_count = cu_info.cu_active_number;
 
 	for (se = 0; se < cu_info.num_shader_engines; se++)
-		for (i = 0; i < 4; i++)
-			cu_per_sh[se] += hweight32(cu_info.cu_bitmap[se][i]);
+		for (sh = 0; sh < cu_info.num_shader_arrays_per_engine; sh++)
+			cu_per_se[se] += hweight32(cu_info.cu_bitmap[se % 4][sh + (se / 4)]);
 
 	/* Symmetrically map cu_mask to all SEs:
 	 * cu_mask[0] bit0 -> se_mask[0] bit0;
@@ -88,6 +128,6 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm,
 				se = 0;
 				cu++;
 			}
-		} while (cu >= cu_per_sh[se] && cu < 32);
+		} while (cu >= cu_per_se[se] && cu < 32);
 	}
 }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
index f8261313ae7b..fbdb16418847 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
@@ -26,6 +26,8 @@
 
 #include "kfd_priv.h"
 
+#define KFD_MAX_NUM_SE 8
+
 /**
  * struct mqd_manager
  *
@@ -39,7 +41,7 @@
  * @destroy_mqd: Destroys the HQD slot and by that preempt the relevant queue.
  * Used only for no cp scheduling.
  *
- * @uninit_mqd: Releases the mqd buffer from local gpu memory.
+ * @free_mqd: Releases the mqd buffer from local gpu memory.
  *
  * @is_occupied: Checks if the relevant HQD slot is occupied.
  *
@@ -62,10 +64,13 @@
  * per KFD_MQD_TYPE for each device.
  *
  */
-
+extern int pipe_priority_map[];
 struct mqd_manager {
-	int	(*init_mqd)(struct mqd_manager *mm, void **mqd,
-			struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+	struct kfd_mem_obj*	(*allocate_mqd)(struct kfd_dev *kfd,
+		struct queue_properties *q);
+
+	void	(*init_mqd)(struct mqd_manager *mm, void **mqd,
+			struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
 			struct queue_properties *q);
 
 	int	(*load_mqd)(struct mqd_manager *mm, void *mqd,
@@ -73,7 +78,7 @@ struct mqd_manager {
 				struct queue_properties *p,
 				struct mm_struct *mms);
 
-	int	(*update_mqd)(struct mqd_manager *mm, void *mqd,
+	void	(*update_mqd)(struct mqd_manager *mm, void *mqd,
 				struct queue_properties *q);
 
 	int	(*destroy_mqd)(struct mqd_manager *mm, void *mqd,
@@ -81,7 +86,7 @@ struct mqd_manager {
 				unsigned int timeout, uint32_t pipe_id,
 				uint32_t queue_id);
 
-	void	(*uninit_mqd)(struct mqd_manager *mm, void *mqd,
+	void	(*free_mqd)(struct mqd_manager *mm, void *mqd,
 				struct kfd_mem_obj *mqd_mem_obj);
 
 	bool	(*is_occupied)(struct mqd_manager *mm, void *mqd,
@@ -99,8 +104,17 @@ struct mqd_manager {
 
 	struct mutex	mqd_mutex;
 	struct kfd_dev	*dev;
+	uint32_t mqd_size;
 };
 
+struct kfd_mem_obj *allocate_hiq_mqd(struct kfd_dev *dev,
+				struct queue_properties *q);
+
+struct kfd_mem_obj *allocate_sdma_mqd(struct kfd_dev *dev,
+					struct queue_properties *q);
+void free_mqd_hiq_sdma(struct mqd_manager *mm, void *mqd,
+				struct kfd_mem_obj *mqd_mem_obj);
+
 void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm,
 		const uint32_t *cu_mask, uint32_t cu_mask_count,
 		uint32_t *se_mask);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
index ae90a99909ef..28876aceb14b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c
@@ -66,22 +66,33 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
 		m->compute_static_thread_mgmt_se3);
 }
 
-static int init_mqd(struct mqd_manager *mm, void **mqd,
-		struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+static void set_priority(struct cik_mqd *m, struct queue_properties *q)
+{
+	m->cp_hqd_pipe_priority = pipe_priority_map[q->priority];
+	m->cp_hqd_queue_priority = q->priority;
+}
+
+static struct kfd_mem_obj *allocate_mqd(struct kfd_dev *kfd,
+					struct queue_properties *q)
+{
+	struct kfd_mem_obj *mqd_mem_obj;
+
+	if (kfd_gtt_sa_allocate(kfd, sizeof(struct cik_mqd),
+			&mqd_mem_obj))
+		return NULL;
+
+	return mqd_mem_obj;
+}
+
+static void init_mqd(struct mqd_manager *mm, void **mqd,
+		struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
 		struct queue_properties *q)
 {
 	uint64_t addr;
 	struct cik_mqd *m;
-	int retval;
-
-	retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct cik_mqd),
-					mqd_mem_obj);
 
-	if (retval != 0)
-		return -ENOMEM;
-
-	m = (struct cik_mqd *) (*mqd_mem_obj)->cpu_ptr;
-	addr = (*mqd_mem_obj)->gpu_addr;
+	m = (struct cik_mqd *) mqd_mem_obj->cpu_ptr;
+	addr = mqd_mem_obj->gpu_addr;
 
 	memset(m, 0, ALIGN(sizeof(struct cik_mqd), 256));
 
@@ -116,8 +127,7 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
 	 * 1 = CS_MEDIUM (typically between HP3D and GFX
 	 * 2 = CS_HIGH (typically above HP3D)
 	 */
-	m->cp_hqd_pipe_priority = 1;
-	m->cp_hqd_queue_priority = 15;
+	set_priority(m, q);
 
 	if (q->format == KFD_QUEUE_FORMAT_AQL)
 		m->cp_hqd_iq_rptr = AQL_ENABLE;
@@ -125,49 +135,32 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
 	*mqd = m;
 	if (gart_addr)
 		*gart_addr = addr;
-	retval = mm->update_mqd(mm, m, q);
-
-	return retval;
+	mm->update_mqd(mm, m, q);
 }
 
-static int init_mqd_sdma(struct mqd_manager *mm, void **mqd,
-			struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+static void init_mqd_sdma(struct mqd_manager *mm, void **mqd,
+			struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
 			struct queue_properties *q)
 {
-	int retval;
 	struct cik_sdma_rlc_registers *m;
 
-	retval = kfd_gtt_sa_allocate(mm->dev,
-					sizeof(struct cik_sdma_rlc_registers),
-					mqd_mem_obj);
-
-	if (retval != 0)
-		return -ENOMEM;
-
-	m = (struct cik_sdma_rlc_registers *) (*mqd_mem_obj)->cpu_ptr;
+	m = (struct cik_sdma_rlc_registers *) mqd_mem_obj->cpu_ptr;
 
 	memset(m, 0, sizeof(struct cik_sdma_rlc_registers));
 
 	*mqd = m;
 	if (gart_addr)
-		*gart_addr = (*mqd_mem_obj)->gpu_addr;
+		*gart_addr = mqd_mem_obj->gpu_addr;
 
-	retval = mm->update_mqd(mm, m, q);
-
-	return retval;
+	mm->update_mqd(mm, m, q);
 }
 
-static void uninit_mqd(struct mqd_manager *mm, void *mqd,
+static void free_mqd(struct mqd_manager *mm, void *mqd,
 			struct kfd_mem_obj *mqd_mem_obj)
 {
 	kfd_gtt_sa_free(mm->dev, mqd_mem_obj);
 }
 
-static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd,
-				struct kfd_mem_obj *mqd_mem_obj)
-{
-	kfd_gtt_sa_free(mm->dev, mqd_mem_obj);
-}
 
 static int load_mqd(struct mqd_manager *mm, void *mqd, uint32_t pipe_id,
 		    uint32_t queue_id, struct queue_properties *p,
@@ -191,7 +184,7 @@ static int load_mqd_sdma(struct mqd_manager *mm, void *mqd,
 					       mms);
 }
 
-static int __update_mqd(struct mqd_manager *mm, void *mqd,
+static void __update_mqd(struct mqd_manager *mm, void *mqd,
 			struct queue_properties *q, unsigned int atc_bit)
 {
 	struct cik_mqd *m;
@@ -222,28 +215,24 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd,
 		m->cp_hqd_pq_control |= NO_UPDATE_RPTR;
 
 	update_cu_mask(mm, mqd, q);
+	set_priority(m, q);
 
-	q->is_active = (q->queue_size > 0 &&
-			q->queue_address != 0 &&
-			q->queue_percent > 0 &&
-			!q->is_evicted);
-
-	return 0;
+	q->is_active = QUEUE_IS_ACTIVE(*q);
 }
 
-static int update_mqd(struct mqd_manager *mm, void *mqd,
+static void update_mqd(struct mqd_manager *mm, void *mqd,
 			struct queue_properties *q)
 {
-	return __update_mqd(mm, mqd, q, 1);
+	__update_mqd(mm, mqd, q, 1);
 }
 
-static int update_mqd_hawaii(struct mqd_manager *mm, void *mqd,
+static void update_mqd_hawaii(struct mqd_manager *mm, void *mqd,
 			struct queue_properties *q)
 {
-	return __update_mqd(mm, mqd, q, 0);
+	__update_mqd(mm, mqd, q, 0);
 }
 
-static int update_mqd_sdma(struct mqd_manager *mm, void *mqd,
+static void update_mqd_sdma(struct mqd_manager *mm, void *mqd,
 				struct queue_properties *q)
 {
 	struct cik_sdma_rlc_registers *m;
@@ -267,12 +256,7 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd,
 	m->sdma_engine_id = q->sdma_engine_id;
 	m->sdma_queue_id = q->sdma_queue_id;
 
-	q->is_active = (q->queue_size > 0 &&
-			q->queue_address != 0 &&
-			q->queue_percent > 0 &&
-			!q->is_evicted);
-
-	return 0;
+	q->is_active = QUEUE_IS_ACTIVE(*q);
 }
 
 static int destroy_mqd(struct mqd_manager *mm, void *mqd,
@@ -319,14 +303,14 @@ static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd,
  * queues but with different initial values.
  */
 
-static int init_mqd_hiq(struct mqd_manager *mm, void **mqd,
-		struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+static void init_mqd_hiq(struct mqd_manager *mm, void **mqd,
+		struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
 		struct queue_properties *q)
 {
-	return init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q);
+	init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q);
 }
 
-static int update_mqd_hiq(struct mqd_manager *mm, void *mqd,
+static void update_mqd_hiq(struct mqd_manager *mm, void *mqd,
 				struct queue_properties *q)
 {
 	struct cik_mqd *m;
@@ -350,12 +334,9 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd,
 
 	m->cp_hqd_vmid = q->vmid;
 
-	q->is_active = (q->queue_size > 0 &&
-			q->queue_address != 0 &&
-			q->queue_percent > 0 &&
-			!q->is_evicted);
+	q->is_active = QUEUE_IS_ACTIVE(*q);
 
-	return 0;
+	set_priority(m, q);
 }
 
 #if defined(CONFIG_DEBUG_FS)
@@ -394,34 +375,53 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
 	switch (type) {
 	case KFD_MQD_TYPE_CP:
 	case KFD_MQD_TYPE_COMPUTE:
+		mqd->allocate_mqd = allocate_mqd;
 		mqd->init_mqd = init_mqd;
-		mqd->uninit_mqd = uninit_mqd;
+		mqd->free_mqd = free_mqd;
 		mqd->load_mqd = load_mqd;
 		mqd->update_mqd = update_mqd;
 		mqd->destroy_mqd = destroy_mqd;
 		mqd->is_occupied = is_occupied;
+		mqd->mqd_size = sizeof(struct cik_mqd);
 #if defined(CONFIG_DEBUG_FS)
 		mqd->debugfs_show_mqd = debugfs_show_mqd;
 #endif
 		break;
 	case KFD_MQD_TYPE_HIQ:
+		mqd->allocate_mqd = allocate_hiq_mqd;
+		mqd->init_mqd = init_mqd_hiq;
+		mqd->free_mqd = free_mqd_hiq_sdma;
+		mqd->load_mqd = load_mqd;
+		mqd->update_mqd = update_mqd_hiq;
+		mqd->destroy_mqd = destroy_mqd;
+		mqd->is_occupied = is_occupied;
+		mqd->mqd_size = sizeof(struct cik_mqd);
+#if defined(CONFIG_DEBUG_FS)
+		mqd->debugfs_show_mqd = debugfs_show_mqd;
+#endif
+		break;
+	case KFD_MQD_TYPE_DIQ:
+		mqd->allocate_mqd = allocate_hiq_mqd;
 		mqd->init_mqd = init_mqd_hiq;
-		mqd->uninit_mqd = uninit_mqd;
+		mqd->free_mqd = free_mqd;
 		mqd->load_mqd = load_mqd;
 		mqd->update_mqd = update_mqd_hiq;
 		mqd->destroy_mqd = destroy_mqd;
 		mqd->is_occupied = is_occupied;
+		mqd->mqd_size = sizeof(struct cik_mqd);
 #if defined(CONFIG_DEBUG_FS)
 		mqd->debugfs_show_mqd = debugfs_show_mqd;
 #endif
 		break;
 	case KFD_MQD_TYPE_SDMA:
+		mqd->allocate_mqd = allocate_sdma_mqd;
 		mqd->init_mqd = init_mqd_sdma;
-		mqd->uninit_mqd = uninit_mqd_sdma;
+		mqd->free_mqd = free_mqd_hiq_sdma;
 		mqd->load_mqd = load_mqd_sdma;
 		mqd->update_mqd = update_mqd_sdma;
 		mqd->destroy_mqd = destroy_mqd_sdma;
 		mqd->is_occupied = is_occupied_sdma;
+		mqd->mqd_size = sizeof(struct cik_sdma_rlc_registers);
 #if defined(CONFIG_DEBUG_FS)
 		mqd->debugfs_show_mqd = debugfs_show_mqd_sdma;
 #endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
new file mode 100644
index 000000000000..4a236b2c2354
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c
@@ -0,0 +1,468 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include "kfd_priv.h"
+#include "kfd_mqd_manager.h"
+#include "v10_structs.h"
+#include "gc/gc_10_1_0_offset.h"
+#include "gc/gc_10_1_0_sh_mask.h"
+#include "amdgpu_amdkfd.h"
+
+static inline struct v10_compute_mqd *get_mqd(void *mqd)
+{
+	return (struct v10_compute_mqd *)mqd;
+}
+
+static inline struct v10_sdma_mqd *get_sdma_mqd(void *mqd)
+{
+	return (struct v10_sdma_mqd *)mqd;
+}
+
+static void update_cu_mask(struct mqd_manager *mm, void *mqd,
+			   struct queue_properties *q)
+{
+	struct v10_compute_mqd *m;
+	uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
+
+	if (q->cu_mask_count == 0)
+		return;
+
+	mqd_symmetrically_map_cu_mask(mm,
+		q->cu_mask, q->cu_mask_count, se_mask);
+
+	m = get_mqd(mqd);
+	m->compute_static_thread_mgmt_se0 = se_mask[0];
+	m->compute_static_thread_mgmt_se1 = se_mask[1];
+	m->compute_static_thread_mgmt_se2 = se_mask[2];
+	m->compute_static_thread_mgmt_se3 = se_mask[3];
+
+	pr_debug("update cu mask to %#x %#x %#x %#x\n",
+		m->compute_static_thread_mgmt_se0,
+		m->compute_static_thread_mgmt_se1,
+		m->compute_static_thread_mgmt_se2,
+		m->compute_static_thread_mgmt_se3);
+}
+
+static struct kfd_mem_obj *allocate_mqd(struct kfd_dev *kfd,
+		struct queue_properties *q)
+{
+	struct kfd_mem_obj *mqd_mem_obj;
+
+	if (kfd_gtt_sa_allocate(kfd, sizeof(struct v10_compute_mqd),
+			&mqd_mem_obj))
+		return NULL;
+
+	return mqd_mem_obj;
+}
+
+static void init_mqd(struct mqd_manager *mm, void **mqd,
+			struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
+			struct queue_properties *q)
+{
+	uint64_t addr;
+	struct v10_compute_mqd *m;
+
+	m = (struct v10_compute_mqd *) mqd_mem_obj->cpu_ptr;
+	addr = mqd_mem_obj->gpu_addr;
+
+	memset(m, 0, sizeof(struct v10_compute_mqd));
+
+	m->header = 0xC0310800;
+	m->compute_pipelinestat_enable = 1;
+	m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF;
+	m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF;
+	m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF;
+	m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF;
+
+	m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK |
+			0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT;
+
+	m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT;
+
+	m->cp_mqd_base_addr_lo        = lower_32_bits(addr);
+	m->cp_mqd_base_addr_hi        = upper_32_bits(addr);
+
+	m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT |
+			1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
+			10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
+
+	m->cp_hqd_pipe_priority = 1;
+	m->cp_hqd_queue_priority = 15;
+
+	if (q->format == KFD_QUEUE_FORMAT_AQL) {
+		m->cp_hqd_aql_control =
+			1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;
+	}
+
+	if (mm->dev->cwsr_enabled) {
+		m->cp_hqd_persistent_state |=
+			(1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT);
+		m->cp_hqd_ctx_save_base_addr_lo =
+			lower_32_bits(q->ctx_save_restore_area_address);
+		m->cp_hqd_ctx_save_base_addr_hi =
+			upper_32_bits(q->ctx_save_restore_area_address);
+		m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size;
+		m->cp_hqd_cntl_stack_size = q->ctl_stack_size;
+		m->cp_hqd_cntl_stack_offset = q->ctl_stack_size;
+		m->cp_hqd_wg_state_offset = q->ctl_stack_size;
+	}
+
+	*mqd = m;
+	if (gart_addr)
+		*gart_addr = addr;
+	mm->update_mqd(mm, m, q);
+}
+
+static int load_mqd(struct mqd_manager *mm, void *mqd,
+			uint32_t pipe_id, uint32_t queue_id,
+			struct queue_properties *p, struct mm_struct *mms)
+{
+	int r = 0;
+	/* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */
+	uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0);
+
+	r = mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id,
+					  (uint32_t __user *)p->write_ptr,
+					  wptr_shift, 0, mms);
+	return r;
+}
+
+static void update_mqd(struct mqd_manager *mm, void *mqd,
+		      struct queue_properties *q)
+{
+	struct v10_compute_mqd *m;
+
+	m = get_mqd(mqd);
+
+	m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT;
+	m->cp_hqd_pq_control |=
+			ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1;
+	pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control);
+
+	m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8);
+	m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8);
+
+	m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
+	m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
+	m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr);
+	m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr);
+
+	m->cp_hqd_pq_doorbell_control =
+		q->doorbell_off <<
+			CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT;
+	pr_debug("cp_hqd_pq_doorbell_control 0x%x\n",
+			m->cp_hqd_pq_doorbell_control);
+
+	m->cp_hqd_ib_control = 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT;
+
+	/*
+	 * HW does not clamp this field correctly. Maximum EOP queue size
+	 * is constrained by per-SE EOP done signal count, which is 8-bit.
+	 * Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit
+	 * more than (EOP entry count - 1) so a queue size of 0x800 dwords
+	 * is safe, giving a maximum field value of 0xA.
+	 */
+	m->cp_hqd_eop_control = min(0xA,
+		ffs(q->eop_ring_buffer_size / sizeof(unsigned int)) - 1 - 1);
+	m->cp_hqd_eop_base_addr_lo =
+			lower_32_bits(q->eop_ring_buffer_address >> 8);
+	m->cp_hqd_eop_base_addr_hi =
+			upper_32_bits(q->eop_ring_buffer_address >> 8);
+
+	m->cp_hqd_iq_timer = 0;
+
+	m->cp_hqd_vmid = q->vmid;
+
+	if (q->format == KFD_QUEUE_FORMAT_AQL) {
+		/* GC 10 removed WPP_CLAMP from PQ Control */
+		m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK |
+				2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT |
+				1 << CP_HQD_PQ_CONTROL__QUEUE_FULL_EN__SHIFT ;
+		m->cp_hqd_pq_doorbell_control |=
+			1 << CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT;
+	}
+	if (mm->dev->cwsr_enabled)
+		m->cp_hqd_ctx_save_control = 0;
+
+	update_cu_mask(mm, mqd, q);
+
+	q->is_active = (q->queue_size > 0 &&
+			q->queue_address != 0 &&
+			q->queue_percent > 0 &&
+			!q->is_evicted);
+}
+
+static int destroy_mqd(struct mqd_manager *mm, void *mqd,
+		       enum kfd_preempt_type type,
+		       unsigned int timeout, uint32_t pipe_id,
+		       uint32_t queue_id)
+{
+	return mm->dev->kfd2kgd->hqd_destroy
+		(mm->dev->kgd, mqd, type, timeout,
+		 pipe_id, queue_id);
+}
+
+static void free_mqd(struct mqd_manager *mm, void *mqd,
+			struct kfd_mem_obj *mqd_mem_obj)
+{
+	kfd_gtt_sa_free(mm->dev, mqd_mem_obj);
+}
+
+static bool is_occupied(struct mqd_manager *mm, void *mqd,
+			uint64_t queue_address,	uint32_t pipe_id,
+			uint32_t queue_id)
+{
+	return mm->dev->kfd2kgd->hqd_is_occupied(
+		mm->dev->kgd, queue_address,
+		pipe_id, queue_id);
+}
+
+static int get_wave_state(struct mqd_manager *mm, void *mqd,
+			  void __user *ctl_stack,
+			  u32 *ctl_stack_used_size,
+			  u32 *save_area_used_size)
+{
+	struct v10_compute_mqd *m;
+
+	/* Control stack is located one page after MQD. */
+	void *mqd_ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE);
+
+	m = get_mqd(mqd);
+
+	*ctl_stack_used_size = m->cp_hqd_cntl_stack_size -
+		m->cp_hqd_cntl_stack_offset;
+	*save_area_used_size = m->cp_hqd_wg_state_offset -
+		m->cp_hqd_cntl_stack_size;
+
+	if (copy_to_user(ctl_stack, mqd_ctl_stack, m->cp_hqd_cntl_stack_size))
+		return -EFAULT;
+
+	return 0;
+}
+
+static void init_mqd_hiq(struct mqd_manager *mm, void **mqd,
+			struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
+			struct queue_properties *q)
+{
+	struct v10_compute_mqd *m;
+
+	init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q);
+
+	m = get_mqd(*mqd);
+
+	m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT |
+			1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT;
+}
+
+static void update_mqd_hiq(struct mqd_manager *mm, void *mqd,
+			struct queue_properties *q)
+{
+	struct v10_compute_mqd *m;
+
+	update_mqd(mm, mqd, q);
+
+	/* TODO: what's the point? update_mqd already does this. */
+	m = get_mqd(mqd);
+	m->cp_hqd_vmid = q->vmid;
+}
+
+static void init_mqd_sdma(struct mqd_manager *mm, void **mqd,
+		struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
+		struct queue_properties *q)
+{
+	struct v10_sdma_mqd *m;
+
+	m = (struct v10_sdma_mqd *) mqd_mem_obj->cpu_ptr;
+
+	memset(m, 0, sizeof(struct v10_sdma_mqd));
+
+	*mqd = m;
+	if (gart_addr)
+		*gart_addr = mqd_mem_obj->gpu_addr;
+
+	mm->update_mqd(mm, m, q);
+}
+
+static int load_mqd_sdma(struct mqd_manager *mm, void *mqd,
+		uint32_t pipe_id, uint32_t queue_id,
+		struct queue_properties *p, struct mm_struct *mms)
+{
+	return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd,
+					       (uint32_t __user *)p->write_ptr,
+					       mms);
+}
+
+#define SDMA_RLC_DUMMY_DEFAULT 0xf
+
+static void update_mqd_sdma(struct mqd_manager *mm, void *mqd,
+		struct queue_properties *q)
+{
+	struct v10_sdma_mqd *m;
+
+	m = get_sdma_mqd(mqd);
+	m->sdmax_rlcx_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1)
+		<< SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT |
+		q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT |
+		1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT |
+		6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT;
+
+	m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8);
+	m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8);
+	m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
+	m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
+	m->sdmax_rlcx_doorbell_offset =
+		q->doorbell_off << SDMA0_RLC0_DOORBELL_OFFSET__OFFSET__SHIFT;
+
+	m->sdma_engine_id = q->sdma_engine_id;
+	m->sdma_queue_id = q->sdma_queue_id;
+	m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT;
+
+
+	q->is_active = (q->queue_size > 0 &&
+			q->queue_address != 0 &&
+			q->queue_percent > 0 &&
+			!q->is_evicted);
+}
+
+/*
+ *  * preempt type here is ignored because there is only one way
+ *  * to preempt sdma queue
+ */
+static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd,
+		enum kfd_preempt_type type,
+		unsigned int timeout, uint32_t pipe_id,
+		uint32_t queue_id)
+{
+	return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout);
+}
+
+static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd,
+		uint64_t queue_address, uint32_t pipe_id,
+		uint32_t queue_id)
+{
+	return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd);
+}
+
+#if defined(CONFIG_DEBUG_FS)
+
+static int debugfs_show_mqd(struct seq_file *m, void *data)
+{
+	seq_hex_dump(m, "    ", DUMP_PREFIX_OFFSET, 32, 4,
+		     data, sizeof(struct v10_compute_mqd), false);
+	return 0;
+}
+
+static int debugfs_show_mqd_sdma(struct seq_file *m, void *data)
+{
+	seq_hex_dump(m, "    ", DUMP_PREFIX_OFFSET, 32, 4,
+		     data, sizeof(struct v10_sdma_mqd), false);
+	return 0;
+}
+
+#endif
+
+struct mqd_manager *mqd_manager_init_v10(enum KFD_MQD_TYPE type,
+		struct kfd_dev *dev)
+{
+	struct mqd_manager *mqd;
+
+	if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
+		return NULL;
+
+	mqd = kzalloc(sizeof(*mqd), GFP_NOIO);
+	if (!mqd)
+		return NULL;
+
+	mqd->dev = dev;
+
+	switch (type) {
+	case KFD_MQD_TYPE_CP:
+	case KFD_MQD_TYPE_COMPUTE:
+		pr_debug("%s@%i\n", __func__, __LINE__);
+		mqd->allocate_mqd = allocate_mqd;
+		mqd->init_mqd = init_mqd;
+		mqd->free_mqd = free_mqd;
+		mqd->load_mqd = load_mqd;
+		mqd->update_mqd = update_mqd;
+		mqd->destroy_mqd = destroy_mqd;
+		mqd->is_occupied = is_occupied;
+		mqd->mqd_size = sizeof(struct v10_compute_mqd);
+		mqd->get_wave_state = get_wave_state;
+#if defined(CONFIG_DEBUG_FS)
+		mqd->debugfs_show_mqd = debugfs_show_mqd;
+#endif
+		pr_debug("%s@%i\n", __func__, __LINE__);
+		break;
+	case KFD_MQD_TYPE_HIQ:
+		pr_debug("%s@%i\n", __func__, __LINE__);
+		mqd->allocate_mqd = allocate_hiq_mqd;
+		mqd->init_mqd = init_mqd_hiq;
+		mqd->free_mqd = free_mqd_hiq_sdma;
+		mqd->load_mqd = load_mqd;
+		mqd->update_mqd = update_mqd_hiq;
+		mqd->destroy_mqd = destroy_mqd;
+		mqd->is_occupied = is_occupied;
+		mqd->mqd_size = sizeof(struct v10_compute_mqd);
+#if defined(CONFIG_DEBUG_FS)
+		mqd->debugfs_show_mqd = debugfs_show_mqd;
+#endif
+		pr_debug("%s@%i\n", __func__, __LINE__);
+		break;
+	case KFD_MQD_TYPE_DIQ:
+		mqd->allocate_mqd = allocate_hiq_mqd;
+		mqd->init_mqd = init_mqd_hiq;
+		mqd->free_mqd = free_mqd;
+		mqd->load_mqd = load_mqd;
+		mqd->update_mqd = update_mqd_hiq;
+		mqd->destroy_mqd = destroy_mqd;
+		mqd->is_occupied = is_occupied;
+		mqd->mqd_size = sizeof(struct v10_compute_mqd);
+#if defined(CONFIG_DEBUG_FS)
+		mqd->debugfs_show_mqd = debugfs_show_mqd;
+#endif
+		break;
+	case KFD_MQD_TYPE_SDMA:
+		pr_debug("%s@%i\n", __func__, __LINE__);
+		mqd->allocate_mqd = allocate_sdma_mqd;
+		mqd->init_mqd = init_mqd_sdma;
+		mqd->free_mqd = free_mqd_hiq_sdma;
+		mqd->load_mqd = load_mqd_sdma;
+		mqd->update_mqd = update_mqd_sdma;
+		mqd->destroy_mqd = destroy_mqd_sdma;
+		mqd->is_occupied = is_occupied_sdma;
+		mqd->mqd_size = sizeof(struct v10_sdma_mqd);
+#if defined(CONFIG_DEBUG_FS)
+		mqd->debugfs_show_mqd = debugfs_show_mqd_sdma;
+#endif
+		pr_debug("%s@%i\n", __func__, __LINE__);
+		break;
+	default:
+		kfree(mqd);
+		return NULL;
+	}
+
+	return mqd;
+}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
index 9dbba609450e..d3380c5bdbde 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
@@ -46,7 +46,7 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
 			struct queue_properties *q)
 {
 	struct v9_mqd *m;
-	uint32_t se_mask[4] = {0}; /* 4 is the max # of SEs */
+	uint32_t se_mask[KFD_MAX_NUM_SE] = {0};
 
 	if (q->cu_mask_count == 0)
 		return;
@@ -59,45 +59,71 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
 	m->compute_static_thread_mgmt_se1 = se_mask[1];
 	m->compute_static_thread_mgmt_se2 = se_mask[2];
 	m->compute_static_thread_mgmt_se3 = se_mask[3];
+	m->compute_static_thread_mgmt_se4 = se_mask[4];
+	m->compute_static_thread_mgmt_se5 = se_mask[5];
+	m->compute_static_thread_mgmt_se6 = se_mask[6];
+	m->compute_static_thread_mgmt_se7 = se_mask[7];
 
-	pr_debug("update cu mask to %#x %#x %#x %#x\n",
+	pr_debug("update cu mask to %#x %#x %#x %#x %#x %#x %#x %#x\n",
 		m->compute_static_thread_mgmt_se0,
 		m->compute_static_thread_mgmt_se1,
 		m->compute_static_thread_mgmt_se2,
-		m->compute_static_thread_mgmt_se3);
+		m->compute_static_thread_mgmt_se3,
+		m->compute_static_thread_mgmt_se4,
+		m->compute_static_thread_mgmt_se5,
+		m->compute_static_thread_mgmt_se6,
+		m->compute_static_thread_mgmt_se7);
 }
 
-static int init_mqd(struct mqd_manager *mm, void **mqd,
-			struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
-			struct queue_properties *q)
+static void set_priority(struct v9_mqd *m, struct queue_properties *q)
+{
+	m->cp_hqd_pipe_priority = pipe_priority_map[q->priority];
+	m->cp_hqd_queue_priority = q->priority;
+}
+
+static struct kfd_mem_obj *allocate_mqd(struct kfd_dev *kfd,
+		struct queue_properties *q)
 {
 	int retval;
-	uint64_t addr;
-	struct v9_mqd *m;
-	struct kfd_dev *kfd = mm->dev;
+	struct kfd_mem_obj *mqd_mem_obj = NULL;
 
 	/* From V9,  for CWSR, the control stack is located on the next page
 	 * boundary after the mqd, we will use the gtt allocation function
 	 * instead of sub-allocation function.
 	 */
 	if (kfd->cwsr_enabled && (q->type == KFD_QUEUE_TYPE_COMPUTE)) {
-		*mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL);
-		if (!*mqd_mem_obj)
-			return -ENOMEM;
+		mqd_mem_obj = kzalloc(sizeof(struct kfd_mem_obj), GFP_NOIO);
+		if (!mqd_mem_obj)
+			return NULL;
 		retval = amdgpu_amdkfd_alloc_gtt_mem(kfd->kgd,
 			ALIGN(q->ctl_stack_size, PAGE_SIZE) +
 				ALIGN(sizeof(struct v9_mqd), PAGE_SIZE),
-			&((*mqd_mem_obj)->gtt_mem),
-			&((*mqd_mem_obj)->gpu_addr),
-			(void *)&((*mqd_mem_obj)->cpu_ptr), true);
-	} else
-		retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct v9_mqd),
-				mqd_mem_obj);
-	if (retval != 0)
-		return -ENOMEM;
-
-	m = (struct v9_mqd *) (*mqd_mem_obj)->cpu_ptr;
-	addr = (*mqd_mem_obj)->gpu_addr;
+			&(mqd_mem_obj->gtt_mem),
+			&(mqd_mem_obj->gpu_addr),
+			(void *)&(mqd_mem_obj->cpu_ptr), true);
+	} else {
+		retval = kfd_gtt_sa_allocate(kfd, sizeof(struct v9_mqd),
+				&mqd_mem_obj);
+	}
+
+	if (retval) {
+		kfree(mqd_mem_obj);
+		return NULL;
+	}
+
+	return mqd_mem_obj;
+
+}
+
+static void init_mqd(struct mqd_manager *mm, void **mqd,
+			struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
+			struct queue_properties *q)
+{
+	uint64_t addr;
+	struct v9_mqd *m;
+
+	m = (struct v9_mqd *) mqd_mem_obj->cpu_ptr;
+	addr = mqd_mem_obj->gpu_addr;
 
 	memset(m, 0, sizeof(struct v9_mqd));
 
@@ -107,6 +133,10 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
 	m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF;
 	m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF;
 	m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF;
+	m->compute_static_thread_mgmt_se4 = 0xFFFFFFFF;
+	m->compute_static_thread_mgmt_se5 = 0xFFFFFFFF;
+	m->compute_static_thread_mgmt_se6 = 0xFFFFFFFF;
+	m->compute_static_thread_mgmt_se7 = 0xFFFFFFFF;
 
 	m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK |
 			0x53 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT;
@@ -120,9 +150,6 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
 			1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
 			10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
 
-	m->cp_hqd_pipe_priority = 1;
-	m->cp_hqd_queue_priority = 15;
-
 	if (q->format == KFD_QUEUE_FORMAT_AQL) {
 		m->cp_hqd_aql_control =
 			1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;
@@ -149,9 +176,7 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
 	*mqd = m;
 	if (gart_addr)
 		*gart_addr = addr;
-	retval = mm->update_mqd(mm, m, q);
-
-	return retval;
+	mm->update_mqd(mm, m, q);
 }
 
 static int load_mqd(struct mqd_manager *mm, void *mqd,
@@ -166,7 +191,7 @@ static int load_mqd(struct mqd_manager *mm, void *mqd,
 					  wptr_shift, 0, mms);
 }
 
-static int update_mqd(struct mqd_manager *mm, void *mqd,
+static void update_mqd(struct mqd_manager *mm, void *mqd,
 		      struct queue_properties *q)
 {
 	struct v9_mqd *m;
@@ -225,13 +250,9 @@ static int update_mqd(struct mqd_manager *mm, void *mqd,
 		m->cp_hqd_ctx_save_control = 0;
 
 	update_cu_mask(mm, mqd, q);
+	set_priority(m, q);
 
-	q->is_active = (q->queue_size > 0 &&
-			q->queue_address != 0 &&
-			q->queue_percent > 0 &&
-			!q->is_evicted);
-
-	return 0;
+	q->is_active = QUEUE_IS_ACTIVE(*q);
 }
 
 
@@ -245,7 +266,7 @@ static int destroy_mqd(struct mqd_manager *mm, void *mqd,
 		pipe_id, queue_id);
 }
 
-static void uninit_mqd(struct mqd_manager *mm, void *mqd,
+static void free_mqd(struct mqd_manager *mm, void *mqd,
 			struct kfd_mem_obj *mqd_mem_obj)
 {
 	struct kfd_dev *kfd = mm->dev;
@@ -289,71 +310,47 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
 	return 0;
 }
 
-static int init_mqd_hiq(struct mqd_manager *mm, void **mqd,
-			struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+static void init_mqd_hiq(struct mqd_manager *mm, void **mqd,
+			struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
 			struct queue_properties *q)
 {
 	struct v9_mqd *m;
-	int retval = init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q);
 
-	if (retval != 0)
-		return retval;
+	init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q);
 
 	m = get_mqd(*mqd);
 
 	m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT |
 			1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT;
-
-	return retval;
 }
 
-static int update_mqd_hiq(struct mqd_manager *mm, void *mqd,
+static void update_mqd_hiq(struct mqd_manager *mm, void *mqd,
 			struct queue_properties *q)
 {
 	struct v9_mqd *m;
-	int retval = update_mqd(mm, mqd, q);
 
-	if (retval != 0)
-		return retval;
+	update_mqd(mm, mqd, q);
 
 	/* TODO: what's the point? update_mqd already does this. */
 	m = get_mqd(mqd);
 	m->cp_hqd_vmid = q->vmid;
-	return retval;
 }
 
-static int init_mqd_sdma(struct mqd_manager *mm, void **mqd,
-		struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+static void init_mqd_sdma(struct mqd_manager *mm, void **mqd,
+		struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
 		struct queue_properties *q)
 {
-	int retval;
 	struct v9_sdma_mqd *m;
 
-
-	retval = kfd_gtt_sa_allocate(mm->dev,
-			sizeof(struct v9_sdma_mqd),
-			mqd_mem_obj);
-
-	if (retval != 0)
-		return -ENOMEM;
-
-	m = (struct v9_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr;
+	m = (struct v9_sdma_mqd *) mqd_mem_obj->cpu_ptr;
 
 	memset(m, 0, sizeof(struct v9_sdma_mqd));
 
 	*mqd = m;
 	if (gart_addr)
-		*gart_addr = (*mqd_mem_obj)->gpu_addr;
-
-	retval = mm->update_mqd(mm, m, q);
+		*gart_addr = mqd_mem_obj->gpu_addr;
 
-	return retval;
-}
-
-static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd,
-		struct kfd_mem_obj *mqd_mem_obj)
-{
-	kfd_gtt_sa_free(mm->dev, mqd_mem_obj);
+	mm->update_mqd(mm, m, q);
 }
 
 static int load_mqd_sdma(struct mqd_manager *mm, void *mqd,
@@ -367,7 +364,7 @@ static int load_mqd_sdma(struct mqd_manager *mm, void *mqd,
 
 #define SDMA_RLC_DUMMY_DEFAULT 0xf
 
-static int update_mqd_sdma(struct mqd_manager *mm, void *mqd,
+static void update_mqd_sdma(struct mqd_manager *mm, void *mqd,
 		struct queue_properties *q)
 {
 	struct v9_sdma_mqd *m;
@@ -390,12 +387,7 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd,
 	m->sdma_queue_id = q->sdma_queue_id;
 	m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT;
 
-	q->is_active = (q->queue_size > 0 &&
-			q->queue_address != 0 &&
-			q->queue_percent > 0 &&
-			!q->is_evicted);
-
-	return 0;
+	q->is_active = QUEUE_IS_ACTIVE(*q);
 }
 
 /*
@@ -452,35 +444,54 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type,
 	switch (type) {
 	case KFD_MQD_TYPE_CP:
 	case KFD_MQD_TYPE_COMPUTE:
+		mqd->allocate_mqd = allocate_mqd;
 		mqd->init_mqd = init_mqd;
-		mqd->uninit_mqd = uninit_mqd;
+		mqd->free_mqd = free_mqd;
 		mqd->load_mqd = load_mqd;
 		mqd->update_mqd = update_mqd;
 		mqd->destroy_mqd = destroy_mqd;
 		mqd->is_occupied = is_occupied;
 		mqd->get_wave_state = get_wave_state;
+		mqd->mqd_size = sizeof(struct v9_mqd);
 #if defined(CONFIG_DEBUG_FS)
 		mqd->debugfs_show_mqd = debugfs_show_mqd;
 #endif
 		break;
 	case KFD_MQD_TYPE_HIQ:
+		mqd->allocate_mqd = allocate_hiq_mqd;
+		mqd->init_mqd = init_mqd_hiq;
+		mqd->free_mqd = free_mqd_hiq_sdma;
+		mqd->load_mqd = load_mqd;
+		mqd->update_mqd = update_mqd_hiq;
+		mqd->destroy_mqd = destroy_mqd;
+		mqd->is_occupied = is_occupied;
+		mqd->mqd_size = sizeof(struct v9_mqd);
+#if defined(CONFIG_DEBUG_FS)
+		mqd->debugfs_show_mqd = debugfs_show_mqd;
+#endif
+		break;
+	case KFD_MQD_TYPE_DIQ:
+		mqd->allocate_mqd = allocate_hiq_mqd;
 		mqd->init_mqd = init_mqd_hiq;
-		mqd->uninit_mqd = uninit_mqd;
+		mqd->free_mqd = free_mqd;
 		mqd->load_mqd = load_mqd;
 		mqd->update_mqd = update_mqd_hiq;
 		mqd->destroy_mqd = destroy_mqd;
 		mqd->is_occupied = is_occupied;
+		mqd->mqd_size = sizeof(struct v9_mqd);
 #if defined(CONFIG_DEBUG_FS)
 		mqd->debugfs_show_mqd = debugfs_show_mqd;
 #endif
 		break;
 	case KFD_MQD_TYPE_SDMA:
+		mqd->allocate_mqd = allocate_sdma_mqd;
 		mqd->init_mqd = init_mqd_sdma;
-		mqd->uninit_mqd = uninit_mqd_sdma;
+		mqd->free_mqd = free_mqd_hiq_sdma;
 		mqd->load_mqd = load_mqd_sdma;
 		mqd->update_mqd = update_mqd_sdma;
 		mqd->destroy_mqd = destroy_mqd_sdma;
 		mqd->is_occupied = is_occupied_sdma;
+		mqd->mqd_size = sizeof(struct v9_sdma_mqd);
 #if defined(CONFIG_DEBUG_FS)
 		mqd->debugfs_show_mqd = debugfs_show_mqd_sdma;
 #endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
index 6469b3456f00..7d144f56f421 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c
@@ -31,6 +31,7 @@
 #include "gca/gfx_8_0_sh_mask.h"
 #include "gca/gfx_8_0_enum.h"
 #include "oss/oss_3_0_sh_mask.h"
+
 #define CP_MQD_CONTROL__PRIV_STATE__SHIFT 0x8
 
 static inline struct vi_mqd *get_mqd(void *mqd)
@@ -68,21 +69,33 @@ static void update_cu_mask(struct mqd_manager *mm, void *mqd,
 		m->compute_static_thread_mgmt_se3);
 }
 
-static int init_mqd(struct mqd_manager *mm, void **mqd,
-			struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+static void set_priority(struct vi_mqd *m, struct queue_properties *q)
+{
+	m->cp_hqd_pipe_priority = pipe_priority_map[q->priority];
+	m->cp_hqd_queue_priority = q->priority;
+}
+
+static struct kfd_mem_obj *allocate_mqd(struct kfd_dev *kfd,
+					struct queue_properties *q)
+{
+	struct kfd_mem_obj *mqd_mem_obj;
+
+	if (kfd_gtt_sa_allocate(kfd, sizeof(struct vi_mqd),
+			&mqd_mem_obj))
+		return NULL;
+
+	return mqd_mem_obj;
+}
+
+static void init_mqd(struct mqd_manager *mm, void **mqd,
+			struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
 			struct queue_properties *q)
 {
-	int retval;
 	uint64_t addr;
 	struct vi_mqd *m;
 
-	retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct vi_mqd),
-			mqd_mem_obj);
-	if (retval != 0)
-		return -ENOMEM;
-
-	m = (struct vi_mqd *) (*mqd_mem_obj)->cpu_ptr;
-	addr = (*mqd_mem_obj)->gpu_addr;
+	m = (struct vi_mqd *) mqd_mem_obj->cpu_ptr;
+	addr = mqd_mem_obj->gpu_addr;
 
 	memset(m, 0, sizeof(struct vi_mqd));
 
@@ -106,9 +119,7 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
 			1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
 			10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
 
-	m->cp_hqd_pipe_priority = 1;
-	m->cp_hqd_queue_priority = 15;
-
+	set_priority(m, q);
 	m->cp_hqd_eop_rptr = 1 << CP_HQD_EOP_RPTR__INIT_FETCHER__SHIFT;
 
 	if (q->format == KFD_QUEUE_FORMAT_AQL)
@@ -139,9 +150,7 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
 	*mqd = m;
 	if (gart_addr)
 		*gart_addr = addr;
-	retval = mm->update_mqd(mm, m, q);
-
-	return retval;
+	mm->update_mqd(mm, m, q);
 }
 
 static int load_mqd(struct mqd_manager *mm, void *mqd,
@@ -157,7 +166,7 @@ static int load_mqd(struct mqd_manager *mm, void *mqd,
 					  wptr_shift, wptr_mask, mms);
 }
 
-static int __update_mqd(struct mqd_manager *mm, void *mqd,
+static void __update_mqd(struct mqd_manager *mm, void *mqd,
 			struct queue_properties *q, unsigned int mtype,
 			unsigned int atc_bit)
 {
@@ -222,26 +231,22 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd,
 			mtype << CP_HQD_CTX_SAVE_CONTROL__MTYPE__SHIFT;
 
 	update_cu_mask(mm, mqd, q);
+	set_priority(m, q);
 
-	q->is_active = (q->queue_size > 0 &&
-			q->queue_address != 0 &&
-			q->queue_percent > 0 &&
-			!q->is_evicted);
-
-	return 0;
+	q->is_active = QUEUE_IS_ACTIVE(*q);
 }
 
 
-static int update_mqd(struct mqd_manager *mm, void *mqd,
+static void update_mqd(struct mqd_manager *mm, void *mqd,
 			struct queue_properties *q)
 {
-	return __update_mqd(mm, mqd, q, MTYPE_CC, 1);
+	__update_mqd(mm, mqd, q, MTYPE_CC, 1);
 }
 
-static int update_mqd_tonga(struct mqd_manager *mm, void *mqd,
+static void update_mqd_tonga(struct mqd_manager *mm, void *mqd,
 			struct queue_properties *q)
 {
-	return __update_mqd(mm, mqd, q, MTYPE_UC, 0);
+	__update_mqd(mm, mqd, q, MTYPE_UC, 0);
 }
 
 static int destroy_mqd(struct mqd_manager *mm, void *mqd,
@@ -254,7 +259,7 @@ static int destroy_mqd(struct mqd_manager *mm, void *mqd,
 		pipe_id, queue_id);
 }
 
-static void uninit_mqd(struct mqd_manager *mm, void *mqd,
+static void free_mqd(struct mqd_manager *mm, void *mqd,
 			struct kfd_mem_obj *mqd_mem_obj)
 {
 	kfd_gtt_sa_free(mm->dev, mqd_mem_obj);
@@ -291,70 +296,44 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
 	return 0;
 }
 
-static int init_mqd_hiq(struct mqd_manager *mm, void **mqd,
-			struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+static void init_mqd_hiq(struct mqd_manager *mm, void **mqd,
+			struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
 			struct queue_properties *q)
 {
 	struct vi_mqd *m;
-	int retval = init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q);
-
-	if (retval != 0)
-		return retval;
+	init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q);
 
 	m = get_mqd(*mqd);
 
 	m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT |
 			1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT;
-
-	return retval;
 }
 
-static int update_mqd_hiq(struct mqd_manager *mm, void *mqd,
+static void update_mqd_hiq(struct mqd_manager *mm, void *mqd,
 			struct queue_properties *q)
 {
 	struct vi_mqd *m;
-	int retval = __update_mqd(mm, mqd, q, MTYPE_UC, 0);
-
-	if (retval != 0)
-		return retval;
+	__update_mqd(mm, mqd, q, MTYPE_UC, 0);
 
 	m = get_mqd(mqd);
 	m->cp_hqd_vmid = q->vmid;
-	return retval;
 }
 
-static int init_mqd_sdma(struct mqd_manager *mm, void **mqd,
-		struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
+static void init_mqd_sdma(struct mqd_manager *mm, void **mqd,
+		struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
 		struct queue_properties *q)
 {
-	int retval;
 	struct vi_sdma_mqd *m;
 
-
-	retval = kfd_gtt_sa_allocate(mm->dev,
-			sizeof(struct vi_sdma_mqd),
-			mqd_mem_obj);
-
-	if (retval != 0)
-		return -ENOMEM;
-
-	m = (struct vi_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr;
+	m = (struct vi_sdma_mqd *) mqd_mem_obj->cpu_ptr;
 
 	memset(m, 0, sizeof(struct vi_sdma_mqd));
 
 	*mqd = m;
-	if (gart_addr != NULL)
-		*gart_addr = (*mqd_mem_obj)->gpu_addr;
-
-	retval = mm->update_mqd(mm, m, q);
-
-	return retval;
-}
+	if (gart_addr)
+		*gart_addr = mqd_mem_obj->gpu_addr;
 
-static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd,
-		struct kfd_mem_obj *mqd_mem_obj)
-{
-	kfd_gtt_sa_free(mm->dev, mqd_mem_obj);
+	mm->update_mqd(mm, m, q);
 }
 
 static int load_mqd_sdma(struct mqd_manager *mm, void *mqd,
@@ -366,7 +345,7 @@ static int load_mqd_sdma(struct mqd_manager *mm, void *mqd,
 					       mms);
 }
 
-static int update_mqd_sdma(struct mqd_manager *mm, void *mqd,
+static void update_mqd_sdma(struct mqd_manager *mm, void *mqd,
 		struct queue_properties *q)
 {
 	struct vi_sdma_mqd *m;
@@ -390,12 +369,7 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd,
 	m->sdma_engine_id = q->sdma_engine_id;
 	m->sdma_queue_id = q->sdma_queue_id;
 
-	q->is_active = (q->queue_size > 0 &&
-			q->queue_address != 0 &&
-			q->queue_percent > 0 &&
-			!q->is_evicted);
-
-	return 0;
+	q->is_active = QUEUE_IS_ACTIVE(*q);
 }
 
 /*
@@ -452,35 +426,54 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type,
 	switch (type) {
 	case KFD_MQD_TYPE_CP:
 	case KFD_MQD_TYPE_COMPUTE:
+		mqd->allocate_mqd = allocate_mqd;
 		mqd->init_mqd = init_mqd;
-		mqd->uninit_mqd = uninit_mqd;
+		mqd->free_mqd = free_mqd;
 		mqd->load_mqd = load_mqd;
 		mqd->update_mqd = update_mqd;
 		mqd->destroy_mqd = destroy_mqd;
 		mqd->is_occupied = is_occupied;
 		mqd->get_wave_state = get_wave_state;
+		mqd->mqd_size = sizeof(struct vi_mqd);
 #if defined(CONFIG_DEBUG_FS)
 		mqd->debugfs_show_mqd = debugfs_show_mqd;
 #endif
 		break;
 	case KFD_MQD_TYPE_HIQ:
+		mqd->allocate_mqd = allocate_hiq_mqd;
+		mqd->init_mqd = init_mqd_hiq;
+		mqd->free_mqd = free_mqd_hiq_sdma;
+		mqd->load_mqd = load_mqd;
+		mqd->update_mqd = update_mqd_hiq;
+		mqd->destroy_mqd = destroy_mqd;
+		mqd->is_occupied = is_occupied;
+		mqd->mqd_size = sizeof(struct vi_mqd);
+#if defined(CONFIG_DEBUG_FS)
+		mqd->debugfs_show_mqd = debugfs_show_mqd;
+#endif
+		break;
+	case KFD_MQD_TYPE_DIQ:
+		mqd->allocate_mqd = allocate_hiq_mqd;
 		mqd->init_mqd = init_mqd_hiq;
-		mqd->uninit_mqd = uninit_mqd;
+		mqd->free_mqd = free_mqd;
 		mqd->load_mqd = load_mqd;
 		mqd->update_mqd = update_mqd_hiq;
 		mqd->destroy_mqd = destroy_mqd;
 		mqd->is_occupied = is_occupied;
+		mqd->mqd_size = sizeof(struct vi_mqd);
 #if defined(CONFIG_DEBUG_FS)
 		mqd->debugfs_show_mqd = debugfs_show_mqd;
 #endif
 		break;
 	case KFD_MQD_TYPE_SDMA:
+		mqd->allocate_mqd = allocate_sdma_mqd;
 		mqd->init_mqd = init_mqd_sdma;
-		mqd->uninit_mqd = uninit_mqd_sdma;
+		mqd->free_mqd = free_mqd_hiq_sdma;
 		mqd->load_mqd = load_mqd_sdma;
 		mqd->update_mqd = update_mqd_sdma;
 		mqd->destroy_mqd = destroy_mqd_sdma;
 		mqd->is_occupied = is_occupied_sdma;
+		mqd->mqd_size = sizeof(struct vi_sdma_mqd);
 #if defined(CONFIG_DEBUG_FS)
 		mqd->debugfs_show_mqd = debugfs_show_mqd_sdma;
 #endif
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
index 045a229436a0..83ef4b3dd2fb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
@@ -48,7 +48,8 @@ static void pm_calc_rlib_size(struct packet_manager *pm,
 
 	process_count = pm->dqm->processes_count;
 	queue_count = pm->dqm->queue_count;
-	compute_queue_count = queue_count - pm->dqm->sdma_queue_count;
+	compute_queue_count = queue_count - pm->dqm->sdma_queue_count -
+				pm->dqm->xgmi_sdma_queue_count;
 
 	/* check if there is over subscription
 	 * Note: the arbitration between the number of VMIDs and
@@ -202,11 +203,15 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
 
 	pr_debug("Finished map process and queues to runlist\n");
 
-	if (is_over_subscription)
+	if (is_over_subscription) {
+		if (!pm->is_over_subscription)
+			pr_warn("Runlist is getting oversubscribed. Expect reduced ROCm performance.\n");
 		retval = pm->pmf->runlist(pm, &rl_buffer[rl_wptr],
 					*rl_gpu_addr,
 					alloc_size_bytes / sizeof(uint32_t),
 					true);
+	}
+	pm->is_over_subscription = is_over_subscription;
 
 	for (i = 0; i < alloc_size_bytes / sizeof(uint32_t); i++)
 		pr_debug("0x%2X ", rl_buffer[i]);
@@ -227,14 +232,22 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
 	case CHIP_POLARIS10:
 	case CHIP_POLARIS11:
 	case CHIP_POLARIS12:
+	case CHIP_VEGAM:
 		pm->pmf = &kfd_vi_pm_funcs;
 		break;
 	case CHIP_VEGA10:
 	case CHIP_VEGA12:
 	case CHIP_VEGA20:
 	case CHIP_RAVEN:
+	case CHIP_RENOIR:
+	case CHIP_ARCTURUS:
 		pm->pmf = &kfd_v9_pm_funcs;
 		break;
+	case CHIP_NAVI10:
+	case CHIP_NAVI12:
+	case CHIP_NAVI14:
+		pm->pmf = &kfd_v10_pm_funcs;
+		break;
 	default:
 		WARN(1, "Unexpected ASIC family %u",
 		     dqm->dev->device_info->asic_family);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
index f2bcf5c092ea..4d7add843746 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h
@@ -83,10 +83,10 @@ struct pm4_mes_set_resources {
 
 	union {
 		struct {
-		uint32_t gds_heap_base:6;
-		uint32_t reserved3:5;
-		uint32_t gds_heap_size:6;
-		uint32_t reserved4:15;
+		uint32_t gds_heap_base:10;
+		uint32_t reserved3:1;
+		uint32_t gds_heap_size:10;
+		uint32_t reserved4:11;
 		} bitfields8;
 		uint32_t ordinal8;
 	};
@@ -120,7 +120,7 @@ struct pm4_mes_runlist {
 			uint32_t ib_size:20;
 			uint32_t chain:1;
 			uint32_t offload_polling:1;
-			uint32_t reserved2:1;
+			uint32_t chained_runlist_idle_disable:1;
 			uint32_t valid:1;
 			uint32_t process_cnt:4;
 			uint32_t reserved3:4;
@@ -176,11 +176,10 @@ struct pm4_mes_map_process {
 
 	union {
 		struct {
-			uint32_t num_gws:6;
-			uint32_t reserved7:1;
+			uint32_t num_gws:7;
 			uint32_t sdma_enable:1;
 			uint32_t num_oac:4;
-			uint32_t reserved8:4;
+			uint32_t gds_size_hi:4;
 			uint32_t gds_size:6;
 			uint32_t num_queues:10;
 		} bitfields14;
@@ -255,17 +254,16 @@ enum mes_map_queues_queue_type_enum {
 queue_type__mes_map_queues__low_latency_static_queue_vi = 3
 };
 
-enum mes_map_queues_alloc_format_enum {
-	alloc_format__mes_map_queues__one_per_pipe_vi = 0,
-alloc_format__mes_map_queues__all_on_one_pipe_vi = 1
-};
-
 enum mes_map_queues_engine_sel_enum {
 	engine_sel__mes_map_queues__compute_vi = 0,
 	engine_sel__mes_map_queues__sdma0_vi = 2,
 	engine_sel__mes_map_queues__sdma1_vi = 3
 };
 
+enum mes_map_queues_extended_engine_sel_enum {
+	extended_engine_sel__mes_map_queues__legacy_engine_sel = 0,
+	extended_engine_sel__mes_map_queues__sdma0_to_7_sel = 1
+};
 
 struct pm4_mes_map_queues {
 	union {
@@ -275,11 +273,14 @@ struct pm4_mes_map_queues {
 
 	union {
 		struct {
-			uint32_t reserved1:4;
+			uint32_t reserved1:2;
+			enum mes_map_queues_extended_engine_sel_enum extended_engine_sel:2;
 			enum mes_map_queues_queue_sel_enum queue_sel:2;
-			uint32_t reserved2:15;
+			uint32_t reserved5:6;
+			uint32_t gws_control_queue:1;
+			uint32_t reserved2:8;
 			enum mes_map_queues_queue_type_enum queue_type:3;
-			enum mes_map_queues_alloc_format_enum alloc_format:2;
+			uint32_t reserved3:2;
 			enum mes_map_queues_engine_sel_enum engine_sel:3;
 			uint32_t num_queues:3;
 		} bitfields2;
@@ -386,6 +387,11 @@ enum mes_unmap_queues_engine_sel_enum {
 	engine_sel__mes_unmap_queues__sdmal = 3
 };
 
+enum mes_unmap_queues_extended_engine_sel_enum {
+	extended_engine_sel__mes_unmap_queues__legacy_engine_sel = 0,
+	extended_engine_sel__mes_unmap_queues__sdma0_to_7_sel = 1
+};
+
 struct pm4_mes_unmap_queues {
 	union {
 		union PM4_MES_TYPE_3_HEADER   header;            /* header */
@@ -395,7 +401,7 @@ struct pm4_mes_unmap_queues {
 	union {
 		struct {
 			enum mes_unmap_queues_action_enum action:2;
-			uint32_t reserved1:2;
+			enum mes_unmap_queues_extended_engine_sel_enum extended_engine_sel:2;
 			enum mes_unmap_queues_queue_sel_enum queue_sel:2;
 			uint32_t reserved2:20;
 			enum mes_unmap_queues_engine_sel_enum engine_sel:3;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h
index 7c8d9b357749..5466cfe1c3cc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h
@@ -216,11 +216,6 @@ enum mes_map_queues_queue_type_vi_enum {
 queue_type__mes_map_queues__low_latency_static_queue_vi = 3
 };
 
-enum mes_map_queues_alloc_format_vi_enum {
-	alloc_format__mes_map_queues__one_per_pipe_vi = 0,
-alloc_format__mes_map_queues__all_on_one_pipe_vi = 1
-};
-
 enum mes_map_queues_engine_sel_vi_enum {
 	engine_sel__mes_map_queues__compute_vi = 0,
 	engine_sel__mes_map_queues__sdma0_vi = 2,
@@ -240,7 +235,7 @@ struct pm4_mes_map_queues {
 			enum mes_map_queues_queue_sel_vi_enum queue_sel:2;
 			uint32_t reserved2:15;
 			enum mes_map_queues_queue_type_vi_enum queue_type:3;
-			enum mes_map_queues_alloc_format_vi_enum alloc_format:2;
+			uint32_t reserved3:2;
 			enum mes_map_queues_engine_sel_vi_enum engine_sel:3;
 			uint32_t num_queues:3;
 		} bitfields2;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 0eeee3c6d6dc..060a9e8b301e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -35,6 +35,11 @@
 #include <linux/kfifo.h>
 #include <linux/seq_file.h>
 #include <linux/kref.h>
+#include <linux/sysfs.h>
+#include <linux/device_cgroup.h>
+#include <drm/drm_file.h>
+#include <drm/drm_drv.h>
+#include <drm/drm_device.h>
 #include <kgd_kfd_interface.h>
 
 #include "amd_shared.h"
@@ -59,6 +64,7 @@
 #define KFD_MMAP_TYPE_DOORBELL	(0x3ULL << KFD_MMAP_TYPE_SHIFT)
 #define KFD_MMAP_TYPE_EVENTS	(0x2ULL << KFD_MMAP_TYPE_SHIFT)
 #define KFD_MMAP_TYPE_RESERVED_MEM	(0x1ULL << KFD_MMAP_TYPE_SHIFT)
+#define KFD_MMAP_TYPE_MMIO	(0x0ULL << KFD_MMAP_TYPE_SHIFT)
 
 #define KFD_MMAP_GPU_ID_SHIFT (46 - PAGE_SHIFT)
 #define KFD_MMAP_GPU_ID_MASK (((1ULL << KFD_GPU_ID_HASH_WIDTH) - 1) \
@@ -103,6 +109,8 @@
 
 #define KFD_KERNEL_QUEUE_SIZE 2048
 
+#define KFD_UNMAP_LATENCY_MS	(4000)
+
 /*
  * 512 = 0x200
  * The doorbell index distance between SDMA RLC (2*i) and (2*i+1) in the
@@ -153,13 +161,23 @@ extern int ignore_crat;
 /*
  * Set sh_mem_config.retry_disable on Vega10
  */
-extern int noretry;
+extern int amdgpu_noretry;
 
 /*
  * Halt if HWS hang is detected
  */
 extern int halt_if_hws_hang;
 
+/*
+ * Whether MEC FW support GWS barriers
+ */
+extern bool hws_gws_support;
+
+/*
+ * Queue preemption timeout in ms
+ */
+extern int queue_preemption_timeout_ms;
+
 enum cache_policy {
 	cache_policy_coherent,
 	cache_policy_noncoherent
@@ -177,6 +195,7 @@ struct kfd_event_interrupt_class {
 
 struct kfd_device_info {
 	enum amd_asic_type asic_family;
+	const char *asic_name;
 	const struct kfd_event_interrupt_class *event_interrupt_class;
 	unsigned int max_pasid_bits;
 	unsigned int max_no_of_hqd;
@@ -188,6 +207,7 @@ struct kfd_device_info {
 	bool needs_iommu_device;
 	bool needs_pci_atomics;
 	unsigned int num_sdma_engines;
+	unsigned int num_xgmi_sdma_engines;
 	unsigned int num_sdma_queues_per_engine;
 };
 
@@ -210,6 +230,7 @@ struct kfd_dev {
 
 	const struct kfd_device_info *device_info;
 	struct pci_dev *pdev;
+	struct drm_device *ddev;
 
 	unsigned int id;		/* topology stub index */
 
@@ -258,7 +279,7 @@ struct kfd_dev {
 	bool interrupts_active;
 
 	/* Debug manager */
-	struct kfd_dbgmgr           *dbgmgr;
+	struct kfd_dbgmgr *dbgmgr;
 
 	/* Firmware versions */
 	uint16_t mec_fw_version;
@@ -276,6 +297,15 @@ struct kfd_dev {
 	uint64_t hive_id;
 
 	bool pci_atomic_requested;
+
+	/* SRAM ECC flag */
+	atomic_t sram_ecc_flag;
+
+	/* Compute Profile ref. count */
+	atomic_t compute_profile;
+
+	/* Global GWS resource shared b/t processes*/
+	void *gws;
 };
 
 enum kfd_mempool {
@@ -323,7 +353,8 @@ enum kfd_queue_type  {
 	KFD_QUEUE_TYPE_COMPUTE,
 	KFD_QUEUE_TYPE_SDMA,
 	KFD_QUEUE_TYPE_HIQ,
-	KFD_QUEUE_TYPE_DIQ
+	KFD_QUEUE_TYPE_DIQ,
+	KFD_QUEUE_TYPE_SDMA_XGMI
 };
 
 enum kfd_queue_format {
@@ -331,6 +362,11 @@ enum kfd_queue_format {
 	KFD_QUEUE_FORMAT_AQL
 };
 
+enum KFD_QUEUE_PRIORITY {
+	KFD_QUEUE_PRIORITY_MINIMUM = 0,
+	KFD_QUEUE_PRIORITY_MAXIMUM = 15
+};
+
 /**
  * struct queue_properties
  *
@@ -413,6 +449,11 @@ struct queue_properties {
 	uint32_t *cu_mask;
 };
 
+#define QUEUE_IS_ACTIVE(q) ((q).queue_size > 0 &&	\
+			    (q).queue_address != 0 &&	\
+			    (q).queue_percent > 0 &&	\
+			    !(q).is_evicted)
+
 /**
  * struct queue
  *
@@ -438,6 +479,9 @@ struct queue_properties {
  *
  * @device: The kfd device that created this queue.
  *
+ * @gws: Pointing to gws kgd_mem if this is a gws control queue; NULL
+ * otherwise.
+ *
  * This structure represents user mode compute queues.
  * It contains all the necessary data to handle such queues.
  *
@@ -459,6 +503,7 @@ struct queue {
 
 	struct kfd_process	*process;
 	struct kfd_dev		*device;
+	void *gws;
 };
 
 /*
@@ -469,9 +514,16 @@ enum KFD_MQD_TYPE {
 	KFD_MQD_TYPE_HIQ,		/* for hiq */
 	KFD_MQD_TYPE_CP,		/* for cp queues and diq */
 	KFD_MQD_TYPE_SDMA,		/* for sdma queues */
+	KFD_MQD_TYPE_DIQ,		/* for diq */
 	KFD_MQD_TYPE_MAX
 };
 
+enum KFD_PIPE_PRIORITY {
+	KFD_PIPE_PRIORITY_CS_LOW = 0,
+	KFD_PIPE_PRIORITY_CS_MEDIUM,
+	KFD_PIPE_PRIORITY_CS_HIGH
+};
+
 struct scheduling_resources {
 	unsigned int vmid_mask;
 	enum kfd_queue_type type;
@@ -636,10 +688,7 @@ struct kfd_process {
 	/* We want to receive a notification when the mm_struct is destroyed */
 	struct mmu_notifier mmu_notifier;
 
-	/* Use for delayed freeing of kfd_process structure */
-	struct rcu_head	rcu;
-
-	unsigned int pasid;
+	uint16_t pasid;
 	unsigned int doorbell_index;
 
 	/*
@@ -680,6 +729,10 @@ struct kfd_process {
 	 * restored after an eviction
 	 */
 	unsigned long last_restore_timestamp;
+
+	/* Kobj for our procfs */
+	struct kobject *kobj;
+	struct attribute attr_pasid;
 };
 
 #define KFD_PROCESS_TABLE_SIZE 5 /* bits: 32 entries */
@@ -782,6 +835,10 @@ int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj);
 
 extern struct device *kfd_device;
 
+/* KFD's procfs */
+void kfd_procfs_init(void);
+void kfd_procfs_shutdown(void);
+
 /* Topology */
 int kfd_topology_init(void);
 void kfd_topology_shutdown(void);
@@ -813,8 +870,6 @@ void uninit_queue(struct queue *q);
 void print_queue_properties(struct queue_properties *q);
 void print_queue(struct queue *q);
 
-struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type,
-					struct kfd_dev *dev);
 struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type,
 		struct kfd_dev *dev);
 struct mqd_manager *mqd_manager_init_cik_hawaii(enum KFD_MQD_TYPE type,
@@ -825,6 +880,8 @@ struct mqd_manager *mqd_manager_init_vi_tonga(enum KFD_MQD_TYPE type,
 		struct kfd_dev *dev);
 struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type,
 		struct kfd_dev *dev);
+struct mqd_manager *mqd_manager_init_v10(enum KFD_MQD_TYPE type,
+		struct kfd_dev *dev);
 struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev);
 void device_queue_manager_uninit(struct device_queue_manager *dqm);
 struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
@@ -853,6 +910,8 @@ int pqm_update_queue(struct process_queue_manager *pqm, unsigned int qid,
 			struct queue_properties *p);
 int pqm_set_cu_mask(struct process_queue_manager *pqm, unsigned int qid,
 			struct queue_properties *p);
+int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid,
+			void *gws);
 struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm,
 						unsigned int qid);
 int pqm_get_wave_state(struct process_queue_manager *pqm,
@@ -862,8 +921,8 @@ int pqm_get_wave_state(struct process_queue_manager *pqm,
 		       u32 *save_area_used_size);
 
 int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
-				unsigned int fence_value,
-				unsigned int timeout_ms);
+			      unsigned int fence_value,
+			      unsigned int timeout_ms);
 
 /* Packet Manager */
 
@@ -877,6 +936,7 @@ struct packet_manager {
 	bool allocated;
 	struct kfd_mem_obj *ib_buffer_obj;
 	unsigned int ib_size_bytes;
+	bool is_over_subscription;
 
 	const struct packet_manager_funcs *pmf;
 };
@@ -912,6 +972,7 @@ struct packet_manager_funcs {
 
 extern const struct packet_manager_funcs kfd_vi_pm_funcs;
 extern const struct packet_manager_funcs kfd_v9_pm_funcs;
+extern const struct packet_manager_funcs kfd_v10_pm_funcs;
 
 int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm);
 void pm_uninit(struct packet_manager *pm);
@@ -931,7 +992,8 @@ void pm_release_ib(struct packet_manager *pm);
 /* Following PM funcs can be shared among VI and AI */
 unsigned int pm_build_pm4_header(unsigned int opcode, size_t packet_size);
 int pm_set_resources_vi(struct packet_manager *pm, uint32_t *buffer,
-				struct scheduling_resources *res);
+			struct scheduling_resources *res);
+
 
 uint64_t kfd_get_number_elems(struct kfd_dev *kfd);
 
@@ -975,6 +1037,25 @@ int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p);
 
 bool kfd_is_locked(void);
 
+/* Compute profile */
+void kfd_inc_compute_active(struct kfd_dev *dev);
+void kfd_dec_compute_active(struct kfd_dev *dev);
+
+/* Cgroup Support */
+/* Check with device cgroup if @kfd device is accessible */
+static inline int kfd_devcgroup_check_permission(struct kfd_dev *kfd)
+{
+#if defined(CONFIG_CGROUP_DEVICE)
+	struct drm_device *ddev = kfd->ddev;
+
+	return devcgroup_check_permission(DEVCG_DEV_CHAR, ddev->driver->major,
+					  ddev->render->index,
+					  DEVCG_ACC_WRITE | DEVCG_ACC_READ);
+#else
+	return 0;
+#endif
+}
+
 /* Debugfs */
 #if defined(CONFIG_DEBUG_FS)
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 4bdae78bab8e..10f9af5784f2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -62,12 +62,74 @@ static struct workqueue_struct *kfd_restore_wq;
 
 static struct kfd_process *find_process(const struct task_struct *thread);
 static void kfd_process_ref_release(struct kref *ref);
-static struct kfd_process *create_process(const struct task_struct *thread,
-					struct file *filep);
+static struct kfd_process *create_process(const struct task_struct *thread);
+static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep);
 
 static void evict_process_worker(struct work_struct *work);
 static void restore_process_worker(struct work_struct *work);
 
+struct kfd_procfs_tree {
+	struct kobject *kobj;
+};
+
+static struct kfd_procfs_tree procfs;
+
+static ssize_t kfd_procfs_show(struct kobject *kobj, struct attribute *attr,
+			       char *buffer)
+{
+	int val = 0;
+
+	if (strcmp(attr->name, "pasid") == 0) {
+		struct kfd_process *p = container_of(attr, struct kfd_process,
+						     attr_pasid);
+		val = p->pasid;
+	} else {
+		pr_err("Invalid attribute");
+		return -EINVAL;
+	}
+
+	return snprintf(buffer, PAGE_SIZE, "%d\n", val);
+}
+
+static void kfd_procfs_kobj_release(struct kobject *kobj)
+{
+	kfree(kobj);
+}
+
+static const struct sysfs_ops kfd_procfs_ops = {
+	.show = kfd_procfs_show,
+};
+
+static struct kobj_type procfs_type = {
+	.release = kfd_procfs_kobj_release,
+	.sysfs_ops = &kfd_procfs_ops,
+};
+
+void kfd_procfs_init(void)
+{
+	int ret = 0;
+
+	procfs.kobj = kfd_alloc_struct(procfs.kobj);
+	if (!procfs.kobj)
+		return;
+
+	ret = kobject_init_and_add(procfs.kobj, &procfs_type,
+				   &kfd_device->kobj, "proc");
+	if (ret) {
+		pr_warn("Could not create procfs proc folder");
+		/* If we fail to create the procfs, clean up */
+		kfd_procfs_shutdown();
+	}
+}
+
+void kfd_procfs_shutdown(void)
+{
+	if (procfs.kobj) {
+		kobject_del(procfs.kobj);
+		kobject_put(procfs.kobj);
+		procfs.kobj = NULL;
+	}
+}
 
 int kfd_process_create_wq(void)
 {
@@ -206,6 +268,7 @@ struct kfd_process *kfd_create_process(struct file *filep)
 {
 	struct kfd_process *process;
 	struct task_struct *thread = current;
+	int ret;
 
 	if (!thread->mm)
 		return ERR_PTR(-EINVAL);
@@ -223,11 +286,44 @@ struct kfd_process *kfd_create_process(struct file *filep)
 
 	/* A prior open of /dev/kfd could have already created the process. */
 	process = find_process(thread);
-	if (process)
+	if (process) {
 		pr_debug("Process already found\n");
-	else
-		process = create_process(thread, filep);
+	} else {
+		process = create_process(thread);
+		if (IS_ERR(process))
+			goto out;
+
+		ret = kfd_process_init_cwsr_apu(process, filep);
+		if (ret) {
+			process = ERR_PTR(ret);
+			goto out;
+		}
+
+		if (!procfs.kobj)
+			goto out;
+
+		process->kobj = kfd_alloc_struct(process->kobj);
+		if (!process->kobj) {
+			pr_warn("Creating procfs kobject failed");
+			goto out;
+		}
+		ret = kobject_init_and_add(process->kobj, &procfs_type,
+					   procfs.kobj, "%d",
+					   (int)process->lead_thread->pid);
+		if (ret) {
+			pr_warn("Creating procfs pid directory failed");
+			goto out;
+		}
 
+		process->attr_pasid.name = "pasid";
+		process->attr_pasid.mode = KFD_SYSFS_FILE_MODE;
+		sysfs_attr_init(&process->attr_pasid);
+		ret = sysfs_create_file(process->kobj, &process->attr_pasid);
+		if (ret)
+			pr_warn("Creating pasid for pid %d failed",
+					(int)process->lead_thread->pid);
+	}
+out:
 	mutex_unlock(&kfd_processes_mutex);
 
 	return process;
@@ -320,7 +416,7 @@ static void kfd_process_destroy_pdds(struct kfd_process *p)
 
 	list_for_each_entry_safe(pdd, temp, &p->per_device_data,
 				 per_device_list) {
-		pr_debug("Releasing pdd (topology id %d) for process (pasid %d)\n",
+		pr_debug("Releasing pdd (topology id %d) for process (pasid 0x%x)\n",
 				pdd->dev->id, p->pasid);
 
 		if (pdd->drm_file) {
@@ -355,6 +451,14 @@ static void kfd_process_wq_release(struct work_struct *work)
 	struct kfd_process *p = container_of(work, struct kfd_process,
 					     release_work);
 
+	/* Remove the procfs files */
+	if (p->kobj) {
+		sysfs_remove_file(p->kobj, &p->attr_pasid);
+		kobject_del(p->kobj);
+		kobject_put(p->kobj);
+		p->kobj = NULL;
+	}
+
 	kfd_iommu_unbind_process(p);
 
 	kfd_process_free_outstanding_kfd_bos(p);
@@ -382,11 +486,9 @@ static void kfd_process_ref_release(struct kref *ref)
 	queue_work(kfd_process_wq, &p->release_work);
 }
 
-static void kfd_process_destroy_delayed(struct rcu_head *rcu)
+static void kfd_process_free_notifier(struct mmu_notifier *mn)
 {
-	struct kfd_process *p = container_of(rcu, struct kfd_process, rcu);
-
-	kfd_unref_process(p);
+	kfd_unref_process(container_of(mn, struct kfd_process, mmu_notifier));
 }
 
 static void kfd_process_notifier_release(struct mmu_notifier *mn,
@@ -438,12 +540,12 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
 
 	mutex_unlock(&p->mutex);
 
-	mmu_notifier_unregister_no_release(&p->mmu_notifier, mm);
-	mmu_notifier_call_srcu(&p->rcu, &kfd_process_destroy_delayed);
+	mmu_notifier_put(&p->mmu_notifier);
 }
 
 static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = {
 	.release = kfd_process_notifier_release,
+	.free_notifier = kfd_process_free_notifier,
 };
 
 static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
@@ -513,81 +615,69 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
 	return 0;
 }
 
-static struct kfd_process *create_process(const struct task_struct *thread,
-					struct file *filep)
+/*
+ * On return the kfd_process is fully operational and will be freed when the
+ * mm is released
+ */
+static struct kfd_process *create_process(const struct task_struct *thread)
 {
 	struct kfd_process *process;
 	int err = -ENOMEM;
 
 	process = kzalloc(sizeof(*process), GFP_KERNEL);
-
 	if (!process)
 		goto err_alloc_process;
 
-	process->pasid = kfd_pasid_alloc();
-	if (process->pasid == 0)
-		goto err_alloc_pasid;
-
-	if (kfd_alloc_process_doorbells(process) < 0)
-		goto err_alloc_doorbells;
-
 	kref_init(&process->ref);
-
 	mutex_init(&process->mutex);
-
 	process->mm = thread->mm;
-
-	/* register notifier */
-	process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops;
-	err = mmu_notifier_register(&process->mmu_notifier, process->mm);
-	if (err)
-		goto err_mmu_notifier;
-
-	hash_add_rcu(kfd_processes_table, &process->kfd_processes,
-			(uintptr_t)process->mm);
-
 	process->lead_thread = thread->group_leader;
-	get_task_struct(process->lead_thread);
-
 	INIT_LIST_HEAD(&process->per_device_data);
-
+	INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker);
+	INIT_DELAYED_WORK(&process->restore_work, restore_process_worker);
+	process->last_restore_timestamp = get_jiffies_64();
 	kfd_event_init_process(process);
+	process->is_32bit_user_mode = in_compat_syscall();
+
+	process->pasid = kfd_pasid_alloc();
+	if (process->pasid == 0)
+		goto err_alloc_pasid;
+
+	if (kfd_alloc_process_doorbells(process) < 0)
+		goto err_alloc_doorbells;
 
 	err = pqm_init(&process->pqm, process);
 	if (err != 0)
 		goto err_process_pqm_init;
 
 	/* init process apertures*/
-	process->is_32bit_user_mode = in_compat_syscall();
 	err = kfd_init_apertures(process);
 	if (err != 0)
 		goto err_init_apertures;
 
-	INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker);
-	INIT_DELAYED_WORK(&process->restore_work, restore_process_worker);
-	process->last_restore_timestamp = get_jiffies_64();
-
-	err = kfd_process_init_cwsr_apu(process, filep);
+	/* Must be last, have to use release destruction after this */
+	process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops;
+	err = mmu_notifier_register(&process->mmu_notifier, process->mm);
 	if (err)
-		goto err_init_cwsr;
+		goto err_register_notifier;
+
+	get_task_struct(process->lead_thread);
+	hash_add_rcu(kfd_processes_table, &process->kfd_processes,
+			(uintptr_t)process->mm);
 
 	return process;
 
-err_init_cwsr:
+err_register_notifier:
 	kfd_process_free_outstanding_kfd_bos(process);
 	kfd_process_destroy_pdds(process);
 err_init_apertures:
 	pqm_uninit(&process->pqm);
 err_process_pqm_init:
-	hash_del_rcu(&process->kfd_processes);
-	synchronize_rcu();
-	mmu_notifier_unregister_no_release(&process->mmu_notifier, process->mm);
-err_mmu_notifier:
-	mutex_destroy(&process->mutex);
 	kfd_free_process_doorbells(process);
 err_alloc_doorbells:
 	kfd_pasid_free(process->pasid);
 err_alloc_pasid:
+	mutex_destroy(&process->mutex);
 	kfree(process);
 err_alloc_process:
 	return ERR_PTR(err);
@@ -597,6 +687,8 @@ static int init_doorbell_bitmap(struct qcm_process_device *qpd,
 			struct kfd_dev *dev)
 {
 	unsigned int i;
+	int range_start = dev->shared_resources.non_cp_doorbells_start;
+	int range_end = dev->shared_resources.non_cp_doorbells_end;
 
 	if (!KFD_IS_SOC15(dev->device_info->asic_family))
 		return 0;
@@ -608,14 +700,16 @@ static int init_doorbell_bitmap(struct qcm_process_device *qpd,
 		return -ENOMEM;
 
 	/* Mask out doorbells reserved for SDMA, IH, and VCN on SOC15. */
+	pr_debug("reserved doorbell 0x%03x - 0x%03x\n", range_start, range_end);
+	pr_debug("reserved doorbell 0x%03x - 0x%03x\n",
+			range_start + KFD_QUEUE_DOORBELL_MIRROR_OFFSET,
+			range_end + KFD_QUEUE_DOORBELL_MIRROR_OFFSET);
+
 	for (i = 0; i < KFD_MAX_NUM_OF_QUEUES_PER_PROCESS / 2; i++) {
-		if (i >= dev->shared_resources.non_cp_doorbells_start
-			&& i <= dev->shared_resources.non_cp_doorbells_end) {
+		if (i >= range_start && i <= range_end) {
 			set_bit(i, qpd->doorbell_bitmap);
 			set_bit(i + KFD_QUEUE_DOORBELL_MIRROR_OFFSET,
 				qpd->doorbell_bitmap);
-			pr_debug("reserved doorbell 0x%03x and 0x%03x\n", i,
-				i + KFD_QUEUE_DOORBELL_MIRROR_OFFSET);
 		}
 	}
 
@@ -705,6 +799,8 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd,
 		return ret;
 	}
 
+	amdgpu_vm_set_task_info(pdd->vm);
+
 	ret = kfd_process_device_reserve_ib_mem(pdd);
 	if (ret)
 		goto err_reserve_ib_mem;
@@ -928,7 +1024,7 @@ static void evict_process_worker(struct work_struct *work)
 	 */
 	flush_delayed_work(&p->restore_work);
 
-	pr_debug("Started evicting pasid %d\n", p->pasid);
+	pr_debug("Started evicting pasid 0x%x\n", p->pasid);
 	ret = kfd_process_evict_queues(p);
 	if (!ret) {
 		dma_fence_signal(p->ef);
@@ -937,16 +1033,15 @@ static void evict_process_worker(struct work_struct *work)
 		queue_delayed_work(kfd_restore_wq, &p->restore_work,
 				msecs_to_jiffies(PROCESS_RESTORE_TIME_MS));
 
-		pr_debug("Finished evicting pasid %d\n", p->pasid);
+		pr_debug("Finished evicting pasid 0x%x\n", p->pasid);
 	} else
-		pr_err("Failed to evict queues of pasid %d\n", p->pasid);
+		pr_err("Failed to evict queues of pasid 0x%x\n", p->pasid);
 }
 
 static void restore_process_worker(struct work_struct *work)
 {
 	struct delayed_work *dwork;
 	struct kfd_process *p;
-	struct kfd_process_device *pdd;
 	int ret = 0;
 
 	dwork = to_delayed_work(work);
@@ -955,17 +1050,7 @@ static void restore_process_worker(struct work_struct *work)
 	 * lifetime of this thread, kfd_process p will be valid
 	 */
 	p = container_of(dwork, struct kfd_process, restore_work);
-
-	/* Call restore_process_bos on the first KGD device. This function
-	 * takes care of restoring the whole process including other devices.
-	 * Restore can fail if enough memory is not available. If so,
-	 * reschedule again.
-	 */
-	pdd = list_first_entry(&p->per_device_data,
-			       struct kfd_process_device,
-			       per_device_list);
-
-	pr_debug("Started restoring pasid %d\n", p->pasid);
+	pr_debug("Started restoring pasid 0x%x\n", p->pasid);
 
 	/* Setting last_restore_timestamp before successful restoration.
 	 * Otherwise this would have to be set by KGD (restore_process_bos)
@@ -981,7 +1066,7 @@ static void restore_process_worker(struct work_struct *work)
 	ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p->kgd_process_info,
 						     &p->ef);
 	if (ret) {
-		pr_debug("Failed to restore BOs of pasid %d, retry after %d ms\n",
+		pr_debug("Failed to restore BOs of pasid 0x%x, retry after %d ms\n",
 			 p->pasid, PROCESS_BACK_OFF_TIME_MS);
 		ret = queue_delayed_work(kfd_restore_wq, &p->restore_work,
 				msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS));
@@ -991,9 +1076,9 @@ static void restore_process_worker(struct work_struct *work)
 
 	ret = kfd_process_restore_queues(p);
 	if (!ret)
-		pr_debug("Finished restoring pasid %d\n", p->pasid);
+		pr_debug("Finished restoring pasid 0x%x\n", p->pasid);
 	else
-		pr_err("Failed to restore queues of pasid %d\n", p->pasid);
+		pr_err("Failed to restore queues of pasid 0x%x\n", p->pasid);
 }
 
 void kfd_suspend_all_processes(void)
@@ -1007,7 +1092,7 @@ void kfd_suspend_all_processes(void)
 		cancel_delayed_work_sync(&p->restore_work);
 
 		if (kfd_process_evict_queues(p))
-			pr_err("Failed to suspend process %d\n", p->pasid);
+			pr_err("Failed to suspend process 0x%x\n", p->pasid);
 		dma_fence_signal(p->ef);
 		dma_fence_put(p->ef);
 		p->ef = NULL;
@@ -1090,7 +1175,7 @@ int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data)
 	int idx = srcu_read_lock(&kfd_processes_srcu);
 
 	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
-		seq_printf(m, "Process %d PASID %d:\n",
+		seq_printf(m, "Process %d PASID 0x%x:\n",
 			   p->lead_thread->tgid, p->pasid);
 
 		mutex_lock(&p->mutex);
@@ -1107,3 +1192,4 @@ int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data)
 }
 
 #endif
+
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index fcaaf93681ac..2659d226c056 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -26,6 +26,7 @@
 #include "kfd_device_queue_manager.h"
 #include "kfd_priv.h"
 #include "kfd_kernel_queue.h"
+#include "amdgpu_amdkfd.h"
 
 static inline struct process_queue_node *get_queue_by_qid(
 			struct process_queue_manager *pqm, unsigned int qid)
@@ -52,7 +53,7 @@ static int find_available_queue_slot(struct process_queue_manager *pqm,
 	pr_debug("The new slot id %lu\n", found);
 
 	if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) {
-		pr_info("Cannot open more queues for process with pasid %d\n",
+		pr_info("Cannot open more queues for process with pasid 0x%x\n",
 				pqm->process->pasid);
 		return -ENOMEM;
 	}
@@ -74,6 +75,55 @@ void kfd_process_dequeue_from_device(struct kfd_process_device *pdd)
 	pdd->already_dequeued = true;
 }
 
+int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid,
+			void *gws)
+{
+	struct kfd_dev *dev = NULL;
+	struct process_queue_node *pqn;
+	struct kfd_process_device *pdd;
+	struct kgd_mem *mem = NULL;
+	int ret;
+
+	pqn = get_queue_by_qid(pqm, qid);
+	if (!pqn) {
+		pr_err("Queue id does not match any known queue\n");
+		return -EINVAL;
+	}
+
+	if (pqn->q)
+		dev = pqn->q->device;
+	if (WARN_ON(!dev))
+		return -ENODEV;
+
+	pdd = kfd_get_process_device_data(dev, pqm->process);
+	if (!pdd) {
+		pr_err("Process device data doesn't exist\n");
+		return -EINVAL;
+	}
+
+	/* Only allow one queue per process can have GWS assigned */
+	if (gws && pdd->qpd.num_gws)
+		return -EBUSY;
+
+	if (!gws && pdd->qpd.num_gws == 0)
+		return -EINVAL;
+
+	if (gws)
+		ret = amdgpu_amdkfd_add_gws_to_process(pdd->process->kgd_process_info,
+			gws, &mem);
+	else
+		ret = amdgpu_amdkfd_remove_gws_from_process(pdd->process->kgd_process_info,
+			pqn->q->gws);
+	if (unlikely(ret))
+		return ret;
+
+	pqn->q->gws = mem;
+	pdd->qpd.num_gws = gws ? amdgpu_amdkfd_get_num_gws(dev->kgd) : 0;
+
+	return pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm,
+							pqn->q);
+}
+
 void kfd_process_dequeue_from_all_devices(struct kfd_process *p)
 {
 	struct kfd_process_device *pdd;
@@ -100,6 +150,9 @@ void pqm_uninit(struct process_queue_manager *pqm)
 	struct process_queue_node *pqn, *next;
 
 	list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) {
+		if (pqn->q && pqn->q->gws)
+			amdgpu_amdkfd_remove_gws_from_process(pqm->process->kgd_process_info,
+				pqn->q->gws);
 		uninit_queue(pqn->q);
 		list_del(&pqn->process_queue_list);
 		kfree(pqn);
@@ -186,8 +239,13 @@ int pqm_create_queue(struct process_queue_manager *pqm,
 
 	switch (type) {
 	case KFD_QUEUE_TYPE_SDMA:
-		if (dev->dqm->queue_count >= get_num_sdma_queues(dev->dqm)) {
-			pr_err("Over-subscription is not allowed for SDMA.\n");
+	case KFD_QUEUE_TYPE_SDMA_XGMI:
+		if ((type == KFD_QUEUE_TYPE_SDMA && dev->dqm->sdma_queue_count
+			>= get_num_sdma_queues(dev->dqm)) ||
+			(type == KFD_QUEUE_TYPE_SDMA_XGMI &&
+			dev->dqm->xgmi_sdma_queue_count
+			>= get_num_xgmi_sdma_queues(dev->dqm))) {
+			pr_debug("Over-subscription is not allowed for SDMA.\n");
 			retval = -EPERM;
 			goto err_create_queue;
 		}
@@ -240,7 +298,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
 	}
 
 	if (retval != 0) {
-		pr_err("Pasid %d DQM create queue %d failed. ret %d\n",
+		pr_err("Pasid 0x%x DQM create queue %d failed. ret %d\n",
 			pqm->process->pasid, type, retval);
 		goto err_create_queue;
 	}
@@ -319,12 +377,19 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
 		dqm = pqn->q->device->dqm;
 		retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q);
 		if (retval) {
-			pr_err("Pasid %d destroy queue %d failed, ret %d\n",
+			pr_err("Pasid 0x%x destroy queue %d failed, ret %d\n",
 				pqm->process->pasid,
 				pqn->q->properties.queue_id, retval);
 			if (retval != -ETIME)
 				goto err_destroy_queue;
 		}
+
+		if (pqn->q->gws) {
+			amdgpu_amdkfd_remove_gws_from_process(pqm->process->kgd_process_info,
+				pqn->q->gws);
+			pdd->qpd.num_gws = 0;
+		}
+
 		kfree(pqn->q->properties.cu_mask);
 		pqn->q->properties.cu_mask = NULL;
 		uninit_queue(pqn->q);
@@ -446,6 +511,7 @@ int pqm_debugfs_mqds(struct seq_file *m, void *data)
 			q = pqn->q;
 			switch (q->properties.type) {
 			case KFD_QUEUE_TYPE_SDMA:
+			case KFD_QUEUE_TYPE_SDMA_XGMI:
 				seq_printf(m, "  SDMA queue on device %x\n",
 					   q->device->id);
 				mqd_type = KFD_MQD_TYPE_SDMA;
@@ -461,8 +527,7 @@ int pqm_debugfs_mqds(struct seq_file *m, void *data)
 					   q->properties.type, q->device->id);
 				continue;
 			}
-			mqd_mgr = q->device->dqm->ops.get_mqd_manager(
-				q->device->dqm, mqd_type);
+			mqd_mgr = q->device->dqm->mqd_mgrs[mqd_type];
 		} else if (pqn->kq) {
 			q = pqn->kq->queue;
 			mqd_mgr = pqn->kq->mqd_mgr;
@@ -470,7 +535,6 @@ int pqm_debugfs_mqds(struct seq_file *m, void *data)
 			case KFD_QUEUE_TYPE_DIQ:
 				seq_printf(m, "  DIQ on device %x\n",
 					   pqn->kq->dev->id);
-				mqd_type = KFD_MQD_TYPE_HIQ;
 				break;
 			default:
 				seq_printf(m,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 09da91644f9f..69bd0628fdc6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -37,6 +37,7 @@
 #include "kfd_device_queue_manager.h"
 #include "kfd_iommu.h"
 #include "amdgpu_amdkfd.h"
+#include "amdgpu_ras.h"
 
 /* topology_device_list - Master list of all topology devices */
 static struct list_head topology_device_list;
@@ -268,6 +269,8 @@ static ssize_t iolink_show(struct kobject *kobj, struct attribute *attr,
 	buffer[0] = 0;
 
 	iolink = container_of(attr, struct kfd_iolink_properties, attr);
+	if (iolink->gpu && kfd_devcgroup_check_permission(iolink->gpu))
+		return -EPERM;
 	sysfs_show_32bit_prop(buffer, "type", iolink->iolink_type);
 	sysfs_show_32bit_prop(buffer, "version_major", iolink->ver_maj);
 	sysfs_show_32bit_prop(buffer, "version_minor", iolink->ver_min);
@@ -304,6 +307,8 @@ static ssize_t mem_show(struct kobject *kobj, struct attribute *attr,
 	buffer[0] = 0;
 
 	mem = container_of(attr, struct kfd_mem_properties, attr);
+	if (mem->gpu && kfd_devcgroup_check_permission(mem->gpu))
+		return -EPERM;
 	sysfs_show_32bit_prop(buffer, "heap_type", mem->heap_type);
 	sysfs_show_64bit_prop(buffer, "size_in_bytes", mem->size_in_bytes);
 	sysfs_show_32bit_prop(buffer, "flags", mem->flags);
@@ -333,6 +338,8 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr,
 	buffer[0] = 0;
 
 	cache = container_of(attr, struct kfd_cache_properties, attr);
+	if (cache->gpu && kfd_devcgroup_check_permission(cache->gpu))
+		return -EPERM;
 	sysfs_show_32bit_prop(buffer, "processor_id_low",
 			cache->processor_id_low);
 	sysfs_show_32bit_prop(buffer, "level", cache->cache_level);
@@ -405,8 +412,6 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
 		char *buffer)
 {
 	struct kfd_topology_device *dev;
-	char public_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE];
-	uint32_t i;
 	uint32_t log_max_watch_addr;
 
 	/* Making sure that the buffer is an empty string */
@@ -415,24 +420,24 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
 	if (strcmp(attr->name, "gpu_id") == 0) {
 		dev = container_of(attr, struct kfd_topology_device,
 				attr_gpuid);
+		if (dev->gpu && kfd_devcgroup_check_permission(dev->gpu))
+			return -EPERM;
 		return sysfs_show_32bit_val(buffer, dev->gpu_id);
 	}
 
 	if (strcmp(attr->name, "name") == 0) {
 		dev = container_of(attr, struct kfd_topology_device,
 				attr_name);
-		for (i = 0; i < KFD_TOPOLOGY_PUBLIC_NAME_SIZE; i++) {
-			public_name[i] =
-					(char)dev->node_props.marketing_name[i];
-			if (dev->node_props.marketing_name[i] == 0)
-				break;
-		}
-		public_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE-1] = 0x0;
-		return sysfs_show_str_val(buffer, public_name);
+
+		if (dev->gpu && kfd_devcgroup_check_permission(dev->gpu))
+			return -EPERM;
+		return sysfs_show_str_val(buffer, dev->node_props.name);
 	}
 
 	dev = container_of(attr, struct kfd_topology_device,
 			attr_props);
+	if (dev->gpu && kfd_devcgroup_check_permission(dev->gpu))
+		return -EPERM;
 	sysfs_show_32bit_prop(buffer, "cpu_cores_count",
 			dev->node_props.cpu_cores_count);
 	sysfs_show_32bit_prop(buffer, "simd_count",
@@ -453,6 +458,8 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
 			dev->node_props.lds_size_in_kb);
 	sysfs_show_32bit_prop(buffer, "gds_size_in_kb",
 			dev->node_props.gds_size_in_kb);
+	sysfs_show_32bit_prop(buffer, "num_gws",
+			dev->node_props.num_gws);
 	sysfs_show_32bit_prop(buffer, "wave_front_size",
 			dev->node_props.wave_front_size);
 	sysfs_show_32bit_prop(buffer, "array_count",
@@ -475,6 +482,10 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
 			dev->node_props.drm_render_minor);
 	sysfs_show_64bit_prop(buffer, "hive_id",
 			dev->node_props.hive_id);
+	sysfs_show_32bit_prop(buffer, "num_sdma_engines",
+			dev->node_props.num_sdma_engines);
+	sysfs_show_32bit_prop(buffer, "num_sdma_xgmi_engines",
+			dev->node_props.num_sdma_xgmi_engines);
 
 	if (dev->gpu) {
 		log_max_watch_addr =
@@ -1077,8 +1088,9 @@ static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu)
 			local_mem_info.local_mem_size_public;
 
 	buf[0] = gpu->pdev->devfn;
-	buf[1] = gpu->pdev->subsystem_vendor;
-	buf[2] = gpu->pdev->subsystem_device;
+	buf[1] = gpu->pdev->subsystem_vendor |
+		(gpu->pdev->subsystem_device << 16);
+	buf[2] = pci_domain_nr(gpu->pdev->bus);
 	buf[3] = gpu->pdev->device;
 	buf[4] = gpu->pdev->bus->number;
 	buf[5] = lower_32_bits(local_mem_size);
@@ -1098,6 +1110,9 @@ static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu)
 {
 	struct kfd_topology_device *dev;
 	struct kfd_topology_device *out_dev = NULL;
+	struct kfd_mem_properties *mem;
+	struct kfd_cache_properties *cache;
+	struct kfd_iolink_properties *iolink;
 
 	down_write(&topology_lock);
 	list_for_each_entry(dev, &topology_device_list, list) {
@@ -1111,6 +1126,13 @@ static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu)
 		if (!dev->gpu && (dev->node_props.simd_count > 0)) {
 			dev->gpu = gpu;
 			out_dev = dev;
+
+			list_for_each_entry(mem, &dev->mem_props, list)
+				mem->gpu = dev->gpu;
+			list_for_each_entry(cache, &dev->cache_props, list)
+				cache->gpu = dev->gpu;
+			list_for_each_entry(iolink, &dev->io_link_props, list)
+				iolink->gpu = dev->gpu;
 			break;
 		}
 	}
@@ -1197,6 +1219,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
 	void *crat_image = NULL;
 	size_t image_size = 0;
 	int proximity_domain;
+	struct amdgpu_ras *ctx;
 
 	INIT_LIST_HEAD(&temp_topology_device_list);
 
@@ -1265,13 +1288,16 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
 	 */
 
 	amdgpu_amdkfd_get_cu_info(dev->gpu->kgd, &cu_info);
+
+	strncpy(dev->node_props.name, gpu->device_info->asic_name,
+			KFD_TOPOLOGY_PUBLIC_NAME_SIZE);
+
 	dev->node_props.simd_arrays_per_engine =
 		cu_info.num_shader_arrays_per_engine;
 
 	dev->node_props.vendor_id = gpu->pdev->vendor;
 	dev->node_props.device_id = gpu->pdev->device;
-	dev->node_props.location_id = PCI_DEVID(gpu->pdev->bus->number,
-		gpu->pdev->devfn);
+	dev->node_props.location_id = pci_dev_id(gpu->pdev);
 	dev->node_props.max_engine_clk_fcompute =
 		amdgpu_amdkfd_get_max_engine_clock_in_mhz(dev->gpu->kgd);
 	dev->node_props.max_engine_clk_ccompute =
@@ -1280,6 +1306,12 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
 		gpu->shared_resources.drm_render_minor;
 
 	dev->node_props.hive_id = gpu->hive_id;
+	dev->node_props.num_sdma_engines = gpu->device_info->num_sdma_engines;
+	dev->node_props.num_sdma_xgmi_engines =
+				gpu->device_info->num_xgmi_sdma_engines;
+	dev->node_props.num_gws = (hws_gws_support &&
+		dev->gpu->dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) ?
+		amdgpu_amdkfd_get_num_gws(dev->gpu->kgd) : 0;
 
 	kfd_fill_mem_clk_max_info(dev);
 	kfd_fill_iolink_non_crat_info(dev);
@@ -1297,6 +1329,7 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
 	case CHIP_POLARIS10:
 	case CHIP_POLARIS11:
 	case CHIP_POLARIS12:
+	case CHIP_VEGAM:
 		pr_debug("Adding doorbell packet type capability\n");
 		dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_1_0 <<
 			HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
@@ -1306,6 +1339,11 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
 	case CHIP_VEGA12:
 	case CHIP_VEGA20:
 	case CHIP_RAVEN:
+	case CHIP_RENOIR:
+	case CHIP_ARCTURUS:
+	case CHIP_NAVI10:
+	case CHIP_NAVI12:
+	case CHIP_NAVI14:
 		dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 <<
 			HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
 			HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
@@ -1315,17 +1353,38 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
 		     dev->gpu->device_info->asic_family);
 	}
 
+	/*
+	* Overwrite ATS capability according to needs_iommu_device to fix
+	* potential missing corresponding bit in CRAT of BIOS.
+	*/
+	if (dev->gpu->device_info->needs_iommu_device)
+		dev->node_props.capability |= HSA_CAP_ATS_PRESENT;
+	else
+		dev->node_props.capability &= ~HSA_CAP_ATS_PRESENT;
+
 	/* Fix errors in CZ CRAT.
 	 * simd_count: Carrizo CRAT reports wrong simd_count, probably
 	 *		because it doesn't consider masked out CUs
 	 * max_waves_per_simd: Carrizo reports wrong max_waves_per_simd
-	 * capability flag: Carrizo CRAT doesn't report IOMMU flags
 	 */
 	if (dev->gpu->device_info->asic_family == CHIP_CARRIZO) {
 		dev->node_props.simd_count =
 			cu_info.simd_per_cu * cu_info.cu_active_number;
 		dev->node_props.max_waves_per_simd = 10;
-		dev->node_props.capability |= HSA_CAP_ATS_PRESENT;
+	}
+
+	ctx = amdgpu_ras_get_context((struct amdgpu_device *)(dev->gpu->kgd));
+	if (ctx) {
+		/* kfd only concerns sram ecc on GFX/SDMA and HBM ecc on UMC */
+		dev->node_props.capability |=
+			(((ctx->features & BIT(AMDGPU_RAS_BLOCK__SDMA)) != 0) ||
+			 ((ctx->features & BIT(AMDGPU_RAS_BLOCK__GFX)) != 0)) ?
+			HSA_CAP_SRAM_EDCSUPPORTED : 0;
+		dev->node_props.capability |= ((ctx->features & BIT(AMDGPU_RAS_BLOCK__UMC)) != 0) ?
+			HSA_CAP_MEM_EDCSUPPORTED : 0;
+
+		dev->node_props.capability |= (ctx->features != 0) ?
+			HSA_CAP_RASEVENTNOTIFY : 0;
 	}
 
 	kfd_debug_print_topology();
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
index 92a19be07344..15843e0fc756 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
@@ -27,7 +27,7 @@
 #include <linux/list.h>
 #include "kfd_crat.h"
 
-#define KFD_TOPOLOGY_PUBLIC_NAME_SIZE 128
+#define KFD_TOPOLOGY_PUBLIC_NAME_SIZE 32
 
 #define HSA_CAP_HOT_PLUGGABLE			0x00000001
 #define HSA_CAP_ATS_PRESENT			0x00000002
@@ -48,6 +48,10 @@
 #define HSA_CAP_DOORBELL_TYPE_2_0		0x2
 #define HSA_CAP_AQL_QUEUE_DOUBLE_MAP		0x00004000
 
+#define HSA_CAP_SRAM_EDCSUPPORTED		0x00080000
+#define HSA_CAP_MEM_EDCSUPPORTED		0x00100000
+#define HSA_CAP_RASEVENTNOTIFY			0x00200000
+
 struct kfd_node_properties {
 	uint64_t hive_id;
 	uint32_t cpu_cores_count;
@@ -61,6 +65,7 @@ struct kfd_node_properties {
 	uint32_t max_waves_per_simd;
 	uint32_t lds_size_in_kb;
 	uint32_t gds_size_in_kb;
+	uint32_t num_gws;
 	uint32_t wave_front_size;
 	uint32_t array_count;
 	uint32_t simd_arrays_per_engine;
@@ -74,7 +79,9 @@ struct kfd_node_properties {
 	uint32_t max_engine_clk_fcompute;
 	uint32_t max_engine_clk_ccompute;
 	int32_t  drm_render_minor;
-	uint16_t marketing_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE];
+	uint32_t num_sdma_engines;
+	uint32_t num_sdma_xgmi_engines;
+	char name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE];
 };
 
 #define HSA_MEM_HEAP_TYPE_SYSTEM	0
@@ -95,6 +102,7 @@ struct kfd_mem_properties {
 	uint32_t		flags;
 	uint32_t		width;
 	uint32_t		mem_clk_max;
+	struct kfd_dev		*gpu;
 	struct kobject		*kobj;
 	struct attribute	attr;
 };
@@ -116,6 +124,7 @@ struct kfd_cache_properties {
 	uint32_t		cache_latency;
 	uint32_t		cache_type;
 	uint8_t			sibling_map[CRAT_SIBLINGMAP_SIZE];
+	struct kfd_dev		*gpu;
 	struct kobject		*kobj;
 	struct attribute	attr;
 };
@@ -134,6 +143,7 @@ struct kfd_iolink_properties {
 	uint32_t		max_bandwidth;
 	uint32_t		rec_transfer_size;
 	uint32_t		flags;
+	struct kfd_dev		*gpu;
 	struct kobject		*kobj;
 	struct attribute	attr;
 };