145 files changed, 8120 insertions(+), 5400 deletions(-)
diff --git a/Documentation/gpu/drm-kms-helpers.rst b/Documentation/gpu/drm-kms-helpers.rst index 3ea622876b67..e37557b30f62 100644 --- a/Documentation/gpu/drm-kms-helpers.rst +++ b/Documentation/gpu/drm-kms-helpers.rst @@ -74,15 +74,6 @@ Helper Functions Reference .. kernel-doc:: drivers/gpu/drm/drm_atomic_helper.c :export: -Legacy CRTC/Modeset Helper Functions Reference -============================================== - -.. kernel-doc:: drivers/gpu/drm/drm_crtc_helper.c - :doc: overview - -.. kernel-doc:: drivers/gpu/drm/drm_crtc_helper.c - :export: - Simple KMS Helper Reference =========================== @@ -282,15 +273,6 @@ Flip-work Helper Reference .. kernel-doc:: drivers/gpu/drm/drm_flip_work.c :export: -Plane Helper Reference -====================== - -.. kernel-doc:: drivers/gpu/drm/drm_plane_helper.c - :doc: overview - -.. kernel-doc:: drivers/gpu/drm/drm_plane_helper.c - :export: - Auxiliary Modeset Helpers ========================= @@ -308,3 +290,21 @@ Framebuffer GEM Helper Reference .. kernel-doc:: drivers/gpu/drm/drm_gem_framebuffer_helper.c :export: + +Legacy Plane Helper Reference +============================= + +.. kernel-doc:: drivers/gpu/drm/drm_plane_helper.c + :doc: overview + +.. kernel-doc:: drivers/gpu/drm/drm_plane_helper.c + :export: + +Legacy CRTC/Modeset Helper Functions Reference +============================================== + +.. kernel-doc:: drivers/gpu/drm/drm_crtc_helper.c + :doc: overview + +.. kernel-doc:: drivers/gpu/drm/drm_crtc_helper.c + :export: diff --git a/Documentation/gpu/drm-kms.rst b/Documentation/gpu/drm-kms.rst index 307284125d7a..2dcf5b42015d 100644 --- a/Documentation/gpu/drm-kms.rst +++ b/Documentation/gpu/drm-kms.rst @@ -263,14 +263,20 @@ Taken all together there's two consequences for the atomic design: - An atomic update is assembled and validated as an entirely free-standing pile of structures within the :c:type:`drm_atomic_state <drm_atomic_state>` - container. Again drivers can subclass that container for their own state - structure tracking needs. Only when a state is committed is it applied to the - driver and modeset objects. This way rolling back an update boils down to - releasing memory and unreferencing objects like framebuffers. + container. Driver private state structures are also tracked in the same + structure; see the next chapter. Only when a state is committed is it applied + to the driver and modeset objects. This way rolling back an update boils down + to releasing memory and unreferencing objects like framebuffers. Read on in this chapter, and also in :ref:`drm_atomic_helper` for more detailed coverage of specific topics. +Handling Driver Private State +----------------------------- + +.. kernel-doc:: drivers/gpu/drm/drm_atomic.c + :doc: handling driver private state + Atomic Mode Setting Function Reference -------------------------------------- diff --git a/Documentation/gpu/todo.rst b/Documentation/gpu/todo.rst index f421a54527d2..1e593370f64f 100644 --- a/Documentation/gpu/todo.rst +++ b/Documentation/gpu/todo.rst @@ -194,6 +194,24 @@ drm_mode_config_helper_suspend/resume(). 
Contact: Maintainer of the driver you plan to convert +Convert drivers to use drm_fb_helper_fbdev_setup/teardown() +----------------------------------------------------------- + +Most drivers can use drm_fb_helper_fbdev_setup() except maybe: + +- amdgpu which has special logic to decide whether to call + drm_helper_disable_unused_functions() + +- armada which isn't atomic and doesn't call + drm_helper_disable_unused_functions() + +- i915 which calls drm_fb_helper_initial_config() in a worker + +Drivers that use drm_framebuffer_remove() to clean up the fbdev framebuffer can +probably use drm_fb_helper_fbdev_teardown(). + +Contact: Maintainer of the driver you plan to convert + Core refactorings ================= diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 3cbb2c78a9df..bae0d32e327b 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -528,6 +528,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_SKL_IDS(&gen9_early_ops), INTEL_BXT_IDS(&gen9_early_ops), INTEL_KBL_IDS(&gen9_early_ops), + INTEL_CFL_IDS(&gen9_early_ops), INTEL_GLK_IDS(&gen9_early_ops), INTEL_CNL_IDS(&gen9_early_ops), }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 642bea2c9b3a..d5a2eefd6c3e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -959,6 +959,7 @@ struct amdgpu_gfx_config { }; struct amdgpu_cu_info { + uint32_t simd_per_cu; uint32_t max_waves_per_simd; uint32_t wave_front_size; uint32_t max_scratch_slots_per_cu; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 896b16db58aa..335e454e2ee1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -275,14 +275,34 @@ void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj) kfree(mem); } -uint64_t get_vmem_size(struct kgd_dev *kgd) +void get_local_mem_info(struct kgd_dev *kgd, + struct kfd_local_mem_info *mem_info) { - struct amdgpu_device *adev = - (struct amdgpu_device *)kgd; + struct amdgpu_device *adev = (struct amdgpu_device *)kgd; + uint64_t address_mask = adev->dev->dma_mask ? 
~*adev->dev->dma_mask : + ~((1ULL << 32) - 1); + resource_size_t aper_limit = adev->mc.aper_base + adev->mc.aper_size; + + memset(mem_info, 0, sizeof(*mem_info)); + if (!(adev->mc.aper_base & address_mask || aper_limit & address_mask)) { + mem_info->local_mem_size_public = adev->mc.visible_vram_size; + mem_info->local_mem_size_private = adev->mc.real_vram_size - + adev->mc.visible_vram_size; + } else { + mem_info->local_mem_size_public = 0; + mem_info->local_mem_size_private = adev->mc.real_vram_size; + } + mem_info->vram_width = adev->mc.vram_width; - BUG_ON(kgd == NULL); + pr_debug("Address base: 0x%llx limit 0x%llx public 0x%llx private 0x%llx\n", + adev->mc.aper_base, aper_limit, + mem_info->local_mem_size_public, + mem_info->local_mem_size_private); - return adev->mc.real_vram_size; + if (amdgpu_sriov_vf(adev)) + mem_info->mem_clk_max = adev->clock.default_mclk / 100; + else + mem_info->mem_clk_max = amdgpu_dpm_get_mclk(adev, false) / 100; } uint64_t get_gpu_clock_counter(struct kgd_dev *kgd) @@ -298,6 +318,39 @@ uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd) { struct amdgpu_device *adev = (struct amdgpu_device *)kgd; - /* The sclk is in quantas of 10kHz */ - return adev->pm.dpm.dyn_state.max_clock_voltage_on_ac.sclk / 100; + /* the sclk is in quantas of 10kHz */ + if (amdgpu_sriov_vf(adev)) + return adev->clock.default_sclk / 100; + + return amdgpu_dpm_get_sclk(adev, false) / 100; +} + +void get_cu_info(struct kgd_dev *kgd, struct kfd_cu_info *cu_info) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)kgd; + struct amdgpu_cu_info acu_info = adev->gfx.cu_info; + + memset(cu_info, 0, sizeof(*cu_info)); + if (sizeof(cu_info->cu_bitmap) != sizeof(acu_info.bitmap)) + return; + + cu_info->cu_active_number = acu_info.number; + cu_info->cu_ao_mask = acu_info.ao_cu_mask; + memcpy(&cu_info->cu_bitmap[0], &acu_info.bitmap[0], + sizeof(acu_info.bitmap)); + cu_info->num_shader_engines = adev->gfx.config.max_shader_engines; + cu_info->num_shader_arrays_per_engine = adev->gfx.config.max_sh_per_se; + cu_info->num_cu_per_sh = adev->gfx.config.max_cu_per_sh; + cu_info->simd_per_cu = acu_info.simd_per_cu; + cu_info->max_waves_per_simd = acu_info.max_waves_per_simd; + cu_info->wave_front_size = acu_info.wave_front_size; + cu_info->max_scratch_slots_per_cu = acu_info.max_scratch_slots_per_cu; + cu_info->lds_size = acu_info.lds_size; +} + +uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)kgd; + + return amdgpu_vram_mgr_usage(&adev->mman.bdev.man[TTM_PL_VRAM]); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 8d689ab7e429..2a519f9062ee 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -56,10 +56,13 @@ int alloc_gtt_mem(struct kgd_dev *kgd, size_t size, void **mem_obj, uint64_t *gpu_addr, void **cpu_ptr); void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj); -uint64_t get_vmem_size(struct kgd_dev *kgd); +void get_local_mem_info(struct kgd_dev *kgd, + struct kfd_local_mem_info *mem_info); uint64_t get_gpu_clock_counter(struct kgd_dev *kgd); uint32_t get_max_engine_clock_in_mhz(struct kgd_dev *kgd); +void get_cu_info(struct kgd_dev *kgd, struct kfd_cu_info *cu_info); +uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd); #define read_user_wptr(mmptr, wptr, dst) \ ({ \ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c index 
1ae149456c9f..a9e6aea0e5f8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c @@ -105,7 +105,14 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, uint32_t queue_id, uint32_t __user *wptr, uint32_t wptr_shift, uint32_t wptr_mask, struct mm_struct *mm); -static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd); +static int kgd_hqd_dump(struct kgd_dev *kgd, + uint32_t pipe_id, uint32_t queue_id, + uint32_t (**dump)[2], uint32_t *n_regs); +static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, + uint32_t __user *wptr, struct mm_struct *mm); +static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, + uint32_t engine_id, uint32_t queue_id, + uint32_t (**dump)[2], uint32_t *n_regs); static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, uint32_t pipe_id, uint32_t queue_id); @@ -166,7 +173,7 @@ static int get_tile_config(struct kgd_dev *kgd, static const struct kfd2kgd_calls kfd2kgd = { .init_gtt_mem_allocation = alloc_gtt_mem, .free_gtt_mem = free_gtt_mem, - .get_vmem_size = get_vmem_size, + .get_local_mem_info = get_local_mem_info, .get_gpu_clock_counter = get_gpu_clock_counter, .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, .alloc_pasid = amdgpu_pasid_alloc, @@ -177,6 +184,8 @@ static const struct kfd2kgd_calls kfd2kgd = { .init_interrupts = kgd_init_interrupts, .hqd_load = kgd_hqd_load, .hqd_sdma_load = kgd_hqd_sdma_load, + .hqd_dump = kgd_hqd_dump, + .hqd_sdma_dump = kgd_hqd_sdma_dump, .hqd_is_occupied = kgd_hqd_is_occupied, .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, .hqd_destroy = kgd_hqd_destroy, @@ -191,6 +200,8 @@ static const struct kfd2kgd_calls kfd2kgd = { .get_fw_version = get_fw_version, .set_scratch_backing_va = set_scratch_backing_va, .get_tile_config = get_tile_config, + .get_cu_info = get_cu_info, + .get_vram_usage = amdgpu_amdkfd_get_vram_usage }; struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void) @@ -375,7 +386,44 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, return 0; } -static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd) +static int kgd_hqd_dump(struct kgd_dev *kgd, + uint32_t pipe_id, uint32_t queue_id, + uint32_t (**dump)[2], uint32_t *n_regs) +{ + struct amdgpu_device *adev = get_amdgpu_device(kgd); + uint32_t i = 0, reg; +#define HQD_N_REGS (35+4) +#define DUMP_REG(addr) do { \ + if (WARN_ON_ONCE(i >= HQD_N_REGS)) \ + break; \ + (*dump)[i][0] = (addr) << 2; \ + (*dump)[i++][1] = RREG32(addr); \ + } while (0) + + *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); + if (*dump == NULL) + return -ENOMEM; + + acquire_queue(kgd, pipe_id, queue_id); + + DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE0); + DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE1); + DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE2); + DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE3); + + for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_MQD_CONTROL; reg++) + DUMP_REG(reg); + + release_queue(kgd); + + WARN_ON_ONCE(i != HQD_N_REGS); + *n_regs = i; + + return 0; +} + +static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, + uint32_t __user *wptr, struct mm_struct *mm) { struct amdgpu_device *adev = get_amdgpu_device(kgd); struct cik_sdma_rlc_registers *m; @@ -410,10 +458,17 @@ static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd) WREG32(mmSDMA0_GFX_CONTEXT_CNTL, data); } - WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, - m->sdma_rlc_doorbell); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, 0); - WREG32(sdma_base_addr + 
mmSDMA0_RLC0_RB_WPTR, 0); + data = REG_SET_FIELD(m->sdma_rlc_doorbell, SDMA0_RLC0_DOORBELL, + ENABLE, 1); + WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdma_rlc_rb_rptr); + + if (read_user_wptr(mm, wptr, data)) + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, data); + else + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, + m->sdma_rlc_rb_rptr); + WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR, m->sdma_rlc_virtual_addr); WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdma_rlc_rb_base); @@ -423,8 +478,37 @@ static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd) m->sdma_rlc_rb_rptr_addr_lo); WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, m->sdma_rlc_rb_rptr_addr_hi); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, - m->sdma_rlc_rb_cntl); + + data = REG_SET_FIELD(m->sdma_rlc_rb_cntl, SDMA0_RLC0_RB_CNTL, + RB_ENABLE, 1); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data); + + return 0; +} + +static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, + uint32_t engine_id, uint32_t queue_id, + uint32_t (**dump)[2], uint32_t *n_regs) +{ + struct amdgpu_device *adev = get_amdgpu_device(kgd); + uint32_t sdma_offset = engine_id * SDMA1_REGISTER_OFFSET + + queue_id * KFD_CIK_SDMA_QUEUE_OFFSET; + uint32_t i = 0, reg; +#undef HQD_N_REGS +#define HQD_N_REGS (19+4) + + *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); + if (*dump == NULL) + return -ENOMEM; + + for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++) + DUMP_REG(sdma_offset + reg); + for (reg = mmSDMA0_RLC0_VIRTUAL_ADDR; reg <= mmSDMA0_RLC0_WATERMARK; + reg++) + DUMP_REG(sdma_offset + reg); + + WARN_ON_ONCE(i != HQD_N_REGS); + *n_regs = i; return 0; } @@ -575,7 +659,7 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, struct cik_sdma_rlc_registers *m; uint32_t sdma_base_addr; uint32_t temp; - int timeout = utimeout; + unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies; m = get_sdma_mqd(mqd); sdma_base_addr = get_sdma_base_addr(m); @@ -588,10 +672,9 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); if (temp & SDMA0_STATUS_REG__RB_CMD_IDLE__SHIFT) break; - if (timeout <= 0) + if (time_after(jiffies, end_jiffies)) return -ETIME; - msleep(20); - timeout -= 20; + usleep_range(500, 1000); } WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0); @@ -599,6 +682,8 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) | SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); + m->sdma_rlc_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR); + return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c index e9b436bc8dcb..b127259d7d85 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c @@ -45,7 +45,7 @@ enum hqd_dequeue_request_type { RESET_WAVES }; -struct cik_sdma_rlc_registers; +struct vi_sdma_mqd; /* * Register access functions @@ -64,7 +64,14 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, uint32_t queue_id, uint32_t __user *wptr, uint32_t wptr_shift, uint32_t wptr_mask, struct mm_struct *mm); -static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd); +static int kgd_hqd_dump(struct kgd_dev *kgd, + uint32_t pipe_id, uint32_t queue_id, + uint32_t (**dump)[2], uint32_t *n_regs); +static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, + uint32_t 
__user *wptr, struct mm_struct *mm); +static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, + uint32_t engine_id, uint32_t queue_id, + uint32_t (**dump)[2], uint32_t *n_regs); static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, uint32_t pipe_id, uint32_t queue_id); static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); @@ -125,7 +132,7 @@ static int get_tile_config(struct kgd_dev *kgd, static const struct kfd2kgd_calls kfd2kgd = { .init_gtt_mem_allocation = alloc_gtt_mem, .free_gtt_mem = free_gtt_mem, - .get_vmem_size = get_vmem_size, + .get_local_mem_info = get_local_mem_info, .get_gpu_clock_counter = get_gpu_clock_counter, .get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz, .alloc_pasid = amdgpu_pasid_alloc, @@ -136,6 +143,8 @@ static const struct kfd2kgd_calls kfd2kgd = { .init_interrupts = kgd_init_interrupts, .hqd_load = kgd_hqd_load, .hqd_sdma_load = kgd_hqd_sdma_load, + .hqd_dump = kgd_hqd_dump, + .hqd_sdma_dump = kgd_hqd_sdma_dump, .hqd_is_occupied = kgd_hqd_is_occupied, .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, .hqd_destroy = kgd_hqd_destroy, @@ -152,6 +161,8 @@ static const struct kfd2kgd_calls kfd2kgd = { .get_fw_version = get_fw_version, .set_scratch_backing_va = set_scratch_backing_va, .get_tile_config = get_tile_config, + .get_cu_info = get_cu_info, + .get_vram_usage = amdgpu_amdkfd_get_vram_usage }; struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void) @@ -268,9 +279,15 @@ static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) return 0; } -static inline uint32_t get_sdma_base_addr(struct cik_sdma_rlc_registers *m) +static inline uint32_t get_sdma_base_addr(struct vi_sdma_mqd *m) { - return 0; + uint32_t retval; + + retval = m->sdma_engine_id * SDMA1_REGISTER_OFFSET + + m->sdma_queue_id * KFD_VI_SDMA_QUEUE_OFFSET; + pr_debug("kfd: sdma base address: 0x%x\n", retval); + + return retval; } static inline struct vi_mqd *get_mqd(void *mqd) @@ -278,9 +295,9 @@ static inline struct vi_mqd *get_mqd(void *mqd) return (struct vi_mqd *)mqd; } -static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) +static inline struct vi_sdma_mqd *get_sdma_mqd(void *mqd) { - return (struct cik_sdma_rlc_registers *)mqd; + return (struct vi_sdma_mqd *)mqd; } static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, @@ -358,8 +375,138 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, return 0; } -static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd) +static int kgd_hqd_dump(struct kgd_dev *kgd, + uint32_t pipe_id, uint32_t queue_id, + uint32_t (**dump)[2], uint32_t *n_regs) { + struct amdgpu_device *adev = get_amdgpu_device(kgd); + uint32_t i = 0, reg; +#define HQD_N_REGS (54+4) +#define DUMP_REG(addr) do { \ + if (WARN_ON_ONCE(i >= HQD_N_REGS)) \ + break; \ + (*dump)[i][0] = (addr) << 2; \ + (*dump)[i++][1] = RREG32(addr); \ + } while (0) + + *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); + if (*dump == NULL) + return -ENOMEM; + + acquire_queue(kgd, pipe_id, queue_id); + + DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE0); + DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE1); + DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE2); + DUMP_REG(mmCOMPUTE_STATIC_THREAD_MGMT_SE3); + + for (reg = mmCP_MQD_BASE_ADDR; reg <= mmCP_HQD_EOP_DONES; reg++) + DUMP_REG(reg); + + release_queue(kgd); + + WARN_ON_ONCE(i != HQD_N_REGS); + *n_regs = i; + + return 0; +} + +static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, + uint32_t __user *wptr, struct mm_struct *mm) +{ + struct 
amdgpu_device *adev = get_amdgpu_device(kgd); + struct vi_sdma_mqd *m; + unsigned long end_jiffies; + uint32_t sdma_base_addr; + uint32_t data; + + m = get_sdma_mqd(mqd); + sdma_base_addr = get_sdma_base_addr(m); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, + m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)); + + end_jiffies = msecs_to_jiffies(2000) + jiffies; + while (true) { + data = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); + if (data & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) + break; + if (time_after(jiffies, end_jiffies)) + return -ETIME; + usleep_range(500, 1000); + } + if (m->sdma_engine_id) { + data = RREG32(mmSDMA1_GFX_CONTEXT_CNTL); + data = REG_SET_FIELD(data, SDMA1_GFX_CONTEXT_CNTL, + RESUME_CTX, 0); + WREG32(mmSDMA1_GFX_CONTEXT_CNTL, data); + } else { + data = RREG32(mmSDMA0_GFX_CONTEXT_CNTL); + data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL, + RESUME_CTX, 0); + WREG32(mmSDMA0_GFX_CONTEXT_CNTL, data); + } + + data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL, + ENABLE, 1); + WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdmax_rlcx_rb_rptr); + + if (read_user_wptr(mm, wptr, data)) + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, data); + else + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, + m->sdmax_rlcx_rb_rptr); + + WREG32(sdma_base_addr + mmSDMA0_RLC0_VIRTUAL_ADDR, + m->sdmax_rlcx_virtual_addr); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, + m->sdmax_rlcx_rb_base_hi); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, + m->sdmax_rlcx_rb_rptr_addr_lo); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, + m->sdmax_rlcx_rb_rptr_addr_hi); + + data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL, + RB_ENABLE, 1); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data); + + return 0; +} + +static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, + uint32_t engine_id, uint32_t queue_id, + uint32_t (**dump)[2], uint32_t *n_regs) +{ + struct amdgpu_device *adev = get_amdgpu_device(kgd); + uint32_t sdma_offset = engine_id * SDMA1_REGISTER_OFFSET + + queue_id * KFD_VI_SDMA_QUEUE_OFFSET; + uint32_t i = 0, reg; +#undef HQD_N_REGS +#define HQD_N_REGS (19+4+2+3+7) + + *dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL); + if (*dump == NULL) + return -ENOMEM; + + for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++) + DUMP_REG(sdma_offset + reg); + for (reg = mmSDMA0_RLC0_VIRTUAL_ADDR; reg <= mmSDMA0_RLC0_WATERMARK; + reg++) + DUMP_REG(sdma_offset + reg); + for (reg = mmSDMA0_RLC0_CSA_ADDR_LO; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; + reg++) + DUMP_REG(sdma_offset + reg); + for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN; reg <= mmSDMA0_RLC0_DUMMY_REG; + reg++) + DUMP_REG(sdma_offset + reg); + for (reg = mmSDMA0_RLC0_MIDCMD_DATA0; reg <= mmSDMA0_RLC0_MIDCMD_CNTL; + reg++) + DUMP_REG(sdma_offset + reg); + + WARN_ON_ONCE(i != HQD_N_REGS); + *n_regs = i; + return 0; } @@ -388,7 +535,7 @@ static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) { struct amdgpu_device *adev = get_amdgpu_device(kgd); - struct cik_sdma_rlc_registers *m; + struct vi_sdma_mqd *m; uint32_t sdma_base_addr; uint32_t sdma_rlc_rb_cntl; @@ -509,10 +656,10 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, unsigned int utimeout) { struct amdgpu_device *adev = get_amdgpu_device(kgd); - struct cik_sdma_rlc_registers *m; + 
struct vi_sdma_mqd *m; uint32_t sdma_base_addr; uint32_t temp; - int timeout = utimeout; + unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies; m = get_sdma_mqd(mqd); sdma_base_addr = get_sdma_base_addr(m); @@ -523,18 +670,19 @@ static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, while (true) { temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); - if (temp & SDMA0_STATUS_REG__RB_CMD_IDLE__SHIFT) + if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) break; - if (timeout <= 0) + if (time_after(jiffies, end_jiffies)) return -ETIME; - msleep(20); - timeout -= 20; + usleep_range(500, 1000); } WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, 0); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, 0); - WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, 0); + WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, + RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) | + SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); + + m->sdmax_rlcx_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR); return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/cikd.h b/drivers/gpu/drm/amd/amdgpu/cikd.h index 6a9e38a3d2a0..cee6e8a3ad9c 100644 --- a/drivers/gpu/drm/amd/amdgpu/cikd.h +++ b/drivers/gpu/drm/amd/amdgpu/cikd.h @@ -562,7 +562,7 @@ #define PRIVATE_BASE(x) ((x) << 0) /* scratch */ #define SHARED_BASE(x) ((x) << 16) /* LDS */ -#define KFD_CIK_SDMA_QUEUE_OFFSET 0x200 +#define KFD_CIK_SDMA_QUEUE_OFFSET (mmSDMA0_RLC1_RB_CNTL - mmSDMA0_RLC0_RB_CNTL) /* valid for both DEFAULT_MTYPE and APE1_MTYPE */ enum { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c index 4b5109bfd5f4..a066c5eda135 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c @@ -48,6 +48,8 @@ #include "oss/oss_2_0_d.h" #include "oss/oss_2_0_sh_mask.h" +#define NUM_SIMD_PER_CU 0x4 /* missing from the gfx_7 IP headers */ + #define GFX7_NUM_GFX_RINGS 1 #define GFX7_MEC_HPD_SIZE 2048 @@ -5277,6 +5279,11 @@ static void gfx_v7_0_get_cu_info(struct amdgpu_device *adev) cu_info->number = active_cu_number; cu_info->ao_cu_mask = ao_cu_mask; + cu_info->simd_per_cu = NUM_SIMD_PER_CU; + cu_info->max_waves_per_simd = 10; + cu_info->max_scratch_slots_per_cu = 32; + cu_info->wave_front_size = 64; + cu_info->lds_size = 64; } const struct amdgpu_ip_block_version gfx_v7_0_ip_block = diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index ff9f1a82630f..4e694ae9f308 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -7116,6 +7116,11 @@ static void gfx_v8_0_get_cu_info(struct amdgpu_device *adev) cu_info->number = active_cu_number; cu_info->ao_cu_mask = ao_cu_mask; + cu_info->simd_per_cu = NUM_SIMD_PER_CU; + cu_info->max_waves_per_simd = 10; + cu_info->max_scratch_slots_per_cu = 32; + cu_info->wave_front_size = 64; + cu_info->lds_size = 64; } const struct amdgpu_ip_block_version gfx_v8_0_ip_block = diff --git a/drivers/gpu/drm/amd/amdgpu/vid.h b/drivers/gpu/drm/amd/amdgpu/vid.h index dbf3703cbd1b..19ddd2312e00 100644 --- a/drivers/gpu/drm/amd/amdgpu/vid.h +++ b/drivers/gpu/drm/amd/amdgpu/vid.h @@ -27,6 +27,8 @@ #define SDMA1_REGISTER_OFFSET 0x200 /* not a register */ #define SDMA_MAX_INSTANCE 2 +#define KFD_VI_SDMA_QUEUE_OFFSET 0x80 /* not a register */ + /* crtc instance offsets */ #define CRTC0_REGISTER_OFFSET (0x1b9c - 0x1b9c) #define CRTC1_REGISTER_OFFSET (0x1d9c - 0x1b9c) diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile 
index 342c2d937b17..a317e76ffb5e 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -35,6 +35,8 @@ amdkfd-y := kfd_module.o kfd_device.o kfd_chardev.o kfd_topology.o \ kfd_process_queue_manager.o kfd_device_queue_manager.o \ kfd_device_queue_manager_cik.o kfd_device_queue_manager_vi.o \ kfd_interrupt.o kfd_events.o cik_event_interrupt.o \ - kfd_dbgdev.o kfd_dbgmgr.o + kfd_dbgdev.o kfd_dbgmgr.o kfd_crat.o + +amdkfd-$(CONFIG_DEBUG_FS) += kfd_debugfs.o obj-$(CONFIG_HSA_AMD) += amdkfd.o diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm new file mode 100644 index 000000000000..997a383dcb8b --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm @@ -0,0 +1,1384 @@ +/* + * Copyright 2015-2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#if 0 +HW (VI) source code for CWSR trap handler +#Version 18 + multiple trap handler + +// this performance-optimal version was originally from Seven Xu at SRDC + +// Revison #18 --... +/* Rev History +** #1. Branch from gc dv. //gfxip/gfx8/main/src/test/suites/block/cs/sr/cs_trap_handler.sp3#1,#50, #51, #52-53(Skip, Already Fixed by PV), #54-56(merged),#57-58(mergerd, skiped-already fixed by PV) +** #4. SR Memory Layout: +** 1. VGPR-SGPR-HWREG-{LDS} +** 2. tba_hi.bits.26 - reconfigured as the first wave in tg bits, for defer Save LDS for a threadgroup.. performance concern.. +** #5. Update: 1. Accurate g8sr_ts_save_d timestamp +** #6. Update: 1. Fix s_barrier usage; 2. VGPR s/r using swizzle buffer?(NoNeed, already matched the swizzle pattern, more investigation) +** #7. Update: 1. don't barrier if noLDS +** #8. Branch: 1. Branch to ver#0, which is very similar to gc dv version +** 2. Fix SQ issue by s_sleep 2 +** #9. Update: 1. Fix scc restore failed issue, restore wave_status at last +** 2. optimize s_buffer save by burst 16sgprs... +** #10. Update 1. Optimize restore sgpr by busrt 16 sgprs. +** #11. Update 1. Add 2 more timestamp for debug version +** #12. Update 1. Add VGPR SR using DWx4, some case improve and some case drop performance +** #13. Integ 1. Always use MUBUF for PV trap shader... +** #14. Update 1. s_buffer_store soft clause... +** #15. Update 1. PERF - sclar write with glc:0/mtype0 to allow L2 combine. perf improvement a lot. +** #16. Update 1. PRRF - UNROLL LDS_DMA got 2500cycle save in IP tree +** #17. Update 1. 
FUNC - LDS_DMA has issues while ATC, replace with ds_read/buffer_store for save part[TODO restore part] +** 2. PERF - Save LDS before save VGPR to cover LDS save long latency... +** #18. Update 1. FUNC - Implicitly estore STATUS.VCCZ, which is not writable by s_setreg_b32 +** 2. FUNC - Handle non-CWSR traps +*/ + +var G8SR_WDMEM_HWREG_OFFSET = 0 +var G8SR_WDMEM_SGPR_OFFSET = 128 // in bytes + +// Keep definition same as the app shader, These 2 time stamps are part of the app shader... Should before any Save and after restore. + +var G8SR_DEBUG_TIMESTAMP = 0 +var G8SR_DEBUG_TS_SAVE_D_OFFSET = 40*4 // ts_save_d timestamp offset relative to SGPR_SR_memory_offset +var s_g8sr_ts_save_s = s[34:35] // save start +var s_g8sr_ts_sq_save_msg = s[36:37] // The save shader send SAVEWAVE msg to spi +var s_g8sr_ts_spi_wrexec = s[38:39] // the SPI write the sr address to SQ +var s_g8sr_ts_save_d = s[40:41] // save end +var s_g8sr_ts_restore_s = s[42:43] // restore start +var s_g8sr_ts_restore_d = s[44:45] // restore end + +var G8SR_VGPR_SR_IN_DWX4 = 0 +var G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 = 0x00100000 // DWx4 stride is 4*4Bytes +var G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 = G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 + + +/*************************************************************************/ +/* control on how to run the shader */ +/*************************************************************************/ +//any hack that needs to be made to run this code in EMU (either because various EMU code are not ready or no compute save & restore in EMU run) +var EMU_RUN_HACK = 0 +var EMU_RUN_HACK_RESTORE_NORMAL = 0 +var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0 +var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0 +var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK +var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK +var EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK +var SAVE_LDS = 1 +var WG_BASE_ADDR_LO = 0x9000a000 +var WG_BASE_ADDR_HI = 0x0 +var WAVE_SPACE = 0x5000 //memory size that each wave occupies in workgroup state mem +var CTX_SAVE_CONTROL = 0x0 +var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL +var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code are not ready or no compute save & restore in RTL run) +var SGPR_SAVE_USE_SQC = 1 //use SQC D$ to do the write +var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //because TC EMU currently asserts on 0 of // overload DFMT field to carry 4 more bits of stride for MUBUF opcodes +var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing + +/**************************************************************************/ +/* variables */ +/**************************************************************************/ +var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 +var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 +var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 + +var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 +var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 +var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8 +var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6 +var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24 +var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits + +var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400 +var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask +var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10 +var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 
0x100 +var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8 +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0 +var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11 +var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21 + +var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME +var SQ_WAVE_IB_STS_RCNT_SIZE = 4 //FIXME +var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME +var SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE = 1 //FIXME +var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME + +var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24 +var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27 + + +/* Save */ +var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes +var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE + +var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit +var S_SAVE_SPI_INIT_ATC_SHIFT = 27 +var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype +var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28 +var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG +var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 + +var S_SAVE_PC_HI_RCNT_SHIFT = 28 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used +var S_SAVE_PC_HI_RCNT_MASK = 0xF0000000 //FIXME +var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 27 //FIXME +var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x08000000 //FIXME + +var s_save_spi_init_lo = exec_lo +var s_save_spi_init_hi = exec_hi + + //tba_lo and tba_hi need to be saved/restored +var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3??h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} +var s_save_pc_hi = ttmp1 +var s_save_exec_lo = ttmp2 +var s_save_exec_hi = ttmp3 +var s_save_status = ttmp4 +var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine +var s_save_xnack_mask_lo = ttmp6 +var s_save_xnack_mask_hi = ttmp7 +var s_save_buf_rsrc0 = ttmp8 +var s_save_buf_rsrc1 = ttmp9 +var s_save_buf_rsrc2 = ttmp10 +var s_save_buf_rsrc3 = ttmp11 + +var s_save_mem_offset = tma_lo +var s_save_alloc_size = s_save_trapsts //conflict +var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time) +var s_save_m0 = tma_hi + +/* Restore */ +var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE +var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC + +var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit +var S_RESTORE_SPI_INIT_ATC_SHIFT = 27 +var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype +var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28 +var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG +var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26 + +var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT +var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK +var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT +var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK + +var s_restore_spi_init_lo = exec_lo +var s_restore_spi_init_hi = exec_hi + +var s_restore_mem_offset = ttmp2 +var s_restore_alloc_size = ttmp3 +var s_restore_tmp = ttmp6 //tba_lo/hi need to be restored +var s_restore_mem_offset_save = s_restore_tmp //no conflict + +var s_restore_m0 = s_restore_alloc_size //no conflict + +var s_restore_mode = ttmp7 + +var s_restore_pc_lo = ttmp0 +var s_restore_pc_hi = ttmp1 +var s_restore_exec_lo = tma_lo 
//no conflict +var s_restore_exec_hi = tma_hi //no conflict +var s_restore_status = ttmp4 +var s_restore_trapsts = ttmp5 +var s_restore_xnack_mask_lo = xnack_mask_lo +var s_restore_xnack_mask_hi = xnack_mask_hi +var s_restore_buf_rsrc0 = ttmp8 +var s_restore_buf_rsrc1 = ttmp9 +var s_restore_buf_rsrc2 = ttmp10 +var s_restore_buf_rsrc3 = ttmp11 + +/**************************************************************************/ +/* trap handler entry points */ +/**************************************************************************/ +/* Shader Main*/ + +shader main + asic(VI) + type(CS) + + + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore + //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC + s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC + s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f. + s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE + //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE + s_branch L_SKIP_RESTORE //NOT restore, SAVE actually + else + s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save + end + +L_JUMP_TO_RESTORE: + s_branch L_RESTORE //restore + +L_SKIP_RESTORE: + + s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC + s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save + s_cbranch_scc1 L_SAVE //this is the operation for save + + // ********* Handle non-CWSR traps ******************* +if (!EMU_RUN_HACK) + /* read tba and tma for next level trap handler, ttmp4 is used as s_save_status */ + s_load_dwordx4 [ttmp8,ttmp9,ttmp10, ttmp11], [tma_lo,tma_hi], 0 + s_waitcnt lgkmcnt(0) + s_or_b32 ttmp7, ttmp8, ttmp9 + s_cbranch_scc0 L_NO_NEXT_TRAP //next level trap handler not been set + s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) + s_setpc_b64 [ttmp8,ttmp9] //jump to next level trap handler + +L_NO_NEXT_TRAP: + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception + s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly. + s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0 + s_addc_u32 ttmp1, ttmp1, 0 +L_EXCP_CASE: + s_and_b32 ttmp1, ttmp1, 0xFFFF + s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //restore HW status(SCC) + s_rfe_b64 [ttmp0, ttmp1] +end + // ********* End handling of non-CWSR traps ******************* + +/**************************************************************************/ +/* save routine */ +/**************************************************************************/ + +L_SAVE: + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_save_s + s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? 
+end + + //check whether there is mem_viol + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK + s_cbranch_scc0 L_NO_PC_REWIND + + //if so, need rewind PC assuming GDS operation gets NACKed + s_mov_b32 s_save_tmp, 0 //clear mem_viol bit + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit + s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] + s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8 + s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc + +L_NO_PC_REWIND: + s_mov_b32 s_save_tmp, 0 //clear saveCtx bit + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit + + s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK + s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi //save XNACK must before any memory operation + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT + s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT + s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY + s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT + s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp + s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS + s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG + + s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp + + /* inform SPI the readiness and wait for SPI's go signal */ + s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI + s_mov_b32 s_save_exec_hi, exec_hi + s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_sq_save_msg + s_waitcnt lgkmcnt(0) +end + + if (EMU_RUN_HACK) + + else + s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC + end + + L_SLEEP: + s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0 + + if (EMU_RUN_HACK) + + else + s_cbranch_execz L_SLEEP + end + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_spi_wrexec + s_waitcnt lgkmcnt(0) +end + + /* setup Resource Contants */ + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE)) + //calculate wd_addr using absolute thread id + v_readlane_b32 s_save_tmp, v9, 0 + s_lshr_b32 s_save_tmp, s_save_tmp, 6 + s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE + s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO + s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI + s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL + else + end + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE)) + s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO + s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI + s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL + else + end + + + s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo + s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE + s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited + s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC + 
s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK + s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK + s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE + + //FIXME right now s_save_m0/s_save_mem_offset use tma_lo/tma_hi (might need to save them before using them?) + s_mov_b32 s_save_m0, m0 //save M0 + + /* global mem offset */ + s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0 + + + + + /* save HW registers */ + ////////////////////////////// + + L_SAVE_HWREG: + // HWREG SR memory offset : size(VGPR)+size(SGPR) + get_vgpr_size_bytes(s_save_mem_offset) + get_sgpr_size_bytes(s_save_tmp) + s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp + + + s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) //M0 + + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME)) + s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 + s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over + s_mov_b32 tba_lo, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_LO + s_mov_b32 tba_hi, EMU_RUN_HACK_SAVE_FIRST_TIME_TBA_HI + end + + write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) //PC + write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset) + write_hwreg_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset) //EXEC + write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) + write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) //STATUS + + //s_save_trapsts conflicts with s_save_alloc_size + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) + write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) //TRAPSTS + + write_hwreg_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_LO + write_hwreg_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset) //XNACK_MASK_HI + + //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2 + s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE + write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) + write_hwreg_to_mem(tba_lo, s_save_buf_rsrc0, s_save_mem_offset) //TBA_LO + write_hwreg_to_mem(tba_hi, s_save_buf_rsrc0, s_save_mem_offset) //TBA_HI + + + + /* the first wave in the threadgroup */ + // save fist_wave bits in tba_hi unused bit.26 + s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK // extract fisrt wave bit + //s_or_b32 tba_hi, s_save_tmp, tba_hi // save first wave bit to tba_hi.bits[26] + s_mov_b32 s_save_exec_hi, 0x0 + s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26] + + + /* save SGPRs */ + // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save... 
+ ////////////////////////////// + + // SGPR SR memory offset : size(VGPR) + get_vgpr_size_bytes(s_save_mem_offset) + // TODO, change RSRC word to rearrange memory layout for SGPRS + + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size + s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) + + if (SGPR_SAVE_USE_SQC) + s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes + else + s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) + end + + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0 + //s_mov_b64 s_save_pc_lo, s_save_buf_rsrc0 + s_mov_b64 s_save_xnack_mask_lo, s_save_buf_rsrc0 + s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset + s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0 + + s_mov_b32 m0, 0x0 //SGPR initial index value =0 + L_SAVE_SGPR_LOOP: + // SGPR is allocated in 16 SGPR granularity + s_movrels_b64 s0, s0 //s0 = s[0+m0], s1 = s[1+m0] + s_movrels_b64 s2, s2 //s2 = s[2+m0], s3 = s[3+m0] + s_movrels_b64 s4, s4 //s4 = s[4+m0], s5 = s[5+m0] + s_movrels_b64 s6, s6 //s6 = s[6+m0], s7 = s[7+m0] + s_movrels_b64 s8, s8 //s8 = s[8+m0], s9 = s[9+m0] + s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] + s_movrels_b64 s12, s12 //s12 = s[12+m0], s13 = s[13+m0] + s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0] + + write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) //PV: the best performance should be using s_buffer_store_dwordx4 + s_add_u32 m0, m0, 16 //next sgpr index + s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete? + // restore s_save_buf_rsrc0,1 + //s_mov_b64 s_save_buf_rsrc0, s_save_pc_lo + s_mov_b64 s_save_buf_rsrc0, s_save_xnack_mask_lo + + + + + /* save first 4 VGPR, then LDS save could use */ + // each wave will alloc 4 vgprs at least... + ///////////////////////////////////////////////////////////////////////////////////// + + s_mov_b32 s_save_mem_offset, 0 + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_mov_b32 exec_hi, 0xFFFFFFFF + + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
+ else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + // VGPR Allocated in 4-GPR granularity + +if G8SR_VGPR_SR_IN_DWX4 + // the const stride for DWx4 is 4*4 bytes + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes + + buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes +else + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 +end + + + + /* save LDS */ + ////////////////////////////// + + L_SAVE_LDS: + + // Change EXEC to all threads... + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_mov_b32 exec_hi, 0xFFFFFFFF + + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size + s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero? + s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE + + s_barrier //LDS is used? wait for other waves in the same TG + //s_and_b32 s_save_tmp, tba_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here + s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here + s_cbranch_scc0 L_SAVE_LDS_DONE + + // first wave do LDS save; + + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes + s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes + + // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) + // + get_vgpr_size_bytes(s_save_mem_offset) + get_sgpr_size_bytes(s_save_tmp) + s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp + s_add_u32 s_save_mem_offset, s_save_mem_offset, get_hwreg_size_bytes() + + + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + s_mov_b32 m0, 0x0 //lds_offset initial value = 0 + + +var LDS_DMA_ENABLE = 0 +var UNROLL = 0 +if UNROLL==0 && LDS_DMA_ENABLE==1 + s_mov_b32 s3, 256*2 + s_nop 0 + s_nop 0 + s_nop 0 + L_SAVE_LDS_LOOP: + //TODO: looks the 2 buffer_store/load clause for s/r will hurt performance.??? + if (SAVE_LDS) //SPI always alloc LDS space in 128DW granularity + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 // first 64DW + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW + end + + s_add_u32 m0, m0, s3 //every buffer_store_lds does 256 bytes + s_add_u32 s_save_mem_offset, s_save_mem_offset, s3 //mem offset increased by 256 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete? 
+ +elsif LDS_DMA_ENABLE==1 && UNROLL==1 // UNROOL , has ichace miss + // store from higest LDS address to lowest + s_mov_b32 s3, 256*2 + s_sub_u32 m0, s_save_alloc_size, s3 + s_add_u32 s_save_mem_offset, s_save_mem_offset, m0 + s_lshr_b32 s_save_alloc_size, s_save_alloc_size, 9 // how many 128 trunks... + s_sub_u32 s_save_alloc_size, 128, s_save_alloc_size // store from higheset addr to lowest + s_mul_i32 s_save_alloc_size, s_save_alloc_size, 6*4 // PC offset increment, each LDS save block cost 6*4 Bytes instruction + s_add_u32 s_save_alloc_size, s_save_alloc_size, 3*4 //2is the below 2 inst...//s_addc and s_setpc + s_nop 0 + s_nop 0 + s_nop 0 //pad 3 dw to let LDS_DMA align with 64Bytes + s_getpc_b64 s[0:1] // reuse s[0:1], since s[0:1] already saved + s_add_u32 s0, s0,s_save_alloc_size + s_addc_u32 s1, s1, 0 + s_setpc_b64 s[0:1] + + + for var i =0; i< 128; i++ + // be careful to make here a 64Byte aligned address, which could improve performance... + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:0 // first 64DW + buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 offset:256 // second 64DW + + if i!=127 + s_sub_u32 m0, m0, s3 // use a sgpr to shrink 2DW-inst to 1DW inst to improve performance , i.e. pack more LDS_DMA inst to one Cacheline + s_sub_u32 s_save_mem_offset, s_save_mem_offset, s3 + end + end + +else // BUFFER_STORE + v_mbcnt_lo_u32_b32 v2, 0xffffffff, 0x0 + v_mbcnt_hi_u32_b32 v3, 0xffffffff, v2 // tid + v_mul_i32_i24 v2, v3, 8 // tid*8 + v_mov_b32 v3, 256*2 + s_mov_b32 m0, 0x10000 + s_mov_b32 s0, s_save_buf_rsrc3 + s_and_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0xFF7FFFFF // disable add_tid + s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, 0x58000 //DFMT + +L_SAVE_LDS_LOOP_VECTOR: + ds_read_b64 v[0:1], v2 //x =LDS[a], byte address + s_waitcnt lgkmcnt(0) + buffer_store_dwordx2 v[0:1], v2, s_save_buf_rsrc0, s_save_mem_offset offen:1 glc:1 slc:1 +// s_waitcnt vmcnt(0) + v_add_u32 v2, vcc[0:1], v2, v3 + v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size + s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR + + // restore rsrc3 + s_mov_b32 s_save_buf_rsrc3, s0 + +end + +L_SAVE_LDS_DONE: + + + /* save VGPRs - set the Rest VGPRs */ + ////////////////////////////////////////////////////////////////////////////////////// + L_SAVE_VGPR: + // VGPR SR memory offset: 0 + // TODO rearrange the RSRC words to use swizzle for VGPR save... + + s_mov_b32 s_save_mem_offset, (0+256*4) // for the rest VGPRs + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_mov_b32 exec_hi, 0xFFFFFFFF + + s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size + s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 + s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible + s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) + if (SWIZZLE_EN) + s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? 
+ else + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + + // VGPR Allocated in 4-GPR granularity + +if G8SR_VGPR_SR_IN_DWX4 + // the const stride for DWx4 is 4*4 bytes + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, G8SR_SAVE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes + + s_mov_b32 m0, 4 // skip first 4 VGPRs + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc0 L_SAVE_VGPR_LOOP_END // no more vgprs + + s_set_gpr_idx_on m0, 0x1 // This will change M0 + s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 // because above inst change m0 +L_SAVE_VGPR_LOOP: + v_mov_b32 v0, v0 // v0 = v[0+m0] + v_mov_b32 v1, v1 + v_mov_b32 v2, v2 + v_mov_b32 v3, v3 + + + buffer_store_dwordx4 v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + s_add_u32 m0, m0, 4 + s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? + s_set_gpr_idx_off +L_SAVE_VGPR_LOOP_END: + + s_and_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE // reset const stride to 4 bytes +else + // VGPR store using dw burst + s_mov_b32 m0, 0x4 //VGPR initial index value =0 + s_cmp_lt_u32 m0, s_save_alloc_size + s_cbranch_scc0 L_SAVE_VGPR_END + + + s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 + s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later + + L_SAVE_VGPR_LOOP: + v_mov_b32 v0, v0 //v0 = v[0+m0] + v_mov_b32 v1, v1 //v0 = v[0+m0] + v_mov_b32 v2, v2 //v0 = v[0+m0] + v_mov_b32 v3, v3 //v0 = v[0+m0] + + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 + end + + s_add_u32 m0, m0, 4 //next vgpr index + s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 //every buffer_store_dword does 256 bytes + s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? + s_set_gpr_idx_off +end + +L_SAVE_VGPR_END: + + + + + + + /* S_PGM_END_SAVED */ //FIXME graphics ONLY + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) + s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] + s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 + s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over + s_rfe_b64 s_save_pc_lo //Return to the main shader program + else + end + +// Save Done timestamp +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_save_d + // SGPR SR memory offset : size(VGPR) + get_vgpr_size_bytes(s_save_mem_offset) + s_add_u32 s_save_mem_offset, s_save_mem_offset, G8SR_DEBUG_TS_SAVE_D_OFFSET + s_waitcnt lgkmcnt(0) //FIXME, will cause xnack?? + // Need reset rsrc2?? 
+ s_mov_b32 m0, s_save_mem_offset
+ s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ s_buffer_store_dwordx2 s_g8sr_ts_save_d, s_save_buf_rsrc0, m0 glc:1
+end
+
+
+ s_branch L_END_PGM
+
+
+
+/**************************************************************************/
+/* restore routine */
+/**************************************************************************/
+
+L_RESTORE:
+ /* Setup Resource Constants */
+ if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL))
+ //calculate wd_addr using absolute thread id
+ v_readlane_b32 s_restore_tmp, v9, 0
+ s_lshr_b32 s_restore_tmp, s_restore_tmp, 6
+ s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE
+ s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO
+ s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI
+ s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL
+ else
+ end
+
+if G8SR_DEBUG_TIMESTAMP
+ s_memrealtime s_g8sr_ts_restore_s
+ s_waitcnt lgkmcnt(0) //FIXME, will cause xnack??
+ // tma_lo/hi are sgprs 110, 111, which are not used in the 112-SGPR allocation case...
+ s_mov_b32 s_restore_pc_lo, s_g8sr_ts_restore_s[0]
+ s_mov_b32 s_restore_pc_hi, s_g8sr_ts_restore_s[1] //back up ts to ttmp0/1, since exec will finally be restored..
+end
+
+
+
+ s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo
+ s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi
+ s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE
+ s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes)
+ s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC
+ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK
+ s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position
+ s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC
+ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK
+ s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position
+ s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE
+
+ /* global mem offset */
+// s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0
+
+ /* the first wave in the threadgroup */
+ s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK
+ s_cbranch_scc0 L_RESTORE_VGPR
+
+ /* restore LDS */
+ //////////////////////////////
+ L_RESTORE_LDS:
+
+ s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead
+ s_mov_b32 exec_hi, 0xFFFFFFFF
+
+ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size
+ s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero?
+ s_cbranch_scc0 L_RESTORE_VGPR //no lds used?
jump to L_RESTORE_VGPR + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes + s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes + + // LDS at offset: size(VGPR)+SIZE(SGPR)+SIZE(HWREG) + // + get_vgpr_size_bytes(s_restore_mem_offset) + get_sgpr_size_bytes(s_restore_tmp) + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, get_hwreg_size_bytes() //FIXME, Check if offset overflow??? + + + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + s_mov_b32 m0, 0x0 //lds_offset initial value = 0 + + L_RESTORE_LDS_LOOP: + if (SAVE_LDS) + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 // first 64DW + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256 // second 64DW + end + s_add_u32 m0, m0, 256*2 // 128 DW + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*2 //mem offset increased by 128DW + s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete? + + + /* restore VGPRs */ + ////////////////////////////// + L_RESTORE_VGPR: + // VGPR SR memory offset : 0 + s_mov_b32 s_restore_mem_offset, 0x0 + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead + s_mov_b32 exec_hi, 0xFFFFFFFF + + s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 + s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) + s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) + if (SWIZZLE_EN) + s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? + else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + +if G8SR_VGPR_SR_IN_DWX4 + get_vgpr_size_bytes(s_restore_mem_offset) + s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 + + // the const stride for DWx4 is 4*4 bytes + s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, G8SR_RESTORE_BUF_RSRC_WORD1_STRIDE_DWx4 // const stride to 4*4 bytes + + s_mov_b32 m0, s_restore_alloc_size + s_set_gpr_idx_on m0, 0x8 // Note.. 
This will change m0 + +L_RESTORE_VGPR_LOOP: + buffer_load_dwordx4 v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 + s_waitcnt vmcnt(0) + s_sub_u32 m0, m0, 4 + v_mov_b32 v0, v0 // v[0+m0] = v0 + v_mov_b32 v1, v1 + v_mov_b32 v2, v2 + v_mov_b32 v3, v3 + s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 + s_cmp_eq_u32 m0, 0x8000 + s_cbranch_scc0 L_RESTORE_VGPR_LOOP + s_set_gpr_idx_off + + s_and_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, 0x0000FFFF // reset const stride to 0 + s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE // const stride to 4*4 bytes + +else + // VGPR load using dw burst + s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 + s_mov_b32 m0, 4 //VGPR initial index value = 1 + s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 + s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later + + L_RESTORE_VGPR_LOOP: + if(USE_MTBUF_INSTEAD_OF_MUBUF) + tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 + else + buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 + buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 + buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 + buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*3 + end + s_waitcnt vmcnt(0) //ensure data ready + v_mov_b32 v0, v0 //v[0+m0] = v0 + v_mov_b32 v1, v1 + v_mov_b32 v2, v2 + v_mov_b32 v3, v3 + s_add_u32 m0, m0, 4 //next vgpr index + s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 //every buffer_load_dword does 256 bytes + s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 + s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete? 
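+ // v0..v3 were used as staging registers by the burst loads above;
+ // reload their real contents last, from the saved start offset.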
+ s_set_gpr_idx_off
+ /* VGPR restore on v0 */
+ if(USE_MTBUF_INSTEAD_OF_MUBUF)
+ tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1
+ else
+ buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1
+ buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256
+ buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*2
+ buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 offset:256*3
+ end
+
+end
+
+ /* restore SGPRs */
+ //////////////////////////////
+
+ // SGPR SR memory offset : size(VGPR)
+ get_vgpr_size_bytes(s_restore_mem_offset)
+ get_sgpr_size_bytes(s_restore_tmp)
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
+ s_sub_u32 s_restore_mem_offset, s_restore_mem_offset, 16*4 // restore SGPRs from S[n] to S[0], in groups of 16 sgprs
+ // TODO, change RSRC word to rearrange memory layout for SGPRS
+
+ s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //sgpr_size
+ s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1
+ s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value)
+
+ if (SGPR_SAVE_USE_SQC)
+ s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes
+ else
+ s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads)
+ end
+ if (SWIZZLE_EN)
+ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else
+ s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes
+ end
+
+ /* If 112 SGPRs are allocated, 4 sgprs are not used: TBA(108,109), TMA(110,111).
+ However, we are safe to restore these 4 SGPRs anyway, since TBA,TMA will later be restored by HWREG
+ */
+ s_mov_b32 m0, s_restore_alloc_size
+
+ L_RESTORE_SGPR_LOOP:
+ read_16sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset) //PV: further performance improvement can be made
+ s_waitcnt lgkmcnt(0) //ensure data ready
+
+ s_sub_u32 m0, m0, 16 // Restore from S[n] to S[0]
+
+ s_movreld_b64 s0, s0 //s[0+m0] = s0
+ s_movreld_b64 s2, s2
+ s_movreld_b64 s4, s4
+ s_movreld_b64 s6, s6
+ s_movreld_b64 s8, s8
+ s_movreld_b64 s10, s10
+ s_movreld_b64 s12, s12
+ s_movreld_b64 s14, s14
+
+ s_cmp_eq_u32 m0, 0 //scc = (m0 == 0) ? 1 : 0
+ s_cbranch_scc0 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete?
+
+ /* restore HW registers */
+ //////////////////////////////
+ L_RESTORE_HWREG:
+
+
+if G8SR_DEBUG_TIMESTAMP
+ s_mov_b32 s_g8sr_ts_restore_s[0], s_restore_pc_lo
+ s_mov_b32 s_g8sr_ts_restore_s[1], s_restore_pc_hi
+end
+
+ // HWREG SR memory offset : size(VGPR)+size(SGPR)
+ get_vgpr_size_bytes(s_restore_mem_offset)
+ get_sgpr_size_bytes(s_restore_tmp)
+ s_add_u32 s_restore_mem_offset, s_restore_mem_offset, s_restore_tmp
+
+
+ s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes
+ if (SWIZZLE_EN)
+ s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking?
+ else + s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + end + + read_hwreg_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset) //M0 + read_hwreg_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //PC + read_hwreg_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //EXEC + read_hwreg_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset) + read_hwreg_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset) //STATUS + read_hwreg_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset) //TRAPSTS + read_hwreg_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_LO + read_hwreg_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //XNACK_MASK_HI + read_hwreg_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset) //MODE + read_hwreg_from_mem(tba_lo, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_LO + read_hwreg_from_mem(tba_hi, s_restore_buf_rsrc0, s_restore_mem_offset) //TBA_HI + + s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS + + s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS + + //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: + if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) + s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) + s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over + end + if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL)) + s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal + s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over + end + + s_mov_b32 m0, s_restore_m0 + s_mov_b32 exec_lo, s_restore_exec_lo + s_mov_b32 exec_hi, s_restore_exec_hi + + s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 + s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts + s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT + s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 + //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore + s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode + //reuse s_restore_m0 as a temp register + s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT + s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT + s_mov_b32 s_restore_tmp, 0x0 //IB_STS is zero + s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 + s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT + s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT + s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 + s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK + s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT + s_setreg_b32 hwreg(HW_REG_IB_STS), 
s_restore_tmp + + s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 + s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 + s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu + + s_barrier //barrier to ensure the readiness of LDS before access attempts from any other wave in the same TG //FIXME not performance-optimal at this time + +if G8SR_DEBUG_TIMESTAMP + s_memrealtime s_g8sr_ts_restore_d + s_waitcnt lgkmcnt(0) +end + +// s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution + s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc + + +/**************************************************************************/ +/* the END */ +/**************************************************************************/ +L_END_PGM: + s_endpgm + +end + + +/**************************************************************************/ +/* the helper functions */ +/**************************************************************************/ + +//Only for save hwreg to mem +function write_hwreg_to_mem(s, s_rsrc, s_mem_offset) + s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on + s_mov_b32 m0, s_mem_offset + s_buffer_store_dword s, s_rsrc, m0 glc:1 + s_add_u32 s_mem_offset, s_mem_offset, 4 + s_mov_b32 m0, exec_lo +end + + +// HWREG are saved before SGPRs, so all HWREG could be use. +function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) + + s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 + s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 + s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 + s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1 + s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 + s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 // +scc +end + + +function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) + s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 + s_add_u32 s_mem_offset, s_mem_offset, 4 +end + +function read_16sgpr_from_mem(s, s_rsrc, s_mem_offset) + s_buffer_load_dwordx16 s, s_rsrc, s_mem_offset glc:1 + s_sub_u32 s_mem_offset, s_mem_offset, 4*16 +end + + + +function get_lds_size_bytes(s_lds_size_byte) + // SQ LDS granularity is 64DW, while PGM_RSRC2.lds_size is in granularity 128DW + s_getreg_b32 s_lds_size_byte, hwreg(HW_REG_LDS_ALLOC, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT, SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) // lds_size + s_lshl_b32 s_lds_size_byte, s_lds_size_byte, 8 //LDS size in dwords = lds_size * 64 *4Bytes // granularity 64DW +end + +function get_vgpr_size_bytes(s_vgpr_size_byte) + s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size + s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 + s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible +end + +function get_sgpr_size_bytes(s_sgpr_size_byte) + s_getreg_b32 s_sgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size + s_add_u32 s_sgpr_size_byte, s_sgpr_size_byte, 1 + s_lshl_b32 s_sgpr_size_byte, s_sgpr_size_byte, 6 //Number of SGPRs = (sgpr_size + 1) * 16 *4 (non-zero value) +end + +function get_hwreg_size_bytes + return 128 //HWREG size 128 bytes +end + + +#endif + +static const uint32_t cwsr_trap_gfx8_hex[] = { + 0xbf820001, 0xbf820123, + 0xb8f4f802, 0x89748674, + 0xb8f5f803, 0x8675ff75, + 0x00000400, 0xbf850011, + 0xc00a1e37, 
0x00000000, + 0xbf8c007f, 0x87777978, + 0xbf840002, 0xb974f802, + 0xbe801d78, 0xb8f5f803, + 0x8675ff75, 0x000001ff, + 0xbf850002, 0x80708470, + 0x82718071, 0x8671ff71, + 0x0000ffff, 0xb974f802, + 0xbe801f70, 0xb8f5f803, + 0x8675ff75, 0x00000100, + 0xbf840006, 0xbefa0080, + 0xb97a0203, 0x8671ff71, + 0x0000ffff, 0x80f08870, + 0x82f18071, 0xbefa0080, + 0xb97a0283, 0xbef60068, + 0xbef70069, 0xb8fa1c07, + 0x8e7a9c7a, 0x87717a71, + 0xb8fa03c7, 0x8e7a9b7a, + 0x87717a71, 0xb8faf807, + 0x867aff7a, 0x00007fff, + 0xb97af807, 0xbef2007e, + 0xbef3007f, 0xbefe0180, + 0xbf900004, 0xbf8e0002, + 0xbf88fffe, 0xbef8007e, + 0x8679ff7f, 0x0000ffff, + 0x8779ff79, 0x00040000, + 0xbefa0080, 0xbefb00ff, + 0x00807fac, 0x867aff7f, + 0x08000000, 0x8f7a837a, + 0x877b7a7b, 0x867aff7f, + 0x70000000, 0x8f7a817a, + 0x877b7a7b, 0xbeef007c, + 0xbeee0080, 0xb8ee2a05, + 0x806e816e, 0x8e6e8a6e, + 0xb8fa1605, 0x807a817a, + 0x8e7a867a, 0x806e7a6e, + 0xbefa0084, 0xbefa00ff, + 0x01000000, 0xbefe007c, + 0xbefc006e, 0xc0611bfc, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611c3c, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611c7c, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611cbc, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611cfc, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611d3c, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xb8f5f803, + 0xbefe007c, 0xbefc006e, + 0xc0611d7c, 0x0000007c, + 0x806e846e, 0xbefc007e, + 0xbefe007c, 0xbefc006e, + 0xc0611dbc, 0x0000007c, + 0x806e846e, 0xbefc007e, + 0xbefe007c, 0xbefc006e, + 0xc0611dfc, 0x0000007c, + 0x806e846e, 0xbefc007e, + 0xb8eff801, 0xbefe007c, + 0xbefc006e, 0xc0611bfc, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611b3c, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0xbefe007c, + 0xbefc006e, 0xc0611b7c, + 0x0000007c, 0x806e846e, + 0xbefc007e, 0x867aff7f, + 0x04000000, 0xbef30080, + 0x8773737a, 0xb8ee2a05, + 0x806e816e, 0x8e6e8a6e, + 0xb8f51605, 0x80758175, + 0x8e758475, 0x8e7a8275, + 0xbefa00ff, 0x01000000, + 0xbef60178, 0x80786e78, + 0x82798079, 0xbefc0080, + 0xbe802b00, 0xbe822b02, + 0xbe842b04, 0xbe862b06, + 0xbe882b08, 0xbe8a2b0a, + 0xbe8c2b0c, 0xbe8e2b0e, + 0xc06b003c, 0x00000000, + 0xc06b013c, 0x00000010, + 0xc06b023c, 0x00000020, + 0xc06b033c, 0x00000030, + 0x8078c078, 0x82798079, + 0x807c907c, 0xbf0a757c, + 0xbf85ffeb, 0xbef80176, + 0xbeee0080, 0xbefe00c1, + 0xbeff00c1, 0xbefa00ff, + 0x01000000, 0xe0724000, + 0x6e1e0000, 0xe0724100, + 0x6e1e0100, 0xe0724200, + 0x6e1e0200, 0xe0724300, + 0x6e1e0300, 0xbefe00c1, + 0xbeff00c1, 0xb8f54306, + 0x8675c175, 0xbf84002c, + 0xbf8a0000, 0x867aff73, + 0x04000000, 0xbf840028, + 0x8e758675, 0x8e758275, + 0xbefa0075, 0xb8ee2a05, + 0x806e816e, 0x8e6e8a6e, + 0xb8fa1605, 0x807a817a, + 0x8e7a867a, 0x806e7a6e, + 0x806eff6e, 0x00000080, + 0xbefa00ff, 0x01000000, + 0xbefc0080, 0xd28c0002, + 0x000100c1, 0xd28d0003, + 0x000204c1, 0xd1060002, + 0x00011103, 0x7e0602ff, + 0x00000200, 0xbefc00ff, + 0x00010000, 0xbe80007b, + 0x867bff7b, 0xff7fffff, + 0x877bff7b, 0x00058000, + 0xd8ec0000, 0x00000002, + 0xbf8c007f, 0xe0765000, + 0x6e1e0002, 0x32040702, + 0xd0c9006a, 0x0000eb02, + 0xbf87fff7, 0xbefb0000, + 0xbeee00ff, 0x00000400, + 0xbefe00c1, 0xbeff00c1, + 0xb8f52a05, 0x80758175, + 0x8e758275, 0x8e7a8875, + 0xbefa00ff, 0x01000000, + 0xbefc0084, 0xbf0a757c, + 0xbf840015, 0xbf11017c, + 0x8075ff75, 0x00001000, + 0x7e000300, 0x7e020301, + 0x7e040302, 0x7e060303, + 0xe0724000, 0x6e1e0000, + 0xe0724100, 0x6e1e0100, + 
0xe0724200, 0x6e1e0200, + 0xe0724300, 0x6e1e0300, + 0x807c847c, 0x806eff6e, + 0x00000400, 0xbf0a757c, + 0xbf85ffef, 0xbf9c0000, + 0xbf8200ca, 0xbef8007e, + 0x8679ff7f, 0x0000ffff, + 0x8779ff79, 0x00040000, + 0xbefa0080, 0xbefb00ff, + 0x00807fac, 0x8676ff7f, + 0x08000000, 0x8f768376, + 0x877b767b, 0x8676ff7f, + 0x70000000, 0x8f768176, + 0x877b767b, 0x8676ff7f, + 0x04000000, 0xbf84001e, + 0xbefe00c1, 0xbeff00c1, + 0xb8f34306, 0x8673c173, + 0xbf840019, 0x8e738673, + 0x8e738273, 0xbefa0073, + 0xb8f22a05, 0x80728172, + 0x8e728a72, 0xb8f61605, + 0x80768176, 0x8e768676, + 0x80727672, 0x8072ff72, + 0x00000080, 0xbefa00ff, + 0x01000000, 0xbefc0080, + 0xe0510000, 0x721e0000, + 0xe0510100, 0x721e0000, + 0x807cff7c, 0x00000200, + 0x8072ff72, 0x00000200, + 0xbf0a737c, 0xbf85fff6, + 0xbef20080, 0xbefe00c1, + 0xbeff00c1, 0xb8f32a05, + 0x80738173, 0x8e738273, + 0x8e7a8873, 0xbefa00ff, + 0x01000000, 0xbef60072, + 0x8072ff72, 0x00000400, + 0xbefc0084, 0xbf11087c, + 0x8073ff73, 0x00008000, + 0xe0524000, 0x721e0000, + 0xe0524100, 0x721e0100, + 0xe0524200, 0x721e0200, + 0xe0524300, 0x721e0300, + 0xbf8c0f70, 0x7e000300, + 0x7e020301, 0x7e040302, + 0x7e060303, 0x807c847c, + 0x8072ff72, 0x00000400, + 0xbf0a737c, 0xbf85ffee, + 0xbf9c0000, 0xe0524000, + 0x761e0000, 0xe0524100, + 0x761e0100, 0xe0524200, + 0x761e0200, 0xe0524300, + 0x761e0300, 0xb8f22a05, + 0x80728172, 0x8e728a72, + 0xb8f61605, 0x80768176, + 0x8e768676, 0x80727672, + 0x80f2c072, 0xb8f31605, + 0x80738173, 0x8e738473, + 0x8e7a8273, 0xbefa00ff, + 0x01000000, 0xbefc0073, + 0xc031003c, 0x00000072, + 0x80f2c072, 0xbf8c007f, + 0x80fc907c, 0xbe802d00, + 0xbe822d02, 0xbe842d04, + 0xbe862d06, 0xbe882d08, + 0xbe8a2d0a, 0xbe8c2d0c, + 0xbe8e2d0e, 0xbf06807c, + 0xbf84fff1, 0xb8f22a05, + 0x80728172, 0x8e728a72, + 0xb8f61605, 0x80768176, + 0x8e768676, 0x80727672, + 0xbefa0084, 0xbefa00ff, + 0x01000000, 0xc0211cfc, + 0x00000072, 0x80728472, + 0xc0211c3c, 0x00000072, + 0x80728472, 0xc0211c7c, + 0x00000072, 0x80728472, + 0xc0211bbc, 0x00000072, + 0x80728472, 0xc0211bfc, + 0x00000072, 0x80728472, + 0xc0211d3c, 0x00000072, + 0x80728472, 0xc0211d7c, + 0x00000072, 0x80728472, + 0xc0211a3c, 0x00000072, + 0x80728472, 0xc0211a7c, + 0x00000072, 0x80728472, + 0xc0211dfc, 0x00000072, + 0x80728472, 0xc0211b3c, + 0x00000072, 0x80728472, + 0xc0211b7c, 0x00000072, + 0x80728472, 0xbf8c007f, + 0x8671ff71, 0x0000ffff, + 0xbefc0073, 0xbefe006e, + 0xbeff006f, 0x867375ff, + 0x000003ff, 0xb9734803, + 0x867375ff, 0xfffff800, + 0x8f738b73, 0xb973a2c3, + 0xb977f801, 0x8673ff71, + 0xf0000000, 0x8f739c73, + 0x8e739073, 0xbef60080, + 0x87767376, 0x8673ff71, + 0x08000000, 0x8f739b73, + 0x8e738f73, 0x87767376, + 0x8673ff74, 0x00800000, + 0x8f739773, 0xb976f807, + 0x86fe7e7e, 0x86ea6a6a, + 0xb974f802, 0xbf8a0000, + 0x95807370, 0xbf810000, +}; + diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 505d39156acd..62c3d9cd6ef1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -117,7 +117,7 @@ static int kfd_open(struct inode *inode, struct file *filep) return -EPERM; } - process = kfd_create_process(current); + process = kfd_create_process(filep); if (IS_ERR(process)) return PTR_ERR(process); @@ -206,6 +206,7 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties, q_properties->ctx_save_restore_area_address = args->ctx_save_restore_address; q_properties->ctx_save_restore_area_size = args->ctx_save_restore_size; + q_properties->ctl_stack_size = args->ctl_stack_size; if 
(args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE || args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL) q_properties->type = KFD_QUEUE_TYPE_COMPUTE; @@ -431,6 +432,38 @@ out: return err; } +static int kfd_ioctl_set_trap_handler(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_set_trap_handler_args *args = data; + struct kfd_dev *dev; + int err = 0; + struct kfd_process_device *pdd; + + dev = kfd_device_by_id(args->gpu_id); + if (dev == NULL) + return -EINVAL; + + mutex_lock(&p->mutex); + + pdd = kfd_bind_process_to_device(dev, p); + if (IS_ERR(pdd)) { + err = -ESRCH; + goto out; + } + + if (dev->dqm->ops.set_trap_handler(dev->dqm, + &pdd->qpd, + args->tba_addr, + args->tma_addr)) + err = -EINVAL; + +out: + mutex_unlock(&p->mutex); + + return err; +} + static int kfd_ioctl_dbg_register(struct file *filep, struct kfd_process *p, void *data) { @@ -493,7 +526,7 @@ static int kfd_ioctl_dbg_unregister(struct file *filep, long status; dev = kfd_device_by_id(args->gpu_id); - if (!dev) + if (!dev || !dev->dbgmgr) return -EINVAL; if (dev->device_info->asic_family == CHIP_CARRIZO) { @@ -979,7 +1012,10 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { kfd_ioctl_set_scratch_backing_va, 0), AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_TILE_CONFIG, - kfd_ioctl_get_tile_config, 0) + kfd_ioctl_get_tile_config, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_TRAP_HANDLER, + kfd_ioctl_set_trap_handler, 0), }; #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) @@ -1088,6 +1124,10 @@ static int kfd_mmap(struct file *filp, struct vm_area_struct *vma) KFD_MMAP_EVENTS_MASK) { vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_EVENTS_MASK; return kfd_event_mmap(process, vma); + } else if ((vma->vm_pgoff & KFD_MMAP_RESERVED_MEM_MASK) == + KFD_MMAP_RESERVED_MEM_MASK) { + vma->vm_pgoff = vma->vm_pgoff ^ KFD_MMAP_RESERVED_MEM_MASK; + return kfd_reserved_mem_mmap(process, vma); } return -EFAULT; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c new file mode 100644 index 000000000000..2bc2816767a7 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c @@ -0,0 +1,1267 @@ +/* + * Copyright 2015-2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <linux/pci.h> +#include <linux/acpi.h> +#include <linux/amd-iommu.h> +#include "kfd_crat.h" +#include "kfd_priv.h" +#include "kfd_topology.h" + +/* GPU Processor ID base for dGPUs for which VCRAT needs to be created. 
+ * GPU processor ID are expressed with Bit[31]=1. + * The base is set to 0x8000_0000 + 0x1000 to avoid collision with GPU IDs + * used in the CRAT. + */ +static uint32_t gpu_processor_id_low = 0x80001000; + +/* Return the next available gpu_processor_id and increment it for next GPU + * @total_cu_count - Total CUs present in the GPU including ones + * masked off + */ +static inline unsigned int get_and_inc_gpu_processor_id( + unsigned int total_cu_count) +{ + int current_id = gpu_processor_id_low; + + gpu_processor_id_low += total_cu_count; + return current_id; +} + +/* Static table to describe GPU Cache information */ +struct kfd_gpu_cache_info { + uint32_t cache_size; + uint32_t cache_level; + uint32_t flags; + /* Indicates how many Compute Units share this cache + * Value = 1 indicates the cache is not shared + */ + uint32_t num_cu_shared; +}; + +static struct kfd_gpu_cache_info kaveri_cache_info[] = { + { + /* TCP L1 Cache per CU */ + .cache_size = 16, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_DATA_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 1, + + }, + { + /* Scalar L1 Instruction Cache (in SQC module) per bank */ + .cache_size = 16, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_INST_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 2, + }, + { + /* Scalar L1 Data Cache (in SQC module) per bank */ + .cache_size = 8, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_DATA_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 2, + }, + + /* TODO: Add L2 Cache information */ +}; + + +static struct kfd_gpu_cache_info carrizo_cache_info[] = { + { + /* TCP L1 Cache per CU */ + .cache_size = 16, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_DATA_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 1, + }, + { + /* Scalar L1 Instruction Cache (in SQC module) per bank */ + .cache_size = 8, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_INST_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 4, + }, + { + /* Scalar L1 Data Cache (in SQC module) per bank. */ + .cache_size = 4, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_DATA_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 4, + }, + + /* TODO: Add L2 Cache information */ +}; + +/* NOTE: In future if more information is added to struct kfd_gpu_cache_info + * the following ASICs may need a separate table. 
+ */ +#define hawaii_cache_info kaveri_cache_info +#define tonga_cache_info carrizo_cache_info +#define fiji_cache_info carrizo_cache_info +#define polaris10_cache_info carrizo_cache_info +#define polaris11_cache_info carrizo_cache_info + +static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, + struct crat_subtype_computeunit *cu) +{ + dev->node_props.cpu_cores_count = cu->num_cpu_cores; + dev->node_props.cpu_core_id_base = cu->processor_id_low; + if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT) + dev->node_props.capability |= HSA_CAP_ATS_PRESENT; + + pr_debug("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores, + cu->processor_id_low); +} + +static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev, + struct crat_subtype_computeunit *cu) +{ + dev->node_props.simd_id_base = cu->processor_id_low; + dev->node_props.simd_count = cu->num_simd_cores; + dev->node_props.lds_size_in_kb = cu->lds_size_in_kb; + dev->node_props.max_waves_per_simd = cu->max_waves_simd; + dev->node_props.wave_front_size = cu->wave_front_size; + dev->node_props.array_count = cu->array_count; + dev->node_props.cu_per_simd_array = cu->num_cu_per_array; + dev->node_props.simd_per_cu = cu->num_simd_per_cu; + dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu; + if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE) + dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE; + pr_debug("CU GPU: id_base=%d\n", cu->processor_id_low); +} + +/* kfd_parse_subtype_cu - parse compute unit subtypes and attach it to correct + * topology device present in the device_list + */ +static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu, + struct list_head *device_list) +{ + struct kfd_topology_device *dev; + + pr_debug("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n", + cu->proximity_domain, cu->hsa_capability); + list_for_each_entry(dev, device_list, list) { + if (cu->proximity_domain == dev->proximity_domain) { + if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT) + kfd_populated_cu_info_cpu(dev, cu); + + if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT) + kfd_populated_cu_info_gpu(dev, cu); + break; + } + } + + return 0; +} + +/* kfd_parse_subtype_mem - parse memory subtypes and attach it to correct + * topology device present in the device_list + */ +static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem, + struct list_head *device_list) +{ + struct kfd_mem_properties *props; + struct kfd_topology_device *dev; + + pr_debug("Found memory entry in CRAT table with proximity_domain=%d\n", + mem->proximity_domain); + list_for_each_entry(dev, device_list, list) { + if (mem->proximity_domain == dev->proximity_domain) { + props = kfd_alloc_struct(props); + if (!props) + return -ENOMEM; + + /* We're on GPU node */ + if (dev->node_props.cpu_cores_count == 0) { + /* APU */ + if (mem->visibility_type == 0) + props->heap_type = + HSA_MEM_HEAP_TYPE_FB_PRIVATE; + /* dGPU */ + else + props->heap_type = mem->visibility_type; + } else + props->heap_type = HSA_MEM_HEAP_TYPE_SYSTEM; + + if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE) + props->flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE; + if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE) + props->flags |= HSA_MEM_FLAGS_NON_VOLATILE; + + props->size_in_bytes = + ((uint64_t)mem->length_high << 32) + + mem->length_low; + props->width = mem->width; + + dev->node_props.mem_banks_count++; + list_add_tail(&props->list, &dev->mem_props); + + break; + } + } + + return 0; +} + +/* kfd_parse_subtype_cache - parse cache subtypes and attach it to correct + * topology 
device present in the device_list + */ +static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache, + struct list_head *device_list) +{ + struct kfd_cache_properties *props; + struct kfd_topology_device *dev; + uint32_t id; + uint32_t total_num_of_cu; + + id = cache->processor_id_low; + + pr_debug("Found cache entry in CRAT table with processor_id=%d\n", id); + list_for_each_entry(dev, device_list, list) { + total_num_of_cu = (dev->node_props.array_count * + dev->node_props.cu_per_simd_array); + + /* Cache infomration in CRAT doesn't have proximity_domain + * information as it is associated with a CPU core or GPU + * Compute Unit. So map the cache using CPU core Id or SIMD + * (GPU) ID. + * TODO: This works because currently we can safely assume that + * Compute Units are parsed before caches are parsed. In + * future, remove this dependency + */ + if ((id >= dev->node_props.cpu_core_id_base && + id <= dev->node_props.cpu_core_id_base + + dev->node_props.cpu_cores_count) || + (id >= dev->node_props.simd_id_base && + id < dev->node_props.simd_id_base + + total_num_of_cu)) { + props = kfd_alloc_struct(props); + if (!props) + return -ENOMEM; + + props->processor_id_low = id; + props->cache_level = cache->cache_level; + props->cache_size = cache->cache_size; + props->cacheline_size = cache->cache_line_size; + props->cachelines_per_tag = cache->lines_per_tag; + props->cache_assoc = cache->associativity; + props->cache_latency = cache->cache_latency; + memcpy(props->sibling_map, cache->sibling_map, + sizeof(props->sibling_map)); + + if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE) + props->cache_type |= HSA_CACHE_TYPE_DATA; + if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE) + props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION; + if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE) + props->cache_type |= HSA_CACHE_TYPE_CPU; + if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE) + props->cache_type |= HSA_CACHE_TYPE_HSACU; + + dev->cache_count++; + dev->node_props.caches_count++; + list_add_tail(&props->list, &dev->cache_props); + + break; + } + } + + return 0; +} + +/* kfd_parse_subtype_iolink - parse iolink subtypes and attach it to correct + * topology device present in the device_list + */ +static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink, + struct list_head *device_list) +{ + struct kfd_iolink_properties *props = NULL, *props2; + struct kfd_topology_device *dev, *cpu_dev; + uint32_t id_from; + uint32_t id_to; + + id_from = iolink->proximity_domain_from; + id_to = iolink->proximity_domain_to; + + pr_debug("Found IO link entry in CRAT table with id_from=%d\n", + id_from); + list_for_each_entry(dev, device_list, list) { + if (id_from == dev->proximity_domain) { + props = kfd_alloc_struct(props); + if (!props) + return -ENOMEM; + + props->node_from = id_from; + props->node_to = id_to; + props->ver_maj = iolink->version_major; + props->ver_min = iolink->version_minor; + props->iolink_type = iolink->io_interface_type; + + if (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS) + props->weight = 20; + else + props->weight = node_distance(id_from, id_to); + + props->min_latency = iolink->minimum_latency; + props->max_latency = iolink->maximum_latency; + props->min_bandwidth = iolink->minimum_bandwidth_mbs; + props->max_bandwidth = iolink->maximum_bandwidth_mbs; + props->rec_transfer_size = + iolink->recommended_transfer_size; + + dev->io_link_count++; + dev->node_props.io_links_count++; + list_add_tail(&props->list, &dev->io_link_props); + break; + } + } + + /* CPU topology is created 
before GPUs are detected, so CPU->GPU
+ * links are not built at that time. If a PCIe type is discovered, it
+ * means a GPU is detected and we are adding GPU->CPU to the topology.
+ * At this time, also add the corresponding CPU->GPU link.
+ */
+ if (props && props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS) {
+ cpu_dev = kfd_topology_device_by_proximity_domain(id_to);
+ if (!cpu_dev)
+ return -ENODEV;
+ /* same everything but the other direction */
+ props2 = kmemdup(props, sizeof(*props2), GFP_KERNEL);
+ if (!props2)
+ return -ENOMEM;
+ props2->node_from = id_to;
+ props2->node_to = id_from;
+ props2->kobj = NULL;
+ cpu_dev->io_link_count++;
+ cpu_dev->node_props.io_links_count++;
+ list_add_tail(&props2->list, &cpu_dev->io_link_props);
+ }
+
+ return 0;
+}
+
+/* kfd_parse_subtype - parse subtypes and attach them to the correct topology
+ * device present in the device_list
+ * @sub_type_hdr - subtype section of crat_image
+ * @device_list - list of topology devices present in this crat_image
+ */
+static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr,
+ struct list_head *device_list)
+{
+ struct crat_subtype_computeunit *cu;
+ struct crat_subtype_memory *mem;
+ struct crat_subtype_cache *cache;
+ struct crat_subtype_iolink *iolink;
+ int ret = 0;
+
+ switch (sub_type_hdr->type) {
+ case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY:
+ cu = (struct crat_subtype_computeunit *)sub_type_hdr;
+ ret = kfd_parse_subtype_cu(cu, device_list);
+ break;
+ case CRAT_SUBTYPE_MEMORY_AFFINITY:
+ mem = (struct crat_subtype_memory *)sub_type_hdr;
+ ret = kfd_parse_subtype_mem(mem, device_list);
+ break;
+ case CRAT_SUBTYPE_CACHE_AFFINITY:
+ cache = (struct crat_subtype_cache *)sub_type_hdr;
+ ret = kfd_parse_subtype_cache(cache, device_list);
+ break;
+ case CRAT_SUBTYPE_TLB_AFFINITY:
+ /*
+ * For now, nothing to do here
+ */
+ pr_debug("Found TLB entry in CRAT table (not processing)\n");
+ break;
+ case CRAT_SUBTYPE_CCOMPUTE_AFFINITY:
+ /*
+ * For now, nothing to do here
+ */
+ pr_debug("Found CCOMPUTE entry in CRAT table (not processing)\n");
+ break;
+ case CRAT_SUBTYPE_IOLINK_AFFINITY:
+ iolink = (struct crat_subtype_iolink *)sub_type_hdr;
+ ret = kfd_parse_subtype_iolink(iolink, device_list);
+ break;
+ default:
+ pr_warn("Unknown subtype %d in CRAT\n",
+ sub_type_hdr->type);
+ }
+
+ return ret;
+}
+
+/* kfd_parse_crat_table - parse CRAT table. For each node present in CRAT
+ * create a kfd_topology_device and add it to device_list.
Also parse + * CRAT subtypes and attach it to appropriate kfd_topology_device + * @crat_image - input image containing CRAT + * @device_list - [OUT] list of kfd_topology_device generated after + * parsing crat_image + * @proximity_domain - Proximity domain of the first device in the table + * + * Return - 0 if successful else -ve value + */ +int kfd_parse_crat_table(void *crat_image, struct list_head *device_list, + uint32_t proximity_domain) +{ + struct kfd_topology_device *top_dev = NULL; + struct crat_subtype_generic *sub_type_hdr; + uint16_t node_id; + int ret = 0; + struct crat_header *crat_table = (struct crat_header *)crat_image; + uint16_t num_nodes; + uint32_t image_len; + + if (!crat_image) + return -EINVAL; + + if (!list_empty(device_list)) { + pr_warn("Error device list should be empty\n"); + return -EINVAL; + } + + num_nodes = crat_table->num_domains; + image_len = crat_table->length; + + pr_info("Parsing CRAT table with %d nodes\n", num_nodes); + + for (node_id = 0; node_id < num_nodes; node_id++) { + top_dev = kfd_create_topology_device(device_list); + if (!top_dev) + break; + top_dev->proximity_domain = proximity_domain++; + } + + if (!top_dev) { + ret = -ENOMEM; + goto err; + } + + memcpy(top_dev->oem_id, crat_table->oem_id, CRAT_OEMID_LENGTH); + memcpy(top_dev->oem_table_id, crat_table->oem_table_id, + CRAT_OEMTABLEID_LENGTH); + top_dev->oem_revision = crat_table->oem_revision; + + sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); + while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) < + ((char *)crat_image) + image_len) { + if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) { + ret = kfd_parse_subtype(sub_type_hdr, device_list); + if (ret) + break; + } + + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length); + } + +err: + if (ret) + kfd_release_topology_device_list(device_list); + + return ret; +} + +/* Helper function. See kfd_fill_gpu_cache_info for parameter description */ +static int fill_in_pcache(struct crat_subtype_cache *pcache, + struct kfd_gpu_cache_info *pcache_info, + struct kfd_cu_info *cu_info, + int mem_available, + int cu_bitmask, + int cache_type, unsigned int cu_processor_id, + int cu_block) +{ + unsigned int cu_sibling_map_mask; + int first_active_cu; + + /* First check if enough memory is available */ + if (sizeof(struct crat_subtype_cache) > mem_available) + return -ENOMEM; + + cu_sibling_map_mask = cu_bitmask; + cu_sibling_map_mask >>= cu_block; + cu_sibling_map_mask &= + ((1 << pcache_info[cache_type].num_cu_shared) - 1); + first_active_cu = ffs(cu_sibling_map_mask); + + /* CU could be inactive. In case of shared cache find the first active + * CU. and incase of non-shared cache check if the CU is inactive. 
If
+ * it is inactive, skip it.
+ */
+ if (first_active_cu) {
+ memset(pcache, 0, sizeof(struct crat_subtype_cache));
+ pcache->type = CRAT_SUBTYPE_CACHE_AFFINITY;
+ pcache->length = sizeof(struct crat_subtype_cache);
+ pcache->flags = pcache_info[cache_type].flags;
+ pcache->processor_id_low = cu_processor_id
+ + (first_active_cu - 1);
+ pcache->cache_level = pcache_info[cache_type].cache_level;
+ pcache->cache_size = pcache_info[cache_type].cache_size;
+
+ /* Sibling map is w.r.t processor_id_low, so shift out
+ * inactive CU
+ */
+ cu_sibling_map_mask =
+ cu_sibling_map_mask >> (first_active_cu - 1);
+
+ pcache->sibling_map[0] = (uint8_t)(cu_sibling_map_mask & 0xFF);
+ pcache->sibling_map[1] =
+ (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
+ pcache->sibling_map[2] =
+ (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
+ pcache->sibling_map[3] =
+ (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
+ return 0;
+ }
+ return 1;
+}
+
+/* kfd_fill_gpu_cache_info - Fill GPU cache info using kfd_gpu_cache_info
+ * tables
+ *
+ * @kdev - [IN] GPU device
+ * @gpu_processor_id - [IN] GPU processor ID to which these caches
+ * are associated
+ * @available_size - [IN] Amount of memory available in pcache
+ * @cu_info - [IN] Compute Unit info obtained from KGD
+ * @pcache - [OUT] memory into which cache data is to be filled in.
+ * @size_filled - [OUT] amount of data used up in pcache.
+ * @num_of_entries - [OUT] number of caches added
+ */
+static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
+ int gpu_processor_id,
+ int available_size,
+ struct kfd_cu_info *cu_info,
+ struct crat_subtype_cache *pcache,
+ int *size_filled,
+ int *num_of_entries)
+{
+ struct kfd_gpu_cache_info *pcache_info;
+ int num_of_cache_types = 0;
+ int i, j, k;
+ int ct = 0;
+ int mem_available = available_size;
+ unsigned int cu_processor_id;
+ int ret;
+
+ switch (kdev->device_info->asic_family) {
+ case CHIP_KAVERI:
+ pcache_info = kaveri_cache_info;
+ num_of_cache_types = ARRAY_SIZE(kaveri_cache_info);
+ break;
+ case CHIP_HAWAII:
+ pcache_info = hawaii_cache_info;
+ num_of_cache_types = ARRAY_SIZE(hawaii_cache_info);
+ break;
+ case CHIP_CARRIZO:
+ pcache_info = carrizo_cache_info;
+ num_of_cache_types = ARRAY_SIZE(carrizo_cache_info);
+ break;
+ case CHIP_TONGA:
+ pcache_info = tonga_cache_info;
+ num_of_cache_types = ARRAY_SIZE(tonga_cache_info);
+ break;
+ case CHIP_FIJI:
+ pcache_info = fiji_cache_info;
+ num_of_cache_types = ARRAY_SIZE(fiji_cache_info);
+ break;
+ case CHIP_POLARIS10:
+ pcache_info = polaris10_cache_info;
+ num_of_cache_types = ARRAY_SIZE(polaris10_cache_info);
+ break;
+ case CHIP_POLARIS11:
+ pcache_info = polaris11_cache_info;
+ num_of_cache_types = ARRAY_SIZE(polaris11_cache_info);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ *size_filled = 0;
+ *num_of_entries = 0;
+
+ /* For each type of cache listed in the kfd_gpu_cache_info table,
+ * go through all available Compute Units.
+ * The [i,j,k] loop walks shader engines, arrays and CUs:
+ * if kfd_gpu_cache_info.num_cu_shared == 1,
+ * it visits every available CU;
+ * if kfd_gpu_cache_info.num_cu_shared != 1,
+ * it considers only one CU from each
+ * group sharing the cache
+ */
+
+ for (ct = 0; ct < num_of_cache_types; ct++) {
+ cu_processor_id = gpu_processor_id;
+ for (i = 0; i < cu_info->num_shader_engines; i++) {
+ for (j = 0; j < cu_info->num_shader_arrays_per_engine;
+ j++) {
+ for (k = 0; k < cu_info->num_cu_per_sh;
+ k += pcache_info[ct].num_cu_shared) {
+
+ ret = fill_in_pcache(pcache,
+ pcache_info,
+ cu_info,
+ mem_available,
+ cu_info->cu_bitmap[i][j],
+ ct,
+ cu_processor_id,
+ k);
+
+ if (ret < 0)
+ break;
+
+ if (!ret) {
+ pcache++;
+ (*num_of_entries)++;
+ mem_available -=
+ sizeof(*pcache);
+ (*size_filled) +=
+ sizeof(*pcache);
+ }
+
+ /* Move to next CU block */
+ cu_processor_id +=
+ pcache_info[ct].num_cu_shared;
+ }
+ }
+ }
+ }
+
+ pr_debug("Added [%d] GPU cache entries\n", *num_of_entries);
+
+ return 0;
+}
+
+/*
+ * kfd_create_crat_image_acpi - Allocates memory for CRAT image and
+ * copies CRAT from ACPI (if available).
+ * NOTE: Call kfd_destroy_crat_image to free CRAT image memory
+ *
+ * @crat_image: CRAT read from ACPI. If no CRAT in ACPI then
+ * crat_image will be NULL
+ * @size: [OUT] size of crat_image
+ *
+ * Return 0 if successful else return error code
+ */
+int kfd_create_crat_image_acpi(void **crat_image, size_t *size)
+{
+ struct acpi_table_header *crat_table;
+ acpi_status status;
+ void *pcrat_image;
+
+ if (!crat_image)
+ return -EINVAL;
+
+ *crat_image = NULL;
+
+ /* Fetch the CRAT table from ACPI */
+ status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table);
+ if (status == AE_NOT_FOUND) {
+ pr_warn("CRAT table not found\n");
+ return -ENODATA;
+ } else if (ACPI_FAILURE(status)) {
+ const char *err = acpi_format_exception(status);
+
+ pr_err("CRAT table error: %s\n", err);
+ return -EINVAL;
+ }
+
+ if (ignore_crat) {
+ pr_info("CRAT table disabled by module option\n");
+ return -ENODATA;
+ }
+
+ pcrat_image = kmalloc(crat_table->length, GFP_KERNEL);
+ if (!pcrat_image)
+ return -ENOMEM;
+
+ memcpy(pcrat_image, crat_table, crat_table->length);
+
+ *crat_image = pcrat_image;
+ *size = crat_table->length;
+
+ return 0;
+}
+
+/* Memory required to create Virtual CRAT.
+ * Since there is no easy way to predict the amount of memory required, the
+ * following amounts are allocated for CPU and GPU Virtual CRAT. This is
+ * expected to cover all known conditions. But to be safe, an additional check
+ * is put in the code to ensure we don't overwrite.
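+ * If either estimate ever proves too small, the avail_size checks in the
+ * fill functions below fail with -ENOMEM instead of writing past the end.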
+ */ +#define VCRAT_SIZE_FOR_CPU (2 * PAGE_SIZE) +#define VCRAT_SIZE_FOR_GPU (3 * PAGE_SIZE) + +/* kfd_fill_cu_for_cpu - Fill in Compute info for the given CPU NUMA node + * + * @numa_node_id: CPU NUMA node id + * @avail_size: Available size in the memory + * @sub_type_hdr: Memory into which compute info will be filled in + * + * Return 0 if successful else return -ve value + */ +static int kfd_fill_cu_for_cpu(int numa_node_id, int *avail_size, + int proximity_domain, + struct crat_subtype_computeunit *sub_type_hdr) +{ + const struct cpumask *cpumask; + + *avail_size -= sizeof(struct crat_subtype_computeunit); + if (*avail_size < 0) + return -ENOMEM; + + memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); + + /* Fill in subtype header data */ + sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY; + sub_type_hdr->length = sizeof(struct crat_subtype_computeunit); + sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; + + cpumask = cpumask_of_node(numa_node_id); + + /* Fill in CU data */ + sub_type_hdr->flags |= CRAT_CU_FLAGS_CPU_PRESENT; + sub_type_hdr->proximity_domain = proximity_domain; + sub_type_hdr->processor_id_low = kfd_numa_node_to_apic_id(numa_node_id); + if (sub_type_hdr->processor_id_low == -1) + return -EINVAL; + + sub_type_hdr->num_cpu_cores = cpumask_weight(cpumask); + + return 0; +} + +/* kfd_fill_mem_info_for_cpu - Fill in Memory info for the given CPU NUMA node + * + * @numa_node_id: CPU NUMA node id + * @avail_size: Available size in the memory + * @sub_type_hdr: Memory into which compute info will be filled in + * + * Return 0 if successful else return -ve value + */ +static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size, + int proximity_domain, + struct crat_subtype_memory *sub_type_hdr) +{ + uint64_t mem_in_bytes = 0; + pg_data_t *pgdat; + int zone_type; + + *avail_size -= sizeof(struct crat_subtype_memory); + if (*avail_size < 0) + return -ENOMEM; + + memset(sub_type_hdr, 0, sizeof(struct crat_subtype_memory)); + + /* Fill in subtype header data */ + sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY; + sub_type_hdr->length = sizeof(struct crat_subtype_memory); + sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; + + /* Fill in Memory Subunit data */ + + /* Unlike si_meminfo, si_meminfo_node is not exported. 
So + * the following lines are duplicated from si_meminfo_node + * function + */ + pgdat = NODE_DATA(numa_node_id); + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) + mem_in_bytes += pgdat->node_zones[zone_type].managed_pages; + mem_in_bytes <<= PAGE_SHIFT; + + sub_type_hdr->length_low = lower_32_bits(mem_in_bytes); + sub_type_hdr->length_high = upper_32_bits(mem_in_bytes); + sub_type_hdr->proximity_domain = proximity_domain; + + return 0; +} + +static int kfd_fill_iolink_info_for_cpu(int numa_node_id, int *avail_size, + uint32_t *num_entries, + struct crat_subtype_iolink *sub_type_hdr) +{ + int nid; + struct cpuinfo_x86 *c = &cpu_data(0); + uint8_t link_type; + + if (c->x86_vendor == X86_VENDOR_AMD) + link_type = CRAT_IOLINK_TYPE_HYPERTRANSPORT; + else + link_type = CRAT_IOLINK_TYPE_QPI_1_1; + + *num_entries = 0; + + /* Create IO links from this node to other CPU nodes */ + for_each_online_node(nid) { + if (nid == numa_node_id) /* node itself */ + continue; + + *avail_size -= sizeof(struct crat_subtype_iolink); + if (*avail_size < 0) + return -ENOMEM; + + memset(sub_type_hdr, 0, sizeof(struct crat_subtype_iolink)); + + /* Fill in subtype header data */ + sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY; + sub_type_hdr->length = sizeof(struct crat_subtype_iolink); + sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; + + /* Fill in IO link data */ + sub_type_hdr->proximity_domain_from = numa_node_id; + sub_type_hdr->proximity_domain_to = nid; + sub_type_hdr->io_interface_type = link_type; + + (*num_entries)++; + sub_type_hdr++; + } + + return 0; +} + +/* kfd_create_vcrat_image_cpu - Create Virtual CRAT for CPU + * + * @pcrat_image: Fill in VCRAT for CPU + * @size: [IN] allocated size of crat_image. + * [OUT] actual size of data filled in crat_image + */ +static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size) +{ + struct crat_header *crat_table = (struct crat_header *)pcrat_image; + struct acpi_table_header *acpi_table; + acpi_status status; + struct crat_subtype_generic *sub_type_hdr; + int avail_size = *size; + int numa_node_id; + uint32_t entries = 0; + int ret = 0; + + if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_CPU) + return -EINVAL; + + /* Fill in CRAT Header. + * Modify length and total_entries as subunits are added. 
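+ * Per NUMA node, the layout below is: one compute-unit subtype, one
+ * memory subtype, then one IO link subtype per other online node.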
+ */ + avail_size -= sizeof(struct crat_header); + if (avail_size < 0) + return -ENOMEM; + + memset(crat_table, 0, sizeof(struct crat_header)); + memcpy(&crat_table->signature, CRAT_SIGNATURE, + sizeof(crat_table->signature)); + crat_table->length = sizeof(struct crat_header); + + status = acpi_get_table("DSDT", 0, &acpi_table); + if (status == AE_NOT_FOUND) + pr_warn("DSDT table not found for OEM information\n"); + else { + crat_table->oem_revision = acpi_table->revision; + memcpy(crat_table->oem_id, acpi_table->oem_id, + CRAT_OEMID_LENGTH); + memcpy(crat_table->oem_table_id, acpi_table->oem_table_id, + CRAT_OEMTABLEID_LENGTH); + } + crat_table->total_entries = 0; + crat_table->num_domains = 0; + + sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); + + for_each_online_node(numa_node_id) { + if (kfd_numa_node_to_apic_id(numa_node_id) == -1) + continue; + + /* Fill in Subtype: Compute Unit */ + ret = kfd_fill_cu_for_cpu(numa_node_id, &avail_size, + crat_table->num_domains, + (struct crat_subtype_computeunit *)sub_type_hdr); + if (ret < 0) + return ret; + crat_table->length += sub_type_hdr->length; + crat_table->total_entries++; + + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length); + + /* Fill in Subtype: Memory */ + ret = kfd_fill_mem_info_for_cpu(numa_node_id, &avail_size, + crat_table->num_domains, + (struct crat_subtype_memory *)sub_type_hdr); + if (ret < 0) + return ret; + crat_table->length += sub_type_hdr->length; + crat_table->total_entries++; + + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length); + + /* Fill in Subtype: IO Link */ + ret = kfd_fill_iolink_info_for_cpu(numa_node_id, &avail_size, + &entries, + (struct crat_subtype_iolink *)sub_type_hdr); + if (ret < 0) + return ret; + crat_table->length += (sub_type_hdr->length * entries); + crat_table->total_entries += entries; + + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length * entries); + + crat_table->num_domains++; + } + + /* TODO: Add cache Subtype for CPU. + * Currently, CPU cache information is available in function + * detect_cache_attributes(cpu) defined in the file + * ./arch/x86/kernel/cpu/intel_cacheinfo.c. This function is not + * exported and to get the same information the code needs to be + * duplicated. 
+ */
+
+ *size = crat_table->length;
+ pr_info("Virtual CRAT table created for CPU\n");
+
+ return 0;
+}
+
+static int kfd_fill_gpu_memory_affinity(int *avail_size,
+ struct kfd_dev *kdev, uint8_t type, uint64_t size,
+ struct crat_subtype_memory *sub_type_hdr,
+ uint32_t proximity_domain,
+ const struct kfd_local_mem_info *local_mem_info)
+{
+ *avail_size -= sizeof(struct crat_subtype_memory);
+ if (*avail_size < 0)
+ return -ENOMEM;
+
+ memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_memory));
+ sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY;
+ sub_type_hdr->length = sizeof(struct crat_subtype_memory);
+ sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;
+
+ sub_type_hdr->proximity_domain = proximity_domain;
+
+ pr_debug("Fill gpu memory affinity - type 0x%x size 0x%llx\n",
+ type, size);
+
+ sub_type_hdr->length_low = lower_32_bits(size);
+ sub_type_hdr->length_high = upper_32_bits(size);
+
+ sub_type_hdr->width = local_mem_info->vram_width;
+ sub_type_hdr->visibility_type = type;
+
+ return 0;
+}
+
+/* kfd_fill_gpu_direct_io_link - Fill in direct io link from GPU
+ * to its NUMA node
+ * @avail_size: Available size in the memory
+ * @kdev - [IN] GPU device
+ * @sub_type_hdr: Memory into which io link info will be filled in
+ * @proximity_domain - proximity domain of the GPU node
+ *
+ * Return 0 if successful else return -ve value
+ */
+static int kfd_fill_gpu_direct_io_link(int *avail_size,
+ struct kfd_dev *kdev,
+ struct crat_subtype_iolink *sub_type_hdr,
+ uint32_t proximity_domain)
+{
+ *avail_size -= sizeof(struct crat_subtype_iolink);
+ if (*avail_size < 0)
+ return -ENOMEM;
+
+ memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));
+
+ /* Fill in subtype header data */
+ sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
+ sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
+ sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;
+
+ /* Fill in IOLINK subtype.
+ * TODO: Fill-in other fields of iolink subtype
+ */
+ sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS;
+ sub_type_hdr->proximity_domain_from = proximity_domain;
+#ifdef CONFIG_NUMA
+ if (kdev->pdev->dev.numa_node == NUMA_NO_NODE)
+ sub_type_hdr->proximity_domain_to = 0;
+ else
+ sub_type_hdr->proximity_domain_to = kdev->pdev->dev.numa_node;
+#else
+ sub_type_hdr->proximity_domain_to = 0;
+#endif
+ return 0;
+}
+
+/* kfd_create_vcrat_image_gpu - Create Virtual CRAT for GPU
+ *
+ * @pcrat_image: Fill in VCRAT for GPU
+ * @size: [IN] allocated size of crat_image.
+ * [OUT] actual size of data filled in crat_image
+ */
+static int kfd_create_vcrat_image_gpu(void *pcrat_image,
+ size_t *size, struct kfd_dev *kdev,
+ uint32_t proximity_domain)
+{
+ struct crat_header *crat_table = (struct crat_header *)pcrat_image;
+ struct crat_subtype_generic *sub_type_hdr;
+ struct crat_subtype_computeunit *cu;
+ struct kfd_cu_info cu_info;
+ struct amd_iommu_device_info iommu_info;
+ int avail_size = *size;
+ uint32_t total_num_of_cu;
+ int num_of_cache_entries = 0;
+ int cache_mem_filled = 0;
+ int ret = 0;
+ const u32 required_iommu_flags = AMD_IOMMU_DEVICE_FLAG_ATS_SUP |
+ AMD_IOMMU_DEVICE_FLAG_PRI_SUP |
+ AMD_IOMMU_DEVICE_FLAG_PASID_SUP;
+ struct kfd_local_mem_info local_mem_info;
+
+ if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_GPU)
+ return -EINVAL;
+
+ /* Fill the CRAT Header.
+ * Modify length and total_entries as subunits are added.
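+ * Each subtype appended below advances sub_type_hdr by its own length,
+ * so entries stay packed back to back in the image.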
+ */ + avail_size -= sizeof(struct crat_header); + if (avail_size < 0) + return -ENOMEM; + + memset(crat_table, 0, sizeof(struct crat_header)); + + memcpy(&crat_table->signature, CRAT_SIGNATURE, + sizeof(crat_table->signature)); + /* Change length as we add more subtypes*/ + crat_table->length = sizeof(struct crat_header); + crat_table->num_domains = 1; + crat_table->total_entries = 0; + + /* Fill in Subtype: Compute Unit + * First fill in the sub type header and then sub type data + */ + avail_size -= sizeof(struct crat_subtype_computeunit); + if (avail_size < 0) + return -ENOMEM; + + sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1); + memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); + + sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY; + sub_type_hdr->length = sizeof(struct crat_subtype_computeunit); + sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; + + /* Fill CU subtype data */ + cu = (struct crat_subtype_computeunit *)sub_type_hdr; + cu->flags |= CRAT_CU_FLAGS_GPU_PRESENT; + cu->proximity_domain = proximity_domain; + + kdev->kfd2kgd->get_cu_info(kdev->kgd, &cu_info); + cu->num_simd_per_cu = cu_info.simd_per_cu; + cu->num_simd_cores = cu_info.simd_per_cu * cu_info.cu_active_number; + cu->max_waves_simd = cu_info.max_waves_per_simd; + + cu->wave_front_size = cu_info.wave_front_size; + cu->array_count = cu_info.num_shader_arrays_per_engine * + cu_info.num_shader_engines; + total_num_of_cu = (cu->array_count * cu_info.num_cu_per_sh); + cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu); + cu->num_cu_per_array = cu_info.num_cu_per_sh; + cu->max_slots_scatch_cu = cu_info.max_scratch_slots_per_cu; + cu->num_banks = cu_info.num_shader_engines; + cu->lds_size_in_kb = cu_info.lds_size; + + cu->hsa_capability = 0; + + /* Check if this node supports IOMMU. During parsing this flag will + * translate to HSA_CAP_ATS_PRESENT + */ + iommu_info.flags = 0; + if (amd_iommu_device_info(kdev->pdev, &iommu_info) == 0) { + if ((iommu_info.flags & required_iommu_flags) == + required_iommu_flags) + cu->hsa_capability |= CRAT_CU_FLAGS_IOMMU_PRESENT; + } + + crat_table->length += sub_type_hdr->length; + crat_table->total_entries++; + + /* Fill in Subtype: Memory. Only on systems with large BAR (no + * private FB), report memory as public. On other systems + * report the total FB size (public+private) as a single + * private heap. + */ + kdev->kfd2kgd->get_local_mem_info(kdev->kgd, &local_mem_info); + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length); + + if (local_mem_info.local_mem_size_private == 0) + ret = kfd_fill_gpu_memory_affinity(&avail_size, + kdev, HSA_MEM_HEAP_TYPE_FB_PUBLIC, + local_mem_info.local_mem_size_public, + (struct crat_subtype_memory *)sub_type_hdr, + proximity_domain, + &local_mem_info); + else + ret = kfd_fill_gpu_memory_affinity(&avail_size, + kdev, HSA_MEM_HEAP_TYPE_FB_PRIVATE, + local_mem_info.local_mem_size_public + + local_mem_info.local_mem_size_private, + (struct crat_subtype_memory *)sub_type_hdr, + proximity_domain, + &local_mem_info); + if (ret < 0) + return ret; + + crat_table->length += sizeof(struct crat_subtype_memory); + crat_table->total_entries++; + + /* TODO: Fill in cache information. 
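The CU subtype arithmetic above is dense, so here is the same computation with invented numbers for a hypothetical 4-shader-engine part (the counts are not from any real ASIC, and the function name is made up):

#include <linux/kernel.h>

static void cu_topology_example(void)
{
	/* hypothetical kfd_cu_info contents */
	uint32_t num_shader_engines = 4;
	uint32_t num_shader_arrays_per_engine = 1;
	uint32_t num_cu_per_sh = 16;

	uint32_t array_count = num_shader_arrays_per_engine *
			       num_shader_engines;		/* 4 arrays */
	uint32_t total_num_of_cu = array_count * num_cu_per_sh;	/* 64 CUs */

	/* get_and_inc_gpu_processor_id(64) would then reserve 64
	 * consecutive processor IDs starting at cu->processor_id_low */
	pr_debug("total CUs: %u\n", total_num_of_cu);
}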
This information is NOT readily
+	 * available in KGD
+	 */
+	sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
+			sub_type_hdr->length);
+	ret = kfd_fill_gpu_cache_info(kdev, cu->processor_id_low,
+				avail_size,
+				&cu_info,
+				(struct crat_subtype_cache *)sub_type_hdr,
+				&cache_mem_filled,
+				&num_of_cache_entries);
+
+	if (ret < 0)
+		return ret;
+
+	crat_table->length += cache_mem_filled;
+	crat_table->total_entries += num_of_cache_entries;
+	avail_size -= cache_mem_filled;
+
+	/* Fill in Subtype: IO_LINKS
+	 * Only direct links are added here, i.e. the link from the GPU
+	 * to its NUMA node. Indirect links are added by userspace.
+	 */
+	sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
+			cache_mem_filled);
+	ret = kfd_fill_gpu_direct_io_link(&avail_size, kdev,
+		(struct crat_subtype_iolink *)sub_type_hdr, proximity_domain);
+
+	if (ret < 0)
+		return ret;
+
+	crat_table->length += sub_type_hdr->length;
+	crat_table->total_entries++;
+
+	*size = crat_table->length;
+	pr_info("Virtual CRAT table created for GPU\n");
+
+	return ret;
+}
+
+/* kfd_create_crat_image_virtual - Allocates memory for CRAT image and
+ * creates a Virtual CRAT (VCRAT) image
+ *
+ * NOTE: Call kfd_destroy_crat_image to free CRAT image memory
+ *
+ * @crat_image: VCRAT image created because ACPI does not have a
+ *		CRAT for this device
+ * @size: [OUT] size of virtual crat_image
+ * @flags:	COMPUTE_UNIT_CPU - Create VCRAT for CPU device
+ *		COMPUTE_UNIT_GPU - Create VCRAT for GPU
+ *		(COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU) - Create VCRAT for APU
+ *		-- this option is not currently implemented.
+ *		The assumption is that all AMD APUs will have CRAT
+ * @kdev: Valid kfd_device required if flags contain COMPUTE_UNIT_GPU
+ *
+ * Return 0 if successful else return -ve value
+ */
+int kfd_create_crat_image_virtual(void **crat_image, size_t *size,
+				  int flags, struct kfd_dev *kdev,
+				  uint32_t proximity_domain)
+{
+	void *pcrat_image = NULL;
+	int ret = 0;
+
+	if (!crat_image)
+		return -EINVAL;
+
+	*crat_image = NULL;
+
+	/* Allocate one VCRAT_SIZE_FOR_CPU for CPU virtual CRAT image and
+	 * VCRAT_SIZE_FOR_GPU for GPU virtual CRAT image. This should cover
+	 * all the current conditions. A check prevents writing beyond the
+	 * allocated size.
+	 */
+	switch (flags) {
+	case COMPUTE_UNIT_CPU:
+		pcrat_image = kmalloc(VCRAT_SIZE_FOR_CPU, GFP_KERNEL);
+		if (!pcrat_image)
+			return -ENOMEM;
+		*size = VCRAT_SIZE_FOR_CPU;
+		ret = kfd_create_vcrat_image_cpu(pcrat_image, size);
+		break;
+	case COMPUTE_UNIT_GPU:
+		if (!kdev)
+			return -EINVAL;
+		pcrat_image = kmalloc(VCRAT_SIZE_FOR_GPU, GFP_KERNEL);
+		if (!pcrat_image)
+			return -ENOMEM;
+		*size = VCRAT_SIZE_FOR_GPU;
+		ret = kfd_create_vcrat_image_gpu(pcrat_image, size, kdev,
+						 proximity_domain);
+		break;
+	case (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU):
+		/* TODO: */
+		ret = -EINVAL;
+		pr_err("VCRAT not implemented for APU\n");
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+	if (!ret)
+		*crat_image = pcrat_image;
+	else
+		kfree(pcrat_image);
+
+	return ret;
+}
+
+
+/* kfd_destroy_crat_image
+ *
+ * @crat_image: [IN] - crat_image from kfd_create_crat_image_xxx(..)
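Taken together with the declarations added to kfd_crat.h further below, a typical caller sequence looks roughly like the following hedged sketch; the wrapper name is invented and error handling is trimmed:

/* Sketch: build a CPU VCRAT, parse it into a topology device list at
 * proximity domain 0, then release the image.
 */
static int vcrat_cpu_example(struct list_head *device_list)
{
	void *crat_image;
	size_t image_size;
	int ret;

	ret = kfd_create_crat_image_virtual(&crat_image, &image_size,
					    COMPUTE_UNIT_CPU, NULL, 0);
	if (ret)
		return ret;

	ret = kfd_parse_crat_table(crat_image, device_list, 0);
	kfd_destroy_crat_image(crat_image);
	return ret;
}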
+ * + */ +void kfd_destroy_crat_image(void *crat_image) +{ + kfree(crat_image); +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h index a374fa3d3ee6..b5cd182b9edd 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h @@ -44,6 +44,10 @@ #define CRAT_OEMID_64BIT_MASK ((1ULL << (CRAT_OEMID_LENGTH * 8)) - 1) +/* Compute Unit flags */ +#define COMPUTE_UNIT_CPU (1 << 0) /* Create Virtual CRAT for CPU */ +#define COMPUTE_UNIT_GPU (1 << 1) /* Create Virtual CRAT for GPU */ + struct crat_header { uint32_t signature; uint32_t length; @@ -105,7 +109,7 @@ struct crat_subtype_computeunit { uint8_t wave_front_size; uint8_t num_banks; uint16_t micro_engine_id; - uint8_t num_arrays; + uint8_t array_count; uint8_t num_cu_per_array; uint8_t num_simd_per_cu; uint8_t max_slots_scatch_cu; @@ -127,13 +131,14 @@ struct crat_subtype_memory { uint8_t length; uint16_t reserved; uint32_t flags; - uint32_t promixity_domain; + uint32_t proximity_domain; uint32_t base_addr_low; uint32_t base_addr_high; uint32_t length_low; uint32_t length_high; uint32_t width; - uint8_t reserved2[CRAT_MEMORY_RESERVED_LENGTH]; + uint8_t visibility_type; /* for virtual (dGPU) CRAT */ + uint8_t reserved2[CRAT_MEMORY_RESERVED_LENGTH - 1]; }; /* @@ -222,9 +227,12 @@ struct crat_subtype_ccompute { /* * HSA IO Link Affinity structure and definitions */ -#define CRAT_IOLINK_FLAGS_ENABLED 0x00000001 -#define CRAT_IOLINK_FLAGS_COHERENCY 0x00000002 -#define CRAT_IOLINK_FLAGS_RESERVED 0xfffffffc +#define CRAT_IOLINK_FLAGS_ENABLED (1 << 0) +#define CRAT_IOLINK_FLAGS_NON_COHERENT (1 << 1) +#define CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT (1 << 2) +#define CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT (1 << 3) +#define CRAT_IOLINK_FLAGS_NO_PEER_TO_PEER_DMA (1 << 4) +#define CRAT_IOLINK_FLAGS_RESERVED_MASK 0xffffffe0 /* * IO interface types @@ -232,10 +240,18 @@ struct crat_subtype_ccompute { #define CRAT_IOLINK_TYPE_UNDEFINED 0 #define CRAT_IOLINK_TYPE_HYPERTRANSPORT 1 #define CRAT_IOLINK_TYPE_PCIEXPRESS 2 -#define CRAT_IOLINK_TYPE_OTHER 3 +#define CRAT_IOLINK_TYPE_AMBA 3 +#define CRAT_IOLINK_TYPE_MIPI 4 +#define CRAT_IOLINK_TYPE_QPI_1_1 5 +#define CRAT_IOLINK_TYPE_RESERVED1 6 +#define CRAT_IOLINK_TYPE_RESERVED2 7 +#define CRAT_IOLINK_TYPE_RAPID_IO 8 +#define CRAT_IOLINK_TYPE_INFINIBAND 9 +#define CRAT_IOLINK_TYPE_RESERVED3 10 +#define CRAT_IOLINK_TYPE_OTHER 11 #define CRAT_IOLINK_TYPE_MAX 255 -#define CRAT_IOLINK_RESERVED_LENGTH 24 +#define CRAT_IOLINK_RESERVED_LENGTH 24 struct crat_subtype_iolink { uint8_t type; @@ -291,4 +307,14 @@ struct cdit_header { #pragma pack() +struct kfd_dev; + +int kfd_create_crat_image_acpi(void **crat_image, size_t *size); +void kfd_destroy_crat_image(void *crat_image); +int kfd_parse_crat_table(void *crat_image, struct list_head *device_list, + uint32_t proximity_domain); +int kfd_create_crat_image_virtual(void **crat_image, size_t *size, + int flags, struct kfd_dev *kdev, + uint32_t proximity_domain); + #endif /* KFD_CRAT_H_INCLUDED */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c index c407f6bd9956..afb26f205d29 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c @@ -95,7 +95,7 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, ib_packet->bitfields3.ib_base_hi = largep->u.high_part; ib_packet->control = (1 << 23) | (1 << 31) | - ((size_in_bytes / sizeof(uint32_t)) & 0xfffff); + ((size_in_bytes / 4) & 0xfffff); ib_packet->bitfields5.pasid = 
pasid; @@ -126,8 +126,7 @@ static int dbgdev_diq_submit_ib(struct kfd_dbgdev *dbgdev, rm_packet->header.opcode = IT_RELEASE_MEM; rm_packet->header.type = PM4_TYPE_3; - rm_packet->header.count = sizeof(struct pm4__release_mem) / - sizeof(unsigned int) - 2; + rm_packet->header.count = sizeof(struct pm4__release_mem) / 4 - 2; rm_packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT; rm_packet->bitfields2.event_index = @@ -652,8 +651,7 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, packets_vec[0].header.opcode = IT_SET_UCONFIG_REG; packets_vec[0].header.type = PM4_TYPE_3; packets_vec[0].bitfields2.reg_offset = - GRBM_GFX_INDEX / (sizeof(uint32_t)) - - USERCONFIG_REG_BASE; + GRBM_GFX_INDEX / 4 - USERCONFIG_REG_BASE; packets_vec[0].bitfields2.insert_vmid = 0; packets_vec[0].reg_data[0] = reg_gfx_index.u32All; @@ -661,8 +659,7 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, packets_vec[1].header.count = 1; packets_vec[1].header.opcode = IT_SET_CONFIG_REG; packets_vec[1].header.type = PM4_TYPE_3; - packets_vec[1].bitfields2.reg_offset = SQ_CMD / (sizeof(uint32_t)) - - AMD_CONFIG_REG_BASE; + packets_vec[1].bitfields2.reg_offset = SQ_CMD / 4 - AMD_CONFIG_REG_BASE; packets_vec[1].bitfields2.vmid_shift = SQ_CMD_VMID_OFFSET; packets_vec[1].bitfields2.insert_vmid = 1; @@ -678,8 +675,7 @@ static int dbgdev_wave_control_diq(struct kfd_dbgdev *dbgdev, packets_vec[2].ordinal1 = packets_vec[0].ordinal1; packets_vec[2].bitfields2.reg_offset = - GRBM_GFX_INDEX / (sizeof(uint32_t)) - - USERCONFIG_REG_BASE; + GRBM_GFX_INDEX / 4 - USERCONFIG_REG_BASE; packets_vec[2].bitfields2.insert_vmid = 0; packets_vec[2].reg_data[0] = reg_gfx_index.u32All; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c new file mode 100644 index 000000000000..4bd6ebfaf425 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debugfs.c @@ -0,0 +1,75 @@ +/* + * Copyright 2016-2017 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
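The `sizeof(uint32_t)` to `4` rewrites in kfd_dbgdev.c above all encode the same PM4 rule: a type-3 packet header's count field holds the packet length in dwords minus 2. A worked example, with an assumed packet size:

/* If a packet is, say, 32 bytes: 32 / 4 = 8 dwords, so count = 8 - 2 = 6. */
static inline unsigned int pm4_type3_count(size_t packet_size_bytes)
{
	return packet_size_bytes / 4 - 2;
}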
+ */ + +#include <linux/debugfs.h> +#include "kfd_priv.h" + +static struct dentry *debugfs_root; + +static int kfd_debugfs_open(struct inode *inode, struct file *file) +{ + int (*show)(struct seq_file *, void *) = inode->i_private; + + return single_open(file, show, NULL); +} + +static const struct file_operations kfd_debugfs_fops = { + .owner = THIS_MODULE, + .open = kfd_debugfs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +void kfd_debugfs_init(void) +{ + struct dentry *ent; + + debugfs_root = debugfs_create_dir("kfd", NULL); + if (!debugfs_root || debugfs_root == ERR_PTR(-ENODEV)) { + pr_warn("Failed to create kfd debugfs dir\n"); + return; + } + + ent = debugfs_create_file("mqds", S_IFREG | 0444, debugfs_root, + kfd_debugfs_mqds_by_process, + &kfd_debugfs_fops); + if (!ent) + pr_warn("Failed to create mqds in kfd debugfs\n"); + + ent = debugfs_create_file("hqds", S_IFREG | 0444, debugfs_root, + kfd_debugfs_hqds_by_device, + &kfd_debugfs_fops); + if (!ent) + pr_warn("Failed to create hqds in kfd debugfs\n"); + + ent = debugfs_create_file("rls", S_IFREG | 0444, debugfs_root, + kfd_debugfs_rls_by_device, + &kfd_debugfs_fops); + if (!ent) + pr_warn("Failed to create rls in kfd debugfs\n"); +} + +void kfd_debugfs_fini(void) +{ + debugfs_remove_recursive(debugfs_root); +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 621a3b53a038..a8fa33a08de3 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -27,6 +27,7 @@ #include "kfd_priv.h" #include "kfd_device_queue_manager.h" #include "kfd_pm4_headers_vi.h" +#include "cwsr_trap_handler_gfx8.asm" #define MQD_SIZE_ALIGNED 768 @@ -38,7 +39,8 @@ static const struct kfd_device_info kaveri_device_info = { .ih_ring_entry_size = 4 * sizeof(uint32_t), .event_interrupt_class = &event_interrupt_class_cik, .num_of_watch_points = 4, - .mqd_size_aligned = MQD_SIZE_ALIGNED + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = false, }; static const struct kfd_device_info carrizo_device_info = { @@ -49,7 +51,8 @@ static const struct kfd_device_info carrizo_device_info = { .ih_ring_entry_size = 4 * sizeof(uint32_t), .event_interrupt_class = &event_interrupt_class_cik, .num_of_watch_points = 4, - .mqd_size_aligned = MQD_SIZE_ALIGNED + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .supports_cwsr = true, }; struct kfd_deviceid { @@ -212,6 +215,17 @@ static int iommu_invalid_ppr_cb(struct pci_dev *pdev, int pasid, return AMD_IOMMU_INV_PRI_RSP_INVALID; } +static void kfd_cwsr_init(struct kfd_dev *kfd) +{ + if (cwsr_enable && kfd->device_info->supports_cwsr) { + BUILD_BUG_ON(sizeof(cwsr_trap_gfx8_hex) > PAGE_SIZE); + + kfd->cwsr_isa = cwsr_trap_gfx8_hex; + kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx8_hex); + kfd->cwsr_enabled = true; + } +} + bool kgd2kfd_device_init(struct kfd_dev *kfd, const struct kgd2kfd_shared_resources *gpu_resources) { @@ -224,6 +238,17 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, kfd->vm_info.vmid_num_kfd = kfd->vm_info.last_vmid_kfd - kfd->vm_info.first_vmid_kfd + 1; + /* Verify module parameters regarding mapped process number*/ + if ((hws_max_conc_proc < 0) + || (hws_max_conc_proc > kfd->vm_info.vmid_num_kfd)) { + dev_err(kfd_device, + "hws_max_conc_proc %d must be between 0 and %d, use %d instead\n", + hws_max_conc_proc, kfd->vm_info.vmid_num_kfd, + kfd->vm_info.vmid_num_kfd); + kfd->max_proc_per_quantum = kfd->vm_info.vmid_num_kfd; + } else + kfd->max_proc_per_quantum = hws_max_conc_proc; + /* 
calculate max size of mqds needed for queues */ size = max_num_of_queues_per_device * kfd->device_info->mqd_size_aligned; @@ -286,6 +311,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, goto device_iommu_pasid_error; } + kfd_cwsr_init(kfd); + if (kfd_resume(kfd)) goto kfd_resume_error; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index e202921c150e..d0693fd8cbf8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -149,8 +149,7 @@ static void deallocate_vmid(struct device_queue_manager *dqm, static int create_queue_nocpsch(struct device_queue_manager *dqm, struct queue *q, - struct qcm_process_device *qpd, - int *allocated_vmid) + struct qcm_process_device *qpd) { int retval; @@ -170,9 +169,11 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, if (retval) goto out_unlock; } - *allocated_vmid = qpd->vmid; q->properties.vmid = qpd->vmid; + q->properties.tba_addr = qpd->tba_addr; + q->properties.tma_addr = qpd->tma_addr; + if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) retval = create_compute_queue_nocpsch(dqm, q, qpd); else if (q->properties.type == KFD_QUEUE_TYPE_SDMA) @@ -181,10 +182,8 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm, retval = -EINVAL; if (retval) { - if (list_empty(&qpd->queues_list)) { + if (list_empty(&qpd->queues_list)) deallocate_vmid(dqm, qpd, q); - *allocated_vmid = 0; - } goto out_unlock; } @@ -809,16 +808,13 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm, } static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, - struct qcm_process_device *qpd, int *allocate_vmid) + struct qcm_process_device *qpd) { int retval; struct mqd_manager *mqd; retval = 0; - if (allocate_vmid) - *allocate_vmid = 0; - mutex_lock(&dqm->lock); if (dqm->total_queue_count >= max_num_of_queues_per_device) { @@ -846,6 +842,9 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, } dqm->asic_ops.init_sdma_vm(dqm, q, qpd); + + q->properties.tba_addr = qpd->tba_addr; + q->properties.tma_addr = qpd->tma_addr; retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj, &q->gart_mqd_addr, &q->properties); if (retval) @@ -1110,6 +1109,26 @@ out: return retval; } +static int set_trap_handler(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + uint64_t tba_addr, + uint64_t tma_addr) +{ + uint64_t *tma; + + if (dqm->dev->cwsr_enabled) { + /* Jump from CWSR trap handler to user trap */ + tma = (uint64_t *)(qpd->cwsr_kaddr + KFD_CWSR_TMA_OFFSET); + tma[0] = tba_addr; + tma[1] = tma_addr; + } else { + qpd->tba_addr = tba_addr; + qpd->tma_addr = tma_addr; + } + + return 0; +} + static int process_termination_nocpsch(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { @@ -1241,6 +1260,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) dqm->ops.create_kernel_queue = create_kernel_queue_cpsch; dqm->ops.destroy_kernel_queue = destroy_kernel_queue_cpsch; dqm->ops.set_cache_memory_policy = set_cache_memory_policy; + dqm->ops.set_trap_handler = set_trap_handler; dqm->ops.process_termination = process_termination_cpsch; break; case KFD_SCHED_POLICY_NO_HWS: @@ -1256,6 +1276,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) dqm->ops.initialize = initialize_nocpsch; dqm->ops.uninitialize = uninitialize; dqm->ops.set_cache_memory_policy = 
set_cache_memory_policy; + dqm->ops.set_trap_handler = set_trap_handler; dqm->ops.process_termination = process_termination_nocpsch; break; default: @@ -1290,3 +1311,74 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm) dqm->ops.uninitialize(dqm); kfree(dqm); } + +#if defined(CONFIG_DEBUG_FS) + +static void seq_reg_dump(struct seq_file *m, + uint32_t (*dump)[2], uint32_t n_regs) +{ + uint32_t i, count; + + for (i = 0, count = 0; i < n_regs; i++) { + if (count == 0 || + dump[i-1][0] + sizeof(uint32_t) != dump[i][0]) { + seq_printf(m, "%s %08x: %08x", + i ? "\n" : "", + dump[i][0], dump[i][1]); + count = 7; + } else { + seq_printf(m, " %08x", dump[i][1]); + count--; + } + } + + seq_puts(m, "\n"); +} + +int dqm_debugfs_hqds(struct seq_file *m, void *data) +{ + struct device_queue_manager *dqm = data; + uint32_t (*dump)[2], n_regs; + int pipe, queue; + int r = 0; + + for (pipe = 0; pipe < get_pipes_per_mec(dqm); pipe++) { + int pipe_offset = pipe * get_queues_per_pipe(dqm); + + for (queue = 0; queue < get_queues_per_pipe(dqm); queue++) { + if (!test_bit(pipe_offset + queue, + dqm->dev->shared_resources.queue_bitmap)) + continue; + + r = dqm->dev->kfd2kgd->hqd_dump( + dqm->dev->kgd, pipe, queue, &dump, &n_regs); + if (r) + break; + + seq_printf(m, " CP Pipe %d, Queue %d\n", + pipe, queue); + seq_reg_dump(m, dump, n_regs); + + kfree(dump); + } + } + + for (pipe = 0; pipe < CIK_SDMA_ENGINE_NUM; pipe++) { + for (queue = 0; queue < CIK_SDMA_QUEUES_PER_ENGINE; queue++) { + r = dqm->dev->kfd2kgd->hqd_sdma_dump( + dqm->dev->kgd, pipe, queue, &dump, &n_regs); + if (r) + break; + + seq_printf(m, " SDMA Engine %d, RLC %d\n", + pipe, queue); + seq_reg_dump(m, dump, n_regs); + + kfree(dump); + } + } + + return r; +} + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h index 5b77cb69f732..c61b693bfa8c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -84,8 +84,7 @@ struct device_process_node { struct device_queue_manager_ops { int (*create_queue)(struct device_queue_manager *dqm, struct queue *q, - struct qcm_process_device *qpd, - int *allocate_vmid); + struct qcm_process_device *qpd); int (*destroy_queue)(struct device_queue_manager *dqm, struct qcm_process_device *qpd, @@ -123,6 +122,11 @@ struct device_queue_manager_ops { void __user *alternate_aperture_base, uint64_t alternate_aperture_size); + int (*set_trap_handler)(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + uint64_t tba_addr, + uint64_t tma_addr); + int (*process_termination)(struct device_queue_manager *dqm, struct qcm_process_device *qpd); }; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c index feb76c235b1a..ebb4da14e3df 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c @@ -116,8 +116,7 @@ int kfd_doorbell_init(struct kfd_dev *kfd) pr_debug("doorbell aperture size == 0x%08lX\n", kfd->shared_resources.doorbell_aperture_size); - pr_debug("doorbell kernel address == 0x%08lX\n", - (uintptr_t)kfd->doorbell_kernel_ptr); + pr_debug("doorbell kernel address == %p\n", kfd->doorbell_kernel_ptr); return 0; } @@ -194,8 +193,8 @@ u32 __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd, pr_debug("Get kernel queue doorbell\n" " doorbell offset == 0x%08X\n" - " kernel address == 0x%08lX\n", - *doorbell_off, (uintptr_t)(kfd->doorbell_kernel_ptr + inx)); + " 
kernel address == %p\n", + *doorbell_off, (kfd->doorbell_kernel_ptr + inx)); return kfd->doorbell_kernel_ptr + inx; } @@ -215,7 +214,7 @@ inline void write_kernel_doorbell(u32 __iomem *db, u32 value) { if (db) { writel(value, db); - pr_debug("Writing %d to doorbell address 0x%p\n", value, db); + pr_debug("Writing %d to doorbell address %p\n", value, db); } } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index cb92d4b72400..93aae5c1e78b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -441,7 +441,7 @@ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, /* * Because we are called from arbitrary context (workqueue) as opposed * to process context, kfd_process could attempt to exit while we are - * running so the lookup function returns a locked process. + * running so the lookup function increments the process ref count. */ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); @@ -493,7 +493,7 @@ void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id, } mutex_unlock(&p->event_mutex); - mutex_unlock(&p->mutex); + kfd_unref_process(p); } static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events) @@ -847,7 +847,7 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, /* * Because we are called from arbitrary context (workqueue) as opposed * to process context, kfd_process could attempt to exit while we are - * running so the lookup function returns a locked process. + * running so the lookup function increments the process ref count. */ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); struct mm_struct *mm; @@ -860,7 +860,7 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, */ mm = get_task_mm(p->lead_thread); if (!mm) { - mutex_unlock(&p->mutex); + kfd_unref_process(p); return; /* Process is exiting */ } @@ -903,7 +903,7 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, &memory_exception_data); mutex_unlock(&p->event_mutex); - mutex_unlock(&p->mutex); + kfd_unref_process(p); } void kfd_signal_hw_exception_event(unsigned int pasid) @@ -911,7 +911,7 @@ void kfd_signal_hw_exception_event(unsigned int pasid) /* * Because we are called from arbitrary context (workqueue) as opposed * to process context, kfd_process could attempt to exit while we are - * running so the lookup function returns a locked process. + * running so the lookup function increments the process ref count. 
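The three comment updates in kfd_events.c above all describe the same new contract, sketched here from the caller's side (the caller function is hypothetical; the lookup and unref functions are the ones introduced by this series):

/* kfd_lookup_process_by_pasid() now hands back a counted reference
 * rather than a locked process, so an exiting process stays valid
 * while a workqueue-context caller uses it.
 */
static void signal_something(unsigned int pasid)
{
	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);

	if (!p)
		return;	/* process is already gone */

	/* ... take p->event_mutex, deliver the event, drop it ... */

	kfd_unref_process(p);	/* pairs with the kref_get() in lookup */
}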
*/ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); @@ -924,5 +924,5 @@ void kfd_signal_hw_exception_event(unsigned int pasid) lookup_events_by_type_and_signal(p, KFD_EVENT_TYPE_HW_EXCEPTION, NULL); mutex_unlock(&p->event_mutex); - mutex_unlock(&p->mutex); + kfd_unref_process(p); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c index c59384bbbc5f..7377513050e6 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c @@ -300,9 +300,14 @@ int kfd_init_apertures(struct kfd_process *process) struct kfd_process_device *pdd; /*Iterating over all devices*/ - while ((dev = kfd_topology_enum_kfd_devices(id)) != NULL && + while (kfd_topology_enum_kfd_devices(id, &dev) == 0 && id < NUM_OF_SUPPORTED_GPUS) { + if (!dev) { + id++; /* Skip non GPU devices */ + continue; + } + pdd = kfd_create_process_device_data(dev, process); if (!pdd) { pr_err("Failed to create process device data\n"); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c index 8b0c0645d7c0..5dc6567d4a13 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c @@ -218,7 +218,7 @@ static int acquire_packet_buffer(struct kernel_queue *kq, rptr = *kq->rptr_kernel; wptr = *kq->wptr_kernel; queue_address = (unsigned int *)kq->pq_kernel_addr; - queue_size_dwords = kq->queue->properties.queue_size / sizeof(uint32_t); + queue_size_dwords = kq->queue->properties.queue_size / 4; pr_debug("rptr: %d\n", rptr); pr_debug("wptr: %d\n", wptr); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c index f744caeaee04..3ac72bed4f31 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c @@ -50,6 +50,15 @@ module_param(sched_policy, int, 0444); MODULE_PARM_DESC(sched_policy, "Scheduling policy (0 = HWS (Default), 1 = HWS without over-subscription, 2 = Non-HWS (Used for debugging only)"); +int hws_max_conc_proc = 8; +module_param(hws_max_conc_proc, int, 0444); +MODULE_PARM_DESC(hws_max_conc_proc, + "Max # processes HWS can execute concurrently when sched_policy=0 (0 = no concurrency, #VMIDs for KFD = Maximum(default))"); + +int cwsr_enable = 1; +module_param(cwsr_enable, int, 0444); +MODULE_PARM_DESC(cwsr_enable, "CWSR enable (0 = Off, 1 = On (Default))"); + int max_num_of_queues_per_device = KFD_MAX_NUM_OF_QUEUES_PER_DEVICE_DEFAULT; module_param(max_num_of_queues_per_device, int, 0444); MODULE_PARM_DESC(max_num_of_queues_per_device, @@ -60,6 +69,11 @@ module_param(send_sigterm, int, 0444); MODULE_PARM_DESC(send_sigterm, "Send sigterm to HSA process on unhandled exception (0 = disable, 1 = enable)"); +int ignore_crat; +module_param(ignore_crat, int, 0444); +MODULE_PARM_DESC(ignore_crat, + "Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)"); + static int amdkfd_init_completed; int kgd2kfd_init(unsigned int interface_version, @@ -114,6 +128,8 @@ static int __init kfd_module_init(void) kfd_process_create_wq(); + kfd_debugfs_init(); + amdkfd_init_completed = 1; dev_info(kfd_device, "Initialized module\n"); @@ -130,6 +146,7 @@ static void __exit kfd_module_exit(void) { amdkfd_init_completed = 0; + kfd_debugfs_fini(); kfd_process_destroy_wq(); kfd_topology_shutdown(); kfd_chardev_exit(); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h index 
1f3a6ba7eed2..8972bcfbf701 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h @@ -85,6 +85,10 @@ struct mqd_manager { uint64_t queue_address, uint32_t pipe_id, uint32_t queue_id); +#if defined(CONFIG_DEBUG_FS) + int (*debugfs_show_mqd)(struct seq_file *m, void *data); +#endif + struct mutex mqd_mutex; struct kfd_dev *dev; }; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c index 4728fad3fd74..f8ef4a051e08 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c @@ -36,6 +36,11 @@ static inline struct cik_mqd *get_mqd(void *mqd) return (struct cik_mqd *)mqd; } +static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) +{ + return (struct cik_sdma_rlc_registers *)mqd; +} + static int init_mqd(struct mqd_manager *mm, void **mqd, struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *q) @@ -149,7 +154,7 @@ static int load_mqd(struct mqd_manager *mm, void *mqd, uint32_t pipe_id, { /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */ uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0); - uint32_t wptr_mask = (uint32_t)((p->queue_size / sizeof(uint32_t)) - 1); + uint32_t wptr_mask = (uint32_t)((p->queue_size / 4) - 1); return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, (uint32_t __user *)p->write_ptr, @@ -160,7 +165,9 @@ static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, uint32_t pipe_id, uint32_t queue_id, struct queue_properties *p, struct mm_struct *mms) { - return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd); + return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd, + (uint32_t __user *)p->write_ptr, + mms); } static int update_mqd(struct mqd_manager *mm, void *mqd, @@ -176,8 +183,7 @@ static int update_mqd(struct mqd_manager *mm, void *mqd, * Calculating queue size which is log base 2 of actual queue size -1 * dwords and another -1 for ffs */ - m->cp_hqd_pq_control |= ffs(q->queue_size / sizeof(unsigned int)) - - 1 - 1; + m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1; m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); @@ -202,7 +208,7 @@ static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, struct cik_sdma_rlc_registers *m; m = get_sdma_mqd(mqd); - m->sdma_rlc_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1) + m->sdma_rlc_rb_cntl = order_base_2(q->queue_size / 4) << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | @@ -343,8 +349,7 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, * Calculating queue size which is log base 2 of actual queue * size -1 dwords */ - m->cp_hqd_pq_control |= ffs(q->queue_size / sizeof(unsigned int)) - - 1 - 1; + m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1; m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8); m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr); @@ -360,15 +365,25 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, return 0; } -struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd) -{ - struct cik_sdma_rlc_registers *m; +#if 
defined(CONFIG_DEBUG_FS) - m = (struct cik_sdma_rlc_registers *)mqd; +static int debugfs_show_mqd(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct cik_mqd), false); + return 0; +} - return m; +static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct cik_sdma_rlc_registers), false); + return 0; } +#endif + + struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, struct kfd_dev *dev) { @@ -392,6 +407,9 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, mqd->update_mqd = update_mqd; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif break; case KFD_MQD_TYPE_HIQ: mqd->init_mqd = init_mqd_hiq; @@ -400,6 +418,9 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, mqd->update_mqd = update_mqd_hiq; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif break; case KFD_MQD_TYPE_SDMA: mqd->init_mqd = init_mqd_sdma; @@ -408,6 +429,9 @@ struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, mqd->update_mqd = update_mqd_sdma; mqd->destroy_mqd = destroy_mqd_sdma; mqd->is_occupied = is_occupied_sdma; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; +#endif break; default: kfree(mqd); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c index 4ea854f9007b..971aec0637dc 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c @@ -30,7 +30,7 @@ #include "vi_structs.h" #include "gca/gfx_8_0_sh_mask.h" #include "gca/gfx_8_0_enum.h" - +#include "oss/oss_3_0_sh_mask.h" #define CP_MQD_CONTROL__PRIV_STATE__SHIFT 0x8 static inline struct vi_mqd *get_mqd(void *mqd) @@ -38,6 +38,11 @@ static inline struct vi_mqd *get_mqd(void *mqd) return (struct vi_mqd *)mqd; } +static inline struct vi_sdma_mqd *get_sdma_mqd(void *mqd) +{ + return (struct vi_sdma_mqd *)mqd; +} + static int init_mqd(struct mqd_manager *mm, void **mqd, struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, struct queue_properties *q) @@ -84,6 +89,28 @@ static int init_mqd(struct mqd_manager *mm, void **mqd, if (q->format == KFD_QUEUE_FORMAT_AQL) m->cp_hqd_iq_rptr = 1; + if (q->tba_addr) { + m->compute_tba_lo = lower_32_bits(q->tba_addr >> 8); + m->compute_tba_hi = upper_32_bits(q->tba_addr >> 8); + m->compute_tma_lo = lower_32_bits(q->tma_addr >> 8); + m->compute_tma_hi = upper_32_bits(q->tma_addr >> 8); + m->compute_pgm_rsrc2 |= + (1 << COMPUTE_PGM_RSRC2__TRAP_PRESENT__SHIFT); + } + + if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) { + m->cp_hqd_persistent_state |= + (1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT); + m->cp_hqd_ctx_save_base_addr_lo = + lower_32_bits(q->ctx_save_restore_area_address); + m->cp_hqd_ctx_save_base_addr_hi = + upper_32_bits(q->ctx_save_restore_area_address); + m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size; + m->cp_hqd_cntl_stack_size = q->ctl_stack_size; + m->cp_hqd_cntl_stack_offset = q->ctl_stack_size; + m->cp_hqd_wg_state_offset = q->ctl_stack_size; + } + *mqd = m; if (gart_addr) *gart_addr = addr; @@ -98,7 +125,7 @@ static int load_mqd(struct mqd_manager *mm, void *mqd, { /* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. 
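The trap-handler registers written in init_mqd() above take the address shifted right by 8, i.e. in 256-byte units, the same convention used for queue_address. A worked example with an invented, page-aligned address (function name also invented):

#include <linux/kernel.h>

static void tba_encoding_example(void)
{
	uint64_t tba_addr = 0x00007f1234560000ULL;	/* invented */
	uint32_t lo = lower_32_bits(tba_addr >> 8);	/* 0x12345600 */
	uint32_t hi = upper_32_bits(tba_addr >> 8);	/* 0x0000007f */

	/* ((u64)hi << 32 | lo) << 8 recovers tba_addr exactly because a
	 * page-aligned address has its low 8 bits clear. */
	pr_debug("tba %x:%x\n", hi, lo);
}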
*/ uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0); - uint32_t wptr_mask = (uint32_t)((p->queue_size / sizeof(uint32_t)) - 1); + uint32_t wptr_mask = (uint32_t)((p->queue_size / 4) - 1); return mm->dev->kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, (uint32_t __user *)p->write_ptr, @@ -116,8 +143,7 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT | atc_bit << CP_HQD_PQ_CONTROL__PQ_ATC__SHIFT | mtype << CP_HQD_PQ_CONTROL__MTYPE__SHIFT; - m->cp_hqd_pq_control |= - ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1; + m->cp_hqd_pq_control |= order_base_2(q->queue_size / 4) - 1; pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control); m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8); @@ -147,7 +173,7 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, * is safe, giving a maximum field value of 0xA. */ m->cp_hqd_eop_control |= min(0xA, - ffs(q->eop_ring_buffer_size / sizeof(unsigned int)) - 1 - 1); + order_base_2(q->eop_ring_buffer_size / 4) - 1); m->cp_hqd_eop_base_addr_lo = lower_32_bits(q->eop_ring_buffer_address >> 8); m->cp_hqd_eop_base_addr_hi = @@ -163,6 +189,11 @@ static int __update_mqd(struct mqd_manager *mm, void *mqd, 2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT; } + if (mm->dev->cwsr_enabled && q->ctx_save_restore_area_address) + m->cp_hqd_ctx_save_control = + atc_bit << CP_HQD_CTX_SAVE_CONTROL__ATC__SHIFT | + mtype << CP_HQD_CTX_SAVE_CONTROL__MTYPE__SHIFT; + q->is_active = (q->queue_size > 0 && q->queue_address != 0 && q->queue_percent > 0); @@ -234,6 +265,117 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd, return retval; } +static int init_mqd_sdma(struct mqd_manager *mm, void **mqd, + struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr, + struct queue_properties *q) +{ + int retval; + struct vi_sdma_mqd *m; + + + retval = kfd_gtt_sa_allocate(mm->dev, + sizeof(struct vi_sdma_mqd), + mqd_mem_obj); + + if (retval != 0) + return -ENOMEM; + + m = (struct vi_sdma_mqd *) (*mqd_mem_obj)->cpu_ptr; + + memset(m, 0, sizeof(struct vi_sdma_mqd)); + + *mqd = m; + if (gart_addr != NULL) + *gart_addr = (*mqd_mem_obj)->gpu_addr; + + retval = mm->update_mqd(mm, m, q); + + return retval; +} + +static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct kfd_mem_obj *mqd_mem_obj) +{ + kfd_gtt_sa_free(mm->dev, mqd_mem_obj); +} + +static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, + uint32_t pipe_id, uint32_t queue_id, + struct queue_properties *p, struct mm_struct *mms) +{ + return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd, + (uint32_t __user *)p->write_ptr, + mms); +} + +static int update_mqd_sdma(struct mqd_manager *mm, void *mqd, + struct queue_properties *q) +{ + struct vi_sdma_mqd *m; + + m = get_sdma_mqd(mqd); + m->sdmax_rlcx_rb_cntl = order_base_2(q->queue_size / 4) + << SDMA0_RLC0_RB_CNTL__RB_SIZE__SHIFT | + q->vmid << SDMA0_RLC0_RB_CNTL__RB_VMID__SHIFT | + 1 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | + 6 << SDMA0_RLC0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; + + m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8); + m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); + m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr); + m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr); + m->sdmax_rlcx_doorbell = + q->doorbell_off << SDMA0_RLC0_DOORBELL__OFFSET__SHIFT; + + m->sdmax_rlcx_virtual_addr = q->sdma_vm_addr; + + m->sdma_engine_id = 
q->sdma_engine_id; + m->sdma_queue_id = q->sdma_queue_id; + + q->is_active = (q->queue_size > 0 && + q->queue_address != 0 && + q->queue_percent > 0); + + return 0; +} + +/* + * * preempt type here is ignored because there is only one way + * * to preempt sdma queue + */ +static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd, + enum kfd_preempt_type type, + unsigned int timeout, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout); +} + +static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, + uint64_t queue_address, uint32_t pipe_id, + uint32_t queue_id) +{ + return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd); +} + +#if defined(CONFIG_DEBUG_FS) + +static int debugfs_show_mqd(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct vi_mqd), false); + return 0; +} + +static int debugfs_show_mqd_sdma(struct seq_file *m, void *data) +{ + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + data, sizeof(struct vi_sdma_mqd), false); + return 0; +} + +#endif + struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, struct kfd_dev *dev) { @@ -257,6 +399,9 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, mqd->update_mqd = update_mqd; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif break; case KFD_MQD_TYPE_HIQ: mqd->init_mqd = init_mqd_hiq; @@ -265,8 +410,20 @@ struct mqd_manager *mqd_manager_init_vi(enum KFD_MQD_TYPE type, mqd->update_mqd = update_mqd_hiq; mqd->destroy_mqd = destroy_mqd; mqd->is_occupied = is_occupied; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd; +#endif break; case KFD_MQD_TYPE_SDMA: + mqd->init_mqd = init_mqd_sdma; + mqd->uninit_mqd = uninit_mqd_sdma; + mqd->load_mqd = load_mqd_sdma; + mqd->update_mqd = update_mqd_sdma; + mqd->destroy_mqd = destroy_mqd_sdma; + mqd->is_occupied = is_occupied_sdma; +#if defined(CONFIG_DEBUG_FS) + mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; +#endif break; default: kfree(mqd); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c index 16da8ad02d8b..0ecbd1f9b606 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c @@ -45,7 +45,7 @@ static unsigned int build_pm4_header(unsigned int opcode, size_t packet_size) header.u32All = 0; header.opcode = opcode; - header.count = packet_size/sizeof(uint32_t) - 2; + header.count = packet_size / 4 - 2; header.type = PM4_TYPE_3; return header.u32All; @@ -55,15 +55,27 @@ static void pm_calc_rlib_size(struct packet_manager *pm, unsigned int *rlib_size, bool *over_subscription) { - unsigned int process_count, queue_count; + unsigned int process_count, queue_count, compute_queue_count; unsigned int map_queue_size; + unsigned int max_proc_per_quantum = 1; + struct kfd_dev *dev = pm->dqm->dev; process_count = pm->dqm->processes_count; queue_count = pm->dqm->queue_count; + compute_queue_count = queue_count - pm->dqm->sdma_queue_count; - /* check if there is over subscription*/ + /* check if there is over subscription + * Note: the arbitration between the number of VMIDs and + * hws_max_conc_proc has been done in + * kgd2kfd_device_init(). 
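Condensed, the test this hunk builds up (and completes just below) is the following; the standalone function is a sketch mirroring the code, not part of the patch:

/* A runlist is oversubscribed when more processes are mapped than one
 * scheduling quantum allows, or when compute queues (SDMA queues no
 * longer count) exceed the HQD slots available to the HWS.
 */
static bool runlist_oversubscribed(unsigned int process_count,
				   unsigned int compute_queue_count,
				   unsigned int max_proc_per_quantum,
				   unsigned int hw_queue_slots)
{
	return process_count > max_proc_per_quantum ||
	       compute_queue_count > hw_queue_slots;
}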
+ */ *over_subscription = false; - if ((process_count > 1) || queue_count > get_queues_num(pm->dqm)) { + + if (dev->max_proc_per_quantum > 1) + max_proc_per_quantum = dev->max_proc_per_quantum; + + if ((process_count > max_proc_per_quantum) || + compute_queue_count > get_queues_num(pm->dqm)) { *over_subscription = true; pr_debug("Over subscribed runlist\n"); } @@ -116,10 +128,24 @@ static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer, uint64_t ib, size_t ib_size_in_dwords, bool chain) { struct pm4_mes_runlist *packet; + int concurrent_proc_cnt = 0; + struct kfd_dev *kfd = pm->dqm->dev; if (WARN_ON(!ib)) return -EFAULT; + /* Determine the number of processes to map together to HW: + * it can not exceed the number of VMIDs available to the + * scheduler, and it is determined by the smaller of the number + * of processes in the runlist and kfd module parameter + * hws_max_conc_proc. + * Note: the arbitration between the number of VMIDs and + * hws_max_conc_proc has been done in + * kgd2kfd_device_init(). + */ + concurrent_proc_cnt = min(pm->dqm->processes_count, + kfd->max_proc_per_quantum); + packet = (struct pm4_mes_runlist *)buffer; memset(buffer, 0, sizeof(struct pm4_mes_runlist)); @@ -130,6 +156,7 @@ static int pm_create_runlist(struct packet_manager *pm, uint32_t *buffer, packet->bitfields4.chain = chain ? 1 : 0; packet->bitfields4.offload_polling = 0; packet->bitfields4.valid = 1; + packet->bitfields4.process_cnt = concurrent_proc_cnt; packet->ordinal2 = lower_32_bits(ib); packet->bitfields3.ib_base_hi = upper_32_bits(ib); @@ -251,6 +278,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm, return retval; *rl_size_bytes = alloc_size_bytes; + pm->ib_size_bytes = alloc_size_bytes; pr_debug("Building runlist ib process count: %d queues count %d\n", pm->dqm->processes_count, pm->dqm->queue_count); @@ -564,3 +592,26 @@ void pm_release_ib(struct packet_manager *pm) } mutex_unlock(&pm->lock); } + +#if defined(CONFIG_DEBUG_FS) + +int pm_debugfs_runlist(struct seq_file *m, void *data) +{ + struct packet_manager *pm = data; + + mutex_lock(&pm->lock); + + if (!pm->allocated) { + seq_puts(m, " No active runlist\n"); + goto out; + } + + seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4, + pm->ib_buffer_obj->cpu_ptr, pm->ib_size_bytes, false); + +out: + mutex_unlock(&pm->lock); + return 0; +} + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c index d6a796144269..15fff4420e53 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c @@ -59,7 +59,7 @@ unsigned int kfd_pasid_alloc(void) struct kfd_dev *dev = NULL; unsigned int i = 0; - while ((dev = kfd_topology_enum_kfd_devices(i)) != NULL) { + while ((kfd_topology_enum_kfd_devices(i, &dev)) == 0) { if (dev && dev->kfd2kgd) { kfd2kgd = dev->kfd2kgd; break; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 9e4134c5b481..6a48d29ada47 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -33,6 +33,8 @@ #include <linux/kfd_ioctl.h> #include <linux/idr.h> #include <linux/kfifo.h> +#include <linux/seq_file.h> +#include <linux/kref.h> #include <kgd_kfd_interface.h> #include "amd_shared.h" @@ -41,6 +43,7 @@ #define KFD_MMAP_DOORBELL_MASK 0x8000000000000 #define KFD_MMAP_EVENTS_MASK 0x4000000000000 +#define KFD_MMAP_RESERVED_MEM_MASK 0x2000000000000 /* * When working with cp scheduler we should assign the HIQ manually or via @@ -63,6 +66,15 @@ #define 
KFD_MAX_NUM_OF_QUEUES_PER_PROCESS 1024 /* + * Size of the per-process TBA+TMA buffer: 2 pages + * + * The first page is the TBA used for the CWSR ISA code. The second + * page is used as TMA for daisy chaining a user-mode trap handler. + */ +#define KFD_CWSR_TBA_TMA_SIZE (PAGE_SIZE * 2) +#define KFD_CWSR_TMA_OFFSET PAGE_SIZE + +/* * Kernel module parameter to specify maximum number of supported queues per * device */ @@ -79,11 +91,25 @@ extern int max_num_of_queues_per_device; extern int sched_policy; /* + * Kernel module parameter to specify the maximum process + * number per HW scheduler + */ +extern int hws_max_conc_proc; + +extern int cwsr_enable; + +/* * Kernel module parameter to specify whether to send sigterm to HSA process on * unhandled exception */ extern int send_sigterm; +/* + * Ignore CRAT table during KFD initialization, can be used to work around + * broken CRAT tables on some AMD systems + */ +extern int ignore_crat; + /** * enum kfd_sched_policy * @@ -131,6 +157,7 @@ struct kfd_device_info { size_t ih_ring_entry_size; uint8_t num_of_watch_points; uint16_t mqd_size_aligned; + bool supports_cwsr; }; struct kfd_mem_obj { @@ -200,6 +227,14 @@ struct kfd_dev { /* Debug manager */ struct kfd_dbgmgr *dbgmgr; + + /* Maximum process number mapped to HW scheduler */ + unsigned int max_proc_per_quantum; + + /* CWSR */ + bool cwsr_enabled; + const void *cwsr_isa; + unsigned int cwsr_isa_size; }; /* KGD2KFD callbacks */ @@ -332,6 +367,9 @@ struct queue_properties { uint32_t eop_ring_buffer_size; uint64_t ctx_save_restore_area_address; uint32_t ctx_save_restore_area_size; + uint32_t ctl_stack_size; + uint64_t tba_addr; + uint64_t tma_addr; }; /** @@ -439,6 +477,11 @@ struct qcm_process_device { uint32_t num_gws; uint32_t num_oac; uint32_t sh_hidden_private_base; + + /* CWSR memory */ + void *cwsr_kaddr; + uint64_t tba_addr; + uint64_t tma_addr; }; @@ -501,6 +544,9 @@ struct kfd_process { */ void *mm; + struct kref ref; + struct work_struct release_work; + struct mutex mutex; /* @@ -563,9 +609,10 @@ struct amdkfd_ioctl_desc { void kfd_process_create_wq(void); void kfd_process_destroy_wq(void); -struct kfd_process *kfd_create_process(const struct task_struct *); +struct kfd_process *kfd_create_process(struct file *filep); struct kfd_process *kfd_get_process(const struct task_struct *); struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid); +void kfd_unref_process(struct kfd_process *p); struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, struct kfd_process *p); @@ -577,6 +624,9 @@ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev, struct kfd_process *p); struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, struct kfd_process *p); +int kfd_reserved_mem_mmap(struct kfd_process *process, + struct vm_area_struct *vma); + /* Process device data iterator */ struct kfd_process_device *kfd_get_first_process_device_data( struct kfd_process *p); @@ -624,9 +674,12 @@ int kfd_topology_init(void); void kfd_topology_shutdown(void); int kfd_topology_add_device(struct kfd_dev *gpu); int kfd_topology_remove_device(struct kfd_dev *gpu); +struct kfd_topology_device *kfd_topology_device_by_proximity_domain( + uint32_t proximity_domain); struct kfd_dev *kfd_device_by_id(uint32_t gpu_id); struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev); -struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx); +int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev); +int kfd_numa_node_to_apic_id(int numa_node_id); /* Interrupts
*/ int kfd_interrupt_init(struct kfd_dev *dev); @@ -643,8 +696,6 @@ int kgd2kfd_resume(struct kfd_dev *kfd); int kfd_init_apertures(struct kfd_process *process); /* Queue Context Management */ -struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd); - int init_queue(struct queue **q, const struct queue_properties *properties); void uninit_queue(struct queue *q); void print_queue_properties(struct queue_properties *q); @@ -699,6 +750,7 @@ struct packet_manager { struct mutex lock; bool allocated; struct kfd_mem_obj *ib_buffer_obj; + unsigned int ib_size_bytes; }; int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm); @@ -745,4 +797,23 @@ int kfd_event_destroy(struct kfd_process *p, uint32_t event_id); int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p); +/* Debugfs */ +#if defined(CONFIG_DEBUG_FS) + +void kfd_debugfs_init(void); +void kfd_debugfs_fini(void); +int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data); +int pqm_debugfs_mqds(struct seq_file *m, void *data); +int kfd_debugfs_hqds_by_device(struct seq_file *m, void *data); +int dqm_debugfs_hqds(struct seq_file *m, void *data); +int kfd_debugfs_rls_by_device(struct seq_file *m, void *data); +int pm_debugfs_runlist(struct seq_file *m, void *data); + +#else + +static inline void kfd_debugfs_init(void) {} +static inline void kfd_debugfs_fini(void) {} + +#endif + #endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 1f5ccd28bd41..a22fb0710f15 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -24,10 +24,12 @@ #include <linux/log2.h> #include <linux/sched.h> #include <linux/sched/mm.h> +#include <linux/sched/task.h> #include <linux/slab.h> #include <linux/amd-iommu.h> #include <linux/notifier.h> #include <linux/compat.h> +#include <linux/mman.h> struct mm_struct; @@ -46,13 +48,12 @@ DEFINE_STATIC_SRCU(kfd_processes_srcu); static struct workqueue_struct *kfd_process_wq; -struct kfd_process_release_work { - struct work_struct kfd_work; - struct kfd_process *p; -}; - static struct kfd_process *find_process(const struct task_struct *thread); -static struct kfd_process *create_process(const struct task_struct *thread); +static void kfd_process_ref_release(struct kref *ref); +static struct kfd_process *create_process(const struct task_struct *thread, + struct file *filep); +static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep); + void kfd_process_create_wq(void) { @@ -68,9 +69,10 @@ void kfd_process_destroy_wq(void) } } -struct kfd_process *kfd_create_process(const struct task_struct *thread) +struct kfd_process *kfd_create_process(struct file *filep) { struct kfd_process *process; + struct task_struct *thread = current; if (!thread->mm) return ERR_PTR(-EINVAL); @@ -79,9 +81,6 @@ struct kfd_process *kfd_create_process(const struct task_struct *thread) if (thread->group_leader->mm != thread->mm) return ERR_PTR(-EINVAL); - /* Take mmap_sem because we call __mmu_notifier_register inside */ - down_write(&thread->mm->mmap_sem); - /* * take kfd processes mutex before starting of process creation * so there won't be a case where two threads of the same process @@ -93,14 +92,11 @@ struct kfd_process *kfd_create_process(const struct task_struct *thread) process = find_process(thread); if (process) pr_debug("Process already found\n"); - - if (!process) - process = create_process(thread); + else + process = create_process(thread, filep); mutex_unlock(&kfd_processes_mutex); 
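The hunks that follow replace the old "lookup returns a locked process" lifetime with reference counting. As an overview, assembled from the helpers below (a sketch of the flow, in comment form):

/*
 * create_process()              kref_init(&p->ref)    count = 1
 * kfd_lookup_process_by_pasid() kref_get(&p->ref)     caller owns a ref
 * kfd_unref_process()           kref_put(&p->ref, kfd_process_ref_release)
 * kfd_process_ref_release()     queue_work(kfd_process_wq, &p->release_work)
 * kfd_process_wq_release()      free pdds, events, pasid, doorbells, p
 */

Deferring the final teardown to a workqueue means it always runs in a known sleepable context regardless of where the last reference is dropped; that rationale is an inference from the code, not stated in the patch.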
- up_write(&thread->mm->mmap_sem); - return process; } @@ -144,63 +140,75 @@ static struct kfd_process *find_process(const struct task_struct *thread) return p; } -static void kfd_process_wq_release(struct work_struct *work) +void kfd_unref_process(struct kfd_process *p) +{ + kref_put(&p->ref, kfd_process_ref_release); +} + +static void kfd_process_destroy_pdds(struct kfd_process *p) { - struct kfd_process_release_work *my_work; struct kfd_process_device *pdd, *temp; - struct kfd_process *p; - my_work = (struct kfd_process_release_work *) work; + list_for_each_entry_safe(pdd, temp, &p->per_device_data, + per_device_list) { + pr_debug("Releasing pdd (topology id %d) for process (pasid %d)\n", + pdd->dev->id, p->pasid); - p = my_work->p; + list_del(&pdd->per_device_list); - pr_debug("Releasing process (pasid %d) in workqueue\n", - p->pasid); + if (pdd->qpd.cwsr_kaddr) + free_pages((unsigned long)pdd->qpd.cwsr_kaddr, + get_order(KFD_CWSR_TBA_TMA_SIZE)); - mutex_lock(&p->mutex); + kfree(pdd); + } +} - list_for_each_entry_safe(pdd, temp, &p->per_device_data, - per_device_list) { - pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n", - pdd->dev->id, p->pasid); +/* No process locking is needed in this function, because the process + * is not findable any more. We must assume that no other thread is + * using it any more, otherwise we couldn't safely free the process + * structure in the end. + */ +static void kfd_process_wq_release(struct work_struct *work) +{ + struct kfd_process *p = container_of(work, struct kfd_process, + release_work); + struct kfd_process_device *pdd; + + pr_debug("Releasing process (pasid %d) in workqueue\n", p->pasid); + list_for_each_entry(pdd, &p->per_device_data, per_device_list) { if (pdd->bound == PDD_BOUND) amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid); - - list_del(&pdd->per_device_list); - kfree(pdd); } + kfd_process_destroy_pdds(p); + kfd_event_free_process(p); kfd_pasid_free(p->pasid); kfd_free_process_doorbells(p); - mutex_unlock(&p->mutex); - mutex_destroy(&p->mutex); - kfree(p); + put_task_struct(p->lead_thread); - kfree(work); + kfree(p); } -static void kfd_process_destroy_delayed(struct rcu_head *rcu) +static void kfd_process_ref_release(struct kref *ref) { - struct kfd_process_release_work *work; - struct kfd_process *p; + struct kfd_process *p = container_of(ref, struct kfd_process, ref); - p = container_of(rcu, struct kfd_process, rcu); - - mmdrop(p->mm); + INIT_WORK(&p->release_work, kfd_process_wq_release); + queue_work(kfd_process_wq, &p->release_work); +} - work = kmalloc(sizeof(struct kfd_process_release_work), GFP_ATOMIC); +static void kfd_process_destroy_delayed(struct rcu_head *rcu) +{ + struct kfd_process *p = container_of(rcu, struct kfd_process, rcu); - if (work) { - INIT_WORK((struct work_struct *) work, kfd_process_wq_release); - work->p = p; - queue_work(kfd_process_wq, (struct work_struct *) work); - } + kfd_unref_process(p); } static void kfd_process_notifier_release(struct mmu_notifier *mn, @@ -244,15 +252,12 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, kfd_process_dequeue_from_all_devices(p); pqm_uninit(&p->pqm); + /* Indicate to other users that MM is no longer valid */ + p->mm = NULL; + mutex_unlock(&p->mutex); - /* - * Because we drop mm_count inside kfd_process_destroy_delayed - * and because the mmu_notifier_unregister function also drop - * mm_count we need to take an extra count here. 
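kfd_process_init_cwsr() in the next hunk maps the two CWSR pages through vm_mmap() with a specially encoded offset. Pulled out for clarity, using the KFD_MMAP_RESERVED_MEM_MASK definition added to kfd_priv.h (the wrapper function is invented):

/* One value both routes the mmap to the reserved-memory handler
 * (kfd_reserved_mem_mmap()) and identifies the target device.
 */
static unsigned long cwsr_mmap_offset(struct kfd_dev *dev)
{
	return (dev->id | KFD_MMAP_RESERVED_MEM_MASK) << PAGE_SHIFT;
}

After vm_mmap() returns the user-space address (the TBA), the handler has recorded the kernel alias in qpd->cwsr_kaddr, so the driver can memcpy the trap-handler ISA into the first page while user space sees it at qpd->tba_addr; the TMA sits one page above, at KFD_CWSR_TMA_OFFSET.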
- */ - mmgrab(p->mm); - mmu_notifier_unregister_no_release(&p->mmu_notifier, p->mm); + mmu_notifier_unregister_no_release(&p->mmu_notifier, mm); mmu_notifier_call_srcu(&p->rcu, &kfd_process_destroy_delayed); } @@ -260,7 +265,44 @@ static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = { .release = kfd_process_notifier_release, }; -static struct kfd_process *create_process(const struct task_struct *thread) +static int kfd_process_init_cwsr(struct kfd_process *p, struct file *filep) +{ + unsigned long offset; + struct kfd_process_device *pdd = NULL; + struct kfd_dev *dev = NULL; + struct qcm_process_device *qpd = NULL; + + list_for_each_entry(pdd, &p->per_device_data, per_device_list) { + dev = pdd->dev; + qpd = &pdd->qpd; + if (!dev->cwsr_enabled || qpd->cwsr_kaddr) + continue; + offset = (dev->id | KFD_MMAP_RESERVED_MEM_MASK) << PAGE_SHIFT; + qpd->tba_addr = (int64_t)vm_mmap(filep, 0, + KFD_CWSR_TBA_TMA_SIZE, PROT_READ | PROT_EXEC, + MAP_SHARED, offset); + + if (IS_ERR_VALUE(qpd->tba_addr)) { + int err = qpd->tba_addr; + + pr_err("Failure to set tba address. error %d.\n", err); + qpd->tba_addr = 0; + qpd->cwsr_kaddr = NULL; + return err; + } + + memcpy(qpd->cwsr_kaddr, dev->cwsr_isa, dev->cwsr_isa_size); + + qpd->tma_addr = qpd->tba_addr + KFD_CWSR_TMA_OFFSET; + pr_debug("set tba :0x%llx, tma:0x%llx, cwsr_kaddr:%p for pqm.\n", + qpd->tba_addr, qpd->tma_addr, qpd->cwsr_kaddr); + } + + return 0; +} + +static struct kfd_process *create_process(const struct task_struct *thread, + struct file *filep) { struct kfd_process *process; int err = -ENOMEM; @@ -277,13 +319,15 @@ static struct kfd_process *create_process(const struct task_struct *thread) if (kfd_alloc_process_doorbells(process) < 0) goto err_alloc_doorbells; + kref_init(&process->ref); + mutex_init(&process->mutex); process->mm = thread->mm; /* register notifier */ process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops; - err = __mmu_notifier_register(&process->mmu_notifier, process->mm); + err = mmu_notifier_register(&process->mmu_notifier, process->mm); if (err) goto err_mmu_notifier; @@ -291,6 +335,7 @@ static struct kfd_process *create_process(const struct task_struct *thread) (uintptr_t)process->mm); process->lead_thread = thread->group_leader; + get_task_struct(process->lead_thread); INIT_LIST_HEAD(&process->per_device_data); @@ -306,8 +351,14 @@ static struct kfd_process *create_process(const struct task_struct *thread) if (err != 0) goto err_init_apertures; + err = kfd_process_init_cwsr(process, filep); + if (err) + goto err_init_cwsr; + return process; +err_init_cwsr: + kfd_process_destroy_pdds(process); err_init_apertures: pqm_uninit(&process->pqm); err_process_pqm_init: @@ -343,16 +394,18 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, struct kfd_process_device *pdd = NULL; pdd = kzalloc(sizeof(*pdd), GFP_KERNEL); - if (pdd != NULL) { - pdd->dev = dev; - INIT_LIST_HEAD(&pdd->qpd.queues_list); - INIT_LIST_HEAD(&pdd->qpd.priv_queue_list); - pdd->qpd.dqm = dev->dqm; - pdd->process = p; - pdd->bound = PDD_UNBOUND; - pdd->already_dequeued = false; - list_add(&pdd->per_device_list, &p->per_device_data); - } + if (!pdd) + return NULL; + + pdd->dev = dev; + INIT_LIST_HEAD(&pdd->qpd.queues_list); + INIT_LIST_HEAD(&pdd->qpd.priv_queue_list); + pdd->qpd.dqm = dev->dqm; + pdd->qpd.pqm = &p->pqm; + pdd->process = p; + pdd->bound = PDD_UNBOUND; + pdd->already_dequeued = false; + list_add(&pdd->per_device_list, &p->per_device_data); return pdd; } @@ -483,6 +536,8 @@ void 
kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid) mutex_unlock(kfd_get_dbgmgr_mutex()); + mutex_lock(&p->mutex); + pdd = kfd_get_process_device_data(dev, p); if (pdd) /* For GPU relying on IOMMU, we need to dequeue here @@ -491,6 +546,8 @@ void kfd_process_iommu_unbind_callback(struct kfd_dev *dev, unsigned int pasid) kfd_process_dequeue_from_device(pdd); mutex_unlock(&p->mutex); + + kfd_unref_process(p); } struct kfd_process_device *kfd_get_first_process_device_data( @@ -515,22 +572,86 @@ bool kfd_has_process_device_data(struct kfd_process *p) return !(list_empty(&p->per_device_data)); } -/* This returns with process->mutex locked. */ +/* This increments the process->ref counter. */ struct kfd_process *kfd_lookup_process_by_pasid(unsigned int pasid) { - struct kfd_process *p; + struct kfd_process *p, *ret_p = NULL; unsigned int temp; int idx = srcu_read_lock(&kfd_processes_srcu); hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { if (p->pasid == pasid) { - mutex_lock(&p->mutex); + kref_get(&p->ref); + ret_p = p; break; } } srcu_read_unlock(&kfd_processes_srcu, idx); - return p; + return ret_p; } + +int kfd_reserved_mem_mmap(struct kfd_process *process, + struct vm_area_struct *vma) +{ + struct kfd_dev *dev = kfd_device_by_id(vma->vm_pgoff); + struct kfd_process_device *pdd; + struct qcm_process_device *qpd; + + if (!dev) + return -EINVAL; + if ((vma->vm_end - vma->vm_start) != KFD_CWSR_TBA_TMA_SIZE) { + pr_err("Incorrect CWSR mapping size.\n"); + return -EINVAL; + } + + pdd = kfd_get_process_device_data(dev, process); + if (!pdd) + return -EINVAL; + qpd = &pdd->qpd; + + qpd->cwsr_kaddr = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(KFD_CWSR_TBA_TMA_SIZE)); + if (!qpd->cwsr_kaddr) { + pr_err("Error allocating per process CWSR buffer.\n"); + return -ENOMEM; + } + + vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND + | VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP; + /* Mapping pages to user process */ + return remap_pfn_range(vma, vma->vm_start, + PFN_DOWN(__pa(qpd->cwsr_kaddr)), + KFD_CWSR_TBA_TMA_SIZE, vma->vm_page_prot); +} + +#if defined(CONFIG_DEBUG_FS) + +int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data) +{ + struct kfd_process *p; + unsigned int temp; + int r = 0; + + int idx = srcu_read_lock(&kfd_processes_srcu); + + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { + seq_printf(m, "Process %d PASID %d:\n", + p->lead_thread->tgid, p->pasid); + + mutex_lock(&p->mutex); + r = pqm_debugfs_mqds(m, &p->pqm); + mutex_unlock(&p->mutex); + + if (r) + break; + } + + srcu_read_unlock(&kfd_processes_srcu, idx); + + return r; +} + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index a3f1e62c60ba..876380632668 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -178,10 +178,8 @@ int pqm_create_queue(struct process_queue_manager *pqm, return retval; if (list_empty(&pdd->qpd.queues_list) && - list_empty(&pdd->qpd.priv_queue_list)) { - pdd->qpd.pqm = pqm; + list_empty(&pdd->qpd.priv_queue_list)) dev->dqm->ops.register_process(dev->dqm, &pdd->qpd); - } pqn = kzalloc(sizeof(*pqn), GFP_KERNEL); if (!pqn) { @@ -203,8 +201,7 @@ int pqm_create_queue(struct process_queue_manager *pqm, goto err_create_queue; pqn->q = q; pqn->kq = NULL; - retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd, - &q->properties.vmid); + retval = 
dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd); pr_debug("DQM returned %d for create_queue\n", retval); print_queue(q); break; @@ -224,8 +221,7 @@ int pqm_create_queue(struct process_queue_manager *pqm, goto err_create_queue; pqn->q = q; pqn->kq = NULL; - retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd, - &q->properties.vmid); + retval = dev->dqm->ops.create_queue(dev->dqm, q, &pdd->qpd); pr_debug("DQM returned %d for create_queue\n", retval); print_queue(q); break; @@ -315,6 +311,10 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) if (pqn->q) { dqm = pqn->q->device->dqm; retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q); + if (retval) { + pr_debug("Destroy queue failed, returned %d\n", retval); + goto err_destroy_queue; + } uninit_queue(pqn->q); } @@ -326,6 +326,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) list_empty(&pdd->qpd.priv_queue_list)) dqm->ops.unregister_process(dqm, &pdd->qpd); +err_destroy_queue: return retval; } @@ -367,4 +368,67 @@ struct kernel_queue *pqm_get_kernel_queue( return NULL; } +#if defined(CONFIG_DEBUG_FS) + +int pqm_debugfs_mqds(struct seq_file *m, void *data) +{ + struct process_queue_manager *pqm = data; + struct process_queue_node *pqn; + struct queue *q; + enum KFD_MQD_TYPE mqd_type; + struct mqd_manager *mqd_manager; + int r = 0; + + list_for_each_entry(pqn, &pqm->queues, process_queue_list) { + if (pqn->q) { + q = pqn->q; + switch (q->properties.type) { + case KFD_QUEUE_TYPE_SDMA: + seq_printf(m, " SDMA queue on device %x\n", + q->device->id); + mqd_type = KFD_MQD_TYPE_SDMA; + break; + case KFD_QUEUE_TYPE_COMPUTE: + seq_printf(m, " Compute queue on device %x\n", + q->device->id); + mqd_type = KFD_MQD_TYPE_CP; + break; + default: + seq_printf(m, + " Bad user queue type %d on device %x\n", + q->properties.type, q->device->id); + continue; + } + mqd_manager = q->device->dqm->ops.get_mqd_manager( + q->device->dqm, mqd_type); + } else if (pqn->kq) { + q = pqn->kq->queue; + mqd_manager = pqn->kq->mqd; + switch (q->properties.type) { + case KFD_QUEUE_TYPE_DIQ: + seq_printf(m, " DIQ on device %x\n", + pqn->kq->dev->id); + mqd_type = KFD_MQD_TYPE_HIQ; + break; + default: + seq_printf(m, + " Bad kernel queue type %d on device %x\n", + q->properties.type, + pqn->kq->dev->id); + continue; + } + } else { + seq_printf(m, + " Weird: Queue node with neither kernel nor user queue\n"); + continue; + } + + r = mqd_manager->debugfs_show_mqd(m, q->mqd); + if (r != 0) + break; + } + + return r; +} +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index 19ce59028d6b..c6a76090a725 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -28,27 +28,32 @@ #include <linux/hash.h> #include <linux/cpufreq.h> #include <linux/log2.h> +#include <linux/dmi.h> +#include <linux/atomic.h> #include "kfd_priv.h" #include "kfd_crat.h" #include "kfd_topology.h" +#include "kfd_device_queue_manager.h" +/* topology_device_list - Master list of all topology devices */ static struct list_head topology_device_list; -static int topology_crat_parsed; static struct kfd_system_properties sys_props; static DECLARE_RWSEM(topology_lock); +static atomic_t topology_crat_proximity_domain; -struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) +struct kfd_topology_device *kfd_topology_device_by_proximity_domain( + uint32_t proximity_domain) { struct kfd_topology_device *top_dev; - struct kfd_dev *device = NULL; + struct 
kfd_topology_device *device = NULL; down_read(&topology_lock); list_for_each_entry(top_dev, &topology_device_list, list) - if (top_dev->gpu_id == gpu_id) { - device = top_dev->gpu; + if (top_dev->proximity_domain == proximity_domain) { + device = top_dev; break; } @@ -57,7 +62,7 @@ struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) return device; } -struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) +struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) { struct kfd_topology_device *top_dev; struct kfd_dev *device = NULL; @@ -65,7 +70,7 @@ struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) down_read(&topology_lock); list_for_each_entry(top_dev, &topology_device_list, list) - if (top_dev->gpu->pdev == pdev) { + if (top_dev->gpu_id == gpu_id) { device = top_dev->gpu; break; } @@ -75,282 +80,31 @@ struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) return device; } -static int kfd_topology_get_crat_acpi(void *crat_image, size_t *size) -{ - struct acpi_table_header *crat_table; - acpi_status status; - - if (!size) - return -EINVAL; - - /* - * Fetch the CRAT table from ACPI - */ - status = acpi_get_table(CRAT_SIGNATURE, 0, &crat_table); - if (status == AE_NOT_FOUND) { - pr_warn("CRAT table not found\n"); - return -ENODATA; - } else if (ACPI_FAILURE(status)) { - const char *err = acpi_format_exception(status); - - pr_err("CRAT table error: %s\n", err); - return -EINVAL; - } - - if (*size >= crat_table->length && crat_image != NULL) - memcpy(crat_image, crat_table, crat_table->length); - - *size = crat_table->length; - - return 0; -} - -static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, - struct crat_subtype_computeunit *cu) -{ - dev->node_props.cpu_cores_count = cu->num_cpu_cores; - dev->node_props.cpu_core_id_base = cu->processor_id_low; - if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT) - dev->node_props.capability |= HSA_CAP_ATS_PRESENT; - - pr_info("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores, - cu->processor_id_low); -} - -static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev, - struct crat_subtype_computeunit *cu) -{ - dev->node_props.simd_id_base = cu->processor_id_low; - dev->node_props.simd_count = cu->num_simd_cores; - dev->node_props.lds_size_in_kb = cu->lds_size_in_kb; - dev->node_props.max_waves_per_simd = cu->max_waves_simd; - dev->node_props.wave_front_size = cu->wave_front_size; - dev->node_props.mem_banks_count = cu->num_banks; - dev->node_props.array_count = cu->num_arrays; - dev->node_props.cu_per_simd_array = cu->num_cu_per_array; - dev->node_props.simd_per_cu = cu->num_simd_per_cu; - dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu; - if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE) - dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE; - pr_info("CU GPU: simds=%d id_base=%d\n", cu->num_simd_cores, - cu->processor_id_low); -} - -/* kfd_parse_subtype_cu is called when the topology mutex is already acquired */ -static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu) -{ - struct kfd_topology_device *dev; - int i = 0; - - pr_info("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n", - cu->proximity_domain, cu->hsa_capability); - list_for_each_entry(dev, &topology_device_list, list) { - if (cu->proximity_domain == i) { - if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT) - kfd_populated_cu_info_cpu(dev, cu); - - if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT) - kfd_populated_cu_info_gpu(dev, cu); - break; - } - i++; - } - - return 0; -} - -/* - * 
kfd_parse_subtype_mem is called when the topology mutex is - * already acquired - */ -static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem) -{ - struct kfd_mem_properties *props; - struct kfd_topology_device *dev; - int i = 0; - - pr_info("Found memory entry in CRAT table with proximity_domain=%d\n", - mem->promixity_domain); - list_for_each_entry(dev, &topology_device_list, list) { - if (mem->promixity_domain == i) { - props = kfd_alloc_struct(props); - if (props == NULL) - return -ENOMEM; - - if (dev->node_props.cpu_cores_count == 0) - props->heap_type = HSA_MEM_HEAP_TYPE_FB_PRIVATE; - else - props->heap_type = HSA_MEM_HEAP_TYPE_SYSTEM; - - if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE) - props->flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE; - if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE) - props->flags |= HSA_MEM_FLAGS_NON_VOLATILE; - - props->size_in_bytes = - ((uint64_t)mem->length_high << 32) + - mem->length_low; - props->width = mem->width; - - dev->mem_bank_count++; - list_add_tail(&props->list, &dev->mem_props); - - break; - } - i++; - } - - return 0; -} - -/* - * kfd_parse_subtype_cache is called when the topology mutex - * is already acquired - */ -static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache) -{ - struct kfd_cache_properties *props; - struct kfd_topology_device *dev; - uint32_t id; - - id = cache->processor_id_low; - - pr_info("Found cache entry in CRAT table with processor_id=%d\n", id); - list_for_each_entry(dev, &topology_device_list, list) - if (id == dev->node_props.cpu_core_id_base || - id == dev->node_props.simd_id_base) { - props = kfd_alloc_struct(props); - if (props == NULL) - return -ENOMEM; - - props->processor_id_low = id; - props->cache_level = cache->cache_level; - props->cache_size = cache->cache_size; - props->cacheline_size = cache->cache_line_size; - props->cachelines_per_tag = cache->lines_per_tag; - props->cache_assoc = cache->associativity; - props->cache_latency = cache->cache_latency; - - if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE) - props->cache_type |= HSA_CACHE_TYPE_DATA; - if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE) - props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION; - if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE) - props->cache_type |= HSA_CACHE_TYPE_CPU; - if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE) - props->cache_type |= HSA_CACHE_TYPE_HSACU; - - dev->cache_count++; - dev->node_props.caches_count++; - list_add_tail(&props->list, &dev->cache_props); - - break; - } - - return 0; -} - -/* - * kfd_parse_subtype_iolink is called when the topology mutex - * is already acquired - */ -static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink) +struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) { - struct kfd_iolink_properties *props; - struct kfd_topology_device *dev; - uint32_t i = 0; - uint32_t id_from; - uint32_t id_to; - - id_from = iolink->proximity_domain_from; - id_to = iolink->proximity_domain_to; + struct kfd_topology_device *top_dev; + struct kfd_dev *device = NULL; - pr_info("Found IO link entry in CRAT table with id_from=%d\n", id_from); - list_for_each_entry(dev, &topology_device_list, list) { - if (id_from == i) { - props = kfd_alloc_struct(props); - if (props == NULL) - return -ENOMEM; - - props->node_from = id_from; - props->node_to = id_to; - props->ver_maj = iolink->version_major; - props->ver_min = iolink->version_minor; - - /* - * weight factor (derived from CDIR), currently always 1 - */ - props->weight = 1; - - props->min_latency = iolink->minimum_latency; - 
props->max_latency = iolink->maximum_latency; - props->min_bandwidth = iolink->minimum_bandwidth_mbs; - props->max_bandwidth = iolink->maximum_bandwidth_mbs; - props->rec_transfer_size = - iolink->recommended_transfer_size; - - dev->io_link_count++; - dev->node_props.io_links_count++; - list_add_tail(&props->list, &dev->io_link_props); + down_read(&topology_lock); + list_for_each_entry(top_dev, &topology_device_list, list) + if (top_dev->gpu->pdev == pdev) { + device = top_dev->gpu; break; } - i++; - } - return 0; -} - -static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr) -{ - struct crat_subtype_computeunit *cu; - struct crat_subtype_memory *mem; - struct crat_subtype_cache *cache; - struct crat_subtype_iolink *iolink; - int ret = 0; - - switch (sub_type_hdr->type) { - case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY: - cu = (struct crat_subtype_computeunit *)sub_type_hdr; - ret = kfd_parse_subtype_cu(cu); - break; - case CRAT_SUBTYPE_MEMORY_AFFINITY: - mem = (struct crat_subtype_memory *)sub_type_hdr; - ret = kfd_parse_subtype_mem(mem); - break; - case CRAT_SUBTYPE_CACHE_AFFINITY: - cache = (struct crat_subtype_cache *)sub_type_hdr; - ret = kfd_parse_subtype_cache(cache); - break; - case CRAT_SUBTYPE_TLB_AFFINITY: - /* - * For now, nothing to do here - */ - pr_info("Found TLB entry in CRAT table (not processing)\n"); - break; - case CRAT_SUBTYPE_CCOMPUTE_AFFINITY: - /* - * For now, nothing to do here - */ - pr_info("Found CCOMPUTE entry in CRAT table (not processing)\n"); - break; - case CRAT_SUBTYPE_IOLINK_AFFINITY: - iolink = (struct crat_subtype_iolink *)sub_type_hdr; - ret = kfd_parse_subtype_iolink(iolink); - break; - default: - pr_warn("Unknown subtype (%d) in CRAT\n", - sub_type_hdr->type); - } + up_read(&topology_lock); - return ret; + return device; } +/* Called with write topology_lock acquired */ static void kfd_release_topology_device(struct kfd_topology_device *dev) { struct kfd_mem_properties *mem; struct kfd_cache_properties *cache; struct kfd_iolink_properties *iolink; + struct kfd_perf_properties *perf; list_del(&dev->list); @@ -375,25 +129,35 @@ static void kfd_release_topology_device(struct kfd_topology_device *dev) kfree(iolink); } - kfree(dev); + while (dev->perf_props.next != &dev->perf_props) { + perf = container_of(dev->perf_props.next, + struct kfd_perf_properties, list); + list_del(&perf->list); + kfree(perf); + } - sys_props.num_devices--; + kfree(dev); } -static void kfd_release_live_view(void) +void kfd_release_topology_device_list(struct list_head *device_list) { struct kfd_topology_device *dev; - while (topology_device_list.next != &topology_device_list) { - dev = container_of(topology_device_list.next, - struct kfd_topology_device, list); + while (!list_empty(device_list)) { + dev = list_first_entry(device_list, + struct kfd_topology_device, list); kfd_release_topology_device(dev); + } } +static void kfd_release_live_view(void) +{ + kfd_release_topology_device_list(&topology_device_list); memset(&sys_props, 0, sizeof(sys_props)); } -static struct kfd_topology_device *kfd_create_topology_device(void) +struct kfd_topology_device *kfd_create_topology_device( + struct list_head *device_list) { struct kfd_topology_device *dev; @@ -406,65 +170,13 @@ static struct kfd_topology_device *kfd_create_topology_device(void) INIT_LIST_HEAD(&dev->mem_props); INIT_LIST_HEAD(&dev->cache_props); INIT_LIST_HEAD(&dev->io_link_props); + INIT_LIST_HEAD(&dev->perf_props); - list_add_tail(&dev->list, &topology_device_list); - sys_props.num_devices++; + 
list_add_tail(&dev->list, device_list); return dev; } -static int kfd_parse_crat_table(void *crat_image) -{ - struct kfd_topology_device *top_dev; - struct crat_subtype_generic *sub_type_hdr; - uint16_t node_id; - int ret; - struct crat_header *crat_table = (struct crat_header *)crat_image; - uint16_t num_nodes; - uint32_t image_len; - - if (!crat_image) - return -EINVAL; - - num_nodes = crat_table->num_domains; - image_len = crat_table->length; - - pr_info("Parsing CRAT table with %d nodes\n", num_nodes); - - for (node_id = 0; node_id < num_nodes; node_id++) { - top_dev = kfd_create_topology_device(); - if (!top_dev) { - kfd_release_live_view(); - return -ENOMEM; - } - } - - sys_props.platform_id = - (*((uint64_t *)crat_table->oem_id)) & CRAT_OEMID_64BIT_MASK; - sys_props.platform_oem = *((uint64_t *)crat_table->oem_table_id); - sys_props.platform_rev = crat_table->revision; - - sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); - while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) < - ((char *)crat_image) + image_len) { - if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) { - ret = kfd_parse_subtype(sub_type_hdr); - if (ret != 0) { - kfd_release_live_view(); - return ret; - } - } - - sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + - sub_type_hdr->length); - } - - sys_props.generation_count++; - topology_crat_parsed = 1; - - return 0; -} - #define sysfs_show_gen_prop(buffer, fmt, ...) \ snprintf(buffer, PAGE_SIZE, "%s"fmt, buffer, __VA_ARGS__) @@ -501,11 +213,17 @@ static ssize_t sysprops_show(struct kobject *kobj, struct attribute *attr, return ret; } +static void kfd_topology_kobj_release(struct kobject *kobj) +{ + kfree(kobj); +} + static const struct sysfs_ops sysprops_ops = { .show = sysprops_show, }; static struct kobj_type sysprops_type = { + .release = kfd_topology_kobj_release, .sysfs_ops = &sysprops_ops, }; @@ -541,6 +259,7 @@ static const struct sysfs_ops iolink_ops = { }; static struct kobj_type iolink_type = { + .release = kfd_topology_kobj_release, .sysfs_ops = &iolink_ops, }; @@ -568,6 +287,7 @@ static const struct sysfs_ops mem_ops = { }; static struct kobj_type mem_type = { + .release = kfd_topology_kobj_release, .sysfs_ops = &mem_ops, }; @@ -575,7 +295,7 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr, char *buffer) { ssize_t ret; - uint32_t i; + uint32_t i, j; struct kfd_cache_properties *cache; /* Making sure that the buffer is an empty string */ @@ -593,12 +313,18 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr, sysfs_show_32bit_prop(buffer, "latency", cache->cache_latency); sysfs_show_32bit_prop(buffer, "type", cache->cache_type); snprintf(buffer, PAGE_SIZE, "%ssibling_map ", buffer); - for (i = 0; i < KFD_TOPOLOGY_CPU_SIBLINGS; i++) - ret = snprintf(buffer, PAGE_SIZE, "%s%d%s", - buffer, cache->sibling_map[i], - (i == KFD_TOPOLOGY_CPU_SIBLINGS-1) ? 
- "\n" : ","); - + for (i = 0; i < CRAT_SIBLINGMAP_SIZE; i++) + for (j = 0; j < sizeof(cache->sibling_map[0])*8; j++) { + /* Check each bit */ + if (cache->sibling_map[i] & (1 << j)) + ret = snprintf(buffer, PAGE_SIZE, + "%s%d%s", buffer, 1, ","); + else + ret = snprintf(buffer, PAGE_SIZE, + "%s%d%s", buffer, 0, ","); + } + /* Replace the last "," with end of line */ + *(buffer + strlen(buffer) - 1) = 0xA; return ret; } @@ -607,9 +333,43 @@ static const struct sysfs_ops cache_ops = { }; static struct kobj_type cache_type = { + .release = kfd_topology_kobj_release, .sysfs_ops = &cache_ops, }; +/****** Sysfs of Performance Counters ******/ + +struct kfd_perf_attr { + struct kobj_attribute attr; + uint32_t data; +}; + +static ssize_t perf_show(struct kobject *kobj, struct kobj_attribute *attrs, + char *buf) +{ + struct kfd_perf_attr *attr; + + buf[0] = 0; + attr = container_of(attrs, struct kfd_perf_attr, attr); + if (!attr->data) /* invalid data for PMC */ + return 0; + else + return sysfs_show_32bit_val(buf, attr->data); +} + +#define KFD_PERF_DESC(_name, _data) \ +{ \ + .attr = __ATTR(_name, 0444, perf_show, NULL), \ + .data = _data, \ +} + +static struct kfd_perf_attr perf_attr_iommu[] = { + KFD_PERF_DESC(max_concurrent, 0), + KFD_PERF_DESC(num_counters, 0), + KFD_PERF_DESC(counter_ids, 0), +}; +/****************************************/ + static ssize_t node_show(struct kobject *kobj, struct attribute *attr, char *buffer) { @@ -646,18 +406,8 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, dev->node_props.cpu_cores_count); sysfs_show_32bit_prop(buffer, "simd_count", dev->node_props.simd_count); - - if (dev->mem_bank_count < dev->node_props.mem_banks_count) { - pr_info_once("mem_banks_count truncated from %d to %d\n", - dev->node_props.mem_banks_count, - dev->mem_bank_count); - sysfs_show_32bit_prop(buffer, "mem_banks_count", - dev->mem_bank_count); - } else { - sysfs_show_32bit_prop(buffer, "mem_banks_count", - dev->node_props.mem_banks_count); - } - + sysfs_show_32bit_prop(buffer, "mem_banks_count", + dev->node_props.mem_banks_count); sysfs_show_32bit_prop(buffer, "caches_count", dev->node_props.caches_count); sysfs_show_32bit_prop(buffer, "io_links_count", @@ -705,9 +455,12 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, HSA_CAP_WATCH_POINTS_TOTALBITS_MASK); } + if (dev->gpu->device_info->asic_family == CHIP_TONGA) + dev->node_props.capability |= + HSA_CAP_AQL_QUEUE_DOUBLE_MAP; + sysfs_show_32bit_prop(buffer, "max_engine_clk_fcompute", - dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz( - dev->gpu->kgd)); + dev->node_props.max_engine_clk_fcompute); sysfs_show_64bit_prop(buffer, "local_mem_size", (unsigned long long int) 0); @@ -729,6 +482,7 @@ static const struct sysfs_ops node_ops = { }; static struct kobj_type node_type = { + .release = kfd_topology_kobj_release, .sysfs_ops = &node_ops, }; @@ -744,6 +498,7 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev) struct kfd_iolink_properties *iolink; struct kfd_cache_properties *cache; struct kfd_mem_properties *mem; + struct kfd_perf_properties *perf; if (dev->kobj_iolink) { list_for_each_entry(iolink, &dev->io_link_props, list) @@ -780,6 +535,16 @@ static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev) dev->kobj_mem = NULL; } + if (dev->kobj_perf) { + list_for_each_entry(perf, &dev->perf_props, list) { + kfree(perf->attr_group); + perf->attr_group = NULL; + } + kobject_del(dev->kobj_perf); + kobject_put(dev->kobj_perf); + dev->kobj_perf = NULL; + } 
+ if (dev->kobj_node) { sysfs_remove_file(dev->kobj_node, &dev->attr_gpuid); sysfs_remove_file(dev->kobj_node, &dev->attr_name); @@ -796,8 +561,10 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, struct kfd_iolink_properties *iolink; struct kfd_cache_properties *cache; struct kfd_mem_properties *mem; + struct kfd_perf_properties *perf; int ret; - uint32_t i; + uint32_t i, num_attrs; + struct attribute **attrs; if (WARN_ON(dev->kobj_node)) return -EEXIST; @@ -826,6 +593,10 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, if (!dev->kobj_iolink) return -ENOMEM; + dev->kobj_perf = kobject_create_and_add("perf", dev->kobj_node); + if (!dev->kobj_perf) + return -ENOMEM; + /* * Creating sysfs files for node properties */ @@ -903,11 +674,38 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, if (ret < 0) return ret; i++; -} + } + + /* All hardware blocks have the same number of attributes. */ + num_attrs = sizeof(perf_attr_iommu)/sizeof(struct kfd_perf_attr); + list_for_each_entry(perf, &dev->perf_props, list) { + perf->attr_group = kzalloc(sizeof(struct kfd_perf_attr) + * num_attrs + sizeof(struct attribute_group), + GFP_KERNEL); + if (!perf->attr_group) + return -ENOMEM; + + attrs = (struct attribute **)(perf->attr_group + 1); + if (!strcmp(perf->block_name, "iommu")) { + /* Information of IOMMU's num_counters and counter_ids is shown + * under /sys/bus/event_source/devices/amd_iommu. We don't + * duplicate it here. + */ + perf_attr_iommu[0].data = perf->max_concurrent; + for (i = 0; i < num_attrs; i++) + attrs[i] = &perf_attr_iommu[i].attr.attr; + } + perf->attr_group->name = perf->block_name; + perf->attr_group->attrs = attrs; + ret = sysfs_create_group(dev->kobj_perf, perf->attr_group); + if (ret < 0) + return ret; + } return 0; } +/* Called with write topology lock acquired */ static int kfd_build_sysfs_node_tree(void) { struct kfd_topology_device *dev; @@ -924,6 +722,7 @@ static int kfd_build_sysfs_node_tree(void) return 0; } +/* Called with write topology lock acquired */ static void kfd_remove_sysfs_node_tree(void) { struct kfd_topology_device *dev; @@ -995,75 +794,246 @@ static void kfd_topology_release_sysfs(void) } } +/* Called with write topology_lock acquired */ +static void kfd_topology_update_device_list(struct list_head *temp_list, + struct list_head *master_list) +{ + while (!list_empty(temp_list)) { + list_move_tail(temp_list->next, master_list); + sys_props.num_devices++; + } +} + +static void kfd_debug_print_topology(void) +{ + struct kfd_topology_device *dev; + + down_read(&topology_lock); + + dev = list_last_entry(&topology_device_list, + struct kfd_topology_device, list); + if (dev) { + if (dev->node_props.cpu_cores_count && + dev->node_props.simd_count) { + pr_info("Topology: Add APU node [0x%0x:0x%0x]\n", + dev->node_props.device_id, + dev->node_props.vendor_id); + } else if (dev->node_props.cpu_cores_count) + pr_info("Topology: Add CPU node\n"); + else if (dev->node_props.simd_count) + pr_info("Topology: Add dGPU node [0x%0x:0x%0x]\n", + dev->node_props.device_id, + dev->node_props.vendor_id); + } + up_read(&topology_lock); +} + +/* Helper function for initializing platform_xx members of + * kfd_system_properties. Uses OEM info from the last CPU/APU node.
+ */ +static void kfd_update_system_properties(void) +{ + struct kfd_topology_device *dev; + + down_read(&topology_lock); + dev = list_last_entry(&topology_device_list, + struct kfd_topology_device, list); + if (dev) { + sys_props.platform_id = + (*((uint64_t *)dev->oem_id)) & CRAT_OEMID_64BIT_MASK; + sys_props.platform_oem = *((uint64_t *)dev->oem_table_id); + sys_props.platform_rev = dev->oem_revision; + } + up_read(&topology_lock); +} + +static void find_system_memory(const struct dmi_header *dm, + void *private) +{ + struct kfd_mem_properties *mem; + u16 mem_width, mem_clock; + struct kfd_topology_device *kdev = + (struct kfd_topology_device *)private; + const u8 *dmi_data = (const u8 *)(dm + 1); + + if (dm->type == DMI_ENTRY_MEM_DEVICE && dm->length >= 0x15) { + mem_width = (u16)(*(const u16 *)(dmi_data + 0x6)); + mem_clock = (u16)(*(const u16 *)(dmi_data + 0x11)); + list_for_each_entry(mem, &kdev->mem_props, list) { + if (mem_width != 0xFFFF && mem_width != 0) + mem->width = mem_width; + if (mem_clock != 0) + mem->mem_clk_max = mem_clock; + } + } +} + +/* + * Performance counter information is not part of CRAT, but we would like to + * put it in sysfs under the topology directory for the Thunk to get the data. + * This function is called before updating the sysfs. + */ +static int kfd_add_perf_to_topology(struct kfd_topology_device *kdev) +{ + struct kfd_perf_properties *props; + + if (amd_iommu_pc_supported()) { + props = kfd_alloc_struct(props); + if (!props) + return -ENOMEM; + strcpy(props->block_name, "iommu"); + props->max_concurrent = amd_iommu_pc_get_max_banks(0) * + amd_iommu_pc_get_max_counters(0); /* assume one iommu */ + list_add_tail(&props->list, &kdev->perf_props); + } + + return 0; +} + +/* kfd_add_non_crat_information - Add information that is not currently + * defined in CRAT but is necessary for KFD topology + * @dev - topology device to which additional info is added + */ +static void kfd_add_non_crat_information(struct kfd_topology_device *kdev) +{ + /* Check if CPU only node. */ + if (!kdev->gpu) { + /* Add system memory information */ + dmi_walk(find_system_memory, kdev); + } + /* TODO: For GPU node, rearrange code from kfd_topology_add_device */ +} + +/* kfd_is_acpi_crat_invalid - CRAT from ACPI is valid only for AMD APU devices. + * Ignore CRAT for all other devices. AMD APU is identified if both CPU + * and GPU cores are present. + * @device_list - topology device list created by parsing ACPI CRAT table. + * @return - TRUE if invalid, FALSE if valid. + */ +static bool kfd_is_acpi_crat_invalid(struct list_head *device_list) +{ + struct kfd_topology_device *dev; + + list_for_each_entry(dev, device_list, list) { + if (dev->node_props.cpu_cores_count && + dev->node_props.simd_count) + return false; + } + pr_info("Ignoring ACPI CRAT on non-APU system\n"); + return true; +} + int kfd_topology_init(void) { void *crat_image = NULL; size_t image_size = 0; int ret; - - /* - * Initialize the head for the topology device list + struct list_head temp_topology_device_list; + int cpu_only_node = 0; + struct kfd_topology_device *kdev; + int proximity_domain; + + /* topology_device_list - Master list of all topology devices + * temp_topology_device_list - temporary list created while parsing CRAT + * or VCRAT.
Once parsing is complete, the contents of the list are moved to + * topology_device_list */ + + /* Initialize the heads for both lists */ INIT_LIST_HEAD(&topology_device_list); + INIT_LIST_HEAD(&temp_topology_device_list); init_rwsem(&topology_lock); - topology_crat_parsed = 0; memset(&sys_props, 0, sizeof(sys_props)); + /* Proximity domains in ACPI CRAT tables start counting at + * 0. The same should be true for virtual CRAT tables created + * at this stage. GPUs added later in kfd_topology_add_device + * use a counter. + */ + proximity_domain = 0; + /* - * Get the CRAT image from the ACPI + * Get the CRAT image from the ACPI. If ACPI doesn't have one, + * or if the ACPI CRAT is invalid, create a virtual CRAT. + * NOTE: The current implementation expects all AMD APUs to have + * CRAT. If no CRAT is available, it is assumed to be a CPU */ - ret = kfd_topology_get_crat_acpi(crat_image, &image_size); - if (ret == 0 && image_size > 0) { - pr_info("Found CRAT image with size=%zd\n", image_size); - crat_image = kmalloc(image_size, GFP_KERNEL); - if (!crat_image) { - ret = -ENOMEM; - pr_err("No memory for allocating CRAT image\n"); - goto err; + ret = kfd_create_crat_image_acpi(&crat_image, &image_size); + if (!ret) { + ret = kfd_parse_crat_table(crat_image, + &temp_topology_device_list, + proximity_domain); + if (ret || + kfd_is_acpi_crat_invalid(&temp_topology_device_list)) { + kfd_release_topology_device_list( + &temp_topology_device_list); + kfd_destroy_crat_image(crat_image); + crat_image = NULL; } - ret = kfd_topology_get_crat_acpi(crat_image, &image_size); - - if (ret == 0) { - down_write(&topology_lock); - ret = kfd_parse_crat_table(crat_image); - if (ret == 0) - ret = kfd_topology_update_sysfs(); - up_write(&topology_lock); - } else { - pr_err("Couldn't get CRAT table size from ACPI\n"); + } + + if (!crat_image) { + ret = kfd_create_crat_image_virtual(&crat_image, &image_size, + COMPUTE_UNIT_CPU, NULL, + proximity_domain); + cpu_only_node = 1; + if (ret) { + pr_err("Error creating VCRAT table for CPU\n"); + return ret; } - kfree(crat_image); - } else if (ret == -ENODATA) { - ret = 0; - } else { - pr_err("Couldn't get CRAT table size from ACPI\n"); + + ret = kfd_parse_crat_table(crat_image, + &temp_topology_device_list, + proximity_domain); + if (ret) { + pr_err("Error parsing VCRAT table for CPU\n"); + goto err; + } + } + + kdev = list_first_entry(&temp_topology_device_list, + struct kfd_topology_device, list); + kfd_add_perf_to_topology(kdev); + + down_write(&topology_lock); + kfd_topology_update_device_list(&temp_topology_device_list, + &topology_device_list); + atomic_set(&topology_crat_proximity_domain, sys_props.num_devices-1); + ret = kfd_topology_update_sysfs(); + up_write(&topology_lock); + + if (!ret) { + sys_props.generation_count++; + kfd_update_system_properties(); + kfd_debug_print_topology(); + pr_info("Finished initializing topology\n"); + } else + pr_err("Failed to update topology in sysfs ret=%d\n", ret); + + /* For nodes with GPU, this information gets added + * when GPU is detected (kfd_topology_add_device).
+ */ + if (cpu_only_node) { + /* Add additional information to CPU only node created above */ + down_write(&topology_lock); + kdev = list_first_entry(&topology_device_list, + struct kfd_topology_device, list); + up_write(&topology_lock); + kfd_add_non_crat_information(kdev); } err: - pr_info("Finished initializing topology ret=%d\n", ret); + kfd_destroy_crat_image(crat_image); return ret; } void kfd_topology_shutdown(void) { + down_write(&topology_lock); kfd_topology_release_sysfs(); kfd_release_live_view(); -} - -static void kfd_debug_print_topology(void) -{ - struct kfd_topology_device *dev; - uint32_t i = 0; - - pr_info("DEBUG PRINT OF TOPOLOGY:"); - list_for_each_entry(dev, &topology_device_list, list) { - pr_info("Node: %d\n", i); - pr_info("\tGPU assigned: %s\n", (dev->gpu ? "yes" : "no")); - pr_info("\tCPU count: %d\n", dev->node_props.cpu_cores_count); - pr_info("\tSIMD count: %d", dev->node_props.simd_count); - i++; - } + up_write(&topology_lock); } static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu) @@ -1072,11 +1042,15 @@ static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu) uint32_t buf[7]; uint64_t local_mem_size; int i; + struct kfd_local_mem_info local_mem_info; if (!gpu) return 0; - local_mem_size = gpu->kfd2kgd->get_vmem_size(gpu->kgd); + gpu->kfd2kgd->get_local_mem_info(gpu->kgd, &local_mem_info); + + local_mem_size = local_mem_info.local_mem_size_private + + local_mem_info.local_mem_size_public; buf[0] = gpu->pdev->devfn; buf[1] = gpu->pdev->subsystem_vendor; @@ -1091,19 +1065,26 @@ static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu) return hashout; } - +/* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If + * the GPU device is not already present in the topology device + * list then return NULL. This means a new topology device has to + * be created for this GPU. + * TODO: Rather than assigning @gpu to the first topology device without + * a GPU attached, it would be better to have a more stringent check. + */ static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu) { struct kfd_topology_device *dev; struct kfd_topology_device *out_dev = NULL; + down_write(&topology_lock); list_for_each_entry(dev, &topology_device_list, list) if (!dev->gpu && (dev->node_props.simd_count > 0)) { dev->gpu = gpu; out_dev = dev; break; } - + up_write(&topology_lock); return out_dev; } @@ -1115,84 +1096,196 @@ static void kfd_notify_gpu_change(uint32_t gpu_id, int arrival) */ } +/* kfd_fill_mem_clk_max_info - Since CRAT doesn't have memory clock info, + * patch this after CRAT parsing. + */ +static void kfd_fill_mem_clk_max_info(struct kfd_topology_device *dev) +{ + struct kfd_mem_properties *mem; + struct kfd_local_mem_info local_mem_info; + + if (!dev) + return; + + /* Currently, amdgpu driver (amdgpu_mc) deals only with GPUs with + * a single bank of VRAM local memory.
+ * for dGPUs - VCRAT reports only one bank of Local Memory + * for APUs - If CRAT from ACPI reports more than one bank, then + * all the banks will report the same mem_clk_max information + */ + dev->gpu->kfd2kgd->get_local_mem_info(dev->gpu->kgd, + &local_mem_info); + + list_for_each_entry(mem, &dev->mem_props, list) + mem->mem_clk_max = local_mem_info.mem_clk_max; +} + +static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev) +{ + struct kfd_iolink_properties *link; + + if (!dev || !dev->gpu) + return; + + /* GPU only creates direct links, so apply the flags setting to all */ + if (dev->gpu->device_info->asic_family == CHIP_HAWAII) + list_for_each_entry(link, &dev->io_link_props, list) + link->flags = CRAT_IOLINK_FLAGS_ENABLED | + CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT | + CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT; +} + int kfd_topology_add_device(struct kfd_dev *gpu) { uint32_t gpu_id; struct kfd_topology_device *dev; - int res; + struct kfd_cu_info cu_info; + int res = 0; + struct list_head temp_topology_device_list; + void *crat_image = NULL; + size_t image_size = 0; + int proximity_domain; + + INIT_LIST_HEAD(&temp_topology_device_list); gpu_id = kfd_generate_gpu_id(gpu); pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id); - down_write(&topology_lock); - /* - * Try to assign the GPU to existing topology device (generated from - * CRAT table + proximity_domain = atomic_inc_return(&topology_crat_proximity_domain); + + /* Check to see if this gpu device exists in the topology_device_list. + * If so, assign the gpu to that device, + * else create a Virtual CRAT for this gpu device and then parse that + * CRAT to create a new topology device. Once created, assign the gpu to + * that topology device */ dev = kfd_assign_gpu(gpu); if (!dev) { - pr_info("GPU was not found in the current topology. Extending.\n"); - kfd_debug_print_topology(); - dev = kfd_create_topology_device(); - if (!dev) { - res = -ENOMEM; + res = kfd_create_crat_image_virtual(&crat_image, &image_size, + COMPUTE_UNIT_GPU, gpu, + proximity_domain); + if (res) { + pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n", + gpu_id); + return res; + } + res = kfd_parse_crat_table(crat_image, + &temp_topology_device_list, + proximity_domain); + if (res) { + pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n", + gpu_id); goto err; } - dev->gpu = gpu; - /* - * TODO: Make a call to retrieve topology information from the - * GPU vBIOS - */ + down_write(&topology_lock); + kfd_topology_update_device_list(&temp_topology_device_list, + &topology_device_list); /* Update the SYSFS tree, since we added another topology * device */ - if (kfd_topology_update_sysfs() < 0) - kfd_topology_release_sysfs(); - + res = kfd_topology_update_sysfs(); + up_write(&topology_lock); + + if (!res) + sys_props.generation_count++; + else + pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology.
res=%d\n", + gpu_id, res); + dev = kfd_assign_gpu(gpu); + if (WARN_ON(!dev)) { + res = -ENODEV; + goto err; + } } dev->gpu_id = gpu_id; gpu->id = gpu_id; + + /* TODO: Move the following lines to function + * kfd_add_non_crat_information + */ + + /* Fill-in additional information that is not available in CRAT but + * needed for the topology + */ + + dev->gpu->kfd2kgd->get_cu_info(dev->gpu->kgd, &cu_info); + dev->node_props.simd_arrays_per_engine = + cu_info.num_shader_arrays_per_engine; + dev->node_props.vendor_id = gpu->pdev->vendor; dev->node_props.device_id = gpu->pdev->device; - dev->node_props.location_id = (gpu->pdev->bus->number << 24) + - (gpu->pdev->devfn & 0xffffff); - /* - * TODO: Retrieve max engine clock values from KGD - */ + dev->node_props.location_id = PCI_DEVID(gpu->pdev->bus->number, + gpu->pdev->devfn); + dev->node_props.max_engine_clk_fcompute = + dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz(dev->gpu->kgd); + dev->node_props.max_engine_clk_ccompute = + cpufreq_quick_get_max(0) / 1000; + + kfd_fill_mem_clk_max_info(dev); + kfd_fill_iolink_non_crat_info(dev); + + switch (dev->gpu->device_info->asic_family) { + case CHIP_KAVERI: + case CHIP_HAWAII: + case CHIP_TONGA: + dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_PRE_1_0 << + HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & + HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); + break; + case CHIP_CARRIZO: + case CHIP_FIJI: + case CHIP_POLARIS10: + case CHIP_POLARIS11: + pr_debug("Adding doorbell packet type capability\n"); + dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_1_0 << + HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & + HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); + break; + default: + WARN(1, "Unexpected ASIC family %u", + dev->gpu->device_info->asic_family); + } + /* Fix errors in CZ CRAT. + * simd_count: Carrizo CRAT reports wrong simd_count, probably + * because it doesn't consider masked out CUs + * max_waves_per_simd: Carrizo reports wrong max_waves_per_simd + * capability flag: Carrizo CRAT doesn't report IOMMU flags + */ if (dev->gpu->device_info->asic_family == CHIP_CARRIZO) { - dev->node_props.capability |= HSA_CAP_DOORBELL_PACKET_TYPE; - pr_info("Adding doorbell packet type capability\n"); + dev->node_props.simd_count = + cu_info.simd_per_cu * cu_info.cu_active_number; + dev->node_props.max_waves_per_simd = 10; + dev->node_props.capability |= HSA_CAP_ATS_PRESENT; } - res = 0; - -err: - up_write(&topology_lock); + kfd_debug_print_topology(); - if (res == 0) + if (!res) kfd_notify_gpu_change(gpu_id, 1); - +err: + kfd_destroy_crat_image(crat_image); return res; } int kfd_topology_remove_device(struct kfd_dev *gpu) { - struct kfd_topology_device *dev; + struct kfd_topology_device *dev, *tmp; uint32_t gpu_id; int res = -ENODEV; down_write(&topology_lock); - list_for_each_entry(dev, &topology_device_list, list) + list_for_each_entry_safe(dev, tmp, &topology_device_list, list) if (dev->gpu == gpu) { gpu_id = dev->gpu_id; kfd_remove_sysfs_node_entry(dev); kfd_release_topology_device(dev); + sys_props.num_devices--; res = 0; if (kfd_topology_update_sysfs() < 0) kfd_topology_release_sysfs(); @@ -1201,28 +1294,32 @@ int kfd_topology_remove_device(struct kfd_dev *gpu) up_write(&topology_lock); - if (res == 0) + if (!res) kfd_notify_gpu_change(gpu_id, 0); return res; } -/* - * When idx is out of bounds, the function will return NULL +/* kfd_topology_enum_kfd_devices - Enumerate through all devices in KFD + * topology. 
If a GPU device is found at @idx, then a valid kfd_dev pointer is + * returned through @kdev + * Return - 0: On success (@kdev will be NULL for non-GPU nodes) + * -1: If end of list */ -struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx) +int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev) { struct kfd_topology_device *top_dev; - struct kfd_dev *device = NULL; uint8_t device_idx = 0; + *kdev = NULL; down_read(&topology_lock); list_for_each_entry(top_dev, &topology_device_list, list) { if (device_idx == idx) { - device = top_dev->gpu; - break; + *kdev = top_dev->gpu; + up_read(&topology_lock); + return 0; } device_idx++; @@ -1230,6 +1327,88 @@ struct kfd_dev *kfd_topology_enum_kfd_devices(uint8_t idx) up_read(&topology_lock); - return device; + return -1; + +} + +static int kfd_cpumask_to_apic_id(const struct cpumask *cpumask) +{ + const struct cpuinfo_x86 *cpuinfo; + int first_cpu_of_numa_node; + + if (!cpumask || cpumask == cpu_none_mask) + return -1; + first_cpu_of_numa_node = cpumask_first(cpumask); + if (first_cpu_of_numa_node >= nr_cpu_ids) + return -1; + cpuinfo = &cpu_data(first_cpu_of_numa_node); + return cpuinfo->apicid; } + +/* kfd_numa_node_to_apic_id - Returns the APIC ID of the first logical processor + * of the given NUMA node (numa_node_id) + * Return -1 on failure + */ +int kfd_numa_node_to_apic_id(int numa_node_id) +{ + if (numa_node_id == -1) { + pr_warn("Invalid NUMA Node. Use online CPU mask\n"); + return kfd_cpumask_to_apic_id(cpu_online_mask); + } + return kfd_cpumask_to_apic_id(cpumask_of_node(numa_node_id)); +} + +#if defined(CONFIG_DEBUG_FS) + +int kfd_debugfs_hqds_by_device(struct seq_file *m, void *data) +{ + struct kfd_topology_device *dev; + unsigned int i = 0; + int r = 0; + + down_read(&topology_lock); + + list_for_each_entry(dev, &topology_device_list, list) { + if (!dev->gpu) { + i++; + continue; + } + + seq_printf(m, "Node %u, gpu_id %x:\n", i++, dev->gpu->id); + r = dqm_debugfs_hqds(m, dev->gpu->dqm); + if (r) + break; + } + + up_read(&topology_lock); + + return r; +} + +int kfd_debugfs_rls_by_device(struct seq_file *m, void *data) +{ + struct kfd_topology_device *dev; + unsigned int i = 0; + int r = 0; + + down_read(&topology_lock); + + list_for_each_entry(dev, &topology_device_list, list) { + if (!dev->gpu) { + i++; + continue; + } + + seq_printf(m, "Node %u, gpu_id %x:\n", i++, dev->gpu->id); + r = pm_debugfs_runlist(m, &dev->gpu->dqm->packets); + if (r) + break; + } + + up_read(&topology_lock); + + return r; +} + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h index c3ddb9b95ff8..53fca1f45401 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h @@ -39,8 +39,13 @@ #define HSA_CAP_WATCH_POINTS_SUPPORTED 0x00000080 #define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00 #define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8 -#define HSA_CAP_RESERVED 0xfffff000 -#define HSA_CAP_DOORBELL_PACKET_TYPE 0x00001000 +#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK 0x00003000 +#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT 12 +#define HSA_CAP_RESERVED 0xffffc000 + +#define HSA_CAP_DOORBELL_TYPE_PRE_1_0 0x0 +#define HSA_CAP_DOORBELL_TYPE_1_0 0x1 +#define HSA_CAP_AQL_QUEUE_DOUBLE_MAP 0x00004000 struct kfd_node_properties { uint32_t cpu_cores_count; @@ -91,8 +96,6 @@ struct kfd_mem_properties { struct attribute attr; }; -#define KFD_TOPOLOGY_CPU_SIBLINGS 256 - #define HSA_CACHE_TYPE_DATA 0x00000001 #define HSA_CACHE_TYPE_INSTRUCTION 0x00000002 #define
HSA_CACHE_TYPE_CPU 0x00000004 @@ -109,7 +112,7 @@ struct kfd_cache_properties { uint32_t cache_assoc; uint32_t cache_latency; uint32_t cache_type; - uint8_t sibling_map[KFD_TOPOLOGY_CPU_SIBLINGS]; + uint8_t sibling_map[CRAT_SIBLINGMAP_SIZE]; struct kobject *kobj; struct attribute attr; }; @@ -132,24 +135,36 @@ struct kfd_iolink_properties { struct attribute attr; }; +struct kfd_perf_properties { + struct list_head list; + char block_name[16]; + uint32_t max_concurrent; + struct attribute_group *attr_group; +}; + struct kfd_topology_device { struct list_head list; uint32_t gpu_id; + uint32_t proximity_domain; struct kfd_node_properties node_props; - uint32_t mem_bank_count; struct list_head mem_props; uint32_t cache_count; struct list_head cache_props; uint32_t io_link_count; struct list_head io_link_props; + struct list_head perf_props; struct kfd_dev *gpu; struct kobject *kobj_node; struct kobject *kobj_mem; struct kobject *kobj_cache; struct kobject *kobj_iolink; + struct kobject *kobj_perf; struct attribute attr_gpuid; struct attribute attr_name; struct attribute attr_props; + uint8_t oem_id[CRAT_OEMID_LENGTH]; + uint8_t oem_table_id[CRAT_OEMTABLEID_LENGTH]; + uint32_t oem_revision; }; struct kfd_system_properties { @@ -164,6 +179,12 @@ struct kfd_system_properties { struct attribute attr_props; }; +struct kfd_topology_device *kfd_create_topology_device( + struct list_head *device_list); +void kfd_release_topology_device_list(struct list_head *device_list); +extern bool amd_iommu_pc_supported(void); +extern u8 amd_iommu_pc_get_max_banks(u16 devid); +extern u8 amd_iommu_pc_get_max_counters(u16 devid); #endif /* __KFD_TOPOLOGY_H__ */ diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h index f516fd10e6ba..a6752bd0c871 100644 --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h @@ -46,6 +46,28 @@ enum kfd_preempt_type { KFD_PREEMPT_TYPE_WAVEFRONT_RESET, }; +struct kfd_cu_info { + uint32_t num_shader_engines; + uint32_t num_shader_arrays_per_engine; + uint32_t num_cu_per_sh; + uint32_t cu_active_number; + uint32_t cu_ao_mask; + uint32_t simd_per_cu; + uint32_t max_waves_per_simd; + uint32_t wave_front_size; + uint32_t max_scratch_slots_per_cu; + uint32_t lds_size; + uint32_t cu_bitmap[4][4]; +}; + +/* For getting GPU local memory information from KGD */ +struct kfd_local_mem_info { + uint64_t local_mem_size_private; + uint64_t local_mem_size_public; + uint32_t vram_width; + uint32_t mem_clk_max; +}; + enum kgd_memory_pool { KGD_POOL_SYSTEM_CACHEABLE = 1, KGD_POOL_SYSTEM_WRITECOMBINE = 2, @@ -106,7 +128,7 @@ struct tile_config { * * @free_gtt_mem: Frees a buffer that was allocated on the gart aperture * - * @get_vmem_size: Retrieves (physical) size of VRAM + * @get_local_mem_info: Retrieves information about GPU local memory * * @get_gpu_clock_counter: Retrieves GPU clock counter * @@ -131,6 +153,12 @@ struct tile_config { * @hqd_sdma_load: Loads the SDMA mqd structure to a H/W SDMA hqd slot. * used only for no HWS mode. * + * @hqd_dump: Dumps CPC HQD registers to an array of address-value pairs. + * Array is allocated with kmalloc and needs to be freed with kfree by the caller. + * + * @hqd_sdma_dump: Dumps SDMA HQD registers to an array of address-value pairs. + * Array is allocated with kmalloc and needs to be freed with kfree by the caller. + * * @hqd_is_occupied: Checks if a hqd slot is occupied. * * @hqd_destroy: Destructs and preempts the queue assigned to that hqd slot.
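The HSA_CAP_DOORBELL_TYPE_* defines in the kfd_topology.h hunk above carve a two-bit field (bits 13:12) out of the capability word; kfd_topology_add_device() fills it with the shift-and-mask pattern shown earlier in this patch. A compile-time sketch of that encode, plus an assumed decode helper such as a consumer of the capability word might use (the macro values are copied from the hunk, the helper names are illustrative):

#include <stdint.h>
#include <stdio.h>

#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK  0x00003000
#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT 12
#define HSA_CAP_DOORBELL_TYPE_PRE_1_0         0x0
#define HSA_CAP_DOORBELL_TYPE_1_0             0x1

static uint32_t cap_set_doorbell_type(uint32_t cap, uint32_t type)
{
	/* Same shift-then-mask pattern used in kfd_topology_add_device() */
	return cap | ((type << HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
		      HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
}

static uint32_t cap_get_doorbell_type(uint32_t cap)
{
	/* Assumed decode: mask the field, then shift it back down */
	return (cap & HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK) >>
	       HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT;
}

int main(void)
{
	uint32_t cap = cap_set_doorbell_type(0, HSA_CAP_DOORBELL_TYPE_1_0);

	printf("doorbell type = %u\n", cap_get_doorbell_type(cap));
	return 0;
}

Masking after the shift keeps a bogus type value from spilling into neighbouring capability bits, which is why the kernel code applies the mask rather than trusting the input.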
@@ -147,6 +175,10 @@ struct tile_config { * * @get_tile_config: Returns GPU-specific tiling mode information * + * @get_cu_info: Retrieves activated cu info + * + * @get_vram_usage: Returns current VRAM usage + * * This structure contains function pointers to services that the kgd driver * provides to amdkfd driver. * @@ -158,7 +190,8 @@ struct kfd2kgd_calls { void (*free_gtt_mem)(struct kgd_dev *kgd, void *mem_obj); - uint64_t (*get_vmem_size)(struct kgd_dev *kgd); + void (*get_local_mem_info)(struct kgd_dev *kgd, + struct kfd_local_mem_info *mem_info); uint64_t (*get_gpu_clock_counter)(struct kgd_dev *kgd); uint32_t (*get_max_engine_clock_in_mhz)(struct kgd_dev *kgd); @@ -184,7 +217,16 @@ struct kfd2kgd_calls { uint32_t wptr_shift, uint32_t wptr_mask, struct mm_struct *mm); - int (*hqd_sdma_load)(struct kgd_dev *kgd, void *mqd); + int (*hqd_sdma_load)(struct kgd_dev *kgd, void *mqd, + uint32_t __user *wptr, struct mm_struct *mm); + + int (*hqd_dump)(struct kgd_dev *kgd, + uint32_t pipe_id, uint32_t queue_id, + uint32_t (**dump)[2], uint32_t *n_regs); + + int (*hqd_sdma_dump)(struct kgd_dev *kgd, + uint32_t engine_id, uint32_t queue_id, + uint32_t (**dump)[2], uint32_t *n_regs); bool (*hqd_is_occupied)(struct kgd_dev *kgd, uint64_t queue_address, uint32_t pipe_id, uint32_t queue_id); @@ -224,6 +266,10 @@ struct kfd2kgd_calls { void (*set_scratch_backing_va)(struct kgd_dev *kgd, uint64_t va, uint32_t vmid); int (*get_tile_config)(struct kgd_dev *kgd, struct tile_config *config); + + void (*get_cu_info)(struct kgd_dev *kgd, + struct kfd_cu_info *cu_info); + uint64_t (*get_vram_usage)(struct kgd_dev *kgd); }; /** diff --git a/drivers/gpu/drm/amd/include/vi_structs.h b/drivers/gpu/drm/amd/include/vi_structs.h index 20234820194b..717fbae1d362 100644 --- a/drivers/gpu/drm/amd/include/vi_structs.h +++ b/drivers/gpu/drm/amd/include/vi_structs.h @@ -153,6 +153,8 @@ struct vi_sdma_mqd { uint32_t reserved_125; uint32_t reserved_126; uint32_t reserved_127; + uint32_t sdma_engine_id; + uint32_t sdma_queue_id; }; struct vi_mqd { diff --git a/drivers/gpu/drm/armada/armada_crtc.c b/drivers/gpu/drm/armada/armada_crtc.c index 2e065facdce7..e2adfbef7d6b 100644 --- a/drivers/gpu/drm/armada/armada_crtc.c +++ b/drivers/gpu/drm/armada/armada_crtc.c @@ -13,6 +13,7 @@ #include <drm/drmP.h> #include <drm/drm_crtc_helper.h> #include <drm/drm_plane_helper.h> +#include <drm/drm_atomic_helper.h> #include "armada_crtc.h" #include "armada_drm.h" #include "armada_fb.h" @@ -20,13 +21,6 @@ #include "armada_hw.h" #include "armada_trace.h" -struct armada_frame_work { - struct armada_plane_work work; - struct drm_pending_vblank_event *event; - struct armada_regs regs[4]; - struct drm_framebuffer *old_fb; -}; - enum csc_mode { CSC_AUTO = 0, CSC_YUV_CCIR601 = 1, @@ -168,16 +162,23 @@ static void armada_drm_crtc_update(struct armada_crtc *dcrtc) void armada_drm_plane_calc_addrs(u32 *addrs, struct drm_framebuffer *fb, int x, int y) { + const struct drm_format_info *format = fb->format; + unsigned int num_planes = format->num_planes; u32 addr = drm_fb_obj(fb)->dev_addr; - int num_planes = fb->format->num_planes; int i; if (num_planes > 3) num_planes = 3; - for (i = 0; i < num_planes; i++) + addrs[0] = addr + fb->offsets[0] + y * fb->pitches[0] + + x * format->cpp[0]; + + y /= format->vsub; + x /= format->hsub; + + for (i = 1; i < num_planes; i++) addrs[i] = addr + fb->offsets[i] + y * fb->pitches[i] + - x * fb->format->cpp[i]; + x * format->cpp[i]; for (; i < 3; i++) addrs[i] = 0; } @@ -209,6 +210,38 @@ static unsigned 
armada_drm_crtc_calc_fb(struct drm_framebuffer *fb, return i; } +static void armada_drm_plane_work_call(struct armada_crtc *dcrtc, + struct armada_plane_work *work, + void (*fn)(struct armada_crtc *, struct armada_plane_work *)) +{ + struct armada_plane *dplane = drm_to_armada_plane(work->plane); + struct drm_pending_vblank_event *event; + struct drm_framebuffer *fb; + + if (fn) + fn(dcrtc, work); + drm_crtc_vblank_put(&dcrtc->crtc); + + event = work->event; + fb = work->old_fb; + if (event || fb) { + struct drm_device *dev = dcrtc->crtc.dev; + unsigned long flags; + + spin_lock_irqsave(&dev->event_lock, flags); + if (event) + drm_crtc_send_vblank_event(&dcrtc->crtc, event); + if (fb) + __armada_drm_queue_unref_work(dev, fb); + spin_unlock_irqrestore(&dev->event_lock, flags); + } + + if (work->need_kfree) + kfree(work); + + wake_up(&dplane->frame_wait); +} + static void armada_drm_plane_work_run(struct armada_crtc *dcrtc, struct drm_plane *plane) { @@ -216,24 +249,19 @@ static void armada_drm_plane_work_run(struct armada_crtc *dcrtc, struct armada_plane_work *work = xchg(&dplane->work, NULL); /* Handle any pending frame work. */ - if (work) { - work->fn(dcrtc, dplane, work); - drm_crtc_vblank_put(&dcrtc->crtc); - } - - wake_up(&dplane->frame_wait); + if (work) + armada_drm_plane_work_call(dcrtc, work, work->fn); } int armada_drm_plane_work_queue(struct armada_crtc *dcrtc, - struct armada_plane *plane, struct armada_plane_work *work) + struct armada_plane_work *work) { + struct armada_plane *plane = drm_to_armada_plane(work->plane); int ret; ret = drm_crtc_vblank_get(&dcrtc->crtc); - if (ret) { - DRM_ERROR("failed to acquire vblank counter\n"); + if (ret) return ret; - } ret = cmpxchg(&plane->work, NULL, work) ? -EBUSY : 0; if (ret) @@ -247,51 +275,60 @@ int armada_drm_plane_work_wait(struct armada_plane *plane, long timeout) return wait_event_timeout(plane->frame_wait, !plane->work, timeout); } -struct armada_plane_work *armada_drm_plane_work_cancel( - struct armada_crtc *dcrtc, struct armada_plane *plane) +void armada_drm_plane_work_cancel(struct armada_crtc *dcrtc, + struct armada_plane *dplane) { - struct armada_plane_work *work = xchg(&plane->work, NULL); + struct armada_plane_work *work = xchg(&dplane->work, NULL); if (work) - drm_crtc_vblank_put(&dcrtc->crtc); - - return work; + armada_drm_plane_work_call(dcrtc, work, work->cancel); } -static int armada_drm_crtc_queue_frame_work(struct armada_crtc *dcrtc, - struct armada_frame_work *work) +static void armada_drm_crtc_complete_frame_work(struct armada_crtc *dcrtc, + struct armada_plane_work *work) { - struct armada_plane *plane = drm_to_armada_plane(dcrtc->crtc.primary); + unsigned long flags; - return armada_drm_plane_work_queue(dcrtc, plane, &work->work); + spin_lock_irqsave(&dcrtc->irq_lock, flags); + armada_drm_crtc_update_regs(dcrtc, work->regs); + spin_unlock_irqrestore(&dcrtc->irq_lock, flags); } -static void armada_drm_crtc_complete_frame_work(struct armada_crtc *dcrtc, - struct armada_plane *plane, struct armada_plane_work *work) +static void armada_drm_crtc_complete_disable_work(struct armada_crtc *dcrtc, + struct armada_plane_work *work) { - struct armada_frame_work *fwork = container_of(work, struct armada_frame_work, work); - struct drm_device *dev = dcrtc->crtc.dev; unsigned long flags; + if (dcrtc->plane == work->plane) + dcrtc->plane = NULL; + spin_lock_irqsave(&dcrtc->irq_lock, flags); - armada_drm_crtc_update_regs(dcrtc, fwork->regs); + armada_drm_crtc_update_regs(dcrtc, work->regs); 
spin_unlock_irqrestore(&dcrtc->irq_lock, flags); +} - if (fwork->event) { - spin_lock_irqsave(&dev->event_lock, flags); - drm_crtc_send_vblank_event(&dcrtc->crtc, fwork->event); - spin_unlock_irqrestore(&dev->event_lock, flags); - } +static struct armada_plane_work * +armada_drm_crtc_alloc_plane_work(struct drm_plane *plane) +{ + struct armada_plane_work *work; + int i = 0; + + work = kzalloc(sizeof(*work), GFP_KERNEL); + if (!work) + return NULL; - /* Finally, queue the process-half of the cleanup. */ - __armada_drm_queue_unref_work(dcrtc->crtc.dev, fwork->old_fb); - kfree(fwork); + work->plane = plane; + work->fn = armada_drm_crtc_complete_frame_work; + work->need_kfree = true; + armada_reg_queue_end(work->regs, i); + + return work; } static void armada_drm_crtc_finish_fb(struct armada_crtc *dcrtc, struct drm_framebuffer *fb, bool force) { - struct armada_frame_work *work; + struct armada_plane_work *work; if (!fb) return; @@ -302,15 +339,11 @@ static void armada_drm_crtc_finish_fb(struct armada_crtc *dcrtc, return; } - work = kmalloc(sizeof(*work), GFP_KERNEL); + work = armada_drm_crtc_alloc_plane_work(dcrtc->crtc.primary); if (work) { - int i = 0; - work->work.fn = armada_drm_crtc_complete_frame_work; - work->event = NULL; work->old_fb = fb; - armada_reg_queue_end(work->regs, i); - if (armada_drm_crtc_queue_frame_work(dcrtc, work) == 0) + if (armada_drm_plane_work_queue(dcrtc, work) == 0) return; kfree(work); @@ -373,8 +406,11 @@ static void armada_drm_crtc_prepare(struct drm_crtc *crtc) * the new mode parameters. */ plane = dcrtc->plane; - if (plane) + if (plane) { drm_plane_force_disable(plane); + WARN_ON(!armada_drm_plane_work_wait(drm_to_armada_plane(plane), + HZ)); + } } /* The mode_config.mutex will be held for this call */ @@ -440,11 +476,11 @@ static void armada_drm_crtc_irq(struct armada_crtc *dcrtc, u32 stat) if (stat & VSYNC_IRQ) drm_crtc_handle_vblank(&dcrtc->crtc); - spin_lock(&dcrtc->irq_lock); ovl_plane = dcrtc->plane; if (ovl_plane) armada_drm_plane_work_run(dcrtc, ovl_plane); + spin_lock(&dcrtc->irq_lock); if (stat & GRA_FRAME_IRQ && dcrtc->interlaced) { int i = stat & GRA_FRAME_IRQ0 ? 
0 : 1; uint32_t val; @@ -536,18 +572,14 @@ static uint32_t armada_drm_crtc_calculate_csc(struct armada_crtc *dcrtc) return val; } -static void armada_drm_primary_set(struct drm_crtc *crtc, - struct drm_plane *plane, int x, int y) +static void armada_drm_gra_plane_regs(struct armada_regs *regs, + struct drm_framebuffer *fb, struct armada_plane_state *state, + int x, int y, bool interlaced) { - struct armada_plane_state *state = &drm_to_armada_plane(plane)->state; - struct armada_crtc *dcrtc = drm_to_armada_crtc(crtc); - struct armada_regs regs[8]; - bool interlaced = dcrtc->interlaced; - unsigned i; + unsigned int i; u32 ctrl0; - i = armada_drm_crtc_calc_fb(plane->fb, x, y, regs, interlaced); - + i = armada_drm_crtc_calc_fb(fb, x, y, regs, interlaced); armada_reg_queue_set(regs, i, state->dst_yx, LCD_SPU_GRA_OVSA_HPXL_VLN); armada_reg_queue_set(regs, i, state->src_hw, LCD_SPU_GRA_HPXL_VLN); armada_reg_queue_set(regs, i, state->dst_hw, LCD_SPU_GZM_HPXL_VLN); @@ -559,9 +591,21 @@ static void armada_drm_primary_set(struct drm_crtc *crtc, armada_reg_queue_mod(regs, i, ctrl0, CFG_GRAFORMAT | CFG_GRA_MOD(CFG_SWAPRB | CFG_SWAPUV | CFG_SWAPYU | CFG_YUV2RGB) | - CFG_PALETTE_ENA | CFG_GRA_FTOGGLE, + CFG_PALETTE_ENA | CFG_GRA_FTOGGLE | + CFG_GRA_HSMOOTH | CFG_GRA_ENA, LCD_SPU_DMA_CTRL0); armada_reg_queue_end(regs, i); +} + +static void armada_drm_primary_set(struct drm_crtc *crtc, + struct drm_plane *plane, int x, int y) +{ + struct armada_plane_state *state = &drm_to_armada_plane(plane)->state; + struct armada_crtc *dcrtc = drm_to_armada_crtc(crtc); + struct armada_regs regs[8]; + bool interlaced = dcrtc->interlaced; + + armada_drm_gra_plane_regs(regs, plane->fb, state, x, y, interlaced); armada_drm_crtc_update_regs(dcrtc, regs); } @@ -581,7 +625,7 @@ static int armada_drm_crtc_mode_set(struct drm_crtc *crtc, interlaced = !!(adj->flags & DRM_MODE_FLAG_INTERLACE); - val = CFG_GRA_ENA | CFG_GRA_HSMOOTH; + val = CFG_GRA_ENA; val |= CFG_GRA_FMT(drm_fb_to_armada_fb(dcrtc->crtc.primary->fb)->fmt); val |= CFG_GRA_MOD(drm_fb_to_armada_fb(dcrtc->crtc.primary->fb)->mod); @@ -633,8 +677,6 @@ static int armada_drm_crtc_mode_set(struct drm_crtc *crtc, /* Now compute the divider for real */ dcrtc->variant->compute_clock(dcrtc, adj, &sclk); - /* Ensure graphic fifo is enabled */ - armada_reg_queue_mod(regs, i, 0, CFG_PDWN64x66, LCD_SPU_SRAM_PARA1); armada_reg_queue_set(regs, i, sclk, LCD_CFG_SCLK_DIV); if (interlaced ^ dcrtc->interlaced) { @@ -647,6 +689,9 @@ static int armada_drm_crtc_mode_set(struct drm_crtc *crtc, spin_lock_irqsave(&dcrtc->irq_lock, flags); + /* Ensure graphic fifo is enabled */ + armada_reg_queue_mod(regs, i, 0, CFG_PDWN64x66, LCD_SPU_SRAM_PARA1); + /* Even interlaced/progressive frame */ dcrtc->v[1].spu_v_h_total = adj->crtc_vtotal << 16 | adj->crtc_htotal; @@ -729,47 +774,13 @@ static int armada_drm_crtc_mode_set_base(struct drm_crtc *crtc, int x, int y, return 0; } -void armada_drm_crtc_plane_disable(struct armada_crtc *dcrtc, - struct drm_plane *plane) -{ - u32 sram_para1, dma_ctrl0_mask; - - /* - * Drop our reference on any framebuffer attached to this plane. - * We don't need to NULL this out as drm_plane_force_disable(), - * and __setplane_internal() will do so for an overlay plane, and - * __drm_helper_disable_unused_functions() will do so for the - * primary plane. 
- */ - if (plane->fb) - drm_framebuffer_put(plane->fb); - - /* Power down the Y/U/V FIFOs */ - sram_para1 = CFG_PDWN16x66 | CFG_PDWN32x66; - - /* Power down most RAMs and FIFOs if this is the primary plane */ - if (plane->type == DRM_PLANE_TYPE_PRIMARY) { - sram_para1 |= CFG_PDWN256x32 | CFG_PDWN256x24 | CFG_PDWN256x8 | - CFG_PDWN32x32 | CFG_PDWN64x66; - dma_ctrl0_mask = CFG_GRA_ENA; - } else { - dma_ctrl0_mask = CFG_DMA_ENA; - } - - spin_lock_irq(&dcrtc->irq_lock); - armada_updatel(0, dma_ctrl0_mask, dcrtc->base + LCD_SPU_DMA_CTRL0); - spin_unlock_irq(&dcrtc->irq_lock); - - armada_updatel(sram_para1, 0, dcrtc->base + LCD_SPU_SRAM_PARA1); -} - /* The mode_config.mutex will be held for this call */ static void armada_drm_crtc_disable(struct drm_crtc *crtc) { - struct armada_crtc *dcrtc = drm_to_armada_crtc(crtc); - armada_drm_crtc_dpms(crtc, DRM_MODE_DPMS_OFF); - armada_drm_crtc_plane_disable(dcrtc, crtc->primary); + + /* Disable our primary plane when we disable the CRTC. */ + crtc->primary->funcs->disable_plane(crtc->primary, NULL); } static const struct drm_crtc_helper_funcs armada_crtc_helper_funcs = { @@ -879,9 +890,11 @@ static int armada_drm_crtc_cursor_update(struct armada_crtc *dcrtc, bool reload) return 0; } + spin_lock_irq(&dcrtc->irq_lock); para1 = readl_relaxed(dcrtc->base + LCD_SPU_SRAM_PARA1); armada_updatel(CFG_CSB_256x32, CFG_CSB_256x32 | CFG_PDWN256x32, dcrtc->base + LCD_SPU_SRAM_PARA1); + spin_unlock_irq(&dcrtc->irq_lock); /* * Initialize the transparency if the SRAM was powered down. @@ -1021,7 +1034,7 @@ static int armada_drm_crtc_page_flip(struct drm_crtc *crtc, struct drm_modeset_acquire_ctx *ctx) { struct armada_crtc *dcrtc = drm_to_armada_crtc(crtc); - struct armada_frame_work *work; + struct armada_plane_work *work; unsigned i; int ret; @@ -1029,11 +1042,10 @@ static int armada_drm_crtc_page_flip(struct drm_crtc *crtc, if (fb->format != crtc->primary->fb->format) return -EINVAL; - work = kmalloc(sizeof(*work), GFP_KERNEL); + work = armada_drm_crtc_alloc_plane_work(dcrtc->crtc.primary); if (!work) return -ENOMEM; - work->work.fn = armada_drm_crtc_complete_frame_work; work->event = event; work->old_fb = dcrtc->crtc.primary->fb; @@ -1047,7 +1059,7 @@ static int armada_drm_crtc_page_flip(struct drm_crtc *crtc, */ drm_framebuffer_get(fb); - ret = armada_drm_crtc_queue_frame_work(dcrtc, work); + ret = armada_drm_plane_work_queue(dcrtc, work); if (ret) { /* Undo our reference above */ drm_framebuffer_put(fb); @@ -1127,14 +1139,195 @@ static const struct drm_crtc_funcs armada_crtc_funcs = { .disable_vblank = armada_drm_crtc_disable_vblank, }; +static void armada_drm_primary_update_state(struct drm_plane_state *state, + struct armada_regs *regs) +{ + struct armada_plane *dplane = drm_to_armada_plane(state->plane); + struct armada_crtc *dcrtc = drm_to_armada_crtc(state->crtc); + struct armada_framebuffer *dfb = drm_fb_to_armada_fb(state->fb); + bool was_disabled; + unsigned int idx = 0; + u32 val; + + val = CFG_GRA_FMT(dfb->fmt) | CFG_GRA_MOD(dfb->mod); + if (dfb->fmt > CFG_420) + val |= CFG_PALETTE_ENA; + if (state->visible) + val |= CFG_GRA_ENA; + if (drm_rect_width(&state->src) >> 16 != drm_rect_width(&state->dst)) + val |= CFG_GRA_HSMOOTH; + + was_disabled = !(dplane->state.ctrl0 & CFG_GRA_ENA); + if (was_disabled) + armada_reg_queue_mod(regs, idx, + 0, CFG_PDWN64x66, LCD_SPU_SRAM_PARA1); + + dplane->state.ctrl0 = val; + dplane->state.src_hw = (drm_rect_height(&state->src) & 0xffff0000) | + drm_rect_width(&state->src) >> 16; + dplane->state.dst_hw = 
drm_rect_height(&state->dst) << 16 | + drm_rect_width(&state->dst); + dplane->state.dst_yx = state->dst.y1 << 16 | state->dst.x1; + + armada_drm_gra_plane_regs(regs + idx, &dfb->fb, &dplane->state, + state->src.x1 >> 16, state->src.y1 >> 16, + dcrtc->interlaced); + + dplane->state.vsync_update = !was_disabled; + dplane->state.changed = true; +} + +static int armada_drm_primary_update(struct drm_plane *plane, + struct drm_crtc *crtc, struct drm_framebuffer *fb, + int crtc_x, int crtc_y, unsigned int crtc_w, unsigned int crtc_h, + uint32_t src_x, uint32_t src_y, uint32_t src_w, uint32_t src_h, + struct drm_modeset_acquire_ctx *ctx) +{ + struct armada_plane *dplane = drm_to_armada_plane(plane); + struct armada_crtc *dcrtc = drm_to_armada_crtc(crtc); + struct armada_plane_work *work; + struct drm_plane_state state = { + .plane = plane, + .crtc = crtc, + .fb = fb, + .src_x = src_x, + .src_y = src_y, + .src_w = src_w, + .src_h = src_h, + .crtc_x = crtc_x, + .crtc_y = crtc_y, + .crtc_w = crtc_w, + .crtc_h = crtc_h, + .rotation = DRM_MODE_ROTATE_0, + }; + const struct drm_rect clip = { + .x2 = crtc->mode.hdisplay, + .y2 = crtc->mode.vdisplay, + }; + int ret; + + ret = drm_atomic_helper_check_plane_state(&state, crtc->state, &clip, 0, + INT_MAX, true, false); + if (ret) + return ret; + + work = &dplane->works[dplane->next_work]; + work->fn = armada_drm_crtc_complete_frame_work; + + if (plane->fb != fb) { + /* + * Take a reference on the new framebuffer - we want to + * hold on to it while the hardware is displaying it. + */ + drm_framebuffer_reference(fb); + + work->old_fb = plane->fb; + } else { + work->old_fb = NULL; + } + + armada_drm_primary_update_state(&state, work->regs); + + if (!dplane->state.changed) + return 0; + + /* Wait for pending work to complete */ + if (armada_drm_plane_work_wait(dplane, HZ / 10) == 0) + armada_drm_plane_work_cancel(dcrtc, dplane); + + if (!dplane->state.vsync_update) { + work->fn(dcrtc, work); + if (work->old_fb) + drm_framebuffer_unreference(work->old_fb); + return 0; + } + + /* Queue it for update on the next interrupt if we are enabled */ + ret = armada_drm_plane_work_queue(dcrtc, work); + if (ret) { + work->fn(dcrtc, work); + if (work->old_fb) + drm_framebuffer_unreference(work->old_fb); + } + + dplane->next_work = !dplane->next_work; + + return 0; +} + +int armada_drm_plane_disable(struct drm_plane *plane, + struct drm_modeset_acquire_ctx *ctx) +{ + struct armada_plane *dplane = drm_to_armada_plane(plane); + struct armada_crtc *dcrtc; + struct armada_plane_work *work; + unsigned int idx = 0; + u32 sram_para1, enable_mask; + + if (!plane->crtc) + return 0; + + /* + * Arrange to power down most RAMs and FIFOs if this is the primary + * plane, otherwise just the YUV FIFOs for the overlay plane. + */ + if (plane->type == DRM_PLANE_TYPE_PRIMARY) { + sram_para1 = CFG_PDWN256x32 | CFG_PDWN256x24 | CFG_PDWN256x8 | + CFG_PDWN32x32 | CFG_PDWN64x66; + enable_mask = CFG_GRA_ENA; + } else { + sram_para1 = CFG_PDWN16x66 | CFG_PDWN32x66; + enable_mask = CFG_DMA_ENA; + } + + dplane->state.ctrl0 &= ~enable_mask; + + dcrtc = drm_to_armada_crtc(plane->crtc); + + /* + * Try to disable the plane and drop our ref on the framebuffer + * at the next frame update. If we fail for any reason, disable + * the plane immediately. 
+ */ + work = &dplane->works[dplane->next_work]; + work->fn = armada_drm_crtc_complete_disable_work; + work->cancel = armada_drm_crtc_complete_disable_work; + work->old_fb = plane->fb; + + armada_reg_queue_mod(work->regs, idx, + 0, enable_mask, LCD_SPU_DMA_CTRL0); + armada_reg_queue_mod(work->regs, idx, + sram_para1, 0, LCD_SPU_SRAM_PARA1); + armada_reg_queue_end(work->regs, idx); + + /* Wait for any preceding work to complete, but don't wedge */ + if (WARN_ON(!armada_drm_plane_work_wait(dplane, HZ))) + armada_drm_plane_work_cancel(dcrtc, dplane); + + if (armada_drm_plane_work_queue(dcrtc, work)) { + work->fn(dcrtc, work); + if (work->old_fb) + drm_framebuffer_unreference(work->old_fb); + } + + dplane->next_work = !dplane->next_work; + + return 0; +} + static const struct drm_plane_funcs armada_primary_plane_funcs = { - .update_plane = drm_primary_helper_update, - .disable_plane = drm_primary_helper_disable, + .update_plane = armada_drm_primary_update, + .disable_plane = armada_drm_plane_disable, .destroy = drm_primary_helper_destroy, }; int armada_drm_plane_init(struct armada_plane *plane) { + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(plane->works); i++) + plane->works[i].plane = &plane->base; + init_waitqueue_head(&plane->frame_wait); return 0; @@ -1225,17 +1418,13 @@ static int armada_drm_crtc_create(struct drm_device *drm, struct device *dev, ret = devm_request_irq(dev, irq, armada_drm_irq, 0, "armada_drm_crtc", dcrtc); - if (ret < 0) { - kfree(dcrtc); - return ret; - } + if (ret < 0) + goto err_crtc; if (dcrtc->variant->init) { ret = dcrtc->variant->init(dcrtc, dev); - if (ret) { - kfree(dcrtc); - return ret; - } + if (ret) + goto err_crtc; } /* Ensure AXI pipeline is enabled */ @@ -1246,13 +1435,15 @@ static int armada_drm_crtc_create(struct drm_device *drm, struct device *dev, dcrtc->crtc.port = port; primary = kzalloc(sizeof(*primary), GFP_KERNEL); - if (!primary) - return -ENOMEM; + if (!primary) { + ret = -ENOMEM; + goto err_crtc; + } ret = armada_drm_plane_init(primary); if (ret) { kfree(primary); - return ret; + goto err_crtc; } ret = drm_universal_plane_init(drm, &primary->base, 0, @@ -1263,7 +1454,7 @@ static int armada_drm_crtc_create(struct drm_device *drm, struct device *dev, DRM_PLANE_TYPE_PRIMARY, NULL); if (ret) { kfree(primary); - return ret; + goto err_crtc; } ret = drm_crtc_init_with_planes(drm, &dcrtc->crtc, &primary->base, NULL, @@ -1282,6 +1473,9 @@ static int armada_drm_crtc_create(struct drm_device *drm, struct device *dev, err_crtc_init: primary->base.funcs->destroy(&primary->base); +err_crtc: + kfree(dcrtc); + return ret; } diff --git a/drivers/gpu/drm/armada/armada_crtc.h b/drivers/gpu/drm/armada/armada_crtc.h index bab11f483575..445829b8877a 100644 --- a/drivers/gpu/drm/armada/armada_crtc.h +++ b/drivers/gpu/drm/armada/armada_crtc.h @@ -36,21 +36,31 @@ struct armada_plane; struct armada_variant; struct armada_plane_work { - void (*fn)(struct armada_crtc *, - struct armada_plane *, - struct armada_plane_work *); + void (*fn)(struct armada_crtc *, struct armada_plane_work *); + void (*cancel)(struct armada_crtc *, struct armada_plane_work *); + bool need_kfree; + struct drm_plane *plane; + struct drm_framebuffer *old_fb; + struct drm_pending_vblank_event *event; + struct armada_regs regs[14]; }; struct armada_plane_state { + u16 src_x; + u16 src_y; u32 src_hw; u32 dst_hw; u32 dst_yx; u32 ctrl0; + bool changed; + bool vsync_update; }; struct armada_plane { struct drm_plane base; wait_queue_head_t frame_wait; + bool next_work; + struct armada_plane_work 
works[2]; struct armada_plane_work *work; struct armada_plane_state state; }; @@ -58,10 +68,10 @@ struct armada_plane { int armada_drm_plane_init(struct armada_plane *plane); int armada_drm_plane_work_queue(struct armada_crtc *dcrtc, - struct armada_plane *plane, struct armada_plane_work *work); + struct armada_plane_work *work); int armada_drm_plane_work_wait(struct armada_plane *plane, long timeout); -struct armada_plane_work *armada_drm_plane_work_cancel( - struct armada_crtc *dcrtc, struct armada_plane *plane); +void armada_drm_plane_work_cancel(struct armada_crtc *dcrtc, + struct armada_plane *plane); void armada_drm_plane_calc_addrs(u32 *addrs, struct drm_framebuffer *fb, int x, int y); @@ -104,8 +114,8 @@ struct armada_crtc { void armada_drm_crtc_update_regs(struct armada_crtc *, struct armada_regs *); -void armada_drm_crtc_plane_disable(struct armada_crtc *dcrtc, - struct drm_plane *plane); +int armada_drm_plane_disable(struct drm_plane *plane, + struct drm_modeset_acquire_ctx *ctx); extern struct platform_driver armada_lcd_platform_driver; diff --git a/drivers/gpu/drm/armada/armada_overlay.c b/drivers/gpu/drm/armada/armada_overlay.c index b411b608821a..77b55adaa2ac 100644 --- a/drivers/gpu/drm/armada/armada_overlay.c +++ b/drivers/gpu/drm/armada/armada_overlay.c @@ -7,7 +7,7 @@ * published by the Free Software Foundation. */ #include <drm/drmP.h> -#include <drm/drm_plane_helper.h> +#include <drm/drm_atomic_helper.h> #include "armada_crtc.h" #include "armada_drm.h" #include "armada_fb.h" @@ -32,11 +32,6 @@ struct armada_ovl_plane_properties { struct armada_ovl_plane { struct armada_plane base; - struct drm_framebuffer *old_fb; - struct { - struct armada_plane_work work; - struct armada_regs regs[13]; - } vbl; struct armada_ovl_plane_properties prop; }; #define drm_to_armada_ovl_plane(p) \ @@ -67,218 +62,204 @@ armada_ovl_update_attr(struct armada_ovl_plane_properties *prop, spin_unlock_irq(&dcrtc->irq_lock); } -static void armada_ovl_retire_fb(struct armada_ovl_plane *dplane, - struct drm_framebuffer *fb) -{ - struct drm_framebuffer *old_fb; - - old_fb = xchg(&dplane->old_fb, fb); - - if (old_fb) - armada_drm_queue_unref_work(dplane->base.base.dev, old_fb); -} - /* === Plane support === */ static void armada_ovl_plane_work(struct armada_crtc *dcrtc, - struct armada_plane *plane, struct armada_plane_work *work) + struct armada_plane_work *work) { - struct armada_ovl_plane *dplane = container_of(plane, struct armada_ovl_plane, base); + unsigned long flags; - trace_armada_ovl_plane_work(&dcrtc->crtc, &plane->base); + trace_armada_ovl_plane_work(&dcrtc->crtc, work->plane); - armada_drm_crtc_update_regs(dcrtc, dplane->vbl.regs); - armada_ovl_retire_fb(dplane, NULL); + spin_lock_irqsave(&dcrtc->irq_lock, flags); + armada_drm_crtc_update_regs(dcrtc, work->regs); + spin_unlock_irqrestore(&dcrtc->irq_lock, flags); } -static int -armada_ovl_plane_update(struct drm_plane *plane, struct drm_crtc *crtc, - struct drm_framebuffer *fb, - int crtc_x, int crtc_y, unsigned crtc_w, unsigned crtc_h, - uint32_t src_x, uint32_t src_y, uint32_t src_w, uint32_t src_h, - struct drm_modeset_acquire_ctx *ctx) +static void armada_ovl_plane_update_state(struct drm_plane_state *state, + struct armada_regs *regs) { - struct armada_ovl_plane *dplane = drm_to_armada_ovl_plane(plane); - struct armada_crtc *dcrtc = drm_to_armada_crtc(crtc); - struct drm_rect src = { - .x1 = src_x, - .y1 = src_y, - .x2 = src_x + src_w, - .y2 = src_y + src_h, - }; - struct drm_rect dest = { - .x1 = crtc_x, - .y1 = crtc_y, - .x2 = 
crtc_x + crtc_w, - .y2 = crtc_y + crtc_h, - }; - const struct drm_rect clip = { - .x2 = crtc->mode.hdisplay, - .y2 = crtc->mode.vdisplay, - }; - uint32_t val, ctrl0; - unsigned idx = 0; - bool visible; - int ret; - - trace_armada_ovl_plane_update(plane, crtc, fb, - crtc_x, crtc_y, crtc_w, crtc_h, - src_x, src_y, src_w, src_h); - - ret = drm_plane_helper_check_update(plane, crtc, fb, &src, &dest, &clip, - DRM_MODE_ROTATE_0, - 0, INT_MAX, true, false, &visible); - if (ret) - return ret; - - ctrl0 = CFG_DMA_FMT(drm_fb_to_armada_fb(fb)->fmt) | - CFG_DMA_MOD(drm_fb_to_armada_fb(fb)->mod) | - CFG_CBSH_ENA | CFG_DMA_HSMOOTH | CFG_DMA_ENA; - - /* Does the position/size result in nothing to display? */ - if (!visible) - ctrl0 &= ~CFG_DMA_ENA; - - if (!dcrtc->plane) { - dcrtc->plane = plane; - armada_ovl_update_attr(&dplane->prop, dcrtc); - } - - /* FIXME: overlay on an interlaced display */ - /* Just updating the position/size? */ - if (plane->fb == fb && dplane->base.state.ctrl0 == ctrl0) { - val = (drm_rect_height(&src) & 0xffff0000) | - drm_rect_width(&src) >> 16; - dplane->base.state.src_hw = val; - writel_relaxed(val, dcrtc->base + LCD_SPU_DMA_HPXL_VLN); - - val = drm_rect_height(&dest) << 16 | drm_rect_width(&dest); - dplane->base.state.dst_hw = val; - writel_relaxed(val, dcrtc->base + LCD_SPU_DZM_HPXL_VLN); - - val = dest.y1 << 16 | dest.x1; - dplane->base.state.dst_yx = val; - writel_relaxed(val, dcrtc->base + LCD_SPU_DMA_OVSA_HPXL_VLN); - - return 0; - } else if (~dplane->base.state.ctrl0 & ctrl0 & CFG_DMA_ENA) { + struct armada_ovl_plane *dplane = drm_to_armada_ovl_plane(state->plane); + struct armada_framebuffer *dfb = drm_fb_to_armada_fb(state->fb); + const struct drm_format_info *format; + unsigned int idx = 0; + bool fb_changed; + u32 val, ctrl0; + u16 src_x, src_y; + + ctrl0 = CFG_DMA_FMT(dfb->fmt) | CFG_DMA_MOD(dfb->mod) | CFG_CBSH_ENA; + if (state->visible) + ctrl0 |= CFG_DMA_ENA; + if (drm_rect_width(&state->src) >> 16 != drm_rect_width(&state->dst)) + ctrl0 |= CFG_DMA_HSMOOTH; + + /* + * Shifting a YUV packed format image by one pixel causes the U/V + * planes to swap. Compensate for it by also toggling the UV swap. + */ + format = dfb->fb.format; + if (format->num_planes == 1 && state->src.x1 >> 16 & (format->hsub - 1)) + ctrl0 ^= CFG_DMA_MOD(CFG_SWAPUV); + + if (~dplane->base.state.ctrl0 & ctrl0 & CFG_DMA_ENA) { /* Power up the Y/U/V FIFOs on ENA 0->1 transitions */ - armada_updatel(0, CFG_PDWN16x66 | CFG_PDWN32x66, - dcrtc->base + LCD_SPU_SRAM_PARA1); + armada_reg_queue_mod(regs, idx, + 0, CFG_PDWN16x66 | CFG_PDWN32x66, + LCD_SPU_SRAM_PARA1); } - if (armada_drm_plane_work_wait(&dplane->base, HZ / 25) == 0) - armada_drm_plane_work_cancel(dcrtc, &dplane->base); - - if (plane->fb != fb) { - u32 addrs[3], pixel_format; - int num_planes, hsub; - - /* - * Take a reference on the new framebuffer - we want to - * hold on to it while the hardware is displaying it. 
- */ - drm_framebuffer_get(fb); - - if (plane->fb) - armada_ovl_retire_fb(dplane, plane->fb); + fb_changed = dplane->base.base.fb != &dfb->fb || + dplane->base.state.src_x != state->src.x1 >> 16 || + dplane->base.state.src_y != state->src.y1 >> 16; - src_y = src.y1 >> 16; - src_x = src.x1 >> 16; + dplane->base.state.vsync_update = fb_changed; - armada_drm_plane_calc_addrs(addrs, fb, src_x, src_y); + /* FIXME: overlay on an interlaced display */ + if (fb_changed) { + u32 addrs[3]; - pixel_format = fb->format->format; - hsub = drm_format_horz_chroma_subsampling(pixel_format); - num_planes = fb->format->num_planes; + dplane->base.state.src_y = src_y = state->src.y1 >> 16; + dplane->base.state.src_x = src_x = state->src.x1 >> 16; - /* - * Annoyingly, shifting a YUYV-format image by one pixel - * causes the U/V planes to toggle. Toggle the UV swap. - * (Unfortunately, this causes momentary colour flickering.) - */ - if (src_x & (hsub - 1) && num_planes == 1) - ctrl0 ^= CFG_DMA_MOD(CFG_SWAPUV); + armada_drm_plane_calc_addrs(addrs, &dfb->fb, src_x, src_y); - armada_reg_queue_set(dplane->vbl.regs, idx, addrs[0], + armada_reg_queue_set(regs, idx, addrs[0], LCD_SPU_DMA_START_ADDR_Y0); - armada_reg_queue_set(dplane->vbl.regs, idx, addrs[1], + armada_reg_queue_set(regs, idx, addrs[1], LCD_SPU_DMA_START_ADDR_U0); - armada_reg_queue_set(dplane->vbl.regs, idx, addrs[2], + armada_reg_queue_set(regs, idx, addrs[2], LCD_SPU_DMA_START_ADDR_V0); - armada_reg_queue_set(dplane->vbl.regs, idx, addrs[0], + armada_reg_queue_set(regs, idx, addrs[0], LCD_SPU_DMA_START_ADDR_Y1); - armada_reg_queue_set(dplane->vbl.regs, idx, addrs[1], + armada_reg_queue_set(regs, idx, addrs[1], LCD_SPU_DMA_START_ADDR_U1); - armada_reg_queue_set(dplane->vbl.regs, idx, addrs[2], + armada_reg_queue_set(regs, idx, addrs[2], LCD_SPU_DMA_START_ADDR_V1); - val = fb->pitches[0] << 16 | fb->pitches[0]; - armada_reg_queue_set(dplane->vbl.regs, idx, val, + val = dfb->fb.pitches[0] << 16 | dfb->fb.pitches[0]; + armada_reg_queue_set(regs, idx, val, LCD_SPU_DMA_PITCH_YC); - val = fb->pitches[1] << 16 | fb->pitches[2]; - armada_reg_queue_set(dplane->vbl.regs, idx, val, + val = dfb->fb.pitches[1] << 16 | dfb->fb.pitches[2]; + armada_reg_queue_set(regs, idx, val, LCD_SPU_DMA_PITCH_UV); } - val = (drm_rect_height(&src) & 0xffff0000) | drm_rect_width(&src) >> 16; + val = (drm_rect_height(&state->src) & 0xffff0000) | + drm_rect_width(&state->src) >> 16; if (dplane->base.state.src_hw != val) { dplane->base.state.src_hw = val; - armada_reg_queue_set(dplane->vbl.regs, idx, val, + armada_reg_queue_set(regs, idx, val, LCD_SPU_DMA_HPXL_VLN); } - val = drm_rect_height(&dest) << 16 | drm_rect_width(&dest); + val = drm_rect_height(&state->dst) << 16 | drm_rect_width(&state->dst); if (dplane->base.state.dst_hw != val) { dplane->base.state.dst_hw = val; - armada_reg_queue_set(dplane->vbl.regs, idx, val, + armada_reg_queue_set(regs, idx, val, LCD_SPU_DZM_HPXL_VLN); } - val = dest.y1 << 16 | dest.x1; + val = state->dst.y1 << 16 | state->dst.x1; if (dplane->base.state.dst_yx != val) { dplane->base.state.dst_yx = val; - armada_reg_queue_set(dplane->vbl.regs, idx, val, + armada_reg_queue_set(regs, idx, val, LCD_SPU_DMA_OVSA_HPXL_VLN); } if (dplane->base.state.ctrl0 != ctrl0) { dplane->base.state.ctrl0 = ctrl0; - armada_reg_queue_mod(dplane->vbl.regs, idx, ctrl0, + armada_reg_queue_mod(regs, idx, ctrl0, CFG_CBSH_ENA | CFG_DMAFORMAT | CFG_DMA_FTOGGLE | CFG_DMA_HSMOOTH | CFG_DMA_TSTMODE | CFG_DMA_MOD(CFG_SWAPRB | CFG_SWAPUV | CFG_SWAPYU | CFG_YUV2RGB) | CFG_DMA_ENA, 
LCD_SPU_DMA_CTRL0); + dplane->base.state.vsync_update = true; } - if (idx) { - armada_reg_queue_end(dplane->vbl.regs, idx); - armada_drm_plane_work_queue(dcrtc, &dplane->base, - &dplane->vbl.work); - } - return 0; + + dplane->base.state.changed = idx != 0; + + armada_reg_queue_end(regs, idx); } -static int armada_ovl_plane_disable(struct drm_plane *plane, - struct drm_modeset_acquire_ctx *ctx) +static int +armada_ovl_plane_update(struct drm_plane *plane, struct drm_crtc *crtc, + struct drm_framebuffer *fb, + int crtc_x, int crtc_y, unsigned crtc_w, unsigned crtc_h, + uint32_t src_x, uint32_t src_y, uint32_t src_w, uint32_t src_h, + struct drm_modeset_acquire_ctx *ctx) { struct armada_ovl_plane *dplane = drm_to_armada_ovl_plane(plane); - struct drm_framebuffer *fb; - struct armada_crtc *dcrtc; + struct armada_crtc *dcrtc = drm_to_armada_crtc(crtc); + struct armada_plane_work *work; + struct drm_plane_state state = { + .plane = plane, + .crtc = crtc, + .fb = fb, + .src_x = src_x, + .src_y = src_y, + .src_w = src_w, + .src_h = src_h, + .crtc_x = crtc_x, + .crtc_y = crtc_y, + .crtc_w = crtc_w, + .crtc_h = crtc_h, + .rotation = DRM_MODE_ROTATE_0, + }; + const struct drm_rect clip = { + .x2 = crtc->mode.hdisplay, + .y2 = crtc->mode.vdisplay, + }; + int ret; - if (!dplane->base.base.crtc) + trace_armada_ovl_plane_update(plane, crtc, fb, + crtc_x, crtc_y, crtc_w, crtc_h, + src_x, src_y, src_w, src_h); + + ret = drm_atomic_helper_check_plane_state(&state, crtc->state, &clip, 0, + INT_MAX, true, false); + if (ret) + return ret; + + work = &dplane->base.works[dplane->base.next_work]; + + if (plane->fb != fb) { + /* + * Take a reference on the new framebuffer - we want to + * hold on to it while the hardware is displaying it. + */ + drm_framebuffer_reference(fb); + + work->old_fb = plane->fb; + } else { + work->old_fb = NULL; + } + + armada_ovl_plane_update_state(&state, work->regs); + + if (!dplane->base.state.changed) return 0; - dcrtc = drm_to_armada_crtc(dplane->base.base.crtc); + /* Wait for pending work to complete */ + if (armada_drm_plane_work_wait(&dplane->base, HZ / 25) == 0) + armada_drm_plane_work_cancel(dcrtc, &dplane->base); + + /* Just updating the position/size? 
*/ + if (!dplane->base.state.vsync_update) { + armada_ovl_plane_work(dcrtc, work); + return 0; + } - armada_drm_plane_work_cancel(dcrtc, &dplane->base); - armada_drm_crtc_plane_disable(dcrtc, plane); + if (!dcrtc->plane) { + dcrtc->plane = plane; + armada_ovl_update_attr(&dplane->prop, dcrtc); + } - dcrtc->plane = NULL; - dplane->base.state.ctrl0 = 0; + /* Queue it for update on the next interrupt if we are enabled */ + ret = armada_drm_plane_work_queue(dcrtc, work); + if (ret) + DRM_ERROR("failed to queue plane work: %d\n", ret); - fb = xchg(&dplane->old_fb, NULL); - if (fb) - drm_framebuffer_put(fb); + dplane->base.next_work = !dplane->base.next_work; return 0; } @@ -362,7 +343,7 @@ static int armada_ovl_plane_set_property(struct drm_plane *plane, static const struct drm_plane_funcs armada_ovl_plane_funcs = { .update_plane = armada_ovl_plane_update, - .disable_plane = armada_ovl_plane_disable, + .disable_plane = armada_drm_plane_disable, .destroy = armada_ovl_plane_destroy, .set_property = armada_ovl_plane_set_property, }; @@ -454,7 +435,8 @@ int armada_overlay_plane_create(struct drm_device *dev, unsigned long crtcs) return ret; } - dplane->vbl.work.fn = armada_ovl_plane_work; + dplane->base.works[0].fn = armada_ovl_plane_work; + dplane->base.works[1].fn = armada_ovl_plane_work; ret = drm_universal_plane_init(dev, &dplane->base.base, crtcs, &armada_ovl_plane_funcs, diff --git a/drivers/gpu/drm/armada/armada_trace.h b/drivers/gpu/drm/armada/armada_trace.h index 8dbfea7a00fe..f03a56bda596 100644 --- a/drivers/gpu/drm/armada/armada_trace.h +++ b/drivers/gpu/drm/armada/armada_trace.h @@ -34,14 +34,34 @@ TRACE_EVENT(armada_ovl_plane_update, __field(struct drm_plane *, plane) __field(struct drm_crtc *, crtc) __field(struct drm_framebuffer *, fb) + __field(int, crtc_x) + __field(int, crtc_y) + __field(unsigned int, crtc_w) + __field(unsigned int, crtc_h) + __field(u32, src_x) + __field(u32, src_y) + __field(u32, src_w) + __field(u32, src_h) ), TP_fast_assign( __entry->plane = plane; __entry->crtc = crtc; __entry->fb = fb; + __entry->crtc_x = crtc_x; + __entry->crtc_y = crtc_y; + __entry->crtc_w = crtc_w; + __entry->crtc_h = crtc_h; + __entry->src_x = src_x; + __entry->src_y = src_y; + __entry->src_w = src_w; + __entry->src_h = src_h; ), - TP_printk("plane %p crtc %p fb %p", - __entry->plane, __entry->crtc, __entry->fb) + TP_printk("plane %p crtc %p fb %p crtc @ (%d,%d, %ux%u) src @ (%u,%u, %ux%u)", + __entry->plane, __entry->crtc, __entry->fb, + __entry->crtc_x, __entry->crtc_y, + __entry->crtc_w, __entry->crtc_h, + __entry->src_x >> 16, __entry->src_y >> 16, + __entry->src_w >> 16, __entry->src_h >> 16) ); TRACE_EVENT(armada_ovl_plane_work, diff --git a/drivers/gpu/drm/drm_atomic.c b/drivers/gpu/drm/drm_atomic.c index 37445d50816a..b76d49218cf1 100644 --- a/drivers/gpu/drm/drm_atomic.c +++ b/drivers/gpu/drm/drm_atomic.c @@ -50,7 +50,8 @@ EXPORT_SYMBOL(__drm_crtc_commit_free); * @state: atomic state * * Free all the memory allocated by drm_atomic_state_init. - * This is useful for drivers that subclass the atomic state. + * This should only be used by drivers which are still subclassing + * &drm_atomic_state and haven't switched to &drm_private_state yet. */ void drm_atomic_state_default_release(struct drm_atomic_state *state) { @@ -67,7 +68,8 @@ EXPORT_SYMBOL(drm_atomic_state_default_release); * @state: atomic state * * Default implementation for filling in a new atomic state. - * This is useful for drivers that subclass the atomic state. 
+ * This should only be used by drivers which are still subclassing + * &drm_atomic_state and haven't switched to &drm_private_state yet. */ int drm_atomic_state_init(struct drm_device *dev, struct drm_atomic_state *state) @@ -132,7 +134,8 @@ EXPORT_SYMBOL(drm_atomic_state_alloc); * @state: atomic state * * Default implementation for clearing atomic state. - * This is useful for drivers that subclass the atomic state. + * This should only be used by drivers which are still subclassing + * &drm_atomic_state and haven't switched to &drm_private_state yet. */ void drm_atomic_state_default_clear(struct drm_atomic_state *state) { @@ -947,6 +950,42 @@ static void drm_atomic_plane_print_state(struct drm_printer *p, } /** + * DOC: handling driver private state + * + * Very often the DRM objects exposed to userspace in the atomic modeset API + * (&drm_connector, &drm_crtc and &drm_plane) do not map neatly to the + * underlying hardware. Especially for any kind of shared resources (e.g. shared + * clocks, scaler units, bandwidth and fifo limits shared among a group of + * planes or CRTCs, and so on) it makes sense to model these as independent + * objects. Drivers then need to do similar state tracking and commit ordering for + * such private (since not exposed to userspace) objects as the atomic core and + * helpers already provide for connectors, planes and CRTCs. + * + * To make this easier on drivers the atomic core provides some support to track + * driver private state objects using struct &drm_private_obj, with the + * associated state struct &drm_private_state. + * + * Similar to userspace-exposed objects, private state structures can be + * acquired by calling drm_atomic_get_private_obj_state(). Since this function + * does not take care of locking, drivers should wrap it for each type of + * private state object they have with the required call to drm_modeset_lock() + * for the corresponding &drm_modeset_lock. + * + * All private state structures contained in a &drm_atomic_state update can be + * iterated using for_each_oldnew_private_obj_in_state(), + * for_each_new_private_obj_in_state() and for_each_old_private_obj_in_state(). + * Drivers are recommended to wrap these for each type of driver private state + * object they have, filtering on &drm_private_obj.funcs using for_each_if(), at + * least if they want to iterate over all objects of a given type. + * + * An earlier way to handle driver private state was by subclassing struct + * &drm_atomic_state. But since that encourages non-standard ways to implement + * the check/commit split atomic requires (by using e.g. "check and rollback or + * commit" instead of "duplicate state, check, then either commit or release the + * duplicated state") it is deprecated in favour of using &drm_private_state.
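A minimal sketch of the per-type wrapper pattern recommended above; the foo_* structures and fields are hypothetical, only drm_atomic_get_private_obj_state(), drm_modeset_lock() and the &drm_private_obj/&drm_private_state pairing come from the API itself:

/* Hypothetical driver object embedding the core private object. */
struct foo_shared_resource {
	struct drm_private_obj base;
	struct drm_modeset_lock lock;
};

struct foo_shared_state {
	struct drm_private_state base;
	unsigned int bandwidth;		/* example of shared, non-uapi state */
};

/* Wrapper: take the corresponding modeset lock, then fetch the state. */
static struct foo_shared_state *
foo_get_shared_state(struct drm_atomic_state *state,
		     struct foo_shared_resource *res)
{
	struct drm_private_state *priv;
	int ret;

	ret = drm_modeset_lock(&res->lock, state->acquire_ctx);
	if (ret)
		return ERR_PTR(ret);

	priv = drm_atomic_get_private_obj_state(state, &res->base);
	if (IS_ERR(priv))
		return ERR_CAST(priv);

	return container_of(priv, struct foo_shared_state, base);
}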
+ */ + +/** * drm_atomic_private_obj_init - initialize private object * @obj: private object * @state: initial private object state diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c index 365901c1c33c..ddd537914575 100644 --- a/drivers/gpu/drm/drm_edid.c +++ b/drivers/gpu/drm/drm_edid.c @@ -4871,6 +4871,11 @@ EXPORT_SYMBOL(drm_hdmi_avi_infoframe_from_display_mode); * @mode: DRM display mode * @rgb_quant_range: RGB quantization range (Q) * @rgb_quant_range_selectable: Sink support selectable RGB quantization range (QS) + * @is_hdmi2_sink: HDMI 2.0 sink, which has different default recommendations + * + * Note that @is_hdmi2_sink can be derived by looking at the + * &drm_scdc.supported flag stored in &drm_hdmi_info.scdc, + * &drm_display_info.hdmi, which can be found in &drm_connector.display_info. */ void drm_hdmi_avi_infoframe_quant_range(struct hdmi_avi_infoframe *frame, diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c index 04a3a5ce370a..035784ddd133 100644 --- a/drivers/gpu/drm/drm_fb_helper.c +++ b/drivers/gpu/drm/drm_fb_helper.c @@ -66,19 +66,23 @@ static DEFINE_MUTEX(kernel_fb_helper_lock); * helper functions used by many drivers to implement the kernel mode setting * interfaces. * - * Initialization is done as a four-step process with drm_fb_helper_prepare(), - * drm_fb_helper_init(), drm_fb_helper_single_add_all_connectors() and - * drm_fb_helper_initial_config(). Drivers with fancier requirements than the - * default behaviour can override the third step with their own code. - * Teardown is done with drm_fb_helper_fini() after the fbdev device is - * unregisters using drm_fb_helper_unregister_fbi(). - * - * At runtime drivers should restore the fbdev console by calling - * drm_fb_helper_restore_fbdev_mode_unlocked() from their &drm_driver.lastclose - * callback. They should also notify the fb helper code from updates to the - * output configuration by calling drm_fb_helper_hotplug_event(). For easier - * integration with the output polling code in drm_crtc_helper.c the modeset - * code provides a &drm_mode_config_funcs.output_poll_changed callback. + * Set up fbdev emulation by calling drm_fb_helper_fbdev_setup() and tear it + * down by calling drm_fb_helper_fbdev_teardown(). + * + * Drivers that need to handle connector hotplugging (e.g. dp mst) can't use + * the setup helper and will need to do the whole four-step setup process with + * drm_fb_helper_prepare(), drm_fb_helper_init(), + * drm_fb_helper_single_add_all_connectors() and + * drm_fb_helper_initial_config(), enabling hotplugging in between to avoid a + * possible race window. + * + * At runtime drivers should restore the fbdev console by using + * drm_fb_helper_lastclose() as their &drm_driver.lastclose callback. + * They should also notify the fb helper code of updates to the output + * configuration by using drm_fb_helper_output_poll_changed() as their + * &drm_mode_config_funcs.output_poll_changed callback. + * + * For suspend/resume consider using drm_mode_config_helper_suspend() and + * drm_mode_config_helper_resume() which take care of fbdev as well. * * All other functions exported by the fb helper library can be used to * implement the fbdev driver interface by the driver. @@ -103,7 +107,8 @@ static DEFINE_MUTEX(kernel_fb_helper_lock); * always run in process context since the fb_*() function could be running in * atomic context.
If drm_fb_helper_deferred_io() is used as the deferred_io * callback it will also schedule dirty_work with the damage collected from the - * mmap page writes. + * mmap page writes. Drivers can use drm_fb_helper_defio_init() to set up + * deferred I/O (coupled with drm_fb_helper_fbdev_teardown()). */ #define drm_fb_helper_for_each_connector(fbh, i__) \ @@ -1025,6 +1030,49 @@ void drm_fb_helper_deferred_io(struct fb_info *info, EXPORT_SYMBOL(drm_fb_helper_deferred_io); /** + * drm_fb_helper_defio_init - fbdev deferred I/O initialization + * @fb_helper: driver-allocated fbdev helper + * + * This function allocates a &fb_deferred_io, sets its callback to + * drm_fb_helper_deferred_io() and its delay to 50 ms, and calls + * fb_deferred_io_init(). It should be called from the + * &drm_fb_helper_funcs->fb_probe callback. + * drm_fb_helper_fbdev_teardown() cleans up deferred I/O. + * + * NOTE: A copy of &fb_ops is made and assigned to &info->fbops. This is done + * because fb_deferred_io_cleanup() clears &fbops->fb_mmap and would thereby + * affect other instances of that &fb_ops. + * + * Returns: + * 0 on success or a negative error code on failure. + */ +int drm_fb_helper_defio_init(struct drm_fb_helper *fb_helper) +{ + struct fb_info *info = fb_helper->fbdev; + struct fb_deferred_io *fbdefio; + struct fb_ops *fbops; + + fbdefio = kzalloc(sizeof(*fbdefio), GFP_KERNEL); + fbops = kzalloc(sizeof(*fbops), GFP_KERNEL); + if (!fbdefio || !fbops) { + kfree(fbdefio); + kfree(fbops); + return -ENOMEM; + } + + info->fbdefio = fbdefio; + fbdefio->delay = msecs_to_jiffies(50); + fbdefio->deferred_io = drm_fb_helper_deferred_io; + + *fbops = *info->fbops; + info->fbops = fbops; + + fb_deferred_io_init(info); + + return 0; +} +EXPORT_SYMBOL(drm_fb_helper_defio_init); + +/** * drm_fb_helper_sys_read - wrapper around fb_sys_read * @info: fb_info struct pointer * @buf: userspace buffer to read from framebuffer memory @@ -1848,6 +1896,7 @@ static int drm_fb_helper_single_fb_probe(struct drm_fb_helper *fb_helper, if (ret < 0) return ret; + strcpy(fb_helper->fb->comm, "[fbcon]"); return 0; } @@ -2730,6 +2779,120 @@ int drm_fb_helper_hotplug_event(struct drm_fb_helper *fb_helper) EXPORT_SYMBOL(drm_fb_helper_hotplug_event); /** + * drm_fb_helper_fbdev_setup() - Set up fbdev emulation + * @dev: DRM device + * @fb_helper: fbdev helper structure to set up + * @funcs: fbdev helper functions + * @preferred_bpp: Preferred bits per pixel for the device. + * @dev->mode_config.preferred_depth is used if this is zero. + * @max_conn_count: Maximum number of connectors. + * @dev->mode_config.num_connector is used if this is zero. + * + * This function sets up fbdev emulation and registers fbdev for access by + * userspace. If all connectors are disconnected, setup is deferred to the next + * time drm_fb_helper_hotplug_event() is called. + * The caller must provide a &drm_fb_helper_funcs->fb_probe callback + * function. + * + * See also: drm_fb_helper_initial_config() + * + * Returns: + * Zero on success or negative error code on failure.
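As a usage sketch of the setup helper documented above (the foo_* names and the fb_probe implementation are hypothetical; passing 0 for @preferred_bpp and @max_conn_count picks the mode_config defaults as described):

static const struct drm_fb_helper_funcs foo_fb_helper_funcs = {
	.fb_probe = foo_fbdev_probe,	/* hypothetical, driver-provided */
};

static int foo_fbdev_init(struct drm_device *dev)
{
	struct drm_fb_helper *fb_helper;
	int ret;

	fb_helper = kzalloc(sizeof(*fb_helper), GFP_KERNEL);
	if (!fb_helper)
		return -ENOMEM;

	ret = drm_fb_helper_fbdev_setup(dev, fb_helper,
					&foo_fb_helper_funcs, 0, 0);
	if (ret)
		kfree(fb_helper);

	return ret;
}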
+ */ +int drm_fb_helper_fbdev_setup(struct drm_device *dev, + struct drm_fb_helper *fb_helper, + const struct drm_fb_helper_funcs *funcs, + unsigned int preferred_bpp, + unsigned int max_conn_count) +{ + int ret; + + if (!preferred_bpp) + preferred_bpp = dev->mode_config.preferred_depth; + if (!preferred_bpp) + preferred_bpp = 32; + + if (!max_conn_count) + max_conn_count = dev->mode_config.num_connector; + if (!max_conn_count) { + DRM_DEV_ERROR(dev->dev, "No connectors\n"); + return -EINVAL; + } + + drm_fb_helper_prepare(dev, fb_helper, funcs); + + ret = drm_fb_helper_init(dev, fb_helper, max_conn_count); + if (ret < 0) { + DRM_DEV_ERROR(dev->dev, "Failed to initialize fbdev helper\n"); + return ret; + } + + ret = drm_fb_helper_single_add_all_connectors(fb_helper); + if (ret < 0) { + DRM_DEV_ERROR(dev->dev, "Failed to add connectors\n"); + goto err_drm_fb_helper_fini; + } + + if (!drm_drv_uses_atomic_modeset(dev)) + drm_helper_disable_unused_functions(dev); + + ret = drm_fb_helper_initial_config(fb_helper, preferred_bpp); + if (ret < 0) { + DRM_DEV_ERROR(dev->dev, "Failed to set fbdev configuration\n"); + goto err_drm_fb_helper_fini; + } + + return 0; + +err_drm_fb_helper_fini: + drm_fb_helper_fini(fb_helper); + + return ret; +} +EXPORT_SYMBOL(drm_fb_helper_fbdev_setup); + +/** + * drm_fb_helper_fbdev_teardown - Tear down fbdev emulation + * @dev: DRM device + * + * This function unregisters fbdev if not already done and cleans up the + * associated resources including the &drm_framebuffer. + * The driver is responsible for freeing the &drm_fb_helper structure which is + * stored in &drm_device->fb_helper. Do note that this pointer has been cleared + * when this function returns. + * + * In order to support device removal/unplug while file handles are still open, + * drm_fb_helper_unregister_fbi() should be called on device removal and + * drm_fb_helper_fbdev_teardown() in the &drm_driver->release callback when + * file handles are closed. 
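A sketch of the removal/release split described above (foo_* names hypothetical). The helper pointer is saved before the call because drm_fb_helper_fbdev_teardown() clears &drm_device->fb_helper, while freeing the structure remains the driver's job:

static void foo_unplug(struct drm_device *dev)
{
	/* Device gone but file handles may still be open: unregister only. */
	drm_fb_helper_unregister_fbi(dev->fb_helper);
}

static void foo_release(struct drm_device *dev)
{
	struct drm_fb_helper *fb_helper = dev->fb_helper;

	/* Last file handle closed: tear down fbdev, then free the helper. */
	drm_fb_helper_fbdev_teardown(dev);
	kfree(fb_helper);
}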
+ */ +void drm_fb_helper_fbdev_teardown(struct drm_device *dev) +{ + struct drm_fb_helper *fb_helper = dev->fb_helper; + struct fb_ops *fbops = NULL; + + if (!fb_helper) + return; + + /* Unregister if it hasn't been done already */ + if (fb_helper->fbdev && fb_helper->fbdev->dev) + drm_fb_helper_unregister_fbi(fb_helper); + + if (fb_helper->fbdev && fb_helper->fbdev->fbdefio) { + fb_deferred_io_cleanup(fb_helper->fbdev); + kfree(fb_helper->fbdev->fbdefio); + fbops = fb_helper->fbdev->fbops; + } + + drm_fb_helper_fini(fb_helper); + kfree(fbops); + + if (fb_helper->fb) + drm_framebuffer_remove(fb_helper->fb); +} +EXPORT_SYMBOL(drm_fb_helper_fbdev_teardown); + +/** * drm_fb_helper_lastclose - DRM driver lastclose helper for fbdev emulation * @dev: DRM device * diff --git a/drivers/gpu/drm/drm_framebuffer.c b/drivers/gpu/drm/drm_framebuffer.c index d63d4c2ac4c8..5a13ff29f4f0 100644 --- a/drivers/gpu/drm/drm_framebuffer.c +++ b/drivers/gpu/drm/drm_framebuffer.c @@ -664,6 +664,7 @@ int drm_framebuffer_init(struct drm_device *dev, struct drm_framebuffer *fb, INIT_LIST_HEAD(&fb->filp_head); fb->funcs = funcs; + strcpy(fb->comm, current->comm); ret = __drm_mode_object_add(dev, &fb->base, DRM_MODE_OBJECT_FB, false, drm_framebuffer_free); @@ -978,6 +979,7 @@ void drm_framebuffer_print_info(struct drm_printer *p, unsigned int indent, struct drm_format_name_buf format_name; unsigned int i; + drm_printf_indent(p, indent, "allocated by = %s\n", fb->comm); drm_printf_indent(p, indent, "refcount=%u\n", drm_framebuffer_read_refcount(fb)); drm_printf_indent(p, indent, "format=%s\n", diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c index 9b733c510cbf..131695915acd 100644 --- a/drivers/gpu/drm/drm_syncobj.c +++ b/drivers/gpu/drm/drm_syncobj.c @@ -29,9 +29,9 @@ /** * DOC: Overview * - * DRM synchronisation objects (syncobj) are a persistent objects, - * that contain an optional fence. The fence can be updated with a new - * fence, or be NULL. + * DRM synchronisation objects (syncobj, see struct &drm_syncobj) are + * persistent objects that contain an optional fence. The fence can be updated + * with a new fence, or be NULL. * * syncobj's can be waited upon, where it will wait for the underlying * fence. @@ -61,7 +61,8 @@ * @file_private: drm file private pointer * @handle: sync object handle to lookup. * - * Returns a reference to the syncobj pointed to by handle or NULL. + * Returns a reference to the syncobj pointed to by handle or NULL. The + * reference must be released by calling drm_syncobj_put(). */ struct drm_syncobj *drm_syncobj_find(struct drm_file *file_private, u32 handle) @@ -229,6 +230,19 @@ static int drm_syncobj_assign_null_handle(struct drm_syncobj *syncobj) return 0; } +/** + * drm_syncobj_find_fence - lookup and reference the fence in a sync object + * @file_private: drm file private pointer + * @handle: sync object handle to lookup. + * @fence: out parameter for the fence + * + * This is just a convenience function that combines drm_syncobj_find() and + * drm_syncobj_fence_get(). + * + * Returns 0 on success or a negative error value on failure. On success @fence + * contains a reference to the fence, which must be released by calling + * dma_fence_put(). 
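A minimal sketch of the find/wait/put pattern this documentation implies (foo_wait_syncobj is hypothetical; drm_syncobj_find_fence(), dma_fence_wait() and dma_fence_put() are the real calls):

static int foo_wait_syncobj(struct drm_file *file_private, u32 handle)
{
	struct dma_fence *fence;
	long lret;
	int ret;

	ret = drm_syncobj_find_fence(file_private, handle, &fence);
	if (ret)
		return ret;

	/* Interruptible wait on the fence the syncobj currently carries. */
	lret = dma_fence_wait(fence, true);

	/* drm_syncobj_find_fence() returned a reference; drop it here. */
	dma_fence_put(fence);

	return lret < 0 ? lret : 0;
}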
+ */ int drm_syncobj_find_fence(struct drm_file *file_private, u32 handle, struct dma_fence **fence) @@ -269,6 +283,12 @@ EXPORT_SYMBOL(drm_syncobj_free); * @out_syncobj: returned syncobj * @flags: DRM_SYNCOBJ_* flags * @fence: if non-NULL, the syncobj will represent this fence + * + * This is the first function to create a sync object. After creating, drivers + * probably want to make it available to userspace, either through + * drm_syncobj_get_handle() or drm_syncobj_get_fd(). + * + * Returns 0 on success or a negative error value on failure. */ int drm_syncobj_create(struct drm_syncobj **out_syncobj, uint32_t flags, struct dma_fence *fence) @@ -302,6 +322,14 @@ EXPORT_SYMBOL(drm_syncobj_create); /** * drm_syncobj_get_handle - get a handle from a syncobj + * @file_private: drm file private pointer + * @syncobj: Sync object to export + * @handle: out parameter with the new handle + * + * Exports a sync object created with drm_syncobj_create() as a handle on + * @file_private to userspace. + * + * Returns 0 on success or a negative error value on failure. */ int drm_syncobj_get_handle(struct drm_file *file_private, struct drm_syncobj *syncobj, u32 *handle) @@ -388,6 +416,15 @@ static int drm_syncobj_alloc_file(struct drm_syncobj *syncobj) return 0; } +/** + * drm_syncobj_get_fd - get a file descriptor from a syncobj + * @syncobj: Sync object to export + * @p_fd: out parameter with the new file descriptor + * + * Exports a sync object created with drm_syncobj_create() as a file descriptor. + * + * Returns 0 on success or a negative error value on failure. + */ int drm_syncobj_get_fd(struct drm_syncobj *syncobj, int *p_fd) { int ret; diff --git a/drivers/gpu/drm/etnaviv/Kconfig b/drivers/gpu/drm/etnaviv/Kconfig index a29b8f59eb15..3f58b4077767 100644 --- a/drivers/gpu/drm/etnaviv/Kconfig +++ b/drivers/gpu/drm/etnaviv/Kconfig @@ -6,6 +6,7 @@ config DRM_ETNAVIV depends on MMU select SHMEM select SYNC_FILE + select THERMAL if DRM_ETNAVIV_THERMAL select TMPFS select WANT_DEV_COREDUMP select CMA if HAVE_DMA_CONTIGUOUS @@ -13,6 +14,14 @@ config DRM_ETNAVIV help DRM driver for Vivante GPUs. +config DRM_ETNAVIV_THERMAL + bool "enable ETNAVIV thermal throttling" + depends on DRM_ETNAVIV + default y + help + Compile in support for thermal throttling. + Say Y unless you want to risk burning your SoC. + config DRM_ETNAVIV_REGISTER_LOGGING bool "enable ETNAVIV register logging" depends on DRM_ETNAVIV diff --git a/drivers/gpu/drm/etnaviv/etnaviv_buffer.c b/drivers/gpu/drm/etnaviv/etnaviv_buffer.c index 9e7098e3207f..99ad2f073c6e 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_buffer.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_buffer.c @@ -100,6 +100,8 @@ static void etnaviv_cmd_select_pipe(struct etnaviv_gpu *gpu, { u32 flush = 0; + lockdep_assert_held(&gpu->lock); + /* * This assumes that if we're switching to 2D, we're switching * away from 3D, and vice versa. 
Hence, if we're switching to @@ -164,7 +166,9 @@ static u32 etnaviv_buffer_reserve(struct etnaviv_gpu *gpu, u16 etnaviv_buffer_init(struct etnaviv_gpu *gpu) { - struct etnaviv_cmdbuf *buffer = gpu->buffer; + struct etnaviv_cmdbuf *buffer = &gpu->buffer; + + lockdep_assert_held(&gpu->lock); /* initialize buffer */ buffer->user_size = 0; @@ -178,7 +182,9 @@ u16 etnaviv_buffer_init(struct etnaviv_gpu *gpu) u16 etnaviv_buffer_config_mmuv2(struct etnaviv_gpu *gpu, u32 mtlb_addr, u32 safe_addr) { - struct etnaviv_cmdbuf *buffer = gpu->buffer; + struct etnaviv_cmdbuf *buffer = &gpu->buffer; + + lockdep_assert_held(&gpu->lock); buffer->user_size = 0; @@ -211,10 +217,12 @@ u16 etnaviv_buffer_config_mmuv2(struct etnaviv_gpu *gpu, u32 mtlb_addr, u32 safe void etnaviv_buffer_end(struct etnaviv_gpu *gpu) { - struct etnaviv_cmdbuf *buffer = gpu->buffer; + struct etnaviv_cmdbuf *buffer = &gpu->buffer; unsigned int waitlink_offset = buffer->user_size - 16; u32 link_target, flush = 0; + lockdep_assert_held(&gpu->lock); + if (gpu->exec_state == ETNA_PIPE_2D) flush = VIVS_GL_FLUSH_CACHE_PE2D; else if (gpu->exec_state == ETNA_PIPE_3D) @@ -253,10 +261,12 @@ void etnaviv_buffer_end(struct etnaviv_gpu *gpu) /* Append a 'sync point' to the ring buffer. */ void etnaviv_sync_point_queue(struct etnaviv_gpu *gpu, unsigned int event) { - struct etnaviv_cmdbuf *buffer = gpu->buffer; + struct etnaviv_cmdbuf *buffer = &gpu->buffer; unsigned int waitlink_offset = buffer->user_size - 16; u32 dwords, target; + lockdep_assert_held(&gpu->lock); + /* * We need at most 3 dwords in the return target: * 1 event + 1 end + 1 wait + 1 link. @@ -287,13 +297,16 @@ void etnaviv_sync_point_queue(struct etnaviv_gpu *gpu, unsigned int event) } /* Append a command buffer to the ring buffer. */ -void etnaviv_buffer_queue(struct etnaviv_gpu *gpu, unsigned int event, - struct etnaviv_cmdbuf *cmdbuf) +void etnaviv_buffer_queue(struct etnaviv_gpu *gpu, u32 exec_state, + unsigned int event, struct etnaviv_cmdbuf *cmdbuf) { - struct etnaviv_cmdbuf *buffer = gpu->buffer; + struct etnaviv_cmdbuf *buffer = &gpu->buffer; unsigned int waitlink_offset = buffer->user_size - 16; u32 return_target, return_dwords; u32 link_target, link_dwords; + bool switch_context = gpu->exec_state != exec_state; + + lockdep_assert_held(&gpu->lock); if (drm_debug & DRM_UT_DRIVER) etnaviv_buffer_dump(gpu, buffer, 0, 0x50); @@ -306,7 +319,7 @@ void etnaviv_buffer_queue(struct etnaviv_gpu *gpu, unsigned int event, * need to append a mmu flush load state, followed by a new * link to this buffer - a total of four additional words. 
*/ - if (gpu->mmu->need_flush || gpu->switch_context) { + if (gpu->mmu->need_flush || switch_context) { u32 target, extra_dwords; /* link command */ @@ -321,7 +334,7 @@ void etnaviv_buffer_queue(struct etnaviv_gpu *gpu, unsigned int event, } /* pipe switch commands */ - if (gpu->switch_context) + if (switch_context) extra_dwords += 4; target = etnaviv_buffer_reserve(gpu, buffer, extra_dwords); @@ -349,10 +362,9 @@ void etnaviv_buffer_queue(struct etnaviv_gpu *gpu, unsigned int event, gpu->mmu->need_flush = false; } - if (gpu->switch_context) { - etnaviv_cmd_select_pipe(gpu, buffer, cmdbuf->exec_state); - gpu->exec_state = cmdbuf->exec_state; - gpu->switch_context = false; + if (switch_context) { + etnaviv_cmd_select_pipe(gpu, buffer, exec_state); + gpu->exec_state = exec_state; } /* And the link to the submitted buffer */ @@ -421,4 +433,6 @@ void etnaviv_buffer_queue(struct etnaviv_gpu *gpu, unsigned int event, if (drm_debug & DRM_UT_DRIVER) etnaviv_buffer_dump(gpu, buffer, 0, 0x50); + + gpu->lastctx = cmdbuf->ctx; } diff --git a/drivers/gpu/drm/etnaviv/etnaviv_cmd_parser.c b/drivers/gpu/drm/etnaviv/etnaviv_cmd_parser.c index 6e3bbcf24160..68e6d3772ad8 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_cmd_parser.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_cmd_parser.c @@ -78,6 +78,7 @@ static const struct { ST(0x17c0, 8), ST(0x17e0, 8), ST(0x2400, 14 * 16), + ST(0x3824, 1), ST(0x10800, 32 * 16), ST(0x14600, 16), ST(0x14800, 8 * 8), diff --git a/drivers/gpu/drm/etnaviv/etnaviv_cmdbuf.c b/drivers/gpu/drm/etnaviv/etnaviv_cmdbuf.c index 66ac79558bbd..3746827f45eb 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_cmdbuf.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_cmdbuf.c @@ -86,26 +86,11 @@ void etnaviv_cmdbuf_suballoc_destroy(struct etnaviv_cmdbuf_suballoc *suballoc) kfree(suballoc); } -struct etnaviv_cmdbuf * -etnaviv_cmdbuf_new(struct etnaviv_cmdbuf_suballoc *suballoc, u32 size, - size_t nr_bos, size_t nr_pmrs) +int etnaviv_cmdbuf_init(struct etnaviv_cmdbuf_suballoc *suballoc, + struct etnaviv_cmdbuf *cmdbuf, u32 size) { - struct etnaviv_cmdbuf *cmdbuf; - struct etnaviv_perfmon_request *pmrs; - size_t sz = size_vstruct(nr_bos, sizeof(cmdbuf->bo_map[0]), - sizeof(*cmdbuf)); int granule_offs, order, ret; - cmdbuf = kzalloc(sz, GFP_KERNEL); - if (!cmdbuf) - return NULL; - - sz = sizeof(*pmrs) * nr_pmrs; - pmrs = kzalloc(sz, GFP_KERNEL); - if (!pmrs) - goto out_free_cmdbuf; - - cmdbuf->pmrs = pmrs; cmdbuf->suballoc = suballoc; cmdbuf->size = size; @@ -123,7 +108,7 @@ retry: if (!ret) { dev_err(suballoc->gpu->dev, "Timeout waiting for cmdbuf space\n"); - return NULL; + return -ETIMEDOUT; } goto retry; } @@ -131,11 +116,7 @@ retry: cmdbuf->suballoc_offset = granule_offs * SUBALLOC_GRANULE; cmdbuf->vaddr = suballoc->vaddr + cmdbuf->suballoc_offset; - return cmdbuf; - -out_free_cmdbuf: - kfree(cmdbuf); - return NULL; + return 0; } void etnaviv_cmdbuf_free(struct etnaviv_cmdbuf *cmdbuf) @@ -151,8 +132,6 @@ void etnaviv_cmdbuf_free(struct etnaviv_cmdbuf *cmdbuf) suballoc->free_space = 1; mutex_unlock(&suballoc->lock); wake_up_all(&suballoc->free_event); - kfree(cmdbuf->pmrs); - kfree(cmdbuf); } u32 etnaviv_cmdbuf_get_va(struct etnaviv_cmdbuf *buf) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_cmdbuf.h b/drivers/gpu/drm/etnaviv/etnaviv_cmdbuf.h index b6348b9f2a9d..ddc3f7ea169c 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_cmdbuf.h +++ b/drivers/gpu/drm/etnaviv/etnaviv_cmdbuf.h @@ -33,27 +33,15 @@ struct etnaviv_cmdbuf { void *vaddr; u32 size; u32 user_size; - /* fence after which this buffer is to be disposed */ - struct 
dma_fence *fence; - /* target exec state */ - u32 exec_state; - /* per GPU in-flight list */ - struct list_head node; - /* perfmon requests */ - unsigned int nr_pmrs; - struct etnaviv_perfmon_request *pmrs; - /* BOs attached to this command buffer */ - unsigned int nr_bos; - struct etnaviv_vram_mapping *bo_map[0]; }; struct etnaviv_cmdbuf_suballoc * etnaviv_cmdbuf_suballoc_new(struct etnaviv_gpu * gpu); void etnaviv_cmdbuf_suballoc_destroy(struct etnaviv_cmdbuf_suballoc *suballoc); -struct etnaviv_cmdbuf * -etnaviv_cmdbuf_new(struct etnaviv_cmdbuf_suballoc *suballoc, u32 size, - size_t nr_bos, size_t nr_pmrs); + +int etnaviv_cmdbuf_init(struct etnaviv_cmdbuf_suballoc *suballoc, + struct etnaviv_cmdbuf *cmdbuf, u32 size); void etnaviv_cmdbuf_free(struct etnaviv_cmdbuf *cmdbuf); u32 etnaviv_cmdbuf_get_va(struct etnaviv_cmdbuf *buf); diff --git a/drivers/gpu/drm/etnaviv/etnaviv_drv.c b/drivers/gpu/drm/etnaviv/etnaviv_drv.c index 491eddf9b150..6faf4042db23 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_drv.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_drv.c @@ -172,7 +172,7 @@ static int etnaviv_mmu_show(struct etnaviv_gpu *gpu, struct seq_file *m) static void etnaviv_buffer_dump(struct etnaviv_gpu *gpu, struct seq_file *m) { - struct etnaviv_cmdbuf *buf = gpu->buffer; + struct etnaviv_cmdbuf *buf = &gpu->buffer; u32 size = buf->size; u32 *ptr = buf->vaddr; u32 i; @@ -459,9 +459,6 @@ static int etnaviv_ioctl_pm_query_dom(struct drm_device *dev, void *data, struct drm_etnaviv_pm_domain *args = data; struct etnaviv_gpu *gpu; - /* reject as long as the feature isn't stable */ - return -EINVAL; - if (args->pipe >= ETNA_MAX_PIPES) return -EINVAL; @@ -479,9 +476,6 @@ static int etnaviv_ioctl_pm_query_sig(struct drm_device *dev, void *data, struct drm_etnaviv_pm_signal *args = data; struct etnaviv_gpu *gpu; - /* reject as long as the feature isn't stable */ - return -EINVAL; - if (args->pipe >= ETNA_MAX_PIPES) return -EINVAL; @@ -556,7 +550,7 @@ static struct drm_driver etnaviv_drm_driver = { .desc = "etnaviv DRM", .date = "20151214", .major = 1, - .minor = 1, + .minor = 2, }; /* @@ -580,12 +574,6 @@ static int etnaviv_bind(struct device *dev) } drm->dev_private = priv; - priv->wq = alloc_ordered_workqueue("etnaviv", 0); - if (!priv->wq) { - ret = -ENOMEM; - goto out_wq; - } - mutex_init(&priv->gem_lock); INIT_LIST_HEAD(&priv->gem_list); priv->num_gpus = 0; @@ -607,9 +595,6 @@ static int etnaviv_bind(struct device *dev) out_register: component_unbind_all(dev, drm); out_bind: - flush_workqueue(priv->wq); - destroy_workqueue(priv->wq); -out_wq: kfree(priv); out_unref: drm_dev_unref(drm); @@ -624,9 +609,6 @@ static void etnaviv_unbind(struct device *dev) drm_dev_unregister(drm); - flush_workqueue(priv->wq); - destroy_workqueue(priv->wq); - component_unbind_all(dev, drm); drm->dev_private = NULL; diff --git a/drivers/gpu/drm/etnaviv/etnaviv_drv.h b/drivers/gpu/drm/etnaviv/etnaviv_drv.h index d249acb6da08..a54f0b758a5c 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_drv.h +++ b/drivers/gpu/drm/etnaviv/etnaviv_drv.h @@ -56,18 +56,8 @@ struct etnaviv_drm_private { /* list of GEM objects: */ struct mutex gem_lock; struct list_head gem_list; - - struct workqueue_struct *wq; }; -static inline void etnaviv_queue_work(struct drm_device *dev, - struct work_struct *w) -{ - struct etnaviv_drm_private *priv = dev->dev_private; - - queue_work(priv->wq, w); -} - int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data, struct drm_file *file); @@ -97,8 +87,8 @@ u16 etnaviv_buffer_init(struct etnaviv_gpu *gpu); u16 
etnaviv_buffer_config_mmuv2(struct etnaviv_gpu *gpu, u32 mtlb_addr, u32 safe_addr); void etnaviv_buffer_end(struct etnaviv_gpu *gpu); void etnaviv_sync_point_queue(struct etnaviv_gpu *gpu, unsigned int event); -void etnaviv_buffer_queue(struct etnaviv_gpu *gpu, unsigned int event, - struct etnaviv_cmdbuf *cmdbuf); +void etnaviv_buffer_queue(struct etnaviv_gpu *gpu, u32 exec_state, + unsigned int event, struct etnaviv_cmdbuf *cmdbuf); void etnaviv_validate_init(void); bool etnaviv_cmd_validate_one(struct etnaviv_gpu *gpu, u32 *stream, unsigned int size, diff --git a/drivers/gpu/drm/etnaviv/etnaviv_dump.c b/drivers/gpu/drm/etnaviv/etnaviv_dump.c index 2d955d7d7b6d..6d0909c589d1 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_dump.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_dump.c @@ -120,7 +120,7 @@ void etnaviv_core_dump(struct etnaviv_gpu *gpu) struct core_dump_iterator iter; struct etnaviv_vram_mapping *vram; struct etnaviv_gem_object *obj; - struct etnaviv_cmdbuf *cmd; + struct etnaviv_gem_submit *submit; unsigned int n_obj, n_bomap_pages; size_t file_size, mmu_size; __le64 *bomap, *bomap_start; @@ -132,11 +132,11 @@ void etnaviv_core_dump(struct etnaviv_gpu *gpu) n_bomap_pages = 0; file_size = ARRAY_SIZE(etnaviv_dump_registers) * sizeof(struct etnaviv_dump_registers) + - mmu_size + gpu->buffer->size; + mmu_size + gpu->buffer.size; /* Add in the active command buffers */ - list_for_each_entry(cmd, &gpu->active_cmd_list, node) { - file_size += cmd->size; + list_for_each_entry(submit, &gpu->active_submit_list, node) { + file_size += submit->cmdbuf.size; n_obj++; } @@ -176,13 +176,14 @@ void etnaviv_core_dump(struct etnaviv_gpu *gpu) etnaviv_core_dump_registers(&iter, gpu); etnaviv_core_dump_mmu(&iter, gpu, mmu_size); - etnaviv_core_dump_mem(&iter, ETDUMP_BUF_RING, gpu->buffer->vaddr, - gpu->buffer->size, - etnaviv_cmdbuf_get_va(gpu->buffer)); - - list_for_each_entry(cmd, &gpu->active_cmd_list, node) - etnaviv_core_dump_mem(&iter, ETDUMP_BUF_CMD, cmd->vaddr, - cmd->size, etnaviv_cmdbuf_get_va(cmd)); + etnaviv_core_dump_mem(&iter, ETDUMP_BUF_RING, gpu->buffer.vaddr, + gpu->buffer.size, + etnaviv_cmdbuf_get_va(&gpu->buffer)); + + list_for_each_entry(submit, &gpu->active_submit_list, node) + etnaviv_core_dump_mem(&iter, ETDUMP_BUF_CMD, + submit->cmdbuf.vaddr, submit->cmdbuf.size, + etnaviv_cmdbuf_get_va(&submit->cmdbuf)); /* Reserve space for the bomap */ if (n_bomap_pages) { diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index daee3f1196df..fcc969fa0e69 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -24,6 +24,9 @@ #include "etnaviv_gpu.h" #include "etnaviv_mmu.h" +static struct lock_class_key etnaviv_shm_lock_class; +static struct lock_class_key etnaviv_userptr_lock_class; + static void etnaviv_gem_scatter_map(struct etnaviv_gem_object *etnaviv_obj) { struct drm_device *dev = etnaviv_obj->base.dev; @@ -583,7 +586,7 @@ void etnaviv_gem_free_object(struct drm_gem_object *obj) kfree(etnaviv_obj); } -int etnaviv_gem_obj_add(struct drm_device *dev, struct drm_gem_object *obj) +void etnaviv_gem_obj_add(struct drm_device *dev, struct drm_gem_object *obj) { struct etnaviv_drm_private *priv = dev->dev_private; struct etnaviv_gem_object *etnaviv_obj = to_etnaviv_bo(obj); @@ -591,8 +594,6 @@ int etnaviv_gem_obj_add(struct drm_device *dev, struct drm_gem_object *obj) mutex_lock(&priv->gem_lock); list_add_tail(&etnaviv_obj->gem_node, &priv->gem_list); mutex_unlock(&priv->gem_lock); - - return 0; } static int 
etnaviv_gem_new_impl(struct drm_device *dev, u32 size, u32 flags, @@ -640,8 +641,9 @@ static int etnaviv_gem_new_impl(struct drm_device *dev, u32 size, u32 flags, return 0; } -static struct drm_gem_object *__etnaviv_gem_new(struct drm_device *dev, - u32 size, u32 flags) +/* convenience method to construct a GEM buffer object, and userspace handle */ +int etnaviv_gem_new_handle(struct drm_device *dev, struct drm_file *file, + u32 size, u32 flags, u32 *handle) { struct drm_gem_object *obj = NULL; int ret; @@ -653,6 +655,8 @@ static struct drm_gem_object *__etnaviv_gem_new(struct drm_device *dev, if (ret) goto fail; + lockdep_set_class(&to_etnaviv_bo(obj)->lock, &etnaviv_shm_lock_class); + ret = drm_gem_object_init(dev, obj, size); if (ret == 0) { struct address_space *mapping; @@ -660,7 +664,7 @@ static struct drm_gem_object *__etnaviv_gem_new(struct drm_device *dev, /* * Our buffers are kept pinned, so allocating them * from the MOVABLE zone is a really bad idea, and - * conflicts with CMA. See coments above new_inode() + * conflicts with CMA. See comments above new_inode() * why this is required _and_ expected if you're * going to pin these pages. */ @@ -672,33 +676,12 @@ static struct drm_gem_object *__etnaviv_gem_new(struct drm_device *dev, if (ret) goto fail; - return obj; - -fail: - drm_gem_object_put_unlocked(obj); - return ERR_PTR(ret); -} - -/* convenience method to construct a GEM buffer object, and userspace handle */ -int etnaviv_gem_new_handle(struct drm_device *dev, struct drm_file *file, - u32 size, u32 flags, u32 *handle) -{ - struct drm_gem_object *obj; - int ret; - - obj = __etnaviv_gem_new(dev, size, flags); - if (IS_ERR(obj)) - return PTR_ERR(obj); - - ret = etnaviv_gem_obj_add(dev, obj); - if (ret < 0) { - drm_gem_object_put_unlocked(obj); - return ret; - } + etnaviv_gem_obj_add(dev, obj); ret = drm_gem_handle_create(file, obj, handle); /* drop reference from allocate - handle holds it now */ +fail: drm_gem_object_put_unlocked(obj); return ret; @@ -722,139 +705,41 @@ int etnaviv_gem_new_private(struct drm_device *dev, size_t size, u32 flags, return 0; } -struct get_pages_work { - struct work_struct work; - struct mm_struct *mm; - struct task_struct *task; - struct etnaviv_gem_object *etnaviv_obj; -}; - -static struct page **etnaviv_gem_userptr_do_get_pages( - struct etnaviv_gem_object *etnaviv_obj, struct mm_struct *mm, struct task_struct *task) -{ - int ret = 0, pinned, npages = etnaviv_obj->base.size >> PAGE_SHIFT; - struct page **pvec; - uintptr_t ptr; - unsigned int flags = 0; - - pvec = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); - if (!pvec) - return ERR_PTR(-ENOMEM); - - if (!etnaviv_obj->userptr.ro) - flags |= FOLL_WRITE; - - pinned = 0; - ptr = etnaviv_obj->userptr.ptr; - - down_read(&mm->mmap_sem); - while (pinned < npages) { - ret = get_user_pages_remote(task, mm, ptr, npages - pinned, - flags, pvec + pinned, NULL, NULL); - if (ret < 0) - break; - - ptr += ret * PAGE_SIZE; - pinned += ret; - } - up_read(&mm->mmap_sem); - - if (ret < 0) { - release_pages(pvec, pinned); - kvfree(pvec); - return ERR_PTR(ret); - } - - return pvec; -} - -static void __etnaviv_gem_userptr_get_pages(struct work_struct *_work) -{ - struct get_pages_work *work = container_of(_work, typeof(*work), work); - struct etnaviv_gem_object *etnaviv_obj = work->etnaviv_obj; - struct page **pvec; - - pvec = etnaviv_gem_userptr_do_get_pages(etnaviv_obj, work->mm, work->task); - - mutex_lock(&etnaviv_obj->lock); - if (IS_ERR(pvec)) { - etnaviv_obj->userptr.work = ERR_CAST(pvec); - } 
else { - etnaviv_obj->userptr.work = NULL; - etnaviv_obj->pages = pvec; - } - - mutex_unlock(&etnaviv_obj->lock); - drm_gem_object_put_unlocked(&etnaviv_obj->base); - - mmput(work->mm); - put_task_struct(work->task); - kfree(work); -} - static int etnaviv_gem_userptr_get_pages(struct etnaviv_gem_object *etnaviv_obj) { struct page **pvec = NULL; - struct get_pages_work *work; - struct mm_struct *mm; - int ret, pinned, npages = etnaviv_obj->base.size >> PAGE_SHIFT; - - if (etnaviv_obj->userptr.work) { - if (IS_ERR(etnaviv_obj->userptr.work)) { - ret = PTR_ERR(etnaviv_obj->userptr.work); - etnaviv_obj->userptr.work = NULL; - } else { - ret = -EAGAIN; - } - return ret; - } + struct etnaviv_gem_userptr *userptr = &etnaviv_obj->userptr; + int ret, pinned = 0, npages = etnaviv_obj->base.size >> PAGE_SHIFT; - mm = get_task_mm(etnaviv_obj->userptr.task); - pinned = 0; - if (mm == current->mm) { - pvec = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); - if (!pvec) { - mmput(mm); - return -ENOMEM; - } - - pinned = __get_user_pages_fast(etnaviv_obj->userptr.ptr, npages, - !etnaviv_obj->userptr.ro, pvec); - if (pinned < 0) { - kvfree(pvec); - mmput(mm); - return pinned; - } - - if (pinned == npages) { - etnaviv_obj->pages = pvec; - mmput(mm); - return 0; - } - } + might_lock_read(&current->mm->mmap_sem); - release_pages(pvec, pinned); - kvfree(pvec); + if (userptr->mm != current->mm) - return -EPERM; - work = kmalloc(sizeof(*work), GFP_KERNEL); - if (!work) { - mmput(mm); + pvec = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); + if (!pvec) return -ENOMEM; - } - get_task_struct(current); - drm_gem_object_get(&etnaviv_obj->base); + do { + unsigned num_pages = npages - pinned; + uint64_t ptr = userptr->ptr + pinned * PAGE_SIZE; + struct page **pages = pvec + pinned; - work->mm = mm; - work->task = current; - work->etnaviv_obj = etnaviv_obj; + ret = get_user_pages_fast(ptr, num_pages, - !userptr->ro ? 
FOLL_WRITE : 0, pages); + if (ret < 0) { + release_pages(pvec, pinned); + kvfree(pvec); + return ret; + } + + pinned += ret; - etnaviv_obj->userptr.work = &work->work; - INIT_WORK(&work->work, __etnaviv_gem_userptr_get_pages); + } while (pinned < npages); - etnaviv_queue_work(etnaviv_obj->base.dev, &work->work); + etnaviv_obj->pages = pvec; - return -EAGAIN; + return 0; } static void etnaviv_gem_userptr_release(struct etnaviv_gem_object *etnaviv_obj) @@ -870,7 +755,6 @@ static void etnaviv_gem_userptr_release(struct etnaviv_gem_object *etnaviv_obj) release_pages(etnaviv_obj->pages, npages); kvfree(etnaviv_obj->pages); } - put_task_struct(etnaviv_obj->userptr.task); } static int etnaviv_gem_userptr_mmap_obj(struct etnaviv_gem_object *etnaviv_obj, @@ -897,17 +781,16 @@ int etnaviv_gem_new_userptr(struct drm_device *dev, struct drm_file *file, if (ret) return ret; + lockdep_set_class(&etnaviv_obj->lock, &etnaviv_userptr_lock_class); + etnaviv_obj->userptr.ptr = ptr; - etnaviv_obj->userptr.task = current; + etnaviv_obj->userptr.mm = current->mm; etnaviv_obj->userptr.ro = !(flags & ETNA_USERPTR_WRITE); - get_task_struct(current); - ret = etnaviv_gem_obj_add(dev, &etnaviv_obj->base); - if (ret) - goto unreference; + etnaviv_gem_obj_add(dev, &etnaviv_obj->base); ret = drm_gem_handle_create(file, &etnaviv_obj->base, handle); -unreference: + /* drop reference from allocate - handle holds it now */ drm_gem_object_put_unlocked(&etnaviv_obj->base); return ret; diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.h b/drivers/gpu/drm/etnaviv/etnaviv_gem.h index e437fba1209d..be72a9833f2b 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.h +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.h @@ -18,6 +18,7 @@ #define __ETNAVIV_GEM_H__ #include <linux/reservation.h> +#include "etnaviv_cmdbuf.h" #include "etnaviv_drv.h" struct dma_fence; @@ -26,8 +27,7 @@ struct etnaviv_gem_object; struct etnaviv_gem_userptr { uintptr_t ptr; - struct task_struct *task; - struct work_struct *work; + struct mm_struct *mm; bool ro; }; @@ -98,26 +98,32 @@ struct etnaviv_gem_submit_bo { /* Created per submit-ioctl, to track bo's and cmdstream bufs, etc, * associated with the cmdstream submission for synchronization (and - * make it easier to unwind when things go wrong, etc). This only - * lasts for the duration of the submit-ioctl. + * make it easier to unwind when things go wrong, etc). */ struct etnaviv_gem_submit { - struct drm_device *dev; + struct kref refcount; struct etnaviv_gpu *gpu; - struct ww_acquire_ctx ticket; - struct dma_fence *fence; + struct dma_fence *out_fence, *in_fence; + struct list_head node; /* GPU active submit list */ + struct etnaviv_cmdbuf cmdbuf; + bool runtime_resumed; + u32 exec_state; u32 flags; + unsigned int nr_pmrs; + struct etnaviv_perfmon_request *pmrs; unsigned int nr_bos; struct etnaviv_gem_submit_bo bos[0]; /* No new members here, the previous one is variable-length! 
*/ }; +void etnaviv_submit_put(struct etnaviv_gem_submit * submit); + int etnaviv_gem_wait_bo(struct etnaviv_gpu *gpu, struct drm_gem_object *obj, struct timespec *timeout); int etnaviv_gem_new_private(struct drm_device *dev, size_t size, u32 flags, struct reservation_object *robj, const struct etnaviv_gem_ops *ops, struct etnaviv_gem_object **res); -int etnaviv_gem_obj_add(struct drm_device *dev, struct drm_gem_object *obj); +void etnaviv_gem_obj_add(struct drm_device *dev, struct drm_gem_object *obj); struct page **etnaviv_gem_get_pages(struct etnaviv_gem_object *obj); void etnaviv_gem_put_pages(struct etnaviv_gem_object *obj); diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c index ae884723e9b1..5704305d41e6 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_prime.c @@ -19,6 +19,7 @@ #include "etnaviv_drv.h" #include "etnaviv_gem.h" +static struct lock_class_key etnaviv_prime_lock_class; struct sg_table *etnaviv_gem_prime_get_sg_table(struct drm_gem_object *obj) { @@ -125,6 +126,8 @@ struct drm_gem_object *etnaviv_gem_prime_import_sg_table(struct drm_device *dev, if (ret < 0) return ERR_PTR(ret); + lockdep_set_class(&etnaviv_obj->lock, &etnaviv_prime_lock_class); + npages = size / PAGE_SIZE; etnaviv_obj->sgt = sgt; @@ -139,9 +142,7 @@ struct drm_gem_object *etnaviv_gem_prime_import_sg_table(struct drm_device *dev, if (ret) goto fail; - ret = etnaviv_gem_obj_add(dev, &etnaviv_obj->base); - if (ret) - goto fail; + etnaviv_gem_obj_add(dev, &etnaviv_obj->base); return &etnaviv_obj->base; diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c index ff911541a190..1f8202bca061 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c @@ -33,22 +33,25 @@ #define BO_PINNED 0x2000 static struct etnaviv_gem_submit *submit_create(struct drm_device *dev, - struct etnaviv_gpu *gpu, size_t nr) + struct etnaviv_gpu *gpu, size_t nr_bos, size_t nr_pmrs) { struct etnaviv_gem_submit *submit; - size_t sz = size_vstruct(nr, sizeof(submit->bos[0]), sizeof(*submit)); + size_t sz = size_vstruct(nr_bos, sizeof(submit->bos[0]), sizeof(*submit)); - submit = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); - if (submit) { - submit->dev = dev; - submit->gpu = gpu; + submit = kzalloc(sz, GFP_KERNEL); + if (!submit) + return NULL; - /* initially, until copy_from_user() and bo lookup succeeds: */ - submit->nr_bos = 0; - submit->fence = NULL; - - ww_acquire_init(&submit->ticket, &reservation_ww_class); + submit->pmrs = kcalloc(nr_pmrs, sizeof(struct etnaviv_perfmon_request), + GFP_KERNEL); + if (!submit->pmrs) { + kfree(submit); + return NULL; } + submit->nr_pmrs = nr_pmrs; + + submit->gpu = gpu; + kref_init(&submit->refcount); return submit; } @@ -111,7 +114,8 @@ static void submit_unlock_object(struct etnaviv_gem_submit *submit, int i) } } -static int submit_lock_objects(struct etnaviv_gem_submit *submit) +static int submit_lock_objects(struct etnaviv_gem_submit *submit, + struct ww_acquire_ctx *ticket) { int contended, slow_locked = -1, i, ret = 0; @@ -126,7 +130,7 @@ retry: if (!(submit->bos[i].flags & BO_LOCKED)) { ret = ww_mutex_lock_interruptible(&etnaviv_obj->resv->lock, - &submit->ticket); + ticket); if (ret == -EALREADY) DRM_ERROR("BO at index %u already on submit list\n", i); @@ -136,7 +140,7 @@ retry: } } - ww_acquire_done(&submit->ticket); + ww_acquire_done(ticket); return 0; @@ -154,7 +158,7 @@ fail: 
/* we lost out in a seqno race, lock and retry.. */ ret = ww_mutex_lock_slow_interruptible(&etnaviv_obj->resv->lock, - &submit->ticket); + ticket); if (!ret) { submit->bos[contended].flags |= BO_LOCKED; slow_locked = contended; @@ -181,19 +185,33 @@ static int submit_fence_sync(const struct etnaviv_gem_submit *submit) break; } + if (submit->flags & ETNA_SUBMIT_FENCE_FD_IN) { + /* + * Wait if the fence is from a foreign context, or if the fence + * array contains any fence from a foreign context. + */ + if (!dma_fence_match_context(submit->in_fence, context)) + ret = dma_fence_wait(submit->in_fence, true); + } + return ret; } -static void submit_unpin_objects(struct etnaviv_gem_submit *submit) +static void submit_attach_object_fences(struct etnaviv_gem_submit *submit) { int i; for (i = 0; i < submit->nr_bos; i++) { - if (submit->bos[i].flags & BO_PINNED) - etnaviv_gem_mapping_unreference(submit->bos[i].mapping); + struct etnaviv_gem_object *etnaviv_obj = submit->bos[i].obj; + + if (submit->bos[i].flags & ETNA_SUBMIT_BO_WRITE) + reservation_object_add_excl_fence(etnaviv_obj->resv, + submit->out_fence); + else + reservation_object_add_shared_fence(etnaviv_obj->resv, + submit->out_fence); - submit->bos[i].mapping = NULL; - submit->bos[i].flags &= ~BO_PINNED; + submit_unlock_object(submit, i); } } @@ -211,6 +229,7 @@ static int submit_pin_objects(struct etnaviv_gem_submit *submit) ret = PTR_ERR(mapping); break; } + atomic_inc(&etnaviv_obj->gpu_active); submit->bos[i].flags |= BO_PINNED; submit->bos[i].mapping = mapping; @@ -285,13 +304,11 @@ static int submit_reloc(struct etnaviv_gem_submit *submit, void *stream, } static int submit_perfmon_validate(struct etnaviv_gem_submit *submit, - struct etnaviv_cmdbuf *cmdbuf, - const struct drm_etnaviv_gem_submit_pmr *pmrs, - u32 nr_pms) + u32 exec_state, const struct drm_etnaviv_gem_submit_pmr *pmrs) { u32 i; - for (i = 0; i < nr_pms; i++) { + for (i = 0; i < submit->nr_pmrs; i++) { const struct drm_etnaviv_gem_submit_pmr *r = pmrs + i; struct etnaviv_gem_submit_bo *bo; int ret; @@ -316,39 +333,65 @@ static int submit_perfmon_validate(struct etnaviv_gem_submit *submit, return -EINVAL; } - if (etnaviv_pm_req_validate(r, cmdbuf->exec_state)) { + if (etnaviv_pm_req_validate(r, exec_state)) { DRM_ERROR("perfmon request: domain or signal not valid"); return -EINVAL; } - cmdbuf->pmrs[i].flags = r->flags; - cmdbuf->pmrs[i].domain = r->domain; - cmdbuf->pmrs[i].signal = r->signal; - cmdbuf->pmrs[i].sequence = r->sequence; - cmdbuf->pmrs[i].offset = r->read_offset; - cmdbuf->pmrs[i].bo_vma = etnaviv_gem_vmap(&bo->obj->base); + submit->pmrs[i].flags = r->flags; + submit->pmrs[i].domain = r->domain; + submit->pmrs[i].signal = r->signal; + submit->pmrs[i].sequence = r->sequence; + submit->pmrs[i].offset = r->read_offset; + submit->pmrs[i].bo_vma = etnaviv_gem_vmap(&bo->obj->base); } return 0; } -static void submit_cleanup(struct etnaviv_gem_submit *submit) +static void submit_cleanup(struct kref *kref) { + struct etnaviv_gem_submit *submit = + container_of(kref, struct etnaviv_gem_submit, refcount); unsigned i; + if (submit->runtime_resumed) + pm_runtime_put_autosuspend(submit->gpu->dev); + + if (submit->cmdbuf.suballoc) + etnaviv_cmdbuf_free(&submit->cmdbuf); + for (i = 0; i < submit->nr_bos; i++) { struct etnaviv_gem_object *etnaviv_obj = submit->bos[i].obj; + /* unpin all objects */ + if (submit->bos[i].flags & BO_PINNED) { + etnaviv_gem_mapping_unreference(submit->bos[i].mapping); + atomic_dec(&etnaviv_obj->gpu_active); + submit->bos[i].mapping = NULL; + 
submit->bos[i].flags &= ~BO_PINNED; + } + + /* if the GPU submit failed, objects might still be locked */ submit_unlock_object(submit, i); drm_gem_object_put_unlocked(&etnaviv_obj->base); } - ww_acquire_fini(&submit->ticket); - if (submit->fence) - dma_fence_put(submit->fence); + wake_up_all(&submit->gpu->fence_event); + + if (submit->in_fence) + dma_fence_put(submit->in_fence); + if (submit->out_fence) + dma_fence_put(submit->out_fence); + kfree(submit->pmrs); kfree(submit); } +void etnaviv_submit_put(struct etnaviv_gem_submit *submit) +{ + kref_put(&submit->refcount, submit_cleanup); +} + int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data, struct drm_file *file) { @@ -358,10 +401,9 @@ int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data, struct drm_etnaviv_gem_submit_pmr *pmrs; struct drm_etnaviv_gem_submit_bo *bos; struct etnaviv_gem_submit *submit; - struct etnaviv_cmdbuf *cmdbuf; struct etnaviv_gpu *gpu; - struct dma_fence *in_fence = NULL; struct sync_file *sync_file = NULL; + struct ww_acquire_ctx ticket; int out_fence_fd = -1; void *stream; int ret; @@ -399,17 +441,11 @@ int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data, relocs = kvmalloc_array(args->nr_relocs, sizeof(*relocs), GFP_KERNEL); pmrs = kvmalloc_array(args->nr_pmrs, sizeof(*pmrs), GFP_KERNEL); stream = kvmalloc_array(1, args->stream_size, GFP_KERNEL); - cmdbuf = etnaviv_cmdbuf_new(gpu->cmdbuf_suballoc, - ALIGN(args->stream_size, 8) + 8, - args->nr_bos, args->nr_pmrs); - if (!bos || !relocs || !pmrs || !stream || !cmdbuf) { + if (!bos || !relocs || !pmrs || !stream) { ret = -ENOMEM; goto err_submit_cmds; } - cmdbuf->exec_state = args->exec_state; - cmdbuf->ctx = file->driver_priv; - ret = copy_from_user(bos, u64_to_user_ptr(args->bos), args->nr_bos * sizeof(*bos)); if (ret) { @@ -430,7 +466,6 @@ int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data, ret = -EFAULT; goto err_submit_cmds; } - cmdbuf->nr_pmrs = args->nr_pmrs; ret = copy_from_user(stream, u64_to_user_ptr(args->stream), args->stream_size); @@ -447,19 +482,28 @@ int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data, } } - submit = submit_create(dev, gpu, args->nr_bos); + ww_acquire_init(&ticket, &reservation_ww_class); + + submit = submit_create(dev, gpu, args->nr_bos, args->nr_pmrs); if (!submit) { ret = -ENOMEM; - goto err_submit_cmds; + goto err_submit_ww_acquire; } + ret = etnaviv_cmdbuf_init(gpu->cmdbuf_suballoc, &submit->cmdbuf, + ALIGN(args->stream_size, 8) + 8); + if (ret) + goto err_submit_objects; + + submit->cmdbuf.ctx = file->driver_priv; + submit->exec_state = args->exec_state; submit->flags = args->flags; ret = submit_lookup_objects(submit, file, bos, args->nr_bos); if (ret) goto err_submit_objects; - ret = submit_lock_objects(submit); + ret = submit_lock_objects(submit, &ticket); if (ret) goto err_submit_objects; @@ -470,21 +514,11 @@ int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data, } if (args->flags & ETNA_SUBMIT_FENCE_FD_IN) { - in_fence = sync_file_get_fence(args->fence_fd); - if (!in_fence) { + submit->in_fence = sync_file_get_fence(args->fence_fd); + if (!submit->in_fence) { ret = -EINVAL; goto err_submit_objects; } - - /* - * Wait if the fence is from a foreign context, or if the fence - * array contains any fence from a foreign context. 
- */ - if (!dma_fence_match_context(in_fence, gpu->fence_context)) { - ret = dma_fence_wait(in_fence, true); - if (ret) - goto err_submit_objects; - } } ret = submit_fence_sync(submit); @@ -493,25 +527,25 @@ int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data, ret = submit_pin_objects(submit); if (ret) - goto out; + goto err_submit_objects; ret = submit_reloc(submit, stream, args->stream_size / 4, relocs, args->nr_relocs); if (ret) - goto out; + goto err_submit_objects; - ret = submit_perfmon_validate(submit, cmdbuf, pmrs, args->nr_pmrs); + ret = submit_perfmon_validate(submit, args->exec_state, pmrs); if (ret) - goto out; + goto err_submit_objects; - memcpy(cmdbuf->vaddr, stream, args->stream_size); - cmdbuf->user_size = ALIGN(args->stream_size, 8); + memcpy(submit->cmdbuf.vaddr, stream, args->stream_size); + submit->cmdbuf.user_size = ALIGN(args->stream_size, 8); - ret = etnaviv_gpu_submit(gpu, submit, cmdbuf); + ret = etnaviv_gpu_submit(gpu, submit); if (ret) - goto out; + goto err_submit_objects; - cmdbuf = NULL; + submit_attach_object_fences(submit); if (args->flags & ETNA_SUBMIT_FENCE_FD_OUT) { /* @@ -520,39 +554,26 @@ int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data, * fence to the sync file here, eliminating the ENOMEM * possibility at this stage. */ - sync_file = sync_file_create(submit->fence); + sync_file = sync_file_create(submit->out_fence); if (!sync_file) { ret = -ENOMEM; - goto out; + goto err_submit_objects; } fd_install(out_fence_fd, sync_file->file); } args->fence_fd = out_fence_fd; - args->fence = submit->fence->seqno; - -out: - submit_unpin_objects(submit); - - /* - * If we're returning -EAGAIN, it may be due to the userptr code - * wanting to run its workqueue outside of any locks. Flush our - * workqueue to ensure that it is run in a timely manner. 
- */ - if (ret == -EAGAIN) - flush_workqueue(priv->wq); + args->fence = submit->out_fence->seqno; err_submit_objects: - if (in_fence) - dma_fence_put(in_fence); - submit_cleanup(submit); + etnaviv_submit_put(submit); + +err_submit_ww_acquire: + ww_acquire_fini(&ticket); err_submit_cmds: if (ret && (out_fence_fd >= 0)) put_unused_fd(out_fence_fd); - /* if we still own the cmdbuf */ - if (cmdbuf) - etnaviv_cmdbuf_free(cmdbuf); if (stream) kvfree(stream); if (bos) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c index e19cbe05da2a..21d0d22f1168 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c @@ -644,7 +644,7 @@ static void etnaviv_gpu_hw_init(struct etnaviv_gpu *gpu) prefetch = etnaviv_buffer_init(gpu); gpu_write(gpu, VIVS_HI_INTR_ENBL, ~0U); - etnaviv_gpu_start_fe(gpu, etnaviv_cmdbuf_get_va(gpu->buffer), + etnaviv_gpu_start_fe(gpu, etnaviv_cmdbuf_get_va(&gpu->buffer), prefetch); } @@ -717,15 +717,15 @@ int etnaviv_gpu_init(struct etnaviv_gpu *gpu) } /* Create buffer: */ - gpu->buffer = etnaviv_cmdbuf_new(gpu->cmdbuf_suballoc, PAGE_SIZE, 0, 0); - if (!gpu->buffer) { - ret = -ENOMEM; + ret = etnaviv_cmdbuf_init(gpu->cmdbuf_suballoc, &gpu->buffer, + PAGE_SIZE); + if (ret) { dev_err(gpu->dev, "could not create command buffer\n"); goto destroy_iommu; } if (gpu->mmu->version == ETNAVIV_IOMMU_V1 && - etnaviv_cmdbuf_get_va(gpu->buffer) > 0x80000000) { + etnaviv_cmdbuf_get_va(&gpu->buffer) > 0x80000000) { ret = -EINVAL; dev_err(gpu->dev, "command buffer outside valid memory window\n"); @@ -751,8 +751,7 @@ int etnaviv_gpu_init(struct etnaviv_gpu *gpu) return 0; free_buffer: - etnaviv_cmdbuf_free(gpu->buffer); - gpu->buffer = NULL; + etnaviv_cmdbuf_free(&gpu->buffer); destroy_iommu: etnaviv_iommu_destroy(gpu->mmu); gpu->mmu = NULL; @@ -958,7 +957,7 @@ static void recover_worker(struct work_struct *work) pm_runtime_put_autosuspend(gpu->dev); /* Retire the buffer objects in a work */ - etnaviv_queue_work(gpu->drm, &gpu->retire_work); + queue_work(gpu->wq, &gpu->retire_work); } static void hangcheck_timer_reset(struct etnaviv_gpu *gpu) @@ -994,7 +993,7 @@ static void hangcheck_handler(struct timer_list *t) dev_err(gpu->dev, " completed fence: %u\n", fence); dev_err(gpu->dev, " active fence: %u\n", gpu->active_fence); - etnaviv_queue_work(gpu->drm, &gpu->recover_work); + queue_work(gpu->wq, &gpu->recover_work); } /* if still more pending work, reset the hangcheck timer: */ @@ -1201,42 +1200,23 @@ static void retire_worker(struct work_struct *work) struct etnaviv_gpu *gpu = container_of(work, struct etnaviv_gpu, retire_work); u32 fence = gpu->completed_fence; - struct etnaviv_cmdbuf *cmdbuf, *tmp; - unsigned int i; + struct etnaviv_gem_submit *submit, *tmp; + LIST_HEAD(retire_list); mutex_lock(&gpu->lock); - list_for_each_entry_safe(cmdbuf, tmp, &gpu->active_cmd_list, node) { - if (!dma_fence_is_signaled(cmdbuf->fence)) + list_for_each_entry_safe(submit, tmp, &gpu->active_submit_list, node) { + if (!dma_fence_is_signaled(submit->out_fence)) break; - list_del(&cmdbuf->node); - dma_fence_put(cmdbuf->fence); - - for (i = 0; i < cmdbuf->nr_bos; i++) { - struct etnaviv_vram_mapping *mapping = cmdbuf->bo_map[i]; - struct etnaviv_gem_object *etnaviv_obj = mapping->object; - - atomic_dec(&etnaviv_obj->gpu_active); - /* drop the refcount taken in etnaviv_gpu_submit */ - etnaviv_gem_mapping_unreference(mapping); - } - - etnaviv_cmdbuf_free(cmdbuf); - /* - * We need to balance the runtime PM count caused by - * each submission. 
Upon submission, we increment - * the runtime PM counter, and allocate one event. - * So here, we put the runtime PM count for each - * completed event. - */ - pm_runtime_put_autosuspend(gpu->dev); + list_move(&submit->node, &retire_list); } gpu->retired_fence = fence; mutex_unlock(&gpu->lock); - wake_up_all(&gpu->fence_event); + list_for_each_entry_safe(submit, tmp, &retire_list, node) + etnaviv_submit_put(submit); } int etnaviv_gpu_wait_fence_interruptible(struct etnaviv_gpu *gpu, @@ -1295,41 +1275,25 @@ int etnaviv_gpu_wait_obj_inactive(struct etnaviv_gpu *gpu, ret = wait_event_interruptible_timeout(gpu->fence_event, !is_active(etnaviv_obj), remaining); - if (ret > 0) { - struct etnaviv_drm_private *priv = gpu->drm->dev_private; - - /* Synchronise with the retire worker */ - flush_workqueue(priv->wq); + if (ret > 0) return 0; - } else if (ret == -ERESTARTSYS) { + else if (ret == -ERESTARTSYS) return -ERESTARTSYS; - } else { + else return -ETIMEDOUT; - } -} - -int etnaviv_gpu_pm_get_sync(struct etnaviv_gpu *gpu) -{ - return pm_runtime_get_sync(gpu->dev); -} - -void etnaviv_gpu_pm_put(struct etnaviv_gpu *gpu) -{ - pm_runtime_mark_last_busy(gpu->dev); - pm_runtime_put_autosuspend(gpu->dev); } static void sync_point_perfmon_sample(struct etnaviv_gpu *gpu, struct etnaviv_event *event, unsigned int flags) { - const struct etnaviv_cmdbuf *cmdbuf = event->cmdbuf; + const struct etnaviv_gem_submit *submit = event->submit; unsigned int i; - for (i = 0; i < cmdbuf->nr_pmrs; i++) { - const struct etnaviv_perfmon_request *pmr = cmdbuf->pmrs + i; + for (i = 0; i < submit->nr_pmrs; i++) { + const struct etnaviv_perfmon_request *pmr = submit->pmrs + i; if (pmr->flags == flags) - etnaviv_perfmon_process(gpu, pmr); + etnaviv_perfmon_process(gpu, pmr, submit->exec_state); } } @@ -1354,14 +1318,14 @@ static void sync_point_perfmon_sample_pre(struct etnaviv_gpu *gpu, static void sync_point_perfmon_sample_post(struct etnaviv_gpu *gpu, struct etnaviv_event *event) { - const struct etnaviv_cmdbuf *cmdbuf = event->cmdbuf; + const struct etnaviv_gem_submit *submit = event->submit; unsigned int i; u32 val; sync_point_perfmon_sample(gpu, event, ETNA_PM_PROCESS_POST); - for (i = 0; i < cmdbuf->nr_pmrs; i++) { - const struct etnaviv_perfmon_request *pmr = cmdbuf->pmrs + i; + for (i = 0; i < submit->nr_pmrs; i++) { + const struct etnaviv_perfmon_request *pmr = submit->pmrs + i; *pmr->bo_vma = pmr->sequence; } @@ -1380,24 +1344,15 @@ static void sync_point_perfmon_sample_post(struct etnaviv_gpu *gpu, /* add bo's to gpu's ring, and kick gpu: */ int etnaviv_gpu_submit(struct etnaviv_gpu *gpu, - struct etnaviv_gem_submit *submit, struct etnaviv_cmdbuf *cmdbuf) + struct etnaviv_gem_submit *submit) { - struct dma_fence *fence; unsigned int i, nr_events = 1, event[3]; int ret; - ret = etnaviv_gpu_pm_get_sync(gpu); + ret = pm_runtime_get_sync(gpu->dev); if (ret < 0) return ret; - - /* - * TODO - * - * - flush - * - data endian - * - prefetch - * - */ + submit->runtime_resumed = true; /* * if there are performance monitor requests we need to have @@ -1406,19 +1361,19 @@ int etnaviv_gpu_submit(struct etnaviv_gpu *gpu, * - a sync point to re-configure gpu, process ETNA_PM_PROCESS_POST requests * and update the sequence number for userspace. 
*/ - if (cmdbuf->nr_pmrs) + if (submit->nr_pmrs) nr_events = 3; ret = event_alloc(gpu, nr_events, event); if (ret) { DRM_ERROR("no free events\n"); - goto out_pm_put; + return ret; } mutex_lock(&gpu->lock); - fence = etnaviv_gpu_fence_alloc(gpu); - if (!fence) { + submit->out_fence = etnaviv_gpu_fence_alloc(gpu); + if (!submit->out_fence) { for (i = 0; i < nr_events; i++) event_free(gpu, event[i]); @@ -1426,80 +1381,51 @@ int etnaviv_gpu_submit(struct etnaviv_gpu *gpu, goto out_unlock; } - gpu->event[event[0]].fence = fence; - submit->fence = dma_fence_get(fence); - gpu->active_fence = submit->fence->seqno; + gpu->active_fence = submit->out_fence->seqno; - if (gpu->lastctx != cmdbuf->ctx) { - gpu->mmu->need_flush = true; - gpu->switch_context = true; - gpu->lastctx = cmdbuf->ctx; - } - - if (cmdbuf->nr_pmrs) { + if (submit->nr_pmrs) { gpu->event[event[1]].sync_point = &sync_point_perfmon_sample_pre; - gpu->event[event[1]].cmdbuf = cmdbuf; + kref_get(&submit->refcount); + gpu->event[event[1]].submit = submit; etnaviv_sync_point_queue(gpu, event[1]); } - etnaviv_buffer_queue(gpu, event[0], cmdbuf); + kref_get(&submit->refcount); + gpu->event[event[0]].fence = submit->out_fence; + etnaviv_buffer_queue(gpu, submit->exec_state, event[0], + &submit->cmdbuf); - if (cmdbuf->nr_pmrs) { + if (submit->nr_pmrs) { gpu->event[event[2]].sync_point = &sync_point_perfmon_sample_post; - gpu->event[event[2]].cmdbuf = cmdbuf; + kref_get(&submit->refcount); + gpu->event[event[2]].submit = submit; etnaviv_sync_point_queue(gpu, event[2]); } - cmdbuf->fence = fence; - list_add_tail(&cmdbuf->node, &gpu->active_cmd_list); - - /* We're committed to adding this command buffer, hold a PM reference */ - pm_runtime_get_noresume(gpu->dev); - - for (i = 0; i < submit->nr_bos; i++) { - struct etnaviv_gem_object *etnaviv_obj = submit->bos[i].obj; + list_add_tail(&submit->node, &gpu->active_submit_list); - /* Each cmdbuf takes a refcount on the mapping */ - etnaviv_gem_mapping_reference(submit->bos[i].mapping); - cmdbuf->bo_map[i] = submit->bos[i].mapping; - atomic_inc(&etnaviv_obj->gpu_active); - - if (submit->bos[i].flags & ETNA_SUBMIT_BO_WRITE) - reservation_object_add_excl_fence(etnaviv_obj->resv, - fence); - else - reservation_object_add_shared_fence(etnaviv_obj->resv, - fence); - } - cmdbuf->nr_bos = submit->nr_bos; hangcheck_timer_reset(gpu); ret = 0; out_unlock: mutex_unlock(&gpu->lock); -out_pm_put: - etnaviv_gpu_pm_put(gpu); - return ret; } -static void etnaviv_process_sync_point(struct etnaviv_gpu *gpu, - struct etnaviv_event *event) -{ - u32 addr = gpu_read(gpu, VIVS_FE_DMA_ADDRESS); - - event->sync_point(gpu, event); - etnaviv_gpu_start_fe(gpu, addr + 2, 2); -} - static void sync_point_worker(struct work_struct *work) { struct etnaviv_gpu *gpu = container_of(work, struct etnaviv_gpu, sync_point_work); + struct etnaviv_event *event = &gpu->event[gpu->sync_point_event]; + u32 addr = gpu_read(gpu, VIVS_FE_DMA_ADDRESS); - etnaviv_process_sync_point(gpu, &gpu->event[gpu->sync_point_event]); + event->sync_point(gpu, event); + etnaviv_submit_put(event->submit); event_free(gpu, gpu->sync_point_event); + + /* restart FE last to avoid GPU and IRQ racing against this worker */ + etnaviv_gpu_start_fe(gpu, addr + 2, 2); } /* @@ -1550,7 +1476,7 @@ static irqreturn_t irq_handler(int irq, void *data) if (gpu->event[event].sync_point) { gpu->sync_point_event = event; - etnaviv_queue_work(gpu->drm, &gpu->sync_point_work); + queue_work(gpu->wq, &gpu->sync_point_work); } fence = gpu->event[event].fence; @@ -1576,7 +1502,7 @@ static 
irqreturn_t irq_handler(int irq, void *data) } /* Retire the buffer objects in a work */ - etnaviv_queue_work(gpu->drm, &gpu->retire_work); + queue_work(gpu->wq, &gpu->retire_work); ret = IRQ_HANDLED; } @@ -1653,9 +1579,11 @@ int etnaviv_gpu_wait_idle(struct etnaviv_gpu *gpu, unsigned int timeout_ms) static int etnaviv_gpu_hw_suspend(struct etnaviv_gpu *gpu) { - if (gpu->buffer) { + if (gpu->buffer.suballoc) { /* Replace the last WAIT with END */ + mutex_lock(&gpu->lock); etnaviv_buffer_end(gpu); + mutex_unlock(&gpu->lock); /* * We know that only the FE is busy here, this should @@ -1680,7 +1608,7 @@ static int etnaviv_gpu_hw_resume(struct etnaviv_gpu *gpu) etnaviv_gpu_update_clock(gpu); etnaviv_gpu_hw_init(gpu); - gpu->switch_context = true; + gpu->lastctx = NULL; gpu->exec_state = -1; mutex_unlock(&gpu->lock); @@ -1738,20 +1666,29 @@ static int etnaviv_gpu_bind(struct device *dev, struct device *master, struct etnaviv_gpu *gpu = dev_get_drvdata(dev); int ret; - if (IS_ENABLED(CONFIG_THERMAL)) { + if (IS_ENABLED(CONFIG_DRM_ETNAVIV_THERMAL)) { gpu->cooling = thermal_of_cooling_device_register(dev->of_node, (char *)dev_name(dev), gpu, &cooling_ops); if (IS_ERR(gpu->cooling)) return PTR_ERR(gpu->cooling); } + gpu->wq = alloc_ordered_workqueue(dev_name(dev), 0); + if (!gpu->wq) { + if (IS_ENABLED(CONFIG_DRM_ETNAVIV_THERMAL)) + thermal_cooling_device_unregister(gpu->cooling); + return -ENOMEM; + } + #ifdef CONFIG_PM ret = pm_runtime_get_sync(gpu->dev); #else ret = etnaviv_gpu_clk_enable(gpu); #endif if (ret < 0) { - thermal_cooling_device_unregister(gpu->cooling); + destroy_workqueue(gpu->wq); + if (IS_ENABLED(CONFIG_DRM_ETNAVIV_THERMAL)) + thermal_cooling_device_unregister(gpu->cooling); return ret; } @@ -1759,7 +1696,7 @@ static int etnaviv_gpu_bind(struct device *dev, struct device *master, gpu->fence_context = dma_fence_context_alloc(1); spin_lock_init(&gpu->fence_spinlock); - INIT_LIST_HEAD(&gpu->active_cmd_list); + INIT_LIST_HEAD(&gpu->active_submit_list); INIT_WORK(&gpu->retire_work, retire_worker); INIT_WORK(&gpu->sync_point_work, sync_point_worker); INIT_WORK(&gpu->recover_work, recover_worker); @@ -1784,6 +1721,9 @@ static void etnaviv_gpu_unbind(struct device *dev, struct device *master, hangcheck_disable(gpu); + flush_workqueue(gpu->wq); + destroy_workqueue(gpu->wq); + #ifdef CONFIG_PM pm_runtime_get_sync(gpu->dev); pm_runtime_put_sync_suspend(gpu->dev); @@ -1791,10 +1731,8 @@ static void etnaviv_gpu_unbind(struct device *dev, struct device *master, etnaviv_gpu_hw_suspend(gpu); #endif - if (gpu->buffer) { - etnaviv_cmdbuf_free(gpu->buffer); - gpu->buffer = NULL; - } + if (gpu->buffer.suballoc) + etnaviv_cmdbuf_free(&gpu->buffer); if (gpu->cmdbuf_suballoc) { etnaviv_cmdbuf_suballoc_destroy(gpu->cmdbuf_suballoc); @@ -1808,7 +1746,8 @@ static void etnaviv_gpu_unbind(struct device *dev, struct device *master, gpu->drm = NULL; - thermal_cooling_device_unregister(gpu->cooling); + if (IS_ENABLED(CONFIG_DRM_ETNAVIV_THERMAL)) + thermal_cooling_device_unregister(gpu->cooling); gpu->cooling = NULL; } @@ -1931,7 +1870,7 @@ static int etnaviv_gpu_rpm_resume(struct device *dev) return ret; /* Re-initialise the basic hardware state */ - if (gpu->drm && gpu->buffer) { + if (gpu->drm && gpu->buffer.suballoc) { ret = etnaviv_gpu_hw_resume(gpu); if (ret) { etnaviv_gpu_clk_disable(gpu); diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.h b/drivers/gpu/drm/etnaviv/etnaviv_gpu.h index 4f10f147297a..7623905210dc 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.h +++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.h 
@@ -20,6 +20,7 @@ #include <linux/clk.h> #include <linux/regulator/consumer.h> +#include "etnaviv_cmdbuf.h" #include "etnaviv_drv.h" struct etnaviv_gem_submit; @@ -89,7 +90,7 @@ struct etnaviv_chip_identity { struct etnaviv_event { struct dma_fence *fence; - struct etnaviv_cmdbuf *cmdbuf; + struct etnaviv_gem_submit *submit; void (*sync_point)(struct etnaviv_gpu *gpu, struct etnaviv_event *event); }; @@ -106,10 +107,10 @@ struct etnaviv_gpu { struct mutex lock; struct etnaviv_chip_identity identity; struct etnaviv_file_private *lastctx; - bool switch_context; + struct workqueue_struct *wq; /* 'ring'-buffer: */ - struct etnaviv_cmdbuf *buffer; + struct etnaviv_cmdbuf buffer; int exec_state; /* bus base address of memory */ @@ -122,7 +123,7 @@ struct etnaviv_gpu { spinlock_t event_spinlock; /* list of currently in-flight command buffers */ - struct list_head active_cmd_list; + struct list_head active_submit_list; u32 idle_mask; @@ -202,7 +203,7 @@ int etnaviv_gpu_wait_fence_interruptible(struct etnaviv_gpu *gpu, int etnaviv_gpu_wait_obj_inactive(struct etnaviv_gpu *gpu, struct etnaviv_gem_object *etnaviv_obj, struct timespec *timeout); int etnaviv_gpu_submit(struct etnaviv_gpu *gpu, - struct etnaviv_gem_submit *submit, struct etnaviv_cmdbuf *cmdbuf); + struct etnaviv_gem_submit *submit); int etnaviv_gpu_pm_get_sync(struct etnaviv_gpu *gpu); void etnaviv_gpu_pm_put(struct etnaviv_gpu *gpu); int etnaviv_gpu_wait_idle(struct etnaviv_gpu *gpu, unsigned int timeout_ms); diff --git a/drivers/gpu/drm/etnaviv/etnaviv_iommu.c b/drivers/gpu/drm/etnaviv/etnaviv_iommu.c index 14e24ac6573f..7a8c94731748 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_iommu.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_iommu.c @@ -70,9 +70,8 @@ static int __etnaviv_iommu_init(struct etnaviv_iommuv1_domain *etnaviv_domain) return -ENOMEM; } - for (i = 0; i < PT_ENTRIES; i++) - etnaviv_domain->pgtable_cpu[i] = - etnaviv_domain->base.bad_page_dma; + memset32(etnaviv_domain->pgtable_cpu, etnaviv_domain->base.bad_page_dma, + PT_ENTRIES); return 0; } diff --git a/drivers/gpu/drm/etnaviv/etnaviv_iommu_v2.c b/drivers/gpu/drm/etnaviv/etnaviv_iommu_v2.c index fc60fc8ddbf0..1e956e266aa3 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_iommu_v2.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_iommu_v2.c @@ -229,7 +229,7 @@ void etnaviv_iommuv2_restore(struct etnaviv_gpu *gpu) prefetch = etnaviv_buffer_config_mmuv2(gpu, (u32)etnaviv_domain->mtlb_dma, (u32)etnaviv_domain->base.bad_page_dma); - etnaviv_gpu_start_fe(gpu, (u32)etnaviv_cmdbuf_get_pa(gpu->buffer), + etnaviv_gpu_start_fe(gpu, (u32)etnaviv_cmdbuf_get_pa(&gpu->buffer), prefetch); etnaviv_gpu_wait_idle(gpu, 100); diff --git a/drivers/gpu/drm/etnaviv/etnaviv_mmu.c b/drivers/gpu/drm/etnaviv/etnaviv_mmu.c index 35074b944778..d113fe06e6b5 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_mmu.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_mmu.c @@ -263,18 +263,16 @@ int etnaviv_iommu_map_gem(struct etnaviv_iommu *mmu, if (iova < 0x80000000 - sg_dma_len(sgt->sgl)) { mapping->iova = iova; list_add_tail(&mapping->mmu_node, &mmu->mappings); - mutex_unlock(&mmu->lock); - return 0; + ret = 0; + goto unlock; } } node = &mapping->vram_node; ret = etnaviv_iommu_find_iova(mmu, node, etnaviv_obj->base.size); - if (ret < 0) { - mutex_unlock(&mmu->lock); - return ret; - } + if (ret < 0) + goto unlock; mmu->last_iova = node->start + etnaviv_obj->base.size; mapping->iova = node->start; @@ -283,12 +281,12 @@ int etnaviv_iommu_map_gem(struct etnaviv_iommu *mmu, if (ret < 0) { drm_mm_remove_node(node); - mutex_unlock(&mmu->lock); - 
return ret; + goto unlock; } list_add_tail(&mapping->mmu_node, &mmu->mappings); mmu->need_flush = true; +unlock: mutex_unlock(&mmu->lock); return ret; diff --git a/drivers/gpu/drm/etnaviv/etnaviv_perfmon.c b/drivers/gpu/drm/etnaviv/etnaviv_perfmon.c index 768f5aafdd18..26dddfc41aac 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_perfmon.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_perfmon.c @@ -479,9 +479,9 @@ int etnaviv_pm_req_validate(const struct drm_etnaviv_gem_submit_pmr *r, } void etnaviv_perfmon_process(struct etnaviv_gpu *gpu, - const struct etnaviv_perfmon_request *pmr) + const struct etnaviv_perfmon_request *pmr, u32 exec_state) { - const struct etnaviv_pm_domain_meta *meta = &doms_meta[gpu->exec_state]; + const struct etnaviv_pm_domain_meta *meta = &doms_meta[exec_state]; const struct etnaviv_pm_domain *dom; const struct etnaviv_pm_signal *sig; u32 *bo = pmr->bo_vma; diff --git a/drivers/gpu/drm/etnaviv/etnaviv_perfmon.h b/drivers/gpu/drm/etnaviv/etnaviv_perfmon.h index 35dce194cb00..c1653c64ab6b 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_perfmon.h +++ b/drivers/gpu/drm/etnaviv/etnaviv_perfmon.h @@ -44,6 +44,6 @@ int etnaviv_pm_req_validate(const struct drm_etnaviv_gem_submit_pmr *r, u32 exec_state); void etnaviv_perfmon_process(struct etnaviv_gpu *gpu, - const struct etnaviv_perfmon_request *pmr); + const struct etnaviv_perfmon_request *pmr, u32 exec_state); #endif /* __ETNAVIV_PERFMON_H__ */ diff --git a/drivers/gpu/drm/exynos/Kconfig b/drivers/gpu/drm/exynos/Kconfig index 5a7c9d8abd6b..735ce47688f9 100644 --- a/drivers/gpu/drm/exynos/Kconfig +++ b/drivers/gpu/drm/exynos/Kconfig @@ -95,26 +95,21 @@ config DRM_EXYNOS_G2D help Choose this option if you want to use Exynos G2D for DRM. -config DRM_EXYNOS_IPP - bool "Image Post Processor" - help - Choose this option if you want to use IPP feature for DRM. - config DRM_EXYNOS_FIMC bool "FIMC" - depends on DRM_EXYNOS_IPP && MFD_SYSCON + depends on BROKEN && MFD_SYSCON help Choose this option if you want to use Exynos FIMC for DRM. config DRM_EXYNOS_ROTATOR bool "Rotator" - depends on DRM_EXYNOS_IPP + depends on BROKEN help Choose this option if you want to use Exynos Rotator for DRM. config DRM_EXYNOS_GSC bool "GScaler" - depends on DRM_EXYNOS_IPP && ARCH_EXYNOS5 && VIDEO_SAMSUNG_EXYNOS_GSC=n + depends on BROKEN && ARCH_EXYNOS5 && VIDEO_SAMSUNG_EXYNOS_GSC=n help Choose this option if you want to use Exynos GSC for DRM. 
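
The central change in the etnaviv hunks above is the lifetime model: the short-lived submit struct and the separately tracked cmdbuf/perfmon allocations are merged into one refcounted etnaviv_gem_submit, with submit_cleanup() becoming the kref release function and every consumer that can outlive the ioctl (the perfmon sync points, the fence event, the GPU's active list) taking its own reference. A minimal sketch of that pattern, using hypothetical example_* names rather than the driver's exact code:

	#include <linux/kref.h>
	#include <linux/slab.h>

	struct example_submit {
		struct kref refcount;
		/* bos, cmdbuf, in/out fences, perfmon requests, ... */
	};

	static struct example_submit *example_submit_create(void)
	{
		struct example_submit *submit = kzalloc(sizeof(*submit), GFP_KERNEL);

		if (submit)
			kref_init(&submit->refcount);	/* refcount starts at 1 */
		return submit;
	}

	/* Runs exactly once, when the last reference is dropped:
	 * unpin BOs, free the cmdbuf suballocation, put the fences, free. */
	static void example_submit_release(struct kref *kref)
	{
		struct example_submit *submit =
			container_of(kref, struct example_submit, refcount);

		kfree(submit);
	}

	static void example_submit_put(struct example_submit *submit)
	{
		kref_put(&submit->refcount, example_submit_release);
	}

Each kref_get() in etnaviv_gpu_submit() is balanced by an etnaviv_submit_put() in the sync point or retire worker, so whichever user finishes last performs the cleanup.
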
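A second recurring change is the move from the driver-global workqueue (the now-deleted etnaviv_queue_work() helper) to a per-GPU ordered workqueue, so retire, recover and sync-point work for one GPU serializes against itself and can be drained at unbind time without touching the DRM device or other GPUs. A sketch of the bind/unbind pairing, again with hypothetical names:

	#include <linux/device.h>
	#include <linux/workqueue.h>

	struct example_gpu {
		struct workqueue_struct *wq;
		struct work_struct retire_work;
	};

	static int example_gpu_bind(struct example_gpu *gpu, struct device *dev)
	{
		/* Ordered: at most one work item runs at a time, in queue order. */
		gpu->wq = alloc_ordered_workqueue(dev_name(dev), 0);
		if (!gpu->wq)
			return -ENOMEM;
		return 0;
	}

	static void example_gpu_retire(struct example_gpu *gpu)
	{
		queue_work(gpu->wq, &gpu->retire_work);
	}

	static void example_gpu_unbind(struct example_gpu *gpu)
	{
		flush_workqueue(gpu->wq);	/* drain pending retire/recover work */
		destroy_workqueue(gpu->wq);
	}
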
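The submit path also folds the ETNA_SUBMIT_FENCE_FD_IN handling into submit_fence_sync(): an explicit in-fence only needs an actual CPU-side wait when it comes from a foreign fence context, since fences on the GPU's own timeline are already ordered by the ring. A sketch of that check, assuming a per-GPU fence context as in the driver:

	#include <linux/dma-fence.h>
	#include <linux/sync_file.h>

	/* Resolve an explicit-fencing fd; wait only if the fence (or any
	 * fence inside a fence array) was not created on our own timeline. */
	static int example_wait_in_fence(u64 own_context, int fence_fd)
	{
		struct dma_fence *in_fence = sync_file_get_fence(fence_fd);
		int ret = 0;

		if (!in_fence)
			return -EINVAL;

		if (!dma_fence_match_context(in_fence, own_context))
			ret = dma_fence_wait(in_fence, true);	/* interruptible */

		dma_fence_put(in_fence);
		return ret;
	}
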
diff --git a/drivers/gpu/drm/exynos/Makefile b/drivers/gpu/drm/exynos/Makefile index bdf4212dde7b..a51c5459bb13 100644 --- a/drivers/gpu/drm/exynos/Makefile +++ b/drivers/gpu/drm/exynos/Makefile @@ -18,7 +18,6 @@ exynosdrm-$(CONFIG_DRM_EXYNOS_MIXER) += exynos_mixer.o exynosdrm-$(CONFIG_DRM_EXYNOS_HDMI) += exynos_hdmi.o exynosdrm-$(CONFIG_DRM_EXYNOS_VIDI) += exynos_drm_vidi.o exynosdrm-$(CONFIG_DRM_EXYNOS_G2D) += exynos_drm_g2d.o -exynosdrm-$(CONFIG_DRM_EXYNOS_IPP) += exynos_drm_ipp.o exynosdrm-$(CONFIG_DRM_EXYNOS_FIMC) += exynos_drm_fimc.o exynosdrm-$(CONFIG_DRM_EXYNOS_ROTATOR) += exynos_drm_rotator.o exynosdrm-$(CONFIG_DRM_EXYNOS_GSC) += exynos_drm_gsc.o diff --git a/drivers/gpu/drm/exynos/exynos5433_drm_decon.c b/drivers/gpu/drm/exynos/exynos5433_drm_decon.c index 6be5b53c3b27..1c330f2a7a5d 100644 --- a/drivers/gpu/drm/exynos/exynos5433_drm_decon.c +++ b/drivers/gpu/drm/exynos/exynos5433_drm_decon.c @@ -21,13 +21,12 @@ #include <linux/pm_runtime.h> #include <linux/regmap.h> -#include <video/exynos5433_decon.h> - #include "exynos_drm_drv.h" #include "exynos_drm_crtc.h" #include "exynos_drm_fb.h" #include "exynos_drm_plane.h" #include "exynos_drm_iommu.h" +#include "regs-decon5433.h" #define DSD_CFG_MUX 0x1004 #define DSD_CFG_MUX_TE_UNMASK_GLOBAL BIT(13) @@ -744,11 +743,6 @@ static int exynos5433_decon_probe(struct platform_device *pdev) } res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) { - dev_err(dev, "cannot find IO resource\n"); - return -ENXIO; - } - ctx->addr = devm_ioremap_resource(dev, res); if (IS_ERR(ctx->addr)) { dev_err(dev, "ioremap failed\n"); diff --git a/drivers/gpu/drm/exynos/exynos7_drm_decon.c b/drivers/gpu/drm/exynos/exynos7_drm_decon.c index 615efcf7782a..3931d5e33fe0 100644 --- a/drivers/gpu/drm/exynos/exynos7_drm_decon.c +++ b/drivers/gpu/drm/exynos/exynos7_drm_decon.c @@ -25,13 +25,13 @@ #include <video/of_display_timing.h> #include <video/of_videomode.h> -#include <video/exynos7_decon.h> #include "exynos_drm_crtc.h" #include "exynos_drm_plane.h" #include "exynos_drm_drv.h" #include "exynos_drm_fb.h" #include "exynos_drm_iommu.h" +#include "regs-decon7.h" /* * DECON stands for Display and Enhancement controller. 
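The exynos5433 DECON hunk above can drop the explicit NULL check on the platform resource because devm_ioremap_resource() validates its resource argument itself, returning ERR_PTR(-EINVAL) and printing its own error message when the resource is NULL. The resulting probe idiom, sketched with a hypothetical driver:

	#include <linux/err.h>
	#include <linux/io.h>
	#include <linux/platform_device.h>

	static int example_probe(struct platform_device *pdev)
	{
		struct resource *res;
		void __iomem *base;

		/* No separate !res check needed: devm_ioremap_resource()
		 * handles a NULL resource and reports the error. */
		res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
		base = devm_ioremap_resource(&pdev->dev, res);
		if (IS_ERR(base))
			return PTR_ERR(base);

		/* ... use base ... */
		return 0;
	}
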
diff --git a/drivers/gpu/drm/exynos/exynos_drm_drv.c b/drivers/gpu/drm/exynos/exynos_drm_drv.c index b96bd5a781b2..a518e9c6d6cc 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_drv.c +++ b/drivers/gpu/drm/exynos/exynos_drm_drv.c @@ -29,7 +29,6 @@ #include "exynos_drm_plane.h" #include "exynos_drm_vidi.h" #include "exynos_drm_g2d.h" -#include "exynos_drm_ipp.h" #include "exynos_drm_iommu.h" #define DRIVER_NAME "exynos" @@ -109,14 +108,6 @@ static const struct drm_ioctl_desc exynos_ioctls[] = { DRM_AUTH | DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(EXYNOS_G2D_EXEC, exynos_g2d_exec_ioctl, DRM_AUTH | DRM_RENDER_ALLOW), - DRM_IOCTL_DEF_DRV(EXYNOS_IPP_GET_PROPERTY, exynos_drm_ipp_get_property, - DRM_AUTH | DRM_RENDER_ALLOW), - DRM_IOCTL_DEF_DRV(EXYNOS_IPP_SET_PROPERTY, exynos_drm_ipp_set_property, - DRM_AUTH | DRM_RENDER_ALLOW), - DRM_IOCTL_DEF_DRV(EXYNOS_IPP_QUEUE_BUF, exynos_drm_ipp_queue_buf, - DRM_AUTH | DRM_RENDER_ALLOW), - DRM_IOCTL_DEF_DRV(EXYNOS_IPP_CMD_CTRL, exynos_drm_ipp_cmd_ctrl, - DRM_AUTH | DRM_RENDER_ALLOW), }; static const struct file_operations exynos_drm_driver_fops = { @@ -257,9 +248,6 @@ static struct exynos_drm_driver_info exynos_drm_drivers[] = { }, { DRV_PTR(gsc_driver, CONFIG_DRM_EXYNOS_GSC), }, { - DRV_PTR(ipp_driver, CONFIG_DRM_EXYNOS_IPP), - DRM_VIRTUAL_DEVICE - }, { &exynos_drm_platform_driver, DRM_VIRTUAL_DEVICE } diff --git a/drivers/gpu/drm/exynos/exynos_drm_drv.h b/drivers/gpu/drm/exynos/exynos_drm_drv.h index 589d465a7f88..df2262f70d91 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_drv.h +++ b/drivers/gpu/drm/exynos/exynos_drm_drv.h @@ -188,7 +188,6 @@ struct exynos_drm_g2d_private { struct drm_exynos_file_private { struct exynos_drm_g2d_private *g2d_priv; - struct device *ipp_dev; }; /* @@ -291,6 +290,5 @@ extern struct platform_driver g2d_driver; extern struct platform_driver fimc_driver; extern struct platform_driver rotator_driver; extern struct platform_driver gsc_driver; -extern struct platform_driver ipp_driver; extern struct platform_driver mic_driver; #endif diff --git a/drivers/gpu/drm/exynos/exynos_drm_ipp.c b/drivers/gpu/drm/exynos/exynos_drm_ipp.c deleted file mode 100644 index 3edda18cc2d2..000000000000 --- a/drivers/gpu/drm/exynos/exynos_drm_ipp.c +++ /dev/null @@ -1,1806 +0,0 @@ -/* - * Copyright (C) 2012 Samsung Electronics Co.Ltd - * Authors: - * Eunchul Kim <[email protected]> - * Jinyoung Jeon <[email protected]> - * Sangmin Lee <[email protected]> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - */ -#include <linux/kernel.h> -#include <linux/platform_device.h> -#include <linux/types.h> -#include <linux/clk.h> -#include <linux/pm_runtime.h> - -#include <drm/drmP.h> -#include <drm/exynos_drm.h> -#include "exynos_drm_drv.h" -#include "exynos_drm_gem.h" -#include "exynos_drm_ipp.h" -#include "exynos_drm_iommu.h" - -/* - * IPP stands for Image Post Processing and - * supports image scaler/rotator and input/output DMA operations. - * using FIMC, GSC, Rotator, so on. - * IPP is integration device driver of same attribute h/w - */ - -/* - * TODO - * 1. expand command control id. - * 2. integrate property and config. - * 3. removed send_event id check routine. - * 4. compare send_event id if needed. - * 5. free subdrv_remove notifier callback list if needed. - * 6. need to check subdrv_open about multi-open. - * 7. 
need to power_on implement power and sysmmu ctrl. - */ - -#define get_ipp_context(dev) platform_get_drvdata(to_platform_device(dev)) -#define ipp_is_m2m_cmd(c) (c == IPP_CMD_M2M) - -/* - * A structure of event. - * - * @base: base of event. - * @event: ipp event. - */ -struct drm_exynos_ipp_send_event { - struct drm_pending_event base; - struct drm_exynos_ipp_event event; -}; - -/* - * A structure of memory node. - * - * @list: list head to memory queue information. - * @ops_id: id of operations. - * @prop_id: id of property. - * @buf_id: id of buffer. - * @buf_info: gem objects and dma address, size. - * @filp: a pointer to drm_file. - */ -struct drm_exynos_ipp_mem_node { - struct list_head list; - enum drm_exynos_ops_id ops_id; - u32 prop_id; - u32 buf_id; - struct drm_exynos_ipp_buf_info buf_info; -}; - -/* - * A structure of ipp context. - * - * @subdrv: prepare initialization using subdrv. - * @ipp_lock: lock for synchronization of access to ipp_idr. - * @prop_lock: lock for synchronization of access to prop_idr. - * @ipp_idr: ipp driver idr. - * @prop_idr: property idr. - * @event_workq: event work queue. - * @cmd_workq: command work queue. - */ -struct ipp_context { - struct exynos_drm_subdrv subdrv; - struct mutex ipp_lock; - struct mutex prop_lock; - struct idr ipp_idr; - struct idr prop_idr; - struct workqueue_struct *event_workq; - struct workqueue_struct *cmd_workq; -}; - -static LIST_HEAD(exynos_drm_ippdrv_list); -static DEFINE_MUTEX(exynos_drm_ippdrv_lock); -static BLOCKING_NOTIFIER_HEAD(exynos_drm_ippnb_list); - -int exynos_drm_ippdrv_register(struct exynos_drm_ippdrv *ippdrv) -{ - mutex_lock(&exynos_drm_ippdrv_lock); - list_add_tail(&ippdrv->drv_list, &exynos_drm_ippdrv_list); - mutex_unlock(&exynos_drm_ippdrv_lock); - - return 0; -} - -int exynos_drm_ippdrv_unregister(struct exynos_drm_ippdrv *ippdrv) -{ - mutex_lock(&exynos_drm_ippdrv_lock); - list_del(&ippdrv->drv_list); - mutex_unlock(&exynos_drm_ippdrv_lock); - - return 0; -} - -static int ipp_create_id(struct idr *id_idr, struct mutex *lock, void *obj) -{ - int ret; - - mutex_lock(lock); - ret = idr_alloc(id_idr, obj, 1, 0, GFP_KERNEL); - mutex_unlock(lock); - - return ret; -} - -static void ipp_remove_id(struct idr *id_idr, struct mutex *lock, u32 id) -{ - mutex_lock(lock); - idr_remove(id_idr, id); - mutex_unlock(lock); -} - -static void *ipp_find_obj(struct idr *id_idr, struct mutex *lock, u32 id) -{ - void *obj; - - mutex_lock(lock); - obj = idr_find(id_idr, id); - mutex_unlock(lock); - - return obj; -} - -static int ipp_check_driver(struct exynos_drm_ippdrv *ippdrv, - struct drm_exynos_ipp_property *property) -{ - if (ippdrv->dedicated || (!ipp_is_m2m_cmd(property->cmd) && - !pm_runtime_suspended(ippdrv->dev))) - return -EBUSY; - - if (ippdrv->check_property && - ippdrv->check_property(ippdrv->dev, property)) - return -EINVAL; - - return 0; -} - -static struct exynos_drm_ippdrv *ipp_find_driver(struct ipp_context *ctx, - struct drm_exynos_ipp_property *property) -{ - struct exynos_drm_ippdrv *ippdrv; - u32 ipp_id = property->ipp_id; - int ret; - - if (ipp_id) { - ippdrv = ipp_find_obj(&ctx->ipp_idr, &ctx->ipp_lock, ipp_id); - if (!ippdrv) { - DRM_DEBUG("ipp%d driver not found\n", ipp_id); - return ERR_PTR(-ENODEV); - } - - ret = ipp_check_driver(ippdrv, property); - if (ret < 0) { - DRM_DEBUG("ipp%d driver check error %d\n", ipp_id, ret); - return ERR_PTR(ret); - } - - return ippdrv; - } else { - list_for_each_entry(ippdrv, &exynos_drm_ippdrv_list, drv_list) { - ret = ipp_check_driver(ippdrv, property); - if (ret 
== 0) - return ippdrv; - } - - DRM_DEBUG("cannot find driver suitable for given property.\n"); - } - - return ERR_PTR(-ENODEV); -} - -static struct exynos_drm_ippdrv *ipp_find_drv_by_handle(u32 prop_id) -{ - struct exynos_drm_ippdrv *ippdrv; - struct drm_exynos_ipp_cmd_node *c_node; - int count = 0; - - DRM_DEBUG_KMS("prop_id[%d]\n", prop_id); - - /* - * This case is search ipp driver by prop_id handle. - * sometimes, ipp subsystem find driver by prop_id. - * e.g PAUSE state, queue buf, command control. - */ - list_for_each_entry(ippdrv, &exynos_drm_ippdrv_list, drv_list) { - DRM_DEBUG_KMS("count[%d]ippdrv[%pK]\n", count++, ippdrv); - - mutex_lock(&ippdrv->cmd_lock); - list_for_each_entry(c_node, &ippdrv->cmd_list, list) { - if (c_node->property.prop_id == prop_id) { - mutex_unlock(&ippdrv->cmd_lock); - return ippdrv; - } - } - mutex_unlock(&ippdrv->cmd_lock); - } - - return ERR_PTR(-ENODEV); -} - -int exynos_drm_ipp_get_property(struct drm_device *drm_dev, void *data, - struct drm_file *file) -{ - struct drm_exynos_file_private *file_priv = file->driver_priv; - struct device *dev = file_priv->ipp_dev; - struct ipp_context *ctx = get_ipp_context(dev); - struct drm_exynos_ipp_prop_list *prop_list = data; - struct exynos_drm_ippdrv *ippdrv; - int count = 0; - - if (!ctx) { - DRM_ERROR("invalid context.\n"); - return -EINVAL; - } - - if (!prop_list) { - DRM_ERROR("invalid property parameter.\n"); - return -EINVAL; - } - - DRM_DEBUG_KMS("ipp_id[%d]\n", prop_list->ipp_id); - - if (!prop_list->ipp_id) { - list_for_each_entry(ippdrv, &exynos_drm_ippdrv_list, drv_list) - count++; - - /* - * Supports ippdrv list count for user application. - * First step user application getting ippdrv count. - * and second step getting ippdrv capability using ipp_id. - */ - prop_list->count = count; - } else { - /* - * Getting ippdrv capability by ipp_id. - * some device not supported wb, output interface. - * so, user application detect correct ipp driver - * using this ioctl. - */ - ippdrv = ipp_find_obj(&ctx->ipp_idr, &ctx->ipp_lock, - prop_list->ipp_id); - if (!ippdrv) { - DRM_ERROR("not found ipp%d driver.\n", - prop_list->ipp_id); - return -ENODEV; - } - - *prop_list = ippdrv->prop_list; - } - - return 0; -} - -static void ipp_print_property(struct drm_exynos_ipp_property *property, - int idx) -{ - struct drm_exynos_ipp_config *config = &property->config[idx]; - struct drm_exynos_pos *pos = &config->pos; - struct drm_exynos_sz *sz = &config->sz; - - DRM_DEBUG_KMS("prop_id[%d]ops[%s]fmt[0x%x]\n", - property->prop_id, idx ? 
"dst" : "src", config->fmt); - - DRM_DEBUG_KMS("pos[%d %d %d %d]sz[%d %d]f[%d]r[%d]\n", - pos->x, pos->y, pos->w, pos->h, - sz->hsize, sz->vsize, config->flip, config->degree); -} - -static struct drm_exynos_ipp_cmd_work *ipp_create_cmd_work(void) -{ - struct drm_exynos_ipp_cmd_work *cmd_work; - - cmd_work = kzalloc(sizeof(*cmd_work), GFP_KERNEL); - if (!cmd_work) - return ERR_PTR(-ENOMEM); - - INIT_WORK((struct work_struct *)cmd_work, ipp_sched_cmd); - - return cmd_work; -} - -static struct drm_exynos_ipp_event_work *ipp_create_event_work(void) -{ - struct drm_exynos_ipp_event_work *event_work; - - event_work = kzalloc(sizeof(*event_work), GFP_KERNEL); - if (!event_work) - return ERR_PTR(-ENOMEM); - - INIT_WORK(&event_work->work, ipp_sched_event); - - return event_work; -} - -int exynos_drm_ipp_set_property(struct drm_device *drm_dev, void *data, - struct drm_file *file) -{ - struct drm_exynos_file_private *file_priv = file->driver_priv; - struct device *dev = file_priv->ipp_dev; - struct ipp_context *ctx = get_ipp_context(dev); - struct drm_exynos_ipp_property *property = data; - struct exynos_drm_ippdrv *ippdrv; - struct drm_exynos_ipp_cmd_node *c_node; - u32 prop_id; - int ret, i; - - if (!ctx) { - DRM_ERROR("invalid context.\n"); - return -EINVAL; - } - - if (!property) { - DRM_ERROR("invalid property parameter.\n"); - return -EINVAL; - } - - prop_id = property->prop_id; - - /* - * This is log print for user application property. - * user application set various property. - */ - for_each_ipp_ops(i) - ipp_print_property(property, i); - - /* - * In case prop_id is not zero try to set existing property. - */ - if (prop_id) { - c_node = ipp_find_obj(&ctx->prop_idr, &ctx->prop_lock, prop_id); - - if (!c_node || c_node->filp != file) { - DRM_DEBUG_KMS("prop_id[%d] not found\n", prop_id); - return -EINVAL; - } - - if (c_node->state != IPP_STATE_STOP) { - DRM_DEBUG_KMS("prop_id[%d] not stopped\n", prop_id); - return -EINVAL; - } - - c_node->property = *property; - - return 0; - } - - /* find ipp driver using ipp id */ - ippdrv = ipp_find_driver(ctx, property); - if (IS_ERR(ippdrv)) { - DRM_ERROR("failed to get ipp driver.\n"); - return -EINVAL; - } - - /* allocate command node */ - c_node = kzalloc(sizeof(*c_node), GFP_KERNEL); - if (!c_node) - return -ENOMEM; - - ret = ipp_create_id(&ctx->prop_idr, &ctx->prop_lock, c_node); - if (ret < 0) { - DRM_ERROR("failed to create id.\n"); - goto err_clear; - } - property->prop_id = ret; - - DRM_DEBUG_KMS("created prop_id[%d]cmd[%d]ippdrv[%pK]\n", - property->prop_id, property->cmd, ippdrv); - - /* stored property information and ippdrv in private data */ - c_node->property = *property; - c_node->state = IPP_STATE_IDLE; - c_node->filp = file; - - c_node->start_work = ipp_create_cmd_work(); - if (IS_ERR(c_node->start_work)) { - DRM_ERROR("failed to create start work.\n"); - ret = PTR_ERR(c_node->start_work); - goto err_remove_id; - } - - c_node->stop_work = ipp_create_cmd_work(); - if (IS_ERR(c_node->stop_work)) { - DRM_ERROR("failed to create stop work.\n"); - ret = PTR_ERR(c_node->stop_work); - goto err_free_start; - } - - c_node->event_work = ipp_create_event_work(); - if (IS_ERR(c_node->event_work)) { - DRM_ERROR("failed to create event work.\n"); - ret = PTR_ERR(c_node->event_work); - goto err_free_stop; - } - - mutex_init(&c_node->lock); - mutex_init(&c_node->mem_lock); - mutex_init(&c_node->event_lock); - - init_completion(&c_node->start_complete); - init_completion(&c_node->stop_complete); - - for_each_ipp_ops(i) - 
INIT_LIST_HEAD(&c_node->mem_list[i]); - - INIT_LIST_HEAD(&c_node->event_list); - mutex_lock(&ippdrv->cmd_lock); - list_add_tail(&c_node->list, &ippdrv->cmd_list); - mutex_unlock(&ippdrv->cmd_lock); - - /* make dedicated state without m2m */ - if (!ipp_is_m2m_cmd(property->cmd)) - ippdrv->dedicated = true; - - return 0; - -err_free_stop: - kfree(c_node->stop_work); -err_free_start: - kfree(c_node->start_work); -err_remove_id: - ipp_remove_id(&ctx->prop_idr, &ctx->prop_lock, property->prop_id); -err_clear: - kfree(c_node); - return ret; -} - -static int ipp_validate_mem_node(struct drm_device *drm_dev, - struct drm_exynos_ipp_mem_node *m_node, - struct drm_exynos_ipp_cmd_node *c_node) -{ - struct drm_exynos_ipp_config *ipp_cfg; - unsigned int num_plane; - unsigned long size, buf_size = 0, plane_size, img_size = 0; - unsigned int bpp, width, height; - int i; - - ipp_cfg = &c_node->property.config[m_node->ops_id]; - num_plane = drm_format_num_planes(ipp_cfg->fmt); - - /** - * This is a rather simplified validation of a memory node. - * It basically verifies provided gem object handles - * and the buffer sizes with respect to current configuration. - * This is not the best that can be done - * but it seems more than enough - */ - for (i = 0; i < num_plane; ++i) { - width = ipp_cfg->sz.hsize; - height = ipp_cfg->sz.vsize; - bpp = drm_format_plane_cpp(ipp_cfg->fmt, i); - - /* - * The result of drm_format_plane_cpp() for chroma planes must - * be used with drm_format_xxxx_chroma_subsampling() for - * correct result. - */ - if (i > 0) { - width /= drm_format_horz_chroma_subsampling( - ipp_cfg->fmt); - height /= drm_format_vert_chroma_subsampling( - ipp_cfg->fmt); - } - plane_size = width * height * bpp; - img_size += plane_size; - - if (m_node->buf_info.handles[i]) { - size = exynos_drm_gem_get_size(drm_dev, - m_node->buf_info.handles[i], - c_node->filp); - if (plane_size > size) { - DRM_ERROR( - "buffer %d is smaller than required\n", - i); - return -EINVAL; - } - - buf_size += size; - } - } - - if (buf_size < img_size) { - DRM_ERROR("size of buffers(%lu) is smaller than image(%lu)\n", - buf_size, img_size); - return -EINVAL; - } - - return 0; -} - -static int ipp_put_mem_node(struct drm_device *drm_dev, - struct drm_exynos_ipp_cmd_node *c_node, - struct drm_exynos_ipp_mem_node *m_node) -{ - int i; - - DRM_DEBUG_KMS("node[%pK]\n", m_node); - - if (!m_node) { - DRM_ERROR("invalid dequeue node.\n"); - return -EFAULT; - } - - DRM_DEBUG_KMS("ops_id[%d]\n", m_node->ops_id); - - /* put gem buffer */ - for_each_ipp_planar(i) { - unsigned long handle = m_node->buf_info.handles[i]; - if (handle) - exynos_drm_gem_put_dma_addr(drm_dev, handle, - c_node->filp); - } - - list_del(&m_node->list); - kfree(m_node); - - return 0; -} - -static struct drm_exynos_ipp_mem_node - *ipp_get_mem_node(struct drm_device *drm_dev, - struct drm_exynos_ipp_cmd_node *c_node, - struct drm_exynos_ipp_queue_buf *qbuf) -{ - struct drm_exynos_ipp_mem_node *m_node; - struct drm_exynos_ipp_buf_info *buf_info; - int i; - - m_node = kzalloc(sizeof(*m_node), GFP_KERNEL); - if (!m_node) - return ERR_PTR(-ENOMEM); - - buf_info = &m_node->buf_info; - - /* operations, buffer id */ - m_node->ops_id = qbuf->ops_id; - m_node->prop_id = qbuf->prop_id; - m_node->buf_id = qbuf->buf_id; - INIT_LIST_HEAD(&m_node->list); - - DRM_DEBUG_KMS("m_node[%pK]ops_id[%d]\n", m_node, qbuf->ops_id); - DRM_DEBUG_KMS("prop_id[%d]buf_id[%d]\n", qbuf->prop_id, m_node->buf_id); - - for_each_ipp_planar(i) { - DRM_DEBUG_KMS("i[%d]handle[0x%x]\n", i, qbuf->handle[i]); - 
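The per-plane size rule enforced by ipp_validate_mem_node() above is worth spelling out: for planar YUV formats the chroma planes are smaller than the luma plane by the format's subsampling factors, so the minimum size of plane i is (width / hsub) * (height / vsub) * cpp. A minimal sketch of that computation, using the drm_fourcc helpers as they existed at the time of this patch (min_plane_size() itself is an illustrative name, not a kernel function):

    #include <drm/drm_fourcc.h>

    /* Minimum byte size plane @plane of format @fmt needs for a
     * @hsize x @vsize image; chroma planes (plane > 0) are scaled
     * down by the format's horizontal/vertical subsampling. */
    static unsigned long min_plane_size(u32 fmt, int plane,
                                        u32 hsize, u32 vsize)
    {
            u32 width = hsize, height = vsize;

            if (plane > 0) {
                    width /= drm_format_horz_chroma_subsampling(fmt);
                    height /= drm_format_vert_chroma_subsampling(fmt);
            }

            return (unsigned long)width * height *
                   drm_format_plane_cpp(fmt, plane);
    }

For NV12, for example, plane 1 (interleaved CbCr) is subsampled 2x2 with a cpp of 2, so a 64x64 image needs 64 * 64 = 4096 bytes of luma and 32 * 32 * 2 = 2048 bytes of chroma, which is exactly what the validation above checks each gem buffer against.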
- /* get dma address by handle */ - if (qbuf->handle[i]) { - dma_addr_t *addr; - - addr = exynos_drm_gem_get_dma_addr(drm_dev, - qbuf->handle[i], c_node->filp); - if (IS_ERR(addr)) { - DRM_ERROR("failed to get addr.\n"); - ipp_put_mem_node(drm_dev, c_node, m_node); - return ERR_PTR(-EFAULT); - } - - buf_info->handles[i] = qbuf->handle[i]; - buf_info->base[i] = *addr; - DRM_DEBUG_KMS("i[%d]base[%pad]hd[0x%lx]\n", i, - &buf_info->base[i], buf_info->handles[i]); - } - } - - mutex_lock(&c_node->mem_lock); - if (ipp_validate_mem_node(drm_dev, m_node, c_node)) { - ipp_put_mem_node(drm_dev, c_node, m_node); - mutex_unlock(&c_node->mem_lock); - return ERR_PTR(-EFAULT); - } - list_add_tail(&m_node->list, &c_node->mem_list[qbuf->ops_id]); - mutex_unlock(&c_node->mem_lock); - - return m_node; -} - -static void ipp_clean_mem_nodes(struct drm_device *drm_dev, - struct drm_exynos_ipp_cmd_node *c_node, int ops) -{ - struct drm_exynos_ipp_mem_node *m_node, *tm_node; - struct list_head *head = &c_node->mem_list[ops]; - - mutex_lock(&c_node->mem_lock); - - list_for_each_entry_safe(m_node, tm_node, head, list) { - int ret; - - ret = ipp_put_mem_node(drm_dev, c_node, m_node); - if (ret) - DRM_ERROR("failed to put m_node.\n"); - } - - mutex_unlock(&c_node->mem_lock); -} - -static int ipp_get_event(struct drm_device *drm_dev, - struct drm_exynos_ipp_cmd_node *c_node, - struct drm_exynos_ipp_queue_buf *qbuf) -{ - struct drm_exynos_ipp_send_event *e; - int ret; - - DRM_DEBUG_KMS("ops_id[%d]buf_id[%d]\n", qbuf->ops_id, qbuf->buf_id); - - e = kzalloc(sizeof(*e), GFP_KERNEL); - if (!e) - return -ENOMEM; - - /* make event */ - e->event.base.type = DRM_EXYNOS_IPP_EVENT; - e->event.base.length = sizeof(e->event); - e->event.user_data = qbuf->user_data; - e->event.prop_id = qbuf->prop_id; - e->event.buf_id[EXYNOS_DRM_OPS_DST] = qbuf->buf_id; - - ret = drm_event_reserve_init(drm_dev, c_node->filp, &e->base, &e->event.base); - if (ret) { - kfree(e); - return ret; - } - - mutex_lock(&c_node->event_lock); - list_add_tail(&e->base.link, &c_node->event_list); - mutex_unlock(&c_node->event_lock); - - return 0; -} - -static void ipp_put_event(struct drm_exynos_ipp_cmd_node *c_node, - struct drm_exynos_ipp_queue_buf *qbuf) -{ - struct drm_exynos_ipp_send_event *e, *te; - int count = 0; - - mutex_lock(&c_node->event_lock); - list_for_each_entry_safe(e, te, &c_node->event_list, base.link) { - DRM_DEBUG_KMS("count[%d]e[%pK]\n", count++, e); - - /* - * qbuf == NULL condition means all event deletion. - * stop operations want to delete all event list. - * another case delete only same buf id. 
- */ - if (!qbuf) { - /* delete list */ - list_del(&e->base.link); - kfree(e); - } - - /* compare buffer id */ - if (qbuf && (qbuf->buf_id == - e->event.buf_id[EXYNOS_DRM_OPS_DST])) { - /* delete list */ - list_del(&e->base.link); - kfree(e); - goto out_unlock; - } - } - -out_unlock: - mutex_unlock(&c_node->event_lock); - return; -} - -static void ipp_clean_cmd_node(struct ipp_context *ctx, - struct drm_exynos_ipp_cmd_node *c_node) -{ - int i; - - /* cancel works */ - cancel_work_sync(&c_node->start_work->work); - cancel_work_sync(&c_node->stop_work->work); - cancel_work_sync(&c_node->event_work->work); - - /* put event */ - ipp_put_event(c_node, NULL); - - for_each_ipp_ops(i) - ipp_clean_mem_nodes(ctx->subdrv.drm_dev, c_node, i); - - /* delete list */ - list_del(&c_node->list); - - ipp_remove_id(&ctx->prop_idr, &ctx->prop_lock, - c_node->property.prop_id); - - /* destroy mutex */ - mutex_destroy(&c_node->lock); - mutex_destroy(&c_node->mem_lock); - mutex_destroy(&c_node->event_lock); - - /* free command node */ - kfree(c_node->start_work); - kfree(c_node->stop_work); - kfree(c_node->event_work); - kfree(c_node); -} - -static bool ipp_check_mem_list(struct drm_exynos_ipp_cmd_node *c_node) -{ - switch (c_node->property.cmd) { - case IPP_CMD_WB: - return !list_empty(&c_node->mem_list[EXYNOS_DRM_OPS_DST]); - case IPP_CMD_OUTPUT: - return !list_empty(&c_node->mem_list[EXYNOS_DRM_OPS_SRC]); - case IPP_CMD_M2M: - default: - return !list_empty(&c_node->mem_list[EXYNOS_DRM_OPS_SRC]) && - !list_empty(&c_node->mem_list[EXYNOS_DRM_OPS_DST]); - } -} - -static struct drm_exynos_ipp_mem_node - *ipp_find_mem_node(struct drm_exynos_ipp_cmd_node *c_node, - struct drm_exynos_ipp_queue_buf *qbuf) -{ - struct drm_exynos_ipp_mem_node *m_node; - struct list_head *head; - int count = 0; - - DRM_DEBUG_KMS("buf_id[%d]\n", qbuf->buf_id); - - /* source/destination memory list */ - head = &c_node->mem_list[qbuf->ops_id]; - - /* find memory node from memory list */ - list_for_each_entry(m_node, head, list) { - DRM_DEBUG_KMS("count[%d]m_node[%pK]\n", count++, m_node); - - /* compare buffer id */ - if (m_node->buf_id == qbuf->buf_id) - return m_node; - } - - return NULL; -} - -static int ipp_set_mem_node(struct exynos_drm_ippdrv *ippdrv, - struct drm_exynos_ipp_cmd_node *c_node, - struct drm_exynos_ipp_mem_node *m_node) -{ - struct exynos_drm_ipp_ops *ops = NULL; - int ret = 0; - - DRM_DEBUG_KMS("node[%pK]\n", m_node); - - if (!m_node) { - DRM_ERROR("invalid queue node.\n"); - return -EFAULT; - } - - DRM_DEBUG_KMS("ops_id[%d]\n", m_node->ops_id); - - /* get operations callback */ - ops = ippdrv->ops[m_node->ops_id]; - if (!ops) { - DRM_ERROR("not support ops.\n"); - return -EFAULT; - } - - /* set address and enable irq */ - if (ops->set_addr) { - ret = ops->set_addr(ippdrv->dev, &m_node->buf_info, - m_node->buf_id, IPP_BUF_ENQUEUE); - if (ret) { - DRM_ERROR("failed to set addr.\n"); - return ret; - } - } - - return ret; -} - -static void ipp_handle_cmd_work(struct device *dev, - struct exynos_drm_ippdrv *ippdrv, - struct drm_exynos_ipp_cmd_work *cmd_work, - struct drm_exynos_ipp_cmd_node *c_node) -{ - struct ipp_context *ctx = get_ipp_context(dev); - - cmd_work->ippdrv = ippdrv; - cmd_work->c_node = c_node; - queue_work(ctx->cmd_workq, &cmd_work->work); -} - -static int ipp_queue_buf_with_run(struct device *dev, - struct drm_exynos_ipp_cmd_node *c_node, - struct drm_exynos_ipp_mem_node *m_node, - struct drm_exynos_ipp_queue_buf *qbuf) -{ - struct exynos_drm_ippdrv *ippdrv; - struct drm_exynos_ipp_property *property; - 
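ipp_get_event() above follows the standard two-phase DRM event pattern: reserve space on the file's event queue when the buffer is queued, then send the event later from the completion path. A stripped-down sketch of that pattern (the payload struct and names are illustrative; the removed UAPI used DRM_EXYNOS_IPP_EVENT as the type):

    #include <drm/drm_file.h>

    struct my_payload {
            struct drm_event base;  /* type/length header read by userspace */
            __u64 user_data;
    };

    struct my_event {
            struct drm_pending_event pending;
            struct my_payload payload;
    };

    static int my_event_reserve(struct drm_device *dev, struct drm_file *file,
                                struct my_event *e, __u64 user_data)
    {
            e->payload.base.type = 0x80000000;      /* driver-private type */
            e->payload.base.length = sizeof(e->payload);
            e->payload.user_data = user_data;

            /* Fails (e.g. -ENOMEM) if the file's event space is exhausted;
             * on success the event is charged to @file until sent. */
            return drm_event_reserve_init(dev, file, &e->pending,
                                          &e->payload.base);
    }

    /* Completion path:            drm_send_event(dev, &e->pending);
     * Error path before sending:  drm_event_cancel_free(dev, &e->pending); */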
struct exynos_drm_ipp_ops *ops; - int ret; - - ippdrv = ipp_find_drv_by_handle(qbuf->prop_id); - if (IS_ERR(ippdrv)) { - DRM_ERROR("failed to get ipp driver.\n"); - return -EFAULT; - } - - ops = ippdrv->ops[qbuf->ops_id]; - if (!ops) { - DRM_ERROR("failed to get ops.\n"); - return -EFAULT; - } - - property = &c_node->property; - - if (c_node->state != IPP_STATE_START) { - DRM_DEBUG_KMS("bypass for invalid state.\n"); - return 0; - } - - mutex_lock(&c_node->mem_lock); - if (!ipp_check_mem_list(c_node)) { - mutex_unlock(&c_node->mem_lock); - DRM_DEBUG_KMS("empty memory.\n"); - return 0; - } - - /* - * If set destination buffer and enabled clock, - * then m2m operations need start operations at queue_buf - */ - if (ipp_is_m2m_cmd(property->cmd)) { - struct drm_exynos_ipp_cmd_work *cmd_work = c_node->start_work; - - cmd_work->ctrl = IPP_CTRL_PLAY; - ipp_handle_cmd_work(dev, ippdrv, cmd_work, c_node); - } else { - ret = ipp_set_mem_node(ippdrv, c_node, m_node); - if (ret) { - mutex_unlock(&c_node->mem_lock); - DRM_ERROR("failed to set m node.\n"); - return ret; - } - } - mutex_unlock(&c_node->mem_lock); - - return 0; -} - -static void ipp_clean_queue_buf(struct drm_device *drm_dev, - struct drm_exynos_ipp_cmd_node *c_node, - struct drm_exynos_ipp_queue_buf *qbuf) -{ - struct drm_exynos_ipp_mem_node *m_node, *tm_node; - - /* delete list */ - mutex_lock(&c_node->mem_lock); - list_for_each_entry_safe(m_node, tm_node, - &c_node->mem_list[qbuf->ops_id], list) { - if (m_node->buf_id == qbuf->buf_id && - m_node->ops_id == qbuf->ops_id) - ipp_put_mem_node(drm_dev, c_node, m_node); - } - mutex_unlock(&c_node->mem_lock); -} - -int exynos_drm_ipp_queue_buf(struct drm_device *drm_dev, void *data, - struct drm_file *file) -{ - struct drm_exynos_file_private *file_priv = file->driver_priv; - struct device *dev = file_priv->ipp_dev; - struct ipp_context *ctx = get_ipp_context(dev); - struct drm_exynos_ipp_queue_buf *qbuf = data; - struct drm_exynos_ipp_cmd_node *c_node; - struct drm_exynos_ipp_mem_node *m_node; - int ret; - - if (!qbuf) { - DRM_ERROR("invalid buf parameter.\n"); - return -EINVAL; - } - - if (qbuf->ops_id >= EXYNOS_DRM_OPS_MAX) { - DRM_ERROR("invalid ops parameter.\n"); - return -EINVAL; - } - - DRM_DEBUG_KMS("prop_id[%d]ops_id[%s]buf_id[%d]buf_type[%d]\n", - qbuf->prop_id, qbuf->ops_id ? "dst" : "src", - qbuf->buf_id, qbuf->buf_type); - - /* find command node */ - c_node = ipp_find_obj(&ctx->prop_idr, &ctx->prop_lock, - qbuf->prop_id); - if (!c_node || c_node->filp != file) { - DRM_ERROR("failed to get command node.\n"); - return -ENODEV; - } - - /* buffer control */ - switch (qbuf->buf_type) { - case IPP_BUF_ENQUEUE: - /* get memory node */ - m_node = ipp_get_mem_node(drm_dev, c_node, qbuf); - if (IS_ERR(m_node)) { - DRM_ERROR("failed to get m_node.\n"); - return PTR_ERR(m_node); - } - - /* - * first step get event for destination buffer. - * and second step when M2M case run with destination buffer - * if needed. - */ - if (qbuf->ops_id == EXYNOS_DRM_OPS_DST) { - /* get event for destination buffer */ - ret = ipp_get_event(drm_dev, c_node, qbuf); - if (ret) { - DRM_ERROR("failed to get event.\n"); - goto err_clean_node; - } - - /* - * M2M case run play control for streaming feature. - * other case set address and waiting. 
- */ - ret = ipp_queue_buf_with_run(dev, c_node, m_node, qbuf); - if (ret) { - DRM_ERROR("failed to run command.\n"); - goto err_clean_node; - } - } - break; - case IPP_BUF_DEQUEUE: - mutex_lock(&c_node->lock); - - /* put event for destination buffer */ - if (qbuf->ops_id == EXYNOS_DRM_OPS_DST) - ipp_put_event(c_node, qbuf); - - ipp_clean_queue_buf(drm_dev, c_node, qbuf); - - mutex_unlock(&c_node->lock); - break; - default: - DRM_ERROR("invalid buffer control.\n"); - return -EINVAL; - } - - return 0; - -err_clean_node: - DRM_ERROR("clean memory nodes.\n"); - - ipp_clean_queue_buf(drm_dev, c_node, qbuf); - return ret; -} - -static bool exynos_drm_ipp_check_valid(struct device *dev, - enum drm_exynos_ipp_ctrl ctrl, enum drm_exynos_ipp_state state) -{ - if (ctrl != IPP_CTRL_PLAY) { - if (pm_runtime_suspended(dev)) { - DRM_ERROR("pm:runtime_suspended.\n"); - goto err_status; - } - } - - switch (ctrl) { - case IPP_CTRL_PLAY: - if (state != IPP_STATE_IDLE) - goto err_status; - break; - case IPP_CTRL_STOP: - if (state == IPP_STATE_STOP) - goto err_status; - break; - case IPP_CTRL_PAUSE: - if (state != IPP_STATE_START) - goto err_status; - break; - case IPP_CTRL_RESUME: - if (state != IPP_STATE_STOP) - goto err_status; - break; - default: - DRM_ERROR("invalid state.\n"); - goto err_status; - } - - return true; - -err_status: - DRM_ERROR("invalid status:ctrl[%d]state[%d]\n", ctrl, state); - return false; -} - -int exynos_drm_ipp_cmd_ctrl(struct drm_device *drm_dev, void *data, - struct drm_file *file) -{ - struct drm_exynos_file_private *file_priv = file->driver_priv; - struct exynos_drm_ippdrv *ippdrv = NULL; - struct device *dev = file_priv->ipp_dev; - struct ipp_context *ctx = get_ipp_context(dev); - struct drm_exynos_ipp_cmd_ctrl *cmd_ctrl = data; - struct drm_exynos_ipp_cmd_work *cmd_work; - struct drm_exynos_ipp_cmd_node *c_node; - - if (!ctx) { - DRM_ERROR("invalid context.\n"); - return -EINVAL; - } - - if (!cmd_ctrl) { - DRM_ERROR("invalid control parameter.\n"); - return -EINVAL; - } - - DRM_DEBUG_KMS("ctrl[%d]prop_id[%d]\n", - cmd_ctrl->ctrl, cmd_ctrl->prop_id); - - ippdrv = ipp_find_drv_by_handle(cmd_ctrl->prop_id); - if (IS_ERR(ippdrv)) { - DRM_ERROR("failed to get ipp driver.\n"); - return PTR_ERR(ippdrv); - } - - c_node = ipp_find_obj(&ctx->prop_idr, &ctx->prop_lock, - cmd_ctrl->prop_id); - if (!c_node || c_node->filp != file) { - DRM_ERROR("invalid command node list.\n"); - return -ENODEV; - } - - if (!exynos_drm_ipp_check_valid(ippdrv->dev, cmd_ctrl->ctrl, - c_node->state)) { - DRM_ERROR("invalid state.\n"); - return -EINVAL; - } - - switch (cmd_ctrl->ctrl) { - case IPP_CTRL_PLAY: - if (pm_runtime_suspended(ippdrv->dev)) - pm_runtime_get_sync(ippdrv->dev); - - c_node->state = IPP_STATE_START; - - cmd_work = c_node->start_work; - cmd_work->ctrl = cmd_ctrl->ctrl; - ipp_handle_cmd_work(dev, ippdrv, cmd_work, c_node); - break; - case IPP_CTRL_STOP: - cmd_work = c_node->stop_work; - cmd_work->ctrl = cmd_ctrl->ctrl; - ipp_handle_cmd_work(dev, ippdrv, cmd_work, c_node); - - if (!wait_for_completion_timeout(&c_node->stop_complete, - msecs_to_jiffies(300))) { - DRM_ERROR("timeout stop:prop_id[%d]\n", - c_node->property.prop_id); - } - - c_node->state = IPP_STATE_STOP; - ippdrv->dedicated = false; - mutex_lock(&ippdrv->cmd_lock); - ipp_clean_cmd_node(ctx, c_node); - - if (list_empty(&ippdrv->cmd_list)) - pm_runtime_put_sync(ippdrv->dev); - mutex_unlock(&ippdrv->cmd_lock); - break; - case IPP_CTRL_PAUSE: - cmd_work = c_node->stop_work; - cmd_work->ctrl = cmd_ctrl->ctrl; - 
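The STOP branch above (and the PAUSE branch that follows) uses the same handshake: queue the stop work, then block with a timeout until the worker calls complete() on stop_complete, which ipp_sched_cmd() further down does. The shape of the pattern, reduced to its essentials (all names illustrative):

    #include <linux/completion.h>
    #include <linux/errno.h>
    #include <linux/jiffies.h>
    #include <linux/workqueue.h>

    static struct workqueue_struct *cmd_wq;
    static DECLARE_COMPLETION(stop_done);

    static void stop_fn(struct work_struct *work)
    {
            /* ... tell the hardware to stop ... */
            complete(&stop_done);           /* wake the waiter below */
    }
    static DECLARE_WORK(stop_work, stop_fn);

    static int request_stop(void)
    {
            queue_work(cmd_wq, &stop_work);

            /* Returns 0 on timeout, remaining jiffies otherwise. */
            if (!wait_for_completion_timeout(&stop_done,
                                             msecs_to_jiffies(300)))
                    return -ETIMEDOUT;
            return 0;
    }

Note that the driver above only logs the timeout and forces the state to IPP_STATE_STOP regardless, rather than failing the ioctl as this sketch does.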
ipp_handle_cmd_work(dev, ippdrv, cmd_work, c_node); - - if (!wait_for_completion_timeout(&c_node->stop_complete, - msecs_to_jiffies(200))) { - DRM_ERROR("timeout stop:prop_id[%d]\n", - c_node->property.prop_id); - } - - c_node->state = IPP_STATE_STOP; - break; - case IPP_CTRL_RESUME: - c_node->state = IPP_STATE_START; - cmd_work = c_node->start_work; - cmd_work->ctrl = cmd_ctrl->ctrl; - ipp_handle_cmd_work(dev, ippdrv, cmd_work, c_node); - break; - default: - DRM_ERROR("could not support this state currently.\n"); - return -EINVAL; - } - - DRM_DEBUG_KMS("done ctrl[%d]prop_id[%d]\n", - cmd_ctrl->ctrl, cmd_ctrl->prop_id); - - return 0; -} - -int exynos_drm_ippnb_register(struct notifier_block *nb) -{ - return blocking_notifier_chain_register( - &exynos_drm_ippnb_list, nb); -} - -int exynos_drm_ippnb_unregister(struct notifier_block *nb) -{ - return blocking_notifier_chain_unregister( - &exynos_drm_ippnb_list, nb); -} - -int exynos_drm_ippnb_send_event(unsigned long val, void *v) -{ - return blocking_notifier_call_chain( - &exynos_drm_ippnb_list, val, v); -} - -static int ipp_set_property(struct exynos_drm_ippdrv *ippdrv, - struct drm_exynos_ipp_property *property) -{ - struct exynos_drm_ipp_ops *ops = NULL; - bool swap = false; - int ret, i; - - if (!property) { - DRM_ERROR("invalid property parameter.\n"); - return -EINVAL; - } - - DRM_DEBUG_KMS("prop_id[%d]\n", property->prop_id); - - /* reset h/w block */ - if (ippdrv->reset && - ippdrv->reset(ippdrv->dev)) { - return -EINVAL; - } - - /* set source,destination operations */ - for_each_ipp_ops(i) { - struct drm_exynos_ipp_config *config = - &property->config[i]; - - ops = ippdrv->ops[i]; - if (!ops || !config) { - DRM_ERROR("not support ops and config.\n"); - return -EINVAL; - } - - /* set format */ - if (ops->set_fmt) { - ret = ops->set_fmt(ippdrv->dev, config->fmt); - if (ret) - return ret; - } - - /* set transform for rotation, flip */ - if (ops->set_transf) { - ret = ops->set_transf(ippdrv->dev, config->degree, - config->flip, &swap); - if (ret) - return ret; - } - - /* set size */ - if (ops->set_size) { - ret = ops->set_size(ippdrv->dev, swap, &config->pos, - &config->sz); - if (ret) - return ret; - } - } - - return 0; -} - -static int ipp_start_property(struct exynos_drm_ippdrv *ippdrv, - struct drm_exynos_ipp_cmd_node *c_node) -{ - struct drm_exynos_ipp_mem_node *m_node; - struct drm_exynos_ipp_property *property = &c_node->property; - struct list_head *head; - int ret, i; - - DRM_DEBUG_KMS("prop_id[%d]\n", property->prop_id); - - /* store command info in ippdrv */ - ippdrv->c_node = c_node; - - mutex_lock(&c_node->mem_lock); - if (!ipp_check_mem_list(c_node)) { - DRM_DEBUG_KMS("empty memory.\n"); - ret = -ENOMEM; - goto err_unlock; - } - - /* set current property in ippdrv */ - ret = ipp_set_property(ippdrv, property); - if (ret) { - DRM_ERROR("failed to set property.\n"); - ippdrv->c_node = NULL; - goto err_unlock; - } - - /* check command */ - switch (property->cmd) { - case IPP_CMD_M2M: - for_each_ipp_ops(i) { - /* source/destination memory list */ - head = &c_node->mem_list[i]; - - m_node = list_first_entry(head, - struct drm_exynos_ipp_mem_node, list); - - DRM_DEBUG_KMS("m_node[%pK]\n", m_node); - - ret = ipp_set_mem_node(ippdrv, c_node, m_node); - if (ret) { - DRM_ERROR("failed to set m node.\n"); - goto err_unlock; - } - } - break; - case IPP_CMD_WB: - /* destination memory list */ - head = &c_node->mem_list[EXYNOS_DRM_OPS_DST]; - - list_for_each_entry(m_node, head, list) { - ret = ipp_set_mem_node(ippdrv, c_node, m_node); - 
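The exynos_drm_ippnb_register/unregister/send_event wrappers above are thin shims over a blocking notifier chain, which lets other exynos sub-drivers subscribe to IPP events without a direct dependency on this file. The underlying kernel pattern in isolation (chain head, subscriber and broadcast; names illustrative):

    #include <linux/notifier.h>

    static BLOCKING_NOTIFIER_HEAD(my_chain);

    static int my_cb(struct notifier_block *nb, unsigned long val, void *data)
    {
            /* @val and @data are whatever the broadcaster passed below */
            return NOTIFY_OK;
    }

    static struct notifier_block my_nb = {
            .notifier_call = my_cb,
    };

    static void example(void)
    {
            blocking_notifier_chain_register(&my_chain, &my_nb);
            blocking_notifier_call_chain(&my_chain, 1UL /* event id */, NULL);
            blocking_notifier_chain_unregister(&my_chain, &my_nb);
    }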
if (ret) { - DRM_ERROR("failed to set m node.\n"); - goto err_unlock; - } - } - break; - case IPP_CMD_OUTPUT: - /* source memory list */ - head = &c_node->mem_list[EXYNOS_DRM_OPS_SRC]; - - list_for_each_entry(m_node, head, list) { - ret = ipp_set_mem_node(ippdrv, c_node, m_node); - if (ret) { - DRM_ERROR("failed to set m node.\n"); - goto err_unlock; - } - } - break; - default: - DRM_ERROR("invalid operations.\n"); - ret = -EINVAL; - goto err_unlock; - } - mutex_unlock(&c_node->mem_lock); - - DRM_DEBUG_KMS("cmd[%d]\n", property->cmd); - - /* start operations */ - if (ippdrv->start) { - ret = ippdrv->start(ippdrv->dev, property->cmd); - if (ret) { - DRM_ERROR("failed to start ops.\n"); - ippdrv->c_node = NULL; - return ret; - } - } - - return 0; - -err_unlock: - mutex_unlock(&c_node->mem_lock); - ippdrv->c_node = NULL; - return ret; -} - -static int ipp_stop_property(struct drm_device *drm_dev, - struct exynos_drm_ippdrv *ippdrv, - struct drm_exynos_ipp_cmd_node *c_node) -{ - struct drm_exynos_ipp_property *property = &c_node->property; - int i; - - DRM_DEBUG_KMS("prop_id[%d]\n", property->prop_id); - - /* stop operations */ - if (ippdrv->stop) - ippdrv->stop(ippdrv->dev, property->cmd); - - /* check command */ - switch (property->cmd) { - case IPP_CMD_M2M: - for_each_ipp_ops(i) - ipp_clean_mem_nodes(drm_dev, c_node, i); - break; - case IPP_CMD_WB: - ipp_clean_mem_nodes(drm_dev, c_node, EXYNOS_DRM_OPS_DST); - break; - case IPP_CMD_OUTPUT: - ipp_clean_mem_nodes(drm_dev, c_node, EXYNOS_DRM_OPS_SRC); - break; - default: - DRM_ERROR("invalid operations.\n"); - return -EINVAL; - } - - return 0; -} - -void ipp_sched_cmd(struct work_struct *work) -{ - struct drm_exynos_ipp_cmd_work *cmd_work = - container_of(work, struct drm_exynos_ipp_cmd_work, work); - struct exynos_drm_ippdrv *ippdrv; - struct drm_exynos_ipp_cmd_node *c_node; - struct drm_exynos_ipp_property *property; - int ret; - - ippdrv = cmd_work->ippdrv; - if (!ippdrv) { - DRM_ERROR("invalid ippdrv list.\n"); - return; - } - - c_node = cmd_work->c_node; - if (!c_node) { - DRM_ERROR("invalid command node list.\n"); - return; - } - - mutex_lock(&c_node->lock); - - property = &c_node->property; - - switch (cmd_work->ctrl) { - case IPP_CTRL_PLAY: - case IPP_CTRL_RESUME: - ret = ipp_start_property(ippdrv, c_node); - if (ret) { - DRM_ERROR("failed to start property:prop_id[%d]\n", - c_node->property.prop_id); - goto err_unlock; - } - - /* - * M2M case supports wait_completion of transfer. - * because M2M case supports single unit operation - * with multiple queue. - * M2M need to wait completion of data transfer. 
- */ - if (ipp_is_m2m_cmd(property->cmd)) { - if (!wait_for_completion_timeout - (&c_node->start_complete, msecs_to_jiffies(200))) { - DRM_ERROR("timeout event:prop_id[%d]\n", - c_node->property.prop_id); - goto err_unlock; - } - } - break; - case IPP_CTRL_STOP: - case IPP_CTRL_PAUSE: - ret = ipp_stop_property(ippdrv->drm_dev, ippdrv, - c_node); - if (ret) { - DRM_ERROR("failed to stop property.\n"); - goto err_unlock; - } - - complete(&c_node->stop_complete); - break; - default: - DRM_ERROR("unknown control type\n"); - break; - } - - DRM_DEBUG_KMS("ctrl[%d] done.\n", cmd_work->ctrl); - -err_unlock: - mutex_unlock(&c_node->lock); -} - -static int ipp_send_event(struct exynos_drm_ippdrv *ippdrv, - struct drm_exynos_ipp_cmd_node *c_node, int *buf_id) -{ - struct drm_device *drm_dev = ippdrv->drm_dev; - struct drm_exynos_ipp_property *property = &c_node->property; - struct drm_exynos_ipp_mem_node *m_node; - struct drm_exynos_ipp_queue_buf qbuf; - struct drm_exynos_ipp_send_event *e; - struct list_head *head; - struct timeval now; - u32 tbuf_id[EXYNOS_DRM_OPS_MAX] = {0, }; - int ret, i; - - for_each_ipp_ops(i) - DRM_DEBUG_KMS("%s buf_id[%d]\n", i ? "dst" : "src", buf_id[i]); - - if (!drm_dev) { - DRM_ERROR("failed to get drm_dev.\n"); - return -EINVAL; - } - - if (!property) { - DRM_ERROR("failed to get property.\n"); - return -EINVAL; - } - - mutex_lock(&c_node->event_lock); - if (list_empty(&c_node->event_list)) { - DRM_DEBUG_KMS("event list is empty.\n"); - ret = 0; - goto err_event_unlock; - } - - mutex_lock(&c_node->mem_lock); - if (!ipp_check_mem_list(c_node)) { - DRM_DEBUG_KMS("empty memory.\n"); - ret = 0; - goto err_mem_unlock; - } - - /* check command */ - switch (property->cmd) { - case IPP_CMD_M2M: - for_each_ipp_ops(i) { - /* source/destination memory list */ - head = &c_node->mem_list[i]; - - m_node = list_first_entry(head, - struct drm_exynos_ipp_mem_node, list); - - tbuf_id[i] = m_node->buf_id; - DRM_DEBUG_KMS("%s buf_id[%d]\n", - i ? "dst" : "src", tbuf_id[i]); - - ret = ipp_put_mem_node(drm_dev, c_node, m_node); - if (ret) - DRM_ERROR("failed to put m_node.\n"); - } - break; - case IPP_CMD_WB: - /* clear buf for finding */ - memset(&qbuf, 0x0, sizeof(qbuf)); - qbuf.ops_id = EXYNOS_DRM_OPS_DST; - qbuf.buf_id = buf_id[EXYNOS_DRM_OPS_DST]; - - /* get memory node entry */ - m_node = ipp_find_mem_node(c_node, &qbuf); - if (!m_node) { - DRM_ERROR("empty memory node.\n"); - ret = -ENOMEM; - goto err_mem_unlock; - } - - tbuf_id[EXYNOS_DRM_OPS_DST] = m_node->buf_id; - - ret = ipp_put_mem_node(drm_dev, c_node, m_node); - if (ret) - DRM_ERROR("failed to put m_node.\n"); - break; - case IPP_CMD_OUTPUT: - /* source memory list */ - head = &c_node->mem_list[EXYNOS_DRM_OPS_SRC]; - - m_node = list_first_entry(head, - struct drm_exynos_ipp_mem_node, list); - - tbuf_id[EXYNOS_DRM_OPS_SRC] = m_node->buf_id; - - ret = ipp_put_mem_node(drm_dev, c_node, m_node); - if (ret) - DRM_ERROR("failed to put m_node.\n"); - break; - default: - DRM_ERROR("invalid operations.\n"); - ret = -EINVAL; - goto err_mem_unlock; - } - mutex_unlock(&c_node->mem_lock); - - if (tbuf_id[EXYNOS_DRM_OPS_DST] != buf_id[EXYNOS_DRM_OPS_DST]) - DRM_ERROR("failed to match buf_id[%d %d]prop_id[%d]\n", - tbuf_id[1], buf_id[1], property->prop_id); - - /* - * command node have event list of destination buffer - * If destination buffer enqueue to mem list, - * then we make event and link to event list tail. - * so, we get first event for first enqueued buffer. 
- */ - e = list_first_entry(&c_node->event_list, - struct drm_exynos_ipp_send_event, base.link); - - do_gettimeofday(&now); - DRM_DEBUG_KMS("tv_sec[%ld]tv_usec[%ld]\n", now.tv_sec, now.tv_usec); - e->event.tv_sec = now.tv_sec; - e->event.tv_usec = now.tv_usec; - e->event.prop_id = property->prop_id; - - /* set buffer id about source destination */ - for_each_ipp_ops(i) - e->event.buf_id[i] = tbuf_id[i]; - - drm_send_event(drm_dev, &e->base); - mutex_unlock(&c_node->event_lock); - - DRM_DEBUG_KMS("done cmd[%d]prop_id[%d]buf_id[%d]\n", - property->cmd, property->prop_id, tbuf_id[EXYNOS_DRM_OPS_DST]); - - return 0; - -err_mem_unlock: - mutex_unlock(&c_node->mem_lock); -err_event_unlock: - mutex_unlock(&c_node->event_lock); - return ret; -} - -void ipp_sched_event(struct work_struct *work) -{ - struct drm_exynos_ipp_event_work *event_work = - container_of(work, struct drm_exynos_ipp_event_work, work); - struct exynos_drm_ippdrv *ippdrv; - struct drm_exynos_ipp_cmd_node *c_node; - int ret; - - if (!event_work) { - DRM_ERROR("failed to get event_work.\n"); - return; - } - - DRM_DEBUG_KMS("buf_id[%d]\n", event_work->buf_id[EXYNOS_DRM_OPS_DST]); - - ippdrv = event_work->ippdrv; - if (!ippdrv) { - DRM_ERROR("failed to get ipp driver.\n"); - return; - } - - c_node = ippdrv->c_node; - if (!c_node) { - DRM_ERROR("failed to get command node.\n"); - return; - } - - /* - * IPP supports command thread, event thread synchronization. - * If IPP close immediately from user land, then IPP make - * synchronization with command thread, so make complete event. - * or going out operations. - */ - if (c_node->state != IPP_STATE_START) { - DRM_DEBUG_KMS("bypass state[%d]prop_id[%d]\n", - c_node->state, c_node->property.prop_id); - goto err_completion; - } - - ret = ipp_send_event(ippdrv, c_node, event_work->buf_id); - if (ret) { - DRM_ERROR("failed to send event.\n"); - goto err_completion; - } - -err_completion: - if (ipp_is_m2m_cmd(c_node->property.cmd)) - complete(&c_node->start_complete); -} - -static int ipp_subdrv_probe(struct drm_device *drm_dev, struct device *dev) -{ - struct ipp_context *ctx = get_ipp_context(dev); - struct exynos_drm_ippdrv *ippdrv; - int ret, count = 0; - - /* get ipp driver entry */ - list_for_each_entry(ippdrv, &exynos_drm_ippdrv_list, drv_list) { - ippdrv->drm_dev = drm_dev; - - ret = ipp_create_id(&ctx->ipp_idr, &ctx->ipp_lock, ippdrv); - if (ret < 0) { - DRM_ERROR("failed to create id.\n"); - goto err; - } - ippdrv->prop_list.ipp_id = ret; - - DRM_DEBUG_KMS("count[%d]ippdrv[%pK]ipp_id[%d]\n", - count++, ippdrv, ret); - - /* store parent device for node */ - ippdrv->parent_dev = dev; - - /* store event work queue and handler */ - ippdrv->event_workq = ctx->event_workq; - ippdrv->sched_event = ipp_sched_event; - INIT_LIST_HEAD(&ippdrv->cmd_list); - mutex_init(&ippdrv->cmd_lock); - - ret = drm_iommu_attach_device(drm_dev, ippdrv->dev); - if (ret) { - DRM_ERROR("failed to activate iommu\n"); - goto err; - } - } - - return 0; - -err: - /* get ipp driver entry */ - list_for_each_entry_continue_reverse(ippdrv, &exynos_drm_ippdrv_list, - drv_list) { - drm_iommu_detach_device(drm_dev, ippdrv->dev); - - ipp_remove_id(&ctx->ipp_idr, &ctx->ipp_lock, - ippdrv->prop_list.ipp_id); - } - - return ret; -} - -static void ipp_subdrv_remove(struct drm_device *drm_dev, struct device *dev) -{ - struct exynos_drm_ippdrv *ippdrv, *t; - struct ipp_context *ctx = get_ipp_context(dev); - - /* get ipp driver entry */ - list_for_each_entry_safe(ippdrv, t, &exynos_drm_ippdrv_list, drv_list) { - 
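ipp_subdrv_probe()'s error path above shows the standard unwind idiom for partial initialization of a list: when the walk fails partway, list_for_each_entry_continue_reverse() revisits only the entries that were already set up, in reverse order, skipping the element that failed. Reduced to a self-contained shape (struct and helpers are stand-ins):

    #include <linux/list.h>

    struct item {
            struct list_head link;
            /* ... per-item state ... */
    };

    static int item_init(struct item *it) { return 0; }    /* stand-in */
    static void item_fini(struct item *it) { }              /* stand-in */

    static int init_all(struct list_head *items)
    {
            struct item *it;
            int ret = 0;

            list_for_each_entry(it, items, link) {
                    ret = item_init(it);
                    if (ret)
                            goto unwind;
            }
            return 0;

    unwind:
            /* walks back over only the entries that did succeed */
            list_for_each_entry_continue_reverse(it, items, link)
                    item_fini(it);
            return ret;
    }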
drm_iommu_detach_device(drm_dev, ippdrv->dev); - - ipp_remove_id(&ctx->ipp_idr, &ctx->ipp_lock, - ippdrv->prop_list.ipp_id); - - ippdrv->drm_dev = NULL; - exynos_drm_ippdrv_unregister(ippdrv); - } -} - -static int ipp_subdrv_open(struct drm_device *drm_dev, struct device *dev, - struct drm_file *file) -{ - struct drm_exynos_file_private *file_priv = file->driver_priv; - - file_priv->ipp_dev = dev; - - DRM_DEBUG_KMS("done priv[%pK]\n", dev); - - return 0; -} - -static void ipp_subdrv_close(struct drm_device *drm_dev, struct device *dev, - struct drm_file *file) -{ - struct exynos_drm_ippdrv *ippdrv = NULL; - struct ipp_context *ctx = get_ipp_context(dev); - struct drm_exynos_ipp_cmd_node *c_node, *tc_node; - int count = 0; - - list_for_each_entry(ippdrv, &exynos_drm_ippdrv_list, drv_list) { - mutex_lock(&ippdrv->cmd_lock); - list_for_each_entry_safe(c_node, tc_node, - &ippdrv->cmd_list, list) { - DRM_DEBUG_KMS("count[%d]ippdrv[%pK]\n", - count++, ippdrv); - - if (c_node->filp == file) { - /* - * userland goto unnormal state. process killed. - * and close the file. - * so, IPP didn't called stop cmd ctrl. - * so, we are make stop operation in this state. - */ - if (c_node->state == IPP_STATE_START) { - ipp_stop_property(drm_dev, ippdrv, - c_node); - c_node->state = IPP_STATE_STOP; - } - - ippdrv->dedicated = false; - ipp_clean_cmd_node(ctx, c_node); - if (list_empty(&ippdrv->cmd_list)) - pm_runtime_put_sync(ippdrv->dev); - } - } - mutex_unlock(&ippdrv->cmd_lock); - } - - return; -} - -static int ipp_probe(struct platform_device *pdev) -{ - struct device *dev = &pdev->dev; - struct ipp_context *ctx; - struct exynos_drm_subdrv *subdrv; - int ret; - - ctx = devm_kzalloc(dev, sizeof(*ctx), GFP_KERNEL); - if (!ctx) - return -ENOMEM; - - mutex_init(&ctx->ipp_lock); - mutex_init(&ctx->prop_lock); - - idr_init(&ctx->ipp_idr); - idr_init(&ctx->prop_idr); - - /* - * create single thread for ipp event - * IPP supports event thread for IPP drivers. - * IPP driver send event_work to this thread. - * and IPP event thread send event to user process. - */ - ctx->event_workq = create_singlethread_workqueue("ipp_event"); - if (!ctx->event_workq) { - dev_err(dev, "failed to create event workqueue\n"); - return -EINVAL; - } - - /* - * create single thread for ipp command - * IPP supports command thread for user process. - * user process make command node using set property ioctl. - * and make start_work and send this work to command thread. - * and then this command thread start property. 
- */ - ctx->cmd_workq = create_singlethread_workqueue("ipp_cmd"); - if (!ctx->cmd_workq) { - dev_err(dev, "failed to create cmd workqueue\n"); - ret = -EINVAL; - goto err_event_workq; - } - - /* set sub driver informations */ - subdrv = &ctx->subdrv; - subdrv->dev = dev; - subdrv->probe = ipp_subdrv_probe; - subdrv->remove = ipp_subdrv_remove; - subdrv->open = ipp_subdrv_open; - subdrv->close = ipp_subdrv_close; - - platform_set_drvdata(pdev, ctx); - - ret = exynos_drm_subdrv_register(subdrv); - if (ret < 0) { - DRM_ERROR("failed to register drm ipp device.\n"); - goto err_cmd_workq; - } - - dev_info(dev, "drm ipp registered successfully.\n"); - - return 0; - -err_cmd_workq: - destroy_workqueue(ctx->cmd_workq); -err_event_workq: - destroy_workqueue(ctx->event_workq); - return ret; -} - -static int ipp_remove(struct platform_device *pdev) -{ - struct ipp_context *ctx = platform_get_drvdata(pdev); - - /* unregister sub driver */ - exynos_drm_subdrv_unregister(&ctx->subdrv); - - /* remove,destroy ipp idr */ - idr_destroy(&ctx->ipp_idr); - idr_destroy(&ctx->prop_idr); - - mutex_destroy(&ctx->ipp_lock); - mutex_destroy(&ctx->prop_lock); - - /* destroy command, event work queue */ - destroy_workqueue(ctx->cmd_workq); - destroy_workqueue(ctx->event_workq); - - return 0; -} - -struct platform_driver ipp_driver = { - .probe = ipp_probe, - .remove = ipp_remove, - .driver = { - .name = "exynos-drm-ipp", - .owner = THIS_MODULE, - }, -}; - diff --git a/drivers/gpu/drm/exynos/exynos_drm_ipp.h b/drivers/gpu/drm/exynos/exynos_drm_ipp.h deleted file mode 100644 index 2a61547a39d0..000000000000 --- a/drivers/gpu/drm/exynos/exynos_drm_ipp.h +++ /dev/null @@ -1,252 +0,0 @@ -/* - * Copyright (c) 2012 Samsung Electronics Co., Ltd. - * - * Authors: - * Eunchul Kim <[email protected]> - * Jinyoung Jeon <[email protected]> - * Sangmin Lee <[email protected]> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - */ - -#ifndef _EXYNOS_DRM_IPP_H_ -#define _EXYNOS_DRM_IPP_H_ - -#define for_each_ipp_ops(pos) \ - for (pos = 0; pos < EXYNOS_DRM_OPS_MAX; pos++) -#define for_each_ipp_planar(pos) \ - for (pos = 0; pos < EXYNOS_DRM_PLANAR_MAX; pos++) - -#define IPP_GET_LCD_WIDTH _IOR('F', 302, int) -#define IPP_GET_LCD_HEIGHT _IOR('F', 303, int) -#define IPP_SET_WRITEBACK _IOW('F', 304, u32) - -/* definition of state */ -enum drm_exynos_ipp_state { - IPP_STATE_IDLE, - IPP_STATE_START, - IPP_STATE_STOP, -}; - -/* - * A structure of command work information. - * @work: work structure. - * @ippdrv: current work ippdrv. - * @c_node: command node information. - * @ctrl: command control. - */ -struct drm_exynos_ipp_cmd_work { - struct work_struct work; - struct exynos_drm_ippdrv *ippdrv; - struct drm_exynos_ipp_cmd_node *c_node; - enum drm_exynos_ipp_ctrl ctrl; -}; - -/* - * A structure of command node. - * - * @list: list head to command queue information. - * @event_list: list head of event. - * @mem_list: list head to source,destination memory queue information. - * @lock: lock for synchronization of access to ioctl. - * @mem_lock: lock for synchronization of access to memory nodes. - * @event_lock: lock for synchronization of access to scheduled event. - * @start_complete: completion of start of command. - * @stop_complete: completion of stop of command. - * @property: property information. 
- * @start_work: start command work structure. - * @stop_work: stop command work structure. - * @event_work: event work structure. - * @state: state of command node. - * @filp: associated file pointer. - */ -struct drm_exynos_ipp_cmd_node { - struct list_head list; - struct list_head event_list; - struct list_head mem_list[EXYNOS_DRM_OPS_MAX]; - struct mutex lock; - struct mutex mem_lock; - struct mutex event_lock; - struct completion start_complete; - struct completion stop_complete; - struct drm_exynos_ipp_property property; - struct drm_exynos_ipp_cmd_work *start_work; - struct drm_exynos_ipp_cmd_work *stop_work; - struct drm_exynos_ipp_event_work *event_work; - enum drm_exynos_ipp_state state; - struct drm_file *filp; -}; - -/* - * A structure of buffer information. - * - * @handles: Y, Cb, Cr each gem object handle. - * @base: Y, Cb, Cr each planar address. - */ -struct drm_exynos_ipp_buf_info { - unsigned long handles[EXYNOS_DRM_PLANAR_MAX]; - dma_addr_t base[EXYNOS_DRM_PLANAR_MAX]; -}; - -/* - * A structure of wb setting information. - * - * @enable: enable flag for wb. - * @refresh: HZ of the refresh rate. - */ -struct drm_exynos_ipp_set_wb { - __u32 enable; - __u32 refresh; -}; - -/* - * A structure of event work information. - * - * @work: work structure. - * @ippdrv: current work ippdrv. - * @buf_id: id of src, dst buffer. - */ -struct drm_exynos_ipp_event_work { - struct work_struct work; - struct exynos_drm_ippdrv *ippdrv; - u32 buf_id[EXYNOS_DRM_OPS_MAX]; -}; - -/* - * A structure of source,destination operations. - * - * @set_fmt: set format of image. - * @set_transf: set transform(rotations, flip). - * @set_size: set size of region. - * @set_addr: set address for dma. - */ -struct exynos_drm_ipp_ops { - int (*set_fmt)(struct device *dev, u32 fmt); - int (*set_transf)(struct device *dev, - enum drm_exynos_degree degree, - enum drm_exynos_flip flip, bool *swap); - int (*set_size)(struct device *dev, int swap, - struct drm_exynos_pos *pos, struct drm_exynos_sz *sz); - int (*set_addr)(struct device *dev, - struct drm_exynos_ipp_buf_info *buf_info, u32 buf_id, - enum drm_exynos_ipp_buf_type buf_type); -}; - -/* - * A structure of ipp driver. - * - * @drv_list: list head for registed sub driver information. - * @parent_dev: parent device information. - * @dev: platform device. - * @drm_dev: drm device. - * @dedicated: dedicated ipp device. - * @ops: source, destination operations. - * @event_workq: event work queue. - * @c_node: current command information. - * @cmd_list: list head for command information. - * @cmd_lock: lock for synchronization of access to cmd_list. - * @prop_list: property informations of current ipp driver. - * @check_property: check property about format, size, buffer. - * @reset: reset ipp block. - * @start: ipp each device start. - * @stop: ipp each device stop. - * @sched_event: work schedule handler. 
- */ -struct exynos_drm_ippdrv { - struct list_head drv_list; - struct device *parent_dev; - struct device *dev; - struct drm_device *drm_dev; - bool dedicated; - struct exynos_drm_ipp_ops *ops[EXYNOS_DRM_OPS_MAX]; - struct workqueue_struct *event_workq; - struct drm_exynos_ipp_cmd_node *c_node; - struct list_head cmd_list; - struct mutex cmd_lock; - struct drm_exynos_ipp_prop_list prop_list; - - int (*check_property)(struct device *dev, - struct drm_exynos_ipp_property *property); - int (*reset)(struct device *dev); - int (*start)(struct device *dev, enum drm_exynos_ipp_cmd cmd); - void (*stop)(struct device *dev, enum drm_exynos_ipp_cmd cmd); - void (*sched_event)(struct work_struct *work); -}; - -#ifdef CONFIG_DRM_EXYNOS_IPP -extern int exynos_drm_ippdrv_register(struct exynos_drm_ippdrv *ippdrv); -extern int exynos_drm_ippdrv_unregister(struct exynos_drm_ippdrv *ippdrv); -extern int exynos_drm_ipp_get_property(struct drm_device *drm_dev, void *data, - struct drm_file *file); -extern int exynos_drm_ipp_set_property(struct drm_device *drm_dev, void *data, - struct drm_file *file); -extern int exynos_drm_ipp_queue_buf(struct drm_device *drm_dev, void *data, - struct drm_file *file); -extern int exynos_drm_ipp_cmd_ctrl(struct drm_device *drm_dev, void *data, - struct drm_file *file); -extern int exynos_drm_ippnb_register(struct notifier_block *nb); -extern int exynos_drm_ippnb_unregister(struct notifier_block *nb); -extern int exynos_drm_ippnb_send_event(unsigned long val, void *v); -extern void ipp_sched_cmd(struct work_struct *work); -extern void ipp_sched_event(struct work_struct *work); - -#else -static inline int exynos_drm_ippdrv_register(struct exynos_drm_ippdrv *ippdrv) -{ - return -ENODEV; -} - -static inline int exynos_drm_ippdrv_unregister(struct exynos_drm_ippdrv *ippdrv) -{ - return -ENODEV; -} - -static inline int exynos_drm_ipp_get_property(struct drm_device *drm_dev, - void *data, - struct drm_file *file_priv) -{ - return -ENOTTY; -} - -static inline int exynos_drm_ipp_set_property(struct drm_device *drm_dev, - void *data, - struct drm_file *file_priv) -{ - return -ENOTTY; -} - -static inline int exynos_drm_ipp_queue_buf(struct drm_device *drm_dev, - void *data, - struct drm_file *file) -{ - return -ENOTTY; -} - -static inline int exynos_drm_ipp_cmd_ctrl(struct drm_device *drm_dev, - void *data, - struct drm_file *file) -{ - return -ENOTTY; -} - -static inline int exynos_drm_ippnb_register(struct notifier_block *nb) -{ - return -ENODEV; -} - -static inline int exynos_drm_ippnb_unregister(struct notifier_block *nb) -{ - return -ENODEV; -} - -static inline int exynos_drm_ippnb_send_event(unsigned long val, void *v) -{ - return -ENOTTY; -} -#endif - -#endif /* _EXYNOS_DRM_IPP_H_ */ - diff --git a/include/video/exynos5433_decon.h b/drivers/gpu/drm/exynos/regs-decon5433.h index 78957c9626f5..19ad9e47945e 100644 --- a/include/video/exynos5433_decon.h +++ b/drivers/gpu/drm/exynos/regs-decon5433.h @@ -6,8 +6,8 @@ * published by the Free Software Foundationr */ -#ifndef EXYNOS_REGS_DECON_H -#define EXYNOS_REGS_DECON_H +#ifndef EXYNOS_REGS_DECON5433_H +#define EXYNOS_REGS_DECON5433_H /* Exynos543X DECON */ #define DECON_VIDCON0 0x0000 @@ -206,4 +206,4 @@ #define CRCCTRL_CRCEN (0x1 << 0) #define CRCCTRL_MASK (0x7) -#endif /* EXYNOS_REGS_DECON_H */ +#endif /* EXYNOS_REGS_DECON5433_H */ diff --git a/include/video/exynos7_decon.h b/drivers/gpu/drm/exynos/regs-decon7.h index a62b11b613f6..5df7765d2397 100644 --- a/include/video/exynos7_decon.h +++ b/drivers/gpu/drm/exynos/regs-decon7.h @@ 
-1,5 +1,4 @@ -/* include/video/exynos7_decon.h - * +/* * Copyright (c) 2014 Samsung Electronics Co., Ltd. * Author: Ajay Kumar <[email protected]> * @@ -9,6 +8,9 @@ * option) any later version. */ +#ifndef EXYNOS_REGS_DECON7_H +#define EXYNOS_REGS_DECON7_H + /* VIDCON0 */ #define VIDCON0 0x00 @@ -347,3 +349,5 @@ #define DECON_UPDATE_SLAVE_SYNC (1 << 4) #define DECON_UPDATE_STANDALONE_F (1 << 0) + +#endif /* EXYNOS_REGS_DECON7_H */ diff --git a/drivers/gpu/drm/i915/Kconfig.debug b/drivers/gpu/drm/i915/Kconfig.debug index fa36491495b1..108d21f34777 100644 --- a/drivers/gpu/drm/i915/Kconfig.debug +++ b/drivers/gpu/drm/i915/Kconfig.debug @@ -29,7 +29,6 @@ config DRM_I915_DEBUG select SW_SYNC # signaling validation framework (igt/syncobj*) select DRM_I915_SW_FENCE_DEBUG_OBJECTS select DRM_I915_SELFTEST - select DRM_I915_TRACE_GEM default n help Choose this option to turn on extra driver debugging that may affect @@ -53,6 +52,7 @@ config DRM_I915_DEBUG_GEM config DRM_I915_TRACE_GEM bool "Insert extra ftrace output from the GEM internals" + depends on DRM_I915_DEBUG_GEM select TRACING default n help diff --git a/drivers/gpu/drm/i915/gvt/cmd_parser.c b/drivers/gpu/drm/i915/gvt/cmd_parser.c index 18c45734c7a2..edec15d19538 100644 --- a/drivers/gpu/drm/i915/gvt/cmd_parser.c +++ b/drivers/gpu/drm/i915/gvt/cmd_parser.c @@ -825,6 +825,21 @@ static int force_nonpriv_reg_handler(struct parser_exec_state *s, return 0; } +static inline bool is_mocs_mmio(unsigned int offset) +{ + return ((offset >= 0xc800) && (offset <= 0xcff8)) || + ((offset >= 0xb020) && (offset <= 0xb0a0)); +} + +static int mocs_cmd_reg_handler(struct parser_exec_state *s, + unsigned int offset, unsigned int index) +{ + if (!is_mocs_mmio(offset)) + return -EINVAL; + vgpu_vreg(s->vgpu, offset) = cmd_val(s, index + 1); + return 0; +} + static int cmd_reg_handler(struct parser_exec_state *s, unsigned int offset, unsigned int index, char *cmd) { @@ -848,6 +863,10 @@ static int cmd_reg_handler(struct parser_exec_state *s, return 0; } + if (is_mocs_mmio(offset) && + mocs_cmd_reg_handler(s, offset, index)) + return -EINVAL; + if (is_force_nonpriv_mmio(offset) && force_nonpriv_reg_handler(s, offset, index)) return -EPERM; @@ -1220,13 +1239,13 @@ static int gen8_check_mi_display_flip(struct parser_exec_state *s, return 0; if (IS_SKYLAKE(dev_priv) || IS_KABYLAKE(dev_priv)) { - stride = vgpu_vreg(s->vgpu, info->stride_reg) & GENMASK(9, 0); - tile = (vgpu_vreg(s->vgpu, info->ctrl_reg) & + stride = vgpu_vreg_t(s->vgpu, info->stride_reg) & GENMASK(9, 0); + tile = (vgpu_vreg_t(s->vgpu, info->ctrl_reg) & GENMASK(12, 10)) >> 10; } else { - stride = (vgpu_vreg(s->vgpu, info->stride_reg) & + stride = (vgpu_vreg_t(s->vgpu, info->stride_reg) & GENMASK(15, 6)) >> 6; - tile = (vgpu_vreg(s->vgpu, info->ctrl_reg) & (1 << 10)) >> 10; + tile = (vgpu_vreg_t(s->vgpu, info->ctrl_reg) & (1 << 10)) >> 10; } if (stride != info->stride_val) @@ -1245,21 +1264,21 @@ static int gen8_update_plane_mmio_from_mi_display_flip( struct drm_i915_private *dev_priv = s->vgpu->gvt->dev_priv; struct intel_vgpu *vgpu = s->vgpu; - set_mask_bits(&vgpu_vreg(vgpu, info->surf_reg), GENMASK(31, 12), + set_mask_bits(&vgpu_vreg_t(vgpu, info->surf_reg), GENMASK(31, 12), info->surf_val << 12); if (IS_SKYLAKE(dev_priv) || IS_KABYLAKE(dev_priv)) { - set_mask_bits(&vgpu_vreg(vgpu, info->stride_reg), GENMASK(9, 0), + set_mask_bits(&vgpu_vreg_t(vgpu, info->stride_reg), GENMASK(9, 0), info->stride_val); - set_mask_bits(&vgpu_vreg(vgpu, info->ctrl_reg), GENMASK(12, 10), + 
set_mask_bits(&vgpu_vreg_t(vgpu, info->ctrl_reg), GENMASK(12, 10), info->tile_val << 10); } else { - set_mask_bits(&vgpu_vreg(vgpu, info->stride_reg), GENMASK(15, 6), + set_mask_bits(&vgpu_vreg_t(vgpu, info->stride_reg), GENMASK(15, 6), info->stride_val << 6); - set_mask_bits(&vgpu_vreg(vgpu, info->ctrl_reg), GENMASK(10, 10), + set_mask_bits(&vgpu_vreg_t(vgpu, info->ctrl_reg), GENMASK(10, 10), info->tile_val << 10); } - vgpu_vreg(vgpu, PIPE_FRMCOUNT_G4X(info->pipe))++; + vgpu_vreg_t(vgpu, PIPE_FRMCOUNT_G4X(info->pipe))++; intel_vgpu_trigger_virtual_event(vgpu, info->event); return 0; } diff --git a/drivers/gpu/drm/i915/gvt/display.c b/drivers/gpu/drm/i915/gvt/display.c index 09185036bac8..dd96ffc878ac 100644 --- a/drivers/gpu/drm/i915/gvt/display.c +++ b/drivers/gpu/drm/i915/gvt/display.c @@ -59,7 +59,7 @@ static int edp_pipe_is_enabled(struct intel_vgpu *vgpu) { struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv; - if (!(vgpu_vreg(vgpu, PIPECONF(_PIPE_EDP)) & PIPECONF_ENABLE)) + if (!(vgpu_vreg_t(vgpu, PIPECONF(_PIPE_EDP)) & PIPECONF_ENABLE)) return 0; if (!(vgpu_vreg(vgpu, _TRANS_DDI_FUNC_CTL_EDP) & TRANS_DDI_FUNC_ENABLE)) @@ -74,7 +74,7 @@ int pipe_is_enabled(struct intel_vgpu *vgpu, int pipe) if (WARN_ON(pipe < PIPE_A || pipe >= I915_MAX_PIPES)) return -EINVAL; - if (vgpu_vreg(vgpu, PIPECONF(pipe)) & PIPECONF_ENABLE) + if (vgpu_vreg_t(vgpu, PIPECONF(pipe)) & PIPECONF_ENABLE) return 1; if (edp_pipe_is_enabled(vgpu) && @@ -169,103 +169,105 @@ static u8 dpcd_fix_data[DPCD_HEADER_SIZE] = { static void emulate_monitor_status_change(struct intel_vgpu *vgpu) { struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv; - vgpu_vreg(vgpu, SDEISR) &= ~(SDE_PORTB_HOTPLUG_CPT | + vgpu_vreg_t(vgpu, SDEISR) &= ~(SDE_PORTB_HOTPLUG_CPT | SDE_PORTC_HOTPLUG_CPT | SDE_PORTD_HOTPLUG_CPT); if (IS_SKYLAKE(dev_priv) || IS_KABYLAKE(dev_priv)) { - vgpu_vreg(vgpu, SDEISR) &= ~(SDE_PORTA_HOTPLUG_SPT | + vgpu_vreg_t(vgpu, SDEISR) &= ~(SDE_PORTA_HOTPLUG_SPT | SDE_PORTE_HOTPLUG_SPT); - vgpu_vreg(vgpu, SKL_FUSE_STATUS) |= + vgpu_vreg_t(vgpu, SKL_FUSE_STATUS) |= SKL_FUSE_DOWNLOAD_STATUS | SKL_FUSE_PG_DIST_STATUS(SKL_PG0) | SKL_FUSE_PG_DIST_STATUS(SKL_PG1) | SKL_FUSE_PG_DIST_STATUS(SKL_PG2); - vgpu_vreg(vgpu, LCPLL1_CTL) |= + vgpu_vreg_t(vgpu, LCPLL1_CTL) |= LCPLL_PLL_ENABLE | LCPLL_PLL_LOCK; - vgpu_vreg(vgpu, LCPLL2_CTL) |= LCPLL_PLL_ENABLE; + vgpu_vreg_t(vgpu, LCPLL2_CTL) |= LCPLL_PLL_ENABLE; } if (intel_vgpu_has_monitor_on_port(vgpu, PORT_B)) { - vgpu_vreg(vgpu, SFUSE_STRAP) |= SFUSE_STRAP_DDIB_DETECTED; - vgpu_vreg(vgpu, TRANS_DDI_FUNC_CTL(TRANSCODER_A)) &= + vgpu_vreg_t(vgpu, SFUSE_STRAP) |= SFUSE_STRAP_DDIB_DETECTED; + vgpu_vreg_t(vgpu, TRANS_DDI_FUNC_CTL(TRANSCODER_A)) &= ~(TRANS_DDI_BPC_MASK | TRANS_DDI_MODE_SELECT_MASK | TRANS_DDI_PORT_MASK); - vgpu_vreg(vgpu, TRANS_DDI_FUNC_CTL(TRANSCODER_A)) |= + vgpu_vreg_t(vgpu, TRANS_DDI_FUNC_CTL(TRANSCODER_A)) |= (TRANS_DDI_BPC_8 | TRANS_DDI_MODE_SELECT_DP_SST | (PORT_B << TRANS_DDI_PORT_SHIFT) | TRANS_DDI_FUNC_ENABLE); if (IS_BROADWELL(dev_priv)) { - vgpu_vreg(vgpu, PORT_CLK_SEL(PORT_B)) &= + vgpu_vreg_t(vgpu, PORT_CLK_SEL(PORT_B)) &= ~PORT_CLK_SEL_MASK; - vgpu_vreg(vgpu, PORT_CLK_SEL(PORT_B)) |= + vgpu_vreg_t(vgpu, PORT_CLK_SEL(PORT_B)) |= PORT_CLK_SEL_LCPLL_810; } - vgpu_vreg(vgpu, DDI_BUF_CTL(PORT_B)) |= DDI_BUF_CTL_ENABLE; - vgpu_vreg(vgpu, DDI_BUF_CTL(PORT_B)) &= ~DDI_BUF_IS_IDLE; - vgpu_vreg(vgpu, SDEISR) |= SDE_PORTB_HOTPLUG_CPT; + vgpu_vreg_t(vgpu, DDI_BUF_CTL(PORT_B)) |= DDI_BUF_CTL_ENABLE; + vgpu_vreg_t(vgpu, DDI_BUF_CTL(PORT_B)) &= ~DDI_BUF_IS_IDLE; + 
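The mechanical vgpu_vreg() to vgpu_vreg_t() conversion running through these GVT hunks tracks i915's typed-register scheme: register macros such as SDEISR or PIPECONF(pipe) evaluate to an i915_reg_t, a one-member struct the compiler refuses to mix up with a plain u32 offset. The typed accessor unwraps the offset before indexing the virtual register file; roughly (a sketch of the distinction, not the exact GVT macro bodies):

    typedef struct {
            u32 reg;                        /* raw MMIO offset */
    } i915_reg_t;

    static inline u32 i915_mmio_reg_offset(i915_reg_t r)
    {
            return r.reg;
    }

    /* raw-offset accessor, for call sites that really have an integer */
    #define vgpu_vreg(vgpu, offset) \
            (*(u32 *)((vgpu)->mmio.vreg + (offset)))

    /* typed accessor: takes an i915_reg_t, so passing a bare integer
     * where a register macro is expected fails to compile */
    #define vgpu_vreg_t(vgpu, reg) \
            vgpu_vreg(vgpu, i915_mmio_reg_offset(reg))

Call sites that index with a runtime-computed offset, such as the memcpy from &vgpu_vreg(vgpu, offset) in gmbus3_mmio_read() below, deliberately keep the untyped form.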
vgpu_vreg_t(vgpu, SDEISR) |= SDE_PORTB_HOTPLUG_CPT; } if (intel_vgpu_has_monitor_on_port(vgpu, PORT_C)) { - vgpu_vreg(vgpu, SDEISR) |= SDE_PORTC_HOTPLUG_CPT; - vgpu_vreg(vgpu, TRANS_DDI_FUNC_CTL(TRANSCODER_A)) &= + vgpu_vreg_t(vgpu, SDEISR) |= SDE_PORTC_HOTPLUG_CPT; + vgpu_vreg_t(vgpu, TRANS_DDI_FUNC_CTL(TRANSCODER_A)) &= ~(TRANS_DDI_BPC_MASK | TRANS_DDI_MODE_SELECT_MASK | TRANS_DDI_PORT_MASK); - vgpu_vreg(vgpu, TRANS_DDI_FUNC_CTL(TRANSCODER_A)) |= + vgpu_vreg_t(vgpu, TRANS_DDI_FUNC_CTL(TRANSCODER_A)) |= (TRANS_DDI_BPC_8 | TRANS_DDI_MODE_SELECT_DP_SST | (PORT_C << TRANS_DDI_PORT_SHIFT) | TRANS_DDI_FUNC_ENABLE); if (IS_BROADWELL(dev_priv)) { - vgpu_vreg(vgpu, PORT_CLK_SEL(PORT_C)) &= + vgpu_vreg_t(vgpu, PORT_CLK_SEL(PORT_C)) &= ~PORT_CLK_SEL_MASK; - vgpu_vreg(vgpu, PORT_CLK_SEL(PORT_C)) |= + vgpu_vreg_t(vgpu, PORT_CLK_SEL(PORT_C)) |= PORT_CLK_SEL_LCPLL_810; } - vgpu_vreg(vgpu, DDI_BUF_CTL(PORT_C)) |= DDI_BUF_CTL_ENABLE; - vgpu_vreg(vgpu, DDI_BUF_CTL(PORT_C)) &= ~DDI_BUF_IS_IDLE; - vgpu_vreg(vgpu, SFUSE_STRAP) |= SFUSE_STRAP_DDIC_DETECTED; + vgpu_vreg_t(vgpu, DDI_BUF_CTL(PORT_C)) |= DDI_BUF_CTL_ENABLE; + vgpu_vreg_t(vgpu, DDI_BUF_CTL(PORT_C)) &= ~DDI_BUF_IS_IDLE; + vgpu_vreg_t(vgpu, SFUSE_STRAP) |= SFUSE_STRAP_DDIC_DETECTED; } if (intel_vgpu_has_monitor_on_port(vgpu, PORT_D)) { - vgpu_vreg(vgpu, SDEISR) |= SDE_PORTD_HOTPLUG_CPT; - vgpu_vreg(vgpu, TRANS_DDI_FUNC_CTL(TRANSCODER_A)) &= + vgpu_vreg_t(vgpu, SDEISR) |= SDE_PORTD_HOTPLUG_CPT; + vgpu_vreg_t(vgpu, TRANS_DDI_FUNC_CTL(TRANSCODER_A)) &= ~(TRANS_DDI_BPC_MASK | TRANS_DDI_MODE_SELECT_MASK | TRANS_DDI_PORT_MASK); - vgpu_vreg(vgpu, TRANS_DDI_FUNC_CTL(TRANSCODER_A)) |= + vgpu_vreg_t(vgpu, TRANS_DDI_FUNC_CTL(TRANSCODER_A)) |= (TRANS_DDI_BPC_8 | TRANS_DDI_MODE_SELECT_DP_SST | (PORT_D << TRANS_DDI_PORT_SHIFT) | TRANS_DDI_FUNC_ENABLE); if (IS_BROADWELL(dev_priv)) { - vgpu_vreg(vgpu, PORT_CLK_SEL(PORT_D)) &= + vgpu_vreg_t(vgpu, PORT_CLK_SEL(PORT_D)) &= ~PORT_CLK_SEL_MASK; - vgpu_vreg(vgpu, PORT_CLK_SEL(PORT_D)) |= + vgpu_vreg_t(vgpu, PORT_CLK_SEL(PORT_D)) |= PORT_CLK_SEL_LCPLL_810; } - vgpu_vreg(vgpu, DDI_BUF_CTL(PORT_D)) |= DDI_BUF_CTL_ENABLE; - vgpu_vreg(vgpu, DDI_BUF_CTL(PORT_D)) &= ~DDI_BUF_IS_IDLE; - vgpu_vreg(vgpu, SFUSE_STRAP) |= SFUSE_STRAP_DDID_DETECTED; + vgpu_vreg_t(vgpu, DDI_BUF_CTL(PORT_D)) |= DDI_BUF_CTL_ENABLE; + vgpu_vreg_t(vgpu, DDI_BUF_CTL(PORT_D)) &= ~DDI_BUF_IS_IDLE; + vgpu_vreg_t(vgpu, SFUSE_STRAP) |= SFUSE_STRAP_DDID_DETECTED; } if ((IS_SKYLAKE(dev_priv) || IS_KABYLAKE(dev_priv)) && intel_vgpu_has_monitor_on_port(vgpu, PORT_E)) { - vgpu_vreg(vgpu, SDEISR) |= SDE_PORTE_HOTPLUG_SPT; + vgpu_vreg_t(vgpu, SDEISR) |= SDE_PORTE_HOTPLUG_SPT; } if (intel_vgpu_has_monitor_on_port(vgpu, PORT_A)) { if (IS_BROADWELL(dev_priv)) - vgpu_vreg(vgpu, GEN8_DE_PORT_ISR) |= + vgpu_vreg_t(vgpu, GEN8_DE_PORT_ISR) |= GEN8_PORT_DP_A_HOTPLUG; else - vgpu_vreg(vgpu, SDEISR) |= SDE_PORTA_HOTPLUG_SPT; + vgpu_vreg_t(vgpu, SDEISR) |= SDE_PORTA_HOTPLUG_SPT; - vgpu_vreg(vgpu, DDI_BUF_CTL(PORT_A)) |= DDI_INIT_DISPLAY_DETECTED; + vgpu_vreg_t(vgpu, DDI_BUF_CTL(PORT_A)) |= DDI_INIT_DISPLAY_DETECTED; } /* Clear host CRT status, so guest couldn't detect this host CRT. 
*/ if (IS_BROADWELL(dev_priv)) - vgpu_vreg(vgpu, PCH_ADPA) &= ~ADPA_CRT_HOTPLUG_MONITOR_MASK; + vgpu_vreg_t(vgpu, PCH_ADPA) &= ~ADPA_CRT_HOTPLUG_MONITOR_MASK; + + vgpu_vreg_t(vgpu, PIPECONF(PIPE_A)) |= PIPECONF_ENABLE; } static void clean_virtual_dp_monitor(struct intel_vgpu *vgpu, int port_num) @@ -282,7 +284,6 @@ static void clean_virtual_dp_monitor(struct intel_vgpu *vgpu, int port_num) static int setup_virtual_dp_monitor(struct intel_vgpu *vgpu, int port_num, int type, unsigned int resolution) { - struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv; struct intel_vgpu_port *port = intel_vgpu_port(vgpu, port_num); if (WARN_ON(resolution >= GVT_EDID_NUM)) @@ -308,7 +309,7 @@ static int setup_virtual_dp_monitor(struct intel_vgpu *vgpu, int port_num, port->type = type; emulate_monitor_status_change(vgpu); - vgpu_vreg(vgpu, PIPECONF(PIPE_A)) |= PIPECONF_ENABLE; + return 0; } @@ -368,12 +369,12 @@ static void emulate_vblank_on_pipe(struct intel_vgpu *vgpu, int pipe) if (!pipe_is_enabled(vgpu, pipe)) continue; - vgpu_vreg(vgpu, PIPE_FLIPCOUNT_G4X(pipe))++; + vgpu_vreg_t(vgpu, PIPE_FLIPCOUNT_G4X(pipe))++; intel_vgpu_trigger_virtual_event(vgpu, event); } if (pipe_is_enabled(vgpu, pipe)) { - vgpu_vreg(vgpu, PIPE_FRMCOUNT_G4X(pipe))++; + vgpu_vreg_t(vgpu, PIPE_FRMCOUNT_G4X(pipe))++; intel_vgpu_trigger_virtual_event(vgpu, vblank_event[pipe]); } } diff --git a/drivers/gpu/drm/i915/gvt/edid.c b/drivers/gpu/drm/i915/gvt/edid.c index 42cd09ec63fa..f61337632969 100644 --- a/drivers/gpu/drm/i915/gvt/edid.c +++ b/drivers/gpu/drm/i915/gvt/edid.c @@ -95,9 +95,9 @@ static inline int get_port_from_gmbus0(u32 gmbus0) static void reset_gmbus_controller(struct intel_vgpu *vgpu) { - vgpu_vreg(vgpu, PCH_GMBUS2) = GMBUS_HW_RDY; + vgpu_vreg_t(vgpu, PCH_GMBUS2) = GMBUS_HW_RDY; if (!vgpu->display.i2c_edid.edid_available) - vgpu_vreg(vgpu, PCH_GMBUS2) |= GMBUS_SATOER; + vgpu_vreg_t(vgpu, PCH_GMBUS2) |= GMBUS_SATOER; vgpu->display.i2c_edid.gmbus.phase = GMBUS_IDLE_PHASE; } @@ -123,16 +123,16 @@ static int gmbus0_mmio_write(struct intel_vgpu *vgpu, vgpu->display.i2c_edid.state = I2C_GMBUS; vgpu->display.i2c_edid.gmbus.phase = GMBUS_IDLE_PHASE; - vgpu_vreg(vgpu, PCH_GMBUS2) &= ~GMBUS_ACTIVE; - vgpu_vreg(vgpu, PCH_GMBUS2) |= GMBUS_HW_RDY | GMBUS_HW_WAIT_PHASE; + vgpu_vreg_t(vgpu, PCH_GMBUS2) &= ~GMBUS_ACTIVE; + vgpu_vreg_t(vgpu, PCH_GMBUS2) |= GMBUS_HW_RDY | GMBUS_HW_WAIT_PHASE; if (intel_vgpu_has_monitor_on_port(vgpu, port) && !intel_vgpu_port_is_dp(vgpu, port)) { vgpu->display.i2c_edid.port = port; vgpu->display.i2c_edid.edid_available = true; - vgpu_vreg(vgpu, PCH_GMBUS2) &= ~GMBUS_SATOER; + vgpu_vreg_t(vgpu, PCH_GMBUS2) &= ~GMBUS_SATOER; } else - vgpu_vreg(vgpu, PCH_GMBUS2) |= GMBUS_SATOER; + vgpu_vreg_t(vgpu, PCH_GMBUS2) |= GMBUS_SATOER; return 0; } @@ -159,8 +159,8 @@ static int gmbus1_mmio_write(struct intel_vgpu *vgpu, unsigned int offset, * 2) HW_RDY bit asserted */ if (wvalue & GMBUS_SW_CLR_INT) { - vgpu_vreg(vgpu, PCH_GMBUS2) &= ~GMBUS_INT; - vgpu_vreg(vgpu, PCH_GMBUS2) |= GMBUS_HW_RDY; + vgpu_vreg_t(vgpu, PCH_GMBUS2) &= ~GMBUS_INT; + vgpu_vreg_t(vgpu, PCH_GMBUS2) |= GMBUS_HW_RDY; } /* For virtualization, we suppose that HW is always ready, @@ -208,7 +208,7 @@ static int gmbus1_mmio_write(struct intel_vgpu *vgpu, unsigned int offset, * visible in gmbus interface) */ i2c_edid->gmbus.phase = GMBUS_IDLE_PHASE; - vgpu_vreg(vgpu, PCH_GMBUS2) &= ~GMBUS_ACTIVE; + vgpu_vreg_t(vgpu, PCH_GMBUS2) &= ~GMBUS_ACTIVE; } break; case NIDX_NS_W: @@ -220,7 +220,7 @@ static int gmbus1_mmio_write(struct intel_vgpu *vgpu, 
unsigned int offset, * START (-->INDEX) -->DATA */ i2c_edid->gmbus.phase = GMBUS_DATA_PHASE; - vgpu_vreg(vgpu, PCH_GMBUS2) |= GMBUS_ACTIVE; + vgpu_vreg_t(vgpu, PCH_GMBUS2) |= GMBUS_ACTIVE; break; default: gvt_vgpu_err("Unknown/reserved GMBUS cycle detected!\n"); @@ -256,7 +256,7 @@ static int gmbus3_mmio_read(struct intel_vgpu *vgpu, unsigned int offset, u32 reg_data = 0; /* Data can only be recevied if previous settings correct */ - if (vgpu_vreg(vgpu, PCH_GMBUS1) & GMBUS_SLAVE_READ) { + if (vgpu_vreg_t(vgpu, PCH_GMBUS1) & GMBUS_SLAVE_READ) { if (byte_left <= 0) { memcpy(p_data, &vgpu_vreg(vgpu, offset), bytes); return 0; diff --git a/drivers/gpu/drm/i915/gvt/fb_decoder.c b/drivers/gpu/drm/i915/gvt/fb_decoder.c index 6cc99543693f..6b50fe78dc1b 100644 --- a/drivers/gpu/drm/i915/gvt/fb_decoder.c +++ b/drivers/gpu/drm/i915/gvt/fb_decoder.c @@ -147,7 +147,7 @@ static u32 intel_vgpu_get_stride(struct intel_vgpu *vgpu, int pipe, { struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv; - u32 stride_reg = vgpu_vreg(vgpu, DSPSTRIDE(pipe)) & stride_mask; + u32 stride_reg = vgpu_vreg_t(vgpu, DSPSTRIDE(pipe)) & stride_mask; u32 stride = stride_reg; if (IS_SKYLAKE(dev_priv) || IS_KABYLAKE(dev_priv)) { @@ -209,7 +209,7 @@ int intel_vgpu_decode_primary_plane(struct intel_vgpu *vgpu, if (pipe >= I915_MAX_PIPES) return -ENODEV; - val = vgpu_vreg(vgpu, DSPCNTR(pipe)); + val = vgpu_vreg_t(vgpu, DSPCNTR(pipe)); plane->enabled = !!(val & DISPLAY_PLANE_ENABLE); if (!plane->enabled) return -ENODEV; @@ -244,7 +244,7 @@ int intel_vgpu_decode_primary_plane(struct intel_vgpu *vgpu, plane->hw_format = fmt; - plane->base = vgpu_vreg(vgpu, DSPSURF(pipe)) & I915_GTT_PAGE_MASK; + plane->base = vgpu_vreg_t(vgpu, DSPSURF(pipe)) & I915_GTT_PAGE_MASK; if (!intel_gvt_ggtt_validate_range(vgpu, plane->base, 0)) { gvt_vgpu_err("invalid gma address: %lx\n", (unsigned long)plane->base); @@ -263,14 +263,14 @@ int intel_vgpu_decode_primary_plane(struct intel_vgpu *vgpu, (_PRI_PLANE_STRIDE_MASK >> 6) : _PRI_PLANE_STRIDE_MASK, plane->bpp); - plane->width = (vgpu_vreg(vgpu, PIPESRC(pipe)) & _PIPE_H_SRCSZ_MASK) >> + plane->width = (vgpu_vreg_t(vgpu, PIPESRC(pipe)) & _PIPE_H_SRCSZ_MASK) >> _PIPE_H_SRCSZ_SHIFT; plane->width += 1; - plane->height = (vgpu_vreg(vgpu, PIPESRC(pipe)) & + plane->height = (vgpu_vreg_t(vgpu, PIPESRC(pipe)) & _PIPE_V_SRCSZ_MASK) >> _PIPE_V_SRCSZ_SHIFT; plane->height += 1; /* raw height is one minus the real value */ - val = vgpu_vreg(vgpu, DSPTILEOFF(pipe)); + val = vgpu_vreg_t(vgpu, DSPTILEOFF(pipe)); plane->x_offset = (val & _PRI_PLANE_X_OFF_MASK) >> _PRI_PLANE_X_OFF_SHIFT; plane->y_offset = (val & _PRI_PLANE_Y_OFF_MASK) >> @@ -344,7 +344,7 @@ int intel_vgpu_decode_cursor_plane(struct intel_vgpu *vgpu, if (pipe >= I915_MAX_PIPES) return -ENODEV; - val = vgpu_vreg(vgpu, CURCNTR(pipe)); + val = vgpu_vreg_t(vgpu, CURCNTR(pipe)); mode = val & CURSOR_MODE; plane->enabled = (mode != CURSOR_MODE_DISABLE); if (!plane->enabled) @@ -370,7 +370,7 @@ int intel_vgpu_decode_cursor_plane(struct intel_vgpu *vgpu, gvt_dbg_core("alpha_plane=0x%x, alpha_force=0x%x\n", alpha_plane, alpha_force); - plane->base = vgpu_vreg(vgpu, CURBASE(pipe)) & I915_GTT_PAGE_MASK; + plane->base = vgpu_vreg_t(vgpu, CURBASE(pipe)) & I915_GTT_PAGE_MASK; if (!intel_gvt_ggtt_validate_range(vgpu, plane->base, 0)) { gvt_vgpu_err("invalid gma address: %lx\n", (unsigned long)plane->base); @@ -384,7 +384,7 @@ int intel_vgpu_decode_cursor_plane(struct intel_vgpu *vgpu, return -EINVAL; } - val = vgpu_vreg(vgpu, CURPOS(pipe)); + val = vgpu_vreg_t(vgpu, 
CURPOS(pipe)); plane->x_pos = (val & _CURSOR_POS_X_MASK) >> _CURSOR_POS_X_SHIFT; plane->x_sign = (val & _CURSOR_SIGN_X_MASK) >> _CURSOR_SIGN_X_SHIFT; plane->y_pos = (val & _CURSOR_POS_Y_MASK) >> _CURSOR_POS_Y_SHIFT; @@ -424,7 +424,7 @@ int intel_vgpu_decode_sprite_plane(struct intel_vgpu *vgpu, if (pipe >= I915_MAX_PIPES) return -ENODEV; - val = vgpu_vreg(vgpu, SPRCTL(pipe)); + val = vgpu_vreg_t(vgpu, SPRCTL(pipe)); plane->enabled = !!(val & SPRITE_ENABLE); if (!plane->enabled) return -ENODEV; @@ -475,7 +475,7 @@ int intel_vgpu_decode_sprite_plane(struct intel_vgpu *vgpu, plane->drm_format = drm_format; - plane->base = vgpu_vreg(vgpu, SPRSURF(pipe)) & I915_GTT_PAGE_MASK; + plane->base = vgpu_vreg_t(vgpu, SPRSURF(pipe)) & I915_GTT_PAGE_MASK; if (!intel_gvt_ggtt_validate_range(vgpu, plane->base, 0)) { gvt_vgpu_err("invalid gma address: %lx\n", (unsigned long)plane->base); @@ -489,10 +489,10 @@ int intel_vgpu_decode_sprite_plane(struct intel_vgpu *vgpu, return -EINVAL; } - plane->stride = vgpu_vreg(vgpu, SPRSTRIDE(pipe)) & + plane->stride = vgpu_vreg_t(vgpu, SPRSTRIDE(pipe)) & _SPRITE_STRIDE_MASK; - val = vgpu_vreg(vgpu, SPRSIZE(pipe)); + val = vgpu_vreg_t(vgpu, SPRSIZE(pipe)); plane->height = (val & _SPRITE_SIZE_HEIGHT_MASK) >> _SPRITE_SIZE_HEIGHT_SHIFT; plane->width = (val & _SPRITE_SIZE_WIDTH_MASK) >> @@ -500,11 +500,11 @@ int intel_vgpu_decode_sprite_plane(struct intel_vgpu *vgpu, plane->height += 1; /* raw height is one minus the real value */ plane->width += 1; /* raw width is one minus the real value */ - val = vgpu_vreg(vgpu, SPRPOS(pipe)); + val = vgpu_vreg_t(vgpu, SPRPOS(pipe)); plane->x_pos = (val & _SPRITE_POS_X_MASK) >> _SPRITE_POS_X_SHIFT; plane->y_pos = (val & _SPRITE_POS_Y_MASK) >> _SPRITE_POS_Y_SHIFT; - val = vgpu_vreg(vgpu, SPROFFSET(pipe)); + val = vgpu_vreg_t(vgpu, SPROFFSET(pipe)); plane->x_offset = (val & _SPRITE_OFFSET_START_X_MASK) >> _SPRITE_OFFSET_START_X_SHIFT; plane->y_offset = (val & _SPRITE_OFFSET_START_Y_MASK) >> diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c index 71a0f2b87b3a..c4f752eeadcc 100644 --- a/drivers/gpu/drm/i915/gvt/gtt.c +++ b/drivers/gpu/drm/i915/gvt/gtt.c @@ -1968,6 +1968,39 @@ int intel_vgpu_emulate_gtt_mmio_write(struct intel_vgpu *vgpu, unsigned int off, return ret; } +int intel_vgpu_write_protect_handler(struct intel_vgpu *vgpu, u64 pa, + void *p_data, unsigned int bytes) +{ + struct intel_gvt *gvt = vgpu->gvt; + int ret = 0; + + if (atomic_read(&vgpu->gtt.n_tracked_guest_page)) { + struct intel_vgpu_page_track *t; + + mutex_lock(&gvt->lock); + + t = intel_vgpu_find_tracked_page(vgpu, pa >> PAGE_SHIFT); + if (t) { + if (unlikely(vgpu->failsafe)) { + /* remove write protection to prevent future traps */ + intel_vgpu_clean_page_track(vgpu, t); + } else { + ret = t->handler(t, pa, p_data, bytes); + if (ret) { + gvt_err("guest page write error %d, " + "gfn 0x%lx, pa 0x%llx, " + "var 0x%x, len %d\n", + ret, t->gfn, pa, + *(u32 *)p_data, bytes); + } + } + } + mutex_unlock(&gvt->lock); + } + return ret; +} + + static int alloc_scratch_pages(struct intel_vgpu *vgpu, intel_gvt_gtt_type_t type) { @@ -2244,7 +2277,7 @@ struct intel_vgpu_mm *intel_vgpu_find_ppgtt_mm(struct intel_vgpu *vgpu, int intel_vgpu_g2v_create_ppgtt_mm(struct intel_vgpu *vgpu, int page_table_level) { - u64 *pdp = (u64 *)&vgpu_vreg64(vgpu, vgtif_reg(pdp[0])); + u64 *pdp = (u64 *)&vgpu_vreg64_t(vgpu, vgtif_reg(pdp[0])); struct intel_vgpu_mm *mm; if (WARN_ON((page_table_level != 4) && (page_table_level != 3))) @@ -2279,7 +2312,7 @@ int
intel_vgpu_g2v_create_ppgtt_mm(struct intel_vgpu *vgpu, int intel_vgpu_g2v_destroy_ppgtt_mm(struct intel_vgpu *vgpu, int page_table_level) { - u64 *pdp = (u64 *)&vgpu_vreg64(vgpu, vgtif_reg(pdp[0])); + u64 *pdp = (u64 *)&vgpu_vreg64_t(vgpu, vgtif_reg(pdp[0])); struct intel_vgpu_mm *mm; if (WARN_ON((page_table_level != 4) && (page_table_level != 3))) diff --git a/drivers/gpu/drm/i915/gvt/gtt.h b/drivers/gpu/drm/i915/gvt/gtt.h index f98c1c19b4cb..4cc13b5934f1 100644 --- a/drivers/gpu/drm/i915/gvt/gtt.h +++ b/drivers/gpu/drm/i915/gvt/gtt.h @@ -308,4 +308,7 @@ int intel_vgpu_emulate_gtt_mmio_read(struct intel_vgpu *vgpu, int intel_vgpu_emulate_gtt_mmio_write(struct intel_vgpu *vgpu, unsigned int off, void *p_data, unsigned int bytes); +int intel_vgpu_write_protect_handler(struct intel_vgpu *vgpu, u64 pa, + void *p_data, unsigned int bytes); + #endif /* _GVT_GTT_H_ */ diff --git a/drivers/gpu/drm/i915/gvt/gvt.c b/drivers/gpu/drm/i915/gvt/gvt.c index 643bb961d40d..fac54f32d33f 100644 --- a/drivers/gpu/drm/i915/gvt/gvt.c +++ b/drivers/gpu/drm/i915/gvt/gvt.c @@ -183,6 +183,7 @@ static const struct intel_gvt_ops intel_gvt_ops = { .get_gvt_attrs = intel_get_gvt_attrs, .vgpu_query_plane = intel_vgpu_query_plane, .vgpu_get_dmabuf = intel_vgpu_get_dmabuf, + .write_protect_handler = intel_vgpu_write_protect_handler, }; /** diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h index 1e9f11c8b7bb..7dc7a80213a8 100644 --- a/drivers/gpu/drm/i915/gvt/gvt.h +++ b/drivers/gpu/drm/i915/gvt/gvt.h @@ -412,23 +412,20 @@ void intel_vgpu_free_resource(struct intel_vgpu *vgpu); void intel_vgpu_write_fence(struct intel_vgpu *vgpu, u32 fence, u64 value); -/* Macros for easily accessing vGPU virtual/shadow register */ -#define vgpu_vreg(vgpu, reg) \ - (*(u32 *)(vgpu->mmio.vreg + INTEL_GVT_MMIO_OFFSET(reg))) -#define vgpu_vreg8(vgpu, reg) \ - (*(u8 *)(vgpu->mmio.vreg + INTEL_GVT_MMIO_OFFSET(reg))) -#define vgpu_vreg16(vgpu, reg) \ - (*(u16 *)(vgpu->mmio.vreg + INTEL_GVT_MMIO_OFFSET(reg))) -#define vgpu_vreg64(vgpu, reg) \ - (*(u64 *)(vgpu->mmio.vreg + INTEL_GVT_MMIO_OFFSET(reg))) -#define vgpu_sreg(vgpu, reg) \ - (*(u32 *)(vgpu->mmio.sreg + INTEL_GVT_MMIO_OFFSET(reg))) -#define vgpu_sreg8(vgpu, reg) \ - (*(u8 *)(vgpu->mmio.sreg + INTEL_GVT_MMIO_OFFSET(reg))) -#define vgpu_sreg16(vgpu, reg) \ - (*(u16 *)(vgpu->mmio.sreg + INTEL_GVT_MMIO_OFFSET(reg))) -#define vgpu_sreg64(vgpu, reg) \ - (*(u64 *)(vgpu->mmio.sreg + INTEL_GVT_MMIO_OFFSET(reg))) +/* Macros for easily accessing vGPU virtual/shadow register. 
+ Explicitly separate use for typed MMIO reg or real offset. */ +#define vgpu_vreg_t(vgpu, reg) \ + (*(u32 *)(vgpu->mmio.vreg + i915_mmio_reg_offset(reg))) +#define vgpu_vreg(vgpu, offset) \ + (*(u32 *)(vgpu->mmio.vreg + (offset))) +#define vgpu_vreg64_t(vgpu, reg) \ + (*(u64 *)(vgpu->mmio.vreg + i915_mmio_reg_offset(reg))) +#define vgpu_vreg64(vgpu, offset) \ + (*(u64 *)(vgpu->mmio.vreg + (offset))) +#define vgpu_sreg_t(vgpu, reg) \ + (*(u32 *)(vgpu->mmio.sreg + i915_mmio_reg_offset(reg))) +#define vgpu_sreg(vgpu, offset) \ + (*(u32 *)(vgpu->mmio.sreg + (offset))) #define for_each_active_vgpu(gvt, vgpu, id) \ idr_for_each_entry((&(gvt)->vgpu_idr), (vgpu), (id)) \ @@ -549,6 +546,8 @@ struct intel_gvt_ops { struct attribute_group ***intel_vgpu_type_groups); int (*vgpu_query_plane)(struct intel_vgpu *vgpu, void *); int (*vgpu_get_dmabuf)(struct intel_vgpu *vgpu, unsigned int); + int (*write_protect_handler)(struct intel_vgpu *, u64, void *, + unsigned int); }; diff --git a/drivers/gpu/drm/i915/gvt/handlers.c b/drivers/gpu/drm/i915/gvt/handlers.c index c982867e7c2b..92d6468daeee 100644 --- a/drivers/gpu/drm/i915/gvt/handlers.c +++ b/drivers/gpu/drm/i915/gvt/handlers.c @@ -343,13 +343,13 @@ static int pch_pp_control_mmio_write(struct intel_vgpu *vgpu, write_vreg(vgpu, offset, p_data, bytes); if (vgpu_vreg(vgpu, offset) & PANEL_POWER_ON) { - vgpu_vreg(vgpu, PCH_PP_STATUS) |= PP_ON; - vgpu_vreg(vgpu, PCH_PP_STATUS) |= PP_SEQUENCE_STATE_ON_IDLE; - vgpu_vreg(vgpu, PCH_PP_STATUS) &= ~PP_SEQUENCE_POWER_DOWN; - vgpu_vreg(vgpu, PCH_PP_STATUS) &= ~PP_CYCLE_DELAY_ACTIVE; + vgpu_vreg_t(vgpu, PCH_PP_STATUS) |= PP_ON; + vgpu_vreg_t(vgpu, PCH_PP_STATUS) |= PP_SEQUENCE_STATE_ON_IDLE; + vgpu_vreg_t(vgpu, PCH_PP_STATUS) &= ~PP_SEQUENCE_POWER_DOWN; + vgpu_vreg_t(vgpu, PCH_PP_STATUS) &= ~PP_CYCLE_DELAY_ACTIVE; } else - vgpu_vreg_t(vgpu, PCH_PP_STATUS) &= ~(PP_ON | PP_SEQUENCE_POWER_DOWN | PP_CYCLE_DELAY_ACTIVE); return 0; @@ -503,7 +503,7 @@ static int ddi_buf_ctl_mmio_write(struct intel_vgpu *vgpu, unsigned int offset, } else { vgpu_vreg(vgpu, offset) |= DDI_BUF_IS_IDLE; if (offset == i915_mmio_reg_offset(DDI_BUF_CTL(PORT_E))) - vgpu_vreg(vgpu, DP_TP_STATUS(PORT_E)) + vgpu_vreg_t(vgpu, DP_TP_STATUS(PORT_E)) &= ~DP_TP_STATUS_AUTOTRAIN_DONE; } return 0; @@ -521,9 +521,9 @@ static int fdi_rx_iir_mmio_write(struct intel_vgpu *vgpu, static int fdi_auto_training_started(struct intel_vgpu *vgpu) { - u32 ddi_buf_ctl = vgpu_vreg(vgpu, DDI_BUF_CTL(PORT_E)); + u32 ddi_buf_ctl = vgpu_vreg_t(vgpu, DDI_BUF_CTL(PORT_E)); u32 rx_ctl = vgpu_vreg(vgpu, _FDI_RXA_CTL); - u32 tx_ctl = vgpu_vreg(vgpu, DP_TP_CTL(PORT_E)); + u32 tx_ctl = vgpu_vreg_t(vgpu, DP_TP_CTL(PORT_E)); if ((ddi_buf_ctl & DDI_BUF_CTL_ENABLE) && (rx_ctl & FDI_RX_ENABLE) && @@ -564,12 +564,12 @@ static int check_fdi_rx_train_status(struct intel_vgpu *vgpu, fdi_tx_check_bits = FDI_TX_ENABLE | fdi_tx_train_bits; /* If imr bit has been masked */ - if (vgpu_vreg(vgpu, fdi_rx_imr) & fdi_iir_check_bits) + if (vgpu_vreg_t(vgpu, fdi_rx_imr) & fdi_iir_check_bits) return 0; - if (((vgpu_vreg(vgpu, fdi_tx_ctl) & fdi_tx_check_bits) + if (((vgpu_vreg_t(vgpu, fdi_tx_ctl) & fdi_tx_check_bits) == fdi_tx_check_bits) - && ((vgpu_vreg(vgpu, fdi_rx_ctl) & fdi_rx_check_bits) + && ((vgpu_vreg_t(vgpu, fdi_rx_ctl) & fdi_rx_check_bits) == fdi_rx_check_bits)) return 1; else @@ -626,17 +626,17 @@ static int update_fdi_rx_iir_status(struct intel_vgpu *vgpu, if (ret < 0) return ret; if (ret) -
vgpu_vreg_t(vgpu, fdi_rx_iir) |= FDI_RX_BIT_LOCK; ret = check_fdi_rx_train_status(vgpu, index, FDI_LINK_TRAIN_PATTERN2); if (ret < 0) return ret; if (ret) - vgpu_vreg(vgpu, fdi_rx_iir) |= FDI_RX_SYMBOL_LOCK; + vgpu_vreg_t(vgpu, fdi_rx_iir) |= FDI_RX_SYMBOL_LOCK; if (offset == _FDI_RXA_CTL) if (fdi_auto_training_started(vgpu)) - vgpu_vreg(vgpu, DP_TP_STATUS(PORT_E)) |= + vgpu_vreg_t(vgpu, DP_TP_STATUS(PORT_E)) |= DP_TP_STATUS_AUTOTRAIN_DONE; return 0; } @@ -657,7 +657,7 @@ static int dp_tp_ctl_mmio_write(struct intel_vgpu *vgpu, unsigned int offset, data = (vgpu_vreg(vgpu, offset) & GENMASK(10, 8)) >> 8; if (data == 0x2) { status_reg = DP_TP_STATUS(index); - vgpu_vreg(vgpu, status_reg) |= (1 << 25); + vgpu_vreg_t(vgpu, status_reg) |= (1 << 25); } return 0; } @@ -721,7 +721,7 @@ static int pri_surf_mmio_write(struct intel_vgpu *vgpu, unsigned int offset, }; write_vreg(vgpu, offset, p_data, bytes); - vgpu_vreg(vgpu, surflive_reg) = vgpu_vreg(vgpu, offset); + vgpu_vreg_t(vgpu, surflive_reg) = vgpu_vreg(vgpu, offset); set_bit(flip_event[index], vgpu->irq.flip_done_event[index]); return 0; @@ -742,7 +742,7 @@ static int spr_surf_mmio_write(struct intel_vgpu *vgpu, unsigned int offset, }; write_vreg(vgpu, offset, p_data, bytes); - vgpu_vreg(vgpu, surflive_reg) = vgpu_vreg(vgpu, offset); + vgpu_vreg_t(vgpu, surflive_reg) = vgpu_vreg(vgpu, offset); set_bit(flip_event[index], vgpu->irq.flip_done_event[index]); return 0; @@ -1064,9 +1064,9 @@ static void write_virtual_sbi_register(struct intel_vgpu *vgpu, static int sbi_data_mmio_read(struct intel_vgpu *vgpu, unsigned int offset, void *p_data, unsigned int bytes) { - if (((vgpu_vreg(vgpu, SBI_CTL_STAT) & SBI_OPCODE_MASK) >> + if (((vgpu_vreg_t(vgpu, SBI_CTL_STAT) & SBI_OPCODE_MASK) >> SBI_OPCODE_SHIFT) == SBI_CMD_CRRD) { - unsigned int sbi_offset = (vgpu_vreg(vgpu, SBI_ADDR) & + unsigned int sbi_offset = (vgpu_vreg_t(vgpu, SBI_ADDR) & SBI_ADDR_OFFSET_MASK) >> SBI_ADDR_OFFSET_SHIFT; vgpu_vreg(vgpu, offset) = read_virtual_sbi_register(vgpu, sbi_offset); @@ -1091,13 +1091,13 @@ static int sbi_ctl_mmio_write(struct intel_vgpu *vgpu, unsigned int offset, vgpu_vreg(vgpu, offset) = data; - if (((vgpu_vreg(vgpu, SBI_CTL_STAT) & SBI_OPCODE_MASK) >> + if (((vgpu_vreg_t(vgpu, SBI_CTL_STAT) & SBI_OPCODE_MASK) >> SBI_OPCODE_SHIFT) == SBI_CMD_CRWR) { - unsigned int sbi_offset = (vgpu_vreg(vgpu, SBI_ADDR) & + unsigned int sbi_offset = (vgpu_vreg_t(vgpu, SBI_ADDR) & SBI_ADDR_OFFSET_MASK) >> SBI_ADDR_OFFSET_SHIFT; write_virtual_sbi_register(vgpu, sbi_offset, - vgpu_vreg(vgpu, SBI_DATA)); + vgpu_vreg_t(vgpu, SBI_DATA)); } return 0; } @@ -1343,7 +1343,7 @@ static int mailbox_write(struct intel_vgpu *vgpu, unsigned int offset, { u32 value = *(u32 *)p_data; u32 cmd = value & 0xff; - u32 *data0 = &vgpu_vreg(vgpu, GEN6_PCODE_DATA); + u32 *data0 = &vgpu_vreg_t(vgpu, GEN6_PCODE_DATA); switch (cmd) { case GEN9_PCODE_READ_MEM_LATENCY: @@ -1586,7 +1586,7 @@ static int ring_reset_ctl_write(struct intel_vgpu *vgpu, } #define MMIO_F(reg, s, f, am, rm, d, r, w) do { \ - ret = new_mmio_info(gvt, INTEL_GVT_MMIO_OFFSET(reg), \ + ret = new_mmio_info(gvt, i915_mmio_reg_offset(reg), \ f, s, am, rm, d, r, w); \ if (ret) \ return ret; \ @@ -1654,22 +1654,22 @@ static int init_generic_mmio_info(struct intel_gvt *gvt) MMIO_GM_RDR(BLT_HWS_PGA_GEN7, D_ALL, NULL, NULL); MMIO_GM_RDR(VEBOX_HWS_PGA_GEN7, D_ALL, NULL, NULL); -#define RING_REG(base) (base + 0x28) +#define RING_REG(base) _MMIO((base) + 0x28) MMIO_RING_DFH(RING_REG, D_ALL, F_CMD_ACCESS, NULL, NULL); #undef RING_REG -#define 
RING_REG(base) (base + 0x134) +#define RING_REG(base) _MMIO((base) + 0x134) MMIO_RING_DFH(RING_REG, D_ALL, F_CMD_ACCESS, NULL, NULL); #undef RING_REG -#define RING_REG(base) (base + 0x6c) +#define RING_REG(base) _MMIO((base) + 0x6c) MMIO_RING_DFH(RING_REG, D_ALL, 0, mmio_read_from_hw, NULL); #undef RING_REG MMIO_DH(GEN7_SC_INSTDONE, D_BDW_PLUS, mmio_read_from_hw, NULL); - MMIO_GM_RDR(0x2148, D_ALL, NULL, NULL); + MMIO_GM_RDR(_MMIO(0x2148), D_ALL, NULL, NULL); MMIO_GM_RDR(CCID, D_ALL, NULL, NULL); - MMIO_GM_RDR(0x12198, D_ALL, NULL, NULL); + MMIO_GM_RDR(_MMIO(0x12198), D_ALL, NULL, NULL); MMIO_D(GEN7_CXT_SIZE, D_ALL); MMIO_RING_DFH(RING_TAIL, D_ALL, F_CMD_ACCESS, NULL, NULL); @@ -1679,7 +1679,7 @@ static int init_generic_mmio_info(struct intel_gvt *gvt) MMIO_RING_GM_RDR(RING_START, D_ALL, NULL, NULL); /* RING MODE */ -#define RING_REG(base) (base + 0x29c) +#define RING_REG(base) _MMIO((base) + 0x29c) MMIO_RING_DFH(RING_REG, D_ALL, F_MODE_MASK | F_CMD_ACCESS, NULL, ring_mode_mmio_write); #undef RING_REG @@ -1698,37 +1698,37 @@ static int init_generic_mmio_info(struct intel_gvt *gvt) NULL, NULL); MMIO_DFH(CACHE_MODE_1, D_ALL, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); MMIO_DFH(CACHE_MODE_0, D_ALL, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x2124, D_ALL, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x2124), D_ALL, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x20dc, D_ALL, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x20dc), D_ALL, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); MMIO_DFH(_3D_CHICKEN3, D_ALL, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x2088, D_ALL, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x20e4, D_ALL, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x2470, D_ALL, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x2088), D_ALL, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x20e4), D_ALL, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x2470), D_ALL, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); MMIO_DFH(GAM_ECOCHK, D_ALL, F_CMD_ACCESS, NULL, NULL); MMIO_DFH(GEN7_COMMON_SLICE_CHICKEN1, D_ALL, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); MMIO_DFH(COMMON_SLICE_CHICKEN2, D_ALL, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x9030, D_ALL, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x20a0, D_ALL, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x2420, D_ALL, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x2430, D_ALL, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x2434, D_ALL, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x2438, D_ALL, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x243c, D_ALL, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x7018, D_ALL, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x9030), D_ALL, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x20a0), D_ALL, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x2420), D_ALL, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x2430), D_ALL, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x2434), D_ALL, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x2438), D_ALL, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x243c), D_ALL, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x7018), D_ALL, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); MMIO_DFH(HALF_SLICE_CHICKEN3, D_ALL, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); MMIO_DFH(GEN7_HALF_SLICE_CHICKEN1, D_ALL, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); /* display */ - MMIO_F(0x60220, 0x20, 0, 0, 0, D_ALL, NULL, NULL); - MMIO_D(0x602a0, D_ALL); + MMIO_F(_MMIO(0x60220), 0x20, 0, 0, 0, D_ALL, NULL, NULL); + MMIO_D(_MMIO(0x602a0), D_ALL); - MMIO_D(0x65050, D_ALL); 
- MMIO_D(0x650b4, D_ALL); + MMIO_D(_MMIO(0x65050), D_ALL); + MMIO_D(_MMIO(0x650b4), D_ALL); - MMIO_D(0xc4040, D_ALL); + MMIO_D(_MMIO(0xc4040), D_ALL); MMIO_D(DERRMR, D_ALL); MMIO_D(PIPEDSL(PIPE_A), D_ALL); @@ -1768,14 +1768,14 @@ static int init_generic_mmio_info(struct intel_gvt *gvt) MMIO_D(CURBASE(PIPE_B), D_ALL); MMIO_D(CURBASE(PIPE_C), D_ALL); - MMIO_D(0x700ac, D_ALL); - MMIO_D(0x710ac, D_ALL); - MMIO_D(0x720ac, D_ALL); + MMIO_D(_MMIO(0x700ac), D_ALL); + MMIO_D(_MMIO(0x710ac), D_ALL); + MMIO_D(_MMIO(0x720ac), D_ALL); - MMIO_D(0x70090, D_ALL); - MMIO_D(0x70094, D_ALL); - MMIO_D(0x70098, D_ALL); - MMIO_D(0x7009c, D_ALL); + MMIO_D(_MMIO(0x70090), D_ALL); + MMIO_D(_MMIO(0x70094), D_ALL); + MMIO_D(_MMIO(0x70098), D_ALL); + MMIO_D(_MMIO(0x7009c), D_ALL); MMIO_D(DSPCNTR(PIPE_A), D_ALL); MMIO_D(DSPADDR(PIPE_A), D_ALL); @@ -1951,24 +1951,24 @@ static int init_generic_mmio_info(struct intel_gvt *gvt) MMIO_D(BLC_PWM_PCH_CTL1, D_ALL); MMIO_D(BLC_PWM_PCH_CTL2, D_ALL); - MMIO_D(0x48268, D_ALL); + MMIO_D(_MMIO(0x48268), D_ALL); MMIO_F(PCH_GMBUS0, 4 * 4, 0, 0, 0, D_ALL, gmbus_mmio_read, gmbus_mmio_write); MMIO_F(PCH_GPIOA, 6 * 4, F_UNALIGN, 0, 0, D_ALL, NULL, NULL); - MMIO_F(0xe4f00, 0x28, 0, 0, 0, D_ALL, NULL, NULL); + MMIO_F(_MMIO(0xe4f00), 0x28, 0, 0, 0, D_ALL, NULL, NULL); - MMIO_F(_PCH_DPB_AUX_CH_CTL, 6 * 4, 0, 0, 0, D_PRE_SKL, NULL, + MMIO_F(_MMIO(_PCH_DPB_AUX_CH_CTL), 6 * 4, 0, 0, 0, D_PRE_SKL, NULL, dp_aux_ch_ctl_mmio_write); - MMIO_F(_PCH_DPC_AUX_CH_CTL, 6 * 4, 0, 0, 0, D_PRE_SKL, NULL, + MMIO_F(_MMIO(_PCH_DPC_AUX_CH_CTL), 6 * 4, 0, 0, 0, D_PRE_SKL, NULL, dp_aux_ch_ctl_mmio_write); - MMIO_F(_PCH_DPD_AUX_CH_CTL, 6 * 4, 0, 0, 0, D_PRE_SKL, NULL, + MMIO_F(_MMIO(_PCH_DPD_AUX_CH_CTL), 6 * 4, 0, 0, 0, D_PRE_SKL, NULL, dp_aux_ch_ctl_mmio_write); MMIO_DH(PCH_ADPA, D_PRE_SKL, NULL, pch_adpa_mmio_write); - MMIO_DH(_PCH_TRANSACONF, D_ALL, NULL, transconf_mmio_write); - MMIO_DH(_PCH_TRANSBCONF, D_ALL, NULL, transconf_mmio_write); + MMIO_DH(_MMIO(_PCH_TRANSACONF), D_ALL, NULL, transconf_mmio_write); + MMIO_DH(_MMIO(_PCH_TRANSBCONF), D_ALL, NULL, transconf_mmio_write); MMIO_DH(FDI_RX_IIR(PIPE_A), D_ALL, NULL, fdi_rx_iir_mmio_write); MMIO_DH(FDI_RX_IIR(PIPE_B), D_ALL, NULL, fdi_rx_iir_mmio_write); @@ -1980,30 +1980,30 @@ static int init_generic_mmio_info(struct intel_gvt *gvt) MMIO_DH(FDI_RX_CTL(PIPE_B), D_ALL, NULL, update_fdi_rx_iir_status); MMIO_DH(FDI_RX_CTL(PIPE_C), D_ALL, NULL, update_fdi_rx_iir_status); - MMIO_D(_PCH_TRANS_HTOTAL_A, D_ALL); - MMIO_D(_PCH_TRANS_HBLANK_A, D_ALL); - MMIO_D(_PCH_TRANS_HSYNC_A, D_ALL); - MMIO_D(_PCH_TRANS_VTOTAL_A, D_ALL); - MMIO_D(_PCH_TRANS_VBLANK_A, D_ALL); - MMIO_D(_PCH_TRANS_VSYNC_A, D_ALL); - MMIO_D(_PCH_TRANS_VSYNCSHIFT_A, D_ALL); - - MMIO_D(_PCH_TRANS_HTOTAL_B, D_ALL); - MMIO_D(_PCH_TRANS_HBLANK_B, D_ALL); - MMIO_D(_PCH_TRANS_HSYNC_B, D_ALL); - MMIO_D(_PCH_TRANS_VTOTAL_B, D_ALL); - MMIO_D(_PCH_TRANS_VBLANK_B, D_ALL); - MMIO_D(_PCH_TRANS_VSYNC_B, D_ALL); - MMIO_D(_PCH_TRANS_VSYNCSHIFT_B, D_ALL); - - MMIO_D(_PCH_TRANSA_DATA_M1, D_ALL); - MMIO_D(_PCH_TRANSA_DATA_N1, D_ALL); - MMIO_D(_PCH_TRANSA_DATA_M2, D_ALL); - MMIO_D(_PCH_TRANSA_DATA_N2, D_ALL); - MMIO_D(_PCH_TRANSA_LINK_M1, D_ALL); - MMIO_D(_PCH_TRANSA_LINK_N1, D_ALL); - MMIO_D(_PCH_TRANSA_LINK_M2, D_ALL); - MMIO_D(_PCH_TRANSA_LINK_N2, D_ALL); + MMIO_D(_MMIO(_PCH_TRANS_HTOTAL_A), D_ALL); + MMIO_D(_MMIO(_PCH_TRANS_HBLANK_A), D_ALL); + MMIO_D(_MMIO(_PCH_TRANS_HSYNC_A), D_ALL); + MMIO_D(_MMIO(_PCH_TRANS_VTOTAL_A), D_ALL); + MMIO_D(_MMIO(_PCH_TRANS_VBLANK_A), D_ALL); + MMIO_D(_MMIO(_PCH_TRANS_VSYNC_A), D_ALL); + 
MMIO_D(_MMIO(_PCH_TRANS_VSYNCSHIFT_A), D_ALL); + + MMIO_D(_MMIO(_PCH_TRANS_HTOTAL_B), D_ALL); + MMIO_D(_MMIO(_PCH_TRANS_HBLANK_B), D_ALL); + MMIO_D(_MMIO(_PCH_TRANS_HSYNC_B), D_ALL); + MMIO_D(_MMIO(_PCH_TRANS_VTOTAL_B), D_ALL); + MMIO_D(_MMIO(_PCH_TRANS_VBLANK_B), D_ALL); + MMIO_D(_MMIO(_PCH_TRANS_VSYNC_B), D_ALL); + MMIO_D(_MMIO(_PCH_TRANS_VSYNCSHIFT_B), D_ALL); + + MMIO_D(_MMIO(_PCH_TRANSA_DATA_M1), D_ALL); + MMIO_D(_MMIO(_PCH_TRANSA_DATA_N1), D_ALL); + MMIO_D(_MMIO(_PCH_TRANSA_DATA_M2), D_ALL); + MMIO_D(_MMIO(_PCH_TRANSA_DATA_N2), D_ALL); + MMIO_D(_MMIO(_PCH_TRANSA_LINK_M1), D_ALL); + MMIO_D(_MMIO(_PCH_TRANSA_LINK_N1), D_ALL); + MMIO_D(_MMIO(_PCH_TRANSA_LINK_M2), D_ALL); + MMIO_D(_MMIO(_PCH_TRANSA_LINK_N2), D_ALL); MMIO_D(TRANS_DP_CTL(PIPE_A), D_ALL); MMIO_D(TRANS_DP_CTL(PIPE_B), D_ALL); @@ -2021,38 +2021,38 @@ static int init_generic_mmio_info(struct intel_gvt *gvt) MMIO_D(TVIDEO_DIP_DATA(PIPE_C), D_ALL); MMIO_D(TVIDEO_DIP_GCP(PIPE_C), D_ALL); - MMIO_D(_FDI_RXA_MISC, D_ALL); - MMIO_D(_FDI_RXB_MISC, D_ALL); - MMIO_D(_FDI_RXA_TUSIZE1, D_ALL); - MMIO_D(_FDI_RXA_TUSIZE2, D_ALL); - MMIO_D(_FDI_RXB_TUSIZE1, D_ALL); - MMIO_D(_FDI_RXB_TUSIZE2, D_ALL); + MMIO_D(_MMIO(_FDI_RXA_MISC), D_ALL); + MMIO_D(_MMIO(_FDI_RXB_MISC), D_ALL); + MMIO_D(_MMIO(_FDI_RXA_TUSIZE1), D_ALL); + MMIO_D(_MMIO(_FDI_RXA_TUSIZE2), D_ALL); + MMIO_D(_MMIO(_FDI_RXB_TUSIZE1), D_ALL); + MMIO_D(_MMIO(_FDI_RXB_TUSIZE2), D_ALL); MMIO_DH(PCH_PP_CONTROL, D_ALL, NULL, pch_pp_control_mmio_write); MMIO_D(PCH_PP_DIVISOR, D_ALL); MMIO_D(PCH_PP_STATUS, D_ALL); MMIO_D(PCH_LVDS, D_ALL); - MMIO_D(_PCH_DPLL_A, D_ALL); - MMIO_D(_PCH_DPLL_B, D_ALL); - MMIO_D(_PCH_FPA0, D_ALL); - MMIO_D(_PCH_FPA1, D_ALL); - MMIO_D(_PCH_FPB0, D_ALL); - MMIO_D(_PCH_FPB1, D_ALL); + MMIO_D(_MMIO(_PCH_DPLL_A), D_ALL); + MMIO_D(_MMIO(_PCH_DPLL_B), D_ALL); + MMIO_D(_MMIO(_PCH_FPA0), D_ALL); + MMIO_D(_MMIO(_PCH_FPA1), D_ALL); + MMIO_D(_MMIO(_PCH_FPB0), D_ALL); + MMIO_D(_MMIO(_PCH_FPB1), D_ALL); MMIO_D(PCH_DREF_CONTROL, D_ALL); MMIO_D(PCH_RAWCLK_FREQ, D_ALL); MMIO_D(PCH_DPLL_SEL, D_ALL); - MMIO_D(0x61208, D_ALL); - MMIO_D(0x6120c, D_ALL); + MMIO_D(_MMIO(0x61208), D_ALL); + MMIO_D(_MMIO(0x6120c), D_ALL); MMIO_D(PCH_PP_ON_DELAYS, D_ALL); MMIO_D(PCH_PP_OFF_DELAYS, D_ALL); - MMIO_DH(0xe651c, D_ALL, dpy_reg_mmio_read, NULL); - MMIO_DH(0xe661c, D_ALL, dpy_reg_mmio_read, NULL); - MMIO_DH(0xe671c, D_ALL, dpy_reg_mmio_read, NULL); - MMIO_DH(0xe681c, D_ALL, dpy_reg_mmio_read, NULL); - MMIO_DH(0xe6c04, D_ALL, dpy_reg_mmio_read, NULL); - MMIO_DH(0xe6e1c, D_ALL, dpy_reg_mmio_read, NULL); + MMIO_DH(_MMIO(0xe651c), D_ALL, dpy_reg_mmio_read, NULL); + MMIO_DH(_MMIO(0xe661c), D_ALL, dpy_reg_mmio_read, NULL); + MMIO_DH(_MMIO(0xe671c), D_ALL, dpy_reg_mmio_read, NULL); + MMIO_DH(_MMIO(0xe681c), D_ALL, dpy_reg_mmio_read, NULL); + MMIO_DH(_MMIO(0xe6c04), D_ALL, dpy_reg_mmio_read, NULL); + MMIO_DH(_MMIO(0xe6e1c), D_ALL, dpy_reg_mmio_read, NULL); MMIO_RO(PCH_PORT_HOTPLUG, D_ALL, 0, PORTA_HOTPLUG_STATUS_MASK @@ -2074,11 +2074,11 @@ static int init_generic_mmio_info(struct intel_gvt *gvt) MMIO_D(SOUTH_CHICKEN1, D_ALL); MMIO_DH(SOUTH_CHICKEN2, D_ALL, NULL, south_chicken2_mmio_write); - MMIO_D(_TRANSA_CHICKEN1, D_ALL); - MMIO_D(_TRANSB_CHICKEN1, D_ALL); + MMIO_D(_MMIO(_TRANSA_CHICKEN1), D_ALL); + MMIO_D(_MMIO(_TRANSB_CHICKEN1), D_ALL); MMIO_D(SOUTH_DSPCLK_GATE_D, D_ALL); - MMIO_D(_TRANSA_CHICKEN2, D_ALL); - MMIO_D(_TRANSB_CHICKEN2, D_ALL); + MMIO_D(_MMIO(_TRANSA_CHICKEN2), D_ALL); + MMIO_D(_MMIO(_TRANSB_CHICKEN2), D_ALL); MMIO_D(ILK_DPFC_CB_BASE, D_ALL); MMIO_D(ILK_DPFC_CONTROL, D_ALL); @@ -2144,24 
+2144,24 @@ static int init_generic_mmio_info(struct intel_gvt *gvt) MMIO_D(PREC_PAL_DATA(PIPE_C), D_ALL); MMIO_F(PREC_PAL_GC_MAX(PIPE_C, 0), 4 * 3, 0, 0, 0, D_ALL, NULL, NULL); - MMIO_D(0x60110, D_ALL); - MMIO_D(0x61110, D_ALL); - MMIO_F(0x70400, 0x40, 0, 0, 0, D_ALL, NULL, NULL); - MMIO_F(0x71400, 0x40, 0, 0, 0, D_ALL, NULL, NULL); - MMIO_F(0x72400, 0x40, 0, 0, 0, D_ALL, NULL, NULL); - MMIO_F(0x70440, 0xc, 0, 0, 0, D_PRE_SKL, NULL, NULL); - MMIO_F(0x71440, 0xc, 0, 0, 0, D_PRE_SKL, NULL, NULL); - MMIO_F(0x72440, 0xc, 0, 0, 0, D_PRE_SKL, NULL, NULL); - MMIO_F(0x7044c, 0xc, 0, 0, 0, D_PRE_SKL, NULL, NULL); - MMIO_F(0x7144c, 0xc, 0, 0, 0, D_PRE_SKL, NULL, NULL); - MMIO_F(0x7244c, 0xc, 0, 0, 0, D_PRE_SKL, NULL, NULL); + MMIO_D(_MMIO(0x60110), D_ALL); + MMIO_D(_MMIO(0x61110), D_ALL); + MMIO_F(_MMIO(0x70400), 0x40, 0, 0, 0, D_ALL, NULL, NULL); + MMIO_F(_MMIO(0x71400), 0x40, 0, 0, 0, D_ALL, NULL, NULL); + MMIO_F(_MMIO(0x72400), 0x40, 0, 0, 0, D_ALL, NULL, NULL); + MMIO_F(_MMIO(0x70440), 0xc, 0, 0, 0, D_PRE_SKL, NULL, NULL); + MMIO_F(_MMIO(0x71440), 0xc, 0, 0, 0, D_PRE_SKL, NULL, NULL); + MMIO_F(_MMIO(0x72440), 0xc, 0, 0, 0, D_PRE_SKL, NULL, NULL); + MMIO_F(_MMIO(0x7044c), 0xc, 0, 0, 0, D_PRE_SKL, NULL, NULL); + MMIO_F(_MMIO(0x7144c), 0xc, 0, 0, 0, D_PRE_SKL, NULL, NULL); + MMIO_F(_MMIO(0x7244c), 0xc, 0, 0, 0, D_PRE_SKL, NULL, NULL); MMIO_D(PIPE_WM_LINETIME(PIPE_A), D_ALL); MMIO_D(PIPE_WM_LINETIME(PIPE_B), D_ALL); MMIO_D(PIPE_WM_LINETIME(PIPE_C), D_ALL); MMIO_D(SPLL_CTL, D_ALL); - MMIO_D(_WRPLL_CTL1, D_ALL); - MMIO_D(_WRPLL_CTL2, D_ALL); + MMIO_D(_MMIO(_WRPLL_CTL1), D_ALL); + MMIO_D(_MMIO(_WRPLL_CTL2), D_ALL); MMIO_D(PORT_CLK_SEL(PORT_A), D_ALL); MMIO_D(PORT_CLK_SEL(PORT_B), D_ALL); MMIO_D(PORT_CLK_SEL(PORT_C), D_ALL); @@ -2172,15 +2172,15 @@ static int init_generic_mmio_info(struct intel_gvt *gvt) MMIO_D(TRANS_CLK_SEL(TRANSCODER_C), D_ALL); MMIO_D(HSW_NDE_RSTWRN_OPT, D_ALL); - MMIO_D(0x46508, D_ALL); + MMIO_D(_MMIO(0x46508), D_ALL); - MMIO_D(0x49080, D_ALL); - MMIO_D(0x49180, D_ALL); - MMIO_D(0x49280, D_ALL); + MMIO_D(_MMIO(0x49080), D_ALL); + MMIO_D(_MMIO(0x49180), D_ALL); + MMIO_D(_MMIO(0x49280), D_ALL); - MMIO_F(0x49090, 0x14, 0, 0, 0, D_ALL, NULL, NULL); - MMIO_F(0x49190, 0x14, 0, 0, 0, D_ALL, NULL, NULL); - MMIO_F(0x49290, 0x14, 0, 0, 0, D_ALL, NULL, NULL); + MMIO_F(_MMIO(0x49090), 0x14, 0, 0, 0, D_ALL, NULL, NULL); + MMIO_F(_MMIO(0x49190), 0x14, 0, 0, 0, D_ALL, NULL, NULL); + MMIO_F(_MMIO(0x49290), 0x14, 0, 0, 0, D_ALL, NULL, NULL); MMIO_D(GAMMA_MODE(PIPE_A), D_ALL); MMIO_D(GAMMA_MODE(PIPE_B), D_ALL); @@ -2200,7 +2200,7 @@ static int init_generic_mmio_info(struct intel_gvt *gvt) MMIO_DH(SBI_CTL_STAT, D_ALL, NULL, sbi_ctl_mmio_write); MMIO_D(PIXCLK_GATE, D_ALL); - MMIO_F(_DPA_AUX_CH_CTL, 6 * 4, 0, 0, 0, D_ALL, NULL, + MMIO_F(_MMIO(_DPA_AUX_CH_CTL), 6 * 4, 0, 0, 0, D_ALL, NULL, dp_aux_ch_ctl_mmio_write); MMIO_DH(DDI_BUF_CTL(PORT_A), D_ALL, NULL, ddi_buf_ctl_mmio_write); @@ -2221,24 +2221,24 @@ static int init_generic_mmio_info(struct intel_gvt *gvt) MMIO_DH(DP_TP_STATUS(PORT_D), D_ALL, NULL, dp_tp_status_mmio_write); MMIO_DH(DP_TP_STATUS(PORT_E), D_ALL, NULL, NULL); - MMIO_F(_DDI_BUF_TRANS_A, 0x50, 0, 0, 0, D_ALL, NULL, NULL); - MMIO_F(0x64e60, 0x50, 0, 0, 0, D_ALL, NULL, NULL); - MMIO_F(0x64eC0, 0x50, 0, 0, 0, D_ALL, NULL, NULL); - MMIO_F(0x64f20, 0x50, 0, 0, 0, D_ALL, NULL, NULL); - MMIO_F(0x64f80, 0x50, 0, 0, 0, D_ALL, NULL, NULL); + MMIO_F(_MMIO(_DDI_BUF_TRANS_A), 0x50, 0, 0, 0, D_ALL, NULL, NULL); + MMIO_F(_MMIO(0x64e60), 0x50, 0, 0, 0, D_ALL, NULL, NULL); + MMIO_F(_MMIO(0x64eC0), 0x50, 
0, 0, 0, D_ALL, NULL, NULL); + MMIO_F(_MMIO(0x64f20), 0x50, 0, 0, 0, D_ALL, NULL, NULL); + MMIO_F(_MMIO(0x64f80), 0x50, 0, 0, 0, D_ALL, NULL, NULL); MMIO_D(HSW_AUD_CFG(PIPE_A), D_ALL); MMIO_D(HSW_AUD_PIN_ELD_CP_VLD, D_ALL); - MMIO_DH(_TRANS_DDI_FUNC_CTL_A, D_ALL, NULL, NULL); - MMIO_DH(_TRANS_DDI_FUNC_CTL_B, D_ALL, NULL, NULL); - MMIO_DH(_TRANS_DDI_FUNC_CTL_C, D_ALL, NULL, NULL); - MMIO_DH(_TRANS_DDI_FUNC_CTL_EDP, D_ALL, NULL, NULL); + MMIO_DH(_MMIO(_TRANS_DDI_FUNC_CTL_A), D_ALL, NULL, NULL); + MMIO_DH(_MMIO(_TRANS_DDI_FUNC_CTL_B), D_ALL, NULL, NULL); + MMIO_DH(_MMIO(_TRANS_DDI_FUNC_CTL_C), D_ALL, NULL, NULL); + MMIO_DH(_MMIO(_TRANS_DDI_FUNC_CTL_EDP), D_ALL, NULL, NULL); - MMIO_D(_TRANSA_MSA_MISC, D_ALL); - MMIO_D(_TRANSB_MSA_MISC, D_ALL); - MMIO_D(_TRANSC_MSA_MISC, D_ALL); - MMIO_D(_TRANS_EDP_MSA_MISC, D_ALL); + MMIO_D(_MMIO(_TRANSA_MSA_MISC), D_ALL); + MMIO_D(_MMIO(_TRANSB_MSA_MISC), D_ALL); + MMIO_D(_MMIO(_TRANSC_MSA_MISC), D_ALL); + MMIO_D(_MMIO(_TRANS_EDP_MSA_MISC), D_ALL); MMIO_DH(FORCEWAKE, D_ALL, NULL, NULL); MMIO_D(FORCEWAKE_ACK, D_ALL); @@ -2304,101 +2304,101 @@ static int init_generic_mmio_info(struct intel_gvt *gvt) MMIO_D(GEN6_UCGCTL1, D_ALL); MMIO_D(GEN6_UCGCTL2, D_ALL); - MMIO_F(0x4f000, 0x90, 0, 0, 0, D_ALL, NULL, NULL); + MMIO_F(_MMIO(0x4f000), 0x90, 0, 0, 0, D_ALL, NULL, NULL); MMIO_D(GEN6_PCODE_DATA, D_ALL); - MMIO_D(0x13812c, D_ALL); + MMIO_D(_MMIO(0x13812c), D_ALL); MMIO_DH(GEN7_ERR_INT, D_ALL, NULL, NULL); MMIO_D(HSW_EDRAM_CAP, D_ALL); MMIO_D(HSW_IDICR, D_ALL); MMIO_DH(GFX_FLSH_CNTL_GEN6, D_ALL, NULL, NULL); - MMIO_D(0x3c, D_ALL); - MMIO_D(0x860, D_ALL); + MMIO_D(_MMIO(0x3c), D_ALL); + MMIO_D(_MMIO(0x860), D_ALL); MMIO_D(ECOSKPD, D_ALL); - MMIO_D(0x121d0, D_ALL); + MMIO_D(_MMIO(0x121d0), D_ALL); MMIO_D(GEN6_BLITTER_ECOSKPD, D_ALL); - MMIO_D(0x41d0, D_ALL); + MMIO_D(_MMIO(0x41d0), D_ALL); MMIO_D(GAC_ECO_BITS, D_ALL); - MMIO_D(0x6200, D_ALL); - MMIO_D(0x6204, D_ALL); - MMIO_D(0x6208, D_ALL); - MMIO_D(0x7118, D_ALL); - MMIO_D(0x7180, D_ALL); - MMIO_D(0x7408, D_ALL); - MMIO_D(0x7c00, D_ALL); + MMIO_D(_MMIO(0x6200), D_ALL); + MMIO_D(_MMIO(0x6204), D_ALL); + MMIO_D(_MMIO(0x6208), D_ALL); + MMIO_D(_MMIO(0x7118), D_ALL); + MMIO_D(_MMIO(0x7180), D_ALL); + MMIO_D(_MMIO(0x7408), D_ALL); + MMIO_D(_MMIO(0x7c00), D_ALL); MMIO_DH(GEN6_MBCTL, D_ALL, NULL, mbctl_write); - MMIO_D(0x911c, D_ALL); - MMIO_D(0x9120, D_ALL); + MMIO_D(_MMIO(0x911c), D_ALL); + MMIO_D(_MMIO(0x9120), D_ALL); MMIO_DFH(GEN7_UCGCTL4, D_ALL, F_CMD_ACCESS, NULL, NULL); MMIO_D(GAB_CTL, D_ALL); - MMIO_D(0x48800, D_ALL); - MMIO_D(0xce044, D_ALL); - MMIO_D(0xe6500, D_ALL); - MMIO_D(0xe6504, D_ALL); - MMIO_D(0xe6600, D_ALL); - MMIO_D(0xe6604, D_ALL); - MMIO_D(0xe6700, D_ALL); - MMIO_D(0xe6704, D_ALL); - MMIO_D(0xe6800, D_ALL); - MMIO_D(0xe6804, D_ALL); + MMIO_D(_MMIO(0x48800), D_ALL); + MMIO_D(_MMIO(0xce044), D_ALL); + MMIO_D(_MMIO(0xe6500), D_ALL); + MMIO_D(_MMIO(0xe6504), D_ALL); + MMIO_D(_MMIO(0xe6600), D_ALL); + MMIO_D(_MMIO(0xe6604), D_ALL); + MMIO_D(_MMIO(0xe6700), D_ALL); + MMIO_D(_MMIO(0xe6704), D_ALL); + MMIO_D(_MMIO(0xe6800), D_ALL); + MMIO_D(_MMIO(0xe6804), D_ALL); MMIO_D(PCH_GMBUS4, D_ALL); MMIO_D(PCH_GMBUS5, D_ALL); - MMIO_D(0x902c, D_ALL); - MMIO_D(0xec008, D_ALL); - MMIO_D(0xec00c, D_ALL); - MMIO_D(0xec008 + 0x18, D_ALL); - MMIO_D(0xec00c + 0x18, D_ALL); - MMIO_D(0xec008 + 0x18 * 2, D_ALL); - MMIO_D(0xec00c + 0x18 * 2, D_ALL); - MMIO_D(0xec008 + 0x18 * 3, D_ALL); - MMIO_D(0xec00c + 0x18 * 3, D_ALL); - MMIO_D(0xec408, D_ALL); - MMIO_D(0xec40c, D_ALL); - MMIO_D(0xec408 + 0x18, D_ALL); - MMIO_D(0xec40c + 
0x18, D_ALL); - MMIO_D(0xec408 + 0x18 * 2, D_ALL); - MMIO_D(0xec40c + 0x18 * 2, D_ALL); - MMIO_D(0xec408 + 0x18 * 3, D_ALL); - MMIO_D(0xec40c + 0x18 * 3, D_ALL); - MMIO_D(0xfc810, D_ALL); - MMIO_D(0xfc81c, D_ALL); - MMIO_D(0xfc828, D_ALL); - MMIO_D(0xfc834, D_ALL); - MMIO_D(0xfcc00, D_ALL); - MMIO_D(0xfcc0c, D_ALL); - MMIO_D(0xfcc18, D_ALL); - MMIO_D(0xfcc24, D_ALL); - MMIO_D(0xfd000, D_ALL); - MMIO_D(0xfd00c, D_ALL); - MMIO_D(0xfd018, D_ALL); - MMIO_D(0xfd024, D_ALL); - MMIO_D(0xfd034, D_ALL); + MMIO_D(_MMIO(0x902c), D_ALL); + MMIO_D(_MMIO(0xec008), D_ALL); + MMIO_D(_MMIO(0xec00c), D_ALL); + MMIO_D(_MMIO(0xec008 + 0x18), D_ALL); + MMIO_D(_MMIO(0xec00c + 0x18), D_ALL); + MMIO_D(_MMIO(0xec008 + 0x18 * 2), D_ALL); + MMIO_D(_MMIO(0xec00c + 0x18 * 2), D_ALL); + MMIO_D(_MMIO(0xec008 + 0x18 * 3), D_ALL); + MMIO_D(_MMIO(0xec00c + 0x18 * 3), D_ALL); + MMIO_D(_MMIO(0xec408), D_ALL); + MMIO_D(_MMIO(0xec40c), D_ALL); + MMIO_D(_MMIO(0xec408 + 0x18), D_ALL); + MMIO_D(_MMIO(0xec40c + 0x18), D_ALL); + MMIO_D(_MMIO(0xec408 + 0x18 * 2), D_ALL); + MMIO_D(_MMIO(0xec40c + 0x18 * 2), D_ALL); + MMIO_D(_MMIO(0xec408 + 0x18 * 3), D_ALL); + MMIO_D(_MMIO(0xec40c + 0x18 * 3), D_ALL); + MMIO_D(_MMIO(0xfc810), D_ALL); + MMIO_D(_MMIO(0xfc81c), D_ALL); + MMIO_D(_MMIO(0xfc828), D_ALL); + MMIO_D(_MMIO(0xfc834), D_ALL); + MMIO_D(_MMIO(0xfcc00), D_ALL); + MMIO_D(_MMIO(0xfcc0c), D_ALL); + MMIO_D(_MMIO(0xfcc18), D_ALL); + MMIO_D(_MMIO(0xfcc24), D_ALL); + MMIO_D(_MMIO(0xfd000), D_ALL); + MMIO_D(_MMIO(0xfd00c), D_ALL); + MMIO_D(_MMIO(0xfd018), D_ALL); + MMIO_D(_MMIO(0xfd024), D_ALL); + MMIO_D(_MMIO(0xfd034), D_ALL); MMIO_DH(FPGA_DBG, D_ALL, NULL, fpga_dbg_mmio_write); - MMIO_D(0x2054, D_ALL); - MMIO_D(0x12054, D_ALL); - MMIO_D(0x22054, D_ALL); - MMIO_D(0x1a054, D_ALL); - - MMIO_D(0x44070, D_ALL); - MMIO_DFH(0x215c, D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x2178, D_ALL, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x217c, D_ALL, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x12178, D_ALL, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x1217c, D_ALL, F_CMD_ACCESS, NULL, NULL); - - MMIO_F(0x2290, 8, F_CMD_ACCESS, 0, 0, D_BDW_PLUS, NULL, NULL); - MMIO_D(0x2b00, D_BDW_PLUS); - MMIO_D(0x2360, D_BDW_PLUS); - MMIO_F(0x5200, 32, F_CMD_ACCESS, 0, 0, D_ALL, NULL, NULL); - MMIO_F(0x5240, 32, F_CMD_ACCESS, 0, 0, D_ALL, NULL, NULL); - MMIO_F(0x5280, 16, F_CMD_ACCESS, 0, 0, D_ALL, NULL, NULL); - - MMIO_DFH(0x1c17c, D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x1c178, D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); + MMIO_D(_MMIO(0x2054), D_ALL); + MMIO_D(_MMIO(0x12054), D_ALL); + MMIO_D(_MMIO(0x22054), D_ALL); + MMIO_D(_MMIO(0x1a054), D_ALL); + + MMIO_D(_MMIO(0x44070), D_ALL); + MMIO_DFH(_MMIO(0x215c), D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x2178), D_ALL, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x217c), D_ALL, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x12178), D_ALL, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x1217c), D_ALL, F_CMD_ACCESS, NULL, NULL); + + MMIO_F(_MMIO(0x2290), 8, F_CMD_ACCESS, 0, 0, D_BDW_PLUS, NULL, NULL); + MMIO_D(_MMIO(0x2b00), D_BDW_PLUS); + MMIO_D(_MMIO(0x2360), D_BDW_PLUS); + MMIO_F(_MMIO(0x5200), 32, F_CMD_ACCESS, 0, 0, D_ALL, NULL, NULL); + MMIO_F(_MMIO(0x5240), 32, F_CMD_ACCESS, 0, 0, D_ALL, NULL, NULL); + MMIO_F(_MMIO(0x5280), 16, F_CMD_ACCESS, 0, 0, D_ALL, NULL, NULL); + + MMIO_DFH(_MMIO(0x1c17c), D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x1c178), D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); MMIO_DFH(BCS_SWCTRL, D_ALL, F_CMD_ACCESS, NULL, NULL); MMIO_F(HS_INVOCATION_COUNT, 8, F_CMD_ACCESS, 0, 
0, D_ALL, NULL, NULL); @@ -2412,24 +2412,24 @@ static int init_generic_mmio_info(struct intel_gvt *gvt) MMIO_F(CL_PRIMITIVES_COUNT, 8, F_CMD_ACCESS, 0, 0, D_ALL, NULL, NULL); MMIO_F(PS_INVOCATION_COUNT, 8, F_CMD_ACCESS, 0, 0, D_ALL, NULL, NULL); MMIO_F(PS_DEPTH_COUNT, 8, F_CMD_ACCESS, 0, 0, D_ALL, NULL, NULL); - MMIO_DH(0x4260, D_BDW_PLUS, NULL, gvt_reg_tlb_control_handler); - MMIO_DH(0x4264, D_BDW_PLUS, NULL, gvt_reg_tlb_control_handler); - MMIO_DH(0x4268, D_BDW_PLUS, NULL, gvt_reg_tlb_control_handler); - MMIO_DH(0x426c, D_BDW_PLUS, NULL, gvt_reg_tlb_control_handler); - MMIO_DH(0x4270, D_BDW_PLUS, NULL, gvt_reg_tlb_control_handler); - MMIO_DFH(0x4094, D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); + MMIO_DH(_MMIO(0x4260), D_BDW_PLUS, NULL, gvt_reg_tlb_control_handler); + MMIO_DH(_MMIO(0x4264), D_BDW_PLUS, NULL, gvt_reg_tlb_control_handler); + MMIO_DH(_MMIO(0x4268), D_BDW_PLUS, NULL, gvt_reg_tlb_control_handler); + MMIO_DH(_MMIO(0x426c), D_BDW_PLUS, NULL, gvt_reg_tlb_control_handler); + MMIO_DH(_MMIO(0x4270), D_BDW_PLUS, NULL, gvt_reg_tlb_control_handler); + MMIO_DFH(_MMIO(0x4094), D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); MMIO_DFH(ARB_MODE, D_ALL, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); MMIO_RING_GM_RDR(RING_BBADDR, D_ALL, NULL, NULL); - MMIO_DFH(0x2220, D_ALL, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x12220, D_ALL, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x22220, D_ALL, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x2220), D_ALL, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x12220), D_ALL, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x22220), D_ALL, F_CMD_ACCESS, NULL, NULL); MMIO_RING_DFH(RING_SYNC_1, D_ALL, F_CMD_ACCESS, NULL, NULL); MMIO_RING_DFH(RING_SYNC_0, D_ALL, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x22178, D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x1a178, D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x1a17c, D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x2217c, D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x22178), D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x1a178), D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x1a17c), D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x2217c), D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); return 0; } @@ -2503,40 +2503,40 @@ static int init_broadwell_mmio_info(struct intel_gvt *gvt) MMIO_RING_DFH(RING_ACTHD_UDW, D_BDW_PLUS, F_CMD_ACCESS, mmio_read_from_hw, NULL); -#define RING_REG(base) (base + 0xd0) +#define RING_REG(base) _MMIO((base) + 0xd0) MMIO_RING_F(RING_REG, 4, F_RO, 0, ~_MASKED_BIT_ENABLE(RESET_CTL_REQUEST_RESET), D_BDW_PLUS, NULL, ring_reset_ctl_write); #undef RING_REG -#define RING_REG(base) (base + 0x230) +#define RING_REG(base) _MMIO((base) + 0x230) MMIO_RING_DFH(RING_REG, D_BDW_PLUS, 0, NULL, elsp_mmio_write); #undef RING_REG -#define RING_REG(base) (base + 0x234) +#define RING_REG(base) _MMIO((base) + 0x234) MMIO_RING_F(RING_REG, 8, F_RO | F_CMD_ACCESS, 0, ~0, D_BDW_PLUS, NULL, NULL); #undef RING_REG -#define RING_REG(base) (base + 0x244) +#define RING_REG(base) _MMIO((base) + 0x244) MMIO_RING_DFH(RING_REG, D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); #undef RING_REG -#define RING_REG(base) (base + 0x370) +#define RING_REG(base) _MMIO((base) + 0x370) MMIO_RING_F(RING_REG, 48, F_RO, 0, ~0, D_BDW_PLUS, NULL, NULL); #undef RING_REG -#define RING_REG(base) (base + 0x3a0) +#define RING_REG(base) _MMIO((base) + 0x3a0) MMIO_RING_DFH(RING_REG, D_BDW_PLUS, F_MODE_MASK, NULL, NULL); #undef RING_REG MMIO_D(PIPEMISC(PIPE_A), D_BDW_PLUS); MMIO_D(PIPEMISC(PIPE_B), D_BDW_PLUS); MMIO_D(PIPEMISC(PIPE_C), 
D_BDW_PLUS); - MMIO_D(0x1c1d0, D_BDW_PLUS); + MMIO_D(_MMIO(0x1c1d0), D_BDW_PLUS); MMIO_D(GEN6_MBCUNIT_SNPCR, D_BDW_PLUS); MMIO_D(GEN7_MISCCPCTL, D_BDW_PLUS); - MMIO_D(0x1c054, D_BDW_PLUS); + MMIO_D(_MMIO(0x1c054), D_BDW_PLUS); MMIO_DH(GEN6_PCODE_MAILBOX, D_BDW_PLUS, NULL, mailbox_write); @@ -2545,7 +2545,7 @@ static int init_broadwell_mmio_info(struct intel_gvt *gvt) MMIO_D(GAMTARBMODE, D_BDW_PLUS); -#define RING_REG(base) (base + 0x270) +#define RING_REG(base) _MMIO((base) + 0x270) MMIO_RING_F(RING_REG, 32, 0, 0, 0, D_BDW_PLUS, NULL, NULL); #undef RING_REG @@ -2558,10 +2558,10 @@ static int init_broadwell_mmio_info(struct intel_gvt *gvt) MMIO_D(CHICKEN_PIPESL_1(PIPE_C), D_BDW_PLUS); MMIO_D(WM_MISC, D_BDW); - MMIO_D(BDW_EDP_PSR_BASE, D_BDW); + MMIO_D(_MMIO(BDW_EDP_PSR_BASE), D_BDW); - MMIO_D(0x66c00, D_BDW_PLUS); - MMIO_D(0x66c04, D_BDW_PLUS); + MMIO_D(_MMIO(0x66c00), D_BDW_PLUS); + MMIO_D(_MMIO(0x66c04), D_BDW_PLUS); MMIO_D(HSW_GTT_CACHE_EN, D_BDW_PLUS); @@ -2569,54 +2569,54 @@ static int init_broadwell_mmio_info(struct intel_gvt *gvt) MMIO_D(GEN8_EU_DISABLE1, D_BDW_PLUS); MMIO_D(GEN8_EU_DISABLE2, D_BDW_PLUS); - MMIO_D(0xfdc, D_BDW_PLUS); + MMIO_D(_MMIO(0xfdc), D_BDW_PLUS); MMIO_DFH(GEN8_ROW_CHICKEN, D_BDW_PLUS, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); MMIO_DFH(GEN7_ROW_CHICKEN2, D_BDW_PLUS, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); MMIO_DFH(GEN8_UCGCTL6, D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0xb1f0, D_BDW, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0xb1c0, D_BDW, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0xb1f0), D_BDW, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0xb1c0), D_BDW, F_CMD_ACCESS, NULL, NULL); MMIO_DFH(GEN8_L3SQCREG4, D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0xb100, D_BDW, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0xb10c, D_BDW, F_CMD_ACCESS, NULL, NULL); - MMIO_D(0xb110, D_BDW); + MMIO_DFH(_MMIO(0xb100), D_BDW, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0xb10c), D_BDW, F_CMD_ACCESS, NULL, NULL); + MMIO_D(_MMIO(0xb110), D_BDW); - MMIO_F(0x24d0, 48, F_CMD_ACCESS, 0, 0, D_BDW_PLUS, + MMIO_F(_MMIO(0x24d0), 48, F_CMD_ACCESS, 0, 0, D_BDW_PLUS, NULL, force_nonpriv_write); - MMIO_D(0x44484, D_BDW_PLUS); - MMIO_D(0x4448c, D_BDW_PLUS); + MMIO_D(_MMIO(0x44484), D_BDW_PLUS); + MMIO_D(_MMIO(0x4448c), D_BDW_PLUS); - MMIO_DFH(0x83a4, D_BDW, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x83a4), D_BDW, F_CMD_ACCESS, NULL, NULL); MMIO_D(GEN8_L3_LRA_1_GPGPU, D_BDW_PLUS); - MMIO_DFH(0x8430, D_BDW, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x8430), D_BDW, F_CMD_ACCESS, NULL, NULL); - MMIO_D(0x110000, D_BDW_PLUS); + MMIO_D(_MMIO(0x110000), D_BDW_PLUS); - MMIO_D(0x48400, D_BDW_PLUS); + MMIO_D(_MMIO(0x48400), D_BDW_PLUS); - MMIO_D(0x6e570, D_BDW_PLUS); - MMIO_D(0x65f10, D_BDW_PLUS); + MMIO_D(_MMIO(0x6e570), D_BDW_PLUS); + MMIO_D(_MMIO(0x65f10), D_BDW_PLUS); - MMIO_DFH(0xe194, D_BDW_PLUS, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0xe188, D_BDW_PLUS, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0xe194), D_BDW_PLUS, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0xe188), D_BDW_PLUS, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); MMIO_DFH(HALF_SLICE_CHICKEN2, D_BDW_PLUS, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x2580, D_BDW_PLUS, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); - - MMIO_DFH(0x2248, D_BDW, F_CMD_ACCESS, NULL, NULL); - - MMIO_DFH(0xe220, D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0xe230, D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0xe240, D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0xe260, D_BDW_PLUS, F_CMD_ACCESS, NULL, 
NULL); - MMIO_DFH(0xe270, D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0xe280, D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0xe2a0, D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0xe2b0, D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0xe2c0, D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x2580), D_BDW_PLUS, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); + + MMIO_DFH(_MMIO(0x2248), D_BDW, F_CMD_ACCESS, NULL, NULL); + + MMIO_DFH(_MMIO(0xe220), D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0xe230), D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0xe240), D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0xe260), D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0xe270), D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0xe280), D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0xe2a0), D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0xe2b0), D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0xe2c0), D_BDW_PLUS, F_CMD_ACCESS, NULL, NULL); return 0; } @@ -2632,11 +2632,11 @@ static int init_skl_mmio_info(struct intel_gvt *gvt) MMIO_DH(FORCEWAKE_MEDIA_GEN9, D_SKL_PLUS, NULL, mul_force_wake_write); MMIO_DH(FORCEWAKE_ACK_MEDIA_GEN9, D_SKL_PLUS, NULL, NULL); - MMIO_F(_DPB_AUX_CH_CTL, 6 * 4, 0, 0, 0, D_SKL_PLUS, NULL, + MMIO_F(_MMIO(_DPB_AUX_CH_CTL), 6 * 4, 0, 0, 0, D_SKL_PLUS, NULL, dp_aux_ch_ctl_mmio_write); - MMIO_F(_DPC_AUX_CH_CTL, 6 * 4, 0, 0, 0, D_SKL_PLUS, NULL, + MMIO_F(_MMIO(_DPC_AUX_CH_CTL), 6 * 4, 0, 0, 0, D_SKL_PLUS, NULL, dp_aux_ch_ctl_mmio_write); - MMIO_F(_DPD_AUX_CH_CTL, 6 * 4, 0, 0, 0, D_SKL_PLUS, NULL, + MMIO_F(_MMIO(_DPD_AUX_CH_CTL), 6 * 4, 0, 0, 0, D_SKL_PLUS, NULL, dp_aux_ch_ctl_mmio_write); /* @@ -2647,26 +2647,26 @@ static int init_skl_mmio_info(struct intel_gvt *gvt) MMIO_DH(HSW_PWR_WELL_CTL_DRIVER(SKL_DISP_PW_MISC_IO), D_SKL_PLUS, NULL, skl_power_well_ctl_write); - MMIO_D(0xa210, D_SKL_PLUS); + MMIO_D(_MMIO(0xa210), D_SKL_PLUS); MMIO_D(GEN9_MEDIA_PG_IDLE_HYSTERESIS, D_SKL_PLUS); MMIO_D(GEN9_RENDER_PG_IDLE_HYSTERESIS, D_SKL_PLUS); MMIO_DFH(GEN9_GAMT_ECO_REG_RW_IA, D_SKL_PLUS, F_CMD_ACCESS, NULL, NULL); - MMIO_DH(0x4ddc, D_SKL_PLUS, NULL, NULL); - MMIO_DH(0x42080, D_SKL_PLUS, NULL, NULL); - MMIO_D(0x45504, D_SKL_PLUS); - MMIO_D(0x45520, D_SKL_PLUS); - MMIO_D(0x46000, D_SKL_PLUS); - MMIO_DH(0x46010, D_SKL | D_KBL, NULL, skl_lcpll_write); - MMIO_DH(0x46014, D_SKL | D_KBL, NULL, skl_lcpll_write); - MMIO_D(0x6C040, D_SKL | D_KBL); - MMIO_D(0x6C048, D_SKL | D_KBL); - MMIO_D(0x6C050, D_SKL | D_KBL); - MMIO_D(0x6C044, D_SKL | D_KBL); - MMIO_D(0x6C04C, D_SKL | D_KBL); - MMIO_D(0x6C054, D_SKL | D_KBL); - MMIO_D(0x6c058, D_SKL | D_KBL); - MMIO_D(0x6c05c, D_SKL | D_KBL); - MMIO_DH(0X6c060, D_SKL | D_KBL, dpll_status_read, NULL); + MMIO_DH(_MMIO(0x4ddc), D_SKL_PLUS, NULL, NULL); + MMIO_DH(_MMIO(0x42080), D_SKL_PLUS, NULL, NULL); + MMIO_D(_MMIO(0x45504), D_SKL_PLUS); + MMIO_D(_MMIO(0x45520), D_SKL_PLUS); + MMIO_D(_MMIO(0x46000), D_SKL_PLUS); + MMIO_DH(_MMIO(0x46010), D_SKL | D_KBL, NULL, skl_lcpll_write); + MMIO_DH(_MMIO(0x46014), D_SKL | D_KBL, NULL, skl_lcpll_write); + MMIO_D(_MMIO(0x6C040), D_SKL | D_KBL); + MMIO_D(_MMIO(0x6C048), D_SKL | D_KBL); + MMIO_D(_MMIO(0x6C050), D_SKL | D_KBL); + MMIO_D(_MMIO(0x6C044), D_SKL | D_KBL); + MMIO_D(_MMIO(0x6C04C), D_SKL | D_KBL); + MMIO_D(_MMIO(0x6C054), D_SKL | D_KBL); + MMIO_D(_MMIO(0x6c058), D_SKL | D_KBL); + MMIO_D(_MMIO(0x6c05c), D_SKL | D_KBL); + MMIO_DH(_MMIO(0x6c060), D_SKL | D_KBL, dpll_status_read, NULL); MMIO_DH(SKL_PS_WIN_POS(PIPE_A, 0), D_SKL_PLUS, NULL, 
pf_write); MMIO_DH(SKL_PS_WIN_POS(PIPE_A, 1), D_SKL_PLUS, NULL, pf_write); @@ -2755,105 +2755,105 @@ static int init_skl_mmio_info(struct intel_gvt *gvt) MMIO_DH(PLANE_NV12_BUF_CFG(PIPE_C, 2), D_SKL_PLUS, NULL, NULL); MMIO_DH(PLANE_NV12_BUF_CFG(PIPE_C, 3), D_SKL_PLUS, NULL, NULL); - MMIO_DH(_REG_701C0(PIPE_A, 1), D_SKL_PLUS, NULL, NULL); - MMIO_DH(_REG_701C0(PIPE_A, 2), D_SKL_PLUS, NULL, NULL); - MMIO_DH(_REG_701C0(PIPE_A, 3), D_SKL_PLUS, NULL, NULL); - MMIO_DH(_REG_701C0(PIPE_A, 4), D_SKL_PLUS, NULL, NULL); + MMIO_DH(_MMIO(_REG_701C0(PIPE_A, 1)), D_SKL_PLUS, NULL, NULL); + MMIO_DH(_MMIO(_REG_701C0(PIPE_A, 2)), D_SKL_PLUS, NULL, NULL); + MMIO_DH(_MMIO(_REG_701C0(PIPE_A, 3)), D_SKL_PLUS, NULL, NULL); + MMIO_DH(_MMIO(_REG_701C0(PIPE_A, 4)), D_SKL_PLUS, NULL, NULL); - MMIO_DH(_REG_701C0(PIPE_B, 1), D_SKL_PLUS, NULL, NULL); - MMIO_DH(_REG_701C0(PIPE_B, 2), D_SKL_PLUS, NULL, NULL); - MMIO_DH(_REG_701C0(PIPE_B, 3), D_SKL_PLUS, NULL, NULL); - MMIO_DH(_REG_701C0(PIPE_B, 4), D_SKL_PLUS, NULL, NULL); + MMIO_DH(_MMIO(_REG_701C0(PIPE_B, 1)), D_SKL_PLUS, NULL, NULL); + MMIO_DH(_MMIO(_REG_701C0(PIPE_B, 2)), D_SKL_PLUS, NULL, NULL); + MMIO_DH(_MMIO(_REG_701C0(PIPE_B, 3)), D_SKL_PLUS, NULL, NULL); + MMIO_DH(_MMIO(_REG_701C0(PIPE_B, 4)), D_SKL_PLUS, NULL, NULL); - MMIO_DH(_REG_701C0(PIPE_C, 1), D_SKL_PLUS, NULL, NULL); - MMIO_DH(_REG_701C0(PIPE_C, 2), D_SKL_PLUS, NULL, NULL); - MMIO_DH(_REG_701C0(PIPE_C, 3), D_SKL_PLUS, NULL, NULL); - MMIO_DH(_REG_701C0(PIPE_C, 4), D_SKL_PLUS, NULL, NULL); + MMIO_DH(_MMIO(_REG_701C0(PIPE_C, 1)), D_SKL_PLUS, NULL, NULL); + MMIO_DH(_MMIO(_REG_701C0(PIPE_C, 2)), D_SKL_PLUS, NULL, NULL); + MMIO_DH(_MMIO(_REG_701C0(PIPE_C, 3)), D_SKL_PLUS, NULL, NULL); + MMIO_DH(_MMIO(_REG_701C0(PIPE_C, 4)), D_SKL_PLUS, NULL, NULL); - MMIO_DH(_REG_701C4(PIPE_A, 1), D_SKL_PLUS, NULL, NULL); - MMIO_DH(_REG_701C4(PIPE_A, 2), D_SKL_PLUS, NULL, NULL); - MMIO_DH(_REG_701C4(PIPE_A, 3), D_SKL_PLUS, NULL, NULL); - MMIO_DH(_REG_701C4(PIPE_A, 4), D_SKL_PLUS, NULL, NULL); + MMIO_DH(_MMIO(_REG_701C4(PIPE_A, 1)), D_SKL_PLUS, NULL, NULL); + MMIO_DH(_MMIO(_REG_701C4(PIPE_A, 2)), D_SKL_PLUS, NULL, NULL); + MMIO_DH(_MMIO(_REG_701C4(PIPE_A, 3)), D_SKL_PLUS, NULL, NULL); + MMIO_DH(_MMIO(_REG_701C4(PIPE_A, 4)), D_SKL_PLUS, NULL, NULL); - MMIO_DH(_REG_701C4(PIPE_B, 1), D_SKL_PLUS, NULL, NULL); - MMIO_DH(_REG_701C4(PIPE_B, 2), D_SKL_PLUS, NULL, NULL); - MMIO_DH(_REG_701C4(PIPE_B, 3), D_SKL_PLUS, NULL, NULL); - MMIO_DH(_REG_701C4(PIPE_B, 4), D_SKL_PLUS, NULL, NULL); + MMIO_DH(_MMIO(_REG_701C4(PIPE_B, 1)), D_SKL_PLUS, NULL, NULL); + MMIO_DH(_MMIO(_REG_701C4(PIPE_B, 2)), D_SKL_PLUS, NULL, NULL); + MMIO_DH(_MMIO(_REG_701C4(PIPE_B, 3)), D_SKL_PLUS, NULL, NULL); + MMIO_DH(_MMIO(_REG_701C4(PIPE_B, 4)), D_SKL_PLUS, NULL, NULL); - MMIO_DH(_REG_701C4(PIPE_C, 1), D_SKL_PLUS, NULL, NULL); - MMIO_DH(_REG_701C4(PIPE_C, 2), D_SKL_PLUS, NULL, NULL); - MMIO_DH(_REG_701C4(PIPE_C, 3), D_SKL_PLUS, NULL, NULL); - MMIO_DH(_REG_701C4(PIPE_C, 4), D_SKL_PLUS, NULL, NULL); + MMIO_DH(_MMIO(_REG_701C4(PIPE_C, 1)), D_SKL_PLUS, NULL, NULL); + MMIO_DH(_MMIO(_REG_701C4(PIPE_C, 2)), D_SKL_PLUS, NULL, NULL); + MMIO_DH(_MMIO(_REG_701C4(PIPE_C, 3)), D_SKL_PLUS, NULL, NULL); + MMIO_DH(_MMIO(_REG_701C4(PIPE_C, 4)), D_SKL_PLUS, NULL, NULL); - MMIO_D(0x70380, D_SKL_PLUS); - MMIO_D(0x71380, D_SKL_PLUS); - MMIO_D(0x72380, D_SKL_PLUS); - MMIO_D(0x7039c, D_SKL_PLUS); + MMIO_D(_MMIO(0x70380), D_SKL_PLUS); + MMIO_D(_MMIO(0x71380), D_SKL_PLUS); + MMIO_D(_MMIO(0x72380), D_SKL_PLUS); + MMIO_D(_MMIO(0x7039c), D_SKL_PLUS); - MMIO_D(0x8f074, D_SKL | D_KBL); - 
MMIO_D(0x8f004, D_SKL | D_KBL); - MMIO_D(0x8f034, D_SKL | D_KBL); + MMIO_D(_MMIO(0x8f074), D_SKL | D_KBL); + MMIO_D(_MMIO(0x8f004), D_SKL | D_KBL); + MMIO_D(_MMIO(0x8f034), D_SKL | D_KBL); - MMIO_D(0xb11c, D_SKL | D_KBL); + MMIO_D(_MMIO(0xb11c), D_SKL | D_KBL); - MMIO_D(0x51000, D_SKL | D_KBL); - MMIO_D(0x6c00c, D_SKL_PLUS); + MMIO_D(_MMIO(0x51000), D_SKL | D_KBL); + MMIO_D(_MMIO(0x6c00c), D_SKL_PLUS); - MMIO_F(0xc800, 0x7f8, F_CMD_ACCESS, 0, 0, D_SKL | D_KBL, NULL, NULL); - MMIO_F(0xb020, 0x80, F_CMD_ACCESS, 0, 0, D_SKL | D_KBL, NULL, NULL); + MMIO_F(_MMIO(0xc800), 0x7f8, F_CMD_ACCESS, 0, 0, D_SKL | D_KBL, NULL, NULL); + MMIO_F(_MMIO(0xb020), 0x80, F_CMD_ACCESS, 0, 0, D_SKL | D_KBL, NULL, NULL); - MMIO_D(0xd08, D_SKL_PLUS); - MMIO_DFH(0x20e0, D_SKL_PLUS, F_MODE_MASK, NULL, NULL); - MMIO_DFH(0x20ec, D_SKL_PLUS, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); + MMIO_D(_MMIO(0xd08), D_SKL_PLUS); + MMIO_DFH(_MMIO(0x20e0), D_SKL_PLUS, F_MODE_MASK, NULL, NULL); + MMIO_DFH(_MMIO(0x20ec), D_SKL_PLUS, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); /* TRTT */ - MMIO_DFH(0x4de0, D_SKL | D_KBL, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x4de4, D_SKL | D_KBL, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x4de8, D_SKL | D_KBL, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x4dec, D_SKL | D_KBL, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x4df0, D_SKL | D_KBL, F_CMD_ACCESS, NULL, NULL); - MMIO_DFH(0x4df4, D_SKL | D_KBL, F_CMD_ACCESS, NULL, gen9_trtte_write); - MMIO_DH(0x4dfc, D_SKL | D_KBL, NULL, gen9_trtt_chicken_write); + MMIO_DFH(_MMIO(0x4de0), D_SKL | D_KBL, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x4de4), D_SKL | D_KBL, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x4de8), D_SKL | D_KBL, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x4dec), D_SKL | D_KBL, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x4df0), D_SKL | D_KBL, F_CMD_ACCESS, NULL, NULL); + MMIO_DFH(_MMIO(0x4df4), D_SKL | D_KBL, F_CMD_ACCESS, NULL, gen9_trtte_write); + MMIO_DH(_MMIO(0x4dfc), D_SKL | D_KBL, NULL, gen9_trtt_chicken_write); - MMIO_D(0x45008, D_SKL | D_KBL); + MMIO_D(_MMIO(0x45008), D_SKL | D_KBL); - MMIO_D(0x46430, D_SKL | D_KBL); + MMIO_D(_MMIO(0x46430), D_SKL | D_KBL); - MMIO_D(0x46520, D_SKL | D_KBL); + MMIO_D(_MMIO(0x46520), D_SKL | D_KBL); - MMIO_D(0xc403c, D_SKL | D_KBL); - MMIO_D(0xb004, D_SKL_PLUS); + MMIO_D(_MMIO(0xc403c), D_SKL | D_KBL); + MMIO_D(_MMIO(0xb004), D_SKL_PLUS); MMIO_DH(DMA_CTRL, D_SKL_PLUS, NULL, dma_ctrl_write); - MMIO_D(0x65900, D_SKL_PLUS); - MMIO_D(0x1082c0, D_SKL | D_KBL); - MMIO_D(0x4068, D_SKL | D_KBL); - MMIO_D(0x67054, D_SKL | D_KBL); - MMIO_D(0x6e560, D_SKL | D_KBL); - MMIO_D(0x6e554, D_SKL | D_KBL); - MMIO_D(0x2b20, D_SKL | D_KBL); - MMIO_D(0x65f00, D_SKL | D_KBL); - MMIO_D(0x65f08, D_SKL | D_KBL); - MMIO_D(0x320f0, D_SKL | D_KBL); - - MMIO_D(0x70034, D_SKL_PLUS); - MMIO_D(0x71034, D_SKL_PLUS); - MMIO_D(0x72034, D_SKL_PLUS); - - MMIO_D(_PLANE_KEYVAL_1(PIPE_A), D_SKL_PLUS); - MMIO_D(_PLANE_KEYVAL_1(PIPE_B), D_SKL_PLUS); - MMIO_D(_PLANE_KEYVAL_1(PIPE_C), D_SKL_PLUS); - MMIO_D(_PLANE_KEYMSK_1(PIPE_A), D_SKL_PLUS); - MMIO_D(_PLANE_KEYMSK_1(PIPE_B), D_SKL_PLUS); - MMIO_D(_PLANE_KEYMSK_1(PIPE_C), D_SKL_PLUS); - - MMIO_D(0x44500, D_SKL_PLUS); + MMIO_D(_MMIO(0x65900), D_SKL_PLUS); + MMIO_D(_MMIO(0x1082c0), D_SKL | D_KBL); + MMIO_D(_MMIO(0x4068), D_SKL | D_KBL); + MMIO_D(_MMIO(0x67054), D_SKL | D_KBL); + MMIO_D(_MMIO(0x6e560), D_SKL | D_KBL); + MMIO_D(_MMIO(0x6e554), D_SKL | D_KBL); + MMIO_D(_MMIO(0x2b20), D_SKL | D_KBL); + MMIO_D(_MMIO(0x65f00), D_SKL | D_KBL); + MMIO_D(_MMIO(0x65f08), D_SKL | D_KBL); + MMIO_D(_MMIO(0x320f0), D_SKL | 
D_KBL);
+
+	MMIO_D(_MMIO(0x70034), D_SKL_PLUS);
+	MMIO_D(_MMIO(0x71034), D_SKL_PLUS);
+	MMIO_D(_MMIO(0x72034), D_SKL_PLUS);
+
+	MMIO_D(_MMIO(_PLANE_KEYVAL_1(PIPE_A)), D_SKL_PLUS);
+	MMIO_D(_MMIO(_PLANE_KEYVAL_1(PIPE_B)), D_SKL_PLUS);
+	MMIO_D(_MMIO(_PLANE_KEYVAL_1(PIPE_C)), D_SKL_PLUS);
+	MMIO_D(_MMIO(_PLANE_KEYMSK_1(PIPE_A)), D_SKL_PLUS);
+	MMIO_D(_MMIO(_PLANE_KEYMSK_1(PIPE_B)), D_SKL_PLUS);
+	MMIO_D(_MMIO(_PLANE_KEYMSK_1(PIPE_C)), D_SKL_PLUS);
+
+	MMIO_D(_MMIO(0x44500), D_SKL_PLUS);
 	MMIO_DFH(GEN9_CSFE_CHICKEN1_RCS, D_SKL_PLUS, F_CMD_ACCESS, NULL, NULL);
 	MMIO_DFH(GEN8_HDC_CHICKEN1, D_SKL | D_KBL, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL);

-	MMIO_D(0x4ab8, D_KBL);
-	MMIO_D(0x2248, D_SKL_PLUS | D_KBL);
+	MMIO_D(_MMIO(0x4ab8), D_KBL);
+	MMIO_D(_MMIO(0x2248), D_SKL_PLUS | D_KBL);

 	return 0;
 }
@@ -2869,8 +2869,8 @@ static struct gvt_mmio_block *find_mmio_block(struct intel_gvt *gvt,
 	for (i = 0; i < num; i++, block++) {
 		if (!(device & block->device))
 			continue;
-		if (offset >= INTEL_GVT_MMIO_OFFSET(block->offset) &&
-		    offset < INTEL_GVT_MMIO_OFFSET(block->offset) + block->size)
+		if (offset >= i915_mmio_reg_offset(block->offset) &&
+		    offset < i915_mmio_reg_offset(block->offset) + block->size)
 			return block;
 	}
 	return NULL;
@@ -2982,8 +2982,8 @@ int intel_gvt_for_each_tracked_mmio(struct intel_gvt *gvt,
 	for (i = 0; i < gvt->mmio.num_mmio_block; i++, block++) {
 		for (j = 0; j < block->size; j += 4) {
 			ret = handler(gvt,
-				INTEL_GVT_MMIO_OFFSET(block->offset) + j,
-				data);
+				      i915_mmio_reg_offset(block->offset) + j,
+				      data);
 			if (ret)
 				return ret;
 		}
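find_mmio_block() above is a linear containment test over per-device MMIO block descriptors, now comparing through i915_mmio_reg_offset() instead of the INTEL_GVT_MMIO_OFFSET() pointer cast (whose definition is dropped from mmio.h below). A minimal sketch of the same range lookup, with simplified stand-in types rather than the GVT structures:

/* Sketch: linear range lookup over register-block descriptors,
 * mirroring the shape of find_mmio_block(). */
#include <stddef.h>

struct mmio_block {
	unsigned int offset;	/* first register offset in the block */
	unsigned int size;	/* bytes covered by the block */
};

static const struct mmio_block *
find_block(const struct mmio_block *blocks, size_t num, unsigned int offset)
{
	size_t i;

	for (i = 0; i < num; i++)
		if (offset >= blocks[i].offset &&
		    offset < blocks[i].offset + blocks[i].size)
			return &blocks[i];
	return NULL;
}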
diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c
index f86983d6655b..45bab5a6290b 100644
--- a/drivers/gpu/drm/i915/gvt/kvmgt.c
+++ b/drivers/gpu/drm/i915/gvt/kvmgt.c
@@ -1360,8 +1360,8 @@ static void kvmgt_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 			struct kvmgt_guest_info, track_node);

 	if (kvmgt_gfn_is_write_protected(info, gpa_to_gfn(gpa)))
-		intel_gvt_ops->emulate_mmio_write(info->vgpu, gpa,
-					(void *)val, len);
+		intel_gvt_ops->write_protect_handler(info->vgpu, gpa,
+					(void *)val, len);
 }

 static void kvmgt_page_track_flush_slot(struct kvm *kvm,
diff --git a/drivers/gpu/drm/i915/gvt/mmio.c b/drivers/gpu/drm/i915/gvt/mmio.c
index f7227a3ad469..562b5ad857a4 100644
--- a/drivers/gpu/drm/i915/gvt/mmio.c
+++ b/drivers/gpu/drm/i915/gvt/mmio.c
@@ -117,25 +117,6 @@ static void failsafe_emulate_mmio_rw(struct intel_vgpu *vgpu, uint64_t pa,
 		else
 			memcpy(pt, p_data, bytes);

-	} else if (atomic_read(&vgpu->gtt.n_tracked_guest_page)) {
-		struct intel_vgpu_page_track *t;
-
-		/* Since we enter the failsafe mode early during guest boot,
-		 * guest may not have a chance to set up its ppgtt table, so
-		 * there should not be any wp pages for guest. Keep the wp
-		 * related code here in case we need to handle it in the future.
-		 */
-		t = intel_vgpu_find_tracked_page(vgpu, pa >> PAGE_SHIFT);
-		if (t) {
-			/* remove write protection to prevent future traps */
-			intel_vgpu_clean_page_track(vgpu, t);
-			if (read)
-				intel_gvt_hypervisor_read_gpa(vgpu, pa,
-						p_data, bytes);
-			else
-				intel_gvt_hypervisor_write_gpa(vgpu, pa,
-						p_data, bytes);
-		}
 	}
 	mutex_unlock(&gvt->lock);
 }
@@ -168,23 +149,6 @@ int intel_vgpu_emulate_mmio_read(struct intel_vgpu *vgpu, uint64_t pa,
 		goto out;
 	}

-	if (atomic_read(&vgpu->gtt.n_tracked_guest_page)) {
-		struct intel_vgpu_page_track *t;
-
-		t = intel_vgpu_find_tracked_page(vgpu, pa >> PAGE_SHIFT);
-		if (t) {
-			ret = intel_gvt_hypervisor_read_gpa(vgpu, pa,
-					p_data, bytes);
-			if (ret) {
-				gvt_vgpu_err("guest page read error %d, "
-					"gfn 0x%lx, pa 0x%llx, var 0x%x, len %d\n",
-					ret, t->gfn, pa, *(u32 *)p_data,
-					bytes);
-			}
-			goto out;
-		}
-	}
-
 	offset = intel_vgpu_gpa_to_mmio_offset(vgpu, pa);

 	if (WARN_ON(bytes > 8))
@@ -263,23 +227,6 @@ int intel_vgpu_emulate_mmio_write(struct intel_vgpu *vgpu, uint64_t pa,
 		goto out;
 	}

-	if (atomic_read(&vgpu->gtt.n_tracked_guest_page)) {
-		struct intel_vgpu_page_track *t;
-
-		t = intel_vgpu_find_tracked_page(vgpu, pa >> PAGE_SHIFT);
-		if (t) {
-			ret = t->handler(t, pa, p_data, bytes);
-			if (ret) {
-				gvt_err("guest page write error %d, "
-					"gfn 0x%lx, pa 0x%llx, "
-					"var 0x%x, len %d\n",
-					ret, t->gfn, pa,
-					*(u32 *)p_data, bytes);
-			}
-			goto out;
-		}
-	}
-
 	offset = intel_vgpu_gpa_to_mmio_offset(vgpu, pa);

 	if (WARN_ON(bytes > 8))
@@ -336,10 +283,10 @@ void intel_vgpu_reset_mmio(struct intel_vgpu *vgpu, bool dmlr)
 		memcpy(vgpu->mmio.vreg, mmio, info->mmio_size);
 		memcpy(vgpu->mmio.sreg, mmio, info->mmio_size);

-		vgpu_vreg(vgpu, GEN6_GT_THREAD_STATUS_REG) = 0;
+		vgpu_vreg_t(vgpu, GEN6_GT_THREAD_STATUS_REG) = 0;

 		/* set the bit 0:2(Core C-State ) to C0 */
-		vgpu_vreg(vgpu, GEN6_GT_CORE_STATUS) = 0;
+		vgpu_vreg_t(vgpu, GEN6_GT_CORE_STATUS) = 0;

 		vgpu->mmio.disable_warn_untrack = false;
 	} else {
diff --git a/drivers/gpu/drm/i915/gvt/mmio.h b/drivers/gpu/drm/i915/gvt/mmio.h
index 62709ac351cd..71b620875943 100644
--- a/drivers/gpu/drm/i915/gvt/mmio.h
+++ b/drivers/gpu/drm/i915/gvt/mmio.h
@@ -76,13 +76,6 @@ int intel_gvt_for_each_tracked_mmio(struct intel_gvt *gvt,
 	int (*handler)(struct intel_gvt *gvt, u32 offset, void *data),
 	void *data);

-
-#define INTEL_GVT_MMIO_OFFSET(reg) ({ \
-	typeof(reg) __reg = reg; \
-	u32 *offset = (u32 *)&__reg; \
-	*offset; \
-})
-
 int intel_vgpu_init_mmio(struct intel_vgpu *vgpu);
 void intel_vgpu_reset_mmio(struct intel_vgpu *vgpu, bool dmlr);
 void intel_vgpu_clean_mmio(struct intel_vgpu *vgpu);
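The INTEL_GVT_MMIO_OFFSET() macro removed above peeked into an i915_reg_t through a pointer cast; callers now use the accessor the rest of i915 already uses. A minimal sketch of why a single-member struct makes register offsets type-safe (simplified, not the kernel's exact definitions):

/* Sketch: wrapping an MMIO offset in a struct makes passing a raw
 * integer where a register handle is expected a compile error. */
typedef struct { unsigned int reg; } reg_t;

#define REG(offset) ((reg_t){ .reg = (offset) })

static inline unsigned int reg_offset(reg_t r)
{
	return r.reg;
}

/* reg_offset(REG(0x4ab8)) yields 0x4ab8; a bare 0x4ab8 no longer
 * converts silently into a register argument. */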
diff --git a/drivers/gpu/drm/i915/gvt/mmio_context.c b/drivers/gpu/drm/i915/gvt/mmio_context.c
index 8a52b56f0e86..74834395dd89 100644
--- a/drivers/gpu/drm/i915/gvt/mmio_context.c
+++ b/drivers/gpu/drm/i915/gvt/mmio_context.c
@@ -149,8 +149,41 @@ static struct engine_mmio gen9_engine_mmio_list[] __cacheline_aligned = {
 	{ /* Terminated */ }
 };

-static u32 gen9_render_mocs[I915_NUM_ENGINES][64];
-static u32 gen9_render_mocs_L3[32];
+static struct {
+	bool initialized;
+	u32 control_table[I915_NUM_ENGINES][64];
+	u32 l3cc_table[32];
+} gen9_render_mocs;
+
+static void load_render_mocs(struct drm_i915_private *dev_priv)
+{
+	i915_reg_t offset;
+	u32 regs[] = {
+		[RCS] = 0xc800,
+		[VCS] = 0xc900,
+		[VCS2] = 0xca00,
+		[BCS] = 0xcc00,
+		[VECS] = 0xcb00,
+	};
+	int ring_id, i;
+
+	for (ring_id = 0; ring_id < I915_NUM_ENGINES; ring_id++) {
+		offset.reg = regs[ring_id];
+		for (i = 0; i < 64; i++) {
+			gen9_render_mocs.control_table[ring_id][i] =
+				I915_READ_FW(offset);
+			offset.reg += 4;
+		}
+	}
+
+	offset.reg = 0xb020;
+	for (i = 0; i < 32; i++) {
+		gen9_render_mocs.l3cc_table[i] =
+			I915_READ_FW(offset);
+		offset.reg += 4;
+	}
+	gen9_render_mocs.initialized = true;
+}

 static void handle_tlb_pending_event(struct intel_vgpu *vgpu, int ring_id)
 {
@@ -191,17 +224,20 @@ static void handle_tlb_pending_event(struct intel_vgpu *vgpu, int ring_id)
 	if (wait_for_atomic((I915_READ_FW(reg) == 0), 50))
 		gvt_vgpu_err("timeout in invalidate ring (%d) tlb\n", ring_id);
 	else
-		vgpu_vreg(vgpu, regs[ring_id]) = 0;
+		vgpu_vreg_t(vgpu, reg) = 0;

 	intel_uncore_forcewake_put(dev_priv, fw);

 	gvt_dbg_core("invalidate TLB for ring %d\n", ring_id);
 }

-static void load_mocs(struct intel_vgpu *vgpu, int ring_id)
+static void switch_mocs(struct intel_vgpu *pre, struct intel_vgpu *next,
+			int ring_id)
 {
-	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
+	struct drm_i915_private *dev_priv;
 	i915_reg_t offset, l3_offset;
+	u32 old_v, new_v;
+
 	u32 regs[] = {
 		[RCS] = 0xc800,
 		[VCS] = 0xc900,
@@ -211,54 +247,45 @@ static void load_mocs(struct intel_vgpu *vgpu, int ring_id)
 	};
 	int i;

+	dev_priv = pre ? pre->gvt->dev_priv : next->gvt->dev_priv;
 	if (WARN_ON(ring_id >= ARRAY_SIZE(regs)))
 		return;

+	if (!pre && !gen9_render_mocs.initialized)
+		load_render_mocs(dev_priv);
+
 	offset.reg = regs[ring_id];
 	for (i = 0; i < 64; i++) {
-		gen9_render_mocs[ring_id][i] = I915_READ_FW(offset);
-		I915_WRITE_FW(offset, vgpu_vreg(vgpu, offset));
-		offset.reg += 4;
-	}
-
-	if (ring_id == RCS) {
-		l3_offset.reg = 0xb020;
-		for (i = 0; i < 32; i++) {
-			gen9_render_mocs_L3[i] = I915_READ_FW(l3_offset);
-			I915_WRITE_FW(l3_offset, vgpu_vreg(vgpu, l3_offset));
-			l3_offset.reg += 4;
-		}
-	}
-}
+		if (pre)
+			old_v = vgpu_vreg_t(pre, offset);
+		else
+			old_v = gen9_render_mocs.control_table[ring_id][i];
+		if (next)
+			new_v = vgpu_vreg_t(next, offset);
+		else
+			new_v = gen9_render_mocs.control_table[ring_id][i];

-static void restore_mocs(struct intel_vgpu *vgpu, int ring_id)
-{
-	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
-	i915_reg_t offset, l3_offset;
-	u32 regs[] = {
-		[RCS] = 0xc800,
-		[VCS] = 0xc900,
-		[VCS2] = 0xca00,
-		[BCS] = 0xcc00,
-		[VECS] = 0xcb00,
-	};
-	int i;
+		if (old_v != new_v)
+			I915_WRITE_FW(offset, new_v);

-	if (WARN_ON(ring_id >= ARRAY_SIZE(regs)))
-		return;
-
-	offset.reg = regs[ring_id];
-	for (i = 0; i < 64; i++) {
-		vgpu_vreg(vgpu, offset) = I915_READ_FW(offset);
-		I915_WRITE_FW(offset, gen9_render_mocs[ring_id][i]);
 		offset.reg += 4;
 	}

 	if (ring_id == RCS) {
 		l3_offset.reg = 0xb020;
 		for (i = 0; i < 32; i++) {
-			vgpu_vreg(vgpu, l3_offset) = I915_READ_FW(l3_offset);
-			I915_WRITE_FW(l3_offset, gen9_render_mocs_L3[i]);
+			if (pre)
+				old_v = vgpu_vreg_t(pre, l3_offset);
+			else
+				old_v = gen9_render_mocs.l3cc_table[i];
+			if (next)
+				new_v = vgpu_vreg_t(next, l3_offset);
+			else
+				new_v = gen9_render_mocs.l3cc_table[i];
+
+			if (old_v != new_v)
+				I915_WRITE_FW(l3_offset, new_v);
+
 			l3_offset.reg += 4;
 		}
 	}
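switch_mocs() above folds the old load_mocs()/restore_mocs() pair into a single pass: the outgoing value comes from the previous vGPU's virtual registers (or the lazily captured host table when pre is NULL), the incoming value from the next vGPU (or the same host table), and the MMIO write is skipped when the two agree. A standalone sketch of that shape, with plain arrays standing in for vGPU state and hardware:

/* Sketch: one-pass context switch of a register table, where NULL
 * means "host" on either side and unchanged values skip the write. */
#include <stddef.h>

#define TABLE_LEN 64

struct vctx { unsigned int vreg[TABLE_LEN]; };

static unsigned int host_table[TABLE_LEN];	/* captured once, lazily */
static unsigned int hw[TABLE_LEN];		/* stand-in for MMIO */

static void switch_table(const struct vctx *pre, const struct vctx *next)
{
	size_t i;

	for (i = 0; i < TABLE_LEN; i++) {
		unsigned int old_v = pre ? pre->vreg[i] : host_table[i];
		unsigned int new_v = next ? next->vreg[i] : host_table[i];

		if (old_v != new_v)
			hw[i] = new_v;	/* would be an I915_WRITE_FW() */
	}
}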
@@ -266,84 +293,77 @@

 #define CTX_CONTEXT_CONTROL_VAL	0x03

-/* Switch ring mmio values (context) from host to a vgpu. */
-static void switch_mmio_to_vgpu(struct intel_vgpu *vgpu, int ring_id)
+/* Switch ring mmio values (context). */
+static void switch_mmio(struct intel_vgpu *pre,
+			struct intel_vgpu *next,
+			int ring_id)
 {
-	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
-	struct intel_vgpu_submission *s = &vgpu->submission;
-	u32 *reg_state = s->shadow_ctx->engine[ring_id].lrc_reg_state;
-	u32 ctx_ctrl = reg_state[CTX_CONTEXT_CONTROL_VAL];
+	struct drm_i915_private *dev_priv;
+	struct intel_vgpu_submission *s;
+	u32 *reg_state, ctx_ctrl;
 	u32 inhibit_mask =
 		_MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
 	struct engine_mmio *mmio;
-	u32 v;
-
-	if (IS_SKYLAKE(dev_priv) || IS_KABYLAKE(dev_priv))
-		load_mocs(vgpu, ring_id);
-
-	mmio = vgpu->gvt->engine_mmio_list;
-	while (i915_mmio_reg_offset((mmio++)->reg)) {
-		if (mmio->ring_id != ring_id)
-			continue;
-
-		mmio->value = I915_READ_FW(mmio->reg);
-
-		/*
-		 * if it is an inhibit context, load in_context mmio
-		 * into HW by mmio write. If it is not, skip this mmio
-		 * write.
-		 */
-		if (mmio->in_context &&
-		    (ctx_ctrl & inhibit_mask) != inhibit_mask)
-			continue;
-
-		if (mmio->mask)
-			v = vgpu_vreg(vgpu, mmio->reg) | (mmio->mask << 16);
-		else
-			v = vgpu_vreg(vgpu, mmio->reg);
-
-		I915_WRITE_FW(mmio->reg, v);
-
-		trace_render_mmio(vgpu->id, "load",
-				  i915_mmio_reg_offset(mmio->reg),
-				  mmio->value, v);
-	}
-
-	handle_tlb_pending_event(vgpu, ring_id);
-}
-
-/* Switch ring mmio values (context) from vgpu to host. */
-static void switch_mmio_to_host(struct intel_vgpu *vgpu, int ring_id)
-{
-	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
-	struct engine_mmio *mmio;
-	u32 v;
+	u32 old_v, new_v;

+	dev_priv = pre ? pre->gvt->dev_priv : next->gvt->dev_priv;
 	if (IS_SKYLAKE(dev_priv) || IS_KABYLAKE(dev_priv))
-		restore_mocs(vgpu, ring_id);
+		switch_mocs(pre, next, ring_id);

-	mmio = vgpu->gvt->engine_mmio_list;
+	mmio = dev_priv->gvt->engine_mmio_list;
 	while (i915_mmio_reg_offset((mmio++)->reg)) {
 		if (mmio->ring_id != ring_id)
 			continue;
-
-		vgpu_vreg(vgpu, mmio->reg) = I915_READ_FW(mmio->reg);
-
-		if (mmio->mask) {
-			vgpu_vreg(vgpu, mmio->reg) &= ~(mmio->mask << 16);
-			v = mmio->value | (mmio->mask << 16);
+		// save
+		if (pre) {
+			vgpu_vreg_t(pre, mmio->reg) = I915_READ_FW(mmio->reg);
+			if (mmio->mask)
+				vgpu_vreg_t(pre, mmio->reg) &=
+					~(mmio->mask << 16);
+			old_v = vgpu_vreg_t(pre, mmio->reg);
 		} else
-			v = mmio->value;
-
-		if (mmio->in_context)
-			continue;
+			old_v = mmio->value = I915_READ_FW(mmio->reg);
+
+		// restore
+		if (next) {
+			s = &next->submission;
+			reg_state =
+				s->shadow_ctx->engine[ring_id].lrc_reg_state;
+			ctx_ctrl = reg_state[CTX_CONTEXT_CONTROL_VAL];
+			/*
+			 * if it is an inhibit context, load in_context mmio
+			 * into HW by mmio write. If it is not, skip this mmio
+			 * write.
+			 */
+			if (mmio->in_context &&
+			    (ctx_ctrl & inhibit_mask) != inhibit_mask)
+				continue;
+
+			if (mmio->mask)
+				new_v = vgpu_vreg_t(next, mmio->reg) |
+					(mmio->mask << 16);
+			else
+				new_v = vgpu_vreg_t(next, mmio->reg);
+		} else {
+			if (mmio->in_context)
+				continue;
+			if (mmio->mask)
+				new_v = mmio->value | (mmio->mask << 16);
+			else
+				new_v = mmio->value;
+		}

-		I915_WRITE_FW(mmio->reg, v);
+		I915_WRITE_FW(mmio->reg, new_v);

-		trace_render_mmio(vgpu->id, "restore",
+		trace_render_mmio(pre ? pre->id : 0,
+				  next ? next->id : 0,
+				  "switch",
 				  i915_mmio_reg_offset(mmio->reg),
-				  mmio->value, v);
+				  old_v, new_v);
 	}
+
+	if (next)
+		handle_tlb_pending_event(next, ring_id);
 }
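switch_mmio() above composes values for i915 "masked" registers, where the top 16 bits of a write select which of the low 16 bits actually change; OR-ing mmio->mask << 16 arms those bits. A sketch of that hardware convention, assuming a simplified 32-bit masked register:

/* Sketch: an i915-style masked register updates only the low bits
 * whose corresponding high (enable) bits are set in the written value. */
#include <stdint.h>

static uint32_t hw_reg;	/* stand-in for a masked MMIO register */

static void masked_write(uint32_t val_and_mask)
{
	uint32_t enable = val_and_mask >> 16;
	uint32_t value = val_and_mask & 0xffff;

	hw_reg = (hw_reg & ~enable) | (value & enable);
}

/* To force the bits in `mask` to the values in `v`, write
 * (mask << 16) | v, which is what vreg | (mmio->mask << 16) builds. */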

 /**
@@ -374,17 +394,7 @@ void intel_gvt_switch_mmio(struct intel_vgpu *pre,
 	 * handle forcewake manually.
 	 */
 	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
-
-	/**
-	 * TODO: Optimize for vGPU to vGPU switch by merging
-	 * switch_mmio_to_host() and switch_mmio_to_vgpu().
-	 */
-	if (pre)
-		switch_mmio_to_host(pre, ring_id);
-
-	if (next)
-		switch_mmio_to_vgpu(next, ring_id);
-
+	switch_mmio(pre, next, ring_id);
 	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
 }
diff --git a/drivers/gpu/drm/i915/gvt/trace.h b/drivers/gpu/drm/i915/gvt/trace.h
index 8c150381d9a4..7a2511538f34 100644
--- a/drivers/gpu/drm/i915/gvt/trace.h
+++ b/drivers/gpu/drm/i915/gvt/trace.h
@@ -330,13 +330,14 @@ TRACE_EVENT(inject_msi,
 );

 TRACE_EVENT(render_mmio,
-	TP_PROTO(int id, char *action, unsigned int reg,
+	TP_PROTO(int old_id, int new_id, char *action, unsigned int reg,
 		 unsigned int old_val, unsigned int new_val),

-	TP_ARGS(id, action, reg, new_val, old_val),
+	TP_ARGS(old_id, new_id, action, reg, new_val, old_val),

 	TP_STRUCT__entry(
-		__field(int, id)
+		__field(int, old_id)
+		__field(int, new_id)
 		__array(char, buf, GVT_TEMP_STR_LEN)
 		__field(unsigned int, reg)
 		__field(unsigned int, old_val)
@@ -344,15 +345,17 @@ TRACE_EVENT(render_mmio,
 	),

 	TP_fast_assign(
-		__entry->id = id;
+		__entry->old_id = old_id;
+		__entry->new_id = new_id;
 		snprintf(__entry->buf, GVT_TEMP_STR_LEN, "%s", action);
 		__entry->reg = reg;
 		__entry->old_val = old_val;
 		__entry->new_val = new_val;
 	),

-	TP_printk("VM%u %s reg %x, old %08x new %08x\n",
-		__entry->id, __entry->buf, __entry->reg,
+	TP_printk("VM%u -> VM%u %s reg %x, old %08x new %08x\n",
+		__entry->old_id, __entry->new_id,
+		__entry->buf, __entry->reg,
 		__entry->old_val, __entry->new_val)
 );
diff --git a/drivers/gpu/drm/i915/gvt/vgpu.c b/drivers/gpu/drm/i915/gvt/vgpu.c
index 39926176fbeb..4688619f6a1c 100644
--- a/drivers/gpu/drm/i915/gvt/vgpu.c
+++ b/drivers/gpu/drm/i915/gvt/vgpu.c
@@ -38,25 +38,25 @@
 void populate_pvinfo_page(struct intel_vgpu *vgpu)
 {
 	/* setup the ballooning information */
-	vgpu_vreg64(vgpu, vgtif_reg(magic)) = VGT_MAGIC;
-	vgpu_vreg(vgpu, vgtif_reg(version_major)) = 1;
-	vgpu_vreg(vgpu, vgtif_reg(version_minor)) = 0;
-	vgpu_vreg(vgpu, vgtif_reg(display_ready)) = 0;
-	vgpu_vreg(vgpu, vgtif_reg(vgt_id)) = vgpu->id;
+	vgpu_vreg64_t(vgpu, vgtif_reg(magic)) = VGT_MAGIC;
+	vgpu_vreg_t(vgpu, vgtif_reg(version_major)) = 1;
+	vgpu_vreg_t(vgpu, vgtif_reg(version_minor)) = 0;
+	vgpu_vreg_t(vgpu, vgtif_reg(display_ready)) = 0;
+	vgpu_vreg_t(vgpu, vgtif_reg(vgt_id)) = vgpu->id;

-	vgpu_vreg(vgpu, vgtif_reg(vgt_caps)) = VGT_CAPS_FULL_48BIT_PPGTT;
-	vgpu_vreg(vgpu, vgtif_reg(vgt_caps)) |= VGT_CAPS_HWSP_EMULATION;
+	vgpu_vreg_t(vgpu, vgtif_reg(vgt_caps)) = VGT_CAPS_FULL_48BIT_PPGTT;
+	vgpu_vreg_t(vgpu, vgtif_reg(vgt_caps)) |= VGT_CAPS_HWSP_EMULATION;

-	vgpu_vreg(vgpu, vgtif_reg(avail_rs.mappable_gmadr.base)) =
+	vgpu_vreg_t(vgpu, vgtif_reg(avail_rs.mappable_gmadr.base)) =
 		vgpu_aperture_gmadr_base(vgpu);
-	vgpu_vreg(vgpu, vgtif_reg(avail_rs.mappable_gmadr.size)) =
+	vgpu_vreg_t(vgpu, vgtif_reg(avail_rs.mappable_gmadr.size)) =
 		vgpu_aperture_sz(vgpu);
-	vgpu_vreg(vgpu, vgtif_reg(avail_rs.nonmappable_gmadr.base)) =
+	vgpu_vreg_t(vgpu, vgtif_reg(avail_rs.nonmappable_gmadr.base)) =
 		vgpu_hidden_gmadr_base(vgpu);
-	vgpu_vreg(vgpu, vgtif_reg(avail_rs.nonmappable_gmadr.size)) =
+	vgpu_vreg_t(vgpu, vgtif_reg(avail_rs.nonmappable_gmadr.size)) =
 		vgpu_hidden_sz(vgpu);

-	vgpu_vreg(vgpu, vgtif_reg(avail_rs.fence_num)) = vgpu_fence_sz(vgpu);
+	vgpu_vreg_t(vgpu, vgtif_reg(avail_rs.fence_num)) = vgpu_fence_sz(vgpu);

 	gvt_dbg_core("Populate PVINFO PAGE for vGPU %d\n", vgpu->id);
 	gvt_dbg_core("aperture 
base [GMADR] 0x%llx size 0x%llx\n", diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index d8c6ec3cca71..e968aeae1d84 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -37,40 +37,21 @@ static inline struct drm_i915_private *node_to_i915(struct drm_info_node *node) return to_i915(node->minor->dev); } -static __always_inline void seq_print_param(struct seq_file *m, - const char *name, - const char *type, - const void *x) -{ - if (!__builtin_strcmp(type, "bool")) - seq_printf(m, "i915.%s=%s\n", name, yesno(*(const bool *)x)); - else if (!__builtin_strcmp(type, "int")) - seq_printf(m, "i915.%s=%d\n", name, *(const int *)x); - else if (!__builtin_strcmp(type, "unsigned int")) - seq_printf(m, "i915.%s=%u\n", name, *(const unsigned int *)x); - else if (!__builtin_strcmp(type, "char *")) - seq_printf(m, "i915.%s=%s\n", name, *(const char **)x); - else - BUILD_BUG(); -} - static int i915_capabilities(struct seq_file *m, void *data) { struct drm_i915_private *dev_priv = node_to_i915(m->private); const struct intel_device_info *info = INTEL_INFO(dev_priv); + struct drm_printer p = drm_seq_file_printer(m); seq_printf(m, "gen: %d\n", INTEL_GEN(dev_priv)); seq_printf(m, "platform: %s\n", intel_platform_name(info->platform)); seq_printf(m, "pch: %d\n", INTEL_PCH_TYPE(dev_priv)); -#define PRINT_FLAG(x) seq_printf(m, #x ": %s\n", yesno(info->x)) - DEV_INFO_FOR_EACH_FLAG(PRINT_FLAG); -#undef PRINT_FLAG + intel_device_info_dump_flags(info, &p); + intel_device_info_dump_runtime(info, &p); kernel_param_lock(THIS_MODULE); -#define PRINT_PARAM(T, x, ...) seq_print_param(m, #x, #T, &i915_modparams.x); - I915_PARAMS_FOR_EACH(PRINT_PARAM); -#undef PRINT_PARAM + i915_params_dump(&i915_modparams, &p); kernel_param_unlock(THIS_MODULE); return 0; @@ -1601,20 +1582,23 @@ static int i915_frontbuffer_tracking(struct seq_file *m, void *unused) static int i915_fbc_status(struct seq_file *m, void *unused) { struct drm_i915_private *dev_priv = node_to_i915(m->private); + struct intel_fbc *fbc = &dev_priv->fbc; - if (!HAS_FBC(dev_priv)) { - seq_puts(m, "FBC unsupported on this chipset\n"); - return 0; - } + if (!HAS_FBC(dev_priv)) + return -ENODEV; intel_runtime_pm_get(dev_priv); - mutex_lock(&dev_priv->fbc.lock); + mutex_lock(&fbc->lock); if (intel_fbc_is_active(dev_priv)) seq_puts(m, "FBC enabled\n"); else - seq_printf(m, "FBC disabled: %s\n", - dev_priv->fbc.no_fbc_reason); + seq_printf(m, "FBC disabled: %s\n", fbc->no_fbc_reason); + + if (fbc->work.scheduled) + seq_printf(m, "FBC worker scheduled on vblank %u, now %llu\n", + fbc->work.scheduled_vblank, + drm_crtc_vblank_count(&fbc->crtc->base)); if (intel_fbc_is_active(dev_priv)) { u32 mask; @@ -1634,7 +1618,7 @@ static int i915_fbc_status(struct seq_file *m, void *unused) seq_printf(m, "Compressing: %s\n", yesno(mask)); } - mutex_unlock(&dev_priv->fbc.lock); + mutex_unlock(&fbc->lock); intel_runtime_pm_put(dev_priv); return 0; @@ -1681,10 +1665,8 @@ static int i915_ips_status(struct seq_file *m, void *unused) { struct drm_i915_private *dev_priv = node_to_i915(m->private); - if (!HAS_IPS(dev_priv)) { - seq_puts(m, "not supported\n"); - return 0; - } + if (!HAS_IPS(dev_priv)) + return -ENODEV; intel_runtime_pm_get(dev_priv); @@ -1770,10 +1752,8 @@ static int i915_ring_freq_table(struct seq_file *m, void *unused) int gpu_freq, ia_freq; unsigned int max_gpu_freq, min_gpu_freq; - if (!HAS_LLC(dev_priv)) { - seq_puts(m, "unsupported on this chipset\n"); - return 0; - } + if (!HAS_LLC(dev_priv)) + 
return -ENODEV;

 	intel_runtime_pm_get(dev_priv);
@@ -2253,8 +2233,8 @@ static int i915_huc_load_status_info(struct seq_file *m, void *data)
 	struct drm_i915_private *dev_priv = node_to_i915(m->private);
 	struct drm_printer p;

-	if (!HAS_HUC_UCODE(dev_priv))
-		return 0;
+	if (!HAS_HUC(dev_priv))
+		return -ENODEV;

 	p = drm_seq_file_printer(m);
 	intel_uc_fw_dump(&dev_priv->huc.fw, &p);
@@ -2272,8 +2252,8 @@ static int i915_guc_load_status_info(struct seq_file *m, void *data)
 	struct drm_printer p;
 	u32 tmp, i;

-	if (!HAS_GUC_UCODE(dev_priv))
-		return 0;
+	if (!HAS_GUC(dev_priv))
+		return -ENODEV;

 	p = drm_seq_file_printer(m);
 	intel_uc_fw_dump(&dev_priv->guc.fw, &p);
@@ -2346,29 +2326,16 @@ static void i915_guc_client_info(struct seq_file *m,
 	seq_printf(m, "\tTotal: %llu\n", tot);
 }

-static bool check_guc_submission(struct seq_file *m)
-{
-	struct drm_i915_private *dev_priv = node_to_i915(m->private);
-	const struct intel_guc *guc = &dev_priv->guc;
-
-	if (!guc->execbuf_client) {
-		seq_printf(m, "GuC submission %s\n",
-			   HAS_GUC_SCHED(dev_priv) ?
-			   "disabled" :
-			   "not supported");
-		return false;
-	}
-
-	return true;
-}
-
 static int i915_guc_info(struct seq_file *m, void *data)
 {
 	struct drm_i915_private *dev_priv = node_to_i915(m->private);
 	const struct intel_guc *guc = &dev_priv->guc;

-	if (!check_guc_submission(m))
-		return 0;
+	if (!USES_GUC_SUBMISSION(dev_priv))
+		return -ENODEV;
+
+	GEM_BUG_ON(!guc->execbuf_client);
+	GEM_BUG_ON(!guc->preempt_client);

 	seq_printf(m, "Doorbell map:\n");
 	seq_printf(m, "\t%*pb\n", GUC_NUM_DOORBELLS, guc->doorbell_bitmap);
@@ -2395,8 +2362,8 @@ static int i915_guc_stage_pool(struct seq_file *m, void *data)
 	unsigned int tmp;
 	int index;

-	if (!check_guc_submission(m))
-		return 0;
+	if (!USES_GUC_SUBMISSION(dev_priv))
+		return -ENODEV;

 	for (index = 0; index < GUC_MAX_STAGE_DESCRIPTORS; index++, desc++) {
 		struct intel_engine_cs *engine;
@@ -2449,6 +2416,9 @@ static int i915_guc_log_dump(struct seq_file *m, void *data)
 	u32 *log;
 	int i = 0;

+	if (!HAS_GUC(dev_priv))
+		return -ENODEV;
+
 	if (dump_load_err)
 		obj = dev_priv->guc.load_err_log;
 	else if (dev_priv->guc.log.vma)
@@ -2480,6 +2450,9 @@ static int i915_guc_log_control_get(void *data, u64 *val)
 {
 	struct drm_i915_private *dev_priv = data;

+	if (!HAS_GUC(dev_priv))
+		return -ENODEV;
+
 	if (!dev_priv->guc.log.vma)
 		return -EINVAL;

@@ -2493,6 +2466,9 @@ static int i915_guc_log_control_set(void *data, u64 val)
 	struct drm_i915_private *dev_priv = data;
 	int ret;

+	if (!HAS_GUC(dev_priv))
+		return -ENODEV;
+
 	if (!dev_priv->guc.log.vma)
 		return -EINVAL;

@@ -2543,10 +2519,8 @@ static int i915_edp_psr_status(struct seq_file *m, void *data)
 	enum pipe pipe;
 	bool enabled = false;

-	if (!HAS_PSR(dev_priv)) {
-		seq_puts(m, "PSR not supported\n");
-		return 0;
-	}
+	if (!HAS_PSR(dev_priv))
+		return -ENODEV;

 	intel_runtime_pm_get(dev_priv);

@@ -2785,10 +2759,8 @@ static int i915_dmc_info(struct seq_file *m, void *unused)
 	struct drm_i915_private *dev_priv = node_to_i915(m->private);
 	struct intel_csr *csr;

-	if (!HAS_CSR(dev_priv)) {
-		seq_puts(m, "not supported\n");
-		return 0;
-	}
+	if (!HAS_CSR(dev_priv))
+		return -ENODEV;

 	csr = &dev_priv->csr;

@@ -3324,7 +3296,7 @@ static int i915_ddb_info(struct seq_file *m, void *unused)
 	int plane;

 	if (INTEL_GEN(dev_priv) < 9)
-		return 0;
+		return -ENODEV;

 	drm_modeset_lock_all(dev);
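The debugfs conversions above stop printing a "not supported" placeholder and instead fail the show callback with -ENODEV, so a missing feature surfaces as an error from read(2) rather than as fake file contents. A sketch of the pattern in a plain seq_file show function (the feature flag is a stand-in for the HAS_*() checks):

/* Sketch: report an unsupported feature as -ENODEV from a
 * debugfs/seq_file show callback instead of placeholder text. */
#include <linux/seq_file.h>
#include <linux/errno.h>

static bool has_feature;	/* stand-in for HAS_FBC()/HAS_PSR()/... */

static int feature_status_show(struct seq_file *m, void *unused)
{
	if (!has_feature)
		return -ENODEV;	/* surfaces as a failed read(2) */

	seq_puts(m, "feature enabled\n");
	return 0;
}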
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index ca9f4b2862eb..6c8da9d20c33 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -931,8 +931,6 @@ static int i915_driver_init_early(struct drm_i915_private *dev_priv,
 	intel_display_crc_init(dev_priv);

-	intel_device_info_dump(dev_priv);
-
 	intel_detect_preproduction_hw(dev_priv);

 	return 0;
@@ -1084,7 +1082,7 @@ static int i915_driver_init_hw(struct drm_i915_private *dev_priv)
 	if (i915_inject_load_failure())
 		return -ENODEV;

-	intel_device_info_runtime_init(dev_priv);
+	intel_device_info_runtime_init(mkwrite_device_info(dev_priv));

 	intel_sanitize_options(dev_priv);

@@ -1294,6 +1292,21 @@ static void i915_driver_unregister(struct drm_i915_private *dev_priv)
 	i915_gem_shrinker_unregister(dev_priv);
 }

+static void i915_welcome_messages(struct drm_i915_private *dev_priv)
+{
+	if (drm_debug & DRM_UT_DRIVER) {
+		struct drm_printer p = drm_debug_printer("i915 device info:");
+
+		intel_device_info_dump(&dev_priv->info, &p);
+		intel_device_info_dump_runtime(&dev_priv->info, &p);
+	}
+
+	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG))
+		DRM_INFO("DRM_I915_DEBUG enabled\n");
+	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
+		DRM_INFO("DRM_I915_DEBUG_GEM enabled\n");
+}
+
 /**
  * i915_driver_load - setup chip and create an initial config
  * @pdev: PCI device
@@ -1379,13 +1392,10 @@ int i915_driver_load(struct pci_dev *pdev, const struct pci_device_id *ent)

 	intel_init_ipc(dev_priv);

-	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG))
-		DRM_INFO("DRM_I915_DEBUG enabled\n");
-	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
-		DRM_INFO("DRM_I915_DEBUG_GEM enabled\n");
-
 	intel_runtime_pm_put(dev_priv);

+	i915_welcome_messages(dev_priv);
+
 	return 0;

 out_cleanup_hw:
@@ -1924,9 +1934,6 @@ void i915_reset(struct drm_i915_private *i915, unsigned int flags)
 		goto taint;
 	}

-	i915_gem_reset(i915);
-	intel_overlay_reset(i915);
-
 	/* Ok, now get things going again... */

 	/*
@@ -1939,6 +1946,9 @@ void i915_reset(struct drm_i915_private *i915, unsigned int flags)
 		goto error;
 	}

+	i915_gem_reset(i915);
+	intel_overlay_reset(i915);
+
 	/*
 	 * Next we need to restore the context, but we don't use those
 	 * yet either...
@@ -2011,19 +2021,19 @@ int i915_reset_engine(struct intel_engine_cs *engine, unsigned int flags)

 	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));

+	active_request = i915_gem_reset_prepare_engine(engine);
+	if (IS_ERR_OR_NULL(active_request)) {
+		/* Either the previous reset failed, or we pardon the reset. 
*/ + ret = PTR_ERR(active_request); + goto out; + } + if (!(flags & I915_RESET_QUIET)) { dev_notice(engine->i915->drm.dev, "Resetting %s after gpu hang\n", engine->name); } error->reset_engine_count[engine->id]++; - active_request = i915_gem_reset_prepare_engine(engine); - if (IS_ERR(active_request)) { - DRM_DEBUG_DRIVER("Previous reset failed, promote to full reset\n"); - ret = PTR_ERR(active_request); - goto out; - } - if (!engine->i915->guc.execbuf_client) ret = intel_gt_reset_engine(engine->i915, engine); else diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 1aba5657f5f0..caebd5825279 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -56,12 +56,15 @@ #include "i915_reg.h" #include "i915_utils.h" -#include "intel_uncore.h" #include "intel_bios.h" +#include "intel_device_info.h" +#include "intel_display.h" #include "intel_dpll_mgr.h" -#include "intel_uc.h" #include "intel_lrc.h" +#include "intel_opregion.h" #include "intel_ringbuffer.h" +#include "intel_uncore.h" +#include "intel_uc.h" #include "i915_gem.h" #include "i915_gem_context.h" @@ -80,8 +83,8 @@ #define DRIVER_NAME "i915" #define DRIVER_DESC "Intel Graphics" -#define DRIVER_DATE "20171214" -#define DRIVER_TIMESTAMP 1513282202 +#define DRIVER_DATE "20171222" +#define DRIVER_TIMESTAMP 1513971710 /* Use I915_STATE_WARN(x) and I915_STATE_WARN_ON() (rather than WARN() and * WARN_ON()) for hw state sanity checks to check for unexpected conditions @@ -243,174 +246,6 @@ static inline uint_fixed_16_16_t add_fixed16_u32(uint_fixed_16_16_t add1, return clamp_u64_to_fixed16(interm_sum); } -static inline const char *yesno(bool v) -{ - return v ? "yes" : "no"; -} - -static inline const char *onoff(bool v) -{ - return v ? "on" : "off"; -} - -static inline const char *enableddisabled(bool v) -{ - return v ? "enabled" : "disabled"; -} - -enum pipe { - INVALID_PIPE = -1, - PIPE_A = 0, - PIPE_B, - PIPE_C, - _PIPE_EDP, - I915_MAX_PIPES = _PIPE_EDP -}; -#define pipe_name(p) ((p) + 'A') - -enum transcoder { - TRANSCODER_A = 0, - TRANSCODER_B, - TRANSCODER_C, - TRANSCODER_EDP, - TRANSCODER_DSI_A, - TRANSCODER_DSI_C, - I915_MAX_TRANSCODERS -}; - -static inline const char *transcoder_name(enum transcoder transcoder) -{ - switch (transcoder) { - case TRANSCODER_A: - return "A"; - case TRANSCODER_B: - return "B"; - case TRANSCODER_C: - return "C"; - case TRANSCODER_EDP: - return "EDP"; - case TRANSCODER_DSI_A: - return "DSI A"; - case TRANSCODER_DSI_C: - return "DSI C"; - default: - return "<invalid>"; - } -} - -static inline bool transcoder_is_dsi(enum transcoder transcoder) -{ - return transcoder == TRANSCODER_DSI_A || transcoder == TRANSCODER_DSI_C; -} - -/* - * Global legacy plane identifier. Valid only for primary/sprite - * planes on pre-g4x, and only for primary planes on g4x-bdw. - */ -enum i9xx_plane_id { - PLANE_A, - PLANE_B, - PLANE_C, -}; -#define plane_name(p) ((p) + 'A') - -#define sprite_name(p, s) ((p) * INTEL_INFO(dev_priv)->num_sprites[(p)] + (s) + 'A') - -/* - * Per-pipe plane identifier. - * I915_MAX_PLANES in the enum below is the maximum (across all platforms) - * number of planes per CRTC. Not all platforms really have this many planes, - * which means some arrays of size I915_MAX_PLANES may have unused entries - * between the topmost sprite plane and the cursor plane. - * - * This is expected to be passed to various register macros - * (eg. PLANE_CTL(), PS_PLANE_SEL(), etc.) so adjust with care. 
- */ -enum plane_id { - PLANE_PRIMARY, - PLANE_SPRITE0, - PLANE_SPRITE1, - PLANE_SPRITE2, - PLANE_CURSOR, - I915_MAX_PLANES, -}; - -#define for_each_plane_id_on_crtc(__crtc, __p) \ - for ((__p) = PLANE_PRIMARY; (__p) < I915_MAX_PLANES; (__p)++) \ - for_each_if ((__crtc)->plane_ids_mask & BIT(__p)) - -enum port { - PORT_NONE = -1, - PORT_A = 0, - PORT_B, - PORT_C, - PORT_D, - PORT_E, - I915_MAX_PORTS -}; -#define port_name(p) ((p) + 'A') - -#define I915_NUM_PHYS_VLV 2 - -enum dpio_channel { - DPIO_CH0, - DPIO_CH1 -}; - -enum dpio_phy { - DPIO_PHY0, - DPIO_PHY1, - DPIO_PHY2, -}; - -enum intel_display_power_domain { - POWER_DOMAIN_PIPE_A, - POWER_DOMAIN_PIPE_B, - POWER_DOMAIN_PIPE_C, - POWER_DOMAIN_PIPE_A_PANEL_FITTER, - POWER_DOMAIN_PIPE_B_PANEL_FITTER, - POWER_DOMAIN_PIPE_C_PANEL_FITTER, - POWER_DOMAIN_TRANSCODER_A, - POWER_DOMAIN_TRANSCODER_B, - POWER_DOMAIN_TRANSCODER_C, - POWER_DOMAIN_TRANSCODER_EDP, - POWER_DOMAIN_TRANSCODER_DSI_A, - POWER_DOMAIN_TRANSCODER_DSI_C, - POWER_DOMAIN_PORT_DDI_A_LANES, - POWER_DOMAIN_PORT_DDI_B_LANES, - POWER_DOMAIN_PORT_DDI_C_LANES, - POWER_DOMAIN_PORT_DDI_D_LANES, - POWER_DOMAIN_PORT_DDI_E_LANES, - POWER_DOMAIN_PORT_DDI_A_IO, - POWER_DOMAIN_PORT_DDI_B_IO, - POWER_DOMAIN_PORT_DDI_C_IO, - POWER_DOMAIN_PORT_DDI_D_IO, - POWER_DOMAIN_PORT_DDI_E_IO, - POWER_DOMAIN_PORT_DSI, - POWER_DOMAIN_PORT_CRT, - POWER_DOMAIN_PORT_OTHER, - POWER_DOMAIN_VGA, - POWER_DOMAIN_AUDIO, - POWER_DOMAIN_PLLS, - POWER_DOMAIN_AUX_A, - POWER_DOMAIN_AUX_B, - POWER_DOMAIN_AUX_C, - POWER_DOMAIN_AUX_D, - POWER_DOMAIN_GMBUS, - POWER_DOMAIN_MODESET, - POWER_DOMAIN_GT_IRQ, - POWER_DOMAIN_INIT, - - POWER_DOMAIN_NUM, -}; - -#define POWER_DOMAIN_PIPE(pipe) ((pipe) + POWER_DOMAIN_PIPE_A) -#define POWER_DOMAIN_PIPE_PANEL_FITTER(pipe) \ - ((pipe) + POWER_DOMAIN_PIPE_A_PANEL_FITTER) -#define POWER_DOMAIN_TRANSCODER(tran) \ - ((tran) == TRANSCODER_EDP ? 
POWER_DOMAIN_TRANSCODER_EDP : \ - (tran) + POWER_DOMAIN_TRANSCODER_A) - enum hpd_pin { HPD_NONE = 0, HPD_TV = HPD_NONE, /* TV is known to be unreliable */ @@ -472,121 +307,6 @@ struct i915_hotplug { I915_GEM_DOMAIN_INSTRUCTION | \ I915_GEM_DOMAIN_VERTEX) -#define for_each_pipe(__dev_priv, __p) \ - for ((__p) = 0; (__p) < INTEL_INFO(__dev_priv)->num_pipes; (__p)++) -#define for_each_pipe_masked(__dev_priv, __p, __mask) \ - for ((__p) = 0; (__p) < INTEL_INFO(__dev_priv)->num_pipes; (__p)++) \ - for_each_if ((__mask) & (1 << (__p))) -#define for_each_universal_plane(__dev_priv, __pipe, __p) \ - for ((__p) = 0; \ - (__p) < INTEL_INFO(__dev_priv)->num_sprites[(__pipe)] + 1; \ - (__p)++) -#define for_each_sprite(__dev_priv, __p, __s) \ - for ((__s) = 0; \ - (__s) < INTEL_INFO(__dev_priv)->num_sprites[(__p)]; \ - (__s)++) - -#define for_each_port_masked(__port, __ports_mask) \ - for ((__port) = PORT_A; (__port) < I915_MAX_PORTS; (__port)++) \ - for_each_if ((__ports_mask) & (1 << (__port))) - -#define for_each_crtc(dev, crtc) \ - list_for_each_entry(crtc, &(dev)->mode_config.crtc_list, head) - -#define for_each_intel_plane(dev, intel_plane) \ - list_for_each_entry(intel_plane, \ - &(dev)->mode_config.plane_list, \ - base.head) - -#define for_each_intel_plane_mask(dev, intel_plane, plane_mask) \ - list_for_each_entry(intel_plane, \ - &(dev)->mode_config.plane_list, \ - base.head) \ - for_each_if ((plane_mask) & \ - (1 << drm_plane_index(&intel_plane->base))) - -#define for_each_intel_plane_on_crtc(dev, intel_crtc, intel_plane) \ - list_for_each_entry(intel_plane, \ - &(dev)->mode_config.plane_list, \ - base.head) \ - for_each_if ((intel_plane)->pipe == (intel_crtc)->pipe) - -#define for_each_intel_crtc(dev, intel_crtc) \ - list_for_each_entry(intel_crtc, \ - &(dev)->mode_config.crtc_list, \ - base.head) - -#define for_each_intel_crtc_mask(dev, intel_crtc, crtc_mask) \ - list_for_each_entry(intel_crtc, \ - &(dev)->mode_config.crtc_list, \ - base.head) \ - for_each_if ((crtc_mask) & (1 << drm_crtc_index(&intel_crtc->base))) - -#define for_each_intel_encoder(dev, intel_encoder) \ - list_for_each_entry(intel_encoder, \ - &(dev)->mode_config.encoder_list, \ - base.head) - -#define for_each_intel_connector_iter(intel_connector, iter) \ - while ((intel_connector = to_intel_connector(drm_connector_list_iter_next(iter)))) - -#define for_each_encoder_on_crtc(dev, __crtc, intel_encoder) \ - list_for_each_entry((intel_encoder), &(dev)->mode_config.encoder_list, base.head) \ - for_each_if ((intel_encoder)->base.crtc == (__crtc)) - -#define for_each_connector_on_encoder(dev, __encoder, intel_connector) \ - list_for_each_entry((intel_connector), &(dev)->mode_config.connector_list, base.head) \ - for_each_if ((intel_connector)->base.encoder == (__encoder)) - -#define for_each_power_domain(domain, mask) \ - for ((domain) = 0; (domain) < POWER_DOMAIN_NUM; (domain)++) \ - for_each_if (BIT_ULL(domain) & (mask)) - -#define for_each_power_well(__dev_priv, __power_well) \ - for ((__power_well) = (__dev_priv)->power_domains.power_wells; \ - (__power_well) - (__dev_priv)->power_domains.power_wells < \ - (__dev_priv)->power_domains.power_well_count; \ - (__power_well)++) - -#define for_each_power_well_rev(__dev_priv, __power_well) \ - for ((__power_well) = (__dev_priv)->power_domains.power_wells + \ - (__dev_priv)->power_domains.power_well_count - 1; \ - (__power_well) - (__dev_priv)->power_domains.power_wells >= 0; \ - (__power_well)--) - -#define for_each_power_domain_well(__dev_priv, __power_well, __domain_mask) \ - 
for_each_power_well(__dev_priv, __power_well) \ - for_each_if ((__power_well)->domains & (__domain_mask)) - -#define for_each_power_domain_well_rev(__dev_priv, __power_well, __domain_mask) \ - for_each_power_well_rev(__dev_priv, __power_well) \ - for_each_if ((__power_well)->domains & (__domain_mask)) - -#define for_each_new_intel_plane_in_state(__state, plane, new_plane_state, __i) \ - for ((__i) = 0; \ - (__i) < (__state)->base.dev->mode_config.num_total_plane && \ - ((plane) = to_intel_plane((__state)->base.planes[__i].ptr), \ - (new_plane_state) = to_intel_plane_state((__state)->base.planes[__i].new_state), 1); \ - (__i)++) \ - for_each_if (plane) - -#define for_each_new_intel_crtc_in_state(__state, crtc, new_crtc_state, __i) \ - for ((__i) = 0; \ - (__i) < (__state)->base.dev->mode_config.num_crtc && \ - ((crtc) = to_intel_crtc((__state)->base.crtcs[__i].ptr), \ - (new_crtc_state) = to_intel_crtc_state((__state)->base.crtcs[__i].new_state), 1); \ - (__i)++) \ - for_each_if (crtc) - -#define for_each_oldnew_intel_plane_in_state(__state, plane, old_plane_state, new_plane_state, __i) \ - for ((__i) = 0; \ - (__i) < (__state)->base.dev->mode_config.num_total_plane && \ - ((plane) = to_intel_plane((__state)->base.planes[__i].ptr), \ - (old_plane_state) = to_intel_plane_state((__state)->base.planes[__i].old_state), \ - (new_plane_state) = to_intel_plane_state((__state)->base.planes[__i].new_state), 1); \ - (__i)++) \ - for_each_if (plane) - struct drm_i915_private; struct i915_mm_struct; struct i915_mmu_object; @@ -623,20 +343,6 @@ struct drm_i915_file_private { atomic_t context_bans; }; -/* Used by dp and fdi links */ -struct intel_link_m_n { - uint32_t tu; - uint32_t gmch_m; - uint32_t gmch_n; - uint32_t link_m; - uint32_t link_n; -}; - -void intel_link_compute_m_n(int bpp, int nlanes, - int pixel_clock, int link_clock, - struct intel_link_m_n *m_n, - bool reduce_m_n); - /* Interface history: * * 1.1: Original. 
@@ -651,27 +357,6 @@ void intel_link_compute_m_n(int bpp, int nlanes, #define DRIVER_MINOR 6 #define DRIVER_PATCHLEVEL 0 -struct opregion_header; -struct opregion_acpi; -struct opregion_swsci; -struct opregion_asle; - -struct intel_opregion { - struct opregion_header *header; - struct opregion_acpi *acpi; - struct opregion_swsci *swsci; - u32 swsci_gbda_sub_functions; - u32 swsci_sbcb_sub_functions; - struct opregion_asle *asle; - void *rvda; - void *vbt_firmware; - const void *vbt; - u32 vbt_size; - u32 *lid_state; - struct work_struct asle_work; -}; -#define OPREGION_SIZE (8*1024) - struct intel_overlay; struct intel_overlay_error_state; @@ -764,137 +449,6 @@ struct intel_csr { uint32_t allowed_dc_mask; }; -#define DEV_INFO_FOR_EACH_FLAG(func) \ - func(is_mobile); \ - func(is_lp); \ - func(is_alpha_support); \ - /* Keep has_* in alphabetical order */ \ - func(has_64bit_reloc); \ - func(has_aliasing_ppgtt); \ - func(has_csr); \ - func(has_ddi); \ - func(has_dp_mst); \ - func(has_reset_engine); \ - func(has_fbc); \ - func(has_fpga_dbg); \ - func(has_full_ppgtt); \ - func(has_full_48bit_ppgtt); \ - func(has_gmch_display); \ - func(has_guc); \ - func(has_guc_ct); \ - func(has_hotplug); \ - func(has_l3_dpf); \ - func(has_llc); \ - func(has_logical_ring_contexts); \ - func(has_logical_ring_preemption); \ - func(has_overlay); \ - func(has_pooled_eu); \ - func(has_psr); \ - func(has_rc6); \ - func(has_rc6p); \ - func(has_resource_streamer); \ - func(has_runtime_pm); \ - func(has_snoop); \ - func(unfenced_needs_alignment); \ - func(cursor_needs_physical); \ - func(hws_needs_physical); \ - func(overlay_needs_physical); \ - func(supports_tv); \ - func(has_ipc); - -struct sseu_dev_info { - u8 slice_mask; - u8 subslice_mask; - u8 eu_total; - u8 eu_per_subslice; - u8 min_eu_in_pool; - /* For each slice, which subslice(s) has(have) 7 EUs (bitfield)? 
*/ - u8 subslice_7eu[3]; - u8 has_slice_pg:1; - u8 has_subslice_pg:1; - u8 has_eu_pg:1; -}; - -static inline unsigned int sseu_subslice_total(const struct sseu_dev_info *sseu) -{ - return hweight8(sseu->slice_mask) * hweight8(sseu->subslice_mask); -} - -/* Keep in gen based order, and chronological order within a gen */ -enum intel_platform { - INTEL_PLATFORM_UNINITIALIZED = 0, - INTEL_I830, - INTEL_I845G, - INTEL_I85X, - INTEL_I865G, - INTEL_I915G, - INTEL_I915GM, - INTEL_I945G, - INTEL_I945GM, - INTEL_G33, - INTEL_PINEVIEW, - INTEL_I965G, - INTEL_I965GM, - INTEL_G45, - INTEL_GM45, - INTEL_IRONLAKE, - INTEL_SANDYBRIDGE, - INTEL_IVYBRIDGE, - INTEL_VALLEYVIEW, - INTEL_HASWELL, - INTEL_BROADWELL, - INTEL_CHERRYVIEW, - INTEL_SKYLAKE, - INTEL_BROXTON, - INTEL_KABYLAKE, - INTEL_GEMINILAKE, - INTEL_COFFEELAKE, - INTEL_CANNONLAKE, - INTEL_MAX_PLATFORMS -}; - -struct intel_device_info { - u16 device_id; - u16 gen_mask; - - u8 gen; - u8 gt; /* GT number, 0 if undefined */ - u8 num_rings; - u8 ring_mask; /* Rings supported by the HW */ - - enum intel_platform platform; - u32 platform_mask; - - u32 display_mmio_offset; - - u8 num_pipes; - u8 num_sprites[I915_MAX_PIPES]; - u8 num_scalers[I915_MAX_PIPES]; - - unsigned int page_sizes; /* page sizes supported by the HW */ - -#define DEFINE_FLAG(name) u8 name:1 - DEV_INFO_FOR_EACH_FLAG(DEFINE_FLAG); -#undef DEFINE_FLAG - u16 ddb_size; /* in blocks */ - - /* Register offsets for the various display pipes and transcoders */ - int pipe_offsets[I915_MAX_TRANSCODERS]; - int trans_offsets[I915_MAX_TRANSCODERS]; - int palette_offsets[I915_MAX_PIPES]; - int cursor_offsets[I915_MAX_PIPES]; - - /* Slice/subslice/EU info */ - struct sseu_dev_info sseu; - - u32 cs_timestamp_frequency_khz; - - struct color_luts { - u16 degamma_lut_size; - u16 gamma_lut_size; - } color; -}; - struct intel_display_error_state; struct i915_gpu_state { @@ -948,6 +502,7 @@ struct i915_gpu_state { struct drm_i915_error_engine { int engine_id; /* Software tracked state */ + bool idle; bool waiting; int num_waiters; unsigned long hangcheck_timestamp; @@ -2405,6 +1960,9 @@ struct drm_i915_private { */ struct workqueue_struct *wq; + /* ordered wq for modesets */ + struct workqueue_struct *modeset_wq; + /* Display functions */ struct drm_i915_display_funcs display; @@ -4111,41 +3669,6 @@ bool intel_bios_is_port_hpd_inverted(struct drm_i915_private *dev_priv, bool intel_bios_is_lspcon_present(struct drm_i915_private *dev_priv, enum port port); - -/* intel_opregion.c */ -#ifdef CONFIG_ACPI -extern int intel_opregion_setup(struct drm_i915_private *dev_priv); -extern void intel_opregion_register(struct drm_i915_private *dev_priv); -extern void intel_opregion_unregister(struct drm_i915_private *dev_priv); -extern void intel_opregion_asle_intr(struct drm_i915_private *dev_priv); -extern int intel_opregion_notify_encoder(struct intel_encoder *intel_encoder, - bool enable); -extern int intel_opregion_notify_adapter(struct drm_i915_private *dev_priv, - pci_power_t state); -extern int intel_opregion_get_panel_type(struct drm_i915_private *dev_priv); -#else -static inline int intel_opregion_setup(struct drm_i915_private *dev) { return 0; } -static inline void intel_opregion_register(struct drm_i915_private *dev_priv) { } -static inline void intel_opregion_unregister(struct drm_i915_private *dev_priv) { } -static inline void intel_opregion_asle_intr(struct drm_i915_private *dev_priv) -{ -} -static inline int -intel_opregion_notify_encoder(struct intel_encoder *intel_encoder, bool enable) -{ - return 0; -} 
-static inline int
-intel_opregion_notify_adapter(struct drm_i915_private *dev, pci_power_t state)
-{
-	return 0;
-}
-static inline int intel_opregion_get_panel_type(struct drm_i915_private *dev)
-{
-	return -ENODEV;
-}
-#endif
-
 /* intel_acpi.c */
 #ifdef CONFIG_ACPI
 extern void intel_register_dsm_handler(void);
@@ -4162,10 +3685,6 @@ mkwrite_device_info(struct drm_i915_private *dev_priv)
 	return (struct intel_device_info *)&dev_priv->info;
 }

-const char *intel_platform_name(enum intel_platform platform);
-void intel_device_info_runtime_init(struct drm_i915_private *dev_priv);
-void intel_device_info_dump(struct drm_i915_private *dev_priv);
-
 /* modesetting */
 extern void intel_modeset_init_hw(struct drm_device *dev);
 extern int intel_modeset_init(struct drm_device *dev);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 4a7f5579a7a5..ba9f67c256f4 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2596,7 +2596,7 @@ static int ____i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
 	}

 	err = obj->ops->get_pages(obj);
-	GEM_BUG_ON(!err && IS_ERR_OR_NULL(obj->mm.pages));
+	GEM_BUG_ON(!err && !i915_gem_object_has_pages(obj));

 	return err;
 }
@@ -3089,7 +3089,12 @@ i915_gem_reset_request(struct intel_engine_cs *engine,
 void i915_gem_reset_engine(struct intel_engine_cs *engine,
 			   struct drm_i915_gem_request *request)
 {
-	engine->irq_posted = 0;
+	/*
+	 * Make sure this write is visible before we re-enable the interrupt
+	 * handlers on another CPU, as tasklet_enable() resolves to just
+	 * a compiler barrier which is insufficient for our purpose here.
+	 */
+	smp_store_mb(engine->irq_posted, 0);

 	if (request)
 		request = i915_gem_reset_request(engine, request);
@@ -3119,6 +3124,25 @@ void i915_gem_reset(struct drm_i915_private *dev_priv)
 		ctx = fetch_and_zero(&engine->last_retired_context);
 		if (ctx)
 			engine->context_unpin(engine, ctx);
+
+		/*
+		 * Ostensibly, we always want a context loaded for powersaving,
+		 * so if the engine is idle after the reset, send a request
+		 * to load our scratch kernel_context.
+		 *
+		 * More mysteriously, if we leave the engine idle after a reset,
+		 * the next userspace batch may hang, with what appears to be
+		 * an incoherent read by the CS (presumably stale TLB). An
+		 * empty request appears sufficient to paper over the glitch.
+		 */
+		if (list_empty(&engine->timeline->requests)) {
+			struct drm_i915_gem_request *rq;
+
+			rq = i915_gem_request_alloc(engine,
+						    dev_priv->kernel_context);
+			if (!IS_ERR(rq))
+				__i915_add_request(rq, false);
+		}
 	}

 	i915_gem_restore_fences(dev_priv);
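The smp_store_mb() change above matters because tasklet_enable() is only a compiler barrier: the cleared irq_posted must be globally visible before interrupt handling resumes on another CPU. As a loose analogy in portable C11 (not the kernel primitive itself), the difference is between a relaxed store and one ordered before everything that follows:

/* Sketch: order a flag clear before later "re-enable" work; C11
 * seq_cst stands in for the kernel's smp_store_mb() store+barrier. */
#include <stdatomic.h>

static atomic_int irq_posted;

static void clear_then_reenable(void)
{
	/* A relaxed store may become visible only after later work:
	 * atomic_store_explicit(&irq_posted, 0, memory_order_relaxed); */

	/* A fully ordered store is visible before anything that follows,
	 * which is the guarantee the reset path needs. */
	atomic_store_explicit(&irq_posted, 0, memory_order_seq_cst);
}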
diff --git a/drivers/gpu/drm/i915/i915_gem_internal.c b/drivers/gpu/drm/i915/i915_gem_internal.c
index ee83ec838ee7..a1d6956734f7 100644
--- a/drivers/gpu/drm/i915/i915_gem_internal.c
+++ b/drivers/gpu/drm/i915/i915_gem_internal.c
@@ -27,6 +27,7 @@
 #include "i915_drv.h"

 #define QUIET (__GFP_NORETRY | __GFP_NOWARN)
+#define MAYFAIL (__GFP_RETRY_MAYFAIL | __GFP_NOWARN)

 /* convert swiotlb segment size into sensible units (pages)! */
 #define IO_TLB_SEGPAGES (IO_TLB_SEGSIZE << IO_TLB_SHIFT >> PAGE_SHIFT)
@@ -95,7 +96,8 @@ create_st:
 			struct page *page;

 			do {
-				page = alloc_pages(gfp | (order ? QUIET : 0), order);
+				page = alloc_pages(gfp | (order ? QUIET : MAYFAIL),
+						   order);
 				if (page)
 					break;
 				if (!order--)
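The allocation loop above lets high-order attempts fail fast (QUIET) and gives only the final order-0 attempt a harder, but still failable, retry policy (MAYFAIL), so an optional allocation never invokes the OOM killer. A sketch of the descending-order backoff, with malloc standing in for alloc_pages:

/* Sketch: try large chunks with a "don't try hard" policy, step down
 * on failure, and let only the last attempt retry (but still fail
 * rather than trigger the OOM killer). */
#include <stdlib.h>

static void *alloc_order(unsigned int order, int try_hard)
{
	(void)try_hard;	/* stands in for gfp flags like __GFP_RETRY_MAYFAIL */
	return malloc(4096u << order);
}

static void *alloc_backoff(unsigned int max_order)
{
	unsigned int order = max_order;

	for (;;) {
		void *p = alloc_order(order, order == 0);
		if (p)
			return p;
		if (order == 0)
			return NULL;	/* give up instead of OOM */
		order--;
	}
}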
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 59f023bb7015..d575109f7a7f 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -479,6 +479,7 @@ void __i915_gem_request_submit(struct drm_i915_gem_request *request)
 	/* Transfer from per-context onto the global per-engine timeline */
 	timeline = engine->timeline;
 	GEM_BUG_ON(timeline == request->timeline);
+	GEM_BUG_ON(request->global_seqno);

 	seqno = timeline_get_seqno(timeline);
 	GEM_BUG_ON(!seqno);
@@ -525,6 +526,7 @@ void __i915_gem_request_unsubmit(struct drm_i915_gem_request *request)
 	/* Only unwind in reverse order, required so that the per-context list
 	 * is kept in seqno/ring order.
 	 */
+	GEM_BUG_ON(!request->global_seqno);
 	GEM_BUG_ON(request->global_seqno != engine->timeline->seqno);

 	engine->timeline->seqno--;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index aba50aa613f1..944059322daa 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -416,6 +416,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
 	int n;

 	err_printf(m, "%s command stream:\n", engine_str(ee->engine_id));
+	err_printf(m, "  IDLE?: %s\n", yesno(ee->idle));
 	err_printf(m, "  START: 0x%08x\n", ee->start);
 	err_printf(m, "  HEAD:  0x%08x [0x%08x]\n", ee->head, ee->rq_head);
 	err_printf(m, "  TAIL:  0x%08x [0x%08x, 0x%08x]\n",
@@ -564,34 +565,17 @@ static void print_error_obj(struct drm_i915_error_state_buf *m,
 static void err_print_capabilities(struct drm_i915_error_state_buf *m,
 				   const struct intel_device_info *info)
 {
-#define PRINT_FLAG(x)  err_printf(m, #x ": %s\n", yesno(info->x))
-	DEV_INFO_FOR_EACH_FLAG(PRINT_FLAG);
-#undef PRINT_FLAG
-}
+	struct drm_printer p = i915_error_printer(m);

-static __always_inline void err_print_param(struct drm_i915_error_state_buf *m,
-					    const char *name,
-					    const char *type,
-					    const void *x)
-{
-	if (!__builtin_strcmp(type, "bool"))
-		err_printf(m, "i915.%s=%s\n", name, yesno(*(const bool *)x));
-	else if (!__builtin_strcmp(type, "int"))
-		err_printf(m, "i915.%s=%d\n", name, *(const int *)x);
-	else if (!__builtin_strcmp(type, "unsigned int"))
-		err_printf(m, "i915.%s=%u\n", name, *(const unsigned int *)x);
-	else if (!__builtin_strcmp(type, "char *"))
-		err_printf(m, "i915.%s=%s\n", name, *(const char **)x);
-	else
-		BUILD_BUG();
+	intel_device_info_dump_flags(info, &p);
 }

 static void err_print_params(struct drm_i915_error_state_buf *m,
-			     const struct i915_params *p)
+			     const struct i915_params *params)
 {
-#define PRINT(T, x, ...) err_print_param(m, #x, #T, &p->x);
-	I915_PARAMS_FOR_EACH(PRINT);
-#undef PRINT
+	struct drm_printer p = i915_error_printer(m);
+
+	i915_params_dump(params, &p);
 }

 static void err_print_pciid(struct drm_i915_error_state_buf *m,
@@ -1256,6 +1240,7 @@ static void error_record_engine_registers(struct i915_gpu_state *error,
 		ee->hws = I915_READ(mmio);
 	}

+	ee->idle = intel_engine_is_idle(engine);
 	ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
 	ee->hangcheck_action = engine->hangcheck.action;
 	ee->hangcheck_stalled = engine->hangcheck.stalled;
diff --git a/drivers/gpu/drm/i915/i915_memcpy.c b/drivers/gpu/drm/i915/i915_memcpy.c
index 49a079494b68..79f8ec756362 100644
--- a/drivers/gpu/drm/i915/i915_memcpy.c
+++ b/drivers/gpu/drm/i915/i915_memcpy.c
@@ -96,6 +96,11 @@ bool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len)

 void i915_memcpy_init_early(struct drm_i915_private *dev_priv)
 {
-	if (static_cpu_has(X86_FEATURE_XMM4_1))
+	/*
+	 * Some hypervisors (e.g. KVM) don't support VEX-prefix instructions
+	 * emulation. So don't enable movntdqa in hypervisor guest.
+	 */
+	if (static_cpu_has(X86_FEATURE_XMM4_1) &&
+	    !boot_cpu_has(X86_FEATURE_HYPERVISOR))
 		static_branch_enable(&has_movntdqa);
 }
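The removed err_print_param() above and the _print_param() helper that this series adds to i915_params.c share one trick: dispatching on a stringified type name with __builtin_strcmp, which the compiler folds to a constant so dead branches disappear and an unknown type breaks the build. A standalone sketch of that compile-time dispatch (the link-error fallback stands in for the kernel's BUILD_BUG() and relies on the compiler folding the comparisons):

/* Sketch: compile-time dispatch on a stringified type; GCC/Clang fold
 * __builtin_strcmp of literals, so only one branch survives and an
 * unhandled type leaves an unresolved reference at link time. */
#include <stdio.h>

extern void unknown_param_type(void);	/* link error if referenced */

#define PRINT_PARAM(T, v) \
	do { \
		if (!__builtin_strcmp(#T, "int")) \
			printf("%s=%d\n", #v, *(const int *)&(v)); \
		else if (!__builtin_strcmp(#T, "unsigned int")) \
			printf("%s=%u\n", #v, *(const unsigned int *)&(v)); \
		else \
			unknown_param_type(); \
	} while (0)

int main(void)
{
	int enable_guc = 1;
	unsigned int mmio_debug = 0;

	PRINT_PARAM(int, enable_guc);		/* prints enable_guc=1 */
	PRINT_PARAM(unsigned int, mmio_debug);	/* prints mmio_debug=0 */
	return 0;
}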
diff --git a/drivers/gpu/drm/i915/i915_params.c b/drivers/gpu/drm/i915/i915_params.c
index 8dfea0320c2f..b5f3eb4fa8a3 100644
--- a/drivers/gpu/drm/i915/i915_params.c
+++ b/drivers/gpu/drm/i915/i915_params.c
@@ -22,6 +22,8 @@
  * IN THE SOFTWARE.
  */

+#include <drm/drm_print.h>
+
 #include "i915_params.h"
 #include "i915_drv.h"

@@ -172,3 +174,34 @@ i915_param_named(enable_dpcd_backlight, bool, 0600,

 i915_param_named(enable_gvt, bool, 0400,
 	"Enable support for Intel GVT-g graphics virtualization host support(default:false)");
+
+static __always_inline void _print_param(struct drm_printer *p,
+					 const char *name,
+					 const char *type,
+					 const void *x)
+{
+	if (!__builtin_strcmp(type, "bool"))
+		drm_printf(p, "i915.%s=%s\n", name, yesno(*(const bool *)x));
+	else if (!__builtin_strcmp(type, "int"))
+		drm_printf(p, "i915.%s=%d\n", name, *(const int *)x);
+	else if (!__builtin_strcmp(type, "unsigned int"))
+		drm_printf(p, "i915.%s=%u\n", name, *(const unsigned int *)x);
+	else if (!__builtin_strcmp(type, "char *"))
+		drm_printf(p, "i915.%s=%s\n", name, *(const char **)x);
+	else
+		BUILD_BUG();
+}
+
+/**
+ * i915_params_dump - dump i915 modparams
+ * @params: i915 modparams
+ * @p: the &drm_printer
+ *
+ * Pretty printer for i915 modparams.
+ */
+void i915_params_dump(const struct i915_params *params, struct drm_printer *p)
+{
+#define PRINT(T, x, ...) _print_param(p, #x, #T, &params->x);
+	I915_PARAMS_FOR_EACH(PRINT);
+#undef PRINT
+}
diff --git a/drivers/gpu/drm/i915/i915_params.h b/drivers/gpu/drm/i915/i915_params.h
index 792ce26d7449..c96360398072 100644
--- a/drivers/gpu/drm/i915/i915_params.h
+++ b/drivers/gpu/drm/i915/i915_params.h
@@ -28,6 +28,8 @@
 #include <linux/bitops.h>
 #include <linux/cache.h> /* for __read_mostly */

+struct drm_printer;
+
 #define ENABLE_GUC_SUBMISSION		BIT(0)
 #define ENABLE_GUC_LOAD_HUC		BIT(1)

@@ -77,5 +79,7 @@ struct i915_params {

 extern struct i915_params i915_modparams __read_mostly;

+void i915_params_dump(const struct i915_params *params, struct drm_printer *p);
+
 #endif
diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
index fa67d3dde20e..36d48422b475 100644
--- a/drivers/gpu/drm/i915/i915_pci.c
+++ b/drivers/gpu/drm/i915/i915_pci.c
@@ -633,6 +633,8 @@ static const struct pci_device_id pciidlist[] = {
 	INTEL_CFL_S_GT1_IDS(&intel_coffeelake_gt1_info),
 	INTEL_CFL_S_GT2_IDS(&intel_coffeelake_gt2_info),
 	INTEL_CFL_H_GT2_IDS(&intel_coffeelake_gt2_info),
+	INTEL_CFL_U_GT1_IDS(&intel_coffeelake_gt1_info),
+	INTEL_CFL_U_GT2_IDS(&intel_coffeelake_gt2_info),
 	INTEL_CFL_U_GT3_IDS(&intel_coffeelake_gt3_info),
 	INTEL_CNL_U_GT2_IDS(&intel_cannonlake_gt2_info),
 	INTEL_CNL_Y_GT2_IDS(&intel_cannonlake_gt2_info),
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 09bf043c1c2e..41285bec8fc0 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -3278,6 +3278,7 @@ enum i915_power_well_id {
 # define AUDUNIT_CLOCK_GATE_DISABLE		(1 << 26) /* 965 */
 # define DPUNIT_A_CLOCK_GATE_DISABLE		(1 << 25) /* 965 */
 # define DPCUNIT_CLOCK_GATE_DISABLE		(1 << 24) /* 965 */
+# define PNV_GMBUSUNIT_CLOCK_GATE_DISABLE	(1 << 24) /* pnv */
 # define TVRUNIT_CLOCK_GATE_DISABLE		(1 << 23) /* 915-945 */
 # define TVCUNIT_CLOCK_GATE_DISABLE		(1 << 22) /* 915-945 */
 # define TVFUNIT_CLOCK_GATE_DISABLE		(1 << 21) /* 915-945 */
@@ -3858,6 +3859,9 @@ enum {
 #define   PWM2_GATING_DIS		(1 << 14)
 #define   PWM1_GATING_DIS		(1 << 13)

+#define GEN9_CLKGATE_DIS_4		_MMIO(0x4653C)
+#define   BXT_GMBUS_GATING_DIS		(1 << 14)
+
 #define _CLKGATE_DIS_PSL_A		0x46520
 #define _CLKGATE_DIS_PSL_B		0x46524
 #define _CLKGATE_DIS_PSL_C		0x46528
@@ -3875,6 +3879,9 @@ enum {
 #define  SARBUNIT_CLKGATE_DIS		(1 << 5)
 #define  RCCUNIT_CLKGATE_DIS		(1 << 7)

+#define UNSLICE_UNIT_LEVEL_CLKGATE	_MMIO(0x9434)
+#define  VFUNIT_CLKGATE_DIS		(1 << 20)
+
 /*
  * Display engine regs
  */
@@ -6329,6 +6336,7 @@ enum {
 #define   PLANE_CTL_TILED_X			(  1 << 10)
 #define   PLANE_CTL_TILED_Y			(  4 << 10)
 #define   PLANE_CTL_TILED_YF			(  5 << 10)
+#define   PLANE_CTL_FLIP_HORIZONTAL		(  1 << 8)
 #define   PLANE_CTL_ALPHA_MASK			(0x3 << 4) /* Pre-GLK */
 #define   PLANE_CTL_ALPHA_DISABLE		(  0 << 4)
 #define   PLANE_CTL_ALPHA_SW_PREMULTIPLY	(  2 << 4)
@@ -7552,6 +7560,7 @@ enum {
 #define FDI_RX_CHICKEN(pipe)	_MMIO_PIPE(pipe, _FDI_RXA_CHICKEN, _FDI_RXB_CHICKEN)

 #define SOUTH_DSPCLK_GATE_D	_MMIO(0xc2020)
+#define  PCH_GMBUSUNIT_CLOCK_GATE_DISABLE (1<<31)
 #define  PCH_DPLUNIT_CLOCK_GATE_DISABLE (1<<30)
 #define  PCH_DPLSUNIT_CLOCK_GATE_DISABLE (1<<29)
 #define  PCH_CPUNIT_CLOCK_GATE_DISABLE (1<<14)
@@ -8142,6 +8151,7 @@ enum {
 #define   PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE	(1<<8)
 #define   STALL_DOP_GATING_DISABLE		(1<<5)
 #define   THROTTLE_12_5				(7<<2)
+#define   DISABLE_EARLY_EOT			(1<<1)

 #define GEN7_ROW_CHICKEN2		_MMIO(0xe4f4)
 #define GEN7_ROW_CHICKEN2_GT2		_MMIO(0xf4f4)
diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h
index 4e76768ffa95..e1169c02eb2b 100644
--- a/drivers/gpu/drm/i915/i915_trace.h +++ b/drivers/gpu/drm/i915/i915_trace.h @@ -616,6 +616,7 @@ TRACE_EVENT(i915_gem_request_queue, TP_STRUCT__entry( __field(u32, dev) + __field(u32, hw_id) __field(u32, ring) __field(u32, ctx) __field(u32, seqno) @@ -624,15 +625,16 @@ TRACE_EVENT(i915_gem_request_queue, TP_fast_assign( __entry->dev = req->i915->drm.primary->index; + __entry->hw_id = req->ctx->hw_id; __entry->ring = req->engine->id; __entry->ctx = req->fence.context; __entry->seqno = req->fence.seqno; __entry->flags = flags; ), - TP_printk("dev=%u, ring=%u, ctx=%u, seqno=%u, flags=0x%x", - __entry->dev, __entry->ring, __entry->ctx, __entry->seqno, - __entry->flags) + TP_printk("dev=%u, hw_id=%u, ring=%u, ctx=%u, seqno=%u, flags=0x%x", + __entry->dev, __entry->hw_id, __entry->ring, __entry->ctx, + __entry->seqno, __entry->flags) ); DECLARE_EVENT_CLASS(i915_gem_request, @@ -641,23 +643,25 @@ DECLARE_EVENT_CLASS(i915_gem_request, TP_STRUCT__entry( __field(u32, dev) - __field(u32, ctx) + __field(u32, hw_id) __field(u32, ring) + __field(u32, ctx) __field(u32, seqno) __field(u32, global) ), TP_fast_assign( __entry->dev = req->i915->drm.primary->index; + __entry->hw_id = req->ctx->hw_id; __entry->ring = req->engine->id; __entry->ctx = req->fence.context; __entry->seqno = req->fence.seqno; __entry->global = req->global_seqno; ), - TP_printk("dev=%u, ring=%u, ctx=%u, seqno=%u, global=%u", - __entry->dev, __entry->ring, __entry->ctx, __entry->seqno, - __entry->global) + TP_printk("dev=%u, hw_id=%u, ring=%u, ctx=%u, seqno=%u, global=%u", + __entry->dev, __entry->hw_id, __entry->ring, __entry->ctx, + __entry->seqno, __entry->global) ); DEFINE_EVENT(i915_gem_request, i915_gem_request_add, @@ -683,15 +687,17 @@ DECLARE_EVENT_CLASS(i915_gem_request_hw, TP_STRUCT__entry( __field(u32, dev) + __field(u32, hw_id) __field(u32, ring) + __field(u32, ctx) __field(u32, seqno) __field(u32, global_seqno) - __field(u32, ctx) __field(u32, port) ), TP_fast_assign( __entry->dev = req->i915->drm.primary->index; + __entry->hw_id = req->ctx->hw_id; __entry->ring = req->engine->id; __entry->ctx = req->fence.context; __entry->seqno = req->fence.seqno; @@ -699,10 +705,10 @@ DECLARE_EVENT_CLASS(i915_gem_request_hw, __entry->port = port; ), - TP_printk("dev=%u, ring=%u, ctx=%u, seqno=%u, global=%u, port=%u", - __entry->dev, __entry->ring, __entry->ctx, - __entry->seqno, __entry->global_seqno, - __entry->port) + TP_printk("dev=%u, hw_id=%u, ring=%u, ctx=%u, seqno=%u, global=%u, port=%u", + __entry->dev, __entry->hw_id, __entry->ring, + __entry->ctx, __entry->seqno, + __entry->global_seqno, __entry->port) ); DEFINE_EVENT(i915_gem_request_hw, i915_gem_request_in, @@ -772,6 +778,7 @@ TRACE_EVENT(i915_gem_request_wait_begin, TP_STRUCT__entry( __field(u32, dev) + __field(u32, hw_id) __field(u32, ring) __field(u32, ctx) __field(u32, seqno) @@ -787,6 +794,7 @@ TRACE_EVENT(i915_gem_request_wait_begin, */ TP_fast_assign( __entry->dev = req->i915->drm.primary->index; + __entry->hw_id = req->ctx->hw_id; __entry->ring = req->engine->id; __entry->ctx = req->fence.context; __entry->seqno = req->fence.seqno; @@ -794,10 +802,10 @@ TRACE_EVENT(i915_gem_request_wait_begin, __entry->flags = flags; ), - TP_printk("dev=%u, ring=%u, ctx=%u, seqno=%u, global=%u, blocking=%u, flags=0x%x", - __entry->dev, __entry->ring, __entry->ctx, __entry->seqno, - __entry->global, !!(__entry->flags & I915_WAIT_LOCKED), - __entry->flags) + TP_printk("dev=%u, hw_id=%u, ring=%u, ctx=%u, seqno=%u, global=%u, blocking=%u, flags=0x%x", + __entry->dev, __entry->hw_id, 
__entry->ring, __entry->ctx, + __entry->seqno, __entry->global, + !!(__entry->flags & I915_WAIT_LOCKED), __entry->flags) ); DEFINE_EVENT(i915_gem_request, i915_gem_request_wait_end, diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h index 8d07764887ec..51dbfe5bb418 100644 --- a/drivers/gpu/drm/i915/i915_utils.h +++ b/drivers/gpu/drm/i915/i915_utils.h @@ -140,4 +140,19 @@ static inline void drain_delayed_work(struct delayed_work *dw) } while (delayed_work_pending(dw)); } +static inline const char *yesno(bool v) +{ + return v ? "yes" : "no"; +} + +static inline const char *onoff(bool v) +{ + return v ? "on" : "off"; +} + +static inline const char *enableddisabled(bool v) +{ + return v ? "enabled" : "disabled"; +} + #endif /* !__I915_UTILS_H */ diff --git a/drivers/gpu/drm/i915/intel_ddi.c b/drivers/gpu/drm/i915/intel_ddi.c index 369f780588fb..f51645a08dca 100644 --- a/drivers/gpu/drm/i915/intel_ddi.c +++ b/drivers/gpu/drm/i915/intel_ddi.c @@ -2095,6 +2095,8 @@ static void intel_ddi_clk_select(struct intel_encoder *encoder, if (WARN_ON(!pll)) return; + mutex_lock(&dev_priv->dpll_lock); + if (IS_CANNONLAKE(dev_priv)) { /* Configure DPCLKA_CFGCR0 to map the DPLL to the DDI. */ val = I915_READ(DPCLKA_CFGCR0); @@ -2115,7 +2117,7 @@ static void intel_ddi_clk_select(struct intel_encoder *encoder, val = I915_READ(DPLL_CTRL2); val &= ~(DPLL_CTRL2_DDI_CLK_OFF(port) | - DPLL_CTRL2_DDI_CLK_SEL_MASK(port)); + DPLL_CTRL2_DDI_CLK_SEL_MASK(port)); val |= (DPLL_CTRL2_DDI_CLK_SEL(pll->id, port) | DPLL_CTRL2_DDI_SEL_OVERRIDE(port)); @@ -2124,6 +2126,8 @@ static void intel_ddi_clk_select(struct intel_encoder *encoder, } else if (INTEL_INFO(dev_priv)->gen < 9) { I915_WRITE(PORT_CLK_SEL(port), hsw_pll_to_ddi_pll_sel(pll)); } + + mutex_unlock(&dev_priv->dpll_lock); } static void intel_ddi_clk_disable(struct intel_encoder *encoder) diff --git a/drivers/gpu/drm/i915/intel_device_info.c b/drivers/gpu/drm/i915/intel_device_info.c index f478be3ae0ba..d28592e43512 100644 --- a/drivers/gpu/drm/i915/intel_device_info.c +++ b/drivers/gpu/drm/i915/intel_device_info.c @@ -22,6 +22,9 @@ * */ +#include <drm/drm_print.h> + +#include "intel_device_info.h" #include "i915_drv.h" #define PLATFORM_NAME(x) [INTEL_##x] = #x @@ -67,21 +70,55 @@ const char *intel_platform_name(enum intel_platform platform) return platform_names[platform]; } -void intel_device_info_dump(struct drm_i915_private *dev_priv) +void intel_device_info_dump_flags(const struct intel_device_info *info, + struct drm_printer *p) { - const struct intel_device_info *info = &dev_priv->info; - - DRM_DEBUG_DRIVER("i915 device info: platform=%s gen=%i pciid=0x%04x rev=0x%02x", - intel_platform_name(info->platform), - info->gen, - dev_priv->drm.pdev->device, - dev_priv->drm.pdev->revision); -#define PRINT_FLAG(name) \ - DRM_DEBUG_DRIVER("i915 device info: " #name ": %s", yesno(info->name)) +#define PRINT_FLAG(name) drm_printf(p, "%s: %s\n", #name, yesno(info->name)); DEV_INFO_FOR_EACH_FLAG(PRINT_FLAG); #undef PRINT_FLAG } +static void sseu_dump(const struct sseu_dev_info *sseu, struct drm_printer *p) +{ + drm_printf(p, "slice mask: %04x\n", sseu->slice_mask); + drm_printf(p, "slice total: %u\n", hweight8(sseu->slice_mask)); + drm_printf(p, "subslice total: %u\n", sseu_subslice_total(sseu)); + drm_printf(p, "subslice mask %04x\n", sseu->subslice_mask); + drm_printf(p, "subslice per slice: %u\n", + hweight8(sseu->subslice_mask)); + drm_printf(p, "EU total: %u\n", sseu->eu_total); + drm_printf(p, "EU per subslice: %u\n", sseu->eu_per_subslice); 
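/*
 * Illustration (not part of the patch): taking a struct drm_printer
 * instead of hard-coding DRM_DEBUG_DRIVER() lets one dump routine feed
 * dmesg, debugfs or a captured error state. Assuming the drm_print.h
 * constructors, a dmesg caller would look like:
 *
 *	struct drm_printer p = drm_debug_printer(__func__);
 *	intel_device_info_dump_runtime(info, &p);
 *
 * while a debugfs entry would build its printer with
 * drm_seq_file_printer(m) and pass the same info pointer.
 */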
+ drm_printf(p, "has slice power gating: %s\n", + yesno(sseu->has_slice_pg)); + drm_printf(p, "has subslice power gating: %s\n", + yesno(sseu->has_subslice_pg)); + drm_printf(p, "has EU power gating: %s\n", yesno(sseu->has_eu_pg)); +} + +void intel_device_info_dump_runtime(const struct intel_device_info *info, + struct drm_printer *p) +{ + sseu_dump(&info->sseu, p); + + drm_printf(p, "CS timestamp frequency: %u kHz\n", + info->cs_timestamp_frequency_khz); +} + +void intel_device_info_dump(const struct intel_device_info *info, + struct drm_printer *p) +{ + struct drm_i915_private *dev_priv = + container_of(info, struct drm_i915_private, info); + + drm_printf(p, "pciid=0x%04x rev=0x%02x platform=%s gen=%i\n", + INTEL_DEVID(dev_priv), + INTEL_REVID(dev_priv), + intel_platform_name(info->platform), + info->gen); + + intel_device_info_dump_flags(info, p); +} + static void gen10_sseu_info_init(struct drm_i915_private *dev_priv) { struct sseu_dev_info *sseu = &mkwrite_device_info(dev_priv)->sseu; @@ -420,7 +457,10 @@ static u32 read_timestamp_frequency(struct drm_i915_private *dev_priv) return 0; } -/* +/** + * intel_device_info_runtime_init - initialize runtime info + * @info: intel device info struct + * * Determine various intel_device_info fields at runtime. * * Use it when either: @@ -433,9 +473,10 @@ static u32 read_timestamp_frequency(struct drm_i915_private *dev_priv) * - after the PCH has been detected, * - before the first usage of the fields it can tweak. */ -void intel_device_info_runtime_init(struct drm_i915_private *dev_priv) +void intel_device_info_runtime_init(struct intel_device_info *info) { - struct intel_device_info *info = mkwrite_device_info(dev_priv); + struct drm_i915_private *dev_priv = + container_of(info, struct drm_i915_private, info); enum pipe pipe; if (INTEL_GEN(dev_priv) >= 10) { @@ -543,22 +584,4 @@ void intel_device_info_runtime_init(struct drm_i915_private *dev_priv) /* Initialize command stream timestamp frequency */ info->cs_timestamp_frequency_khz = read_timestamp_frequency(dev_priv); - - DRM_DEBUG_DRIVER("slice mask: %04x\n", info->sseu.slice_mask); - DRM_DEBUG_DRIVER("slice total: %u\n", hweight8(info->sseu.slice_mask)); - DRM_DEBUG_DRIVER("subslice total: %u\n", - sseu_subslice_total(&info->sseu)); - DRM_DEBUG_DRIVER("subslice mask %04x\n", info->sseu.subslice_mask); - DRM_DEBUG_DRIVER("subslice per slice: %u\n", - hweight8(info->sseu.subslice_mask)); - DRM_DEBUG_DRIVER("EU total: %u\n", info->sseu.eu_total); - DRM_DEBUG_DRIVER("EU per subslice: %u\n", info->sseu.eu_per_subslice); - DRM_DEBUG_DRIVER("has slice power gating: %s\n", - info->sseu.has_slice_pg ? "y" : "n"); - DRM_DEBUG_DRIVER("has subslice power gating: %s\n", - info->sseu.has_subslice_pg ? "y" : "n"); - DRM_DEBUG_DRIVER("has EU power gating: %s\n", - info->sseu.has_eu_pg ? 
"y" : "n"); - DRM_DEBUG_DRIVER("CS timestamp frequency: %u kHz\n", - info->cs_timestamp_frequency_khz); } diff --git a/drivers/gpu/drm/i915/intel_device_info.h b/drivers/gpu/drm/i915/intel_device_info.h new file mode 100644 index 000000000000..49cb27bd04c1 --- /dev/null +++ b/drivers/gpu/drm/i915/intel_device_info.h @@ -0,0 +1,183 @@ +/* + * Copyright © 2014-2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#ifndef _INTEL_DEVICE_INFO_H_ +#define _INTEL_DEVICE_INFO_H_ + +#include "intel_display.h" + +struct drm_printer; +struct drm_i915_private; + +/* Keep in gen based order, and chronological order within a gen */ +enum intel_platform { + INTEL_PLATFORM_UNINITIALIZED = 0, + /* gen2 */ + INTEL_I830, + INTEL_I845G, + INTEL_I85X, + INTEL_I865G, + /* gen3 */ + INTEL_I915G, + INTEL_I915GM, + INTEL_I945G, + INTEL_I945GM, + INTEL_G33, + INTEL_PINEVIEW, + /* gen4 */ + INTEL_I965G, + INTEL_I965GM, + INTEL_G45, + INTEL_GM45, + /* gen5 */ + INTEL_IRONLAKE, + /* gen6 */ + INTEL_SANDYBRIDGE, + /* gen7 */ + INTEL_IVYBRIDGE, + INTEL_VALLEYVIEW, + INTEL_HASWELL, + /* gen8 */ + INTEL_BROADWELL, + INTEL_CHERRYVIEW, + /* gen9 */ + INTEL_SKYLAKE, + INTEL_BROXTON, + INTEL_KABYLAKE, + INTEL_GEMINILAKE, + INTEL_COFFEELAKE, + /* gen10 */ + INTEL_CANNONLAKE, + INTEL_MAX_PLATFORMS +}; + +#define DEV_INFO_FOR_EACH_FLAG(func) \ + func(is_mobile); \ + func(is_lp); \ + func(is_alpha_support); \ + /* Keep has_* in alphabetical order */ \ + func(has_64bit_reloc); \ + func(has_aliasing_ppgtt); \ + func(has_csr); \ + func(has_ddi); \ + func(has_dp_mst); \ + func(has_reset_engine); \ + func(has_fbc); \ + func(has_fpga_dbg); \ + func(has_full_ppgtt); \ + func(has_full_48bit_ppgtt); \ + func(has_gmch_display); \ + func(has_guc); \ + func(has_guc_ct); \ + func(has_hotplug); \ + func(has_l3_dpf); \ + func(has_llc); \ + func(has_logical_ring_contexts); \ + func(has_logical_ring_preemption); \ + func(has_overlay); \ + func(has_pooled_eu); \ + func(has_psr); \ + func(has_rc6); \ + func(has_rc6p); \ + func(has_resource_streamer); \ + func(has_runtime_pm); \ + func(has_snoop); \ + func(unfenced_needs_alignment); \ + func(cursor_needs_physical); \ + func(hws_needs_physical); \ + func(overlay_needs_physical); \ + func(supports_tv); \ + func(has_ipc); + +struct sseu_dev_info { + u8 slice_mask; + u8 subslice_mask; + u8 eu_total; + u8 eu_per_subslice; + u8 min_eu_in_pool; + /* For each slice, which subslice(s) has(have) 7 EUs (bitfield)? 
*/ + u8 subslice_7eu[3]; + u8 has_slice_pg:1; + u8 has_subslice_pg:1; + u8 has_eu_pg:1; +}; + +struct intel_device_info { + u16 device_id; + u16 gen_mask; + + u8 gen; + u8 gt; /* GT number, 0 if undefined */ + u8 num_rings; + u8 ring_mask; /* Rings supported by the HW */ + + enum intel_platform platform; + u32 platform_mask; + + u32 display_mmio_offset; + + u8 num_pipes; + u8 num_sprites[I915_MAX_PIPES]; + u8 num_scalers[I915_MAX_PIPES]; + + unsigned int page_sizes; /* page sizes supported by the HW */ + +#define DEFINE_FLAG(name) u8 name:1 + DEV_INFO_FOR_EACH_FLAG(DEFINE_FLAG); +#undef DEFINE_FLAG + u16 ddb_size; /* in blocks */ + + /* Register offsets for the various display pipes and transcoders */ + int pipe_offsets[I915_MAX_TRANSCODERS]; + int trans_offsets[I915_MAX_TRANSCODERS]; + int palette_offsets[I915_MAX_PIPES]; + int cursor_offsets[I915_MAX_PIPES]; + + /* Slice/subslice/EU info */ + struct sseu_dev_info sseu; + + u32 cs_timestamp_frequency_khz; + + struct color_luts { + u16 degamma_lut_size; + u16 gamma_lut_size; + } color; +}; + +static inline unsigned int sseu_subslice_total(const struct sseu_dev_info *sseu) +{ + return hweight8(sseu->slice_mask) * hweight8(sseu->subslice_mask); +} + +const char *intel_platform_name(enum intel_platform platform); + +void intel_device_info_runtime_init(struct intel_device_info *info); +void intel_device_info_dump(const struct intel_device_info *info, + struct drm_printer *p); +void intel_device_info_dump_flags(const struct intel_device_info *info, + struct drm_printer *p); +void intel_device_info_dump_runtime(const struct intel_device_info *info, + struct drm_printer *p); + +#endif diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index efa6c6d19664..0cd355978ab4 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -3073,6 +3073,12 @@ int skl_check_plane_surface(struct intel_plane_state *plane_state) unsigned int rotation = plane_state->base.rotation; int ret; + if (rotation & DRM_MODE_REFLECT_X && + fb->modifier == DRM_FORMAT_MOD_LINEAR) { + DRM_DEBUG_KMS("horizontal flip is not supported with linear surface formats\n"); + return -EINVAL; + } + if (!plane_state->base.visible) return 0; @@ -3453,9 +3459,9 @@ static u32 skl_plane_ctl_tiling(uint64_t fb_modifier) return 0; } -static u32 skl_plane_ctl_rotation(unsigned int rotation) +static u32 skl_plane_ctl_rotate(unsigned int rotate) { - switch (rotation) { + switch (rotate) { case DRM_MODE_ROTATE_0: break; /* @@ -3469,7 +3475,22 @@ static u32 skl_plane_ctl_rotation(unsigned int rotation) case DRM_MODE_ROTATE_270: return PLANE_CTL_ROTATE_90; default: - MISSING_CASE(rotation); + MISSING_CASE(rotate); + } + + return 0; +} + +static u32 cnl_plane_ctl_flip(unsigned int reflect) +{ + switch (reflect) { + case 0: + break; + case DRM_MODE_REFLECT_X: + return PLANE_CTL_FLIP_HORIZONTAL; + case DRM_MODE_REFLECT_Y: + default: + MISSING_CASE(reflect); } return 0; @@ -3497,7 +3518,11 @@ u32 skl_plane_ctl(const struct intel_crtc_state *crtc_state, plane_ctl |= skl_plane_ctl_format(fb->format->format); plane_ctl |= skl_plane_ctl_tiling(fb->modifier); - plane_ctl |= skl_plane_ctl_rotation(rotation); + plane_ctl |= skl_plane_ctl_rotate(rotation & DRM_MODE_ROTATE_MASK); + + if (INTEL_GEN(dev_priv) >= 10) + plane_ctl |= cnl_plane_ctl_flip(rotation & + DRM_MODE_REFLECT_MASK); if (key->flags & I915_SET_COLORKEY_DESTINATION) plane_ctl |= PLANE_CTL_KEY_ENABLE_DESTINATION; @@ -9694,111 +9719,27 @@ err: return ERR_PTR(ret); } -static u32 
-intel_framebuffer_pitch_for_width(int width, int bpp) -{ - u32 pitch = DIV_ROUND_UP(width * bpp, 8); - return ALIGN(pitch, 64); -} - -static u32 -intel_framebuffer_size_for_mode(const struct drm_display_mode *mode, int bpp) -{ - u32 pitch = intel_framebuffer_pitch_for_width(mode->hdisplay, bpp); - return PAGE_ALIGN(pitch * mode->vdisplay); -} - -static struct drm_framebuffer * -intel_framebuffer_create_for_mode(struct drm_device *dev, - const struct drm_display_mode *mode, - int depth, int bpp) -{ - struct drm_framebuffer *fb; - struct drm_i915_gem_object *obj; - struct drm_mode_fb_cmd2 mode_cmd = { 0 }; - - obj = i915_gem_object_create(to_i915(dev), - intel_framebuffer_size_for_mode(mode, bpp)); - if (IS_ERR(obj)) - return ERR_CAST(obj); - - mode_cmd.width = mode->hdisplay; - mode_cmd.height = mode->vdisplay; - mode_cmd.pitches[0] = intel_framebuffer_pitch_for_width(mode_cmd.width, - bpp); - mode_cmd.pixel_format = drm_mode_legacy_fb_format(bpp, depth); - - fb = intel_framebuffer_create(obj, &mode_cmd); - if (IS_ERR(fb)) - i915_gem_object_put(obj); - - return fb; -} - -static struct drm_framebuffer * -mode_fits_in_fbdev(struct drm_device *dev, - const struct drm_display_mode *mode) -{ -#ifdef CONFIG_DRM_FBDEV_EMULATION - struct drm_i915_private *dev_priv = to_i915(dev); - struct drm_i915_gem_object *obj; - struct drm_framebuffer *fb; - - if (!dev_priv->fbdev) - return NULL; - - if (!dev_priv->fbdev->fb) - return NULL; - - obj = dev_priv->fbdev->fb->obj; - BUG_ON(!obj); - - fb = &dev_priv->fbdev->fb->base; - if (fb->pitches[0] < intel_framebuffer_pitch_for_width(mode->hdisplay, - fb->format->cpp[0] * 8)) - return NULL; - - if (obj->base.size < mode->vdisplay * fb->pitches[0]) - return NULL; - - drm_framebuffer_get(fb); - return fb; -#else - return NULL; -#endif -} - -static int intel_modeset_setup_plane_state(struct drm_atomic_state *state, - struct drm_crtc *crtc, - const struct drm_display_mode *mode, - struct drm_framebuffer *fb, - int x, int y) +static int intel_modeset_disable_planes(struct drm_atomic_state *state, + struct drm_crtc *crtc) { + struct drm_plane *plane; struct drm_plane_state *plane_state; - int hdisplay, vdisplay; - int ret; - - plane_state = drm_atomic_get_plane_state(state, crtc->primary); - if (IS_ERR(plane_state)) - return PTR_ERR(plane_state); - - if (mode) - drm_mode_get_hv_timing(mode, &hdisplay, &vdisplay); - else - hdisplay = vdisplay = 0; + int ret, i; - ret = drm_atomic_set_crtc_for_plane(plane_state, fb ? 
crtc : NULL); + ret = drm_atomic_add_affected_planes(state, crtc); if (ret) return ret; - drm_atomic_set_fb_for_plane(plane_state, fb); - plane_state->crtc_x = 0; - plane_state->crtc_y = 0; - plane_state->crtc_w = hdisplay; - plane_state->crtc_h = vdisplay; - plane_state->src_x = x << 16; - plane_state->src_y = y << 16; - plane_state->src_w = hdisplay << 16; - plane_state->src_h = vdisplay << 16; + + for_each_new_plane_in_state(state, plane, plane_state, i) { + if (plane_state->crtc != crtc) + continue; + + ret = drm_atomic_set_crtc_for_plane(plane_state, NULL); + if (ret) + return ret; + + drm_atomic_set_fb_for_plane(plane_state, NULL); + } return 0; } @@ -9816,7 +9757,6 @@ int intel_get_load_detect_pipe(struct drm_connector *connector, struct drm_crtc *crtc = NULL; struct drm_device *dev = encoder->dev; struct drm_i915_private *dev_priv = to_i915(dev); - struct drm_framebuffer *fb; struct drm_mode_config *config = &dev->mode_config; struct drm_atomic_state *state = NULL, *restore_state = NULL; struct drm_connector_state *connector_state; @@ -9884,10 +9824,6 @@ int intel_get_load_detect_pipe(struct drm_connector *connector, found: intel_crtc = to_intel_crtc(crtc); - ret = drm_modeset_lock(&crtc->primary->mutex, ctx); - if (ret) - goto fail; - state = drm_atomic_state_alloc(dev); restore_state = drm_atomic_state_alloc(dev); if (!state || !restore_state) { @@ -9919,39 +9855,17 @@ found: if (!mode) mode = &load_detect_mode; - /* We need a framebuffer large enough to accommodate all accesses - * that the plane may generate whilst we perform load detection. - * We can not rely on the fbcon either being present (we get called - * during its initialisation to detect all boot displays, or it may - * not even exist) or that it is large enough to satisfy the - * requested mode. 
- */ - fb = mode_fits_in_fbdev(dev, mode); - if (fb == NULL) { - DRM_DEBUG_KMS("creating tmp fb for load-detection\n"); - fb = intel_framebuffer_create_for_mode(dev, mode, 24, 32); - } else - DRM_DEBUG_KMS("reusing fbdev for load-detection framebuffer\n"); - if (IS_ERR(fb)) { - DRM_DEBUG_KMS("failed to allocate framebuffer for load-detection\n"); - ret = PTR_ERR(fb); - goto fail; - } - - ret = intel_modeset_setup_plane_state(state, crtc, mode, fb, 0, 0); - drm_framebuffer_put(fb); + ret = drm_atomic_set_mode_for_crtc(&crtc_state->base, mode); if (ret) goto fail; - ret = drm_atomic_set_mode_for_crtc(&crtc_state->base, mode); + ret = intel_modeset_disable_planes(state, crtc); if (ret) goto fail; ret = PTR_ERR_OR_ZERO(drm_atomic_get_connector_state(restore_state, connector)); if (!ret) ret = PTR_ERR_OR_ZERO(drm_atomic_get_crtc_state(restore_state, crtc)); - if (!ret) - ret = PTR_ERR_OR_ZERO(drm_atomic_get_plane_state(restore_state, crtc->primary)); if (ret) { DRM_DEBUG_KMS("Failed to create a copy of old state to restore: %i\n", ret); goto fail; @@ -12569,11 +12483,15 @@ static int intel_atomic_commit(struct drm_device *dev, INIT_WORK(&state->commit_work, intel_atomic_commit_work); i915_sw_fence_commit(&intel_state->commit_ready); - if (nonblock) + if (nonblock && intel_state->modeset) { + queue_work(dev_priv->modeset_wq, &state->commit_work); + } else if (nonblock) { queue_work(system_unbound_wq, &state->commit_work); - else + } else { + if (intel_state->modeset) + flush_workqueue(dev_priv->modeset_wq); intel_atomic_commit_tail(state); - + } return 0; } @@ -13238,7 +13156,7 @@ intel_primary_plane_create(struct drm_i915_private *dev_priv, enum pipe pipe) primary->frontbuffer_bit = INTEL_FRONTBUFFER_PRIMARY(pipe); primary->check_plane = intel_check_primary_plane; - if (INTEL_GEN(dev_priv) >= 10 || IS_GEMINILAKE(dev_priv)) { + if (INTEL_GEN(dev_priv) >= 10) { intel_primary_formats = skl_primary_formats; num_formats = ARRAY_SIZE(skl_primary_formats); modifiers = skl_format_modifiers_ccs; @@ -13300,7 +13218,12 @@ intel_primary_plane_create(struct drm_i915_private *dev_priv, enum pipe pipe) if (ret) goto fail; - if (INTEL_GEN(dev_priv) >= 9) { + if (INTEL_GEN(dev_priv) >= 10) { + supported_rotations = + DRM_MODE_ROTATE_0 | DRM_MODE_ROTATE_90 | + DRM_MODE_ROTATE_180 | DRM_MODE_ROTATE_270 | + DRM_MODE_REFLECT_X; + } else if (INTEL_GEN(dev_priv) >= 9) { supported_rotations = DRM_MODE_ROTATE_0 | DRM_MODE_ROTATE_90 | DRM_MODE_ROTATE_180 | DRM_MODE_ROTATE_270; @@ -14531,6 +14454,8 @@ int intel_modeset_init(struct drm_device *dev) enum pipe pipe; struct intel_crtc *crtc; + dev_priv->modeset_wq = alloc_ordered_workqueue("i915_modeset", 0); + drm_mode_config_init(dev); dev->mode_config.min_width = 0; @@ -15335,6 +15260,8 @@ void intel_modeset_cleanup(struct drm_device *dev) intel_cleanup_gt_powersave(dev_priv); intel_teardown_gmbus(dev_priv); + + destroy_workqueue(dev_priv->modeset_wq); } void intel_connector_attach_encoder(struct intel_connector *connector, diff --git a/drivers/gpu/drm/i915/intel_display.h b/drivers/gpu/drm/i915/intel_display.h new file mode 100644 index 000000000000..a0d2b6169361 --- /dev/null +++ b/drivers/gpu/drm/i915/intel_display.h @@ -0,0 +1,321 @@ +/* + * Copyright © 2006-2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, 
distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#ifndef _INTEL_DISPLAY_H_ +#define _INTEL_DISPLAY_H_ + +enum pipe { + INVALID_PIPE = -1, + + PIPE_A = 0, + PIPE_B, + PIPE_C, + _PIPE_EDP, + + I915_MAX_PIPES = _PIPE_EDP +}; + +#define pipe_name(p) ((p) + 'A') + +enum transcoder { + TRANSCODER_A = 0, + TRANSCODER_B, + TRANSCODER_C, + TRANSCODER_EDP, + TRANSCODER_DSI_A, + TRANSCODER_DSI_C, + + I915_MAX_TRANSCODERS +}; + +static inline const char *transcoder_name(enum transcoder transcoder) +{ + switch (transcoder) { + case TRANSCODER_A: + return "A"; + case TRANSCODER_B: + return "B"; + case TRANSCODER_C: + return "C"; + case TRANSCODER_EDP: + return "EDP"; + case TRANSCODER_DSI_A: + return "DSI A"; + case TRANSCODER_DSI_C: + return "DSI C"; + default: + return "<invalid>"; + } +} + +static inline bool transcoder_is_dsi(enum transcoder transcoder) +{ + return transcoder == TRANSCODER_DSI_A || transcoder == TRANSCODER_DSI_C; +} + +/* + * Global legacy plane identifier. Valid only for primary/sprite + * planes on pre-g4x, and only for primary planes on g4x-bdw. + */ +enum i9xx_plane_id { + PLANE_A, + PLANE_B, + PLANE_C, +}; + +#define plane_name(p) ((p) + 'A') +#define sprite_name(p, s) ((p) * INTEL_INFO(dev_priv)->num_sprites[(p)] + (s) + 'A') + +/* + * Per-pipe plane identifier. + * I915_MAX_PLANES in the enum below is the maximum (across all platforms) + * number of planes per CRTC. Not all platforms really have this many planes, + * which means some arrays of size I915_MAX_PLANES may have unused entries + * between the topmost sprite plane and the cursor plane. + * + * This is expected to be passed to various register macros + * (eg. PLANE_CTL(), PS_PLANE_SEL(), etc.) so adjust with care. 
+ */ +enum plane_id { + PLANE_PRIMARY, + PLANE_SPRITE0, + PLANE_SPRITE1, + PLANE_SPRITE2, + PLANE_CURSOR, + + I915_MAX_PLANES, +}; + +#define for_each_plane_id_on_crtc(__crtc, __p) \ + for ((__p) = PLANE_PRIMARY; (__p) < I915_MAX_PLANES; (__p)++) \ + for_each_if((__crtc)->plane_ids_mask & BIT(__p)) + +enum port { + PORT_NONE = -1, + + PORT_A = 0, + PORT_B, + PORT_C, + PORT_D, + PORT_E, + + I915_MAX_PORTS +}; + +#define port_name(p) ((p) + 'A') + +enum dpio_channel { + DPIO_CH0, + DPIO_CH1 +}; + +enum dpio_phy { + DPIO_PHY0, + DPIO_PHY1, + DPIO_PHY2, +}; + +#define I915_NUM_PHYS_VLV 2 + +enum intel_display_power_domain { + POWER_DOMAIN_PIPE_A, + POWER_DOMAIN_PIPE_B, + POWER_DOMAIN_PIPE_C, + POWER_DOMAIN_PIPE_A_PANEL_FITTER, + POWER_DOMAIN_PIPE_B_PANEL_FITTER, + POWER_DOMAIN_PIPE_C_PANEL_FITTER, + POWER_DOMAIN_TRANSCODER_A, + POWER_DOMAIN_TRANSCODER_B, + POWER_DOMAIN_TRANSCODER_C, + POWER_DOMAIN_TRANSCODER_EDP, + POWER_DOMAIN_TRANSCODER_DSI_A, + POWER_DOMAIN_TRANSCODER_DSI_C, + POWER_DOMAIN_PORT_DDI_A_LANES, + POWER_DOMAIN_PORT_DDI_B_LANES, + POWER_DOMAIN_PORT_DDI_C_LANES, + POWER_DOMAIN_PORT_DDI_D_LANES, + POWER_DOMAIN_PORT_DDI_E_LANES, + POWER_DOMAIN_PORT_DDI_A_IO, + POWER_DOMAIN_PORT_DDI_B_IO, + POWER_DOMAIN_PORT_DDI_C_IO, + POWER_DOMAIN_PORT_DDI_D_IO, + POWER_DOMAIN_PORT_DDI_E_IO, + POWER_DOMAIN_PORT_DSI, + POWER_DOMAIN_PORT_CRT, + POWER_DOMAIN_PORT_OTHER, + POWER_DOMAIN_VGA, + POWER_DOMAIN_AUDIO, + POWER_DOMAIN_PLLS, + POWER_DOMAIN_AUX_A, + POWER_DOMAIN_AUX_B, + POWER_DOMAIN_AUX_C, + POWER_DOMAIN_AUX_D, + POWER_DOMAIN_GMBUS, + POWER_DOMAIN_MODESET, + POWER_DOMAIN_GT_IRQ, + POWER_DOMAIN_INIT, + + POWER_DOMAIN_NUM, +}; + +#define POWER_DOMAIN_PIPE(pipe) ((pipe) + POWER_DOMAIN_PIPE_A) +#define POWER_DOMAIN_PIPE_PANEL_FITTER(pipe) \ + ((pipe) + POWER_DOMAIN_PIPE_A_PANEL_FITTER) +#define POWER_DOMAIN_TRANSCODER(tran) \ + ((tran) == TRANSCODER_EDP ? 
POWER_DOMAIN_TRANSCODER_EDP : \ + (tran) + POWER_DOMAIN_TRANSCODER_A) + +/* Used by dp and fdi links */ +struct intel_link_m_n { + u32 tu; + u32 gmch_m; + u32 gmch_n; + u32 link_m; + u32 link_n; +}; + +#define for_each_pipe(__dev_priv, __p) \ + for ((__p) = 0; (__p) < INTEL_INFO(__dev_priv)->num_pipes; (__p)++) + +#define for_each_pipe_masked(__dev_priv, __p, __mask) \ + for ((__p) = 0; (__p) < INTEL_INFO(__dev_priv)->num_pipes; (__p)++) \ + for_each_if((__mask) & BIT(__p)) + +#define for_each_universal_plane(__dev_priv, __pipe, __p) \ + for ((__p) = 0; \ + (__p) < INTEL_INFO(__dev_priv)->num_sprites[(__pipe)] + 1; \ + (__p)++) + +#define for_each_sprite(__dev_priv, __p, __s) \ + for ((__s) = 0; \ + (__s) < INTEL_INFO(__dev_priv)->num_sprites[(__p)]; \ + (__s)++) + +#define for_each_port_masked(__port, __ports_mask) \ + for ((__port) = PORT_A; (__port) < I915_MAX_PORTS; (__port)++) \ + for_each_if((__ports_mask) & BIT(__port)) + +#define for_each_crtc(dev, crtc) \ + list_for_each_entry(crtc, &(dev)->mode_config.crtc_list, head) + +#define for_each_intel_plane(dev, intel_plane) \ + list_for_each_entry(intel_plane, \ + &(dev)->mode_config.plane_list, \ + base.head) + +#define for_each_intel_plane_mask(dev, intel_plane, plane_mask) \ + list_for_each_entry(intel_plane, \ + &(dev)->mode_config.plane_list, \ + base.head) \ + for_each_if((plane_mask) & \ + BIT(drm_plane_index(&intel_plane->base))) + +#define for_each_intel_plane_on_crtc(dev, intel_crtc, intel_plane) \ + list_for_each_entry(intel_plane, \ + &(dev)->mode_config.plane_list, \ + base.head) \ + for_each_if((intel_plane)->pipe == (intel_crtc)->pipe) + +#define for_each_intel_crtc(dev, intel_crtc) \ + list_for_each_entry(intel_crtc, \ + &(dev)->mode_config.crtc_list, \ + base.head) + +#define for_each_intel_crtc_mask(dev, intel_crtc, crtc_mask) \ + list_for_each_entry(intel_crtc, \ + &(dev)->mode_config.crtc_list, \ + base.head) \ + for_each_if((crtc_mask) & BIT(drm_crtc_index(&intel_crtc->base))) + +#define for_each_intel_encoder(dev, intel_encoder) \ + list_for_each_entry(intel_encoder, \ + &(dev)->mode_config.encoder_list, \ + base.head) + +#define for_each_intel_connector_iter(intel_connector, iter) \ + while ((intel_connector = to_intel_connector(drm_connector_list_iter_next(iter)))) + +#define for_each_encoder_on_crtc(dev, __crtc, intel_encoder) \ + list_for_each_entry((intel_encoder), &(dev)->mode_config.encoder_list, base.head) \ + for_each_if((intel_encoder)->base.crtc == (__crtc)) + +#define for_each_connector_on_encoder(dev, __encoder, intel_connector) \ + list_for_each_entry((intel_connector), &(dev)->mode_config.connector_list, base.head) \ + for_each_if((intel_connector)->base.encoder == (__encoder)) + +#define for_each_power_domain(domain, mask) \ + for ((domain) = 0; (domain) < POWER_DOMAIN_NUM; (domain)++) \ + for_each_if(BIT_ULL(domain) & (mask)) + +#define for_each_power_well(__dev_priv, __power_well) \ + for ((__power_well) = (__dev_priv)->power_domains.power_wells; \ + (__power_well) - (__dev_priv)->power_domains.power_wells < \ + (__dev_priv)->power_domains.power_well_count; \ + (__power_well)++) + +#define for_each_power_well_rev(__dev_priv, __power_well) \ + for ((__power_well) = (__dev_priv)->power_domains.power_wells + \ + (__dev_priv)->power_domains.power_well_count - 1; \ + (__power_well) - (__dev_priv)->power_domains.power_wells >= 0; \ + (__power_well)--) + +#define for_each_power_domain_well(__dev_priv, __power_well, __domain_mask) \ + for_each_power_well(__dev_priv, __power_well) \ + 
for_each_if((__power_well)->domains & (__domain_mask)) + +#define for_each_power_domain_well_rev(__dev_priv, __power_well, __domain_mask) \ + for_each_power_well_rev(__dev_priv, __power_well) \ + for_each_if((__power_well)->domains & (__domain_mask)) + +#define for_each_new_intel_plane_in_state(__state, plane, new_plane_state, __i) \ + for ((__i) = 0; \ + (__i) < (__state)->base.dev->mode_config.num_total_plane && \ + ((plane) = to_intel_plane((__state)->base.planes[__i].ptr), \ + (new_plane_state) = to_intel_plane_state((__state)->base.planes[__i].new_state), 1); \ + (__i)++) \ + for_each_if(plane) + +#define for_each_new_intel_crtc_in_state(__state, crtc, new_crtc_state, __i) \ + for ((__i) = 0; \ + (__i) < (__state)->base.dev->mode_config.num_crtc && \ + ((crtc) = to_intel_crtc((__state)->base.crtcs[__i].ptr), \ + (new_crtc_state) = to_intel_crtc_state((__state)->base.crtcs[__i].new_state), 1); \ + (__i)++) \ + for_each_if(crtc) + +#define for_each_oldnew_intel_plane_in_state(__state, plane, old_plane_state, new_plane_state, __i) \ + for ((__i) = 0; \ + (__i) < (__state)->base.dev->mode_config.num_total_plane && \ + ((plane) = to_intel_plane((__state)->base.planes[__i].ptr), \ + (old_plane_state) = to_intel_plane_state((__state)->base.planes[__i].old_state), \ + (new_plane_state) = to_intel_plane_state((__state)->base.planes[__i].new_state), 1); \ + (__i)++) \ + for_each_if(plane) + +void intel_link_compute_m_n(int bpp, int nlanes, + int pixel_clock, int link_clock, + struct intel_link_m_n *m_n, + bool reduce_m_n); + +#endif diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c index 510e0bc3a377..ebdcbcbacb3c 100644 --- a/drivers/gpu/drm/i915/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/intel_engine_cs.c @@ -1272,6 +1272,9 @@ static int cnl_init_workarounds(struct intel_engine_cs *engine) if (ret) return ret; + /* WaDisableEarlyEOT:cnl */ + WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, DISABLE_EARLY_EOT); + return 0; } @@ -1664,6 +1667,35 @@ static void print_request(struct drm_printer *m, rq->timeline->common->name); } +static void hexdump(struct drm_printer *m, const void *buf, size_t len) +{ + const size_t rowsize = 8 * sizeof(u32); + const void *prev = NULL; + bool skip = false; + size_t pos; + + for (pos = 0; pos < len; pos += rowsize) { + char line[128]; + + if (prev && !memcmp(prev, buf + pos, rowsize)) { + if (!skip) { + drm_printf(m, "*\n"); + skip = true; + } + continue; + } + + WARN_ON_ONCE(hex_dump_to_buffer(buf + pos, len - pos, + rowsize, sizeof(u32), + line, sizeof(line), + false) >= sizeof(line)); + drm_printf(m, "%08zx %s\n", pos, line); + + prev = buf + pos; + skip = false; + } +} + void intel_engine_dump(struct intel_engine_cs *engine, struct drm_printer *m, const char *header, ...) 
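The hexdump() helper added above keeps the new HWSP dump at the end of
intel_engine_dump() readable by collapsing runs of identical 32-byte rows
into a single "*" line, the same convention print_hex_dump() and userspace
hexdump -C use. A minimal standalone sketch of that suppression logic
(userspace C, illustrative only; the kernel version formats each row with
hex_dump_to_buffer() instead):

	#include <stdio.h>
	#include <string.h>

	static void dump(const unsigned char *buf, size_t len)
	{
		const size_t rowsize = 32;	/* 8 * sizeof(u32), as above */
		const unsigned char *prev = NULL;
		int skip = 0;

		for (size_t pos = 0; pos < len; pos += rowsize) {
			/* Suppress rows identical to the previous one. */
			if (prev && pos + rowsize <= len &&
			    !memcmp(prev, buf + pos, rowsize)) {
				if (!skip)
					puts("*");
				skip = 1;
				continue;
			}
			printf("%08zx", pos);
			for (size_t i = 0; i < rowsize && pos + i < len; i++)
				printf(" %02x", buf[pos + i]);
			putchar('\n');
			prev = buf + pos;
			skip = 0;
		}
	}

	int main(void)
	{
		unsigned char page[4096] = { 0xde, 0xad };

		dump(page, sizeof(page));	/* two rows, then one "*" */
		return 0;
	}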
@@ -1757,6 +1789,24 @@ void intel_engine_dump(struct intel_engine_cs *engine, addr = intel_engine_get_last_batch_head(engine); drm_printf(m, "\tBBADDR: 0x%08x_%08x\n", upper_32_bits(addr), lower_32_bits(addr)); + if (INTEL_GEN(dev_priv) >= 8) + addr = I915_READ64_2x32(RING_DMA_FADD(engine->mmio_base), + RING_DMA_FADD_UDW(engine->mmio_base)); + else if (INTEL_GEN(dev_priv) >= 4) + addr = I915_READ(RING_DMA_FADD(engine->mmio_base)); + else + addr = I915_READ(DMA_FADD_I8XX); + drm_printf(m, "\tDMA_FADDR: 0x%08x_%08x\n", + upper_32_bits(addr), lower_32_bits(addr)); + if (INTEL_GEN(dev_priv) >= 4) { + drm_printf(m, "\tIPEIR: 0x%08x\n", + I915_READ(RING_IPEIR(engine->mmio_base))); + drm_printf(m, "\tIPEHR: 0x%08x\n", + I915_READ(RING_IPEHR(engine->mmio_base))); + } else { + drm_printf(m, "\tIPEIR: 0x%08x\n", I915_READ(IPEIR)); + drm_printf(m, "\tIPEHR: 0x%08x\n", I915_READ(IPEHR)); + } if (HAS_EXECLISTS(dev_priv)) { const u32 *hws = &engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX]; @@ -1848,8 +1898,11 @@ void intel_engine_dump(struct intel_engine_cs *engine, &engine->irq_posted)), yesno(test_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted))); + + drm_printf(m, "HWSP:\n"); + hexdump(m, engine->status_page.page_addr, PAGE_SIZE); + drm_printf(m, "Idle? %s\n", yesno(intel_engine_is_idle(engine))); - drm_printf(m, "\n"); } static u8 user_class_map[] = { diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c index 0acd9dd3ed5c..31f01d64c021 100644 --- a/drivers/gpu/drm/i915/intel_hangcheck.c +++ b/drivers/gpu/drm/i915/intel_hangcheck.c @@ -429,18 +429,18 @@ static void i915_hangcheck_elapsed(struct work_struct *work) intel_uncore_arm_unclaimed_mmio_detection(dev_priv); for_each_engine(engine, dev_priv, id) { - struct intel_engine_hangcheck cur_state, *hc = &cur_state; const bool busy = intel_engine_has_waiter(engine); + struct intel_engine_hangcheck hc; semaphore_clear_deadlocks(dev_priv); - hangcheck_load_sample(engine, hc); - hangcheck_accumulate_sample(engine, hc); - hangcheck_store_sample(engine, hc); + hangcheck_load_sample(engine, &hc); + hangcheck_accumulate_sample(engine, &hc); + hangcheck_store_sample(engine, &hc); if (engine->hangcheck.stalled) { hung |= intel_engine_flag(engine); - if (hc->action != ENGINE_DEAD) + if (hc.action != ENGINE_DEAD) stuck |= intel_engine_flag(engine); } diff --git a/drivers/gpu/drm/i915/intel_i2c.c b/drivers/gpu/drm/i915/intel_i2c.c index 49fdf09f9919..ef9f91a0b0c9 100644 --- a/drivers/gpu/drm/i915/intel_i2c.c +++ b/drivers/gpu/drm/i915/intel_i2c.c @@ -128,22 +128,46 @@ intel_i2c_reset(struct drm_i915_private *dev_priv) I915_WRITE(GMBUS4, 0); } -static void intel_i2c_quirk_set(struct drm_i915_private *dev_priv, bool enable) +static void pnv_gmbus_clock_gating(struct drm_i915_private *dev_priv, + bool enable) { u32 val; /* When using bit bashing for I2C, this bit needs to be set to 1 */ - if (!IS_PINEVIEW(dev_priv)) - return; - val = I915_READ(DSPCLK_GATE_D); - if (enable) - val |= DPCUNIT_CLOCK_GATE_DISABLE; + if (!enable) + val |= PNV_GMBUSUNIT_CLOCK_GATE_DISABLE; else - val &= ~DPCUNIT_CLOCK_GATE_DISABLE; + val &= ~PNV_GMBUSUNIT_CLOCK_GATE_DISABLE; I915_WRITE(DSPCLK_GATE_D, val); } +static void pch_gmbus_clock_gating(struct drm_i915_private *dev_priv, + bool enable) +{ + u32 val; + + val = I915_READ(SOUTH_DSPCLK_GATE_D); + if (!enable) + val |= PCH_GMBUSUNIT_CLOCK_GATE_DISABLE; + else + val &= ~PCH_GMBUSUNIT_CLOCK_GATE_DISABLE; + I915_WRITE(SOUTH_DSPCLK_GATE_D, val); +} + +static void bxt_gmbus_clock_gating(struct 
drm_i915_private *dev_priv, + bool enable) +{ + u32 val; + + val = I915_READ(GEN9_CLKGATE_DIS_4); + if (!enable) + val |= BXT_GMBUS_GATING_DIS; + else + val &= ~BXT_GMBUS_GATING_DIS; + I915_WRITE(GEN9_CLKGATE_DIS_4, val); +} + static u32 get_reserved(struct intel_gmbus *bus) { struct drm_i915_private *dev_priv = bus->dev_priv; @@ -221,7 +245,10 @@ intel_gpio_pre_xfer(struct i2c_adapter *adapter) struct drm_i915_private *dev_priv = bus->dev_priv; intel_i2c_reset(dev_priv); - intel_i2c_quirk_set(dev_priv, true); + + if (IS_PINEVIEW(dev_priv)) + pnv_gmbus_clock_gating(dev_priv, false); + set_data(bus, 1); set_clock(bus, 1); udelay(I2C_RISEFALL_TIME); @@ -238,7 +265,9 @@ intel_gpio_post_xfer(struct i2c_adapter *adapter) set_data(bus, 1); set_clock(bus, 1); - intel_i2c_quirk_set(dev_priv, false); + + if (IS_PINEVIEW(dev_priv)) + pnv_gmbus_clock_gating(dev_priv, true); } static void @@ -481,6 +510,13 @@ do_gmbus_xfer(struct i2c_adapter *adapter, struct i2c_msg *msgs, int num) int i = 0, inc, try = 0; int ret = 0; + /* Display WA #0868: skl,bxt,kbl,cfl,glk,cnl */ + if (IS_GEN9_LP(dev_priv)) + bxt_gmbus_clock_gating(dev_priv, false); + else if (HAS_PCH_SPT(dev_priv) || + HAS_PCH_KBP(dev_priv) || HAS_PCH_CNP(dev_priv)) + pch_gmbus_clock_gating(dev_priv, false); + retry: I915_WRITE_FW(GMBUS0, bus->reg0); @@ -582,6 +618,13 @@ timeout: ret = -EAGAIN; out: + /* Display WA #0868: skl,bxt,kbl,cfl,glk,cnl */ + if (IS_GEN9_LP(dev_priv)) + bxt_gmbus_clock_gating(dev_priv, true); + else if (HAS_PCH_SPT(dev_priv) || + HAS_PCH_KBP(dev_priv) || HAS_PCH_CNP(dev_priv)) + pch_gmbus_clock_gating(dev_priv, true); + return ret; } diff --git a/drivers/gpu/drm/i915/intel_lpe_audio.c b/drivers/gpu/drm/i915/intel_lpe_audio.c index 3bf65288ffff..5809b29044fc 100644 --- a/drivers/gpu/drm/i915/intel_lpe_audio.c +++ b/drivers/gpu/drm/i915/intel_lpe_audio.c @@ -193,7 +193,7 @@ static bool lpe_audio_detect(struct drm_i915_private *dev_priv) }; if (!pci_dev_present(atom_hdaudio_ids)) { - DRM_INFO("%s\n", "HDaudio controller not detected, using LPE audio instead\n"); + DRM_INFO("HDaudio controller not detected, using LPE audio instead\n"); lpe_present = true; } } diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 2e38fbfdf08f..739c33b07c59 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -449,7 +449,7 @@ static void execlists_submit_ports(struct intel_engine_cs *engine) GEM_TRACE("%s in[%d]: ctx=%d.%d, seqno=%x\n", engine->name, n, - rq->ctx->hw_id, count, + port[n].context_id, count, rq->global_seqno); } else { GEM_BUG_ON(!n); @@ -504,7 +504,7 @@ static void inject_preempt_context(struct intel_engine_cs *engine) ce->ring->tail &= (ce->ring->size - 1); ce->lrc_reg_state[CTX_RING_TAIL+1] = ce->ring->tail; - GEM_TRACE("\n"); + GEM_TRACE("%s\n", engine->name); for (n = execlists_num_ports(&engine->execlists); --n; ) elsp_write(0, engine->execlists.elsp); @@ -861,9 +861,10 @@ static void execlists_submission_tasklet(unsigned long data) */ status = READ_ONCE(buf[2 * head]); /* maybe mmio! 
*/ - GEM_TRACE("%s csb[%dd]: status=0x%08x:0x%08x\n", + GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x, active=0x%x\n", engine->name, head, - status, buf[2*head + 1]); + status, buf[2*head + 1], + execlists->active); if (status & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED)) @@ -881,6 +882,8 @@ static void execlists_submission_tasklet(unsigned long data) if (status & GEN8_CTX_STATUS_COMPLETE && buf[2*head + 1] == PREEMPT_ID) { + GEM_TRACE("%s preempt-idle\n", engine->name); + execlists_cancel_port_requests(execlists); execlists_unwind_incomplete_requests(execlists); @@ -905,8 +908,8 @@ static void execlists_submission_tasklet(unsigned long data) rq = port_unpack(port, &count); GEM_TRACE("%s out[0]: ctx=%d.%d, seqno=%x\n", engine->name, - rq->ctx->hw_id, count, - rq->global_seqno); + port->context_id, count, + rq ? rq->global_seqno : 0); GEM_BUG_ON(count == 0); if (--count == 0) { GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED); @@ -1555,6 +1558,8 @@ static void reset_common_ring(struct intel_engine_cs *engine, struct intel_context *ce; unsigned long flags; + GEM_TRACE("%s seqno=%x\n", + engine->name, request ? request->global_seqno : 0); spin_lock_irqsave(&engine->timeline->lock, flags); /* diff --git a/drivers/gpu/drm/i915/intel_opregion.c b/drivers/gpu/drm/i915/intel_opregion.c index fc65f5e451dd..c58e5f53bab0 100644 --- a/drivers/gpu/drm/i915/intel_opregion.c +++ b/drivers/gpu/drm/i915/intel_opregion.c @@ -32,6 +32,8 @@ #include <drm/drmP.h> #include <drm/i915_drm.h> + +#include "intel_opregion.h" #include "i915_drv.h" #include "intel_drv.h" diff --git a/drivers/gpu/drm/i915/intel_opregion.h b/drivers/gpu/drm/i915/intel_opregion.h new file mode 100644 index 000000000000..e0e437ba9e51 --- /dev/null +++ b/drivers/gpu/drm/i915/intel_opregion.h @@ -0,0 +1,106 @@ +/* + * Copyright © 2008-2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + */ + +#ifndef _INTEL_OPREGION_H_ +#define _INTEL_OPREGION_H_ + +#include <linux/workqueue.h> +#include <linux/pci.h> + +struct drm_i915_private; +struct intel_encoder; + +struct opregion_header; +struct opregion_acpi; +struct opregion_swsci; +struct opregion_asle; + +struct intel_opregion { + struct opregion_header *header; + struct opregion_acpi *acpi; + struct opregion_swsci *swsci; + u32 swsci_gbda_sub_functions; + u32 swsci_sbcb_sub_functions; + struct opregion_asle *asle; + void *rvda; + void *vbt_firmware; + const void *vbt; + u32 vbt_size; + u32 *lid_state; + struct work_struct asle_work; +}; + +#define OPREGION_SIZE (8 * 1024) + +#ifdef CONFIG_ACPI + +int intel_opregion_setup(struct drm_i915_private *dev_priv); +void intel_opregion_register(struct drm_i915_private *dev_priv); +void intel_opregion_unregister(struct drm_i915_private *dev_priv); +void intel_opregion_asle_intr(struct drm_i915_private *dev_priv); +int intel_opregion_notify_encoder(struct intel_encoder *intel_encoder, + bool enable); +int intel_opregion_notify_adapter(struct drm_i915_private *dev_priv, + pci_power_t state); +int intel_opregion_get_panel_type(struct drm_i915_private *dev_priv); + +#else /* CONFIG_ACPI*/ + +static inline int intel_opregion_setup(struct drm_i915_private *dev_priv) +{ + return 0; +} + +static inline void intel_opregion_register(struct drm_i915_private *dev_priv) +{ +} + +static inline void intel_opregion_unregister(struct drm_i915_private *dev_priv) +{ +} + +static inline void intel_opregion_asle_intr(struct drm_i915_private *dev_priv) +{ +} + +static inline int +intel_opregion_notify_encoder(struct intel_encoder *intel_encoder, bool enable) +{ + return 0; +} + +static inline int +intel_opregion_notify_adapter(struct drm_i915_private *dev, pci_power_t state) +{ + return 0; +} + +static inline int intel_opregion_get_panel_type(struct drm_i915_private *dev) +{ + return -ENODEV; +} + +#endif /* CONFIG_ACPI */ + +#endif diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index 18779c6eb4bf..1db79a860b96 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c @@ -8447,6 +8447,11 @@ static void cnl_init_clock_gating(struct drm_i915_private *dev_priv) if (IS_CNL_REVID(dev_priv, CNL_REVID_A0, CNL_REVID_B0)) val |= SARBUNIT_CLKGATE_DIS; I915_WRITE(SLICE_UNIT_LEVEL_CLKGATE, val); + + /* WaDisableVFclkgate:cnl */ + val = I915_READ(UNSLICE_UNIT_LEVEL_CLKGATE); + val |= VFUNIT_CLKGATE_DIS; + I915_WRITE(UNSLICE_UNIT_LEVEL_CLKGATE, val); } static void cfl_init_clock_gating(struct drm_i915_private *dev_priv) diff --git a/drivers/gpu/drm/i915/intel_psr.c b/drivers/gpu/drm/i915/intel_psr.c index a1ad85fa5c1a..2e32615eeada 100644 --- a/drivers/gpu/drm/i915/intel_psr.c +++ b/drivers/gpu/drm/i915/intel_psr.c @@ -590,7 +590,7 @@ static void hsw_psr_disable(struct intel_dp *intel_dp, struct drm_i915_private *dev_priv = to_i915(dev); if (dev_priv->psr.active) { - i915_reg_t psr_ctl; + i915_reg_t psr_status; u32 psr_status_mask; if (dev_priv->psr.aux_frame_sync) @@ -599,24 +599,24 @@ static void hsw_psr_disable(struct intel_dp *intel_dp, 0); if (dev_priv->psr.psr2_support) { - psr_ctl = EDP_PSR2_CTL; + psr_status = EDP_PSR2_STATUS_CTL; psr_status_mask = EDP_PSR2_STATUS_STATE_MASK; - I915_WRITE(psr_ctl, - I915_READ(psr_ctl) & + I915_WRITE(EDP_PSR2_CTL, + I915_READ(EDP_PSR2_CTL) & ~(EDP_PSR2_ENABLE | EDP_SU_TRACK_ENABLE)); } else { - psr_ctl = EDP_PSR_STATUS_CTL; + psr_status = EDP_PSR_STATUS_CTL; psr_status_mask = EDP_PSR_STATUS_STATE_MASK; - 
I915_WRITE(psr_ctl, - I915_READ(psr_ctl) & ~EDP_PSR_ENABLE); + I915_WRITE(EDP_PSR_CTL, + I915_READ(EDP_PSR_CTL) & ~EDP_PSR_ENABLE); } /* Wait till PSR is idle */ if (intel_wait_for_register(dev_priv, - psr_ctl, psr_status_mask, 0, + psr_status, psr_status_mask, 0, 2000)) DRM_ERROR("Timed out waiting for PSR Idle State\n"); diff --git a/drivers/gpu/drm/i915/intel_runtime_pm.c b/drivers/gpu/drm/i915/intel_runtime_pm.c index 96ab74f3d101..db9d57f39534 100644 --- a/drivers/gpu/drm/i915/intel_runtime_pm.c +++ b/drivers/gpu/drm/i915/intel_runtime_pm.c @@ -1726,13 +1726,13 @@ void intel_display_power_put(struct drm_i915_private *dev_priv, BIT_ULL(POWER_DOMAIN_AUX_C) | \ BIT_ULL(POWER_DOMAIN_AUDIO) | \ BIT_ULL(POWER_DOMAIN_VGA) | \ - BIT_ULL(POWER_DOMAIN_GMBUS) | \ BIT_ULL(POWER_DOMAIN_INIT)) #define BXT_DISPLAY_DC_OFF_POWER_DOMAINS ( \ BXT_DISPLAY_POWERWELL_2_POWER_DOMAINS | \ BIT_ULL(POWER_DOMAIN_GT_IRQ) | \ BIT_ULL(POWER_DOMAIN_MODESET) | \ BIT_ULL(POWER_DOMAIN_AUX_A) | \ + BIT_ULL(POWER_DOMAIN_GMBUS) | \ BIT_ULL(POWER_DOMAIN_INIT)) #define BXT_DPIO_CMN_A_POWER_DOMAINS ( \ BIT_ULL(POWER_DOMAIN_PORT_DDI_A_LANES) | \ @@ -1792,6 +1792,7 @@ void intel_display_power_put(struct drm_i915_private *dev_priv, BIT_ULL(POWER_DOMAIN_GT_IRQ) | \ BIT_ULL(POWER_DOMAIN_MODESET) | \ BIT_ULL(POWER_DOMAIN_AUX_A) | \ + BIT_ULL(POWER_DOMAIN_GMBUS) | \ BIT_ULL(POWER_DOMAIN_INIT)) #define CNL_DISPLAY_POWERWELL_2_POWER_DOMAINS ( \ diff --git a/drivers/gpu/drm/i915/selftests/huge_pages.c b/drivers/gpu/drm/i915/selftests/huge_pages.c index e6b31041cc88..2ea69394f428 100644 --- a/drivers/gpu/drm/i915/selftests/huge_pages.c +++ b/drivers/gpu/drm/i915/selftests/huge_pages.c @@ -1637,7 +1637,7 @@ static int igt_shrink_thp(void *arg) * shmem to truncate our pages. */ i915_gem_shrink_all(i915); - if (!IS_ERR_OR_NULL(obj->mm.pages)) { + if (i915_gem_object_has_pages(obj)) { pr_err("shrink-all didn't truncate the pages\n"); err = -EINVAL; goto out_close; diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c index f98546b8a7fa..d1f91a533afa 100644 --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c @@ -132,6 +132,12 @@ static int emit_recurse_batch(struct hang *h, *batch++ = lower_32_bits(hws_address(hws, rq)); *batch++ = upper_32_bits(hws_address(hws, rq)); *batch++ = rq->fence.seqno; + *batch++ = MI_ARB_CHECK; + + memset(batch, 0, 1024); + batch += 1024 / sizeof(*batch); + + *batch++ = MI_ARB_CHECK; *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1; *batch++ = lower_32_bits(vma->node.start); *batch++ = upper_32_bits(vma->node.start); @@ -140,6 +146,12 @@ static int emit_recurse_batch(struct hang *h, *batch++ = 0; *batch++ = lower_32_bits(hws_address(hws, rq)); *batch++ = rq->fence.seqno; + *batch++ = MI_ARB_CHECK; + + memset(batch, 0, 1024); + batch += 1024 / sizeof(*batch); + + *batch++ = MI_ARB_CHECK; *batch++ = MI_BATCH_BUFFER_START | 1 << 8; *batch++ = lower_32_bits(vma->node.start); } else if (INTEL_GEN(i915) >= 4) { @@ -147,12 +159,24 @@ static int emit_recurse_batch(struct hang *h, *batch++ = 0; *batch++ = lower_32_bits(hws_address(hws, rq)); *batch++ = rq->fence.seqno; + *batch++ = MI_ARB_CHECK; + + memset(batch, 0, 1024); + batch += 1024 / sizeof(*batch); + + *batch++ = MI_ARB_CHECK; *batch++ = MI_BATCH_BUFFER_START | 2 << 6; *batch++ = lower_32_bits(vma->node.start); } else { *batch++ = MI_STORE_DWORD_IMM; *batch++ = lower_32_bits(hws_address(hws, rq)); *batch++ = rq->fence.seqno; + *batch++ = MI_ARB_CHECK; 
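/*
 * Illustration (not part of the patch): each branch of this function
 * now brackets a zeroed 1 KiB gap with MI_ARB_CHECK. The zero dwords
 * decode as MI_NOOP, and the arbitration checks give the command
 * streamer a point inside the otherwise tight recursive batch where an
 * engine reset or preemption can take effect. A hypothetical helper
 * for the repeated pattern:
 *
 *	static u32 *emit_arb_gap(u32 *batch)
 *	{
 *		*batch++ = MI_ARB_CHECK;
 *		memset(batch, 0, 1024);		// decoded as MI_NOOP
 *		batch += 1024 / sizeof(*batch);
 *		*batch++ = MI_ARB_CHECK;
 *		return batch;
 *	}
 */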
+ + memset(batch, 0, 1024); + batch += 1024 / sizeof(*batch); + + *batch++ = MI_ARB_CHECK; *batch++ = MI_BATCH_BUFFER_START | 2 << 6 | 1; *batch++ = lower_32_bits(vma->node.start); } @@ -234,6 +258,16 @@ static void hang_fini(struct hang *h) i915_gem_wait_for_idle(h->i915, I915_WAIT_LOCKED); } +static bool wait_for_hang(struct hang *h, struct drm_i915_gem_request *rq) +{ + return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq), + rq->fence.seqno), + 10) && + wait_for(i915_seqno_passed(hws_seqno(h, rq), + rq->fence.seqno), + 1000)); +} + static int igt_hang_sanitycheck(void *arg) { struct drm_i915_private *i915 = arg; @@ -296,6 +330,9 @@ static void global_reset_lock(struct drm_i915_private *i915) struct intel_engine_cs *engine; enum intel_engine_id id; + pr_debug("%s: current gpu_error=%08lx\n", + __func__, i915->gpu_error.flags); + while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags)) wait_event(i915->gpu_error.reset_queue, !test_bit(I915_RESET_BACKOFF, @@ -353,54 +390,128 @@ static int igt_global_reset(void *arg) return err; } -static int igt_reset_engine(void *arg) +static int __igt_reset_engine(struct drm_i915_private *i915, bool active) { - struct drm_i915_private *i915 = arg; struct intel_engine_cs *engine; enum intel_engine_id id; - unsigned int reset_count, reset_engine_count; + struct hang h; int err = 0; - /* Check that we can issue a global GPU and engine reset */ + /* Check that we can issue an engine reset on an idle engine (no-op) */ if (!intel_has_reset_engine(i915)) return 0; + if (active) { + mutex_lock(&i915->drm.struct_mutex); + err = hang_init(&h, i915); + mutex_unlock(&i915->drm.struct_mutex); + if (err) + return err; + } + for_each_engine(engine, i915, id) { - set_bit(I915_RESET_ENGINE + engine->id, &i915->gpu_error.flags); + unsigned int reset_count, reset_engine_count; + IGT_TIMEOUT(end_time); + + if (active && !intel_engine_can_store_dword(engine)) + continue; + reset_count = i915_reset_count(&i915->gpu_error); reset_engine_count = i915_reset_engine_count(&i915->gpu_error, engine); - err = i915_reset_engine(engine, I915_RESET_QUIET); - if (err) { - pr_err("i915_reset_engine failed\n"); - break; - } + set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags); + do { + if (active) { + struct drm_i915_gem_request *rq; + + mutex_lock(&i915->drm.struct_mutex); + rq = hang_create_request(&h, engine, + i915->kernel_context); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + mutex_unlock(&i915->drm.struct_mutex); + break; + } + + i915_gem_request_get(rq); + __i915_add_request(rq, true); + mutex_unlock(&i915->drm.struct_mutex); + + if (!wait_for_hang(&h, rq)) { + struct drm_printer p = drm_info_printer(i915->drm.dev); + + pr_err("%s: Failed to start request %x, at %x\n", + __func__, rq->fence.seqno, hws_seqno(&h, rq)); + intel_engine_dump(engine, &p, + "%s\n", engine->name); + + i915_gem_request_put(rq); + err = -EIO; + break; + } - if (i915_reset_count(&i915->gpu_error) != reset_count) { - pr_err("Full GPU reset recorded! (engine reset expected)\n"); - err = -EINVAL; - break; - } + i915_gem_request_put(rq); + } + + engine->hangcheck.stalled = true; + engine->hangcheck.seqno = + intel_engine_get_seqno(engine); + + err = i915_reset_engine(engine, I915_RESET_QUIET); + if (err) { + pr_err("i915_reset_engine failed\n"); + break; + } + + if (i915_reset_count(&i915->gpu_error) != reset_count) { + pr_err("Full GPU reset recorded! 
(engine reset expected)\n"); + err = -EINVAL; + break; + } + + reset_engine_count += active; + if (i915_reset_engine_count(&i915->gpu_error, engine) != + reset_engine_count) { + pr_err("%s engine reset %srecorded!\n", + engine->name, active ? "not " : ""); + err = -EINVAL; + break; + } + + engine->hangcheck.stalled = false; + } while (time_before(jiffies, end_time)); + clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags); - if (i915_reset_engine_count(&i915->gpu_error, engine) == - reset_engine_count) { - pr_err("No %s engine reset recorded!\n", engine->name); - err = -EINVAL; + if (err) break; - } - clear_bit(I915_RESET_ENGINE + engine->id, - &i915->gpu_error.flags); + cond_resched(); } if (i915_terminally_wedged(&i915->gpu_error)) err = -EIO; + if (active) { + mutex_lock(&i915->drm.struct_mutex); + hang_fini(&h); + mutex_unlock(&i915->drm.struct_mutex); + } + return err; } +static int igt_reset_idle_engine(void *arg) +{ + return __igt_reset_engine(arg, false); +} + +static int igt_reset_active_engine(void *arg) +{ + return __igt_reset_engine(arg, true); +} + static int active_engine(void *data) { struct intel_engine_cs *engine = data; @@ -462,11 +573,12 @@ err_file: return err; } -static int igt_reset_active_engines(void *arg) +static int __igt_reset_engine_others(struct drm_i915_private *i915, + bool active) { - struct drm_i915_private *i915 = arg; - struct intel_engine_cs *engine, *active; + struct intel_engine_cs *engine, *other; enum intel_engine_id id, tmp; + struct hang h; int err = 0; /* Check that issuing a reset on one engine does not interfere @@ -476,24 +588,36 @@ static int igt_reset_active_engines(void *arg) if (!intel_has_reset_engine(i915)) return 0; + if (active) { + mutex_lock(&i915->drm.struct_mutex); + err = hang_init(&h, i915); + mutex_unlock(&i915->drm.struct_mutex); + if (err) + return err; + } + for_each_engine(engine, i915, id) { - struct task_struct *threads[I915_NUM_ENGINES]; + struct task_struct *threads[I915_NUM_ENGINES] = {}; unsigned long resets[I915_NUM_ENGINES]; unsigned long global = i915_reset_count(&i915->gpu_error); + unsigned long count = 0; IGT_TIMEOUT(end_time); + if (active && !intel_engine_can_store_dword(engine)) + continue; + memset(threads, 0, sizeof(threads)); - for_each_engine(active, i915, tmp) { + for_each_engine(other, i915, tmp) { struct task_struct *tsk; - if (active == engine) - continue; - resets[tmp] = i915_reset_engine_count(&i915->gpu_error, - active); + other); - tsk = kthread_run(active_engine, active, - "igt/%s", active->name); + if (other == engine) + continue; + + tsk = kthread_run(active_engine, other, + "igt/%s", other->name); if (IS_ERR(tsk)) { err = PTR_ERR(tsk); goto unwind; @@ -503,20 +627,70 @@ static int igt_reset_active_engines(void *arg) get_task_struct(tsk); } - set_bit(I915_RESET_ENGINE + engine->id, &i915->gpu_error.flags); + set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags); do { + if (active) { + struct drm_i915_gem_request *rq; + + mutex_lock(&i915->drm.struct_mutex); + rq = hang_create_request(&h, engine, + i915->kernel_context); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + mutex_unlock(&i915->drm.struct_mutex); + break; + } + + i915_gem_request_get(rq); + __i915_add_request(rq, true); + mutex_unlock(&i915->drm.struct_mutex); + + if (!wait_for_hang(&h, rq)) { + struct drm_printer p = drm_info_printer(i915->drm.dev); + + pr_err("%s: Failed to start request %x, at %x\n", + __func__, rq->fence.seqno, hws_seqno(&h, rq)); + intel_engine_dump(engine, &p, + "%s\n", engine->name); + + i915_gem_request_put(rq); 
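/*
 * Illustration (not part of the patch): wait_for_hang(), hoisted
 * earlier in this file, is a two-phase poll: a 10us busy-wait for the
 * fast path, then a sleeping wait of up to 1000ms. Both helpers return
 * 0 on success, so the shape is:
 *
 *	ok = wait_for_us(spinner_started(), 10) == 0 ||
 *	     wait_for(spinner_started(), 1000) == 0;
 *
 * where spinner_started() stands in for the seqno comparison. Because
 * the selftests now run with hangcheck disabled (see the
 * saved_hangcheck handling below), the tests also mark
 * engine->hangcheck.stalled by hand before each i915_reset_engine().
 */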
+ err = -EIO; + break; + } + + i915_gem_request_put(rq); + } + + engine->hangcheck.stalled = true; + engine->hangcheck.seqno = + intel_engine_get_seqno(engine); + err = i915_reset_engine(engine, I915_RESET_QUIET); if (err) { - pr_err("i915_reset_engine(%s) failed, err=%d\n", - engine->name, err); + pr_err("i915_reset_engine(%s:%s) failed, err=%d\n", + engine->name, active ? "active" : "idle", err); break; } + + engine->hangcheck.stalled = false; + count++; } while (time_before(jiffies, end_time)); - clear_bit(I915_RESET_ENGINE + engine->id, - &i915->gpu_error.flags); + clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags); + pr_info("i915_reset_engine(%s:%s): %lu resets\n", + engine->name, active ? "active" : "idle", count); + + if (i915_reset_engine_count(&i915->gpu_error, engine) - + resets[engine->id] != (active ? count : 0)) { + pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n", + engine->name, active ? "active" : "idle", count, + i915_reset_engine_count(&i915->gpu_error, + engine) - resets[engine->id]); + if (!err) + err = -EINVAL; + } unwind: - for_each_engine(active, i915, tmp) { + for_each_engine(other, i915, tmp) { int ret; if (!threads[tmp]) @@ -524,27 +698,29 @@ unwind: ret = kthread_stop(threads[tmp]); if (ret) { - pr_err("kthread for active engine %s failed, err=%d\n", - active->name, ret); + pr_err("kthread for other engine %s failed, err=%d\n", + other->name, ret); if (!err) err = ret; } put_task_struct(threads[tmp]); if (resets[tmp] != i915_reset_engine_count(&i915->gpu_error, - active)) { + other)) { pr_err("Innocent engine %s was reset (count=%ld)\n", - active->name, + other->name, i915_reset_engine_count(&i915->gpu_error, - active) - resets[tmp]); - err = -EIO; + other) - resets[tmp]); + if (!err) + err = -EINVAL; } } if (global != i915_reset_count(&i915->gpu_error)) { pr_err("Global reset (count=%ld)!\n", i915_reset_count(&i915->gpu_error) - global); - err = -EIO; + if (!err) + err = -EINVAL; } if (err) @@ -556,9 +732,25 @@ unwind: if (i915_terminally_wedged(&i915->gpu_error)) err = -EIO; + if (active) { + mutex_lock(&i915->drm.struct_mutex); + hang_fini(&h); + mutex_unlock(&i915->drm.struct_mutex); + } + return err; } +static int igt_reset_idle_engine_others(void *arg) +{ + return __igt_reset_engine_others(arg, false); +} + +static int igt_reset_active_engine_others(void *arg) +{ + return __igt_reset_engine_others(arg, true); +} + static u32 fake_hangcheck(struct drm_i915_gem_request *rq) { u32 reset_count; @@ -574,16 +766,6 @@ static u32 fake_hangcheck(struct drm_i915_gem_request *rq) return reset_count; } -static bool wait_for_hang(struct hang *h, struct drm_i915_gem_request *rq) -{ - return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq), - rq->fence.seqno), - 10) && - wait_for(i915_seqno_passed(hws_seqno(h, rq), - rq->fence.seqno), - 1000)); -} - static int igt_wait_reset(void *arg) { struct drm_i915_private *i915 = arg; @@ -617,8 +799,8 @@ static int igt_wait_reset(void *arg) if (!wait_for_hang(&h, rq)) { struct drm_printer p = drm_info_printer(i915->drm.dev); - pr_err("Failed to start request %x, at %x\n", - rq->fence.seqno, hws_seqno(&h, rq)); + pr_err("%s: Failed to start request %x, at %x\n", + __func__, rq->fence.seqno, hws_seqno(&h, rq)); intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); i915_reset(i915, 0); @@ -712,8 +894,8 @@ static int igt_reset_queue(void *arg) if (!wait_for_hang(&h, prev)) { struct drm_printer p = drm_info_printer(i915->drm.dev); - pr_err("Failed to start request %x, at %x\n", - prev->fence.seqno, 
hws_seqno(&h, prev)); + pr_err("%s: Failed to start request %x, at %x\n", + __func__, prev->fence.seqno, hws_seqno(&h, prev)); intel_engine_dump(prev->engine, &p, "%s\n", prev->engine->name); @@ -819,8 +1001,8 @@ static int igt_handle_error(void *arg) if (!wait_for_hang(&h, rq)) { struct drm_printer p = drm_info_printer(i915->drm.dev); - pr_err("Failed to start request %x, at %x\n", - rq->fence.seqno, hws_seqno(&h, rq)); + pr_err("%s: Failed to start request %x, at %x\n", + __func__, rq->fence.seqno, hws_seqno(&h, rq)); intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); i915_reset(i915, 0); @@ -864,21 +1046,26 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915) static const struct i915_subtest tests[] = { SUBTEST(igt_global_reset), /* attempt to recover GPU first */ SUBTEST(igt_hang_sanitycheck), - SUBTEST(igt_reset_engine), - SUBTEST(igt_reset_active_engines), + SUBTEST(igt_reset_idle_engine), + SUBTEST(igt_reset_active_engine), + SUBTEST(igt_reset_idle_engine_others), + SUBTEST(igt_reset_active_engine_others), SUBTEST(igt_wait_reset), SUBTEST(igt_reset_queue), SUBTEST(igt_handle_error), }; + bool saved_hangcheck; int err; if (!intel_has_gpu_reset(i915)) return 0; intel_runtime_pm_get(i915); + saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck); err = i915_subtests(tests, i915); + i915_modparams.enable_hangcheck = saved_hangcheck; intel_runtime_pm_put(i915); return err; diff --git a/drivers/gpu/drm/imx/imx-drm-core.c b/drivers/gpu/drm/imx/imx-drm-core.c index 3f2b4afcb8a7..1d053bbefc02 100644 --- a/drivers/gpu/drm/imx/imx-drm-core.c +++ b/drivers/gpu/drm/imx/imx-drm-core.c @@ -257,6 +257,7 @@ static int imx_drm_bind(struct device *dev) drm->mode_config.max_height = 4096; drm->mode_config.funcs = &imx_drm_mode_config_funcs; drm->mode_config.helper_private = &imx_drm_mode_config_helpers; + drm->mode_config.allow_fb_modifiers = true; drm_mode_config_init(drm); diff --git a/drivers/gpu/drm/imx/ipuv3-plane.c b/drivers/gpu/drm/imx/ipuv3-plane.c index 5a67daedcf4d..57ed56d8623f 100644 --- a/drivers/gpu/drm/imx/ipuv3-plane.c +++ b/drivers/gpu/drm/imx/ipuv3-plane.c @@ -77,6 +77,18 @@ static const uint32_t ipu_plane_formats[] = { DRM_FORMAT_BGRX8888_A8, }; +static const uint64_t ipu_format_modifiers[] = { + DRM_FORMAT_MOD_LINEAR, + DRM_FORMAT_MOD_INVALID +}; + +static const uint64_t pre_format_modifiers[] = { + DRM_FORMAT_MOD_LINEAR, + DRM_FORMAT_MOD_VIVANTE_TILED, + DRM_FORMAT_MOD_VIVANTE_SUPER_TILED, + DRM_FORMAT_MOD_INVALID +}; + int ipu_plane_irq(struct ipu_plane *ipu_plane) { return ipu_idmac_channel_irq(ipu_plane->ipu, ipu_plane->ipu_ch, @@ -303,6 +315,22 @@ void ipu_plane_destroy_state(struct drm_plane *plane, kfree(ipu_state); } +static bool ipu_plane_format_mod_supported(struct drm_plane *plane, + uint32_t format, uint64_t modifier) +{ + struct ipu_soc *ipu = to_ipu_plane(plane)->ipu; + + /* linear is supported for all planes and formats */ + if (modifier == DRM_FORMAT_MOD_LINEAR) + return true; + + /* without a PRG there are no supported modifiers */ + if (!ipu_prg_present(ipu)) + return false; + + return ipu_prg_format_supported(ipu, format, modifier); +} + static const struct drm_plane_funcs ipu_plane_funcs = { .update_plane = drm_atomic_helper_update_plane, .disable_plane = drm_atomic_helper_disable_plane, @@ -310,6 +338,7 @@ static const struct drm_plane_funcs ipu_plane_funcs = { .reset = ipu_plane_state_reset, .atomic_duplicate_state = ipu_plane_duplicate_state, .atomic_destroy_state = ipu_plane_destroy_state, + .format_mod_supported = 
ipu_plane_format_mod_supported, }; static int ipu_plane_atomic_check(struct drm_plane *plane, @@ -550,8 +579,8 @@ static void ipu_plane_atomic_update(struct drm_plane *plane, ipu_prg_channel_configure(ipu_plane->ipu_ch, axi_id, drm_rect_width(&state->src) >> 16, drm_rect_height(&state->src) >> 16, - fb->pitches[0], - fb->format->format, &eba); + fb->pitches[0], fb->format->format, + fb->modifier, &eba); } if (old_state->fb && !drm_atomic_crtc_needs_modeset(crtc_state)) { @@ -700,18 +729,71 @@ static const struct drm_plane_helper_funcs ipu_plane_helper_funcs = { int ipu_planes_assign_pre(struct drm_device *dev, struct drm_atomic_state *state) { + struct drm_crtc_state *old_crtc_state, *crtc_state; struct drm_plane_state *plane_state; + struct ipu_plane_state *ipu_state; + struct ipu_plane *ipu_plane; struct drm_plane *plane; + struct drm_crtc *crtc; int available_pres = ipu_prg_max_active_channels(); - int i; + int ret, i; + for_each_oldnew_crtc_in_state(state, crtc, old_crtc_state, crtc_state, i) { + ret = drm_atomic_add_affected_planes(state, crtc); + if (ret) + return ret; + } + + /* + * We are going over the planes in 2 passes: first we assign PREs to + * planes with a tiling modifier, which need the PREs to resolve into + * linear. Any failure to assign a PRE there is fatal. In the second + * pass we try to assign PREs to linear FBs, to improve memory access + * patterns for them. Failure at this point is non-fatal, as we can + * scan out linear FBs without a PRE. + */ for_each_new_plane_in_state(state, plane, plane_state, i) { - struct ipu_plane_state *ipu_state = - to_ipu_plane_state(plane_state); - struct ipu_plane *ipu_plane = to_ipu_plane(plane); + ipu_state = to_ipu_plane_state(plane_state); + ipu_plane = to_ipu_plane(plane); + + if (!plane_state->fb) { + ipu_state->use_pre = false; + continue; + } + + if (!(plane_state->fb->flags & DRM_MODE_FB_MODIFIERS) || + plane_state->fb->modifier == DRM_FORMAT_MOD_LINEAR) + continue; + + if (!ipu_prg_present(ipu_plane->ipu) || !available_pres) + return -EINVAL; + + if (!ipu_prg_format_supported(ipu_plane->ipu, + plane_state->fb->format->format, + plane_state->fb->modifier)) + return -EINVAL; + + ipu_state->use_pre = true; + available_pres--; + } + + for_each_new_plane_in_state(state, plane, plane_state, i) { + ipu_state = to_ipu_plane_state(plane_state); + ipu_plane = to_ipu_plane(plane); + + if (!plane_state->fb) { + ipu_state->use_pre = false; + continue; + } + + if ((plane_state->fb->flags & DRM_MODE_FB_MODIFIERS) && + plane_state->fb->modifier != DRM_FORMAT_MOD_LINEAR) + continue; + + /* make sure that modifier is initialized */ + plane_state->fb->modifier = DRM_FORMAT_MOD_LINEAR; if (ipu_prg_present(ipu_plane->ipu) && available_pres && - plane_state->fb && ipu_prg_format_supported(ipu_plane->ipu, plane_state->fb->format->format, plane_state->fb->modifier)) { @@ -731,6 +813,7 @@ struct ipu_plane *ipu_plane_init(struct drm_device *dev, struct ipu_soc *ipu, enum drm_plane_type type) { struct ipu_plane *ipu_plane; + const uint64_t *modifiers = ipu_format_modifiers; int ret; DRM_DEBUG_KMS("channel %d, dp flow %d, possible_crtcs=0x%x\n", @@ -746,10 +829,13 @@ struct ipu_plane *ipu_plane_init(struct drm_device *dev, struct ipu_soc *ipu, ipu_plane->dma = dma; ipu_plane->dp_flow = dp; + if (ipu_prg_present(ipu)) + modifiers = pre_format_modifiers; + ret = drm_universal_plane_init(dev, &ipu_plane->base, possible_crtcs, &ipu_plane_funcs, ipu_plane_formats, ARRAY_SIZE(ipu_plane_formats), - NULL, type, NULL); + modifiers, type, NULL); if (ret) { 
DRM_ERROR("failed to initialize plane\n"); kfree(ipu_plane); diff --git a/drivers/gpu/drm/stm/dw_mipi_dsi-stm.c b/drivers/gpu/drm/stm/dw_mipi_dsi-stm.c index 82dcb20cdaa3..fd02506274da 100644 --- a/drivers/gpu/drm/stm/dw_mipi_dsi-stm.c +++ b/drivers/gpu/drm/stm/dw_mipi_dsi-stm.c @@ -290,11 +290,6 @@ static int dw_mipi_dsi_stm_probe(struct platform_device *pdev) return -ENOMEM; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) { - DRM_ERROR("Unable to get resource\n"); - return -ENODEV; - } - dsi->base = devm_ioremap_resource(dev, res); if (IS_ERR(dsi->base)) { DRM_ERROR("Unable to get dsi registers\n"); diff --git a/drivers/gpu/drm/stm/ltdc.c b/drivers/gpu/drm/stm/ltdc.c index 394613b0fd46..6dc5d4ec4e17 100644 --- a/drivers/gpu/drm/stm/ltdc.c +++ b/drivers/gpu/drm/stm/ltdc.c @@ -901,12 +901,6 @@ int ltdc_load(struct drm_device *ddev) } res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) { - DRM_ERROR("Unable to get resource\n"); - ret = -ENODEV; - goto err; - } - ldev->regs = devm_ioremap_resource(dev, res); if (IS_ERR(ldev->regs)) { DRM_ERROR("Unable to get ltdc registers\n"); diff --git a/drivers/gpu/ipu-v3/Kconfig b/drivers/gpu/ipu-v3/Kconfig index 87a20b3dcf7a..fe6f8c5b4445 100644 --- a/drivers/gpu/ipu-v3/Kconfig +++ b/drivers/gpu/ipu-v3/Kconfig @@ -1,7 +1,9 @@ config IMX_IPUV3_CORE tristate "IPUv3 core support" - depends on SOC_IMX5 || SOC_IMX6Q || ARCH_MULTIPLATFORM + depends on SOC_IMX5 || SOC_IMX6Q || ARCH_MULTIPLATFORM || COMPILE_TEST depends on DRM || !DRM # if DRM=m, this can't be 'y' + select BITREVERSE + select GENERIC_ALLOCATOR if DRM select GENERIC_IRQ_CHIP help Choose this if you have a i.MX5/6 system and want to use the Image diff --git a/drivers/gpu/ipu-v3/ipu-cpmem.c b/drivers/gpu/ipu-v3/ipu-cpmem.c index 1cb82f445f91..bb9c087e6c0d 100644 --- a/drivers/gpu/ipu-v3/ipu-cpmem.c +++ b/drivers/gpu/ipu-v3/ipu-cpmem.c @@ -12,6 +12,7 @@ #include <linux/types.h> #include <linux/bitrev.h> #include <linux/io.h> +#include <linux/sizes.h> #include <drm/drm_fourcc.h> #include "ipu-prv.h" diff --git a/drivers/gpu/ipu-v3/ipu-ic.c b/drivers/gpu/ipu-v3/ipu-ic.c index 321eb983c2f5..67cc820253a9 100644 --- a/drivers/gpu/ipu-v3/ipu-ic.c +++ b/drivers/gpu/ipu-v3/ipu-ic.c @@ -17,6 +17,7 @@ #include <linux/bitrev.h> #include <linux/io.h> #include <linux/err.h> +#include <linux/sizes.h> #include "ipu-prv.h" /* IC Register Offsets */ diff --git a/drivers/gpu/ipu-v3/ipu-pre.c b/drivers/gpu/ipu-v3/ipu-pre.c index c860a7997cb5..f1cec3d70498 100644 --- a/drivers/gpu/ipu-v3/ipu-pre.c +++ b/drivers/gpu/ipu-v3/ipu-pre.c @@ -49,6 +49,10 @@ #define IPU_PRE_TPR_CTRL 0x070 #define IPU_PRE_TPR_CTRL_TILE_FORMAT(v) ((v & 0xff) << 0) #define IPU_PRE_TPR_CTRL_TILE_FORMAT_MASK 0xff +#define IPU_PRE_TPR_CTRL_TILE_FORMAT_16_BIT (1 << 0) +#define IPU_PRE_TPR_CTRL_TILE_FORMAT_SPLIT_BUF (1 << 4) +#define IPU_PRE_TPR_CTRL_TILE_FORMAT_SINGLE_BUF (1 << 5) +#define IPU_PRE_TPR_CTRL_TILE_FORMAT_SUPER_TILED (1 << 6) #define IPU_PRE_PREFETCH_ENG_CTRL 0x080 #define IPU_PRE_PREF_ENG_CTRL_PREFETCH_EN (1 << 0) @@ -147,7 +151,7 @@ int ipu_pre_get(struct ipu_pre *pre) val = IPU_PRE_CTRL_HANDSHAKE_ABORT_SKIP_EN | IPU_PRE_CTRL_HANDSHAKE_EN | IPU_PRE_CTRL_TPR_REST_SEL | - IPU_PRE_CTRL_BLOCK_16 | IPU_PRE_CTRL_SDW_UPDATE; + IPU_PRE_CTRL_SDW_UPDATE; writel(val, pre->regs + IPU_PRE_CTRL); pre->in_use = true; @@ -163,14 +167,17 @@ void ipu_pre_put(struct ipu_pre *pre) void ipu_pre_configure(struct ipu_pre *pre, unsigned int width, unsigned int height, unsigned int stride, u32 format, - unsigned int bufaddr) + 
uint64_t modifier, unsigned int bufaddr) { const struct drm_format_info *info = drm_format_info(format); u32 active_bpp = info->cpp[0] >> 1; u32 val; /* calculate safe window for ctrl register updates */ - pre->safe_window_end = height - 2; + if (modifier == DRM_FORMAT_MOD_LINEAR) + pre->safe_window_end = height - 2; + else + pre->safe_window_end = DIV_ROUND_UP(height, 4) - 1; writel(bufaddr, pre->regs + IPU_PRE_CUR_BUF); writel(bufaddr, pre->regs + IPU_PRE_NEXT_BUF); @@ -203,9 +210,25 @@ void ipu_pre_configure(struct ipu_pre *pre, unsigned int width, writel(pre->buffer_paddr, pre->regs + IPU_PRE_STORE_ENG_ADDR); + val = readl(pre->regs + IPU_PRE_TPR_CTRL); + val &= ~IPU_PRE_TPR_CTRL_TILE_FORMAT_MASK; + if (modifier != DRM_FORMAT_MOD_LINEAR) { + /* only support single buffer formats for now */ + val |= IPU_PRE_TPR_CTRL_TILE_FORMAT_SINGLE_BUF; + if (modifier == DRM_FORMAT_MOD_VIVANTE_SUPER_TILED) + val |= IPU_PRE_TPR_CTRL_TILE_FORMAT_SUPER_TILED; + if (info->cpp[0] == 2) + val |= IPU_PRE_TPR_CTRL_TILE_FORMAT_16_BIT; + } + writel(val, pre->regs + IPU_PRE_TPR_CTRL); + val = readl(pre->regs + IPU_PRE_CTRL); val |= IPU_PRE_CTRL_EN_REPEAT | IPU_PRE_CTRL_ENABLE | IPU_PRE_CTRL_SDW_UPDATE; + if (modifier == DRM_FORMAT_MOD_LINEAR) + val &= ~IPU_PRE_CTRL_BLOCK_EN; + else + val |= IPU_PRE_CTRL_BLOCK_EN; writel(val, pre->regs + IPU_PRE_CTRL); } diff --git a/drivers/gpu/ipu-v3/ipu-prg.c b/drivers/gpu/ipu-v3/ipu-prg.c index 0013ca9f72c8..067365c733c6 100644 --- a/drivers/gpu/ipu-v3/ipu-prg.c +++ b/drivers/gpu/ipu-v3/ipu-prg.c @@ -20,6 +20,7 @@ #include <linux/module.h> #include <linux/of.h> #include <linux/platform_device.h> +#include <linux/pm_runtime.h> #include <linux/regmap.h> #include <video/imx-ipu-v3.h> @@ -132,28 +133,25 @@ bool ipu_prg_format_supported(struct ipu_soc *ipu, uint32_t format, if (info->num_planes != 1) return false; - return true; + switch (modifier) { + case DRM_FORMAT_MOD_LINEAR: + case DRM_FORMAT_MOD_VIVANTE_TILED: + case DRM_FORMAT_MOD_VIVANTE_SUPER_TILED: + return true; + default: + return false; + } } EXPORT_SYMBOL_GPL(ipu_prg_format_supported); int ipu_prg_enable(struct ipu_soc *ipu) { struct ipu_prg *prg = ipu->prg_priv; - int ret; if (!prg) return 0; - ret = clk_prepare_enable(prg->clk_axi); - if (ret) - goto fail_disable_ipg; - - return 0; - -fail_disable_ipg: - clk_disable_unprepare(prg->clk_ipg); - - return ret; + return pm_runtime_get_sync(prg->dev); } EXPORT_SYMBOL_GPL(ipu_prg_enable); @@ -164,7 +162,7 @@ void ipu_prg_disable(struct ipu_soc *ipu) if (!prg) return; - clk_disable_unprepare(prg->clk_axi); + pm_runtime_put(prg->dev); } EXPORT_SYMBOL_GPL(ipu_prg_disable); @@ -255,7 +253,7 @@ void ipu_prg_channel_disable(struct ipuv3_channel *ipu_chan) if (!chan->enabled || prg_chan < 0) return; - clk_prepare_enable(prg->clk_ipg); + pm_runtime_get_sync(prg->dev); val = readl(prg->regs + IPU_PRG_CTL); val |= IPU_PRG_CTL_BYPASS(prg_chan); @@ -264,7 +262,7 @@ void ipu_prg_channel_disable(struct ipuv3_channel *ipu_chan) val = IPU_PRG_REG_UPDATE_REG_UPDATE; writel(val, prg->regs + IPU_PRG_REG_UPDATE); - clk_disable_unprepare(prg->clk_ipg); + pm_runtime_put(prg->dev); ipu_prg_put_pre(prg, prg_chan); @@ -275,7 +273,7 @@ EXPORT_SYMBOL_GPL(ipu_prg_channel_disable); int ipu_prg_channel_configure(struct ipuv3_channel *ipu_chan, unsigned int axi_id, unsigned int width, unsigned int height, unsigned int stride, - u32 format, unsigned long *eba) + u32 format, uint64_t modifier, unsigned long *eba) { int prg_chan = ipu_prg_ipu_to_prg_chan(ipu_chan->num); struct ipu_prg *prg = 
ipu_chan->ipu->prg_priv; @@ -296,14 +294,10 @@ int ipu_prg_channel_configure(struct ipuv3_channel *ipu_chan, return ret; ipu_pre_configure(prg->pres[chan->used_pre], - width, height, stride, format, *eba); + width, height, stride, format, modifier, *eba); - ret = clk_prepare_enable(prg->clk_ipg); - if (ret) { - ipu_prg_put_pre(prg, prg_chan); - return ret; - } + pm_runtime_get_sync(prg->dev); val = (stride - 1) & IPU_PRG_STRIDE_STRIDE_MASK; writel(val, prg->regs + IPU_PRG_STRIDE(prg_chan)); @@ -336,7 +330,7 @@ int ipu_prg_channel_configure(struct ipuv3_channel *ipu_chan, (val & IPU_PRG_STATUS_BUFFER1_READY(prg_chan)), 5, 1000); - clk_disable_unprepare(prg->clk_ipg); + pm_runtime_put(prg->dev); chan->enabled = true; return 0; @@ -384,6 +378,12 @@ static int ipu_prg_probe(struct platform_device *pdev) if (ret) return ret; + ret = clk_prepare_enable(prg->clk_axi); + if (ret) { + clk_disable_unprepare(prg->clk_ipg); + return ret; + } + /* init to free running mode */ val = readl(prg->regs + IPU_PRG_CTL); val |= IPU_PRG_CTL_SHADOW_EN; @@ -392,7 +392,8 @@ static int ipu_prg_probe(struct platform_device *pdev) /* disable address threshold */ writel(0xffffffff, prg->regs + IPU_PRG_THD); - clk_disable_unprepare(prg->clk_ipg); + pm_runtime_set_active(dev); + pm_runtime_enable(dev); prg->dev = dev; platform_set_drvdata(pdev, prg); @@ -414,6 +415,40 @@ static int ipu_prg_remove(struct platform_device *pdev) return 0; } +#ifdef CONFIG_PM +static int prg_suspend(struct device *dev) +{ + struct ipu_prg *prg = dev_get_drvdata(dev); + + clk_disable_unprepare(prg->clk_axi); + clk_disable_unprepare(prg->clk_ipg); + + return 0; +} + +static int prg_resume(struct device *dev) +{ + struct ipu_prg *prg = dev_get_drvdata(dev); + int ret; + + ret = clk_prepare_enable(prg->clk_ipg); + if (ret) + return ret; + + ret = clk_prepare_enable(prg->clk_axi); + if (ret) { + clk_disable_unprepare(prg->clk_ipg); + return ret; + } + + return 0; +} +#endif + +static const struct dev_pm_ops prg_pm_ops = { + SET_RUNTIME_PM_OPS(prg_suspend, prg_resume, NULL) +}; + static const struct of_device_id ipu_prg_dt_ids[] = { { .compatible = "fsl,imx6qp-prg", }, { /* sentinel */ }, @@ -424,6 +459,7 @@ struct platform_driver ipu_prg_drv = { .remove = ipu_prg_remove, .driver = { .name = "imx-ipu-prg", + .pm = &prg_pm_ops, .of_match_table = ipu_prg_dt_ids, }, }; diff --git a/drivers/gpu/ipu-v3/ipu-prv.h b/drivers/gpu/ipu-v3/ipu-prv.h index ac4b8d658500..d6beee99b6b8 100644 --- a/drivers/gpu/ipu-v3/ipu-prv.h +++ b/drivers/gpu/ipu-v3/ipu-prv.h @@ -269,8 +269,8 @@ int ipu_pre_get(struct ipu_pre *pre); void ipu_pre_put(struct ipu_pre *pre); u32 ipu_pre_get_baddr(struct ipu_pre *pre); void ipu_pre_configure(struct ipu_pre *pre, unsigned int width, - unsigned int height, - unsigned int stride, u32 format, unsigned int bufaddr); + unsigned int height, unsigned int stride, u32 format, + uint64_t modifier, unsigned int bufaddr); void ipu_pre_update(struct ipu_pre *pre, unsigned int bufaddr); struct ipu_prg *ipu_prg_lookup_by_phandle(struct device *dev, const char *name, diff --git a/include/drm/drm_atomic.h b/include/drm/drm_atomic.h index 5afd6e364fb6..1c27526c499e 100644 --- a/include/drm/drm_atomic.h +++ b/include/drm/drm_atomic.h @@ -189,12 +189,40 @@ struct drm_private_state_funcs { struct drm_private_state *state); }; +/** + * struct drm_private_obj - base struct for driver private atomic object + * + * A driver private object is initialized by calling + * drm_atomic_private_obj_init() and cleaned up by calling + * drm_atomic_private_obj_fini(). 
+ * + * Currently only tracks the state update functions and the opaque driver + * private state itself, but in the future might also track which + * &drm_modeset_lock is required to duplicate and update this object's state. + */ struct drm_private_obj { + /** + * @state: Current atomic state for this driver private object. + */ struct drm_private_state *state; + /** + * @funcs: + * + * Functions to manipulate the state of this driver private object, see + * &drm_private_state_funcs. + */ const struct drm_private_state_funcs *funcs; }; +/** + * struct drm_private_state - base struct for driver private object state + * @state: backpointer to global drm_atomic_state + * + * Currently only contains a backpointer to the overall atomic update, but in + * the future also might hold synchronization information similar to e.g. + * &drm_crtc.commit. + */ struct drm_private_state { struct drm_atomic_state *state; }; @@ -218,6 +246,10 @@ struct __drm_private_objs_state { * @num_private_objs: size of the @private_objs array * @private_objs: pointer to array of private object pointers * @acquire_ctx: acquire context for this atomic modeset state update + * + * States are added to an atomic update by calling drm_atomic_get_crtc_state(), + * drm_atomic_get_plane_state(), drm_atomic_get_connector_state(), or for + * private state structures, drm_atomic_get_private_obj_state(). */ struct drm_atomic_state { struct kref ref; diff --git a/include/drm/drm_fb_helper.h b/include/drm/drm_fb_helper.h index 1494b12a984a..b069433e7fc1 100644 --- a/include/drm/drm_fb_helper.h +++ b/include/drm/drm_fb_helper.h @@ -33,6 +33,7 @@ struct drm_fb_helper; #include <drm/drm_crtc.h> +#include <drm/drm_device.h> #include <linux/kgdb.h> enum mode_set_atomic { @@ -275,6 +276,7 @@ void drm_fb_helper_unlink_fbi(struct drm_fb_helper *fb_helper); void drm_fb_helper_deferred_io(struct fb_info *info, struct list_head *pagelist); +int drm_fb_helper_defio_init(struct drm_fb_helper *fb_helper); ssize_t drm_fb_helper_sys_read(struct fb_info *info, char __user *buf, size_t count, loff_t *ppos); @@ -319,6 +321,13 @@ int drm_fb_helper_add_one_connector(struct drm_fb_helper *fb_helper, struct drm_ int drm_fb_helper_remove_one_connector(struct drm_fb_helper *fb_helper, struct drm_connector *connector); +int drm_fb_helper_fbdev_setup(struct drm_device *dev, + struct drm_fb_helper *fb_helper, + const struct drm_fb_helper_funcs *funcs, + unsigned int preferred_bpp, + unsigned int max_conn_count); +void drm_fb_helper_fbdev_teardown(struct drm_device *dev); + void drm_fb_helper_lastclose(struct drm_device *dev); void drm_fb_helper_output_poll_changed(struct drm_device *dev); #else @@ -332,11 +341,17 @@ static inline int drm_fb_helper_init(struct drm_device *dev, struct drm_fb_helper *helper, int max_conn) { + /* So drivers can use it to free the struct */ + helper->dev = dev; + dev->fb_helper = helper; + return 0; } static inline void drm_fb_helper_fini(struct drm_fb_helper *helper) { + if (helper && helper->dev) + helper->dev->fb_helper = NULL; } static inline int drm_fb_helper_blank(int blank, struct fb_info *info) @@ -409,6 +424,11 @@ static inline void drm_fb_helper_deferred_io(struct fb_info *info, { } +static inline int drm_fb_helper_defio_init(struct drm_fb_helper *fb_helper) +{ + return -ENODEV; +} + static inline ssize_t drm_fb_helper_sys_read(struct fb_info *info, char __user *buf, size_t count, loff_t *ppos) @@ -518,6 +538,24 @@ drm_fb_helper_remove_one_connector(struct drm_fb_helper *fb_helper, return 0; } +static inline int 
+drm_fb_helper_fbdev_setup(struct drm_device *dev, + struct drm_fb_helper *fb_helper, + const struct drm_fb_helper_funcs *funcs, + unsigned int preferred_bpp, + unsigned int max_conn_count) +{ + /* So drivers can use it to free the struct */ + dev->fb_helper = fb_helper; + + return 0; +} + +static inline void drm_fb_helper_fbdev_teardown(struct drm_device *dev) +{ + dev->fb_helper = NULL; +} + static inline void drm_fb_helper_lastclose(struct drm_device *dev) { } diff --git a/include/drm/drm_framebuffer.h b/include/drm/drm_framebuffer.h index dccb897951ba..c50502c656e5 100644 --- a/include/drm/drm_framebuffer.h +++ b/include/drm/drm_framebuffer.h @@ -121,6 +121,12 @@ struct drm_framebuffer { * @base: base modeset object structure, contains the reference count. */ struct drm_mode_object base; + + /** + * @comm: Name of the process allocating the fb, used for fb dumping. + */ + char comm[TASK_COMM_LEN]; + /** * @format: framebuffer format information */ diff --git a/include/drm/drm_mode_config.h b/include/drm/drm_mode_config.h index c6639e8e5007..2cb6f02df64a 100644 --- a/include/drm/drm_mode_config.h +++ b/include/drm/drm_mode_config.h @@ -269,6 +269,9 @@ struct drm_mode_config_funcs { * state easily. If this hook is implemented, drivers must also * implement @atomic_state_clear and @atomic_state_free. * + * Subclassing of &drm_atomic_state is deprecated in favour of using + * &drm_private_state and &drm_private_obj. + * * RETURNS: * * A new &drm_atomic_state on success or NULL on failure. @@ -290,6 +293,9 @@ struct drm_mode_config_funcs { * * Drivers that implement this must call drm_atomic_state_default_clear() * to clear common state. + * + * Subclassing of &drm_atomic_state is deprecated in favour of using + * &drm_private_state and &drm_private_obj. */ void (*atomic_state_clear)(struct drm_atomic_state *state); @@ -302,6 +308,9 @@ struct drm_mode_config_funcs { * * Drivers that implement this must call * drm_atomic_state_default_release() to release common resources. + * + * Subclassing of &drm_atomic_state is deprecated in favour of using + * &drm_private_state and &drm_private_obj. */ void (*atomic_state_free)(struct drm_atomic_state *state); }; diff --git a/include/drm/drm_print.h b/include/drm/drm_print.h index 5f9932e2246e..2a4a42e59a47 100644 --- a/include/drm/drm_print.h +++ b/include/drm/drm_print.h @@ -80,13 +80,13 @@ void __drm_printfn_debug(struct drm_printer *p, struct va_format *vaf); __printf(2, 3) void drm_printf(struct drm_printer *p, const char *f, ...); +__printf(2, 0) /** * drm_vprintf - print to a &drm_printer stream * @p: the &drm_printer * @fmt: format string * @va: the va_list */ -__printf(2, 0) static inline void drm_vprintf(struct drm_printer *p, const char *fmt, va_list *va) { diff --git a/include/drm/drm_syncobj.h b/include/drm/drm_syncobj.h index 9e8ba90c6784..3980602472c0 100644 --- a/include/drm/drm_syncobj.h +++ b/include/drm/drm_syncobj.h @@ -33,36 +33,31 @@ struct drm_syncobj_cb; /** * struct drm_syncobj - sync object. * - * This structure defines a generic sync object which wraps a dma fence. + * This structure defines a generic sync object which wraps a &dma_fence. */ struct drm_syncobj { /** - * @refcount: - * - * Reference count of this object. + * @refcount: Reference count of this object. */ struct kref refcount; /** * @fence: * NULL or a pointer to the fence bound to this object. * - * This field should not be used directly. Use drm_syncobj_fence_get - * and drm_syncobj_replace_fence instead. + * This field should not be used directly. 
Use drm_syncobj_fence_get() + * and drm_syncobj_replace_fence() instead. */ struct dma_fence __rcu *fence; /** - * @cb_list: - * List of callbacks to call when the fence gets replaced + * @cb_list: List of callbacks to call when the &fence gets replaced. */ struct list_head cb_list; /** - * @lock: - * locks cb_list and write-locks fence. + * @lock: Protects &cb_list and write-locks &fence. */ spinlock_t lock; /** - * @file: - * a file backing for this syncobj. + * @file: A file backing for this syncobj. */ struct file *file; }; @@ -73,7 +68,7 @@ typedef void (*drm_syncobj_func_t)(struct drm_syncobj *syncobj, /** * struct drm_syncobj_cb - callback for drm_syncobj_add_callback * @node: used by drm_syncob_add_callback to append this struct to - * syncobj::cb_list + * &drm_syncobj.cb_list * @func: drm_syncobj_func_t to call * * This struct will be initialized by drm_syncobj_add_callback, additional @@ -92,7 +87,7 @@ void drm_syncobj_free(struct kref *kref); * drm_syncobj_get - acquire a syncobj reference * @obj: sync object * - * This acquires additional reference to @obj. It is illegal to call this + * This acquires an additional reference to @obj. It is illegal to call this * without already holding a reference. No locks required. */ static inline void @@ -111,6 +106,17 @@ drm_syncobj_put(struct drm_syncobj *obj) kref_put(&obj->refcount, drm_syncobj_free); } +/** + * drm_syncobj_fence_get - get a reference to a fence in a sync object + * @syncobj: sync object. + * + * This acquires additional reference to &drm_syncobj.fence contained in @obj, + * if not NULL. It is illegal to call this without already holding a reference. + * No locks required. + * + * Returns: + * Either the fence of @obj or NULL if there's none. + */ static inline struct dma_fence * drm_syncobj_fence_get(struct drm_syncobj *syncobj) { diff --git a/include/drm/i915_pciids.h b/include/drm/i915_pciids.h index 972a25633525..5db0458dd832 100644 --- a/include/drm/i915_pciids.h +++ b/include/drm/i915_pciids.h @@ -373,24 +373,46 @@ /* CFL S */ #define INTEL_CFL_S_GT1_IDS(info) \ INTEL_VGA_DEVICE(0x3E90, info), /* SRV GT1 */ \ - INTEL_VGA_DEVICE(0x3E93, info) /* SRV GT1 */ + INTEL_VGA_DEVICE(0x3E93, info), /* SRV GT1 */ \ + INTEL_VGA_DEVICE(0x3E99, info) /* SRV GT1 */ #define INTEL_CFL_S_GT2_IDS(info) \ INTEL_VGA_DEVICE(0x3E91, info), /* SRV GT2 */ \ INTEL_VGA_DEVICE(0x3E92, info), /* SRV GT2 */ \ - INTEL_VGA_DEVICE(0x3E96, info) /* SRV GT2 */ + INTEL_VGA_DEVICE(0x3E96, info), /* SRV GT2 */ \ + INTEL_VGA_DEVICE(0x3E9A, info) /* SRV GT2 */ /* CFL H */ #define INTEL_CFL_H_GT2_IDS(info) \ INTEL_VGA_DEVICE(0x3E9B, info), /* Halo GT2 */ \ INTEL_VGA_DEVICE(0x3E94, info) /* Halo GT2 */ -/* CFL U */ +/* CFL U GT1 */ +#define INTEL_CFL_U_GT1_IDS(info) \ + INTEL_VGA_DEVICE(0x3EA1, info), \ + INTEL_VGA_DEVICE(0x3EA4, info) + +/* CFL U GT2 */ +#define INTEL_CFL_U_GT2_IDS(info) \ + INTEL_VGA_DEVICE(0x3EA0, info), \ + INTEL_VGA_DEVICE(0x3EA3, info), \ + INTEL_VGA_DEVICE(0x3EA9, info) + +/* CFL U GT3 */ #define INTEL_CFL_U_GT3_IDS(info) \ + INTEL_VGA_DEVICE(0x3EA2, info), /* ULT GT3 */ \ + INTEL_VGA_DEVICE(0x3EA5, info), /* ULT GT3 */ \ INTEL_VGA_DEVICE(0x3EA6, info), /* ULT GT3 */ \ INTEL_VGA_DEVICE(0x3EA7, info), /* ULT GT3 */ \ - INTEL_VGA_DEVICE(0x3EA8, info), /* ULT GT3 */ \ - INTEL_VGA_DEVICE(0x3EA5, info) /* ULT GT3 */ + INTEL_VGA_DEVICE(0x3EA8, info) /* ULT GT3 */ + +#define INTEL_CFL_IDS(info) \ + INTEL_CFL_S_GT1_IDS(info), \ + INTEL_CFL_S_GT2_IDS(info), \ + INTEL_CFL_H_GT2_IDS(info), \ + INTEL_CFL_U_GT1_IDS(info), \ + 
INTEL_CFL_U_GT2_IDS(info), \ + INTEL_CFL_U_GT3_IDS(info) /* CNL U 2+2 */ #define INTEL_CNL_U_GT2_IDS(info) \ diff --git a/include/uapi/drm/exynos_drm.h b/include/uapi/drm/exynos_drm.h index d01087b2a651..4a54305120e0 100644 --- a/include/uapi/drm/exynos_drm.h +++ b/include/uapi/drm/exynos_drm.h @@ -135,172 +135,6 @@ struct drm_exynos_g2d_exec { __u64 async; }; -enum drm_exynos_ops_id { - EXYNOS_DRM_OPS_SRC, - EXYNOS_DRM_OPS_DST, - EXYNOS_DRM_OPS_MAX, -}; - -struct drm_exynos_sz { - __u32 hsize; - __u32 vsize; -}; - -struct drm_exynos_pos { - __u32 x; - __u32 y; - __u32 w; - __u32 h; -}; - -enum drm_exynos_flip { - EXYNOS_DRM_FLIP_NONE = (0 << 0), - EXYNOS_DRM_FLIP_VERTICAL = (1 << 0), - EXYNOS_DRM_FLIP_HORIZONTAL = (1 << 1), - EXYNOS_DRM_FLIP_BOTH = EXYNOS_DRM_FLIP_VERTICAL | - EXYNOS_DRM_FLIP_HORIZONTAL, -}; - -enum drm_exynos_degree { - EXYNOS_DRM_DEGREE_0, - EXYNOS_DRM_DEGREE_90, - EXYNOS_DRM_DEGREE_180, - EXYNOS_DRM_DEGREE_270, -}; - -enum drm_exynos_planer { - EXYNOS_DRM_PLANAR_Y, - EXYNOS_DRM_PLANAR_CB, - EXYNOS_DRM_PLANAR_CR, - EXYNOS_DRM_PLANAR_MAX, -}; - -/** - * A structure for ipp supported property list. - * - * @version: version of this structure. - * @ipp_id: id of ipp driver. - * @count: count of ipp driver. - * @writeback: flag of writeback supporting. - * @flip: flag of flip supporting. - * @degree: flag of degree information. - * @csc: flag of csc supporting. - * @crop: flag of crop supporting. - * @scale: flag of scale supporting. - * @refresh_min: min hz of refresh. - * @refresh_max: max hz of refresh. - * @crop_min: crop min resolution. - * @crop_max: crop max resolution. - * @scale_min: scale min resolution. - * @scale_max: scale max resolution. - */ -struct drm_exynos_ipp_prop_list { - __u32 version; - __u32 ipp_id; - __u32 count; - __u32 writeback; - __u32 flip; - __u32 degree; - __u32 csc; - __u32 crop; - __u32 scale; - __u32 refresh_min; - __u32 refresh_max; - __u32 reserved; - struct drm_exynos_sz crop_min; - struct drm_exynos_sz crop_max; - struct drm_exynos_sz scale_min; - struct drm_exynos_sz scale_max; -}; - -/** - * A structure for ipp config. - * - * @ops_id: property of operation directions. - * @flip: property of mirror, flip. - * @degree: property of rotation degree. - * @fmt: property of image format. - * @sz: property of image size. - * @pos: property of image position(src-cropped,dst-scaler). - */ -struct drm_exynos_ipp_config { - __u32 ops_id; - __u32 flip; - __u32 degree; - __u32 fmt; - struct drm_exynos_sz sz; - struct drm_exynos_pos pos; -}; - -enum drm_exynos_ipp_cmd { - IPP_CMD_NONE, - IPP_CMD_M2M, - IPP_CMD_WB, - IPP_CMD_OUTPUT, - IPP_CMD_MAX, -}; - -/** - * A structure for ipp property. - * - * @config: source, destination config. - * @cmd: definition of command. - * @ipp_id: id of ipp driver. - * @prop_id: id of property. - * @refresh_rate: refresh rate. - */ -struct drm_exynos_ipp_property { - struct drm_exynos_ipp_config config[EXYNOS_DRM_OPS_MAX]; - __u32 cmd; - __u32 ipp_id; - __u32 prop_id; - __u32 refresh_rate; -}; - -enum drm_exynos_ipp_buf_type { - IPP_BUF_ENQUEUE, - IPP_BUF_DEQUEUE, -}; - -/** - * A structure for ipp buffer operations. - * - * @ops_id: operation directions. - * @buf_type: definition of buffer. - * @prop_id: id of property. - * @buf_id: id of buffer. - * @handle: Y, Cb, Cr each planar handle. - * @user_data: user data. 
- */ -struct drm_exynos_ipp_queue_buf { - __u32 ops_id; - __u32 buf_type; - __u32 prop_id; - __u32 buf_id; - __u32 handle[EXYNOS_DRM_PLANAR_MAX]; - __u32 reserved; - __u64 user_data; -}; - -enum drm_exynos_ipp_ctrl { - IPP_CTRL_PLAY, - IPP_CTRL_STOP, - IPP_CTRL_PAUSE, - IPP_CTRL_RESUME, - IPP_CTRL_MAX, -}; - -/** - * A structure for ipp start/stop operations. - * - * @prop_id: id of property. - * @ctrl: definition of control. - */ -struct drm_exynos_ipp_cmd_ctrl { - __u32 prop_id; - __u32 ctrl; -}; - #define DRM_EXYNOS_GEM_CREATE 0x00 #define DRM_EXYNOS_GEM_MAP 0x01 /* Reserved 0x03 ~ 0x05 for exynos specific gem ioctl */ @@ -312,11 +146,7 @@ struct drm_exynos_ipp_cmd_ctrl { #define DRM_EXYNOS_G2D_SET_CMDLIST 0x21 #define DRM_EXYNOS_G2D_EXEC 0x22 -/* IPP - Image Post Processing */ -#define DRM_EXYNOS_IPP_GET_PROPERTY 0x30 -#define DRM_EXYNOS_IPP_SET_PROPERTY 0x31 -#define DRM_EXYNOS_IPP_QUEUE_BUF 0x32 -#define DRM_EXYNOS_IPP_CMD_CTRL 0x33 +/* Reserved 0x30 ~ 0x33 for obsolete Exynos IPP ioctls */ #define DRM_IOCTL_EXYNOS_GEM_CREATE DRM_IOWR(DRM_COMMAND_BASE + \ DRM_EXYNOS_GEM_CREATE, struct drm_exynos_gem_create) @@ -335,18 +165,8 @@ struct drm_exynos_ipp_cmd_ctrl { #define DRM_IOCTL_EXYNOS_G2D_EXEC DRM_IOWR(DRM_COMMAND_BASE + \ DRM_EXYNOS_G2D_EXEC, struct drm_exynos_g2d_exec) -#define DRM_IOCTL_EXYNOS_IPP_GET_PROPERTY DRM_IOWR(DRM_COMMAND_BASE + \ - DRM_EXYNOS_IPP_GET_PROPERTY, struct drm_exynos_ipp_prop_list) -#define DRM_IOCTL_EXYNOS_IPP_SET_PROPERTY DRM_IOWR(DRM_COMMAND_BASE + \ - DRM_EXYNOS_IPP_SET_PROPERTY, struct drm_exynos_ipp_property) -#define DRM_IOCTL_EXYNOS_IPP_QUEUE_BUF DRM_IOWR(DRM_COMMAND_BASE + \ - DRM_EXYNOS_IPP_QUEUE_BUF, struct drm_exynos_ipp_queue_buf) -#define DRM_IOCTL_EXYNOS_IPP_CMD_CTRL DRM_IOWR(DRM_COMMAND_BASE + \ - DRM_EXYNOS_IPP_CMD_CTRL, struct drm_exynos_ipp_cmd_ctrl) - /* EXYNOS specific events */ #define DRM_EXYNOS_G2D_EVENT 0x80000000 -#define DRM_EXYNOS_IPP_EVENT 0x80000001 struct drm_exynos_g2d_event { struct drm_event base; @@ -357,16 +177,6 @@ struct drm_exynos_g2d_event { __u32 reserved; }; -struct drm_exynos_ipp_event { - struct drm_event base; - __u64 user_data; - __u32 tv_sec; - __u32 tv_usec; - __u32 prop_id; - __u32 reserved; - __u32 buf_id[EXYNOS_DRM_OPS_MAX]; -}; - #if defined(__cplusplus) } #endif diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 6e80501368ae..f4cab5b3ba9a 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -58,7 +58,8 @@ struct kfd_ioctl_create_queue_args { __u64 eop_buffer_address; /* to KFD */ __u64 eop_buffer_size; /* to KFD */ __u64 ctx_save_restore_address; /* to KFD */ - __u64 ctx_save_restore_size; /* to KFD */ + __u32 ctx_save_restore_size; /* to KFD */ + __u32 ctl_stack_size; /* to KFD */ }; struct kfd_ioctl_destroy_queue_args { @@ -261,6 +262,13 @@ struct kfd_ioctl_get_tile_config_args { */ }; +struct kfd_ioctl_set_trap_handler_args { + uint64_t tba_addr; /* to KFD */ + uint64_t tma_addr; /* to KFD */ + uint32_t gpu_id; /* to KFD */ + uint32_t pad; +}; + #define AMDKFD_IOCTL_BASE 'K' #define AMDKFD_IO(nr) _IO(AMDKFD_IOCTL_BASE, nr) #define AMDKFD_IOR(nr, type) _IOR(AMDKFD_IOCTL_BASE, nr, type) @@ -321,7 +329,10 @@ struct kfd_ioctl_get_tile_config_args { #define AMDKFD_IOC_GET_TILE_CONFIG \ AMDKFD_IOWR(0x12, struct kfd_ioctl_get_tile_config_args) +#define AMDKFD_IOC_SET_TRAP_HANDLER \ + AMDKFD_IOW(0x13, struct kfd_ioctl_set_trap_handler_args) + #define AMDKFD_COMMAND_START 0x01 -#define AMDKFD_COMMAND_END 0x13 +#define AMDKFD_COMMAND_END 0x14 #endif 
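
The new AMDKFD_IOC_SET_TRAP_HANDLER ioctl (command 0x13) takes a trap handler base address (tba_addr), a trap handler memory address (tma_addr) and a gpu_id. A minimal userspace sketch of invoking it follows, assuming the standard /dev/kfd device node; the gpu_id and the two addresses are dummy illustration values, not anything defined by this patch:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/kfd_ioctl.h>

/* Install a trap handler on one GPU; returns the raw ioctl result. */
static int kfd_set_trap_handler(int kfd_fd, uint32_t gpu_id,
				uint64_t tba, uint64_t tma)
{
	struct kfd_ioctl_set_trap_handler_args args = {
		.tba_addr = tba,	/* trap handler base address, to KFD */
		.tma_addr = tma,	/* trap handler memory address, to KFD */
		.gpu_id = gpu_id,	/* assumed to come from KFD topology */
	};

	return ioctl(kfd_fd, AMDKFD_IOC_SET_TRAP_HANDLER, &args);
}

int main(void)
{
	int fd = open("/dev/kfd", O_RDWR | O_CLOEXEC);

	if (fd < 0) {
		perror("open(/dev/kfd)");
		return 1;
	}
	/* gpu_id and addresses below are placeholders for illustration */
	if (kfd_set_trap_handler(fd, 0x1234, 0x100000, 0x110000))
		perror("AMDKFD_IOC_SET_TRAP_HANDLER");
	close(fd);
	return 0;
}

Note that AMDKFD_COMMAND_END is bumped from 0x13 to 0x14 in the same hunk, so the new command number falls inside the validated range.
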
diff --git a/include/video/imx-ipu-v3.h b/include/video/imx-ipu-v3.h index ce4c07688b13..abbad94e14a1 100644 --- a/include/video/imx-ipu-v3.h +++ b/include/video/imx-ipu-v3.h @@ -344,7 +344,7 @@ void ipu_prg_channel_disable(struct ipuv3_channel *ipu_chan); int ipu_prg_channel_configure(struct ipuv3_channel *ipu_chan, unsigned int axi_id, unsigned int width, unsigned int height, unsigned int stride, - u32 format, unsigned long *eba); + u32 format, uint64_t modifier, unsigned long *eba); /* * IPU CMOS Sensor Interface (csi) functions
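
The imx-ipu-v3.h hunk above completes the modifier plumbing: ipu_prg_channel_configure() now receives the framebuffer modifier so the PRE tiling resolver can be programmed per scanout buffer. On the DRM side, the ipuv3-plane.c changes in this series show the generic pattern any atomic driver uses to expose modifiers: pass a DRM_FORMAT_MOD_INVALID-terminated list to drm_universal_plane_init() and validate format/modifier pairs in .format_mod_supported(). A reduced sketch of that pattern follows, with hypothetical foo_* names standing in for a driver's own; the single-plane restriction mirrors the check in ipu_prg_format_supported():

#include <drm/drm_atomic_helper.h>
#include <drm/drm_fourcc.h>
#include <drm/drm_plane.h>

/* The modifier list must be terminated by DRM_FORMAT_MOD_INVALID. */
static const uint64_t foo_plane_modifiers[] = {
	DRM_FORMAT_MOD_LINEAR,
	DRM_FORMAT_MOD_VIVANTE_SUPER_TILED,
	DRM_FORMAT_MOD_INVALID
};

static bool foo_format_mod_supported(struct drm_plane *plane,
				     uint32_t format, uint64_t modifier)
{
	/* linear scanout always works */
	if (modifier == DRM_FORMAT_MOD_LINEAR)
		return true;

	/*
	 * Tiled layouts only for single-plane formats here; @format is
	 * always one of the formats the plane advertised at init time,
	 * so drm_format_info() cannot return NULL.
	 */
	return drm_format_info(format)->num_planes == 1;
}

static const struct drm_plane_funcs foo_plane_funcs = {
	.update_plane		= drm_atomic_helper_update_plane,
	.disable_plane		= drm_atomic_helper_disable_plane,
	.destroy		= drm_plane_cleanup,
	.reset			= drm_atomic_helper_plane_reset,
	.atomic_duplicate_state	= drm_atomic_helper_plane_duplicate_state,
	.atomic_destroy_state	= drm_atomic_helper_plane_destroy_state,
	.format_mod_supported	= foo_format_mod_supported,
};

At init time the list is handed to the plane core, as the ipuv3 patch does:

	ret = drm_universal_plane_init(dev, plane, possible_crtcs,
				       &foo_plane_funcs, foo_formats,
				       ARRAY_SIZE(foo_formats),
				       foo_plane_modifiers, type, NULL);

With that in place, userspace only sees modifiers the hardware path can actually resolve, and anything else is rejected at the atomic check stage.
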