Diffstat (limited to 'drivers/gpu/drm/i915/gt')
35 files changed, 2159 insertions, 1441 deletions
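Note: every emitter added below follows the same reservation pattern: intel_ring_begin() returns either a pointer into the ring or an errno encoded in the pointer itself, which the caller unpacks with PTR_ERR(). As a quick orientation, here is a minimal, self-contained userspace model of that error-pointer convention; ring_reserve() is a hypothetical stand-in, not a driver function:

    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>

    /*
     * Model of the kernel's ERR_PTR convention: the last page of the
     * address space encodes small negative errnos, so a single return
     * value can carry either a valid pointer or an error code.
     */
    #define MAX_ERRNO 4095
    #define IS_ERR(p) ((uintptr_t)(p) >= (uintptr_t)-MAX_ERRNO)
    #define ERR_PTR(e) ((void *)(uintptr_t)(e))
    #define PTR_ERR(p) ((long)(uintptr_t)(p))

    static uint32_t ring[64];

    /* Hypothetical stand-in for intel_ring_begin(): reserve n dwords. */
    static void *ring_reserve(unsigned int n)
    {
            if (n > 64)
                    return ERR_PTR(-ENOSPC); /* no space left in the ring */
            return ring;
    }

    int main(void)
    {
            uint32_t *cs = ring_reserve(128);

            if (IS_ERR(cs)) /* unpack the encoded errno */
                    printf("reserve failed: %ld\n", PTR_ERR(cs));
            return 0;
    }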
diff --git a/drivers/gpu/drm/i915/gt/gen2_engine_cs.c b/drivers/gpu/drm/i915/gt/gen2_engine_cs.c
new file mode 100644
index 000000000000..b491a64919c8
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/gen2_engine_cs.c
@@ -0,0 +1,329 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2020 Intel Corporation
+ */
+
+#include "gen2_engine_cs.h"
+#include "i915_drv.h"
+#include "intel_engine.h"
+#include "intel_gpu_commands.h"
+#include "intel_gt.h"
+#include "intel_gt_irq.h"
+#include "intel_ring.h"
+
+int gen2_emit_flush(struct i915_request *rq, u32 mode)
+{
+	unsigned int num_store_dw = 12;
+	u32 cmd, *cs;
+
+	cmd = MI_FLUSH;
+	if (mode & EMIT_INVALIDATE)
+		cmd |= MI_READ_FLUSH;
+
+	cs = intel_ring_begin(rq, 2 + 4 * num_store_dw);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
+
+	*cs++ = cmd;
+	while (num_store_dw--) {
+		*cs++ = MI_STORE_DWORD_INDEX;
+		*cs++ = I915_GEM_HWS_SCRATCH * sizeof(u32);
+		*cs++ = 0;
+		*cs++ = MI_FLUSH | MI_NO_WRITE_FLUSH;
+	}
+	*cs++ = cmd;
+
+	intel_ring_advance(rq, cs);
+
+	return 0;
+}
+
+int gen4_emit_flush_rcs(struct i915_request *rq, u32 mode)
+{
+	u32 cmd, *cs;
+	int i;
+
+	/*
+	 * read/write caches:
+	 *
+	 * I915_GEM_DOMAIN_RENDER is always invalidated, but is
+	 * only flushed if MI_NO_WRITE_FLUSH is unset. On 965, it is
+	 * also flushed at 2d versus 3d pipeline switches.
+	 *
+	 * read-only caches:
+	 *
+	 * I915_GEM_DOMAIN_SAMPLER is flushed on pre-965 if
+	 * MI_READ_FLUSH is set, and is always flushed on 965.
+	 *
+	 * I915_GEM_DOMAIN_COMMAND may not exist?
+	 *
+	 * I915_GEM_DOMAIN_INSTRUCTION, which exists on 965, is
+	 * invalidated when MI_EXE_FLUSH is set.
+	 *
+	 * I915_GEM_DOMAIN_VERTEX, which exists on 965, is
+	 * invalidated with every MI_FLUSH.
+	 *
+	 * TLBs:
+	 *
+	 * On 965, TLBs associated with I915_GEM_DOMAIN_COMMAND
+	 * and I915_GEM_DOMAIN_CPU are invalidated at PTE write and
+	 * I915_GEM_DOMAIN_RENDER and I915_GEM_DOMAIN_SAMPLER
+	 * are flushed at any MI_FLUSH.
+	 */
+
+	cmd = MI_FLUSH;
+	if (mode & EMIT_INVALIDATE) {
+		cmd |= MI_EXE_FLUSH;
+		if (IS_G4X(rq->engine->i915) || IS_GEN(rq->engine->i915, 5))
+			cmd |= MI_INVALIDATE_ISP;
+	}
+
+	i = 2;
+	if (mode & EMIT_INVALIDATE)
+		i += 20;
+
+	cs = intel_ring_begin(rq, i);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
+
+	*cs++ = cmd;
+
+	/*
+	 * A random delay to let the CS invalidate take effect? Without this
+	 * delay, the GPU relocation path fails as the CS does not see
+	 * the updated contents. Just as important, if we apply the flushes
+	 * to the EMIT_FLUSH branch (i.e. immediately after the relocation
+	 * write and before the invalidate on the next batch), the relocations
+	 * still fail. This implies that it is a delay following invalidation
+	 * that is required to reset the caches as opposed to a delay to
+	 * ensure the memory is written.
+	 */
+	if (mode & EMIT_INVALIDATE) {
+		*cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
+		*cs++ = intel_gt_scratch_offset(rq->engine->gt,
+						INTEL_GT_SCRATCH_FIELD_DEFAULT) |
+			PIPE_CONTROL_GLOBAL_GTT;
+		*cs++ = 0;
+		*cs++ = 0;
+
+		for (i = 0; i < 12; i++)
+			*cs++ = MI_FLUSH;
+
+		*cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
+		*cs++ = intel_gt_scratch_offset(rq->engine->gt,
+						INTEL_GT_SCRATCH_FIELD_DEFAULT) |
+			PIPE_CONTROL_GLOBAL_GTT;
+		*cs++ = 0;
+		*cs++ = 0;
+	}
+
+	*cs++ = cmd;
+
+	intel_ring_advance(rq, cs);
+
+	return 0;
+}
+
+int gen4_emit_flush_vcs(struct i915_request *rq, u32 mode)
+{
+	u32 *cs;
+
+	cs = intel_ring_begin(rq, 2);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
+
+	*cs++ = MI_FLUSH;
+	*cs++ = MI_NOOP;
+	intel_ring_advance(rq, cs);
+
+	return 0;
+}
+
+static u32 *__gen2_emit_breadcrumb(struct i915_request *rq, u32 *cs,
+				   int flush, int post)
+{
+	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
+	GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR);
+
+	*cs++ = MI_FLUSH;
+
+	while (flush--) {
+		*cs++ = MI_STORE_DWORD_INDEX;
+		*cs++ = I915_GEM_HWS_SCRATCH * sizeof(u32);
+		*cs++ = rq->fence.seqno;
+	}
+
+	while (post--) {
+		*cs++ = MI_STORE_DWORD_INDEX;
+		*cs++ = I915_GEM_HWS_SEQNO_ADDR;
+		*cs++ = rq->fence.seqno;
+	}
+
+	*cs++ = MI_USER_INTERRUPT;
+
+	rq->tail = intel_ring_offset(rq, cs);
+	assert_ring_tail_valid(rq->ring, rq->tail);
+
+	return cs;
+}
+
+u32 *gen3_emit_breadcrumb(struct i915_request *rq, u32 *cs)
+{
+	return __gen2_emit_breadcrumb(rq, cs, 16, 8);
+}
+
+u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
+{
+	return __gen2_emit_breadcrumb(rq, cs, 8, 8);
+}
+
+/* Just userspace ABI convention to limit the wa batch bo to a reasonable size */
+#define I830_BATCH_LIMIT SZ_256K
+#define I830_TLB_ENTRIES (2)
+#define I830_WA_SIZE max(I830_TLB_ENTRIES * SZ_4K, I830_BATCH_LIMIT)
+int i830_emit_bb_start(struct i915_request *rq,
+		       u64 offset, u32 len,
+		       unsigned int dispatch_flags)
+{
+	u32 *cs, cs_offset =
+		intel_gt_scratch_offset(rq->engine->gt,
+					INTEL_GT_SCRATCH_FIELD_DEFAULT);
+
+	GEM_BUG_ON(rq->engine->gt->scratch->size < I830_WA_SIZE);
+
+	cs = intel_ring_begin(rq, 6);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
+
+	/* Evict the invalid PTE TLBs */
+	*cs++ = COLOR_BLT_CMD | BLT_WRITE_RGBA;
+	*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096;
+	*cs++ = I830_TLB_ENTRIES << 16 | 4; /* load each page */
+	*cs++ = cs_offset;
+	*cs++ = 0xdeadbeef;
+	*cs++ = MI_NOOP;
+	intel_ring_advance(rq, cs);
+
+	if ((dispatch_flags & I915_DISPATCH_PINNED) == 0) {
+		if (len > I830_BATCH_LIMIT)
+			return -ENOSPC;
+
+		cs = intel_ring_begin(rq, 6 + 2);
+		if (IS_ERR(cs))
+			return PTR_ERR(cs);
+
+		/*
+		 * Blit the batch (which has now all relocs applied) to the
+		 * stable batch scratch bo area (so that the CS never
+		 * stumbles over its tlb invalidation bug) ...
+		 */
+		*cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
+		*cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096;
+		*cs++ = DIV_ROUND_UP(len, 4096) << 16 | 4096;
+		*cs++ = cs_offset;
+		*cs++ = 4096;
+		*cs++ = offset;
+
+		*cs++ = MI_FLUSH;
+		*cs++ = MI_NOOP;
+		intel_ring_advance(rq, cs);
+
+		/* ... and execute it. */
+		offset = cs_offset;
+	}
+
+	if (!(dispatch_flags & I915_DISPATCH_SECURE))
+		offset |= MI_BATCH_NON_SECURE;
+
+	cs = intel_ring_begin(rq, 2);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
+
+	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
+	*cs++ = offset;
+	intel_ring_advance(rq, cs);
+
+	return 0;
+}
+
+int gen3_emit_bb_start(struct i915_request *rq,
+		       u64 offset, u32 len,
+		       unsigned int dispatch_flags)
+{
+	u32 *cs;
+
+	if (!(dispatch_flags & I915_DISPATCH_SECURE))
+		offset |= MI_BATCH_NON_SECURE;
+
+	cs = intel_ring_begin(rq, 2);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
+
+	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
+	*cs++ = offset;
+	intel_ring_advance(rq, cs);
+
+	return 0;
+}
+
+int gen4_emit_bb_start(struct i915_request *rq,
+		       u64 offset, u32 length,
+		       unsigned int dispatch_flags)
+{
+	u32 security;
+	u32 *cs;
+
+	security = MI_BATCH_NON_SECURE_I965;
+	if (dispatch_flags & I915_DISPATCH_SECURE)
+		security = 0;
+
+	cs = intel_ring_begin(rq, 2);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
+
+	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT | security;
+	*cs++ = offset;
+	intel_ring_advance(rq, cs);
+
+	return 0;
+}
+
+void gen2_irq_enable(struct intel_engine_cs *engine)
+{
+	struct drm_i915_private *i915 = engine->i915;
+
+	i915->irq_mask &= ~engine->irq_enable_mask;
+	intel_uncore_write16(&i915->uncore, GEN2_IMR, i915->irq_mask);
+	ENGINE_POSTING_READ16(engine, RING_IMR);
+}
+
+void gen2_irq_disable(struct intel_engine_cs *engine)
+{
+	struct drm_i915_private *i915 = engine->i915;
+
+	i915->irq_mask |= engine->irq_enable_mask;
+	intel_uncore_write16(&i915->uncore, GEN2_IMR, i915->irq_mask);
+}
+
+void gen3_irq_enable(struct intel_engine_cs *engine)
+{
+	engine->i915->irq_mask &= ~engine->irq_enable_mask;
+	intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask);
+	intel_uncore_posting_read_fw(engine->uncore, GEN2_IMR);
+}
+
+void gen3_irq_disable(struct intel_engine_cs *engine)
+{
+	engine->i915->irq_mask |= engine->irq_enable_mask;
+	intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask);
+}
+
+void gen5_irq_enable(struct intel_engine_cs *engine)
+{
+	gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
+}
+
+void gen5_irq_disable(struct intel_engine_cs *engine)
+{
+	gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
+}
diff --git a/drivers/gpu/drm/i915/gt/gen2_engine_cs.h b/drivers/gpu/drm/i915/gt/gen2_engine_cs.h
new file mode 100644
index 000000000000..a5cd64a65c9e
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/gen2_engine_cs.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2020 Intel Corporation
+ */
+
+#ifndef __GEN2_ENGINE_CS_H__
+#define __GEN2_ENGINE_CS_H__
+
+#include <linux/types.h>
+
+struct i915_request;
+struct intel_engine_cs;
+
+int gen2_emit_flush(struct i915_request *rq, u32 mode);
+int gen4_emit_flush_rcs(struct i915_request *rq, u32 mode);
+int gen4_emit_flush_vcs(struct i915_request *rq, u32 mode);
+
+u32 *gen3_emit_breadcrumb(struct i915_request *rq, u32 *cs);
+u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs);
+
+int i830_emit_bb_start(struct i915_request *rq,
+		       u64 offset, u32 len,
+		       unsigned int dispatch_flags);
+int gen3_emit_bb_start(struct i915_request *rq,
+		       u64 offset, u32 len,
+		       unsigned int dispatch_flags);
+int gen4_emit_bb_start(struct i915_request *rq,
+		       u64 offset, u32 length,
+		       unsigned int dispatch_flags);
+
+void gen2_irq_enable(struct intel_engine_cs *engine);
+void gen2_irq_disable(struct intel_engine_cs *engine);
+void gen3_irq_enable(struct intel_engine_cs *engine);
+void
gen3_irq_disable(struct intel_engine_cs *engine); +void gen5_irq_enable(struct intel_engine_cs *engine); +void gen5_irq_disable(struct intel_engine_cs *engine); + +#endif /* __GEN2_ENGINE_CS_H__ */ diff --git a/drivers/gpu/drm/i915/gt/gen6_engine_cs.c b/drivers/gpu/drm/i915/gt/gen6_engine_cs.c new file mode 100644 index 000000000000..ce38d1bcaba3 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/gen6_engine_cs.c @@ -0,0 +1,455 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2020 Intel Corporation + */ + +#include "gen6_engine_cs.h" +#include "intel_engine.h" +#include "intel_gpu_commands.h" +#include "intel_gt.h" +#include "intel_gt_irq.h" +#include "intel_gt_pm_irq.h" +#include "intel_ring.h" + +#define HWS_SCRATCH_ADDR (I915_GEM_HWS_SCRATCH * sizeof(u32)) + +/* + * Emits a PIPE_CONTROL with a non-zero post-sync operation, for + * implementing two workarounds on gen6. From section 1.4.7.1 + * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1: + * + * [DevSNB-C+{W/A}] Before any depth stall flush (including those + * produced by non-pipelined state commands), software needs to first + * send a PIPE_CONTROL with no bits set except Post-Sync Operation != + * 0. + * + * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable + * =1, a PIPE_CONTROL with any non-zero post-sync-op is required. + * + * And the workaround for these two requires this workaround first: + * + * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent + * BEFORE the pipe-control with a post-sync op and no write-cache + * flushes. + * + * And this last workaround is tricky because of the requirements on + * that bit. From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM + * volume 2 part 1: + * + * "1 of the following must also be set: + * - Render Target Cache Flush Enable ([12] of DW1) + * - Depth Cache Flush Enable ([0] of DW1) + * - Stall at Pixel Scoreboard ([1] of DW1) + * - Depth Stall ([13] of DW1) + * - Post-Sync Operation ([13] of DW1) + * - Notify Enable ([8] of DW1)" + * + * The cache flushes require the workaround flush that triggered this + * one, so we can't use it. Depth stall would trigger the same. + * Post-sync nonzero is what triggered this second workaround, so we + * can't use that one either. Notify enable is IRQs, which aren't + * really our business. That leaves only stall at scoreboard. + */ +static int +gen6_emit_post_sync_nonzero_flush(struct i915_request *rq) +{ + u32 scratch_addr = + intel_gt_scratch_offset(rq->engine->gt, + INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH); + u32 *cs; + + cs = intel_ring_begin(rq, 6); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = GFX_OP_PIPE_CONTROL(5); + *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD; + *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT; + *cs++ = 0; /* low dword */ + *cs++ = 0; /* high dword */ + *cs++ = MI_NOOP; + intel_ring_advance(rq, cs); + + cs = intel_ring_begin(rq, 6); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = GFX_OP_PIPE_CONTROL(5); + *cs++ = PIPE_CONTROL_QW_WRITE; + *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT; + *cs++ = 0; + *cs++ = 0; + *cs++ = MI_NOOP; + intel_ring_advance(rq, cs); + + return 0; +} + +int gen6_emit_flush_rcs(struct i915_request *rq, u32 mode) +{ + u32 scratch_addr = + intel_gt_scratch_offset(rq->engine->gt, + INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH); + u32 *cs, flags = 0; + int ret; + + /* Force SNB workarounds for PIPE_CONTROL flushes */ + ret = gen6_emit_post_sync_nonzero_flush(rq); + if (ret) + return ret; + + /* + * Just flush everything. 
Experiments have shown that reducing the + * number of bits based on the write domains has little performance + * impact. And when rearranging requests, the order of flushes is + * unknown. + */ + if (mode & EMIT_FLUSH) { + flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; + flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; + /* + * Ensure that any following seqno writes only happen + * when the render cache is indeed flushed. + */ + flags |= PIPE_CONTROL_CS_STALL; + } + if (mode & EMIT_INVALIDATE) { + flags |= PIPE_CONTROL_TLB_INVALIDATE; + flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; + /* + * TLB invalidate requires a post-sync write. + */ + flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL; + } + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = GFX_OP_PIPE_CONTROL(4); + *cs++ = flags; + *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT; + *cs++ = 0; + intel_ring_advance(rq, cs); + + return 0; +} + +u32 *gen6_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs) +{ + /* First we do the gen6_emit_post_sync_nonzero_flush w/a */ + *cs++ = GFX_OP_PIPE_CONTROL(4); + *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD; + *cs++ = 0; + *cs++ = 0; + + *cs++ = GFX_OP_PIPE_CONTROL(4); + *cs++ = PIPE_CONTROL_QW_WRITE; + *cs++ = intel_gt_scratch_offset(rq->engine->gt, + INTEL_GT_SCRATCH_FIELD_DEFAULT) | + PIPE_CONTROL_GLOBAL_GTT; + *cs++ = 0; + + /* Finally we can flush and with it emit the breadcrumb */ + *cs++ = GFX_OP_PIPE_CONTROL(4); + *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_DC_FLUSH_ENABLE | + PIPE_CONTROL_QW_WRITE | + PIPE_CONTROL_CS_STALL); + *cs++ = i915_request_active_timeline(rq)->hwsp_offset | + PIPE_CONTROL_GLOBAL_GTT; + *cs++ = rq->fence.seqno; + + *cs++ = MI_USER_INTERRUPT; + *cs++ = MI_NOOP; + + rq->tail = intel_ring_offset(rq, cs); + assert_ring_tail_valid(rq->ring, rq->tail); + + return cs; +} + +static int mi_flush_dw(struct i915_request *rq, u32 flags) +{ + u32 cmd, *cs; + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + cmd = MI_FLUSH_DW; + + /* + * We always require a command barrier so that subsequent + * commands, such as breadcrumb interrupts, are strictly ordered + * wrt the contents of the write cache being flushed to memory + * (and thus being coherent from the CPU). + */ + cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; + + /* + * Bspec vol 1c.3 - blitter engine command streamer: + * "If ENABLED, all TLBs will be invalidated once the flush + * operation is complete. This bit is only valid when the + * Post-Sync Operation field is a value of 1h or 3h." + */ + cmd |= flags; + + *cs++ = cmd; + *cs++ = HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT; + *cs++ = 0; + *cs++ = MI_NOOP; + + intel_ring_advance(rq, cs); + + return 0; +} + +static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags) +{ + return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? 
invflags : 0); +} + +int gen6_emit_flush_xcs(struct i915_request *rq, u32 mode) +{ + return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB); +} + +int gen6_emit_flush_vcs(struct i915_request *rq, u32 mode) +{ + return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD); +} + +int gen6_emit_bb_start(struct i915_request *rq, + u64 offset, u32 len, + unsigned int dispatch_flags) +{ + u32 security; + u32 *cs; + + security = MI_BATCH_NON_SECURE_I965; + if (dispatch_flags & I915_DISPATCH_SECURE) + security = 0; + + cs = intel_ring_begin(rq, 2); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + cs = __gen6_emit_bb_start(cs, offset, security); + intel_ring_advance(rq, cs); + + return 0; +} + +int +hsw_emit_bb_start(struct i915_request *rq, + u64 offset, u32 len, + unsigned int dispatch_flags) +{ + u32 security; + u32 *cs; + + security = MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW; + if (dispatch_flags & I915_DISPATCH_SECURE) + security = 0; + + cs = intel_ring_begin(rq, 2); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + cs = __gen6_emit_bb_start(cs, offset, security); + intel_ring_advance(rq, cs); + + return 0; +} + +static int gen7_stall_cs(struct i915_request *rq) +{ + u32 *cs; + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = GFX_OP_PIPE_CONTROL(4); + *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD; + *cs++ = 0; + *cs++ = 0; + intel_ring_advance(rq, cs); + + return 0; +} + +int gen7_emit_flush_rcs(struct i915_request *rq, u32 mode) +{ + u32 scratch_addr = + intel_gt_scratch_offset(rq->engine->gt, + INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH); + u32 *cs, flags = 0; + + /* + * Ensure that any following seqno writes only happen when the render + * cache is indeed flushed. + * + * Workaround: 4th PIPE_CONTROL command (except the ones with only + * read-cache invalidate bits set) must have the CS_STALL bit set. We + * don't try to be clever and just set it unconditionally. + */ + flags |= PIPE_CONTROL_CS_STALL; + + /* + * CS_STALL suggests at least a post-sync write. + */ + flags |= PIPE_CONTROL_QW_WRITE; + flags |= PIPE_CONTROL_GLOBAL_GTT_IVB; + + /* + * Just flush everything. Experiments have shown that reducing the + * number of bits based on the write domains has little performance + * impact. + */ + if (mode & EMIT_FLUSH) { + flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; + flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; + flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; + flags |= PIPE_CONTROL_FLUSH_ENABLE; + } + if (mode & EMIT_INVALIDATE) { + flags |= PIPE_CONTROL_TLB_INVALIDATE; + flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR; + + /* + * Workaround: we must issue a pipe_control with CS-stall bit + * set before a pipe_control command that has the state cache + * invalidate bit set. 
+ */ + gen7_stall_cs(rq); + } + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = GFX_OP_PIPE_CONTROL(4); + *cs++ = flags; + *cs++ = scratch_addr; + *cs++ = 0; + intel_ring_advance(rq, cs); + + return 0; +} + +u32 *gen7_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs) +{ + *cs++ = GFX_OP_PIPE_CONTROL(4); + *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_DC_FLUSH_ENABLE | + PIPE_CONTROL_FLUSH_ENABLE | + PIPE_CONTROL_QW_WRITE | + PIPE_CONTROL_GLOBAL_GTT_IVB | + PIPE_CONTROL_CS_STALL); + *cs++ = i915_request_active_timeline(rq)->hwsp_offset; + *cs++ = rq->fence.seqno; + + *cs++ = MI_USER_INTERRUPT; + *cs++ = MI_NOOP; + + rq->tail = intel_ring_offset(rq, cs); + assert_ring_tail_valid(rq->ring, rq->tail); + + return cs; +} + +u32 *gen6_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs) +{ + GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma); + GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR); + + *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX; + *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT; + *cs++ = rq->fence.seqno; + + *cs++ = MI_USER_INTERRUPT; + + rq->tail = intel_ring_offset(rq, cs); + assert_ring_tail_valid(rq->ring, rq->tail); + + return cs; +} + +#define GEN7_XCS_WA 32 +u32 *gen7_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs) +{ + int i; + + GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma); + GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR); + + *cs++ = MI_FLUSH_DW | MI_INVALIDATE_TLB | + MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX; + *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT; + *cs++ = rq->fence.seqno; + + for (i = 0; i < GEN7_XCS_WA; i++) { + *cs++ = MI_STORE_DWORD_INDEX; + *cs++ = I915_GEM_HWS_SEQNO_ADDR; + *cs++ = rq->fence.seqno; + } + + *cs++ = MI_FLUSH_DW; + *cs++ = 0; + *cs++ = 0; + + *cs++ = MI_USER_INTERRUPT; + *cs++ = MI_NOOP; + + rq->tail = intel_ring_offset(rq, cs); + assert_ring_tail_valid(rq->ring, rq->tail); + + return cs; +} +#undef GEN7_XCS_WA + +void gen6_irq_enable(struct intel_engine_cs *engine) +{ + ENGINE_WRITE(engine, RING_IMR, + ~(engine->irq_enable_mask | engine->irq_keep_mask)); + + /* Flush/delay to ensure the RING_IMR is active before the GT IMR */ + ENGINE_POSTING_READ(engine, RING_IMR); + + gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask); +} + +void gen6_irq_disable(struct intel_engine_cs *engine) +{ + ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask); + gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask); +} + +void hsw_irq_enable_vecs(struct intel_engine_cs *engine) +{ + ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask); + + /* Flush/delay to ensure the RING_IMR is active before the GT IMR */ + ENGINE_POSTING_READ(engine, RING_IMR); + + gen6_gt_pm_unmask_irq(engine->gt, engine->irq_enable_mask); +} + +void hsw_irq_disable_vecs(struct intel_engine_cs *engine) +{ + ENGINE_WRITE(engine, RING_IMR, ~0); + gen6_gt_pm_mask_irq(engine->gt, engine->irq_enable_mask); +} diff --git a/drivers/gpu/drm/i915/gt/gen6_engine_cs.h b/drivers/gpu/drm/i915/gt/gen6_engine_cs.h new file mode 100644 index 000000000000..76c6bc9f3bde --- /dev/null +++ b/drivers/gpu/drm/i915/gt/gen6_engine_cs.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2020 Intel Corporation + */ + +#ifndef __GEN6_ENGINE_CS_H__ +#define 
__GEN6_ENGINE_CS_H__ + +#include <linux/types.h> + +#include "intel_gpu_commands.h" + +struct i915_request; +struct intel_engine_cs; + +int gen6_emit_flush_rcs(struct i915_request *rq, u32 mode); +int gen6_emit_flush_vcs(struct i915_request *rq, u32 mode); +int gen6_emit_flush_xcs(struct i915_request *rq, u32 mode); +u32 *gen6_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs); +u32 *gen6_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs); + +int gen7_emit_flush_rcs(struct i915_request *rq, u32 mode); +u32 *gen7_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs); +u32 *gen7_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs); + +int gen6_emit_bb_start(struct i915_request *rq, + u64 offset, u32 len, + unsigned int dispatch_flags); +int hsw_emit_bb_start(struct i915_request *rq, + u64 offset, u32 len, + unsigned int dispatch_flags); + +void gen6_irq_enable(struct intel_engine_cs *engine); +void gen6_irq_disable(struct intel_engine_cs *engine); + +void hsw_irq_enable_vecs(struct intel_engine_cs *engine); +void hsw_irq_disable_vecs(struct intel_engine_cs *engine); + +#endif /* __GEN6_ENGINE_CS_H__ */ diff --git a/drivers/gpu/drm/i915/gt/intel_context_sseu.c b/drivers/gpu/drm/i915/gt/intel_context_sseu.c index 487299cb91f2..27ae48049239 100644 --- a/drivers/gpu/drm/i915/gt/intel_context_sseu.c +++ b/drivers/gpu/drm/i915/gt/intel_context_sseu.c @@ -30,7 +30,7 @@ static int gen8_emit_rpcs_config(struct i915_request *rq, *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; *cs++ = lower_32_bits(offset); *cs++ = upper_32_bits(offset); - *cs++ = intel_sseu_make_rpcs(rq->i915, &sseu); + *cs++ = intel_sseu_make_rpcs(rq->engine->i915, &sseu); intel_ring_advance(rq, cs); diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h index 9bf6d4989968..a9249a23903a 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine.h +++ b/drivers/gpu/drm/i915/gt/intel_engine.h @@ -187,7 +187,6 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value) #define I915_GEM_HWS_SEQNO 0x40 #define I915_GEM_HWS_SEQNO_ADDR (I915_GEM_HWS_SEQNO * sizeof(u32)) #define I915_GEM_HWS_SCRATCH 0x80 -#define I915_GEM_HWS_SCRATCH_ADDR (I915_GEM_HWS_SCRATCH * sizeof(u32)) #define I915_HWS_CSB_BUF0_INDEX 0x10 #define I915_HWS_CSB_WRITE_INDEX 0x1f @@ -335,7 +334,8 @@ void intel_engine_dump(struct intel_engine_cs *engine, struct drm_printer *m, const char *header, ...); -ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine); +ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, + ktime_t *now); struct i915_request * intel_engine_find_active_request(struct intel_engine_cs *engine); diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index 8691eb61e185..7bf2f76212f0 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -414,12 +414,12 @@ void intel_engines_release(struct intel_gt *gt) /* Decouple the backend; but keep the layout for late GPU resets */ for_each_engine(engine, gt, id) { - intel_wakeref_wait_for_idle(&engine->wakeref); - GEM_BUG_ON(intel_engine_pm_is_awake(engine)); - if (!engine->release) continue; + intel_wakeref_wait_for_idle(&engine->wakeref); + GEM_BUG_ON(intel_engine_pm_is_awake(engine)); + engine->release(engine); engine->release = NULL; @@ -661,7 +661,6 @@ static int measure_breadcrumb_dw(struct intel_context *ce) if (!frame) return -ENOMEM; - frame->rq.i915 = engine->i915; frame->rq.engine = engine; frame->rq.context = ce; 
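(The hunks above and in intel_context_sseu.c drop the duplicated rq->i915 back-pointer in favour of rq->engine->i915: the request already links to its engine, which links to the device, so caching a second copy only risks the two pointers diverging. A small illustrative sketch of that design choice, with made-up struct names rather than the driver's:

    #include <stdio.h>

    struct device { const char *name; };
    struct engine { struct device *dev; };
    /* The request keeps only the engine link; the device is always
     * reached as rq->engine->dev, never cached a second time. */
    struct request { struct engine *engine; };

    int main(void)
    {
            struct device dev = { "i915" };
            struct engine eng = { &dev };
            struct request rq = { &eng };

            printf("%s\n", rq.engine->dev->name); /* one canonical path */
            return 0;
    })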
rcu_assign_pointer(frame->rq.timeline, ce->timeline); @@ -1095,19 +1094,21 @@ void intel_engine_flush_submission(struct intel_engine_cs *engine) { struct tasklet_struct *t = &engine->execlists.tasklet; - if (__tasklet_is_scheduled(t)) { - local_bh_disable(); - if (tasklet_trylock(t)) { - /* Must wait for any GPU reset in progress. */ - if (__tasklet_is_enabled(t)) - t->func(t->data); - tasklet_unlock(t); - } - local_bh_enable(); - } + if (!t->func) + return; - /* Otherwise flush the tasklet if it was running on another cpu */ - tasklet_unlock_wait(t); + /* Synchronise and wait for the tasklet on another CPU */ + tasklet_kill(t); + + /* Having cancelled the tasklet, ensure that is run */ + local_bh_disable(); + if (tasklet_trylock(t)) { + /* Must wait for any GPU reset in progress. */ + if (__tasklet_is_enabled(t)) + t->func(t->data); + tasklet_unlock(t); + } + local_bh_enable(); } /** @@ -1194,8 +1195,7 @@ bool intel_engine_can_store_dword(struct intel_engine_cs *engine) } } -static int print_sched_attr(struct drm_i915_private *i915, - const struct i915_sched_attr *attr, +static int print_sched_attr(const struct i915_sched_attr *attr, char *buf, int x, int len) { if (attr->priority == I915_PRIORITY_INVALID) @@ -1215,7 +1215,7 @@ static void print_request(struct drm_printer *m, char buf[80] = ""; int x = 0; - x = print_sched_attr(rq->i915, &rq->sched.attr, buf, x, sizeof(buf)); + x = print_sched_attr(&rq->sched.attr, buf, x, sizeof(buf)); drm_printf(m, "%s %llx:%llx%s%s %s @ %dms: %s\n", prefix, @@ -1423,9 +1423,11 @@ static void intel_engine_print_registers(struct intel_engine_cs *engine, int len; len = scnprintf(hdr, sizeof(hdr), - "\t\tActive[%d]: ccid:%08x, ", + "\t\tActive[%d]: ccid:%08x%s%s, ", (int)(port - execlists->active), - rq->context->lrc.ccid); + rq->context->lrc.ccid, + intel_context_is_closed(rq->context) ? "!" : "", + intel_context_is_banned(rq->context) ? "*" : ""); len += print_ring(hdr + len, sizeof(hdr) - len, rq); scnprintf(hdr + len, sizeof(hdr) - len, "rq: "); print_request(m, rq, hdr); @@ -1435,9 +1437,11 @@ static void intel_engine_print_registers(struct intel_engine_cs *engine, int len; len = scnprintf(hdr, sizeof(hdr), - "\t\tPending[%d]: ccid:%08x, ", + "\t\tPending[%d]: ccid:%08x%s%s, ", (int)(port - execlists->pending), - rq->context->lrc.ccid); + rq->context->lrc.ccid, + intel_context_is_closed(rq->context) ? "!" : "", + intel_context_is_banned(rq->context) ? 
"*" : ""); len += print_ring(hdr + len, sizeof(hdr) - len, rq); scnprintf(hdr + len, sizeof(hdr) - len, "rq: "); print_request(m, rq, hdr); @@ -1506,6 +1510,7 @@ void intel_engine_dump(struct intel_engine_cs *engine, struct i915_request *rq; intel_wakeref_t wakeref; unsigned long flags; + ktime_t dummy; if (header) { va_list ap; @@ -1523,6 +1528,12 @@ void intel_engine_dump(struct intel_engine_cs *engine, yesno(!llist_empty(&engine->barrier_tasks))); drm_printf(m, "\tLatency: %luus\n", ewma__engine_latency_read(&engine->latency)); + if (intel_engine_supports_stats(engine)) + drm_printf(m, "\tRuntime: %llums\n", + ktime_to_ms(intel_engine_get_busy_time(engine, + &dummy))); + drm_printf(m, "\tForcewake: %x domains, %d active\n", + engine->fw_domain, atomic_read(&engine->fw_active)); rcu_read_lock(); rq = READ_ONCE(engine->heartbeat.systole); @@ -1589,7 +1600,8 @@ void intel_engine_dump(struct intel_engine_cs *engine, intel_engine_print_breadcrumbs(engine, m); } -static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine) +static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine, + ktime_t *now) { ktime_t total = engine->stats.total; @@ -1597,9 +1609,9 @@ static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine) * If the engine is executing something at the moment * add it to the total. */ + *now = ktime_get(); if (atomic_read(&engine->stats.active)) - total = ktime_add(total, - ktime_sub(ktime_get(), engine->stats.start)); + total = ktime_add(total, ktime_sub(*now, engine->stats.start)); return total; } @@ -1607,17 +1619,18 @@ static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine) /** * intel_engine_get_busy_time() - Return current accumulated engine busyness * @engine: engine to report on + * @now: monotonic timestamp of sampling * * Returns accumulated time @engine was busy since engine stats were enabled. 
*/ -ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine) +ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t *now) { unsigned int seq; ktime_t total; do { seq = read_seqbegin(&engine->stats.lock); - total = __intel_engine_get_busy_time(engine); + total = __intel_engine_get_busy_time(engine, now); } while (read_seqretry(&engine->stats.lock, seq)); return total; diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c index 5136c8bf112d..8ffdf676c0a0 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c @@ -4,6 +4,7 @@ * Copyright © 2019 Intel Corporation */ +#include "i915_drv.h" #include "i915_request.h" #include "intel_context.h" @@ -31,7 +32,7 @@ static bool next_heartbeat(struct intel_engine_cs *engine) delay = msecs_to_jiffies_timeout(delay); if (delay >= HZ) delay = round_jiffies_up_relative(delay); - mod_delayed_work(system_wq, &engine->heartbeat.work, delay); + mod_delayed_work(system_highpri_wq, &engine->heartbeat.work, delay); return true; } @@ -48,8 +49,10 @@ static void show_heartbeat(const struct i915_request *rq, struct drm_printer p = drm_debug_printer("heartbeat"); intel_engine_dump(engine, &p, - "%s heartbeat {prio:%d} not ticking\n", + "%s heartbeat {seqno:%llx:%lld, prio:%d} not ticking\n", engine->name, + rq->fence.context, + rq->fence.seqno, rq->sched.attr.priority); } @@ -62,6 +65,10 @@ static void heartbeat(struct work_struct *wrk) container_of(wrk, typeof(*engine), heartbeat.work.work); struct intel_context *ce = engine->kernel_context; struct i915_request *rq; + unsigned long serial; + + /* Just in case everything has gone horribly wrong, give it a kick */ + intel_engine_flush_submission(engine); rq = engine->heartbeat.systole; if (rq && i915_request_completed(rq)) { @@ -76,8 +83,19 @@ static void heartbeat(struct work_struct *wrk) goto out; if (engine->heartbeat.systole) { - if (engine->schedule && - rq->sched.attr.priority < I915_PRIORITY_BARRIER) { + if (!i915_sw_fence_signaled(&rq->submit)) { + /* + * Not yet submitted, system is stalled. + * + * This more often happens for ring submission, + * where all contexts are funnelled into a common + * ringbuffer. If one context is blocked on an + * external fence, not only is it not submitted, + * but all other contexts, including the kernel + * context are stuck waiting for the signal. + */ + } else if (engine->schedule && + rq->sched.attr.priority < I915_PRIORITY_BARRIER) { /* * Gradually raise the priority of the heartbeat to * give high priority work [which presumably desires @@ -105,10 +123,19 @@ static void heartbeat(struct work_struct *wrk) goto out; } - if (engine->wakeref_serial == engine->serial) + serial = READ_ONCE(engine->serial); + if (engine->wakeref_serial == serial) goto out; - mutex_lock(&ce->timeline->mutex); + if (!mutex_trylock(&ce->timeline->mutex)) { + /* Unable to lock the kernel timeline, is the engine stuck? 
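+		 * The xchg() below gives this a two-strike rule: record the
+		 * engine serial seen on this failed attempt, and only if the
+		 * next heartbeat finds the very same serial still blocked
+		 * (no forward progress for a whole heartbeat interval)
+		 * report the engine as hung via intel_gt_handle_error().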
*/ + if (xchg(&engine->heartbeat.blocked, serial) == serial) + intel_gt_handle_error(engine->gt, engine->mask, + I915_ERROR_CAPTURE, + "no heartbeat on %s", + engine->name); + goto out; + } intel_context_enter(ce); rq = __i915_request_create(ce, GFP_NOWAIT | __GFP_NOWARN); @@ -117,7 +144,7 @@ static void heartbeat(struct work_struct *wrk) goto unlock; idle_pulse(engine, rq); - if (i915_modparams.enable_hangcheck) + if (engine->i915->params.enable_hangcheck) engine->heartbeat.systole = i915_request_get(rq); __i915_request_commit(rq); diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h index 2b6cdf47d428..490af81bd6f3 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h @@ -24,6 +24,7 @@ #include "i915_selftest.h" #include "intel_sseu.h" #include "intel_timeline_types.h" +#include "intel_uncore.h" #include "intel_wakeref.h" #include "intel_workarounds_types.h" @@ -313,6 +314,16 @@ struct intel_engine_cs { u32 context_size; u32 mmio_base; + /* + * Some w/a require forcewake to be held (which prevents RC6) while + * a particular engine is active. If so, we set fw_domain to which + * domains need to be held for the duration of request activity, + * and 0 if none. We try to limit the duration of the hold as much + * as possible. + */ + enum forcewake_domains fw_domain; + atomic_t fw_active; + unsigned long context_tag; struct rb_node uabi_node; @@ -337,6 +348,7 @@ struct intel_engine_cs { struct { struct delayed_work work; struct i915_request *systole; + unsigned long blocked; } heartbeat; unsigned long serial; diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c b/drivers/gpu/drm/i915/gt/intel_ggtt.c index 66165b10256e..323c328d444a 100644 --- a/drivers/gpu/drm/i915/gt/intel_ggtt.c +++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c @@ -108,13 +108,32 @@ static bool needs_idle_maps(struct drm_i915_private *i915) void i915_ggtt_suspend(struct i915_ggtt *ggtt) { - struct i915_vma *vma; + struct i915_vma *vma, *vn; + int open; + + mutex_lock(&ggtt->vm.mutex); + + /* Skip rewriting PTE on VMA unbind. */ + open = atomic_xchg(&ggtt->vm.open, 0); - list_for_each_entry(vma, &ggtt->vm.bound_list, vm_link) + list_for_each_entry_safe(vma, vn, &ggtt->vm.bound_list, vm_link) { + GEM_BUG_ON(!drm_mm_node_allocated(&vma->node)); i915_vma_wait_for_bind(vma); + if (i915_vma_is_pinned(vma)) + continue; + + if (!i915_vma_is_bound(vma, I915_VMA_GLOBAL_BIND)) { + __i915_vma_evict(vma); + drm_mm_remove_node(&vma->node); + } + } + ggtt->vm.clear_range(&ggtt->vm, 0, ggtt->vm.total); ggtt->invalidate(ggtt); + atomic_set(&ggtt->vm.open, open); + + mutex_unlock(&ggtt->vm.mutex); intel_gt_check_and_clear_faults(ggtt->vm.gt); } @@ -424,22 +443,17 @@ static int ggtt_bind_vma(struct i915_vma *vma, struct drm_i915_gem_object *obj = vma->obj; u32 pte_flags; + if (i915_vma_is_bound(vma, ~flags & I915_VMA_BIND_MASK)) + return 0; + /* Applicable to VLV (gen8+ do not support RO in the GGTT) */ pte_flags = 0; if (i915_gem_object_is_readonly(obj)) pte_flags |= PTE_READ_ONLY; vma->vm->insert_entries(vma->vm, vma, cache_level, pte_flags); - vma->page_sizes.gtt = I915_GTT_PAGE_SIZE; - /* - * Without aliasing PPGTT there's no difference between - * GLOBAL/LOCAL_BIND, it's all the same ptes. Hence unconditionally - * upgrade to both bound if we bind either to avoid double-binding. 
- */ - atomic_or(I915_VMA_GLOBAL_BIND | I915_VMA_LOCAL_BIND, &vma->flags); - return 0; } @@ -1166,6 +1180,11 @@ void i915_ggtt_disable_guc(struct i915_ggtt *ggtt) ggtt->invalidate(ggtt); } +static unsigned int clear_bind(struct i915_vma *vma) +{ + return atomic_fetch_and(~I915_VMA_BIND_MASK, &vma->flags); +} + void i915_ggtt_resume(struct i915_ggtt *ggtt) { struct i915_vma *vma; @@ -1183,14 +1202,11 @@ void i915_ggtt_resume(struct i915_ggtt *ggtt) /* clflush objects bound into the GGTT and rebind them. */ list_for_each_entry(vma, &ggtt->vm.bound_list, vm_link) { struct drm_i915_gem_object *obj = vma->obj; + unsigned int was_bound = clear_bind(vma); - if (!i915_vma_is_bound(vma, I915_VMA_GLOBAL_BIND)) - continue; - - clear_bit(I915_VMA_GLOBAL_BIND_BIT, __i915_vma_flags(vma)); WARN_ON(i915_vma_bind(vma, obj ? obj->cache_level : 0, - PIN_GLOBAL, NULL)); + was_bound, NULL)); if (obj) { /* only used during resume => exclusive access */ flush |= fetch_and_zero(&obj->write_domain); obj->read_domains |= I915_GEM_DOMAIN_GTT; diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c index f069551e412f..ebc29b6ee86c 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt.c +++ b/drivers/gpu/drm/i915/gt/intel_gt.c @@ -616,6 +616,11 @@ void intel_gt_driver_unregister(struct intel_gt *gt) void intel_gt_driver_release(struct intel_gt *gt) { struct i915_address_space *vm; + intel_wakeref_t wakeref; + + /* Scrub all HW state upon release */ + with_intel_runtime_pm(gt->uncore->rpm, wakeref) + __intel_gt_reset(gt, ALL_ENGINES); vm = fetch_and_zero(>->vm); if (vm) /* FIXME being called twice on error paths :( */ diff --git a/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool.c b/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool.c index 1495054a4305..418ae184cecf 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_buffer_pool.c @@ -212,8 +212,9 @@ void intel_gt_flush_buffer_pool(struct intel_gt *gt) { struct intel_gt_buffer_pool *pool = >->buffer_pool; - if (cancel_delayed_work_sync(&pool->work)) + do { pool_free_imm(pool); + } while (cancel_delayed_work_sync(&pool->work)); } void intel_gt_fini_buffer_pool(struct intel_gt *gt) diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c index 6bdb434a442d..f1d5333f9456 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c @@ -214,8 +214,8 @@ int intel_gt_resume(struct intel_gt *gt) /* Only when the HW is re-initialised, can we replay the requests */ err = intel_gt_init_hw(gt); if (err) { - drm_err(>->i915->drm, - "Failed to initialize GPU, declaring it wedged!\n"); + i915_probe_error(gt->i915, + "Failed to initialize GPU, declaring it wedged!\n"); goto err_wedged; } diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index cb07e1d2a353..9158ead4afba 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -446,6 +446,9 @@ static int queue_prio(const struct intel_engine_execlists *execlists) * we have to flip the index value to become priority. 
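+	 * (When I915_USER_PRIORITY_SHIFT is configured to 0 there are no
+	 * internal priority levels folded into p->used, so the new check
+	 * below returns p->priority directly instead of shifting and
+	 * subtracting ffs(p->used).)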
*/ p = to_priolist(rb); + if (!I915_USER_PRIORITY_SHIFT) + return p->priority; + return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used); } @@ -1377,6 +1380,8 @@ __execlists_schedule_in(struct i915_request *rq) ce->lrc.ccid |= engine->execlists.ccid; __intel_gt_pm_get(engine->gt); + if (engine->fw_domain && !atomic_fetch_inc(&engine->fw_active)) + intel_uncore_forcewake_get(engine->uncore, engine->fw_domain); execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN); intel_engine_context_in(engine); @@ -1409,8 +1414,8 @@ static void kick_siblings(struct i915_request *rq, struct intel_context *ce) struct virtual_engine *ve = container_of(ce, typeof(*ve), context); struct i915_request *next = READ_ONCE(ve->request); - if (next && next->execution_mask & ~rq->execution_mask) - tasklet_schedule(&ve->base.execlists.tasklet); + if (next == rq || (next && next->execution_mask & ~rq->execution_mask)) + tasklet_hi_schedule(&ve->base.execlists.tasklet); } static inline void @@ -1445,6 +1450,8 @@ __execlists_schedule_out(struct i915_request *rq, intel_context_update_runtime(ce); intel_engine_context_out(engine); execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT); + if (engine->fw_domain && !atomic_dec_return(&engine->fw_active)) + intel_uncore_forcewake_put(engine->uncore, engine->fw_domain); intel_gt_pm_put_async(engine->gt); /* @@ -1636,9 +1643,9 @@ assert_pending_valid(const struct intel_engine_execlists *execlists, ccid = ce->lrc.ccid; /* - * Sentinels are supposed to be lonely so they flush the - * current exection off the HW. Check that they are the - * only request in the pending submission. + * Sentinels are supposed to be the last request so they flush + * the current execution off the HW. Check that they are the only + * request in the pending submission. */ if (sentinel) { GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n", @@ -1647,15 +1654,7 @@ assert_pending_valid(const struct intel_engine_execlists *execlists, port - execlists->pending); return false; } - sentinel = i915_request_has_sentinel(rq); - if (sentinel && port != execlists->pending) { - GEM_TRACE_ERR("%s: sentinel context:%llx not in prime position[%zd]\n", - engine->name, - ce->timeline->fence_context, - port - execlists->pending); - return false; - } /* Hold tightly onto the lock to prevent concurrent retires! 
*/ if (!spin_trylock_irqsave(&rq->lock, flags)) @@ -1967,7 +1966,7 @@ static int switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq) { if (list_is_last(&rq->sched.link, &engine->active.requests)) - return INT_MIN; + return engine->execlists.queue_priority_hint; return rq_prio(list_next_entry(rq, sched.link)); } @@ -2445,6 +2444,7 @@ done: set_preempt_timeout(engine, *active); execlists_submit_ports(engine); } else { + start_timeslice(engine, execlists->queue_priority_hint); skip_submit: ring_set_paused(engine, 0); } @@ -3170,13 +3170,6 @@ static void __submit_queue_imm(struct intel_engine_cs *engine) if (reset_in_progress(execlists)) return; /* defer until we restart the engine following reset */ - /* Hopefully we clear execlists->pending[] to let us through */ - if (READ_ONCE(execlists->pending[0]) && - tasklet_trylock(&execlists->tasklet)) { - process_csb(engine); - tasklet_unlock(&execlists->tasklet); - } - __execlists_submission_tasklet(engine); } @@ -3199,11 +3192,25 @@ static bool ancestor_on_hold(const struct intel_engine_cs *engine, return !list_empty(&engine->active.hold) && hold_request(rq); } +static void flush_csb(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists *el = &engine->execlists; + + if (READ_ONCE(el->pending[0]) && tasklet_trylock(&el->tasklet)) { + if (!reset_in_progress(el)) + process_csb(engine); + tasklet_unlock(&el->tasklet); + } +} + static void execlists_submit_request(struct i915_request *request) { struct intel_engine_cs *engine = request->engine; unsigned long flags; + /* Hopefully we clear execlists->pending[] to let us through */ + flush_csb(engine); + /* Will be called from irq-context when using foreign fences. */ spin_lock_irqsave(&engine->active.lock, flags); @@ -3536,7 +3543,7 @@ static int emit_pdps(struct i915_request *rq) int err, i; u32 *cs; - GEM_BUG_ON(intel_vgpu_active(rq->i915)); + GEM_BUG_ON(intel_vgpu_active(rq->engine->i915)); /* * Beware ye of the dragons, this sequence is magic! @@ -4515,11 +4522,11 @@ static int gen8_emit_flush_render(struct i915_request *request, * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL * pipe control. 
*/ - if (IS_GEN(request->i915, 9)) + if (IS_GEN(request->engine->i915, 9)) vf_flush_wa = true; /* WaForGAMHang:kbl */ - if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0)) + if (IS_KBL_REVID(request->engine->i915, 0, KBL_REVID_B0)) dc_flush_wa = true; } @@ -5587,7 +5594,7 @@ static void virtual_submit_request(struct i915_request *rq) GEM_BUG_ON(!list_empty(virtual_queue(ve))); list_move_tail(&rq->sched.link, virtual_queue(ve)); - tasklet_schedule(&ve->base.execlists.tasklet); + tasklet_hi_schedule(&ve->base.execlists.tasklet); } spin_unlock_irqrestore(&ve->base.active.lock, flags); diff --git a/drivers/gpu/drm/i915/gt/intel_renderstate.c b/drivers/gpu/drm/i915/gt/intel_renderstate.c index f59e7875cc5e..6db23389e427 100644 --- a/drivers/gpu/drm/i915/gt/intel_renderstate.c +++ b/drivers/gpu/drm/i915/gt/intel_renderstate.c @@ -61,7 +61,7 @@ render_state_get_rodata(const struct intel_engine_cs *engine) #define OUT_BATCH(batch, i, val) \ do { \ if ((i) >= PAGE_SIZE / sizeof(u32)) \ - goto err; \ + goto out; \ (batch)[(i)++] = (val); \ } while(0) @@ -70,15 +70,12 @@ static int render_state_setup(struct intel_renderstate *so, { const struct intel_renderstate_rodata *rodata = so->rodata; unsigned int i = 0, reloc_index = 0; - unsigned int needs_clflush; + int ret = -EINVAL; u32 *d; - int ret; - ret = i915_gem_object_prepare_write(so->vma->obj, &needs_clflush); - if (ret) - return ret; - - d = kmap_atomic(i915_gem_object_get_dirty_page(so->vma->obj, 0)); + d = i915_gem_object_pin_map(so->vma->obj, I915_MAP_WB); + if (IS_ERR(d)) + return PTR_ERR(d); while (i < rodata->batch_items) { u32 s = rodata->batch[i]; @@ -89,7 +86,7 @@ static int render_state_setup(struct intel_renderstate *so, if (HAS_64BIT_RELOC(i915)) { if (i + 1 >= rodata->batch_items || rodata->batch[i + 1] != 0) - goto err; + goto out; d[i++] = s; s = upper_32_bits(r); @@ -103,7 +100,7 @@ static int render_state_setup(struct intel_renderstate *so, if (rodata->reloc[reloc_index] != -1) { drm_err(&i915->drm, "only %d relocs resolved\n", reloc_index); - goto err; + goto out; } so->batch_offset = i915_ggtt_offset(so->vma); @@ -150,19 +147,11 @@ static int render_state_setup(struct intel_renderstate *so, */ so->aux_size = ALIGN(so->aux_size, 8); - if (needs_clflush) - drm_clflush_virt_range(d, i * sizeof(u32)); - kunmap_atomic(d); - ret = 0; out: - i915_gem_object_finish_access(so->vma->obj); + __i915_gem_object_flush_map(so->vma->obj, 0, i * sizeof(u32)); + i915_gem_object_unpin_map(so->vma->obj); return ret; - -err: - kunmap_atomic(d); - ret = -EINVAL; - goto out; } #undef OUT_BATCH diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index 39070b514e65..0156f1f5c736 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -638,7 +638,7 @@ int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask) bool intel_has_gpu_reset(const struct intel_gt *gt) { - if (!i915_modparams.reset) + if (!gt->i915->params.reset) return NULL; return intel_get_gpu_reset(gt); @@ -646,7 +646,7 @@ bool intel_has_gpu_reset(const struct intel_gt *gt) bool intel_has_reset_engine(const struct intel_gt *gt) { - if (i915_modparams.reset < 2) + if (gt->i915->params.reset < 2) return false; return INTEL_INFO(gt->i915)->has_reset_engine; @@ -1038,7 +1038,7 @@ void intel_gt_reset(struct intel_gt *gt, awake = reset_prepare(gt); if (!intel_has_gpu_reset(gt)) { - if (i915_modparams.reset) + if (gt->i915->params.reset) drm_err(>->i915->drm, "GPU reset not supported\n"); else 
drm_dbg(>->i915->drm, "GPU reset disabled\n"); diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c index ca7286e58409..68a08486fc87 100644 --- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c @@ -27,21 +27,15 @@ * */ -#include <linux/log2.h> - -#include "gem/i915_gem_context.h" - +#include "gen2_engine_cs.h" +#include "gen6_engine_cs.h" #include "gen6_ppgtt.h" #include "gen7_renderclear.h" #include "i915_drv.h" -#include "i915_trace.h" #include "intel_context.h" #include "intel_gt.h" -#include "intel_gt_irq.h" -#include "intel_gt_pm_irq.h" #include "intel_reset.h" #include "intel_ring.h" -#include "intel_workarounds.h" #include "shmem_utils.h" /* Rough estimate of the typical request size, performing a flush, @@ -49,436 +43,6 @@ */ #define LEGACY_REQUEST_SIZE 200 -static int -gen2_render_ring_flush(struct i915_request *rq, u32 mode) -{ - unsigned int num_store_dw; - u32 cmd, *cs; - - cmd = MI_FLUSH; - num_store_dw = 0; - if (mode & EMIT_INVALIDATE) - cmd |= MI_READ_FLUSH; - if (mode & EMIT_FLUSH) - num_store_dw = 4; - - cs = intel_ring_begin(rq, 2 + 3 * num_store_dw); - if (IS_ERR(cs)) - return PTR_ERR(cs); - - *cs++ = cmd; - while (num_store_dw--) { - *cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL; - *cs++ = intel_gt_scratch_offset(rq->engine->gt, - INTEL_GT_SCRATCH_FIELD_DEFAULT); - *cs++ = 0; - } - *cs++ = MI_FLUSH | MI_NO_WRITE_FLUSH; - - intel_ring_advance(rq, cs); - - return 0; -} - -static int -gen4_render_ring_flush(struct i915_request *rq, u32 mode) -{ - u32 cmd, *cs; - int i; - - /* - * read/write caches: - * - * I915_GEM_DOMAIN_RENDER is always invalidated, but is - * only flushed if MI_NO_WRITE_FLUSH is unset. On 965, it is - * also flushed at 2d versus 3d pipeline switches. - * - * read-only caches: - * - * I915_GEM_DOMAIN_SAMPLER is flushed on pre-965 if - * MI_READ_FLUSH is set, and is always flushed on 965. - * - * I915_GEM_DOMAIN_COMMAND may not exist? - * - * I915_GEM_DOMAIN_INSTRUCTION, which exists on 965, is - * invalidated when MI_EXE_FLUSH is set. - * - * I915_GEM_DOMAIN_VERTEX, which exists on 965, is - * invalidated with every MI_FLUSH. - * - * TLBs: - * - * On 965, TLBs associated with I915_GEM_DOMAIN_COMMAND - * and I915_GEM_DOMAIN_CPU in are invalidated at PTE write and - * I915_GEM_DOMAIN_RENDER and I915_GEM_DOMAIN_SAMPLER - * are flushed at any MI_FLUSH. - */ - - cmd = MI_FLUSH; - if (mode & EMIT_INVALIDATE) { - cmd |= MI_EXE_FLUSH; - if (IS_G4X(rq->i915) || IS_GEN(rq->i915, 5)) - cmd |= MI_INVALIDATE_ISP; - } - - i = 2; - if (mode & EMIT_INVALIDATE) - i += 20; - - cs = intel_ring_begin(rq, i); - if (IS_ERR(cs)) - return PTR_ERR(cs); - - *cs++ = cmd; - - /* - * A random delay to let the CS invalidate take effect? Without this - * delay, the GPU relocation path fails as the CS does not see - * the updated contents. Just as important, if we apply the flushes - * to the EMIT_FLUSH branch (i.e. immediately after the relocation - * write and before the invalidate on the next batch), the relocations - * still fail. This implies that is a delay following invalidation - * that is required to reset the caches as opposed to a delay to - * ensure the memory is written. 
- */ - if (mode & EMIT_INVALIDATE) { - *cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE; - *cs++ = intel_gt_scratch_offset(rq->engine->gt, - INTEL_GT_SCRATCH_FIELD_DEFAULT) | - PIPE_CONTROL_GLOBAL_GTT; - *cs++ = 0; - *cs++ = 0; - - for (i = 0; i < 12; i++) - *cs++ = MI_FLUSH; - - *cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE; - *cs++ = intel_gt_scratch_offset(rq->engine->gt, - INTEL_GT_SCRATCH_FIELD_DEFAULT) | - PIPE_CONTROL_GLOBAL_GTT; - *cs++ = 0; - *cs++ = 0; - } - - *cs++ = cmd; - - intel_ring_advance(rq, cs); - - return 0; -} - -/* - * Emits a PIPE_CONTROL with a non-zero post-sync operation, for - * implementing two workarounds on gen6. From section 1.4.7.1 - * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1: - * - * [DevSNB-C+{W/A}] Before any depth stall flush (including those - * produced by non-pipelined state commands), software needs to first - * send a PIPE_CONTROL with no bits set except Post-Sync Operation != - * 0. - * - * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable - * =1, a PIPE_CONTROL with any non-zero post-sync-op is required. - * - * And the workaround for these two requires this workaround first: - * - * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent - * BEFORE the pipe-control with a post-sync op and no write-cache - * flushes. - * - * And this last workaround is tricky because of the requirements on - * that bit. From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM - * volume 2 part 1: - * - * "1 of the following must also be set: - * - Render Target Cache Flush Enable ([12] of DW1) - * - Depth Cache Flush Enable ([0] of DW1) - * - Stall at Pixel Scoreboard ([1] of DW1) - * - Depth Stall ([13] of DW1) - * - Post-Sync Operation ([13] of DW1) - * - Notify Enable ([8] of DW1)" - * - * The cache flushes require the workaround flush that triggered this - * one, so we can't use it. Depth stall would trigger the same. - * Post-sync nonzero is what triggered this second workaround, so we - * can't use that one either. Notify enable is IRQs, which aren't - * really our business. That leaves only stall at scoreboard. - */ -static int -gen6_emit_post_sync_nonzero_flush(struct i915_request *rq) -{ - u32 scratch_addr = - intel_gt_scratch_offset(rq->engine->gt, - INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH); - u32 *cs; - - cs = intel_ring_begin(rq, 6); - if (IS_ERR(cs)) - return PTR_ERR(cs); - - *cs++ = GFX_OP_PIPE_CONTROL(5); - *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD; - *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT; - *cs++ = 0; /* low dword */ - *cs++ = 0; /* high dword */ - *cs++ = MI_NOOP; - intel_ring_advance(rq, cs); - - cs = intel_ring_begin(rq, 6); - if (IS_ERR(cs)) - return PTR_ERR(cs); - - *cs++ = GFX_OP_PIPE_CONTROL(5); - *cs++ = PIPE_CONTROL_QW_WRITE; - *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT; - *cs++ = 0; - *cs++ = 0; - *cs++ = MI_NOOP; - intel_ring_advance(rq, cs); - - return 0; -} - -static int -gen6_render_ring_flush(struct i915_request *rq, u32 mode) -{ - u32 scratch_addr = - intel_gt_scratch_offset(rq->engine->gt, - INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH); - u32 *cs, flags = 0; - int ret; - - /* Force SNB workarounds for PIPE_CONTROL flushes */ - ret = gen6_emit_post_sync_nonzero_flush(rq); - if (ret) - return ret; - - /* Just flush everything. Experiments have shown that reducing the - * number of bits based on the write domains has little performance - * impact. 
- */ - if (mode & EMIT_FLUSH) { - flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; - flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; - /* - * Ensure that any following seqno writes only happen - * when the render cache is indeed flushed. - */ - flags |= PIPE_CONTROL_CS_STALL; - } - if (mode & EMIT_INVALIDATE) { - flags |= PIPE_CONTROL_TLB_INVALIDATE; - flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; - flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; - flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; - flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; - flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; - /* - * TLB invalidate requires a post-sync write. - */ - flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL; - } - - cs = intel_ring_begin(rq, 4); - if (IS_ERR(cs)) - return PTR_ERR(cs); - - *cs++ = GFX_OP_PIPE_CONTROL(4); - *cs++ = flags; - *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT; - *cs++ = 0; - intel_ring_advance(rq, cs); - - return 0; -} - -static u32 *gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs) -{ - /* First we do the gen6_emit_post_sync_nonzero_flush w/a */ - *cs++ = GFX_OP_PIPE_CONTROL(4); - *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD; - *cs++ = 0; - *cs++ = 0; - - *cs++ = GFX_OP_PIPE_CONTROL(4); - *cs++ = PIPE_CONTROL_QW_WRITE; - *cs++ = intel_gt_scratch_offset(rq->engine->gt, - INTEL_GT_SCRATCH_FIELD_DEFAULT) | - PIPE_CONTROL_GLOBAL_GTT; - *cs++ = 0; - - /* Finally we can flush and with it emit the breadcrumb */ - *cs++ = GFX_OP_PIPE_CONTROL(4); - *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | - PIPE_CONTROL_DEPTH_CACHE_FLUSH | - PIPE_CONTROL_DC_FLUSH_ENABLE | - PIPE_CONTROL_QW_WRITE | - PIPE_CONTROL_CS_STALL); - *cs++ = i915_request_active_timeline(rq)->hwsp_offset | - PIPE_CONTROL_GLOBAL_GTT; - *cs++ = rq->fence.seqno; - - *cs++ = MI_USER_INTERRUPT; - *cs++ = MI_NOOP; - - rq->tail = intel_ring_offset(rq, cs); - assert_ring_tail_valid(rq->ring, rq->tail); - - return cs; -} - -static int -gen7_render_ring_cs_stall_wa(struct i915_request *rq) -{ - u32 *cs; - - cs = intel_ring_begin(rq, 4); - if (IS_ERR(cs)) - return PTR_ERR(cs); - - *cs++ = GFX_OP_PIPE_CONTROL(4); - *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD; - *cs++ = 0; - *cs++ = 0; - intel_ring_advance(rq, cs); - - return 0; -} - -static int -gen7_render_ring_flush(struct i915_request *rq, u32 mode) -{ - u32 scratch_addr = - intel_gt_scratch_offset(rq->engine->gt, - INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH); - u32 *cs, flags = 0; - - /* - * Ensure that any following seqno writes only happen when the render - * cache is indeed flushed. - * - * Workaround: 4th PIPE_CONTROL command (except the ones with only - * read-cache invalidate bits set) must have the CS_STALL bit set. We - * don't try to be clever and just set it unconditionally. - */ - flags |= PIPE_CONTROL_CS_STALL; - - /* - * CS_STALL suggests at least a post-sync write. - */ - flags |= PIPE_CONTROL_QW_WRITE; - flags |= PIPE_CONTROL_GLOBAL_GTT_IVB; - - /* Just flush everything. Experiments have shown that reducing the - * number of bits based on the write domains has little performance - * impact. 
- */ - if (mode & EMIT_FLUSH) { - flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; - flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; - flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; - flags |= PIPE_CONTROL_FLUSH_ENABLE; - } - if (mode & EMIT_INVALIDATE) { - flags |= PIPE_CONTROL_TLB_INVALIDATE; - flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; - flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; - flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; - flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; - flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; - flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR; - - /* Workaround: we must issue a pipe_control with CS-stall bit - * set before a pipe_control command that has the state cache - * invalidate bit set. */ - gen7_render_ring_cs_stall_wa(rq); - } - - cs = intel_ring_begin(rq, 4); - if (IS_ERR(cs)) - return PTR_ERR(cs); - - *cs++ = GFX_OP_PIPE_CONTROL(4); - *cs++ = flags; - *cs++ = scratch_addr; - *cs++ = 0; - intel_ring_advance(rq, cs); - - return 0; -} - -static u32 *gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs) -{ - *cs++ = GFX_OP_PIPE_CONTROL(4); - *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | - PIPE_CONTROL_DEPTH_CACHE_FLUSH | - PIPE_CONTROL_DC_FLUSH_ENABLE | - PIPE_CONTROL_FLUSH_ENABLE | - PIPE_CONTROL_QW_WRITE | - PIPE_CONTROL_GLOBAL_GTT_IVB | - PIPE_CONTROL_CS_STALL); - *cs++ = i915_request_active_timeline(rq)->hwsp_offset; - *cs++ = rq->fence.seqno; - - *cs++ = MI_USER_INTERRUPT; - *cs++ = MI_NOOP; - - rq->tail = intel_ring_offset(rq, cs); - assert_ring_tail_valid(rq->ring, rq->tail); - - return cs; -} - -static u32 *gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs) -{ - GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma); - GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR); - - *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX; - *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT; - *cs++ = rq->fence.seqno; - - *cs++ = MI_USER_INTERRUPT; - - rq->tail = intel_ring_offset(rq, cs); - assert_ring_tail_valid(rq->ring, rq->tail); - - return cs; -} - -#define GEN7_XCS_WA 32 -static u32 *gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs) -{ - int i; - - GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma); - GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR); - - *cs++ = MI_FLUSH_DW | MI_INVALIDATE_TLB | - MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX; - *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT; - *cs++ = rq->fence.seqno; - - for (i = 0; i < GEN7_XCS_WA; i++) { - *cs++ = MI_STORE_DWORD_INDEX; - *cs++ = I915_GEM_HWS_SEQNO_ADDR; - *cs++ = rq->fence.seqno; - } - - *cs++ = MI_FLUSH_DW; - *cs++ = 0; - *cs++ = 0; - - *cs++ = MI_USER_INTERRUPT; - *cs++ = MI_NOOP; - - rq->tail = intel_ring_offset(rq, cs); - assert_ring_tail_valid(rq->ring, rq->tail); - - return cs; -} -#undef GEN7_XCS_WA - static void set_hwstam(struct intel_engine_cs *engine, u32 mask) { /* @@ -865,32 +429,6 @@ static void reset_finish(struct intel_engine_cs *engine) { } -static int rcs_resume(struct intel_engine_cs *engine) -{ - struct drm_i915_private *i915 = engine->i915; - struct intel_uncore *uncore = engine->uncore; - - /* - * Disable CONSTANT_BUFFER before it is loaded from the context - * image. For as it is loaded, it is executed and the stored - * address may no longer be valid, leading to a GPU hang. 
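The ECOSKPD and INSTPM writes just below rely on the masked-register convention: the top sixteen bits of the written value act as a per-bit write enable for the bottom sixteen, so a single write updates only the selected bits and no read-modify-write cycle is needed. The helpers amount to the following (an illustrative restatement; the driver's versions live in i915_reg.h):

#define MASKED_FIELD(mask, value)	(((mask) << 16) | (value))
#define MASKED_BIT_ENABLE(bit)		MASKED_FIELD((bit), (bit))
#define MASKED_BIT_DISABLE(bit)		MASKED_FIELD((bit), 0)

Because a masked write carries its own mask, no saved register state is required, which is what allows rcs_resume()'s register setup to migrate into the workaround lists elsewhere in this patch.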
- * - * This imposes the requirement that userspace reload their - * CONSTANT_BUFFER on every batch, fortunately a requirement - * they are already accustomed to from before contexts were - * enabled. - */ - if (IS_GEN(i915, 4)) - intel_uncore_write(uncore, ECOSKPD, - _MASKED_BIT_ENABLE(ECO_CONSTANT_BUFFER_SR_DISABLE)); - - if (IS_GEN_RANGE(i915, 6, 7)) - intel_uncore_write(uncore, INSTPM, - _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING)); - - return xcs_resume(engine); -} - static void reset_cancel(struct intel_engine_cs *engine) { struct i915_request *request; @@ -918,255 +456,6 @@ static void i9xx_submit_request(struct i915_request *request) intel_ring_set_tail(request->ring, request->tail)); } -static u32 *i9xx_emit_breadcrumb(struct i915_request *rq, u32 *cs) -{ - GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma); - GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR); - - *cs++ = MI_FLUSH; - - *cs++ = MI_STORE_DWORD_INDEX; - *cs++ = I915_GEM_HWS_SEQNO_ADDR; - *cs++ = rq->fence.seqno; - - *cs++ = MI_USER_INTERRUPT; - *cs++ = MI_NOOP; - - rq->tail = intel_ring_offset(rq, cs); - assert_ring_tail_valid(rq->ring, rq->tail); - - return cs; -} - -#define GEN5_WA_STORES 8 /* must be at least 1! */ -static u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs) -{ - int i; - - GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma); - GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR); - - *cs++ = MI_FLUSH; - - BUILD_BUG_ON(GEN5_WA_STORES < 1); - for (i = 0; i < GEN5_WA_STORES; i++) { - *cs++ = MI_STORE_DWORD_INDEX; - *cs++ = I915_GEM_HWS_SEQNO_ADDR; - *cs++ = rq->fence.seqno; - } - - *cs++ = MI_USER_INTERRUPT; - - rq->tail = intel_ring_offset(rq, cs); - assert_ring_tail_valid(rq->ring, rq->tail); - - return cs; -} -#undef GEN5_WA_STORES - -static void -gen5_irq_enable(struct intel_engine_cs *engine) -{ - gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask); -} - -static void -gen5_irq_disable(struct intel_engine_cs *engine) -{ - gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask); -} - -static void -i9xx_irq_enable(struct intel_engine_cs *engine) -{ - engine->i915->irq_mask &= ~engine->irq_enable_mask; - intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask); - intel_uncore_posting_read_fw(engine->uncore, GEN2_IMR); -} - -static void -i9xx_irq_disable(struct intel_engine_cs *engine) -{ - engine->i915->irq_mask |= engine->irq_enable_mask; - intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask); -} - -static void -i8xx_irq_enable(struct intel_engine_cs *engine) -{ - struct drm_i915_private *i915 = engine->i915; - - i915->irq_mask &= ~engine->irq_enable_mask; - intel_uncore_write16(&i915->uncore, GEN2_IMR, i915->irq_mask); - ENGINE_POSTING_READ16(engine, RING_IMR); -} - -static void -i8xx_irq_disable(struct intel_engine_cs *engine) -{ - struct drm_i915_private *i915 = engine->i915; - - i915->irq_mask |= engine->irq_enable_mask; - intel_uncore_write16(&i915->uncore, GEN2_IMR, i915->irq_mask); -} - -static int -bsd_ring_flush(struct i915_request *rq, u32 mode) -{ - u32 *cs; - - cs = intel_ring_begin(rq, 2); - if (IS_ERR(cs)) - return PTR_ERR(cs); - - *cs++ = MI_FLUSH; - *cs++ = MI_NOOP; - intel_ring_advance(rq, cs); - return 0; -} - -static void -gen6_irq_enable(struct intel_engine_cs *engine) -{ - ENGINE_WRITE(engine, RING_IMR, - ~(engine->irq_enable_mask | engine->irq_keep_mask)); - - /* 
Flush/delay to ensure the RING_IMR is active before the GT IMR */ - ENGINE_POSTING_READ(engine, RING_IMR); - - gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask); -} - -static void -gen6_irq_disable(struct intel_engine_cs *engine) -{ - ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask); - gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask); -} - -static void -hsw_vebox_irq_enable(struct intel_engine_cs *engine) -{ - ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask); - - /* Flush/delay to ensure the RING_IMR is active before the GT IMR */ - ENGINE_POSTING_READ(engine, RING_IMR); - - gen6_gt_pm_unmask_irq(engine->gt, engine->irq_enable_mask); -} - -static void -hsw_vebox_irq_disable(struct intel_engine_cs *engine) -{ - ENGINE_WRITE(engine, RING_IMR, ~0); - gen6_gt_pm_mask_irq(engine->gt, engine->irq_enable_mask); -} - -static int -i965_emit_bb_start(struct i915_request *rq, - u64 offset, u32 length, - unsigned int dispatch_flags) -{ - u32 *cs; - - cs = intel_ring_begin(rq, 2); - if (IS_ERR(cs)) - return PTR_ERR(cs); - - *cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT | (dispatch_flags & - I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE_I965); - *cs++ = offset; - intel_ring_advance(rq, cs); - - return 0; -} - -/* Just userspace ABI convention to limit the wa batch bo to a resonable size */ -#define I830_BATCH_LIMIT SZ_256K -#define I830_TLB_ENTRIES (2) -#define I830_WA_SIZE max(I830_TLB_ENTRIES*4096, I830_BATCH_LIMIT) -static int -i830_emit_bb_start(struct i915_request *rq, - u64 offset, u32 len, - unsigned int dispatch_flags) -{ - u32 *cs, cs_offset = - intel_gt_scratch_offset(rq->engine->gt, - INTEL_GT_SCRATCH_FIELD_DEFAULT); - - GEM_BUG_ON(rq->engine->gt->scratch->size < I830_WA_SIZE); - - cs = intel_ring_begin(rq, 6); - if (IS_ERR(cs)) - return PTR_ERR(cs); - - /* Evict the invalid PTE TLBs */ - *cs++ = COLOR_BLT_CMD | BLT_WRITE_RGBA; - *cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096; - *cs++ = I830_TLB_ENTRIES << 16 | 4; /* load each page */ - *cs++ = cs_offset; - *cs++ = 0xdeadbeef; - *cs++ = MI_NOOP; - intel_ring_advance(rq, cs); - - if ((dispatch_flags & I915_DISPATCH_PINNED) == 0) { - if (len > I830_BATCH_LIMIT) - return -ENOSPC; - - cs = intel_ring_begin(rq, 6 + 2); - if (IS_ERR(cs)) - return PTR_ERR(cs); - - /* Blit the batch (which has now all relocs applied) to the - * stable batch scratch bo area (so that the CS never - * stumbles over its tlb invalidation bug) ... - */ - *cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (6 - 2); - *cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096; - *cs++ = DIV_ROUND_UP(len, 4096) << 16 | 4096; - *cs++ = cs_offset; - *cs++ = 4096; - *cs++ = offset; - - *cs++ = MI_FLUSH; - *cs++ = MI_NOOP; - intel_ring_advance(rq, cs); - - /* ... and execute it. */ - offset = cs_offset; - } - - cs = intel_ring_begin(rq, 2); - if (IS_ERR(cs)) - return PTR_ERR(cs); - - *cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT; - *cs++ = offset | (dispatch_flags & I915_DISPATCH_SECURE ? 0 : - MI_BATCH_NON_SECURE); - intel_ring_advance(rq, cs); - - return 0; -} - -static int -i915_emit_bb_start(struct i915_request *rq, - u64 offset, u32 len, - unsigned int dispatch_flags) -{ - u32 *cs; - - cs = intel_ring_begin(rq, 2); - if (IS_ERR(cs)) - return PTR_ERR(cs); - - *cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT; - *cs++ = offset | (dispatch_flags & I915_DISPATCH_SECURE ? 
0 : - MI_BATCH_NON_SECURE); - intel_ring_advance(rq, cs); - - return 0; -} - static void __ring_context_fini(struct intel_context *ce) { i915_vma_put(ce->state); @@ -1356,8 +645,8 @@ static inline int mi_set_context(struct i915_request *rq, struct intel_context *ce, u32 flags) { - struct drm_i915_private *i915 = rq->i915; struct intel_engine_cs *engine = rq->engine; + struct drm_i915_private *i915 = engine->i915; enum intel_engine_id id; const int num_engines = IS_HASWELL(i915) ? RUNTIME_INFO(i915)->num_engines - 1 : 0; @@ -1471,7 +760,7 @@ static inline int mi_set_context(struct i915_request *rq, static int remap_l3_slice(struct i915_request *rq, int slice) { - u32 *cs, *remap_info = rq->i915->l3_parity.remap_info[slice]; + u32 *cs, *remap_info = rq->engine->i915->l3_parity.remap_info[slice]; int i; if (!remap_info) @@ -1582,7 +871,7 @@ static int switch_context(struct i915_request *rq) void **residuals = NULL; int ret; - GEM_BUG_ON(HAS_EXECLISTS(rq->i915)); + GEM_BUG_ON(HAS_EXECLISTS(engine->i915)); if (engine->wa_ctx.vma && ce != engine->kernel_context) { if (engine->wa_ctx.vma->private != ce) { @@ -1704,99 +993,6 @@ static void gen6_bsd_submit_request(struct i915_request *request) intel_uncore_forcewake_put(uncore, FORCEWAKE_ALL); } -static int mi_flush_dw(struct i915_request *rq, u32 flags) -{ - u32 cmd, *cs; - - cs = intel_ring_begin(rq, 4); - if (IS_ERR(cs)) - return PTR_ERR(cs); - - cmd = MI_FLUSH_DW; - - /* - * We always require a command barrier so that subsequent - * commands, such as breadcrumb interrupts, are strictly ordered - * wrt the contents of the write cache being flushed to memory - * (and thus being coherent from the CPU). - */ - cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; - - /* - * Bspec vol 1c.3 - blitter engine command streamer: - * "If ENABLED, all TLBs will be invalidated once the flush - * operation is complete. This bit is only valid when the - * Post-Sync Operation field is a value of 1h or 3h." - */ - cmd |= flags; - - *cs++ = cmd; - *cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT; - *cs++ = 0; - *cs++ = MI_NOOP; - - intel_ring_advance(rq, cs); - - return 0; -} - -static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags) -{ - return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? invflags : 0); -} - -static int gen6_bsd_ring_flush(struct i915_request *rq, u32 mode) -{ - return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD); -} - -static int -hsw_emit_bb_start(struct i915_request *rq, - u64 offset, u32 len, - unsigned int dispatch_flags) -{ - u32 *cs; - - cs = intel_ring_begin(rq, 2); - if (IS_ERR(cs)) - return PTR_ERR(cs); - - *cs++ = MI_BATCH_BUFFER_START | (dispatch_flags & I915_DISPATCH_SECURE ? - 0 : MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW); - /* bit0-7 is the length on GEN6+ */ - *cs++ = offset; - intel_ring_advance(rq, cs); - - return 0; -} - -static int -gen6_emit_bb_start(struct i915_request *rq, - u64 offset, u32 len, - unsigned int dispatch_flags) -{ - u32 *cs; - - cs = intel_ring_begin(rq, 2); - if (IS_ERR(cs)) - return PTR_ERR(cs); - - *cs++ = MI_BATCH_BUFFER_START | (dispatch_flags & I915_DISPATCH_SECURE ? 
- 0 : MI_BATCH_NON_SECURE_I965); - /* bit0-7 is the length on GEN6+ */ - *cs++ = offset; - intel_ring_advance(rq, cs); - - return 0; -} - -/* Blitter support (SandyBridge+) */ - -static int gen6_ring_flush(struct i915_request *rq, u32 mode) -{ - return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB); -} - static void i9xx_set_default_submission(struct intel_engine_cs *engine) { engine->submit_request = i9xx_submit_request; @@ -1843,11 +1039,11 @@ static void setup_irq(struct intel_engine_cs *engine) engine->irq_enable = gen5_irq_enable; engine->irq_disable = gen5_irq_disable; } else if (INTEL_GEN(i915) >= 3) { - engine->irq_enable = i9xx_irq_enable; - engine->irq_disable = i9xx_irq_disable; + engine->irq_enable = gen3_irq_enable; + engine->irq_disable = gen3_irq_disable; } else { - engine->irq_enable = i8xx_irq_enable; - engine->irq_disable = i8xx_irq_disable; + engine->irq_enable = gen2_irq_enable; + engine->irq_disable = gen2_irq_disable; } } @@ -1874,7 +1070,7 @@ static void setup_common(struct intel_engine_cs *engine) * equivalent to our next initial bread so we can elide * engine->emit_init_breadcrumb(). */ - engine->emit_fini_breadcrumb = i9xx_emit_breadcrumb; + engine->emit_fini_breadcrumb = gen3_emit_breadcrumb; if (IS_GEN(i915, 5)) engine->emit_fini_breadcrumb = gen5_emit_breadcrumb; @@ -1883,11 +1079,11 @@ static void setup_common(struct intel_engine_cs *engine) if (INTEL_GEN(i915) >= 6) engine->emit_bb_start = gen6_emit_bb_start; else if (INTEL_GEN(i915) >= 4) - engine->emit_bb_start = i965_emit_bb_start; + engine->emit_bb_start = gen4_emit_bb_start; else if (IS_I830(i915) || IS_I845G(i915)) engine->emit_bb_start = i830_emit_bb_start; else - engine->emit_bb_start = i915_emit_bb_start; + engine->emit_bb_start = gen3_emit_bb_start; } static void setup_rcs(struct intel_engine_cs *engine) @@ -1900,25 +1096,23 @@ static void setup_rcs(struct intel_engine_cs *engine) engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT; if (INTEL_GEN(i915) >= 7) { - engine->emit_flush = gen7_render_ring_flush; - engine->emit_fini_breadcrumb = gen7_rcs_emit_breadcrumb; + engine->emit_flush = gen7_emit_flush_rcs; + engine->emit_fini_breadcrumb = gen7_emit_breadcrumb_rcs; } else if (IS_GEN(i915, 6)) { - engine->emit_flush = gen6_render_ring_flush; - engine->emit_fini_breadcrumb = gen6_rcs_emit_breadcrumb; + engine->emit_flush = gen6_emit_flush_rcs; + engine->emit_fini_breadcrumb = gen6_emit_breadcrumb_rcs; } else if (IS_GEN(i915, 5)) { - engine->emit_flush = gen4_render_ring_flush; + engine->emit_flush = gen4_emit_flush_rcs; } else { if (INTEL_GEN(i915) < 4) - engine->emit_flush = gen2_render_ring_flush; + engine->emit_flush = gen2_emit_flush; else - engine->emit_flush = gen4_render_ring_flush; + engine->emit_flush = gen4_emit_flush_rcs; engine->irq_enable_mask = I915_USER_INTERRUPT; } if (IS_HASWELL(i915)) engine->emit_bb_start = hsw_emit_bb_start; - - engine->resume = rcs_resume; } static void setup_vcs(struct intel_engine_cs *engine) @@ -1929,15 +1123,15 @@ static void setup_vcs(struct intel_engine_cs *engine) /* gen6 bsd needs a special wa for tail updates */ if (IS_GEN(i915, 6)) engine->set_default_submission = gen6_bsd_set_default_submission; - engine->emit_flush = gen6_bsd_ring_flush; + engine->emit_flush = gen6_emit_flush_vcs; engine->irq_enable_mask = GT_BSD_USER_INTERRUPT; if (IS_GEN(i915, 6)) - engine->emit_fini_breadcrumb = gen6_xcs_emit_breadcrumb; + engine->emit_fini_breadcrumb = gen6_emit_breadcrumb_xcs; else - engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb; + 
engine->emit_fini_breadcrumb = gen7_emit_breadcrumb_xcs; } else { - engine->emit_flush = bsd_ring_flush; + engine->emit_flush = gen4_emit_flush_vcs; if (IS_GEN(i915, 5)) engine->irq_enable_mask = ILK_BSD_USER_INTERRUPT; else @@ -1949,13 +1143,13 @@ static void setup_bcs(struct intel_engine_cs *engine) { struct drm_i915_private *i915 = engine->i915; - engine->emit_flush = gen6_ring_flush; + engine->emit_flush = gen6_emit_flush_xcs; engine->irq_enable_mask = GT_BLT_USER_INTERRUPT; if (IS_GEN(i915, 6)) - engine->emit_fini_breadcrumb = gen6_xcs_emit_breadcrumb; + engine->emit_fini_breadcrumb = gen6_emit_breadcrumb_xcs; else - engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb; + engine->emit_fini_breadcrumb = gen7_emit_breadcrumb_xcs; } static void setup_vecs(struct intel_engine_cs *engine) @@ -1964,12 +1158,12 @@ static void setup_vecs(struct intel_engine_cs *engine) GEM_BUG_ON(INTEL_GEN(i915) < 7); - engine->emit_flush = gen6_ring_flush; + engine->emit_flush = gen6_emit_flush_xcs; engine->irq_enable_mask = PM_VEBOX_USER_INTERRUPT; - engine->irq_enable = hsw_vebox_irq_enable; - engine->irq_disable = hsw_vebox_irq_disable; + engine->irq_enable = hsw_irq_enable_vecs; + engine->irq_disable = hsw_irq_disable_vecs; - engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb; + engine->emit_fini_breadcrumb = gen7_emit_breadcrumb_xcs; } static int gen7_ctx_switch_bb_setup(struct intel_engine_cs * const engine, diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c b/drivers/gpu/drm/i915/gt/intel_rps.c index 2f59fc6df3c2..296391deeb94 100644 --- a/drivers/gpu/drm/i915/gt/intel_rps.c +++ b/drivers/gpu/drm/i915/gt/intel_rps.c @@ -51,15 +51,16 @@ static void rps_timer(struct timer_list *t) { struct intel_rps *rps = from_timer(rps, t, timer); struct intel_engine_cs *engine; + ktime_t dt, last, timestamp; enum intel_engine_id id; s64 max_busy[3] = {}; - ktime_t dt, last; + timestamp = 0; for_each_engine(engine, rps_to_gt(rps), id) { s64 busy; int i; - dt = intel_engine_get_busy_time(engine); + dt = intel_engine_get_busy_time(engine, ×tamp); last = engine->stats.rps; engine->stats.rps = dt; @@ -69,16 +70,14 @@ static void rps_timer(struct timer_list *t) swap(busy, max_busy[i]); } } - - dt = ktime_get(); last = rps->pm_timestamp; - rps->pm_timestamp = dt; + rps->pm_timestamp = timestamp; if (intel_rps_is_active(rps)) { s64 busy; int i; - dt = ktime_sub(dt, last); + dt = ktime_sub(timestamp, last); /* * Our goal is to evaluate each engine independently, so we run diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c index 85d2bef51524..2da366821dda 100644 --- a/drivers/gpu/drm/i915/gt/intel_workarounds.c +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c @@ -205,6 +205,18 @@ wa_masked_dis(struct i915_wa_list *wal, i915_reg_t reg, u32 val) #define WA_SET_FIELD_MASKED(addr, mask, value) \ wa_write_masked_or(wal, (addr), 0, _MASKED_FIELD((mask), (value))) +static void gen6_ctx_workarounds_init(struct intel_engine_cs *engine, + struct i915_wa_list *wal) +{ + WA_SET_BIT_MASKED(INSTPM, INSTPM_FORCE_ORDERING); +} + +static void gen7_ctx_workarounds_init(struct intel_engine_cs *engine, + struct i915_wa_list *wal) +{ + WA_SET_BIT_MASKED(INSTPM, INSTPM_FORCE_ORDERING); +} + static void gen8_ctx_workarounds_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) { @@ -355,7 +367,10 @@ static void gen9_ctx_workarounds_init(struct intel_engine_cs *engine, HDC_FORCE_NON_COHERENT); /* WaDisableSamplerPowerBypassForSOPingPong:skl,bxt,kbl,cfl */ - if (IS_SKYLAKE(i915) || 
IS_KABYLAKE(i915) || IS_COFFEELAKE(i915)) + if (IS_SKYLAKE(i915) || + IS_KABYLAKE(i915) || + IS_COFFEELAKE(i915) || + IS_COMETLAKE(i915)) WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3, GEN8_SAMPLER_POWER_BYPASS_DIS); @@ -600,11 +615,14 @@ static void tgl_ctx_workarounds_init(struct intel_engine_cs *engine, * Wa_1604555607:gen12 and Wa_1608008084:gen12 * FF_MODE2 register will return the wrong value when read. The default * value for this register is zero for all fields and there are no bit - * masks. So instead of doing a RMW we should just write the TDS timer - * value for Wa_1604555607. + * masks. So instead of doing a RMW we should just write the GS Timer + * and TDS timer values for Wa_1604555607 and Wa_16011163337. */ - wa_add(wal, FF_MODE2, FF_MODE2_TDS_TIMER_MASK, - FF_MODE2_TDS_TIMER_128, 0); + wa_add(wal, + FF_MODE2, + FF_MODE2_GS_TIMER_MASK | FF_MODE2_TDS_TIMER_MASK, + FF_MODE2_GS_TIMER_224 | FF_MODE2_TDS_TIMER_128, + 0); /* WaDisableGPGPUMidThreadPreemption:tgl */ WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1, @@ -630,7 +648,7 @@ __intel_engine_init_ctx_wa(struct intel_engine_cs *engine, icl_ctx_workarounds_init(engine, wal); else if (IS_CANNONLAKE(i915)) cnl_ctx_workarounds_init(engine, wal); - else if (IS_COFFEELAKE(i915)) + else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915)) cfl_ctx_workarounds_init(engine, wal); else if (IS_GEMINILAKE(i915)) glk_ctx_workarounds_init(engine, wal); @@ -644,6 +662,10 @@ __intel_engine_init_ctx_wa(struct intel_engine_cs *engine, chv_ctx_workarounds_init(engine, wal); else if (IS_BROADWELL(i915)) bdw_ctx_workarounds_init(engine, wal); + else if (IS_GEN(i915, 7)) + gen7_ctx_workarounds_init(engine, wal); + else if (IS_GEN(i915, 6)) + gen6_ctx_workarounds_init(engine, wal); else if (INTEL_GEN(i915) < 8) return; else @@ -917,7 +939,7 @@ static void gen9_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) { /* WaDisableKillLogic:bxt,skl,kbl */ - if (!IS_COFFEELAKE(i915)) + if (!IS_COFFEELAKE(i915) && !IS_COMETLAKE(i915)) wa_write_or(wal, GAM_ECOCHK, ECOCHK_DIS_TLB); @@ -1180,7 +1202,7 @@ gt_init_workarounds(struct drm_i915_private *i915, struct i915_wa_list *wal) icl_gt_workarounds_init(i915, wal); else if (IS_CANNONLAKE(i915)) cnl_gt_workarounds_init(i915, wal); - else if (IS_COFFEELAKE(i915)) + else if (IS_COFFEELAKE(i915) || IS_COMETLAKE(i915)) cfl_gt_workarounds_init(i915, wal); else if (IS_GEMINILAKE(i915)) glk_gt_workarounds_init(i915, wal); @@ -1428,6 +1450,18 @@ static void cfl_whitelist_build(struct intel_engine_cs *engine) RING_FORCE_TO_NONPRIV_RANGE_4); } +static void cml_whitelist_build(struct intel_engine_cs *engine) +{ + struct i915_wa_list *w = &engine->whitelist; + + if (engine->class != RENDER_CLASS) + whitelist_reg_ext(w, + RING_CTX_TIMESTAMP(engine->mmio_base), + RING_FORCE_TO_NONPRIV_ACCESS_RD); + + cfl_whitelist_build(engine); +} + static void cnl_whitelist_build(struct intel_engine_cs *engine) { struct i915_wa_list *w = &engine->whitelist; @@ -1478,9 +1512,15 @@ static void icl_whitelist_build(struct intel_engine_cs *engine) /* hucStatus2RegOffset */ whitelist_reg_ext(w, _MMIO(0x23B0 + engine->mmio_base), RING_FORCE_TO_NONPRIV_ACCESS_RD); + whitelist_reg_ext(w, + RING_CTX_TIMESTAMP(engine->mmio_base), + RING_FORCE_TO_NONPRIV_ACCESS_RD); break; default: + whitelist_reg_ext(w, + RING_CTX_TIMESTAMP(engine->mmio_base), + RING_FORCE_TO_NONPRIV_ACCESS_RD); break; } } @@ -1512,6 +1552,9 @@ static void tgl_whitelist_build(struct intel_engine_cs *engine) whitelist_reg(w, HIZ_CHICKEN); break; default: + whitelist_reg_ext(w, + 
RING_CTX_TIMESTAMP(engine->mmio_base), + RING_FORCE_TO_NONPRIV_ACCESS_RD); break; } } @@ -1529,6 +1572,8 @@ void intel_engine_init_whitelist(struct intel_engine_cs *engine) icl_whitelist_build(engine); else if (IS_CANNONLAKE(i915)) cnl_whitelist_build(engine); + else if (IS_COMETLAKE(i915)) + cml_whitelist_build(engine); else if (IS_COFFEELAKE(i915)) cfl_whitelist_build(engine); else if (IS_GEMINILAKE(i915)) @@ -1725,6 +1770,12 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) wa_write_or(wal, GEN7_FF_THREAD_MODE, GEN12_FF_TESSELATION_DOP_GATE_DISABLE); + + /* Wa_22010271021:ehl */ + if (IS_ELKHARTLAKE(i915)) + wa_masked_en(wal, + GEN9_CS_DEBUG_MODE1, + FF_DOP_CLOCK_GATE_DISABLE); } if (IS_GEN_RANGE(i915, 9, 12)) { @@ -1734,7 +1785,10 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) GEN9_FFSC_PERCTX_PREEMPT_CTRL); } - if (IS_SKYLAKE(i915) || IS_KABYLAKE(i915) || IS_COFFEELAKE(i915)) { + if (IS_SKYLAKE(i915) || + IS_KABYLAKE(i915) || + IS_COFFEELAKE(i915) || + IS_COMETLAKE(i915)) { /* WaEnableGapsTsvCreditFix:skl,kbl,cfl */ wa_write_or(wal, GEN8_GARBCNTL, @@ -1818,6 +1872,21 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) 0, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH), /* XXX bit doesn't stick on Broadwater */ IS_I965G(i915) ? 0 : VS_TIMER_DISPATCH); + + if (IS_GEN(i915, 4)) + /* + * Disable CONSTANT_BUFFER before it is loaded from the context + * image. For as it is loaded, it is executed and the stored + * address may no longer be valid, leading to a GPU hang. + * + * This imposes the requirement that userspace reload their + * CONSTANT_BUFFER on every batch, fortunately a requirement + * they are already accustomed to from before contexts were + * enabled. + */ + wa_add(wal, ECOSKPD, + 0, _MASKED_BIT_ENABLE(ECO_CONSTANT_BUFFER_SR_DISABLE), + 0 /* XXX bit doesn't stick on Broadwater */); } static void @@ -1932,7 +2001,7 @@ wa_list_srm(struct i915_request *rq, const struct i915_wa_list *wal, struct i915_vma *vma) { - struct drm_i915_private *i915 = rq->i915; + struct drm_i915_private *i915 = rq->engine->i915; unsigned int i, count = 0; const struct i915_wa *wa; u32 srm, *cs; @@ -2021,7 +2090,7 @@ static int engine_wa_list_verify(struct intel_context *ce, err = 0; for (i = 0, wa = wal->list; i < wal->count; i++, wa++) { - if (mcr_range(rq->i915, i915_mmio_reg_offset(wa->reg))) + if (mcr_range(rq->engine->i915, i915_mmio_reg_offset(wa->reg))) continue; if (!wa_verify(wa, results[i], wal->name, from)) diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_cs.c b/drivers/gpu/drm/i915/gt/selftest_engine_cs.c index f88e445a1cae..729c3c7b11e2 100644 --- a/drivers/gpu/drm/i915/gt/selftest_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/selftest_engine_cs.c @@ -49,7 +49,7 @@ static int write_timestamp(struct i915_request *rq, int slot) return PTR_ERR(cs); cmd = MI_STORE_REGISTER_MEM | MI_USE_GGTT; - if (INTEL_GEN(rq->i915) >= 8) + if (INTEL_GEN(rq->engine->i915) >= 8) cmd++; *cs++ = cmd; *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(rq->engine->mmio_base)); diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c index 697114dd1f47..73243ba59c7d 100644 --- a/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c +++ b/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c @@ -10,6 +10,7 @@ #include "intel_gt_requests.h" #include "i915_selftest.h" +#include "selftest_engine_heartbeat.h" static int timeline_sync(struct intel_timeline *tl) { @@ -142,24 +143,6 @@ out: 
return err; } -static void engine_heartbeat_disable(struct intel_engine_cs *engine, - unsigned long *saved) -{ - *saved = engine->props.heartbeat_interval_ms; - engine->props.heartbeat_interval_ms = 0; - - intel_engine_pm_get(engine); - intel_engine_park_heartbeat(engine); -} - -static void engine_heartbeat_enable(struct intel_engine_cs *engine, - unsigned long saved) -{ - intel_engine_pm_put(engine); - - engine->props.heartbeat_interval_ms = saved; -} - static int live_idle_flush(void *arg) { struct intel_gt *gt = arg; @@ -170,11 +153,9 @@ static int live_idle_flush(void *arg) /* Check that we can flush the idle barriers */ for_each_engine(engine, gt, id) { - unsigned long heartbeat; - - engine_heartbeat_disable(engine, &heartbeat); + st_engine_heartbeat_disable(engine); err = __live_idle_pulse(engine, intel_engine_flush_barriers); - engine_heartbeat_enable(engine, heartbeat); + st_engine_heartbeat_enable(engine); if (err) break; } @@ -192,11 +173,9 @@ static int live_idle_pulse(void *arg) /* Check that heartbeat pulses flush the idle barriers */ for_each_engine(engine, gt, id) { - unsigned long heartbeat; - - engine_heartbeat_disable(engine, &heartbeat); + st_engine_heartbeat_disable(engine); err = __live_idle_pulse(engine, intel_engine_pulse); - engine_heartbeat_enable(engine, heartbeat); + st_engine_heartbeat_enable(engine); if (err && err != -ENODEV) break; @@ -386,11 +365,27 @@ int intel_heartbeat_live_selftests(struct drm_i915_private *i915) if (intel_gt_is_wedged(&i915->gt)) return 0; - saved_hangcheck = i915_modparams.enable_hangcheck; - i915_modparams.enable_hangcheck = INT_MAX; + saved_hangcheck = i915->params.enable_hangcheck; + i915->params.enable_hangcheck = INT_MAX; err = intel_gt_live_subtests(tests, &i915->gt); - i915_modparams.enable_hangcheck = saved_hangcheck; + i915->params.enable_hangcheck = saved_hangcheck; return err; } + +void st_engine_heartbeat_disable(struct intel_engine_cs *engine) +{ + engine->props.heartbeat_interval_ms = 0; + + intel_engine_pm_get(engine); + intel_engine_park_heartbeat(engine); +} + +void st_engine_heartbeat_enable(struct intel_engine_cs *engine) +{ + intel_engine_pm_put(engine); + + engine->props.heartbeat_interval_ms = + engine->defaults.heartbeat_interval_ms; +} diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.h b/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.h new file mode 100644 index 000000000000..cd27113d5400 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2020 Intel Corporation + */ + +#ifndef SELFTEST_ENGINE_HEARTBEAT_H +#define SELFTEST_ENGINE_HEARTBEAT_H + +struct intel_engine_cs; + +void st_engine_heartbeat_disable(struct intel_engine_cs *engine); +void st_engine_heartbeat_enable(struct intel_engine_cs *engine); + +#endif /* SELFTEST_ENGINE_HEARTBEAT_H */ diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c index cbf6b0735272..b08fc5390e8a 100644 --- a/drivers/gpu/drm/i915/gt/selftest_engine_pm.c +++ b/drivers/gpu/drm/i915/gt/selftest_engine_pm.c @@ -6,7 +6,107 @@ #include "i915_selftest.h" #include "selftest_engine.h" +#include "selftest_engine_heartbeat.h" #include "selftests/igt_atomic.h" +#include "selftests/igt_flush_test.h" +#include "selftests/igt_spinner.h" + +static int live_engine_busy_stats(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + struct igt_spinner spin; + int err = 0; + + /* + * Check 
that if an engine supports busy-stats, they tell the truth. + */ + + if (igt_spinner_init(&spin, gt)) + return -ENOMEM; + + GEM_BUG_ON(intel_gt_pm_is_awake(gt)); + for_each_engine(engine, gt, id) { + struct i915_request *rq; + ktime_t de, dt; + ktime_t t[2]; + + if (!intel_engine_supports_stats(engine)) + continue; + + if (!intel_engine_can_store_dword(engine)) + continue; + + if (intel_gt_pm_wait_for_idle(gt)) { + err = -EBUSY; + break; + } + + st_engine_heartbeat_disable(engine); + + ENGINE_TRACE(engine, "measuring idle time\n"); + preempt_disable(); + de = intel_engine_get_busy_time(engine, &t[0]); + udelay(100); + de = ktime_sub(intel_engine_get_busy_time(engine, &t[1]), de); + preempt_enable(); + dt = ktime_sub(t[1], t[0]); + if (de < 0 || de > 10) { + pr_err("%s: reported %lldns [%d%%] busyness while sleeping [for %lldns]\n", + engine->name, + de, (int)div64_u64(100 * de, dt), dt); + GEM_TRACE_DUMP(); + err = -EINVAL; + goto end; + } + + /* 100% busy */ + rq = igt_spinner_create_request(&spin, + engine->kernel_context, + MI_NOOP); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto end; + } + i915_request_add(rq); + + if (!igt_wait_for_spinner(&spin, rq)) { + intel_gt_set_wedged(engine->gt); + err = -ETIME; + goto end; + } + + ENGINE_TRACE(engine, "measuring busy time\n"); + preempt_disable(); + de = intel_engine_get_busy_time(engine, &t[0]); + udelay(100); + de = ktime_sub(intel_engine_get_busy_time(engine, &t[1]), de); + preempt_enable(); + dt = ktime_sub(t[1], t[0]); + if (100 * de < 95 * dt || 95 * de > 100 * dt) { + pr_err("%s: reported %lldns [%d%%] busyness while spinning [for %lldns]\n", + engine->name, + de, (int)div64_u64(100 * de, dt), dt); + GEM_TRACE_DUMP(); + err = -EINVAL; + goto end; + } + +end: + st_engine_heartbeat_enable(engine); + igt_spinner_end(&spin); + if (igt_flush_test(gt->i915)) + err = -EIO; + if (err) + break; + } + + igt_spinner_fini(&spin); + if (igt_flush_test(gt->i915)) + err = -EIO; + return err; +} static int live_engine_pm(void *arg) { @@ -77,6 +177,7 @@ static int live_engine_pm(void *arg) int live_engine_pm_selftests(struct intel_gt *gt) { static const struct i915_subtest tests[] = { + SUBTEST(live_engine_busy_stats), SUBTEST(live_engine_pm), }; diff --git a/drivers/gpu/drm/i915/gt/selftest_gt_pm.c b/drivers/gpu/drm/i915/gt/selftest_gt_pm.c index 242181a5214c..6180a47c1b51 100644 --- a/drivers/gpu/drm/i915/gt/selftest_gt_pm.c +++ b/drivers/gpu/drm/i915/gt/selftest_gt_pm.c @@ -5,10 +5,141 @@ * Copyright © 2019 Intel Corporation */ +#include <linux/sort.h> + +#include "intel_gt_clock_utils.h" + #include "selftest_llc.h" #include "selftest_rc6.h" #include "selftest_rps.h" +static int cmp_u64(const void *A, const void *B) +{ + const u64 *a = A, *b = B; + + if (a < b) + return -1; + else if (a > b) + return 1; + else + return 0; +} + +static int cmp_u32(const void *A, const void *B) +{ + const u32 *a = A, *b = B; + + if (a < b) + return -1; + else if (a > b) + return 1; + else + return 0; +} + +static void measure_clocks(struct intel_engine_cs *engine, + u32 *out_cycles, ktime_t *out_dt) +{ + ktime_t dt[5]; + u32 cycles[5]; + int i; + + for (i = 0; i < 5; i++) { + preempt_disable(); + cycles[i] = -ENGINE_READ_FW(engine, RING_TIMESTAMP); + dt[i] = ktime_get(); + + udelay(1000); + + dt[i] = ktime_sub(ktime_get(), dt[i]); + cycles[i] += ENGINE_READ_FW(engine, RING_TIMESTAMP); + preempt_enable(); + } + + /* Use the median of both cycle/dt; close enough */ + sort(cycles, 5, sizeof(*cycles), cmp_u32, NULL); + *out_cycles = (cycles[1] + 2 * cycles[2] + 
cycles[3]) / 4; + + sort(dt, 5, sizeof(*dt), cmp_u64, NULL); + *out_dt = div_u64(dt[1] + 2 * dt[2] + dt[3], 4); +} + +static int live_gt_clocks(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err = 0; + + if (!RUNTIME_INFO(gt->i915)->cs_timestamp_frequency_hz) { /* unknown */ + pr_info("CS_TIMESTAMP frequency unknown\n"); + return 0; + } + + if (INTEL_GEN(gt->i915) < 4) /* Any CS_TIMESTAMP? */ + return 0; + + if (IS_GEN(gt->i915, 5)) + /* + * XXX CS_TIMESTAMP low dword is dysfunctional? + * + * Ville's experiments indicate the high dword still works, + * but at a correspondingly reduced frequency. + */ + return 0; + + if (IS_GEN(gt->i915, 4)) + /* + * XXX CS_TIMESTAMP appears gibberish + * + * Ville's experiments indicate that it mostly appears 'stuck' + * in that we see the register report the same cycle count + * for a couple of reads. + */ + return 0; + + intel_gt_pm_get(gt); + intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL); + + for_each_engine(engine, gt, id) { + u32 cycles; + u32 expected; + u64 time; + u64 dt; + + if (INTEL_GEN(engine->i915) < 7 && engine->id != RCS0) + continue; + + measure_clocks(engine, &cycles, &dt); + + time = i915_cs_timestamp_ticks_to_ns(engine->i915, cycles); + expected = i915_cs_timestamp_ns_to_ticks(engine->i915, dt); + + pr_info("%s: TIMESTAMP %d cycles [%lldns] in %lldns [%d cycles], using CS clock frequency of %uKHz\n", + engine->name, cycles, time, dt, expected, + RUNTIME_INFO(engine->i915)->cs_timestamp_frequency_hz / 1000); + + if (9 * time < 8 * dt || 8 * time > 9 * dt) { + pr_err("%s: CS ticks did not match walltime!\n", + engine->name); + err = -EINVAL; + break; + } + + if (9 * expected < 8 * cycles || 8 * expected > 9 * cycles) { + pr_err("%s: walltime did not match CS ticks!\n", + engine->name); + err = -EINVAL; + break; + } + } + + intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL); + intel_gt_pm_put(gt); + + return err; +} + static int live_gt_resume(void *arg) { struct intel_gt *gt = arg; @@ -52,6 +183,7 @@ static int live_gt_resume(void *arg) int intel_gt_pm_live_selftests(struct drm_i915_private *i915) { static const struct i915_subtest tests[] = { + SUBTEST(live_gt_clocks), SUBTEST(live_rc6_manual), SUBTEST(live_rps_clock_interval), SUBTEST(live_rps_control), diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c index 4aa4cc917d8b..fb5ebf930ab2 100644 --- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c +++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c @@ -29,6 +29,7 @@ #include "intel_gt.h" #include "intel_engine_heartbeat.h" #include "intel_engine_pm.h" +#include "selftest_engine_heartbeat.h" #include "i915_selftest.h" #include "selftests/i915_random.h" @@ -203,12 +204,12 @@ hang_create_request(struct hang *h, struct intel_engine_cs *engine) *batch++ = lower_32_bits(hws_address(hws, rq)); *batch++ = upper_32_bits(hws_address(hws, rq)); *batch++ = rq->fence.seqno; - *batch++ = MI_ARB_CHECK; + *batch++ = MI_NOOP; memset(batch, 0, 1024); batch += 1024 / sizeof(*batch); - *batch++ = MI_ARB_CHECK; + *batch++ = MI_NOOP; *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1; *batch++ = lower_32_bits(vma->node.start); *batch++ = upper_32_bits(vma->node.start); @@ -217,12 +218,12 @@ hang_create_request(struct hang *h, struct intel_engine_cs *engine) *batch++ = 0; *batch++ = lower_32_bits(hws_address(hws, rq)); *batch++ = rq->fence.seqno; - *batch++ = MI_ARB_CHECK; + *batch++ = MI_NOOP; memset(batch, 0, 1024); batch += 1024 / 
sizeof(*batch); - *batch++ = MI_ARB_CHECK; + *batch++ = MI_NOOP; *batch++ = MI_BATCH_BUFFER_START | 1 << 8; *batch++ = lower_32_bits(vma->node.start); } else if (INTEL_GEN(gt->i915) >= 4) { @@ -230,24 +231,24 @@ hang_create_request(struct hang *h, struct intel_engine_cs *engine) *batch++ = 0; *batch++ = lower_32_bits(hws_address(hws, rq)); *batch++ = rq->fence.seqno; - *batch++ = MI_ARB_CHECK; + *batch++ = MI_NOOP; memset(batch, 0, 1024); batch += 1024 / sizeof(*batch); - *batch++ = MI_ARB_CHECK; + *batch++ = MI_NOOP; *batch++ = MI_BATCH_BUFFER_START | 2 << 6; *batch++ = lower_32_bits(vma->node.start); } else { *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL; *batch++ = lower_32_bits(hws_address(hws, rq)); *batch++ = rq->fence.seqno; - *batch++ = MI_ARB_CHECK; + *batch++ = MI_NOOP; memset(batch, 0, 1024); batch += 1024 / sizeof(*batch); - *batch++ = MI_ARB_CHECK; + *batch++ = MI_NOOP; *batch++ = MI_BATCH_BUFFER_START | 2 << 6; *batch++ = lower_32_bits(vma->node.start); } @@ -310,22 +311,6 @@ static bool wait_until_running(struct hang *h, struct i915_request *rq) 1000)); } -static void engine_heartbeat_disable(struct intel_engine_cs *engine) -{ - engine->props.heartbeat_interval_ms = 0; - - intel_engine_pm_get(engine); - intel_engine_park_heartbeat(engine); -} - -static void engine_heartbeat_enable(struct intel_engine_cs *engine) -{ - intel_engine_pm_put(engine); - - engine->props.heartbeat_interval_ms = - engine->defaults.heartbeat_interval_ms; -} - static int igt_hang_sanitycheck(void *arg) { struct intel_gt *gt = arg; @@ -482,7 +467,7 @@ static int igt_reset_nop_engine(void *arg) reset_engine_count = i915_reset_engine_count(global, engine); count = 0; - engine_heartbeat_disable(engine); + st_engine_heartbeat_disable(engine); set_bit(I915_RESET_ENGINE + id, >->reset.flags); do { int i; @@ -499,6 +484,20 @@ static int igt_reset_nop_engine(void *arg) rq = intel_context_create_request(ce); if (IS_ERR(rq)) { + struct drm_printer p = + drm_info_printer(gt->i915->drm.dev); + intel_engine_dump(engine, &p, + "%s(%s): failed to submit request\n", + __func__, + engine->name); + + GEM_TRACE("%s(%s): failed to submit request\n", + __func__, + engine->name); + GEM_TRACE_DUMP(); + + intel_gt_set_wedged(gt); + err = PTR_ERR(rq); break; } @@ -526,7 +525,7 @@ static int igt_reset_nop_engine(void *arg) } } while (time_before(jiffies, end_time)); clear_bit(I915_RESET_ENGINE + id, >->reset.flags); - engine_heartbeat_enable(engine); + st_engine_heartbeat_enable(engine); pr_info("%s(%s): %d resets\n", __func__, engine->name, count); @@ -576,7 +575,7 @@ static int __igt_reset_engine(struct intel_gt *gt, bool active) reset_count = i915_reset_count(global); reset_engine_count = i915_reset_engine_count(global, engine); - engine_heartbeat_disable(engine); + st_engine_heartbeat_disable(engine); set_bit(I915_RESET_ENGINE + id, >->reset.flags); do { if (active) { @@ -628,7 +627,7 @@ static int __igt_reset_engine(struct intel_gt *gt, bool active) } } while (time_before(jiffies, end_time)); clear_bit(I915_RESET_ENGINE + id, >->reset.flags); - engine_heartbeat_enable(engine); + st_engine_heartbeat_enable(engine); if (err) break; @@ -805,10 +804,10 @@ static int __igt_reset_engines(struct intel_gt *gt, threads[tmp].resets = i915_reset_engine_count(global, other); - if (!(flags & TEST_OTHERS)) + if (other == engine && !(flags & TEST_SELF)) continue; - if (other == engine && !(flags & TEST_SELF)) + if (other != engine && !(flags & TEST_OTHERS)) continue; threads[tmp].engine = other; @@ -827,7 +826,7 @@ static int 
__igt_reset_engines(struct intel_gt *gt, yield(); /* start all threads before we begin */ - engine_heartbeat_disable(engine); + st_engine_heartbeat_disable(engine); set_bit(I915_RESET_ENGINE + id, >->reset.flags); do { struct i915_request *rq = NULL; @@ -866,13 +865,29 @@ static int __igt_reset_engines(struct intel_gt *gt, count++; if (rq) { + if (rq->fence.error != -EIO) { + pr_err("i915_reset_engine(%s:%s):" + " failed to reset request %llx:%lld\n", + engine->name, test_name, + rq->fence.context, + rq->fence.seqno); + i915_request_put(rq); + + GEM_TRACE_DUMP(); + intel_gt_set_wedged(gt); + err = -EIO; + break; + } + if (i915_request_wait(rq, 0, HZ / 5) < 0) { struct drm_printer p = drm_info_printer(gt->i915->drm.dev); pr_err("i915_reset_engine(%s:%s):" - " failed to complete request after reset\n", - engine->name, test_name); + " failed to complete request %llx:%lld after reset\n", + engine->name, test_name, + rq->fence.context, + rq->fence.seqno); intel_engine_dump(engine, &p, "%s\n", engine->name); i915_request_put(rq); @@ -901,7 +916,7 @@ static int __igt_reset_engines(struct intel_gt *gt, } } while (time_before(jiffies, end_time)); clear_bit(I915_RESET_ENGINE + id, >->reset.flags); - engine_heartbeat_enable(engine); + st_engine_heartbeat_enable(engine); pr_info("i915_reset_engine(%s:%s): %lu resets\n", engine->name, test_name, count); @@ -983,7 +998,7 @@ static int igt_reset_engines(void *arg) }, { "self-priority", - TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF, + TEST_ACTIVE | TEST_PRIORITY | TEST_SELF, }, { } }; diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c index 924bc01ef526..daa4aabab9a7 100644 --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c @@ -9,6 +9,7 @@ #include "gem/i915_gem_pm.h" #include "gt/intel_engine_heartbeat.h" #include "gt/intel_reset.h" +#include "gt/selftest_engine_heartbeat.h" #include "i915_selftest.h" #include "selftests/i915_random.h" @@ -51,22 +52,6 @@ static struct i915_vma *create_scratch(struct intel_gt *gt) return vma; } -static void engine_heartbeat_disable(struct intel_engine_cs *engine) -{ - engine->props.heartbeat_interval_ms = 0; - - intel_engine_pm_get(engine); - intel_engine_park_heartbeat(engine); -} - -static void engine_heartbeat_enable(struct intel_engine_cs *engine) -{ - intel_engine_pm_put(engine); - - engine->props.heartbeat_interval_ms = - engine->defaults.heartbeat_interval_ms; -} - static bool is_active(struct i915_request *rq) { if (i915_request_is_active(rq)) @@ -75,7 +60,7 @@ static bool is_active(struct i915_request *rq) if (i915_request_on_hold(rq)) return true; - if (i915_request_started(rq)) + if (i915_request_has_initial_breadcrumb(rq) && i915_request_started(rq)) return true; return false; @@ -234,7 +219,7 @@ static int live_unlite_restore(struct intel_gt *gt, int prio) err = -EIO; break; } - engine_heartbeat_disable(engine); + st_engine_heartbeat_disable(engine); for (n = 0; n < ARRAY_SIZE(ce); n++) { struct intel_context *tmp; @@ -332,7 +317,7 @@ static int live_unlite_restore(struct intel_gt *gt, int prio) i915_request_put(rq[0]); err_ce: - tasklet_kill(&engine->execlists.tasklet); /* flush submission */ + intel_engine_flush_submission(engine); igt_spinner_end(&spin); for (n = 0; n < ARRAY_SIZE(ce); n++) { if (IS_ERR_OR_NULL(ce[n])) @@ -342,7 +327,7 @@ err_ce: intel_context_put(ce[n]); } - engine_heartbeat_enable(engine); + st_engine_heartbeat_enable(engine); if (igt_live_test_end(&t)) err = -EIO; if (err) @@ -363,6 +348,156 @@ 
static int live_unlite_preempt(void *arg) return live_unlite_restore(arg, I915_USER_PRIORITY(I915_PRIORITY_MAX)); } +static int live_unlite_ring(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + struct igt_spinner spin; + enum intel_engine_id id; + int err = 0; + + /* + * Setup a preemption event that will cause almost the entire ring + * to be unwound, potentially fooling our intel_ring_direction() + * into emitting a forward lite-restore instead of the rollback. + */ + + if (igt_spinner_init(&spin, gt)) + return -ENOMEM; + + for_each_engine(engine, gt, id) { + struct intel_context *ce[2] = {}; + struct i915_request *rq; + struct igt_live_test t; + int n; + + if (!intel_engine_has_preemption(engine)) + continue; + + if (!intel_engine_can_store_dword(engine)) + continue; + + if (igt_live_test_begin(&t, gt->i915, __func__, engine->name)) { + err = -EIO; + break; + } + st_engine_heartbeat_disable(engine); + + for (n = 0; n < ARRAY_SIZE(ce); n++) { + struct intel_context *tmp; + + tmp = intel_context_create(engine); + if (IS_ERR(tmp)) { + err = PTR_ERR(tmp); + goto err_ce; + } + + err = intel_context_pin(tmp); + if (err) { + intel_context_put(tmp); + goto err_ce; + } + + memset32(tmp->ring->vaddr, + 0xdeadbeef, /* trigger a hang if executed */ + tmp->ring->vma->size / sizeof(u32)); + + ce[n] = tmp; + } + + /* Create max prio spinner, followed by N low prio nops */ + rq = igt_spinner_create_request(&spin, ce[0], MI_ARB_CHECK); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_ce; + } + + i915_request_get(rq); + rq->sched.attr.priority = I915_PRIORITY_BARRIER; + i915_request_add(rq); + + if (!igt_wait_for_spinner(&spin, rq)) { + intel_gt_set_wedged(gt); + i915_request_put(rq); + err = -ETIME; + goto err_ce; + } + + /* Fill the ring, until we will cause a wrap */ + n = 0; + while (intel_ring_direction(ce[0]->ring, + rq->wa_tail, + ce[0]->ring->tail) <= 0) { + struct i915_request *tmp; + + tmp = intel_context_create_request(ce[0]); + if (IS_ERR(tmp)) { + err = PTR_ERR(tmp); + i915_request_put(rq); + goto err_ce; + } + + i915_request_add(tmp); + intel_engine_flush_submission(engine); + n++; + } + intel_engine_flush_submission(engine); + pr_debug("%s: Filled ring with %d nop tails {size:%x, tail:%x, emit:%x, rq.tail:%x}\n", + engine->name, n, + ce[0]->ring->size, + ce[0]->ring->tail, + ce[0]->ring->emit, + rq->tail); + GEM_BUG_ON(intel_ring_direction(ce[0]->ring, + rq->tail, + ce[0]->ring->tail) <= 0); + i915_request_put(rq); + + /* Create a second ring to preempt the first ring after rq[0] */ + rq = intel_context_create_request(ce[1]); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_ce; + } + + rq->sched.attr.priority = I915_PRIORITY_BARRIER; + i915_request_get(rq); + i915_request_add(rq); + + err = wait_for_submit(engine, rq, HZ / 2); + i915_request_put(rq); + if (err) { + pr_err("%s: preemption request was not submitted\n", + engine->name); + err = -ETIME; + } + + pr_debug("%s: ring[0]:{ tail:%x, emit:%x }, ring[1]:{ tail:%x, emit:%x }\n", + engine->name, + ce[0]->ring->tail, ce[0]->ring->emit, + ce[1]->ring->tail, ce[1]->ring->emit); + +err_ce: + intel_engine_flush_submission(engine); + igt_spinner_end(&spin); + for (n = 0; n < ARRAY_SIZE(ce); n++) { + if (IS_ERR_OR_NULL(ce[n])) + break; + + intel_context_unpin(ce[n]); + intel_context_put(ce[n]); + } + st_engine_heartbeat_enable(engine); + if (igt_live_test_end(&t)) + err = -EIO; + if (err) + break; + } + + igt_spinner_fini(&spin); + return err; +} + static int live_pin_rewind(void *arg) { struct intel_gt *gt 
= arg; @@ -471,7 +606,7 @@ static int live_hold_reset(void *arg) break; } - engine_heartbeat_disable(engine); + st_engine_heartbeat_disable(engine); rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK); if (IS_ERR(rq)) { @@ -531,7 +666,7 @@ static int live_hold_reset(void *arg) i915_request_put(rq); out: - engine_heartbeat_enable(engine); + st_engine_heartbeat_enable(engine); intel_context_put(ce); if (err) break; @@ -578,7 +713,7 @@ static int live_error_interrupt(void *arg) const struct error_phase *p; int err = 0; - engine_heartbeat_disable(engine); + st_engine_heartbeat_disable(engine); for (p = phases; p->error[0] != GOOD; p++) { struct i915_request *client[ARRAY_SIZE(phases->error)]; @@ -677,7 +812,7 @@ out: } } - engine_heartbeat_enable(engine); + st_engine_heartbeat_enable(engine); if (err) { intel_gt_set_wedged(gt); return err; @@ -845,10 +980,11 @@ static int live_timeslice_preempt(void *arg) { struct intel_gt *gt = arg; struct drm_i915_gem_object *obj; + struct intel_engine_cs *engine; + enum intel_engine_id id; struct i915_vma *vma; void *vaddr; int err = 0; - int count; /* * If a request takes too long, we would like to give other users @@ -885,26 +1021,21 @@ static int live_timeslice_preempt(void *arg) if (err) goto err_pin; - for_each_prime_number_from(count, 1, 16) { - struct intel_engine_cs *engine; - enum intel_engine_id id; - - for_each_engine(engine, gt, id) { - if (!intel_engine_has_preemption(engine)) - continue; + for_each_engine(engine, gt, id) { + if (!intel_engine_has_preemption(engine)) + continue; - memset(vaddr, 0, PAGE_SIZE); + memset(vaddr, 0, PAGE_SIZE); - engine_heartbeat_disable(engine); - err = slice_semaphore_queue(engine, vma, count); - engine_heartbeat_enable(engine); - if (err) - goto err_pin; + st_engine_heartbeat_disable(engine); + err = slice_semaphore_queue(engine, vma, 5); + st_engine_heartbeat_enable(engine); + if (err) + goto err_pin; - if (igt_flush_test(gt->i915)) { - err = -EIO; - goto err_pin; - } + if (igt_flush_test(gt->i915)) { + err = -EIO; + goto err_pin; } } @@ -1020,7 +1151,7 @@ static int live_timeslice_rewind(void *arg) * Expect execution/evaluation order XZY */ - engine_heartbeat_disable(engine); + st_engine_heartbeat_disable(engine); timeslice = xchg(&engine->props.timeslice_duration_ms, 1); slot = memset32(engine->status_page.addr + 1000, 0, 4); @@ -1031,18 +1162,18 @@ static int live_timeslice_rewind(void *arg) goto err; } - rq[0] = create_rewinder(ce, NULL, slot, X); - if (IS_ERR(rq[0])) { + rq[A1] = create_rewinder(ce, NULL, slot, X); + if (IS_ERR(rq[A1])) { intel_context_put(ce); goto err; } - rq[1] = create_rewinder(ce, NULL, slot, Y); + rq[A2] = create_rewinder(ce, NULL, slot, Y); intel_context_put(ce); - if (IS_ERR(rq[1])) + if (IS_ERR(rq[A2])) goto err; - err = wait_for_submit(engine, rq[1], HZ / 2); + err = wait_for_submit(engine, rq[A2], HZ / 2); if (err) { pr_err("%s: failed to submit first context\n", engine->name); @@ -1055,12 +1186,12 @@ static int live_timeslice_rewind(void *arg) goto err; } - rq[2] = create_rewinder(ce, rq[0], slot, Z); + rq[B1] = create_rewinder(ce, rq[A1], slot, Z); intel_context_put(ce); if (IS_ERR(rq[2])) goto err; - err = wait_for_submit(engine, rq[2], HZ / 2); + err = wait_for_submit(engine, rq[B1], HZ / 2); if (err) { pr_err("%s: failed to submit second context\n", engine->name); @@ -1068,6 +1199,7 @@ static int live_timeslice_rewind(void *arg) } /* ELSP[] = { { A:rq1, A:rq2 }, { B:rq1 } } */ + ENGINE_TRACE(engine, "forcing tasklet for rewind\n"); if (i915_request_is_active(rq[A2])) { 
/* semaphore yielded! */ /* Wait for the timeslice to kick in */ del_timer(&engine->execlists.timer); @@ -1114,7 +1246,7 @@ err: wmb(); engine->props.timeslice_duration_ms = timeslice; - engine_heartbeat_enable(engine); + st_engine_heartbeat_enable(engine); for (i = 0; i < 3; i++) i915_request_put(rq[i]); if (igt_flush_test(gt->i915)) @@ -1140,9 +1272,17 @@ static struct i915_request *nop_request(struct intel_engine_cs *engine) return rq; } -static long timeslice_threshold(const struct intel_engine_cs *engine) +static long slice_timeout(struct intel_engine_cs *engine) { - return 2 * msecs_to_jiffies_timeout(timeslice(engine)) + 1; + long timeout; + + /* Enough time for a timeslice to kick in, and kick out */ + timeout = 2 * msecs_to_jiffies_timeout(timeslice(engine)); + + /* Enough time for the nop request to complete */ + timeout += HZ / 5; + + return timeout + 1; } static int live_timeslice_queue(void *arg) @@ -1198,7 +1338,7 @@ static int live_timeslice_queue(void *arg) if (!intel_engine_has_preemption(engine)) continue; - engine_heartbeat_disable(engine); + st_engine_heartbeat_disable(engine); memset(vaddr, 0, PAGE_SIZE); /* ELSP[0]: semaphore wait */ @@ -1243,24 +1383,8 @@ static int live_timeslice_queue(void *arg) intel_engine_flush_submission(engine); } while (READ_ONCE(engine->execlists.pending[0])); - if (!READ_ONCE(engine->execlists.timer.expires) && - execlists_active(&engine->execlists) == rq && - !i915_request_completed(rq)) { - struct drm_printer p = - drm_info_printer(gt->i915->drm.dev); - - GEM_TRACE_ERR("%s: Failed to enable timeslicing!\n", - engine->name); - intel_engine_dump(engine, &p, - "%s\n", engine->name); - GEM_TRACE_DUMP(); - - memset(vaddr, 0xff, PAGE_SIZE); - err = -EINVAL; - } - /* Timeslice every jiffy, so within 2 we should signal */ - if (i915_request_wait(rq, 0, timeslice_threshold(engine)) < 0) { + if (i915_request_wait(rq, 0, slice_timeout(engine)) < 0) { struct drm_printer p = drm_info_printer(gt->i915->drm.dev); @@ -1275,7 +1399,7 @@ static int live_timeslice_queue(void *arg) err_rq: i915_request_put(rq); err_heartbeat: - engine_heartbeat_enable(engine); + st_engine_heartbeat_enable(engine); if (err) break; } @@ -1321,7 +1445,7 @@ static int live_timeslice_nopreempt(void *arg) break; } - engine_heartbeat_disable(engine); + st_engine_heartbeat_disable(engine); timeslice = xchg(&engine->props.timeslice_duration_ms, 1); /* Create an unpreemptible spinner */ @@ -1349,7 +1473,7 @@ static int live_timeslice_nopreempt(void *arg) ce = intel_context_create(engine); if (IS_ERR(ce)) { - err = PTR_ERR(rq); + err = PTR_ERR(ce); goto out_spin; } @@ -1379,7 +1503,7 @@ static int live_timeslice_nopreempt(void *arg) * allow the maximum priority barrier through. Wait long * enough to see if it is timesliced in by mistake. 
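Every selftest converted here brackets its engine work with the new st_engine_heartbeat_disable()/st_engine_heartbeat_enable() pair, whose definitions (added in selftest_engine_heartbeat.c above) zero the heartbeat interval, take an engine-pm reference and park the heartbeat, then restore the engine's default interval on the way out. The recurring shape, expressed as a hypothetical wrapper (not part of the patch):

static int with_heartbeat_parked(struct intel_engine_cs *engine,
				 int (*body)(struct intel_engine_cs *))
{
	int err;

	st_engine_heartbeat_disable(engine);	/* holds engine-pm, parks pulses */
	err = body(engine);
	st_engine_heartbeat_enable(engine);	/* restores defaults.heartbeat_interval_ms */

	return err;
}

Parking the heartbeat matters for these tests because a background pulse is effectively a maximum-priority request and would perturb the preemption and timeslicing behaviour being measured.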
*/ - if (i915_request_wait(rq, 0, timeslice_threshold(engine)) >= 0) { + if (i915_request_wait(rq, 0, slice_timeout(engine)) >= 0) { pr_err("%s: I915_PRIORITY_BARRIER request completed, bypassing no-preempt request\n", engine->name); err = -EINVAL; @@ -1390,7 +1514,7 @@ out_spin: igt_spinner_end(&spin); out_heartbeat: xchg(&engine->props.timeslice_duration_ms, timeslice); - engine_heartbeat_enable(engine); + st_engine_heartbeat_enable(engine); if (err) break; @@ -2294,7 +2418,7 @@ static int live_suppress_self_preempt(void *arg) if (igt_flush_test(gt->i915)) goto err_wedged; - intel_engine_pm_get(engine); + st_engine_heartbeat_disable(engine); engine->execlists.preempt_hang.count = 0; rq_a = spinner_create_request(&a.spin, @@ -2302,14 +2426,14 @@ static int live_suppress_self_preempt(void *arg) MI_NOOP); if (IS_ERR(rq_a)) { err = PTR_ERR(rq_a); - intel_engine_pm_put(engine); + st_engine_heartbeat_enable(engine); goto err_client_b; } i915_request_add(rq_a); if (!igt_wait_for_spinner(&a.spin, rq_a)) { pr_err("First client failed to start\n"); - intel_engine_pm_put(engine); + st_engine_heartbeat_enable(engine); goto err_wedged; } @@ -2321,7 +2445,7 @@ static int live_suppress_self_preempt(void *arg) MI_NOOP); if (IS_ERR(rq_b)) { err = PTR_ERR(rq_b); - intel_engine_pm_put(engine); + st_engine_heartbeat_enable(engine); goto err_client_b; } i915_request_add(rq_b); @@ -2332,7 +2456,7 @@ static int live_suppress_self_preempt(void *arg) if (!igt_wait_for_spinner(&b.spin, rq_b)) { pr_err("Second client failed to start\n"); - intel_engine_pm_put(engine); + st_engine_heartbeat_enable(engine); goto err_wedged; } @@ -2346,12 +2470,12 @@ static int live_suppress_self_preempt(void *arg) engine->name, engine->execlists.preempt_hang.count, depth); - intel_engine_pm_put(engine); + st_engine_heartbeat_enable(engine); err = -EINVAL; goto err_client_b; } - intel_engine_pm_put(engine); + st_engine_heartbeat_enable(engine); if (igt_flush_test(gt->i915)) goto err_wedged; } @@ -2371,183 +2495,6 @@ err_wedged: goto err_client_b; } -static int __i915_sw_fence_call -dummy_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state) -{ - return NOTIFY_DONE; -} - -static struct i915_request *dummy_request(struct intel_engine_cs *engine) -{ - struct i915_request *rq; - - rq = kzalloc(sizeof(*rq), GFP_KERNEL); - if (!rq) - return NULL; - - rq->engine = engine; - - spin_lock_init(&rq->lock); - INIT_LIST_HEAD(&rq->fence.cb_list); - rq->fence.lock = &rq->lock; - rq->fence.ops = &i915_fence_ops; - - i915_sched_node_init(&rq->sched); - - /* mark this request as permanently incomplete */ - rq->fence.seqno = 1; - BUILD_BUG_ON(sizeof(rq->fence.seqno) != 8); /* upper 32b == 0 */ - rq->hwsp_seqno = (u32 *)&rq->fence.seqno + 1; - GEM_BUG_ON(i915_request_completed(rq)); - - i915_sw_fence_init(&rq->submit, dummy_notify); - set_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags); - - spin_lock_init(&rq->lock); - rq->fence.lock = &rq->lock; - INIT_LIST_HEAD(&rq->fence.cb_list); - - return rq; -} - -static void dummy_request_free(struct i915_request *dummy) -{ - /* We have to fake the CS interrupt to kick the next request */ - i915_sw_fence_commit(&dummy->submit); - - i915_request_mark_complete(dummy); - dma_fence_signal(&dummy->fence); - - i915_sched_node_fini(&dummy->sched); - i915_sw_fence_fini(&dummy->submit); - - dma_fence_free(&dummy->fence); -} - -static int live_suppress_wait_preempt(void *arg) -{ - struct intel_gt *gt = arg; - struct preempt_client client[4]; - struct i915_request *rq[ARRAY_SIZE(client)] = {}; - struct 
intel_engine_cs *engine; - enum intel_engine_id id; - int err = -ENOMEM; - int i; - - /* - * Waiters are given a little priority nudge, but not enough - * to actually cause any preemption. Double check that we do - * not needlessly generate preempt-to-idle cycles. - */ - - if (!HAS_LOGICAL_RING_PREEMPTION(gt->i915)) - return 0; - - if (preempt_client_init(gt, &client[0])) /* ELSP[0] */ - return -ENOMEM; - if (preempt_client_init(gt, &client[1])) /* ELSP[1] */ - goto err_client_0; - if (preempt_client_init(gt, &client[2])) /* head of queue */ - goto err_client_1; - if (preempt_client_init(gt, &client[3])) /* bystander */ - goto err_client_2; - - for_each_engine(engine, gt, id) { - int depth; - - if (!intel_engine_has_preemption(engine)) - continue; - - if (!engine->emit_init_breadcrumb) - continue; - - for (depth = 0; depth < ARRAY_SIZE(client); depth++) { - struct i915_request *dummy; - - engine->execlists.preempt_hang.count = 0; - - dummy = dummy_request(engine); - if (!dummy) - goto err_client_3; - - for (i = 0; i < ARRAY_SIZE(client); i++) { - struct i915_request *this; - - this = spinner_create_request(&client[i].spin, - client[i].ctx, engine, - MI_NOOP); - if (IS_ERR(this)) { - err = PTR_ERR(this); - goto err_wedged; - } - - /* Disable NEWCLIENT promotion */ - __i915_active_fence_set(&i915_request_timeline(this)->last_request, - &dummy->fence); - - rq[i] = i915_request_get(this); - i915_request_add(this); - } - - dummy_request_free(dummy); - - GEM_BUG_ON(i915_request_completed(rq[0])); - if (!igt_wait_for_spinner(&client[0].spin, rq[0])) { - pr_err("%s: First client failed to start\n", - engine->name); - goto err_wedged; - } - GEM_BUG_ON(!i915_request_started(rq[0])); - - if (i915_request_wait(rq[depth], - I915_WAIT_PRIORITY, - 1) != -ETIME) { - pr_err("%s: Waiter depth:%d completed!\n", - engine->name, depth); - goto err_wedged; - } - - for (i = 0; i < ARRAY_SIZE(client); i++) { - igt_spinner_end(&client[i].spin); - i915_request_put(rq[i]); - rq[i] = NULL; - } - - if (igt_flush_test(gt->i915)) - goto err_wedged; - - if (engine->execlists.preempt_hang.count) { - pr_err("%s: Preemption recorded x%d, depth %d; should have been suppressed!\n", - engine->name, - engine->execlists.preempt_hang.count, - depth); - err = -EINVAL; - goto err_client_3; - } - } - } - - err = 0; -err_client_3: - preempt_client_fini(&client[3]); -err_client_2: - preempt_client_fini(&client[2]); -err_client_1: - preempt_client_fini(&client[1]); -err_client_0: - preempt_client_fini(&client[0]); - return err; - -err_wedged: - for (i = 0; i < ARRAY_SIZE(client); i++) { - igt_spinner_end(&client[i].spin); - i915_request_put(rq[i]); - } - intel_gt_set_wedged(gt); - err = -EIO; - goto err_client_3; -} - static int live_chain_preempt(void *arg) { struct intel_gt *gt = arg; @@ -2796,6 +2743,168 @@ err_ce: return err; } +static int __live_preempt_ring(struct intel_engine_cs *engine, + struct igt_spinner *spin, + int queue_sz, int ring_sz) +{ + struct intel_context *ce[2] = {}; + struct i915_request *rq; + struct igt_live_test t; + int err = 0; + int n; + + if (igt_live_test_begin(&t, engine->i915, __func__, engine->name)) + return -EIO; + + for (n = 0; n < ARRAY_SIZE(ce); n++) { + struct intel_context *tmp; + + tmp = intel_context_create(engine); + if (IS_ERR(tmp)) { + err = PTR_ERR(tmp); + goto err_ce; + } + + tmp->ring = __intel_context_ring_size(ring_sz); + + err = intel_context_pin(tmp); + if (err) { + intel_context_put(tmp); + goto err_ce; + } + + memset32(tmp->ring->vaddr, + 0xdeadbeef, /* trigger a hang if executed 
*/ + tmp->ring->vma->size / sizeof(u32)); + + ce[n] = tmp; + } + + rq = igt_spinner_create_request(spin, ce[0], MI_ARB_CHECK); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_ce; + } + + i915_request_get(rq); + rq->sched.attr.priority = I915_PRIORITY_BARRIER; + i915_request_add(rq); + + if (!igt_wait_for_spinner(spin, rq)) { + intel_gt_set_wedged(engine->gt); + i915_request_put(rq); + err = -ETIME; + goto err_ce; + } + + /* Fill the ring until we cause a wrap */ + n = 0; + while (ce[0]->ring->tail - rq->wa_tail <= queue_sz) { + struct i915_request *tmp; + + tmp = intel_context_create_request(ce[0]); + if (IS_ERR(tmp)) { + err = PTR_ERR(tmp); + i915_request_put(rq); + goto err_ce; + } + + i915_request_add(tmp); + intel_engine_flush_submission(engine); + n++; + } + intel_engine_flush_submission(engine); + pr_debug("%s: Filled %d with %d nop tails {size:%x, tail:%x, emit:%x, rq.tail:%x}\n", + engine->name, queue_sz, n, + ce[0]->ring->size, + ce[0]->ring->tail, + ce[0]->ring->emit, + rq->tail); + i915_request_put(rq); + + /* Create a second request to preempt the first ring */ + rq = intel_context_create_request(ce[1]); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_ce; + } + + rq->sched.attr.priority = I915_PRIORITY_BARRIER; + i915_request_get(rq); + i915_request_add(rq); + + err = wait_for_submit(engine, rq, HZ / 2); + i915_request_put(rq); + if (err) { + pr_err("%s: preemption request was not submitted\n", + engine->name); + err = -ETIME; + } + + pr_debug("%s: ring[0]:{ tail:%x, emit:%x }, ring[1]:{ tail:%x, emit:%x }\n", + engine->name, + ce[0]->ring->tail, ce[0]->ring->emit, + ce[1]->ring->tail, ce[1]->ring->emit); + +err_ce: + intel_engine_flush_submission(engine); + igt_spinner_end(spin); + for (n = 0; n < ARRAY_SIZE(ce); n++) { + if (IS_ERR_OR_NULL(ce[n])) + break; + + intel_context_unpin(ce[n]); + intel_context_put(ce[n]); + } + if (igt_live_test_end(&t)) + err = -EIO; + return err; +} + +static int live_preempt_ring(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + struct igt_spinner spin; + enum intel_engine_id id; + int err = 0; + + /* + * Check that we roll back large chunks of a ring in order to do a + * preemption event. Similar to live_unlite_ring, but looking at + * ring size rather than the impact of intel_ring_direction().
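The fill loop in __live_preempt_ring() above depends on ring offsets being free-running byte counters, so the unsigned subtraction is wrap-safe. A condensed model of that loop, where submit_nop() is a hypothetical stand-in for intel_context_create_request() plus i915_request_add():

	static void example_fill_past_spinner(struct intel_context *ce,
					      const struct i915_request *spinner,
					      unsigned int queue_sz)
	{
		/* Queue nops until the tail is more than queue_sz bytes past
		 * the spinner; the preempting request on the second context
		 * must then unwind (and later replay) that entire span. */
		while (ce->ring->tail - spinner->wa_tail <= queue_sz)
			submit_nop(ce); /* hypothetical helper */
	}
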
+ */ + + if (igt_spinner_init(&spin, gt)) + return -ENOMEM; + + for_each_engine(engine, gt, id) { + int n; + + if (!intel_engine_has_preemption(engine)) + continue; + + if (!intel_engine_can_store_dword(engine)) + continue; + + st_engine_heartbeat_disable(engine); + + for (n = 0; n <= 3; n++) { + err = __live_preempt_ring(engine, &spin, + n * SZ_4K / 4, SZ_4K); + if (err) + break; + } + + st_engine_heartbeat_enable(engine); + if (err) + break; + } + + igt_spinner_fini(&spin); + return err; +} + static int live_preempt_gang(void *arg) { struct intel_gt *gt = arg; @@ -2840,16 +2949,8 @@ static int live_preempt_gang(void *arg) /* Submit each spinner at increasing priority */ engine->schedule(rq, &attr); - - if (prio <= I915_PRIORITY_MAX) - continue; - - if (prio > (INT_MAX >> I915_USER_PRIORITY_SHIFT)) - break; - - if (__igt_timeout(end_time, NULL)) - break; - } while (1); + } while (prio <= I915_PRIORITY_MAX && + !__igt_timeout(end_time, NULL)); pr_debug("%s: Preempt chain of %d requests\n", engine->name, prio); @@ -3417,7 +3518,7 @@ static int smoke_crescendo_thread(void *arg) return err; count++; - } while (!__igt_timeout(end_time, NULL)); + } while (count < smoke->ncontext && !__igt_timeout(end_time, NULL)); smoke->count = count; return 0; @@ -3493,7 +3594,7 @@ static int smoke_random(struct preempt_smoke *smoke, unsigned int flags) count++; } - } while (!__igt_timeout(end_time, NULL)); + } while (count < smoke->ncontext && !__igt_timeout(end_time, NULL)); pr_info("Submitted %lu random:%x requests across %d engines and %d contexts\n", count, flags, @@ -3506,7 +3607,7 @@ static int live_preempt_smoke(void *arg) struct preempt_smoke smoke = { .gt = arg, .prng = I915_RND_STATE_INITIALIZER(i915_selftest.random_seed), - .ncontext = 1024, + .ncontext = 256, }; const unsigned int phase[] = { 0, BATCH }; struct igt_live_test t; @@ -3706,13 +3807,43 @@ out: return err; } +static unsigned int +__select_siblings(struct intel_gt *gt, + unsigned int class, + struct intel_engine_cs **siblings, + bool (*filter)(const struct intel_engine_cs *)) +{ + unsigned int n = 0; + unsigned int inst; + + for (inst = 0; inst <= MAX_ENGINE_INSTANCE; inst++) { + if (!gt->engine_class[class][inst]) + continue; + + if (filter && !filter(gt->engine_class[class][inst])) + continue; + + siblings[n++] = gt->engine_class[class][inst]; + } + + return n; +} + +static unsigned int +select_siblings(struct intel_gt *gt, + unsigned int class, + struct intel_engine_cs **siblings) +{ + return __select_siblings(gt, class, siblings, NULL); +} + static int live_virtual_engine(void *arg) { struct intel_gt *gt = arg; struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1]; struct intel_engine_cs *engine; enum intel_engine_id id; - unsigned int class, inst; + unsigned int class; int err; if (intel_uc_uses_guc_submission(&gt->uc)) @@ -3730,13 +3861,7 @@ static int live_virtual_engine(void *arg) for (class = 0; class <= MAX_ENGINE_CLASS; class++) { int nsibling, n; - nsibling = 0; - for (inst = 0; inst <= MAX_ENGINE_INSTANCE; inst++) { - if (!gt->engine_class[class][inst]) - continue; - - siblings[nsibling++] = gt->engine_class[class][inst]; - } + nsibling = select_siblings(gt, class, siblings); if (nsibling < 2) continue; @@ -3845,7 +3970,7 @@ static int live_virtual_mask(void *arg) { struct intel_gt *gt = arg; struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1]; - unsigned int class, inst; + unsigned int class; int err; if (intel_uc_uses_guc_submission(&gt->uc)) @@ -3854,17 +3979,178 @@ for
(class = 0; class <= MAX_ENGINE_CLASS; class++) { unsigned int nsibling; - nsibling = 0; - for (inst = 0; inst <= MAX_ENGINE_INSTANCE; inst++) { - if (!gt->engine_class[class][inst]) - break; + nsibling = select_siblings(gt, class, siblings); + if (nsibling < 2) + continue; + + err = mask_virtual_engine(gt, siblings, nsibling); + if (err) + return err; + } + + return 0; +} + +static int slicein_virtual_engine(struct intel_gt *gt, + struct intel_engine_cs **siblings, + unsigned int nsibling) +{ + const long timeout = slice_timeout(siblings[0]); + struct intel_context *ce; + struct i915_request *rq; + struct igt_spinner spin; + unsigned int n; + int err = 0; + + /* + * Virtual requests must take part in timeslicing on the target engines. + */ + + if (igt_spinner_init(&spin, gt)) + return -ENOMEM; + + for (n = 0; n < nsibling; n++) { + ce = intel_context_create(siblings[n]); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + goto out; + } + + rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK); + intel_context_put(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out; + } + + i915_request_add(rq); + } + + ce = intel_execlists_create_virtual(siblings, nsibling); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + goto out; + } + + rq = intel_context_create_request(ce); + intel_context_put(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out; + } + + i915_request_get(rq); + i915_request_add(rq); + if (i915_request_wait(rq, 0, timeout) < 0) { + GEM_TRACE_ERR("%s(%s) failed to slice in virtual request\n", + __func__, rq->engine->name); + GEM_TRACE_DUMP(); + intel_gt_set_wedged(gt); + err = -EIO; + } + i915_request_put(rq); + +out: + igt_spinner_end(&spin); + if (igt_flush_test(gt->i915)) + err = -EIO; + igt_spinner_fini(&spin); + return err; +} + +static int sliceout_virtual_engine(struct intel_gt *gt, + struct intel_engine_cs **siblings, + unsigned int nsibling) +{ + const long timeout = slice_timeout(siblings[0]); + struct intel_context *ce; + struct i915_request *rq; + struct igt_spinner spin; + unsigned int n; + int err = 0; + + /* + * Virtual requests must allow others a fair timeslice. 
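Both slicein_virtual_engine() and the sliceout variant below reduce to the same check: with spinners holding one side of the virtual/physical split, a request on the other side must still complete within slice_timeout(). A condensed restatement in the selftest's own vocabulary (the example_ name is hypothetical):

	static bool example_timesliced(struct i915_request *rq,
				       struct intel_engine_cs *engine)
	{
		/* slice-in: request on the virtual engine, spinners on the
		 * siblings; slice-out: virtual spinners, request per sibling */
		return i915_request_wait(rq, 0, slice_timeout(engine)) >= 0;
	}
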
+ */ + + if (igt_spinner_init(&spin, gt)) + return -ENOMEM; + + /* XXX We do not handle oversubscription and fairness with normal rq */ + for (n = 0; n < nsibling; n++) { + ce = intel_execlists_create_virtual(siblings, nsibling); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + goto out; + } + + rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK); + intel_context_put(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out; + } + + i915_request_add(rq); + } + + for (n = 0; !err && n < nsibling; n++) { + ce = intel_context_create(siblings[n]); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + goto out; + } + + rq = intel_context_create_request(ce); + intel_context_put(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out; + } - siblings[nsibling++] = gt->engine_class[class][inst]; + i915_request_get(rq); + i915_request_add(rq); + if (i915_request_wait(rq, 0, timeout) < 0) { + GEM_TRACE_ERR("%s(%s) failed to slice out virtual request\n", + __func__, siblings[n]->name); + GEM_TRACE_DUMP(); + intel_gt_set_wedged(gt); + err = -EIO; } + i915_request_put(rq); + } + +out: + igt_spinner_end(&spin); + if (igt_flush_test(gt->i915)) + err = -EIO; + igt_spinner_fini(&spin); + return err; +} + +static int live_virtual_slice(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1]; + unsigned int class; + int err; + + if (intel_uc_uses_guc_submission(&gt->uc)) + return 0; + + for (class = 0; class <= MAX_ENGINE_CLASS; class++) { + unsigned int nsibling; + + nsibling = __select_siblings(gt, class, siblings, + intel_engine_has_timeslices); if (nsibling < 2) continue; - err = mask_virtual_engine(gt, siblings, nsibling); + err = slicein_virtual_engine(gt, siblings, nsibling); + if (err) + return err; + + err = sliceout_virtual_engine(gt, siblings, nsibling); if (err) return err; } @@ -3982,7 +4268,7 @@ static int live_virtual_preserved(void *arg) { struct intel_gt *gt = arg; struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1]; - unsigned int class, inst; + unsigned int class; /* * Check that the context image retains non-privileged (user) registers @@ -4000,13 +4286,7 @@ static int live_virtual_preserved(void *arg) for (class = 0; class <= MAX_ENGINE_CLASS; class++) { int nsibling, err; - nsibling = 0; - for (inst = 0; inst <= MAX_ENGINE_INSTANCE; inst++) { - if (!gt->engine_class[class][inst]) - continue; - - siblings[nsibling++] = gt->engine_class[class][inst]; - } + nsibling = select_siblings(gt, class, siblings); if (nsibling < 2) continue; @@ -4217,7 +4497,7 @@ static int live_virtual_bond(void *arg) }; struct intel_gt *gt = arg; struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1]; - unsigned int class, inst; + unsigned int class; int err; if (intel_uc_uses_guc_submission(&gt->uc)) @@ -4227,14 +4507,7 @@ static int live_virtual_bond(void *arg) const struct phase *p; int nsibling; - nsibling = 0; - for (inst = 0; inst <= MAX_ENGINE_INSTANCE; inst++) { - if (!gt->engine_class[class][inst]) - break; - - GEM_BUG_ON(nsibling == ARRAY_SIZE(siblings)); - siblings[nsibling++] = gt->engine_class[class][inst]; - } + nsibling = select_siblings(gt, class, siblings); if (nsibling < 2) continue; @@ -4280,7 +4553,7 @@ static int reset_virtual_engine(struct intel_gt *gt, } for (n = 0; n < nsibling; n++) - engine_heartbeat_disable(siblings[n]); + st_engine_heartbeat_disable(siblings[n]); rq = igt_spinner_create_request(&spin, ve, MI_ARB_CHECK); if (IS_ERR(rq)) { @@ -4351,7 +4624,7 @@ out_rq: i915_request_put(rq); out_heartbeat: for (n = 0; n < nsibling; n++) -
engine_heartbeat_enable(siblings[n]); + st_engine_heartbeat_enable(siblings[n]); intel_context_put(ve); out_spin: @@ -4363,7 +4636,7 @@ static int live_virtual_reset(void *arg) { struct intel_gt *gt = arg; struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1]; - unsigned int class, inst; + unsigned int class; /* * Check that we handle a reset event within a virtual engine. @@ -4381,13 +4654,7 @@ static int live_virtual_reset(void *arg) for (class = 0; class <= MAX_ENGINE_CLASS; class++) { int nsibling, err; - nsibling = 0; - for (inst = 0; inst <= MAX_ENGINE_INSTANCE; inst++) { - if (!gt->engine_class[class][inst]) - continue; - - siblings[nsibling++] = gt->engine_class[class][inst]; - } + nsibling = select_siblings(gt, class, siblings); if (nsibling < 2) continue; @@ -4405,6 +4672,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915) SUBTEST(live_sanitycheck), SUBTEST(live_unlite_switch), SUBTEST(live_unlite_preempt), + SUBTEST(live_unlite_ring), SUBTEST(live_pin_rewind), SUBTEST(live_hold_reset), SUBTEST(live_error_interrupt), @@ -4418,8 +4686,8 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915) SUBTEST(live_nopreempt), SUBTEST(live_preempt_cancel), SUBTEST(live_suppress_self_preempt), - SUBTEST(live_suppress_wait_preempt), SUBTEST(live_chain_preempt), + SUBTEST(live_preempt_ring), SUBTEST(live_preempt_gang), SUBTEST(live_preempt_timeout), SUBTEST(live_preempt_user), @@ -4427,6 +4695,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915) SUBTEST(live_virtual_engine), SUBTEST(live_virtual_mask), SUBTEST(live_virtual_preserved), + SUBTEST(live_virtual_slice), SUBTEST(live_virtual_bond), SUBTEST(live_virtual_reset), }; @@ -5030,7 +5299,7 @@ static int live_lrc_gpr(void *arg) return PTR_ERR(scratch); for_each_engine(engine, gt, id) { - engine_heartbeat_disable(engine); + st_engine_heartbeat_disable(engine); err = __live_lrc_gpr(engine, scratch, false); if (err) @@ -5041,7 +5310,7 @@ static int live_lrc_gpr(void *arg) goto err; err: - engine_heartbeat_enable(engine); + st_engine_heartbeat_enable(engine); if (igt_flush_test(gt->i915)) err = -EIO; if (err) @@ -5190,7 +5459,7 @@ static int live_lrc_timestamp(void *arg) for_each_engine(data.engine, gt, id) { int i, err = 0; - engine_heartbeat_disable(data.engine); + st_engine_heartbeat_disable(data.engine); for (i = 0; i < ARRAY_SIZE(data.ce); i++) { struct intel_context *tmp; @@ -5223,7 +5492,7 @@ static int live_lrc_timestamp(void *arg) } err: - engine_heartbeat_enable(data.engine); + st_engine_heartbeat_enable(data.engine); for (i = 0; i < ARRAY_SIZE(data.ce); i++) { if (!data.ce[i]) break; diff --git a/drivers/gpu/drm/i915/gt/selftest_mocs.c b/drivers/gpu/drm/i915/gt/selftest_mocs.c index 63f87d8608c3..b25eba50c88e 100644 --- a/drivers/gpu/drm/i915/gt/selftest_mocs.c +++ b/drivers/gpu/drm/i915/gt/selftest_mocs.c @@ -157,7 +157,7 @@ static int read_mocs_table(struct i915_request *rq, { u32 addr; - if (HAS_GLOBAL_MOCS_REGISTERS(rq->i915)) + if (HAS_GLOBAL_MOCS_REGISTERS(rq->engine->i915)) addr = global_mocs_offset(); else addr = mocs_offset(rq->engine); diff --git a/drivers/gpu/drm/i915/gt/selftest_rc6.c b/drivers/gpu/drm/i915/gt/selftest_rc6.c index 2dc460624bbc..3c8434846fa1 100644 --- a/drivers/gpu/drm/i915/gt/selftest_rc6.c +++ b/drivers/gpu/drm/i915/gt/selftest_rc6.c @@ -132,7 +132,7 @@ static const u32 *__live_rc6_ctx(struct intel_context *ce) } cmd = MI_STORE_REGISTER_MEM | MI_USE_GGTT; - if (INTEL_GEN(rq->i915) >= 8) + if (INTEL_GEN(rq->engine->i915) >= 8) cmd++; *cs++ = cmd; @@ 
-197,10 +197,10 @@ int live_rc6_ctx_wa(void *arg) int pass; for (pass = 0; pass < 2; pass++) { + struct i915_gpu_error *error = &gt->i915->gpu_error; struct intel_context *ce; unsigned int resets = - i915_reset_engine_count(&gt->i915->gpu_error, - engine); + i915_reset_engine_count(error, engine); const u32 *res; /* Use a sacrificial context */ @@ -230,8 +230,7 @@ int live_rc6_ctx_wa(void *arg) engine->name, READ_ONCE(*res)); if (resets != - i915_reset_engine_count(&gt->i915->gpu_error, - engine)) { + i915_reset_engine_count(error, engine)) { pr_err("%s: GPU reset required\n", engine->name); add_taint_for_CI(TAINT_WARN); diff --git a/drivers/gpu/drm/i915/gt/selftest_rps.c b/drivers/gpu/drm/i915/gt/selftest_rps.c index c91981e75ebf..8624f5d2a1f3 100644 --- a/drivers/gpu/drm/i915/gt/selftest_rps.c +++ b/drivers/gpu/drm/i915/gt/selftest_rps.c @@ -12,6 +12,7 @@ #include "intel_gt_clock_utils.h" #include "intel_gt_pm.h" #include "intel_rc6.h" +#include "selftest_engine_heartbeat.h" #include "selftest_rps.h" #include "selftests/igt_flush_test.h" #include "selftests/igt_spinner.h" @@ -20,22 +21,6 @@ /* Try to isolate the impact of cstates from determining frequency response */ #define CPU_LATENCY 0 /* -1 to disable pm_qos, 0 to disable cstates */ -static void engine_heartbeat_disable(struct intel_engine_cs *engine) -{ - engine->props.heartbeat_interval_ms = 0; - - intel_engine_pm_get(engine); - intel_engine_park_heartbeat(engine); -} - -static void engine_heartbeat_enable(struct intel_engine_cs *engine) -{ - intel_engine_pm_put(engine); - - engine->props.heartbeat_interval_ms = - engine->defaults.heartbeat_interval_ms; -} - static void dummy_rps_work(struct work_struct *wrk) { } @@ -249,13 +234,13 @@ int live_rps_clock_interval(void *arg) if (!intel_engine_can_store_dword(engine)) continue; - engine_heartbeat_disable(engine); + st_engine_heartbeat_disable(engine); rq = igt_spinner_create_request(&spin, engine->kernel_context, MI_NOOP); if (IS_ERR(rq)) { - engine_heartbeat_enable(engine); + st_engine_heartbeat_enable(engine); err = PTR_ERR(rq); break; } @@ -266,7 +251,7 @@ int live_rps_clock_interval(void *arg) pr_err("%s: RPS spinner did not start\n", engine->name); igt_spinner_end(&spin); - engine_heartbeat_enable(engine); + st_engine_heartbeat_enable(engine); intel_gt_set_wedged(engine->gt); err = -EIO; break; @@ -322,7 +307,7 @@ int live_rps_clock_interval(void *arg) intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL); igt_spinner_end(&spin); - engine_heartbeat_enable(engine); + st_engine_heartbeat_enable(engine); if (err == 0) { u64 time = intel_gt_pm_interval_to_ns(gt, cycles); @@ -408,7 +393,7 @@ int live_rps_control(void *arg) if (!intel_engine_can_store_dword(engine)) continue; - engine_heartbeat_disable(engine); + st_engine_heartbeat_disable(engine); rq = igt_spinner_create_request(&spin, engine->kernel_context, @@ -424,7 +409,7 @@ int live_rps_control(void *arg) pr_err("%s: RPS spinner did not start\n", engine->name); igt_spinner_end(&spin); - engine_heartbeat_enable(engine); + st_engine_heartbeat_enable(engine); intel_gt_set_wedged(engine->gt); err = -EIO; break; @@ -434,7 +419,7 @@ int live_rps_control(void *arg) pr_err("%s: could not set minimum frequency [%x], only %x!\n", engine->name, rps->min_freq, read_cagf(rps)); igt_spinner_end(&spin); - engine_heartbeat_enable(engine); + st_engine_heartbeat_enable(engine); show_pstate_limits(rps); err = -EINVAL; break; @@ -451,7 +436,7 @@ int live_rps_control(void *arg) pr_err("%s: could not restore minimum frequency [%x], only %x!\n", engine->name,
rps->min_freq, read_cagf(rps)); igt_spinner_end(&spin); - engine_heartbeat_enable(engine); + st_engine_heartbeat_enable(engine); show_pstate_limits(rps); err = -EINVAL; break; @@ -466,7 +451,7 @@ int live_rps_control(void *arg) min_dt = ktime_sub(ktime_get(), min_dt); igt_spinner_end(&spin); - engine_heartbeat_enable(engine); + st_engine_heartbeat_enable(engine); pr_info("%s: range:[%x:%uMHz, %x:%uMHz] limit:[%x:%uMHz], %x:%x response %lluns:%lluns\n", engine->name, @@ -637,14 +622,14 @@ int live_rps_frequency_cs(void *arg) int freq; } min, max; - engine_heartbeat_disable(engine); + st_engine_heartbeat_disable(engine); vma = create_spin_counter(engine, engine->kernel_context->vm, false, &cancel, &cntr); if (IS_ERR(vma)) { err = PTR_ERR(vma); - engine_heartbeat_enable(engine); + st_engine_heartbeat_enable(engine); break; } @@ -725,7 +710,7 @@ err_vma: i915_vma_unpin(vma); i915_vma_put(vma); - engine_heartbeat_enable(engine); + st_engine_heartbeat_enable(engine); if (igt_flush_test(gt->i915)) err = -EIO; if (err) @@ -779,14 +764,14 @@ int live_rps_frequency_srm(void *arg) int freq; } min, max; - engine_heartbeat_disable(engine); + st_engine_heartbeat_disable(engine); vma = create_spin_counter(engine, engine->kernel_context->vm, true, &cancel, &cntr); if (IS_ERR(vma)) { err = PTR_ERR(vma); - engine_heartbeat_enable(engine); + st_engine_heartbeat_enable(engine); break; } @@ -866,7 +851,7 @@ err_vma: i915_vma_unpin(vma); i915_vma_put(vma); - engine_heartbeat_enable(engine); + st_engine_heartbeat_enable(engine); if (igt_flush_test(gt->i915)) err = -EIO; if (err) @@ -1061,11 +1046,11 @@ int live_rps_interrupt(void *arg) intel_gt_pm_wait_for_idle(engine->gt); GEM_BUG_ON(intel_rps_is_active(rps)); - engine_heartbeat_disable(engine); + st_engine_heartbeat_disable(engine); err = __rps_up_interrupt(rps, engine, &spin); - engine_heartbeat_enable(engine); + st_engine_heartbeat_enable(engine); if (err) goto out; @@ -1074,13 +1059,13 @@ int live_rps_interrupt(void *arg) /* Keep the engine awake but idle and check for DOWN */ if (pm_events & GEN6_PM_RP_DOWN_THRESHOLD) { - engine_heartbeat_disable(engine); + st_engine_heartbeat_disable(engine); intel_rc6_disable(&gt->rc6); err = __rps_down_interrupt(rps, engine); intel_rc6_enable(&gt->rc6); - engine_heartbeat_enable(engine); + st_engine_heartbeat_enable(engine); if (err) goto out; } @@ -1165,13 +1150,13 @@ int live_rps_power(void *arg) if (!intel_engine_can_store_dword(engine)) continue; - engine_heartbeat_disable(engine); + st_engine_heartbeat_disable(engine); rq = igt_spinner_create_request(&spin, engine->kernel_context, MI_NOOP); if (IS_ERR(rq)) { - engine_heartbeat_enable(engine); + st_engine_heartbeat_enable(engine); err = PTR_ERR(rq); break; } @@ -1182,7 +1167,7 @@ int live_rps_power(void *arg) pr_err("%s: RPS spinner did not start\n", engine->name); igt_spinner_end(&spin); - engine_heartbeat_enable(engine); + st_engine_heartbeat_enable(engine); intel_gt_set_wedged(engine->gt); err = -EIO; break; @@ -1195,7 +1180,7 @@ int live_rps_power(void *arg) min.power = measure_power_at(rps, &min.freq); igt_spinner_end(&spin); - engine_heartbeat_enable(engine); + st_engine_heartbeat_enable(engine); pr_info("%s: min:%llumW @ %uMHz, max:%llumW @ %uMHz\n", engine->name, @@ -1252,6 +1237,11 @@ int live_rps_dynamic(void *arg) if (igt_spinner_init(&spin, gt)) return -ENOMEM; + if (intel_rps_has_interrupts(rps)) + pr_info("RPS has interrupt support\n"); + if (intel_rps_uses_timer(rps)) + pr_info("RPS has timer support\n"); + for_each_engine(engine, gt, id) { struct
i915_request *rq; struct { diff --git a/drivers/gpu/drm/i915/gt/selftest_timeline.c b/drivers/gpu/drm/i915/gt/selftest_timeline.c index ef1c35073dc0..fcdee951579b 100644 --- a/drivers/gpu/drm/i915/gt/selftest_timeline.c +++ b/drivers/gpu/drm/i915/gt/selftest_timeline.c @@ -12,6 +12,7 @@ #include "intel_gt.h" #include "intel_gt_requests.h" #include "intel_ring.h" +#include "selftest_engine_heartbeat.h" #include "../selftests/i915_random.h" #include "../i915_selftest.h" @@ -426,12 +427,12 @@ static int emit_ggtt_store_dw(struct i915_request *rq, u32 addr, u32 value) if (IS_ERR(cs)) return PTR_ERR(cs); - if (INTEL_GEN(rq->i915) >= 8) { + if (INTEL_GEN(rq->engine->i915) >= 8) { *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; *cs++ = addr; *cs++ = 0; *cs++ = value; - } else if (INTEL_GEN(rq->i915) >= 4) { + } else if (INTEL_GEN(rq->engine->i915) >= 4) { *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; *cs++ = 0; *cs++ = addr; @@ -751,22 +752,6 @@ out_free: return err; } -static void engine_heartbeat_disable(struct intel_engine_cs *engine) -{ - engine->props.heartbeat_interval_ms = 0; - - intel_engine_pm_get(engine); - intel_engine_park_heartbeat(engine); -} - -static void engine_heartbeat_enable(struct intel_engine_cs *engine) -{ - intel_engine_pm_put(engine); - - engine->props.heartbeat_interval_ms = - engine->defaults.heartbeat_interval_ms; -} - static int live_hwsp_rollover_kernel(void *arg) { struct intel_gt *gt = arg; @@ -785,7 +770,7 @@ static int live_hwsp_rollover_kernel(void *arg) struct i915_request *rq[3] = {}; int i; - engine_heartbeat_disable(engine); + st_engine_heartbeat_disable(engine); if (intel_gt_wait_for_idle(gt, HZ / 2)) { err = -EIO; goto out; @@ -836,7 +821,7 @@ static int live_hwsp_rollover_kernel(void *arg) out: for (i = 0; i < ARRAY_SIZE(rq); i++) i915_request_put(rq[i]); - engine_heartbeat_enable(engine); + st_engine_heartbeat_enable(engine); if (err) break; } diff --git a/drivers/gpu/drm/i915/gt/selftest_workarounds.c b/drivers/gpu/drm/i915/gt/selftest_workarounds.c index 32785463ec9e..febc9e6692ba 100644 --- a/drivers/gpu/drm/i915/gt/selftest_workarounds.c +++ b/drivers/gpu/drm/i915/gt/selftest_workarounds.c @@ -417,6 +417,20 @@ static bool wo_register(struct intel_engine_cs *engine, u32 reg) return false; } +static bool timestamp(const struct intel_engine_cs *engine, u32 reg) +{ + reg = (reg - engine->mmio_base) & ~RING_FORCE_TO_NONPRIV_ACCESS_MASK; + switch (reg) { + case 0x358: + case 0x35c: + case 0x3a8: + return true; + + default: + return false; + } +} + static bool ro_register(u32 reg) { if ((reg & RING_FORCE_TO_NONPRIV_ACCESS_MASK) == @@ -497,6 +511,9 @@ static int check_dirty_whitelist(struct intel_context *ce) if (wo_register(engine, reg)) continue; + if (timestamp(engine, reg)) + continue; /* timestamps are expected to autoincrement */ + ro_reg = ro_register(reg); /* Clear non priv flags */ diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c index fb10f3597ea5..9bbe8a795cb8 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c @@ -424,25 +424,28 @@ static void guc_log_capture_logs(struct intel_guc_log *log) static u32 __get_default_log_level(struct intel_guc_log *log) { + struct intel_guc *guc = log_to_guc(log); + struct drm_i915_private *i915 = guc_to_gt(guc)->i915; + /* A negative value means "use platform/config default" */ - if (i915_modparams.guc_log_level < 0) { + if (i915->params.guc_log_level < 0) { return (IS_ENABLED(CONFIG_DRM_I915_DEBUG) || 
IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) ? GUC_LOG_LEVEL_MAX : GUC_LOG_LEVEL_NON_VERBOSE; } - if (i915_modparams.guc_log_level > GUC_LOG_LEVEL_MAX) { + if (i915->params.guc_log_level > GUC_LOG_LEVEL_MAX) { DRM_WARN("Incompatible option detected: %s=%d, %s!\n", - "guc_log_level", i915_modparams.guc_log_level, + "guc_log_level", i915->params.guc_log_level, "verbosity too high"); return (IS_ENABLED(CONFIG_DRM_I915_DEBUG) || IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) ? GUC_LOG_LEVEL_MAX : GUC_LOG_LEVEL_DISABLED; } - GEM_BUG_ON(i915_modparams.guc_log_level < GUC_LOG_LEVEL_DISABLED); - GEM_BUG_ON(i915_modparams.guc_log_level > GUC_LOG_LEVEL_MAX); - return i915_modparams.guc_log_level; + GEM_BUG_ON(i915->params.guc_log_level < GUC_LOG_LEVEL_DISABLED); + GEM_BUG_ON(i915->params.guc_log_level > GUC_LOG_LEVEL_MAX); + return i915->params.guc_log_level; } int intel_guc_log_create(struct intel_guc_log *log) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 94eb63f309ce..fdfeb4b9b0f5 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -660,10 +660,12 @@ void intel_guc_submission_disable(struct intel_guc *guc) static bool __guc_submission_selected(struct intel_guc *guc) { + struct drm_i915_private *i915 = guc_to_gt(guc)->i915; + if (!intel_guc_submission_is_supported(guc)) return false; - return i915_modparams.enable_guc & ENABLE_GUC_SUBMISSION; + return i915->params.enable_guc & ENABLE_GUC_SUBMISSION; } void intel_guc_submission_init_early(struct intel_guc *guc) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c index f518fe05c6f9..1c2d6358826c 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c @@ -47,15 +47,15 @@ static void __confirm_options(struct intel_uc *uc) drm_dbg(&i915->drm, "enable_guc=%d (guc:%s submission:%s huc:%s)\n", - i915_modparams.enable_guc, + i915->params.enable_guc, yesno(intel_uc_wants_guc(uc)), yesno(intel_uc_wants_guc_submission(uc)), yesno(intel_uc_wants_huc(uc))); - if (i915_modparams.enable_guc == -1) + if (i915->params.enable_guc == -1) return; - if (i915_modparams.enable_guc == 0) { + if (i915->params.enable_guc == 0) { GEM_BUG_ON(intel_uc_wants_guc(uc)); GEM_BUG_ON(intel_uc_wants_guc_submission(uc)); GEM_BUG_ON(intel_uc_wants_huc(uc)); @@ -65,25 +65,25 @@ static void __confirm_options(struct intel_uc *uc) if (!intel_uc_supports_guc(uc)) drm_info(&i915->drm, "Incompatible option enable_guc=%d - %s\n", - i915_modparams.enable_guc, "GuC is not supported!"); + i915->params.enable_guc, "GuC is not supported!"); - if (i915_modparams.enable_guc & ENABLE_GUC_LOAD_HUC && + if (i915->params.enable_guc & ENABLE_GUC_LOAD_HUC && !intel_uc_supports_huc(uc)) drm_info(&i915->drm, "Incompatible option enable_guc=%d - %s\n", - i915_modparams.enable_guc, "HuC is not supported!"); + i915->params.enable_guc, "HuC is not supported!"); - if (i915_modparams.enable_guc & ENABLE_GUC_SUBMISSION && + if (i915->params.enable_guc & ENABLE_GUC_SUBMISSION && !intel_uc_supports_guc_submission(uc)) drm_info(&i915->drm, "Incompatible option enable_guc=%d - %s\n", - i915_modparams.enable_guc, "GuC submission is N/A"); + i915->params.enable_guc, "GuC submission is N/A"); - if (i915_modparams.enable_guc & ~(ENABLE_GUC_SUBMISSION | + if (i915->params.enable_guc & ~(ENABLE_GUC_SUBMISSION | ENABLE_GUC_LOAD_HUC)) drm_info(&i915->drm, "Incompatible option enable_guc=%d - %s\n", - 
i915_modparams.enable_guc, "undocumented flag"); + i915->params.enable_guc, "undocumented flag"); } void intel_uc_init_early(struct intel_uc *uc) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c index e1caae93996d..59b27aba15c6 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c @@ -47,12 +47,15 @@ void intel_uc_fw_change_status(struct intel_uc_fw *uc_fw, * TGL 35.2 is interface-compatible with 33.0 for previous Gens. The deltas * between 33.0 and 35.2 are only related to new additions to support new Gen12 * features. + * + * Note that RKL uses the same firmware as TGL. */ #define INTEL_UC_FIRMWARE_DEFS(fw_def, guc_def, huc_def) \ + fw_def(ROCKETLAKE, 0, guc_def(tgl, 35, 2, 0), huc_def(tgl, 7, 0, 12)) \ fw_def(TIGERLAKE, 0, guc_def(tgl, 35, 2, 0), huc_def(tgl, 7, 0, 12)) \ fw_def(ELKHARTLAKE, 0, guc_def(ehl, 33, 0, 4), huc_def(ehl, 9, 0, 0)) \ fw_def(ICELAKE, 0, guc_def(icl, 33, 0, 0), huc_def(icl, 9, 0, 0)) \ - fw_def(COFFEELAKE, 5, guc_def(cml, 33, 0, 0), huc_def(cml, 4, 0, 0)) \ + fw_def(COMETLAKE, 5, guc_def(cml, 33, 0, 0), huc_def(cml, 4, 0, 0)) \ fw_def(COFFEELAKE, 0, guc_def(kbl, 33, 0, 0), huc_def(kbl, 4, 0, 0)) \ fw_def(GEMINILAKE, 0, guc_def(glk, 33, 0, 0), huc_def(glk, 4, 0, 0)) \ fw_def(KABYLAKE, 0, guc_def(kbl, 33, 0, 0), huc_def(kbl, 4, 0, 0)) \ @@ -112,11 +115,13 @@ struct __packed uc_fw_platform_requirement { }, static void -__uc_fw_auto_select(struct intel_uc_fw *uc_fw, enum intel_platform p, u8 rev) +__uc_fw_auto_select(struct drm_i915_private *i915, struct intel_uc_fw *uc_fw) { static const struct uc_fw_platform_requirement fw_blobs[] = { INTEL_UC_FIRMWARE_DEFS(MAKE_FW_LIST, GUC_FW_BLOB, HUC_FW_BLOB) }; + enum intel_platform p = INTEL_INFO(i915)->platform; + u8 rev = INTEL_REVID(i915); int i; for (i = 0; i < ARRAY_SIZE(fw_blobs) && p <= fw_blobs[i].p; i++) { @@ -151,35 +156,35 @@ __uc_fw_auto_select(struct intel_uc_fw *uc_fw, enum intel_platform p, u8 rev) } /* We don't want to enable GuC/HuC on pre-Gen11 by default */ - if (i915_modparams.enable_guc == -1 && p < INTEL_ICELAKE) + if (i915->params.enable_guc == -1 && p < INTEL_ICELAKE) uc_fw->path = NULL; } -static const char *__override_guc_firmware_path(void) +static const char *__override_guc_firmware_path(struct drm_i915_private *i915) { - if (i915_modparams.enable_guc & (ENABLE_GUC_SUBMISSION | - ENABLE_GUC_LOAD_HUC)) - return i915_modparams.guc_firmware_path; + if (i915->params.enable_guc & (ENABLE_GUC_SUBMISSION | + ENABLE_GUC_LOAD_HUC)) + return i915->params.guc_firmware_path; return ""; } -static const char *__override_huc_firmware_path(void) +static const char *__override_huc_firmware_path(struct drm_i915_private *i915) { - if (i915_modparams.enable_guc & ENABLE_GUC_LOAD_HUC) - return i915_modparams.huc_firmware_path; + if (i915->params.enable_guc & ENABLE_GUC_LOAD_HUC) + return i915->params.huc_firmware_path; return ""; } -static void __uc_fw_user_override(struct intel_uc_fw *uc_fw) +static void __uc_fw_user_override(struct drm_i915_private *i915, struct intel_uc_fw *uc_fw) { const char *path = NULL; switch (uc_fw->type) { case INTEL_UC_FW_TYPE_GUC: - path = __override_guc_firmware_path(); + path = __override_guc_firmware_path(i915); break; case INTEL_UC_FW_TYPE_HUC: - path = __override_huc_firmware_path(); + path = __override_huc_firmware_path(i915); break; } @@ -213,10 +218,8 @@ void intel_uc_fw_init_early(struct intel_uc_fw *uc_fw, uc_fw->type = type; if (HAS_GT_UC(i915)) { - __uc_fw_auto_select(uc_fw, - 
INTEL_INFO(i915)->platform, - INTEL_REVID(i915)); - __uc_fw_user_override(uc_fw); + __uc_fw_auto_select(i915, uc_fw); + __uc_fw_user_override(i915, uc_fw); } intel_uc_fw_change_status(uc_fw, uc_fw->path ? *uc_fw->path ?
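The __uc_fw_auto_select()/__uc_fw_user_override() rework finishes the pattern applied across the uC files in this series: reads of the module-global i915_modparams become reads of the device-local copy, so the drm_i915_private pointer has to be threaded through. A minimal before/after sketch (the example_ and old_ names are hypothetical):

	/* before: one global value shared by every device */
	static int old_guc_log_level(void)
	{
		return i915_modparams.guc_log_level;
	}

	/* after: each device carries its own parameter block */
	static int example_guc_log_level(struct drm_i915_private *i915)
	{
		return i915->params.guc_log_level;
	}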