Diffstat (limited to 'drivers/gpu/drm/i915/intel_ringbuffer.c')
-rw-r--r--	drivers/gpu/drm/i915/intel_ringbuffer.c	360
1 file changed, 206 insertions(+), 154 deletions(-)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 245386e20c52..68c5af079ef8 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -53,12 +53,6 @@ void intel_ring_update_space(struct intel_ringbuffer *ringbuf)
 					    ringbuf->tail, ringbuf->size);
 }
 
-int intel_ring_space(struct intel_ringbuffer *ringbuf)
-{
-	intel_ring_update_space(ringbuf);
-	return ringbuf->space;
-}
-
 bool intel_engine_stopped(struct intel_engine_cs *engine)
 {
 	struct drm_i915_private *dev_priv = engine->dev->dev_private;
@@ -919,24 +913,26 @@ static int gen9_init_workarounds(struct intel_engine_cs *engine)
 {
 	struct drm_device *dev = engine->dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
-	uint32_t tmp;
 	int ret;
 
-	/* WaEnableLbsSlaRetryTimerDecrement:skl */
+	/* WaConextSwitchWithConcurrentTLBInvalidate:skl,bxt,kbl */
+	I915_WRITE(GEN9_CSFE_CHICKEN1_RCS, _MASKED_BIT_ENABLE(GEN9_PREEMPT_GPGPU_SYNC_SWITCH_DISABLE));
+
+	/* WaEnableLbsSlaRetryTimerDecrement:skl,bxt,kbl */
 	I915_WRITE(BDW_SCRATCH1, I915_READ(BDW_SCRATCH1) |
 		   GEN9_LBS_SLA_RETRY_TIMER_DECREMENT_ENABLE);
 
-	/* WaDisableKillLogic:bxt,skl */
+	/* WaDisableKillLogic:bxt,skl,kbl */
 	I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) |
 		   ECOCHK_DIS_TLB);
 
-	/* WaClearFlowControlGpgpuContextSave:skl,bxt */
-	/* WaDisablePartialInstShootdown:skl,bxt */
+	/* WaClearFlowControlGpgpuContextSave:skl,bxt,kbl */
+	/* WaDisablePartialInstShootdown:skl,bxt,kbl */
 	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
 			  FLOW_CONTROL_ENABLE |
 			  PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
 
-	/* Syncing dependencies between camera and graphics:skl,bxt */
+	/* Syncing dependencies between camera and graphics:skl,bxt,kbl */
 	WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3,
 			  GEN9_DISABLE_OCL_OOB_SUPPRESS_LOGIC);
 
@@ -958,18 +954,18 @@ static int gen9_init_workarounds(struct intel_engine_cs *engine)
 	 */
 	}
 
-	/* WaEnableYV12BugFixInHalfSliceChicken7:skl,bxt */
-	/* WaEnableSamplerGPGPUPreemptionSupport:skl,bxt */
+	/* WaEnableYV12BugFixInHalfSliceChicken7:skl,bxt,kbl */
+	/* WaEnableSamplerGPGPUPreemptionSupport:skl,bxt,kbl */
 	WA_SET_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN7,
 			  GEN9_ENABLE_YV12_BUGFIX |
 			  GEN9_ENABLE_GPGPU_PREEMPTION);
 
-	/* Wa4x4STCOptimizationDisable:skl,bxt */
-	/* WaDisablePartialResolveInVc:skl,bxt */
+	/* Wa4x4STCOptimizationDisable:skl,bxt,kbl */
+	/* WaDisablePartialResolveInVc:skl,bxt,kbl */
 	WA_SET_BIT_MASKED(CACHE_MODE_1,
 			  (GEN8_4x4_STC_OPTIMIZATION_DISABLE |
 			   GEN9_PARTIAL_RESOLVE_IN_VC_DISABLE));
 
-	/* WaCcsTlbPrefetchDisable:skl,bxt */
+	/* WaCcsTlbPrefetchDisable:skl,bxt,kbl */
 	WA_CLR_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN5,
 			  GEN9_CCS_TLB_PREFETCH_ENABLE);
@@ -979,31 +975,57 @@ static int gen9_init_workarounds(struct intel_engine_cs *engine)
 	WA_SET_BIT_MASKED(SLICE_ECO_CHICKEN0,
 			  PIXEL_MASK_CAMMING_DISABLE);
 
-	/* WaForceContextSaveRestoreNonCoherent:skl,bxt */
-	tmp = HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT;
-	if (IS_SKL_REVID(dev, SKL_REVID_F0, REVID_FOREVER) ||
-	    IS_BXT_REVID(dev, BXT_REVID_B0, REVID_FOREVER))
-		tmp |= HDC_FORCE_CSR_NON_COHERENT_OVR_DISABLE;
-	WA_SET_BIT_MASKED(HDC_CHICKEN0, tmp);
+	/* WaForceContextSaveRestoreNonCoherent:skl,bxt,kbl */
+	WA_SET_BIT_MASKED(HDC_CHICKEN0,
+			  HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
+			  HDC_FORCE_CSR_NON_COHERENT_OVR_DISABLE);
+
+	/* WaForceEnableNonCoherent and WaDisableHDCInvalidation are
+	 * both tied to WaForceContextSaveRestoreNonCoherent
+	 * in some hsds for skl. We keep the tie for all gen9. The
+	 * documentation is a bit hazy and so we want to get common behaviour,
+	 * even though there is no clear evidence we would need both on kbl/bxt.
+	 * This area has been source of system hangs so we play it safe
+	 * and mimic the skl regardless of what bspec says.
+	 *
+	 * Use Force Non-Coherent whenever executing a 3D context. This
+	 * is a workaround for a possible hang in the unlikely event
+	 * a TLB invalidation occurs during a PSD flush.
+	 */
+
+	/* WaForceEnableNonCoherent:skl,bxt,kbl */
+	WA_SET_BIT_MASKED(HDC_CHICKEN0,
+			  HDC_FORCE_NON_COHERENT);
+
+	/* WaDisableHDCInvalidation:skl,bxt,kbl */
+	I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) |
+		   BDW_DISABLE_HDC_INVALIDATION);
 
-	/* WaDisableSamplerPowerBypassForSOPingPong:skl,bxt */
-	if (IS_SKYLAKE(dev) || IS_BXT_REVID(dev, 0, BXT_REVID_B0))
+	/* WaDisableSamplerPowerBypassForSOPingPong:skl,bxt,kbl */
+	if (IS_SKYLAKE(dev_priv) ||
+	    IS_KABYLAKE(dev_priv) ||
+	    IS_BXT_REVID(dev_priv, 0, BXT_REVID_B0))
 		WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3,
 				  GEN8_SAMPLER_POWER_BYPASS_DIS);
 
-	/* WaDisableSTUnitPowerOptimization:skl,bxt */
+	/* WaDisableSTUnitPowerOptimization:skl,bxt,kbl */
 	WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN2, GEN8_ST_PO_DISABLE);
 
-	/* WaOCLCoherentLineFlush:skl,bxt */
+	/* WaOCLCoherentLineFlush:skl,bxt,kbl */
 	I915_WRITE(GEN8_L3SQCREG4, (I915_READ(GEN8_L3SQCREG4) |
 				    GEN8_LQSC_FLUSH_COHERENT_LINES));
 
-	/* WaEnablePreemptionGranularityControlByUMD:skl,bxt */
+	/* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt */
+	ret = wa_ring_whitelist_reg(engine, GEN9_CTX_PREEMPT_REG);
+	if (ret)
+		return ret;
+
+	/* WaEnablePreemptionGranularityControlByUMD:skl,bxt,kbl */
 	ret = wa_ring_whitelist_reg(engine, GEN8_CS_CHICKEN1);
 	if (ret)
 		return ret;
 
-	/* WaAllowUMDToModifyHDCChicken1:skl,bxt */
+	/* WaAllowUMDToModifyHDCChicken1:skl,bxt,kbl */
 	ret = wa_ring_whitelist_reg(engine, GEN8_HDC_CHICKEN1);
 	if (ret)
 		return ret;
@@ -1098,22 +1120,6 @@ static int skl_init_workarounds(struct intel_engine_cs *engine)
 		WA_SET_BIT_MASKED(HIZ_CHICKEN,
 				  BDW_HIZ_POWER_COMPILER_CLOCK_GATING_DISABLE);
 
-	/* This is tied to WaForceContextSaveRestoreNonCoherent */
-	if (IS_SKL_REVID(dev, 0, REVID_FOREVER)) {
-		/*
-		 * Use Force Non-Coherent whenever executing a 3D context. This
-		 * is a workaround for a possible hang in the unlikely event
-		 * a TLB invalidation occurs during a PSD flush.
-		 */
-		/* WaForceEnableNonCoherent:skl */
-		WA_SET_BIT_MASKED(HDC_CHICKEN0,
-				  HDC_FORCE_NON_COHERENT);
-
-		/* WaDisableHDCInvalidation:skl */
-		I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) |
-			   BDW_DISABLE_HDC_INVALIDATION);
-	}
-
 	/* WaBarrierPerformanceFixDisable:skl */
 	if (IS_SKL_REVID(dev, SKL_REVID_C0, SKL_REVID_D0))
 		WA_SET_BIT_MASKED(HDC_CHICKEN0,
@@ -1126,6 +1132,9 @@ static int skl_init_workarounds(struct intel_engine_cs *engine)
 			GEN7_HALF_SLICE_CHICKEN1,
 			GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
 
+	/* WaDisableGafsUnitClkGating:skl */
+	WA_SET_BIT(GEN7_UCGCTL4, GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
+
 	/* WaDisableLSQCROPERFforOCL:skl */
 	ret = wa_ring_whitelist_reg(engine, GEN8_L3SQCREG4);
 	if (ret)
@@ -1180,6 +1189,63 @@ static int bxt_init_workarounds(struct intel_engine_cs *engine)
 			return ret;
 	}
 
+	/* WaInsertDummyPushConstPs:bxt */
+	if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_B0))
+		WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
+				  GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
+
+	return 0;
+}
+
+static int kbl_init_workarounds(struct intel_engine_cs *engine)
+{
+	struct drm_i915_private *dev_priv = engine->dev->dev_private;
+	int ret;
+
+	ret = gen9_init_workarounds(engine);
+	if (ret)
+		return ret;
+
+	/* WaEnableGapsTsvCreditFix:kbl */
+	I915_WRITE(GEN8_GARBCNTL, (I915_READ(GEN8_GARBCNTL) |
+				   GEN9_GAPS_TSV_CREDIT_DISABLE));
+
+	/* WaDisableDynamicCreditSharing:kbl */
+	if (IS_KBL_REVID(dev_priv, 0, KBL_REVID_B0))
+		WA_SET_BIT(GAMT_CHKN_BIT_REG,
+			   GAMT_CHKN_DISABLE_DYNAMIC_CREDIT_SHARING);
+
+	/* WaDisableFenceDestinationToSLM:kbl (pre-prod) */
+	if (IS_KBL_REVID(dev_priv, KBL_REVID_A0, KBL_REVID_A0))
+		WA_SET_BIT_MASKED(HDC_CHICKEN0,
+				  HDC_FENCE_DEST_SLM_DISABLE);
+
+	/* GEN8_L3SQCREG4 has a dependency with WA batch so any new changes
+	 * involving this register should also be added to WA batch as required.
+	 */
+	if (IS_KBL_REVID(dev_priv, 0, KBL_REVID_E0))
+		/* WaDisableLSQCROPERFforOCL:kbl */
+		I915_WRITE(GEN8_L3SQCREG4, I915_READ(GEN8_L3SQCREG4) |
+			   GEN8_LQSC_RO_PERF_DIS);
+
+	/* WaInsertDummyPushConstPs:kbl */
+	if (IS_KBL_REVID(dev_priv, 0, KBL_REVID_B0))
+		WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
+				  GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
+
+	/* WaDisableGafsUnitClkGating:kbl */
+	WA_SET_BIT(GEN7_UCGCTL4, GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE);
+
+	/* WaDisableSbeCacheDispatchPortSharing:kbl */
+	WA_SET_BIT_MASKED(
+		GEN7_HALF_SLICE_CHICKEN1,
+		GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
+
+	/* WaDisableLSQCROPERFforOCL:kbl */
+	ret = wa_ring_whitelist_reg(engine, GEN8_L3SQCREG4);
+	if (ret)
+		return ret;
+
 	return 0;
 }
 
@@ -1205,6 +1271,9 @@ int init_workarounds_ring(struct intel_engine_cs *engine)
 	if (IS_BROXTON(dev))
 		return bxt_init_workarounds(engine);
 
+	if (IS_KABYLAKE(dev_priv))
+		return kbl_init_workarounds(engine);
+
 	return 0;
 }
 
@@ -1309,7 +1378,7 @@ static int gen8_rcs_signal(struct drm_i915_gem_request *signaller_req,
 		intel_ring_emit(signaller, seqno);
 		intel_ring_emit(signaller, 0);
 		intel_ring_emit(signaller, MI_SEMAPHORE_SIGNAL |
-					   MI_SEMAPHORE_TARGET(waiter->id));
+					   MI_SEMAPHORE_TARGET(waiter->hw_id));
 		intel_ring_emit(signaller, 0);
 	}
 
@@ -1349,7 +1418,7 @@ static int gen8_xcs_signal(struct drm_i915_gem_request *signaller_req,
 		intel_ring_emit(signaller, upper_32_bits(gtt_offset));
 		intel_ring_emit(signaller, seqno);
 		intel_ring_emit(signaller, MI_SEMAPHORE_SIGNAL |
-					   MI_SEMAPHORE_TARGET(waiter->id));
+					   MI_SEMAPHORE_TARGET(waiter->hw_id));
 		intel_ring_emit(signaller, 0);
 	}
 
@@ -1573,6 +1642,8 @@ pc_render_add_request(struct drm_i915_gem_request *req)
 static void
 gen6_seqno_barrier(struct intel_engine_cs *engine)
 {
+	struct drm_i915_private *dev_priv = engine->dev->dev_private;
+
 	/* Workaround to force correct ordering between irq and seqno writes on
 	 * ivb (and maybe also on snb) by reading from a CS register (like
 	 * ACTHD) before reading the status page.
@@ -1584,9 +1655,13 @@ gen6_seqno_barrier(struct intel_engine_cs *engine)
 	 * the write time to land, but that would incur a delay after every
 	 * batch i.e. much more frequent than a delay when waiting for the
 	 * interrupt (with the same net latency).
+	 *
+	 * Also note that to prevent whole machine hangs on gen7, we have to
+	 * take the spinlock to guard against concurrent cacheline access.
 	 */
-	struct drm_i915_private *dev_priv = engine->dev->dev_private;
+	spin_lock_irq(&dev_priv->uncore.lock);
 	POSTING_READ_FW(RING_ACTHD(engine->mmio_base));
+	spin_unlock_irq(&dev_priv->uncore.lock);
 }
 
 static u32
@@ -2312,51 +2387,6 @@ void intel_cleanup_engine(struct intel_engine_cs *engine)
 	engine->dev = NULL;
 }
 
-static int ring_wait_for_space(struct intel_engine_cs *engine, int n)
-{
-	struct intel_ringbuffer *ringbuf = engine->buffer;
-	struct drm_i915_gem_request *request;
-	unsigned space;
-	int ret;
-
-	if (intel_ring_space(ringbuf) >= n)
-		return 0;
-
-	/* The whole point of reserving space is to not wait! */
-	WARN_ON(ringbuf->reserved_in_use);
-
-	list_for_each_entry(request, &engine->request_list, list) {
-		space = __intel_ring_space(request->postfix, ringbuf->tail,
-					   ringbuf->size);
-		if (space >= n)
-			break;
-	}
-
-	if (WARN_ON(&request->list == &engine->request_list))
-		return -ENOSPC;
-
-	ret = i915_wait_request(request);
-	if (ret)
-		return ret;
-
-	ringbuf->space = space;
-	return 0;
-}
-
-static void __wrap_ring_buffer(struct intel_ringbuffer *ringbuf)
-{
-	uint32_t __iomem *virt;
-	int rem = ringbuf->size - ringbuf->tail;
-
-	virt = ringbuf->virtual_start + ringbuf->tail;
-	rem /= 4;
-	while (rem--)
-		iowrite32(MI_NOOP, virt++);
-
-	ringbuf->tail = 0;
-	intel_ring_update_space(ringbuf);
-}
-
 int intel_engine_idle(struct intel_engine_cs *engine)
 {
 	struct drm_i915_gem_request *req;
@@ -2398,63 +2428,82 @@ int intel_ring_reserve_space(struct drm_i915_gem_request *request)
 
 void intel_ring_reserved_space_reserve(struct intel_ringbuffer *ringbuf, int size)
 {
-	WARN_ON(ringbuf->reserved_size);
-	WARN_ON(ringbuf->reserved_in_use);
-
+	GEM_BUG_ON(ringbuf->reserved_size);
 	ringbuf->reserved_size = size;
 }
 
 void intel_ring_reserved_space_cancel(struct intel_ringbuffer *ringbuf)
 {
-	WARN_ON(ringbuf->reserved_in_use);
-
+	GEM_BUG_ON(!ringbuf->reserved_size);
 	ringbuf->reserved_size = 0;
-	ringbuf->reserved_in_use = false;
 }
 
 void intel_ring_reserved_space_use(struct intel_ringbuffer *ringbuf)
 {
-	WARN_ON(ringbuf->reserved_in_use);
-
-	ringbuf->reserved_in_use = true;
-	ringbuf->reserved_tail = ringbuf->tail;
+	GEM_BUG_ON(!ringbuf->reserved_size);
+	ringbuf->reserved_size = 0;
 }
 
 void intel_ring_reserved_space_end(struct intel_ringbuffer *ringbuf)
 {
-	WARN_ON(!ringbuf->reserved_in_use);
-	if (ringbuf->tail > ringbuf->reserved_tail) {
-		WARN(ringbuf->tail > ringbuf->reserved_tail + ringbuf->reserved_size,
-		     "request reserved size too small: %d vs %d!\n",
-		     ringbuf->tail - ringbuf->reserved_tail, ringbuf->reserved_size);
-	} else {
+	GEM_BUG_ON(ringbuf->reserved_size);
+}
+
+static int wait_for_space(struct drm_i915_gem_request *req, int bytes)
+{
+	struct intel_ringbuffer *ringbuf = req->ringbuf;
+	struct intel_engine_cs *engine = req->engine;
+	struct drm_i915_gem_request *target;
+
+	intel_ring_update_space(ringbuf);
+	if (ringbuf->space >= bytes)
+		return 0;
+
+	/*
+	 * Space is reserved in the ringbuffer for finalising the request,
+	 * as that cannot be allowed to fail. During request finalisation,
+	 * reserved_space is set to 0 to stop the overallocation and the
+	 * assumption is that then we never need to wait (which has the
+	 * risk of failing with EINTR).
+	 *
+	 * See also i915_gem_request_alloc() and i915_add_request().
+	 */
+	GEM_BUG_ON(!ringbuf->reserved_size);
+
+	list_for_each_entry(target, &engine->request_list, list) {
+		unsigned space;
+
 		/*
-		 * The ring was wrapped while the reserved space was in use.
-		 * That means that some unknown amount of the ring tail was
-		 * no-op filled and skipped. Thus simply adding the ring size
-		 * to the tail and doing the above space check will not work.
-		 * Rather than attempt to track how much tail was skipped,
-		 * it is much simpler to say that also skipping the sanity
-		 * check every once in a while is not a big issue.
+		 * The request queue is per-engine, so can contain requests
+		 * from multiple ringbuffers. Here, we must ignore any that
+		 * aren't from the ringbuffer we're considering.
 		 */
+		if (target->ringbuf != ringbuf)
+			continue;
+
+		/* Would completion of this request free enough space? */
+		space = __intel_ring_space(target->postfix, ringbuf->tail,
+					   ringbuf->size);
+		if (space >= bytes)
+			break;
 	}
 
-	ringbuf->reserved_size = 0;
-	ringbuf->reserved_in_use = false;
+	if (WARN_ON(&target->list == &engine->request_list))
+		return -ENOSPC;
+
+	return i915_wait_request(target);
 }
 
-static int __intel_ring_prepare(struct intel_engine_cs *engine, int bytes)
+int intel_ring_begin(struct drm_i915_gem_request *req, int num_dwords)
 {
-	struct intel_ringbuffer *ringbuf = engine->buffer;
-	int remain_usable = ringbuf->effective_size - ringbuf->tail;
+	struct intel_ringbuffer *ringbuf = req->ringbuf;
 	int remain_actual = ringbuf->size - ringbuf->tail;
-	int ret, total_bytes, wait_bytes = 0;
+	int remain_usable = ringbuf->effective_size - ringbuf->tail;
+	int bytes = num_dwords * sizeof(u32);
+	int total_bytes, wait_bytes;
 	bool need_wrap = false;
 
-	if (ringbuf->reserved_in_use)
-		total_bytes = bytes;
-	else
-		total_bytes = bytes + ringbuf->reserved_size;
+	total_bytes = bytes + ringbuf->reserved_size;
 
 	if (unlikely(bytes > remain_usable)) {
 		/*
@@ -2463,44 +2512,42 @@ static int __intel_ring_prepare(struct intel_engine_cs *engine, int bytes)
 		 */
 		wait_bytes = remain_actual + total_bytes;
 		need_wrap = true;
+	} else if (unlikely(total_bytes > remain_usable)) {
+		/*
+		 * The base request will fit but the reserved space
+		 * falls off the end. So we don't need an immediate wrap
+		 * and only need to effectively wait for the reserved
+		 * size space from the start of ringbuffer.
+		 */
+		wait_bytes = remain_actual + ringbuf->reserved_size;
 	} else {
-		if (unlikely(total_bytes > remain_usable)) {
-			/*
-			 * The base request will fit but the reserved space
-			 * falls off the end. So don't need an immediate wrap
-			 * and only need to effectively wait for the reserved
-			 * size space from the start of ringbuffer.
-			 */
-			wait_bytes = remain_actual + ringbuf->reserved_size;
-		} else if (total_bytes > ringbuf->space) {
-			/* No wrapping required, just waiting. */
-			wait_bytes = total_bytes;
-		}
+		/* No wrapping required, just waiting. */
+		wait_bytes = total_bytes;
 	}
 
-	if (wait_bytes) {
-		ret = ring_wait_for_space(engine, wait_bytes);
+	if (wait_bytes > ringbuf->space) {
+		int ret = wait_for_space(req, wait_bytes);
 		if (unlikely(ret))
 			return ret;
 
-		if (need_wrap)
-			__wrap_ring_buffer(ringbuf);
+		intel_ring_update_space(ringbuf);
+		if (unlikely(ringbuf->space < wait_bytes))
+			return -EAGAIN;
 	}
 
-	return 0;
-}
+	if (unlikely(need_wrap)) {
+		GEM_BUG_ON(remain_actual > ringbuf->space);
+		GEM_BUG_ON(ringbuf->tail + remain_actual > ringbuf->size);
 
-int intel_ring_begin(struct drm_i915_gem_request *req,
-		     int num_dwords)
-{
-	struct intel_engine_cs *engine = req->engine;
-	int ret;
-
-	ret = __intel_ring_prepare(engine, num_dwords * sizeof(uint32_t));
-	if (ret)
-		return ret;
+		/* Fill the tail with MI_NOOP */
+		memset(ringbuf->virtual_start + ringbuf->tail,
+		       0, remain_actual);
+		ringbuf->tail = 0;
+		ringbuf->space -= remain_actual;
+	}
 
-	engine->buffer->space -= num_dwords * sizeof(uint32_t);
+	ringbuf->space -= bytes;
+	GEM_BUG_ON(ringbuf->space < 0);
 	return 0;
 }
 
@@ -2772,6 +2819,7 @@ int intel_init_render_ring_buffer(struct drm_device *dev)
 	engine->name = "render ring";
 	engine->id = RCS;
 	engine->exec_id = I915_EXEC_RENDER;
+	engine->hw_id = 0;
 	engine->mmio_base = RENDER_RING_BASE;
 
 	if (INTEL_INFO(dev)->gen >= 8) {
@@ -2923,6 +2971,7 @@ int intel_init_bsd_ring_buffer(struct drm_device *dev)
 	engine->name = "bsd ring";
 	engine->id = VCS;
 	engine->exec_id = I915_EXEC_BSD;
+	engine->hw_id = 1;
 	engine->write_tail = ring_write_tail;
 
 	if (INTEL_INFO(dev)->gen >= 6) {
@@ -3001,6 +3050,7 @@ int intel_init_bsd2_ring_buffer(struct drm_device *dev)
 	engine->name = "bsd2 ring";
 	engine->id = VCS2;
 	engine->exec_id = I915_EXEC_BSD;
+	engine->hw_id = 4;
 	engine->write_tail = ring_write_tail;
 	engine->mmio_base = GEN8_BSD2_RING_BASE;
 
@@ -3033,6 +3083,7 @@ int intel_init_blt_ring_buffer(struct drm_device *dev)
 	engine->name = "blitter ring";
 	engine->id = BCS;
 	engine->exec_id = I915_EXEC_BLT;
+	engine->hw_id = 2;
 	engine->mmio_base = BLT_RING_BASE;
 	engine->write_tail = ring_write_tail;
 
@@ -3092,6 +3143,7 @@ int intel_init_vebox_ring_buffer(struct drm_device *dev)
 	engine->name = "video enhancement ring";
 	engine->id = VECS;
 	engine->exec_id = I915_EXEC_VEBOX;
+	engine->hw_id = 3;
 	engine->mmio_base = VEBOX_RING_BASE;
 	engine->write_tail = ring_write_tail;