aboutsummaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/i915/intel_lrc.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/i915/intel_lrc.c')
-rw-r--r--drivers/gpu/drm/i915/intel_lrc.c282
1 files changed, 202 insertions, 80 deletions
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 61cac26a8b05..fbfcf88d7fe3 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -208,8 +208,9 @@
/* Typical size of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE 64 /* bytes */
-
#define WA_TAIL_DWORDS 2
+#define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
+#define PREEMPT_ID 0x1
static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
struct intel_engine_cs *engine);
@@ -243,8 +244,7 @@ int intel_sanitize_enable_execlists(struct drm_i915_private *dev_priv, int enabl
return 0;
if (HAS_LOGICAL_RING_CONTEXTS(dev_priv) &&
- USES_PPGTT(dev_priv) &&
- i915_modparams.use_mmio_flip >= 0)
+ USES_PPGTT(dev_priv))
return 1;
return 0;
@@ -348,6 +348,43 @@ find_priolist:
return ptr_pack_bits(p, first, 1);
}
+static void unwind_wa_tail(struct drm_i915_gem_request *rq)
+{
+ rq->tail = intel_ring_wrap(rq->ring, rq->wa_tail - WA_TAIL_BYTES);
+ assert_ring_tail_valid(rq->ring, rq->tail);
+}
+
+static void unwind_incomplete_requests(struct intel_engine_cs *engine)
+{
+ struct drm_i915_gem_request *rq, *rn;
+ struct i915_priolist *uninitialized_var(p);
+ int last_prio = I915_PRIORITY_INVALID;
+
+ lockdep_assert_held(&engine->timeline->lock);
+
+ list_for_each_entry_safe_reverse(rq, rn,
+ &engine->timeline->requests,
+ link) {
+ if (i915_gem_request_completed(rq))
+ return;
+
+ __i915_gem_request_unsubmit(rq);
+ unwind_wa_tail(rq);
+
+ GEM_BUG_ON(rq->priotree.priority == I915_PRIORITY_INVALID);
+ if (rq->priotree.priority != last_prio) {
+ p = lookup_priolist(engine,
+ &rq->priotree,
+ rq->priotree.priority);
+ p = ptr_mask_bits(p, 1);
+
+ last_prio = rq->priotree.priority;
+ }
+
+ list_add(&rq->priotree.link, &p->requests);
+ }
+}
+
static inline void
execlists_context_status_change(struct drm_i915_gem_request *rq,
unsigned long status)
@@ -392,6 +429,12 @@ static u64 execlists_update_context(struct drm_i915_gem_request *rq)
return ce->lrc_desc;
}
+static inline void elsp_write(u64 desc, u32 __iomem *elsp)
+{
+ writel(upper_32_bits(desc), elsp);
+ writel(lower_32_bits(desc), elsp);
+}
+
static void execlists_submit_ports(struct intel_engine_cs *engine)
{
struct execlist_port *port = engine->execlists.port;
@@ -417,8 +460,7 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
desc = 0;
}
- writel(upper_32_bits(desc), elsp);
- writel(lower_32_bits(desc), elsp);
+ elsp_write(desc, elsp);
}
}
@@ -451,26 +493,43 @@ static void port_assign(struct execlist_port *port,
port_set(port, port_pack(i915_gem_request_get(rq), port_count(port)));
}
+static void inject_preempt_context(struct intel_engine_cs *engine)
+{
+ struct intel_context *ce =
+ &engine->i915->preempt_context->engine[engine->id];
+ u32 __iomem *elsp =
+ engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
+ unsigned int n;
+
+ GEM_BUG_ON(engine->i915->preempt_context->hw_id != PREEMPT_ID);
+ GEM_BUG_ON(!IS_ALIGNED(ce->ring->size, WA_TAIL_BYTES));
+
+ memset(ce->ring->vaddr + ce->ring->tail, 0, WA_TAIL_BYTES);
+ ce->ring->tail += WA_TAIL_BYTES;
+ ce->ring->tail &= (ce->ring->size - 1);
+ ce->lrc_reg_state[CTX_RING_TAIL+1] = ce->ring->tail;
+
+ for (n = execlists_num_ports(&engine->execlists); --n; )
+ elsp_write(0, elsp);
+
+ elsp_write(ce->lrc_desc, elsp);
+}
+
+static bool can_preempt(struct intel_engine_cs *engine)
+{
+ return INTEL_INFO(engine->i915)->has_logical_ring_preemption;
+}
+
static void execlists_dequeue(struct intel_engine_cs *engine)
{
- struct drm_i915_gem_request *last;
struct intel_engine_execlists * const execlists = &engine->execlists;
struct execlist_port *port = execlists->port;
const struct execlist_port * const last_port =
&execlists->port[execlists->port_mask];
+ struct drm_i915_gem_request *last = port_request(port);
struct rb_node *rb;
bool submit = false;
- last = port_request(port);
- if (last)
- /* WaIdleLiteRestore:bdw,skl
- * Apply the wa NOOPs to prevent ring:HEAD == req:TAIL
- * as we resubmit the request. See gen8_emit_breadcrumb()
- * for where we prepare the padding after the end of the
- * request.
- */
- last->tail = last->wa_tail;
-
/* Hardware submission is through 2 ports. Conceptually each port
* has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
* static for a context, and unique to each, so we only execute
@@ -495,7 +554,65 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
spin_lock_irq(&engine->timeline->lock);
rb = execlists->first;
GEM_BUG_ON(rb_first(&execlists->queue) != rb);
- while (rb) {
+ if (!rb)
+ goto unlock;
+
+ if (last) {
+ /*
+ * Don't resubmit or switch until all outstanding
+ * preemptions (lite-restore) are seen. Then we
+ * know the next preemption status we see corresponds
+ * to this ELSP update.
+ */
+ if (port_count(&port[0]) > 1)
+ goto unlock;
+
+ if (can_preempt(engine) &&
+ rb_entry(rb, struct i915_priolist, node)->priority >
+ max(last->priotree.priority, 0)) {
+ /*
+ * Switch to our empty preempt context so
+ * the state of the GPU is known (idle).
+ */
+ inject_preempt_context(engine);
+ execlists->preempt = true;
+ goto unlock;
+ } else {
+ /*
+ * In theory, we could coalesce more requests onto
+ * the second port (the first port is active, with
+ * no preemptions pending). However, that means we
+ * then have to deal with the possible lite-restore
+ * of the second port (as we submit the ELSP, there
+ * may be a context-switch) but also we may complete
+ * the resubmission before the context-switch. Ergo,
+ * coalescing onto the second port will cause a
+ * preemption event, but we cannot predict whether
+ * that will affect port[0] or port[1].
+ *
+ * If the second port is already active, we can wait
+ * until the next context-switch before contemplating
+ * new requests. The GPU will be busy and we should be
+ * able to resubmit the new ELSP before it idles,
+ * avoiding pipeline bubbles (momentary pauses where
+ * the driver is unable to keep up the supply of new
+ * work).
+ */
+ if (port_count(&port[1]))
+ goto unlock;
+
+ /* WaIdleLiteRestore:bdw,skl
+ * Apply the wa NOOPs to prevent
+ * ring:HEAD == req:TAIL as we resubmit the
+ * request. See gen8_emit_breadcrumb() for
+ * where we prepare the padding after the
+ * end of the request.
+ */
+ last->tail = last->wa_tail;
+ }
+ }
+
+ do {
struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
struct drm_i915_gem_request *rq, *rn;
@@ -547,8 +664,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
}
INIT_LIST_HEAD(&rq->priotree.link);
- rq->priotree.priority = INT_MAX;
-
__i915_gem_request_submit(rq);
trace_i915_gem_request_in(rq, port_index(port, execlists));
last = rq;
@@ -560,11 +675,12 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
INIT_LIST_HEAD(&p->requests);
if (p->priority != I915_PRIORITY_NORMAL)
kmem_cache_free(engine->i915->priorities, p);
- }
+ } while (rb);
done:
execlists->first = rb;
if (submit)
port_assign(port, last);
+unlock:
spin_unlock_irq(&engine->timeline->lock);
if (submit)
@@ -575,12 +691,12 @@ static void
execlist_cancel_port_requests(struct intel_engine_execlists *execlists)
{
struct execlist_port *port = execlists->port;
- unsigned int num_ports = ARRAY_SIZE(execlists->port);
+ unsigned int num_ports = execlists_num_ports(execlists);
while (num_ports-- && port_isset(port)) {
struct drm_i915_gem_request *rq = port_request(port);
- execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
+ execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_PREEMPTED);
i915_gem_request_put(rq);
memset(port, 0, sizeof(*port));
@@ -645,13 +761,6 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
spin_unlock_irqrestore(&engine->timeline->lock, flags);
}
-static bool execlists_elsp_ready(const struct intel_engine_cs *engine)
-{
- const struct execlist_port *port = engine->execlists.port;
-
- return port_count(&port[0]) + port_count(&port[1]) < 2;
-}
-
/*
* Check the unread Context Status Buffers and manage the submission of new
* contexts to the ELSP accordingly.
@@ -660,7 +769,7 @@ static void intel_lrc_irq_handler(unsigned long data)
{
struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
struct intel_engine_execlists * const execlists = &engine->execlists;
- struct execlist_port *port = execlists->port;
+ struct execlist_port * const port = execlists->port;
struct drm_i915_private *dev_priv = engine->i915;
/* We can skip acquiring intel_runtime_pm_get() here as it was taken
@@ -745,6 +854,23 @@ static void intel_lrc_irq_handler(unsigned long data)
if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
continue;
+ if (status & GEN8_CTX_STATUS_ACTIVE_IDLE &&
+ buf[2*head + 1] == PREEMPT_ID) {
+ execlist_cancel_port_requests(execlists);
+
+ spin_lock_irq(&engine->timeline->lock);
+ unwind_incomplete_requests(engine);
+ spin_unlock_irq(&engine->timeline->lock);
+
+ GEM_BUG_ON(!execlists->preempt);
+ execlists->preempt = false;
+ continue;
+ }
+
+ if (status & GEN8_CTX_STATUS_PREEMPTED &&
+ execlists->preempt)
+ continue;
+
/* Check the context/desc id for this event matches */
GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id);
@@ -756,6 +882,7 @@ static void intel_lrc_irq_handler(unsigned long data)
execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
trace_i915_gem_request_out(rq);
+ rq->priotree.priority = INT_MAX;
i915_gem_request_put(rq);
execlists_port_complete(execlists, port);
@@ -775,7 +902,7 @@ static void intel_lrc_irq_handler(unsigned long data)
}
}
- if (execlists_elsp_ready(engine))
+ if (!execlists->preempt)
execlists_dequeue(engine);
intel_uncore_forcewake_put(dev_priv, execlists->fw_domains);
@@ -788,7 +915,7 @@ static void insert_request(struct intel_engine_cs *engine,
struct i915_priolist *p = lookup_priolist(engine, pt, prio);
list_add_tail(&pt->link, &ptr_mask_bits(p, 1)->requests);
- if (ptr_unmask_bits(p, 1) && execlists_elsp_ready(engine))
+ if (ptr_unmask_bits(p, 1))
tasklet_hi_schedule(&engine->execlists.irq_tasklet);
}
@@ -808,11 +935,15 @@ static void execlists_submit_request(struct drm_i915_gem_request *request)
spin_unlock_irqrestore(&engine->timeline->lock, flags);
}
+static struct drm_i915_gem_request *pt_to_request(struct i915_priotree *pt)
+{
+ return container_of(pt, struct drm_i915_gem_request, priotree);
+}
+
static struct intel_engine_cs *
pt_lock_engine(struct i915_priotree *pt, struct intel_engine_cs *locked)
{
- struct intel_engine_cs *engine =
- container_of(pt, struct drm_i915_gem_request, priotree)->engine;
+ struct intel_engine_cs *engine = pt_to_request(pt)->engine;
GEM_BUG_ON(!locked);
@@ -831,6 +962,8 @@ static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
struct i915_dependency stack;
LIST_HEAD(dfs);
+ GEM_BUG_ON(prio == I915_PRIORITY_INVALID);
+
if (prio <= READ_ONCE(request->priotree.priority))
return;
@@ -866,6 +999,9 @@ static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
* engines.
*/
list_for_each_entry(p, &pt->signalers_list, signal_link) {
+ if (i915_gem_request_completed(pt_to_request(p->signaler)))
+ continue;
+
GEM_BUG_ON(p->signaler->priority < pt->priority);
if (prio > READ_ONCE(p->signaler->priority))
list_move_tail(&p->dfs_link, &dfs);
@@ -879,7 +1015,7 @@ static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
* execlists_submit_request()), we can set our own priority and skip
* acquiring the engine locks.
*/
- if (request->priotree.priority == INT_MIN) {
+ if (request->priotree.priority == I915_PRIORITY_INVALID) {
GEM_BUG_ON(!list_empty(&request->priotree.link));
request->priotree.priority = prio;
if (stack.dfs_link.next == stack.dfs_link.prev)
@@ -909,8 +1045,6 @@ static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
}
spin_unlock_irq(&engine->timeline->lock);
-
- /* XXX Do we need to preempt to make room for us and our deps? */
}
static struct intel_ring *
@@ -1106,6 +1240,8 @@ static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
i915_ggtt_offset(engine->scratch) +
2 * CACHELINE_BYTES);
+ *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+
/* Pad to end of cacheline */
while ((unsigned long)batch % CACHELINE_BYTES)
*batch++ = MI_NOOP;
@@ -1119,26 +1255,10 @@ static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
return batch;
}
-/*
- * This batch is started immediately after indirect_ctx batch. Since we ensure
- * that indirect_ctx ends on a cacheline this batch is aligned automatically.
- *
- * The number of DWORDS written are returned using this field.
- *
- * This batch is terminated with MI_BATCH_BUFFER_END and so we need not add padding
- * to align it with cacheline as padding after MI_BATCH_BUFFER_END is redundant.
- */
-static u32 *gen8_init_perctx_bb(struct intel_engine_cs *engine, u32 *batch)
-{
- /* WaDisableCtxRestoreArbitration:bdw,chv */
- *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
- *batch++ = MI_BATCH_BUFFER_END;
-
- return batch;
-}
-
static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
+ *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
+
/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
batch = gen8_emit_flush_coherentl3_wa(engine, batch);
@@ -1184,6 +1304,8 @@ static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
*batch++ = 0;
}
+ *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+
/* Pad to end of cacheline */
while ((unsigned long)batch % CACHELINE_BYTES)
*batch++ = MI_NOOP;
@@ -1251,7 +1373,7 @@ static int intel_init_workaround_bb(struct intel_engine_cs *engine)
break;
case 8:
wa_bb_fn[0] = gen8_init_indirectctx_bb;
- wa_bb_fn[1] = gen8_init_perctx_bb;
+ wa_bb_fn[1] = NULL;
break;
default:
MISSING_CASE(INTEL_GEN(engine->i915));
@@ -1337,6 +1459,7 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
GT_CONTEXT_SWITCH_INTERRUPT << engine->irq_shift);
clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
execlists->csb_head = -1;
+ execlists->preempt = false;
/* After a GPU reset, we may have requests to replay */
if (!i915_modparams.enable_guc_submission && execlists->first)
@@ -1382,7 +1505,6 @@ static void reset_common_ring(struct intel_engine_cs *engine,
struct drm_i915_gem_request *request)
{
struct intel_engine_execlists * const execlists = &engine->execlists;
- struct drm_i915_gem_request *rq, *rn;
struct intel_context *ce;
unsigned long flags;
@@ -1400,21 +1522,7 @@ static void reset_common_ring(struct intel_engine_cs *engine,
execlist_cancel_port_requests(execlists);
/* Push back any incomplete requests for replay after the reset. */
- list_for_each_entry_safe_reverse(rq, rn,
- &engine->timeline->requests, link) {
- struct i915_priolist *p;
-
- if (i915_gem_request_completed(rq))
- break;
-
- __i915_gem_request_unsubmit(rq);
-
- p = lookup_priolist(engine,
- &rq->priotree,
- rq->priotree.priority);
- list_add(&rq->priotree.link,
- &ptr_mask_bits(p, 1)->requests);
- }
+ unwind_incomplete_requests(engine);
spin_unlock_irqrestore(&engine->timeline->lock, flags);
@@ -1451,10 +1559,7 @@ static void reset_common_ring(struct intel_engine_cs *engine,
intel_ring_update_space(request->ring);
/* Reset WaIdleLiteRestore:bdw,skl as well */
- request->tail =
- intel_ring_wrap(request->ring,
- request->wa_tail - WA_TAIL_DWORDS*sizeof(u32));
- assert_ring_tail_valid(request->ring, request->tail);
+ unwind_wa_tail(request);
}
static int intel_logical_ring_emit_pdps(struct drm_i915_gem_request *req)
@@ -1513,13 +1618,31 @@ static int gen8_emit_bb_start(struct drm_i915_gem_request *req,
if (IS_ERR(cs))
return PTR_ERR(cs);
+ /*
+ * WaDisableCtxRestoreArbitration:bdw,chv
+ *
+ * We don't need to perform MI_ARB_ENABLE as often as we do (in
+ * particular all the gen that do not need the w/a at all!), if we
+ * took care to make sure that on every switch into this context
+ * (both ordinary and for preemption) that arbitrartion was enabled
+ * we would be fine. However, there doesn't seem to be a downside to
+ * being paranoid and making sure it is set before each batch and
+ * every context-switch.
+ *
+ * Note that if we fail to enable arbitration before the request
+ * is complete, then we do not see the context-switch interrupt and
+ * the engine hangs (with RING_HEAD == RING_TAIL).
+ *
+ * That satisfies both the GPGPU w/a and our heavy-handed paranoia.
+ */
+ *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+
/* FIXME(BDW): Address space and security selectors. */
*cs++ = MI_BATCH_BUFFER_START_GEN8 |
(flags & I915_DISPATCH_SECURE ? 0 : BIT(8)) |
(flags & I915_DISPATCH_RS ? MI_BATCH_RESOURCE_STREAMER : 0);
*cs++ = lower_32_bits(offset);
*cs++ = upper_32_bits(offset);
- *cs++ = MI_NOOP;
intel_ring_advance(req, cs);
return 0;
@@ -1648,7 +1771,8 @@ static int gen8_emit_flush_render(struct drm_i915_gem_request *request,
*/
static void gen8_emit_wa_tail(struct drm_i915_gem_request *request, u32 *cs)
{
- *cs++ = MI_NOOP;
+ /* Ensure there's always at least one preemption point per-request. */
+ *cs++ = MI_ARB_CHECK;
*cs++ = MI_NOOP;
request->wa_tail = intel_ring_offset(request, cs);
}
@@ -1669,7 +1793,6 @@ static void gen8_emit_breadcrumb(struct drm_i915_gem_request *request, u32 *cs)
gen8_emit_wa_tail(request, cs);
}
-
static const int gen8_emit_breadcrumb_sz = 6 + WA_TAIL_DWORDS;
static void gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request,
@@ -1697,7 +1820,6 @@ static void gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request,
gen8_emit_wa_tail(request, cs);
}
-
static const int gen8_emit_breadcrumb_render_sz = 8 + WA_TAIL_DWORDS;
static int gen8_init_rcs_context(struct drm_i915_gem_request *req)