diff options
Diffstat (limited to 'drivers/gpu/drm/i915/intel_lrc.c')
-rw-r--r-- | drivers/gpu/drm/i915/intel_lrc.c | 282 |
1 files changed, 202 insertions, 80 deletions
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 61cac26a8b05..fbfcf88d7fe3 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -208,8 +208,9 @@ /* Typical size of the average request (2 pipecontrols and a MI_BB) */ #define EXECLISTS_REQUEST_SIZE 64 /* bytes */ - #define WA_TAIL_DWORDS 2 +#define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS) +#define PREEMPT_ID 0x1 static int execlists_context_deferred_alloc(struct i915_gem_context *ctx, struct intel_engine_cs *engine); @@ -243,8 +244,7 @@ int intel_sanitize_enable_execlists(struct drm_i915_private *dev_priv, int enabl return 0; if (HAS_LOGICAL_RING_CONTEXTS(dev_priv) && - USES_PPGTT(dev_priv) && - i915_modparams.use_mmio_flip >= 0) + USES_PPGTT(dev_priv)) return 1; return 0; @@ -348,6 +348,43 @@ find_priolist: return ptr_pack_bits(p, first, 1); } +static void unwind_wa_tail(struct drm_i915_gem_request *rq) +{ + rq->tail = intel_ring_wrap(rq->ring, rq->wa_tail - WA_TAIL_BYTES); + assert_ring_tail_valid(rq->ring, rq->tail); +} + +static void unwind_incomplete_requests(struct intel_engine_cs *engine) +{ + struct drm_i915_gem_request *rq, *rn; + struct i915_priolist *uninitialized_var(p); + int last_prio = I915_PRIORITY_INVALID; + + lockdep_assert_held(&engine->timeline->lock); + + list_for_each_entry_safe_reverse(rq, rn, + &engine->timeline->requests, + link) { + if (i915_gem_request_completed(rq)) + return; + + __i915_gem_request_unsubmit(rq); + unwind_wa_tail(rq); + + GEM_BUG_ON(rq->priotree.priority == I915_PRIORITY_INVALID); + if (rq->priotree.priority != last_prio) { + p = lookup_priolist(engine, + &rq->priotree, + rq->priotree.priority); + p = ptr_mask_bits(p, 1); + + last_prio = rq->priotree.priority; + } + + list_add(&rq->priotree.link, &p->requests); + } +} + static inline void execlists_context_status_change(struct drm_i915_gem_request *rq, unsigned long status) @@ -392,6 +429,12 @@ static u64 execlists_update_context(struct drm_i915_gem_request *rq) return ce->lrc_desc; } +static inline void elsp_write(u64 desc, u32 __iomem *elsp) +{ + writel(upper_32_bits(desc), elsp); + writel(lower_32_bits(desc), elsp); +} + static void execlists_submit_ports(struct intel_engine_cs *engine) { struct execlist_port *port = engine->execlists.port; @@ -417,8 +460,7 @@ static void execlists_submit_ports(struct intel_engine_cs *engine) desc = 0; } - writel(upper_32_bits(desc), elsp); - writel(lower_32_bits(desc), elsp); + elsp_write(desc, elsp); } } @@ -451,26 +493,43 @@ static void port_assign(struct execlist_port *port, port_set(port, port_pack(i915_gem_request_get(rq), port_count(port))); } +static void inject_preempt_context(struct intel_engine_cs *engine) +{ + struct intel_context *ce = + &engine->i915->preempt_context->engine[engine->id]; + u32 __iomem *elsp = + engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine)); + unsigned int n; + + GEM_BUG_ON(engine->i915->preempt_context->hw_id != PREEMPT_ID); + GEM_BUG_ON(!IS_ALIGNED(ce->ring->size, WA_TAIL_BYTES)); + + memset(ce->ring->vaddr + ce->ring->tail, 0, WA_TAIL_BYTES); + ce->ring->tail += WA_TAIL_BYTES; + ce->ring->tail &= (ce->ring->size - 1); + ce->lrc_reg_state[CTX_RING_TAIL+1] = ce->ring->tail; + + for (n = execlists_num_ports(&engine->execlists); --n; ) + elsp_write(0, elsp); + + elsp_write(ce->lrc_desc, elsp); +} + +static bool can_preempt(struct intel_engine_cs *engine) +{ + return INTEL_INFO(engine->i915)->has_logical_ring_preemption; +} + static void execlists_dequeue(struct intel_engine_cs *engine) { - struct drm_i915_gem_request *last; struct intel_engine_execlists * const execlists = &engine->execlists; struct execlist_port *port = execlists->port; const struct execlist_port * const last_port = &execlists->port[execlists->port_mask]; + struct drm_i915_gem_request *last = port_request(port); struct rb_node *rb; bool submit = false; - last = port_request(port); - if (last) - /* WaIdleLiteRestore:bdw,skl - * Apply the wa NOOPs to prevent ring:HEAD == req:TAIL - * as we resubmit the request. See gen8_emit_breadcrumb() - * for where we prepare the padding after the end of the - * request. - */ - last->tail = last->wa_tail; - /* Hardware submission is through 2 ports. Conceptually each port * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is * static for a context, and unique to each, so we only execute @@ -495,7 +554,65 @@ static void execlists_dequeue(struct intel_engine_cs *engine) spin_lock_irq(&engine->timeline->lock); rb = execlists->first; GEM_BUG_ON(rb_first(&execlists->queue) != rb); - while (rb) { + if (!rb) + goto unlock; + + if (last) { + /* + * Don't resubmit or switch until all outstanding + * preemptions (lite-restore) are seen. Then we + * know the next preemption status we see corresponds + * to this ELSP update. + */ + if (port_count(&port[0]) > 1) + goto unlock; + + if (can_preempt(engine) && + rb_entry(rb, struct i915_priolist, node)->priority > + max(last->priotree.priority, 0)) { + /* + * Switch to our empty preempt context so + * the state of the GPU is known (idle). + */ + inject_preempt_context(engine); + execlists->preempt = true; + goto unlock; + } else { + /* + * In theory, we could coalesce more requests onto + * the second port (the first port is active, with + * no preemptions pending). However, that means we + * then have to deal with the possible lite-restore + * of the second port (as we submit the ELSP, there + * may be a context-switch) but also we may complete + * the resubmission before the context-switch. Ergo, + * coalescing onto the second port will cause a + * preemption event, but we cannot predict whether + * that will affect port[0] or port[1]. + * + * If the second port is already active, we can wait + * until the next context-switch before contemplating + * new requests. The GPU will be busy and we should be + * able to resubmit the new ELSP before it idles, + * avoiding pipeline bubbles (momentary pauses where + * the driver is unable to keep up the supply of new + * work). + */ + if (port_count(&port[1])) + goto unlock; + + /* WaIdleLiteRestore:bdw,skl + * Apply the wa NOOPs to prevent + * ring:HEAD == req:TAIL as we resubmit the + * request. See gen8_emit_breadcrumb() for + * where we prepare the padding after the + * end of the request. + */ + last->tail = last->wa_tail; + } + } + + do { struct i915_priolist *p = rb_entry(rb, typeof(*p), node); struct drm_i915_gem_request *rq, *rn; @@ -547,8 +664,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine) } INIT_LIST_HEAD(&rq->priotree.link); - rq->priotree.priority = INT_MAX; - __i915_gem_request_submit(rq); trace_i915_gem_request_in(rq, port_index(port, execlists)); last = rq; @@ -560,11 +675,12 @@ static void execlists_dequeue(struct intel_engine_cs *engine) INIT_LIST_HEAD(&p->requests); if (p->priority != I915_PRIORITY_NORMAL) kmem_cache_free(engine->i915->priorities, p); - } + } while (rb); done: execlists->first = rb; if (submit) port_assign(port, last); +unlock: spin_unlock_irq(&engine->timeline->lock); if (submit) @@ -575,12 +691,12 @@ static void execlist_cancel_port_requests(struct intel_engine_execlists *execlists) { struct execlist_port *port = execlists->port; - unsigned int num_ports = ARRAY_SIZE(execlists->port); + unsigned int num_ports = execlists_num_ports(execlists); while (num_ports-- && port_isset(port)) { struct drm_i915_gem_request *rq = port_request(port); - execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT); + execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_PREEMPTED); i915_gem_request_put(rq); memset(port, 0, sizeof(*port)); @@ -645,13 +761,6 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine) spin_unlock_irqrestore(&engine->timeline->lock, flags); } -static bool execlists_elsp_ready(const struct intel_engine_cs *engine) -{ - const struct execlist_port *port = engine->execlists.port; - - return port_count(&port[0]) + port_count(&port[1]) < 2; -} - /* * Check the unread Context Status Buffers and manage the submission of new * contexts to the ELSP accordingly. @@ -660,7 +769,7 @@ static void intel_lrc_irq_handler(unsigned long data) { struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; struct intel_engine_execlists * const execlists = &engine->execlists; - struct execlist_port *port = execlists->port; + struct execlist_port * const port = execlists->port; struct drm_i915_private *dev_priv = engine->i915; /* We can skip acquiring intel_runtime_pm_get() here as it was taken @@ -745,6 +854,23 @@ static void intel_lrc_irq_handler(unsigned long data) if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK)) continue; + if (status & GEN8_CTX_STATUS_ACTIVE_IDLE && + buf[2*head + 1] == PREEMPT_ID) { + execlist_cancel_port_requests(execlists); + + spin_lock_irq(&engine->timeline->lock); + unwind_incomplete_requests(engine); + spin_unlock_irq(&engine->timeline->lock); + + GEM_BUG_ON(!execlists->preempt); + execlists->preempt = false; + continue; + } + + if (status & GEN8_CTX_STATUS_PREEMPTED && + execlists->preempt) + continue; + /* Check the context/desc id for this event matches */ GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id); @@ -756,6 +882,7 @@ static void intel_lrc_irq_handler(unsigned long data) execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT); trace_i915_gem_request_out(rq); + rq->priotree.priority = INT_MAX; i915_gem_request_put(rq); execlists_port_complete(execlists, port); @@ -775,7 +902,7 @@ static void intel_lrc_irq_handler(unsigned long data) } } - if (execlists_elsp_ready(engine)) + if (!execlists->preempt) execlists_dequeue(engine); intel_uncore_forcewake_put(dev_priv, execlists->fw_domains); @@ -788,7 +915,7 @@ static void insert_request(struct intel_engine_cs *engine, struct i915_priolist *p = lookup_priolist(engine, pt, prio); list_add_tail(&pt->link, &ptr_mask_bits(p, 1)->requests); - if (ptr_unmask_bits(p, 1) && execlists_elsp_ready(engine)) + if (ptr_unmask_bits(p, 1)) tasklet_hi_schedule(&engine->execlists.irq_tasklet); } @@ -808,11 +935,15 @@ static void execlists_submit_request(struct drm_i915_gem_request *request) spin_unlock_irqrestore(&engine->timeline->lock, flags); } +static struct drm_i915_gem_request *pt_to_request(struct i915_priotree *pt) +{ + return container_of(pt, struct drm_i915_gem_request, priotree); +} + static struct intel_engine_cs * pt_lock_engine(struct i915_priotree *pt, struct intel_engine_cs *locked) { - struct intel_engine_cs *engine = - container_of(pt, struct drm_i915_gem_request, priotree)->engine; + struct intel_engine_cs *engine = pt_to_request(pt)->engine; GEM_BUG_ON(!locked); @@ -831,6 +962,8 @@ static void execlists_schedule(struct drm_i915_gem_request *request, int prio) struct i915_dependency stack; LIST_HEAD(dfs); + GEM_BUG_ON(prio == I915_PRIORITY_INVALID); + if (prio <= READ_ONCE(request->priotree.priority)) return; @@ -866,6 +999,9 @@ static void execlists_schedule(struct drm_i915_gem_request *request, int prio) * engines. */ list_for_each_entry(p, &pt->signalers_list, signal_link) { + if (i915_gem_request_completed(pt_to_request(p->signaler))) + continue; + GEM_BUG_ON(p->signaler->priority < pt->priority); if (prio > READ_ONCE(p->signaler->priority)) list_move_tail(&p->dfs_link, &dfs); @@ -879,7 +1015,7 @@ static void execlists_schedule(struct drm_i915_gem_request *request, int prio) * execlists_submit_request()), we can set our own priority and skip * acquiring the engine locks. */ - if (request->priotree.priority == INT_MIN) { + if (request->priotree.priority == I915_PRIORITY_INVALID) { GEM_BUG_ON(!list_empty(&request->priotree.link)); request->priotree.priority = prio; if (stack.dfs_link.next == stack.dfs_link.prev) @@ -909,8 +1045,6 @@ static void execlists_schedule(struct drm_i915_gem_request *request, int prio) } spin_unlock_irq(&engine->timeline->lock); - - /* XXX Do we need to preempt to make room for us and our deps? */ } static struct intel_ring * @@ -1106,6 +1240,8 @@ static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) i915_ggtt_offset(engine->scratch) + 2 * CACHELINE_BYTES); + *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + /* Pad to end of cacheline */ while ((unsigned long)batch % CACHELINE_BYTES) *batch++ = MI_NOOP; @@ -1119,26 +1255,10 @@ static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) return batch; } -/* - * This batch is started immediately after indirect_ctx batch. Since we ensure - * that indirect_ctx ends on a cacheline this batch is aligned automatically. - * - * The number of DWORDS written are returned using this field. - * - * This batch is terminated with MI_BATCH_BUFFER_END and so we need not add padding - * to align it with cacheline as padding after MI_BATCH_BUFFER_END is redundant. - */ -static u32 *gen8_init_perctx_bb(struct intel_engine_cs *engine, u32 *batch) -{ - /* WaDisableCtxRestoreArbitration:bdw,chv */ - *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; - *batch++ = MI_BATCH_BUFFER_END; - - return batch; -} - static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) { + *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; + /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */ batch = gen8_emit_flush_coherentl3_wa(engine, batch); @@ -1184,6 +1304,8 @@ static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) *batch++ = 0; } + *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + /* Pad to end of cacheline */ while ((unsigned long)batch % CACHELINE_BYTES) *batch++ = MI_NOOP; @@ -1251,7 +1373,7 @@ static int intel_init_workaround_bb(struct intel_engine_cs *engine) break; case 8: wa_bb_fn[0] = gen8_init_indirectctx_bb; - wa_bb_fn[1] = gen8_init_perctx_bb; + wa_bb_fn[1] = NULL; break; default: MISSING_CASE(INTEL_GEN(engine->i915)); @@ -1337,6 +1459,7 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine) GT_CONTEXT_SWITCH_INTERRUPT << engine->irq_shift); clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted); execlists->csb_head = -1; + execlists->preempt = false; /* After a GPU reset, we may have requests to replay */ if (!i915_modparams.enable_guc_submission && execlists->first) @@ -1382,7 +1505,6 @@ static void reset_common_ring(struct intel_engine_cs *engine, struct drm_i915_gem_request *request) { struct intel_engine_execlists * const execlists = &engine->execlists; - struct drm_i915_gem_request *rq, *rn; struct intel_context *ce; unsigned long flags; @@ -1400,21 +1522,7 @@ static void reset_common_ring(struct intel_engine_cs *engine, execlist_cancel_port_requests(execlists); /* Push back any incomplete requests for replay after the reset. */ - list_for_each_entry_safe_reverse(rq, rn, - &engine->timeline->requests, link) { - struct i915_priolist *p; - - if (i915_gem_request_completed(rq)) - break; - - __i915_gem_request_unsubmit(rq); - - p = lookup_priolist(engine, - &rq->priotree, - rq->priotree.priority); - list_add(&rq->priotree.link, - &ptr_mask_bits(p, 1)->requests); - } + unwind_incomplete_requests(engine); spin_unlock_irqrestore(&engine->timeline->lock, flags); @@ -1451,10 +1559,7 @@ static void reset_common_ring(struct intel_engine_cs *engine, intel_ring_update_space(request->ring); /* Reset WaIdleLiteRestore:bdw,skl as well */ - request->tail = - intel_ring_wrap(request->ring, - request->wa_tail - WA_TAIL_DWORDS*sizeof(u32)); - assert_ring_tail_valid(request->ring, request->tail); + unwind_wa_tail(request); } static int intel_logical_ring_emit_pdps(struct drm_i915_gem_request *req) @@ -1513,13 +1618,31 @@ static int gen8_emit_bb_start(struct drm_i915_gem_request *req, if (IS_ERR(cs)) return PTR_ERR(cs); + /* + * WaDisableCtxRestoreArbitration:bdw,chv + * + * We don't need to perform MI_ARB_ENABLE as often as we do (in + * particular all the gen that do not need the w/a at all!), if we + * took care to make sure that on every switch into this context + * (both ordinary and for preemption) that arbitrartion was enabled + * we would be fine. However, there doesn't seem to be a downside to + * being paranoid and making sure it is set before each batch and + * every context-switch. + * + * Note that if we fail to enable arbitration before the request + * is complete, then we do not see the context-switch interrupt and + * the engine hangs (with RING_HEAD == RING_TAIL). + * + * That satisfies both the GPGPU w/a and our heavy-handed paranoia. + */ + *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + /* FIXME(BDW): Address space and security selectors. */ *cs++ = MI_BATCH_BUFFER_START_GEN8 | (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)) | (flags & I915_DISPATCH_RS ? MI_BATCH_RESOURCE_STREAMER : 0); *cs++ = lower_32_bits(offset); *cs++ = upper_32_bits(offset); - *cs++ = MI_NOOP; intel_ring_advance(req, cs); return 0; @@ -1648,7 +1771,8 @@ static int gen8_emit_flush_render(struct drm_i915_gem_request *request, */ static void gen8_emit_wa_tail(struct drm_i915_gem_request *request, u32 *cs) { - *cs++ = MI_NOOP; + /* Ensure there's always at least one preemption point per-request. */ + *cs++ = MI_ARB_CHECK; *cs++ = MI_NOOP; request->wa_tail = intel_ring_offset(request, cs); } @@ -1669,7 +1793,6 @@ static void gen8_emit_breadcrumb(struct drm_i915_gem_request *request, u32 *cs) gen8_emit_wa_tail(request, cs); } - static const int gen8_emit_breadcrumb_sz = 6 + WA_TAIL_DWORDS; static void gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request, @@ -1697,7 +1820,6 @@ static void gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request, gen8_emit_wa_tail(request, cs); } - static const int gen8_emit_breadcrumb_render_sz = 8 + WA_TAIL_DWORDS; static int gen8_init_rcs_context(struct drm_i915_gem_request *req) |