Diffstat (limited to 'drivers/gpu/drm/i915/intel_lrc.c')
-rw-r--r--	drivers/gpu/drm/i915/intel_lrc.c	742
1 file changed, 426 insertions(+), 316 deletions(-)
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index e8d3da9f3373..9b74ffae5f5a 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -188,6 +188,15 @@
 #define GEN8_CTX_FORCE_RESTORE (1<<2)
 #define GEN8_CTX_L3LLC_COHERENT (1<<5)
 #define GEN8_CTX_PRIVILEGE (1<<8)
+
+#define ASSIGN_CTX_PDP(ppgtt, reg_state, n) { \
+	const u64 _addr = test_bit(n, ppgtt->pdp.used_pdpes) ? \
+		ppgtt->pdp.page_directory[n]->daddr : \
+		ppgtt->scratch_pd->daddr; \
+	reg_state[CTX_PDP ## n ## _UDW+1] = upper_32_bits(_addr); \
+	reg_state[CTX_PDP ## n ## _LDW+1] = lower_32_bits(_addr); \
+}
+
 enum {
 	ADVANCED_CONTEXT = 0,
 	LEGACY_CONTEXT,
@@ -254,8 +263,10 @@ u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj)
 	return lrca >> 12;
 }
 
-static uint64_t execlists_ctx_descriptor(struct drm_i915_gem_object *ctx_obj)
+static uint64_t execlists_ctx_descriptor(struct intel_engine_cs *ring,
+					 struct drm_i915_gem_object *ctx_obj)
 {
+	struct drm_device *dev = ring->dev;
 	uint64_t desc;
 	uint64_t lrca = i915_gem_obj_ggtt_offset(ctx_obj);
 
@@ -263,7 +274,8 @@ static uint64_t execlists_ctx_descriptor(struct drm_i915_gem_object *ctx_obj)
 
 	desc = GEN8_CTX_VALID;
 	desc |= LEGACY_CONTEXT << GEN8_CTX_MODE_SHIFT;
-	desc |= GEN8_CTX_L3LLC_COHERENT;
+	if (IS_GEN8(ctx_obj->base.dev))
+		desc |= GEN8_CTX_L3LLC_COHERENT;
 	desc |= GEN8_CTX_PRIVILEGE;
 	desc |= lrca;
 	desc |= (u64)intel_execlists_ctx_id(ctx_obj) << GEN8_CTX_ID_SHIFT;
@@ -272,6 +284,13 @@ static uint64_t execlists_ctx_descriptor(struct drm_i915_gem_object *ctx_obj)
 	 * signalling between Command Streamers */
 	/* desc |= GEN8_CTX_FORCE_RESTORE; */
 
+	/* WaEnableForceRestoreInCtxtDescForVCS:skl */
+	if (IS_GEN9(dev) &&
+	    INTEL_REVID(dev) <= SKL_REVID_B0 &&
+	    (ring->id == BCS || ring->id == VCS ||
+	     ring->id == VECS || ring->id == VCS2))
+		desc |= GEN8_CTX_FORCE_RESTORE;
+
 	return desc;
 }
 
@@ -286,31 +305,34 @@ static void execlists_elsp_write(struct intel_engine_cs *ring,
 
 	/* XXX: You must always write both descriptors in the order below. */
 	if (ctx_obj1)
-		temp = execlists_ctx_descriptor(ctx_obj1);
+		temp = execlists_ctx_descriptor(ring, ctx_obj1);
 	else
 		temp = 0;
 	desc[1] = (u32)(temp >> 32);
 	desc[0] = (u32)temp;
 
-	temp = execlists_ctx_descriptor(ctx_obj0);
+	temp = execlists_ctx_descriptor(ring, ctx_obj0);
 	desc[3] = (u32)(temp >> 32);
 	desc[2] = (u32)temp;
 
-	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
-	I915_WRITE(RING_ELSP(ring), desc[1]);
-	I915_WRITE(RING_ELSP(ring), desc[0]);
-	I915_WRITE(RING_ELSP(ring), desc[3]);
+	spin_lock(&dev_priv->uncore.lock);
+	intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);
+	I915_WRITE_FW(RING_ELSP(ring), desc[1]);
+	I915_WRITE_FW(RING_ELSP(ring), desc[0]);
+	I915_WRITE_FW(RING_ELSP(ring), desc[3]);
 
 	/* The context is automatically loaded after the following */
-	I915_WRITE(RING_ELSP(ring), desc[2]);
+	I915_WRITE_FW(RING_ELSP(ring), desc[2]);
 
 	/* ELSP is a wo register, so use another nearby reg for posting instead */
-	POSTING_READ(RING_EXECLIST_STATUS(ring));
-	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
+	POSTING_READ_FW(RING_EXECLIST_STATUS(ring));
+	intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL);
+	spin_unlock(&dev_priv->uncore.lock);
 }
 
 static int execlists_update_context(struct drm_i915_gem_object *ctx_obj,
 				    struct drm_i915_gem_object *ring_obj,
+				    struct i915_hw_ppgtt *ppgtt,
 				    u32 tail)
 {
 	struct page *page;
@@ -322,6 +344,16 @@ static int execlists_update_context(struct drm_i915_gem_object *ctx_obj,
 	reg_state[CTX_RING_TAIL+1] = tail;
 	reg_state[CTX_RING_BUFFER_START+1] = i915_gem_obj_ggtt_offset(ring_obj);
 
+	/* True PPGTT with dynamic page allocation: update PDP registers and
+	 * point the unallocated PDPs to the scratch page
+	 */
+	if (ppgtt) {
+		ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
+		ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
+		ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
+		ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
+	}
+
 	kunmap_atomic(reg_state);
 
 	return 0;
@@ -340,7 +372,7 @@ static void execlists_submit_contexts(struct intel_engine_cs *ring,
 	WARN_ON(!i915_gem_obj_is_pinned(ctx_obj0));
 	WARN_ON(!i915_gem_obj_is_pinned(ringbuf0->obj));
 
-	execlists_update_context(ctx_obj0, ringbuf0->obj, tail0);
+	execlists_update_context(ctx_obj0, ringbuf0->obj, to0->ppgtt, tail0);
 
 	if (to1) {
 		ringbuf1 = to1->engine[ring->id].ringbuf;
@@ -349,7 +381,7 @@ static void execlists_submit_contexts(struct intel_engine_cs *ring,
 		WARN_ON(!i915_gem_obj_is_pinned(ctx_obj1));
 		WARN_ON(!i915_gem_obj_is_pinned(ringbuf1->obj));
 
-		execlists_update_context(ctx_obj1, ringbuf1->obj, tail1);
+		execlists_update_context(ctx_obj1, ringbuf1->obj, to1->ppgtt, tail1);
 	}
 
 	execlists_elsp_write(ring, ctx_obj0, ctx_obj1);
@@ -362,6 +394,12 @@ static void execlists_context_unqueue(struct intel_engine_cs *ring)
 
 	assert_spin_locked(&ring->execlist_lock);
 
+	/*
+	 * If irqs are not active generate a warning as batches that finish
+	 * without the irqs may get lost and a GPU Hang may occur.
+	 */
+	WARN_ON(!intel_irqs_enabled(ring->dev->dev_private));
+
 	if (list_empty(&ring->execlist_queue))
 		return;
 
@@ -384,6 +422,26 @@ static void execlists_context_unqueue(struct intel_engine_cs *ring)
 		}
 	}
 
+	if (IS_GEN8(ring->dev) || IS_GEN9(ring->dev)) {
+		/*
+		 * WaIdleLiteRestore: make sure we never cause a lite
+		 * restore with HEAD==TAIL
+		 */
+		if (req0->elsp_submitted) {
+			/*
+			 * Apply the wa NOOPS to prevent ring:HEAD == req:TAIL
+			 * as we resubmit the request. See gen8_emit_request()
+			 * for where we prepare the padding after the end of the
+			 * request.
+			 */
+			struct intel_ringbuffer *ringbuf;
+
+			ringbuf = req0->ctx->engine[ring->id].ringbuf;
+			req0->tail += 8;
+			req0->tail &= ringbuf->size - 1;
+		}
+	}
+
 	WARN_ON(req1 && req1->elsp_submitted);
 
 	execlists_submit_contexts(ring, req0->ctx, req0->tail,
@@ -491,8 +549,6 @@ static int execlists_context_queue(struct intel_engine_cs *ring,
 			   struct drm_i915_gem_request *request)
 {
 	struct drm_i915_gem_request *cursor;
-	struct drm_i915_private *dev_priv = ring->dev->dev_private;
-	unsigned long flags;
 	int num_elements = 0;
 
 	if (to != ring->default_context)
@@ -509,7 +565,6 @@ static int execlists_context_queue(struct intel_engine_cs *ring,
 		request->ring = ring;
 		request->ctx = to;
 		kref_init(&request->ref);
-		request->uniq = dev_priv->request_uniq++;
 		i915_gem_context_reference(request->ctx);
 	} else {
 		i915_gem_request_reference(request);
@@ -517,9 +572,7 @@ static int execlists_context_queue(struct intel_engine_cs *ring,
 	}
 	request->tail = tail;
 
-	intel_runtime_pm_get(dev_priv);
-
-	spin_lock_irqsave(&ring->execlist_lock, flags);
+	spin_lock_irq(&ring->execlist_lock);
 
 	list_for_each_entry(cursor, &ring->execlist_queue, execlist_link)
 		if (++num_elements > 2)
@@ -545,7 +598,7 @@ static int execlists_context_queue(struct intel_engine_cs *ring,
 	if (num_elements == 0)
 		execlists_context_unqueue(ring);
 
-	spin_unlock_irqrestore(&ring->execlist_lock, flags);
+	spin_unlock_irq(&ring->execlist_lock);
 
 	return 0;
 }
@@ -575,6 +628,7 @@ static int execlists_move_to_gpu(struct intel_ringbuffer *ringbuf,
 				 struct list_head *vmas)
 {
 	struct intel_engine_cs *ring = ringbuf->ring;
+	const unsigned other_rings = ~intel_ring_flag(ring);
 	struct i915_vma *vma;
 	uint32_t flush_domains = 0;
 	bool flush_chipset = false;
@@ -583,9 +637,11 @@ static int execlists_move_to_gpu(struct intel_ringbuffer *ringbuf,
 	list_for_each_entry(vma, vmas, exec_list) {
 		struct drm_i915_gem_object *obj = vma->obj;
 
-		ret = i915_gem_object_sync(obj, ring);
-		if (ret)
-			return ret;
+		if (obj->active & other_rings) {
+			ret = i915_gem_object_sync(obj, ring);
+			if (ret)
+				return ret;
+		}
 
 		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
 			flush_chipset |= i915_gem_clflush_object(obj, false);
@@ -602,6 +658,170 @@ static int execlists_move_to_gpu(struct intel_ringbuffer *ringbuf,
 	return logical_ring_invalidate_all_caches(ringbuf, ctx);
 }
 
+int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request,
+					    struct intel_context *ctx)
+{
+	int ret;
+
+	if (ctx != request->ring->default_context) {
+		ret = intel_lr_context_pin(request->ring, ctx);
+		if (ret)
+			return ret;
+	}
+
+	request->ringbuf = ctx->engine[request->ring->id].ringbuf;
+	request->ctx = ctx;
+	i915_gem_context_reference(request->ctx);
+
+	return 0;
+}
+
+static int logical_ring_wait_for_space(struct intel_ringbuffer *ringbuf,
+				       struct intel_context *ctx,
+				       int bytes)
+{
+	struct intel_engine_cs *ring = ringbuf->ring;
+	struct drm_i915_gem_request *request;
+	unsigned space;
+	int ret;
+
+	if (intel_ring_space(ringbuf) >= bytes)
+		return 0;
+
+	list_for_each_entry(request, &ring->request_list, list) {
+		/*
+		 * The request queue is per-engine, so can contain requests
+		 * from multiple ringbuffers. Here, we must ignore any that
+		 * aren't from the ringbuffer we're considering.
+		 */
+		if (request->ringbuf != ringbuf)
+			continue;
+
+		/* Would completion of this request free enough space? */
+		space = __intel_ring_space(request->postfix, ringbuf->tail,
+					   ringbuf->size);
+		if (space >= bytes)
+			break;
+	}
+
+	if (WARN_ON(&request->list == &ring->request_list))
+		return -ENOSPC;
+
+	ret = i915_wait_request(request);
+	if (ret)
+		return ret;
+
+	ringbuf->space = space;
+	return 0;
+}
+
+/*
+ * intel_logical_ring_advance_and_submit() - advance the tail and submit the workload
+ * @ringbuf: Logical Ringbuffer to advance.
+ *
+ * The tail is updated in our logical ringbuffer struct, not in the actual context. What
+ * really happens during submission is that the context and current tail will be placed
+ * on a queue waiting for the ELSP to be ready to accept a new context submission. At that
+ * point, the tail *inside* the context is updated and the ELSP written to.
+ */
+static void
+intel_logical_ring_advance_and_submit(struct intel_ringbuffer *ringbuf,
+				      struct intel_context *ctx,
+				      struct drm_i915_gem_request *request)
+{
+	struct intel_engine_cs *ring = ringbuf->ring;
+
+	intel_logical_ring_advance(ringbuf);
+
+	if (intel_ring_stopped(ring))
+		return;
+
+	execlists_context_queue(ring, ctx, ringbuf->tail, request);
+}
+
+static int logical_ring_wrap_buffer(struct intel_ringbuffer *ringbuf,
+				    struct intel_context *ctx)
+{
+	uint32_t __iomem *virt;
+	int rem = ringbuf->size - ringbuf->tail;
+
+	if (ringbuf->space < rem) {
+		int ret = logical_ring_wait_for_space(ringbuf, ctx, rem);
+
+		if (ret)
+			return ret;
+	}
+
+	virt = ringbuf->virtual_start + ringbuf->tail;
+	rem /= 4;
+	while (rem--)
+		iowrite32(MI_NOOP, virt++);
+
+	ringbuf->tail = 0;
+	intel_ring_update_space(ringbuf);
+
+	return 0;
+}
+
+static int logical_ring_prepare(struct intel_ringbuffer *ringbuf,
+				struct intel_context *ctx, int bytes)
+{
+	int ret;
+
+	if (unlikely(ringbuf->tail + bytes > ringbuf->effective_size)) {
+		ret = logical_ring_wrap_buffer(ringbuf, ctx);
+		if (unlikely(ret))
+			return ret;
+	}
+
+	if (unlikely(ringbuf->space < bytes)) {
+		ret = logical_ring_wait_for_space(ringbuf, ctx, bytes);
+		if (unlikely(ret))
+			return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * intel_logical_ring_begin() - prepare the logical ringbuffer to accept some commands
+ *
+ * @ringbuf: Logical ringbuffer.
+ * @num_dwords: number of DWORDs that we plan to write to the ringbuffer.
+ *
+ * The ringbuffer might not be ready to accept the commands right away (maybe it needs to
+ * be wrapped, or wait a bit for the tail to be updated). This function takes care of that
+ * and also preallocates a request (every workload submission is still mediated through
+ * requests, same as it did with legacy ringbuffer submission).
+ *
+ * Return: non-zero if the ringbuffer is not ready to be written to.
+ */
+static int intel_logical_ring_begin(struct intel_ringbuffer *ringbuf,
+				    struct intel_context *ctx, int num_dwords)
+{
+	struct intel_engine_cs *ring = ringbuf->ring;
+	struct drm_device *dev = ring->dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	int ret;
+
+	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
+				   dev_priv->mm.interruptible);
+	if (ret)
+		return ret;
+
+	ret = logical_ring_prepare(ringbuf, ctx, num_dwords * sizeof(uint32_t));
+	if (ret)
+		return ret;
+
+	/* Preallocate the olr before touching the ring */
+	ret = i915_gem_request_alloc(ring, ctx);
+	if (ret)
+		return ret;
+
+	ringbuf->space -= num_dwords * sizeof(uint32_t);
+	return 0;
+}
+
 /**
  * execlists_submission() - submit a batchbuffer for execution, Execlists style
  * @dev: DRM device.
@@ -612,7 +832,7 @@ static int execlists_move_to_gpu(struct intel_ringbuffer *ringbuf,
  * @vmas: list of vmas.
  * @batch_obj: the batchbuffer to submit.
  * @exec_start: batchbuffer start virtual address pointer.
- * @flags: translated execbuffer call flags.
+ * @dispatch_flags: translated execbuffer call flags.
  *
  * This is the evil twin version of i915_gem_ringbuffer_submission. It abstracts
  * away the submission details of the execbuffer ioctl call.
@@ -625,7 +845,7 @@ int intel_execlists_submission(struct drm_device *dev, struct drm_file *file,
 			       struct drm_i915_gem_execbuffer2 *args,
 			       struct list_head *vmas,
 			       struct drm_i915_gem_object *batch_obj,
-			       u64 exec_start, u32 flags)
+			       u64 exec_start, u32 dispatch_flags)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct intel_ringbuffer *ringbuf = ctx->engine[ring->id].ringbuf;
@@ -698,10 +918,12 @@ int intel_execlists_submission(struct drm_device *dev, struct drm_file *file,
 		dev_priv->relative_constants_mode = instp_mode;
 	}
 
-	ret = ring->emit_bb_start(ringbuf, ctx, exec_start, flags);
+	ret = ring->emit_bb_start(ringbuf, ctx, exec_start, dispatch_flags);
 	if (ret)
 		return ret;
 
+	trace_i915_gem_ring_dispatch(intel_ring_get_request(ring), dispatch_flags);
+
 	i915_gem_execbuffer_move_to_active(vmas, ring);
 	i915_gem_execbuffer_retire_commands(dev, file, ring, batch_obj);
 
@@ -711,8 +933,6 @@ int intel_execlists_submission(struct drm_device *dev, struct drm_file *file,
 void intel_execlists_retire_requests(struct intel_engine_cs *ring)
 {
 	struct drm_i915_gem_request *req, *tmp;
-	struct drm_i915_private *dev_priv = ring->dev->dev_private;
-	unsigned long flags;
 	struct list_head retired_list;
 
 	WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex));
@@ -720,9 +940,9 @@ void intel_execlists_retire_requests(struct intel_engine_cs *ring)
 		return;
 
 	INIT_LIST_HEAD(&retired_list);
-	spin_lock_irqsave(&ring->execlist_lock, flags);
+	spin_lock_irq(&ring->execlist_lock);
 	list_replace_init(&ring->execlist_retired_req_list, &retired_list);
-	spin_unlock_irqrestore(&ring->execlist_lock, flags);
+	spin_unlock_irq(&ring->execlist_lock);
 
 	list_for_each_entry_safe(req, tmp, &retired_list, execlist_link) {
 		struct intel_context *ctx = req->ctx;
@@ -731,7 +951,6 @@ void intel_execlists_retire_requests(struct intel_engine_cs *ring)
 
 		if (ctx_obj && (ctx != ring->default_context))
 			intel_lr_context_unpin(ring, ctx);
-		intel_runtime_pm_put(dev_priv);
 		list_del(&req->execlist_link);
 		i915_gem_request_unreference(req);
 	}
@@ -776,29 +995,6 @@ int logical_ring_flush_all_caches(struct intel_ringbuffer *ringbuf,
 	return 0;
 }
 
-/**
- * intel_logical_ring_advance_and_submit() - advance the tail and submit the workload
- * @ringbuf: Logical Ringbuffer to advance.
- *
- * The tail is updated in our logical ringbuffer struct, not in the actual context. What
- * really happens during submission is that the context and current tail will be placed
- * on a queue waiting for the ELSP to be ready to accept a new context submission. At that
- * point, the tail *inside* the context is updated and the ELSP written to.
- */
-void intel_logical_ring_advance_and_submit(struct intel_ringbuffer *ringbuf,
-					   struct intel_context *ctx,
-					   struct drm_i915_gem_request *request)
-{
-	struct intel_engine_cs *ring = ringbuf->ring;
-
-	intel_logical_ring_advance(ringbuf);
-
-	if (intel_ring_stopped(ring))
-		return;
-
-	execlists_context_queue(ring, ctx, ringbuf->tail, request);
-}
-
 static int intel_lr_context_pin(struct intel_engine_cs *ring,
 		struct intel_context *ctx)
 {
@@ -843,222 +1039,6 @@ void intel_lr_context_unpin(struct intel_engine_cs *ring,
 	}
 }
 
-static int logical_ring_alloc_request(struct intel_engine_cs *ring,
-				      struct intel_context *ctx)
-{
-	struct drm_i915_gem_request *request;
-	struct drm_i915_private *dev_private = ring->dev->dev_private;
-	int ret;
-
-	if (ring->outstanding_lazy_request)
-		return 0;
-
-	request = kzalloc(sizeof(*request), GFP_KERNEL);
-	if (request == NULL)
-		return -ENOMEM;
-
-	if (ctx != ring->default_context) {
-		ret = intel_lr_context_pin(ring, ctx);
-		if (ret) {
-			kfree(request);
-			return ret;
-		}
-	}
-
-	kref_init(&request->ref);
-	request->ring = ring;
-	request->uniq = dev_private->request_uniq++;
-
-	ret = i915_gem_get_seqno(ring->dev, &request->seqno);
-	if (ret) {
-		intel_lr_context_unpin(ring, ctx);
-		kfree(request);
-		return ret;
-	}
-
-	/* Hold a reference to the context this request belongs to
-	 * (we will need it when the time comes to emit/retire the
-	 * request).
-	 */
-	request->ctx = ctx;
-	i915_gem_context_reference(request->ctx);
-
-	ring->outstanding_lazy_request = request;
-	return 0;
-}
-
-static int logical_ring_wait_request(struct intel_ringbuffer *ringbuf,
-				     int bytes)
-{
-	struct intel_engine_cs *ring = ringbuf->ring;
-	struct drm_i915_gem_request *request;
-	int ret;
-
-	if (intel_ring_space(ringbuf) >= bytes)
-		return 0;
-
-	list_for_each_entry(request, &ring->request_list, list) {
-		/*
-		 * The request queue is per-engine, so can contain requests
-		 * from multiple ringbuffers. Here, we must ignore any that
-		 * aren't from the ringbuffer we're considering.
-		 */
-		struct intel_context *ctx = request->ctx;
-		if (ctx->engine[ring->id].ringbuf != ringbuf)
-			continue;
-
-		/* Would completion of this request free enough space? */
-		if (__intel_ring_space(request->tail, ringbuf->tail,
-				       ringbuf->size) >= bytes) {
-			break;
-		}
-	}
-
-	if (&request->list == &ring->request_list)
-		return -ENOSPC;
-
-	ret = i915_wait_request(request);
-	if (ret)
-		return ret;
-
-	i915_gem_retire_requests_ring(ring);
-
-	return intel_ring_space(ringbuf) >= bytes ? 0 : -ENOSPC;
-}
-
-static int logical_ring_wait_for_space(struct intel_ringbuffer *ringbuf,
-				       struct intel_context *ctx,
-				       int bytes)
-{
-	struct intel_engine_cs *ring = ringbuf->ring;
-	struct drm_device *dev = ring->dev;
-	struct drm_i915_private *dev_priv = dev->dev_private;
-	unsigned long end;
-	int ret;
-
-	ret = logical_ring_wait_request(ringbuf, bytes);
-	if (ret != -ENOSPC)
-		return ret;
-
-	/* Force the context submission in case we have been skipping it */
-	intel_logical_ring_advance_and_submit(ringbuf, ctx, NULL);
-
-	/* With GEM the hangcheck timer should kick us out of the loop,
-	 * leaving it early runs the risk of corrupting GEM state (due
-	 * to running on almost untested codepaths). But on resume
-	 * timers don't work yet, so prevent a complete hang in that
-	 * case by choosing an insanely large timeout. */
-	end = jiffies + 60 * HZ;
-
-	ret = 0;
-	do {
-		if (intel_ring_space(ringbuf) >= bytes)
-			break;
-
-		msleep(1);
-
-		if (dev_priv->mm.interruptible && signal_pending(current)) {
-			ret = -ERESTARTSYS;
-			break;
-		}
-
-		ret = i915_gem_check_wedge(&dev_priv->gpu_error,
-					   dev_priv->mm.interruptible);
-		if (ret)
-			break;
-
-		if (time_after(jiffies, end)) {
-			ret = -EBUSY;
-			break;
-		}
-	} while (1);
-
-	return ret;
-}
-
-static int logical_ring_wrap_buffer(struct intel_ringbuffer *ringbuf,
-				    struct intel_context *ctx)
-{
-	uint32_t __iomem *virt;
-	int rem = ringbuf->size - ringbuf->tail;
-
-	if (ringbuf->space < rem) {
-		int ret = logical_ring_wait_for_space(ringbuf, ctx, rem);
-
-		if (ret)
-			return ret;
-	}
-
-	virt = ringbuf->virtual_start + ringbuf->tail;
-	rem /= 4;
-	while (rem--)
-		iowrite32(MI_NOOP, virt++);
-
-	ringbuf->tail = 0;
-	intel_ring_update_space(ringbuf);
-
-	return 0;
-}
-
-static int logical_ring_prepare(struct intel_ringbuffer *ringbuf,
-				struct intel_context *ctx, int bytes)
-{
-	int ret;
-
-	if (unlikely(ringbuf->tail + bytes > ringbuf->effective_size)) {
-		ret = logical_ring_wrap_buffer(ringbuf, ctx);
-		if (unlikely(ret))
-			return ret;
-	}
-
-	if (unlikely(ringbuf->space < bytes)) {
-		ret = logical_ring_wait_for_space(ringbuf, ctx, bytes);
-		if (unlikely(ret))
-			return ret;
-	}
-
-	return 0;
-}
-
-/**
- * intel_logical_ring_begin() - prepare the logical ringbuffer to accept some commands
- *
- * @ringbuf: Logical ringbuffer.
- * @num_dwords: number of DWORDs that we plan to write to the ringbuffer.
- *
- * The ringbuffer might not be ready to accept the commands right away (maybe it needs to
- * be wrapped, or wait a bit for the tail to be updated). This function takes care of that
- * and also preallocates a request (every workload submission is still mediated through
- * requests, same as it did with legacy ringbuffer submission).
- *
- * Return: non-zero if the ringbuffer is not ready to be written to.
- */
-int intel_logical_ring_begin(struct intel_ringbuffer *ringbuf,
-			     struct intel_context *ctx, int num_dwords)
-{
-	struct intel_engine_cs *ring = ringbuf->ring;
-	struct drm_device *dev = ring->dev;
-	struct drm_i915_private *dev_priv = dev->dev_private;
-	int ret;
-
-	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
-				   dev_priv->mm.interruptible);
-	if (ret)
-		return ret;
-
-	ret = logical_ring_prepare(ringbuf, ctx, num_dwords * sizeof(uint32_t));
-	if (ret)
-		return ret;
-
-	/* Preallocate the olr before touching the ring */
-	ret = logical_ring_alloc_request(ring, ctx);
-	if (ret)
-		return ret;
-
-	ringbuf->space -= num_dwords * sizeof(uint32_t);
-	return 0;
-}
-
 static int intel_logical_ring_workarounds_emit(struct intel_engine_cs *ring,
 					       struct intel_context *ctx)
 {
@@ -1105,6 +1085,12 @@ static int gen8_init_common_ring(struct intel_engine_cs *ring)
 	I915_WRITE_IMR(ring, ~(ring->irq_enable_mask | ring->irq_keep_mask));
 	I915_WRITE(RING_HWSTAM(ring->mmio_base), 0xffffffff);
 
+	if (ring->status_page.obj) {
+		I915_WRITE(RING_HWS_PGA(ring->mmio_base),
+			   (u32)ring->status_page.gfx_addr);
+		POSTING_READ(RING_HWS_PGA(ring->mmio_base));
+	}
+
 	I915_WRITE(RING_MODE_GEN7(ring),
 		   _MASKED_BIT_DISABLE(GFX_REPLAY_MODE) |
 		   _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE));
@@ -1140,11 +1126,22 @@ static int gen8_init_render_ring(struct intel_engine_cs *ring)
 	return init_workarounds_ring(ring);
 }
 
+static int gen9_init_render_ring(struct intel_engine_cs *ring)
+{
+	int ret;
+
+	ret = gen8_init_common_ring(ring);
+	if (ret)
+		return ret;
+
+	return init_workarounds_ring(ring);
+}
+
 static int gen8_emit_bb_start(struct intel_ringbuffer *ringbuf,
 			      struct intel_context *ctx,
-			      u64 offset, unsigned flags)
+			      u64 offset, unsigned dispatch_flags)
 {
-	bool ppgtt = !(flags & I915_DISPATCH_SECURE);
+	bool ppgtt = !(dispatch_flags & I915_DISPATCH_SECURE);
 	int ret;
 
 	ret = intel_logical_ring_begin(ringbuf, ctx, 4);
@@ -1242,6 +1239,7 @@ static int gen8_emit_flush_render(struct intel_ringbuffer *ringbuf,
 {
 	struct intel_engine_cs *ring = ringbuf->ring;
 	u32 scratch_addr = ring->scratch.gtt_offset + 2 * CACHELINE_BYTES;
+	bool vf_flush_wa;
 	u32 flags = 0;
 	int ret;
 
@@ -1263,10 +1261,26 @@ static int gen8_emit_flush_render(struct intel_ringbuffer *ringbuf,
 		flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
 	}
 
-	ret = intel_logical_ring_begin(ringbuf, ctx, 6);
+	/*
+	 * On GEN9+ Before VF_CACHE_INVALIDATE we need to emit a NULL pipe
+	 * control.
+	 */
+	vf_flush_wa = INTEL_INFO(ring->dev)->gen >= 9 &&
+		      flags & PIPE_CONTROL_VF_CACHE_INVALIDATE;
+
+	ret = intel_logical_ring_begin(ringbuf, ctx, vf_flush_wa ? 12 : 6);
 	if (ret)
 		return ret;
 
+	if (vf_flush_wa) {
+		intel_logical_ring_emit(ringbuf, GFX_OP_PIPE_CONTROL(6));
+		intel_logical_ring_emit(ringbuf, 0);
+		intel_logical_ring_emit(ringbuf, 0);
+		intel_logical_ring_emit(ringbuf, 0);
+		intel_logical_ring_emit(ringbuf, 0);
+		intel_logical_ring_emit(ringbuf, 0);
+	}
+
 	intel_logical_ring_emit(ringbuf, GFX_OP_PIPE_CONTROL(6));
 	intel_logical_ring_emit(ringbuf, flags);
 	intel_logical_ring_emit(ringbuf, scratch_addr);
@@ -1295,7 +1309,12 @@ static int gen8_emit_request(struct intel_ringbuffer *ringbuf,
 	u32 cmd;
 	int ret;
 
-	ret = intel_logical_ring_begin(ringbuf, request->ctx, 6);
+	/*
+	 * Reserve space for 2 NOOPs at the end of each request to be
+	 * used as a workaround for not being allowed to do lite
+	 * restore with HEAD==TAIL (WaIdleLiteRestore).
+	 */
+	ret = intel_logical_ring_begin(ringbuf, request->ctx, 8);
 	if (ret)
 		return ret;
 
@@ -1313,9 +1332,50 @@ static int gen8_emit_request(struct intel_ringbuffer *ringbuf,
 	intel_logical_ring_emit(ringbuf, MI_NOOP);
 	intel_logical_ring_advance_and_submit(ringbuf, request->ctx, request);
 
+	/*
+	 * Here we add two extra NOOPs as padding to avoid
+	 * lite restore of a context with HEAD==TAIL.
+	 */
+	intel_logical_ring_emit(ringbuf, MI_NOOP);
+	intel_logical_ring_emit(ringbuf, MI_NOOP);
+	intel_logical_ring_advance(ringbuf);
+
 	return 0;
 }
 
+static int intel_lr_context_render_state_init(struct intel_engine_cs *ring,
+					      struct intel_context *ctx)
+{
+	struct intel_ringbuffer *ringbuf = ctx->engine[ring->id].ringbuf;
+	struct render_state so;
+	struct drm_i915_file_private *file_priv = ctx->file_priv;
+	struct drm_file *file = file_priv ? file_priv->file : NULL;
+	int ret;
+
+	ret = i915_gem_render_state_prepare(ring, &so);
+	if (ret)
+		return ret;
+
+	if (so.rodata == NULL)
+		return 0;
+
+	ret = ring->emit_bb_start(ringbuf,
+				  ctx,
+				  so.ggtt_offset,
+				  I915_DISPATCH_SECURE);
+	if (ret)
+		goto out;
+
+	i915_vma_move_to_active(i915_gem_obj_to_ggtt(so.obj), ring);
+
+	ret = __i915_add_request(ring, file, so.obj);
+	/* intel_logical_ring_add_request moves object to inactive if it
+	 * fails */
+out:
+	i915_gem_render_state_fini(&so);
+	return ret;
+}
+
 static int gen8_init_rcs_context(struct intel_engine_cs *ring,
 		       struct intel_context *ctx)
 {
@@ -1351,6 +1411,7 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *ring)
 		ring->cleanup(ring);
 
 	i915_cmd_parser_fini_ring(ring);
+	i915_gem_batch_pool_fini(&ring->batch_pool);
 
 	if (ring->status_page.obj) {
 		kunmap(sg_page(ring->status_page.obj->pages->sgl));
@@ -1368,6 +1429,7 @@ static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *rin
 	ring->dev = dev;
 	INIT_LIST_HEAD(&ring->active_list);
 	INIT_LIST_HEAD(&ring->request_list);
+	i915_gem_batch_pool_init(dev, &ring->batch_pool);
 	init_waitqueue_head(&ring->irq_queue);
 
 	INIT_LIST_HEAD(&ring->execlist_queue);
@@ -1399,7 +1461,10 @@ static int logical_render_ring_init(struct drm_device *dev)
 	if (HAS_L3_DPF(dev))
 		ring->irq_keep_mask |= GT_RENDER_L3_PARITY_ERROR_INTERRUPT;
 
-	ring->init_hw = gen8_init_render_ring;
+	if (INTEL_INFO(dev)->gen >= 9)
+		ring->init_hw = gen9_init_render_ring;
+	else
+		ring->init_hw = gen8_init_render_ring;
 	ring->init_context = gen8_init_rcs_context;
 	ring->cleanup = intel_fini_pipe_control;
 	ring->get_seqno = gen8_get_seqno;
@@ -1581,37 +1646,47 @@ cleanup_render_ring:
 	return ret;
 }
 
-int intel_lr_context_render_state_init(struct intel_engine_cs *ring,
-				       struct intel_context *ctx)
+static u32
+make_rpcs(struct drm_device *dev)
 {
-	struct intel_ringbuffer *ringbuf = ctx->engine[ring->id].ringbuf;
-	struct render_state so;
-	struct drm_i915_file_private *file_priv = ctx->file_priv;
-	struct drm_file *file = file_priv ? file_priv->file : NULL;
-	int ret;
-
-	ret = i915_gem_render_state_prepare(ring, &so);
-	if (ret)
-		return ret;
+	u32 rpcs = 0;
 
-	if (so.rodata == NULL)
+	/*
+	 * No explicit RPCS request is needed to ensure full
	 * slice/subslice/EU enablement prior to Gen9.
+	 */
+	if (INTEL_INFO(dev)->gen < 9)
 		return 0;
 
-	ret = ring->emit_bb_start(ringbuf,
-				  ctx,
-				  so.ggtt_offset,
-				  I915_DISPATCH_SECURE);
-	if (ret)
-		goto out;
+	/*
+	 * Starting in Gen9, render power gating can leave
+	 * slice/subslice/EU in a partially enabled state. We
+	 * must make an explicit request through RPCS for full
+	 * enablement.
+	 */
+	if (INTEL_INFO(dev)->has_slice_pg) {
+		rpcs |= GEN8_RPCS_S_CNT_ENABLE;
+		rpcs |= INTEL_INFO(dev)->slice_total <<
+			GEN8_RPCS_S_CNT_SHIFT;
+		rpcs |= GEN8_RPCS_ENABLE;
+	}
 
-	i915_vma_move_to_active(i915_gem_obj_to_ggtt(so.obj), ring);
+	if (INTEL_INFO(dev)->has_subslice_pg) {
+		rpcs |= GEN8_RPCS_SS_CNT_ENABLE;
+		rpcs |= INTEL_INFO(dev)->subslice_per_slice <<
+			GEN8_RPCS_SS_CNT_SHIFT;
+		rpcs |= GEN8_RPCS_ENABLE;
+	}
 
-	ret = __i915_add_request(ring, file, so.obj);
-	/* intel_logical_ring_add_request moves object to inactive if it
-	 * fails */
-out:
-	i915_gem_render_state_fini(&so);
-	return ret;
+	if (INTEL_INFO(dev)->has_eu_pg) {
+		rpcs |= INTEL_INFO(dev)->eu_per_subslice <<
+			GEN8_RPCS_EU_MIN_SHIFT;
+		rpcs |= INTEL_INFO(dev)->eu_per_subslice <<
+			GEN8_RPCS_EU_MAX_SHIFT;
+		rpcs |= GEN8_RPCS_ENABLE;
+	}
+
+	return rpcs;
 }
 
 static int
@@ -1659,7 +1734,8 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o
 	reg_state[CTX_LRI_HEADER_0] |= MI_LRI_FORCE_POSTED;
 	reg_state[CTX_CONTEXT_CONTROL] = RING_CONTEXT_CONTROL(ring);
 	reg_state[CTX_CONTEXT_CONTROL+1] =
-		_MASKED_BIT_ENABLE((1<<3) | MI_RESTORE_INHIBIT);
+		_MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
+				   CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
 	reg_state[CTX_RING_HEAD] = RING_HEAD(ring->mmio_base);
 	reg_state[CTX_RING_HEAD+1] = 0;
 	reg_state[CTX_RING_TAIL] = RING_TAIL(ring->mmio_base);
@@ -1706,18 +1782,18 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o
 	reg_state[CTX_PDP1_LDW] = GEN8_RING_PDP_LDW(ring, 1);
 	reg_state[CTX_PDP0_UDW] = GEN8_RING_PDP_UDW(ring, 0);
 	reg_state[CTX_PDP0_LDW] = GEN8_RING_PDP_LDW(ring, 0);
-	reg_state[CTX_PDP3_UDW+1] = upper_32_bits(ppgtt->pd_dma_addr[3]);
-	reg_state[CTX_PDP3_LDW+1] = lower_32_bits(ppgtt->pd_dma_addr[3]);
-	reg_state[CTX_PDP2_UDW+1] = upper_32_bits(ppgtt->pd_dma_addr[2]);
-	reg_state[CTX_PDP2_LDW+1] = lower_32_bits(ppgtt->pd_dma_addr[2]);
-	reg_state[CTX_PDP1_UDW+1] = upper_32_bits(ppgtt->pd_dma_addr[1]);
-	reg_state[CTX_PDP1_LDW+1] = lower_32_bits(ppgtt->pd_dma_addr[1]);
-	reg_state[CTX_PDP0_UDW+1] = upper_32_bits(ppgtt->pd_dma_addr[0]);
-	reg_state[CTX_PDP0_LDW+1] = lower_32_bits(ppgtt->pd_dma_addr[0]);
+
+	/* With dynamic page allocation, PDPs may not be allocated at this point,
+	 * Point the unallocated PDPs to the scratch page
+	 */
+	ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
+	ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
+	ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
+	ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
 	if (ring->id == RCS) {
 		reg_state[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1);
-		reg_state[CTX_R_PWR_CLK_STATE] = 0x20c8;
-		reg_state[CTX_R_PWR_CLK_STATE+1] = 0;
+		reg_state[CTX_R_PWR_CLK_STATE] = GEN8_R_PWR_CLK_STATE;
+		reg_state[CTX_R_PWR_CLK_STATE+1] = make_rpcs(dev);
 	}
 
 	kunmap_atomic(reg_state);
@@ -1830,11 +1906,10 @@ int intel_lr_context_deferred_create(struct intel_context *ctx,
 
 	context_size = round_up(get_lr_context_size(ring), 4096);
 
-	ctx_obj = i915_gem_alloc_context_obj(dev, context_size);
-	if (IS_ERR(ctx_obj)) {
-		ret = PTR_ERR(ctx_obj);
-		DRM_DEBUG_DRIVER("Alloc LRC backing obj failed: %d\n", ret);
-		return ret;
+	ctx_obj = i915_gem_alloc_object(dev, context_size);
+	if (!ctx_obj) {
+		DRM_DEBUG_DRIVER("Alloc LRC backing obj failed.\n");
+		return -ENOMEM;
 	}
 
 	if (is_global_default_ctx) {
@@ -1925,3 +2000,38 @@ error_unpin_ctx:
 		drm_gem_object_unreference(&ctx_obj->base);
 	return ret;
 }
+
+void intel_lr_context_reset(struct drm_device *dev,
+			struct intel_context *ctx)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct intel_engine_cs *ring;
+	int i;
+
+	for_each_ring(ring, dev_priv, i) {
+		struct drm_i915_gem_object *ctx_obj =
+				ctx->engine[ring->id].state;
+		struct intel_ringbuffer *ringbuf =
+				ctx->engine[ring->id].ringbuf;
+		uint32_t *reg_state;
+		struct page *page;
+
+		if (!ctx_obj)
+			continue;
+
+		if (i915_gem_object_get_pages(ctx_obj)) {
+			WARN(1, "Failed get_pages for context obj\n");
+			continue;
+		}
+		page = i915_gem_object_get_page(ctx_obj, 1);
+		reg_state = kmap_atomic(page);
+
+		reg_state[CTX_RING_HEAD+1] = 0;
+		reg_state[CTX_RING_TAIL+1] = 0;
+
+		kunmap_atomic(reg_state);
+
+		ringbuf->head = 0;
+		ringbuf->tail = 0;
+	}
+}