Diffstat (limited to 'drivers/gpu/drm/i915/i915_request.c')
 -rw-r--r--  drivers/gpu/drm/i915/i915_request.c | 502
 1 file changed, 260 insertions(+), 242 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index bec9c3652188..89cccefeea63 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -29,6 +29,7 @@ #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/sched/signal.h> +#include <linux/sched/mm.h> #include "gem/i915_gem_context.h" #include "gt/intel_breadcrumbs.h" @@ -42,22 +43,17 @@ #include "i915_active.h" #include "i915_drv.h" -#include "i915_globals.h" #include "i915_trace.h" #include "intel_pm.h" struct execute_cb { struct irq_work work; struct i915_sw_fence *fence; - void (*hook)(struct i915_request *rq, struct dma_fence *signal); struct i915_request *signal; }; -static struct i915_global_request { - struct i915_global base; - struct kmem_cache *slab_requests; - struct kmem_cache *slab_execute_cbs; -} global; +static struct kmem_cache *slab_requests; +static struct kmem_cache *slab_execute_cbs; static const char *i915_fence_get_driver_name(struct dma_fence *fence) { @@ -108,13 +104,16 @@ static signed long i915_fence_wait(struct dma_fence *fence, struct kmem_cache *i915_request_slab_cache(void) { - return global.slab_requests; + return slab_requests; } static void i915_fence_release(struct dma_fence *fence) { struct i915_request *rq = to_request(fence); + GEM_BUG_ON(rq->guc_prio != GUC_PRIO_INIT && + rq->guc_prio != GUC_PRIO_FINI); + /* * The request is put onto a RCU freelist (i.e. the address * is immediately reused), mark the fences as being freed now. @@ -126,41 +125,19 @@ static void i915_fence_release(struct dma_fence *fence) i915_sw_fence_fini(&rq->semaphore); /* - * Keep one request on each engine for reserved use under mempressure - * - * We do not hold a reference to the engine here and so have to be - * very careful in what rq->engine we poke. The virtual engine is - * referenced via the rq->context and we released that ref during - * i915_request_retire(), ergo we must not dereference a virtual - * engine here. Not that we would want to, as the only consumer of - * the reserved engine->request_pool is the power management parking, - * which must-not-fail, and that is only run on the physical engines. - * - * Since the request must have been executed to be have completed, - * we know that it will have been processed by the HW and will - * not be unsubmitted again, so rq->engine and rq->execution_mask - * at this point is stable. rq->execution_mask will be a single - * bit if the last and _only_ engine it could execution on was a - * physical engine, if it's multiple bits then it started on and - * could still be on a virtual engine. Thus if the mask is not a - * power-of-two we assume that rq->engine may still be a virtual - * engine and so a dangling invalid pointer that we cannot dereference - * - * For example, consider the flow of a bonded request through a virtual - * engine. The request is created with a wide engine mask (all engines - * that we might execute on). On processing the bond, the request mask - * is reduced to one or more engines. If the request is subsequently - * bound to a single engine, it will then be constrained to only - * execute on that engine and never returned to the virtual engine - * after timeslicing away, see __unwind_incomplete_requests(). Thus we - * know that if the rq->execution_mask is a single bit, rq->engine - * can be a physical engine with the exact corresponding mask. 
+ * Keep one request on each engine for reserved use under mempressure, + * do not use with virtual engines as this really is only needed for + * kernel contexts. */ - if (is_power_of_2(rq->execution_mask) && - !cmpxchg(&rq->engine->request_pool, NULL, rq)) + if (!intel_engine_is_virtual(rq->engine) && + !cmpxchg(&rq->engine->request_pool, NULL, rq)) { + intel_context_put(rq->context); return; + } + + intel_context_put(rq->context); - kmem_cache_free(global.slab_requests, rq); + kmem_cache_free(slab_requests, rq); } const struct dma_fence_ops i915_fence_ops = { @@ -177,18 +154,7 @@ static void irq_execute_cb(struct irq_work *wrk) struct execute_cb *cb = container_of(wrk, typeof(*cb), work); i915_sw_fence_complete(cb->fence); - kmem_cache_free(global.slab_execute_cbs, cb); -} - -static void irq_execute_cb_hook(struct irq_work *wrk) -{ - struct execute_cb *cb = container_of(wrk, typeof(*cb), work); - - cb->hook(container_of(cb->fence, struct i915_request, submit), - &cb->signal->fence); - i915_request_put(cb->signal); - - irq_execute_cb(wrk); + kmem_cache_free(slab_execute_cbs, cb); } static __always_inline void @@ -216,7 +182,7 @@ static bool irq_work_imm(struct irq_work *wrk) return false; } -static void __notify_execute_cb_imm(struct i915_request *rq) +void i915_request_notify_execute_cb_imm(struct i915_request *rq) { __notify_execute_cb(rq, irq_work_imm); } @@ -272,11 +238,11 @@ i915_request_active_engine(struct i915_request *rq, * check that we have acquired the lock on the final engine. */ locked = READ_ONCE(rq->engine); - spin_lock_irq(&locked->active.lock); + spin_lock_irq(&locked->sched_engine->lock); while (unlikely(locked != (engine = READ_ONCE(rq->engine)))) { - spin_unlock(&locked->active.lock); + spin_unlock(&locked->sched_engine->lock); locked = engine; - spin_lock(&locked->active.lock); + spin_lock(&locked->sched_engine->lock); } if (i915_request_is_active(rq)) { @@ -285,42 +251,11 @@ i915_request_active_engine(struct i915_request *rq, ret = true; } - spin_unlock_irq(&locked->active.lock); + spin_unlock_irq(&locked->sched_engine->lock); return ret; } - -static void remove_from_engine(struct i915_request *rq) -{ - struct intel_engine_cs *engine, *locked; - - /* - * Virtual engines complicate acquiring the engine timeline lock, - * as their rq->engine pointer is not stable until under that - * engine lock. The simple ploy we use is to take the lock then - * check that the rq still belongs to the newly locked engine. - */ - locked = READ_ONCE(rq->engine); - spin_lock_irq(&locked->active.lock); - while (unlikely(locked != (engine = READ_ONCE(rq->engine)))) { - spin_unlock(&locked->active.lock); - spin_lock(&engine->active.lock); - locked = engine; - } - list_del_init(&rq->sched.link); - - clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); - clear_bit(I915_FENCE_FLAG_HOLD, &rq->fence.flags); - - /* Prevent further __await_execution() registering a cb, then flush */ - set_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags); - - spin_unlock_irq(&locked->active.lock); - - __notify_execute_cb_imm(rq); -} - static void __rq_init_watchdog(struct i915_request *rq) { rq->watchdog.timer.function = NULL; @@ -417,8 +352,7 @@ bool i915_request_retire(struct i915_request *rq) * after removing the breadcrumb and signaling it, so that we do not * inadvertently attach the breadcrumb to a completed request. 
*/ - if (!list_empty(&rq->sched.link)) - remove_from_engine(rq); + rq->engine->remove_active_request(rq); GEM_BUG_ON(!llist_empty(&rq->execute_cb)); __list_del_entry(&rq->link); /* poison neither prev/next (RCU walks) */ @@ -443,6 +377,7 @@ void i915_request_retire_upto(struct i915_request *rq) do { tmp = list_first_entry(&tl->requests, typeof(*tmp), link); + GEM_BUG_ON(!i915_request_completed(tmp)); } while (i915_request_retire(tmp) && tmp != rq); } @@ -517,19 +452,14 @@ static bool __request_in_flight(const struct i915_request *signal) static int __await_execution(struct i915_request *rq, struct i915_request *signal, - void (*hook)(struct i915_request *rq, - struct dma_fence *signal), gfp_t gfp) { struct execute_cb *cb; - if (i915_request_is_active(signal)) { - if (hook) - hook(rq, &signal->fence); + if (i915_request_is_active(signal)) return 0; - } - cb = kmem_cache_alloc(global.slab_execute_cbs, gfp); + cb = kmem_cache_alloc(slab_execute_cbs, gfp); if (!cb) return -ENOMEM; @@ -537,12 +467,6 @@ __await_execution(struct i915_request *rq, i915_sw_fence_await(cb->fence); init_irq_work(&cb->work, irq_execute_cb); - if (hook) { - cb->hook = hook; - cb->signal = i915_request_get(signal); - cb->work.func = irq_execute_cb_hook; - } - /* * Register the callback first, then see if the signaler is already * active. This ensures that if we race with the @@ -559,7 +483,7 @@ __await_execution(struct i915_request *rq, if (llist_add(&cb->work.node.llist, &signal->execute_cb)) { if (i915_request_is_active(signal) || __request_in_flight(signal)) - __notify_execute_cb_imm(signal); + i915_request_notify_execute_cb_imm(signal); } return 0; @@ -637,7 +561,7 @@ bool __i915_request_submit(struct i915_request *request) RQ_TRACE(request, "\n"); GEM_BUG_ON(!irqs_disabled()); - lockdep_assert_held(&engine->active.lock); + lockdep_assert_held(&engine->sched_engine->lock); /* * With the advent of preempt-to-busy, we frequently encounter @@ -649,7 +573,7 @@ bool __i915_request_submit(struct i915_request *request) * * We must remove the request from the caller's priority queue, * and the caller must only call us when the request is in their - * priority queue, under the active.lock. This ensures that the + * priority queue, under the sched_engine->lock. This ensures that the * request has *not* yet been retired and we can safely move * the request into the engine->active.list where it will be * dropped upon retiring. (Otherwise if resubmit a *retired* @@ -690,11 +614,15 @@ bool __i915_request_submit(struct i915_request *request) request->ring->vaddr + request->postfix); trace_i915_request_execute(request); - engine->serial++; + if (engine->bump_serial) + engine->bump_serial(engine); + else + engine->serial++; + result = true; GEM_BUG_ON(test_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags)); - list_move_tail(&request->sched.link, &engine->active.requests); + engine->add_active_request(request); active: clear_bit(I915_FENCE_FLAG_PQUEUE, &request->fence.flags); set_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags); @@ -724,11 +652,11 @@ void i915_request_submit(struct i915_request *request) unsigned long flags; /* Will be called from irq-context when using foreign fences. 
*/ - spin_lock_irqsave(&engine->active.lock, flags); + spin_lock_irqsave(&engine->sched_engine->lock, flags); __i915_request_submit(request); - spin_unlock_irqrestore(&engine->active.lock, flags); + spin_unlock_irqrestore(&engine->sched_engine->lock, flags); } void __i915_request_unsubmit(struct i915_request *request) @@ -742,7 +670,7 @@ void __i915_request_unsubmit(struct i915_request *request) RQ_TRACE(request, "\n"); GEM_BUG_ON(!irqs_disabled()); - lockdep_assert_held(&engine->active.lock); + lockdep_assert_held(&engine->sched_engine->lock); /* * Before we remove this breadcrumb from the signal list, we have @@ -775,23 +703,11 @@ void i915_request_unsubmit(struct i915_request *request) unsigned long flags; /* Will be called from irq-context when using foreign fences. */ - spin_lock_irqsave(&engine->active.lock, flags); + spin_lock_irqsave(&engine->sched_engine->lock, flags); __i915_request_unsubmit(request); - spin_unlock_irqrestore(&engine->active.lock, flags); -} - -static void __cancel_request(struct i915_request *rq) -{ - struct intel_engine_cs *engine = NULL; - - i915_request_active_engine(rq, &engine); - - if (engine && intel_engine_pulse(engine)) - intel_gt_handle_error(engine->gt, engine->mask, 0, - "request cancellation by %s", - current->comm); + spin_unlock_irqrestore(&engine->sched_engine->lock, flags); } void i915_request_cancel(struct i915_request *rq, int error) @@ -801,7 +717,7 @@ void i915_request_cancel(struct i915_request *rq, int error) set_bit(I915_FENCE_FLAG_SENTINEL, &rq->fence.flags); - __cancel_request(rq); + intel_context_cancel_request(rq->context, rq); } static int __i915_sw_fence_call @@ -889,7 +805,7 @@ request_alloc_slow(struct intel_timeline *tl, rq = list_first_entry(&tl->requests, typeof(*rq), link); i915_request_retire(rq); - rq = kmem_cache_alloc(global.slab_requests, + rq = kmem_cache_alloc(slab_requests, gfp | __GFP_RETRY_MAYFAIL | __GFP_NOWARN); if (rq) return rq; @@ -902,7 +818,7 @@ request_alloc_slow(struct intel_timeline *tl, retire_requests(tl); out: - return kmem_cache_alloc(global.slab_requests, gfp); + return kmem_cache_alloc(slab_requests, gfp); } static void __i915_request_ctor(void *arg) @@ -914,8 +830,6 @@ static void __i915_request_ctor(void *arg) i915_sw_fence_init(&rq->submit, submit_notify); i915_sw_fence_init(&rq->semaphore, semaphore_notify); - dma_fence_init(&rq->fence, &i915_fence_ops, &rq->lock, 0, 0); - rq->capture_list = NULL; init_llist_head(&rq->execute_cb); @@ -929,7 +843,7 @@ __i915_request_create(struct intel_context *ce, gfp_t gfp) u32 seqno; int ret; - might_sleep_if(gfpflags_allow_blocking(gfp)); + might_alloc(gfp); /* Check that the caller provided an already pinned context */ __intel_context_pin(ce); @@ -963,7 +877,7 @@ __i915_request_create(struct intel_context *ce, gfp_t gfp) * * Do not use kmem_cache_zalloc() here! */ - rq = kmem_cache_alloc(global.slab_requests, + rq = kmem_cache_alloc(slab_requests, gfp | __GFP_RETRY_MAYFAIL | __GFP_NOWARN); if (unlikely(!rq)) { rq = request_alloc_slow(tl, &ce->engine->request_pool, gfp); @@ -973,22 +887,29 @@ __i915_request_create(struct intel_context *ce, gfp_t gfp) } } - rq->context = ce; + /* + * Hold a reference to the intel_context over life of an i915_request. + * Without this an i915_request can exist after the context has been + * destroyed (e.g. request retired, context closed, but user space holds + * a reference to the request from an out fence). 
In the case of GuC + * submission + virtual engine, the engine that the request references + * is also destroyed which can trigger bad pointer dref in fence ops + * (e.g. i915_fence_get_driver_name). We could likely change these + * functions to avoid touching the engine but let's just be safe and + * hold the intel_context reference. In execlist mode the request always + * eventually points to a physical engine so this isn't an issue. + */ + rq->context = intel_context_get(ce); rq->engine = ce->engine; rq->ring = ce->ring; rq->execution_mask = ce->engine->mask; - kref_init(&rq->fence.refcount); - rq->fence.flags = 0; - rq->fence.error = 0; - INIT_LIST_HEAD(&rq->fence.cb_list); - ret = intel_timeline_get_seqno(tl, rq, &seqno); if (ret) goto err_free; - rq->fence.context = tl->fence_context; - rq->fence.seqno = seqno; + dma_fence_init(&rq->fence, &i915_fence_ops, &rq->lock, + tl->fence_context, seqno); RCU_INIT_POINTER(rq->timeline, tl); rq->hwsp_seqno = tl->hwsp_seqno; @@ -996,6 +917,8 @@ __i915_request_create(struct intel_context *ce, gfp_t gfp) rq->rcustate = get_state_synchronize_rcu(); /* acts as smp_mb() */ + rq->guc_prio = GUC_PRIO_INIT; + /* We bump the ref for the fence chain */ i915_sw_fence_reinit(&i915_request_get(rq)->submit); i915_sw_fence_reinit(&i915_request_get(rq)->semaphore); @@ -1050,7 +973,8 @@ err_unwind: GEM_BUG_ON(!list_empty(&rq->sched.waiters_list)); err_free: - kmem_cache_free(global.slab_requests, rq); + intel_context_put(ce); + kmem_cache_free(slab_requests, rq); err_unreserve: intel_context_unpin(ce); return ERR_PTR(ret); @@ -1176,12 +1100,12 @@ __emit_semaphore_wait(struct i915_request *to, struct i915_request *from, u32 seqno) { - const int has_token = INTEL_GEN(to->engine->i915) >= 12; + const int has_token = GRAPHICS_VER(to->engine->i915) >= 12; u32 hwsp_offset; int len, err; u32 *cs; - GEM_BUG_ON(INTEL_GEN(to->engine->i915) < 8); + GEM_BUG_ON(GRAPHICS_VER(to->engine->i915) < 8); GEM_BUG_ON(i915_request_has_initial_breadcrumb(to)); /* We need to pin the signaler's HWSP until we are finished reading. */ @@ -1222,6 +1146,12 @@ __emit_semaphore_wait(struct i915_request *to, return 0; } +static bool +can_use_semaphore_wait(struct i915_request *to, struct i915_request *from) +{ + return to->engine->gt->ggtt == from->engine->gt->ggtt; +} + static int emit_semaphore_wait(struct i915_request *to, struct i915_request *from, @@ -1230,6 +1160,9 @@ emit_semaphore_wait(struct i915_request *to, const intel_engine_mask_t mask = READ_ONCE(from->engine)->mask; struct i915_sw_fence *wait = &to->submit; + if (!can_use_semaphore_wait(to, from)) + goto await_fence; + if (!intel_context_use_semaphores(to->context)) goto await_fence; @@ -1253,7 +1186,7 @@ emit_semaphore_wait(struct i915_request *to, goto await_fence; /* Only submit our spinner after the signaler is running! 
*/ - if (__await_execution(to, from, NULL, gfp)) + if (__await_execution(to, from, gfp)) goto await_fence; if (__emit_semaphore_wait(to, from, from->fence.seqno)) @@ -1284,16 +1217,14 @@ static int intel_timeline_sync_set_start(struct intel_timeline *tl, static int __i915_request_await_execution(struct i915_request *to, - struct i915_request *from, - void (*hook)(struct i915_request *rq, - struct dma_fence *signal)) + struct i915_request *from) { int err; GEM_BUG_ON(intel_context_is_barrier(from->context)); /* Submit both requests at the same time */ - err = __await_execution(to, from, hook, I915_FENCE_GFP); + err = __await_execution(to, from, I915_FENCE_GFP); if (err) return err; @@ -1335,7 +1266,8 @@ __i915_request_await_execution(struct i915_request *to, * immediate execution, and so we must wait until it reaches the * active slot. */ - if (intel_engine_has_semaphores(to->engine) && + if (can_use_semaphore_wait(to, from) && + intel_engine_has_semaphores(to->engine) && !i915_request_has_initial_breadcrumb(to)) { err = __emit_semaphore_wait(to, from, from->fence.seqno - 1); if (err < 0) @@ -1343,7 +1275,7 @@ __i915_request_await_execution(struct i915_request *to, } /* Couple the dependency tree for PI on this exposed to->fence */ - if (to->engine->schedule) { + if (to->engine->sched_engine->schedule) { err = i915_sched_node_add_dependency(&to->sched, &from->sched, I915_DEPENDENCY_WEAK); @@ -1404,11 +1336,28 @@ i915_request_await_external(struct i915_request *rq, struct dma_fence *fence) return err; } +static inline bool is_parallel_rq(struct i915_request *rq) +{ + return intel_context_is_parallel(rq->context); +} + +static inline struct intel_context *request_to_parent(struct i915_request *rq) +{ + return intel_context_to_parent(rq->context); +} + +static bool is_same_parallel_context(struct i915_request *to, + struct i915_request *from) +{ + if (is_parallel_rq(to)) + return request_to_parent(to) == request_to_parent(from); + + return false; +} + int i915_request_await_execution(struct i915_request *rq, - struct dma_fence *fence, - void (*hook)(struct i915_request *rq, - struct dma_fence *signal)) + struct dma_fence *fence) { struct dma_fence **child = &fence; unsigned int nchild = 1; @@ -1426,10 +1375,8 @@ i915_request_await_execution(struct i915_request *rq, do { fence = *child++; - if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) { - i915_sw_fence_set_error_once(&rq->submit, fence->error); + if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) continue; - } if (fence->context == rq->fence.context) continue; @@ -1439,12 +1386,14 @@ i915_request_await_execution(struct i915_request *rq, * want to run our callback in all cases. 
*/ - if (dma_fence_is_i915(fence)) + if (dma_fence_is_i915(fence)) { + if (is_same_parallel_context(rq, to_request(fence))) + continue; ret = __i915_request_await_execution(rq, - to_request(fence), - hook); - else + to_request(fence)); + } else { ret = i915_request_await_external(rq, fence); + } if (ret < 0) return ret; } while (--nchild); @@ -1468,7 +1417,7 @@ await_request_submit(struct i915_request *to, struct i915_request *from) &from->submit, I915_FENCE_GFP); else - return __i915_request_await_execution(to, from, NULL); + return __i915_request_await_execution(to, from); } static int @@ -1484,7 +1433,7 @@ i915_request_await_request(struct i915_request *to, struct i915_request *from) return 0; } - if (to->engine->schedule) { + if (to->engine->sched_engine->schedule) { ret = i915_sched_node_add_dependency(&to->sched, &from->sched, I915_DEPENDENCY_EXTERNAL); @@ -1492,7 +1441,8 @@ i915_request_await_request(struct i915_request *to, struct i915_request *from) return ret; } - if (is_power_of_2(to->execution_mask | READ_ONCE(from->execution_mask))) + if (!intel_engine_uses_guc(to->engine) && + is_power_of_2(to->execution_mask | READ_ONCE(from->execution_mask))) ret = await_request_submit(to, from); else ret = emit_semaphore_wait(to, from, I915_FENCE_GFP); @@ -1527,10 +1477,8 @@ i915_request_await_dma_fence(struct i915_request *rq, struct dma_fence *fence) do { fence = *child++; - if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) { - i915_sw_fence_set_error_once(&rq->submit, fence->error); + if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags)) continue; - } /* * Requests on the same timeline are explicitly ordered, along @@ -1546,10 +1494,13 @@ i915_request_await_dma_fence(struct i915_request *rq, struct dma_fence *fence) fence)) continue; - if (dma_fence_is_i915(fence)) + if (dma_fence_is_i915(fence)) { + if (is_same_parallel_context(rq, to_request(fence))) + continue; ret = i915_request_await_request(rq, to_request(fence)); - else + } else { ret = i915_request_await_external(rq, fence); + } if (ret < 0) return ret; @@ -1587,83 +1538,77 @@ i915_request_await_object(struct i915_request *to, struct drm_i915_gem_object *obj, bool write) { - struct dma_fence *excl; + struct dma_resv_iter cursor; + struct dma_fence *fence; int ret = 0; - if (write) { - struct dma_fence **shared; - unsigned int count, i; - - ret = dma_resv_get_fences_rcu(obj->base.resv, - &excl, &count, &shared); + dma_resv_for_each_fence(&cursor, obj->base.resv, write, fence) { + ret = i915_request_await_dma_fence(to, fence); if (ret) - return ret; + break; + } - for (i = 0; i < count; i++) { - ret = i915_request_await_dma_fence(to, shared[i]); - if (ret) - break; + return ret; +} - dma_fence_put(shared[i]); - } +static struct i915_request * +__i915_request_ensure_parallel_ordering(struct i915_request *rq, + struct intel_timeline *timeline) +{ + struct i915_request *prev; - for (; i < count; i++) - dma_fence_put(shared[i]); - kfree(shared); - } else { - excl = dma_resv_get_excl_rcu(obj->base.resv); - } + GEM_BUG_ON(!is_parallel_rq(rq)); - if (excl) { - if (ret == 0) - ret = i915_request_await_dma_fence(to, excl); + prev = request_to_parent(rq)->parallel.last_rq; + if (prev) { + if (!__i915_request_is_complete(prev)) { + i915_sw_fence_await_sw_fence(&rq->submit, + &prev->submit, + &rq->submitq); - dma_fence_put(excl); + if (rq->engine->sched_engine->schedule) + __i915_sched_node_add_dependency(&rq->sched, + &prev->sched, + &rq->dep, + 0); + } + i915_request_put(prev); } - return ret; + 
request_to_parent(rq)->parallel.last_rq = i915_request_get(rq); + + return to_request(__i915_active_fence_set(&timeline->last_request, + &rq->fence)); } static struct i915_request * -__i915_request_add_to_timeline(struct i915_request *rq) +__i915_request_ensure_ordering(struct i915_request *rq, + struct intel_timeline *timeline) { - struct intel_timeline *timeline = i915_request_timeline(rq); struct i915_request *prev; - /* - * Dependency tracking and request ordering along the timeline - * is special cased so that we can eliminate redundant ordering - * operations while building the request (we know that the timeline - * itself is ordered, and here we guarantee it). - * - * As we know we will need to emit tracking along the timeline, - * we embed the hooks into our request struct -- at the cost of - * having to have specialised no-allocation interfaces (which will - * be beneficial elsewhere). - * - * A second benefit to open-coding i915_request_await_request is - * that we can apply a slight variant of the rules specialised - * for timelines that jump between engines (such as virtual engines). - * If we consider the case of virtual engine, we must emit a dma-fence - * to prevent scheduling of the second request until the first is - * complete (to maximise our greedy late load balancing) and this - * precludes optimising to use semaphores serialisation of a single - * timeline across engines. - */ + GEM_BUG_ON(is_parallel_rq(rq)); + prev = to_request(__i915_active_fence_set(&timeline->last_request, &rq->fence)); + if (prev && !__i915_request_is_complete(prev)) { + bool uses_guc = intel_engine_uses_guc(rq->engine); + bool pow2 = is_power_of_2(READ_ONCE(prev->engine)->mask | + rq->engine->mask); + bool same_context = prev->context == rq->context; + /* * The requests are supposed to be kept in order. However, * we need to be wary in case the timeline->last_request * is used as a barrier for external modification to this * context. */ - GEM_BUG_ON(prev->context == rq->context && + GEM_BUG_ON(same_context && i915_seqno_passed(prev->fence.seqno, rq->fence.seqno)); - if (is_power_of_2(READ_ONCE(prev->engine)->mask | rq->engine->mask)) + if ((same_context && uses_guc) || (!uses_guc && pow2)) i915_sw_fence_await_sw_fence(&rq->submit, &prev->submit, &rq->submitq); @@ -1671,13 +1616,57 @@ __i915_request_add_to_timeline(struct i915_request *rq) __i915_sw_fence_await_dma_fence(&rq->submit, &prev->fence, &rq->dmaq); - if (rq->engine->schedule) + if (rq->engine->sched_engine->schedule) __i915_sched_node_add_dependency(&rq->sched, &prev->sched, &rq->dep, 0); } + return prev; +} + +static struct i915_request * +__i915_request_add_to_timeline(struct i915_request *rq) +{ + struct intel_timeline *timeline = i915_request_timeline(rq); + struct i915_request *prev; + + /* + * Dependency tracking and request ordering along the timeline + * is special cased so that we can eliminate redundant ordering + * operations while building the request (we know that the timeline + * itself is ordered, and here we guarantee it). + * + * As we know we will need to emit tracking along the timeline, + * we embed the hooks into our request struct -- at the cost of + * having to have specialised no-allocation interfaces (which will + * be beneficial elsewhere). + * + * A second benefit to open-coding i915_request_await_request is + * that we can apply a slight variant of the rules specialised + * for timelines that jump between engines (such as virtual engines). 
+ * If we consider the case of virtual engine, we must emit a dma-fence + * to prevent scheduling of the second request until the first is + * complete (to maximise our greedy late load balancing) and this + * precludes optimising to use semaphores serialisation of a single + * timeline across engines. + * + * We do not order parallel submission requests on the timeline as each + * parallel submission context has its own timeline and the ordering + * rules for parallel requests are that they must be submitted in the + * order received from the execbuf IOCTL. So rather than using the + * timeline we store a pointer to last request submitted in the + * relationship in the gem context and insert a submission fence + * between that request and request passed into this function or + * alternatively we use completion fence if gem context has a single + * timeline and this is the first submission of an execbuf IOCTL. + */ + if (likely(!is_parallel_rq(rq))) + prev = __i915_request_ensure_ordering(rq, timeline); + else + prev = __i915_request_ensure_parallel_ordering(rq, timeline); + /* * Make sure that no request gazumped us - if it was allocated after * our i915_request_alloc() and called __i915_request_add() before @@ -1743,8 +1732,8 @@ void __i915_request_queue(struct i915_request *rq, * decide whether to preempt the entire chain so that it is ready to * run at the earliest possible convenience. */ - if (attr && rq->engine->schedule) - rq->engine->schedule(rq, attr); + if (attr && rq->engine->sched_engine->schedule) + rq->engine->sched_engine->schedule(rq, attr); local_bh_disable(); __i915_request_queue_bh(rq); @@ -1933,7 +1922,7 @@ long i915_request_wait(struct i915_request *rq, * completion. That requires having a good predictor for the request * duration, which we currently lack. 
*/ - if (IS_ACTIVE(CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT) && + if (CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT && __i915_spin_request(rq, state)) goto out; @@ -2104,31 +2093,61 @@ void i915_request_show(struct drm_printer *m, name); } -#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) -#include "selftests/mock_request.c" -#include "selftests/i915_request.c" -#endif +static bool engine_match_ring(struct intel_engine_cs *engine, struct i915_request *rq) +{ + u32 ring = ENGINE_READ(engine, RING_START); + + return ring == i915_ggtt_offset(rq->ring->vma); +} -static void i915_global_request_shrink(void) +static bool match_ring(struct i915_request *rq) { - kmem_cache_shrink(global.slab_execute_cbs); - kmem_cache_shrink(global.slab_requests); + struct intel_engine_cs *engine; + bool found; + int i; + + if (!intel_engine_is_virtual(rq->engine)) + return engine_match_ring(rq->engine, rq); + + found = false; + i = 0; + while ((engine = intel_engine_get_sibling(rq->engine, i++))) { + found = engine_match_ring(engine, rq); + if (found) + break; + } + + return found; } -static void i915_global_request_exit(void) +enum i915_request_state i915_test_request_state(struct i915_request *rq) { - kmem_cache_destroy(global.slab_execute_cbs); - kmem_cache_destroy(global.slab_requests); + if (i915_request_completed(rq)) + return I915_REQUEST_COMPLETE; + + if (!i915_request_started(rq)) + return I915_REQUEST_PENDING; + + if (match_ring(rq)) + return I915_REQUEST_ACTIVE; + + return I915_REQUEST_QUEUED; } -static struct i915_global_request global = { { - .shrink = i915_global_request_shrink, - .exit = i915_global_request_exit, -} }; +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) +#include "selftests/mock_request.c" +#include "selftests/i915_request.c" +#endif + +void i915_request_module_exit(void) +{ + kmem_cache_destroy(slab_execute_cbs); + kmem_cache_destroy(slab_requests); +} -int __init i915_global_request_init(void) +int __init i915_request_module_init(void) { - global.slab_requests = + slab_requests = kmem_cache_create("i915_request", sizeof(struct i915_request), __alignof__(struct i915_request), @@ -2136,20 +2155,19 @@ int __init i915_global_request_init(void) SLAB_RECLAIM_ACCOUNT | SLAB_TYPESAFE_BY_RCU, __i915_request_ctor); - if (!global.slab_requests) + if (!slab_requests) return -ENOMEM; - global.slab_execute_cbs = KMEM_CACHE(execute_cb, + slab_execute_cbs = KMEM_CACHE(execute_cb, SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | SLAB_TYPESAFE_BY_RCU); - if (!global.slab_execute_cbs) + if (!slab_execute_cbs) goto err_requests; - i915_global_register(&global.base); return 0; err_requests: - kmem_cache_destroy(global.slab_requests); + kmem_cache_destroy(slab_requests); return -ENOMEM; } |
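
The headline structural change is the removal of the i915_globals layer: the request and execute_cb slabs become plain file-scope kmem_caches owned by i915_request_module_init() and i915_request_module_exit() (the real caches also pass SLAB_RECLAIM_ACCOUNT | SLAB_TYPESAFE_BY_RCU, as the hunk above shows). A minimal sketch of the same file-scope-cache pattern, using hypothetical names (my_obj, my_obj_cache, my_obj_ctor) rather than the i915 ones:

/* Sketch only: the file-scope kmem_cache pattern this commit adopts,
 * with hypothetical names; error handling trimmed to the essentials.
 */
#include <linux/module.h>
#include <linux/slab.h>

struct my_obj {
	unsigned long payload;
};

static struct kmem_cache *my_obj_cache;

/* Optional constructor: runs when an object's backing memory is first
 * set up, not on every kmem_cache_alloc().
 */
static void my_obj_ctor(void *arg)
{
	struct my_obj *obj = arg;

	obj->payload = 0;
}

static int __init my_obj_module_init(void)
{
	my_obj_cache = kmem_cache_create("my_obj",
					 sizeof(struct my_obj),
					 __alignof__(struct my_obj),
					 SLAB_HWCACHE_ALIGN,
					 my_obj_ctor);
	if (!my_obj_cache)
		return -ENOMEM;

	return 0;
}

static void __exit my_obj_module_exit(void)
{
	kmem_cache_destroy(my_obj_cache);
}

module_init(my_obj_module_init);
module_exit(my_obj_module_exit);
MODULE_LICENSE("GPL");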
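
i915_fence_release() still parks one spare request per engine for use under memory pressure, but the test switches from the old is_power_of_2(rq->execution_mask) heuristic to an explicit !intel_engine_is_virtual(rq->engine) check, and both paths now drop the request's intel_context reference. The spare slot itself is claimed with a lock-free cmpxchg(). A self-contained user-space analogue of that single-slot pool (hypothetical request/engine types, C11 atomics standing in for cmpxchg()):

/* User-space sketch of a single-slot emergency pool claimed with
 * compare-and-swap; hypothetical types, not kernel code.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct request {
	int id;
};

struct engine {
	_Atomic(struct request *) request_pool;	/* one spare request */
};

/* On final release, try to park the request for reuse; otherwise free it. */
static void request_release(struct engine *engine, struct request *rq)
{
	struct request *expected = NULL;

	if (atomic_compare_exchange_strong(&engine->request_pool,
					   &expected, rq))
		return;	/* parked: the slot was empty and now holds rq */

	free(rq);	/* slot already occupied, really free it */
}

/* Allocation slow path: steal the parked request if there is one. */
static struct request *request_alloc_slow(struct engine *engine)
{
	return atomic_exchange(&engine->request_pool, NULL);
}

int main(void)
{
	struct engine engine = { .request_pool = NULL };
	struct request *rq = malloc(sizeof(*rq));

	rq->id = 1;
	request_release(&engine, rq);		/* parked, not freed */

	struct request *spare = request_alloc_slow(&engine);
	printf("reclaimed spare request %d\n", spare ? spare->id : -1);
	free(spare);
	return 0;
}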
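
Several hunks replace engine->active.lock with engine->sched_engine->lock while keeping the locking dance seen in i915_request_active_engine(): a request on a virtual engine can migrate between physical engines, so rq->engine is only stable once you hold the lock of the engine it currently points at; the code therefore locks whatever it last read and retries until the pointer stops moving. A small user-space sketch of that generic "lock, then revalidate the owner" pattern (hypothetical owner/item types, pthreads in place of the engine spinlocks):

/* User-space sketch of "lock the current owner, then revalidate".
 * Assumes whoever re-points item->owner does so while holding the old
 * owner's lock, so the pointer cannot change once we hold that lock.
 * Hypothetical types; not kernel code.
 */
#include <pthread.h>
#include <stdatomic.h>

struct owner {
	pthread_mutex_t lock;
};

struct item {
	_Atomic(struct owner *) owner;	/* may be re-pointed concurrently */
	int payload;
};

/* Returns with the current owner's lock held; caller must unlock it. */
static struct owner *lock_current_owner(struct item *item)
{
	struct owner *locked, *now;

	locked = atomic_load(&item->owner);
	pthread_mutex_lock(&locked->lock);
	while ((now = atomic_load(&item->owner)) != locked) {
		/* Owner changed before we got the lock: chase it. */
		pthread_mutex_unlock(&locked->lock);
		locked = now;
		pthread_mutex_lock(&locked->lock);
	}
	return locked;
}

static void touch_item(struct item *item)
{
	struct owner *owner = lock_current_owner(item);

	item->payload++;	/* safe: owner is pinned while its lock is held */
	pthread_mutex_unlock(&owner->lock);
}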
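
__i915_request_create() now takes an intel_context reference that is only dropped in i915_fence_release(), per the new comment in the hunk above: with GuC submission a virtual engine can be destroyed while user space still holds the request's out-fence, and the fence ops would then chase a dangling rq->engine. The general shape, a child pinning its parent for its whole lifetime, sketched with the kernel's kref API and hypothetical parent/child types:

/* Sketch: child pins its parent for its whole lifetime, mirroring how a
 * request now holds an intel_context reference until i915_fence_release().
 * Hypothetical types; not the i915 implementation.
 */
#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/slab.h>

struct parent {
	struct kref ref;
};

static void parent_release(struct kref *ref)
{
	kfree(container_of(ref, struct parent, ref));
}

struct child {
	struct kref ref;
	struct parent *parent;	/* reference held for the child's lifetime */
};

static struct child *child_create(struct parent *p)
{
	struct child *c = kzalloc(sizeof(*c), GFP_KERNEL);

	if (!c)
		return NULL;

	kref_init(&c->ref);
	kref_get(&p->ref);	/* pin the parent... */
	c->parent = p;
	return c;
}

static void child_release(struct kref *ref)
{
	struct child *c = container_of(ref, struct child, ref);

	kref_put(&c->parent->ref, parent_release);	/* ...until the child is gone */
	kfree(c);
}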
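
Finally, the slab constructor no longer calls dma_fence_init() with a zeroed context/seqno that the create path patched up afterwards; each request's fence is now initialised once, directly with the timeline's fence_context and the freshly allocated seqno. For reference, a minimal custom dma_fence is set up the same way (hypothetical my_fence driver, not the i915 implementation):

/* Sketch: minimal custom dma_fence, initialised in one dma_fence_init()
 * call with its timeline context and seqno. Hypothetical names.
 */
#include <linux/dma-fence.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct my_fence {
	struct dma_fence base;
	spinlock_t lock;
};

static const char *my_fence_driver_name(struct dma_fence *f)
{
	return "my_driver";
}

static const char *my_fence_timeline_name(struct dma_fence *f)
{
	return "my_timeline";
}

static const struct dma_fence_ops my_fence_ops = {
	.get_driver_name = my_fence_driver_name,
	.get_timeline_name = my_fence_timeline_name,
};

/* context would typically come from dma_fence_context_alloc(1). */
static struct dma_fence *my_fence_create(u64 context, u64 seqno)
{
	struct my_fence *f = kzalloc(sizeof(*f), GFP_KERNEL);

	if (!f)
		return NULL;

	spin_lock_init(&f->lock);
	/* Refcount, flags, cb_list, context and seqno all set in one go. */
	dma_fence_init(&f->base, &my_fence_ops, &f->lock, context, seqno);
	return &f->base;
}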