diff options
-rw-r--r-- | drivers/gpu/drm/i915/i915_drv.c | 13 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/i915_drv.h | 10 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/i915_gem.c | 5 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/i915_gpu_error.h | 3 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/i915_irq.c | 12 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/i915_request.c | 6 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 30 |
7 files changed, 47 insertions, 32 deletions
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index 7ce229c6f424..f770be18b2d7 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -1866,6 +1866,8 @@ static int i915_resume_switcheroo(struct drm_device *dev) /** * i915_reset - reset chip after a hang * @i915: #drm_i915_private to reset + * @stalled_mask: mask of the stalled engines with the guilty requests + * @reason: user error message for why we are resetting * * Reset the chip. Useful if a hang is detected. Marks the device as wedged * on failure. @@ -1880,7 +1882,9 @@ static int i915_resume_switcheroo(struct drm_device *dev) * - re-init interrupt state * - re-init display */ -void i915_reset(struct drm_i915_private *i915) +void i915_reset(struct drm_i915_private *i915, + unsigned int stalled_mask, + const char *reason) { struct i915_gpu_error *error = &i915->gpu_error; int ret; @@ -1899,9 +1903,8 @@ void i915_reset(struct drm_i915_private *i915) if (!i915_gem_unset_wedged(i915)) goto wakeup; - if (error->reason) - dev_notice(i915->drm.dev, - "Resetting chip for %s\n", error->reason); + if (reason) + dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason); error->reset_count++; disable_irq(i915->drm.irq); @@ -1944,7 +1947,7 @@ void i915_reset(struct drm_i915_private *i915) goto error; } - i915_gem_reset(i915); + i915_gem_reset(i915, stalled_mask); intel_overlay_reset(i915); /* diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 6b3f2f651def..9bca104c409e 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2701,8 +2701,11 @@ extern void i915_driver_unload(struct drm_device *dev); extern int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask); extern bool intel_has_gpu_reset(struct drm_i915_private *dev_priv); -extern void i915_reset(struct drm_i915_private *i915); -extern int i915_reset_engine(struct intel_engine_cs *engine, const char *msg); +extern void i915_reset(struct drm_i915_private *i915, + unsigned int stalled_mask, + const char *reason); +extern int i915_reset_engine(struct intel_engine_cs *engine, + const char *reason); extern bool intel_has_reset_engine(struct drm_i915_private *dev_priv); extern int intel_reset_guc(struct drm_i915_private *dev_priv); @@ -3126,7 +3129,8 @@ static inline u32 i915_reset_engine_count(struct i915_gpu_error *error, struct i915_request * i915_gem_reset_prepare_engine(struct intel_engine_cs *engine); int i915_gem_reset_prepare(struct drm_i915_private *dev_priv); -void i915_gem_reset(struct drm_i915_private *dev_priv); +void i915_gem_reset(struct drm_i915_private *dev_priv, + unsigned int stalled_mask); void i915_gem_reset_finish_engine(struct intel_engine_cs *engine); void i915_gem_reset_finish(struct drm_i915_private *dev_priv); void i915_gem_set_wedged(struct drm_i915_private *dev_priv); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 306d7a805eb7..28ab0beff86c 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -3213,7 +3213,8 @@ void i915_gem_reset_engine(struct intel_engine_cs *engine, engine->reset_hw(engine, request); } -void i915_gem_reset(struct drm_i915_private *dev_priv) +void i915_gem_reset(struct drm_i915_private *dev_priv, + unsigned int stalled_mask) { struct intel_engine_cs *engine; enum intel_engine_id id; @@ -3227,7 +3228,7 @@ void i915_gem_reset(struct drm_i915_private *dev_priv) i915_gem_reset_engine(engine, engine->hangcheck.active_request, - engine->hangcheck.stalled); + stalled_mask & ENGINE_MASK(id)); ctx = fetch_and_zero(&engine->last_retired_context); if (ctx) engine->context_unpin(engine, ctx); diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h index ac5760673cc9..c05b6034d718 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.h +++ b/drivers/gpu/drm/i915/i915_gpu_error.h @@ -269,6 +269,9 @@ struct i915_gpu_error { /** Number of times an engine has been reset */ u32 reset_engine_count[I915_NUM_ENGINES]; + /** Set of stalled engines with guilty requests, in the current reset */ + u32 stalled_mask; + /** Reason for the current *global* reset */ const char *reason; diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index c2f878ace0ea..b03d18561b55 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -2961,7 +2961,8 @@ static irqreturn_t gen11_irq_handler(int irq, void *arg) } static void i915_reset_device(struct drm_i915_private *dev_priv, - const char *msg) + u32 engine_mask, + const char *reason) { struct i915_gpu_error *error = &dev_priv->gpu_error; struct kobject *kobj = &dev_priv->drm.primary->kdev->kobj; @@ -2979,9 +2980,11 @@ static void i915_reset_device(struct drm_i915_private *dev_priv, i915_wedge_on_timeout(&w, dev_priv, 5*HZ) { intel_prepare_reset(dev_priv); - error->reason = msg; + error->reason = reason; + error->stalled_mask = engine_mask; /* Signal that locked waiters should reset the GPU */ + smp_mb__before_atomic(); set_bit(I915_RESET_HANDOFF, &error->flags); wake_up_all(&error->wait_queue); @@ -2990,7 +2993,7 @@ static void i915_reset_device(struct drm_i915_private *dev_priv, */ do { if (mutex_trylock(&dev_priv->drm.struct_mutex)) { - i915_reset(dev_priv); + i915_reset(dev_priv, engine_mask, reason); mutex_unlock(&dev_priv->drm.struct_mutex); } } while (wait_on_bit_timeout(&error->flags, @@ -2998,6 +3001,7 @@ static void i915_reset_device(struct drm_i915_private *dev_priv, TASK_UNINTERRUPTIBLE, 1)); + error->stalled_mask = 0; error->reason = NULL; intel_finish_reset(dev_priv); @@ -3122,7 +3126,7 @@ void i915_handle_error(struct drm_i915_private *dev_priv, TASK_UNINTERRUPTIBLE); } - i915_reset_device(dev_priv, msg); + i915_reset_device(dev_priv, engine_mask, msg); for_each_engine(engine, dev_priv, tmp) { clear_bit(I915_RESET_ENGINE + engine->id, diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index a9d0bde16443..629f3e860592 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -1185,11 +1185,13 @@ static bool __i915_spin_request(const struct i915_request *rq, static bool __i915_wait_request_check_and_reset(struct i915_request *request) { - if (likely(!i915_reset_handoff(&request->i915->gpu_error))) + struct i915_gpu_error *error = &request->i915->gpu_error; + + if (likely(!i915_reset_handoff(error))) return false; __set_current_state(TASK_RUNNING); - i915_reset(request->i915); + i915_reset(request->i915, error->stalled_mask, error->reason); return true; } diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c index acfb4dcc9fb5..24f913f26a7b 100644 --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c @@ -437,7 +437,7 @@ static int igt_global_reset(void *arg) mutex_lock(&i915->drm.struct_mutex); reset_count = i915_reset_count(&i915->gpu_error); - i915_reset(i915); + i915_reset(i915, ALL_ENGINES, NULL); if (i915_reset_count(&i915->gpu_error) == reset_count) { pr_err("No GPU reset recorded!\n"); @@ -881,17 +881,18 @@ static int igt_reset_engines(void *arg) return 0; } -static u32 fake_hangcheck(struct i915_request *rq) +static u32 fake_hangcheck(struct i915_request *rq, u32 mask) { - u32 reset_count; + struct i915_gpu_error *error = &rq->i915->gpu_error; + u32 reset_count = i915_reset_count(error); - rq->engine->hangcheck.stalled = true; - rq->engine->hangcheck.seqno = intel_engine_get_seqno(rq->engine); + error->stalled_mask = mask; - reset_count = i915_reset_count(&rq->i915->gpu_error); + /* set_bit() must be after we have setup the backchannel (mask) */ + smp_mb__before_atomic(); + set_bit(I915_RESET_HANDOFF, &error->flags); - set_bit(I915_RESET_HANDOFF, &rq->i915->gpu_error.flags); - wake_up_all(&rq->i915->gpu_error.wait_queue); + wake_up_all(&error->wait_queue); return reset_count; } @@ -939,7 +940,7 @@ static int igt_wait_reset(void *arg) goto out_rq; } - reset_count = fake_hangcheck(rq); + reset_count = fake_hangcheck(rq, ALL_ENGINES); timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10); if (timeout < 0) { @@ -1075,9 +1076,9 @@ static int igt_reset_queue(void *arg) goto fini; } - reset_count = fake_hangcheck(prev); + reset_count = fake_hangcheck(prev, ENGINE_MASK(id)); - i915_reset(i915); + i915_reset(i915, ENGINE_MASK(id), NULL); GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags)); @@ -1150,7 +1151,7 @@ static int igt_handle_error(void *arg) if (!intel_has_reset_engine(i915)) return 0; - if (!intel_engine_can_store_dword(i915->engine[RCS])) + if (!engine || !intel_engine_can_store_dword(engine)) return 0; mutex_lock(&i915->drm.struct_mutex); @@ -1186,10 +1187,7 @@ static int igt_handle_error(void *arg) /* Temporarily disable error capture */ error = xchg(&i915->gpu_error.first_error, (void *)-1); - engine->hangcheck.stalled = true; - engine->hangcheck.seqno = intel_engine_get_seqno(engine); - - i915_handle_error(i915, intel_engine_flag(engine), 0, NULL); + i915_handle_error(i915, ENGINE_MASK(engine->id), 0, NULL); xchg(&i915->gpu_error.first_error, error); |