[Intel-gfx] [PATCH 10/34] drm/i915: Remove GPU reset dependence on struct_mutex
Mika Kuoppala
mika.kuoppala at linux.intel.com
Thu Jan 24 12:06:01 UTC 2019
Chris Wilson <chris at chris-wilson.co.uk> writes:
> Now that the submission backends are controlled via their own spinlocks,
> with a wave of a magic wand we can lift the struct_mutex requirement
> around GPU reset. That is we allow the submission frontend (userspace)
> to keep on submitting while we process the GPU reset as we can suspend
> the backend independently.
>
> The major change is around the backoff/handoff strategy for performing
> the reset. With no mutex deadlock, we no longer have to coordinate with
> any waiter, and just perform the reset immediately.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/i915_debugfs.c | 38 +-
> drivers/gpu/drm/i915/i915_drv.h | 5 -
> drivers/gpu/drm/i915/i915_gem.c | 18 +-
> drivers/gpu/drm/i915/i915_gem_fence_reg.h | 1 -
> drivers/gpu/drm/i915/i915_gem_gtt.h | 1 +
> drivers/gpu/drm/i915/i915_gpu_error.c | 104 +++--
> drivers/gpu/drm/i915/i915_gpu_error.h | 28 +-
> drivers/gpu/drm/i915/i915_request.c | 47 ---
> drivers/gpu/drm/i915/i915_reset.c | 397 ++++++++----------
> drivers/gpu/drm/i915/i915_reset.h | 3 +
> drivers/gpu/drm/i915/intel_engine_cs.c | 6 +-
> drivers/gpu/drm/i915/intel_guc_submission.c | 5 +-
> drivers/gpu/drm/i915/intel_hangcheck.c | 28 +-
> drivers/gpu/drm/i915/intel_lrc.c | 92 ++--
> drivers/gpu/drm/i915/intel_overlay.c | 2 -
> drivers/gpu/drm/i915/intel_ringbuffer.c | 91 ++--
> drivers/gpu/drm/i915/intel_ringbuffer.h | 17 +-
> .../gpu/drm/i915/selftests/intel_hangcheck.c | 57 +--
> .../drm/i915/selftests/intel_workarounds.c | 3 -
> .../gpu/drm/i915/selftests/mock_gem_device.c | 4 +-
> 20 files changed, 393 insertions(+), 554 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 24d6d4ce14ef..3ec369980d40 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -1284,8 +1284,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
> seq_puts(m, "Wedged\n");
> if (test_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags))
> seq_puts(m, "Reset in progress: struct_mutex backoff\n");
> - if (test_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags))
> - seq_puts(m, "Reset in progress: reset handoff to waiter\n");
> if (waitqueue_active(&dev_priv->gpu_error.wait_queue))
> seq_puts(m, "Waiter holding struct mutex\n");
> if (waitqueue_active(&dev_priv->gpu_error.reset_queue))
> @@ -1321,15 +1319,15 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
> struct rb_node *rb;
>
> seq_printf(m, "%s:\n", engine->name);
> - seq_printf(m, "\tseqno = %x [current %x, last %x]\n",
> + seq_printf(m, "\tseqno = %x [current %x, last %x], %dms ago\n",
> engine->hangcheck.seqno, seqno[id],
> - intel_engine_last_submit(engine));
> - seq_printf(m, "\twaiters? %s, fake irq active? %s, stalled? %s, wedged? %s\n",
> + intel_engine_last_submit(engine),
> + jiffies_to_msecs(jiffies -
> + engine->hangcheck.action_timestamp));
> + seq_printf(m, "\twaiters? %s, fake irq active? %s\n",
> yesno(intel_engine_has_waiter(engine)),
> yesno(test_bit(engine->id,
> - &dev_priv->gpu_error.missed_irq_rings)),
> - yesno(engine->hangcheck.stalled),
> - yesno(engine->hangcheck.wedged));
> + &dev_priv->gpu_error.missed_irq_rings)));
>
> spin_lock_irq(&b->rb_lock);
> for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
> @@ -1343,11 +1341,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
> seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
> (long long)engine->hangcheck.acthd,
> (long long)acthd[id]);
> - seq_printf(m, "\taction = %s(%d) %d ms ago\n",
> - hangcheck_action_to_str(engine->hangcheck.action),
> - engine->hangcheck.action,
> - jiffies_to_msecs(jiffies -
> - engine->hangcheck.action_timestamp));
Yeah, it is a timestamp for the sample, and most decisions are made
on top of the seqno. Welcome compression.
>
> if (engine->id == RCS) {
> seq_puts(m, "\tinstdone read =\n");
> @@ -3886,8 +3879,6 @@ static int
> i915_wedged_set(void *data, u64 val)
*hones his axe*
> {
> struct drm_i915_private *i915 = data;
> - struct intel_engine_cs *engine;
> - unsigned int tmp;
>
> /*
> * There is no safeguard against this debugfs entry colliding
> @@ -3900,18 +3891,8 @@ i915_wedged_set(void *data, u64 val)
> if (i915_reset_backoff(&i915->gpu_error))
> return -EAGAIN;
>
> - for_each_engine_masked(engine, i915, val, tmp) {
> - engine->hangcheck.seqno = intel_engine_get_seqno(engine);
> - engine->hangcheck.stalled = true;
> - }
> -
> i915_handle_error(i915, val, I915_ERROR_CAPTURE,
> "Manually set wedged engine mask = %llx", val);
> -
> - wait_on_bit(&i915->gpu_error.flags,
> - I915_RESET_HANDOFF,
> - TASK_UNINTERRUPTIBLE);
> -
> return 0;
> }
>
> @@ -4066,13 +4047,8 @@ i915_drop_caches_set(void *data, u64 val)
> mutex_unlock(&i915->drm.struct_mutex);
> }
>
> - if (val & DROP_RESET_ACTIVE &&
> - i915_terminally_wedged(&i915->gpu_error)) {
> + if (val & DROP_RESET_ACTIVE && i915_terminally_wedged(&i915->gpu_error))
> i915_handle_error(i915, ALL_ENGINES, 0, NULL);
> - wait_on_bit(&i915->gpu_error.flags,
> - I915_RESET_HANDOFF,
> - TASK_UNINTERRUPTIBLE);
> - }
>
> fs_reclaim_acquire(GFP_KERNEL);
> if (val & DROP_BOUND)
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 03db011caa8e..59a7e90113d7 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -3001,11 +3001,6 @@ static inline bool i915_reset_backoff(struct i915_gpu_error *error)
> return unlikely(test_bit(I915_RESET_BACKOFF, &error->flags));
> }
>
> -static inline bool i915_reset_handoff(struct i915_gpu_error *error)
> -{
> - return unlikely(test_bit(I915_RESET_HANDOFF, &error->flags));
> -}
> -
> static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
> {
> return unlikely(test_bit(I915_WEDGED, &error->flags));
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index b359390ba22c..d20b42386c3c 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -657,11 +657,6 @@ i915_gem_object_wait(struct drm_i915_gem_object *obj,
> struct intel_rps_client *rps_client)
> {
> might_sleep();
> -#if IS_ENABLED(CONFIG_LOCKDEP)
> - GEM_BUG_ON(debug_locks &&
> - !!lockdep_is_held(&obj->base.dev->struct_mutex) !=
> - !!(flags & I915_WAIT_LOCKED));
> -#endif
> GEM_BUG_ON(timeout < 0);
>
> timeout = i915_gem_object_wait_reservation(obj->resv,
> @@ -4493,8 +4488,6 @@ void i915_gem_sanitize(struct drm_i915_private *i915)
>
> GEM_TRACE("\n");
>
> - mutex_lock(&i915->drm.struct_mutex);
> -
> wakeref = intel_runtime_pm_get(i915);
> intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
>
> @@ -4520,6 +4513,7 @@ void i915_gem_sanitize(struct drm_i915_private *i915)
> intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
> intel_runtime_pm_put(i915, wakeref);
>
unset_wedged looks ok, I should have faith
as I reviewed the patch. In retrospect,
READ_ONCE on gt.scratch might have been
good at raising suspicion, even though
superfluous.
Looks like for the engines we are saved by the
timeline lock. And we have laid some GEM_BUG_ON mines
there, so we will hear the explosions if any.
> + mutex_lock(&i915->drm.struct_mutex);
> i915_gem_contexts_lost(i915);
> mutex_unlock(&i915->drm.struct_mutex);
> }
> @@ -4534,6 +4528,8 @@ int i915_gem_suspend(struct drm_i915_private *i915)
> wakeref = intel_runtime_pm_get(i915);
> intel_suspend_gt_powersave(i915);
>
> + flush_workqueue(i915->wq);
I don't understand what is happening here. Why
don't we need i915_gem_drain_workqueue() here? A plain
flush_workqueue() only waits for the work already queued,
whereas draining would also catch work that requeues itself.
> +
> mutex_lock(&i915->drm.struct_mutex);
>
> /*
> @@ -4563,11 +4559,9 @@ int i915_gem_suspend(struct drm_i915_private *i915)
> i915_retire_requests(i915); /* ensure we flush after wedging */
>
> mutex_unlock(&i915->drm.struct_mutex);
> + i915_reset_flush(i915);
>
> - intel_uc_suspend(i915);
> -
> - cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
> - cancel_delayed_work_sync(&i915->gt.retire_work);
> + drain_delayed_work(&i915->gt.retire_work);
Hangcheck is handled inside the reset flush, but why the change
for retire? Draining instead of cancelling means we now let a
self-rearming retire run to completion rather than killing it.
>
> /*
> * As the idle_work is rearming if it detects a race, play safe and
> @@ -4575,6 +4569,8 @@ int i915_gem_suspend(struct drm_i915_private *i915)
> */
> drain_delayed_work(&i915->gt.idle_work);
>
> + intel_uc_suspend(i915);
> +
> /*
> * Assert that we successfully flushed all the work and
> * reset the GPU back to its idle, low power state.
> diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.h b/drivers/gpu/drm/i915/i915_gem_fence_reg.h
> index 99a31ded4dfd..09dcaf14121b 100644
> --- a/drivers/gpu/drm/i915/i915_gem_fence_reg.h
> +++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.h
> @@ -50,4 +50,3 @@ struct drm_i915_fence_reg {
> };
>
> #endif
> -
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
> index 9229b03d629b..a0039ea97cdc 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> @@ -39,6 +39,7 @@
> #include <linux/pagevec.h>
>
> #include "i915_request.h"
> +#include "i915_reset.h"
> #include "i915_selftest.h"
> #include "i915_timeline.h"
>
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 1f8e80e31b49..4eef0462489c 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -533,10 +533,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
> err_printf(m, " waiting: %s\n", yesno(ee->waiting));
> err_printf(m, " ring->head: 0x%08x\n", ee->cpu_ring_head);
> err_printf(m, " ring->tail: 0x%08x\n", ee->cpu_ring_tail);
> - err_printf(m, " hangcheck stall: %s\n", yesno(ee->hangcheck_stalled));
> - err_printf(m, " hangcheck action: %s\n",
> - hangcheck_action_to_str(ee->hangcheck_action));
> - err_printf(m, " hangcheck action timestamp: %dms (%lu%s)\n",
> + err_printf(m, " hangcheck timestamp: %dms (%lu%s)\n",
> jiffies_to_msecs(ee->hangcheck_timestamp - epoch),
> ee->hangcheck_timestamp,
> ee->hangcheck_timestamp == epoch ? "; epoch" : "");
> @@ -684,15 +681,15 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
> jiffies_to_msecs(error->capture - error->epoch));
>
> for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
> - if (error->engine[i].hangcheck_stalled &&
> - error->engine[i].context.pid) {
> - err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
> - engine_name(m->i915, i),
> - error->engine[i].context.comm,
> - error->engine[i].context.pid,
> - error->engine[i].context.ban_score,
> - bannable(&error->engine[i].context));
> - }
> + if (!error->engine[i].context.pid)
> + continue;
> +
> + err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
> + engine_name(m->i915, i),
> + error->engine[i].context.comm,
> + error->engine[i].context.pid,
> + error->engine[i].context.ban_score,
> + bannable(&error->engine[i].context));
> }
> err_printf(m, "Reset count: %u\n", error->reset_count);
> err_printf(m, "Suspend count: %u\n", error->suspend_count);
> @@ -1144,7 +1141,8 @@ static u32 capture_error_bo(struct drm_i915_error_buffer *err,
> return i;
> }
>
> -/* Generate a semi-unique error code. The code is not meant to have meaning, The
> +/*
> + * Generate a semi-unique error code. The code is not meant to have meaning, The
> * code's only purpose is to try to prevent false duplicated bug reports by
> * grossly estimating a GPU error state.
> *
> @@ -1153,29 +1151,23 @@ static u32 capture_error_bo(struct drm_i915_error_buffer *err,
> *
> * It's only a small step better than a random number in its current form.
> */
> -static u32 i915_error_generate_code(struct drm_i915_private *dev_priv,
> - struct i915_gpu_state *error,
> - int *engine_id)
> +static u32 i915_error_generate_code(struct i915_gpu_state *error,
> + unsigned long engine_mask)
> {
> - u32 error_code = 0;
> - int i;
> -
> - /* IPEHR would be an ideal way to detect errors, as it's the gross
> + /*
> + * IPEHR would be an ideal way to detect errors, as it's the gross
> * measure of "the command that hung." However, has some very common
> * synchronization commands which almost always appear in the case
> * strictly a client bug. Use instdone to differentiate those some.
> */
> - for (i = 0; i < I915_NUM_ENGINES; i++) {
> - if (error->engine[i].hangcheck_stalled) {
> - if (engine_id)
> - *engine_id = i;
> + if (engine_mask) {
> + struct drm_i915_error_engine *ee =
> + &error->engine[ffs(engine_mask)];
>
> - return error->engine[i].ipehr ^
> - error->engine[i].instdone.instdone;
> - }
> + return ee->ipehr ^ ee->instdone.instdone;
> }
>
> - return error_code;
> + return 0;
> }
>
> static void gem_record_fences(struct i915_gpu_state *error)
> @@ -1338,9 +1330,8 @@ static void error_record_engine_registers(struct i915_gpu_state *error,
> }
>
> ee->idle = intel_engine_is_idle(engine);
> - ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
> - ee->hangcheck_action = engine->hangcheck.action;
> - ee->hangcheck_stalled = engine->hangcheck.stalled;
> + if (!ee->idle)
> + ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
> ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error,
> engine);
>
> @@ -1783,31 +1774,35 @@ static void capture_reg_state(struct i915_gpu_state *error)
> error->pgtbl_er = I915_READ(PGTBL_ER);
> }
>
> -static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
> - struct i915_gpu_state *error,
> - u32 engine_mask,
> - const char *error_msg)
> +static const char *
> +error_msg(struct i915_gpu_state *error, unsigned long engines, const char *msg)
> {
> - u32 ecode;
> - int engine_id = -1, len;
> + int len;
> + int i;
>
> - ecode = i915_error_generate_code(dev_priv, error, &engine_id);
> + for (i = 0; i < ARRAY_SIZE(error->engine); i++)
> + if (!error->engine[i].context.pid)
> + engines &= ~BIT(i);
No more grouping for driver internal hangs...?
>
> len = scnprintf(error->error_msg, sizeof(error->error_msg),
> - "GPU HANG: ecode %d:%d:0x%08x",
> - INTEL_GEN(dev_priv), engine_id, ecode);
> -
> - if (engine_id != -1 && error->engine[engine_id].context.pid)
> + "GPU HANG: ecode %d:%lx:0x%08x",
> + INTEL_GEN(error->i915), engines,
> + i915_error_generate_code(error, engines));
> + if (engines) {
> + /* Just show the first executing process, more is confusing */
> + i = ffs(engines);
Then why not just make the ecode function accept a single engine
and move it here?
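Something like this, perhaps (just a sketch; the int-engine variant
is my suggestion, the names mirror the patch):

        static u32 i915_error_generate_code(struct i915_gpu_state *error,
                                            int engine_id)
        {
                const struct drm_i915_error_engine *ee;

                if (engine_id < 0)
                        return 0;

                ee = &error->engine[engine_id];
                return ee->ipehr ^ ee->instdone.instdone;
        }

with error_msg() picking the first engine out of the mask and
passing it in.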
> len += scnprintf(error->error_msg + len,
> sizeof(error->error_msg) - len,
> ", in %s [%d]",
> - error->engine[engine_id].context.comm,
> - error->engine[engine_id].context.pid);
> + error->engine[i].context.comm,
> + error->engine[i].context.pid);
> + }
> + if (msg)
> + len += scnprintf(error->error_msg + len,
> + sizeof(error->error_msg) - len,
> + ", %s", msg);
>
> - scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
> - ", reason: %s, action: %s",
> - error_msg,
> - engine_mask ? "reset" : "continue");
> + return error->error_msg;
> }
>
> static void capture_gen_state(struct i915_gpu_state *error)
> @@ -1847,7 +1842,7 @@ static unsigned long capture_find_epoch(const struct i915_gpu_state *error)
> for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
> const struct drm_i915_error_engine *ee = &error->engine[i];
>
> - if (ee->hangcheck_stalled &&
> + if (ee->hangcheck_timestamp &&
> time_before(ee->hangcheck_timestamp, epoch))
> epoch = ee->hangcheck_timestamp;
> }
> @@ -1921,7 +1916,7 @@ i915_capture_gpu_state(struct drm_i915_private *i915)
> * i915_capture_error_state - capture an error record for later analysis
> * @i915: i915 device
> * @engine_mask: the mask of engines triggering the hang
> - * @error_msg: a message to insert into the error capture header
> + * @msg: a message to insert into the error capture header
> *
> * Should be called when an error is detected (either a hang or an error
> * interrupt) to capture error state from the time of the error. Fills
> @@ -1929,8 +1924,8 @@ i915_capture_gpu_state(struct drm_i915_private *i915)
> * to pick up.
> */
> void i915_capture_error_state(struct drm_i915_private *i915,
> - u32 engine_mask,
> - const char *error_msg)
> + unsigned long engine_mask,
> + const char *msg)
> {
> static bool warned;
> struct i915_gpu_state *error;
> @@ -1946,8 +1941,7 @@ void i915_capture_error_state(struct drm_i915_private *i915,
> if (IS_ERR(error))
> return;
>
> - i915_error_capture_msg(i915, error, engine_mask, error_msg);
> - DRM_INFO("%s\n", error->error_msg);
> + dev_info(i915->drm.dev, "%s\n", error_msg(error, engine_mask, msg));
>
> if (!error->simulated) {
> spin_lock_irqsave(&i915->gpu_error.lock, flags);
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
> index 604291f7762d..231173786eae 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.h
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h
> @@ -85,8 +85,6 @@ struct i915_gpu_state {
> bool waiting;
> int num_waiters;
> unsigned long hangcheck_timestamp;
> - bool hangcheck_stalled;
> - enum intel_engine_hangcheck_action hangcheck_action;
> struct i915_address_space *vm;
> int num_requests;
> u32 reset_count;
> @@ -197,6 +195,8 @@ struct i915_gpu_state {
> struct scatterlist *sgl, *fit;
> };
>
> +struct i915_gpu_restart;
> +
> struct i915_gpu_error {
> /* For hangcheck timer */
> #define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
> @@ -247,15 +247,6 @@ struct i915_gpu_error {
> * i915_mutex_lock_interruptible()?). I915_RESET_BACKOFF serves a
> * secondary role in preventing two concurrent global reset attempts.
> *
> - * #I915_RESET_HANDOFF - To perform the actual GPU reset, we need the
> - * struct_mutex. We try to acquire the struct_mutex in the reset worker,
> - * but it may be held by some long running waiter (that we cannot
> - * interrupt without causing trouble). Once we are ready to do the GPU
> - * reset, we set the I915_RESET_HANDOFF bit and wakeup any waiters. If
> - * they already hold the struct_mutex and want to participate they can
> - * inspect the bit and do the reset directly, otherwise the worker
> - * waits for the struct_mutex.
> - *
> * #I915_RESET_ENGINE[num_engines] - Since the driver doesn't need to
> * acquire the struct_mutex to reset an engine, we need an explicit
> * flag to prevent two concurrent reset attempts in the same engine.
> @@ -269,20 +260,13 @@ struct i915_gpu_error {
> */
> unsigned long flags;
> #define I915_RESET_BACKOFF 0
> -#define I915_RESET_HANDOFF 1
> -#define I915_RESET_MODESET 2
> -#define I915_RESET_ENGINE 3
> +#define I915_RESET_MODESET 1
> +#define I915_RESET_ENGINE 2
> #define I915_WEDGED (BITS_PER_LONG - 1)
>
> /** Number of times an engine has been reset */
> u32 reset_engine_count[I915_NUM_ENGINES];
>
> - /** Set of stalled engines with guilty requests, in the current reset */
> - u32 stalled_mask;
> -
> - /** Reason for the current *global* reset */
> - const char *reason;
> -
> struct mutex wedge_mutex; /* serialises wedging/unwedging */
>
> /**
> @@ -299,6 +283,8 @@ struct i915_gpu_error {
>
> /* For missed irq/seqno simulation. */
> unsigned long test_irq_rings;
> +
> + struct i915_gpu_restart *restart;
> };
>
> struct drm_i915_error_state_buf {
> @@ -320,7 +306,7 @@ void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
>
> struct i915_gpu_state *i915_capture_gpu_state(struct drm_i915_private *i915);
> void i915_capture_error_state(struct drm_i915_private *dev_priv,
> - u32 engine_mask,
> + unsigned long engine_mask,
> const char *error_msg);
>
> static inline struct i915_gpu_state *
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 5e178f5ac18b..80232de8e2be 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -1083,18 +1083,6 @@ static bool __i915_spin_request(const struct i915_request *rq,
> return false;
> }
>
> -static bool __i915_wait_request_check_and_reset(struct i915_request *request)
> -{
> - struct i915_gpu_error *error = &request->i915->gpu_error;
> -
> - if (likely(!i915_reset_handoff(error)))
> - return false;
> -
> - __set_current_state(TASK_RUNNING);
> - i915_reset(request->i915, error->stalled_mask, error->reason);
> - return true;
> -}
> -
> /**
> * i915_request_wait - wait until execution of request has finished
> * @rq: the request to wait upon
> @@ -1120,17 +1108,10 @@ long i915_request_wait(struct i915_request *rq,
> {
> const int state = flags & I915_WAIT_INTERRUPTIBLE ?
> TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
> - wait_queue_head_t *errq = &rq->i915->gpu_error.wait_queue;
> - DEFINE_WAIT_FUNC(reset, default_wake_function);
> DEFINE_WAIT_FUNC(exec, default_wake_function);
> struct intel_wait wait;
>
> might_sleep();
> -#if IS_ENABLED(CONFIG_LOCKDEP)
> - GEM_BUG_ON(debug_locks &&
> - !!lockdep_is_held(&rq->i915->drm.struct_mutex) !=
> - !!(flags & I915_WAIT_LOCKED));
> -#endif
> GEM_BUG_ON(timeout < 0);
>
> if (i915_request_completed(rq))
> @@ -1140,11 +1121,7 @@ long i915_request_wait(struct i915_request *rq,
> return -ETIME;
>
> trace_i915_request_wait_begin(rq, flags);
> -
> add_wait_queue(&rq->execute, &exec);
> - if (flags & I915_WAIT_LOCKED)
> - add_wait_queue(errq, &reset);
> -
> intel_wait_init(&wait);
> if (flags & I915_WAIT_PRIORITY)
> i915_schedule_bump_priority(rq, I915_PRIORITY_WAIT);
> @@ -1155,10 +1132,6 @@ long i915_request_wait(struct i915_request *rq,
> if (intel_wait_update_request(&wait, rq))
> break;
>
> - if (flags & I915_WAIT_LOCKED &&
> - __i915_wait_request_check_and_reset(rq))
> - continue;
> -
> if (signal_pending_state(state, current)) {
> timeout = -ERESTARTSYS;
> goto complete;
> @@ -1188,9 +1161,6 @@ long i915_request_wait(struct i915_request *rq,
> */
> goto wakeup;
>
> - if (flags & I915_WAIT_LOCKED)
> - __i915_wait_request_check_and_reset(rq);
> -
> for (;;) {
> if (signal_pending_state(state, current)) {
> timeout = -ERESTARTSYS;
> @@ -1214,21 +1184,6 @@ long i915_request_wait(struct i915_request *rq,
> if (i915_request_completed(rq))
> break;
>
> - /*
> - * If the GPU is hung, and we hold the lock, reset the GPU
> - * and then check for completion. On a full reset, the engine's
> - * HW seqno will be advanced passed us and we are complete.
> - * If we do a partial reset, we have to wait for the GPU to
> - * resume and update the breadcrumb.
> - *
> - * If we don't hold the mutex, we can just wait for the worker
> - * to come along and update the breadcrumb (either directly
> - * itself, or indirectly by recovering the GPU).
> - */
> - if (flags & I915_WAIT_LOCKED &&
> - __i915_wait_request_check_and_reset(rq))
> - continue;
> -
> /* Only spin if we know the GPU is processing this request */
> if (__i915_spin_request(rq, wait.seqno, state, 2))
> break;
> @@ -1242,8 +1197,6 @@ long i915_request_wait(struct i915_request *rq,
> intel_engine_remove_wait(rq->engine, &wait);
> complete:
> __set_current_state(TASK_RUNNING);
> - if (flags & I915_WAIT_LOCKED)
> - remove_wait_queue(errq, &reset);
> remove_wait_queue(&rq->execute, &exec);
> trace_i915_request_wait_end(rq);
>
> diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
> index 2961c21d9420..064fc6da1512 100644
> --- a/drivers/gpu/drm/i915/i915_reset.c
> +++ b/drivers/gpu/drm/i915/i915_reset.c
> @@ -5,6 +5,7 @@
> */
>
> #include <linux/sched/mm.h>
> +#include <linux/stop_machine.h>
>
> #include "i915_drv.h"
> #include "i915_gpu_error.h"
> @@ -17,22 +18,23 @@ static void engine_skip_context(struct i915_request *rq)
> struct intel_engine_cs *engine = rq->engine;
> struct i915_gem_context *hung_ctx = rq->gem_context;
> struct i915_timeline *timeline = rq->timeline;
> - unsigned long flags;
>
> + lockdep_assert_held(&engine->timeline.lock);
> GEM_BUG_ON(timeline == &engine->timeline);
>
> - spin_lock_irqsave(&engine->timeline.lock, flags);
> spin_lock(&timeline->lock);
>
> - list_for_each_entry_continue(rq, &engine->timeline.requests, link)
> - if (rq->gem_context == hung_ctx)
> - i915_request_skip(rq, -EIO);
> + if (rq->global_seqno) {
> + list_for_each_entry_continue(rq,
> + &engine->timeline.requests, link)
> + if (rq->gem_context == hung_ctx)
> + i915_request_skip(rq, -EIO);
> + }
>
> list_for_each_entry(rq, &timeline->requests, link)
> i915_request_skip(rq, -EIO);
>
> spin_unlock(&timeline->lock);
> - spin_unlock_irqrestore(&engine->timeline.lock, flags);
> }
>
> static void client_mark_guilty(struct drm_i915_file_private *file_priv,
> @@ -59,7 +61,7 @@ static void client_mark_guilty(struct drm_i915_file_private *file_priv,
> }
> }
>
> -static void context_mark_guilty(struct i915_gem_context *ctx)
> +static bool context_mark_guilty(struct i915_gem_context *ctx)
> {
> unsigned int score;
> bool banned, bannable;
> @@ -72,7 +74,7 @@ static void context_mark_guilty(struct i915_gem_context *ctx)
>
> /* Cool contexts don't accumulate client ban score */
> if (!bannable)
> - return;
> + return false;
>
> if (banned) {
> DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
> @@ -83,6 +85,8 @@ static void context_mark_guilty(struct i915_gem_context *ctx)
>
> if (!IS_ERR_OR_NULL(ctx->file_priv))
> client_mark_guilty(ctx->file_priv, ctx);
> +
> + return banned;
> }
>
> static void context_mark_innocent(struct i915_gem_context *ctx)
> @@ -90,6 +94,21 @@ static void context_mark_innocent(struct i915_gem_context *ctx)
> atomic_inc(&ctx->active_count);
> }
>
> +void i915_reset_request(struct i915_request *rq, bool guilty)
> +{
> + lockdep_assert_held(&rq->engine->timeline.lock);
> + GEM_BUG_ON(i915_request_completed(rq));
> +
> + if (guilty) {
> + i915_request_skip(rq, -EIO);
> + if (context_mark_guilty(rq->gem_context))
> + engine_skip_context(rq);
> + } else {
> + dma_fence_set_error(&rq->fence, -EAGAIN);
> + context_mark_innocent(rq->gem_context);
> + }
> +}
> +
> static void gen3_stop_engine(struct intel_engine_cs *engine)
> {
> struct drm_i915_private *dev_priv = engine->i915;
> @@ -533,22 +552,6 @@ int intel_gpu_reset(struct drm_i915_private *i915, unsigned int engine_mask)
> int retry;
> int ret;
>
> - /*
> - * We want to perform per-engine reset from atomic context (e.g.
> - * softirq), which imposes the constraint that we cannot sleep.
> - * However, experience suggests that spending a bit of time waiting
> - * for a reset helps in various cases, so for a full-device reset
> - * we apply the opposite rule and wait if we want to. As we should
> - * always follow up a failed per-engine reset with a full device reset,
> - * being a little faster, stricter and more error prone for the
> - * atomic case seems an acceptable compromise.
> - *
> - * Unfortunately this leads to a bimodal routine, when the goal was
> - * to have a single reset function that worked for resetting any
> - * number of engines simultaneously.
> - */
> - might_sleep_if(engine_mask == ALL_ENGINES);
Oh, here it is. I was after this for the atomic resets.
> -
> /*
> * If the power well sleeps during the reset, the reset
> * request may be dropped and never completes (causing -EIO).
> @@ -580,8 +583,6 @@ int intel_gpu_reset(struct drm_i915_private *i915, unsigned int engine_mask)
> }
> if (ret != -ETIMEDOUT || engine_mask != ALL_ENGINES)
> break;
> -
> - cond_resched();
> }
> intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
>
> @@ -620,11 +621,8 @@ int intel_reset_guc(struct drm_i915_private *i915)
> * Ensure irq handler finishes, and not run again.
> * Also return the active request so that we only search for it once.
> */
> -static struct i915_request *
> -reset_prepare_engine(struct intel_engine_cs *engine)
> +static void reset_prepare_engine(struct intel_engine_cs *engine)
> {
> - struct i915_request *rq;
> -
> /*
> * During the reset sequence, we must prevent the engine from
> * entering RC6. As the context state is undefined until we restart
> @@ -633,162 +631,86 @@ reset_prepare_engine(struct intel_engine_cs *engine)
> * GPU state upon resume, i.e. fail to restart after a reset.
> */
> intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
> -
> - rq = engine->reset.prepare(engine);
> - if (rq && rq->fence.error == -EIO)
> - rq = ERR_PTR(-EIO); /* Previous reset failed! */
> -
> - return rq;
> + engine->reset.prepare(engine);
> }
>
> -static int reset_prepare(struct drm_i915_private *i915)
> +static void reset_prepare(struct drm_i915_private *i915)
> {
> struct intel_engine_cs *engine;
> - struct i915_request *rq;
> enum intel_engine_id id;
> - int err = 0;
>
> - for_each_engine(engine, i915, id) {
> - rq = reset_prepare_engine(engine);
> - if (IS_ERR(rq)) {
> - err = PTR_ERR(rq);
> - continue;
> - }
> -
> - engine->hangcheck.active_request = rq;
> - }
> + for_each_engine(engine, i915, id)
> + reset_prepare_engine(engine);
>
> - i915_gem_revoke_fences(i915);
> intel_uc_sanitize(i915);
> -
> - return err;
> }
>
> -/* Returns the request if it was guilty of the hang */
> -static struct i915_request *
> -reset_request(struct intel_engine_cs *engine,
> - struct i915_request *rq,
> - bool stalled)
> +static int gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
> {
> + struct intel_engine_cs *engine;
> + enum intel_engine_id id;
> + int err;
> +
> /*
> - * The guilty request will get skipped on a hung engine.
> - *
> - * Users of client default contexts do not rely on logical
> - * state preserved between batches so it is safe to execute
> - * queued requests following the hang. Non default contexts
> - * rely on preserved state, so skipping a batch loses the
> - * evolution of the state and it needs to be considered corrupted.
> - * Executing more queued batches on top of corrupted state is
> - * risky. But we take the risk by trying to advance through
> - * the queued requests in order to make the client behaviour
> - * more predictable around resets, by not throwing away random
> - * amount of batches it has prepared for execution. Sophisticated
> - * clients can use gem_reset_stats_ioctl and dma fence status
> - * (exported via sync_file info ioctl on explicit fences) to observe
> - * when it loses the context state and should rebuild accordingly.
> - *
> - * The context ban, and ultimately the client ban, mechanism are safety
> - * valves if client submission ends up resulting in nothing more than
> - * subsequent hangs.
> + * Everything depends on having the GTT running, so we need to start
> + * there.
> */
> + err = i915_ggtt_enable_hw(i915);
> + if (err)
> + return err;
>
> - if (i915_request_completed(rq)) {
> - GEM_TRACE("%s pardoned global=%d (fence %llx:%lld), current %d\n",
> - engine->name, rq->global_seqno,
> - rq->fence.context, rq->fence.seqno,
> - intel_engine_get_seqno(engine));
> - stalled = false;
> - }
> -
> - if (stalled) {
> - context_mark_guilty(rq->gem_context);
> - i915_request_skip(rq, -EIO);
> + for_each_engine(engine, i915, id)
> + intel_engine_reset(engine, stalled_mask & ENGINE_MASK(id));
>
> - /* If this context is now banned, skip all pending requests. */
> - if (i915_gem_context_is_banned(rq->gem_context))
> - engine_skip_context(rq);
> - } else {
> - /*
> - * Since this is not the hung engine, it may have advanced
> - * since the hang declaration. Double check by refinding
> - * the active request at the time of the reset.
> - */
> - rq = i915_gem_find_active_request(engine);
> - if (rq) {
> - unsigned long flags;
> -
> - context_mark_innocent(rq->gem_context);
> - dma_fence_set_error(&rq->fence, -EAGAIN);
> -
> - /* Rewind the engine to replay the incomplete rq */
> - spin_lock_irqsave(&engine->timeline.lock, flags);
> - rq = list_prev_entry(rq, link);
> - if (&rq->link == &engine->timeline.requests)
> - rq = NULL;
> - spin_unlock_irqrestore(&engine->timeline.lock, flags);
> - }
> - }
> + i915_gem_restore_fences(i915);
>
> - return rq;
> + return err;
> }
>
> -static void reset_engine(struct intel_engine_cs *engine,
> - struct i915_request *rq,
> - bool stalled)
> +static void reset_finish_engine(struct intel_engine_cs *engine)
> {
> - if (rq)
> - rq = reset_request(engine, rq, stalled);
> -
> - /* Setup the CS to resume from the breadcrumb of the hung request */
> - engine->reset.reset(engine, rq);
> + engine->reset.finish(engine);
> + intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
> }
>
> -static void gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
> +struct i915_gpu_restart {
> + struct work_struct work;
> + struct drm_i915_private *i915;
> +};
> +
> +static void restart_work(struct work_struct *work)
> {
> + struct i915_gpu_restart *arg = container_of(work, typeof(*arg), work);
> + struct drm_i915_private *i915 = arg->i915;
> struct intel_engine_cs *engine;
> enum intel_engine_id id;
> + intel_wakeref_t wakeref;
>
> - lockdep_assert_held(&i915->drm.struct_mutex);
> + wakeref = intel_runtime_pm_get(i915);
> + mutex_lock(&i915->drm.struct_mutex);
>
> - i915_retire_requests(i915);
Can't do this anymore, yes. What will be the effect
of delaying this and the other explicit retirements?
Are we more prone to starvation?
> + smp_store_mb(i915->gpu_error.restart, NULL);
Checkpatch might want a comment for the mb.
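Something like this would keep it happy (the wording is just a
suggestion):

        /*
         * Mark the restart as done before we submit the power-saving
         * requests; pairs with the READ_ONCE()/WRITE_ONCE() of
         * gpu_error.restart in reset_restart().
         */
        smp_store_mb(i915->gpu_error.restart, NULL);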
>
> for_each_engine(engine, i915, id) {
> - struct intel_context *ce;
> -
> - reset_engine(engine,
> - engine->hangcheck.active_request,
> - stalled_mask & ENGINE_MASK(id));
> - ce = fetch_and_zero(&engine->last_retired_context);
> - if (ce)
> - intel_context_unpin(ce);
> + struct i915_request *rq;
>
> /*
> * Ostensibily, we always want a context loaded for powersaving,
> * so if the engine is idle after the reset, send a request
> * to load our scratch kernel_context.
> - *
> - * More mysteriously, if we leave the engine idle after a reset,
> - * the next userspace batch may hang, with what appears to be
> - * an incoherent read by the CS (presumably stale TLB). An
> - * empty request appears sufficient to paper over the glitch.
> */
> - if (intel_engine_is_idle(engine)) {
> - struct i915_request *rq;
> + if (!intel_engine_is_idle(engine))
> + continue;
Why did you remove the comment on needing an empty request?
Also, if the request keeping the engine non-idle could be a
troublesome one, from a troublesome context, why not just skip the
idle check and always add one for the kernel ctx?
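i.e. something like (untested; allocation failure treated as benign,
as in the patch):

        for_each_engine(engine, i915, id) {
                struct i915_request *rq;

                rq = i915_request_alloc(engine, i915->kernel_context);
                if (!IS_ERR(rq))
                        i915_request_add(rq);
        }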
>
> - rq = i915_request_alloc(engine, i915->kernel_context);
> - if (!IS_ERR(rq))
> - i915_request_add(rq);
> - }
> + rq = i915_request_alloc(engine, i915->kernel_context);
> + if (!IS_ERR(rq))
> + i915_request_add(rq);
> }
>
> - i915_gem_restore_fences(i915);
> -}
> -
> -static void reset_finish_engine(struct intel_engine_cs *engine)
> -{
> - engine->reset.finish(engine);
> + mutex_unlock(&i915->drm.struct_mutex);
> + intel_runtime_pm_put(i915, wakeref);
>
> - intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
> + kfree(arg);
> }
>
> static void reset_finish(struct drm_i915_private *i915)
> @@ -796,11 +718,30 @@ static void reset_finish(struct drm_i915_private *i915)
> struct intel_engine_cs *engine;
> enum intel_engine_id id;
>
> - lockdep_assert_held(&i915->drm.struct_mutex);
> -
> - for_each_engine(engine, i915, id) {
> - engine->hangcheck.active_request = NULL;
> + for_each_engine(engine, i915, id)
> reset_finish_engine(engine);
> +}
> +
> +static void reset_restart(struct drm_i915_private *i915)
> +{
> + struct i915_gpu_restart *arg;
> +
> + /*
> + * Following the reset, ensure that we always reload context for
> + * powersaving, and to correct engine->last_retired_context. Since
> + * this requires us to submit a request, queue a worker to do that
> + * task for us to evade any locking here.
> + */
Nice, this was/will be helpful!
> + if (READ_ONCE(i915->gpu_error.restart))
> + return;
> +
> + arg = kmalloc(sizeof(*arg), GFP_KERNEL);
> + if (arg) {
> + arg->i915 = i915;
> + INIT_WORK(&arg->work, restart_work);
> +
> + WRITE_ONCE(i915->gpu_error.restart, arg);
> + queue_work(i915->wq, &arg->work);
> }
> }
>
> @@ -889,8 +830,6 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
> struct i915_timeline *tl;
> bool ret = false;
>
> - lockdep_assert_held(&i915->drm.struct_mutex);
> -
> if (!test_bit(I915_WEDGED, &error->flags))
> return true;
>
> @@ -913,9 +852,9 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
> */
> list_for_each_entry(tl, &i915->gt.timelines, link) {
> struct i915_request *rq;
> + long timeout;
>
> - rq = i915_gem_active_peek(&tl->last_request,
> - &i915->drm.struct_mutex);
> + rq = i915_gem_active_get_unlocked(&tl->last_request);
> if (!rq)
> continue;
>
> @@ -930,12 +869,12 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
> * and when the seqno passes the fence, the signaler
> * then signals the fence waking us up).
> */
> - if (dma_fence_default_wait(&rq->fence, true,
> - MAX_SCHEDULE_TIMEOUT) < 0)
> + timeout = dma_fence_default_wait(&rq->fence, true,
> + MAX_SCHEDULE_TIMEOUT);
> + i915_request_put(rq);
> + if (timeout < 0)
> goto unlock;
> }
> - i915_retire_requests(i915);
> - GEM_BUG_ON(i915->gt.active_requests);
>
> intel_engines_sanitize(i915, false);
>
> @@ -949,7 +888,6 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
> * context and do not require stop_machine().
> */
> intel_engines_reset_default_submission(i915);
> - i915_gem_contexts_lost(i915);
>
> GEM_TRACE("end\n");
>
> @@ -962,6 +900,43 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
> return ret;
> }
>
> +struct __i915_reset {
> + struct drm_i915_private *i915;
> + unsigned int stalled_mask;
> +};
> +
> +static int __i915_reset__BKL(void *data)
> +{
> + struct __i915_reset *arg = data;
> + int err;
> +
> + err = intel_gpu_reset(arg->i915, ALL_ENGINES);
> + if (err)
> + return err;
> +
> + return gt_reset(arg->i915, arg->stalled_mask);
> +}
> +
> +#if 0
> +#define __do_reset(fn, arg) stop_machine(fn, arg, NULL)
Let's remove the machinery for selecting the stop_machine() reset,
and the #include with it.
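i.e. with the #if 0/#else dropped, do_reset() collapses to:

        static int do_reset(struct drm_i915_private *i915,
                            unsigned int stalled_mask)
        {
                struct __i915_reset arg = { i915, stalled_mask };
                int err, i;

                err = __i915_reset__BKL(&arg);
                for (i = 0; err && i < 3; i++) {
                        msleep(100);
                        err = __i915_reset__BKL(&arg);
                }

                return err;
        }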
> +#else
> +#define __do_reset(fn, arg) fn(arg)
> +#endif
> +
> +static int do_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
> +{
> + struct __i915_reset arg = { i915, stalled_mask };
> + int err, i;
> +
> + err = __do_reset(__i915_reset__BKL, &arg);
> + for (i = 0; err && i < 3; i++) {
> + msleep(100);
> + err = __do_reset(__i915_reset__BKL, &arg);
> + }
> +
> + return err;
> +}
> +
> /**
> * i915_reset - reset chip after a hang
> * @i915: #drm_i915_private to reset
> @@ -987,31 +962,22 @@ void i915_reset(struct drm_i915_private *i915,
> {
> struct i915_gpu_error *error = &i915->gpu_error;
> int ret;
> - int i;
>
> GEM_TRACE("flags=%lx\n", error->flags);
>
> might_sleep();
What will? I didn't spot anything in execlists_reset_prepare.
> - lockdep_assert_held(&i915->drm.struct_mutex);
> assert_rpm_wakelock_held(i915);
> GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
>
> - if (!test_bit(I915_RESET_HANDOFF, &error->flags))
> - return;
> -
> /* Clear any previous failed attempts at recovery. Time to try again. */
> if (!i915_gem_unset_wedged(i915))
> - goto wakeup;
> + return;
>
> if (reason)
> dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason);
> error->reset_count++;
>
> - ret = reset_prepare(i915);
> - if (ret) {
> - dev_err(i915->drm.dev, "GPU recovery failed\n");
> - goto taint;
> - }
> + reset_prepare(i915);
>
> if (!intel_has_gpu_reset(i915)) {
> if (i915_modparams.reset)
> @@ -1021,32 +987,11 @@ void i915_reset(struct drm_i915_private *i915,
> goto error;
> }
>
> - for (i = 0; i < 3; i++) {
> - ret = intel_gpu_reset(i915, ALL_ENGINES);
> - if (ret == 0)
> - break;
> -
> - msleep(100);
> - }
> - if (ret) {
> + if (do_reset(i915, stalled_mask)) {
> dev_err(i915->drm.dev, "Failed to reset chip\n");
> goto taint;
> }
>
> - /* Ok, now get things going again... */
> -
> - /*
> - * Everything depends on having the GTT running, so we need to start
> - * there.
> - */
> - ret = i915_ggtt_enable_hw(i915);
> - if (ret) {
> - DRM_ERROR("Failed to re-enable GGTT following reset (%d)\n",
> - ret);
> - goto error;
> - }
> -
> - gt_reset(i915, stalled_mask);
> intel_overlay_reset(i915);
>
> /*
> @@ -1068,9 +1013,8 @@ void i915_reset(struct drm_i915_private *i915,
>
> finish:
> reset_finish(i915);
> -wakeup:
> - clear_bit(I915_RESET_HANDOFF, &error->flags);
> - wake_up_bit(&error->flags, I915_RESET_HANDOFF);
> + if (!i915_terminally_wedged(error))
> + reset_restart(i915);
> return;
>
> taint:
> @@ -1089,7 +1033,6 @@ void i915_reset(struct drm_i915_private *i915,
> add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
> error:
> i915_gem_set_wedged(i915);
> - i915_retire_requests(i915);
> goto finish;
> }
>
> @@ -1115,18 +1058,16 @@ static inline int intel_gt_reset_engine(struct drm_i915_private *i915,
> int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
> {
> struct i915_gpu_error *error = &engine->i915->gpu_error;
> - struct i915_request *active_request;
> int ret;
>
> GEM_TRACE("%s flags=%lx\n", engine->name, error->flags);
> GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));
>
> - active_request = reset_prepare_engine(engine);
> - if (IS_ERR_OR_NULL(active_request)) {
> - /* Either the previous reset failed, or we pardon the reset. */
> - ret = PTR_ERR(active_request);
> - goto out;
> - }
> + if (i915_seqno_passed(intel_engine_get_seqno(engine),
> + intel_engine_last_submit(engine)))
> + return 0;
You seem to have a patch that removes this shortly after, so
squash?
I need to restock on coffee at this point.
-Mika
> +
> + reset_prepare_engine(engine);
>
> if (msg)
> dev_notice(engine->i915->drm.dev,
> @@ -1150,7 +1091,7 @@ int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
> * active request and can drop it, adjust head to skip the offending
> * request to resume executing remaining requests in the queue.
> */
> - reset_engine(engine, active_request, true);
> + intel_engine_reset(engine, true);
>
> /*
> * The engine and its registers (and workarounds in case of render)
> @@ -1187,30 +1128,7 @@ static void i915_reset_device(struct drm_i915_private *i915,
> i915_wedge_on_timeout(&w, i915, 5 * HZ) {
> intel_prepare_reset(i915);
>
> - error->reason = reason;
> - error->stalled_mask = engine_mask;
> -
> - /* Signal that locked waiters should reset the GPU */
> - smp_mb__before_atomic();
> - set_bit(I915_RESET_HANDOFF, &error->flags);
> - wake_up_all(&error->wait_queue);
> -
> - /*
> - * Wait for anyone holding the lock to wakeup, without
> - * blocking indefinitely on struct_mutex.
> - */
> - do {
> - if (mutex_trylock(&i915->drm.struct_mutex)) {
> - i915_reset(i915, engine_mask, reason);
> - mutex_unlock(&i915->drm.struct_mutex);
> - }
> - } while (wait_on_bit_timeout(&error->flags,
> - I915_RESET_HANDOFF,
> - TASK_UNINTERRUPTIBLE,
> - 1));
> -
> - error->stalled_mask = 0;
> - error->reason = NULL;
> + i915_reset(i915, engine_mask, reason);
>
> intel_finish_reset(i915);
> }
> @@ -1366,6 +1284,25 @@ void i915_handle_error(struct drm_i915_private *i915,
> intel_runtime_pm_put(i915, wakeref);
> }
>
> +bool i915_reset_flush(struct drm_i915_private *i915)
> +{
> + int err;
> +
> + cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
> +
> + flush_workqueue(i915->wq);
> + GEM_BUG_ON(READ_ONCE(i915->gpu_error.restart));
> +
> + mutex_lock(&i915->drm.struct_mutex);
> + err = i915_gem_wait_for_idle(i915,
> + I915_WAIT_LOCKED |
> + I915_WAIT_FOR_IDLE_BOOST,
> + MAX_SCHEDULE_TIMEOUT);
> + mutex_unlock(&i915->drm.struct_mutex);
> +
> + return !err;
> +}
> +
> static void i915_wedge_me(struct work_struct *work)
> {
> struct i915_wedge_me *w = container_of(work, typeof(*w), work.work);
> diff --git a/drivers/gpu/drm/i915/i915_reset.h b/drivers/gpu/drm/i915/i915_reset.h
> index b6a519bde67d..f2d347f319df 100644
> --- a/drivers/gpu/drm/i915/i915_reset.h
> +++ b/drivers/gpu/drm/i915/i915_reset.h
> @@ -29,6 +29,9 @@ void i915_reset(struct drm_i915_private *i915,
> int i915_reset_engine(struct intel_engine_cs *engine,
> const char *reason);
>
> +void i915_reset_request(struct i915_request *rq, bool guilty);
> +bool i915_reset_flush(struct drm_i915_private *i915);
> +
> bool intel_has_gpu_reset(struct drm_i915_private *i915);
> bool intel_has_reset_engine(struct drm_i915_private *i915);
>
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index 2f3c71f6d313..fc52737751e7 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -1071,10 +1071,8 @@ void intel_engines_sanitize(struct drm_i915_private *i915, bool force)
> if (!reset_engines(i915) && !force)
> return;
>
> - for_each_engine(engine, i915, id) {
> - if (engine->reset.reset)
> - engine->reset.reset(engine, NULL);
> - }
> + for_each_engine(engine, i915, id)
> + intel_engine_reset(engine, false);
> }
>
> /**
> diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
> index ab1c49b106f2..7217c7e3ee8d 100644
> --- a/drivers/gpu/drm/i915/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/intel_guc_submission.c
> @@ -834,8 +834,7 @@ static void guc_submission_tasklet(unsigned long data)
> spin_unlock_irqrestore(&engine->timeline.lock, flags);
> }
>
> -static struct i915_request *
> -guc_reset_prepare(struct intel_engine_cs *engine)
> +static void guc_reset_prepare(struct intel_engine_cs *engine)
> {
> struct intel_engine_execlists * const execlists = &engine->execlists;
>
> @@ -861,8 +860,6 @@ guc_reset_prepare(struct intel_engine_cs *engine)
> */
> if (engine->i915->guc.preempt_wq)
> flush_workqueue(engine->i915->guc.preempt_wq);
> -
> - return i915_gem_find_active_request(engine);
> }
>
> /*
> diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
> index 741441daae32..5662d6fed523 100644
> --- a/drivers/gpu/drm/i915/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/intel_hangcheck.c
> @@ -25,6 +25,17 @@
> #include "i915_drv.h"
> #include "i915_reset.h"
>
> +struct hangcheck {
> + u64 acthd;
> + u32 seqno;
> + enum intel_engine_hangcheck_action action;
> + unsigned long action_timestamp;
> + int deadlock;
> + struct intel_instdone instdone;
> + bool wedged:1;
> + bool stalled:1;
> +};
> +
> static bool instdone_unchanged(u32 current_instdone, u32 *old_instdone)
> {
> u32 tmp = current_instdone | *old_instdone;
> @@ -119,25 +130,22 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
> }
>
> static void hangcheck_load_sample(struct intel_engine_cs *engine,
> - struct intel_engine_hangcheck *hc)
> + struct hangcheck *hc)
> {
> hc->acthd = intel_engine_get_active_head(engine);
> hc->seqno = intel_engine_get_seqno(engine);
> }
>
> static void hangcheck_store_sample(struct intel_engine_cs *engine,
> - const struct intel_engine_hangcheck *hc)
> + const struct hangcheck *hc)
> {
> engine->hangcheck.acthd = hc->acthd;
> engine->hangcheck.seqno = hc->seqno;
> - engine->hangcheck.action = hc->action;
> - engine->hangcheck.stalled = hc->stalled;
> - engine->hangcheck.wedged = hc->wedged;
> }
>
> static enum intel_engine_hangcheck_action
> hangcheck_get_action(struct intel_engine_cs *engine,
> - const struct intel_engine_hangcheck *hc)
> + const struct hangcheck *hc)
> {
> if (engine->hangcheck.seqno != hc->seqno)
> return ENGINE_ACTIVE_SEQNO;
> @@ -149,7 +157,7 @@ hangcheck_get_action(struct intel_engine_cs *engine,
> }
>
> static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
> - struct intel_engine_hangcheck *hc)
> + struct hangcheck *hc)
> {
> unsigned long timeout = I915_ENGINE_DEAD_TIMEOUT;
>
> @@ -265,19 +273,19 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
> intel_uncore_arm_unclaimed_mmio_detection(dev_priv);
>
> for_each_engine(engine, dev_priv, id) {
> - struct intel_engine_hangcheck hc;
> + struct hangcheck hc;
>
> hangcheck_load_sample(engine, &hc);
> hangcheck_accumulate_sample(engine, &hc);
> hangcheck_store_sample(engine, &hc);
>
> - if (engine->hangcheck.stalled) {
> + if (hc.stalled) {
> hung |= intel_engine_flag(engine);
> if (hc.action != ENGINE_DEAD)
> stuck |= intel_engine_flag(engine);
> }
>
> - if (engine->hangcheck.wedged)
> + if (hc.wedged)
> wedged |= intel_engine_flag(engine);
> }
>
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 28d183439952..c11cbf34258d 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -136,6 +136,7 @@
> #include <drm/i915_drm.h>
> #include "i915_drv.h"
> #include "i915_gem_render_state.h"
> +#include "i915_reset.h"
> #include "i915_vgpu.h"
> #include "intel_lrc_reg.h"
> #include "intel_mocs.h"
> @@ -288,7 +289,8 @@ static void unwind_wa_tail(struct i915_request *rq)
> assert_ring_tail_valid(rq->ring, rq->tail);
> }
>
> -static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
> +static struct i915_request *
> +__unwind_incomplete_requests(struct intel_engine_cs *engine)
> {
> struct i915_request *rq, *rn, *active = NULL;
> struct list_head *uninitialized_var(pl);
> @@ -330,6 +332,8 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
> list_move_tail(&active->sched.link,
> i915_sched_lookup_priolist(engine, prio));
> }
> +
> + return active;
> }
>
> void
> @@ -1743,11 +1747,9 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
> return 0;
> }
>
> -static struct i915_request *
> -execlists_reset_prepare(struct intel_engine_cs *engine)
> +static void execlists_reset_prepare(struct intel_engine_cs *engine)
> {
> struct intel_engine_execlists * const execlists = &engine->execlists;
> - struct i915_request *request, *active;
> unsigned long flags;
>
> GEM_TRACE("%s: depth<-%d\n", engine->name,
> @@ -1763,59 +1765,21 @@ execlists_reset_prepare(struct intel_engine_cs *engine)
> * prevents the race.
> */
> __tasklet_disable_sync_once(&execlists->tasklet);
> + GEM_BUG_ON(!reset_in_progress(execlists));
>
> + /* And flush any current direct submission. */
> spin_lock_irqsave(&engine->timeline.lock, flags);
> -
> - /*
> - * We want to flush the pending context switches, having disabled
> - * the tasklet above, we can assume exclusive access to the execlists.
> - * For this allows us to catch up with an inflight preemption event,
> - * and avoid blaming an innocent request if the stall was due to the
> - * preemption itself.
> - */
> - process_csb(engine);
> -
> - /*
> - * The last active request can then be no later than the last request
> - * now in ELSP[0]. So search backwards from there, so that if the GPU
> - * has advanced beyond the last CSB update, it will be pardoned.
> - */
> - active = NULL;
> - request = port_request(execlists->port);
> - if (request) {
> - /*
> - * Prevent the breadcrumb from advancing before we decide
> - * which request is currently active.
> - */
> - intel_engine_stop_cs(engine);
> -
> - list_for_each_entry_from_reverse(request,
> - &engine->timeline.requests,
> - link) {
> - if (__i915_request_completed(request,
> - request->global_seqno))
> - break;
> -
> - active = request;
> - }
> - }
> -
> + process_csb(engine); /* drain preemption events */
> spin_unlock_irqrestore(&engine->timeline.lock, flags);
> -
> - return active;
> }
>
> -static void execlists_reset(struct intel_engine_cs *engine,
> - struct i915_request *request)
> +static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
> {
> struct intel_engine_execlists * const execlists = &engine->execlists;
> + struct i915_request *rq;
> unsigned long flags;
> u32 *regs;
>
> - GEM_TRACE("%s request global=%d, current=%d\n",
> - engine->name, request ? request->global_seqno : 0,
> - intel_engine_get_seqno(engine));
> -
> spin_lock_irqsave(&engine->timeline.lock, flags);
>
> /*
> @@ -1830,12 +1794,18 @@ static void execlists_reset(struct intel_engine_cs *engine,
> execlists_cancel_port_requests(execlists);
>
> /* Push back any incomplete requests for replay after the reset. */
> - __unwind_incomplete_requests(engine);
> + rq = __unwind_incomplete_requests(engine);
>
> /* Following the reset, we need to reload the CSB read/write pointers */
> reset_csb_pointers(&engine->execlists);
>
> - spin_unlock_irqrestore(&engine->timeline.lock, flags);
> + GEM_TRACE("%s seqno=%d, current=%d, stalled? %s\n",
> + engine->name,
> + rq ? rq->global_seqno : 0,
> + intel_engine_get_seqno(engine),
> + yesno(stalled));
> + if (!rq)
> + goto out_unlock;
>
> /*
> * If the request was innocent, we leave the request in the ELSP
> @@ -1848,8 +1818,9 @@ static void execlists_reset(struct intel_engine_cs *engine,
> * and have to at least restore the RING register in the context
> * image back to the expected values to skip over the guilty request.
> */
> - if (!request || request->fence.error != -EIO)
> - return;
> + i915_reset_request(rq, stalled);
> + if (!stalled)
> + goto out_unlock;
>
> /*
> * We want a simple context + ring to execute the breadcrumb update.
> @@ -1859,25 +1830,23 @@ static void execlists_reset(struct intel_engine_cs *engine,
> * future request will be after userspace has had the opportunity
> * to recreate its own state.
> */
> - regs = request->hw_context->lrc_reg_state;
> + regs = rq->hw_context->lrc_reg_state;
> if (engine->pinned_default_state) {
> memcpy(regs, /* skip restoring the vanilla PPHWSP */
> engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
> engine->context_size - PAGE_SIZE);
> }
> - execlists_init_reg_state(regs,
> - request->gem_context, engine, request->ring);
> + execlists_init_reg_state(regs, rq->gem_context, engine, rq->ring);
>
> /* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
> - regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(request->ring->vma);
> -
> - request->ring->head = intel_ring_wrap(request->ring, request->postfix);
> - regs[CTX_RING_HEAD + 1] = request->ring->head;
> + regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(rq->ring->vma);
>
> - intel_ring_update_space(request->ring);
> + rq->ring->head = intel_ring_wrap(rq->ring, rq->postfix);
> + regs[CTX_RING_HEAD + 1] = rq->ring->head;
> + intel_ring_update_space(rq->ring);
>
> - /* Reset WaIdleLiteRestore:bdw,skl as well */
> - unwind_wa_tail(request);
> +out_unlock:
> + spin_unlock_irqrestore(&engine->timeline.lock, flags);
> }
>
> static void execlists_reset_finish(struct intel_engine_cs *engine)
> @@ -1890,6 +1859,7 @@ static void execlists_reset_finish(struct intel_engine_cs *engine)
> * to sleep before we restart and reload a context.
> *
> */
> + GEM_BUG_ON(!reset_in_progress(execlists));
> if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
> execlists->tasklet.func(execlists->tasklet.data);
>
> diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c
> index c81db81e4416..f68c7975006c 100644
> --- a/drivers/gpu/drm/i915/intel_overlay.c
> +++ b/drivers/gpu/drm/i915/intel_overlay.c
> @@ -478,8 +478,6 @@ void intel_overlay_reset(struct drm_i915_private *dev_priv)
> if (!overlay)
> return;
>
> - intel_overlay_release_old_vid(overlay);
> -
How do we compensate for this?
> overlay->old_xscale = 0;
> overlay->old_yscale = 0;
> overlay->crtc = NULL;
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 26b7274a2d43..662907e1a286 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -33,6 +33,7 @@
>
> #include "i915_drv.h"
> #include "i915_gem_render_state.h"
> +#include "i915_reset.h"
> #include "i915_trace.h"
> #include "intel_drv.h"
> #include "intel_workarounds.h"
> @@ -707,52 +708,80 @@ static int init_ring_common(struct intel_engine_cs *engine)
> return ret;
> }
>
> -static struct i915_request *reset_prepare(struct intel_engine_cs *engine)
> +static void reset_prepare(struct intel_engine_cs *engine)
> {
> intel_engine_stop_cs(engine);
> - return i915_gem_find_active_request(engine);
> }
>
> -static void skip_request(struct i915_request *rq)
> +static void reset_ring(struct intel_engine_cs *engine, bool stalled)
> {
> - void *vaddr = rq->ring->vaddr;
> + struct i915_timeline *tl = &engine->timeline;
> + struct i915_request *pos, *rq;
> + unsigned long flags;
> u32 head;
>
> - head = rq->infix;
> - if (rq->postfix < head) {
> - memset32(vaddr + head, MI_NOOP,
> - (rq->ring->size - head) / sizeof(u32));
> - head = 0;
> + rq = NULL;
> + spin_lock_irqsave(&tl->lock, flags);
> + list_for_each_entry(pos, &tl->requests, link) {
> + if (!__i915_request_completed(pos, pos->global_seqno)) {
> + rq = pos;
> + break;
> + }
> }
> - memset32(vaddr + head, MI_NOOP, (rq->postfix - head) / sizeof(u32));
> -}
> -
> -static void reset_ring(struct intel_engine_cs *engine, struct i915_request *rq)
> -{
> - GEM_TRACE("%s request global=%d, current=%d\n",
> - engine->name, rq ? rq->global_seqno : 0,
> - intel_engine_get_seqno(engine));
>
> + GEM_TRACE("%s seqno=%d, current=%d, stalled? %s\n",
> + engine->name,
> + rq ? rq->global_seqno : 0,
> + intel_engine_get_seqno(engine),
> + yesno(stalled));
> /*
> - * Try to restore the logical GPU state to match the continuation
> - * of the request queue. If we skip the context/PD restore, then
> - * the next request may try to execute assuming that its context
> - * is valid and loaded on the GPU and so may try to access invalid
> - * memory, prompting repeated GPU hangs.
> + * The guilty request will get skipped on a hung engine.
> *
> - * If the request was guilty, we still restore the logical state
> - * in case the next request requires it (e.g. the aliasing ppgtt),
> - * but skip over the hung batch.
> + * Users of client default contexts do not rely on logical
> + * state preserved between batches so it is safe to execute
> + * queued requests following the hang. Non-default contexts
> + * rely on preserved state, so skipping a batch loses the
> + * evolution of the state and it must be considered corrupted.
> + * Executing more queued batches on top of corrupted state is
> + * risky. But we take the risk by trying to advance through
> + * the queued requests in order to make the client behaviour
> + * more predictable around resets, by not throwing away a random
> + * amount of the batches it has prepared for execution. Sophisticated
> + * clients can use gem_reset_stats_ioctl and dma fence status
> + * (exported via the sync_file info ioctl on explicit fences) to observe
> + * when they lose the context state and should rebuild accordingly.
> *
> - * If the request was innocent, we try to replay the request with
> - * the restored context.
> + * The context ban, and ultimately the client ban, mechanisms are
> + * safety valves if client submission ends up resulting in nothing
> + * more than subsequent hangs.
> */
> +
> if (rq) {
> - /* If the rq hung, jump to its breadcrumb and skip the batch */
> - rq->ring->head = intel_ring_wrap(rq->ring, rq->head);
> - if (rq->fence.error == -EIO)
> - skip_request(rq);
> + /*
> + * Try to restore the logical GPU state to match the
> + * continuation of the request queue. If we skip the
> + * context/PD restore, then the next request may try to execute
> + * assuming that its context is valid and loaded on the GPU and
> + * so may try to access invalid memory, prompting repeated GPU
> + * hangs.
> + *
> + * If the request was guilty, we still restore the logical
> + * state in case the next request requires it (e.g. the
> + * aliasing ppgtt), but skip over the hung batch.
> + *
> + * If the request was innocent, we try to replay the request
> + * with the restored context.
> + */
> + i915_reset_request(rq, stalled);
> +
> + GEM_BUG_ON(rq->ring != engine->buffer);
> + head = rq->head;
> + } else {
> + head = engine->buffer->tail;
> }
> + engine->buffer->head = intel_ring_wrap(engine->buffer, head);
> +
> + spin_unlock_irqrestore(&tl->lock, flags);
> }
>
> static void reset_finish(struct intel_engine_cs *engine)
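The comment block above tells sophisticated clients to watch the reset
stats. For reference, a sketch of that observation from userspace via the
existing DRM_IOCTL_I915_GET_RESET_STATS uapi (header paths and error
handling simplified here):

    #include <sys/ioctl.h>
    #include <drm/i915_drm.h>

    /* Returns >0 if ctx_id had a batch active or pending across a hang,
     * i.e. its logical state should be treated as lost and rebuilt.
     */
    static int context_lost_state(int drm_fd, __u32 ctx_id)
    {
        struct drm_i915_reset_stats stats = { .ctx_id = ctx_id };

        if (ioctl(drm_fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats))
            return -1; /* errno set by the ioctl */

        return stats.batch_active || stats.batch_pending;
    }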
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index c3ef0f9bf321..32ed44196c1a 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -120,13 +120,8 @@ struct intel_instdone {
> struct intel_engine_hangcheck {
> u64 acthd;
> u32 seqno;
> - enum intel_engine_hangcheck_action action;
> unsigned long action_timestamp;
> - int deadlock;
> struct intel_instdone instdone;
> - struct i915_request *active_request;
> - bool stalled:1;
> - bool wedged:1;
> };
>
> struct intel_ring {
> @@ -444,9 +439,8 @@ struct intel_engine_cs {
> int (*init_hw)(struct intel_engine_cs *engine);
>
> struct {
> - struct i915_request *(*prepare)(struct intel_engine_cs *engine);
> - void (*reset)(struct intel_engine_cs *engine,
> - struct i915_request *rq);
> + void (*prepare)(struct intel_engine_cs *engine);
> + void (*reset)(struct intel_engine_cs *engine, bool stalled);
> void (*finish)(struct intel_engine_cs *engine);
> } reset;
>
> @@ -1018,6 +1012,13 @@ gen8_emit_ggtt_write(u32 *cs, u32 value, u32 gtt_offset)
> return cs;
> }
>
> +static inline void intel_engine_reset(struct intel_engine_cs *engine,
> + bool stalled)
> +{
> + if (engine->reset.reset)
> + engine->reset.reset(engine, stalled);
> +}
> +
> void intel_engines_sanitize(struct drm_i915_private *i915, bool force);
>
> bool intel_engine_is_idle(struct intel_engine_cs *engine);
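To make the reworked vfunc split concrete, the per-engine sequence the
core reset path is expected to drive is prepare -> reset -> finish. A
sketch of the intended call order (not the literal i915_reset.c code,
which also performs the device-level reset in between):

    engine->reset.prepare(engine);       /* stop submission, park backend */
    /* ... intel_gpu_reset() pulls the hardware reset here ... */
    intel_engine_reset(engine, stalled); /* rewind ring, skip guilty batch */
    engine->reset.finish(engine);        /* restart submission */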
> diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> index 12550b55c42f..67431355cd6e 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> @@ -363,9 +363,7 @@ static int igt_global_reset(void *arg)
> /* Check that we can issue a global GPU reset */
>
> igt_global_reset_lock(i915);
> - set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
>
> - mutex_lock(&i915->drm.struct_mutex);
> reset_count = i915_reset_count(&i915->gpu_error);
>
> i915_reset(i915, ALL_ENGINES, NULL);
> @@ -374,9 +372,7 @@ static int igt_global_reset(void *arg)
> pr_err("No GPU reset recorded!\n");
> err = -EINVAL;
> }
> - mutex_unlock(&i915->drm.struct_mutex);
>
> - GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
> igt_global_reset_unlock(i915);
>
> if (i915_terminally_wedged(&i915->gpu_error))
> @@ -399,9 +395,7 @@ static int igt_wedged_reset(void *arg)
> i915_gem_set_wedged(i915);
> GEM_BUG_ON(!i915_terminally_wedged(&i915->gpu_error));
>
> - set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
> i915_reset(i915, ALL_ENGINES, NULL);
> - GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
>
> intel_runtime_pm_put(i915, wakeref);
> mutex_unlock(&i915->drm.struct_mutex);
> @@ -511,7 +505,7 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
> break;
> }
>
> - if (!wait_for_idle(engine)) {
> + if (!i915_reset_flush(i915)) {
> struct drm_printer p =
> drm_info_printer(i915->drm.dev);
>
> @@ -903,20 +897,13 @@ static int igt_reset_engines(void *arg)
> return 0;
> }
>
> -static u32 fake_hangcheck(struct i915_request *rq, u32 mask)
> +static u32 fake_hangcheck(struct drm_i915_private *i915, u32 mask)
> {
> - struct i915_gpu_error *error = &rq->i915->gpu_error;
> - u32 reset_count = i915_reset_count(error);
> -
> - error->stalled_mask = mask;
> -
> - /* set_bit() must be after we have setup the backchannel (mask) */
> - smp_mb__before_atomic();
> - set_bit(I915_RESET_HANDOFF, &error->flags);
> + u32 count = i915_reset_count(&i915->gpu_error);
>
> - wake_up_all(&error->wait_queue);
> + i915_reset(i915, mask, NULL);
>
> - return reset_count;
> + return count;
> }
>
> static int igt_reset_wait(void *arg)
> @@ -962,7 +949,7 @@ static int igt_reset_wait(void *arg)
> goto out_rq;
> }
>
> - reset_count = fake_hangcheck(rq, ALL_ENGINES);
> + reset_count = fake_hangcheck(i915, ALL_ENGINES);
>
> timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
> if (timeout < 0) {
> @@ -972,7 +959,6 @@ static int igt_reset_wait(void *arg)
> goto out_rq;
> }
>
> - GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
> if (i915_reset_count(&i915->gpu_error) == reset_count) {
> pr_err("No GPU reset recorded!\n");
> err = -EINVAL;
> @@ -1162,7 +1148,7 @@ static int __igt_reset_evict_vma(struct drm_i915_private *i915,
> }
>
> out_reset:
> - fake_hangcheck(rq, intel_engine_flag(rq->engine));
> + fake_hangcheck(rq->i915, intel_engine_flag(rq->engine));
>
> if (tsk) {
> struct igt_wedge_me w;
> @@ -1341,12 +1327,7 @@ static int igt_reset_queue(void *arg)
> goto fini;
> }
>
> - reset_count = fake_hangcheck(prev, ENGINE_MASK(id));
> -
> - i915_reset(i915, ENGINE_MASK(id), NULL);
> -
> - GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
> - &i915->gpu_error.flags));
> + reset_count = fake_hangcheck(i915, ENGINE_MASK(id));
>
> if (prev->fence.error != -EIO) {
> pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
> @@ -1565,6 +1546,7 @@ static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
> pr_err("%s(%s): Failed to start request %llx, at %x\n",
> __func__, engine->name,
> rq->fence.seqno, hws_seqno(&h, rq));
> + i915_gem_set_wedged(i915);
> err = -EIO;
> }
>
> @@ -1588,7 +1570,6 @@ static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
> static void force_reset(struct drm_i915_private *i915)
> {
> i915_gem_set_wedged(i915);
> - set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
> i915_reset(i915, 0, NULL);
> }
>
> @@ -1618,6 +1599,26 @@ static int igt_atomic_reset(void *arg)
> if (i915_terminally_wedged(&i915->gpu_error))
> goto unlock;
>
> + if (intel_has_gpu_reset(i915)) {
> + const typeof(*phases) *p;
> +
> + for (p = phases; p->name; p++) {
> + GEM_TRACE("intel_gpu_reset under %s\n", p->name);
> +
> + p->critical_section_begin();
> + err = intel_gpu_reset(i915, ALL_ENGINES);
> + p->critical_section_end();
> +
> + if (err) {
> + pr_err("intel_gpu_reset failed under %s\n",
> + p->name);
> + goto out;
> + }
> + }
> +
> + force_reset(i915);
> + }
> +
> if (intel_has_reset_engine(i915)) {
> struct intel_engine_cs *engine;
> enum intel_engine_id id;
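The phases table is not visible in this hunk; judging from the fields the
loop touches, it presumably looks something like the sketch below (the
names here are my guess, not necessarily what the selftest defines):

    /* Hypothetical shape of the table driving the loop above. */
    struct atomic_phase {
        const char *name;
        void (*critical_section_begin)(void);
        void (*critical_section_end)(void);
    };

    static void irq_off(void) { local_irq_disable(); }
    static void irq_on(void) { local_irq_enable(); }

    static const struct atomic_phase phases[] = {
        { "hardirq", irq_off, irq_on },
        { } /* NULL name terminates the walk */
    };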
> diff --git a/drivers/gpu/drm/i915/selftests/intel_workarounds.c b/drivers/gpu/drm/i915/selftests/intel_workarounds.c
> index a8cac56be835..b15c4f26c593 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_workarounds.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_workarounds.c
> @@ -214,7 +214,6 @@ static int check_whitelist(struct i915_gem_context *ctx,
>
> static int do_device_reset(struct intel_engine_cs *engine)
> {
> - set_bit(I915_RESET_HANDOFF, &engine->i915->gpu_error.flags);
> i915_reset(engine->i915, ENGINE_MASK(engine->id), "live_workarounds");
> return 0;
> }
> @@ -394,7 +393,6 @@ static int
> live_gpu_reset_gt_engine_workarounds(void *arg)
> {
> struct drm_i915_private *i915 = arg;
> - struct i915_gpu_error *error = &i915->gpu_error;
> intel_wakeref_t wakeref;
> struct wa_lists lists;
> bool ok;
> @@ -413,7 +411,6 @@ live_gpu_reset_gt_engine_workarounds(void *arg)
> if (!ok)
> goto out;
>
> - set_bit(I915_RESET_HANDOFF, &error->flags);
> i915_reset(i915, ALL_ENGINES, "live_workarounds");
>
> ok = verify_gt_engine_wa(i915, &lists, "after reset");
> diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> index 5477ad4a7e7d..8ab5a2688a0c 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> @@ -58,8 +58,8 @@ static void mock_device_release(struct drm_device *dev)
> i915_gem_contexts_lost(i915);
> mutex_unlock(&i915->drm.struct_mutex);
>
> - cancel_delayed_work_sync(&i915->gt.retire_work);
> - cancel_delayed_work_sync(&i915->gt.idle_work);
> + drain_delayed_work(&i915->gt.retire_work);
> + drain_delayed_work(&i915->gt.idle_work);
> i915_gem_drain_workqueue(i915);
>
> mutex_lock(&i915->drm.struct_mutex);
> --
> 2.20.1
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx