[PATCH 28/29] reset-mutexless
Chris Wilson
chris@chris-wilson.co.uk
Fri Jun 22 13:08:52 UTC 2018
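
This posting carries no commit message body; the summary below is
reconstructed from the diff itself.

Rather than handing the GPU reset off to whichever waiter happens to
hold struct_mutex (the I915_RESET_HANDOFF dance), perform the reset
directly from the error-handling worker, without taking struct_mutex
at all.

The per-request consequences of a reset move into i915_request.c as
i915_request_guilty() and i915_request_innocent(): a guilty request
raises its context's ban score and is skipped with -EIO (along with
the context's queued requests if the context is now banned), while an
innocent request is marked -EAGAIN and left to be replayed. The
engine->reset vfuncs lose the active-request plumbing: reset.prepare()
no longer returns a request, and reset.reset() takes a stalled flag
and finds the active request itself under the timeline lock.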
---
drivers/gpu/drm/i915/i915_debugfs.c | 7 -
drivers/gpu/drm/i915/i915_drv.h | 5 -
drivers/gpu/drm/i915/i915_gpu_error.h | 18 +-
drivers/gpu/drm/i915/i915_request.c | 157 ++++++---
drivers/gpu/drm/i915/i915_request.h | 3 +
drivers/gpu/drm/i915/i915_reset.c | 301 ++----------------
drivers/gpu/drm/i915/intel_engine_cs.c | 6 +-
drivers/gpu/drm/i915/intel_guc_submission.c | 5 +-
drivers/gpu/drm/i915/intel_lrc.c | 98 ++----
drivers/gpu/drm/i915/intel_overlay.c | 2 -
drivers/gpu/drm/i915/intel_ringbuffer.c | 85 ++++-
drivers/gpu/drm/i915/intel_ringbuffer.h | 13 +-
.../gpu/drm/i915/selftests/intel_hangcheck.c | 27 +-
13 files changed, 261 insertions(+), 466 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index d2c8e9edd0d5..06ae43132b55 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1317,8 +1317,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
seq_puts(m, "Wedged\n");
if (test_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags))
seq_puts(m, "Reset in progress: struct_mutex backoff\n");
- if (test_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags))
- seq_puts(m, "Reset in progress: reset handoff to waiter\n");
if (waitqueue_active(&dev_priv->gpu_error.wait_queue))
seq_puts(m, "Waiter holding struct mutex\n");
if (waitqueue_active(&dev_priv->gpu_error.reset_queue))
@@ -4061,11 +4059,6 @@ i915_wedged_set(void *data, u64 val)
i915_handle_error(i915, val, I915_ERROR_CAPTURE,
"Manually set wedged engine mask = %llx", val);
-
- wait_on_bit(&i915->gpu_error.flags,
- I915_RESET_HANDOFF,
- TASK_UNINTERRUPTIBLE);
-
return 0;
}
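
With the handoff bit gone, i915_wedged_set() no longer has to wait for
a struct_mutex holder to pick up the reset before returning to
userspace. A sketch of the resulting function, reconstructed from the
hunk's context (the local variable is an assumption, not shown in the
diff):

	static int
	i915_wedged_set(void *data, u64 val)
	{
		struct drm_i915_private *i915 = data; /* assumed from context */

		/* i915_handle_error() now performs the reset inline from the
		 * error worker, so there is no I915_RESET_HANDOFF bit left
		 * to wait on. */
		i915_handle_error(i915, val, I915_ERROR_CAPTURE,
				  "Manually set wedged engine mask = %llx", val);
		return 0;
	}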
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index a4f52c6b8778..2059d7ffc1f9 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3126,11 +3126,6 @@ static inline bool i915_reset_backoff(struct i915_gpu_error *error)
return unlikely(test_bit(I915_RESET_BACKOFF, &error->flags));
}
-static inline bool i915_reset_handoff(struct i915_gpu_error *error)
-{
- return unlikely(test_bit(I915_RESET_HANDOFF, &error->flags));
-}
-
static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
{
return unlikely(test_bit(I915_WEDGED, &error->flags));
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
index f893a4e8b783..a709b5247127 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -243,15 +243,6 @@ struct i915_gpu_error {
* i915_mutex_lock_interruptible()?). I915_RESET_BACKOFF serves a
* secondary role in preventing two concurrent global reset attempts.
*
- * #I915_RESET_HANDOFF - To perform the actual GPU reset, we need the
- * struct_mutex. We try to acquire the struct_mutex in the reset worker,
- * but it may be held by some long running waiter (that we cannot
- * interrupt without causing trouble). Once we are ready to do the GPU
- * reset, we set the I915_RESET_HANDOFF bit and wakeup any waiters. If
- * they already hold the struct_mutex and want to participate they can
- * inspect the bit and do the reset directly, otherwise the worker
- * waits for the struct_mutex.
- *
* #I915_RESET_ENGINE[num_engines] - Since the driver doesn't need to
* acquire the struct_mutex to reset an engine, we need an explicit
* flag to prevent two concurrent reset attempts in the same engine.
@@ -265,20 +256,13 @@ struct i915_gpu_error {
*/
unsigned long flags;
#define I915_RESET_BACKOFF 0
-#define I915_RESET_HANDOFF 1
-#define I915_RESET_MODESET 2
+#define I915_RESET_MODESET 1
#define I915_WEDGED (BITS_PER_LONG - 1)
#define I915_RESET_ENGINE (I915_WEDGED - I915_NUM_ENGINES)
/** Number of times an engine has been reset */
u32 reset_engine_count[I915_NUM_ENGINES];
- /** Set of stalled engines with guilty requests, in the current reset */
- u32 stalled_mask;
-
- /** Reason for the current *global* reset */
- const char *reason;
-
/**
* Waitqueue to signal when a hang is detected. Used for waiters
* to release the struct_mutex for the reset to proceed.
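
For reference, the flag word now packs the remaining bits as below;
the helper is hypothetical, shown only to illustrate how the
renumbered per-engine bits are tested (compare the GEM_BUG_ON in
i915_reset_engine() later in this patch):

	/* bit 0            I915_RESET_BACKOFF
	 * bit 1            I915_RESET_MODESET
	 * bits [W-N, W-1]  I915_RESET_ENGINE + engine->id
	 * bit  W           I915_WEDGED (W == BITS_PER_LONG - 1)
	 */
	static inline bool
	example_reset_engine_pending(struct i915_gpu_error *error,
				     unsigned int id)
	{
		return test_bit(I915_RESET_ENGINE + id, &error->flags);
	}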
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 1c5771aaa82e..b75dfceec704 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -1223,18 +1223,6 @@ static bool __i915_spin_request(const struct i915_request *rq,
return false;
}
-static bool __i915_wait_request_check_and_reset(struct i915_request *request)
-{
- struct i915_gpu_error *error = &request->i915->gpu_error;
-
- if (likely(!i915_reset_handoff(error)))
- return false;
-
- __set_current_state(TASK_RUNNING);
- i915_reset(request->i915, error->stalled_mask, error->reason);
- return true;
-}
-
/**
* i915_request_wait - wait until execution of request has finished
* @rq: the request to wait upon
@@ -1260,8 +1248,6 @@ long i915_request_wait(struct i915_request *rq,
{
const int state = flags & I915_WAIT_INTERRUPTIBLE ?
TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
- wait_queue_head_t *errq = &rq->i915->gpu_error.wait_queue;
- DEFINE_WAIT_FUNC(reset, default_wake_function);
DEFINE_WAIT_FUNC(exec, default_wake_function);
struct intel_wait wait;
@@ -1280,10 +1266,7 @@ long i915_request_wait(struct i915_request *rq,
return -ETIME;
trace_i915_request_wait_begin(rq, flags);
-
add_wait_queue(&rq->execute, &exec);
- if (flags & I915_WAIT_LOCKED)
- add_wait_queue(errq, &reset);
intel_wait_init(&wait);
@@ -1293,10 +1276,6 @@ long i915_request_wait(struct i915_request *rq,
if (intel_wait_update_request(&wait, rq))
break;
- if (flags & I915_WAIT_LOCKED &&
- __i915_wait_request_check_and_reset(rq))
- continue;
-
if (signal_pending_state(state, current)) {
timeout = -ERESTARTSYS;
goto complete;
@@ -1326,9 +1305,6 @@ long i915_request_wait(struct i915_request *rq,
*/
goto wakeup;
- if (flags & I915_WAIT_LOCKED)
- __i915_wait_request_check_and_reset(rq);
-
for (;;) {
if (signal_pending_state(state, current)) {
timeout = -ERESTARTSYS;
@@ -1358,21 +1334,6 @@ long i915_request_wait(struct i915_request *rq,
if (__i915_request_irq_complete(rq))
break;
- /*
- * If the GPU is hung, and we hold the lock, reset the GPU
- * and then check for completion. On a full reset, the engine's
- * HW seqno will be advanced passed us and we are complete.
- * If we do a partial reset, we have to wait for the GPU to
- * resume and update the breadcrumb.
- *
- * If we don't hold the mutex, we can just wait for the worker
- * to come along and update the breadcrumb (either directly
- * itself, or indirectly by recovering the GPU).
- */
- if (flags & I915_WAIT_LOCKED &&
- __i915_wait_request_check_and_reset(rq))
- continue;
-
/* Only spin if we know the GPU is processing this request */
if (__i915_spin_request(rq, wait.seqno, state, 2))
break;
@@ -1386,8 +1347,6 @@ long i915_request_wait(struct i915_request *rq,
intel_engine_remove_wait(rq->engine, &wait);
complete:
__set_current_state(TASK_RUNNING);
- if (flags & I915_WAIT_LOCKED)
- remove_wait_queue(errq, &reset);
remove_wait_queue(&rq->execute, &exec);
trace_i915_request_wait_end(rq);
@@ -1420,6 +1379,122 @@ void i915_retire_requests(struct drm_i915_private *i915)
ring_retire_requests(ring);
}
+static void skip_request(struct i915_request *request)
+{
+ void *vaddr = request->ring->vaddr;
+ u32 head;
+
+ /* As this request likely depends on state from the lost
+ * context, clear out all the user operations leaving the
+ * breadcrumb at the end (so we get the fence notifications).
+ */
+ head = request->head;
+ if (request->postfix < head) {
+ memset(vaddr + head, 0, request->ring->size - head);
+ head = 0;
+ }
+ memset(vaddr + head, 0, request->postfix - head);
+
+ dma_fence_set_error(&request->fence, -EIO);
+}
+
+static void engine_skip_context(struct i915_request *request)
+{
+ struct intel_engine_cs *engine = request->engine;
+ struct i915_gem_context *hung_ctx = request->gem_context;
+ struct i915_timeline *timeline = request->timeline;
+ unsigned long flags;
+
+ GEM_BUG_ON(timeline == &engine->timeline);
+
+ spin_lock_irqsave(&engine->timeline.lock, flags);
+ spin_lock_nested(&timeline->lock, SINGLE_DEPTH_NESTING);
+
+ list_for_each_entry_continue(request, &engine->timeline.requests, link)
+ if (request->gem_context == hung_ctx)
+ skip_request(request);
+
+ list_for_each_entry(request, &timeline->requests, link)
+ skip_request(request);
+
+ spin_unlock(&timeline->lock);
+ spin_unlock_irqrestore(&engine->timeline.lock, flags);
+}
+
+static void client_mark_guilty(struct drm_i915_file_private *file_priv,
+ const struct i915_gem_context *ctx)
+{
+ unsigned int score;
+ unsigned long prev_hang;
+
+ if (i915_gem_context_is_banned(ctx))
+ score = I915_CLIENT_SCORE_CONTEXT_BAN;
+ else
+ score = 0;
+
+ prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
+ if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
+ score += I915_CLIENT_SCORE_HANG_FAST;
+
+ if (score) {
+ atomic_add(score, &file_priv->ban_score);
+
+ DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
+ ctx->name, score,
+ atomic_read(&file_priv->ban_score));
+ }
+}
+
+static bool context_mark_guilty(struct i915_gem_context *ctx)
+{
+ unsigned int score;
+ bool banned, bannable;
+
+ atomic_inc(&ctx->guilty_count);
+
+ bannable = i915_gem_context_is_bannable(ctx);
+ score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
+ banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
+
+ /* Cool contexts don't accumulate client ban score */
+ if (!bannable)
+ return false;
+
+ if (banned) {
+ DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
+ ctx->name, atomic_read(&ctx->guilty_count),
+ score);
+ i915_gem_context_set_banned(ctx);
+ }
+
+ if (!IS_ERR_OR_NULL(ctx->file_priv))
+ client_mark_guilty(ctx->file_priv, ctx);
+
+ return banned;
+}
+
+void i915_request_guilty(struct i915_request *rq)
+{
+ GEM_BUG_ON(i915_request_completed(rq));
+ dma_fence_set_error(&rq->fence, -EIO);
+
+ if (context_mark_guilty(rq->gem_context))
+ engine_skip_context(rq);
+}
+
+static void context_mark_innocent(struct i915_gem_context *ctx)
+{
+ atomic_inc(&ctx->active_count);
+}
+
+void i915_request_innocent(struct i915_request *rq)
+{
+ GEM_BUG_ON(i915_request_completed(rq));
+ dma_fence_set_error(&rq->fence, -EAGAIN);
+
+ context_mark_innocent(rq->gem_context);
+}
+
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftests/mock_request.c"
#include "selftests/i915_request.c"
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index 9a6490001625..9d00942ba985 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -272,6 +272,9 @@ long i915_request_wait(struct i915_request *rq,
#define I915_WAIT_ALL BIT(2) /* used by i915_gem_object_wait() */
#define I915_WAIT_FOR_IDLE_BOOST BIT(3)
+void i915_request_guilty(struct i915_request *rq);
+void i915_request_innocent(struct i915_request *rq);
+
static inline u32 intel_engine_get_seqno(struct intel_engine_cs *engine);
/**
diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index 60aecacf3ce5..9256e1a62419 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -396,70 +396,12 @@ int intel_reset_guc(struct drm_i915_private *i915)
return ret;
}
-static void i915_gem_client_mark_guilty(struct drm_i915_file_private *file_priv,
- const struct i915_gem_context *ctx)
-{
- unsigned int score;
- unsigned long prev_hang;
-
- if (i915_gem_context_is_banned(ctx))
- score = I915_CLIENT_SCORE_CONTEXT_BAN;
- else
- score = 0;
-
- prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
- if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
- score += I915_CLIENT_SCORE_HANG_FAST;
-
- if (score) {
- atomic_add(score, &file_priv->ban_score);
-
- DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
- ctx->name, score,
- atomic_read(&file_priv->ban_score));
- }
-}
-
-static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
-{
- unsigned int score;
- bool banned, bannable;
-
- atomic_inc(&ctx->guilty_count);
-
- bannable = i915_gem_context_is_bannable(ctx);
- score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
- banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
-
- /* Cool contexts don't accumulate client ban score */
- if (!bannable)
- return;
-
- if (banned) {
- DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
- ctx->name, atomic_read(&ctx->guilty_count),
- score);
- i915_gem_context_set_banned(ctx);
- }
-
- if (!IS_ERR_OR_NULL(ctx->file_priv))
- i915_gem_client_mark_guilty(ctx->file_priv, ctx);
-}
-
-static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
-{
- atomic_inc(&ctx->active_count);
-}
-
/*
* Ensure irq handler finishes, and not run again.
* Also return the active request so that we only search for it once.
*/
-static struct i915_request *
-i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
+static void i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
{
- struct i915_request *request;
-
/*
* During the reset sequence, we must prevent the engine from
* entering RC6. As the context state is undefined until we restart
@@ -468,143 +410,19 @@ i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
* GPU state upon resume, i.e. fail to restart after a reset.
*/
intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
-
- request = engine->reset.prepare(engine);
- if (request && request->fence.error == -EIO)
- request = ERR_PTR(-EIO); /* Previous reset failed! */
-
- return request;
+ engine->reset.prepare(engine);
}
-static int i915_gem_reset_prepare(struct drm_i915_private *i915)
+static void i915_gem_reset_prepare(struct drm_i915_private *i915)
{
struct intel_engine_cs *engine;
- struct i915_request *request;
enum intel_engine_id id;
- int err = 0;
-
- for_each_engine(engine, i915, id) {
- request = i915_gem_reset_prepare_engine(engine);
- if (IS_ERR(request)) {
- err = PTR_ERR(request);
- continue;
- }
- engine->hangcheck.active_request = request;
- }
+ for_each_engine(engine, i915, id)
+ i915_gem_reset_prepare_engine(engine);
i915_gem_revoke_fences(i915);
intel_uc_sanitize(i915);
-
- return err;
-}
-
-static void engine_skip_context(struct i915_request *request)
-{
- struct intel_engine_cs *engine = request->engine;
- struct i915_gem_context *hung_ctx = request->gem_context;
- struct i915_timeline *timeline = request->timeline;
- unsigned long flags;
-
- GEM_BUG_ON(timeline == &engine->timeline);
-
- spin_lock_irqsave(&engine->timeline.lock, flags);
- spin_lock_nested(&timeline->lock, SINGLE_DEPTH_NESTING);
-
- list_for_each_entry_continue(request, &engine->timeline.requests, link)
- if (request->gem_context == hung_ctx)
- i915_request_skip(request);
-
- list_for_each_entry(request, &timeline->requests, link)
- i915_request_skip(request);
-
- spin_unlock(&timeline->lock);
- spin_unlock_irqrestore(&engine->timeline.lock, flags);
-}
-
-/* Returns the request if it was guilty of the hang */
-static struct i915_request *
-i915_gem_reset_request(struct intel_engine_cs *engine,
- struct i915_request *request,
- bool stalled)
-{
- /* The guilty request will get skipped on a hung engine.
- *
- * Users of client default contexts do not rely on logical
- * state preserved between batches so it is safe to execute
- * queued requests following the hang. Non default contexts
- * rely on preserved state, so skipping a batch loses the
- * evolution of the state and it needs to be considered corrupted.
- * Executing more queued batches on top of corrupted state is
- * risky. But we take the risk by trying to advance through
- * the queued requests in order to make the client behaviour
- * more predictable around resets, by not throwing away random
- * amount of batches it has prepared for execution. Sophisticated
- * clients can use gem_reset_stats_ioctl and dma fence status
- * (exported via sync_file info ioctl on explicit fences) to observe
- * when it loses the context state and should rebuild accordingly.
- *
- * The context ban, and ultimately the client ban, mechanism are safety
- * valves if client submission ends up resulting in nothing more than
- * subsequent hangs.
- */
-
- if (i915_request_completed(request)) {
- GEM_TRACE("%s pardoned global=%d (fence %llx:%d), current %d\n",
- engine->name, request->global_seqno,
- request->fence.context, request->fence.seqno,
- intel_engine_get_seqno(engine));
- stalled = false;
- }
-
- if (stalled) {
- i915_gem_context_mark_guilty(request->gem_context);
- i915_request_skip(request);
-
- /* If this context is now banned, skip all pending requests. */
- if (i915_gem_context_is_banned(request->gem_context))
- engine_skip_context(request);
- } else {
- /*
- * Since this is not the hung engine, it may have advanced
- * since the hang declaration. Double check by refinding
- * the active request at the time of the reset.
- */
- request = i915_gem_find_active_request(engine);
- if (request) {
- unsigned long flags;
-
- i915_gem_context_mark_innocent(request->gem_context);
- dma_fence_set_error(&request->fence, -EAGAIN);
-
- /* Rewind the engine to replay the incomplete rq */
- spin_lock_irqsave(&engine->timeline.lock, flags);
- request = list_prev_entry(request, link);
- if (&request->link == &engine->timeline.requests)
- request = NULL;
- spin_unlock_irqrestore(&engine->timeline.lock, flags);
- }
- }
-
- return request;
-}
-
-static void i915_gem_reset_engine(struct intel_engine_cs *engine,
- struct i915_request *request,
- bool stalled)
-{
- /*
- * Make sure this write is visible before we re-enable the interrupt
- * handlers on another CPU, as tasklet_enable() resolves to just
- * a compiler barrier which is insufficient for our purpose here.
- */
- smp_store_mb(engine->irq_posted, 0);
-
- if (request)
- request = i915_gem_reset_request(engine, request, stalled);
-
- /* Setup the CS to resume from the breadcrumb of the hung request */
- engine->reset.reset(engine, request);
}
static void i915_gem_reset(struct drm_i915_private *i915,
@@ -613,38 +431,8 @@ static void i915_gem_reset(struct drm_i915_private *i915,
struct intel_engine_cs *engine;
enum intel_engine_id id;
- lockdep_assert_held(&i915->drm.struct_mutex);
-
- i915_retire_requests(i915);
-
- for_each_engine(engine, i915, id) {
- struct intel_context *ce;
-
- i915_gem_reset_engine(engine,
- engine->hangcheck.active_request,
- stalled_mask & ENGINE_MASK(id));
- ce = fetch_and_zero(&engine->last_retired_context);
- if (ce)
- intel_context_unpin(ce);
-
- /*
- * Ostensibily, we always want a context loaded for powersaving,
- * so if the engine is idle after the reset, send a request
- * to load our scratch kernel_context.
- *
- * More mysteriously, if we leave the engine idle after a reset,
- * the next userspace batch may hang, with what appears to be
- * an incoherent read by the CS (presumably stale TLB). An
- * empty request appears sufficient to paper over the glitch.
- */
- if (intel_engine_is_idle(engine)) {
- struct i915_request *rq;
-
- rq = i915_request_alloc(engine, i915->kernel_context);
- if (!IS_ERR(rq))
- i915_request_add(rq);
- }
- }
+ for_each_engine(engine, i915, id)
+ intel_engine_reset(engine, stalled_mask & ENGINE_MASK(id));
i915_gem_restore_fences(i915);
}
@@ -652,7 +440,6 @@ static void i915_gem_reset(struct drm_i915_private *i915,
static void i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
{
engine->reset.finish(engine);
-
intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
}
@@ -661,12 +448,8 @@ static void i915_gem_reset_finish(struct drm_i915_private *i915)
struct intel_engine_cs *engine;
enum intel_engine_id id;
- lockdep_assert_held(&i915->drm.struct_mutex);
-
- for_each_engine(engine, i915, id) {
- engine->hangcheck.active_request = NULL;
+ for_each_engine(engine, i915, id)
i915_gem_reset_finish_engine(engine);
- }
}
static void nop_submit_request(struct i915_request *request)
@@ -777,7 +560,6 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
{
struct i915_timeline *tl;
- lockdep_assert_held(&i915->drm.struct_mutex);
if (!test_bit(I915_WEDGED, &i915->gpu_error.flags))
return true;
@@ -795,9 +577,9 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
*/
list_for_each_entry(tl, &i915->gt.timelines, link) {
struct i915_request *rq;
+ long timeout;
- rq = i915_gem_active_peek(&tl->last_request,
- &i915->drm.struct_mutex);
+ rq = i915_gem_active_get_unlocked(&tl->last_request);
if (!rq)
continue;
@@ -812,12 +594,12 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
* and when the seqno passes the fence, the signaler
* then signals the fence waking us up).
*/
- if (dma_fence_default_wait(&rq->fence, true,
- MAX_SCHEDULE_TIMEOUT) < 0)
+ timeout = dma_fence_default_wait(&rq->fence, true,
+ MAX_SCHEDULE_TIMEOUT);
+ i915_request_put(rq);
+ if (timeout < 0)
return false;
}
- i915_retire_requests(i915);
- GEM_BUG_ON(i915->gt.active_requests);
/*
* Undo nop_submit_request. We prevent all new i915 requests from
@@ -829,7 +611,6 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
* context and do not require stop_machine().
*/
intel_engines_reset_default_submission(i915);
- i915_gem_contexts_lost(i915);
GEM_TRACE("end\n");
@@ -869,26 +650,18 @@ void i915_reset(struct drm_i915_private *i915,
GEM_TRACE("flags=%lx\n", error->flags);
might_sleep();
- lockdep_assert_held(&i915->drm.struct_mutex);
GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
- if (!test_bit(I915_RESET_HANDOFF, &error->flags))
- return;
-
/* Clear any previous failed attempts at recovery. Time to try again. */
if (!i915_gem_unset_wedged(i915))
- goto wakeup;
+ return;
if (reason)
dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason);
error->reset_count++;
disable_irq(i915->drm.irq);
- ret = i915_gem_reset_prepare(i915);
- if (ret) {
- dev_err(i915->drm.dev, "GPU recovery failed\n");
- goto taint;
- }
+ i915_gem_reset_prepare(i915);
if (!intel_has_gpu_reset(i915)) {
if (i915_modparams.reset)
@@ -946,10 +719,6 @@ void i915_reset(struct drm_i915_private *i915,
finish:
i915_gem_reset_finish(i915);
enable_irq(i915->drm.irq);
-
-wakeup:
- clear_bit(I915_RESET_HANDOFF, &error->flags);
- wake_up_bit(&error->flags, I915_RESET_HANDOFF);
return;
taint:
@@ -968,7 +737,6 @@ void i915_reset(struct drm_i915_private *i915,
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
error:
i915_gem_set_wedged(i915);
- i915_retire_requests(i915);
goto finish;
}
@@ -994,18 +762,15 @@ static inline int intel_gt_reset_engine(struct drm_i915_private *i915,
int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
{
struct i915_gpu_error *error = &engine->i915->gpu_error;
- struct i915_request *active_request;
int ret;
GEM_TRACE("%s flags=%lx\n", engine->name, error->flags);
GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));
- active_request = i915_gem_reset_prepare_engine(engine);
- if (IS_ERR_OR_NULL(active_request)) {
- /* Either the previous reset failed, or we pardon the reset. */
- ret = PTR_ERR(active_request);
- goto out;
- }
+ if (intel_engine_is_idle(engine))
+ return 0;
+
+ i915_gem_reset_prepare_engine(engine);
if (msg)
dev_notice(engine->i915->drm.dev,
@@ -1029,7 +794,7 @@ int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
* active request and can drop it, adjust head to skip the offending
* request to resume executing remaining requests in the queue.
*/
- i915_gem_reset_engine(engine, active_request, true);
+ intel_engine_reset(engine, true);
/*
* The engine and its registers (and workarounds in case of render)
@@ -1105,29 +870,7 @@ static void i915_reset_device(struct drm_i915_private *i915,
i915_wedge_on_timeout(&w, i915, 5*HZ) {
intel_prepare_reset(i915);
- error->reason = reason;
- error->stalled_mask = engine_mask;
-
- /* Signal that locked waiters should reset the GPU */
- smp_mb__before_atomic();
- set_bit(I915_RESET_HANDOFF, &error->flags);
- wake_up_all(&error->wait_queue);
-
- /* Wait for anyone holding the lock to wakeup, without
- * blocking indefinitely on struct_mutex.
- */
- do {
- if (mutex_trylock(&i915->drm.struct_mutex)) {
- i915_reset(i915, engine_mask, reason);
- mutex_unlock(&i915->drm.struct_mutex);
- }
- } while (wait_on_bit_timeout(&error->flags,
- I915_RESET_HANDOFF,
- TASK_UNINTERRUPTIBLE,
- 1));
-
- error->stalled_mask = 0;
- error->reason = NULL;
+ i915_reset(i915, engine_mask, reason);
intel_finish_reset(i915);
}
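
Put together, the device reset collapses into a straight-line sequence
run by the error worker alone (serialised by I915_RESET_BACKOFF). A
condensed sketch of i915_reset() after this patch, with the hardware
reset step, which is unchanged here, and all error handling elided:

	void example_i915_reset(struct drm_i915_private *i915,
				unsigned int stalled_mask, const char *reason)
	{
		GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF,
				     &i915->gpu_error.flags));

		if (!i915_gem_unset_wedged(i915))
			return; /* a previous recovery failed; stay wedged */

		disable_irq(i915->drm.irq);
		i915_gem_reset_prepare(i915); /* forcewake, stop each engine */

		/* ... intel_gpu_reset() and reinit, unchanged by this patch ... */

		i915_gem_reset(i915, stalled_mask); /* replay/skip per engine */

		i915_gem_reset_finish(i915);
		enable_irq(i915->drm.irq);
	}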
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index c5a4a8f7cc49..7c0271609cb2 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -1086,10 +1086,8 @@ void intel_engines_sanitize(struct drm_i915_private *i915)
GEM_TRACE("\n");
- for_each_engine(engine, i915, id) {
- if (engine->reset.reset)
- engine->reset.reset(engine, NULL);
- }
+ for_each_engine(engine, i915, id)
+ intel_engine_reset(engine, false);
}
/**
diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
index 660c41ec71ab..df280e2085d9 100644
--- a/drivers/gpu/drm/i915/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/intel_guc_submission.c
@@ -806,8 +806,7 @@ static void guc_submission_tasklet(unsigned long data)
guc_dequeue(engine);
}
-static struct i915_request *
-guc_reset_prepare(struct intel_engine_cs *engine)
+static void guc_reset_prepare(struct intel_engine_cs *engine)
{
struct intel_engine_execlists * const execlists = &engine->execlists;
@@ -833,8 +832,6 @@ guc_reset_prepare(struct intel_engine_cs *engine)
*/
if (engine->i915->guc.preempt_wq)
flush_workqueue(engine->i915->guc.preempt_wq);
-
- return i915_gem_find_active_request(engine);
}
/*
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 98c922808e3e..2f3bb270f66e 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -361,9 +361,10 @@ static void unwind_wa_tail(struct i915_request *rq)
assert_ring_tail_valid(rq->ring, rq->tail);
}
-static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
+static struct i915_request *
+__unwind_incomplete_requests(struct intel_engine_cs *engine)
{
- struct i915_request *rq, *rn;
+ struct i915_request *rq, *rn, *active = NULL;
struct list_head *uninitialized_var(pl);
int last_prio = I915_PRIORITY_INVALID;
@@ -373,7 +374,7 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
&engine->timeline.requests,
link) {
if (i915_request_completed(rq))
- return;
+ break;
__i915_request_unsubmit(rq);
unwind_wa_tail(rq);
@@ -385,7 +386,11 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
}
list_add(&rq->sched.link, pl);
+
+ active = rq;
}
+
+ return active;
}
void
@@ -1898,12 +1903,9 @@ static int gen9_init_render_ring(struct intel_engine_cs *engine)
return 0;
}
-static struct i915_request *
-execlists_reset_prepare(struct intel_engine_cs *engine)
+static void execlists_reset_prepare(struct intel_engine_cs *engine)
{
struct intel_engine_execlists * const execlists = &engine->execlists;
- struct i915_request *request, *active;
- unsigned long flags;
GEM_TRACE("%s\n", engine->name);
@@ -1917,46 +1919,6 @@ execlists_reset_prepare(struct intel_engine_cs *engine)
* prevents the race.
*/
__tasklet_disable_sync_once(&execlists->tasklet);
-
- /*
- * We want to flush the pending context switches, having disabled
- * the tasklet above, we can assume exclusive access to the execlists.
- * For this allows us to catch up with an inflight preemption event,
- * and avoid blaming an innocent request if the stall was due to the
- * preemption itself.
- */
- spin_lock_irqsave(&engine->timeline.lock, flags);
-
- process_csb(engine);
-
- /*
- * The last active request can then be no later than the last request
- * now in ELSP[0]. So search backwards from there, so that if the GPU
- * has advanced beyond the last CSB update, it will be pardoned.
- */
- active = NULL;
- request = port_request(execlists->port);
- if (request) {
- /*
- * Prevent the breadcrumb from advancing before we decide
- * which request is currently active.
- */
- intel_engine_stop_cs(engine);
-
- list_for_each_entry_from_reverse(request,
- &engine->timeline.requests,
- link) {
- if (__i915_request_completed(request,
- request->global_seqno))
- break;
-
- active = request;
- }
- }
-
- spin_unlock_irqrestore(&engine->timeline.lock, flags);
-
- return active;
}
static void reset_csb_pointers(struct intel_engine_execlists *execlists)
@@ -1974,17 +1936,13 @@ static void reset_csb_pointers(struct intel_engine_execlists *execlists)
WRITE_ONCE(*execlists->csb_write, (GEN8_CSB_ENTRIES - 1) | 0xff << 16);
}
-static void execlists_reset(struct intel_engine_cs *engine,
- struct i915_request *request)
+static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
{
struct intel_engine_execlists * const execlists = &engine->execlists;
+ struct i915_request *rq;
unsigned long flags;
u32 *regs;
- GEM_TRACE("%s request global=%x, current=%d\n",
- engine->name, request ? request->global_seqno : 0,
- intel_engine_get_seqno(engine));
-
synchronize_hardirq(engine->i915->drm.irq);
spin_lock_irqsave(&engine->timeline.lock, flags);
@@ -2001,13 +1959,19 @@ static void execlists_reset(struct intel_engine_cs *engine,
reset_irq(engine);
/* Push back any incomplete requests for replay after the reset. */
- __unwind_incomplete_requests(engine);
+ rq = __unwind_incomplete_requests(engine);
/* Following the reset, we need to reload the CSB read/write pointers */
reset_csb_pointers(&engine->execlists);
spin_unlock_irqrestore(&engine->timeline.lock, flags);
+ GEM_TRACE("%s request global=%x, current=%d\n",
+ engine->name, rq ? rq->global_seqno : 0,
+ intel_engine_get_seqno(engine));
+ if (!rq)
+ return;
+
/*
* If the request was innocent, we leave the request in the ELSP
* and will try to replay it on restarting. The context image may
@@ -2019,8 +1983,10 @@ static void execlists_reset(struct intel_engine_cs *engine,
* and have to at least restore the RING register in the context
* image back to the expected values to skip over the guilty request.
*/
- if (!request || request->fence.error != -EIO)
+ if (!stalled) {
+ i915_request_innocent(rq);
return;
+ }
/*
* We want a simple context + ring to execute the breadcrumb update.
@@ -2030,25 +1996,27 @@ static void execlists_reset(struct intel_engine_cs *engine,
* future request will be after userspace has had the opportunity
* to recreate its own state.
*/
- regs = request->hw_context->lrc_reg_state;
- if (engine->pinned_default_state) {
+ regs = rq->hw_context->lrc_reg_state;
+
+ if (engine->pinned_default_state)
memcpy(regs, /* skip restoring the vanilla PPHWSP */
engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
engine->context_size - PAGE_SIZE);
- }
- execlists_init_reg_state(regs,
- request->gem_context, engine, request->ring);
+
+ execlists_init_reg_state(regs, rq->gem_context, engine, rq->ring);
/* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
- regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(request->ring->vma);
+ regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(rq->ring->vma);
- request->ring->head = intel_ring_wrap(request->ring, request->postfix);
- regs[CTX_RING_HEAD + 1] = request->ring->head;
+ rq->ring->head = intel_ring_wrap(rq->ring, rq->postfix);
+ regs[CTX_RING_HEAD + 1] = rq->ring->head;
- intel_ring_update_space(request->ring);
+ intel_ring_update_space(rq->ring);
/* Reset WaIdleLiteRestore:bdw,skl as well */
- unwind_wa_tail(request);
+ unwind_wa_tail(rq);
+
+ i915_request_guilty(rq);
}
static void execlists_reset_finish(struct intel_engine_cs *engine)
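
The net effect on the execlists backend: reset_prepare() only parks
the tasklet, and the active request is recovered during the reset
itself as the return value of __unwind_incomplete_requests(), i.e. the
oldest request that had not completed. A shortened sketch of the
decision, using names from the hunks above:

	rq = __unwind_incomplete_requests(engine); /* oldest incomplete */
	if (!rq)
		return; /* nothing was executing */

	if (!stalled) {
		/* Collateral damage: leave rq in the ELSP for replay. */
		i915_request_innocent(rq);
		return;
	}

	/* Guilty: rebuild the context image from the default state and
	 * point RING_HEAD at the breadcrumb, past the hanging batch. */
	i915_request_guilty(rq);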
diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c
index c2f10d899329..371eb3dbedc0 100644
--- a/drivers/gpu/drm/i915/intel_overlay.c
+++ b/drivers/gpu/drm/i915/intel_overlay.c
@@ -501,8 +501,6 @@ void intel_overlay_reset(struct drm_i915_private *dev_priv)
if (!overlay)
return;
- intel_overlay_release_old_vid(overlay);
-
overlay->old_xscale = 0;
overlay->old_yscale = 0;
overlay->crtc = NULL;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index e0448eff12bd..79382b5a0055 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -535,14 +535,12 @@ static int init_ring_common(struct intel_engine_cs *engine)
return ret;
}
-static struct i915_request *reset_prepare(struct intel_engine_cs *engine)
+static void reset_prepare(struct intel_engine_cs *engine)
{
intel_engine_stop_cs(engine);
if (engine->irq_seqno_barrier)
engine->irq_seqno_barrier(engine);
-
- return i915_gem_find_active_request(engine);
}
static void skip_request(struct i915_request *rq)
@@ -559,29 +557,82 @@ static void skip_request(struct i915_request *rq)
memset32(vaddr + head, MI_NOOP, (rq->postfix - head) / sizeof(u32));
}
-static void reset_ring(struct intel_engine_cs *engine, struct i915_request *rq)
+static void reset_ring(struct intel_engine_cs *engine, bool stalled)
{
- GEM_TRACE("%s seqno=%x\n", engine->name, rq ? rq->global_seqno : 0);
+ struct i915_timeline *tl = &engine->timeline;
+ struct i915_request *pos, *rq;
+ unsigned long flags;
+
+ rq = NULL;
+ spin_lock_irqsave(&tl->lock, flags);
+ list_for_each_entry(pos, &tl->requests, link) {
+ if (!__i915_request_completed(pos, pos->global_seqno)) {
+ rq = pos;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&tl->lock, flags);
+
+ GEM_TRACE("%s seqno=%x, stalled? %s\n",
+ engine->name,
+ rq ? rq->global_seqno : 0,
+ yesno(stalled));
/*
- * Try to restore the logical GPU state to match the continuation
- * of the request queue. If we skip the context/PD restore, then
- * the next request may try to execute assuming that its context
- * is valid and loaded on the GPU and so may try to access invalid
- * memory, prompting repeated GPU hangs.
+ * Make sure this write is visible before we re-enable the interrupt
+ * handlers on another CPU, as tasklet_enable() resolves to just
+ * a compiler barrier which is insufficient for our purpose here.
+ */
+ smp_store_mb(engine->irq_posted, 0);
+
+ /*
+ * The guilty request will get skipped on a hung engine.
*
- * If the request was guilty, we still restore the logical state
- * in case the next request requires it (e.g. the aliasing ppgtt),
- * but skip over the hung batch.
+ * Users of client default contexts do not rely on logical
+ * state preserved between batches so it is safe to execute
+ * queued requests following the hang. Non-default contexts
+ * rely on preserved state, so skipping a batch loses the
+ * evolution of the state and it needs to be considered corrupted.
+ * Executing more queued batches on top of corrupted state is
+ * risky. But we take the risk by trying to advance through
+ * the queued requests in order to make the client behaviour
+ * more predictable around resets, by not throwing away a random
+ * number of batches it has prepared for execution. Sophisticated
+ * clients can use gem_reset_stats_ioctl and dma fence status
+ * (exported via sync_file info ioctl on explicit fences) to observe
+ * when it loses the context state and should rebuild accordingly.
*
- * If the request was innocent, we try to replay the request with
- * the restored context.
+ * The context ban, and ultimately the client ban, mechanism are safety
+ * valves if client submission ends up resulting in nothing more than
+ * subsequent hangs.
*/
+
if (rq) {
+ /*
+ * Try to restore the logical GPU state to match the
+ * continuation of the request queue. If we skip the
+ * context/PD restore, then the next request may try to execute
+ * assuming that its context is valid and loaded on the GPU and
+ * so may try to access invalid memory, prompting repeated GPU
+ * hangs.
+ *
+ * If the request was guilty, we still restore the logical
+ * state in case the next request requires it (e.g. the
+ * aliasing ppgtt), but skip over the hung batch.
+ *
+ * If the request was innocent, we try to replay the request
+ * with the restored context.
+ */
+
+ if (stalled) {
+ i915_request_guilty(rq);
+ skip_request(rq);
+ } else {
+ i915_request_innocent(rq);
+ }
+
/* If the rq hung, jump to its breadcrumb and skip the batch */
rq->ring->head = intel_ring_wrap(rq->ring, rq->head);
- if (rq->fence.error == -EIO)
- skip_request(rq);
}
}
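
The jump back to the breadcrumb relies on intel_ring_wrap(). A
hypothetical illustration of the arithmetic assumed here, namely that
ring sizes are powers of two so the wrap reduces to a mask:

	static inline u32 example_ring_wrap(const struct intel_ring *ring,
					    u32 pos)
	{
		/* e.g. size 4096: pos 4100 wraps to 4, pos 100 stays 100 */
		return pos & (ring->size - 1);
	}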
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index f00ebd50b49b..889e583043c5 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -121,7 +121,6 @@ struct intel_engine_hangcheck {
unsigned long action_timestamp;
int deadlock;
struct intel_instdone instdone;
- struct i915_request *active_request;
bool stalled:1;
bool wedged:1;
};
@@ -443,9 +442,8 @@ struct intel_engine_cs {
int (*init_hw)(struct intel_engine_cs *engine);
struct {
- struct i915_request *(*prepare)(struct intel_engine_cs *engine);
- void (*reset)(struct intel_engine_cs *engine,
- struct i915_request *rq);
+ void (*prepare)(struct intel_engine_cs *engine);
+ void (*reset)(struct intel_engine_cs *engine, bool stalled);
void (*finish)(struct intel_engine_cs *engine);
} reset;
@@ -1069,6 +1067,13 @@ gen8_emit_ggtt_write(u32 *cs, u32 value, u32 gtt_offset)
return cs;
}
+static inline void intel_engine_reset(struct intel_engine_cs *engine,
+ bool stalled)
+{
+ if (engine->reset.reset)
+ engine->reset.reset(engine, stalled);
+}
+
void intel_engines_sanitize(struct drm_i915_private *i915);
static inline bool
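
The intel_engine_reset() wrapper exists so callers need not check for
a missing reset vfunc themselves. The two call sites added earlier in
this patch use it as follows:

	/* intel_engines_sanitize(): scrub stale HW state at load/resume */
	for_each_engine(engine, i915, id)
		intel_engine_reset(engine, false);

	/* i915_gem_reset(): replay or skip on each engine after a reset */
	for_each_engine(engine, i915, id)
		intel_engine_reset(engine, stalled_mask & ENGINE_MASK(id));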
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index e11df2743704..6905cc470489 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -390,7 +390,6 @@ static int igt_global_reset(void *arg)
/* Check that we can issue a global GPU reset */
global_reset_lock(i915);
- set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
mutex_lock(&i915->drm.struct_mutex);
reset_count = i915_reset_count(&i915->gpu_error);
@@ -403,7 +402,6 @@ static int igt_global_reset(void *arg)
}
mutex_unlock(&i915->drm.struct_mutex);
- GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
global_reset_unlock(i915);
if (i915_terminally_wedged(&i915->gpu_error))
@@ -905,20 +903,13 @@ static int igt_reset_engines(void *arg)
return 0;
}
-static u32 fake_hangcheck(struct i915_request *rq, u32 mask)
+static u32 fake_hangcheck(struct drm_i915_private *i915, u32 mask)
{
- struct i915_gpu_error *error = &rq->i915->gpu_error;
- u32 reset_count = i915_reset_count(error);
+ u32 count = i915_reset_count(&i915->gpu_error);
- error->stalled_mask = mask;
+ i915_reset(i915, mask, NULL);
- /* set_bit() must be after we have setup the backchannel (mask) */
- smp_mb__before_atomic();
- set_bit(I915_RESET_HANDOFF, &error->flags);
-
- wake_up_all(&error->wait_queue);
-
- return reset_count;
+ return count;
}
static int igt_wait_reset(void *arg)
@@ -964,7 +955,7 @@ static int igt_wait_reset(void *arg)
goto out_rq;
}
- reset_count = fake_hangcheck(rq, ALL_ENGINES);
+ reset_count = fake_hangcheck(i915, ALL_ENGINES);
timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
if (timeout < 0) {
@@ -974,7 +965,6 @@ static int igt_wait_reset(void *arg)
goto out_rq;
}
- GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
if (i915_reset_count(&i915->gpu_error) == reset_count) {
pr_err("No GPU reset recorded!\n");
err = -EINVAL;
@@ -1100,12 +1090,7 @@ static int igt_reset_queue(void *arg)
goto fini;
}
- reset_count = fake_hangcheck(prev, ENGINE_MASK(id));
-
- i915_reset(i915, ENGINE_MASK(id), NULL);
-
- GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
- &i915->gpu_error.flags));
+ reset_count = fake_hangcheck(i915, ENGINE_MASK(id));
if (prev->fence.error != -EIO) {
pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
--
2.18.0.rc2