[Intel-gfx] [PATCH 1/6] drm/i915: Split up hangcheck phases
Mika Kuoppala
mika.kuoppala at linux.intel.com
Wed Nov 16 15:20:29 UTC 2016
In order to simplify hangcheck state keeping, split the per-engine
hangcheck loop into three phases: state load, action, state save.
Add a few more hangcheck actions to distinguish between seqno, head
and subunit movement. This helps to gather all the hangcheck actions
under a single switch umbrella.
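After the split, the per-engine loop in i915_hangcheck_elapsed()
reduces to the following shape (a condensed sketch of the code added
below, not the literal diff):

	struct intel_engine_hangcheck hc;

	hangcheck_load_sample(engine, &hc);       /* 1: sample seqno/head */
	hangcheck_accumulate_sample(engine, &hc); /* 2: pick action, adjust score */
	hangcheck_store_sample(engine, &hc);      /* 3: save state for next round */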
Cc: Chris Wilson <chris at chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala at intel.com>
Reviewed-by: Chris Wilson <chris at chris-wilson.co.uk>
---
drivers/gpu/drm/i915/i915_gpu_error.c | 8 +-
drivers/gpu/drm/i915/intel_hangcheck.c | 241 ++++++++++++++++++--------------
drivers/gpu/drm/i915/intel_ringbuffer.h | 4 +-
3 files changed, 146 insertions(+), 107 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 5d620bd..f02f581 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -323,8 +323,12 @@ static const char *hangcheck_action_to_str(enum intel_engine_hangcheck_action a)
return "idle";
case HANGCHECK_WAIT:
return "wait";
- case HANGCHECK_ACTIVE:
- return "active";
+ case HANGCHECK_ACTIVE_SEQNO:
+ return "active seqno";
+ case HANGCHECK_ACTIVE_HEAD:
+ return "active head";
+ case HANGCHECK_ACTIVE_SUBUNITS:
+ return "active subunits";
case HANGCHECK_KICK:
return "kick";
case HANGCHECK_HUNG:
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 53df5b1..3d2e81c 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -236,11 +236,11 @@ head_stuck(struct intel_engine_cs *engine, u64 acthd)
memset(&engine->hangcheck.instdone, 0,
sizeof(engine->hangcheck.instdone));
- return HANGCHECK_ACTIVE;
+ return HANGCHECK_ACTIVE_HEAD;
}
if (!subunits_stuck(engine))
- return HANGCHECK_ACTIVE;
+ return HANGCHECK_ACTIVE_SUBUNITS;
return HANGCHECK_HUNG;
}
@@ -291,6 +291,129 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
return HANGCHECK_HUNG;
}
+static void hangcheck_load_sample(struct intel_engine_cs *engine,
+ struct intel_engine_hangcheck *hc)
+{
+ /* We don't strictly need an irq-barrier here, as we are not
+ * serving an interrupt request, but be paranoid in case the
+ * barrier has side-effects (such as preventing a broken
+ * cacheline snoop) and so be sure that we can see the seqno
+ * advance. If the seqno should stick, due to a stale
+ * cacheline, we would erroneously declare the GPU hung.
+ */
+ if (engine->irq_seqno_barrier)
+ engine->irq_seqno_barrier(engine);
+
+ hc->acthd = intel_engine_get_active_head(engine);
+ hc->seqno = intel_engine_get_seqno(engine);
+ hc->score = engine->hangcheck.score;
+}
+
+static void hangcheck_store_sample(struct intel_engine_cs *engine,
+ const struct intel_engine_hangcheck *hc)
+{
+ engine->hangcheck.acthd = hc->acthd;
+ engine->hangcheck.seqno = hc->seqno;
+ engine->hangcheck.score = hc->score;
+ engine->hangcheck.action = hc->action;
+}
+
+static enum intel_engine_hangcheck_action
+hangcheck_get_action(struct intel_engine_cs *engine,
+ const struct intel_engine_hangcheck *hc)
+{
+ if (engine->hangcheck.seqno != hc->seqno)
+ return HANGCHECK_ACTIVE_SEQNO;
+
+ if (i915_seqno_passed(hc->seqno, intel_engine_last_submit(engine)))
+ return HANGCHECK_IDLE;
+
+ return engine_stuck(engine, hc->acthd);
+}
+
+static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
+ struct intel_engine_hangcheck *hc)
+{
+ hc->action = hangcheck_get_action(engine, hc);
+
+ switch (hc->action) {
+ case HANGCHECK_IDLE:
+ case HANGCHECK_WAIT:
+ break;
+
+ case HANGCHECK_ACTIVE_HEAD:
+ case HANGCHECK_ACTIVE_SUBUNITS:
+ /* We always increment the hangcheck score
+ * if the engine is busy and still processing
+ * the same request, so that no single request
+ * can run indefinitely (such as a chain of
+ * batches). The only time we do not increment
+ * the hangcheck score on this ring is if this
+ * engine is in a legitimate wait for another
+ * engine. In that case the waiting engine is a
+ * victim and we want to be sure we catch the
+ * right culprit. Then every time we do kick
+ * the ring, add a small increment to the
+ * score so that we can catch a batch that is
+ * being repeatedly kicked and so responsible
+ * for stalling the machine.
+ */
+ hc->score += 1;
+ break;
+
+ case HANGCHECK_KICK:
+ hc->score += 5;
+ break;
+
+ case HANGCHECK_HUNG:
+ hc->score += 20;
+ break;
+
+ case HANGCHECK_ACTIVE_SEQNO:
+ /* Gradually reduce the count so that we catch DoS
+ * attempts across multiple batches.
+ */
+ if (hc->score > 0)
+ hc->score -= 15;
+ if (hc->score < 0)
+ hc->score = 0;
+
+ /* Clear head and subunit states on seqno movement */
+ hc->acthd = 0;
+
+ memset(&engine->hangcheck.instdone, 0,
+ sizeof(engine->hangcheck.instdone));
+ break;
+
+ default:
+ MISSING_CASE(hc->action);
+ }
+}
+
+static void hangcheck_declare_hang(struct drm_i915_private *i915,
+ unsigned int hung,
+ unsigned int stuck)
+{
+ struct intel_engine_cs *engine;
+ char msg[80];
+ unsigned int tmp;
+ int len;
+
+ /* If some rings hung but others were still busy, only
+ * blame the hanging rings in the synopsis.
+ */
+ if (stuck != hung)
+ hung &= ~stuck;
+ len = scnprintf(msg, sizeof(msg),
+ "%s on ", stuck == hung ? "No progress" : "Hang");
+ for_each_engine_masked(engine, i915, hung, tmp)
+ len += scnprintf(msg + len, sizeof(msg) - len,
+ "%s, ", engine->name);
+ msg[len-2] = '\0';
+
+ return i915_handle_error(i915, hung, msg);
+}
+
/*
* This is called when the chip hasn't reported back with completed
* batchbuffers in a long time. We keep track per ring seqno progress and
@@ -308,10 +431,6 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
enum intel_engine_id id;
unsigned int hung = 0, stuck = 0;
int busy_count = 0;
-#define BUSY 1
-#define KICK 5
-#define HUNG 20
-#define ACTIVE_DECAY 15
if (!i915.enable_hangcheck)
return;
@@ -326,112 +445,26 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
intel_uncore_arm_unclaimed_mmio_detection(dev_priv);
for_each_engine(engine, dev_priv, id) {
- bool busy = intel_engine_has_waiter(engine);
- u64 acthd;
- u32 seqno;
- u32 submit;
+ struct intel_engine_hangcheck cur_state, *hc = &cur_state;
+ const bool busy = intel_engine_has_waiter(engine);
semaphore_clear_deadlocks(dev_priv);
- /* We don't strictly need an irq-barrier here, as we are not
- * serving an interrupt request, be paranoid in case the
- * barrier has side-effects (such as preventing a broken
- * cacheline snoop) and so be sure that we can see the seqno
- * advance. If the seqno should stick, due to a stale
- * cacheline, we would erroneously declare the GPU hung.
- */
- if (engine->irq_seqno_barrier)
- engine->irq_seqno_barrier(engine);
-
- acthd = intel_engine_get_active_head(engine);
- seqno = intel_engine_get_seqno(engine);
- submit = intel_engine_last_submit(engine);
-
- if (engine->hangcheck.seqno == seqno) {
- if (i915_seqno_passed(seqno, submit)) {
- engine->hangcheck.action = HANGCHECK_IDLE;
- } else {
- /* We always increment the hangcheck score
- * if the engine is busy and still processing
- * the same request, so that no single request
- * can run indefinitely (such as a chain of
- * batches). The only time we do not increment
- * the hangcheck score on this ring, if this
- * engine is in a legitimate wait for another
- * engine. In that case the waiting engine is a
- * victim and we want to be sure we catch the
- * right culprit. Then every time we do kick
- * the ring, add a small increment to the
- * score so that we can catch a batch that is
- * being repeatedly kicked and so responsible
- * for stalling the machine.
- */
- engine->hangcheck.action =
- engine_stuck(engine, acthd);
-
- switch (engine->hangcheck.action) {
- case HANGCHECK_IDLE:
- case HANGCHECK_WAIT:
- break;
- case HANGCHECK_ACTIVE:
- engine->hangcheck.score += BUSY;
- break;
- case HANGCHECK_KICK:
- engine->hangcheck.score += KICK;
- break;
- case HANGCHECK_HUNG:
- engine->hangcheck.score += HUNG;
- break;
- }
- }
-
- if (engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) {
- hung |= intel_engine_flag(engine);
- if (engine->hangcheck.action != HANGCHECK_HUNG)
- stuck |= intel_engine_flag(engine);
- }
- } else {
- engine->hangcheck.action = HANGCHECK_ACTIVE;
-
- /* Gradually reduce the count so that we catch DoS
- * attempts across multiple batches.
- */
- if (engine->hangcheck.score > 0)
- engine->hangcheck.score -= ACTIVE_DECAY;
- if (engine->hangcheck.score < 0)
- engine->hangcheck.score = 0;
-
- /* Clear head and subunit states on seqno movement */
- acthd = 0;
-
- memset(&engine->hangcheck.instdone, 0,
- sizeof(engine->hangcheck.instdone));
+ hangcheck_load_sample(engine, hc);
+ hangcheck_accumulate_sample(engine, hc);
+ hangcheck_store_sample(engine, hc);
+
+ if (hc->score >= HANGCHECK_SCORE_RING_HUNG) {
+ hung |= intel_engine_flag(engine);
+ if (hc->action != HANGCHECK_HUNG)
+ stuck |= intel_engine_flag(engine);
}
- engine->hangcheck.seqno = seqno;
- engine->hangcheck.acthd = acthd;
busy_count += busy;
}
- if (hung) {
- char msg[80];
- unsigned int tmp;
- int len;
-
- /* If some rings hung but others were still busy, only
- * blame the hanging rings in the synopsis.
- */
- if (stuck != hung)
- hung &= ~stuck;
- len = scnprintf(msg, sizeof(msg),
- "%s on ", stuck == hung ? "No progress" : "Hang");
- for_each_engine_masked(engine, dev_priv, hung, tmp)
- len += scnprintf(msg + len, sizeof(msg) - len,
- "%s, ", engine->name);
- msg[len-2] = '\0';
-
- return i915_handle_error(dev_priv, hung, msg);
- }
+ if (hung)
+ hangcheck_declare_hang(dev_priv, hung, stuck);
/* Reset timer in case GPU hangs without another request being added */
if (busy_count)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 3466b4e..3152b2b 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -67,7 +67,9 @@ struct intel_hw_status_page {
enum intel_engine_hangcheck_action {
HANGCHECK_IDLE = 0,
HANGCHECK_WAIT,
- HANGCHECK_ACTIVE,
+ HANGCHECK_ACTIVE_SEQNO,
+ HANGCHECK_ACTIVE_HEAD,
+ HANGCHECK_ACTIVE_SUBUNITS,
HANGCHECK_KICK,
HANGCHECK_HUNG,
};
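As a sanity check on the rescored actions, the accumulation rules in
hangcheck_accumulate_sample() can be modelled in user space. This is a
hypothetical stand-alone sketch, not kernel code; the
HANGCHECK_SCORE_RING_HUNG threshold of 31 is an assumption taken from
the intel_ringbuffer.h of this era and is not part of this diff:

	#include <stdio.h>

	enum action { ACTIVE_SEQNO, ACTIVE_HEAD, ACTIVE_SUBUNITS, KICK, HUNG };

	static int accumulate(int score, enum action a)
	{
		switch (a) {
		case ACTIVE_HEAD:
		case ACTIVE_SUBUNITS:
			return score + 1;  /* busy on the same request */
		case KICK:
			return score + 5;  /* engine needed a kick to unwedge */
		case HUNG:
			return score + 20; /* no head or subunit movement */
		case ACTIVE_SEQNO:
			/* seqno advanced: decay the score towards zero */
			return score > 15 ? score - 15 : 0;
		}
		return score;
	}

	int main(void)
	{
		int score = 0, ticks = 0;

		/* A genuinely hung engine (+20 per tick) crosses the
		 * assumed threshold of 31 on the second hangcheck tick. */
		while (score < 31) {
			score = accumulate(score, HUNG);
			ticks++;
		}
		printf("declared hung after %d ticks, score %d\n", ticks, score);
		return 0;
	}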
--
2.7.4