[Intel-gfx] [PATCH 2/6] drm/i915: Decouple hang detection from hangcheck period

Wed Nov 16 15:20:30 UTC 2016

Hangcheck state accumulation has gained more steps
along the years, like head movement and more recently the
subunit inactivity check. As the subunit sampling is only
done if the previous state check showed inactivity, we
have added more stages (and time) to reach a hang verdict.

Asymmetric engine states led to different actual weight of
'one hangcheck unit' and it was demonstrated in some
hangs that due to difference in stages, simpler engines
were accused falsely of a hang as their scoring was much
more quicker to accumulate above the hang treshold.

To completely decouple the hangcheck guilty score
from the hangcheck period, convert hangcheck score to a
rough period of inactivity measurement. As these are
tracked as jiffies, they are meaningful also across
reset boundaries. This makes finding a guilty engine
more accurate across multi engine activity scenarios,
especially across asymmetric engines.

We lose the ability to detect cross batch malicious attempts
to hinder the progress. Plan is to move this functionality
to be part of context banning which is more natural fit,
later in the series.

v2: use time_before macros (Chris)
    reinstate the pardoning of moving engine after hc (Chris)
v3: avoid global state for per engine stall detection (Chris)
v4: take timeline last retirement into account (Chris)

Cc: Chris Wilson <chris at chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala at intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c      |  17 +++--
 drivers/gpu/drm/i915/i915_drv.h          |   6 +-
 drivers/gpu/drm/i915/i915_gem.c          |  11 ++-
 drivers/gpu/drm/i915/i915_gem_timeline.c |  12 +++-
 drivers/gpu/drm/i915/i915_gem_timeline.h |   1 +
 drivers/gpu/drm/i915/i915_gpu_error.c    |  46 ++++---------
 drivers/gpu/drm/i915/intel_hangcheck.c   | 115 ++++++++++++++++---------------
 drivers/gpu/drm/i915/intel_ringbuffer.h  |  40 ++++++++---
 8 files changed, 139 insertions(+), 109 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 1cc971c..5ef5c1a 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1351,10 +1351,12 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 		seq_printf(m, "\tseqno = %x [current %x, last %x]\n",
 			   engine->hangcheck.seqno, seqno[id],
 			   intel_engine_last_submit(engine));
-		seq_printf(m, "\twaiters? %s, fake irq active? %s\n",
+		seq_printf(m, "\twaiters? %s, fake irq active? %s, stall? %s\n",
 			   yesno(intel_engine_has_waiter(engine)),
 			   yesno(test_bit(engine->id,
-					  &dev_priv->gpu_error.missed_irq_rings)));
+					  &dev_priv->gpu_error.missed_irq_rings)),
+			   yesno(engine->hangcheck.stall));
+
 		spin_lock_irq(&b->lock);
 		for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
 			struct intel_wait *w = container_of(rb, typeof(*w), node);
@@ -1367,8 +1369,11 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 		seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
 			   (long long)engine->hangcheck.acthd,
 			   (long long)acthd[id]);
-		seq_printf(m, "\tscore = %d\n", engine->hangcheck.score);
-		seq_printf(m, "\taction = %d\n", engine->hangcheck.action);
+		seq_printf(m, "\taction = %s(%d) %d ms ago\n",
+			   hangcheck_action_to_str(engine->hangcheck.action),
+			   engine->hangcheck.action,
+			   jiffies_to_msecs(jiffies -
+					    engine->hangcheck.action_timestamp));
 
 		if (engine->id == RCS) {
 			seq_puts(m, "\tinstdone read =\n");
@@ -3163,11 +3168,11 @@ static int i915_engine_info(struct seq_file *m, void *unused)
 		u64 addr;
 
 		seq_printf(m, "%s\n", engine->name);
-		seq_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x [score %d]\n",
+		seq_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x [%d ms]\n",
 			   intel_engine_get_seqno(engine),
 			   intel_engine_last_submit(engine),
 			   engine->hangcheck.seqno,
-			   engine->hangcheck.score);
+			   jiffies_to_msecs(jiffies - engine->hangcheck.action_timestamp));
 
 		rcu_read_lock();
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 5192206..4562a39 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -795,7 +795,8 @@ struct drm_i915_error_state {
 		/* Software tracked state */
 		bool waiting;
 		int num_waiters;
-		int hangcheck_score;
+		unsigned long hangcheck_timestamp;
+		bool hangcheck_stall;
 		enum intel_engine_hangcheck_action hangcheck_action;
 		struct i915_address_space *vm;
 		int num_requests;
@@ -1441,6 +1442,9 @@ struct i915_error_state_file_priv {
 #define I915_RESET_TIMEOUT (10 * HZ) /* 10s */
 #define I915_FENCE_TIMEOUT (10 * HZ) /* 10s */
 
+#define I915_ENGINE_DEAD_TIMEOUT  (4 * HZ)  /* No observed seqno/head/subunit progress */
+#define I915_SEQNO_DEAD_TIMEOUT   (12 * HZ) /* Engine progress without head progress */
+
 struct i915_gpu_error {
 	/* For hangcheck timer */
 #define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 3fb5e66..ae2a219 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2703,9 +2703,16 @@ static void i915_gem_reset_engine(struct intel_engine_cs *engine)
 	if (!request)
 		return;
 
-	ring_hung = engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG;
-	if (engine->hangcheck.seqno != intel_engine_get_seqno(engine))
+	ring_hung = engine->hangcheck.stall;
+	if (engine->hangcheck.seqno != intel_engine_get_seqno(engine)) {
+		if (ring_hung)
+			DRM_ERROR("%s pardoned due to progress after hangcheck %x vs %x\n",
+				  engine->name,
+				  engine->hangcheck.seqno,
+				  intel_engine_get_seqno(engine));
+
 		ring_hung = false;
+	}
 
 	i915_set_reset_status(request->ctx, ring_hung);
 	if (!ring_hung)
diff --git a/drivers/gpu/drm/i915/i915_gem_timeline.c b/drivers/gpu/drm/i915/i915_gem_timeline.c
index bf8a471..348b0f2 100644
--- a/drivers/gpu/drm/i915/i915_gem_timeline.c
+++ b/drivers/gpu/drm/i915/i915_gem_timeline.c
@@ -24,6 +24,15 @@
 
 #include "i915_drv.h"
 
+static void i915_gem_timeline_retire(struct i915_gem_active *active,
+				     struct drm_i915_gem_request *request)
+{
+	struct intel_timeline *tl =
+		container_of(active, typeof(*tl), last_request);
+
+	tl->last_retire_timestamp = jiffies;
+}
+
 static int __i915_gem_timeline_init(struct drm_i915_private *i915,
 				    struct i915_gem_timeline *timeline,
 				    const char *name,
@@ -54,7 +63,8 @@ static int __i915_gem_timeline_init(struct drm_i915_private *i915,
 #else
 		spin_lock_init(&tl->lock);
 #endif
-		init_request_active(&tl->last_request, NULL);
+		init_request_active(&tl->last_request,
+				    i915_gem_timeline_retire);
 		INIT_LIST_HEAD(&tl->requests);
 	}
 
diff --git a/drivers/gpu/drm/i915/i915_gem_timeline.h b/drivers/gpu/drm/i915/i915_gem_timeline.h
index 98d99a6..d4eea88 100644
--- a/drivers/gpu/drm/i915/i915_gem_timeline.h
+++ b/drivers/gpu/drm/i915/i915_gem_timeline.h
@@ -34,6 +34,7 @@ struct i915_gem_timeline;
 struct intel_timeline {
 	u64 fence_context;
 	u32 last_submitted_seqno;
+	unsigned long last_retire_timestamp;
 
 	spinlock_t lock;
 
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index f02f581..3eee6c2 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -316,28 +316,6 @@ static void print_error_buffers(struct drm_i915_error_state_buf *m,
 	}
 }
 
-static const char *hangcheck_action_to_str(enum intel_engine_hangcheck_action a)
-{
-	switch (a) {
-	case HANGCHECK_IDLE:
-		return "idle";
-	case HANGCHECK_WAIT:
-		return "wait";
-	case HANGCHECK_ACTIVE_SEQNO:
-		return "active seqno";
-	case HANGCHECK_ACTIVE_HEAD:
-		return "active head";
-	case HANGCHECK_ACTIVE_SUBUNITS:
-		return "active subunits";
-	case HANGCHECK_KICK:
-		return "kick";
-	case HANGCHECK_HUNG:
-		return "hung";
-	}
-
-	return "unknown";
-}
-
 static void error_print_instdone(struct drm_i915_error_state_buf *m,
 				 struct drm_i915_error_engine *ee)
 {
@@ -445,9 +423,13 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
 	err_printf(m, "  waiting: %s\n", yesno(ee->waiting));
 	err_printf(m, "  ring->head: 0x%08x\n", ee->cpu_ring_head);
 	err_printf(m, "  ring->tail: 0x%08x\n", ee->cpu_ring_tail);
-	err_printf(m, "  hangcheck: %s [%d]\n",
-		   hangcheck_action_to_str(ee->hangcheck_action),
-		   ee->hangcheck_score);
+	err_printf(m, "  hangcheck stall: %s\n", yesno(ee->hangcheck_stall));
+	err_printf(m, "  hangcheck action: %s\n",
+		   hangcheck_action_to_str(ee->hangcheck_action));
+	err_printf(m, "  hangcheck action timestamp: %lu, %u ms ago\n",
+		   ee->hangcheck_timestamp,
+		   jiffies_to_msecs(jiffies - ee->hangcheck_timestamp));
+
 	error_print_request(m, "  ELSP[0]: ", &ee->execlist[0]);
 	error_print_request(m, "  ELSP[1]: ", &ee->execlist[1]);
 }
@@ -537,7 +519,6 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 	struct pci_dev *pdev = dev_priv->drm.pdev;
 	struct drm_i915_error_state *error = error_priv->error;
 	struct drm_i915_error_object *obj;
-	int max_hangcheck_score;
 	int i, j;
 
 	if (!error) {
@@ -554,13 +535,9 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 	err_printf(m, "Uptime: %ld s %ld us\n",
 		   error->uptime.tv_sec, error->uptime.tv_usec);
 	err_print_capabilities(m, &error->device_info);
-	max_hangcheck_score = 0;
-	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-		if (error->engine[i].hangcheck_score > max_hangcheck_score)
-			max_hangcheck_score = error->engine[i].hangcheck_score;
-	}
+
 	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-		if (error->engine[i].hangcheck_score == max_hangcheck_score &&
+		if (error->engine[i].hangcheck_stall &&
 		    error->engine[i].pid != -1) {
 			err_printf(m, "Active process (on ring %s): %s [%d]\n",
 				   engine_str(i),
@@ -946,7 +923,7 @@ static uint32_t i915_error_generate_code(struct drm_i915_private *dev_priv,
 	 * strictly a client bug. Use instdone to differentiate those some.
 	 */
 	for (i = 0; i < I915_NUM_ENGINES; i++) {
-		if (error->engine[i].hangcheck_action == HANGCHECK_HUNG) {
+		if (error->engine[i].hangcheck_stall) {
 			if (engine_id)
 				*engine_id = i;
 
@@ -1164,8 +1141,9 @@ static void error_record_engine_registers(struct drm_i915_error_state *error,
 		ee->hws = I915_READ(mmio);
 	}
 
-	ee->hangcheck_score = engine->hangcheck.score;
+	ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
 	ee->hangcheck_action = engine->hangcheck.action;
+	ee->hangcheck_stall = engine->hangcheck.stall;
 
 	if (USES_PPGTT(dev_priv)) {
 		int i;
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 3d2e81c..71ee837 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -236,13 +236,13 @@ head_stuck(struct intel_engine_cs *engine, u64 acthd)
 		memset(&engine->hangcheck.instdone, 0,
 		       sizeof(engine->hangcheck.instdone));
 
-		return HANGCHECK_ACTIVE_HEAD;
+		return ENGINE_ACTIVE_HEAD;
 	}
 
 	if (!subunits_stuck(engine))
-		return HANGCHECK_ACTIVE_SUBUNITS;
+		return ENGINE_ACTIVE_SUBUNITS;
 
-	return HANGCHECK_HUNG;
+	return ENGINE_DEAD;
 }
 
 static enum intel_engine_hangcheck_action
@@ -253,11 +253,11 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
 	u32 tmp;
 
 	ha = head_stuck(engine, acthd);
-	if (ha != HANGCHECK_HUNG)
+	if (ha != ENGINE_DEAD)
 		return ha;
 
 	if (IS_GEN2(dev_priv))
-		return HANGCHECK_HUNG;
+		return ENGINE_DEAD;
 
 	/* Is the chip hanging on a WAIT_FOR_EVENT?
 	 * If so we can simply poke the RB_WAIT bit
@@ -270,25 +270,25 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
 				  "Kicking stuck wait on %s",
 				  engine->name);
 		I915_WRITE_CTL(engine, tmp);
-		return HANGCHECK_KICK;
+		return ENGINE_WAIT_KICK;
 	}
 
 	if (INTEL_GEN(dev_priv) >= 6 && tmp & RING_WAIT_SEMAPHORE) {
 		switch (semaphore_passed(engine)) {
 		default:
-			return HANGCHECK_HUNG;
+			return ENGINE_DEAD;
 		case 1:
 			i915_handle_error(dev_priv, 0,
 					  "Kicking stuck semaphore on %s",
 					  engine->name);
 			I915_WRITE_CTL(engine, tmp);
-			return HANGCHECK_KICK;
+			return ENGINE_WAIT_KICK;
 		case 0:
-			return HANGCHECK_WAIT;
+			return ENGINE_WAIT;
 		}
 	}
 
-	return HANGCHECK_HUNG;
+	return ENGINE_DEAD;
 }
 
 static void hangcheck_load_sample(struct intel_engine_cs *engine,
@@ -306,7 +306,6 @@ static void hangcheck_load_sample(struct intel_engine_cs *engine,
 
 	hc->acthd = intel_engine_get_active_head(engine);
 	hc->seqno = intel_engine_get_seqno(engine);
-	hc->score = engine->hangcheck.score;
 }
 
 static void hangcheck_store_sample(struct intel_engine_cs *engine,
@@ -314,8 +313,8 @@ static void hangcheck_store_sample(struct intel_engine_cs *engine,
 {
 	engine->hangcheck.acthd = hc->acthd;
 	engine->hangcheck.seqno = hc->seqno;
-	engine->hangcheck.score = hc->score;
 	engine->hangcheck.action = hc->action;
+	engine->hangcheck.stall = hc->stall;
 }
 
 static enum intel_engine_hangcheck_action
@@ -323,10 +322,10 @@ hangcheck_get_action(struct intel_engine_cs *engine,
 		     const struct intel_engine_hangcheck *hc)
 {
 	if (engine->hangcheck.seqno != hc->seqno)
-		return HANGCHECK_ACTIVE_SEQNO;
+		return ENGINE_ACTIVE_SEQNO;
 
 	if (i915_seqno_passed(hc->seqno, intel_engine_last_submit(engine)))
-		return HANGCHECK_IDLE;
+		return ENGINE_IDLE;
 
 	return engine_stuck(engine, hc->acthd);
 }
@@ -334,60 +333,64 @@ hangcheck_get_action(struct intel_engine_cs *engine,
 static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
 					struct intel_engine_hangcheck *hc)
 {
+	unsigned long timeout = I915_ENGINE_DEAD_TIMEOUT;
+	unsigned long last_retire;
+
 	hc->action = hangcheck_get_action(engine, hc);
 
-	switch (hc->action) {
-	case HANGCHECK_IDLE:
-	case HANGCHECK_WAIT:
-		break;
+	/* We always increment the progress
+	 * if the engine is busy and still processing
+	 * the same request, so that no single request
+	 * can run indefinitely (such as a chain of
+	 * batches). The only time we do not increment
+	 * the hangcheck score on this ring, if this
+	 * engine is in a legitimate wait for another
+	 * engine. In that case the waiting engine is a
+	 * victim and we want to be sure we catch the
+	 * right culprit. Then every time we do kick
+	 * the ring, make it as a progress as the seqno
+	 * advancement might ensure and if not, it
+	 * will catch the hanging engine.
+	 */
 
-	case HANGCHECK_ACTIVE_HEAD:
-	case HANGCHECK_ACTIVE_SUBUNITS:
-		/* We always increment the hangcheck score
-		 * if the engine is busy and still processing
-		 * the same request, so that no single request
-		 * can run indefinitely (such as a chain of
-		 * batches). The only time we do not increment
-		 * the hangcheck score on this ring, if this
-		 * engine is in a legitimate wait for another
-		 * engine. In that case the waiting engine is a
-		 * victim and we want to be sure we catch the
-		 * right culprit. Then every time we do kick
-		 * the ring, add a small increment to the
-		 * score so that we can catch a batch that is
-		 * being repeatedly kicked and so responsible
-		 * for stalling the machine.
-		 */
-		hc->score += 1;
-		break;
+	switch (hc->action) {
+	case ENGINE_IDLE:
+	case ENGINE_ACTIVE_SEQNO:
+		/* Clear head and subunit states on seqno movement */
+		hc->acthd = 0;
 
-	case HANGCHECK_KICK:
-		hc->score += 5;
-		break;
+		memset(&engine->hangcheck.instdone, 0,
+		       sizeof(engine->hangcheck.instdone));
 
-	case HANGCHECK_HUNG:
-		hc->score += 20;
+		/* Intentional fall through */
+	case ENGINE_WAIT_KICK:
+	case ENGINE_WAIT:
+		engine->hangcheck.action_timestamp = jiffies;
 		break;
 
-	case HANGCHECK_ACTIVE_SEQNO:
-		/* Gradually reduce the count so that we catch DoS
-		 * attempts across multiple batches.
+	case ENGINE_ACTIVE_HEAD:
+	case ENGINE_ACTIVE_SUBUNITS:
+		/* Seqno stuck with still active engine gets leeway,
+		 * in hopes that it is just a long shader.
 		 */
-		if (hc->score > 0)
-			hc->score -= 15;
-		if (hc->score < 0)
-			hc->score = 0;
-
-		/* Clear head and subunit states on seqno movement */
-		hc->acthd = 0;
+		timeout = I915_SEQNO_DEAD_TIMEOUT;
+		break;
 
-		memset(&engine->hangcheck.instdone, 0,
-		       sizeof(engine->hangcheck.instdone));
+	case ENGINE_DEAD:
 		break;
 
 	default:
 		MISSING_CASE(hc->action);
 	}
+
+	last_retire = READ_ONCE(engine->timeline->last_retire_timestamp);
+
+	/* If there is retire on the timeline, take that into account */
+	if (time_after(last_retire, engine->hangcheck.action_timestamp))
+		engine->hangcheck.action_timestamp = last_retire;
+
+	hc->stall = time_after(jiffies,
+			       engine->hangcheck.action_timestamp + timeout);
 }
 
 static void hangcheck_declare_hang(struct drm_i915_private *i915,
@@ -454,9 +457,9 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 		hangcheck_accumulate_sample(engine, hc);
 		hangcheck_store_sample(engine, hc);
 
-		if (hc->score >= HANGCHECK_SCORE_RING_HUNG) {
+		if (engine->hangcheck.stall) {
 			hung |= intel_engine_flag(engine);
-			if (hc->action != HANGCHECK_HUNG)
+			if (hc->action != ENGINE_DEAD)
 				stuck |= intel_engine_flag(engine);
 		}
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 3152b2b..d2d870b 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -65,16 +65,37 @@ struct intel_hw_status_page {
 	 GEN8_SEMAPHORE_OFFSET(from, (__ring)->id))
 
 enum intel_engine_hangcheck_action {
-	HANGCHECK_IDLE = 0,
-	HANGCHECK_WAIT,
-	HANGCHECK_ACTIVE_SEQNO,
-	HANGCHECK_ACTIVE_HEAD,
-	HANGCHECK_ACTIVE_SUBUNITS,
-	HANGCHECK_KICK,
-	HANGCHECK_HUNG,
+	ENGINE_IDLE = 0,
+	ENGINE_WAIT,
+	ENGINE_ACTIVE_SEQNO,
+	ENGINE_ACTIVE_HEAD,
+	ENGINE_ACTIVE_SUBUNITS,
+	ENGINE_WAIT_KICK,
+	ENGINE_DEAD,
 };
 
-#define HANGCHECK_SCORE_RING_HUNG 31
+static inline const char *
+hangcheck_action_to_str(const enum intel_engine_hangcheck_action a)
+{
+	switch (a) {
+	case ENGINE_IDLE:
+		return "idle";
+	case ENGINE_WAIT:
+		return "wait";
+	case ENGINE_ACTIVE_SEQNO:
+		return "active seqno";
+	case ENGINE_ACTIVE_HEAD:
+		return "active head";
+	case ENGINE_ACTIVE_SUBUNITS:
+		return "active subunits";
+	case ENGINE_WAIT_KICK:
+		return "wait kick";
+	case ENGINE_DEAD:
+		return "dead";
+	}
+
+	return "unknown";
+}
 
 #define I915_MAX_SLICES	3
 #define I915_MAX_SUBSLICES 3
@@ -106,10 +127,11 @@ struct intel_instdone {
 struct intel_engine_hangcheck {
 	u64 acthd;
 	u32 seqno;
-	int score;
 	enum intel_engine_hangcheck_action action;
+	unsigned long action_timestamp;
 	int deadlock;
 	struct intel_instdone instdone;
+	bool stall;
 };
 
 struct intel_ring {
-- 
2.7.4