[Intel-gfx] [PATCH 2/6] drm/i915: Decouple hang detection from hangcheck period

Tue Nov 15 14:36:32 UTC 2016

Hangcheck state accumulation has gained more steps
over the years, like head movement and more recently the
subunit inactivity check. As the subunit sampling is only
done if the previous state check showed inactivity, we
have added more stages (and time) to reach a hang verdict.

Asymmetric engine states led to a different actual weight of
'one hangcheck unit', and it was demonstrated in some
hangs that, due to the difference in stages, simpler engines
were falsely accused of a hang as their score was much
quicker to accumulate above the hang threshold.

To completely decouple the hangcheck guilty score
from the hangcheck period, convert hangcheck score to a
rough period of inactivity measurement. As these are
tracked as jiffies, they are meaningful also across
reset boundaries. This makes finding a guilty engine
more accurate across multi engine activity scenarios,
especially across asymmetric engines.

We lose the ability to detect cross-batch malicious attempts
to hinder progress. The plan is to move this functionality
into context banning, which is a more natural fit,
later in the series.

v2: use time_before macros (Chris)
    reinstate the pardoning of moving engine after hc (Chris)

Cc: Chris Wilson <chris at chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala at intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c      |  17 ++--
 drivers/gpu/drm/i915/i915_drv.h          |  20 +++--
 drivers/gpu/drm/i915/i915_gem.c          |  11 ++-
 drivers/gpu/drm/i915/i915_gem_context.c  |   2 +-
 drivers/gpu/drm/i915/i915_gpu_error.c    |  39 ++-------
 drivers/gpu/drm/i915/intel_breadcrumbs.c |   2 +-
 drivers/gpu/drm/i915/intel_hangcheck.c   | 134 +++++++++++++++++++++----------
 drivers/gpu/drm/i915/intel_ringbuffer.h  |  26 +++++-
 8 files changed, 159 insertions(+), 92 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 1cc971c..cf0ca0f 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1351,10 +1351,12 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 		seq_printf(m, "\tseqno = %x [current %x, last %x]\n",
 			   engine->hangcheck.seqno, seqno[id],
 			   intel_engine_last_submit(engine));
-		seq_printf(m, "\twaiters? %s, fake irq active? %s\n",
+		seq_printf(m, "\twaiters? %s, fake irq active? %s, guilty? %s\n",
 			   yesno(intel_engine_has_waiter(engine)),
 			   yesno(test_bit(engine->id,
-					  &dev_priv->gpu_error.missed_irq_rings)));
+					  &dev_priv->gpu_error.missed_irq_rings)),
+			   yesno(engine->hangcheck.guilty));
+
 		spin_lock_irq(&b->lock);
 		for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
 			struct intel_wait *w = container_of(rb, typeof(*w), node);
@@ -1367,8 +1369,11 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 		seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
 			   (long long)engine->hangcheck.acthd,
 			   (long long)acthd[id]);
-		seq_printf(m, "\tscore = %d\n", engine->hangcheck.score);
-		seq_printf(m, "\taction = %d\n", engine->hangcheck.action);
+		seq_printf(m, "\taction = %s(%d) %d ms ago\n",
+			   hangcheck_action_to_str(engine->hangcheck.action),
+			   engine->hangcheck.action,
+			   jiffies_to_msecs(jiffies -
+					    engine->hangcheck.action_timestamp));
 
 		if (engine->id == RCS) {
 			seq_puts(m, "\tinstdone read =\n");
@@ -3163,11 +3168,11 @@ static int i915_engine_info(struct seq_file *m, void *unused)
 		u64 addr;
 
 		seq_printf(m, "%s\n", engine->name);
-		seq_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x [score %d]\n",
+		seq_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x [%d ms]\n",
 			   intel_engine_get_seqno(engine),
 			   intel_engine_last_submit(engine),
 			   engine->hangcheck.seqno,
-			   engine->hangcheck.score);
+			   jiffies_to_msecs(jiffies - engine->hangcheck.action_timestamp));
 
 		rcu_read_lock();
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 006914c..3caa55d 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -782,7 +782,8 @@ struct drm_i915_error_state {
 		/* Software tracked state */
 		bool waiting;
 		int num_waiters;
-		int hangcheck_score;
+		unsigned long hangcheck_timestamp;
+		bool hangcheck_guilty;
 		enum intel_engine_hangcheck_action hangcheck_action;
 		struct i915_address_space *vm;
 		int num_requests;
@@ -1429,11 +1430,16 @@ struct i915_error_state_file_priv {
 #define I915_FENCE_TIMEOUT (10 * HZ) /* 10s */
 
 struct i915_gpu_error {
-	/* For hangcheck timer */
-#define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
-#define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD)
-	/* Hang gpu twice in this window and your context gets banned */
-#define DRM_I915_CTX_BAN_PERIOD DIV_ROUND_UP(8*DRM_I915_HANGCHECK_PERIOD, 1000)
+
+#define DRM_I915_STUCK_PERIOD_SEC 24 /* No observed seqno progress */
+#define DRM_I915_HUNG_PERIOD_SEC 4 /* No observed seqno nor head progress */
+
+/* Hang gpu twice in this window and your context gets banned */
+#define DRM_I915_CTX_BAN_PERIOD_SEC 12
+
+#define HANGCHECK_STUCK_JIFFIES (DRM_I915_STUCK_PERIOD_SEC * HZ)
+#define HANGCHECK_HUNG_JIFFIES (DRM_I915_HUNG_PERIOD_SEC * HZ)
+#define HANGCHECK_PERIOD_JIFFIES msecs_to_jiffies(1500)
 
 	struct delayed_work hangcheck_work;
 
@@ -2723,7 +2729,7 @@ static inline void i915_queue_hangcheck(struct drm_i915_private *dev_priv)
 	 * we will ignore a hung ring if a second ring is kept busy.
 	 */
 
-	delay = round_jiffies_up_relative(DRM_I915_HANGCHECK_JIFFIES);
+	delay = round_jiffies_up_relative(HANGCHECK_PERIOD_JIFFIES);
 	queue_delayed_work(system_long_wq,
 			   &dev_priv->gpu_error.hangcheck_work, delay);
 }
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 3fb5e66..708e289 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2703,9 +2703,16 @@ static void i915_gem_reset_engine(struct intel_engine_cs *engine)
 	if (!request)
 		return;
 
-	ring_hung = engine->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG;
-	if (engine->hangcheck.seqno != intel_engine_get_seqno(engine))
+	ring_hung = engine->hangcheck.guilty;
+	if (engine->hangcheck.seqno != intel_engine_get_seqno(engine)) {
+		if (ring_hung)
+			DRM_ERROR("%s pardoned due to progress after hangcheck %x vs %x\n",
+				  engine->name,
+				  engine->hangcheck.seqno,
+				  intel_engine_get_seqno(engine));
+
 		ring_hung = false;
+	}
 
 	i915_set_reset_status(request->ctx, ring_hung);
 	if (!ring_hung)
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 1f94b8d..958a526 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -331,7 +331,7 @@ __create_hw_context(struct drm_device *dev,
 	 * is no remap info, it will be a NOP. */
 	ctx->remap_slice = ALL_L3_SLICES(dev_priv);
 
-	ctx->hang_stats.ban_period_seconds = DRM_I915_CTX_BAN_PERIOD;
+	ctx->hang_stats.ban_period_seconds = DRM_I915_CTX_BAN_PERIOD_SEC;
 	ctx->ring_size = 4 * PAGE_SIZE;
 	ctx->desc_template = GEN8_CTX_ADDRESSING_MODE(dev_priv) <<
 			     GEN8_CTX_ADDRESSING_MODE_SHIFT;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index f02f581..8d0f2bc 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -316,28 +316,6 @@ static void print_error_buffers(struct drm_i915_error_state_buf *m,
 	}
 }
 
-static const char *hangcheck_action_to_str(enum intel_engine_hangcheck_action a)
-{
-	switch (a) {
-	case HANGCHECK_IDLE:
-		return "idle";
-	case HANGCHECK_WAIT:
-		return "wait";
-	case HANGCHECK_ACTIVE_SEQNO:
-		return "active seqno";
-	case HANGCHECK_ACTIVE_HEAD:
-		return "active head";
-	case HANGCHECK_ACTIVE_SUBUNITS:
-		return "active subunits";
-	case HANGCHECK_KICK:
-		return "kick";
-	case HANGCHECK_HUNG:
-		return "hung";
-	}
-
-	return "unknown";
-}
-
 static void error_print_instdone(struct drm_i915_error_state_buf *m,
 				 struct drm_i915_error_engine *ee)
 {
@@ -445,9 +423,10 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
 	err_printf(m, "  waiting: %s\n", yesno(ee->waiting));
 	err_printf(m, "  ring->head: 0x%08x\n", ee->cpu_ring_head);
 	err_printf(m, "  ring->tail: 0x%08x\n", ee->cpu_ring_tail);
-	err_printf(m, "  hangcheck: %s [%d]\n",
+	err_printf(m, "  hangcheck: %s %s [%lu]\n",
+		   yesno(ee->hangcheck_guilty),
 		   hangcheck_action_to_str(ee->hangcheck_action),
-		   ee->hangcheck_score);
+		   ee->hangcheck_timestamp);
 	error_print_request(m, "  ELSP[0]: ", &ee->execlist[0]);
 	error_print_request(m, "  ELSP[1]: ", &ee->execlist[1]);
 }
@@ -537,7 +516,6 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 	struct pci_dev *pdev = dev_priv->drm.pdev;
 	struct drm_i915_error_state *error = error_priv->error;
 	struct drm_i915_error_object *obj;
-	int max_hangcheck_score;
 	int i, j;
 
 	if (!error) {
@@ -554,13 +532,9 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 	err_printf(m, "Uptime: %ld s %ld us\n",
 		   error->uptime.tv_sec, error->uptime.tv_usec);
 	err_print_capabilities(m, &error->device_info);
-	max_hangcheck_score = 0;
-	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-		if (error->engine[i].hangcheck_score > max_hangcheck_score)
-			max_hangcheck_score = error->engine[i].hangcheck_score;
-	}
+
 	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-		if (error->engine[i].hangcheck_score == max_hangcheck_score &&
+		if (error->engine[i].hangcheck_guilty &&
 		    error->engine[i].pid != -1) {
 			err_printf(m, "Active process (on ring %s): %s [%d]\n",
 				   engine_str(i),
@@ -1164,8 +1138,9 @@ static void error_record_engine_registers(struct drm_i915_error_state *error,
 		ee->hws = I915_READ(mmio);
 	}
 
-	ee->hangcheck_score = engine->hangcheck.score;
+	ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
 	ee->hangcheck_action = engine->hangcheck.action;
+	ee->hangcheck_guilty = engine->hangcheck.guilty;
 
 	if (USES_PPGTT(dev_priv)) {
 		int i;
diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
index c9c46a5..ebb3b8e 100644
--- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
@@ -57,7 +57,7 @@ static void intel_breadcrumbs_hangcheck(unsigned long data)
 
 static unsigned long wait_timeout(void)
 {
-	return round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES);
+	return round_jiffies_up(jiffies + HANGCHECK_PERIOD_JIFFIES);
 }
 
 static void intel_breadcrumbs_fake_irq(unsigned long data)
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 3d2e81c..7bc8eaa 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -306,7 +306,6 @@ static void hangcheck_load_sample(struct intel_engine_cs *engine,
 
 	hc->acthd = intel_engine_get_active_head(engine);
 	hc->seqno = intel_engine_get_seqno(engine);
-	hc->score = engine->hangcheck.score;
 }
 
 static void hangcheck_store_sample(struct intel_engine_cs *engine,
@@ -314,7 +313,6 @@ static void hangcheck_store_sample(struct intel_engine_cs *engine,
 {
 	engine->hangcheck.acthd = hc->acthd;
 	engine->hangcheck.seqno = hc->seqno;
-	engine->hangcheck.score = hc->score;
 	engine->hangcheck.action = hc->action;
 }
 
@@ -336,58 +334,109 @@ static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
 {
 	hc->action = hangcheck_get_action(engine, hc);
 
+	/* We always increment the progress
+	 * if the engine is busy and still processing
+	 * the same request, so that no single request
+	 * can run indefinitely (such as a chain of
+	 * batches). The only time we do not increment
+	 * the hangcheck score on this ring, if this
+	 * engine is in a legitimate wait for another
+	 * engine. In that case the waiting engine is a
+	 * victim and we want to be sure we catch the
+	 * right culprit. Then every time we do kick
+	 * the ring, make it as a progress as the seqno
+	 * advancement might ensure and if not, it
+	 * will catch the hanging engine.
+	 */
+
 	switch (hc->action) {
 	case HANGCHECK_IDLE:
-	case HANGCHECK_WAIT:
+	case HANGCHECK_ACTIVE_SEQNO:
+		/* Clear head and subunit states on seqno movement */
+		hc->acthd = 0;
+
+		memset(&engine->hangcheck.instdone, 0,
+		       sizeof(engine->hangcheck.instdone));
+
+		engine->hangcheck.action_timestamp = jiffies;
 		break;
 
 	case HANGCHECK_ACTIVE_HEAD:
 	case HANGCHECK_ACTIVE_SUBUNITS:
-		/* We always increment the hangcheck score
-		 * if the engine is busy and still processing
-		 * the same request, so that no single request
-		 * can run indefinitely (such as a chain of
-		 * batches). The only time we do not increment
-		 * the hangcheck score on this ring, if this
-		 * engine is in a legitimate wait for another
-		 * engine. In that case the waiting engine is a
-		 * victim and we want to be sure we catch the
-		 * right culprit. Then every time we do kick
-		 * the ring, add a small increment to the
-		 * score so that we can catch a batch that is
-		 * being repeatedly kicked and so responsible
-		 * for stalling the machine.
-		 */
-		hc->score += 1;
-		break;
-
 	case HANGCHECK_KICK:
-		hc->score += 5;
-		break;
 
+	case HANGCHECK_WAIT:
 	case HANGCHECK_HUNG:
-		hc->score += 20;
 		break;
 
-	case HANGCHECK_ACTIVE_SEQNO:
-		/* Gradually reduce the count so that we catch DoS
-		 * attempts across multiple batches.
-		 */
-		if (hc->score > 0)
-			hc->score -= 15;
-		if (hc->score < 0)
-			hc->score = 0;
+	default:
+		MISSING_CASE(hc->action);
+	}
+}
 
-		/* Clear head and subunit states on seqno movement */
-		hc->acthd = 0;
+static bool
+hangcheck_engine_stall(struct intel_engine_cs *engine,
+		       struct intel_engine_hangcheck *hc)
+{
+	const unsigned long last_action = engine->hangcheck.action_timestamp;
 
-		memset(&engine->hangcheck.instdone, 0,
-		       sizeof(engine->hangcheck.instdone));
-		break;
+	if (hc->action == HANGCHECK_ACTIVE_SEQNO ||
+	    hc->action == HANGCHECK_IDLE)
+		return false;
+
+	if (time_before(jiffies, last_action + HANGCHECK_HUNG_JIFFIES))
+		return false;
+
+	if (time_before(jiffies, last_action + HANGCHECK_STUCK_JIFFIES))
+		if (hc->action != HANGCHECK_HUNG)
+			return false;
+
+	return true;
+}
+
+static struct intel_engine_cs *find_lra_engine(struct drm_i915_private *i915,
+					       const unsigned int mask)
+{
+	struct intel_engine_cs *engine = NULL, *c;
+	enum intel_engine_id id;
+
+	for_each_engine_masked(c, i915, mask, id) {
+		if (engine == NULL) {
+			engine = c;
+			continue;
+		}
+
+		if (time_before(c->hangcheck.action_timestamp,
+				engine->hangcheck.action_timestamp))
+			engine = c;
+		else if (c->hangcheck.action_timestamp ==
+			 engine->hangcheck.action_timestamp &&
+			 c->hangcheck.seqno < engine->hangcheck.seqno)
+			engine = c;
 
-	default:
-		MISSING_CASE(hc->action);
 	}
+
+	return engine;
+}
+
+static struct intel_engine_cs *find_guilty_engine(struct drm_i915_private *i915,
+						  const unsigned int hung_mask,
+						  const unsigned int stuck_mask)
+{
+	struct intel_engine_cs *engine;
+
+	engine = find_lra_engine(i915, hung_mask);
+	if (engine)
+		return engine;
+
+	engine = find_lra_engine(i915, stuck_mask);
+	if (engine)
+		return engine;
+
+	DRM_DEBUG_DRIVER("No engine found for hang (0x%x,0x%x)\n",
+			 hung_mask, stuck_mask);
+	/* Should not get here. But as a safety valve, blame someone */
+	return find_lra_engine(i915, ~0);
 }
 
 static void hangcheck_declare_hang(struct drm_i915_private *i915,
@@ -454,7 +503,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 		hangcheck_accumulate_sample(engine, hc);
 		hangcheck_store_sample(engine, hc);
 
-		if (hc->score >= HANGCHECK_SCORE_RING_HUNG) {
+		if (hangcheck_engine_stall(engine, hc)) {
 			hung |= intel_engine_flag(engine);
 			if (hc->action != HANGCHECK_HUNG)
 				stuck |= intel_engine_flag(engine);
@@ -463,8 +512,11 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 		busy_count += busy;
 	}
 
-	if (hung)
+	if (hung) {
+		engine = find_guilty_engine(dev_priv, hung, stuck);
+		engine->hangcheck.guilty = true;
 		hangcheck_declare_hang(dev_priv, hung, stuck);
+	}
 
 	/* Reset timer in case GPU hangs without another request being added */
 	if (busy_count)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 3152b2b..92852e5 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -74,7 +74,28 @@ enum intel_engine_hangcheck_action {
 	HANGCHECK_HUNG,
 };
 
-#define HANGCHECK_SCORE_RING_HUNG 31
+static inline const char *
+hangcheck_action_to_str(const enum intel_engine_hangcheck_action a)
+{
+	switch (a) {
+	case HANGCHECK_IDLE:
+		return "idle";
+	case HANGCHECK_WAIT:
+		return "wait";
+	case HANGCHECK_ACTIVE_SEQNO:
+		return "active seqno";
+	case HANGCHECK_ACTIVE_HEAD:
+		return "active head";
+	case HANGCHECK_ACTIVE_SUBUNITS:
+		return "active subunits";
+	case HANGCHECK_KICK:
+		return "kick";
+	case HANGCHECK_HUNG:
+		return "hung";
+	}
+
+	return "unknown";
+}
 
 #define I915_MAX_SLICES	3
 #define I915_MAX_SUBSLICES 3
@@ -106,10 +127,11 @@ struct intel_instdone {
 struct intel_engine_hangcheck {
 	u64 acthd;
 	u32 seqno;
-	int score;
 	enum intel_engine_hangcheck_action action;
+	unsigned long action_timestamp;
 	int deadlock;
 	struct intel_instdone instdone;
+	bool guilty;
 };
 
 struct intel_ring {
-- 
2.7.4