[PATCH 51/60] hangcheck

Chris Wilson chris at chris-wilson.co.uk
Fri Dec 28 10:43:41 UTC 2018


---
 drivers/gpu/drm/i915/i915_debugfs.c     |  80 +-----
 drivers/gpu/drm/i915/i915_drv.h         |   5 +-
 drivers/gpu/drm/i915/i915_gpu_error.c   | 120 +++------
 drivers/gpu/drm/i915/i915_gpu_error.h   |   9 +-
 drivers/gpu/drm/i915/intel_engine_cs.c  |   9 +-
 drivers/gpu/drm/i915/intel_hangcheck.c  | 330 +++++++-----------------
 drivers/gpu/drm/i915/intel_ringbuffer.h |  46 +---
 7 files changed, 148 insertions(+), 451 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 6c463dbfe3f6..8b18b0fd86b5 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1252,42 +1252,10 @@ static int i915_frequency_info(struct seq_file *m, void *unused)
 	return ret;
 }
 
-static void i915_instdone_info(struct drm_i915_private *dev_priv,
-			       struct seq_file *m,
-			       struct intel_instdone *instdone)
-{
-	int slice;
-	int subslice;
-
-	seq_printf(m, "\t\tINSTDONE: 0x%08x\n",
-		   instdone->instdone);
-
-	if (INTEL_GEN(dev_priv) <= 3)
-		return;
-
-	seq_printf(m, "\t\tSC_INSTDONE: 0x%08x\n",
-		   instdone->slice_common);
-
-	if (INTEL_GEN(dev_priv) <= 6)
-		return;
-
-	for_each_instdone_slice_subslice(dev_priv, slice, subslice)
-		seq_printf(m, "\t\tSAMPLER_INSTDONE[%d][%d]: 0x%08x\n",
-			   slice, subslice, instdone->sampler[slice][subslice]);
-
-	for_each_instdone_slice_subslice(dev_priv, slice, subslice)
-		seq_printf(m, "\t\tROW_INSTDONE[%d][%d]: 0x%08x\n",
-			   slice, subslice, instdone->row[slice][subslice]);
-}
-
 static int i915_hangcheck_info(struct seq_file *m, void *unused)
 {
 	struct drm_i915_private *dev_priv = node_to_i915(m->private);
 	struct intel_engine_cs *engine;
-	u64 acthd[I915_NUM_ENGINES];
-	u32 seqno[I915_NUM_ENGINES];
-	struct intel_instdone instdone;
-	intel_wakeref_t wakeref;
 	enum intel_engine_id id;
 
 	if (test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
@@ -1304,15 +1272,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 		return 0;
 	}
 
-	with_intel_runtime_pm(dev_priv, wakeref) {
-		for_each_engine(engine, dev_priv, id) {
-			acthd[id] = intel_engine_get_active_head(engine);
-			seqno[id] = intel_engine_get_seqno(engine);
-		}
-
-		intel_engine_get_instdone(dev_priv->engine[RCS], &instdone);
-	}
-
 	if (timer_pending(&dev_priv->gpu_error.hangcheck_work.timer))
 		seq_printf(m, "Hangcheck active, timer fires in %dms\n",
 			   jiffies_to_msecs(dev_priv->gpu_error.hangcheck_work.timer.expires -
@@ -1329,15 +1288,13 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 		struct rb_node *rb;
 
 		seq_printf(m, "%s:\n", engine->name);
-		seq_printf(m, "\tseqno = %x [current %x, last %x]\n",
-			   engine->hangcheck.seqno, seqno[id],
-			   intel_engine_last_submit(engine));
-		seq_printf(m, "\twaiters? %s, fake irq active? %s, stalled? %s, wedged? %s\n",
+		seq_printf(m, "\tseqno = %llx, emitted %dms ago\n",
+			   engine->hangcheck ? engine->hangcheck->fence.seqno : 0,
+			   engine->hangcheck ? jiffies_to_msecs(jiffies - engine->hangcheck->emitted_jiffies) : -1);
+		seq_printf(m, "\twaiters? %s, fake irq active? %s\n",
 			   yesno(intel_engine_has_waiter(engine)),
 			   yesno(test_bit(engine->id,
-					  &dev_priv->gpu_error.missed_irq_rings)),
-			   yesno(engine->hangcheck.stalled),
-			   yesno(engine->hangcheck.wedged));
+					  &dev_priv->gpu_error.missed_irq_rings)));
 
 		spin_lock_irq(&b->rb_lock);
 		for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
@@ -1349,26 +1306,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 				   lower_32_bits(w->global_seqno));
 		}
 		spin_unlock_irq(&b->rb_lock);
-
-		seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
-			   (long long)engine->hangcheck.acthd,
-			   (long long)acthd[id]);
-		seq_printf(m, "\taction = %s(%d) %d ms ago\n",
-			   hangcheck_action_to_str(engine->hangcheck.action),
-			   engine->hangcheck.action,
-			   jiffies_to_msecs(jiffies -
-					    engine->hangcheck.action_timestamp));
-
-		if (engine->id == RCS) {
-			seq_puts(m, "\tinstdone read =\n");
-
-			i915_instdone_info(dev_priv, m, &instdone);
-
-			seq_puts(m, "\tinstdone accu =\n");
-
-			i915_instdone_info(dev_priv, m,
-					   &engine->hangcheck.instdone);
-		}
 	}
 
 	return 0;
@@ -3903,8 +3840,6 @@ static int
 i915_wedged_set(void *data, u64 val)
 {
 	struct drm_i915_private *i915 = data;
-	struct intel_engine_cs *engine;
-	unsigned int tmp;
 
 	/*
 	 * There is no safeguard against this debugfs entry colliding
@@ -3917,11 +3852,6 @@ i915_wedged_set(void *data, u64 val)
 	if (i915_reset_backoff(&i915->gpu_error))
 		return -EAGAIN;
 
-	for_each_engine_masked(engine, i915, val, tmp) {
-		engine->hangcheck.seqno = intel_engine_get_seqno(engine);
-		engine->hangcheck.stalled = true;
-	}
-
 	i915_handle_error(i915, val, I915_ERROR_CAPTURE,
 			  "Manually set wedged engine mask = %llx", val);
 	return 0;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index e6dfbd7b0c56..4ff898d07639 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -924,9 +924,7 @@ struct i915_gem_mm {
 #define I915_RESET_TIMEOUT (10 * HZ) /* 10s */
 #define I915_FENCE_TIMEOUT (10 * HZ) /* 10s */
 
-#define I915_ENGINE_DEAD_TIMEOUT  (4 * HZ)  /* Seqno, head and subunits dead */
-#define I915_SEQNO_DEAD_TIMEOUT   (12 * HZ) /* Seqno dead with active head */
-
+#define I915_ENGINE_STALLED_TIMEOUT   (12 * HZ) /* No progress? */
 #define I915_ENGINE_WEDGED_TIMEOUT  (60 * HZ)  /* Reset but no recovery? */
 
 struct ddi_vbt_port_info {
@@ -2605,7 +2603,6 @@ extern int i915_driver_load(struct pci_dev *pdev,
 			    const struct pci_device_id *ent);
 extern void i915_driver_unload(struct drm_device *dev);
 
-extern void intel_engine_init_hangcheck(struct intel_engine_cs *engine);
 extern void intel_hangcheck_init(struct drm_i915_private *dev_priv);
 extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv);
 extern unsigned long i915_mch_val(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index be6c80f38351..75e978d21bc5 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -530,13 +530,6 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
 	err_printf(m, "  waiting: %s\n", yesno(ee->waiting));
 	err_printf(m, "  ring->head: 0x%08x\n", ee->cpu_ring_head);
 	err_printf(m, "  ring->tail: 0x%08x\n", ee->cpu_ring_tail);
-	err_printf(m, "  hangcheck stall: %s\n", yesno(ee->hangcheck_stalled));
-	err_printf(m, "  hangcheck action: %s\n",
-		   hangcheck_action_to_str(ee->hangcheck_action));
-	err_printf(m, "  hangcheck action timestamp: %dms (%lu%s)\n",
-		   jiffies_to_msecs(ee->hangcheck_timestamp - epoch),
-		   ee->hangcheck_timestamp,
-		   ee->hangcheck_timestamp == epoch ? "; epoch" : "");
 	err_printf(m, "  engine reset count: %u\n", ee->reset_count);
 
 	for (n = 0; n < ee->num_ports; n++) {
@@ -677,17 +670,6 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
 		   jiffies_to_msecs(jiffies - error->capture),
 		   jiffies_to_msecs(error->capture - error->epoch));
 
-	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-		if (error->engine[i].hangcheck_stalled &&
-		    error->engine[i].context.pid) {
-			err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
-				   engine_name(m->i915, i),
-				   error->engine[i].context.comm,
-				   error->engine[i].context.pid,
-				   error->engine[i].context.ban_score,
-				   bannable(&error->engine[i].context));
-		}
-	}
 	err_printf(m, "Reset count: %u\n", error->reset_count);
 	err_printf(m, "Suspend count: %u\n", error->suspend_count);
 	err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform));
@@ -1123,40 +1105,6 @@ static u32 capture_error_bo(struct drm_i915_error_buffer *err,
 	return i;
 }
 
-/* Generate a semi-unique error code. The code is not meant to have meaning, The
- * code's only purpose is to try to prevent false duplicated bug reports by
- * grossly estimating a GPU error state.
- *
- * TODO Ideally, hashing the batchbuffer would be a very nice way to determine
- * the hang if we could strip the GTT offset information from it.
- *
- * It's only a small step better than a random number in its current form.
- */
-static uint32_t i915_error_generate_code(struct drm_i915_private *dev_priv,
-					 struct i915_gpu_state *error,
-					 int *engine_id)
-{
-	uint32_t error_code = 0;
-	int i;
-
-	/* IPEHR would be an ideal way to detect errors, as it's the gross
-	 * measure of "the command that hung." However, has some very common
-	 * synchronization commands which almost always appear in the case
-	 * strictly a client bug. Use instdone to differentiate those some.
-	 */
-	for (i = 0; i < I915_NUM_ENGINES; i++) {
-		if (error->engine[i].hangcheck_stalled) {
-			if (engine_id)
-				*engine_id = i;
-
-			return error->engine[i].ipehr ^
-			       error->engine[i].instdone.instdone;
-		}
-	}
-
-	return error_code;
-}
-
 static void gem_record_fences(struct i915_gpu_state *error)
 {
 	struct drm_i915_private *dev_priv = error->i915;
@@ -1317,9 +1265,6 @@ static void error_record_engine_registers(struct i915_gpu_state *error,
 	}
 
 	ee->idle = intel_engine_is_idle(engine);
-	ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
-	ee->hangcheck_action = engine->hangcheck.action;
-	ee->hangcheck_stalled = engine->hangcheck.stalled;
 	ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error,
 						  engine);
 
@@ -1757,31 +1702,35 @@ static void capture_reg_state(struct i915_gpu_state *error)
 	error->pgtbl_er = I915_READ(PGTBL_ER);
 }
 
-static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
-				   struct i915_gpu_state *error,
-				   u32 engine_mask,
-				   const char *error_msg)
+static const char *
+error_msg(struct i915_gpu_state *error,
+	  unsigned long engine_mask,
+	  const char *msg)
 {
-	u32 ecode;
-	int engine_id = -1, len;
+	int id, len;
 
-	ecode = i915_error_generate_code(dev_priv, error, &engine_id);
+	len = scnprintf(error->error_msg, sizeof(error->error_msg), "GPU hang");
 
-	len = scnprintf(error->error_msg, sizeof(error->error_msg),
-			"GPU HANG: ecode %d:%d:0x%08x",
-			INTEL_GEN(dev_priv), engine_id, ecode);
+	for_each_set_bit(id, &engine_mask, BITS_PER_LONG) {
+		if (!error->engine[id].context.pid)
+			continue;
 
-	if (engine_id != -1 && error->engine[engine_id].context.pid)
 		len += scnprintf(error->error_msg + len,
 				 sizeof(error->error_msg) - len,
-				 ", in %s [%d]",
-				 error->engine[engine_id].context.comm,
-				 error->engine[engine_id].context.pid);
+				 " in %s [%d]",
+				 error->engine[id].context.comm,
+				 error->engine[id].context.pid);
+		break;
+	}
 
-	scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
-		  ", reason: %s, action: %s",
-		  error_msg,
-		  engine_mask ? "reset" : "continue");
+	if (msg) {
+		scnprintf(error->error_msg + len,
+			  sizeof(error->error_msg) - len,
+			  ", %s",
+			  msg);
+	}
+
+	return error->error_msg;
 }
 
 static void capture_gen_state(struct i915_gpu_state *error)
@@ -1821,15 +1770,17 @@ static void capture_params(struct i915_gpu_state *error)
 
 static unsigned long capture_find_epoch(const struct i915_gpu_state *error)
 {
-	unsigned long epoch = error->capture;
-	int i;
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	unsigned long epoch;
 
-	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-		const struct drm_i915_error_engine *ee = &error->engine[i];
+	epoch = error->capture;
+	for_each_engine(engine, error->i915, id) {
+		if (!engine->hangcheck)
+			continue;
 
-		if (ee->hangcheck_stalled &&
-		    time_before(ee->hangcheck_timestamp, epoch))
-			epoch = ee->hangcheck_timestamp;
+		if (time_before(engine->hangcheck->emitted_jiffies, epoch))
+			epoch = engine->hangcheck->emitted_jiffies;
 	}
 
 	return epoch;
@@ -1901,7 +1852,7 @@ i915_capture_gpu_state(struct drm_i915_private *i915)
  * i915_capture_error_state - capture an error record for later analysis
  * @i915: i915 device
  * @engine_mask: the mask of engines triggering the hang
- * @error_msg: a message to insert into the error capture header
+ * @msg: supplementary message to attach to the error state
  *
  * Should be called when an error is detected (either a hang or an error
  * interrupt) to capture error state from the time of the error.  Fills
@@ -1909,8 +1860,8 @@ i915_capture_gpu_state(struct drm_i915_private *i915)
  * to pick up.
  */
 void i915_capture_error_state(struct drm_i915_private *i915,
-			      u32 engine_mask,
-			      const char *error_msg)
+			      unsigned long engine_mask,
+			      const char *msg)
 {
 	static bool warned;
 	struct i915_gpu_state *error;
@@ -1926,8 +1877,7 @@ void i915_capture_error_state(struct drm_i915_private *i915,
 	if (IS_ERR(error))
 		return;
 
-	i915_error_capture_msg(i915, error, engine_mask, error_msg);
-	DRM_INFO("%s\n", error->error_msg);
+	dev_info(i915->drm.dev, "%s\n", error_msg(error, engine_mask, msg));
 
 	if (!error->simulated) {
 		spin_lock_irqsave(&i915->gpu_error.lock, flags);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
index ea3346d5acae..7f68db4d2f19 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -83,9 +83,6 @@ struct i915_gpu_state {
 		bool idle;
 		bool waiting;
 		int num_waiters;
-		unsigned long hangcheck_timestamp;
-		bool hangcheck_stalled;
-		enum intel_engine_hangcheck_action hangcheck_action;
 		struct i915_address_space *vm;
 		int num_requests;
 		u32 reset_count;
@@ -304,9 +301,9 @@ __printf(2, 3)
 void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
 
 struct i915_gpu_state *i915_capture_gpu_state(struct drm_i915_private *i915);
-void i915_capture_error_state(struct drm_i915_private *dev_priv,
-			      u32 engine_mask,
-			      const char *error_msg);
+void i915_capture_error_state(struct drm_i915_private *i915,
+			      unsigned long engine_mask,
+			      const char *msg);
 
 static inline struct i915_gpu_state *
 i915_gpu_state_get(struct i915_gpu_state *gpu)
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 82f870c7a1e9..df81d53a9e28 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -489,7 +489,6 @@ void intel_engine_setup_common(struct intel_engine_cs *engine)
 	i915_timeline_set_subclass(&engine->timeline, TIMELINE_ENGINE);
 
 	intel_engine_init_execlist(engine);
-	intel_engine_init_hangcheck(engine);
 	intel_engine_init_batch_pool(engine);
 	intel_engine_init_cmd_parser(engine);
 }
@@ -1113,6 +1112,8 @@ void intel_engines_park(struct drm_i915_private *i915)
 
 		i915_gem_batch_pool_fini(&engine->batch_pool);
 		engine->execlists.no_priolist = false;
+
+		i915_request_put(fetch_and_zero(&engine->hangcheck));
 	}
 }
 
@@ -1140,8 +1141,6 @@ void intel_engines_unpark(struct drm_i915_private *i915)
 
 		if (engine->unpark)
 			engine->unpark(engine);
-
-		intel_engine_init_hangcheck(engine);
 	}
 }
 
@@ -1433,8 +1432,8 @@ void intel_engine_dump(struct intel_engine_cs *engine,
 	drm_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x [%d ms]\n",
 		   intel_engine_get_seqno(engine),
 		   intel_engine_last_submit(engine),
-		   engine->hangcheck.seqno,
-		   jiffies_to_msecs(jiffies - engine->hangcheck.action_timestamp));
+		   engine->hangcheck ? lower_32_bits(i915_request_global_seqno(engine->hangcheck)) : 0,
+		   engine->hangcheck ? jiffies_to_msecs(jiffies - engine->hangcheck->emitted_jiffies) : -1);
 	drm_printf(m, "\tReset count: %d (global %d)\n",
 		   i915_reset_engine_count(error, engine),
 		   i915_reset_count(error));
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 7dc11fcb13de..665d56c2ff03 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -25,215 +25,25 @@
 #include "i915_drv.h"
 #include "i915_reset.h"
 
-static bool instdone_unchanged(u32 current_instdone, u32 *old_instdone)
-{
-	u32 tmp = current_instdone | *old_instdone;
-	bool unchanged;
-
-	unchanged = tmp == *old_instdone;
-	*old_instdone |= tmp;
-
-	return unchanged;
-}
-
-static bool subunits_stuck(struct intel_engine_cs *engine)
-{
-	struct drm_i915_private *dev_priv = engine->i915;
-	struct intel_instdone instdone;
-	struct intel_instdone *accu_instdone = &engine->hangcheck.instdone;
-	bool stuck;
-	int slice;
-	int subslice;
-
-	if (engine->id != RCS)
-		return true;
-
-	intel_engine_get_instdone(engine, &instdone);
-
-	/* There might be unstable subunit states even when
-	 * actual head is not moving. Filter out the unstable ones by
-	 * accumulating the undone -> done transitions and only
-	 * consider those as progress.
-	 */
-	stuck = instdone_unchanged(instdone.instdone,
-				   &accu_instdone->instdone);
-	stuck &= instdone_unchanged(instdone.slice_common,
-				    &accu_instdone->slice_common);
-
-	for_each_instdone_slice_subslice(dev_priv, slice, subslice) {
-		stuck &= instdone_unchanged(instdone.sampler[slice][subslice],
-					    &accu_instdone->sampler[slice][subslice]);
-		stuck &= instdone_unchanged(instdone.row[slice][subslice],
-					    &accu_instdone->row[slice][subslice]);
-	}
-
-	return stuck;
-}
-
-static enum intel_engine_hangcheck_action
-head_stuck(struct intel_engine_cs *engine, u64 acthd)
-{
-	if (acthd != engine->hangcheck.acthd) {
-
-		/* Clear subunit states on head movement */
-		memset(&engine->hangcheck.instdone, 0,
-		       sizeof(engine->hangcheck.instdone));
-
-		return ENGINE_ACTIVE_HEAD;
-	}
-
-	if (!subunits_stuck(engine))
-		return ENGINE_ACTIVE_SUBUNITS;
-
-	return ENGINE_DEAD;
-}
-
-static enum intel_engine_hangcheck_action
-engine_stuck(struct intel_engine_cs *engine, u64 acthd)
-{
-	struct drm_i915_private *dev_priv = engine->i915;
-	enum intel_engine_hangcheck_action ha;
-	u32 tmp;
-
-	ha = head_stuck(engine, acthd);
-	if (ha != ENGINE_DEAD)
-		return ha;
-
-	if (IS_GEN(dev_priv, 2))
-		return ENGINE_DEAD;
-
-	/* Is the chip hanging on a WAIT_FOR_EVENT?
-	 * If so we can simply poke the RB_WAIT bit
-	 * and break the hang. This should work on
-	 * all but the second generation chipsets.
-	 */
-	tmp = I915_READ_CTL(engine);
-	if (tmp & RING_WAIT) {
-		i915_handle_error(dev_priv, BIT(engine->id), 0,
-				  "stuck wait on %s", engine->name);
-		I915_WRITE_CTL(engine, tmp);
-		return ENGINE_WAIT_KICK;
-	}
-
-	return ENGINE_DEAD;
-}
-
-static void hangcheck_load_sample(struct intel_engine_cs *engine,
-				  struct intel_engine_hangcheck *hc)
-{
-	hc->acthd = intel_engine_get_active_head(engine);
-	hc->seqno = intel_engine_get_seqno(engine);
-}
-
-static void hangcheck_store_sample(struct intel_engine_cs *engine,
-				   const struct intel_engine_hangcheck *hc)
-{
-	engine->hangcheck.acthd = hc->acthd;
-	engine->hangcheck.seqno = hc->seqno;
-	engine->hangcheck.action = hc->action;
-	engine->hangcheck.stalled = hc->stalled;
-	engine->hangcheck.wedged = hc->wedged;
-}
-
-static enum intel_engine_hangcheck_action
-hangcheck_get_action(struct intel_engine_cs *engine,
-		     const struct intel_engine_hangcheck *hc)
-{
-	if (engine->hangcheck.seqno != hc->seqno)
-		return ENGINE_ACTIVE_SEQNO;
-
-	if (intel_engine_is_idle(engine))
-		return ENGINE_IDLE;
-
-	return engine_stuck(engine, hc->acthd);
-}
-
-static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
-					struct intel_engine_hangcheck *hc)
-{
-	unsigned long timeout = I915_ENGINE_DEAD_TIMEOUT;
-
-	hc->action = hangcheck_get_action(engine, hc);
-
-	/* We always increment the progress
-	 * if the engine is busy and still processing
-	 * the same request, so that no single request
-	 * can run indefinitely (such as a chain of
-	 * batches). The only time we do not increment
-	 * the hangcheck score on this ring, if this
-	 * engine is in a legitimate wait for another
-	 * engine. In that case the waiting engine is a
-	 * victim and we want to be sure we catch the
-	 * right culprit. Then every time we do kick
-	 * the ring, make it as a progress as the seqno
-	 * advancement might ensure and if not, it
-	 * will catch the hanging engine.
-	 */
-
-	switch (hc->action) {
-	case ENGINE_IDLE:
-	case ENGINE_ACTIVE_SEQNO:
-		/* Clear head and subunit states on seqno movement */
-		hc->acthd = 0;
-
-		memset(&engine->hangcheck.instdone, 0,
-		       sizeof(engine->hangcheck.instdone));
-
-		/* Intentional fall through */
-	case ENGINE_WAIT_KICK:
-	case ENGINE_WAIT:
-		engine->hangcheck.action_timestamp = jiffies;
-		break;
-
-	case ENGINE_ACTIVE_HEAD:
-	case ENGINE_ACTIVE_SUBUNITS:
-		/*
-		 * Seqno stuck with still active engine gets leeway,
-		 * in hopes that it is just a long shader.
-		 */
-		timeout = I915_SEQNO_DEAD_TIMEOUT;
-		break;
-
-	case ENGINE_DEAD:
-		if (GEM_SHOW_DEBUG()) {
-			struct drm_printer p = drm_debug_printer("hangcheck");
-			intel_engine_dump(engine, &p, "%s\n", engine->name);
-		}
-		break;
-
-	default:
-		MISSING_CASE(hc->action);
-	}
-
-	hc->stalled = time_after(jiffies,
-				 engine->hangcheck.action_timestamp + timeout);
-	hc->wedged = time_after(jiffies,
-				 engine->hangcheck.action_timestamp +
-				 I915_ENGINE_WEDGED_TIMEOUT);
-}
-
-static void hangcheck_declare_hang(struct drm_i915_private *i915,
-				   unsigned int hung,
-				   unsigned int stuck)
+static void
+hangcheck_declare_stall(struct drm_i915_private *i915, unsigned int stalled)
 {
 	struct intel_engine_cs *engine;
-	char msg[80];
 	unsigned int tmp;
+	char msg[80];
 	int len;
 
-	/* If some rings hung but others were still busy, only
+	/*
+	 * If some rings stalled but others were still busy, only
 	 * blame the hanging rings in the synopsis.
 	 */
-	if (stuck != hung)
-		hung &= ~stuck;
-	len = scnprintf(msg, sizeof(msg),
-			"%s on ", stuck == hung ? "no progress" : "hang");
-	for_each_engine_masked(engine, i915, hung, tmp)
+	len = scnprintf(msg, sizeof(msg), "stall on ");
+	for_each_engine_masked(engine, i915, stalled, tmp)
 		len += scnprintf(msg + len, sizeof(msg) - len,
 				 "%s, ", engine->name);
 	msg[len-2] = '\0';
 
-	return i915_handle_error(i915, hung, I915_ERROR_CAPTURE, "%s", msg);
+	return i915_handle_error(i915, stalled, I915_ERROR_CAPTURE, "%s", msg);
 }
 
 /*
@@ -246,64 +56,122 @@ static void hangcheck_declare_hang(struct drm_i915_private *i915,
  */
 static void i915_hangcheck_elapsed(struct work_struct *work)
 {
-	struct drm_i915_private *dev_priv =
-		container_of(work, typeof(*dev_priv),
+	struct drm_i915_private *i915 =
+		container_of(work, typeof(*i915),
 			     gpu_error.hangcheck_work.work);
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
-	unsigned int hung = 0, stuck = 0, wedged = 0;
+	unsigned int inject = 0, stalled = 0, wedged = 0;
+	struct i915_wedge_me w;
 
 	if (!i915_modparams.enable_hangcheck)
 		return;
 
-	if (!READ_ONCE(dev_priv->gt.awake))
+	if (!READ_ONCE(i915->gt.awake))
 		return;
 
-	if (i915_terminally_wedged(&dev_priv->gpu_error))
-		return;
+	for_each_engine(engine, i915, id) {
+		struct i915_request *rq;
 
-	/* As enabling the GPU requires fairly extensive mmio access,
-	 * periodically arm the mmio checker to see if we are triggering
-	 * any invalid access.
-	 */
-	intel_uncore_arm_unclaimed_mmio_detection(dev_priv);
+		/* Did our heartbeat stop? */
+		rq = engine->hangcheck;
+		if (rq) {
+			if (i915_request_completed(rq)) {
+				engine->hangcheck = NULL;
+				continue;
+			}
 
-	for_each_engine(engine, dev_priv, id) {
-		struct intel_engine_hangcheck hc;
+			if (time_after(jiffies,
+				       rq->emitted_jiffies + I915_ENGINE_STALLED_TIMEOUT))
+				stalled |= intel_engine_flag(engine);
 
-		hangcheck_load_sample(engine, &hc);
-		hangcheck_accumulate_sample(engine, &hc);
-		hangcheck_store_sample(engine, &hc);
+			if (time_after(jiffies,
+				       rq->emitted_jiffies + I915_ENGINE_WEDGED_TIMEOUT))
+				wedged |= intel_engine_flag(engine);
 
-		if (engine->hangcheck.stalled) {
-			hung |= intel_engine_flag(engine);
-			if (hc.action != ENGINE_DEAD)
-				stuck |= intel_engine_flag(engine);
+			continue;
 		}
 
-		if (engine->hangcheck.wedged)
-			wedged |= intel_engine_flag(engine);
+		/* No heartbeat yet, is the engine active? */
+		spin_lock_irq(&engine->timeline.lock);
+		list_for_each_entry(rq, &engine->timeline.requests, link) {
+			if (i915_request_completed(rq))
+				continue;
+
+			/*
+			 * Does the engine appear to be stalling?
+			 *
+			 * If a ready-to-run request picked at random from the
+			 * queue has been stuck for a while, queue our own
+			 * request to check the engine's pulse.
+			 */
+			if (time_after(jiffies,
+				       rq->emitted_jiffies + I915_ENGINE_STALLED_TIMEOUT))
+				inject |= intel_engine_flag(engine);
+
+			break;
+		}
+		spin_unlock_irq(&engine->timeline.lock);
 	}
 
 	if (wedged) {
-		dev_err(dev_priv->drm.dev,
+		dev_err(i915->drm.dev,
 			"GPU recovery timed out,"
 			" cancelling all in-flight rendering.\n");
 		GEM_TRACE_DUMP();
-		i915_gem_set_wedged(dev_priv);
+		i915_gem_set_wedged(i915);
 	}
 
-	if (hung)
-		hangcheck_declare_hang(dev_priv, hung, stuck);
+	if (i915_terminally_wedged(&i915->gpu_error))
+		return;
 
-	/* Reset timer in case GPU hangs without another request being added */
-	i915_queue_hangcheck(dev_priv);
-}
+	if (stalled) {
+		hangcheck_declare_stall(i915, stalled);
+		goto restart;
+	}
 
-void intel_engine_init_hangcheck(struct intel_engine_cs *engine)
-{
-	memset(&engine->hangcheck, 0, sizeof(engine->hangcheck));
-	engine->hangcheck.action_timestamp = jiffies;
+	if (!inject)
+		goto restart;
+
+	/*
+	 * As enabling the GPU requires fairly extensive mmio access,
+	 * periodically arm the mmio checker to see if we are triggering
+	 * any invalid access.
+	 */
+	intel_uncore_arm_unclaimed_mmio_detection(i915);
+
+	i915_wedge_on_timeout(&w, i915, I915_ENGINE_WEDGED_TIMEOUT) {
+		mutex_lock(&i915->drm.struct_mutex);
+		for_each_engine(engine, i915, id) {
+			struct i915_sched_attr attr = {
+				.priority = INT_MAX
+			};
+			struct i915_request *rq;
+
+			if (!(inject & intel_engine_flag(engine)))
+				continue;
+
+			GEM_BUG_ON(engine->hangcheck);
+
+			rq = i915_request_alloc(engine, i915->kernel_context);
+			if (IS_ERR(rq))
+				continue;
+
+			engine->hangcheck = i915_request_get(rq);
+			i915_request_add(rq);
+
+			/* Priority boost for preemption past blocking CL kernels */
+			rcu_read_lock();
+			if (engine->schedule)
+				engine->schedule(engine->hangcheck, &attr);
+			rcu_read_unlock();
+		}
+		mutex_unlock(&i915->drm.struct_mutex);
+	}
+
+restart:
+	/* Reset timer in case GPU hangs without another request being added */
+	i915_queue_hangcheck(i915);
 }
 
 void intel_hangcheck_init(struct drm_i915_private *i915)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index be650704bab0..cd18f6fff265 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -57,39 +57,6 @@ struct intel_hw_status_page {
 /* seqno size is actually only a uint32, but since we plan to use MI_FLUSH_DW to
  * do the writes, and that must have qw aligned offsets, simply pretend it's 8b.
  */
-enum intel_engine_hangcheck_action {
-	ENGINE_IDLE = 0,
-	ENGINE_WAIT,
-	ENGINE_ACTIVE_SEQNO,
-	ENGINE_ACTIVE_HEAD,
-	ENGINE_ACTIVE_SUBUNITS,
-	ENGINE_WAIT_KICK,
-	ENGINE_DEAD,
-};
-
-static inline const char *
-hangcheck_action_to_str(const enum intel_engine_hangcheck_action a)
-{
-	switch (a) {
-	case ENGINE_IDLE:
-		return "idle";
-	case ENGINE_WAIT:
-		return "wait";
-	case ENGINE_ACTIVE_SEQNO:
-		return "active seqno";
-	case ENGINE_ACTIVE_HEAD:
-		return "active head";
-	case ENGINE_ACTIVE_SUBUNITS:
-		return "active subunits";
-	case ENGINE_WAIT_KICK:
-		return "wait kick";
-	case ENGINE_DEAD:
-		return "dead";
-	}
-
-	return "unknown";
-}
-
 #define I915_MAX_SLICES	3
 #define I915_MAX_SUBSLICES 8
 
@@ -117,17 +84,6 @@ struct intel_instdone {
 	u32 row[I915_MAX_SLICES][I915_MAX_SUBSLICES];
 };
 
-struct intel_engine_hangcheck {
-	u64 acthd;
-	u32 seqno;
-	enum intel_engine_hangcheck_action action;
-	unsigned long action_timestamp;
-	int deadlock;
-	struct intel_instdone instdone;
-	bool stalled:1;
-	bool wedged:1;
-};
-
 struct intel_ring {
 	struct i915_vma *vma;
 	void *vaddr;
@@ -514,7 +470,7 @@ struct intel_engine_cs {
 	/* status_notifier: list of callbacks for context-switch changes */
 	struct atomic_notifier_head context_status_notifier;
 
-	struct intel_engine_hangcheck hangcheck;
+	struct i915_request *hangcheck;
 
 #define I915_ENGINE_NEEDS_CMD_PARSER BIT(0)
 #define I915_ENGINE_SUPPORTS_STATS   BIT(1)
-- 
2.20.1



More information about the Intel-gfx-trybot mailing list