[PATCH 36/45] drm/i915: Replace hangcheck seqno with heartbeat requests

Chris Wilson chris at chris-wilson.co.uk
Sat Dec 29 16:45:07 UTC 2018


Previously we were very strict on not allowing ourselves to use any
mutex from within hangcheck, especially not struct_mutex. The reason is
very simple, as struct_mutex is a BKL, any GPU hang was likely to occur
while the driver is waiting inside struct_mutex. However, userspace now
almost entirely waits before taking struct_mutex dramatically reducing
the risk of hangcheck not being able to take struct_mutex for itself to
allocate a request.

In the future, this risk should be reduced even further with the
elimination of struct_mutex and its replacement by fine grained client
locking, but for now we use a big hammer to declare the driver wedged if
hangcheck finds itself stuck.

The major benefit from using a heartbeat request is that it acts as a
regular client of the GPU and as such can measure pipeline stalls by its
preemption latency. This allows us to discern long running blocking
kernels, e.g. some OpenCL persistent tasks, that are still preemptible
(and so still allow the system and other users to continue to function)
from those are truly wedged.

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c           |  80 +----
 drivers/gpu/drm/i915/i915_drv.h               |   5 +-
 drivers/gpu/drm/i915/i915_gpu_error.c         | 120 ++-----
 drivers/gpu/drm/i915/i915_gpu_error.h         |   9 +-
 drivers/gpu/drm/i915/intel_engine_cs.c        |   9 +-
 drivers/gpu/drm/i915/intel_hangcheck.c        | 336 ++++++------------
 drivers/gpu/drm/i915/intel_ringbuffer.h       |  46 +--
 .../gpu/drm/i915/selftests/intel_hangcheck.c  |   1 +
 8 files changed, 155 insertions(+), 451 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 8059f6dd3886..a9f57c77cc05 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1242,42 +1242,10 @@ static int i915_frequency_info(struct seq_file *m, void *unused)
 	return ret;
 }
 
-static void i915_instdone_info(struct drm_i915_private *dev_priv,
-			       struct seq_file *m,
-			       struct intel_instdone *instdone)
-{
-	int slice;
-	int subslice;
-
-	seq_printf(m, "\t\tINSTDONE: 0x%08x\n",
-		   instdone->instdone);
-
-	if (INTEL_GEN(dev_priv) <= 3)
-		return;
-
-	seq_printf(m, "\t\tSC_INSTDONE: 0x%08x\n",
-		   instdone->slice_common);
-
-	if (INTEL_GEN(dev_priv) <= 6)
-		return;
-
-	for_each_instdone_slice_subslice(dev_priv, slice, subslice)
-		seq_printf(m, "\t\tSAMPLER_INSTDONE[%d][%d]: 0x%08x\n",
-			   slice, subslice, instdone->sampler[slice][subslice]);
-
-	for_each_instdone_slice_subslice(dev_priv, slice, subslice)
-		seq_printf(m, "\t\tROW_INSTDONE[%d][%d]: 0x%08x\n",
-			   slice, subslice, instdone->row[slice][subslice]);
-}
-
 static int i915_hangcheck_info(struct seq_file *m, void *unused)
 {
 	struct drm_i915_private *dev_priv = node_to_i915(m->private);
 	struct intel_engine_cs *engine;
-	u64 acthd[I915_NUM_ENGINES];
-	u32 seqno[I915_NUM_ENGINES];
-	struct intel_instdone instdone;
-	intel_wakeref_t wakeref;
 	enum intel_engine_id id;
 
 	if (test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
@@ -1294,15 +1262,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 		return 0;
 	}
 
-	with_intel_runtime_pm(dev_priv, wakeref) {
-		for_each_engine(engine, dev_priv, id) {
-			acthd[id] = intel_engine_get_active_head(engine);
-			seqno[id] = intel_engine_get_seqno(engine);
-		}
-
-		intel_engine_get_instdone(dev_priv->engine[RCS], &instdone);
-	}
-
 	if (timer_pending(&dev_priv->gpu_error.hangcheck_work.timer))
 		seq_printf(m, "Hangcheck active, timer fires in %dms\n",
 			   jiffies_to_msecs(dev_priv->gpu_error.hangcheck_work.timer.expires -
@@ -1319,15 +1278,13 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 		struct rb_node *rb;
 
 		seq_printf(m, "%s:\n", engine->name);
-		seq_printf(m, "\tseqno = %x [current %x, last %x]\n",
-			   engine->hangcheck.seqno, seqno[id],
-			   intel_engine_last_submit(engine));
-		seq_printf(m, "\twaiters? %s, fake irq active? %s, stalled? %s, wedged? %s\n",
+		seq_printf(m, "\tseqno = %llx, emitted %dms ago\n",
+			   engine->hangcheck ? engine->hangcheck->fence.seqno : 0,
+			   engine->hangcheck ? jiffies_to_msecs(jiffies - engine->hangcheck->emitted_jiffies) : -1);
+		seq_printf(m, "\twaiters? %s, fake irq active? %s\n",
 			   yesno(intel_engine_has_waiter(engine)),
 			   yesno(test_bit(engine->id,
-					  &dev_priv->gpu_error.missed_irq_rings)),
-			   yesno(engine->hangcheck.stalled),
-			   yesno(engine->hangcheck.wedged));
+					  &dev_priv->gpu_error.missed_irq_rings)));
 
 		spin_lock_irq(&b->rb_lock);
 		for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
@@ -1337,26 +1294,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 				   w->tsk->comm, w->tsk->pid, w->seqno);
 		}
 		spin_unlock_irq(&b->rb_lock);
-
-		seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
-			   (long long)engine->hangcheck.acthd,
-			   (long long)acthd[id]);
-		seq_printf(m, "\taction = %s(%d) %d ms ago\n",
-			   hangcheck_action_to_str(engine->hangcheck.action),
-			   engine->hangcheck.action,
-			   jiffies_to_msecs(jiffies -
-					    engine->hangcheck.action_timestamp));
-
-		if (engine->id == RCS) {
-			seq_puts(m, "\tinstdone read =\n");
-
-			i915_instdone_info(dev_priv, m, &instdone);
-
-			seq_puts(m, "\tinstdone accu =\n");
-
-			i915_instdone_info(dev_priv, m,
-					   &engine->hangcheck.instdone);
-		}
 	}
 
 	return 0;
@@ -3891,8 +3828,6 @@ static int
 i915_wedged_set(void *data, u64 val)
 {
 	struct drm_i915_private *i915 = data;
-	struct intel_engine_cs *engine;
-	unsigned int tmp;
 
 	/*
 	 * There is no safeguard against this debugfs entry colliding
@@ -3905,11 +3840,6 @@ i915_wedged_set(void *data, u64 val)
 	if (i915_reset_backoff(&i915->gpu_error))
 		return -EAGAIN;
 
-	for_each_engine_masked(engine, i915, val, tmp) {
-		engine->hangcheck.seqno = intel_engine_get_seqno(engine);
-		engine->hangcheck.stalled = true;
-	}
-
 	i915_handle_error(i915, val, I915_ERROR_CAPTURE,
 			  "Manually set wedged engine mask = %llx", val);
 	return 0;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index cad286d65ba8..b53173a2d2bd 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -921,9 +921,7 @@ struct i915_gem_mm {
 #define I915_RESET_TIMEOUT (10 * HZ) /* 10s */
 #define I915_FENCE_TIMEOUT (10 * HZ) /* 10s */
 
-#define I915_ENGINE_DEAD_TIMEOUT  (4 * HZ)  /* Seqno, head and subunits dead */
-#define I915_SEQNO_DEAD_TIMEOUT   (12 * HZ) /* Seqno dead with active head */
-
+#define I915_ENGINE_STALLED_TIMEOUT   (12 * HZ) /* No progress? */
 #define I915_ENGINE_WEDGED_TIMEOUT  (60 * HZ)  /* Reset but no recovery? */
 
 struct ddi_vbt_port_info {
@@ -2604,7 +2602,6 @@ extern int i915_driver_load(struct pci_dev *pdev,
 			    const struct pci_device_id *ent);
 extern void i915_driver_unload(struct drm_device *dev);
 
-extern void intel_engine_init_hangcheck(struct intel_engine_cs *engine);
 extern void intel_hangcheck_init(struct drm_i915_private *dev_priv);
 extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv);
 extern unsigned long i915_mch_val(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index ded2448b8ebb..bb84e5871cc9 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -533,13 +533,6 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
 	err_printf(m, "  waiting: %s\n", yesno(ee->waiting));
 	err_printf(m, "  ring->head: 0x%08x\n", ee->cpu_ring_head);
 	err_printf(m, "  ring->tail: 0x%08x\n", ee->cpu_ring_tail);
-	err_printf(m, "  hangcheck stall: %s\n", yesno(ee->hangcheck_stalled));
-	err_printf(m, "  hangcheck action: %s\n",
-		   hangcheck_action_to_str(ee->hangcheck_action));
-	err_printf(m, "  hangcheck action timestamp: %dms (%lu%s)\n",
-		   jiffies_to_msecs(ee->hangcheck_timestamp - epoch),
-		   ee->hangcheck_timestamp,
-		   ee->hangcheck_timestamp == epoch ? "; epoch" : "");
 	err_printf(m, "  engine reset count: %u\n", ee->reset_count);
 
 	for (n = 0; n < ee->num_ports; n++) {
@@ -680,17 +673,6 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
 		   jiffies_to_msecs(jiffies - error->capture),
 		   jiffies_to_msecs(error->capture - error->epoch));
 
-	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-		if (error->engine[i].hangcheck_stalled &&
-		    error->engine[i].context.pid) {
-			err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
-				   engine_name(m->i915, i),
-				   error->engine[i].context.comm,
-				   error->engine[i].context.pid,
-				   error->engine[i].context.ban_score,
-				   bannable(&error->engine[i].context));
-		}
-	}
 	err_printf(m, "Reset count: %u\n", error->reset_count);
 	err_printf(m, "Suspend count: %u\n", error->suspend_count);
 	err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform));
@@ -1150,40 +1132,6 @@ static u32 capture_error_bo(struct drm_i915_error_buffer *err,
 	return i;
 }
 
-/* Generate a semi-unique error code. The code is not meant to have meaning, The
- * code's only purpose is to try to prevent false duplicated bug reports by
- * grossly estimating a GPU error state.
- *
- * TODO Ideally, hashing the batchbuffer would be a very nice way to determine
- * the hang if we could strip the GTT offset information from it.
- *
- * It's only a small step better than a random number in its current form.
- */
-static uint32_t i915_error_generate_code(struct drm_i915_private *dev_priv,
-					 struct i915_gpu_state *error,
-					 int *engine_id)
-{
-	uint32_t error_code = 0;
-	int i;
-
-	/* IPEHR would be an ideal way to detect errors, as it's the gross
-	 * measure of "the command that hung." However, has some very common
-	 * synchronization commands which almost always appear in the case
-	 * strictly a client bug. Use instdone to differentiate those some.
-	 */
-	for (i = 0; i < I915_NUM_ENGINES; i++) {
-		if (error->engine[i].hangcheck_stalled) {
-			if (engine_id)
-				*engine_id = i;
-
-			return error->engine[i].ipehr ^
-			       error->engine[i].instdone.instdone;
-		}
-	}
-
-	return error_code;
-}
-
 static void gem_record_fences(struct i915_gpu_state *error)
 {
 	struct drm_i915_private *dev_priv = error->i915;
@@ -1344,9 +1292,6 @@ static void error_record_engine_registers(struct i915_gpu_state *error,
 	}
 
 	ee->idle = intel_engine_is_idle(engine);
-	ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
-	ee->hangcheck_action = engine->hangcheck.action;
-	ee->hangcheck_stalled = engine->hangcheck.stalled;
 	ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error,
 						  engine);
 
@@ -1784,31 +1729,35 @@ static void capture_reg_state(struct i915_gpu_state *error)
 	error->pgtbl_er = I915_READ(PGTBL_ER);
 }
 
-static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
-				   struct i915_gpu_state *error,
-				   u32 engine_mask,
-				   const char *error_msg)
+static const char *
+error_msg(struct i915_gpu_state *error,
+	  unsigned long engine_mask,
+	  const char *msg)
 {
-	u32 ecode;
-	int engine_id = -1, len;
+	int id, len;
 
-	ecode = i915_error_generate_code(dev_priv, error, &engine_id);
+	len = scnprintf(error->error_msg, sizeof(error->error_msg), "GPU hang");
 
-	len = scnprintf(error->error_msg, sizeof(error->error_msg),
-			"GPU HANG: ecode %d:%d:0x%08x",
-			INTEL_GEN(dev_priv), engine_id, ecode);
+	for_each_set_bit(id, &engine_mask, BITS_PER_LONG) {
+		if (!error->engine[id].context.pid)
+			continue;
 
-	if (engine_id != -1 && error->engine[engine_id].context.pid)
 		len += scnprintf(error->error_msg + len,
 				 sizeof(error->error_msg) - len,
-				 ", in %s [%d]",
-				 error->engine[engine_id].context.comm,
-				 error->engine[engine_id].context.pid);
+				 " in %s [%d]",
+				 error->engine[id].context.comm,
+				 error->engine[id].context.pid);
+		break;
+	}
 
-	scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
-		  ", reason: %s, action: %s",
-		  error_msg,
-		  engine_mask ? "reset" : "continue");
+	if (msg) {
+		scnprintf(error->error_msg + len,
+			  sizeof(error->error_msg) - len,
+			  ", %s",
+			  msg);
+	}
+
+	return error->error_msg;
 }
 
 static void capture_gen_state(struct i915_gpu_state *error)
@@ -1848,15 +1797,17 @@ static void capture_params(struct i915_gpu_state *error)
 
 static unsigned long capture_find_epoch(const struct i915_gpu_state *error)
 {
-	unsigned long epoch = error->capture;
-	int i;
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	unsigned long epoch;
 
-	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-		const struct drm_i915_error_engine *ee = &error->engine[i];
+	epoch = error->capture;
+	for_each_engine(engine, error->i915, id) {
+		if (!engine->hangcheck)
+			continue;
 
-		if (ee->hangcheck_stalled &&
-		    time_before(ee->hangcheck_timestamp, epoch))
-			epoch = ee->hangcheck_timestamp;
+		if (time_before(engine->hangcheck->emitted_jiffies, epoch))
+			epoch = engine->hangcheck->emitted_jiffies;
 	}
 
 	return epoch;
@@ -1928,7 +1879,7 @@ i915_capture_gpu_state(struct drm_i915_private *i915)
  * i915_capture_error_state - capture an error record for later analysis
  * @i915: i915 device
  * @engine_mask: the mask of engines triggering the hang
- * @error_msg: a message to insert into the error capture header
+ * @msg: supplementary message to attach to the error state
  *
  * Should be called when an error is detected (either a hang or an error
  * interrupt) to capture error state from the time of the error.  Fills
@@ -1936,8 +1887,8 @@ i915_capture_gpu_state(struct drm_i915_private *i915)
  * to pick up.
  */
 void i915_capture_error_state(struct drm_i915_private *i915,
-			      u32 engine_mask,
-			      const char *error_msg)
+			      unsigned long engine_mask,
+			      const char *msg)
 {
 	static bool warned;
 	struct i915_gpu_state *error;
@@ -1953,8 +1904,7 @@ void i915_capture_error_state(struct drm_i915_private *i915,
 	if (IS_ERR(error))
 		return;
 
-	i915_error_capture_msg(i915, error, engine_mask, error_msg);
-	DRM_INFO("%s\n", error->error_msg);
+	dev_info(i915->drm.dev, "%s\n", error_msg(error, engine_mask, msg));
 
 	if (!error->simulated) {
 		spin_lock_irqsave(&i915->gpu_error.lock, flags);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
index 5bfee972b8f1..a17c369d4890 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -83,9 +83,6 @@ struct i915_gpu_state {
 		bool idle;
 		bool waiting;
 		int num_waiters;
-		unsigned long hangcheck_timestamp;
-		bool hangcheck_stalled;
-		enum intel_engine_hangcheck_action hangcheck_action;
 		struct i915_address_space *vm;
 		int num_requests;
 		u32 reset_count;
@@ -306,9 +303,9 @@ __printf(2, 3)
 void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
 
 struct i915_gpu_state *i915_capture_gpu_state(struct drm_i915_private *i915);
-void i915_capture_error_state(struct drm_i915_private *dev_priv,
-			      u32 engine_mask,
-			      const char *error_msg);
+void i915_capture_error_state(struct drm_i915_private *i915,
+			      unsigned long engine_mask,
+			      const char *msg);
 
 static inline struct i915_gpu_state *
 i915_gpu_state_get(struct i915_gpu_state *gpu)
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 7b80a087cc32..1be6895ae005 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -499,7 +499,6 @@ void intel_engine_setup_common(struct intel_engine_cs *engine)
 	i915_timeline_set_subclass(&engine->timeline, TIMELINE_ENGINE);
 
 	intel_engine_init_execlist(engine);
-	intel_engine_init_hangcheck(engine);
 	intel_engine_init_batch_pool(engine);
 	intel_engine_init_cmd_parser(engine);
 }
@@ -1120,6 +1119,8 @@ void intel_engines_park(struct drm_i915_private *i915)
 
 		i915_gem_batch_pool_fini(&engine->batch_pool);
 		engine->execlists.no_priolist = false;
+
+		i915_request_put(fetch_and_zero(&engine->hangcheck));
 	}
 }
 
@@ -1147,8 +1148,6 @@ void intel_engines_unpark(struct drm_i915_private *i915)
 
 		if (engine->unpark)
 			engine->unpark(engine);
-
-		intel_engine_init_hangcheck(engine);
 	}
 }
 
@@ -1439,8 +1438,8 @@ void intel_engine_dump(struct intel_engine_cs *engine,
 	drm_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x [%d ms]\n",
 		   intel_engine_get_seqno(engine),
 		   intel_engine_last_submit(engine),
-		   engine->hangcheck.seqno,
-		   jiffies_to_msecs(jiffies - engine->hangcheck.action_timestamp));
+		   engine->hangcheck ? lower_32_bits(i915_request_global_seqno(engine->hangcheck)) : 0,
+		   engine->hangcheck ? jiffies_to_msecs(jiffies - engine->hangcheck->emitted_jiffies) : -1);
 	drm_printf(m, "\tReset count: %d (global %d)\n",
 		   i915_reset_engine_count(error, engine),
 		   i915_reset_count(error));
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 7dc11fcb13de..b37cb4c3308a 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -25,215 +25,25 @@
 #include "i915_drv.h"
 #include "i915_reset.h"
 
-static bool instdone_unchanged(u32 current_instdone, u32 *old_instdone)
-{
-	u32 tmp = current_instdone | *old_instdone;
-	bool unchanged;
-
-	unchanged = tmp == *old_instdone;
-	*old_instdone |= tmp;
-
-	return unchanged;
-}
-
-static bool subunits_stuck(struct intel_engine_cs *engine)
-{
-	struct drm_i915_private *dev_priv = engine->i915;
-	struct intel_instdone instdone;
-	struct intel_instdone *accu_instdone = &engine->hangcheck.instdone;
-	bool stuck;
-	int slice;
-	int subslice;
-
-	if (engine->id != RCS)
-		return true;
-
-	intel_engine_get_instdone(engine, &instdone);
-
-	/* There might be unstable subunit states even when
-	 * actual head is not moving. Filter out the unstable ones by
-	 * accumulating the undone -> done transitions and only
-	 * consider those as progress.
-	 */
-	stuck = instdone_unchanged(instdone.instdone,
-				   &accu_instdone->instdone);
-	stuck &= instdone_unchanged(instdone.slice_common,
-				    &accu_instdone->slice_common);
-
-	for_each_instdone_slice_subslice(dev_priv, slice, subslice) {
-		stuck &= instdone_unchanged(instdone.sampler[slice][subslice],
-					    &accu_instdone->sampler[slice][subslice]);
-		stuck &= instdone_unchanged(instdone.row[slice][subslice],
-					    &accu_instdone->row[slice][subslice]);
-	}
-
-	return stuck;
-}
-
-static enum intel_engine_hangcheck_action
-head_stuck(struct intel_engine_cs *engine, u64 acthd)
-{
-	if (acthd != engine->hangcheck.acthd) {
-
-		/* Clear subunit states on head movement */
-		memset(&engine->hangcheck.instdone, 0,
-		       sizeof(engine->hangcheck.instdone));
-
-		return ENGINE_ACTIVE_HEAD;
-	}
-
-	if (!subunits_stuck(engine))
-		return ENGINE_ACTIVE_SUBUNITS;
-
-	return ENGINE_DEAD;
-}
-
-static enum intel_engine_hangcheck_action
-engine_stuck(struct intel_engine_cs *engine, u64 acthd)
-{
-	struct drm_i915_private *dev_priv = engine->i915;
-	enum intel_engine_hangcheck_action ha;
-	u32 tmp;
-
-	ha = head_stuck(engine, acthd);
-	if (ha != ENGINE_DEAD)
-		return ha;
-
-	if (IS_GEN(dev_priv, 2))
-		return ENGINE_DEAD;
-
-	/* Is the chip hanging on a WAIT_FOR_EVENT?
-	 * If so we can simply poke the RB_WAIT bit
-	 * and break the hang. This should work on
-	 * all but the second generation chipsets.
-	 */
-	tmp = I915_READ_CTL(engine);
-	if (tmp & RING_WAIT) {
-		i915_handle_error(dev_priv, BIT(engine->id), 0,
-				  "stuck wait on %s", engine->name);
-		I915_WRITE_CTL(engine, tmp);
-		return ENGINE_WAIT_KICK;
-	}
-
-	return ENGINE_DEAD;
-}
-
-static void hangcheck_load_sample(struct intel_engine_cs *engine,
-				  struct intel_engine_hangcheck *hc)
-{
-	hc->acthd = intel_engine_get_active_head(engine);
-	hc->seqno = intel_engine_get_seqno(engine);
-}
-
-static void hangcheck_store_sample(struct intel_engine_cs *engine,
-				   const struct intel_engine_hangcheck *hc)
-{
-	engine->hangcheck.acthd = hc->acthd;
-	engine->hangcheck.seqno = hc->seqno;
-	engine->hangcheck.action = hc->action;
-	engine->hangcheck.stalled = hc->stalled;
-	engine->hangcheck.wedged = hc->wedged;
-}
-
-static enum intel_engine_hangcheck_action
-hangcheck_get_action(struct intel_engine_cs *engine,
-		     const struct intel_engine_hangcheck *hc)
-{
-	if (engine->hangcheck.seqno != hc->seqno)
-		return ENGINE_ACTIVE_SEQNO;
-
-	if (intel_engine_is_idle(engine))
-		return ENGINE_IDLE;
-
-	return engine_stuck(engine, hc->acthd);
-}
-
-static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
-					struct intel_engine_hangcheck *hc)
-{
-	unsigned long timeout = I915_ENGINE_DEAD_TIMEOUT;
-
-	hc->action = hangcheck_get_action(engine, hc);
-
-	/* We always increment the progress
-	 * if the engine is busy and still processing
-	 * the same request, so that no single request
-	 * can run indefinitely (such as a chain of
-	 * batches). The only time we do not increment
-	 * the hangcheck score on this ring, if this
-	 * engine is in a legitimate wait for another
-	 * engine. In that case the waiting engine is a
-	 * victim and we want to be sure we catch the
-	 * right culprit. Then every time we do kick
-	 * the ring, make it as a progress as the seqno
-	 * advancement might ensure and if not, it
-	 * will catch the hanging engine.
-	 */
-
-	switch (hc->action) {
-	case ENGINE_IDLE:
-	case ENGINE_ACTIVE_SEQNO:
-		/* Clear head and subunit states on seqno movement */
-		hc->acthd = 0;
-
-		memset(&engine->hangcheck.instdone, 0,
-		       sizeof(engine->hangcheck.instdone));
-
-		/* Intentional fall through */
-	case ENGINE_WAIT_KICK:
-	case ENGINE_WAIT:
-		engine->hangcheck.action_timestamp = jiffies;
-		break;
-
-	case ENGINE_ACTIVE_HEAD:
-	case ENGINE_ACTIVE_SUBUNITS:
-		/*
-		 * Seqno stuck with still active engine gets leeway,
-		 * in hopes that it is just a long shader.
-		 */
-		timeout = I915_SEQNO_DEAD_TIMEOUT;
-		break;
-
-	case ENGINE_DEAD:
-		if (GEM_SHOW_DEBUG()) {
-			struct drm_printer p = drm_debug_printer("hangcheck");
-			intel_engine_dump(engine, &p, "%s\n", engine->name);
-		}
-		break;
-
-	default:
-		MISSING_CASE(hc->action);
-	}
-
-	hc->stalled = time_after(jiffies,
-				 engine->hangcheck.action_timestamp + timeout);
-	hc->wedged = time_after(jiffies,
-				 engine->hangcheck.action_timestamp +
-				 I915_ENGINE_WEDGED_TIMEOUT);
-}
-
-static void hangcheck_declare_hang(struct drm_i915_private *i915,
-				   unsigned int hung,
-				   unsigned int stuck)
+static void
+hangcheck_declare_stall(struct drm_i915_private *i915, unsigned int stalled)
 {
 	struct intel_engine_cs *engine;
-	char msg[80];
 	unsigned int tmp;
+	char msg[80];
 	int len;
 
-	/* If some rings hung but others were still busy, only
+	/*
+	 * If some rings stalled but others were still busy, only
 	 * blame the hanging rings in the synopsis.
 	 */
-	if (stuck != hung)
-		hung &= ~stuck;
-	len = scnprintf(msg, sizeof(msg),
-			"%s on ", stuck == hung ? "no progress" : "hang");
-	for_each_engine_masked(engine, i915, hung, tmp)
+	len = scnprintf(msg, sizeof(msg), "stall on ");
+	for_each_engine_masked(engine, i915, stalled, tmp)
 		len += scnprintf(msg + len, sizeof(msg) - len,
 				 "%s, ", engine->name);
 	msg[len-2] = '\0';
 
-	return i915_handle_error(i915, hung, I915_ERROR_CAPTURE, "%s", msg);
+	return i915_handle_error(i915, stalled, I915_ERROR_CAPTURE, "%s", msg);
 }
 
 /*
@@ -246,64 +56,128 @@ static void hangcheck_declare_hang(struct drm_i915_private *i915,
  */
 static void i915_hangcheck_elapsed(struct work_struct *work)
 {
-	struct drm_i915_private *dev_priv =
-		container_of(work, typeof(*dev_priv),
+	struct drm_i915_private *i915 =
+		container_of(work, typeof(*i915),
 			     gpu_error.hangcheck_work.work);
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
-	unsigned int hung = 0, stuck = 0, wedged = 0;
+	unsigned int inject = 0, stalled = 0, wedged = 0;
+	struct i915_wedge_me w;
 
 	if (!i915_modparams.enable_hangcheck)
 		return;
 
-	if (!READ_ONCE(dev_priv->gt.awake))
+	if (!READ_ONCE(i915->gt.awake))
 		return;
 
-	if (i915_terminally_wedged(&dev_priv->gpu_error))
-		return;
+	for_each_engine(engine, i915, id) {
+		struct i915_request *rq;
 
-	/* As enabling the GPU requires fairly extensive mmio access,
-	 * periodically arm the mmio checker to see if we are triggering
-	 * any invalid access.
-	 */
-	intel_uncore_arm_unclaimed_mmio_detection(dev_priv);
+		/* Did our heartbeat stop? */
+		rq = engine->hangcheck;
+		if (rq) {
+			if (i915_request_completed(rq)) {
+				engine->hangcheck = NULL;
+				continue;
+			}
 
-	for_each_engine(engine, dev_priv, id) {
-		struct intel_engine_hangcheck hc;
+			if (time_after(jiffies,
+				       rq->emitted_jiffies + I915_ENGINE_STALLED_TIMEOUT))
+				stalled |= intel_engine_flag(engine);
 
-		hangcheck_load_sample(engine, &hc);
-		hangcheck_accumulate_sample(engine, &hc);
-		hangcheck_store_sample(engine, &hc);
+			if (time_after(jiffies,
+				       rq->emitted_jiffies + I915_ENGINE_WEDGED_TIMEOUT))
+				wedged |= intel_engine_flag(engine);
 
-		if (engine->hangcheck.stalled) {
-			hung |= intel_engine_flag(engine);
-			if (hc.action != ENGINE_DEAD)
-				stuck |= intel_engine_flag(engine);
+			continue;
 		}
 
-		if (engine->hangcheck.wedged)
-			wedged |= intel_engine_flag(engine);
+		/* No heartbeat yet, is the engine active? */
+		spin_lock_irq(&engine->timeline.lock);
+		list_for_each_entry(rq, &engine->timeline.requests, link) {
+			if (i915_request_completed(rq))
+				continue;
+
+			/*
+			 * Does the engine appear to be stalling?
+			 *
+			 * If a ready-to-run request picked at random from the
+			 * queue has been stuck for a while, queue our own
+			 * request to check the engine's pulse.
+			 */
+			if (time_after(jiffies,
+				       rq->emitted_jiffies + I915_ENGINE_STALLED_TIMEOUT))
+				inject |= intel_engine_flag(engine);
+
+			break;
+		}
+		spin_unlock_irq(&engine->timeline.lock);
 	}
 
 	if (wedged) {
-		dev_err(dev_priv->drm.dev,
+		dev_err(i915->drm.dev,
 			"GPU recovery timed out,"
 			" cancelling all in-flight rendering.\n");
 		GEM_TRACE_DUMP();
-		i915_gem_set_wedged(dev_priv);
+		i915_gem_set_wedged(i915);
 	}
 
-	if (hung)
-		hangcheck_declare_hang(dev_priv, hung, stuck);
+	if (i915_terminally_wedged(&i915->gpu_error))
+		return;
 
-	/* Reset timer in case GPU hangs without another request being added */
-	i915_queue_hangcheck(dev_priv);
-}
+	if (stalled) {
+		hangcheck_declare_stall(i915, stalled);
+		goto restart;
+	}
 
-void intel_engine_init_hangcheck(struct intel_engine_cs *engine)
-{
-	memset(&engine->hangcheck, 0, sizeof(engine->hangcheck));
-	engine->hangcheck.action_timestamp = jiffies;
+	if (!inject)
+		goto restart;
+
+	/*
+	 * As enabling the GPU requires fairly extensive mmio access,
+	 * periodically arm the mmio checker to see if we are triggering
+	 * any invalid access.
+	 */
+	intel_uncore_arm_unclaimed_mmio_detection(i915);
+
+	i915_wedge_on_timeout(&w, i915, I915_ENGINE_WEDGED_TIMEOUT) {
+		mutex_lock(&i915->drm.struct_mutex);
+
+		i915_retire_requests(i915);
+		if (!i915->gt.active_requests)
+			inject = 0;
+
+		for_each_engine(engine, i915, id) {
+			struct i915_sched_attr attr = {
+				.priority = INT_MAX
+			};
+			struct i915_request *rq;
+
+			if (!(inject & intel_engine_flag(engine)))
+				continue;
+
+			GEM_BUG_ON(engine->hangcheck);
+
+			rq = i915_request_alloc(engine, i915->kernel_context);
+			if (IS_ERR(rq))
+				continue;
+
+			engine->hangcheck = i915_request_get(rq);
+			i915_request_add(rq);
+
+			/* Priority boost for preemption past CL kernels */
+			rcu_read_lock();
+			if (engine->schedule)
+				engine->schedule(engine->hangcheck, &attr);
+			rcu_read_unlock();
+		}
+
+		mutex_unlock(&i915->drm.struct_mutex);
+	}
+
+restart:
+	/* Reset timer in case GPU hangs without another request being added */
+	i915_queue_hangcheck(i915);
 }
 
 void intel_hangcheck_init(struct drm_i915_private *i915)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 452c13f3dded..a159c7366a08 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -57,39 +57,6 @@ struct intel_hw_status_page {
 /* seqno size is actually only a uint32, but since we plan to use MI_FLUSH_DW to
  * do the writes, and that must have qw aligned offsets, simply pretend it's 8b.
  */
-enum intel_engine_hangcheck_action {
-	ENGINE_IDLE = 0,
-	ENGINE_WAIT,
-	ENGINE_ACTIVE_SEQNO,
-	ENGINE_ACTIVE_HEAD,
-	ENGINE_ACTIVE_SUBUNITS,
-	ENGINE_WAIT_KICK,
-	ENGINE_DEAD,
-};
-
-static inline const char *
-hangcheck_action_to_str(const enum intel_engine_hangcheck_action a)
-{
-	switch (a) {
-	case ENGINE_IDLE:
-		return "idle";
-	case ENGINE_WAIT:
-		return "wait";
-	case ENGINE_ACTIVE_SEQNO:
-		return "active seqno";
-	case ENGINE_ACTIVE_HEAD:
-		return "active head";
-	case ENGINE_ACTIVE_SUBUNITS:
-		return "active subunits";
-	case ENGINE_WAIT_KICK:
-		return "wait kick";
-	case ENGINE_DEAD:
-		return "dead";
-	}
-
-	return "unknown";
-}
-
 #define I915_MAX_SLICES	3
 #define I915_MAX_SUBSLICES 8
 
@@ -117,17 +84,6 @@ struct intel_instdone {
 	u32 row[I915_MAX_SLICES][I915_MAX_SUBSLICES];
 };
 
-struct intel_engine_hangcheck {
-	u64 acthd;
-	u32 seqno;
-	enum intel_engine_hangcheck_action action;
-	unsigned long action_timestamp;
-	int deadlock;
-	struct intel_instdone instdone;
-	bool stalled:1;
-	bool wedged:1;
-};
-
 struct intel_ring {
 	struct i915_vma *vma;
 	void *vaddr;
@@ -513,7 +469,7 @@ struct intel_engine_cs {
 	/* status_notifier: list of callbacks for context-switch changes */
 	struct atomic_notifier_head context_status_notifier;
 
-	struct intel_engine_hangcheck hangcheck;
+	struct i915_request *hangcheck;
 
 #define I915_ENGINE_NEEDS_CMD_PARSER BIT(0)
 #define I915_ENGINE_SUPPORTS_STATS   BIT(1)
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 4809874ab28c..920fa4588e28 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -1665,6 +1665,7 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
 
 	wakeref = intel_runtime_pm_get(i915);
 	saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);
+	cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
 
 	err = i915_subtests(tests, i915);
 
-- 
2.20.1



More information about the Intel-gfx-trybot mailing list