[Intel-gfx] [RFC 09/11] drm/i915: Fake lost context interrupts through forced CSB check.

Tomas Elf tomas.elf at intel.com
Mon Jun 8 10:03:27 PDT 2015


A recurring issue during long-duration operations testing of concurrent
rendering tasks with intermittent hangs is that context completion interrupts
following engine resets are sometimes lost. This becomes a real problem since
the hardware might have completed a previously hung context following a
per-engine hang recovery and then gone idle somehow without sending an
interrupt telling the driver about this. At this point the driver would be
stuck waiting for context completion, thinking that the context is still active,
even though the hardware would be idle and waiting for more work.

The way this is solved is by periodically checking for context submission
status inconsistencies. What this means is that the ID of the currently running
context on a given engine is compared against the context ID in the
EXECLIST_STATUS register of the respective engine. If the two do not match and
if the state does not change over time it is assumed that an interrupt was
missed and that the driver is now stuck in an inconsistent state.

Following the decision that the driver and the hardware are irreversibly stuck
in an inconsistent state on a certain engine, the presumably lost interrupt is
faked by simply calling the execlist interrupt handler from a non-interrupt
context. Even though interrupts might be lost that does not mean that the
hardware does not always update the context status buffer (CSB) when
appropriate, which means that any context state transitions would be captured
there regardless of the interrupt being sent or not. By faking the lost
interrupt the interrupt handler could act on the outstanding context status
transition events in the CSB, e.g. a context completion event. In the case
where the hardware would be idle but the driver would be waiting for
completion, faking an interrupt and finding a context completion status event
would cause the driver to remove the currently active request from the execlist
queue and go idle - thereby reestablishing a consistent context submission
status between the hardware and the driver.

The way this is implemented is that the hang checker will always keep alive as
long as there is outstanding work. Even if the enable_hangcheck flag is
disabled one part of the hang checker will always keep alive and reschedule
itself, only to scan for inconsistent context submission states on all engines.
As long as the context submission status of the currently running context on a
given engine is consistent the hang checker works as normal and schedules hang
recoveries as expected. If the status is not consistent no hang recoveries will
be scheduled since no context resubmission will be possible anyway, so there is
no point in trying until the status becomes consistent again. Of course, if
enough hangs on the same engine are detected without any change in consistency
the hang checker will go straight for the full GPU reset so there is no chance
of getting stuck in this state.

It's worth keeping in mind that the watchdog timeout hang detection mechanism
relies entirely on the per-engine hang recovery path. So if we have an
inconsistent context submission status on the engine on which the watchdog
timeout has detected a hang, there is no way to recover from that hang if the
periodic hang checker is turned off, since the per-engine hang recovery cannot
do its final context resubmission if the context submission status is
inconsistent.
That's why we need to make sure that there is always a thread alive that keeps
an eye out for inconsistent context submission states, not only for the
periodic hang checker but also for watchdog timeout.

Finally, since a non-interrupt context thread could end up in the interrupt
handler as part of the forced CSB checking there's the chance of a race
condition between the interrupt handler and the ring init code since both
update ring->next_context_status_buffer. Therefore we've had to update the
interrupt handler so that it grabs the execlist spinlock before updating
the variable. We've also had to make sure that the ring init code
grabs the execlist spinlock before initing this variable.

Signed-off-by: Tomas Elf <tomas.elf at intel.com>
---
 drivers/gpu/drm/i915/i915_dma.c         |    6 +-
 drivers/gpu/drm/i915/i915_irq.c         |   77 ++++++++++++++++++++++++--
 drivers/gpu/drm/i915/intel_lrc.c        |   91 +++++++++++++++++++++++++++++--
 drivers/gpu/drm/i915/intel_lrc.h        |    2 +-
 drivers/gpu/drm/i915/intel_lrc_tdr.h    |    3 +
 drivers/gpu/drm/i915/intel_ringbuffer.h |   14 +++++
 6 files changed, 179 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 2ec3163..ad4c9efa 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -784,10 +784,12 @@ i915_hangcheck_init(struct drm_device *dev)
 
 	for (i = 0; i < I915_NUM_RINGS; i++) {
 		struct intel_engine_cs *engine = &dev_priv->ring[i];
+		struct intel_ring_hangcheck *hc = &engine->hangcheck;
 
 		i915_hangcheck_reinit(engine);
-		engine->hangcheck.reset_count = 0;
-		engine->hangcheck.tdr_count = 0;
+		hc->reset_count = 0;
+		hc->tdr_count = 0;
+		hc->inconsistent_ctx_status_cnt = 0;
 	}
 }
 
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 57c8568..56bd967 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -36,6 +36,7 @@
 #include "i915_drv.h"
 #include "i915_trace.h"
 #include "intel_drv.h"
+#include "intel_lrc_tdr.h"
 
 /**
  * DOC: interrupt handling
@@ -1286,7 +1287,7 @@ static irqreturn_t gen8_gt_irq_handler(struct drm_i915_private *dev_priv,
 			ret = IRQ_HANDLED;
 
 			if (tmp & (GT_CONTEXT_SWITCH_INTERRUPT << GEN8_RCS_IRQ_SHIFT))
-				intel_lrc_irq_handler(&dev_priv->ring[RCS]);
+				intel_lrc_irq_handler(&dev_priv->ring[RCS], true);
 			if (tmp & (GT_RENDER_USER_INTERRUPT << GEN8_RCS_IRQ_SHIFT))
 				notify_ring(&dev_priv->ring[RCS]);
 			if (tmp & (GT_GEN8_RCS_WATCHDOG_INTERRUPT << GEN8_RCS_IRQ_SHIFT)) {
@@ -1303,7 +1304,7 @@ static irqreturn_t gen8_gt_irq_handler(struct drm_i915_private *dev_priv,
 			}
 
 			if (tmp & (GT_CONTEXT_SWITCH_INTERRUPT << GEN8_BCS_IRQ_SHIFT))
-				intel_lrc_irq_handler(&dev_priv->ring[BCS]);
+				intel_lrc_irq_handler(&dev_priv->ring[BCS], true);
 			if (tmp & (GT_RENDER_USER_INTERRUPT << GEN8_BCS_IRQ_SHIFT))
 				notify_ring(&dev_priv->ring[BCS]);
 		} else
@@ -1317,7 +1318,7 @@ static irqreturn_t gen8_gt_irq_handler(struct drm_i915_private *dev_priv,
 			ret = IRQ_HANDLED;
 
 			if (tmp & (GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS1_IRQ_SHIFT))
-				intel_lrc_irq_handler(&dev_priv->ring[VCS]);
+				intel_lrc_irq_handler(&dev_priv->ring[VCS], true);
 			if (tmp & (GT_RENDER_USER_INTERRUPT << GEN8_VCS1_IRQ_SHIFT))
 				notify_ring(&dev_priv->ring[VCS]);
 			if (tmp & (GT_GEN8_VCS_WATCHDOG_INTERRUPT << GEN8_VCS1_IRQ_SHIFT)) {
@@ -1334,7 +1335,7 @@ static irqreturn_t gen8_gt_irq_handler(struct drm_i915_private *dev_priv,
 			}
 
 			if (tmp & (GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS2_IRQ_SHIFT))
-				intel_lrc_irq_handler(&dev_priv->ring[VCS2]);
+				intel_lrc_irq_handler(&dev_priv->ring[VCS2], true);
 			if (tmp & (GT_RENDER_USER_INTERRUPT << GEN8_VCS2_IRQ_SHIFT))
 				notify_ring(&dev_priv->ring[VCS2]);
 			if (tmp & (GT_GEN8_VCS_WATCHDOG_INTERRUPT << GEN8_VCS2_IRQ_SHIFT)) {
@@ -1360,7 +1361,7 @@ static irqreturn_t gen8_gt_irq_handler(struct drm_i915_private *dev_priv,
 			ret = IRQ_HANDLED;
 
 			if (tmp & (GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VECS_IRQ_SHIFT))
-				intel_lrc_irq_handler(&dev_priv->ring[VECS]);
+				intel_lrc_irq_handler(&dev_priv->ring[VECS], true);
 			if (tmp & (GT_RENDER_USER_INTERRUPT << GEN8_VECS_IRQ_SHIFT))
 				notify_ring(&dev_priv->ring[VECS]);
 		} else
@@ -3050,6 +3051,27 @@ ring_stuck(struct intel_engine_cs *ring, u64 acthd)
 	return HANGCHECK_HUNG;
 }
 
+static void check_ctx_submission_consistency(struct drm_i915_private *dev_priv,
+				   struct intel_engine_cs *engine,
+				   enum context_submission_status status)
+{
+	struct intel_ring_hangcheck *hc = &engine->hangcheck;
+
+	if (status == CONTEXT_SUBMISSION_STATUS_INCONSISTENT) {
+		if (hc->inconsistent_ctx_status_cnt++ >
+			I915_FAKED_CONTEXT_IRQ_THRESHOLD) {
+
+			DRM_ERROR("Inconsistent context submission state. " \
+				  "Faking interrupt on %s!\n", engine->name);
+
+			intel_execlists_TDR_force_CSB_check(dev_priv, engine);
+			hc->inconsistent_ctx_status_cnt = 0;
+		}
+	}
+	else
+		hc->inconsistent_ctx_status_cnt = 0;
+}
+
 /*
  * This is called when the chip hasn't reported back with completed
  * batchbuffers in a long time. We keep track per ring seqno progress and
@@ -3070,10 +3092,43 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 	int busy_count = 0;
 	bool stuck[I915_NUM_RINGS] = { 0 };
 	bool force_full_gpu_reset = false;
+	enum context_submission_status status[I915_NUM_RINGS] =
+		{ CONTEXT_SUBMISSION_STATUS_OK };
 #define BUSY 1
 #define KICK 5
 #define HUNG 20
 
+	/*
+	 * In execlist mode we need to check for inconsistent context
+	 * submission states regardless if we want to actually check for hangs
+	 * or not since watchdog timeout is dependent on per-engine recovery
+	 * working properly, which will not be the case if there is an
+	 * inconsistent submission state between hardware and driver.
+	 */
+	if (i915.enable_execlists)
+		for_each_ring(ring, dev_priv, i) {
+			status[i] = intel_execlists_TDR_get_current_request(ring, NULL);
+			check_ctx_submission_consistency(dev_priv,
+							 ring,
+							 status[i]);
+
+			/*
+			 * Work is still pending! If hang checking is turned on
+			 * then go through the normal hang check procedure.
+			 * Otherwise we obviously don't do the normal busyness
+			 * check but instead go for a simple check of the
+			 * execlist queues to see if there's work pending. If
+			 * so, there's the potential for an inconsistent
+			 * context submission state so we must keep hang
+			 * checking.
+			 */
+			if (!i915.enable_hangcheck &&
+			   (status[i] != CONTEXT_SUBMISSION_STATUS_NONE_SUBMITTED)) {
+				 i915_queue_hangcheck(dev);
+				 return;
+			}
+		}
+
 	if (!i915.enable_hangcheck)
 		return;
 
@@ -3160,7 +3215,17 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 	}
 
 	for_each_ring(ring, dev_priv, i) {
-		if (ring->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) {
+		/*
+		 * If the engine is hung but the context submission state is
+		 * inconsistent we cannot attempt recovery since we have no way
+		 * of resubmitting the context. Trying to do so would just
+		 * cause unforeseen preemptions. At the top of this function we
+		 * check for - and attempt to rectify - any inconsistencies so
+		 * that future hang checks can safely proceed to recover from
+		 * the hang.
+		 */
+		if ((ring->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) &&
+		    (status[i] == CONTEXT_SUBMISSION_STATUS_OK)) {
 			DRM_INFO("%s on %s\n",
 				 stuck[i] ? "stuck" : "no progress",
 				 ring->name);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 051da09..0d197fe 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -641,7 +641,7 @@ static bool execlists_check_remove_request(struct intel_engine_cs *ring,
  * Check the unread Context Status Buffers and manage the submission of new
  * contexts to the ELSP accordingly.
  */
-void intel_lrc_irq_handler(struct intel_engine_cs *ring)
+int intel_lrc_irq_handler(struct intel_engine_cs *ring, bool do_lock)
 {
 	struct drm_i915_private *dev_priv = ring->dev->dev_private;
 	u32 status_pointer;
@@ -653,13 +653,14 @@ void intel_lrc_irq_handler(struct intel_engine_cs *ring)
 
 	status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(ring));
 
+	if (do_lock)
+		spin_lock(&ring->execlist_lock);
+
 	read_pointer = ring->next_context_status_buffer;
 	write_pointer = status_pointer & 0x07;
 	if (read_pointer > write_pointer)
 		write_pointer += 6;
 
-	spin_lock(&ring->execlist_lock);
-
 	while (read_pointer < write_pointer) {
 		read_pointer++;
 		status = I915_READ(RING_CONTEXT_STATUS_BUF(ring) +
@@ -685,13 +686,16 @@ void intel_lrc_irq_handler(struct intel_engine_cs *ring)
 	if (submit_contexts != 0)
 		execlists_context_unqueue(ring);
 
-	spin_unlock(&ring->execlist_lock);
-
 	WARN(submit_contexts > 2, "More than two context complete events?\n");
 	ring->next_context_status_buffer = write_pointer % 6;
 
+	if (do_lock)
+		spin_unlock(&ring->execlist_lock);
+
 	I915_WRITE(RING_CONTEXT_STATUS_PTR(ring),
 		   ((u32)ring->next_context_status_buffer & 0x07) << 8);
+
+	return submit_contexts;
 }
 
 static int execlists_context_queue(struct intel_engine_cs *ring,
@@ -1473,6 +1477,7 @@ static int gen8_init_common_ring(struct intel_engine_cs *ring)
 {
 	struct drm_device *dev = ring->dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
+	unsigned long flags;
 
 	I915_WRITE_IMR(ring, ~(ring->irq_enable_mask | ring->irq_keep_mask));
 	I915_WRITE(RING_HWSTAM(ring->mmio_base), 0xffffffff);
@@ -1481,7 +1486,11 @@ static int gen8_init_common_ring(struct intel_engine_cs *ring)
 		   _MASKED_BIT_DISABLE(GFX_REPLAY_MODE) |
 		   _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE));
 	POSTING_READ(RING_MODE_GEN7(ring));
+
+	spin_lock_irqsave(&ring->execlist_lock, flags);
 	ring->next_context_status_buffer = 0;
+	spin_unlock_irqrestore(&ring->execlist_lock, flags);
+
 	DRM_DEBUG_DRIVER("Execlists enabled for %s\n", ring->name);
 
 	i915_hangcheck_reinit(ring);
@@ -2703,3 +2712,75 @@ intel_execlists_TDR_get_current_request(struct intel_engine_cs *ring,
 
 	return status;
 }
+
+/**
+ * execlists_TDR_force_CSB_check() - check CSB manually to act on pending
+ * context status events.
+ *
+ * @dev_priv: i915 driver private data for the device owning the engine.
+ * @engine: engine whose CSB is to be checked.
+ *
+ * In case we missed a context event interrupt we can fake this interrupt by
+ * acting on pending CSB events manually by calling this function. This is
+ * normally what would happen in interrupt context but that does not prevent us
+ * from calling it from a user thread.
+ */
+void intel_execlists_TDR_force_CSB_check(struct drm_i915_private *dev_priv,
+					 struct intel_engine_cs *engine)
+{
+	unsigned long flags;
+	bool hw_active;
+	int was_effective;
+
+	if (atomic_read(&engine->hangcheck.flags)
+		& I915_ENGINE_RESET_IN_PROGRESS) {
+
+		/*
+		 * Normally it's not a problem to fake context event interrupts
+		 * at any point even though the real interrupt might come in as
+		 * well. However, following a per-engine reset the read pointer
+		 * is set to 0 and the write pointer is set to 7.
+		 * Seeing as 7 % 6 = 1 (% 6 meaning there are 6 event slots),
+		 * which is 1 above the post-reset read pointer position, that
+		 * means that we've got a CSB window of non-zero size that
+		 * might be populated with context events by the hardware
+		 * following the TDR context resubmission. If we do a faked
+		 * interrupt too early (before finishing hang recovery) we
+		 * clear out this window by setting read pointer = write
+		 * pointer = 1 expecting that all contained events have been
+		 * processed (following a reset there will be nothing but
+		 * zeroes in there, though). This does not prevent the hardware
+		 * from filling in CSB slots 0 and 1 with events after this
+		 * point in time, though. By checking the CSB before allowing
+		 * the hardware to fill in the events, we hide these events from
+		 * being processed, potentially causing irrecoverable hangs.
+		 *
+		 * Solution: Do not fake interrupts while hang recovery is ongoing.
+		 */
+		DRM_ERROR("Hang recovery in progress. Abort %s CSB check!\n",
+			engine->name);
+
+		return;
+	}
+
+	hw_active =
+		(I915_READ(RING_EXECLIST_STATUS(engine)) &
+			EXECLIST_STATUS_CURRENT_ACTIVE_ELEMENT_STATUS) ?
+				true : false;
+	if (hw_active) {
+		u32 hw_context;
+
+		hw_context = I915_READ(RING_EXECLIST_STATUS_CTX_ID(engine));
+		WARN(hw_active, "Context (%x) executing on %s - " \
+				"No need for faked IRQ!\n",
+				hw_context, engine->name);
+	}
+
+	spin_lock_irqsave(&engine->execlist_lock, flags);
+	if (!(was_effective = intel_lrc_irq_handler(engine, false)))
+		DRM_ERROR("Forced CSB check of %s ineffective!\n", engine->name);
+	spin_unlock_irqrestore(&engine->execlist_lock, flags);
+
+	wake_up_all(&engine->irq_queue);
+}
+
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index d2f497c..6fae3c8 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -88,7 +88,7 @@ int intel_execlists_submission(struct drm_device *dev, struct drm_file *file,
 			       u64 exec_start, u32 dispatch_flags);
 u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj);
 
-void intel_lrc_irq_handler(struct intel_engine_cs *ring);
+int intel_lrc_irq_handler(struct intel_engine_cs *ring, bool do_lock);
 void intel_execlists_retire_requests(struct intel_engine_cs *ring);
 
 int intel_execlists_read_tail(struct intel_engine_cs *ring,
diff --git a/drivers/gpu/drm/i915/intel_lrc_tdr.h b/drivers/gpu/drm/i915/intel_lrc_tdr.h
index 684b009..79cae7d 100644
--- a/drivers/gpu/drm/i915/intel_lrc_tdr.h
+++ b/drivers/gpu/drm/i915/intel_lrc_tdr.h
@@ -33,5 +33,8 @@ enum context_submission_status
 intel_execlists_TDR_get_current_request(struct intel_engine_cs *ring,
 		struct drm_i915_gem_request **req);
 
+void intel_execlists_TDR_force_CSB_check(struct drm_i915_private *dev_priv,
+					 struct intel_engine_cs *engine);
+
 #endif /* _INTEL_LRC_TDR_H_ */
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 9058789..f779d4d 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -141,6 +141,20 @@ struct intel_ring_hangcheck {
 
 	/* Number of watchdog hang detections for this ring */
 	u32 watchdog_count;
+
+	/*
+	 * Number of detected context submission status
+	 * inconsistencies
+	 */
+	u32 inconsistent_ctx_status_cnt;
+
+	/*
+	 * Number of detected context submission status
+	 * inconsistencies before faking the context event IRQ
+	 * that is presumed missing.
+	 */
+#define I915_FAKED_CONTEXT_IRQ_THRESHOLD 1
+
 };
 
 struct intel_ringbuffer {
-- 
1.7.9.5



More information about the Intel-gfx mailing list