[Intel-gfx] [PATCH 13/20] drm/i915: Test infrastructure for context state inconsistency simulation

Thu Oct 22 18:32:35 PDT 2015

Added debugfs functions and embedded test infrastructure in the context event
interrupt handler for simulating the loss of context event interrupts so that a
context submission state inconsistency can be induced. This is useful for
testing the consistency checker pre-stage to the engine hang recovery path
since in order to test that the inconsistency detection works we first need to
induce a state inconsistency that the inconsistency checker can detect and act
upon.

Signed-off-by: Tomas Elf <tomas.elf at intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c | 88 +++++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_dma.c     |  2 +
 drivers/gpu/drm/i915/i915_drv.c     |  3 ++
 drivers/gpu/drm/i915/i915_drv.h     | 12 +++++
 drivers/gpu/drm/i915/intel_lrc.c    | 68 ++++++++++++++++++++++++++++
 5 files changed, 173 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index edb79a7..233088e 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -4667,6 +4667,93 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops,
 			"%llu\n");
 
 static int
+i915_fake_ctx_submission_inconsistency_get(void *data, u64 *val)
+{
+	struct drm_device *dev = data;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct intel_engine_cs *ring;
+	unsigned i;
+
+	DRM_INFO("Faked inconsistent context submission state: %x\n",
+		dev_priv->gpu_error.faked_lost_ctx_event_irq);
+
+	for_each_ring(ring, dev_priv, i) {
+		u32 fake_cnt =
+			(dev_priv->gpu_error.faked_lost_ctx_event_irq >> (i<<2)) & 0xf;
+
+		DRM_INFO("%s: Faking %s [%u IRQs left to drop]\n",
+			ring->name,
+			fake_cnt?"enabled":"disabled",
+			fake_cnt);
+	}
+
+	*val = (u64) dev_priv->gpu_error.faked_lost_ctx_event_irq;
+
+	return 0;
+}
+
+static int
+i915_fake_ctx_submission_inconsistency_set(void *data, u64 val)
+{
+	struct drm_device *dev = data;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	u32 fake_status;
+
+	/*
+	 * Set up a simulated/faked lost context event interrupt. This is used
+	 * to induce inconsistent HW/driver states that the context submission
+	 * status consistency checker (involved as a pre-stage to GPU engine
+	 * hang recovery), which is required for validation purposes.
+	 *
+	 * val contains the new faked_lost_ctx_event_irq word that is to be
+	 * merged with the already set faked_lost_ctx_event_irq word.
+	 *
+	 * val == 0 means clear all previously set fake bits.
+	 *
+	 * Each nibble contains a number between 0-15 denoting the number of
+	 * interrupts left to lose on the engine that nibble corresponds to.
+	 *
+	 * RCS: faked_lost_ctx_event_irq[3:0]
+	 * VCS: faked_lost_ctx_event_irq[7:4]
+	 * BCS: faked_lost_ctx_event_irq[11:8]
+	 * VECS: faked_lost_ctx_event_irq[15:12]
+	 * etc
+	 *
+	 * The number in each nibble is decremented by the context event
+	 * interrupt handler in intel_lrc.c once the faked interrupt loss is
+	 * executed. If a targetted interrupt is received when bit
+	 * corresponding to that engine is set that interrupt will be dropped
+	 * without side-effects, thus inducing an inconsistency since the
+	 * hardware has entered a state where removal of a context from the
+	 * context queue is required but the driver is not informed of this and
+	 * is therefore stuck in that state until inconsistency rectification
+	 * (forced CSB checking) or reboot.
+	 */
+
+	fake_status =
+		dev_priv->gpu_error.faked_lost_ctx_event_irq;
+
+	DRM_INFO("Faking lost context event IRQ (new status: %x, old status: %x)\n",
+		(u32) val, fake_status);
+
+	if (val) {
+		dev_priv->gpu_error.faked_lost_ctx_event_irq |= ((u32) val);
+	} else {
+		DRM_INFO("Clearing lost context event IRQ mask\n");
+
+		dev_priv->gpu_error.faked_lost_ctx_event_irq = 0;
+	}
+
+
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(i915_fake_ctx_submission_inconsistency_fops,
+			i915_fake_ctx_submission_inconsistency_get,
+			i915_fake_ctx_submission_inconsistency_set,
+			"%llu\n");
+
+static int
 i915_ring_stop_get(void *data, u64 *val)
 {
 	struct drm_device *dev = data;
@@ -5320,6 +5407,7 @@ static const struct i915_debugfs_files {
 	const struct file_operations *fops;
 } i915_debugfs_files[] = {
 	{"i915_wedged", &i915_wedged_fops},
+	{"i915_fake_ctx_inconsistency", &i915_fake_ctx_submission_inconsistency_fops},
 	{"i915_max_freq", &i915_max_freq_fops},
 	{"i915_min_freq", &i915_min_freq_fops},
 	{"i915_cache_sharing", &i915_cache_sharing_fops},
diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 1e203e7d..e8193c9 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -843,6 +843,8 @@ i915_hangcheck_init(struct drm_device *dev)
 	int i;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
+	dev_priv->gpu_error.faked_lost_ctx_event_irq = 0;
+
 	for (i = 0; i < I915_NUM_RINGS; i++) {
 		struct intel_engine_cs *engine = &dev_priv->ring[i];
 		struct intel_ring_hangcheck *hc = &engine->hangcheck;
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index c361b19..c32c475 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -917,6 +917,9 @@ int i915_reset(struct drm_device *dev)
 		}
 	}
 
+	/* Clear simulated lost context event interrupts */
+	dev_priv->gpu_error.faked_lost_ctx_event_irq = 0;
+
 	if (i915_stop_ring_allow_warn(dev_priv))
 		pr_notice("drm/i915: Resetting chip after gpu hang\n");
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 9219904..7ebf800 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1400,6 +1400,18 @@ struct i915_gpu_error {
 #define I915_STOP_RING_ALLOW_BAN       (1 << 31)
 #define I915_STOP_RING_ALLOW_WARN      (1 << 30)
 
+	/*
+	 * Bit mask for simulation of lost context event IRQs on each
+	 * respective engine.
+	 *
+	 *   Bits 0:3: 	 Number of lost IRQs to be faked on RCS
+	 *   Bits 4:7:	 Number of lost IRQs to be faked on VCS
+	 *   Bits 8:11:  Number of lost IRQs to be faked on BCS
+	 *   Bits 12:15: Number of lost IRQs to be faked on VECS
+	 *   Bits 16:19: Number of lost IRQs to be faked on VCS2
+	*/
+	u32 faked_lost_ctx_event_irq;
+
 	/* For missed irq/seqno simulation. */
 	unsigned int test_irq_rings;
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index b48f74c..5bb7d6e 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -700,6 +700,52 @@ static bool execlists_check_remove_request(struct intel_engine_cs *ring,
 }
 
 /**
+ * fake_lost_ctx_event_irq() - Checks for pending faked lost context event IRQs.
+ * @dev_priv: ...
+ * @ring: Engine to check pending faked lost IRQs for.
+ *
+ * Checks the bits in dev_priv->gpu_error.faked_lost_ctx_event_irq corresponding
+ * to the specified engine and updates the bits and returns a value accordingly.
+ *
+ * Return:
+ * 	true: If the current IRQ is to be lost.
+ * 	false: If the current IRQ is to be processed as normal.
+ */
+static inline bool fake_lost_ctx_event_irq(struct drm_i915_private *dev_priv,
+				           struct intel_engine_cs *ring)
+{
+	u32 *faked_lost_irq_mask =
+		&dev_priv->gpu_error.faked_lost_ctx_event_irq;
+
+	/*
+	 * Point out the least significant bit in the nibble of the faked lost
+	 * context event IRQ mask that corresponds to the engine at hand.
+	 */
+	u32 engine_nibble = (ring->id << 2);
+
+	/* Check engine nibble for any pending IRQs to be simulated as lost */
+	if (*faked_lost_irq_mask & (0xf << engine_nibble)) {
+		DRM_INFO("Faked lost interrupt on %s! (%x)\n",
+			ring->name,
+			*faked_lost_irq_mask);
+
+		/*
+		 * Subtract the IRQ that is to be simulated as lost from the
+		 * engine nibble.
+		 */
+		*faked_lost_irq_mask -= (0x1 << engine_nibble);
+
+		DRM_INFO("New fake lost irq mask: %x\n",
+			*faked_lost_irq_mask);
+
+		/* Tell the IRQ handler to simulate lost context event IRQ */
+		return true;
+	}
+
+	return false;
+}
+
+/**
  * intel_lrc_irq_handler() - handle Context Switch interrupts
  * @ring: Engine Command Streamer to handle.
  * @do_lock: Lock execlist spinlock (if false the caller is responsible for this)
@@ -740,6 +786,23 @@ int intel_lrc_irq_handler(struct intel_engine_cs *ring, bool do_lock)
 
 		if (status & GEN8_CTX_STATUS_PREEMPTED) {
 			if (status & GEN8_CTX_STATUS_LITE_RESTORE) {
+				if (fake_lost_ctx_event_irq(dev_priv, ring)) {
+				    /*
+				     * If we want to simulate the loss of a
+				     * context event IRQ (only for such events
+				     * that could affect the execlist queue,
+				     * since this is something that could
+				     * affect the context submission status
+				     * consistency checker) then just exit the
+				     * IRQ handler early with no side-effects!
+				     * We want to pretend like this IRQ never
+				     * happened. The next time the IRQ handler
+				     * is entered for this engine the CSB
+				     * events should remain in the CSB, waiting
+				     * to be processed.
+				     */
+				    goto exit;
+				}
 				if (execlists_check_remove_request(ring, status_id))
 					WARN(1, "Lite Restored request removed from queue\n");
 			} else
@@ -748,6 +811,10 @@ int intel_lrc_irq_handler(struct intel_engine_cs *ring, bool do_lock)
 
 		 if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) ||
 		     (status & GEN8_CTX_STATUS_ELEMENT_SWITCH)) {
+
+			if (fake_lost_ctx_event_irq(dev_priv, ring))
+			    goto exit;
+
 			if (execlists_check_remove_request(ring, status_id))
 				submit_contexts++;
 		}
@@ -770,6 +837,7 @@ int intel_lrc_irq_handler(struct intel_engine_cs *ring, bool do_lock)
 				 ((u32)ring->next_context_status_buffer &
 				  GEN8_CSB_PTR_MASK) << 8));
 
+exit:
 	if (do_lock)
 		spin_unlock(&ring->execlist_lock);
 
-- 
1.9.1