[Intel-gfx] [PATCH 13/20] drm/i915: Test infrastructure for context state inconsistency simulation
Tomas Elf
tomas.elf at intel.com
Thu Oct 22 18:32:35 PDT 2015
Add debugfs functions and embed test infrastructure in the context event
interrupt handler for simulating the loss of context event interrupts, so that
a context submission state inconsistency can be induced. This is useful for
testing the consistency checker that runs as a pre-stage to the engine hang
recovery path: in order to test that the inconsistency detection works, we
first need to induce a state inconsistency that the checker can detect and act
upon.
Signed-off-by: Tomas Elf <tomas.elf at intel.com>
---
drivers/gpu/drm/i915/i915_debugfs.c | 88 +++++++++++++++++++++++++++++++++++++
drivers/gpu/drm/i915/i915_dma.c | 2 +
drivers/gpu/drm/i915/i915_drv.c | 3 ++
drivers/gpu/drm/i915/i915_drv.h | 12 +++++
drivers/gpu/drm/i915/intel_lrc.c | 68 ++++++++++++++++++++++++++++
5 files changed, 173 insertions(+)
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index edb79a7..233088e 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -4667,6 +4667,93 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops,
"%llu\n");
static int
+i915_fake_ctx_submission_inconsistency_get(void *data, u64 *val)
+{ /* debugfs read: log the per-engine counts, return the raw word in *val */
+ struct drm_device *dev = data;
+ struct drm_i915_private *dev_priv = dev->dev_private;
+ struct intel_engine_cs *ring;
+ unsigned i;
+
+ DRM_INFO("Faked inconsistent context submission state: %x\n",
+ dev_priv->gpu_error.faked_lost_ctx_event_irq);
+
+ for_each_ring(ring, dev_priv, i) {
+ u32 fake_cnt = /* 4-bit field at bit offset 4*i of the word */
+ (dev_priv->gpu_error.faked_lost_ctx_event_irq >> (i<<2)) & 0xf;
+
+ DRM_INFO("%s: Faking %s [%u IRQs left to drop]\n",
+ ring->name,
+ fake_cnt?"enabled":"disabled",
+ fake_cnt);
+ }
+
+ *val = (u64) dev_priv->gpu_error.faked_lost_ctx_event_irq; /* all engines */
+
+ return 0; /* always succeeds */
+}
+
+static int
+i915_fake_ctx_submission_inconsistency_set(void *data, u64 val)
+{
+ struct drm_device *dev = data;
+ struct drm_i915_private *dev_priv = dev->dev_private;
+ u32 fake_status;
+
+ /*
+ * Set up a simulated/faked lost context event interrupt. This is used
+ * to induce inconsistent HW/driver states for the context submission
+ * status consistency checker (involved as a pre-stage to GPU engine
+ * hang recovery) to detect, which is required for validation purposes.
+ *
+ * val contains the new faked_lost_ctx_event_irq word that is to be
+ * bitwise OR-ed into the already set faked_lost_ctx_event_irq word.
+ *
+ * val == 0 means clear all previously set fake bits.
+ *
+ * Each nibble contains a number between 0-15 denoting the number of
+ * interrupts left to lose on the engine that nibble corresponds to.
+ *
+ * RCS: faked_lost_ctx_event_irq[3:0]
+ * VCS: faked_lost_ctx_event_irq[7:4]
+ * BCS: faked_lost_ctx_event_irq[11:8]
+ * VECS: faked_lost_ctx_event_irq[15:12]
+ * etc
+ *
+ * The number in each nibble is decremented by the context event
+ * interrupt handler in intel_lrc.c once the faked interrupt loss is
+ * executed. If a targeted interrupt is received while the nibble
+ * corresponding to that engine is non-zero that interrupt will be dropped
+ * without side-effects, thus inducing an inconsistency since the
+ * hardware has entered a state where removal of a context from the
+ * context queue is required but the driver is not informed of this and
+ * is therefore stuck in that state until inconsistency rectification
+ * (forced CSB checking) or reboot.
+ */
+
+ fake_status = /* previous word, kept only for the log line below */
+ dev_priv->gpu_error.faked_lost_ctx_event_irq;
+
+ DRM_INFO("Faking lost context event IRQ (new status: %x, old status: %x)\n",
+ (u32) val, fake_status); /* "new status" is the requested val, pre-merge */
+
+ if (val) {
+ dev_priv->gpu_error.faked_lost_ctx_event_irq |= ((u32) val); /* upper 32 bits of val are discarded */
+ } else {
+ DRM_INFO("Clearing lost context event IRQ mask\n");
+
+ dev_priv->gpu_error.faked_lost_ctx_event_irq = 0;
+ }
+
+
+ return 0; /* always succeeds; val is not range-checked */
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(i915_fake_ctx_submission_inconsistency_fops,
+ i915_fake_ctx_submission_inconsistency_get,
+ i915_fake_ctx_submission_inconsistency_set,
+ "%llu\n"); /* same format as i915_wedged_fops; presumably the read-back format - confirm */
+
+static int
i915_ring_stop_get(void *data, u64 *val)
{
struct drm_device *dev = data;
@@ -5320,6 +5407,7 @@ static const struct i915_debugfs_files {
const struct file_operations *fops;
} i915_debugfs_files[] = {
{"i915_wedged", &i915_wedged_fops},
+ {"i915_fake_ctx_inconsistency", &i915_fake_ctx_submission_inconsistency_fops},
{"i915_max_freq", &i915_max_freq_fops},
{"i915_min_freq", &i915_min_freq_fops},
{"i915_cache_sharing", &i915_cache_sharing_fops},
diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 1e203e7d..e8193c9 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -843,6 +843,8 @@ i915_hangcheck_init(struct drm_device *dev)
int i;
struct drm_i915_private *dev_priv = dev->dev_private;
+ dev_priv->gpu_error.faked_lost_ctx_event_irq = 0;
+
for (i = 0; i < I915_NUM_RINGS; i++) {
struct intel_engine_cs *engine = &dev_priv->ring[i];
struct intel_ring_hangcheck *hc = &engine->hangcheck;
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index c361b19..c32c475 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -917,6 +917,9 @@ int i915_reset(struct drm_device *dev)
}
}
+ /* Clear simulated lost context event interrupts */
+ dev_priv->gpu_error.faked_lost_ctx_event_irq = 0;
+
if (i915_stop_ring_allow_warn(dev_priv))
pr_notice("drm/i915: Resetting chip after gpu hang\n");
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 9219904..7ebf800 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1400,6 +1400,18 @@ struct i915_gpu_error {
#define I915_STOP_RING_ALLOW_BAN (1 << 31)
#define I915_STOP_RING_ALLOW_WARN (1 << 30)
+ /*
+ * Per-engine counters for simulation of lost context event IRQs.
+ * Each nibble holds the number of IRQs left to drop on one engine.
+ *
+ * Bits 3:0: Number of lost IRQs to be faked on RCS
+ * Bits 7:4: Number of lost IRQs to be faked on VCS
+ * Bits 11:8: Number of lost IRQs to be faked on BCS
+ * Bits 15:12: Number of lost IRQs to be faked on VECS
+ * Bits 19:16: Number of lost IRQs to be faked on VCS2
+ */
+ u32 faked_lost_ctx_event_irq;
+
/* For missed irq/seqno simulation. */
unsigned int test_irq_rings;
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index b48f74c..5bb7d6e 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -700,6 +700,52 @@ static bool execlists_check_remove_request(struct intel_engine_cs *ring,
}
/**
+ * fake_lost_ctx_event_irq() - Checks for pending faked lost context event IRQs.
+ * @dev_priv: Driver private data holding gpu_error.faked_lost_ctx_event_irq.
+ * @ring: Engine to check pending faked lost IRQs for.
+ *
+ * Checks the nibble in dev_priv->gpu_error.faked_lost_ctx_event_irq selected
+ * by ring->id; if non-zero, decrements it by one and logs the change.
+ *
+ * Return:
+ * true: If the current IRQ is to be lost.
+ * false: If the current IRQ is to be processed as normal.
+ */
+static inline bool fake_lost_ctx_event_irq(struct drm_i915_private *dev_priv,
+ struct intel_engine_cs *ring)
+{
+ u32 *faked_lost_irq_mask =
+ &dev_priv->gpu_error.faked_lost_ctx_event_irq;
+
+ /*
+ * Bit offset of the least significant bit of the nibble in the faked
+ * lost context event IRQ mask that corresponds to the engine at hand.
+ */
+ u32 engine_nibble = (ring->id << 2);
+
+ /* Check engine nibble for any pending IRQs to be simulated as lost */
+ if (*faked_lost_irq_mask & (0xf << engine_nibble)) {
+ DRM_INFO("Faked lost interrupt on %s! (%x)\n",
+ ring->name,
+ *faked_lost_irq_mask);
+
+ /*
+ * Subtract the IRQ that is to be simulated as lost from the
+ * engine nibble (non-zero here, so no borrow from neighbours).
+ */
+ *faked_lost_irq_mask -= (0x1 << engine_nibble);
+
+ DRM_INFO("New fake lost irq mask: %x\n",
+ *faked_lost_irq_mask);
+
+ /* Tell the IRQ handler to simulate lost context event IRQ */
+ return true;
+ }
+
+ return false; /* nibble is zero: nothing left to fake on this engine */
+}
+
+/**
* intel_lrc_irq_handler() - handle Context Switch interrupts
* @ring: Engine Command Streamer to handle.
* @do_lock: Lock execlist spinlock (if false the caller is responsible for this)
@@ -740,6 +786,23 @@ int intel_lrc_irq_handler(struct intel_engine_cs *ring, bool do_lock)
if (status & GEN8_CTX_STATUS_PREEMPTED) {
if (status & GEN8_CTX_STATUS_LITE_RESTORE) {
+ if (fake_lost_ctx_event_irq(dev_priv, ring)) {
+ /*
+ * If we want to simulate the loss of a
+ * context event IRQ (only for such events
+ * that could affect the execlist queue,
+ * since this is something that could
+ * affect the context submission status
+ * consistency checker) then just exit the
+ * IRQ handler early with no side-effects!
+ * We want to pretend like this IRQ never
+ * happened. The next time the IRQ handler
+ * is entered for this engine the CSB
+ * events should remain in the CSB, waiting
+ * to be processed.
+ */
+ goto exit;
+ }
if (execlists_check_remove_request(ring, status_id))
WARN(1, "Lite Restored request removed from queue\n");
} else
@@ -748,6 +811,10 @@ int intel_lrc_irq_handler(struct intel_engine_cs *ring, bool do_lock)
if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) ||
(status & GEN8_CTX_STATUS_ELEMENT_SWITCH)) {
+
+ if (fake_lost_ctx_event_irq(dev_priv, ring))
+ goto exit;
+
if (execlists_check_remove_request(ring, status_id))
submit_contexts++;
}
@@ -770,6 +837,7 @@ int intel_lrc_irq_handler(struct intel_engine_cs *ring, bool do_lock)
((u32)ring->next_context_status_buffer &
GEN8_CSB_PTR_MASK) << 8));
+exit:
if (do_lock)
spin_unlock(&ring->execlist_lock);
--
1.9.1
More information about the Intel-gfx
mailing list