[Intel-gfx] [PATCH RFC] drm/i915: Print dmesg warn on unintended hangs
Mika Kuoppala
mika.kuoppala at linux.intel.com
Fri Nov 10 11:53:47 UTC 2017
We have a problem of distinguishing intended hangs
submitted by igt during CI/bat and hangs that are nonintended
happening in close proximity.
As we know how igt constructs a batch intended to hang
the gpu, we can use this in our advantage when error state
is constructed. The signature of a intended hang can
be parsed out in this stage.
Batches that are expected to hang can be watermarked by:
batch[i++] = MI_BATCH_BUFFER_END;
batch[i++] = MI_NOOP;
batch[i++] = IGT_HANG_SIGNATURE; /* 0xc5c5c5c5 */
Note that we do the parsing only with gem debug builds.
Later when we spew out notification about the hang into
the dmesg, we can use DRM_INFO for intended hangs
and DRM_WARN for nonintended. This way auxiliary tools
can make better judgement on what to consider important
for reporting.
Cc: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Daniel Vetter <daniel.vetter at ffwll.ch>
Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
Signed-off-by: Mika Kuoppala <mika.kuoppala at linux.intel.com>
---
drivers/gpu/drm/i915/i915_debugfs.c | 2 +-
drivers/gpu/drm/i915/i915_drv.h | 6 +-
drivers/gpu/drm/i915/i915_gpu_error.c | 100 +++++++++++++++++++++--
drivers/gpu/drm/i915/i915_irq.c | 4 +-
drivers/gpu/drm/i915/intel_hangcheck.c | 6 +-
drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 2 +-
6 files changed, 104 insertions(+), 16 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 39883cd915db..678f472a8956 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -4092,7 +4092,7 @@ i915_wedged_set(void *data, u64 val)
engine->hangcheck.stalled = true;
}
- i915_handle_error(i915, val, "Manually setting wedged to %llu", val);
+ i915_handle_error(i915, true, val, "Manually setting wedged to %llu", val);
wait_on_bit(&i915->gpu_error.flags,
I915_RESET_HANDOFF,
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index fe93115c4caa..91e0ab8509d2 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1004,6 +1004,8 @@ struct i915_gpu_state {
u32 *pages[0];
} *ringbuffer, *batchbuffer, *wa_batchbuffer, *ctx, *hws_page;
+ bool batch_had_hang_signature;
+
struct drm_i915_error_object **user_bo;
long user_bo_count;
@@ -3372,8 +3374,9 @@ static inline void i915_queue_hangcheck(struct drm_i915_private *dev_priv)
&dev_priv->gpu_error.hangcheck_work, delay);
}
-__printf(3, 4)
+__printf(4, 5)
void i915_handle_error(struct drm_i915_private *dev_priv,
+ bool intentional,
u32 engine_mask,
const char *fmt, ...);
@@ -3960,6 +3963,7 @@ static inline void i915_error_state_buf_release(
struct i915_gpu_state *i915_capture_gpu_state(struct drm_i915_private *i915);
void i915_capture_error_state(struct drm_i915_private *dev_priv,
+ bool intentional,
u32 engine_mask,
const char *error_msg);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 5c2d83a838d8..a5bfefb94d6b 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -918,15 +918,52 @@ void __i915_gpu_state_free(struct kref *error_ref)
kfree(error);
}
+#ifdef CONFIG_DRM_I915_DEBUG_GEM
+
+#define HANG_SIGNATURE_MAGIC 0xc5c5c5c5
+
+static bool batch_has_hang_signature(const u32 * const batch, u64 len)
+{
+ struct marker { u32 val; u64 offset; } m[2] = {
+ { MI_BATCH_BUFFER_END, -1 },
+ { HANG_SIGNATURE_MAGIC, -1 }
+ };
+ u64 i, j;
+
+ len = min_t(u64, len, PAGE_SIZE);
+ len = DIV_ROUND_UP(len, 4);
+
+ for (i = 0, j = 0; i < len && j < ARRAY_SIZE(m); i++) {
+ if (m[j].offset == -1 && batch[i] == m[j].val)
+ m[j++].offset = i;
+
+ if (m[0].offset != -1 && i - m[0].offset > 2)
+ break;
+ }
+
+ if (m[0].offset == -1 || m[1].offset == -1)
+ return false;
+
+ return m[1].offset - m[0].offset == 2;
+}
+#else
+static bool batch_has_hang_signature(struct i915_vma *vma)
+{
+ return false;
+}
+#endif
+
static struct drm_i915_error_object *
-i915_error_object_create(struct drm_i915_private *i915,
- struct i915_vma *vma)
+__i915_error_object_create(struct drm_i915_private *i915,
+ struct i915_vma *vma,
+ bool *hangsig)
{
struct i915_ggtt *ggtt = &i915->ggtt;
const u64 slot = ggtt->error_capture.start;
struct drm_i915_error_object *dst;
struct compress compress;
- unsigned long num_pages;
+ unsigned long num_pages, src_pages;
+ unsigned long page_count = 0;
struct sgt_iter iter;
dma_addr_t dma;
@@ -934,6 +971,7 @@ i915_error_object_create(struct drm_i915_private *i915,
return NULL;
num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT;
+ src_pages = num_pages;
num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worstcase zlib growth */
dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *),
GFP_ATOMIC | __GFP_NOWARN);
@@ -958,9 +996,16 @@ i915_error_object_create(struct drm_i915_private *i915,
I915_CACHE_NONE, 0);
s = io_mapping_map_atomic_wc(&ggtt->mappable, slot);
+
+ /* Check last page only for intentional hang signature */
+ if (hangsig && page_count == src_pages - 1)
+ *hangsig = batch_has_hang_signature((void __force *)s,
+ vma->size);
+
ret = compress_page(&compress, (void __force *)s, dst);
io_mapping_unmap_atomic(s);
+ page_count++;
if (ret)
goto unwind;
}
@@ -978,6 +1023,21 @@ i915_error_object_create(struct drm_i915_private *i915,
return dst;
}
+static struct drm_i915_error_object *
+i915_error_object_create(struct drm_i915_private *i915,
+ struct i915_vma *vma)
+{
+ return __i915_error_object_create(i915, vma, NULL);
+}
+
+static struct drm_i915_error_object *
+i915_error_batch_object_create(struct drm_i915_private *i915,
+ struct i915_vma *vma,
+ bool *signature)
+{
+ return __i915_error_object_create(i915, vma, signature);
+}
+
/* The error capture is special as tries to run underneath the normal
* locking rules - so we use the raw version of the i915_gem_active lookup.
*/
@@ -1484,8 +1544,9 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
* by userspace.
*/
ee->batchbuffer =
- i915_error_object_create(dev_priv,
- request->batch);
+ i915_error_batch_object_create(dev_priv,
+ request->batch,
+ &ee->batch_had_hang_signature);
if (HAS_BROKEN_CS_TLB(dev_priv))
ee->wa_batchbuffer =
@@ -1701,7 +1762,8 @@ static void i915_capture_reg_state(struct drm_i915_private *dev_priv,
static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
struct i915_gpu_state *error,
u32 engine_mask,
- const char *error_msg)
+ const char *error_msg,
+ bool simulated)
{
u32 ecode;
int engine_id = -1, len;
@@ -1709,7 +1771,8 @@ static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
ecode = i915_error_generate_code(dev_priv, error, &engine_id);
len = scnprintf(error->error_msg, sizeof(error->error_msg),
- "GPU HANG: ecode %d:%d:0x%08x",
+ "GPU HANG%s: ecode %d:%d:0x%08x",
+ simulated ? "(simulated)" : "",
INTEL_GEN(dev_priv), engine_id, ecode);
if (engine_id != -1 && error->engine[engine_id].context.pid)
@@ -1803,6 +1866,18 @@ i915_capture_gpu_state(struct drm_i915_private *i915)
return error;
}
+static bool error_state_has_hang_signature(const struct i915_gpu_state *error)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
+ if (error->engine[i].batch_had_hang_signature)
+ return true;
+ }
+
+ return false;
+}
+
/**
* i915_capture_error_state - capture an error record for later analysis
* @dev: drm device
@@ -1813,12 +1888,14 @@ i915_capture_gpu_state(struct drm_i915_private *i915)
* to pick up.
*/
void i915_capture_error_state(struct drm_i915_private *dev_priv,
+ bool intentional,
u32 engine_mask,
const char *error_msg)
{
static bool warned;
struct i915_gpu_state *error;
unsigned long flags;
+ bool expected;
if (!i915_modparams.error_capture)
return;
@@ -1832,8 +1909,13 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv,
return;
}
- i915_error_capture_msg(dev_priv, error, engine_mask, error_msg);
- DRM_INFO("%s\n", error->error_msg);
+ expected = intentional || error_state_has_hang_signature(error);
+ i915_error_capture_msg(dev_priv, error, engine_mask, error_msg, expected);
+
+ if (expected)
+ DRM_INFO("%s\n", error->error_msg);
+ else
+ DRM_WARN("%s\n", error->error_msg);
if (!error->simulated) {
spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index ff00e462697a..75f519f910cc 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2828,6 +2828,7 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv)
/**
* i915_handle_error - handle a gpu error
* @dev_priv: i915 device private
+ * @intentional: error was intentionally injected
* @engine_mask: mask representing engines that are hung
* @fmt: Error message format string
*
@@ -2838,6 +2839,7 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv)
* of a ring dump etc.).
*/
void i915_handle_error(struct drm_i915_private *dev_priv,
+ bool intentional,
u32 engine_mask,
const char *fmt, ...)
{
@@ -2859,7 +2861,7 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
*/
intel_runtime_pm_get(dev_priv);
- i915_capture_error_state(dev_priv, engine_mask, error_msg);
+ i915_capture_error_state(dev_priv, intentional, engine_mask, error_msg);
i915_clear_error_registers(dev_priv);
/*
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 12ac270a5f93..7b9e8dec7d65 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -266,7 +266,7 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
*/
tmp = I915_READ_CTL(engine);
if (tmp & RING_WAIT) {
- i915_handle_error(dev_priv, 0,
+ i915_handle_error(dev_priv, false, 0,
"Kicking stuck wait on %s",
engine->name);
I915_WRITE_CTL(engine, tmp);
@@ -278,7 +278,7 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
default:
return ENGINE_DEAD;
case 1:
- i915_handle_error(dev_priv, 0,
+ i915_handle_error(dev_priv, false, 0,
"Kicking stuck semaphore on %s",
engine->name);
I915_WRITE_CTL(engine, tmp);
@@ -407,7 +407,7 @@ static void hangcheck_declare_hang(struct drm_i915_private *i915,
"%s, ", engine->name);
msg[len-2] = '\0';
- return i915_handle_error(i915, hung, "%s", msg);
+ return i915_handle_error(i915, false, hung, "%s", msg);
}
/*
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 71ce06680d66..7900bba9fe16 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -846,7 +846,7 @@ static int igt_handle_error(void *arg)
engine->hangcheck.stalled = true;
engine->hangcheck.seqno = intel_engine_get_seqno(engine);
- i915_handle_error(i915, intel_engine_flag(engine), "%s", __func__);
+ i915_handle_error(i915, true, intel_engine_flag(engine), "%s", __func__);
xchg(&i915->gpu_error.first_error, error);
--
2.11.0
More information about the Intel-gfx
mailing list