[Intel-gfx] [PATCH 3/5] drm/i915: Harden detection of missed interrupts
Chris Wilson
chris at chris-wilson.co.uk
Tue Feb 16 11:47:46 UTC 2016
Only declare a missed interrupt if we find that the GPU is idle with
waiters and a hangcheck interval has passed in which no new user
interrupts have been raised.
v2: Clear the stuck interrupt marker between successful batches
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala at intel.com>
---
drivers/gpu/drm/i915/i915_debugfs.c | 11 +++++++----
drivers/gpu/drm/i915/i915_irq.c | 10 +++++++++-
drivers/gpu/drm/i915/intel_ringbuffer.h | 2 ++
3 files changed, 18 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index c4df580ed0de..f3ba97ad3e00 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -730,10 +730,10 @@ static int i915_gem_request_info(struct seq_file *m, void *data)
static void i915_ring_seqno_info(struct seq_file *m,
struct intel_engine_cs *ring)
{
- if (ring->get_seqno) {
- seq_printf(m, "Current sequence (%s): %x\n",
- ring->name, ring->get_seqno(ring));
- }
+ seq_printf(m, "Current sequence (%s): %x\n",
+ ring->name, ring->get_seqno(ring));
+ seq_printf(m, "Current user interrupts (%s): %x\n",
+ ring->name, READ_ONCE(ring->user_interrupts));
}
static int i915_gem_seqno_info(struct seq_file *m, void *data)
@@ -1361,6 +1361,9 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
seq_printf(m, "%s:\n", ring->name);
seq_printf(m, "\tseqno = %x [current %x]\n",
ring->hangcheck.seqno, seqno[i]);
+ seq_printf(m, "\tuser interrupts = %x [current %x]\n",
+ ring->hangcheck.user_interrupts,
+ ring->user_interrupts);
seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
(long long)ring->hangcheck.acthd,
(long long)acthd[i]);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 07bc2cdd6252..c0aeff607130 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1000,6 +1000,7 @@ static void notify_ring(struct intel_engine_cs *ring)
return;
trace_i915_gem_request_notify(ring);
+ ring->user_interrupts++;
wake_up_all(&ring->irq_queue);
}
@@ -3097,6 +3098,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
for_each_ring(ring, dev_priv, i) {
u64 acthd;
u32 seqno;
+ unsigned user_interrupts;
bool busy = true;
semaphore_clear_deadlocks(dev_priv);
@@ -3113,6 +3115,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
acthd = intel_ring_get_active_head(ring);
seqno = ring->get_seqno(ring);
+ user_interrupts = READ_ONCE(ring->user_interrupts);
if (ring->hangcheck.seqno == seqno) {
if (ring_idle(ring, seqno)) {
@@ -3120,7 +3123,8 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
if (waitqueue_active(&ring->irq_queue)) {
/* Issue a wake-up to catch stuck h/w. */
- if (!test_and_set_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings)) {
+ if (ring->hangcheck.user_interrupts == user_interrupts &&
+ !test_and_set_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings)) {
if (!(dev_priv->gpu_error.test_irq_rings & intel_ring_flag(ring)))
DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
ring->name);
@@ -3183,10 +3187,14 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
memset(ring->hangcheck.instdone, 0,
sizeof(ring->hangcheck.instdone));
+
+ /* Reset stuck interrupts between batch advances */
+ user_interrupts = 0;
}
ring->hangcheck.seqno = seqno;
ring->hangcheck.acthd = acthd;
+ ring->hangcheck.user_interrupts = user_interrupts;
busy_count += busy;
}
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 4cea04491392..dfb14bfe5bc8 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -90,6 +90,7 @@ struct intel_ring_hangcheck {
u64 acthd;
u64 max_acthd;
u32 seqno;
+ unsigned user_interrupts;
int score;
enum intel_ring_hangcheck_action action;
int deadlock;
@@ -306,6 +307,7 @@ struct intel_engine_cs {
* inspecting request list.
*/
u32 last_submitted_seqno;
+ unsigned user_interrupts;
bool gpu_caches_dirty;
--
2.7.0
More information about the Intel-gfx mailing list