[Intel-gfx] [PATCH v2] drm/i915: Move missed interrupt detection from hangcheck to breadcrumbs

Chris Wilson chris at chris-wilson.co.uk
Tue Aug 9 14:08:55 UTC 2016


In commit 2529d57050af ("drm/i915: Drop racy markup of missed-irqs from
idle-worker") the racy detection of missed interrupts was removed when
we went idle. This however opened up the issue that the stuck waiters
were not being reported, causing a test case failure. If we move the
stuck waiter detection out of hangcheck and into the breadcrumb
mechanims (i.e. the waiter) itself, we can avoid this issue entirely.
This leaves hangcheck looking for a stuck GPU (inspecting for request
advancement and HEAD motion), and breadcrumbs looking for a stuck
waiter - hopefully make both easier to understand by their segregation.

v2: Reduce the error message as we now run independently of hangcheck,
and the hanging batch used by igt also counts as a stuck waiter causing
extra warnings in dmesg.
v3: Move the breadcrumb's hangcheck kickstart to the first missed wait.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=97104
Fixes: 2529d57050af (waiter"drm/i915: Drop racy markup of missed-irqs...")
Testcase: igt/drv_missed_irq
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin at linux.intel.com>
Cc: Mika Kuoppala <mika.kuoppala at intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c      | 11 ++---
 drivers/gpu/drm/i915/i915_gem.c          | 10 -----
 drivers/gpu/drm/i915/i915_irq.c          | 26 +-----------
 drivers/gpu/drm/i915/intel_breadcrumbs.c | 69 ++++++++++++++++++++++----------
 drivers/gpu/drm/i915/intel_engine_cs.c   |  1 +
 drivers/gpu/drm/i915/intel_ringbuffer.h  |  6 +--
 6 files changed, 56 insertions(+), 67 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index f62285c1ed7f..96bfc745a820 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -787,8 +787,6 @@ static void i915_ring_seqno_info(struct seq_file *m,
 
 	seq_printf(m, "Current sequence (%s): %x\n",
 		   engine->name, intel_engine_get_seqno(engine));
-	seq_printf(m, "Current user interrupts (%s): %lx\n",
-		   engine->name, READ_ONCE(engine->breadcrumbs.irq_wakeups));
 
 	spin_lock(&b->lock);
 	for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
@@ -1434,11 +1432,10 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 			   engine->hangcheck.seqno,
 			   seqno[id],
 			   engine->last_submitted_seqno);
-		seq_printf(m, "\twaiters? %d\n",
-			   intel_engine_has_waiter(engine));
-		seq_printf(m, "\tuser interrupts = %lx [current %lx]\n",
-			   engine->hangcheck.user_interrupts,
-			   READ_ONCE(engine->breadcrumbs.irq_wakeups));
+		seq_printf(m, "\twaiters? %s, fake irq active? %s\n",
+			   yesno(intel_engine_has_waiter(engine)),
+			   yesno(test_bit(engine->id,
+					  &dev_priv->gpu_error.missed_irq_rings)));
 		seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
 			   (long long)engine->hangcheck.acthd,
 			   (long long)acthd[id]);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index d71fa9a93afa..2bb9ef91a243 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2524,7 +2524,6 @@ i915_gem_idle_work_handler(struct work_struct *work)
 		container_of(work, typeof(*dev_priv), gt.idle_work.work);
 	struct drm_device *dev = &dev_priv->drm;
 	struct intel_engine_cs *engine;
-	unsigned int stuck_engines;
 	bool rearm_hangcheck;
 
 	if (!READ_ONCE(dev_priv->gt.awake))
@@ -2554,15 +2553,6 @@ i915_gem_idle_work_handler(struct work_struct *work)
 	dev_priv->gt.awake = false;
 	rearm_hangcheck = false;
 
-	/* As we have disabled hangcheck, we need to unstick any waiters still
-	 * hanging around. However, as we may be racing against the interrupt
-	 * handler or the waiters themselves, we skip enabling the fake-irq.
-	 */
-	stuck_engines = intel_kick_waiters(dev_priv);
-	if (unlikely(stuck_engines))
-		DRM_DEBUG_DRIVER("kicked stuck waiters (%x)...missed irq?\n",
-				 stuck_engines);
-
 	if (INTEL_GEN(dev_priv) >= 6)
 		gen6_rps_idle(dev_priv);
 	intel_runtime_pm_put(dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 591f452ece68..ebb83d5a448b 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -972,10 +972,8 @@ static void ironlake_rps_change_irq_handler(struct drm_i915_private *dev_priv)
 static void notify_ring(struct intel_engine_cs *engine)
 {
 	smp_store_mb(engine->breadcrumbs.irq_posted, true);
-	if (intel_engine_wakeup(engine)) {
+	if (intel_engine_wakeup(engine))
 		trace_i915_gem_request_notify(engine);
-		engine->breadcrumbs.irq_wakeups++;
-	}
 }
 
 static void vlv_c0_read(struct drm_i915_private *dev_priv,
@@ -3044,22 +3042,6 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
 	return HANGCHECK_HUNG;
 }
 
-static unsigned long kick_waiters(struct intel_engine_cs *engine)
-{
-	struct drm_i915_private *i915 = engine->i915;
-	unsigned long irq_count = READ_ONCE(engine->breadcrumbs.irq_wakeups);
-
-	if (engine->hangcheck.user_interrupts == irq_count &&
-	    !test_and_set_bit(engine->id, &i915->gpu_error.missed_irq_rings)) {
-		if (!test_bit(engine->id, &i915->gpu_error.test_irq_rings))
-			DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
-				  engine->name);
-
-		intel_engine_enable_fake_irq(engine);
-	}
-
-	return irq_count;
-}
 /*
  * This is called when the chip hasn't reported back with completed
  * batchbuffers in a long time. We keep track per ring seqno progress and
@@ -3097,7 +3079,6 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 		bool busy = intel_engine_has_waiter(engine);
 		u64 acthd;
 		u32 seqno;
-		unsigned user_interrupts;
 
 		semaphore_clear_deadlocks(dev_priv);
 
@@ -3114,15 +3095,11 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 		acthd = intel_engine_get_active_head(engine);
 		seqno = intel_engine_get_seqno(engine);
 
-		/* Reset stuck interrupts between batch advances */
-		user_interrupts = 0;
-
 		if (engine->hangcheck.seqno == seqno) {
 			if (!intel_engine_is_active(engine)) {
 				engine->hangcheck.action = HANGCHECK_IDLE;
 				if (busy) {
 					/* Safeguard against driver failure */
-					user_interrupts = kick_waiters(engine);
 					engine->hangcheck.score += BUSY;
 				}
 			} else {
@@ -3185,7 +3162,6 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 
 		engine->hangcheck.seqno = seqno;
 		engine->hangcheck.acthd = acthd;
-		engine->hangcheck.user_interrupts = user_interrupts;
 		busy_count += busy;
 	}
 
diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
index 90867446f1a5..c83ae3cb51df 100644
--- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
@@ -26,6 +26,40 @@
 
 #include "i915_drv.h"
 
+static void intel_breadcrumbs_hangcheck(unsigned long data)
+{
+	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
+	struct intel_breadcrumbs *b = &engine->breadcrumbs;
+
+	if (!b->irq_enabled)
+		return;
+
+	if (time_before(jiffies, b->timeout)) {
+		mod_timer(&b->hangcheck, b->timeout);
+		return;
+	}
+
+	DRM_DEBUG("Hangcheck timer elapsed... %s idle\n", engine->name);
+	set_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
+	mod_timer(&engine->breadcrumbs.fake_irq, jiffies + 1);
+
+	/* Ensure that even if the GPU hangs, we get woken up.
+	 *
+	 * However, note that if no one is waiting, we never notice
+	 * a gpu hang. Eventually, we will have to wait for a resource
+	 * held by the GPU and so trigger a hangcheck. In the most
+	 * pathological case, this will be upon memory starvation! To
+	 * prevent this, we also queue the hangcheck from the retire
+	 * worker.
+	 */
+	i915_queue_hangcheck(i915);
+}
+
+static unsigned long wait_timeout(void)
+{
+	return round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES);
+}
+
 static void intel_breadcrumbs_fake_irq(unsigned long data)
 {
 	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
@@ -51,13 +85,6 @@ static void irq_enable(struct intel_engine_cs *engine)
 	 */
 	engine->breadcrumbs.irq_posted = true;
 
-	/* Make sure the current hangcheck doesn't falsely accuse a just
-	 * started irq handler from missing an interrupt (because the
-	 * interrupt count still matches the stale value from when
-	 * the irq handler was disabled, many hangchecks ago).
-	 */
-	engine->breadcrumbs.irq_wakeups++;
-
 	spin_lock_irq(&engine->i915->irq_lock);
 	engine->irq_enable(engine);
 	spin_unlock_irq(&engine->i915->irq_lock);
@@ -98,17 +125,13 @@ static void __intel_breadcrumbs_enable_irq(struct intel_breadcrumbs *b)
 	}
 
 	if (!b->irq_enabled ||
-	    test_bit(engine->id, &i915->gpu_error.missed_irq_rings))
+	    test_bit(engine->id, &i915->gpu_error.missed_irq_rings)) {
 		mod_timer(&b->fake_irq, jiffies + 1);
-
-	/* Ensure that even if the GPU hangs, we get woken up.
-	 *
-	 * However, note that if no one is waiting, we never notice
-	 * a gpu hang. Eventually, we will have to wait for a resource
-	 * held by the GPU and so trigger a hangcheck. In the most
-	 * pathological case, this will be upon memory starvation!
-	 */
-	i915_queue_hangcheck(i915);
+	} else {
+		/* Ensure we never sleep indefinitely */
+		GEM_BUG_ON(!time_after(b->timeout, jiffies));
+		mod_timer(&b->hangcheck, b->timeout);
+	}
 }
 
 static void __intel_breadcrumbs_disable_irq(struct intel_breadcrumbs *b)
@@ -219,6 +242,7 @@ static bool __intel_engine_add_wait(struct intel_engine_cs *engine,
 		GEM_BUG_ON(!next && !first);
 		if (next && next != &wait->node) {
 			GEM_BUG_ON(first);
+			b->timeout = wait_timeout();
 			b->first_wait = to_wait(next);
 			smp_store_mb(b->irq_seqno_bh, b->first_wait->tsk);
 			/* As there is a delay between reading the current
@@ -245,6 +269,7 @@ static bool __intel_engine_add_wait(struct intel_engine_cs *engine,
 
 	if (first) {
 		GEM_BUG_ON(rb_first(&b->waiters) != &wait->node);
+		b->timeout = wait_timeout();
 		b->first_wait = wait;
 		smp_store_mb(b->irq_seqno_bh, wait->tsk);
 		/* After assigning ourselves as the new bottom-half, we must
@@ -277,11 +302,6 @@ bool intel_engine_add_wait(struct intel_engine_cs *engine,
 	return first;
 }
 
-void intel_engine_enable_fake_irq(struct intel_engine_cs *engine)
-{
-	mod_timer(&engine->breadcrumbs.fake_irq, jiffies + 1);
-}
-
 static inline bool chain_wakeup(struct rb_node *rb, int priority)
 {
 	return rb && to_wait(rb)->tsk->prio <= priority;
@@ -359,6 +379,7 @@ void intel_engine_remove_wait(struct intel_engine_cs *engine,
 			 * the interrupt, or if we have to handle an
 			 * exception rather than a seqno completion.
 			 */
+			b->timeout = wait_timeout();
 			b->first_wait = to_wait(next);
 			smp_store_mb(b->irq_seqno_bh, b->first_wait->tsk);
 			if (b->first_wait->seqno != wait->seqno)
@@ -533,6 +554,9 @@ int intel_engine_init_breadcrumbs(struct intel_engine_cs *engine)
 	struct task_struct *tsk;
 
 	spin_lock_init(&b->lock);
+	setup_timer(&b->hangcheck,
+		    intel_breadcrumbs_hangcheck,
+		    (unsigned long)engine);
 	setup_timer(&b->fake_irq,
 		    intel_breadcrumbs_fake_irq,
 		    (unsigned long)engine);
@@ -561,6 +585,7 @@ void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine)
 		kthread_stop(b->signaler);
 
 	del_timer_sync(&b->fake_irq);
+	del_timer_sync(&b->hangcheck);
 }
 
 unsigned int intel_kick_waiters(struct drm_i915_private *i915)
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index e9b301ae2d0c..0dd3d1de18aa 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -164,6 +164,7 @@ cleanup:
 void intel_engine_init_hangcheck(struct intel_engine_cs *engine)
 {
 	memset(&engine->hangcheck, 0, sizeof(engine->hangcheck));
+	clear_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
 }
 
 static void intel_engine_init_requests(struct intel_engine_cs *engine)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 43e545e44352..4aed4586b0b6 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -75,7 +75,6 @@ enum intel_engine_hangcheck_action {
 
 struct intel_engine_hangcheck {
 	u64 acthd;
-	unsigned long user_interrupts;
 	u32 seqno;
 	int score;
 	enum intel_engine_hangcheck_action action;
@@ -173,7 +172,6 @@ struct intel_engine_cs {
 	 */
 	struct intel_breadcrumbs {
 		struct task_struct *irq_seqno_bh; /* bh for user interrupts */
-		unsigned long irq_wakeups;
 		bool irq_posted;
 
 		spinlock_t lock; /* protects the lists of requests */
@@ -183,6 +181,9 @@ struct intel_engine_cs {
 		struct task_struct *signaler; /* used for fence signalling */
 		struct drm_i915_gem_request *first_signal;
 		struct timer_list fake_irq; /* used after a missed interrupt */
+		struct timer_list hangcheck; /* detect missed interrupts */
+
+		unsigned long timeout;
 
 		bool irq_enabled : 1;
 		bool rpm_wakelock : 1;
@@ -560,7 +561,6 @@ static inline bool intel_engine_wakeup(struct intel_engine_cs *engine)
 	return wakeup;
 }
 
-void intel_engine_enable_fake_irq(struct intel_engine_cs *engine);
 void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine);
 unsigned int intel_kick_waiters(struct drm_i915_private *i915);
 unsigned int intel_kick_signalers(struct drm_i915_private *i915);
-- 
2.8.1



More information about the Intel-gfx mailing list