[Intel-gfx] [PATCH 1/2] drm/i915: Detach hangcheck from request lists

Fri May 8 06:39:54 PDT 2015

Hangcheck tries to peek into request list to see
if the ring was busy or not. But that leads to race
against the list addition in request submission.
And hangcheck saw a ring being idle, when in fact work was
just being submitted.

There is strong desire to keep hangcheck without
locks of any kind as it is our last line of defense
against failures that surpass our imagination.

Rework the hangcheck logic so that we find out
the ring busyness by inspecting hw engine state
and omit the request->list inspection.

v2: start is u32, push for 80 cols (Chris)

References: https://bugs.freedesktop.org/show_bug.cgi?id=89931
References: https://bugs.freedesktop.org/show_bug.cgi?id=89644
References: https://bugs.freedesktop.org/show_bug.cgi?id=88984
References: https://bugs.freedesktop.org/show_bug.cgi?id=88981
References: https://bugs.freedesktop.org/show_bug.cgi?id=87730
Cc: Chris Wilson <chris at chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala at intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c     |  10 +-
 drivers/gpu/drm/i915/i915_gpu_error.c   |   4 +-
 drivers/gpu/drm/i915/i915_irq.c         | 214 ++++++++++++++++++++------------
 drivers/gpu/drm/i915/intel_ringbuffer.h |   3 +
 4 files changed, 149 insertions(+), 82 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index adbbdda..8c647bf 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1295,6 +1295,7 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct intel_engine_cs *ring;
 	u64 acthd[I915_NUM_RINGS];
+	u32 start[I915_NUM_RINGS];
 	u32 seqno[I915_NUM_RINGS];
 	int i;
 
@@ -1308,6 +1309,7 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 	for_each_ring(ring, dev_priv, i) {
 		seqno[i] = ring->get_seqno(ring, false);
 		acthd[i] = intel_ring_get_active_head(ring);
+		start[i] = I915_READ_START(ring);
 	}
 
 	intel_runtime_pm_put(dev_priv);
@@ -1328,8 +1330,14 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 			   (long long)acthd[i]);
 		seq_printf(m, "\tmax ACTHD = 0x%08llx\n",
 			   (long long)ring->hangcheck.max_acthd);
+		seq_printf(m, "\tSTART = 0x%08x [current 0x%08x]\n",
+			   ring->hangcheck.start,
+			   start[i]);
+
 		seq_printf(m, "\tscore = %d\n", ring->hangcheck.score);
-		seq_printf(m, "\taction = %d\n", ring->hangcheck.action);
+		seq_printf(m, "\taction = %s [%d]\n",
+			   i915_hangcheck_action_to_str(ring->hangcheck.action),
+			   ring->hangcheck.action);
 	}
 
 	return 0;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index a3e330d..9c0db19 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -220,7 +220,7 @@ static void print_error_buffers(struct drm_i915_error_state_buf *m,
 	}
 }
 
-static const char *hangcheck_action_to_str(enum intel_ring_hangcheck_action a)
+const char *i915_hangcheck_action_to_str(enum intel_ring_hangcheck_action a)
 {
 	switch (a) {
 	case HANGCHECK_IDLE:
@@ -301,7 +301,7 @@ static void i915_ring_error_state(struct drm_i915_error_state_buf *m,
 	err_printf(m, "  ring->head: 0x%08x\n", ring->cpu_ring_head);
 	err_printf(m, "  ring->tail: 0x%08x\n", ring->cpu_ring_tail);
 	err_printf(m, "  hangcheck: %s [%d]\n",
-		   hangcheck_action_to_str(ring->hangcheck_action),
+		   i915_hangcheck_action_to_str(ring->hangcheck_action),
 		   ring->hangcheck_score);
 }
 
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 9da955e..a3244bd 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2685,20 +2685,6 @@ static void gen8_disable_vblank(struct drm_device *dev, int pipe)
 	spin_unlock_irqrestore(&dev_priv->irq_lock, irqflags);
 }
 
-static struct drm_i915_gem_request *
-ring_last_request(struct intel_engine_cs *ring)
-{
-	return list_entry(ring->request_list.prev,
-			  struct drm_i915_gem_request, list);
-}
-
-static bool
-ring_idle(struct intel_engine_cs *ring)
-{
-	return (list_empty(&ring->request_list) ||
-		i915_gem_request_completed(ring_last_request(ring), false));
-}
-
 static bool
 ipehr_is_semaphore_wait(struct drm_device *dev, u32 ipehr)
 {
@@ -2882,6 +2868,109 @@ ring_stuck(struct intel_engine_cs *ring, u64 acthd)
 	return HANGCHECK_HUNG;
 }
 
+static bool engine_busy(struct intel_engine_cs *ring,
+			const u64 acthd, const u32 start)
+{
+	struct drm_i915_private *dev_priv = to_i915(ring->dev);
+	u32 tail, head;
+
+	if (ring->hangcheck.acthd != acthd)
+		return true;
+
+	if (ring->hangcheck.start != start)
+		return true;
+
+	head = I915_READ_HEAD(ring) & HEAD_ADDR;
+	tail = I915_READ_TAIL(ring) & TAIL_ADDR;
+
+	if (head != tail)
+		return true;
+
+	/* Stop rings mechanism keeps head==tail even if
+	 *  there is work to be done.
+	 */
+	if (ring->buffer &&
+	    ring->buffer->tail != tail &&
+	    waitqueue_active(&ring->irq_queue))
+		return true;
+
+	return false;
+}
+
+#define engine_idle(ring, acthd, start) (!engine_busy((ring), (acthd), (start)))
+
+static bool check_for_missed_irq(struct intel_engine_cs *ring)
+{
+	struct drm_i915_private *dev_priv = to_i915(ring->dev);
+	bool irq_missing;
+
+	if (!waitqueue_active(&ring->irq_queue))
+		return false;
+
+	irq_missing = !test_and_set_bit(ring->id,
+					&dev_priv->gpu_error.missed_irq_rings);
+
+	if (irq_missing) {
+		const bool irq_test =
+			dev_priv->gpu_error.test_irq_rings &
+			intel_ring_flag(ring);
+
+		if (irq_test)
+			DRM_INFO("Fake missed irq on %s\n", ring->name);
+		else
+			DRM_ERROR("Missed irq on %s\n", ring->name);
+	}
+
+	return true;
+}
+
+static bool hangcheck_handle_stuck_ring(struct intel_engine_cs *ring, u64 acthd)
+{
+#define BUSY 1
+#define KICK 5
+#define HUNG 20
+
+	struct intel_ring_hangcheck *hc = &ring->hangcheck;
+	bool there_is_hope = true;
+
+	/* We always increment the hangcheck score
+	 * if the ring is busy and still processing
+	 * the same request, so that no single request
+	 * can run indefinitely (such as a chain of
+	 * batches). If we detect a loop in acthd,
+	 * we increment the busyness twice as fast.
+	 * If this ring is in a legitimate wait for another
+	 * ring, we omit incrementing the score. In that case
+	 * the waiting ring is a victim and we want to be sure we
+	 * catch the right culprit. Then every time we do kick
+	 * the ring, add a small increment to the
+	 * score so that we can catch a batch that is
+	 * being repeatedly kicked and so responsible
+	 * for stalling the machine.
+	 */
+	hc->action = ring_stuck(ring, acthd);
+
+	switch (hc->action) {
+	case HANGCHECK_IDLE:
+	case HANGCHECK_WAIT:
+	case HANGCHECK_ACTIVE:
+		hc->score += BUSY;
+		break;
+	case HANGCHECK_ACTIVE_LOOP:
+		hc->score += 2 * BUSY;
+		break;
+	case HANGCHECK_KICK:
+		hc->score += KICK;
+		break;
+	case HANGCHECK_HUNG:
+		hc->score += HUNG;
+		there_is_hope = false;
+		break;
+	}
+
+	return there_is_hope;
+}
+
 /*
  * This is called when the chip hasn't reported back with completed
  * batchbuffers in a long time. We keep track per ring seqno progress and
@@ -2900,15 +2989,14 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 	int i;
 	int busy_count = 0, rings_hung = 0;
 	bool stuck[I915_NUM_RINGS] = { 0 };
-#define BUSY 1
-#define KICK 5
-#define HUNG 20
 
 	if (!i915.enable_hangcheck)
 		return;
 
 	for_each_ring(ring, dev_priv, i) {
+		struct intel_ring_hangcheck *hc = &ring->hangcheck;
 		u64 acthd;
+		u32 start;
 		u32 seqno;
 		bool busy = true;
 
@@ -2916,76 +3004,44 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 
 		seqno = ring->get_seqno(ring, false);
 		acthd = intel_ring_get_active_head(ring);
+		start = I915_READ_START(ring);
 
-		if (ring->hangcheck.seqno == seqno) {
-			if (ring_idle(ring)) {
-				ring->hangcheck.action = HANGCHECK_IDLE;
-
-				if (waitqueue_active(&ring->irq_queue)) {
-					/* Issue a wake-up to catch stuck h/w. */
-					if (!test_and_set_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings)) {
-						if (!(dev_priv->gpu_error.test_irq_rings & intel_ring_flag(ring)))
-							DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
-								  ring->name);
-						else
-							DRM_INFO("Fake missed irq on %s\n",
-								 ring->name);
-						wake_up_all(&ring->irq_queue);
-					}
-					/* Safeguard against driver failure */
-					ring->hangcheck.score += BUSY;
-				} else
-					busy = false;
-			} else {
-				/* We always increment the hangcheck score
-				 * if the ring is busy and still processing
-				 * the same request, so that no single request
-				 * can run indefinitely (such as a chain of
-				 * batches). The only time we do not increment
-				 * the hangcheck score on this ring, if this
-				 * ring is in a legitimate wait for another
-				 * ring. In that case the waiting ring is a
-				 * victim and we want to be sure we catch the
-				 * right culprit. Then every time we do kick
-				 * the ring, add a small increment to the
-				 * score so that we can catch a batch that is
-				 * being repeatedly kicked and so responsible
-				 * for stalling the machine.
-				 */
-				ring->hangcheck.action = ring_stuck(ring,
-								    acthd);
-
-				switch (ring->hangcheck.action) {
-				case HANGCHECK_IDLE:
-				case HANGCHECK_WAIT:
-				case HANGCHECK_ACTIVE:
-					break;
-				case HANGCHECK_ACTIVE_LOOP:
-					ring->hangcheck.score += BUSY;
-					break;
-				case HANGCHECK_KICK:
-					ring->hangcheck.score += KICK;
-					break;
-				case HANGCHECK_HUNG:
-					ring->hangcheck.score += HUNG;
-					stuck[i] = true;
-					break;
-				}
-			}
-		} else {
-			ring->hangcheck.action = HANGCHECK_ACTIVE;
+		if (hc->seqno != seqno) {
+			hc->action = HANGCHECK_ACTIVE;
 
 			/* Gradually reduce the count so that we catch DoS
 			 * attempts across multiple batches.
 			 */
-			if (ring->hangcheck.score > 0)
-				ring->hangcheck.score--;
+			if (hc->score > 0)
+				hc->score--;
 
-			ring->hangcheck.acthd = ring->hangcheck.max_acthd = 0;
+			hc->max_acthd = 0;
+			goto engine_check_done;
 		}
 
-		ring->hangcheck.seqno = seqno;
-		ring->hangcheck.acthd = acthd;
+		if (engine_idle(ring, acthd, start)) {
+			busy = check_for_missed_irq(ring);
+			if (busy) {
+				/* Start waking up irrespective of
+				   our missed_irq bookkeeping */
+				if (hc->score >= KICK)
+					wake_up_all(&ring->irq_queue);
+
+				hc->score += KICK;
+				hc->action = HANGCHECK_ACTIVE;
+			} else {
+				hc->score = 0;
+				hc->action = HANGCHECK_IDLE;
+			}
+			goto engine_check_done;
+		}
+
+		hangcheck_handle_stuck_ring(ring, acthd);
+
+engine_check_done:
+		hc->seqno = seqno;
+		hc->acthd = acthd;
+		hc->start = start;
 		busy_count += busy;
 	}
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 39f6dfc..83edcef 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -83,11 +83,14 @@ enum intel_ring_hangcheck_action {
 	HANGCHECK_HUNG,
 };
 
+const char *i915_hangcheck_action_to_str(enum intel_ring_hangcheck_action a);
+
 #define HANGCHECK_SCORE_RING_HUNG 31
 
 struct intel_ring_hangcheck {
 	u64 acthd;
 	u64 max_acthd;
+	u32 start;
 	u32 seqno;
 	int score;
 	enum intel_ring_hangcheck_action action;
-- 
1.9.1