[Intel-gfx] [PATCH 3/4] drm/i915: Don't count semaphore waits towards a stuck ring
Daniel Vetter
daniel at ffwll.ch
Tue Jun 11 11:51:28 CEST 2013
On Mon, Jun 10, 2013 at 11:20:21AM +0100, Chris Wilson wrote:
> If we detect a ring is in a valid wait for another, just let it be.
> Eventually it will either begin to progress again, or the entire system
> will come grinding to a halt and then hangcheck will fire as soon as the
> deadlock is detected.
>
> This error was foretold by Ben in
> commit 05407ff889ceebe383aa5907219f86582ef96b72
> Author: Mika Kuoppala <mika.kuoppala at linux.intel.com>
> Date: Thu May 30 09:04:29 2013 +0300
>
> drm/i915: detect hang using per ring hangcheck_score
>
> "If ring B is waiting on ring A via semaphore, and ring A is making
> progress, albeit slowly - the hangcheck will fire. The check will
> determine that A is moving, however ring B will appear hung because
> the ACTHD doesn't move. I honestly can't say if that's actually a
> realistic problem to hit it probably implies the timeout value is too
> low."
>
> v2: Make sure we don't even incur the KICK cost whilst waiting.
>
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=65394
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
> Cc: Ben Widawsky <ben at bwidawsk.net>
> ---
> drivers/gpu/drm/i915/i915_irq.c | 121 +++++++++++++++++++++++--------
> drivers/gpu/drm/i915/intel_ringbuffer.h | 1 +
> 2 files changed, 90 insertions(+), 32 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 32b2465..cf8584c 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2331,21 +2331,21 @@ ring_idle(struct intel_ring_buffer *ring, u32 seqno)
> i915_seqno_passed(seqno, ring_last_seqno(ring)));
> }
>
> -static bool semaphore_passed(struct intel_ring_buffer *ring)
> +static struct intel_ring_buffer *
> +semaphore_waits_for(struct intel_ring_buffer *ring, u32 *seqno)
> {
> struct drm_i915_private *dev_priv = ring->dev->dev_private;
> - u32 acthd = intel_ring_get_active_head(ring) & HEAD_ADDR;
> - struct intel_ring_buffer *signaller;
> - u32 cmd, ipehr, acthd_min;
> + u32 cmd, ipehr, acthd, acthd_min;
>
> ipehr = I915_READ(RING_IPEHR(ring->mmio_base));
> if ((ipehr & ~(0x3 << 16)) !=
> (MI_SEMAPHORE_MBOX | MI_SEMAPHORE_COMPARE | MI_SEMAPHORE_REGISTER))
> - return false;
> + return NULL;
>
> /* ACTHD is likely pointing to the dword after the actual command,
> * so scan backwards until we find the MBOX.
> */
> + acthd = intel_ring_get_active_head(ring) & HEAD_ADDR;
> acthd_min = max((int)acthd - 3 * 4, 0);
> do {
> cmd = ioread32(ring->virtual_start + acthd);
> @@ -2354,22 +2354,53 @@ static bool semaphore_passed(struct intel_ring_buffer *ring)
>
> acthd -= 4;
> if (acthd < acthd_min)
> - return false;
> + return NULL;
> } while (1);
>
> - signaller = &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3];
> - return i915_seqno_passed(signaller->get_seqno(signaller, false),
> - ioread32(ring->virtual_start+acthd+4)+1);
> + *seqno = ioread32(ring->virtual_start+acthd+4)+1;
> + return &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3];
> +}
> +
> +static int semaphore_passed(struct intel_ring_buffer *ring)
> +{
> + struct drm_i915_private *dev_priv = ring->dev->dev_private;
> + struct intel_ring_buffer *signaller;
> + u32 seqno, ctl;
> +
> + ring->hangcheck.deadlock = true;
> +
> + signaller = semaphore_waits_for(ring, &seqno);
> + if (signaller == NULL || signaller->hangcheck.deadlock)
> + return -1;
> +
> + /* cursory check for an unkickable deadlock */
> + ctl = I915_READ_CTL(signaller);
> + if (ctl & RING_WAIT_SEMAPHORE && semaphore_passed(signaller) < 0)
> + return -1;
> +
> + return i915_seqno_passed(signaller->get_seqno(signaller, false), seqno);
> +}
> +
> +static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv)
> +{
> + struct intel_ring_buffer *ring;
> + int i;
> +
> + for_each_ring(ring, dev_priv, i)
> + ring->hangcheck.deadlock = false;
> }
>
> -static bool ring_hung(struct intel_ring_buffer *ring)
> +static enum { wait, active, kick, hung } ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
> {
> struct drm_device *dev = ring->dev;
> struct drm_i915_private *dev_priv = dev->dev_private;
> u32 tmp;
>
> + if (ring->hangcheck.acthd != acthd)
> + return active;
> +
> if (IS_GEN2(dev))
> - return true;
> + return hung;
>
> /* Is the chip hanging on a WAIT_FOR_EVENT?
> * If so we can simply poke the RB_WAIT bit
> @@ -2381,19 +2412,24 @@ static bool ring_hung(struct intel_ring_buffer *ring)
> DRM_ERROR("Kicking stuck wait on %s\n",
> ring->name);
> I915_WRITE_CTL(ring, tmp);
> - return false;
> - }
> -
> - if (INTEL_INFO(dev)->gen >= 6 &&
> - tmp & RING_WAIT_SEMAPHORE &&
> - semaphore_passed(ring)) {
> - DRM_ERROR("Kicking stuck semaphore on %s\n",
> - ring->name);
> - I915_WRITE_CTL(ring, tmp);
> - return false;
> + return kick;
> + }
> +
> + if (INTEL_INFO(dev)->gen >= 6 && tmp & RING_WAIT_SEMAPHORE) {
> + switch (semaphore_passed(ring)) {
> + default:
> + return hung;
> + case 1:
> + DRM_ERROR("Kicking stuck semaphore on %s\n",
> + ring->name);
> + I915_WRITE_CTL(ring, tmp);
> + return kick;
> + case 0:
> + return wait;
> + }
> }
>
> - return true;
> + return hung;
> }
>
> /**
> @@ -2424,6 +2460,8 @@ void i915_hangcheck_elapsed(unsigned long data)
> u32 seqno, acthd;
> bool busy = true;
>
> + semaphore_clear_deadlocks(dev_priv);
> +
> seqno = ring->get_seqno(ring, false);
> acthd = intel_ring_get_active_head(ring);
>
> @@ -2440,17 +2478,36 @@ void i915_hangcheck_elapsed(unsigned long data)
> } else {
> int score;
>
> - stuck[i] = ring->hangcheck.acthd == acthd;
> - if (stuck[i]) {
> - /* Every time we kick the ring, add a
> - * small increment to the hangcheck
> - * score so that we can catch a
> - * batch that is repeatedly kicked.
> - */
> - score = ring_hung(ring) ? HUNG : KICK;
> - } else
> + /* We always increment the hangcheck score
> + * if the ring is busy and still processing
> + * the same request, so that no single request
> + * can run indefinitely (such as a chain of
> + * batches). The only time we do not increment
> + * the hangcheck score on this ring, if this
> + * ring is in a legitimate wait for another
> + * ring. In that case the waiting ring is a
> + * victim and we want to be sure we catch the
> + * right culprit. Then every time we do kick
> + * the ring, add a small increment to the
> + * score so that we can catch a batch that is
> + * being repeatedly kicked and so responsible
> + * for stalling the machine.
> + */
> + switch (ring_stuck(ring, acthd)) {
> + case wait:
> + score = 0;
> + break;
> + case active:
> score = BUSY;
> -
> + break;
> + case kick:
> + score = KICK;
> + break;
> + case hung:
> + score = HUNG;
> + stuck[i] = true;
> + break;
> + }
> ring->hangcheck.score += score;
I think extracting the score selection logic here would be nice, stuff is
falling of the cliff here a bit ;-)
Anyway, series merged, thanks a lot to everyone for digging into this an
coming up with a pretty neat solution.
Cheers, Daniel
> }
> } else {
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index efc403d..a3e9610 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -38,6 +38,7 @@ struct intel_hw_status_page {
> #define I915_READ_SYNC_1(ring) I915_READ(RING_SYNC_1((ring)->mmio_base))
>
> struct intel_ring_hangcheck {
> + bool deadlock;
> u32 seqno;
> u32 acthd;
> int score;
> --
> 1.7.10.4
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
--
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch
More information about the Intel-gfx
mailing list