[Intel-gfx] [PATCH 2/4] drm/i915: Only slightly increment hangcheck score if we succesfully kick a ring
Daniel Vetter
daniel at ffwll.ch
Tue Jun 11 11:45:00 CEST 2013
On Mon, Jun 10, 2013 at 11:20:20AM +0100, Chris Wilson wrote:
> After kicking a ring, it should be free to make progress again and so
> should not be accused of being stuck until hangcheck fires once more. In
> order to catch a denial-of-service within a batch or across multiple
> batches, we still do increment the hangcheck score - just not as
> severely so that it takes multiple kicks to fail.
>
> This should address part of Ben's justified criticism of
>
> commit 05407ff889ceebe383aa5907219f86582ef96b72
> Author: Mika Kuoppala <mika.kuoppala at linux.intel.com>
> Date: Thu May 30 09:04:29 2013 +0300
>
> drm/i915: detect hang using per ring hangcheck_score
>
> "There's also another corner case on the kick. If the seqno = 2
> (though not stuck), and on the 3rd hangcheck, the ring is stuck, and
> we try to kick it... we don't actually try to find out if the kick
> helped."
>
> v2: Make sure we catch DoS attempts with batches full of invalid WAITs.
> v3: Preserve the ability to detect loops by always charging the ring
> if it is busy on the same request.
> v4: Make sure we queue another check if on a new batch
>
> References: https://bugs.freedesktop.org/show_bug.cgi?id=65394
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
> Cc: Ben Widawsky <ben at bwidawsk.net>
> ---
> drivers/gpu/drm/i915/i915_irq.c | 110 +++++++++++++++++++++------------------
> 1 file changed, 58 insertions(+), 52 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index dcb5209..32b2465 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2324,21 +2324,11 @@ ring_last_seqno(struct intel_ring_buffer *ring)
> struct drm_i915_gem_request, list)->seqno;
> }
>
> -static bool i915_hangcheck_ring_idle(struct intel_ring_buffer *ring,
> - u32 ring_seqno, bool *err)
> -{
> - if (list_empty(&ring->request_list) ||
> - i915_seqno_passed(ring_seqno, ring_last_seqno(ring))) {
> - /* Issue a wake-up to catch stuck h/w. */
> - if (waitqueue_active(&ring->irq_queue)) {
> - DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
> - ring->name);
> - wake_up_all(&ring->irq_queue);
> - *err = true;
> - }
> - return true;
> - }
> - return false;
> +static bool
> +ring_idle(struct intel_ring_buffer *ring, u32 seqno)
> +{
> + return (list_empty(&ring->request_list) ||
> + i915_seqno_passed(seqno, ring_last_seqno(ring)));
> }
>
> static bool semaphore_passed(struct intel_ring_buffer *ring)
> @@ -2372,16 +2362,26 @@ static bool semaphore_passed(struct intel_ring_buffer *ring)
> ioread32(ring->virtual_start+acthd+4)+1);
> }
>
> -static bool kick_ring(struct intel_ring_buffer *ring)
> +static bool ring_hung(struct intel_ring_buffer *ring)
> {
> struct drm_device *dev = ring->dev;
> struct drm_i915_private *dev_priv = dev->dev_private;
> - u32 tmp = I915_READ_CTL(ring);
> + u32 tmp;
> +
> + if (IS_GEN2(dev))
> + return true;
> +
> + /* Is the chip hanging on a WAIT_FOR_EVENT?
> + * If so we can simply poke the RB_WAIT bit
> + * and break the hang. This should work on
> + * all but the second generation chipsets.
> + */
> + tmp = I915_READ_CTL(ring);
> if (tmp & RING_WAIT) {
> DRM_ERROR("Kicking stuck wait on %s\n",
> ring->name);
> I915_WRITE_CTL(ring, tmp);
> - return true;
> + return false;
> }
>
> if (INTEL_INFO(dev)->gen >= 6 &&
> @@ -2390,22 +2390,10 @@ static bool kick_ring(struct intel_ring_buffer *ring)
> DRM_ERROR("Kicking stuck semaphore on %s\n",
> ring->name);
> I915_WRITE_CTL(ring, tmp);
> - return true;
> - }
> - return false;
> -}
> -
> -static bool i915_hangcheck_ring_hung(struct intel_ring_buffer *ring)
> -{
> - if (IS_GEN2(ring->dev))
> return false;
> + }
>
> - /* Is the chip hanging on a WAIT_FOR_EVENT?
> - * If so we can simply poke the RB_WAIT bit
> - * and break the hang. This should work on
> - * all but the second generation chipsets.
> - */
> - return !kick_ring(ring);
> + return true;
> }
>
> /**
> @@ -2423,45 +2411,63 @@ void i915_hangcheck_elapsed(unsigned long data)
> struct intel_ring_buffer *ring;
> int i;
> int busy_count = 0, rings_hung = 0;
> - bool stuck[I915_NUM_RINGS];
> + bool stuck[I915_NUM_RINGS] = { 0 };
> +#define BUSY 1
> +#define KICK 5
> +#define HUNG 20
> +#define FIRE 30
>
> if (!i915_enable_hangcheck)
> return;
>
> for_each_ring(ring, dev_priv, i) {
> u32 seqno, acthd;
> - bool idle, err = false;
> + bool busy = true;
>
> seqno = ring->get_seqno(ring, false);
> acthd = intel_ring_get_active_head(ring);
> - idle = i915_hangcheck_ring_idle(ring, seqno, &err);
> - stuck[i] = ring->hangcheck.acthd == acthd;
> -
> - if (idle) {
> - if (err)
> - ring->hangcheck.score += 2;
> - else
> - ring->hangcheck.score = 0;
> - } else {
> - busy_count++;
>
> - if (ring->hangcheck.seqno == seqno) {
> - ring->hangcheck.score++;
> -
> - /* Kick ring if stuck*/
> - if (stuck[i])
> - i915_hangcheck_ring_hung(ring);
> + if (ring->hangcheck.seqno == seqno) {
> + if (ring_idle(ring, seqno)) {
> + if (waitqueue_active(&ring->irq_queue)) {
> + /* Issue a wake-up to catch stuck h/w. */
> + DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
> + ring->name);
> + wake_up_all(&ring->irq_queue);
> + ring->hangcheck.score += HUNG;
Not sure whether we want to hit missed interrupts this badly, it was
rather common a while back ;-) But we can fine-tune this easily now, so
now reservations for merging from my side.
-Daniel
> + } else
> + busy = false;
> } else {
> - ring->hangcheck.score = 0;
> + int score;
> +
> + stuck[i] = ring->hangcheck.acthd == acthd;
> + if (stuck[i]) {
> + /* Every time we kick the ring, add a
> + * small increment to the hangcheck
> + * score so that we can catch a
> + * batch that is repeatedly kicked.
> + */
> + score = ring_hung(ring) ? HUNG : KICK;
> + } else
> + score = BUSY;
> +
> + ring->hangcheck.score += score;
> }
> + } else {
> + /* Gradually reduce the count so that we catch DoS
> + * attempts across multiple batches.
> + */
> + if (ring->hangcheck.score > 0)
> + ring->hangcheck.score--;
> }
>
> ring->hangcheck.seqno = seqno;
> ring->hangcheck.acthd = acthd;
> + busy_count += busy;
> }
>
> for_each_ring(ring, dev_priv, i) {
> - if (ring->hangcheck.score > 2) {
> + if (ring->hangcheck.score > FIRE) {
> rings_hung++;
> DRM_ERROR("%s: %s on %s 0x%x\n", ring->name,
> stuck[i] ? "stuck" : "no progress",
> --
> 1.7.10.4
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
--
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch
More information about the Intel-gfx
mailing list