[Intel-gfx] [RFC 13/13] drm/i915: Exec buffer inserts watchdog commands
Daniel Vetter
daniel at ffwll.ch
Mon Dec 16 19:25:43 CET 2013
On Mon, Dec 16, 2013 at 04:03:52PM +0000, Lister, Ian wrote:
> From f71a7de85e9d81be3aa3962c8fe2557235ff21c1 Mon Sep 17 00:00:00 2001
> Message-Id: <f71a7de85e9d81be3aa3962c8fe2557235ff21c1.1387201899.git.ian.lister at intel.com>
> In-Reply-To: <cover.1387201899.git.ian.lister at intel.com>
> References: <cover.1387201899.git.ian.lister at intel.com>
> From: ian-lister <ian.lister at intel.com>
> Date: Wed, 11 Dec 2013 11:25:44 +0000
> Subject: [RFC 13/13] drm/i915: Exec buffer inserts watchdog commands
>
> A start timer command is inserted before the batch buffer start command
> and a stop timer command is inserted afterwards. If the batch executes
> normally then the timer will be cancelled before it can fire, however if
> the batch buffer hangs then the watchdog timer will fire and
> i915_handle_error is called to complete the recovery work.
>
> This patch requires the per-engine TDR patches.
>
> Signed-off-by: ian-lister <ian.lister at intel.com>
> ---
> drivers/gpu/drm/i915/i915_debugfs.c | 16 +++++++++++++---
> drivers/gpu/drm/i915/i915_gem_execbuffer.c | 22 ++++++++++++++++++++++
> drivers/gpu/drm/i915/i915_irq.c | 1 +
> drivers/gpu/drm/i915/intel_ringbuffer.h | 1 +
> 4 files changed, 37 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index f00b19c..6867b096 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -906,15 +906,23 @@ i915_ring_hangcheck_read(struct file *filp,
> * have hung and been reset since boot */
> struct drm_device *dev = filp->private_data;
> drm_i915_private_t *dev_priv = dev->dev_private;
> - char buf[100];
> + char buf[200];
> int len;
>
> len = scnprintf(buf, sizeof(buf),
> - "GPU=0x%08X,RCS=0x%08X,VCS=0x%08X,BCS=0x%08X\n",
> + "GPU=0x%08X,RCS=0x%08X,VCS=0x%08X,BCS=0x%08X,"
> + "RCS_T=0x%08x,VCS_T=0x%08x,BCS_T=0x%08x,"
> + "RCS_W=0x%08x,VCS_W=0x%08x,BCS_W=0x%08x\n",
> dev_priv->gpu_error.global_resets,
> dev_priv->ring[RCS].hangcheck.total,
> dev_priv->ring[VCS].hangcheck.total,
> - dev_priv->ring[BCS].hangcheck.total);
> + dev_priv->ring[BCS].hangcheck.total,
> + dev_priv->ring[RCS].hangcheck.tdr_count,
> + dev_priv->ring[VCS].hangcheck.tdr_count,
> + dev_priv->ring[BCS].hangcheck.tdr_count,
> + dev_priv->ring[RCS].hangcheck.watchdog_count,
> + dev_priv->ring[VCS].hangcheck.watchdog_count,
> + dev_priv->ring[BCS].hangcheck.watchdog_count);
>
> return simple_read_from_buffer(ubuf, max, ppos, buf, len);
> }
> @@ -935,6 +943,8 @@ i915_ring_hangcheck_write(struct file *filp,
> for (i = 0; i < I915_NUM_RINGS; i++) {
> /* Reset the hangcheck counters */
> dev_priv->ring[i].hangcheck.total = 0;
> + dev_priv->ring[i].hangcheck.tdr_count = 0;
> + dev_priv->ring[i].hangcheck.watchdog_count = 0;
> }
Looks like unrelated debug code, needs to be extracted and shuffled into
the right patch (or separate debug patch). There's more like this below.
>
> dev_priv->gpu_error.global_resets = 0;
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index c9d330a..722b41e 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -32,6 +32,7 @@
> #include "i915_trace.h"
> #include "intel_drv.h"
> #include <linux/dma_remapping.h>
> +#include "intel_ringbuffer.h"
>
> struct eb_vmas {
> struct list_head vmas;
> @@ -1196,6 +1197,17 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
> goto err;
> }
>
> + /* Start watchdog timer*/
> + if ((args->flags & I915_EXEC_ENABLE_WATCHDOG) &&
> + i915_enable_watchdog &&
> + intel_ring_watchdog_supported(ring)) {
Imo just yell at userspace (with an -EINVAL) if it tries to use the
watchdog support on unsupported rings. Which means we need a driver flag
for this.
Also I don't see the value of of a module option for this (I presume
i915_enable_watchdog is this) since this is purely opt-in from userspace.
> +
> + ret = intel_ring_start_watchdog(ring);
> +
> + if (ret)
> + goto err;
> + }
> +
> exec_start = i915_gem_obj_offset(batch_obj, vm) +
> args->batch_start_offset;
> exec_len = args->batch_len;
> @@ -1220,6 +1232,16 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
> goto err;
> }
>
> + /* Cancel watchdog timer */
> + if ((args->flags & I915_EXEC_ENABLE_WATCHDOG) &&
> + i915_enable_watchdog &&
> + intel_ring_watchdog_supported(ring)) {
> +
> + ret = intel_ring_stop_watchdog(ring);
> + if (ret)
> + goto err;
> + }
> +
> trace_i915_gem_ring_dispatch(ring, intel_ring_get_seqno(ring), flags);
>
> i915_gem_execbuffer_move_to_active(&eb->vmas, ring);
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index c26c3db..bc7d68b 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2974,6 +2974,7 @@ static void i915_hangcheck_elapsed(unsigned long data)
> busy_count += i915_hangcheck_ring_sample(ring);
>
> if (ring->hangcheck.score > FIRE) {
> + ring->hangcheck.tdr_count++;
> DRM_ERROR("Hang detected on %s\n",
> ring->name);
>
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 473cb94..4a42638 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -72,6 +72,7 @@ struct intel_ring_hangcheck {
> u32 total; /* Total resets applied to this ring/engine*/
> u32 last_head; /* Head value recorded at last hang */
> u32 status_updated;
> + u32 tdr_count; /* Total TDR resets for this ring */
> u32 watchdog_threshold;
> u32 watchdog_count; /* Total watchdog resets for this ring */
> };
> --
> 1.8.5.1
>
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
--
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch
More information about the Intel-gfx
mailing list