[Intel-gfx] [RFC 13/13] drm/i915: Exec buffer inserts watchdog commands

Lister, Ian ian.lister at intel.com
Mon Dec 16 17:03:52 CET 2013


From f71a7de85e9d81be3aa3962c8fe2557235ff21c1 Mon Sep 17 00:00:00 2001
Message-Id: <f71a7de85e9d81be3aa3962c8fe2557235ff21c1.1387201899.git.ian.lister at intel.com>
In-Reply-To: <cover.1387201899.git.ian.lister at intel.com>
References: <cover.1387201899.git.ian.lister at intel.com>
From: ian-lister <ian.lister at intel.com>
Date: Wed, 11 Dec 2013 11:25:44 +0000
Subject: [RFC 13/13] drm/i915: Exec buffer inserts watchdog commands

A start timer command is inserted before the batch buffer start command
and a stop timer command is inserted afterwards. If the batch executes
normally then the timer will be cancelled before it can fire, however if
the batch buffer hangs then the watchdog timer will fire and
i915_handle_error is called to complete the recovery work.

This patch requires the per-engine TDR patches.

Signed-off-by: ian-lister <ian.lister at intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c        | 16 +++++++++++++---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 22 ++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_irq.c            |  1 +
 drivers/gpu/drm/i915/intel_ringbuffer.h    |  1 +
 4 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index f00b19c..6867b096 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -906,15 +906,23 @@ i915_ring_hangcheck_read(struct file *filp,
 	 * have hung and been reset since boot */
 	struct drm_device *dev = filp->private_data;
 	drm_i915_private_t *dev_priv = dev->dev_private;
-	char buf[100];
+	char buf[200];
 	int len;
 
 	len = scnprintf(buf, sizeof(buf),
-		"GPU=0x%08X,RCS=0x%08X,VCS=0x%08X,BCS=0x%08X\n",
+		"GPU=0x%08X,RCS=0x%08X,VCS=0x%08X,BCS=0x%08X,"
+		"RCS_T=0x%08x,VCS_T=0x%08x,BCS_T=0x%08x,"
+		"RCS_W=0x%08x,VCS_W=0x%08x,BCS_W=0x%08x\n",
 		dev_priv->gpu_error.global_resets,
 		dev_priv->ring[RCS].hangcheck.total,
 		dev_priv->ring[VCS].hangcheck.total,
-		dev_priv->ring[BCS].hangcheck.total);
+		dev_priv->ring[BCS].hangcheck.total,
+		dev_priv->ring[RCS].hangcheck.tdr_count,
+		dev_priv->ring[VCS].hangcheck.tdr_count,
+		dev_priv->ring[BCS].hangcheck.tdr_count,
+		dev_priv->ring[RCS].hangcheck.watchdog_count,
+		dev_priv->ring[VCS].hangcheck.watchdog_count,
+		dev_priv->ring[BCS].hangcheck.watchdog_count);
 
 	return simple_read_from_buffer(ubuf, max, ppos, buf, len);
 }
@@ -935,6 +943,8 @@ i915_ring_hangcheck_write(struct file *filp,
 	for (i = 0; i < I915_NUM_RINGS; i++) {
 		/* Reset the hangcheck counters */
 		dev_priv->ring[i].hangcheck.total = 0;
+		dev_priv->ring[i].hangcheck.tdr_count = 0;
+		dev_priv->ring[i].hangcheck.watchdog_count = 0;
 	}
 
 	dev_priv->gpu_error.global_resets = 0;
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index c9d330a..722b41e 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -32,6 +32,7 @@
 #include "i915_trace.h"
 #include "intel_drv.h"
 #include <linux/dma_remapping.h>
+#include "intel_ringbuffer.h"
 
 struct eb_vmas {
 	struct list_head vmas;
@@ -1196,6 +1197,17 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 			goto err;
 	}
 
+	/* Start watchdog timer*/
+	if ((args->flags & I915_EXEC_ENABLE_WATCHDOG) &&
+		i915_enable_watchdog &&
+		intel_ring_watchdog_supported(ring)) {
+
+		ret = intel_ring_start_watchdog(ring);
+
+		if (ret)
+			goto err;
+	}
+
 	exec_start = i915_gem_obj_offset(batch_obj, vm) +
 		args->batch_start_offset;
 	exec_len = args->batch_len;
@@ -1220,6 +1232,16 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 			goto err;
 	}
 
+	/* Cancel watchdog timer */
+	if ((args->flags & I915_EXEC_ENABLE_WATCHDOG) &&
+		i915_enable_watchdog &&
+		intel_ring_watchdog_supported(ring)) {
+
+		ret = intel_ring_stop_watchdog(ring);
+		if (ret)
+			goto err;
+	}
+
 	trace_i915_gem_ring_dispatch(ring, intel_ring_get_seqno(ring), flags);
 
 	i915_gem_execbuffer_move_to_active(&eb->vmas, ring);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index c26c3db..bc7d68b 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2974,6 +2974,7 @@ static void i915_hangcheck_elapsed(unsigned long data)
 		busy_count += i915_hangcheck_ring_sample(ring);
 
 		if (ring->hangcheck.score > FIRE) {
+			ring->hangcheck.tdr_count++;
 			DRM_ERROR("Hang detected on %s\n",
 				 ring->name);
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 473cb94..4a42638 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -72,6 +72,7 @@ struct intel_ring_hangcheck {
 	u32 total; /* Total resets applied to this ring/engine*/
 	u32 last_head; /* Head value recorded at last hang */
 	u32 status_updated;
+	u32 tdr_count;      /* Total TDR resets for this ring */
 	u32 watchdog_threshold;
 	u32 watchdog_count; /* Total watchdog resets for this ring */
 };
-- 
1.8.5.1




More information about the Intel-gfx mailing list