[Intel-gfx] [PATCH 3/3] drm/i915: Record error state capture reason

Mika Kuoppala mika.kuoppala at linux.intel.com
Mon Feb 10 15:30:51 CET 2014


We capture error state not only when the GPU hangs but
also on other situations as in interrupt errors and
in situations where we can kick things forward without GPU reset.
There will be log entry on most of these cases. But as error state
capture might be only thing we have, if dmesg was not captured. Or as
in GEN4 case, interrupt error can trigger error state capture without log
entry, the exact reason why capture was made is hard to decipher.

To avoid confusion why the error state was captured, print reason
along with error code into log and also store it into the error state.

References: https://bugs.freedesktop.org/show_bug.cgi?id=74193
Signed-off-by: Mika Kuoppala <mika.kuoppala at intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c   |    5 +-
 drivers/gpu/drm/i915/i915_drv.h       |    9 +++-
 drivers/gpu/drm/i915/i915_gpu_error.c |   84 +++++++++++++++++++++++----------
 drivers/gpu/drm/i915/i915_irq.c       |   45 +++++++++++-------
 4 files changed, 98 insertions(+), 45 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 2dc05c3..175e524 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -3006,9 +3006,8 @@ i915_wedged_set(void *data, u64 val)
 {
 	struct drm_device *dev = data;
 
-	DRM_INFO("Manually setting wedged to %llu\n", val);
-	i915_handle_error(dev, val);
-
+	i915_handle_error(dev, val,
+			  "Manually setting wedged to %llu", val);
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 2a61a29..ad41c71 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -299,6 +299,8 @@ struct drm_i915_error_state {
 	struct kref ref;
 	struct timeval time;
 
+	char error_msg[128];
+
 	/* Generic register state */
 	u32 eir;
 	u32 pgtbl_er;
@@ -1994,7 +1996,9 @@ extern void intel_console_resume(struct work_struct *work);
 
 /* i915_irq.c */
 void i915_queue_hangcheck(struct drm_device *dev);
-void i915_handle_error(struct drm_device *dev, bool wedged);
+__printf(3, 4)
+void i915_handle_error(struct drm_device *dev, bool wedged,
+		       const char *fmt, ...);
 
 void gen6_set_pm_mask(struct drm_i915_private *dev_priv, u32 pm_iir,
 							int new_delay);
@@ -2464,7 +2468,8 @@ static inline void i915_error_state_buf_release(
 {
 	kfree(eb->buf);
 }
-void i915_capture_error_state(struct drm_device *dev);
+void i915_capture_error_state(struct drm_device *dev, bool wedge,
+			      const char *error_msg);
 void i915_error_state_get(struct drm_device *dev,
 			  struct i915_error_state_file_priv *error_priv);
 void i915_error_state_put(struct i915_error_state_file_priv *error_priv);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index a90971a..dce569b 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -329,6 +329,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 		goto out;
 	}
 
+	err_printf(m, "%s\n", error->error_msg);
 	err_printf(m, "Time: %ld s %ld us\n", error->time.tv_sec,
 		   error->time.tv_usec);
 	err_printf(m, "Kernel: " UTS_RELEASE "\n");
@@ -686,7 +687,8 @@ static u32 capture_pinned_bo(struct drm_i915_error_buffer *err,
  * It's only a small step better than a random number in its current form.
  */
 static uint32_t i915_error_generate_code(struct drm_i915_private *dev_priv,
-					 struct drm_i915_error_state *error)
+					 struct drm_i915_error_state *error,
+					 int *ring_id)
 {
 	uint32_t error_code = 0;
 	int i;
@@ -696,9 +698,14 @@ static uint32_t i915_error_generate_code(struct drm_i915_private *dev_priv,
 	 * synchronization commands which almost always appear in the case
 	 * strictly a client bug. Use instdone to differentiate those some.
 	 */
-	for (i = 0; i < I915_NUM_RINGS; i++)
-		if (error->ring[i].hangcheck_action == HANGCHECK_HUNG)
+	for (i = 0; i < I915_NUM_RINGS; i++) {
+		if (error->ring[i].hangcheck_action == HANGCHECK_HUNG) {
+			if (ring_id)
+				*ring_id = i;
+
 			return error->ring[i].ipehr ^ error->ring[i].instdone;
+		}
+	}
 
 	return error_code;
 }
@@ -1075,6 +1082,39 @@ static void i915_capture_reg_state(struct drm_i915_private *dev_priv,
 	i915_get_extra_instdone(dev, error->extra_instdone);
 }
 
+static void i915_error_generate_msg(struct drm_device *dev,
+				    struct drm_i915_error_state *error,
+				    bool wedge,
+				    const char *error_msg)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	u32 ecode;
+	int ring_id = -1, len;
+
+	ecode = i915_error_generate_code(dev_priv, error, &ring_id);
+
+	len = scnprintf(error->error_msg, sizeof(error->error_msg),
+			"GPU HANG: ecode %d:0x%08x", ring_id, ecode);
+
+	if (ring_id != -1 && error->ring[ring_id].pid != -1)
+			len += scnprintf(error->error_msg + len,
+					 sizeof(error->error_msg) - len,
+					 ", in %s [%d]",
+					 error->ring[ring_id].comm,
+					 error->ring[ring_id].pid);
+
+	if (error_msg)
+		len += scnprintf(error->error_msg + len,
+				 sizeof(error->error_msg) - len,
+				 ", reason: %s",
+				 error_msg);
+
+	if (wedge)
+		len += scnprintf(error->error_msg + len,
+				 sizeof(error->error_msg) - len,
+				 ", resetting GPU");
+}
+
 /**
  * i915_capture_error_state - capture an error record for later analysis
  * @dev: drm device
@@ -1084,19 +1124,13 @@ static void i915_capture_reg_state(struct drm_i915_private *dev_priv,
  * out a structure which becomes available in debugfs for user level tools
  * to pick up.
  */
-void i915_capture_error_state(struct drm_device *dev)
+void i915_capture_error_state(struct drm_device *dev, bool wedged,
+			      const char *error_msg)
 {
 	static bool warned;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_i915_error_state *error;
 	unsigned long flags;
-	uint32_t ecode;
-
-	spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
-	error = dev_priv->gpu_error.first_error;
-	spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags);
-	if (error)
-		return;
 
 	/* Account for pipe specific data like PIPE*STAT */
 	error = kzalloc(sizeof(*error), GFP_ATOMIC);
@@ -1105,30 +1139,21 @@ void i915_capture_error_state(struct drm_device *dev)
 		return;
 	}
 
-	DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n",
-		 dev->primary->index);
 	kref_init(&error->ref);
 
 	i915_capture_reg_state(dev_priv, error);
 	i915_gem_capture_buffers(dev_priv, error);
 	i915_gem_record_fences(dev, error);
 	i915_gem_record_rings(dev, error);
-	ecode = i915_error_generate_code(dev_priv, error);
-
-	if (!warned) {
-		DRM_INFO("GPU HANG [%x]\n", ecode);
-		DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
-		DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n");
-		DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
-		DRM_INFO("The gpu crash dump is required to analyze gpu hangs, so please always attach it.\n");
-		warned = true;
-	}
 
 	do_gettimeofday(&error->time);
 
 	error->overlay = intel_overlay_capture_error_state(dev);
 	error->display = intel_display_capture_error_state(dev);
 
+	i915_error_generate_msg(dev, error, wedged, error_msg);
+	DRM_INFO("%s\n", error->error_msg);
+
 	spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
 	if (dev_priv->gpu_error.first_error == NULL) {
 		dev_priv->gpu_error.first_error = error;
@@ -1136,8 +1161,19 @@ void i915_capture_error_state(struct drm_device *dev)
 	}
 	spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags);
 
-	if (error)
+	if (error) {
 		i915_error_state_free(&error->ref);
+		return;
+	}
+
+	if (!warned) {
+		DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
+		DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n");
+		DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
+		DRM_INFO("The gpu crash dump is required to analyze gpu hangs, so please always attach it.\n");
+		DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n", dev->primary->index);
+		warned = true;
+	}
 }
 
 void i915_error_state_get(struct drm_device *dev,
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index d4defd8..36bba16 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1223,8 +1223,7 @@ static void snb_gt_irq_handler(struct drm_device *dev,
 	if (gt_iir & (GT_BLT_CS_ERROR_INTERRUPT |
 		      GT_BSD_CS_ERROR_INTERRUPT |
 		      GT_RENDER_CS_MASTER_ERROR_INTERRUPT)) {
-		DRM_ERROR("GT error interrupt 0x%08x\n", gt_iir);
-		i915_handle_error(dev, false);
+		i915_handle_error(dev, false, "GT error interrupt 0x%08x", gt_iir);
 	}
 
 	if (gt_iir & GT_PARITY_ERROR(dev))
@@ -1471,8 +1470,9 @@ static void gen6_rps_irq_handler(struct drm_i915_private *dev_priv, u32 pm_iir)
 			notify_ring(dev_priv->dev, &dev_priv->ring[VECS]);
 
 		if (pm_iir & PM_VEBOX_CS_ERROR_INTERRUPT) {
-			DRM_ERROR("VEBOX CS error interrupt 0x%08x\n", pm_iir);
-			i915_handle_error(dev_priv->dev, false);
+			i915_handle_error(dev_priv->dev, false,
+					  "VEBOX CS error interrupt 0x%08x",
+					  pm_iir);
 		}
 	}
 }
@@ -2174,11 +2174,18 @@ static void i915_report_and_clear_eir(struct drm_device *dev)
  * so userspace knows something bad happened (should trigger collection
  * of a ring dump etc.).
  */
-void i915_handle_error(struct drm_device *dev, bool wedged)
+void i915_handle_error(struct drm_device *dev, bool wedged,
+		       const char *fmt, ...)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
+	va_list args;
+	char error_msg[80];
 
-	i915_capture_error_state(dev);
+	va_start(args, fmt);
+	vscnprintf(error_msg, sizeof(error_msg), fmt, args);
+	va_end(args);
+
+	i915_capture_error_state(dev, wedged, error_msg);
 	i915_report_and_clear_eir(dev);
 
 	if (wedged) {
@@ -2481,9 +2488,9 @@ ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
 	 */
 	tmp = I915_READ_CTL(ring);
 	if (tmp & RING_WAIT) {
-		DRM_ERROR("Kicking stuck wait on %s\n",
-			  ring->name);
-		i915_handle_error(dev, false);
+		i915_handle_error(dev, false,
+				  "Kicking stuck wait on %s",
+				  ring->name);
 		I915_WRITE_CTL(ring, tmp);
 		return HANGCHECK_KICK;
 	}
@@ -2493,9 +2500,9 @@ ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
 		default:
 			return HANGCHECK_HUNG;
 		case 1:
-			DRM_ERROR("Kicking stuck semaphore on %s\n",
-				  ring->name);
-			i915_handle_error(dev, false);
+			i915_handle_error(dev, false,
+					  "Kicking stuck semaphore on %s",
+					  ring->name);
 			I915_WRITE_CTL(ring, tmp);
 			return HANGCHECK_KICK;
 		case 0:
@@ -2617,7 +2624,7 @@ static void i915_hangcheck_elapsed(unsigned long data)
 	}
 
 	if (rings_hung)
-		return i915_handle_error(dev, true);
+		return i915_handle_error(dev, true, "Ring hung");
 
 	if (busy_count)
 		/* Reset timer case chip hangs without another request
@@ -3234,7 +3241,9 @@ static irqreturn_t i8xx_irq_handler(int irq, void *arg)
 		 */
 		spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
 		if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
-			i915_handle_error(dev, false);
+			i915_handle_error(dev, false,
+					  "Command parser error, iir 0x%08x",
+					  iir);
 
 		for_each_pipe(pipe) {
 			int reg = PIPESTAT(pipe);
@@ -3416,7 +3425,9 @@ static irqreturn_t i915_irq_handler(int irq, void *arg)
 		 */
 		spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
 		if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
-			i915_handle_error(dev, false);
+			i915_handle_error(dev, false,
+					  "Command parser error, iir 0x%08x",
+					  iir);
 
 		for_each_pipe(pipe) {
 			int reg = PIPESTAT(pipe);
@@ -3653,7 +3664,9 @@ static irqreturn_t i965_irq_handler(int irq, void *arg)
 		 */
 		spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
 		if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
-			i915_handle_error(dev, false);
+			i915_handle_error(dev, false,
+					  "Command parser error, iir 0x%08x",
+					  iir);
 
 		for_each_pipe(pipe) {
 			int reg = PIPESTAT(pipe);
-- 
1.7.9.5




More information about the Intel-gfx mailing list