[Intel-gfx] [PATCH 3/3] drm/i915: Record error state capture reason
Ben Widawsky
ben at bwidawsk.net
Wed Feb 12 03:32:24 CET 2014
On Mon, Feb 10, 2014 at 04:30:51PM +0200, Mika Kuoppala wrote:
> We capture error state not only when the GPU hangs but
> also on other situations as in interrupt errors and
> in situations where we can kick things forward without GPU reset.
> There will be log entry on most of these cases. But as error state
> capture might be only thing we have, if dmesg was not captured. Or as
> in GEN4 case, interrupt error can trigger error state capture without log
> entry, the exact reason why capture was made is hard to decipher.
>
> To avoid confusion why the error state was captured, print reason
> along with error code into log and also store it into the error state.
>
I could have done with a better commit subject and message. We already
capture error state, so "Record error state capture" is a bit weird.
Also, I think the commit message glosses over the main point of why
these cases are special as opposed to hangcheck.
> References: https://bugs.freedesktop.org/show_bug.cgi?id=74193
> Signed-off-by: Mika Kuoppala <mika.kuoppala at intel.com>
> ---
> drivers/gpu/drm/i915/i915_debugfs.c | 5 +-
> drivers/gpu/drm/i915/i915_drv.h | 9 +++-
> drivers/gpu/drm/i915/i915_gpu_error.c | 84 +++++++++++++++++++++++----------
> drivers/gpu/drm/i915/i915_irq.c | 45 +++++++++++-------
> 4 files changed, 98 insertions(+), 45 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 2dc05c3..175e524 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -3006,9 +3006,8 @@ i915_wedged_set(void *data, u64 val)
> {
> struct drm_device *dev = data;
>
> - DRM_INFO("Manually setting wedged to %llu\n", val);
> - i915_handle_error(dev, val);
> -
> + i915_handle_error(dev, val,
> + "Manually setting wedged to %llu", val);
> return 0;
> }
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 2a61a29..ad41c71 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -299,6 +299,8 @@ struct drm_i915_error_state {
> struct kref ref;
> struct timeval time;
>
> + char error_msg[128];
> +
> /* Generic register state */
> u32 eir;
> u32 pgtbl_er;
> @@ -1994,7 +1996,9 @@ extern void intel_console_resume(struct work_struct *work);
>
> /* i915_irq.c */
> void i915_queue_hangcheck(struct drm_device *dev);
> -void i915_handle_error(struct drm_device *dev, bool wedged);
> +__printf(3, 4)
> +void i915_handle_error(struct drm_device *dev, bool wedged,
> + const char *fmt, ...);
>
> void gen6_set_pm_mask(struct drm_i915_private *dev_priv, u32 pm_iir,
> int new_delay);
> @@ -2464,7 +2468,8 @@ static inline void i915_error_state_buf_release(
> {
> kfree(eb->buf);
> }
> -void i915_capture_error_state(struct drm_device *dev);
> +void i915_capture_error_state(struct drm_device *dev, bool wedge,
> + const char *error_msg);
> void i915_error_state_get(struct drm_device *dev,
> struct i915_error_state_file_priv *error_priv);
> void i915_error_state_put(struct i915_error_state_file_priv *error_priv);
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index a90971a..dce569b 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -329,6 +329,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
> goto out;
> }
>
> + err_printf(m, "%s\n", error->error_msg);
> err_printf(m, "Time: %ld s %ld us\n", error->time.tv_sec,
> error->time.tv_usec);
> err_printf(m, "Kernel: " UTS_RELEASE "\n");
> @@ -686,7 +687,8 @@ static u32 capture_pinned_bo(struct drm_i915_error_buffer *err,
> * It's only a small step better than a random number in its current form.
> */
> static uint32_t i915_error_generate_code(struct drm_i915_private *dev_priv,
> - struct drm_i915_error_state *error)
> + struct drm_i915_error_state *error,
> + int *ring_id)
> {
> uint32_t error_code = 0;
> int i;
> @@ -696,9 +698,14 @@ static uint32_t i915_error_generate_code(struct drm_i915_private *dev_priv,
> * synchronization commands which almost always appear in the case
> * strictly a client bug. Use instdone to differentiate those some.
> */
> - for (i = 0; i < I915_NUM_RINGS; i++)
> - if (error->ring[i].hangcheck_action == HANGCHECK_HUNG)
> + for (i = 0; i < I915_NUM_RINGS; i++) {
> + if (error->ring[i].hangcheck_action == HANGCHECK_HUNG) {
> + if (ring_id)
> + *ring_id = i;
> +
> return error->ring[i].ipehr ^ error->ring[i].instdone;
> + }
> + }
>
> return error_code;
> }
> @@ -1075,6 +1082,39 @@ static void i915_capture_reg_state(struct drm_i915_private *dev_priv,
> i915_get_extra_instdone(dev, error->extra_instdone);
> }
>
> +static void i915_error_generate_msg(struct drm_device *dev,
> + struct drm_i915_error_state *error,
> + bool wedge,
> + const char *error_msg)
> +{
> + struct drm_i915_private *dev_priv = dev->dev_private;
> + u32 ecode;
> + int ring_id = -1, len;
> +
> + ecode = i915_error_generate_code(dev_priv, error, &ring_id);
> +
> + len = scnprintf(error->error_msg, sizeof(error->error_msg),
> + "GPU HANG: ecode %d:0x%08x", ring_id, ecode);
> +
> + if (ring_id != -1 && error->ring[ring_id].pid != -1)
> + len += scnprintf(error->error_msg + len,
> + sizeof(error->error_msg) - len,
> + ", in %s [%d]",
> + error->ring[ring_id].comm,
> + error->ring[ring_id].pid);
> +
> + if (error_msg)
I'd drop this and just always have an error_msg. But whatever you want
is fine.
> + len += scnprintf(error->error_msg + len,
> + sizeof(error->error_msg) - len,
> + ", reason: %s",
> + error_msg);
> +
> + if (wedge)
> + len += scnprintf(error->error_msg + len,
> + sizeof(error->error_msg) - len,
> + ", resetting GPU");
I also don't think the fact that we're resetting the GPU belongs here
(for instance, if it's disabled via modparam). On the other hand, I'd be
in favor of a taint on GPU reset for all future error states.
> +}
> +
> /**
> * i915_capture_error_state - capture an error record for later analysis
> * @dev: drm device
> @@ -1084,19 +1124,13 @@ static void i915_capture_reg_state(struct drm_i915_private *dev_priv,
> * out a structure which becomes available in debugfs for user level tools
> * to pick up.
> */
> -void i915_capture_error_state(struct drm_device *dev)
> +void i915_capture_error_state(struct drm_device *dev, bool wedged,
> + const char *error_msg)
> {
> static bool warned;
> struct drm_i915_private *dev_priv = dev->dev_private;
> struct drm_i915_error_state *error;
> unsigned long flags;
> - uint32_t ecode;
> -
> - spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
> - error = dev_priv->gpu_error.first_error;
> - spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags);
> - if (error)
> - return;
This seems like it should be a separate patch, or else I'm not clear why
you've introduced this change in this series. It doesn't seem to be
related to what the rest of the patch does.
>
> /* Account for pipe specific data like PIPE*STAT */
> error = kzalloc(sizeof(*error), GFP_ATOMIC);
> @@ -1105,30 +1139,21 @@ void i915_capture_error_state(struct drm_device *dev)
> return;
> }
>
> - DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n",
> - dev->primary->index);
> kref_init(&error->ref);
>
> i915_capture_reg_state(dev_priv, error);
> i915_gem_capture_buffers(dev_priv, error);
> i915_gem_record_fences(dev, error);
> i915_gem_record_rings(dev, error);
> - ecode = i915_error_generate_code(dev_priv, error);
> -
> - if (!warned) {
> - DRM_INFO("GPU HANG [%x]\n", ecode);
> - DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
> - DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n");
> - DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
> - DRM_INFO("The gpu crash dump is required to analyze gpu hangs, so please always attach it.\n");
> - warned = true;
> - }
>
> do_gettimeofday(&error->time);
>
> error->overlay = intel_overlay_capture_error_state(dev);
> error->display = intel_display_capture_error_state(dev);
>
> + i915_error_generate_msg(dev, error, wedged, error_msg);
> + DRM_INFO("%s\n", error->error_msg);
> +
> spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
> if (dev_priv->gpu_error.first_error == NULL) {
> dev_priv->gpu_error.first_error = error;
> @@ -1136,8 +1161,19 @@ void i915_capture_error_state(struct drm_device *dev)
> }
> spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags);
>
> - if (error)
> + if (error) {
> i915_error_state_free(&error->ref);
> + return;
> + }
Same comment as above.
> +
> + if (!warned) {
> + DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
> + DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n");
> + DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
> + DRM_INFO("The gpu crash dump is required to analyze gpu hangs, so please always attach it.\n");
> + DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n", dev->primary->index);
> + warned = true;
> + }
> }
>
> void i915_error_state_get(struct drm_device *dev,
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index d4defd8..36bba16 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1223,8 +1223,7 @@ static void snb_gt_irq_handler(struct drm_device *dev,
> if (gt_iir & (GT_BLT_CS_ERROR_INTERRUPT |
> GT_BSD_CS_ERROR_INTERRUPT |
> GT_RENDER_CS_MASTER_ERROR_INTERRUPT)) {
> - DRM_ERROR("GT error interrupt 0x%08x\n", gt_iir);
> - i915_handle_error(dev, false);
> + i915_handle_error(dev, false, "GT error interrupt 0x%08x", gt_iir);
> }
>
> if (gt_iir & GT_PARITY_ERROR(dev))
> @@ -1471,8 +1470,9 @@ static void gen6_rps_irq_handler(struct drm_i915_private *dev_priv, u32 pm_iir)
> notify_ring(dev_priv->dev, &dev_priv->ring[VECS]);
>
> if (pm_iir & PM_VEBOX_CS_ERROR_INTERRUPT) {
> - DRM_ERROR("VEBOX CS error interrupt 0x%08x\n", pm_iir);
> - i915_handle_error(dev_priv->dev, false);
> + i915_handle_error(dev_priv->dev, false,
> + "VEBOX CS error interrupt 0x%08x",
> + pm_iir);
> }
> }
> }
> @@ -2174,11 +2174,18 @@ static void i915_report_and_clear_eir(struct drm_device *dev)
> * so userspace knows something bad happened (should trigger collection
> * of a ring dump etc.).
> */
> -void i915_handle_error(struct drm_device *dev, bool wedged)
> +void i915_handle_error(struct drm_device *dev, bool wedged,
> + const char *fmt, ...)
> {
> struct drm_i915_private *dev_priv = dev->dev_private;
> + va_list args;
> + char error_msg[80];
>
> - i915_capture_error_state(dev);
> + va_start(args, fmt);
> + vscnprintf(error_msg, sizeof(error_msg), fmt, args);
> + va_end(args);
> +
> + i915_capture_error_state(dev, wedged, error_msg);
As I said above regarding always having an error_msg, I'd be in favor of
moving most of this to capture error state, and just passing along the
string - but whatever you want is fine with me.
> i915_report_and_clear_eir(dev);
>
> if (wedged) {
> @@ -2481,9 +2488,9 @@ ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
> */
> tmp = I915_READ_CTL(ring);
> if (tmp & RING_WAIT) {
> - DRM_ERROR("Kicking stuck wait on %s\n",
> - ring->name);
> - i915_handle_error(dev, false);
> + i915_handle_error(dev, false,
> + "Kicking stuck wait on %s",
> + ring->name);
> I915_WRITE_CTL(ring, tmp);
> return HANGCHECK_KICK;
> }
> @@ -2493,9 +2500,9 @@ ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
> default:
> return HANGCHECK_HUNG;
> case 1:
> - DRM_ERROR("Kicking stuck semaphore on %s\n",
> - ring->name);
> - i915_handle_error(dev, false);
> + i915_handle_error(dev, false,
> + "Kicking stuck semaphore on %s",
> + ring->name);
> I915_WRITE_CTL(ring, tmp);
> return HANGCHECK_KICK;
> case 0:
> @@ -2617,7 +2624,7 @@ static void i915_hangcheck_elapsed(unsigned long data)
> }
>
> if (rings_hung)
> - return i915_handle_error(dev, true);
> + return i915_handle_error(dev, true, "Ring hung");
>
> if (busy_count)
> /* Reset timer case chip hangs without another request
> @@ -3234,7 +3241,9 @@ static irqreturn_t i8xx_irq_handler(int irq, void *arg)
> */
> spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
> if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
> - i915_handle_error(dev, false);
> + i915_handle_error(dev, false,
> + "Command parser error, iir 0x%08x",
> + iir);
>
> for_each_pipe(pipe) {
> int reg = PIPESTAT(pipe);
> @@ -3416,7 +3425,9 @@ static irqreturn_t i915_irq_handler(int irq, void *arg)
> */
> spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
> if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
> - i915_handle_error(dev, false);
> + i915_handle_error(dev, false,
> + "Command parser error, iir 0x%08x",
> + iir);
>
> for_each_pipe(pipe) {
> int reg = PIPESTAT(pipe);
> @@ -3653,7 +3664,9 @@ static irqreturn_t i965_irq_handler(int irq, void *arg)
> */
> spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
> if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
> - i915_handle_error(dev, false);
> + i915_handle_error(dev, false,
> + "Command parser error, iir 0x%08x",
> + iir);
>
Since you're not directly using any of the DRM_ printers, we lose the
file/line. Would you care to add __FILE__/__LINE? I think it would be
helpful - not sure what others thing.
> for_each_pipe(pipe) {
> int reg = PIPESTAT(pipe);
> --
> 1.7.9.5
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
--
Ben Widawsky, Intel Open Source Technology Center
More information about the Intel-gfx
mailing list