[Intel-gfx] [PATCH] drm/i915: add hang detection and reset
Eric Anholt
eric at anholt.net
Wed Jun 24 01:06:04 CEST 2009
On Fri, 2009-06-19 at 13:08 -0700, Jesse Barnes wrote:
> This patch adds a hangcheck timer, which set set to a 5s timeout at
> each batch execution, and cleared when sequence number pass or when a
> leave VT event happens. It also adds infrastructure for saving &
> restoring just the display state, in the event we need to reset the
> display engine too (though that's currently unused).
>
> If the timer fires, the hangcheck function captures some error state
> and attempts a reset (on 965+ only thus far), and generates a uevent.
> It's enough to recover from the gem_hang test case, allowing me to
> start a fresh desktop after the hang has been detected and recovered
> from, which wasn't possible before.
>
> Wider testing would be much appreciated, with other types of hangs.
> And of course review is always welcome. I'm hoping this is safe to
> include in 2.6.31 since it shouldn't make things any worse than they
> are today.
So, 5s will be too short. The Mesa trispd demo I've seen take up to 15s
for a single batchbuffer. I'm thinking we'll need to track the
intra-batchbuffer head pointer when the timeout fires and reset if it's
moved.
> Signed-off-by: Jesse Barnes <jbarnes at virtuousgeek.org>
>
> diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> index 1029750..a8a8b44 100644
> --- a/drivers/gpu/drm/i915/i915_dma.c
> +++ b/drivers/gpu/drm/i915/i915_dma.c
> @@ -1014,6 +1014,9 @@ static int i915_load_modeset_init(struct drm_device *dev)
> /* Basic memrange allocator for stolen space (aka vram) */
> drm_mm_init(&dev_priv->vram, 0, prealloc_size);
>
> + /* We're off and running w/KMS */
> + dev_priv->mm.suspended = 0;
> +
> /* Let GEM Manage from end of prealloc space to end of aperture.
> *
> * However, leave one page at the end still bound to the scratch page.
> @@ -1025,7 +1028,9 @@ static int i915_load_modeset_init(struct drm_device *dev)
> */
> i915_gem_do_init(dev, prealloc_size, agp_size - 4096);
>
> + mutex_lock(&dev->struct_mutex);
> ret = i915_gem_init_ringbuffer(dev);
> + mutex_unlock(&dev->struct_mutex);
> if (ret)
> goto out;
>
> @@ -1202,6 +1207,9 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
> return ret;
> }
>
> + /* Start out suspended */
> + dev_priv->mm.suspended = 1;
> +
> if (drm_core_check_feature(dev, DRIVER_MODESET)) {
> ret = i915_load_modeset_init(dev);
> if (ret < 0) {
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 98560e1..4d5b907 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -119,6 +119,100 @@ static int i915_resume(struct drm_device *dev)
> return ret;
> }
>
> +/**
> + * i965_reset - reset chip after a hang
> + * @dev: drm device to reset
> + * @flags: reset domains
> + *
> + * Reset the chip. Useful if a hang is detected.
> + *
> + * Procedure is fairly simple:
> + * - reset the chip using the reset reg
> + * - re-init context state
> + * - re-init hardware status page
> + * - re-init ring buffer
> + * - re-init interrupt state
> + * - re-init display
> + */
> +void i965_reset(struct drm_device *dev, u8 flags)
> +{
> + drm_i915_private_t *dev_priv = dev->dev_private;
> + unsigned long timeout = jiffies + msecs_to_jiffies(500);
> + u8 gdrst;
> +
> + BUG_ON(mutex_is_locked(&dev->struct_mutex));
> +
> + if (flags & GDRST_FULL)
> + i915_save_display(dev);
> +
> + /*
> + * Set the reset bit, wait for reset, then clear it. Hardware
> + * will clear the status bit (bit 1) when it's actually ready
> + * for action again.
> + */
> + pci_read_config_byte(dev->pdev, GDRST, &gdrst);
> + pci_write_config_byte(dev->pdev, GDRST, gdrst | flags);
> + udelay(50);
> + pci_write_config_byte(dev->pdev, GDRST, gdrst & 0xfe);
> +
> + /* ...we don't want to loop forever though, 500ms should be plenty */
> + do {
> + udelay(100);
> + pci_read_config_byte(dev->pdev, GDRST, &gdrst);
> + } while ((gdrst & 2) && time_after(timeout, jiffies));
> +
> + /* Ok now get things going again... */
> +
> + /*
> + * Everything depends on having the GTT running, so we need to start
> + * there. Fortunately we don't need to do this unless we reset the
> + * chip at a PCI level.
> + *
> + * Next we need to restore the context, but we don't use those
> + * yet either...
> + *
> + * Ring buffer needs to be re-initialized in the KMS case, or if X
> + * was running at the time of the reset (i.e. we weren't VT
> + * switched away).
> + */
> + if (drm_core_check_feature(dev, DRIVER_MODESET) ||
> + !dev_priv->mm.suspended) {
> + drm_i915_ring_buffer_t *ring = &dev_priv->ring;
> + struct drm_gem_object *obj = ring->ring_obj;
> + struct drm_i915_gem_object *obj_priv = obj->driver_private;
> + dev_priv->mm.suspended = 0;
> +
> + /* Stop the ring if it's running. */
> + I915_WRITE(PRB0_CTL, 0);
> + I915_WRITE(PRB0_TAIL, 0);
> + I915_WRITE(PRB0_HEAD, 0);
> +
> + /* Initialize the ring. */
> + I915_WRITE(PRB0_START, obj_priv->gtt_offset);
> + I915_WRITE(PRB0_CTL,
> + ((obj->size - 4096) & RING_NR_PAGES) |
> + RING_NO_REPORT |
> + RING_VALID);
> + if (!drm_core_check_feature(dev, DRIVER_MODESET))
> + i915_kernel_lost_context(dev);
> + else {
> + ring->head = I915_READ(PRB0_HEAD) & HEAD_ADDR;
> + ring->tail = I915_READ(PRB0_TAIL) & TAIL_ADDR;
> + ring->space = ring->head - (ring->tail + 8);
> + if (ring->space < 0)
> + ring->space += ring->Size;
> + }
> +
> + drm_irq_install(dev);
> + }
> +
> + /*
> + * Display needs restore too...
> + */
> + if (flags & GDRST_FULL)
> + i915_restore_display(dev);
> +}
> +
> static int __devinit
> i915_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
> {
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 7756f78..c51ddb2 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -228,6 +228,7 @@ typedef struct drm_i915_private {
> spinlock_t error_lock;
> struct drm_i915_error_state *first_error;
> struct work_struct error_work;
> + struct timer_list hangcheck_timer;
>
> /* Register state */
> u8 saveLBB;
> @@ -557,6 +558,8 @@ extern struct drm_ioctl_desc i915_ioctls[];
> extern int i915_max_ioctl;
> extern unsigned int i915_fbpercrtc;
>
> +extern void i915_save_display(struct drm_device *dev);
> +extern void i915_restore_display(struct drm_device *dev);
> extern int i915_master_create(struct drm_device *dev, struct drm_master *master);
> extern void i915_master_destroy(struct drm_device *dev, struct drm_master *master);
>
> @@ -576,6 +579,7 @@ extern long i915_compat_ioctl(struct file *filp, unsigned int cmd,
> extern int i915_emit_box(struct drm_device *dev,
> struct drm_clip_rect *boxes,
> int i, int DR1, int DR4);
> +extern void i965_reset(struct drm_device *dev, u8 flags);
>
> /* i915_irq.c */
> extern int i915_irq_emit(struct drm_device *dev, void *data,
> @@ -607,7 +611,7 @@ i915_enable_pipestat(drm_i915_private_t *dev_priv, int pipe, u32 mask);
>
> void
> i915_disable_pipestat(drm_i915_private_t *dev_priv, int pipe, u32 mask);
> -
> +extern void i915_gem_hangcheck(unsigned long arg);
>
> /* i915_mem.c */
> extern int i915_mem_alloc(struct drm_device *dev, void *data,
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 174aef2..c74e29f 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1545,8 +1545,12 @@ i915_add_request(struct drm_device *dev, struct drm_file *file_priv,
>
> }
>
> - if (was_empty && !dev_priv->mm.suspended)
> - schedule_delayed_work(&dev_priv->mm.retire_work, HZ);
> + if (!dev_priv->mm.suspended) {
> + if (was_empty)
> + schedule_delayed_work(&dev_priv->mm.retire_work, HZ);
> +
> + mod_timer(&dev_priv->hangcheck_timer, jiffies + 5*DRM_HZ);
> + }
> return seqno;
> }
>
> @@ -3866,6 +3870,7 @@ i915_gem_idle(struct drm_device *dev)
> * We need to replace this with a semaphore, or something.
> */
> dev_priv->mm.suspended = 1;
> + del_timer(&dev_priv->hangcheck_timer);
>
> /* Cancel the retire work handler, wait for it to finish if running
> */
> @@ -4250,6 +4255,9 @@ i915_gem_load(struct drm_device *dev)
> dev_priv->num_fence_regs = 8;
>
> i915_gem_detect_bit_6_swizzle(dev);
> +
> + setup_timer(&dev_priv->hangcheck_timer, i915_gem_hangcheck,
> + (unsigned long)dev);
> }
>
> /*
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 39a2b40..df32ee8 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -298,6 +298,18 @@ static void i915_error_work_func(struct work_struct *work)
> DRM_DEBUG("generating error event\n");
>
> kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, envp);
> +
> + if (dev_priv->mm.wedged) {
> + printk(KERN_ERR "GPU hang detected...");
> + if (IS_I965G(dev)) {
> + printk("resetting...");
> + i965_reset(dev, GDRST_RENDER);
> + printk("done.\n");
> + dev_priv->mm.wedged = 0;
> + } else {
> + printk("reboot required\n");
> + }
> + }
> }
>
> /**
> @@ -475,6 +487,23 @@ static void i915_handle_error(struct drm_device *dev)
> schedule_work(&dev_priv->error_work);
> }
>
> +/**
> + * i915_gem_hangcheck - hang timer function
> + * @dev: drm device
> + *
> + * This function is called by the hangcheck timer if a batch doesn't finish
> + * within the timeout value. It should capture error state and try to reset
> + * the GPU.
> + */
> +void i915_gem_hangcheck(unsigned long arg)
> +{
> + struct drm_device *dev = (struct drm_device *)arg;
> + drm_i915_private_t *dev_priv = dev->dev_private;
> +
> + dev_priv->mm.wedged = 1;
> + i915_handle_error(dev);
> +}
> +
> irqreturn_t i915_driver_irq_handler(DRM_IRQ_ARGS)
> {
> struct drm_device *dev = (struct drm_device *) arg;
> @@ -566,6 +595,7 @@ irqreturn_t i915_driver_irq_handler(DRM_IRQ_ARGS)
> if (iir & I915_USER_INTERRUPT) {
> dev_priv->mm.irq_gem_seqno = i915_get_gem_seqno(dev);
> DRM_WAKEUP(&dev_priv->irq_queue);
> + del_timer(&dev_priv->hangcheck_timer);
> }
>
> if (pipea_stats & vblank_status) {
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index afb9835..2ffafbf 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -66,6 +66,10 @@
> #define GC_DISPLAY_CLOCK_333_MHZ (4 << 4)
> #define GC_DISPLAY_CLOCK_MASK (7 << 4)
> #define LBB 0xf4
> +#define GDRST 0xc0
> +#define GDRST_FULL (0<<2)
> +#define GDRST_RENDER (1<<2)
> +#define GDRST_MEDIA (3<<2)
>
> /* VGA stuff */
>
> diff --git a/drivers/gpu/drm/i915/i915_suspend.c b/drivers/gpu/drm/i915/i915_suspend.c
> index a98e283..cfb469c 100644
> --- a/drivers/gpu/drm/i915/i915_suspend.c
> +++ b/drivers/gpu/drm/i915/i915_suspend.c
> @@ -222,19 +222,15 @@ static void i915_restore_vga(struct drm_device *dev)
> I915_WRITE8(VGA_DACMASK, dev_priv->saveDACMASK);
> }
>
> -int i915_save_state(struct drm_device *dev)
> +/**
> + * i915_save_display - save display & mode info
> + * @dev: DRM device
> + *
> + * Save mode timings and display info.
> + */
> +void i915_save_display(struct drm_device *dev)
> {
> struct drm_i915_private *dev_priv = dev->dev_private;
> - int i;
> -
> - pci_read_config_byte(dev->pdev, LBB, &dev_priv->saveLBB);
> -
> - /* Render Standby */
> - if (IS_I965G(dev) && IS_MOBILE(dev))
> - dev_priv->saveRENDERSTANDBY = I915_READ(MCHBAR_RENDER_STANDBY);
> -
> - /* Hardware status page */
> - dev_priv->saveHWS = I915_READ(HWS_PGA);
>
> /* Display arbitration control */
> dev_priv->saveDSPARB = I915_READ(DSPARB);
> @@ -330,17 +326,36 @@ int i915_save_state(struct drm_device *dev)
> dev_priv->saveFBC_CONTROL2 = I915_READ(FBC_CONTROL2);
> dev_priv->saveFBC_CONTROL = I915_READ(FBC_CONTROL);
>
> - /* Interrupt state */
> - dev_priv->saveIIR = I915_READ(IIR);
> - dev_priv->saveIER = I915_READ(IER);
> - dev_priv->saveIMR = I915_READ(IMR);
> -
> /* VGA state */
> dev_priv->saveVGA0 = I915_READ(VGA0);
> dev_priv->saveVGA1 = I915_READ(VGA1);
> dev_priv->saveVGA_PD = I915_READ(VGA_PD);
> dev_priv->saveVGACNTRL = I915_READ(VGACNTRL);
>
> + i915_save_vga(dev);
> +}
> +
> +int i915_save_state(struct drm_device *dev)
> +{
> + struct drm_i915_private *dev_priv = dev->dev_private;
> + int i;
> +
> + pci_read_config_byte(dev->pdev, LBB, &dev_priv->saveLBB);
> +
> + /* Render Standby */
> + if (IS_I965G(dev) && IS_MOBILE(dev))
> + dev_priv->saveRENDERSTANDBY = I915_READ(MCHBAR_RENDER_STANDBY);
> +
> + /* Hardware status page */
> + dev_priv->saveHWS = I915_READ(HWS_PGA);
> +
> + i915_save_display(dev);
> +
> + /* Interrupt state */
> + dev_priv->saveIIR = I915_READ(IIR);
> + dev_priv->saveIER = I915_READ(IER);
> + dev_priv->saveIMR = I915_READ(IMR);
> +
> /* Clock gating state */
> dev_priv->saveD_STATE = I915_READ(D_STATE);
> dev_priv->saveCG_2D_DIS = I915_READ(CG_2D_DIS);
> @@ -371,25 +386,15 @@ int i915_save_state(struct drm_device *dev)
> for (i = 0; i < 8; i++)
> dev_priv->saveFENCE[i+8] = I915_READ(FENCE_REG_945_8 + (i * 4));
> }
> - i915_save_vga(dev);
>
> return 0;
> }
>
> -int i915_restore_state(struct drm_device *dev)
> +void i915_restore_display(struct drm_device *dev)
> {
> struct drm_i915_private *dev_priv = dev->dev_private;
> int i;
>
> - pci_write_config_byte(dev->pdev, LBB, dev_priv->saveLBB);
> -
> - /* Render Standby */
> - if (IS_I965G(dev) && IS_MOBILE(dev))
> - I915_WRITE(MCHBAR_RENDER_STANDBY, dev_priv->saveRENDERSTANDBY);
> -
> - /* Hardware status page */
> - I915_WRITE(HWS_PGA, dev_priv->saveHWS);
> -
> /* Display arbitration */
> I915_WRITE(DSPARB, dev_priv->saveDSPARB);
>
> @@ -533,6 +538,25 @@ int i915_restore_state(struct drm_device *dev)
> I915_WRITE(VGA_PD, dev_priv->saveVGA_PD);
> DRM_UDELAY(150);
>
> + i915_restore_vga(dev);
> +}
> +
> +int i915_restore_state(struct drm_device *dev)
> +{
> + struct drm_i915_private *dev_priv = dev->dev_private;
> + int i;
> +
> + pci_write_config_byte(dev->pdev, LBB, dev_priv->saveLBB);
> +
> + /* Render Standby */
> + if (IS_I965G(dev) && IS_MOBILE(dev))
> + I915_WRITE(MCHBAR_RENDER_STANDBY, dev_priv->saveRENDERSTANDBY);
> +
> + /* Hardware status page */
> + I915_WRITE(HWS_PGA, dev_priv->saveHWS);
> +
> + i915_restore_display(dev);
> +
> /* Clock gating state */
> I915_WRITE (D_STATE, dev_priv->saveD_STATE);
> I915_WRITE (CG_2D_DIS, dev_priv->saveCG_2D_DIS);
> @@ -550,8 +574,6 @@ int i915_restore_state(struct drm_device *dev)
> for (i = 0; i < 3; i++)
> I915_WRITE(SWF30 + (i << 2), dev_priv->saveSWF2[i]);
>
> - i915_restore_vga(dev);
> -
> return 0;
> }
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
--
Eric Anholt
eric at anholt.net eric.anholt at intel.com
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 197 bytes
Desc: This is a digitally signed message part
URL: <http://lists.freedesktop.org/archives/intel-gfx/attachments/20090623/df84cb0d/attachment.sig>
More information about the Intel-gfx
mailing list