[Intel-gfx] [PATCH] drm/i915: add hang detection and reset

Eric Anholt eric at anholt.net
Wed Jun 24 01:06:04 CEST 2009


On Fri, 2009-06-19 at 13:08 -0700, Jesse Barnes wrote:
> This patch adds a hangcheck timer, which set set to a 5s timeout at
> each batch execution, and cleared when sequence number pass or when a
> leave VT event happens.  It also adds infrastructure for saving &
> restoring just the display state, in the event we need to reset the
> display engine too (though that's currently unused).
> 
> If the timer fires, the hangcheck function captures some error state
> and attempts a reset (on 965+ only thus far), and generates a uevent.
> It's enough to recover from the gem_hang test case, allowing me to
> start a fresh desktop after the hang has been detected and recovered
> from, which wasn't possible before.
> 
> Wider testing would be much appreciated, with other types of hangs.
> And of course review is always welcome.  I'm hoping this is safe to
> include in 2.6.31 since it shouldn't make things any worse than they
> are today.

So, 5s will be too short.  The Mesa trispd demo I've seen take up to 15s
for a single batchbuffer.  I'm thinking we'll need to track the
intra-batchbuffer head pointer when the timeout fires and reset if it's
moved.

> Signed-off-by: Jesse Barnes <jbarnes at virtuousgeek.org>
> 
> diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> index 1029750..a8a8b44 100644
> --- a/drivers/gpu/drm/i915/i915_dma.c
> +++ b/drivers/gpu/drm/i915/i915_dma.c
> @@ -1014,6 +1014,9 @@ static int i915_load_modeset_init(struct drm_device *dev)
>  	/* Basic memrange allocator for stolen space (aka vram) */
>  	drm_mm_init(&dev_priv->vram, 0, prealloc_size);
>  
> +	/* We're off and running w/KMS */
> +	dev_priv->mm.suspended = 0;
> +
>  	/* Let GEM Manage from end of prealloc space to end of aperture.
>  	 *
>  	 * However, leave one page at the end still bound to the scratch page.
> @@ -1025,7 +1028,9 @@ static int i915_load_modeset_init(struct drm_device *dev)
>  	 */
>  	i915_gem_do_init(dev, prealloc_size, agp_size - 4096);
>  
> +	mutex_lock(&dev->struct_mutex);
>  	ret = i915_gem_init_ringbuffer(dev);
> +	mutex_unlock(&dev->struct_mutex);
>  	if (ret)
>  		goto out;
>  
> @@ -1202,6 +1207,9 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
>  		return ret;
>  	}
>  
> +	/* Start out suspended */
> +	dev_priv->mm.suspended = 1;
> +
>  	if (drm_core_check_feature(dev, DRIVER_MODESET)) {
>  		ret = i915_load_modeset_init(dev);
>  		if (ret < 0) {
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 98560e1..4d5b907 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -119,6 +119,100 @@ static int i915_resume(struct drm_device *dev)
>  	return ret;
>  }
>  
> +/**
> + * i965_reset - reset chip after a hang
> + * @dev: drm device to reset
> + * @flags: reset domains
> + *
> + * Reset the chip.  Useful if a hang is detected.
> + *
> + * Procedure is fairly simple:
> + *   - reset the chip using the reset reg
> + *   - re-init context state
> + *   - re-init hardware status page
> + *   - re-init ring buffer
> + *   - re-init interrupt state
> + *   - re-init display
> + */
> +void i965_reset(struct drm_device *dev, u8 flags)
> +{
> +	drm_i915_private_t *dev_priv = dev->dev_private;
> +	unsigned long timeout = jiffies + msecs_to_jiffies(500);
> +	u8 gdrst;
> +
> +	BUG_ON(mutex_is_locked(&dev->struct_mutex));
> +
> +	if (flags & GDRST_FULL)
> +		i915_save_display(dev);
> +
> +	/*
> +	 * Set the reset bit, wait for reset, then clear it.  Hardware
> +	 * will clear the status bit (bit 1) when it's actually ready
> +	 * for action again.
> +	 */
> +	pci_read_config_byte(dev->pdev, GDRST, &gdrst);
> +	pci_write_config_byte(dev->pdev, GDRST, gdrst | flags);
> +	udelay(50);
> +	pci_write_config_byte(dev->pdev, GDRST, gdrst & 0xfe);
> +
> +	/* ...we don't want to loop forever though, 500ms should be plenty */
> +	do {
> +		udelay(100);
> +		pci_read_config_byte(dev->pdev, GDRST, &gdrst);
> +	} while ((gdrst & 2) && time_after(timeout, jiffies));
> +
> +	/* Ok now get things going again... */
> +
> +	/*
> +	 * Everything depends on having the GTT running, so we need to start
> +	 * there.  Fortunately we don't need to do this unless we reset the
> +	 * chip at a PCI level.
> +	 *
> +	 * Next we need to restore the context, but we don't use those
> +	 * yet either...
> +	 *
> +	 * Ring buffer needs to be re-initialized in the KMS case, or if X
> +	 * was running at the time of the reset (i.e. we weren't VT
> +	 * switched away).
> +	 */
> +	if (drm_core_check_feature(dev, DRIVER_MODESET) ||
> +	    !dev_priv->mm.suspended) {
> +		drm_i915_ring_buffer_t *ring = &dev_priv->ring;
> +		struct drm_gem_object *obj = ring->ring_obj;
> +		struct drm_i915_gem_object *obj_priv = obj->driver_private;
> +		dev_priv->mm.suspended = 0;
> +
> +		/* Stop the ring if it's running. */
> +		I915_WRITE(PRB0_CTL, 0);
> +		I915_WRITE(PRB0_TAIL, 0);
> +		I915_WRITE(PRB0_HEAD, 0);
> +
> +		/* Initialize the ring. */
> +		I915_WRITE(PRB0_START, obj_priv->gtt_offset);
> +		I915_WRITE(PRB0_CTL,
> +			   ((obj->size - 4096) & RING_NR_PAGES) |
> +			   RING_NO_REPORT |
> +			   RING_VALID);
> +		if (!drm_core_check_feature(dev, DRIVER_MODESET))
> +			i915_kernel_lost_context(dev);
> +		else {
> +			ring->head = I915_READ(PRB0_HEAD) & HEAD_ADDR;
> +			ring->tail = I915_READ(PRB0_TAIL) & TAIL_ADDR;
> +			ring->space = ring->head - (ring->tail + 8);
> +			if (ring->space < 0)
> +				ring->space += ring->Size;
> +		}
> +
> +		drm_irq_install(dev);
> +	}
> +
> +	/*
> +	 * Display needs restore too...
> +	 */
> +	if (flags & GDRST_FULL)
> +		i915_restore_display(dev);
> +}
> +
>  static int __devinit
>  i915_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
>  {
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 7756f78..c51ddb2 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -228,6 +228,7 @@ typedef struct drm_i915_private {
>  	spinlock_t error_lock;
>  	struct drm_i915_error_state *first_error;
>  	struct work_struct error_work;
> +	struct timer_list hangcheck_timer;
>  
>  	/* Register state */
>  	u8 saveLBB;
> @@ -557,6 +558,8 @@ extern struct drm_ioctl_desc i915_ioctls[];
>  extern int i915_max_ioctl;
>  extern unsigned int i915_fbpercrtc;
>  
> +extern void i915_save_display(struct drm_device *dev);
> +extern void i915_restore_display(struct drm_device *dev);
>  extern int i915_master_create(struct drm_device *dev, struct drm_master *master);
>  extern void i915_master_destroy(struct drm_device *dev, struct drm_master *master);
>  
> @@ -576,6 +579,7 @@ extern long i915_compat_ioctl(struct file *filp, unsigned int cmd,
>  extern int i915_emit_box(struct drm_device *dev,
>  			 struct drm_clip_rect *boxes,
>  			 int i, int DR1, int DR4);
> +extern void i965_reset(struct drm_device *dev, u8 flags);
>  
>  /* i915_irq.c */
>  extern int i915_irq_emit(struct drm_device *dev, void *data,
> @@ -607,7 +611,7 @@ i915_enable_pipestat(drm_i915_private_t *dev_priv, int pipe, u32 mask);
>  
>  void
>  i915_disable_pipestat(drm_i915_private_t *dev_priv, int pipe, u32 mask);
> -
> +extern void i915_gem_hangcheck(unsigned long arg);
>  
>  /* i915_mem.c */
>  extern int i915_mem_alloc(struct drm_device *dev, void *data,
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 174aef2..c74e29f 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1545,8 +1545,12 @@ i915_add_request(struct drm_device *dev, struct drm_file *file_priv,
>  
>  	}
>  
> -	if (was_empty && !dev_priv->mm.suspended)
> -		schedule_delayed_work(&dev_priv->mm.retire_work, HZ);
> +	if (!dev_priv->mm.suspended) {
> +		if (was_empty)
> +		    schedule_delayed_work(&dev_priv->mm.retire_work, HZ);
> +
> +		mod_timer(&dev_priv->hangcheck_timer, jiffies + 5*DRM_HZ);
> +	}
>  	return seqno;
>  }
>  
> @@ -3866,6 +3870,7 @@ i915_gem_idle(struct drm_device *dev)
>  	 * We need to replace this with a semaphore, or something.
>  	 */
>  	dev_priv->mm.suspended = 1;
> +	del_timer(&dev_priv->hangcheck_timer);
>  
>  	/* Cancel the retire work handler, wait for it to finish if running
>  	 */
> @@ -4250,6 +4255,9 @@ i915_gem_load(struct drm_device *dev)
>  		dev_priv->num_fence_regs = 8;
>  
>  	i915_gem_detect_bit_6_swizzle(dev);
> +
> +	setup_timer(&dev_priv->hangcheck_timer, i915_gem_hangcheck,
> +		    (unsigned long)dev);
>  }
>  
>  /*
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 39a2b40..df32ee8 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -298,6 +298,18 @@ static void i915_error_work_func(struct work_struct *work)
>  	DRM_DEBUG("generating error event\n");
>  
>  	kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, envp);
> +
> +	if (dev_priv->mm.wedged) {
> +		printk(KERN_ERR "GPU hang detected...");
> +		if (IS_I965G(dev)) {
> +			printk("resetting...");
> +			i965_reset(dev, GDRST_RENDER);
> +			printk("done.\n");
> +			dev_priv->mm.wedged = 0;
> +		} else {
> +			printk("reboot required\n");
> +		}
> +	}
>  }
>  
>  /**
> @@ -475,6 +487,23 @@ static void i915_handle_error(struct drm_device *dev)
>  	schedule_work(&dev_priv->error_work);
>  }
>  
> +/**
> + * i915_gem_hangcheck - hang timer function
> + * @dev: drm device
> + *
> + * This function is called by the hangcheck timer if a batch doesn't finish
> + * within the timeout value.  It should capture error state and try to reset
> + * the GPU.
> + */
> +void i915_gem_hangcheck(unsigned long arg)
> +{
> +	struct drm_device *dev = (struct drm_device *)arg;
> +	drm_i915_private_t *dev_priv = dev->dev_private;
> +
> +	dev_priv->mm.wedged = 1;
> +	i915_handle_error(dev);
> +}
> +
>  irqreturn_t i915_driver_irq_handler(DRM_IRQ_ARGS)
>  {
>  	struct drm_device *dev = (struct drm_device *) arg;
> @@ -566,6 +595,7 @@ irqreturn_t i915_driver_irq_handler(DRM_IRQ_ARGS)
>  		if (iir & I915_USER_INTERRUPT) {
>  			dev_priv->mm.irq_gem_seqno = i915_get_gem_seqno(dev);
>  			DRM_WAKEUP(&dev_priv->irq_queue);
> +			del_timer(&dev_priv->hangcheck_timer);
>  		}
>  
>  		if (pipea_stats & vblank_status) {
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index afb9835..2ffafbf 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -66,6 +66,10 @@
>  #define   GC_DISPLAY_CLOCK_333_MHZ	(4 << 4)
>  #define   GC_DISPLAY_CLOCK_MASK		(7 << 4)
>  #define LBB	0xf4
> +#define GDRST 0xc0
> +#define  GDRST_FULL	(0<<2)
> +#define  GDRST_RENDER	(1<<2)
> +#define  GDRST_MEDIA	(3<<2)
>  
>  /* VGA stuff */
>  
> diff --git a/drivers/gpu/drm/i915/i915_suspend.c b/drivers/gpu/drm/i915/i915_suspend.c
> index a98e283..cfb469c 100644
> --- a/drivers/gpu/drm/i915/i915_suspend.c
> +++ b/drivers/gpu/drm/i915/i915_suspend.c
> @@ -222,19 +222,15 @@ static void i915_restore_vga(struct drm_device *dev)
>  	I915_WRITE8(VGA_DACMASK, dev_priv->saveDACMASK);
>  }
>  
> -int i915_save_state(struct drm_device *dev)
> +/**
> + * i915_save_display - save display & mode info
> + * @dev: DRM device
> + *
> + * Save mode timings and display info.
> + */
> +void i915_save_display(struct drm_device *dev)
>  {
>  	struct drm_i915_private *dev_priv = dev->dev_private;
> -	int i;
> -
> -	pci_read_config_byte(dev->pdev, LBB, &dev_priv->saveLBB);
> -
> -	/* Render Standby */
> -	if (IS_I965G(dev) && IS_MOBILE(dev))
> -		dev_priv->saveRENDERSTANDBY = I915_READ(MCHBAR_RENDER_STANDBY);
> -
> -	/* Hardware status page */
> -	dev_priv->saveHWS = I915_READ(HWS_PGA);
>  
>  	/* Display arbitration control */
>  	dev_priv->saveDSPARB = I915_READ(DSPARB);
> @@ -330,17 +326,36 @@ int i915_save_state(struct drm_device *dev)
>  	dev_priv->saveFBC_CONTROL2 = I915_READ(FBC_CONTROL2);
>  	dev_priv->saveFBC_CONTROL = I915_READ(FBC_CONTROL);
>  
> -	/* Interrupt state */
> -	dev_priv->saveIIR = I915_READ(IIR);
> -	dev_priv->saveIER = I915_READ(IER);
> -	dev_priv->saveIMR = I915_READ(IMR);
> -
>  	/* VGA state */
>  	dev_priv->saveVGA0 = I915_READ(VGA0);
>  	dev_priv->saveVGA1 = I915_READ(VGA1);
>  	dev_priv->saveVGA_PD = I915_READ(VGA_PD);
>  	dev_priv->saveVGACNTRL = I915_READ(VGACNTRL);
>  
> +	i915_save_vga(dev);
> +}
> +
> +int i915_save_state(struct drm_device *dev)
> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	int i;
> +
> +	pci_read_config_byte(dev->pdev, LBB, &dev_priv->saveLBB);
> +
> +	/* Render Standby */
> +	if (IS_I965G(dev) && IS_MOBILE(dev))
> +		dev_priv->saveRENDERSTANDBY = I915_READ(MCHBAR_RENDER_STANDBY);
> +
> +	/* Hardware status page */
> +	dev_priv->saveHWS = I915_READ(HWS_PGA);
> +
> +	i915_save_display(dev);
> +
> +	/* Interrupt state */
> +	dev_priv->saveIIR = I915_READ(IIR);
> +	dev_priv->saveIER = I915_READ(IER);
> +	dev_priv->saveIMR = I915_READ(IMR);
> +
>  	/* Clock gating state */
>  	dev_priv->saveD_STATE = I915_READ(D_STATE);
>  	dev_priv->saveCG_2D_DIS = I915_READ(CG_2D_DIS);
> @@ -371,25 +386,15 @@ int i915_save_state(struct drm_device *dev)
>  			for (i = 0; i < 8; i++)
>  				dev_priv->saveFENCE[i+8] = I915_READ(FENCE_REG_945_8 + (i * 4));
>  	}
> -	i915_save_vga(dev);
>  
>  	return 0;
>  }
>  
> -int i915_restore_state(struct drm_device *dev)
> +void i915_restore_display(struct drm_device *dev)
>  {
>  	struct drm_i915_private *dev_priv = dev->dev_private;
>  	int i;
>  
> -	pci_write_config_byte(dev->pdev, LBB, dev_priv->saveLBB);
> -
> -	/* Render Standby */
> -	if (IS_I965G(dev) && IS_MOBILE(dev))
> -		I915_WRITE(MCHBAR_RENDER_STANDBY, dev_priv->saveRENDERSTANDBY);
> -
> -	/* Hardware status page */
> -	I915_WRITE(HWS_PGA, dev_priv->saveHWS);
> -
>  	/* Display arbitration */
>  	I915_WRITE(DSPARB, dev_priv->saveDSPARB);
>  
> @@ -533,6 +538,25 @@ int i915_restore_state(struct drm_device *dev)
>  	I915_WRITE(VGA_PD, dev_priv->saveVGA_PD);
>  	DRM_UDELAY(150);
>  
> +	i915_restore_vga(dev);
> +}
> +
> +int i915_restore_state(struct drm_device *dev)
> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	int i;
> +
> +	pci_write_config_byte(dev->pdev, LBB, dev_priv->saveLBB);
> +
> +	/* Render Standby */
> +	if (IS_I965G(dev) && IS_MOBILE(dev))
> +		I915_WRITE(MCHBAR_RENDER_STANDBY, dev_priv->saveRENDERSTANDBY);
> +
> +	/* Hardware status page */
> +	I915_WRITE(HWS_PGA, dev_priv->saveHWS);
> +
> +	i915_restore_display(dev);
> +
>  	/* Clock gating state */
>  	I915_WRITE (D_STATE, dev_priv->saveD_STATE);
>  	I915_WRITE (CG_2D_DIS, dev_priv->saveCG_2D_DIS);
> @@ -550,8 +574,6 @@ int i915_restore_state(struct drm_device *dev)
>  	for (i = 0; i < 3; i++)
>  		I915_WRITE(SWF30 + (i << 2), dev_priv->saveSWF2[i]);
>  
> -	i915_restore_vga(dev);
> -
>  	return 0;
>  }
>  
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
-- 
Eric Anholt
eric at anholt.net                         eric.anholt at intel.com


-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 197 bytes
Desc: This is a digitally signed message part
URL: <http://lists.freedesktop.org/archives/intel-gfx/attachments/20090623/df84cb0d/attachment.sig>


More information about the Intel-gfx mailing list