[Intel-gfx] [PATCH 5/6] drm/i915: Implement GPU reset

Sun Sep 13 19:27:48 CEST 2009

A couple of things just caught my eye while looking through the patch,
perhaps to consider tweaking?

On Sun, Sep 13, 2009 at 3:51 PM, Ben Gamari <bgamari.foss at gmail.com> wrote:
> This patch puts in place the machinery to attempt to reset the GPU. This
> will be used when attempting to recover from a GPU hang.
>
> Signed-off-by: Jesse Barnes <jbarnes at virtuousgeek.org>
> Signed-off-by: Ben Gamari <bgamari.foss at gmail.com>
> ---
>  drivers/gpu/drm/i915/i915_dma.c |    8 ++
>  drivers/gpu/drm/i915/i915_drv.c |  141 +++++++++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/i915/i915_drv.h |    1 +
>  drivers/gpu/drm/i915/i915_irq.c |    8 ++
>  drivers/gpu/drm/i915/i915_reg.h |    4 +
>  5 files changed, 162 insertions(+), 0 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> index aabe41b..25f8e75 100644
> --- a/drivers/gpu/drm/i915/i915_dma.c
> +++ b/drivers/gpu/drm/i915/i915_dma.c
> @@ -1173,6 +1173,9 @@ static int i915_load_modeset_init(struct drm_device *dev,
>        drm_mm_init(&dev_priv->vram, 0, prealloc_size);
>        DRM_INFO("set up %ldM of stolen space\n", prealloc_size / (1024*1024));
>
> +       /* We're off and running w/KMS */
> +       dev_priv->mm.suspended = 0;
> +
>        /* Let GEM Manage from end of prealloc space to end of aperture.
>         *
>         * However, leave one page at the end still bound to the scratch page.
> @@ -1184,7 +1187,9 @@ static int i915_load_modeset_init(struct drm_device *dev,
>         */
>        i915_gem_do_init(dev, prealloc_size, agp_size - 4096);
>
> +       mutex_lock(&dev->struct_mutex);
>        ret = i915_gem_init_ringbuffer(dev);
> +       mutex_unlock(&dev->struct_mutex);
>        if (ret)
>                goto out;
>
> @@ -1433,6 +1438,9 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
>                return ret;
>        }
>
> +       /* Start out suspended */
> +       dev_priv->mm.suspended = 1;
> +
>        if (drm_core_check_feature(dev, DRIVER_MODESET)) {
>                ret = i915_load_modeset_init(dev, prealloc_start,
>                                             prealloc_size, agp_size);
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index dbe568c..0c03a2a 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -127,6 +127,147 @@ static int i915_resume(struct drm_device *dev)
>        return ret;
>  }
>
> +/**
> + * i965_reset - reset chip after a hang
> + * @dev: drm device to reset
> + * @flags: reset domains
> + *
> + * Reset the chip.  Useful if a hang is detected.
> + *
> + * Procedure is fairly simple:
> + *   - reset the chip using the reset reg
> + *   - re-init context state
> + *   - re-init hardware status page
> + *   - re-init ring buffer
> + *   - re-init interrupt state
> + *   - re-init display
> + */
> +void i965_reset(struct drm_device *dev, u8 flags)
> +{
> +       drm_i915_private_t *dev_priv = dev->dev_private;
> +       unsigned long timeout;
> +       u8 gdrst;
> +       bool need_display = true; //!(flags & (GDRST_RENDER | GDRST_MEDIA));

As the kernel's coding style isn't C99, is it worth using /* foo */
here to be compliant?

> +
> +#if defined(CONFIG_SMP)
> +       timeout = jiffies + msecs_to_jiffies(500);
> +       do {
> +               udelay(100);
> +       } while (mutex_is_locked(&dev->struct_mutex) && time_after(timeout, jiffies));
> +
> +       if (mutex_is_locked(&dev->struct_mutex)) {
> +#if 1
> +               DRM_ERROR("i915 struct_mutex lock is still held by %s. Giving on up reset.\n", dev->struct_mutex.owner->task->comm);
> +               return;
> +#else
> +               struct task_struct *task = dev->struct_mutex.owner->task;
> +               DRM_ERROR("Killing process %d (%s) for holding i915 device mutex\n",
> +                               task->pid, task->comm);
> +               force_sig(SIGILL, task);

Should be SIGKILL here?

> +#endif
> +       }
> +#else
> +       BUG_ON(mutex_is_locked(&dev->struct_mutex));
> +#endif
> +
> +       debug_show_all_locks();
> +       mutex_lock(&dev->struct_mutex);
> +
> +       /*
> +        * Clear request list
> +        */
> +       i915_gem_retire_requests(dev);
> +
> +       if (need_display)
> +               i915_save_display(dev);
> +
> +       if (IS_I965G(dev) || IS_G4X(dev)) {
> +               /*
> +                * Set the domains we want to reset, then the reset bit (bit 0).
> +                * Clear the reset bit after a while and wait for hardware status
> +                * bit (bit 1) to be set
> +                */
> +               pci_read_config_byte(dev->pdev, GDRST, &gdrst);
> +               //TODO: Set domains

Consider using kernel coding style perhaps?

> +               pci_write_config_byte(dev->pdev, GDRST, gdrst | flags | ((flags == GDRST_FULL) ? 0x1 : 0x0));
> +               udelay(50);
> +               pci_write_config_byte(dev->pdev, GDRST, gdrst & 0xfe);
> +
> +               /* ...we don't want to loop forever though, 500ms should be plenty */
> +              timeout = jiffies + msecs_to_jiffies(500);
> +               do {
> +                       udelay(100);
> +                       pci_read_config_byte(dev->pdev, GDRST, &gdrst);
> +               } while ((gdrst & 0x1) && time_after(timeout, jiffies));
> +
> +               if (gdrst & 0x1) {
> +                       DRM_ERROR("Failed to reset chip. You're screwed.");

Perhaps drop the second sentence, to keep things looking neat.

> +                       mutex_unlock(&dev->struct_mutex);
> +                       return;
> +               }
> +       } else {
> +               DRM_ERROR("Error occurred. Don't know how to reset this chip.\n");
> +               return;
> +       }
> +
> +       /* Ok, now get things going again... */
> +
> +       /*
> +        * Everything depends on having the GTT running, so we need to start
> +        * there.  Fortunately we don't need to do this unless we reset the
> +        * chip at a PCI level.
> +        *
> +        * Next we need to restore the context, but we don't use those
> +        * yet either...
> +        *
> +        * Ring buffer needs to be re-initialized in the KMS case, or if X
> +        * was running at the time of the reset (i.e. we weren't VT
> +        * switched away).
> +        */
> +       if (drm_core_check_feature(dev, DRIVER_MODESET) ||
> +           !dev_priv->mm.suspended) {
> +               drm_i915_ring_buffer_t *ring = &dev_priv->ring;
> +               struct drm_gem_object *obj = ring->ring_obj;
> +               struct drm_i915_gem_object *obj_priv = obj->driver_private;
> +               dev_priv->mm.suspended = 0;
> +
> +               /* Stop the ring if it's running. */
> +               I915_WRITE(PRB0_CTL, 0);
> +               I915_WRITE(PRB0_TAIL, 0);
> +               I915_WRITE(PRB0_HEAD, 0);
> +
> +               /* Initialize the ring. */
> +               I915_WRITE(PRB0_START, obj_priv->gtt_offset);
> +               I915_WRITE(PRB0_CTL,
> +                          ((obj->size - 4096) & RING_NR_PAGES) |
> +                          RING_NO_REPORT |
> +                          RING_VALID);
> +               if (!drm_core_check_feature(dev, DRIVER_MODESET))
> +                       i915_kernel_lost_context(dev);
> +               else {
> +                       ring->head = I915_READ(PRB0_HEAD) & HEAD_ADDR;
> +                       ring->tail = I915_READ(PRB0_TAIL) & TAIL_ADDR;
> +                       ring->space = ring->head - (ring->tail + 8);
> +                       if (ring->space < 0)
> +                               ring->space += ring->Size;
> +               }
> +
> +               mutex_unlock(&dev->struct_mutex);
> +               drm_irq_uninstall(dev);
> +               drm_irq_install(dev);
> +               mutex_lock(&dev->struct_mutex);
> +       }
> +
> +       /*
> +        * Display needs restore too...
> +        */
> +       if (need_display)
> +               i915_restore_display(dev);
> +
> +       mutex_unlock(&dev->struct_mutex);
> +}
> +
> +
>  static int __devinit
>  i915_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
>  {
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index afbcaa9..8797777 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -624,6 +624,7 @@ extern long i915_compat_ioctl(struct file *filp, unsigned int cmd,
>  extern int i915_emit_box(struct drm_device *dev,
>                         struct drm_clip_rect *boxes,
>                         int i, int DR1, int DR4);
> +extern void i965_reset(struct drm_device *dev, u8 flags);
>
>  /* i915_irq.c */
>  void i915_hangcheck_elapsed(unsigned long data);
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 126696a..dbfcf0a 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -482,6 +482,14 @@ static void i915_handle_error(struct drm_device *dev)
>                I915_WRITE(IIR, I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT);
>        }
>
> +       if (dev_priv->mm.wedged) {
> +               /*
> +                * Wakeup waiting processes so they don't hang
> +                */
> +               printk("i915: Waking up sleeping processes\n");
> +               DRM_WAKEUP(&dev_priv->irq_queue);
> +       }
> +
>        queue_work(dev_priv->wq, &dev_priv->error_work);
>  }
>
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index f7c2de8..99981a0 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -85,6 +85,10 @@
>  #define   I915_GC_RENDER_CLOCK_200_MHZ (1 << 0)
>  #define   I915_GC_RENDER_CLOCK_333_MHZ (4 << 0)
>  #define LBB    0xf4
> +#define GDRST 0xc0
> +#define  GDRST_FULL    (0<<2)
> +#define  GDRST_RENDER  (1<<2)
> +#define  GDRST_MEDIA   (3<<2)
>
>  /* VGA stuff */
>
-- 
Daniel J Blueman