[Intel-gfx] [PATCH 01/10] drm/i915: add interface to simulate gpu hangs
Ben Widawsky
ben at bwidawsk.net
Sat Apr 28 06:56:55 CEST 2012
On Fri, 27 Apr 2012 15:17:38 +0200
Daniel Vetter <daniel.vetter at ffwll.ch> wrote:
> gpu reset is a very important piece of our infrastructure.
> Unfortunately we only really it test by actually hanging the gpu,
> which often has bad side-effects for the entire system. And the gpu
> hang handling code is one of the rather complicated pieces of code we
> have, consisting of
> - hang detection
> - error capture
> - actual gpu reset
> - reset of all the gem bookkeeping
> - reinitialition of the entire gpu
>
> This patch adds a debugfs to selectively stopping rings by ceasing to
> update the hw tail pointer, which will result in the gpu no longer
> updating it's head pointer and eventually to the hangcheck firing.
> This way we can exercise the gpu hang code under controlled conditions
> without a dying gpu taking down the entire systems.
You mean, tail pointer?
>
> Patch motivated by me forgetting to properly reinitialize ppgtt after
> a gpu reset.
>
> Usage:
>
> echo $((1 << $ringnum)) > i915_ring_stop # stops one ring
>
> echo 0xffffffff > i915_ring_stop # stops all, future-proof version
>
> then run whatever testload is desired. i915_ring_stop automatically
> resets after a gpu hang is detected to avoid hanging the gpu to fast
> and declaring it wedged.
>
> v2: Incorporate feedback from Chris Wilson.
>
> v3: Add the missing cleanup.
>
> Signed-Off-by: Daniel Vetter <daniel.vetter at ffwll.ch>
> Reviewed-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/i915_debugfs.c | 65
> +++++++++++++++++++++++++++++++
> drivers/gpu/drm/i915/i915_drv.h | 2 +
> drivers/gpu/drm/i915/intel_ringbuffer.c | 4 ++ 3 files changed, 71
> insertions(+), 0 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c
> b/drivers/gpu/drm/i915/i915_debugfs.c index 120db46..3f6e28e 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -1585,6 +1585,64 @@ static const struct file_operations
> i915_wedged_fops = { };
>
> static ssize_t
> +i915_ring_stop_read(struct file *filp,
> + char __user *ubuf,
> + size_t max,
> + loff_t *ppos)
> +{
> + struct drm_device *dev = filp->private_data;
> + drm_i915_private_t *dev_priv = dev->dev_private;
> + char buf[80];
> + int len;
> +
> + len = snprintf(buf, sizeof(buf),
> + "0x%08x\n", dev_priv->stop_rings);
> +
> + if (len > sizeof(buf))
> + len = sizeof(buf);
> +
> + return simple_read_from_buffer(ubuf, max, ppos, buf, len);
> +}
> +
> +static ssize_t
> +i915_ring_stop_write(struct file *filp,
> + const char __user *ubuf,
> + size_t cnt,
> + loff_t *ppos)
> +{
> + struct drm_device *dev = filp->private_data;
> + struct drm_i915_private *dev_priv = dev->dev_private;
> + char buf[20];
> + int val = 0;
> +
> + if (cnt > 0) {
> + if (cnt > sizeof(buf) - 1)
> + return -EINVAL;
> +
> + if (copy_from_user(buf, ubuf, cnt))
> + return -EFAULT;
> + buf[cnt] = 0;
> +
> + val = simple_strtoul(buf, NULL, 0);
> + }
> +
> + DRM_DEBUG_DRIVER("Stopping rings 0x%08x\n", val);
> +
> + mutex_lock(&dev->struct_mutex);
> + dev_priv->stop_rings = val;
> + mutex_unlock(&dev->struct_mutex);
> +
> + return cnt;
> +}
Not a huge deal, but I'd go with an interruptible mutex lock there.
Also, I think it would be cool if you did a tail update when going from
!0->0 stop_rings.
> +
> +static const struct file_operations i915_ring_stop_fops = {
> + .owner = THIS_MODULE,
> + .open = simple_open,
> + .read = i915_ring_stop_read,
> + .write = i915_ring_stop_write,
> + .llseek = default_llseek,
> +};
> +static ssize_t
> i915_max_freq_read(struct file *filp,
> char __user *ubuf,
> size_t max,
> @@ -1884,6 +1942,11 @@ int i915_debugfs_init(struct drm_minor *minor)
> &i915_cache_sharing_fops);
> if (ret)
> return ret;
> + ret = i915_debugfs_create(minor->debugfs_root, minor,
> + "i915_ring_stop",
> + &i915_ring_stop_fops);
> + if (ret)
> + return ret;
>
> return drm_debugfs_create_files(i915_debugfs_list,
> I915_DEBUGFS_ENTRIES,
> @@ -1902,6 +1965,8 @@ void i915_debugfs_cleanup(struct drm_minor
> *minor) 1, minor);
> drm_debugfs_remove_files((struct drm_info_list *)
> &i915_cache_sharing_fops, 1, minor);
> + drm_debugfs_remove_files((struct drm_info_list *)
> &i915_ring_stop_fops,
> + 1, minor);
> }
>
> #endif /* CONFIG_DEBUG_FS */
> diff --git a/drivers/gpu/drm/i915/i915_drv.h
> b/drivers/gpu/drm/i915/i915_drv.h index 0095c8d..5e62b35 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -413,6 +413,8 @@ typedef struct drm_i915_private {
> uint32_t last_instdone;
> uint32_t last_instdone1;
>
> + unsigned int stop_rings;
> +
> unsigned long cfb_size;
> unsigned int cfb_fb;
> enum plane cfb_plane;
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c
> b/drivers/gpu/drm/i915/intel_ringbuffer.c index 427b7c5..d2de16d
> 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -1221,7 +1221,11 @@ int intel_ring_begin(struct intel_ring_buffer
> *ring,
> void intel_ring_advance(struct intel_ring_buffer *ring)
> {
> + struct drm_i915_private *dev_priv = ring->dev->dev_private;
> +
> ring->tail &= ring->size - 1;
> + if (dev_priv->stop_rings & intel_ring_flag(ring))
> + return;
> ring->write_tail(ring, ring->tail);
> }
>
More information about the Intel-gfx
mailing list