[Intel-gfx] [PATCH v3 14/16] drm/i915: refuse to submit more batchbuffers from guilty context
Mika Kuoppala
mika.kuoppala at linux.intel.com
Thu Apr 11 17:13:34 CEST 2013
Mika Kuoppala <mika.kuoppala at linux.intel.com> writes:
> If context has recently submitted a faulty batchbuffers guilty of
> gpu hang and decides to keep submitting more crap, ban it permanently.
>
> Signed-off-by: Mika Kuoppala <mika.kuoppala at intel.com>
> ---
> drivers/gpu/drm/i915/i915_drv.c | 23 ++++++++++++++++++++++-
> drivers/gpu/drm/i915/i915_drv.h | 7 +++++++
> drivers/gpu/drm/i915/i915_gem.c | 7 +++++--
> drivers/gpu/drm/i915/i915_gem_execbuffer.c | 13 +++++++++++++
> 4 files changed, 47 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index a5b8aa9..0928f11 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -852,6 +852,8 @@ int intel_gpu_reset(struct drm_device *dev)
> int i915_reset(struct drm_device *dev)
> {
> drm_i915_private_t *dev_priv = dev->dev_private;
> + struct i915_ctx_hang_stats *hs;
> + bool do_wedge = true;
> int ret;
>
> if (!i915_try_reset)
> @@ -859,10 +861,29 @@ int i915_reset(struct drm_device *dev)
>
> mutex_lock(&dev->struct_mutex);
>
> + /* i915_gem_reset will set this if it finds guilty context */
> + dev_priv->gpu_error.hang_stats = NULL;
> +
> i915_gem_reset(dev);
>
> + hs = dev_priv->gpu_error.hang_stats;
> +
> + if (hs) {
> + if (hs->batch_active == 1) {
> + do_wedge = false;
> + } else if (!hs->banned &&
> + get_seconds() - hs->batch_active_reset_ts < 5) {
> + hs->banned = true;
> + do_wedge = false;
> + }
> +
> + hs->batch_active_reset_ts = get_seconds();
> + }
> +
> + dev_priv->gpu_error.hang_stats = NULL;
> +
> ret = -ENODEV;
> - if (get_seconds() - dev_priv->gpu_error.last_reset < 5)
> + if (do_wedge && get_seconds() - dev_priv->gpu_error.last_reset < 5)
> DRM_ERROR("GPU hanging too fast, declaring wedged!\n");
> else
> ret = intel_gpu_reset(dev);
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 8223908..30ba79c 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -458,6 +458,12 @@ struct i915_ctx_hang_stats {
>
> /* This context had batch active when hang was declared */
> unsigned batch_active;
> +
> + /* Time when this context was last blamed for a GPU reset */
> + unsigned long batch_active_reset_ts;
> +
> + /* This context is banned to submit more work */
> + bool banned;
> };
>
> /* This must match up with the value previously used for execbuf2.rsvd1. */
> @@ -831,6 +837,7 @@ struct i915_gpu_error {
> struct work_struct work;
>
> unsigned long last_reset;
> + struct i915_ctx_hang_stats *hang_stats;
>
> /**
> * State variable and reset counter controlling the reset flow
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 475b6ad..ca5c9c3 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2147,6 +2147,7 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
> struct drm_i915_gem_request *request,
> u32 acthd)
> {
> + struct drm_i915_private *dev_priv = ring->dev->dev_private;
> struct i915_ctx_hang_stats *hs = NULL;
> bool inside, guilty;
>
> @@ -2175,10 +2176,12 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
> hs = &request->file_priv->hang_stats;
>
> if (hs) {
> - if (guilty)
> + if (guilty) {
> hs->batch_active++;
> - else
> + dev_priv->gpu_error.hang_stats = hs;
This is wrong. The context can be destroyed shortly after,
before reset is handled making this a dangling pointer.
> + } else {
> hs->batch_pending++;
> + }
> }
> }
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index bd1750a..f1b1ea9 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -844,6 +844,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
> struct drm_clip_rect *cliprects = NULL;
> struct intel_ring_buffer *ring;
> struct i915_hw_context *ctx;
> + struct i915_ctx_hang_stats *hs;
> u32 ctx_id = i915_execbuffer2_get_context_id(*args);
> u32 exec_start, exec_len;
> u32 mask, flags;
> @@ -1026,6 +1027,18 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
> if (ret)
> goto err;
>
> + hs = i915_gem_context_get_hang_stats(&dev_priv->ring[RCS],
> + file, ctx_id);
> + if (IS_ERR(hs)) {
> + ret = PTR_ERR(hs);
> + goto err;
> + }
> +
> + if (hs->banned) {
> + ret = -EIO;
> + goto err;
> + }
> +
> ctx = i915_switch_context(ring, file, ctx_id);
> if (IS_ERR(ctx)) {
> ret = PTR_ERR(ctx);
> --
> 1.7.9.5
More information about the Intel-gfx
mailing list