[Intel-gfx] [PATCH 13/13] drm/i915: refuse to submit more batchbuffers from guilty context

Mika Kuoppala mika.kuoppala at linux.intel.com
Tue Feb 26 12:05:16 CET 2013


If a context has recently submitted a faulty batchbuffer that was found
guilty of a GPU hang and keeps submitting more crap, ban it permanently.

Signed-off-by: Mika Kuoppala <mika.kuoppala at intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c            |   23 ++++++++++++++++++++++-
 drivers/gpu/drm/i915/i915_drv.h            |    5 +++++
 drivers/gpu/drm/i915/i915_gem.c            |    8 ++++++--
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |   12 ++++++++++++
 4 files changed, 45 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index b342749..e305fbe 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -815,6 +815,8 @@ int intel_gpu_reset(struct drm_device *dev)
 int i915_reset(struct drm_device *dev)
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
+	struct ctx_reset_state *gstate;
+	bool do_wedge = true;
 	int ret;
 
 	if (!i915_try_reset)
@@ -822,10 +824,29 @@ int i915_reset(struct drm_device *dev)
 
 	mutex_lock(&dev->struct_mutex);
 
+	/* i915_gem_reset will set this if it finds a guilty context */
+	dev_priv->gpu_error.guilty_state = NULL;
+
 	i915_gem_reset(dev);
 
+	gstate = dev_priv->gpu_error.guilty_state;
+
+	if (gstate) {
+		if (gstate->guilty == 1) {
+			do_wedge = false;
+		} else if (!gstate->banned &&
+			   get_seconds() - gstate->last_guilty_reset < 5) {
+			gstate->banned = true;
+			do_wedge = false;
+		}
+
+		gstate->last_guilty_reset = get_seconds();
+	}
+
+	dev_priv->gpu_error.guilty_state = NULL;
+
 	ret = -ENODEV;
-	if (get_seconds() - dev_priv->gpu_error.last_reset < 5)
+	if (do_wedge && get_seconds() - dev_priv->gpu_error.last_reset < 5)
 		DRM_ERROR("GPU hanging too fast, declaring wedged!\n");
 	else
 		ret = intel_gpu_reset(dev);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 9361b2e..06c518f 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -437,6 +437,10 @@ struct ctx_reset_state {
 	u32 total;
 	u32 innocent;
 	u32 guilty;
+	unsigned long last_guilty_reset;
+
+	/* banned from submitting more work */
+	bool banned;
 };
 
 /* This must match up with the value previously used for execbuf2.rsvd1. */
@@ -810,6 +814,7 @@ struct i915_gpu_error {
 	struct work_struct work;
 
 	unsigned long last_reset;
+	struct ctx_reset_state *guilty_state;
 
 	/**
 	 * State variable and reset counter controlling the reset flow
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 6380a50..1d41e97 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2143,6 +2143,7 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
 				  struct drm_i915_gem_request *request,
 				  u32 acthd)
 {
+	drm_i915_private_t *dev_priv = ring->dev->dev_private;
 	bool inside;
 	struct ctx_reset_state *rs = NULL;
 	bool guilty;
@@ -2174,10 +2175,13 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
 	if (rs) {
 		rs->total++;
 
-		if (guilty)
+		if (guilty) {
 			rs->guilty++;
-		else
+
+			dev_priv->gpu_error.guilty_state = rs;
+		} else {
 			rs->innocent++;
+		}
 	}
 }
 
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 710784d..20a4011 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -837,6 +837,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	struct drm_clip_rect *cliprects = NULL;
 	struct intel_ring_buffer *ring;
 	struct i915_hw_context *ctx;
+	struct ctx_reset_state *rs;
 	u32 ctx_id = i915_execbuffer2_get_context_id(*args);
 	u32 exec_start, exec_len;
 	u32 mask, flags;
@@ -1020,6 +1021,17 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	if (ret)
 		goto err;
 
+
+	ret = i915_gem_context_get_reset_state(&dev_priv->ring[RCS],
+					       file, ctx_id, &rs);
+	if (ret)
+		goto err;
+
+	if (rs->banned) {
+		ret = -EIO;
+		goto err;
+	}
+
 	ret = i915_switch_context(ring, file, ctx_id, &ctx);
 	if (ret)
 		goto err;
-- 
1.7.9.5




More information about the Intel-gfx mailing list