[Intel-gfx] [PATCH 7/7] drm/i915: refuse to submit more batchbuffers from guilty context
Mika Kuoppala
mika.kuoppala at linux.intel.com
Wed Jun 12 11:35:34 CEST 2013
If a context has recently submitted faulty batchbuffers guilty of a
gpu hang and decides to keep submitting more crap, ban it permanently.
v2: Store guilty ban status bool in gpu_error instead of pointers
that might become dangling before the hang is declared.
v3: Use return value for banned status instead of stashing state
into gpu_error (Chris Wilson)
Signed-off-by: Mika Kuoppala <mika.kuoppala at intel.com>
---
drivers/gpu/drm/i915/i915_drv.c | 6 +++--
drivers/gpu/drm/i915/i915_drv.h | 8 ++++++-
drivers/gpu/drm/i915/i915_gem.c | 34 ++++++++++++++++++++--------
drivers/gpu/drm/i915/i915_gem_execbuffer.c | 13 +++++++++++
4 files changed, 49 insertions(+), 12 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index c3e4f29..70b64fd 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -867,6 +867,7 @@ int i915_reset(struct drm_device *dev)
{
drm_i915_private_t *dev_priv = dev->dev_private;
bool simulated;
+ bool ctx_banned;
int ret;
if (!i915_try_reset)
@@ -874,11 +875,12 @@ int i915_reset(struct drm_device *dev)
mutex_lock(&dev->struct_mutex);
- i915_gem_reset(dev);
+ ctx_banned = i915_gem_reset(dev);
simulated = dev_priv->gpu_error.stop_rings != 0;
- if (!simulated && get_seconds() - dev_priv->gpu_error.last_reset < 5) {
+ if (!(simulated || ctx_banned) &&
+ get_seconds() - dev_priv->gpu_error.last_reset < 5) {
DRM_ERROR("GPU hanging too fast, declaring wedged!\n");
ret = -ENODEV;
} else {
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 8bc399c..364afff 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -504,6 +504,12 @@ struct i915_ctx_hang_stats {
/* This context had batch active when hang was declared */
unsigned batch_active;
+
+ /* Time when this context was last blamed for a GPU reset */
+ unsigned long batch_active_reset_ts;
+
+ /* This context is banned from submitting more work */
+ bool banned;
};
/* This must match up with the value previously used for execbuf2.rsvd1. */
@@ -1738,7 +1744,7 @@ static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
return atomic_read(&error->reset_counter) == I915_WEDGED;
}
-void i915_gem_reset(struct drm_device *dev);
+bool i915_gem_reset(struct drm_device *dev);
void i915_gem_clflush_object(struct drm_i915_gem_object *obj);
int __must_check i915_gem_object_set_domain(struct drm_i915_gem_object *obj,
uint32_t read_domains,
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 6144f0b..3ecf1fe 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2143,15 +2143,15 @@ static bool i915_request_guilty(struct drm_i915_gem_request *request,
return false;
}
-static void i915_set_reset_status(struct intel_ring_buffer *ring,
+static bool i915_set_reset_status(struct intel_ring_buffer *ring,
struct drm_i915_gem_request *request,
u32 acthd)
{
struct i915_ctx_hang_stats *hs = NULL;
- bool inside, guilty;
+ bool inside, guilty, banned;
/* Innocent until proven guilty */
- guilty = false;
+ guilty = banned = false;
if (ring->hangcheck.last_action != wait &&
i915_request_guilty(request, acthd, &inside)) {
@@ -2175,11 +2175,20 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
hs = &request->file_priv->hang_stats;
if (hs) {
- if (guilty)
+ if (guilty) {
+ if (!hs->banned &&
+ get_seconds() - hs->batch_active_reset_ts < 15) {
+ hs->banned = banned = true;
+ DRM_ERROR("context hanging too fast, declaring banned\n");
+ }
hs->batch_active++;
- else
+ hs->batch_active_reset_ts = get_seconds();
+ } else {
hs->batch_pending++;
+ }
}
+
+ return banned;
}
static void i915_gem_free_request(struct drm_i915_gem_request *request)
@@ -2193,11 +2202,12 @@ static void i915_gem_free_request(struct drm_i915_gem_request *request)
kfree(request);
}
-static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
+static bool i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
struct intel_ring_buffer *ring)
{
u32 completed_seqno;
u32 acthd;
+ bool ctx_banned = false;
acthd = intel_ring_get_active_head(ring);
completed_seqno = ring->get_seqno(ring, false);
@@ -2210,7 +2220,8 @@ static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
list);
if (request->seqno > completed_seqno)
- i915_set_reset_status(ring, request, acthd);
+ ctx_banned |= i915_set_reset_status(ring,
+ request, acthd);
i915_gem_free_request(request);
}
@@ -2224,6 +2235,8 @@ static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
i915_gem_object_move_to_inactive(obj);
}
+
+ return ctx_banned;
}
static void i915_gem_reset_fences(struct drm_device *dev)
@@ -2247,15 +2260,16 @@ static void i915_gem_reset_fences(struct drm_device *dev)
INIT_LIST_HEAD(&dev_priv->mm.fence_list);
}
-void i915_gem_reset(struct drm_device *dev)
+bool i915_gem_reset(struct drm_device *dev)
{
struct drm_i915_private *dev_priv = dev->dev_private;
struct drm_i915_gem_object *obj;
struct intel_ring_buffer *ring;
int i;
+ bool ctx_banned = false;
for_each_ring(ring, dev_priv, i)
- i915_gem_reset_ring_lists(dev_priv, ring);
+ ctx_banned |= i915_gem_reset_ring_lists(dev_priv, ring);
/* Move everything out of the GPU domains to ensure we do any
* necessary invalidation upon reuse.
@@ -2269,6 +2283,8 @@ void i915_gem_reset(struct drm_device *dev)
/* The fence registers are invalidated so clear them out */
i915_gem_reset_fences(dev);
+
+ return ctx_banned;
}
/**
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 87a3227..7fcd6c0 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -842,6 +842,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
struct drm_i915_gem_object *batch_obj;
struct drm_clip_rect *cliprects = NULL;
struct intel_ring_buffer *ring;
+ struct i915_ctx_hang_stats *hs;
u32 ctx_id = i915_execbuffer2_get_context_id(*args);
u32 exec_start, exec_len;
u32 mask, flags;
@@ -1033,6 +1034,18 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
if (ret)
goto err;
+ hs = i915_gem_context_get_hang_stats(&dev_priv->ring[RCS],
+ file, ctx_id);
+ if (IS_ERR(hs)) {
+ ret = PTR_ERR(hs);
+ goto err;
+ }
+
+ if (hs->banned) {
+ ret = -EIO;
+ goto err;
+ }
+
ret = i915_switch_context(ring, file, ctx_id);
if (ret)
goto err;
--
1.7.9.5
More information about the Intel-gfx
mailing list