[Intel-gfx] [PATCH] drm/i915: Provide a timeout to i915_gem_wait_for_idle()
Chris Wilson
chris at chris-wilson.co.uk
Mon Jul 9 08:54:33 UTC 2018
Usually we have no idea about the upper bound we need to wait to catch
up with userspace when idling the device, but in a few situations we
know the system was idle beforehand and can provide a short timeout in
order to very quickly catch a failure, long before hangcheck kicks in.
In particular, with a broken GPU we expect it to fail during the initial
GPU setup, where we do a couple of context switches to record the defaults.
This is a task that takes a few milliseconds even on the slowest of
devices, but we may have to wait 60s for hangcheck to give in and
declare the machine inoperable. This is a case where any GPU hang is
unacceptable, both from a timeliness and a practical standpoint.
The other improvement is that in selftests, we do not need to arm an
independent timer to inject a wedge, as we can just limit the timeout on
the wait directly.
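As an illustration of the interface change (restating what the diff below
does): a wait that was previously unbounded,

	err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);

now takes a timeout in jiffies. Callers that want the old behaviour pass
MAX_SCHEDULE_TIMEOUT, while paths that expect the system to already be
idle can supply a short bound and treat expiry as a terminal failure:

	if (i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED, HZ / 5))
		i915_gem_set_wedged(i915);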
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
---
drivers/gpu/drm/i915/i915_debugfs.c | 6 +-
drivers/gpu/drm/i915/i915_drv.h | 2 +-
drivers/gpu/drm/i915/i915_gem.c | 46 ++++++++++------
drivers/gpu/drm/i915/i915_gem_evict.c | 3 +-
drivers/gpu/drm/i915/i915_gem_gtt.c | 2 +-
drivers/gpu/drm/i915/i915_gem_shrinker.c | 11 +++-
drivers/gpu/drm/i915/i915_perf.c | 4 +-
drivers/gpu/drm/i915/i915_request.c | 6 +-
.../gpu/drm/i915/selftests/i915_gem_context.c | 3 +-
drivers/gpu/drm/i915/selftests/i915_request.c | 4 +-
.../gpu/drm/i915/selftests/igt_flush_test.c | 55 +++----------------
11 files changed, 65 insertions(+), 77 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 544e5e7f011f..099f97ef2303 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -4105,7 +4105,8 @@ fault_irq_set(struct drm_i915_private *i915,
err = i915_gem_wait_for_idle(i915,
I915_WAIT_LOCKED |
- I915_WAIT_INTERRUPTIBLE);
+ I915_WAIT_INTERRUPTIBLE,
+ MAX_SCHEDULE_TIMEOUT);
if (err)
goto err_unlock;
@@ -4210,7 +4211,8 @@ i915_drop_caches_set(void *data, u64 val)
if (val & DROP_ACTIVE)
ret = i915_gem_wait_for_idle(dev_priv,
I915_WAIT_INTERRUPTIBLE |
- I915_WAIT_LOCKED);
+ I915_WAIT_LOCKED,
+ MAX_SCHEDULE_TIMEOUT);
if (val & DROP_RETIRE)
i915_retire_requests(dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 09ab12458244..bec0a2796c37 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3154,7 +3154,7 @@ void i915_gem_init_swizzling(struct drm_i915_private *dev_priv);
void i915_gem_fini(struct drm_i915_private *dev_priv);
void i915_gem_cleanup_engines(struct drm_i915_private *dev_priv);
int i915_gem_wait_for_idle(struct drm_i915_private *dev_priv,
- unsigned int flags);
+ unsigned int flags, long timeout);
int __must_check i915_gem_suspend(struct drm_i915_private *dev_priv);
void i915_gem_suspend_late(struct drm_i915_private *dev_priv);
void i915_gem_resume(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 1a9dab302433..28d62f002276 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2267,7 +2267,9 @@ static int i915_gem_object_create_mmap_offset(struct drm_i915_gem_object *obj)
/* Attempt to reap some mmap space from dead objects */
do {
- err = i915_gem_wait_for_idle(dev_priv, I915_WAIT_INTERRUPTIBLE);
+ err = i915_gem_wait_for_idle(dev_priv,
+ I915_WAIT_INTERRUPTIBLE,
+ MAX_SCHEDULE_TIMEOUT);
if (err)
break;
@@ -3742,14 +3744,14 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
return ret;
}
-static int wait_for_timeline(struct i915_timeline *tl, unsigned int flags)
+static long wait_for_timeline(struct i915_timeline *tl,
+ unsigned int flags, long timeout)
{
struct i915_request *rq;
- long ret;
rq = i915_gem_active_get_unlocked(&tl->last_request);
if (!rq)
- return 0;
+ return timeout;
/*
* "Race-to-idle".
@@ -3763,10 +3765,10 @@ static int wait_for_timeline(struct i915_timeline *tl, unsigned int flags)
if (flags & I915_WAIT_FOR_IDLE_BOOST)
gen6_rps_boost(rq, NULL);
- ret = i915_request_wait(rq, flags, MAX_SCHEDULE_TIMEOUT);
+ timeout = i915_request_wait(rq, flags, timeout);
i915_request_put(rq);
- return ret < 0 ? ret : 0;
+ return timeout;
}
static int wait_for_engines(struct drm_i915_private *i915)
@@ -3782,7 +3784,8 @@ static int wait_for_engines(struct drm_i915_private *i915)
return 0;
}
-int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
+int i915_gem_wait_for_idle(struct drm_i915_private *i915,
+ unsigned int flags, long timeout)
{
GEM_TRACE("flags=%x (%s)\n",
flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked");
@@ -3798,9 +3801,9 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
lockdep_assert_held(&i915->drm.struct_mutex);
list_for_each_entry(tl, &i915->gt.timelines, link) {
- err = wait_for_timeline(tl, flags);
- if (err)
- return err;
+ timeout = wait_for_timeline(tl, flags, timeout);
+ if (timeout < 0)
+ return timeout;
}
err = wait_for_engines(i915);
@@ -3812,12 +3815,13 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
} else {
struct intel_engine_cs *engine;
enum intel_engine_id id;
- int err;
for_each_engine(engine, i915, id) {
- err = wait_for_timeline(&engine->timeline, flags);
- if (err)
- return err;
+ struct i915_timeline *tl = &engine->timeline;
+
+ timeout = wait_for_timeline(tl, flags, timeout);
+ if (timeout < 0)
+ return timeout;
}
}
@@ -5052,7 +5056,8 @@ int i915_gem_suspend(struct drm_i915_private *dev_priv)
ret = i915_gem_wait_for_idle(dev_priv,
I915_WAIT_INTERRUPTIBLE |
I915_WAIT_LOCKED |
- I915_WAIT_FOR_IDLE_BOOST);
+ I915_WAIT_FOR_IDLE_BOOST,
+ MAX_SCHEDULE_TIMEOUT);
if (ret && ret != -EIO)
goto err_unlock;
@@ -5356,9 +5361,12 @@ static int __intel_engines_record_defaults(struct drm_i915_private *i915)
if (err)
goto err_active;
- err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
- if (err)
+ if (i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED, HZ / 5)) {
+ dev_err(i915->drm.dev, "Initial GPU setup failed; disabling\n");
+ i915_gem_set_wedged(i915);
+ err = -EIO;
goto err_active;
+ }
assert_kernel_context_is_current(i915);
@@ -5421,7 +5429,9 @@ static int __intel_engines_record_defaults(struct drm_i915_private *i915)
if (WARN_ON(i915_gem_switch_to_kernel_context(i915)))
goto out_ctx;
- if (WARN_ON(i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED)))
+ if (WARN_ON(i915_gem_wait_for_idle(i915,
+ I915_WAIT_LOCKED,
+ MAX_SCHEDULE_TIMEOUT)))
goto out_ctx;
i915_gem_contexts_lost(i915);
diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c
index 54814a196ee4..02b83a5ed96c 100644
--- a/drivers/gpu/drm/i915/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/i915_gem_evict.c
@@ -69,7 +69,8 @@ static int ggtt_flush(struct drm_i915_private *i915)
err = i915_gem_wait_for_idle(i915,
I915_WAIT_INTERRUPTIBLE |
- I915_WAIT_LOCKED);
+ I915_WAIT_LOCKED,
+ MAX_SCHEDULE_TIMEOUT);
if (err)
return err;
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 2ad319e17e39..abd81fb9b0b6 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -2793,7 +2793,7 @@ void i915_gem_gtt_finish_pages(struct drm_i915_gem_object *obj,
struct i915_ggtt *ggtt = &dev_priv->ggtt;
if (unlikely(ggtt->do_idle_maps)) {
- if (i915_gem_wait_for_idle(dev_priv, 0)) {
+ if (i915_gem_wait_for_idle(dev_priv, 0, MAX_SCHEDULE_TIMEOUT)) {
DRM_ERROR("Failed to wait for idle; VT'd may hang.\n");
/* Wait a bit, in hopes it avoids the hang */
udelay(10);
diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
index 55e84e71f526..c61f5b80fee3 100644
--- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
+++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
@@ -172,7 +172,9 @@ i915_gem_shrink(struct drm_i915_private *i915,
* we will free as much as we can and hope to get a second chance.
*/
if (flags & I915_SHRINK_ACTIVE)
- i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
+ i915_gem_wait_for_idle(i915,
+ I915_WAIT_LOCKED,
+ MAX_SCHEDULE_TIMEOUT);
trace_i915_gem_shrink(i915, target, flags);
i915_retire_requests(i915);
@@ -392,7 +394,8 @@ shrinker_lock_uninterruptible(struct drm_i915_private *i915, bool *unlock,
unsigned long timeout = jiffies + msecs_to_jiffies_timeout(timeout_ms);
do {
- if (i915_gem_wait_for_idle(i915, 0) == 0 &&
+ if (i915_gem_wait_for_idle(i915,
+ 0, MAX_SCHEDULE_TIMEOUT) == 0 &&
shrinker_lock(i915, unlock))
break;
@@ -466,7 +469,9 @@ i915_gem_shrinker_vmap(struct notifier_block *nb, unsigned long event, void *ptr
return NOTIFY_DONE;
/* Force everything onto the inactive lists */
- ret = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
+ ret = i915_gem_wait_for_idle(i915,
+ I915_WAIT_LOCKED,
+ MAX_SCHEDULE_TIMEOUT);
if (ret)
goto out;
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 447407fee3b8..6bf10952c724 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1836,7 +1836,9 @@ static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
* So far the best way to work around this issue seems to be draining
* the GPU from any submitted work.
*/
- ret = i915_gem_wait_for_idle(dev_priv, wait_flags);
+ ret = i915_gem_wait_for_idle(dev_priv,
+ wait_flags,
+ MAX_SCHEDULE_TIMEOUT);
if (ret)
goto out;
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 3248369dbcfb..5c2c93cbab12 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -206,7 +206,8 @@ static int reset_all_global_seqno(struct drm_i915_private *i915, u32 seqno)
/* Carefully retire all requests without writing to the rings */
ret = i915_gem_wait_for_idle(i915,
I915_WAIT_INTERRUPTIBLE |
- I915_WAIT_LOCKED);
+ I915_WAIT_LOCKED,
+ MAX_SCHEDULE_TIMEOUT);
if (ret)
return ret;
@@ -735,7 +736,8 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
/* Ratelimit ourselves to prevent oom from malicious clients */
ret = i915_gem_wait_for_idle(i915,
I915_WAIT_LOCKED |
- I915_WAIT_INTERRUPTIBLE);
+ I915_WAIT_INTERRUPTIBLE,
+ MAX_SCHEDULE_TIMEOUT);
if (ret)
goto err_unreserve;
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
index 5fbe15f4effd..bebb3d37fac8 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
@@ -478,7 +478,8 @@ static int __igt_switch_to_kernel_context(struct drm_i915_private *i915,
}
}
- err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
+ err = i915_gem_wait_for_idle(i915,
+ I915_WAIT_LOCKED, MAX_SCHEDULE_TIMEOUT);
if (err)
return err;
diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 43995fc3534d..c4aac6141e04 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -286,7 +286,9 @@ static int begin_live_test(struct live_test *t,
t->func = func;
t->name = name;
- err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
+ err = i915_gem_wait_for_idle(i915,
+ I915_WAIT_LOCKED,
+ MAX_SCHEDULE_TIMEOUT);
if (err) {
pr_err("%s(%s): failed to idle before, with err=%d!",
func, name, err);
diff --git a/drivers/gpu/drm/i915/selftests/igt_flush_test.c b/drivers/gpu/drm/i915/selftests/igt_flush_test.c
index 0d06f559243f..af66e3d4e23a 100644
--- a/drivers/gpu/drm/i915/selftests/igt_flush_test.c
+++ b/drivers/gpu/drm/i915/selftests/igt_flush_test.c
@@ -9,52 +9,8 @@
#include "../i915_selftest.h"
#include "igt_flush_test.h"
-struct wedge_me {
- struct delayed_work work;
- struct drm_i915_private *i915;
- const void *symbol;
-};
-
-static void wedge_me(struct work_struct *work)
-{
- struct wedge_me *w = container_of(work, typeof(*w), work.work);
-
- pr_err("%pS timed out, cancelling all further testing.\n", w->symbol);
-
- GEM_TRACE("%pS timed out.\n", w->symbol);
- GEM_TRACE_DUMP();
-
- i915_gem_set_wedged(w->i915);
-}
-
-static void __init_wedge(struct wedge_me *w,
- struct drm_i915_private *i915,
- long timeout,
- const void *symbol)
-{
- w->i915 = i915;
- w->symbol = symbol;
-
- INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
- schedule_delayed_work(&w->work, timeout);
-}
-
-static void __fini_wedge(struct wedge_me *w)
-{
- cancel_delayed_work_sync(&w->work);
- destroy_delayed_work_on_stack(&w->work);
- w->i915 = NULL;
-}
-
-#define wedge_on_timeout(W, DEV, TIMEOUT) \
- for (__init_wedge((W), (DEV), (TIMEOUT), __builtin_return_address(0)); \
- (W)->i915; \
- __fini_wedge((W)))
-
int igt_flush_test(struct drm_i915_private *i915, unsigned int flags)
{
- struct wedge_me w;
-
cond_resched();
if (flags & I915_WAIT_LOCKED &&
@@ -63,8 +19,15 @@ int igt_flush_test(struct drm_i915_private *i915, unsigned int flags)
i915_gem_set_wedged(i915);
}
- wedge_on_timeout(&w, i915, HZ)
- i915_gem_wait_for_idle(i915, flags);
+ if (i915_gem_wait_for_idle(i915, flags, HZ / 5) == -ETIME) {
+ pr_err("%pS timed out, cancelling all further testing.\n",
+ __builtin_return_address(0));
+
+ GEM_TRACE("%pS timed out.\n", __builtin_return_address(0));
+ GEM_TRACE_DUMP();
+
+ i915_gem_set_wedged(i915);
+ }
return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
}
--
2.18.0