[PATCH 25/26] SQUASHME: drm/i915: mutex-less reset engine
Michel Thierry
michel.thierry at intel.com
Mon Apr 17 17:51:19 UTC 2017
Signed-off-by: Michel Thierry <michel.thierry at intel.com>
more mutexless cleanup
Signed-off-by: Michel Thierry <michel.thierry at intel.com>
---
drivers/gpu/drm/i915/i915_drv.c | 29 +----------
drivers/gpu/drm/i915/i915_drv.h | 4 +-
drivers/gpu/drm/i915/i915_gem.c | 87 +++++++++++++++++++--------------
drivers/gpu/drm/i915/i915_gem_request.c | 1 +
drivers/gpu/drm/i915/i915_irq.c | 17 ++++++-
5 files changed, 71 insertions(+), 67 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 8e86757adfd4..bb17363b2d1b 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1872,8 +1872,6 @@ void i915_reset_chip(struct drm_i915_private *dev_priv)
* Reset a specific GPU engine. Useful if a hang is detected.
* Returns zero on successful reset or otherwise an error code.
*
- * Caller must hold the struct_mutex.
- *
* Procedure is:
* - identifies the request that caused the hang and it is dropped
* - force engine to idle: this is done by issuing a reset request
@@ -1886,12 +1884,8 @@ int i915_reset_engine(struct intel_engine_cs *engine)
struct drm_i915_private *dev_priv = engine->i915;
struct i915_gpu_error *error = &dev_priv->gpu_error;
- lockdep_assert_held(&dev_priv->drm.struct_mutex);
GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
- if (!test_and_clear_bit(I915_RESET_HANDOFF, &error->flags))
- return 0;
-
DRM_DEBUG_DRIVER("resetting %s\n", engine->name);
/*
@@ -1903,15 +1897,12 @@ int i915_reset_engine(struct intel_engine_cs *engine)
intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
disable_irq(dev_priv->drm.irq);
- ret = i915_gem_reset_prepare(dev_priv, intel_engine_flag(engine));
+ ret = i915_gem_reset_prepare_engine(engine);
if (ret) {
DRM_ERROR("Previous reset failed - promote to full reset\n");
goto error;
}
- if (dev_priv->gt.active_requests)
- engine_retire_requests(engine);
-
/*
* the request that caused the hang is stuck on elsp, identify the
* active request and drop it, adjust head to skip the offending
@@ -1948,9 +1939,7 @@ int i915_reset_engine(struct intel_engine_cs *engine)
}
}
- /* i915_gem_reset_prepare revoked the fences */
- i915_gem_restore_fences(dev_priv);
- i915_gem_reset_finish(dev_priv, intel_engine_flag(engine));
+ i915_gem_reset_finish_engine(engine);
/* replay remaining requests in the queue */
ret = engine->init_hw(engine);
@@ -1971,7 +1960,6 @@ int i915_reset_engine(struct intel_engine_cs *engine)
error:
/* use full gpu reset to recover on error */
- set_bit(I915_RESET_HANDOFF, &error->flags);
goto wakeup;
}
@@ -1989,19 +1977,6 @@ int i915_reset_engine(struct intel_engine_cs *engine)
*/
void i915_reset(struct drm_i915_private *dev_priv, u32 engine_mask)
{
- /* try engine reset first */
- if (intel_has_reset_engine(dev_priv) &&
- !(engine_mask & (engine_mask - 1))) {
- struct intel_engine_cs *engine =
- dev_priv->engine[intel_engineid_from_flag(engine_mask)];
-
- if (i915_reset_engine(engine))
- goto reset_chip;
-
- return;
- }
-
-reset_chip:
i915_reset_chip(dev_priv);
}
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 6be2e9881408..6301934277fe 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3030,6 +3030,7 @@ extern void i915_driver_unload(struct drm_device *dev);
extern int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask);
extern bool intel_has_gpu_reset(struct drm_i915_private *dev_priv);
extern void i915_reset(struct drm_i915_private *dev_priv, u32 engine_mask);
+extern int i915_reset_engine(struct intel_engine_cs *engine);
extern bool intel_has_reset_engine(struct drm_i915_private *dev_priv);
extern int intel_reset_engine_start(struct intel_engine_cs *engine);
extern void intel_reset_engine_cancel(struct intel_engine_cs *engine);
@@ -3422,7 +3423,6 @@ int __must_check i915_gem_set_global_seqno(struct drm_device *dev, u32 seqno);
struct drm_i915_gem_request *
i915_gem_find_active_request(struct intel_engine_cs *engine);
-void engine_retire_requests(struct intel_engine_cs *engine);
void i915_gem_retire_requests(struct drm_i915_private *dev_priv);
static inline bool i915_reset_backoff(struct i915_gpu_error *error)
@@ -3456,9 +3456,11 @@ static inline u32 i915_reset_engine_count(struct i915_gpu_error *error,
return READ_ONCE(error->reset_engine_count[engine->id]);
}
+int i915_gem_reset_prepare_engine(struct intel_engine_cs *engine);
int i915_gem_reset_prepare(struct drm_i915_private *dev_priv,
unsigned int engine_mask);
void i915_gem_reset(struct drm_i915_private *dev_priv);
+void i915_gem_reset_finish_engine(struct intel_engine_cs *engine);
void i915_gem_reset_finish(struct drm_i915_private *dev_priv,
unsigned int engine_mask);
void i915_gem_set_wedged(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 3f0281197056..bce38062f94e 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2793,6 +2793,47 @@ static bool engine_stalled(struct intel_engine_cs *engine)
return true;
}
+/* Ensure the irq handler finishes, and is not run again. */
+int i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
+{
+ struct drm_i915_gem_request *request;
+ int err = 0;
+
+
+ /* Prevent the signaler thread from updating the request
+ * state (by calling dma_fence_signal) as we are processing
+ * the reset. The write from the GPU of the seqno is
+ * asynchronous and the signaler thread may see a different
+ * value to us and declare the request complete, even though
+ * the reset routine has picked that request as the active
+ * (incomplete) request. This conflict is not handled
+ * gracefully!
+ */
+ kthread_park(engine->breadcrumbs.signaler);
+
+ /* Prevent request submission to the hardware until we have
+ * completed the reset in i915_gem_reset_finish(). If a request
+ * is completed by one engine, it may then queue a request
+ * to a second via its engine->irq_tasklet *just* as we are
+ * calling engine->init_hw() and also writing the ELSP.
+ * Turning off the engine->irq_tasklet until the reset is over
+ * prevents the race.
+ */
+ tasklet_kill(&engine->irq_tasklet);
+ tasklet_disable(&engine->irq_tasklet);
+
+ if (engine->irq_seqno_barrier)
+ engine->irq_seqno_barrier(engine);
+
+ if (engine_stalled(engine)) {
+ request = i915_gem_find_active_request(engine);
+ if (request && request->fence.error == -EIO)
+ err = -EIO; /* Previous reset failed! */
+ }
+
+ return err;
+}
+
int i915_gem_reset_prepare(struct drm_i915_private *dev_priv,
unsigned int engine_mask)
{
@@ -2800,41 +2841,8 @@ int i915_gem_reset_prepare(struct drm_i915_private *dev_priv,
unsigned int tmp;
int err = 0;
- /* Ensure irq handler finishes, and not run again. */
- for_each_engine_masked(engine, dev_priv, engine_mask, tmp) {
- struct drm_i915_gem_request *request;
-
- /* Prevent the signaler thread from updating the request
- * state (by calling dma_fence_signal) as we are processing
- * the reset. The write from the GPU of the seqno is
- * asynchronous and the signaler thread may see a different
- * value to us and declare the request complete, even though
- * the reset routine have picked that request as the active
- * (incomplete) request. This conflict is not handled
- * gracefully!
- */
- kthread_park(engine->breadcrumbs.signaler);
-
- /* Prevent request submission to the hardware until we have
- * completed the reset in i915_gem_reset_finish(). If a request
- * is completed by one engine, it may then queue a request
- * to a second via its engine->irq_tasklet *just* as we are
- * calling engine->init_hw() and also writing the ELSP.
- * Turning off the engine->irq_tasklet until the reset is over
- * prevents the race.
- */
- tasklet_kill(&engine->irq_tasklet);
- tasklet_disable(&engine->irq_tasklet);
-
- if (engine->irq_seqno_barrier)
- engine->irq_seqno_barrier(engine);
-
- if (engine_stalled(engine)) {
- request = i915_gem_find_active_request(engine);
- if (request && request->fence.error == -EIO)
- err = -EIO; /* Previous reset failed! */
- }
- }
+ for_each_engine_masked(engine, dev_priv, engine_mask, tmp)
+ err = i915_gem_reset_prepare_engine(engine);
i915_gem_revoke_fences(dev_priv);
@@ -2967,6 +2975,12 @@ void i915_gem_reset(struct drm_i915_private *dev_priv)
}
}
+void i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
+{
+ tasklet_enable(&engine->irq_tasklet);
+ kthread_unpark(engine->breadcrumbs.signaler);
+}
+
void i915_gem_reset_finish(struct drm_i915_private *dev_priv,
unsigned int engine_mask)
{
@@ -2976,8 +2990,7 @@ void i915_gem_reset_finish(struct drm_i915_private *dev_priv,
lockdep_assert_held(&dev_priv->drm.struct_mutex);
for_each_engine_masked(engine, dev_priv, engine_mask, tmp) {
- tasklet_enable(&engine->irq_tasklet);
- kthread_unpark(engine->breadcrumbs.signaler);
+ i915_gem_reset_finish_engine(engine);
}
}
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 6f17eb6ee4b6..27759c9eff89 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -311,6 +311,7 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
struct i915_gem_active *active, *next;
lockdep_assert_held(&request->i915->drm.struct_mutex);
+
GEM_BUG_ON(!i915_sw_fence_signaled(&request->submit));
GEM_BUG_ON(!i915_gem_request_completed(request));
GEM_BUG_ON(!request->i915->gt.active_requests);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 6543e77e0621..ae120d7a8efd 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2649,6 +2649,7 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
+ u32 engine_mask = dev_priv->gpu_error.reset_engine_mask;
kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
@@ -2663,6 +2664,18 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
*/
kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
+ /* try engine reset first, and continue if it fails; look mom, no mutex! */
+ if (intel_has_reset_engine(dev_priv) && !(engine_mask & (engine_mask - 1))) {
+ struct intel_engine_cs *engine =
+ dev_priv->engine[intel_engineid_from_flag(engine_mask)];
+
+ if (i915_reset_engine(engine) == 0)
+ goto finish;
+
+ DRM_ERROR("reset %s failed, promoting to full gpu reset\n",
+ engine->name);
+ }
+
intel_prepare_reset(dev_priv);
set_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags);
@@ -2676,8 +2689,7 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
* deadlocks with the reset work.
*/
if (mutex_trylock(&dev_priv->drm.struct_mutex)) {
- i915_reset(dev_priv,
- dev_priv->gpu_error.reset_engine_mask);
+ i915_reset(dev_priv, ALL_ENGINES);
mutex_unlock(&dev_priv->drm.struct_mutex);
}
@@ -2693,6 +2705,7 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
kobject_uevent_env(kobj,
KOBJ_CHANGE, reset_done_event);
+finish:
/*
* Note: The wake_up also serves as a memory barrier so that
* waiters see the updated value of the dev_priv->gpu_error.
--
2.11.0
More information about the Intel-gfx-trybot
mailing list