[Intel-gfx] [PATCH] drm/i915: Track any mutex used inside GPU reset

Chris Wilson chris at chris-wilson.co.uk
Sat Jul 14 11:20:52 UTC 2018


With a view to removing the nasty recursion on struct_mutex (which
allows us to wait upon it while also servicing GPU resets and the
shrinker), explicitly track any mutex which might be used inside
i915_reset() and warn if we hit a potential inversion where we are
holding the mutex and may need to reset (e.g. waiting on the GPU).

A simple premise we enforce is that we cannot wait on the GPU while
holding any mutex required to perform a GPU reset, and by extension this
means we cannot shrink while holding that mutex either (as we must be
able to perform a GPU reset to advance the shrinker).

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.c               | 33 ++++++++++++++++++-
 drivers/gpu/drm/i915/i915_drv.h               |  3 ++
 drivers/gpu/drm/i915/i915_gpu_error.h         |  2 ++
 drivers/gpu/drm/i915/i915_request.c           | 12 +++++--
 .../gpu/drm/i915/selftests/mock_gem_device.c  |  2 ++
 5 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 3834bd758a2e..f8b69aff3d45 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -35,6 +35,7 @@
 #include <linux/pm.h>
 #include <linux/pm_runtime.h>
 #include <linux/pnp.h>
+#include <linux/sched/mm.h>
 #include <linux/slab.h>
 #include <linux/vgaarb.h>
 #include <linux/vga_switcheroo.h>
@@ -864,6 +865,19 @@ static void intel_detect_preproduction_hw(struct drm_i915_private *dev_priv)
 	}
 }
 
+static void i915_reset_lockdep_init(struct i915_gpu_error *error)
+{
+	static struct lock_class_key reset_key; /* key for the "i915.reset" map */
+
+	lockdep_init_map(&error->lockdep, "i915.reset", &reset_key, 0);
+
+	/* Nest reset inside fs_reclaim: resets must work within the shrinker */
+	fs_reclaim_acquire(GFP_KERNEL);
+	lock_map_acquire(&error->lockdep);
+	lock_map_release(&error->lockdep);
+	fs_reclaim_release(GFP_KERNEL);
+}
+
 /**
  * i915_driver_init_early - setup state not requiring device access
  * @dev_priv: device private
@@ -894,11 +908,14 @@ static int i915_driver_init_early(struct drm_i915_private *dev_priv,
 	BUILD_BUG_ON(INTEL_MAX_PLATFORMS >
 		     sizeof(device_info->platform_mask) * BITS_PER_BYTE);
 	BUG_ON(device_info->gen > sizeof(device_info->gen_mask) * BITS_PER_BYTE);
+
 	spin_lock_init(&dev_priv->irq_lock);
-	spin_lock_init(&dev_priv->gpu_error.lock);
 	mutex_init(&dev_priv->backlight_lock);
 	spin_lock_init(&dev_priv->uncore.lock);
 
+	spin_lock_init(&dev_priv->gpu_error.lock);
+	i915_reset_lockdep_init(&dev_priv->gpu_error);
+
 	mutex_init(&dev_priv->sb_lock);
 	mutex_init(&dev_priv->modeset_restore_lock);
 	mutex_init(&dev_priv->av_mutex);
@@ -1917,6 +1934,8 @@ void i915_reset(struct drm_i915_private *i915,
 	if (!test_bit(I915_RESET_HANDOFF, &error->flags))
 		return;
 
+	lock_map_acquire(&error->lockdep);
+
 	/* Clear any previous failed attempts at recovery. Time to try again. */
 	if (!i915_gem_unset_wedged(i915))
 		goto wakeup;
@@ -1990,6 +2009,7 @@ void i915_reset(struct drm_i915_private *i915,
 	enable_irq(i915->drm.irq);
 
 wakeup:
+	lock_map_release(&error->lockdep);
 	clear_bit(I915_RESET_HANDOFF, &error->flags);
 	wake_up_bit(&error->flags, I915_RESET_HANDOFF);
 	return;
@@ -2014,6 +2034,17 @@ void i915_reset(struct drm_i915_private *i915,
 	goto finish;
 }
 
+void i915_reset_taints_mutex(struct drm_i915_private *i915, struct mutex *mutex)
+{
+	if (!IS_ENABLED(CONFIG_LOCKDEP)) /* lockdep-only annotation */
+		return;
+
+	lock_map_acquire(&i915->gpu_error.lockdep);
+	mutex_lock(mutex); /* taken under the reset map: mutex nests in reset */
+	mutex_unlock(mutex);
+	lock_map_release(&i915->gpu_error.lockdep);
+}
+
 static inline int intel_gt_reset_engine(struct drm_i915_private *dev_priv,
 					struct intel_engine_cs *engine)
 {
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 4fb937399440..006000bfffcf 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2719,6 +2719,9 @@ extern void i915_reset(struct drm_i915_private *i915,
 extern int i915_reset_engine(struct intel_engine_cs *engine,
 			     const char *reason);
 
+extern void i915_reset_taints_mutex(struct drm_i915_private *i915,
+				    struct mutex *mutex);
+
 extern bool intel_has_reset_engine(struct drm_i915_private *dev_priv);
 extern int intel_reset_guc(struct drm_i915_private *dev_priv);
 extern int intel_guc_reset_engine(struct intel_guc *guc,
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
index f893a4e8b783..78521b10d9e5 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -293,6 +293,8 @@ struct i915_gpu_error {
 
 	/* For missed irq/seqno simulation. */
 	unsigned long test_irq_rings;
+
+	struct lockdep_map lockdep;
 };
 
 struct drm_i915_error_state_buf {
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 5c2c93cbab12..8216639697a4 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -1253,8 +1253,12 @@ static bool __i915_wait_request_check_and_reset(struct i915_request *request)
 	if (likely(!i915_reset_handoff(error)))
 		return false;
 
+	lock_map_release(&error->lockdep);
+
 	__set_current_state(TASK_RUNNING);
 	i915_reset(request->i915, error->stalled_mask, error->reason);
+
+	lock_map_acquire_read(&error->lockdep);
 	return true;
 }
 
@@ -1305,8 +1309,10 @@ long i915_request_wait(struct i915_request *rq,
 	trace_i915_request_wait_begin(rq, flags);
 
 	add_wait_queue(&rq->execute, &exec);
-	if (flags & I915_WAIT_LOCKED)
+	if (flags & I915_WAIT_LOCKED) {
+		lock_map_acquire_read(&rq->i915->gpu_error.lockdep);
 		add_wait_queue(errq, &reset);
+	}
 
 	intel_wait_init(&wait);
 
@@ -1409,8 +1415,10 @@ long i915_request_wait(struct i915_request *rq,
 	intel_engine_remove_wait(rq->engine, &wait);
 complete:
 	__set_current_state(TASK_RUNNING);
-	if (flags & I915_WAIT_LOCKED)
+	if (flags & I915_WAIT_LOCKED) {
+		lock_map_release(&rq->i915->gpu_error.lockdep);
 		remove_wait_queue(errq, &reset);
+	}
 	remove_wait_queue(&rq->execute, &exec);
 	trace_i915_request_wait_end(rq);
 
diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
index 43ed8b28aeaa..d14830a3794c 100644
--- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
+++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
@@ -135,6 +135,7 @@ static struct dev_pm_domain pm_domain = {
 
 struct drm_i915_private *mock_gem_device(void)
 {
+	static struct lock_class_key reset_key;
 	struct drm_i915_private *i915;
 	struct pci_dev *pdev;
 	int err;
@@ -184,6 +185,7 @@ struct drm_i915_private *mock_gem_device(void)
 	mock_uncore_init(i915);
 	i915_gem_init__mm(i915);
 
+	lockdep_init_map(&i915->gpu_error.lockdep, "i915.reset", &reset_key, 0);
 	init_waitqueue_head(&i915->gpu_error.wait_queue);
 	init_waitqueue_head(&i915->gpu_error.reset_queue);
 
-- 
2.18.0



More information about the Intel-gfx mailing list