[PATCH 05/12] drm/i915: Track any mutex used inside GPU reset
Chris Wilson
chris at chris-wilson.co.uk
Sat Jul 21 10:47:17 UTC 2018
With a view to removing the nasty recursion on struct_mutex (which
allows us to wait upon it while also servicing GPU resets and the
shrinker), explicitly track any mutex which might be used inside
i915_reset() and warn if we hit a potential inversion where we are
holding the mutex and may need to reset (e.g. waiting on the GPU).
A simple premise we enforce is that we cannot wait on the GPU while
holding any mutex required to perform a GPU reset, and by extension this
means we cannot shrink while holding that mutex either (as we must be
able to perform a GPU reset to advance the shrinker).
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
drivers/gpu/drm/i915/i915_drv.c | 33 ++++++++++++++++++-
drivers/gpu/drm/i915/i915_drv.h | 3 ++
drivers/gpu/drm/i915/i915_gpu_error.h | 2 ++
drivers/gpu/drm/i915/i915_request.c | 17 ++++++++--
.../gpu/drm/i915/selftests/mock_gem_device.c | 2 ++
5 files changed, 53 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 18a45e7a3d7c..446057b50b78 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -35,6 +35,7 @@
#include <linux/pm.h>
#include <linux/pm_runtime.h>
#include <linux/pnp.h>
+#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
@@ -867,6 +868,19 @@ static void intel_detect_preproduction_hw(struct drm_i915_private *dev_priv)
}
}
+static void i915_reset_lockdep_init(struct drm_i915_private *i915)
+{
+ static struct lock_class_key reset_key;
+ /* Initialise the "i915.reset" lockdep map kept in i915->gpu_error. */
+ lockdep_init_map(&i915->gpu_error.lockdep, "i915.reset", &reset_key, 0);
+ /* We must be able to reset while stuck in the shrinker, so take and drop */
+ /* the reset map inside an fs_reclaim section to show lockdep that nesting. */
+ fs_reclaim_acquire(GFP_KERNEL);
+ lock_map_acquire(&i915->gpu_error.lockdep);
+ lock_map_release(&i915->gpu_error.lockdep);
+ fs_reclaim_release(GFP_KERNEL);
+}
+
/**
* i915_driver_init_early - setup state not requiring device access
* @dev_priv: device private
@@ -897,11 +911,14 @@ static int i915_driver_init_early(struct drm_i915_private *dev_priv,
BUILD_BUG_ON(INTEL_MAX_PLATFORMS >
sizeof(device_info->platform_mask) * BITS_PER_BYTE);
BUG_ON(device_info->gen > sizeof(device_info->gen_mask) * BITS_PER_BYTE);
+
spin_lock_init(&dev_priv->irq_lock);
- spin_lock_init(&dev_priv->gpu_error.lock);
mutex_init(&dev_priv->backlight_lock);
spin_lock_init(&dev_priv->uncore.lock);
+ spin_lock_init(&dev_priv->gpu_error.lock);
+ i915_reset_lockdep_init(dev_priv);
+
mutex_init(&dev_priv->sb_lock);
mutex_init(&dev_priv->av_mutex);
mutex_init(&dev_priv->wm.wm_mutex);
@@ -1910,6 +1927,8 @@ void i915_reset(struct drm_i915_private *i915,
if (!test_bit(I915_RESET_HANDOFF, &error->flags))
return;
+ lock_map_acquire(&error->lockdep);
+
/* Clear any previous failed attempts at recovery. Time to try again. */
if (!i915_gem_unset_wedged(i915))
goto wakeup;
@@ -1983,6 +2002,7 @@ void i915_reset(struct drm_i915_private *i915,
enable_irq(i915->drm.irq);
wakeup:
+ lock_map_release(&error->lockdep);
clear_bit(I915_RESET_HANDOFF, &error->flags);
wake_up_bit(&error->flags, I915_RESET_HANDOFF);
return;
@@ -2007,6 +2027,17 @@ void i915_reset(struct drm_i915_private *i915,
goto finish;
}
+void i915_reset_taints_mutex(struct drm_i915_private *i915, struct mutex *mutex)
+{
+ if (!IS_ENABLED(CONFIG_LOCKDEP)) /* annotation only: skip the lock/unlock when lockdep is not built in */
+ return;
+ /* Lock and unlock @mutex while holding the reset map so that lockdep sees @mutex nested inside "i915.reset". */
+ lock_map_acquire(&i915->gpu_error.lockdep);
+ mutex_lock(mutex);
+ mutex_unlock(mutex);
+ lock_map_release(&i915->gpu_error.lockdep);
+}
+
static inline int intel_gt_reset_engine(struct drm_i915_private *dev_priv,
struct intel_engine_cs *engine)
{
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 0f49f9988dfa..66e809b918fa 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2711,6 +2711,9 @@ extern void i915_reset(struct drm_i915_private *i915,
extern int i915_reset_engine(struct intel_engine_cs *engine,
const char *reason);
+extern void i915_reset_taints_mutex(struct drm_i915_private *i915,
+ struct mutex *mutex);
+
extern bool intel_has_reset_engine(struct drm_i915_private *dev_priv);
extern int intel_reset_guc(struct drm_i915_private *dev_priv);
extern int intel_guc_reset_engine(struct intel_guc *guc,
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
index 1c1bc0c23468..81faef79da46 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -295,6 +295,8 @@ struct i915_gpu_error {
/* For missed irq/seqno simulation. */
unsigned long test_irq_rings;
+
+ struct lockdep_map lockdep;
};
struct drm_i915_error_state_buf {
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 5c2c93cbab12..ae97159b169b 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -1253,8 +1253,13 @@ static bool __i915_wait_request_check_and_reset(struct i915_request *request)
if (likely(!i915_reset_handoff(error)))
return false;
+ /* Dance to upgrade to an exclusive lock */
+ lock_map_release(&error->lockdep);
+
__set_current_state(TASK_RUNNING);
i915_reset(request->i915, error->stalled_mask, error->reason);
+
+ lock_map_acquire_read(&error->lockdep);
return true;
}
@@ -1296,11 +1301,15 @@ long i915_request_wait(struct i915_request *rq,
#endif
GEM_BUG_ON(timeout < 0);
+ lock_map_acquire_read(&rq->i915->gpu_error.lockdep);
+
if (i915_request_completed(rq))
- return timeout;
+ goto out;
- if (!timeout)
- return -ETIME;
+ if (!timeout) {
+ timeout = -ETIME;
+ goto out;
+ }
trace_i915_request_wait_begin(rq, flags);
@@ -1414,6 +1423,8 @@ long i915_request_wait(struct i915_request *rq,
remove_wait_queue(&rq->execute, &exec);
trace_i915_request_wait_end(rq);
+out:
+ lock_map_release(&rq->i915->gpu_error.lockdep);
return timeout;
}
diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
index 43ed8b28aeaa..d14830a3794c 100644
--- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
+++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
@@ -135,6 +135,7 @@ static struct dev_pm_domain pm_domain = {
struct drm_i915_private *mock_gem_device(void)
{
+ static struct lock_class_key reset_key;
struct drm_i915_private *i915;
struct pci_dev *pdev;
int err;
@@ -184,6 +185,7 @@ struct drm_i915_private *mock_gem_device(void)
mock_uncore_init(i915);
i915_gem_init__mm(i915);
+ lockdep_init_map(&i915->gpu_error.lockdep, "i915.reset", &reset_key, 0);
init_waitqueue_head(&i915->gpu_error.wait_queue);
init_waitqueue_head(&i915->gpu_error.reset_queue);
--
2.18.0
More information about the Intel-gfx-trybot
mailing list