[PATCH] drm/i915: Revoke mmaps and prevent access to fence registers across reset
Chris Wilson
chris at chris-wilson.co.uk
Wed Jan 30 00:12:57 UTC 2019
Previously, we were able to rely on the recursive properties of
struct_mutex to allow us to serialise revoking mmaps and reacquiring the
FENCE registers with them being clobbered over a global device reset.
I then proceeded to throw out the baby with the bath water in order to
pursue a struct_mutex-less reset.
Perusing LWN for alternative strategies, the dilemma on how to serialise
access to a global resource on one side was answered by
https://lwn.net/Articles/202847/ -- Sleepable RCU:
1 int readside(void) {
2 int idx;
3 rcu_read_lock();
4 if (nomoresrcu) {
5 rcu_read_unlock();
6 return -EINVAL;
7 }
8 idx = srcu_read_lock(&ss);
9 rcu_read_unlock();
10 /* SRCU read-side critical section. */
11 srcu_read_unlock(&ss, idx);
12 return 0;
13 }
14
15 void cleanup(void)
16 {
17 nomoresrcu = 1;
18 synchronize_rcu();
19 synchronize_srcu(&ss);
20 cleanup_srcu_struct(&ss);
21 }
No more worrying about stop_machine, just an uber-complex mutex,
optimised for reads, with the overhead pushed to the rare reset path.
However, we do run the risk of a deadlock as we allocate underneath the
SRCU read lock, and the allocation may require a GPU reset, causing a
dependency cycle via the in-flight requests. We resolve that by declaring
the driver wedged and cancelling all in-flight rendering.
Testcase: igt/gem_mmap_gtt/hang
Fixes: eb8d0f5af4ec ("drm/i915: Remove GPU reset dependence on struct_mutex")
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala at intel.com>
---
drivers/gpu/drm/i915/i915_debugfs.c | 12 +---
drivers/gpu/drm/i915/i915_drv.h | 17 ++---
drivers/gpu/drm/i915/i915_gem.c | 45 +-----------
drivers/gpu/drm/i915/i915_gem_fence_reg.c | 57 ++++++++++++---
drivers/gpu/drm/i915/i915_gem_fence_reg.h | 1 +
drivers/gpu/drm/i915/i915_gpu_error.h | 12 ++--
drivers/gpu/drm/i915/i915_reset.c | 71 +++++++++----------
drivers/gpu/drm/i915/i915_reset.h | 4 ++
drivers/gpu/drm/i915/i915_vma.c | 2 -
drivers/gpu/drm/i915/i915_vma.h | 7 +-
.../gpu/drm/i915/selftests/mock_gem_device.c | 1 +
11 files changed, 108 insertions(+), 121 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index fa2c226fc779..2cea263b4d79 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1281,14 +1281,11 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
intel_wakeref_t wakeref;
enum intel_engine_id id;
+ seq_printf(m, "Reset flags: %lx\n", dev_priv->gpu_error.flags);
if (test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
- seq_puts(m, "Wedged\n");
+ seq_puts(m, "\tWedged\n");
if (test_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags))
- seq_puts(m, "Reset in progress: struct_mutex backoff\n");
- if (waitqueue_active(&dev_priv->gpu_error.wait_queue))
- seq_puts(m, "Waiter holding struct mutex\n");
- if (waitqueue_active(&dev_priv->gpu_error.reset_queue))
- seq_puts(m, "struct_mutex blocked for reset\n");
+ seq_puts(m, "\tDevice (global) reset in progress\n");
if (!i915_modparams.enable_hangcheck) {
seq_puts(m, "Hangcheck disabled\n");
@@ -3885,9 +3882,6 @@ i915_wedged_set(void *data, u64 val)
* while it is writing to 'i915_wedged'
*/
- if (i915_reset_backoff(&i915->gpu_error))
- return -EAGAIN;
-
i915_handle_error(i915, val, I915_ERROR_CAPTURE,
"Manually set wedged engine mask = %llx", val);
return 0;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index d072f3369ee1..33c39092d5dd 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2986,7 +2986,12 @@ i915_gem_obj_finish_shmem_access(struct drm_i915_gem_object *obj)
i915_gem_object_unpin_pages(obj);
}
-int __must_check i915_mutex_lock_interruptible(struct drm_device *dev);
+static inline int __must_check
+i915_mutex_lock_interruptible(struct drm_device *dev)
+{
+ return mutex_lock_interruptible(&dev->struct_mutex);
+}
+
int i915_gem_dumb_create(struct drm_file *file_priv,
struct drm_device *dev,
struct drm_mode_create_dumb *args);
@@ -3003,21 +3008,11 @@ int __must_check i915_gem_set_global_seqno(struct drm_device *dev, u32 seqno);
struct i915_request *
i915_gem_find_active_request(struct intel_engine_cs *engine);
-static inline bool i915_reset_backoff(struct i915_gpu_error *error)
-{
- return unlikely(test_bit(I915_RESET_BACKOFF, &error->flags));
-}
-
static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
{
return unlikely(test_bit(I915_WEDGED, &error->flags));
}
-static inline bool i915_reset_backoff_or_wedged(struct i915_gpu_error *error)
-{
- return i915_reset_backoff(error) | i915_terminally_wedged(error);
-}
-
static inline u32 i915_reset_count(struct i915_gpu_error *error)
{
return READ_ONCE(error->reset_count);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index e802af64d628..21cc3d8e84d0 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -100,47 +100,6 @@ static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
spin_unlock(&dev_priv->mm.object_stat_lock);
}
-static int
-i915_gem_wait_for_error(struct i915_gpu_error *error)
-{
- int ret;
-
- might_sleep();
-
- /*
- * Only wait 10 seconds for the gpu reset to complete to avoid hanging
- * userspace. If it takes that long something really bad is going on and
- * we should simply try to bail out and fail as gracefully as possible.
- */
- ret = wait_event_interruptible_timeout(error->reset_queue,
- !i915_reset_backoff(error),
- I915_RESET_TIMEOUT);
- if (ret == 0) {
- DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
- return -EIO;
- } else if (ret < 0) {
- return ret;
- } else {
- return 0;
- }
-}
-
-int i915_mutex_lock_interruptible(struct drm_device *dev)
-{
- struct drm_i915_private *dev_priv = to_i915(dev);
- int ret;
-
- ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
- if (ret)
- return ret;
-
- ret = mutex_lock_interruptible(&dev->struct_mutex);
- if (ret)
- return ret;
-
- return 0;
-}
-
static u32 __i915_gem_park(struct drm_i915_private *i915)
{
intel_wakeref_t wakeref;
@@ -1908,7 +1867,6 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf)
goto err_unlock;
}
-
/* Now pin it into the GTT as needed */
vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
PIN_MAPPABLE |
@@ -5324,6 +5282,7 @@ int i915_gem_init_early(struct drm_i915_private *dev_priv)
init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
mutex_init(&dev_priv->gpu_error.wedge_mutex);
+ init_srcu_struct(&dev_priv->gpu_error.srcu);
atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);
@@ -5356,6 +5315,8 @@ void i915_gem_cleanup_early(struct drm_i915_private *dev_priv)
GEM_BUG_ON(atomic_read(&dev_priv->mm.free_count));
WARN_ON(dev_priv->mm.object_count);
+ cleanup_srcu_struct(&dev_priv->gpu_error.srcu);
+
kmem_cache_destroy(dev_priv->priorities);
kmem_cache_destroy(dev_priv->dependencies);
kmem_cache_destroy(dev_priv->requests);
diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.c b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
index 46e259661294..5bcba61b9419 100644
--- a/drivers/gpu/drm/i915/i915_gem_fence_reg.c
+++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
@@ -319,6 +319,40 @@ static struct drm_i915_fence_reg *fence_find(struct drm_i915_private *dev_priv)
return ERR_PTR(-EDEADLK);
}
+static int fence_pin(struct drm_i915_fence_reg *fence)
+{
+ int err;
+
+ if (fence->pin_count++)
+ return 0;
+
+ err = i915_reset_lock(fence->i915);
+ if (err < 0)
+ goto err_unpin;
+
+ fence->tag = err;
+ return 0;
+
+err_unpin:
+ fence->pin_count = 0;
+ return err;
+}
+
+static void fence_unpin(struct drm_i915_fence_reg *fence)
+{
+ GEM_BUG_ON(!fence->pin_count);
+ if (--fence->pin_count)
+ return;
+
+ i915_reset_unlock(fence->i915, fence->tag);
+}
+
+void __i915_vma_unpin_fence(struct i915_vma *vma)
+{
+ GEM_BUG_ON(!vma->fence);
+ fence_unpin(vma->fence);
+}
+
/**
* i915_vma_pin_fence - set up fencing for a vma
* @vma: vma to map through a fence reg
@@ -353,7 +387,11 @@ i915_vma_pin_fence(struct i915_vma *vma)
if (vma->fence) {
fence = vma->fence;
GEM_BUG_ON(fence->vma != vma);
- fence->pin_count++;
+
+ err = fence_pin(fence);
+ if (err)
+ return err;
+
if (!fence->dirty) {
list_move_tail(&fence->link,
&fence->i915->mm.fence_list);
@@ -365,7 +403,9 @@ i915_vma_pin_fence(struct i915_vma *vma)
return PTR_ERR(fence);
GEM_BUG_ON(fence->pin_count);
- fence->pin_count++;
+ err = fence_pin(fence);
+ if (err)
+ return err;
} else
return 0;
@@ -380,7 +420,7 @@ i915_vma_pin_fence(struct i915_vma *vma)
return 0;
out_unpin:
- fence->pin_count--;
+ fence_unpin(fence);
return err;
}
@@ -441,7 +481,7 @@ void i915_unreserve_fence(struct drm_i915_fence_reg *fence)
*
* Removes all GTT mmappings via the fence registers. This forces any user
* of the fence to reacquire that fence before continuing with their access.
- * One use is during GPU reset where the fence register is lost and we need to
+ * Only use is during GPU reset where the fence register is lost and we need to
* revoke concurrent userspace access via GTT mmaps until the hardware has been
* reset and the fence registers have been restored.
*/
@@ -449,15 +489,14 @@ void i915_gem_revoke_fences(struct drm_i915_private *dev_priv)
{
int i;
- lockdep_assert_held(&dev_priv->drm.struct_mutex);
-
for (i = 0; i < dev_priv->num_fence_regs; i++) {
struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
- GEM_BUG_ON(fence->vma && fence->vma->fence != fence);
+ if (!fence->vma)
+ continue;
- if (fence->vma)
- i915_vma_revoke_mmap(fence->vma);
+ GEM_BUG_ON(fence->vma->fence != fence);
+ i915_vma_revoke_mmap(fence->vma);
}
}
diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.h b/drivers/gpu/drm/i915/i915_gem_fence_reg.h
index 09dcaf14121b..3e6b0df70629 100644
--- a/drivers/gpu/drm/i915/i915_gem_fence_reg.h
+++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.h
@@ -37,6 +37,7 @@ struct drm_i915_fence_reg {
struct drm_i915_private *i915;
struct i915_vma *vma;
int pin_count;
+ int tag;
int id;
/**
* Whether the tiling parameters for the currently
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
index 53b1f22dd365..4e797c552b96 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -231,12 +231,10 @@ struct i915_gpu_error {
/**
* flags: Control various stages of the GPU reset
*
- * #I915_RESET_BACKOFF - When we start a reset, we want to stop any
- * other users acquiring the struct_mutex. To do this we set the
- * #I915_RESET_BACKOFF bit in the error flags when we detect a reset
- * and then check for that bit before acquiring the struct_mutex (in
- * i915_mutex_lock_interruptible()?). I915_RESET_BACKOFF serves a
- * secondary role in preventing two concurrent global reset attempts.
+ * #I915_RESET_BACKOFF - When we start a global reset, we need to
+ * serialise with any other users attempting to do the same, and
+ * any global resources that may be clobber by the reset (such as
+ * FENCE registers).
*
* #I915_RESET_ENGINE[num_engines] - Since the driver doesn't need to
* acquire the struct_mutex to reset an engine, we need an explicit
@@ -272,6 +270,8 @@ struct i915_gpu_error {
*/
wait_queue_head_t reset_queue;
+ struct srcu_struct srcu;
+
struct i915_gpu_restart *restart;
};
diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index 4462007a681c..4de07c83d724 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -648,6 +648,7 @@ static void reset_prepare(struct drm_i915_private *i915)
reset_prepare_engine(engine);
intel_uc_sanitize(i915);
+ i915_gem_revoke_fences(i915);
}
static int gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
@@ -911,50 +912,19 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
return ret;
}
-struct __i915_reset {
- struct drm_i915_private *i915;
- unsigned int stalled_mask;
-};
-
-static int __i915_reset__BKL(void *data)
-{
- struct __i915_reset *arg = data;
- int err;
-
- err = intel_gpu_reset(arg->i915, ALL_ENGINES);
- if (err)
- return err;
-
- return gt_reset(arg->i915, arg->stalled_mask);
-}
-
-#if RESET_UNDER_STOP_MACHINE
-/*
- * XXX An alternative to using stop_machine would be to park only the
- * processes that have a GGTT mmap. By remote parking the threads (SIGSTOP)
- * we should be able to prevent their memmory accesses via the lost fence
- * registers over the course of the reset without the potential recursive
- * of mutexes between the pagefault handler and reset.
- *
- * See igt/gem_mmap_gtt/hang
- */
-#define __do_reset(fn, arg) stop_machine(fn, arg, NULL)
-#else
-#define __do_reset(fn, arg) fn(arg)
-#endif
-
static int do_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
{
- struct __i915_reset arg = { i915, stalled_mask };
int err, i;
- err = __do_reset(__i915_reset__BKL, &arg);
+ err = intel_gpu_reset(i915, ALL_ENGINES);
for (i = 0; err && i < RESET_MAX_RETRIES; i++) {
msleep(100);
- err = __do_reset(__i915_reset__BKL, &arg);
+ err = intel_gpu_reset(i915, ALL_ENGINES);
}
+ if (err)
+ return err;
- return err;
+ return gt_reset(i915, stalled_mask);
}
/**
@@ -1277,6 +1247,9 @@ void i915_handle_error(struct drm_i915_private *i915,
goto out;
}
+ synchronize_rcu();
+ synchronize_srcu(&i915->gpu_error.srcu);
+
/* Prevent any other reset-engine attempt. */
for_each_engine(engine, i915, tmp) {
while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
@@ -1300,6 +1273,32 @@ void i915_handle_error(struct drm_i915_private *i915,
intel_runtime_pm_put(i915, wakeref);
}
+int i915_reset_lock(struct drm_i915_private *i915)
+{
+ int srcu;
+
+ rcu_read_lock();
+ while (test_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags)) {
+ rcu_read_unlock();
+
+ if (wait_event_interruptible(i915->gpu_error.reset_queue,
+ !test_bit(I915_RESET_BACKOFF,
+ &i915->gpu_error.flags)))
+ return -EINTR;
+
+ rcu_read_lock();
+ }
+ srcu = srcu_read_lock(&i915->gpu_error.srcu);
+ rcu_read_unlock();
+
+ return srcu;
+}
+
+void i915_reset_unlock(struct drm_i915_private *i915, int tag)
+{
+ srcu_read_unlock(&i915->gpu_error.srcu, tag);
+}
+
bool i915_reset_flush(struct drm_i915_private *i915)
{
int err;
diff --git a/drivers/gpu/drm/i915/i915_reset.h b/drivers/gpu/drm/i915/i915_reset.h
index f2d347f319df..eb412e0158da 100644
--- a/drivers/gpu/drm/i915/i915_reset.h
+++ b/drivers/gpu/drm/i915/i915_reset.h
@@ -9,6 +9,7 @@
#include <linux/compiler.h>
#include <linux/types.h>
+#include <linux/srcu.h>
struct drm_i915_private;
struct intel_engine_cs;
@@ -32,6 +33,9 @@ int i915_reset_engine(struct intel_engine_cs *engine,
void i915_reset_request(struct i915_request *rq, bool guilty);
bool i915_reset_flush(struct drm_i915_private *i915);
+int i915_reset_lock(struct drm_i915_private *i915);
+void i915_reset_unlock(struct drm_i915_private *i915, int tag);
+
bool intel_has_gpu_reset(struct drm_i915_private *i915);
bool intel_has_reset_engine(struct drm_i915_private *i915);
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index d83b8ad5f859..f46336dfa011 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -893,8 +893,6 @@ void i915_vma_revoke_mmap(struct i915_vma *vma)
struct drm_vma_offset_node *node = &vma->obj->base.vma_node;
u64 vma_offset;
- lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
-
if (!i915_vma_has_userfault(vma))
return;
diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
index 5793abe509a2..b9a93c34c0ff 100644
--- a/drivers/gpu/drm/i915/i915_vma.h
+++ b/drivers/gpu/drm/i915/i915_vma.h
@@ -400,14 +400,9 @@ static inline struct page *i915_vma_first_page(struct i915_vma *vma)
* True if the vma has a fence, false otherwise.
*/
int i915_vma_pin_fence(struct i915_vma *vma);
+void __i915_vma_unpin_fence(struct i915_vma *vma);
int __must_check i915_vma_put_fence(struct i915_vma *vma);
-static inline void __i915_vma_unpin_fence(struct i915_vma *vma)
-{
- GEM_BUG_ON(vma->fence->pin_count <= 0);
- vma->fence->pin_count--;
-}
-
/**
* i915_vma_unpin_fence - unpin fencing state
* @vma: vma to unpin fencing for
diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
index 14ae46fda49f..074a0d9cbf26 100644
--- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
+++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
@@ -189,6 +189,7 @@ struct drm_i915_private *mock_gem_device(void)
init_waitqueue_head(&i915->gpu_error.wait_queue);
init_waitqueue_head(&i915->gpu_error.reset_queue);
+ init_srcu_struct(&i915->gpu_error.srcu);
mutex_init(&i915->gpu_error.wedge_mutex);
i915->wq = alloc_ordered_workqueue("mock", 0);
--
2.20.1
More information about the Intel-gfx-trybot
mailing list