[Intel-gfx] [RFC PATCH 5/5] drm/i915: Parallel execbuf wip
Maarten Lankhorst
maarten.lankhorst at linux.intel.com
Fri Dec 6 16:43:17 UTC 2019
This doesn't work yet, but it shows the direction I'm going in.

Instead of relying on struct_mutex, we rely on the object locks to
parallelize submission.

We still have some issues because eb_lookup_vmas() pins the vma, so
parallel submission doesn't work completely yet. I'm hoping to find a
good way to handle this. Ideally we would drop the pinning and rely
only on the object lock, but the driver doesn't support that yet.

Posting this as an RFC to get some feedback on the direction we should
be going.
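For reference, the locking pattern this series moves towards looks
roughly like the sketch below. This is only an illustration of the
i915_gem_ww_ctx acquire/backoff loop: the helpers are the ones used in
this patch, but the wrapper function itself is made up for the example;
the real retry loop lives in eb_relocate_and_parse_cmdbuf_backoff().

/*
 * Illustrative sketch only: take all object locks under one ww acquire
 * context; on -EDEADLK drop what we hold, back off on the contended
 * lock and retry the whole acquire phase.
 */
static int eb_lock_all_objects_sketch(struct i915_execbuffer *eb)
{
	unsigned int i;
	int err = 0;

	i915_gem_ww_ctx_init(&eb->ww, true);
retry:
	for (i = 0; i < eb->buffer_count; i++) {
		err = i915_gem_object_lock(&eb->ww, eb->vma[i].vma->obj);
		if (err)
			break;
	}

	if (err == -EDEADLK) {
		/*
		 * Unreserve the vmas we pinned, let the ww ctx back off
		 * on the contended lock, then retry the acquire phase.
		 */
		eb_unreserve_vmas(eb);
		err = i915_gem_ww_ctx_backoff(&eb->ww);
		if (!err)
			goto retry;
	}

	if (err)
		i915_gem_ww_ctx_fini(&eb->ww);

	return err;
}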
Signed-off-by: Maarten Lankhorst <maarten.lankhorst at linux.intel.com>
---
.../gpu/drm/i915/gem/i915_gem_execbuffer.c | 295 ++++++++++--------
drivers/gpu/drm/i915/i915_cmd_parser.c | 15 +-
drivers/gpu/drm/i915/i915_drv.h | 6 -
3 files changed, 171 insertions(+), 145 deletions(-)
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 02fdf081c1be..ee753684f0ba 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -234,6 +234,8 @@ struct i915_execbuffer {
struct drm_i915_gem_exec_object2 *exec; /** ioctl execobj[] */
struct eb_vma *vma;
+ struct i915_gem_ww_ctx ww;
+
struct intel_engine_cs *engine; /** engine to queue the request to */
struct intel_context *context; /* logical state for the request */
struct i915_gem_context *gem_context; /** caller's context */
@@ -286,6 +288,9 @@ struct i915_execbuffer {
struct hlist_head *buckets; /** ht for relocation handles */
};
+static int eb_parse(struct i915_execbuffer *eb,
+ struct intel_engine_pool_node *pool);
+
/*
* Used to convert any address to canonical form.
* Starting from gen8, some commands (e.g. STATE_BASE_ADDRESS,
@@ -504,8 +509,10 @@ eb_add_vma(struct i915_execbuffer *eb,
if (!(eb->args->flags & __EXEC_VALIDATED)) {
err = eb_validate_vma(eb, entry, vma);
- if (unlikely(err))
+ if (unlikely(err)) {
+ DRM_DEBUG_DRIVER("eb_validate_vma => %i\n", err);
return err;
+ }
}
ev->vma = vma;
@@ -551,8 +558,10 @@ eb_add_vma(struct i915_execbuffer *eb,
eb_unreserve_vma(ev);
list_add_tail(&ev->bind_link, &eb->unbound);
- if (drm_mm_node_allocated(&vma->node))
+ if (drm_mm_node_allocated(&vma->node)) {
err = i915_vma_unbind(vma);
+ WARN(err, "i915_vma_unbind => %i\n", err);
+ }
}
return err;
}
@@ -574,7 +583,7 @@ static inline int use_cpu_reloc(const struct reloc_cache *cache,
obj->cache_level != I915_CACHE_NONE);
}
-static int eb_reserve_vma(const struct i915_execbuffer *eb, struct eb_vma *ev)
+static int eb_reserve_vma(struct i915_execbuffer *eb, struct eb_vma *ev)
{
struct drm_i915_gem_exec_object2 *entry = ev->exec;
unsigned int exec_flags = ev->flags;
@@ -582,6 +591,10 @@ static int eb_reserve_vma(const struct i915_execbuffer *eb, struct eb_vma *ev)
u64 pin_flags;
int err;
+ err = i915_gem_object_lock(&eb->ww, vma->obj);
+ if (err)
+ return err;
+
pin_flags = PIN_USER | PIN_NONBLOCK;
if (exec_flags & EXEC_OBJECT_NEEDS_GTT)
pin_flags |= PIN_GLOBAL;
@@ -797,9 +810,7 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb)
lut->handle = handle;
lut->ctx = eb->gem_context;
- i915_gem_object_lock(NULL, obj);
list_add(&lut->obj_link, &obj->lut_list);
- i915_gem_object_unlock(obj);
add_vma:
err = eb_add_vma(eb, i, batch, vma);
@@ -812,15 +823,33 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb)
mutex_unlock(&eb->gem_context->mutex);
+ if (unlikely(eb->batch->flags & EXEC_OBJECT_WRITE)) {
+ DRM_DEBUG("Attempting to use self-modifying batch buffer\n");
+ return -EINVAL;
+ }
+
+ if (range_overflows_t(u64,
+ eb->batch_start_offset, eb->batch_len,
+ eb->batch->vma->size)) {
+ DRM_DEBUG("Attempting to use out-of-bounds batch\n");
+ return -EINVAL;
+ }
+
+ if (eb->batch_len == 0)
+ eb->batch_len = eb->batch->vma->size - eb->batch_start_offset;
+
eb->args->flags |= __EXEC_VALIDATED;
return eb_reserve(eb);
err_obj:
i915_gem_object_put(obj);
+ DRM_DEBUG_DRIVER("err_obj() => %i\n", err);
err_vma:
eb->vma[i].vma = NULL;
+ DRM_DEBUG_DRIVER("err_vma() => %i\n", err);
err_ctx:
mutex_unlock(&eb->gem_context->mutex);
+ DRM_DEBUG_DRIVER("err_ctx() => %i\n", err);
return err;
}
@@ -866,6 +895,15 @@ static void eb_release_vmas(const struct i915_execbuffer *eb)
}
}
+static void eb_unreserve_vmas(struct i915_execbuffer *eb)
+{
+ const unsigned int count = eb->buffer_count;
+ unsigned int i;
+
+ for (i = 0; i < count; i++)
+ eb_unreserve_vma(&eb->vma[i]);
+}
+
static void eb_reset_vmas(const struct i915_execbuffer *eb)
{
eb_release_vmas(eb);
@@ -1121,7 +1159,7 @@ static int reloc_move_to_gpu(struct i915_request *rq, struct i915_vma *vma)
struct drm_i915_gem_object *obj = vma->obj;
int err;
- i915_vma_lock(vma);
+ assert_vma_held(vma);
if (obj->cache_dirty & ~obj->cache_coherent)
i915_gem_clflush_object(obj, 0);
@@ -1131,8 +1169,6 @@ static int reloc_move_to_gpu(struct i915_request *rq, struct i915_vma *vma)
if (err == 0)
err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE);
- i915_vma_unlock(vma);
-
return err;
}
@@ -1190,11 +1226,10 @@ static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
if (err)
goto skip_request;
- i915_vma_lock(batch);
+ assert_vma_held(batch);
err = i915_request_await_object(rq, batch->obj, false);
if (err == 0)
err = i915_vma_move_to_active(batch, rq, 0);
- i915_vma_unlock(batch);
if (err)
goto skip_request;
@@ -1445,6 +1480,11 @@ static int eb_relocate_vma(struct i915_execbuffer *eb, struct eb_vma *ev)
struct drm_i915_gem_relocation_entry __user *urelocs;
const struct drm_i915_gem_exec_object2 *entry = ev->exec;
unsigned int remain;
+ int err;
+
+ err = i915_gem_object_lock(&eb->ww, ev->vma->obj);
+ if (err)
+ return err;
urelocs = u64_to_user_ptr(entry->relocs_ptr);
remain = entry->relocation_count;
@@ -1675,14 +1715,35 @@ static int eb_prefault_relocations(const struct i915_execbuffer *eb)
return 0;
}
+static int eb_lock_and_parse_cmdbuffer(struct i915_execbuffer *eb)
+{
+ struct intel_engine_pool_node *pool;
+ int err;
+
+ if (!eb_use_cmdparser(eb))
+ return 0;
+
+ pool = intel_engine_get_pool(eb->engine, eb->batch_len);
+ if (IS_ERR(pool))
+ return PTR_ERR(pool);
+
+ err = i915_gem_object_lock(&eb->ww, pool->obj);
+ if (!err)
+ err = eb_parse(eb, pool);
+
+ if (err)
+ intel_engine_pool_put(pool);
+
+ return err;
+}
+
static noinline int eb_relocate_slow(struct i915_execbuffer *eb)
{
- struct drm_device *dev = &eb->i915->drm;
bool have_copy = false;
struct eb_vma *ev;
int err = 0;
-repeat:
+repeat_pass:
if (signal_pending(current)) {
err = -ERESTARTSYS;
goto out;
@@ -1690,10 +1751,10 @@ static noinline int eb_relocate_slow(struct i915_execbuffer *eb)
/* We may process another execbuffer during the unlock... */
eb_reset_vmas(eb);
- mutex_unlock(&dev->struct_mutex);
+ i915_gem_ww_ctx_fini(&eb->ww);
/*
- * We take 3 passes through the slowpatch.
+ * We take 3 passes through the slowpath.
*
* 1 - we try to just prefault all the user relocation entries and
* then attempt to reuse the atomic pagefault disabled fast path again.
@@ -1714,20 +1775,14 @@ static noinline int eb_relocate_slow(struct i915_execbuffer *eb)
cond_resched();
err = 0;
}
- if (err) {
- mutex_lock(&dev->struct_mutex);
+ i915_gem_ww_ctx_init(&eb->ww, true);
+ if (err)
goto out;
- }
/* A frequent cause for EAGAIN are currently unavailable client pages */
flush_workqueue(eb->i915->mm.userptr_wq);
- err = i915_mutex_lock_interruptible(dev);
- if (err) {
- mutex_lock(&dev->struct_mutex);
- goto out;
- }
-
+repeat_reloc:
/* reacquire the objects */
err = eb_lookup_vmas(eb);
if (err)
@@ -1740,8 +1795,10 @@ static noinline int eb_relocate_slow(struct i915_execbuffer *eb)
pagefault_disable();
err = eb_relocate_vma(eb, ev);
pagefault_enable();
- if (err)
- goto repeat;
+ if (err == -EDEADLK)
+ goto err;
+ else if (err)
+ goto repeat_pass;
} else {
err = eb_relocate_vma_slow(eb, ev);
if (err)
@@ -1749,6 +1806,9 @@ static noinline int eb_relocate_slow(struct i915_execbuffer *eb)
}
}
+ if (!err)
+ err = eb_lock_and_parse_cmdbuffer(eb);
+
/*
* Leave the user relocations as are, this is the painfully slow path,
* and we want to avoid the complication of dropping the lock whilst
@@ -1757,8 +1817,15 @@ static noinline int eb_relocate_slow(struct i915_execbuffer *eb)
*/
err:
+ if (err == -EDEADLK) {
+ eb_reset_vmas(eb);
+
+ err = i915_gem_ww_ctx_backoff(&eb->ww);
+ if (!err)
+ goto repeat_reloc;
+ }
if (err == -EAGAIN)
- goto repeat;
+ goto repeat_pass;
out:
if (have_copy) {
@@ -1781,60 +1848,79 @@ static noinline int eb_relocate_slow(struct i915_execbuffer *eb)
return err;
}
-static int eb_relocate(struct i915_execbuffer *eb)
+static int eb_relocate_fast(struct i915_execbuffer *eb)
{
- if (eb_lookup_vmas(eb))
- goto slow;
+ int err;
/* The objects are in their final locations, apply the relocations. */
if (eb->args->flags & __EXEC_HAS_RELOC) {
struct eb_vma *ev;
list_for_each_entry(ev, &eb->relocs, reloc_link) {
- if (eb_relocate_vma(eb, ev))
- goto slow;
+ err = eb_relocate_vma(eb, ev);
+ if (err)
+ return err;
}
}
return 0;
-
-slow:
- return eb_relocate_slow(eb);
}
-static int eb_move_to_gpu(struct i915_execbuffer *eb)
+static int eb_relocate_and_parse_cmdbuf_backoff(struct i915_execbuffer *eb)
{
- const unsigned int count = eb->buffer_count;
- struct ww_acquire_ctx acquire;
- unsigned int i;
- int err = 0;
+ int err;
- ww_acquire_init(&acquire, &reservation_ww_class);
+ err = eb_lookup_vmas(eb);
+ if (err) {
+ DRM_DEBUG_DRIVER("eb_lookup_vmas() => %i\n", err);
+ return err;
+ }
- for (i = 0; i < count; i++) {
- struct eb_vma *ev = &eb->vma[i];
- struct i915_vma *vma = ev->vma;
+ DRM_DEBUG_DRIVER("eb_reserve() => %i\n", err);
- err = ww_mutex_lock_interruptible(&vma->resv->lock, &acquire);
- if (err == -EDEADLK) {
- GEM_BUG_ON(i == 0);
- do {
- int j = i - 1;
+retry_fast:
+ if (!err)
+ err = eb_relocate_fast(eb);
+ if (!err) {
+ err = eb_lock_and_parse_cmdbuffer(eb);
+ DRM_DEBUG_DRIVER("Parsing cmdbuf returns %i\n", err);
- ww_mutex_unlock(&eb->vma[j].vma->resv->lock);
+ /* return on success, or any error except -EDEADLK */
+ if (err != -EDEADLK)
+ return err;
+ }
- swap(eb->vma[i], eb->vma[j]);
- } while (--i);
+ if (err == -EDEADLK) {
+ eb_unreserve_vmas(eb);
- err = ww_mutex_lock_slow_interruptible(&vma->resv->lock,
- &acquire);
- }
- if (err == -EALREADY)
- err = 0;
+ err = i915_gem_ww_ctx_backoff(&eb->ww);
if (err)
- break;
+ return err;
+
+ goto retry_fast;
}
- ww_acquire_done(&acquire);
+
+ err = eb_relocate_slow(eb);
+ if (err)
+ /*
+ * If the user expects the execobject.offset and
+ * reloc.presumed_offset to be an exact match,
+ * as for using NO_RELOC, then we cannot update
+ * the execobject.offset until we have completed
+ * relocation.
+ */
+ eb->args->flags &= ~__EXEC_HAS_RELOC;
+
+ DRM_DEBUG_DRIVER("Slow relocation returns %i\n", err);
+ return err;
+}
+
+static int eb_move_to_gpu(struct i915_execbuffer *eb)
+{
+ unsigned int i;
+ int err = 0;
+
+ ww_acquire_done(&eb->ww.ctx);
while (i--) {
struct eb_vma *ev = &eb->vma[i];
@@ -1880,15 +1966,12 @@ static int eb_move_to_gpu(struct i915_execbuffer *eb)
if (err == 0)
err = i915_vma_move_to_active(vma, eb->request, flags);
- i915_vma_unlock(vma);
-
__eb_unreserve_vma(vma, flags);
if (unlikely(flags & __EXEC_OBJECT_HAS_REF))
i915_vma_put(vma);
ev->vma = NULL;
}
- ww_acquire_fini(&acquire);
if (unlikely(err))
goto err_skip;
@@ -1990,21 +2073,17 @@ shadow_batch_pin(struct i915_execbuffer *eb, struct drm_i915_gem_object *obj)
return vma;
}
-static struct i915_vma *eb_parse(struct i915_execbuffer *eb)
+static int eb_parse(struct i915_execbuffer *eb,
+ struct intel_engine_pool_node *pool)
{
- struct intel_engine_pool_node *pool;
struct i915_vma *vma;
u64 batch_start;
u64 shadow_batch_start;
int err;
- pool = intel_engine_get_pool(eb->engine, eb->batch_len);
- if (IS_ERR(pool))
- return ERR_CAST(pool);
-
vma = shadow_batch_pin(eb, pool->obj);
if (IS_ERR(vma))
- goto err;
+ return PTR_ERR(vma);
batch_start = gen8_canonical_addr(eb->batch->vma->node.start) +
eb->batch_start_offset;
@@ -2028,12 +2107,13 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb)
* For PPGTT backing however, we have no choice but to forcibly
* reject unsafe buffers
*/
- if (i915_vma_is_ggtt(vma) && err == -EACCES)
+ if (i915_vma_is_ggtt(vma) && err == -EACCES) {
/* Execute original buffer non-secure */
- vma = NULL;
- else
- vma = ERR_PTR(err);
- goto err;
+ intel_engine_pool_put(pool);
+ return 0;
+ }
+
+ return err;
}
eb->vma[eb->buffer_count].vma = i915_vma_get(vma);
@@ -2049,11 +2129,7 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb)
/* eb->batch_len unchanged */
vma->private = pool;
- return vma;
-
-err:
- intel_engine_pool_put(pool);
- return vma;
+ return 0;
}
static void
@@ -2485,6 +2561,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
eb.batch_len = args->batch_len;
eb.batch_flags = 0;
+
if (args->flags & I915_EXEC_SECURE) {
if (INTEL_GEN(i915) >= 11)
return -ENODEV;
@@ -2529,68 +2606,36 @@ i915_gem_do_execbuffer(struct drm_device *dev,
}
err = eb_create(&eb);
- if (err)
+ if (err) {
+ DRM_DEBUG_DRIVER("eb_create() => %i\n", err);
goto err_out_fence;
+ }
GEM_BUG_ON(!eb.lut_size);
err = eb_select_context(&eb);
- if (unlikely(err))
+ if (unlikely(err)) {
+ DRM_DEBUG_DRIVER("eb_select_context() => %i\n", err);
goto err_destroy;
+ }
err = eb_pin_engine(&eb, file, args);
- if (unlikely(err))
+ if (unlikely(err)) {
+ DRM_DEBUG_DRIVER("eb_pin_engine() => %i\n", err);
goto err_context;
-
- err = i915_mutex_lock_interruptible(dev);
- if (err)
- goto err_engine;
-
- err = eb_relocate(&eb);
- if (err) {
- /*
- * If the user expects the execobject.offset and
- * reloc.presumed_offset to be an exact match,
- * as for using NO_RELOC, then we cannot update
- * the execobject.offset until we have completed
- * relocation.
- */
- args->flags &= ~__EXEC_HAS_RELOC;
- goto err_vma;
}
- if (unlikely(eb.batch->flags & EXEC_OBJECT_WRITE)) {
- DRM_DEBUG("Attempting to use self-modifying batch buffer\n");
- err = -EINVAL;
- goto err_vma;
- }
+ i915_gem_ww_ctx_init(&eb.ww, true);
- batch = eb.batch->vma;
- if (range_overflows_t(u64,
- eb.batch_start_offset, eb.batch_len,
- batch->size)) {
- DRM_DEBUG("Attempting to use out-of-bounds batch\n");
- err = -EINVAL;
+ err = eb_relocate_and_parse_cmdbuf_backoff(&eb);
+ if (err)
goto err_vma;
- }
-
- if (eb.batch_len == 0)
- eb.batch_len = eb.batch->vma->size - eb.batch_start_offset;
-
- if (eb_use_cmdparser(&eb)) {
- struct i915_vma *vma;
-
- vma = eb_parse(&eb);
- if (IS_ERR(vma)) {
- err = PTR_ERR(vma);
- goto err_vma;
- }
- }
/*
* snb/ivb/vlv conflate the "batch in ppgtt" bit with the "non-secure
* batch" bit. Hence we need to pin secure batches into the global gtt.
* hsw should have this fixed, but bdw mucks it up again. */
+ batch = eb.batch->vma;
if (eb.batch_flags & I915_DISPATCH_SECURE) {
struct i915_vma *vma;
@@ -2607,7 +2652,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
vma = i915_gem_object_ggtt_pin(batch->obj, NULL, 0, 0, 0);
if (IS_ERR(vma)) {
err = PTR_ERR(vma);
- goto err_vma;
+ goto err_pool;
}
batch = vma;
@@ -2684,13 +2729,13 @@ i915_gem_do_execbuffer(struct drm_device *dev,
err_batch_unpin:
if (eb.batch_flags & I915_DISPATCH_SECURE)
i915_vma_unpin(batch);
+err_pool:
if (batch->private)
intel_engine_pool_put(batch->private);
err_vma:
if (eb.exec)
eb_release_vmas(&eb);
- mutex_unlock(&dev->struct_mutex);
-err_engine:
+ i915_gem_ww_ctx_fini(&eb.ww);
eb_unpin_engine(&eb);
err_context:
i915_gem_context_put(eb.gem_context);
diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index c5b9ec5b3d25..f5e821bf5d59 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -1136,31 +1136,19 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj,
void *dst, *src;
int ret;
- ret = i915_gem_object_lock_interruptible(NULL, dst_obj);
+ ret = i915_gem_object_prepare_write(dst_obj, &dst_needs_clflush);
if (ret)
return ERR_PTR(ret);
- ret = i915_gem_object_prepare_write(dst_obj, &dst_needs_clflush);
- if (ret) {
- i915_gem_object_unlock(dst_obj);
- return ERR_PTR(ret);
- }
-
dst = i915_gem_object_pin_map(dst_obj, I915_MAP_FORCE_WB);
i915_gem_object_finish_access(dst_obj);
- i915_gem_object_unlock(dst_obj);
if (IS_ERR(dst))
return dst;
- ret = i915_gem_object_lock_interruptible(NULL, src_obj);
- if (ret)
- return ERR_PTR(ret);
-
ret = i915_gem_object_prepare_read(src_obj, &src_needs_clflush);
if (ret) {
i915_gem_object_unpin_map(dst_obj);
- i915_gem_object_unlock(src_obj);
return ERR_PTR(ret);
}
@@ -1209,7 +1197,6 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj,
}
i915_gem_object_finish_access(src_obj);
- i915_gem_object_unlock(src_obj);
/* dst_obj is returned with vmap pinned */
*needs_clflush_after = dst_needs_clflush & CLFLUSH_AFTER;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index c3d8af28bfc1..ab8cb0c851f6 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1848,12 +1848,6 @@ int i915_gem_object_unbind(struct drm_i915_gem_object *obj,
void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv);
-static inline int __must_check
-i915_mutex_lock_interruptible(struct drm_device *dev)
-{
- return mutex_lock_interruptible(&dev->struct_mutex);
-}
-
int i915_gem_dumb_create(struct drm_file *file_priv,
struct drm_device *dev,
struct drm_mode_create_dumb *args);
--
2.24.0