[Intel-gfx] [PATCH 08/13] drm/i915: Eliminate lots of iterations over the execobjects array
Joonas Lahtinen
joonas.lahtinen at linux.intel.com
Tue Apr 4 14:57:34 UTC 2017
On Wed, 2017-03-29 at 16:56 +0100, Chris Wilson wrote:
> The major scaling bottleneck in execbuffer is the processing of the
> execobjects. Creating an auxiliary list is inefficient when compared to
> using the execobject array we already have allocated.
>
> Reservation is then split into phases. As we look up the VMA, we
> try to bind it back into its active location. Only if that fails do we add
> it to the unbound list for phase 2. In phase 2, we try to add all those
> objects that could not fit into their previous location, with fallback
> to retrying all objects and evicting the VM in case of severe
> fragmentation. (This is the same as before, except that phase 1 is now
> done inline with looking up the VMA to avoid an iteration over the
> execobject array. In the ideal case, we eliminate the separate reservation
> phase). During the reservation phase, we only evict from the VM between
> passes (rather than, as currently, every time we try to fit a new VMA).
> In testing with Unreal Engine's Atlantis demo, which stresses the
> eviction logic on gen7 class hardware, this speeds up the framerate by
> a factor of 2.
>
> The second loop amalgamation is between move_to_gpu and move_to_active.
> As we always submit the request, even if incomplete, we can use the
> current request to track active VMA as we perform the flushes and
> synchronisation required.
>
> The next big advancement is to avoid copying back to the user any
> execobjects and relocations that are unchanged.
>
> v2: Add a Theory of Operation spiel.
> v3: Fall back to slow relocations in preparation for flushing userptrs.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
<SNIP>
> struct i915_execbuffer {
> struct drm_i915_private *i915;
> struct drm_file *file;
> @@ -63,19 +180,24 @@ struct i915_execbuffer {
> struct i915_address_space *vm;
> struct i915_vma *batch;
> struct drm_i915_gem_request *request;
> - u32 batch_start_offset;
> - u32 batch_len;
> - unsigned int dispatch_flags;
> - struct drm_i915_gem_exec_object2 shadow_exec_entry;
> - bool need_relocs;
> - struct list_head vmas;
> + unsigned int buffer_count;
> + struct list_head unbound;
> + struct list_head relocs;
> struct reloc_cache {
> struct drm_mm_node node;
> unsigned long vaddr;
> unsigned int page;
> bool use_64bit_reloc : 1;
> + bool has_llc : 1;
> + bool has_fence : 1;
> + bool needs_unfenced : 1;
> } reloc_cache;
> - int lut_mask;
> + u64 invalid_flags;
> + u32 context_flags;
> + u32 dispatch_flags;
> + u32 batch_start_offset;
> + u32 batch_len;
> + int lut_size;
> struct hlist_head *buckets;
Please document (new) members.
> +static inline u64 gen8_noncanonical_addr(u64 address)
> +{
> + return address & ((1ULL << (GEN8_HIGH_ADDRESS_BIT + 1)) - 1);
GENMASK_ULL
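Something like this should work, I'd think (untested):

	return address & GENMASK_ULL(GEN8_HIGH_ADDRESS_BIT, 0);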
> @@ -106,71 +256,302 @@ eb_create(struct i915_execbuffer *eb)
> return -ENOMEM;
> }
>
> - eb->lut_mask = size;
> + eb->lut_size = size;
> } else {
> - eb->lut_mask = -eb->args->buffer_count;
> + eb->lut_size = -eb->buffer_count;
Document the negative meaning in the doc mentioned above.
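My understanding of the semantics, as a starting point (do correct me
if I got it wrong):

	/*
	 * lut_size >= 0: handle lookup uses a hashtable with
	 *		  1 << lut_size buckets.
	 * lut_size < 0: I915_EXEC_HANDLE_LUT is in use, handles are
	 *		 direct indices into the exec array and
	 *		 -lut_size is its length.
	 */
	int lut_size;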
> +static bool
> +eb_vma_misplaced(const struct drm_i915_gem_exec_object2 *entry,
> + const struct i915_vma *vma)
> +{
> + if ((entry->flags & __EXEC_OBJECT_HAS_PIN) == 0)
Let's try to stick to one convention; we gave up == NULL too, so
!(a & FOO).
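I.e.:

	if (!(entry->flags & __EXEC_OBJECT_HAS_PIN))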
<SNIP>
> + if ((entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) == 0 &&
> + (vma->node.start + vma->node.size - 1) >> 32)
upper_32_bits for clarity?
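Combined with the convention above, that'd be:

	if (!(entry->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) &&
	    upper_32_bits(vma->node.start + vma->node.size - 1))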
> +static void
> +eb_pin_vma(struct i915_execbuffer *eb,
> + struct drm_i915_gem_exec_object2 *entry,
> + struct i915_vma *vma)
> +{
> + u64 flags;
> +
> + flags = vma->node.start;
I'd be more comfortable if some mask was applied here.
Or at least GEM_BUG_ON(flags & BAD_FLAGS);
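Assuming PIN_OFFSET_MASK covers exactly the non-flag bits, something
like (untested):

	flags = vma->node.start;
	GEM_BUG_ON(flags & ~PIN_OFFSET_MASK);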
> +static inline void
> +eb_unreserve_vma(struct i915_vma *vma,
> + struct drm_i915_gem_exec_object2 *entry)
> {
> - struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
> -
> - __eb_unreserve_vma(vma, entry);
> - entry->flags &= ~(__EXEC_OBJECT_HAS_FENCE | __EXEC_OBJECT_HAS_PIN);
> + if (entry->flags & __EXEC_OBJECT_HAS_PIN) {
I'd treat the if more as an early return, but I guess you have more
code coming.
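For reference, the early return variant would be roughly:

	if (!(entry->flags & __EXEC_OBJECT_HAS_PIN))
		return;

	__eb_unreserve_vma(vma, entry);
	entry->flags &= ~(__EXEC_OBJECT_HAS_FENCE | __EXEC_OBJECT_HAS_PIN);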
> +static int
> +eb_add_vma(struct i915_execbuffer *eb,
> + struct drm_i915_gem_exec_object2 *entry,
> + struct i915_vma *vma)
> {
> - struct i915_vma *vma;
> + int ret;
>
> - list_for_each_entry(vma, &eb->vmas, exec_link) {
> - eb_unreserve_vma(vma);
> - i915_vma_put(vma);
> - vma->exec_entry = NULL;
> + GEM_BUG_ON(i915_vma_is_closed(vma));
> +
> + if ((eb->args->flags & __EXEC_VALIDATED) == 0) {
smells like a function here.
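E.g. an eb_validate_vma() helper (name made up by me) for the
one-time checks, leaving just this at the call site:

	if (!(eb->args->flags & __EXEC_VALIDATED)) {
		ret = eb_validate_vma(eb, entry);
		if (ret)
			return ret;
	}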
<SNIP>
> }
>
> - if (eb->lut_mask >= 0)
> - memset(eb->buckets, 0,
> - sizeof(struct hlist_head) << eb->lut_mask);
> + vma->exec_entry = entry;
> + entry->rsvd2 = (uintptr_t)vma;
Umm, there was a helper introduced in some patch.
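__exec_to_vma(), so this could read:

	vma->exec_entry = entry;
	__exec_to_vma(entry) = (uintptr_t)vma;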
<SNIP>
Could you add a comment as to why this is done?
> + if ((entry->flags & EXEC_OBJECT_PINNED) == 0)
> + entry->flags |= eb->context_flags;
> +
<SNIP>
> +
> +static int
> +eb_reserve_vma(struct i915_execbuffer *eb, struct i915_vma *vma)
> +{
<SNIP>
> - i915_vma_get(vma);
> - __exec_to_vma(&eb->exec[i]) = (uintptr_t)vma;
> - return true;
> + if (entry->flags & EXEC_OBJECT_NEEDS_FENCE) {
> + ret = i915_vma_get_fence(vma);
> + if (ret)
> + return ret;
> +
> + if (i915_vma_pin_fence(vma))
> + entry->flags |= __EXEC_OBJECT_HAS_FENCE;
> + }
Smells like duplicate code with eb_pin_vma.
<SNIP>
> static int
> eb_lookup_vmas(struct i915_execbuffer *eb)
> {
<SNIP>
> - if (ht_needs_resize(eb->ctx)) {
> - eb->ctx->vma_lut.ht_size |= I915_CTX_RESIZE_IN_PROGRESS;
> - queue_work(system_highpri_wq, &eb->ctx->vma_lut.resize);
> - }
> + if (eb->ctx->vma_lut.ht_size & I915_CTX_RESIZE_IN_PROGRESS) {
> + struct i915_gem_context_vma_lut *lut = &eb->ctx->vma_lut;
You could hoist the lut variable, it's used quite a few times.
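I.e. declare

	struct i915_gem_context_vma_lut *lut = &eb->ctx->vma_lut;

once at the top and use lut-> throughout.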
> @@ -616,16 +1048,15 @@ relocate_entry(struct drm_i915_gem_object *obj,
> goto repeat;
> }
>
> - return 0;
> + return gen8_canonical_addr(target->node.start) | 1;
Magic bit.
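A name would help, e.g. (invented by me):

	#define RELOC_UPDATED BIT(0)

	return gen8_canonical_addr(target->node.start) | RELOC_UPDATED;

with the matching ~RELOC_UPDATED at the __put_user() below.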
> +static int eb_relocate_vma(struct i915_execbuffer *eb, struct i915_vma *vma)
> {
> #define N_RELOC(x) ((x) / sizeof(struct drm_i915_gem_relocation_entry))
> - struct drm_i915_gem_relocation_entry stack_reloc[N_RELOC(512)];
> - struct drm_i915_gem_relocation_entry __user *user_relocs;
> - struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
> - int remain, ret = 0;
> -
> - user_relocs = u64_to_user_ptr(entry->relocs_ptr);
> + struct drm_i915_gem_relocation_entry stack[N_RELOC(512)];
> + struct drm_i915_gem_relocation_entry __user *urelocs;
> + const struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
> + unsigned int remain;
>
> + urelocs = u64_to_user_ptr(entry->relocs_ptr);
> remain = entry->relocation_count;
> - while (remain) {
> - struct drm_i915_gem_relocation_entry *r = stack_reloc;
> - unsigned long unwritten;
> - unsigned int count;
> + if (unlikely(remain > ULONG_MAX / sizeof(*urelocs)))
How about N_RELOC(ULONG_MAX)?
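I.e.:

	if (unlikely(remain > N_RELOC(ULONG_MAX)))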
> @@ -732,66 +1164,66 @@ static int eb_relocate_vma(struct i915_vma *vma, struct i915_execbuffer *eb)
> * this is bad and so lockdep complains vehemently.
> */
> pagefault_disable();
> - unwritten = __copy_from_user_inatomic(r, user_relocs, count*sizeof(r[0]));
> - pagefault_enable();
> - if (unlikely(unwritten)) {
> - ret = -EFAULT;
> + if (__copy_from_user_inatomic(r, urelocs, count*sizeof(r[0]))) {
> + pagefault_enable();
> + remain = -EFAULT;
> goto out;
> }
> + pagefault_enable();
Why dupe the pagefault_enable?
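Keeping the unwritten local would allow a single call (untested):

	pagefault_disable();
	unwritten = __copy_from_user_inatomic(r, urelocs,
					      count * sizeof(r[0]));
	pagefault_enable();
	if (unlikely(unwritten)) {
		remain = -EFAULT;
		goto out;
	}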
>
> + remain -= count;
> do {
> - u64 offset = r->presumed_offset;
> + u64 offset = eb_relocate_entry(eb, vma, r);
>
> - ret = eb_relocate_entry(vma, eb, r);
> - if (ret)
> + if (likely(offset == 0)) {
Sparse not complaining?
> + } else if ((s64)offset < 0) {
> + remain = (s64)offset;
> goto out;
> -
> - if (r->presumed_offset != offset) {
> + } else {
> + /* Note that reporting an error now
> + * leaves everything in an inconsistent
> + * state as we have *already* changed
> + * the relocation value inside the
> + * object. As we have not changed the
> + * reloc.presumed_offset and will not
> + * change the execobject.offset, on the
> + * next call we may not rewrite the value
> + * inside the object, leaving it
> + * dangling and causing a GPU hang.
> + */
> pagefault_disable();
> - unwritten = __put_user(r->presumed_offset,
> - &user_relocs->presumed_offset);
> + __put_user(offset & ~1,
Magic.
> + &urelocs[r-stack].presumed_offset);
There's the comment, so maybe worth if (__put_user()) DRM_DEBUG?
> pagefault_enable();
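I.e. something like:

	pagefault_disable();
	if (__put_user(offset & ~1,
		       &urelocs[r - stack].presumed_offset))
		DRM_DEBUG("Failed to write back presumed_offset\n");
	pagefault_enable();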
<SNIP>
> +static int check_relocations(const struct drm_i915_gem_exec_object2 *entry)
> {
<SNIP>
> + const unsigned long relocs_max =
> + ULONG_MAX / sizeof(struct drm_i915_gem_relocation_entry);
Could be a define, this is used above too.
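This is N_RELOC(ULONG_MAX) from eb_relocate_vma() again, so e.g.
(name made up):

	#define RELOCS_MAX \
		(ULONG_MAX / sizeof(struct drm_i915_gem_relocation_entry))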
<SNIP>
> + return __get_user(c, end - 1);
What's the point in this final check?
> }
>
> -static bool
> -need_reloc_mappable(struct i915_vma *vma)
> +static int
> +eb_copy_relocations(const struct i915_execbuffer *eb)
> {
> - struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
> -
> - if (entry->relocation_count == 0)
> - return false;
> -
> - if (!i915_vma_is_ggtt(vma))
> - return false;
> + const unsigned int count = eb->buffer_count;
> + unsigned int i;
> + int ret;
>
> - /* See also use_cpu_reloc() */
> - if (HAS_LLC(to_i915(vma->obj->base.dev)))
> - return false;
> + for (i = 0; i < count; i++) {
> + struct drm_i915_gem_relocation_entry __user *urelocs;
> + struct drm_i915_gem_relocation_entry *relocs;
> + unsigned int nreloc = eb->exec[i].relocation_count, j;
No hidden variables in assignment lines.
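I.e.:

	unsigned int nreloc = eb->exec[i].relocation_count;
	unsigned int j;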
> -static bool
> -eb_vma_misplaced(struct i915_vma *vma)
> -{
> - struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
> + urelocs = u64_to_user_ptr(eb->exec[i].relocs_ptr);
> + size = nreloc * sizeof(*relocs);
>
> - WARN_ON(entry->flags & __EXEC_OBJECT_NEEDS_MAP &&
> - !i915_vma_is_ggtt(vma));
> + relocs = drm_malloc_gfp(size, 1, GFP_TEMPORARY);
> + if (!relocs) {
> + drm_free_large(relocs);
> + ret = -ENOMEM;
> + goto err;
> + }
>
> - if (entry->alignment && !IS_ALIGNED(vma->node.start, entry->alignment))
> - return true;
> + /* copy_from_user is limited to 4GiB */
> + j = 0;
> + do {
> + u32 len = min_t(u64, 1ull<<31, size);
BIT_ULL(31)
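I.e.:

	u32 len = min_t(u64, BIT_ULL(31), size);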
> +static void eb_export_fence(struct drm_i915_gem_object *obj,
> + struct drm_i915_gem_request *req,
> + unsigned int flags)
> +{
> + struct reservation_object *resv = obj->resv;
> +
> + /* Ignore errors from failing to allocate the new fence, we can't
> + * handle an error right now. Worst case should be missed
> + * synchronisation leading to rendering corruption.
> + */
Worth a DRM_DEBUG?
> @@ -1155,10 +1524,33 @@ eb_move_to_gpu(struct i915_execbuffer *eb)
> }
>
> ret = i915_gem_request_await_object
> - (eb->request, obj, vma->exec_entry->flags & EXEC_OBJECT_WRITE);
> + (eb->request, obj, entry->flags & EXEC_OBJECT_WRITE);
> if (ret)
> return ret;
> +
> +skip_flushes:
> + obj->base.write_domain = 0;
> + if (entry->flags & EXEC_OBJECT_WRITE) {
> + obj->base.read_domains = 0;
> + if (!obj->cache_dirty && gpu_write_needs_clflush(obj))
> + obj->cache_dirty = true;
> + intel_fb_obj_invalidate(obj, ORIGIN_CS);
> + }
> + obj->base.read_domains |= I915_GEM_GPU_DOMAINS;
> +
> + i915_vma_move_to_active(vma, eb->request, entry->flags);
> + __eb_unreserve_vma(vma, entry);
> + vma->exec_entry = NULL;
This seems like a bugfix hunk that got lost in a refactoring patch.
> @@ -1377,16 +1629,16 @@ i915_reset_gen7_sol_offsets(struct drm_i915_gem_request *req)
> return -EINVAL;
> }
>
> - cs = intel_ring_begin(req, 4 * 3);
> + cs = intel_ring_begin(req, 4 * 2 + 2);
> if (IS_ERR(cs))
> return PTR_ERR(cs);
>
> + *cs++ = MI_LOAD_REGISTER_IMM(4);
> for (i = 0; i < 4; i++) {
> - *cs++ = MI_LOAD_REGISTER_IMM(1);
> *cs++ = i915_mmio_reg_offset(GEN7_SO_WRITE_OFFSET(i));
> *cs++ = 0;
> }
> -
> + *cs++ = MI_NOOP;
> intel_ring_advance(req, cs);
Again a lost hunk.
>
> return 0;
> @@ -1422,10 +1674,11 @@ static struct i915_vma *eb_parse(struct i915_execbuffer *eb, bool is_master)
> goto out;
>
> vma->exec_entry =
> - memset(&eb->shadow_exec_entry, 0, sizeof(*vma->exec_entry));
> + memset(&eb->exec[eb->buffer_count++],
> + 0, sizeof(*vma->exec_entry));
> vma->exec_entry->flags = __EXEC_OBJECT_HAS_PIN;
> - i915_gem_object_get(shadow_batch_obj);
> - list_add_tail(&vma->exec_link, &eb->vmas);
> + vma->exec_entry->rsvd2 = (uintptr_t)vma;
Use the helper.
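I.e.:

	__exec_to_vma(vma->exec_entry) = (uintptr_t)vma;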
> @@ -1901,56 +2135,63 @@ i915_gem_execbuffer2(struct drm_device *dev, void *data,
> struct drm_file *file)
> {
> struct drm_i915_gem_execbuffer2 *args = data;
> - struct drm_i915_gem_exec_object2 *exec2_list = NULL;
> + struct drm_i915_gem_exec_object2 *exec2_list;
> int ret;
>
> if (args->buffer_count < 1 ||
> - args->buffer_count > UINT_MAX / sizeof(*exec2_list)) {
> + args->buffer_count >= UINT_MAX / sizeof(*exec2_list) - 1) {
> DRM_DEBUG("execbuf2 with %d buffers\n", args->buffer_count);
> return -EINVAL;
> }
>
> - exec2_list = drm_malloc_gfp(args->buffer_count,
> + if (!i915_gem_check_execbuffer(args))
> + return -EINVAL;
> +
> + exec2_list = drm_malloc_gfp(args->buffer_count + 1,
The "+ 1" is very unintuitive without a comment.
With the comments added, this is (90%):
Reviewed-by: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
I'm pretty darn sure I ain't reviewing this again without some very
specific changelog and inter-diff.
Regards, Joonas
--
Joonas Lahtinen
Open Source Technology Center
Intel Corporation