[Intel-gfx] [PATCH v3 12/37] drm/i915/blt: support copying objects

Sat Aug 10 10:45:04 UTC 2019

Quoting Matthew Auld (2019-08-09 23:26:18)
> +struct i915_vma *intel_emit_vma_copy_blt(struct intel_engine_pool_node **p,
> +                                        struct intel_context *ce,
> +                                        struct i915_vma *src,
> +                                        struct i915_vma *dst)
> +{
> +       struct drm_i915_private *i915 = ce->vm->i915;
> +       const u32 block_size = S16_MAX * PAGE_SIZE;
> +       struct intel_engine_pool_node *pool;
> +       struct i915_vma *batch;
> +       u64 src_offset, dst_offset;
> +       u64 count;
> +       u64 rem;
> +       u32 size;
> +       u32 *cmd;
> +       int err;
> +
> +       GEM_BUG_ON(src->size != dst->size);
> +
> +       count = div_u64(dst->size, block_size);
> +       size = (1 + 11 * count) * sizeof(u32);
> +       size = round_up(size, PAGE_SIZE);
> +       pool = intel_engine_pool_get(&ce->engine->pool, size);
> +       if (IS_ERR(pool))
> +               return ERR_CAST(pool);
> +
> +       cmd = i915_gem_object_pin_map(pool->obj, I915_MAP_WC);
> +       if (IS_ERR(cmd)) {
> +               err = PTR_ERR(cmd);
> +               goto out_put;
> +       }
> +
> +       rem = src->size;
> +       src_offset = src->node.start;
> +       dst_offset = dst->node.start;
> +
> +       do {
> +               u32 size = min_t(u64, rem, block_size);
> +
> +               GEM_BUG_ON(size >> PAGE_SHIFT > S16_MAX);
> +
> +               if (INTEL_GEN(i915) >= 9) {
> +                       *cmd++ = GEN9_XY_FAST_COPY_BLT_CMD | (10 - 2);
> +                       *cmd++ = BLT_DEPTH_32 | PAGE_SIZE;
> +                       *cmd++ = 0;
> +                       *cmd++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
> +                       *cmd++ = lower_32_bits(dst_offset);
> +                       *cmd++ = upper_32_bits(dst_offset);
> +                       *cmd++ = 0;
> +                       *cmd++ = PAGE_SIZE;
> +                       *cmd++ = lower_32_bits(src_offset);
> +                       *cmd++ = upper_32_bits(src_offset);
> +               } else if (INTEL_GEN(i915) >= 8) {
> +                       *cmd++ = XY_SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (10 - 2);
> +                       *cmd++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE;
> +                       *cmd++ = 0;
> +                       *cmd++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
> +                       *cmd++ = lower_32_bits(dst_offset);
> +                       *cmd++ = upper_32_bits(dst_offset);
> +                       *cmd++ = 0;
> +                       *cmd++ = PAGE_SIZE;
> +                       *cmd++ = lower_32_bits(src_offset);
> +                       *cmd++ = upper_32_bits(src_offset);
> +               } else {
> +                       *cmd++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
> +                       *cmd++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE;
> +                       *cmd++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE;
> +                       *cmd++ = dst_offset;
> +                       *cmd++ = PAGE_SIZE;
> +                       *cmd++ = src_offset;
> +               }
> +
> +               /* Allow ourselves to be preempted in between blocks. */
> +               *cmd++ = MI_ARB_CHECK;
> +
> +               src_offset += size;
> +               dst_offset += size;
> +               rem -= size;
> +       } while (rem);
> +
> +       *cmd = MI_BATCH_BUFFER_END;
> +       intel_gt_chipset_flush(ce->vm->gt);
> +
> +       i915_gem_object_unpin_map(pool->obj);
> +
> +       batch = i915_vma_instance(pool->obj, ce->vm, NULL);
> +       if (IS_ERR(batch)) {
> +               err = PTR_ERR(batch);
> +               goto out_put;
> +       }
> +
> +       err = i915_vma_pin(batch, 0, 0, PIN_USER);
> +       if (unlikely(err))
> +               goto out_put;
> +
> +       *p = pool;
> +       return batch;
> +
> +out_put:
> +       intel_engine_pool_put(pool);
> +       return ERR_PTR(err);
> +}
> +
> +int i915_gem_object_copy_blt(struct drm_i915_gem_object *src,
> +                            struct drm_i915_gem_object *dst,
> +                            struct intel_context *ce)
> +{
> +       struct drm_gem_object *objs[] = { &src->base, &dst->base };
> +       struct i915_address_space *vm = ce->vm;
> +       struct intel_engine_pool_node *pool;
> +       struct ww_acquire_ctx acquire;
> +       struct i915_vma *vma_src, *vma_dst;
> +       struct i915_vma *batch;
> +       struct i915_request *rq;
> +       int err;
> +
> +       vma_src = i915_vma_instance(src, vm, NULL);
> +       if (IS_ERR(vma_src))
> +               return PTR_ERR(vma_src);
> +
> +       err = i915_vma_pin(vma_src, 0, 0, PIN_USER);
> +       if (unlikely(err))
> +               return err;
> +
> +       vma_dst = i915_vma_instance(dst, vm, NULL);
> +       if (IS_ERR(vma_dst))
> +               goto out_unpin_src;
> +
> +       err = i915_vma_pin(vma_dst, 0, 0, PIN_USER);
> +       if (unlikely(err))
> +               goto out_unpin_src;
> +
> +       intel_engine_pm_get(ce->engine);
> +       batch = intel_emit_vma_copy_blt(&pool, ce, vma_src, vma_dst);
> +       if (IS_ERR(batch)) {
> +               err = PTR_ERR(batch);
> +               goto out_unpin_dst;
> +       }
> +
> +       rq = intel_context_create_request(ce);
> +       if (IS_ERR(rq)) {
> +               err = PTR_ERR(rq);
> +               goto out_batch;
> +       }
> +
> +       i915_vma_lock(batch);
> +       err = i915_vma_move_to_active(batch, rq, 0);
> +       i915_vma_unlock(batch);
> +       if (unlikely(err))
> +               goto out_request;
> +
> +       err = intel_engine_pool_mark_active(pool, rq);
> +       if (unlikely(err))
> +               goto out_request;
> +
> +       err = drm_gem_lock_reservations(objs, ARRAY_SIZE(objs), &acquire);
> +       if (unlikely(err))
> +               goto out_request;
> +
> +       if (src->cache_dirty & ~src->cache_coherent)
> +               i915_gem_clflush_object(src, 0);
> +
> +       if (dst->cache_dirty & ~dst->cache_coherent)
> +               i915_gem_clflush_object(dst, 0);
> +
> +       err = i915_request_await_object(rq, src, false);
> +       if (unlikely(err))
> +               goto out_unlock;
> +
> +       err = i915_vma_move_to_active(vma_src, rq, 0);
> +       if (unlikely(err))
> +               goto out_unlock;
> +
> +       err = i915_request_await_object(rq, dst, true);
> +       if (unlikely(err))
> +               goto out_unlock;
> +
> +       err = i915_vma_move_to_active(vma_dst, rq, EXEC_OBJECT_WRITE);
> +       if (unlikely(err))
> +               goto out_unlock;

Strictly, we should wait on all objects first, and only then set up all
the signals. That avoids any nasty cycles in the dependency graph, such as
when someone passes in src == dst. Time for another selftest ;)

for (i = 0; i < ARRAY_SIZE(obj); i++) {
	clflush_object(obj[i]);
	await_object(rq, obj[i]);
}

for (i = 0; i < ARRAY_SIZE(obj); i++)
	move_to_active(obj[i]);

> +
> +       if (ce->engine->emit_init_breadcrumb) {
> +               err = ce->engine->emit_init_breadcrumb(rq);
> +               if (unlikely(err))
> +                       goto out_unlock;
> +       }
> +
> +       err = ce->engine->emit_bb_start(rq,
> +                                       batch->node.start, batch->node.size,
> +                                       0);
> +out_unlock:
> +       drm_gem_unlock_reservations(objs, ARRAY_SIZE(objs), &acquire);
> +out_request:
> +       if (unlikely(err))
> +               i915_request_skip(rq, err);
> +
> +       i915_request_add(rq);
> +out_batch:
> +       i915_vma_unpin(batch);
> +       intel_engine_pool_put(pool);
> +out_unpin_dst:
> +       i915_vma_unpin(vma_dst);
> +       intel_engine_pm_put(ce->engine);
> +out_unpin_src:
> +       i915_vma_unpin(vma_src);
> +       return err;
> +}