[Intel-gfx] [PATCH 03/22] drm/i915/gem: Implement legacy MI_STORE_DATA_IMM
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Mon May 4 12:58:46 UTC 2020
On 04/05/2020 05:48, Chris Wilson wrote:
> The older arches did not convert MI_STORE_DATA_IMM to using the GTT, but
> left them writing to a physical address. The notes suggest that the
> primary reason would be so that the writes were cache coherent, as the
> CPU cache uses physical tagging. As such we did not implement the
> legacy variant of MI_STORE_DATA_IMM and so left all the relocations
> synchronous -- but with a small function to convert from the vma address
> into the physical address, we can implement asynchronous relocs on these
> older arches, fixing up a few tests that require them.
>
> In order to be able to test the legacy paths, refactor the gpu
> relocations so that we can hook them up to a selftest.
>
> Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/757
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> .../gpu/drm/i915/gem/i915_gem_execbuffer.c | 204 ++++++++++-------
> .../i915/gem/selftests/i915_gem_execbuffer.c | 206 ++++++++++++++++++
> .../drm/i915/selftests/i915_live_selftests.h | 1 +
> 3 files changed, 337 insertions(+), 74 deletions(-)
> create mode 100644 drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
>
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> index ab0d4df13c0b..44d7da0e200e 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> @@ -955,7 +955,7 @@ static void reloc_cache_init(struct reloc_cache *cache,
> cache->needs_unfenced = INTEL_INFO(i915)->unfenced_needs_alignment;
> cache->node.flags = 0;
> cache->rq = NULL;
> - cache->rq_size = 0;
> + cache->target = NULL;
> }
>
> static inline void *unmask_page(unsigned long p)
> @@ -1325,7 +1325,7 @@ static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
>
> ce = intel_context_create(engine);
> if (IS_ERR(ce)) {
> - err = PTR_ERR(rq);
Ouch, sloppy reviewer. :)
> + err = PTR_ERR(ce);
> goto err_unpin;
> }
>
> @@ -1376,6 +1376,11 @@ static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
> return err;
> }
>
> +static bool can_store_dword(const struct intel_engine_cs *engine)
> +{
> + return engine->class != VIDEO_DECODE_CLASS || !IS_GEN(engine->i915, 6);
> +}
A bit confusing to a future reader how it differs from
intel_engine_can_store_dword, but apart from adding a comment I don't
have any better ideas.
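Maybe something along these lines (just my interpretation of the
intent, so please correct it if I misread):

	/*
	 * Unlike intel_engine_can_store_dword(), the reloc batch can now
	 * fall back to the physical-address variants of MI_STORE_DATA_IMM
	 * via vma_phys_addr() on the platforms that need them, so the only
	 * engine left to avoid is the snb video ring.
	 */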
> +
> static u32 *reloc_gpu(struct i915_execbuffer *eb,
> struct i915_vma *vma,
> unsigned int len)
> @@ -1387,9 +1392,9 @@ static u32 *reloc_gpu(struct i915_execbuffer *eb,
> if (unlikely(!cache->rq)) {
> struct intel_engine_cs *engine = eb->engine;
>
> - if (!intel_engine_can_store_dword(engine)) {
> + if (!can_store_dword(engine)) {
> engine = engine->gt->engine_class[COPY_ENGINE_CLASS][0];
> - if (!engine || !intel_engine_can_store_dword(engine))
> + if (!engine)
> return ERR_PTR(-ENODEV);
> }
>
> @@ -1435,91 +1440,138 @@ static inline bool use_reloc_gpu(struct i915_vma *vma)
> return !dma_resv_test_signaled_rcu(vma->resv, true);
> }
>
> -static u64
> -relocate_entry(struct i915_vma *vma,
> - const struct drm_i915_gem_relocation_entry *reloc,
> - struct i915_execbuffer *eb,
> - const struct i915_vma *target)
> +static unsigned long vma_phys_addr(struct i915_vma *vma, u32 offset)
> {
> - u64 offset = reloc->offset;
> - u64 target_offset = relocation_target(reloc, target);
> - bool wide = eb->reloc_cache.use_64bit_reloc;
> - void *vaddr;
> + struct page *page;
> + unsigned long addr;
>
> - if (!eb->reloc_cache.vaddr && use_reloc_gpu(vma)) {
> - const unsigned int gen = eb->reloc_cache.gen;
> - unsigned int len;
> - u32 *batch;
> - u64 addr;
> + GEM_BUG_ON(vma->pages != vma->obj->mm.pages);
>
> - if (wide)
> - len = offset & 7 ? 8 : 5;
> - else if (gen >= 4)
> - len = 4;
> - else
> - len = 3;
> + page = i915_gem_object_get_page(vma->obj, offset >> PAGE_SHIFT);
> + addr = PFN_PHYS(page_to_pfn(page));
> + GEM_BUG_ON(overflows_type(addr, u32)); /* expected dma32 */
>
> - batch = reloc_gpu(eb, vma, len);
> - if (IS_ERR(batch))
> - goto repeat;
> + return addr + offset_in_page(offset);
> +}
> +
> +static bool __reloc_entry_gpu(struct i915_vma *vma,
> + struct i915_execbuffer *eb,
> + u64 offset,
> + u64 target_addr)
> +{
> + const unsigned int gen = eb->reloc_cache.gen;
> + unsigned int len;
> + u32 *batch;
> + u64 addr;
> +
> + if (gen >= 8)
> + len = offset & 7 ? 8 : 5;
This used to be driven off eb->reloc_cache.use_64bit_reloc. Is there
any practical effect to the change? There doesn't seem to be, but
should you still use it for consistency?
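I.e. keep it as (assuming use_64bit_reloc is still just
HAS_64BIT_RELOC, which is gen >= 8, so no functional change):

	if (eb->reloc_cache.use_64bit_reloc)
		len = offset & 7 ? 8 : 5;

Then it also reads the same as the CPU fallback in relocate_entry.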
> + else if (gen >= 4)
> + len = 4;
> + else
> + len = 3;
> +
> + batch = reloc_gpu(eb, vma, len);
> + if (IS_ERR(batch))
> + return false;
> +
> + addr = gen8_canonical_addr(vma->node.start + offset);
> + if (gen >= 8) {
> + if (offset & 7) {
> + *batch++ = MI_STORE_DWORD_IMM_GEN4;
> + *batch++ = lower_32_bits(addr);
> + *batch++ = upper_32_bits(addr);
> + *batch++ = lower_32_bits(target_addr);
> +
> + addr = gen8_canonical_addr(addr + 4);
>
> - addr = gen8_canonical_addr(vma->node.start + offset);
> - if (wide) {
> - if (offset & 7) {
> - *batch++ = MI_STORE_DWORD_IMM_GEN4;
> - *batch++ = lower_32_bits(addr);
> - *batch++ = upper_32_bits(addr);
> - *batch++ = lower_32_bits(target_offset);
> -
> - addr = gen8_canonical_addr(addr + 4);
> -
> - *batch++ = MI_STORE_DWORD_IMM_GEN4;
> - *batch++ = lower_32_bits(addr);
> - *batch++ = upper_32_bits(addr);
> - *batch++ = upper_32_bits(target_offset);
> - } else {
> - *batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1;
> - *batch++ = lower_32_bits(addr);
> - *batch++ = upper_32_bits(addr);
> - *batch++ = lower_32_bits(target_offset);
> - *batch++ = upper_32_bits(target_offset);
> - }
> - } else if (gen >= 6) {
> *batch++ = MI_STORE_DWORD_IMM_GEN4;
> - *batch++ = 0;
> - *batch++ = addr;
> - *batch++ = target_offset;
> - } else if (gen >= 4) {
> - *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
> - *batch++ = 0;
> - *batch++ = addr;
> - *batch++ = target_offset;
> + *batch++ = lower_32_bits(addr);
> + *batch++ = upper_32_bits(addr);
> + *batch++ = upper_32_bits(target_addr);
> } else {
> - *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
> - *batch++ = addr;
> - *batch++ = target_offset;
> + *batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1;
> + *batch++ = lower_32_bits(addr);
> + *batch++ = upper_32_bits(addr);
> + *batch++ = lower_32_bits(target_addr);
> + *batch++ = upper_32_bits(target_addr);
> }
> -
> - goto out;
> + } else if (gen >= 6) {
> + *batch++ = MI_STORE_DWORD_IMM_GEN4;
> + *batch++ = 0;
> + *batch++ = addr;
> + *batch++ = target_addr;
> + } else if (IS_I965G(eb->i915)) {
> + *batch++ = MI_STORE_DWORD_IMM_GEN4;
> + *batch++ = 0;
> + *batch++ = vma_phys_addr(vma, offset);
> + *batch++ = target_addr;
> + } else if (gen >= 4) {
> + *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
> + *batch++ = 0;
> + *batch++ = addr;
> + *batch++ = target_addr;
> + } else if (gen >= 3 &&
> + !(IS_I915G(eb->i915) || IS_I915GM(eb->i915))) {
> + *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
> + *batch++ = addr;
> + *batch++ = target_addr;
> + } else {
> + *batch++ = MI_STORE_DWORD_IMM;
> + *batch++ = vma_phys_addr(vma, offset);
> + *batch++ = target_addr;
> }
>
> + return true;
> +}
> +
> +static bool reloc_entry_gpu(struct i915_vma *vma,
> + struct i915_execbuffer *eb,
> + u64 offset,
> + u64 target_addr)
> +{
> + if (eb->reloc_cache.vaddr)
> + return false;
> +
> + if (!use_reloc_gpu(vma))
> + return false;
> +
> + return __reloc_entry_gpu(vma, eb, offset, target_addr);
> +}
> +
> +static u64
> +relocate_entry(struct i915_vma *vma,
> + const struct drm_i915_gem_relocation_entry *reloc,
> + struct i915_execbuffer *eb,
> + const struct i915_vma *target)
> +{
> + u64 target_addr = relocation_target(reloc, target);
> + u64 offset = reloc->offset;
> +
> + if (!reloc_entry_gpu(vma, eb, offset, target_addr)) {
> + bool wide = eb->reloc_cache.use_64bit_reloc;
> + void *vaddr;
> +
> repeat:
> - vaddr = reloc_vaddr(vma->obj, &eb->reloc_cache, offset >> PAGE_SHIFT);
> - if (IS_ERR(vaddr))
> - return PTR_ERR(vaddr);
> + vaddr = reloc_vaddr(vma->obj,
> + &eb->reloc_cache,
> + offset >> PAGE_SHIFT);
> + if (IS_ERR(vaddr))
> + return PTR_ERR(vaddr);
>
> - clflush_write32(vaddr + offset_in_page(offset),
> - lower_32_bits(target_offset),
> - eb->reloc_cache.vaddr);
> + GEM_BUG_ON(!IS_ALIGNED(offset, sizeof(u32)));
> + clflush_write32(vaddr + offset_in_page(offset),
> + lower_32_bits(target_addr),
> + eb->reloc_cache.vaddr);
>
> - if (wide) {
> - offset += sizeof(u32);
> - target_offset >>= 32;
> - wide = false;
> - goto repeat;
> + if (wide) {
> + offset += sizeof(u32);
> + target_addr >>= 32;
> + wide = false;
> + goto repeat;
> + }
> }
>
> -out:
> return target->node.start | UPDATE;
> }
>
> @@ -3022,3 +3074,7 @@ end:;
> kvfree(exec2_list);
> return err;
> }
The diff is a bit messy but looks okay, and in CI we trust.
> +
> +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> +#include "selftests/i915_gem_execbuffer.c"
> +#endif
> diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
> new file mode 100644
> index 000000000000..985f9fbd0ba0
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
> @@ -0,0 +1,206 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2020 Intel Corporation
> + */
> +
> +#include "i915_selftest.h"
> +
> +#include "gt/intel_engine_pm.h"
> +#include "selftests/igt_flush_test.h"
> +
> +static void hexdump(const void *buf, size_t len)
Uh-oh, this seems to be the third copy! ;) Perhaps time to hoist it
into a shared helper in i915_selftest.h or thereabouts?
> +{
> + const size_t rowsize = 8 * sizeof(u32);
> + const void *prev = NULL;
> + bool skip = false;
> + size_t pos;
> +
> + for (pos = 0; pos < len; pos += rowsize) {
> + char line[128];
> +
> + if (prev && !memcmp(prev, buf + pos, rowsize)) {
> + if (!skip) {
> + pr_info("*\n");
> + skip = true;
> + }
> + continue;
> + }
> +
> + WARN_ON_ONCE(hex_dump_to_buffer(buf + pos, len - pos,
> + rowsize, sizeof(u32),
> + line, sizeof(line),
> + false) >= sizeof(line));
> + pr_info("[%04zx] %s\n", pos, line);
> +
> + prev = buf + pos;
> + skip = false;
> + }
> +}
> +
> +static u64 read_reloc(const u32 *map, int x, const u64 mask)
> +{
> + u64 reloc;
> +
> + memcpy(&reloc, &map[x], sizeof(reloc));
> + return reloc & mask;
> +}
> +
> +static int __igt_gpu_reloc(struct i915_execbuffer *eb,
> + struct drm_i915_gem_object *obj)
> +{
> + enum {
> + X = 0,
> + Y = 3,
> + Z = 8
> + };
> + const u64 mask = GENMASK_ULL(eb->reloc_cache.gen >= 8 ? 63 : 31, 0);
Is the mask there to remove the poison? And should it check
use_64bit_reloc instead of gen, or is this way more correct?
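If it is the poison, maybe worth a comment, something like (my
wording, assuming I read it right):

	/*
	 * read_reloc() memcpys a full u64, but below gen8 the GPU only
	 * writes the low dword, leaving POISON_INUSE in the upper half,
	 * so the 32-bit mask strips it before the comparison.
	 */
	const u64 mask = GENMASK_ULL(eb->reloc_cache.gen >= 8 ? 63 : 31, 0);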
> + const u32 *map = page_mask_bits(obj->mm.mapping);
> + struct i915_request *rq;
> + struct i915_vma *vma;
> + int inc;
> + int err;
> +
> + vma = i915_vma_instance(obj, eb->context->vm, NULL);
> + if (IS_ERR(vma))
> + return PTR_ERR(vma);
> +
> + err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_HIGH);
> + if (err)
> + return err;
> +
> + /* 8-Byte aligned */
> + if (!__reloc_entry_gpu(vma, eb, X * sizeof(u32), X)) {
> + err = -EIO;
> + goto unpin_vma;
> + }
> +
> + /* !8-Byte aligned */
What is the significance of the non-8-byte-aligned case? Presumably to
exercise the gen8+ path which has to split a misaligned 64-bit write
into two MI_STORE_DWORD_IMM?
> + if (!__reloc_entry_gpu(vma, eb, Y * sizeof(u32), Y)) {
> + err = -EIO;
> + goto unpin_vma;
> + }
> +
> + /* Skip to the end of the cmd page */
> + inc = PAGE_SIZE / sizeof(u32) - RELOC_TAIL - 1;
> + inc -= eb->reloc_cache.rq_size;
> + memset(eb->reloc_cache.rq_cmd + eb->reloc_cache.rq_size,
> + 0, inc * sizeof(u32));
> + eb->reloc_cache.rq_size += inc;
> +
> + /* Force batch chaining */
> + if (!__reloc_entry_gpu(vma, eb, Z * sizeof(u32), Z)) {
> + err = -EIO;
> + goto unpin_vma;
> + }
> +
> + GEM_BUG_ON(!eb->reloc_cache.rq);
> + rq = i915_request_get(eb->reloc_cache.rq);
> + err = reloc_gpu_flush(&eb->reloc_cache);
> + if (err)
> + goto put_rq;
> + GEM_BUG_ON(eb->reloc_cache.rq);
> +
> + err = i915_gem_object_wait(obj, I915_WAIT_INTERRUPTIBLE, HZ / 2);
> + if (err) {
> + intel_gt_set_wedged(eb->engine->gt);
> + goto put_rq;
> + }
> +
> + if (!i915_request_completed(rq)) {
> + pr_err("%s: did not wait for relocations!\n", eb->engine->name);
> + err = -EINVAL;
> + goto put_rq;
> + }
> +
> + if (read_reloc(map, X, mask) != X) {
> + pr_err("%s[X]: map[%d] %llx != %x\n",
> + eb->engine->name, X, read_reloc(map, X, mask), X);
> + err = -EINVAL;
> + }
> +
> + if (read_reloc(map, Y, mask) != Y) {
> + pr_err("%s[Y]: map[%d] %llx != %x\n",
> + eb->engine->name, Y, read_reloc(map, Y, mask), Y);
> + err = -EINVAL;
> + }
> +
> + if (read_reloc(map, Z, mask) != Z) {
> + pr_err("%s[Z]: map[%d] %llx != %x\n",
> + eb->engine->name, Z, read_reloc(map, Z, mask), Z);
> + err = -EINVAL;
> + }
Why couldn't this be an array of [0, 3, 8] instead of enums, so that
all these checks become a single loop? In other words, I can't figure
out the benefit of the enum. Granted, the write phase above can't be a
simple loop since it needs the special case (forcing batch chaining)
before the last entry, but here it could be.
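Untested sketch of what I mean, reusing the existing read_reloc(), map
and mask from above:

	static const int offsets[] = { 0, 3, 8 }; /* X, Y, Z */
	int i;

	for (i = 0; i < ARRAY_SIZE(offsets); i++) {
		u64 reloc = read_reloc(map, offsets[i], mask);

		if (reloc != offsets[i]) {
			pr_err("%s: map[%d] %llx != %x\n",
			       eb->engine->name, offsets[i],
			       reloc, offsets[i]);
			err = -EINVAL;
		}
	}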
> +
> + if (err)
> + hexdump(map, 4096);
> +
> +put_rq:
> + i915_request_put(rq);
> +unpin_vma:
> + i915_vma_unpin(vma);
> + return err;
> +}
> +
> +static int igt_gpu_reloc(void *arg)
> +{
> + struct i915_execbuffer eb;
> + struct drm_i915_gem_object *scratch;
> + int err = 0;
> + u32 *map;
> +
> + eb.i915 = arg;
> +
> + scratch = i915_gem_object_create_internal(eb.i915, 4096);
> + if (IS_ERR(scratch))
> + return PTR_ERR(scratch);
> +
> + map = i915_gem_object_pin_map(scratch, I915_MAP_WC);
> + if (IS_ERR(map)) {
> + err = PTR_ERR(map);
> + goto err_scratch;
> + }
> +
> + for_each_uabi_engine(eb.engine, eb.i915) {
> + reloc_cache_init(&eb.reloc_cache, eb.i915);
> + memset(map, POISON_INUSE, 4096);
> +
> + intel_engine_pm_get(eb.engine);
> + eb.context = intel_context_create(eb.engine);
> + if (IS_ERR(eb.context)) {
> + err = PTR_ERR(eb.context);
> + goto err_pm;
> + }
> +
> + err = intel_context_pin(eb.context);
> + if (err)
> + goto err_put;
> +
> + err = __igt_gpu_reloc(&eb, scratch);
> +
> + intel_context_unpin(eb.context);
> +err_put:
> + intel_context_put(eb.context);
> +err_pm:
> + intel_engine_pm_put(eb.engine);
> + if (err)
> + break;
> + }
> +
> + if (igt_flush_test(eb.i915))
> + err = -EIO;
> +
> +err_scratch:
> + i915_gem_object_put(scratch);
> + return err;
> +}
> +
> +int i915_gem_execbuffer_live_selftests(struct drm_i915_private *i915)
> +{
> + static const struct i915_subtest tests[] = {
> + SUBTEST(igt_gpu_reloc),
> + };
> +
> + if (intel_gt_is_wedged(&i915->gt))
> + return 0;
> +
> + return i915_live_subtests(tests, i915);
> +}
> diff --git a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
> index 0a953bfc0585..5dd5d81646c4 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
> +++ b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
> @@ -37,6 +37,7 @@ selftest(gem, i915_gem_live_selftests)
> selftest(evict, i915_gem_evict_live_selftests)
> selftest(hugepages, i915_gem_huge_page_live_selftests)
> selftest(gem_contexts, i915_gem_context_live_selftests)
> +selftest(gem_execbuf, i915_gem_execbuffer_live_selftests)
> selftest(blt, i915_gem_object_blt_live_selftests)
> selftest(client, i915_gem_client_blt_live_selftests)
> selftest(reset, intel_reset_live_selftests)
>
Regards,
Tvrtko