[Intel-gfx] [PATCH 03/22] drm/i915/gem: Implement legacy MI_STORE_DATA_IMM
Chris Wilson
chris at chris-wilson.co.uk
Mon May 4 13:08:56 UTC 2020
Quoting Tvrtko Ursulin (2020-05-04 13:58:46)
>
> On 04/05/2020 05:48, Chris Wilson wrote:
> > +static bool can_store_dword(const struct intel_engine_cs *engine)
> > +{
> > + return engine->class != VIDEO_DECODE_CLASS || !IS_GEN(engine->i915, 6);
> > +}
>
> A bit confusing to a future reader how it differs from
> intel_engine_can_store_dword, but apart from adding a comment I don't
> have any better ideas.
can_use_engine_for_reloc_without_killing_the_machine()?
Or perhaps: if (!engine_can_reloc_gpu()) ?
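Something like this, say (just a sketch; the comment wording is mine):

static bool engine_can_reloc_gpu(const struct intel_engine_cs *engine)
{
	/*
	 * Any engine except the gen6 video decode engine, which is not
	 * trusted with the MI_STORE_DATA_IMM used for GPU relocations.
	 */
	return engine->class != VIDEO_DECODE_CLASS || !IS_GEN(engine->i915, 6);
}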
>
> > +
> > static u32 *reloc_gpu(struct i915_execbuffer *eb,
> > struct i915_vma *vma,
> > unsigned int len)
> > @@ -1387,9 +1392,9 @@ static u32 *reloc_gpu(struct i915_execbuffer *eb,
> > if (unlikely(!cache->rq)) {
> > struct intel_engine_cs *engine = eb->engine;
> >
> > - if (!intel_engine_can_store_dword(engine)) {
> > + if (!can_store_dword(engine)) {
> > engine = engine->gt->engine_class[COPY_ENGINE_CLASS][0];
> > - if (!engine || !intel_engine_can_store_dword(engine))
> > + if (!engine)
> > return ERR_PTR(-ENODEV);
> > }
> >
> > @@ -1435,91 +1440,138 @@ static inline bool use_reloc_gpu(struct i915_vma *vma)
> > return !dma_resv_test_signaled_rcu(vma->resv, true);
> > }
> >
> > -static u64
> > -relocate_entry(struct i915_vma *vma,
> > - const struct drm_i915_gem_relocation_entry *reloc,
> > - struct i915_execbuffer *eb,
> > - const struct i915_vma *target)
> > +static unsigned long vma_phys_addr(struct i915_vma *vma, u32 offset)
> > {
> > - u64 offset = reloc->offset;
> > - u64 target_offset = relocation_target(reloc, target);
> > - bool wide = eb->reloc_cache.use_64bit_reloc;
> > - void *vaddr;
> > + struct page *page;
> > + unsigned long addr;
> >
> > - if (!eb->reloc_cache.vaddr && use_reloc_gpu(vma)) {
> > - const unsigned int gen = eb->reloc_cache.gen;
> > - unsigned int len;
> > - u32 *batch;
> > - u64 addr;
> > + GEM_BUG_ON(vma->pages != vma->obj->mm.pages);
> >
> > - if (wide)
> > - len = offset & 7 ? 8 : 5;
> > - else if (gen >= 4)
> > - len = 4;
> > - else
> > - len = 3;
> > + page = i915_gem_object_get_page(vma->obj, offset >> PAGE_SHIFT);
> > + addr = PFN_PHYS(page_to_pfn(page));
> > + GEM_BUG_ON(overflows_type(addr, u32)); /* expected dma32 */
> >
> > - batch = reloc_gpu(eb, vma, len);
> > - if (IS_ERR(batch))
> > - goto repeat;
> > + return addr + offset_in_page(offset);
> > +}
> > +
> > +static bool __reloc_entry_gpu(struct i915_vma *vma,
> > + struct i915_execbuffer *eb,
> > + u64 offset,
> > + u64 target_addr)
> > +{
> > + const unsigned int gen = eb->reloc_cache.gen;
> > + unsigned int len;
> > + u32 *batch;
> > + u64 addr;
> > +
> > + if (gen >= 8)
> > + len = offset & 7 ? 8 : 5;
>
> This used to be driven off eb->reloc_cache.use_64bit_reloc; any practical
> effect of the change? Doesn't seem so. Should you still use it for
> consistency though?
It used to be because we had already pulled it into a local 'wide'. I
switched to gen here for consistency with the if-else ladders.
>
> > + else if (gen >= 4)
> > + len = 4;
> > + else
> > + len = 3;
> > +
> > + batch = reloc_gpu(eb, vma, len);
> > + if (IS_ERR(batch))
> > + return false;
> > +
> > + addr = gen8_canonical_addr(vma->node.start + offset);
> > + if (gen >= 8) {
> > + if (offset & 7) {
> > + *batch++ = MI_STORE_DWORD_IMM_GEN4;
> > + *batch++ = lower_32_bits(addr);
> > + *batch++ = upper_32_bits(addr);
> > + *batch++ = lower_32_bits(target_addr);
> > +
> > + addr = gen8_canonical_addr(addr + 4);
> >
> > - addr = gen8_canonical_addr(vma->node.start + offset);
> > - if (wide) {
> > - if (offset & 7) {
> > - *batch++ = MI_STORE_DWORD_IMM_GEN4;
> > - *batch++ = lower_32_bits(addr);
> > - *batch++ = upper_32_bits(addr);
> > - *batch++ = lower_32_bits(target_offset);
> > -
> > - addr = gen8_canonical_addr(addr + 4);
> > -
> > - *batch++ = MI_STORE_DWORD_IMM_GEN4;
> > - *batch++ = lower_32_bits(addr);
> > - *batch++ = upper_32_bits(addr);
> > - *batch++ = upper_32_bits(target_offset);
> > - } else {
> > - *batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1;
> > - *batch++ = lower_32_bits(addr);
> > - *batch++ = upper_32_bits(addr);
> > - *batch++ = lower_32_bits(target_offset);
> > - *batch++ = upper_32_bits(target_offset);
> > - }
> > - } else if (gen >= 6) {
> > *batch++ = MI_STORE_DWORD_IMM_GEN4;
> > - *batch++ = 0;
> > - *batch++ = addr;
> > - *batch++ = target_offset;
> > - } else if (gen >= 4) {
> > - *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
> > - *batch++ = 0;
> > - *batch++ = addr;
> > - *batch++ = target_offset;
> > + *batch++ = lower_32_bits(addr);
> > + *batch++ = upper_32_bits(addr);
> > + *batch++ = upper_32_bits(target_addr);
> > } else {
> > - *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
> > - *batch++ = addr;
> > - *batch++ = target_offset;
> > + *batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1;
> > + *batch++ = lower_32_bits(addr);
> > + *batch++ = upper_32_bits(addr);
> > + *batch++ = lower_32_bits(target_addr);
> > + *batch++ = upper_32_bits(target_addr);
> > }
> > -
> > - goto out;
> > + } else if (gen >= 6) {
> > + *batch++ = MI_STORE_DWORD_IMM_GEN4;
> > + *batch++ = 0;
> > + *batch++ = addr;
> > + *batch++ = target_addr;
> > + } else if (IS_I965G(eb->i915)) {
> > + *batch++ = MI_STORE_DWORD_IMM_GEN4;
> > + *batch++ = 0;
> > + *batch++ = vma_phys_addr(vma, offset);
> > + *batch++ = target_addr;
> > + } else if (gen >= 4) {
> > + *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
> > + *batch++ = 0;
> > + *batch++ = addr;
> > + *batch++ = target_addr;
> > + } else if (gen >= 3 &&
> > + !(IS_I915G(eb->i915) || IS_I915GM(eb->i915))) {
> > + *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
> > + *batch++ = addr;
> > + *batch++ = target_addr;
> > + } else {
> > + *batch++ = MI_STORE_DWORD_IMM;
> > + *batch++ = vma_phys_addr(vma, offset);
> > + *batch++ = target_addr;
> > }
> >
> > + return true;
> > +}
> > +
> > +static bool reloc_entry_gpu(struct i915_vma *vma,
> > + struct i915_execbuffer *eb,
> > + u64 offset,
> > + u64 target_addr)
> > +{
> > + if (eb->reloc_cache.vaddr)
> > + return false;
> > +
> > + if (!use_reloc_gpu(vma))
> > + return false;
> > +
> > + return __reloc_entry_gpu(vma, eb, offset, target_addr);
> > +}
> > +
> > +static u64
> > +relocate_entry(struct i915_vma *vma,
> > + const struct drm_i915_gem_relocation_entry *reloc,
> > + struct i915_execbuffer *eb,
> > + const struct i915_vma *target)
> > +{
> > + u64 target_addr = relocation_target(reloc, target);
> > + u64 offset = reloc->offset;
> > +
> > + if (!reloc_entry_gpu(vma, eb, offset, target_addr)) {
> > + bool wide = eb->reloc_cache.use_64bit_reloc;
> > + void *vaddr;
> > +
> > repeat:
> > - vaddr = reloc_vaddr(vma->obj, &eb->reloc_cache, offset >> PAGE_SHIFT);
> > - if (IS_ERR(vaddr))
> > - return PTR_ERR(vaddr);
> > + vaddr = reloc_vaddr(vma->obj,
> > + &eb->reloc_cache,
> > + offset >> PAGE_SHIFT);
> > + if (IS_ERR(vaddr))
> > + return PTR_ERR(vaddr);
> >
> > - clflush_write32(vaddr + offset_in_page(offset),
> > - lower_32_bits(target_offset),
> > - eb->reloc_cache.vaddr);
> > + GEM_BUG_ON(!IS_ALIGNED(offset, sizeof(u32)));
> > + clflush_write32(vaddr + offset_in_page(offset),
> > + lower_32_bits(target_addr),
> > + eb->reloc_cache.vaddr);
> >
> > - if (wide) {
> > - offset += sizeof(u32);
> > - target_offset >>= 32;
> > - wide = false;
> > - goto repeat;
> > + if (wide) {
> > + offset += sizeof(u32);
> > + target_addr >>= 32;
> > + wide = false;
> > + goto repeat;
> > + }
> > }
> >
> > -out:
> > return target->node.start | UPDATE;
> > }
> >
> > @@ -3022,3 +3074,7 @@ end:;
> > kvfree(exec2_list);
> > return err;
> > }
>
> The diff is a bit messy but looks okay and in CI we trust.
>
> > +
> > +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> > +#include "selftests/i915_gem_execbuffer.c"
> > +#endif
> > diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
> > new file mode 100644
> > index 000000000000..985f9fbd0ba0
> > --- /dev/null
> > +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
> > @@ -0,0 +1,206 @@
> > +// SPDX-License-Identifier: MIT
> > +/*
> > + * Copyright © 2020 Intel Corporation
> > + */
> > +
> > +#include "i915_selftest.h"
> > +
> > +#include "gt/intel_engine_pm.h"
> > +#include "selftests/igt_flush_test.h"
> > +
> > +static void hexdump(const void *buf, size_t len)
>
> Uh-oh, this seems to be the third copy! ;)
* jedi handwave
Yeah, I'm close to pulling them into i915_selftests.c as pr_hexdump(). A
certain Tvrtko might one day win his argument to land the wrapper in lib/
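Roughly along these lines if it moves into the common selftest code (sketch
only; pr_hexdump() doesn't exist yet, and the duplicate-row skipping just
mirrors the existing copies):

static void pr_hexdump(const void *buf, size_t len)
{
	const size_t rowsize = 8 * sizeof(u32);
	const void *prev = NULL;
	bool skip = false;
	size_t pos;

	for (pos = 0; pos < len; pos += rowsize) {
		char line[128];

		/* collapse runs of identical rows, like hexdump -C's '*' */
		if (prev && !memcmp(prev, buf + pos, rowsize)) {
			if (!skip) {
				pr_info("*\n");
				skip = true;
			}
			continue;
		}

		WARN_ON_ONCE(hex_dump_to_buffer(buf + pos, len - pos,
						rowsize, sizeof(u32),
						line, sizeof(line),
						false) >= sizeof(line));
		pr_info("[%04zx] %s\n", pos, line);

		prev = buf + pos;
		skip = false;
	}
}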
> > +static int __igt_gpu_reloc(struct i915_execbuffer *eb,
> > + struct drm_i915_gem_object *obj)
> > +{
> > + enum {
> > + X = 0,
> > + Y = 3,
> > + Z = 8
> > + };
> > + const u64 mask = GENMASK_ULL(eb->reloc_cache.gen >= 8 ? 63 : 31, 0);
>
> Mask is to remove the poison? use_64bit_reloc instead of gen, or is this
> more correct?
Might as well just use use_64bit_reloc. I suppose doing a manual check
against gen might help notice a use_64bit_reloc slip?
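i.e. (sketch):

	/*
	 * On platforms with 32-bit relocations only the low dword is
	 * written back, so mask off the poisoned upper half.
	 */
	const u64 mask =
		GENMASK_ULL(eb->reloc_cache.use_64bit_reloc ? 63 : 31, 0);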
>
> > + const u32 *map = page_mask_bits(obj->mm.mapping);
> > + struct i915_request *rq;
> > + struct i915_vma *vma;
> > + int inc;
> > + int err;
> > +
> > + vma = i915_vma_instance(obj, eb->context->vm, NULL);
> > + if (IS_ERR(vma))
> > + return PTR_ERR(vma);
> > +
> > + err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_HIGH);
> > + if (err)
> > + return err;
> > +
> > + /* 8-Byte aligned */
> > + if (!__reloc_entry_gpu(vma, eb, X * sizeof(u32), X)) {
> > + err = -EIO;
> > + goto unpin_vma;
> > + }
> > +
> > + /* !8-Byte aligned */
>
> What is the significance of the non-8-byte-aligned case?
That's the if (offset & 7) path in the gen8 code. If the offset is 8-byte
aligned we can do a single QWord MI_STORE_DATA_IMM; if it is only 4-byte
aligned, we need two DWord MI_STORE_DATA_IMMs.
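That is also where the len accounting at the top comes from (comments are
mine):

	/*
	 * gen8+ sizes for writing a 64-bit relocation value:
	 *  - 8-byte aligned: one qword MI_STORE_DWORD_IMM
	 *      1 (header) + 2 (address) + 2 (value) = 5 dwords
	 *  - only 4-byte aligned: two dword MI_STORE_DWORD_IMM
	 *      2 * (1 (header) + 2 (address) + 1 (value)) = 8 dwords
	 */
	len = offset & 7 ? 8 : 5;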
>
> > + if (!__reloc_entry_gpu(vma, eb, Y * sizeof(u32), Y)) {
> > + err = -EIO;
> > + goto unpin_vma;
> > + }
> > +
> > + /* Skip to the end of the cmd page */
> > + inc = PAGE_SIZE / sizeof(u32) - RELOC_TAIL - 1;
> > + inc -= eb->reloc_cache.rq_size;
> > + memset(eb->reloc_cache.rq_cmd + eb->reloc_cache.rq_size,
> > + 0, inc * sizeof(u32));
> > + eb->reloc_cache.rq_size += inc;
> > +
> > + /* Force batch chaining */
> > + if (!__reloc_entry_gpu(vma, eb, Z * sizeof(u32), Z)) {
> > + err = -EIO;
> > + goto unpin_vma;
> > + }
> > +
> > + GEM_BUG_ON(!eb->reloc_cache.rq);
> > + rq = i915_request_get(eb->reloc_cache.rq);
> > + err = reloc_gpu_flush(&eb->reloc_cache);
> > + if (err)
> > + goto put_rq;
> > + GEM_BUG_ON(eb->reloc_cache.rq);
> > +
> > + err = i915_gem_object_wait(obj, I915_WAIT_INTERRUPTIBLE, HZ / 2);
> > + if (err) {
> > + intel_gt_set_wedged(eb->engine->gt);
> > + goto put_rq;
> > + }
> > +
> > + if (!i915_request_completed(rq)) {
> > + pr_err("%s: did not wait for relocations!\n", eb->engine->name);
> > + err = -EINVAL;
> > + goto put_rq;
> > + }
> > +
> > + if (read_reloc(map, X, mask) != X) {
> > + pr_err("%s[X]: map[%d] %llx != %x\n",
> > + eb->engine->name, X, read_reloc(map, X, mask), X);
> > + err = -EINVAL;
> > + }
> > +
> > + if (read_reloc(map, Y, mask) != Y) {
> > + pr_err("%s[Y]: map[%d] %llx != %x\n",
> > + eb->engine->name, Y, read_reloc(map, Y, mask), Y);
> > + err = -EINVAL;
> > + }
> > +
> > + if (read_reloc(map, Z, mask) != Z) {
> > + pr_err("%s[Z]: map[%d] %llx != %x\n",
> > + eb->engine->name, Z, read_reloc(map, Z, mask), Z);
> > + err = -EINVAL;
> > + }
>
> Why couldn't this just be an array of [0, 3, 8] instead of enums, and
> then all these tests could be a single loop? In other words, I can't
> figure out the benefit of the enum. Okay, in the test phase it can't be
> a simple loop since it needs the special case for the last iteration,
> but here it could be.
If you saw how far this came from the first version... I just liked X,
Y, Z as labels.
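For reference, the loop over the read-back checks you describe would look
something like this (sketch only; read_reloc/map/mask as above):

	static const u32 offsets[] = { 0, 3, 8 }; /* X, Y, Z */
	int i;

	for (i = 0; i < ARRAY_SIZE(offsets); i++) {
		const u32 v = offsets[i];

		if (read_reloc(map, v, mask) != v) {
			pr_err("%s: map[%u] %llx != %x\n",
			       eb->engine->name, v,
			       read_reloc(map, v, mask), v);
			err = -EINVAL;
		}
	}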
-Chris