[Intel-gfx] [PATCH 03/22] drm/i915/gem: Implement legacy MI_STORE_DATA_IMM

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Mon May 4 12:58:46 UTC 2020


On 04/05/2020 05:48, Chris Wilson wrote:
> The older arches did not convert MI_STORE_DATA_IMM to using the GTT, but
> left them writing to a physical address. The notes suggest that the
> primary reason would be so that the writes were cache coherent, as the
> CPU cache uses physical tagging. As such we did not implement the
> legacy variant of MI_STORE_DATA_IMM and so left all the relocations
> synchronous -- but with a small function to convert from the vma address
> into the physical address, we can implement asynchronous relocs on these
> older arches, fixing up a few tests that require them.
> 
> In order to be able to test the legacy paths, refactor the gpu
> relocations so that we can hook them up to a selftest.
> 
> Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/757
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
>   .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 204 ++++++++++-------
>   .../i915/gem/selftests/i915_gem_execbuffer.c  | 206 ++++++++++++++++++
>   .../drm/i915/selftests/i915_live_selftests.h  |   1 +
>   3 files changed, 337 insertions(+), 74 deletions(-)
>   create mode 100644 drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> index ab0d4df13c0b..44d7da0e200e 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
> @@ -955,7 +955,7 @@ static void reloc_cache_init(struct reloc_cache *cache,
>   	cache->needs_unfenced = INTEL_INFO(i915)->unfenced_needs_alignment;
>   	cache->node.flags = 0;
>   	cache->rq = NULL;
> -	cache->rq_size = 0;
> +	cache->target = NULL;
>   }
>   
>   static inline void *unmask_page(unsigned long p)
> @@ -1325,7 +1325,7 @@ static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
>   
>   		ce = intel_context_create(engine);
>   		if (IS_ERR(ce)) {
> -			err = PTR_ERR(rq);

Ouch, sloppy reviewer. :)

> +			err = PTR_ERR(ce);
>   			goto err_unpin;
>   		}
>   
> @@ -1376,6 +1376,11 @@ static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
>   	return err;
>   }
>   
> +static bool can_store_dword(const struct intel_engine_cs *engine)
> +{
> +	return engine->class != VIDEO_DECODE_CLASS || !IS_GEN(engine->i915, 6);
> +}

A bit confusing to a future reader how it differs from 
intel_engine_can_store_dword, but apart from adding a comment I don't 
have any better ideas.
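
Perhaps something along these lines, wording is just my sketch:

    /*
     * Unlike intel_engine_can_store_dword(), we only need to reject
     * the gen6 bsd ring here, which cannot emit MI_STORE_DATA_IMM at
     * all; the other historical offenders are handled by falling back
     * to a physical address in __reloc_entry_gpu() instead.
     */
    static bool can_store_dword(const struct intel_engine_cs *engine)
    {
    	return engine->class != VIDEO_DECODE_CLASS ||
    	       !IS_GEN(engine->i915, 6);
    }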

> +
>   static u32 *reloc_gpu(struct i915_execbuffer *eb,
>   		      struct i915_vma *vma,
>   		      unsigned int len)
> @@ -1387,9 +1392,9 @@ static u32 *reloc_gpu(struct i915_execbuffer *eb,
>   	if (unlikely(!cache->rq)) {
>   		struct intel_engine_cs *engine = eb->engine;
>   
> -		if (!intel_engine_can_store_dword(engine)) {
> +		if (!can_store_dword(engine)) {
>   			engine = engine->gt->engine_class[COPY_ENGINE_CLASS][0];
> -			if (!engine || !intel_engine_can_store_dword(engine))
> +			if (!engine)
>   				return ERR_PTR(-ENODEV);
>   		}
>   
> @@ -1435,91 +1440,138 @@ static inline bool use_reloc_gpu(struct i915_vma *vma)
>   	return !dma_resv_test_signaled_rcu(vma->resv, true);
>   }
>   
> -static u64
> -relocate_entry(struct i915_vma *vma,
> -	       const struct drm_i915_gem_relocation_entry *reloc,
> -	       struct i915_execbuffer *eb,
> -	       const struct i915_vma *target)
> +static unsigned long vma_phys_addr(struct i915_vma *vma, u32 offset)
>   {
> -	u64 offset = reloc->offset;
> -	u64 target_offset = relocation_target(reloc, target);
> -	bool wide = eb->reloc_cache.use_64bit_reloc;
> -	void *vaddr;
> +	struct page *page;
> +	unsigned long addr;
>   
> -	if (!eb->reloc_cache.vaddr && use_reloc_gpu(vma)) {
> -		const unsigned int gen = eb->reloc_cache.gen;
> -		unsigned int len;
> -		u32 *batch;
> -		u64 addr;
> +	GEM_BUG_ON(vma->pages != vma->obj->mm.pages);
>   
> -		if (wide)
> -			len = offset & 7 ? 8 : 5;
> -		else if (gen >= 4)
> -			len = 4;
> -		else
> -			len = 3;
> +	page = i915_gem_object_get_page(vma->obj, offset >> PAGE_SHIFT);
> +	addr = PFN_PHYS(page_to_pfn(page));
> +	GEM_BUG_ON(overflows_type(addr, u32)); /* expected dma32 */
>   
> -		batch = reloc_gpu(eb, vma, len);
> -		if (IS_ERR(batch))
> -			goto repeat;
> +	return addr + offset_in_page(offset);
> +}
> +
> +static bool __reloc_entry_gpu(struct i915_vma *vma,
> +			      struct i915_execbuffer *eb,
> +			      u64 offset,
> +			      u64 target_addr)
> +{
> +	const unsigned int gen = eb->reloc_cache.gen;
> +	unsigned int len;
> +	u32 *batch;
> +	u64 addr;
> +
> +	if (gen >= 8)
> +		len = offset & 7 ? 8 : 5;

This used to be driven off eb->reloc_cache.use_64bit_reloc, any 
practical effect from the change? Doesn't seem so. Should you still 
use it for consistency though?
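
I.e. keep something like, untested and purely for consistency:

    bool wide = eb->reloc_cache.use_64bit_reloc;

    if (wide)
    	len = offset & 7 ? 8 : 5;
    else if (gen >= 4)
    	len = 4;
    else
    	len = 3;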

> +	else if (gen >= 4)
> +		len = 4;
> +	else
> +		len = 3;
> +
> +	batch = reloc_gpu(eb, vma, len);
> +	if (IS_ERR(batch))
> +		return false;
> +
> +	addr = gen8_canonical_addr(vma->node.start + offset);
> +	if (gen >= 8) {
> +		if (offset & 7) {
> +			*batch++ = MI_STORE_DWORD_IMM_GEN4;
> +			*batch++ = lower_32_bits(addr);
> +			*batch++ = upper_32_bits(addr);
> +			*batch++ = lower_32_bits(target_addr);
> +
> +			addr = gen8_canonical_addr(addr + 4);
>   
> -		addr = gen8_canonical_addr(vma->node.start + offset);
> -		if (wide) {
> -			if (offset & 7) {
> -				*batch++ = MI_STORE_DWORD_IMM_GEN4;
> -				*batch++ = lower_32_bits(addr);
> -				*batch++ = upper_32_bits(addr);
> -				*batch++ = lower_32_bits(target_offset);
> -
> -				addr = gen8_canonical_addr(addr + 4);
> -
> -				*batch++ = MI_STORE_DWORD_IMM_GEN4;
> -				*batch++ = lower_32_bits(addr);
> -				*batch++ = upper_32_bits(addr);
> -				*batch++ = upper_32_bits(target_offset);
> -			} else {
> -				*batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1;
> -				*batch++ = lower_32_bits(addr);
> -				*batch++ = upper_32_bits(addr);
> -				*batch++ = lower_32_bits(target_offset);
> -				*batch++ = upper_32_bits(target_offset);
> -			}
> -		} else if (gen >= 6) {
>   			*batch++ = MI_STORE_DWORD_IMM_GEN4;
> -			*batch++ = 0;
> -			*batch++ = addr;
> -			*batch++ = target_offset;
> -		} else if (gen >= 4) {
> -			*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
> -			*batch++ = 0;
> -			*batch++ = addr;
> -			*batch++ = target_offset;
> +			*batch++ = lower_32_bits(addr);
> +			*batch++ = upper_32_bits(addr);
> +			*batch++ = upper_32_bits(target_addr);
>   		} else {
> -			*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
> -			*batch++ = addr;
> -			*batch++ = target_offset;
> +			*batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1;
> +			*batch++ = lower_32_bits(addr);
> +			*batch++ = upper_32_bits(addr);
> +			*batch++ = lower_32_bits(target_addr);
> +			*batch++ = upper_32_bits(target_addr);
>   		}
> -
> -		goto out;
> +	} else if (gen >= 6) {
> +		*batch++ = MI_STORE_DWORD_IMM_GEN4;
> +		*batch++ = 0;
> +		*batch++ = addr;
> +		*batch++ = target_addr;
> +	} else if (IS_I965G(eb->i915)) {
> +		*batch++ = MI_STORE_DWORD_IMM_GEN4;
> +		*batch++ = 0;
> +		*batch++ = vma_phys_addr(vma, offset);
> +		*batch++ = target_addr;
> +	} else if (gen >= 4) {
> +		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
> +		*batch++ = 0;
> +		*batch++ = addr;
> +		*batch++ = target_addr;
> +	} else if (gen >= 3 &&
> +		   !(IS_I915G(eb->i915) || IS_I915GM(eb->i915))) {
> +		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
> +		*batch++ = addr;
> +		*batch++ = target_addr;
> +	} else {
> +		*batch++ = MI_STORE_DWORD_IMM;
> +		*batch++ = vma_phys_addr(vma, offset);
> +		*batch++ = target_addr;
>   	}
>   
> +	return true;
> +}
> +
> +static bool reloc_entry_gpu(struct i915_vma *vma,
> +			    struct i915_execbuffer *eb,
> +			    u64 offset,
> +			    u64 target_addr)
> +{
> +	if (eb->reloc_cache.vaddr)
> +		return false;
> +
> +	if (!use_reloc_gpu(vma))
> +		return false;
> +
> +	return __reloc_entry_gpu(vma, eb, offset, target_addr);
> +}
> +
> +static u64
> +relocate_entry(struct i915_vma *vma,
> +	       const struct drm_i915_gem_relocation_entry *reloc,
> +	       struct i915_execbuffer *eb,
> +	       const struct i915_vma *target)
> +{
> +	u64 target_addr = relocation_target(reloc, target);
> +	u64 offset = reloc->offset;
> +
> +	if (!reloc_entry_gpu(vma, eb, offset, target_addr)) {
> +		bool wide = eb->reloc_cache.use_64bit_reloc;
> +		void *vaddr;
> +
>   repeat:
> -	vaddr = reloc_vaddr(vma->obj, &eb->reloc_cache, offset >> PAGE_SHIFT);
> -	if (IS_ERR(vaddr))
> -		return PTR_ERR(vaddr);
> +		vaddr = reloc_vaddr(vma->obj,
> +				    &eb->reloc_cache,
> +				    offset >> PAGE_SHIFT);
> +		if (IS_ERR(vaddr))
> +			return PTR_ERR(vaddr);
>   
> -	clflush_write32(vaddr + offset_in_page(offset),
> -			lower_32_bits(target_offset),
> -			eb->reloc_cache.vaddr);
> +		GEM_BUG_ON(!IS_ALIGNED(offset, sizeof(u32)));
> +		clflush_write32(vaddr + offset_in_page(offset),
> +				lower_32_bits(target_addr),
> +				eb->reloc_cache.vaddr);
>   
> -	if (wide) {
> -		offset += sizeof(u32);
> -		target_offset >>= 32;
> -		wide = false;
> -		goto repeat;
> +		if (wide) {
> +			offset += sizeof(u32);
> +			target_addr >>= 32;
> +			wide = false;
> +			goto repeat;
> +		}
>   	}
>   
> -out:
>   	return target->node.start | UPDATE;
>   }
>   
> @@ -3022,3 +3074,7 @@ end:;
>   	kvfree(exec2_list);
>   	return err;
>   }

The diff is a bit messy but looks okay and in CI we trust.

> +
> +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> +#include "selftests/i915_gem_execbuffer.c"
> +#endif
> diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
> new file mode 100644
> index 000000000000..985f9fbd0ba0
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
> @@ -0,0 +1,206 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2020 Intel Corporation
> + */
> +
> +#include "i915_selftest.h"
> +
> +#include "gt/intel_engine_pm.h"
> +#include "selftests/igt_flush_test.h"
> +
> +static void hexdump(const void *buf, size_t len)

Uh-oh seems to be the third copy! ;)
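
Maybe worth extracting a shared selftest helper while at it, before a 
fourth copy appears, e.g. (name made up):

    void igt_hexdump(const void *buf, size_t len);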

> +{
> +	const size_t rowsize = 8 * sizeof(u32);
> +	const void *prev = NULL;
> +	bool skip = false;
> +	size_t pos;
> +
> +	for (pos = 0; pos < len; pos += rowsize) {
> +		char line[128];
> +
> +		if (prev && !memcmp(prev, buf + pos, rowsize)) {
> +			if (!skip) {
> +				pr_info("*\n");
> +				skip = true;
> +			}
> +			continue;
> +		}
> +
> +		WARN_ON_ONCE(hex_dump_to_buffer(buf + pos, len - pos,
> +						rowsize, sizeof(u32),
> +						line, sizeof(line),
> +						false) >= sizeof(line));
> +		pr_info("[%04zx] %s\n", pos, line);
> +
> +		prev = buf + pos;
> +		skip = false;
> +	}
> +}
> +
> +static u64 read_reloc(const u32 *map, int x, const u64 mask)
> +{
> +	u64 reloc;
> +
> +	memcpy(&reloc, &map[x], sizeof(reloc));
> +	return reloc & mask;
> +}
> +
> +static int __igt_gpu_reloc(struct i915_execbuffer *eb,
> +			   struct drm_i915_gem_object *obj)
> +{
> +	enum {
> +		X = 0,
> +		Y = 3,
> +		Z = 8
> +	};
> +	const u64 mask = GENMASK_ULL(eb->reloc_cache.gen >= 8 ? 63 : 31, 0);

Is the mask there to remove the poison? And should this use 
use_64bit_reloc instead of gen, or is checking gen more correct here?
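
If use_64bit_reloc is the right predicate, untested:

    const u64 mask =
    	GENMASK_ULL(eb->reloc_cache.use_64bit_reloc ? 63 : 31, 0);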

> +	const u32 *map = page_mask_bits(obj->mm.mapping);
> +	struct i915_request *rq;
> +	struct i915_vma *vma;
> +	int inc;
> +	int err;
> +
> +	vma = i915_vma_instance(obj, eb->context->vm, NULL);
> +	if (IS_ERR(vma))
> +		return PTR_ERR(vma);
> +
> +	err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_HIGH);
> +	if (err)
> +		return err;
> +
> +	/* 8-Byte aligned */
> +	if (!__reloc_entry_gpu(vma, eb, X * sizeof(u32), X)) {
> +		err = -EIO;
> +		goto unpin_vma;
> +	}
> +
> +	/* !8-Byte aligned */

What is the significance of the non-8-byte-aligned case? To exercise 
the split write path (the offset & 7 branch) on gen8+, I assume?

> +	if (!__reloc_entry_gpu(vma, eb, Y * sizeof(u32), Y)) {
> +		err = -EIO;
> +		goto unpin_vma;
> +	}
> +
> +	/* Skip to the end of the cmd page */
> +	inc = PAGE_SIZE / sizeof(u32) - RELOC_TAIL - 1;
> +	inc -= eb->reloc_cache.rq_size;
> +	memset(eb->reloc_cache.rq_cmd + eb->reloc_cache.rq_size,
> +	       0, inc * sizeof(u32));
> +	eb->reloc_cache.rq_size += inc;
> +
> +	/* Force batch chaining */
> +	if (!__reloc_entry_gpu(vma, eb, Z * sizeof(u32), Z)) {
> +		err = -EIO;
> +		goto unpin_vma;
> +	}
> +
> +	GEM_BUG_ON(!eb->reloc_cache.rq);
> +	rq = i915_request_get(eb->reloc_cache.rq);
> +	err = reloc_gpu_flush(&eb->reloc_cache);
> +	if (err)
> +		goto put_rq;
> +	GEM_BUG_ON(eb->reloc_cache.rq);
> +
> +	err = i915_gem_object_wait(obj, I915_WAIT_INTERRUPTIBLE, HZ / 2);
> +	if (err) {
> +		intel_gt_set_wedged(eb->engine->gt);
> +		goto put_rq;
> +	}
> +
> +	if (!i915_request_completed(rq)) {
> +		pr_err("%s: did not wait for relocations!\n", eb->engine->name);
> +		err = -EINVAL;
> +		goto put_rq;
> +	}
> +
> +	if (read_reloc(map, X, mask) != X) {
> +		pr_err("%s[X]: map[%d] %llx != %x\n",
> +		       eb->engine->name, X, read_reloc(map, X, mask), X);
> +		err = -EINVAL;
> +	}
> +
> +	if (read_reloc(map, Y, mask) != Y) {
> +		pr_err("%s[Y]: map[%d] %llx != %x\n",
> +		       eb->engine->name, Y, read_reloc(map, Y, mask), Y);
> +		err = -EINVAL;
> +	}
> +
> +	if (read_reloc(map, Z, mask) != Z) {
> +		pr_err("%s[Z]: map[%d] %llx != %x\n",
> +		       eb->engine->name, Z, read_reloc(map, Z, mask), Z);
> +		err = -EINVAL;
> +	}

Why couldn't this just be an array of [0, 3, 8] instead of enums, so 
that all these checks could be a single loop? In other words, I can't 
figure out the benefit of the enum. Okay, in the test phase above it 
can't be a simple loop since it needs the special case for the last 
iteration, but here it could be.
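
Untested sketch of what I have in mind:

    static const int offsets[] = { X, Y, Z }; /* or just { 0, 3, 8 } */
    unsigned int i;

    for (i = 0; i < ARRAY_SIZE(offsets); i++) {
    	u64 reloc = read_reloc(map, offsets[i], mask);

    	if (reloc != offsets[i]) {
    		pr_err("%s: map[%d] %llx != %x\n",
    		       eb->engine->name, offsets[i], reloc, offsets[i]);
    		err = -EINVAL;
    	}
    }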

> +
> +	if (err)
> +		hexdump(map, 4096);
> +
> +put_rq:
> +	i915_request_put(rq);
> +unpin_vma:
> +	i915_vma_unpin(vma);
> +	return err;
> +}
> +
> +static int igt_gpu_reloc(void *arg)
> +{
> +	struct i915_execbuffer eb;
> +	struct drm_i915_gem_object *scratch;
> +	int err = 0;
> +	u32 *map;
> +
> +	eb.i915 = arg;
> +
> +	scratch = i915_gem_object_create_internal(eb.i915, 4096);
> +	if (IS_ERR(scratch))
> +		return PTR_ERR(scratch);
> +
> +	map = i915_gem_object_pin_map(scratch, I915_MAP_WC);
> +	if (IS_ERR(map)) {
> +		err = PTR_ERR(map);
> +		goto err_scratch;
> +	}
> +
> +	for_each_uabi_engine(eb.engine, eb.i915) {
> +		reloc_cache_init(&eb.reloc_cache, eb.i915);
> +		memset(map, POISON_INUSE, 4096);
> +
> +		intel_engine_pm_get(eb.engine);
> +		eb.context = intel_context_create(eb.engine);
> +		if (IS_ERR(eb.context)) {
> +			err = PTR_ERR(eb.context);
> +			goto err_pm;
> +		}
> +
> +		err = intel_context_pin(eb.context);
> +		if (err)
> +			goto err_put;
> +
> +		err = __igt_gpu_reloc(&eb, scratch);
> +
> +		intel_context_unpin(eb.context);
> +err_put:
> +		intel_context_put(eb.context);
> +err_pm:
> +		intel_engine_pm_put(eb.engine);
> +		if (err)
> +			break;
> +	}
> +
> +	if (igt_flush_test(eb.i915))
> +		err = -EIO;
> +
> +err_scratch:
> +	i915_gem_object_put(scratch);
> +	return err;
> +}
> +
> +int i915_gem_execbuffer_live_selftests(struct drm_i915_private *i915)
> +{
> +	static const struct i915_subtest tests[] = {
> +		SUBTEST(igt_gpu_reloc),
> +	};
> +
> +	if (intel_gt_is_wedged(&i915->gt))
> +		return 0;
> +
> +	return i915_live_subtests(tests, i915);
> +}
> diff --git a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
> index 0a953bfc0585..5dd5d81646c4 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
> +++ b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
> @@ -37,6 +37,7 @@ selftest(gem, i915_gem_live_selftests)
>   selftest(evict, i915_gem_evict_live_selftests)
>   selftest(hugepages, i915_gem_huge_page_live_selftests)
>   selftest(gem_contexts, i915_gem_context_live_selftests)
> +selftest(gem_execbuf, i915_gem_execbuffer_live_selftests)
>   selftest(blt, i915_gem_object_blt_live_selftests)
>   selftest(client, i915_gem_client_blt_live_selftests)
>   selftest(reset, intel_reset_live_selftests)
> 

Regards,

Tvrtko

