[igt-dev] [PATCH i-g-t 1/2] tests/i915/gem_exec_gttfill: Support gens without relocations
Zbigniew Kempczyński
zbigniew.kempczynski at intel.com
Wed May 12 08:14:18 UTC 2021
On Wed, May 12, 2021 at 07:40:09AM +0200, Andrzej Turko wrote:
> With relocations disabled for newer generations
> addresses of objects need to be assigned by the test.
> As all the objects won't fit in the gtt, using the allocator
> does not guarantee that submitted batches won't overlap.
> It only reduces the number of overlapping objects while ensuring
> that evictions happen at different offsets.
>
> Signed-off-by: Andrzej Turko <andrzej.turko at linux.intel.com>
> Cc: Zbigniew Kempczyński <zbigniew.kempczynski at intel.com>
> ---
> tests/i915/gem_exec_gttfill.c | 75 ++++++++++++++++++++++++++++++-----
> 1 file changed, 64 insertions(+), 11 deletions(-)
>
> diff --git a/tests/i915/gem_exec_gttfill.c b/tests/i915/gem_exec_gttfill.c
> index c0e27c9bb..091c74ebb 100644
> --- a/tests/i915/gem_exec_gttfill.c
> +++ b/tests/i915/gem_exec_gttfill.c
> @@ -28,6 +28,8 @@
> IGT_TEST_DESCRIPTION("Fill the GTT with batches.");
>
> #define BATCH_SIZE (4096<<10)
> +/* We don't have alignment detection yet, so assume the worst-case scenario. */
> +#define BATCH_ALIGNMENT (1 << 21)
>
> struct batch {
> uint32_t handle;
> @@ -47,15 +49,21 @@ static void xchg_batch(void *array, unsigned int i, unsigned int j)
> static void submit(int fd, int gen,
> struct drm_i915_gem_execbuffer2 *eb,
> struct drm_i915_gem_relocation_entry *reloc,
> - struct batch *batches, unsigned int count)
> + struct batch *batches, unsigned int count,
> + uint64_t ahnd, bool do_relocs)
Since ahnd == 0 is an invalid allocator handle, you can use that
instead of the additional do_relocs variable.
> {
> struct drm_i915_gem_exec_object2 obj;
> uint32_t batch[16];
> - unsigned n;
> + uint64_t address, value;
> + unsigned n, j;
>
> memset(&obj, 0, sizeof(obj));
> - obj.relocs_ptr = to_user_pointer(reloc);
> - obj.relocation_count = 2;
> + if (do_relocs) {
> + obj.relocs_ptr = to_user_pointer(reloc);
> + obj.relocation_count = 2;
> + } else {
> + obj.flags |= EXEC_OBJECT_PINNED;
> + }
>
> memset(reloc, 0, 2*sizeof(*reloc));
> reloc[0].offset = eb->batch_start_offset;
> @@ -85,16 +93,40 @@ static void submit(int fd, int gen,
> batch[++n] = 0; /* lower_32_bits(value) */
> batch[++n] = 0; /* upper_32_bits(value) / nop */
> batch[++n] = MI_BATCH_BUFFER_END;
> -
> eb->buffers_ptr = to_user_pointer(&obj);
> + j = 0;
> for (unsigned i = 0; i < count; i++) {
> obj.handle = batches[i].handle;
> reloc[0].target_handle = obj.handle;
> reloc[1].target_handle = obj.handle;
>
> - obj.offset = 0;
> - reloc[0].presumed_offset = obj.offset;
> - reloc[1].presumed_offset = obj.offset;
> + if (do_relocs) {
> + obj.offset = 0;
> + } else {
> + obj.offset = __intel_allocator_alloc(ahnd, obj.handle,
> + BATCH_SIZE,
> + BATCH_ALIGNMENT,
> + ALLOC_STRATEGY_HIGH_TO_LOW);
> + for (; obj.offset == -1; j = ((++j) == count ? 0 : j)) {
> + if (i != j)
> + intel_allocator_free(ahnd, batches[j].handle);
> + obj.offset = __intel_allocator_alloc(ahnd, obj.handle,
> + BATCH_SIZE,
> + BATCH_ALIGNMENT,
> + ALLOC_STRATEGY_HIGH_TO_LOW);
> + }
Ha, we're in userspace competing over a single set of offsets.
Why don't you just use:
j = (j + 1) % count;
It is more readable, and it avoids the unsequenced modification in
j = ((++j) == count ? 0 : j), which is undefined behavior (the
compiler would likely warn about it).
> +
> + /* If there is no relocation support, we assume gen >= 8. */
> + reloc[0].presumed_offset = obj.offset;
> + address = obj.offset + reloc[0].delta;
> + batch[1] = address;
> + batch[2] = address >> 32;
> +
> + reloc[1].presumed_offset = obj.offset;
> + value = obj.offset + reloc[1].delta;
> + batch[3] = value;
> + batch[4] = value >> 32;
> + }
>
> memcpy(batches[i].ptr + eb->batch_start_offset,
> batch, sizeof(batch));
> @@ -116,7 +148,8 @@ static void fillgtt(int fd, unsigned ring, int timeout)
> struct batch *batches;
> unsigned nengine;
> unsigned count;
> - uint64_t size;
> + uint64_t size, ahnd;
> + bool do_relocs = gem_has_relocations(fd);
>
> shared = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
> igt_assert(shared != MAP_FAILED);
> @@ -138,6 +171,8 @@ static void fillgtt(int fd, unsigned ring, int timeout)
> igt_assert(nengine * 64 <= BATCH_SIZE);
>
> size = gem_aperture_size(fd);
> + if (!gem_uses_full_ppgtt(fd))
> + size /= 2;
> if (size > 1ull<<32) /* Limit to 4GiB as we do not use allow-48b */
> size = 1ull << 32;
> igt_require(size < (1ull<<32) * BATCH_SIZE);
> @@ -145,6 +180,12 @@ static void fillgtt(int fd, unsigned ring, int timeout)
> count = size / BATCH_SIZE + 1;
> igt_debug("Using %'d batches to fill %'llu aperture on %d engines\n",
> count, (long long)size, nengine);
> +
> + intel_allocator_multiprocess_start();
intel_allocator_multiprocess_start()/stop() should be called inside an
igt_fixture. Otherwise, if the test fails, we are left with a hanging
allocator thread. While this is likely not a problem for CI (igt_runner
invokes each test individually), it can cause unpredictable effects
when tests are run sequentially.
--
Zbigniew
> + /* Avoid allocating on the last page */
> + ahnd = intel_allocator_open_full(fd, 0, 0, size - 4096,
> + INTEL_ALLOCATOR_SIMPLE,
> + ALLOC_STRATEGY_HIGH_TO_LOW);
> intel_require_memory(count, BATCH_SIZE, CHECK_RAM);
> intel_detect_and_clear_missed_interrupts(fd);
>
> @@ -165,7 +206,7 @@ static void fillgtt(int fd, unsigned ring, int timeout)
> }
>
> /* Flush all memory before we start the timer */
> - submit(fd, gen, &execbuf, reloc, batches, count);
> + submit(fd, gen, &execbuf, reloc, batches, count, ahnd, do_relocs);
>
> igt_info("Setup %u batches in %.2fms\n",
> count, 1e-6 * igt_nsec_elapsed(&tv));
> @@ -176,8 +217,14 @@ static void fillgtt(int fd, unsigned ring, int timeout)
> igt_permute_array(batches, count, xchg_batch);
> execbuf.batch_start_offset = child*64;
> execbuf.flags |= engines[child];
> +
> + /* We need to open the allocator again in the new process */
> + ahnd = intel_allocator_open_full(fd, 0, 0, size - 4096,
> + INTEL_ALLOCATOR_SIMPLE,
> + ALLOC_STRATEGY_HIGH_TO_LOW);
> +
> igt_until_timeout(timeout) {
> - submit(fd, gen, &execbuf, reloc, batches, count);
> + submit(fd, gen, &execbuf, reloc, batches, count, ahnd, do_relocs);
> for (unsigned i = 0; i < count; i++) {
> uint64_t offset, delta;
>
> @@ -189,13 +236,18 @@ static void fillgtt(int fd, unsigned ring, int timeout)
> }
> shared[child] = cycles;
> igt_info("engine[%d]: %llu cycles\n", child, (long long)cycles);
> + intel_allocator_close(ahnd);
> }
> igt_waitchildren();
>
> + intel_allocator_close(ahnd);
> + intel_allocator_multiprocess_stop();
> +
> for (unsigned i = 0; i < count; i++) {
> munmap(batches[i].ptr, BATCH_SIZE);
> gem_close(fd, batches[i].handle);
> }
> + free(batches);
>
> shared[nengine] = 0;
> for (unsigned i = 0; i < nengine; i++)
> @@ -216,6 +268,7 @@ igt_main
> igt_fork_hang_detector(i915);
> }
>
> +
> igt_subtest("basic") /* just enough to run a single pass */
> fillgtt(i915, ALL_ENGINES, 1);
>
> --
> 2.25.1
>
More information about the igt-dev
mailing list