[Intel-gfx] [PATCH i-g-t 2/3] i915/gem_cpu_reloc: Use a self-modifying chained batch

Wed Jan 16 14:22:59 UTC 2019

Chris Wilson <chris at chris-wilson.co.uk> writes:

> Use another sensitive CPU reloc to emit a chained batch from inside the
> updated buffer to reduce the workload on slow machines to fit within the
> CI timeout.
>
> References: https://bugs.freedesktop.org/show_bug.cgi?id=108248
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
>  tests/i915/gem_cpu_reloc.c | 347 ++++++++++++++++++++-----------------
>  1 file changed, 189 insertions(+), 158 deletions(-)
>
> diff --git a/tests/i915/gem_cpu_reloc.c b/tests/i915/gem_cpu_reloc.c
> index 882c312d4..33c4e4e0f 100644
> --- a/tests/i915/gem_cpu_reloc.c
> +++ b/tests/i915/gem_cpu_reloc.c
> @@ -59,214 +59,245 @@
>  
>  #include "intel_bufmgr.h"
>  
> -IGT_TEST_DESCRIPTION("Test the relocations through the CPU domain.");
> +#define MI_INSTR(opcode, flags) ((opcode) << 23 | (flags))
>  
> -static uint32_t use_blt;
> +IGT_TEST_DESCRIPTION("Test the relocations through the CPU domain.");
>  
> -static void copy(int fd, uint32_t batch, uint32_t src, uint32_t dst)
> +static uint32_t *
> +gen2_emit_store_addr(uint32_t *cs, struct drm_i915_gem_relocation_entry *addr)
>  {
> -	struct drm_i915_gem_execbuffer2 execbuf;
> -	struct drm_i915_gem_relocation_entry gem_reloc[2];
> -	struct drm_i915_gem_exec_object2 gem_exec[3];
> -
> -	gem_reloc[0].offset = 4 * sizeof(uint32_t);
> -	gem_reloc[0].delta = 0;
> -	gem_reloc[0].target_handle = dst;
> -	gem_reloc[0].read_domains = I915_GEM_DOMAIN_RENDER;
> -	gem_reloc[0].write_domain = I915_GEM_DOMAIN_RENDER;
> -	gem_reloc[0].presumed_offset = -1;
> -
> -	gem_reloc[1].offset = 7 * sizeof(uint32_t);
> -	if (intel_gen(intel_get_drm_devid(fd)) >= 8)
> -		gem_reloc[1].offset += sizeof(uint32_t);
> -	gem_reloc[1].delta = 0;
> -	gem_reloc[1].target_handle = src;
> -	gem_reloc[1].read_domains = I915_GEM_DOMAIN_RENDER;
> -	gem_reloc[1].write_domain = 0;
> -	gem_reloc[1].presumed_offset = -1;
> -
> -	memset(gem_exec, 0, sizeof(gem_exec));
> -	gem_exec[0].handle = src;
> -	gem_exec[1].handle = dst;
> -	gem_exec[2].handle = batch;
> -	gem_exec[2].relocation_count = 2;
> -	gem_exec[2].relocs_ptr = to_user_pointer(gem_reloc);
> -
> -	memset(&execbuf, 0, sizeof(execbuf));
> -	execbuf.buffers_ptr = to_user_pointer(gem_exec);
> -	execbuf.buffer_count = 3;
> -	execbuf.batch_len = 4096;
> -	execbuf.flags = use_blt;
> -
> -	gem_execbuf(fd, &execbuf);
> +	*cs++ = MI_STORE_DWORD_IMM - 1;
> +	addr->offset += sizeof(*cs);
> +	cs += 1; /* addr */
> +	cs += 1; /* value: implicit 0xffffffff */
> +	return cs;
> +}
> +static uint32_t *
> +gen4_emit_store_addr(uint32_t *cs, struct drm_i915_gem_relocation_entry *addr)
> +{
> +	*cs++ = MI_STORE_DWORD_IMM;
> +	*cs++ = 0;
> +	addr->offset += 2 * sizeof(*cs);
> +	cs += 1; /* addr */
> +	cs += 1; /* value: implicit 0xffffffff */
> +	return cs;
> +}
> +static uint32_t *
> +gen8_emit_store_addr(uint32_t *cs, struct drm_i915_gem_relocation_entry *addr)
> +{
> +	*cs++ = (MI_STORE_DWORD_IMM | 1 << 21) + 1;
> +	addr->offset += sizeof(*cs);
> +	igt_assert((addr->delta & 7) == 0);
> +	cs += 2; /* addr */
> +	cs += 2; /* value: implicit 0xffffffffffffffff */
> +	return cs;
>  }
>  
> -static void exec(int fd, uint32_t handle)
> +static uint32_t *
> +gen2_emit_bb_start(uint32_t *cs, struct drm_i915_gem_relocation_entry *reloc)
>  {
> -	struct drm_i915_gem_execbuffer2 execbuf;
> -	struct drm_i915_gem_exec_object2 gem_exec;
> +	*cs++ = MI_BATCH_BUFFER_START | 2 << 6;
> +	reloc->offset += sizeof(*cs);
> +	reloc->delta += 1;
> +	cs += 1; /* addr */
> +	return cs;
> +}
> +static uint32_t *
> +gen4_emit_bb_start(uint32_t *cs, struct drm_i915_gem_relocation_entry *reloc)
> +{
> +	*cs++ = MI_BATCH_BUFFER_START | 2 << 6 | 1 << 8;
> +	reloc->offset += sizeof(*cs);
> +	cs += 1; /* addr */
> +	return cs;
> +}
> +static uint32_t *
> +gen6_emit_bb_start(uint32_t *cs, struct drm_i915_gem_relocation_entry *reloc)
> +{
> +	*cs++ = MI_BATCH_BUFFER_START | 1 << 8;
> +	reloc->offset += sizeof(*cs);
> +	cs += 1; /* addr */
> +	return cs;
> +}
> +static uint32_t *
> +hsw_emit_bb_start(uint32_t *cs, struct drm_i915_gem_relocation_entry *reloc)
> +{
> +	*cs++ = MI_BATCH_BUFFER_START | 2 << 6 | 1 << 8 | 1 << 13;
> +	reloc->offset += sizeof(*cs);
> +	cs += 1; /* addr */
> +	return cs;
> +}
> +static uint32_t *
> +gen8_emit_bb_start(uint32_t *cs, struct drm_i915_gem_relocation_entry *reloc)
> +{
> +	if (((uintptr_t)cs & 7) == 0) {
> +		*cs++ = MI_NOOP; /* align addr for MI_STORE_DWORD_IMM */
> +		reloc->offset += sizeof(*cs);
> +	}

Is the intent to align it so that, after the bb start is emitted, we end up
at the right alignment? Otherwise it looks like the condition should have a
'!' in it.
-Mika

>  
> -	memset(&gem_exec, 0, sizeof(gem_exec));
> -	gem_exec.handle = handle;
> +	*cs++ = MI_BATCH_BUFFER_START + 1;
> +	reloc->offset += sizeof(*cs);
> +	cs += 2; /* addr */
>  
> -	memset(&execbuf, 0, sizeof(execbuf));
> -	execbuf.buffers_ptr = to_user_pointer(&gem_exec);
> -	execbuf.buffer_count = 1;
> -	execbuf.batch_len = 4096;
> +	return cs;
> +}
>  
> -	gem_execbuf(fd, &execbuf);
> +static void *
> +create_tmpl(int i915, struct drm_i915_gem_relocation_entry *reloc)
> +{
> +	const uint32_t devid = intel_get_drm_devid(i915);
> +	const int gen = intel_gen(devid);
> +	uint32_t *(*emit_store_addr)(uint32_t *cs,
> +				   struct drm_i915_gem_relocation_entry *addr);
> +	uint32_t *(*emit_bb_start)(uint32_t *cs,
> +				   struct drm_i915_gem_relocation_entry *reloc);
> +	void *tmpl;
> +
> +	/* could use BLT_FILL instead for gen2 */
> +	igt_require(gem_can_store_dword(i915, 0));
> +
> +	if (gen >= 8)
> +		emit_store_addr = gen8_emit_store_addr;
> +	else if (gen >= 4)
> +		emit_store_addr = gen4_emit_store_addr;
> +	else
> +		emit_store_addr = gen2_emit_store_addr;
> +
> +	if (gen >= 8)
> +		emit_bb_start = gen8_emit_bb_start;
> +	else if (IS_HASWELL(devid))
> +		emit_bb_start = hsw_emit_bb_start;
> +	else if (gen >= 6)
> +		emit_bb_start = gen6_emit_bb_start;
> +	else if (gen >= 4)
> +		emit_bb_start = gen4_emit_bb_start;
> +	else
> +		emit_bb_start = gen2_emit_bb_start;
> +
> +	tmpl = malloc(4096);
> +	igt_assert(tmpl);
> +	memset(tmpl, 0xff, 4096);
> +
> +	/* Jump over the booby traps to the end */
> +	reloc[0].delta = 64;
> +	emit_bb_start(tmpl, &reloc[0]);
> +
> +	/* Restore the bad address to catch missing relocs */
> +	reloc[1].offset = 64;
> +	reloc[1].delta = reloc[0].offset;
> +	*emit_store_addr(tmpl + 64, &reloc[1]) = MI_BATCH_BUFFER_END;
> +
> +	return tmpl;
>  }
>  
> -uint32_t gen6_batch[] = {
> -	(XY_SRC_COPY_BLT_CMD | 6 |
> -	 XY_SRC_COPY_BLT_WRITE_ALPHA |
> -	 XY_SRC_COPY_BLT_WRITE_RGB),
> -	(3 << 24 | /* 32 bits */
> -	 0xcc << 16 | /* copy ROP */
> -	 4096),
> -	0 << 16 | 0, /* dst x1, y1 */
> -	1 << 16 | 2,
> -	0, /* dst relocation */
> -	0 << 16 | 0, /* src x1, y1 */
> -	4096,
> -	0, /* src relocation */
> -	MI_BATCH_BUFFER_END,
> -};
> -
> -uint32_t gen8_batch[] = {
> -	(XY_SRC_COPY_BLT_CMD | 8 |
> -	 XY_SRC_COPY_BLT_WRITE_ALPHA |
> -	 XY_SRC_COPY_BLT_WRITE_RGB),
> -	(3 << 24 | /* 32 bits */
> -	 0xcc << 16 | /* copy ROP */
> -	 4096),
> -	0 << 16 | 0, /* dst x1, y1 */
> -	1 << 16 | 2,
> -	0, /* dst relocation */
> -	0, /* FIXME */
> -	0 << 16 | 0, /* src x1, y1 */
> -	4096,
> -	0, /* src relocation */
> -	0, /* FIXME */
> -	MI_BATCH_BUFFER_END,
> -};
> -
> -uint32_t *batch = gen6_batch;
> -uint32_t batch_size = sizeof(gen6_batch);
> -
> -static void run_test(int fd, int count)
> +static void run_test(int i915, int count)
>  {
> -	const uint32_t hang[] = {-1, -1, -1, -1};
> -	const uint32_t end[] = {MI_BATCH_BUFFER_END, 0};
> -	uint32_t noop;
> -	uint32_t *handles;
> -	int i;
> +	struct drm_i915_gem_execbuffer2 execbuf;
> +	struct drm_i915_gem_relocation_entry reloc[2];
> +	struct drm_i915_gem_exec_object2 obj;
>  
> -	noop = intel_get_drm_devid(fd);
> +	uint32_t *handles;
> +	uint32_t *tmpl;
>  
> -	use_blt = 0;
> -	if (intel_gen(noop) >= 6)
> -		use_blt = I915_EXEC_BLT;
> +	handles = malloc(count * sizeof(uint32_t));
> +	igt_assert(handles);
>  
> -	if (intel_gen(noop) >= 8) {
> -		batch = gen8_batch;
> -		batch_size += 2 * 4;
> +	memset(reloc, 0, sizeof(reloc));
> +	tmpl = create_tmpl(i915, reloc);
> +	for (int i = 0; i < count; i++) {
> +		handles[i] = gem_create(i915, 4096);
> +		gem_write(i915, handles[i], 0, tmpl, 4096);
>  	}
> +	free(tmpl);
>  
> -	handles = malloc (count * sizeof(uint32_t));
> -	igt_assert(handles);
> +	memset(&obj, 0, sizeof(obj));
> +	obj.relocs_ptr = to_user_pointer(reloc);
> +	obj.relocation_count = ARRAY_SIZE(reloc);
>  
> -	noop = gem_create(fd, 4096);
> -	gem_write(fd, noop, 0, end, sizeof(end));
> +	memset(&execbuf, 0, sizeof(execbuf));
> +	execbuf.buffers_ptr = to_user_pointer(&obj);
> +	execbuf.buffer_count = 1;
>  
>  	/* fill the entire gart with batches and run them */
> -	for (i = 0; i < count; i++) {
> -		uint32_t bad;
> -
> -		handles[i] = gem_create(fd, 4096);
> -		gem_write(fd, handles[i], 0, batch, batch_size);
> -
> -		bad = gem_create(fd, 4096);
> -		gem_write(fd, bad, 0, hang, sizeof(hang));
> -		gem_write(fd, bad, 4096-sizeof(end), end, sizeof(end));
> +	for (int i = 0; i < count; i++) {
> +		obj.handle = handles[i];
>  
> -		/* launch the newly created batch */
> -		copy(fd, handles[i], noop, bad);
> -		exec(fd, bad);
> -		gem_close(fd, bad);
> +		reloc[0].target_handle = obj.handle;
> +		reloc[0].presumed_offset = -1;
> +		reloc[1].target_handle = obj.handle;
> +		reloc[1].presumed_offset = -1;
>  
> -		igt_progress("gem_cpu_reloc: ", i, 2*count);
> +		gem_execbuf(i915, &execbuf);
>  	}
>  
>  	/* And again in reverse to try and catch the relocation code out */
> -	for (i = 0; i < count; i++) {
> -		uint32_t bad;
> +	for (int i = 0; i < count; i++) {
> +		obj.handle = handles[count - i - 1];
>  
> -		bad = gem_create(fd, 4096);
> -		gem_write(fd, bad, 0, hang, sizeof(hang));
> -		gem_write(fd, bad, 4096-sizeof(end), end, sizeof(end));
> +		reloc[0].target_handle = obj.handle;
> +		reloc[0].presumed_offset = -1;
> +		reloc[1].target_handle = obj.handle;
> +		reloc[1].presumed_offset = -1;
>  
> -		/* launch the newly created batch */
> -		copy(fd, handles[count-i-1], noop, bad);
> -		exec(fd, bad);
> -		gem_close(fd, bad);
> -
> -		igt_progress("gem_cpu_reloc: ", count+i, 3*count);
> +		gem_execbuf(i915, &execbuf);
>  	}
>  
> -	/* Third time lucky? */
> -	for (i = 0; i < count; i++) {
> -		uint32_t bad;
> +	/* Third time unlucky? */
> +	for (int i = 0; i < count; i++) {
> +		obj.handle = handles[i];
>  
> -		bad = gem_create(fd, 4096);
> -		gem_write(fd, bad, 0, hang, sizeof(hang));
> -		gem_write(fd, bad, 4096-sizeof(end), end, sizeof(end));
> +		reloc[0].target_handle = obj.handle;
> +		reloc[0].presumed_offset = -1;
> +		reloc[1].target_handle = obj.handle;
> +		reloc[1].presumed_offset = -1;
>  
> -		/* launch the newly created batch */
> -		gem_set_domain(fd, handles[i],
> -			       I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU);
> -		copy(fd, handles[i], noop, bad);
> -		exec(fd, bad);
> -		gem_close(fd, bad);
> +		gem_set_domain(i915, obj.handle,
> +			       I915_GEM_DOMAIN_CPU,
> +			       I915_GEM_DOMAIN_CPU);
>  
> -		igt_progress("gem_cpu_reloc: ", 2*count+i, 3*count);
> +		gem_execbuf(i915, &execbuf);
>  	}
>  
> -	igt_info("Subtest suceeded, cleanup up - this might take a while.\n");
> -	for (i = 0; i < count; i++) {
> -		gem_close(fd, handles[i]);
> -	}
> -	gem_close(fd, noop);
> +	for (int i = 0; i < count; i++)
> +		gem_close(i915, handles[i]);
>  	free(handles);
>  }
>  
>  igt_main
>  {
> -	uint64_t aper_size;
> -	int fd, count;
> +	int i915;
>  
>  	igt_fixture {
> -		fd = drm_open_driver(DRIVER_INTEL);
> -		igt_require_gem(fd);
> -	}
> +		i915 = drm_open_driver(DRIVER_INTEL);
> +		igt_require_gem(i915);
>  
> -	igt_subtest("basic") {
> -		run_test (fd, 10);
> +		igt_fork_hang_detector(i915);
>  	}
>  
> +	igt_subtest("basic")
> +		run_test(i915, 1);
>  
>  	igt_subtest("full") {
> -		aper_size = gem_mappable_aperture_size();
> -		count = aper_size / 4096 * 2;
> +		uint64_t aper_size = gem_mappable_aperture_size();
> +		unsigned long count = aper_size / 4096 + 1;
> +
> +		intel_require_memory(count, 4096, CHECK_RAM);
> +
> +		run_test(i915, count);
> +	}
> +
> +	igt_subtest("forked") {
> +		uint64_t aper_size = gem_mappable_aperture_size();
> +		unsigned long count = aper_size / 4096 + 1;
> +		int ncpus = sysconf(_SC_NPROCESSORS_ONLN);
>  
> -		/* count + 2 (noop & bad) buffers. A gem object appears to
> -                   require about 2kb + buffer + kernel overhead */
> -		intel_require_memory(2+count, 2048+4096, CHECK_RAM);
> +		intel_require_memory(count, 4096, CHECK_RAM);
>  
> -		run_test (fd, count);
> +		igt_fork(child, ncpus)
> +			run_test(i915, count / ncpus + 1);
> +		igt_waitchildren();
>  	}
>  
>  	igt_fixture {
> -		close(fd);
> +		igt_stop_hang_detector();
>  	}
>  }
> -- 
> 2.20.1