[Intel-gfx] [PATCH i-g-t] i915/gem_exec_balancer: Race breadcrumb signaling against timeslicing

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Fri Jul 17 08:34:07 UTC 2020


On 16/07/2020 21:44, Chris Wilson wrote:
> This is an attempt to chase down some preempt-to-busy races with
> breadcrumb signaling on the virtual engines. By using more semaphore
> spinners than available engines, we encourage very short timeslices, and
> we give each batch a random duration to try to make the end of a batch
> coincide with the context being scheduled out.
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
>   tests/i915/gem_exec_balancer.c | 109 +++++++++++++++++++++++++++++++++
>   1 file changed, 109 insertions(+)
> 
> diff --git a/tests/i915/gem_exec_balancer.c b/tests/i915/gem_exec_balancer.c
> index c5c0055fc..e4d9e0464 100644
> --- a/tests/i915/gem_exec_balancer.c
> +++ b/tests/i915/gem_exec_balancer.c
> @@ -2240,6 +2240,112 @@ static void hog(int i915)
>   	gem_quiescent_gpu(i915);
>   }
>   
> +static uint32_t sema_create(int i915, uint64_t addr, uint32_t **x)
> +{
> +	uint32_t handle = gem_create(i915, 4096);
> +
> +	*x = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
> +	for (int n = 1; n <= 32; n++) {
> +		uint32_t *cs = *x + n * 16;

Okay, so the semaphore target is dword 0 of the batch object itself; 
that's why the first 16 dwords are left as zeroes (noops) for 
convenience.
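
If I read sema_create() right, the object layout is (offsets are my own 
annotation, not from the patch):

	/*
	 * dword      0:  semaphore value, bumped from the CPU via the WC mmap
	 * dwords  16-20: batch 1:  MI_SEMAPHORE_WAIT for value >= 1, BB_END
	 * dwords  32-36: batch 2:  MI_SEMAPHORE_WAIT for value >= 2, BB_END
	 * ...
	 * dwords 512-516: batch 32: MI_SEMAPHORE_WAIT for value >= 32, BB_END
	 */

with batch_start_offset = 64 * n selecting batch n below.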

> +
> +		*cs++ = MI_SEMAPHORE_WAIT |
> +			MI_SEMAPHORE_POLL |
> +			MI_SEMAPHORE_SAD_GTE_SDD |
> +			(4 - 2);
> +		*cs++ = n;
> +		*cs++ = addr;
> +		*cs++ = addr >> 32;
> +
> +		*cs++ = MI_BATCH_BUFFER_END;
> +	}
> +
> +	return handle;
> +}
> +
> +static uint32_t *sema(int i915, uint32_t ctx)
> +{
> +	uint32_t *ctl;
> +	struct drm_i915_gem_exec_object2 batch = {
> +		.handle = sema_create(i915, 64 << 20, &ctl),
> +		.offset = 64 << 20,
> +		.flags = EXEC_OBJECT_PINNED
> +	};
> +	struct drm_i915_gem_execbuffer2 execbuf = {
> +		.buffers_ptr = to_user_pointer(&batch),
> +		.buffer_count = 1,
> +		.rsvd1 = gem_context_clone_with_engines(i915, ctx),
> +	};
> +	int64_t poll = 1;
> +
> +	for (int n = 1; n <= 32; n++) {
> +		execbuf.batch_start_offset = 64 * n;
> +		gem_execbuf(i915, &execbuf);
> +		/* Force a breadcrumb to be installed on each request */
> +		gem_wait(i915, batch.handle, &poll);
> +	}
> +
> +	gem_context_destroy(i915, execbuf.rsvd1);
> +
> +	igt_assert(gem_bo_busy(i915, batch.handle));
> +	gem_close(i915, batch.handle);
> +
> +	return ctl;
> +}
> +
> +static void __waits(int i915, int timeout, uint32_t ctx, unsigned int count)
> +{
> +	uint32_t *semaphores[count + 1];
> +
> +	for (int i = 0; i <= count; i++)
> +		semaphores[i] = sema(i915, ctx);
> +
> +	igt_until_timeout(timeout) {
> +		int i = rand() % (count + 1);
> +
> +		if ((*semaphores[i] += rand() % 32) >= 32) {

The write releases some of the batch buffers; once the value reaches 32 
it knows it has released all of them, and so it creates a fresh set.
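
In other words, spelling out the gating from sema_create() (a 
hypothetical helper of mine, not in the patch):

	/* Batch n ends once dword 0 reaches n (SAD_GTE_SDD), so a
	 * value v satisfies the waits of batches 1..min(v, 32). */
	static unsigned int batches_released(uint32_t value)
	{
		return value >= 32 ? 32 : value;
	}

So the >= 32 check is precisely "all 32 released, mint a fresh set".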

> +			munmap(semaphores[i], 4096);
> +			semaphores[i] = sema(i915, ctx);
> +		}
> +	}
> +
> +	for (int i = 0; i <= count; i++) {
> +		*semaphores[i] = 0xffffffff;
> +		munmap(semaphores[i], 4096);
> +	}
> +}
> +
> +static void waits(int i915, int timeout)
> +{
> +	igt_require(gem_scheduler_has_preemption(i915));
> +	igt_require(gem_scheduler_has_semaphores(i915));
> +
> +	for (int class = 0; class < 32; class++) {
> +		struct i915_engine_class_instance *ci;
> +		unsigned int count;
> +		uint32_t ctx;
> +
> +		ci = list_engines(i915, 1u << class, &count);
> +		if (!ci)
> +			continue;
> +
> +		if (count < 2) {
> +			free(ci);
> +			continue;
> +		}
> +
> +		ctx = load_balancer_create(i915, ci, count);
> +
> +		__waits(i915, timeout, ctx, count);
> +
> +		gem_context_destroy(i915, ctx);
> +		igt_waitchildren();

I don't see any forking in the test.

> +
> +		free(ci);
> +	}
> +
> +	gem_quiescent_gpu(i915);
> +}
> +
>   static void nop(int i915)
>   {
>   	struct drm_i915_gem_exec_object2 batch = {
> @@ -2729,6 +2835,9 @@ igt_main
>   	igt_subtest("hog")
>   		hog(i915);
>   
> +	igt_subtest("waits")
> +		waits(i915, 5);
> +
>   	igt_subtest("smoke")
>   		smoketest(i915, 20);
>   
> 

Looks okay in principle.

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>

I am not sure the batch durations aren't too short in practice: the add 
loop ends batches very rapidly, needing only ~64 iterations on average 
to end all 32, I think. So 64 WC writes from the CPU, compared to the 
CSB processing and breadcrumb signaling latencies, might be too short. 
Some small random udelays in the loop would perhaps be more realistic, 
maybe as a 2nd flavour of the test just in case; the more coverage the 
better.
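
Something along these lines in __waits() is what I have in mind 
(untested sketch, the delay bound is a guess):

	igt_until_timeout(timeout) {
		int i = rand() % (count + 1);

		if ((*semaphores[i] += rand() % 32) >= 32) {
			munmap(semaphores[i], 4096);
			semaphores[i] = sema(i915, ctx);
		}

		/* Give CSB processing and breadcrumb signaling a
		 * window to race against the next release. */
		usleep(1 + rand() % 50);
	}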

Regards,

Tvrtko

