[igt-dev] [PATCH i-g-t v5 2/2] tests/i915/perf_pmu: Avoid unordered sampling with SEMA_WAIT
Kamil Konieczny
kamil.konieczny at linux.intel.com
Tue Jul 26 05:46:45 UTC 2022
Hi Zbigniew,
On 2022-07-26 at 06:13:25 +0200, Zbigniew Kempczyński wrote:
> On Broadwell/Braswell, we sometimes see that the MI_SEMAPHORE_WAIT
> samples from memory before the MI_STORE_DWORD_IMM was written: unordered
- ^
imho s/samples/reads/ and s/: unordered reads//, or if you want
to keep that note here, s/unordered reads/out-of-order memory access/.
> reads. This causes the semaphore to complete too early, and the test
> asserts as the batch is completed before the measurement is begun.
>
> We can use more MI_STORE_DWORD_IMM before the MI_SEMAPHORE_WAIT to
> ensure the MI pipeline is flushed and the write to memory is visible
> before our first read by the semaphore-wait. However, we can also just
> change our MI_SEMAPHORE_WAIT operation to not break if it sees the
> initial 0 by waiting for the value to change to 2. (Now the value starts
> at 0, is set to 1 by the batch to indicate it has started and the caller
> should start its sampling, and then set by 2 by the caller as it
----------------------------------------- ^
s/by 2/to 2/
> finishes sampling the semaphore busyness.)
>
> Closes: https://gitlab.freedesktop.org/drm/intel/issues/2383
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Signed-off-by: Zbigniew Kempczyński <zbigniew.kempczynski at intel.com>
With that fixed,
Reviewed-by: Kamil Konieczny <kamil.konieczny at linux.intel.com>
> ---
>
> v5: Rework, change semaphore wait value
> ---
> tests/i915/perf_pmu.c | 27 +++++++++++++++++++--------
> 1 file changed, 19 insertions(+), 8 deletions(-)
>
> diff --git a/tests/i915/perf_pmu.c b/tests/i915/perf_pmu.c
> index 39e9fc5fef..e2b13540e7 100644
> --- a/tests/i915/perf_pmu.c
> +++ b/tests/i915/perf_pmu.c
> @@ -685,7 +685,7 @@ no_sema(int gem_fd, const intel_ctx_t *ctx,
> #define MI_SEMAPHORE_WAIT MI_INSTR(0x1c, 2) /* GEN8+ */
> #define MI_SEMAPHORE_POLL (1<<15)
> #define MI_SEMAPHORE_SAD_GTE_SDD (1<<12)
> -#define MI_SEMAPHORE_SAD_NEQ_SDD (5 << 12)
> +#define MI_SEMAPHORE_SAD_EQ_SDD (4 << 12)
>
> static void
> sema_wait(int gem_fd, const intel_ctx_t *ctx,
> @@ -812,9 +812,9 @@ create_sema(int gem_fd, uint64_t ahnd,
> 0,
> 1,
>
> - /* Wait until the semaphore value is set to 0 [by caller] */
> - MI_SEMAPHORE_WAIT | MI_SEMAPHORE_POLL | MI_SEMAPHORE_SAD_NEQ_SDD,
> - 1,
> + /* Wait until the semaphore value is set to 2 [by caller] */
> + MI_SEMAPHORE_WAIT | MI_SEMAPHORE_POLL | MI_SEMAPHORE_SAD_EQ_SDD,
> + 2,
> 0,
> 0,
>
> @@ -872,26 +872,37 @@ __sema_busy(int gem_fd, uint64_t ahnd, int pmu, const intel_ctx_t *ctx,
> };
> igt_spin_t *spin;
> uint32_t *map;
> + struct timespec tv = {};
> + int timeout = 3;
>
> /* Time spent being busy includes time waiting on semaphores */
> igt_assert(busy_pct >= sema_pct);
>
> gem_quiescent_gpu(gem_fd);
>
> - map = gem_mmap__device_coherent(gem_fd, obj.handle, 0, 4096, PROT_WRITE);
> + map = gem_mmap__device_coherent(gem_fd, obj.handle, 0, 4096, PROT_READ | PROT_WRITE);
> gem_execbuf(gem_fd, &eb);
> spin = igt_spin_new(gem_fd, .ahnd = ahnd, .ctx = ctx, .engine = e->flags);
>
> - /* Wait until the batch is executed and the semaphore is busy-waiting */
> - while (!READ_ONCE(*map) && gem_bo_busy(gem_fd, obj.handle))
> + /*
> + * Wait until the batch is executed and the semaphore is busy-waiting.
> + * Also stop on timeout.
> + */
> + igt_nsec_elapsed(&tv);
> + while (READ_ONCE(*map) != 1 && gem_bo_busy(gem_fd, obj.handle) &&
> + igt_seconds_elapsed(&tv) < timeout)
> ;
> + igt_debug("bo_busy = %d, *map = %u, timeout: [%u/%u]\n",
> + gem_bo_busy(gem_fd, obj.handle), *map,
> + igt_seconds_elapsed(&tv), timeout);
> + igt_assert(*map == 1);
> igt_assert(gem_bo_busy(gem_fd, obj.handle));
> gem_close(gem_fd, obj.handle);
>
> total = pmu_read_multi(pmu, 2, start);
>
> sema = measured_usleep(batch_duration_ns * sema_pct / 100 / 1000);
> - *map = 0; __sync_synchronize();
> + *map = 2; __sync_synchronize();
> busy = measured_usleep(batch_duration_ns * (busy_pct - sema_pct) / 100 / 1000);
> igt_spin_end(spin);
> measured_usleep(batch_duration_ns * (100 - busy_pct) / 100 / 1000);
> --
> 2.34.1
>