[Intel-gfx] [PATCH i-g-t] i915/gem_ctx_switch: Use minimum qlen over all engines and measure switches
Caz Yokoyama
Caz.Yokoyama at intel.com
Mon Feb 25 18:28:34 UTC 2019
Chris,
With your patch, measure_qlen() reports how many gem_execbuf() calls
can be executed (the queue length) within the timeout on the slowest
engine, correct?
Run time becomes 95 sec, which is less than half.
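If it helps, here is a stand-alone sketch of that arithmetic (not IGT
code; estimate_qlen(), the per-engine timings and the 20 sec timeout
are made-up inputs for illustration):

#include <stdio.h>

#define NCTX 64 /* matches ARRAY_SIZE(ctx) in the patch */

/* same arithmetic as the patch: scale the measured rate up to the
 * subtest timeout, then divide by 8 as a safety margin */
static int estimate_qlen(double elapsed_ns, int timeout)
{
	return NCTX * timeout * 1e9 / elapsed_ns / 8 + 1;
}

int main(void)
{
	/* hypothetical timings for NCTX nop batches per engine:
	 * RCS pays a larger context-switch cost than the xCS rings */
	double rcs_ns = 4e6, vcs_ns = 1e6;
	int timeout = 20; /* seconds, made-up for illustration */
	int q_rcs = estimate_qlen(rcs_ns, timeout);
	int q_vcs = estimate_qlen(vcs_ns, timeout);

	/* the patch returns the minimum, so the slowest engine
	 * (here RCS) bounds the queue length for every engine */
	printf("qlen: rcs=%d vcs=%d -> min=%d\n",
	       q_rcs, q_vcs, q_rcs < q_vcs ? q_rcs : q_vcs);
	return 0;
}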
-caz
On Sat, 2019-02-23 at 01:34 +0000, Chris Wilson wrote:
> Not all engines are created equal, and our weighting ends up favouring
> the many faster xCS rings at the expense of RCS. Our qlen estimation
> also failed to factor in the context switch overhead, which is a
> significant factor for nop batches. So we oversubscribe the number of
> batches submitted to RCS and end up waiting for those to complete at
> the end of our subtest timeslice.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Caz Yokoyama <caz.yokoyama at intel.com>
> ---
>  tests/i915/gem_ctx_switch.c | 39 +++++++++++++++++++++++++++++++--------
>  1 file changed, 31 insertions(+), 8 deletions(-)
>
> diff --git a/tests/i915/gem_ctx_switch.c b/tests/i915/gem_ctx_switch.c
> index 1208cb8d7..87e13b915 100644
> --- a/tests/i915/gem_ctx_switch.c
> +++ b/tests/i915/gem_ctx_switch.c
> @@ -26,6 +26,7 @@
>   */
>  
>  #include "igt.h"
> +#include <limits.h>
>  #include <unistd.h>
>  #include <stdlib.h>
>  #include <stdint.h>
> @@ -58,29 +59,50 @@ static int measure_qlen(int fd,
>  {
>  	const struct drm_i915_gem_exec_object2 * const obj =
>  		(struct drm_i915_gem_exec_object2 *)(uintptr_t)execbuf->buffers_ptr;
> -	int qlen = 64;
> +	uint32_t ctx[64];
> +	int min = INT_MAX, max = 0;
> +
> +	for (int i = 0; i < ARRAY_SIZE(ctx); i++)
> +		ctx[i] = gem_context_create(fd);
>  
>  	for (unsigned int n = 0; n < nengine; n++) {
>  		uint64_t saved = execbuf->flags;
>  		struct timespec tv = {};
> +		int q;
>  
>  		execbuf->flags |= engine[n];
>  
> -		igt_nsec_elapsed(&tv);
> -		for (int loop = 0; loop < qlen; loop++)
> +		for (int i = 0; i < ARRAY_SIZE(ctx); i++) {
> +			execbuf->rsvd1 = ctx[i];
>  			gem_execbuf(fd, execbuf);
> +		}
>  		gem_sync(fd, obj->handle);
>  
> -		execbuf->flags = saved;
> +		igt_nsec_elapsed(&tv);
> +		for (int i = 0; i < ARRAY_SIZE(ctx); i++) {
> +			execbuf->rsvd1 = ctx[i];
> +			gem_execbuf(fd, execbuf);
> +		}
> +		gem_sync(fd, obj->handle);
>  
>  		/*
>  		 * Be conservative and aim not to overshoot timeout, so scale
>  		 * down by 8 for hopefully a max of 12.5% error.
>  		 */
> -		qlen = qlen * timeout * 1e9 / igt_nsec_elapsed(&tv) / 8 + 1;
> +		q = ARRAY_SIZE(ctx) * timeout * 1e9 / igt_nsec_elapsed(&tv) / 8 + 1;
> +		if (q < min)
> +			min = q;
> +		if (q > max)
> +			max = q;
> +
> +		execbuf->flags = saved;
>  	}
>  
> -	return qlen;
> +	for (int i = 0; i < ARRAY_SIZE(ctx); i++)
> +		gem_context_destroy(fd, ctx[i]);
> +
> +	igt_debug("Estimated qlen: {min:%d, max:%d}\n", min, max);
> +	return min;
>  }
>  
>  static void single(int fd, uint32_t handle,
> @@ -259,9 +281,10 @@ static void all(int fd, uint32_t handle, unsigned flags, int timeout)
>  			clock_gettime(CLOCK_MONOTONIC, &now);
>  			gem_close(fd, obj[0].handle);
>  
> -			igt_info("[%d:%d] %s: %'u cycles: %.3fus%s\n",
> +			igt_info("[%d:%d] %s: %'u cycles: %.3fus%s (elapsed: %.3fs)\n",
>  				 nctx, child, name[child],
>  				 count, elapsed(&start, &now)*1e6 / count,
> -				 flags & INTERRUPTIBLE ? " (interruptible)" : "");
> +				 flags & INTERRUPTIBLE ? " (interruptible)" : "",
> +				 elapsed(&start, &now));
>  		}
>  		igt_waitchildren();
>  	}
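For what it's worth, a quick sanity check on the "12.5% error" comment
in the hunk above (my own arithmetic, not taken from the patch): writing
t = elapsed_ns / 64 for the measured cost of one nop batch on a given
engine, the estimate reduces to

    qlen ~= timeout * 1e9 / (8 * t)

so one full queue of qlen batches takes roughly timeout/8 to execute,
and whatever is still queued when the subtest deadline fires drains in
at most ~1/8 of the timeout, i.e. 12.5%. Returning the minimum over all
engines then keeps even the slowest engine (RCS, with its context-switch
overhead) within that bound.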