[Intel-gfx] [PATCH i-g-t 09/12] i915: Add gem_exec_balancer
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Wed May 22 12:13:25 UTC 2019
On 22/05/2019 12:37, Chris Wilson wrote:
> Exercise the in-kernel load balancer checking that we can distribute
> batches across the set of ctx->engines to avoid load.
>
> v2: Many assorted improvements from Tvrtko.
>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> tests/Makefile.am | 1 +
> tests/Makefile.sources | 1 +
> tests/i915/gem_exec_balancer.c | 1052 ++++++++++++++++++++++++++++++++
> tests/meson.build | 7 +
> 4 files changed, 1061 insertions(+)
> create mode 100644 tests/i915/gem_exec_balancer.c
>
> diff --git a/tests/Makefile.am b/tests/Makefile.am
> index 5097debf6..c6af0aeaf 100644
> --- a/tests/Makefile.am
> +++ b/tests/Makefile.am
> @@ -96,6 +96,7 @@ gem_close_race_LDADD = $(LDADD) -lpthread
> gem_ctx_thrash_CFLAGS = $(AM_CFLAGS) $(THREAD_CFLAGS)
> gem_ctx_thrash_LDADD = $(LDADD) -lpthread
> gem_ctx_sseu_LDADD = $(LDADD) $(top_builddir)/lib/libigt_perf.la
> +i915_gem_exec_balancer_LDADD = $(LDADD) $(top_builddir)/lib/libigt_perf.la
> gem_exec_capture_LDADD = $(LDADD) -lz
> gem_exec_parallel_CFLAGS = $(AM_CFLAGS) $(THREAD_CFLAGS)
> gem_exec_parallel_LDADD = $(LDADD) -lpthread
> diff --git a/tests/Makefile.sources b/tests/Makefile.sources
> index 2ef689023..17fca6d77 100644
> --- a/tests/Makefile.sources
> +++ b/tests/Makefile.sources
> @@ -24,6 +24,7 @@ TESTS_progs = \
> i915/gem_ctx_clone \
> i915/gem_ctx_engines \
> i915/gem_ctx_shared \
> + i915/gem_exec_balancer \
> i915/gem_vm_create \
> kms_3d \
> kms_addfb_basic \
> diff --git a/tests/i915/gem_exec_balancer.c b/tests/i915/gem_exec_balancer.c
> new file mode 100644
> index 000000000..40a2719c0
> --- /dev/null
> +++ b/tests/i915/gem_exec_balancer.c
> @@ -0,0 +1,1052 @@
> +/*
> + * Copyright © 2018-2019 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + */
> +
> +#include <sched.h>
> +
> +#include "igt.h"
> +#include "igt_perf.h"
> +#include "i915/gem_ring.h"
> +#include "sw_sync.h"
> +
> +IGT_TEST_DESCRIPTION("Exercise in-kernel load-balancing");
> +
> +#define INSTANCE_COUNT (1 << I915_PMU_SAMPLE_INSTANCE_BITS)
> +
> +static bool has_class_instance(int i915, uint16_t class, uint16_t instance)
> +{
> + int fd;
> +
> + fd = perf_i915_open(I915_PMU_ENGINE_BUSY(class, instance));
> + if (fd != -1) {
> + close(fd);
> + return true;
> + }
> +
> + return false;
> +}
> +
> +static struct i915_engine_class_instance *
> +list_engines(int i915, uint32_t class_mask, unsigned int *out)
> +{
> + unsigned int count = 0, size = 64;
> + struct i915_engine_class_instance *engines;
> +
> + engines = malloc(size * sizeof(*engines));
> + if (!engines) {
> + *out = 0;
> + return NULL;
> + }
> +
> + for (enum drm_i915_gem_engine_class class = I915_ENGINE_CLASS_RENDER;
> + class_mask;
> + class++, class_mask >>= 1) {
> + if (!(class_mask & 1))
> + continue;
> +
> + for (unsigned int instance = 0;
> + instance < INSTANCE_COUNT;
> + instance++) {
> + if (!has_class_instance(i915, class, instance))
> + continue;
> +
> + if (count == size) {
> + struct i915_engine_class_instance *e;
> +
> + size *= 2;
> + e = realloc(engines, size*sizeof(*engines));
> + if (!e) {
> + *out = count;
> + return engines;
> + }
> +
> + engines = e;
> + }
> +
> + engines[count++] = (struct i915_engine_class_instance){
> + .engine_class = class,
> + .engine_instance = instance,
> + };
> + }
> + }
> +
> + if (!count) {
> + free(engines);
> + engines = NULL;
> + }
> +
> + *out = count;
> + return engines;
> +}
> +
> +static int __set_load_balancer(int i915, uint32_t ctx,
> + const struct i915_engine_class_instance *ci,
> + unsigned int count)
> +{
> + I915_DEFINE_CONTEXT_ENGINES_LOAD_BALANCE(balancer, count);
> + I915_DEFINE_CONTEXT_PARAM_ENGINES(engines, 1 + count);
> + struct drm_i915_gem_context_param p = {
> + .ctx_id = ctx,
> + .param = I915_CONTEXT_PARAM_ENGINES,
> + .size = sizeof(engines),
> + .value = to_user_pointer(&engines)
> + };
> +
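> + /*
> + * Engine map layout: slot 0 holds the virtual engine (the
> + * INVALID/NONE placeholder is claimed by the load-balance
> + * extension below), while slots 1..count hold the individual
> + * siblings.
> + */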
> + memset(&balancer, 0, sizeof(balancer));
> + balancer.base.name = I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE;
> +
> + igt_assert(count);
> + balancer.num_siblings = count;
> + memcpy(balancer.engines, ci, count * sizeof(*ci));
> +
> + memset(&engines, 0, sizeof(engines));
> + engines.extensions = to_user_pointer(&balancer);
> + engines.engines[0].engine_class =
> + I915_ENGINE_CLASS_INVALID;
> + engines.engines[0].engine_instance =
> + I915_ENGINE_CLASS_INVALID_NONE;
> + memcpy(engines.engines + 1, ci, count * sizeof(*ci));
> +
> + return __gem_context_set_param(i915, &p);
> +}
> +
> +static void set_load_balancer(int i915, uint32_t ctx,
> + const struct i915_engine_class_instance *ci,
> + unsigned int count)
> +{
> + igt_assert_eq(__set_load_balancer(i915, ctx, ci, count), 0);
> +}
> +
> +static uint32_t load_balancer_create(int i915,
> + const struct i915_engine_class_instance *ci,
> + unsigned int count)
> +{
> + uint32_t ctx;
> +
> + ctx = gem_context_create(i915);
> + set_load_balancer(i915, ctx, ci, count);
> +
> + return ctx;
> +}
> +
> +static uint32_t __batch_create(int i915, uint32_t offset)
> +{
> + const uint32_t bbe = MI_BATCH_BUFFER_END;
> + uint32_t handle;
> +
> + handle = gem_create(i915, ALIGN(offset + 4, 4096));
> + gem_write(i915, handle, offset, &bbe, sizeof(bbe));
> +
> + return handle;
> +}
> +
> +static uint32_t batch_create(int i915)
> +{
> + return __batch_create(i915, 0);
> +}
> +
> +static void invalid_balancer(int i915)
> +{
> + I915_DEFINE_CONTEXT_ENGINES_LOAD_BALANCE(balancer, 64);
> + I915_DEFINE_CONTEXT_PARAM_ENGINES(engines, 64);
> + struct drm_i915_gem_context_param p = {
> + .param = I915_CONTEXT_PARAM_ENGINES,
> + .value = to_user_pointer(&engines)
> + };
> + uint32_t handle;
> + void *ptr;
> +
> + /*
> + * Assume that I915_CONTEXT_PARAM_ENGINES validates the array
> + * of engines[]; our job is to determine if the load_balancer
> + * extension explodes.
> + */
> +
> + for (int class = 0; class < 32; class++) {
> + struct i915_engine_class_instance *ci;
> + unsigned int count;
> +
> + ci = list_engines(i915, 1 << class, &count);
> + if (!ci)
> + continue;
I know it is a huge stretch, but I do think the correct pattern for
list_engines is to assert on allocation (malloc + realloc) failures,
rather than treating that as no engines of this class and carrying on
to pass the test.
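Something like this is what I have in mind - an untested sketch, just
to illustrate asserting on the allocations:

	engines = malloc(size * sizeof(*engines));
	igt_assert(engines);

	...

	if (count == size) {
		size *= 2;
		engines = realloc(engines, size * sizeof(*engines));
		igt_assert(engines);
	}

That way an allocation failure fails the test loudly instead of looking
like a class with no engines, and the !count check at the end remains
the only path that returns NULL.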
With that change:
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Regards,
Tvrtko
> +
> + igt_assert_lte(count, 64);
> +
> + p.ctx_id = gem_context_create(i915);
> + p.size = (sizeof(struct i915_context_param_engines) +
> + (count + 1) * sizeof(*engines.engines));
> +
> + memset(&engines, 0, sizeof(engines));
> + engines.engines[0].engine_class = I915_ENGINE_CLASS_INVALID;
> + engines.engines[0].engine_instance = I915_ENGINE_CLASS_INVALID_NONE;
> + memcpy(engines.engines + 1, ci, count * sizeof(*ci));
> + gem_context_set_param(i915, &p);
> +
> + engines.extensions = -1ull;
> + igt_assert_eq(__gem_context_set_param(i915, &p), -EFAULT);
> +
> + engines.extensions = 1ull;
> + igt_assert_eq(__gem_context_set_param(i915, &p), -EFAULT);
> +
> + memset(&balancer, 0, sizeof(balancer));
> + balancer.base.name = I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE;
> + balancer.num_siblings = count;
> + memcpy(balancer.engines, ci, count * sizeof(*ci));
> +
> + engines.extensions = to_user_pointer(&balancer);
> + gem_context_set_param(i915, &p);
> +
> + balancer.engine_index = 1;
> + igt_assert_eq(__gem_context_set_param(i915, &p), -EEXIST);
> +
> + balancer.engine_index = count;
> + igt_assert_eq(__gem_context_set_param(i915, &p), -EEXIST);
> +
> + balancer.engine_index = count + 1;
> + igt_assert_eq(__gem_context_set_param(i915, &p), -EINVAL);
> +
> + balancer.engine_index = 0;
> + gem_context_set_param(i915, &p);
> +
> + balancer.base.next_extension = to_user_pointer(&balancer);
> + igt_assert_eq(__gem_context_set_param(i915, &p), -EEXIST);
> +
> + balancer.base.next_extension = -1ull;
> + igt_assert_eq(__gem_context_set_param(i915, &p), -EFAULT);
> +
> + handle = gem_create(i915, 4096 * 3);
> + ptr = gem_mmap__gtt(i915, handle, 4096 * 3, PROT_WRITE);
> + gem_close(i915, handle);
> +
> + memset(&engines, 0, sizeof(engines));
> + engines.engines[0].engine_class = I915_ENGINE_CLASS_INVALID;
> + engines.engines[0].engine_instance = I915_ENGINE_CLASS_INVALID_NONE;
> + engines.engines[1].engine_class = I915_ENGINE_CLASS_INVALID;
> + engines.engines[1].engine_instance = I915_ENGINE_CLASS_INVALID_NONE;
> + memcpy(engines.engines + 2, ci, count * sizeof(*ci));
> + p.size = (sizeof(struct i915_context_param_engines) +
> + (count + 2) * sizeof(*engines.engines));
> + gem_context_set_param(i915, &p);
> +
> + balancer.base.next_extension = 0;
> + balancer.engine_index = 1;
> + engines.extensions = to_user_pointer(&balancer);
> + gem_context_set_param(i915, &p);
> +
> + memcpy(ptr + 4096 - 8, &balancer, sizeof(balancer));
> + memcpy(ptr + 8192 - 8, &balancer, sizeof(balancer));
> + balancer.engine_index = 0;
> +
> + engines.extensions = to_user_pointer(ptr) + 4096 - 8;
> + gem_context_set_param(i915, &p);
> +
> + balancer.base.next_extension = engines.extensions;
> + engines.extensions = to_user_pointer(&balancer);
> + gem_context_set_param(i915, &p);
> +
> + munmap(ptr, 4096);
> + igt_assert_eq(__gem_context_set_param(i915, &p), -EFAULT);
> + engines.extensions = to_user_pointer(ptr) + 4096 - 8;
> + igt_assert_eq(__gem_context_set_param(i915, &p), -EFAULT);
> +
> + engines.extensions = to_user_pointer(ptr) + 8192 - 8;
> + gem_context_set_param(i915, &p);
> +
> + balancer.base.next_extension = engines.extensions;
> + engines.extensions = to_user_pointer(&balancer);
> + gem_context_set_param(i915, &p);
> +
> + munmap(ptr + 8192, 4096);
> + igt_assert_eq(__gem_context_set_param(i915, &p), -EFAULT);
> + engines.extensions = to_user_pointer(ptr) + 8192 - 8;
> + igt_assert_eq(__gem_context_set_param(i915, &p), -EFAULT);
> +
> + munmap(ptr + 4096, 4096);
> +
> + gem_context_destroy(i915, p.ctx_id);
> + free(ci);
> + }
> +}
> +
> +static void kick_kthreads(void)
> +{
> + usleep(20 * 1000); /* 20ms should be enough for ksoftirqd! */
> +}
> +
> +static double measure_load(int pmu, int period_us)
> +{
> + uint64_t data[2];
> + uint64_t d_t, d_v;
> +
> + kick_kthreads();
> +
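> + /*
> + * Negate the starting busy-ticks and timestamp sample so that
> + * adding the closing sample below leaves the deltas in d_v/d_t.
> + */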
> + igt_assert_eq(read(pmu, data, sizeof(data)), sizeof(data));
> + d_v = -data[0];
> + d_t = -data[1];
> +
> + usleep(period_us);
> +
> + igt_assert_eq(read(pmu, data, sizeof(data)), sizeof(data));
> + d_v += data[0];
> + d_t += data[1];
> +
> + return d_v / (double)d_t;
> +}
> +
> +static double measure_min_load(int pmu, unsigned int num, int period_us)
> +{
> + uint64_t data[2 + num];
> + uint64_t d_t, d_v[num];
> + uint64_t min = -1, max = 0;
> +
> + kick_kthreads();
> +
> + igt_assert_eq(read(pmu, data, sizeof(data)), sizeof(data));
> + for (unsigned int n = 0; n < num; n++)
> + d_v[n] = -data[2 + n];
> + d_t = -data[1];
> +
> + usleep(period_us);
> +
> + igt_assert_eq(read(pmu, data, sizeof(data)), sizeof(data));
> +
> + d_t += data[1];
> + for (unsigned int n = 0; n < num; n++) {
> + d_v[n] += data[2 + n];
> + igt_debug("engine[%d]: %.1f%%\n",
> + n, d_v[n] / (double)d_t * 100);
> + if (d_v[n] < min)
> + min = d_v[n];
> + if (d_v[n] > max)
> + max = d_v[n];
> + }
> +
> + igt_debug("elapsed: %"PRIu64"ns, load [%.1f, %.1f]%%\n",
> + d_t, min / (double)d_t * 100, max / (double)d_t * 100);
> +
> + return min / (double)d_t;
> +}
> +
> +static void check_individual_engine(int i915,
> + uint32_t ctx,
> + const struct i915_engine_class_instance *ci,
> + int idx)
> +{
> + igt_spin_t *spin;
> + double load;
> + int pmu;
> +
> + pmu = perf_i915_open(I915_PMU_ENGINE_BUSY(ci[idx].engine_class,
> + ci[idx].engine_instance));
> +
> + spin = igt_spin_new(i915, .ctx = ctx, .engine = idx + 1);
> + load = measure_load(pmu, 10000);
> + igt_spin_free(i915, spin);
> +
> + close(pmu);
> +
> + igt_assert_f(load > 0.90,
> + "engine %d (class:instance %d:%d) was found to be only %.1f%% busy\n",
> + idx, ci[idx].engine_class, ci[idx].engine_instance, load*100);
> +}
> +
> +static void individual(int i915)
> +{
> + uint32_t ctx;
> +
> + /*
> + * I915_CONTEXT_PARAM_ENGINES allows us to index into the user
> + * supplied array from gem_execbuf(). Our check is to build the
> + * ctx->engines[] with various different engine classes, feed in
> + * a spinner and then ask the PMU to confirm that the expected
> + * engine was busy.
> + */
> +
> + ctx = gem_context_create(i915);
> +
> + for (int class = 0; class < 32; class++) {
> + struct i915_engine_class_instance *ci;
> + unsigned int count;
> +
> + ci = list_engines(i915, 1u << class, &count);
> + if (!ci)
> + continue;
> +
> + for (int pass = 0; pass < count; pass++) { /* approx. count! */
> + igt_assert(sizeof(*ci) == sizeof(int));
> + igt_permute_array(ci, count, igt_exchange_int);
> + set_load_balancer(i915, ctx, ci, count);
> + for (unsigned int n = 0; n < count; n++)
> + check_individual_engine(i915, ctx, ci, n);
> + }
> +
> + free(ci);
> + }
> +
> + gem_context_destroy(i915, ctx);
> + gem_quiescent_gpu(i915);
> +}
> +
> +static void indices(int i915)
> +{
> + I915_DEFINE_CONTEXT_PARAM_ENGINES(engines, I915_EXEC_RING_MASK + 1);
> + struct drm_i915_gem_context_param p = {
> + .ctx_id = gem_context_create(i915),
> + .param = I915_CONTEXT_PARAM_ENGINES,
> + .value = to_user_pointer(&engines)
> + };
> +
> + struct drm_i915_gem_exec_object2 batch = {
> + .handle = batch_create(i915),
> + };
> +
> + unsigned int nengines = 0;
> + void *balancers = NULL;
> +
> + /*
> + * We can populate our engine map with multiple virtual engines.
> + * Do so.
> + */
> +
> + for (int class = 0; class < 32; class++) {
> + struct i915_engine_class_instance *ci;
> + unsigned int count;
> +
> + ci = list_engines(i915, 1u << class, &count);
> + if (!ci)
> + continue;
> +
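> + /*
> + * Build one load-balancer extension per slot, each balancing
> + * over all siblings of this class, and chain the extensions
> + * together via base.next_extension.
> + */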
> + for (int n = 0; n < count; n++) {
> + I915_DEFINE_CONTEXT_ENGINES_LOAD_BALANCE(*balancer,
> + count);
> +
> + engines.engines[nengines].engine_class =
> + I915_ENGINE_CLASS_INVALID;
> + engines.engines[nengines].engine_instance =
> + I915_ENGINE_CLASS_INVALID_NONE;
> +
> + balancer = calloc(1, sizeof(*balancer));
> + igt_assert(balancer);
> +
> + balancer->base.name =
> + I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE;
> + balancer->base.next_extension =
> + to_user_pointer(balancers);
> + balancers = balancer;
> +
> + balancer->engine_index = nengines++;
> + balancer->num_siblings = count;
> +
> + memcpy(balancer->engines,
> + ci, count * sizeof(*ci));
> + }
> + free(ci);
> + }
> +
> + igt_require(balancers);
> + engines.extensions = to_user_pointer(balancers);
> + p.size = (sizeof(struct i915_engine_class_instance) * nengines +
> + sizeof(struct i915_context_param_engines));
> + gem_context_set_param(i915, &p);
> +
> + for (unsigned int n = 0; n < nengines; n++) {
> + struct drm_i915_gem_execbuffer2 eb = {
> + .buffers_ptr = to_user_pointer(&batch),
> + .buffer_count = 1,
> + .flags = n,
> + .rsvd1 = p.ctx_id,
> + };
> + igt_debug("Executing on index=%d\n", n);
> + gem_execbuf(i915, &eb);
> + }
> + gem_context_destroy(i915, p.ctx_id);
> +
> + gem_sync(i915, batch.handle);
> + gem_close(i915, batch.handle);
> +
> + while (balancers) {
> + struct i915_context_engines_load_balance *b, *n;
> +
> + b = balancers;
> + n = from_user_pointer(b->base.next_extension);
> + free(b);
> +
> + balancers = n;
> + }
> +
> + gem_quiescent_gpu(i915);
> +}
> +
> +static void busy(int i915)
> +{
> + uint32_t scratch = gem_create(i915, 4096);
> +
> + /*
> + * Check that virtual engines are reported via GEM_BUSY.
> + *
> + * When running, the batch will be on the real engine and report
> + * the actual class.
> + *
> + * Prior to running, if the load-balancer is across multiple
> + * classes we don't know which engine the batch will
> + * execute on, so we report them all!
> + *
> + * However, as we only support (and test) creating a load-balancer
> + * from engines of only one class, that can be propagated accurately
> + * through to GEM_BUSY.
> + */
> +
> + for (int class = 0; class < 16; class++) {
> + struct drm_i915_gem_busy busy;
> + struct i915_engine_class_instance *ci;
> + unsigned int count;
> + igt_spin_t *spin[2];
> + uint32_t ctx;
> +
> + ci = list_engines(i915, 1u << class, &count);
> + if (!ci)
> + continue;
> +
> + ctx = load_balancer_create(i915, ci, count);
> + free(ci);
> +
> + spin[0] = __igt_spin_new(i915,
> + .ctx = ctx,
> + .flags = IGT_SPIN_POLL_RUN);
> + spin[1] = __igt_spin_new(i915,
> + .ctx = ctx,
> + .dependency = scratch);
> +
> + igt_spin_busywait_until_started(spin[0]);
> +
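> + /*
> + * GEM_BUSY encodes readers as a bitmask of busy engine classes
> + * in the high 16 bits (1 << (16 + class)) and the single writer
> + * as class + 1 in the low 16 bits.
> + */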
> + /* Running: actual class */
> + busy.handle = spin[0]->handle;
> + do_ioctl(i915, DRM_IOCTL_I915_GEM_BUSY, &busy);
> + igt_assert_eq_u32(busy.busy, 1u << (class + 16));
> +
> + /* Queued(read): expected class */
> + busy.handle = spin[1]->handle;
> + do_ioctl(i915, DRM_IOCTL_I915_GEM_BUSY, &busy);
> + igt_assert_eq_u32(busy.busy, 1u << (class + 16));
> +
> + /* Queued(write): expected class */
> + busy.handle = scratch;
> + do_ioctl(i915, DRM_IOCTL_I915_GEM_BUSY, &busy);
> + igt_assert_eq_u32(busy.busy,
> + (1u << (class + 16)) | (class + 1));
> +
> + igt_spin_free(i915, spin[1]);
> + igt_spin_free(i915, spin[0]);
> +
> + gem_context_destroy(i915, ctx);
> + }
> +
> + gem_close(i915, scratch);
> + gem_quiescent_gpu(i915);
> +}
> +
> +static int add_pmu(int pmu, const struct i915_engine_class_instance *ci)
> +{
> + return perf_i915_open_group(I915_PMU_ENGINE_BUSY(ci->engine_class,
> + ci->engine_instance),
> + pmu);
> +}
> +
> +static void full(int i915, unsigned int flags)
> +#define PULSE 0x1
> +#define LATE 0x2
> +{
> + struct drm_i915_gem_exec_object2 batch = {
> + .handle = batch_create(i915),
> + };
> +
> + if (flags & LATE)
> + igt_require_sw_sync();
> +
> + /*
> + * I915_CONTEXT_PARAM_ENGINES changes the meaning of the engine
> + * selector in execbuf to index into our own map, in which
> + * I915_EXEC_DEFAULT selects the virtual engine that balances
> + * work across the other ctx->engines[]. It employs
> + * load-balancing to evenly distribute the workload across the
> + * array. If we submit N spinners, we expect them to be running
> + * simultaneously across N engines and use the PMU to confirm
> + * that the entire set of engines is busy.
> + *
> + * We complicate matters by interspersing short-lived tasks to
> + * challenge the kernel to search for space in which to insert new
> + * batches.
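> + *
> + * PULSE intersperses a short nop batch ahead of each spinner;
> + * LATE plugs every submission behind an input fence so nothing
> + * runs until the whole set has been queued.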
> + */
> +
> + for (int class = 0; class < 32; class++) {
> + struct i915_engine_class_instance *ci;
> + igt_spin_t *spin = NULL;
> + IGT_CORK_FENCE(cork);
> + unsigned int count;
> + double load;
> + int fence = -1;
> + int *pmu;
> +
> + ci = list_engines(i915, 1u << class, &count);
> + if (!ci)
> + continue;
> +
> + pmu = malloc(sizeof(*pmu) * count);
> + igt_assert(pmu);
> +
> + if (flags & LATE)
> + fence = igt_cork_plug(&cork, i915);
> +
> + pmu[0] = -1;
> + for (unsigned int n = 0; n < count; n++) {
> + uint32_t ctx;
> +
> + pmu[n] = add_pmu(pmu[0], &ci[n]);
> +
> + if (flags & PULSE) {
> + struct drm_i915_gem_execbuffer2 eb = {
> + .buffers_ptr = to_user_pointer(&batch),
> + .buffer_count = 1,
> + .rsvd2 = fence,
> + .flags = flags & LATE ? I915_EXEC_FENCE_IN : 0,
> + };
> + gem_execbuf(i915, &eb);
> + }
> +
> + /*
> + * Each spinner needs to be on a new timeline,
> + * otherwise they will just sit in the single queue
> + * and not run concurrently.
> + */
> + ctx = load_balancer_create(i915, ci, count);
> +
> + if (spin == NULL) {
> + spin = __igt_spin_new(i915, .ctx = ctx);
> + } else {
> + struct drm_i915_gem_execbuffer2 eb = {
> + .buffers_ptr = spin->execbuf.buffers_ptr,
> + .buffer_count = spin->execbuf.buffer_count,
> + .rsvd1 = ctx,
> + .rsvd2 = fence,
> + .flags = flags & LATE ? I915_EXEC_FENCE_IN : 0,
> + };
> + gem_execbuf(i915, &eb);
> + }
> +
> + gem_context_destroy(i915, ctx);
> + }
> +
> + if (flags & LATE) {
> + igt_cork_unplug(&cork);
> + close(fence);
> + }
> +
> + load = measure_min_load(pmu[0], count, 10000);
> + igt_spin_free(i915, spin);
> +
> + close(pmu[0]);
> + free(pmu);
> +
> + free(ci);
> +
> + igt_assert_f(load > 0.90,
> + "minimum load for %d x class:%d was found to be only %.1f%% busy\n",
> + count, class, load*100);
> + gem_quiescent_gpu(i915);
> + }
> +
> + gem_close(i915, batch.handle);
> + gem_quiescent_gpu(i915);
> +}
> +
> +static const char *class_to_str(int class)
> +{
> + const char *str[] = {
> + [I915_ENGINE_CLASS_RENDER] = "rcs",
> + [I915_ENGINE_CLASS_COPY] = "bcs",
> + [I915_ENGINE_CLASS_VIDEO] = "vcs",
> + [I915_ENGINE_CLASS_VIDEO_ENHANCE] = "vecs",
> + };
> +
> + if (class < ARRAY_SIZE(str))
> + return str[class];
> +
> + return "unk";
> +}
> +
> +static void nop(int i915)
> +{
> + struct drm_i915_gem_exec_object2 batch = {
> + .handle = batch_create(i915),
> + };
> +
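> + /*
> + * Measure the submission overhead of a trivial batch on each
> + * individual sibling (indices 1..count), on the virtual engine
> + * (index 0), and then both again with all siblings contended by
> + * parallel children using cloned engine maps.
> + */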
> + for (int class = 0; class < 32; class++) {
> + struct i915_engine_class_instance *ci;
> + unsigned int count;
> + uint32_t ctx;
> +
> + ci = list_engines(i915, 1u << class, &count);
> + if (!ci)
> + continue;
> +
> + ctx = load_balancer_create(i915, ci, count);
> +
> + for (int n = 0; n < count; n++) {
> + struct drm_i915_gem_execbuffer2 execbuf = {
> + .buffers_ptr = to_user_pointer(&batch),
> + .buffer_count = 1,
> + .flags = n + 1,
> + .rsvd1 = ctx,
> + };
> + struct timespec tv = {};
> + unsigned long nops;
> + double t;
> +
> + igt_nsec_elapsed(&tv);
> + nops = 0;
> + do {
> + for (int r = 0; r < 1024; r++)
> + gem_execbuf(i915, &execbuf);
> + nops += 1024;
> + } while (igt_seconds_elapsed(&tv) < 2);
> + gem_sync(i915, batch.handle);
> +
> + t = igt_nsec_elapsed(&tv) * 1e-3 / nops;
> + igt_info("%s:%d %.3fus\n", class_to_str(class), n, t);
> + }
> +
> + {
> + struct drm_i915_gem_execbuffer2 execbuf = {
> + .buffers_ptr = to_user_pointer(&batch),
> + .buffer_count = 1,
> + .rsvd1 = ctx,
> + };
> + struct timespec tv = {};
> + unsigned long nops;
> + double t;
> +
> + igt_nsec_elapsed(&tv);
> + nops = 0;
> + do {
> + for (int r = 0; r < 1024; r++)
> + gem_execbuf(i915, &execbuf);
> + nops += 1024;
> + } while (igt_seconds_elapsed(&tv) < 2);
> + gem_sync(i915, batch.handle);
> +
> + t = igt_nsec_elapsed(&tv) * 1e-3 / nops;
> + igt_info("%s:* %.3fus\n", class_to_str(class), t);
> + }
> +
> + igt_fork(child, count) {
> + struct drm_i915_gem_execbuffer2 execbuf = {
> + .buffers_ptr = to_user_pointer(&batch),
> + .buffer_count = 1,
> + .flags = child + 1,
> + .rsvd1 = gem_context_clone(i915, ctx,
> + I915_CONTEXT_CLONE_ENGINES, 0),
> + };
> + struct timespec tv = {};
> + unsigned long nops;
> + double t;
> +
> + igt_nsec_elapsed(&tv);
> + nops = 0;
> + do {
> + for (int r = 0; r < 1024; r++)
> + gem_execbuf(i915, &execbuf);
> + nops += 1024;
> + } while (igt_seconds_elapsed(&tv) < 2);
> + gem_sync(i915, batch.handle);
> +
> + t = igt_nsec_elapsed(&tv) * 1e-3 / nops;
> + igt_info("[%d] %s:%d %.3fus\n",
> + child, class_to_str(class), child, t);
> +
> + memset(&tv, 0, sizeof(tv));
> + execbuf.flags = 0;
> +
> + igt_nsec_elapsed(&tv);
> + nops = 0;
> + do {
> + for (int r = 0; r < 1024; r++)
> + gem_execbuf(i915, &execbuf);
> + nops += 1024;
> + } while (igt_seconds_elapsed(&tv) < 2);
> + gem_sync(i915, batch.handle);
> +
> + t = igt_nsec_elapsed(&tv) * 1e-3 / nops;
> + igt_info("[%d] %s:* %.3fus\n",
> + child, class_to_str(class), t);
> +
> + gem_context_destroy(i915, execbuf.rsvd1);
> + }
> +
> + igt_waitchildren();
> +
> + gem_context_destroy(i915, ctx);
> + free(ci);
> + }
> +
> + gem_close(i915, batch.handle);
> + gem_quiescent_gpu(i915);
> +}
> +
> +static void ping(int i915, uint32_t ctx, unsigned int engine)
> +{
> + struct drm_i915_gem_exec_object2 obj = {
> + .handle = batch_create(i915),
> + };
> + struct drm_i915_gem_execbuffer2 execbuf = {
> + .buffers_ptr = to_user_pointer(&obj),
> + .buffer_count = 1,
> + .flags = engine,
> + .rsvd1 = ctx,
> + };
> + gem_execbuf(i915, &execbuf);
> + gem_sync(i915, obj.handle);
> + gem_close(i915, obj.handle);
> +}
> +
> +static void semaphore(int i915)
> +{
> + uint32_t block[2], scratch;
> + igt_spin_t *spin[3];
> +
> + /*
> + * If we are using HW semaphores to launch serialised requests
> + * on different engines concurrently, we want to verify that real
> + * work is unimpeded.
> + */
> + igt_require(gem_scheduler_has_preemption(i915));
> +
> + block[0] = gem_context_create(i915);
> + block[1] = gem_context_create(i915);
> +
> + scratch = gem_create(i915, 4096);
> + spin[2] = igt_spin_new(i915, .dependency = scratch);
> + for (int class = 1; class < 32; class++) {
> + struct i915_engine_class_instance *ci;
> + unsigned int count;
> + uint32_t vip;
> +
> + ci = list_engines(i915, 1u << class, &count);
> + if (!ci)
> + continue;
> +
> + if (count < ARRAY_SIZE(block)) {
> + free(ci);
> + continue;
> + }
> +
> + /* Ensure that we completely occupy all engines in this group */
> + count = ARRAY_SIZE(block);
> +
> + for (int i = 0; i < count; i++) {
> + set_load_balancer(i915, block[i], ci, count);
> + spin[i] = __igt_spin_new(i915,
> + .ctx = block[i],
> + .dependency = scratch);
> + }
> +
> + /*
> + * Either we haven't blocked both engines with semaphores,
> + * or we let the vip through. If not, we hang.
> + */
> + vip = gem_context_create(i915);
> + set_load_balancer(i915, vip, ci, count);
> + ping(i915, vip, 0);
> + gem_context_destroy(i915, vip);
> +
> + for (int i = 0; i < count; i++)
> + igt_spin_free(i915, spin[i]);
> +
> + free(ci);
> + }
> + igt_spin_free(i915, spin[2]);
> + gem_close(i915, scratch);
> +
> + gem_context_destroy(i915, block[1]);
> + gem_context_destroy(i915, block[0]);
> +
> + gem_quiescent_gpu(i915);
> +}
> +
> +static void smoketest(int i915, int timeout)
> +{
> + struct drm_i915_gem_exec_object2 batch[2] = {
> + { .handle = __batch_create(i915, 16380) }
> + };
> + unsigned int ncontext = 0;
> + uint32_t *contexts = NULL;
> + uint32_t *handles = NULL;
> +
> + igt_require_sw_sync();
> +
> + for (int class = 0; class < 32; class++) {
> + struct i915_engine_class_instance *ci;
> + unsigned int count = 0;
> +
> + ci = list_engines(i915, 1u << class, &count);
> + if (!ci || count < 2) {
> + free(ci);
> + continue;
> + }
> +
> + ncontext += 128;
> + contexts = realloc(contexts, sizeof(*contexts) * ncontext);
> + igt_assert(contexts);
> +
> + for (unsigned int n = ncontext - 128; n < ncontext; n++) {
> + contexts[n] = load_balancer_create(i915, ci, count);
> + igt_assert(contexts[n]);
> + }
> +
> + free(ci);
> + }
> + igt_debug("Created %d virtual engines (one per context)\n", ncontext);
> + igt_require(ncontext);
> +
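> + /*
> + * Duplicate the context array twice over so that each virtual
> + * engine appears four times in the submission pool below.
> + */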
> + contexts = realloc(contexts, sizeof(*contexts) * ncontext * 4);
> + igt_assert(contexts);
> + memcpy(contexts + ncontext, contexts, ncontext * sizeof(*contexts));
> + ncontext *= 2;
> + memcpy(contexts + ncontext, contexts, ncontext * sizeof(*contexts));
> + ncontext *= 2;
> +
> + handles = malloc(sizeof(*handles) * ncontext);
> + igt_assert(handles);
> + for (unsigned int n = 0; n < ncontext; n++)
> + handles[n] = gem_create(i915, 4096);
> +
> + igt_until_timeout(timeout) {
> + unsigned int count = 1 + (rand() % (ncontext - 1));
> + IGT_CORK_FENCE(cork);
> + int fence = igt_cork_plug(&cork, i915);
> +
> + for (unsigned int n = 0; n < count; n++) {
> + struct drm_i915_gem_execbuffer2 eb = {
> + .buffers_ptr = to_user_pointer(batch),
> + .buffer_count = ARRAY_SIZE(batch),
> + .rsvd1 = contexts[n],
> + .rsvd2 = fence,
> + .flags = I915_EXEC_BATCH_FIRST | I915_EXEC_FENCE_IN,
> + };
> + batch[1].handle = handles[n];
> + gem_execbuf(i915, &eb);
> + }
> + igt_permute_array(handles, count, igt_exchange_int);
> +
> + igt_cork_unplug(&cork);
> + for (unsigned int n = 0; n < count; n++)
> + gem_sync(i915, handles[n]);
> +
> + close(fence);
> + }
> +
> + for (unsigned int n = 0; n < ncontext; n++) {
> + gem_close(i915, handles[n]);
> + __gem_context_destroy(i915, contexts[n]);
> + }
> + free(handles);
> + free(contexts);
> + gem_close(i915, batch[0].handle);
> +}
> +
> +static bool has_context_engines(int i915)
> +{
> + struct drm_i915_gem_context_param p = {
> + .param = I915_CONTEXT_PARAM_ENGINES,
> + };
> +
> + return __gem_context_set_param(i915, &p) == 0;
> +}
> +
> +static bool has_load_balancer(int i915)
> +{
> + struct i915_engine_class_instance ci = {};
> + uint32_t ctx;
> + int err;
> +
> + ctx = gem_context_create(i915);
> + err = __set_load_balancer(i915, ctx, &ci, 1);
> + gem_context_destroy(i915, ctx);
> +
> + return err == 0;
> +}
> +
> +igt_main
> +{
> + int i915 = -1;
> +
> + igt_skip_on_simulation();
> +
> + igt_fixture {
> + i915 = drm_open_driver(DRIVER_INTEL);
> + igt_require_gem(i915);
> +
> + gem_require_contexts(i915);
> + igt_require(has_context_engines(i915));
> + igt_require(has_load_balancer(i915));
> +
> + igt_fork_hang_detector(i915);
> + }
> +
> + igt_subtest("invalid-balancer")
> + invalid_balancer(i915);
> +
> + igt_subtest("individual")
> + individual(i915);
> +
> + igt_subtest("indices")
> + indices(i915);
> +
> + igt_subtest("busy")
> + busy(i915);
> +
> + igt_subtest_group {
> + static const struct {
> + const char *name;
> + unsigned int flags;
> + } phases[] = {
> + { "", 0 },
> + { "-pulse", PULSE },
> + { "-late", LATE },
> + { "-late-pulse", PULSE | LATE },
> + { }
> + };
> + for (typeof(*phases) *p = phases; p->name; p++)
> + igt_subtest_f("full%s", p->name)
> + full(i915, p->flags);
> + }
> +
> + igt_subtest("nop")
> + nop(i915);
> +
> + igt_subtest("semaphore")
> + semaphore(i915);
> +
> + igt_subtest("smoke")
> + smoketest(i915, 20);
> +
> + igt_fixture {
> + igt_stop_hang_detector();
> + }
> +}
> diff --git a/tests/meson.build b/tests/meson.build
> index b7e56103d..20366b68b 100644
> --- a/tests/meson.build
> +++ b/tests/meson.build
> @@ -293,6 +293,13 @@ test_executables += executable('gem_eio',
> install : true)
> test_list += 'gem_eio'
>
> +test_executables += executable('gem_exec_balancer', 'i915/gem_exec_balancer.c',
> + dependencies : test_deps + [ lib_igt_perf ],
> + install_dir : libexecdir,
> + install_rpath : libexecdir_rpathdir,
> + install : true)
> +test_progs += 'gem_exec_balancer'
> +
> test_executables += executable('gem_mocs_settings',
> join_paths('i915', 'gem_mocs_settings.c'),
> dependencies : test_deps + [ lib_igt_perf ],
>