[igt-dev] [PATCH i-g-t v3 10/11] tests/i915/vm_bind: Add gem_exec3_balancer test

Fri Oct 14 09:12:10 UTC 2022

Hi Niranjana,

On 2022-10-09 at 23:59:28 -0700, Niranjana Vishwanathapura wrote:
> From: "Vishwanathapura, Niranjana" <niranjana.vishwanathapura at intel.com>
- ^^^^^  ^                ^
You still have this switched here, but you can just delete this
line, since you are the one sending this patch.

> 
> To test parallel submissions support in execbuf3, port the subtest
> gem_exec_balancer at parallel-ordering to a new gem_exec3_balancer test
> and switch to execbuf3 ioctl.

Could you keep this test inside the old one? If not, how much
new code did you add? What new functions did you add? What about
just adding a new subtest @vmbind-parallel-ordering?

> 
> Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura at intel.com>
> ---
>  tests/i915/gem_exec3_balancer.c | 500 ++++++++++++++++++++++++++++++++
>  tests/meson.build               |   7 +
>  2 files changed, 507 insertions(+)
>  create mode 100644 tests/i915/gem_exec3_balancer.c
> 
> diff --git a/tests/i915/gem_exec3_balancer.c b/tests/i915/gem_exec3_balancer.c
> new file mode 100644
> index 0000000000..3719e9fe0c
> --- /dev/null
> +++ b/tests/i915/gem_exec3_balancer.c
> @@ -0,0 +1,500 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2022 Intel Corporation
> + */
> +
> +/** @file gem_exec3_balancer.c
> + *
> + * Load balancer tests with execbuf3.
> + * Ported from gem_exec_balancer and made to work with
> + * vm_bind and execbuf3.
> + *
> + */
> +
> +#include <poll.h>
> +
> +#include "i915/gem.h"
> +#include "i915/gem_engine_topology.h"
> +#include "i915/gem_create.h"
> +#include "i915/gem_vm.h"
> +#include "igt.h"
> +#include "igt_gt.h"
> +#include "igt_perf.h"
> +#include "igt_syncobj.h"
> +
> +IGT_TEST_DESCRIPTION("Exercise in-kernel load-balancing with execbuf3");
> +
> +#define INSTANCE_COUNT (1 << I915_PMU_SAMPLE_INSTANCE_BITS)
> +
> +/*
> + * Report whether the engine (@class, @instance) can be seen through the i915
> + * PMU: the engine counts as present when its I915_PMU_ENGINE_BUSY counter can
> + * be opened.
> + */
> +static bool has_class_instance(int i915, uint16_t class, uint16_t instance)
> +{
> +	const int pmu = perf_i915_open(i915,
> +				       I915_PMU_ENGINE_BUSY(class, instance));
> +
> +	if (pmu < 0)
> +		return false;
> +
> +	/* The counter was only opened as an existence probe. */
> +	close(pmu);
> +	return true;
> +}
> +
> +/*
> + * Build the list of engines belonging to the classes selected by @class_mask.
> + *
> + * Bit 0 of @class_mask corresponds to I915_ENGINE_CLASS_RENDER and each higher
> + * bit to the next class value. For a selected class, instances
> + * 0..INSTANCE_COUNT-1 are probed with has_class_instance().
> + *
> + * Returns a heap array the caller must free(), or NULL if no engine matched;
> + * the number of entries is written to @out in both cases.
> + */
> +static struct i915_engine_class_instance *
> +list_engines(int i915, uint32_t class_mask, unsigned int *out)
> +{
> +	enum drm_i915_gem_engine_class class = I915_ENGINE_CLASS_RENDER;
> +	struct i915_engine_class_instance *engines;
> +	unsigned int capacity = 64, found = 0;
> +	unsigned int instance;
> +
> +	engines = malloc(capacity * sizeof(*engines));
> +	igt_assert(engines);
> +
> +	while (class_mask) {
> +		/* Only walk the instances of classes the caller asked for. */
> +		for (instance = 0;
> +		     (class_mask & 1) && instance < INSTANCE_COUNT;
> +		     instance++) {
> +			if (!has_class_instance(i915, class, instance))
> +				continue;
> +
> +			/* Double the array once it is full. */
> +			if (found == capacity) {
> +				capacity *= 2;
> +				engines = realloc(engines,
> +						  capacity * sizeof(*engines));
> +				igt_assert(engines);
> +			}
> +
> +			engines[found] = (struct i915_engine_class_instance){
> +				.engine_class = class,
> +				.engine_instance = instance,
> +			};
> +			found++;
> +		}
> +
> +		class++;
> +		class_mask >>= 1;
> +	}
> +
> +	if (found == 0) {
> +		free(engines);
> +		engines = NULL;
> +	}
> +
> +	*out = found;
> +	return engines;
> +}
> +
> +/* True when i915_perf_type_id() reports a non-zero value for this device. */
> +static bool has_perf_engines(int i915)
> +{
> +	return i915_perf_type_id(i915) != 0;
> +}
> +
> +/*
> + * Return an otherwise zeroed context configuration whose engine list holds
> + * the first @count entries of @ci, with num_engines set to @count.
> + */
> +static intel_ctx_cfg_t
> +ctx_cfg_for_engines(const struct i915_engine_class_instance *ci,
> +		    unsigned int count)
> +{
> +	intel_ctx_cfg_t cfg = { };
> +
> +	cfg.num_engines = count;
> +
> +	/* Copy from the last entry down to the first. */
> +	while (count--)
> +		cfg.engines[count] = ci[count];
> +
> +	return cfg;
> +}
> +
> +/*
> + * Create a context from the configuration that ctx_cfg_for_engines() builds
> + * for the @count engines in @ci, and return what intel_ctx_create() gives back.
> + */
> +static const intel_ctx_t *
> +ctx_create_engines(int i915, const struct i915_engine_class_instance *ci,
> +		   unsigned int count)
> +{
> +	intel_ctx_cfg_t cfg;
> +
> +	cfg = ctx_cfg_for_engines(ci, count);
> +
> +	return intel_ctx_create(i915, &cfg);
> +}
> +
> +/*
> + * Assert that the first dword of the buffer object @handle equals @expected.
> + *
> + * The first 4096 bytes of the object are CPU-mapped read-only for the check.
> + * Only when @wait is set is the object moved to the CPU domain before the
> + * value is read.
> + */
> +static void check_bo(int i915, uint32_t handle, unsigned int expected,
> +		     bool wait)
> +{
> +	uint32_t *map = gem_mmap__cpu(i915, handle, 0, 4096, PROT_READ);
> +
> +	if (wait) {
> +		gem_set_domain(i915, handle,
> +			       I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU);
> +	}
> +
> +	igt_assert_eq(map[0], expected);
> +
> +	munmap(map, 4096);
> +}
> +
> +/*
> + * Query the engine info of @i915 into a zeroed, fixed-size heap buffer.
> + *
> + * The query is asserted to succeed. The caller owns the returned buffer and
> + * must free() it.
> + */
> +static struct drm_i915_query_engine_info *query_engine_info(int i915)
> +{
> +	struct drm_i915_query_engine_info *engines;
> +
> +#define QUERY_SIZE	0x4000
> +	/* calloc() hands back the buffer already zero-filled. */
> +	engines = calloc(1, QUERY_SIZE);
> +	igt_assert(engines);
> +	igt_assert(!__gem_query_engines(i915, engines, QUERY_SIZE));
> +#undef QUERY_SIZE
> +
> +	return engines;
> +}
> +
> +/*
> + * Reorder @siblings in place so that each engine sits at the index given by
> + * its logical instance, as reported by the engine info query.
> + *
> + * This function only works if siblings contains all instances of a class:
> + * every logical instance must be smaller than @count, every sibling must be
> + * found in the query result, and a slot may not be filled twice.
> + */
> +static void logical_sort_siblings(int i915,
> +				  struct i915_engine_class_instance *siblings,
> +				  unsigned int count)
> +{
> +	struct drm_i915_query_engine_info *engines = query_engine_info(i915);
> +	struct i915_engine_class_instance *sorted;
> +	unsigned int i, s;
> +
> +	sorted = calloc(count, sizeof(*sorted));
> +	igt_assert(sorted);
> +
> +	for (s = 0; s < count; ++s) {
> +		for (i = 0; i < engines->num_engines; ++i) {
> +			uint16_t logical_instance;
> +
> +			/* Skip query entries describing a different engine. */
> +			if (siblings[s].engine_class !=
> +			    engines->engines[i].engine.engine_class ||
> +			    siblings[s].engine_instance !=
> +			    engines->engines[i].engine.engine_instance)
> +				continue;
> +
> +			logical_instance = engines->engines[i].logical_instance;
> +
> +			/* The slot must be in range and still zeroed. */
> +			igt_assert(logical_instance < count);
> +			igt_assert(!sorted[logical_instance].engine_class);
> +			igt_assert(!sorted[logical_instance].engine_instance);
> +
> +			sorted[logical_instance] = siblings[s];
> +			break;
> +		}
> +
> +		/* Reaching the end of the list means no entry matched. */
> +		igt_assert(i != engines->num_engines);
> +	}
> +
> +	memcpy(siblings, sorted, sizeof(*sorted) * count);
> +
> +	free(engines);
> +	free(sorted);
> +}
> +
> +/*
> + * Return true when a zero-timeout poll() for POLLIN on @fence reports no
> + * ready descriptor.
> + */
> +static bool fence_busy(int fence)
> +{
> +	struct pollfd pfd = { fence, POLLIN };
> +
> +	/* Timeout 0: poll() returns immediately; 0 means nothing is ready. */
> +	return poll(&pfd, 1, 0) == 0;
> +}
> +
> +/*
> + * Read the "timeslice_duration_ms" property for the class of @engine.
> + *
> + * Always reading from engine instance 0, with GuC submission the values are the
> + * same across all instances. Execlists they may differ but quite unlikely they
> + * would be and if they are we can live with this.
> + *
> + * Returns the value read, or 0 when the engine class is not one of the four
> + * handled here or the property read stored nothing. The original returned an
> + * uninitialised value in those cases.
> + */
> +static unsigned int get_timeslice(int i915,
> +				  struct i915_engine_class_instance engine)
> +{
> +	const char *name;
> +	unsigned int val = 0;
> +
> +	switch (engine.engine_class) {
> +	case I915_ENGINE_CLASS_RENDER:
> +		name = "rcs0";
> +		break;
> +	case I915_ENGINE_CLASS_COPY:
> +		name = "bcs0";
> +		break;
> +	case I915_ENGINE_CLASS_VIDEO:
> +		name = "vcs0";
> +		break;
> +	case I915_ENGINE_CLASS_VIDEO_ENHANCE:
> +		name = "vecs0";
> +		break;
> +	default:
> +		/* Unknown class: there is no property to read. */
> +		return 0;
> +	}
> +
> +	/* "%u" matches the unsigned int the value is stored into. */
> +	gem_engine_property_scanf(i915, name, "timeslice_duration_ms",
> +				  "%u", &val);
> +
> +	return val;
> +}
> +
> +/*
> + * Bind @length bytes of object @handle, starting at object offset 0, at
> + * address @va in the VM @vm_id. @syncobj is passed as the bind fence with
> + * I915_TIMELINE_FENCE_SIGNAL set.
> + */
> +static void i915_vm_bind(int i915, uint32_t vm_id, uint64_t va, uint32_t handle,
> +			 uint64_t length, uint32_t syncobj)
> +{
> +	struct drm_i915_gem_vm_bind bind;
> +
> +	/* Start from an all-zero request so unset fields stay 0. */
> +	memset(&bind, 0, sizeof(bind));
> +
> +	/* Where to bind. */
> +	bind.vm_id = vm_id;
> +	bind.start = va;
> +	bind.length = length;
> +
> +	/* What to bind. */
> +	bind.handle = handle;
> +	bind.offset = 0;
> +
> +	/* Fence attached to the bind. */
> +	bind.fence.handle = syncobj;
> +	bind.fence.flags = I915_TIMELINE_FENCE_SIGNAL;
> +
> +	gem_vm_bind(i915, &bind);
> +}
> +
> +/* Unbind the range [@va, @va + @length) from the VM @vm_id. */
> +static void i915_vm_unbind(int i915, uint32_t vm_id, uint64_t va, uint64_t length)
> +{
> +	struct drm_i915_gem_vm_unbind unbind;
> +
> +	/* Start from an all-zero request so unset fields stay 0. */
> +	memset(&unbind, 0, sizeof(unbind));
> +
> +	unbind.length = length;
> +	unbind.start = va;
> +	unbind.vm_id = vm_id;
> +
> +	gem_vm_unbind(i915, &unbind);
> +}
> +
> +/* Return the current CLOCK_MONOTONIC time in nanoseconds. */
> +static uint64_t gettime_ns(void)
> +{
> +	struct timespec now;
> +	uint64_t ns;
> +
> +	clock_gettime(CLOCK_MONOTONIC, &now);
> +
> +	/* Whole seconds scaled to nanoseconds, plus the sub-second part. */
> +	ns = (uint64_t)now.tv_sec * NSEC_PER_SEC;
> +	ns += now.tv_nsec;
> +
> +	return ns;
> +}
> +
> +/*
> + * Export the syncobj @handle as a sync file and return true when a
> + * zero-timeout poll() for POLLIN on it reports no ready descriptor. The
> + * exported fd is closed before returning.
> + */
> +static bool syncobj_busy(int i915, uint32_t handle)
> +{
> +	struct pollfd pfd = { .events = POLLIN };
> +	bool busy;
> +
> +	pfd.fd = syncobj_handle_to_fd(i915, handle,
> +				      DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE);
> +
> +	busy = poll(&pfd, 1, 0) == 0;
> +	close(pfd.fd);
> +
> +	return busy;
> +}
> +
> +/*
> + * Ensure a parallel submit actually runs on HW in parallel by putting a
> + * spinner on 1 engine, doing a parallel submit, and checking that the
> + * parallel submit is blocked behind the spinner.
> + *
> + * For every engine class with at least two instances:
> + *  - a parallel context spanning all siblings is created in a vm_bind VM;
> + *  - one target BO and one batch BO per sibling are bound, each batch doing
> + *    an MI_ATOMIC increment on the target BO;
> + *  - a non-preemptible spinner is started on engine 0 of a second context;
> + *  - the parallel execbuf3 is submitted and must stay busy, with the target
> + *    still 0, while the spinner runs;
> + *  - after the spinner ends the target must read @count.
> + *
> + * Classes with fewer than two engines, or for which the parallel context
> + * cannot be created, are skipped. @flags is currently unused.
> + */
> +static void parallel_ordering(int i915, unsigned int flags)
> +{
> +	/* Capacity of the per-class object, fence and address arrays. */
> +	enum { MAX_SLOTS = 32 };
> +	uint32_t vm_id;
> +	int class;
> +
> +	vm_id = gem_vm_create_in_vm_bind_mode(i915);
> +
> +	for (class = 0; class < 32; class++) {
> +		struct drm_i915_gem_timeline_fence exec_fence[MAX_SLOTS] = { };
> +		const intel_ctx_t *ctx = NULL, *spin_ctx = NULL;
> +		uint64_t fence_value = 0, batch_addr[MAX_SLOTS] = { };
> +		struct i915_engine_class_instance *siblings;
> +		uint32_t exec_syncobj, bind_syncobj[MAX_SLOTS];
> +		struct drm_i915_gem_execbuffer3 execbuf;
> +		/* Zeroed: the whole array is written into each batch BO. */
> +		uint32_t batch[16] = { }, obj[MAX_SLOTS];
> +		intel_ctx_cfg_t cfg;
> +		unsigned int count;
> +		igt_spin_t *spin;
> +		uint64_t ahnd;
> +		int i = 0;
> +
> +		siblings = list_engines(i915, 1u << class, &count);
> +		if (!siblings)
> +			continue;
> +
> +		if (count < 2) {
> +			free(siblings);
> +			continue;
> +		}
> +
> +		/*
> +		 * obj[] and bind_syncobj[] take count + 1 entries (target BO
> +		 * plus one batch per engine); exec_fence[] takes one more for
> +		 * the signal fence.
> +		 */
> +		igt_assert(count + 2 <= MAX_SLOTS);
> +
> +		logical_sort_siblings(i915, siblings, count);
> +
> +		memset(&cfg, 0, sizeof(cfg));
> +		cfg.parallel = true;
> +		cfg.num_engines = 1;
> +		cfg.width = count;
> +		memcpy(cfg.engines, siblings, sizeof(*siblings) * count);
> +
> +		if (__intel_ctx_create(i915, &cfg, &ctx)) {
> +			free(siblings);
> +			continue;
> +		}
> +		gem_context_set_vm(i915, ctx->id, vm_id);
> +
> +		/* Batch: atomically increment the dword at TARGET_BO_OFFSET. */
> +		batch[i] = MI_ATOMIC | MI_ATOMIC_INC;
> +#define TARGET_BO_OFFSET	(0x1 << 16)
> +		batch[++i] = TARGET_BO_OFFSET;
> +		batch[++i] = 0;
> +		batch[++i] = MI_BATCH_BUFFER_END;
> +
> +		/* obj[0] is the target BO, bound at TARGET_BO_OFFSET. */
> +		obj[0] = gem_create(i915, 4096);
> +		bind_syncobj[0] = syncobj_create(i915, 0);
> +		exec_fence[0].handle = bind_syncobj[0];
> +		exec_fence[0].flags = I915_TIMELINE_FENCE_WAIT;
> +		i915_vm_bind(i915, vm_id, TARGET_BO_OFFSET, obj[0], 4096, bind_syncobj[0]);
> +
> +		/* obj[1..count] are the batch BOs, one per sibling engine. */
> +		for (i = 1; i < count + 1; ++i) {
> +			obj[i] = gem_create(i915, 4096);
> +			gem_write(i915, obj[i], 0, batch, sizeof(batch));
> +
> +			batch_addr[i - 1] = TARGET_BO_OFFSET * (i + 1);
> +			bind_syncobj[i] = syncobj_create(i915, 0);
> +			exec_fence[i].handle = bind_syncobj[i];
> +			exec_fence[i].flags = I915_TIMELINE_FENCE_WAIT;
> +			i915_vm_bind(i915, vm_id, batch_addr[i - 1], obj[i], 4096, bind_syncobj[i]);
> +		}
> +
> +		/* Here i == count + 1: the last fence slot is the out-fence. */
> +		exec_syncobj = syncobj_create(i915, 0);
> +		exec_fence[i].handle = exec_syncobj;
> +		exec_fence[i].flags = I915_TIMELINE_FENCE_SIGNAL;
> +
> +		memset(&execbuf, 0, sizeof(execbuf));
> +		execbuf.ctx_id = ctx->id;
> +		execbuf.batch_address = to_user_pointer(batch_addr);
> +		execbuf.fence_count = count + 2;
> +		execbuf.timeline_fences = to_user_pointer(exec_fence);
> +
> +		/* Block parallel submission */
> +		spin_ctx = ctx_create_engines(i915, siblings, count);
> +		ahnd = get_simple_ahnd(i915, spin_ctx->id);
> +		spin = __igt_spin_new(i915,
> +				      .ahnd = ahnd,
> +				      .ctx = spin_ctx,
> +				      .engine = 0,
> +				      .flags = IGT_SPIN_FENCE_OUT |
> +				      IGT_SPIN_NO_PREEMPTION);
> +
> +		/* Wait for spinners to start */
> +		usleep(5 * 10000);
> +		igt_assert(fence_busy(spin->out_fence));
> +
> +		/* Submit parallel execbuf */
> +		gem_execbuf3(i915, &execbuf);
> +
> +		/*
> +		 * Wait long enough for timeslicing to kick in but not
> +		 * preemption. Spinner + parallel execbuf should be
> +		 * active. Assuming default timeslice / preemption values, if
> +		 * these are changed it is possible for the test to fail.
> +		 *
> +		 * NOTE(review): the value comes from a property named
> +		 * timeslice_duration_ms but usleep() takes microseconds;
> +		 * confirm the intended wait length.
> +		 */
> +		usleep(get_timeslice(i915, siblings[0]) * 2);
> +		igt_assert(fence_busy(spin->out_fence));
> +		igt_assert(syncobj_busy(i915, exec_syncobj));
> +		check_bo(i915, obj[0], 0, false);
> +
> +		/*
> +		 * End spinner and wait for spinner + parallel execbuf
> +		 * to complete.
> +		 */
> +		igt_spin_end(spin);
> +		igt_assert(syncobj_timeline_wait(i915, &exec_syncobj, &fence_value, 1,
> +						 gettime_ns() + (2 * NSEC_PER_SEC),
> +						 DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT, NULL));
> +		igt_assert(!syncobj_busy(i915, exec_syncobj));
> +		syncobj_destroy(i915, exec_syncobj);
> +		for (i = 0; i < count + 1; ++i)
> +			syncobj_destroy(i915, bind_syncobj[i]);
> +		check_bo(i915, obj[0], count, true);
> +
> +		/* Clean up */
> +		intel_ctx_destroy(i915, ctx);
> +		intel_ctx_destroy(i915, spin_ctx);
> +		i915_vm_unbind(i915, vm_id, TARGET_BO_OFFSET, 4096);
> +		for (i = 1; i < count + 1; ++i)
> +			i915_vm_unbind(i915, vm_id, batch_addr[i - 1], 4096);
> +
> +		for (i = 0; i < count + 1; ++i)
> +			gem_close(i915, obj[i]);
> +		free(siblings);
> +		igt_spin_free(i915, spin);
> +		put_ahnd(ahnd);
> +	}
> +
> +	gem_vm_destroy(i915, vm_id);
> +}
> +
> +/*
> + * Probe for load-balancing support by trying to create, and then destroying,
> + * a context configured with load_balance set and a single engine slot.
> + */
> +static bool has_load_balancer(int i915)
> +{
> +	const intel_ctx_cfg_t cfg = {
> +		.num_engines = 1,
> +		.load_balance = true,
> +	};
> +	const intel_ctx_t *ctx = NULL;
> +	bool supported;
> +
> +	supported = __intel_ctx_create(i915, &cfg, &ctx) == 0;
> +	intel_ctx_destroy(i915, ctx);
> +
> +	return supported;
> +}
> +
> +/*
> + * Return true only if every engine in the engine info query has
> + * I915_ENGINE_INFO_HAS_LOGICAL_INSTANCE set in its flags.
> + */
> +static bool has_logical_mapping(int i915)
> +{
> +	struct drm_i915_query_engine_info *engines = query_engine_info(i915);
> +	bool all_logical = true;
> +	unsigned int i;
> +
> +	/* Stop at the first engine that lacks the flag. */
> +	for (i = 0; all_logical && i < engines->num_engines; ++i)
> +		all_logical = !!(engines->engines[i].flags &
> +				 I915_ENGINE_INFO_HAS_LOGICAL_INSTANCE);
> +
> +	free(engines);
> +
> +	return all_logical;
> +}
> +
> +/*
> + * Probe for parallel submission support.
> + *
> + * The first engine class (0..31) that has at least two engines decides the
> + * answer: a parallel context spanning all of its logically sorted siblings is
> + * created and destroyed again, and the result is whether creation succeeded.
> + * Returns false when no class has two or more engines.
> + */
> +static bool has_parallel_execbuf(int i915)
> +{
> +	intel_ctx_cfg_t cfg = {
> +		.parallel = true,
> +		.num_engines = 1,
> +	};
> +	const intel_ctx_t *ctx = NULL;
> +	int class;
> +	int err;
> +
> +	for (class = 0; class < 32; class++) {
> +		struct i915_engine_class_instance *siblings;
> +		unsigned int count;
> +
> +		siblings = list_engines(i915, 1u << class, &count);
> +		if (siblings && count >= 2) {
> +			logical_sort_siblings(i915, siblings, count);
> +
> +			cfg.width = count;
> +			memcpy(cfg.engines, siblings,
> +			       sizeof(*siblings) * count);
> +			free(siblings);
> +
> +			err = __intel_ctx_create(i915, &cfg, &ctx);
> +			intel_ctx_destroy(i915, ctx);
> +
> +			return err == 0;
> +		}
> +
> +		/* Class unusable for the probe; free(NULL) is a no-op. */
> +		free(siblings);
> +	}
> +
> +	return false;
> +}
> +
> +igt_main
> +{
> +	int i915 = -1;
> +
> +	igt_fixture {
> +		i915 = drm_open_driver(DRIVER_INTEL);
> +		igt_require_gem(i915);
> +
> +		gem_require_contexts(i915);
> +		igt_require(gem_has_engine_topology(i915));
> +		igt_require(has_load_balancer(i915));
> +		igt_require(has_perf_engines(i915));

IMHO we could also have a function
		has_vm_bind(i915)
and use it:
		igt_require(has_vm_bind(i915));

Regards,
Kamil

> +	}
> +
> +	igt_subtest_group {
> +		igt_fixture {
> +			igt_require(has_logical_mapping(i915));
> +			igt_require(has_parallel_execbuf(i915));
> +		}
> +
> +		igt_describe("Ensure a parallel submit actually runs in parallel");
> +		igt_subtest("parallel-ordering")
> +			parallel_ordering(i915, 0);
> +	}
> +}
> diff --git a/tests/meson.build b/tests/meson.build
> index 50de8b7e89..6ca5a94282 100644
> --- a/tests/meson.build
> +++ b/tests/meson.build
> @@ -375,6 +375,13 @@ test_executables += executable('gem_exec_balancer', 'i915/gem_exec_balancer.c',
>  	   install : true)
>  test_list += 'gem_exec_balancer'
>  
> +test_executables += executable('gem_exec3_balancer', 'i915/gem_exec3_balancer.c',
> +	   dependencies : test_deps + [ lib_igt_perf ],
> +	   install_dir : libexecdir,
> +	   install_rpath : libexecdir_rpathdir,
> +	   install : true)
> +test_list += 'gem_exec3_balancer'
> +
>  test_executables += executable('gem_mmap_offset',
>  	   join_paths('i915', 'gem_mmap_offset.c'),
>  	   dependencies : test_deps + [ libatomic ],
> -- 
> 2.21.0.rc0.32.g243a4c7e27
>