[PATCH i-g-t 1/1] tests/intel/xe_exec_mix_modes: Add new tests for parallel execution
Matthew Brost
matthew.brost at intel.com
Wed Jul 17 13:47:39 UTC 2024
On Wed, Jul 17, 2024 at 02:30:48PM +0200, Francois Dugast wrote:
> Test parallel execution of LR and dma fence jobs on the same device.
>
> Add the following tests:
> * "exec-simple-batch-store-lr"
> * "exec-simple-batch-store-dma-fence"
> * "exec-spinner-interrupted-lr"
> * "exec-spinner-interrupted-dma-fence"
>
Really good test, a couple of nits / questions.
> Signed-off-by: Francois Dugast <francois.dugast at intel.com>
> ---
> tests/intel/xe_exec_mix_modes.c | 277 ++++++++++++++++++++++++++++++++
> tests/meson.build | 1 +
> 2 files changed, 278 insertions(+)
> create mode 100644 tests/intel/xe_exec_mix_modes.c
>
> diff --git a/tests/intel/xe_exec_mix_modes.c b/tests/intel/xe_exec_mix_modes.c
> new file mode 100644
> index 000000000..44265b220
> --- /dev/null
> +++ b/tests/intel/xe_exec_mix_modes.c
> @@ -0,0 +1,277 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2024 Intel Corporation
> + */
> +
> +/**
> + * TEST: Test the parallel submission of jobs in LR and dma fence modes
> + * Category: Core
> + * Mega feature: General Core features
> + * Sub-category: CMD submission
> + * Functionality: fault mode
> + * GPU requirements: GPU needs support for DRM_XE_VM_CREATE_FLAG_FAULT_MODE
> + */
> +
> +#include <fcntl.h>
> +
> +#include "igt.h"
> +#include "lib/igt_syncobj.h"
> +#include "lib/intel_reg.h"
> +#include "xe_drm.h"
> +
> +#include "xe/xe_ioctl.h"
> +#include "xe/xe_query.h"
> +#include "xe/xe_spin.h"
> +#include <string.h>
> +
> +#define FLAG_EXEC_MODE_LR (0x1 << 0)
> +#define FLAG_JOB_TYPE_SIMPLE (0x1 << 1)
> +
> +#define NUM_INTERRUPTING_JOBS 5
> +#define USER_FENCE_VALUE 0xdeadbeefdeadbeefull
> +#define ONE_SEC MS_TO_NS(1000)
> +#define VM_DATA 0
> +#define SPIN_DATA 1
> +#define EXEC_DATA 2
> +#define DATA_COUNT 3
> +
> +struct data {
> + struct xe_spin spin;
> + uint32_t batch[16];
> + uint64_t vm_sync;
> + uint32_t data;
> + uint64_t exec_sync;
> + uint64_t addr;
> +};
> +
> +static void store_dword_batch(struct data *data, uint64_t addr, int value)
> +{
> + int b;
> + uint64_t batch_offset = (char *)&(data->batch) - (char *)data;
> + uint64_t batch_addr = addr + batch_offset;
> + uint64_t sdi_offset = (char *)&(data->data) - (char *)data;
> + uint64_t sdi_addr = addr + sdi_offset;
> +
> + b = 0;
> + data->batch[b++] = MI_STORE_DWORD_IMM_GEN4;
> + data->batch[b++] = sdi_addr;
> + data->batch[b++] = sdi_addr >> 32;
> + data->batch[b++] = value;
> + data->batch[b++] = MI_BATCH_BUFFER_END;
> + igt_assert(b <= ARRAY_SIZE(data->batch));
> +
> + data->addr = batch_addr;
> +}
> +
> +enum engine_execution_mode {
> + EXEC_MODE_LR,
> + EXEC_MODE_DMA_FENCE,
> +};
> +
> +enum job_type {
> + SIMPLE_BATCH_STORE,
> + SPINNER_INTERRUPTED,
> +};
> +
> +static void
> +run_job(int fd, struct drm_xe_engine_class_instance *hwe,
> + enum engine_execution_mode engine_execution_mode,
> + enum job_type job_type)
> +{
> + struct drm_xe_sync sync[1] = {
> + { .flags = DRM_XE_SYNC_FLAG_SIGNAL, },
> + };
> + struct drm_xe_exec exec = {
> + .num_batch_buffer = 1,
> + .num_syncs = 1,
> + .syncs = to_user_pointer(&sync),
> + };
> + struct drm_xe_ext_set_property ext = {
> + .base.next_extension = 0,
> + .base.name = DRM_XE_EXEC_QUEUE_EXTENSION_SET_PROPERTY,
> + .property = DRM_XE_EXEC_QUEUE_SET_PROPERTY_PRIORITY,
> + .value = 2, /* High priority */
> + };
I don't understand why high priority is set here. Can you explain?
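If the priority bump isn't actually needed, I think you could just drop
'ext' and create the queue with no extensions, e.g.:

	exec_queue = xe_exec_queue_create(fd, vm, hwe, 0);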
> + struct data *data;
> + uint32_t vm;
> + uint32_t exec_queue;
> + size_t bo_size;
> + int value = 0x123456;
> + uint64_t addr = 0x100000;
> + uint32_t bo = 0;
> + unsigned int vm_flags = 0;
> + struct xe_spin_opts spin_opts = { .preempt = true };
> + const uint64_t duration_ns = NSEC_PER_SEC / 2; /* 500ms */
> + struct timespec tv;
> +
> + if (engine_execution_mode == EXEC_MODE_LR) {
> + sync[0].type = DRM_XE_SYNC_TYPE_USER_FENCE;
> + sync[0].timeline_value = USER_FENCE_VALUE;
> + vm_flags = DRM_XE_VM_CREATE_FLAG_LR_MODE | DRM_XE_VM_CREATE_FLAG_FAULT_MODE;
> + } else if (engine_execution_mode == EXEC_MODE_DMA_FENCE) {
> + sync[0].type = DRM_XE_SYNC_TYPE_SYNCOBJ;
> + sync[0].handle = syncobj_create(fd, 0);
> + }
> +
> + vm = xe_vm_create(fd, vm_flags, 0);
> + bo_size = sizeof(*data) * DATA_COUNT;
> + bo_size = xe_bb_size(fd, bo_size);
> + bo = xe_bo_create(fd, vm, bo_size,
> + vram_if_possible(fd, hwe->gt_id),
> + DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> + data = xe_bo_map(fd, bo, bo_size);
> + if (engine_execution_mode == EXEC_MODE_LR)
> + sync[0].addr = to_user_pointer(&data[VM_DATA].vm_sync);
> + xe_vm_bind_async(fd, vm, 0, bo, 0, addr, bo_size, &sync[0], 1);
> +
> + store_dword_batch(data, addr, value);
> + if (engine_execution_mode == EXEC_MODE_LR) {
> + xe_wait_ufence(fd, &data[VM_DATA].vm_sync, USER_FENCE_VALUE, 0, ONE_SEC);
> + sync[0].addr = addr + (char *)&data[EXEC_DATA].exec_sync - (char *)data;
> + } else if (engine_execution_mode == EXEC_MODE_DMA_FENCE) {
> + igt_assert(syncobj_wait(fd, &sync[0].handle, 1, INT64_MAX, 0, NULL));
> + syncobj_reset(fd, &sync[0].handle, 1);
> + sync[0].flags &= DRM_XE_SYNC_FLAG_SIGNAL;
> + }
> + exec_queue = xe_exec_queue_create(fd, vm, hwe, to_user_pointer(&ext));
> + exec.exec_queue_id = exec_queue;
> +
> + if (job_type == SPINNER_INTERRUPTED) {
> + spin_opts.addr = addr + (char *)&data[SPIN_DATA].spin - (char *)data;
> + spin_opts.ctx_ticks = duration_to_ctx_ticks(fd, 0, duration_ns);
> + xe_spin_init(&data[SPIN_DATA].spin, &spin_opts);
> + if (engine_execution_mode == EXEC_MODE_LR)
> + sync[0].addr = addr + (char *)&data[SPIN_DATA].exec_sync - (char *)data;
Indentation looks off here.
> + exec.address = spin_opts.addr;
> + } else if (job_type == SIMPLE_BATCH_STORE) {
> + exec.address = data->addr;
> + }
> + xe_exec(fd, &exec);
> +
> + if (job_type == SPINNER_INTERRUPTED) {
> + enum engine_execution_mode interrupting_engine_execution_mode;
> + if (engine_execution_mode == EXEC_MODE_LR)
> + interrupting_engine_execution_mode = EXEC_MODE_DMA_FENCE;
> + else if (engine_execution_mode == EXEC_MODE_DMA_FENCE)
> + interrupting_engine_execution_mode = EXEC_MODE_LR;
> + xe_spin_wait_started(&data[SPIN_DATA].spin);
> + igt_gettime(&tv);
> + for (int i = 0; i < NUM_INTERRUPTING_JOBS; i++)
> + {
> + run_job(fd, hwe, interrupting_engine_execution_mode, SIMPLE_BATCH_STORE);
> + /**
> + * Executing a SIMPLE_BATCH_STORE job takes significantly less time than
> + * duration_ns.
> + * When a spinner is running in LR mode, the interrupting job preempts it
> + * in KMD and should complete fast, shortly after starting the spinner.
> + * When a spinner is running in dma fence mode, the interrupting job waits
> + * in KMD and should complete shortly after the spinner has ended.
> + * The checks below are to verify preempting/waiting happens as expected
> + * depending on the execution mode.
> + */
> + if (engine_execution_mode == EXEC_MODE_LR)
> + igt_assert(igt_nsec_elapsed(&tv) < 0.5 * duration_ns);
> + else if (engine_execution_mode == EXEC_MODE_DMA_FENCE)
> + igt_assert(igt_nsec_elapsed(&tv) > duration_ns);
> + }
> + }
Should we also run the 'NUM_INTERRUPTING_JOBS' loop here? i.e. have the
simple-batch-store-lr / simple-batch-store-dma-fence tests open VMs in
both modes doing a simple store (no preemption)?
We'd have to protect against infinite recursion, but adding a flag for
that should be easy, e.g. SIMPLE_BATCH_STORE_NO_RECURSION.
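Untested sketch of what I mean — assuming we add a job type that behaves
exactly like SIMPLE_BATCH_STORE but never recurses:

enum job_type {
	SIMPLE_BATCH_STORE,
	SIMPLE_BATCH_STORE_NO_RECURSION,
	SPINNER_INTERRUPTED,
};

and then in run_job(), after xe_exec():

	/* Interrupt the plain store with stores on a VM in the other mode */
	if (job_type == SIMPLE_BATCH_STORE) {
		enum engine_execution_mode other_mode =
			engine_execution_mode == EXEC_MODE_LR ?
			EXEC_MODE_DMA_FENCE : EXEC_MODE_LR;

		for (int i = 0; i < NUM_INTERRUPTING_JOBS; i++)
			run_job(fd, hwe, other_mode,
				SIMPLE_BATCH_STORE_NO_RECURSION);
	}

SIMPLE_BATCH_STORE_NO_RECURSION would get the same treatment as
SIMPLE_BATCH_STORE everywhere else (exec.address, the fence wait, the
data check).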
> +
> + if (engine_execution_mode == EXEC_MODE_LR) {
> + if (job_type == SPINNER_INTERRUPTED)
> + xe_wait_ufence(fd, &data[SPIN_DATA].exec_sync, USER_FENCE_VALUE, 0, ONE_SEC * 2);
> + else if (job_type == SIMPLE_BATCH_STORE)
> + xe_wait_ufence(fd, &data[EXEC_DATA].exec_sync, USER_FENCE_VALUE, 0, ONE_SEC * 2);
> + } else if (engine_execution_mode == EXEC_MODE_DMA_FENCE) {
> + igt_assert(syncobj_wait(fd, &sync[0].handle, 1, INT64_MAX, 0, NULL));
> + syncobj_destroy(fd, sync[0].handle);
> + }
> +
> + if (job_type == SIMPLE_BATCH_STORE)
> + igt_assert_eq(data->data, value);
> +
> + munmap(data, bo_size);
> + gem_close(fd, bo);
> + xe_exec_queue_destroy(fd, exec_queue);
> + xe_vm_destroy(fd, vm);
> +}
> +
> +/**
> + * SUBTEST: exec-simple-batch-store-lr
> + * Description: Execute a simple batch store job in long running mode
> + *
> + * SUBTEST: exec-simple-batch-store-dma-fence
> + * Description: Execute a simple batch store job in dma fence mode
> + *
> + * SUBTEST: exec-spinner-interrupted-lr
> + * Description: Spin in long running mode then get interrupted by a simple
> + * batch store job in dma fence mode
> + *
> + * SUBTEST: exec-spinner-interrupted-dma-fence
> + * Description: Spin in dma fence mode then get interrupted by a simple
> + * batch store job in long running mode
> + */
> +static void
> +test_exec(int fd, struct drm_xe_engine_class_instance *hwe,
> + unsigned int flags)
> +{
> + enum engine_execution_mode engine_execution_mode;
> + enum job_type job_type;
> +
> + if (flags & FLAG_EXEC_MODE_LR)
> + engine_execution_mode = EXEC_MODE_LR;
> + else
> + engine_execution_mode = EXEC_MODE_DMA_FENCE;
> +
> + if (flags & FLAG_JOB_TYPE_SIMPLE)
> + job_type = SIMPLE_BATCH_STORE;
> + else
> + job_type = SPINNER_INTERRUPTED;
> +
> + run_job(fd, hwe, engine_execution_mode, job_type);
> +}
> +
> +igt_main
> +
> +
Extra whitespace.
> +{
> + struct drm_xe_engine_class_instance *hwe;
> + const struct section {
> + const char *name;
> + unsigned int flags;
> + } sections[] = {
> + { "simple-batch-store-lr", FLAG_JOB_TYPE_SIMPLE | FLAG_EXEC_MODE_LR },
> + { "simple-batch-store-dma-fence", FLAG_JOB_TYPE_SIMPLE },
> + { "spinner-interrupted-lr", FLAG_EXEC_MODE_LR },
> + { "spinner-interrupted-dma-fence", 0 },
> + { NULL },
> + };
> + int fd;
> +
> + igt_fixture {
> + struct timespec tv = {};
> + bool supports_faults;
> + int ret = 0;
> + int timeout = igt_run_in_simulation() ? 20 : 2;
> +
> + fd = drm_open_driver(DRIVER_XE);
> + do {
> + if (ret)
> + usleep(5000);
> + ret = xe_supports_faults(fd);
> + } while (ret == -EBUSY && igt_seconds_elapsed(&tv) < timeout);
> +
> + supports_faults = !ret;
> + igt_require(supports_faults);
I was unsure why this code was added in xe_exec_fault_mode, so I had to
look:

git format-patch -1 8abb25ffe58

If you read the explanation for that, it's because we didn't support
mixing faulting and non-faulting VMs being open at the same time, plus
races when closing the VMs. With your KMD series we support having both
a faulting VM and a non-faulting VM open, so this retry loop is not
required.

e.g. I think you can just do this:

igt_fixture {
	fd = drm_open_driver(DRIVER_XE);
	igt_require(xe_supports_faults(fd));
}
Matt
> + }
> +
> + for (const struct section *s = sections; s->name; s++) {
> + igt_subtest_f("exec-%s", s->name)
> + xe_for_each_engine(fd, hwe)
> + if (hwe->engine_class == DRM_XE_ENGINE_CLASS_COMPUTE)
> + test_exec(fd, hwe, s->flags);
> + }
> +
> + igt_fixture {
> + drm_close_driver(fd);
> + }
> +}
> diff --git a/tests/meson.build b/tests/meson.build
> index 357db2723..e649466be 100644
> --- a/tests/meson.build
> +++ b/tests/meson.build
> @@ -286,6 +286,7 @@ intel_xe_progs = [
> 'xe_exec_basic',
> 'xe_exec_compute_mode',
> 'xe_exec_fault_mode',
> + 'xe_exec_mix_modes',
> 'xe_exec_queue_property',
> 'xe_exec_reset',
> 'xe_exec_sip',
> --
> 2.43.0
>