[PATCH i-g-t 4/4] xe_exec_nop: create xe_exec_nop

Thu Apr 10 08:53:42 UTC 2025


> -----Original Message-----
> From: igt-dev <igt-dev-bounces at lists.freedesktop.org> On Behalf Of Pravalika
> Gurram
> Sent: 02 April 2025 10:11 PM
> To: igt-dev at lists.freedesktop.org
> Cc: Gurram, Pravalika <pravalika.gurram at intel.com>
> Subject: [PATCH i-g-t 4/4] xe_exec_nop: create xe_exec_nop
> 
> Signed-off-by: Pravalika Gurram <pravalika.gurram at intel.com>
> ---
>  benchmarks/meson.build   |   1 +
>  benchmarks/xe_exec_nop.c | 221
> +++++++++++++++++++++++++++++++++++++++
>  2 files changed, 222 insertions(+)
>  create mode 100644 benchmarks/xe_exec_nop.c
> 
> diff --git a/benchmarks/meson.build b/benchmarks/meson.build index
> f29d5a288..1af13b0c6 100644
> --- a/benchmarks/meson.build
> +++ b/benchmarks/meson.build
> @@ -24,6 +24,7 @@ benchmark_progs = [
>          'xe_blt',
>          'xe_create',
>          'xe_exec_ctx',
> +        'xe_exec_nop',
>  ]
> 
>  benchmarksdir = join_paths(libexecdir, 'benchmarks') diff --git
> a/benchmarks/xe_exec_nop.c b/benchmarks/xe_exec_nop.c new file mode
> 100644 index 000000000..99249dd96
> --- /dev/null
> +++ b/benchmarks/xe_exec_nop.c
> @@ -0,0 +1,221 @@
> +/*
> + * Copyright © 2025 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person
> +obtaining a
> + * copy of this software and associated documentation files (the
> +"Software"),
> + * to deal in the Software without restriction, including without
> +limitation
> + * the rights to use, copy, modify, merge, publish, distribute,
> +sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom
> +the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the
> +next
> + * paragraph) shall be included in all copies or substantial portions
> +of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> +EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> +MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO
> EVENT
> +SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
> DAMAGES OR
> +OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> +ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> OTHER
> +DEALINGS
> + * IN THE SOFTWARE.
> + *
> + * Authors:
> + *    Pravalika Gurram <pravalika.gurram at intel.com>
> + *
> + */
> +
> +#include "drm.h"
> +#include "drmtest.h"
> +
> +#include "intel_io.h"
> +#include "intel_reg.h"
> +#include <sys/ioctl.h>
> +#include "ioctl_wrappers.h"
> +#include "igt_syncobj.h"
> +#include "xe/xe_ioctl.h"
> +#include "xe/xe_query.h"
> +#include "xe/xe_util.h"
> +#include <time.h>
> +
> +#define READ_ALL 0x4
> +static double elapsed(const struct timespec *start,
> +		      const struct timespec *end)
> +{
> +	return (end->tv_sec - start->tv_sec) + 1e-9*(end->tv_nsec -
> +start->tv_nsec); }
> +
> +static void first_batch(int fd, struct drm_xe_engine_class_instance
> +*eci) {
> +	uint64_t bo_size = xe_bb_size(fd, SZ_4K);
> +	struct drm_xe_engine_class_instance inst = {
> +		.engine_class = DRM_XE_ENGINE_CLASS_RENDER,
> +	};
Why is inst declared here? It is not used anywhere in this function.
Can you also explain why you need both first_batch and second_batch?
> +	uint32_t vm, bo;
> +	u32 q;
> +	vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE,
> 0);
> +	bo = xe_bo_create(fd, vm, bo_size, system_memory(fd), 0);
> +	q = xe_exec_queue_create(fd, vm, eci, 0);
> +	xe_exec_queue_destroy(fd, q);
> +	gem_close(fd, bo);
> +	xe_vm_destroy(fd, vm);
> +}
> +static void second_batch(int fd, struct drm_xe_engine_class_instance
> +*eci) {
> +
> +	int err;
> +	uint64_t bo_size = xe_bb_size(fd, SZ_4K), bo_addr = 0x1a0000;
> +	uint32_t vm, bo, *batch, exec_queue;
> +	struct drm_xe_engine_class_instance inst = {
> +		.engine_class = DRM_XE_ENGINE_CLASS_RENDER,
> +	};
> +
> +	struct drm_xe_sync sync = {
> +		.type = DRM_XE_SYNC_TYPE_SYNCOBJ,
> +		.flags = DRM_XE_SYNC_FLAG_SIGNAL,
> +		.handle = syncobj_create(fd, 0),
> +	};
> +	struct drm_xe_exec exec = {
> +		.num_syncs = 1,
> +		.syncs = to_user_pointer(&sync),
> +		.address = bo_addr,
> +		.num_batch_buffer = 1,
> +	};
> +
> +	vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_SCRATCH_PAGE,
> 0);
> +	bo = xe_bo_create(fd, vm, bo_size, system_memory(fd), 0);
> +
> +	batch = xe_bo_map(fd, bo, bo_size);
> +	*batch = MI_BATCH_BUFFER_END;
> +	munmap(batch, bo_size);
> +
> +	xe_vm_bind_sync(fd, vm, bo, 0, bo_addr, bo_size);
> +	exec_queue = xe_exec_queue_create(fd, vm, eci, 0);
> +
> +	exec.exec_queue_id = exec_queue;
> +	err = __xe_exec(fd, &exec);
> +
> +	err = syncobj_wait(fd, &sync.handle, 1, INT64_MAX, 0, NULL);
> +	xe_exec_queue_destroy(fd, exec_queue);
> +	gem_close(fd, bo);
> +	xe_vm_destroy(fd, vm);
> +	syncobj_destroy(fd, sync.handle);
> +
> +}
> +static int loop(unsigned ring, int reps, int ncpus, unsigned flags) {
> +	double *shared;
> +	int fd;
> +	struct drm_xe_engine_class_instance *hwe;
> +	shared = mmap(0, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -
> 1, 0);
> +
> +	fd = drm_open_driver(DRIVER_XE);
> +	if (ring) {
> +		xe_for_each_engine(fd, hwe) {
> +			if (hwe->engine_class == ring) {
> +				first_batch(fd, hwe);
> +				second_batch(fd, hwe);
> +			}
> +		}
> +	} else {
> +		xe_for_each_engine(fd, hwe) {
> +			first_batch(fd, hwe);
> +			second_batch(fd, hwe);
> +		}
> +	}
> +
> +
> +	while (reps--) {
> +		memset(shared, 0, 4096);
> +
> +		sleep(1); /* wait for the hw to go back to sleep */
> +
> +		igt_fork(child, ncpus) {
> +			struct timespec start, end;
> +			unsigned count = 0;
> +
> +			first_batch(fd, hwe);
> +			second_batch(fd, hwe);
> +
> +			clock_gettime(CLOCK_MONOTONIC, &start);
> +			do {
> +				for (int inner = 0; inner < 1024; inner++) {
> +					if (flags & READ_ALL) {
What does READ_ALL mean here? Its actual interpretation in i915 is different.
Refer to this commit: 05ca171aa9a6902614241f9685de2f62f30126d8
"we look at the throughput for submitting a read batch to a
    single engine or any. The kernel optimises for this by allowing multiple
    engine to read at the same time, but writes are exclusive to a single
    engine. So, lets try to measure the impact of inserting the barriers
    between writes on different engines."
Look into it and you will understand the meaning behind it. First go through the code flow in i915, and then port it to xe.

> +						if (ring) {
> +
> 	xe_for_each_engine(fd, hwe) {
> +								if (hwe-
> >engine_class == ring) {
> +
> 	first_batch(fd, hwe);
> +
> 	second_batch(fd, hwe);
> +								}
> +							}
> +						} else {
> +
> 	xe_for_each_engine(fd, hwe) {
> +								first_batch(fd,
> hwe);
> +
> 	second_batch(fd, hwe);
> +							}
> +						}
> +					}
> +				}
> +
> +				clock_gettime(CLOCK_MONOTONIC, &end);
> +			} while (elapsed(&start, &end) < 2.);
> +
> +			clock_gettime(CLOCK_MONOTONIC, &end);
> +			shared[child] = 1e6*elapsed(&start, &end) / count;
> +
> +		}
> +		igt_waitchildren();
> +
> +		for (int child = 0; child < ncpus; child++)
> +			shared[ncpus] += shared[child];
> +		printf("%7.3f ncpus %d\n", shared[ncpus] / ncpus, ncpus);
> +	}
> +	return 0;
> +}
> +
> +int main(int argc, char **argv)
> +{
> +	unsigned ring = DRM_XE_ENGINE_CLASS_RENDER;
> +	unsigned flags = 0;
> +	int reps = 1;
> +	int ncpus = 1;
> +	int c;
> +
> +	while ((c = getopt (argc, argv, "e:r:f:A")) != -1) {
> +		switch (c) {
> +		case 'e':
> +			if (strcmp(optarg, "rcs") == 0)
> +				ring = DRM_XE_ENGINE_CLASS_RENDER;
> +			else if (strcmp(optarg, "vcs") == 0)
> +				ring =
> DRM_XE_ENGINE_CLASS_VIDEO_DECODE;
> +			else if (strcmp(optarg, "bcs") == 0)
> +				ring = DRM_XE_ENGINE_CLASS_COPY;
> +			else if (strcmp(optarg, "vecs") == 0)
> +				ring =
> DRM_XE_ENGINE_CLASS_VIDEO_ENHANCE;
> +			else if (strcmp(optarg, "ccs") == 0)
> +				ring = DRM_XE_ENGINE_CLASS_COMPUTE;
> +			else
> +				ring = atoi(optarg);
> +			break;
> +
> +		case 'r':
> +			reps = atoi(optarg);
> +			if (reps < 1)
> +				reps = 1;
> +			break;
> +
> +		case 'f':
> +			ncpus = sysconf(_SC_NPROCESSORS_ONLN);
> +			break;
> +
> +		case 'A':
> +			flags |= READ_ALL;
> +			break;
> +
> +		default:
> +			break;
> +		}
> +	}
> +
> +	return loop(ring, reps, ncpus, flags); }
> --
> 2.34.1