[igt-dev] [PATCH i-g-t 14/14] benchmarks/gem_wsim: added basic xe support

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Tue Sep 26 13:10:09 UTC 2023


On 26/09/2023 09:44, Marcin Bernatowicz wrote:
> Added basic xe support. Single binary handles both i915 and Xe devices.
> 
> Some functionality is still missing: working sets, bonding.
> 
> The tool is handy for scheduling tests; we find it useful for verifying
> vGPU profiles that define different execution quantum/preemption timeout
> settings.
> 
> There is also some rationale for the tool in the following thread:
> https://lore.kernel.org/dri-devel/a443495f-5d1b-52e1-9b2f-80167deb6d57@linux.intel.com/
> 
> With this patch it should be possible to run the following on an xe device:
> 
> gem_wsim -w benchmarks/wsim/media_load_balance_fhd26u7.wsim -c 36 -r 600
> 
> Best with drm debug logs disabled:
> 
> echo 0 > /sys/module/drm/parameters/debug
> 
> v2: minimizing divergence - same workload syntax for both drivers,
>      so most existing examples should run on xe unmodified (Tvrtko)

Awesome!

>      This version creates one common VM per workload.
>      Explicit VM management, compute mode will come in next patchset.

I think this is going quite well and it looks promising that we will end 
up with something clean.

The only thing I feel needs to be said ahead of time is that I am not 
convinced we should be merging any xe specific changes until xe arrives 
upstream.

But a lot of good progress can still be made in the meantime with 
refactoring, cleanup and review.

> 
> Signed-off-by: Marcin Bernatowicz <marcin.bernatowicz at linux.intel.com>
> ---
>   benchmarks/gem_wsim.c  | 515 ++++++++++++++++++++++++++++++++++++++---
>   benchmarks/wsim/README |   6 +-
>   2 files changed, 485 insertions(+), 36 deletions(-)
> 
> diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c
> index 7703ca822..c83ed4882 100644
> --- a/benchmarks/gem_wsim.c
> +++ b/benchmarks/gem_wsim.c
> @@ -62,6 +62,12 @@
>   #include "i915/gem_engine_topology.h"
>   #include "i915/gem_mman.h"
>   
> +#include "igt_syncobj.h"
> +#include "intel_allocator.h"
> +#include "xe_drm.h"
> +#include "xe/xe_ioctl.h"
> +#include "xe/xe_spin.h"
> +
>   enum intel_engine_id {
>   	DEFAULT,
>   	RCS,
> @@ -109,6 +115,10 @@ struct deps {
>   	struct dep_entry *list;
>   };
>   
> +#define for_each_dep(__dep, __deps) \
> +	for (int __i = 0; __i < __deps.nr && \
> +	     (__dep = &__deps.list[__i]); ++__i)

Could you make use of this macro outside xe too? Like in do_eb()? If so, 
you could extract it and merge it ahead of time.
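
Something like this maybe (untested, assuming do_eb() still indexes 
into w->data_deps.list directly):

        struct dep_entry *dep;

        for_each_dep(dep, w->data_deps) {
                int dep_idx = w->idx + dep->target;

                igt_assert(dep_idx >= 0 && dep_idx < w->idx);
                /* existing per-dependency handling unchanged */
        }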

> +
>   struct w_arg {
>   	char *filename;
>   	char *desc;
> @@ -177,10 +187,30 @@ struct w_step {
>   	struct drm_i915_gem_execbuffer2 eb;
>   	struct drm_i915_gem_exec_object2 *obj;
>   	struct drm_i915_gem_relocation_entry reloc[3];
> +
> +	struct drm_xe_exec exec;
> +	size_t bb_size;
> +	struct xe_spin *spin;
> +	struct drm_xe_sync *syncs;

Let's think about how to create backend-specific containers here and 
make them a union, so it is clear what gets used in each case and that 
they are mutually exclusive. It may end up requiring separate 
refactoring patch(es) which move the i915 bits into the i915 
union/namespace.

I know gem_wsim did not have that much of a focus on clean design, but 
now that a 2nd backend is coming I think it is much more important for 
ease of maintenance in the future.
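
For instance, something like this in struct w_step (just a sketch, 
naming and the exact split up for debate):

        union {
                struct {
                        struct drm_i915_gem_execbuffer2 eb;
                        struct drm_i915_gem_exec_object2 *obj;
                        struct drm_i915_gem_relocation_entry reloc[3];
                } i915;
                struct {
                        struct drm_xe_exec exec;
                        size_t bb_size;
                        struct xe_spin *spin;
                        struct drm_xe_sync *syncs;
                } xe;
        };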

> +
>   	uint32_t bb_handle;
>   	uint32_t *bb_duration;
>   };
>   
> +struct vm {

Please prefix everything xe-specific with xe_: structs, functions, etc.

> +	uint32_t id;
> +	bool compute_mode;
> +	uint64_t ahnd;
> +};
> +
> +struct exec_queue {
> +	uint32_t id;
> +	struct drm_xe_engine_class_instance hwe;
> +	/* for qd_throttle */
> +	unsigned int nrequest;
> +	struct igt_list_head requests;
> +};
> +
>   struct ctx {
>   	uint32_t id;
>   	int priority;
> @@ -190,6 +220,10 @@ struct ctx {
>   	struct bond *bonds;
>   	bool load_balance;
>   	uint64_t sseu;
> +	/* reference to vm */
> +	struct vm *vm;
> +	/* queue for each class */
> +	struct exec_queue queues[NUM_ENGINES];
>   };
>   
>   struct workload {
> @@ -213,7 +247,10 @@ struct workload {
>   	unsigned int nr_ctxs;
>   	struct ctx *ctx_list;
>   
> -	struct working_set **working_sets; /* array indexed by set id */

Comment got lost.

> +	unsigned int nr_vms;
> +	struct vm *vm_list;
> +
> +	struct working_set **working_sets;
>   	int max_working_set_id;
>   
>   	int sync_timeline;
> @@ -223,6 +260,18 @@ struct workload {
>   	unsigned int nrequest[NUM_ENGINES];
>   };
>   
> +#define for_each_ctx(__ctx, __wrk) \
> +	for (int __i = 0; __i < (__wrk)->nr_ctxs && \
> +	     (__ctx = &(__wrk)->ctx_list[__i]); ++__i)
> +
> +#define for_each_exec_queue(__eq, __ctx) \
> +		for (int __j = 0; __j < NUM_ENGINES && ((__eq) = &((__ctx)->queues[__j])); ++__j) \
> +			for_if((__eq)->id > 0)
> +
> +#define for_each_vm(__vm, __wrk) \
> +	for (int __i = 0; __i < (__wrk)->nr_vms && \
> +	     (__vm = &(__wrk)->vm_list[__i]); ++__i)
> +
>   static unsigned int master_prng;
>   
>   static int verbose = 1;
> @@ -231,6 +280,8 @@ static struct drm_i915_gem_context_param_sseu device_sseu = {
>   	.slice_mask = -1 /* Force read on first use. */
>   };
>   
> +static bool is_xe;

Put it next to global 'int fd'.

> +
>   #define SYNCEDCLIENTS	(1<<1)
>   #define DEPSYNC		(1<<2)
>   #define FLAG_SSEU	(1<<3)
> @@ -247,7 +298,10 @@ static const char *ring_str_map[NUM_ENGINES] = {
>   
>   static void w_step_sync(struct w_step *w)
>   {
> -	gem_sync(fd, w->obj[0].handle);
> +	if (is_xe)
> +		igt_assert(syncobj_wait(fd, &w->syncs[0].handle, 1, INT64_MAX, 0, NULL));
> +	else
> +		gem_sync(fd, w->obj[0].handle);
>   }
>   
>   static int read_timestamp_frequency(int i915)
> @@ -351,15 +405,23 @@ parse_dependency(unsigned int nr_steps, struct w_step *w, char *str)
>   		if (entry.target > 0 || ((int)nr_steps + entry.target) < 0)
>   			return -1;
>   
> -		add_dep(&w->data_deps, entry);
> +		/* only fence deps in xe, let f-1 <==> -1 */
> +		if (is_xe)
> +			add_dep(&w->fence_deps, entry);
> +		else
> +			add_dep(&w->data_deps, entry);
>   
>   		break;
>   	case 's':
> -		submit_fence = true;
> +		/* no submit fence in xe ? */
> +		if (!is_xe)
> +			submit_fence = true;
>   		/* Fall-through. */
>   	case 'f':
> -		/* Multiple fences not yet supported. */
> -		igt_assert_eq(w->fence_deps.nr, 0);
> +		/* xe supports multiple fences */
> +		if (!is_xe)
> +			/* Multiple fences not yet supported. */
> +			igt_assert_eq(w->fence_deps.nr, 0);
>   
>   		entry.target = atoi(++str);
>   		if (entry.target > 0 || ((int)nr_steps + entry.target) < 0)
> @@ -469,7 +531,17 @@ static struct intel_engine_data *query_engines(void)
>   	if (engines.nengines)
>   		return &engines;
>   
> -	engines = intel_engine_list_of_physical(fd);
> +	if (is_xe) {
> +		struct drm_xe_engine_class_instance *hwe;
> +
> +		xe_for_each_hw_engine(fd, hwe) {
> +			engines.engines[engines.nengines].class = hwe->engine_class;
> +			engines.engines[engines.nengines].instance = hwe->engine_instance;
> +			engines.nengines++;
> +		}
> +	} else
> +		engines = intel_engine_list_of_physical(fd);
> +
>   	igt_assert(engines.nengines);
>   	return &engines;
>   }
> @@ -562,6 +634,40 @@ get_engine(enum intel_engine_id engine)
>   	return ci;
>   }
>   
> +static struct drm_xe_engine_class_instance
> +get_xe_engine(enum intel_engine_id engine)
> +{
> +	struct drm_xe_engine_class_instance ci;
> +
> +	switch (engine) {
> +	case DEFAULT:
> +	case RCS:
> +		ci.engine_class = DRM_XE_ENGINE_CLASS_RENDER;
> +		ci.engine_instance = 0;
> +		break;
> +	case BCS:
> +		ci.engine_class = DRM_XE_ENGINE_CLASS_COPY;
> +		ci.engine_instance = 0;
> +		break;
> +	case VCS1:
> +		ci.engine_class = DRM_XE_ENGINE_CLASS_VIDEO_DECODE;
> +		ci.engine_instance = 0;
> +		break;
> +	case VCS2:
> +		ci.engine_class = DRM_XE_ENGINE_CLASS_VIDEO_DECODE;
> +		ci.engine_instance = 1;
> +		break;
> +	case VECS:
> +		ci.engine_class = DRM_XE_ENGINE_CLASS_VIDEO_ENHANCE;
> +		ci.engine_instance = 0;
> +		break;
> +	default:
> +		igt_assert(0);
> +	};
> +
> +	return ci;
> +}
> +
>   static int parse_engine_map(struct w_step *step, const char *_str)
>   {
>   	char *token, *tctx = NULL, *tstart = (char *)_str;
> @@ -838,6 +944,13 @@ parse_workload(struct w_arg *arg, unsigned int flags, double scale_dur,
>   			} else if (!strcmp(field, "P")) {
>   				unsigned int nr = 0;
>   
> +				if (is_xe) {
> +					if (verbose > 3)
> +						printf("skipped line: %s\n", _token);

Are there no priorities at all in xe?

I'd make this a warning-level print during parsing, printed when 
verbose > 0.
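
I.e. roughly (sketch):

        if (is_xe) {
                if (verbose)
                        fprintf(stderr,
                                "Ignoring context priority ('P') on xe!\n");
                free(token);
                continue;
        }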

> +					free(token);
> +					continue;
> +				}
> +
>   				while ((field = strtok_r(fstart, ".", &fctx))) {
>   					tmp = atoi(field);
>   					check_arg(nr == 0 && tmp <= 0,
> @@ -864,6 +977,13 @@ parse_workload(struct w_arg *arg, unsigned int flags, double scale_dur,
>   			} else if (!strcmp(field, "S")) {
>   				unsigned int nr = 0;
>   
> +				if (is_xe) {
> +					if (verbose > 3)
> +						printf("skipped line: %s\n", _token);

This one is probably best failing parsing with a user-friendly message.
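
E.g. maybe just (assuming 'S' here is the SSEU step and check_arg 
works like for the other parse errors):

        check_arg(is_xe,
                  "SSEU step ('S') is not supported on xe at step %u!\n",
                  nr_steps);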

> +					free(token);
> +					continue;
> +				}
> +
>   				while ((field = strtok_r(fstart, ".", &fctx))) {
>   					tmp = atoi(field);
>   					check_arg(tmp <= 0 && nr == 0,
> @@ -977,6 +1097,12 @@ parse_workload(struct w_arg *arg, unsigned int flags, double scale_dur,
>   			} else if (!strcmp(field, "b")) {
>   				unsigned int nr = 0;
>   
> +				if (is_xe) {
> +					if (verbose > 3)
> +						printf("skipped line: %s\n", _token);

Ditto.

> +					free(token);
> +					continue;
> +				}
>   				while ((field = strtok_r(fstart, ".", &fctx))) {
>   					check_arg(nr > 2,
>   						  "Invalid bond format at step %u!\n",
> @@ -1041,7 +1167,7 @@ parse_workload(struct w_arg *arg, unsigned int flags, double scale_dur,
>   			}
>   
>   			tmp = atoi(field);
> -			check_arg(tmp < 0, "Invalid ctx id at step %u!\n",
> +			check_arg(tmp <= 0, "Invalid context id at step %u!\n",

If context id 0, e.g. '0.RCS.1000.0.0', works today, please make this a 
separate patch which adds a new restriction. If it doesn't work, then 
still make it a separate patch which fixes the validation bug.

>   				  nr_steps);
>   			step.context = tmp;
>   
> @@ -1054,7 +1180,7 @@ parse_workload(struct w_arg *arg, unsigned int flags, double scale_dur,
>   
>   			i = str_to_engine(field);
>   			check_arg(i < 0,
> -				  "Invalid engine id at step %u!\n", nr_steps);
> +				"Invalid engine id at step %u!\n", nr_steps);

Noise which breaks indentation. :)

>   
>   			valid++;
>   
> @@ -1288,6 +1414,20 @@ __get_ctx(struct workload *wrk, const struct w_step *w)
>   	return &wrk->ctx_list[w->context];
>   }
>   
> +static struct exec_queue *
> +get_eq(struct workload *wrk, const struct w_step *w)
> +{
> +	igt_assert(w->engine >= 0 && w->engine < NUM_ENGINES);
> +
> +	return &__get_ctx(wrk, w)->queues[w->engine];
> +}
> +
> +static struct vm *
> +get_vm(struct workload *wrk, const struct w_step *w)
> +{
> +	return wrk->vm_list;
> +}
> +
>   static uint32_t mmio_base(int i915, enum intel_engine_id engine, int gen)
>   {
>   	const char *name;
> @@ -1540,6 +1680,61 @@ alloc_step_batch(struct workload *wrk, struct w_step *w)
>   #endif
>   }
>   
> +static void
> +xe_alloc_step_batch(struct workload *wrk, struct w_step *w)
> +{
> +	struct vm *vm = get_vm(wrk, w);
> +	struct exec_queue *eq = get_eq(wrk, w);
> +	struct dep_entry *dep;
> +	int i;
> +
> +	w->bb_size = ALIGN(sizeof(*w->spin) + xe_cs_prefetch_size(fd),
> +			   xe_get_default_alignment(fd));
> +	w->bb_handle = xe_bo_create(fd, 0, vm->id, w->bb_size);
> +	w->spin = xe_bo_map(fd, w->bb_handle, w->bb_size);
> +	w->exec.address =
> +		intel_allocator_alloc_with_strategy(vm->ahnd, w->bb_handle, w->bb_size,
> +						    0, ALLOC_STRATEGY_LOW_TO_HIGH);
> +	xe_vm_bind_sync(fd, vm->id, w->bb_handle, 0, w->exec.address, w->bb_size);
> +	xe_spin_init_opts(w->spin, .addr = w->exec.address,
> +				   .preempt = (w->preempt_us > 0),
> +				   .ctx_ticks = duration_to_ctx_ticks(fd, eq->hwe.gt_id,
> +								1000LL * get_duration(wrk, w)));
> +	w->exec.exec_queue_id = eq->id;
> +	w->exec.num_batch_buffer = 1;
> +	/* always at least one out fence */
> +	w->exec.num_syncs = 1;
> +	/* count syncs */
> +	igt_assert_eq(0, w->data_deps.nr);
> +	for_each_dep(dep, w->fence_deps) {
> +		int dep_idx = w->idx + dep->target;
> +
> +		igt_assert(dep_idx >= 0 && dep_idx < w->idx);
> +		igt_assert(wrk->steps[dep_idx].type == SW_FENCE ||
> +			   wrk->steps[dep_idx].type == BATCH);
> +
> +		w->exec.num_syncs++;
> +	}
> +	w->syncs = calloc(w->exec.num_syncs, sizeof(*w->syncs));
> +	/* fill syncs */
> +	i = 0;
> +	/* out fence */
> +	w->syncs[i].handle = syncobj_create(fd, 0);
> +	w->syncs[i++].flags = DRM_XE_SYNC_SYNCOBJ | DRM_XE_SYNC_SIGNAL;
> +	/* in fence(s) */
> +	for_each_dep(dep, w->fence_deps) {
> +		int dep_idx = w->idx + dep->target;
> +
> +		igt_assert(wrk->steps[dep_idx].type == SW_FENCE ||
> +			   wrk->steps[dep_idx].type == BATCH);
> +		igt_assert(wrk->steps[dep_idx].syncs && wrk->steps[dep_idx].syncs[0].handle);
> +
> +		w->syncs[i].handle = wrk->steps[dep_idx].syncs[0].handle;
> +		w->syncs[i++].flags = DRM_XE_SYNC_SYNCOBJ;
> +	}
> +	w->exec.syncs = to_user_pointer(w->syncs);
> +}
> +
>   static bool set_priority(uint32_t ctx_id, int prio)
>   {
>   	struct drm_i915_gem_context_param param = {
> @@ -1766,6 +1961,61 @@ static void measure_active_set(struct workload *wrk)
>   
>   #define alloca0(sz) ({ size_t sz__ = (sz); memset(alloca(sz__), 0, sz__); })
>   
> +static void vm_create(struct vm *vm)
> +{
> +	uint32_t flags = 0;
> +
> +	if (vm->compute_mode)
> +		flags |= DRM_XE_VM_CREATE_ASYNC_BIND_OPS |
> +			 DRM_XE_VM_CREATE_COMPUTE_MODE;
> +
> +	vm->id = xe_vm_create(fd, flags, 0);
> +}
> +
> +static void exec_queue_create(struct ctx *ctx, struct exec_queue *eq)
> +{
> +	struct drm_xe_exec_queue_create create = {
> +		.vm_id = ctx->vm->id,
> +		.width = 1,
> +		.num_placements = 1,
> +		.instances = to_user_pointer(&eq->hwe),
> +	};
> +	struct drm_xe_engine_class_instance *eci = NULL;
> +
> +	if (ctx->load_balance && eq->hwe.engine_class == DRM_XE_ENGINE_CLASS_VIDEO_DECODE) {
> +		struct drm_xe_engine_class_instance *hwe;
> +		int i;
> +
> +		for (i = 0; i < ctx->engine_map_count; ++i)
> +			igt_assert(ctx->engine_map[i] == VCS || ctx->engine_map[i] == VCS1 ||
> +				   ctx->engine_map[i] == VCS2);
> +
> +		eci = calloc(16, sizeof(struct drm_xe_engine_class_instance));
> +		create.num_placements = 0;
> +		xe_for_each_hw_engine(fd, hwe) {
> +			if (hwe->engine_class != DRM_XE_ENGINE_CLASS_VIDEO_DECODE ||
> +			    hwe->gt_id != 0)
> +				continue;
> +
> +			igt_assert(create.num_placements < 16);
> +			eci[create.num_placements++] = *hwe;
> +		}
> +		igt_assert(create.num_placements);
> +		create.instances = to_user_pointer(eci);
> +
> +		if (verbose > 3)
> +			printf("num_placements=%d class=%d gt=%d\n", create.num_placements,
> +				eq->hwe.engine_class, eq->hwe.gt_id);
> +	}
> +
> +	igt_assert_eq(igt_ioctl(fd, DRM_IOCTL_XE_EXEC_QUEUE_CREATE, &create), 0);
> +
> +	if (eci)
> +		free(eci);
> +
> +	eq->id = create.exec_queue_id;
> +}
> +
>   static int prepare_contexts(unsigned int id, struct workload *wrk)
>   {
>   	uint32_t share_vm = 0;
> @@ -1796,6 +2046,84 @@ static int prepare_contexts(unsigned int id, struct workload *wrk)
>   		max_ctx = ctx;
>   	}
>   
> +	if (is_xe) {

Shouldn't the i915 and xe parts be mutually exclusive? Or does the xe 
ctx setup depend on some parts of the existing setup running first?
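
If it doesn't, maybe extract the whole xe block into its own helper and 
dispatch at the top, e.g. (xe_prepare_contexts being a hypothetical new 
helper which receives the moved block):

        static int prepare_contexts(unsigned int id, struct workload *wrk)
        {
                if (is_xe)
                        return xe_prepare_contexts(id, wrk);

                /* existing i915-only setup continues here */
        }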

> +		int engine_classes[NUM_ENGINES] = {};
> +
> +		/* shortcut, create one vm */
> +		wrk->nr_vms = 1;
> +		wrk->vm_list = calloc(wrk->nr_vms, sizeof(struct vm));
> +		wrk->vm_list->compute_mode = false;
> +		vm_create(wrk->vm_list);
> +		wrk->vm_list->ahnd =
> +			intel_allocator_open(fd, wrk->vm_list->id, INTEL_ALLOCATOR_RELOC);
> +
> +		/* create exec queues of each referenced engine class */
> +		for (j = 0; j < wrk->nr_ctxs; j++) {
> +			struct ctx *ctx = &wrk->ctx_list[j];
> +
> +			/* link with vm */
> +			ctx->vm = wrk->vm_list;
> +
> +			for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
> +				if (w->context != j)
> +					continue;
> +
> +				if (w->type == ENGINE_MAP) {
> +					ctx->engine_map = w->engine_map;
> +					ctx->engine_map_count = w->engine_map_count;
> +				} else if (w->type == LOAD_BALANCE) {
> +					if (!ctx->engine_map) {
> +						wsim_err("Load balancing needs an engine map!\n");
> +						return 1;
> +					}
> +					if (intel_gen(intel_get_drm_devid(fd)) < 11) {
> +						wsim_err("Load balancing needs relative mmio support, gen11+!\n");
> +						return 1;
> +					}
> +					ctx->load_balance = w->load_balance;
> +				}
> +			}
> +
> +			/* create exec queue for each referenced engine */
> +			for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
> +				if (w->context != j)
> +					continue;
> +
> +				if (w->type == BATCH)
> +					engine_classes[w->engine]++;
> +			}
> +
> +			for (i = 0; i < NUM_ENGINES; i++) {
> +				if (engine_classes[i]) {
> +					if (verbose > 3)
> +						printf("%u ctx[%d] eq(%s) load_balance=%d\n",
> +							id, j, ring_str_map[i], ctx->load_balance);
> +					if (i == VCS) {
> +						ctx->queues[i].hwe.engine_class =
> +							get_xe_engine(VCS1).engine_class;
> +						ctx->queues[i].hwe.engine_instance = 1;
> +					} else
> +						ctx->queues[i].hwe = get_xe_engine(i);
> +					exec_queue_create(ctx, &ctx->queues[i]);
> +					/* init request list */
> +					IGT_INIT_LIST_HEAD(&ctx->queues[i].requests);
> +					ctx->queues[i].nrequest = 0;
> +				}
> +				engine_classes[i] = 0;
> +			}
> +		}
> +
> +		/* create syncobjs for SW_FENCE */
> +		for (j = 0, i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++)
> +			if (w->type == SW_FENCE) {
> +				w->syncs = calloc(1, sizeof(struct drm_xe_sync));
> +				w->syncs[0].handle = syncobj_create(fd, 0);
> +				w->syncs[0].flags = DRM_XE_SYNC_SYNCOBJ;
> +			}
> +
> +		return 0;
> +	}
> +
>   	/*
>   	 * Transfer over engine map configuration from the workload step.
>   	 */
> @@ -2075,7 +2403,8 @@ static int prepare_workload(unsigned int id, struct workload *wrk)
>   		}
>   	}
>   
> -	prepare_working_sets(id, wrk);
> +	if (!is_xe)
> +		prepare_working_sets(id, wrk);

Let's make it error out during parsing, with a user-friendly message, 
when working sets are used.
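
E.g. something along the lines of (wherever the working set steps get 
parsed):

        check_arg(is_xe,
                  "Working sets are not supported on xe at step %u!\n",
                  nr_steps);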

>   
>   	/*
>   	 * Allocate batch buffers.
> @@ -2084,10 +2413,14 @@ static int prepare_workload(unsigned int id, struct workload *wrk)
>   		if (w->type != BATCH)
>   			continue;
>   
> -		alloc_step_batch(wrk, w);
> +		if (is_xe)
> +			xe_alloc_step_batch(wrk, w);
> +		else
> +			alloc_step_batch(wrk, w);
>   	}
>   
> -	measure_active_set(wrk);
> +	if (!is_xe)
> +		measure_active_set(wrk);
>   
>   	return 0;
>   }
> @@ -2134,6 +2467,31 @@ static void w_sync_to(struct workload *wrk, struct w_step *w, int target)
>   	w_step_sync(&wrk->steps[target]);
>   }
>   
> +static void do_xe_exec(struct workload *wrk, struct w_step *w)
> +{
> +	struct exec_queue *eq = get_eq(wrk, w);
> +
> +	igt_assert(w->emit_fence <= 0);
> +	if (w->emit_fence == -1)
> +		syncobj_reset(fd, &w->syncs[0].handle, 1);
> +
> +	/* update duration if random */
> +	if (w->duration.max != w->duration.min)
> +		xe_spin_init_opts(w->spin, .addr = w->exec.address,
> +					   .preempt = (w->preempt_us > 0),
> +					   .ctx_ticks = duration_to_ctx_ticks(fd, eq->hwe.gt_id,
> +								1000LL * get_duration(wrk, w)));
> +	xe_exec(fd, &w->exec);
> +
> +	/* for qd_throttle */
> +	if (w->rq_link.prev != NULL || w->rq_link.next != NULL) {
> +		igt_list_del(&w->rq_link);
> +		eq->nrequest--;
> +	}
> +	igt_list_add_tail(&w->rq_link, &eq->requests);
> +	eq->nrequest++;
> +}
> +
>   static void
>   do_eb(struct workload *wrk, struct w_step *w, enum intel_engine_id engine)
>   {
> @@ -2258,6 +2616,10 @@ static void *run_workload(void *data)
>   					sw_sync_timeline_create_fence(wrk->sync_timeline,
>   								      cur_seqno + w->idx);
>   				igt_assert(w->emit_fence > 0);
> +				if (is_xe)
> +					/* Convert sync file to syncobj */
> +					syncobj_import_sync_file(fd, w->syncs[0].handle,
> +								 w->emit_fence);
>   				continue;
>   			} else if (w->type == SW_FENCE_SIGNAL) {
>   				int tgt = w->idx + w->target;
> @@ -2270,6 +2632,9 @@ static void *run_workload(void *data)
>   				sw_sync_timeline_inc(wrk->sync_timeline, inc);
>   				continue;
>   			} else if (w->type == CTX_PRIORITY) {
> +				if (is_xe)
> +					continue;
> +
>   				if (w->priority != wrk->ctx_list[w->context].priority) {
>   					struct drm_i915_gem_context_param param = {
>   						.ctx_id = wrk->ctx_list[w->context].id,
> @@ -2289,7 +2654,10 @@ static void *run_workload(void *data)
>   				igt_assert(wrk->steps[t_idx].type == BATCH);
>   				igt_assert(wrk->steps[t_idx].duration.unbound_duration);
>   
> -				*wrk->steps[t_idx].bb_duration = 0xffffffff;
> +				if (is_xe)
> +					xe_spin_end(wrk->steps[t_idx].spin);
> +				else
> +					*wrk->steps[t_idx].bb_duration = 0xffffffff;
>   				__sync_synchronize();
>   				continue;
>   			} else if (w->type == SSEU) {
> @@ -2321,15 +2689,19 @@ static void *run_workload(void *data)
>   			if (throttle > 0)
>   				w_sync_to(wrk, w, i - throttle);
>   
> -			do_eb(wrk, w, engine);
> +			if (is_xe)
> +				do_xe_exec(wrk, w);
> +			else {
> +				do_eb(wrk, w, engine);
>   
> -			if (w->request != -1) {
> -				igt_list_del(&w->rq_link);
> -				wrk->nrequest[w->request]--;
> +				if (w->request != -1) {
> +					igt_list_del(&w->rq_link);
> +					wrk->nrequest[w->request]--;
> +				}
> +				w->request = engine;
> +				igt_list_add_tail(&w->rq_link, &wrk->requests[engine]);
> +				wrk->nrequest[engine]++;

Is the rq list management the same here and in do_xe_exec()? If so, 
please consolidate it into a common wrapper, which can then branch off 
into i915- and xe-specific parts.
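
Something like this perhaps (sketch, w_step_exec is just a placeholder 
name):

        static void w_step_exec(struct workload *wrk, struct w_step *w,
                                enum intel_engine_id engine)
        {
                if (is_xe) {
                        do_xe_exec(wrk, w);
                        return;
                }

                do_eb(wrk, w, engine);

                if (w->request != -1) {
                        igt_list_del(&w->rq_link);
                        wrk->nrequest[w->request]--;
                }
                w->request = engine;
                igt_list_add_tail(&w->rq_link, &wrk->requests[engine]);
                wrk->nrequest[engine]++;
        }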

>   			}
> -			w->request = engine;
> -			igt_list_add_tail(&w->rq_link, &wrk->requests[engine]);
> -			wrk->nrequest[engine]++;
>   
>   			if (!wrk->run)
>   				break;
> @@ -2338,17 +2710,32 @@ static void *run_workload(void *data)
>   				w_step_sync(w);
>   
>   			if (qd_throttle > 0) {
> -				while (wrk->nrequest[engine] > qd_throttle) {
> -					struct w_step *s;
> +				if (is_xe) {
> +					struct exec_queue *eq = get_eq(wrk, w);
> +
> +					while (eq->nrequest > qd_throttle) {
> +						struct w_step *s;
> +
> +						s = igt_list_first_entry(&eq->requests, s, rq_link);
> +
> +						w_step_sync(s);
>   
> -					s = igt_list_first_entry(&wrk->requests[engine],
> -								 s, rq_link);
> +						igt_list_del(&s->rq_link);
> +						eq->nrequest--;
> +					}
> +				} else {
> +					while (wrk->nrequest[engine] > qd_throttle) {
> +						struct w_step *s;
> +
> +						s = igt_list_first_entry(&wrk->requests[engine],
> +									s, rq_link);
>   
>   						w_step_sync(s);
>   
> -					s->request = -1;
> -					igt_list_del(&s->rq_link);
> -					wrk->nrequest[engine]--;
> +						s->request = -1;
> +						igt_list_del(&s->rq_link);
> +						wrk->nrequest[engine]--;
> +					}

Hm, okay, the throttling is very similar but not exactly the same. 
What is the conceptual difference?
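
From the looks of it i915 throttles per physical engine across the 
workload while xe throttles per exec queue, but if the only other 
deltas are which list/counter is used and the s->request reset, maybe 
it can be folded into one helper, e.g. (sketch):

        static void throttle_queue(struct igt_list_head *requests,
                                   unsigned int *nrequest, bool i915)
        {
                while (*nrequest > qd_throttle) {
                        struct w_step *s;

                        s = igt_list_first_entry(requests, s, rq_link);

                        w_step_sync(s);

                        if (i915)
                                s->request = -1;
                        igt_list_del(&s->rq_link);
                        (*nrequest)--;
                }
        }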

>   				}
>   			}
>   		}
> @@ -2365,18 +2752,51 @@ static void *run_workload(void *data)
>   		for (i = 0, w = wrk->steps; wrk->run && (i < wrk->nr_steps);
>   		     i++, w++) {
>   			if (w->emit_fence > 0) {
> -				close(w->emit_fence);
> -				w->emit_fence = -1;
> +				if (is_xe) {
> +					igt_assert(w->type == SW_FENCE);
> +					close(w->emit_fence);
> +					w->emit_fence = -1;
> +					syncobj_reset(fd, &w->syncs[0].handle, 1);
> +				} else {
> +					close(w->emit_fence);
> +					w->emit_fence = -1;
> +				}
>   			}
>   		}
>   	}
>   
> -	for (i = 0; i < NUM_ENGINES; i++) {
> -		if (!wrk->nrequest[i])
> -			continue;
> +	if (is_xe) {
> +		struct exec_queue *eq;
> +		struct ctx *ctx;
>   
> -		w = igt_list_last_entry(&wrk->requests[i], w, rq_link);
> -		w_step_sync(w);
> +		for_each_ctx(ctx, wrk)
> +			for_each_exec_queue(eq, ctx)
> +				if (eq->nrequest) {
> +					w = igt_list_last_entry(&eq->requests, w, rq_link);
> +					w_step_sync(w);
> +				}
> +
> +		for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
> +			if (w->type == BATCH) {
> +				syncobj_destroy(fd, w->syncs[0].handle);
> +				free(w->syncs);
> +				xe_vm_unbind_sync(fd, get_vm(wrk, w)->id, 0, w->exec.address,
> +						  w->bb_size);
> +				gem_munmap(w->spin, w->bb_size);
> +				gem_close(fd, w->bb_handle);
> +			} else if (w->type == SW_FENCE) {
> +				syncobj_destroy(fd, w->syncs[0].handle);
> +				free(w->syncs);
> +			}
> +		}
> +	} else {
> +		for (i = 0; i < NUM_ENGINES; i++) {
> +			if (!wrk->nrequest[i])
> +				continue;
> +
> +			w = igt_list_last_entry(&wrk->requests[i], w, rq_link);
> +			w_step_sync(w);
> +		}
>   	}
>   
>   	clock_gettime(CLOCK_MONOTONIC, &t_end);
> @@ -2398,6 +2818,23 @@ static void *run_workload(void *data)
>   
>   static void fini_workload(struct workload *wrk)
>   {
> +	if (is_xe) {
> +		struct exec_queue *eq;
> +		struct ctx *ctx;
> +		struct vm *vm;
> +
> +		for_each_ctx(ctx, wrk)
> +			for_each_exec_queue(eq, ctx) {
> +				xe_exec_queue_destroy(fd, eq->id);
> +				eq->id = 0;
> +			}
> +		for_each_vm(vm, wrk) {
> +			put_ahnd(vm->ahnd);
> +			xe_vm_destroy(fd, vm->id);
> +		}
> +		free(wrk->vm_list);
> +		wrk->nr_vms = 0;
> +	}
>   	free(wrk->steps);
>   	free(wrk);
>   }
> @@ -2605,8 +3042,12 @@ int main(int argc, char **argv)
>   		ret = igt_device_find_first_i915_discrete_card(&card);
>   		if (!ret)
>   			ret = igt_device_find_integrated_card(&card);
> +		if (!ret)
> +			ret = igt_device_find_first_xe_discrete_card(&card);
> +		if (!ret)
> +			ret = igt_device_find_xe_integrated_card(&card);
>   		if (!ret) {
> -			wsim_err("No device filter specified and no i915 devices found!\n");
> +			wsim_err("No device filter specified and no intel devices found!\n");
>   			return EXIT_FAILURE;
>   		}
>   	}
> @@ -2629,6 +3070,10 @@ int main(int argc, char **argv)
>   	if (verbose > 1)
>   		printf("Using device %s\n", drm_dev);
>   
> +	is_xe = is_xe_device(fd);
> +	if (is_xe)
> +		xe_device_get(fd);

What does this do, out of curiosity? There is no put AFAICT.
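
If it caches per-fd query data then presumably a matching put is 
needed on the exit path, something like (assuming xe_device_put() is 
the counterpart):

        if (is_xe)
                xe_device_put(fd);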

> +
>   	if (!nr_w_args) {
>   		wsim_err("No workload descriptor(s)!\n");
>   		goto err;
> diff --git a/benchmarks/wsim/README b/benchmarks/wsim/README
> index e4fd61645..f49a73989 100644
> --- a/benchmarks/wsim/README
> +++ b/benchmarks/wsim/README
> @@ -88,6 +88,10 @@ Batch durations can also be specified as infinite by using the '*' in the
>   duration field. Such batches must be ended by the terminate command ('T')
>   otherwise they will cause a GPU hang to be reported.
>   
> +Note: On Xe Batch dependencies are expressed with syncobjects,
> +so there is no difference between f-1 and -1
> +ex. 1.1000.-2.0 is same as 1.1000.f-2.0.
> +

Maybe add a "chapter" talking about the differences between i915 and xe, 
covering all the ones which may be relevant. This one in particular may 
not have any practical effect, don't know. Presumably mixing explicit 
fence creation with implicit, simulated data dependencies all works fine?

Or maybe on top of that we will end up needing two chapters to list the 
commands only available for each backend.
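
For the first option, something roughly like (to be refined as the xe 
support evolves):

  Differences between i915 and xe
  -------------------------------

  On xe batch dependencies are expressed with syncobjs, so 'f-1' and '-1'
  behave the same. Context priority ('P'), SSEU ('S'), engine bonds ('b'),
  working sets and submit fences ('s' as an input fence) are currently
  i915 only.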

>   Sync (fd) fences
>   ----------------
>   
> @@ -131,7 +135,7 @@ runnable. When the second RCS batch completes the standalone fence is signaled
>   which allows the two VCS batches to be executed. Finally we wait until the both
>   VCS batches have completed before starting the (optional) next iteration.
>   
> -Submit fences
> +Submit fences (i915 only?)

s/?//

>   -------------
>   
>   Submit fences are a type of input fence which are signalled when the originating

Regards,

Tvrtko

