[igt-dev] [PATCH i-g-t 17/17] benchmarks/gem_wsim: added basic xe support
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Fri Sep 29 10:45:26 UTC 2023
On 28/09/2023 18:45, Marcin Bernatowicz wrote:
> Added basic xe support. Single binary handles both i915 and Xe devices.
>
> Some functionality is still missing: working sets, bonding.
>
> The tool is handy for scheduling tests, we find it useful to verify vGPU
> profiles defining different execution quantum/preemption timeout
> settings.
>
> There is also some rationale for the tool in following thread:
> https://lore.kernel.org/dri-devel/a443495f-5d1b-52e1-9b2f-80167deb6d57@linux.intel.com/
>
> With this patch it should be possible to run following on xe device:
>
> gem_wsim -w benchmarks/wsim/media_load_balance_fhd26u7.wsim -c 36 -r 600
>
> Best with drm debug logs disabled:
>
> echo 0 > /sys/module/drm/parameters/debug
>
> v2: minimizing divergence - same workload syntax for both drivers,
> so most existing examples should run on xe unmodified (Tvrtko)
> This version creates one common VM per workload.
> Explicit VM management, compute mode will come in next patchset.
>
> v3:
> - use calloc in parse_workload for struct workload,
> to allow cleanups in fini_workload
> - grouped xe specific fields (Tvrtko)
> - moved is_xe boolean next to fd (Tvrtko)
> - use xe_ prefix for Xe specific things (Tvrtko)
> - left throttling untouched (Tvrtko)
> - parse errors vs silent skips on not implemented steps (Tvrtko)
> - need to think on better engine handling in next version
> - add 'Xe and i915 differences' section to README (Tvrtko)
> for now no data dependency implemented, left -1 <=> f-1
> to not modify examples (maybe too optimistic assumption?)
>
> Signed-off-by: Marcin Bernatowicz <marcin.bernatowicz at linux.intel.com>
> ---
> benchmarks/gem_wsim.c | 430 +++++++++++++++++++++++++++++++++++++++--
> benchmarks/wsim/README | 15 +-
> 2 files changed, 432 insertions(+), 13 deletions(-)
>
> diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c
> index d447dced4..68c3b2cb0 100644
> --- a/benchmarks/gem_wsim.c
> +++ b/benchmarks/gem_wsim.c
> @@ -62,6 +62,12 @@
> #include "i915/gem_engine_topology.h"
> #include "i915/gem_mman.h"
>
> +#include "igt_syncobj.h"
> +#include "intel_allocator.h"
> +#include "xe_drm.h"
> +#include "xe/xe_ioctl.h"
> +#include "xe/xe_spin.h"
> +
> enum intel_engine_id {
> DEFAULT,
> RCS,
> @@ -185,11 +191,31 @@ struct w_step {
> struct drm_i915_gem_relocation_entry reloc[3];
> uint32_t *bb_duration;
> } i915;
> + struct {
> + struct drm_xe_exec exec;
> + struct {
> + struct xe_spin spin;
> + uint64_t vm_sync;
> + uint64_t exec_sync;
> + } *data;
> + struct drm_xe_sync *syncs;
> + } xe;
> };
> uint32_t bb_handle;
> size_t bb_size;
> };
>
> +struct xe_vm {
> + uint32_t id;
> + bool compute_mode;
> + uint64_t ahnd;
> +};
> +
> +struct xe_exec_queue {
> + uint32_t id;
> + struct drm_xe_engine_class_instance hwe;
> +};
> +
> struct ctx {
> uint32_t id;
> int priority;
> @@ -199,6 +225,12 @@ struct ctx {
> struct bond *bonds;
> bool load_balance;
> uint64_t sseu;
> + struct {
> + /* reference to vm */
> + struct xe_vm *vm;
> + /* queue for each class */
> + struct xe_exec_queue queues[NUM_ENGINES];
> + } xe;
> };
>
> struct workload {
> @@ -222,6 +254,11 @@ struct workload {
> unsigned int nr_ctxs;
> struct ctx *ctx_list;
>
> + struct {
> + unsigned int nr_vms;
> + struct xe_vm *vm_list;
> + } xe;
> +
> struct working_set **working_sets; /* array indexed by set id */
> int max_working_set_id;
>
> @@ -232,10 +269,24 @@ struct workload {
> unsigned int nrequest[NUM_ENGINES];
> };
>
> +#define for_each_ctx(__ctx, __wrk) \
> + for (int __i = 0; __i < (__wrk)->nr_ctxs && \
> + (__ctx = &(__wrk)->ctx_list[__i]); ++__i)
This one looks you could also extract and make use of ahead of xe support.
> +
> +#define for_each_xe_exec_queue(__eq, __ctx) \
> + for (int __j = 0; __j < NUM_ENGINES && \
> + ((__eq) = &((__ctx)->xe.queues[__j])); ++__j) \
> + for_if((__eq)->id > 0)
> +
> +#define for_each_xe_vm(__vm, __wrk) \
> + for (int __i = 0; __i < (__wrk)->xe.nr_vms && \
> + (__vm = &(__wrk)->xe.vm_list[__i]); ++__i)
> +
> static unsigned int master_prng;
>
> static int verbose = 1;
> static int fd;
> +static bool is_xe;
> static struct drm_i915_gem_context_param_sseu device_sseu = {
> .slice_mask = -1 /* Force read on first use. */
> };
> @@ -256,7 +307,10 @@ static const char *ring_str_map[NUM_ENGINES] = {
>
> static void w_step_sync(struct w_step *w)
> {
> - gem_sync(fd, w->i915.obj[0].handle);
> + if (is_xe)
> + igt_assert(syncobj_wait(fd, &w->xe.syncs[0].handle, 1, INT64_MAX, 0, NULL));
> + else
> + gem_sync(fd, w->i915.obj[0].handle);
> }
>
> static int read_timestamp_frequency(int i915)
> @@ -360,15 +414,23 @@ parse_dependency(unsigned int nr_steps, struct w_step *w, char *str)
> if (entry.target > 0 || ((int)nr_steps + entry.target) < 0)
> return -1;
>
> - add_dep(&w->data_deps, entry);
> + /* only fence deps in xe, let f-1 <==> -1 */
> + if (is_xe)
> + add_dep(&w->fence_deps, entry);
> + else
> + add_dep(&w->data_deps, entry);
>
> break;
> case 's':
> - submit_fence = true;
> + /* no submit fence in xe ? */
> + if (!is_xe)
> + submit_fence = true;
I think best to make parsing fail with an user facing message like
"Submit fences are not supported with xe" or so.
> /* Fall-through. */
> case 'f':
> - /* Multiple fences not yet supported. */
> - igt_assert_eq(w->fence_deps.nr, 0);
> + /* xe supports multiple fences */
> + if (!is_xe)
> + /* Multiple fences not yet supported. */
> + igt_assert_eq(w->fence_deps.nr, 0);
>
> entry.target = atoi(++str);
> if (entry.target > 0 || ((int)nr_steps + entry.target) < 0)
> @@ -478,7 +540,17 @@ static struct intel_engine_data *query_engines(void)
> if (engines.nengines)
> return &engines;
>
> - engines = intel_engine_list_of_physical(fd);
> + if (is_xe) {
> + struct drm_xe_engine_class_instance *hwe;
> +
> + xe_for_each_hw_engine(fd, hwe) {
> + engines.engines[engines.nengines].class = hwe->engine_class;
> + engines.engines[engines.nengines].instance = hwe->engine_instance;
> + engines.nengines++;
> + }
> + } else
> + engines = intel_engine_list_of_physical(fd);
> +
> igt_assert(engines.nengines);
> return &engines;
> }
> @@ -571,6 +643,40 @@ get_engine(enum intel_engine_id engine)
> return ci;
> }
>
> +static struct drm_xe_engine_class_instance
> +xe_get_engine(enum intel_engine_id engine)
> +{
> + struct drm_xe_engine_class_instance ci = {};
> +
> + switch (engine) {
> + case DEFAULT:
> + case RCS:
> + ci.engine_class = DRM_XE_ENGINE_CLASS_RENDER;
> + ci.engine_instance = 0;
> + break;
Workload such as:
M.1.VCS
1.DEFAULT.1000.0.0
Keep working and submit to VCS queue with xe?
> + case BCS:
> + ci.engine_class = DRM_XE_ENGINE_CLASS_COPY;
> + ci.engine_instance = 0;
> + break;
> + case VCS1:
> + ci.engine_class = DRM_XE_ENGINE_CLASS_VIDEO_DECODE;
> + ci.engine_instance = 0;
> + break;
> + case VCS2:
> + ci.engine_class = DRM_XE_ENGINE_CLASS_VIDEO_DECODE;
> + ci.engine_instance = 1;
> + break;
> + case VECS:
> + ci.engine_class = DRM_XE_ENGINE_CLASS_VIDEO_ENHANCE;
> + ci.engine_instance = 0;
> + break;
> + default:
> + igt_assert(0);
> + };
> +
> + return ci;
> +}
> +
> static int parse_engine_map(struct w_step *step, const char *_str)
> {
> char *token, *tctx = NULL, *tstart = (char *)_str;
> @@ -845,6 +951,12 @@ parse_workload(struct w_arg *arg, unsigned int flags, double scale_dur,
> } else if (!strcmp(field, "P")) {
> unsigned int nr = 0;
>
> + if (is_xe) {
> + wsim_err("Priority step is not implemented yet :(\n");
Clarify error message by adding "with xe" or something please. Here and
the bunch below.
> + free(token);
> + return NULL;
> + }
> +
> while ((field = strtok_r(fstart, ".", &fctx))) {
> tmp = atoi(field);
> check_arg(nr == 0 && tmp <= 0,
> @@ -871,6 +983,12 @@ parse_workload(struct w_arg *arg, unsigned int flags, double scale_dur,
> } else if (!strcmp(field, "S")) {
> unsigned int nr = 0;
>
> + if (is_xe) {
> + wsim_err("SSEU step is not implemented yet :(\n");
> + free(token);
> + return NULL;
> + }
> +
> while ((field = strtok_r(fstart, ".", &fctx))) {
> tmp = atoi(field);
> check_arg(tmp <= 0 && nr == 0,
> @@ -984,6 +1102,12 @@ parse_workload(struct w_arg *arg, unsigned int flags, double scale_dur,
> } else if (!strcmp(field, "b")) {
> unsigned int nr = 0;
>
> + if (is_xe) {
> + wsim_err("Bonding is not implemented yet :(\n");
> + free(token);
> + return NULL;
> + }
> +
> while ((field = strtok_r(fstart, ".", &fctx))) {
> check_arg(nr > 2,
> "Invalid bond format at step %u!\n",
> @@ -1018,6 +1142,12 @@ parse_workload(struct w_arg *arg, unsigned int flags, double scale_dur,
> } else if (!strcmp(field, "w") || !strcmp(field, "W")) {
> unsigned int nr = 0;
>
> + if (is_xe) {
> + wsim_err("Working sets are not implemented yet :(\n");
> + free(token);
> + return NULL;
> + }
> +
> step.working_set.shared = field[0] == 'W';
>
> while ((field = strtok_r(fstart, ".", &fctx))) {
> @@ -1136,7 +1266,7 @@ add_step:
> nr_steps += app_w->nr_steps;
> }
>
> - wrk = malloc(sizeof(*wrk));
> + wrk = calloc(1, sizeof(*wrk));
> igt_assert(wrk);
>
> wrk->nr_steps = nr_steps;
> @@ -1297,6 +1427,20 @@ __get_ctx(struct workload *wrk, const struct w_step *w)
> return &wrk->ctx_list[w->context];
> }
>
> +static struct xe_exec_queue *
> +xe_get_eq(struct workload *wrk, const struct w_step *w)
> +{
> + igt_assert(w->engine >= 0 && w->engine < NUM_ENGINES);
> +
> + return &__get_ctx(wrk, w)->xe.queues[w->engine];
> +}
> +
> +static struct xe_vm *
> +xe_get_vm(struct workload *wrk, const struct w_step *w)
> +{
> + return wrk->xe.vm_list;
> +}
> +
> static uint32_t mmio_base(int i915, enum intel_engine_id engine, int gen)
> {
> const char *name;
> @@ -1549,6 +1693,62 @@ alloc_step_batch(struct workload *wrk, struct w_step *w)
> #endif
> }
>
> +static void
> +xe_alloc_step_batch(struct workload *wrk, struct w_step *w)
> +{
> + struct xe_vm *vm = xe_get_vm(wrk, w);
> + struct xe_exec_queue *eq = xe_get_eq(wrk, w);
> + struct dep_entry *dep;
> + int i;
> +
> + w->bb_size = ALIGN(sizeof(*w->xe.data) + xe_cs_prefetch_size(fd),
> + xe_get_default_alignment(fd));
> + w->bb_handle = xe_bo_create_flags(fd, vm->id, w->bb_size,
> + visible_vram_if_possible(fd, eq->hwe.gt_id));
> + w->xe.data = xe_bo_map(fd, w->bb_handle, w->bb_size);
> + w->xe.exec.address =
> + intel_allocator_alloc_with_strategy(vm->ahnd, w->bb_handle, w->bb_size,
> + 0, ALLOC_STRATEGY_LOW_TO_HIGH);
> + xe_vm_bind_sync(fd, vm->id, w->bb_handle, 0, w->xe.exec.address, w->bb_size);
> + xe_spin_init_opts(&w->xe.data->spin, .addr = w->xe.exec.address,
> + .preempt = (w->preempt_us > 0),
> + .ctx_ticks = duration_to_ctx_ticks(fd, eq->hwe.gt_id,
> + 1000LL * get_duration(wrk, w)));
> + w->xe.exec.exec_queue_id = eq->id;
> + w->xe.exec.num_batch_buffer = 1;
> + /* always at least one out fence */
> + w->xe.exec.num_syncs = 1;
> + /* count syncs */
> + igt_assert_eq(0, w->data_deps.nr);
> + for_each_dep(dep, w->fence_deps) {
> + int dep_idx = w->idx + dep->target;
> +
> + igt_assert(dep_idx >= 0 && dep_idx < w->idx);
> + igt_assert(wrk->steps[dep_idx].type == SW_FENCE ||
> + wrk->steps[dep_idx].type == BATCH);
> +
> + w->xe.exec.num_syncs++;
> + }
> + w->xe.syncs = calloc(w->xe.exec.num_syncs, sizeof(*w->xe.syncs));
> + /* fill syncs */
> + i = 0;
> + /* out fence */
> + w->xe.syncs[i].handle = syncobj_create(fd, 0);
> + w->xe.syncs[i++].flags = DRM_XE_SYNC_SYNCOBJ | DRM_XE_SYNC_SIGNAL;
> + /* in fence(s) */
> + for_each_dep(dep, w->fence_deps) {
> + int dep_idx = w->idx + dep->target;
> +
> + igt_assert(wrk->steps[dep_idx].type == SW_FENCE ||
> + wrk->steps[dep_idx].type == BATCH);
> + igt_assert(wrk->steps[dep_idx].xe.syncs && wrk->steps[dep_idx].xe.syncs[0].handle);
> +
> + w->xe.syncs[i].handle = wrk->steps[dep_idx].xe.syncs[0].handle;
> + w->xe.syncs[i++].flags = DRM_XE_SYNC_SYNCOBJ;
> + }
> + w->xe.exec.syncs = to_user_pointer(w->xe.syncs);
> +}
> +
> static bool set_priority(uint32_t ctx_id, int prio)
> {
> struct drm_i915_gem_context_param param = {
> @@ -1774,6 +1974,61 @@ static void measure_active_set(struct workload *wrk)
>
> #define alloca0(sz) ({ size_t sz__ = (sz); memset(alloca(sz__), 0, sz__); })
>
> +static void xe_vm_create_(struct xe_vm *vm)
> +{
> + uint32_t flags = 0;
> +
> + if (vm->compute_mode)
> + flags |= DRM_XE_VM_CREATE_ASYNC_BIND_OPS |
> + DRM_XE_VM_CREATE_COMPUTE_MODE;
> +
> + vm->id = xe_vm_create(fd, flags, 0);
> +}
> +
> +static void xe_exec_queue_create_(struct ctx *ctx, struct xe_exec_queue *eq)
> +{
> + struct drm_xe_exec_queue_create create = {
> + .vm_id = ctx->xe.vm->id,
> + .width = 1,
> + .num_placements = 1,
> + .instances = to_user_pointer(&eq->hwe),
> + };
> + struct drm_xe_engine_class_instance *eci = NULL;
> +
> + if (ctx->load_balance && eq->hwe.engine_class == DRM_XE_ENGINE_CLASS_VIDEO_DECODE) {
> + struct drm_xe_engine_class_instance *hwe;
> + int i;
> +
> + for (i = 0; i < ctx->engine_map_count; ++i)
> + igt_assert(ctx->engine_map[i] == VCS || ctx->engine_map[i] == VCS1 ||
> + ctx->engine_map[i] == VCS2);
ctx->engine_map should not contain VCS at this point, AFAICT the meta
"all engines" token is expanded to individual instances during parsing
in parse_engine_map().
So adjust this assert and double-check it really is as I say please.
> +
> + eci = calloc(16, sizeof(struct drm_xe_engine_class_instance));
> + create.num_placements = 0;
> + xe_for_each_hw_engine(fd, hwe) {
> + if (hwe->engine_class != DRM_XE_ENGINE_CLASS_VIDEO_DECODE ||
> + hwe->gt_id != 0)
> + continue;
1)
Why the gt_id check - does it work on Meteorlake like that?
2)
You seem to be adding all vcs engines irrespective of the configure
engine map? Make sure that:
M.1.VCS - maps to all vcs instances
M.2.VCS1|VCS2 - maps to vcs0 & vcs1
M.3.VCS1 - maps to vcs0 only
M.3.VCS2 - maps to vcs1 only
> +
> + igt_assert(create.num_placements < 16);
> + eci[create.num_placements++] = *hwe;
> + }
> + igt_assert(create.num_placements);
> + create.instances = to_user_pointer(eci);
> +
> + if (verbose > 3)
> + printf("num_placements=%d class=%d gt=%d\n", create.num_placements,
> + eq->hwe.engine_class, eq->hwe.gt_id);
> + }
> +
> + igt_assert_eq(igt_ioctl(fd, DRM_IOCTL_XE_EXEC_QUEUE_CREATE, &create), 0);
> +
> + if (eci)
> + free(eci);
> +
> + eq->id = create.exec_queue_id;
> +}
> +
> static void allocate_contexts(unsigned int id, struct workload *wrk)
> {
> int max_ctx = -1;
> @@ -1978,6 +2233,77 @@ static int prepare_contexts(unsigned int id, struct workload *wrk)
> return 0;
> }
>
> +static int xe_prepare_vms_contexts_syncobjs(unsigned int id, struct workload *wrk)
> +{
> + int engine_classes[NUM_ENGINES] = {};
> + struct w_step *w;
> + int i, j;
> +
> + /* shortcut, create one vm */
> + wrk->xe.nr_vms = 1;
> + wrk->xe.vm_list = calloc(wrk->xe.nr_vms, sizeof(struct xe_vm));
> + wrk->xe.vm_list->compute_mode = false;
> + xe_vm_create_(wrk->xe.vm_list);
> + wrk->xe.vm_list->ahnd = intel_allocator_open(fd, wrk->xe.vm_list->id, INTEL_ALLOCATOR_RELOC);
> +
> + /* create exec queues of each referenced engine class */
> + for (j = 0; j < wrk->nr_ctxs; j++) {
> + struct ctx *ctx = &wrk->ctx_list[j];
> + /* link with vm */
> + ctx->xe.vm = wrk->xe.vm_list;
> + for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
> + if (w->context != j)
> + continue;
> + if (w->type == ENGINE_MAP) {
> + ctx->engine_map = w->engine_map;
> + ctx->engine_map_count = w->engine_map_count;
> + } else if (w->type == LOAD_BALANCE) {
> + if (!ctx->engine_map) {
> + wsim_err("Load balancing needs an engine map!\n");
> + return 1;
> + }
> + if (intel_gen(intel_get_drm_devid(fd)) < 11) {
> + wsim_err("Load balancing needs relative mmio support, gen11+!\n");
> + return 1;
> + }
> + ctx->load_balance = w->load_balance;
> + }
> + }
> + /* create exec queue for each referenced engine */
> + for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
> + if (w->context != j)
> + continue;
> + if (w->type == BATCH)
> + engine_classes[w->engine]++;
> + }
> + for (i = 0; i < NUM_ENGINES; i++) {
> + if (engine_classes[i]) {
> + if (verbose > 3)
> + printf("%u ctx[%d] eq(%s) load_balance=%d\n",
> + id, j, ring_str_map[i], ctx->load_balance);
> + if (i == VCS) {
> + ctx->xe.queues[i].hwe.engine_class =
> + xe_get_engine(VCS1).engine_class;
> + ctx->xe.queues[i].hwe.engine_instance = 1;
> + } else
> + ctx->xe.queues[i].hwe = xe_get_engine(i);
> + xe_exec_queue_create_(ctx, &ctx->xe.queues[i]);
> + }
> + engine_classes[i] = 0;
> + }
> + }
> +
> + /* create syncobjs for SW_FENCE */
> + for (j = 0, i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++)
> + if (w->type == SW_FENCE) {
> + w->xe.syncs = calloc(1, sizeof(struct drm_xe_sync));
> + w->xe.syncs[0].handle = syncobj_create(fd, 0);
> + w->xe.syncs[0].flags = DRM_XE_SYNC_SYNCOBJ;
> + }
> +
> + return 0;
> +}
> +
> static void prepare_working_sets(unsigned int id, struct workload *wrk)
> {
> struct working_set **sets;
> @@ -2047,7 +2373,11 @@ static int prepare_workload(unsigned int id, struct workload *wrk)
>
> allocate_contexts(id, wrk);
>
> - ret = prepare_contexts(id, wrk);
> + if (is_xe)
> + ret = xe_prepare_vms_contexts_syncobjs(id, wrk);
> + else
> + ret = prepare_contexts(id, wrk);
Should this then be more obviously named like:
if (is_xe)
ret = xe_prepare_contexts(..);
else
ret = i915_prepare_contexts(..);
?
i915_ prefix optional I guess for less churn, since it would otherwise
apply to alloc_step_batch too. Whatever you prefer.
> +
> if (ret)
> return ret;
>
> @@ -2101,7 +2431,10 @@ static int prepare_workload(unsigned int id, struct workload *wrk)
> if (w->type != BATCH)
> continue;
>
> - alloc_step_batch(wrk, w);
> + if (is_xe)
> + xe_alloc_step_batch(wrk, w);
> + else
> + alloc_step_batch(wrk, w);
> }
>
> measure_active_set(wrk);
> @@ -2151,6 +2484,23 @@ static void w_sync_to(struct workload *wrk, struct w_step *w, int target)
> w_step_sync(&wrk->steps[target]);
> }
>
> +static void do_xe_exec(struct workload *wrk, struct w_step *w)
> +{
> + struct xe_exec_queue *eq = xe_get_eq(wrk, w);
> +
> + igt_assert(w->emit_fence <= 0);
> + if (w->emit_fence == -1)
> + syncobj_reset(fd, &w->xe.syncs[0].handle, 1);
> +
> + /* update duration if random */
> + if (w->duration.max != w->duration.min)
> + xe_spin_init_opts(&w->xe.data->spin, .addr = w->xe.exec.address,
> + .preempt = (w->preempt_us > 0),
> + .ctx_ticks = duration_to_ctx_ticks(fd, eq->hwe.gt_id,
> + 1000LL * get_duration(wrk, w)));
> + xe_exec(fd, &w->xe.exec);
> +}
> +
> static void
> do_eb(struct workload *wrk, struct w_step *w, enum intel_engine_id engine)
> {
> @@ -2276,6 +2626,10 @@ static void *run_workload(void *data)
> sw_sync_timeline_create_fence(wrk->sync_timeline,
> cur_seqno + w->idx);
> igt_assert(w->emit_fence > 0);
> + if (is_xe)
> + /* Convert sync file to syncobj */
> + syncobj_import_sync_file(fd, w->xe.syncs[0].handle,
> + w->emit_fence);
> continue;
> } else if (w->type == SW_FENCE_SIGNAL) {
> int tgt = w->idx + w->target;
> @@ -2307,7 +2661,10 @@ static void *run_workload(void *data)
> igt_assert(wrk->steps[t_idx].type == BATCH);
> igt_assert(wrk->steps[t_idx].duration.unbound);
>
> - *wrk->steps[t_idx].i915.bb_duration = 0xffffffff;
> + if (is_xe)
> + xe_spin_end(&wrk->steps[t_idx].xe.data->spin);
> + else
> + *wrk->steps[t_idx].i915.bb_duration = 0xffffffff;
> __sync_synchronize();
> continue;
> } else if (w->type == SSEU) {
> @@ -2339,7 +2696,10 @@ static void *run_workload(void *data)
> if (throttle > 0)
> w_sync_to(wrk, w, i - throttle);
>
> - do_eb(wrk, w, engine);
> + if (is_xe)
> + do_xe_exec(wrk, w);
> + else
> + do_eb(wrk, w, engine);
>
> if (w->request != -1) {
> igt_list_del(&w->rq_link);
> @@ -2383,6 +2743,10 @@ static void *run_workload(void *data)
> for (i = 0, w = wrk->steps; wrk->run && (i < wrk->nr_steps);
> i++, w++) {
> if (w->emit_fence > 0) {
> + if (is_xe) {
> + igt_assert(w->type == SW_FENCE);
> + syncobj_reset(fd, &w->xe.syncs[0].handle, 1);
> + }
> close(w->emit_fence);
> w->emit_fence = -1;
> }
> @@ -2397,6 +2761,23 @@ static void *run_workload(void *data)
> w_step_sync(w);
> }
>
> + if (is_xe) {
> + for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
> + if (w->type == BATCH) {
> + w_step_sync(w);
> + syncobj_destroy(fd, w->xe.syncs[0].handle);
> + free(w->xe.syncs);
> + xe_vm_unbind_sync(fd, xe_get_vm(wrk, w)->id, 0, w->xe.exec.address,
> + w->bb_size);
> + gem_munmap(w->xe.data, w->bb_size);
> + gem_close(fd, w->bb_handle);
> + } else if (w->type == SW_FENCE) {
> + syncobj_destroy(fd, w->xe.syncs[0].handle);
> + free(w->xe.syncs);
> + }
> + }
> + }
> +
> clock_gettime(CLOCK_MONOTONIC, &t_end);
>
> if (wrk->print_stats) {
> @@ -2416,6 +2797,23 @@ static void *run_workload(void *data)
>
> static void fini_workload(struct workload *wrk)
> {
> + if (is_xe) {
> + struct xe_exec_queue *eq;
> + struct xe_vm *vm;
> + struct ctx *ctx;
> +
> + for_each_ctx(ctx, wrk)
> + for_each_xe_exec_queue(eq, ctx) {
> + xe_exec_queue_destroy(fd, eq->id);
> + eq->id = 0;
> + }
> + for_each_xe_vm(vm, wrk) {
> + put_ahnd(vm->ahnd);
> + xe_vm_destroy(fd, vm->id);
> + }
> + free(wrk->xe.vm_list);
> + wrk->xe.nr_vms = 0;
> + }
Does it really matter since device fd is getting closed anyway? But
apart from making the patch bigger it doesn't harm anything so up to you.
> free(wrk->steps);
> free(wrk);
> }
> @@ -2623,8 +3021,12 @@ int main(int argc, char **argv)
> ret = igt_device_find_first_i915_discrete_card(&card);
> if (!ret)
> ret = igt_device_find_integrated_card(&card);
> + if (!ret)
> + ret = igt_device_find_first_xe_discrete_card(&card);
> + if (!ret)
> + ret = igt_device_find_xe_integrated_card(&card);
> if (!ret) {
> - wsim_err("No device filter specified and no i915 devices found!\n");
> + wsim_err("No device filter specified and no intel devices found!\n");
> return EXIT_FAILURE;
> }
> }
> @@ -2647,6 +3049,10 @@ int main(int argc, char **argv)
> if (verbose > 1)
> printf("Using device %s\n", drm_dev);
>
> + is_xe = is_xe_device(fd);
> + if (is_xe)
> + xe_device_get(fd);
Still no put, but meh on that and meh on the get/put naming to begin
with. :)
> +
> if (!nr_w_args) {
> wsim_err("No workload descriptor(s)!\n");
> goto err;
> diff --git a/benchmarks/wsim/README b/benchmarks/wsim/README
> index d4f3dd8ae..38f305ba0 100644
> --- a/benchmarks/wsim/README
> +++ b/benchmarks/wsim/README
> @@ -88,6 +88,19 @@ Batch durations can also be specified as infinite by using the '*' in the
> duration field. Such batches must be ended by the terminate command ('T')
> otherwise they will cause a GPU hang to be reported.
>
> +Xe and i915 differences
> +------------------------
> +
> +There are differences between Xe and i915, like not allowing a BO list to
> +be passed to an exec (and create implicit syncs). For more details see:
> +https://gitlab.freedesktop.org/drm/xe/kernel/-/blob/drm-xe-next/drivers/gpu/drm/xe/xe_exec.c
> +
> +Currently following batch steps are equal on Xe:
> +1.1000.-2.0 <==> 1.1000.f-2.0
> +and will create explicit sync fence dependency (via syncobjects).
I don't remember if I asked if possible misuse combinations of implicit
fence creation and data deps keep working. Like:
f
1.DEFAULT.1000.-1.0 # should fail
f
1.DEFAULT.1000.f-1.0
1.DEFAULT.1000.f-1.0 # should fail, but without 'f' should work
At least I think this is what should happen. Basically we want to make
sure existing workloads work but perversions like above should be
detected as something the auto-magic i915<->xe compatibility translation
does not want to start allowing.
> +
> +The data dependency need to wait for working sets implementation.
For explicit working set dependencies or also for implicits?
> +
> Sync (fd) fences
> ----------------
>
> @@ -131,7 +144,7 @@ runnable. When the second RCS batch completes the standalone fence is signaled
> which allows the two VCS batches to be executed. Finally we wait until the both
> VCS batches have completed before starting the (optional) next iteration.
>
> -Submit fences
> +Submit fences (i915 only)
Mark priorities, SSEU, working sets and bonds as well please.
> -------------
>
> Submit fences are a type of input fence which are signalled when the originating
Overall - not too much code to add xe support, nice!
Regards,
Tvrtko
More information about the igt-dev
mailing list