[PATCH v3 i-g-t 1/3] tests/intel/xe_sriov_scheduling: Keep K submissions in flight
Laguna, Lukasz
lukasz.laguna at intel.com
Mon Aug 25 09:15:51 UTC 2025
On 8/25/2025 10:22, Marcin Bernatowicz wrote:
> Refactor submission to a prefill->wait->refill pipeline so each VF can
> keep K jobs in flight. Introduce per-slot resources (addr/bo/spin/
> out-fence) and submit per slot.
>
> This patch sets K=1, preserving current behavior; follow-ups will pick
> a higher/default K and add CLI control. With K > 1, the pipeline improves
> HW saturation and is less sensitive to CPU scheduling hiccups, especially
> for short jobs.
>
> v2: drop redundant num_syncs init; simplify subm_exec_slot (Lukasz)
>
> Signed-off-by: Marcin Bernatowicz <marcin.bernatowicz at linux.intel.com>
> Cc: Adam Miszczak <adam.miszczak at linux.intel.com>
> Cc: Jakub Kolakowski <jakub1.kolakowski at intel.com>
> Cc: Kamil Konieczny <kamil.konieczny at linux.intel.com>
> Cc: Lukasz Laguna <lukasz.laguna at intel.com>
Reviewed-by: Lukasz Laguna <lukasz.laguna at intel.com>
> Cc: Satyanarayana K V P <satyanarayana.k.v.p at intel.com>
> ---
> tests/intel/xe_sriov_scheduling.c | 124 +++++++++++++++++++++---------
> 1 file changed, 87 insertions(+), 37 deletions(-)
>
> diff --git a/tests/intel/xe_sriov_scheduling.c b/tests/intel/xe_sriov_scheduling.c
> index d69315690..df93eaaca 100644
> --- a/tests/intel/xe_sriov_scheduling.c
> +++ b/tests/intel/xe_sriov_scheduling.c
> @@ -51,13 +51,16 @@ struct subm {
> int vf_num;
> struct subm_work_desc work;
> uint32_t expected_ticks;
> - uint64_t addr;
> uint32_t vm;
> struct drm_xe_engine_class_instance hwe;
> uint32_t exec_queue_id;
> - uint32_t bo;
> + /* K slots (K BOs / addresses / mapped spinners / done fences) */
> + unsigned int slots;
> + uint64_t *addr;
> + uint32_t *bo;
> size_t bo_size;
> - struct xe_spin *spin;
> + struct xe_spin **spin;
> + uint32_t *done_fence;
> struct drm_xe_sync sync[1];
> struct drm_xe_exec exec;
> };
> @@ -78,43 +81,61 @@ struct subm_set {
> };
>
> static void subm_init(struct subm *s, int fd, int vf_num, uint64_t addr,
> - struct drm_xe_engine_class_instance hwe)
> + struct drm_xe_engine_class_instance hwe,
> + unsigned int inflight)
> {
> + uint64_t base, stride;
> +
> memset(s, 0, sizeof(*s));
> s->fd = fd;
> s->vf_num = vf_num;
> s->hwe = hwe;
> snprintf(s->id, sizeof(s->id), "VF%d %d:%d:%d", vf_num,
> hwe.engine_class, hwe.engine_instance, hwe.gt_id);
> - s->addr = addr ? addr : 0x1a0000;
> + s->slots = inflight ? inflight : 1;
> s->vm = xe_vm_create(s->fd, 0, 0);
> s->exec_queue_id = xe_exec_queue_create(s->fd, s->vm, &s->hwe, 0);
> s->bo_size = ALIGN(sizeof(struct xe_spin) + xe_cs_prefetch_size(s->fd),
> xe_get_default_alignment(s->fd));
> - s->bo = xe_bo_create(s->fd, s->vm, s->bo_size,
> - vram_if_possible(fd, s->hwe.gt_id),
> - DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> - s->spin = xe_bo_map(s->fd, s->bo, s->bo_size);
> - xe_vm_bind_sync(s->fd, s->vm, s->bo, 0, s->addr, s->bo_size);
> - /* out fence */
> - s->sync[0].type = DRM_XE_SYNC_TYPE_SYNCOBJ;
> - s->sync[0].flags = DRM_XE_SYNC_FLAG_SIGNAL;
> - s->sync[0].handle = syncobj_create(s->fd, 0);
> - s->exec.num_syncs = 1;
> - s->exec.syncs = to_user_pointer(&s->sync[0]);
> + s->addr = calloc(s->slots, sizeof(*s->addr));
> + s->bo = calloc(s->slots, sizeof(*s->bo));
> + s->spin = calloc(s->slots, sizeof(*s->spin));
> + s->done_fence = calloc(s->slots, sizeof(*s->done_fence));
> +
> + igt_assert(s->addr && s->bo && s->spin && s->done_fence);
> +
> + base = addr ? addr : 0x1a0000;
> + stride = ALIGN(s->bo_size, 0x10000);
> + for (unsigned int i = 0; i < s->slots; i++) {
> + s->addr[i] = base + i * stride;
> + s->bo[i] = xe_bo_create(s->fd, s->vm, s->bo_size,
> + vram_if_possible(fd, s->hwe.gt_id),
> + DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> + s->spin[i] = xe_bo_map(s->fd, s->bo[i], s->bo_size);
> + xe_vm_bind_sync(s->fd, s->vm, s->bo[i], 0, s->addr[i], s->bo_size);
> + s->done_fence[i] = syncobj_create(s->fd, 0);
> + }
> +
> s->exec.num_batch_buffer = 1;
> s->exec.exec_queue_id = s->exec_queue_id;
> - s->exec.address = s->addr;
> + /* s->exec.address set per submission */
> }
>
> static void subm_fini(struct subm *s)
> {
> - xe_vm_unbind_sync(s->fd, s->vm, 0, s->addr, s->bo_size);
> - gem_munmap(s->spin, s->bo_size);
> - gem_close(s->fd, s->bo);
> + for (unsigned int i = 0; i < s->slots; i++) {
> + xe_vm_unbind_sync(s->fd, s->vm, 0, s->addr[i], s->bo_size);
> + gem_munmap(s->spin[i], s->bo_size);
> + gem_close(s->fd, s->bo[i]);
> + if (s->done_fence[i])
> + syncobj_destroy(s->fd, s->done_fence[i]);
> + }
> xe_exec_queue_destroy(s->fd, s->exec_queue_id);
> xe_vm_destroy(s->fd, s->vm);
> - syncobj_destroy(s->fd, s->sync[0].handle);
> + free(s->addr);
> + free(s->bo);
> + free(s->spin);
> + free(s->done_fence);
> }
>
> static void subm_workload_init(struct subm *s, struct subm_work_desc *work)
> @@ -122,25 +143,36 @@ static void subm_workload_init(struct subm *s, struct subm_work_desc *work)
> s->work = *work;
> s->expected_ticks = xe_spin_nsec_to_ticks(s->fd, s->hwe.gt_id,
> s->work.duration_ms * 1000000);
> - xe_spin_init_opts(s->spin, .addr = s->addr, .preempt = s->work.preempt,
> - .ctx_ticks = s->expected_ticks);
> + for (unsigned int i = 0; i < s->slots; i++)
> + xe_spin_init_opts(s->spin[i], .addr = s->addr[i],
> + .preempt = s->work.preempt,
> + .ctx_ticks = s->expected_ticks);
> }
>
> -static void subm_wait(struct subm *s, uint64_t abs_timeout_nsec)
> +static void subm_wait_slot(struct subm *s, unsigned int slot, uint64_t abs_timeout_nsec)
> {
> - igt_assert(syncobj_wait(s->fd, &s->sync[0].handle, 1, abs_timeout_nsec,
> - 0, NULL));
> + igt_assert(syncobj_wait(s->fd, &s->done_fence[slot], 1,
> + abs_timeout_nsec, 0, NULL));
> }
>
> -static void subm_exec(struct subm *s)
> +static void subm_exec_slot(struct subm *s, unsigned int slot)
> {
> - syncobj_reset(s->fd, &s->sync[0].handle, 1);
> + struct timespec tv;
> +
> + syncobj_reset(s->fd, &s->done_fence[slot], 1);
> + memset(&s->sync[0], 0, sizeof(s->sync));
> + s->sync[0].type = DRM_XE_SYNC_TYPE_SYNCOBJ;
> + s->sync[0].flags = DRM_XE_SYNC_FLAG_SIGNAL;
> + s->sync[0].handle = s->done_fence[slot];
> + s->exec.num_syncs = 1;
> + s->exec.syncs = to_user_pointer(&s->sync[0]);
> + s->exec.address = s->addr[slot];
> xe_exec(s->fd, &s->exec);
> }
>
> -static bool subm_is_work_complete(struct subm *s)
> +static bool subm_is_work_complete(struct subm *s, unsigned int slot)
> {
> - return s->expected_ticks <= ~s->spin->ticks_delta;
> + return s->expected_ticks <= ~s->spin[slot]->ticks_delta;
> }
>
> static bool subm_is_exec_queue_banned(struct subm *s)
> @@ -157,6 +189,8 @@ static bool subm_is_exec_queue_banned(struct subm *s)
> static void subm_exec_loop(struct subm *s, struct subm_stats *stats,
> const struct subm_opts *opts)
> {
> + const unsigned int inflight = s->slots;
> + unsigned int submitted = 0;
> struct timespec tv;
> unsigned int i;
>
> @@ -165,16 +199,24 @@ static void subm_exec_loop(struct subm *s, struct subm_stats *stats,
> tv.tv_sec * (uint64_t)NSEC_PER_SEC + tv.tv_nsec;
> igt_debug("[%s] start_timestamp: %f\n", s->id, stats->start_timestamp * 1e-9);
>
> - for (i = 0; i < s->work.repeats; ++i) {
> - igt_gettime(&tv);
> + /* Prefill */
> + if (s->work.repeats) {
> + unsigned int can_prefill = min(inflight, s->work.repeats);
>
> - subm_exec(s);
> + for (i = 0; i < can_prefill; i++)
> + subm_exec_slot(s, i % inflight);
> + submitted = can_prefill;
> + }
>
> - subm_wait(s, INT64_MAX);
> + /* Process completions in order: sample i -> slot (i % inflight) */
> + for (i = 0; i < s->work.repeats; ++i) {
> + unsigned int slot = i % inflight;
>
> + igt_gettime(&tv);
> + subm_wait_slot(s, slot, INT64_MAX);
> igt_stats_push(&stats->samples, igt_nsec_elapsed(&tv));
>
> - if (!subm_is_work_complete(s)) {
> + if (!subm_is_work_complete(s, slot)) {
> stats->num_early_finish++;
>
> igt_debug("[%s] subm #%d early_finish=%u\n",
> @@ -183,6 +225,14 @@ static void subm_exec_loop(struct subm *s, struct subm_stats *stats,
> if (subm_is_exec_queue_banned(s))
> break;
> }
> +
> + /* Keep the pipeline full */
> + if (submitted < s->work.repeats) {
> + unsigned int next_slot = submitted % inflight;
> +
> + subm_exec_slot(s, next_slot);
> + submitted++;
> + }
> }
>
> igt_gettime(&tv);
> @@ -607,7 +657,7 @@ static void throughput_ratio(int pf_fd, int num_vfs, const struct subm_opts *opt
> igt_assert_fd(vf_fd);
> set->data[n].opts = opts;
> subm_init(&set->data[n].subm, vf_fd, vf_ids[n], 0,
> - xe_engine(vf_fd, 0)->instance);
> + xe_engine(vf_fd, 0)->instance, 1);
> subm_workload_init(&set->data[n].subm,
> &(struct subm_work_desc){
> .duration_ms = job_sched_params.duration_ms,
> @@ -702,7 +752,7 @@ static void nonpreempt_engine_resets(int pf_fd, int num_vfs,
> igt_assert_fd(vf_fd);
> set->data[n].opts = opts;
> subm_init(&set->data[n].subm, vf_fd, vf_ids[n], 0,
> - xe_engine(vf_fd, 0)->instance);
> + xe_engine(vf_fd, 0)->instance, 1);
> subm_workload_init(&set->data[n].subm,
> &(struct subm_work_desc){
> .duration_ms = duration_ms,
More information about the igt-dev mailing list