[PATCH i-g-t] tests/intel/xe_sriov_scheduling: K-in-flight; completion window; --inflight

Marcin Bernatowicz marcin.bernatowicz at linux.intel.com
Tue Aug 19 11:00:55 UTC 2025


Refactor submission and measurement to better saturate the hardware and
make throughput comparisons more robust, especially with short jobs.
Add an --inflight option to control the number of submissions kept in
flight per VF.

- Drive a K-in-flight pipeline per VF using per-slot BO/addr/spin and
  binary out-fences; add subm_exec_slot()/subm_wait_slot() (prefill +
  refill).
- Record complete_ts[] and per-slot submit_ts[]; build the common
  window from completions [max(first), min(last)] and compute
  throughput as count/window.
- Push durations as submit-to-completion (complete_ts - submit_ts) and
  print "mean submit->signal latency".
- Add --inflight (0=auto; non-preempt defaults to 1; short jobs pick a
  higher K); print the chosen K in the banner.
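
An illustrative sketch of the measurement model (simplified pseudocode,
not the exact helpers used in the test):

	/* Per VF: keep K jobs in flight; every sample is submit->signal. */
	for (i = 0; i < min(K, repeats); i++)
		exec_slot(i);				/* prefill */
	for (i = 0, submitted = min(K, repeats); i < repeats; i++) {
		slot = i % K;
		wait_slot(slot);
		complete_ts[i] = now_ns();
		push_sample(complete_ts[i] - submit_ts[slot]);
		if (submitted < repeats)
			exec_slot(submitted++ % K);	/* refill */
	}

	/* Common window across VFs, built from completions only: */
	common_start = max over VFs of complete_ts[0];
	common_end   = min over VFs of complete_ts[n_samples - 1];
	throughput   = completions in [common_start, common_end] /
		       ((common_end - common_start) * 1e-9);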

Signed-off-by: Marcin Bernatowicz <marcin.bernatowicz at linux.intel.com>
Cc: Adam Miszczak <adam.miszczak at linux.intel.com>
Cc: Jakub Kolakowski <jakub1.kolakowski at intel.com>
Cc: Lukasz Laguna <lukasz.laguna at intel.com>
Cc: Satyanarayana K V P <satyanarayana.k.v.p at intel.com>
---
 tests/intel/xe_sriov_scheduling.c | 241 +++++++++++++++++++++---------
 1 file changed, 171 insertions(+), 70 deletions(-)

diff --git a/tests/intel/xe_sriov_scheduling.c b/tests/intel/xe_sriov_scheduling.c
index d69315690..467eb1e29 100644
--- a/tests/intel/xe_sriov_scheduling.c
+++ b/tests/intel/xe_sriov_scheduling.c
@@ -27,6 +27,8 @@ struct subm_opts {
 	uint32_t exec_quantum_ms;
 	uint32_t preempt_timeout_us;
 	double outlier_treshold;
+	/* --inflight=0 => auto; >=1 => explicit K */
+	unsigned int inflight;
 };
 
 struct subm_work_desc {
@@ -39,6 +41,7 @@ struct subm_stats {
 	igt_stats_t samples;
 	uint64_t start_timestamp;
 	uint64_t end_timestamp;
+	uint64_t *complete_ts; /* absolute completion timestamps (ns) */
 	unsigned int num_early_finish;
 	unsigned int concurrent_execs;
 	double concurrent_rate;
@@ -51,13 +54,17 @@ struct subm {
 	int vf_num;
 	struct subm_work_desc work;
 	uint32_t expected_ticks;
-	uint64_t addr;
 	uint32_t vm;
 	struct drm_xe_engine_class_instance hwe;
 	uint32_t exec_queue_id;
-	uint32_t bo;
+	/* K slots (K BOs / addresses / mapped spinners / done fences / submit_ts) */
+	unsigned int slots;
+	uint64_t *submit_ts; /* per-slot submit timestamps (ns) */
+	uint64_t *addr;
+	uint32_t *bo;
 	size_t bo_size;
-	struct xe_spin *spin;
+	struct xe_spin **spin;
+	uint32_t *done_fence;
 	struct drm_xe_sync sync[1];
 	struct drm_xe_exec exec;
 };
@@ -78,43 +85,62 @@ struct subm_set {
 };
 
 static void subm_init(struct subm *s, int fd, int vf_num, uint64_t addr,
-		      struct drm_xe_engine_class_instance hwe)
+		      struct drm_xe_engine_class_instance hwe,
+		      unsigned int inflight)
 {
+	uint64_t base, stride;
+
 	memset(s, 0, sizeof(*s));
 	s->fd = fd;
 	s->vf_num = vf_num;
 	s->hwe = hwe;
 	snprintf(s->id, sizeof(s->id), "VF%d %d:%d:%d", vf_num,
 		 hwe.engine_class, hwe.engine_instance, hwe.gt_id);
-	s->addr = addr ? addr : 0x1a0000;
+	s->slots = inflight ? inflight : 1;
 	s->vm = xe_vm_create(s->fd, 0, 0);
 	s->exec_queue_id = xe_exec_queue_create(s->fd, s->vm, &s->hwe, 0);
 	s->bo_size = ALIGN(sizeof(struct xe_spin) + xe_cs_prefetch_size(s->fd),
 			   xe_get_default_alignment(s->fd));
-	s->bo = xe_bo_create(s->fd, s->vm, s->bo_size,
-			     vram_if_possible(fd, s->hwe.gt_id),
-			     DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
-	s->spin = xe_bo_map(s->fd, s->bo, s->bo_size);
-	xe_vm_bind_sync(s->fd, s->vm, s->bo, 0, s->addr, s->bo_size);
-	/* out fence */
-	s->sync[0].type = DRM_XE_SYNC_TYPE_SYNCOBJ;
-	s->sync[0].flags = DRM_XE_SYNC_FLAG_SIGNAL;
-	s->sync[0].handle = syncobj_create(s->fd, 0);
-	s->exec.num_syncs = 1;
-	s->exec.syncs = to_user_pointer(&s->sync[0]);
+	s->addr = calloc(s->slots, sizeof(*s->addr));
+	s->bo = calloc(s->slots, sizeof(*s->bo));
+	s->spin = calloc(s->slots, sizeof(*s->spin));
+	s->done_fence = calloc(s->slots, sizeof(*s->done_fence));
+	s->submit_ts = calloc(s->slots, sizeof(*s->submit_ts));
+	igt_assert(s->addr && s->bo && s->spin && s->done_fence && s->submit_ts);
+
+	base = addr ? addr : 0x1a0000;
+	stride = ALIGN(s->bo_size, 0x10000);
+	for (unsigned int i = 0; i < s->slots; i++) {
+		s->addr[i] = base + i * stride;
+		s->bo[i] = xe_bo_create(s->fd, s->vm, s->bo_size,
+					vram_if_possible(fd, s->hwe.gt_id),
+					DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
+		s->spin[i] = xe_bo_map(s->fd, s->bo[i], s->bo_size);
+		xe_vm_bind_sync(s->fd, s->vm, s->bo[i], 0, s->addr[i], s->bo_size);
+		s->done_fence[i] = syncobj_create(s->fd, 0);
+	}
+
 	s->exec.num_batch_buffer = 1;
 	s->exec.exec_queue_id = s->exec_queue_id;
-	s->exec.address = s->addr;
+	/* s->exec.address set per submission */
 }
 
 static void subm_fini(struct subm *s)
 {
-	xe_vm_unbind_sync(s->fd, s->vm, 0, s->addr, s->bo_size);
-	gem_munmap(s->spin, s->bo_size);
-	gem_close(s->fd, s->bo);
+	for (unsigned int i = 0; i < s->slots; i++) {
+		xe_vm_unbind_sync(s->fd, s->vm, 0, s->addr[i], s->bo_size);
+		gem_munmap(s->spin[i], s->bo_size);
+		gem_close(s->fd, s->bo[i]);
+		if (s->done_fence[i])
+			syncobj_destroy(s->fd, s->done_fence[i]);
+	}
 	xe_exec_queue_destroy(s->fd, s->exec_queue_id);
 	xe_vm_destroy(s->fd, s->vm);
-	syncobj_destroy(s->fd, s->sync[0].handle);
+	free(s->addr);
+	free(s->bo);
+	free(s->spin);
+	free(s->done_fence);
+	free(s->submit_ts);
 }
 
 static void subm_workload_init(struct subm *s, struct subm_work_desc *work)
@@ -122,25 +148,41 @@ static void subm_workload_init(struct subm *s, struct subm_work_desc *work)
 	s->work = *work;
 	s->expected_ticks = xe_spin_nsec_to_ticks(s->fd, s->hwe.gt_id,
 						  s->work.duration_ms * 1000000);
-	xe_spin_init_opts(s->spin, .addr = s->addr, .preempt = s->work.preempt,
-			  .ctx_ticks = s->expected_ticks);
+	for (unsigned int i = 0; i < s->slots; i++)
+		xe_spin_init_opts(s->spin[i], .addr = s->addr[i],
+				  .preempt = s->work.preempt,
+				  .ctx_ticks = s->expected_ticks);
 }
 
-static void subm_wait(struct subm *s, uint64_t abs_timeout_nsec)
+static void subm_wait_slot(struct subm *s, unsigned int slot, uint64_t abs_timeout_nsec)
 {
-	igt_assert(syncobj_wait(s->fd, &s->sync[0].handle, 1, abs_timeout_nsec,
-				0, NULL));
+	igt_assert(syncobj_wait(s->fd, &s->done_fence[slot], 1,
+				abs_timeout_nsec, 0, NULL));
 }
 
-static void subm_exec(struct subm *s)
+static void subm_exec_slot(struct subm *s, unsigned int slot)
 {
-	syncobj_reset(s->fd, &s->sync[0].handle, 1);
+	struct timespec tv;
+	int nsync = 0;
+
+	syncobj_reset(s->fd, &s->done_fence[slot], 1);
+	memset(&s->sync[0], 0, sizeof(s->sync));
+	s->sync[nsync].type = DRM_XE_SYNC_TYPE_SYNCOBJ;
+	s->sync[nsync].flags = DRM_XE_SYNC_FLAG_SIGNAL;
+	s->sync[nsync].handle = s->done_fence[slot];
+	nsync++;
+	s->exec.num_syncs = nsync;
+	s->exec.syncs = to_user_pointer(&s->sync[0]);
+	s->exec.address = s->addr[slot];
+
+	igt_gettime(&tv);
+	s->submit_ts[slot] = (uint64_t)tv.tv_sec * (uint64_t)NSEC_PER_SEC + (uint64_t)tv.tv_nsec;
 	xe_exec(s->fd, &s->exec);
 }
 
-static bool subm_is_work_complete(struct subm *s)
+static bool subm_is_work_complete(struct subm *s, unsigned int slot)
 {
-	return s->expected_ticks <= ~s->spin->ticks_delta;
+	return s->expected_ticks <= ~s->spin[slot]->ticks_delta;
 }
 
 static bool subm_is_exec_queue_banned(struct subm *s)
@@ -157,6 +199,8 @@ static bool subm_is_exec_queue_banned(struct subm *s)
 static void subm_exec_loop(struct subm *s, struct subm_stats *stats,
 			   const struct subm_opts *opts)
 {
+	const unsigned int inflight = s->slots;
+	unsigned int submitted = 0;
 	struct timespec tv;
 	unsigned int i;
 
@@ -165,16 +209,27 @@ static void subm_exec_loop(struct subm *s, struct subm_stats *stats,
 		tv.tv_sec * (uint64_t)NSEC_PER_SEC + tv.tv_nsec;
 	igt_debug("[%s] start_timestamp: %f\n", s->id, stats->start_timestamp * 1e-9);
 
-	for (i = 0; i < s->work.repeats; ++i) {
-		igt_gettime(&tv);
+	/* Prefill */
+	if (s->work.repeats) {
+		unsigned int can_prefill = min(inflight, s->work.repeats);
 
-		subm_exec(s);
+		for (i = 0; i < can_prefill; i++)
+			subm_exec_slot(s, i % inflight);
+		submitted = can_prefill;
+	}
 
-		subm_wait(s, INT64_MAX);
+	/* Process completions in order: sample i -> slot (i % inflight) */
+	for (i = 0; i < s->work.repeats; ++i) {
+		unsigned int slot = i % inflight;
+
+		subm_wait_slot(s, slot, INT64_MAX);
 
-		igt_stats_push(&stats->samples, igt_nsec_elapsed(&tv));
+		igt_gettime(&tv);
+		stats->complete_ts[i] = (uint64_t)tv.tv_sec * (uint64_t)NSEC_PER_SEC +
+					(uint64_t)tv.tv_nsec;
+		igt_stats_push(&stats->samples, stats->complete_ts[i] - s->submit_ts[slot]);
 
-		if (!subm_is_work_complete(s)) {
+		if (!subm_is_work_complete(s, slot)) {
 			stats->num_early_finish++;
 
 			igt_debug("[%s] subm #%d early_finish=%u\n",
@@ -183,6 +238,14 @@ static void subm_exec_loop(struct subm *s, struct subm_stats *stats,
 			if (subm_is_exec_queue_banned(s))
 				break;
 		}
+
+		/* Keep the pipeline full */
+		if (submitted < s->work.repeats) {
+			unsigned int next_slot = submitted % inflight;
+
+			subm_exec_slot(s, next_slot);
+			submitted++;
+		}
 	}
 
 	igt_gettime(&tv);
@@ -272,8 +335,10 @@ static void subm_set_fini(struct subm_set *set)
 
 	subm_set_close_handles(set);
 
-	for (i = 0; i < set->ndata; ++i)
+	for (i = 0; i < set->ndata; ++i) {
 		igt_stats_fini(&set->data[i].stats.samples);
+		free(set->data[i].stats.complete_ts);
+	}
 
 	subm_set_free_data(set);
 }
@@ -334,16 +399,22 @@ static void compute_common_time_frame_stats(struct subm_set *set)
 	struct subm_stats *stats;
 	uint64_t common_start = 0;
 	uint64_t common_end = UINT64_MAX;
+	uint64_t first_ts, last_ts;
 
-	/* Find the common time frame */
+	/* Find common window from completion timestamps */
 	for (i = 0; i < ndata; i++) {
 		stats = &data[i].stats;
 
-		if (stats->start_timestamp > common_start)
-			common_start = stats->start_timestamp;
+		if (!stats->samples.n_values)
+			continue;
 
-		if (stats->end_timestamp < common_end)
-			common_end = stats->end_timestamp;
+		first_ts = stats->complete_ts[0];
+		last_ts = stats->complete_ts[stats->samples.n_values - 1];
+
+		if (first_ts > common_start)
+			common_start = first_ts;
+		if (last_ts < common_end)
+			common_end = last_ts;
 	}
 
 	igt_info("common time frame: [%" PRIu64 ";%" PRIu64 "] %.2fms\n",
@@ -354,8 +425,7 @@ static void compute_common_time_frame_stats(struct subm_set *set)
 
 	/* Compute concurrent_rate for each sample set within the common time frame */
 	for (i = 0; i < ndata; i++) {
-		uint64_t total_samples_duration = 0;
-		uint64_t samples_duration_in_common_frame = 0;
+		const double window_s = (common_end - common_start) * 1e-9;
 
 		stats = &data[i].stats;
 		stats->concurrent_execs = 0;
@@ -363,29 +433,21 @@ static void compute_common_time_frame_stats(struct subm_set *set)
 		stats->concurrent_mean = 0.0;
 
 		for (j = 0; j < stats->samples.n_values; j++) {
-			uint64_t sample_start = stats->start_timestamp + total_samples_duration;
-			uint64_t sample_end = sample_start + stats->samples.values_u64[j];
+			uint64_t cts = stats->complete_ts[j];
 
-			if (sample_start >= common_start &&
-			    sample_end <= common_end) {
+			if (cts >= common_start && cts <= common_end) {
 				stats->concurrent_execs++;
-				samples_duration_in_common_frame +=
-					stats->samples.values_u64[j];
+				stats->concurrent_mean += stats->samples.values_u64[j];
 			}
-
-			total_samples_duration += stats->samples.values_u64[j];
 		}
 
-		stats->concurrent_rate = samples_duration_in_common_frame ?
-				     (double)stats->concurrent_execs /
-					     (samples_duration_in_common_frame *
-					      1e-9) :
-				     0.0;
+		stats->concurrent_rate = (window_s > 0.0) ?
+					 ((double)stats->concurrent_execs / window_s) : 0.0;
+
 		stats->concurrent_mean = stats->concurrent_execs ?
-				      (double)samples_duration_in_common_frame /
-					      stats->concurrent_execs :
-				      0.0;
-		igt_info("[%s] Throughput = %.4f execs/s mean duration=%.4fms nsamples=%d\n",
+					 (double)stats->concurrent_mean /
+					 stats->concurrent_execs : 0.0;
+		igt_info("[%s] Throughput = %.4f execs/s mean submit->signal latency=%.4fms nsamples=%d\n",
 			 data[i].subm.id, stats->concurrent_rate, stats->concurrent_mean * 1e-6,
 			 stats->concurrent_execs);
 	}
@@ -439,9 +501,9 @@ static void log_sample_values(char *id, struct subm_stats *stats,
 }
 
 #define MIN_NUM_REPEATS 25
-#define MIN_EXEC_QUANTUM_MS 8
+#define MIN_EXEC_QUANTUM_MS 1
 #define MAX_EXEC_QUANTUM_MS 32
-#define MIN_JOB_DURATION_MS 16
+#define MIN_JOB_DURATION_MS 2
 #define MAX_TOTAL_DURATION_MS 15000
 #define PREFERRED_TOTAL_DURATION_MS 10000
 #define MAX_PREFERRED_REPEATS 100
@@ -546,6 +608,25 @@ static struct vf_sched_params prepare_vf_sched_params(int num_threads,
 	return params;
 }
 
+/* inflight K selection:
+ *   user_k == 0  => auto
+ *   user_k >= 1  => explicit K
+ */
+static unsigned int select_inflight_k(unsigned int duration_ms,
+				      unsigned int user_k,
+				      bool nonpreempt)
+{
+	if (user_k)
+		return user_k >= 1 ? user_k : 1;
+	if (nonpreempt)
+		return 1;
+	if (duration_ms <= 12)
+		return 4;
+	if (duration_ms <= 20)
+		return 3;
+	return 2;
+}
+
 static struct job_sched_params
 prepare_job_sched_params(int num_threads, int job_timeout_ms, const struct subm_opts *opts)
 {
@@ -573,12 +654,14 @@ static void throughput_ratio(int pf_fd, int num_vfs, const struct subm_opts *opt
 	struct job_sched_params job_sched_params = prepare_job_sched_params(num_vfs + 1,
 									    job_timeout_ms,
 									    opts);
+	const unsigned int k = select_inflight_k(job_sched_params.duration_ms,
+						 opts->inflight, false);
 
-	igt_info("eq=%ums pt=%uus duration=%ums repeats=%d num_vfs=%d job_timeout=%ums\n",
+	igt_info("eq=%ums pt=%uus duration=%ums repeats=%d inflight=%u num_vfs=%d job_timeout=%ums\n",
 		 job_sched_params.sched_params.exec_quantum_ms,
 		 job_sched_params.sched_params.preempt_timeout_us,
 		 job_sched_params.duration_ms, job_sched_params.num_repeats,
-		 num_vfs + 1, job_timeout_ms);
+		 k, num_vfs + 1, job_timeout_ms);
 
 	init_vf_ids(vf_ids, ARRAY_SIZE(vf_ids),
 		    &(struct init_vf_ids_opts){ .shuffle = true,
@@ -607,7 +690,7 @@ static void throughput_ratio(int pf_fd, int num_vfs, const struct subm_opts *opt
 		igt_assert_fd(vf_fd);
 		set->data[n].opts = opts;
 		subm_init(&set->data[n].subm, vf_fd, vf_ids[n], 0,
-			  xe_engine(vf_fd, 0)->instance);
+			  xe_engine(vf_fd, 0)->instance, k);
 		subm_workload_init(&set->data[n].subm,
 				   &(struct subm_work_desc){
 					.duration_ms = job_sched_params.duration_ms,
@@ -615,6 +698,8 @@ static void throughput_ratio(int pf_fd, int num_vfs, const struct subm_opts *opt
 					.repeats = job_sched_params.num_repeats });
 		igt_stats_init_with_size(&set->data[n].stats.samples,
 					 set->data[n].subm.work.repeats);
+		set->data[n].stats.complete_ts = calloc(set->data[n].subm.work.repeats,
+							sizeof(uint64_t));
 		if (set->sync_method == SYNC_BARRIER)
 			set->data[n].barrier = &set->barrier;
 	}
@@ -670,10 +755,11 @@ static void nonpreempt_engine_resets(int pf_fd, int num_vfs,
 			       vf_sched_params.preempt_timeout_us / USEC_PER_MSEC;
 	int preemptible_end = 1;
 	uint8_t vf_ids[num_vfs + 1 /*PF*/];
+	const unsigned int k = select_inflight_k(duration_ms, opts->inflight, true);
 
-	igt_info("eq=%ums pt=%uus duration=%" PRIu64 "ms num_vfs=%d job_timeout=%ums\n",
+	igt_info("eq=%ums pt=%uus duration=%" PRIu64 "ms inflight=%u num_vfs=%d job_timeout=%ums\n",
 		 vf_sched_params.exec_quantum_ms, vf_sched_params.preempt_timeout_us,
-		 duration_ms, num_vfs, job_timeout_ms);
+		 duration_ms, k, num_vfs, job_timeout_ms);
 
 	init_vf_ids(vf_ids, ARRAY_SIZE(vf_ids),
 		    &(struct init_vf_ids_opts){ .shuffle = true,
@@ -702,7 +788,7 @@ static void nonpreempt_engine_resets(int pf_fd, int num_vfs,
 		igt_assert_fd(vf_fd);
 		set->data[n].opts = opts;
 		subm_init(&set->data[n].subm, vf_fd, vf_ids[n], 0,
-			  xe_engine(vf_fd, 0)->instance);
+			  xe_engine(vf_fd, 0)->instance, k);
 		subm_workload_init(&set->data[n].subm,
 				   &(struct subm_work_desc){
 					.duration_ms = duration_ms,
@@ -710,6 +796,8 @@ static void nonpreempt_engine_resets(int pf_fd, int num_vfs,
 					.repeats = MIN_NUM_REPEATS });
 		igt_stats_init_with_size(&set->data[n].stats.samples,
 					 set->data[n].subm.work.repeats);
+		set->data[n].stats.complete_ts = calloc(set->data[n].subm.work.repeats,
+							sizeof(uint64_t));
 		if (set->sync_method == SYNC_BARRIER)
 			set->data[n].barrier = &set->barrier;
 	}
@@ -738,6 +826,7 @@ static void nonpreempt_engine_resets(int pf_fd, int num_vfs,
 static struct subm_opts subm_opts = {
 	.sync_method = SYNC_BARRIER,
 	.outlier_treshold = 0.1,
+	.inflight = 0,
 };
 
 static bool extended_scope;
@@ -764,6 +853,16 @@ static int subm_opts_handler(int opt, int opt_index, void *data)
 		subm_opts.outlier_treshold = atoi(optarg) / 100.0;
 		igt_info("Outlier threshold: %.2f\n", subm_opts.outlier_treshold);
 		break;
+	case 'i': {
+		int val = atoi(optarg);
+
+		subm_opts.inflight = val > 0 ? val : 0;
+		if (subm_opts.inflight)
+			igt_info("In-flight submissions: %u\n", subm_opts.inflight);
+		else
+			igt_info("In-flight submissions: auto (0)\n");
+		break;
+	}
 	default:
 		return IGT_OPT_HANDLER_ERROR;
 	}
@@ -777,6 +876,7 @@ static const struct option long_opts[] = {
 	{ .name = "threshold", .has_arg = true, .val = 't', },
 	{ .name = "eq_ms", .has_arg = true, .val = 'q', },
 	{ .name = "pt_us", .has_arg = true, .val = 'p', },
+	{ .name = "inflight", .has_arg = true, .val = 'i', },
 	{}
 };
 
@@ -785,7 +885,8 @@ static const char help_str[] =
 	"  --sync\tThreads synchronization method: 0 - none 1 - barrier (Default 1)\n"
 	"  --threshold\tSample outlier threshold (Default 0.1)\n"
 	"  --eq_ms\texec_quantum_ms\n"
-	"  --pt_us\tpreempt_timeout_us\n";
+	"  --pt_us\tpreempt_timeout_us\n"
+	"  --inflight\tNumber of submissions kept in flight per VF (0=auto)\n";
 
 igt_main_args("", long_opts, help_str, subm_opts_handler, NULL)
 {
-- 
2.31.1
