[PATCH i-g-t] tests/intel/xe_sriov_scheduling: K-in-flight; completion window; --inflight

Tue Aug 19 16:54:13 UTC 2025

On 8/19/2025 2:15 PM, Kamil Konieczny wrote:
> Hi Marcin,
> On 2025-08-19 at 13:00:55 +0200, Marcin Bernatowicz wrote:
> 
> Please improve the subject or split this into more patches.

OK, I'll split this into more patches.

Thanks,
marcin

> 
> For example better subject could be:
> 
> [PATCH i-g-t] tests/intel/xe_sriov_scheduling: Refactor to better saturate HW
> 
> Regards,
> Kamil
> 
>> Refactor submission/measurement to better saturate HW and make
>> throughput comparisons more robust, especially with short jobs.
>> Add the --inflight option.
>>
>> - Drive a K-in-flight pipeline per VF using per-slot BO/addr/spin and
>>    binary out-fences; add subm_exec_slot()/subm_wait_slot() (prefill +
>>    refill).
>> - Record complete_ts[] and per-slot submit_ts[]; build the common
>>    window from completion timestamps, [max(first), min(last)] taken
>>    across all sample sets, and compute throughput as count/window.
>> - Push durations as submit-to-completion (complete_ts - submit_ts) and
>>    print "mean submit->signal latency".
>> - Add --inflight (0=auto; non-preempt defaults to 1; short jobs pick
>>    higher K); print chosen K in the banner.
>>
>> Signed-off-by: Marcin Bernatowicz <marcin.bernatowicz at linux.intel.com>
>> Cc: Adam Miszczak <adam.miszczak at linux.intel.com>
>> Cc: Jakub Kolakowski <jakub1.kolakowski at intel.com>
>> Cc: Lukasz Laguna <lukasz.laguna at intel.com>
>> Cc: Satyanarayana K V P <satyanarayana.k.v.p at intel.com>
>> ---
>>   tests/intel/xe_sriov_scheduling.c | 241 +++++++++++++++++++++---------
>>   1 file changed, 171 insertions(+), 70 deletions(-)
>>
>> diff --git a/tests/intel/xe_sriov_scheduling.c b/tests/intel/xe_sriov_scheduling.c
>> index d69315690..467eb1e29 100644
>> --- a/tests/intel/xe_sriov_scheduling.c
>> +++ b/tests/intel/xe_sriov_scheduling.c
>> @@ -27,6 +27,8 @@ struct subm_opts {
>>   	uint32_t exec_quantum_ms;
>>   	uint32_t preempt_timeout_us;
>>   	double outlier_treshold;
>> +	/* --inflight=0 => auto; >=1 => explicit K */
>> +	unsigned int inflight;
>>   };
>>   
>>   struct subm_work_desc {
>> @@ -39,6 +41,7 @@ struct subm_stats {
>>   	igt_stats_t samples;
>>   	uint64_t start_timestamp;
>>   	uint64_t end_timestamp;
>> +	uint64_t *complete_ts; /* absolute completion timestamps (ns) */
>>   	unsigned int num_early_finish;
>>   	unsigned int concurrent_execs;
>>   	double concurrent_rate;
>> @@ -51,13 +54,17 @@ struct subm {
>>   	int vf_num;
>>   	struct subm_work_desc work;
>>   	uint32_t expected_ticks;
>> -	uint64_t addr;
>>   	uint32_t vm;
>>   	struct drm_xe_engine_class_instance hwe;
>>   	uint32_t exec_queue_id;
>> -	uint32_t bo;
>> +	/* K slots (K BOs / addresses / mapped spinners / done fences / submit_ts) */
>> +	unsigned int slots;
>> +	uint64_t *submit_ts; /* per-slot submit timestamps (ns) */
>> +	uint64_t *addr;
>> +	uint32_t *bo;
>>   	size_t bo_size;
>> -	struct xe_spin *spin;
>> +	struct xe_spin **spin;
>> +	uint32_t *done_fence;
>>   	struct drm_xe_sync sync[1];
>>   	struct drm_xe_exec exec;
>>   };
>> @@ -78,43 +85,62 @@ struct subm_set {
>>   };
>>   
>>   static void subm_init(struct subm *s, int fd, int vf_num, uint64_t addr,
>> -		      struct drm_xe_engine_class_instance hwe)
>> +		      struct drm_xe_engine_class_instance hwe,
>> +		      unsigned int inflight)
>>   {
>> +	uint64_t base, stride;
>> +
>>   	memset(s, 0, sizeof(*s));
>>   	s->fd = fd;
>>   	s->vf_num = vf_num;
>>   	s->hwe = hwe;
>>   	snprintf(s->id, sizeof(s->id), "VF%d %d:%d:%d", vf_num,
>>   		 hwe.engine_class, hwe.engine_instance, hwe.gt_id);
>> -	s->addr = addr ? addr : 0x1a0000;
>> +	s->slots = inflight ? inflight : 1;
>>   	s->vm = xe_vm_create(s->fd, 0, 0);
>>   	s->exec_queue_id = xe_exec_queue_create(s->fd, s->vm, &s->hwe, 0);
>>   	s->bo_size = ALIGN(sizeof(struct xe_spin) + xe_cs_prefetch_size(s->fd),
>>   			   xe_get_default_alignment(s->fd));
>> -	s->bo = xe_bo_create(s->fd, s->vm, s->bo_size,
>> -			     vram_if_possible(fd, s->hwe.gt_id),
>> -			     DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
>> -	s->spin = xe_bo_map(s->fd, s->bo, s->bo_size);
>> -	xe_vm_bind_sync(s->fd, s->vm, s->bo, 0, s->addr, s->bo_size);
>> -	/* out fence */
>> -	s->sync[0].type = DRM_XE_SYNC_TYPE_SYNCOBJ;
>> -	s->sync[0].flags = DRM_XE_SYNC_FLAG_SIGNAL;
>> -	s->sync[0].handle = syncobj_create(s->fd, 0);
>> -	s->exec.num_syncs = 1;
>> -	s->exec.syncs = to_user_pointer(&s->sync[0]);
>> +	s->addr = calloc(s->slots, sizeof(*s->addr));
>> +	s->bo = calloc(s->slots, sizeof(*s->bo));
>> +	s->spin = calloc(s->slots, sizeof(*s->spin));
>> +	s->done_fence = calloc(s->slots, sizeof(*s->done_fence));
>> +	s->submit_ts = calloc(s->slots, sizeof(*s->submit_ts));
>> +	igt_assert(s->addr && s->bo && s->spin && s->done_fence && s->submit_ts);
>> +
>> +	base = addr ? addr : 0x1a0000;
>> +	stride = ALIGN(s->bo_size, 0x10000);
>> +	for (unsigned int i = 0; i < s->slots; i++) {
>> +		s->addr[i] = base + i * stride;
>> +		s->bo[i] = xe_bo_create(s->fd, s->vm, s->bo_size,
>> +					vram_if_possible(fd, s->hwe.gt_id),
>> +					DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
>> +		s->spin[i] = xe_bo_map(s->fd, s->bo[i], s->bo_size);
>> +		xe_vm_bind_sync(s->fd, s->vm, s->bo[i], 0, s->addr[i], s->bo_size);
>> +		s->done_fence[i] = syncobj_create(s->fd, 0);
>> +	}
>> +
>>   	s->exec.num_batch_buffer = 1;
>>   	s->exec.exec_queue_id = s->exec_queue_id;
>> -	s->exec.address = s->addr;
>> +	/* s->exec.address set per submission */
>>   }
>>   
>>   static void subm_fini(struct subm *s)
>>   {
>> -	xe_vm_unbind_sync(s->fd, s->vm, 0, s->addr, s->bo_size);
>> -	gem_munmap(s->spin, s->bo_size);
>> -	gem_close(s->fd, s->bo);
>> +	for (unsigned int i = 0; i < s->slots; i++) {
>> +		xe_vm_unbind_sync(s->fd, s->vm, 0, s->addr[i], s->bo_size);
>> +		gem_munmap(s->spin[i], s->bo_size);
>> +		gem_close(s->fd, s->bo[i]);
>> +		if (s->done_fence[i])
>> +			syncobj_destroy(s->fd, s->done_fence[i]);
>> +	}
>>   	xe_exec_queue_destroy(s->fd, s->exec_queue_id);
>>   	xe_vm_destroy(s->fd, s->vm);
>> -	syncobj_destroy(s->fd, s->sync[0].handle);
>> +	free(s->addr);
>> +	free(s->bo);
>> +	free(s->spin);
>> +	free(s->done_fence);
>> +	free(s->submit_ts);
>>   }
>>   
>>   static void subm_workload_init(struct subm *s, struct subm_work_desc *work)
>> @@ -122,25 +148,41 @@ static void subm_workload_init(struct subm *s, struct subm_work_desc *work)
>>   	s->work = *work;
>>   	s->expected_ticks = xe_spin_nsec_to_ticks(s->fd, s->hwe.gt_id,
>>   						  s->work.duration_ms * 1000000);
>> -	xe_spin_init_opts(s->spin, .addr = s->addr, .preempt = s->work.preempt,
>> -			  .ctx_ticks = s->expected_ticks);
>> +	for (unsigned int i = 0; i < s->slots; i++)
>> +		xe_spin_init_opts(s->spin[i], .addr = s->addr[i],
>> +				  .preempt = s->work.preempt,
>> +				  .ctx_ticks = s->expected_ticks);
>>   }
>>   
>> -static void subm_wait(struct subm *s, uint64_t abs_timeout_nsec)
>> +static void subm_wait_slot(struct subm *s, unsigned int slot, uint64_t abs_timeout_nsec)
>>   {
>> -	igt_assert(syncobj_wait(s->fd, &s->sync[0].handle, 1, abs_timeout_nsec,
>> -				0, NULL));
>> +	igt_assert(syncobj_wait(s->fd, &s->done_fence[slot], 1,
>> +				abs_timeout_nsec, 0, NULL));
>>   }
>>   
>> -static void subm_exec(struct subm *s)
>> +static void subm_exec_slot(struct subm *s, unsigned int slot)
>>   {
>> -	syncobj_reset(s->fd, &s->sync[0].handle, 1);
>> +	struct timespec tv;
>> +	int nsync = 0;
>> +
>> +	syncobj_reset(s->fd, &s->done_fence[slot], 1);
>> +	memset(&s->sync[0], 0, sizeof(s->sync));
>> +	s->sync[nsync].type = DRM_XE_SYNC_TYPE_SYNCOBJ;
>> +	s->sync[nsync].flags = DRM_XE_SYNC_FLAG_SIGNAL;
>> +	s->sync[nsync].handle = s->done_fence[slot];
>> +	nsync++;
>> +	s->exec.num_syncs = nsync;
>> +	s->exec.syncs = to_user_pointer(&s->sync[0]);
>> +	s->exec.address = s->addr[slot];
>> +
>> +	igt_gettime(&tv);
>> +	s->submit_ts[slot] = (uint64_t)tv.tv_sec * (uint64_t)NSEC_PER_SEC + (uint64_t)tv.tv_nsec;
>>   	xe_exec(s->fd, &s->exec);
>>   }
>>   
>> -static bool subm_is_work_complete(struct subm *s)
>> +static bool subm_is_work_complete(struct subm *s, unsigned int slot)
>>   {
>> -	return s->expected_ticks <= ~s->spin->ticks_delta;
>> +	return s->expected_ticks <= ~s->spin[slot]->ticks_delta;
>>   }
>>   
>>   static bool subm_is_exec_queue_banned(struct subm *s)
>> @@ -157,6 +199,8 @@ static bool subm_is_exec_queue_banned(struct subm *s)
>>   static void subm_exec_loop(struct subm *s, struct subm_stats *stats,
>>   			   const struct subm_opts *opts)
>>   {
>> +	const unsigned int inflight = s->slots;
>> +	unsigned int submitted = 0;
>>   	struct timespec tv;
>>   	unsigned int i;
>>   
>> @@ -165,16 +209,27 @@ static void subm_exec_loop(struct subm *s, struct subm_stats *stats,
>>   		tv.tv_sec * (uint64_t)NSEC_PER_SEC + tv.tv_nsec;
>>   	igt_debug("[%s] start_timestamp: %f\n", s->id, stats->start_timestamp * 1e-9);
>>   
>> -	for (i = 0; i < s->work.repeats; ++i) {
>> -		igt_gettime(&tv);
>> +	/* Prefill */
>> +	if (s->work.repeats) {
>> +		unsigned int can_prefill = min(inflight, s->work.repeats);
>>   
>> -		subm_exec(s);
>> +		for (i = 0; i < can_prefill; i++)
>> +			subm_exec_slot(s, i % inflight);
>> +		submitted = can_prefill;
>> +	}
>>   
>> -		subm_wait(s, INT64_MAX);
>> +	/* Process completions in order: sample i -> slot (i % inflight) */
>> +	for (i = 0; i < s->work.repeats; ++i) {
>> +		unsigned int slot = i % inflight;
>> +
>> +		subm_wait_slot(s, slot, INT64_MAX);
>>   
>> -		igt_stats_push(&stats->samples, igt_nsec_elapsed(&tv));
>> +		igt_gettime(&tv);
>> +		stats->complete_ts[i] = (uint64_t)tv.tv_sec * (uint64_t)NSEC_PER_SEC +
>> +					(uint64_t)tv.tv_nsec;
>> +		igt_stats_push(&stats->samples, stats->complete_ts[i] - s->submit_ts[slot]);
>>   
>> -		if (!subm_is_work_complete(s)) {
>> +		if (!subm_is_work_complete(s, slot)) {
>>   			stats->num_early_finish++;
>>   
>>   			igt_debug("[%s] subm #%d early_finish=%u\n",
>> @@ -183,6 +238,14 @@ static void subm_exec_loop(struct subm *s, struct subm_stats *stats,
>>   			if (subm_is_exec_queue_banned(s))
>>   				break;
>>   		}
>> +
>> +		/* Keep the pipeline full */
>> +		if (submitted < s->work.repeats) {
>> +			unsigned int next_slot = submitted % inflight;
>> +
>> +			subm_exec_slot(s, next_slot);
>> +			submitted++;
>> +		}
>>   	}
>>   
>>   	igt_gettime(&tv);
>> @@ -272,8 +335,10 @@ static void subm_set_fini(struct subm_set *set)
>>   
>>   	subm_set_close_handles(set);
>>   
>> -	for (i = 0; i < set->ndata; ++i)
>> +	for (i = 0; i < set->ndata; ++i) {
>>   		igt_stats_fini(&set->data[i].stats.samples);
>> +		free(set->data[i].stats.complete_ts);
>> +	}
>>   
>>   	subm_set_free_data(set);
>>   }
>> @@ -334,16 +399,22 @@ static void compute_common_time_frame_stats(struct subm_set *set)
>>   	struct subm_stats *stats;
>>   	uint64_t common_start = 0;
>>   	uint64_t common_end = UINT64_MAX;
>> +	uint64_t first_ts, last_ts;
>>   
>> -	/* Find the common time frame */
>> +	/* Find common window from completion timestamps */
>>   	for (i = 0; i < ndata; i++) {
>>   		stats = &data[i].stats;
>>   
>> -		if (stats->start_timestamp > common_start)
>> -			common_start = stats->start_timestamp;
>> +		if (!stats->samples.n_values)
>> +			continue;
>>   
>> -		if (stats->end_timestamp < common_end)
>> -			common_end = stats->end_timestamp;
>> +		first_ts = stats->complete_ts[0];
>> +		last_ts = stats->complete_ts[stats->samples.n_values - 1];
>> +
>> +		if (first_ts > common_start)
>> +			common_start = first_ts;
>> +		if (last_ts < common_end)
>> +			common_end = last_ts;
>>   	}
>>   
>>   	igt_info("common time frame: [%" PRIu64 ";%" PRIu64 "] %.2fms\n",
>> @@ -354,8 +425,7 @@ static void compute_common_time_frame_stats(struct subm_set *set)
>>   
>>   	/* Compute concurrent_rate for each sample set within the common time frame */
>>   	for (i = 0; i < ndata; i++) {
>> -		uint64_t total_samples_duration = 0;
>> -		uint64_t samples_duration_in_common_frame = 0;
>> +		const double window_s = (common_end - common_start) * 1e-9;
>>   
>>   		stats = &data[i].stats;
>>   		stats->concurrent_execs = 0;
>> @@ -363,29 +433,21 @@ static void compute_common_time_frame_stats(struct subm_set *set)
>>   		stats->concurrent_mean = 0.0;
>>   
>>   		for (j = 0; j < stats->samples.n_values; j++) {
>> -			uint64_t sample_start = stats->start_timestamp + total_samples_duration;
>> -			uint64_t sample_end = sample_start + stats->samples.values_u64[j];
>> +			uint64_t cts = stats->complete_ts[j];
>>   
>> -			if (sample_start >= common_start &&
>> -			    sample_end <= common_end) {
>> +			if (cts >= common_start && cts <= common_end) {
>>   				stats->concurrent_execs++;
>> -				samples_duration_in_common_frame +=
>> -					stats->samples.values_u64[j];
>> +				stats->concurrent_mean += stats->samples.values_u64[j];
>>   			}
>> -
>> -			total_samples_duration += stats->samples.values_u64[j];
>>   		}
>>   
>> -		stats->concurrent_rate = samples_duration_in_common_frame ?
>> -				     (double)stats->concurrent_execs /
>> -					     (samples_duration_in_common_frame *
>> -					      1e-9) :
>> -				     0.0;
>> +		stats->concurrent_rate = (window_s > 0.0) ?
>> +					 ((double)stats->concurrent_execs / window_s) : 0.0;
>> +
>>   		stats->concurrent_mean = stats->concurrent_execs ?
>> -				      (double)samples_duration_in_common_frame /
>> -					      stats->concurrent_execs :
>> -				      0.0;
>> -		igt_info("[%s] Throughput = %.4f execs/s mean duration=%.4fms nsamples=%d\n",
>> +					 (double)stats->concurrent_mean /
>> +					 stats->concurrent_execs : 0.0;
>> +		igt_info("[%s] Throughput = %.4f execs/s mean submit->signal latency=%.4fms nsamples=%d\n",
>>   			 data[i].subm.id, stats->concurrent_rate, stats->concurrent_mean * 1e-6,
>>   			 stats->concurrent_execs);
>>   	}
>> @@ -439,9 +501,9 @@ static void log_sample_values(char *id, struct subm_stats *stats,
>>   }
>>   
>>   #define MIN_NUM_REPEATS 25
>> -#define MIN_EXEC_QUANTUM_MS 8
>> +#define MIN_EXEC_QUANTUM_MS 1
>>   #define MAX_EXEC_QUANTUM_MS 32
>> -#define MIN_JOB_DURATION_MS 16
>> +#define MIN_JOB_DURATION_MS 2
>>   #define MAX_TOTAL_DURATION_MS 15000
>>   #define PREFERRED_TOTAL_DURATION_MS 10000
>>   #define MAX_PREFERRED_REPEATS 100
>> @@ -546,6 +608,25 @@ static struct vf_sched_params prepare_vf_sched_params(int num_threads,
>>   	return params;
>>   }
>>   
>> +/* inflight K selection:
>> + *   user_k == 0  => auto
>> + *   user_k >= 1  => explicit K
>> + */
>> +static unsigned int select_inflight_k(unsigned int duration_ms,
>> +				      unsigned int user_k,
>> +				      bool nonpreempt)
>> +{
>> +	if (user_k)
>> +		return user_k >= 1 ? user_k : 1;
>> +	if (nonpreempt)
>> +		return 1;
>> +	if (duration_ms <= 12)
>> +		return 4;
>> +	if (duration_ms <= 20)
>> +		return 3;
>> +	return 2;
>> +}
>> +
>>   static struct job_sched_params
>>   prepare_job_sched_params(int num_threads, int job_timeout_ms, const struct subm_opts *opts)
>>   {
>> @@ -573,12 +654,14 @@ static void throughput_ratio(int pf_fd, int num_vfs, const struct subm_opts *opt
>>   	struct job_sched_params job_sched_params = prepare_job_sched_params(num_vfs + 1,
>>   									    job_timeout_ms,
>>   									    opts);
>> +	const unsigned int k = select_inflight_k(job_sched_params.duration_ms,
>> +						 opts->inflight, false);
>>   
>> -	igt_info("eq=%ums pt=%uus duration=%ums repeats=%d num_vfs=%d job_timeout=%ums\n",
>> +	igt_info("eq=%ums pt=%uus duration=%ums repeats=%d inflight=%u num_vfs=%d job_timeout=%ums\n",
>>   		 job_sched_params.sched_params.exec_quantum_ms,
>>   		 job_sched_params.sched_params.preempt_timeout_us,
>>   		 job_sched_params.duration_ms, job_sched_params.num_repeats,
>> -		 num_vfs + 1, job_timeout_ms);
>> +		 k, num_vfs + 1, job_timeout_ms);
>>   
>>   	init_vf_ids(vf_ids, ARRAY_SIZE(vf_ids),
>>   		    &(struct init_vf_ids_opts){ .shuffle = true,
>> @@ -607,7 +690,7 @@ static void throughput_ratio(int pf_fd, int num_vfs, const struct subm_opts *opt
>>   		igt_assert_fd(vf_fd);
>>   		set->data[n].opts = opts;
>>   		subm_init(&set->data[n].subm, vf_fd, vf_ids[n], 0,
>> -			  xe_engine(vf_fd, 0)->instance);
>> +			  xe_engine(vf_fd, 0)->instance, k);
>>   		subm_workload_init(&set->data[n].subm,
>>   				   &(struct subm_work_desc){
>>   					.duration_ms = job_sched_params.duration_ms,
>> @@ -615,6 +698,8 @@ static void throughput_ratio(int pf_fd, int num_vfs, const struct subm_opts *opt
>>   					.repeats = job_sched_params.num_repeats });
>>   		igt_stats_init_with_size(&set->data[n].stats.samples,
>>   					 set->data[n].subm.work.repeats);
>> +		set->data[n].stats.complete_ts = calloc(set->data[n].subm.work.repeats,
>> +							sizeof(uint64_t));
>>   		if (set->sync_method == SYNC_BARRIER)
>>   			set->data[n].barrier = &set->barrier;
>>   	}
>> @@ -670,10 +755,11 @@ static void nonpreempt_engine_resets(int pf_fd, int num_vfs,
>>   			       vf_sched_params.preempt_timeout_us / USEC_PER_MSEC;
>>   	int preemptible_end = 1;
>>   	uint8_t vf_ids[num_vfs + 1 /*PF*/];
>> +	const unsigned int k = select_inflight_k(duration_ms, opts->inflight, true);
>>   
>> -	igt_info("eq=%ums pt=%uus duration=%" PRIu64 "ms num_vfs=%d job_timeout=%ums\n",
>> +	igt_info("eq=%ums pt=%uus duration=%" PRIu64 "ms inflight=%u num_vfs=%d job_timeout=%ums\n",
>>   		 vf_sched_params.exec_quantum_ms, vf_sched_params.preempt_timeout_us,
>> -		 duration_ms, num_vfs, job_timeout_ms);
>> +		 duration_ms, k, num_vfs, job_timeout_ms);
>>   
>>   	init_vf_ids(vf_ids, ARRAY_SIZE(vf_ids),
>>   		    &(struct init_vf_ids_opts){ .shuffle = true,
>> @@ -702,7 +788,7 @@ static void nonpreempt_engine_resets(int pf_fd, int num_vfs,
>>   		igt_assert_fd(vf_fd);
>>   		set->data[n].opts = opts;
>>   		subm_init(&set->data[n].subm, vf_fd, vf_ids[n], 0,
>> -			  xe_engine(vf_fd, 0)->instance);
>> +			  xe_engine(vf_fd, 0)->instance, k);
>>   		subm_workload_init(&set->data[n].subm,
>>   				   &(struct subm_work_desc){
>>   					.duration_ms = duration_ms,
>> @@ -710,6 +796,8 @@ static void nonpreempt_engine_resets(int pf_fd, int num_vfs,
>>   					.repeats = MIN_NUM_REPEATS });
>>   		igt_stats_init_with_size(&set->data[n].stats.samples,
>>   					 set->data[n].subm.work.repeats);
>> +		set->data[n].stats.complete_ts = calloc(set->data[n].subm.work.repeats,
>> +							sizeof(uint64_t));
>>   		if (set->sync_method == SYNC_BARRIER)
>>   			set->data[n].barrier = &set->barrier;
>>   	}
>> @@ -738,6 +826,7 @@ static void nonpreempt_engine_resets(int pf_fd, int num_vfs,
>>   static struct subm_opts subm_opts = {
>>   	.sync_method = SYNC_BARRIER,
>>   	.outlier_treshold = 0.1,
>> +	.inflight = 0,
>>   };
>>   
>>   static bool extended_scope;
>> @@ -764,6 +853,16 @@ static int subm_opts_handler(int opt, int opt_index, void *data)
>>   		subm_opts.outlier_treshold = atoi(optarg) / 100.0;
>>   		igt_info("Outlier threshold: %.2f\n", subm_opts.outlier_treshold);
>>   		break;
>> +	case 'i': {
>> +		int val = atoi(optarg);
>> +
>> +		subm_opts.inflight = val > 0 ? val : 0;
>> +		if (subm_opts.inflight)
>> +			igt_info("In-flight submissions: %u\n", subm_opts.inflight);
>> +		else
>> +			igt_info("In-flight submissions: auto (0)\n");
>> +		break;
>> +	}
>>   	default:
>>   		return IGT_OPT_HANDLER_ERROR;
>>   	}
>> @@ -777,6 +876,7 @@ static const struct option long_opts[] = {
>>   	{ .name = "threshold", .has_arg = true, .val = 't', },
>>   	{ .name = "eq_ms", .has_arg = true, .val = 'q', },
>>   	{ .name = "pt_us", .has_arg = true, .val = 'p', },
>> +	{ .name = "inflight", .has_arg = true, .val = 'i', },
>>   	{}
>>   };
>>   
>> @@ -785,7 +885,8 @@ static const char help_str[] =
>>   	"  --sync\tThreads synchronization method: 0 - none 1 - barrier (Default 1)\n"
>>   	"  --threshold\tSample outlier threshold (Default 0.1)\n"
>>   	"  --eq_ms\texec_quantum_ms\n"
>> -	"  --pt_us\tpreempt_timeout_us\n";
>> +	"  --pt_us\tpreempt_timeout_us\n"
>> +	"  --inflight\tNumber of submissions kept in flight per VF (0=auto)\n";
>>   
>>   igt_main_args("", long_opts, help_str, subm_opts_handler, NULL)
>>   {
>> -- 
>> 2.31.1
>>