[PATCH i-g-t 1/2] tests/intel/xe_drm_fdinfo: Wire up parallel/virtual submission

Matthew Brost matthew.brost at intel.com
Fri Sep 6 21:05:01 UTC 2024


On Fri, Sep 06, 2024 at 01:54:36PM -0500, Lucas De Marchi wrote:
> On Fri, Sep 06, 2024 at 05:56:59PM GMT, Matthew Brost wrote:
> > On Wed, Sep 04, 2024 at 03:57:45PM -0700, Lucas De Marchi wrote:
> > > Add the boilerplate code for parallel and virtual submission in the
> > > spin_ctx_* helpers and check_results(). This is based on previous
> > > code by Umesh that was simplified before applying.
> > > 
> > > Cc: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
> > > Signed-off-by: Lucas De Marchi <lucas.demarchi at intel.com>
> > > ---
> > >  tests/intel/xe_drm_fdinfo.c | 61 ++++++++++++++++++++++++-------------
> > >  1 file changed, 40 insertions(+), 21 deletions(-)
> > > 
> > > diff --git a/tests/intel/xe_drm_fdinfo.c b/tests/intel/xe_drm_fdinfo.c
> > > index d1ed0fcaa..8acb95040 100644
> > > --- a/tests/intel/xe_drm_fdinfo.c
> > > +++ b/tests/intel/xe_drm_fdinfo.c
> > > @@ -365,7 +365,7 @@ static void basic_engine_utilization(int xe)
> > > 
> > >  struct spin_ctx {
> > >  	uint32_t vm;
> > > -	uint64_t addr;
> > > +	uint64_t addr[XE_MAX_ENGINE_INSTANCE];
> > >  	struct drm_xe_sync sync[2];
> > >  	struct drm_xe_exec exec;
> > >  	uint32_t exec_queue;
> > > @@ -375,18 +375,29 @@ struct spin_ctx {
> > >  	struct xe_spin_opts spin_opts;
> > >  	bool ended;
> > >  	uint16_t class;
> > > +	uint16_t width;
> > > +	uint16_t num_placements;
> > >  };
> > > 
> > >  static struct spin_ctx *
> > > -spin_ctx_init(int fd, struct drm_xe_engine_class_instance *hwe, uint32_t vm)
> > > +spin_ctx_init(int fd, struct drm_xe_engine_class_instance *hwe, uint32_t vm,
> > > +	      uint16_t width, uint16_t num_placements)
> > >  {
> > >  	struct spin_ctx *ctx = calloc(1, sizeof(*ctx));
> > > 
> > > +	igt_assert(width && num_placements &&
> > > +		   (width == 1 || num_placements == 1));
> > > +	igt_assert_lt(width, XE_MAX_ENGINE_INSTANCE);
> > > +
> > >  	ctx->class = hwe->engine_class;
> > > +	ctx->width = width;
> > > +	ctx->num_placements = num_placements;
> > >  	ctx->vm = vm;
> > > -	ctx->addr = 0x100000 + 0x100000 * hwe->engine_class;
> > > 
> > > -	ctx->exec.num_batch_buffer = 1;
> > > +	for (unsigned int i = 0; i < width; i++)
> > > +		ctx->addr[i] = 0x100000 + 0x100000 * hwe->engine_class;
> > > +
> > > +	ctx->exec.num_batch_buffer = width;
> > >  	ctx->exec.num_syncs = 2;
> > >  	ctx->exec.syncs = to_user_pointer(ctx->sync);
> > > 
> > > @@ -405,10 +416,10 @@ spin_ctx_init(int fd, struct drm_xe_engine_class_instance *hwe, uint32_t vm)
> > >  			       DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> > >  	ctx->spin = xe_bo_map(fd, ctx->bo, ctx->bo_size);
> > > 
> > > -	igt_assert_eq(__xe_exec_queue_create(fd, ctx->vm, 1, 1,
> > > +	igt_assert_eq(__xe_exec_queue_create(fd, ctx->vm, width, num_placements,
> > >  					     hwe, 0, &ctx->exec_queue), 0);
> > > 
> > > -	xe_vm_bind_async(fd, ctx->vm, 0, ctx->bo, 0, ctx->addr, ctx->bo_size,
> > > +	xe_vm_bind_async(fd, ctx->vm, 0, ctx->bo, 0, ctx->addr[0], ctx->bo_size,
> > >  			 ctx->sync, 1);
> > > 
> > >  	return ctx;
> > > @@ -420,7 +431,7 @@ spin_sync_start(int fd, struct spin_ctx *ctx)
> > >  	if (!ctx)
> > >  		return;
> > > 
> > > -	ctx->spin_opts.addr = ctx->addr;
> > > +	ctx->spin_opts.addr = ctx->addr[0];
> > >  	ctx->spin_opts.write_timestamp = true;
> > >  	ctx->spin_opts.preempt = true;
> > >  	xe_spin_init(ctx->spin, &ctx->spin_opts);
> > > @@ -429,7 +440,12 @@ spin_sync_start(int fd, struct spin_ctx *ctx)
> > >  	ctx->sync[0].flags &= ~DRM_XE_SYNC_FLAG_SIGNAL;
> > > 
> > >  	ctx->exec.exec_queue_id = ctx->exec_queue;
> > > -	ctx->exec.address = ctx->addr;
> > > +
> > > +	if (ctx->width > 1)
> > > +		ctx->exec.address = to_user_pointer(ctx->addr);
> > > +	else
> > > +		ctx->exec.address = ctx->addr[0];
> > > +
> > >  	xe_exec(fd, &ctx->exec);
> > > 
> > >  	xe_spin_wait_started(ctx->spin);
> > > @@ -450,7 +466,7 @@ spin_sync_end(int fd, struct spin_ctx *ctx)
> > >  	igt_assert(syncobj_wait(fd, &ctx->sync[0].handle, 1, INT64_MAX, 0, NULL));
> > > 
> > >  	ctx->sync[0].flags |= DRM_XE_SYNC_FLAG_SIGNAL;
> > > -	xe_vm_unbind_async(fd, ctx->vm, 0, 0, ctx->addr, ctx->bo_size, ctx->sync, 1);
> > > +	xe_vm_unbind_async(fd, ctx->vm, 0, 0, ctx->addr[0], ctx->bo_size, ctx->sync, 1);
> > >  	igt_assert(syncobj_wait(fd, &ctx->sync[0].handle, 1, INT64_MAX, 0, NULL));
> > > 
> > >  	ctx->ended = true;
> > > @@ -476,7 +492,7 @@ spin_ctx_destroy(int fd, struct spin_ctx *ctx)
> > > 
> > >  static void
> > >  check_results(struct pceu_cycles *s1, struct pceu_cycles *s2,
> > > -	      int class, enum expected_load expected_load)
> > > +	      int class, int width, enum expected_load expected_load)
> > >  {
> > >  	double percent;
> > >  	u64 den, num;
> > > @@ -490,6 +506,9 @@ check_results(struct pceu_cycles *s1, struct pceu_cycles *s2,
> > >  	den = s2[class].total_cycles - s1[class].total_cycles;
> > >  	percent = (num * 100.0) / (den + 1);
> > > 
> > > +	/* for parallel submission, scale the busyness by width */
> > > +	percent /= width;
> > 
> > This doesn't look right. Wouldn't a width > 1 submission be busier?
> > 
> > Maybe I'm confusing myself, but everything else LGTM.
> 
> the thing is... fdinfo reports the number of engines of each class
> (drm-engine-capacity-<engine>) and expects the reader to scale
> total_cycles according to that number.  We are lazy here: instead of
> using that number, we use the width that was passed as input.
> 
> Example, with "*" denoting when the cycle counter is ticking.
> 
> 		s1		    s2
> ccs0		[**********..........] == 10
> ccs1		[**********..........] == 10
> total_cycles	[********************] == 20
> 
> cycles[ccs] == 20
> total_cycles == 20
> width == 2
> 
> Calculating according to check_results():
> 
> num = 20
> den = 20
> percent = (20 * 100) / 20 == 100
> percent /= width, i.e. 100 / 2 == 50%
> 
> which corresponds to the reality that this client occupied CCS
> for 50% of the available time.

I kinda reasoned this after sending my reply. Thanks for confirming.
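
For anyone reading this later in the archive, here is a minimal sketch of
what the scaled check boils down to. It uses the names from this patch,
but the helper name and the threshold are illustrative, not the exact
test code:

/*
 * s1/s2 are per-class snapshots of the client's fdinfo counters taken
 * before and after the spinner ran; width is the number of parallel
 * placements the exec queue spans.
 */
static void check_scaled_busyness(struct pceu_cycles *s1,
				  struct pceu_cycles *s2,
				  int class, int width)
{
	u64 num = s2[class].cycles - s1[class].cycles;
	u64 den = s2[class].total_cycles - s1[class].total_cycles;
	double percent = (num * 100.0) / (den + 1);

	/*
	 * A width-N job accumulates cycles on N engines while
	 * total_cycles ticks once for the whole class, so divide by
	 * width to get the class-wide occupancy. A generic fdinfo
	 * reader would divide by drm-engine-capacity-<engine> instead.
	 */
	percent /= width;

	igt_assert(percent > 95.0); /* illustrative threshold */
}

(Dividing by width matches dividing by the reported capacity only when
the parallel job spans every engine of the class, which is the situation
in your example above, if I'm reading it right.)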

With that:
Reviewed-by: Matthew Brost <matthew.brost at intel.com>

> 
> Lucas De Marchi
> 
> > 
> > Matt
> > 
> > > +
> > >  	igt_debug("%s: percent: %f\n", engine_map[class], percent);
> > > 
> > >  	switch (expected_load) {
> > > @@ -522,7 +541,7 @@ utilization_single(int fd, struct drm_xe_engine_class_instance *hwe, unsigned in
> > > 
> > >  	vm = xe_vm_create(fd, 0, 0);
> > >  	if (flags & TEST_BUSY) {
> > > -		ctx = spin_ctx_init(fd, hwe, vm);
> > > +		ctx = spin_ctx_init(fd, hwe, vm, 1, 1);
> > >  		spin_sync_start(fd, ctx);
> > >  	}
> > > 
> > > @@ -540,14 +559,14 @@ utilization_single(int fd, struct drm_xe_engine_class_instance *hwe, unsigned in
> > > 
> > >  	expected_load = flags & TEST_BUSY ?
> > >  	       EXPECTED_LOAD_FULL : EXPECTED_LOAD_IDLE;
> > > -	check_results(pceu1[0], pceu2[0], hwe->engine_class, expected_load);
> > > +	check_results(pceu1[0], pceu2[0], hwe->engine_class, 1, expected_load);
> > > 
> > >  	if (flags & TEST_ISOLATION) {
> > >  		/*
> > >  		 * Load from one client shouldn't spill on another,
> > >  		 * so check for idle
> > >  		 */
> > > -		check_results(pceu1[1], pceu2[1], hwe->engine_class, EXPECTED_LOAD_IDLE);
> > > +		check_results(pceu1[1], pceu2[1], hwe->engine_class, 1, EXPECTED_LOAD_IDLE);
> > >  		close(new_fd);
> > >  	}
> > > 
> > > @@ -565,7 +584,7 @@ utilization_single_destroy_queue(int fd, struct drm_xe_engine_class_instance *hw
> > >  	uint32_t vm;
> > > 
> > >  	vm = xe_vm_create(fd, 0, 0);
> > > -	ctx = spin_ctx_init(fd, hwe, vm);
> > > +	ctx = spin_ctx_init(fd, hwe, vm, 1, 1);
> > >  	spin_sync_start(fd, ctx);
> > > 
> > >  	read_engine_cycles(fd, pceu1);
> > > @@ -579,7 +598,7 @@ utilization_single_destroy_queue(int fd, struct drm_xe_engine_class_instance *hw
> > > 
> > >  	xe_vm_destroy(fd, vm);
> > > 
> > > -	check_results(pceu1, pceu2, hwe->engine_class, EXPECTED_LOAD_FULL);
> > > +	check_results(pceu1, pceu2, hwe->engine_class, 1, EXPECTED_LOAD_FULL);
> > >  }
> > > 
> > >  static void
> > > @@ -593,7 +612,7 @@ utilization_others_idle(int fd, struct drm_xe_engine_class_instance *hwe)
> > > 
> > >  	vm = xe_vm_create(fd, 0, 0);
> > > 
> > > -	ctx = spin_ctx_init(fd, hwe, vm);
> > > +	ctx = spin_ctx_init(fd, hwe, vm, 1, 1);
> > >  	spin_sync_start(fd, ctx);
> > > 
> > >  	read_engine_cycles(fd, pceu1);
> > > @@ -605,7 +624,7 @@ utilization_others_idle(int fd, struct drm_xe_engine_class_instance *hwe)
> > >  		enum expected_load expected_load = hwe->engine_class != class ?
> > >  			EXPECTED_LOAD_IDLE : EXPECTED_LOAD_FULL;
> > > 
> > > -		check_results(pceu1, pceu2, class, expected_load);
> > > +		check_results(pceu1, pceu2, class, 1, expected_load);
> > >  	}
> > > 
> > >  	spin_sync_end(fd, ctx);
> > > @@ -632,7 +651,7 @@ utilization_others_full_load(int fd, struct drm_xe_engine_class_instance *hwe)
> > >  		if (_class == hwe->engine_class || ctx[_class])
> > >  			continue;
> > > 
> > > -		ctx[_class] = spin_ctx_init(fd, _hwe, vm);
> > > +		ctx[_class] = spin_ctx_init(fd, _hwe, vm, 1, 1);
> > >  		spin_sync_start(fd, ctx[_class]);
> > >  	}
> > > 
> > > @@ -649,7 +668,7 @@ utilization_others_full_load(int fd, struct drm_xe_engine_class_instance *hwe)
> > >  		if (!ctx[class])
> > >  			continue;
> > > 
> > > -		check_results(pceu1, pceu2, class, expected_load);
> > > +		check_results(pceu1, pceu2, class, 1, expected_load);
> > >  		spin_sync_end(fd, ctx[class]);
> > >  		spin_ctx_destroy(fd, ctx[class]);
> > >  	}
> > > @@ -675,7 +694,7 @@ utilization_all_full_load(int fd)
> > >  		if (ctx[class])
> > >  			continue;
> > > 
> > > -		ctx[class] = spin_ctx_init(fd, hwe, vm);
> > > +		ctx[class] = spin_ctx_init(fd, hwe, vm, 1, 1);
> > >  		spin_sync_start(fd, ctx[class]);
> > >  	}
> > > 
> > > @@ -689,7 +708,7 @@ utilization_all_full_load(int fd)
> > >  		if (!ctx[class])
> > >  			continue;
> > > 
> > > -		check_results(pceu1, pceu2, class, EXPECTED_LOAD_FULL);
> > > +		check_results(pceu1, pceu2, class, 1, EXPECTED_LOAD_FULL);
> > >  		spin_sync_end(fd, ctx[class]);
> > >  		spin_ctx_destroy(fd, ctx[class]);
> > >  	}
> > > --
> > > 2.43.0
> > > 

