[PATCH i-g-t 3/8] tests/intel/xe_drm_fdinfo: Add helpers for spinning batches

Umesh Nerlige Ramappa umesh.nerlige.ramappa at intel.com
Mon Jul 1 17:27:54 UTC 2024


On Mon, Jul 01, 2024 at 11:57:14AM -0500, Lucas De Marchi wrote:
>On Sat, Jun 22, 2024 at 07:00:57AM GMT, Umesh Nerlige Ramappa wrote:
>>Add helpers for submitting batches and waiting for them to start.
>>
>>Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
>>---
>>tests/intel/xe_drm_fdinfo.c | 135 ++++++++++++++++++++++++++++++++++++
>>1 file changed, 135 insertions(+)
>>
>>diff --git a/tests/intel/xe_drm_fdinfo.c b/tests/intel/xe_drm_fdinfo.c
>>index 41409b2d2..27459b7f1 100644
>>--- a/tests/intel/xe_drm_fdinfo.c
>>+++ b/tests/intel/xe_drm_fdinfo.c
>>@@ -51,6 +51,17 @@ static const char *engine_map[] = {
>>	"vecs",
>>	"ccs",
>>};
>>+
>>+static const uint64_t batch_addr[] = {
>>+	0x170000,
>>+	0x180000,
>>+	0x190000,
>>+	0x1a0000,
>>+	0x1b0000,
>>+	0x1c0000,
>>+	0x1d0000,
>>+	0x1e0000,
>>+};
>>static void read_engine_cycles(int xe, struct pceu_cycles *pceu)
>>{
>>	struct drm_client_fdinfo info = { };
>>@@ -316,6 +327,130 @@ static void basic(int xe, unsigned int num_classes)
>>	}
>>}
>>
>>+#define MAX_PARALLEL 8
>>+struct xe_spin_ctx {
>>+	uint32_t vm;
>>+	uint64_t addr[MAX_PARALLEL];
>>+	struct drm_xe_sync sync[2];
>>+	struct drm_xe_exec exec;
>>+	uint32_t exec_queue;
>>+	size_t bo_size;
>>+	uint32_t bo;
>>+	struct xe_spin *spin;
>>+	struct xe_spin_opts spin_opts;
>>+	bool ended;
>>+	uint16_t class;
>>+	uint16_t width;
>>+	uint16_t num_placements;
>>+};
>>+
>>+static struct xe_spin_ctx *
>>+xe_spin_ctx_init(int fd, struct drm_xe_engine_class_instance *hwe, uint32_t vm,
>>+		 uint16_t width, uint16_t num_placements)
>>+{
>>+	struct xe_spin_ctx *ctx = calloc(1, sizeof(*ctx));
>>+
>>+	igt_assert(width && num_placements &&
>>+		   (width == 1 || num_placements == 1));
>>+
>>+	igt_assert(width <= MAX_PARALLEL);
>>+
>>+	ctx->class = hwe->engine_class;
>>+	ctx->width = width;
>>+	ctx->num_placements = num_placements;
>>+	ctx->vm = vm;
>>+	for (int i = 0; i < ctx->width; i++)
>>+		ctx->addr[i] = batch_addr[hwe->engine_class];
>>+
>>+	ctx->exec.num_batch_buffer = width;
>>+	ctx->exec.num_syncs = 2;
>>+	ctx->exec.syncs = to_user_pointer(ctx->sync);
>>+
>>+	ctx->sync[0].type = DRM_XE_SYNC_TYPE_SYNCOBJ;
>>+	ctx->sync[0].flags = DRM_XE_SYNC_FLAG_SIGNAL;
>>+	ctx->sync[0].handle = syncobj_create(fd, 0);
>>+
>>+	ctx->sync[1].type = DRM_XE_SYNC_TYPE_SYNCOBJ;
>>+	ctx->sync[1].flags = DRM_XE_SYNC_FLAG_SIGNAL;
>>+	ctx->sync[1].handle = syncobj_create(fd, 0);
>>+
>>+	ctx->bo_size = sizeof(struct xe_spin);
>>+	ctx->bo_size = xe_bb_size(fd, ctx->bo_size);
>>+	ctx->bo = xe_bo_create(fd, ctx->vm, ctx->bo_size,
>>+			       vram_if_possible(fd, hwe->gt_id),
>>+			       DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
>>+	ctx->spin = xe_bo_map(fd, ctx->bo, ctx->bo_size);
>>+
>>+	igt_assert_eq(__xe_exec_queue_create(fd, ctx->vm, width, num_placements,
>>+					     hwe, 0, &ctx->exec_queue), 0);
>>+
>>+	xe_vm_bind_async(fd, ctx->vm, 0, ctx->bo, 0, ctx->addr[0], ctx->bo_size,
>>+			 ctx->sync, 1);
>>+
>>+	return ctx;
>>+}
>>+
>>+static void
>>+xe_spin_sync_start(int fd, struct xe_spin_ctx *ctx)
>
>I don't think we should create these wrappers on each individual test.
>If a wrapper like this is needed, can we add the proper abstraction in
>lib/xe/xe_spin.{c,h}?

The wrappers are required so that the test can control exactly when 
utilization is sampled relative to a running batch; see the sketch below.
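
For reference, the intended calling sequence looks roughly like the 
sketch below (a hypothetical caller with a made-up name, 
sample_while_spinning(); the real tests come later in this series). The 
point is that read_engine_cycles() can be called while the spinner is 
guaranteed to be running on the engine:

	/*
	 * Hypothetical caller, only to illustrate the intended usage of
	 * the helpers above; the actual tests come later in this series.
	 */
	static void sample_while_spinning(int xe,
					  struct drm_xe_engine_class_instance *hwe)
	{
		struct pceu_cycles before[DRM_XE_ENGINE_CLASS_COMPUTE + 1];
		struct pceu_cycles after[DRM_XE_ENGINE_CLASS_COMPUTE + 1];
		uint32_t vm = xe_vm_create(xe, 0, 0);
		struct xe_spin_ctx *ctx;

		/* width = 1, num_placements = 1: plain (non-parallel) submission */
		ctx = xe_spin_ctx_init(xe, hwe, vm, 1, 1);

		/* returns only after the spinner has started on the engine */
		xe_spin_sync_start(xe, ctx);

		read_engine_cycles(xe, before);
		usleep(10000);
		read_engine_cycles(xe, after);

		/* stop the spinner, wait for it to drain and clean up */
		xe_spin_sync_end(xe, ctx);
		xe_spin_ctx_destroy(xe, ctx);
		xe_vm_destroy(xe, vm);

		/* after[class] - before[class] now reflects busy time on hwe */
	}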

I thought the general rule was to add code to the IGT library once there 
is more than one user of it. Right now this test suite is the only user 
of this breakdown.

In the future, single-engine utilization exposed from the GuC (busy v3) 
will make use of these helpers to build tests for normal, virtual and 
parallel submissions. In addition to sampling counters before and after, 
those tests will also sample perf counters while the batch is actively 
running on the engine, so the breakdown provided by the above helpers is 
useful. I can work on abstracting it into the library then, since we 
would have more users and more data on what the abstractions need.

For now, I think we should target test coverage and treat the 
abstraction as a follow-up improvement when implementing the busy v3 
tests. Does that sound reasonable?

Thanks,
Umesh

>
>Lucas De Marchi
>
>>+{
>>+	if (!ctx)
>>+		return;
>>+
>>+	ctx->spin_opts.addr = ctx->addr[0];
>>+	ctx->spin_opts.preempt = true;
>>+	xe_spin_init(ctx->spin, &ctx->spin_opts);
>>+
>>+	/* re-use sync[0] for exec */
>>+	ctx->sync[0].flags &= ~DRM_XE_SYNC_FLAG_SIGNAL;
>>+
>>+	ctx->exec.exec_queue_id = ctx->exec_queue;
>>+	if (ctx->width > 1)
>>+		ctx->exec.address = to_user_pointer(ctx->addr);
>>+	else
>>+		ctx->exec.address = ctx->addr[0];
>>+	xe_exec(fd, &ctx->exec);
>>+
>>+	xe_spin_wait_started(ctx->spin);
>>+	igt_assert(!syncobj_wait(fd, &ctx->sync[1].handle, 1, 1, 0, NULL));
>>+
>>+	igt_debug("%s: spinner started\n", engine_map[ctx->class]);
>>+}
>>+
>>+static void
>>+xe_spin_sync_end(int fd, struct xe_spin_ctx *ctx)
>>+{
>>+	if (!ctx || ctx->ended)
>>+		return;
>>+
>>+	xe_spin_end(ctx->spin);
>>+
>>+	igt_assert(syncobj_wait(fd, &ctx->sync[1].handle, 1, INT64_MAX, 0, NULL));
>>+	igt_assert(syncobj_wait(fd, &ctx->sync[0].handle, 1, INT64_MAX, 0, NULL));
>>+
>>+	ctx->sync[0].flags |= DRM_XE_SYNC_FLAG_SIGNAL;
>>+	xe_vm_unbind_async(fd, ctx->vm, 0, 0, ctx->addr[0], ctx->bo_size, ctx->sync, 1);
>>+	igt_assert(syncobj_wait(fd, &ctx->sync[0].handle, 1, INT64_MAX, 0, NULL));
>>+
>>+	ctx->ended = true;
>>+	igt_debug("%s: spinner ended\n", engine_map[ctx->class]);
>>+}
>>+
>>+static void
>>+xe_spin_ctx_destroy(int fd, struct xe_spin_ctx *ctx)
>>+{
>>+	if (!ctx)
>>+		return;
>>+
>>+	syncobj_destroy(fd, ctx->sync[0].handle);
>>+	syncobj_destroy(fd, ctx->sync[1].handle);
>>+	xe_exec_queue_destroy(fd, ctx->exec_queue);
>>+
>>+	munmap(ctx->spin, ctx->bo_size);
>>+	gem_close(fd, ctx->bo);
>>+
>>+	free(ctx);
>>+}
>>+
>>igt_main
>>{
>>	struct drm_xe_engine_class_instance *hwe;
>>-- 
>>2.34.1
>>

