[PATCH i-g-t v2 3/4] lib/intel_compute: Add LNL compute preempt library support

Tue Jan 16 21:27:55 UTC 2024

From: Janga Rahul Kumar <janga.rahul.kumar at intel.com>

Add STATE_SIP instruction in compute pipeline.
Add library support to submit long and short opencl kernels
to exercise preemption scenario.

Cc: Nirmoy Das <nirmoy.das at intel.com>
Signed-off-by: Janga Rahul Kumar <janga.rahul.kumar at intel.com>
---
 include/intel_gpu_commands.h |   2 +
 lib/intel_compute.c          | 270 ++++++++++++++++++++++++++++++++++-
 lib/intel_compute.h          |   2 +-
 3 files changed, 270 insertions(+), 4 deletions(-)

diff --git a/include/intel_gpu_commands.h b/include/intel_gpu_commands.h
index 5a98fb63f..fe734c4bb 100644
--- a/include/intel_gpu_commands.h
+++ b/include/intel_gpu_commands.h
@@ -441,6 +441,8 @@
 #define GSC_FW_LOAD GSC_INSTR(1, 0, 2)
 #define   HECI1_FW_LIMIT_VALID (1 << 31)
 
+#define XE2_STATE_SIP \
+	((0x3 << 29) | (0x0 << 27) | (0x1 << 24) | (0x1 << 17))
 /*
  * Used to convert any address to canonical form.
  * Starting from gen8, some commands (e.g. STATE_BASE_ADDRESS,
diff --git a/lib/intel_compute.c b/lib/intel_compute.c
index 723f4ff0c..d4245d088 100644
--- a/lib/intel_compute.c
+++ b/lib/intel_compute.c
@@ -41,6 +41,7 @@
 #define OFFSET_BINDING_TABLE		0x1000
 
 #define XE2_ADDR_STATE_CONTEXT_DATA_BASE	0x900000UL
+#define OFFSET_STATE_SIP			0xFFFF0000
 
 struct bo_dict_entry {
 	uint64_t addr;
@@ -1160,7 +1161,8 @@ static void xe2lpg_compute_exec_compute(uint32_t *addr_bo_buffer_batch,
 					uint64_t addr_instruction_state_base,
 					uint64_t addr_state_contect_data_base,
 					uint64_t offset_indirect_data_start,
-					uint64_t kernel_start_pointer)
+					uint64_t kernel_start_pointer,
+					uint64_t sip_start_pointer)
 {
 	int b = 0;
 
@@ -1172,6 +1174,7 @@ static void xe2lpg_compute_exec_compute(uint32_t *addr_bo_buffer_batch,
 	igt_debug("state context data base addr: %lx\n", addr_state_contect_data_base);
 	igt_debug("offset indirect addr: %lx\n", offset_indirect_data_start);
 	igt_debug("kernel start pointer: %lx\n", kernel_start_pointer);
+	igt_debug("sip start pointer: %lx\n", sip_start_pointer);
 
 	addr_bo_buffer_batch[b++] = GEN7_PIPELINE_SELECT | GEN9_PIPELINE_SELECTION_MASK |
 				    PIPELINE_SELECT_GPGPU;
@@ -1220,6 +1223,12 @@ static void xe2lpg_compute_exec_compute(uint32_t *addr_bo_buffer_batch,
 	addr_bo_buffer_batch[b++] = addr_surface_state_base >> 32;
 	addr_bo_buffer_batch[b++] = 0x001ff000;
 
+	if (sip_start_pointer) {
+		addr_bo_buffer_batch[b++] = XE2_STATE_SIP | 0x1;
+		addr_bo_buffer_batch[b++] = sip_start_pointer;
+		addr_bo_buffer_batch[b++] = 0x00000000;
+	}
+
 	addr_bo_buffer_batch[b++] = XEHP_COMPUTE_WALKER | 0x26;
 	addr_bo_buffer_batch[b++] = 0x00000000;
 	addr_bo_buffer_batch[b++] = 0x00000040;
@@ -1242,7 +1251,7 @@ static void xe2lpg_compute_exec_compute(uint32_t *addr_bo_buffer_batch,
 
 	addr_bo_buffer_batch[b++] = kernel_start_pointer;
 	addr_bo_buffer_batch[b++] = 0x00000000;
-	addr_bo_buffer_batch[b++] = 0x00000000;
+	addr_bo_buffer_batch[b++] = 0x00100000;
 	addr_bo_buffer_batch[b++] = 0x00000000;
 	addr_bo_buffer_batch[b++] = 0x00000000;
 	addr_bo_buffer_batch[b++] = 0x0c000020;
@@ -1265,6 +1274,31 @@ static void xe2lpg_compute_exec_compute(uint32_t *addr_bo_buffer_batch,
 	addr_bo_buffer_batch[b++] = MI_BATCH_BUFFER_END;
 }
 
+static void xe2_create_indirect_data_inc_kernel(uint32_t *addr_bo_buffer_batch,
+						uint64_t addr_input,
+						uint64_t addr_output,
+						unsigned int loop_count)
+{
+	int b = 0;
+
+	addr_bo_buffer_batch[b++] = addr_input & 0xffffffff;
+	addr_bo_buffer_batch[b++] = addr_input >> 32;
+	addr_bo_buffer_batch[b++] = addr_output & 0xffffffff;
+	addr_bo_buffer_batch[b++] = addr_output >> 32;
+	addr_bo_buffer_batch[b++] = loop_count;
+	addr_bo_buffer_batch[b++] = 0x00000000;
+	addr_bo_buffer_batch[b++] = 0x00000000;
+	addr_bo_buffer_batch[b++] = 0x00000000;
+	addr_bo_buffer_batch[b++] = 0x00000400;
+	addr_bo_buffer_batch[b++] = 0x00000001;
+	addr_bo_buffer_batch[b++] = 0x00000001;
+	addr_bo_buffer_batch[b++] = 0x00000000;
+	addr_bo_buffer_batch[b++] = 0x00000000;
+	addr_bo_buffer_batch[b++] = 0x00000000;
+	addr_bo_buffer_batch[b++] = 0x00000000;
+	addr_bo_buffer_batch[b++] = 0x00000000;
+}
+
 /**
  * xe2lpg_compute_exec - run a pipeline compatible with XE2
  *
@@ -1335,7 +1369,7 @@ static void xe2lpg_compute_exec(int fd, const unsigned char *kernel,
 				  ADDR_INSTRUCTION_STATE_BASE,
 				  XE2_ADDR_STATE_CONTEXT_DATA_BASE,
 				  OFFSET_INDIRECT_DATA_START,
-				  OFFSET_KERNEL);
+				  OFFSET_KERNEL, 0);
 
 	bo_execenv_exec(&execenv, ADDR_BATCH);
 
@@ -1474,3 +1508,233 @@ bool xe_run_intel_compute_kernel_on_engine(int fd,
 
 	return __run_intel_compute_kernel(fd, eci);
 }
+
+/**
+ * xe2lpg_compute_preempt_exec - run a pipeline compatible with XE2 and
+ * submit long and short kernels for preemption occurrence.
+ *
+ * @fd: file descriptor of the opened DRM device
+ * @long_kernel: GPU long kernel binary to be executed
+ * @long_kernel_size: size of @long_kernel
+ * @short_kernel: GPU short kernel binary to be executed
+ * @short_kernel_size: size of @short_kernel
+ * @sip_kernel: WMTP sip kernel which does save restore during preemption
+ * @sip_kernel_size: size of @sip_kernel
+ */
+static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel,
+					unsigned int long_kernel_size,
+					const unsigned char *short_kernel,
+					unsigned int short_kernel_size,
+					const unsigned char *sip_kernel,
+					unsigned int sip_kernel_size)
+{
+#define XE2_BO_PREEMPT_DICT_ENTRIES 11
+	struct bo_dict_entry bo_dict_long[XE2_BO_PREEMPT_DICT_ENTRIES] = {
+		{ .addr = ADDR_INSTRUCTION_STATE_BASE + OFFSET_KERNEL,
+		  .name = "instr state base"},
+		{ .addr = ADDR_DYNAMIC_STATE_BASE,
+		  .size = 0x100000,
+		  .name = "dynamic state base"},
+		{ .addr = ADDR_SURFACE_STATE_BASE,
+		  .size = 0x1000,
+		  .name = "surface state base"},
+		{ .addr = ADDR_GENERAL_STATE_BASE + OFFSET_INDIRECT_DATA_START,
+		  .size =  0x1000,
+		  .name = "indirect object base"},
+		{ .addr = ADDR_INPUT, .size = SIZE_BUFFER_INPUT,
+		  .name = "addr input"},
+		{ .addr = ADDR_OUTPUT, .size = SIZE_BUFFER_OUTPUT,
+		  .name = "addr output" },
+		{ .addr = ADDR_GENERAL_STATE_BASE, .size = 0x100000,
+		  .name = "general state base" },
+		{ .addr = ADDR_SURFACE_STATE_BASE + OFFSET_BINDING_TABLE,
+		  .size = 0x1000,
+		  .name = "binding table" },
+		{ .addr = ADDR_BATCH,
+		  .size = SIZE_BATCH,
+		  .name = "batch" },
+		{ .addr = XE2_ADDR_STATE_CONTEXT_DATA_BASE,
+		  .size = 0x6400000,
+		  .name = "state context data base"},
+		{ .addr = ADDR_INSTRUCTION_STATE_BASE + OFFSET_STATE_SIP,
+		  .name = "sip kernel"},
+	};
+
+	struct bo_dict_entry bo_dict_short[XE2_BO_PREEMPT_DICT_ENTRIES];
+	struct bo_execenv execenv_short, execenv_long;
+	float *dinput;
+	unsigned int long_kernel_loop_count = 1000000;
+
+	for (int i = 0; i < XE2_BO_PREEMPT_DICT_ENTRIES; ++i)
+		bo_dict_short[i] = bo_dict_long[i];
+
+	bo_execenv_create(fd, &execenv_short, NULL);
+	bo_execenv_create(fd, &execenv_long, NULL);
+
+	bo_dict_long[0].size = ALIGN(long_kernel_size, 0x1000);
+	bo_dict_short[0].size = ALIGN(short_kernel_size, 0x1000);
+
+	bo_dict_long[10].size = ALIGN(sip_kernel_size, 0x1000);
+	bo_dict_short[10].size = ALIGN(sip_kernel_size, 0x1000);
+
+	bo_execenv_bind(&execenv_long, bo_dict_long, XE2_BO_PREEMPT_DICT_ENTRIES);
+	bo_execenv_bind(&execenv_short, bo_dict_short, XE2_BO_PREEMPT_DICT_ENTRIES);
+
+	memcpy(bo_dict_long[0].data, long_kernel, long_kernel_size);
+	memcpy(bo_dict_short[0].data, short_kernel, short_kernel_size);
+
+	memcpy(bo_dict_long[10].data, sip_kernel, sip_kernel_size);
+	memcpy(bo_dict_short[10].data, sip_kernel, sip_kernel_size);
+
+	create_dynamic_state(bo_dict_long[1].data, OFFSET_KERNEL);
+	xehp_create_surface_state(bo_dict_long[2].data, ADDR_INPUT, ADDR_OUTPUT);
+	xe2_create_indirect_data_inc_kernel(bo_dict_long[3].data, ADDR_INPUT, ADDR_OUTPUT,
+					    long_kernel_loop_count);
+	xehp_create_surface_state(bo_dict_long[7].data, ADDR_INPUT, ADDR_OUTPUT);
+
+	create_dynamic_state(bo_dict_short[1].data, OFFSET_KERNEL);
+	xehp_create_surface_state(bo_dict_short[2].data, ADDR_INPUT, ADDR_OUTPUT);
+	xehp_create_indirect_data(bo_dict_short[3].data, ADDR_INPUT, ADDR_OUTPUT);
+	xehp_create_surface_state(bo_dict_short[7].data, ADDR_INPUT, ADDR_OUTPUT);
+
+	dinput = (float *)bo_dict_long[4].data;
+	srand(time(NULL));
+
+	for (int i = 0; i < SIZE_DATA; i++)
+		((float *)dinput)[i] = rand() / (float)RAND_MAX;
+
+	dinput = (float *)bo_dict_short[4].data;
+
+	for (int i = 0; i < SIZE_DATA; i++)
+		((float *)dinput)[i] = rand() / (float)RAND_MAX;
+
+	xe2lpg_compute_exec_compute(bo_dict_long[8].data, ADDR_GENERAL_STATE_BASE,
+				    ADDR_SURFACE_STATE_BASE, ADDR_DYNAMIC_STATE_BASE,
+				    ADDR_INSTRUCTION_STATE_BASE, XE2_ADDR_STATE_CONTEXT_DATA_BASE,
+				    OFFSET_INDIRECT_DATA_START, OFFSET_KERNEL, OFFSET_STATE_SIP);
+
+	xe2lpg_compute_exec_compute(bo_dict_short[8].data, ADDR_GENERAL_STATE_BASE,
+				    ADDR_SURFACE_STATE_BASE, ADDR_DYNAMIC_STATE_BASE,
+				    ADDR_INSTRUCTION_STATE_BASE, XE2_ADDR_STATE_CONTEXT_DATA_BASE,
+				    OFFSET_INDIRECT_DATA_START, OFFSET_KERNEL, OFFSET_STATE_SIP);
+
+	struct drm_xe_sync sync_long = {
+		.type = DRM_XE_SYNC_TYPE_SYNCOBJ,
+		.flags = DRM_XE_SYNC_FLAG_SIGNAL,
+		.handle = syncobj_create(fd, 0),
+	};
+
+	xe_exec_sync(fd, execenv_long.exec_queue, ADDR_BATCH, &sync_long, 1);
+
+	struct drm_xe_sync sync_short = {
+		.type = DRM_XE_SYNC_TYPE_SYNCOBJ,
+		.flags = DRM_XE_SYNC_FLAG_SIGNAL,
+		.handle = syncobj_create(fd, 0),
+	};
+
+	xe_exec_sync(fd, execenv_short.exec_queue, ADDR_BATCH, &sync_short, 1);
+
+	igt_assert(syncobj_wait(fd, &sync_short.handle, 1, INT64_MAX, 0, NULL));
+	syncobj_destroy(fd, sync_short.handle);
+
+	igt_assert(syncobj_wait(fd, &sync_long.handle, 1, INT64_MAX, 0, NULL));
+	syncobj_destroy(fd, sync_long.handle);
+
+	for (int i = 0; i < SIZE_DATA; i++) {
+		float f1, f2;
+
+		f1 = ((float *) bo_dict_short[5].data)[i];
+		f2 = ((float *) bo_dict_short[4].data)[i];
+
+		if (f1 != f2 * f2)
+			igt_debug("[%4d] f1: %f != %f\n", i, f1, f2 * f2);
+		igt_assert(f1 == f2 * f2);
+	}
+
+	for (int i = 0; i < SIZE_DATA; i++) {
+		float f1;
+
+		f1 = ((float *) bo_dict_long[5].data)[i];
+
+		if (f1 != long_kernel_loop_count)
+			igt_debug("[%4d] f1: %f != %u\n", i, f1, long_kernel_loop_count);
+		igt_assert(f1 == long_kernel_loop_count);
+	}
+
+	bo_execenv_unbind(&execenv_short, bo_dict_short, XE2_BO_PREEMPT_DICT_ENTRIES);
+	bo_execenv_unbind(&execenv_long, bo_dict_long, XE2_BO_PREEMPT_DICT_ENTRIES);
+
+	bo_execenv_destroy(&execenv_short);
+	bo_execenv_destroy(&execenv_long);
+}
+
+static const struct {
+	unsigned int ip_ver;
+	void (*compute_exec)(int fd, const unsigned char *long_kernel,
+			     unsigned int long_kernel_size,
+			     const unsigned char *short_kernel,
+			     unsigned int short_kernel_size,
+			     const unsigned char *sip_kernel,
+			     unsigned int sip_kernel_size);
+	uint32_t compat;
+} intel_compute_preempt_batches[] = {
+	{
+		.ip_ver = IP_VER(20, 04),
+		.compute_exec = xe2lpg_compute_preempt_exec,
+		.compat = COMPAT_DRIVER_XE,
+	},
+};
+
+static bool __run_intel_compute_kernel_preempt(int fd)
+{
+	unsigned int ip_ver = intel_graphics_ver(intel_get_drm_devid(fd));
+	unsigned int batch;
+	const struct intel_compute_kernels *kernels = intel_compute_square_kernels;
+	enum intel_driver driver = get_intel_driver(fd);
+
+	for (batch = 0; batch < ARRAY_SIZE(intel_compute_preempt_batches); batch++)
+		if (ip_ver == intel_compute_preempt_batches[batch].ip_ver)
+			break;
+
+
+	if (batch == ARRAY_SIZE(intel_compute_preempt_batches)) {
+		igt_debug("GPU version 0x%x not supported\n", ip_ver);
+		return false;
+	}
+
+	if (!(COMPAT_DRIVER_FLAG(driver) & intel_compute_preempt_batches[batch].compat)) {
+		igt_debug("Driver is not supported: flags %x & %x\n",
+			  COMPAT_DRIVER_FLAG(driver),
+			  intel_compute_preempt_batches[batch].compat);
+		return false;
+	}
+
+	while (kernels->kernel) {
+		if (ip_ver == kernels->ip_ver)
+			break;
+		kernels++;
+	}
+
+	if (!kernels->kernel || !kernels->sip_kernel || !kernels->long_kernel)
+		return 0;
+
+	intel_compute_preempt_batches[batch].compute_exec(fd, kernels->long_kernel,
+							  kernels->long_kernel_size,
+							  kernels->kernel, kernels->size,
+							  kernels->sip_kernel,
+							  kernels->sip_kernel_size);
+
+	return true;
+}
+/**
+ * run_intel_compute_kernel_preempt - runs compute kernels to
+ * exercise preemption scenario.
+ *
+ * @fd: file descriptor of the opened DRM Xe device
+ *
+ * Returns true on success, false otherwise.
+ */
+bool run_intel_compute_kernel_preempt(int fd)
+{
+	return __run_intel_compute_kernel_preempt(fd);
+}
diff --git a/lib/intel_compute.h b/lib/intel_compute.h
index 9ea87b528..bba8bed94 100644
--- a/lib/intel_compute.h
+++ b/lib/intel_compute.h
@@ -35,5 +35,5 @@ extern const struct intel_compute_kernels intel_compute_square_kernels[];
 
 bool run_intel_compute_kernel(int fd);
 bool xe_run_intel_compute_kernel_on_engine(int fd, struct drm_xe_engine_class_instance *eci);
-
+bool run_intel_compute_kernel_preempt(int fd);
 #endif	/* INTEL_COMPUTE_H */
-- 
2.25.1