[PATCH i-g-t 11/11] lib/intel_compute: Make array size a dynamic parameter
Francois Dugast
francois.dugast at intel.com
Tue Mar 11 15:21:31 UTC 2025
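Give the users of run_intel_compute_kernel() the possibility to change
the default size of the input and output arrays by setting a custom
number of float elements in user_execenv::array_size.
If no value is provided (array_size is left at 0, or no user_execenv is
passed), the existing default value of SIZE_DATA will be used.
The preemption path, xe2lpg_compute_preempt_exec(), is not converted and
keeps sizing its buffers and thread groups from SIZE_DATA.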
Example:
struct user_execenv env = {};
env.array_size = 1024 * 1024;
run_intel_compute_kernel(fd, &env);
Signed-off-by: Francois Dugast <francois.dugast at intel.com>
---
lib/intel_compute.c | 144 +++++++++++++++++++++++++++-----------------
lib/intel_compute.h | 2 +
2 files changed, 90 insertions(+), 56 deletions(-)
diff --git a/lib/intel_compute.c b/lib/intel_compute.c
index 068d64b24..b2cba0fe0 100644
--- a/lib/intel_compute.c
+++ b/lib/intel_compute.c
@@ -26,8 +26,6 @@
#define SIZE_DATA 64
#define SIZE_BATCH 0x10000
-#define SIZE_BUFFER_INPUT MAX(sizeof(float) * SIZE_DATA, 0x10000)
-#define SIZE_BUFFER_OUTPUT MAX(sizeof(float) * SIZE_DATA, 0x10000)
#define SIZE_SURFACE_STATE 0x10000
#define SIZE_DYNAMIC_STATE 0x100000
#define SIZE_INDIRECT_OBJECT 0x10000
@@ -56,9 +54,6 @@
#define USER_FENCE_VALUE 0xdeadbeefdeadbeefull
#define THREADS_PER_GROUP 32
-#define THREAD_GROUP_X MAX(1, SIZE_DATA / (ENQUEUED_LOCAL_SIZE_X * \
- ENQUEUED_LOCAL_SIZE_Y * \
- ENQUEUED_LOCAL_SIZE_Z))
#define THREAD_GROUP_Y 1
#define THREAD_GROUP_Z 1
#define ENQUEUED_LOCAL_SIZE_X 1024
@@ -91,6 +86,7 @@ struct bo_execenv {
/* Xe part */
uint32_t vm;
uint32_t exec_queue;
+ uint32_t array_size;
/* i915 part */
struct drm_i915_gem_execbuffer2 execbuf;
@@ -118,6 +114,11 @@ static void bo_execenv_create(int fd, struct bo_execenv *execenv,
else
execenv->vm = xe_vm_create(fd, DRM_XE_VM_CREATE_FLAG_LR_MODE, 0);
+ if (user && user->array_size)
+ execenv->array_size = user->array_size;
+ else
+ execenv->array_size = SIZE_DATA;
+
if (eci) {
execenv->exec_queue = xe_exec_queue_create(fd, execenv->vm,
eci, 0);
@@ -306,6 +307,23 @@ static void bo_execenv_exec(struct bo_execenv *execenv, uint64_t start_addr)
}
}
+static uint32_t size_thread_group_x(uint32_t work_size)
+{
+ return MAX(1, work_size / (ENQUEUED_LOCAL_SIZE_X *
+ ENQUEUED_LOCAL_SIZE_Y *
+ ENQUEUED_LOCAL_SIZE_Z));
+}
+
+static size_t size_input(uint32_t work_size)
+{
+ return MAX(sizeof(float) * work_size, 0x10000);
+}
+
+static size_t size_output(uint32_t work_size)
+{
+ return MAX(sizeof(float) * work_size, 0x10000);
+}
+
/*
* TGL compatible batch
*/
@@ -715,10 +733,8 @@ static void compute_exec(int fd, const unsigned char *kernel,
.size = SIZE_INDIRECT_OBJECT,
.name = "indirect data start" },
{ .addr = ADDR_INPUT,
- .size = SIZE_BUFFER_INPUT,
.name = "input" },
{ .addr = ADDR_OUTPUT,
- .size = SIZE_BUFFER_OUTPUT,
.name = "output" },
{ .addr = ADDR_BATCH,
.size = SIZE_BATCH,
@@ -730,8 +746,10 @@ static void compute_exec(int fd, const unsigned char *kernel,
bo_execenv_create(fd, &execenv, eci, user);
- /* Sets Kernel size */
+ /* Set dynamic sizes */
bo_dict[0].size = ALIGN(size, 0x1000);
+ bo_dict[4].size = size_input(execenv.array_size);
+ bo_dict[5].size = size_output(execenv.array_size);
bo_execenv_bind(&execenv, bo_dict, BO_DICT_ENTRIES);
@@ -739,13 +757,13 @@ static void compute_exec(int fd, const unsigned char *kernel,
create_dynamic_state(bo_dict[1].data, OFFSET_KERNEL);
create_surface_state(bo_dict[2].data, ADDR_INPUT, ADDR_OUTPUT);
create_indirect_data(bo_dict[3].data, ADDR_INPUT, ADDR_OUTPUT,
- IS_DG1(devid) ? 0x200 : 0x40, SIZE_DATA);
+ IS_DG1(devid) ? 0x200 : 0x40, execenv.array_size);
input_data = (float *) bo_dict[4].data;
output_data = (float *) bo_dict[5].data;
srand(time(NULL));
- for (int i = 0; i < SIZE_DATA; i++)
+ for (int i = 0; i < execenv.array_size; i++)
input_data[i] = rand() / (float)RAND_MAX;
if (IS_DG1(devid))
@@ -763,7 +781,7 @@ static void compute_exec(int fd, const unsigned char *kernel,
bo_execenv_exec(&execenv, ADDR_BATCH);
- for (int i = 0; i < SIZE_DATA; i++) {
+ for (int i = 0; i < execenv.array_size; i++) {
float input = input_data[i];
float output = output_data[i];
float expected_output = input * input;
@@ -999,9 +1017,9 @@ static void xehp_compute_exec(int fd, const unsigned char *kernel,
{ .addr = ADDR_GENERAL_STATE_BASE + OFFSET_INDIRECT_DATA_START,
.size = SIZE_INDIRECT_OBJECT,
.name = "indirect object base"},
- { .addr = ADDR_INPUT, .size = SIZE_BUFFER_INPUT,
+ { .addr = ADDR_INPUT,
.name = "addr input"},
- { .addr = ADDR_OUTPUT, .size = SIZE_BUFFER_OUTPUT,
+ { .addr = ADDR_OUTPUT,
.name = "addr output" },
{ .addr = ADDR_GENERAL_STATE_BASE,
.size = SIZE_GENERAL_STATE,
@@ -1017,22 +1035,24 @@ static void xehp_compute_exec(int fd, const unsigned char *kernel,
bo_execenv_create(fd, &execenv, eci, user);
- /* Sets Kernel size */
+ /* Set dynamic sizes */
bo_dict[0].size = ALIGN(size, xe_get_default_alignment(fd));
+ bo_dict[4].size = size_input(execenv.array_size);
+ bo_dict[5].size = size_output(execenv.array_size);
bo_execenv_bind(&execenv, bo_dict, XEHP_BO_DICT_ENTRIES);
memcpy(bo_dict[0].data, kernel, size);
create_dynamic_state(bo_dict[1].data, OFFSET_KERNEL);
xehp_create_surface_state(bo_dict[2].data, ADDR_INPUT, ADDR_OUTPUT);
- xehp_create_indirect_data(bo_dict[3].data, ADDR_INPUT, ADDR_OUTPUT, SIZE_DATA);
+ xehp_create_indirect_data(bo_dict[3].data, ADDR_INPUT, ADDR_OUTPUT, execenv.array_size);
xehp_create_surface_state(bo_dict[7].data, ADDR_INPUT, ADDR_OUTPUT);
input_data = (float *) bo_dict[4].data;
output_data = (float *) bo_dict[5].data;
srand(time(NULL));
- for (int i = 0; i < SIZE_DATA; i++)
+ for (int i = 0; i < execenv.array_size; i++)
input_data[i] = rand() / (float)RAND_MAX;
xehp_compute_exec_compute(bo_dict[8].data,
@@ -1045,7 +1065,7 @@ static void xehp_compute_exec(int fd, const unsigned char *kernel,
bo_execenv_exec(&execenv, ADDR_BATCH);
- for (int i = 0; i < SIZE_DATA; i++) {
+ for (int i = 0; i < execenv.array_size; i++) {
float input = input_data[i];
float output = output_data[i];
float expected_output = input * input;
@@ -1217,9 +1237,9 @@ static void xehpc_compute_exec(int fd, const unsigned char *kernel,
{ .addr = ADDR_GENERAL_STATE_BASE + OFFSET_INDIRECT_DATA_START,
.size = SIZE_INDIRECT_OBJECT,
.name = "indirect object base"},
- { .addr = ADDR_INPUT, .size = SIZE_BUFFER_INPUT,
+ { .addr = ADDR_INPUT,
.name = "addr input"},
- { .addr = ADDR_OUTPUT, .size = SIZE_BUFFER_OUTPUT,
+ { .addr = ADDR_OUTPUT,
.name = "addr output" },
{ .addr = ADDR_GENERAL_STATE_BASE,
.size = SIZE_GENERAL_STATE,
@@ -1232,19 +1252,21 @@ static void xehpc_compute_exec(int fd, const unsigned char *kernel,
bo_execenv_create(fd, &execenv, eci, user);
- /* Sets Kernel size */
+ /* Set dynamic sizes */
bo_dict[0].size = ALIGN(size, xe_get_default_alignment(fd));
+ bo_dict[2].size = size_input(execenv.array_size);
+ bo_dict[3].size = size_output(execenv.array_size);
bo_execenv_bind(&execenv, bo_dict, XEHPC_BO_DICT_ENTRIES);
memcpy(bo_dict[0].data, kernel, size);
- xehpc_create_indirect_data(bo_dict[1].data, ADDR_INPUT, ADDR_OUTPUT, SIZE_DATA);
+ xehpc_create_indirect_data(bo_dict[1].data, ADDR_INPUT, ADDR_OUTPUT, execenv.array_size);
input_data = (float *) bo_dict[2].data;
output_data = (float *) bo_dict[3].data;
srand(time(NULL));
- for (int i = 0; i < SIZE_DATA; i++)
+ for (int i = 0; i < execenv.array_size; i++)
input_data[i] = rand() / (float)RAND_MAX;
xehpc_compute_exec_compute(bo_dict[5].data,
@@ -1257,7 +1279,7 @@ static void xehpc_compute_exec(int fd, const unsigned char *kernel,
bo_execenv_exec(&execenv, ADDR_BATCH);
- for (int i = 0; i < SIZE_DATA; i++) {
+ for (int i = 0; i < execenv.array_size; i++) {
float input = input_data[i];
float output = output_data[i];
float expected_output = input * input;
@@ -1274,12 +1296,13 @@ static void xehpc_compute_exec(int fd, const unsigned char *kernel,
}
static void xelpg_compute_exec_compute(uint32_t *addr_bo_buffer_batch,
- uint64_t addr_general_state_base,
- uint64_t addr_surface_state_base,
- uint64_t addr_dynamic_state_base,
- uint64_t addr_instruction_state_base,
- uint64_t offset_indirect_data_start,
- uint64_t kernel_start_pointer)
+ uint64_t addr_general_state_base,
+ uint64_t addr_surface_state_base,
+ uint64_t addr_dynamic_state_base,
+ uint64_t addr_instruction_state_base,
+ uint64_t offset_indirect_data_start,
+ uint64_t kernel_start_pointer,
+ uint32_t work_size)
{
int b = 0;
@@ -1342,7 +1365,7 @@ static void xelpg_compute_exec_compute(uint32_t *addr_bo_buffer_batch,
addr_bo_buffer_batch[b++] = 0xbe040000;
addr_bo_buffer_batch[b++] = 0xffffffff;
addr_bo_buffer_batch[b++] = 0x000003ff;
- addr_bo_buffer_batch[b++] = THREAD_GROUP_X;
+ addr_bo_buffer_batch[b++] = size_thread_group_x(work_size);
addr_bo_buffer_batch[b++] = THREAD_GROUP_Y;
addr_bo_buffer_batch[b++] = THREAD_GROUP_Z;
@@ -1398,7 +1421,8 @@ static void xe2lpg_compute_exec_compute(uint32_t *addr_bo_buffer_batch,
uint64_t offset_indirect_data_start,
uint64_t kernel_start_pointer,
uint64_t sip_start_pointer,
- bool threadgroup_preemption)
+ bool threadgroup_preemption,
+ uint32_t work_size)
{
int b = 0;
@@ -1480,7 +1504,7 @@ static void xe2lpg_compute_exec_compute(uint32_t *addr_bo_buffer_batch,
*/
addr_bo_buffer_batch[b++] = 0x00200000; // Thread Group ID X Dimension
else
- addr_bo_buffer_batch[b++] = THREAD_GROUP_X;
+ addr_bo_buffer_batch[b++] = size_thread_group_x(work_size);
addr_bo_buffer_batch[b++] = THREAD_GROUP_Y;
addr_bo_buffer_batch[b++] = THREAD_GROUP_Z;
@@ -1576,9 +1600,9 @@ static void xelpg_compute_exec(int fd, const unsigned char *kernel,
{ .addr = ADDR_GENERAL_STATE_BASE + OFFSET_INDIRECT_DATA_START,
.size = SIZE_INDIRECT_OBJECT,
.name = "indirect object base"},
- { .addr = ADDR_INPUT, .size = SIZE_BUFFER_INPUT,
+ { .addr = ADDR_INPUT,
.name = "addr input"},
- { .addr = ADDR_OUTPUT, .size = SIZE_BUFFER_OUTPUT,
+ { .addr = ADDR_OUTPUT,
.name = "addr output" },
{ .addr = ADDR_GENERAL_STATE_BASE,
.size = SIZE_GENERAL_STATE,
@@ -1596,8 +1620,10 @@ static void xelpg_compute_exec(int fd, const unsigned char *kernel,
bo_execenv_create(fd, &execenv, eci, user);
- /* Sets Kernel size */
+ /* Set dynamic sizes */
bo_dict[0].size = ALIGN(size, 0x1000);
+ bo_dict[4].size = size_input(execenv.array_size);
+ bo_dict[5].size = size_output(execenv.array_size);
bo_execenv_bind(&execenv, bo_dict, XELPG_BO_DICT_ENTRIES);
@@ -1605,14 +1631,14 @@ static void xelpg_compute_exec(int fd, const unsigned char *kernel,
create_dynamic_state(bo_dict[1].data, OFFSET_KERNEL);
xehp_create_surface_state(bo_dict[2].data, ADDR_INPUT, ADDR_OUTPUT);
- xehp_create_indirect_data(bo_dict[3].data, ADDR_INPUT, ADDR_OUTPUT, SIZE_DATA);
+ xehp_create_indirect_data(bo_dict[3].data, ADDR_INPUT, ADDR_OUTPUT, execenv.array_size);
xehp_create_surface_state(bo_dict[7].data, ADDR_INPUT, ADDR_OUTPUT);
input_data = (float *) bo_dict[4].data;
output_data = (float *) bo_dict[5].data;
srand(time(NULL));
- for (int i = 0; i < SIZE_DATA; i++)
+ for (int i = 0; i < execenv.array_size; i++)
input_data[i] = rand() / (float)RAND_MAX;
xelpg_compute_exec_compute(bo_dict[8].data,
@@ -1621,11 +1647,12 @@ static void xelpg_compute_exec(int fd, const unsigned char *kernel,
ADDR_DYNAMIC_STATE_BASE,
ADDR_INSTRUCTION_STATE_BASE,
OFFSET_INDIRECT_DATA_START,
- OFFSET_KERNEL);
+ OFFSET_KERNEL,
+ execenv.array_size);
bo_execenv_exec(&execenv, ADDR_BATCH);
- for (int i = 0; i < SIZE_DATA; i++) {
+ for (int i = 0; i < execenv.array_size; i++) {
float input = input_data[i];
float output = output_data[i];
float expected_output = input * input;
@@ -1667,9 +1694,9 @@ static void xe2lpg_compute_exec(int fd, const unsigned char *kernel,
{ .addr = ADDR_GENERAL_STATE_BASE + OFFSET_INDIRECT_DATA_START,
.size = SIZE_INDIRECT_OBJECT,
.name = "indirect object base"},
- { .addr = ADDR_INPUT, .size = SIZE_BUFFER_INPUT,
+ { .addr = ADDR_INPUT,
.name = "addr input"},
- { .addr = ADDR_OUTPUT, .size = SIZE_BUFFER_OUTPUT,
+ { .addr = ADDR_OUTPUT,
.name = "addr output" },
{ .addr = ADDR_GENERAL_STATE_BASE,
.size = SIZE_GENERAL_STATE,
@@ -1690,36 +1717,39 @@ static void xe2lpg_compute_exec(int fd, const unsigned char *kernel,
bo_execenv_create(fd, &execenv, eci, user);
- /* Sets Kernel size */
+ /* Set dynamic sizes */
bo_dict[0].size = ALIGN(size, 0x1000);
+ bo_dict[4].size = size_input(execenv.array_size);
+ bo_dict[5].size = size_output(execenv.array_size);
bo_execenv_bind(&execenv, bo_dict, XE2_BO_DICT_ENTRIES);
memcpy(bo_dict[0].data, kernel, size);
create_dynamic_state(bo_dict[1].data, OFFSET_KERNEL);
xehp_create_surface_state(bo_dict[2].data, ADDR_INPUT, ADDR_OUTPUT);
- xehp_create_indirect_data(bo_dict[3].data, ADDR_INPUT, ADDR_OUTPUT, SIZE_DATA);
+ xehp_create_indirect_data(bo_dict[3].data, ADDR_INPUT, ADDR_OUTPUT, execenv.array_size);
xehp_create_surface_state(bo_dict[7].data, ADDR_INPUT, ADDR_OUTPUT);
input_data = (float *) bo_dict[4].data;
output_data = (float *) bo_dict[5].data;
srand(time(NULL));
- for (int i = 0; i < SIZE_DATA; i++)
+ for (int i = 0; i < execenv.array_size; i++)
input_data[i] = rand() / (float)RAND_MAX;
xe2lpg_compute_exec_compute(bo_dict[8].data,
- ADDR_GENERAL_STATE_BASE,
- ADDR_SURFACE_STATE_BASE,
- ADDR_DYNAMIC_STATE_BASE,
- ADDR_INSTRUCTION_STATE_BASE,
- XE2_ADDR_STATE_CONTEXT_DATA_BASE,
- OFFSET_INDIRECT_DATA_START,
- OFFSET_KERNEL, 0, false);
+ ADDR_GENERAL_STATE_BASE,
+ ADDR_SURFACE_STATE_BASE,
+ ADDR_DYNAMIC_STATE_BASE,
+ ADDR_INSTRUCTION_STATE_BASE,
+ XE2_ADDR_STATE_CONTEXT_DATA_BASE,
+ OFFSET_INDIRECT_DATA_START,
+ OFFSET_KERNEL, 0, false,
+ execenv.array_size);
bo_execenv_exec(&execenv, ADDR_BATCH);
- for (int i = 0; i < SIZE_DATA; i++) {
+ for (int i = 0; i < execenv.array_size; i++) {
float input = input_data[i];
float output = output_data[i];
float expected_output = input * input;
@@ -1919,9 +1949,9 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
{ .addr = ADDR_GENERAL_STATE_BASE + OFFSET_INDIRECT_DATA_START,
.size = SIZE_INDIRECT_OBJECT,
.name = "indirect object base"},
- { .addr = ADDR_INPUT, .size = SIZE_BUFFER_INPUT,
+ { .addr = ADDR_INPUT, .size = MAX(sizeof(float) * SIZE_DATA, 0x10000),
.name = "addr input"},
- { .addr = ADDR_OUTPUT, .size = SIZE_BUFFER_OUTPUT,
+ { .addr = ADDR_OUTPUT, .size = MAX(sizeof(float) * SIZE_DATA, 0x10000),
.name = "addr output" },
{ .addr = ADDR_GENERAL_STATE_BASE,
.size = SIZE_GENERAL_STATE,
@@ -2039,12 +2069,14 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
xe2lpg_compute_exec_compute(bo_dict_long[8].data, ADDR_GENERAL_STATE_BASE,
ADDR_SURFACE_STATE_BASE, ADDR_DYNAMIC_STATE_BASE,
ADDR_INSTRUCTION_STATE_BASE, XE2_ADDR_STATE_CONTEXT_DATA_BASE,
- OFFSET_INDIRECT_DATA_START, OFFSET_KERNEL, OFFSET_STATE_SIP, threadgroup_preemption);
+ OFFSET_INDIRECT_DATA_START, OFFSET_KERNEL, OFFSET_STATE_SIP,
+ threadgroup_preemption, SIZE_DATA);
xe2lpg_compute_exec_compute(bo_dict_short[8].data, ADDR_GENERAL_STATE_BASE,
ADDR_SURFACE_STATE_BASE, ADDR_DYNAMIC_STATE_BASE,
ADDR_INSTRUCTION_STATE_BASE, XE2_ADDR_STATE_CONTEXT_DATA_BASE,
- OFFSET_INDIRECT_DATA_START, OFFSET_KERNEL, OFFSET_STATE_SIP, false);
+ OFFSET_INDIRECT_DATA_START, OFFSET_KERNEL, OFFSET_STATE_SIP,
+ false, SIZE_DATA);
xe_exec_sync(fd, execenv_long.exec_queue, ADDR_BATCH, &sync_long, 1);
xe_exec_sync(fd, execenv_short.exec_queue, ADDR_BATCH, &sync_short, 1);
diff --git a/lib/intel_compute.h b/lib/intel_compute.h
index dc0fe2ec2..9fdb7fc73 100644
--- a/lib/intel_compute.h
+++ b/lib/intel_compute.h
@@ -55,6 +55,8 @@ struct user_execenv {
unsigned int kernel_size;
/** @skip_results_check: do not verify correctness of the results if true */
bool skip_results_check;
+ /** @array_size: number of float elements in the input and output arrays, 0 selects the default SIZE_DATA */
+ uint32_t array_size;
};
extern const struct intel_compute_kernels intel_compute_square_kernels[];
--
2.43.0
More information about the igt-dev
mailing list