[PATCH i-g-t v2 2/4] lib/intel_compute: simplify compute-preempt subtests
Francois Dugast
francois.dugast at intel.com
Wed May 28 09:12:23 UTC 2025
On Thu, May 22, 2025 at 09:55:13AM +0200, Zbigniew Kempczyński wrote:
> Code in compute-preempt-exec uses locally created user-fences, which
> takes a lot of lines. The introduced bo_execenv_exec_async() allows
> synchronizing later with the user-fence defined in execenv itself.
>
> Moreover, the logic of the wmtp and threadgroup preempt scenarios was
> simplified. Now both cases assume compute-square (a short running job)
> must complete before the long running job ends. For WMTP we have full
> control and are able to stop it by writing a MAGIC value to the input
> buffer (the shader waits for it to break the endless loop). For
> threadgroup this is harder to achieve because preemption occurs at some
> point where a threadgroup ends its execution. The shader (increment
> loop) for threadgroup preemption uses a large x-dim, which causes the
> short job to preempt it at some point in time. This unfortunately
> requires some fine tweaking (setting the x-dim), but at the moment I
> haven't found a better way to implement this case.
>
> Signed-off-by: Zbigniew Kempczyński <zbigniew.kempczynski at intel.com>
> Cc: Francois Dugast <francois.dugast at intel.com>
Reviewed-by: Francois Dugast <francois.dugast at intel.com>
> ---
> lib/intel_compute.c | 152 ++++++++++++--------------------------------
> 1 file changed, 41 insertions(+), 111 deletions(-)
>
> diff --git a/lib/intel_compute.c b/lib/intel_compute.c
> index 5579bec85b..d2629ed911 100644
> --- a/lib/intel_compute.c
> +++ b/lib/intel_compute.c
> @@ -67,6 +67,7 @@
> */
> #define TGP_long_kernel_loop_count 10
> #define WMTP_long_kernel_loop_count 1000000
> +#define XE2_THREADGROUP_PREEMPT_XDIM 0x200000
>
> struct bo_dict_entry {
> uint64_t addr;
> @@ -102,6 +103,25 @@ struct bo_execenv {
> struct user_execenv *user;
> };
>
> +static void bo_randomize(float *ptr, int size)
> +{
> + srand(time(NULL));
> + for (int i = 0; i < size; i++)
> + ptr[i] = rand() / (float)RAND_MAX;
> +}
> +
> +static void bo_check_square(float *input, float *output, int size)
> +{
> + for (int i = 0; i < size; i++) {
> + float expected_output = input[i] * input[i];
> +
> + if (output[i] != expected_output)
> + igt_debug("[%4d] input:%f output:%f expected_output:%f\n",
> + i, input[i], output[i], expected_output);
> + igt_assert_eq_double(output[i], expected_output);
> + }
> +}
> +
> static void bo_execenv_create(int fd, struct bo_execenv *execenv,
> struct drm_xe_engine_class_instance *eci,
> struct user_execenv *user)
> @@ -1584,10 +1604,10 @@ static void xe2lpg_compute_exec_compute(uint32_t *addr_bo_buffer_batch,
>
> if (threadgroup_preemption)
> /*
> - * Create multiple threadgroups using higher gloabl workgroup size
> + * Create multiple threadgroups using higher global workgroup size
> * Global Workgroup size = Local X * Thread Group X + Local Y * Thread Group Y + Local Z * Thread Group Z
> */
> - addr_bo_buffer_batch[b++] = 0x00200000; // Thread Group ID X Dimension
> + addr_bo_buffer_batch[b++] = XE2_THREADGROUP_PREEMPT_XDIM; // Thread Group ID X Dimension
> else
> addr_bo_buffer_batch[b++] = size_thread_group_x(work_size);
>
> @@ -2077,30 +2097,15 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
> .size = 0x6400000,
> .name = "state context data base"},
> { .addr = ADDR_INSTRUCTION_STATE_BASE + OFFSET_STATE_SIP,
> + .size = ALIGN(sip_kernel_size, 0x1000),
> .name = "sip kernel"},
> };
>
> struct bo_dict_entry bo_dict_short[XE2_BO_PREEMPT_DICT_ENTRIES];
> struct bo_execenv execenv_short, execenv_long;
> - float *input_data, *output_data;
> - unsigned int long_kernel_loop_count;
> - struct drm_xe_sync sync_long = {
> - .type = DRM_XE_SYNC_TYPE_USER_FENCE,
> - .flags = DRM_XE_SYNC_FLAG_SIGNAL,
> - .timeline_value = USER_FENCE_VALUE,
> - };
> - struct bo_sync *bo_sync_long;
> - size_t bo_size_long = sizeof(*bo_sync_long);
> - uint32_t bo_long = 0;
> - struct drm_xe_sync sync_short = {
> - .type = DRM_XE_SYNC_TYPE_USER_FENCE,
> - .flags = DRM_XE_SYNC_FLAG_SIGNAL,
> - .timeline_value = USER_FENCE_VALUE,
> - };
> - struct bo_sync *bo_sync_short;
> - size_t bo_size_short = sizeof(*bo_sync_short);
> - uint32_t bo_short = 0;
> - int64_t timeout_short = 1;
> + float *input_short, *output_short, *input_long;
> + unsigned int long_kernel_loop_count = 0;
> + int64_t timeout_one_ns = 1;
> bool use_loop_kernel = loop_kernel && !threadgroup_preemption;
>
> if (threadgroup_preemption)
> @@ -2114,41 +2119,12 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
> bo_execenv_create(fd, &execenv_short, eci, NULL);
> bo_execenv_create(fd, &execenv_long, eci, NULL);
>
> - /* Prepare sync object for long */
> - bo_size_long = xe_bb_size(fd, bo_size_long);
> - bo_long = xe_bo_create(fd, execenv_long.vm, bo_size_long, vram_if_possible(fd, 0),
> - DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> - bo_sync_long = xe_bo_map(fd, bo_long, bo_size_long);
> - sync_long.addr = to_user_pointer(&bo_sync_long->sync);
> - xe_vm_bind_async(fd, execenv_long.vm, 0, bo_long, 0, ADDR_SYNC, bo_size_long,
> - &sync_long, 1);
> - xe_wait_ufence(fd, &bo_sync_long->sync, USER_FENCE_VALUE, execenv_long.exec_queue,
> - INT64_MAX);
> - bo_sync_long->sync = 0;
> - sync_long.addr = ADDR_SYNC;
> -
> - /* Prepare sync object for short */
> - bo_size_short = xe_bb_size(fd, bo_size_short);
> - bo_short = xe_bo_create(fd, execenv_short.vm, bo_size_short, vram_if_possible(fd, 0),
> - DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> - bo_sync_short = xe_bo_map(fd, bo_short, bo_size_short);
> - sync_short.addr = to_user_pointer(&bo_sync_short->sync);
> - xe_vm_bind_async(fd, execenv_short.vm, 0, bo_short, 0, ADDR_SYNC2, bo_size_short,
> - &sync_short, 1);
> - xe_wait_ufence(fd, &bo_sync_short->sync, USER_FENCE_VALUE, execenv_short.exec_queue,
> - INT64_MAX);
> - bo_sync_short->sync = 0;
> - sync_short.addr = ADDR_SYNC2;
> -
> if (use_loop_kernel)
> bo_dict_long[0].size = ALIGN(loop_kernel_size, 0x1000);
> else
> bo_dict_long[0].size = ALIGN(long_kernel_size, 0x1000);
> bo_dict_short[0].size = ALIGN(short_kernel_size, 0x1000);
>
> - bo_dict_long[10].size = ALIGN(sip_kernel_size, 0x1000);
> - bo_dict_short[10].size = ALIGN(sip_kernel_size, 0x1000);
> -
> bo_execenv_bind(&execenv_long, bo_dict_long, XE2_BO_PREEMPT_DICT_ENTRIES);
> bo_execenv_bind(&execenv_short, bo_dict_short, XE2_BO_PREEMPT_DICT_ENTRIES);
>
> @@ -2172,17 +2148,11 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
> xelpg_create_indirect_data(bo_dict_short[3].data, ADDR_INPUT, ADDR_OUTPUT, SIZE_DATA);
> xehp_create_surface_state(bo_dict_short[7].data, ADDR_INPUT, ADDR_OUTPUT);
>
> - input_data = (float *) bo_dict_long[4].data;
> - output_data = (float *) bo_dict_short[5].data;
> - srand(time(NULL));
> + input_long = (float *) bo_dict_long[4].data;
> + input_short = (float *) bo_dict_short[4].data;
> + output_short = (float *) bo_dict_short[5].data;
>
> - for (int i = 0; i < SIZE_DATA; i++)
> - input_data[i] = rand() / (float)RAND_MAX;
> -
> - input_data = (float *) bo_dict_short[4].data;
> -
> - for (int i = 0; i < SIZE_DATA; i++)
> - input_data[i] = rand() / (float)RAND_MAX;
> + bo_randomize(input_short, SIZE_DATA);
>
> xe2lpg_compute_exec_compute(bo_dict_long[8].data, ADDR_GENERAL_STATE_BASE,
> ADDR_SURFACE_STATE_BASE, ADDR_DYNAMIC_STATE_BASE,
> @@ -2196,66 +2166,26 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
> OFFSET_INDIRECT_DATA_START, OFFSET_KERNEL, OFFSET_STATE_SIP,
> false, SIZE_DATA);
>
> - xe_exec_sync(fd, execenv_long.exec_queue, ADDR_BATCH, &sync_long, 1);
> + bo_execenv_exec_async(&execenv_long, ADDR_BATCH);
>
> /* Wait until multiple LR jobs will start to occupy gpu */
> if (use_loop_kernel)
> sleep(1);
>
> - xe_exec_sync(fd, execenv_short.exec_queue, ADDR_BATCH, &sync_short, 1);
> -
> - xe_wait_ufence(fd, &bo_sync_short->sync, USER_FENCE_VALUE, execenv_short.exec_queue,
> - INT64_MAX);
> + /*
> + * Regardless scenario - wmtp or threadgroup short job (compute
> + * square) must complete first and long job must be still active.
> + */
> + bo_execenv_exec(&execenv_short, ADDR_BATCH);
> + bo_check_square(input_short, output_short, SIZE_DATA);
>
> /* Check that the long kernel has not completed yet */
> - igt_assert_neq(0, __xe_wait_ufence(fd, &bo_sync_long->sync, USER_FENCE_VALUE,
> - execenv_long.exec_queue, &timeout_short));
> + igt_assert_neq(0, __xe_wait_ufence(fd, &execenv_long.bo_sync->sync, USER_FENCE_VALUE,
> + execenv_long.exec_queue, &timeout_one_ns));
> if (use_loop_kernel)
> - ((int *)bo_dict_long[4].data)[0] = MAGIC_LOOP_STOP;
> + ((int *)input_long)[0] = MAGIC_LOOP_STOP;
>
> - xe_wait_ufence(fd, &bo_sync_long->sync, USER_FENCE_VALUE, execenv_long.exec_queue,
> - INT64_MAX);
> -
> - munmap(bo_sync_long, bo_size_long);
> - gem_close(fd, bo_long);
> -
> - munmap(bo_sync_short, bo_size_short);
> - gem_close(fd, bo_short);
> -
> - for (int i = use_loop_kernel ? 1 : 0; i < SIZE_DATA; i++) {
> - float input = input_data[i];
> - float output = output_data[i];
> - float expected_output = input * input;
> -
> - if (output != expected_output)
> - igt_debug("[%4d] input:%f output:%f expected_output:%f\n",
> - i, input, output, expected_output);
> - igt_assert_eq_double(output, expected_output);
> - }
> -
> - for (int i = 0; i < SIZE_DATA; i++) {
> - float f1;
> -
> - f1 = ((float *) bo_dict_long[5].data)[i];
> -
> - if (threadgroup_preemption) {
> - if (f1 < long_kernel_loop_count)
> - igt_debug("[%4d] f1: %f != %u\n", i, f1, long_kernel_loop_count);
> -
> - /* Final incremented value should be greater than loop count
> - * as the kernel is ran by multiple threads and output variable
> - * is shared among all threads. This enusres multiple threadgroup
> - * workload execution
> - */
> - igt_assert(f1 > long_kernel_loop_count);
> - } else {
> - if (!loop_kernel) {
> - if (f1 != long_kernel_loop_count)
> - igt_debug("[%4d] f1: %f != %u\n", i, f1, long_kernel_loop_count);
> - igt_assert(f1 == long_kernel_loop_count);
> - }
> - }
> - }
> + bo_execenv_sync(&execenv_long);
>
> bo_execenv_unbind(&execenv_short, bo_dict_short, XE2_BO_PREEMPT_DICT_ENTRIES);
> bo_execenv_unbind(&execenv_long, bo_dict_long, XE2_BO_PREEMPT_DICT_ENTRIES);
> --
> 2.43.0
>
More information about the igt-dev
mailing list