[PATCH i-g-t v3 1/5] lib/intel_compute: add support for stoppable loop
Dandamudi, Priyanka
priyanka.dandamudi at intel.com
Wed Apr 9 08:46:15 UTC 2025
LGTM,
Reviewed-by: Priyanka Dandamudi <priyanka.dandamudi at intel.com>
> -----Original Message-----
> From: Kempczynski, Zbigniew <zbigniew.kempczynski at intel.com>
> Sent: 09 April 2025 12:21 AM
> To: igt-dev at lists.freedesktop.org
> Cc: Kempczynski, Zbigniew <zbigniew.kempczynski at intel.com>; Dugast,
> Francois <francois.dugast at intel.com>; Dandamudi, Priyanka
> <priyanka.dandamudi at intel.com>
> Subject: [PATCH i-g-t v3 1/5] lib/intel_compute: add support for stoppable
> loop
>
> Current loop used for long running job in wmtp case has a drawback in which
> we tweak number of loops.
>
> Lets add loop which allows to be stopped from cpu write to first input data
> dword. This requires to use volatile for input buffer and uc.uc send to avoid
> checking cache instead of direct memory read.
>
> Before submitting short (compute square) job I've added 1 second delay to
> allow other processes to just start many long running jobs (loops) to make
> gpu really busy. Previously submission long / short would complete before
> another process started same long / short pair so concurency was more
> random.
>
> Signed-off-by: Zbigniew Kempczyński <zbigniew.kempczynski at intel.com>
> Cc: Francois Dugast <francois.dugast at intel.com>
> Cc: Priyanka Dandamudi <priyanka.dandamudi at intel.com>
> ---
> v3: - use previous threadgroup preemption logic/shaders
> - use same MAGIC_LOOP_STOP definition + comment (Francois)
> ---
> lib/intel_compute.c | 40 +++++++++++++++++++++++++++++++++-------
> lib/intel_compute.h | 2 ++
> opencl/loop.cl | 12 ++++++++++++
> 3 files changed, 47 insertions(+), 7 deletions(-) create mode 100644
> opencl/loop.cl
>
> diff --git a/lib/intel_compute.c b/lib/intel_compute.c index
> 28149db53e..825bfb1109 100644
> --- a/lib/intel_compute.c
> +++ b/lib/intel_compute.c
> @@ -46,7 +46,7 @@
> #define OFFSET_STATE_SIP 0xFFFF0000
>
> #define USER_FENCE_VALUE 0xdeadbeefdeadbeefull
> -
> +#define MAGIC_LOOP_STOP 0x12341234
> /*
> * TGP - ThreadGroup Preemption
> * WMTP - Walker Mid Thread Preemption
> @@ -1874,6 +1874,8 @@ bool xe_run_intel_compute_kernel_on_engine(int
> fd,
> * @short_kernel_size: size of @short_kernel
> * @sip_kernel: WMTP sip kernel which does save restore during preemption
> * @sip_kernel_size: size of @sip_kernel
> + * @loop_kernel: loop kernel binary stoppable by cpu write
> + * @loop_kernel_size: size of @loop_kernel
> */
> static void xe2lpg_compute_preempt_exec(int fd, const unsigned char
> *long_kernel,
> unsigned int long_kernel_size,
> @@ -1881,6 +1883,8 @@ static void xe2lpg_compute_preempt_exec(int fd,
> const unsigned char *long_kernel
> unsigned int short_kernel_size,
> const unsigned char *sip_kernel,
> unsigned int sip_kernel_size,
> + const unsigned char *loop_kernel,
> + unsigned int loop_kernel_size,
> struct drm_xe_engine_class_instance
> *eci,
> bool threadgroup_preemption)
> {
> @@ -1937,6 +1941,7 @@ static void xe2lpg_compute_preempt_exec(int fd,
> const unsigned char *long_kernel
> size_t bo_size_short = sizeof(*bo_sync_short);
> uint32_t bo_short = 0;
> int64_t timeout_short = 1;
> + bool use_loop_kernel = loop_kernel && !threadgroup_preemption;
>
> if (threadgroup_preemption)
> long_kernel_loop_count = TGP_long_kernel_loop_count; @@
> -1975,7 +1980,10 @@ static void xe2lpg_compute_preempt_exec(int fd, const
> unsigned char *long_kernel
> bo_sync_short->sync = 0;
> sync_short.addr = ADDR_SYNC2;
>
> - bo_dict_long[0].size = ALIGN(long_kernel_size, 0x1000);
> + if (use_loop_kernel)
> + bo_dict_long[0].size = ALIGN(loop_kernel_size, 0x1000);
> + else
> + bo_dict_long[0].size = ALIGN(long_kernel_size, 0x1000);
> bo_dict_short[0].size = ALIGN(short_kernel_size, 0x1000);
>
> bo_dict_long[10].size = ALIGN(sip_kernel_size, 0x1000); @@ -1984,7
> +1992,10 @@ static void xe2lpg_compute_preempt_exec(int fd, const
> unsigned char *long_kernel
> bo_execenv_bind(&execenv_long, bo_dict_long,
> XE2_BO_PREEMPT_DICT_ENTRIES);
> bo_execenv_bind(&execenv_short, bo_dict_short,
> XE2_BO_PREEMPT_DICT_ENTRIES);
>
> - memcpy(bo_dict_long[0].data, long_kernel, long_kernel_size);
> + if (use_loop_kernel)
> + memcpy(bo_dict_long[0].data, loop_kernel,
> loop_kernel_size);
> + else
> + memcpy(bo_dict_long[0].data, long_kernel,
> long_kernel_size);
> memcpy(bo_dict_short[0].data, short_kernel, short_kernel_size);
>
> memcpy(bo_dict_long[10].data, sip_kernel, sip_kernel_size); @@ -
> 2024,13 +2035,22 @@ static void xe2lpg_compute_preempt_exec(int fd, const
> unsigned char *long_kernel
> OFFSET_INDIRECT_DATA_START,
> OFFSET_KERNEL, OFFSET_STATE_SIP, false);
>
> xe_exec_sync(fd, execenv_long.exec_queue, ADDR_BATCH,
> &sync_long, 1);
> +
> + /* Wait until multiple LR jobs will start to occupy gpu */
> + if (use_loop_kernel)
> + sleep(1);
> +
> xe_exec_sync(fd, execenv_short.exec_queue, ADDR_BATCH,
> &sync_short, 1);
>
> xe_wait_ufence(fd, &bo_sync_short->sync, USER_FENCE_VALUE,
> execenv_short.exec_queue,
> INT64_MAX);
> +
> /* Check that the long kernel has not completed yet */
> igt_assert_neq(0, __xe_wait_ufence(fd, &bo_sync_long->sync,
> USER_FENCE_VALUE,
> execenv_long.exec_queue,
> &timeout_short));
> + if (use_loop_kernel)
> + ((int *)bo_dict_long[4].data)[0] = MAGIC_LOOP_STOP;
> +
> xe_wait_ufence(fd, &bo_sync_long->sync, USER_FENCE_VALUE,
> execenv_long.exec_queue,
> INT64_MAX);
>
> @@ -2040,7 +2060,7 @@ static void xe2lpg_compute_preempt_exec(int fd,
> const unsigned char *long_kernel
> munmap(bo_sync_short, bo_size_short);
> gem_close(fd, bo_short);
>
> - for (int i = 0; i < SIZE_DATA; i++) {
> + for (int i = use_loop_kernel ? 1 : 0; i < SIZE_DATA; i++) {
> float input = input_data[i];
> float output = output_data[i];
> float expected_output = input * input; @@ -2067,9 +2087,11
> @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char
> *long_kernel
> */
> igt_assert(f1 > long_kernel_loop_count);
> } else {
> - if (f1 != long_kernel_loop_count)
> - igt_debug("[%4d] f1: %f != %u\n", i, f1,
> long_kernel_loop_count);
> - igt_assert(f1 == long_kernel_loop_count);
> + if (!loop_kernel) {
> + if (f1 != long_kernel_loop_count)
> + igt_debug("[%4d] f1: %f != %u\n", i,
> f1, long_kernel_loop_count);
> + igt_assert(f1 == long_kernel_loop_count);
> + }
> }
> }
>
> @@ -2088,6 +2110,8 @@ static const struct {
> unsigned int short_kernel_size,
> const unsigned char *sip_kernel,
> unsigned int sip_kernel_size,
> + const unsigned char *loop_kernel,
> + unsigned int loop_kernel_size,
> struct drm_xe_engine_class_instance *eci,
> bool threadgroup_preemption);
> uint32_t compat;
> @@ -2149,6 +2173,8 @@ static bool
> __run_intel_compute_kernel_preempt(int fd,
> kernels->kernel,
> kernels->size,
> kernels->sip_kernel,
> kernels-
> >sip_kernel_size,
> + kernels-
> >loop_kernel,
> + kernels-
> >loop_kernel_size,
> eci,
>
> threadgroup_preemption);
>
> diff --git a/lib/intel_compute.h b/lib/intel_compute.h index
> dc0fe2ec20..8310536a96 100644
> --- a/lib/intel_compute.h
> +++ b/lib/intel_compute.h
> @@ -31,6 +31,8 @@ struct intel_compute_kernels {
> const unsigned char *sip_kernel;
> unsigned int long_kernel_size;
> const unsigned char *long_kernel;
> + unsigned int loop_kernel_size;
> + const unsigned char *loop_kernel;
> };
>
> /**
> diff --git a/opencl/loop.cl b/opencl/loop.cl new file mode 100644 index
> 0000000000..e35eadbc1d
> --- /dev/null
> +++ b/opencl/loop.cl
> @@ -0,0 +1,12 @@
> +/* Must be same as MAGIC_LOOP_STOP in lib/intel_compute.c */ #define
> +MAGIC_LOOP_STOP 0x12341234
> +
> +__kernel void loop(volatile __global int *input,
> + __global int *output,
> + unsigned int count)
> +{
> + while (1) {
> + if (input[0] == MAGIC_LOOP_STOP)
> + break;
> + }
> +}
> --
> 2.34.1
More information about the igt-dev
mailing list