[PATCH i-g-t v2 1/5] lib/intel_compute: add support for stoppable loop
Francois Dugast
francois.dugast at intel.com
Fri Apr 4 13:18:23 UTC 2025
Hi,
On Fri, Apr 04, 2025 at 02:31:36PM +0200, Zbigniew Kempczyński wrote:
> Current loop used for long running job in wmtp case has a drawback
> in which we tweak number of loops.
>
> Lets add loop which allows to be stopped from cpu write to first
> input data dword. This requires to use volatile for input buffer
> and uc.uc send to avoid checking cache instead of direct memory
> read.
Ack on the approach, similar to xe_spin end.
>
> Before submitting short (compute square) job I've added 1 second
> delay to allow other processes to just start many long running
> jobs (loops) to make gpu really busy. Previously submission
> long / short would complete before another process started same
> long / short pair so concurency was more random.
>
> Signed-off-by: Zbigniew Kempczyński <zbigniew.kempczynski at intel.com>
> Cc: Francois Dugast <francois.dugast at intel.com>
> Cc: Priyanka Dandamudi <priyanka.dandamudi at intel.com>
> ---
> lib/intel_compute.c | 39 ++++++++++++++++++++++++++++++++-------
> lib/intel_compute.h | 2 ++
> opencl/loop.cl | 9 +++++++++
> 3 files changed, 43 insertions(+), 7 deletions(-)
> create mode 100644 opencl/loop.cl
>
> diff --git a/lib/intel_compute.c b/lib/intel_compute.c
> index 28149db53e..50e134c8a5 100644
> --- a/lib/intel_compute.c
> +++ b/lib/intel_compute.c
> @@ -46,7 +46,7 @@
> #define OFFSET_STATE_SIP 0xFFFF0000
>
> #define USER_FENCE_VALUE 0xdeadbeefdeadbeefull
> -
> +#define MAGIC_LOOP_STOP 0x12341234
> /*
> * TGP - ThreadGroup Preemption
> * WMTP - Walker Mid Thread Preemption
> @@ -1874,6 +1874,8 @@ bool xe_run_intel_compute_kernel_on_engine(int fd,
> * @short_kernel_size: size of @short_kernel
> * @sip_kernel: WMTP sip kernel which does save restore during preemption
> * @sip_kernel_size: size of @sip_kernel
> + * @loop_kernel: loop kernel binary stoppable by cpu write
> + * @loop_kernel_size: size of @loop_kernel
> */
> static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel,
> unsigned int long_kernel_size,
> @@ -1881,6 +1883,8 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
> unsigned int short_kernel_size,
> const unsigned char *sip_kernel,
> unsigned int sip_kernel_size,
> + const unsigned char *loop_kernel,
> + unsigned int loop_kernel_size,
> struct drm_xe_engine_class_instance *eci,
> bool threadgroup_preemption)
> {
> @@ -1975,7 +1979,10 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
> bo_sync_short->sync = 0;
> sync_short.addr = ADDR_SYNC2;
>
> - bo_dict_long[0].size = ALIGN(long_kernel_size, 0x1000);
> + if (loop_kernel)
> + bo_dict_long[0].size = ALIGN(loop_kernel_size, 0x1000);
> + else
> + bo_dict_long[0].size = ALIGN(long_kernel_size, 0x1000);
> bo_dict_short[0].size = ALIGN(short_kernel_size, 0x1000);
>
> bo_dict_long[10].size = ALIGN(sip_kernel_size, 0x1000);
> @@ -1984,7 +1991,10 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
> bo_execenv_bind(&execenv_long, bo_dict_long, XE2_BO_PREEMPT_DICT_ENTRIES);
> bo_execenv_bind(&execenv_short, bo_dict_short, XE2_BO_PREEMPT_DICT_ENTRIES);
>
> - memcpy(bo_dict_long[0].data, long_kernel, long_kernel_size);
> + if (loop_kernel)
> + memcpy(bo_dict_long[0].data, loop_kernel, loop_kernel_size);
> + else
> + memcpy(bo_dict_long[0].data, long_kernel, long_kernel_size);
> memcpy(bo_dict_short[0].data, short_kernel, short_kernel_size);
>
> memcpy(bo_dict_long[10].data, sip_kernel, sip_kernel_size);
> @@ -2024,13 +2034,22 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
> OFFSET_INDIRECT_DATA_START, OFFSET_KERNEL, OFFSET_STATE_SIP, false);
>
> xe_exec_sync(fd, execenv_long.exec_queue, ADDR_BATCH, &sync_long, 1);
> +
> + /* Wait until multiple LR jobs will start to occupy gpu */
> + if (loop_kernel)
> + sleep(1);
> +
> xe_exec_sync(fd, execenv_short.exec_queue, ADDR_BATCH, &sync_short, 1);
>
> xe_wait_ufence(fd, &bo_sync_short->sync, USER_FENCE_VALUE, execenv_short.exec_queue,
> INT64_MAX);
> +
> /* Check that the long kernel has not completed yet */
> igt_assert_neq(0, __xe_wait_ufence(fd, &bo_sync_long->sync, USER_FENCE_VALUE,
> execenv_long.exec_queue, &timeout_short));
> + if (loop_kernel)
> + ((int *)bo_dict_long[4].data)[0] = MAGIC_LOOP_STOP;
> +
> xe_wait_ufence(fd, &bo_sync_long->sync, USER_FENCE_VALUE, execenv_long.exec_queue,
> INT64_MAX);
>
> @@ -2040,7 +2059,7 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
> munmap(bo_sync_short, bo_size_short);
> gem_close(fd, bo_short);
>
> - for (int i = 0; i < SIZE_DATA; i++) {
> + for (int i = loop_kernel ? 1 : 0; i < SIZE_DATA; i++) {
> float input = input_data[i];
> float output = output_data[i];
> float expected_output = input * input;
> @@ -2067,9 +2086,11 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
> */
> igt_assert(f1 > long_kernel_loop_count);
> } else {
> - if (f1 != long_kernel_loop_count)
> - igt_debug("[%4d] f1: %f != %u\n", i, f1, long_kernel_loop_count);
> - igt_assert(f1 == long_kernel_loop_count);
> + if (!loop_kernel) {
> + if (f1 != long_kernel_loop_count)
> + igt_debug("[%4d] f1: %f != %u\n", i, f1, long_kernel_loop_count);
> + igt_assert(f1 == long_kernel_loop_count);
> + }
> }
> }
>
> @@ -2088,6 +2109,8 @@ static const struct {
> unsigned int short_kernel_size,
> const unsigned char *sip_kernel,
> unsigned int sip_kernel_size,
> + const unsigned char *loop_kernel,
> + unsigned int loop_kernel_size,
> struct drm_xe_engine_class_instance *eci,
> bool threadgroup_preemption);
> uint32_t compat;
> @@ -2149,6 +2172,8 @@ static bool __run_intel_compute_kernel_preempt(int fd,
> kernels->kernel, kernels->size,
> kernels->sip_kernel,
> kernels->sip_kernel_size,
> + kernels->loop_kernel,
> + kernels->loop_kernel_size,
> eci,
> threadgroup_preemption);
>
> diff --git a/lib/intel_compute.h b/lib/intel_compute.h
> index dc0fe2ec20..8310536a96 100644
> --- a/lib/intel_compute.h
> +++ b/lib/intel_compute.h
> @@ -31,6 +31,8 @@ struct intel_compute_kernels {
> const unsigned char *sip_kernel;
> unsigned int long_kernel_size;
> const unsigned char *long_kernel;
> + unsigned int loop_kernel_size;
> + const unsigned char *loop_kernel;
> };
>
> /**
> diff --git a/opencl/loop.cl b/opencl/loop.cl
> new file mode 100644
> index 0000000000..7fd2c13368
> --- /dev/null
> +++ b/opencl/loop.cl
> @@ -0,0 +1,9 @@
> +__kernel void loop(volatile __global int *input,
> + __global int *output,
> + unsigned int count)
> +{
> + while (1) {
> + if (input[0] == 0x12341234)
Not sure if shared header makes sense but at least a comment with a
reference to MAGIC_LOOP_STOP would be helpful for understanding.
/* See MAGIC_LOOP_STOP in lib/intel_compute.c */
Francois
> + break;
> + }
> +}
> --
> 2.34.1
>
More information about the igt-dev
mailing list