[PATCH i-g-t v3 1/5] lib/intel_compute: add support for stoppable loop

Wed Apr 9 08:46:15 UTC 2025

LGTM,
Reviewed-by: Priyanka Dandamudi <priyanka.dandamudi at intel.com>

> -----Original Message-----
> From: Kempczynski, Zbigniew <zbigniew.kempczynski at intel.com>
> Sent: 09 April 2025 12:21 AM
> To: igt-dev at lists.freedesktop.org
> Cc: Kempczynski, Zbigniew <zbigniew.kempczynski at intel.com>; Dugast,
> Francois <francois.dugast at intel.com>; Dandamudi, Priyanka
> <priyanka.dandamudi at intel.com>
> Subject: [PATCH i-g-t v3 1/5] lib/intel_compute: add support for stoppable
> loop
> 
> The current loop used for the long-running job in the WMTP case has a
> drawback: we have to tweak the number of loops.
> 
> Let's add a loop which can be stopped by a CPU write to the first dword of
> the input data. This requires using volatile for the input buffer and a
> uc.uc send, so that memory is read directly instead of checking the cache.
> 
> Before submitting the short (compute square) job I've added a 1 second delay
> to allow other processes to start many long-running jobs (loops) and make
> the GPU really busy. Previously the long / short submission would complete
> before another process started the same long / short pair, so concurrency
> was more random.
> 
> Signed-off-by: Zbigniew Kempczyński <zbigniew.kempczynski at intel.com>
> Cc: Francois Dugast <francois.dugast at intel.com>
> Cc: Priyanka Dandamudi <priyanka.dandamudi at intel.com>
> ---
> v3: - use previous threadgroup preemption logic/shaders
>     - use same MAGIC_LOOP_STOP definition + comment (Francois)
> ---
>  lib/intel_compute.c | 40 +++++++++++++++++++++++++++++++++-------
>  lib/intel_compute.h |  2 ++
>  opencl/loop.cl      | 12 ++++++++++++
>  3 files changed, 47 insertions(+), 7 deletions(-)
>  create mode 100644 opencl/loop.cl
> 
> diff --git a/lib/intel_compute.c b/lib/intel_compute.c index
> 28149db53e..825bfb1109 100644
> --- a/lib/intel_compute.c
> +++ b/lib/intel_compute.c
> @@ -46,7 +46,7 @@
>  #define OFFSET_STATE_SIP			0xFFFF0000
> 
>  #define USER_FENCE_VALUE			0xdeadbeefdeadbeefull
> -
> +#define MAGIC_LOOP_STOP			0x12341234
>  /*
>   * TGP  - ThreadGroup Preemption
>   * WMTP - Walker Mid Thread Preemption
> @@ -1874,6 +1874,8 @@ bool xe_run_intel_compute_kernel_on_engine(int
> fd,
>   * @short_kernel_size: size of @short_kernel
>   * @sip_kernel: WMTP sip kernel which does save restore during preemption
>   * @sip_kernel_size: size of @sip_kernel
> + * @loop_kernel: loop kernel binary stoppable by cpu write
> + * @loop_kernel_size: size of @loop_kernel
>   */
>  static void xe2lpg_compute_preempt_exec(int fd, const unsigned char
> *long_kernel,
>  					unsigned int long_kernel_size,
> @@ -1881,6 +1883,8 @@ static void xe2lpg_compute_preempt_exec(int fd,
> const unsigned char *long_kernel
>  					unsigned int short_kernel_size,
>  					const unsigned char *sip_kernel,
>  					unsigned int sip_kernel_size,
> +					const unsigned char *loop_kernel,
> +					unsigned int loop_kernel_size,
>  					struct drm_xe_engine_class_instance
> *eci,
>  					bool threadgroup_preemption)
>  {
> @@ -1937,6 +1941,7 @@ static void xe2lpg_compute_preempt_exec(int fd,
> const unsigned char *long_kernel
>  	size_t bo_size_short = sizeof(*bo_sync_short);
>  	uint32_t bo_short = 0;
>  	int64_t timeout_short = 1;
> +	bool use_loop_kernel = loop_kernel && !threadgroup_preemption;
> 
>  	if (threadgroup_preemption)
>  		long_kernel_loop_count = TGP_long_kernel_loop_count; @@
> -1975,7 +1980,10 @@ static void xe2lpg_compute_preempt_exec(int fd, const
> unsigned char *long_kernel
>  	bo_sync_short->sync = 0;
>  	sync_short.addr = ADDR_SYNC2;
> 
> -	bo_dict_long[0].size = ALIGN(long_kernel_size, 0x1000);
> +	if (use_loop_kernel)
> +		bo_dict_long[0].size = ALIGN(loop_kernel_size, 0x1000);
> +	else
> +		bo_dict_long[0].size = ALIGN(long_kernel_size, 0x1000);
>  	bo_dict_short[0].size = ALIGN(short_kernel_size, 0x1000);
> 
>  	bo_dict_long[10].size = ALIGN(sip_kernel_size, 0x1000); @@ -1984,7
> +1992,10 @@ static void xe2lpg_compute_preempt_exec(int fd, const
> unsigned char *long_kernel
>  	bo_execenv_bind(&execenv_long, bo_dict_long,
> XE2_BO_PREEMPT_DICT_ENTRIES);
>  	bo_execenv_bind(&execenv_short, bo_dict_short,
> XE2_BO_PREEMPT_DICT_ENTRIES);
> 
> -	memcpy(bo_dict_long[0].data, long_kernel, long_kernel_size);
> +	if (use_loop_kernel)
> +		memcpy(bo_dict_long[0].data, loop_kernel,
> loop_kernel_size);
> +	else
> +		memcpy(bo_dict_long[0].data, long_kernel,
> long_kernel_size);
>  	memcpy(bo_dict_short[0].data, short_kernel, short_kernel_size);
> 
>  	memcpy(bo_dict_long[10].data, sip_kernel, sip_kernel_size); @@ -
> 2024,13 +2035,22 @@ static void xe2lpg_compute_preempt_exec(int fd, const
> unsigned char *long_kernel
>  				    OFFSET_INDIRECT_DATA_START,
> OFFSET_KERNEL, OFFSET_STATE_SIP, false);
> 
>  	xe_exec_sync(fd, execenv_long.exec_queue, ADDR_BATCH,
> &sync_long, 1);
> +
> +	/* Wait until multiple LR jobs will start to occupy gpu */
> +	if (use_loop_kernel)
> +		sleep(1);
> +
>  	xe_exec_sync(fd, execenv_short.exec_queue, ADDR_BATCH,
> &sync_short, 1);
> 
>  	xe_wait_ufence(fd, &bo_sync_short->sync, USER_FENCE_VALUE,
> execenv_short.exec_queue,
>  		       INT64_MAX);
> +
>  	/* Check that the long kernel has not completed yet */
>  	igt_assert_neq(0, __xe_wait_ufence(fd, &bo_sync_long->sync,
> USER_FENCE_VALUE,
>  					   execenv_long.exec_queue,
> &timeout_short));
> +	if (use_loop_kernel)
> +		((int *)bo_dict_long[4].data)[0] = MAGIC_LOOP_STOP;
> +
>  	xe_wait_ufence(fd, &bo_sync_long->sync, USER_FENCE_VALUE,
> execenv_long.exec_queue,
>  		       INT64_MAX);
> 
> @@ -2040,7 +2060,7 @@ static void xe2lpg_compute_preempt_exec(int fd,
> const unsigned char *long_kernel
>  	munmap(bo_sync_short, bo_size_short);
>  	gem_close(fd, bo_short);
> 
> -	for (int i = 0; i < SIZE_DATA; i++) {
> +	for (int i = use_loop_kernel ? 1 : 0; i < SIZE_DATA; i++) {
>  		float input = input_data[i];
>  		float output = output_data[i];
>  		float expected_output = input * input; @@ -2067,9 +2087,11
> @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char
> *long_kernel
>  			 */
>  			igt_assert(f1 > long_kernel_loop_count);
>  		} else {
> -			if (f1 != long_kernel_loop_count)
> -				igt_debug("[%4d] f1: %f != %u\n", i, f1,
> long_kernel_loop_count);
> -			igt_assert(f1 == long_kernel_loop_count);
> +			if (!loop_kernel) {
> +				if (f1 != long_kernel_loop_count)
> +					igt_debug("[%4d] f1: %f != %u\n", i,
> f1, long_kernel_loop_count);
> +				igt_assert(f1 == long_kernel_loop_count);
> +			}
>  		}
>  	}
> 
> @@ -2088,6 +2110,8 @@ static const struct {
>  			     unsigned int short_kernel_size,
>  			     const unsigned char *sip_kernel,
>  			     unsigned int sip_kernel_size,
> +			     const unsigned char *loop_kernel,
> +			     unsigned int loop_kernel_size,
>  			     struct drm_xe_engine_class_instance *eci,
>  			     bool threadgroup_preemption);
>  	uint32_t compat;
> @@ -2149,6 +2173,8 @@ static bool
> __run_intel_compute_kernel_preempt(int fd,
>  							  kernels->kernel,
> kernels->size,
>  							  kernels->sip_kernel,
>  							  kernels-
> >sip_kernel_size,
> +							  kernels-
> >loop_kernel,
> +							  kernels-
> >loop_kernel_size,
>  							  eci,
> 
> threadgroup_preemption);
> 
> diff --git a/lib/intel_compute.h b/lib/intel_compute.h index
> dc0fe2ec20..8310536a96 100644
> --- a/lib/intel_compute.h
> +++ b/lib/intel_compute.h
> @@ -31,6 +31,8 @@ struct intel_compute_kernels {
>  	const unsigned char *sip_kernel;
>  	unsigned int long_kernel_size;
>  	const unsigned char *long_kernel;
> +	unsigned int loop_kernel_size;
> +	const unsigned char *loop_kernel;
>  };
> 
>  /**
> diff --git a/opencl/loop.cl b/opencl/loop.cl new file mode 100644 index
> 0000000000..e35eadbc1d
> --- /dev/null
> +++ b/opencl/loop.cl
> @@ -0,0 +1,12 @@
> +/* Must be same as MAGIC_LOOP_STOP in lib/intel_compute.c */
> +#define MAGIC_LOOP_STOP 0x12341234
> +
> +__kernel void loop(volatile __global int *input,
> +		   __global int *output,
> +		   unsigned int count)
> +{
> +	while (1) {
> +		if (input[0] == MAGIC_LOOP_STOP)
> +			break;
> +	}
> +}
> --
> 2.34.1