[PATCH i-g-t v2 1/5] lib/intel_compute: add support for stoppable loop

Francois Dugast francois.dugast at intel.com
Fri Apr 4 13:18:23 UTC 2025


Hi,

On Fri, Apr 04, 2025 at 02:31:36PM +0200, Zbigniew Kempczyński wrote:
> The current loop used for the long running job in the wmtp case has
> the drawback that we have to tweak the number of loops.
> 
> Let's add a loop which can be stopped by a cpu write to the first
> input data dword. This requires using volatile for the input buffer
> and an uc.uc send, so the kernel reads memory directly instead of a
> cached value.

Ack on the approach, similar to xe_spin end.
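
For readers following the thread, the stop handshake this patch adds boils
down to the following (simplified sketch only, identifiers copied from the
hunks below):

    /* CPU side, lib/intel_compute.c, once the short job has completed: */
    ((int *)bo_dict_long[4].data)[0] = MAGIC_LOOP_STOP;

    /* GPU side, opencl/loop.cl, polling the first input dword: */
    while (1) {
            if (input[0] == 0x12341234)     /* MAGIC_LOOP_STOP */
                    break;
    }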

> 
> Before submitting the short (compute square) job I've added a 1 second
> delay to allow other processes to start many long running jobs (loops)
> and make the gpu really busy. Previously a long / short submission
> would complete before another process started the same long / short
> pair, so the concurrency was more random.
> 
> Signed-off-by: Zbigniew Kempczyński <zbigniew.kempczynski at intel.com>
> Cc: Francois Dugast <francois.dugast at intel.com>
> Cc: Priyanka Dandamudi <priyanka.dandamudi at intel.com>
> ---
>  lib/intel_compute.c | 39 ++++++++++++++++++++++++++++++++-------
>  lib/intel_compute.h |  2 ++
>  opencl/loop.cl      |  9 +++++++++
>  3 files changed, 43 insertions(+), 7 deletions(-)
>  create mode 100644 opencl/loop.cl
> 
> diff --git a/lib/intel_compute.c b/lib/intel_compute.c
> index 28149db53e..50e134c8a5 100644
> --- a/lib/intel_compute.c
> +++ b/lib/intel_compute.c
> @@ -46,7 +46,7 @@
>  #define OFFSET_STATE_SIP			0xFFFF0000
>  
>  #define USER_FENCE_VALUE			0xdeadbeefdeadbeefull
> -
> +#define MAGIC_LOOP_STOP			0x12341234
>  /*
>   * TGP  - ThreadGroup Preemption
>   * WMTP - Walker Mid Thread Preemption
> @@ -1874,6 +1874,8 @@ bool xe_run_intel_compute_kernel_on_engine(int fd,
>   * @short_kernel_size: size of @short_kernel
>   * @sip_kernel: WMTP sip kernel which does save restore during preemption
>   * @sip_kernel_size: size of @sip_kernel
> + * @loop_kernel: loop kernel binary stoppable by cpu write
> + * @loop_kernel_size: size of @loop_kernel
>   */
>  static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel,
>  					unsigned int long_kernel_size,
> @@ -1881,6 +1883,8 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
>  					unsigned int short_kernel_size,
>  					const unsigned char *sip_kernel,
>  					unsigned int sip_kernel_size,
> +					const unsigned char *loop_kernel,
> +					unsigned int loop_kernel_size,
>  					struct drm_xe_engine_class_instance *eci,
>  					bool threadgroup_preemption)
>  {
> @@ -1975,7 +1979,10 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
>  	bo_sync_short->sync = 0;
>  	sync_short.addr = ADDR_SYNC2;
>  
> -	bo_dict_long[0].size = ALIGN(long_kernel_size, 0x1000);
> +	if (loop_kernel)
> +		bo_dict_long[0].size = ALIGN(loop_kernel_size, 0x1000);
> +	else
> +		bo_dict_long[0].size = ALIGN(long_kernel_size, 0x1000);
>  	bo_dict_short[0].size = ALIGN(short_kernel_size, 0x1000);
>  
>  	bo_dict_long[10].size = ALIGN(sip_kernel_size, 0x1000);
> @@ -1984,7 +1991,10 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
>  	bo_execenv_bind(&execenv_long, bo_dict_long, XE2_BO_PREEMPT_DICT_ENTRIES);
>  	bo_execenv_bind(&execenv_short, bo_dict_short, XE2_BO_PREEMPT_DICT_ENTRIES);
>  
> -	memcpy(bo_dict_long[0].data, long_kernel, long_kernel_size);
> +	if (loop_kernel)
> +		memcpy(bo_dict_long[0].data, loop_kernel, loop_kernel_size);
> +	else
> +		memcpy(bo_dict_long[0].data, long_kernel, long_kernel_size);
>  	memcpy(bo_dict_short[0].data, short_kernel, short_kernel_size);
>  
>  	memcpy(bo_dict_long[10].data, sip_kernel, sip_kernel_size);
> @@ -2024,13 +2034,22 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
>  				    OFFSET_INDIRECT_DATA_START, OFFSET_KERNEL, OFFSET_STATE_SIP, false);
>  
>  	xe_exec_sync(fd, execenv_long.exec_queue, ADDR_BATCH, &sync_long, 1);
> +
> +	/* Wait until multiple LR jobs will start to occupy gpu */
> +	if (loop_kernel)
> +		sleep(1);
> +
>  	xe_exec_sync(fd, execenv_short.exec_queue, ADDR_BATCH, &sync_short, 1);
>  
>  	xe_wait_ufence(fd, &bo_sync_short->sync, USER_FENCE_VALUE, execenv_short.exec_queue,
>  		       INT64_MAX);
> +
>  	/* Check that the long kernel has not completed yet */
>  	igt_assert_neq(0, __xe_wait_ufence(fd, &bo_sync_long->sync, USER_FENCE_VALUE,
>  					   execenv_long.exec_queue, &timeout_short));
> +	if (loop_kernel)
> +		((int *)bo_dict_long[4].data)[0] = MAGIC_LOOP_STOP;
> +
>  	xe_wait_ufence(fd, &bo_sync_long->sync, USER_FENCE_VALUE, execenv_long.exec_queue,
>  		       INT64_MAX);
>  
> @@ -2040,7 +2059,7 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
>  	munmap(bo_sync_short, bo_size_short);
>  	gem_close(fd, bo_short);
>  
> -	for (int i = 0; i < SIZE_DATA; i++) {
> +	for (int i = loop_kernel ? 1 : 0; i < SIZE_DATA; i++) {
>  		float input = input_data[i];
>  		float output = output_data[i];
>  		float expected_output = input * input;
> @@ -2067,9 +2086,11 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
>  			 */
>  			igt_assert(f1 > long_kernel_loop_count);
>  		} else {
> -			if (f1 != long_kernel_loop_count)
> -				igt_debug("[%4d] f1: %f != %u\n", i, f1, long_kernel_loop_count);
> -			igt_assert(f1 == long_kernel_loop_count);
> +			if (!loop_kernel) {
> +				if (f1 != long_kernel_loop_count)
> +					igt_debug("[%4d] f1: %f != %u\n", i, f1, long_kernel_loop_count);
> +				igt_assert(f1 == long_kernel_loop_count);
> +			}
>  		}
>  	}
>  
> @@ -2088,6 +2109,8 @@ static const struct {
>  			     unsigned int short_kernel_size,
>  			     const unsigned char *sip_kernel,
>  			     unsigned int sip_kernel_size,
> +			     const unsigned char *loop_kernel,
> +			     unsigned int loop_kernel_size,
>  			     struct drm_xe_engine_class_instance *eci,
>  			     bool threadgroup_preemption);
>  	uint32_t compat;
> @@ -2149,6 +2172,8 @@ static bool __run_intel_compute_kernel_preempt(int fd,
>  							  kernels->kernel, kernels->size,
>  							  kernels->sip_kernel,
>  							  kernels->sip_kernel_size,
> +							  kernels->loop_kernel,
> +							  kernels->loop_kernel_size,
>  							  eci,
>  							  threadgroup_preemption);
>  
> diff --git a/lib/intel_compute.h b/lib/intel_compute.h
> index dc0fe2ec20..8310536a96 100644
> --- a/lib/intel_compute.h
> +++ b/lib/intel_compute.h
> @@ -31,6 +31,8 @@ struct intel_compute_kernels {
>  	const unsigned char *sip_kernel;
>  	unsigned int long_kernel_size;
>  	const unsigned char *long_kernel;
> +	unsigned int loop_kernel_size;
> +	const unsigned char *loop_kernel;
>  };
>  
>  /**
> diff --git a/opencl/loop.cl b/opencl/loop.cl
> new file mode 100644
> index 0000000000..7fd2c13368
> --- /dev/null
> +++ b/opencl/loop.cl
> @@ -0,0 +1,9 @@
> +__kernel void loop(volatile __global int *input,
> +		   __global int *output,
> +		   unsigned int count)
> +{
> +	while (1) {
> +		if (input[0] == 0x12341234)

Not sure if a shared header makes sense, but at least a comment with a
reference to MAGIC_LOOP_STOP would be helpful for understanding, e.g.:

    /* See MAGIC_LOOP_STOP in lib/intel_compute.c */
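
i.e. something along these lines (just a sketch of the suggested comment
applied to the kernel from this patch, the kernel body itself is unchanged):

    __kernel void loop(volatile __global int *input,
                       __global int *output,
                       unsigned int count)
    {
            while (1) {
                    /* See MAGIC_LOOP_STOP in lib/intel_compute.c */
                    if (input[0] == 0x12341234)
                            break;
            }
    }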

Francois

> +			break;
> +	}
> +}
> -- 
> 2.34.1
> 

