[PATCH i-g-t] lib/intel_compute: change shader and logic for threadgroup preemption

Francois Dugast francois.dugast at intel.com
Fri Jun 13 09:43:06 UTC 2025


On Wed, Jun 11, 2025 at 07:46:29AM +0200, Zbigniew Kempczyński wrote:
> Threadgroup preemption happens on thread group dispatch boundary.
> It is harder to control it comparing to WMTP. Simple loop with
> n-iterations and relatively high dimensions allows to achieve that
> but requires tweaking work dimension as each loop (single thread)
> executes similar time.
> 
> Change then this "constant" time loop to one which allows to be
> stopped by writing MAGIC number to buffer observed by the loop
> conditional. Until threadgroup preemption occurs threads will execute
> relatively long time allowing another short job (compute square)
> to complete. When short job is done MAGIC write causes all threads
> exit loop immediately. This strategy is more reliable and shortens
> threadgroup preemption execution time.
> 
> Signed-off-by: Zbigniew Kempczyński <zbigniew.kempczynski at intel.com>
> Cc: Francois Dugast <francois.dugast at intel.com>

Reviewed-by: Francois Dugast <francois.dugast at intel.com>

> ---
>  lib/intel_compute.c                |   9 +-
>  lib/intel_compute_square_kernels.c | 147 +++++++++++++++--------------
>  opencl/loop_count.cl               |  15 +++
>  3 files changed, 99 insertions(+), 72 deletions(-)
>  create mode 100644 opencl/loop_count.cl
> 
> diff --git a/lib/intel_compute.c b/lib/intel_compute.c
> index bfb9024ba8..9f5fc1bc59 100644
> --- a/lib/intel_compute.c
> +++ b/lib/intel_compute.c
> @@ -67,7 +67,7 @@
>   */
>  #define TGP_long_kernel_loop_count		10
>  #define WMTP_long_kernel_loop_count		1000000
> -#define XE2_THREADGROUP_PREEMPT_XDIM		0x200000
> +#define XE2_THREADGROUP_PREEMPT_XDIM		0x4000
>  
>  struct bo_dict_entry {
>  	uint64_t addr;
> @@ -2180,8 +2180,11 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
>  	/* Check that the long kernel has not completed yet */
>  	igt_assert_neq(0, __xe_wait_ufence(fd, &execenv_long.bo_sync->sync, USER_FENCE_VALUE,
>  					   execenv_long.exec_queue, &timeout_one_ns));
> -	if (use_loop_kernel)
> -		((int *)input_long)[0] = MAGIC_LOOP_STOP;
> +	/*
> +	 * For threadgroup preemption it breaks the loop. So rest shaders exit
> +	 * immediately without reaching whole loop count.
> +	 */
> +	((int *)input_long)[0] = MAGIC_LOOP_STOP;
>  
>  	bo_execenv_sync(&execenv_long);
>  
> diff --git a/lib/intel_compute_square_kernels.c b/lib/intel_compute_square_kernels.c
> index f67c048583..5bf9a1feb5 100644
> --- a/lib/intel_compute_square_kernels.c
> +++ b/lib/intel_compute_square_kernels.c
> @@ -3806,44 +3806,47 @@ static const unsigned char xe2lpg_kernel_sip_bin[] = {
>  };
>  
>  /*
> - * Opencl code for below binary is provided in below path:
> - *	opencl/compute_increment_kernel.cl
> + * Opencl code is in opencl/loop_count.cl
> + *
> + * To work properly it requires to use uncached reads, so ocloc has to
> + * be called with: -options " -igc_opts 'LscLoadCacheControlOverride=1'" arg
>   */
> -static const unsigned char xe2lpg_kernel_inc_bin[] = {
> -	0x65, 0x00, 0x00, 0x80, 0x20, 0x82, 0x05, 0x7f, 0x04, 0x00, 0x00, 0x02,
> -	0xc0, 0xff, 0xff, 0xff, 0x40, 0x19, 0x00, 0x80, 0x20, 0x82, 0x05, 0x7f,
> -	0x04, 0x7f, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x31, 0x22, 0x03, 0x80,
> -	0x00, 0x00, 0x0c, 0x05, 0x8f, 0x7f, 0x00, 0xfa, 0x03, 0x00, 0x70, 0xf6,
> -	0x01, 0x00, 0x00, 0x00, 0x00, 0x42, 0x01, 0x00, 0x00, 0x00, 0x00, 0x20,
> -	0x04, 0x00, 0x00, 0x00, 0x66, 0x09, 0x00, 0x80, 0x20, 0x82, 0x01, 0x80,
> +static const unsigned char xe2lpg_kernel_count_bin[] = {
> +	0x01, 0xa1, 0x8c, 0x3c, 0x00, 0x00, 0x10, 0x00, 0x65, 0xa0, 0x00, 0x80,
> +	0x20, 0x82, 0x05, 0x7f, 0x04, 0x00, 0x00, 0x02, 0xc0, 0xff, 0xff, 0xff,
> +	0x40, 0x19, 0x00, 0x80, 0x20, 0x82, 0x05, 0x7f, 0x04, 0x7f, 0x00, 0x02,
> +	0x00, 0x00, 0x00, 0x00, 0x31, 0x22, 0x03, 0x80, 0x00, 0x00, 0x0c, 0x05,
> +	0x8f, 0x7f, 0x00, 0xfa, 0x03, 0x00, 0x70, 0xf6, 0x61, 0x80, 0x10, 0x2c,
> +	0x02, 0x00, 0x10, 0x00, 0x66, 0x09, 0x00, 0x80, 0x20, 0x82, 0x01, 0x80,
>  	0x00, 0x80, 0x00, 0x02, 0xc0, 0x04, 0x00, 0x40, 0x41, 0x22, 0x03, 0x80,
> -	0x60, 0x06, 0x01, 0x20, 0x54, 0x05, 0x00, 0x01, 0x14, 0x00, 0x00, 0x00,
> -	0x53, 0x80, 0x00, 0x80, 0x60, 0x06, 0x05, 0x02, 0x54, 0x05, 0x00, 0x06,
> -	0x14, 0x00, 0x00, 0x00, 0x70, 0x00, 0x14, 0x80, 0x60, 0x86, 0x01, 0x00,
> -	0x44, 0x05, 0x00, 0x15, 0x00, 0x00, 0x00, 0x00, 0x52, 0x1a, 0x14, 0x00,
> -	0x60, 0x06, 0x04, 0x06, 0x04, 0x02, 0x0e, 0x01, 0x04, 0x01, 0x04, 0x04,
> +	0x60, 0x06, 0x01, 0x20, 0x54, 0x05, 0x00, 0x01, 0x14, 0x02, 0x00, 0x00,
> +	0x53, 0x81, 0x00, 0x80, 0x60, 0x06, 0x05, 0x03, 0x54, 0x05, 0x00, 0x06,
> +	0x14, 0x02, 0x00, 0x00, 0x70, 0x00, 0x14, 0x80, 0x60, 0x86, 0x01, 0x00,
> +	0x44, 0x05, 0x00, 0x15, 0x00, 0x00, 0x00, 0x00, 0x40, 0x1a, 0x14, 0x00,
> +	0x60, 0x06, 0x05, 0x06, 0x04, 0x03, 0x00, 0x01, 0x04, 0x01, 0x10, 0x00,
>  	0x20, 0x00, 0x00, 0x84, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
> -	0xd0, 0x00, 0x00, 0x00, 0x61, 0x1a, 0x00, 0xb4, 0x0e, 0x06, 0x10, 0x00,
> -	0x61, 0x00, 0x08, 0xb4, 0x10, 0x07, 0x10, 0x00, 0x69, 0x1a, 0x10, 0x00,
> -	0x70, 0x86, 0x05, 0x12, 0x04, 0x0e, 0x20, 0x05, 0x02, 0x00, 0x02, 0x00,
> -	0x69, 0x1a, 0x10, 0x02, 0x70, 0x86, 0x05, 0x14, 0x04, 0x10, 0x20, 0x05,
> -	0x02, 0x00, 0x02, 0x00, 0x40, 0x1a, 0x00, 0x38, 0x08, 0x12, 0x30, 0x05,
> -	0x40, 0x1a, 0x08, 0x38, 0x0a, 0x14, 0x30, 0x05, 0x31, 0x23, 0x17, 0x00,
> -	0x00, 0x00, 0x14, 0x0c, 0x24, 0x08, 0x00, 0xfb, 0x00, 0x00, 0x00, 0x00,
> -	0x61, 0x00, 0x84, 0xf4, 0x02, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00,
> -	0x00, 0x42, 0x01, 0x00, 0x00, 0x00, 0x00, 0x20, 0x08, 0x00, 0x00, 0x00,
> -	0x40, 0x19, 0x00, 0x80, 0x60, 0x86, 0x15, 0x02, 0x14, 0x02, 0x00, 0x05,
> -	0x01, 0x00, 0x01, 0x00, 0x40, 0x83, 0x20, 0x30, 0x0c, 0x0c, 0x80, 0x3f,
> -	0x70, 0x19, 0x54, 0x80, 0x20, 0x02, 0x01, 0x00, 0x14, 0x02, 0x00, 0x52,
> -	0x44, 0x05, 0x00, 0x00, 0x20, 0x00, 0x40, 0x84, 0x00, 0x40, 0x00, 0x00,
> -	0x00, 0x00, 0x00, 0x00, 0xd8, 0xff, 0xff, 0xff, 0x31, 0x24, 0x16, 0x00,
> -	0x00, 0x00, 0x00, 0x00, 0x24, 0x08, 0x08, 0xfb, 0x14, 0x0c, 0x00, 0x00,
> -	0x01, 0x00, 0x00, 0x00, 0x00, 0x42, 0x01, 0x00, 0x00, 0x00, 0x00, 0x20,
> -	0x10, 0x00, 0x00, 0x00, 0x61, 0x00, 0x10, 0x28, 0x7f, 0x00, 0x10, 0x00,
> -	0x31, 0x20, 0x02, 0x80, 0x04, 0x00, 0x00, 0x00, 0x0c, 0x7f, 0x20, 0x30,
> -	0x00, 0x00, 0x00, 0x00
> +	0xf0, 0x00, 0x00, 0x00, 0x61, 0x00, 0x84, 0xf4, 0x03, 0x00, 0x02, 0x00,
> +	0x61, 0x00, 0x84, 0xa4, 0x09, 0x05, 0x10, 0x00, 0x31, 0x24, 0x03, 0x80,
> +	0x00, 0x00, 0x0c, 0x08, 0x0c, 0x09, 0x00, 0xfb, 0x00, 0x00, 0xa0, 0x00,
> +	0x70, 0x84, 0x94, 0x80, 0x60, 0x86, 0x01, 0x00, 0x04, 0x08, 0x00, 0x16,
> +	0x34, 0x12, 0x34, 0x12, 0x20, 0x00, 0x80, 0x84, 0x00, 0x40, 0x00, 0x00,
> +	0x00, 0x00, 0x00, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x61, 0x00, 0x84, 0xa4,
> +	0x0a, 0x05, 0x16, 0x00, 0x40, 0x00, 0x00, 0x80, 0x60, 0x86, 0x15, 0x03,
> +	0x14, 0x03, 0x00, 0x05, 0x01, 0x00, 0x01, 0x00, 0x31, 0x45, 0x03, 0x80,
> +	0x00, 0x00, 0x0c, 0x0b, 0x0c, 0x0a, 0x00, 0xfb, 0x00, 0x00, 0xa0, 0x00,
> +	0x61, 0xa3, 0x10, 0x00, 0xa0, 0x0a, 0x06, 0x0e, 0x04, 0x0a, 0x00, 0x00,
> +	0x00, 0x00, 0x00, 0x00, 0x61, 0x00, 0x10, 0x02, 0xa0, 0x0a, 0x06, 0x10,
> +	0x04, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x61, 0x00, 0x10, 0x00,
> +	0xa0, 0x0a, 0x16, 0x0e, 0x14, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
> +	0x61, 0x12, 0x10, 0x02, 0x20, 0x02, 0x16, 0x10, 0x14, 0x0a, 0x00, 0x00,
> +	0x00, 0x00, 0x00, 0x00, 0x70, 0x1a, 0x54, 0x80, 0x20, 0x02, 0x01, 0x00,
> +	0x14, 0x03, 0x00, 0x52, 0x44, 0x05, 0x00, 0x00, 0x52, 0x85, 0xb8, 0x20,
> +	0x0c, 0x0b, 0x04, 0x06, 0x31, 0x23, 0x15, 0x00, 0x00, 0x00, 0x00, 0x00,
> +	0x24, 0x0e, 0x08, 0xfb, 0x14, 0x0c, 0x00, 0x00, 0x20, 0x00, 0x40, 0x84,
> +	0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x38, 0xff, 0xff, 0xff,
> +	0x61, 0x00, 0x10, 0x28, 0x7f, 0x02, 0x10, 0x00, 0x31, 0x26, 0x02, 0x80,
> +	0x04, 0x00, 0x00, 0x00, 0x0c, 0x7f, 0x20, 0x30, 0x00, 0x00, 0x00, 0x00,
>  };
> -
>  /*
>   * Opencl code is in opencl/loop.cl
>   *
> @@ -6635,35 +6638,41 @@ static const unsigned char xe3lpg_kernel_sip_bin[] = {
>  	0x20, 0x02, 0xb5, 0x00, 0x04, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
>  };
>  
> -static const unsigned char xe3lpg_kernel_inc_bin[] = {
> -	0x65, 0xa0, 0x00, 0x80, 0x20, 0x82, 0x05, 0x1f, 0x04, 0x00, 0x00, 0x02,
> -	0xc0, 0xff, 0xff, 0xff, 0x40, 0x19, 0x00, 0x80, 0x20, 0x82, 0x05, 0x1f,
> -	0x04, 0x1f, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x31, 0x22, 0x03, 0x80,
> -	0x00, 0x00, 0x0c, 0x05, 0x8f, 0x1f, 0x00, 0xfa, 0x03, 0x00, 0x70, 0xf6,
> -	0x61, 0x80, 0x24, 0x2c, 0x02, 0x00, 0x10, 0x00, 0x66, 0x09, 0x00, 0x80,
> -	0x20, 0x82, 0x01, 0x80, 0x00, 0x80, 0x00, 0x02, 0xc0, 0x04, 0x00, 0x40,
> -	0x41, 0x22, 0x03, 0x80, 0x60, 0x06, 0x01, 0x20, 0x54, 0x05, 0x00, 0x01,
> -	0x14, 0x02, 0x00, 0x00, 0x53, 0x81, 0x00, 0x80, 0x60, 0x06, 0x05, 0x03,
> -	0x54, 0x05, 0x00, 0x06, 0x14, 0x02, 0x00, 0x00, 0x70, 0x00, 0x14, 0x80,
> -	0x60, 0x86, 0x01, 0x00, 0x44, 0x05, 0x00, 0x15, 0x00, 0x00, 0x00, 0x00,
> -	0x52, 0x1a, 0x14, 0x00, 0x60, 0x06, 0x04, 0x06, 0x04, 0x03, 0x0e, 0x01,
> -	0x04, 0x01, 0x04, 0x04, 0x20, 0x00, 0x00, 0x84, 0x00, 0x40, 0x00, 0x00,
> -	0x00, 0x00, 0x00, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x61, 0x1a, 0x00, 0xb4,
> -	0x0f, 0x06, 0x10, 0x00, 0x61, 0x00, 0x08, 0xb4, 0x11, 0x07, 0x10, 0x00,
> -	0x69, 0x1a, 0x10, 0x00, 0x70, 0x86, 0x05, 0x13, 0x04, 0x0f, 0x20, 0x05,
> -	0x02, 0x00, 0x02, 0x00, 0x69, 0x1a, 0x10, 0x02, 0x70, 0x86, 0x05, 0x15,
> -	0x04, 0x11, 0x20, 0x05, 0x02, 0x00, 0x02, 0x00, 0x40, 0x1a, 0x00, 0x38,
> -	0x08, 0x13, 0x30, 0x05, 0x40, 0x1a, 0x08, 0x38, 0x0a, 0x15, 0x30, 0x05,
> -	0x31, 0x23, 0x17, 0x00, 0x00, 0x00, 0x14, 0x0c, 0x24, 0x08, 0x00, 0xfb,
> -	0x00, 0x00, 0x00, 0x00, 0x61, 0x00, 0x84, 0xf4, 0x03, 0x00, 0x02, 0x00,
> -	0x40, 0x19, 0x00, 0x80, 0x60, 0x86, 0x15, 0x03, 0x14, 0x03, 0x00, 0x05,
> -	0x01, 0x00, 0x01, 0x00, 0x40, 0x83, 0x20, 0x30, 0x0c, 0x0c, 0x80, 0x3f,
> -	0x70, 0x19, 0x54, 0x80, 0x20, 0x02, 0x01, 0x00, 0x14, 0x03, 0x00, 0x52,
> -	0x44, 0x05, 0x00, 0x00, 0x20, 0x00, 0x40, 0x84, 0x00, 0x40, 0x00, 0x00,
> -	0x00, 0x00, 0x00, 0x00, 0xd8, 0xff, 0xff, 0xff, 0x31, 0x24, 0x16, 0x00,
> -	0x00, 0x00, 0x00, 0x00, 0x24, 0x08, 0x08, 0xfb, 0x14, 0x0c, 0x00, 0x00,
> -	0x61, 0x00, 0x10, 0x28, 0x0e, 0x02, 0x10, 0x00, 0x31, 0x20, 0x02, 0x80,
> -	0x04, 0x00, 0x00, 0x00, 0x0c, 0x0e, 0x20, 0x30, 0x00, 0x00, 0x00, 0x00
> +static const unsigned char xe3lpg_kernel_count_bin[] = {
> +	0x01, 0xa1, 0x8c, 0x3c, 0x00, 0x00, 0x10, 0x00, 0x65, 0xa0, 0x00, 0x80,
> +	0x20, 0x82, 0x05, 0x1f, 0x04, 0x00, 0x00, 0x02, 0xc0, 0xff, 0xff, 0xff,
> +	0x40, 0x19, 0x00, 0x80, 0x20, 0x82, 0x05, 0x1f, 0x04, 0x1f, 0x00, 0x02,
> +	0x00, 0x00, 0x00, 0x00, 0x31, 0x22, 0x03, 0x80, 0x00, 0x00, 0x0c, 0x05,
> +	0x8f, 0x1f, 0x00, 0xfa, 0x03, 0x00, 0x70, 0xf6, 0x61, 0x80, 0x10, 0x2c,
> +	0x02, 0x00, 0x10, 0x00, 0x66, 0x09, 0x00, 0x80, 0x20, 0x82, 0x01, 0x80,
> +	0x00, 0x80, 0x00, 0x02, 0xc0, 0x04, 0x00, 0x40, 0x41, 0x22, 0x03, 0x80,
> +	0x60, 0x06, 0x01, 0x20, 0x54, 0x05, 0x00, 0x01, 0x14, 0x02, 0x00, 0x00,
> +	0x53, 0x81, 0x00, 0x80, 0x60, 0x06, 0x05, 0x03, 0x54, 0x05, 0x00, 0x06,
> +	0x14, 0x02, 0x00, 0x00, 0x70, 0x00, 0x14, 0x80, 0x60, 0x86, 0x01, 0x00,
> +	0x44, 0x05, 0x00, 0x15, 0x00, 0x00, 0x00, 0x00, 0x40, 0x1a, 0x14, 0x00,
> +	0x60, 0x06, 0x05, 0x06, 0x04, 0x03, 0x00, 0x01, 0x04, 0x01, 0x10, 0x00,
> +	0x20, 0x00, 0x00, 0x84, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
> +	0xf0, 0x00, 0x00, 0x00, 0x61, 0x00, 0x84, 0xf4, 0x03, 0x00, 0x02, 0x00,
> +	0x61, 0x00, 0x84, 0xa4, 0x09, 0x05, 0x10, 0x00, 0x31, 0x24, 0x03, 0x80,
> +	0x00, 0x00, 0x0c, 0x08, 0x0c, 0x09, 0x00, 0xfb, 0x00, 0x00, 0xa0, 0x00,
> +	0x70, 0x84, 0x94, 0x80, 0x60, 0x86, 0x01, 0x00, 0x04, 0x08, 0x00, 0x16,
> +	0x34, 0x12, 0x34, 0x12, 0x20, 0x00, 0x80, 0x84, 0x00, 0x40, 0x00, 0x00,
> +	0x00, 0x00, 0x00, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x61, 0x00, 0x84, 0xa4,
> +	0x0a, 0x05, 0x16, 0x00, 0x40, 0x00, 0x00, 0x80, 0x60, 0x86, 0x15, 0x03,
> +	0x14, 0x03, 0x00, 0x05, 0x01, 0x00, 0x01, 0x00, 0x31, 0x45, 0x03, 0x80,
> +	0x00, 0x00, 0x0c, 0x0b, 0x0c, 0x0a, 0x00, 0xfb, 0x00, 0x00, 0xa0, 0x00,
> +	0x61, 0xa3, 0x10, 0x00, 0xa0, 0x0a, 0x06, 0x0e, 0x04, 0x0a, 0x00, 0x00,
> +	0x00, 0x00, 0x00, 0x00, 0x61, 0x00, 0x10, 0x02, 0xa0, 0x0a, 0x06, 0x10,
> +	0x04, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x61, 0x00, 0x10, 0x00,
> +	0xa0, 0x0a, 0x16, 0x0e, 0x14, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
> +	0x61, 0x12, 0x10, 0x02, 0x20, 0x02, 0x16, 0x10, 0x14, 0x0a, 0x00, 0x00,
> +	0x00, 0x00, 0x00, 0x00, 0x70, 0x1a, 0x54, 0x80, 0x20, 0x02, 0x01, 0x00,
> +	0x14, 0x03, 0x00, 0x52, 0x44, 0x05, 0x00, 0x00, 0x52, 0x85, 0xb8, 0x20,
> +	0x0c, 0x0b, 0x04, 0x06, 0x31, 0x23, 0x15, 0x00, 0x00, 0x00, 0x00, 0x00,
> +	0x24, 0x0e, 0x08, 0xfb, 0x14, 0x0c, 0x00, 0x00, 0x20, 0x00, 0x40, 0x84,
> +	0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x38, 0xff, 0xff, 0xff,
> +	0x61, 0x00, 0x10, 0x28, 0x12, 0x02, 0x10, 0x00, 0x31, 0x26, 0x02, 0x80,
> +	0x04, 0x00, 0x00, 0x00, 0x0c, 0x12, 0x20, 0x30, 0x00, 0x00, 0x00, 0x00,
>  };
>  
>  const struct intel_compute_kernels intel_compute_square_kernels[] = {
> @@ -6696,8 +6705,8 @@ const struct intel_compute_kernels intel_compute_square_kernels[] = {
>  		.ip_ver = IP_VER(20, 01),
>  		.size = sizeof(xe2lpg_kernel_square_bin),
>  		.kernel = xe2lpg_kernel_square_bin,
> -		.long_kernel = xe2lpg_kernel_inc_bin,
> -		.long_kernel_size = sizeof(xe2lpg_kernel_inc_bin),
> +		.long_kernel = xe2lpg_kernel_count_bin,
> +		.long_kernel_size = sizeof(xe2lpg_kernel_count_bin),
>  		.sip_kernel = xe2lpg_kernel_sip_bin,
>  		.sip_kernel_size = sizeof(xe2lpg_kernel_sip_bin),
>  		.loop_kernel = xe2lpg_kernel_loop_bin,
> @@ -6707,8 +6716,8 @@ const struct intel_compute_kernels intel_compute_square_kernels[] = {
>  		.ip_ver = IP_VER(20, 04),
>  		.size = sizeof(xe2lpg_kernel_square_bin),
>  		.kernel = xe2lpg_kernel_square_bin,
> -		.long_kernel = xe2lpg_kernel_inc_bin,
> -		.long_kernel_size = sizeof(xe2lpg_kernel_inc_bin),
> +		.long_kernel = xe2lpg_kernel_count_bin,
> +		.long_kernel_size = sizeof(xe2lpg_kernel_count_bin),
>  		.sip_kernel = xe2lpg_kernel_sip_bin,
>  		.sip_kernel_size = sizeof(xe2lpg_kernel_sip_bin),
>  		.loop_kernel = xe2lpg_kernel_loop_bin,
> @@ -6718,8 +6727,8 @@ const struct intel_compute_kernels intel_compute_square_kernels[] = {
>  		.ip_ver = IP_VER(30, 00),
>  		.size = sizeof(xe3lpg_kernel_square_bin),
>  		.kernel = xe3lpg_kernel_square_bin,
> -		.long_kernel = xe3lpg_kernel_inc_bin,
> -		.long_kernel_size = sizeof(xe3lpg_kernel_inc_bin),
> +		.long_kernel = xe3lpg_kernel_count_bin,
> +		.long_kernel_size = sizeof(xe3lpg_kernel_count_bin),
>  		.sip_kernel = xe3lpg_kernel_sip_bin,
>  		.sip_kernel_size = sizeof(xe3lpg_kernel_sip_bin),
>  		.loop_kernel = xe3lpg_kernel_loop_bin,
> diff --git a/opencl/loop_count.cl b/opencl/loop_count.cl
> new file mode 100644
> index 0000000000..c5856ab7a8
> --- /dev/null
> +++ b/opencl/loop_count.cl
> @@ -0,0 +1,15 @@
> +/* Must be same as MAGIC_LOOP_STOP in lib/intel_compute.c */
> +#define MAGIC_LOOP_STOP 0x12341234
> +
> +__kernel void loop(volatile __global int *input,
> +		   __global int *output,
> +		   unsigned int count)
> +{
> +	int i = get_global_id(0);
> +
> +	for (int c = 0; c < count; c++) {
> +		if (input[0] == MAGIC_LOOP_STOP)
> +			break;
> +		output[0] = output[0] + i;
> +	}
> +}
> -- 
> 2.43.0
> 


More information about the igt-dev mailing list