[PATCH i-g-t] lib/intel_compute: change shader and logic for threadgroup preemption
Francois Dugast
francois.dugast at intel.com
Fri Jun 13 09:43:06 UTC 2025
On Wed, Jun 11, 2025 at 07:46:29AM +0200, Zbigniew Kempczyński wrote:
> Threadgroup preemption happens on thread group dispatch boundary.
> It is harder to control it comparing to WMTP. Simple loop with
> n-iterations and relatively high dimensions allows to achieve that
> but requires tweaking work dimension as each loop (single thread)
> executes similar time.
>
> Change then this "constant" time loop to one which allows to be
> stopped by writing MAGIC number to buffer observed by the loop
> conditional. Until threadgroup preemption occurs threads will execute
> relatively long time allowing another short job (compute square)
> to complete. When short job is done MAGIC write causes all threads
> exit loop immediately. This strategy is more reliable and shortens
> threadgroup preemption execution time.
>
> Signed-off-by: Zbigniew Kempczyński <zbigniew.kempczynski at intel.com>
> Cc: Francois Dugast <francois.dugast at intel.com>
Reviewed-by: Francois Dugast <francois.dugast at intel.com>
> ---
> lib/intel_compute.c | 9 +-
> lib/intel_compute_square_kernels.c | 147 +++++++++++++++--------------
> opencl/loop_count.cl | 15 +++
> 3 files changed, 99 insertions(+), 72 deletions(-)
> create mode 100644 opencl/loop_count.cl
>
> diff --git a/lib/intel_compute.c b/lib/intel_compute.c
> index bfb9024ba8..9f5fc1bc59 100644
> --- a/lib/intel_compute.c
> +++ b/lib/intel_compute.c
> @@ -67,7 +67,7 @@
> */
> #define TGP_long_kernel_loop_count 10
> #define WMTP_long_kernel_loop_count 1000000
> -#define XE2_THREADGROUP_PREEMPT_XDIM 0x200000
> +#define XE2_THREADGROUP_PREEMPT_XDIM 0x4000
>
> struct bo_dict_entry {
> uint64_t addr;
> @@ -2180,8 +2180,11 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
> /* Check that the long kernel has not completed yet */
> igt_assert_neq(0, __xe_wait_ufence(fd, &execenv_long.bo_sync->sync, USER_FENCE_VALUE,
> execenv_long.exec_queue, &timeout_one_ns));
> - if (use_loop_kernel)
> - ((int *)input_long)[0] = MAGIC_LOOP_STOP;
> + /*
> + * For threadgroup preemption it breaks the loop. So rest shaders exit
> + * immediately without reaching whole loop count.
> + */
> + ((int *)input_long)[0] = MAGIC_LOOP_STOP;
>
> bo_execenv_sync(&execenv_long);
>
> diff --git a/lib/intel_compute_square_kernels.c b/lib/intel_compute_square_kernels.c
> index f67c048583..5bf9a1feb5 100644
> --- a/lib/intel_compute_square_kernels.c
> +++ b/lib/intel_compute_square_kernels.c
> @@ -3806,44 +3806,47 @@ static const unsigned char xe2lpg_kernel_sip_bin[] = {
> };
>
> /*
> - * Opencl code for below binary is provided in below path:
> - * opencl/compute_increment_kernel.cl
> + * Opencl code is in opencl/loop_count.cl
> + *
> + * To work properly it requires to use uncached reads, so ocloc has to
> + * be called with: -options " -igc_opts 'LscLoadCacheControlOverride=1'" arg
> */
> -static const unsigned char xe2lpg_kernel_inc_bin[] = {
> - 0x65, 0x00, 0x00, 0x80, 0x20, 0x82, 0x05, 0x7f, 0x04, 0x00, 0x00, 0x02,
> - 0xc0, 0xff, 0xff, 0xff, 0x40, 0x19, 0x00, 0x80, 0x20, 0x82, 0x05, 0x7f,
> - 0x04, 0x7f, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x31, 0x22, 0x03, 0x80,
> - 0x00, 0x00, 0x0c, 0x05, 0x8f, 0x7f, 0x00, 0xfa, 0x03, 0x00, 0x70, 0xf6,
> - 0x01, 0x00, 0x00, 0x00, 0x00, 0x42, 0x01, 0x00, 0x00, 0x00, 0x00, 0x20,
> - 0x04, 0x00, 0x00, 0x00, 0x66, 0x09, 0x00, 0x80, 0x20, 0x82, 0x01, 0x80,
> +static const unsigned char xe2lpg_kernel_count_bin[] = {
> + 0x01, 0xa1, 0x8c, 0x3c, 0x00, 0x00, 0x10, 0x00, 0x65, 0xa0, 0x00, 0x80,
> + 0x20, 0x82, 0x05, 0x7f, 0x04, 0x00, 0x00, 0x02, 0xc0, 0xff, 0xff, 0xff,
> + 0x40, 0x19, 0x00, 0x80, 0x20, 0x82, 0x05, 0x7f, 0x04, 0x7f, 0x00, 0x02,
> + 0x00, 0x00, 0x00, 0x00, 0x31, 0x22, 0x03, 0x80, 0x00, 0x00, 0x0c, 0x05,
> + 0x8f, 0x7f, 0x00, 0xfa, 0x03, 0x00, 0x70, 0xf6, 0x61, 0x80, 0x10, 0x2c,
> + 0x02, 0x00, 0x10, 0x00, 0x66, 0x09, 0x00, 0x80, 0x20, 0x82, 0x01, 0x80,
> 0x00, 0x80, 0x00, 0x02, 0xc0, 0x04, 0x00, 0x40, 0x41, 0x22, 0x03, 0x80,
> - 0x60, 0x06, 0x01, 0x20, 0x54, 0x05, 0x00, 0x01, 0x14, 0x00, 0x00, 0x00,
> - 0x53, 0x80, 0x00, 0x80, 0x60, 0x06, 0x05, 0x02, 0x54, 0x05, 0x00, 0x06,
> - 0x14, 0x00, 0x00, 0x00, 0x70, 0x00, 0x14, 0x80, 0x60, 0x86, 0x01, 0x00,
> - 0x44, 0x05, 0x00, 0x15, 0x00, 0x00, 0x00, 0x00, 0x52, 0x1a, 0x14, 0x00,
> - 0x60, 0x06, 0x04, 0x06, 0x04, 0x02, 0x0e, 0x01, 0x04, 0x01, 0x04, 0x04,
> + 0x60, 0x06, 0x01, 0x20, 0x54, 0x05, 0x00, 0x01, 0x14, 0x02, 0x00, 0x00,
> + 0x53, 0x81, 0x00, 0x80, 0x60, 0x06, 0x05, 0x03, 0x54, 0x05, 0x00, 0x06,
> + 0x14, 0x02, 0x00, 0x00, 0x70, 0x00, 0x14, 0x80, 0x60, 0x86, 0x01, 0x00,
> + 0x44, 0x05, 0x00, 0x15, 0x00, 0x00, 0x00, 0x00, 0x40, 0x1a, 0x14, 0x00,
> + 0x60, 0x06, 0x05, 0x06, 0x04, 0x03, 0x00, 0x01, 0x04, 0x01, 0x10, 0x00,
> 0x20, 0x00, 0x00, 0x84, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
> - 0xd0, 0x00, 0x00, 0x00, 0x61, 0x1a, 0x00, 0xb4, 0x0e, 0x06, 0x10, 0x00,
> - 0x61, 0x00, 0x08, 0xb4, 0x10, 0x07, 0x10, 0x00, 0x69, 0x1a, 0x10, 0x00,
> - 0x70, 0x86, 0x05, 0x12, 0x04, 0x0e, 0x20, 0x05, 0x02, 0x00, 0x02, 0x00,
> - 0x69, 0x1a, 0x10, 0x02, 0x70, 0x86, 0x05, 0x14, 0x04, 0x10, 0x20, 0x05,
> - 0x02, 0x00, 0x02, 0x00, 0x40, 0x1a, 0x00, 0x38, 0x08, 0x12, 0x30, 0x05,
> - 0x40, 0x1a, 0x08, 0x38, 0x0a, 0x14, 0x30, 0x05, 0x31, 0x23, 0x17, 0x00,
> - 0x00, 0x00, 0x14, 0x0c, 0x24, 0x08, 0x00, 0xfb, 0x00, 0x00, 0x00, 0x00,
> - 0x61, 0x00, 0x84, 0xf4, 0x02, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00,
> - 0x00, 0x42, 0x01, 0x00, 0x00, 0x00, 0x00, 0x20, 0x08, 0x00, 0x00, 0x00,
> - 0x40, 0x19, 0x00, 0x80, 0x60, 0x86, 0x15, 0x02, 0x14, 0x02, 0x00, 0x05,
> - 0x01, 0x00, 0x01, 0x00, 0x40, 0x83, 0x20, 0x30, 0x0c, 0x0c, 0x80, 0x3f,
> - 0x70, 0x19, 0x54, 0x80, 0x20, 0x02, 0x01, 0x00, 0x14, 0x02, 0x00, 0x52,
> - 0x44, 0x05, 0x00, 0x00, 0x20, 0x00, 0x40, 0x84, 0x00, 0x40, 0x00, 0x00,
> - 0x00, 0x00, 0x00, 0x00, 0xd8, 0xff, 0xff, 0xff, 0x31, 0x24, 0x16, 0x00,
> - 0x00, 0x00, 0x00, 0x00, 0x24, 0x08, 0x08, 0xfb, 0x14, 0x0c, 0x00, 0x00,
> - 0x01, 0x00, 0x00, 0x00, 0x00, 0x42, 0x01, 0x00, 0x00, 0x00, 0x00, 0x20,
> - 0x10, 0x00, 0x00, 0x00, 0x61, 0x00, 0x10, 0x28, 0x7f, 0x00, 0x10, 0x00,
> - 0x31, 0x20, 0x02, 0x80, 0x04, 0x00, 0x00, 0x00, 0x0c, 0x7f, 0x20, 0x30,
> - 0x00, 0x00, 0x00, 0x00
> + 0xf0, 0x00, 0x00, 0x00, 0x61, 0x00, 0x84, 0xf4, 0x03, 0x00, 0x02, 0x00,
> + 0x61, 0x00, 0x84, 0xa4, 0x09, 0x05, 0x10, 0x00, 0x31, 0x24, 0x03, 0x80,
> + 0x00, 0x00, 0x0c, 0x08, 0x0c, 0x09, 0x00, 0xfb, 0x00, 0x00, 0xa0, 0x00,
> + 0x70, 0x84, 0x94, 0x80, 0x60, 0x86, 0x01, 0x00, 0x04, 0x08, 0x00, 0x16,
> + 0x34, 0x12, 0x34, 0x12, 0x20, 0x00, 0x80, 0x84, 0x00, 0x40, 0x00, 0x00,
> + 0x00, 0x00, 0x00, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x61, 0x00, 0x84, 0xa4,
> + 0x0a, 0x05, 0x16, 0x00, 0x40, 0x00, 0x00, 0x80, 0x60, 0x86, 0x15, 0x03,
> + 0x14, 0x03, 0x00, 0x05, 0x01, 0x00, 0x01, 0x00, 0x31, 0x45, 0x03, 0x80,
> + 0x00, 0x00, 0x0c, 0x0b, 0x0c, 0x0a, 0x00, 0xfb, 0x00, 0x00, 0xa0, 0x00,
> + 0x61, 0xa3, 0x10, 0x00, 0xa0, 0x0a, 0x06, 0x0e, 0x04, 0x0a, 0x00, 0x00,
> + 0x00, 0x00, 0x00, 0x00, 0x61, 0x00, 0x10, 0x02, 0xa0, 0x0a, 0x06, 0x10,
> + 0x04, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x61, 0x00, 0x10, 0x00,
> + 0xa0, 0x0a, 0x16, 0x0e, 0x14, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
> + 0x61, 0x12, 0x10, 0x02, 0x20, 0x02, 0x16, 0x10, 0x14, 0x0a, 0x00, 0x00,
> + 0x00, 0x00, 0x00, 0x00, 0x70, 0x1a, 0x54, 0x80, 0x20, 0x02, 0x01, 0x00,
> + 0x14, 0x03, 0x00, 0x52, 0x44, 0x05, 0x00, 0x00, 0x52, 0x85, 0xb8, 0x20,
> + 0x0c, 0x0b, 0x04, 0x06, 0x31, 0x23, 0x15, 0x00, 0x00, 0x00, 0x00, 0x00,
> + 0x24, 0x0e, 0x08, 0xfb, 0x14, 0x0c, 0x00, 0x00, 0x20, 0x00, 0x40, 0x84,
> + 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x38, 0xff, 0xff, 0xff,
> + 0x61, 0x00, 0x10, 0x28, 0x7f, 0x02, 0x10, 0x00, 0x31, 0x26, 0x02, 0x80,
> + 0x04, 0x00, 0x00, 0x00, 0x0c, 0x7f, 0x20, 0x30, 0x00, 0x00, 0x00, 0x00,
> };
> -
> /*
> * Opencl code is in opencl/loop.cl
> *
> @@ -6635,35 +6638,41 @@ static const unsigned char xe3lpg_kernel_sip_bin[] = {
> 0x20, 0x02, 0xb5, 0x00, 0x04, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
> };
>
> -static const unsigned char xe3lpg_kernel_inc_bin[] = {
> - 0x65, 0xa0, 0x00, 0x80, 0x20, 0x82, 0x05, 0x1f, 0x04, 0x00, 0x00, 0x02,
> - 0xc0, 0xff, 0xff, 0xff, 0x40, 0x19, 0x00, 0x80, 0x20, 0x82, 0x05, 0x1f,
> - 0x04, 0x1f, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x31, 0x22, 0x03, 0x80,
> - 0x00, 0x00, 0x0c, 0x05, 0x8f, 0x1f, 0x00, 0xfa, 0x03, 0x00, 0x70, 0xf6,
> - 0x61, 0x80, 0x24, 0x2c, 0x02, 0x00, 0x10, 0x00, 0x66, 0x09, 0x00, 0x80,
> - 0x20, 0x82, 0x01, 0x80, 0x00, 0x80, 0x00, 0x02, 0xc0, 0x04, 0x00, 0x40,
> - 0x41, 0x22, 0x03, 0x80, 0x60, 0x06, 0x01, 0x20, 0x54, 0x05, 0x00, 0x01,
> - 0x14, 0x02, 0x00, 0x00, 0x53, 0x81, 0x00, 0x80, 0x60, 0x06, 0x05, 0x03,
> - 0x54, 0x05, 0x00, 0x06, 0x14, 0x02, 0x00, 0x00, 0x70, 0x00, 0x14, 0x80,
> - 0x60, 0x86, 0x01, 0x00, 0x44, 0x05, 0x00, 0x15, 0x00, 0x00, 0x00, 0x00,
> - 0x52, 0x1a, 0x14, 0x00, 0x60, 0x06, 0x04, 0x06, 0x04, 0x03, 0x0e, 0x01,
> - 0x04, 0x01, 0x04, 0x04, 0x20, 0x00, 0x00, 0x84, 0x00, 0x40, 0x00, 0x00,
> - 0x00, 0x00, 0x00, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x61, 0x1a, 0x00, 0xb4,
> - 0x0f, 0x06, 0x10, 0x00, 0x61, 0x00, 0x08, 0xb4, 0x11, 0x07, 0x10, 0x00,
> - 0x69, 0x1a, 0x10, 0x00, 0x70, 0x86, 0x05, 0x13, 0x04, 0x0f, 0x20, 0x05,
> - 0x02, 0x00, 0x02, 0x00, 0x69, 0x1a, 0x10, 0x02, 0x70, 0x86, 0x05, 0x15,
> - 0x04, 0x11, 0x20, 0x05, 0x02, 0x00, 0x02, 0x00, 0x40, 0x1a, 0x00, 0x38,
> - 0x08, 0x13, 0x30, 0x05, 0x40, 0x1a, 0x08, 0x38, 0x0a, 0x15, 0x30, 0x05,
> - 0x31, 0x23, 0x17, 0x00, 0x00, 0x00, 0x14, 0x0c, 0x24, 0x08, 0x00, 0xfb,
> - 0x00, 0x00, 0x00, 0x00, 0x61, 0x00, 0x84, 0xf4, 0x03, 0x00, 0x02, 0x00,
> - 0x40, 0x19, 0x00, 0x80, 0x60, 0x86, 0x15, 0x03, 0x14, 0x03, 0x00, 0x05,
> - 0x01, 0x00, 0x01, 0x00, 0x40, 0x83, 0x20, 0x30, 0x0c, 0x0c, 0x80, 0x3f,
> - 0x70, 0x19, 0x54, 0x80, 0x20, 0x02, 0x01, 0x00, 0x14, 0x03, 0x00, 0x52,
> - 0x44, 0x05, 0x00, 0x00, 0x20, 0x00, 0x40, 0x84, 0x00, 0x40, 0x00, 0x00,
> - 0x00, 0x00, 0x00, 0x00, 0xd8, 0xff, 0xff, 0xff, 0x31, 0x24, 0x16, 0x00,
> - 0x00, 0x00, 0x00, 0x00, 0x24, 0x08, 0x08, 0xfb, 0x14, 0x0c, 0x00, 0x00,
> - 0x61, 0x00, 0x10, 0x28, 0x0e, 0x02, 0x10, 0x00, 0x31, 0x20, 0x02, 0x80,
> - 0x04, 0x00, 0x00, 0x00, 0x0c, 0x0e, 0x20, 0x30, 0x00, 0x00, 0x00, 0x00
> +static const unsigned char xe3lpg_kernel_count_bin[] = {
> + 0x01, 0xa1, 0x8c, 0x3c, 0x00, 0x00, 0x10, 0x00, 0x65, 0xa0, 0x00, 0x80,
> + 0x20, 0x82, 0x05, 0x1f, 0x04, 0x00, 0x00, 0x02, 0xc0, 0xff, 0xff, 0xff,
> + 0x40, 0x19, 0x00, 0x80, 0x20, 0x82, 0x05, 0x1f, 0x04, 0x1f, 0x00, 0x02,
> + 0x00, 0x00, 0x00, 0x00, 0x31, 0x22, 0x03, 0x80, 0x00, 0x00, 0x0c, 0x05,
> + 0x8f, 0x1f, 0x00, 0xfa, 0x03, 0x00, 0x70, 0xf6, 0x61, 0x80, 0x10, 0x2c,
> + 0x02, 0x00, 0x10, 0x00, 0x66, 0x09, 0x00, 0x80, 0x20, 0x82, 0x01, 0x80,
> + 0x00, 0x80, 0x00, 0x02, 0xc0, 0x04, 0x00, 0x40, 0x41, 0x22, 0x03, 0x80,
> + 0x60, 0x06, 0x01, 0x20, 0x54, 0x05, 0x00, 0x01, 0x14, 0x02, 0x00, 0x00,
> + 0x53, 0x81, 0x00, 0x80, 0x60, 0x06, 0x05, 0x03, 0x54, 0x05, 0x00, 0x06,
> + 0x14, 0x02, 0x00, 0x00, 0x70, 0x00, 0x14, 0x80, 0x60, 0x86, 0x01, 0x00,
> + 0x44, 0x05, 0x00, 0x15, 0x00, 0x00, 0x00, 0x00, 0x40, 0x1a, 0x14, 0x00,
> + 0x60, 0x06, 0x05, 0x06, 0x04, 0x03, 0x00, 0x01, 0x04, 0x01, 0x10, 0x00,
> + 0x20, 0x00, 0x00, 0x84, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
> + 0xf0, 0x00, 0x00, 0x00, 0x61, 0x00, 0x84, 0xf4, 0x03, 0x00, 0x02, 0x00,
> + 0x61, 0x00, 0x84, 0xa4, 0x09, 0x05, 0x10, 0x00, 0x31, 0x24, 0x03, 0x80,
> + 0x00, 0x00, 0x0c, 0x08, 0x0c, 0x09, 0x00, 0xfb, 0x00, 0x00, 0xa0, 0x00,
> + 0x70, 0x84, 0x94, 0x80, 0x60, 0x86, 0x01, 0x00, 0x04, 0x08, 0x00, 0x16,
> + 0x34, 0x12, 0x34, 0x12, 0x20, 0x00, 0x80, 0x84, 0x00, 0x40, 0x00, 0x00,
> + 0x00, 0x00, 0x00, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x61, 0x00, 0x84, 0xa4,
> + 0x0a, 0x05, 0x16, 0x00, 0x40, 0x00, 0x00, 0x80, 0x60, 0x86, 0x15, 0x03,
> + 0x14, 0x03, 0x00, 0x05, 0x01, 0x00, 0x01, 0x00, 0x31, 0x45, 0x03, 0x80,
> + 0x00, 0x00, 0x0c, 0x0b, 0x0c, 0x0a, 0x00, 0xfb, 0x00, 0x00, 0xa0, 0x00,
> + 0x61, 0xa3, 0x10, 0x00, 0xa0, 0x0a, 0x06, 0x0e, 0x04, 0x0a, 0x00, 0x00,
> + 0x00, 0x00, 0x00, 0x00, 0x61, 0x00, 0x10, 0x02, 0xa0, 0x0a, 0x06, 0x10,
> + 0x04, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x61, 0x00, 0x10, 0x00,
> + 0xa0, 0x0a, 0x16, 0x0e, 0x14, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
> + 0x61, 0x12, 0x10, 0x02, 0x20, 0x02, 0x16, 0x10, 0x14, 0x0a, 0x00, 0x00,
> + 0x00, 0x00, 0x00, 0x00, 0x70, 0x1a, 0x54, 0x80, 0x20, 0x02, 0x01, 0x00,
> + 0x14, 0x03, 0x00, 0x52, 0x44, 0x05, 0x00, 0x00, 0x52, 0x85, 0xb8, 0x20,
> + 0x0c, 0x0b, 0x04, 0x06, 0x31, 0x23, 0x15, 0x00, 0x00, 0x00, 0x00, 0x00,
> + 0x24, 0x0e, 0x08, 0xfb, 0x14, 0x0c, 0x00, 0x00, 0x20, 0x00, 0x40, 0x84,
> + 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x38, 0xff, 0xff, 0xff,
> + 0x61, 0x00, 0x10, 0x28, 0x12, 0x02, 0x10, 0x00, 0x31, 0x26, 0x02, 0x80,
> + 0x04, 0x00, 0x00, 0x00, 0x0c, 0x12, 0x20, 0x30, 0x00, 0x00, 0x00, 0x00,
> };
>
> const struct intel_compute_kernels intel_compute_square_kernels[] = {
> @@ -6696,8 +6705,8 @@ const struct intel_compute_kernels intel_compute_square_kernels[] = {
> .ip_ver = IP_VER(20, 01),
> .size = sizeof(xe2lpg_kernel_square_bin),
> .kernel = xe2lpg_kernel_square_bin,
> - .long_kernel = xe2lpg_kernel_inc_bin,
> - .long_kernel_size = sizeof(xe2lpg_kernel_inc_bin),
> + .long_kernel = xe2lpg_kernel_count_bin,
> + .long_kernel_size = sizeof(xe2lpg_kernel_count_bin),
> .sip_kernel = xe2lpg_kernel_sip_bin,
> .sip_kernel_size = sizeof(xe2lpg_kernel_sip_bin),
> .loop_kernel = xe2lpg_kernel_loop_bin,
> @@ -6707,8 +6716,8 @@ const struct intel_compute_kernels intel_compute_square_kernels[] = {
> .ip_ver = IP_VER(20, 04),
> .size = sizeof(xe2lpg_kernel_square_bin),
> .kernel = xe2lpg_kernel_square_bin,
> - .long_kernel = xe2lpg_kernel_inc_bin,
> - .long_kernel_size = sizeof(xe2lpg_kernel_inc_bin),
> + .long_kernel = xe2lpg_kernel_count_bin,
> + .long_kernel_size = sizeof(xe2lpg_kernel_count_bin),
> .sip_kernel = xe2lpg_kernel_sip_bin,
> .sip_kernel_size = sizeof(xe2lpg_kernel_sip_bin),
> .loop_kernel = xe2lpg_kernel_loop_bin,
> @@ -6718,8 +6727,8 @@ const struct intel_compute_kernels intel_compute_square_kernels[] = {
> .ip_ver = IP_VER(30, 00),
> .size = sizeof(xe3lpg_kernel_square_bin),
> .kernel = xe3lpg_kernel_square_bin,
> - .long_kernel = xe3lpg_kernel_inc_bin,
> - .long_kernel_size = sizeof(xe3lpg_kernel_inc_bin),
> + .long_kernel = xe3lpg_kernel_count_bin,
> + .long_kernel_size = sizeof(xe3lpg_kernel_count_bin),
> .sip_kernel = xe3lpg_kernel_sip_bin,
> .sip_kernel_size = sizeof(xe3lpg_kernel_sip_bin),
> .loop_kernel = xe3lpg_kernel_loop_bin,
> diff --git a/opencl/loop_count.cl b/opencl/loop_count.cl
> new file mode 100644
> index 0000000000..c5856ab7a8
> --- /dev/null
> +++ b/opencl/loop_count.cl
> @@ -0,0 +1,15 @@
> +/* Must be same as MAGIC_LOOP_STOP in lib/intel_compute.c */
> +#define MAGIC_LOOP_STOP 0x12341234
> +
> +__kernel void loop(volatile __global int *input,
> + __global int *output,
> + unsigned int count)
> +{
> + int i = get_global_id(0);
> +
> + for (int c = 0; c < count; c++) {
> + if (input[0] == MAGIC_LOOP_STOP)
> + break;
> + output[0] = output[0] + i;
> + }
> +}
> --
> 2.43.0
>
More information about the igt-dev
mailing list