[PATCH i-g-t v2 2/4] lib/intel_compute: simplify compute-preempt subtests

Francois Dugast francois.dugast at intel.com
Wed May 28 09:12:23 UTC 2025


On Thu, May 22, 2025 at 09:55:13AM +0200, Zbigniew Kempczyński wrote:
> Code in compute-preempt-exec uses locally created user-fences, which
> takes a lot of lines. The newly introduced bo_execenv_exec_async()
> allows synchronizing later with the user-fence defined in the execenv
> itself.
> 
> Moreover, the logic of the WMTP and threadgroup preempt scenarios was
> simplified. Now both cases assume compute-square (the short-running job)
> must complete before the long-running job ends. For WMTP we have full
> control and can stop the long job by writing a MAGIC value to its input
> buffer (the shader waits for it to break its endless loop). For
> threadgroup preemption this is harder to achieve, because preemption
> only occurs at points where a threadgroup ends its execution. The shader
> (an increment loop) for threadgroup preemption therefore uses a large
> x-dim, which lets the short job preempt it at some point in time. This
> unfortunately requires some fine-tuning (setting the x-dim), but at the
> moment I haven't found a better way to implement this case.
> 
> Signed-off-by: Zbigniew Kempczyński <zbigniew.kempczynski at intel.com>
> Cc: Francois Dugast <francois.dugast at intel.com>

Reviewed-by: Francois Dugast <francois.dugast at intel.com>
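
For readers following the diff below: the net effect of the new helpers is
that the hand-rolled drm_xe_sync/bo_sync setup per queue collapses into a
few calls. A minimal sketch of the resulting flow, based only on the calls
visible in the hunks (exact helper signatures beyond what the hunks show
are assumptions):

	/* Submit the long-running job without blocking; its user-fence
	 * now lives in the execenv itself. */
	bo_execenv_exec_async(&execenv_long, ADDR_BATCH);

	/* Submit the short job and wait for it to complete. */
	bo_execenv_exec(&execenv_short, ADDR_BATCH);
	bo_check_square(input_short, output_short, SIZE_DATA);

	/* Later, wait for the long job on its execenv user-fence. */
	bo_execenv_sync(&execenv_long);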

> ---
>  lib/intel_compute.c | 152 ++++++++++++--------------------------------
>  1 file changed, 41 insertions(+), 111 deletions(-)
> 
> diff --git a/lib/intel_compute.c b/lib/intel_compute.c
> index 5579bec85b..d2629ed911 100644
> --- a/lib/intel_compute.c
> +++ b/lib/intel_compute.c
> @@ -67,6 +67,7 @@
>   */
>  #define TGP_long_kernel_loop_count		10
>  #define WMTP_long_kernel_loop_count		1000000
> +#define XE2_THREADGROUP_PREEMPT_XDIM		0x200000
>  
>  struct bo_dict_entry {
>  	uint64_t addr;
> @@ -102,6 +103,25 @@ struct bo_execenv {
>  	struct user_execenv *user;
>  };
>  
> +static void bo_randomize(float *ptr, int size)
> +{
> +	srand(time(NULL));
> +	for (int i = 0; i < size; i++)
> +		ptr[i] = rand() / (float)RAND_MAX;
> +}
> +
> +static void bo_check_square(float *input, float *output, int size)
> +{
> +	for (int i = 0; i < size; i++) {
> +		float expected_output = input[i] * input[i];
> +
> +		if (output[i] != expected_output)
> +			igt_debug("[%4d] input:%f output:%f expected_output:%f\n",
> +				  i, input[i], output[i], expected_output);
> +		igt_assert_eq_double(output[i], expected_output);
> +	}
> +}
> +
>  static void bo_execenv_create(int fd, struct bo_execenv *execenv,
>  			      struct drm_xe_engine_class_instance *eci,
>  			      struct user_execenv *user)
> @@ -1584,10 +1604,10 @@ static void xe2lpg_compute_exec_compute(uint32_t *addr_bo_buffer_batch,
>  
>  	if (threadgroup_preemption)
>  		/*
> -		 * Create multiple threadgroups using higher gloabl workgroup size
> +		 * Create multiple threadgroups using higher global workgroup size
>  		 * Global Workgroup size = Local X * Thread Group X +  Local Y * Thread Group Y + Local Z * Thread Group Z
>  		 */
> -		addr_bo_buffer_batch[b++] = 0x00200000; // Thread Group ID X Dimension
> +		addr_bo_buffer_batch[b++] = XE2_THREADGROUP_PREEMPT_XDIM; // Thread Group ID X Dimension
>  	else
>  		addr_bo_buffer_batch[b++] = size_thread_group_x(work_size);
>  
> @@ -2077,30 +2097,15 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
>  		  .size = 0x6400000,
>  		  .name = "state context data base"},
>  		{ .addr = ADDR_INSTRUCTION_STATE_BASE + OFFSET_STATE_SIP,
> +		  .size = ALIGN(sip_kernel_size, 0x1000),
>  		  .name = "sip kernel"},
>  	};
>  
>  	struct bo_dict_entry bo_dict_short[XE2_BO_PREEMPT_DICT_ENTRIES];
>  	struct bo_execenv execenv_short, execenv_long;
> -	float *input_data, *output_data;
> -	unsigned int long_kernel_loop_count;
> -	struct drm_xe_sync sync_long = {
> -		.type = DRM_XE_SYNC_TYPE_USER_FENCE,
> -		.flags = DRM_XE_SYNC_FLAG_SIGNAL,
> -		.timeline_value = USER_FENCE_VALUE,
> -	};
> -	struct bo_sync *bo_sync_long;
> -	size_t bo_size_long = sizeof(*bo_sync_long);
> -	uint32_t bo_long = 0;
> -	struct drm_xe_sync sync_short = {
> -		.type = DRM_XE_SYNC_TYPE_USER_FENCE,
> -		.flags = DRM_XE_SYNC_FLAG_SIGNAL,
> -		.timeline_value = USER_FENCE_VALUE,
> -	};
> -	struct bo_sync *bo_sync_short;
> -	size_t bo_size_short = sizeof(*bo_sync_short);
> -	uint32_t bo_short = 0;
> -	int64_t timeout_short = 1;
> +	float *input_short, *output_short, *input_long;
> +	unsigned int long_kernel_loop_count = 0;
> +	int64_t timeout_one_ns = 1;
>  	bool use_loop_kernel = loop_kernel && !threadgroup_preemption;
>  
>  	if (threadgroup_preemption)
> @@ -2114,41 +2119,12 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
>  	bo_execenv_create(fd, &execenv_short, eci, NULL);
>  	bo_execenv_create(fd, &execenv_long, eci, NULL);
>  
> -	/* Prepare sync object for long */
> -	bo_size_long = xe_bb_size(fd, bo_size_long);
> -	bo_long = xe_bo_create(fd, execenv_long.vm, bo_size_long, vram_if_possible(fd, 0),
> -			       DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> -	bo_sync_long = xe_bo_map(fd, bo_long, bo_size_long);
> -	sync_long.addr = to_user_pointer(&bo_sync_long->sync);
> -	xe_vm_bind_async(fd, execenv_long.vm, 0, bo_long, 0, ADDR_SYNC, bo_size_long,
> -			 &sync_long, 1);
> -	xe_wait_ufence(fd, &bo_sync_long->sync, USER_FENCE_VALUE, execenv_long.exec_queue,
> -		       INT64_MAX);
> -	bo_sync_long->sync = 0;
> -	sync_long.addr = ADDR_SYNC;
> -
> -	/* Prepare sync object for short */
> -	bo_size_short = xe_bb_size(fd, bo_size_short);
> -	bo_short = xe_bo_create(fd, execenv_short.vm, bo_size_short, vram_if_possible(fd, 0),
> -			       DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM);
> -	bo_sync_short = xe_bo_map(fd, bo_short, bo_size_short);
> -	sync_short.addr = to_user_pointer(&bo_sync_short->sync);
> -	xe_vm_bind_async(fd, execenv_short.vm, 0, bo_short, 0, ADDR_SYNC2, bo_size_short,
> -			 &sync_short, 1);
> -	xe_wait_ufence(fd, &bo_sync_short->sync, USER_FENCE_VALUE, execenv_short.exec_queue,
> -		       INT64_MAX);
> -	bo_sync_short->sync = 0;
> -	sync_short.addr = ADDR_SYNC2;
> -
>  	if (use_loop_kernel)
>  		bo_dict_long[0].size = ALIGN(loop_kernel_size, 0x1000);
>  	else
>  		bo_dict_long[0].size = ALIGN(long_kernel_size, 0x1000);
>  	bo_dict_short[0].size = ALIGN(short_kernel_size, 0x1000);
>  
> -	bo_dict_long[10].size = ALIGN(sip_kernel_size, 0x1000);
> -	bo_dict_short[10].size = ALIGN(sip_kernel_size, 0x1000);
> -
>  	bo_execenv_bind(&execenv_long, bo_dict_long, XE2_BO_PREEMPT_DICT_ENTRIES);
>  	bo_execenv_bind(&execenv_short, bo_dict_short, XE2_BO_PREEMPT_DICT_ENTRIES);
>  
> @@ -2172,17 +2148,11 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
>  	xelpg_create_indirect_data(bo_dict_short[3].data, ADDR_INPUT, ADDR_OUTPUT, SIZE_DATA);
>  	xehp_create_surface_state(bo_dict_short[7].data, ADDR_INPUT, ADDR_OUTPUT);
>  
> -	input_data = (float *) bo_dict_long[4].data;
> -	output_data = (float *) bo_dict_short[5].data;
> -	srand(time(NULL));
> +	input_long = (float *) bo_dict_long[4].data;
> +	input_short = (float *) bo_dict_short[4].data;
> +	output_short = (float *) bo_dict_short[5].data;
>  
> -	for (int i = 0; i < SIZE_DATA; i++)
> -		input_data[i] = rand() / (float)RAND_MAX;
> -
> -	input_data = (float *) bo_dict_short[4].data;
> -
> -	for (int i = 0; i < SIZE_DATA; i++)
> -		input_data[i] = rand() / (float)RAND_MAX;
> +	bo_randomize(input_short, SIZE_DATA);
>  
>  	xe2lpg_compute_exec_compute(bo_dict_long[8].data, ADDR_GENERAL_STATE_BASE,
>  				    ADDR_SURFACE_STATE_BASE, ADDR_DYNAMIC_STATE_BASE,
> @@ -2196,66 +2166,26 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
>  				    OFFSET_INDIRECT_DATA_START, OFFSET_KERNEL, OFFSET_STATE_SIP,
>  				    false, SIZE_DATA);
>  
> -	xe_exec_sync(fd, execenv_long.exec_queue, ADDR_BATCH, &sync_long, 1);
> +	bo_execenv_exec_async(&execenv_long, ADDR_BATCH);
>  
>  	/* Wait until multiple LR jobs will start to occupy gpu */
>  	if (use_loop_kernel)
>  		sleep(1);
>  
> -	xe_exec_sync(fd, execenv_short.exec_queue, ADDR_BATCH, &sync_short, 1);
> -
> -	xe_wait_ufence(fd, &bo_sync_short->sync, USER_FENCE_VALUE, execenv_short.exec_queue,
> -		       INT64_MAX);
> +	/*
> +	 * Regardless scenario - wmtp or threadgroup short job (compute
> +	 * square) must complete first and long job must be still active.
> +	 */
> +	bo_execenv_exec(&execenv_short, ADDR_BATCH);
> +	bo_check_square(input_short, output_short, SIZE_DATA);
>  
>  	/* Check that the long kernel has not completed yet */
> -	igt_assert_neq(0, __xe_wait_ufence(fd, &bo_sync_long->sync, USER_FENCE_VALUE,
> -					   execenv_long.exec_queue, &timeout_short));
> +	igt_assert_neq(0, __xe_wait_ufence(fd, &execenv_long.bo_sync->sync, USER_FENCE_VALUE,
> +					   execenv_long.exec_queue, &timeout_one_ns));
>  	if (use_loop_kernel)
> -		((int *)bo_dict_long[4].data)[0] = MAGIC_LOOP_STOP;
> +		((int *)input_long)[0] = MAGIC_LOOP_STOP;
>  
> -	xe_wait_ufence(fd, &bo_sync_long->sync, USER_FENCE_VALUE, execenv_long.exec_queue,
> -		       INT64_MAX);
> -
> -	munmap(bo_sync_long, bo_size_long);
> -	gem_close(fd, bo_long);
> -
> -	munmap(bo_sync_short, bo_size_short);
> -	gem_close(fd, bo_short);
> -
> -	for (int i = use_loop_kernel ? 1 : 0; i < SIZE_DATA; i++) {
> -		float input = input_data[i];
> -		float output = output_data[i];
> -		float expected_output = input * input;
> -
> -		if (output != expected_output)
> -			igt_debug("[%4d] input:%f output:%f expected_output:%f\n",
> -				  i, input, output, expected_output);
> -		igt_assert_eq_double(output, expected_output);
> -	}
> -
> -	for (int i = 0; i < SIZE_DATA; i++) {
> -		float f1;
> -
> -		f1 = ((float *) bo_dict_long[5].data)[i];
> -
> -		if (threadgroup_preemption) {
> -			if (f1 < long_kernel_loop_count)
> -				igt_debug("[%4d] f1: %f != %u\n", i, f1, long_kernel_loop_count);
> -
> -			/* Final incremented value should be greater than loop count
> -			 * as the kernel is ran by multiple threads and output variable
> -			 * is shared among all threads. This enusres multiple threadgroup
> -			 * workload execution
> -			 */
> -			igt_assert(f1 > long_kernel_loop_count);
> -		} else {
> -			if (!loop_kernel) {
> -				if (f1 != long_kernel_loop_count)
> -					igt_debug("[%4d] f1: %f != %u\n", i, f1, long_kernel_loop_count);
> -				igt_assert(f1 == long_kernel_loop_count);
> -			}
> -		}
> -	}
> +	bo_execenv_sync(&execenv_long);
>  
>  	bo_execenv_unbind(&execenv_short, bo_dict_short, XE2_BO_PREEMPT_DICT_ENTRIES);
>  	bo_execenv_unbind(&execenv_long, bo_dict_long, XE2_BO_PREEMPT_DICT_ENTRIES);
> -- 
> 2.43.0
> 
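
Two details in the new flow are worth spelling out. First, the "long kernel
has not completed yet" check is a non-blocking poll: __xe_wait_ufence() is
called with a 1 ns timeout, so it returns non-zero while the fence is still
unsignaled, which is exactly what the test asserts. Second, in the WMTP case
the long job is then released explicitly via MAGIC_LOOP_STOP, while the
threadgroup case relies on the large XE2_THREADGROUP_PREEMPT_XDIM (0x200000,
i.e. about two million thread groups) to provide enough threadgroup
boundaries at which the short job can preempt. A condensed sketch of the
pattern, using only names that appear in the diff:

	int64_t timeout_one_ns = 1;

	/* Non-blocking poll: the long job must still be running after
	 * the short job has completed. */
	igt_assert_neq(0, __xe_wait_ufence(fd, &execenv_long.bo_sync->sync,
					   USER_FENCE_VALUE,
					   execenv_long.exec_queue,
					   &timeout_one_ns));

	/* WMTP only: break the shader's endless loop, then wait. */
	if (use_loop_kernel)
		((int *)input_long)[0] = MAGIC_LOOP_STOP;

	bo_execenv_sync(&execenv_long);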

