[PATCH i-g-t] tests/intel/xe_compute_preempt: Add compute threadgroup preempt test

Thu Mar 14 21:31:38 UTC 2024

On 3/14/2024 2:00 PM, janga.rahul.kumar at intel.com wrote:
> From: Janga Rahul Kumar<janga.rahul.kumar at intel.com>
>
> Test submits a long kernel with a higher threadgroup count and a lower
> iteration count, together with a short OpenCL kernel, to exercise the
> threadgroup preemption scenario with WMTP disabled.
>
> v2: Use macros instead of const values. Add documentation for
>      validation check. (Nirmoy)
>
> Cc: Nirmoy Das<nirmoy.das at intel.com>
> Signed-off-by: Janga Rahul Kumar<janga.rahul.kumar at intel.com>
> Reviewed-by: Nirmoy Das<nirmoy.das at intel.com>
> ---
>   lib/intel_compute.c              | 74 +++++++++++++++++++++++++-------
>   lib/intel_compute.h              |  3 +-
>   tests/intel/xe_compute_preempt.c | 24 +++++++++--
>   3 files changed, 80 insertions(+), 21 deletions(-)
>
> diff --git a/lib/intel_compute.c b/lib/intel_compute.c
> index c5d253ebc..9d3b97efe 100644
> --- a/lib/intel_compute.c
> +++ b/lib/intel_compute.c
> @@ -43,6 +43,13 @@
>   #define XE2_ADDR_STATE_CONTEXT_DATA_BASE	0x900000UL
>   #define OFFSET_STATE_SIP			0xFFFF0000
>   
> +/*
> + * TGP  - ThreadGroup Preemption
> + * WMTP - Walker Mid Thread Preemption
> + */
> +#define TGP_long_kernel_loop_count		10
> +#define WMTP_long_kernel_loop_count		1000000
> +
>   struct bo_dict_entry {
>   	uint64_t addr;
>   	uint32_t size;
> @@ -1162,7 +1169,8 @@ static void xe2lpg_compute_exec_compute(uint32_t *addr_bo_buffer_batch,
>   					uint64_t addr_state_contect_data_base,
>   					uint64_t offset_indirect_data_start,
>   					uint64_t kernel_start_pointer,
> -					uint64_t sip_start_pointer)
> +					uint64_t sip_start_pointer,
> +					bool	 threadgroup_preemption)
>   {
>   	int b = 0;
>   
> @@ -1236,7 +1244,12 @@ static void xe2lpg_compute_exec_compute(uint32_t *addr_bo_buffer_batch,
>   	addr_bo_buffer_batch[b++] = 0xbe040000;
>   	addr_bo_buffer_batch[b++] = 0xffffffff;
>   	addr_bo_buffer_batch[b++] = 0x000003ff;
> -	addr_bo_buffer_batch[b++] = 0x00000002;
> +
> +	if (threadgroup_preemption)
> +		addr_bo_buffer_batch[b++] = 0x00200000; // Global workgroup size
> +	else
> +		addr_bo_buffer_batch[b++] = 0x00000002;
> +
>   	addr_bo_buffer_batch[b++] = 0x00000001;
>   	addr_bo_buffer_batch[b++] = 0x00000001;
>   	addr_bo_buffer_batch[b++] = 0x00000000;
> @@ -1251,7 +1264,12 @@ static void xe2lpg_compute_exec_compute(uint32_t *addr_bo_buffer_batch,
>   
>   	addr_bo_buffer_batch[b++] = kernel_start_pointer;
>   	addr_bo_buffer_batch[b++] = 0x00000000;
> -	addr_bo_buffer_batch[b++] = 0x00100000; // Enable Thread Preemption BitField:20
> +
> +	if (threadgroup_preemption)
> +		addr_bo_buffer_batch[b++] = 0x00000000;
> +	else
> +		addr_bo_buffer_batch[b++] = 0x00100000; // Enable Mid Thread Preemption BitField:20
> +
>   	addr_bo_buffer_batch[b++] = 0x00000000;
>   	addr_bo_buffer_batch[b++] = 0x00000000;
>   	addr_bo_buffer_batch[b++] = 0x0c000020;
> @@ -1369,7 +1387,7 @@ static void xe2lpg_compute_exec(int fd, const unsigned char *kernel,
>   				  ADDR_INSTRUCTION_STATE_BASE,
>   				  XE2_ADDR_STATE_CONTEXT_DATA_BASE,
>   				  OFFSET_INDIRECT_DATA_START,
> -				  OFFSET_KERNEL, 0);
> +				  OFFSET_KERNEL, 0, false);
>   
>   	bo_execenv_exec(&execenv, ADDR_BATCH);
>   
> @@ -1527,7 +1545,8 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
>   					unsigned int short_kernel_size,
>   					const unsigned char *sip_kernel,
>   					unsigned int sip_kernel_size,
> -					struct drm_xe_engine_class_instance *eci)
> +					struct drm_xe_engine_class_instance *eci,
> +					bool threadgroup_preemption)
>   {
>   #define XE2_BO_PREEMPT_DICT_ENTRIES 11
>   	struct bo_dict_entry bo_dict_long[XE2_BO_PREEMPT_DICT_ENTRIES] = {
> @@ -1564,6 +1583,7 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
>   	struct bo_dict_entry bo_dict_short[XE2_BO_PREEMPT_DICT_ENTRIES];
>   	struct bo_execenv execenv_short, execenv_long;
>   	float *dinput;
> +	unsigned int long_kernel_loop_count;
>   	struct drm_xe_sync sync_long = {
>   		.type = DRM_XE_SYNC_TYPE_SYNCOBJ,
>   		.flags = DRM_XE_SYNC_FLAG_SIGNAL,
> @@ -1574,7 +1594,11 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
>   		.flags = DRM_XE_SYNC_FLAG_SIGNAL,
>   		.handle = syncobj_create(fd, 0),
>   	};
> -	unsigned int long_kernel_loop_count = 1000000;
> +
> +	if (threadgroup_preemption)
> +		long_kernel_loop_count = TGP_long_kernel_loop_count;
> +	else
> +		long_kernel_loop_count = WMTP_long_kernel_loop_count;
>   
>   	for (int i = 0; i < XE2_BO_PREEMPT_DICT_ENTRIES; ++i)
>   		bo_dict_short[i] = bo_dict_long[i];
> @@ -1622,12 +1646,12 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
>   	xe2lpg_compute_exec_compute(bo_dict_long[8].data, ADDR_GENERAL_STATE_BASE,
>   				    ADDR_SURFACE_STATE_BASE, ADDR_DYNAMIC_STATE_BASE,
>   				    ADDR_INSTRUCTION_STATE_BASE, XE2_ADDR_STATE_CONTEXT_DATA_BASE,
> -				    OFFSET_INDIRECT_DATA_START, OFFSET_KERNEL, OFFSET_STATE_SIP);
> +				    OFFSET_INDIRECT_DATA_START, OFFSET_KERNEL, OFFSET_STATE_SIP, threadgroup_preemption);
>   
>   	xe2lpg_compute_exec_compute(bo_dict_short[8].data, ADDR_GENERAL_STATE_BASE,
>   				    ADDR_SURFACE_STATE_BASE, ADDR_DYNAMIC_STATE_BASE,
>   				    ADDR_INSTRUCTION_STATE_BASE, XE2_ADDR_STATE_CONTEXT_DATA_BASE,
> -				    OFFSET_INDIRECT_DATA_START, OFFSET_KERNEL, OFFSET_STATE_SIP);
> +				    OFFSET_INDIRECT_DATA_START, OFFSET_KERNEL, OFFSET_STATE_SIP, false);
>   
>   	xe_exec_sync(fd, execenv_long.exec_queue, ADDR_BATCH, &sync_long, 1);
>   
> @@ -1655,9 +1679,21 @@ static void xe2lpg_compute_preempt_exec(int fd, const unsigned char *long_kernel
>   
>   		f1 = ((float *) bo_dict_long[5].data)[i];
>   
> -		if (f1 != long_kernel_loop_count)
> -			igt_debug("[%4d] f1: %f != %u\n", i, f1, long_kernel_loop_count);
> -		igt_assert(f1 == long_kernel_loop_count);
> +		if (threadgroup_preemption) {
> +			if (f1 < long_kernel_loop_count)
> +				igt_debug("[%4d] f1: %f != %u\n", i, f1, long_kernel_loop_count);
> +
> +			/* Final incremented value should be greater than loop count
> +			 * as the kernel is run by multiple threads and the output variable
> +			 * is shared among all threads. This ensures multiple threadgroup
> +			 * workload execution.
> +			 */
> +			igt_assert(f1 > long_kernel_loop_count);
> +		} else {
> +			if (f1 != long_kernel_loop_count)
> +				igt_debug("[%4d] f1: %f != %u\n", i, f1, long_kernel_loop_count);
> +			igt_assert(f1 == long_kernel_loop_count);
> +		}
>   	}
>   
>   	bo_execenv_unbind(&execenv_short, bo_dict_short, XE2_BO_PREEMPT_DICT_ENTRIES);
> @@ -1675,7 +1711,8 @@ static const struct {
>   			     unsigned int short_kernel_size,
>   			     const unsigned char *sip_kernel,
>   			     unsigned int sip_kernel_size,
> -			     struct drm_xe_engine_class_instance *eci);
> +			     struct drm_xe_engine_class_instance *eci,
> +			     bool threadgroup_preemption);
>   	uint32_t compat;
>   } intel_compute_preempt_batches[] = {
>   	{
> @@ -1686,7 +1723,8 @@ static const struct {
>   };
>   
>   static bool __run_intel_compute_kernel_preempt(int fd,
> -		struct drm_xe_engine_class_instance *eci)
> +		struct drm_xe_engine_class_instance *eci,
> +		bool threadgroup_preemption)
>   {
>   	unsigned int ip_ver = intel_graphics_ver(intel_get_drm_devid(fd));
>   	unsigned int batch;
> @@ -1724,7 +1762,8 @@ static bool __run_intel_compute_kernel_preempt(int fd,
>   							  kernels->kernel, kernels->size,
>   							  kernels->sip_kernel,
>   							  kernels->sip_kernel_size,
> -							  eci);
> +							  eci,
> +							  threadgroup_preemption);
>   
>   	return true;
>   }
> @@ -1733,11 +1772,14 @@ static bool __run_intel_compute_kernel_preempt(int fd,
>    * exercise preemption scenario.
>    *
>    * @fd: file descriptor of the opened DRM Xe device
> + * @eci: engine class instance
> + * @threadgroup_preemption: enable/disable threadgroup preemption test
>    *
>    * Returns true on success, false otherwise.
>    */
>   bool run_intel_compute_kernel_preempt(int fd,
> -		struct drm_xe_engine_class_instance *eci)
> +		struct drm_xe_engine_class_instance *eci,
> +		bool threadgroup_preemption)
>   {
> -	return __run_intel_compute_kernel_preempt(fd, eci);
> +	return __run_intel_compute_kernel_preempt(fd, eci, threadgroup_preemption);
>   }
> diff --git a/lib/intel_compute.h b/lib/intel_compute.h
> index fe9637b91..3c2cd010c 100644
> --- a/lib/intel_compute.h
> +++ b/lib/intel_compute.h
> @@ -37,5 +37,6 @@ extern const struct intel_compute_kernels intel_compute_square_kernels[];
>   
>   bool run_intel_compute_kernel(int fd);
>   bool xe_run_intel_compute_kernel_on_engine(int fd, struct drm_xe_engine_class_instance *eci);
> -bool run_intel_compute_kernel_preempt(int fd, struct drm_xe_engine_class_instance *eci);
> +bool run_intel_compute_kernel_preempt(int fd, struct drm_xe_engine_class_instance *eci,
> +				      bool threadgroup_preemption);
>   #endif	/* INTEL_COMPUTE_H */
> diff --git a/tests/intel/xe_compute_preempt.c b/tests/intel/xe_compute_preempt.c
> index 0aeb10547..2bc27eff1 100644
> --- a/tests/intel/xe_compute_preempt.c
> +++ b/tests/intel/xe_compute_preempt.c
> @@ -27,11 +27,16 @@
>    * Description:
>    *      Exercise multiple walker mid thread preemption scenario
>    * Functionality: compute openCL kernel
> + * SUBTEST: compute-threadgroup-preempt
> + * GPU requirement: LNL
> + * Description:
> + *      Exercise compute walker threadgroup preemption scenario
> + * Functionality: compute openCL kernel
>    */
>   static void
> -test_compute_preempt(int fd, struct drm_xe_engine_class_instance *hwe)
> +test_compute_preempt(int fd, struct drm_xe_engine_class_instance *hwe, bool threadgroup_preemption)
>   {
> -	igt_require_f(run_intel_compute_kernel_preempt(fd, hwe), "GPU not supported\n");
> +	igt_require_f(run_intel_compute_kernel_preempt(fd, hwe, threadgroup_preemption), "GPU not supported\n");
>   }
>   
>   igt_main
> @@ -49,7 +54,7 @@ igt_main
>   				continue;
>   
>   			igt_dynamic_f("engine-%s", xe_engine_class_string(hwe->engine_class))
> -				test_compute_preempt(xe, hwe);
> +				test_compute_preempt(xe, hwe, false);
>   		}
>   	}
>   
> @@ -61,12 +66,23 @@ igt_main
>   
>   			igt_dynamic_f("engine-%s", xe_engine_class_string(hwe->engine_class)) {
>   				igt_fork(child, 100)
> -					test_compute_preempt(xe, hwe);
> +					test_compute_preempt(xe, hwe, false);
>   				igt_waitchildren();
>   			}
>   		}
>   	}
>   
> +	igt_subtest_with_dynamic("compute-threadgroup-preempt") {
> +		xe_for_each_engine(xe, hwe) {
> +			if (hwe->engine_class != DRM_XE_ENGINE_CLASS_COMPUTE &&
> +			    hwe->engine_class != DRM_XE_ENGINE_CLASS_RENDER)
> +				continue;
> +
> +			igt_dynamic_f("engine-%s", xe_engine_class_string(hwe->engine_class))
> +			test_compute_preempt(xe, hwe, true);
> +		}
> +	}
> +
>   	igt_fixture
>   		drm_close_driver(xe);
>   

> Reviewed-by: Jagmeet Randhawa <jagmeet.randhawa at intel.com> 
> <mailto:jagmeet.randhawa at intel.com>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/igt-dev/attachments/20240314/cab51fe6/attachment-0001.htm>