[PATCH 1/2] tests/intel/xe_exec_system_allocator: Validate prefetch of SVM with single and multiple ranges
Matthew Brost
matthew.brost at intel.com
Mon Jun 9 17:23:27 UTC 2025
On Thu, May 29, 2025 at 11:52:36AM +0000, sai.gowtham.ch at intel.com wrote:
> From: Sai Gowtham Ch <sai.gowtham.ch at intel.com>
>
> Validate prefetch of SVM with a single range and with multiple ranges,
> across different range sizes. Check that no SVM page faults are seen
> once the SVM ranges have been prefetched.
>
> v2: Enhance test to utilize smem/vram flags from the selection loop (Jonathan Cavitt)
>
> v3: Integrate prefetch tests into existing test_exec (Matthew Brost)
>
This is not what I meant at all.

Here you just moved the directed tests from the previous rev out of a
dedicated function into test_exec(), but they are still directed tests.
Directed tests are OK for very basic checkout but are ultimately close to
worthless at exposing hard-to-find issues in the KMD memory management
(e.g. races, concurrency issues, locking issues, missing TLB
invalidations, etc.). That is why this test is built to scale from
single-threaded, to multi-threaded, to multi-process (i.e. do it once, do
it many times, do it many times in parallel).

I am suggesting you build prefetch naturally into the test_exec() flow so
that the single-threaded, multi-threaded, and multi-process sections are
all generated. I've done this locally and can share it on the list if you
want, or let you try again. FWIW, doing so appears to have found a KMD bug
in the multi-threaded / multi-process sections, which I'm still root
causing.
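
Roughly the idea is something like the below, hooked into the existing
per-exec loop in test_exec() rather than a separate directed path. This is
an untested sketch, not the exact code I have locally; the prefetch call,
region index, and ufence handling just mirror what your patch already does:

	/*
	 * Sketch: prefetch the whole allocation up front on the first
	 * iteration, then fall through to the normal batch write, exec and
	 * ufence wait below. Every generated section (once, many, threads,
	 * processes) then exercises prefetch for free.
	 */
	if ((flags & PREFETCH) && !i) {
		sync[0].addr = exec_ufence ? to_user_pointer(exec_ufence) :
			addr + (char *)&data[idx].exec_sync - (char *)data;
		/* region 1 == vram, same hardcoding as in this patch */
		xe_vm_prefetch_async(fd, vm, 0, 0, addr, bo_size, sync, 1,
				     flags & VRAM ? 1 : 0);
		xe_wait_ufence(fd, exec_ufence ? exec_ufence :
			       &data[idx].exec_sync, USER_FENCE_VALUE, 0,
			       FIVE_SEC);
		if (exec_ufence)
			exec_ufence[0] = 0;
		else
			data[idx].exec_sync = 0;
	}

With something like that in place, the smem/vram and size variations can
just become extra entries in the existing section tables rather than their
own directed subtests.
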
Matt
> Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
> Cc: Matthew Brost <matthew.brost at intel.com>
> Cc: Jonathan Cavitt <jonathan.cavitt at intel.com>
> Signed-off-by: Sai Gowtham Ch <sai.gowtham.ch at intel.com>
> ---
> tests/intel/xe_exec_system_allocator.c | 135 ++++++++++++++++++++-----
> 1 file changed, 109 insertions(+), 26 deletions(-)
>
> diff --git a/tests/intel/xe_exec_system_allocator.c b/tests/intel/xe_exec_system_allocator.c
> index 06daac8c2..9596f7be3 100644
> --- a/tests/intel/xe_exec_system_allocator.c
> +++ b/tests/intel/xe_exec_system_allocator.c
> @@ -20,6 +20,7 @@
> #include "lib/igt_syncobj.h"
> #include "lib/intel_reg.h"
> #include "xe_drm.h"
> +#include "xe/xe_gt.c"
>
> #include "xe/xe_ioctl.h"
> #include "xe/xe_query.h"
> @@ -770,8 +771,11 @@ partial(int fd, struct drm_xe_engine_class_instance *eci, unsigned int flags)
> #define SYNC_EXEC (0x1 << 19)
> #define EVERY_OTHER_CHECK (0x1 << 20)
> #define MULTI_FAULT (0x1 << 21)
> +#define PREFETCH (0x1 << 22)
> +#define VRAM (0x1 << 23)
>
> #define N_MULTI_FAULT 4
> +#define MAX_BATCH_DWORDS 16
>
> /**
> * SUBTEST: once-%s
> @@ -957,7 +961,24 @@ partial(int fd, struct drm_xe_engine_class_instance *eci, unsigned int flags)
> * Description: Create multiple threads with a faults on different hardware engines to same addresses, racing between CPU and GPU access
> * Test category: stress test
> */
> -
> +/**
> + * SUBTEST: prefetch-%s
> + * Description: Test to validate functionality of Prefetch of SVM %arg[1]
> + * Test category: functionality test
> + *
> + * SUBTEST: multi-range-%s
> + * Description: Multi range Prefetch of SVM %arg[1] and check if multiple ranges are created
> + * Test category: functionality test
> + *
> + * arg[1]:
> + *
> + * @smem-SZ_4K: with size of SZ_4K on smem region
> + * @smem-SZ_64K: with size of SZ_64K on smem region
> + * @smem-SZ_2M: with size of SZ_2M on smem region
> + * @vram-SZ_4K: with size of SZ_4K on vram region
> + * @vram-SZ_64K: with size of SZ_64K on vram region
> + * @vram-SZ_2M: with size of SZ_2M on vram region
> + */
> struct test_exec_data {
> uint32_t batch[32];
> uint64_t pad;
> @@ -981,7 +1002,7 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
> size_t stride, uint32_t vm, void *alloc, pthread_barrier_t *barrier,
> unsigned int flags)
> {
> - uint64_t addr;
> + uint64_t addr, target_addr, ba_addr;
> struct drm_xe_sync sync[1] = {
> { .type = DRM_XE_SYNC_TYPE_USER_FENCE, .flags = DRM_XE_SYNC_FLAG_SIGNAL,
> .timeline_value = USER_FENCE_VALUE },
> @@ -993,15 +1014,20 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
> };
> uint32_t exec_queues[MAX_N_EXEC_QUEUES];
> struct test_exec_data *data, *next_data = NULL;
> - uint32_t bo_flags;
> + uint32_t bo_flags, expected, *result_ptr, *batch;
> uint32_t bo = 0;
> void **pending_free;
> u64 *exec_ufence = NULL;
> - int i, j, b, file_fd = -1, prev_idx;
> + int i, j, b, file_fd = -1, prev_idx, svm_pf_count_pre, svm_pf_count_pos;
> bool free_vm = false;
> size_t aligned_size = bo_size ?: xe_get_default_alignment(fd);
> size_t orig_size = bo_size;
> + size_t slice_size = bo_size;
> struct aligned_alloc_type aligned_alloc_type;
> + const char *stat = "svm_pagefault_count";
> +
> + if (flags & PREFETCH)
> + bo_size = bo_size * n_execs;
>
> if (flags & MULTI_FAULT) {
> if (!bo_size)
> @@ -1134,7 +1160,7 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
>
> for (i = 0; i < n_execs; i++) {
> int idx = !stride ? i : i * stride, next_idx = !stride
> - ? (i + 1) : (i + 1) * stride;
> + ? (i + 1) : (i + 1) * stride;
> uint64_t batch_offset = (char *)&data[idx].batch - (char *)data;
> uint64_t batch_addr = addr + batch_offset;
> uint64_t sdi_offset = (char *)&data[idx].data - (char *)data;
> @@ -1155,12 +1181,12 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
> write_dword(data[idx].batch, sdi_addr + j * orig_size,
> WRITE_VALUE(&data[idx], idx), &b);
> igt_assert(b <= ARRAY_SIZE(data[idx].batch));
> - } else if (!(flags & EVERY_OTHER_CHECK)) {
> + } else if (!(flags & EVERY_OTHER_CHECK) && !(flags & PREFETCH)) {
> b = 0;
> write_dword(data[idx].batch, sdi_addr,
> WRITE_VALUE(&data[idx], idx), &b);
> igt_assert(b <= ARRAY_SIZE(data[idx].batch));
> - } else if (flags & EVERY_OTHER_CHECK && !odd(i)) {
> + } else if (flags & EVERY_OTHER_CHECK && !odd(i) && !(flags & PREFETCH)) {
> b = 0;
> write_dword(data[idx].batch, sdi_addr,
> WRITE_VALUE(&data[idx], idx), &b);
> @@ -1177,28 +1203,36 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
> (char *)&data[next_idx].data - (char *)data,
> WRITE_VALUE(&data[next_idx], next_idx), &b);
> igt_assert(b <= ARRAY_SIZE(data[next_idx].batch));
> - }
> + } else if (flags & PREFETCH) {
> + batch = (uint32_t *)((uint8_t *)data + i * slice_size);
> + target_addr = addr + i * slice_size + 0x100;
> + b = 0;
>
> - if (!exec_ufence)
> - data[idx].exec_sync = 0;
> + igt_assert(b + 5 <= MAX_BATCH_DWORDS);
> + write_dword(batch, target_addr, 0xDEADBEEF + i, &b);
> + }
> + if (!(flags & PREFETCH)) {
> + if (!exec_ufence)
> + data[idx].exec_sync = 0;
>
> - sync[0].addr = exec_ufence ? to_user_pointer(exec_ufence) :
> - addr + (char *)&data[idx].exec_sync - (char *)data;
> + sync[0].addr = exec_ufence ? to_user_pointer(exec_ufence) :
> + addr + (char *)&data[idx].exec_sync - (char *)data;
>
> - exec.exec_queue_id = exec_queues[e];
> - if (fault_inject)
> - exec.address = batch_addr * 2;
> - else
> - exec.address = batch_addr;
> + exec.exec_queue_id = exec_queues[e];
> + if (fault_inject)
> + exec.address = batch_addr * 2;
> + else
> + exec.address = batch_addr;
>
> - if (fault_injected) {
> - err = __xe_exec(fd, &exec);
> - igt_assert(err == -ENOENT);
> - } else {
> - xe_exec(fd, &exec);
> + if (fault_injected) {
> + err = __xe_exec(fd, &exec);
> + igt_assert(err == -ENOENT);
> + } else {
> + xe_exec(fd, &exec);
> + }
> }
>
> - if (barrier)
> + if (barrier && !(flags & PREFETCH))
> pthread_barrier_wait(barrier);
>
> if (fault_inject || fault_injected) {
> @@ -1209,7 +1243,7 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
> USER_FENCE_VALUE,
> exec_queues[e], &timeout);
> igt_assert(err == -ETIME || err == -EIO);
> - } else {
> + } else if (!(flags & PREFETCH)) {
> xe_wait_ufence(fd, exec_ufence ? exec_ufence :
> &data[idx].exec_sync, USER_FENCE_VALUE,
> exec_queues[e], FIVE_SEC);
> @@ -1289,8 +1323,7 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
> READ_VALUE(&data[prev_idx]));
> }
> }
> -
> - if (exec_ufence)
> + if (!(flags & PREFETCH) && exec_ufence)
> exec_ufence[0] = 0;
>
> if (bo) {
> @@ -1355,6 +1388,31 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
> prev_idx = idx;
> }
>
> + if (flags & PREFETCH) {
> + sync[0].addr = to_user_pointer(exec_ufence);
> + xe_vm_prefetch_async(fd, vm, 0, 0, addr, bo_size, sync, 1, flags & VRAM ? 1 : 0);
> + xe_wait_ufence(fd, exec_ufence, USER_FENCE_VALUE, 0, NSEC_PER_SEC);
> +
> + for (i = 0; i < n_execs; i++) {
> + int e = i % n_exec_queues;
> + result_ptr = (uint32_t *)((uint8_t *)data + i * slice_size + 0x100);
> + expected = 0xDEADBEEF + i;
> +
> + svm_pf_count_pre = xe_gt_stats_get_count(fd, eci->gt_id, stat);
> + ba_addr = addr + i * slice_size;
> + exec.exec_queue_id = exec_queues[e];
> + exec.address = ba_addr;
> + exec_ufence[0] = 0;
> + sync[0].addr = to_user_pointer(exec_ufence);
> + xe_exec(fd, &exec);
> + svm_pf_count_pos = xe_gt_stats_get_count(fd, eci->gt_id, stat);
> + igt_assert(svm_pf_count_pre == svm_pf_count_pos);
> + xe_wait_ufence(fd, exec_ufence, USER_FENCE_VALUE, exec_queues[e], NSEC_PER_SEC);
> + exec_ufence[0] = 0;
> + igt_assert_eq(*result_ptr, expected);
> + }
> + }
> +
> if (bo) {
> __xe_vm_bind_assert(fd, vm, 0,
> 0, 0, addr, bo_size,
> @@ -1598,6 +1656,19 @@ struct section {
> igt_main
> {
> struct drm_xe_engine_class_instance *hwe;
> + const struct mode {
> + const char *name;
> + unsigned int flags;
> + size_t size;
> + } mode[] = {
> + { "smem-SZ_4K", PREFETCH, SZ_4K},
> + { "smem-SZ_64K", PREFETCH, SZ_64K},
> + { "smem-SZ_2M", PREFETCH, SZ_2M},
> + { "vram-SZ_4K", PREFETCH, SZ_4K},
> + { "vram-SZ_64K", PREFETCH | VRAM, SZ_64K},
> + { "vram-SZ_2M", PREFETCH | VRAM, SZ_2M},
> + { NULL },
> + }, *m;
> const struct section sections[] = {
> { "malloc", 0 },
> { "malloc-multi-fault", MULTI_FAULT },
> @@ -1792,6 +1863,18 @@ igt_main
> processes(fd, 16, 128, SZ_2M, 0, s->flags);
> }
>
> + for (m = mode; m->name; m++) {
> + igt_subtest_f("prefetch-%s", m->name)
> + xe_for_each_engine(fd, hwe)
> + test_exec(fd, hwe, 1, 1, m->size, 0, 0, NULL,
> + NULL, m->flags);
> +
> + igt_subtest_f("multi-range-%s", m->name)
> + xe_for_each_engine(fd, hwe)
> + test_exec(fd, hwe, 1, 10, m->size, 0, 0, NULL,
> + NULL, m->flags);
> + }
> +
> igt_subtest("threads-shared-vm-shared-alloc-many-stride-malloc")
> threads(fd, 1, 128, 0, 256, SHARED_ALLOC, true);
>
> --
> 2.34.1
>