[PATCH 1/2] tests/intel/xe_exec_system_allocator: Validate prefetch of SVM with single and multiple ranges

sai.gowtham.ch at intel.com
Thu May 29 11:52:36 UTC 2025


From: Sai Gowtham Ch <sai.gowtham.ch at intel.com>

Test validates prefetch of SVM with a single range and with multiple ranges,
across different range sizes, and checks that no SVM pagefaults are seen while
executing in prefetched SVM ranges.
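
The no-fault check brackets each exec with the per-GT SVM pagefault
counter. A minimal sketch of the idea, using the same helpers the test
calls below (gt_id and exec_queue stand in for the real arguments):

    int pre = xe_gt_stats_get_count(fd, gt_id, "svm_pagefault_count");

    xe_exec(fd, &exec);	/* batch runs inside an already-prefetched range */
    xe_wait_ufence(fd, exec_ufence, USER_FENCE_VALUE, exec_queue, NSEC_PER_SEC);

    int post = xe_gt_stats_get_count(fd, gt_id, "svm_pagefault_count");
    igt_assert(pre == post);	/* prefetch was effective: no new faults */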

v2: Enhance test to utilize smem/vram flags from the selection loop (Jonathan Cavitt)

v3: Integrate prefetch tests in existing test_exec (Matthew Brost)

Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
Cc: Matthew Brost <matthew.brost at intel.com>
Cc: Jonathan Cavitt <jonathan.cavitt at intel.com>
Signed-off-by: Sai Gowtham Ch <sai.gowtham.ch at intel.com>
---
 tests/intel/xe_exec_system_allocator.c | 135 ++++++++++++++++++++-----
 1 file changed, 109 insertions(+), 26 deletions(-)

diff --git a/tests/intel/xe_exec_system_allocator.c b/tests/intel/xe_exec_system_allocator.c
index 06daac8c2..9596f7be3 100644
--- a/tests/intel/xe_exec_system_allocator.c
+++ b/tests/intel/xe_exec_system_allocator.c
@@ -20,6 +20,7 @@
 #include "lib/igt_syncobj.h"
 #include "lib/intel_reg.h"
 #include "xe_drm.h"
+#include "xe/xe_gt.h"
 
 #include "xe/xe_ioctl.h"
 #include "xe/xe_query.h"
@@ -770,8 +771,11 @@ partial(int fd, struct drm_xe_engine_class_instance *eci, unsigned int flags)
 #define SYNC_EXEC		(0x1 << 19)
 #define EVERY_OTHER_CHECK	(0x1 << 20)
 #define MULTI_FAULT		(0x1 << 21)
+#define PREFETCH		(0x1 << 22)
+#define VRAM			(0x1 << 23)
 
 #define N_MULTI_FAULT		4
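+/* Upper bound on dwords a prefetch-mode batch slice may emit. */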
+#define MAX_BATCH_DWORDS	16
 
 /**
  * SUBTEST: once-%s
@@ -957,7 +961,24 @@ partial(int fd, struct drm_xe_engine_class_instance *eci, unsigned int flags)
  * Description: Create multiple threads with a faults on different hardware engines to same addresses, racing between CPU and GPU access
  * Test category: stress test
  */
 
+/**
+ * SUBTEST: prefetch-%s
+ * Description: Validate functionality of prefetch of SVM %arg[1]
+ * Test category: functionality test
+ *
+ * SUBTEST: multi-range-%s
+ * Description: Multi-range prefetch of SVM %arg[1], checking that multiple ranges are created
+ * Test category: functionality test
+ *
+ * arg[1]:
+ *
+ * @smem-SZ_4K: with size SZ_4K in the smem region
+ * @smem-SZ_64K: with size SZ_64K in the smem region
+ * @smem-SZ_2M: with size SZ_2M in the smem region
+ * @vram-SZ_4K: with size SZ_4K in the vram region
+ * @vram-SZ_64K: with size SZ_64K in the vram region
+ * @vram-SZ_2M: with size SZ_2M in the vram region
+ */
 struct test_exec_data {
 	uint32_t batch[32];
 	uint64_t pad;
@@ -981,7 +1002,7 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
 	  size_t stride, uint32_t vm, void *alloc, pthread_barrier_t *barrier,
 	  unsigned int flags)
 {
-	uint64_t addr;
+	uint64_t addr, target_addr, ba_addr;
 	struct drm_xe_sync sync[1] = {
 		{ .type = DRM_XE_SYNC_TYPE_USER_FENCE, .flags = DRM_XE_SYNC_FLAG_SIGNAL,
 	          .timeline_value = USER_FENCE_VALUE },
@@ -993,15 +1014,20 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
 	};
 	uint32_t exec_queues[MAX_N_EXEC_QUEUES];
 	struct test_exec_data *data, *next_data = NULL;
-	uint32_t bo_flags;
+	uint32_t bo_flags, expected, *result_ptr, *batch;
 	uint32_t bo = 0;
 	void **pending_free;
 	u64 *exec_ufence = NULL;
-	int i, j, b, file_fd = -1, prev_idx;
+	int i, j, b, file_fd = -1, prev_idx, svm_pf_count_pre, svm_pf_count_post;
 	bool free_vm = false;
 	size_t aligned_size = bo_size ?: xe_get_default_alignment(fd);
 	size_t orig_size = bo_size;
+	size_t slice_size = bo_size;
 	struct aligned_alloc_type aligned_alloc_type;
+	const char *stat = "svm_pagefault_count";
+
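+	/*
+	 * In prefetch mode a single allocation is carved into n_execs
+	 * slices of slice_size bytes; the whole range is prefetched in
+	 * one call and then executed slice by slice.
+	 */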
+	if (flags & PREFETCH)
+		bo_size = bo_size * n_execs;
 
 	if (flags & MULTI_FAULT) {
 		if (!bo_size)
@@ -1134,7 +1160,7 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
 
 	for (i = 0; i < n_execs; i++) {
 		int idx = !stride ? i : i * stride, next_idx = !stride
 			? (i + 1) : (i + 1) * stride;
 		uint64_t batch_offset = (char *)&data[idx].batch - (char *)data;
 		uint64_t batch_addr = addr + batch_offset;
 		uint64_t sdi_offset = (char *)&data[idx].data - (char *)data;
@@ -1155,12 +1181,12 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
 			write_dword(data[idx].batch, sdi_addr + j * orig_size,
 				    WRITE_VALUE(&data[idx], idx), &b);
 			igt_assert(b <= ARRAY_SIZE(data[idx].batch));
-		} else if (!(flags & EVERY_OTHER_CHECK)) {
+		} else if (!(flags & EVERY_OTHER_CHECK) && !(flags & PREFETCH)) {
 			b = 0;
 			write_dword(data[idx].batch, sdi_addr,
 				    WRITE_VALUE(&data[idx], idx), &b);
 			igt_assert(b <= ARRAY_SIZE(data[idx].batch));
-		} else if (flags & EVERY_OTHER_CHECK && !odd(i)) {
+		} else if (flags & EVERY_OTHER_CHECK && !odd(i) && !(flags & PREFETCH)) {
 			b = 0;
 			write_dword(data[idx].batch, sdi_addr,
 				    WRITE_VALUE(&data[idx], idx), &b);
@@ -1177,28 +1203,36 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
 				    (char *)&data[next_idx].data - (char *)data,
 				    WRITE_VALUE(&data[next_idx], next_idx), &b);
 			igt_assert(b <= ARRAY_SIZE(data[next_idx].batch));
-		}
+		} else if (flags & PREFETCH) {
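+			/*
+			 * Each exec gets its own batch at the start of its
+			 * slice and stores a per-slice magic value, checked
+			 * after the prefetch below.
+			 */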
+			batch = (uint32_t *)((uint8_t *)data + i * slice_size);
+			target_addr = addr + i * slice_size + 0x100;
+			b = 0;
 
-		if (!exec_ufence)
-			data[idx].exec_sync = 0;
+			igt_assert(b + 5 <= MAX_BATCH_DWORDS);
+			write_dword(batch, target_addr, 0xDEADBEEF + i, &b);
+		}
+		if (!(flags & PREFETCH)) {
+			if (!exec_ufence)
+				data[idx].exec_sync = 0;
 
-		sync[0].addr = exec_ufence ? to_user_pointer(exec_ufence) :
-			addr + (char *)&data[idx].exec_sync - (char *)data;
+			sync[0].addr = exec_ufence ? to_user_pointer(exec_ufence) :
+				       addr + (char *)&data[idx].exec_sync - (char *)data;
 
-		exec.exec_queue_id = exec_queues[e];
-		if (fault_inject)
-			exec.address = batch_addr * 2;
-		else
-			exec.address = batch_addr;
+			exec.exec_queue_id = exec_queues[e];
+			if (fault_inject)
+				exec.address = batch_addr * 2;
+			else
+				exec.address = batch_addr;
 
-		if (fault_injected) {
-			err = __xe_exec(fd, &exec);
-			igt_assert(err == -ENOENT);
-		} else {
-			xe_exec(fd, &exec);
+			if (fault_injected) {
+				err = __xe_exec(fd, &exec);
+				igt_assert(err == -ENOENT);
+			} else {
+				xe_exec(fd, &exec);
+			}
 		}
 
-		if (barrier)
+		if (barrier && !(flags & PREFETCH))
 			pthread_barrier_wait(barrier);
 
 		if (fault_inject || fault_injected) {
@@ -1209,7 +1243,7 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
 					       USER_FENCE_VALUE,
 					       exec_queues[e], &timeout);
 			igt_assert(err == -ETIME || err == -EIO);
-		} else {
+		} else if (!(flags & PREFETCH)) {
 			xe_wait_ufence(fd, exec_ufence ? exec_ufence :
 				       &data[idx].exec_sync, USER_FENCE_VALUE,
 				       exec_queues[e], FIVE_SEC);
@@ -1289,8 +1323,7 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
 						      READ_VALUE(&data[prev_idx]));
 			}
 		}
 
-		if (exec_ufence)
+		if (!(flags & PREFETCH) && exec_ufence)
 			exec_ufence[0] = 0;
 
 		if (bo) {
@@ -1355,6 +1388,31 @@ test_exec(int fd, struct drm_xe_engine_class_instance *eci,
 		prev_idx = idx;
 	}
 
+	if (flags & PREFETCH) {
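+		/*
+		 * Prefetch the entire allocation in one call; execution and
+		 * the no-fault check then run slice by slice.
+		 */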
+		sync[0].addr = to_user_pointer(exec_ufence);
+		xe_vm_prefetch_async(fd, vm, 0, 0, addr, bo_size, sync, 1,
+				     flags & VRAM ? 1 : 0);
+		xe_wait_ufence(fd, exec_ufence, USER_FENCE_VALUE, 0, NSEC_PER_SEC);
+
+		for (i = 0; i < n_execs; i++) {
+			int e = i % n_exec_queues;
+			result_ptr = (uint32_t *)((uint8_t *)data + i * slice_size + 0x100);
+			expected = 0xDEADBEEF + i;
+
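+			/*
+			 * Sample the GT SVM pagefault counter around each
+			 * exec; a prefetched range must not fault.
+			 */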
+			svm_pf_count_pre = xe_gt_stats_get_count(fd, eci->gt_id, stat);
+			ba_addr = addr + i * slice_size;
+			exec.exec_queue_id = exec_queues[e];
+			exec.address = ba_addr;
+			exec_ufence[0] = 0;
+			sync[0].addr = to_user_pointer(exec_ufence);
+			xe_exec(fd, &exec);
+			xe_wait_ufence(fd, exec_ufence, USER_FENCE_VALUE,
+				       exec_queues[e], NSEC_PER_SEC);
+			svm_pf_count_post = xe_gt_stats_get_count(fd, eci->gt_id, stat);
+			igt_assert(svm_pf_count_pre == svm_pf_count_post);
+			exec_ufence[0] = 0;
+			igt_assert_eq(*result_ptr, expected);
+		}
+	}
+
 	if (bo) {
 		__xe_vm_bind_assert(fd, vm, 0,
 				    0, 0, addr, bo_size,
@@ -1598,6 +1656,19 @@ struct section {
 igt_main
 {
 	struct drm_xe_engine_class_instance *hwe;
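+	/* Placement/size matrix for the prefetch subtests. */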
+	const struct mode {
+		const char *name;
+		unsigned int flags;
+		size_t size;
+	} mode[] = {
+		{ "smem-SZ_4K", PREFETCH, SZ_4K},
+		{ "smem-SZ_64K", PREFETCH, SZ_64K},
+		{ "smem-SZ_2M", PREFETCH, SZ_2M},
+		{ "vram-SZ_4K", PREFETCH, SZ_4K},
+		{ "vram-SZ_64K", PREFETCH | VRAM, SZ_64K},
+		{ "vram-SZ_2M", PREFETCH | VRAM, SZ_2M},
+		{ NULL },
+	}, *m;
 	const struct section sections[] = {
 		{ "malloc", 0 },
 		{ "malloc-multi-fault", MULTI_FAULT },
@@ -1792,6 +1863,18 @@ igt_main
 			processes(fd, 16, 128, SZ_2M, 0, s->flags);
 	}
 
+	for (m = mode; m->name; m++) {
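+		/*
+		 * n_execs = 1 exercises a single range; n_execs = 10 forces
+		 * multiple SVM ranges within one prefetched allocation.
+		 */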
+                igt_subtest_f("prefetch-%s", m->name)
+                        xe_for_each_engine(fd, hwe)
+                                test_exec(fd, hwe, 1, 1, m->size, 0, 0, NULL,
+                                          NULL, m->flags);
+
+		igt_subtest_f("multi-range-%s", m->name)
+			xe_for_each_engine(fd, hwe)
+				test_exec(fd, hwe, 1, 10, m->size, 0, 0, NULL,
+					  NULL, m->flags);
+	}
+
 	igt_subtest("threads-shared-vm-shared-alloc-many-stride-malloc")
 		threads(fd, 1, 128, 0, 256, SHARED_ALLOC, true);
 
-- 
2.34.1


