[PATCH 2/2] tests/intel/xe_exec_system_allocator: Add prefetch sections

Thu Jul 3 17:51:21 UTC 2025

On Thu, Jun 26, 2025 at 11:25:47AM -0600, Ch, Sai Gowtham wrote:
> Hi Matt, 
> 
> I was revisiting the code and here are my few observations,
> 
> >-----Original Message-----
> >From: igt-dev <igt-dev-bounces at lists.freedesktop.org> On Behalf Of Matthew
> >Brost
> >Sent: Monday, June 16, 2025 12:37 PM
> >To: igt-dev at lists.freedesktop.org
> >Subject: [PATCH 2/2] tests/intel/xe_exec_system_allocator: Add prefetch sections
> >
> >Update the test_exec() function with flags to initiate and understand prefetches.
> >Prefetching is tested by verifying that the fault count in the GT stats has not
> >increased. However, a mismatch does not cause the test to fail—instead, a
> >warning  is printed. This is due to the unreliability of system allocator behavior; the
> >added sections primarily serve to ensure that the driver remains stable.
> >
> >That said, if the page fault counts are consistently incorrect, it may indicate a
> >problem in the KMD.
> >
> >A prefetch benchmark section has also been added to help profile prefetch
> >behavior  in the KMD.
> >
> >Signed-off-by: Matthew Brost <matthew.brost at intel.com>
> >---
> > tests/intel/xe_exec_system_allocator.c | 121 ++++++++++++++++++++++++-
> > 1 file changed, 116 insertions(+), 5 deletions(-)
> >
> >diff --git a/tests/intel/xe_exec_system_allocator.c
> >b/tests/intel/xe_exec_system_allocator.c
> >index 65d281c784..6899da5c44 100644
> >--- a/tests/intel/xe_exec_system_allocator.c
> >+++ b/tests/intel/xe_exec_system_allocator.c
> >@@ -21,6 +21,7 @@
> > #include "lib/intel_reg.h"
> > #include "xe_drm.h"
> >
> >+#include "xe/xe_gt.h"
> > #include "xe/xe_ioctl.h"
> > #include "xe/xe_query.h"
> > #include <string.h>
> >@@ -770,6 +771,10 @@ partial(int fd, struct drm_xe_engine_class_instance *eci,
> >unsigned int flags)
> > #define SYNC_EXEC		(0x1 << 19)
> > #define EVERY_OTHER_CHECK	(0x1 << 20)
> > #define MULTI_FAULT		(0x1 << 21)
> >+#define PREFETCH		(0x1 << 22)
> >+#define THREADS			(0x1 << 23)
> >+#define PROCESSES		(0x1 << 24)
> >+#define PREFETCH_BENCHMARK	(0x1 << 25)
> >
> > #define N_MULTI_FAULT		4
> >
> >@@ -878,14 +883,17 @@ partial(int fd, struct drm_xe_engine_class_instance
> >*eci, unsigned int flags)
> >  * arg[1]:
> >  *
> >  * @malloc:				malloc single buffer for all execs, issue a
> >command which will trigger multiple faults
> >+ * @malloc-prefetch:			malloc single buffer for all execs,
> >prefetch buffer before each exec
> >  * @malloc-multi-fault:			malloc single buffer for all execs
> >  * @malloc-fork-read:			malloc single buffer for all execs, fork a
> >process to read test output
> >  * @malloc-fork-read-after:		malloc single buffer for all execs, fork a
> >process to read test output, check again after fork returns in parent
> >  * @malloc-mlock:			malloc and mlock single buffer for all
> >execs
> >  * @malloc-race:			malloc single buffer for all execs with
> >race between cpu and gpu access
> >+ * @malloc-prefetch-race:		malloc single buffer for all execs,
> >prefetch buffer before each exec, with race between cpu and gpu access
> >  * @malloc-bo-unmap:			malloc single buffer for all execs, bind
> >and unbind a BO to same address before execs
> >  * @malloc-busy:			malloc single buffer for all execs, try to
> >unbind while buffer valid
> >  * @mmap:				mmap single buffer for all execs
> >+ * @mmap-prefetch:			mmap single buffer for all execs, prefetch
> >buffer before each exec
> >  * @mmap-remap:				mmap and mremap a buffer for
> >all execs
> >  * @mmap-remap-dontunmap:		mmap and mremap a buffer with
> >dontunmap flag for all execs
> >  * @mmap-remap-ro:			mmap and mremap a read-only buffer
> >for all execs
> >@@ -896,6 +904,7 @@ partial(int fd, struct drm_xe_engine_class_instance *eci,
> >unsigned int flags)
> >  * @mmap-remap-ro-dontunmap-eocheck:	mmap and mremap a read-only
> >buffer with dontunmap flag for all execs, check data every other loop iteration
> >  * @mmap-huge:				mmap huge page single buffer
> >for all execs
> >  * @mmap-shared:			mmap shared single buffer for all execs
> >+ * @mmap-prefetch-shared:		mmap shared single buffer for all execs,
> >prefetch buffer before each exec
> >  * @mmap-shared-remap:			mmap shared and mremap a
> >buffer for all execs
> >  * @mmap-shared-remap-dontunmap:	mmap shared and mremap a buffer with
> >dontunmap flag for all execs
> >  * @mmap-shared-remap-eocheck:		mmap shared and mremap a
> >buffer for all execs, check data every other loop iteration
> >@@ -907,6 +916,7 @@ partial(int fd, struct drm_xe_engine_class_instance *eci,
> >unsigned int flags)
> >  * @free:				malloc and free buffer for each exec
> >  * @free-race:				malloc and free buffer for each exec with
> >race between cpu and gpu access
> >  * @new:				malloc a new buffer for each exec
> >+ * @new-prefetch:			malloc a new buffer and prefetch for
> >each exec
> >  * @new-race:				malloc a new buffer for each exec with
> >race between cpu and gpu access
> >  * @new-bo-map:				malloc a new buffer or map BO
> >for each exec
> >  * @new-busy:				malloc a new buffer for each exec, try to
> >unbind while buffers valid
> >@@ -941,6 +951,10 @@ partial(int fd, struct drm_xe_engine_class_instance *eci,
> >unsigned int flags)
> >  * @mmap-new-huge-nomemset:		mmap huge page new buffer for
> >each exec, skip memset of buffers
> >  * @mmap-new-race-nomemset:		mmap a new buffer for each
> >exec with race between cpu and gpu access, skip memset of buffers
> >  *
> >+ * SUBTEST: prefetch-benchmark
> >+ * Description: Prefetch a 64M buffer 128 times, measure bandwidth of
> >+ prefetch
> >+ * Test category: performance test
> >+ *
> >  * SUBTEST: threads-shared-vm-shared-alloc-many-stride-malloc
> >  * Description: Create multiple threads with a shared VM triggering faults on
> >different hardware engines to same addresses
> >  * Test category: stress test
> >@@ -998,14 +1012,18 @@ test_exec(int fd, struct drm_xe_engine_class_instance
> >*eci,
> > 	uint32_t exec_queues[MAX_N_EXEC_QUEUES];
> > 	struct test_exec_data *data, *next_data = NULL;
> > 	uint32_t bo_flags;
> >-	uint32_t bo = 0;
> >+	uint32_t bo = 0, prefetch_sync = 0;
> > 	void **pending_free;
> >-	u64 *exec_ufence = NULL;
> >-	int i, j, b, file_fd = -1, prev_idx;
> >+	u64 *exec_ufence = NULL, *prefetch_ufence = NULL;
> >+	int i, j, b, file_fd = -1, prev_idx, pf_count;
> > 	bool free_vm = false;
> > 	size_t aligned_size = bo_size ?: xe_get_default_alignment(fd);
> > 	size_t orig_size = bo_size;
> > 	struct aligned_alloc_type aligned_alloc_type;
> >+	uint32_t mem_region = vram_if_possible(fd, eci->gt_id);
> >+	uint32_t region = mem_region & 4 ? 2 : mem_region & 2 ? 1 : 0;
> >+	uint64_t prefetch_ns = 0;
> >+	const char *pf_count_stat = "svm_pagefault_count";
> >
> > 	if (flags & MULTI_FAULT) {
> > 		if (!bo_size)
> >@@ -1136,6 +1154,43 @@ test_exec(int fd, struct drm_xe_engine_class_instance
> >*eci,
> > 		memset(exec_ufence, 0, SZ_4K);
> > 	}
> >
> >+	if (!(flags & FAULT) && flags & PREFETCH) {
> >+		bo_flags =
> >DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM;
> >+
> >+		aligned_alloc_type = __aligned_alloc(SZ_4K, SZ_4K);
> >+		prefetch_ufence = aligned_alloc_type.ptr;
> >+		igt_assert(prefetch_ufence);
> >+		__aligned_partial_free(&aligned_alloc_type);
> >+
> >+		prefetch_sync = xe_bo_create(fd, vm, SZ_4K, system_memory(fd),
> >+					     bo_flags);
> >+		prefetch_ufence = xe_bo_map_fixed(fd, prefetch_sync, SZ_4K,
> >+
> >to_user_pointer(prefetch_ufence));
> >+
> >+		sync[0].addr = to_user_pointer(prefetch_ufence);
> >+
> >+		pf_count = xe_gt_stats_get_count(fd, eci->gt_id, pf_count_stat);
> >+
> >+		if (flags & (RACE | FILE_BACKED |
> >+			     LOCK | MMAP_SHARED | HUGE_PAGE)) {

btw - there is a bug here; it should be:

               if (flags & (RACE | FILE_BACKED |
                            LOCK | MMAP_SHARED | HUGE_PAGE) || !region) {

> >+			region = 0;
> >+			xe_vm_prefetch_async(fd, vm, 0, 0, addr, bo_size, sync,
> >+					     1, region);

Please wrap replies at reasonable intervals (e.g., 80 columns or so) for easier
parsing.

> I see this coverage does create prefetch address ranges for system memory and
> not specifically for VRAM. Having said that, in some coverage, like
> (region || flags & (NEW | MREMAP)), I could see from dmesg that prefetch ranges
> are migrated to VRAM.
> However, I'm trying to understand: do you think we need some coverage that
> creates prefetch ranges specific to VRAM and uses them for further exec
> submissions?

As long as the section doesn't have any of the flags which are tested here, it
will do a prefetch to VRAM in the main loop.

> >+			xe_wait_ufence(fd, prefetch_ufence,
> >USER_FENCE_VALUE, 0,
> >+				       FIVE_SEC);
> >+			prefetch_ufence[0] = 0;
> >+		}
> >+
> >+		if (exec_ufence) {
> >+			xe_vm_prefetch_async(fd, vm, 0, 0,
> >+					     to_user_pointer(exec_ufence),
> >+					     SZ_4K, sync, 1, 0);
> >+			xe_wait_ufence(fd, prefetch_ufence,
> >USER_FENCE_VALUE, 0,
> >+				       FIVE_SEC);
> >+			prefetch_ufence[0] = 0;
> >+		}
> >+	}
> >+
> > 	for (i = 0; i < n_execs; i++) {
> > 		int idx = !stride ? i : i * stride, next_idx = !stride
> > 			? (i + 1) : (i + 1) * stride;
> >@@ -1186,6 +1241,25 @@ test_exec(int fd, struct drm_xe_engine_class_instance
> >*eci,
> > 		if (!exec_ufence)
> > 			data[idx].exec_sync = 0;
> >
> >+		if (!(flags & FAULT) && flags & PREFETCH &&
> >+		    (region || flags & (NEW | MREMAP))) {
> I don’t see that we are testing MREMAP with prefetch (at least no such subtest
> is being added in main). Do you think we should still use this flag here?
> >+			struct timespec tv = {};
> >+			u64 start, end;
> >+
> >+			sync[0].addr = to_user_pointer(prefetch_ufence);
> >+
> >+			start = igt_nsec_elapsed(&tv);
> >+			xe_vm_prefetch_async(fd, vm, 0, 0, addr, bo_size, sync,
> >+					     1, region);
> Rebinding of prefetch ranges feels a bit tricky to me: initially we are, by
> default, creating prefetch ranges in smem, and then rebinding/remapping the
> region-zero prefetch ranges here again.
> I'm not completely aligned with it.
> 

In the case of region being non-zero (VRAM), as soon as the test touches the
memory it is moved back to SRAM (system memory). In the case of (NEW | MREMAP)
this is a new buffer or VA. In either case, to avoid a fault, we need to issue
another prefetch.

> In the case of VRAM, we should keep platforms which don't support VRAM in mind before prefetching non-zero regions.

I'm not following.

> >+			end = igt_nsec_elapsed(&tv);
> >+
> >+			xe_wait_ufence(fd, prefetch_ufence,
> >USER_FENCE_VALUE, 0,
> >+				       FIVE_SEC);
> >+			prefetch_ufence[0] = 0;
> >+
> >+			prefetch_ns += (end - start);
> >+		}
> >+
> > 		sync[0].addr = exec_ufence ? to_user_pointer(exec_ufence) :
> > 			addr + (char *)&data[idx].exec_sync - (char *)data;
> >
> >@@ -1277,6 +1351,8 @@ test_exec(int fd, struct drm_xe_engine_class_instance
> >*eci,
> > 				} else {
> > 					igt_assert_eq(data[idx].data,
> > 						      READ_VALUE(&data[idx]));
> >+					if (flags & PREFETCH_BENCHMARK)
> >+						memset(data, 0, bo_size);
> >
> > 					if (flags & MULTI_FAULT) {
> > 						for (j = 1; j < N_MULTI_FAULT;
> >++j) { @@ -1359,6 +1435,26 @@ test_exec(int fd, struct
> >drm_xe_engine_class_instance *eci,
> > 		prev_idx = idx;
> > 	}
> >
> >+	if (flags & PREFETCH_BENCHMARK)
> >+		igt_info("Prefetch execution took %.3fms, %.1f5 GB/s\n",
> >+			 1e-6 * prefetch_ns,
> >+			 bo_size * n_execs  / (float)prefetch_ns);
> >+
> >+	if (!(flags & FAULT) && flags & PREFETCH &&
> >+	    (flags & MMAP || !(flags & (NEW | THREADS | PROCESSES)))) {
> >+		int pf_count_after = xe_gt_stats_get_count(fd, eci->gt_id,
> >+							   pf_count_stat);
> >+
> >+		/*
> >+		 * Due how system allocations work, we can't make this check
> >+		 * 100% reliable, rather than fail the test, just print a
> >+		 * warning message.
> >+		 */
> Why can't we rely on "svm_pagefault_count"? Should we expect any false SVM pagefaults?
> >+		if (pf_count != pf_count_after)
> >+			igt_warn("pf_count(%d) != pf_count_after(%d)\n",
> >+				 pf_count, pf_count_after);
> >+	}
> >+
> While going through the code, I've realized we are missing error handling mechanisms during prefetch.
> 

That is part of this - but it is more that there isn't a way to guarantee the
core MM does not invalidate the buffer - this is completely out of the driver's
control. Prefetch really is just opportunistic - hopefully it avoids faults, but
there is no guarantee.

Matt

> -----
> Gowtham
> > 	if (bo) {
> > 		__xe_vm_bind_assert(fd, vm, 0,
> > 				    0, 0, addr, bo_size,
> >@@ -1370,6 +1466,11 @@ test_exec(int fd, struct drm_xe_engine_class_instance
> >*eci,
> > 		gem_close(fd, bo);
> > 	}
> >
> >+	if (prefetch_sync) {
> >+		munmap(prefetch_ufence, SZ_4K);
> >+		gem_close(fd, prefetch_sync);
> >+	}
> >+
> > 	if (flags & BUSY)
> > 		igt_assert_eq(unbind_system_allocator(), -EBUSY);
> >
> >@@ -1435,7 +1536,7 @@ static void *thread(void *data)
> >
> > 	test_exec(t->fd, t->eci, t->n_exec_queues, t->n_execs,
> > 		  t->bo_size, t->stride, t->vm, t->alloc, t->barrier,
> >-		  t->flags);
> >+		  t->flags | THREADS);
> >
> > 	return NULL;
> > }
> >@@ -1553,7 +1654,7 @@ static void process(struct
> >drm_xe_engine_class_instance *hwe, int n_exec_queues,
> >
> > 	fd = drm_open_driver(DRIVER_XE);
> > 	test_exec(fd, hwe, n_exec_queues, n_execs,
> >-		  bo_size, stride, 0, NULL, NULL, flags);
> >+		  bo_size, stride, 0, NULL, NULL, flags | PROCESSES);
> > 	drm_close_driver(fd);
> >
> > 	close(map_fd);
> >@@ -1604,14 +1705,17 @@ igt_main
> > 	struct drm_xe_engine_class_instance *hwe;
> > 	const struct section sections[] = {
> > 		{ "malloc", 0 },
> >+		{ "malloc-prefetch", PREFETCH },
> > 		{ "malloc-multi-fault", MULTI_FAULT },
> > 		{ "malloc-fork-read", FORK_READ },
> > 		{ "malloc-fork-read-after", FORK_READ | FORK_READ_AFTER },
> > 		{ "malloc-mlock", LOCK },
> > 		{ "malloc-race", RACE },
> >+		{ "malloc-prefetch-race", RACE | PREFETCH },
> > 		{ "malloc-busy", BUSY },
> > 		{ "malloc-bo-unmap", BO_UNMAP },
> > 		{ "mmap", MMAP },
> >+		{ "mmap-prefetch", MMAP | PREFETCH },
> > 		{ "mmap-remap", MMAP | MREMAP },
> > 		{ "mmap-remap-dontunmap", MMAP | MREMAP | DONTUNMAP
> >},
> > 		{ "mmap-remap-ro", MMAP | MREMAP | READ_ONLY_REMAP },
> >@@ -1626,6 +1730,7 @@ igt_main
> > 			READ_ONLY_REMAP | EVERY_OTHER_CHECK },
> > 		{ "mmap-huge", MMAP | HUGE_PAGE },
> > 		{ "mmap-shared", MMAP | LOCK | MMAP_SHARED },
> >+		{ "mmap-prefetch-shared", MMAP | LOCK | MMAP_SHARED |
> >PREFETCH },
> > 		{ "mmap-shared-remap", MMAP | LOCK | MMAP_SHARED |
> >MREMAP },
> > 		{ "mmap-shared-remap-dontunmap", MMAP | LOCK |
> >MMAP_SHARED |
> > 			MREMAP | DONTUNMAP },
> >@@ -1640,6 +1745,7 @@ igt_main
> > 		{ "free", NEW | FREE },
> > 		{ "free-race", NEW | FREE | RACE },
> > 		{ "new", NEW },
> >+		{ "new-prefetch", NEW | PREFETCH },
> > 		{ "new-race", NEW | RACE },
> > 		{ "new-bo-map", NEW | BO_MAP },
> > 		{ "new-busy", NEW | BUSY },
> >@@ -1801,6 +1907,11 @@ igt_main
> > 			processes(fd, 16, 128, SZ_2M, 0, s->flags);
> > 	}
> >
> >+	igt_subtest_f("prefetch-benchmark")
> >+		xe_for_each_engine(fd, hwe)
> >+			test_exec(fd, hwe, 1, 128, SZ_64M, 0, 0, NULL,
> >+				  NULL, PREFETCH | PREFETCH_BENCHMARK);
> >+
> > 	igt_subtest("threads-shared-vm-shared-alloc-many-stride-malloc")
> > 		threads(fd, 1, 128, 0, 256, SHARED_ALLOC, true);
> >
> >--
> >2.34.1
>