[PATCH i-g-t v3 06/10] tests/intel/xe_svm: Add support for GPU atomic access test for svm

Zeng, Oak oak.zeng at intel.com
Sat May 18 02:16:18 UTC 2024



> -----Original Message-----
> From: Bommu, Krishnaiah <krishnaiah.bommu at intel.com>
> Sent: Friday, May 17, 2024 7:47 AM
> To: igt-dev at lists.freedesktop.org
> Cc: Bommu, Krishnaiah <krishnaiah.bommu at intel.com>; Zeng, Oak
> <oak.zeng at intel.com>; Ghimiray, Himal Prasad
> <himal.prasad.ghimiray at intel.com>
> Subject: [PATCH i-g-t v3 06/10] tests/intel/xe_svm: Add support for GPU atomic
> access test for svm
> 
> Verify GPU atomic access using multiple threads by performing operations on
> randomly allocated locations within malloc'ed memory in shared virtual memory.
> 
> Signed-off-by: Bommu Krishnaiah <krishnaiah.bommu at intel.com>
> Cc: Oak Zeng <oak.zeng at intel.com>
> Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
> ---
>  lib/xe/xe_util.c     | 11 +++++++
>  lib/xe/xe_util.h     |  1 +
>  tests/intel/xe_svm.c | 71 ++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 83 insertions(+)
> 
> diff --git a/lib/xe/xe_util.c b/lib/xe/xe_util.c
> index de848b8bc..672e9dcef 100644
> --- a/lib/xe/xe_util.c
> +++ b/lib/xe/xe_util.c
> @@ -117,6 +117,17 @@ void insert_store(uint32_t *batch, uint64_t dst_va,
> uint32_t val)
>  	batch[++i] = MI_BATCH_BUFFER_END;
>  }
> 
> +// Function to insert atomic increment command
> +void insert_atomic_inc(uint32_t *batch, uint64_t dst_va, uint32_t val)
> +{
> +	int i = 0;

You can write:

(void)val;

to annotate that the parameter val is intentionally unused.

> +
> +	batch[i] = MI_ATOMIC | MI_ATOMIC_INC;
> +	batch[++i] = dst_va;
> +	batch[++i] = dst_va >> 32;
> +	batch[++i] = MI_BATCH_BUFFER_END;
> +}
> +
>  /**
>   * Creates a command buffer, fills it with commands using the provided fill
>   * function, and sets up the execution queue for submission.
> diff --git a/lib/xe/xe_util.h b/lib/xe/xe_util.h
> index c38f79e60..46e1ccc9a 100644
> --- a/lib/xe/xe_util.h
> +++ b/lib/xe/xe_util.h
> @@ -40,6 +40,7 @@ void xe_create_cmdbuf(struct xe_buffer *cmd_buf,
> cmdbuf_fill_func_t fill_func,
>  uint64_t xe_cmdbuf_exec_ufence_gpuva(struct xe_buffer *cmd_buf);
>  uint64_t *xe_cmdbuf_exec_ufence_cpuva(struct xe_buffer *cmd_buf);
>  void insert_store(uint32_t *batch, uint64_t dst_va, uint32_t val);
> +void insert_atomic_inc(uint32_t *batch, uint64_t dst_va, uint32_t val);
>  void xe_submit_cmd(struct xe_buffer *cmdbuf);
>  int64_t __xe_submit_cmd(struct xe_buffer *cmdbuf);
>  void xe_destroy_buffer(struct xe_buffer *buffer);
> diff --git a/tests/intel/xe_svm.c b/tests/intel/xe_svm.c
> index d9629246c..f9e8eb2d9 100644
> --- a/tests/intel/xe_svm.c
> +++ b/tests/intel/xe_svm.c
> @@ -33,6 +33,9 @@
>   *
>   * SUBTEST: svm-huge-page
>   * Description: verify SVM basic functionality by using huge page access
> + *
> + * SUBTEST: svm-atomic-access
> + * Description: verify SVM basic functionality by using GPU atomic access any
> location in malloc'ed memory
>   */
> 
>  #include <fcntl.h>
> @@ -47,6 +50,18 @@
>  #include "xe/xe_ioctl.h"
>  #include "xe/xe_query.h"
> 
> +#define NUM_THREADS 10
> +
> +// Thread argument structure
> +typedef struct {
> +	int fd;
> +	uint32_t vm;
> +	void *gpu_va;
> +	uint64_t dst_va;
> +	uint32_t val;
> +	struct drm_xe_engine_class_instance *eci;
> +} thread_args_t;
> +
>  /**
>   *  @brief Verifies basic workload execution on the GPU.
>   *
> @@ -202,6 +217,58 @@ static void svm_thp(int fd, uint32_t vm, struct
> drm_xe_engine_class_instance *ec
>  	free(dst);
>  }
> 
> +
> +// Thread function for submitting atomic increment commands
> +static void* thread_func(void* args)
> +{
> +	thread_args_t *thread_args = (thread_args_t *)args;
> +	struct xe_buffer cmd_buf = {
> +		.fd = thread_args->fd,
> +		.gpu_addr = (void *)(uintptr_t)thread_args->gpu_va,

So you are creating many command buffers, one per thread. But all those command buffers are bound to the same gpu_va. This won't work.

You need to use a different GPU virtual address for each command buffer.

> +		.vm = thread_args->vm,
> +		.size = xe_bb_size(thread_args->fd, PAGE_ALIGN_UFENCE),
> +		.placement = vram_if_possible(thread_args->fd, thread_args-
> >eci->gt_id),
> +		.flag = DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM,
> +	};
> +
> +	xe_create_cmdbuf(&cmd_buf, insert_atomic_inc, thread_args->dst_va,
> thread_args->val, thread_args->eci);
> +	xe_submit_cmd(&cmd_buf);
> +
> +	xe_destroy_cmdbuf(&cmd_buf);
> +
> +	return NULL;
> +}
> +
> +// Test GPU atomic access with multiple threads
> +static void svm_atomic_access(int fd, uint32_t vm, struct
> drm_xe_engine_class_instance *eci)
> +{
> +	uint64_t gpu_va = 0x1a0000;
> +	int val = 0xc0ffee;
> +	uint32_t *dst, *dst_to_access;
> +	uint32_t size = 1024 * 1024, sz_dw = size / 4;
> +	pthread_t threads[NUM_THREADS];
> +
> +	dst = aligned_alloc(xe_get_default_alignment(fd), size);
> +	dst_to_access = dst + (rand() % sz_dw);
> +	*dst_to_access = val;
> +
> +	thread_args_t thread_args = { fd, vm, (void *)(uintptr_t)gpu_va,
> (uint64_t)dst_to_access, val, eci };

Same question as before, why uintptr_t?

Oak
> +
> +	// Create and launch threads
> +	for (int i = 0; i < NUM_THREADS; i++) {
> +		pthread_create(&threads[i], NULL, thread_func, &thread_args);
> +	}
> +
> +	// Wait for all threads to finish
> +	for (int i = 0; i < NUM_THREADS; i++) {
> +		pthread_join(threads[i], NULL);
> +	}
> +
> +	igt_assert_eq(*dst_to_access, val + NUM_THREADS);
> +
> +	free(dst);
> +}
> +
>  igt_main
>  {
>  	int fd;
> @@ -234,6 +301,10 @@ igt_main
>  		xe_for_each_engine(fd, hwe)
>  			svm_thp(fd, vm, hwe);
> 
> +	igt_subtest_f("svm-atomic-access")
> +		xe_for_each_engine(fd, hwe)
> +			svm_atomic_access(fd, vm, hwe);
> +
>  	igt_fixture {
>  		xe_vm_destroy(fd, vm);
>  		drm_close_driver(fd);
> --
> 2.25.1



More information about the igt-dev mailing list