[PATCH i-g-t v3 08/10] tests/intel/xe_svm: Add svm-benchmark test to measure SVM performance with a simple benchmark
Zeng, Oak
oak.zeng at intel.com
Sat May 18 02:27:03 UTC 2024
> -----Original Message-----
> From: Bommu, Krishnaiah <krishnaiah.bommu at intel.com>
> Sent: Friday, May 17, 2024 7:47 AM
> To: igt-dev at lists.freedesktop.org
> Cc: Bommu, Krishnaiah <krishnaiah.bommu at intel.com>; Zeng, Oak
> <oak.zeng at intel.com>; Ghimiray, Himal Prasad
> <himal.prasad.ghimiray at intel.com>
> Subject: [PATCH i-g-t v3 08/10] tests/intel/xe_svm: Add svm-benchmark test to
> measure SVM performance with a simple benchmark
>
> svm-benchmark test provides a basic benchmark to compare the performance of
> system allocators against runtime allocators in SVM scenarios.
In the i915 IGT we have a similar benchmark test for the runtime allocator, hence the description above.
In the xekmd IGT there is currently no corresponding runtime-allocator benchmark test, so the description above needs to be reworded.
>
> Signed-off-by: Bommu Krishnaiah <krishnaiah.bommu at intel.com>
> Cc: Oak Zeng <oak.zeng at intel.com>
> Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
> ---
> lib/xe/xe_util.c | 34 +++++++++++++++++++++++++++++++
> lib/xe/xe_util.h | 5 +++++
> tests/intel/xe_svm.c | 48 ++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 87 insertions(+)
>
> diff --git a/lib/xe/xe_util.c b/lib/xe/xe_util.c
> index 672e9dcef..c19cdae0c 100644
> --- a/lib/xe/xe_util.c
> +++ b/lib/xe/xe_util.c
> @@ -128,6 +128,24 @@ void insert_atomic_inc(uint32_t *batch, uint64_t dst_va, uint32_t val)
> batch[++i] = MI_BATCH_BUFFER_END;
> }
>
> +/** Insert commands to batch buffer to memset dst_va buffer with val
> + */
> +void insert_memset(uint32_t *batch, uint64_t dst_va, uint64_t size, uint32_t val)
> +{
> +#define PVC_MEM_SET_CMD (2 << 29 | 0x5b << 22)
> +#define MS_MATRIX (1 << 17)
> + const int page_shift = 12;
> +
> + *batch++ = PVC_MEM_SET_CMD | MS_MATRIX | (7 - 2);
> + *batch++ = BIT(page_shift) - 1;
> + *batch++ = (size >> page_shift) - 1;
> + *batch++ = BIT(page_shift) - 1;
> + *batch++ = lower_32_bits(dst_va);
> + *batch++ = upper_32_bits(dst_va);
> + *batch++ = (uint32_t)val << 24;
> + *batch++ = MI_BATCH_BUFFER_END;
> +}
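A note on the encoding, as I read the MEM_SET matrix layout: the three DWs after the header carry fill width, fill height and destination pitch (with the -1 bias applied above), so with page_shift = 12 this fills rows of 4 KiB and relies on size being a non-zero multiple of 4 KiB. It may be worth making that assumption explicit at the top of insert_memset(); just a sketch:

	igt_assert(size && !(size & (BIT(page_shift) - 1)));

Not a blocker, it only makes a mis-sized call fail loudly instead of programming a bogus fill height.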
> +
> /**
> * Creates a command buffer, fills it with commands using the provided fill
> * function, and sets up the execution queue for submission.
> @@ -143,6 +161,22 @@ void xe_create_cmdbuf(struct xe_buffer *cmd_buf, cmdbuf_fill_func_t fill_func, u
> fill_func(cmd_buf->cpu_addr, dst_va, val);
> }
>
> +/**
> + * Create a command buffer and fill it with a two-DW command function.
> + */
> +void xe_create_cmdbuf_fill_two_dw(struct xe_buffer *cmd_buf, cmdbuf_fill_two_dw_func_t fill_func,
> + uint64_t dst_va, uint64_t dst_va1, uint32_t val, struct drm_xe_engine_class_instance *eci)
> +{
> + //make some room for an exec_ufence, which will be used to sync the
> + //submission of this command
> +
> + cmd_buf->size = xe_bb_size(cmd_buf->fd, cmd_buf->size + PAGE_ALIGN_UFENCE);
> + xe_create_buffer(cmd_buf);
> + cmd_buf->exec_queue = xe_exec_queue_create(cmd_buf->fd, cmd_buf->vm, eci, 0);
> + fill_func(cmd_buf->cpu_addr, dst_va, dst_va1, val);
> +}
> +
> +
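To make the "two-DW" naming concrete: the extra uint64_t lets a fill function such as insert_memset() take both a destination VA and a size, where cmdbuf_fill_func_t only carries a single destination. The new subtest below drives it roughly like this (trimmed from svm_benchmark()):

	xe_create_cmdbuf_fill_two_dw(&cmd_buf, insert_memset, (uint64_t)dst, (uint64_t)size, 0x12, eci);
	xe_submit_cmd(&cmd_buf);
	xe_destroy_cmdbuf(&cmd_buf);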
> /**
> * Destroys a command buffer created by xe_create_cmdbuf and releases
> * associated resources.
> diff --git a/lib/xe/xe_util.h b/lib/xe/xe_util.h
> index 46e1ccc9a..50f2a4bc4 100644
> --- a/lib/xe/xe_util.h
> +++ b/lib/xe/xe_util.h
> @@ -34,13 +34,18 @@ struct xe_buffer {
> };
>
> typedef void (*cmdbuf_fill_func_t) (uint32_t *batch, uint64_t dst_gpu_va, uint32_t val);
> +typedef void (*cmdbuf_fill_two_dw_func_t) (uint32_t *batch, uint64_t dst_gpu_va,
> + uint64_t dst_gpu_va1, uint32_t val);
> void xe_create_buffer(struct xe_buffer *buffer);
> void xe_create_cmdbuf(struct xe_buffer *cmd_buf, cmdbuf_fill_func_t fill_func,
> uint64_t dst_va, uint32_t val, struct drm_xe_engine_class_instance *eci);
> +void xe_create_cmdbuf_fill_two_dw(struct xe_buffer *cmd_buf, cmdbuf_fill_two_dw_func_t fill_func,
> + uint64_t dst_va, uint64_t dst_va1, uint32_t val, struct drm_xe_engine_class_instance *eci);
> uint64_t xe_cmdbuf_exec_ufence_gpuva(struct xe_buffer *cmd_buf);
> uint64_t *xe_cmdbuf_exec_ufence_cpuva(struct xe_buffer *cmd_buf);
> void insert_store(uint32_t *batch, uint64_t dst_va, uint32_t val);
> void insert_atomic_inc(uint32_t *batch, uint64_t dst_va, uint32_t val);
> +void insert_memset(uint32_t *batch, uint64_t dst_va, uint64_t size, uint32_t val);
> void xe_submit_cmd(struct xe_buffer *cmdbuf);
> int64_t __xe_submit_cmd(struct xe_buffer *cmdbuf);
> void xe_destroy_buffer(struct xe_buffer *buffer);
> diff --git a/tests/intel/xe_svm.c b/tests/intel/xe_svm.c
> index 895cf26ac..072a602b1 100644
> --- a/tests/intel/xe_svm.c
> +++ b/tests/intel/xe_svm.c
> @@ -39,6 +39,9 @@
> *
> * SUBTEST: svm-invalid-va
> * Description: Verify SVM functionality while accessing an invalid address.
> + *
> + * SUBTEST: svm-benchmark
> + * Description: Verify SVM performance with a simple benchmark test.
> */
>
> #include <fcntl.h>
> @@ -301,6 +304,47 @@ static void svm_invalid_va(int fd, uint32_t vm, struct drm_xe_engine_class_insta
> free(dst);
> }
>
> +/**
> + * A simple benchmark test.
> + * Uses the GPU to memset a buffer with a specific value and measures the end-to-end bandwidth.
> + * This provides a basic comparison of the performance between the system allocator and the runtime allocator.
Please change this description as well and drop the mention of the runtime allocator.
Oak
> + *
> + * By comparing the output of those two tests, we can have
> + * a very basic concept of the performance of system allocator
> + * compared to runtime allocator.
> + */
> +static void svm_benchmark(int fd, uint32_t vm, struct drm_xe_engine_class_instance *eci)
> +{
> + uint64_t gpu_va = 0x1a0000;
> + size_t bo_size = xe_bb_size(fd, PAGE_ALIGN_UFENCE);
> + uint32_t *dst, size = 1 << 26;
> + struct timespec start_time;
> + double bandwidth;
> +
> + struct xe_buffer cmd_buf = {
> + .fd = fd,
> + .gpu_addr = (void *)(uintptr_t)gpu_va,
> + .vm = vm,
> + .size = bo_size,
> + .placement = vram_if_possible(fd, eci->gt_id),
> + .flag = DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM,
> + };
> +
> + igt_gettime(&start_time);
> + dst = aligned_alloc(xe_get_default_alignment(fd), size);
> +
> + xe_create_cmdbuf_fill_two_dw(&cmd_buf, insert_memset, (uint64_t)dst, (uint64_t)size, 0x12, eci);
> + xe_submit_cmd(&cmd_buf);
> +
> + igt_assert_eq(*dst, 0x12121212);
> +
> + xe_destroy_cmdbuf(&cmd_buf);
> + free(dst);
> +
> + bandwidth = (double)(size>>20)*NSEC_PER_SEC/igt_nsec_elapsed(&start_time);
> + igt_info("engine class %d, engine id %d memset E2E bandwidth(include sync overhead) %.3f MiB/s\n", eci->engine_class, eci->engine_instance, bandwidth);
> +}
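For reference on the reported number: size is 1 << 26 bytes, so size >> 20 is 64 and the reported figure is

	bandwidth = 64 * NSEC_PER_SEC / igt_nsec_elapsed(&start_time);	/* MiB/s */

i.e. 100 ms end-to-end comes out at roughly 640 MiB/s. The timer is started before aligned_alloc() and the command-buffer setup, so allocation, any fault/migration handling and the ufence wait are all inside the measurement, which I take to be what "include sync overhead" means here.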
> +
> igt_main
> {
> int fd;
> @@ -341,6 +385,10 @@ igt_main
> xe_for_each_engine(fd, hwe)
> svm_invalid_va(fd, vm, hwe);
>
> + igt_subtest_f("svm-benchmark")
> + xe_for_each_engine(fd, hwe)
> + svm_benchmark(fd, vm, hwe);
> +
> igt_fixture {
> xe_vm_destroy(fd, vm);
> drm_close_driver(fd);
> --
> 2.25.1