[PATCH i-g-t v3 08/10] tests/intel/xe_svm: Add svm-benchmark test to measure SVM performance with a simple benchmark
Zeng, Oak
oak.zeng at intel.com
Sat May 18 02:27:03 UTC 2024
> -----Original Message-----
> From: Bommu, Krishnaiah <krishnaiah.bommu at intel.com>
> Sent: Friday, May 17, 2024 7:47 AM
> To: igt-dev at lists.freedesktop.org
> Cc: Bommu, Krishnaiah <krishnaiah.bommu at intel.com>; Zeng, Oak
> <oak.zeng at intel.com>; Ghimiray, Himal Prasad
> <himal.prasad.ghimiray at intel.com>
> Subject: [PATCH i-g-t v3 08/10] tests/intel/xe_svm: Add svm-benchmark test to
> measure SVM performance with a simple benchmark
>
> svm-benchmark test provides a basic benchmark to compare the performance of
> system allocators against runtime allocators in SVM scenarios.
In the i915 IGT we have a similar benchmark test for the runtime allocator, hence the description above.
In the xekmd IGT there is currently no corresponding runtime-allocator benchmark test, so the description above needs to be reworded.
>
> Signed-off-by: Bommu Krishnaiah <krishnaiah.bommu at intel.com>
> Cc: Oak Zeng <oak.zeng at intel.com>
> Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
> ---
> lib/xe/xe_util.c | 34 +++++++++++++++++++++++++++++++
> lib/xe/xe_util.h | 5 +++++
> tests/intel/xe_svm.c | 48 ++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 87 insertions(+)
>
> diff --git a/lib/xe/xe_util.c b/lib/xe/xe_util.c
> index 672e9dcef..c19cdae0c 100644
> --- a/lib/xe/xe_util.c
> +++ b/lib/xe/xe_util.c
> @@ -128,6 +128,24 @@ void insert_atomic_inc(uint32_t *batch, uint64_t dst_va, uint32_t val)
> batch[++i] = MI_BATCH_BUFFER_END;
> }
>
> +/** Insert commands to batch buffer to memset dst_va buffer with val
> + */
> +void insert_memset(uint32_t *batch, uint64_t dst_va, uint64_t size, uint32_t val)
> +{
> +#define PVC_MEM_SET_CMD (2 << 29 | 0x5b << 22)
> +#define MS_MATRIX (1 << 17)
> + const int page_shift = 12;
> +
> + *batch++ = PVC_MEM_SET_CMD | MS_MATRIX | (7 - 2);
> + *batch++ = BIT(page_shift) - 1;
> + *batch++ = (size >> page_shift) - 1;
> + *batch++ = BIT(page_shift) - 1;
> + *batch++ = lower_32_bits(dst_va);
> + *batch++ = upper_32_bits(dst_va);
> + *batch++ = (uint32_t)val << 24;
> + *batch++ = MI_BATCH_BUFFER_END;
> +}
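A note on the encoding, as I read the MEM_SET matrix layout: the three DWs after the header carry fill width, fill height and destination pitch (with the -1 bias applied above), so with page_shift = 12 this fills rows of 4 KiB and relies on size being a non-zero multiple of 4 KiB. It may be worth making that assumption explicit at the top of insert_memset(); just a sketch:

	igt_assert(size && !(size & (BIT(page_shift) - 1)));

Not a blocker, it only makes a mis-sized call fail loudly instead of programming a bogus fill height.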
> +
> /**
> * Creates a command buffer, fills it with commands using the provided fill
> * function, and sets up the execution queue for submission.
> @@ -143,6 +161,22 @@ void xe_create_cmdbuf(struct xe_buffer *cmd_buf, cmdbuf_fill_func_t fill_func, u
> fill_func(cmd_buf->cpu_addr, dst_va, val);
> }
>
> +/**
> + * Create a command buffer and fill it with a two-DW command function.
> + */
> +void xe_create_cmdbuf_fill_two_dw(struct xe_buffer *cmd_buf, cmdbuf_fill_two_dw_func_t fill_func,
> + uint64_t dst_va, uint64_t dst_va1, uint32_t val, struct drm_xe_engine_class_instance *eci)
> +{
> + //make some room for an exec_ufence, which will be used to sync the
> + //submission of this command
> +
> + cmd_buf->size = xe_bb_size(cmd_buf->fd, cmd_buf->size + PAGE_ALIGN_UFENCE);
> + xe_create_buffer(cmd_buf);
> + cmd_buf->exec_queue = xe_exec_queue_create(cmd_buf->fd, cmd_buf->vm, eci, 0);
> + fill_func(cmd_buf->cpu_addr, dst_va, dst_va1, val);
> +}
> +
> +
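To make the "two-DW" naming concrete: the extra uint64_t lets a fill function such as insert_memset() take both a destination VA and a size, where cmdbuf_fill_func_t only carries a single destination. The new subtest below drives it roughly like this (trimmed from svm_benchmark()):

	xe_create_cmdbuf_fill_two_dw(&cmd_buf, insert_memset, (uint64_t)dst, (uint64_t)size, 0x12, eci);
	xe_submit_cmd(&cmd_buf);
	xe_destroy_cmdbuf(&cmd_buf);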
> /**
> * Destroys a command buffer created by xe_create_cmdbuf and releases
> * associated resources.
> diff --git a/lib/xe/xe_util.h b/lib/xe/xe_util.h
> index 46e1ccc9a..50f2a4bc4 100644
> --- a/lib/xe/xe_util.h
> +++ b/lib/xe/xe_util.h
> @@ -34,13 +34,18 @@ struct xe_buffer {
> };
>
> typedef void (*cmdbuf_fill_func_t) (uint32_t *batch, uint64_t dst_gpu_va, uint32_t val);
> +typedef void (*cmdbuf_fill_two_dw_func_t) (uint32_t *batch, uint64_t dst_gpu_va,
> + uint64_t dst_gpu_va1, uint32_t val);
> void xe_create_buffer(struct xe_buffer *buffer);
> void xe_create_cmdbuf(struct xe_buffer *cmd_buf, cmdbuf_fill_func_t fill_func,
> uint64_t dst_va, uint32_t val, struct drm_xe_engine_class_instance *eci);
> +void xe_create_cmdbuf_fill_two_dw(struct xe_buffer *cmd_buf, cmdbuf_fill_two_dw_func_t fill_func,
> + uint64_t dst_va, uint64_t dst_va1, uint32_t val, struct drm_xe_engine_class_instance *eci);
> uint64_t xe_cmdbuf_exec_ufence_gpuva(struct xe_buffer *cmd_buf);
> uint64_t *xe_cmdbuf_exec_ufence_cpuva(struct xe_buffer *cmd_buf);
> void insert_store(uint32_t *batch, uint64_t dst_va, uint32_t val);
> void insert_atomic_inc(uint32_t *batch, uint64_t dst_va, uint32_t val);
> +void insert_memset(uint32_t *batch, uint64_t dst_va, uint64_t size, uint32_t val);
> void xe_submit_cmd(struct xe_buffer *cmdbuf);
> int64_t __xe_submit_cmd(struct xe_buffer *cmdbuf);
> void xe_destroy_buffer(struct xe_buffer *buffer);
> diff --git a/tests/intel/xe_svm.c b/tests/intel/xe_svm.c
> index 895cf26ac..072a602b1 100644
> --- a/tests/intel/xe_svm.c
> +++ b/tests/intel/xe_svm.c
> @@ -39,6 +39,9 @@
> *
> * SUBTEST: svm-invalid-va
> * Description: Verify SVM functionality while accessing an invalid address.
> + *
> + * SUBTEST: svm-benchmark
> + * Description: Verify SVM performance with a simple benchmark test.
> */
>
> #include <fcntl.h>
> @@ -301,6 +304,47 @@ static void svm_invalid_va(int fd, uint32_t vm, struct drm_xe_engine_class_insta
> free(dst);
> }
>
> +/**
> + * A simple benchmark test.
> + * Uses the GPU to memset a buffer with a specific value and measures the end-to-end bandwidth.
> + * This provides a basic comparison of the performance between the system allocator and the runtime allocator.
Please change this description as well and drop the mention of the runtime allocator.
Oak
> + *
> + * By comparing the output of those two tests, we can have
> + * a very basic concept of the performance of system allocator
> + * compared to runtime allocator.
> + */
> +static void svm_benchmark(int fd, uint32_t vm, struct drm_xe_engine_class_instance *eci)
> +{
> + uint64_t gpu_va = 0x1a0000;
> + size_t bo_size = xe_bb_size(fd, PAGE_ALIGN_UFENCE);
> + uint32_t *dst, size = 1 << 26;
> + struct timespec start_time;
> + double bandwidth;
> +
> + struct xe_buffer cmd_buf = {
> + .fd = fd,
> + .gpu_addr = (void *)(uintptr_t)gpu_va,
> + .vm = vm,
> + .size = bo_size,
> + .placement = vram_if_possible(fd, eci->gt_id),
> + .flag = DRM_XE_GEM_CREATE_FLAG_NEEDS_VISIBLE_VRAM,
> + };
> +
> + igt_gettime(&start_time);
> + dst = aligned_alloc(xe_get_default_alignment(fd), size);
> +
> + xe_create_cmdbuf_fill_two_dw(&cmd_buf, insert_memset, (uint64_t)dst, (uint64_t)size, 0x12, eci);
> + xe_submit_cmd(&cmd_buf);
> +
> + igt_assert_eq(*dst, 0x12121212);
> +
> + xe_destroy_cmdbuf(&cmd_buf);
> + free(dst);
> +
> + bandwidth = (double)(size>>20)*NSEC_PER_SEC/igt_nsec_elapsed(&start_time);
> + igt_info("engine class %d, engine id %d memset E2E bandwidth(include sync overhead) %.3f MiB/s\n", eci->engine_class, eci->engine_instance, bandwidth);
> +}
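For reference on the reported number: size is 1 << 26 bytes, so size >> 20 is 64 and the reported figure is

	bandwidth = 64 * NSEC_PER_SEC / igt_nsec_elapsed(&start_time);	/* MiB/s */

i.e. 100 ms end-to-end comes out at roughly 640 MiB/s. The timer is started before aligned_alloc() and the command-buffer setup, so allocation, any fault/migration handling and the ufence wait are all inside the measurement, which I take to be what "include sync overhead" means here.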
> +
> igt_main
> {
> int fd;
> @@ -341,6 +385,10 @@ igt_main
> xe_for_each_engine(fd, hwe)
> svm_invalid_va(fd, vm, hwe);
>
> + igt_subtest_f("svm-benchmark")
> + xe_for_each_engine(fd, hwe)
> + svm_benchmark(fd, vm, hwe);
> +
> igt_fixture {
> xe_vm_destroy(fd, vm);
> drm_close_driver(fd);
> --
> 2.25.1