[PATCH] benchmarks: Measure BLT performance
Gurram, Pravalika
pravalika.gurram at intel.com
Mon Apr 28 10:40:31 UTC 2025
> -----Original Message-----
> From: Ch, Sai Gowtham <sai.gowtham.ch at intel.com>
> Sent: Friday, April 25, 2025 1:33 PM
> To: Gurram, Pravalika <pravalika.gurram at intel.com>; igt-
> dev at lists.freedesktop.org
> Cc: Ramadeva, Dwarakanath <dwarakanath.ramadeva at intel.com>
> Subject: RE: [PATCH] benchmarks: Measure BLT performance
>
> Hi Pravalika,
>
> >-----Original Message-----
> >From: Gurram, Pravalika <pravalika.gurram at intel.com>
> >Sent: Thursday, April 24, 2025 2:40 PM
> >To: igt-dev at lists.freedesktop.org
> >Cc: Ch, Sai Gowtham <sai.gowtham.ch at intel.com>; Ramadeva, Dwarakanath
> ><dwarakanath.ramadeva at intel.com>; Gurram, Pravalika
> ><pravalika.gurram at intel.com>
> >Subject: [PATCH] benchmarks: Measure BLT performance
> >
> >Execute N blits and time how long they take to complete, to measure both
> >GPU-limited bandwidth and submission overhead.
> >
> >Signed-off-by: Pravalika Gurram <pravalika.gurram at intel.com>
> >---
> > benchmarks/meson.build | 1 +
> > benchmarks/xe_blt.c   | 330 +++++++++++++++++++++++++++++++++++++++++
> > 2 files changed, 331 insertions(+)
> > create mode 100644 benchmarks/xe_blt.c
> >
> >diff --git a/benchmarks/meson.build b/benchmarks/meson.build
> >index 2c9a88fd3..4421ede86 100644
> >--- a/benchmarks/meson.build
> >+++ b/benchmarks/meson.build
> >@@ -21,6 +21,7 @@ benchmark_progs = [
> > 'kms_vblank',
> > 'prime_lookup',
> > 'vgem_mmap',
> >+ 'xe_blt',
> > ]
> >
> > benchmarksdir = join_paths(libexecdir, 'benchmarks')
> >diff --git a/benchmarks/xe_blt.c b/benchmarks/xe_blt.c
> >new file mode 100644
> >index 000000000..e95ed6d47
> >--- /dev/null
> >+++ b/benchmarks/xe_blt.c
> >@@ -0,0 +1,330 @@
> >+// SPDX-License-Identifier: MIT
> >+/*
> >+ * Copyright © 2025 Intel Corporation
> >+ */
> >+
> >+#include "drm.h"
> >+#include "igt_syncobj.h"
> >+#include "intel_blt.h"
> >+#include "xe/xe_ioctl.h"
> >+#include "xe/xe_query.h"
> >+#include "xe/xe_util.h"
> >+
> >+
> >+#define COPY_BLT_CMD (2<<29|0x53<<22|0x6)
> >+#define BLT_WRITE_ALPHA (1<<21)
> >+#define BLT_WRITE_RGB (1<<20)
> >+#define BLT_SRC_TILED (1<<15)
> >+#define BLT_DST_TILED (1<<11)
> >+
> >+static double
> >+elapsed(const struct timespec *start, const struct timespec *end) {
> >+ return (end->tv_sec - start->tv_sec) + 1e-9*(end->tv_nsec -
> Looks like it's not properly aligned; please run checkpatch.
> >+start->tv_nsec); }
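
For the alignment nit above, a possible checkpatch-friendly layout (just a sketch of the same expression, not what the patch currently has):

	return (end->tv_sec - start->tv_sec) +
	       1e-9 * (end->tv_nsec - start->tv_nsec);
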
> >+
> >+static uint64_t emit_blt_src_copy(int fd,
> >+ uint64_t ahnd,
> Allocator handle can be removed from here.
> >+ const struct blt_copy_data *blt,
> >+ uint64_t bb_pos,
> >+ bool emit_bbe,
> >+ uint64_t dst_offset,
> >+ uint64_t src_offset,
> >+ uint32_t height)
> >+{
> >+ uint32_t b[12];
> >+ uint32_t bbe = MI_BATCH_BUFFER_END;
> >+ uint32_t *bb;
> >+ int i = 0;
> >+
> >+ src_offset += blt->src.plane_offset;
> >+ dst_offset += blt->dst.plane_offset;
> >+
> >+ b[i++] = COPY_BLT_CMD | BLT_WRITE_ALPHA | BLT_WRITE_RGB;
> It would be interesting to understand why COPY_BLT_CMD is used instead of
> XY_SRC_COPY_BLT_CMD. As per my understanding this benchmark does basic copy
> operations, so I think COPY_BLT_CMD will be much faster, but do we have any
> plans to switch to XY_SRC_COPY_BLT_CMD in the future?
Both instructions are the same, but with COPY_BLT_CMD we build the command using the platform info; that is why XY_SRC_COPY_BLT_CMD was not used here.
No, I am not planning to change this in the future.
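
For reference, a minimal comparison of the two encodings; the XY_SRC_COPY_BLT_CMD value below is my assumption of what IGT's lib/intel_reg.h defines, so please double-check:

	/* local macro from this patch */
	#define COPY_BLT_CMD		(2 << 29 | 0x53 << 22 | 0x6)
	/* assumed generic IGT definition (lib/intel_reg.h) */
	#define XY_SRC_COPY_BLT_CMD	((2 << 29) | (0x53 << 22) | 6)

Both expand to the same dword (client 2, opcode 0x53, dword length 6), so either name selects the same XY_SRC_COPY_BLT instruction on the blitter.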
> >+ b[i-1] += 2;
> >+ b[i++] = 0xcc << 16 | 1 << 25 | 1 << 24 | (16*1024);
> >+ b[i++] = 0;
> >+ b[i++] = height << 16 | (4*1024);
> >+ b[i++] = dst_offset;
> >+ b[i++] = dst_offset >> 32; /* FIXME */
> >+ b[i++] = 0;
> >+ b[i++] = 16*1024;
> >+ b[i++] = src_offset;
> >+ b[i++] = src_offset >> 32; /* FIXME */
> >+
> >+ bb = xe_bo_map(fd, blt->bb.handle, blt->bb.size);
> >+
> >+ igt_assert(bb_pos + sizeof(b) < blt->bb.size);
> >+ memcpy(bb + bb_pos, &b, sizeof(b));
> >+ bb_pos += sizeof(b);
> >+
> >+ if (emit_bbe) {
> >+ igt_assert(bb_pos + sizeof(uint32_t) < blt->bb.size);
> >+ memcpy(bb + bb_pos, &bbe, sizeof(bbe));
> >+ bb_pos += sizeof(uint32_t);
> >+ }
> >+
> >+ munmap(bb, blt->bb.size);
> >+
> >+ return bb_pos;
> >+
> >+}
> >+
> >+static int count;
> >+/*
> >+ * val = true, to check whether the BB is working fine or not
> >+ * count_val = to count how many buffers we can send in 0.1 sec
> >+ * loop_count = submit the buffers count_val times
> >+ */
> >+static int blt_src_copy(int fd,
> >+ const intel_ctx_t *ctx,
> >+ const struct intel_execution_engine2 *e,
> >+ uint64_t ahnd,
> >+ const struct blt_copy_data *blt, uint32_t height,
> >+ bool val, bool count_val, bool loop_count) {
> >+ uint64_t dst_offset = 0, src_offset = 0, bb_offset = 0;
> >+ int ret = 0;
> >+ uint64_t bb_pos = 0;
> >+ struct timespec start, end;
> >+
> >+ igt_assert_f(ahnd, "src-copy supports softpin only\n");
> >+ igt_assert_f(blt, "src-copy requires data to do src-copy blit\n");
> >+ igt_assert_neq(blt->driver, 0);
> >+
> >+ if (!val) {
> >+ src_offset = get_offset_pat_index(ahnd, blt->src.handle, blt-
> >>src.size,
> >+ 0, blt->src.pat_index);
> >+ dst_offset = get_offset_pat_index(ahnd, blt->dst.handle, blt-
> >>dst.size,
> >+ 0, blt->dst.pat_index);
> >+ bb_offset = get_offset(ahnd, blt->bb.handle, blt->bb.size, 0);
> >+ }
> >+
> >+ bb_pos = emit_blt_src_copy(fd, ahnd, blt, 0, true, src_offset,
> Same here ahnd can be removed.
> >+ dst_offset, height);
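
On dropping the unused ahnd parameter, a minimal sketch of how the helper and its call site could look (untested, only removing the argument):

	static uint64_t emit_blt_src_copy(int fd,
					  const struct blt_copy_data *blt,
					  uint64_t bb_pos,
					  bool emit_bbe,
					  uint64_t dst_offset,
					  uint64_t src_offset,
					  uint32_t height);

	bb_pos = emit_blt_src_copy(fd, blt, 0, true, src_offset,
				   dst_offset, height);
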
> >+ if (count_val) {
> >+ clock_gettime(CLOCK_MONOTONIC, &start);
> >+ do {
> >+ if (blt->driver == INTEL_DRIVER_XE)
> >+ intel_ctx_xe_exec(ctx, ahnd,
> >CANONICAL(bb_offset));
> >+ count++;
> >+ clock_gettime(CLOCK_MONOTONIC, &end);
> >+ if (elapsed(&start, &end) > (100 / 1000.))
> >+ break;
> >+ } while (1);
> >+ } else if (loop_count) {
> >+ for (int loop = 0; loop < count; loop++) {
> >+ if (blt->driver == INTEL_DRIVER_XE)
> >+ intel_ctx_xe_exec(ctx, ahnd,
> >CANONICAL(bb_offset));
> >+ }
> >+
> >+ } else {
> >+ if (blt->driver == INTEL_DRIVER_XE)
> >+ intel_ctx_xe_exec(ctx, ahnd, CANONICAL(bb_offset));
> >+ }
> >+ return ret;
> >+}
> >+
> >+static int src_copy(int xe, const intel_ctx_t *ctx,
> >+ uint32_t width, uint32_t height,
> >+ uint32_t region1, uint32_t region2, bool val, bool count, bool loop)
> >+{
> >+ struct blt_copy_data blt = {};
> >+ struct blt_copy_object *src, *dst;
> >+ const uint32_t bpp = 32;
> >+ uint64_t bb_size = xe_bb_size(xe, SZ_4K);
> >+ uint64_t ahnd = intel_allocator_open_full(xe, ctx->vm, 0, 0,
> >+ INTEL_ALLOCATOR_SIMPLE,
> >+ ALLOC_STRATEGY_LOW_TO_HIGH, 0);
> >+ uint32_t bb;
> >+ int ret = 0;
> >+
> >+ bb = xe_bo_create(xe, 0, bb_size, region1, 0);
> >+ blt_copy_init(xe, &blt);
> >+ src = blt_create_object(&blt, region1, width, height, bpp, 0,
> >+ T_LINEAR, COMPRESSION_DISABLED, 0, true);
> >+ dst = blt_create_object(&blt, region1, width, height, bpp, 0,
> >+ T_LINEAR, COMPRESSION_DISABLED, 0, true);
> >+ igt_assert(src->size == dst->size);
> >+
> >+ blt_set_copy_object(&blt.src, src);
> >+ blt_set_copy_object(&blt.dst, dst);
> >+ blt_set_batch(&blt.bb, bb, bb_size, region1);
> >+
> >+ ret = blt_src_copy(xe, ctx, NULL, ahnd, &blt, height, val, count,
> >+loop);
> >+
> >+ put_offset(ahnd, src->handle);
> >+ put_offset(ahnd, dst->handle);
> >+ put_offset(ahnd, bb);
> >+ intel_allocator_bind(ahnd, 0, 0);
> >+ blt_destroy_object(xe, src);
> >+ blt_destroy_object(xe, dst);
> >+ gem_close(xe, bb);
> >+ put_ahnd(ahnd);
> >+ return ret;
> >+
> >+}
> >+
> >+#define SYNC 0x1
> >+
> >+static int run(int width, int batch, int time, int reps, int ncpus,
> >+unsigned int flags) {
> >+ struct igt_collection *set;
> >+ int xe;
> >+ int height = width / (16 * 1024);
> >+ struct drm_xe_engine_class_instance inst = {
> >+ .engine_class = DRM_XE_ENGINE_CLASS_COPY,
> >+ };
> >+ intel_ctx_t *ctx;
> >+ double *shared;
> >+ uint32_t region1, region2;
> >+ uint32_t vm, exec_queue;
> >+ int ret;
> >+
> >+ shared = mmap(0, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1,
> >0);
> >+
> >+ xe = drm_open_driver(DRIVER_XE);
> >+ xe_device_get(xe);
> >+ set = xe_get_memory_region_set(xe,
> >+ DRM_XE_MEM_REGION_CLASS_SYSMEM,
> >+ DRM_XE_MEM_REGION_CLASS_VRAM);
> >+ intel_allocator_multiprocess_start();
> >+
> >+ region1 = 1;
> >+ region2 = 2;
> >+
> >+ vm = xe_vm_create(xe, 0, 0);
> >+ exec_queue = xe_exec_queue_create(xe, vm, &inst, 0);
> >+ ctx = intel_ctx_xe(xe, vm, exec_queue, 0, 0, 0);
> >+ ret = src_copy(xe, ctx, width, height, region1, region2, true,
> >+ false, false);
> >+ xe_exec_queue_destroy(xe, exec_queue);
> >+ xe_vm_destroy(xe, vm);
> >+ free(ctx);
> >+ if (!ret) {
> >+ vm = xe_vm_create(xe, 0, 0);
> >+ exec_queue = xe_exec_queue_create(xe, vm, &inst, 0);
> >+ ctx = intel_ctx_xe(xe, vm, exec_queue, 0, 0, 0);
> >+ ret = src_copy(xe, ctx, width, height, region1, region2,
> >+ false, false, false);
> >+ xe_exec_queue_destroy(xe, exec_queue);
> >+ xe_vm_destroy(xe, vm);
> >+ free(ctx);
> >+ }
> >+ if (batch > 1) {
> >+ for (int i = 1; i < batch; i++) {
> >+ vm = xe_vm_create(xe, 0, 0);
> >+ exec_queue = xe_exec_queue_create(xe, vm, &inst, 0);
> >+ ctx = intel_ctx_xe(xe, vm, exec_queue, 0, 0, 0);
> >+ ret = src_copy(xe, ctx, width, height, region1, region2,
> >+ false, false, false);
> >+ xe_exec_queue_destroy(xe, exec_queue);
> >+ xe_vm_destroy(xe, vm);
> >+ free(ctx);
> >+ }
> >+ }
> >+ vm = xe_vm_create(xe, 0, 0);
> >+ exec_queue = xe_exec_queue_create(xe, vm, &inst, 0);
> >+ ctx = intel_ctx_xe(xe, vm, exec_queue, 0, 0, 0);
> >+ ret = src_copy(xe, ctx, width, height, region1, region2,
> >+ false, true, false);
> >+ xe_exec_queue_destroy(xe, exec_queue);
> >+ xe_vm_destroy(xe, vm);
> Can't we use the same vm_id and exec_queue_id for all the different conditions?
> Why create and destroy them multiple times?
> The idea behind this benchmark looks good.
>
> ---
> Gowtham
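
A rough, untested sketch of that reuse, using only the helpers already present in this patch (whether src_copy() is happy sharing one vm across all the passes still needs to be checked):

	vm = xe_vm_create(xe, 0, 0);
	exec_queue = xe_exec_queue_create(xe, vm, &inst, 0);
	ctx = intel_ctx_xe(xe, vm, exec_queue, 0, 0, 0);

	/* validation pass */
	ret = src_copy(xe, ctx, width, height, region1, region2, true, false, false);

	/* warm-up / batch passes reuse the same ctx */
	for (int i = 0; !ret && i < batch; i++)
		ret = src_copy(xe, ctx, width, height, region1, region2, false, false, false);

	/* counting pass */
	if (!ret)
		ret = src_copy(xe, ctx, width, height, region1, region2, false, true, false);

	xe_exec_queue_destroy(xe, exec_queue);
	xe_vm_destroy(xe, vm);
	free(ctx);
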
> >+ free(ctx);
> >+ if (flags & SYNC) {
> >+ time *= count / 2;
> >+ count = 1;
> >+ }
> >+
> >+ while (reps--) {
> >+ memset(shared, 0, 4096);
> >+
> >+ igt_fork(child, ncpus) {
> >+ double min = HUGE_VAL;
> >+
> >+ for (int s = 0; s <= time / 100; s++) {
> >+ struct timespec start, end;
> >+ double t;
> >+
> >+ clock_gettime(CLOCK_MONOTONIC, &start);
> >+ vm = xe_vm_create(xe, 0, 0);
> >+ exec_queue = xe_exec_queue_create(xe, vm,
> >&inst, 0);
> >+ ctx = intel_ctx_xe(xe, vm, exec_queue, 0, 0, 0);
> >+ ret = src_copy(xe, ctx, width, height, region1,
> >+ region2, false, false, true);
> >+ free(ctx);
> >+ xe_exec_queue_destroy(xe, exec_queue);
> >+ xe_vm_destroy(xe, vm);
> >+ clock_gettime(CLOCK_MONOTONIC, &end);
> >+
> >+ t = elapsed(&start, &end);
> >+ if (t < min)
> >+ min = t;
> >+ }
> >+
> >+ shared[child] = width/(1024*1024.)*batch*count/min;
> >+ }
> >+ igt_waitchildren();
> >+
> >+ for (int child = 0; child < ncpus; child++)
> >+ shared[ncpus] += shared[child];
> >+ printf("%7.3f\n", shared[ncpus] / ncpus);
> >+ }
> >+ intel_allocator_multiprocess_stop();
> >+
> >+ close(xe);
> >+ return 0;
> >+}
> >+
> >+int main(int argc, char **argv)
> >+{
> >+ int size = 1024*1024;
> >+ int reps = 13;
> >+ int time = 2000;
> >+ int ncpus = 1;
> >+ int batch = 1;
> >+ unsigned int flags = 0;
> >+ int c;
> >+
> >+ while ((c = getopt(argc, argv, "s:S:t:r:b:f")) != -1) {
> >+ switch (c) {
> >+ case 's':
> >+ size = atoi(optarg);
> >+ size = ALIGN(size, 4);
> >+ if (size < 4)
> >+ size = 4;
> >+ break;
> >+
> >+ case 'S':
> >+ flags |= SYNC;
> >+ break;
> >+
> >+ case 't':
> >+ time = atoi(optarg);
> >+ if (time < 1)
> >+ time = 1;
> >+ break;
> >+
> >+ case 'r':
> >+ reps = atoi(optarg);
> >+ if (reps < 1)
> >+ reps = 1;
> >+ break;
> >+
> >+ case 'b':
> >+ batch = atoi(optarg);
> >+ if (batch < 1)
> >+ batch = 1;
> >+ break;
> >+
> >+ case 'f':
> >+ ncpus = sysconf(_SC_NPROCESSORS_ONLN);
> >+ break;
> >+
> >+ default:
> >+ break;
> >+ }
> >+ }
> >+
> >+ return run(size, batch, time, reps, ncpus, flags);
> >+}
> >--
> >2.34.1
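
For context, a hypothetical way to invoke the benchmark based on the option parsing above (the binary path is an assumption; the option letters are from the patch):

	# 16 MiB copies, 4 batches, 5 repetitions, one child per online CPU
	./benchmarks/xe_blt -s $((16 << 20)) -b 4 -r 5 -f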