[igt-dev] [PATCH] i915/poc: Use semaphore wait to sync gpu and cpu times
Kamil Konieczny
kamil.konieczny at linux.intel.com
Thu Aug 24 17:50:48 UTC 2023
Hi Umesh,
On 2023-08-23 at 18:45:38 +0000, Umesh Nerlige Ramappa wrote:
> This is just a POC to sync gpu and cpu time. The requirement is to
> provide a solution that works with SRIOV as well.
>
> The CS will block polling on a semaphore. The semaphore is signaled by
> CPU by writing the CPU timestamp into the SAD field. As soon as the CS
> unblocks, it reads the RING_TIMESTAMP. This makes the 2 values as close
> to each other as possible.
>
> Accuracy is within a few us (1 to 2). Repeated runs get better accuracy.
After all the runs it would help if you printed both times (more on this below, at the subtest loop).
>
> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
> ---
> tests/i915/cpu_gpu_time.c | 220 ++++++++++++++++++++++++++++++++++++++
> tests/meson.build | 1 +
> 2 files changed, 221 insertions(+)
> create mode 100644 tests/i915/cpu_gpu_time.c
>
> diff --git a/tests/i915/cpu_gpu_time.c b/tests/i915/cpu_gpu_time.c
> new file mode 100644
> index 000000000..a87a3fa88
> --- /dev/null
> +++ b/tests/i915/cpu_gpu_time.c
> @@ -0,0 +1,220 @@
> +/*
Use an SPDX licence identifier instead of the full licence text.
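Something like this at the top of the file instead (MIT is what the
newer IGT tests use, afaik):

/* SPDX-License-Identifier: MIT */
/*
 * Copyright © 2023 Intel Corporation
 */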
> + * Copyright © 2016 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + *
> + */
> +
> +#include <stdlib.h>
> +#include <stdio.h>
> +#include <string.h>
> +#include <fcntl.h>
------------ ^
Sort sys includes alphabetically.
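i.e. something like this for the system headers (and the same sorting
for the igt headers further down):

#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <math.h>
#include <poll.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/times.h>
#include <sys/types.h>
#include <time.h>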
> +#include <inttypes.h>
------------ ^
> +#include <errno.h>
------------ ^
> +#include <signal.h>
> +#include <sys/stat.h>
> +#include <sys/time.h>
> +#include <sys/times.h>
> +#include <sys/types.h>
> +#include <dirent.h>
------------ ^
> +#include <time.h>
> +#include <poll.h>
------------ ^
> +#include <math.h>
------------ ^
> +
> +#include "i915/gem.h"
> +#include "i915/gem_create.h"
> +#include "igt.h"
> +#include "igt_core.h"
> +#include "igt_device.h"
> +#include "igt_kmod.h"
> +#include "igt_perf.h"
> +#include "igt_sysfs.h"
> +#include "igt_pm.h"
---------------- ^
Same here, sort it.
> +#include "intel_ctx.h"
> +#include "sw_sync.h"
> +
> +/**
> + * TEST: cpu_gpu_time
> + * Description: Test correlated time
> + * Run type: FULL
> + *
> + * SUBTEST: cpu-gpu-time
> + * Description: Test time correlation
> + * Feature: i915 streaming interface, oa
> + * Test category: Perf
There are other documentation fields needed here, please check the documentation guidelines for the full set.
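e.g. something along these lines (field names from memory, please
double-check the required set against the current doc guidelines):

 * Category: ...
 * Mega feature: ...
 * Sub-category: ...
 * Functionality: ...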
> + */
> +
> +static void
> +test_cpu_gpu_time(int gem_fd,
-------------------- ^
> + const intel_ctx_t *ctx,
--------- ^
> + const struct intel_execution_engine2 *e,
> + uint64_t *cpu_ns,
> + uint64_t *gpu_ns)
> +{
> + struct drm_i915_gem_relocation_entry reloc[2] = {};
> + struct drm_i915_gem_exec_object2 obj[2] = {};
> + struct drm_i915_gem_execbuffer2 eb = {};
> + uint32_t bb_handle, obj_handle;
> + uint32_t *obj_ptr;
> + uint32_t batch[64];
> + uint32_t mmio_base;
> + uint64_t ahnd = get_reloc_ahnd(gem_fd, ctx->id);
> + uint64_t obj_offset, bb_offset, *gpu_ts;
> + struct timespec *ts;
> + int i = 0;
> +
> + igt_require(intel_gen(intel_get_drm_devid(gem_fd)) >= 8);
> +
> + mmio_base = gem_engine_mmio_base(gem_fd, e->name);
> +
> + /**
> + * Setup up a batchbuffer with a polling semaphore wait command which
> + * will wait on an value in a shared bo to change. This way we are able
> + * to control how much time we will spend in this bb.
> + */
> +
> + bb_handle = gem_create(gem_fd, 4096);
> + obj_handle = gem_create(gem_fd, 4096);
> + bb_offset = get_offset(ahnd, bb_handle, 4096, 0);
> + obj_offset = get_offset(ahnd, obj_handle, 4096, 0);
> +
> + obj_ptr = gem_mmap__device_coherent(gem_fd, obj_handle, 0, 4096, PROT_WRITE);
> +
> +#define obj(__o) (obj_offset + __o)
> + /* Poll from CPU to check the batch started */
> + batch[i++] = MI_STORE_DWORD_IMM_GEN4;
> + batch[i++] = obj(0);
> + batch[i++] = obj(0) >> 32;
> + batch[i++] = 1;
> +
> + /* Block the batch until this offset has a value GTE than 1 */
> + batch[i++] = MI_SEMAPHORE_WAIT |
> + MI_SEMAPHORE_POLL |
> + MI_SEMAPHORE_SAD_GTE_SDD;
> + batch[i++] = 1;
> + batch[i++] = obj(4);
> + batch[i++] = obj(4) >> 32;
> +
> + /* Once unblocked, capture RING timestamp */
> + batch[i++] = MI_STORE_REGISTER_MEM_GEN8;
> + batch[i++] = mmio_base + 0x358;
> + batch[i++] = obj(20);
> + batch[i++] = 0;
> +
> + batch[i++] = MI_STORE_REGISTER_MEM_GEN8;
> + batch[i++] = mmio_base + 0x35c;
> + batch[i++] = obj(24);
> + batch[i++] = 0;
> +
> + batch[i++] = MI_BATCH_BUFFER_END;
> +
> + gem_write(gem_fd, bb_handle, 0, batch, sizeof(batch));
> +
> + reloc[0].target_handle = obj_handle;
> + reloc[0].offset = 1 * sizeof(uint32_t);
> + reloc[0].read_domains = I915_GEM_DOMAIN_RENDER;
> + reloc[0].write_domain = I915_GEM_DOMAIN_RENDER;
> + reloc[0].delta = 4;
> +
> + reloc[1].target_handle = obj_handle;
> + reloc[1].offset = 6 * sizeof(uint32_t);
> + reloc[1].read_domains = I915_GEM_DOMAIN_RENDER;
> +
> + obj[0].handle = obj_handle;
> +
> + obj[1].handle = bb_handle;
> + obj[1].relocation_count = !ahnd ? 2 : 0;
> + obj[1].relocs_ptr = to_user_pointer(reloc);
> +
> + eb.buffer_count = 2;
> + eb.buffers_ptr = to_user_pointer(obj);
> + eb.flags = e->flags;
> + eb.rsvd1 = ctx->id;
> +
> + if (ahnd) {
> + obj[0].flags |= EXEC_OBJECT_PINNED | EXEC_OBJECT_WRITE;
> + obj[0].offset = obj_offset;
> + obj[1].flags |= EXEC_OBJECT_PINNED;
> + obj[1].offset = bb_offset;
> + }
> +
> + gem_execbuf(gem_fd, &eb);
> +
> + /* wait for the batch to start executing */
> + while (!obj_ptr[0])
> + usleep(5e3);
> +
> + ts = (struct timespec *)&obj_ptr[1];
> + clock_gettime(CLOCK_REALTIME, ts);
> +
> + gem_sync(gem_fd, bb_handle);
> +
> + for (int j = 0; j < 16; j++)
> + igt_debug("[%d] %08x\n", j, obj_ptr[j]);
Move this debug dump after the time reads below.
> +
> + gpu_ts = (uint64_t *) &obj_ptr[5];
Move this before gem_sync.
> + *gpu_ns = (*gpu_ts * 1000000) / 19200;
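Not strictly needed for a POC, but the 19200 here hard-codes a 19.2 MHz
CS timestamp frequency; it may be worth querying it instead so the math
keeps working on parts with a different clock. Rough sketch (iirc the
getparam reports the frequency in Hz on recent kernels, please verify):

static int cs_timestamp_frequency(int i915)
{
	int value = 0;
	drm_i915_getparam_t gp = {
		.param = I915_PARAM_CS_TIMESTAMP_FREQUENCY,
		.value = &value,
	};

	/* frequency of the RING_TIMESTAMP ticks, as reported by the kernel */
	igt_assert_eq(igt_ioctl(i915, DRM_IOCTL_I915_GETPARAM, &gp), 0);

	return value;
}

and then divide the tick count by that instead of the magic 19200
(keeping an eye on overflow when scaling up to nanoseconds).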
> + *cpu_ns = ts->tv_sec * NSEC_PER_SEC + ts->tv_nsec;
> +
> + munmap(obj_ptr, 4096);
> + gem_close(gem_fd, obj_handle);
> + gem_close(gem_fd, bb_handle);
> + put_ahnd(ahnd);
> +}
> +
> +#define test_each_engine(T, i915, ctx, e) \
> + igt_subtest_with_dynamic(T) for_each_ctx_engine(i915, ctx, e) \
> + igt_dynamic_f("%s", e->name)
> +
> +igt_main
> +{
> + const struct intel_execution_engine2 *e;
> + uint64_t prev_cpu = 0, prev_gpu = 0;
> + uint64_t cpu_ns, gpu_ns;
> + const intel_ctx_t *ctx;
> + int device;
> +
> + igt_fixture {
> + drm_load_module(DRIVER_INTEL);
> + device = drm_open_driver(DRIVER_INTEL);
> + igt_require_gem(device);
> + ctx = intel_ctx_create_all_physical(device);
> + }
> +
> + igt_describe("Capture cpu and gpu time close to each other");
> + test_each_engine("cpu-gpu-time", device, ctx, e) {
Why run it for all engines? Maybe a -basic test repeated a few times on
only one engine would be enough? (rough sketch below)
> + test_cpu_gpu_time(device, ctx, e, &cpu_ns, &gpu_ns);
> + igt_debug("CPU = %ld, GPU = %ld\n", cpu_ns, gpu_ns);
> + if (prev_cpu && prev_gpu) {
> + int64_t cpu_delta = cpu_ns - prev_cpu;
> + int64_t gpu_delta = gpu_ns - prev_gpu;
> +
> + igt_debug("d_CPU = %ld, d_GPU = %ld\n", cpu_delta, gpu_delta);
> + igt_info("d_d = %ld\n", labs(gpu_delta - cpu_delta));
----------------^
imho igt_debug will be better here; then, after the loop, print both
times for the run with the smallest delta using igt_info.
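Rough, untested sketch of what I have in mind, combining the two points
above (single engine, a handful of repeats, report the pair with the
smallest delta once at the end). It reuses the locals already declared
in igt_main; the subtest name, the iteration count and the
break-after-first-engine are just placeholders:

	uint64_t best_cpu = 0, best_gpu = 0;
	int64_t best_dd = -1;

	igt_describe("Capture cpu and gpu time close to each other");
	igt_subtest("cpu-gpu-time-basic") {
		for_each_ctx_engine(device, ctx, e) {
			prev_cpu = prev_gpu = 0;

			for (int n = 0; n < 8; n++) {
				test_cpu_gpu_time(device, ctx, e, &cpu_ns, &gpu_ns);

				if (prev_cpu && prev_gpu) {
					/* |d_GPU - d_CPU| for this pair of runs */
					int64_t dd = (int64_t)(gpu_ns - prev_gpu) -
						     (int64_t)(cpu_ns - prev_cpu);

					if (dd < 0)
						dd = -dd;

					igt_debug("d_d = %" PRId64 "\n", dd);

					if (best_dd < 0 || dd < best_dd) {
						best_dd = dd;
						best_cpu = cpu_ns;
						best_gpu = gpu_ns;
					}
				}

				prev_cpu = cpu_ns;
				prev_gpu = gpu_ns;
			}

			break; /* one engine is enough for the basic flavour */
		}

		igt_info("best: CPU = %" PRIu64 " ns, GPU = %" PRIu64 " ns, d_d = %" PRId64 " ns\n",
			 best_cpu, best_gpu, best_dd);
	}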
> + }
> + prev_cpu = cpu_ns;
> + prev_gpu = gpu_ns;
> + }
> +
> + igt_fixture {
> + intel_ctx_destroy(device, ctx);
> + drm_close_driver(device);
> + }
> +}
> diff --git a/tests/meson.build b/tests/meson.build
> index 58061dbc2..c18dae125 100644
> --- a/tests/meson.build
> +++ b/tests/meson.build
> @@ -260,6 +260,7 @@ i915_progs = [
> 'sysfs_heartbeat_interval',
> 'sysfs_preempt_timeout',
> 'sysfs_timeslice_duration',
> + 'cpu_gpu_time',
---- ^
Keep it sorted here.
Regards,
Kamil
> ]
>
> xe_progs = [
> --
> 2.34.1
>