[Intel-gfx] [PATCH i-g-t] amdgpu/amd_syslatency: Measure the impact of CS load on the rest of the system

Wed Jul 4 14:20:53 UTC 2018

Like benchmarks/gem_syslatency, but for amdgpu, so that the impact of
command submission load can be investigated and compared there.

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
 lib/igt_aux.h                 |   1 +
 tests/Makefile.am             |   2 +
 tests/Makefile.sources        |   1 +
 tests/amdgpu/amd_syslatency.c | 404 ++++++++++++++++++++++++++++++++++
 tests/amdgpu/meson.build      |   1 +
 5 files changed, 409 insertions(+)
 create mode 100644 tests/amdgpu/amd_syslatency.c

diff --git a/lib/igt_aux.h b/lib/igt_aux.h
index 9a962881b..3641e4ee3 100644
--- a/lib/igt_aux.h
+++ b/lib/igt_aux.h
@@ -33,6 +33,7 @@
 #include <stdbool.h>
 #include <stddef.h>
 #include <sys/time.h>
+#include <sys/syscall.h>
 
 #include <i915/gem_submission.h>
 
diff --git a/tests/Makefile.am b/tests/Makefile.am
index f41ad5096..69300448a 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -156,5 +156,7 @@ amdgpu_amd_cs_nop_CFLAGS = $(AM_CFLAGS) $(DRM_AMDGPU_CFLAGS)
 amdgpu_amd_cs_nop_LDADD = $(LDADD) $(DRM_AMDGPU_LIBS)
 amdgpu_amd_prime_CFLAGS = $(AM_CFLAGS) $(DRM_AMDGPU_CFLAGS)
 amdgpu_amd_prime_LDADD = $(LDADD) $(DRM_AMDGPU_LIBS)
+amdgpu_amd_syslatency_CFLAGS = $(AM_CFLAGS) $(DRM_AMDGPU_CFLAGS) -pthread
+amdgpu_amd_syslatency_LDADD = $(LDADD) $(DRM_AMDGPU_LIBS) -lpthread
 
 endif
diff --git a/tests/Makefile.sources b/tests/Makefile.sources
index 54b4a3c21..002af360e 100644
--- a/tests/Makefile.sources
+++ b/tests/Makefile.sources
@@ -19,6 +19,7 @@ AMDGPU_TESTS = \
 	amdgpu/amd_basic \
 	amdgpu/amd_cs_nop \
 	amdgpu/amd_prime \
+	amdgpu/amd_syslatency \
 	$(NULL)
 
 TESTS_progs = \
diff --git a/tests/amdgpu/amd_syslatency.c b/tests/amdgpu/amd_syslatency.c
new file mode 100644
index 000000000..b4fb2fc01
--- /dev/null
+++ b/tests/amdgpu/amd_syslatency.c
@@ -0,0 +1,404 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "igt.h"
+#include "drmtest.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <sched.h>
+#include <signal.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+
+#include <amdgpu.h>
+#include <amdgpu_drm.h>
+
+#include "igt_stats.h"
+
+/*
+ * Command dword that busyspin() replicates across its indirect buffer.
+ * NOTE(review): presumably the PM4 NOP packet header for GFX/compute rings;
+ * confirm against the amdgpu packet definitions.
+ */
+#define GFX_COMPUTE_NOP  0xffff1000
+/*
+ * Not referenced by the code shown in this file.
+ * NOTE(review): presumably the NOP for SDMA rings - confirm before use.
+ */
+#define SDMA_NOP  0x0
+
+/*
+ * Allocate a buffer object, reserve a GPU virtual address range for it, map
+ * the buffer at that address and finally map it for CPU access.
+ *
+ * @dev:        device to allocate from
+ * @size:       allocation size, also used for the VA range and the mapping
+ * @alignment:  physical alignment of the buffer, also used for the VA range
+ * @heap:       preferred heap, passed through as request.preferred_heap
+ * @flags:      allocation flags, passed through as request.flags
+ * @bo:         out: buffer handle
+ * @cpu:        out: CPU pointer, filled in by amdgpu_bo_cpu_map()
+ * @mc_address: out: GPU virtual address the buffer is mapped at
+ * @va_handle:  out: handle of the reserved VA range
+ *
+ * Returns 0 on success, otherwise the non-zero value returned by the libdrm
+ * call that failed. On failure every step that had already succeeded is
+ * undone, and *bo, *mc_address and *va_handle are not written.
+ */
+static int
+amdgpu_bo_alloc_and_map(amdgpu_device_handle dev, unsigned size,
+			unsigned alignment, unsigned heap, uint64_t flags,
+			amdgpu_bo_handle *bo, void **cpu, uint64_t *mc_address,
+			amdgpu_va_handle *va_handle)
+{
+	struct amdgpu_bo_alloc_request request = {
+		.alloc_size = size,
+		.phys_alignment = alignment,
+		.preferred_heap = heap,
+		.flags = flags,
+	};
+	amdgpu_bo_handle buf_handle;
+	amdgpu_va_handle handle;
+	uint64_t vmc_addr;
+	int r;
+
+	r = amdgpu_bo_alloc(dev, &request, &buf_handle);
+	if (r)
+		return r;
+
+	r = amdgpu_va_range_alloc(dev,
+				  amdgpu_gpu_va_range_general,
+				  size, alignment, 0, &vmc_addr,
+				  &handle, 0);
+	if (r)
+		goto error_va_alloc;
+
+	r = amdgpu_bo_va_op(buf_handle, 0, size, vmc_addr, 0, AMDGPU_VA_OP_MAP);
+	if (r)
+		goto error_va_map;
+
+	r = amdgpu_bo_cpu_map(buf_handle, cpu);
+	if (r)
+		goto error_cpu_map;
+
+	*bo = buf_handle;
+	*mc_address = vmc_addr;
+	*va_handle = handle;
+
+	return 0;
+
+	/*
+	 * Each label undoes the steps that succeeded before the failing one,
+	 * in reverse order of setup.
+	 */
+error_cpu_map:
+	/* The CPU map failed, so only the GPU mapping needs removing. */
+	amdgpu_bo_va_op(buf_handle, 0, size, vmc_addr, 0, AMDGPU_VA_OP_UNMAP);
+
+error_va_map:
+	/* Release the reserved address range so it is not leaked. */
+	amdgpu_va_range_free(handle);
+
+error_va_alloc:
+	amdgpu_bo_free(buf_handle);
+	return r;
+}
+
+/*
+ * Release a buffer in four steps: drop the CPU mapping, unmap the buffer
+ * from GPU address @mc_addr (length @size), free the VA range @va_handle and
+ * free the buffer object @bo. The return values of the libdrm calls are
+ * ignored.
+ */
+static void amdgpu_bo_unmap_and_free(amdgpu_bo_handle bo,
+				     amdgpu_va_handle va_handle,
+				     uint64_t mc_addr,
+				     uint64_t size)
+{
+	/* Mappings go first: the CPU view, then the GPU one. */
+	(void)amdgpu_bo_cpu_unmap(bo);
+	(void)amdgpu_bo_va_op(bo, 0, size, mc_addr, 0, AMDGPU_VA_OP_UNMAP);
+
+	/* Then hand back the address range and the buffer itself. */
+	(void)amdgpu_va_range_free(va_handle);
+	(void)amdgpu_bo_free(bo);
+}
+
+/*
+ * Query the status of fence @seqno of @context on @ip_type / @ring with
+ * AMDGPU_TIMEOUT_INFINITE (presumably blocking until the fence signals).
+ * A non-zero return from the query trips igt_assert_eq().
+ */
+static void amdgpu_cs_sync(amdgpu_context_handle context,
+			   unsigned int ip_type,
+			   int ring,
+			   unsigned int seqno)
+{
+	struct amdgpu_cs_fence fence;
+	uint32_t expired;
+	int err;
+
+	/* Fields not assigned below stay zero. */
+	memset(&fence, 0, sizeof(fence));
+	fence.context = context;
+	fence.ip_type = ip_type;
+	fence.ring = ring;
+	fence.fence = seqno;
+
+	/* 'expired' is required by the API but never read here. */
+	err = amdgpu_cs_query_fence_status(&fence, AMDGPU_TIMEOUT_INFINITE,
+					   0, &expired);
+	igt_assert_eq(err, 0);
+}
+
+/*
+ * Stop flag shared by all worker threads: syslatency() clears it before
+ * starting the workers and sets it to 1 to end a run; the loops in
+ * busyspin() and sys_wait() poll it.
+ */
+static volatile int done;
+
+/* Per-thread state of one busyspin() submission worker. */
+struct busyspin {
+	/* Thread handle, filled in by pthread_create() in syslatency(). */
+	pthread_t thread;
+	/* Number of submit-loop iterations, written by busyspin(). */
+	unsigned long count;
+	/* Device the worker creates its context, buffer and BO list on. */
+	amdgpu_device_handle device;
+	/* Copied into the ip_type field of the worker's CS request. */
+	unsigned int ip_type;
+	/* Copied into the ring field of the worker's CS request. */
+	unsigned int ring;
+};
+
+/* Per-thread state of one sys_wait() latency probe. */
+struct sys_wait {
+	/* Thread handle, filled in by pthread_create() in syslatency(). */
+	pthread_t thread;
+	/* Accumulates the wake-up delays (ns) measured by sys_wait(). */
+	struct igt_mean mean;
+};
+
+/*
+ * Best-effort attempt to keep CPU wake-up latency low: write a 32-bit zero
+ * to /dev/cpu_dma_latency. If the open or the write fails, a message with
+ * strerror(errno) is printed to stderr and execution simply continues.
+ *
+ * The descriptor is never closed here.
+ * NOTE(review): presumably deliberate, as the kernel's PM QoS request is
+ * documented to last only while the file is held open - confirm.
+ */
+static void force_low_latency(void)
+{
+	const int32_t target = 0;
+	int fd;
+
+	fd = open("/dev/cpu_dma_latency", O_RDWR);
+	if (fd >= 0 && write(fd, &target, sizeof(target)) >= 0)
+		return;
+
+	fprintf(stderr,
+		"Unable to prevent CPU sleeps and force low latency using /dev/cpu_dma_latency: %s\n",
+		strerror(errno));
+}
+
+/*
+ * Submission worker (pthread start routine; @arg points to a struct busyspin).
+ *
+ * Creates its own context and a 4096-byte GTT buffer whose first 16 dwords
+ * are set to GFX_COMPUTE_NOP, then resubmits that 16-dword indirect buffer to
+ * bs->ip_type / bs->ring in a tight loop until `done` is set. bs->count ends
+ * up holding the number of submissions made. Before returning NULL the worker
+ * waits on the sequence number of the last request and releases the BO list,
+ * the buffer and the context.
+ *
+ * Setup and submission failures trip igt_assert_eq().
+ */
+static void *busyspin(void *arg)
+{
+	struct busyspin *bs = arg;
+	amdgpu_bo_handle ib_result_handle;
+	void *ib_result_cpu;
+	uint64_t ib_result_mc_address;
+	amdgpu_context_handle context;
+	struct amdgpu_cs_request ibs_request;
+	struct amdgpu_cs_ib_info ib_info;
+	uint32_t *ptr;
+	amdgpu_bo_list_handle bo_list;
+	amdgpu_va_handle va_handle;
+	int i, r;
+
+	/* Without this check a failure would leave 'context' uninitialised. */
+	r = amdgpu_cs_ctx_create(bs->device, &context);
+	igt_assert_eq(r, 0);
+
+	r = amdgpu_bo_alloc_and_map(bs->device, 4096, 4096,
+				    AMDGPU_GEM_DOMAIN_GTT, 0,
+				    &ib_result_handle, &ib_result_cpu,
+				    &ib_result_mc_address, &va_handle);
+	igt_assert_eq(r, 0);
+
+	/* The indirect buffer is nothing but 16 NOP dwords. */
+	ptr = ib_result_cpu;
+	for (i = 0; i < 16; ++i)
+		ptr[i] = GFX_COMPUTE_NOP;
+
+	r = amdgpu_bo_list_create(bs->device, 1, &ib_result_handle, NULL, &bo_list);
+	igt_assert_eq(r, 0);
+
+	memset(&ib_info, 0, sizeof(struct amdgpu_cs_ib_info));
+	ib_info.ib_mc_address = ib_result_mc_address;
+	ib_info.size = 16; /* matches the number of dwords filled above */
+
+	memset(&ibs_request, 0, sizeof(struct amdgpu_cs_request));
+	ibs_request.ip_type = bs->ip_type;
+	ibs_request.ring = bs->ring;
+	ibs_request.number_of_ibs = 1;
+	ibs_request.ibs = &ib_info;
+	ibs_request.resources = bo_list;
+
+	bs->count = 0;
+	while (!done) {
+		/* A failed submit must not be counted as a completed cycle. */
+		r = amdgpu_cs_submit(context, 0, &ibs_request, 1);
+		igt_assert_eq(r, 0);
+		bs->count++;
+	}
+
+	/* Wait on the last request before tearing its resources down. */
+	amdgpu_cs_sync(context, bs->ip_type, bs->ring, ibs_request.seq_no);
+
+	r = amdgpu_bo_list_destroy(bo_list);
+	igt_assert_eq(r, 0);
+
+	amdgpu_bo_unmap_and_free(ib_result_handle, va_handle,
+				 ib_result_mc_address, 4096);
+
+	amdgpu_cs_ctx_free(context);
+
+	return NULL;
+}
+
+/*
+ * Time from @a to @b in nanoseconds, as a double. The result is negative
+ * when @b is earlier than @a.
+ */
+static double elapsed(const struct timespec *a, const struct timespec *b)
+{
+	const time_t whole_seconds = b->tv_sec - a->tv_sec;
+	const long leftover_ns = b->tv_nsec - a->tv_nsec;
+
+	return 1e9 * whole_seconds + leftover_ns;
+}
+
+/*
+ * Latency probe (pthread start routine; @arg points to a struct sys_wait).
+ *
+ * Creates a CLOCK_MONOTONIC timer that delivers SIGRTMIN to this thread.
+ * Until `done` is set it repeatedly arms the timer, as an absolute one-shot,
+ * for 100us plus rand() % (NSEC_PER_SEC / 1000) ns after the previous
+ * wake-up. It then sigwait()s for the signal and adds the delay between the
+ * programmed expiry and the actual wake-up (in ns) to w->mean.
+ * Returns NULL.
+ */
+static void *sys_wait(void *arg)
+{
+	struct sys_wait *w = arg;
+	struct sigevent sev;
+	timer_t timer;
+	sigset_t mask;
+	struct timespec now;
+	int err;
+#define SIG SIGRTMIN
+
+	/* SIG has to be blocked so that sigwait() can collect it. */
+	sigemptyset(&mask);
+	sigaddset(&mask, SIG);
+	sigprocmask(SIG_SETMASK, &mask, NULL);
+
+	/* Zero first so no indeterminate field is handed to timer_create(). */
+	memset(&sev, 0, sizeof(sev));
+	sev.sigev_notify = SIGEV_SIGNAL | SIGEV_THREAD_ID;
+	sev.sigev_notify_thread_id = gettid();
+	sev.sigev_signo = SIG;
+
+	/*
+	 * Without a timer the sigwait() below would never return, so fail
+	 * loudly instead of hanging on an uninitialised timer id.
+	 */
+	err = timer_create(CLOCK_MONOTONIC, &sev, &timer);
+	igt_assert_eq(err, 0);
+
+	clock_gettime(CLOCK_MONOTONIC, &now);
+	while (!done) {
+		struct itimerspec its;
+		int sigs;
+
+		/* Next expiry: the last wake-up + 100us + a random extra. */
+		its.it_value = now;
+		its.it_value.tv_nsec += 100 * 1000;
+		its.it_value.tv_nsec += rand() % (NSEC_PER_SEC / 1000);
+		if (its.it_value.tv_nsec >= NSEC_PER_SEC) {
+			its.it_value.tv_nsec -= NSEC_PER_SEC;
+			its.it_value.tv_sec += 1;
+		}
+		/* One-shot: no periodic reload. */
+		its.it_interval.tv_sec = its.it_interval.tv_nsec = 0;
+		timer_settime(timer, TIMER_ABSTIME, &its, NULL);
+
+		sigwait(&mask, &sigs);
+		clock_gettime(CLOCK_MONOTONIC, &now);
+		igt_mean_add(&w->mean, elapsed(&its.it_value, &now));
+	}
+
+	sigprocmask(SIG_UNBLOCK, &mask, NULL);
+	timer_delete(timer);
+
+	return NULL;
+}
+
+/*
+ * Restrict threads created with @attr to the single CPU @cpu.
+ * Does nothing when @cpu is -1, or when __USE_GNU is not defined (the
+ * affinity call is compiled out). The result of the affinity call is ignored.
+ */
+static void bind_cpu(pthread_attr_t *attr, int cpu)
+{
+#ifdef __USE_GNU
+	if (cpu != -1) {
+		cpu_set_t only_this_cpu;
+
+		CPU_ZERO(&only_this_cpu);
+		CPU_SET(cpu, &only_this_cpu);
+
+		pthread_attr_setaffinity_np(attr, sizeof(only_this_cpu),
+					    &only_this_cpu);
+	}
+#endif
+}
+
+/*
+ * Make threads created with @attr run under SCHED_FIFO at priority @prio,
+ * using the attribute's own scheduling settings instead of inheriting the
+ * creator's (PTHREAD_EXPLICIT_SCHED).
+ *
+ * Does nothing when PTHREAD_EXPLICIT_SCHED is not defined. The return values
+ * of the pthread_attr_*() calls are ignored.
+ */
+static void rtprio(pthread_attr_t *attr, int prio)
+{
+#ifdef PTHREAD_EXPLICIT_SCHED
+	/* Honour the caller's priority rather than a hard-coded 99. */
+	struct sched_param param = { .sched_priority = prio };
+	pthread_attr_setinheritsched(attr, PTHREAD_EXPLICIT_SCHED);
+	/* Policy before parameters, so the priority is judged against FIFO. */
+	pthread_attr_setschedpolicy(attr, SCHED_FIFO);
+	pthread_attr_setschedparam(attr, &param);
+#endif
+}
+
+/*
+ * Location estimate chosen by sample count: the mean for up to 5 values,
+ * the median for 6 to 9 values, and the trimean for more than 9.
+ */
+static double l_estimate(igt_stats_t *stats)
+{
+	if (stats->n_values <= 5)
+		return igt_stats_get_mean(stats);
+
+	if (stats->n_values <= 9)
+		return igt_stats_get_median(stats);
+
+	return igt_stats_get_trimean(stats);
+}
+
+/*
+ * Estimate the cost of a single clock_gettime(CLOCK_MONOTONIC) call, in
+ * nanoseconds: take a start timestamp, issue 1024 further calls back to back
+ * and divide the time up to the last of them by 1024.
+ */
+static double min_measurement_error(void)
+{
+	enum { SAMPLES = 1024 };
+	struct timespec first, last;
+	int i = 0;
+
+	clock_gettime(CLOCK_MONOTONIC, &first);
+	while (i++ < SAMPLES)
+		clock_gettime(CLOCK_MONOTONIC, &last);
+
+	return elapsed(&first, &last) / SAMPLES;
+}
+
+/*
+ * Run one latency measurement against a single engine.
+ *
+ * One busyspin() thread and one sys_wait() thread are started per online
+ * CPU. Both are bound to that CPU where bind_cpu() has an effect, and the
+ * sys_wait() threads use the attributes set up by rtprio(). After @timeout
+ * seconds the workers are stopped via `done` and joined, then one summary
+ * line is printed: the mean submission count per busyspin() thread, and the
+ * mean and l_estimate() of the maximum wake-up latency, both reduced by @min
+ * and scaled from ns to us.
+ *
+ * @device:  amdgpu device handed to every busyspin() worker
+ * @min:     measurement overhead in ns, subtracted from reported latencies
+ * @name:    label printed at the start of the summary line
+ * @ip_type: ip_type the busyspin() workers submit to
+ * @ring:    ring the busyspin() workers submit to
+ * @timeout: how long to let the workers run, in seconds
+ *
+ * If a thread cannot be created, the threads already running are stopped and
+ * joined, everything is released and the failure is reported by igt_assert_f().
+ */
+static void syslatency(amdgpu_device_handle device,
+		       double min,
+		       const char *name,
+		       unsigned int ip_type,
+		       unsigned int ring,
+		       unsigned int timeout)
+{
+	const int ncpus = sysconf(_SC_NPROCESSORS_ONLN);
+	struct busyspin *busy;
+	struct sys_wait *wait;
+	pthread_attr_t attr;
+	igt_stats_t cycles, mean, max;
+	int nbusy = 0, nwait = 0;
+	int err = 0;
+	int n;
+
+	igt_assert(ncpus > 0);
+
+	done = 0;
+
+	busy = calloc(ncpus, sizeof(*busy));
+	igt_assert(busy);
+	wait = calloc(ncpus, sizeof(*wait));
+	igt_assert(wait);
+
+	pthread_attr_init(&attr);
+	for (n = 0; n < ncpus && !err; n++) {
+		bind_cpu(&attr, n);
+		busy[n].device = device;
+		busy[n].ip_type = ip_type;
+		busy[n].ring = ring;
+		err = pthread_create(&busy[n].thread, &attr, busyspin, &busy[n]);
+		if (!err)
+			nbusy++;
+	}
+	/* An attribute object must be destroyed before being initialised again. */
+	pthread_attr_destroy(&attr);
+
+	pthread_attr_init(&attr);
+	rtprio(&attr, 99);
+	for (n = 0; n < ncpus && !err; n++) {
+		igt_mean_init(&wait[n].mean);
+		bind_cpu(&attr, n);
+		err = pthread_create(&wait[n].thread, &attr, sys_wait, &wait[n]);
+		if (!err)
+			nwait++;
+	}
+	pthread_attr_destroy(&attr);
+
+	/* Only measure when every worker is actually running. */
+	if (!err)
+		sleep(timeout);
+	done = 1;
+
+	/* Join only the threads that were really created. */
+	igt_stats_init_with_size(&cycles, ncpus);
+	for (n = 0; n < nbusy; n++) {
+		pthread_join(busy[n].thread, NULL);
+		igt_stats_push(&cycles, busy[n].count);
+	}
+
+	igt_stats_init_with_size(&mean, ncpus);
+	igt_stats_init_with_size(&max, ncpus);
+	for (n = 0; n < nwait; n++) {
+		pthread_join(wait[n].thread, NULL);
+		igt_stats_push_float(&mean, wait[n].mean.mean);
+		igt_stats_push_float(&max, wait[n].mean.max);
+	}
+
+	if (!err)
+		igt_info("%s: cycles=%.0f, latency mean=%.3fus max=%.0fus\n",
+			 name,
+			 igt_stats_get_mean(&cycles),
+			 (igt_stats_get_mean(&mean) - min)/ 1000,
+			 (l_estimate(&max) - min) / 1000);
+
+	igt_stats_fini(&max);
+	igt_stats_fini(&mean);
+	igt_stats_fini(&cycles);
+	free(wait);
+	free(busy);
+
+	/* Report a creation failure only after all threads are gone. */
+	igt_assert_f(err == 0, "pthread_create failed: %s\n", strerror(err));
+}
+
+/*
+ * Test entry point. Registers one subtest named "<engine>-0" per entry of
+ * engines[]; each one calls syslatency() on ring 0 of that engine with a
+ * 20 second timeout.
+ */
+igt_main
+{
+	amdgpu_device_handle device;
+	const struct engine {
+		const char *name;
+		unsigned int ip_type;
+	} engines[] = {
+		{ "compute", AMDGPU_HW_IP_COMPUTE },
+		{ "gfx", AMDGPU_HW_IP_GFX },
+		/* Terminator: the subtest loop stops at the first NULL name. */
+		{ },
+	}, *e;
+	double min;
+	int fd = -1;
+
+	/*
+	 * Setup: open an amdgpu DRM node and initialise libdrm_amdgpu on it
+	 * (both are igt_require() conditions), then request low CPU latency
+	 * and measure the timing overhead that syslatency() receives as 'min'.
+	 */
+	igt_fixture {
+		uint32_t major, minor;
+		int err;
+
+		fd = __drm_open_driver(DRIVER_AMDGPU);
+		igt_require(fd >= 0);
+
+		err = amdgpu_device_initialize(fd, &major, &minor, &device);
+		igt_require(err == 0);
+
+		force_low_latency();
+		min = min_measurement_error();
+	}
+
+	for (e = engines; e->name; e++) {
+		igt_subtest_f("%s-0", e->name)
+			syslatency(device, min, e->name, e->ip_type, 0, 20);
+	}
+
+	/* Teardown: release the device handle and close the DRM fd. */
+	igt_fixture {
+		amdgpu_device_deinitialize(device);
+		close(fd);
+	}
+}
diff --git a/tests/amdgpu/meson.build b/tests/amdgpu/meson.build
index af5e74c7a..d24a84f68 100644
--- a/tests/amdgpu/meson.build
+++ b/tests/amdgpu/meson.build
@@ -5,6 +5,7 @@ if libdrm_amdgpu.found()
 	amdgpu_progs += [ 'amd_basic',
 			  'amd_cs_nop',
 			  'amd_prime',
+			  'amd_syslatency',
 			]
 	amdgpu_deps += libdrm_amdgpu
 endif
-- 
2.18.0