[Intel-gfx] [RFC i-g-t] benchmarks/gem_wsim: Command submission workload simulator

Tvrtko Ursulin tursulin at ursulin.net
Thu Mar 30 13:45:53 UTC 2017


From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>

Tool which emits batch buffers to engines with configurable
sequences, durations, contexts, dependencies and userspace waits.

Unfinished but shows promise so sending out for early feedback.

First run the tool with no arguments to get the calibration number
which needs to be passed to subsequent invocations using the -n
command line argument.

Example invocation:

  gem_wsim -w "1.VCS1.3000.0.1,1.RCS.1000.-1.0,1.RCS.3700.0.0,1.RCS.1000.-2.0,1.VCS2.2300.-2.0,1.RCS.4700.-1.0,1.VCS2.600.-1.1" \
	   -n <local_calibration> -r 600

This will send the workload as described 600 times to the GPU and
output the elapsed time.

Other interesting options:

  -c n	Spawn n clients
  -x    For workloads which reference VCS1 and VCS2, swap them
        around for every other client (in combination with -c).

Workload descriptor format:

 * ctx.engine.duration.dependency.wait,...
 * <uint>.<str>.<uint>.<int <= 0>.<0|1>,...
 *
 * Engine ids: RCS, BCS, VCS, VCS1, VCS2, VECS

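Dependency is a back reference to a previous batch on which the
current batch will depend: 0 means no dependency, -1 refers to the
previous step, -2 to the step before that, and so on.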

TODO list:

 * Reading workloads from files (for more readable workload
   descriptor format).
 * Better error handling.
 * Multi-context support for individual clients.
 * Random/variable batch length.
 * Load balancing plug-in.
 * Help text.
 * ... ?

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Cc: Chris Wilson <chris at chris-wilson.co.uk>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin at intel.com>
---
 benchmarks/Makefile.sources |   1 +
 benchmarks/gem_wsim.c       | 489 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 490 insertions(+)
 create mode 100644 benchmarks/gem_wsim.c

diff --git a/benchmarks/Makefile.sources b/benchmarks/Makefile.sources
index 3af54ebe36f2..3a941150abb3 100644
--- a/benchmarks/Makefile.sources
+++ b/benchmarks/Makefile.sources
@@ -14,6 +14,7 @@ benchmarks_prog_list =			\
 	gem_prw				\
 	gem_set_domain			\
 	gem_syslatency			\
+	gem_wsim			\
 	kms_vblank			\
 	prime_lookup			\
 	vgem_mmap			\
diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c
new file mode 100644
index 000000000000..d1f1f40c492f
--- /dev/null
+++ b/benchmarks/gem_wsim.c
@@ -0,0 +1,489 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <strings.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <limits.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <time.h>
+#include <assert.h>
+
+#include "drm.h"
+#include "ioctl_wrappers.h"
+#include "drmtest.h"
+#include "intel_io.h"
+
+/* One step of a workload: a single batch submitted to one engine. */
+struct w_step {
+	/* Values taken from the workload descriptor. */
+	unsigned int context;	/* context id field of the record */
+	unsigned int engine;	/* holds an enum intel_engine_id value */
+	unsigned int duration;	/* requested batch length, scaled by the nop calibration */
+	int dependency;		/* 0 for none, else a negative offset to an earlier step */
+	int wait;		/* non-zero: sync on this step right after submitting it */
+
+	/* Execbuf state, filled in when the workload is prepared. */
+	struct drm_i915_gem_execbuffer2 eb;
+	struct drm_i915_gem_exec_object2 obj[3];
+};
+
+/* A parsed workload: an ordered array of steps plus the GPU context used. */
+struct workload {
+	/* Number of entries in steps[]. */
+	unsigned int nr_steps;
+	/* Heap-allocated array of nr_steps steps. */
+	struct w_step *steps;
+
+	/* Context id placed in every step's execbuf. */
+	uint32_t ctx_id;
+};
+
+/*
+ * Engine identifiers accepted in workload descriptors. The values index
+ * eb_engine_map[], so they must stay dense and start at zero.
+ */
+enum intel_engine_id {
+	RCS = 0,
+	BCS,
+	VCS,
+	VCS1,
+	VCS2,
+	VECS,
+	NUM_ENGINES	/* number of engines, not an engine itself */
+};
+
+/* Execbuf engine-selection flags for each intel_engine_id. */
+static const unsigned int eb_engine_map[NUM_ENGINES] = {
+	[RCS]  = I915_EXEC_RENDER,
+	[BCS]  = I915_EXEC_BLT,
+	[VCS]  = I915_EXEC_BSD,
+	[VCS1] = I915_EXEC_BSD | I915_EXEC_BSD_RING1,
+	[VCS2] = I915_EXEC_BSD | I915_EXEC_BSD_RING2,
+	[VECS] = I915_EXEC_VEBOX,
+};
+
+/* Dword written at the end of every batch; presumably MI_BATCH_BUFFER_END. */
+static const uint32_t bbe = UINT32_C(0xa) << 23;
+
+/* Batch duration, in microseconds, that nop_calibration corresponds to. */
+static const unsigned int nop_calibration_us = 1000;
+/* Calibration value, set from the -n option; zero selects calibration mode. */
+static unsigned long nop_calibration;
+
+/* When set (-q), informational and error messages are suppressed. */
+static bool quiet;
+/* DRM device file descriptor shared by all helpers. */
+static int fd;
+
+/*
+ * Workload descriptor:
+ *
+ * ctx.engine.duration.dependency.wait,...
+ * <uint>.<str>.<uint>.<int <= 0>.<0|1>,...
+ *
+ * Engine ids: RCS, BCS, VCS, VCS1, VCS2, VECS
+ *
+ * "1.VCS1.3000.0.1,1.RCS.1000.-1.0,1.RCS.3700.0.0,1.RCS.1000.-2.0,1.VCS2.2300.-2.0,1.RCS.4700.-1.0,1.VCS2.600.-1.1"
+ */
+
+/*
+ * Parse one complete decimal integer field.
+ *
+ * Returns 0 and stores the value in @out on success. Returns -1 if @str is
+ * empty, has trailing characters or does not fit in an int. Unlike atoi()
+ * this rejects fields such as "1x" or "abc".
+ */
+static int parse_int_field(const char *str, int *out)
+{
+	char *end;
+	long val;
+
+	errno = 0;
+	val = strtol(str, &end, 10);
+	if (end == str || *end != '\0' || errno == ERANGE ||
+	    val < INT_MIN || val > INT_MAX)
+		return -1;
+
+	*out = val;
+
+	return 0;
+}
+
+/*
+ * Map a case-insensitive engine name to its enum intel_engine_id value.
+ *
+ * Returns -1 if the name is not recognised.
+ */
+static int parse_engine_name(const char *name)
+{
+	static const char * const names[NUM_ENGINES] = {
+		[RCS] = "RCS",
+		[BCS] = "BCS",
+		[VCS] = "VCS",
+		[VCS1] = "VCS1",
+		[VCS2] = "VCS2",
+		[VECS] = "VECS",
+	};
+	int i;
+
+	for (i = 0; i < NUM_ENGINES; i++) {
+		if (!strcasecmp(name, names[i]))
+			return i;
+	}
+
+	return -1;
+}
+
+/*
+ * Parse a workload descriptor string into a newly allocated workload.
+ *
+ * @desc is a comma separated list of records, each made of exactly five
+ * dot separated fields: ctx.engine.duration.dependency.wait. The string is
+ * modified in place by strtok_r().
+ *
+ * Returns the workload, to be released with fini_workload(), or NULL if the
+ * descriptor is empty or any record is malformed. A diagnostic naming the
+ * offending step is printed to stderr unless quiet is set. Nothing is leaked
+ * on failure.
+ */
+static struct workload *parse_workload(char *desc)
+{
+	struct workload *wrk;
+	struct w_step *steps = NULL, *grown;
+	unsigned int nr_steps = 0;
+	char *token, *tctx, *tstart = desc;
+
+	while ((token = strtok_r(tstart, ",", &tctx)) != NULL) {
+		struct w_step step;
+		const char *err = NULL;
+		unsigned int nr_fields = 0;
+		char *field, *fctx;
+		int tmp;
+
+		tstart = NULL;
+		/* Never copy indeterminate execbuf state into the array. */
+		memset(&step, 0, sizeof(step));
+
+		for (field = strtok_r(token, ".", &fctx);
+		     field && !err;
+		     field = strtok_r(NULL, ".", &fctx)) {
+			switch (nr_fields) {
+			case 0:
+				/* Only a single context is supported so far. */
+				if (parse_int_field(field, &tmp) || tmp != 1)
+					err = "Invalid ctx id";
+				else
+					step.context = tmp;
+				break;
+			case 1:
+				tmp = parse_engine_name(field);
+				if (tmp < 0)
+					err = "Invalid engine id";
+				else
+					step.engine = tmp;
+				break;
+			case 2:
+				if (parse_int_field(field, &tmp) || tmp <= 0)
+					err = "Invalid duration";
+				else
+					step.duration = tmp;
+				break;
+			case 3:
+				if (parse_int_field(field, &tmp))
+					err = "Invalid dependency";
+				else if (tmp > 0)
+					err = "Invalid forward dependency";
+				else if (-(long long)tmp > (long long)nr_steps)
+					/* Would point before the first step. */
+					err = "Dependency out of range";
+				else
+					step.dependency = tmp;
+				break;
+			case 4:
+				if (parse_int_field(field, &tmp) ||
+				    (tmp != 0 && tmp != 1))
+					err = "Invalid wait boolean";
+				else
+					step.wait = tmp;
+				break;
+			default:
+				/* More than five fields in the record. */
+				err = "Invalid record";
+				break;
+			}
+
+			nr_fields++;
+		}
+
+		if (!err && nr_fields != 5)
+			err = "Invalid record";
+
+		if (err) {
+			if (!quiet)
+				fprintf(stderr, "%s at step %u!\n",
+					err, nr_steps);
+			free(steps);
+			return NULL;
+		}
+
+		grown = realloc(steps, sizeof(*steps) * (nr_steps + 1));
+		igt_assert(grown);
+		steps = grown;
+		steps[nr_steps++] = step;
+	}
+
+	if (!nr_steps) {
+		if (!quiet)
+			fprintf(stderr, "Empty workload descriptor!\n");
+		return NULL;
+	}
+
+	wrk = malloc(sizeof(*wrk));
+	igt_assert(wrk);
+
+	wrk->nr_steps = nr_steps;
+	wrk->steps = steps;
+	wrk->ctx_id = 0;
+
+	return wrk;
+}
+
+/*
+ * Make a deep copy of a workload: a new workload object with its own copy
+ * of the step array. ctx_id is not copied and is left unset in the clone.
+ *
+ * Allocation failure trips igt_assert(). The result is released with
+ * fini_workload().
+ */
+static struct workload *
+clone_workload(struct workload *_wrk)
+{
+	const size_t bytes = sizeof(struct w_step) * _wrk->nr_steps;
+	struct workload *copy = malloc(sizeof(*copy));
+
+	igt_assert(copy);
+	copy->nr_steps = _wrk->nr_steps;
+
+	copy->steps = malloc(bytes);
+	igt_assert(copy->steps);
+	memcpy(copy->steps, _wrk->steps, bytes);
+
+	return copy;
+}
+
+/*
+ * Create the GPU objects and execbuf descriptions for every step of @wrk.
+ *
+ * A context is created for the workload. For each step two buffers are
+ * created: a 4KiB object marked as written, and a batch whose size is
+ * derived from the step duration and the nop calibration, terminated by
+ * bbe. A step with a dependency additionally lists the 4KiB object of the
+ * referenced earlier step, with the batch kept as the last object.
+ *
+ * @swap_vcs: exchange VCS1 and VCS2 for this workload.
+ */
+static void prepare_workload(struct workload *wrk, bool swap_vcs)
+{
+	struct drm_i915_gem_context_create arg = {};
+	struct w_step *w;
+	unsigned int i;
+
+	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &arg)) {
+		/*
+		 * Keep going with context id 0, as happened implicitly
+		 * before, but tell the user about it.
+		 */
+		if (!quiet)
+			fprintf(stderr,
+				"Context create failed (%s), using context 0!\n",
+				strerror(errno));
+		arg.ctx_id = 0;
+	}
+	wrk->ctx_id = arg.ctx_id;
+
+	for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
+		unsigned long sz;
+		enum intel_engine_id engine = w->engine;
+
+		/*
+		 * Dependencies only point backwards, so clearing each step
+		 * just before it is set up cannot wipe a handle that a later
+		 * step needs.
+		 */
+		memset(&w->eb, 0, sizeof(w->eb));
+		memset(&w->obj, 0, sizeof(w->obj));
+
+		sz = ALIGN(w->duration * nop_calibration * sizeof(uint32_t) /
+			   nop_calibration_us, sizeof(uint32_t));
+		/*
+		 * Short durations with a small calibration round down to
+		 * zero; the batch must at least hold the terminating dword.
+		 */
+		if (sz < sizeof(bbe))
+			sz = sizeof(bbe);
+
+		igt_assert(w->context == 1); /* TODO */
+
+		w->obj[0].handle = gem_create(fd, 4096);
+		w->obj[0].flags = EXEC_OBJECT_WRITE;
+
+		w->obj[1].handle = gem_create(fd, sz);
+		gem_write(fd, w->obj[1].handle, sz - sizeof(bbe), &bbe,
+			  sizeof(bbe));
+
+		w->eb.buffer_count = 2;
+		w->eb.buffers_ptr = to_user_pointer(w->obj);
+		if (swap_vcs && engine == VCS1)
+			engine = VCS2;
+		else if (swap_vcs && engine == VCS2)
+			engine = VCS1;
+		w->eb.flags = eb_engine_map[engine];
+		w->eb.rsvd1 = wrk->ctx_id;
+
+		igt_assert(w->dependency <= 0);
+		if (w->dependency) {
+			int dep_idx = (int)i + w->dependency;
+
+			igt_assert(dep_idx >= 0 &&
+				   (unsigned int)dep_idx < wrk->nr_steps);
+
+			/* Move the batch to the last slot. */
+			w->obj[2].handle = w->obj[1].handle;
+			w->obj[1].handle = wrk->steps[dep_idx].obj[0].handle;
+			w->eb.buffer_count = 3;
+		}
+
+#ifdef DEBUG
+		printf("%u: %u:%x|%x|%x %10lu flags=%llx\n",
+		       i, w->eb.buffer_count,
+		       w->obj[0].handle, w->obj[1].handle, w->obj[2].handle,
+		       sz, w->eb.flags);
+#endif
+	}
+}
+
+/* Return the time from @start to @end in seconds, as a double. */
+static double elapsed(const struct timespec *start, const struct timespec *end)
+{
+	const double whole = end->tv_sec - start->tv_sec;
+	const double frac = (end->tv_nsec - start->tv_nsec) / 1e9;
+
+	return whole + frac;
+}
+
+/*
+ * Submit every step of @wrk in order, @repeat times over, and report the
+ * wall-clock time taken. Steps flagged with wait are synced on right after
+ * being submitted.
+ *
+ * @id is only used to label the printed result.
+ */
+static void
+run_workload(unsigned int id, struct workload *wrk, unsigned int repeat)
+{
+	struct timespec begin, finish;
+	unsigned int pass, n;
+	double secs;
+
+	clock_gettime(CLOCK_MONOTONIC, &begin);
+
+	for (pass = 0; pass < repeat; pass++) {
+		for (n = 0; n < wrk->nr_steps; n++) {
+			struct w_step *step = &wrk->steps[n];
+
+			gem_execbuf(fd, &step->eb);
+			if (!step->wait)
+				continue;
+
+			gem_sync(fd, step->obj[0].handle);
+		}
+	}
+
+	clock_gettime(CLOCK_MONOTONIC, &finish);
+
+	if (quiet)
+		return;
+
+	secs = elapsed(&begin, &finish);
+	printf("%u: %fs elapsed (%f workloads/s)\n", id, secs, repeat / secs);
+}
+
+/*
+ * Release the memory of a workload: its step array and the workload object
+ * itself. GPU objects and the context are not touched.
+ */
+static void fini_workload(struct workload *wrk)
+{
+	struct w_step *steps = wrk->steps;
+
+	free(steps);
+	free(wrk);
+}
+
+/*
+ * Find the batch size which takes nop_calibration_us to execute.
+ *
+ * A batch of the current size guess is executed once to warm up, then
+ * timed over a fixed number of submissions. The next guess is derived from
+ * the measured throughput. The loop runs until the warmup iterations are
+ * used up, at least five seconds have passed and two consecutive guesses
+ * differ by no more than 5%.
+ *
+ * Returns the calibrated size in dwords.
+ */
+static unsigned long calibrate_nop(void)
+{
+	unsigned int loops = 17;
+	int warmup = 17;
+	unsigned int usecs = nop_calibration_us;
+	struct drm_i915_gem_exec_object2 obj = {};
+	struct drm_i915_gem_execbuffer2 eb =
+		{ .buffer_count = 1, .buffers_ptr = (uintptr_t)&obj};
+	long size, last_size;
+	struct timespec t_0, t_end;
+
+	clock_gettime(CLOCK_MONOTONIC, &t_0);
+
+	size = 256 * 1024;
+	do {
+		struct timespec t_start;
+
+		obj.handle = gem_create(fd, size);
+		gem_write(fd, obj.handle, size - sizeof(bbe), &bbe,
+			  sizeof(bbe));
+		/* Untimed first submission. */
+		gem_execbuf(fd, &eb);
+		gem_sync(fd, obj.handle);
+
+		clock_gettime(CLOCK_MONOTONIC, &t_start);
+		for (unsigned int loop = 0; loop < loops; loop++)
+			gem_execbuf(fd, &eb);
+		gem_sync(fd, obj.handle);
+		clock_gettime(CLOCK_MONOTONIC, &t_end);
+
+		gem_close(fd, obj.handle);
+
+		last_size = size;
+		/* Bytes executed per microsecond, times the target time. */
+		size = loops * size /
+		       elapsed(&t_start, &t_end) / 1e6 * usecs;
+		size = ALIGN(size, sizeof(uint32_t));
+	} while (warmup-- > 0 ||
+		 elapsed(&t_0, &t_end) < 5.0 ||
+		 /* labs(), not abs(): the operands are long. */
+		 labs(size - last_size) > (size * 5 / 100));
+
+	return size / sizeof(uint32_t);
+}
+
+/*
+ * Parse a complete integer option argument within [min, max].
+ *
+ * The base is auto-detected as with strtol() base 0. Returns 0 and stores
+ * the value in @out on success, -1 on empty input, trailing garbage or an
+ * out of range value.
+ */
+static int parse_long_opt(const char *str, long min, long max, long *out)
+{
+	char *end;
+	long val;
+
+	errno = 0;
+	val = strtol(str, &end, 0);
+	if (end == str || *end != '\0' || errno == ERANGE ||
+	    val < min || val > max)
+		return -1;
+
+	*out = val;
+
+	return 0;
+}
+
+/*
+ * Entry point.
+ *
+ * Without a nop calibration (-n missing or zero) the tool calibrates,
+ * prints the result and exits. Otherwise the workload given with -w is
+ * parsed, cloned and prepared once per client (-c), run -r times by each
+ * forked client, and the total elapsed time is printed.
+ *
+ * Returns 0 on success and 1 on invalid options or workload descriptor.
+ */
+int main(int argc, char **argv)
+{
+	unsigned int repeat = 1;
+	unsigned int clients = 1;
+	bool swap_vcs = false;
+	struct timespec t_start, t_end;
+	struct workload **w, *wrk;
+	char *w_str = NULL;
+	unsigned int i;
+	long val;
+	double t;
+	int c;
+
+	fd = drm_open_driver(DRIVER_INTEL);
+
+	/*
+	 * Option errors are reported unconditionally since -q may not have
+	 * been seen yet when they are detected.
+	 */
+	while ((c = getopt(argc, argv, "c:n:r:qxw:")) != -1) {
+		switch (c) {
+		case 'w':
+			w_str = optarg;
+			break;
+		case 'c':
+			if (parse_long_opt(optarg, 1, INT_MAX, &val)) {
+				fprintf(stderr,
+					"Invalid number of clients '%s'!\n",
+					optarg);
+				return 1;
+			}
+			clients = val;
+			break;
+		case 'n':
+			/* Zero is allowed and selects calibration mode. */
+			if (parse_long_opt(optarg, 0, LONG_MAX, &val)) {
+				fprintf(stderr,
+					"Invalid nop calibration '%s'!\n",
+					optarg);
+				return 1;
+			}
+			nop_calibration = val;
+			break;
+		case 'r':
+			if (parse_long_opt(optarg, 0, INT_MAX, &val)) {
+				fprintf(stderr,
+					"Invalid repeat count '%s'!\n",
+					optarg);
+				return 1;
+			}
+			repeat = val;
+			break;
+		case 'q':
+			quiet = true;
+			break;
+		case 'x':
+			swap_vcs = true;
+			break;
+		default:
+			/* getopt() has already printed a diagnostic. */
+			return 1;
+		}
+	}
+
+	if (!nop_calibration) {
+		if (!quiet)
+			printf("Calibrating nop delay...\n");
+		nop_calibration = calibrate_nop();
+		if (!quiet)
+			printf("Nop calibration for %uus delay is %lu.\n",
+			       nop_calibration_us, nop_calibration);
+
+		return 0;
+	} else {
+		if (!w_str) {
+			if (!quiet)
+				fprintf(stderr,
+					"Workload descriptor missing!\n");
+			return 1;
+		}
+
+		wrk = parse_workload(w_str);
+		if (!wrk) {
+			if (!quiet)
+				fprintf(stderr, "Failed to parse workload!\n");
+			return 1;
+		}
+	}
+
+	if (!quiet) {
+		printf("Using %lu nop calibration for %uus delay.\n",
+		       nop_calibration, nop_calibration_us);
+		printf("%u client%s.\n", clients, clients > 1 ? "s" : "");
+		if (swap_vcs)
+			printf("Swapping VCS rings between clients.\n");
+	}
+
+	w = malloc(sizeof(struct workload *) * clients);
+	igt_assert(w);
+
+	/* With -x every other client gets VCS1 and VCS2 exchanged. */
+	for (i = 0; i < clients; i++) {
+		w[i] = clone_workload(wrk);
+		prepare_workload(w[i], swap_vcs && (i & 1));
+	}
+
+	clock_gettime(CLOCK_MONOTONIC, &t_start);
+
+	igt_fork(child, clients)
+		run_workload(child, w[child], repeat);
+
+	igt_waitchildren();
+
+	clock_gettime(CLOCK_MONOTONIC, &t_end);
+
+	t = elapsed(&t_start, &t_end);
+	if (!quiet)
+		/* Multiply as double so the product cannot wrap. */
+		printf("%fs elapsed (%f workloads/s)\n",
+		       t, (double)clients * repeat / t);
+
+	for (i = 0; i < clients; i++)
+		fini_workload(w[i]);
+
+	free(w);
+	fini_workload(wrk);
+
+	return 0;
+}
-- 
2.9.3



More information about the Intel-gfx mailing list