[Intel-gfx] [PATCH i-g-t 1/2] benchmarks/gem_wsim: Command submission workload simulator

Fri Mar 31 14:58:25 UTC 2017

From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>

Tool which emits batch buffers to engines with configurable
sequences, durations, contexts, dependencies and userspace waits.

Unfinished but shows promise so sending out for early feedback.

v2:
 * Load workload descriptors from files. (also -w)
 * Help text.
 * Calibration control if needed. (-t)
 * NORELOC | LUT to eb flags.
 * Added sample workload to wsim/workload1.

TODO list:

 * Better error handling.
 * Multi-context support for individual clients.
 * Random/variable batch length.
 * Load balancing plug-in.
 * ... ?

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Cc: Chris Wilson <chris at chris-wilson.co.uk>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin at intel.com>

gem_wsim updates

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
---
 benchmarks/Makefile.sources |   1 +
 benchmarks/gem_wsim.c       | 593 ++++++++++++++++++++++++++++++++++++++++++++
 benchmarks/wsim/workload1   |   7 +
 3 files changed, 601 insertions(+)
 create mode 100644 benchmarks/gem_wsim.c
 create mode 100644 benchmarks/wsim/workload1

diff --git a/benchmarks/Makefile.sources b/benchmarks/Makefile.sources
index 3af54ebe36f2..3a941150abb3 100644
--- a/benchmarks/Makefile.sources
+++ b/benchmarks/Makefile.sources
@@ -14,6 +14,7 @@ benchmarks_prog_list =			\
 	gem_prw				\
 	gem_set_domain			\
 	gem_syslatency			\
+	gem_wsim			\
 	kms_vblank			\
 	prime_lookup			\
 	vgem_mmap			\
diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c
new file mode 100644
index 000000000000..029967281251
--- /dev/null
+++ b/benchmarks/gem_wsim.c
@@ -0,0 +1,593 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <time.h>
+#include <assert.h>
+
+#include "drm.h"
+#include "ioctl_wrappers.h"
+#include "drmtest.h"
+#include "intel_io.h"
+
+struct w_step
+{
+	/* Workload step metadata */
+	unsigned int context;
+	unsigned int engine;
+	unsigned int duration;
+	int dependency;
+	int wait;
+
+	/* Implementation details */
+	struct drm_i915_gem_execbuffer2 eb;
+	struct drm_i915_gem_exec_object2 obj[3];
+};
+
+struct workload
+{
+	unsigned int nr_steps;
+	struct w_step *steps;
+
+	uint32_t ctx_id;
+};
+
+enum intel_engine_id {
+	RCS,
+	BCS,
+	balance_VCS,
+	VCS,
+	VCS1,
+	VCS2,
+	VECS,
+	NUM_ENGINES
+};
+
+static const unsigned int eb_engine_map[NUM_ENGINES] = {
+	[RCS] = I915_EXEC_RENDER,
+	[BCS] = I915_EXEC_BLT,
+	[balance_VCS] = I915_EXEC_BSD,
+	[VCS] = I915_EXEC_BSD,
+	[VCS1] = I915_EXEC_BSD | I915_EXEC_BSD_RING1,
+	[VCS2] = I915_EXEC_BSD | I915_EXEC_BSD_RING2,
+	[VECS] = I915_EXEC_VEBOX };
+
+static const uint32_t bbe = 0xa << 23;
+static const unsigned int nop_calibration_us = 1000;
+static unsigned long nop_calibration;
+
+static bool quiet;
+static int fd;
+
+/*
+ * Workload descriptor:
+ *
+ * ctx.engine.duration.dependency.wait,...
+ * <uint>.<str>.<uint>.<int <= 0>.<0|1>,...
+ *
+ * Engine ids: RCS, BCS, VCS, VCS1, VCS2, VECS
+ *
+ * "1.VCS1.3000.0.1,1.RCS.1000.-1.0,1.RCS.3700.0.0,1.RCS.1000.-2.0,1.VCS2.2300.-2.0,1.RCS.4700.-1.0,1.VCS2.600.-1.1"
+ */
+
+static struct workload *parse_workload(char *desc)
+{
+	struct workload *wrk;
+	unsigned int nr_steps = 0;
+	char *token, *tctx, *tstart = desc;
+	char *field, *fctx, *fstart;
+	struct w_step step, *steps = NULL;
+	unsigned int valid;
+	int tmp;
+
+	while ((token = strtok_r(tstart, ",", &tctx)) != NULL) {
+		tstart = NULL;
+		fstart = token;
+		valid = 0;
+
+		if ((field = strtok_r(fstart, ".", &fctx)) != NULL) {
+			fstart = NULL;
+
+			tmp = atoi(field);
+			if (tmp != 1) {
+				if (!quiet)
+					fprintf(stderr,
+						"Invalid ctx id at step %u!\n",
+						nr_steps);
+				return NULL;
+			}
+			step.context = tmp;
+
+			valid++;
+		}
+
+		if ((field = strtok_r(fstart, ".", &fctx)) != NULL) {
+			fstart = NULL;
+
+			if (!strcasecmp(field, "RCS")) {
+				step.engine = RCS;
+				valid++;
+			} else if (!strcasecmp(field, "BCS")) {
+				step.engine = BCS;
+				valid++;
+			} else if (!strcasecmp(field, "balance_VCS")) {
+				step.engine = balance_VCS;
+				valid++;
+			} else if (!strcasecmp(field, "VCS")) {
+				step.engine = VCS;
+				valid++;
+			} else if (!strcasecmp(field, "VCS1")) {
+				step.engine = VCS1;
+				valid++;
+			} else if (!strcasecmp(field, "VCS2")) {
+				step.engine = VCS2;
+				valid++;
+			} else if (!strcasecmp(field, "VECS")) {
+				step.engine = VECS;
+				valid++;
+			} else {
+				if (!quiet)
+					fprintf(stderr,
+						"Invalid engine id at step %u!\n",
+						nr_steps);
+				return NULL;
+			}
+		}
+
+		if ((field = strtok_r(fstart, ".", &fctx)) != NULL) {
+			fstart = NULL;
+
+			tmp = atoi(field);
+			if (tmp <= 0) {
+				if (!quiet)
+					fprintf(stderr,
+						"Invalid duration at step %u!\n",
+						nr_steps);
+				return NULL;
+			}
+			step.duration = tmp;
+
+			valid++;
+		}
+
+		if ((field = strtok_r(fstart, ".", &fctx)) != NULL) {
+			fstart = NULL;
+
+			tmp = atoi(field);
+			if (tmp > 0) {
+				if (!quiet)
+					fprintf(stderr,
+						"Invalid forward dependency at step %u!\n",
+						nr_steps);
+				return NULL;
+			}
+			step.dependency = tmp;
+
+			valid++;
+		}
+
+		if ((field = strtok_r(fstart, ".", &fctx)) != NULL) {
+			fstart = NULL;
+
+			tmp = atoi(field);
+			if (tmp != 0 && tmp != 1) {
+				if (!quiet)
+					fprintf(stderr,
+						"Invalid wait boolean at step %u!\n",
+						nr_steps);
+				return NULL;
+			}
+			step.wait = tmp;
+
+			valid++;
+		}
+
+		if (valid != 5) {
+			if (!quiet)
+				fprintf(stderr, "Invalid record at step %u!\n",
+					nr_steps);
+			return NULL;
+		}
+
+		nr_steps++;
+		steps = realloc(steps, sizeof(step) * nr_steps);
+		igt_assert(steps);
+
+		memcpy(&steps[nr_steps - 1], &step, sizeof(step));
+	}
+
+	wrk = malloc(sizeof(*wrk));
+	igt_assert(wrk);
+
+	wrk->nr_steps = nr_steps;
+	wrk->steps = steps;
+
+	return wrk;
+}
+
+static struct workload *
+clone_workload(struct workload *_wrk)
+{
+	struct workload *wrk;
+
+	wrk = malloc(sizeof(*wrk));
+	igt_assert(wrk);
+
+	wrk->nr_steps = _wrk->nr_steps;
+	wrk->steps = malloc(sizeof(struct w_step) * wrk->nr_steps);
+	igt_assert(wrk->steps);
+
+	memcpy(wrk->steps, _wrk->steps, sizeof(struct w_step) * wrk->nr_steps);
+
+	return wrk;
+}
+
+static void prepare_workload(struct workload *wrk, bool swap_vcs)
+{
+	struct drm_i915_gem_context_create arg = {};
+	struct w_step *w;
+	int i;
+
+	drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &arg);
+	wrk->ctx_id = arg.ctx_id;
+
+	for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
+		memset(&w->eb, 0, sizeof(w->eb));
+		memset(&w->obj, 0, sizeof(w->obj));
+	}
+
+	for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
+		unsigned long sz;
+		enum intel_engine_id engine = w->engine;
+
+		sz = ALIGN(w->duration * nop_calibration * sizeof(uint32_t) /
+			   nop_calibration_us, sizeof(uint32_t));
+
+		igt_assert(w->context == 1); /* TODO */
+
+		w->obj[0].handle = gem_create(fd, 4096);
+		w->obj[0].flags = EXEC_OBJECT_WRITE;
+
+		w->obj[1].handle = gem_create(fd, sz);
+		gem_write(fd, w->obj[1].handle, sz - sizeof(bbe), &bbe,
+			  sizeof(bbe));
+
+		w->eb.buffer_count = 2;
+		w->eb.buffers_ptr = to_user_pointer(w->obj);
+		if (swap_vcs && engine == VCS1)
+			engine = VCS2;
+		else if (swap_vcs && engine == VCS2)
+			engine = VCS1;
+		w->eb.flags = eb_engine_map[engine];
+		w->eb.flags |= I915_EXEC_NO_RELOC;
+		w->eb.flags |= I915_EXEC_HANDLE_LUT;
+		w->eb.rsvd1 = wrk->ctx_id;
+
+		igt_assert(w->dependency <= 0);
+		if (w->dependency) {
+			int dep_idx = i + w->dependency;
+
+			igt_assert(dep_idx >= 0 && dep_idx < wrk->nr_steps);
+
+			w->obj[2].handle = w->obj[1].handle;
+			w->obj[1].handle = wrk->steps[dep_idx].obj[0].handle;
+			w->eb.buffer_count = 3;
+		}
+
+#ifdef DEBUG
+		printf("%u: %u:%x|%x|%x %10lu flags=%llx\n",
+		       i, w->eb.buffer_count,
+		       w->obj[0].handle, w->obj[1].handle, w->obj[2].handle,
+		       sz, w->eb.flags);
+#endif
+	}
+}
+
+static double elapsed(const struct timespec *start, const struct timespec *end)
+{
+	return (end->tv_sec - start->tv_sec) +
+	       (end->tv_nsec - start->tv_nsec) / 1e9;
+}
+
+static void
+run_workload(unsigned int id, struct workload *wrk, unsigned int repeat)
+{
+	struct timespec t_start, t_end;
+	struct w_step *w;
+	double t;
+	int i, j;
+
+	clock_gettime(CLOCK_MONOTONIC, &t_start);
+
+	for (j = 0; j < repeat; j++) {
+		for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
+			gem_execbuf(fd, &w->eb);
+			if (w->wait)
+				gem_sync(fd, w->obj[0].handle);
+		}
+	}
+
+	clock_gettime(CLOCK_MONOTONIC, &t_end);
+
+	t = elapsed(&t_start, &t_end);
+	if (!quiet)
+		printf("%u: %fs elapsed (%f workloads/s)\n", id, t, repeat / t);
+}
+
+static void fini_workload(struct workload *wrk)
+{
+	free(wrk->steps);
+	free(wrk);
+}
+
+static unsigned long calibrate_nop(unsigned int tolerance_pct)
+{
+	unsigned int loops = 17;
+	unsigned int usecs = nop_calibration_us;
+	struct drm_i915_gem_exec_object2 obj = {};
+	struct drm_i915_gem_execbuffer2 eb =
+		{ .buffer_count = 1, .buffers_ptr = (uintptr_t)&obj};
+	long size, last_size;
+	struct timespec t_0, t_end;
+
+	clock_gettime(CLOCK_MONOTONIC, &t_0);
+
+	size = 256 * 1024;
+	do {
+		struct timespec t_start;
+
+		obj.handle = gem_create(fd, size);
+		gem_write(fd, obj.handle, size - sizeof(bbe), &bbe,
+			  sizeof(bbe));
+		gem_execbuf(fd, &eb);
+		gem_sync(fd, obj.handle);
+
+		clock_gettime(CLOCK_MONOTONIC, &t_start);
+		for (int loop = 0; loop < loops; loop++)
+			gem_execbuf(fd, &eb);
+		gem_sync(fd, obj.handle);
+		clock_gettime(CLOCK_MONOTONIC, &t_end);
+
+		gem_close(fd, obj.handle);
+
+		last_size = size;
+		size = loops * size / elapsed(&t_start, &t_end) / 1e6 * usecs;
+		size = ALIGN(size, sizeof(uint32_t));
+	} while (elapsed(&t_0, &t_end) < 5 ||
+		 abs(size - last_size) > (size * tolerance_pct / 100));
+
+	return size / sizeof(uint32_t);
+}
+
+static void print_help(void)
+{
+	puts(
+"Usage: gem_wsim [OPTIONS]\n"
+"\n"
+"Runs a simulated workload on the GPU.\n"
+"When ran without arguments performs a GPU calibration result of which needs\n"
+"to be provided when running the simulation in subsequent invocations.\n"
+"\n"
+"Options:\n"
+"	-h		This text.\n"
+"	-q		Be quiet - do not output anything to stdout.\n"
+"	-n <n>		Nop calibration value.\n"
+"	-t <n>		Nop calibration tolerance percentage.\n"
+"			Use when there is a difficuly obtaining calibration\n"
+"			with the default settings.\n"
+"	-w <desc|path>	Filename or a workload descriptor.\n"
+"	-r <n>		How many times to emit the workload.\n"
+"	-c <n>		Fork n clients emitting the workload simultaneously.\n"
+"	-x		Swap VCS1 and VCS2 engines in every other client.\n"
+"\n"
+"Workload descriptor format:\n"
+"\n"
+"	ctx.engine.duration_us.dependency.wait,...\n"
+"	<uint>.<str>.<uint>.<int <= 0>.<0|1>,...\n"
+"\n"
+"	Engine ids: RCS, BCS, balance_VCS, VCS, VCS1, VCS2, VECS\n"
+"\n"
+"Example:\n"
+"	1.VCS1.3000.0.1\n"
+"	1.RCS.1000.-1.0\n"
+"	1.RCS.3700.0.0\n"
+"	1.RCS.1000.-2.0\n"
+"	1.VCS2.2300.-2.0\n"
+"	1.RCS.4700.-1.0\n"
+"	1.VCS2.600.-1.1\n"
+"\n"
+"The above workload described in human language works like this:\n"
+"A batch is sent to the VCS1 engine which will be executing for 3ms on the\n"
+"GPU and userspace will wait until it is finished before proceeding.\n"
+"Now three batches are sent to RCS with durations of 1ms, 3.7ms and 1ms\n"
+"respectively. The first batch has a data dependency on the preceding VCS1\n"
+"batch, and the last of the group depends on the first from the group.\n"
+"Now a 2.3ms batch is sent to VCS2, with a data dependency on the 3.7ms RCS\n"
+"batch, followed by a 4.7ms RCS batch with a data dependency on the 2.3ms\n"
+"VCS2 batch, and finally a 0.6ms VCS2 batch depending on the previous RCS one.\n"
+"The tool is then told to wait for the last one to complete before optionally\n"
+"starting the next iteration (-r).\n"
+"\n"
+"When workload descriptors are provided on the command line, commas must be\n"
+"used instead of newlines.\n"
+	);
+}
+
+static char *load_workload_descriptor(char *filename)
+{
+	struct stat sbuf;
+	char *buf;
+	int infd, ret, i;
+	ssize_t len;
+
+	ret = stat(filename, &sbuf);
+	if (ret || !S_ISREG(sbuf.st_mode))
+		return filename;
+
+	igt_assert(sbuf.st_size < 1024 * 1024); /* Just so. */
+	buf = malloc(sbuf.st_size);
+	igt_assert(buf);
+
+	infd = open(filename, O_RDONLY);
+	igt_assert(infd >= 0);
+	len = read(infd, buf, sbuf.st_size);
+	igt_assert(len == sbuf.st_size);
+	close(infd);
+
+	for (i = 0; i < len; i++) {
+		if (buf[i] == '\n')
+			buf[i] = ',';
+	}
+
+	return buf;
+}
+
+int main(int argc, char **argv)
+{
+	unsigned int repeat = 1;
+	unsigned int clients = 1;
+	bool swap_vcs = false;
+	struct timespec t_start, t_end;
+	struct workload **w, *wrk;
+	char *w_str = NULL;
+	unsigned int tolerance_pct = 1;
+	double t;
+	int i, c;
+
+	fd = drm_open_driver(DRIVER_INTEL);
+
+	while ((c = getopt(argc, argv, "c:n:r:qxw:t:h")) != -1) {
+		switch (c) {
+		case 'w':
+			w_str = optarg;
+			break;
+		case 'c':
+			clients = strtol(optarg, NULL, 0);
+			break;
+		case 't':
+			tolerance_pct = strtol(optarg, NULL, 0);
+			break;
+		case 'n':
+			nop_calibration = strtol(optarg, NULL, 0);
+			break;
+		case 'r':
+			repeat = strtol(optarg, NULL, 0);
+			break;
+		case 'q':
+			quiet = true;
+			break;
+		case 'x':
+			swap_vcs = true;
+			break;
+		case 'h':
+			print_help();
+			return 0;
+		default:
+			return 1;
+		}
+	}
+
+	if (!nop_calibration) {
+		if (!quiet)
+			printf("Calibrating nop delay with %u%% tolerance...\n",
+				tolerance_pct);
+		nop_calibration = calibrate_nop(tolerance_pct);
+		if (!quiet)
+			printf("Nop calibration for %uus delay is %lu.\n",
+			       nop_calibration_us, nop_calibration);
+
+		return 0;
+	} else {
+		if (!w_str) {
+			if (!quiet)
+				fprintf(stderr,
+					"Workload descriptor missing!\n");
+			return 1;
+		}
+
+		w_str = load_workload_descriptor(w_str);
+		if (!w_str) {
+			if (!quiet)
+				fprintf(stderr,
+					"Failed to load workload descriptor!\n");
+			return 1;
+		}
+
+		wrk = parse_workload(w_str);
+		if (!wrk) {
+			if (!quiet)
+				fprintf(stderr, "Failed to parse workload!\n");
+			return 1;
+		}
+	}
+
+	if (!quiet) {
+		printf("Using %lu nop calibration for %uus delay.\n",
+		       nop_calibration, nop_calibration_us);
+		printf("%u client%s.\n", clients, clients > 1 ? "s" : "");
+		if (swap_vcs)
+			printf("Swapping VCS rings between clients.\n");
+	}
+
+	w = malloc(sizeof(struct workload *) * clients);
+	igt_assert(w);
+
+	for (i = 0; i < clients; i++) {
+		w[i] = clone_workload(wrk);
+		prepare_workload(w[i], swap_vcs && (i & 1));
+	}
+
+	clock_gettime(CLOCK_MONOTONIC, &t_start);
+
+	igt_fork(child, clients)
+		run_workload(child, w[child], repeat);
+
+	igt_waitchildren();
+
+	clock_gettime(CLOCK_MONOTONIC, &t_end);
+
+	t = elapsed(&t_start, &t_end);
+	if (!quiet)
+		printf("%fs elapsed (%f workloads/s)\n",
+		       t, clients * repeat / t);
+
+	for (i = 0; i < clients; i++)
+		fini_workload(w[i]);
+
+	free(w);
+	fini_workload(wrk);
+
+	return 0;
+}
diff --git a/benchmarks/wsim/workload1 b/benchmarks/wsim/workload1
new file mode 100644
index 000000000000..5f533d8e168b
--- /dev/null
+++ b/benchmarks/wsim/workload1
@@ -0,0 +1,7 @@
+1.VCS1.3000.0.1
+1.RCS.1000.-1.0
+1.RCS.3700.0.0
+1.RCS.1000.-2.0
+1.VCS2.2300.-2.0
+1.RCS.4700.-1.0
+1.VCS2.600.-1.1
-- 
2.9.3