[Intel-gfx] [RFC i-g-t] benchmarks/gem_wsim: Command submission workload simulator

Chris Wilson chris at chris-wilson.co.uk
Thu Mar 30 14:04:13 UTC 2017


On Thu, Mar 30, 2017 at 02:45:53PM +0100, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> 
> Tool which emits batch buffers to engines with configurable
> sequences, durations, contexts, dependencies and userspace waits.
> 
> Unfinished but shows promise so sending out for early feedback.
> 
> First run the tool with no arguments to get the calibration number
> which needs to be passed to subsequent invocations using the -n
> command line argument.
> 
> Example invocation:
> 
>   gem_wsim -w "1.VCS1.3000.0.1,1.RCS.1000.-1.0,1.RCS.3700.0.0,1.RCS.1000.-2.0,1.VCS2.2300.-2.0,1.RCS.4700.-1.0,1.VCS2.600.-1.1" \
> 	   -n <local_calibration> -r 600

Also add benchmarks/ezbench.d/gem_wsim.test (a simple shell script) in
which you can define some interesting test loads. That just makes it more
likely that we are looking at the same loads when talking about issues.

> 
> This will send the workload as described 600 times to the GPU and
> output the elapsed time.
> 
> Other interesting options.
> 
>   -c n	Spawn n clients
>   -x    For workloads which reference VCS1 and VCS2, swap them
>         around for every other client (in combination with -c).
> 
> Workload descriptor format:
> 
>  * ctx.engine.duration.dependency.wait,...
>  * <uint>.<str>.<uint>.<int <= 0>.<0|1>,...
>  *
>  * Engine ids: RCS, BCS, VCS, VCS1, VCS2, VECS
> 
> Dependency is a back reference to a previous batch the current
> will depend on.
> 
> TODO list:
> 
>  * Reading workloads from files (for more readable workload
>    descriptor format).

Seconded ;)

>  * Better error handling.
>  * Multi-context support for individual clients.
>  * Random/variable batch length.
>  * Load balancing plug-in.
>  * Help text.
>  * ... ?
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Cc: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin at intel.com>
> ---
>  benchmarks/Makefile.sources |   1 +
>  benchmarks/gem_wsim.c       | 489 ++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 490 insertions(+)
>  create mode 100644 benchmarks/gem_wsim.c
> 
> diff --git a/benchmarks/Makefile.sources b/benchmarks/Makefile.sources
> index 3af54ebe36f2..3a941150abb3 100644
> --- a/benchmarks/Makefile.sources
> +++ b/benchmarks/Makefile.sources
> @@ -14,6 +14,7 @@ benchmarks_prog_list =			\
>  	gem_prw				\
>  	gem_set_domain			\
>  	gem_syslatency			\
> +	gem_wsim			\
>  	kms_vblank			\
>  	prime_lookup			\
>  	vgem_mmap			\
> diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c
> new file mode 100644
> index 000000000000..d1f1f40c492f
> --- /dev/null
> +++ b/benchmarks/gem_wsim.c
> @@ -0,0 +1,489 @@
> +/*
> + * Copyright © 2017 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + *
> + */
> +
> +#include <unistd.h>
> +#include <stdlib.h>
> +#include <stdint.h>
> +#include <stdio.h>
> +#include <string.h>
> +#include <fcntl.h>
> +#include <inttypes.h>
> +#include <errno.h>
> +#include <sys/stat.h>
> +#include <sys/ioctl.h>
> +#include <sys/time.h>
> +#include <time.h>
> +#include <assert.h>
> +
> +#include "drm.h"
> +#include "ioctl_wrappers.h"
> +#include "drmtest.h"
> +#include "intel_io.h"
> +
> +struct w_step
> +{
> +	/* Workload step metadata */
> +	unsigned int context;
> +	unsigned int engine;
> +	unsigned int duration;
> +	int dependency;
> +	int wait;
> +
> +	/* Implementation details */
> +	struct drm_i915_gem_execbuffer2 eb;
> +	struct drm_i915_gem_exec_object2 obj[3];
> +};
> +
> +struct workload
> +{
> +	unsigned int nr_steps;
> +	struct w_step *steps;
> +
> +	uint32_t ctx_id;
> +};
> +
> +enum intel_engine_id {
> +	RCS,
> +	BCS,
> +	VCS,
> +	VCS1,
> +	VCS2,
> +	VECS,
> +	NUM_ENGINES
> +};
> +
> +static const unsigned int eb_engine_map[NUM_ENGINES] = {
> +	[RCS] = I915_EXEC_RENDER,
> +	[BCS] = I915_EXEC_BLT,
> +	[VCS] = I915_EXEC_BSD,
> +	[VCS1] = I915_EXEC_BSD | I915_EXEC_BSD_RING1,
> +	[VCS2] = I915_EXEC_BSD | I915_EXEC_BSD_RING2,
> +	[VECS] = I915_EXEC_VEBOX };
> +
> +static const uint32_t bbe = 0xa << 23;
> +static const unsigned int nop_calibration_us = 1000;
> +static unsigned long nop_calibration;
> +
> +static bool quiet;
> +static int fd;
> +
> +/*
> + * Workload descriptor:
> + *
> + * ctx.engine.duration.dependency.wait,...
> + * <uint>.<str>.<uint>.<int <= 0>.<0|1>,...
> + *
> + * Engine ids: RCS, BCS, VCS, VCS1, VCS2, VECS
> + *
> + * "1.VCS1.3000.0.1,1.RCS.1000.-1.0,1.RCS.3700.0.0,1.RCS.1000.-2.0,1.VCS2.2300.-2.0,1.RCS.4700.-1.0,1.VCS2.600.-1.1"
> + */
> +
> +static struct workload *parse_workload(char *desc)
> +{
> +	struct workload *wrk;
> +	unsigned int nr_steps = 0;
> +	char *token, *tctx, *tstart = desc;
> +	char *field, *fctx, *fstart;
> +	struct w_step step, *steps = NULL;
> +	unsigned int valid;
> +	int tmp;
> +
> +	while ((token = strtok_r(tstart, ",", &tctx)) != NULL) {
> +		tstart = NULL;
> +		fstart = token;
> +		valid = 0;
> +
> +		if ((field = strtok_r(fstart, ".", &fctx)) != NULL) {
> +			fstart = NULL;
> +
> +			tmp = atoi(field);
> +			if (tmp != 1) {
> +				if (!quiet)
> +					fprintf(stderr,
> +						"Invalid ctx id at step %u!\n",
> +						nr_steps);
> +				return NULL;
> +			}
> +			step.context = tmp;
> +
> +			valid++;
> +		}
> +
> +		if ((field = strtok_r(fstart, ".", &fctx)) != NULL) {
> +			fstart = NULL;
> +
> +			if (!strcasecmp(field, "RCS")) {
> +				step.engine = RCS;
> +				valid++;
> +			} else if (!strcasecmp(field, "BCS")) {
> +				step.engine = BCS;
> +				valid++;
> +			} else if (!strcasecmp(field, "VCS")) {
> +				step.engine = VCS;
> +				valid++;
> +			} else if (!strcasecmp(field, "VCS1")) {
> +				step.engine = VCS1;
> +				valid++;
> +			} else if (!strcasecmp(field, "VCS2")) {
> +				step.engine = VCS2;
> +				valid++;
> +			} else if (!strcasecmp(field, "VECS")) {
> +				step.engine = VECS;
> +				valid++;
> +			} else {
> +				if (!quiet)
> +					fprintf(stderr,
> +						"Invalid engine id at step %u!\n",
> +						nr_steps);
> +				return NULL;
> +			}
> +		}
> +
> +		if ((field = strtok_r(fstart, ".", &fctx)) != NULL) {
> +			fstart = NULL;
> +
> +			tmp = atoi(field);
> +			if (tmp <= 0) {
> +				if (!quiet)
> +					fprintf(stderr,
> +						"Invalid duration at step %u!\n",
> +						nr_steps);
> +				return NULL;
> +			}
> +			step.duration = tmp;
> +
> +			valid++;
> +		}
> +
> +		if ((field = strtok_r(fstart, ".", &fctx)) != NULL) {
> +			fstart = NULL;
> +
> +			tmp = atoi(field);
> +			if (tmp > 0) {
> +				if (!quiet)
> +					fprintf(stderr,
> +						"Invalid forward dependency at step %u!\n",
> +						nr_steps);
> +				return NULL;
> +			}
> +			step.dependency = tmp;
> +
> +			valid++;
> +		}
> +
> +		if ((field = strtok_r(fstart, ".", &fctx)) != NULL) {
> +			fstart = NULL;
> +
> +			tmp = atoi(field);
> +			if (tmp != 0 && tmp != 1) {
> +				if (!quiet)
> +					fprintf(stderr,
> +						"Invalid wait boolean at step %u!\n",
> +						nr_steps);
> +				return NULL;
> +			}
> +			step.wait = tmp;
> +
> +			valid++;
> +		}
> +
> +		if (valid != 5) {
> +			if (!quiet)
> +				fprintf(stderr, "Invalid record at step %u!\n",
> +					nr_steps);
> +			return NULL;
> +		}
> +
> +		nr_steps++;
> +		steps = realloc(steps, sizeof(step) * nr_steps);
> +		igt_assert(steps);
> +
> +		memcpy(&steps[nr_steps - 1], &step, sizeof(step));
> +	}
> +
> +	wrk = malloc(sizeof(*wrk));
> +	igt_assert(wrk);
> +
> +	wrk->nr_steps = nr_steps;
> +	wrk->steps = steps;
> +
> +	return wrk;
> +}
> +
> +static struct workload *
> +clone_workload(struct workload *_wrk)
> +{
> +	struct workload *wrk;
> +
> +	wrk = malloc(sizeof(*wrk));
> +	igt_assert(wrk);
> +
> +	wrk->nr_steps = _wrk->nr_steps;
> +	wrk->steps = malloc(sizeof(struct w_step) * wrk->nr_steps);
> +	igt_assert(wrk->steps);
> +
> +	memcpy(wrk->steps, _wrk->steps, sizeof(struct w_step) * wrk->nr_steps);
> +
> +	return wrk;
> +}
> +
> +static void prepare_workload(struct workload *wrk, bool swap_vcs)
> +{
> +	struct drm_i915_gem_context_create arg = {};
> +	struct w_step *w;
> +	int i;
> +
> +	drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &arg);
> +	wrk->ctx_id = arg.ctx_id;
> +
> +	for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
> +		memset(&w->eb, 0, sizeof(w->eb));
> +		memset(&w->obj, 0, sizeof(w->obj));
> +	}
> +
> +	for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
> +		unsigned long sz;
> +		enum intel_engine_id engine = w->engine;
> +
> +		sz = ALIGN(w->duration * nop_calibration * sizeof(uint32_t) /
> +			   nop_calibration_us, sizeof(uint32_t));
> +
> +		igt_assert(w->context == 1); /* TODO */
> +
> +		w->obj[0].handle = gem_create(fd, 4096);
> +		w->obj[0].flags = EXEC_OBJECT_WRITE;
> +
> +		w->obj[1].handle = gem_create(fd, sz);
> +		gem_write(fd, w->obj[1].handle, sz - sizeof(bbe), &bbe,
> +			  sizeof(bbe));
> +
> +		w->eb.buffer_count = 2;
> +		w->eb.buffers_ptr = to_user_pointer(w->obj);
> +		if (swap_vcs && engine == VCS1)
> +			engine = VCS2;
> +		else if (swap_vcs && engine == VCS2)
> +			engine = VCS1;
> +		w->eb.flags = eb_engine_map[engine];

w->eb.flags |= I915_EXEC_NO_RELOC;
That'll skip the step of determining that there weren't any relocations.

w->eb.flags |= I915_EXEC_HANDLE_LUT;
That'll skip building the handle hash table. Trivial, but still an extra
kmalloc!

> +		w->eb.rsvd1 = wrk->ctx_id;
> +
> +		igt_assert(w->dependency <= 0);
> +		if (w->dependency) {
> +			int dep_idx = i + w->dependency;
> +
> +			igt_assert(dep_idx >= 0 && dep_idx < wrk->nr_steps);
> +
> +			w->obj[2].handle = w->obj[1].handle;
> +			w->obj[1].handle = wrk->steps[dep_idx].obj[0].handle;
> +			w->eb.buffer_count = 3;
> +		}
> +
> +#ifdef DEBUG
> +		printf("%u: %u:%x|%x|%x %10lu flags=%llx\n",
> +		       i, w->eb.buffer_count,
> +		       w->obj[0].handle, w->obj[1].handle, w->obj[2].handle,
> +		       sz, w->eb.flags);
> +#endif
> +	}
> +}
> +
> +static double elapsed(const struct timespec *start, const struct timespec *end)
> +{
> +	return (end->tv_sec - start->tv_sec) +
> +	       (end->tv_nsec - start->tv_nsec) / 1e9;
> +}
> +
> +static void
> +run_workload(unsigned int id, struct workload *wrk, unsigned int repeat)
> +{
> +	struct timespec t_start, t_end;
> +	struct w_step *w;
> +	double t;
> +	int i, j;
> +
> +	clock_gettime(CLOCK_MONOTONIC, &t_start);
> +
> +	for (j = 0; j < repeat; j++) {
> +		for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
> +			gem_execbuf(fd, &w->eb);
> +			if (w->wait)
> +				gem_sync(fd, w->obj[0].handle);
> +		}
> +	}
> +
> +	clock_gettime(CLOCK_MONOTONIC, &t_end);
> +
> +	t = elapsed(&t_start, &t_end);
> +	if (!quiet)
> +		printf("%u: %fs elapsed (%f workloads/s)\n", id, t, repeat / t);
> +}
> +
> +static void fini_workload(struct workload *wrk)
> +{
> +	free(wrk->steps);
> +	free(wrk);
> +}
> +
> +static unsigned long calibrate_nop(void)
> +{
> +	unsigned int loops = 17;
> +	int warmup = 17;
> +	unsigned int usecs = nop_calibration_us;
> +	struct drm_i915_gem_exec_object2 obj = {};
> +	struct drm_i915_gem_execbuffer2 eb =
> +		{ .buffer_count = 1, .buffers_ptr = (uintptr_t)&obj};
> +	long size, last_size;
> +	struct timespec t_0, t_end;
> +
> +	clock_gettime(CLOCK_MONOTONIC, &t_0);
> +
> +	size = 256 * 1024;
> +	do {
> +		struct timespec t_start;
> +
> +		obj.handle = gem_create(fd, size);
> +		gem_write(fd, obj.handle, size - sizeof(bbe), &bbe,
> +			  sizeof(bbe));
> +		gem_execbuf(fd, &eb);
> +		gem_sync(fd, obj.handle);
> +
> +		clock_gettime(CLOCK_MONOTONIC, &t_start);
> +		for (int loop = 0; loop < loops; loop++)
> +			gem_execbuf(fd, &eb);
> +		gem_sync(fd, obj.handle);
> +		clock_gettime(CLOCK_MONOTONIC, &t_end);
> +
> +		gem_close(fd, obj.handle);
> +
> +		last_size = size;
> +		size = loops * size /
> +		       elapsed(&t_start, &t_end) / 1e6 * usecs;
> +		size = ALIGN(size, sizeof(uint32_t));
> +	} while (warmup-- > 0 ||

Ok, 17 passes as a warmup! That's scary. Is the danger from unpinned
CPU / GPU frequencies? And you fear non-convergence -- oh, you are
rounding to uint32_t, not to a whole page.

Reasonable idea, though I can't help thinking that a script that
compiles to bytecode would be easier to write tests in.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre


More information about the Intel-gfx mailing list