[igt-dev] [PATCH i-g-t] gem_wsim: Distinguish particular engines during calculating nop calibration.
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Fri Jan 24 11:35:35 UTC 2020
On 24/01/2020 11:18, Anna Karas wrote:
> Extend handling -n parameter by accepting multiple values of per
> engine nop calibration. Add raw numbers handling to set default
> calibration values. Print copyable and pastable string with
> calibrations. Allow to switch between calculating in parallel
> or doing it sequentially.
>
> Accepted input values:
> -n 123456
> All calibrations will be set to 123456.
>
> -n ENG=value,ENG2=value2,value3
> i.e.
> -n RCS=123456,BCS=345678,999999
> RCS engine's value is set to 123456, BCS engine's value is set to
> 345678, 999999 is copied to rest of engines. All engines must be set;
> you can either provide values for each of the engines, or you can set
> specific values and provide a default value for the others.
>
> -n value,ENG1=value1,ENG2=value2
> First, value is copied to all engines, then value1 overrides ENG1, and
> finally value2 overrides ENG2.
>
> New output follows the pattern:
> Nop calibrations for 1000us delay is: <eng1>=<v1>,<eng2>=<v2>,...
> So you can easily copy-paste it to the next invocation.
>
> Switching between calculation modes:
> Run program with -T parameter to calculate calibrations in parallel.
> The calculations are performed sequentially by default.
>
> v2: Get rid of trailing whitespaces. Skip DEFAULT and VCS engines
> when printing out calibrations. Reject them in the string passed
> to -n. Re-align rest of help text. Fix accepting unknown engines.
>
> v3: Consider all cases of arguments
> for -n (Tvrtko).
>
> -n 10 (raw number)
> -n RCS (engine without calib)
> -n AA (neither the engine nor the number)
> -n RCS=500 (valid eng=val pair)
> -n RCS=AA (calib is not a number)
> -n XYZ=10 (engine is not an engine)
> -n XYZ=AA (combo)
>
> v4: Print calculated values (Chris). Do not make any assumptions
> about the order of the engines (Tvrtko).
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Interesting, or maybe entirely expected, observation is that parallel
calibration gives different values. I think more engines slower they get
in parallel because of memory bandwith? Probably. That's why suggested
default is sequential. Although in practice is depends a lot on the
workload to be executed what matches the reality better. Calibrating for
the workload sounded a step too far. In theory could be doable using
Chris' wsim to theoretical throughput parser. Along the lines probably.
But definitely to complicated for now.
Regards,
Tvrtko
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Cc: Chris Wilson <chris at chris-wilson.co.uk>
> Signed-off-by: Anna Karas <anna.karas at intel.com>
> ---
> benchmarks/gem_wsim.c | 317 ++++++++++++++++++++++++++++++++++--------
> 1 file changed, 259 insertions(+), 58 deletions(-)
>
> diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c
> index 9156fdc9..d3451f63 100644
> --- a/benchmarks/gem_wsim.c
> +++ b/benchmarks/gem_wsim.c
> @@ -23,6 +23,7 @@
> */
>
> #include <unistd.h>
> +#include <stdbool.h>
> #include <stdlib.h>
> #include <stdint.h>
> #include <stdio.h>
> @@ -55,6 +56,7 @@
> #include "i915/gem_mman.h"
>
> #include "ewma.h"
> +#include "i915/gem_engine_topology.h"
>
> enum intel_engine_id {
> DEFAULT,
> @@ -240,7 +242,8 @@ struct workload
>
> struct intel_mmio_data mmio_data;
> static const unsigned int nop_calibration_us = 1000;
> -static unsigned long nop_calibration;
> +static bool has_nop_calibration = false;
> +static bool sequential = true;
>
> static unsigned int master_prng;
>
> @@ -281,6 +284,61 @@ static const char *ring_str_map[NUM_ENGINES] = {
> [VECS] = "VECS",
> };
>
> +/* stores calibrations for particular engines */
> +static unsigned long engine_calib_map[NUM_ENGINES];
> +
> +static enum intel_engine_id
> +ci_to_engine_id(int class, int instance)
> +{
> + static const struct {
> + int class;
> + int instance;
> + unsigned int id;
> + } map[] = {
> + { I915_ENGINE_CLASS_RENDER, 0, RCS },
> + { I915_ENGINE_CLASS_COPY, 0, BCS },
> + { I915_ENGINE_CLASS_VIDEO, 0, VCS1 },
> + { I915_ENGINE_CLASS_VIDEO, 1, VCS2 },
> + { I915_ENGINE_CLASS_VIDEO, 2, VCS2 }, /* FIXME/ICL */
> + { I915_ENGINE_CLASS_VIDEO_ENHANCE, 0, VECS },
> + };
> +
> + unsigned int i;
> +
> + for (i = 0; i < ARRAY_SIZE(map); i++) {
> + if (class == map[i].class && instance == map[i].instance)
> + return map[i].id;
> + }
> + return -1;
> +}
> +
> +static void
> +apply_unset_calibrations(unsigned long raw_number)
> +{
> + for (int i = 0; i < NUM_ENGINES; i++)
> + engine_calib_map[i] += engine_calib_map[i] ? 0 : raw_number;
> +}
> +
> +static void
> +print_engine_calibrations(void)
> +{
> + bool first_entry = true;
> +
> + printf("Nop calibrations for %uus delay is: ", nop_calibration_us);
> + for (int i = 0; i < NUM_ENGINES; i++) {
> + /* skip DEFAULT and VCS engines */
> + if (i != DEFAULT && i != VCS) {
> + if (first_entry) {
> + printf("%s=%lu", ring_str_map[i], engine_calib_map[i]);
> + first_entry = false;
> + } else {
> + printf(",%s=%lu", ring_str_map[i], engine_calib_map[i]);
> + }
> + }
> + }
> + printf("\n");
> +}
> +
> static int
> parse_dependencies(unsigned int nr_steps, struct w_step *w, char *_desc)
> {
> @@ -1082,17 +1140,17 @@ static unsigned int get_duration(struct workload *wrk, struct w_step *w)
> (dur->max + 1 - dur->min);
> }
>
> -static unsigned long get_bb_sz(unsigned int duration)
> +static unsigned long get_bb_sz(unsigned int duration, enum intel_engine_id engine)
> {
> - return ALIGN(duration * nop_calibration * sizeof(uint32_t) /
> - nop_calibration_us, sizeof(uint32_t));
> + return ALIGN(duration * engine_calib_map[engine] * sizeof(uint32_t) /
> + nop_calibration_us, sizeof(uint32_t));
> }
>
> static void
> init_bb(struct w_step *w, unsigned int flags)
> {
> const unsigned int arb_period =
> - get_bb_sz(w->preempt_us) / sizeof(uint32_t);
> + get_bb_sz(w->preempt_us, w->engine) / sizeof(uint32_t);
> const unsigned int mmap_len = ALIGN(w->bb_sz, 4096);
> unsigned int i;
> uint32_t *ptr;
> @@ -1319,10 +1377,11 @@ alloc_step_batch(struct workload *wrk, struct w_step *w, unsigned int flags)
>
> if (w->unbound_duration)
> /* nops + MI_ARB_CHK + MI_BATCH_BUFFER_START */
> - w->bb_sz = max(PAGE_SIZE, get_bb_sz(w->preempt_us)) +
> + w->bb_sz = max(PAGE_SIZE, get_bb_sz(w->preempt_us, w->engine)) +
> (1 + 3) * sizeof(uint32_t);
> else
> - w->bb_sz = get_bb_sz(w->duration.max);
> + w->bb_sz = get_bb_sz(w->duration.max, w->engine);
> +
> w->bb_handle = w->obj[j].handle = gem_create(fd, w->bb_sz + (w->unbound_duration ? 4096 : 0));
> init_bb(w, flags);
> w->obj[j].relocation_count = terminate_bb(w, flags);
> @@ -2622,7 +2681,7 @@ do_eb(struct workload *wrk, struct w_step *w, enum intel_engine_id engine,
> w->eb.batch_start_offset =
> w->unbound_duration ?
> 0 :
> - ALIGN(w->bb_sz - get_bb_sz(get_duration(wrk, w)),
> + ALIGN(w->bb_sz - get_bb_sz(get_duration(wrk, w), w->engine),
> 2 * sizeof(uint32_t));
>
> for (i = 0; i < w->fence_deps.nr; i++) {
> @@ -2899,15 +2958,18 @@ static void fini_workload(struct workload *wrk)
> free(wrk);
> }
>
> -static unsigned long calibrate_nop(unsigned int tolerance_pct)
> +static unsigned long calibrate_nop(unsigned int tolerance_pct, struct intel_execution_engine2 *engine)
> {
> const uint32_t bbe = 0xa << 23;
> unsigned int loops = 17;
> unsigned int usecs = nop_calibration_us;
> struct drm_i915_gem_exec_object2 obj = {};
> - struct drm_i915_gem_execbuffer2 eb =
> - { .buffer_count = 1, .buffers_ptr = (uintptr_t)&obj};
> - long size, last_size;
> + struct drm_i915_gem_execbuffer2 eb = {
> + .buffer_count = 1,
> + .buffers_ptr = (uintptr_t)&obj,
> + .flags = engine->flags
> + };
> + unsigned long size, last_size;
> struct timespec t_0, t_end;
>
> clock_gettime(CLOCK_MONOTONIC, &t_0);
> @@ -2939,6 +3001,70 @@ static unsigned long calibrate_nop(unsigned int tolerance_pct)
> return size / sizeof(uint32_t);
> }
>
> +static void
> +calibrate_sequentially(void)
> +{
> + struct intel_execution_engine2 *engine;
> + enum intel_engine_id eng_id;
> +
> + __for_each_physical_engine(fd, engine) {
> + eng_id = ci_to_engine_id(engine->class, engine->instance);
> + igt_assert(eng_id >= 0);
> + engine_calib_map[eng_id] = calibrate_nop(fd, engine);
> + }
> +}
> +
> +struct thread_data {
> + struct intel_execution_engine2 *eng;
> + pthread_t thr;
> + unsigned long calib;
> +};
> +
> +static void *
> +engine_calibration_thread(void *data)
> +{
> + struct thread_data *thr_d = (struct thread_data *) data;
> +
> + thr_d->calib = calibrate_nop(fd, thr_d->eng);
> + return NULL;
> +}
> +
> +static void
> +calibrate_in_parallel(void)
> +{
> + struct thread_data *thr_d = malloc(NUM_ENGINES * sizeof(*thr_d));
> + struct intel_execution_engine2 *engine;
> + enum intel_engine_id id;
> + int ret;
> +
> + __for_each_physical_engine(fd, engine) {
> + id = ci_to_engine_id(engine->class, engine->instance);
> + thr_d[id].eng = engine;
> + ret = pthread_create(&thr_d[id].thr, NULL, engine_calibration_thread, &thr_d[id]);
> + igt_assert_eq(ret, 0);
> + }
> +
> + __for_each_physical_engine(fd, engine) {
> + id = ci_to_engine_id(engine->class, engine->instance);
> + igt_assert(id >= 0);
> +
> + ret = pthread_join(thr_d[id].thr, NULL);
> + igt_assert_eq(ret, 0);
> + engine_calib_map[id] = thr_d[id].calib;
> + }
> +
> + free(thr_d);
> +}
> +
> +static void
> +calibrate_engines(void)
> +{
> + if (sequential)
> + calibrate_sequentially();
> + else
> + calibrate_in_parallel();
> +}
> +
> static void print_help(void)
> {
> unsigned int i;
> @@ -2951,50 +3077,53 @@ static void print_help(void)
> "be provided when running the simulation in subsequent invocations.\n"
> "\n"
> "Options:\n"
> -" -h This text.\n"
> -" -q Be quiet - do not output anything to stdout.\n"
> -" -n <n> Nop calibration value.\n"
> -" -t <n> Nop calibration tolerance percentage.\n"
> -" Use when there is a difficulty obtaining calibration with the\n"
> -" default settings.\n"
> -" -I <n> Initial randomness seed.\n"
> -" -p <n> Context priority to use for the following workload on the\n"
> -" command line.\n"
> -" -w <desc|path> Filename or a workload descriptor.\n"
> -" Can be given multiple times.\n"
> -" -W <desc|path> Filename or a master workload descriptor.\n"
> -" Only one master workload can be optinally specified in which\n"
> -" case all other workloads become background ones and run as\n"
> -" long as the master.\n"
> -" -a <desc|path> Append a workload to all other workloads.\n"
> -" -r <n> How many times to emit the workload.\n"
> -" -c <n> Fork N clients emitting the workload simultaneously.\n"
> -" -x Swap VCS1 and VCS2 engines in every other client.\n"
> -" -b <n> Load balancing to use.\n"
> -" Available load balancers are:"
> +" -h This text.\n"
> +" -q Be quiet - do not output anything to stdout.\n"
> +" -n <n | Nop calibration value - single value is set to all engines\n"
> +" e1=v1,e2=v2,n...> without specified value; you can also specify calibrations for\n"
> +" particular engines.\n"
> +" -t <n> Nop calibration tolerance percentage.\n"
> +" -T Disable sequential calibration and perform calibration in parallel.\n"
> +" Use when there is a difficulty obtaining calibration with the\n"
> +" default settings.\n"
> +" -I <n> Initial randomness seed.\n"
> +" -p <n> Context priority to use for the following workload on the\n"
> +" command line.\n"
> +" -w <desc|path> Filename or a workload descriptor.\n"
> +" Can be given multiple times.\n"
> +" -W <desc|path> Filename or a master workload descriptor.\n"
> +" Only one master workload can be optinally specified in which\n"
> +" case all other workloads become background ones and run as\n"
> +" long as the master.\n"
> +" -a <desc|path> Append a workload to all other workloads.\n"
> +" -r <n> How many times to emit the workload.\n"
> +" -c <n> Fork N clients emitting the workload simultaneously.\n"
> +" -x Swap VCS1 and VCS2 engines in every other client.\n"
> +" -b <n> Load balancing to use.\n"
> +" Available load balancers are:"
> );
>
> for (i = 0; i < ARRAY_SIZE(all_balancers); i++) {
> igt_assert(all_balancers[i].desc);
> printf(
> -" %s (%u): %s\n",
> +" %s (%u): %s\n",
> all_balancers[i].name, all_balancers[i].id,
> all_balancers[i].desc);
> }
> puts(
> -" Balancers can be specified either as names or as their id\n"
> -" number as listed above.\n"
> -" -2 Remap VCS2 to BCS.\n"
> -" -R Round-robin initial VCS assignment per client.\n"
> -" -H Send heartbeat on synchronisation points with seqno based\n"
> -" balancers. Gives better engine busyness view in some cases.\n"
> -" -s Turn on small SSEU config for the next workload on the\n"
> -" command line. Subsequent -s switches it off.\n"
> -" -S Synchronize the sequence of random batch durations between\n"
> -" clients.\n"
> -" -G Global load balancing - a single load balancer will be shared\n"
> -" between all clients and there will be a single seqno domain.\n"
> -" -d Sync between data dependencies in userspace."
> +" Balancers can be specified either as names or as their id\n"
> +" number as listed above.\n"
> +" -2 Remap VCS2 to BCS.\n"
> +" -R Round-robin initial VCS assignment per client.\n"
> +" -H Send heartbeat on synchronisation points with seqno based\n"
> +" balancers. Gives better engine busyness view in some cases.\n"
> +" -s Turn on small SSEU config for the next workload on the\n"
> +" command line. Subsequent -s switches it off.\n"
> +" -S Synchronize the sequence of random batch durations between\n"
> +" clients.\n"
> +" -G Global load balancing - a single load balancer will be shared\n"
> +" between all clients and there will be a single seqno domain.\n"
> +" -d Sync between data dependencies in userspace."
> );
> }
>
> @@ -3117,6 +3246,10 @@ int main(int argc, char **argv)
> int prio = 0;
> double t;
> int i, c;
> + char *subopts, *value;
> + int raw_number = 0;
> + long calib_val;
> + int eng;
>
> /*
> * Open the device via the low-level API so we can do the GPU quiesce
> @@ -3134,7 +3267,7 @@ int main(int argc, char **argv)
> master_prng = time(NULL);
>
> while ((c = getopt(argc, argv,
> - "hqv2RsSHxGdc:n:r:w:W:a:t:b:p:I:")) != -1) {
> + "Thqv2RsSHxGdc:n:r:w:W:a:t:b:p:I:")) != -1) {
> switch (c) {
> case 'W':
> if (master_workload >= 0) {
> @@ -3163,8 +3296,62 @@ int main(int argc, char **argv)
> case 't':
> tolerance_pct = strtol(optarg, NULL, 0);
> break;
> + case 'T':
> + sequential = false;
> + break;
> +
> case 'n':
> - nop_calibration = strtol(optarg, NULL, 0);
> + subopts = optarg;
> + while (*subopts != '\0') {
> + eng = getsubopt(&subopts, (char **)ring_str_map, &value);
> + if (!value) {
> + /* only engine name was given */
> + wsim_err("Missing calibration value for '%s'!\n",
> + ring_str_map[eng]);
> + goto err;
> + }
> +
> + calib_val = atol(value);
> +
> + if (eng >= 0 && eng < NUM_ENGINES) {
> + /* engine name with some value were given */
> +
> + if (eng == DEFAULT || eng == VCS) {
> + wsim_err("'%s' not allowed in engine calibrations!\n",
> + ring_str_map[eng]);
> + goto err;
> + } else if (calib_val <= 0) {
> + wsim_err("Invalid calibration for engine '%s' - value "
> + "is either non-positive or is not a number!\n",
> + ring_str_map[eng]);
> + goto err;
> + } else if (engine_calib_map[eng]) {
> + wsim_err("Invalid repeated calibration of '%s'!\n",
> + ring_str_map[eng]);
> + goto err;
> + } else {
> + engine_calib_map[eng] = calib_val;
> + has_nop_calibration = true;
> + }
> + } else {
> + /* raw number was given */
> +
> + if (!calib_val) {
> + wsim_err("Invalid engine or zero calibration!\n");
> + goto err;
> + } else if (calib_val < 0) {
> + wsim_err("Invalid negative calibration!\n");
> + goto err;
> + } else if (raw_number) {
> + wsim_err("Default engine calibration provided more than once!\n");
> + goto err;
> + } else {
> + raw_number = calib_val;
> + apply_unset_calibrations(raw_number);
> + has_nop_calibration = true;
> + }
> + }
> + }
> break;
> case 'r':
> repeat = strtol(optarg, NULL, 0);
> @@ -3242,16 +3429,31 @@ int main(int argc, char **argv)
> goto err;
> }
>
> - if (!nop_calibration) {
> - if (verbose > 1)
> - printf("Calibrating nop delay with %u%% tolerance...\n",
> + if (!has_nop_calibration) {
> + if (verbose > 1) {
> + printf("Calibrating nop delays with %u%% tolerance...\n",
> tolerance_pct);
> - nop_calibration = calibrate_nop(tolerance_pct);
> - if (verbose)
> - printf("Nop calibration for %uus delay is %lu.\n",
> - nop_calibration_us, nop_calibration);
> + }
>
> + calibrate_engines();
> + print_engine_calibrations();
> goto out;
> + } else {
> + bool missing = false;
> +
> + for (i = 0; i < NUM_ENGINES; i++) {
> + if (i == DEFAULT || i == VCS)
> + continue;
> +
> + if (!engine_calib_map[i]) {
> + wsim_err("Missing calibration for '%s'!\n",
> + ring_str_map[i]);
> + missing = true;
> + }
> + }
> +
> + if (missing)
> + goto err;
> }
>
> if (!nr_w_args) {
> @@ -3309,8 +3511,7 @@ int main(int argc, char **argv)
>
> if (verbose > 1) {
> printf("Random seed is %u.\n", master_prng);
> - printf("Using %lu nop calibration for %uus delay.\n",
> - nop_calibration, nop_calibration_us);
> + print_engine_calibrations();
> printf("%u client%s.\n", clients, clients > 1 ? "s" : "");
> if (flags & SWAPVCS)
> printf("Swapping VCS rings between clients.\n");
>
More information about the igt-dev
mailing list