[Intel-gfx] [PATCH i-g-t] benchmarks/gem_wsim: Measure nop latency on all engines
Chris Wilson
chris at chris-wilson.co.uk
Thu May 16 19:42:44 UTC 2019
Different engines take a different number of cycles to execute MI_NOOP.
As we specify workloads in microseconds (us), we need to take the
per-engine calibration values into account so that the workloads behave
as expected.
Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
---
benchmarks/gem_wsim.c | 72 +++++++++++++++++++++++++++++++------------
1 file changed, 52 insertions(+), 20 deletions(-)
diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c
index 9564dcb70..50a062f0e 100644
--- a/benchmarks/gem_wsim.c
+++ b/benchmarks/gem_wsim.c
@@ -238,7 +238,7 @@ struct workload
};
static const unsigned int nop_calibration_us = 1000;
-static unsigned long nop_calibration;
+static unsigned long nop_calibration[NUM_ENGINES];
static unsigned int context_vcs_rr;
@@ -808,9 +808,9 @@ static unsigned int get_duration(struct w_step *w)
(dur->max + 1 - dur->min);
}
-static unsigned long get_bb_sz(unsigned int duration)
+static unsigned long get_bb_sz(unsigned int engine, unsigned int duration)
{
- return ALIGN(duration * nop_calibration * sizeof(uint32_t) /
+ return ALIGN(duration * nop_calibration[engine] * sizeof(uint32_t) /
nop_calibration_us, sizeof(uint32_t));
}
@@ -818,7 +818,7 @@ static void
init_bb(struct w_step *w, unsigned int flags)
{
const unsigned int arb_period =
- get_bb_sz(w->preempt_us) / sizeof(uint32_t);
+ get_bb_sz(w->engine, w->preempt_us) / sizeof(uint32_t);
const unsigned int mmap_len = ALIGN(w->bb_sz, 4096);
unsigned int i;
uint32_t *ptr;
@@ -1043,10 +1043,10 @@ alloc_step_batch(struct workload *wrk, struct w_step *w, unsigned int flags)
if (w->unbound_duration)
/* nops + MI_ARB_CHK + MI_BATCH_BUFFER_START */
- w->bb_sz = max(64, get_bb_sz(w->preempt_us)) +
+ w->bb_sz = max(64, get_bb_sz(w->engine, w->preempt_us)) +
(1 + 3) * sizeof(uint32_t);
else
- w->bb_sz = get_bb_sz(w->duration.max);
+ w->bb_sz = get_bb_sz(w->engine, w->duration.max);
w->bb_handle = w->obj[j].handle = gem_create(fd, w->bb_sz + (w->unbound_duration ? 4096 : 0));
init_bb(w, flags);
terminate_bb(w, flags);
@@ -2300,7 +2300,7 @@ do_eb(struct workload *wrk, struct w_step *w, enum intel_engine_id engine,
w->eb.batch_start_offset =
w->unbound_duration ?
0 :
- ALIGN(w->bb_sz - get_bb_sz(get_duration(w)),
+ ALIGN(w->bb_sz - get_bb_sz(engine, get_duration(w)),
2 * sizeof(uint32_t));
for (i = 0; i < w->fence_deps.nr; i++) {
@@ -2580,17 +2580,23 @@ static void fini_workload(struct workload *wrk)
free(wrk);
}
-static unsigned long calibrate_nop(unsigned int tolerance_pct)
+static unsigned long calibrate_nop(unsigned int engine, double tolerance_pct)
{
const uint32_t bbe = 0xa << 23;
unsigned int loops = 17;
unsigned int usecs = nop_calibration_us;
struct drm_i915_gem_exec_object2 obj = {};
- struct drm_i915_gem_execbuffer2 eb =
- { .buffer_count = 1, .buffers_ptr = (uintptr_t)&obj};
+ struct drm_i915_gem_execbuffer2 eb = {
+ .buffer_count = 1,
+ .buffers_ptr = (uintptr_t)&obj,
+ .flags = eb_engine_map[engine],
+ };
long size, last_size;
struct timespec t_0, t_end;
+ if (__gem_execbuf(fd, &eb) != -ENOENT)
+ return 0;
+
clock_gettime(CLOCK_MONOTONIC, &t_0);
size = 256 * 1024;
@@ -2803,8 +2809,8 @@ int main(int argc, char **argv)
int master_workload = -1;
char *append_workload_arg = NULL;
struct w_arg *w_args = NULL;
- unsigned int tolerance_pct = 1;
const struct workload_balancer *balancer = NULL;
+ double tolerance_pct = 1;
char *endptr = NULL;
int prio = 0;
double t;
@@ -2852,10 +2858,28 @@ int main(int argc, char **argv)
clients = strtol(optarg, NULL, 0);
break;
case 't':
- tolerance_pct = strtol(optarg, NULL, 0);
+ tolerance_pct = strtod(optarg, NULL);
break;
case 'n':
- nop_calibration = strtol(optarg, NULL, 0);
+ if (strchr(optarg, ',')) {
+ char *ctx = NULL;
+ char *str = optarg;
+ char *token;
+
+ while ((token = strtok_r(str, ",", &ctx)) != NULL) {
+ unsigned long nop;
+ int engine;
+
+ str = NULL;
+ if (sscanf(token, "%d:%lu",
+ &engine, &nop) == 2)
+ nop_calibration[engine] = nop;
+ }
+ } else {
+ nop_calibration[0] = strtol(optarg, NULL, 0);
+ for (i = 1; i < NUM_ENGINES; i++)
+ nop_calibration[i] = nop_calibration[0];
+ }
break;
case 'r':
repeat = strtol(optarg, NULL, 0);
@@ -2930,14 +2954,22 @@ int main(int argc, char **argv)
return 1;
}
- if (!nop_calibration) {
+ if (!nop_calibration[0]) {
+ int engine;
+
if (verbose > 1)
- printf("Calibrating nop delay with %u%% tolerance...\n",
+ printf("Calibrating nop delay with %.1f%% tolerance...\n",
tolerance_pct);
- nop_calibration = calibrate_nop(tolerance_pct);
- if (verbose)
- printf("Nop calibration for %uus delay is %lu.\n",
- nop_calibration_us, nop_calibration);
+
+ for (engine = 0; engine < NUM_ENGINES; engine++) {
+ nop_calibration[engine] = calibrate_nop(engine, tolerance_pct);
+ if (!nop_calibration[engine])
+ continue;
+
+ if (verbose)
+ printf("Nop(engine:%d) calibration for %uus delay is %lu.\n",
+ engine, nop_calibration_us, nop_calibration[engine]);
+ }
return 0;
}
@@ -2997,7 +3029,7 @@ int main(int argc, char **argv)
if (verbose > 1) {
printf("Using %lu nop calibration for %uus delay.\n",
- nop_calibration, nop_calibration_us);
+ nop_calibration[0], nop_calibration_us);
printf("%u client%s.\n", clients, clients > 1 ? "s" : "");
if (flags & SWAPVCS)
printf("Swapping VCS rings between clients.\n");
--
2.20.1
More information about the Intel-gfx mailing list