[Intel-gfx] [PATCH i-g-t] benchmarks/gem_wsim: Measure nop latency on all engines

Thu May 16 19:42:44 UTC 2019

Different engines take a different number of cycles to execute MI_NOOP.
As we specify workload durations in microseconds (us), we need to take
the per-engine calibration values into account so that the workloads
behave as expected.

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
---
 benchmarks/gem_wsim.c | 72 +++++++++++++++++++++++++++++++------------
 1 file changed, 52 insertions(+), 20 deletions(-)

diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c
index 9564dcb70..50a062f0e 100644
--- a/benchmarks/gem_wsim.c
+++ b/benchmarks/gem_wsim.c
@@ -238,7 +238,7 @@ struct workload
 };
 
 static const unsigned int nop_calibration_us = 1000;
-static unsigned long nop_calibration;
+static unsigned long nop_calibration[NUM_ENGINES];
 
 static unsigned int context_vcs_rr;
 
@@ -808,9 +808,9 @@ static unsigned int get_duration(struct w_step *w)
 		       (dur->max + 1 - dur->min);
 }
 
-static unsigned long get_bb_sz(unsigned int duration)
+static unsigned long get_bb_sz(unsigned int engine, unsigned int duration)
 {
-	return ALIGN(duration * nop_calibration * sizeof(uint32_t) /
+	return ALIGN(duration * nop_calibration[engine] * sizeof(uint32_t) /
 		     nop_calibration_us, sizeof(uint32_t));
 }
 
@@ -818,7 +818,7 @@ static void
 init_bb(struct w_step *w, unsigned int flags)
 {
 	const unsigned int arb_period =
-			get_bb_sz(w->preempt_us) / sizeof(uint32_t);
+			get_bb_sz(w->engine, w->preempt_us) / sizeof(uint32_t);
 	const unsigned int mmap_len = ALIGN(w->bb_sz, 4096);
 	unsigned int i;
 	uint32_t *ptr;
@@ -1043,10 +1043,10 @@ alloc_step_batch(struct workload *wrk, struct w_step *w, unsigned int flags)
 
 	if (w->unbound_duration)
 		/* nops + MI_ARB_CHK + MI_BATCH_BUFFER_START */
-		w->bb_sz = max(64, get_bb_sz(w->preempt_us)) +
+		w->bb_sz = max(64, get_bb_sz(w->engine, w->preempt_us)) +
 			   (1 + 3) * sizeof(uint32_t);
 	else
-		w->bb_sz = get_bb_sz(w->duration.max);
+		w->bb_sz = get_bb_sz(w->engine, w->duration.max);
 	w->bb_handle = w->obj[j].handle = gem_create(fd, w->bb_sz + (w->unbound_duration ? 4096 : 0));
 	init_bb(w, flags);
 	terminate_bb(w, flags);
@@ -2300,7 +2300,7 @@ do_eb(struct workload *wrk, struct w_step *w, enum intel_engine_id engine,
 	w->eb.batch_start_offset =
 		w->unbound_duration ?
 		0 :
-		ALIGN(w->bb_sz - get_bb_sz(get_duration(w)),
+		ALIGN(w->bb_sz - get_bb_sz(engine, get_duration(w)),
 		      2 * sizeof(uint32_t));
 
 	for (i = 0; i < w->fence_deps.nr; i++) {
@@ -2580,17 +2580,23 @@ static void fini_workload(struct workload *wrk)
 	free(wrk);
 }
 
-static unsigned long calibrate_nop(unsigned int tolerance_pct)
+static unsigned long calibrate_nop(unsigned int engine, double tolerance_pct)
 {
 	const uint32_t bbe = 0xa << 23;
 	unsigned int loops = 17;
 	unsigned int usecs = nop_calibration_us;
 	struct drm_i915_gem_exec_object2 obj = {};
-	struct drm_i915_gem_execbuffer2 eb =
-		{ .buffer_count = 1, .buffers_ptr = (uintptr_t)&obj};
+	struct drm_i915_gem_execbuffer2 eb = {
+		.buffer_count = 1,
+		.buffers_ptr = (uintptr_t)&obj,
+		.flags = eb_engine_map[engine],
+	};
 	long size, last_size;
 	struct timespec t_0, t_end;
 
+	if (__gem_execbuf(fd, &eb) != -ENOENT)
+		return 0;
+
 	clock_gettime(CLOCK_MONOTONIC, &t_0);
 
 	size = 256 * 1024;
@@ -2803,8 +2809,8 @@ int main(int argc, char **argv)
 	int master_workload = -1;
 	char *append_workload_arg = NULL;
 	struct w_arg *w_args = NULL;
-	unsigned int tolerance_pct = 1;
 	const struct workload_balancer *balancer = NULL;
+	double tolerance_pct = 1;
 	char *endptr = NULL;
 	int prio = 0;
 	double t;
@@ -2852,10 +2858,28 @@ int main(int argc, char **argv)
 			clients = strtol(optarg, NULL, 0);
 			break;
 		case 't':
-			tolerance_pct = strtol(optarg, NULL, 0);
+			tolerance_pct = strtod(optarg, NULL);
 			break;
 		case 'n':
-			nop_calibration = strtol(optarg, NULL, 0);
+			if (strchr(optarg, ',')) {
+				char *ctx = NULL;
+				char *str = optarg;
+				char *token;
+
+				while ((token = strtok_r(str, ",", &ctx)) != NULL) {
+					unsigned long nop;
+					int engine;
+
+					str = NULL;
+					if (sscanf(token, "%d:%lu",
+						   &engine, &nop) == 2)
+						nop_calibration[engine] = nop;
+				}
+			} else {
+				nop_calibration[0] = strtol(optarg, NULL, 0);
+				for (i = 1; i < NUM_ENGINES; i++)
+					nop_calibration[i] = nop_calibration[0];
+			}
 			break;
 		case 'r':
 			repeat = strtol(optarg, NULL, 0);
@@ -2930,14 +2954,22 @@ int main(int argc, char **argv)
 		return 1;
 	}
 
-	if (!nop_calibration) {
+	if (!nop_calibration[0]) {
+		int engine;
+
 		if (verbose > 1)
-			printf("Calibrating nop delay with %u%% tolerance...\n",
+			printf("Calibrating nop delay with %.1f%% tolerance...\n",
 				tolerance_pct);
-		nop_calibration = calibrate_nop(tolerance_pct);
-		if (verbose)
-			printf("Nop calibration for %uus delay is %lu.\n",
-			       nop_calibration_us, nop_calibration);
+
+		for (engine = 0; engine < NUM_ENGINES; engine++) {
+			nop_calibration[engine] = calibrate_nop(engine, tolerance_pct);
+			if (!nop_calibration[engine])
+				continue;
+
+			if (verbose)
+				printf("Nop(engine:%d) calibration for %uus delay is %lu.\n",
+				       engine, nop_calibration_us, nop_calibration[engine]);
+		}
 
 		return 0;
 	}
@@ -2997,7 +3029,7 @@ int main(int argc, char **argv)
 
 	if (verbose > 1) {
 		printf("Using %lu nop calibration for %uus delay.\n",
-		       nop_calibration, nop_calibration_us);
+		       nop_calibration[0], nop_calibration_us);
 		printf("%u client%s.\n", clients, clients > 1 ? "s" : "");
 		if (flags & SWAPVCS)
 			printf("Swapping VCS rings between clients.\n");
-- 
2.20.1