[Intel-gfx] [PATCH i-g-t v2] gem_wsim: Use CTX_TIMESTAMP for timed spinners

Wed Nov 4 17:09:07 UTC 2020

Use MI_MATH and MI_COND_BBE we can construct a loop that runs for a
precise number of clock cycles, as measured by the CTX_TIMESTAMP. We use
the CTX_TIMESTAMP (as opposed to the CS_TIMESTAMP) so that the elapsed
time is measured local to the context, and the length of the batch is
unaffected by preemption. Since the clock ticks at a known frequency, we
can directly translate the batch durations into cycles and so remove the
requirement for nop calibration, and the often excessively large nop
batches.

The downside to this is that we need to use engine local registers, and
before gen11 there is no support in the CS for relative mmio and so this
technique does not support transparent load balancing on a virtual
engine before Icelake.

v2: More commentary, more code removal.

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
 benchmarks/gem_wsim.c | 538 ++++++++++++++----------------------------
 1 file changed, 181 insertions(+), 357 deletions(-)

diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c
index dbb46b9aa..ecad4a8dc 100644
--- a/benchmarks/gem_wsim.c
+++ b/benchmarks/gem_wsim.c
@@ -176,10 +176,9 @@ struct w_step
 
 	struct drm_i915_gem_execbuffer2 eb;
 	struct drm_i915_gem_exec_object2 *obj;
-	struct drm_i915_gem_relocation_entry reloc[1];
-	unsigned long bb_sz;
+	struct drm_i915_gem_relocation_entry reloc[3];
 	uint32_t bb_handle;
-	uint32_t *recursive_bb_start;
+	uint32_t *bb_duration;
 };
 
 struct ctx {
@@ -227,10 +226,6 @@ struct workload
 	unsigned int nrequest[NUM_ENGINES];
 };
 
-static const unsigned int nop_calibration_us = 1000;
-static bool has_nop_calibration = false;
-static bool sequential = true;
-
 static unsigned int master_prng;
 
 static int verbose = 1;
@@ -253,59 +248,71 @@ static const char *ring_str_map[NUM_ENGINES] = {
 	[VECS] = "VECS",
 };
 
-/* stores calibrations for particular engines */
-static unsigned long engine_calib_map[NUM_ENGINES];
-
-static enum intel_engine_id
-ci_to_engine_id(int class, int instance)
-{
-	static const struct {
-		int class;
-		int instance;
-		unsigned int id;
-	} map[] = {
-		{ I915_ENGINE_CLASS_RENDER, 0, RCS },
-		{ I915_ENGINE_CLASS_COPY, 0, BCS },
-		{ I915_ENGINE_CLASS_VIDEO, 0, VCS1 },
-		{ I915_ENGINE_CLASS_VIDEO, 1, VCS2 },
-		{ I915_ENGINE_CLASS_VIDEO, 2, VCS2 }, /* FIXME/ICL */
-		{ I915_ENGINE_CLASS_VIDEO_ENHANCE, 0, VECS },
+static int read_timestamp_frequency(int i915)
+{
+	int value = 0;
+	drm_i915_getparam_t gp = {
+		.value = &value,
+		.param = I915_PARAM_CS_TIMESTAMP_FREQUENCY,
 	};
-
-	unsigned int i;
-
-	for (i = 0; i < ARRAY_SIZE(map); i++) {
-		if (class == map[i].class && instance == map[i].instance)
-			return map[i].id;
-	}
-	return -1;
+	ioctl(i915, DRM_IOCTL_I915_GETPARAM, &gp);
+	return value;
 }
 
-static void
-apply_unset_calibrations(unsigned long raw_number)
+static uint64_t div64_u64_round_up(uint64_t x, uint64_t y)
 {
-	for (int i = 0; i < NUM_ENGINES; i++)
-		engine_calib_map[i] += engine_calib_map[i] ? 0 : raw_number;
+	return (x + y - 1) / y;
 }
 
-static void
-print_engine_calibrations(void)
+static uint64_t ns_to_ctx_ticks(uint64_t ns)
 {
-	bool first_entry = true;
+	static long f;
 
-	printf("Nop calibration for %uus delay is: ", nop_calibration_us);
-	for (int i = 0; i < NUM_ENGINES; i++) {
-		/* skip engines not present and DEFAULT and VCS */
-		if (i != DEFAULT && i != VCS && engine_calib_map[i]) {
-			if (first_entry) {
-				printf("%s=%lu", ring_str_map[i], engine_calib_map[i]);
-				first_entry = false;
-			} else {
-				printf(",%s=%lu", ring_str_map[i], engine_calib_map[i]);
-			}
-		}
+	if (!f) {
+		f = read_timestamp_frequency(fd);
+		if (intel_gen(intel_get_drm_devid(fd)) == 11)
+			f = 12500000; /* icl!!! are you feeling alright? */
 	}
-	printf("\n");
+
+	return div64_u64_round_up(ns * f, NSEC_PER_SEC);
+}
+
+#define MI_INSTR(opcode, flags) (((opcode) << 23) | (flags))
+
+#define MI_ARB_CHECK MI_INSTR(0x5, 0)
+
+#define MI_MATH(x)                      MI_INSTR(0x1a, (x) - 1)
+#define MI_MATH_INSTR(opcode, op1, op2) ((opcode) << 20 | (op1) << 10 | (op2))
+/* Opcodes for MI_MATH_INSTR */
+#define   MI_MATH_NOOP                  MI_MATH_INSTR(0x000, 0x0, 0x0)
+#define   MI_MATH_LOAD(op1, op2)        MI_MATH_INSTR(0x080, op1, op2)
+#define   MI_MATH_LOADINV(op1, op2)     MI_MATH_INSTR(0x480, op1, op2)
+#define   MI_MATH_LOAD0(op1)            MI_MATH_INSTR(0x081, op1)
+#define   MI_MATH_LOAD1(op1)            MI_MATH_INSTR(0x481, op1)
+#define   MI_MATH_ADD                   MI_MATH_INSTR(0x100, 0x0, 0x0)
+#define   MI_MATH_SUB                   MI_MATH_INSTR(0x101, 0x0, 0x0)
+#define   MI_MATH_AND                   MI_MATH_INSTR(0x102, 0x0, 0x0)
+#define   MI_MATH_OR                    MI_MATH_INSTR(0x103, 0x0, 0x0)
+#define   MI_MATH_XOR                   MI_MATH_INSTR(0x104, 0x0, 0x0)
+#define   MI_MATH_STORE(op1, op2)       MI_MATH_INSTR(0x180, op1, op2)
+#define   MI_MATH_STOREINV(op1, op2)    MI_MATH_INSTR(0x580, op1, op2)
+/* Registers used as operands in MI_MATH_INSTR */
+#define   MI_MATH_REG(x)                (x)
+#define   MI_MATH_REG_SRCA              0x20
+#define   MI_MATH_REG_SRCB              0x21
+#define   MI_MATH_REG_ACCU              0x31
+#define   MI_MATH_REG_ZF                0x32
+#define   MI_MATH_REG_CF                0x33
+
+#define MI_STORE_DWORD_INDEX	MI_INSTR(0x21, 1)
+#define MI_STORE_REGISTER_MEM	MI_INSTR(0x24, 0)
+#define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 1)
+#define   MI_CS_MMIO_DST BIT(19)
+#define   MI_CS_MMIO_SRC BIT(18)
+
+static unsigned int offset_in_page(void *addr)
+{
+	return (uintptr_t)addr & 4095;
 }
 
 static void add_dep(struct deps *deps, struct dep_entry entry)
@@ -1392,92 +1399,126 @@ __get_ctx(struct workload *wrk, const struct w_step *w)
 	return &wrk->ctx_list[w->context];
 }
 
-static unsigned long
-__get_bb_sz(const struct w_step *w, unsigned int duration)
-{
-	enum intel_engine_id engine = w->engine;
-	struct ctx *ctx = __get_ctx(w->wrk, w);
-	unsigned long d;
-
-	if (ctx->engine_map && engine == DEFAULT)
-		/* Assume first engine calibration. */
-		engine = ctx->engine_map[0];
-
-	igt_assert(engine_calib_map[engine]);
-	d = ALIGN(duration * engine_calib_map[engine] * sizeof(uint32_t) /
-		  nop_calibration_us,
-		  sizeof(uint32_t));
-
-	return d;
-}
-
-static unsigned long
-get_bb_sz(const struct w_step *w, unsigned int duration)
+static uint32_t mmio_base(int i915, enum intel_engine_id engine, int gen)
 {
-	unsigned long d = __get_bb_sz(w, duration);
+	const char *name;
 
-	igt_assert(d);
-
-	return d;
-}
+	if (gen >= 11)
+		return 0;
 
-static void init_bb(struct w_step *w)
-{
-	const unsigned int arb_period =
-			__get_bb_sz(w, w->preempt_us) / sizeof(uint32_t);
-	const unsigned int mmap_len = ALIGN(w->bb_sz, 4096);
-	unsigned int i;
-	uint32_t *ptr;
+	switch (engine) {
+	case NUM_ENGINES:
+	default:
+		return 0;
 
-	if (w->unbound_duration || !arb_period)
-		return;
+	case DEFAULT:
+	case RCS:
+		name = "rcs0";
+		break;
 
-	gem_set_domain(fd, w->bb_handle,
-		       I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC);
+	case BCS:
+		name = "bcs0";
+		break;
 
-	ptr = gem_mmap__wc(fd, w->bb_handle, 0, mmap_len, PROT_WRITE);
+	case VCS:
+	case VCS1:
+		name = "vcs0";
+		break;
+	case VCS2:
+		name = "vcs1";
+		break;
 
-	for (i = arb_period; i < w->bb_sz / sizeof(uint32_t); i += arb_period)
-		ptr[i] = 0x5 << 23; /* MI_ARB_CHK */
+	case VECS:
+		name = "vecs0";
+		break;
+	}
 
-	munmap(ptr, mmap_len);
+	return gem_engine_mmio_base(i915, name);
 }
 
-static unsigned int terminate_bb(struct w_step *w)
+static unsigned int create_bb(struct w_step *w, int self)
 {
-	const uint32_t bbe = 0xa << 23;
-	unsigned long mmap_start, mmap_len;
-	unsigned long batch_start = w->bb_sz;
+	const int gen = intel_gen(intel_get_drm_devid(fd));
+	const uint32_t base = mmio_base(fd, w->engine, gen);
+#define CS_GPR(x) (base + 0x600 + 8 * (x))
+#define TIMESTAMP (base + 0x3a8)
+	const int use_64b = gen >= 8;
+	enum { START_TS, NOW_TS };
+	uint32_t *ptr, *cs, *jmp;
 	unsigned int r = 0;
-	uint32_t *ptr, *cs;
-
-	batch_start -= sizeof(uint32_t); /* bbend */
-
-	if (w->unbound_duration)
-		batch_start -= 4 * sizeof(uint32_t); /* MI_ARB_CHK + MI_BATCH_BUFFER_START */
 
-	mmap_start = rounddown(batch_start, PAGE_SIZE);
-	mmap_len = ALIGN(w->bb_sz - mmap_start, PAGE_SIZE);
+	/* Loop until CTX_TIMESTAMP - initial > target ns */
 
 	gem_set_domain(fd, w->bb_handle,
 		       I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC);
 
-	ptr = gem_mmap__wc(fd, w->bb_handle, mmap_start, mmap_len, PROT_WRITE);
-	cs = (uint32_t *)((char *)ptr + batch_start - mmap_start);
+	cs = ptr = gem_mmap__wc(fd, w->bb_handle, 0, 4096, PROT_WRITE);
 
-	if (w->unbound_duration) {
-		w->reloc[r++].offset = batch_start + 2 * sizeof(uint32_t);
-		batch_start += 4 * sizeof(uint32_t);
+	/* Store initial 64b timestamp: start */
+	*cs++ = MI_LOAD_REGISTER_IMM | MI_CS_MMIO_DST;
+	*cs++ = CS_GPR(START_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG | MI_CS_MMIO_DST | MI_CS_MMIO_SRC;
+	*cs++ = TIMESTAMP;
+	*cs++ = CS_GPR(START_TS);
 
-		*cs++ = w->preempt_us ? 0x5 << 23 /* MI_ARB_CHK; */ : MI_NOOP;
-		w->recursive_bb_start = cs;
-		*cs++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
+	if (offset_in_page(cs) & 4)
 		*cs++ = 0;
+	jmp = cs;
+
+	if (w->preempt_us) /* Not precise! */
+		*cs++ = MI_ARB_CHECK;
+
+	/* Store this 64b timestamp: now */
+	*cs++ = MI_LOAD_REGISTER_IMM | MI_CS_MMIO_DST;
+	*cs++ = CS_GPR(NOW_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG | MI_CS_MMIO_DST | MI_CS_MMIO_SRC;
+	*cs++ = TIMESTAMP;
+	*cs++ = CS_GPR(NOW_TS);
+
+	/* delta = now - start; inverted to match COND_BBE */
+	*cs++ = MI_MATH(4);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS));
+	*cs++ = MI_MATH_SUB;
+	*cs++ = MI_MATH_STOREINV(MI_MATH_REG(NOW_TS), MI_MATH_REG_ACCU);
+
+	/* Save delta for indirect read by COND_BBE */
+	*cs++ = MI_STORE_REGISTER_MEM | (1 + use_64b) | MI_CS_MMIO_DST;
+	*cs++ = CS_GPR(NOW_TS);
+	w->reloc[r].target_handle = self;
+	w->reloc[r].offset = offset_in_page(cs);
+	*cs++ = w->reloc[r].delta = 4000;
+	*cs++ = 0;
+	r++;
+
+	/* Delay between SRM and COND_BBE to post the writes */
+	for (int n = 0; n < 8; n++) {
+		*cs++ = MI_STORE_DWORD_INDEX;
+		*cs++ = 2048; /* offset into ppHWSP */
 		*cs++ = 0;
 	}
 
-	*cs = bbe;
-
+	/* Break if delta [time elapsed] > target ns (target filled in later) */
+	*cs++ = MI_COND_BATCH_BUFFER_END | MI_DO_COMPARE | (1 + use_64b);
+	w->bb_duration = cs;
+	*cs++ = 0;
+	w->reloc[r].target_handle = self;
+	w->reloc[r].offset = offset_in_page(cs);
+	*cs++ = w->reloc[r].delta = 4000;
+	*cs++ = 0;
+	r++;
+
+	/* Otherwise back to recalculating delta */
+	*cs++ = MI_BATCH_BUFFER_START | 1 << 8 | use_64b;
+	w->reloc[r].target_handle = self;
+	w->reloc[r].offset = offset_in_page(cs);
+	*cs++ = w->reloc[r].delta = offset_in_page(jmp);
+	*cs++ = 0;
+	r++;
+
+	/* returns still mmapped for w->bb_duration to be filled in later */
 	return r;
 }
 
@@ -1590,23 +1631,10 @@ alloc_step_batch(struct workload *wrk, struct w_step *w)
 		igt_assert(j < nr_obj);
 	}
 
-	if (w->unbound_duration)
-		/* nops + MI_ARB_CHK + MI_BATCH_BUFFER_START */
-		w->bb_sz = max(PAGE_SIZE, __get_bb_sz(w, w->preempt_us)) +
-			   (1 + 3) * sizeof(uint32_t);
-	else
-		w->bb_sz = get_bb_sz(w, w->duration.max);
-
-	w->bb_handle = w->obj[j].handle =
-		alloc_bo(fd, w->bb_sz + (w->unbound_duration ? 4096 : 0));
-	init_bb(w);
-	w->obj[j].relocation_count = terminate_bb(w);
-
-	if (w->obj[j].relocation_count) {
-		igt_assert(w->unbound_duration);
-		w->obj[j].relocs_ptr = to_user_pointer(&w->reloc);
-		w->reloc[0].target_handle = j;
-	}
+	w->bb_handle = w->obj[j].handle = gem_create(fd, 4096);
+	w->obj[j].relocation_count = create_bb(w, j);
+	igt_assert(w->obj[j].relocation_count <= ARRAY_SIZE(w->reloc));
+	w->obj[j].relocs_ptr = to_user_pointer(&w->reloc);
 
 	w->eb.buffers_ptr = to_user_pointer(w->obj);
 	w->eb.buffer_count = j + 1;
@@ -1617,8 +1645,8 @@ alloc_step_batch(struct workload *wrk, struct w_step *w)
 	printf("%u: %u:|", w->idx, w->eb.buffer_count);
 	for (i = 0; i <= j; i++)
 		printf("%x|", w->obj[i].handle);
-	printf(" %10lu flags=%llx bb=%x[%u] ctx[%u]=%u\n",
-		w->bb_sz, w->eb.flags, w->bb_handle, j, w->context,
+	printf(" flags=%llx bb=%x[%u] ctx[%u]=%u\n",
+		w->eb.flags, w->bb_handle, j, w->context,
 		get_ctxid(wrk, w));
 #endif
 }
@@ -1803,7 +1831,7 @@ static void measure_active_set(struct workload *wrk)
 		if (w->type != BATCH)
 			continue;
 
-		batch_sizes += w->bb_sz;
+		batch_sizes += 4096;
 
 		for (j = 0; j < w->data_deps.nr; j++) {
 			struct dep_entry *dep = &w->data_deps.list[j];
@@ -1904,6 +1932,10 @@ static int prepare_workload(unsigned int id, struct workload *wrk)
 					wsim_err("Load balancing needs an engine map!\n");
 					return 1;
 				}
+				if (intel_gen(intel_get_drm_devid(fd)) < 11) {
+					wsim_err("Load balancing needs relative mmio support, gen11+!\n");
+					return 1;
+				}
 				ctx->load_balance = w->load_balance;
 			} else if (w->type == BOND) {
 				if (!ctx->load_balance) {
@@ -2163,15 +2195,16 @@ static int elapsed_us(const struct timespec *start, const struct timespec *end)
 }
 
 static void
-update_bb_start(struct w_step *w)
+update_bb_start(struct workload *wrk, struct w_step *w)
 {
-	if (!w->unbound_duration)
-		return;
+	uint32_t ticks;
 
-	gem_set_domain(fd, w->bb_handle,
-		       I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC);
+	/* ticks is inverted for MI_DO_COMPARE (less-than comparison) */
+	ticks = 0;
+	if (!w->unbound_duration)
+		ticks = ~ns_to_ctx_ticks(1000 * get_duration(wrk, w));
 
-	*w->recursive_bb_start = MI_BATCH_BUFFER_START | (1 << 8) | 1;
+	*w->bb_duration = ticks;
 }
 
 static void w_sync_to(struct workload *wrk, struct w_step *w, int target)
@@ -2198,13 +2231,7 @@ do_eb(struct workload *wrk, struct w_step *w, enum intel_engine_id engine)
 	unsigned int i;
 
 	eb_update_flags(wrk, w, engine);
-	update_bb_start(w);
-
-	w->eb.batch_start_offset =
-		w->unbound_duration ?
-		0 :
-		ALIGN(w->bb_sz - get_bb_sz(w, get_duration(wrk, w)),
-		      2 * sizeof(uint32_t));
+	update_bb_start(wrk, w);
 
 	for (i = 0; i < w->fence_deps.nr; i++) {
 		int tgt = w->idx + w->fence_deps.list[i].target;
@@ -2353,8 +2380,7 @@ static void *run_workload(void *data)
 				igt_assert(wrk->steps[t_idx].type == BATCH);
 				igt_assert(wrk->steps[t_idx].unbound_duration);
 
-				*wrk->steps[t_idx].recursive_bb_start =
-					MI_BATCH_BUFFER_END;
+				*wrk->steps[t_idx].bb_duration = 0xffffffff;
 				__sync_synchronize();
 				continue;
 			} else if (w->type == SSEU) {
@@ -2467,132 +2493,15 @@ static void fini_workload(struct workload *wrk)
 	free(wrk);
 }
 
-static unsigned long calibrate_nop(unsigned int tolerance_pct, struct intel_execution_engine2 *engine)
-{
-	const uint32_t bbe = 0xa << 23;
-	unsigned int loops = 17;
-	unsigned int usecs = nop_calibration_us;
-	struct drm_i915_gem_exec_object2 obj = {};
-	struct drm_i915_gem_execbuffer2 eb = {
-		.buffer_count = 1,
-		.buffers_ptr = (uintptr_t)&obj,
-		.flags = engine->flags
-	};
-	long size, last_size;
-	struct timespec t_0, t_end;
-
-	clock_gettime(CLOCK_MONOTONIC, &t_0);
-
-	size = 256 * 1024;
-	do {
-		struct timespec t_start;
-
-		obj.handle = alloc_bo(fd, size);
-		gem_write(fd, obj.handle, size - sizeof(bbe), &bbe,
-			  sizeof(bbe));
-		gem_execbuf(fd, &eb);
-		gem_sync(fd, obj.handle);
-
-		clock_gettime(CLOCK_MONOTONIC, &t_start);
-		for (int loop = 0; loop < loops; loop++)
-			gem_execbuf(fd, &eb);
-		gem_sync(fd, obj.handle);
-		clock_gettime(CLOCK_MONOTONIC, &t_end);
-
-		gem_close(fd, obj.handle);
-
-		last_size = size;
-		size = loops * size / elapsed(&t_start, &t_end) / 1e6 * usecs;
-		size = ALIGN(size, sizeof(uint32_t));
-	} while (elapsed(&t_0, &t_end) < 5 ||
-		 labs(size - last_size) > (size * tolerance_pct / 100));
-
-	return size / sizeof(uint32_t);
-}
-
-static void
-calibrate_sequentially(void)
-{
-	struct intel_execution_engine2 *engine;
-	enum intel_engine_id eng_id;
-
-	__for_each_physical_engine(fd, engine) {
-		eng_id = ci_to_engine_id(engine->class, engine->instance);
-		igt_assert(eng_id >= 0);
-		engine_calib_map[eng_id] = calibrate_nop(fd, engine);
-	}
-}
-
-struct thread_data {
-	struct intel_execution_engine2 *eng;
-	pthread_t thr;
-	unsigned long calib;
-};
-
-static void *
-engine_calibration_thread(void *data)
-{
-	struct thread_data *thr_d = (struct thread_data *) data;
-
-	thr_d->calib = calibrate_nop(fd, thr_d->eng);
-	return NULL;
-}
-
-static void
-calibrate_in_parallel(void)
-{
-	struct thread_data *thr_d = malloc(NUM_ENGINES * sizeof(*thr_d));
-	struct intel_execution_engine2 *engine;
-	enum intel_engine_id id;
-	int ret;
-
-	__for_each_physical_engine(fd, engine) {
-		id = ci_to_engine_id(engine->class, engine->instance);
-		thr_d[id].eng = engine;
-		ret = pthread_create(&thr_d[id].thr, NULL, engine_calibration_thread, &thr_d[id]);
-		igt_assert_eq(ret, 0);
-	}
-
-	__for_each_physical_engine(fd, engine) {
-		id = ci_to_engine_id(engine->class, engine->instance);
-		igt_assert(id >= 0);
-
-		ret = pthread_join(thr_d[id].thr, NULL);
-		igt_assert_eq(ret, 0);
-		engine_calib_map[id] = thr_d[id].calib;
-	}
-
-	free(thr_d);
-}
-
-static void
-calibrate_engines(void)
-{
-	if (sequential)
-		calibrate_sequentially();
-	else
-		calibrate_in_parallel();
-}
-
 static void print_help(void)
 {
 	puts(
 "Usage: gem_wsim [OPTIONS]\n"
 "\n"
 "Runs a simulated workload on the GPU.\n"
-"When ran without arguments performs a GPU calibration result of which needs to\n"
-"be provided when running the simulation in subsequent invocations.\n"
-"\n"
 "Options:\n"
 "  -h                This text.\n"
 "  -q                Be quiet - do not output anything to stdout.\n"
-"  -n <n |           Nop calibration value - single value is set to all engines\n"
-"  e1=v1,e2=v2,n...> without specified value; you can also specify calibrations for\n"
-"                    particular engines.\n"
-"  -t <n>            Nop calibration tolerance percentage.\n"
-"  -T                Disable sequential calibration and perform calibration in parallel.\n"
-"                    Use when there is a difficulty obtaining calibration with the\n"
-"                    default settings.\n"
 "  -I <n>            Initial randomness seed.\n"
 "  -p <n>            Context priority to use for the following workload on the\n"
 "                    command line.\n"
@@ -2671,17 +2580,12 @@ int main(int argc, char **argv)
 	int master_workload = -1;
 	char *append_workload_arg = NULL;
 	struct w_arg *w_args = NULL;
-	unsigned int tolerance_pct = 1;
 	int exitcode = EXIT_FAILURE;
 	double scale_time = 1.0f;
 	double scale_dur = 1.0f;
 	int prio = 0;
 	double t;
-	int i, c;
-	char *subopts, *value;
-	int raw_number = 0;
-	long calib_val;
-	int eng;
+	int i, c, ret;
 
 	/*
 	 * Open the device via the low-level API so we can do the GPU quiesce
@@ -2695,7 +2599,7 @@ int main(int argc, char **argv)
 	master_prng = time(NULL);
 
 	while ((c = getopt(argc, argv,
-			   "ThqvsSdc:n:r:w:W:a:t:p:I:f:F:")) != -1) {
+			   "hqvsSdc:r:w:W:a:p:I:f:F:")) != -1) {
 		switch (c) {
 		case 'W':
 			if (master_workload >= 0) {
@@ -2721,70 +2625,7 @@ int main(int argc, char **argv)
 		case 'c':
 			clients = strtol(optarg, NULL, 0);
 			break;
-		case 't':
-			tolerance_pct = strtol(optarg, NULL, 0);
-			break;
-		case 'T':
-			sequential = false;
-			break;
-
-		case 'n':
-			subopts = optarg;
-			while (*subopts != '\0') {
-				eng = getsubopt(&subopts, (char **)ring_str_map, &value);
-				if (!value) {
-					/* only engine name was given */
-					wsim_err("Missing calibration value for '%s'!\n",
-						ring_str_map[eng]);
-					goto err;
-				}
 
-				calib_val = atol(value);
-
-				if (eng >= 0 && eng < NUM_ENGINES) {
-				/* engine name with some value were given */
-
-					if (eng == DEFAULT || eng == VCS) {
-						wsim_err("'%s' not allowed in engine calibrations!\n",
-							ring_str_map[eng]);
-						goto err;
-					} else if (calib_val <= 0) {
-						wsim_err("Invalid calibration for engine '%s' - value "
-						"is either non-positive or is not a number!\n",
-							ring_str_map[eng]);
-						goto err;
-					} else if (engine_calib_map[eng]) {
-						wsim_err("Invalid repeated calibration of '%s'!\n",
-							ring_str_map[eng]);
-						goto err;
-					} else {
-						engine_calib_map[eng] = calib_val;
-						if (eng == RCS)
-							engine_calib_map[DEFAULT] = calib_val;
-						else if (eng == VCS1 || eng == VCS2)
-							engine_calib_map[VCS] = calib_val;
-						has_nop_calibration = true;
-					}
-				} else {
-					/* raw number was given */
-
-					if (!calib_val) {
-						wsim_err("Invalid engine or zero calibration!\n");
-						goto err;
-					} else if (calib_val < 0) {
-						wsim_err("Invalid negative calibration!\n");
-						goto err;
-					} else if (raw_number) {
-						wsim_err("Default engine calibration provided more than once!\n");
-						goto err;
-					} else {
-						raw_number = calib_val;
-						apply_unset_calibrations(raw_number);
-						has_nop_calibration = true;
-					}
-				}
-			}
-			break;
 		case 'r':
 			repeat = strtol(optarg, NULL, 0);
 			break;
@@ -2820,19 +2661,6 @@ int main(int argc, char **argv)
 		}
 	}
 
-	if (!has_nop_calibration) {
-		if (verbose > 1) {
-			printf("Calibrating nop delays with %u%% tolerance...\n",
-				tolerance_pct);
-		}
-
-		calibrate_engines();
-
-		if (verbose)
-			print_engine_calibrations();
-		goto out;
-	}
-
 	if (!nr_w_args) {
 		wsim_err("No workload descriptor(s)!\n");
 		goto err;
@@ -2885,7 +2713,6 @@ int main(int argc, char **argv)
 
 	if (verbose > 1) {
 		printf("Random seed is %u.\n", master_prng);
-		print_engine_calibrations();
 		printf("%u client%s.\n", clients, clients > 1 ? "s" : "");
 	}
 
@@ -2916,16 +2743,13 @@ int main(int argc, char **argv)
 	clock_gettime(CLOCK_MONOTONIC, &t_start);
 
 	for (i = 0; i < clients; i++) {
-		int ret;
-
 		ret = pthread_create(&w[i]->thread, NULL, run_workload, w[i]);
 		igt_assert_eq(ret, 0);
 	}
 
 	if (master_workload >= 0) {
-		int ret = pthread_join(w[master_workload]->thread, NULL);
-
-		igt_assert(ret == 0);
+		ret = pthread_join(w[master_workload]->thread, NULL);
+		igt_assert_eq(ret, 0);
 
 		for (i = 0; i < clients; i++)
 			w[i]->run = false;
@@ -2933,8 +2757,8 @@ int main(int argc, char **argv)
 
 	for (i = 0; i < clients; i++) {
 		if (master_workload != i) {
-			int ret = pthread_join(w[i]->thread, NULL);
-			igt_assert(ret == 0);
+			ret = pthread_join(w[i]->thread, NULL);
+			igt_assert_eq(ret, 0);
 		}
 	}
 
-- 
2.29.1