[Intel-gfx] [PATCH i-g-t] i915/perf_pmu: Measure accuracy of busyness

Tue Dec 22 13:23:54 UTC 2020

Create a GPU delay loop that spins for a requested duration, read back how
long the PMU reports the engine as busy, and check that the reported busy
times fit a linear regression against the requested durations.

Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
---
 tests/i915/perf_pmu.c | 269 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 266 insertions(+), 3 deletions(-)

diff --git a/tests/i915/perf_pmu.c b/tests/i915/perf_pmu.c
index da06a28b0..183090553 100644
--- a/tests/i915/perf_pmu.c
+++ b/tests/i915/perf_pmu.c
@@ -29,6 +29,7 @@
 #include <inttypes.h>
 #include <errno.h>
 #include <signal.h>
+#include <sys/ioctl.h>
 #include <sys/stat.h>
 #include <sys/time.h>
 #include <sys/times.h>
@@ -51,6 +52,8 @@
 
 IGT_TEST_DESCRIPTION("Test the i915 pmu perf interface");
 
+#define NSEC64 ((uint64_t)NSEC_PER_SEC)
+
 const double tolerance = 0.05f;
 const unsigned long batch_duration_ns = 500e6;
 
@@ -1784,9 +1787,6 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 	int link[2];
 	int fd;
 
-	/* Sampling platforms cannot reach the high accuracy criteria. */
-	igt_require(gem_scheduler_has_engine_busy_stats(gem_fd));
-
 	/* Aim for approximately 100 iterations for calibration */
 	cycle_us = min_test_us / target_iters;
 	busy_us = cycle_us * target_busy_pct / 100;
@@ -1918,6 +1918,253 @@ accuracy(int gem_fd, const struct intel_execution_engine2 *e,
 	assert_within(100.0 * busy_r, 100.0 * expected, 2);
 }
 
+static uint32_t __batch_create(int i915, uint32_t offset)
+{
+	/* A lone MI_BATCH_BUFFER_END, written at byte @offset of a new object */
+	const uint32_t terminator = MI_BATCH_BUFFER_END;
+	const uint32_t bo = gem_create(i915, offset + sizeof(terminator));
+
+	gem_write(i915, bo, offset, &terminator, sizeof(terminator));
+
+	return bo;
+}
+
+static uint32_t batch_create(int i915)
+{
+	return __batch_create(i915, 0); /* terminator at byte 0 of the new object */
+}
+
+static int read_timestamp_frequency(int i915)
+{
+	int freq = 0; /* stays 0 unless the ioctl below fills it in */
+	drm_i915_getparam_t query = {
+		.param = I915_PARAM_CS_TIMESTAMP_FREQUENCY,
+		.value = &freq,
+	};
+	ioctl(i915, DRM_IOCTL_I915_GETPARAM, &query); /* result is not checked */
+	return freq;
+}
+
+static uint64_t div64_u64_round_up(uint64_t x, uint64_t y)
+{
+	return x / y + !!(x % y); /* ceil(x / y); unlike (x + y - 1) / y this cannot wrap */
+}
+
+static uint64_t ns_to_ctx_ticks(int i915, uint64_t ns)
+{
+	uint64_t hz = read_timestamp_frequency(i915);
+	if (intel_gen(intel_get_drm_devid(i915)) == 11) /* icl quirk: CTX vs CS */
+		hz = 12500000;
+	return div64_u64_round_up(ns * hz, NSEC64); /* ns -> ticks, rounded up */
+}
+
+#define MI_INSTR(opcode, flags) (((opcode) << 23) | (flags))
+
+#define MI_MATH(x)                      MI_INSTR(0x1a, (x) - 1)
+#define MI_MATH_INSTR(opcode, op1, op2) ((opcode) << 20 | (op1) << 10 | (op2))
+/* Opcodes for MI_MATH_INSTR; the single-operand forms pass 0 as op2 */
+#define   MI_MATH_NOOP                  MI_MATH_INSTR(0x000, 0x0, 0x0)
+#define   MI_MATH_LOAD(op1, op2)        MI_MATH_INSTR(0x080, op1, op2)
+#define   MI_MATH_LOADINV(op1, op2)     MI_MATH_INSTR(0x480, op1, op2)
+#define   MI_MATH_LOAD0(op1)            MI_MATH_INSTR(0x081, op1, 0x0)
+#define   MI_MATH_LOAD1(op1)            MI_MATH_INSTR(0x481, op1, 0x0)
+#define   MI_MATH_ADD                   MI_MATH_INSTR(0x100, 0x0, 0x0)
+#define   MI_MATH_SUB                   MI_MATH_INSTR(0x101, 0x0, 0x0)
+#define   MI_MATH_AND                   MI_MATH_INSTR(0x102, 0x0, 0x0)
+#define   MI_MATH_OR                    MI_MATH_INSTR(0x103, 0x0, 0x0)
+#define   MI_MATH_XOR                   MI_MATH_INSTR(0x104, 0x0, 0x0)
+#define   MI_MATH_STORE(op1, op2)       MI_MATH_INSTR(0x180, op1, op2)
+#define   MI_MATH_STOREINV(op1, op2)    MI_MATH_INSTR(0x580, op1, op2)
+/* Operand encodings for MI_MATH_INSTR: GPR index x, or a fixed ALU register */
+#define   MI_MATH_REG(x)                (x)
+#define   MI_MATH_REG_SRCA              0x20
+#define   MI_MATH_REG_SRCB              0x21
+#define   MI_MATH_REG_ACCU              0x31
+#define   MI_MATH_REG_ZF                0x32
+#define   MI_MATH_REG_CF                0x33
+
+#define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 1)
+
+static unsigned int offset_in_page(void *addr)
+{
+	return (uintptr_t)addr % 4096; /* byte offset of @addr within a 4096-byte page */
+}
+
+static void delay(int i915,
+		  const struct intel_execution_engine2 *e, /* engine whose mmio base is looked up */
+		  uint32_t handle, /* object whose first 4096 bytes receive the batch */
+		  uint64_t addr, /* GPU address used for the scratch dwords and the jump back */
+		  uint64_t ns) /* requested spin time, converted to ticks below */
+{
+	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8; /* gen8+: one extra dword in the command lengths below */
+	const uint32_t base = gem_engine_mmio_base(i915, e->name);
+#define CS_GPR(x) (base + 0x600 + 8 * (x)) /* mmio offset of 8-byte register slot x, relative to the engine base */
+#define RUNTIME (base + 0x3a8) /* register sampled as the clock -- presumably the context timestamp; confirm */
+	enum { START_TS, NOW_TS }; /* GPR slots: sample taken at entry / sample taken on each pass */
+	uint32_t *map, *cs, *jmp;
+
+	igt_require(base); /* skip when no mmio base is reported for this engine */
+
+	/* Loop until CTX_TIMESTAMP - initial > @ns */
+
+	cs = map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
+
+	*cs++ = MI_LOAD_REGISTER_IMM;
+	*cs++ = CS_GPR(START_TS) + 4; /* the dword at +4 of the START_TS slot ... */
+	*cs++ = 0; /* ... is set to 0 */
+	*cs++ = MI_LOAD_REGISTER_REG; /* register-to-register load: RUNTIME and the START_TS slot */
+	*cs++ = RUNTIME;
+	*cs++ = CS_GPR(START_TS);
+
+	while (offset_in_page(cs) & 63) /* pad with zero dwords up to a 64-byte boundary */
+		*cs++ = 0;
+	jmp = cs; /* loop head: the jump emitted at the end targets this dword */
+
+	*cs++ = 0x5 << 23; /* MI_ARB_CHECK */
+
+	*cs++ = MI_LOAD_REGISTER_IMM; /* same two steps as above, now for the NOW_TS slot */
+	*cs++ = CS_GPR(NOW_TS) + 4;
+	*cs++ = 0;
+	*cs++ = MI_LOAD_REGISTER_REG;
+	*cs++ = RUNTIME;
+	*cs++ = CS_GPR(NOW_TS);
+
+	/* delta = now - start; inverted to match COND_BBE */
+	*cs++ = MI_MATH(4);
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS));
+	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS));
+	*cs++ = MI_MATH_SUB;
+	*cs++ = MI_MATH_STOREINV(MI_MATH_REG(NOW_TS), MI_MATH_REG_ACCU); /* result overwrites the NOW_TS slot */
+
+	/* Save delta for reading by COND_BBE */
+	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
+	*cs++ = CS_GPR(NOW_TS);
+	*cs++ = addr + 4000; /* scratch dword at byte 4000 from @addr */
+	*cs++ = addr >> 32;
+
+	/* Delay between SRM and COND_BBE to post the writes */
+	for (int n = 0; n < 8; n++) {
+		*cs++ = MI_STORE_DWORD_IMM; /* dummy store of 0 to byte 4064 from @addr */
+		if (use_64b) {
+			*cs++ = addr + 4064;
+			*cs++ = addr >> 32;
+		} else {
+			*cs++ = 0;
+			*cs++ = addr + 4064;
+		}
+		*cs++ = 0;
+	}
+
+	/* Break if delta [time elapsed] > ns */
+	*cs++ = MI_COND_BATCH_BUFFER_END | MI_DO_COMPARE | (1 + use_64b);
+	*cs++ = ~ns_to_ctx_ticks(i915, ns); /* complemented target, matching the inverted delta */
+	*cs++ = addr + 4000; /* compares against the dword the SRM above wrote */
+	*cs++ = addr >> 32;
+
+	/* Otherwise back to recalculating delta */
+	*cs++ = MI_BATCH_BUFFER_START | 1 << 8 | use_64b;
+	*cs++ = addr + offset_in_page(jmp); /* GPU address of the loop head */
+	*cs++ = addr >> 32;
+
+	munmap(map, 4096); /* drop the CPU mapping; the commands stay in the object */
+}
+
+static struct drm_i915_gem_exec_object2
+delay_create(int i915, uint32_t ctx, /* ctx: context id placed in execbuf.rsvd1 */
+	     const struct intel_execution_engine2 *e,
+	     uint64_t target_ns)
+{
+	struct drm_i915_gem_exec_object2 obj = {
+		.handle = batch_create(i915),
+		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+	};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.rsvd1 = ctx,
+		.flags = e->flags,
+	};
+	/* Handle-derived offset; widen first so the shift is done in 64 bits */
+	obj.offset = (uint64_t)obj.handle << 12;
+	gem_execbuf(i915, &execbuf);
+	gem_sync(i915, obj.handle);
+	/* Only after that run completes is the spin loop written into the object */
+	delay(i915, e, obj.handle, obj.offset, target_ns);
+	/* Returned with PINNED set, as the loop embeds obj.offset; caller closes the handle */
+	obj.flags |= EXEC_OBJECT_PINNED;
+	return obj;
+}
+
+static bool has_ctx_timestamp(int i915, const struct intel_execution_engine2 *e)
+{
+	/*
+	 * Every engine qualifies except the video class on gen8, which was
+	 * observed to misbehave ("looks fubar").
+	 */
+	const bool gen8 = intel_gen(intel_get_drm_devid(i915)) == 8;
+	return !(gen8 && e->class == I915_ENGINE_CLASS_VIDEO);
+}
+
+static void minimum(int i915, const struct intel_execution_engine2 *e)
+{
+	struct drm_i915_gem_exec_object2 obj = {};
+	struct drm_i915_gem_execbuffer2 execbuf = {
+		.buffers_ptr = to_user_pointer(&obj),
+		.buffer_count = 1,
+		.flags = e->flags,
+	};
+	/* Running sums for a least-squares fit of busy time (y) to target (x) */
+	double sum_x = 0, sum_y = 0, sum_xx = 0, sum_xy = 0, sum_yy = 0;
+	double a, b, denom, r;
+	const int repeats = 50;
+	int samples = 0;
+	int pmu;
+
+	igt_require(has_ctx_timestamp(i915, e));
+
+	pmu = open_pmu(i915, I915_PMU_ENGINE_BUSY(e->class, e->instance));
+	for (int target = 10000; target <= 500000; target += 10000) {
+		uint64_t best = -1; /* smallest busy delta seen for this target */
+
+		obj = delay_create(i915, 0, e, target);
+		for (int pass = 0; pass < repeats; pass++) {
+			uint64_t ts[2], val[2], busy;
+
+			val[0] = __pmu_read_single(pmu, &ts[0]);
+			gem_execbuf(i915, &execbuf);
+			gem_sync(i915, obj.handle);
+			val[1] = __pmu_read_single(pmu, &ts[1]);
+			busy = val[1] - val[0];
+
+			igt_debug("delay[%d, %d]: elapsed %"PRIu64"ns, busy %"PRIu64"ns\n",
+				  target, pass, ts[1] - ts[0], busy);
+			if (busy < best)
+				best = busy;
+		}
+		gem_close(i915, obj.handle);
+
+		sum_x += target;
+		sum_y += best;
+		sum_xx += (double)target * target;
+		sum_xy += (double)target * best;
+		sum_yy += (double)best * best;
+		samples++;
+	}
+	close(pmu);
+
+	/* Least squares: busy = a + b * target, with correlation coefficient r */
+	denom = samples * sum_xx - sum_x * sum_x;
+	b = (samples * sum_xy - sum_x * sum_y) / denom;
+	a = (sum_y * sum_xx - sum_x * sum_xy) / denom;
+	r = sum_xy - sum_x * sum_y / samples;
+	r /= sqrt((sum_xx - sum_x * sum_x / samples) *
+		  (sum_yy - sum_y * sum_y / samples));
+
+	igt_info("delay = %f + %f * T; r: %f\n", a, b, r);
+	assert_within_epsilon(r, 1, 0.01);
+	assert_within_epsilon(b, 1, 0.05);
+}
+
 static void *create_mmap(int gem_fd, const struct mmap_offset *t, int sz)
 {
 	uint32_t handle;
@@ -2212,6 +2459,14 @@ igt_main
 	igt_subtest_group {
 		const unsigned int pct[] = { 2, 50, 98 };
 
+		igt_fixture {
+			/*
+			 * Sampling platforms are not expected to reach the
+			 * high accuracy criteria.
+			 */
+			igt_require(gem_scheduler_has_engine_busy_stats(fd));
+		}
+
 		/**
 		 * Check engine busyness accuracy is as expected.
 		 */
@@ -2223,6 +2478,14 @@ igt_main
 				}
 			}
 		}
+
+		igt_subtest_with_dynamic("busy-minimum") {
+			igt_require(intel_gen(intel_get_drm_devid(fd)) >= 8);
+			__for_each_physical_engine(fd, e) {
+				igt_dynamic_f("%s", e->name)
+					minimum(fd, e);
+			}
+		}
 	}
 
 	test_each_engine("busy-hang", fd, e) {
-- 
2.30.0.rc1