[Intel-gfx] [igt-dev] [PATCH i-g-t] i915/gem_exec_schedule: Try to spot unfairness

Wed Nov 25 11:25:02 UTC 2020

On 24/11/2020 23:39, Chris Wilson wrote:
> An important property for multi-client systems is that each client gets
> a 'fair' allotment of system time. (Where fairness is at the whim of the
> context properties, such as priorities.) This test forks N independent
> clients (albeit they happen to share a single vm), and does an equal
> amount of work in client and asserts that they take an equal amount of
> time.
> 
> Though we have never claimed to have a completely fair scheduler, that
> is what is expected.
> 
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Cc: Ramalingam C <ramalingam.c at intel.com>
> ---
>   tests/i915/gem_exec_schedule.c | 847 +++++++++++++++++++++++++++++++++
>   1 file changed, 847 insertions(+)
> 
> diff --git a/tests/i915/gem_exec_schedule.c b/tests/i915/gem_exec_schedule.c
> index f23d63ac3..d888efcd7 100644
> --- a/tests/i915/gem_exec_schedule.c
> +++ b/tests/i915/gem_exec_schedule.c
> @@ -29,6 +29,7 @@
>   #include <sys/poll.h>
>   #include <sys/ioctl.h>
>   #include <sys/mman.h>
> +#include <sys/resource.h>
>   #include <sys/syscall.h>
>   #include <sched.h>
>   #include <signal.h>
> @@ -2516,6 +2517,819 @@ static void measure_semaphore_power(int i915)
>   	rapl_close(&pkg);
>   }
>   
> +static int read_timestamp_frequency(int i915)
> +{
> +	int value = 0;
> +	drm_i915_getparam_t gp = {
> +		.value = &value,
> +		.param = I915_PARAM_CS_TIMESTAMP_FREQUENCY,
> +	};
> +	ioctl(i915, DRM_IOCTL_I915_GETPARAM, &gp);
> +	return value;
> +}
> +
> +static uint64_t div64_u64_round_up(uint64_t x, uint64_t y)
> +{
> +	return (x + y - 1) / y;
> +}
> +
> +static uint64_t ns_to_ctx_ticks(int i915, uint64_t ns)
> +{
> +	int f = read_timestamp_frequency(i915);
> +	if (intel_gen(intel_get_drm_devid(i915)) == 11)
> +		f = 12500000; /* icl!!! are you feeling alright? CTX vs CS */
> +	return div64_u64_round_up(ns * f, NSEC_PER_SEC);
> +}
> +
> +static uint64_t ticks_to_ns(int i915, uint64_t ticks)
> +{
> +	return div64_u64_round_up(ticks * NSEC_PER_SEC,
> +				  read_timestamp_frequency(i915));
> +}
> +
> +#define MI_INSTR(opcode, flags) (((opcode) << 23) | (flags))
> +
> +#define MI_MATH(x)                      MI_INSTR(0x1a, (x) - 1)
> +#define MI_MATH_INSTR(opcode, op1, op2) ((opcode) << 20 | (op1) << 10 | (op2))
> +/* Opcodes for MI_MATH_INSTR */
> +#define   MI_MATH_NOOP                  MI_MATH_INSTR(0x000, 0x0, 0x0)
> +#define   MI_MATH_LOAD(op1, op2)        MI_MATH_INSTR(0x080, op1, op2)
> +#define   MI_MATH_LOADINV(op1, op2)     MI_MATH_INSTR(0x480, op1, op2)
> +#define   MI_MATH_LOAD0(op1)            MI_MATH_INSTR(0x081, op1)
> +#define   MI_MATH_LOAD1(op1)            MI_MATH_INSTR(0x481, op1)
> +#define   MI_MATH_ADD                   MI_MATH_INSTR(0x100, 0x0, 0x0)
> +#define   MI_MATH_SUB                   MI_MATH_INSTR(0x101, 0x0, 0x0)
> +#define   MI_MATH_AND                   MI_MATH_INSTR(0x102, 0x0, 0x0)
> +#define   MI_MATH_OR                    MI_MATH_INSTR(0x103, 0x0, 0x0)
> +#define   MI_MATH_XOR                   MI_MATH_INSTR(0x104, 0x0, 0x0)
> +#define   MI_MATH_STORE(op1, op2)       MI_MATH_INSTR(0x180, op1, op2)
> +#define   MI_MATH_STOREINV(op1, op2)    MI_MATH_INSTR(0x580, op1, op2)
> +/* Registers used as operands in MI_MATH_INSTR */
> +#define   MI_MATH_REG(x)                (x)
> +#define   MI_MATH_REG_SRCA              0x20
> +#define   MI_MATH_REG_SRCB              0x21
> +#define   MI_MATH_REG_ACCU              0x31
> +#define   MI_MATH_REG_ZF                0x32
> +#define   MI_MATH_REG_CF                0x33
> +
> +#define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 1)
> +
> +static void delay(int i915,
> +		  const struct intel_execution_engine2 *e,
> +		  uint32_t handle,
> +		  uint64_t addr,
> +		  uint64_t ns)
> +{
> +	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
> +	const uint32_t base = gem_engine_mmio_base(i915, e->name);
> +#define CS_GPR(x) (base + 0x600 + 8 * (x))
> +#define RUNTIME (base + 0x3a8)
> +	enum { START_TS, NOW_TS };
> +	uint32_t *map, *cs, *jmp;
> +
> +	igt_require(base);
> +
> +	/* Loop until CTX_TIMESTAMP - initial > @ns */
> +
> +	cs = map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
> +
> +	*cs++ = MI_LOAD_REGISTER_IMM;
> +	*cs++ = CS_GPR(START_TS) + 4;
> +	*cs++ = 0;
> +	*cs++ = MI_LOAD_REGISTER_REG;
> +	*cs++ = RUNTIME;
> +	*cs++ = CS_GPR(START_TS);
> +
> +	while (offset_in_page(cs) & 63)
> +		*cs++ = 0;
> +	jmp = cs;
> +
> +	*cs++ = 0x5 << 23; /* MI_ARB_CHECK */
> +
> +	*cs++ = MI_LOAD_REGISTER_IMM;
> +	*cs++ = CS_GPR(NOW_TS) + 4;
> +	*cs++ = 0;
> +	*cs++ = MI_LOAD_REGISTER_REG;
> +	*cs++ = RUNTIME;
> +	*cs++ = CS_GPR(NOW_TS);
> +
> +	/* delta = now - start; inverted to match COND_BBE */
> +	*cs++ = MI_MATH(4);
> +	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS));
> +	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS));
> +	*cs++ = MI_MATH_SUB;
> +	*cs++ = MI_MATH_STOREINV(MI_MATH_REG(NOW_TS), MI_MATH_REG_ACCU);
> +
> +	/* Save delta for reading by COND_BBE */
> +	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
> +	*cs++ = CS_GPR(NOW_TS);
> +	*cs++ = addr + 4000;
> +	*cs++ = addr >> 32;
> +
> +	/* Delay between SRM and COND_BBE to post the writes */
> +	for (int n = 0; n < 8; n++) {
> +		*cs++ = MI_STORE_DWORD_IMM;
> +		if (use_64b) {
> +			*cs++ = addr + 4064;
> +			*cs++ = addr >> 32;
> +		} else {
> +			*cs++ = 0;
> +			*cs++ = addr + 4064;
> +		}
> +		*cs++ = 0;
> +	}
> +
> +	/* Break if delta [time elapsed] > ns */
> +	*cs++ = MI_COND_BATCH_BUFFER_END | MI_DO_COMPARE | (1 + use_64b);
> +	*cs++ = ~ns_to_ctx_ticks(i915, ns);
> +	*cs++ = addr + 4000;
> +	*cs++ = addr >> 32;
> +
> +	/* Otherwise back to recalculating delta */
> +	*cs++ = MI_BATCH_BUFFER_START | 1 << 8 | use_64b;
> +	*cs++ = addr + offset_in_page(jmp);
> +	*cs++ = addr >> 32;
> +
> +	munmap(map, 4096);
> +}
> +
> +static struct drm_i915_gem_exec_object2
> +delay_create(int i915, uint32_t ctx,
> +	     const struct intel_execution_engine2 *e,
> +	     uint64_t target_ns)
> +{
> +	struct drm_i915_gem_exec_object2 obj = {
> +		.handle = batch_create(i915),
> +		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
> +	};
> +	struct drm_i915_gem_execbuffer2 execbuf = {
> +		.buffers_ptr = to_user_pointer(&obj),
> +		.buffer_count = 1,
> +		.rsvd1 = ctx,
> +		.flags = e->flags,
> +	};
> +
> +	obj.offset = obj.handle << 12;
> +	gem_execbuf(i915, &execbuf);
> +	gem_sync(i915, obj.handle);
> +
> +	delay(i915, e, obj.handle, obj.offset, target_ns);
> +
> +	obj.flags |= EXEC_OBJECT_PINNED;
> +	return obj;
> +}
> +
> +static void tslog(int i915,
> +		  const struct intel_execution_engine2 *e,
> +		  uint32_t handle,
> +		  uint64_t addr)
> +{
> +	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
> +	const uint32_t base = gem_engine_mmio_base(i915, e->name);
> +#define CS_GPR(x) (base + 0x600 + 8 * (x))
> +#define CS_TIMESTAMP (base + 0x358)
> +	enum { INC, MASK, ADDR };
> +	uint32_t *timestamp_lo, *addr_lo;
> +	uint32_t *map, *cs;
> +
> +	igt_require(base);
> +
> +	map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
> +	cs = map + 512;
> +
> +	/* Record the current CS_TIMESTAMP into a journal [a 512 slot ring]. */
> +	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
> +	*cs++ = CS_TIMESTAMP;
> +	timestamp_lo = cs;
> +	*cs++ = addr;
> +	*cs++ = addr >> 32;
> +
> +	/* Load the address + inc & mask variables */
> +	*cs++ = MI_LOAD_REGISTER_IMM;
> +	*cs++ = CS_GPR(ADDR);
> +	addr_lo = cs;
> +	*cs++ = addr;
> +	*cs++ = MI_LOAD_REGISTER_IMM;
> +	*cs++ = CS_GPR(ADDR) + 4;
> +	*cs++ = addr >> 32;
> +
> +	*cs++ = MI_LOAD_REGISTER_IMM;
> +	*cs++ = CS_GPR(INC);
> +	*cs++ = 4;
> +	*cs++ = MI_LOAD_REGISTER_IMM;
> +	*cs++ = CS_GPR(INC) + 4;
> +	*cs++ = 0;
> +
> +	*cs++ = MI_LOAD_REGISTER_IMM;
> +	*cs++ = CS_GPR(MASK);
> +	*cs++ = 0xfffff7ff;
> +	*cs++ = MI_LOAD_REGISTER_IMM;
> +	*cs++ = CS_GPR(MASK) + 4;
> +	*cs++ = 0xffffffff;
> +
> +	/* Increment the [ring] address for saving CS_TIMESTAMP */
> +	*cs++ = MI_MATH(8);
> +	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(INC));
> +	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(ADDR));
> +	*cs++ = MI_MATH_ADD;
> +	*cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);
> +	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(ADDR));
> +	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(MASK));
> +	*cs++ = MI_MATH_AND;
> +	*cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);
> +
> +	/* Rewrite the batch buffer for the next execution */
> +	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
> +	*cs++ = CS_GPR(ADDR);
> +	*cs++ = addr + offset_in_page(timestamp_lo);
> +	*cs++ = addr >> 32;
> +	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
> +	*cs++ = CS_GPR(ADDR);
> +	*cs++ = addr + offset_in_page(addr_lo);
> +	*cs++ = addr >> 32;
> +
> +	*cs++ = MI_BATCH_BUFFER_END;
> +
> +	munmap(map, 4096);
> +}
> +
> +static struct drm_i915_gem_exec_object2
> +tslog_create(int i915, uint32_t ctx, const struct intel_execution_engine2 *e)
> +{
> +	struct drm_i915_gem_exec_object2 obj = {
> +		.handle = batch_create(i915),
> +		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
> +	};
> +	struct drm_i915_gem_execbuffer2 execbuf = {
> +		.buffers_ptr = to_user_pointer(&obj),
> +		.buffer_count = 1,
> +		.rsvd1 = ctx,
> +		.flags = e->flags,
> +	};
> +
> +	obj.offset = obj.handle << 12;
> +	gem_execbuf(i915, &execbuf);
> +	gem_sync(i915, obj.handle);
> +
> +	tslog(i915, e, obj.handle, obj.offset);
> +
> +	obj.flags |= EXEC_OBJECT_PINNED;
> +	return obj;
> +}
> +
> +static int cmp_u32(const void *A, const void *B)
> +{
> +	const uint32_t *a = A, *b = B;
> +
> +	if (*a < *b)
> +		return -1;
> +	else if (*a > *b)
> +		return 1;
> +	else
> +		return 0;
> +}
> +
> +static bool has_ctx_timestamp(int i915, const struct intel_execution_engine2 *e)
> +{
> +	const int gen = intel_gen(intel_get_drm_devid(i915));
> +
> +	if (gen == 8 && e->class == I915_ENGINE_CLASS_VIDEO)
> +		return false; /* looks fubar */
> +
> +	return true;
> +}
> +
> +static struct intel_execution_engine2
> +pick_random_engine(int i915, const struct intel_execution_engine2 *not)
> +{
> +	const struct intel_execution_engine2 *e;
> +	unsigned int count = 0;
> +
> +	__for_each_physical_engine(i915, e) {
> +		if (e->flags == not->flags)
> +			continue;
> +		if (!gem_class_has_mutable_submission(i915, e->class))
> +			continue;
> +		count++;
> +	}
> +	if (!count)
> +		return *not;
> +
> +	count = rand() % count;
> +	__for_each_physical_engine(i915, e) {
> +		if (e->flags == not->flags)
> +			continue;
> +		if (!gem_class_has_mutable_submission(i915, e->class))
> +			continue;
> +		if (!count--)
> +			break;
> +	}
> +
> +	return *e;
> +}
> +
> +static void fair_child(int i915, uint32_t ctx,
> +		       const struct intel_execution_engine2 *e,
> +		       uint64_t frame_ns,
> +		       int timeline,
> +		       uint32_t common,
> +		       unsigned int flags,
> +		       unsigned long *ctl,
> +		       unsigned long *out)
> +#define F_SYNC		(1 << 0)
> +#define F_PACE		(1 << 1)
> +#define F_FLOW		(1 << 2)
> +#define F_HALF		(1 << 3)
> +#define F_SOLO		(1 << 4)
> +#define F_SPARE		(1 << 5)
> +#define F_NEXT		(1 << 6)
> +#define F_VIP		(1 << 7)
> +#define F_RRUL		(1 << 8)
> +#define F_SHARE		(1 << 9)
> +#define F_PING		(1 << 10)
> +#define F_THROTTLE	(1 << 11)
> +#define F_ISOLATE	(1 << 12)
> +{
> +	const int batches_per_frame = flags & F_SOLO ? 1 : 3;
> +	struct drm_i915_gem_exec_object2 obj[4] = {
> +		{},
> +		{
> +			.handle = common ?: gem_create(i915, 4096),
> +		},
> +		delay_create(i915, ctx, e, frame_ns / batches_per_frame),
> +		delay_create(i915, ctx, e, frame_ns / batches_per_frame),
> +	};
> +	struct intel_execution_engine2 ping = *e;
> +	int p_fence = -1, n_fence = -1;
> +	unsigned long count = 0;
> +	int n;
> +
> +	srandom(getpid());
> +	if (flags & F_PING)
> +		ping = pick_random_engine(i915, e);
> +	obj[0] = tslog_create(i915, ctx, &ping);
> +
> +	while (!READ_ONCE(*ctl)) {
> +		struct drm_i915_gem_execbuffer2 execbuf = {
> +			.buffers_ptr = to_user_pointer(obj),
> +			.buffer_count = 4,
> +			.rsvd1 = ctx,
> +			.rsvd2 = -1,
> +			.flags = e->flags,
> +		};
> +
> +		if (flags & F_FLOW) {
> +			unsigned int seq;
> +
> +			seq = count;
> +			if (flags & F_NEXT)
> +				seq++;
> +
> +			execbuf.rsvd2 =
> +				sw_sync_timeline_create_fence(timeline, seq);
> +			execbuf.flags |= I915_EXEC_FENCE_IN;
> +		}
> +
> +		execbuf.flags |= I915_EXEC_FENCE_OUT;
> +		gem_execbuf_wr(i915, &execbuf);
> +		n_fence = execbuf.rsvd2 >> 32;
> +		execbuf.flags &= ~(I915_EXEC_FENCE_OUT | I915_EXEC_FENCE_IN);
> +		for (n = 1; n < batches_per_frame; n++)
> +			gem_execbuf(i915, &execbuf);
> +		close(execbuf.rsvd2);
> +
> +		execbuf.buffer_count = 1;
> +		execbuf.batch_start_offset = 2048;
> +		execbuf.flags = ping.flags | I915_EXEC_FENCE_IN;
> +		execbuf.rsvd2 = n_fence;
> +		gem_execbuf(i915, &execbuf);
> +
> +		if (flags & F_PACE && p_fence != -1) {
> +			struct pollfd pfd = {
> +				.fd = p_fence,
> +				.events = POLLIN,
> +			};
> +			poll(&pfd, 1, -1);
> +		}
> +		close(p_fence);
> +
> +		if (flags & F_SYNC) {
> +			struct pollfd pfd = {
> +				.fd = n_fence,
> +				.events = POLLIN,
> +			};
> +			poll(&pfd, 1, -1);
> +		}
> +
> +		if (flags & F_THROTTLE)
> +			igt_ioctl(i915, DRM_IOCTL_I915_GEM_THROTTLE, 0);
> +
> +		igt_swap(obj[2], obj[3]);
> +		igt_swap(p_fence, n_fence);

What are the sync fences simulating, and how come they are always used?
I mean, are there no children which submit a batched-up load?

> +		count++;
> +	}
> +	close(p_fence);
> +
> +	gem_close(i915, obj[3].handle);
> +	gem_close(i915, obj[2].handle);
> +	if (obj[1].handle != common)
> +		gem_close(i915, obj[1].handle);
> +
> +	gem_sync(i915, obj[0].handle);
> +	if (out) {
> +		uint32_t *map;
> +
> +		map = gem_mmap__device_coherent(i915, obj[0].handle,
> +						0, 4096, PROT_WRITE);
> +		for (n = 1; n < min(count, 512); n++) {
> +			igt_assert(map[n]);
> +			map[n - 1] = map[n] - map[n - 1];
> +		}
> +		qsort(map, --n, sizeof(*map), cmp_u32);
> +		*out = ticks_to_ns(i915, map[n / 2]);

What is returned? Could you explain the ts journal part a bit?

> +		munmap(map, 4096);
> +	}
> +	gem_close(i915, obj[0].handle);
> +}
> +
> +static int cmp_ul(const void *A, const void *B)
> +{
> +	const unsigned long *a = A, *b = B;
> +
> +	if (*a < *b)
> +		return -1;
> +	else if (*a > *b)
> +		return 1;
> +	else
> +		return 0;
> +}
> +
> +static uint64_t d_cpu_time(const struct rusage *a, const struct rusage *b)
> +{
> +	uint64_t cpu_time = 0;
> +
> +	cpu_time += (a->ru_utime.tv_sec - b->ru_utime.tv_sec) * NSEC_PER_SEC;
> +	cpu_time += (a->ru_utime.tv_usec - b->ru_utime.tv_usec) * 1000;
> +
> +	cpu_time += (a->ru_stime.tv_sec - b->ru_stime.tv_sec) * NSEC_PER_SEC;
> +	cpu_time += (a->ru_stime.tv_usec - b->ru_stime.tv_usec) * 1000;
> +
> +	return cpu_time;
> +}
> +
> +static void timeline_advance(int timeline, int delay_ns)
> +{
> +	struct timespec tv = { .tv_nsec = delay_ns };
> +	nanosleep(&tv, NULL);
> +	sw_sync_timeline_inc(timeline, 1);
> +}
> +
> +static void fairness(int i915,
> +		     const struct intel_execution_engine2 *e,
> +		     int timeout, unsigned int flags)
> +{
> +	const int frame_ns = 16666 * 1000;
> +	const int fence_ns = flags & F_HALF ? 2 * frame_ns : frame_ns;
> +	unsigned long *result;
> +	uint32_t common = 0;
> +
> +	igt_require(has_ctx_timestamp(i915, e));
> +	igt_require(gem_class_has_mutable_submission(i915, e->class));
> +
> +	if (flags & F_SHARE)
> +		common = gem_create(i915, 4095);
> +
> +	result = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
> +
> +	for (int n = 2; n <= 64; n <<= 1) { /* 32 == 500us per client */
> +		int timeline = sw_sync_timeline_create();
> +		int nfences = timeout * NSEC_PER_SEC / fence_ns + 1;
> +		const int nchild = n - 1; /* odd for easy medians */
> +		const int child_ns = frame_ns / (nchild + !!(flags & F_SPARE));
> +		const int lo = nchild / 4;
> +		const int hi = (3 * nchild + 3) / 4 - 1;
> +		struct rusage old_usage, usage;
> +		uint64_t cpu_time, d_time;
> +		unsigned long vip = -1;
> +		struct timespec tv;
> +		struct igt_mean m;
> +
> +		if (flags & F_PING) {
> +			struct intel_execution_engine2 *ping;
> +
> +			__for_each_physical_engine(i915, ping) {
> +				if (ping->flags == e->flags)
> +					continue;
> +
> +				igt_fork(child, 1) {
> +					uint32_t ctx = gem_context_clone_with_engines(i915, 0);
> +
> +					fair_child(i915, ctx, ping,
> +						   child_ns / 8,
> +						   -1, common,
> +						   F_SOLO | F_PACE | F_SHARE,
> +						   &result[nchild],
> +						   NULL);
> +
> +					gem_context_destroy(i915, ctx);
> +				}
> +			}
> +		}
> +
> +		memset(result, 0, (nchild + 1) * sizeof(result[0]));

Children probably can't write into it before this point, but the memset
would still be better moved before the first fork (which passes the
results array to the children).

> +		getrusage(RUSAGE_CHILDREN, &old_usage);
> +		igt_nsec_elapsed(memset(&tv, 0, sizeof(tv)));
> +		igt_fork(child, nchild) {
> +			uint32_t ctx;
> +
> +			if (flags & F_ISOLATE) {
> +				int clone, dmabuf = -1;
> +
> +				if (common)
> +					dmabuf = prime_handle_to_fd(i915, common);
> +
> +				clone = gem_reopen_driver(i915);
> +				gem_context_copy_engines(i915, 0, clone, 0);
> +				i915 = clone;
> +
> +				if (dmabuf != -1)
> +					common = prime_fd_to_handle(i915, dmabuf);
> +			}
> +
> +			ctx = gem_context_clone_with_engines(i915, 0);
> +
> +			if (flags & F_VIP && child == 0) {
> +				gem_context_set_priority(i915, ctx, MAX_PRIO);
> +				flags |= F_FLOW;
> +			}
> +			if (flags & F_RRUL && child == 0)
> +				flags |= F_SOLO | F_FLOW | F_SYNC;
> +
> +			fair_child(i915, ctx, e, child_ns,
> +				   timeline, common, flags,
> +				   &result[nchild],
> +				   &result[child]);
> +
> +			gem_context_destroy(i915, ctx);
> +		}
> +
> +		while (nfences--)
> +			timeline_advance(timeline, fence_ns);
> +
> +		result[nchild] = 1;
> +		for (int child = 0; child < nchild; child++) {
> +			while (!READ_ONCE(result[child]))
> +				timeline_advance(timeline, fence_ns);
> +		}
> +
> +		igt_waitchildren();
> +		close(timeline);
> +
> +		/* Are we running out of CPU time, and fail to submit frames? */
> +		d_time = igt_nsec_elapsed(&tv);
> +		getrusage(RUSAGE_CHILDREN, &usage);
> +		cpu_time = d_cpu_time(&usage, &old_usage);
> +		if (10 * cpu_time > 9 * d_time) {
> +			if (nchild > 7)
> +				break;
> +
> +			igt_skip_on_f(10 * cpu_time > 9 * d_time,
> +				      "%.0f%% CPU usage, presuming capacity exceeded\n",
> +				      100. * cpu_time / d_time);

Aren't the children mostly sleeping, waiting on fences and the like? And
if so, how/when does the test end up using a lot of CPU time?

> +		}
> +
> +		igt_mean_init(&m);
> +		for (int child = 0; child < nchild; child++)
> +			igt_mean_add(&m, result[child]);
> +
> +		if (flags & (F_VIP | F_RRUL))
> +			vip = result[0];
> +
> +		qsort(result, nchild, sizeof(*result), cmp_ul);
> +		igt_info("%2d clients, range: [%.1f, %.1f], iqr: [%.1f, %.1f], median: %.1f, mean: %.1f ± %.2f ms\n",
> +			 nchild,
> +			 1e-6 * result[0],  1e-6 * result[nchild - 1],
> +			 1e-6 * result[lo], 1e-6 * result[hi],
> +			 1e-6 * result[nchild / 2],
> +			 1e-6 * igt_mean_get(&m),
> +			 1e-6 * sqrt(igt_mean_get_variance(&m)));
> +
> +		if (vip != -1) {
> +			igt_info("VIP interval %.2f ms\n", 1e-6 * vip);
> +			igt_assert(4 * vip > 3 * fence_ns &&
> +				   3 * vip < 4 * fence_ns);
> +		}
> +
> +		/* May be slowed due to sheer volume of context switches */
> +		igt_assert(4 * igt_mean_get(&m) > 3 * fence_ns &&
> +			       igt_mean_get(&m) < 3 * fence_ns);
> +
> +		igt_assert(4 * igt_mean_get(&m) > 3 * result[nchild / 2] &&
> +			   3 * igt_mean_get(&m) < 4 * result[nchild / 2]);
> +
> +		igt_assert(2 * (result[hi] - result[lo]) < result[nchild / 2]);

Please put some human-readable text above the asserts explaining the
criteria.

The VIP child takes part in the mean - does that not affect the result?

> +	}
> +
> +	munmap(result, 4096);
> +	if (common)
> +		gem_close(i915, common);
> +}
> +
> +static void test_fairness(int i915, int timeout)
> +{
> +	static const struct {
> +		const char *name;
> +		unsigned int flags;
> +	} fair[] = {
> +		/*
> +		 * none - maximal greed in each client
> +		 *
> +		 * Push as many frames from each client as fast as possible
> +		 */
> +		{ "none",       0 },
> +		{ "none-vip",   F_VIP }, /* one vip client must meet deadlines */
> +		{ "none-solo",  F_SOLO }, /* 1 batch per frame per client */
> +		{ "none-share", F_SHARE }, /* read from a common buffer */
> +		{ "none-rrul",  F_RRUL }, /* "realtime-response under load" */
> +		{ "none-ping",  F_PING }, /* measure inter-engine fairness */
> +
> +		/*
> +		 * throttle - original per client throttling
> +		 *
> +		 * Used for front buffering rendering where there is no
> +		 * extenal frame marker. Each client tries to only keep
> +		 * 20ms of work submitted, though that measurement is
> +		 * flawed...
> +		 *
> +		 * This is used by Xorg to try and maintain some resembalance
> +		 * of input/output consistency when being feed a continuous
> +		 * stream of X11 draw requests straight into scanout, where
> +		 * the clients may submit the work faster than can be drawn.
> +		 *
> +		 * Throttling tracks requests per-file (and assumes that
> +		 * all requests are in submission order across the whole file),
> +		 * so we split each child to its own fd.
> +		 */
> +		{ "throttle",       F_THROTTLE | F_ISOLATE },
> +		{ "throttle-vip",   F_THROTTLE | F_ISOLATE | F_VIP },
> +		{ "throttle-solo",  F_THROTTLE | F_ISOLATE | F_SOLO },
> +		{ "throttle-share", F_THROTTLE | F_ISOLATE | F_SHARE },
> +		{ "throttle-rrul",  F_THROTTLE | F_ISOLATE | F_RRUL },
> +
> +		/*
> +		 * pace - mesa "submit double buffering"
> +		 *
> +		 * Submit a frame, wait for previous frame to start. This
> +		 * prevents each client from getting too far ahead of its
> +		 * rendering, maintaining a consistent input/output latency.
> +		 */
> +		{ "pace",       F_PACE },
> +		{ "pace-solo",  F_PACE | F_SOLO},
> +		{ "pace-share", F_PACE | F_SHARE},
> +		{ "pace-ping",  F_PACE | F_SHARE | F_PING},
> +
> +		/* sync - only submit a frame at a time */
> +		{ "sync",      F_SYNC },
> +		{ "sync-vip",  F_SYNC | F_VIP },
> +		{ "sync-solo", F_SYNC | F_SOLO },
> +
> +		/* flow - synchronise execution against the clock (vblank) */
> +		{ "flow",       F_PACE | F_FLOW },
> +		{ "flow-share", F_PACE | F_FLOW | F_SHARE },
> +		{ "flow-ping",  F_PACE | F_FLOW | F_SHARE | F_PING },
> +
> +		/* next - submit ahead of the clock (vblank double buffering) */
> +		{ "next",       F_PACE | F_FLOW | F_NEXT },
> +		{ "next-share", F_PACE | F_FLOW | F_NEXT | F_SHARE },
> +		{ "next-ping",  F_PACE | F_FLOW | F_NEXT | F_SHARE | F_PING },
> +
> +		/* spare - underutilise by a single client timeslice */
> +		{ "spare", F_PACE | F_FLOW | F_SPARE },
> +
> +		/* half - run at half pace (submit 16ms of work every 32ms) */
> +		{ "half",  F_PACE | F_FLOW | F_HALF },
> +
> +		{}
> +	};
> +
> +	igt_fixture {
> +		igt_info("CS timestamp frequency: %d\n",
> +			 read_timestamp_frequency(i915));
> +
> +		igt_require(intel_gen(intel_get_drm_devid(i915)) >= 8);
> +	}
> +
> +	for (typeof(*fair) *f = fair; f->name; f++) {
> +		igt_subtest_with_dynamic_f("fair-%s", f->name)  {
> +			const struct intel_execution_engine2 *e;
> +
> +			__for_each_physical_engine(i915, e) {
> +				if (!gem_class_can_store_dword(i915, e->class))
> +					continue;
> +
> +				igt_dynamic_f("%s", e->name)
> +					fairness(i915, e, timeout, f->flags);
> +			}
> +		}
> +	}
> +}
> +
> +static uint32_t read_ctx_timestamp(int i915,
> +				   uint32_t ctx,
> +				   const struct intel_execution_engine2 *e)
> +{
> +	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
> +	const uint32_t base = gem_engine_mmio_base(i915, e->name);
> +	struct drm_i915_gem_relocation_entry reloc;
> +	struct drm_i915_gem_exec_object2 obj = {
> +		.handle = gem_create(i915, 4096),
> +		.offset = 32 << 20,
> +		.relocs_ptr = to_user_pointer(&reloc),
> +		.relocation_count = 1,
> +	};
> +	struct drm_i915_gem_execbuffer2 execbuf = {
> +		.buffers_ptr = to_user_pointer(&obj),
> +		.buffer_count = 1,
> +		.flags = e->flags,
> +		.rsvd1 = ctx,
> +	};
> +#define RUNTIME (base + 0x3a8)
> +	uint32_t *map, *cs;
> +	uint32_t ts;
> +
> +	igt_require(base);
> +
> +	cs = map = gem_mmap__device_coherent(i915, obj.handle,
> +					     0, 4096, PROT_WRITE);
> +
> +	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
> +	*cs++ = RUNTIME;
> +	memset(&reloc, 0, sizeof(reloc));
> +	reloc.target_handle = obj.handle;
> +	reloc.presumed_offset = obj.offset;
> +	reloc.offset = offset_in_page(cs);
> +	reloc.delta = 4000;
> +	*cs++ = obj.offset + 4000;
> +	*cs++ = obj.offset >> 32;
> +
> +	*cs++ = MI_BATCH_BUFFER_END;
> +
> +	gem_execbuf(i915, &execbuf);
> +	gem_sync(i915, obj.handle);
> +	gem_close(i915, obj.handle);
> +
> +	ts = map[1000];
> +	munmap(map, 4096);
> +
> +	return ts;
> +}
> +
> +static void fairslice(int i915,
> +		      const struct intel_execution_engine2 *e,
> +		      unsigned long flags)
> +{
> +	igt_spin_t *spin = NULL;
> +	uint32_t ctx[3];
> +	uint32_t ts[3];
> +
> +	for (int i = 0; i < ARRAY_SIZE(ctx); i++) {
> +		ctx[i] = gem_context_clone_with_engines(i915, 0);
> +		if (spin == NULL) {
> +			spin = __igt_spin_new(i915,
> +					      .ctx = ctx[i],
> +					      .engine = e->flags,
> +					      .flags = flags);
> +		} else {
> +			struct drm_i915_gem_execbuffer2 eb = {
> +				.buffer_count = 1,
> +				.buffers_ptr = to_user_pointer(&spin->obj[IGT_SPIN_BATCH]),
> +				.flags = e->flags,
> +				.rsvd1 = ctx[i],
> +			};
> +			gem_execbuf(i915, &eb);
> +		}
> +	}
> +
> +	sleep(2); /* over the course of many timeslices */
> +
> +	igt_assert(gem_bo_busy(i915, spin->handle));
> +	igt_spin_end(spin);
> +	for (int i = 0; i < ARRAY_SIZE(ctx); i++)
> +		ts[i] = read_ctx_timestamp(i915, ctx[i], e);
> +
> +	for (int i = 0; i < ARRAY_SIZE(ctx); i++)
> +		gem_context_destroy(i915, ctx[i]);
> +	igt_spin_free(i915, spin);
> +
> +	qsort(ts, 3, sizeof(*ts), cmp_u32);
> +	igt_info("%s: [%.1f, %.1f] ms\n", e->name,
> +		 1e-6 * ticks_to_ns(i915, ts[0]),
> +		 1e-6 * ticks_to_ns(i915, ts[2]));

Might as well log all three?

> +
> +	igt_assert(ts[0] && ts[2] > ts[0]);
> +	igt_assert(4 * ts[0] > 3 * ts[2]);

Three equal-priority contexts - why would the distribution be expected
to be unfair? Intuitively I'd expect a check that all three are within
some tolerance of each other, but okay, min and max is good enough; I
just don't understand the asserts. Max can just as well be equal to min,
no? I mean, the scheduler would still be considered fair. We should
ignore the submission order I think, if that was the point.

> +}
> +
>   #define test_each_engine(T, i915, e) \
>   	igt_subtest_with_dynamic(T) __for_each_physical_engine(i915, e) \
>   		igt_dynamic_f("%s", e->name)
> @@ -2582,6 +3396,35 @@ igt_main
>   		test_each_engine("lateslice", fd, e)
>   			lateslice(fd, e->flags);
>   
> +		igt_subtest_group {
> +			igt_fixture {
> +				igt_require(gem_scheduler_has_semaphores(fd));
> +				igt_require(gem_scheduler_has_preemption(fd));
> +				igt_require(intel_gen(intel_get_drm_devid(fd)) >= 8);
> +			}
> +
> +			test_each_engine("fairslice", fd, e)
> +				fairslice(fd, e, 0);
> +
> +			test_each_engine("u-fairslice", fd, e)
> +				fairslice(fd, e, IGT_SPIN_USERPTR);
> +
> +			igt_subtest("fairslice-all")  {
> +				__for_each_physical_engine(fd, e) {
> +					igt_fork(child, 1)
> +						fairslice(fd, e, 0);
> +				}
> +				igt_waitchildren();
> +			}
> +			igt_subtest("u-fairslice-all")  {
> +				__for_each_physical_engine(fd, e) {
> +					igt_fork(child, 1)
> +						fairslice(fd, e, IGT_SPIN_USERPTR);
> +				}
> +				igt_waitchildren();
> +			}
> +		}
> +
>   		test_each_engine("submit-early-slice", fd, e)
>   			submit_slice(fd, e, EARLY_SUBMIT);
>   		test_each_engine("submit-golden-slice", fd, e)
> @@ -2610,6 +3453,10 @@ igt_main
>   		test_each_engine_store("promotion", fd, e)
>   			promotion(fd, e->flags);
>   
> +		igt_subtest_group {
> +			test_fairness(fd, 2);
> +		}
> +
>   		igt_subtest_group {
>   			igt_fixture {
>   				igt_require(gem_scheduler_has_preemption(fd));
> 

Seems clean and logical on the high level and on the implementation
level. On the "medium" level I don't claim I tried to understand
everything, but that is not critically important. By medium level I mean
all the different test scenarios, where the important thing is that as
long as all children are doing the same thing, which I think they are
(with the small open question of VIP), it seems correct to test that
they get an equal amount of GPU time.

Do all subtests pass with the fair scheduler patches?

Regards,

Tvrtko