[Intel-gfx] [PATCH i-g-t v3] benchmarks/gem_wsim: Command submission workload simulator
Chris Wilson
chris at chris-wilson.co.uk
Thu Apr 6 08:55:26 UTC 2017
On Thu, Apr 06, 2017 at 09:18:36AM +0100, Tvrtko Ursulin wrote:
>
> On 05/04/2017 17:48, Chris Wilson wrote:
> >On Wed, Apr 05, 2017 at 05:14:01PM +0100, Tvrtko Ursulin wrote:
> >>+static void
> >>+__emit_bb_end(struct w_step *w, bool terminate, bool seqnos, uint32_t seqno)
> >>+{
> >>+ const uint32_t bbe = 0xa << 23;
> >>+ unsigned long bb_sz = get_bb_sz(&w->duration);
> >>+ unsigned long mmap_start, cmd_offset, mmap_len;
> >>+ uint32_t *ptr, *cs;
> >>+
> >>+ mmap_len = (seqnos ? 5 : 1) * sizeof(uint32_t);
> >>+ cmd_offset = bb_sz - mmap_len;
> >>+ mmap_start = rounddown(cmd_offset, PAGE_SIZE);
> >>+ mmap_len += cmd_offset - mmap_start;
> >>+
> >>+ gem_set_domain(fd, w->bb_handle,
> >>+ I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU);
> >>+
> >>+ ptr = gem_mmap__cpu(fd, w->bb_handle, mmap_start, mmap_len, PROT_WRITE);
> >>+ cs = (uint32_t *)((char *)ptr + cmd_offset - mmap_start);
> >>+
> >>+ if (seqnos) {
> >>+ const int gen = intel_gen(intel_get_drm_devid(fd));
> >>+
> >>+ igt_assert(gen >= 8);
> >>+
> >>+ w->reloc.offset = bb_sz - 4 * sizeof(uint32_t);
> >>+ w->seqno_offset = bb_sz - 2 * sizeof(uint32_t);
> >>+
> >>+ *cs++ = terminate ? MI_STORE_DWORD_IMM : 0;
> >>+ *cs++ = 0;
> >>+ *cs++ = 0;
> >>+ *cs++ = seqno;
> >>+ }
> >>+
> >>+ *cs = terminate ? bbe : 0;
> >>+
> >>+ munmap(ptr, mmap_len);
> >>+}
> >>+
> >>+static void terminate_bb(struct w_step *w, bool seqnos, uint32_t seqno)
> >>+{
> >>+ __emit_bb_end(w, true, seqnos, seqno);
> >>+}
> >>+
> >>+static void unterminate_bb(struct w_step *w, bool seqnos)
> >>+{
> >>+ __emit_bb_end(w, false, seqnos, 0);
> >>+}
> >>+
> >>+static void
> >>+prepare_workload(struct workload *wrk, bool swap_vcs, bool seqnos)
> >>+{
> >>+ int max_ctx = -1;
> >>+ struct w_step *w;
> >>+ int i;
> >>+
> >>+ if (seqnos) {
> >>+ const unsigned int status_sz = sizeof(uint32_t);
> >>+
> >>+ for (i = 0; i < NUM_ENGINES; i++) {
> >>+ wrk->status_page_handle[i] = gem_create(fd, status_sz);
> >
> >Need to set_cache_level(CACHED) for llc.
> >
> >You can use one page for all engines. Just use a different cacheline
> >for each, for safety.
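A sketch of the single-status-page layout (NUM_ENGINES, CACHELINE and the helper name are assumptions here, not from the patch): one object for all engines, with each engine's seqno dword in its own cacheline so the writes don't false-share.

```c
#include <assert.h>
#include <stdint.h>

#define NUM_ENGINES 5		/* assumption: RCS, BCS, VCS1, VCS2, VECS */
#define CACHELINE 64		/* one cacheline per engine, for safety */
#define STATUS_PAGE_SZ 4096	/* a single shared status page */

/* Offset of an engine's seqno dword within the shared status page. */
static uint32_t status_offset(unsigned int engine)
{
	assert(engine < NUM_ENGINES);
	return engine * CACHELINE;
}
```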
> >
> >>+ wrk->status_page[i] =
> >>+ gem_mmap__cpu(fd, wrk->status_page_handle[i],
> >>+ 0, status_sz, PROT_READ);
> >>+ }
> >>+ }
> >>+
> >>+ for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
> >>+ if ((int)w->context > max_ctx) {
> >>+ int delta = w->context + 1 - wrk->nr_ctxs;
> >>+
> >>+ wrk->nr_ctxs += delta;
> >>+ wrk->ctx_id = realloc(wrk->ctx_id,
> >>+ wrk->nr_ctxs * sizeof(uint32_t));
> >>+ memset(&wrk->ctx_id[wrk->nr_ctxs - delta], 0,
> >>+ delta * sizeof(uint32_t));
> >>+
> >>+ max_ctx = w->context;
> >>+ }
> >>+
> >>+ if (!wrk->ctx_id[w->context]) {
> >>+ struct drm_i915_gem_context_create arg = {};
> >>+
> >>+ drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &arg);
> >>+ igt_assert(arg.ctx_id);
> >>+
> >>+ wrk->ctx_id[w->context] = arg.ctx_id;
> >>+ }
> >>+ }
> >>+
> >>+ for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
> >>+ enum intel_engine_id engine = w->engine;
> >>+ unsigned int bb_i, j = 0;
> >>+
> >>+ if (w->type != BATCH)
> >>+ continue;
> >>+
> >>+ w->obj[j].handle = gem_create(fd, 4096);
> >>+ w->obj[j].flags = EXEC_OBJECT_WRITE;
> >>+ j++;
> >>+
> >>+ if (seqnos) {
> >>+ w->obj[j].handle = wrk->status_page_handle[engine];
> >>+ w->obj[j].flags = EXEC_OBJECT_WRITE;
> >
> >The trick for sharing between engines is to not mark this as a WRITE.
> >Fun little lies.
>
> Yeah, that's why I have per-engine objects. Which I don't mind since
> it is not like they are wasting any resources compared to everything
> else. But not admitting the write still sounds interesting. What
> would the repercussions of that be - limit us to llc platforms or
> something?
It used to be that if we evicted the object (e.g. mempressure/suspend),
then we would not save the contents since it was never marked as dirty.
However, between libva being buggy and Vk deliberately eschewing write
hazards, we had to always mark GPU usage as dirtying the buffers. So
nowadays, EXEC_OBJECT_WRITE only means to track the implicit write
hazard.
>
> >>+ j++;
> >>+ }
> >>+
> >>+ bb_i = j++;
> >>+ w->duration.cur = w->duration.max;
> >>+ w->bb_sz = get_bb_sz(&w->duration);
> >>+ w->bb_handle = w->obj[bb_i].handle = gem_create(fd, w->bb_sz);
> >>+ terminate_bb(w, seqnos, 0);
> >>+ if (seqnos) {
> >>+ w->reloc.presumed_offset = -1;
> >>+ w->reloc.target_handle = 1;
> >>+ w->reloc.read_domains = I915_GEM_DOMAIN_INSTRUCTION;
> >>+ w->reloc.write_domain = I915_GEM_DOMAIN_INSTRUCTION;
> >
> >Ugh. That's a magic w/a value for pipecontrols. Fortunately we don't want
> >to set write_domain here anyway.
>
> I think I copy-pasted this from another IGT. So you're saying cheat
> here as well and set zero for both domains?
Technically the MI is outside of all the GPU cache domains we have :)
Which you pick is immaterial, aside from understanding that
(INSTRUCTION, INSTRUCTION) is special ;)
If you were to drop EXEC_OBJECT_WRITE, you would also drop
reloc.write_domain.
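Spelled out as a sketch (the struct mirrors drm_i915_gem_relocation_entry; the helper name and field values are illustrative, not from the patch): with EXEC_OBJECT_WRITE dropped on the status object, the reloc carries zero for both domains, which is fine since MI stores sit outside the GPU cache domains anyway.

```c
#include <stdint.h>

struct reloc_entry_sketch {	/* mirrors drm_i915_gem_relocation_entry */
	uint32_t target_handle;
	uint32_t delta;
	uint64_t offset;
	uint64_t presumed_offset;
	uint32_t read_domains;
	uint32_t write_domain;
};

static void init_seqno_reloc(struct reloc_entry_sketch *r,
			     uint64_t bb_offset)
{
	r->target_handle = 1;		/* status page, HANDLE_LUT index */
	r->delta = 0;
	r->offset = bb_offset;		/* address field of the store dword */
	r->presumed_offset = (uint64_t)-1; /* unknown until first execbuf */
	r->read_domains = 0;		/* MI commands: no cache domain */
	r->write_domain = 0;		/* dropped with EXEC_OBJECT_WRITE */
}
```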
> >>+ }
> >>+
> >>+ igt_assert(w->dependency <= 0);
> >>+ if (w->dependency) {
> >>+ int dep_idx = i + w->dependency;
> >>+
> >>+ igt_assert(dep_idx >= 0 && dep_idx < wrk->nr_steps);
> >>+ igt_assert(wrk->steps[dep_idx].type == BATCH);
> >>+
> >>+ w->obj[j].handle = w->obj[bb_i].handle;
> >>+ bb_i = j;
> >>+ w->obj[j - 1].handle =
> >>+ wrk->steps[dep_idx].obj[0].handle;
> >>+ j++;
> >>+ }
> >>+
> >>+ if (seqnos) {
> >>+ w->obj[bb_i].relocs_ptr = to_user_pointer(&w->reloc);
> >>+ w->obj[bb_i].relocation_count = 1;
> >>+ }
> >>+
> >>+ w->eb.buffers_ptr = to_user_pointer(w->obj);
> >>+ w->eb.buffer_count = j;
> >>+ w->eb.rsvd1 = wrk->ctx_id[w->context];
> >>+
> >>+ if (swap_vcs && engine == VCS1)
> >>+ engine = VCS2;
> >>+ else if (swap_vcs && engine == VCS2)
> >>+ engine = VCS1;
> >>+ w->eb.flags = eb_engine_map[engine];
> >>+ w->eb.flags |= I915_EXEC_HANDLE_LUT;
> >>+ if (!seqnos)
> >>+ w->eb.flags |= I915_EXEC_NO_RELOC;
> >
> >Doesn't look too hard to get the relocation right. Forcing relocations
> >between batches is probably a good one to check (just to say don't do
> >that)
>
> I am not following here. Are you saying don't do relocations at all?
> How do I make sure things stay fixed, and how do I find out where
> they are in the first pass?
Depending on the workload, it may be informative to also do comparisons
between NORELOC and always RELOC. Personally I would make sure we were
using NORELOC as this should be a simulator/example.
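To spell out the NO_RELOC contract (sketch; `needs_reloc` is a made-up helper, not kernel API): the kernel can skip relocation processing only while every presumed_offset baked into the batch still matches where the object is actually bound, so a NORELOC-vs-RELOC comparison amounts to whether this check ever fires.

```c
#include <stdbool.h>
#include <stdint.h>

/* True if the offset we baked into the batch no longer matches where
 * the object is actually bound, forcing relocation processing even
 * when I915_EXEC_NO_RELOC was requested. */
static bool needs_reloc(uint64_t presumed_offset, uint64_t bound_offset)
{
	return presumed_offset != bound_offset;
}
```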
> >>+static void update_bb_seqno(struct w_step *w, uint32_t seqno)
> >>+{
> >>+ unsigned long mmap_start, mmap_offset, mmap_len;
> >>+ void *ptr;
> >>+
> >>+ mmap_start = rounddown(w->seqno_offset, PAGE_SIZE);
> >>+ mmap_offset = w->seqno_offset - mmap_start;
> >>+ mmap_len = sizeof(uint32_t) + mmap_offset;
> >>+
> >>+ gem_set_domain(fd, w->bb_handle,
> >>+ I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU);
> >>+
> >>+ ptr = gem_mmap__cpu(fd, w->bb_handle, mmap_start, mmap_len, PROT_WRITE);
> >>+
> >>+ *(uint32_t *)((char *)ptr + mmap_offset) = seqno;
> >
> >Uh oh. I hope this isn't called inside any loop. Note this is
> >unsynchronized to the gpu so I wonder what this is for.
>
> To update the seqno inside the store_dword_imm. It is called every
> time before a batch is executed, so I was wondering whether a
> gem_sync should precede it. But then I was thinking it is problematic
> in general if we queue up multiple instances of the same batch before
> they get executed. :( Sounds like I would need a separate batch for
> every iteration for this to work correctly. But that sounds too
> costly. So I don't know at the moment.
mmap/munmap, especially munmap, is not free. The munmap will do a
tlb_flush across all cores -- though maybe that's batched, and the
munmaps I do all tend to be large enough to trigger it every time.
Since you are using a CPU write, on !llc this will be clflushing
every time. I would suggest stashing the gem_mmap__wc for updating the
seqno between repeats.
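i.e. something like this (sketch; the struct and helper names are made up, and plain memory stands in for the gem_mmap__wc pointer): map once per batch, keep the pointer, and poke the seqno dword on each repeat.

```c
#include <stdint.h>

struct seqno_map {		/* made-up name for the stashed mapping */
	uint32_t *ptr;		/* cached WC pointer to the seqno dword */
};

/* Set up once per batch: bb_map stands in for a whole-object
 * gem_mmap__wc(fd, w->bb_handle, 0, w->bb_sz, PROT_WRITE) mapping,
 * kept for the lifetime of the batch instead of remapped per update. */
static void seqno_map_init(struct seqno_map *m, void *bb_map,
			   unsigned long seqno_offset)
{
	m->ptr = (uint32_t *)((char *)bb_map + seqno_offset);
}

/* Per repeat: a single WC store, no mmap/munmap and no clflush. */
static void seqno_update(struct seqno_map *m, uint32_t seqno)
{
	*m->ptr = seqno;
}
```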
[snip]
> >I need to study this a bit more...
>
> Yes please, especially the bit about how to get accurate seqnos
> written out in each step without needing separate execbuf batches.
>
> I've heard recursive batches mentioned in the past, so maybe each
> iteration could have its own small batch which would jump to the
> nop/delay one (shared between all iterations) and write the unique
> seqno. No idea if that is possible/supported at the moment - I'll go
> and dig a bit.
You end up with the same problem of having the reloc change and need to
update every cycle. You could use a fresh batch to rewrite the seqno
values... However, now that you explained what you want, just keep the
WC mmap.
-Chris
--
Chris Wilson, Intel Open Source Technology Centre