[Intel-gfx] [PATCH i-g-t v3] benchmarks/gem_wsim: Command submission workload simulator
Chris Wilson
chris at chris-wilson.co.uk
Wed Apr 5 16:48:05 UTC 2017
On Wed, Apr 05, 2017 at 05:14:01PM +0100, Tvrtko Ursulin wrote:
> +static void
> +__emit_bb_end(struct w_step *w, bool terminate, bool seqnos, uint32_t seqno)
> +{
> + const uint32_t bbe = 0xa << 23;
> + unsigned long bb_sz = get_bb_sz(&w->duration);
> + unsigned long mmap_start, cmd_offset, mmap_len;
> + uint32_t *ptr, *cs;
> +
> + mmap_len = (seqnos ? 5 : 1) * sizeof(uint32_t);
> + cmd_offset = bb_sz - mmap_len;
> + mmap_start = rounddown(cmd_offset, PAGE_SIZE);
> + mmap_len += cmd_offset - mmap_start;
> +
> + gem_set_domain(fd, w->bb_handle,
> + I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU);
> +
> + ptr = gem_mmap__cpu(fd, w->bb_handle, mmap_start, mmap_len, PROT_WRITE);
> + cs = (uint32_t *)((char *)ptr + cmd_offset - mmap_start);
> +
> + if (seqnos) {
> + const int gen = intel_gen(intel_get_drm_devid(fd));
> +
> + igt_assert(gen >= 8);
> +
> + w->reloc.offset = bb_sz - 4 * sizeof(uint32_t);
> + w->seqno_offset = bb_sz - 2 * sizeof(uint32_t);
> +
> + *cs++ = terminate ? MI_STORE_DWORD_IMM : 0;
> + *cs++ = 0;
> + *cs++ = 0;
> + *cs++ = seqno;
> + }
> +
> + *cs = terminate ? bbe : 0;
> +
> + munmap(ptr, mmap_len);
> +}
> +
> +static void terminate_bb(struct w_step *w, bool seqnos, uint32_t seqno)
> +{
> + __emit_bb_end(w, true, seqnos, seqno);
> +}
> +
> +static void unterminate_bb(struct w_step *w, bool seqnos)
> +{
> + __emit_bb_end(w, false, seqnos, 0);
> +}
> +
> +static void
> +prepare_workload(struct workload *wrk, bool swap_vcs, bool seqnos)
> +{
> + int max_ctx = -1;
> + struct w_step *w;
> + int i;
> +
> + if (seqnos) {
> + const unsigned int status_sz = sizeof(uint32_t);
> +
> + for (i = 0; i < NUM_ENGINES; i++) {
> + wrk->status_page_handle[i] = gem_create(fd, status_sz);
Need to set_cache_level(CACHED) for llc.
You can use one page for all engines. Just use a different cacheline
for each, for safety.
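Roughly this, as an untested sketch reusing the names from the patch
(CACHELINE and the page size are illustrative):

    #define CACHELINE 64

    uint32_t page = gem_create(fd, 4096);
    uint32_t *ptr;

    /* Keep the status page in the CPU cache on llc platforms. */
    gem_set_caching(fd, page, I915_CACHING_CACHED);

    ptr = gem_mmap__cpu(fd, page, 0, 4096, PROT_READ);

    for (i = 0; i < NUM_ENGINES; i++) {
            /* Same bo for every engine, one cacheline each. */
            wrk->status_page_handle[i] = page;
            wrk->status_page[i] = ptr + i * CACHELINE / sizeof(uint32_t);
    }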
> + wrk->status_page[i] =
> + gem_mmap__cpu(fd, wrk->status_page_handle[i],
> + 0, status_sz, PROT_READ);
> + }
> + }
> +
> + for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
> + if ((int)w->context > max_ctx) {
> + int delta = w->context + 1 - wrk->nr_ctxs;
> +
> + wrk->nr_ctxs += delta;
> + wrk->ctx_id = realloc(wrk->ctx_id,
> + wrk->nr_ctxs * sizeof(uint32_t));
> + memset(&wrk->ctx_id[wrk->nr_ctxs - delta], 0,
> + delta * sizeof(uint32_t));
> +
> + max_ctx = w->context;
> + }
> +
> + if (!wrk->ctx_id[w->context]) {
> + struct drm_i915_gem_context_create arg = {};
> +
> + drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &arg);
> + igt_assert(arg.ctx_id);
> +
> + wrk->ctx_id[w->context] = arg.ctx_id;
> + }
> + }
> +
> + for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
> + enum intel_engine_id engine = w->engine;
> + unsigned int bb_i, j = 0;
> +
> + if (w->type != BATCH)
> + continue;
> +
> + w->obj[j].handle = gem_create(fd, 4096);
> + w->obj[j].flags = EXEC_OBJECT_WRITE;
> + j++;
> +
> + if (seqnos) {
> + w->obj[j].handle = wrk->status_page_handle[engine];
> + w->obj[j].flags = EXEC_OBJECT_WRITE;
The trick for sharing between engines is to not mark this as a WRITE.
Fun little lies.
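Untested sketch; each engine only ever writes its own cacheline of the
shared page, so we can drop the flag and the engines no longer
serialise on it:

    w->obj[j].handle = wrk->status_page_handle[engine];
    w->obj[j].flags = 0; /* not EXEC_OBJECT_WRITE: no cross-engine stall */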
> + j++;
> + }
> +
> + bb_i = j++;
> + w->duration.cur = w->duration.max;
> + w->bb_sz = get_bb_sz(&w->duration);
> + w->bb_handle = w->obj[bb_i].handle = gem_create(fd, w->bb_sz);
> + terminate_bb(w, seqnos, 0);
> + if (seqnos) {
> + w->reloc.presumed_offset = -1;
> + w->reloc.target_handle = 1;
> + w->reloc.read_domains = I915_GEM_DOMAIN_INSTRUCTION;
> + w->reloc.write_domain = I915_GEM_DOMAIN_INSTRUCTION;
Ugh. That's a magic w/a value for pipecontrols. Fortunately we don't want
to set write_domain here anyway.
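That is, simply (untested):

    w->reloc.read_domains = I915_GEM_DOMAIN_INSTRUCTION;
    w->reloc.write_domain = 0; /* no write hazard for the kernel to track */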
> + }
> +
> + igt_assert(w->dependency <= 0);
> + if (w->dependency) {
> + int dep_idx = i + w->dependency;
> +
> + igt_assert(dep_idx >= 0 && dep_idx < wrk->nr_steps);
> + igt_assert(wrk->steps[dep_idx].type == BATCH);
> +
> + w->obj[j].handle = w->obj[bb_i].handle;
> + bb_i = j;
> + w->obj[j - 1].handle =
> + wrk->steps[dep_idx].obj[0].handle;
> + j++;
> + }
> +
> + if (seqnos) {
> + w->obj[bb_i].relocs_ptr = to_user_pointer(&w->reloc);
> + w->obj[bb_i].relocation_count = 1;
> + }
> +
> + w->eb.buffers_ptr = to_user_pointer(w->obj);
> + w->eb.buffer_count = j;
> + w->eb.rsvd1 = wrk->ctx_id[w->context];
> +
> + if (swap_vcs && engine == VCS1)
> + engine = VCS2;
> + else if (swap_vcs && engine == VCS2)
> + engine = VCS1;
> + w->eb.flags = eb_engine_map[engine];
> + w->eb.flags |= I915_EXEC_HANDLE_LUT;
> + if (!seqnos)
> + w->eb.flags |= I915_EXEC_NO_RELOC;
Doesn't look too hard to get the relocation right. Forcing relocations
between batches is probably a good case to check (just to say: don't do
that).
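Untested sketch of what getting it right could look like: the kernel
writes the actual offsets back into the exec object array after each
execbuf, so copying the target's offset (obj[1], the status page, since
target_handle indexes the LUT) into the reloc keeps NO_RELOC valid on
every repeat:

    w->reloc.presumed_offset = w->obj[1].offset;
    w->eb.flags |= I915_EXEC_NO_RELOC;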
> +#ifdef DEBUG
> + printf("%u: %u:%x|%x|%x|%x %10lu flags=%llx bb=%x[%u] ctx[%u]=%u\n",
> + i, w->eb.buffer_count, w->obj[0].handle,
> + w->obj[1].handle, w->obj[2].handle, w->obj[3].handle,
> + w->bb_sz, w->eb.flags, w->bb_handle, bb_i,
> + w->context, wrk->ctx_id[w->context]);
> +#endif
> + }
> +}
> +
> +static double elapsed(const struct timespec *start, const struct timespec *end)
> +{
> + return (end->tv_sec - start->tv_sec) +
> + (end->tv_nsec - start->tv_nsec) / 1e9;
> +}
> +
> +static int elapsed_us(const struct timespec *start, const struct timespec *end)
> +{
return 1e6 * elapsed(); might as well use gcc for something!
> + return (1e9 * (end->tv_sec - start->tv_sec) +
> + (end->tv_nsec - start->tv_nsec)) / 1e3;
> +}
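Spelled out, that's just:

    static int elapsed_us(const struct timespec *start,
                          const struct timespec *end)
    {
            return 1e6 * elapsed(start, end);
    }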
> +
> +static enum intel_engine_id
> +rr_balance(struct workload *wrk, struct w_step *w)
> +{
> + unsigned int engine;
> +
> + if (wrk->vcs_rr)
> + engine = VCS2;
> + else
> + engine = VCS1;
> +
> + wrk->vcs_rr ^= 1;
> +
> + return engine;
> +}
> +
> +static enum intel_engine_id
> +qd_balance(struct workload *wrk, struct w_step *w)
> +{
> + unsigned long qd[NUM_ENGINES];
> + enum intel_engine_id engine = w->engine;
> +
> + igt_assert(engine == VCS);
> +
> + qd[VCS1] = wrk->seqno[VCS1] - wrk->status_page[VCS1][0];
> + wrk->qd_sum[VCS1] += qd[VCS1];
> +
> + qd[VCS2] = wrk->seqno[VCS2] - wrk->status_page[VCS2][0];
> + wrk->qd_sum[VCS2] += qd[VCS2];
> +
> + if (qd[VCS1] < qd[VCS2]) {
> + engine = VCS1;
> + wrk->vcs_rr = 0;
> + } else if (qd[VCS2] < qd[VCS1]) {
> + engine = VCS2;
> + wrk->vcs_rr = 1;
> + } else {
> + unsigned int vcs = wrk->vcs_rr ^ 1;
> +
> + wrk->vcs_rr = vcs;
> +
> + if (vcs == 0)
> + engine = VCS1;
> + else
> + engine = VCS2;
> + }
Hmm. Just thinking we don't even need hw to simulate a load-balancer,
but that would be boring!
> +// printf("qd_balance: 1:%lu 2:%lu rr:%u = %u\n", qd[VCS1], qd[VCS2], wrk->vcs_rr, engine);
> +
> + return engine;
> +}
> +
> +static void update_bb_seqno(struct w_step *w, uint32_t seqno)
> +{
> + unsigned long mmap_start, mmap_offset, mmap_len;
> + void *ptr;
> +
> + mmap_start = rounddown(w->seqno_offset, PAGE_SIZE);
> + mmap_offset = w->seqno_offset - mmap_start;
> + mmap_len = sizeof(uint32_t) + mmap_offset;
> +
> + gem_set_domain(fd, w->bb_handle,
> + I915_GEM_DOMAIN_CPU, I915_GEM_DOMAIN_CPU);
> +
> + ptr = gem_mmap__cpu(fd, w->bb_handle, mmap_start, mmap_len, PROT_WRITE);
> +
> + *(uint32_t *)((char *)ptr + mmap_offset) = seqno;
Uh oh. I hope this isn't called inside any loop. Note this is
unsynchronized with the gpu, so I wonder what this is for.
> +
> + munmap(ptr, mmap_len);
> +}
> +
> +static void
> +run_workload(unsigned int id, struct workload *wrk, unsigned int repeat,
> + enum intel_engine_id (*balance)(struct workload *wrk,
> + struct w_step *w), bool seqnos)
> +{
> + struct timespec t_start, t_end;
> + struct w_step *w;
> + double t;
> + int i, j;
> +
> + clock_gettime(CLOCK_MONOTONIC, &t_start);
> +
> + srand(t_start.tv_nsec);
> +
> + for (j = 0; j < repeat; j++) {
> + for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
> + enum intel_engine_id engine = w->engine;
> + uint32_t seqno;
> + bool seqno_updated = false;
> + int do_sleep = 0;
> +
> + if (i == 0)
> + clock_gettime(CLOCK_MONOTONIC,
> + &wrk->repeat_start);
> +
> + if (w->type == DELAY) {
> + do_sleep = w->wait;
> + } else if (w->type == PERIOD) {
> + struct timespec now;
> +
> + clock_gettime(CLOCK_MONOTONIC, &now);
> + do_sleep = w->wait -
> + elapsed_us(&wrk->repeat_start, &now);
> + if (do_sleep < 0) {
> + if (!quiet)
> + printf("%u: Dropped period @ %u/%u (%dus late)!\n",
> + id, j, i, do_sleep);
> + continue;
> + }
> + } else if (w->type == SYNC) {
> + int s_idx = i + w->wait;
> +
> + igt_assert(s_idx >= 0 && s_idx < wrk->nr_steps);
> + igt_assert(wrk->steps[s_idx].type == BATCH);
> + gem_sync(fd, wrk->steps[s_idx].obj[0].handle);
> + continue;
> + }
> +
> + if (do_sleep) {
> + usleep(do_sleep);
> + continue;
> + }
> +
> + wrk->nr_bb[engine]++;
> +
> + if (engine == VCS && balance) {
> + engine = balance(wrk, w);
> + wrk->nr_bb[engine]++;
> +
> + w->obj[1].handle = wrk->status_page_handle[engine];
> +
> + w->eb.flags = eb_engine_map[engine];
> + w->eb.flags |= I915_EXEC_HANDLE_LUT;
> + }
> +
> + seqno = ++wrk->seqno[engine];
> +
> + if (w->duration.min != w->duration.max) {
> + unsigned int cur = get_duration(&w->duration);
> +
> + if (cur != w->duration.cur) {
> + unterminate_bb(w, seqnos);
Ah, you said this was for adjusting the runlength of the batches. I suggest
using batch_start_offset to change the number of nops executed rather than
rewriting the batch.
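An untested sketch of that idea; get_bb_sz_for() is a hypothetical
helper mapping a duration to a batch length in bytes. Size the batch
for duration.max once, keep the terminating commands at the tail, and
just slide the start point so more or fewer of the leading nops
execute:

    unsigned long cur_sz = get_bb_sz_for(cur); /* hypothetical helper */

    /* The tail (store-dword + bbe) and the object-relative reloc
     * offsets stay valid; only the entry point moves.
     */
    w->eb.batch_start_offset = ALIGN(w->bb_sz - cur_sz, 2 * sizeof(uint32_t));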
I need to study this a bit more...
-Chris
--
Chris Wilson, Intel Open Source Technology Centre