[igt-dev] [PATCH i-g-t] tests/perf_pmu: Improve accuracy by waiting on spinner to start
Chris Wilson
chris at chris-wilson.co.uk
Thu Mar 15 13:14:29 UTC 2018
Quoting Tvrtko Ursulin (2018-03-15 12:56:17)
> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>
> More than one test assumes that the spinner is running pretty much
> immediately after we have created or submitted it.
>
> In actuality there is a variable delay, especially on execlists platforms,
> between submission and the spin batch starting to run on the hardware.
>
> To enable tests which care about this level of timing to account for it,
> we add a new spin batch constructor which provides an output field that
> can be polled to determine when the batch has actually started running.
>
> This is implemented via MI_STORE_DWORD_IMM from the spin batch, writing
> into a memory-mapped page shared with userspace.
>
> Using this facility from perf_pmu, where applicable, should reduce the
> very occasional test failures seen across the test suite and platforms.
>
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> Suggested-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
> lib/igt_dummyload.c | 99 +++++++++++++++++++++++++++++++----
> lib/igt_dummyload.h | 9 ++++
> tests/perf_pmu.c | 145 +++++++++++++++++++++++++++++++++++-----------------
> 3 files changed, 196 insertions(+), 57 deletions(-)
>
> diff --git a/lib/igt_dummyload.c b/lib/igt_dummyload.c
> index 4b20f23dfe26..0447d2f14d57 100644
> --- a/lib/igt_dummyload.c
> +++ b/lib/igt_dummyload.c
> @@ -74,9 +74,12 @@ fill_reloc(struct drm_i915_gem_relocation_entry *reloc,
> reloc->write_domain = write_domains;
> }
>
> -static int emit_recursive_batch(igt_spin_t *spin,
> - int fd, uint32_t ctx, unsigned engine,
> - uint32_t dep, bool out_fence)
> +#define OUT_FENCE (1 << 0)
> +#define POLL_RUN (1 << 1)
> +
> +static int
> +emit_recursive_batch(igt_spin_t *spin, int fd, uint32_t ctx, unsigned engine,
> + uint32_t dep, unsigned int flags)
> {
> #define SCRATCH 0
> #define BATCH 1
> @@ -116,6 +119,8 @@ static int emit_recursive_batch(igt_spin_t *spin,
> execbuf.buffer_count++;
>
> if (dep) {
> + igt_assert(!(flags & POLL_RUN));
> +
Challenge left to the reader :)
> /* dummy write to dependency */
> obj[SCRATCH].handle = dep;
> fill_reloc(&relocs[obj[BATCH].relocation_count++],
> @@ -123,6 +128,41 @@ static int emit_recursive_batch(igt_spin_t *spin,
> I915_GEM_DOMAIN_RENDER,
> I915_GEM_DOMAIN_RENDER);
> execbuf.buffer_count++;
> + } else if (flags & POLL_RUN) {
> + unsigned int offset;
> +
> + igt_assert(!dep);
> +
> + spin->poll_handle = gem_create(fd, 4096);
> + spin->running = __gem_mmap__wc(fd, spin->poll_handle,
> + 0, 4096, PROT_READ | PROT_WRITE);
Use mmap_cpu and gem_set_caching().
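Something like this (untested sketch; assumes polling a CACHED bo through a
CPU mmap is the cheaper option here):

	spin->poll_handle = gem_create(fd, 4096);
	/* Make the poll page CPU-cached and map it through the CPU domain
	 * instead of WC. */
	gem_set_caching(fd, spin->poll_handle, I915_CACHING_CACHED);
	spin->running = __gem_mmap__cpu(fd, spin->poll_handle,
					0, 4096, PROT_READ | PROT_WRITE);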
> + igt_assert(spin->running);
> + igt_assert_eq(*spin->running, 0);
> +
> + *batch++ = MI_STORE_DWORD_IMM | (gen < 6 ? 1 << 22 : 0);
Hmm, have we forgotten the (len-2), or is this an unusual command that knows
its own length?
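For reference: assuming the usual definition in lib/intel_reg.h, the macro may
already fold the length in, which would also explain the batch[-1]-- below:

	/* Assumed definition: opcode 0x20 with a built-in length field of 2,
	 * i.e. (4 total dwords - 2); the gen < 4 path shrinks it to 1. */
	#define MI_STORE_DWORD_IMM	((0x20 << 23) | 2)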
> +
> + if (gen >= 8) {
> + offset = sizeof(uint32_t);
> + *batch++ = 0;
> + *batch++ = 0;
> + } else if (gen >= 4) {
> + offset = 2 * sizeof(uint32_t);
> + *batch++ = 0;
> + *batch++ = 0;
> + } else {
> + offset = sizeof(uint32_t);
> + batch[-1]--;
> + *batch++ = 0;
> + }
> +
> + *batch++ = 1;
> +
> + obj[SCRATCH].handle = spin->poll_handle;
> + fill_reloc(&relocs[obj[BATCH].relocation_count++],
> + spin->poll_handle, offset,
> + I915_GEM_DOMAIN_INSTRUCTION,
> + I915_GEM_DOMAIN_INSTRUCTION);
DOMAIN_RENDER preferably. You don't need the w/a. Could we not lie about
the write-hazard? Removes the need for EXEC_OBJECT_ASYNC and opens up
the possibility for using different dwords for different engines and then
waiting for all-engines.
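i.e. something along the lines of (sketch only):

	obj[SCRATCH].handle = spin->poll_handle;
	/* Stay in the RENDER domain and don't declare the write, so no
	 * write-hazard is tracked on the poll page and EXEC_OBJECT_ASYNC
	 * becomes unnecessary on resubmission. */
	fill_reloc(&relocs[obj[BATCH].relocation_count++],
		   spin->poll_handle, offset,
		   I915_GEM_DOMAIN_RENDER, 0);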
> + execbuf.buffer_count++;
gen4 and gen5 require I915_EXEC_SECURE and a DRM_MASTER fd.
We can just do something like
if (gen == 4 || gen == 5)
igt_require(igt_device_set_master(fd) == 0);
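plus flagging the execbuf as privileged on those gens, roughly (untested):

	/* A secure batch is what allows MI_STORE_DWORD_IMM to use a
	 * virtual address on gen4/gen5. */
	if (gen == 4 || gen == 5)
		execbuf.flags |= I915_EXEC_SECURE;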
> +/**
> + * igt_spin_batch_new_poll:
> + * @fd: open i915 drm file descriptor
> + * @engine: Ring to execute batch OR'd with execbuf flags. If value is less
> + * than 0, execute on all available rings.
> + *
> + * Start a recursive batch on a ring. Immediately returns a #igt_spin_t that
> + * contains the batch's handle that can be waited upon. The returned structure
> + * must be passed to igt_spin_batch_free() for post-processing.
> + *
> + * igt_spin_t->running will contain a pointer whose target will change from
> + * zero to one once the spinner actually starts executing on the GPU.
> + *
> + * Returns:
> + * Structure with helper internal state for igt_spin_batch_free().
> + */
> +igt_spin_t *
> +igt_spin_batch_new_poll(int fd, uint32_t ctx, unsigned engine)
> +{
> + igt_spin_t *spin;
> +
> + igt_require_gem(fd);
> + igt_require(gem_mmap__has_wc(fd));
igt_require(gem_can_store_dword(fd, engine));
Not all platforms have an MI_STORE_DWORD/DATA_IMM (with virtual addresses
at least) and some platforms will die (*cough* snb *cough*).
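So the requirements block would end up as something like (sketch):

	igt_require_gem(fd);
	igt_require(gem_mmap__has_wc(fd));
	/* MI_STORE_DWORD_IMM to a virtual address is not usable (or even
	 * survivable) everywhere, e.g. snb. */
	igt_require(gem_can_store_dword(fd, engine));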
> +
> + spin = __igt_spin_batch_new_poll(fd, ctx, engine);
> + igt_assert(gem_bo_busy(fd, spin->handle));
> +
> + return spin;
> +}
> igt_spin_t *__igt_spin_batch_new(int fd,
> @@ -55,6 +57,13 @@ igt_spin_t *igt_spin_batch_new_fence(int fd,
> uint32_t ctx,
> unsigned engine);
>
> +igt_spin_t *__igt_spin_batch_new_poll(int fd,
> + uint32_t ctx,
> + unsigned engine);
> +igt_spin_t *igt_spin_batch_new_poll(int fd,
> + uint32_t ctx,
> + unsigned engine);
> +
> void igt_spin_batch_set_timeout(igt_spin_t *spin, int64_t ns);
> void igt_spin_batch_end(igt_spin_t *spin);
> void igt_spin_batch_free(int fd, igt_spin_t *spin);
> diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
> index 19fcc95ffc7f..d1b7b23bc646 100644
> --- a/tests/perf_pmu.c
> +++ b/tests/perf_pmu.c
> @@ -184,6 +184,38 @@ static void end_spin(int fd, igt_spin_t *spin, unsigned int flags)
> usleep(batch_duration_ns / 5000);
> }
>
> +static igt_spin_t * __spin_poll(int fd, uint32_t ctx, unsigned long flags)
> +{
> + return __igt_spin_batch_new_poll(fd, ctx, flags);
> +}
> +
> +static unsigned long __spin_wait(igt_spin_t *spin)
> +{
> + struct timespec start = { };
> +
> + igt_nsec_elapsed(&start);
> +
> + while (!spin->running);
Put ';' on a new line so it's clearly visible.
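i.e. so the empty loop body stands out:

	while (!spin->running)
		;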
> +
> + return igt_nsec_elapsed(&start);
> +}
> +
> +static igt_spin_t * __spin_sync(int fd, uint32_t ctx, unsigned long flags)
> +{
> + igt_spin_t *spin = __spin_poll(fd, ctx, flags);
> +
> + __spin_wait(spin);
> +
> + return spin;
> +}
> +
> +static igt_spin_t * spin_sync(int fd, uint32_t ctx, unsigned long flags)
spin_sync() has connotations with gem_sync(). gem_sync is wait for end,
but spin_sync is wait_for_start. Maybe spin_wait_for_execute? Nah.
> +{
> + igt_require_gem(fd);
> +
> + return __spin_sync(fd, ctx, flags);
> +}
> static void
> __submit_spin_batch(int gem_fd,
> + igt_spin_t *spin,
> struct drm_i915_gem_exec_object2 *obj,
> const struct intel_execution_engine2 *e)
> {
> struct drm_i915_gem_execbuffer2 eb = {
> - .buffer_count = 1,
> .buffers_ptr = to_user_pointer(obj),
> .flags = e2ring(gem_fd, e),
> };
>
> + if (spin->running) {
> + obj[0].handle = spin->poll_handle;
> + obj[0].flags = EXEC_OBJECT_ASYNC;
> + obj[1].handle = spin->handle;
> + eb.buffer_count = 2;
> + } else {
> + obj[0].handle = spin->handle;
> + eb.buffer_count = 1;
> + }
obj[] must be set up by the caller; the EXEC_OBJECT_PINNED flags are
essential. Or else the kernel *will* move spin->poll_handle and then it
is fubar.
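Something like this on the caller side (sketch; the offset values are
placeholders for wherever the objects were bound on first submission):

	struct drm_i915_gem_exec_object2 obj[2] = {};

	/* Pin both objects at the addresses the batch was relocated
	 * against, so a resubmission cannot move the poll page. */
	obj[0].handle = spin->poll_handle;
	obj[0].offset = poll_offset;	/* placeholder */
	obj[0].flags  = EXEC_OBJECT_PINNED | EXEC_OBJECT_ASYNC;
	obj[1].handle = spin->handle;
	obj[1].offset = batch_offset;	/* placeholder */
	obj[1].flags  = EXEC_OBJECT_PINNED;

	__submit_spin_batch(gem_fd, spin, obj, e);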
-Chris