[Intel-gfx] [igt-dev] [PATCH i-g-t 4/5] tests/perf_pmu: Add tests for engine queued/runnable/running stats

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Fri Mar 23 10:08:05 UTC 2018


On 19/03/2018 20:58, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-03-19 18:22:04)
>> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>>
>> Simple tests to check reported queue depths are correct.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>> ---
>>   tests/perf_pmu.c | 224 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 224 insertions(+)
>>
>> diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
>> index 469b9becdbac..206c18960b7b 100644
>> --- a/tests/perf_pmu.c
>> +++ b/tests/perf_pmu.c
>> @@ -966,6 +966,196 @@ multi_client(int gem_fd, const struct intel_execution_engine2 *e)
>>          assert_within_epsilon(val[1], perf_slept[1], tolerance);
>>   }
>>   
>> +static double calc_queued(uint64_t d_val, uint64_t d_ns)
>> +{
>> +       return (double)d_val * 1e9 / I915_SAMPLE_QUEUED_DIVISOR / d_ns;
>> +}
>> +
>> +static void
>> +queued(int gem_fd, const struct intel_execution_engine2 *e)
>> +{
>> +       const unsigned long engine = e2ring(gem_fd, e);
>> +       const unsigned int max_rq = 10;
>> +       double queued[max_rq + 1];
>> +       uint32_t bo[max_rq + 1];
>> +       unsigned int n, i;
>> +       uint64_t val[2];
>> +       uint64_t ts[2];
>> +       int fd;
> 
> igt_require_sw_sync();
> 
> I guess we should do igt_require_cork(CORK_SYNC_FD) or something like
> that.
> 
>> +
>> +       memset(queued, 0, sizeof(queued));
>> +       memset(bo, 0, sizeof(bo));
>> +
>> +       fd = open_pmu(I915_PMU_ENGINE_QUEUED(e->class, e->instance));
>> +
>> +       for (n = 0; n <= max_rq; n++) {
>> +               int fence = -1;
>> +               struct igt_cork cork = { .fd = fence, .type = CORK_SYNC_FD };
> 
> IGT_CORK_FENCE(cork); if you prefer

Missed it.

> 
>> +
>> +               gem_quiescent_gpu(gem_fd);
>> +
>> +               if (n)
>> +                       fence = igt_cork_plug(&cork, -1);
>> +
>> +               for (i = 0; i < n; i++) {
>> +                       struct drm_i915_gem_exec_object2 obj = { };
>> +                       struct drm_i915_gem_execbuffer2 eb = { };
>> +
>> +                       if (!bo[i]) {
>> +                               const uint32_t bbe = MI_BATCH_BUFFER_END;
>> +
>> +                               bo[i] = gem_create(gem_fd, 4096);
>> +                               gem_write(gem_fd, bo[i], 4092, &bbe,
>> +                                         sizeof(bbe));
>> +                       }
>> +
>> +                       obj.handle = bo[i];
> 
> Looks like you can use just the one handle multiple times?

Hm yeah.

> 
>> +
>> +                       eb.buffer_count = 1;
>> +                       eb.buffers_ptr = to_user_pointer(&obj);
>> +
>> +                       eb.flags = engine | I915_EXEC_FENCE_IN;
>> +                       eb.rsvd2 = fence;
> 
> You do however also want to check with one context per execbuf.

Ok.

> 
> if (flags & CONTEXTS)
> 	eb.rsvd1 = gem_context_create(fd);
>> +
>> +                       gem_execbuf(gem_fd, &eb);
> 
> if (flags & CONTEXTS)
> 	gem_context_destroy(fd, eb.rsvd1);
> 	eb.rsvd1 = gem_context_create();
> 
>> +               }
>> +
>> +               val[0] = __pmu_read_single(fd, &ts[0]);
>> +               usleep(batch_duration_ns / 1000);
>> +               val[1] = __pmu_read_single(fd, &ts[1]);
>> +
>> +               queued[n] = calc_queued(val[1] - val[0], ts[1] - ts[0]);
>> +               igt_info("n=%u queued=%.2f\n", n, queued[n]);
>> +
>> +               if (fence >= 0)
>> +                       igt_cork_unplug(&cork);
> 
> Maybe we should just make this a no-op when used on an unplugged cork.

Don't know really, there's an assert in there and I didn't feel like 
evaluating all callers.

> 
>> +
>> +               for (i = 0; i < n; i++)
>> +                       gem_sync(gem_fd, bo[i]);
>> +       }
>> +
>> +       close(fd);
>> +
>> +       for (i = 0; i < max_rq; i++) {
>> +               if (bo[i])
>> +                       gem_close(gem_fd, bo[i]);
>> +       }
>> +
>> +       for (i = 0; i <= max_rq; i++)
>> +               assert_within_epsilon(queued[i], i, tolerance);
>> +}
>> +
>> +static void
>> +runnable(int gem_fd, const struct intel_execution_engine2 *e)
>> +{
>> +       const unsigned long engine = e2ring(gem_fd, e);
>> +       const unsigned int max_rq = 10;
>> +       igt_spin_t *spin[max_rq + 1];
>> +       double runnable[max_rq + 1];
>> +       uint32_t ctx[max_rq];
>> +       unsigned int n, i;
>> +       uint64_t val[2];
>> +       uint64_t ts[2];
>> +       int fd;
>> +
>> +       memset(runnable, 0, sizeof(runnable));
>> +       memset(ctx, 0, sizeof(ctx));
>> +
>> +       fd = open_pmu(I915_PMU_ENGINE_RUNNABLE(e->class, e->instance));
>> +
>> +       for (n = 0; n <= max_rq; n++) {
>> +               gem_quiescent_gpu(gem_fd);
>> +
>> +               for (i = 0; i < n; i++) {
>> +                       if (!ctx[i])
>> +                               ctx[i] = gem_context_create(gem_fd);
>> +
>> +                       if (i == 0)
>> +                               spin[i] = __spin_poll(gem_fd, ctx[i], engine);
>> +                       else
>> +                               spin[i] = __igt_spin_batch_new(gem_fd, ctx[i],
>> +                                                              engine, 0);
>> +               }
>> +
>> +               if (n)
>> +                       __spin_wait(gem_fd, spin[0]);
>> +
>> +               val[0] = __pmu_read_single(fd, &ts[0]);
>> +               usleep(batch_duration_ns / 1000);
>> +               val[1] = __pmu_read_single(fd, &ts[1]);
>> +
>> +               runnable[n] = calc_queued(val[1] - val[0], ts[1] - ts[0]);
>> +               igt_info("n=%u runnable=%.2f\n", n, runnable[n]);
>> +
>> +               for (i = 0; i < n; i++) {
>> +                       end_spin(gem_fd, spin[i], FLAG_SYNC);
>> +                       igt_spin_batch_free(gem_fd, spin[i]);
>> +               }
>> +       }
>> +
>> +       for (i = 0; i < max_rq; i++) {
>> +               if (ctx[i])
>> +                       gem_context_destroy(gem_fd, ctx[i]);
> 
> I would just create the contexts unconditionally.

Can do.

> 
>> +       }
>> +
>> +       close(fd);
>> +
>> +       assert_within_epsilon(runnable[0], 0, tolerance);
>> +       igt_assert(runnable[max_rq] > 0.0);
>> +       assert_within_epsilon(runnable[max_rq] - runnable[max_rq - 1], 1,
>> +                             tolerance);
>> +}
>> +
>> +static void
>> +running(int gem_fd, const struct intel_execution_engine2 *e)
>> +{
>> +       const unsigned long engine = e2ring(gem_fd, e);
>> +       const unsigned int max_rq = 10;
>> +       igt_spin_t *spin[max_rq + 1];
>> +       double running[max_rq + 1];
>> +       unsigned int n, i;
>> +       uint64_t val[2];
>> +       uint64_t ts[2];
>> +       int fd;
>> +
>> +       memset(running, 0, sizeof(running));
>> +       memset(spin, 0, sizeof(spin));
>> +
>> +       fd = open_pmu(I915_PMU_ENGINE_RUNNING(e->class, e->instance));
>> +
>> +       for (n = 0; n <= max_rq; n++) {
>> +               gem_quiescent_gpu(gem_fd);
>> +
>> +               for (i = 0; i < n; i++) {
>> +                       if (i == 0)
>> +                               spin[i] = __spin_poll(gem_fd, 0, engine);
>> +                       else
>> +                               spin[i] = __igt_spin_batch_new(gem_fd, 0,
>> +                                                              engine, 0);
>> +               }
>> +
>> +               if (n)
>> +                       __spin_wait(gem_fd, spin[0]);
> 
> So create N requests on the same context so that running == N due to
> lite-restore every time. I have some caveats that this relies on the
> precise implementation, e.g. I don't think it will work for guc (using
> execlists emulation with no lite-restore) for N > 2 or 8, or if we get
> creative with execlists.

Yep, I think it doesn't work with GuC. Well, the assert would need to be 
different at a minimum.

For N > 2 and execlists I think it works since it only needs port 0.

The runnable subtest, on the other hand, already tries to be flexible 
towards different possible Ns. I guess for running I could downgrade the 
asserts to just check running[N + 1] >= running[N] and running[1] > 0.

> 
>> +
>> +               val[0] = __pmu_read_single(fd, &ts[0]);
>> +               usleep(batch_duration_ns / 1000);
>> +               val[1] = __pmu_read_single(fd, &ts[1]);
>> +
>> +               running[n] = calc_queued(val[1] - val[0], ts[1] - ts[0]);
>> +               igt_info("n=%u running=%.2f\n", n, running[n]);
>> +
>> +               for (i = 0; i < n; i++) {
>> +                       end_spin(gem_fd, spin[i], FLAG_SYNC);
>> +                       igt_spin_batch_free(gem_fd, spin[i]);
>> +               }
>> +       }
>> +
>> +       close(fd);
>> +
>> +       for (i = 0; i <= max_rq; i++)
>> +               assert_within_epsilon(running[i], i, tolerance);
>> +}
> 
> Ok, the tests look like they should be covering the counters.
> 
> Do we need to do an all-engines pass to check concurrent usage?

Depends on how grey your approach is, between white-box and black-box testing.

But what are definitely needed are some tests involving hangs, resets and 
preemption, since I am pretty sure a bug sneaked in somewhere in those 
areas.

Regards,

Tvrtko


More information about the Intel-gfx mailing list