[Intel-gfx] [PATCH i-g-t] i915/gem_exec_balancer: Throw a few hangs into the virtual pipelines

Chris Wilson chris at chris-wilson.co.uk
Fri Nov 15 14:59:17 UTC 2019


Quoting Chris Wilson (2019-11-15 14:58:00)
> Quoting Tvrtko Ursulin (2019-11-15 14:52:16)
> > 
> > On 15/11/2019 13:09, Chris Wilson wrote:
> > > Quoting Tvrtko Ursulin (2019-11-15 13:02:24)
> > >>
> > >> On 14/11/2019 19:15, Chris Wilson wrote:
> > >>> Although a virtual engine itself has no hang detection (that is left to
> > >>> the underlying physical engines), it does provide a unique means for
> > >>> clients to try and break the system. Try and break it before they do.
> > >>>
> > >>> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> > >>> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> > >>> ---
> > >>>    tests/i915/gem_exec_balancer.c | 105 +++++++++++++++++++++++++++++++++
> > >>>    1 file changed, 105 insertions(+)
> > >>>
> > >>> diff --git a/tests/i915/gem_exec_balancer.c b/tests/i915/gem_exec_balancer.c
> > >>> index 70c4529b4..86028cfdd 100644
> > >>> --- a/tests/i915/gem_exec_balancer.c
> > >>> +++ b/tests/i915/gem_exec_balancer.c
> > >>> @@ -24,6 +24,7 @@
> > >>>    #include <sched.h>
> > >>>    
> > >>>    #include "igt.h"
> > >>> +#include "igt_gt.h"
> > >>>    #include "igt_perf.h"
> > >>>    #include "i915/gem_ring.h"
> > >>>    #include "sw_sync.h"
> > >>> @@ -1314,6 +1315,102 @@ static void semaphore(int i915)
> > >>>        gem_quiescent_gpu(i915);
> > >>>    }
> > >>>    
> > >>> +static void set_unbannable(int i915, uint32_t ctx)
> > >>> +{
> > >>> +     struct drm_i915_gem_context_param p = {
> > >>> +             .ctx_id = ctx,
> > >>> +             .param = I915_CONTEXT_PARAM_BANNABLE,
> > >>> +     };
> > >>> +
> > >>> +     igt_assert_eq(__gem_context_set_param(i915, &p), 0);
> > >>> +}
> > >>> +
> > >>> +static void hangme(int i915)
> > >>> +{
> > >>> +     struct drm_i915_gem_exec_object2 batch = {
> > >>> +             .handle = batch_create(i915),
> > >>> +     };
> > >>> +
> > >>> +     /*
> > >>> +      * Fill the available engines with hanging virtual engines and verify
> > >>> +      * that execution continues onto the second batch.
> > >>> +      */
> > >>> +
> > >>> +     for (int class = 1; class < 32; class++) {
> > >>> +             struct i915_engine_class_instance *ci;
> > >>> +             struct client {
> > >>> +                     igt_spin_t *spin[2];
> > >>> +             } *client;
> > >>> +             unsigned int count;
> > >>> +             uint32_t bg;
> > >>> +
> > >>> +             ci = list_engines(i915, 1u << class, &count);
> > >>> +             if (!ci)
> > >>> +                     continue;
> > >>> +
> > >>> +             if (count < 2) {
> > >>> +                     free(ci);
> > >>> +                     continue;
> > >>> +             }
> > >>> +
> > >>> +             client = malloc(sizeof(*client) * count);
> > >>> +             igt_assert(client);
> > >>> +
> > >>> +             for (int i = 0; i < count; i++) {
> > >>> +                     uint32_t ctx = gem_context_create(i915);
> > >>> +                     struct client *c = &client[i];
> > >>> +                     unsigned int flags;
> > >>> +
> > >>> +                     set_unbannable(i915, ctx);
> > >>> +                     set_load_balancer(i915, ctx, ci, count, NULL);
> > >>> +
> > >>> +                     flags = IGT_SPIN_FENCE_OUT | IGT_SPIN_NO_PREEMPTION;
> > >>> +                     for (int j = 0; j < ARRAY_SIZE(c->spin); j++)  {
> > >>> +                             c->spin[j] = igt_spin_new(i915, ctx,
> > >>> +                                                       .flags = flags);
> > >>> +                             flags = IGT_SPIN_FENCE_OUT;
> > >>> +                     }
> > >>> +
> > >>> +                     gem_context_destroy(i915, ctx);
> > >>> +             }
> > >>> +
> > >>> +             /* Apply some background context to speed up hang detection */
> > >>> +             bg = gem_context_create(i915);
> > >>> +             set_engines(i915, bg, ci, count);
> > >>> +             for (int i = 0; i < count; i++) {
> > >>> +                     struct drm_i915_gem_execbuffer2 execbuf = {
> > >>> +                             .buffers_ptr = to_user_pointer(&batch),
> > >>> +                             .buffer_count = 1,
> > >>> +                             .flags = i,
> > >>> +                             .rsvd1 = bg,
> > >>> +                     };
> > >>> +                     gem_execbuf(i915, &execbuf);
> > >>> +             }
> > >>> +             gem_context_destroy(i915, bg);
> > >>> +
> > >>> +             for (int i = 0; i < count; i++) {
> > >>> +                     struct client *c = &client[i];
> > >>> +
> > >>> +                     igt_debug("Waiting for client[%d].spin[%d]\n", i, 0);
> > >>> +                     gem_sync(i915, c->spin[0]->handle);
> > >>> +                     igt_assert_eq(sync_fence_status(c->spin[0]->out_fence),
> > >>> +                                   -EIO);
> > >>> +
> > >>> +                     igt_debug("Waiting for client[%d].spin[%d]\n", i, 1);
> > >>> +                     gem_sync(i915, c->spin[1]->handle);
> > >>> +                     igt_assert_eq(sync_fence_status(c->spin[1]->out_fence),
> > >>> +                                   -EIO);
> > >>> +
> > >>> +                     igt_spin_free(i915, c->spin[0]);
> > >>> +                     igt_spin_free(i915, c->spin[1]);
> > >>> +             }
> > >>> +             free(client);
> > >>> +     }
> > >>> +
> > >>> +     gem_close(i915, batch.handle);
> > >>> +     gem_quiescent_gpu(i915);
> > >>> +}
> > >>> +
> > >>>    static void smoketest(int i915, int timeout)
> > >>>    {
> > >>>        struct drm_i915_gem_exec_object2 batch[2] = {
> > >>> @@ -1486,4 +1583,12 @@ igt_main
> > >>>        igt_fixture {
> > >>>                igt_stop_hang_detector();
> > >>>        }
> > >>> +
> > >>> +     igt_subtest("hang") {
> > >>> +             igt_hang_t hang = igt_allow_hang(i915, 0, 0);
> > >>> +
> > >>> +             hangme(i915);
> > >>> +
> > >>> +             igt_disallow_hang(i915, hang);
> > >>> +     }
> > >>>    }
> > >>>
> > >>
> > >> Looks good. But do we need some core helpers to figure out when preempt
> > >> timeout is compiled out?
> > > 
> > > It should still work the same, but slower; 10s hang detection rather
> > > than ~200ms.
> > 
> > Are you talking about the old hangcheck? I was thinking about all the new
> > Kconfig options compiled out: no heartbeats, no preemption timeout. Does it
> > still work?
> 
> Works even faster. :)
> 
> The spinners then get killed when the contexts are closed (contexts default
> to non-persistent if you disable heartbeats entirely). The real challenge
> is for the per-engine heartbeat controls to make sure we kick the dead
> contexts off the engines, but that's for the future.

And for the other Kconfig option, with no preemption timeout, you just get
regular heartbeats, so hang detection takes roughly the 10s hangcheck timeout.
-Chris


More information about the Intel-gfx mailing list