[Intel-gfx] [PATCH 1/2] drm/i915/selftests: Exercise resetting during non-user payloads

Fri Feb 15 15:33:25 UTC 2019

Quoting Mika Kuoppala (2019-02-15 15:18:08)
> Chris Wilson <chris at chris-wilson.co.uk> writes:
> 
> > In selftests/live_hangcheck, we have a lot of tests for resetting simple
> > spinners, but nothing quite prepared us for how the GPU reacted to
> > triggering a reset outside of the safe spinner. These two subtests fill
> > the ring with plain old empty, non-spinning requests, and then triggers
> > a reset. Without a user-payload to blame, these requests will exercise
> > the 'non-started' paths and mostly be replayed verbatim.
> >
> > Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala at intel.com>
> > ---
> >  .../gpu/drm/i915/selftests/intel_hangcheck.c  | 217 ++++++++++++++++++
> >  1 file changed, 217 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> > index 92475596ff40..f75cb56ff8ad 100644
> > --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> > +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> > @@ -415,6 +415,221 @@ static bool wait_for_idle(struct intel_engine_cs *engine)
> >       return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
> >  }
> >  
> > +static int igt_reset_nop(void *arg)
> > +{
> > +     struct drm_i915_private *i915 = arg;
> > +     struct intel_engine_cs *engine;
> > +     struct i915_gem_context *ctx;
> > +     unsigned int reset_count, count;
> > +     enum intel_engine_id id;
> > +     intel_wakeref_t wakeref;
> > +     struct drm_file *file;
> > +     IGT_TIMEOUT(end_time);
> > +     int err = 0;
> > +
> > +     /* Check that we can reset during non-user portions of requests */
> > +
> > +     file = mock_file(i915);
> > +     if (IS_ERR(file))
> > +             return PTR_ERR(file);
> > +
> > +     mutex_lock(&i915->drm.struct_mutex);
> > +     ctx = live_context(i915, file);
> > +     mutex_unlock(&i915->drm.struct_mutex);
> > +     if (IS_ERR(ctx)) {
> > +             err = PTR_ERR(ctx);
> > +             goto out;
> > +     }
> > +
> > +     wakeref = intel_runtime_pm_get(i915);
> > +
> > +     reset_count = i915_reset_count(&i915->gpu_error);
> > +     count = 0;
> > +     do {
> > +             mutex_lock(&i915->drm.struct_mutex);
> > +             for_each_engine(engine, i915, id) {
> > +                     int i;
> > +
> > +                     for (i = 0; i < 16; i++) {
> > +                             struct i915_request *rq;
> > +
> > +                             rq = i915_request_alloc(engine, ctx);
> > +                             if (IS_ERR(rq)) {
> > +                                     err = PTR_ERR(rq);
> > +                                     break;
> > +                             }
> > +
> > +                             i915_request_add(rq);
> > +                     }
> > +             }
> > +             mutex_unlock(&i915->drm.struct_mutex);
> > +
> > +             igt_global_reset_lock(i915);
> > +             i915_reset(i915, ALL_ENGINES, NULL);
> > +             igt_global_reset_unlock(i915);
> > +             if (i915_terminally_wedged(&i915->gpu_error)) {
> > +                     err = -EIO;
> > +                     break;
> > +             }
> > +
> > +             if (i915_reset_count(&i915->gpu_error) !=
> > +                 reset_count + ++count) {
> > +                     pr_err("Full GPU reset not recorded!\n");
> > +                     err = -EINVAL;
> > +                     break;
> > +             }
> > +
> > +             if (!i915_reset_flush(i915)) {
> > +                     struct drm_printer p =
> > +                             drm_info_printer(i915->drm.dev);
> > +
> > +                     pr_err("%s failed to idle after reset\n",
> > +                            engine->name);
> > +                     intel_engine_dump(engine, &p,
> > +                                       "%s\n", engine->name);
> > +
> > +                     err = -EIO;
> > +                     break;
> > +             }
> > +
> > +             err = igt_flush_test(i915, 0);
> > +             if (err)
> > +                     break;
> > +     } while (time_before(jiffies, end_time));
> > +     pr_info("%s: %d resets\n", __func__, count);
> > +
> > +     mutex_lock(&i915->drm.struct_mutex);
> > +     err = igt_flush_test(i915, I915_WAIT_LOCKED);
> > +     mutex_unlock(&i915->drm.struct_mutex);
> > +
> > +     intel_runtime_pm_put(i915, wakeref);
> > +
> > +out:
> > +     mock_file_free(i915, file);
> > +     if (i915_terminally_wedged(&i915->gpu_error))
> > +             err = -EIO;
> > +     return err;
> > +}
> > +
> > +static int igt_reset_nop_engine(void *arg)
> > +{
> > +     struct drm_i915_private *i915 = arg;
> > +     struct intel_engine_cs *engine;
> > +     struct i915_gem_context *ctx;
> > +     enum intel_engine_id id;
> > +     intel_wakeref_t wakeref;
> > +     struct drm_file *file;
> > +     int err = 0;
> 
> err = 0 for the gcc as it seems unnecessary. Regardless
> I didn't spot anything out of place in this patch. 

It's copy'n'paste. There probably isn't a path where it isn't set, but
I've been burnt by sparse/smatch enough around selftests not to push too
hard.

> The amount of 16 requests feels low as a number but
> we should have plenty of time to hit the button during.

Yeah. 16 was a figure to avoid running out of ring space, given that
we're using small rings (4K rather than the usual 16K) and that some
platforms are getting quite fat in their requests.

> Nice addition to reset tests. And as you mentioned
> we should have plenty of resets going in on idle engines
> on other tests. Not that someone would object one explicit
> test into this file :)

Where to go next? I was thinking of scribbling over the context images
with garbage and/or executing garbage batches... But that strays into HW
validation and away from proving the driver can handle the recovery.
-Chris