[Intel-gfx] [PATCH 17/21] drm/i915: Drop struct_mutex from around i915_retire_requests()

Wed Sep 25 08:43:27 UTC 2019

Quoting Tvrtko Ursulin (2019-09-24 16:25:29)
> 
> On 02/09/2019 05:02, Chris Wilson wrote:
> > @@ -449,8 +447,6 @@ tl_write(struct intel_timeline *tl, struct intel_engine_cs *engine, u32 value)
> >       struct i915_request *rq;
> >       int err;
> >   
> > -     lockdep_assert_held(&tl->gt->i915->drm.struct_mutex); /* lazy rq refs */
> > -
> >       err = intel_timeline_pin(tl);
> >       if (err) {
> >               rq = ERR_PTR(err);
> > @@ -461,10 +457,14 @@ tl_write(struct intel_timeline *tl, struct intel_engine_cs *engine, u32 value)
> >       if (IS_ERR(rq))
> >               goto out_unpin;
> >   
> > +     i915_request_get(rq);
> > +
> >       err = emit_ggtt_store_dw(rq, tl->hwsp_offset, value);
> >       i915_request_add(rq);
> > -     if (err)
> > +     if (err) {
> > +             i915_request_put(rq);
> >               rq = ERR_PTR(err);
> > +     }
> >   
> >   out_unpin:
> >       intel_timeline_unpin(tl);
> > @@ -500,7 +500,6 @@ static int live_hwsp_engine(void *arg)
> >       struct intel_timeline **timelines;
> >       struct intel_engine_cs *engine;
> >       enum intel_engine_id id;
> > -     intel_wakeref_t wakeref;
> >       unsigned long count, n;
> >       int err = 0;
> >   
> > @@ -515,14 +514,13 @@ static int live_hwsp_engine(void *arg)
> >       if (!timelines)
> >               return -ENOMEM;
> >   
> > -     mutex_lock(&i915->drm.struct_mutex);
> > -     wakeref = intel_runtime_pm_get(&i915->runtime_pm);
> > -
> >       count = 0;
> >       for_each_engine(engine, i915, id) {
> >               if (!intel_engine_can_store_dword(engine))
> >                       continue;
> >   
> > +             intel_engine_pm_get(engine);
> > +
> >               for (n = 0; n < NUM_TIMELINES; n++) {
> >                       struct intel_timeline *tl;
> >                       struct i915_request *rq;
> > @@ -530,22 +528,26 @@ static int live_hwsp_engine(void *arg)
> >                       tl = checked_intel_timeline_create(i915);
> >                       if (IS_ERR(tl)) {
> >                               err = PTR_ERR(tl);
> > -                             goto out;
> > +                             break;
> >                       }
> >   
> >                       rq = tl_write(tl, engine, count);
> >                       if (IS_ERR(rq)) {
> >                               intel_timeline_put(tl);
> >                               err = PTR_ERR(rq);
> > -                             goto out;
> > +                             break;
> >                       }
> >   
> >                       timelines[count++] = tl;
> > +                     i915_request_put(rq);
> 
> This was a leak until now?

No, we added a get into tl_write() so that we own a reference to the
request, just so that ownership is clear across the waits (and it can't
accidentally be retired).

> > @@ -681,7 +670,9 @@ static int live_hwsp_wrap(void *arg)
> >               if (!intel_engine_can_store_dword(engine))
> >                       continue;
> >   
> > +             intel_engine_pm_get(engine);
> >               rq = i915_request_create(engine->kernel_context);
> > +             intel_engine_pm_put(engine);
> >               if (IS_ERR(rq)) {
> >                       err = PTR_ERR(rq);
> >                       goto out;
> > @@ -747,16 +738,12 @@ static int live_hwsp_wrap(void *arg)
> >       }
> >   
> 
> No i915_request_put in this one and I can't see why it would be different.

It's not using tl_write with the added ref.

> > diff --git a/drivers/gpu/drm/i915/selftests/igt_flush_test.c b/drivers/gpu/drm/i915/selftests/igt_flush_test.c
> > index d3b5eb402d33..2a5fbe46ea9f 100644
> > --- a/drivers/gpu/drm/i915/selftests/igt_flush_test.c
> > +++ b/drivers/gpu/drm/i915/selftests/igt_flush_test.c
> > @@ -12,31 +12,25 @@
> >   
> >   #include "igt_flush_test.h"
> >   
> > -int igt_flush_test(struct drm_i915_private *i915, unsigned int flags)
> > +int igt_flush_test(struct drm_i915_private *i915)
> >   {
> >       int ret = intel_gt_is_wedged(&i915->gt) ? -EIO : 0;
> > -     int repeat = !!(flags & I915_WAIT_LOCKED);
> >   
> >       cond_resched();
> >   
> > -     do {
> > -             if (i915_gem_wait_for_idle(i915, flags, HZ / 5) == -ETIME) {
> > -                     pr_err("%pS timed out, cancelling all further testing.\n",
> > -                            __builtin_return_address(0));
> > +     i915_retire_requests(i915);
> 
> Do you need this one or it would be better without, for clarity, given 
> there is one at the end? i915_gem_wait_for_idle will retire all it can.

It's to aide kick starting of the idle-barrier. Eventually this is
pulled together again.

> > +     if (i915_gem_wait_for_idle(i915, 0, HZ / 5) == -ETIME) {
> > +             pr_err("%pS timed out, cancelling all further testing.\n",
> > +                    __builtin_return_address(0));
> >   
> > -                     GEM_TRACE("%pS timed out.\n",
> > -                               __builtin_return_address(0));
> > -                     GEM_TRACE_DUMP();
> > +             GEM_TRACE("%pS timed out.\n",
> > +                       __builtin_return_address(0));
> > +             GEM_TRACE_DUMP();
> >   
> > -                     intel_gt_set_wedged(&i915->gt);
> > -                     repeat = 0;
> > -                     ret = -EIO;
> > -             }
> > -
> > -             /* Ensure we also flush after wedging. */
> > -             if (flags & I915_WAIT_LOCKED)
> > -                     i915_retire_requests(i915);
> > -     } while (repeat--);
> > +             intel_gt_set_wedged(&i915->gt);
> > +             ret = -EIO;
> > +     }
> > +     i915_retire_requests(i915);
> >   
> >       return ret;
> >   }

> Essentially looks fine. Provisional, meaning keep it if you do some 
> small tweaks:

Nothing seemed to be required.
-Chris