[Intel-gfx] [PATCH 2/2] drm/i915: Unshare the idle-barrier from other kernel requests
Chris Wilson
chris at chris-wilson.co.uk
Mon Jul 29 08:54:05 UTC 2019
Quoting Tvrtko Ursulin (2019-07-29 08:46:59)
>
> On 25/07/2019 14:14, Chris Wilson wrote:
> > +static int __live_remote_context(struct intel_engine_cs *engine,
> > + struct i915_gem_context *fixme)
> > +{
> > + struct intel_context *local, *remote;
> > + struct i915_request *rq;
> > + int pass;
> > + int err;
> > +
> > + /*
> > + * Check that our idle barriers do not interfere with normal
> > + * activity tracking. In particular, check that operating
> > + * on the context image remotely (intel_context_prepare_remote_request)
> > + * which inserts foriegn fences into intel_context.active are not
>
> typo in foreign
>
> "operating ... are not .." ? Foreign fences are not clobbered?
, does not clobber the active tracking.
>
> > + * clobbered.
> > + */
> > +
> > + remote = intel_context_create(fixme, engine);
> > + if (!remote)
> > + return -ENOMEM;
> > +
> > + local = intel_context_create(fixme, engine);
> > + if (!local) {
> > + err = -ENOMEM;
> > + goto err_remote;
> > + }
> > +
> > + for (pass = 0; pass <= 2; pass++) {
> > + err = intel_context_pin(remote);
> > + if (err)
> > + goto err_local;
> > +
> > + rq = intel_context_create_request(local);
> > + if (IS_ERR(rq)) {
> > + err = PTR_ERR(rq);
> > + goto err_unpin;
> > + }
> > +
> > + err = intel_context_prepare_remote_request(remote, rq);
> > + if (err) {
> > + i915_request_add(rq);
> > + goto err_unpin;
> > + }
> > +
> > + err = request_sync(rq);
> > + if (err)
> > + goto err_unpin;
> > +
> > + intel_context_unpin(remote);
> > + err = intel_context_pin(remote);
>
> unpin-pin is to trigger transfer of idle barriers and maybe trigger some
> asserts?
unpin is to trigger the idle-barrier. The pin is just the start of the
next phase with another context. At first I tried doing two remote
requests within one pin-phase, but that doesn't hit the bug. It needed
the idle barrier in the middle of the test, not between passes.
v2 wrapped it with another subroutine so the unpin-pin is not so
glaringly obvious.
> > + if (err)
> > + goto err_local;
> > +
> > + rq = i915_request_create(engine->kernel_context);
>
> Why a request on kernel context here, a third context?
The kernel_context is most important since that's the one used by the
idle barrier. I included the normal context as well for completeness as
the intel_context_prepare_remote_request() interface should not assume
it is working from the kernel context.
> > + if (IS_ERR(rq)) {
> > + err = PTR_ERR(rq);
> > + goto err_unpin;
> > + }
> > +
> > + err = intel_context_prepare_remote_request(remote, rq);
> > + if (err) {
> > + i915_request_add(rq);
> > + goto err_unpin;
> > + }
> > +
> > + err = request_sync(rq);
> > + if (err)
> > + goto err_unpin;
> > +
> > + intel_context_unpin(remote);
> > +
> > + if (i915_active_is_idle(&remote->active)) {
> > + pr_err("remote context is not active; expected idle-barrier (pass %d)\n", pass);
> > + err = -EINVAL;
> > + goto err_local;
> > + }
> > + }
> > +
> > + goto err_local;
> > +
> > +err_unpin:
> > + intel_context_unpin(remote);
> > +err_local:
> > + intel_context_put(local);
> > +err_remote:
> > + intel_context_put(remote);
> > + return err;
> > +}
> > +
> > +static int live_remote_context(void *arg)
> > +{
> > + struct intel_gt *gt = arg;
> > + struct intel_engine_cs *engine;
> > + struct i915_gem_context *fixme;
> > + enum intel_engine_id id;
> > + struct drm_file *file;
> > + int err = 0;
> > +
> > + file = mock_file(gt->i915);
> > + if (IS_ERR(file))
> > + return PTR_ERR(file);
> > +
> > + mutex_lock(>->i915->drm.struct_mutex);
> > +
> > + fixme = live_context(gt->i915, file);
> > + if (!fixme) {
> > + err = -ENOMEM;
> > + goto unlock;
> > + }
> > +
> > + for_each_engine(engine, gt->i915, id) {
> > + err = __live_remote_context(engine, fixme);
> > + if (err)
> > + break;
> > +
> > + err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
> > + if (err)
> > + break;
> > + }
> > +
> > +unlock:
> > + mutex_unlock(>->i915->drm.struct_mutex);
> > + mock_file_free(gt->i915, file);
> > + return err;
> > +}
> > +
> > +int intel_context_live_selftests(struct drm_i915_private *i915)
> > +{
> > + static const struct i915_subtest tests[] = {
> > + SUBTEST(live_active_context),
> > + SUBTEST(live_remote_context),
> > + };
> > + struct intel_gt *gt = &i915->gt;
> > +
> > + if (intel_gt_is_wedged(gt))
> > + return 0;
> > +
> > + return intel_gt_live_subtests(tests, gt);
> > +}
> > diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c
> > index 13f304a29fc8..e04afb519264 100644
> > --- a/drivers/gpu/drm/i915/i915_active.c
> > +++ b/drivers/gpu/drm/i915/i915_active.c
> > @@ -184,6 +184,7 @@ active_instance(struct i915_active *ref, u64 idx)
> > ref->cache = node;
> > mutex_unlock(&ref->mutex);
> >
> > + BUILD_BUG_ON(offsetof(typeof(*node), base));
> > return &node->base;
> > }
> >
> > @@ -212,6 +213,8 @@ int i915_active_ref(struct i915_active *ref,
> > struct i915_active_request *active;
> > int err;
> >
> > + GEM_BUG_ON(!timeline); /* reserved for idle-barrier */
> > +
> > /* Prevent reaping in case we malloc/wait while building the tree */
> > err = i915_active_acquire(ref);
> > if (err)
> > @@ -222,6 +225,7 @@ int i915_active_ref(struct i915_active *ref,
> > err = -ENOMEM;
> > goto out;
> > }
> > + GEM_BUG_ON(IS_ERR(active->request));
> >
> > if (!i915_active_request_isset(active))
> > atomic_inc(&ref->count);
> > @@ -342,6 +346,34 @@ void i915_active_fini(struct i915_active *ref)
> > }
> > #endif
> >
> > +static struct active_node *idle_barrier(struct i915_active *ref)
> > +{
> > + struct active_node *idle = NULL;
> > + struct rb_node *rb;
> > +
> > + if (RB_EMPTY_ROOT(&ref->tree))
> > + return NULL;
> > +
> > + mutex_lock(&ref->mutex);
> > + for (rb = rb_first(&ref->tree); rb; rb = rb_next(rb)) {
> > + struct active_node *node;
> > +
> > + node = rb_entry(rb, typeof(*node), node);
> > + if (node->timeline)
> > + break;
> > +
> > + if (!i915_active_request_isset(&node->base)) {
> > + GEM_BUG_ON(!list_empty(&node->base.link));
> > + rb_erase(rb, &ref->tree);
> > + idle = node;
> > + break;
> > + }
>
> Under what circumstances does the walk continue? There can be two idle
> barriers (timeline == 0) in the tree?
Yes, there can be more than one (virtual engines). It should be the case
that when i915_active becomes idle (all idle barriers are idle) the
tree is reaped. But... if we overlap active phases, we will get multiple
idle barriers, some idle, some active, which we want to reuse to avoid
having a potentially unbounded allocation.
-Chris
More information about the Intel-gfx
mailing list