[Intel-gfx] [PATCH 4/7] drm/i915: Combine multiple internal plists into the same i915_priolist bucket

Thu Sep 27 13:05:54 UTC 2018

Quoting Tvrtko Ursulin (2018-09-25 10:48:44)
> 
> On 25/09/2018 09:32, Chris Wilson wrote:
> > As we are about to allow ourselves to slightly bump the user priority
> > into a few different sublevels, packthose internal priority lists
> > into the same i915_priolist to keep the rbtree compact and avoid having
> > to allocate the default user priority even after the internal bumping.
> > The downside to having an requests[] rather than a node per active list,
> > is that we then have to walk over the empty higher priority lists. To
> > compensate, we track the active buckets and use a small bitmap to skip
> > over any inactive ones.
> > 
> > v2: Use MASK of internal levels to simplify our usage.
> > 
> > Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> > ---
> >   drivers/gpu/drm/i915/intel_engine_cs.c      |  6 +-
> >   drivers/gpu/drm/i915/intel_guc_submission.c | 12 ++-
> >   drivers/gpu/drm/i915/intel_lrc.c            | 87 ++++++++++++++-------
> >   drivers/gpu/drm/i915/intel_ringbuffer.h     | 13 ++-
> >   4 files changed, 80 insertions(+), 38 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> > index 217ed3ee1cab..83f2f7774c1f 100644
> > --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> > +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> > @@ -1534,10 +1534,10 @@ void intel_engine_dump(struct intel_engine_cs *engine,
> >       count = 0;
> >       drm_printf(m, "\t\tQueue priority: %d\n", execlists->queue_priority);
> >       for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
> > -             struct i915_priolist *p =
> > -                     rb_entry(rb, typeof(*p), node);
> > +             struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> > +             int i;
> >   
> > -             list_for_each_entry(rq, &p->requests, sched.link) {
> > +             priolist_for_each_request(rq, p, i) {
> >                       if (count++ < MAX_REQUESTS_TO_SHOW - 1)
> >                               print_request(m, rq, "\t\tQ ");
> >                       else
> > diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
> > index 4874a212754c..ac862b42f6a1 100644
> > --- a/drivers/gpu/drm/i915/intel_guc_submission.c
> > +++ b/drivers/gpu/drm/i915/intel_guc_submission.c
> > @@ -746,30 +746,28 @@ static bool __guc_dequeue(struct intel_engine_cs *engine)
> >       while ((rb = rb_first_cached(&execlists->queue))) {
> >               struct i915_priolist *p = to_priolist(rb);
> >               struct i915_request *rq, *rn;
> > +             int i;
> >   
> > -             list_for_each_entry_safe(rq, rn, &p->requests, sched.link) {
> > +             priolist_for_each_request_consume(rq, rn, p, i) {
> >                       if (last && rq->hw_context != last->hw_context) {
> > -                             if (port == last_port) {
> > -                                     __list_del_many(&p->requests,
> > -                                                     &rq->sched.link);
> > +                             if (port == last_port)
> >                                       goto done;
> > -                             }
> >   
> >                               if (submit)
> >                                       port_assign(port, last);
> >                               port++;
> >                       }
> >   
> > -                     INIT_LIST_HEAD(&rq->sched.link);
> > +                     list_del_init(&rq->sched.link);
> >   
> >                       __i915_request_submit(rq);
> >                       trace_i915_request_in(rq, port_index(port, execlists));
> > +
> >                       last = rq;
> >                       submit = true;
> >               }
> >   
> >               rb_erase_cached(&p->node, &execlists->queue);
> > -             INIT_LIST_HEAD(&p->requests);
> >               if (p->priority != I915_PRIORITY_NORMAL)
> >                       kmem_cache_free(engine->i915->priorities, p);
> >       }
> > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> > index e8de250c3413..b1b3f67d1120 100644
> > --- a/drivers/gpu/drm/i915/intel_lrc.c
> > +++ b/drivers/gpu/drm/i915/intel_lrc.c
> > @@ -259,14 +259,49 @@ intel_lr_context_descriptor_update(struct i915_gem_context *ctx,
> >       ce->lrc_desc = desc;
> >   }
> >   
> > -static struct i915_priolist *
> > +static void assert_priolists(struct intel_engine_execlists * const execlists,
> > +                          int queue_priority)
> > +{
> > +     struct rb_node *rb;
> > +     int last_prio, i;
> > +
> > +     if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
> > +             return;
> > +
> > +     GEM_BUG_ON(rb_first_cached(&execlists->queue) !=
> > +                rb_first(&execlists->queue.rb_root));
> > +
> > +     last_prio = (queue_priority >> I915_USER_PRIORITY_SHIFT) + 1;
> > +     for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
> > +             struct i915_priolist *p = to_priolist(rb);
> > +
> > +             GEM_BUG_ON(p->priority >= last_prio);
> > +             last_prio = p->priority;
> > +
> > +             GEM_BUG_ON(!p->used);
> > +             for (i = 0; i < ARRAY_SIZE(p->requests); i++) {
> > +                     if (list_empty(&p->requests[i]))
> > +                             continue;
> > +
> > +                     GEM_BUG_ON(!(p->used & BIT(i)));
> > +             }
> > +     }
> > +}
> > +
> > +static struct list_head *
> >   lookup_priolist(struct intel_engine_cs *engine, int prio)
> >   {
> >       struct intel_engine_execlists * const execlists = &engine->execlists;
> >       struct i915_priolist *p;
> >       struct rb_node **parent, *rb;
> >       bool first = true;
> > +     int idx, i;
> > +
> > +     assert_priolists(execlists, INT_MAX);
> >   
> > +     /* buckets sorted from highest [in slot 0] to lowest priority */
> > +     idx = I915_PRIORITY_MASK - (prio & I915_PRIORITY_MASK);
> 
> How about:
> 
> #define I915_INTERNAL_PRIO(p) (p & I915_PRIORITY_MASK)

Not convinced. It's the only place we use it (one use of MASK later to
assert correct less). There's two views, here the user/kernel priority
levels are not as important as what we are concerned about are the split
lists with the chunking of lowest priority bits. At the extreme we could
separate the two and make the chunks BITS_PER_LONG -- I think that's a
waste and my goal was simply to avoid kmallocs for the common case of
default user priority.

The intention is that we don't even think about the user/internal split,
and simply consider it an integrated field, with most positive
priorities executing first and most negative last.
-Chris