[Intel-gfx] [PATCH 4/7] drm/i915: Combine multiple internal plists into the same i915_priolist bucket

Fri Sep 28 13:04:34 UTC 2018

On 27/09/2018 14:05, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-09-25 10:48:44)
>>
>> On 25/09/2018 09:32, Chris Wilson wrote:
>>> As we are about to allow ourselves to slightly bump the user priority
>>> into a few different sublevels, packthose internal priority lists
>>> into the same i915_priolist to keep the rbtree compact and avoid having
>>> to allocate the default user priority even after the internal bumping.
>>> The downside to having an requests[] rather than a node per active list,
>>> is that we then have to walk over the empty higher priority lists. To
>>> compensate, we track the active buckets and use a small bitmap to skip
>>> over any inactive ones.
>>>
>>> v2: Use MASK of internal levels to simplify our usage.
>>>
>>> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
>>> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>>> ---
>>>    drivers/gpu/drm/i915/intel_engine_cs.c      |  6 +-
>>>    drivers/gpu/drm/i915/intel_guc_submission.c | 12 ++-
>>>    drivers/gpu/drm/i915/intel_lrc.c            | 87 ++++++++++++++-------
>>>    drivers/gpu/drm/i915/intel_ringbuffer.h     | 13 ++-
>>>    4 files changed, 80 insertions(+), 38 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
>>> index 217ed3ee1cab..83f2f7774c1f 100644
>>> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
>>> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
>>> @@ -1534,10 +1534,10 @@ void intel_engine_dump(struct intel_engine_cs *engine,
>>>        count = 0;
>>>        drm_printf(m, "\t\tQueue priority: %d\n", execlists->queue_priority);
>>>        for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
>>> -             struct i915_priolist *p =
>>> -                     rb_entry(rb, typeof(*p), node);
>>> +             struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
>>> +             int i;
>>>    
>>> -             list_for_each_entry(rq, &p->requests, sched.link) {
>>> +             priolist_for_each_request(rq, p, i) {
>>>                        if (count++ < MAX_REQUESTS_TO_SHOW - 1)
>>>                                print_request(m, rq, "\t\tQ ");
>>>                        else
>>> diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
>>> index 4874a212754c..ac862b42f6a1 100644
>>> --- a/drivers/gpu/drm/i915/intel_guc_submission.c
>>> +++ b/drivers/gpu/drm/i915/intel_guc_submission.c
>>> @@ -746,30 +746,28 @@ static bool __guc_dequeue(struct intel_engine_cs *engine)
>>>        while ((rb = rb_first_cached(&execlists->queue))) {
>>>                struct i915_priolist *p = to_priolist(rb);
>>>                struct i915_request *rq, *rn;
>>> +             int i;
>>>    
>>> -             list_for_each_entry_safe(rq, rn, &p->requests, sched.link) {
>>> +             priolist_for_each_request_consume(rq, rn, p, i) {
>>>                        if (last && rq->hw_context != last->hw_context) {
>>> -                             if (port == last_port) {
>>> -                                     __list_del_many(&p->requests,
>>> -                                                     &rq->sched.link);
>>> +                             if (port == last_port)
>>>                                        goto done;
>>> -                             }
>>>    
>>>                                if (submit)
>>>                                        port_assign(port, last);
>>>                                port++;
>>>                        }
>>>    
>>> -                     INIT_LIST_HEAD(&rq->sched.link);
>>> +                     list_del_init(&rq->sched.link);
>>>    
>>>                        __i915_request_submit(rq);
>>>                        trace_i915_request_in(rq, port_index(port, execlists));
>>> +
>>>                        last = rq;
>>>                        submit = true;
>>>                }
>>>    
>>>                rb_erase_cached(&p->node, &execlists->queue);
>>> -             INIT_LIST_HEAD(&p->requests);
>>>                if (p->priority != I915_PRIORITY_NORMAL)
>>>                        kmem_cache_free(engine->i915->priorities, p);
>>>        }
>>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>>> index e8de250c3413..b1b3f67d1120 100644
>>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>>> @@ -259,14 +259,49 @@ intel_lr_context_descriptor_update(struct i915_gem_context *ctx,
>>>        ce->lrc_desc = desc;
>>>    }
>>>    
>>> -static struct i915_priolist *
>>> +static void assert_priolists(struct intel_engine_execlists * const execlists,
>>> +                          int queue_priority)
>>> +{
>>> +     struct rb_node *rb;
>>> +     int last_prio, i;
>>> +
>>> +     if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
>>> +             return;
>>> +
>>> +     GEM_BUG_ON(rb_first_cached(&execlists->queue) !=
>>> +                rb_first(&execlists->queue.rb_root));
>>> +
>>> +     last_prio = (queue_priority >> I915_USER_PRIORITY_SHIFT) + 1;
>>> +     for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
>>> +             struct i915_priolist *p = to_priolist(rb);
>>> +
>>> +             GEM_BUG_ON(p->priority >= last_prio);
>>> +             last_prio = p->priority;
>>> +
>>> +             GEM_BUG_ON(!p->used);
>>> +             for (i = 0; i < ARRAY_SIZE(p->requests); i++) {
>>> +                     if (list_empty(&p->requests[i]))
>>> +                             continue;
>>> +
>>> +                     GEM_BUG_ON(!(p->used & BIT(i)));
>>> +             }
>>> +     }
>>> +}
>>> +
>>> +static struct list_head *
>>>    lookup_priolist(struct intel_engine_cs *engine, int prio)
>>>    {
>>>        struct intel_engine_execlists * const execlists = &engine->execlists;
>>>        struct i915_priolist *p;
>>>        struct rb_node **parent, *rb;
>>>        bool first = true;
>>> +     int idx, i;
>>> +
>>> +     assert_priolists(execlists, INT_MAX);
>>>    
>>> +     /* buckets sorted from highest [in slot 0] to lowest priority */
>>> +     idx = I915_PRIORITY_MASK - (prio & I915_PRIORITY_MASK);
>>
>> How about:
>>
>> #define I915_INTERNAL_PRIO(p) (p & I915_PRIORITY_MASK)
> 
> Not convinced. It's the only place we use it (one use of MASK later to
> assert correct less). There's two views, here the user/kernel priority
> levels are not as important as what we are concerned about are the split
> lists with the chunking of lowest priority bits. At the extreme we could
> separate the two and make the chunks BITS_PER_LONG -- I think that's a
> waste and my goal was simply to avoid kmallocs for the common case of
> default user priority.
> 
> The intention is that we don't even think about the user/internal split,
> and simply consider it an integrated field, with most positive
> priorities executing first and most negative last.

I just find MASK - INT confusing, those two data flavours do not match 
in my head.  With the local macro I clarified that we are getting an INT 
from prio, and then the bit you did not quote I suggested we do not 
substract from the mask but from the count - 1. It is effectively the 
same thing just IMHO easier to understand while reading the code. You 
can skip the macro and decrement from the count, I'd be happy with that 
as well.

Regards,

Tvrtko