[Intel-gfx] [PATCH 1/8] drm/i915/execlists: Lift process_csb() out of the irq-off spinlock

Fri Aug 16 08:50:38 UTC 2019

Quoting Mika Kuoppala (2019-08-16 08:50:29)
> Chris Wilson <chris at chris-wilson.co.uk> writes:
> >  static inline struct i915_request *
> >  execlists_schedule_in(struct i915_request *rq, int idx)
> >  {
> > -     struct intel_context *ce = rq->hw_context;
> > -     int count;
> > +     struct intel_context * const ce = rq->hw_context;
> > +     struct intel_engine_cs *old;
> >  
> > +     GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
> >       trace_i915_request_in(rq, idx);
> >  
> > -     count = intel_context_inflight_count(ce);
> > -     if (!count) {
> > -             intel_context_get(ce);
> > -             ce->inflight = rq->engine;
> > -
> > -             intel_gt_pm_get(ce->inflight->gt);
> > -             execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
> > -             intel_engine_context_in(ce->inflight);
> > -     }
> > +     old = READ_ONCE(ce->inflight);
> > +     do {
> > +             if (!old) {
> 
> The schedule out might have swapped inflight in here ruining our day.
> So I am here trying to figure out how you can pull it off.

Once we _know_ the context is idle, the only way it can become busy again
is via submitting a request under the engine->active.lock, which we
hold.

Note that schedule-out also has exclusive access by its caller (only one
submission tasklet at a time), but schedule-out may run concurrently to
schedule-in. (But once we idle a context in schedule-out, we will never
see it again until a schedule-in, hence we know that upon seeing NULL we
have exclusive access.)

> > +                     WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
> > +                     break;
> > +             }
> > +     } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
> >  
> > -     intel_context_inflight_inc(ce);
> >       GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
> > -
> >       return i915_request_get(rq);
> >  }

> >  enable_timeslice(struct intel_engine_cs *engine)
> >  {
> > -     struct i915_request *last = last_active(&engine->execlists);
> > +     struct i915_request * const *port;
> > +     int hint;
> > +
> > +     port = engine->execlists.active;
> > +     while (port[0] && i915_request_completed(port[0]))
> > +             port++;
> > +     if (!port[0])
> > +             return false;
> >  
> > -     return last && need_timeslice(engine, last);
> > +     hint = engine->execlists.queue_priority_hint;
> > +     if (port[1])
> > +             hint = max(rq_prio(port[1]), hint);
> > +
> > +     /* Compare the two end-points as an unlocked approximation */
> > +     return hint >= effective_prio(port[0]);
> 
> What happens if we get it wrong?

So the last element in the next context is always the lowest priority
(normally they are all the same priority). If there is a variation in
priority along the requests in the second context, that may mask that
the first request there should trigger a timeslice.

Storing an execlists.switch_priority_hint should remove the ambiguity.

> > @@ -1494,9 +1527,12 @@ static void execlists_submission_tasklet(unsigned long data)
> >       struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
> >       unsigned long flags;
> >  
> > -     spin_lock_irqsave(&engine->active.lock, flags);
> > -     __execlists_submission_tasklet(engine);
> > -     spin_unlock_irqrestore(&engine->active.lock, flags);
> > +     process_csb(engine);
> > +     if (!engine->execlists.pending[0]) {
> 
> !READ_ONCE(...)? Would atleast pair documentatically.

How about if (process_csb()) {

> > +             spin_lock_irqsave(&engine->active.lock, flags);
> > +             __execlists_submission_tasklet(engine);
> > +             spin_unlock_irqrestore(&engine->active.lock, flags);
> > +     }
> >  }
> >  
> >  static void execlists_submission_timer(struct timer_list *timer)
> > diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h
> > index d652ba5d2320..562f756da421 100644
> > --- a/drivers/gpu/drm/i915/i915_utils.h
> > +++ b/drivers/gpu/drm/i915/i915_utils.h
> > @@ -161,17 +161,15 @@ __check_struct_size(size_t base, size_t arr, size_t count, size_t *size)
> >       ((typeof(ptr))((unsigned long)(ptr) | __bits));                 \
> >  })
> >  
> > -#define ptr_count_dec(p_ptr) do {                                    \
> > -     typeof(p_ptr) __p = (p_ptr);                                    \
> > -     unsigned long __v = (unsigned long)(*__p);                      \
> > -     *__p = (typeof(*p_ptr))(--__v);                                 \
> > -} while (0)
> > -
> > -#define ptr_count_inc(p_ptr) do {                                    \
> > -     typeof(p_ptr) __p = (p_ptr);                                    \
> > -     unsigned long __v = (unsigned long)(*__p);                      \
> > -     *__p = (typeof(*p_ptr))(++__v);                                 \
> > -} while (0)
> > +#define ptr_dec(ptr) ({                                                      \
> > +     unsigned long __v = (unsigned long)(ptr);                       \
> > +     (typeof(ptr))(__v - 1);                                         \
> > +})
> > +
> > +#define ptr_inc(ptr) ({                                                      \
> > +     unsigned long __v = (unsigned long)(ptr);                       \
> > +     (typeof(ptr))(__v + 1);                                         \
> > +})
> 
> Should we add GEM_DEBUG_WARN_ON if we overflow or do we
> detect through other means?

What's an overflow? What's the alignment of the target pointer? Perhaps
the user intended that every 8 uses starts at the next location. That's
not known at this basic level. And these are decidedly
use-at-you-own-risk :)
-Chris