[Intel-gfx] [PATCH 05/13] drm/i915: Cache LRCA in the context
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Tue Jan 12 01:59:03 PST 2016
On 11/01/16 19:41, Dave Gordon wrote:
> On 11/01/16 08:38, Daniel Vetter wrote:
>> On Fri, Jan 08, 2016 at 11:29:44AM +0000, Tvrtko Ursulin wrote:
>>> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>>>
>>> We are not allowed to call i915_gem_obj_ggtt_offset from irq
>>> context without the big kernel lock held.
>>>
>>> LRCA lifetime is well defined so cache it so it can be looked up
>>> cheaply from the interrupt context and at command submission
>>> time.
>>>
>>> v2: Added irq context reasoning to the commit message. (Daniel Vetter)
>>>
>>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>>
>> A i915_obj_for_each_vma macro with a
>> WARN_ON(!mutex_is_locked(dev->struct_mutex)) would be awesome to validate
>> this. Especially since this is by far not the only time I've seen this
>> bug. Needs to be a follow-up though to avoid stalling this fix.
>>
>>> ---
>>> drivers/gpu/drm/i915/i915_debugfs.c | 15 ++++++--------
>>> drivers/gpu/drm/i915/i915_drv.h | 1 +
>>> drivers/gpu/drm/i915/intel_lrc.c | 40
>>> ++++++++++++++++---------------------
>>> drivers/gpu/drm/i915/intel_lrc.h | 3 ++-
>>> 4 files changed, 26 insertions(+), 33 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c
>>> b/drivers/gpu/drm/i915/i915_debugfs.c
>>> index 3b05bd189eab..714a45cf8a51 100644
>>> --- a/drivers/gpu/drm/i915/i915_debugfs.c
>>> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
>>> @@ -1976,12 +1976,13 @@ static int i915_context_status(struct
>>> seq_file *m, void *unused)
>>> }
>>>
>>> static void i915_dump_lrc_obj(struct seq_file *m,
>>> - struct intel_engine_cs *ring,
>>> - struct drm_i915_gem_object *ctx_obj)
>>> + struct intel_context *ctx,
>>> + struct intel_engine_cs *ring)
>>> {
>>> struct page *page;
>>> uint32_t *reg_state;
>>> int j;
>>> + struct drm_i915_gem_object *ctx_obj = ctx->engine[ring->id].state;
>>> unsigned long ggtt_offset = 0;
>>>
>>> if (ctx_obj == NULL) {
>>> @@ -1991,7 +1992,7 @@ static void i915_dump_lrc_obj(struct seq_file *m,
>>> }
>>>
>>> seq_printf(m, "CONTEXT: %s %u\n", ring->name,
>>> - intel_execlists_ctx_id(ctx_obj));
>>> + intel_execlists_ctx_id(ctx, ring));
>>>
>>> if (!i915_gem_obj_ggtt_bound(ctx_obj))
>>> seq_puts(m, "\tNot bound in GGTT\n");
>>> @@ -2040,8 +2041,7 @@ static int i915_dump_lrc(struct seq_file *m,
>>> void *unused)
>>> list_for_each_entry(ctx, &dev_priv->context_list, link) {
>>> for_each_ring(ring, dev_priv, i) {
>>> if (ring->default_context != ctx)
>>> - i915_dump_lrc_obj(m, ring,
>>> - ctx->engine[i].state);
>>> + i915_dump_lrc_obj(m, ctx, ring);
>>> }
>>> }
>>>
>>> @@ -2115,11 +2115,8 @@ static int i915_execlists(struct seq_file *m,
>>> void *data)
>>>
>>> seq_printf(m, "\t%d requests in queue\n", count);
>>> if (head_req) {
>>> - struct drm_i915_gem_object *ctx_obj;
>>> -
>>> - ctx_obj = head_req->ctx->engine[ring_id].state;
>>> seq_printf(m, "\tHead request id: %u\n",
>>> - intel_execlists_ctx_id(ctx_obj));
>>> + intel_execlists_ctx_id(head_req->ctx, ring));
>>> seq_printf(m, "\tHead request tail: %u\n",
>>> head_req->tail);
>>> }
>>> diff --git a/drivers/gpu/drm/i915/i915_drv.h
>>> b/drivers/gpu/drm/i915/i915_drv.h
>>> index 8cf655c6fc03..b77a5d84eac2 100644
>>> --- a/drivers/gpu/drm/i915/i915_drv.h
>>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>>> @@ -881,6 +881,7 @@ struct intel_context {
>>> struct drm_i915_gem_object *state;
>>> struct intel_ringbuffer *ringbuf;
>>> int pin_count;
>>> + u32 lrca;
>>
>> lrc_offset imo. Consistent with our other usage in the driver, and
>> actually readable. Please apply liberally everywhere else (I know that
>> bspec calls it lrca, but we don't need to follow bad naming styles
>> blindly).
>>
>> With that Reviewed-by: Daniel Vetter <daniel.vetter at ffwll.ch>
>>
>>> } engine[I915_NUM_RINGS];
>>>
>>> struct list_head link;
>>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c
>>> b/drivers/gpu/drm/i915/intel_lrc.c
>>> index ff146a15d395..ffe004de22b0 100644
>>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>>> @@ -265,7 +265,8 @@ int intel_sanitize_enable_execlists(struct
>>> drm_device *dev, int enable_execlists
>>>
>>> /**
>>> * intel_execlists_ctx_id() - get the Execlists Context ID
>>> - * @ctx_obj: Logical Ring Context backing object.
>>> + * @ctx: Context to get the ID for
>>> + * @ring: Engine to get the ID for
>>> *
>>> * Do not confuse with ctx->id! Unfortunately we have a name overload
>>> * here: the old context ID we pass to userspace as a handler so that
>>> @@ -275,14 +276,12 @@ int intel_sanitize_enable_execlists(struct
>>> drm_device *dev, int enable_execlists
>>> *
>>> * Return: 20-bits globally unique context ID.
>>> */
>>> -u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj)
>>> +u32 intel_execlists_ctx_id(struct intel_context *ctx,
>>> + struct intel_engine_cs *ring)
>>> {
>>> - u32 lrca = i915_gem_obj_ggtt_offset(ctx_obj) +
>>> - LRC_PPHWSP_PN * PAGE_SIZE;
>>> -
>>> /* LRCA is required to be 4K aligned so the more significant 20
>>> bits
>>> * are globally unique */
>>> - return lrca >> 12;
>>> + return ctx->engine[ring->id].lrca >> 12;
>>> }
>>>
>>> static bool disable_lite_restore_wa(struct intel_engine_cs *ring)
>>> @@ -297,13 +296,11 @@ static bool disable_lite_restore_wa(struct
>>> intel_engine_cs *ring)
>>> uint64_t intel_lr_context_descriptor(struct intel_context *ctx,
>>> struct intel_engine_cs *ring)
>>> {
>>> - struct drm_i915_gem_object *ctx_obj = ctx->engine[ring->id].state;
>>> + uint64_t lrca = ctx->engine[ring->id].lrca;
>>> uint64_t desc = ring->ctx_desc_template;
>>> - uint64_t lrca = i915_gem_obj_ggtt_offset(ctx_obj) +
>>> - LRC_PPHWSP_PN * PAGE_SIZE;
>>>
>>> desc |= lrca;
>>> - desc |= (u64)intel_execlists_ctx_id(ctx_obj) << GEN8_CTX_ID_SHIFT;
>>> + desc |= (u64)intel_execlists_ctx_id(ctx, ring) <<
>>> GEN8_CTX_ID_SHIFT;
>>>
>>> return desc;
>>> }
>>> @@ -461,9 +458,7 @@ static bool execlists_check_remove_request(struct
>>> intel_engine_cs *ring,
>>> execlist_link);
>>>
>>> if (head_req != NULL) {
>>> - struct drm_i915_gem_object *ctx_obj =
>>> - head_req->ctx->engine[ring->id].state;
>>> - if (intel_execlists_ctx_id(ctx_obj) == request_id) {
>>> + if (intel_execlists_ctx_id(head_req->ctx, ring) ==
>>> request_id) {
>>> WARN(head_req->elsp_submitted == 0,
>>> "Never submitted head request\n");
>>>
>>> @@ -1023,15 +1018,17 @@ int logical_ring_flush_all_caches(struct
>>> drm_i915_gem_request *req)
>>> }
>>>
>>> static int intel_lr_context_do_pin(struct intel_engine_cs *ring,
>>> - struct drm_i915_gem_object *ctx_obj,
>>> - struct intel_ringbuffer *ringbuf)
>>> + struct intel_context *ctx)
>>> {
>>> struct drm_device *dev = ring->dev;
>>> struct drm_i915_private *dev_priv = dev->dev_private;
>>> + struct drm_i915_gem_object *ctx_obj = ctx->engine[ring->id].state;
>>> + struct intel_ringbuffer *ringbuf = ctx->engine[ring->id].ringbuf;
>>> u64 lrca;
>>> - int ret = 0;
>>> + int ret;
>>>
>>> WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex));
>>> +
>>> ret = i915_gem_obj_ggtt_pin(ctx_obj, GEN8_LR_CONTEXT_ALIGN,
>>> PIN_OFFSET_BIAS | GUC_WOPCM_TOP);
>>> if (ret)
>>> @@ -1047,6 +1044,7 @@ static int intel_lr_context_do_pin(struct
>>> intel_engine_cs *ring,
>>> if (ret)
>>> goto unpin_ctx_obj;
>>>
>>> + ctx->engine[ring->id].lrca = lrca;
>>> ctx_obj->dirty = true;
>>>
>>> /* Invalidate GuC TLB. */
>>> @@ -1065,11 +1063,9 @@ static int intel_lr_context_pin(struct
>>> drm_i915_gem_request *rq)
>>> {
>>> int ret = 0;
>>> struct intel_engine_cs *ring = rq->ring;
>>> - struct drm_i915_gem_object *ctx_obj =
>>> rq->ctx->engine[ring->id].state;
>>> - struct intel_ringbuffer *ringbuf = rq->ringbuf;
>>>
>>> if (rq->ctx->engine[ring->id].pin_count++ == 0) {
>>> - ret = intel_lr_context_do_pin(ring, ctx_obj, ringbuf);
>>> + ret = intel_lr_context_do_pin(ring, rq->ctx);
>>> if (ret)
>>> goto reset_pin_count;
>>> }
>>> @@ -1091,6 +1087,7 @@ void intel_lr_context_unpin(struct
>>> drm_i915_gem_request *rq)
>>> if (--rq->ctx->engine[ring->id].pin_count == 0) {
>>> intel_unpin_ringbuffer_obj(ringbuf);
>>> i915_gem_object_ggtt_unpin(ctx_obj);
>>> + rq->ctx->engine[ring->id].lrca = 0;
>>> }
>>> }
>>> }
>>> @@ -1960,10 +1957,7 @@ static int logical_ring_init(struct drm_device
>>> *dev, struct intel_engine_cs *rin
>>> goto error;
>>>
>>> /* As this is the default context, always pin it */
>>> - ret = intel_lr_context_do_pin(
>>> - ring,
>>> - ring->default_context->engine[ring->id].state,
>>> - ring->default_context->engine[ring->id].ringbuf);
>>> + ret = intel_lr_context_do_pin(ring, ring->default_context);
>>> if (ret) {
>>> DRM_ERROR(
>>> "Failed to pin and map ringbuffer %s: %d\n",
>>> diff --git a/drivers/gpu/drm/i915/intel_lrc.h
>>> b/drivers/gpu/drm/i915/intel_lrc.h
>>> index de41ad6cd63d..f9db5f187d77 100644
>>> --- a/drivers/gpu/drm/i915/intel_lrc.h
>>> +++ b/drivers/gpu/drm/i915/intel_lrc.h
>>> @@ -106,6 +106,8 @@ void intel_lr_context_reset(struct drm_device *dev,
>>> struct intel_context *ctx);
>>> uint64_t intel_lr_context_descriptor(struct intel_context *ctx,
>>> struct intel_engine_cs *ring);
>>> +u32 intel_execlists_ctx_id(struct intel_context *ctx,
>>> + struct intel_engine_cs *ring);
>>>
>>> /* Execlists */
>>> int intel_sanitize_enable_execlists(struct drm_device *dev, int
>>> enable_execlists);
>>> @@ -113,7 +115,6 @@ struct i915_execbuffer_params;
>>> int intel_execlists_submission(struct i915_execbuffer_params *params,
>>> struct drm_i915_gem_execbuffer2 *args,
>>> struct list_head *vmas);
>>> -u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj);
>>>
>>> void intel_lrc_irq_handler(struct intel_engine_cs *ring);
>>> void intel_execlists_retire_requests(struct intel_engine_cs *ring);
>>> --
>>> 1.9.1
>>>
>>> _______________________________________________
>>> Intel-gfx mailing list
>>> Intel-gfx at lists.freedesktop.org
>>> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
>
> How about caching the WHOLE descriptor so that retrieval becomes
> trivial, retrieving the context ID becomes almost as trivial, as
> does retrieving the LRCA (all these could be inlined) and the single
> calculate-and-cache function can be nicely documented as to how a
> descriptor encodes LRCA etc. Something like this ...
Similar to v3 of my series from yesterday, since Daniel had already suggested
caching it all. :)
But I think moving towards storing the VMA will be the approach unless
we can agree on the I915_GTT_BAD_ADDRESS idea.
> /**
> * intel_lr_context_descriptor_update() - calculate & cache the
> * descriptor for a pinned
> * context
> * @ctx: Context to work on
> * @ring: Engine the descriptor will be used with
> *
> * The context descriptor encodes various attributes of a context,
> * including its GTT address and some flags. Because it's fairly
> * expensive to calculate, we'll just do it once and cache the result,
> * which remains valid until the context is unpinned.
> *
> * This is what a descriptor looks like, from LSB to MSB:
> * bits 0-11: flags, GEN8_CTX_* (cached in ctx_desc_template)
> * bits 12-31: LRCA, GTT address of (the HWSP of) this context
> * bits 32-51: ctx ID, a globally unique tag (the LRCA again!)
> * bits 52-63: reserved, may encode the engine ID (for GuC)
> */
> static void
> intel_lr_context_descriptor_update(struct intel_context *ctx,
> struct intel_engine_cs *ring)
> {
> struct drm_i915_gem_object *ctx_obj;
> uint64_t lrca, desc;
>
> ctx_obj = ctx->engine[ring->id].state;
> lrca = i915_gem_obj_ggtt_offset(ctx_obj) + LRC_PPHWSP_PN * PAGE_SIZE;
>
> desc = ring->ctx_desc_template; /* bits 0-11 */
> desc |= lrca; /* bits 12-31 */
> desc |= (lrca >> PAGE_SHIFT) << 32; /* bits 32-51 */
>
> ctx->engine[ring->id].ctx_desc = desc;
> }
>
> /**
> * intel_lr_context_descriptor() - get the descriptor for the specified
> * context/engine combination
> * @ctx: Context to get the ID for
> * @ring: Engine to get the ID for
> *
> * Now just a trivial accessor, since the value was computed and cached
> * above at the point when the context was pinned.
> *
> * Return: 64-bit context descriptor (encodes LRCA and various flags)
> */
> uint64_t inline
> intel_lr_context_descriptor(struct intel_context *ctx,
> struct intel_engine_cs *ring)
> {
> return ctx->engine[ring->id].ctx_desc;
> }
>
> /**
> * intel_lr_context_address() - get the GTT address of (the HWSP of) the
> * specified context/engine combination
> * @ctx: Context to get the ID for
> * @ring: Engine to get the ID for
> *
> * The address (LRCA) is a portion of the context descriptor, so we can
> * just extract the required part from the cached descriptor.
> *
> * Return: 32-bit GTT address
> */
> uint32_t inline
> intel_lr_context_address(struct intel_context *ctx,
> struct intel_engine_cs *ring)
> {
> return (u32)intel_lr_context_descriptor(ctx, ring) & PAGE_MASK;
> }
>
> /**
> * intel_execlists_ctx_id() - get the Execlists Context ID
> * @ctx: Context to get the ID for
> * @ring: Engine to get the ID for
> *
> * Do not confuse with ctx->id! Unfortunately we have a name overload
> * here: the old context ID we pass to userspace as a handler so that
> * they can refer to a context, and the new context ID we pass to the
> * ELSP so that the GPU can inform us of the context status via
> * interrupts.
> *
> * The context ID is a portion of the context descriptor, so we can
> * just extract the required part from the cached descriptor.
> *
> * Return: 20-bit globally unique ctx ID (actually, a GTT page number)
> */
> uint32_t inline
> intel_execlists_ctx_id(struct intel_context *ctx,
> struct intel_engine_cs *ring)
> {
> return (u32)(intel_lr_context_descriptor(ctx, ring) >> 32);
> }
>
> ... and insert a call to intel_lr_context_descriptor_update() into the
> appropriate spot in intel_lr_context_do_pin().
Regards,
Tvrtko
More information about the Intel-gfx
mailing list