[Intel-gfx] [PATCH 05/13] drm/i915: Cache LRCA in the context

Mon Jan 11 11:41:19 PST 2016

On 11/01/16 08:38, Daniel Vetter wrote:
> On Fri, Jan 08, 2016 at 11:29:44AM +0000, Tvrtko Ursulin wrote:
>> From: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>>
>> We are not allowed to call i915_gem_obj_ggtt_offset from irq
>> context without the big kernel lock held.
>>
>> LRCA lifetime is well defined so cache it so it can be looked up
>> cheaply from the interrupt context and at command submission
>> time.
>>
>> v2: Added irq context reasoning to the commit message. (Daniel Vetter)
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
>
> A i915_obj_for_each_vma macro with a
> WARN_ON(!mutex_is_locked(dev->struct_mutex)) would be awesome to validate
> this. Especially since this is by far not the only time I've seen this
> bug. Needs to be a follow-up though to avoid stalling this fix.
>
>> ---
>>   drivers/gpu/drm/i915/i915_debugfs.c | 15 ++++++--------
>>   drivers/gpu/drm/i915/i915_drv.h     |  1 +
>>   drivers/gpu/drm/i915/intel_lrc.c    | 40 ++++++++++++++++---------------------
>>   drivers/gpu/drm/i915/intel_lrc.h    |  3 ++-
>>   4 files changed, 26 insertions(+), 33 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
>> index 3b05bd189eab..714a45cf8a51 100644
>> --- a/drivers/gpu/drm/i915/i915_debugfs.c
>> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
>> @@ -1976,12 +1976,13 @@ static int i915_context_status(struct seq_file *m, void *unused)
>>   }
>>
>>   static void i915_dump_lrc_obj(struct seq_file *m,
>> -			      struct intel_engine_cs *ring,
>> -			      struct drm_i915_gem_object *ctx_obj)
>> +			      struct intel_context *ctx,
>> +			      struct intel_engine_cs *ring)
>>   {
>>   	struct page *page;
>>   	uint32_t *reg_state;
>>   	int j;
>> +	struct drm_i915_gem_object *ctx_obj = ctx->engine[ring->id].state;
>>   	unsigned long ggtt_offset = 0;
>>
>>   	if (ctx_obj == NULL) {
>> @@ -1991,7 +1992,7 @@ static void i915_dump_lrc_obj(struct seq_file *m,
>>   	}
>>
>>   	seq_printf(m, "CONTEXT: %s %u\n", ring->name,
>> -		   intel_execlists_ctx_id(ctx_obj));
>> +		   intel_execlists_ctx_id(ctx, ring));
>>
>>   	if (!i915_gem_obj_ggtt_bound(ctx_obj))
>>   		seq_puts(m, "\tNot bound in GGTT\n");
>> @@ -2040,8 +2041,7 @@ static int i915_dump_lrc(struct seq_file *m, void *unused)
>>   	list_for_each_entry(ctx, &dev_priv->context_list, link) {
>>   		for_each_ring(ring, dev_priv, i) {
>>   			if (ring->default_context != ctx)
>> -				i915_dump_lrc_obj(m, ring,
>> -						  ctx->engine[i].state);
>> +				i915_dump_lrc_obj(m, ctx, ring);
>>   		}
>>   	}
>>
>> @@ -2115,11 +2115,8 @@ static int i915_execlists(struct seq_file *m, void *data)
>>
>>   		seq_printf(m, "\t%d requests in queue\n", count);
>>   		if (head_req) {
>> -			struct drm_i915_gem_object *ctx_obj;
>> -
>> -			ctx_obj = head_req->ctx->engine[ring_id].state;
>>   			seq_printf(m, "\tHead request id: %u\n",
>> -				   intel_execlists_ctx_id(ctx_obj));
>> +				   intel_execlists_ctx_id(head_req->ctx, ring));
>>   			seq_printf(m, "\tHead request tail: %u\n",
>>   				   head_req->tail);
>>   		}
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> index 8cf655c6fc03..b77a5d84eac2 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -881,6 +881,7 @@ struct intel_context {
>>   		struct drm_i915_gem_object *state;
>>   		struct intel_ringbuffer *ringbuf;
>>   		int pin_count;
>> +		u32 lrca;
>
> lrc_offset imo. Consistent with our other usage in the driver, and
> actually readable. Please apply liberally everywhere else (I know that
> bsepc calls it lrca, but we don't need to follow bad naming styles
> blindly).
>
> With that Reviewed-by: Daniel Vetter <daniel.vetter at ffwll.ch>
>
>>   	} engine[I915_NUM_RINGS];
>>
>>   	struct list_head link;
>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>> index ff146a15d395..ffe004de22b0 100644
>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>> @@ -265,7 +265,8 @@ int intel_sanitize_enable_execlists(struct drm_device *dev, int enable_execlists
>>
>>   /**
>>    * intel_execlists_ctx_id() - get the Execlists Context ID
>> - * @ctx_obj: Logical Ring Context backing object.
>> + * @ctx: Context to get the ID for
>> + * @ring: Engine to get the ID for
>>    *
>>    * Do not confuse with ctx->id! Unfortunately we have a name overload
>>    * here: the old context ID we pass to userspace as a handler so that
>> @@ -275,14 +276,12 @@ int intel_sanitize_enable_execlists(struct drm_device *dev, int enable_execlists
>>    *
>>    * Return: 20-bits globally unique context ID.
>>    */
>> -u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj)
>> +u32 intel_execlists_ctx_id(struct intel_context *ctx,
>> +			   struct intel_engine_cs *ring)
>>   {
>> -	u32 lrca = i915_gem_obj_ggtt_offset(ctx_obj) +
>> -			LRC_PPHWSP_PN * PAGE_SIZE;
>> -
>>   	/* LRCA is required to be 4K aligned so the more significant 20 bits
>>   	 * are globally unique */
>> -	return lrca >> 12;
>> +	return ctx->engine[ring->id].lrca >> 12;
>>   }
>>
>>   static bool disable_lite_restore_wa(struct intel_engine_cs *ring)
>> @@ -297,13 +296,11 @@ static bool disable_lite_restore_wa(struct intel_engine_cs *ring)
>>   uint64_t intel_lr_context_descriptor(struct intel_context *ctx,
>>   				     struct intel_engine_cs *ring)
>>   {
>> -	struct drm_i915_gem_object *ctx_obj = ctx->engine[ring->id].state;
>> +	uint64_t lrca = ctx->engine[ring->id].lrca;
>>   	uint64_t desc = ring->ctx_desc_template;
>> -	uint64_t lrca = i915_gem_obj_ggtt_offset(ctx_obj) +
>> -			LRC_PPHWSP_PN * PAGE_SIZE;
>>
>>   	desc |= lrca;
>> -	desc |= (u64)intel_execlists_ctx_id(ctx_obj) << GEN8_CTX_ID_SHIFT;
>> +	desc |= (u64)intel_execlists_ctx_id(ctx, ring) << GEN8_CTX_ID_SHIFT;
>>
>>   	return desc;
>>   }
>> @@ -461,9 +458,7 @@ static bool execlists_check_remove_request(struct intel_engine_cs *ring,
>>   					    execlist_link);
>>
>>   	if (head_req != NULL) {
>> -		struct drm_i915_gem_object *ctx_obj =
>> -				head_req->ctx->engine[ring->id].state;
>> -		if (intel_execlists_ctx_id(ctx_obj) == request_id) {
>> +		if (intel_execlists_ctx_id(head_req->ctx, ring) == request_id) {
>>   			WARN(head_req->elsp_submitted == 0,
>>   			     "Never submitted head request\n");
>>
>> @@ -1023,15 +1018,17 @@ int logical_ring_flush_all_caches(struct drm_i915_gem_request *req)
>>   }
>>
>>   static int intel_lr_context_do_pin(struct intel_engine_cs *ring,
>> -		struct drm_i915_gem_object *ctx_obj,
>> -		struct intel_ringbuffer *ringbuf)
>> +				   struct intel_context *ctx)
>>   {
>>   	struct drm_device *dev = ring->dev;
>>   	struct drm_i915_private *dev_priv = dev->dev_private;
>> +	struct drm_i915_gem_object *ctx_obj = ctx->engine[ring->id].state;
>> +	struct intel_ringbuffer *ringbuf = ctx->engine[ring->id].ringbuf;
>>   	u64 lrca;
>> -	int ret = 0;
>> +	int ret;
>>
>>   	WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex));
>> +
>>   	ret = i915_gem_obj_ggtt_pin(ctx_obj, GEN8_LR_CONTEXT_ALIGN,
>>   			PIN_OFFSET_BIAS | GUC_WOPCM_TOP);
>>   	if (ret)
>> @@ -1047,6 +1044,7 @@ static int intel_lr_context_do_pin(struct intel_engine_cs *ring,
>>   	if (ret)
>>   		goto unpin_ctx_obj;
>>
>> +	ctx->engine[ring->id].lrca = lrca;
>>   	ctx_obj->dirty = true;
>>
>>   	/* Invalidate GuC TLB. */
>> @@ -1065,11 +1063,9 @@ static int intel_lr_context_pin(struct drm_i915_gem_request *rq)
>>   {
>>   	int ret = 0;
>>   	struct intel_engine_cs *ring = rq->ring;
>> -	struct drm_i915_gem_object *ctx_obj = rq->ctx->engine[ring->id].state;
>> -	struct intel_ringbuffer *ringbuf = rq->ringbuf;
>>
>>   	if (rq->ctx->engine[ring->id].pin_count++ == 0) {
>> -		ret = intel_lr_context_do_pin(ring, ctx_obj, ringbuf);
>> +		ret = intel_lr_context_do_pin(ring, rq->ctx);
>>   		if (ret)
>>   			goto reset_pin_count;
>>   	}
>> @@ -1091,6 +1087,7 @@ void intel_lr_context_unpin(struct drm_i915_gem_request *rq)
>>   		if (--rq->ctx->engine[ring->id].pin_count == 0) {
>>   			intel_unpin_ringbuffer_obj(ringbuf);
>>   			i915_gem_object_ggtt_unpin(ctx_obj);
>> +			rq->ctx->engine[ring->id].lrca = 0;
>>   		}
>>   	}
>>   }
>> @@ -1960,10 +1957,7 @@ static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *rin
>>   		goto error;
>>
>>   	/* As this is the default context, always pin it */
>> -	ret = intel_lr_context_do_pin(
>> -			ring,
>> -			ring->default_context->engine[ring->id].state,
>> -			ring->default_context->engine[ring->id].ringbuf);
>> +	ret = intel_lr_context_do_pin(ring, ring->default_context);
>>   	if (ret) {
>>   		DRM_ERROR(
>>   			"Failed to pin and map ringbuffer %s: %d\n",
>> diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
>> index de41ad6cd63d..f9db5f187d77 100644
>> --- a/drivers/gpu/drm/i915/intel_lrc.h
>> +++ b/drivers/gpu/drm/i915/intel_lrc.h
>> @@ -106,6 +106,8 @@ void intel_lr_context_reset(struct drm_device *dev,
>>   			struct intel_context *ctx);
>>   uint64_t intel_lr_context_descriptor(struct intel_context *ctx,
>>   				     struct intel_engine_cs *ring);
>> +u32 intel_execlists_ctx_id(struct intel_context *ctx,
>> +			   struct intel_engine_cs *ring);
>>
>>   /* Execlists */
>>   int intel_sanitize_enable_execlists(struct drm_device *dev, int enable_execlists);
>> @@ -113,7 +115,6 @@ struct i915_execbuffer_params;
>>   int intel_execlists_submission(struct i915_execbuffer_params *params,
>>   			       struct drm_i915_gem_execbuffer2 *args,
>>   			       struct list_head *vmas);
>> -u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj);
>>
>>   void intel_lrc_irq_handler(struct intel_engine_cs *ring);
>>   void intel_execlists_retire_requests(struct intel_engine_cs *ring);
>> --
>> 1.9.1
>>
>> _______________________________________________
>> Intel-gfx mailing list
>> Intel-gfx at lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

How about caching the WHOLE descriptor so that retrieval becomes 
trivial, retrieving the context ID becomes almost as trivial, as
does retrieving the LRCA (all these could be inlined) and the single 
calculate-and-cache function can be nicely documented as to how a 
descriptor encodes LRCA etc. Something like this ...

/**
  * intel_lr_context_descriptor_update() - calculate & cache the
  *					  descriptor for a pinned
  * 					  context
  * @ctx: Context to work on
  * @ring: Engine the descriptor will be used with
  *
  * The context descriptor encodes various attributes of a context,
  * including its GTT address and some flags. Because it's fairly
  * expensive to calculate, we'll just do it once and cache the result,
  * which remains valid until the context is unpinned.
  *
  * This is what a descriptor looks like, from LSB to MSB:
  *	bits 0-11:	flags, GEN8_CTX_* (cached in ctx_desc_template)
  *	bits 12-31:	LRCA, GTT address of (the HWSP of) this context
  *	bits 32-51:	ctx ID, a globally unique tag (the LRCA again!)
  *	bits 52-63:	reserved, may encode the engine ID (for GuC)
  */
static void
intel_lr_context_descriptor_update(struct intel_context *ctx,
				   struct intel_engine_cs *ring)
{
         struct drm_i915_gem_object *ctx_obj;
	uint64_t lrca, desc;

	ctx_obj = ctx->engine[ring->id].state;
	lrca = i915_gem_obj_ggtt_offset(ctx_obj) + LRC_PPHWSP_PN * PAGE_SIZE;

	desc = ring->ctx_desc_template;			/* bits  0-11 */
	desc |= lrca;					/* bits 12-31 */
	desc |= (lrca >> PAGE_SHIFT) << 32;		/* bits 32-51 */

	ctx->engine[ring->id].ctx_desc = desc;
}

/**
  * intel_lr_context_descriptor() - get the descriptor for the specified
  * 				   context/engine combination
  * @ctx: Context to get the ID for
  * @ring: Engine to get the ID for
  *
  * Now just a trivial accessor, since the value was computed and cached
  * above at the point when the context was pinned.
  *
  * Return: 64-bit context descriptor (encodes LRCA and various flags)
  */
uint64_t inline
intel_lr_context_descriptor(struct intel_context *ctx,
			    struct intel_engine_cs *ring)
{
	return ctx->engine[ring->id].ctx_desc;
}

/**
* intel_lr_context_address() - get the GTT address of (the HWSP of) the
*			       specified context/engine combination
* @ctx: Context to get the ID for
* @ring: Engine to get the ID for
*
* The address (LRCA) is a portion of the context descriptor, so we can
* just extract the required part from the cached descriptor.
*
* Return: 32-bit GTT address
*/
uint32_t inline
intel_lr_context_address(struct intel_context *ctx,
			 struct intel_engine_cs *ring)
{
	return (u32)intel_lr_context_descriptor(ctx, ring) & PAGE_MASK;
}

/**
  * intel_execlists_ctx_id() - get the Execlists Context ID
  * @ctx: Context to get the ID for
  * @ring: Engine to get the ID for
  *
  * Do not confuse with ctx->id! Unfortunately we have a name overload
  * here: the old context ID we pass to userspace as a handler so that
  * they can refer to a context, and the new context ID we pass to the
  * ELSP so that the GPU can inform us of the context status via
  * interrupts.
  *
  * The context ID is a portion of the context descriptor, so we can
  * just extract the required part from the cached descriptor.
  *
  * Return: 20-bit globally unique ctx ID (actually, a GTT page number)
  */
uint32_t inline
intel_execlists_ctx_id(struct intel_context *ctx,
		       struct intel_engine_cs *ring)
{
	return (u32)(intel_lr_context_descriptor(ctx, ring) >> 32);
}

... and insert a call to intel_lr_context_descriptor_update() into the 
appropriate spot in intel_lr_context_do_pin().

.Dave.