[Intel-gfx] [PATCH v6 1/6] drm/i915/gen8: Add infrastructure to initialize WA batch buffers

Siluvery, Arun arun.siluvery at linux.intel.com
Mon Jun 22 08:43:59 PDT 2015


On 22/06/2015 16:41, Daniel Vetter wrote:
> On Fri, Jun 19, 2015 at 07:07:01PM +0100, Arun Siluvery wrote:
>> Some of the WA are to be applied during context save but before restore, and
>> some at the end of context save/restore but before executing the instructions
>> in the ring. These WA cannot be applied using the normal means, so WA batch
>> buffers are created for this purpose. Each context has two registers to load
>> the offsets of these batch buffers; if they are non-zero, HW understands that
>> it needs to execute these batches.
>>
>> v1: In this version two separate ring_buffer objects were used to load WA
>> instructions for the indirect and per-context batch buffers, and they were
>> part of every context.
>>
>> v2: Chris suggested including an additional page in the context and using it
>> to load these WA instead of creating separate objects. This simplifies a lot
>> of things as we need not explicitly pin/unpin them. Thomas Daniel further
>> pointed out that GuC is planning to use a similar setup to share data between
>> GuC and the driver, and WA batch buffers could probably share that page.
>> However, after discussions with Dave, who is implementing the GuC changes, he
>> suggested using an independent page for these reasons: the GuC area might grow,
>> and these WA are initialized only once and are not changed afterwards, so we
>> can share them across all contexts.
>>
>> The page is updated with the WA during render ring init. This has the
>> advantage of not adding more special cases to default_context.
>>
>> We don't know upfront the number of WA we will be applying using these batch
>> buffers; fixing the size in advance, as an earlier version did, is therefore
>> not a good idea. Instead, the functions that load instructions are modified to
>> report the number of commands inserted, and the size is calculated only after
>> the batch is updated. A macro is introduced to add commands to these batch
>> buffers which also checks for overflow and returns an error.
>> We have a full page dedicated to these WA, which should be sufficient for a
>> good number of WA; anything more means we have major issues.
>> The list for Gen8 is small, and the same holds for Gen9; maybe a few more get
>> added going forward, but nowhere close to filling the entire page. Chris
>> suggested a two-pass approach, but we agreed to go with the single-page setup
>> as it is a one-off routine and simpler code wins.
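The overflow check is just a bounds test on the page: every emit is checked
against PAGE_SIZE and bails out with -ENOSPC if we would write past it. A
sketch matching the wa_ctx_emit() macro in the diff below, where 'index' is
the current DWORD position within the page:

    #define wa_ctx_emit(batch, cmd)                                     \
        do {                                                            \
            if (WARN_ON(index >= (PAGE_SIZE / sizeof(uint32_t))))      \
                return -ENOSPC;                                         \
            (batch)[index++] = (cmd);                                   \
        } while (0)

    wa_ctx_emit(batch, MI_NOOP);    /* one bounds-checked command */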
>>
>> One additional option is the offset field, which is helpful if we would like
>> to have multiple batches at different offsets within the page and select them
>> based on some criteria. This is not a requirement at this point but could
>> help in the future (Dave). See the sketch below this paragraph.
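The struct already carries per-batch offset/size (see the intel_ringbuffer.h
hunk below), so selecting among batches would just mean picking a different
i915_wa_ctx_bb. A purely hypothetical sketch — 'alt_ctx' and 'use_alternate'
do not exist in this patch:

    /* Hypothetical: choose between two batches pre-built in the same page. */
    struct i915_wa_ctx_bb *bb = use_alternate ? &wa_ctx->alt_ctx
                                              : &wa_ctx->indirect_ctx;
    reg_state[CTX_RCS_INDIRECT_CTX+1] =
        (ggtt_offset + bb->offset * sizeof(uint32_t)) |
        (bb->size / CACHELINE_DWORDS);

Nothing in this patch does that yet; today the indirect batch always starts
at the beginning of the page with the per-ctx batch right behind it.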
>>
>> Chris provided some helpful macros and suggestions which further simplified
>> the code; they will also help in reducing code duplication when WA for
>> other Gens are added. Add detailed comments explaining the restrictions.
>> Use do {} while (0) for the wa_ctx_emit() macro.
>>
>> (Many thanks to Chris, Dave and Thomas for their reviews and inputs)
>>
>> Cc: Chris Wilson <chris at chris-wilson.co.uk>
>> Cc: Dave Gordon <david.s.gordon at intel.com>
>> Signed-off-by: Rafael Barbalho <rafael.barbalho at intel.com>
>> Signed-off-by: Arun Siluvery <arun.siluvery at linux.intel.com>
>> Reviewed-by: Chris Wilson <chris at chris-wilson.co.uk>
>
> Why did you resend this one? I don't spot any updates in the commit
> message. Also, when resending, please send it in-reply-to the corresponding
> previous version of that patch, not the cover letter of the series.

Hi Daniel,

This is the updated patch with do {} while (0).
I picked the message-id of the cover letter by mistake, which is why this is
in reply to the cover letter instead of the corresponding patch.

regards
Arun

> -Daniel
>
>> ---
>>   drivers/gpu/drm/i915/intel_lrc.c        | 223 +++++++++++++++++++++++++++++++-
>>   drivers/gpu/drm/i915/intel_ringbuffer.h |  21 +++
>>   2 files changed, 240 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>> index 0413b8f..0585298 100644
>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>> @@ -211,6 +211,7 @@ enum {
>>   	FAULT_AND_CONTINUE /* Unsupported */
>>   };
>>   #define GEN8_CTX_ID_SHIFT 32
>> +#define CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT  0x17
>>
>>   static int intel_lr_context_pin(struct intel_engine_cs *ring,
>>   		struct intel_context *ctx);
>> @@ -1077,6 +1078,191 @@ static int intel_logical_ring_workarounds_emit(struct intel_engine_cs *ring,
>>   	return 0;
>>   }
>>
>> +#define wa_ctx_emit(batch, cmd)						\
>> +	do {								\
>> +		if (WARN_ON(index >= (PAGE_SIZE / sizeof(uint32_t)))) {	\
>> +			return -ENOSPC;					\
>> +		}							\
>> +		batch[index++] = (cmd);					\
>> +	} while (0)
>> +
>> +static inline uint32_t wa_ctx_start(struct i915_wa_ctx_bb *wa_ctx,
>> +				    uint32_t offset,
>> +				    uint32_t start_alignment)
>> +{
>> +	return wa_ctx->offset = ALIGN(offset, start_alignment);
>> +}
>> +
>> +static inline int wa_ctx_end(struct i915_wa_ctx_bb *wa_ctx,
>> +			     uint32_t offset,
>> +			     uint32_t size_alignment)
>> +{
>> +	wa_ctx->size = offset - wa_ctx->offset;
>> +
>> +	WARN(wa_ctx->size % size_alignment,
>> +	     "wa_ctx_bb failed sanity checks: size %d is not aligned to %d\n",
>> +	     wa_ctx->size, size_alignment);
>> +	return 0;
>> +}
>> +
>> +/**
>> + * gen8_init_indirectctx_bb() - initialize indirect ctx batch with WA
>> + *
>> + * @ring: only applicable for RCS
>> + * @wa_ctx: structure representing wa_ctx
>> + *  offset: specifies start of the batch, should be cache-aligned. This is updated
>> + *    with the offset value received as input.
>> + *  size: size of the batch in DWORDS but HW expects in terms of cachelines
>> + * @batch: page in which WA are loaded
>> + * @offset: This field specifies the start of the batch, it should be
>> + *  cache-aligned otherwise it is adjusted accordingly.
>> + *  Typically we only have one indirect_ctx and per_ctx batch buffer which are
>> + *  initialized at the beginning and shared across all contexts but this field
>> + *  helps us to have multiple batches at different offsets and select them based
>> + *  on some criteria. At the moment this batch always starts at the beginning
>> + *  of the page and we don't have multiple wa_ctx batch buffers.
>> + *
>> + *  The number of WA applied is not known at the beginning; we use this field
>> + *  to return the number of DWORDS written.
>> + *
>> + *  It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
>> + *  so it adds NOOPs as padding to make it cacheline aligned.
>> + *  MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
>> + *  makes a complete batch buffer.
>> + *
>> + * Return: non-zero if we exceed the PAGE_SIZE limit.
>> + */
>> +
>> +static int gen8_init_indirectctx_bb(struct intel_engine_cs *ring,
>> +				    struct i915_wa_ctx_bb *wa_ctx,
>> +				    uint32_t *const batch,
>> +				    uint32_t *offset)
>> +{
>> +	uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);
>> +
>> +	/* FIXME: Replace me with WA */
>> +	wa_ctx_emit(batch, MI_NOOP);
>> +
>> +	/* Pad to end of cacheline */
>> +	while (index % CACHELINE_DWORDS)
>> +		wa_ctx_emit(batch, MI_NOOP);
>> +
>> +	/*
>> +	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
>> +	 * execution depends on the length specified in terms of cache lines
>> +	 * in the register CTX_RCS_INDIRECT_CTX
>> +	 */
>> +
>> +	return wa_ctx_end(wa_ctx, *offset = index, CACHELINE_DWORDS);
>> +}
>> +
>> +/**
>> + * gen8_init_perctx_bb() - initialize per ctx batch with WA
>> + *
>> + * @ring: only applicable for RCS
>> + * @wa_ctx: structure representing wa_ctx
>> + *  offset: specifies start of the batch, should be cache-aligned.
>> + *  size: size of the batch in DWORDS but HW expects in terms of cachelines
>> + * @offset: This field specifies the start of this batch.
>> + *   This batch is started immediately after indirect_ctx batch. Since we ensure
>> + *   that indirect_ctx ends on a cacheline this batch is aligned automatically.
>> + *
>> + *   The number of DWORDS written is returned using this field.
>> + *
>> + *  This batch is terminated with MI_BATCH_BUFFER_END, so we need not pad it
>> + *  to a cacheline; padding after MI_BATCH_BUFFER_END is redundant.
>> + */
>> +static int gen8_init_perctx_bb(struct intel_engine_cs *ring,
>> +			       struct i915_wa_ctx_bb *wa_ctx,
>> +			       uint32_t *const batch,
>> +			       uint32_t *offset)
>> +{
>> +	uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);
>> +
>> +	wa_ctx_emit(batch, MI_BATCH_BUFFER_END);
>> +
>> +	return wa_ctx_end(wa_ctx, *offset = index, 1);
>> +}
>> +
>> +static int lrc_setup_wa_ctx_obj(struct intel_engine_cs *ring, u32 size)
>> +{
>> +	int ret;
>> +
>> +	ring->wa_ctx.obj = i915_gem_alloc_object(ring->dev, PAGE_ALIGN(size));
>> +	if (!ring->wa_ctx.obj) {
>> +		DRM_DEBUG_DRIVER("alloc LRC WA ctx backing obj failed.\n");
>> +		return -ENOMEM;
>> +	}
>> +
>> +	ret = i915_gem_obj_ggtt_pin(ring->wa_ctx.obj, PAGE_SIZE, 0);
>> +	if (ret) {
>> +		DRM_DEBUG_DRIVER("pin LRC WA ctx backing obj failed: %d\n",
>> +				 ret);
>> +		drm_gem_object_unreference(&ring->wa_ctx.obj->base);
>> +		return ret;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static void lrc_destroy_wa_ctx_obj(struct intel_engine_cs *ring)
>> +{
>> +	if (ring->wa_ctx.obj) {
>> +		i915_gem_object_ggtt_unpin(ring->wa_ctx.obj);
>> +		drm_gem_object_unreference(&ring->wa_ctx.obj->base);
>> +		ring->wa_ctx.obj = NULL;
>> +	}
>> +}
>> +
>> +static int intel_init_workaround_bb(struct intel_engine_cs *ring)
>> +{
>> +	int ret;
>> +	uint32_t *batch;
>> +	uint32_t offset;
>> +	struct page *page;
>> +	struct i915_ctx_workarounds *wa_ctx = &ring->wa_ctx;
>> +
>> +	WARN_ON(ring->id != RCS);
>> +
>> +	ret = lrc_setup_wa_ctx_obj(ring, PAGE_SIZE);
>> +	if (ret) {
>> +		DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
>> +		return ret;
>> +	}
>> +
>> +	page = i915_gem_object_get_page(wa_ctx->obj, 0);
>> +	batch = kmap_atomic(page);
>> +	offset = 0;
>> +
>> +	if (INTEL_INFO(ring->dev)->gen == 8) {
>> +		ret = gen8_init_indirectctx_bb(ring,
>> +					       &wa_ctx->indirect_ctx,
>> +					       batch,
>> +					       &offset);
>> +		if (ret)
>> +			goto out;
>> +
>> +		ret = gen8_init_perctx_bb(ring,
>> +					  &wa_ctx->per_ctx,
>> +					  batch,
>> +					  &offset);
>> +		if (ret)
>> +			goto out;
>> +	} else {
>> +		WARN(INTEL_INFO(ring->dev)->gen >= 8,
>> +		     "WA batch buffer is not initialized for Gen%d\n",
>> +		     INTEL_INFO(ring->dev)->gen);
>> +		lrc_destroy_wa_ctx_obj(ring);
>> +	}
>> +
>> +out:
>> +	kunmap_atomic(batch);
>> +	if (ret)
>> +		lrc_destroy_wa_ctx_obj(ring);
>> +
>> +	return ret;
>> +}
>> +
>>   static int gen8_init_common_ring(struct intel_engine_cs *ring)
>>   {
>>   	struct drm_device *dev = ring->dev;
>> @@ -1411,6 +1597,8 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *ring)
>>   		kunmap(sg_page(ring->status_page.obj->pages->sgl));
>>   		ring->status_page.obj = NULL;
>>   	}
>> +
>> +	lrc_destroy_wa_ctx_obj(ring);
>>   }
>>
>>   static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *ring)
>> @@ -1474,7 +1662,22 @@ static int logical_render_ring_init(struct drm_device *dev)
>>   	if (ret)
>>   		return ret;
>>
>> -	return intel_init_pipe_control(ring);
>> +	ret = intel_init_workaround_bb(ring);
>> +	if (ret) {
>> +		/*
>> +		 * We continue even if we fail to initialize WA batch
>> +		 * because we only expect rare glitches, nothing
>> +		 * critical enough to prevent us from using the GPU
>> +		 */
>> +		DRM_ERROR("WA batch buffer initialization failed: %d\n",
>> +			  ret);
>> +	}
>> +
>> +	ret = intel_init_pipe_control(ring);
>> +	if (ret)
>> +		lrc_destroy_wa_ctx_obj(ring);
>> +
>> +	return ret;
>>   }
>>
>>   static int logical_bsd_ring_init(struct drm_device *dev)
>> @@ -1754,15 +1957,27 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o
>>   	reg_state[CTX_SECOND_BB_STATE] = ring->mmio_base + 0x118;
>>   	reg_state[CTX_SECOND_BB_STATE+1] = 0;
>>   	if (ring->id == RCS) {
>> -		/* TODO: according to BSpec, the register state context
>> -		 * for CHV does not have these. OTOH, these registers do
>> -		 * exist in CHV. I'm waiting for a clarification */
>>   		reg_state[CTX_BB_PER_CTX_PTR] = ring->mmio_base + 0x1c0;
>>   		reg_state[CTX_BB_PER_CTX_PTR+1] = 0;
>>   		reg_state[CTX_RCS_INDIRECT_CTX] = ring->mmio_base + 0x1c4;
>>   		reg_state[CTX_RCS_INDIRECT_CTX+1] = 0;
>>   		reg_state[CTX_RCS_INDIRECT_CTX_OFFSET] = ring->mmio_base + 0x1c8;
>>   		reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] = 0;
>> +		if (ring->wa_ctx.obj) {
>> +			struct i915_ctx_workarounds *wa_ctx = &ring->wa_ctx;
>> +			uint32_t ggtt_offset = i915_gem_obj_ggtt_offset(wa_ctx->obj);
>> +
>> +			reg_state[CTX_RCS_INDIRECT_CTX+1] =
>> +				(ggtt_offset + wa_ctx->indirect_ctx.offset * sizeof(uint32_t)) |
>> +				(wa_ctx->indirect_ctx.size / CACHELINE_DWORDS);
>> +
>> +			reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] =
>> +				CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT << 6;
>> +
>> +			reg_state[CTX_BB_PER_CTX_PTR+1] =
>> +				(ggtt_offset + wa_ctx->per_ctx.offset * sizeof(uint32_t)) |
>> +				0x01;
>> +		}
>>   	}
>>   	reg_state[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9);
>>   	reg_state[CTX_LRI_HEADER_1] |= MI_LRI_FORCE_POSTED;
>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> index 39f6dfc..06467c6 100644
>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> @@ -12,6 +12,7 @@
>>    * workarounds!
>>    */
>>   #define CACHELINE_BYTES 64
>> +#define CACHELINE_DWORDS (CACHELINE_BYTES / sizeof(uint32_t))
>>
>>   /*
>>    * Gen2 BSpec "1. Programming Environment" / 1.4.4.6 "Ring Buffer Use"
>> @@ -119,6 +120,25 @@ struct intel_ringbuffer {
>>
>>   struct	intel_context;
>>
>> +/*
>> + * we use a single page to load ctx workarounds so all of these
>> + * values are expressed in terms of dwords
>> + *
>> + * struct i915_wa_ctx_bb:
>> + *  offset: specifies batch starting position, also helpful in case
>> + *    we want to have multiple batches at different offsets based on
>> + *    some criteria. It is not a requirement at the moment but provides
>> + *    an option for future use.
>> + *  size: size of the batch in DWORDS
>> + */
>> +struct  i915_ctx_workarounds {
>> +	struct i915_wa_ctx_bb {
>> +		u32 offset;
>> +		u32 size;
>> +	} indirect_ctx, per_ctx;
>> +	struct drm_i915_gem_object *obj;
>> +};
>> +
>>   struct  intel_engine_cs {
>>   	const char	*name;
>>   	enum intel_ring_id {
>> @@ -142,6 +162,7 @@ struct  intel_engine_cs {
>>   	struct i915_gem_batch_pool batch_pool;
>>
>>   	struct intel_hw_status_page status_page;
>> +	struct i915_ctx_workarounds wa_ctx;
>>
>>   	unsigned irq_refcount; /* protected by dev_priv->irq_lock */
>>   	u32		irq_enable_mask;	/* bitmask to enable ring interrupt */
>> --
>> 2.3.0
>>
>> _______________________________________________
>> Intel-gfx mailing list
>> Intel-gfx at lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
>


