[Intel-gfx] [PATCH v6 6/6] drm/i915/gen8: Add WaRsRestoreWithPerCtxtBb workaround

Tue Jun 23 07:48:33 PDT 2015

On 22/06/2015 17:59, Siluvery, Arun wrote:
> On 22/06/2015 17:21, Ville Syrjälä wrote:
>> On Fri, Jun 19, 2015 at 06:37:15PM +0100, Arun Siluvery wrote:
>>> In Per context w/a batch buffer,
>>> WaRsRestoreWithPerCtxtBb
>>>
>>> This WA performs writes to the scratch page, so it must be valid; this check
>>> is performed before initializing the batch with this WA.
>>>
>>> v2: This patch modifies the definitions of MI_LOAD_REGISTER_MEM and
>>> MI_LOAD_REGISTER_REG; add GEN8-specific defines for these instructions
>>> so as to not break any future users of the existing definitions (Michel)
>>>
>>> v3: The length in the current definitions of the LRM and LRR instructions was specified
>>> as 0. This seems to be a common convention for instructions whose length varies between
>>> platforms. It has not been an issue so far because they are not used anywhere except the
>>> command parser; now that we use them in this patch, update them with the correct length
>>> and also move them out of the command parser placeholder to the appropriate place.
>>> Remove unnecessary padding and follow the WA programming sequence exactly
>>> as mentioned in the spec, which is essential for this WA (Dave).
>>>
>>> Cc: Chris Wilson <chris at chris-wilson.co.uk>
>>> Cc: Dave Gordon <david.s.gordon at intel.com>
>>> Signed-off-by: Rafael Barbalho <rafael.barbalho at intel.com>
>>> Signed-off-by: Arun Siluvery <arun.siluvery at linux.intel.com>
>>> ---
>>>    drivers/gpu/drm/i915/i915_reg.h  | 29 +++++++++++++++++++--
>>>    drivers/gpu/drm/i915/intel_lrc.c | 54 ++++++++++++++++++++++++++++++++++++++++
>>>    2 files changed, 81 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
>>> index 7637e64..208620d 100644
>>> --- a/drivers/gpu/drm/i915/i915_reg.h
>>> +++ b/drivers/gpu/drm/i915/i915_reg.h
>>> @@ -347,6 +347,31 @@
>>>    #define   MI_INVALIDATE_BSD		(1<<7)
>>>    #define   MI_FLUSH_DW_USE_GTT		(1<<2)
>>>    #define   MI_FLUSH_DW_USE_PPGTT		(0<<2)
>>> +#define MI_LOAD_REGISTER_MEM    MI_INSTR(0x29, 1)
>>> +#define MI_LOAD_REGISTER_MEM_GEN8 MI_INSTR(0x29, 2)
>>> +#define   MI_LRM_USE_GLOBAL_GTT (1<<22)
>>> +#define   MI_LRM_ASYNC_MODE_ENABLE (1<<21)
>>> +#define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 1)
>>> +#define MI_ATOMIC(len)	MI_INSTR(0x2F, (len-2))
>>> +#define   MI_ATOMIC_MEMORY_TYPE_GGTT	(1<<22)
>>> +#define   MI_ATOMIC_INLINE_DATA		(1<<18)
>>> +#define   MI_ATOMIC_CS_STALL		(1<<17)
>>> +#define   MI_ATOMIC_RETURN_DATA_CTL	(1<<16)
>>> +#define MI_ATOMIC_OP_MASK(op)  ((op) << 8)
>>> +#define MI_ATOMIC_AND	MI_ATOMIC_OP_MASK(0x01)
>>> +#define MI_ATOMIC_OR	MI_ATOMIC_OP_MASK(0x02)
>>> +#define MI_ATOMIC_XOR	MI_ATOMIC_OP_MASK(0x03)
>>> +#define MI_ATOMIC_MOVE	MI_ATOMIC_OP_MASK(0x04)
>>> +#define MI_ATOMIC_INC	MI_ATOMIC_OP_MASK(0x05)
>>> +#define MI_ATOMIC_DEC	MI_ATOMIC_OP_MASK(0x06)
>>> +#define MI_ATOMIC_ADD	MI_ATOMIC_OP_MASK(0x07)
>>> +#define MI_ATOMIC_SUB	MI_ATOMIC_OP_MASK(0x08)
>>> +#define MI_ATOMIC_RSUB	MI_ATOMIC_OP_MASK(0x09)
>>> +#define MI_ATOMIC_IMAX	MI_ATOMIC_OP_MASK(0x0A)
>>> +#define MI_ATOMIC_IMIN	MI_ATOMIC_OP_MASK(0x0B)
>>> +#define MI_ATOMIC_UMAX	MI_ATOMIC_OP_MASK(0x0C)
>>> +#define MI_ATOMIC_UMIN	MI_ATOMIC_OP_MASK(0x0D)
>>> +
>>>    #define MI_BATCH_BUFFER		MI_INSTR(0x30, 1)
>>>    #define   MI_BATCH_NON_SECURE		(1)
>>>    /* for snb/ivb/vlv this also means "batch in ppgtt" when ppgtt is enabled. */
>>> @@ -451,8 +476,6 @@
>>>    #define MI_CLFLUSH              MI_INSTR(0x27, 0)
>>>    #define MI_REPORT_PERF_COUNT    MI_INSTR(0x28, 0)
>>>    #define   MI_REPORT_PERF_COUNT_GGTT (1<<0)
>>> -#define MI_LOAD_REGISTER_MEM    MI_INSTR(0x29, 0)
>>> -#define MI_LOAD_REGISTER_REG    MI_INSTR(0x2A, 0)
>>>    #define MI_RS_STORE_DATA_IMM    MI_INSTR(0x2B, 0)
>>>    #define MI_LOAD_URB_MEM         MI_INSTR(0x2C, 0)
>>>    #define MI_STORE_URB_MEM        MI_INSTR(0x2D, 0)
>>> @@ -1799,6 +1822,8 @@ enum skl_disp_power_wells {
>>>    #define   GEN8_RC_SEMA_IDLE_MSG_DISABLE	(1 << 12)
>>>    #define   GEN8_FF_DOP_CLOCK_GATE_DISABLE	(1<<10)
>>>
>>> +#define GEN8_RS_PREEMPT_STATUS		0x215C
>>> +
>>>    /* Fuse readout registers for GT */
>>>    #define CHV_FUSE_GT			(VLV_DISPLAY_BASE + 0x2168)
>>>    #define   CHV_FGT_DISABLE_SS0		(1 << 10)
>>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>>> index 664455c..28198c4 100644
>>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>>> @@ -1215,11 +1215,65 @@ static int gen8_init_perctx_bb(struct intel_engine_cs *ring,
>>>    			       uint32_t *const batch,
>>>    			       uint32_t *offset)
>>>    {
>>> +	uint32_t scratch_addr;
>>>    	uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);
>>>
>>> +	/* Actual scratch location is at 128 bytes offset */
>>> +	scratch_addr = ring->scratch.gtt_offset + 2*CACHELINE_BYTES;
>>> +	scratch_addr |= PIPE_CONTROL_GLOBAL_GTT;
>>> +
>>>    	/* WaDisableCtxRestoreArbitration:bdw,chv */
>>>    	wa_ctx_emit(batch, MI_ARB_ON_OFF | MI_ARB_ENABLE);
>>>
>>> +	/*
>>> +	 * As per Bspec, to workaround a known HW issue, SW must perform the
>>> +	 * below programming sequence prior to programming MI_BATCH_BUFFER_END.
>>> +	 *
>>> +	 * This is only applicable for Gen8.
>>> +	 */
>>> +
>>> +	/* WaRsRestoreWithPerCtxtBb:bdw,chv */
>>
>> This w/a doesn't seem to be needed for CHV. Also BDW seems to have
>> gained a chicken bit in H0 (FF_SLICE_CS_CHICKEN3[5]) that supposedly
>> means we shouldn't need this w/a on BDW either.
>>
> It looks like this chicken bit applies this WA. If it is working as
> expected then we can ignore this patch; I will try to get some
> confirmation on this.

I got confirmation from the HW team that the chicken bit is enough; this patch can be
ignored.

regards
Arun

>
> regards
> Arun
>
>>> +	wa_ctx_emit(batch, MI_LOAD_REGISTER_IMM(1));
>>> +	wa_ctx_emit(batch, INSTPM);
>>> +	wa_ctx_emit(batch, _MASKED_BIT_DISABLE(INSTPM_FORCE_ORDERING));
>>> +
>>> +	wa_ctx_emit(batch, (MI_ATOMIC(5) |
>>> +			    MI_ATOMIC_MEMORY_TYPE_GGTT |
>>> +			    MI_ATOMIC_INLINE_DATA |
>>> +			    MI_ATOMIC_CS_STALL |
>>> +			    MI_ATOMIC_RETURN_DATA_CTL |
>>> +			    MI_ATOMIC_MOVE));
>>> +	wa_ctx_emit(batch, scratch_addr);
>>> +	wa_ctx_emit(batch, 0);
>>> +	wa_ctx_emit(batch, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
>>> +	wa_ctx_emit(batch, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
>>> +
>>> +	/*
>>> +	 * BSpec says MI_LOAD_REGISTER_MEM, MI_LOAD_REGISTER_REG and
>>> +	 * MI_BATCH_BUFFER_END instructions in this sequence need to be
>>> +	 * in the same cacheline. To satisfy this case even if more WA are
>>> +	 * added in future, pad current cacheline and start remaining sequence
>>> +	 * in new cacheline.
>>> +	 */
>>> +	while (index % CACHELINE_DWORDS)
>>> +		wa_ctx_emit(batch, MI_NOOP);
>>> +
>>> +	wa_ctx_emit(batch, (MI_LOAD_REGISTER_MEM_GEN8 |
>>> +			    MI_LRM_USE_GLOBAL_GTT |
>>> +			    MI_LRM_ASYNC_MODE_ENABLE));
>>> +	wa_ctx_emit(batch, INSTPM);
>>> +	wa_ctx_emit(batch, scratch_addr);
>>> +	wa_ctx_emit(batch, 0);
>>> +
>>> +	/*
>>> +	 * BSpec says there should not be any commands programmed
>>> +	 * between MI_LOAD_REGISTER_REG and MI_BATCH_BUFFER_END so
>>> +	 * do not add any new commands
>>> +	 */
>>> +	wa_ctx_emit(batch, MI_LOAD_REGISTER_REG);
>>> +	wa_ctx_emit(batch, GEN8_RS_PREEMPT_STATUS);
>>> +	wa_ctx_emit(batch, GEN8_RS_PREEMPT_STATUS);
>>> +
>>>    	wa_ctx_emit(batch, MI_BATCH_BUFFER_END);
>>>
>>>    	return wa_ctx_end(wa_ctx, *offset = index, 1);
>>> --
>>> 2.3.0
>>>
>>> _______________________________________________
>>> Intel-gfx mailing list
>>> Intel-gfx at lists.freedesktop.org
>>> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
>>
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
>