[Intel-gfx] [PATCH 1/2] drm/i915/bdw: Apply workarounds in render ring init function
Siluvery, Arun
arun.siluvery at linux.intel.com
Tue Aug 26 11:27:10 CEST 2014
On 25/08/2014 13:18, Ville Syrjälä wrote:
> On Fri, Aug 22, 2014 at 08:39:11PM +0100, Arun Siluvery wrote:
>> For BDW workarounds are currently initialized in init_clock_gating() but
>> they are lost during reset, suspend/resume etc; this patch moves the WAs
>> that are part of register state context to render ring init fn otherwise
>> default context ends up with incorrect values as they don't get initialized
>> until init_clock_gating fn.
>>
>> v2: Add workarounds to golden render state
>> This method has its own issues, first of all this is different for
>> each gen and it is generated using a tool so adding new workaround
>> and mainitaining them across gens is not a straightforward process.
>>
>> v3: Use LRIs to emit these workarounds (Ville)
>> Instead of modifying the golden render state the same LRIs are
>> emitted from within the driver.
>>
>> For: VIZ-4092
>> Signed-off-by: Arun Siluvery <arun.siluvery at linux.intel.com>
>> ---
>> drivers/gpu/drm/i915/i915_gem_context.c | 6 +++
>> drivers/gpu/drm/i915/intel_pm.c | 48 ----------------------
>> drivers/gpu/drm/i915/intel_ringbuffer.c | 70 +++++++++++++++++++++++++++++++++
>> drivers/gpu/drm/i915/intel_ringbuffer.h | 7 ++++
>> 4 files changed, 83 insertions(+), 48 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
>> index 9683e62..2debce4 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_context.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
>> @@ -631,20 +631,26 @@ static int do_switch(struct intel_engine_cs *ring,
>> }
>>
>> uninitialized = !to->legacy_hw_ctx.initialized && from == NULL;
>> to->legacy_hw_ctx.initialized = true;
>>
>> done:
>> i915_gem_context_reference(to);
>> ring->last_context = to;
>>
>> if (uninitialized) {
>> + if (IS_BROADWELL(ring->dev)) {
>> + ret = bdw_init_workarounds(ring);
>> + if (ret)
>> + DRM_ERROR("init workarounds: %d\n", ret);
>> + }
>> +
>> ret = i915_gem_render_state_init(ring);
>> if (ret)
>> DRM_ERROR("init render state: %d\n", ret);
>> }
>>
>> return 0;
>>
>> unpin_out:
>> if (ring->id == RCS)
>> i915_gem_object_ggtt_unpin(to->legacy_hw_ctx.rcs_state);
>> diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
>> index c8f744c..668acd9 100644
>> --- a/drivers/gpu/drm/i915/intel_pm.c
>> +++ b/drivers/gpu/drm/i915/intel_pm.c
>> @@ -5507,101 +5507,53 @@ static void gen8_init_clock_gating(struct drm_device *dev)
>> struct drm_i915_private *dev_priv = dev->dev_private;
>> enum pipe pipe;
>>
>> I915_WRITE(WM3_LP_ILK, 0);
>> I915_WRITE(WM2_LP_ILK, 0);
>> I915_WRITE(WM1_LP_ILK, 0);
>>
>> /* FIXME(BDW): Check all the w/a, some might only apply to
>> * pre-production hw. */
>>
>> - /* WaDisablePartialInstShootdown:bdw */
>> - I915_WRITE(GEN8_ROW_CHICKEN,
>> - _MASKED_BIT_ENABLE(PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE));
>> -
>> - /* WaDisableThreadStallDopClockGating:bdw */
>> - /* FIXME: Unclear whether we really need this on production bdw. */
>> - I915_WRITE(GEN8_ROW_CHICKEN,
>> - _MASKED_BIT_ENABLE(STALL_DOP_GATING_DISABLE));
>>
>> - /*
>> - * This GEN8_CENTROID_PIXEL_OPT_DIS W/A is only needed for
>> - * pre-production hardware
>> - */
>> - I915_WRITE(HALF_SLICE_CHICKEN3,
>> - _MASKED_BIT_ENABLE(GEN8_CENTROID_PIXEL_OPT_DIS));
>> - I915_WRITE(HALF_SLICE_CHICKEN3,
>> - _MASKED_BIT_ENABLE(GEN8_SAMPLER_POWER_BYPASS_DIS));
>> I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_BWGTLB_DISABLE));
>>
>> I915_WRITE(_3D_CHICKEN3,
>> _MASKED_BIT_ENABLE(_3D_CHICKEN_SDE_LIMIT_FIFO_POLY_DEPTH(2)));
>>
>> - I915_WRITE(COMMON_SLICE_CHICKEN2,
>> - _MASKED_BIT_ENABLE(GEN8_CSC2_SBE_VUE_CACHE_CONSERVATIVE));
>> -
>> - I915_WRITE(GEN7_HALF_SLICE_CHICKEN1,
>> - _MASKED_BIT_ENABLE(GEN7_SINGLE_SUBSCAN_DISPATCH_ENABLE));
>> -
>> - /* WaDisableDopClockGating:bdw May not be needed for production */
>> - I915_WRITE(GEN7_ROW_CHICKEN2,
>> - _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE));
>>
>> /* WaSwitchSolVfFArbitrationPriority:bdw */
>> I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) | HSW_ECOCHK_ARB_PRIO_SOL);
>>
>> /* WaPsrDPAMaskVBlankInSRD:bdw */
>> I915_WRITE(CHICKEN_PAR1_1,
>> I915_READ(CHICKEN_PAR1_1) | DPA_MASK_VBLANK_SRD);
>>
>> /* WaPsrDPRSUnmaskVBlankInSRD:bdw */
>> for_each_pipe(pipe) {
>> I915_WRITE(CHICKEN_PIPESL_1(pipe),
>> I915_READ(CHICKEN_PIPESL_1(pipe)) |
>> BDW_DPRS_MASK_VBLANK_SRD);
>> }
>>
>> - /* Use Force Non-Coherent whenever executing a 3D context. This is a
>> - * workaround for for a possible hang in the unlikely event a TLB
>> - * invalidation occurs during a PSD flush.
>> - */
>> - I915_WRITE(HDC_CHICKEN0,
>> - I915_READ(HDC_CHICKEN0) |
>> - _MASKED_BIT_ENABLE(HDC_FORCE_NON_COHERENT));
>> -
>> /* WaVSRefCountFullforceMissDisable:bdw */
>> /* WaDSRefCountFullforceMissDisable:bdw */
>> I915_WRITE(GEN7_FF_THREAD_MODE,
>> I915_READ(GEN7_FF_THREAD_MODE) &
>> ~(GEN8_FF_DS_REF_CNT_FFME | GEN7_FF_VS_REF_CNT_FFME));
>>
>> - /*
>> - * BSpec recommends 8x4 when MSAA is used,
>> - * however in practice 16x4 seems fastest.
>> - *
>> - * Note that PS/WM thread counts depend on the WIZ hashing
>> - * disable bit, which we don't touch here, but it's good
>> - * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
>> - */
>> - I915_WRITE(GEN7_GT_MODE,
>> - GEN6_WIZ_HASHING_MASK | GEN6_WIZ_HASHING_16x4);
>> -
>> I915_WRITE(GEN6_RC_SLEEP_PSMI_CONTROL,
>> _MASKED_BIT_ENABLE(GEN8_RC_SEMA_IDLE_MSG_DISABLE));
>>
>> /* WaDisableSDEUnitClockGating:bdw */
>> I915_WRITE(GEN8_UCGCTL6, I915_READ(GEN8_UCGCTL6) |
>> GEN8_SDEUNIT_CLOCK_GATE_DISABLE);
>> -
>> - /* Wa4x4STCOptimizationDisable:bdw */
>> - I915_WRITE(CACHE_MODE_1,
>> - _MASKED_BIT_ENABLE(GEN8_4x4_STC_OPTIMIZATION_DISABLE));
>> }
>>
>> static void haswell_init_clock_gating(struct drm_device *dev)
>> {
>> struct drm_i915_private *dev_priv = dev->dev_private;
>>
>> ilk_init_lp_watermarks(dev);
>>
>> /* L3 caching of data atomics doesn't work -- disable it. */
>> I915_WRITE(HSW_SCRATCH1, HSW_SCRATCH1_L3_DATA_ATOMICS_DISABLE);
>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
>> index 13543f8..9e24073 100644
>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
>> @@ -643,20 +643,90 @@ intel_init_pipe_control(struct intel_engine_cs *ring)
>> return 0;
>>
>> err_unpin:
>> i915_gem_object_ggtt_unpin(ring->scratch.obj);
>> err_unref:
>> drm_gem_object_unreference(&ring->scratch.obj->base);
>> err:
>> return ret;
>> }
>>
>> +int bdw_init_workarounds(struct intel_engine_cs *ring)
>> +{
>> + int ret;
>> +
>> + /*
>> + * workarounds applied in this fn are part of register state context,
>> + * they need to be re-initialized followed by gpu reset, suspend/resume,
>> + * module reload.
>> + */
>> +
>> + /*
>> + * update the number of dwords required based on the
>> + * actual number of workarounds applied
>> + */
>> + ret = intel_ring_begin(ring, 24);
>> + if (ret)
>> + return ret;
>> +
>> + /* WaDisablePartialInstShootdown:bdw */
>> + /* WaDisableThreadStallDopClockGating:bdw */
>> + /* FIXME: Unclear whether we really need this on production bdw. */
>> + INTEL_RING_EMIT_WA(ring, GEN8_ROW_CHICKEN,
>> + _MASKED_BIT_ENABLE(PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE
>> + | STALL_DOP_GATING_DISABLE));
>> +
>> + /* WaDisableDopClockGating:bdw May not be needed for production */
>> + INTEL_RING_EMIT_WA(ring, GEN7_ROW_CHICKEN2,
>> + _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE));
>> +
>> + /*
>> + * This GEN8_CENTROID_PIXEL_OPT_DIS W/A is only needed for
>> + * pre-production hardware
>> + */
>> + INTEL_RING_EMIT_WA(ring, HALF_SLICE_CHICKEN3,
>> + _MASKED_BIT_ENABLE(GEN8_CENTROID_PIXEL_OPT_DIS
>> + | GEN8_SAMPLER_POWER_BYPASS_DIS));
>> +
>> + INTEL_RING_EMIT_WA(ring, GEN7_HALF_SLICE_CHICKEN1,
>> + _MASKED_BIT_ENABLE(GEN7_SINGLE_SUBSCAN_DISPATCH_ENABLE));
>> +
>> + INTEL_RING_EMIT_WA(ring, COMMON_SLICE_CHICKEN2,
>> + _MASKED_BIT_ENABLE(GEN8_CSC2_SBE_VUE_CACHE_CONSERVATIVE));
>> +
>> + /* Use Force Non-Coherent whenever executing a 3D context. This is a
>> + * workaround for for a possible hang in the unlikely event a TLB
>> + * invalidation occurs during a PSD flush.
>> + */
>> + INTEL_RING_EMIT_WA(ring, HDC_CHICKEN0,
>> + _MASKED_BIT_ENABLE(HDC_FORCE_NON_COHERENT));
>> +
>> + /* Wa4x4STCOptimizationDisable:bdw */
>> + INTEL_RING_EMIT_WA(ring, CACHE_MODE_1,
>> + _MASKED_BIT_ENABLE(GEN8_4x4_STC_OPTIMIZATION_DISABLE));
>> +
>> + /*
>> + * BSpec recommends 8x4 when MSAA is used,
>> + * however in practice 16x4 seems fastest.
>> + *
>> + * Note that PS/WM thread counts depend on the WIZ hashing
>> + * disable bit, which we don't touch here, but it's good
>> + * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
>> + */
>> + INTEL_RING_EMIT_WA(ring, GEN7_GT_MODE,
>> + GEN6_WIZ_HASHING_MASK | GEN6_WIZ_HASHING_16x4);
>> +
>> + intel_ring_advance(ring);
>> +
>> + return 0;
>> +}
>> +
>> static int init_render_ring(struct intel_engine_cs *ring)
>> {
>> struct drm_device *dev = ring->dev;
>> struct drm_i915_private *dev_priv = dev->dev_private;
>> int ret = init_ring_common(ring);
>> if (ret)
>> return ret;
>>
>> /* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */
>> if (INTEL_INFO(dev)->gen >= 4 && INTEL_INFO(dev)->gen < 7)
>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> index 9cbf7b0..77fe667 100644
>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> @@ -66,20 +66,26 @@ struct intel_hw_status_page {
>> break; \
>> } \
>> ring->semaphore.signal_ggtt[RCS] = GEN8_SIGNAL_OFFSET(ring, RCS); \
>> ring->semaphore.signal_ggtt[VCS] = GEN8_SIGNAL_OFFSET(ring, VCS); \
>> ring->semaphore.signal_ggtt[BCS] = GEN8_SIGNAL_OFFSET(ring, BCS); \
>> ring->semaphore.signal_ggtt[VECS] = GEN8_SIGNAL_OFFSET(ring, VECS); \
>> ring->semaphore.signal_ggtt[VCS2] = GEN8_SIGNAL_OFFSET(ring, VCS2); \
>> ring->semaphore.signal_ggtt[ring->id] = MI_SEMAPHORE_SYNC_INVALID; \
>> } while(0)
>>
>> +#define INTEL_RING_EMIT_WA(_ring, _wa_reg, _wa_val) ({ \
>> + intel_ring_emit(_ring, MI_LOAD_REGISTER_IMM(1)); \
>> + intel_ring_emit(_ring, _wa_reg); \
>> + intel_ring_emit(_ring, _wa_val); \
>> + })
>
> do {} while (0) would suffice since the macro doesn't return anything.
> Not sure why it's a macro actually. A static function in intel_ringbuffer.c
> would do just as well. If you want to keep it a macro, the argument
> evaluations should have () around them.
>
> I was also going to say that you could just call it ring_emit_lri() or
> something, but I guess you plan to add the debugfs w/a stuff there which
> makes the current name more appropriate. I'll let others bikeshed the
> debugfs stuff since I have no real opinion on it.
>
> Apart from that the patch seems good to me.
>
It was started as a macro because I wanted to keep it similar as
I915_WRITE() as initially these workarounds are applied using register
writes, as you mentioned it could be a fn as well. I will change it as a
fn and send updated patch.
The debugfs part is mainly added to support testing. In i-g-t we need
the list of workarounds applied to compare their state before and after
test and adding them in i-g-t itself is not so nice; Daniel suggested to
generate the list from within the driver and export them to a debugfs file.
regards
Arun
>> +
>> enum intel_ring_hangcheck_action {
>> HANGCHECK_IDLE = 0,
>> HANGCHECK_WAIT,
>> HANGCHECK_ACTIVE,
>> HANGCHECK_ACTIVE_LOOP,
>> HANGCHECK_KICK,
>> HANGCHECK_HUNG,
>> };
>>
>> #define HANGCHECK_SCORE_RING_HUNG 31
>> @@ -411,20 +417,21 @@ int intel_ring_flush_all_caches(struct intel_engine_cs *ring);
>> int intel_ring_invalidate_all_caches(struct intel_engine_cs *ring);
>>
>> void intel_fini_pipe_control(struct intel_engine_cs *ring);
>> int intel_init_pipe_control(struct intel_engine_cs *ring);
>>
>> int intel_init_render_ring_buffer(struct drm_device *dev);
>> int intel_init_bsd_ring_buffer(struct drm_device *dev);
>> int intel_init_bsd2_ring_buffer(struct drm_device *dev);
>> int intel_init_blt_ring_buffer(struct drm_device *dev);
>> int intel_init_vebox_ring_buffer(struct drm_device *dev);
>> +int bdw_init_workarounds(struct intel_engine_cs *ring);
>>
>> u64 intel_ring_get_active_head(struct intel_engine_cs *ring);
>> void intel_ring_setup_status_page(struct intel_engine_cs *ring);
>>
>> static inline u32 intel_ring_get_tail(struct intel_ringbuffer *ringbuf)
>> {
>> return ringbuf->tail;
>> }
>>
>> static inline u32 intel_ring_get_seqno(struct intel_engine_cs *ring)
>> --
>> 2.0.4
>>
>> _______________________________________________
>> Intel-gfx mailing list
>> Intel-gfx at lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
>
More information about the Intel-gfx
mailing list