[Intel-gfx] [PATCH 1/2] drm/i915/bdw: Apply workarounds in render ring init function

Tue Aug 26 16:47:52 CEST 2014

On 26/08/2014 15:37, Ville Syrjälä wrote:
> On Tue, Aug 26, 2014 at 02:44:50PM +0100, Arun Siluvery wrote:
>> For BDW workarounds are currently initialized in init_clock_gating() but
>> they are lost during reset, suspend/resume etc; this patch moves the WAs
>> that are part of register state context to render ring init fn otherwise
>> default context ends up with incorrect values as they don't get initialized
>> until init_clock_gating fn.
>>
>> v2: Add workarounds to golden render state
>> This method has its own issues, first of all this is different for
>> each gen and it is generated using a tool so adding new workaround
>> and mainitaining them across gens is not a straightforward process.
>>
>> v3: Use LRIs to emit these workarounds (Ville)
>> Instead of modifying the golden render state the same LRIs are
>> emitted from within the driver.
>>
>> v4: Use abstract name when exporting gen specific routines (Chris)
>>
>> For: VIZ-4092
>> Signed-off-by: Arun Siluvery <arun.siluvery at linux.intel.com>
>
> This one looks good as far as I'm concerned.
> Reviewed-by: Ville Syrjälä <ville.syrjala at linux.intel.com>
>
> Do you plan to give other platforms the same treatment? We need at least
> CHV converted ASAP. But if you don't have a test machine I can take care
> of that myself.
>
I don't have hardware for CHV, I can borrow and try to do but since it 
is required at the earliest could you please modify it for CHV?

regards
Arun

>> ---
>>   drivers/gpu/drm/i915/i915_gem_context.c |  6 +++
>>   drivers/gpu/drm/i915/intel_pm.c         | 48 --------------------
>>   drivers/gpu/drm/i915/intel_ringbuffer.c | 79 +++++++++++++++++++++++++++++++++
>>   drivers/gpu/drm/i915/intel_ringbuffer.h |  2 +
>>   4 files changed, 87 insertions(+), 48 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
>> index 9683e62..0a9bb0e 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_context.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
>> @@ -631,20 +631,26 @@ static int do_switch(struct intel_engine_cs *ring,
>>   	}
>>
>>   	uninitialized = !to->legacy_hw_ctx.initialized && from == NULL;
>>   	to->legacy_hw_ctx.initialized = true;
>>
>>   done:
>>   	i915_gem_context_reference(to);
>>   	ring->last_context = to;
>>
>>   	if (uninitialized) {
>> +		if (ring->init_context) {
>> +			ret = ring->init_context(ring);
>> +			if (ret)
>> +				DRM_ERROR("ring init context: %d\n", ret);
>> +		}
>> +
>>   		ret = i915_gem_render_state_init(ring);
>>   		if (ret)
>>   			DRM_ERROR("init render state: %d\n", ret);
>>   	}
>>
>>   	return 0;
>>
>>   unpin_out:
>>   	if (ring->id == RCS)
>>   		i915_gem_object_ggtt_unpin(to->legacy_hw_ctx.rcs_state);
>> diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
>> index c8f744c..668acd9 100644
>> --- a/drivers/gpu/drm/i915/intel_pm.c
>> +++ b/drivers/gpu/drm/i915/intel_pm.c
>> @@ -5507,101 +5507,53 @@ static void gen8_init_clock_gating(struct drm_device *dev)
>>   	struct drm_i915_private *dev_priv = dev->dev_private;
>>   	enum pipe pipe;
>>
>>   	I915_WRITE(WM3_LP_ILK, 0);
>>   	I915_WRITE(WM2_LP_ILK, 0);
>>   	I915_WRITE(WM1_LP_ILK, 0);
>>
>>   	/* FIXME(BDW): Check all the w/a, some might only apply to
>>   	 * pre-production hw. */
>>
>> -	/* WaDisablePartialInstShootdown:bdw */
>> -	I915_WRITE(GEN8_ROW_CHICKEN,
>> -		   _MASKED_BIT_ENABLE(PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE));
>> -
>> -	/* WaDisableThreadStallDopClockGating:bdw */
>> -	/* FIXME: Unclear whether we really need this on production bdw. */
>> -	I915_WRITE(GEN8_ROW_CHICKEN,
>> -		   _MASKED_BIT_ENABLE(STALL_DOP_GATING_DISABLE));
>>
>> -	/*
>> -	 * This GEN8_CENTROID_PIXEL_OPT_DIS W/A is only needed for
>> -	 * pre-production hardware
>> -	 */
>> -	I915_WRITE(HALF_SLICE_CHICKEN3,
>> -		   _MASKED_BIT_ENABLE(GEN8_CENTROID_PIXEL_OPT_DIS));
>> -	I915_WRITE(HALF_SLICE_CHICKEN3,
>> -		   _MASKED_BIT_ENABLE(GEN8_SAMPLER_POWER_BYPASS_DIS));
>>   	I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_BWGTLB_DISABLE));
>>
>>   	I915_WRITE(_3D_CHICKEN3,
>>   		   _MASKED_BIT_ENABLE(_3D_CHICKEN_SDE_LIMIT_FIFO_POLY_DEPTH(2)));
>>
>> -	I915_WRITE(COMMON_SLICE_CHICKEN2,
>> -		   _MASKED_BIT_ENABLE(GEN8_CSC2_SBE_VUE_CACHE_CONSERVATIVE));
>> -
>> -	I915_WRITE(GEN7_HALF_SLICE_CHICKEN1,
>> -		   _MASKED_BIT_ENABLE(GEN7_SINGLE_SUBSCAN_DISPATCH_ENABLE));
>> -
>> -	/* WaDisableDopClockGating:bdw May not be needed for production */
>> -	I915_WRITE(GEN7_ROW_CHICKEN2,
>> -		   _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE));
>>
>>   	/* WaSwitchSolVfFArbitrationPriority:bdw */
>>   	I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) | HSW_ECOCHK_ARB_PRIO_SOL);
>>
>>   	/* WaPsrDPAMaskVBlankInSRD:bdw */
>>   	I915_WRITE(CHICKEN_PAR1_1,
>>   		   I915_READ(CHICKEN_PAR1_1) | DPA_MASK_VBLANK_SRD);
>>
>>   	/* WaPsrDPRSUnmaskVBlankInSRD:bdw */
>>   	for_each_pipe(pipe) {
>>   		I915_WRITE(CHICKEN_PIPESL_1(pipe),
>>   			   I915_READ(CHICKEN_PIPESL_1(pipe)) |
>>   			   BDW_DPRS_MASK_VBLANK_SRD);
>>   	}
>>
>> -	/* Use Force Non-Coherent whenever executing a 3D context. This is a
>> -	 * workaround for for a possible hang in the unlikely event a TLB
>> -	 * invalidation occurs during a PSD flush.
>> -	 */
>> -	I915_WRITE(HDC_CHICKEN0,
>> -		   I915_READ(HDC_CHICKEN0) |
>> -		   _MASKED_BIT_ENABLE(HDC_FORCE_NON_COHERENT));
>> -
>>   	/* WaVSRefCountFullforceMissDisable:bdw */
>>   	/* WaDSRefCountFullforceMissDisable:bdw */
>>   	I915_WRITE(GEN7_FF_THREAD_MODE,
>>   		   I915_READ(GEN7_FF_THREAD_MODE) &
>>   		   ~(GEN8_FF_DS_REF_CNT_FFME | GEN7_FF_VS_REF_CNT_FFME));
>>
>> -	/*
>> -	 * BSpec recommends 8x4 when MSAA is used,
>> -	 * however in practice 16x4 seems fastest.
>> -	 *
>> -	 * Note that PS/WM thread counts depend on the WIZ hashing
>> -	 * disable bit, which we don't touch here, but it's good
>> -	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
>> -	 */
>> -	I915_WRITE(GEN7_GT_MODE,
>> -		   GEN6_WIZ_HASHING_MASK | GEN6_WIZ_HASHING_16x4);
>> -
>>   	I915_WRITE(GEN6_RC_SLEEP_PSMI_CONTROL,
>>   		   _MASKED_BIT_ENABLE(GEN8_RC_SEMA_IDLE_MSG_DISABLE));
>>
>>   	/* WaDisableSDEUnitClockGating:bdw */
>>   	I915_WRITE(GEN8_UCGCTL6, I915_READ(GEN8_UCGCTL6) |
>>   		   GEN8_SDEUNIT_CLOCK_GATE_DISABLE);
>> -
>> -	/* Wa4x4STCOptimizationDisable:bdw */
>> -	I915_WRITE(CACHE_MODE_1,
>> -		   _MASKED_BIT_ENABLE(GEN8_4x4_STC_OPTIMIZATION_DISABLE));
>>   }
>>
>>   static void haswell_init_clock_gating(struct drm_device *dev)
>>   {
>>   	struct drm_i915_private *dev_priv = dev->dev_private;
>>
>>   	ilk_init_lp_watermarks(dev);
>>
>>   	/* L3 caching of data atomics doesn't work -- disable it. */
>>   	I915_WRITE(HSW_SCRATCH1, HSW_SCRATCH1_L3_DATA_ATOMICS_DISABLE);
>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
>> index 13543f8..4146582 100644
>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
>> @@ -643,20 +643,98 @@ intel_init_pipe_control(struct intel_engine_cs *ring)
>>   	return 0;
>>
>>   err_unpin:
>>   	i915_gem_object_ggtt_unpin(ring->scratch.obj);
>>   err_unref:
>>   	drm_gem_object_unreference(&ring->scratch.obj->base);
>>   err:
>>   	return ret;
>>   }
>>
>> +static inline void intel_ring_emit_wa(struct intel_engine_cs *ring,
>> +				       u32 addr, u32 value)
>> +{
>> +	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
>> +	intel_ring_emit(ring, addr);
>> +	intel_ring_emit(ring, value);
>> +}
>> +
>> +static int gen8_init_workarounds(struct intel_engine_cs *ring)
>> +{
>> +	int ret;
>> +
>> +	/*
>> +	 * workarounds applied in this fn are part of register state context,
>> +	 * they need to be re-initialized followed by gpu reset, suspend/resume,
>> +	 * module reload.
>> +	 */
>> +
>> +	/*
>> +	 * update the number of dwords required based on the
>> +	 * actual number of workarounds applied
>> +	 */
>> +	ret = intel_ring_begin(ring, 24);
>> +	if (ret)
>> +		return ret;
>> +
>> +	/* WaDisablePartialInstShootdown:bdw */
>> +	/* WaDisableThreadStallDopClockGating:bdw */
>> +	/* FIXME: Unclear whether we really need this on production bdw. */
>> +	intel_ring_emit_wa(ring, GEN8_ROW_CHICKEN,
>> +			   _MASKED_BIT_ENABLE(PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE
>> +					     | STALL_DOP_GATING_DISABLE));
>> +
>> +	/* WaDisableDopClockGating:bdw May not be needed for production */
>> +	intel_ring_emit_wa(ring, GEN7_ROW_CHICKEN2,
>> +			   _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE));
>> +
>> +	/*
>> +	 * This GEN8_CENTROID_PIXEL_OPT_DIS W/A is only needed for
>> +	 * pre-production hardware
>> +	 */
>> +	intel_ring_emit_wa(ring, HALF_SLICE_CHICKEN3,
>> +			   _MASKED_BIT_ENABLE(GEN8_CENTROID_PIXEL_OPT_DIS
>> +					      | GEN8_SAMPLER_POWER_BYPASS_DIS));
>> +
>> +	intel_ring_emit_wa(ring, GEN7_HALF_SLICE_CHICKEN1,
>> +			   _MASKED_BIT_ENABLE(GEN7_SINGLE_SUBSCAN_DISPATCH_ENABLE));
>> +
>> +	intel_ring_emit_wa(ring, COMMON_SLICE_CHICKEN2,
>> +			   _MASKED_BIT_ENABLE(GEN8_CSC2_SBE_VUE_CACHE_CONSERVATIVE));
>> +
>> +	/* Use Force Non-Coherent whenever executing a 3D context. This is a
>> +	 * workaround for for a possible hang in the unlikely event a TLB
>> +	 * invalidation occurs during a PSD flush.
>> +	 */
>> +	intel_ring_emit_wa(ring, HDC_CHICKEN0,
>> +			   _MASKED_BIT_ENABLE(HDC_FORCE_NON_COHERENT));
>> +
>> +	/* Wa4x4STCOptimizationDisable:bdw */
>> +	intel_ring_emit_wa(ring, CACHE_MODE_1,
>> +			   _MASKED_BIT_ENABLE(GEN8_4x4_STC_OPTIMIZATION_DISABLE));
>> +
>> +	/*
>> +	 * BSpec recommends 8x4 when MSAA is used,
>> +	 * however in practice 16x4 seems fastest.
>> +	 *
>> +	 * Note that PS/WM thread counts depend on the WIZ hashing
>> +	 * disable bit, which we don't touch here, but it's good
>> +	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
>> +	 */
>> +	intel_ring_emit_wa(ring, GEN7_GT_MODE,
>> +			   GEN6_WIZ_HASHING_MASK | GEN6_WIZ_HASHING_16x4);
>> +
>> +	intel_ring_advance(ring);
>> +
>> +	return 0;
>> +}
>> +
>>   static int init_render_ring(struct intel_engine_cs *ring)
>>   {
>>   	struct drm_device *dev = ring->dev;
>>   	struct drm_i915_private *dev_priv = dev->dev_private;
>>   	int ret = init_ring_common(ring);
>>   	if (ret)
>>   		return ret;
>>
>>   	/* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */
>>   	if (INTEL_INFO(dev)->gen >= 4 && INTEL_INFO(dev)->gen < 7)
>> @@ -2128,20 +2206,21 @@ int intel_init_render_ring_buffer(struct drm_device *dev)
>>   				i915_gem_object_set_cache_level(obj, I915_CACHE_LLC);
>>   				ret = i915_gem_obj_ggtt_pin(obj, 0, PIN_NONBLOCK);
>>   				if (ret != 0) {
>>   					drm_gem_object_unreference(&obj->base);
>>   					DRM_ERROR("Failed to pin semaphore bo. Disabling semaphores\n");
>>   					i915.semaphores = 0;
>>   				} else
>>   					dev_priv->semaphore_obj = obj;
>>   			}
>>   		}
>> +		ring->init_context = gen8_init_workarounds;
>>   		ring->add_request = gen6_add_request;
>>   		ring->flush = gen8_render_ring_flush;
>>   		ring->irq_get = gen8_ring_get_irq;
>>   		ring->irq_put = gen8_ring_put_irq;
>>   		ring->irq_enable_mask = GT_RENDER_USER_INTERRUPT;
>>   		ring->get_seqno = gen6_ring_get_seqno;
>>   		ring->set_seqno = ring_set_seqno;
>>   		if (i915_semaphore_is_enabled(dev)) {
>>   			WARN_ON(!dev_priv->semaphore_obj);
>>   			ring->semaphore.sync_to = gen8_ring_sync;
>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> index 9cbf7b0..96479c8 100644
>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> @@ -141,20 +141,22 @@ struct  intel_engine_cs {
>>   	struct intel_hw_status_page status_page;
>>
>>   	unsigned irq_refcount; /* protected by dev_priv->irq_lock */
>>   	u32		irq_enable_mask;	/* bitmask to enable ring interrupt */
>>   	u32		trace_irq_seqno;
>>   	bool __must_check (*irq_get)(struct intel_engine_cs *ring);
>>   	void		(*irq_put)(struct intel_engine_cs *ring);
>>
>>   	int		(*init)(struct intel_engine_cs *ring);
>>
>> +	int		(*init_context)(struct intel_engine_cs *ring);
>> +
>>   	void		(*write_tail)(struct intel_engine_cs *ring,
>>   				      u32 value);
>>   	int __must_check (*flush)(struct intel_engine_cs *ring,
>>   				  u32	invalidate_domains,
>>   				  u32	flush_domains);
>>   	int		(*add_request)(struct intel_engine_cs *ring);
>>   	/* Some chipsets are not quite as coherent as advertised and need
>>   	 * an expensive kick to force a true read of the up-to-date seqno.
>>   	 * However, the up-to-date seqno is not always required and the last
>>   	 * seen value is good enough. Note that the seqno will always be
>> --
>> 2.0.4
>>
>> _______________________________________________
>> Intel-gfx mailing list
>> Intel-gfx at lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
>