[Intel-gfx] [PATCH] drm/i915/bdw: Initialize workarounds in render ring init fn.

Michael H. Nguyen michael.h.nguyen at intel.com
Sat Aug 16 01:26:26 CEST 2014



On 08/14/2014 09:51 AM, arun.siluvery at linux.intel.com wrote:
> From: Arun Siluvery <arun.siluvery at linux.intel.com>
>
> The workarounds at the moment are initialized in init_clock_gating() but
> they are lost during reset, suspend/resume; this patch moves workarounds
> that are part of register state context to render ring init fn otherwise
> default context ends up with incorrect values as they don't get initialized
> until init_clock_gating fn. This should be ok as they are not related to
> display clock gating in the first place.
>
> v2: Add macro to generate w/a table (Daniel)
>
> Signed-off-by: Arun Siluvery <arun.siluvery at linux.intel.com>
> ---
>   drivers/gpu/drm/i915/i915_debugfs.c     | 38 +++++++++++++++++++
>   drivers/gpu/drm/i915/i915_drv.h         | 32 ++++++++++++++++
>   drivers/gpu/drm/i915/intel_pm.c         | 50 -------------------------
>   drivers/gpu/drm/i915/intel_ringbuffer.c | 66 +++++++++++++++++++++++++++++++++
>   4 files changed, 136 insertions(+), 50 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 9e737b7..fc28835 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -2406,6 +2406,43 @@ static int i915_shared_dplls_info(struct seq_file *m, void *unused)
>   	return 0;
>   }
>
> +static int intel_wa_registers(struct seq_file *m, void *unused)
> +{
> +	struct drm_info_node *node = (struct drm_info_node *) m->private;
> +	struct drm_device *dev = node->minor->dev;
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	int i;
> +	int ret;
> +
> +	if (!dev_priv->num_wa) {
> +		DRM_DEBUG_DRIVER("Workaround table not available\n");
> +		return -EINVAL;
> +	}
> +
> +	if (dev_priv->num_wa > I915_MAX_WA_REGS)
> +		dev_priv->num_wa = I915_MAX_WA_REGS;
> +
> +	ret = mutex_lock_interruptible(&dev->struct_mutex);
> +	if (ret)
> +		return ret;
> +
> +	intel_runtime_pm_get(dev_priv);
> +
> +	seq_printf(m, "Workarounds applied: %d\n", dev_priv->num_wa);
> +	for (i = 0; i < dev_priv->num_wa; ++i) {
> +		if (dev_priv->wa_regs[i].addr)
> +			seq_printf(m, "0x%X: 0x%08X, mask: 0x%08X\n",
> +				   dev_priv->wa_regs[i].addr,
> +				   I915_READ(dev_priv->wa_regs[i].addr),
> +				   dev_priv->wa_regs[i].mask);
> +	}
> +
> +	intel_runtime_pm_put(dev_priv);
> +	mutex_unlock(&dev->struct_mutex);
> +
> +	return 0;
> +}
> +
>   struct pipe_crc_info {
>   	const char *name;
>   	struct drm_device *dev;
> @@ -3936,6 +3973,7 @@ static const struct drm_info_list i915_debugfs_list[] = {
>   	{"i915_semaphore_status", i915_semaphore_status, 0},
>   	{"i915_shared_dplls_info", i915_shared_dplls_info, 0},
>   	{"i915_dp_mst_info", i915_dp_mst_info, 0},
> +	{"intel_wa_registers", intel_wa_registers, 0},
>   };
>   #define I915_DEBUGFS_ENTRIES ARRAY_SIZE(i915_debugfs_list)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 18c9ad8..c4190a9 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1222,6 +1222,24 @@ struct i915_gpu_error {
>   	unsigned int test_irq_rings;
>   };
>
> +/*
> + * workarounds are currently applied at different places and changes
> + * are being done to consolidate them at a single place hence the
> + * exact count is not clear at this point; replace this with accurate
> + * number based on gen later.
> + */
> +#define I915_MAX_WA_REGS      16
> +
> +/*
> + * structure to verify workarounds status
> + * mask: represent w/a bits
> + */
> +struct intel_wa {
> +	u32 addr;
> +	u32 val;
> +	u32 mask;
> +};

Can we rename this to intel_reg_wa or similar? intel_wa confused me for 
a second. Personally when I think about wa's, they can be simple 
register writes like is the case here or more complex requiring some 
software logic.

Also, should this be defined in a userspace accessible header? I see 
this defined again locally in the IGT test.

> +
>   enum modeset_restore {
>   	MODESET_ON_LID_OPEN,
>   	MODESET_DONE,
> @@ -1422,6 +1440,9 @@ struct drm_i915_private {
>   	struct drm_i915_gem_object *semaphore_obj;
>   	uint32_t last_seqno, next_seqno;
>
> +	uint32_t num_wa;
> +	struct intel_wa wa_regs[I915_MAX_WA_REGS];
> +
>   	drm_dma_handle_t *status_page_dmah;
>   	struct resource mch_res;
>
> @@ -2803,6 +2824,17 @@ int vlv_freq_opcode(struct drm_i915_private *dev_priv, int val);
>   #define POSTING_READ(reg)	(void)I915_READ_NOTRACE(reg)
>   #define POSTING_READ16(reg)	(void)I915_READ16_NOTRACE(reg)
>
> +#define I915_WRITE_WA(reg, val) ({					\
> +		if (dev_priv->num_wa >= 0				\
> +		    && dev_priv->num_wa < I915_MAX_WA_REGS) {		\
> +			dev_priv->wa_regs[dev_priv->num_wa].addr = reg;	\
> +			dev_priv->wa_regs[dev_priv->num_wa].mask	\
> +				= (val) & 0xFFFF;			\
> +			dev_priv->num_wa++;				\
> +			I915_WRITE(reg, val);				\
> +		}							\

If this macro is called too many times, there will be silent failures I 
believe. Should we add an else case that does a WARN_ON or prints a 
message? This will help developers catch bugs.

Thanks,
Mike

> +	})
> +
>   /* "Broadcast RGB" property */
>   #define INTEL_BROADCAST_RGB_AUTO 0
>   #define INTEL_BROADCAST_RGB_FULL 1
> diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
> index 1ddd4df..ab64b64 100644
> --- a/drivers/gpu/drm/i915/intel_pm.c
> +++ b/drivers/gpu/drm/i915/intel_pm.c
> @@ -5402,38 +5402,11 @@ static void gen8_init_clock_gating(struct drm_device *dev)
>   	/* FIXME(BDW): Check all the w/a, some might only apply to
>   	 * pre-production hw. */
>
> -	/* WaDisablePartialInstShootdown:bdw */
> -	I915_WRITE(GEN8_ROW_CHICKEN,
> -		   _MASKED_BIT_ENABLE(PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE));
> -
> -	/* WaDisableThreadStallDopClockGating:bdw */
> -	/* FIXME: Unclear whether we really need this on production bdw. */
> -	I915_WRITE(GEN8_ROW_CHICKEN,
> -		   _MASKED_BIT_ENABLE(STALL_DOP_GATING_DISABLE));
> -
> -	/*
> -	 * This GEN8_CENTROID_PIXEL_OPT_DIS W/A is only needed for
> -	 * pre-production hardware
> -	 */
> -	I915_WRITE(HALF_SLICE_CHICKEN3,
> -		   _MASKED_BIT_ENABLE(GEN8_CENTROID_PIXEL_OPT_DIS));
> -	I915_WRITE(HALF_SLICE_CHICKEN3,
> -		   _MASKED_BIT_ENABLE(GEN8_SAMPLER_POWER_BYPASS_DIS));
>   	I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_BWGTLB_DISABLE));
>
>   	I915_WRITE(_3D_CHICKEN3,
>   		   _MASKED_BIT_ENABLE(_3D_CHICKEN_SDE_LIMIT_FIFO_POLY_DEPTH(2)));
>
> -	I915_WRITE(COMMON_SLICE_CHICKEN2,
> -		   _MASKED_BIT_ENABLE(GEN8_CSC2_SBE_VUE_CACHE_CONSERVATIVE));
> -
> -	I915_WRITE(GEN7_HALF_SLICE_CHICKEN1,
> -		   _MASKED_BIT_ENABLE(GEN7_SINGLE_SUBSCAN_DISPATCH_ENABLE));
> -
> -	/* WaDisableDopClockGating:bdw May not be needed for production */
> -	I915_WRITE(GEN7_ROW_CHICKEN2,
> -		   _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE));
> -
>   	/* WaSwitchSolVfFArbitrationPriority:bdw */
>   	I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) | HSW_ECOCHK_ARB_PRIO_SOL);
>
> @@ -5448,41 +5421,18 @@ static void gen8_init_clock_gating(struct drm_device *dev)
>   			   BDW_DPRS_MASK_VBLANK_SRD);
>   	}
>
> -	/* Use Force Non-Coherent whenever executing a 3D context. This is a
> -	 * workaround for for a possible hang in the unlikely event a TLB
> -	 * invalidation occurs during a PSD flush.
> -	 */
> -	I915_WRITE(HDC_CHICKEN0,
> -		   I915_READ(HDC_CHICKEN0) |
> -		   _MASKED_BIT_ENABLE(HDC_FORCE_NON_COHERENT));
> -
>   	/* WaVSRefCountFullforceMissDisable:bdw */
>   	/* WaDSRefCountFullforceMissDisable:bdw */
>   	I915_WRITE(GEN7_FF_THREAD_MODE,
>   		   I915_READ(GEN7_FF_THREAD_MODE) &
>   		   ~(GEN8_FF_DS_REF_CNT_FFME | GEN7_FF_VS_REF_CNT_FFME));
>
> -	/*
> -	 * BSpec recommends 8x4 when MSAA is used,
> -	 * however in practice 16x4 seems fastest.
> -	 *
> -	 * Note that PS/WM thread counts depend on the WIZ hashing
> -	 * disable bit, which we don't touch here, but it's good
> -	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
> -	 */
> -	I915_WRITE(GEN7_GT_MODE,
> -		   GEN6_WIZ_HASHING_MASK | GEN6_WIZ_HASHING_16x4);
> -
>   	I915_WRITE(GEN6_RC_SLEEP_PSMI_CONTROL,
>   		   _MASKED_BIT_ENABLE(GEN8_RC_SEMA_IDLE_MSG_DISABLE));
>
>   	/* WaDisableSDEUnitClockGating:bdw */
>   	I915_WRITE(GEN8_UCGCTL6, I915_READ(GEN8_UCGCTL6) |
>   		   GEN8_SDEUNIT_CLOCK_GATE_DISABLE);
> -
> -	/* Wa4x4STCOptimizationDisable:bdw */
> -	I915_WRITE(CACHE_MODE_1,
> -		   _MASKED_BIT_ENABLE(GEN8_4x4_STC_OPTIMIZATION_DISABLE));
>   }
>
>   static void haswell_init_clock_gating(struct drm_device *dev)
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index b3d8f76..abec00c 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -598,6 +598,69 @@ err:
>   	return ret;
>   }
>
> +static void bdw_init_workarounds(struct drm_device *dev)
> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +
> +	/*
> +	 * workarounds applied in this fn are part of register state context,
> +	 * they need to be re-initialized followed by gpu reset, suspend/resume,
> +	 * module reload.
> +	 */
> +	dev_priv->num_wa = 0;
> +	memset(dev_priv->wa_regs, 0, sizeof(dev_priv->wa_regs));
> +
> +	/* WaDisablePartialInstShootdown:bdw */
> +	/* WaDisableThreadStallDopClockGating:bdw */
> +	/* FIXME: Unclear whether we really need this on production bdw. */
> +
> +	I915_WRITE_WA(GEN8_ROW_CHICKEN,
> +		      _MASKED_BIT_ENABLE(PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE
> +					 | STALL_DOP_GATING_DISABLE));
> +
> +	/* WaDisableDopClockGating:bdw May not be needed for production */
> +	I915_WRITE_WA(GEN7_ROW_CHICKEN2,
> +		      _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE));
> +
> +	/*
> +	 * This GEN8_CENTROID_PIXEL_OPT_DIS W/A is only needed for
> +	 * pre-production hardware
> +	 */
> +	I915_WRITE_WA(HALF_SLICE_CHICKEN3,
> +		      _MASKED_BIT_ENABLE(GEN8_CENTROID_PIXEL_OPT_DIS
> +					 | GEN8_SAMPLER_POWER_BYPASS_DIS));
> +
> +	I915_WRITE_WA(GEN7_HALF_SLICE_CHICKEN1,
> +		      _MASKED_BIT_ENABLE(GEN7_SINGLE_SUBSCAN_DISPATCH_ENABLE));
> +
> +	/* Wa4x4STCOptimizationDisable:bdw */
> +	I915_WRITE_WA(CACHE_MODE_1,
> +		      _MASKED_BIT_ENABLE(GEN8_4x4_STC_OPTIMIZATION_DISABLE));
> +
> +	/*
> +	 * BSpec recommends 8x4 when MSAA is used,
> +	 * however in practice 16x4 seems fastest.
> +	 *
> +	 * Note that PS/WM thread counts depend on the WIZ hashing
> +	 * disable bit, which we don't touch here, but it's good
> +	 * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
> +	 */
> +	I915_WRITE_WA(GEN7_GT_MODE,
> +		      GEN6_WIZ_HASHING_MASK | GEN6_WIZ_HASHING_16x4);
> +
> +	I915_WRITE_WA(COMMON_SLICE_CHICKEN2,
> +		      _MASKED_BIT_ENABLE(GEN8_CSC2_SBE_VUE_CACHE_CONSERVATIVE));
> +
> +	/* Use Force Non-Coherent whenever executing a 3D context. This is a
> +	 * workaround for for a possible hang in the unlikely event a TLB
> +	 * invalidation occurs during a PSD flush.
> +	 */
> +	I915_WRITE_WA(HDC_CHICKEN0,
> +		      _MASKED_BIT_ENABLE(HDC_FORCE_NON_COHERENT));
> +
> +	DRM_DEBUG_DRIVER("No. of workarounds applied: %d\n", dev_priv->num_wa);
> +}
> +
>   static int init_render_ring(struct intel_engine_cs *ring)
>   {
>   	struct drm_device *dev = ring->dev;
> @@ -653,6 +716,9 @@ static int init_render_ring(struct intel_engine_cs *ring)
>   	if (HAS_L3_DPF(dev))
>   		I915_WRITE_IMR(ring, ~GT_PARITY_ERROR(dev));
>
> +	if (IS_BROADWELL(dev))
> +		bdw_init_workarounds(dev);
> +
>   	return ret;
>   }
>
>



More information about the Intel-gfx mailing list