[Intel-gfx] [PATCH] drm/i915/bdw: Initialize workarounds in render ring init fn.
Michael H. Nguyen
michael.h.nguyen at intel.com
Sat Aug 16 01:26:26 CEST 2014
On 08/14/2014 09:51 AM, arun.siluvery at linux.intel.com wrote:
> From: Arun Siluvery <arun.siluvery at linux.intel.com>
>
> The workarounds at the moment are initialized in init_clock_gating() but
> they are lost during reset, suspend/resume; this patch moves workarounds
> that are part of register state context to render ring init fn otherwise
> default context ends up with incorrect values as they don't get initialized
> until init_clock_gating fn. This should be ok as they are not related to
> display clock gating in the first place.
>
> v2: Add macro to generate w/a table (Daniel)
>
> Signed-off-by: Arun Siluvery <arun.siluvery at linux.intel.com>
> ---
> drivers/gpu/drm/i915/i915_debugfs.c | 38 +++++++++++++++++++
> drivers/gpu/drm/i915/i915_drv.h | 32 ++++++++++++++++
> drivers/gpu/drm/i915/intel_pm.c | 50 -------------------------
> drivers/gpu/drm/i915/intel_ringbuffer.c | 66 +++++++++++++++++++++++++++++++++
> 4 files changed, 136 insertions(+), 50 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 9e737b7..fc28835 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -2406,6 +2406,43 @@ static int i915_shared_dplls_info(struct seq_file *m, void *unused)
> return 0;
> }
>
> +static int intel_wa_registers(struct seq_file *m, void *unused)
> +{
> + struct drm_info_node *node = (struct drm_info_node *) m->private;
> + struct drm_device *dev = node->minor->dev;
> + struct drm_i915_private *dev_priv = dev->dev_private;
> + int i;
> + int ret;
> +
> + if (!dev_priv->num_wa) {
> + DRM_DEBUG_DRIVER("Workaround table not available\n");
> + return -EINVAL;
> + }
> +
> + if (dev_priv->num_wa > I915_MAX_WA_REGS)
> + dev_priv->num_wa = I915_MAX_WA_REGS;
> +
> + ret = mutex_lock_interruptible(&dev->struct_mutex);
> + if (ret)
> + return ret;
> +
> + intel_runtime_pm_get(dev_priv);
> +
> + seq_printf(m, "Workarounds applied: %d\n", dev_priv->num_wa);
> + for (i = 0; i < dev_priv->num_wa; ++i) {
> + if (dev_priv->wa_regs[i].addr)
> + seq_printf(m, "0x%X: 0x%08X, mask: 0x%08X\n",
> + dev_priv->wa_regs[i].addr,
> + I915_READ(dev_priv->wa_regs[i].addr),
> + dev_priv->wa_regs[i].mask);
> + }
> +
> + intel_runtime_pm_put(dev_priv);
> + mutex_unlock(&dev->struct_mutex);
> +
> + return 0;
> +}
> +
> struct pipe_crc_info {
> const char *name;
> struct drm_device *dev;
> @@ -3936,6 +3973,7 @@ static const struct drm_info_list i915_debugfs_list[] = {
> {"i915_semaphore_status", i915_semaphore_status, 0},
> {"i915_shared_dplls_info", i915_shared_dplls_info, 0},
> {"i915_dp_mst_info", i915_dp_mst_info, 0},
> + {"intel_wa_registers", intel_wa_registers, 0},
> };
> #define I915_DEBUGFS_ENTRIES ARRAY_SIZE(i915_debugfs_list)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 18c9ad8..c4190a9 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1222,6 +1222,24 @@ struct i915_gpu_error {
> unsigned int test_irq_rings;
> };
>
> +/*
> + * workarounds are currently applied at different places and changes
> + * are being done to consolidate them at a single place hence the
> + * exact count is not clear at this point; replace this with accurate
> + * number based on gen later.
> + */
> +#define I915_MAX_WA_REGS 16
> +
> +/*
> + * structure to verify workarounds status
> + * mask: represent w/a bits
> + */
> +struct intel_wa {
> + u32 addr;
> + u32 val;
> + u32 mask;
> +};
Can we rename this to intel_reg_wa or similar? intel_wa confused me for
a second. Personally when I think about wa's, they can be simple
register writes like is the case here or more complex requiring some
software logic.
Also, should this be defined in a userspace accessible header? I see
this defined again locally in the IGT test.
> +
> enum modeset_restore {
> MODESET_ON_LID_OPEN,
> MODESET_DONE,
> @@ -1422,6 +1440,9 @@ struct drm_i915_private {
> struct drm_i915_gem_object *semaphore_obj;
> uint32_t last_seqno, next_seqno;
>
> + uint32_t num_wa;
> + struct intel_wa wa_regs[I915_MAX_WA_REGS];
> +
> drm_dma_handle_t *status_page_dmah;
> struct resource mch_res;
>
> @@ -2803,6 +2824,17 @@ int vlv_freq_opcode(struct drm_i915_private *dev_priv, int val);
> #define POSTING_READ(reg) (void)I915_READ_NOTRACE(reg)
> #define POSTING_READ16(reg) (void)I915_READ16_NOTRACE(reg)
>
> +#define I915_WRITE_WA(reg, val) ({ \
> + if (dev_priv->num_wa >= 0 \
> + && dev_priv->num_wa < I915_MAX_WA_REGS) { \
> + dev_priv->wa_regs[dev_priv->num_wa].addr = reg; \
> + dev_priv->wa_regs[dev_priv->num_wa].mask \
> + = (val) & 0xFFFF; \
> + dev_priv->num_wa++; \
> + I915_WRITE(reg, val); \
> + } \
If this macro is called too many times, there will be silent failures I
believe. Should we add an else case that does a WARN_ON or prints a
message? This will help developers catch bugs.
Thanks,
Mike
> + })
> +
> /* "Broadcast RGB" property */
> #define INTEL_BROADCAST_RGB_AUTO 0
> #define INTEL_BROADCAST_RGB_FULL 1
> diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
> index 1ddd4df..ab64b64 100644
> --- a/drivers/gpu/drm/i915/intel_pm.c
> +++ b/drivers/gpu/drm/i915/intel_pm.c
> @@ -5402,38 +5402,11 @@ static void gen8_init_clock_gating(struct drm_device *dev)
> /* FIXME(BDW): Check all the w/a, some might only apply to
> * pre-production hw. */
>
> - /* WaDisablePartialInstShootdown:bdw */
> - I915_WRITE(GEN8_ROW_CHICKEN,
> - _MASKED_BIT_ENABLE(PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE));
> -
> - /* WaDisableThreadStallDopClockGating:bdw */
> - /* FIXME: Unclear whether we really need this on production bdw. */
> - I915_WRITE(GEN8_ROW_CHICKEN,
> - _MASKED_BIT_ENABLE(STALL_DOP_GATING_DISABLE));
> -
> - /*
> - * This GEN8_CENTROID_PIXEL_OPT_DIS W/A is only needed for
> - * pre-production hardware
> - */
> - I915_WRITE(HALF_SLICE_CHICKEN3,
> - _MASKED_BIT_ENABLE(GEN8_CENTROID_PIXEL_OPT_DIS));
> - I915_WRITE(HALF_SLICE_CHICKEN3,
> - _MASKED_BIT_ENABLE(GEN8_SAMPLER_POWER_BYPASS_DIS));
> I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_BWGTLB_DISABLE));
>
> I915_WRITE(_3D_CHICKEN3,
> _MASKED_BIT_ENABLE(_3D_CHICKEN_SDE_LIMIT_FIFO_POLY_DEPTH(2)));
>
> - I915_WRITE(COMMON_SLICE_CHICKEN2,
> - _MASKED_BIT_ENABLE(GEN8_CSC2_SBE_VUE_CACHE_CONSERVATIVE));
> -
> - I915_WRITE(GEN7_HALF_SLICE_CHICKEN1,
> - _MASKED_BIT_ENABLE(GEN7_SINGLE_SUBSCAN_DISPATCH_ENABLE));
> -
> - /* WaDisableDopClockGating:bdw May not be needed for production */
> - I915_WRITE(GEN7_ROW_CHICKEN2,
> - _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE));
> -
> /* WaSwitchSolVfFArbitrationPriority:bdw */
> I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) | HSW_ECOCHK_ARB_PRIO_SOL);
>
> @@ -5448,41 +5421,18 @@ static void gen8_init_clock_gating(struct drm_device *dev)
> BDW_DPRS_MASK_VBLANK_SRD);
> }
>
> - /* Use Force Non-Coherent whenever executing a 3D context. This is a
> - * workaround for for a possible hang in the unlikely event a TLB
> - * invalidation occurs during a PSD flush.
> - */
> - I915_WRITE(HDC_CHICKEN0,
> - I915_READ(HDC_CHICKEN0) |
> - _MASKED_BIT_ENABLE(HDC_FORCE_NON_COHERENT));
> -
> /* WaVSRefCountFullforceMissDisable:bdw */
> /* WaDSRefCountFullforceMissDisable:bdw */
> I915_WRITE(GEN7_FF_THREAD_MODE,
> I915_READ(GEN7_FF_THREAD_MODE) &
> ~(GEN8_FF_DS_REF_CNT_FFME | GEN7_FF_VS_REF_CNT_FFME));
>
> - /*
> - * BSpec recommends 8x4 when MSAA is used,
> - * however in practice 16x4 seems fastest.
> - *
> - * Note that PS/WM thread counts depend on the WIZ hashing
> - * disable bit, which we don't touch here, but it's good
> - * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
> - */
> - I915_WRITE(GEN7_GT_MODE,
> - GEN6_WIZ_HASHING_MASK | GEN6_WIZ_HASHING_16x4);
> -
> I915_WRITE(GEN6_RC_SLEEP_PSMI_CONTROL,
> _MASKED_BIT_ENABLE(GEN8_RC_SEMA_IDLE_MSG_DISABLE));
>
> /* WaDisableSDEUnitClockGating:bdw */
> I915_WRITE(GEN8_UCGCTL6, I915_READ(GEN8_UCGCTL6) |
> GEN8_SDEUNIT_CLOCK_GATE_DISABLE);
> -
> - /* Wa4x4STCOptimizationDisable:bdw */
> - I915_WRITE(CACHE_MODE_1,
> - _MASKED_BIT_ENABLE(GEN8_4x4_STC_OPTIMIZATION_DISABLE));
> }
>
> static void haswell_init_clock_gating(struct drm_device *dev)
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index b3d8f76..abec00c 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -598,6 +598,69 @@ err:
> return ret;
> }
>
> +static void bdw_init_workarounds(struct drm_device *dev)
> +{
> + struct drm_i915_private *dev_priv = dev->dev_private;
> +
> + /*
> + * workarounds applied in this fn are part of register state context,
> + * they need to be re-initialized followed by gpu reset, suspend/resume,
> + * module reload.
> + */
> + dev_priv->num_wa = 0;
> + memset(dev_priv->wa_regs, 0, sizeof(dev_priv->wa_regs));
> +
> + /* WaDisablePartialInstShootdown:bdw */
> + /* WaDisableThreadStallDopClockGating:bdw */
> + /* FIXME: Unclear whether we really need this on production bdw. */
> +
> + I915_WRITE_WA(GEN8_ROW_CHICKEN,
> + _MASKED_BIT_ENABLE(PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE
> + | STALL_DOP_GATING_DISABLE));
> +
> + /* WaDisableDopClockGating:bdw May not be needed for production */
> + I915_WRITE_WA(GEN7_ROW_CHICKEN2,
> + _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE));
> +
> + /*
> + * This GEN8_CENTROID_PIXEL_OPT_DIS W/A is only needed for
> + * pre-production hardware
> + */
> + I915_WRITE_WA(HALF_SLICE_CHICKEN3,
> + _MASKED_BIT_ENABLE(GEN8_CENTROID_PIXEL_OPT_DIS
> + | GEN8_SAMPLER_POWER_BYPASS_DIS));
> +
> + I915_WRITE_WA(GEN7_HALF_SLICE_CHICKEN1,
> + _MASKED_BIT_ENABLE(GEN7_SINGLE_SUBSCAN_DISPATCH_ENABLE));
> +
> + /* Wa4x4STCOptimizationDisable:bdw */
> + I915_WRITE_WA(CACHE_MODE_1,
> + _MASKED_BIT_ENABLE(GEN8_4x4_STC_OPTIMIZATION_DISABLE));
> +
> + /*
> + * BSpec recommends 8x4 when MSAA is used,
> + * however in practice 16x4 seems fastest.
> + *
> + * Note that PS/WM thread counts depend on the WIZ hashing
> + * disable bit, which we don't touch here, but it's good
> + * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
> + */
> + I915_WRITE_WA(GEN7_GT_MODE,
> + GEN6_WIZ_HASHING_MASK | GEN6_WIZ_HASHING_16x4);
> +
> + I915_WRITE_WA(COMMON_SLICE_CHICKEN2,
> + _MASKED_BIT_ENABLE(GEN8_CSC2_SBE_VUE_CACHE_CONSERVATIVE));
> +
> + /* Use Force Non-Coherent whenever executing a 3D context. This is a
> + * workaround for for a possible hang in the unlikely event a TLB
> + * invalidation occurs during a PSD flush.
> + */
> + I915_WRITE_WA(HDC_CHICKEN0,
> + _MASKED_BIT_ENABLE(HDC_FORCE_NON_COHERENT));
> +
> + DRM_DEBUG_DRIVER("No. of workarounds applied: %d\n", dev_priv->num_wa);
> +}
> +
> static int init_render_ring(struct intel_engine_cs *ring)
> {
> struct drm_device *dev = ring->dev;
> @@ -653,6 +716,9 @@ static int init_render_ring(struct intel_engine_cs *ring)
> if (HAS_L3_DPF(dev))
> I915_WRITE_IMR(ring, ~GT_PARITY_ERROR(dev));
>
> + if (IS_BROADWELL(dev))
> + bdw_init_workarounds(dev);
> +
> return ret;
> }
>
>
More information about the Intel-gfx
mailing list