[Intel-gfx] [PATCH] drm/i915/vlv: Added/removed Render specific Hw Workarounds for VLV
akash.goel at intel.com
akash.goel at intel.com
Thu Jan 9 12:51:49 CET 2014
From: akashgoe <akash.goel at intel.com>
The following changes lead to stable behavior, especially
when running 3D apps and benchmarks.
Added 4 new rendering specific Workarounds
1. WaTlbInvalidateStoreDataBefore :-
Before pipecontrol with TLB invalidate set,
need 2 store data commands
2. WaReadAfterWriteHazard :-
Send 8 store dword commands after flush
for read after write hazard
(HSD Gen6 bug_de 3047871)
3. WaVSThreadDispatchOverride
Performance optimization - Hw will
decide which half slice the thread
will dispatch to; may not really be needed
for VLV, as it is a single slice
4. WaDisable_RenderCache_OperationalFlush
Operational flush cannot be enabled on
BWG A0 [Errata BWT006]
Removed 3 workarounds as not needed for VLV+(B0 onwards)
1. WaDisableRHWOOptimizationForRenderHang
2. WaDisableL3CacheAging
3. WaDisableDopClockGating
Modified the implementation of 1 workaround
1. WaDisableL3Bank2xClockGate
Disabling L3 clock gating- MMIO 940c[25] = 1
Modified the programming of 2 registers in render ring init function
1. GFX_MODE_GEN7 (Enabling TLB invalidate)
2. MI_MODE (Enabling MI Flush)
Change-Id: Ib60f8dc7d2213552e498d588e1bba7eaa1a921ed
Signed-off-by: akashgoe <akash.goel at intel.com>
Signed-off-by: Akash Goel <akash.goel at intel.com>
---
drivers/gpu/drm/i915/i915_reg.h | 3 ++
drivers/gpu/drm/i915/intel_pm.c | 32 +++++++++++-------
drivers/gpu/drm/i915/intel_ringbuffer.c | 58 ++++++++++++++++++++++++++++++---
3 files changed, 77 insertions(+), 16 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index a699efd..d829754 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -934,6 +934,9 @@
#define ECO_GATING_CX_ONLY (1<<3)
#define ECO_FLIP_DONE (1<<0)
+#define GEN7_CACHE_MODE_0 0x07000 /* IVB+ only */
+#define GEN7_RC_OP_FLUSH_ENABLE (1<<0)
+
#define CACHE_MODE_1 0x7004 /* IVB+ */
#define PIXEL_SUBSPAN_COLLECT_OPT_DISABLE (1<<6)
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index 469170c..e4d220c 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -4947,22 +4947,18 @@ static void valleyview_init_clock_gating(struct drm_device *dev)
_MASKED_BIT_ENABLE(GEN7_MAX_PS_THREAD_DEP |
GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE));
- /* Apply the WaDisableRHWOOptimizationForRenderHang:vlv workaround. */
- I915_WRITE(GEN7_COMMON_SLICE_CHICKEN1,
- GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC);
-
- /* WaApplyL3ControlAndL3ChickenMode:vlv */
- I915_WRITE(GEN7_L3CNTLREG1, I915_READ(GEN7_L3CNTLREG1) | GEN7_L3AGDIS);
I915_WRITE(GEN7_L3_CHICKEN_MODE_REGISTER, GEN7_WA_L3_CHICKEN_MODE);
+ /* WaDisable_RenderCache_OperationalFlush
+ * Clear bit 0, so we do an AND with the mask
+ * to keep other bits the same */
+ I915_WRITE(GEN7_CACHE_MODE_0, (I915_READ(GEN7_CACHE_MODE_0) |
+ _MASKED_BIT_DISABLE(GEN7_RC_OP_FLUSH_ENABLE)));
+
/* WaForceL3Serialization:vlv */
I915_WRITE(GEN7_L3SQCREG4, I915_READ(GEN7_L3SQCREG4) &
~L3SQ_URB_READ_CAM_MATCH_DISABLE);
- /* WaDisableDopClockGating:vlv */
- I915_WRITE(GEN7_ROW_CHICKEN2,
- _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE));
-
/* This is required by WaCatErrorRejectionIssue:vlv */
I915_WRITE(GEN7_SQ_CHICKEN_MBCUNIT_CONFIG,
I915_READ(GEN7_SQ_CHICKEN_MBCUNIT_CONFIG) |
@@ -4991,10 +4987,24 @@ static void valleyview_init_clock_gating(struct drm_device *dev)
GEN6_RCPBUNIT_CLOCK_GATE_DISABLE |
GEN6_RCCUNIT_CLOCK_GATE_DISABLE);
- I915_WRITE(GEN7_UCGCTL4, GEN7_L3BANK2X_CLOCK_GATE_DISABLE);
+ /* WaDisableL3Bank2xClockGate
+ * Disabling L3 clock gating- MMIO 940c[25] = 1
+ * Set bit 25, to disable L3_BANK_2x_CLK_GATING */
+ I915_WRITE(GEN7_UCGCTL4,
+ I915_READ(GEN7_UCGCTL4) | GEN7_L3BANK2X_CLOCK_GATE_DISABLE);
I915_WRITE(MI_ARB_VLV, MI_ARB_DISPLAY_TRICKLE_FEED_DISABLE);
+ /* WaVSThreadDispatchOverride
+ * Hw will decide which half slice the thread will dispatch.
+ * May not be needed for VLV, as it is a single slice */
+ I915_WRITE(GEN7_CACHE_MODE_0,
+ I915_READ(GEN7_FF_THREAD_MODE) &
+ (~GEN7_FF_VS_SCHED_LOAD_BALANCE));
+
+ /* WaDisable4x2SubspanOptimization,
+ * Disable combining of two 2x2 subspans into a 4x2 subspan
+ * Set chicken bit to disable subspan optimization */
I915_WRITE(CACHE_MODE_1,
_MASKED_BIT_ENABLE(PIXEL_SUBSPAN_COLLECT_OPT_DISABLE));
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 442c9a6..c9350a1 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -563,7 +563,9 @@ static int init_render_ring(struct intel_ring_buffer *ring)
int ret = init_ring_common(ring);
if (INTEL_INFO(dev)->gen > 3)
- I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH));
+ if (!IS_VALLEYVIEW(dev))
+ I915_WRITE(MI_MODE,
+ _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH));
/* We need to disable the AsyncFlip performance optimisations in order
* to use MI_WAIT_FOR_EVENT within the CS. It should already be
@@ -579,10 +581,17 @@ static int init_render_ring(struct intel_ring_buffer *ring)
I915_WRITE(GFX_MODE,
_MASKED_BIT_ENABLE(GFX_TLB_INVALIDATE_ALWAYS));
- if (IS_GEN7(dev))
- I915_WRITE(GFX_MODE_GEN7,
- _MASKED_BIT_DISABLE(GFX_TLB_INVALIDATE_ALWAYS) |
- _MASKED_BIT_ENABLE(GFX_REPLAY_MODE));
+ if (IS_GEN7(dev)) {
+ if (IS_VALLEYVIEW(dev)) {
+ I915_WRITE(GFX_MODE_GEN7,
+ _MASKED_BIT_ENABLE(GFX_REPLAY_MODE));
+ I915_WRITE(MI_MODE, I915_READ(MI_MODE) |
+ _MASKED_BIT_ENABLE(MI_FLUSH_ENABLE));
+ } else
+ I915_WRITE(GFX_MODE_GEN7,
+ _MASKED_BIT_DISABLE(GFX_TLB_INVALIDATE_ALWAYS) |
+ _MASKED_BIT_ENABLE(GFX_REPLAY_MODE));
+ }
if (INTEL_INFO(dev)->gen >= 5) {
ret = init_pipe_control(ring);
@@ -2157,6 +2166,7 @@ int
intel_ring_flush_all_caches(struct intel_ring_buffer *ring)
{
int ret;
+ int i;
if (!ring->gpu_caches_dirty)
return 0;
@@ -2165,6 +2175,26 @@ intel_ring_flush_all_caches(struct intel_ring_buffer *ring)
if (ret)
return ret;
+ /* WaReadAfterWriteHazard */
+ /* Send a number of Store Data commands here to finish
+ flushing hardware pipeline. This is needed in the case
+ where the next workload tries reading from the same
+ surface that this batch writes to. Without these StoreDWs,
+ not all of the data will actually be flushed to the surface
+ by the time the next batch starts reading it, possibly
+ causing a small amount of corruption. */
+ ret = intel_ring_begin(ring, 4 * 12);
+ if (ret)
+ return ret;
+ for (i = 0; i < 12; i++) {
+ intel_ring_emit(ring, MI_STORE_DWORD_INDEX);
+ intel_ring_emit(ring, I915_GEM_HWS_SCRATCH_INDEX <<
+ MI_STORE_DWORD_INDEX_SHIFT);
+ intel_ring_emit(ring, 0);
+ intel_ring_emit(ring, MI_NOOP);
+ }
+ intel_ring_advance(ring);
+
trace_i915_gem_ring_flush(ring, 0, I915_GEM_GPU_DOMAINS);
ring->gpu_caches_dirty = false;
@@ -2176,6 +2206,24 @@ intel_ring_invalidate_all_caches(struct intel_ring_buffer *ring)
{
uint32_t flush_domains;
int ret;
+ int i;
+
+ /* WaTlbInvalidateStoreDataBefore */
+ /* Before a pipecontrol with TLB invalidate set, we need 2 store
+ data commands (such as MI_STORE_DATA_IMM or MI_STORE_DATA_INDEX).
+ Without this, hardware cannot guarantee the command after the
+ PIPE_CONTROL with TLB inv will not use the old TLB values. */
+ ret = intel_ring_begin(ring, 4 * 2);
+ if (ret)
+ return ret;
+ for (i = 0; i < 2; i++) {
+ intel_ring_emit(ring, MI_STORE_DWORD_INDEX);
+ intel_ring_emit(ring, I915_GEM_HWS_SCRATCH_INDEX <<
+ MI_STORE_DWORD_INDEX_SHIFT);
+ intel_ring_emit(ring, 0);
+ intel_ring_emit(ring, MI_NOOP);
+ }
+ intel_ring_advance(ring);
flush_domains = 0;
if (ring->gpu_caches_dirty)
--
1.8.5.2
More information about the Intel-gfx
mailing list