[PATCH 37/76] drm/i915: Apply context workarounds directly

Chris Wilson chris at chris-wilson.co.uk
Mon Jul 9 15:15:48 UTC 2018


From: Oscar Mateo <oscar.mateo at intel.com>

Once upon a time, we tried to apply workarounds for registers that lived
inside the context image for every new context. That meant emitting LRI
commands soon after each context was created.

Nowadays, we have a single golden context that gets used as a master
template for future contexts. That golden context will acquire initial
values for its image from the existing values in HW (thanks to inhibit
restore bit). If all WAs are applied normally (i.e. using MMIO writes)
before that happens, they will get soaked up by the golden context and
transmitted correctly to new contexts.

All of this means we don't have to distinguish between context and
non-context WAs anymore, because both can be applied in the same way
(we still want to distinguish them though, because we would like to
check their validity using i-g-t, and that means making sure we have
a context loaded for ctx-residing WAs).

Signed-off-by: Oscar Mateo <oscar.mateo at intel.com>
Cc: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
Cc: Ville Syrjälä <ville.syrjala at linux.intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c          |   1 +
 drivers/gpu/drm/i915/i915_gem_context.c  |   5 -
 drivers/gpu/drm/i915/intel_lrc.c         |   4 -
 drivers/gpu/drm/i915/intel_ringbuffer.c  |   4 -
 drivers/gpu/drm/i915/intel_workarounds.c | 196 ++++++++---------------
 drivers/gpu/drm/i915/intel_workarounds.h |   5 +-
 6 files changed, 67 insertions(+), 148 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 25da43c0f995..c332aa6cc999 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -5114,6 +5114,7 @@ int i915_gem_init(struct drm_i915_private *dev_priv)
 	 * FIXME: break up the workarounds and apply them at the right time!
 	 */
 	intel_init_clock_gating(dev_priv);
+	intel_ctx_workarounds_apply(dev_priv);
 
 	ret = __intel_engines_record_defaults(dev_priv);
 	if (ret)
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index cf91121d78a6..c6f64f32e415 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -567,16 +567,11 @@ static bool needs_preempt_context(struct drm_i915_private *i915)
 int i915_gem_contexts_init(struct drm_i915_private *dev_priv)
 {
 	struct i915_gem_context *ctx;
-	int ret;
 
 	/* Reassure ourselves we are only called once */
 	GEM_BUG_ON(dev_priv->kernel_context);
 	GEM_BUG_ON(dev_priv->preempt_context);
 
-	ret = intel_ctx_workarounds_init(dev_priv);
-	if (ret)
-		return ret;
-
 	init_contexts(dev_priv);
 
 	/* lowest priority; idle task */
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 6aac1826ffeb..345f3b39e14e 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -2497,10 +2497,6 @@ static int gen8_init_rcs_context(struct i915_request *rq)
 {
 	int ret;
 
-	ret = intel_ctx_workarounds_emit(rq);
-	if (ret)
-		return ret;
-
 	ret = intel_rcs_context_init_mocs(rq);
 	/*
 	 * Failing to program the MOCS is non-fatal.The system will not
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 6077f7575677..7a02203f6b66 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -620,10 +620,6 @@ static int intel_rcs_ctx_init(struct i915_request *rq)
 {
 	int ret;
 
-	ret = intel_ctx_workarounds_emit(rq);
-	if (ret != 0)
-		return ret;
-
 	ret = i915_gem_render_state_emit(rq);
 	if (ret)
 		return ret;
diff --git a/drivers/gpu/drm/i915/intel_workarounds.c b/drivers/gpu/drm/i915/intel_workarounds.c
index f8bb32e974f6..aa534b364b30 100644
--- a/drivers/gpu/drm/i915/intel_workarounds.c
+++ b/drivers/gpu/drm/i915/intel_workarounds.c
@@ -48,14 +48,16 @@
  * - Public functions to init or apply the given workaround type.
  */
 
-static void wa_add(struct drm_i915_private *i915,
+static void wa_add(struct drm_i915_private *dev_priv,
 		   i915_reg_t reg, const u32 mask, const u32 val)
 {
-	struct i915_workarounds *wa = &i915->workarounds;
+	struct i915_workarounds *wa = &dev_priv->workarounds;
 	unsigned int start = 0, end = wa->count;
 	unsigned int addr = i915_mmio_reg_offset(reg);
 	struct i915_wa_reg *r;
 
+	I915_WRITE(reg, val);
+
 	while (start < end) {
 		unsigned int mid = start + (end - start) / 2;
 
@@ -110,7 +112,7 @@ static void wa_add(struct drm_i915_private *i915,
 #define WA_SET_FIELD_MASKED(addr, mask, value) \
 	WA_REG(addr, (mask), _MASKED_FIELD(mask, value))
 
-static int gen8_ctx_workarounds_init(struct drm_i915_private *dev_priv)
+static void gen8_ctx_workarounds_apply(struct drm_i915_private *dev_priv)
 {
 	WA_SET_BIT_MASKED(INSTPM, INSTPM_FORCE_ORDERING);
 
@@ -121,17 +123,20 @@ static int gen8_ctx_workarounds_init(struct drm_i915_private *dev_priv)
 	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
 			  PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE);
 
-	/* Use Force Non-Coherent whenever executing a 3D context. This is a
+	/*
+	 * Use Force Non-Coherent whenever executing a 3D context. This is a
 	 * workaround for for a possible hang in the unlikely event a TLB
 	 * invalidation occurs during a PSD flush.
+	 *
+	 * WaForceEnableNonCoherent:bdw,chv
+	 * WaHdcDisableFetchWhenMasked:bdw,chv
 	 */
-	/* WaForceEnableNonCoherent:bdw,chv */
-	/* WaHdcDisableFetchWhenMasked:bdw,chv */
 	WA_SET_BIT_MASKED(HDC_CHICKEN0,
 			  HDC_DONOT_FETCH_MEM_WHEN_MASKED |
 			  HDC_FORCE_NON_COHERENT);
 
-	/* From the Haswell PRM, Command Reference: Registers, CACHE_MODE_0:
+	/*
+	 * From the Haswell PRM, Command Reference: Registers, CACHE_MODE_0:
 	 * "The Hierarchical Z RAW Stall Optimization allows non-overlapping
 	 *  polygons in the same 8x4 pixel/sample area to be processed without
 	 *  stalling waiting for the earlier ones to write to Hierarchical Z
@@ -155,22 +160,17 @@ static int gen8_ctx_workarounds_init(struct drm_i915_private *dev_priv)
 	WA_SET_FIELD_MASKED(GEN7_GT_MODE,
 			    GEN6_WIZ_HASHING_MASK,
 			    GEN6_WIZ_HASHING_16x4);
-
-	return 0;
 }
 
-static int bdw_ctx_workarounds_init(struct drm_i915_private *dev_priv)
+static void bdw_ctx_workarounds_apply(struct drm_i915_private *dev_priv)
 {
-	int ret;
-
-	ret = gen8_ctx_workarounds_init(dev_priv);
-	if (ret)
-		return ret;
+	gen8_ctx_workarounds_apply(dev_priv);
 
 	/* WaDisableThreadStallDopClockGating:bdw (pre-production) */
 	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
 
-	/* WaDisableDopClockGating:bdw
+	/*
+	 * WaDisableDopClockGating:bdw
 	 *
 	 * Also see the related UCGTCL1 write in broadwell_init_clock_gating()
 	 * to disable EUTC clock gating.
@@ -186,31 +186,24 @@ static int bdw_ctx_workarounds_init(struct drm_i915_private *dev_priv)
 			  HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
 			  /* WaDisableFenceDestinationToSLM:bdw (pre-prod) */
 			  (IS_BDW_GT3(dev_priv) ? HDC_FENCE_DEST_SLM_DISABLE : 0));
-
-	return 0;
 }
 
-static int chv_ctx_workarounds_init(struct drm_i915_private *dev_priv)
+static void chv_ctx_workarounds_apply(struct drm_i915_private *dev_priv)
 {
-	int ret;
-
-	ret = gen8_ctx_workarounds_init(dev_priv);
-	if (ret)
-		return ret;
+	gen8_ctx_workarounds_apply(dev_priv);
 
 	/* WaDisableThreadStallDopClockGating:chv */
 	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE);
 
 	/* Improve HiZ throughput on CHV. */
 	WA_SET_BIT_MASKED(HIZ_CHICKEN, CHV_HZ_8X8_MODE_IN_1X);
-
-	return 0;
 }
 
-static int gen9_ctx_workarounds_init(struct drm_i915_private *dev_priv)
+static void gen9_ctx_workarounds_apply(struct drm_i915_private *dev_priv)
 {
 	if (HAS_LLC(dev_priv)) {
-		/* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
+		/*
+		 * WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl
 		 *
 		 * Must match Display Engine. See
 		 * WaCompressedResourceDisplayNewHashMode.
@@ -253,7 +246,8 @@ static int gen9_ctx_workarounds_init(struct drm_i915_private *dev_priv)
 			  HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT |
 			  HDC_FORCE_CSR_NON_COHERENT_OVR_DISABLE);
 
-	/* WaForceEnableNonCoherent and WaDisableHDCInvalidation are
+	/*
+	 * WaForceEnableNonCoherent and WaDisableHDCInvalidation are
 	 * both tied to WaForceContextSaveRestoreNonCoherent
 	 * in some hsds for skl. We keep the tie for all gen9. The
 	 * documentation is a bit hazy and so we want to get common behaviour,
@@ -302,11 +296,9 @@ static int gen9_ctx_workarounds_init(struct drm_i915_private *dev_priv)
 	/* WaClearHIZ_WM_CHICKEN3:bxt,glk */
 	if (IS_GEN9_LP(dev_priv))
 		WA_SET_BIT_MASKED(GEN9_WM_CHICKEN3, GEN9_FACTOR_IN_CLR_VAL_HIZ);
-
-	return 0;
 }
 
-static int skl_tune_iz_hashing(struct drm_i915_private *dev_priv)
+static void skl_tune_iz_hashing(struct drm_i915_private *dev_priv)
 {
 	u8 vals[3] = { 0, 0, 0 };
 	unsigned int i;
@@ -332,7 +324,7 @@ static int skl_tune_iz_hashing(struct drm_i915_private *dev_priv)
 	}
 
 	if (vals[0] == 0 && vals[1] == 0 && vals[2] == 0)
-		return 0;
+		return;
 
 	/* Tune IZ hashing. See intel_device_info_runtime_init() */
 	WA_SET_FIELD_MASKED(GEN7_GT_MODE,
@@ -342,28 +334,18 @@ static int skl_tune_iz_hashing(struct drm_i915_private *dev_priv)
 			    GEN9_IZ_HASHING(2, vals[2]) |
 			    GEN9_IZ_HASHING(1, vals[1]) |
 			    GEN9_IZ_HASHING(0, vals[0]));
-
-	return 0;
 }
 
-static int skl_ctx_workarounds_init(struct drm_i915_private *dev_priv)
+static void skl_ctx_workarounds_apply(struct drm_i915_private *dev_priv)
 {
-	int ret;
-
-	ret = gen9_ctx_workarounds_init(dev_priv);
-	if (ret)
-		return ret;
+	gen9_ctx_workarounds_apply(dev_priv);
 
-	return skl_tune_iz_hashing(dev_priv);
+	skl_tune_iz_hashing(dev_priv);
 }
 
-static int bxt_ctx_workarounds_init(struct drm_i915_private *dev_priv)
+static void bxt_ctx_workarounds_apply(struct drm_i915_private *dev_priv)
 {
-	int ret;
-
-	ret = gen9_ctx_workarounds_init(dev_priv);
-	if (ret)
-		return ret;
+	gen9_ctx_workarounds_apply(dev_priv);
 
 	/* WaDisableThreadStallDopClockGating:bxt */
 	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN,
@@ -372,17 +354,11 @@ static int bxt_ctx_workarounds_init(struct drm_i915_private *dev_priv)
 	/* WaToEnableHwFixForPushConstHWBug:bxt */
 	WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
 			  GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
-
-	return 0;
 }
 
-static int kbl_ctx_workarounds_init(struct drm_i915_private *dev_priv)
+static void kbl_ctx_workarounds_apply(struct drm_i915_private *dev_priv)
 {
-	int ret;
-
-	ret = gen9_ctx_workarounds_init(dev_priv);
-	if (ret)
-		return ret;
+	gen9_ctx_workarounds_apply(dev_priv);
 
 	/* WaDisableFenceDestinationToSLM:kbl (pre-prod) */
 	if (IS_KBL_REVID(dev_priv, KBL_REVID_A0, KBL_REVID_A0))
@@ -397,32 +373,20 @@ static int kbl_ctx_workarounds_init(struct drm_i915_private *dev_priv)
 	/* WaDisableSbeCacheDispatchPortSharing:kbl */
 	WA_SET_BIT_MASKED(GEN7_HALF_SLICE_CHICKEN1,
 			  GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
-
-	return 0;
 }
 
-static int glk_ctx_workarounds_init(struct drm_i915_private *dev_priv)
+static void glk_ctx_workarounds_apply(struct drm_i915_private *dev_priv)
 {
-	int ret;
-
-	ret = gen9_ctx_workarounds_init(dev_priv);
-	if (ret)
-		return ret;
+	gen9_ctx_workarounds_apply(dev_priv);
 
 	/* WaToEnableHwFixForPushConstHWBug:glk */
 	WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
 			  GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
-
-	return 0;
 }
 
-static int cfl_ctx_workarounds_init(struct drm_i915_private *dev_priv)
+static void cfl_ctx_workarounds_apply(struct drm_i915_private *dev_priv)
 {
-	int ret;
-
-	ret = gen9_ctx_workarounds_init(dev_priv);
-	if (ret)
-		return ret;
+	gen9_ctx_workarounds_apply(dev_priv);
 
 	/* WaToEnableHwFixForPushConstHWBug:cfl */
 	WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
@@ -431,11 +395,9 @@ static int cfl_ctx_workarounds_init(struct drm_i915_private *dev_priv)
 	/* WaDisableSbeCacheDispatchPortSharing:cfl */
 	WA_SET_BIT_MASKED(GEN7_HALF_SLICE_CHICKEN1,
 			  GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE);
-
-	return 0;
 }
 
-static int cnl_ctx_workarounds_init(struct drm_i915_private *dev_priv)
+static void cnl_ctx_workarounds_apply(struct drm_i915_private *dev_priv)
 {
 	/* WaForceContextSaveRestoreNonCoherent:cnl */
 	WA_SET_BIT_MASKED(CNL_HDC_CHICKEN0,
@@ -470,20 +432,20 @@ static int cnl_ctx_workarounds_init(struct drm_i915_private *dev_priv)
 
 	/* WaDisableEarlyEOT:cnl */
 	WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, DISABLE_EARLY_EOT);
-
-	return 0;
 }
 
-static int icl_ctx_workarounds_init(struct drm_i915_private *dev_priv)
+static void icl_ctx_workarounds_apply(struct drm_i915_private *dev_priv)
 {
-	/* Wa_1604370585:icl (pre-prod)
+	/*
+	 * Wa_1604370585:icl (pre-prod)
 	 * Formerly known as WaPushConstantDereferenceHoldDisable
 	 */
 	if (IS_ICL_REVID(dev_priv, ICL_REVID_A0, ICL_REVID_B0))
 		WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2,
 				  PUSH_CONSTANT_DEREF_DISABLE);
 
-	/* WaForceEnableNonCoherent:icl
+	/*
+	 * WaForceEnableNonCoherent:icl
 	 * This is not the same workaround as in early Gen9 platforms, where
 	 * lacking this could cause system hangs, but coherency performance
 	 * overhead is high and only a few compute workloads really need it
@@ -492,7 +454,8 @@ static int icl_ctx_workarounds_init(struct drm_i915_private *dev_priv)
 	 */
 	WA_SET_BIT_MASKED(ICL_HDC_MODE, HDC_FORCE_NON_COHERENT);
 
-	/* Wa_2006611047:icl (pre-prod)
+	/*
+	 * Wa_2006611047:icl (pre-prod)
 	 * Formerly known as WaDisableImprovedTdlClkGating
 	 */
 	if (IS_ICL_REVID(dev_priv, ICL_REVID_A0, ICL_REVID_A0))
@@ -510,77 +473,48 @@ static int icl_ctx_workarounds_init(struct drm_i915_private *dev_priv)
 
 	/* WaEnableFloatBlendOptimization:icl */
 	WA_SET_BIT_MASKED(GEN10_CACHE_MODE_SS, FLOAT_BLEND_OPTIMIZATION_ENABLE);
-
-	return 0;
 }
 
-int intel_ctx_workarounds_init(struct drm_i915_private *dev_priv)
+void intel_ctx_workarounds_apply(struct drm_i915_private *dev_priv)
 {
-	int err = 0;
+	/*
+	 * We rely on not losing the active state before we save these
+	 * register writes into the default context image, so we must be
+	 * sure to keep forcewake held at all times. We use immediate MMIO
+	 * writes to update the current register state, which are then
+	 * inherited by the first context and saved into the default
+	 * context image; after which this register state is reused for all
+	 * user contexts.
+	 */
+	assert_forcewakes_active(dev_priv, FORCEWAKE_ALL);
 
 	dev_priv->workarounds.count = 0;
 
 	if (INTEL_GEN(dev_priv) < 8)
-		err = 0;
+		;
 	else if (IS_BROADWELL(dev_priv))
-		err = bdw_ctx_workarounds_init(dev_priv);
+		bdw_ctx_workarounds_apply(dev_priv);
 	else if (IS_CHERRYVIEW(dev_priv))
-		err = chv_ctx_workarounds_init(dev_priv);
+		chv_ctx_workarounds_apply(dev_priv);
 	else if (IS_SKYLAKE(dev_priv))
-		err = skl_ctx_workarounds_init(dev_priv);
+		skl_ctx_workarounds_apply(dev_priv);
 	else if (IS_BROXTON(dev_priv))
-		err = bxt_ctx_workarounds_init(dev_priv);
+		bxt_ctx_workarounds_apply(dev_priv);
 	else if (IS_KABYLAKE(dev_priv))
-		err = kbl_ctx_workarounds_init(dev_priv);
+		kbl_ctx_workarounds_apply(dev_priv);
 	else if (IS_GEMINILAKE(dev_priv))
-		err = glk_ctx_workarounds_init(dev_priv);
+		glk_ctx_workarounds_apply(dev_priv);
 	else if (IS_COFFEELAKE(dev_priv))
-		err = cfl_ctx_workarounds_init(dev_priv);
+		cfl_ctx_workarounds_apply(dev_priv);
 	else if (IS_CANNONLAKE(dev_priv))
-		err = cnl_ctx_workarounds_init(dev_priv);
+		cnl_ctx_workarounds_apply(dev_priv);
 	else if (IS_ICELAKE(dev_priv))
-		err = icl_ctx_workarounds_init(dev_priv);
+		icl_ctx_workarounds_apply(dev_priv);
 	else
 		MISSING_CASE(INTEL_GEN(dev_priv));
-	if (err)
-		return err;
 
 	DRM_DEBUG_DRIVER("Number of context specific w/a: %d\n",
 			 dev_priv->workarounds.count);
-	return 0;
-}
-
-int intel_ctx_workarounds_emit(struct i915_request *rq)
-{
-	struct i915_workarounds *w = &rq->i915->workarounds;
-	u32 *cs;
-	int ret, i;
-
-	if (w->count == 0)
-		return 0;
-
-	ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
-	if (ret)
-		return ret;
-
-	cs = intel_ring_begin(rq, (w->count * 2 + 2));
-	if (IS_ERR(cs))
-		return PTR_ERR(cs);
-
-	*cs++ = MI_LOAD_REGISTER_IMM(w->count);
-	for (i = 0; i < w->count; i++) {
-		*cs++ = w->reg[i].addr;
-		*cs++ = w->reg[i].value;
-	}
-	*cs++ = MI_NOOP;
-
-	intel_ring_advance(rq, cs);
-
-	ret = rq->engine->emit_flush(rq, EMIT_BARRIER);
-	if (ret)
-		return ret;
-
-	return 0;
 }
 
 static void bdw_gt_workarounds_apply(struct drm_i915_private *dev_priv)
diff --git a/drivers/gpu/drm/i915/intel_workarounds.h b/drivers/gpu/drm/i915/intel_workarounds.h
index b11d0623e626..256f8313208e 100644
--- a/drivers/gpu/drm/i915/intel_workarounds.h
+++ b/drivers/gpu/drm/i915/intel_workarounds.h
@@ -7,11 +7,8 @@
 #ifndef _I915_WORKAROUNDS_H_
 #define _I915_WORKAROUNDS_H_
 
-int intel_ctx_workarounds_init(struct drm_i915_private *dev_priv);
-int intel_ctx_workarounds_emit(struct i915_request *rq);
-
+void intel_ctx_workarounds_apply(struct drm_i915_private *dev_priv);
 void intel_gt_workarounds_apply(struct drm_i915_private *dev_priv);
-
 void intel_whitelist_workarounds_apply(struct intel_engine_cs *engine);
 
 #endif
-- 
2.18.0



More information about the Intel-gfx-trybot mailing list