[Intel-gfx] [RFC PATCH 4/6] drm/i915: Move WA BB stuff to the workarounds file as well
Oscar Mateo
oscar.mateo at intel.com
Wed Nov 8 23:59:47 UTC 2017
Since we are trying to keep all the WA stuff together, do not forget about the batch buffer (BB) WAs.
v2: s/intel_bb_workarounds_init/intel_engine_init_bb_workarounds (Chris)
v3: Rebased to before the WAs are stored
Signed-off-by: Oscar Mateo <oscar.mateo at intel.com>
Cc: Mika Kuoppala <mika.kuoppala at linux.intel.com>
Reviewed-by: Chris Wilson <chris at chris-wilson.co.uk> (v2)
---
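For context, the resulting init/fini pairing in intel_lrc.c looks roughly
like the sketch below (illustrative only, not part of the diff):

	/* logical_render_ring_init() */
	ret = intel_engine_init_bb_workarounds(engine);
	if (ret) {
		/* Non-fatal: we continue even without the WA batch */
		DRM_ERROR("WA batch buffer initialization failed: %d\n", ret);
	}

	/* intel_logical_ring_cleanup() */
	intel_engine_fini_bb_workarounds(engine);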
drivers/gpu/drm/i915/intel_lrc.c | 253 +-----------------------------
drivers/gpu/drm/i915/intel_workarounds.c | 254 +++++++++++++++++++++++++++++++
drivers/gpu/drm/i915/intel_workarounds.h | 3 +
3 files changed, 259 insertions(+), 251 deletions(-)
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index f0b4d2f..7f731d0 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1181,255 +1181,6 @@ static int execlists_request_alloc(struct drm_i915_gem_request *request)
return 0;
}
-/*
- * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
- * PIPE_CONTROL instruction. This is required for the flush to happen correctly
- * but there is a slight complication as this is applied in WA batch where the
- * values are only initialized once so we cannot take register value at the
- * beginning and reuse it further; hence we save its value to memory, upload a
- * constant value with bit21 set and then we restore it back with the saved value.
- * To simplify the WA, a constant value is formed by using the default value
- * of this register. This shouldn't be a problem because we are only modifying
- * it for a short period and this batch in non-premptible. We can ofcourse
- * use additional instructions that read the actual value of the register
- * at that time and set our bit of interest but it makes the WA complicated.
- *
- * This WA is also required for Gen9 so extracting as a function avoids
- * code duplication.
- */
-static u32 *
-gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
-{
- *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
- *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
- *batch++ = i915_ggtt_offset(engine->scratch) + 256;
- *batch++ = 0;
-
- *batch++ = MI_LOAD_REGISTER_IMM(1);
- *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
- *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
-
- batch = gen8_emit_pipe_control(batch,
- PIPE_CONTROL_CS_STALL |
- PIPE_CONTROL_DC_FLUSH_ENABLE,
- 0);
-
- *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
- *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
- *batch++ = i915_ggtt_offset(engine->scratch) + 256;
- *batch++ = 0;
-
- return batch;
-}
-
-/*
- * Typically we only have one indirect_ctx and per_ctx batch buffer which are
- * initialized at the beginning and shared across all contexts but this field
- * helps us to have multiple batches at different offsets and select them based
- * on a criteria. At the moment this batch always start at the beginning of the page
- * and at this point we don't have multiple wa_ctx batch buffers.
- *
- * The number of WA applied are not known at the beginning; we use this field
- * to return the no of DWORDS written.
- *
- * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
- * so it adds NOOPs as padding to make it cacheline aligned.
- * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
- * makes a complete batch buffer.
- */
-static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
-{
- /* WaDisableCtxRestoreArbitration:bdw,chv */
- *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
-
- /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
- if (IS_BROADWELL(engine->i915))
- batch = gen8_emit_flush_coherentl3_wa(engine, batch);
-
- /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
- /* Actual scratch location is at 128 bytes offset */
- batch = gen8_emit_pipe_control(batch,
- PIPE_CONTROL_FLUSH_L3 |
- PIPE_CONTROL_GLOBAL_GTT_IVB |
- PIPE_CONTROL_CS_STALL |
- PIPE_CONTROL_QW_WRITE,
- i915_ggtt_offset(engine->scratch) +
- 2 * CACHELINE_BYTES);
-
- *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
-
- /* Pad to end of cacheline */
- while ((unsigned long)batch % CACHELINE_BYTES)
- *batch++ = MI_NOOP;
-
- /*
- * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
- * execution depends on the length specified in terms of cache lines
- * in the register CTX_RCS_INDIRECT_CTX
- */
-
- return batch;
-}
-
-static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
-{
- *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
-
- /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
- batch = gen8_emit_flush_coherentl3_wa(engine, batch);
-
- /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
- *batch++ = MI_LOAD_REGISTER_IMM(1);
- *batch++ = i915_mmio_reg_offset(COMMON_SLICE_CHICKEN2);
- *batch++ = _MASKED_BIT_DISABLE(
- GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE);
- *batch++ = MI_NOOP;
-
- /* WaClearSlmSpaceAtContextSwitch:kbl */
- /* Actual scratch location is at 128 bytes offset */
- if (IS_KBL_REVID(engine->i915, 0, KBL_REVID_A0)) {
- batch = gen8_emit_pipe_control(batch,
- PIPE_CONTROL_FLUSH_L3 |
- PIPE_CONTROL_GLOBAL_GTT_IVB |
- PIPE_CONTROL_CS_STALL |
- PIPE_CONTROL_QW_WRITE,
- i915_ggtt_offset(engine->scratch)
- + 2 * CACHELINE_BYTES);
- }
-
- /* WaMediaPoolStateCmdInWABB:bxt,glk */
- if (HAS_POOLED_EU(engine->i915)) {
- /*
- * EU pool configuration is setup along with golden context
- * during context initialization. This value depends on
- * device type (2x6 or 3x6) and needs to be updated based
- * on which subslice is disabled especially for 2x6
- * devices, however it is safe to load default
- * configuration of 3x6 device instead of masking off
- * corresponding bits because HW ignores bits of a disabled
- * subslice and drops down to appropriate config. Please
- * see render_state_setup() in i915_gem_render_state.c for
- * possible configurations, to avoid duplication they are
- * not shown here again.
- */
- *batch++ = GEN9_MEDIA_POOL_STATE;
- *batch++ = GEN9_MEDIA_POOL_ENABLE;
- *batch++ = 0x00777000;
- *batch++ = 0;
- *batch++ = 0;
- *batch++ = 0;
- }
-
- *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
-
- /* Pad to end of cacheline */
- while ((unsigned long)batch % CACHELINE_BYTES)
- *batch++ = MI_NOOP;
-
- return batch;
-}
-
-#define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
-
-static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
-{
- struct drm_i915_gem_object *obj;
- struct i915_vma *vma;
- int err;
-
- obj = i915_gem_object_create(engine->i915, CTX_WA_BB_OBJ_SIZE);
- if (IS_ERR(obj))
- return PTR_ERR(obj);
-
- vma = i915_vma_instance(obj, &engine->i915->ggtt.base, NULL);
- if (IS_ERR(vma)) {
- err = PTR_ERR(vma);
- goto err;
- }
-
- err = i915_vma_pin(vma, 0, PAGE_SIZE, PIN_GLOBAL | PIN_HIGH);
- if (err)
- goto err;
-
- engine->wa_ctx.vma = vma;
- return 0;
-
-err:
- i915_gem_object_put(obj);
- return err;
-}
-
-static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
-{
- i915_vma_unpin_and_release(&engine->wa_ctx.vma);
-}
-
-typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
-
-static int intel_init_workaround_bb(struct intel_engine_cs *engine)
-{
- struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
- struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
- &wa_ctx->per_ctx };
- wa_bb_func_t wa_bb_fn[2];
- struct page *page;
- void *batch, *batch_ptr;
- unsigned int i;
- int ret;
-
- if (WARN_ON(engine->id != RCS || !engine->scratch))
- return -EINVAL;
-
- switch (INTEL_GEN(engine->i915)) {
- case 10:
- return 0;
- case 9:
- wa_bb_fn[0] = gen9_init_indirectctx_bb;
- wa_bb_fn[1] = NULL;
- break;
- case 8:
- wa_bb_fn[0] = gen8_init_indirectctx_bb;
- wa_bb_fn[1] = NULL;
- break;
- default:
- MISSING_CASE(INTEL_GEN(engine->i915));
- return 0;
- }
-
- ret = lrc_setup_wa_ctx(engine);
- if (ret) {
- DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
- return ret;
- }
-
- page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
- batch = batch_ptr = kmap_atomic(page);
-
- /*
- * Emit the two workaround batch buffers, recording the offset from the
- * start of the workaround batch buffer object for each and their
- * respective sizes.
- */
- for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
- wa_bb[i]->offset = batch_ptr - batch;
- if (WARN_ON(!IS_ALIGNED(wa_bb[i]->offset, CACHELINE_BYTES))) {
- ret = -EINVAL;
- break;
- }
- if (wa_bb_fn[i])
- batch_ptr = wa_bb_fn[i](engine, batch_ptr);
- wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
- }
-
- BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
-
- kunmap_atomic(batch);
- if (ret)
- lrc_destroy_wa_ctx(engine);
-
- return ret;
-}
-
static u8 gtiir[] = {
[RCS] = 0,
[BCS] = 0,
@@ -1875,7 +1626,7 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *engine)
intel_engine_cleanup_common(engine);
- lrc_destroy_wa_ctx(engine);
+ intel_engine_fini_bb_workarounds(engine);
engine->i915 = NULL;
dev_priv->engine[engine->id] = NULL;
kfree(engine);
@@ -1994,7 +1745,7 @@ int logical_render_ring_init(struct intel_engine_cs *engine)
if (ret)
return ret;
- ret = intel_init_workaround_bb(engine);
+ ret = intel_engine_init_bb_workarounds(engine);
if (ret) {
/*
* We continue even if we fail to initialize WA batch
diff --git a/drivers/gpu/drm/i915/intel_workarounds.c b/drivers/gpu/drm/i915/intel_workarounds.c
index 0a8f265..d7ec27c 100644
--- a/drivers/gpu/drm/i915/intel_workarounds.c
+++ b/drivers/gpu/drm/i915/intel_workarounds.c
@@ -862,3 +862,257 @@ int intel_whitelist_workarounds_apply(struct intel_engine_cs *engine)
dev_priv->workarounds.hw_whitelist_count[engine->id]);
return 0;
}
+
+/*
+ * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the
+ * PIPE_CONTROL instruction. This is required for the flush to happen
+ * correctly, but there is a slight complication: this is applied in a WA
+ * batch where the values are only initialized once, so we cannot read the
+ * register value at the beginning and reuse it later; hence we save its
+ * value to memory, upload a constant value with bit 21 set, and then
+ * restore the register from the saved value. To simplify the WA, the
+ * constant is formed from the default value of this register. This
+ * shouldn't be a problem because we are only modifying it for a short
+ * period and this batch is non-preemptible. We could, of course, use
+ * additional instructions that read the actual value of the register at
+ * that time and set our bit of interest, but that would make the WA more
+ * complicated.
+ *
+ * This WA is also required for Gen9, so extracting it as a function avoids
+ * code duplication.
+ */
+static u32 *
+gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
+{
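+ /* Save the current GEN8_L3SQCREG4 value to scratch memory */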
+ *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
+ *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
+ *batch++ = i915_ggtt_offset(engine->scratch) + 256;
+ *batch++ = 0;
+
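+ /* Load a constant: the register's default value with bit 21 set */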
+ *batch++ = MI_LOAD_REGISTER_IMM(1);
+ *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
+ *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
+
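+ /* DC flush with a CS stall so the flush completes before we continue */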
+ batch = gen8_emit_pipe_control(batch,
+ PIPE_CONTROL_CS_STALL |
+ PIPE_CONTROL_DC_FLUSH_ENABLE,
+ 0);
+
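+ /* Restore the register from the value saved above */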
+ *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
+ *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
+ *batch++ = i915_ggtt_offset(engine->scratch) + 256;
+ *batch++ = 0;
+
+ return batch;
+}
+
+/*
+ * Typically we only have one indirect_ctx and one per_ctx batch buffer,
+ * initialized at the beginning and shared across all contexts, but this
+ * field helps us to have multiple batches at different offsets and select
+ * them based on some criteria. At the moment this batch always starts at
+ * the beginning of the page and we don't have multiple wa_ctx batch
+ * buffers.
+ *
+ * The number of WAs applied is not known at the beginning; we use this
+ * field to return the number of DWORDs written.
+ *
+ * Note that this batch does not contain MI_BATCH_BUFFER_END, so NOOPs are
+ * added as padding to make it cacheline aligned. MI_BATCH_BUFFER_END will
+ * be added to the per-ctx batch, and the two together make a complete
+ * batch buffer.
+ */
+static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
+{
+ /* WaDisableCtxRestoreArbitration:bdw,chv */
+ *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
+
+ /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
+ if (IS_BROADWELL(engine->i915))
+ batch = gen8_emit_flush_coherentl3_wa(engine, batch);
+
+ /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
+ /* Actual scratch location is at 128 bytes offset */
+ batch = gen8_emit_pipe_control(batch,
+ PIPE_CONTROL_FLUSH_L3 |
+ PIPE_CONTROL_GLOBAL_GTT_IVB |
+ PIPE_CONTROL_CS_STALL |
+ PIPE_CONTROL_QW_WRITE,
+ i915_ggtt_offset(engine->scratch) +
+ 2 * CACHELINE_BYTES);
+
+ *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+
+ /* Pad to end of cacheline */
+ while ((unsigned long)batch % CACHELINE_BYTES)
+ *batch++ = MI_NOOP;
+
+ /*
+ * MI_BATCH_BUFFER_END is not required in the indirect ctx BB because
+ * execution depends on the length specified in terms of cache lines
+ * in the CTX_RCS_INDIRECT_CTX register.
+ */
+
+ return batch;
+}
+
+static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
+{
+ *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
+
+ /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
+ batch = gen8_emit_flush_coherentl3_wa(engine, batch);
+
+ /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
+ *batch++ = MI_LOAD_REGISTER_IMM(1);
+ *batch++ = i915_mmio_reg_offset(COMMON_SLICE_CHICKEN2);
+ *batch++ = _MASKED_BIT_DISABLE(
+ GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE);
+ *batch++ = MI_NOOP;
+
+ /* WaClearSlmSpaceAtContextSwitch:kbl */
+ /* Actual scratch location is at 128 bytes offset */
+ if (IS_KBL_REVID(engine->i915, 0, KBL_REVID_A0)) {
+ batch = gen8_emit_pipe_control(batch,
+ PIPE_CONTROL_FLUSH_L3 |
+ PIPE_CONTROL_GLOBAL_GTT_IVB |
+ PIPE_CONTROL_CS_STALL |
+ PIPE_CONTROL_QW_WRITE,
+ i915_ggtt_offset(engine->scratch)
+ + 2 * CACHELINE_BYTES);
+ }
+
+ /* WaMediaPoolStateCmdInWABB:bxt,glk */
+ if (HAS_POOLED_EU(engine->i915)) {
+ /*
+ * EU pool configuration is set up along with the golden context
+ * during context initialization. This value depends on the
+ * device type (2x6 or 3x6) and needs to be updated based on
+ * which subslice is disabled, especially for 2x6 devices.
+ * However, it is safe to load the default configuration of a
+ * 3x6 device instead of masking off the corresponding bits,
+ * because the HW ignores the bits of a disabled subslice and
+ * drops down to the appropriate config. Please see
+ * render_state_setup() in i915_gem_render_state.c for the
+ * possible configurations; to avoid duplication they are not
+ * shown here again.
+ */
+ *batch++ = GEN9_MEDIA_POOL_STATE;
+ *batch++ = GEN9_MEDIA_POOL_ENABLE;
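+ /* Default 3x6 EU pool configuration (see comment above) */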
+ *batch++ = 0x00777000;
+ *batch++ = 0;
+ *batch++ = 0;
+ *batch++ = 0;
+ }
+
+ *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+
+ /* Pad to end of cacheline */
+ while ((unsigned long)batch % CACHELINE_BYTES)
+ *batch++ = MI_NOOP;
+
+ return batch;
+}
+
+#define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
+
+static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
+{
+ struct drm_i915_gem_object *obj;
+ struct i915_vma *vma;
+ int err;
+
+ obj = i915_gem_object_create(engine->i915, CTX_WA_BB_OBJ_SIZE);
+ if (IS_ERR(obj))
+ return PTR_ERR(obj);
+
+ vma = i915_vma_instance(obj, &engine->i915->ggtt.base, NULL);
+ if (IS_ERR(vma)) {
+ err = PTR_ERR(vma);
+ goto err;
+ }
+
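+ /* Pin into the global GTT, preferring the top of the address range */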
+ err = i915_vma_pin(vma, 0, PAGE_SIZE, PIN_GLOBAL | PIN_HIGH);
+ if (err)
+ goto err;
+
+ engine->wa_ctx.vma = vma;
+ return 0;
+
+err:
+ i915_gem_object_put(obj);
+ return err;
+}
+
+static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
+{
+ i915_vma_unpin_and_release(&engine->wa_ctx.vma);
+}
+
+typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
+
+int intel_engine_init_bb_workarounds(struct intel_engine_cs *engine)
+{
+ struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
+ struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
+ &wa_ctx->per_ctx };
+ wa_bb_func_t wa_bb_fn[2];
+ struct page *page;
+ void *batch, *batch_ptr;
+ unsigned int i;
+ int ret;
+
+ if (WARN_ON(engine->id != RCS || !engine->scratch))
+ return -EINVAL;
+
+ switch (INTEL_GEN(engine->i915)) {
+ case 10:
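+ /* Gen10 does not need a WA BB so far */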
+ return 0;
+ case 9:
+ wa_bb_fn[0] = gen9_init_indirectctx_bb;
+ wa_bb_fn[1] = NULL;
+ break;
+ case 8:
+ wa_bb_fn[0] = gen8_init_indirectctx_bb;
+ wa_bb_fn[1] = NULL;
+ break;
+ default:
+ MISSING_CASE(INTEL_GEN(engine->i915));
+ return 0;
+ }
+
+ ret = lrc_setup_wa_ctx(engine);
+ if (ret) {
+ DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
+ return ret;
+ }
+
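+ /* Map the (single) page backing the WA BB object */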
+ page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
+ batch = batch_ptr = kmap_atomic(page);
+
+ /*
+ * Emit the two workaround batch buffers, recording the offset from the
+ * start of the workaround batch buffer object for each and their
+ * respective sizes.
+ */
+ for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
+ wa_bb[i]->offset = batch_ptr - batch;
+ if (WARN_ON(!IS_ALIGNED(wa_bb[i]->offset, CACHELINE_BYTES))) {
+ ret = -EINVAL;
+ break;
+ }
+ if (wa_bb_fn[i])
+ batch_ptr = wa_bb_fn[i](engine, batch_ptr);
+ wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
+ }
+
+ BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
+
+ kunmap_atomic(batch);
+ if (ret)
+ lrc_destroy_wa_ctx(engine);
+
+ return ret;
+}
+
+void intel_engine_fini_bb_workarounds(struct intel_engine_cs *engine)
+{
+ lrc_destroy_wa_ctx(engine);
+}
diff --git a/drivers/gpu/drm/i915/intel_workarounds.h b/drivers/gpu/drm/i915/intel_workarounds.h
index bba51bb..aa9a894 100644
--- a/drivers/gpu/drm/i915/intel_workarounds.h
+++ b/drivers/gpu/drm/i915/intel_workarounds.h
@@ -32,4 +32,7 @@
int intel_whitelist_workarounds_apply(struct intel_engine_cs *engine);
+int intel_engine_init_bb_workarounds(struct intel_engine_cs *engine);
+void intel_engine_fini_bb_workarounds(struct intel_engine_cs *engine);
+
#endif
--
1.9.1