[RFC PATCH 007/162] drm/i915: split wa_bb code to its own file
Matthew Auld
matthew.auld at intel.com
Fri Nov 27 12:04:43 UTC 2020
From: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
Continuing the split of back-end independent code from the execlist
submission specific file.
Based on a patch by Chris Wilson.
Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
Cc: Chris P Wilson <chris.p.wilson at intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin at linux.intel.com>
Reviewed-by: John Harrison <John.C.Harrison at Intel.com>
---
drivers/gpu/drm/i915/Makefile | 1 +
.../drm/i915/gt/intel_engine_workaround_bb.c | 335 ++++++++++++++++++
.../drm/i915/gt/intel_engine_workaround_bb.h | 14 +
.../drm/i915/gt/intel_execlists_submission.c | 327 +----------------
4 files changed, 352 insertions(+), 325 deletions(-)
create mode 100644 drivers/gpu/drm/i915/gt/intel_engine_workaround_bb.c
create mode 100644 drivers/gpu/drm/i915/gt/intel_engine_workaround_bb.h
diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index f9ef5199b124..2445cc990e15 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -92,6 +92,7 @@ gt-y += \
gt/intel_engine_heartbeat.o \
gt/intel_engine_pm.o \
gt/intel_engine_user.o \
+ gt/intel_engine_workaround_bb.o \
gt/intel_execlists_submission.o \
gt/intel_ggtt.o \
gt/intel_ggtt_fencing.o \
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_workaround_bb.c b/drivers/gpu/drm/i915/gt/intel_engine_workaround_bb.c
new file mode 100644
index 000000000000..b03bdfc92bb2
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/intel_engine_workaround_bb.c
@@ -0,0 +1,335 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2014 Intel Corporation
+ */
+
+#include "i915_drv.h"
+#include "intel_engine_types.h"
+#include "intel_engine_workaround_bb.h"
+#include "intel_execlists_submission.h" /* XXX */
+#include "intel_gpu_commands.h"
+#include "intel_gt.h"
+
+/*
+ * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
+ * PIPE_CONTROL instruction. This is required for the flush to happen correctly
+ * but there is a slight complication as this is applied in WA batch where the
+ * values are only initialized once so we cannot take register value at the
+ * beginning and reuse it further; hence we save its value to memory, upload a
+ * constant value with bit21 set and then we restore it back with the saved value.
+ * To simplify the WA, a constant value is formed by using the default value
+ * of this register. This shouldn't be a problem because we are only modifying
+ * it for a short period and this batch in non-premptible. We can ofcourse
+ * use additional instructions that read the actual value of the register
+ * at that time and set our bit of interest but it makes the WA complicated.
+ *
+ * This WA is also required for Gen9 so extracting as a function avoids
+ * code duplication.
+ */
+static u32 *
+gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
+{
+ /* NB no one else is allowed to scribble over scratch + 256! */
+ *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
+ *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
+ *batch++ = intel_gt_scratch_offset(engine->gt,
+ INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
+ *batch++ = 0;
+
+ *batch++ = MI_LOAD_REGISTER_IMM(1);
+ *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
+ *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
+
+ batch = gen8_emit_pipe_control(batch,
+ PIPE_CONTROL_CS_STALL |
+ PIPE_CONTROL_DC_FLUSH_ENABLE,
+ 0);
+
+ *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
+ *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
+ *batch++ = intel_gt_scratch_offset(engine->gt,
+ INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
+ *batch++ = 0;
+
+ return batch;
+}
+
+/*
+ * Typically we only have one indirect_ctx and per_ctx batch buffer which are
+ * initialized at the beginning and shared across all contexts but this field
+ * helps us to have multiple batches at different offsets and select them based
+ * on a criteria. At the moment this batch always start at the beginning of the page
+ * and at this point we don't have multiple wa_ctx batch buffers.
+ *
+ * The number of WA applied are not known at the beginning; we use this field
+ * to return the no of DWORDS written.
+ *
+ * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
+ * so it adds NOOPs as padding to make it cacheline aligned.
+ * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
+ * makes a complete batch buffer.
+ */
+static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
+{
+ /* WaDisableCtxRestoreArbitration:bdw,chv */
+ *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
+
+ /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
+ if (IS_BROADWELL(engine->i915))
+ batch = gen8_emit_flush_coherentl3_wa(engine, batch);
+
+ /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
+ /* Actual scratch location is at 128 bytes offset */
+ batch = gen8_emit_pipe_control(batch,
+ PIPE_CONTROL_FLUSH_L3 |
+ PIPE_CONTROL_STORE_DATA_INDEX |
+ PIPE_CONTROL_CS_STALL |
+ PIPE_CONTROL_QW_WRITE,
+ LRC_PPHWSP_SCRATCH_ADDR);
+
+ *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+
+ /* Pad to end of cacheline */
+ while ((unsigned long)batch % CACHELINE_BYTES)
+ *batch++ = MI_NOOP;
+
+ /*
+ * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
+ * execution depends on the length specified in terms of cache lines
+ * in the register CTX_RCS_INDIRECT_CTX
+ */
+
+ return batch;
+}
+
+struct lri {
+ i915_reg_t reg;
+ u32 value;
+};
+
+static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
+{
+ GEM_BUG_ON(!count || count > 63);
+
+ *batch++ = MI_LOAD_REGISTER_IMM(count);
+ do {
+ *batch++ = i915_mmio_reg_offset(lri->reg);
+ *batch++ = lri->value;
+ } while (lri++, --count);
+ *batch++ = MI_NOOP;
+
+ return batch;
+}
+
+static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
+{
+ static const struct lri lri[] = {
+ /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
+ {
+ COMMON_SLICE_CHICKEN2,
+ __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
+ 0),
+ },
+
+ /* BSpec: 11391 */
+ {
+ FF_SLICE_CHICKEN,
+ __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
+ FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
+ },
+
+ /* BSpec: 11299 */
+ {
+ _3D_CHICKEN3,
+ __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
+ _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
+ }
+ };
+
+ *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
+
+ /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
+ batch = gen8_emit_flush_coherentl3_wa(engine, batch);
+
+ /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
+ batch = gen8_emit_pipe_control(batch,
+ PIPE_CONTROL_FLUSH_L3 |
+ PIPE_CONTROL_STORE_DATA_INDEX |
+ PIPE_CONTROL_CS_STALL |
+ PIPE_CONTROL_QW_WRITE,
+ LRC_PPHWSP_SCRATCH_ADDR);
+
+ batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
+
+ /* WaMediaPoolStateCmdInWABB:bxt,glk */
+ if (HAS_POOLED_EU(engine->i915)) {
+ /*
+ * EU pool configuration is setup along with golden context
+ * during context initialization. This value depends on
+ * device type (2x6 or 3x6) and needs to be updated based
+ * on which subslice is disabled especially for 2x6
+ * devices, however it is safe to load default
+ * configuration of 3x6 device instead of masking off
+ * corresponding bits because HW ignores bits of a disabled
+ * subslice and drops down to appropriate config. Please
+ * see render_state_setup() in i915_gem_render_state.c for
+ * possible configurations, to avoid duplication they are
+ * not shown here again.
+ */
+ *batch++ = GEN9_MEDIA_POOL_STATE;
+ *batch++ = GEN9_MEDIA_POOL_ENABLE;
+ *batch++ = 0x00777000;
+ *batch++ = 0;
+ *batch++ = 0;
+ *batch++ = 0;
+ }
+
+ *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+
+ /* Pad to end of cacheline */
+ while ((unsigned long)batch % CACHELINE_BYTES)
+ *batch++ = MI_NOOP;
+
+ return batch;
+}
+
+static u32 *
+gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
+{
+ int i;
+
+ /*
+ * WaPipeControlBefore3DStateSamplePattern: cnl
+ *
+ * Ensure the engine is idle prior to programming a
+ * 3DSTATE_SAMPLE_PATTERN during a context restore.
+ */
+ batch = gen8_emit_pipe_control(batch,
+ PIPE_CONTROL_CS_STALL,
+ 0);
+ /*
+ * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
+ * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
+ * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
+ * confusing. Since gen8_emit_pipe_control() already advances the
+ * batch by 6 dwords, we advance the other 10 here, completing a
+ * cacheline. It's not clear if the workaround requires this padding
+ * before other commands, or if it's just the regular padding we would
+ * already have for the workaround bb, so leave it here for now.
+ */
+ for (i = 0; i < 10; i++)
+ *batch++ = MI_NOOP;
+
+ /* Pad to end of cacheline */
+ while ((unsigned long)batch % CACHELINE_BYTES)
+ *batch++ = MI_NOOP;
+
+ return batch;
+}
+
+#define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
+
+static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
+{
+ struct drm_i915_gem_object *obj;
+ struct i915_vma *vma;
+ int err;
+
+ obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
+ if (IS_ERR(obj))
+ return PTR_ERR(obj);
+
+ vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
+ if (IS_ERR(vma)) {
+ err = PTR_ERR(vma);
+ goto err;
+ }
+
+ err = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH);
+ if (err)
+ goto err;
+
+ engine->wa_ctx.vma = vma;
+ return 0;
+
+err:
+ i915_gem_object_put(obj);
+ return err;
+}
+
+typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
+
+int intel_init_workaround_bb(struct intel_engine_cs *engine)
+{
+ struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
+ struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
+ &wa_ctx->per_ctx };
+ wa_bb_func_t wa_bb_fn[2];
+ void *batch, *batch_ptr;
+ unsigned int i;
+ int ret;
+
+ if (engine->class != RENDER_CLASS)
+ return 0;
+
+ switch (INTEL_GEN(engine->i915)) {
+ case 12:
+ case 11:
+ return 0;
+ case 10:
+ wa_bb_fn[0] = gen10_init_indirectctx_bb;
+ wa_bb_fn[1] = NULL;
+ break;
+ case 9:
+ wa_bb_fn[0] = gen9_init_indirectctx_bb;
+ wa_bb_fn[1] = NULL;
+ break;
+ case 8:
+ wa_bb_fn[0] = gen8_init_indirectctx_bb;
+ wa_bb_fn[1] = NULL;
+ break;
+ default:
+ MISSING_CASE(INTEL_GEN(engine->i915));
+ return 0;
+ }
+
+ ret = lrc_setup_wa_ctx(engine);
+ if (ret) {
+ drm_dbg(&engine->i915->drm,
+ "Failed to setup context WA page: %d\n", ret);
+ return ret;
+ }
+
+ batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
+
+ /*
+ * Emit the two workaround batch buffers, recording the offset from the
+ * start of the workaround batch buffer object for each and their
+ * respective sizes.
+ */
+ batch_ptr = batch;
+ for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
+ wa_bb[i]->offset = batch_ptr - batch;
+ if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
+ CACHELINE_BYTES))) {
+ ret = -EINVAL;
+ break;
+ }
+ if (wa_bb_fn[i])
+ batch_ptr = wa_bb_fn[i](engine, batch_ptr);
+ wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
+ }
+ GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
+
+ __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
+ __i915_gem_object_release_map(wa_ctx->vma->obj);
+ if (ret)
+ intel_fini_workaround_bb(engine);
+
+ return ret;
+}
+
+void intel_fini_workaround_bb(struct intel_engine_cs *engine)
+{
+ i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
+}
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_workaround_bb.h b/drivers/gpu/drm/i915/gt/intel_engine_workaround_bb.h
new file mode 100644
index 000000000000..88771d77fd42
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/intel_engine_workaround_bb.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2014 Intel Corporation
+ */
+
+#ifndef __INTEL_ENGINE_WORKAROUND_BB_H__
+#define __INTEL_ENGINE_WORKAROUND_BB_H__
+
+struct intel_engine_cs;
+
+int intel_init_workaround_bb(struct intel_engine_cs *engine);
+void intel_fini_workaround_bb(struct intel_engine_cs *engine);
+
+#endif /* __INTEL_ENGINE_WORKAROUND_BB_H__ */
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index 9069a456d2f7..1cc93ea6b7f0 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -116,6 +116,7 @@
#include "intel_breadcrumbs.h"
#include "intel_context.h"
#include "intel_engine_pm.h"
+#include "intel_engine_workaround_bb.h"
#include "intel_execlists_submission.h"
#include "intel_gt.h"
#include "intel_gt_pm.h"
@@ -3695,330 +3696,6 @@ static int execlists_request_alloc(struct i915_request *request)
return 0;
}
-/*
- * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
- * PIPE_CONTROL instruction. This is required for the flush to happen correctly
- * but there is a slight complication as this is applied in WA batch where the
- * values are only initialized once so we cannot take register value at the
- * beginning and reuse it further; hence we save its value to memory, upload a
- * constant value with bit21 set and then we restore it back with the saved value.
- * To simplify the WA, a constant value is formed by using the default value
- * of this register. This shouldn't be a problem because we are only modifying
- * it for a short period and this batch in non-premptible. We can ofcourse
- * use additional instructions that read the actual value of the register
- * at that time and set our bit of interest but it makes the WA complicated.
- *
- * This WA is also required for Gen9 so extracting as a function avoids
- * code duplication.
- */
-static u32 *
-gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
-{
- /* NB no one else is allowed to scribble over scratch + 256! */
- *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
- *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
- *batch++ = intel_gt_scratch_offset(engine->gt,
- INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
- *batch++ = 0;
-
- *batch++ = MI_LOAD_REGISTER_IMM(1);
- *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
- *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
-
- batch = gen8_emit_pipe_control(batch,
- PIPE_CONTROL_CS_STALL |
- PIPE_CONTROL_DC_FLUSH_ENABLE,
- 0);
-
- *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
- *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
- *batch++ = intel_gt_scratch_offset(engine->gt,
- INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
- *batch++ = 0;
-
- return batch;
-}
-
-/*
- * Typically we only have one indirect_ctx and per_ctx batch buffer which are
- * initialized at the beginning and shared across all contexts but this field
- * helps us to have multiple batches at different offsets and select them based
- * on a criteria. At the moment this batch always start at the beginning of the page
- * and at this point we don't have multiple wa_ctx batch buffers.
- *
- * The number of WA applied are not known at the beginning; we use this field
- * to return the no of DWORDS written.
- *
- * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
- * so it adds NOOPs as padding to make it cacheline aligned.
- * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
- * makes a complete batch buffer.
- */
-static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
-{
- /* WaDisableCtxRestoreArbitration:bdw,chv */
- *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
-
- /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
- if (IS_BROADWELL(engine->i915))
- batch = gen8_emit_flush_coherentl3_wa(engine, batch);
-
- /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
- /* Actual scratch location is at 128 bytes offset */
- batch = gen8_emit_pipe_control(batch,
- PIPE_CONTROL_FLUSH_L3 |
- PIPE_CONTROL_STORE_DATA_INDEX |
- PIPE_CONTROL_CS_STALL |
- PIPE_CONTROL_QW_WRITE,
- LRC_PPHWSP_SCRATCH_ADDR);
-
- *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
-
- /* Pad to end of cacheline */
- while ((unsigned long)batch % CACHELINE_BYTES)
- *batch++ = MI_NOOP;
-
- /*
- * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
- * execution depends on the length specified in terms of cache lines
- * in the register CTX_RCS_INDIRECT_CTX
- */
-
- return batch;
-}
-
-struct lri {
- i915_reg_t reg;
- u32 value;
-};
-
-static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
-{
- GEM_BUG_ON(!count || count > 63);
-
- *batch++ = MI_LOAD_REGISTER_IMM(count);
- do {
- *batch++ = i915_mmio_reg_offset(lri->reg);
- *batch++ = lri->value;
- } while (lri++, --count);
- *batch++ = MI_NOOP;
-
- return batch;
-}
-
-static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
-{
- static const struct lri lri[] = {
- /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
- {
- COMMON_SLICE_CHICKEN2,
- __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
- 0),
- },
-
- /* BSpec: 11391 */
- {
- FF_SLICE_CHICKEN,
- __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
- FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
- },
-
- /* BSpec: 11299 */
- {
- _3D_CHICKEN3,
- __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
- _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
- }
- };
-
- *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
-
- /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
- batch = gen8_emit_flush_coherentl3_wa(engine, batch);
-
- /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
- batch = gen8_emit_pipe_control(batch,
- PIPE_CONTROL_FLUSH_L3 |
- PIPE_CONTROL_STORE_DATA_INDEX |
- PIPE_CONTROL_CS_STALL |
- PIPE_CONTROL_QW_WRITE,
- LRC_PPHWSP_SCRATCH_ADDR);
-
- batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
-
- /* WaMediaPoolStateCmdInWABB:bxt,glk */
- if (HAS_POOLED_EU(engine->i915)) {
- /*
- * EU pool configuration is setup along with golden context
- * during context initialization. This value depends on
- * device type (2x6 or 3x6) and needs to be updated based
- * on which subslice is disabled especially for 2x6
- * devices, however it is safe to load default
- * configuration of 3x6 device instead of masking off
- * corresponding bits because HW ignores bits of a disabled
- * subslice and drops down to appropriate config. Please
- * see render_state_setup() in i915_gem_render_state.c for
- * possible configurations, to avoid duplication they are
- * not shown here again.
- */
- *batch++ = GEN9_MEDIA_POOL_STATE;
- *batch++ = GEN9_MEDIA_POOL_ENABLE;
- *batch++ = 0x00777000;
- *batch++ = 0;
- *batch++ = 0;
- *batch++ = 0;
- }
-
- *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
-
- /* Pad to end of cacheline */
- while ((unsigned long)batch % CACHELINE_BYTES)
- *batch++ = MI_NOOP;
-
- return batch;
-}
-
-static u32 *
-gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
-{
- int i;
-
- /*
- * WaPipeControlBefore3DStateSamplePattern: cnl
- *
- * Ensure the engine is idle prior to programming a
- * 3DSTATE_SAMPLE_PATTERN during a context restore.
- */
- batch = gen8_emit_pipe_control(batch,
- PIPE_CONTROL_CS_STALL,
- 0);
- /*
- * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
- * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
- * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
- * confusing. Since gen8_emit_pipe_control() already advances the
- * batch by 6 dwords, we advance the other 10 here, completing a
- * cacheline. It's not clear if the workaround requires this padding
- * before other commands, or if it's just the regular padding we would
- * already have for the workaround bb, so leave it here for now.
- */
- for (i = 0; i < 10; i++)
- *batch++ = MI_NOOP;
-
- /* Pad to end of cacheline */
- while ((unsigned long)batch % CACHELINE_BYTES)
- *batch++ = MI_NOOP;
-
- return batch;
-}
-
-#define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
-
-static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
-{
- struct drm_i915_gem_object *obj;
- struct i915_vma *vma;
- int err;
-
- obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
- if (IS_ERR(obj))
- return PTR_ERR(obj);
-
- vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
- if (IS_ERR(vma)) {
- err = PTR_ERR(vma);
- goto err;
- }
-
- err = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH);
- if (err)
- goto err;
-
- engine->wa_ctx.vma = vma;
- return 0;
-
-err:
- i915_gem_object_put(obj);
- return err;
-}
-
-static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
-{
- i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
-}
-
-typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
-
-static int intel_init_workaround_bb(struct intel_engine_cs *engine)
-{
- struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
- struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
- &wa_ctx->per_ctx };
- wa_bb_func_t wa_bb_fn[2];
- void *batch, *batch_ptr;
- unsigned int i;
- int ret;
-
- if (engine->class != RENDER_CLASS)
- return 0;
-
- switch (INTEL_GEN(engine->i915)) {
- case 12:
- case 11:
- return 0;
- case 10:
- wa_bb_fn[0] = gen10_init_indirectctx_bb;
- wa_bb_fn[1] = NULL;
- break;
- case 9:
- wa_bb_fn[0] = gen9_init_indirectctx_bb;
- wa_bb_fn[1] = NULL;
- break;
- case 8:
- wa_bb_fn[0] = gen8_init_indirectctx_bb;
- wa_bb_fn[1] = NULL;
- break;
- default:
- MISSING_CASE(INTEL_GEN(engine->i915));
- return 0;
- }
-
- ret = lrc_setup_wa_ctx(engine);
- if (ret) {
- drm_dbg(&engine->i915->drm,
- "Failed to setup context WA page: %d\n", ret);
- return ret;
- }
-
- batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
-
- /*
- * Emit the two workaround batch buffers, recording the offset from the
- * start of the workaround batch buffer object for each and their
- * respective sizes.
- */
- batch_ptr = batch;
- for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
- wa_bb[i]->offset = batch_ptr - batch;
- if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
- CACHELINE_BYTES))) {
- ret = -EINVAL;
- break;
- }
- if (wa_bb_fn[i])
- batch_ptr = wa_bb_fn[i](engine, batch_ptr);
- wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
- }
- GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
-
- __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
- __i915_gem_object_release_map(wa_ctx->vma->obj);
- if (ret)
- lrc_destroy_wa_ctx(engine);
-
- return ret;
-}
-
static void reset_csb_pointers(struct intel_engine_cs *engine)
{
struct intel_engine_execlists * const execlists = &engine->execlists;
@@ -4707,7 +4384,7 @@ static void execlists_release(struct intel_engine_cs *engine)
execlists_shutdown(engine);
intel_engine_cleanup_common(engine);
- lrc_destroy_wa_ctx(engine);
+ intel_fini_workaround_bb(engine);
}
static void
--
2.26.2
More information about the dri-devel
mailing list