[PATCH 26/64] drm/i915: Make intel_init_workaround_bb more compatible with ww locking.
Maarten Lankhorst
maarten.lankhorst at linux.intel.com
Mon Dec 14 12:49:14 UTC 2020
Make creation separate from pinning, in order to take the lock only
once, and pin the mapping with the lock held.
Signed-off-by: Maarten Lankhorst <maarten.lankhorst at linux.intel.com>
Reviewed-by: Thomas Hellström <thomas.hellstrom at linux.intel.com>
---
.../drm/i915/gt/intel_engine_workaround_bb.c | 356 ++++++++++++++++++
.../drm/i915/gt/intel_execlists_submission.c | 36 +-
2 files changed, 384 insertions(+), 8 deletions(-)
create mode 100644 drivers/gpu/drm/i915/gt/intel_engine_workaround_bb.c
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_workaround_bb.c b/drivers/gpu/drm/i915/gt/intel_engine_workaround_bb.c
new file mode 100644
index 000000000000..984e5fc72ac1
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/intel_engine_workaround_bb.c
@@ -0,0 +1,356 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2014 Intel Corporation
+ */
+
+#include "i915_drv.h"
+#include "intel_engine_types.h"
+#include "intel_engine_workaround_bb.h"
+#include "intel_execlists_submission.h" /* XXX */
+#include "intel_gpu_commands.h"
+#include "intel_gt.h"
+
+/*
+ * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
+ * PIPE_CONTROL instruction. This is required for the flush to happen correctly
+ * but there is a slight complication as this is applied in WA batch where the
+ * values are only initialized once so we cannot take register value at the
+ * beginning and reuse it further; hence we save its value to memory, upload a
+ * constant value with bit21 set and then we restore it back with the saved value.
+ * To simplify the WA, a constant value is formed by using the default value
+ * of this register. This shouldn't be a problem because we are only modifying
+ * it for a short period and this batch in non-premptible. We can ofcourse
+ * use additional instructions that read the actual value of the register
+ * at that time and set our bit of interest but it makes the WA complicated.
+ *
+ * This WA is also required for Gen9 so extracting as a function avoids
+ * code duplication.
+ */
+static u32 *
+gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
+{
+ /* NB no one else is allowed to scribble over scratch + 256! */
+ *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
+ *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
+ *batch++ = intel_gt_scratch_offset(engine->gt,
+ INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
+ *batch++ = 0;
+
+ *batch++ = MI_LOAD_REGISTER_IMM(1);
+ *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
+ *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
+
+ batch = gen8_emit_pipe_control(batch,
+ PIPE_CONTROL_CS_STALL |
+ PIPE_CONTROL_DC_FLUSH_ENABLE,
+ 0);
+
+ *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
+ *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
+ *batch++ = intel_gt_scratch_offset(engine->gt,
+ INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
+ *batch++ = 0;
+
+ return batch;
+}
+
+/*
+ * Typically we only have one indirect_ctx and per_ctx batch buffer which are
+ * initialized at the beginning and shared across all contexts but this field
+ * helps us to have multiple batches at different offsets and select them based
+ * on a criteria. At the moment this batch always start at the beginning of the page
+ * and at this point we don't have multiple wa_ctx batch buffers.
+ *
+ * The number of WA applied are not known at the beginning; we use this field
+ * to return the no of DWORDS written.
+ *
+ * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
+ * so it adds NOOPs as padding to make it cacheline aligned.
+ * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
+ * makes a complete batch buffer.
+ */
+static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
+{
+ /* WaDisableCtxRestoreArbitration:bdw,chv */
+ *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
+
+ /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
+ if (IS_BROADWELL(engine->i915))
+ batch = gen8_emit_flush_coherentl3_wa(engine, batch);
+
+ /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
+ /* Actual scratch location is at 128 bytes offset */
+ batch = gen8_emit_pipe_control(batch,
+ PIPE_CONTROL_FLUSH_L3 |
+ PIPE_CONTROL_STORE_DATA_INDEX |
+ PIPE_CONTROL_CS_STALL |
+ PIPE_CONTROL_QW_WRITE,
+ LRC_PPHWSP_SCRATCH_ADDR);
+
+ *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+
+ /* Pad to end of cacheline */
+ while ((unsigned long)batch % CACHELINE_BYTES)
+ *batch++ = MI_NOOP;
+
+ /*
+ * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
+ * execution depends on the length specified in terms of cache lines
+ * in the register CTX_RCS_INDIRECT_CTX
+ */
+
+ return batch;
+}
+
+struct lri {
+ i915_reg_t reg;
+ u32 value;
+};
+
+static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
+{
+ GEM_BUG_ON(!count || count > 63);
+
+ *batch++ = MI_LOAD_REGISTER_IMM(count);
+ do {
+ *batch++ = i915_mmio_reg_offset(lri->reg);
+ *batch++ = lri->value;
+ } while (lri++, --count);
+ *batch++ = MI_NOOP;
+
+ return batch;
+}
+
+static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
+{
+ static const struct lri lri[] = {
+ /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
+ {
+ COMMON_SLICE_CHICKEN2,
+ __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
+ 0),
+ },
+
+ /* BSpec: 11391 */
+ {
+ FF_SLICE_CHICKEN,
+ __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
+ FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
+ },
+
+ /* BSpec: 11299 */
+ {
+ _3D_CHICKEN3,
+ __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
+ _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
+ }
+ };
+
+ *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
+
+ /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
+ batch = gen8_emit_flush_coherentl3_wa(engine, batch);
+
+ /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
+ batch = gen8_emit_pipe_control(batch,
+ PIPE_CONTROL_FLUSH_L3 |
+ PIPE_CONTROL_STORE_DATA_INDEX |
+ PIPE_CONTROL_CS_STALL |
+ PIPE_CONTROL_QW_WRITE,
+ LRC_PPHWSP_SCRATCH_ADDR);
+
+ batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
+
+ /* WaMediaPoolStateCmdInWABB:bxt,glk */
+ if (HAS_POOLED_EU(engine->i915)) {
+ /*
+ * EU pool configuration is setup along with golden context
+ * during context initialization. This value depends on
+ * device type (2x6 or 3x6) and needs to be updated based
+ * on which subslice is disabled especially for 2x6
+ * devices, however it is safe to load default
+ * configuration of 3x6 device instead of masking off
+ * corresponding bits because HW ignores bits of a disabled
+ * subslice and drops down to appropriate config. Please
+ * see render_state_setup() in i915_gem_render_state.c for
+ * possible configurations, to avoid duplication they are
+ * not shown here again.
+ */
+ *batch++ = GEN9_MEDIA_POOL_STATE;
+ *batch++ = GEN9_MEDIA_POOL_ENABLE;
+ *batch++ = 0x00777000;
+ *batch++ = 0;
+ *batch++ = 0;
+ *batch++ = 0;
+ }
+
+ *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+
+ /* Pad to end of cacheline */
+ while ((unsigned long)batch % CACHELINE_BYTES)
+ *batch++ = MI_NOOP;
+
+ return batch;
+}
+
+static u32 *
+gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
+{
+ int i;
+
+ /*
+ * WaPipeControlBefore3DStateSamplePattern: cnl
+ *
+ * Ensure the engine is idle prior to programming a
+ * 3DSTATE_SAMPLE_PATTERN during a context restore.
+ */
+ batch = gen8_emit_pipe_control(batch,
+ PIPE_CONTROL_CS_STALL,
+ 0);
+ /*
+ * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
+ * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
+ * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
+ * confusing. Since gen8_emit_pipe_control() already advances the
+ * batch by 6 dwords, we advance the other 10 here, completing a
+ * cacheline. It's not clear if the workaround requires this padding
+ * before other commands, or if it's just the regular padding we would
+ * already have for the workaround bb, so leave it here for now.
+ */
+ for (i = 0; i < 10; i++)
+ *batch++ = MI_NOOP;
+
+ /* Pad to end of cacheline */
+ while ((unsigned long)batch % CACHELINE_BYTES)
+ *batch++ = MI_NOOP;
+
+ return batch;
+}
+
+#define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
+
+static int lrc_init_wa_ctx(struct intel_engine_cs *engine)
+{
+ struct drm_i915_gem_object *obj;
+ struct i915_vma *vma;
+ int err;
+
+ obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
+ if (IS_ERR(obj))
+ return PTR_ERR(obj);
+
+ vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
+ if (IS_ERR(vma)) {
+ err = PTR_ERR(vma);
+ goto err;
+ }
+
+ engine->wa_ctx.vma = vma;
+ return 0;
+
+err:
+ i915_gem_object_put(obj);
+ return err;
+}
+
+typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
+
+int intel_init_workaround_bb(struct intel_engine_cs *engine)
+{
+ struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
+ struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
+ &wa_ctx->per_ctx };
+ wa_bb_func_t wa_bb_fn[2];
+ void *batch, *batch_ptr;
+ struct i915_gem_ww_ctx ww;
+ unsigned int i;
+ int ret;
+
+ if (engine->class != RENDER_CLASS)
+ return 0;
+
+ switch (INTEL_GEN(engine->i915)) {
+ case 12:
+ case 11:
+ return 0;
+ case 10:
+ wa_bb_fn[0] = gen10_init_indirectctx_bb;
+ wa_bb_fn[1] = NULL;
+ break;
+ case 9:
+ wa_bb_fn[0] = gen9_init_indirectctx_bb;
+ wa_bb_fn[1] = NULL;
+ break;
+ case 8:
+ wa_bb_fn[0] = gen8_init_indirectctx_bb;
+ wa_bb_fn[1] = NULL;
+ break;
+ default:
+ MISSING_CASE(INTEL_GEN(engine->i915));
+ return 0;
+ }
+
+ ret = lrc_init_wa_ctx(engine);
+ if (ret) {
+ drm_dbg(&engine->i915->drm,
+ "Failed to setup context WA page: %d\n", ret);
+ return ret;
+ }
+
+ i915_gem_ww_ctx_init(&ww, true);
+retry:
+ ret = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
+ if (!ret)
+ ret = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
+ if (ret)
+ goto err;
+
+ batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
+ if (IS_ERR(batch))
+ goto err_unpin;
+
+ /*
+ * Emit the two workaround batch buffers, recording the offset from the
+ * start of the workaround batch buffer object for each and their
+ * respective sizes.
+ */
+ batch_ptr = batch;
+ for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
+ wa_bb[i]->offset = batch_ptr - batch;
+ if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
+ CACHELINE_BYTES))) {
+ ret = -EINVAL;
+ break;
+ }
+ if (wa_bb_fn[i])
+ batch_ptr = wa_bb_fn[i](engine, batch_ptr);
+ wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
+ }
+ GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
+
+err_unpin:
+ if (ret)
+ i915_vma_unpin(wa_ctx->vma);
+
+err:
+ if (ret == -EDEADLK) {
+ ret = i915_gem_ww_ctx_backoff(&ww);
+ if (!ret)
+ goto retry;
+ }
+ i915_gem_ww_ctx_fini(&ww);
+
+ __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
+ __i915_gem_object_release_map(wa_ctx->vma->obj);
+ if (ret) {
+ i915_vma_put(wa_ctx->vma);
+ wa_ctx->vma = NULL;
+ }
+
+ return ret;
+}
+
+void intel_fini_workaround_bb(struct intel_engine_cs *engine)
+{
+ i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
+}
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index 832deac8eed2..c97e77a0ba2e 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -3911,7 +3911,7 @@ gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
#define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
-static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
+static int lrc_init_wa_ctx(struct intel_engine_cs *engine)
{
struct drm_i915_gem_object *obj;
struct i915_vma *vma;
@@ -3927,10 +3927,6 @@ static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
goto err;
}
- err = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH);
- if (err)
- goto err;
-
engine->wa_ctx.vma = vma;
return 0;
@@ -3953,6 +3949,7 @@ static int intel_init_workaround_bb(struct intel_engine_cs *engine)
&wa_ctx->per_ctx };
wa_bb_func_t wa_bb_fn[2];
void *batch, *batch_ptr;
+ struct i915_gem_ww_ctx ww;
unsigned int i;
int ret;
@@ -3980,14 +3977,24 @@ static int intel_init_workaround_bb(struct intel_engine_cs *engine)
return 0;
}
- ret = lrc_setup_wa_ctx(engine);
+ ret = lrc_init_wa_ctx(engine);
if (ret) {
drm_dbg(&engine->i915->drm,
"Failed to setup context WA page: %d\n", ret);
return ret;
}
+ i915_gem_ww_ctx_init(&ww, true);
+retry:
+ ret = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
+ if (!ret)
+ ret = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
+ if (ret)
+ goto err;
+
batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
+ if (IS_ERR(batch))
+ goto err_unpin;
/*
* Emit the two workaround batch buffers, recording the offset from the
@@ -4008,10 +4015,23 @@ static int intel_init_workaround_bb(struct intel_engine_cs *engine)
}
GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
+err_unpin:
+ if (ret)
+ i915_vma_unpin(wa_ctx->vma);
+err:
+ if (ret == -EDEADLK) {
+ ret = i915_gem_ww_ctx_backoff(&ww);
+ if (!ret)
+ goto retry;
+ }
+ i915_gem_ww_ctx_fini(&ww);
+
__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
__i915_gem_object_release_map(wa_ctx->vma->obj);
- if (ret)
- lrc_destroy_wa_ctx(engine);
+ if (ret) {
+ i915_vma_put(wa_ctx->vma);
+ wa_ctx->vma = NULL;
+ }
return ret;
}
--
2.29.2.222.g5d2a92d10f8
More information about the Intel-gfx-trybot
mailing list