[Intel-gfx] [PATCH v3 1/6] drm/i915/gen8: Add infrastructure to initialize WA batch buffers

Fri Jun 5 06:54:51 PDT 2015

Some of the WA are to be applied during context save but before restore and
some at the end of context save/restore but before executing the instructions
in the ring. WA batch buffers are created for this purpose, as these WA cannot
be applied using normal means. Each context has two registers to load the
offsets of these batch buffers. If they are non-zero, HW understands that it
needs to execute these batches.

v1: In this version two separate ring_buffer objects were used to load WA
instructions for indirect and per context batch buffers and they were part
of every context.

v2: Chris suggested to include an additional page in the context and use it to
load these WA instead of creating separate objects. This will simplify a lot of
things as we need not explicitly pin/unpin them. Thomas Daniel further pointed
out that GuC is planning to use a similar setup to share data between GuC and
the driver, and that WA batch buffers can probably share that page. However,
after discussions with Dave, who is implementing the GuC changes, he suggested
using an independent page for two reasons: the GuC area might grow, and these
WA are initialized only once and are not changed afterwards, so we can share
them across all contexts.

The page is updated with WA during render ring init. This has the advantage of
not adding more special cases to default_context.

(Thanks to Chris, Dave and Thomas for their inputs)

Signed-off-by: Rafael Barbalho <rafael.barbalho at intel.com>
Signed-off-by: Arun Siluvery <arun.siluvery at linux.intel.com>
---
 drivers/gpu/drm/i915/intel_lrc.c        | 177 +++++++++++++++++++++++++++++++-
 drivers/gpu/drm/i915/intel_ringbuffer.h |   9 ++
 2 files changed, 181 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 0413b8f..4e68b54 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -211,6 +211,7 @@ enum {
 	FAULT_AND_CONTINUE /* Unsupported */
 };
 #define GEN8_CTX_ID_SHIFT 32
+#define CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT  0x17
 
 static int intel_lr_context_pin(struct intel_engine_cs *ring,
 		struct intel_context *ctx);
@@ -1077,6 +1078,96 @@ static int intel_logical_ring_workarounds_emit(struct intel_engine_cs *ring,
 	return 0;
 }
 
+/* Fill the indirect ctx WA batch with NOOPs. Returns 0 or -EINVAL. */
+static int gen8_init_indirectctx_bb(struct intel_engine_cs *ring)
+{
+	int index, end;
+	struct page *page;
+	uint32_t *cmd;
+
+	/* The batch occupies dwords [index, end) of the first WA page */
+	index = ring->wa_ctx.indctx_batch_offset / sizeof(uint32_t);
+	end = index + (ring->wa_ctx.indctx_batch_size *
+		       CACHELINE_BYTES) / sizeof(uint32_t);
+
+	/* Validate before mapping so the error path has nothing to unmap */
+	if ((end * sizeof(uint32_t)) > PAGE_SIZE) {
+		DRM_ERROR("context WA instruction exceeding alloted size\n");
+		return -EINVAL;
+	}
+
+	page = i915_gem_object_get_page(ring->wa_ctx.obj, 0);
+	cmd = kmap_atomic(page);
+
+	/* FIXME: fill unused locations with NOOPs.
+	 * Replace these instructions with WA
+	 */
+	while (index < end)
+		cmd[index++] = MI_NOOP;
+
+	/*
+	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
+	 * execution depends on the length specified in terms of cache lines
+	 * in the register CTX_RCS_INDIRECT_CTX
+	 */
+	kunmap_atomic(cmd);
+
+	return 0;
+}
+
+/* Fill the per ctx WA batch with NOOPs and end it with MI_BATCH_BUFFER_END.
+ * Returns 0 on success or -EINVAL if the batch would not fit in the page.
+ */
+static int gen8_init_perctx_bb(struct intel_engine_cs *ring)
+{
+	int index, end;
+	struct page *page;
+	uint32_t *cmd;
+
+	index = ring->wa_ctx.perctx_batch_offset / sizeof(uint32_t);
+	end = index + (ring->wa_ctx.perctx_batch_size *
+		       CACHELINE_BYTES) / sizeof(uint32_t);
+
+	/* Validate before mapping so the error path has nothing to unmap */
+	if ((end * sizeof(uint32_t)) > PAGE_SIZE) {
+		DRM_ERROR("context WA instruction exceeding alloted size\n");
+		return -EINVAL;
+	}
+
+	page = i915_gem_object_get_page(ring->wa_ctx.obj, 0);
+	cmd = kmap_atomic(page);
+
+	/* FIXME: placeholder NOOPs, replace these instructions with WA */
+	while (index < end)
+		cmd[index++] = MI_NOOP;
+
+	cmd[index - 1] = MI_BATCH_BUFFER_END; /* replaces the last dword */
+	kunmap_atomic(cmd);
+
+	return 0;
+}
+
+/* Populate both gen8 WA batch buffers. Other gens get nothing written,
+ * and gen9+ additionally triggers a warning. Returns 0 or a gen8 init error.
+ */
+static int intel_init_workaround_bb(struct intel_engine_cs *ring)
+{
+	const int gen = INTEL_INFO(ring->dev)->gen;
+	int err;
+
+	if (gen != 8) {
+		WARN(gen >= 9,
+		     "WA batch buffer is not initialized for Gen%d\n", gen);
+		return 0;
+	}
+
+	err = gen8_init_indirectctx_bb(ring);
+	if (err)
+		return err;
+
+	return gen8_init_perctx_bb(ring);
+}
+
 static int gen8_init_common_ring(struct intel_engine_cs *ring)
 {
 	struct drm_device *dev = ring->dev;
@@ -1382,6 +1473,46 @@ static int gen8_init_rcs_context(struct intel_engine_cs *ring,
 	return intel_lr_context_render_state_init(ring, ctx);
 }
 
+/* Allocate and GGTT-pin the WA batch object. Returns 0 or negative errno. */
+static int lrc_setup_wa_ctx_obj(struct intel_engine_cs *ring, u32 size)
+{
+	struct i915_ctx_workarounds *wa = &ring->wa_ctx;
+	int ret;
+
+	WARN_ON(ring->id != RCS);
+	size = roundup(size, PAGE_SIZE);
+	wa->obj = i915_gem_alloc_object(ring->dev, size);
+	if (!wa->obj) {
+		DRM_DEBUG_DRIVER("alloc LRC WA ctx backing obj failed.\n");
+		return -ENOMEM;
+	}
+
+	ret = i915_gem_obj_ggtt_pin(wa->obj, GEN8_LR_CONTEXT_ALIGN, 0);
+	if (ret) {
+		DRM_DEBUG_DRIVER("pin LRC WA ctx backing obj failed: %d\n",
+				 ret);
+		drm_gem_object_unreference(&wa->obj->base);
+		wa->obj = NULL; /* don't leave a dangling pointer behind */
+		return ret;
+	}
+
+	wa->indctx_batch_offset = 0;
+	wa->indctx_batch_size = 4; /* in cache lines */
+	wa->perctx_batch_offset = wa->indctx_batch_size * CACHELINE_BYTES;
+	wa->perctx_batch_size = 2; /* in cache lines */
+
+	return 0;
+}
+
+static void lrc_destroy_wa_ctx_obj(struct intel_engine_cs *ring)
+{
+	WARN_ON(ring->id != RCS);
+	/* Unpin and drop the WA object, then clear obj so it reads as absent */
+	i915_gem_object_ggtt_unpin(ring->wa_ctx.obj);
+	drm_gem_object_unreference(&ring->wa_ctx.obj->base);
+	ring->wa_ctx.obj = NULL;
+}
+
 /**
  * intel_logical_ring_cleanup() - deallocate the Engine Command Streamer
  *
@@ -1470,11 +1601,37 @@ static int logical_render_ring_init(struct drm_device *dev)
 	ring->emit_bb_start = gen8_emit_bb_start;
 
 	ring->dev = dev;
+
+	if (INTEL_INFO(ring->dev)->gen >= 8) {
+		ret = lrc_setup_wa_ctx_obj(ring, PAGE_SIZE);
+		if (ret) {
+			DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n",
+					 ret);
+			return ret;
+		}
+
+		ret = intel_init_workaround_bb(ring);
+		if (ret) {
+			lrc_destroy_wa_ctx_obj(ring);
+			DRM_ERROR("WA batch buffers are not initialized: %d\n",
+				  ret);
+		}
+	}
+
 	ret = logical_ring_init(dev, ring);
 	if (ret)
-		return ret;
+		goto clear_wa_ctx;
+
+	ret = intel_init_pipe_control(ring);
+	if (ret)
+		goto clear_wa_ctx;
+
+	return 0;
 
-	return intel_init_pipe_control(ring);
+clear_wa_ctx:
+	if (ring->wa_ctx.obj)
+		lrc_destroy_wa_ctx_obj(ring);
+	return ret;
 }
 
 static int logical_bsd_ring_init(struct drm_device *dev)
@@ -1754,15 +1911,25 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o
 	reg_state[CTX_SECOND_BB_STATE] = ring->mmio_base + 0x118;
 	reg_state[CTX_SECOND_BB_STATE+1] = 0;
 	if (ring->id == RCS) {
-		/* TODO: according to BSpec, the register state context
-		 * for CHV does not have these. OTOH, these registers do
-		 * exist in CHV. I'm waiting for a clarification */
 		reg_state[CTX_BB_PER_CTX_PTR] = ring->mmio_base + 0x1c0;
 		reg_state[CTX_BB_PER_CTX_PTR+1] = 0;
 		reg_state[CTX_RCS_INDIRECT_CTX] = ring->mmio_base + 0x1c4;
 		reg_state[CTX_RCS_INDIRECT_CTX+1] = 0;
 		reg_state[CTX_RCS_INDIRECT_CTX_OFFSET] = ring->mmio_base + 0x1c8;
 		reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] = 0;
+		if (ring->wa_ctx.obj) {
+			reg_state[CTX_RCS_INDIRECT_CTX+1] =
+				(i915_gem_obj_ggtt_offset(ring->wa_ctx.obj) +
+				 ring->wa_ctx.indctx_batch_offset) |
+				ring->wa_ctx.indctx_batch_size;
+
+			reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] =
+				CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT << 6;
+
+			reg_state[CTX_BB_PER_CTX_PTR+1] =
+				(i915_gem_obj_ggtt_offset(ring->wa_ctx.obj) +
+				 ring->wa_ctx.perctx_batch_offset) | 0x01;
+		}
 	}
 	reg_state[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9);
 	reg_state[CTX_LRI_HEADER_1] |= MI_LRI_FORCE_POSTED;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 39f6dfc..2a0f7ce 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -119,6 +119,14 @@ struct intel_ringbuffer {
 
 struct	intel_context;
 
+struct  i915_ctx_workarounds {
+	u32 indctx_batch_offset;	/* byte offset of the batch in obj */
+	u32 indctx_batch_size;		/* in cache lines */
+	u32 perctx_batch_offset;	/* byte offset of the batch in obj */
+	u32 perctx_batch_size;		/* in cache lines */
+	struct drm_i915_gem_object *obj; /* holds both WA batches */
+};
+
 struct  intel_engine_cs {
 	const char	*name;
 	enum intel_ring_id {
@@ -142,6 +150,7 @@ struct  intel_engine_cs {
 	struct i915_gem_batch_pool batch_pool;
 
 	struct intel_hw_status_page status_page;
+	struct i915_ctx_workarounds wa_ctx;
 
 	unsigned irq_refcount; /* protected by dev_priv->irq_lock */
 	u32		irq_enable_mask;	/* bitmask to enable ring interrupt */
-- 
2.3.0