[Intel-gfx] [PATCH v6 11/14] drm/i915: Enable commands submission via GuC

Wed Apr 29 15:13:32 PDT 2015

From: Alex Dai <yu.dai at intel.com>

Add functions to submit work queue item and ring the door bell.
GuC TLB needs to be invalided if LRC context changes.

Issue: VIZ-4884
Signed-off-by: Alex Dai <yu.dai at intel.com>
---
 drivers/gpu/drm/i915/intel_guc.h        |   3 +
 drivers/gpu/drm/i915/intel_guc_client.c | 198 +++++++++++++++++++++++++++++++-
 drivers/gpu/drm/i915/intel_lrc.c        |  34 +++++-
 drivers/gpu/drm/i915/intel_lrc.h        |   2 +
 4 files changed, 230 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h
index 892f974..f8065cf 100644
--- a/drivers/gpu/drm/i915/intel_guc.h
+++ b/drivers/gpu/drm/i915/intel_guc.h
@@ -31,6 +31,7 @@
 #define GUC_WQ_SIZE	(PAGE_SIZE * 2)
 
 struct i915_guc_client {
+	spinlock_t wq_lock;
 	struct drm_i915_gem_object *client_obj;
 	u32 priority;
 	off_t doorbell_offset;
@@ -39,6 +40,8 @@ struct i915_guc_client {
 	uint16_t doorbell_id;
 	uint32_t ctx_index;
 	uint32_t wq_size;
+	uint32_t wq_tail;
+	uint32_t cookie;
 };
 
 #define I915_MAX_DOORBELLS	256
diff --git a/drivers/gpu/drm/i915/intel_guc_client.c b/drivers/gpu/drm/i915/intel_guc_client.c
index 7922427..31934a3 100644
--- a/drivers/gpu/drm/i915/intel_guc_client.c
+++ b/drivers/gpu/drm/i915/intel_guc_client.c
@@ -22,6 +22,7 @@
  *
  */
 #include <linux/firmware.h>
+#include <linux/circ_buf.h>
 #include "i915_drv.h"
 #include "intel_guc.h"
 
@@ -52,6 +53,14 @@
  * Doorbells are interrupts to uKernel. A doorbell is a single cache line (QW)
  * mapped into process space.
  *
+ * Work Items:
+ * There are several types of work items that the host may place into a
+ * workqueue, each with its own requirements and limitations. Currently only
+ * WQ_TYPE_INORDER is needed to support legacy submission via GuC, which
+ * represents in-order queue. The kernel driver packs ring tail pointer and an
+ * ELSP context descriptor dword into Work Item.
+ * See add_workqueue_item()
+ *
  */
 
 /*
@@ -395,6 +404,8 @@ i915_guc_client_alloc(struct drm_device *dev, u32 priority)
 		/* XXX: evict a doorbell instead */
 		goto err;
 
+	spin_lock_init(&client->wq_lock);
+
 	init_ctx_desc(guc, client);
 	init_proc_desc(guc, client);
 	init_doorbell(guc, client);
@@ -414,6 +425,183 @@ err:
 	return NULL;
 }
 
+/* Get valid workqueue item and return it back to offset */
+static int get_workqueue_space(struct i915_guc_client *gc, u32 *offset)
+{
+	struct guc_process_desc *desc;
+	void *base;
+	u32 size = sizeof(struct guc_wq_item);
+	int ret = 0, timeout_counter = 200;
+	unsigned long flags;
+
+	base = kmap_atomic(i915_gem_object_get_page(gc->client_obj, 0));
+	desc = base + gc->proc_desc_offset;
+
+	while (timeout_counter-- > 0) {
+		spin_lock_irqsave(&gc->wq_lock, flags);
+
+		ret = wait_for_atomic(CIRC_SPACE(gc->wq_tail, desc->head,
+				gc->wq_size) >= size, 1);
+
+		if (!ret) {
+			*offset = gc->wq_tail;
+
+			/* advance the tail for next workqueue item */
+			gc->wq_tail += size;
+			gc->wq_tail &= gc->wq_size - 1;
+
+			/* this will break the loop */
+			timeout_counter = 0;
+		}
+
+		spin_unlock_irqrestore(&gc->wq_lock, flags);
+	};
+
+	kunmap_atomic(base);
+
+	return ret;
+}
+
+static void guc_update_context(struct intel_context *ctx,
+				struct intel_engine_cs *ring)
+{
+	struct intel_ringbuffer *ringbuf = ctx->engine[ring->id].ringbuf;
+	struct drm_i915_gem_object *ctx_obj = ctx->engine[ring->id].state;
+	struct page *page;
+	uint32_t *reg_state;
+
+	page = i915_gem_object_get_page(ctx_obj, 1);
+	reg_state = kmap_atomic(page);
+
+	reg_state[CTX_RING_BUFFER_START + 1] =
+			i915_gem_obj_ggtt_offset(ringbuf->obj);
+
+	/* True PPGTT with dynamic page allocation: update PDP registers and
+	 * point the unallocated PDPs to the scratch page
+	 */
+	if (ctx->ppgtt) {
+		ASSIGN_CTX_PDP(ctx->ppgtt, reg_state, 3);
+		ASSIGN_CTX_PDP(ctx->ppgtt, reg_state, 2);
+		ASSIGN_CTX_PDP(ctx->ppgtt, reg_state, 1);
+		ASSIGN_CTX_PDP(ctx->ppgtt, reg_state, 0);
+	}
+
+	kunmap_atomic(reg_state);
+}
+
+static int add_workqueue_item(struct i915_guc_client *gc,
+			      struct intel_context *ctx,
+			      struct intel_engine_cs *ring)
+{
+	struct intel_ringbuffer *ringbuf = ctx->engine[ring->id].ringbuf;
+	struct drm_i915_gem_object *ctx_obj;
+	struct guc_wq_item *wqi;
+	void *base;
+	u32 wq_off = 0, tail = ringbuf->tail, wq_len;
+	int ret;
+
+	ctx_obj = ctx->engine[ring->id].state;
+
+	/* Need this because of the deferred pin ctx and ring */
+	/* Shall we move this right after ring is pinned? */
+	guc_update_context(ctx, ring);
+
+	ret = get_workqueue_space(gc, &wq_off);
+	if (ret)
+		return ret;
+
+	/* For now workqueue item is 4 DWs; workqueue buffer is 2 pages. So we
+	 * should not have the case where structure wqi is across page, neither
+	 * wrapped to the beginning. This simplifies the implementation below.
+	 *
+	 * XXX: if not the case, we need save data to a temp wqi and copy it to
+	 * workqueue buffer dw by dw.
+	 */
+	WARN_ON(sizeof(struct guc_wq_item) != 16);
+	WARN_ON(wq_off & 3);
+
+	/* wq starts from the page after doorbell / process_desc */
+	base = kmap_atomic(i915_gem_object_get_page(gc->client_obj,
+			(wq_off + GUC_DB_SIZE) >> PAGE_SHIFT));
+	wq_off &= PAGE_SIZE - 1;
+	wqi = (struct guc_wq_item *)((char *)base + wq_off);
+
+	/* len does not include the header */
+	wq_len = sizeof(struct guc_wq_item) / sizeof(u32) - 1;
+	wqi->header = WQ_TYPE_INORDER |
+			(wq_len << WQ_LEN_SHIFT) |
+			(ring->id << WQ_TARGET_SHIFT) |
+			WQ_NO_WCFLUSH_WAIT;
+
+	wqi->context_desc = (u32)execlists_ctx_descriptor(ring, ctx_obj);
+	/* tail index is in qw */
+	tail >>= 3;
+	wqi->ring_tail = tail << WQ_RING_TAIL_SHIFT;
+	wqi->fence_id = 0; /*XXX: what fence to be here */
+
+	kunmap_atomic(base);
+
+	return 0;
+}
+
+static int ring_doorbell(struct i915_guc_client *gc)
+{
+	struct guc_process_desc *desc;
+	union guc_doorbell_qw db_cmp, db_exc, db_ret;
+	union guc_doorbell_qw *db;
+	void *base;
+	int attempt = 2, ret = -EAGAIN;
+
+	base = kmap_atomic(i915_gem_object_get_page(gc->client_obj, 0));
+	desc = base + gc->proc_desc_offset;
+
+	/* Update the tail so it is visible to GuC */
+	desc->tail = gc->wq_tail;
+
+	/* current cookie */
+	db_cmp.db_status = GUC_DOORBELL_ENABLED;
+	db_cmp.cookie = gc->cookie;
+
+	/* cookie to be updated */
+	db_exc.db_status = GUC_DOORBELL_ENABLED;
+	db_exc.cookie = gc->cookie + 1;
+	if (db_exc.cookie == 0)
+		db_exc.cookie = 1;
+
+	/* pointer of current doorbell cacheline */
+	db = base + gc->doorbell_offset;
+
+	while (attempt--) {
+		/* lets ring the doorbell */
+		db_ret.value_qw = atomic64_cmpxchg((atomic64_t *)db,
+			db_cmp.value_qw, db_exc.value_qw);
+
+		/* if the exchange was successfully executed */
+		if (db_ret.value_qw == db_cmp.value_qw) {
+			/* db was successfully rung */
+			gc->cookie = db_exc.cookie;
+			ret = 0;
+			break;
+		}
+
+		/* XXX: doorbell was lost and need to acquire it again */
+		if (db_ret.db_status == GUC_DOORBELL_DISABLED)
+			break;
+
+		DRM_ERROR("Cookie mismatch. Expected %d, returned %d\n",
+			  db_cmp.cookie, db_ret.cookie);
+
+		/* update the cookie to newly read cookie from GuC */
+		db_cmp.cookie = db_ret.cookie;
+		db_exc.cookie = db_ret.cookie + 1;
+		if (db_exc.cookie == 0)
+			db_exc.cookie = 1;
+	}
+
+	kunmap_atomic(base);
+	return ret;
+}
+
 /**
  * i915_guc_client_submit() - Submit commands through GuC
  * @client:	the guc client where commands will go through
@@ -426,5 +614,13 @@ int i915_guc_client_submit(struct i915_guc_client *client,
 			   struct intel_context *ctx,
 			   struct intel_engine_cs *ring)
 {
-	return 0;
+	int ret;
+
+	ret = add_workqueue_item(client, ctx, ring);
+	if (ret)
+		return ret;
+
+	ret = ring_doorbell(client);
+
+	return ret;
 }
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 8685205..15295db 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -187,8 +187,8 @@ u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj)
 	return lrca >> 12;
 }
 
-static uint64_t execlists_ctx_descriptor(struct intel_engine_cs *ring,
-					 struct drm_i915_gem_object *ctx_obj)
+uint64_t execlists_ctx_descriptor(struct intel_engine_cs *ring,
+				  struct drm_i915_gem_object *ctx_obj)
 {
 	struct drm_device *dev = ring->dev;
 	uint64_t desc;
@@ -648,13 +648,17 @@ intel_logical_ring_advance_and_submit(struct intel_ringbuffer *ringbuf,
 				      struct drm_i915_gem_request *request)
 {
 	struct intel_engine_cs *ring = ringbuf->ring;
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
 
 	intel_logical_ring_advance(ringbuf);
 
 	if (intel_ring_stopped(ring))
 		return;
 
-	execlists_context_queue(ring, ctx, ringbuf->tail, request);
+	if (dev_priv->guc.execbuf_client)
+		i915_guc_client_submit(dev_priv->guc.execbuf_client, ctx, ring);
+	else
+		execlists_context_queue(ring, ctx, ringbuf->tail, request);
 }
 
 static int logical_ring_wrap_buffer(struct intel_ringbuffer *ringbuf,
@@ -918,18 +922,23 @@ static int intel_lr_context_pin(struct intel_engine_cs *ring,
 {
 	struct drm_i915_gem_object *ctx_obj = ctx->engine[ring->id].state;
 	struct intel_ringbuffer *ringbuf = ctx->engine[ring->id].ringbuf;
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
 	int ret = 0;
 
 	WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex));
 	if (ctx->engine[ring->id].pin_count++ == 0) {
-		ret = i915_gem_obj_ggtt_pin(ctx_obj,
-				GEN8_LR_CONTEXT_ALIGN, 0);
+		ret = i915_gem_obj_ggtt_pin(ctx_obj, GEN8_LR_CONTEXT_ALIGN,
+				PIN_OFFSET_BIAS | GUC_WOPCM_SIZE_VALUE);
 		if (ret)
 			goto reset_pin_count;
 
 		ret = intel_pin_and_map_ringbuffer_obj(ring->dev, ringbuf);
 		if (ret)
 			goto unpin_ctx_obj;
+
+		/* Invalidate GuC TLB. */
+		if (i915.enable_guc_scheduling)
+			I915_WRITE(GEN8_GTCR, GEN8_GTCR_INVALIDATE);
 	}
 
 	return ret;
@@ -1283,6 +1292,13 @@ static int intel_lr_context_render_state_init(struct intel_engine_cs *ring,
 	ret = __i915_add_request(ring, file, so.obj);
 	/* intel_logical_ring_add_request moves object to inactive if it
 	 * fails */
+
+	/* GuC firmware will try to collapse its DPC work queue if the new one
+	 * is for same context. So the following breadcrumb could be amended to
+	 * this batch and submitted as one batch. Wait here to make sure the
+	 * context state init is finished before any other submission to GuC. */
+	if (!ret && i915.enable_guc_scheduling)
+		ret = i915_wait_request(so.obj->last_read_req);
 out:
 	i915_gem_render_state_fini(&so);
 	return ret;
@@ -1291,8 +1307,13 @@ out:
 static int gen8_init_rcs_context(struct intel_engine_cs *ring,
 		       struct intel_context *ctx)
 {
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
 	int ret;
 
+	/* Invalidate GuC TLB. */
+	if (i915.enable_guc_scheduling)
+		I915_WRITE(GEN8_GTCR, GEN8_GTCR_INVALIDATE);
+
 	ret = intel_logical_ring_workarounds_emit(ring, ctx);
 	if (ret)
 		return ret;
@@ -1819,7 +1840,8 @@ int intel_lr_context_deferred_create(struct intel_context *ctx,
 	}
 
 	if (is_global_default_ctx) {
-		ret = i915_gem_obj_ggtt_pin(ctx_obj, GEN8_LR_CONTEXT_ALIGN, 0);
+		ret = i915_gem_obj_ggtt_pin(ctx_obj, GEN8_LR_CONTEXT_ALIGN,
+				PIN_OFFSET_BIAS | GUC_WOPCM_SIZE_VALUE);
 		if (ret) {
 			DRM_DEBUG_DRIVER("Pin LRC backing obj failed: %d\n",
 					ret);
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index 04d3a6d..19c9a02 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -85,6 +85,8 @@ int intel_execlists_submission(struct drm_device *dev, struct drm_file *file,
 			       struct drm_i915_gem_object *batch_obj,
 			       u64 exec_start, u32 dispatch_flags);
 u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj);
+uint64_t execlists_ctx_descriptor(struct intel_engine_cs *ring,
+				  struct drm_i915_gem_object *ctx_obj);
 
 void intel_lrc_irq_handler(struct intel_engine_cs *ring);
 void intel_execlists_retire_requests(struct intel_engine_cs *ring);
-- 
1.9.1