[Intel-gfx] [RFC] drm/i915/guc: New GuC stage descriptors

Fri Oct 12 18:25:30 UTC 2018

With the new interface, GuC now requires every lrc to be registered in
one of the stage descriptors, which have been re-designed so that each
descriptor can store up to 64 lrc per class (i.e. equal to the possible
SW counter values).
Similarly to what happened with the previous legacy design, it is possible
to have a single "proxy" descriptor that owns the workqueue and the
doorbell and use it for all submission. To distinguish the proxy
descriptors from the one used for lrc storage, the latter have been
called "principal". A descriptor can't be both a proxy and a principal
at the same time; to enforce this, since we only use 1 proxy descriptor
per client, we reserve enough descriptor from the bottom of the pool to
be used as proxy and leave the others as principals. For simplicity, we
currently map context IDs 1:1 to principal descriptors, but we could
have more contexts in flight if needed by using the SW counter.
Note that the lrcs need to be mapped in the principal descriptor until
guc is done with them. This means that we can't release the HW id when
the user app closes the ctx because it might still be in flight with GuC
and that we need to be careful when unpinning because the fact that the
a request on the next context has completed doesn't mean that GuC is
done processing the first one. See in-code comments for details.

NOTE: GuC is not going to look at lrcs that are not in flight, so we
could potentially skip the unpinning steps. However, the unpinnig steps
perform extra correctness check so better keep them until we're sure
that the flow is solid.

Based on an initial patch by Oscar Mateo.

RFC: this will be sent as part of the updated series once we have
the gen9 FW with the new interface, but since the flow is
significantly different compared to the previous version I'd like
to gather some early feedback to make sure there aren't any major
issues.

Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
Signed-off-by: Michal Wajdeczko <michal.wajdeczko at intel.com>
Cc: Chris Wilson <chris at chris-wilson.co.uk>
Cc: Michel Thierry <michel.thierry at intel.com>
Cc: Michal Winiarski <michal.winiarski at intel.com>
Cc: Tomasz Lis <tomasz.lis at intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c         |  30 +-
 drivers/gpu/drm/i915/i915_drv.h             |   5 +-
 drivers/gpu/drm/i915/i915_gem_context.c     |   9 +-
 drivers/gpu/drm/i915/intel_guc.h            |  14 +-
 drivers/gpu/drm/i915/intel_guc_fwif.h       |  73 +++--
 drivers/gpu/drm/i915/intel_guc_submission.c | 346 +++++++++++++++-----
 drivers/gpu/drm/i915/intel_lrc.c            |  18 +-
 drivers/gpu/drm/i915/intel_lrc.h            |   7 +
 drivers/gpu/drm/i915/selftests/intel_guc.c  |   4 +-
 9 files changed, 360 insertions(+), 146 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 00c551d3e409..04bbde4a38a6 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -2474,11 +2474,10 @@ static int i915_guc_stage_pool(struct seq_file *m, void *data)
 
 		seq_printf(m, "GuC stage descriptor %u:\n", index);
 		seq_printf(m, "\tIndex: %u\n", desc->stage_id);
+		seq_printf(m, "\tProxy Index: %u\n", desc->proxy_id);
 		seq_printf(m, "\tAttribute: 0x%x\n", desc->attribute);
 		seq_printf(m, "\tPriority: %d\n", desc->priority);
 		seq_printf(m, "\tDoorbell id: %d\n", desc->db_id);
-		seq_printf(m, "\tEngines used: 0x%x\n",
-			   desc->engines_used);
 		seq_printf(m, "\tDoorbell trigger phy: 0x%llx, cpu: 0x%llx, uK: 0x%x\n",
 			   desc->db_trigger_phy,
 			   desc->db_trigger_cpu,
@@ -2490,18 +2489,21 @@ static int i915_guc_stage_pool(struct seq_file *m, void *data)
 		seq_putc(m, '\n');
 
 		for_each_engine_masked(engine, dev_priv, client->engines, tmp) {
-			u32 guc_engine_id = engine->guc_id;
-			struct guc_execlist_context *lrc =
-						&desc->lrc[guc_engine_id];
-
-			seq_printf(m, "\t%s LRC:\n", engine->name);
-			seq_printf(m, "\t\tContext desc: 0x%x\n",
-				   lrc->context_desc);
-			seq_printf(m, "\t\tContext id: 0x%x\n", lrc->context_id);
-			seq_printf(m, "\t\tLRCA: 0x%x\n", lrc->ring_lrca);
-			seq_printf(m, "\t\tRing begin: 0x%x\n", lrc->ring_begin);
-			seq_printf(m, "\t\tRing end: 0x%x\n", lrc->ring_end);
-			seq_putc(m, '\n');
+			u8 class = engine->class;
+			u8 inst = engine->instance;
+
+			if (desc->lrc_alloc_map[class].bitmap & BIT(inst)) {
+				struct guc_execlist_context *lrc =
+							&desc->lrc[class][inst];
+				seq_printf(m, "\t%s LRC:\n", engine->name);
+				seq_printf(m, "\t\tHW context desc: 0x%x:0x%x\n",
+						lower_32_bits(lrc->hw_context_desc),
+						upper_32_bits(lrc->hw_context_desc));
+				seq_printf(m, "\t\tLRC: 0x%x\n", lrc->ring_lrc);
+				seq_printf(m, "\t\tRing begin: 0x%x\n", lrc->ring_begin);
+				seq_printf(m, "\t\tRing end: 0x%x\n", lrc->ring_end);
+				seq_putc(m, '\n');
+			}
 		}
 	}
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index bd76931987ef..ce095d57e050 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1878,13 +1878,14 @@ struct drm_i915_private {
 		 * (the SW Context ID field) but GuC limits it further so
 		 * without taking advantage of part of the SW counter field the
 		 * firmware only supports a max number of contexts equal to the
-		 * number of entries in the GuC stage descriptor pool.
+		 * number of entries in the GuC stage descriptor pool, minus
+		 * the descriptors reserved for proxy usage
 		 */
 		struct ida hw_ida;
 #define MAX_CONTEXT_HW_ID (1<<21) /* exclusive */
 #define MAX_GUC_CONTEXT_HW_ID (1 << 20) /* exclusive */
 #define GEN11_MAX_CONTEXT_HW_ID (1<<11) /* exclusive */
-#define GEN11_MAX_CONTEXT_HW_ID_WITH_GUC GUC_MAX_STAGE_DESCRIPTORS
+#define GEN11_MAX_CONTEXT_HW_ID_WITH_GUC GUC_MAX_PPAL_STAGE_DESCRIPTORS
 		struct list_head hw_id_list;
 	} contexts;
 
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 552d2e108de4..c239d9b9307c 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -284,10 +284,15 @@ static void context_close(struct i915_gem_context *ctx)
 	i915_gem_context_set_closed(ctx);
 
 	/*
-	 * This context will never again be assinged to HW, so we can
+	 * This context will never again be assigned to HW, so we can
 	 * reuse its ID for the next context.
+	 *
+	 * if GuC is in use, we need to keep the ID until GuC has finished
+	 * processing all submitted requests because the ID is used by the
+	 * firmware to index the guc stage_desc pool.
 	 */
-	release_hw_id(ctx);
+	if (!USES_GUC_SUBMISSION(ctx->i915))
+		release_hw_id(ctx);
 
 	/*
 	 * The LUT uses the VMA as a backpointer to unref the object,
diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h
index 11b3882482f4..05ee44fb66af 100644
--- a/drivers/gpu/drm/i915/intel_guc.h
+++ b/drivers/gpu/drm/i915/intel_guc.h
@@ -58,10 +58,14 @@ struct intel_guc {
 	bool interrupts_enabled;
 	unsigned int msg_enabled_mask;
 
+	struct ida client_ids;
+#define GUC_MAX_CLIENT_IDS 2
+
 	struct i915_vma *ads_vma;
 	struct i915_vma *stage_desc_pool;
 	void *stage_desc_pool_vaddr;
-	struct ida stage_ids;
+#define	GUC_MAX_PPAL_STAGE_DESCRIPTORS (GUC_MAX_STAGE_DESCRIPTORS - GUC_MAX_CLIENT_IDS)
+
 	struct i915_vma *shared_data;
 	void *shared_data_vaddr;
 
@@ -94,6 +98,14 @@ struct intel_guc {
 
 	/* GuC's FW specific notify function */
 	void (*notify)(struct intel_guc *guc);
+
+	/*
+	 * Override the first stage_desc to be used as proxy
+	 * (Default: GUC_MAX_PPAL_STAGE_DESCRIPTORS). The max number of ppal
+	 * descriptors is not updated accordingly since the test using this does
+	 * not allocate any context.
+	 */
+	I915_SELFTEST_DECLARE(u32 starting_proxy_id);
 };
 
 static inline bool intel_guc_is_alive(struct intel_guc *guc)
diff --git a/drivers/gpu/drm/i915/intel_guc_fwif.h b/drivers/gpu/drm/i915/intel_guc_fwif.h
index ce3ab6ed21d5..1a0f41a26173 100644
--- a/drivers/gpu/drm/i915/intel_guc_fwif.h
+++ b/drivers/gpu/drm/i915/intel_guc_fwif.h
@@ -32,6 +32,8 @@
 #define GUC_MAX_STAGE_DESCRIPTORS	1024
 #define	GUC_INVALID_STAGE_ID		GUC_MAX_STAGE_DESCRIPTORS
 
+#define GUC_MAX_LRC_PER_CLASS		64
+
 #define GUC_RENDER_ENGINE		0
 #define GUC_VIDEO_ENGINE		1
 #define GUC_BLITTER_ENGINE		2
@@ -66,9 +68,12 @@
 #define GUC_DOORBELL_DISABLED		0
 
 #define GUC_STAGE_DESC_ATTR_ACTIVE	BIT(0)
-#define GUC_STAGE_DESC_ATTR_PENDING_DB	BIT(1)
-#define GUC_STAGE_DESC_ATTR_KERNEL	BIT(2)
-#define GUC_STAGE_DESC_ATTR_PREEMPT	BIT(3)
+#define GUC_STAGE_DESC_ATTR_TYPE_SHIFT	1
+#define GUC_STAGE_DESC_ATTR_PRINCIPAL	(0x0 << GUC_STAGE_DESC_ATTR_TYPE_SHIFT)
+#define GUC_STAGE_DESC_ATTR_PROXY	(0x1 << GUC_STAGE_DESC_ATTR_TYPE_SHIFT)
+#define GUC_STAGE_DESC_ATTR_REAL	(0x2 << GUC_STAGE_DESC_ATTR_TYPE_SHIFT)
+#define GUC_STAGE_DESC_ATTR_TYPE_MASK	(0x3 << GUC_STAGE_DESC_ATTR_TYPE_SHIFT)
+#define GUC_STAGE_DESC_ATTR_KERNEL	(1 << 3)
 #define GUC_STAGE_DESC_ATTR_RESET	BIT(4)
 #define GUC_STAGE_DESC_ATTR_WQLOCKED	BIT(5)
 #define GUC_STAGE_DESC_ATTR_PCH		BIT(6)
@@ -277,9 +282,10 @@ struct guc_process_desc {
 	u64 wq_base_addr;
 	u32 wq_size_bytes;
 	u32 wq_status;
-	u32 engine_presence;
 	u32 priority;
-	u32 reserved[30];
+	u32 token;
+	u32 queue_engine_error;
+	u32 reserved[23];
 } __packed;
 
 /* engine id and context id is packed into guc_execlist_context.context_id*/
@@ -288,18 +294,20 @@ struct guc_process_desc {
 
 /* The execlist context including software and HW information */
 struct guc_execlist_context {
-	u32 context_desc;
-	u32 context_id;
-	u32 ring_status;
-	u32 ring_lrca;
+	u64 hw_context_desc;
+	u32 reserved0;
+	u32 ring_lrc;
 	u32 ring_begin;
 	u32 ring_end;
 	u32 ring_next_free_location;
 	u32 ring_current_tail_pointer_value;
-	u8 engine_state_submit_value;
-	u8 engine_state_wait_value;
-	u16 pagefault_count;
-	u16 engine_submit_queue_count;
+	u32 engine_state_wait_value;
+	u32 state_reserved;
+	u32 is_present_in_sq;
+	u32 sync_value;
+	u32 sync_addr;
+	u32 slpc_hints;
+	u32 reserved1[4];
 } __packed;
 
 /*
@@ -312,36 +320,33 @@ struct guc_execlist_context {
  * with the GuC, being allocated before the GuC is loaded with its firmware.
  */
 struct guc_stage_desc {
-	u32 sched_common_area;
+	u64 desc_private;
 	u32 stage_id;
-	u32 pas_id;
-	u8 engines_used;
+	u32 proxy_id;
 	u64 db_trigger_cpu;
 	u32 db_trigger_uk;
 	u64 db_trigger_phy;
-	u16 db_id;
-
-	struct guc_execlist_context lrc[GUC_MAX_ENGINES_NUM];
-
-	u8 attribute;
-
-	u32 priority;
-
+	u32 db_id;
+	struct guc_execlist_context lrc[GUC_MAX_ENGINE_CLASSES][GUC_MAX_LRC_PER_CLASS];
+	struct {
+		u64 bitmap;
+		u32 reserved0;
+	} __packed lrc_alloc_map[GUC_MAX_ENGINE_CLASSES];
+	u32 lrc_count;
+	u32 max_lrc_per_class;
+	u32 attribute; /* GUC_STAGE_DESC_ATTR_xxx */
+	u32 priority; /* GUC_CLIENT_PRIORITY_xxx */
 	u32 wq_sampled_tail_offset;
 	u32 wq_total_submit_enqueues;
-
 	u32 process_desc;
 	u32 wq_addr;
 	u32 wq_size;
-
-	u32 engine_presence;
-
-	u8 engine_suspended;
-
-	u8 reserved0[3];
-	u64 reserved1[1];
-
-	u64 desc_private;
+	u32 feature0;
+	u32 feature1;
+	u32 feature2;
+	u32 queue_engine_error;
+	u32 reserved[2];
+	u64 reserved3[12];
 } __packed;
 
 /**
diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
index eae668442ebe..9bf8ebbc4de1 100644
--- a/drivers/gpu/drm/i915/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/intel_guc_submission.c
@@ -46,17 +46,22 @@
  * that contains all required pages for these elements).
  *
  * GuC stage descriptor:
- * During initialization, the driver allocates a static pool of 1024 such
- * descriptors, and shares them with the GuC.
- * Currently, there exists a 1:1 mapping between a intel_guc_client and a
- * guc_stage_desc (via the client's stage_id), so effectively only one
- * gets used. This stage descriptor lets the GuC know about the doorbell,
- * workqueue and process descriptor. Theoretically, it also lets the GuC
- * know about our HW contexts (context ID, etc...), but we actually
- * employ a kind of submission where the GuC uses the LRCA sent via the work
- * item instead (the single guc_stage_desc associated to execbuf client
- * contains information about the default kernel context only, but this is
- * essentially unused). This is called a "proxy" submission.
+ * During initialization, the driver allocates a static pool of descriptors
+ * and shares them with the GuC. This stage descriptor lets the GuC know about
+ * the doorbell, workqueue and process descriptor, additionally it stores
+ * information about all possible HW contexts that use it (64 x number of
+ * engine classes of guc_execlist_context structs).
+ *
+ * The idea is that every direct-submission GuC client gets one SW Context ID
+ * and every HW context created by that client gets one SW Counter. The "SW
+ * Context ID" and "SW Counter" to use now get passed on every work queue item.
+ *
+ * But we don't have direct submission yet: does that mean we are limited to 64
+ * contexts in total (one client)? Not really: we can use extra GuC context
+ * descriptors to store more HW contexts. They are special in that they don't
+ * have their own work queue, doorbell or process descriptor. Instead, these
+ * "principal" GuC context descriptors use the one that belongs to the client
+ * as a "proxy" for submission (a generalization of the old proxy submission).
  *
  * The Scratch registers:
  * There are 16 MMIO-based registers start from 0xC180. The kernel driver writes
@@ -164,11 +169,28 @@ static int __guc_deallocate_doorbell(struct intel_guc *guc, u32 stage_id)
 	return intel_guc_send(guc, action, ARRAY_SIZE(action));
 }
 
-static struct guc_stage_desc *__get_stage_desc(struct intel_guc_client *client)
+static struct guc_stage_desc *__get_stage_desc(struct intel_guc *guc, u32 index)
+{
+	struct guc_stage_desc *base = guc->stage_desc_pool_vaddr;
+
+	GEM_BUG_ON(!USES_GUC_SUBMISSION(guc_to_i915(guc)));
+	GEM_BUG_ON(index >= GUC_MAX_STAGE_DESCRIPTORS);
+
+	return &base[index];
+}
+
+static struct guc_stage_desc *__get_proxy_stage_desc(struct intel_guc_client *client)
 {
-	struct guc_stage_desc *base = client->guc->stage_desc_pool_vaddr;
+	GEM_BUG_ON(!I915_SELFTEST_ONLY(client->guc->starting_proxy_id) &&
+			client->stage_id < GUC_MAX_PPAL_STAGE_DESCRIPTORS);
+	return __get_stage_desc(client->guc, client->stage_id);
+}
 
-	return &base[client->stage_id];
+static struct guc_stage_desc *__get_ppal_stage_desc(struct intel_guc *guc,
+						    u32 index)
+{
+	GEM_BUG_ON(index >= GUC_MAX_PPAL_STAGE_DESCRIPTORS);
+	return __get_stage_desc(guc, index);
 }
 
 /*
@@ -183,7 +205,7 @@ static void __update_doorbell_desc(struct intel_guc_client *client, u16 new_id)
 	struct guc_stage_desc *desc;
 
 	/* Update the GuC's idea of the doorbell ID */
-	desc = __get_stage_desc(client);
+	desc = __get_proxy_stage_desc(client);
 	desc->db_id = new_id;
 }
 
@@ -329,14 +351,12 @@ static int guc_stage_desc_pool_create(struct intel_guc *guc)
 
 	guc->stage_desc_pool = vma;
 	guc->stage_desc_pool_vaddr = vaddr;
-	ida_init(&guc->stage_ids);
 
 	return 0;
 }
 
 static void guc_stage_desc_pool_destroy(struct intel_guc *guc)
 {
-	ida_destroy(&guc->stage_ids);
 	i915_vma_unpin_and_release(&guc->stage_desc_pool, I915_VMA_RELEASE_MAP);
 }
 
@@ -347,78 +367,26 @@ static void guc_stage_desc_pool_destroy(struct intel_guc *guc)
  * data structures relating to this client (doorbell, process descriptor,
  * write queue, etc).
  */
-static void guc_stage_desc_init(struct intel_guc_client *client)
+static void guc_proxy_stage_desc_init(struct intel_guc_client *client)
 {
-	struct intel_guc *guc = client->guc;
-	struct drm_i915_private *dev_priv = guc_to_i915(guc);
-	struct intel_engine_cs *engine;
-	struct i915_gem_context *ctx = client->owner;
 	struct guc_stage_desc *desc;
-	unsigned int tmp;
 	u32 gfx_addr;
 
-	desc = __get_stage_desc(client);
+	desc = __get_proxy_stage_desc(client);
 	memset(desc, 0, sizeof(*desc));
 
 	desc->attribute = GUC_STAGE_DESC_ATTR_ACTIVE |
+			  GUC_STAGE_DESC_ATTR_PROXY |
 			  GUC_STAGE_DESC_ATTR_KERNEL;
-	if (is_high_priority(client))
-		desc->attribute |= GUC_STAGE_DESC_ATTR_PREEMPT;
 	desc->stage_id = client->stage_id;
 	desc->priority = client->priority;
 	desc->db_id = client->doorbell_id;
 
-	for_each_engine_masked(engine, dev_priv, client->engines, tmp) {
-		struct intel_context *ce = to_intel_context(ctx, engine);
-		u32 guc_engine_id = engine->guc_id;
-		struct guc_execlist_context *lrc = &desc->lrc[guc_engine_id];
-
-		/* TODO: We have a design issue to be solved here. Only when we
-		 * receive the first batch, we know which engine is used by the
-		 * user. But here GuC expects the lrc and ring to be pinned. It
-		 * is not an issue for default context, which is the only one
-		 * for now who owns a GuC client. But for future owner of GuC
-		 * client, need to make sure lrc is pinned prior to enter here.
-		 */
-		if (!ce->state)
-			break;	/* XXX: continue? */
-
-		/*
-		 * XXX: When this is a GUC_STAGE_DESC_ATTR_KERNEL client (proxy
-		 * submission or, in other words, not using a direct submission
-		 * model) the KMD's LRCA is not used for any work submission.
-		 * Instead, the GuC uses the LRCA of the user mode context (see
-		 * guc_add_request below).
-		 */
-		lrc->context_desc = lower_32_bits(ce->lrc_desc);
-
-		/* The state page is after PPHWSP */
-		lrc->ring_lrca = intel_guc_ggtt_offset(guc, ce->state) +
-				 LRC_STATE_PN * PAGE_SIZE;
-
-		/* XXX: In direct submission, the GuC wants the HW context id
-		 * here. In proxy submission, it wants the stage id
-		 */
-		lrc->context_id = (client->stage_id << GUC_ELC_CTXID_OFFSET) |
-				(guc_engine_id << GUC_ELC_ENGINE_OFFSET);
-
-		lrc->ring_begin = intel_guc_ggtt_offset(guc, ce->ring->vma);
-		lrc->ring_end = lrc->ring_begin + ce->ring->size - 1;
-		lrc->ring_next_free_location = lrc->ring_begin;
-		lrc->ring_current_tail_pointer_value = 0;
-
-		desc->engines_used |= (1 << guc_engine_id);
-	}
-
-	DRM_DEBUG_DRIVER("Host engines 0x%x => GuC engines used 0x%x\n",
-			 client->engines, desc->engines_used);
-	WARN_ON(desc->engines_used == 0);
-
 	/*
 	 * The doorbell, process descriptor, and workqueue are all parts
 	 * of the client object, which the GuC will reference via the GGTT
 	 */
-	gfx_addr = intel_guc_ggtt_offset(guc, client->vma);
+	gfx_addr = intel_guc_ggtt_offset(client->guc, client->vma);
 	desc->db_trigger_phy = sg_dma_address(client->vma->pages->sgl) +
 				client->doorbell_offset;
 	desc->db_trigger_cpu = ptr_to_u64(__get_doorbell(client));
@@ -430,11 +398,11 @@ static void guc_stage_desc_init(struct intel_guc_client *client)
 	desc->desc_private = ptr_to_u64(client);
 }
 
-static void guc_stage_desc_fini(struct intel_guc_client *client)
+static void guc_proxy_stage_desc_fini(struct intel_guc_client *client)
 {
 	struct guc_stage_desc *desc;
 
-	desc = __get_stage_desc(client);
+	desc = __get_proxy_stage_desc(client);
 	memset(desc, 0, sizeof(*desc));
 }
 
@@ -553,7 +521,7 @@ static void inject_preempt_context(struct work_struct *work)
 	struct intel_guc *guc = container_of(preempt_work, typeof(*guc),
 					     preempt_work[engine->id]);
 	struct intel_guc_client *client = guc->preempt_client;
-	struct guc_stage_desc *stage_desc = __get_stage_desc(client);
+	struct guc_stage_desc *stage_desc = __get_proxy_stage_desc(client);
 	struct intel_context *ce = to_intel_context(client->owner, engine);
 	u32 data[7];
 
@@ -919,6 +887,7 @@ guc_client_alloc(struct drm_i915_private *dev_priv,
 	struct i915_vma *vma;
 	void *vaddr;
 	int ret;
+	u32 starting_id;
 
 	client = kzalloc(sizeof(*client), GFP_KERNEL);
 	if (!client)
@@ -931,8 +900,11 @@ guc_client_alloc(struct drm_i915_private *dev_priv,
 	client->doorbell_id = GUC_DOORBELL_INVALID;
 	spin_lock_init(&client->wq_lock);
 
-	ret = ida_simple_get(&guc->stage_ids, 0, GUC_MAX_STAGE_DESCRIPTORS,
-			     GFP_KERNEL);
+	if (!I915_SELFTEST_ONLY(starting_id = guc->starting_proxy_id))
+		starting_id = GUC_MAX_PPAL_STAGE_DESCRIPTORS;
+
+	ret = ida_simple_get(&guc->client_ids, starting_id,
+			     GUC_MAX_STAGE_DESCRIPTORS, GFP_KERNEL);
 	if (ret < 0)
 		goto err_client;
 
@@ -983,7 +955,7 @@ guc_client_alloc(struct drm_i915_private *dev_priv,
 err_vma:
 	i915_vma_unpin_and_release(&client->vma, 0);
 err_id:
-	ida_simple_remove(&guc->stage_ids, client->stage_id);
+	ida_simple_remove(&guc->client_ids, client->stage_id);
 err_client:
 	kfree(client);
 	return ERR_PTR(ret);
@@ -993,7 +965,7 @@ static void guc_client_free(struct intel_guc_client *client)
 {
 	unreserve_doorbell(client);
 	i915_vma_unpin_and_release(&client->vma, I915_VMA_RELEASE_MAP);
-	ida_simple_remove(&client->guc->stage_ids, client->stage_id);
+	ida_simple_remove(&client->guc->client_ids, client->stage_id);
 	kfree(client);
 }
 
@@ -1063,7 +1035,7 @@ static int __guc_client_enable(struct intel_guc_client *client)
 	int ret;
 
 	guc_proc_desc_init(client);
-	guc_stage_desc_init(client);
+	guc_proxy_stage_desc_init(client);
 
 	ret = create_doorbell(client);
 	if (ret)
@@ -1072,7 +1044,7 @@ static int __guc_client_enable(struct intel_guc_client *client)
 	return 0;
 
 fail:
-	guc_stage_desc_fini(client);
+	guc_proxy_stage_desc_fini(client);
 	guc_proc_desc_fini(client);
 	return ret;
 }
@@ -1089,7 +1061,7 @@ static void __guc_client_disable(struct intel_guc_client *client)
 	else
 		__destroy_doorbell(client);
 
-	guc_stage_desc_fini(client);
+	guc_proxy_stage_desc_fini(client);
 	guc_proc_desc_fini(client);
 }
 
@@ -1145,6 +1117,9 @@ int intel_guc_submission_init(struct intel_guc *guc)
 	GEM_BUG_ON(!guc->stage_desc_pool);
 
 	WARN_ON(!guc_verify_doorbells(guc));
+
+	ida_init(&guc->client_ids);
+
 	ret = guc_clients_create(guc);
 	if (ret)
 		goto err_pool;
@@ -1157,6 +1132,7 @@ int intel_guc_submission_init(struct intel_guc *guc)
 	return 0;
 
 err_pool:
+	ida_destroy(&guc->client_ids);
 	guc_stage_desc_pool_destroy(guc);
 	return ret;
 }
@@ -1173,6 +1149,8 @@ void intel_guc_submission_fini(struct intel_guc *guc)
 	guc_clients_destroy(guc);
 	WARN_ON(!guc_verify_doorbells(guc));
 
+	ida_destroy(&guc->client_ids);
+
 	if (guc->stage_desc_pool)
 		guc_stage_desc_pool_destroy(guc);
 }
@@ -1257,6 +1235,203 @@ static void guc_submission_unpark(struct intel_engine_cs *engine)
 	intel_engine_pin_breadcrumbs_irq(engine);
 }
 
+static void guc_map_gem_ctx_to_ppal_stage(struct intel_guc *guc,
+					  struct guc_stage_desc *desc,
+					  u32 id)
+{
+	GEM_BUG_ON(desc->attribute & GUC_STAGE_DESC_ATTR_ACTIVE);
+
+	desc->attribute = GUC_STAGE_DESC_ATTR_ACTIVE |
+			  GUC_STAGE_DESC_ATTR_PRINCIPAL |
+			  GUC_STAGE_DESC_ATTR_KERNEL;
+	desc->stage_id = id;
+
+	/* all ppal contexts will be submitted trough the execbuf client */
+	desc->proxy_id = guc->execbuf_client->stage_id;
+
+	/*
+	 * max_lrc_per_class is used in GuC to cut short loops over the
+	 * lrc_bitmap when only a small amount of lrcs are used. We could
+	 * recalculate this value every time an lrc is added or removed, but
+	 * given the fact that we only have a max number of lrcs per stage_desc
+	 * equal to the max number of instances of a class (because we map
+	 * gem_context 1:1 with stage_desc) and that the GuC loops only in
+	 * specific cases, redoing the calculation each time doesn't give us a
+	 * big benefit for the cost so we can just use a static value.
+	 */
+	desc->max_lrc_per_class = MAX_ENGINE_INSTANCE + 1;
+}
+
+static void guc_unmap_gem_ctx_from_ppal_stage(struct intel_guc *guc,
+					      struct guc_stage_desc *desc)
+{
+	GEM_BUG_ON(!(desc->attribute & GUC_STAGE_DESC_ATTR_ACTIVE));
+	GEM_BUG_ON(desc->lrc_count > 0);
+
+	memset(desc, 0, sizeof(*desc));
+}
+
+static inline void guc_ppal_stage_lrc_pin(struct intel_engine_cs *engine,
+					  struct i915_gem_context *ctx,
+					  struct intel_context *ce)
+{
+	struct intel_guc *guc = &ctx->i915->guc;
+	struct guc_stage_desc *desc;
+	struct guc_execlist_context *lrc;
+	u8 guc_class = engine->class;
+
+	/* 1:1 gem_context to ppal mapping */
+	GEM_BUG_ON(ce->sw_counter > MAX_ENGINE_INSTANCE);
+
+	desc = __get_ppal_stage_desc(guc, ce->sw_context_id);
+	GEM_BUG_ON(desc->lrc_alloc_map[guc_class].bitmap & BIT(ce->sw_counter));
+
+	if (!desc->lrc_count++)
+		guc_map_gem_ctx_to_ppal_stage(guc, desc, ce->sw_context_id);
+
+	lrc = &desc->lrc[guc_class][ce->sw_counter];
+	lrc->hw_context_desc = ce->lrc_desc;
+	lrc->ring_lrc = intel_guc_ggtt_offset(guc, ce->state) +
+			LRC_STATE_PN * PAGE_SIZE;
+	lrc->ring_begin = intel_guc_ggtt_offset(guc, ce->ring->vma);
+	lrc->ring_end = lrc->ring_begin + ce->ring->size - 1;
+
+	desc->lrc_alloc_map[guc_class].bitmap |= BIT(ce->sw_counter);
+}
+
+static inline void guc_ppal_stage_lrc_unpin(struct intel_context *ce)
+{
+	struct i915_gem_context *ctx = ce->gem_context;
+	struct intel_guc *guc = &ctx->i915->guc;
+	struct intel_engine_cs *engine = ctx->i915->engine[ce - ctx->__engine];
+	struct guc_stage_desc *desc;
+	struct guc_execlist_context *lrc;
+	u8 guc_class = engine->class;
+
+	desc = __get_ppal_stage_desc(guc, ce->sw_context_id);
+	GEM_BUG_ON(!(desc->lrc_alloc_map[guc_class].bitmap & BIT(ce->sw_counter)));
+
+	lrc = &desc->lrc[guc_class][ce->sw_counter];
+
+	/*
+	 * GuC needs us to keep the lrc mapped until it has finished processing
+	 * the ctx switch interrupt. When executing nop or very small workloads
+	 * it is possible (but quite unlikely) that 2 contexts on different
+	 * ELSPs of the same engine complete before the GuC manages to process
+	 * the interrupt for the first completion. Experiments show this happens
+	 * for ~0.2% of contexts when executing nop workloads on different
+	 * contexts back to back on the same engine. When submitting nop
+	 * workloads on all engines at the same time the hit-rate goes up to
+	 * ~0.7%. In all the observed cases GuC required < 100us to catch up,
+	 * with the single engine case being always below 20us.
+	 *
+	 * The completion of the request on the second lrc will reduce our
+	 * pin_count on the first lrc to zero, thus triggering a call to this
+	 * function potentially before GuC has had time to process the
+	 * interrupt. To avoid this, we could get an extra pin on the context or
+	 * delay the unpin when guc is in use, but given that the issue is
+	 * limited to pathological scenarios and has very low hit rate even
+	 * there, we can just introduce a small delay when it happens to give
+	 * time to GuC to catch up. Also to be noted that since the requests
+	 * have completed on the HW we've most likely already sent GuC the next
+	 * contexts to be executed, so it is unlikely that by waiting we'll add
+	 * bubbles in the HW execution.
+	 */
+	WARN_ON(wait_for_us(lrc->is_present_in_sq == 0, 1000));
+
+	desc->lrc_alloc_map[guc_class].bitmap &= ~BIT(ce->sw_counter);
+	memset(lrc, 0, sizeof(*lrc));
+
+	if (!--desc->lrc_count)
+		guc_unmap_gem_ctx_from_ppal_stage(guc, desc);
+}
+
+static inline void guc_init_lrc_mapping(struct intel_guc *guc)
+{
+	struct drm_i915_private *i915 = guc_to_i915(guc);
+	struct intel_engine_cs *engine;
+	struct i915_gem_context *ctx;
+	struct intel_context *ce;
+	enum intel_engine_id id;
+
+	/*
+	 * Some context (e.g. kernel_context) might have been pinned before we
+	 * enabled GuC submission, so we need to add them to the GuC bookeping.
+	 * Also, after a reset the GuC we want to make sure that the information
+	 * shared with GuC is properly reset.
+	 *
+	 * NOTE: the code below assumes 1:1 mapping between ppal descriptors and
+	 * gem contexts for simplicity.
+	 */
+	list_for_each_entry(ctx, &i915->contexts.list, link) {
+		if (atomic_read(&ctx->hw_id_pin_count)) {
+			struct guc_stage_desc *desc;
+
+			/* make sure the descriptor is clean... */
+			GEM_BUG_ON(ctx->hw_id > GUC_MAX_PPAL_STAGE_DESCRIPTORS);
+			desc = __get_ppal_stage_desc(guc, ctx->hw_id);
+			memset(desc, 0, sizeof(*desc));
+
+			/* ...and the (re-)pin all the lrcs */
+			for_each_engine(engine, i915, id) {
+				ce = to_intel_context(ctx, engine);
+				if (ce->pin_count > 0)
+					guc_ppal_stage_lrc_pin(engine, ctx, ce);
+			}
+		}
+	}
+}
+
+static inline void guc_fini_lrc_mapping(struct intel_guc *guc)
+{
+	struct drm_i915_private *i915 = guc_to_i915(guc);
+	struct intel_engine_cs *engine;
+	struct i915_gem_context *ctx;
+	struct intel_context *ce;
+	enum intel_engine_id id;
+
+	list_for_each_entry(ctx, &i915->contexts.list, link) {
+		if (atomic_read(&ctx->hw_id_pin_count)) {
+			for_each_engine(engine, i915, id) {
+				ce = to_intel_context(ctx, engine);
+				if (ce->pin_count > 0)
+					guc_ppal_stage_lrc_unpin(ce);
+			}
+		}
+	}
+}
+
+static void guc_context_unpin(struct intel_context *ce)
+{
+	guc_ppal_stage_lrc_unpin(ce);
+	intel_execlists_context_unpin(ce);
+}
+
+static const struct intel_context_ops guc_context_ops = {
+	.unpin = guc_context_unpin,
+	.destroy = intel_execlists_context_destroy,
+};
+
+static struct intel_context *guc_context_pin(struct intel_engine_cs *engine,
+					     struct i915_gem_context *ctx)
+{
+	struct intel_context *ce = to_intel_context(ctx, engine);
+
+	lockdep_assert_held(&ctx->i915->drm.struct_mutex);
+
+	if (likely(ce->pin_count++))
+		return ce;
+	GEM_BUG_ON(!ce->pin_count); /* no overflow please! */
+
+	ce->ops = &guc_context_ops;
+
+	ce = intel_execlists_context_pin(engine, ctx, ce);
+	if (!IS_ERR(ce))
+		guc_ppal_stage_lrc_pin(engine, ctx, ce);
+
+	return ce;
+}
+
 static void guc_set_default_submission(struct intel_engine_cs *engine)
 {
 	/*
@@ -1274,6 +1449,8 @@ static void guc_set_default_submission(struct intel_engine_cs *engine)
 
 	engine->execlists.tasklet.func = guc_submission_tasklet;
 
+	engine->context_pin = guc_context_pin;
+
 	engine->park = guc_submission_park;
 	engine->unpark = guc_submission_unpark;
 
@@ -1320,6 +1497,8 @@ int intel_guc_submission_enable(struct intel_guc *guc)
 		engine->set_default_submission(engine);
 	}
 
+	guc_init_lrc_mapping(guc);
+
 	return 0;
 }
 
@@ -1329,6 +1508,7 @@ void intel_guc_submission_disable(struct intel_guc *guc)
 
 	GEM_BUG_ON(dev_priv->gt.awake); /* GT should be parked first */
 
+	guc_fini_lrc_mapping(guc);
 	guc_interrupts_release(dev_priv);
 	guc_clients_disable(guc);
 }
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 48e0cdf42221..444bc83554c5 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1071,7 +1071,7 @@ static void execlists_submit_request(struct i915_request *request)
 	spin_unlock_irqrestore(&engine->timeline.lock, flags);
 }
 
-static void execlists_context_destroy(struct intel_context *ce)
+void intel_execlists_context_destroy(struct intel_context *ce)
 {
 	GEM_BUG_ON(ce->pin_count);
 
@@ -1084,7 +1084,7 @@ static void execlists_context_destroy(struct intel_context *ce)
 	i915_gem_object_put(ce->state->obj);
 }
 
-static void execlists_context_unpin(struct intel_context *ce)
+void intel_execlists_context_unpin(struct intel_context *ce)
 {
 	struct intel_engine_cs *engine;
 
@@ -1141,10 +1141,10 @@ static int __context_pin(struct i915_gem_context *ctx, struct i915_vma *vma)
 	return i915_vma_pin(vma, 0, 0, flags);
 }
 
-static struct intel_context *
-__execlists_context_pin(struct intel_engine_cs *engine,
-			struct i915_gem_context *ctx,
-			struct intel_context *ce)
+struct intel_context *
+intel_execlists_context_pin(struct intel_engine_cs *engine,
+			    struct i915_gem_context *ctx,
+			    struct intel_context *ce)
 {
 	void *vaddr;
 	int ret;
@@ -1205,8 +1205,8 @@ __execlists_context_pin(struct intel_engine_cs *engine,
 }
 
 static const struct intel_context_ops execlists_context_ops = {
-	.unpin = execlists_context_unpin,
-	.destroy = execlists_context_destroy,
+	.unpin = intel_execlists_context_unpin,
+	.destroy = intel_execlists_context_destroy,
 };
 
 static struct intel_context *
@@ -1224,7 +1224,7 @@ execlists_context_pin(struct intel_engine_cs *engine,
 
 	ce->ops = &execlists_context_ops;
 
-	return __execlists_context_pin(engine, ctx, ce);
+	return intel_execlists_context_pin(engine, ctx, ce);
 }
 
 static int execlists_request_alloc(struct i915_request *request)
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index f5a5502ecf70..178b181ea651 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -104,4 +104,11 @@ void intel_lr_context_resume(struct drm_i915_private *dev_priv);
 
 void intel_execlists_set_default_submission(struct intel_engine_cs *engine);
 
+struct intel_context *
+intel_execlists_context_pin(struct intel_engine_cs *engine,
+			    struct i915_gem_context *ctx,
+			    struct intel_context *ce);
+void intel_execlists_context_unpin(struct intel_context *ce);
+void intel_execlists_context_destroy(struct intel_context *ce);
+
 #endif /* _INTEL_LRC_H_ */
diff --git a/drivers/gpu/drm/i915/selftests/intel_guc.c b/drivers/gpu/drm/i915/selftests/intel_guc.c
index bf27162fb327..eb4e8bbe8c82 100644
--- a/drivers/gpu/drm/i915/selftests/intel_guc.c
+++ b/drivers/gpu/drm/i915/selftests/intel_guc.c
@@ -301,6 +301,7 @@ static int igt_guc_doorbells(void *arg)
 	if (err)
 		goto unlock;
 
+	guc->starting_proxy_id = GUC_MAX_PPAL_STAGE_DESCRIPTORS - ATTEMPTS;
 	for (i = 0; i < ATTEMPTS; i++) {
 		clients[i] = guc_client_alloc(dev_priv,
 					      INTEL_INFO(dev_priv)->ring_mask,
@@ -334,7 +335,8 @@ static int igt_guc_doorbells(void *arg)
 		 * The check below is only valid because we keep a doorbell
 		 * assigned during the whole life of the client.
 		 */
-		if (clients[i]->stage_id >= GUC_NUM_DOORBELLS) {
+		if ((clients[i]->stage_id - guc->starting_proxy_id) >=
+		     GUC_NUM_DOORBELLS) {
 			pr_err("[%d] more clients than doorbells (%d >= %d)\n",
 			       i, clients[i]->stage_id, GUC_NUM_DOORBELLS);
 			err = -EINVAL;
-- 
2.19.0