[Intel-gfx] [PATCH 21/30] drm/i915/guc: New GuC stage descriptors

Fri Mar 29 22:11:09 UTC 2019

From: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>

With the new interface, GuC now requires every lrc to be registered in
one of the stage descriptors, which have been re-designed so that each
descriptor can store up to 64 lrc per class (i.e. equal to the possible
SW counter values).
Similarly to what happened with the previous legacy design, it is possible
to have a single "proxy" descriptor that owns the workqueue and the
doorbell and use it for all submissions. To distinguish the proxy
descriptors from the ones used for lrc storage, the latter have been
called "principal". A descriptor can't be both a proxy and a principal
at the same time; to enforce this, since we only use 1 proxy descriptor
per client, we reserve enough descriptors at the end of the pool (the
highest indices) to be used as proxies and leave the others as
principals. For simplicity, we currently map context IDs 1:1 to principal
descriptors, but we could have more contexts in flight if needed by using
the SW counter.
Note that the lrcs need to be mapped in the principal descriptor until
GuC is done with them. This means that we can't release the HW id when
the user app closes the ctx, because it might still be in flight with GuC,
and that we need to be careful when unpinning, because the fact that a
request on the next context has completed doesn't mean that GuC is done
processing the first one. See in-code comments for details.

NOTE: GuC is not going to look at lrcs that are not in flight, so we
could potentially skip the unpinning steps. However, the unpinning steps
perform extra correctness checks, so better keep them until we're sure
that the flow is solid.

Based on an initial patch by Oscar Mateo.

Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio at intel.com>
Signed-off-by: Michal Wajdeczko <michal.wajdeczko at intel.com>
Cc: Michal Winiarski <michal.winiarski at intel.com>
Cc: Tomasz Lis <tomasz.lis at intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c         |  30 +-
 drivers/gpu/drm/i915/i915_drv.h             |   5 +-
 drivers/gpu/drm/i915/i915_gem_context.c     |   9 +-
 drivers/gpu/drm/i915/intel_guc.h            |  14 +-
 drivers/gpu/drm/i915/intel_guc_fwif.h       |  73 +++--
 drivers/gpu/drm/i915/intel_guc_submission.c | 345 +++++++++++++++-----
 drivers/gpu/drm/i915/intel_lrc.c            |  12 +-
 drivers/gpu/drm/i915/intel_lrc.h            |   4 +
 drivers/gpu/drm/i915/selftests/intel_guc.c  |  14 +-
 9 files changed, 363 insertions(+), 143 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index f93a043f033a..24f983a21e87 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -2259,11 +2259,10 @@ static int i915_guc_stage_pool(struct seq_file *m, void *data)
 
 		seq_printf(m, "GuC stage descriptor %u:\n", index);
 		seq_printf(m, "\tIndex: %u\n", desc->stage_id);
+		seq_printf(m, "\tProxy Index: %u\n", desc->proxy_id);
 		seq_printf(m, "\tAttribute: 0x%x\n", desc->attribute);
 		seq_printf(m, "\tPriority: %d\n", desc->priority);
 		seq_printf(m, "\tDoorbell id: %d\n", desc->db_id);
-		seq_printf(m, "\tEngines used: 0x%x\n",
-			   desc->engines_used);
 		seq_printf(m, "\tDoorbell trigger phy: 0x%llx, cpu: 0x%llx, uK: 0x%x\n",
 			   desc->db_trigger_phy,
 			   desc->db_trigger_cpu,
@@ -2275,18 +2274,21 @@ static int i915_guc_stage_pool(struct seq_file *m, void *data)
 		seq_putc(m, '\n');
 
 		for_each_engine_masked(engine, dev_priv, client->engines, tmp) {
-			u32 guc_engine_id = engine->guc_id;
-			struct guc_execlist_context *lrc =
-						&desc->lrc[guc_engine_id];
-
-			seq_printf(m, "\t%s LRC:\n", engine->name);
-			seq_printf(m, "\t\tContext desc: 0x%x\n",
-				   lrc->context_desc);
-			seq_printf(m, "\t\tContext id: 0x%x\n", lrc->context_id);
-			seq_printf(m, "\t\tLRCA: 0x%x\n", lrc->ring_lrca);
-			seq_printf(m, "\t\tRing begin: 0x%x\n", lrc->ring_begin);
-			seq_printf(m, "\t\tRing end: 0x%x\n", lrc->ring_end);
-			seq_putc(m, '\n');
+			u8 class = engine->class;
+			u8 inst = engine->instance;
+
+			if (desc->lrc_alloc_map[class].bitmap & BIT(inst)) {
+				struct guc_execlist_context *lrc =
+							&desc->lrc[class][inst];
+				seq_printf(m, "\t%s LRC:\n", engine->name);
+				seq_printf(m, "\t\tHW context desc: 0x%x:0x%x\n",
+						lower_32_bits(lrc->hw_context_desc),
+						upper_32_bits(lrc->hw_context_desc));
+				seq_printf(m, "\t\tLRC: 0x%x\n", lrc->ring_lrc);
+				seq_printf(m, "\t\tRing begin: 0x%x\n", lrc->ring_begin);
+				seq_printf(m, "\t\tRing end: 0x%x\n", lrc->ring_end);
+				seq_putc(m, '\n');
+			}
 		}
 	}
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index b3d5b715b637..3e7d0f3d2c82 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1766,13 +1766,14 @@ struct drm_i915_private {
 		 * (the SW Context ID field) but GuC limits it further so
 		 * without taking advantage of part of the SW counter field the
 		 * firmware only supports a max number of contexts equal to the
-		 * number of entries in the GuC stage descriptor pool.
+		 * number of entries in the GuC stage descriptor pool, minus
+		 * the descriptors reserved for proxy usage
 		 */
 		struct ida hw_ida;
 #define MAX_CONTEXT_HW_ID (1<<21) /* exclusive */
 #define MAX_GUC_CONTEXT_HW_ID (1 << 20) /* exclusive */
 #define GEN11_MAX_CONTEXT_HW_ID (1<<11) /* exclusive */
-#define GEN11_MAX_CONTEXT_HW_ID_WITH_GUC GUC_MAX_STAGE_DESCRIPTORS
+#define GEN11_MAX_CONTEXT_HW_ID_WITH_GUC GUC_MAX_PPAL_STAGE_DESCRIPTORS
 		struct list_head hw_id_list;
 	} contexts;
 
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index d882525f67a3..74fa6610d532 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -308,10 +308,15 @@ static void context_close(struct i915_gem_context *ctx)
 	i915_gem_context_set_closed(ctx);
 
 	/*
-	 * This context will never again be assinged to HW, so we can
+	 * This context will never again be assigned to HW, so we can
 	 * reuse its ID for the next context.
+	 *
+	 * If GuC is in use, we need to keep the ID until GuC has finished
+	 * processing all submitted requests because the ID is used by the
+	 * firmware to index the guc stage_desc pool.
 	 */
-	release_hw_id(ctx);
+	if (!USES_GUC_SUBMISSION(ctx->i915))
+		release_hw_id(ctx);
 
 	/*
 	 * The LUT uses the VMA as a backpointer to unref the object,
diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h
index 4f3cf8eddfe6..dbe4297f8a7d 100644
--- a/drivers/gpu/drm/i915/intel_guc.h
+++ b/drivers/gpu/drm/i915/intel_guc.h
@@ -59,10 +59,14 @@ struct intel_guc {
 	bool interrupts_enabled;
 	unsigned int msg_enabled_mask;
 
+	struct ida client_ids;
+#define GUC_MAX_CLIENT_IDS 2
+
 	struct i915_vma *ads_vma;
 	struct i915_vma *stage_desc_pool;
 	void *stage_desc_pool_vaddr;
-	struct ida stage_ids;
+#define	GUC_MAX_PPAL_STAGE_DESCRIPTORS (GUC_MAX_STAGE_DESCRIPTORS - GUC_MAX_CLIENT_IDS)
+
 	struct i915_vma *shared_data;
 	void *shared_data_vaddr;
 
@@ -95,6 +99,14 @@ struct intel_guc {
 
 	/* GuC's FW specific notify function */
 	void (*notify)(struct intel_guc *guc);
+
+	/*
+	 * Override the first stage_desc to be used as proxy
+	 * (Default: GUC_MAX_PPAL_STAGE_DESCRIPTORS). The max number of ppal
+	 * descriptors is not updated accordingly since the test using this does
+	 * not allocate any context.
+	 */
+	I915_SELFTEST_DECLARE(u32 starting_proxy_id);
 };
 
 static inline bool intel_guc_is_alive(struct intel_guc *guc)
diff --git a/drivers/gpu/drm/i915/intel_guc_fwif.h b/drivers/gpu/drm/i915/intel_guc_fwif.h
index 61f718b907f3..a900680ff5a4 100644
--- a/drivers/gpu/drm/i915/intel_guc_fwif.h
+++ b/drivers/gpu/drm/i915/intel_guc_fwif.h
@@ -32,6 +32,8 @@
 #define GUC_MAX_STAGE_DESCRIPTORS	1024
 #define	GUC_INVALID_STAGE_ID		GUC_MAX_STAGE_DESCRIPTORS
 
+#define GUC_MAX_LRC_PER_CLASS		64
+
 #define GUC_RENDER_ENGINE		0
 #define GUC_VIDEO_ENGINE		1
 #define GUC_BLITTER_ENGINE		2
@@ -68,9 +70,12 @@
 #define WQ_RING_TAIL_MASK		(WQ_RING_TAIL_MAX << WQ_RING_TAIL_SHIFT)
 
 #define GUC_STAGE_DESC_ATTR_ACTIVE	BIT(0)
-#define GUC_STAGE_DESC_ATTR_PENDING_DB	BIT(1)
-#define GUC_STAGE_DESC_ATTR_KERNEL	BIT(2)
-#define GUC_STAGE_DESC_ATTR_PREEMPT	BIT(3)
+#define GUC_STAGE_DESC_ATTR_TYPE_SHIFT	1
+#define GUC_STAGE_DESC_ATTR_PRINCIPAL	(0x0 << GUC_STAGE_DESC_ATTR_TYPE_SHIFT)
+#define GUC_STAGE_DESC_ATTR_PROXY	(0x1 << GUC_STAGE_DESC_ATTR_TYPE_SHIFT)
+#define GUC_STAGE_DESC_ATTR_REAL	(0x2 << GUC_STAGE_DESC_ATTR_TYPE_SHIFT)
+#define GUC_STAGE_DESC_ATTR_TYPE_MASK	(0x3 << GUC_STAGE_DESC_ATTR_TYPE_SHIFT)
+#define GUC_STAGE_DESC_ATTR_KERNEL	(1 << 3)
 #define GUC_STAGE_DESC_ATTR_RESET	BIT(4)
 #define GUC_STAGE_DESC_ATTR_WQLOCKED	BIT(5)
 #define GUC_STAGE_DESC_ATTR_PCH		BIT(6)
@@ -234,9 +239,10 @@ struct guc_process_desc {
 	u64 wq_base_addr;
 	u32 wq_size_bytes;
 	u32 wq_status;
-	u32 engine_presence;
 	u32 priority;
-	u32 reserved[30];
+	u32 token;
+	u32 queue_engine_error;
+	u32 reserved[23];
 } __packed;
 
 /* engine id and context id is packed into guc_execlist_context.context_id*/
@@ -245,18 +251,20 @@ struct guc_process_desc {
 
 /* The execlist context including software and HW information */
 struct guc_execlist_context {
-	u32 context_desc;
-	u32 context_id;
-	u32 ring_status;
-	u32 ring_lrca;
+	u64 hw_context_desc;
+	u32 reserved0;
+	u32 ring_lrc;
 	u32 ring_begin;
 	u32 ring_end;
 	u32 ring_next_free_location;
 	u32 ring_current_tail_pointer_value;
-	u8 engine_state_submit_value;
-	u8 engine_state_wait_value;
-	u16 pagefault_count;
-	u16 engine_submit_queue_count;
+	u32 engine_state_wait_value;
+	u32 state_reserved;
+	u32 is_present_in_sq;
+	u32 sync_value;
+	u32 sync_addr;
+	u32 slpc_hints;
+	u32 reserved1[4];
 } __packed;
 
 /*
@@ -269,36 +277,33 @@ struct guc_execlist_context {
  * with the GuC, being allocated before the GuC is loaded with its firmware.
  */
 struct guc_stage_desc {
-	u32 sched_common_area;
+	u64 desc_private;
 	u32 stage_id;
-	u32 pas_id;
-	u8 engines_used;
+	u32 proxy_id;
 	u64 db_trigger_cpu;
 	u32 db_trigger_uk;
 	u64 db_trigger_phy;
-	u16 db_id;
-
-	struct guc_execlist_context lrc[GUC_MAX_ENGINES_NUM];
-
-	u8 attribute;
-
-	u32 priority;
-
+	u32 db_id;
+	struct guc_execlist_context lrc[GUC_MAX_ENGINE_CLASSES][GUC_MAX_LRC_PER_CLASS];
+	struct {
+		u64 bitmap;
+		u32 reserved0;
+	} __packed lrc_alloc_map[GUC_MAX_ENGINE_CLASSES];
+	u32 lrc_count;
+	u32 max_lrc_per_class;
+	u32 attribute; /* GUC_STAGE_DESC_ATTR_xxx */
+	u32 priority; /* GUC_CLIENT_PRIORITY_xxx */
 	u32 wq_sampled_tail_offset;
 	u32 wq_total_submit_enqueues;
-
 	u32 process_desc;
 	u32 wq_addr;
 	u32 wq_size;
-
-	u32 engine_presence;
-
-	u8 engine_suspended;
-
-	u8 reserved0[3];
-	u64 reserved1[1];
-
-	u64 desc_private;
+	u32 feature0;
+	u32 feature1;
+	u32 feature2;
+	u32 queue_engine_error;
+	u32 reserved[2];
+	u64 reserved3[12];
 } __packed;
 
 /**
diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
index f5dc11536850..3104f091abc5 100644
--- a/drivers/gpu/drm/i915/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/intel_guc_submission.c
@@ -46,17 +46,25 @@
  * that contains all required pages for these elements).
  *
  * GuC stage descriptor:
- * During initialization, the driver allocates a static pool of 1024 such
- * descriptors, and shares them with the GuC.
- * Currently, there exists a 1:1 mapping between a intel_guc_client and a
- * guc_stage_desc (via the client's stage_id), so effectively only one
- * gets used. This stage descriptor lets the GuC know about the doorbell,
- * workqueue and process descriptor. Theoretically, it also lets the GuC
- * know about our HW contexts (context ID, etc...), but we actually
- * employ a kind of submission where the GuC uses the LRCA sent via the work
- * item instead (the single guc_stage_desc associated to execbuf client
- * contains information about the default kernel context only, but this is
- * essentially unused). This is called a "proxy" submission.
+ * During initialization, the driver allocates a static pool of descriptors
+ * and shares them with the GuC. A stage descriptor lets the GuC know about
+ * the doorbell, workqueue and process descriptor, additionally it stores
+ * information about all possible HW contexts that use it (64 x number of
+ * engine classes of guc_execlist_context structs).
+ *
+ * The idea is that every direct-submission GuC client gets one SW Context ID
+ * and every HW context created by that client gets one SW Counter. The "SW
+ * Context ID" and "SW Counter" to use now get passed on every work queue item.
+ *
+ * But we don't have direct submission yet: does that mean we are limited to 64
+ * contexts in total (one client)? Not really: for non-direct submissions we can
+ * reserve one or more descriptors to act as "proxy", which are descriptors that
+ * own a doorbell and a workqueue but contain no HW contexts information. Proxy
+ * descriptors can be used to submit HW contexts whose info is stored in other
+ * context descriptors, which we call "principals". With this mechanism, we can
+ * use almost all descriptors as principals to store HW context info, thus
+ * increasing the maximum number of contexts we can have. This is a
+ * generalization of the old proxy submission.
  *
  * The Scratch registers:
  * There are 16 MMIO-based registers start from 0xC180. The kernel driver writes
@@ -170,11 +178,28 @@ static int __guc_deallocate_doorbell(struct intel_guc *guc, u32 stage_id)
 	return intel_guc_send(guc, action, ARRAY_SIZE(action));
 }
 
-static struct guc_stage_desc *__get_stage_desc(struct intel_guc_client *client)
+static struct guc_stage_desc *__get_stage_desc(struct intel_guc *guc, u32 index)
+{
+	struct guc_stage_desc *base = guc->stage_desc_pool_vaddr;
+
+	GEM_BUG_ON(!USES_GUC_SUBMISSION(guc_to_i915(guc)));
+	GEM_BUG_ON(index >= GUC_MAX_STAGE_DESCRIPTORS);
+
+	return &base[index];
+}
+
+static struct guc_stage_desc *__get_proxy_stage_desc(struct intel_guc_client *client)
 {
-	struct guc_stage_desc *base = client->guc->stage_desc_pool_vaddr;
+	GEM_BUG_ON(!I915_SELFTEST_ONLY(client->guc->starting_proxy_id) &&
+			client->stage_id < GUC_MAX_PPAL_STAGE_DESCRIPTORS);
+	return __get_stage_desc(client->guc, client->stage_id);
+}
 
-	return &base[client->stage_id];
+static struct guc_stage_desc *__get_ppal_stage_desc(struct intel_guc *guc,
+						    u32 index)
+{
+	GEM_BUG_ON(index >= GUC_MAX_PPAL_STAGE_DESCRIPTORS);
+	return __get_stage_desc(guc, index);
 }
 
 /*
@@ -189,7 +214,7 @@ static void __update_doorbell_desc(struct intel_guc_client *client, u16 new_id)
 	struct guc_stage_desc *desc;
 
 	/* Update the GuC's idea of the doorbell ID */
-	desc = __get_stage_desc(client);
+	desc = __get_proxy_stage_desc(client);
 	desc->db_id = new_id;
 }
 
@@ -342,14 +367,12 @@ static int guc_stage_desc_pool_create(struct intel_guc *guc)
 
 	guc->stage_desc_pool = vma;
 	guc->stage_desc_pool_vaddr = vaddr;
-	ida_init(&guc->stage_ids);
 
 	return 0;
 }
 
 static void guc_stage_desc_pool_destroy(struct intel_guc *guc)
 {
-	ida_destroy(&guc->stage_ids);
 	i915_vma_unpin_and_release(&guc->stage_desc_pool, I915_VMA_RELEASE_MAP);
 }
 
@@ -360,78 +383,26 @@ static void guc_stage_desc_pool_destroy(struct intel_guc *guc)
  * data structures relating to this client (doorbell, process descriptor,
  * write queue, etc).
  */
-static void guc_stage_desc_init(struct intel_guc_client *client)
+static void guc_proxy_stage_desc_init(struct intel_guc_client *client)
 {
-	struct intel_guc *guc = client->guc;
-	struct drm_i915_private *dev_priv = guc_to_i915(guc);
-	struct intel_engine_cs *engine;
-	struct i915_gem_context *ctx = client->owner;
 	struct guc_stage_desc *desc;
-	unsigned int tmp;
 	u32 gfx_addr;
 
-	desc = __get_stage_desc(client);
+	desc = __get_proxy_stage_desc(client);
 	memset(desc, 0, sizeof(*desc));
 
 	desc->attribute = GUC_STAGE_DESC_ATTR_ACTIVE |
+			  GUC_STAGE_DESC_ATTR_PROXY |
 			  GUC_STAGE_DESC_ATTR_KERNEL;
-	if (is_high_priority(client))
-		desc->attribute |= GUC_STAGE_DESC_ATTR_PREEMPT;
 	desc->stage_id = client->stage_id;
 	desc->priority = client->priority;
 	desc->db_id = client->doorbell_id;
 
-	for_each_engine_masked(engine, dev_priv, client->engines, tmp) {
-		struct intel_context *ce = intel_context_lookup(ctx, engine);
-		u32 guc_engine_id = engine->guc_id;
-		struct guc_execlist_context *lrc = &desc->lrc[guc_engine_id];
-
-		/* TODO: We have a design issue to be solved here. Only when we
-		 * receive the first batch, we know which engine is used by the
-		 * user. But here GuC expects the lrc and ring to be pinned. It
-		 * is not an issue for default context, which is the only one
-		 * for now who owns a GuC client. But for future owner of GuC
-		 * client, need to make sure lrc is pinned prior to enter here.
-		 */
-		if (!ce || !ce->state)
-			break;	/* XXX: continue? */
-
-		/*
-		 * XXX: When this is a GUC_STAGE_DESC_ATTR_KERNEL client (proxy
-		 * submission or, in other words, not using a direct submission
-		 * model) the KMD's LRCA is not used for any work submission.
-		 * Instead, the GuC uses the LRCA of the user mode context (see
-		 * guc_add_request below).
-		 */
-		lrc->context_desc = lower_32_bits(ce->lrc_desc);
-
-		/* The state page is after PPHWSP */
-		lrc->ring_lrca = intel_guc_ggtt_offset(guc, ce->state) +
-				 LRC_STATE_PN * PAGE_SIZE;
-
-		/* XXX: In direct submission, the GuC wants the HW context id
-		 * here. In proxy submission, it wants the stage id
-		 */
-		lrc->context_id = (client->stage_id << GUC_ELC_CTXID_OFFSET) |
-				(guc_engine_id << GUC_ELC_ENGINE_OFFSET);
-
-		lrc->ring_begin = intel_guc_ggtt_offset(guc, ce->ring->vma);
-		lrc->ring_end = lrc->ring_begin + ce->ring->size - 1;
-		lrc->ring_next_free_location = lrc->ring_begin;
-		lrc->ring_current_tail_pointer_value = 0;
-
-		desc->engines_used |= (1 << guc_engine_id);
-	}
-
-	DRM_DEBUG_DRIVER("Host engines 0x%x => GuC engines used 0x%x\n",
-			 client->engines, desc->engines_used);
-	WARN_ON(desc->engines_used == 0);
-
 	/*
 	 * The doorbell, process descriptor, and workqueue are all parts
 	 * of the client object, which the GuC will reference via the GGTT
 	 */
-	gfx_addr = intel_guc_ggtt_offset(guc, client->vma);
+	gfx_addr = intel_guc_ggtt_offset(client->guc, client->vma);
 	desc->db_trigger_phy = sg_dma_address(client->vma->pages->sgl) +
 				client->doorbell_offset;
 	desc->db_trigger_cpu = ptr_to_u64(__get_doorbell(client));
@@ -443,11 +414,11 @@ static void guc_stage_desc_init(struct intel_guc_client *client)
 	desc->desc_private = ptr_to_u64(client);
 }
 
-static void guc_stage_desc_fini(struct intel_guc_client *client)
+static void guc_proxy_stage_desc_fini(struct intel_guc_client *client)
 {
 	struct guc_stage_desc *desc;
 
-	desc = __get_stage_desc(client);
+	desc = __get_proxy_stage_desc(client);
 	memset(desc, 0, sizeof(*desc));
 }
 
@@ -566,7 +537,7 @@ static void inject_preempt_context(struct work_struct *work)
 	struct intel_guc *guc = container_of(preempt_work, typeof(*guc),
 					     preempt_work[engine->id]);
 	struct intel_guc_client *client = guc->preempt_client;
-	struct guc_stage_desc *stage_desc = __get_stage_desc(client);
+	struct guc_stage_desc *stage_desc = __get_proxy_stage_desc(client);
 	struct intel_context *ce = intel_context_lookup(client->owner, engine);
 	u32 data[7];
 
@@ -931,6 +902,7 @@ guc_client_alloc(struct drm_i915_private *dev_priv,
 	struct i915_vma *vma;
 	void *vaddr;
 	int ret;
+	u32 starting_id = GUC_MAX_PPAL_STAGE_DESCRIPTORS;
 
 	client = kzalloc(sizeof(*client), GFP_KERNEL);
 	if (!client)
@@ -943,8 +915,13 @@ guc_client_alloc(struct drm_i915_private *dev_priv,
 	client->doorbell_id = GUC_DOORBELL_INVALID;
 	spin_lock_init(&client->wq_lock);
 
-	ret = ida_simple_get(&guc->stage_ids, 0, GUC_MAX_STAGE_DESCRIPTORS,
-			     GFP_KERNEL);
+#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
+	if (unlikely(guc->starting_proxy_id))
+		starting_id = guc->starting_proxy_id;
+#endif
+
+	ret = ida_simple_get(&guc->client_ids, starting_id,
+			     GUC_MAX_STAGE_DESCRIPTORS, GFP_KERNEL);
 	if (ret < 0)
 		goto err_client;
 
@@ -995,7 +972,7 @@ guc_client_alloc(struct drm_i915_private *dev_priv,
 err_vma:
 	i915_vma_unpin_and_release(&client->vma, 0);
 err_id:
-	ida_simple_remove(&guc->stage_ids, client->stage_id);
+	ida_simple_remove(&guc->client_ids, client->stage_id);
 err_client:
 	kfree(client);
 	return ERR_PTR(ret);
@@ -1005,7 +982,7 @@ static void guc_client_free(struct intel_guc_client *client)
 {
 	unreserve_doorbell(client);
 	i915_vma_unpin_and_release(&client->vma, I915_VMA_RELEASE_MAP);
-	ida_simple_remove(&client->guc->stage_ids, client->stage_id);
+	ida_simple_remove(&client->guc->client_ids, client->stage_id);
 	kfree(client);
 }
 
@@ -1075,7 +1052,7 @@ static int __guc_client_enable(struct intel_guc_client *client)
 	int ret;
 
 	guc_proc_desc_init(client);
-	guc_stage_desc_init(client);
+	guc_proxy_stage_desc_init(client);
 
 	ret = create_doorbell(client);
 	if (ret)
@@ -1084,7 +1061,7 @@ static int __guc_client_enable(struct intel_guc_client *client)
 	return 0;
 
 fail:
-	guc_stage_desc_fini(client);
+	guc_proxy_stage_desc_fini(client);
 	guc_proc_desc_fini(client);
 	return ret;
 }
@@ -1101,7 +1078,7 @@ static void __guc_client_disable(struct intel_guc_client *client)
 	else
 		__fini_doorbell(client);
 
-	guc_stage_desc_fini(client);
+	guc_proxy_stage_desc_fini(client);
 	guc_proc_desc_fini(client);
 }
 
@@ -1157,6 +1134,9 @@ int intel_guc_submission_init(struct intel_guc *guc)
 	GEM_BUG_ON(!guc->stage_desc_pool);
 
 	WARN_ON(!guc_verify_doorbells(guc));
+
+	ida_init(&guc->client_ids);
+
 	ret = guc_clients_create(guc);
 	if (ret)
 		goto err_pool;
@@ -1169,6 +1149,7 @@ int intel_guc_submission_init(struct intel_guc *guc)
 	return 0;
 
 err_pool:
+	ida_destroy(&guc->client_ids);
 	guc_stage_desc_pool_destroy(guc);
 	return ret;
 }
@@ -1185,6 +1166,8 @@ void intel_guc_submission_fini(struct intel_guc *guc)
 	guc_clients_destroy(guc);
 	WARN_ON(!guc_verify_doorbells(guc));
 
+	ida_destroy(&guc->client_ids);
+
 	if (guc->stage_desc_pool)
 		guc_stage_desc_pool_destroy(guc);
 }
@@ -1269,6 +1252,197 @@ static void guc_submission_unpark(struct intel_engine_cs *engine)
 	intel_engine_pin_breadcrumbs_irq(engine);
 }
 
+/* Activate @desc as the principal stage descriptor with index @id. */
+static void guc_map_gem_ctx_to_ppal_stage(struct intel_guc *guc,
+					  struct guc_stage_desc *desc,
+					  u32 id)
+{
+	GEM_BUG_ON(desc->attribute & GUC_STAGE_DESC_ATTR_ACTIVE);
+
+	desc->attribute = GUC_STAGE_DESC_ATTR_ACTIVE |
+			  GUC_STAGE_DESC_ATTR_PRINCIPAL |
+			  GUC_STAGE_DESC_ATTR_KERNEL;
+	desc->stage_id = id;
+
+	/* all ppal contexts will be submitted through the execbuf client */
+	desc->proxy_id = guc->execbuf_client->stage_id;
+
+	/*
+	 * max_lrc_per_class is used in GuC to cut short loops over the
+	 * lrc_alloc_map bitmap when only a small number of lrcs are used. We
+	 * could recalculate this value every time an lrc is added or removed,
+	 * but since a stage_desc holds at most one lrc per engine instance of
+	 * a class (because we map gem_context 1:1 with stage_desc) and GuC
+	 * loops only in specific cases, the recalculation isn't worth its
+	 * cost, so we can just use a static value.
+	 */
+	desc->max_lrc_per_class = MAX_ENGINE_INSTANCE + 1;
+}
+
+static void guc_unmap_gem_ctx_from_ppal_stage(struct intel_guc *guc,
+					      struct guc_stage_desc *desc)
+{
+	GEM_BUG_ON(!(desc->attribute & GUC_STAGE_DESC_ATTR_ACTIVE));
+	GEM_BUG_ON(desc->lrc_count > 0);
+
+	memset(desc, 0, sizeof(*desc));
+}
+
+static inline void guc_ppal_stage_lrc_pin(struct intel_context *ce)
+{
+	struct intel_engine_cs *engine = ce->engine;
+	struct intel_guc *guc = &engine->i915->guc;
+	struct guc_stage_desc *desc;
+	struct guc_execlist_context *lrc;
+	u8 guc_class = engine->class;
+
+	/* 1:1 gem_context to ppal mapping */
+	GEM_BUG_ON(ce->sw_counter > MAX_ENGINE_INSTANCE);
+
+	desc = __get_ppal_stage_desc(guc, ce->sw_context_id);
+	GEM_BUG_ON(desc->lrc_alloc_map[guc_class].bitmap &
+		   BIT_ULL(ce->sw_counter));
+
+	if (!desc->lrc_count++)
+		guc_map_gem_ctx_to_ppal_stage(guc, desc, ce->sw_context_id);
+
+	lrc = &desc->lrc[guc_class][ce->sw_counter];
+	lrc->hw_context_desc = ce->lrc_desc;
+	lrc->ring_lrc = intel_guc_ggtt_offset(guc, ce->state) +
+			LRC_STATE_PN * PAGE_SIZE;
+	lrc->ring_begin = intel_guc_ggtt_offset(guc, ce->ring->vma);
+	lrc->ring_end = lrc->ring_begin + ce->ring->size - 1;
+
+	desc->lrc_alloc_map[guc_class].bitmap |= BIT_ULL(ce->sw_counter);
+}
+
+static inline void guc_ppal_stage_lrc_unpin(struct intel_context *ce)
+{
+	struct i915_gem_context *ctx = ce->gem_context;
+	struct intel_guc *guc = &ctx->i915->guc;
+	struct intel_engine_cs *engine = ce->engine;
+	struct guc_stage_desc *desc;
+	struct guc_execlist_context *lrc;
+	u8 guc_class = engine->class;
+
+	desc = __get_ppal_stage_desc(guc, ce->sw_context_id);
+	GEM_BUG_ON(!(desc->lrc_alloc_map[guc_class].bitmap &
+		     BIT_ULL(ce->sw_counter)));
+
+	lrc = &desc->lrc[guc_class][ce->sw_counter];
+
+	/*
+	 * GuC needs us to keep the lrc mapped until it has finished processing
+	 * the ctx switch interrupt. When executing nop or very small workloads
+	 * it is possible (but quite unlikely) that 2 contexts on different
+	 * ELSPs of the same engine complete before the GuC manages to process
+	 * the interrupt for the first completion. Experiments show this happens
+	 * for ~0.2% of contexts when executing nop workloads on different
+	 * contexts back to back on the same engine. When submitting nop
+	 * workloads on all engines at the same time the hit-rate goes up to
+	 * ~0.7%. In all the observed cases GuC required < 100us to catch up,
+	 * with the single engine case being always below 20us.
+	 *
+	 * The completion of the request on the second lrc will reduce our
+	 * pin_count on the first lrc to zero, thus triggering a call to this
+	 * function potentially before GuC has had time to process the
+	 * interrupt. To avoid this, we could get an extra pin on the context or
+	 * delay the unpin when guc is in use, but given that the issue is
+	 * limited to pathological scenarios and has very low hit rate even
+	 * there, we can just introduce a small delay when it happens to give
+	 * time to GuC to catch up. Also to be noted that since the requests
+	 * have completed on the HW we've most likely already sent GuC the next
+	 * contexts to be executed, so it is unlikely that by waiting we'll add
+	 * bubbles in the HW execution.
+	 */
+	if (likely(intel_guc_is_alive(guc)))
+		WARN_ON(wait_for_us(lrc->is_present_in_sq == 0, 1000));
+
+	desc->lrc_alloc_map[guc_class].bitmap &= ~BIT_ULL(ce->sw_counter);
+	memset(lrc, 0, sizeof(*lrc));
+
+	if (!--desc->lrc_count)
+		guc_unmap_gem_ctx_from_ppal_stage(guc, desc);
+}
+
+static inline void guc_init_lrc_mapping(struct intel_guc *guc)
+{
+	struct drm_i915_private *i915 = guc_to_i915(guc);
+	struct intel_engine_cs *engine;
+	struct i915_gem_context *ctx;
+	struct intel_context *ce;
+	enum intel_engine_id id;
+
+	/*
+	 * Some contexts (e.g. kernel_context) might have been pinned before we
+	 * enabled GuC submission, so we need to add them to the GuC
+	 * bookkeeping. Also, after a GuC reset we want to make sure that the
+	 * information shared with GuC is properly reset.
+	 *
+	 * NOTE: assumes a 1:1 mapping of gem contexts to ppal descriptors.
+	 */
+	list_for_each_entry(ctx, &i915->contexts.list, link) {
+		if (atomic_read(&ctx->hw_id_pin_count)) {
+			struct guc_stage_desc *desc;
+
+			/* hw_id indexes the ppal pool; the limit is exclusive */
+			GEM_BUG_ON(ctx->hw_id >=
+				   GUC_MAX_PPAL_STAGE_DESCRIPTORS);
+			desc = __get_ppal_stage_desc(guc, ctx->hw_id);
+			memset(desc, 0, sizeof(*desc));
+
+			/* descriptor is clean, now (re-)pin all the lrcs */
+			for_each_engine(engine, i915, id) {
+				ce = intel_context_lookup(ctx, engine);
+				if (ce && intel_context_is_pinned(ce))
+					guc_ppal_stage_lrc_pin(ce);
+			}
+		}
+	}
+}
+
+static inline void guc_fini_lrc_mapping(struct intel_guc *guc)
+{
+	struct drm_i915_private *i915 = guc_to_i915(guc);
+	struct intel_engine_cs *engine;
+	struct i915_gem_context *ctx;
+	struct intel_context *ce;
+	enum intel_engine_id id;
+
+	list_for_each_entry(ctx, &i915->contexts.list, link) {
+		if (atomic_read(&ctx->hw_id_pin_count)) {
+			for_each_engine(engine, i915, id) {
+				ce = intel_context_lookup(ctx, engine);
+				if (ce && intel_context_is_pinned(ce))
+					guc_ppal_stage_lrc_unpin(ce);
+			}
+		}
+	}
+}
+
+static void guc_context_unpin(struct intel_context *ce)
+{
+	guc_ppal_stage_lrc_unpin(ce);
+	intel_execlists_context_unpin(ce);
+}
+
+static int guc_context_pin(struct intel_context *ce)
+{
+	int ret = 0;
+
+	ret = intel_execlists_context_pin(ce);
+	if (!ret)
+		guc_ppal_stage_lrc_pin(ce);
+
+	return ret;
+}
+
+static const struct intel_context_ops guc_context_ops = {
+	.pin = guc_context_pin,
+	.unpin = guc_context_unpin,
+	.destroy = intel_execlists_context_destroy,
+};
+
 static void guc_set_default_submission(struct intel_engine_cs *engine)
 {
 	/*
@@ -1286,6 +1460,8 @@ static void guc_set_default_submission(struct intel_engine_cs *engine)
 
 	engine->execlists.tasklet.func = guc_submission_tasklet;
 
+	engine->cops = &guc_context_ops;
+
 	engine->park = guc_submission_park;
 	engine->unpark = guc_submission_unpark;
 
@@ -1328,6 +1504,8 @@ int intel_guc_submission_enable(struct intel_guc *guc)
 		engine->set_default_submission(engine);
 	}
 
+	guc_init_lrc_mapping(guc);
+
 	return 0;
 }
 
@@ -1337,6 +1515,7 @@ void intel_guc_submission_disable(struct intel_guc *guc)
 
 	GEM_BUG_ON(dev_priv->gt.awake); /* GT should be parked first */
 
+	guc_fini_lrc_mapping(guc);
 	guc_interrupts_release(dev_priv);
 	guc_clients_disable(guc);
 }
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 31db38560680..e750ff31e58e 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1237,7 +1237,7 @@ static void __execlists_context_fini(struct intel_context *ce)
 	i915_gem_object_put(ce->state->obj);
 }
 
-static void execlists_context_destroy(struct kref *kref)
+void intel_execlists_context_destroy(struct kref *kref)
 {
 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
 
@@ -1273,7 +1273,7 @@ static void __context_unpin(struct i915_vma *vma)
 	__i915_vma_unpin(vma);
 }
 
-static void execlists_context_unpin(struct intel_context *ce)
+void intel_execlists_context_unpin(struct intel_context *ce)
 {
 	struct intel_engine_cs *engine;
 
@@ -1380,15 +1380,15 @@ __execlists_context_pin(struct intel_context *ce,
 	return ret;
 }
 
-static int execlists_context_pin(struct intel_context *ce)
+int intel_execlists_context_pin(struct intel_context *ce)
 {
 	return __execlists_context_pin(ce, ce->engine);
 }
 
 static const struct intel_context_ops execlists_context_ops = {
-	.pin = execlists_context_pin,
-	.unpin = execlists_context_unpin,
-	.destroy = execlists_context_destroy,
+	.pin = intel_execlists_context_pin,
+	.unpin = intel_execlists_context_unpin,
+	.destroy = intel_execlists_context_destroy,
 };
 
 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index 92642ab91472..addf796355de 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -114,4 +114,8 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine,
 
 u32 gen8_make_rpcs(struct drm_i915_private *i915, struct intel_sseu *ctx_sseu);
 
+int intel_execlists_context_pin(struct intel_context *ce);
+void intel_execlists_context_unpin(struct intel_context *ce);
+void intel_execlists_context_destroy(struct kref *kref);
+
 #endif /* _INTEL_LRC_H_ */
diff --git a/drivers/gpu/drm/i915/selftests/intel_guc.c b/drivers/gpu/drm/i915/selftests/intel_guc.c
index b05a21eaa8f4..0102ee90355c 100644
--- a/drivers/gpu/drm/i915/selftests/intel_guc.c
+++ b/drivers/gpu/drm/i915/selftests/intel_guc.c
@@ -259,6 +259,8 @@ static int igt_guc_doorbells(void *arg)
 	if (err)
 		goto unlock;
 
+	BUILD_BUG_ON(GUC_MAX_PPAL_STAGE_DESCRIPTORS < ATTEMPTS);
+	guc->starting_proxy_id = GUC_MAX_PPAL_STAGE_DESCRIPTORS - ATTEMPTS;
 	for (i = 0; i < ATTEMPTS; i++) {
 		clients[i] = guc_client_alloc(dev_priv,
 					      INTEL_INFO(dev_priv)->engine_mask,
@@ -288,11 +290,19 @@ static int igt_guc_doorbells(void *arg)
 			continue;
 		}
 
+		if (clients[i]->stage_id < guc->starting_proxy_id) {
+			pr_err("[%u] invalid stage id %u assigned to client\n",
+			       i, clients[i]->stage_id);
+			err = -EINVAL;
+			goto out;
+		}
+
 		/*
 		 * The check below is only valid because we keep a doorbell
 		 * assigned during the whole life of the client.
 		 */
-		if (clients[i]->stage_id >= GUC_NUM_DOORBELLS) {
+		if ((clients[i]->stage_id - guc->starting_proxy_id) >=
+		     GUC_NUM_DOORBELLS) {
 			pr_err("[%d] more clients than doorbells (%d >= %d)\n",
 			       i, clients[i]->stage_id, GUC_NUM_DOORBELLS);
 			err = -EINVAL;
@@ -338,6 +348,8 @@ static int igt_guc_doorbells(void *arg)
 			__guc_client_disable(clients[i]);
 			guc_client_free(clients[i]);
 		}
+
+	guc->starting_proxy_id = 0;
 unlock:
 	intel_runtime_pm_put(dev_priv, wakeref);
 	mutex_unlock(&dev_priv->drm.struct_mutex);
-- 
2.19.2