[Intel-xe] [PATCH 11/30] drm/xe/guc: Use doorbells for submission if possible

Matthew Brost matthew.brost at intel.com
Mon May 1 07:50:55 UTC 2023


We have 256 doorbells (on most platforms) that we can allocate to bypass
using the H2G channel for submission. This will avoid contention on the
CT mutex.

Signed-off-by: Matthew Brost <matthew.brost at intel.com>
Suggested-by: Faith Ekstrand <faith.ekstrand at collabora.com>
---
 drivers/gpu/drm/xe/regs/xe_guc_regs.h    |   1 +
 drivers/gpu/drm/xe/xe_guc.c              |   6 +
 drivers/gpu/drm/xe/xe_guc_engine_types.h |   7 +
 drivers/gpu/drm/xe/xe_guc_submit.c       | 295 ++++++++++++++++++++++-
 drivers/gpu/drm/xe/xe_guc_submit.h       |   1 +
 drivers/gpu/drm/xe/xe_guc_types.h        |   4 +
 drivers/gpu/drm/xe/xe_trace.h            |   5 +
 7 files changed, 315 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/xe/regs/xe_guc_regs.h b/drivers/gpu/drm/xe/regs/xe_guc_regs.h
index 37e0ac550931..11b117293a62 100644
--- a/drivers/gpu/drm/xe/regs/xe_guc_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_guc_regs.h
@@ -109,6 +109,7 @@ struct guc_doorbell_info {
 
 #define DIST_DBS_POPULATED			XE_REG(0xd08)
 #define   DOORBELLS_PER_SQIDI_MASK		REG_GENMASK(23, 16)
+#define	  DOORBELLS_PER_SQIDI_SHIFT		16
 #define   SQIDIS_DOORBELL_EXIST_MASK		REG_GENMASK(15, 0)
 
 #define GUC_BCS_RCS_IER				XE_REG(0xC550)
diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
index 89d20faced19..0c87f78a868b 100644
--- a/drivers/gpu/drm/xe/xe_guc.c
+++ b/drivers/gpu/drm/xe/xe_guc.c
@@ -297,6 +297,12 @@ int xe_guc_init(struct xe_guc *guc)
  */
 int xe_guc_init_post_hwconfig(struct xe_guc *guc)
 {
+	int ret;
+
+	ret = xe_guc_submit_init_post_hwconfig(guc);
+	if (ret)
+		return ret;
+
 	return xe_guc_ads_init_post_hwconfig(&guc->ads);
 }
 
diff --git a/drivers/gpu/drm/xe/xe_guc_engine_types.h b/drivers/gpu/drm/xe/xe_guc_engine_types.h
index 5d83132034a6..420b7f53e649 100644
--- a/drivers/gpu/drm/xe/xe_guc_engine_types.h
+++ b/drivers/gpu/drm/xe/xe_guc_engine_types.h
@@ -12,6 +12,7 @@
 #include <drm/gpu_scheduler.h>
 
 struct dma_fence;
+struct xe_bo;
 struct xe_engine;
 
 /**
@@ -37,6 +38,10 @@ struct xe_guc_engine {
 	struct work_struct fini_async;
 	/** @resume_time: time of last resume */
 	u64 resume_time;
+	/** @doorbell_bo: BO for memory doorbell */
+	struct xe_bo *doorbell_bo;
+	/** @doorbell_offset: MMIO doorbell offset */
+	u32 doorbell_offset;
 	/** @state: GuC specific state for this xe_engine */
 	atomic_t state;
 	/** @wqi_head: work queue item tail */
@@ -45,6 +50,8 @@ struct xe_guc_engine {
 	u32 wqi_tail;
 	/** @id: GuC id for this xe_engine */
 	u16 id;
+	/** @doorbell_id: doorbell id */
+	u16 doorbell_id;
 	/** @suspend_wait: wait queue used to wait on pending suspends */
 	wait_queue_head_t suspend_wait;
 	/** @suspend_pending: a suspend of the engine is pending */
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index 0a41f5d04f6d..1b6f36b04cd1 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -13,7 +13,10 @@
 
 #include <drm/drm_managed.h>
 
+#include "regs/xe_guc_regs.h"
 #include "regs/xe_lrc_layout.h"
+
+#include "xe_bo.h"
 #include "xe_device.h"
 #include "xe_engine.h"
 #include "xe_force_wake.h"
@@ -26,12 +29,22 @@
 #include "xe_lrc.h"
 #include "xe_macros.h"
 #include "xe_map.h"
+#include "xe_mmio.h"
 #include "xe_mocs.h"
 #include "xe_ring_ops_types.h"
 #include "xe_sched_job.h"
 #include "xe_trace.h"
 #include "xe_vm.h"
 
+#define HAS_GUC_MMIO_DB(xe) (IS_DGFX(xe) || GRAPHICS_VERx100(xe) >= 1250)
+#define HAS_GUC_DIST_DB(xe) \
+	(GRAPHICS_VERx100(xe) >= 1200 && !HAS_GUC_MMIO_DB(xe))
+
+#define GUC_NUM_HW_DOORBELLS 256
+
+#define GUC_MMIO_DB_BAR_OFFSET SZ_4M
+#define GUC_MMIO_DB_BAR_SIZE SZ_4M
+
 static struct xe_gt *
 guc_to_gt(struct xe_guc *guc)
 {
@@ -63,6 +76,7 @@ engine_to_guc(struct xe_engine *e)
 #define ENGINE_STATE_SUSPENDED		(1 << 5)
 #define ENGINE_STATE_RESET		(1 << 6)
 #define ENGINE_STATE_KILLED		(1 << 7)
+#define ENGINE_STATE_DB_REGISTERED	(1 << 8)
 
 static bool engine_registered(struct xe_engine *e)
 {
@@ -179,6 +193,16 @@ static void set_engine_killed(struct xe_engine *e)
 	atomic_or(ENGINE_STATE_KILLED, &e->guc->state);
 }
 
+static bool engine_doorbell_registered(struct xe_engine *e)
+{
+	return atomic_read(&e->guc->state) & ENGINE_STATE_DB_REGISTERED;
+}
+
+static void set_engine_doorbell_registered(struct xe_engine *e)
+{
+	atomic_or(ENGINE_STATE_DB_REGISTERED, &e->guc->state);
+}
+
 static bool engine_killed_or_banned(struct xe_engine *e)
 {
 	return engine_killed(e) || engine_banned(e);
@@ -190,6 +214,7 @@ static void guc_submit_fini(struct drm_device *drm, void *arg)
 
 	xa_destroy(&guc->submission_state.engine_lookup);
 	ida_destroy(&guc->submission_state.guc_ids);
+	ida_destroy(&guc->submission_state.doorbell_ids);
 	bitmap_free(guc->submission_state.guc_ids_bitmap);
 }
 
@@ -230,6 +255,7 @@ int xe_guc_submit_init(struct xe_guc *guc)
 	mutex_init(&guc->submission_state.lock);
 	xa_init(&guc->submission_state.engine_lookup);
 	ida_init(&guc->submission_state.guc_ids);
+	ida_init(&guc->submission_state.doorbell_ids);
 
 	spin_lock_init(&guc->submission_state.suspend.lock);
 	guc->submission_state.suspend.context = dma_fence_context_alloc(1);
@@ -243,6 +269,237 @@ int xe_guc_submit_init(struct xe_guc *guc)
 	return 0;
 }
 
+int xe_guc_submit_init_post_hwconfig(struct xe_guc *guc)
+{
+	if (HAS_GUC_DIST_DB(guc_to_xe(guc))) {
+		u32 distdbreg = xe_mmio_read32(guc_to_gt(guc),
+					       DIST_DBS_POPULATED.reg);
+		u32 num_sqidi =
+			hweight32(distdbreg & SQIDIS_DOORBELL_EXIST_MASK);
+		/* The mask is unshifted (bits 23:16); FIELD_GET() shifts it */
+		u32 doorbells_per_sqidi =
+			FIELD_GET(DOORBELLS_PER_SQIDI_MASK, distdbreg) + 1;
+
+		guc->submission_state.num_doorbells =
+			num_sqidi * doorbells_per_sqidi;
+	} else {
+		guc->submission_state.num_doorbells = GUC_NUM_HW_DOORBELLS;
+	}
+
+	return 0;
+}
+
+static bool alloc_doorbell_id(struct xe_guc *guc, struct xe_engine *e)
+{
+	int ret;
+
+	lockdep_assert_held(&guc->submission_state.lock);
+
+	e->guc->doorbell_id = GUC_NUM_HW_DOORBELLS;
+	ret = ida_simple_get(&guc->submission_state.doorbell_ids, 0,
+			     guc->submission_state.num_doorbells, GFP_NOWAIT);
+	if (ret < 0)
+		return false;
+
+	e->guc->doorbell_id = ret;
+
+	return true;
+}
+
+static void release_doorbell_id(struct xe_guc *guc, struct xe_engine *e)
+{
+	mutex_lock(&guc->submission_state.lock);
+	ida_simple_remove(&guc->submission_state.doorbell_ids,
+			  e->guc->doorbell_id);
+	mutex_unlock(&guc->submission_state.lock);
+
+	e->guc->doorbell_id = GUC_NUM_HW_DOORBELLS;
+}
+
+static int allocate_doorbell(struct xe_guc *guc, u16 guc_id, u16 doorbell_id,
+			     u64 gpa, u32 gtt_addr)
+{
+	u32 action[] = {
+		XE_GUC_ACTION_ALLOCATE_DOORBELL,
+		guc_id,
+		doorbell_id,
+		lower_32_bits(gpa),
+		upper_32_bits(gpa),
+		gtt_addr
+	};
+
+	return xe_guc_ct_send_block(&guc->ct, action, ARRAY_SIZE(action));
+}
+
+static void deallocate_doorbell(struct xe_guc *guc, u16 guc_id)
+{
+	u32 action[] = {
+		XE_GUC_ACTION_DEALLOCATE_DOORBELL,
+		guc_id
+	};
+
+	xe_guc_ct_send(&guc->ct, action, ARRAY_SIZE(action), 0, 0);
+}
+
+static bool has_doorbell(struct xe_engine *e)
+{
+	return e->guc->doorbell_id != GUC_NUM_HW_DOORBELLS;
+}
+
+#define doorbell_read(guc_, e_, field_) ({			\
+	struct iosys_map _vmap = (e_)->guc->doorbell_bo->vmap;	\
+	iosys_map_incr(&_vmap, (e_)->guc->doorbell_offset);	\
+	xe_map_rd_field(guc_to_xe((guc_)), &_vmap, 0,		\
+				  struct guc_doorbell_info, field_); \
+	})
+#define doorbell_write(guc_, e_, field_, val_) ({		\
+	struct iosys_map _vmap = (e_)->guc->doorbell_bo->vmap;	\
+	iosys_map_incr(&_vmap, (e_)->guc->doorbell_offset);	\
+	xe_map_wr_field(guc_to_xe((guc_)), &_vmap, 0,		\
+				  struct guc_doorbell_info, field_, val_); \
+	})
+
+static void init_doorbell(struct xe_guc *guc, struct xe_engine *e)
+{
+	struct xe_device *xe = guc_to_xe(guc);
+
+	/* GuC does the initialization with distributed and MMIO doorbells */
+	if (!HAS_GUC_DIST_DB(xe) && !HAS_GUC_MMIO_DB(xe)) {
+		doorbell_write(guc, e, db_status, GUC_DOORBELL_ENABLED);
+		doorbell_write(guc, e, cookie, 0);
+	}
+}
+
+static void fini_doorbell(struct xe_guc *guc, struct xe_engine *e)
+{
+	if (!HAS_GUC_MMIO_DB(guc_to_xe(guc)) &&
+	    xe_device_mem_access_ongoing(guc_to_xe(guc)))
+		doorbell_write(guc, e, db_status, GUC_DOORBELL_DISABLED);
+}
+
+static void destroy_doorbell(struct xe_guc *guc, struct xe_engine *e)
+{
+	if (has_doorbell(e))
+		release_doorbell_id(guc, e);
+	if (e->guc->doorbell_bo) /* BO can outlive an id released on error */
+		xe_bo_unpin_map_no_vm(e->guc->doorbell_bo);
+}
+
+static void ring_memory_doorbell(struct xe_guc *guc, struct xe_engine *e)
+{
+	u32 cookie;
+
+	cookie = doorbell_read(guc, e, cookie);
+	doorbell_write(guc, e, cookie, cookie + 1 ?: cookie + 2);
+
+	XE_WARN_ON(doorbell_read(guc, e, db_status) != GUC_DOORBELL_ENABLED);
+}
+
+#define GUC_MMIO_DOORBELL_RING_ACK	0xACEDBEEF
+#define GUC_MMIO_DOORBELL_RING_NACK	0xDEADBEEF
+static void ring_mmio_doorbell(struct xe_guc *guc, u32 doorbell_offset)
+{
+	u32 db_value;
+
+	db_value = xe_mmio_read32(guc_to_gt(guc), GUC_MMIO_DB_BAR_OFFSET +
+				  doorbell_offset);
+
+	/*
+	 * The read from the doorbell page will return ack/nack. We don't remove
+	 * doorbells from active clients so we don't expect to ever get a nack.
+	 * XXX: if doorbell is lost, re-acquire it?
+	 */
+	XE_WARN_ON(db_value == GUC_MMIO_DOORBELL_RING_NACK);
+	XE_WARN_ON(db_value != GUC_MMIO_DOORBELL_RING_ACK);
+}
+
+static void ring_doorbell(struct xe_guc *guc, struct xe_engine *e)
+{
+	XE_BUG_ON(!has_doorbell(e));
+
+	if (HAS_GUC_MMIO_DB(guc_to_xe(guc)))
+		ring_mmio_doorbell(guc, e->guc->doorbell_offset);
+	else
+		ring_memory_doorbell(guc, e);
+
+	trace_xe_engine_ring_db(e);
+}
+
+static void register_engine(struct xe_engine *e);
+
+static int create_doorbell(struct xe_guc *guc, struct xe_engine *e, bool init)
+{
+	struct xe_gt *gt = guc_to_gt(guc);
+	struct xe_device *xe = gt_to_xe(gt);
+	u64 gpa;
+	u32 gtt_addr;
+	int ret;
+
+	XE_BUG_ON(!has_doorbell(e));
+
+	if (HAS_GUC_MMIO_DB(xe)) {
+		e->guc->doorbell_offset = PAGE_SIZE * e->guc->doorbell_id;
+		gpa = GUC_MMIO_DB_BAR_OFFSET + e->guc->doorbell_offset;
+		gtt_addr = 0;
+	} else {
+		struct xe_bo *bo;
+
+		if (!e->guc->doorbell_bo) {
+			bo = xe_bo_create_pin_map(xe, gt, NULL, PAGE_SIZE,
+						  ttm_bo_type_kernel,
+						  XE_BO_CREATE_VRAM_IF_DGFX(gt) |
+						  XE_BO_CREATE_GGTT_BIT);
+			if (IS_ERR(bo))
+				return PTR_ERR(bo);
+
+			e->guc->doorbell_bo = bo;
+		} else {
+			bo = e->guc->doorbell_bo;
+		}
+
+		init_doorbell(guc, e);
+		gpa = xe_bo_main_addr(bo, PAGE_SIZE);
+		gtt_addr = xe_bo_ggtt_addr(bo);
+	}
+
+	if (init && e->flags & ENGINE_FLAG_KERNEL)
+		return 0;
+
+	register_engine(e);
+	ret = allocate_doorbell(guc, e->guc->id, e->guc->doorbell_id, gpa,
+				gtt_addr);
+	if (ret < 0) {
+		fini_doorbell(guc, e);
+		return ret;
+	}
+
+	/*
+	 * In distributed doorbells, guc is returning the cacheline selected
+	 * by HW as part of the 7bit data from the allocate doorbell command:
+	 *  bit [22]   - Cacheline allocated
+	 *  bit [21:16] - Cacheline offset address
+	 * (bit 21 must be zero, or our assumption of only using half a page is
+	 * no longer correct).
+	 */
+	if (HAS_GUC_DIST_DB(xe)) {
+		u32 dd_cacheline_info;
+
+		XE_WARN_ON(!(ret & BIT(22)));
+		XE_WARN_ON(ret & BIT(21));
+
+		dd_cacheline_info = FIELD_GET(GENMASK(21, 16), ret);
+		e->guc->doorbell_offset = dd_cacheline_info * cache_line_size();
+
+		/* and verify db status was updated correctly by the guc fw */
+		XE_WARN_ON(doorbell_read(guc, e, db_status) !=
+			   GUC_DOORBELL_ENABLED);
+	}
+
+	set_engine_doorbell_registered(e);
+
+	return 0;
+}
+
 static int alloc_guc_id(struct xe_guc *guc, struct xe_engine *e)
 {
 	int ret;
@@ -623,6 +880,7 @@ static void submit_engine(struct xe_engine *e)
 	u32 num_g2h = 0;
 	int len = 0;
 	bool extra_submit = false;
+	bool enable = false;
 
 	XE_BUG_ON(!engine_registered(e));
 
@@ -642,6 +900,7 @@ static void submit_engine(struct xe_engine *e)
 		num_g2h = 1;
 		if (xe_engine_is_parallel(e))
 			extra_submit = true;
+		enable = true;
 
 		e->guc->resume_time = RESUME_PENDING;
 		set_engine_pending_enable(e);
@@ -653,7 +912,10 @@ static void submit_engine(struct xe_engine *e)
 		trace_xe_engine_submit(e);
 	}
 
-	xe_guc_ct_send(&guc->ct, action, len, g2h_len, num_g2h);
+	if (enable || !engine_doorbell_registered(e))
+		xe_guc_ct_send(&guc->ct, action, len, g2h_len, num_g2h);
+	else
+		ring_doorbell(guc, e);
 
 	if (extra_submit) {
 		len = 0;
@@ -678,8 +940,17 @@ guc_engine_run_job(struct drm_sched_job *drm_job)
 	trace_xe_sched_job_run(job);
 
 	if (!engine_killed_or_banned(e) && !xe_sched_job_is_error(job)) {
-		if (!engine_registered(e))
-			register_engine(e);
+		if (!engine_registered(e)) {
+			if (has_doorbell(e)) {
+				int err = create_doorbell(engine_to_guc(e), e,
+							  false);
+
+				/* Not fatal, but let's warn */
+				XE_WARN_ON(err);
+			} else {
+				register_engine(e);
+			}
+		}
 		if (!lr)	/* Written in IOCTL */
 			e->ring_ops->emit_job(job);
 		submit_engine(e);
@@ -722,6 +993,11 @@ static void disable_scheduling_deregister(struct xe_guc *guc,
 	MAKE_SCHED_CONTEXT_ACTION(e, DISABLE);
 	int ret;
 
+	if (has_doorbell(e)) {
+		fini_doorbell(guc, e);
+		deallocate_doorbell(guc, e->guc->id);
+	}
+
 	set_min_preemption_timeout(guc, e);
 	smp_rmb();
 	ret = wait_event_timeout(guc->ct.wq, !engine_pending_enable(e) ||
@@ -958,6 +1234,7 @@ static void __guc_engine_fini_async(struct work_struct *w)
 		cancel_work_sync(&ge->lr_tdr);
 	if (e->flags & ENGINE_FLAG_PERSISTENT)
 		xe_device_remove_persistent_engines(gt_to_xe(e->gt), e);
+	destroy_doorbell(guc, e);
 	release_guc_id(guc, e);
 	drm_sched_entity_fini(&ge->entity);
 	drm_sched_fini(&ge->sched);
@@ -1136,6 +1413,7 @@ static int guc_engine_init(struct xe_engine *e)
 	struct xe_guc_engine *ge;
 	long timeout;
 	int err;
+	bool create_db = false;
 
 	XE_BUG_ON(!xe_device_guc_submission_enabled(guc_to_xe(guc)));
 
@@ -1177,8 +1455,17 @@ static int guc_engine_init(struct xe_engine *e)
 	if (guc_read_stopped(guc))
 		drm_sched_stop(sched, NULL);
 
+	create_db = alloc_doorbell_id(guc, e);
+
 	mutex_unlock(&guc->submission_state.lock);
 
+	if (create_db) {
+		/* Error isn't fatal as we don't need a doorbell */
+		err = create_doorbell(guc, e, true);
+		if (err)
+			release_doorbell_id(guc, e);
+	}
+
 	switch (e->class) {
 	case XE_ENGINE_CLASS_RENDER:
 		sprintf(e->name, "rcs%d", e->guc->id);
@@ -1302,7 +1589,7 @@ static int guc_engine_set_job_timeout(struct xe_engine *e, u32 job_timeout_ms)
 {
 	struct drm_gpu_scheduler *sched = &e->guc->sched;
 
-	XE_BUG_ON(engine_registered(e));
+	XE_BUG_ON(engine_registered(e) && !has_doorbell(e));
 	XE_BUG_ON(engine_banned(e));
 	XE_BUG_ON(engine_killed(e));
 
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.h b/drivers/gpu/drm/xe/xe_guc_submit.h
index 8002734d6f24..bada6c02d6aa 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.h
+++ b/drivers/gpu/drm/xe/xe_guc_submit.h
@@ -13,6 +13,7 @@ struct xe_engine;
 struct xe_guc;
 
 int xe_guc_submit_init(struct xe_guc *guc);
+int xe_guc_submit_init_post_hwconfig(struct xe_guc *guc);
 void xe_guc_submit_print(struct xe_guc *guc, struct drm_printer *p);
 
 int xe_guc_submit_reset_prepare(struct xe_guc *guc);
diff --git a/drivers/gpu/drm/xe/xe_guc_types.h b/drivers/gpu/drm/xe/xe_guc_types.h
index ac7eec28934d..9ee4d572f4e0 100644
--- a/drivers/gpu/drm/xe/xe_guc_types.h
+++ b/drivers/gpu/drm/xe/xe_guc_types.h
@@ -36,10 +36,14 @@ struct xe_guc {
 		struct xarray engine_lookup;
 		/** @guc_ids: used to allocate new guc_ids, single-lrc */
 		struct ida guc_ids;
+		/** @doorbell_ids: used to allocate new doorbells */
+		struct ida doorbell_ids;
 		/** @guc_ids_bitmap: used to allocate new guc_ids, multi-lrc */
 		unsigned long *guc_ids_bitmap;
 		/** @stopped: submissions are stopped */
 		atomic_t stopped;
+		/** @num_doorbells: number of doorbells */
+		int num_doorbells;
 		/** @lock: protects submission state */
 		struct mutex lock;
 		/** @suspend: suspend fence state */
diff --git a/drivers/gpu/drm/xe/xe_trace.h b/drivers/gpu/drm/xe/xe_trace.h
index 02861c26e145..38e9d7c6197b 100644
--- a/drivers/gpu/drm/xe/xe_trace.h
+++ b/drivers/gpu/drm/xe/xe_trace.h
@@ -149,6 +149,11 @@ DEFINE_EVENT(xe_engine, xe_engine_submit,
 	     TP_ARGS(e)
 );
 
+DEFINE_EVENT(xe_engine, xe_engine_ring_db,
+	     TP_PROTO(struct xe_engine *e),
+	     TP_ARGS(e)
+);
+
 DEFINE_EVENT(xe_engine, xe_engine_scheduling_enable,
 	     TP_PROTO(struct xe_engine *e),
 	     TP_ARGS(e)
-- 
2.34.1



More information about the Intel-xe mailing list