[Intel-gfx] [PATCH 1/3] drm/i915/guc: Support new and improved engine busyness

Umesh Nerlige Ramappa umesh.nerlige.ramappa at intel.com
Tue Oct 3 20:58:03 UTC 2023


On Fri, Sep 22, 2023 at 03:25:08PM -0700, John.C.Harrison at Intel.com wrote:
>From: John Harrison <John.C.Harrison at Intel.com>
>
>The GuC has been extended to support a much more friendly engine
>busyness interface. So partition the old interface into a 'busy_v1'
>space and add 'busy_v2' support alongside. And if v2 is available, use
>that in preference to v1. Note that v2 provides extra features over
>and above v1 which will be exposed via PMU in subsequent patches.

Since we are thinking of using the existing busyness counter to expose 
the v2 values, we can drop the last sentence from above.

>
>Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
>---
> drivers/gpu/drm/i915/gt/intel_engine_types.h  |   4 +-
> .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |   4 +-
> drivers/gpu/drm/i915/gt/uc/intel_guc.h        |  82 ++--
> drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c    |  55 ++-
> drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h    |   9 +-
> drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  23 +-
> .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 381 ++++++++++++++----
> 7 files changed, 427 insertions(+), 131 deletions(-)
>
>diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>index a7e6775980043..40fd8f984d64b 100644
>--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
>+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>@@ -323,7 +323,7 @@ struct intel_engine_execlists_stats {
> 	ktime_t start;
> };
>
>-struct intel_engine_guc_stats {
>+struct intel_engine_guc_stats_v1 {
> 	/**
> 	 * @running: Active state of the engine when busyness was last sampled.
> 	 */
>@@ -603,7 +603,7 @@ struct intel_engine_cs {
> 	struct {
> 		union {
> 			struct intel_engine_execlists_stats execlists;
>-			struct intel_engine_guc_stats guc;
>+			struct intel_engine_guc_stats_v1 guc_v1;
> 		};

Overall, I would suggest having the renames as a separate patch. Would 
make the review easier.

>
> 		/**
>diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
>index f359bef046e0b..c190a99a36c38 100644
>--- a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
>+++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
>@@ -137,7 +137,9 @@ enum intel_guc_action {
> 	INTEL_GUC_ACTION_DEREGISTER_CONTEXT_DONE = 0x4600,
> 	INTEL_GUC_ACTION_REGISTER_CONTEXT_MULTI_LRC = 0x4601,
> 	INTEL_GUC_ACTION_CLIENT_SOFT_RESET = 0x5507,
>-	INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF = 0x550A,
>+	INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF_V1 = 0x550A,
>+	INTEL_GUC_ACTION_SET_DEVICE_ENGINE_UTILIZATION_V2 = 0x550C,
>+	INTEL_GUC_ACTION_SET_FUNCTION_ENGINE_UTILIZATION_V2 = 0x550D,
> 	INTEL_GUC_ACTION_STATE_CAPTURE_NOTIFICATION = 0x8002,
> 	INTEL_GUC_ACTION_NOTIFY_FLUSH_LOG_BUFFER_TO_FILE = 0x8003,
> 	INTEL_GUC_ACTION_NOTIFY_CRASH_DUMP_POSTED = 0x8004,
>diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>index 6c392bad29c19..e6502ab5f049f 100644
>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>@@ -226,45 +226,61 @@ struct intel_guc {
> 	struct mutex send_mutex;
>
> 	/**
>-	 * @timestamp: GT timestamp object that stores a copy of the timestamp
>-	 * and adjusts it for overflow using a worker.
>+	 * @busy: Data used by the different versions of engine busyness implementations.
> 	 */
>-	struct {
>-		/**
>-		 * @lock: Lock protecting the below fields and the engine stats.
>-		 */
>-		spinlock_t lock;
>-
>-		/**
>-		 * @gt_stamp: 64 bit extended value of the GT timestamp.
>-		 */
>-		u64 gt_stamp;
>-
>-		/**
>-		 * @ping_delay: Period for polling the GT timestamp for
>-		 * overflow.
>-		 */
>-		unsigned long ping_delay;
>-
>-		/**
>-		 * @work: Periodic work to adjust GT timestamp, engine and
>-		 * context usage for overflows.
>-		 */
>-		struct delayed_work work;
>-
>+	union {
> 		/**
>-		 * @shift: Right shift value for the gpm timestamp
>+		 * @v1: Data used by v1 engine busyness implementation. Mostly a copy
>+		 * of the GT timestamp extended to 64 bits and the worker for maintaining it.
> 		 */
>-		u32 shift;
>+		struct {
>+			/**
>+			 * @lock: Lock protecting the below fields and the engine stats.
>+			 */
>+			spinlock_t lock;
>+
>+			/**
>+			 * @gt_stamp: 64 bit extended value of the GT timestamp.
>+			 */
>+			u64 gt_stamp;
>+
>+			/**
>+			 * @ping_delay: Period for polling the GT timestamp for
>+			 * overflow.
>+			 */
>+			unsigned long ping_delay;
>+
>+			/**
>+			 * @work: Periodic work to adjust GT timestamp, engine and
>+			 * context usage for overflows.
>+			 */
>+			struct delayed_work work;
>+
>+			/**
>+			 * @shift: Right shift value for the gpm timestamp
>+			 */
>+			u32 shift;
>+
>+			/**
>+			 * @last_stat_jiffies: jiffies at last actual stats collection time
>+			 * We use this timestamp to ensure we don't oversample the
>+			 * stats because runtime power management events can trigger
>+			 * stats collection at much higher rates than required.
>+			 */
>+			unsigned long last_stat_jiffies;
>+		} v1;
>
> 		/**
>-		 * @last_stat_jiffies: jiffies at last actual stats collection time
>-		 * We use this timestamp to ensure we don't oversample the
>-		 * stats because runtime power management events can trigger
>-		 * stats collection at much higher rates than required.
>+		 * @v2: Data used by v2 engine busyness implementation - a memory object
>+		 * that is filled in by the GuC and read by the driver.
> 		 */
>-		unsigned long last_stat_jiffies;
>-	} timestamp;
>+		struct {
>+			/** @device_vma: object allocated to hold the device level busyness data */
>+			struct i915_vma *device_vma;
>+			/** @device_map: access object for @device_vma */
>+			struct iosys_map device_map;
>+		} v2;
>+	} busy;
>
> 	/**
> 	 * @dead_guc_worker: Asynchronous worker thread for forcing a GuC reset.
>diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
>index 63724e17829a7..1ce595d6816f7 100644
>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
>@@ -59,7 +59,10 @@ struct __guc_ads_blob {
> 	struct guc_ads ads;
> 	struct guc_policies policies;
> 	struct guc_gt_system_info system_info;
>-	struct guc_engine_usage engine_usage;
>+	union {
>+		struct guc_engine_usage v1;
>+		struct guc_function_observation_data v2;
>+	} engine_usage;
> 	/* From here on, location is dynamic! Refer to above diagram. */
> 	struct guc_mmio_reg regset[];
> } __packed;
>@@ -948,18 +951,62 @@ void intel_guc_ads_reset(struct intel_guc *guc)
> 	guc_ads_private_data_reset(guc);
> }
>
>-u32 intel_guc_engine_usage_offset(struct intel_guc *guc)
>+u32 intel_guc_engine_usage_offset_pf(struct intel_guc *guc)
> {
> 	return intel_guc_ggtt_offset(guc, guc->ads_vma) +
> 		offsetof(struct __guc_ads_blob, engine_usage);
> }
>
>-struct iosys_map intel_guc_engine_usage_record_map(struct intel_engine_cs *engine)
>+struct iosys_map intel_guc_engine_usage_record_map_v1(struct intel_engine_cs *engine)
> {
> 	struct intel_guc *guc = &engine->gt->uc.guc;
> 	u8 guc_class = engine_class_to_guc_class(engine->class);
> 	size_t offset = offsetof(struct __guc_ads_blob,
>-				 engine_usage.engines[guc_class][ilog2(engine->logical_mask)]);
>+				 engine_usage.v1.engines[guc_class][ilog2(engine->logical_mask)]);
>
> 	return IOSYS_MAP_INIT_OFFSET(&guc->ads_map, offset);
> }
>+
>+int intel_guc_engine_usage_record_map_v2(struct intel_guc *guc,
>+					 struct intel_engine_cs *engine,
>+					 u32 guc_vf,
>+					 struct iosys_map *engine_map,
>+					 struct iosys_map *global_map)
>+{
>+	size_t offset_global, offset_engine;
>+	struct iosys_map *map;
>+	u32 instance;
>+	u8 guc_class;
>+
>+	if (engine) {

engine is not being passed as NULL in this patch, so we can drop the 
checks in this function.

>+		guc_class = engine_class_to_guc_class(engine->class);
>+		instance = ilog2(engine->logical_mask);
>+	}
>+
>+	if (guc_vf >= GUC_MAX_VF_COUNT) {

Is it possible to split the code in if/else blocks into seperate 
functions and do away with using guc_vf == ~0U to switch between the 2 logics.

>+		if (guc_vf != ~0U) {
>+			guc_err(guc, "Out of range VF in busyness query: 0x%X\n", guc_vf);
>+			return -EINVAL;
>+		}
>+
>+		map = &guc->busy.v2.device_map;
>+		offset_global = 0;
>+
>+		if (engine)
>+			offset_engine = offsetof(struct guc_engine_observation_data,
>+						 engine_data[guc_class][instance]);
>+	} else {
>+		map = &guc->ads_map;
>+		offset_global = offsetof(struct __guc_ads_blob,
>+					 engine_usage.v2.function_data[guc_vf]);
>+		if (engine)
>+			offset_engine = offsetof(struct __guc_ads_blob,
>+						 engine_usage.v2.function_data[guc_vf].engine_data[guc_class][instance]);

Recommending splitting the vf id based counter support to a future patch.

>+	}
>+
>+	*global_map = IOSYS_MAP_INIT_OFFSET(map, offset_global);
>+	if (engine)
>+		*engine_map = IOSYS_MAP_INIT_OFFSET(map, offset_engine);
>+
>+	return 0;
>+}

<snip>

>+static void __busy_v2_get_engine_usage_record(struct intel_guc *guc,
>+					      struct intel_engine_cs *engine,
>+					      u64 *_ticks_engine, u64 *_ticks_function,
>+					      u64 *_ticks_gt)
>+{
>+	struct iosys_map rec_map_engine, rec_map_global;
>+	u64 ticks_engine, ticks_function, ticks_gt;
>+	int i = 0, ret;
>+
>+	ret = intel_guc_engine_usage_record_map_v2(guc, engine, ~0U,
>+						   &rec_map_engine, &rec_map_global);
>+	if (ret) {
>+		ticks_engine = 0;
>+		ticks_function = 0;
>+		ticks_gt = 0;
>+		goto done;
>+	}
>+
>+#define record_read_engine(map_, field_) \
>+	iosys_map_rd_field(map_, 0, struct guc_engine_data, field_)
>+#define record_read_global(map_, field_) \
>+	iosys_map_rd_field(map_, 0, struct guc_engine_observation_data, field_)
>+
>+	do {
>+		if (engine)
>+			ticks_engine = record_read_engine(&rec_map_engine, total_execution_ticks);
>+		ticks_function = record_read_global(&rec_map_global, total_active_ticks);
>+		ticks_gt = record_read_global(&rec_map_global, gt_timestamp);
>+
>+		if (engine && (record_read_engine(&rec_map_engine, total_execution_ticks) !=
>+			       ticks_engine))
>+			continue;
>+

engine record and global record could use separate functions, maybe like
__busy_v2_get_engine_usage_record
__busy_v2_get_global_usage_record

Regards,
Umesh


>+		if (record_read_global(&rec_map_global, total_active_ticks) == ticks_function &&
>+		    record_read_global(&rec_map_global, gt_timestamp) == ticks_gt)
>+			break;
>+	} while (++i < 6);
>+
>+#undef record_read_engine
>+#undef record_read_global
>+
>+done:
>+	if (_ticks_engine)
>+		*_ticks_engine = ticks_engine;
>+	if (_ticks_function)
>+		*_ticks_function = ticks_function;
>+	if (_ticks_gt)
>+		*_ticks_gt = ticks_gt;
>+}
>+

<snip>


More information about the Intel-gfx mailing list