[PATCH 1/3] drm/i915/guc: Support new and improved engine busyness
John Harrison
john.c.harrison at intel.com
Fri Oct 6 22:28:15 UTC 2023
On 10/3/2023 13:58, Umesh Nerlige Ramappa wrote:
> On Fri, Sep 22, 2023 at 03:25:08PM -0700, John.C.Harrison at Intel.com
> wrote:
>> From: John Harrison <John.C.Harrison at Intel.com>
>>
>> The GuC has been extended to support a much more friendly engine
>> busyness interface. So partition the old interface into a 'busy_v1'
>> space and add 'busy_v2' support alongside. And if v2 is available, use
>> that in preference to v1. Note that v2 provides extra features over
>> and above v1 which will be exposed via PMU in subsequent patches.
>
> Since we are thinking of using the existing busyness counter to expose
> the v2 values, we can drop the last sentence from above.
>
>>
>> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
>> ---
>> drivers/gpu/drm/i915/gt/intel_engine_types.h | 4 +-
>> .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h | 4 +-
>> drivers/gpu/drm/i915/gt/uc/intel_guc.h | 82 ++--
>> drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c | 55 ++-
>> drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h | 9 +-
>> drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h | 23 +-
>> .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 381 ++++++++++++++----
>> 7 files changed, 427 insertions(+), 131 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h
>> b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>> index a7e6775980043..40fd8f984d64b 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
>> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>> @@ -323,7 +323,7 @@ struct intel_engine_execlists_stats {
>> ktime_t start;
>> };
>>
>> -struct intel_engine_guc_stats {
>> +struct intel_engine_guc_stats_v1 {
>> /**
>> * @running: Active state of the engine when busyness was last
>> sampled.
>> */
>> @@ -603,7 +603,7 @@ struct intel_engine_cs {
>> struct {
>> union {
>> struct intel_engine_execlists_stats execlists;
>> - struct intel_engine_guc_stats guc;
>> + struct intel_engine_guc_stats_v1 guc_v1;
>> };
>
> Overall, I would suggest having the renames as a separate patch. Would
> make the review easier.
>
>>
>> /**
>> diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
>> b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
>> index f359bef046e0b..c190a99a36c38 100644
>> --- a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
>> +++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
>> @@ -137,7 +137,9 @@ enum intel_guc_action {
>> INTEL_GUC_ACTION_DEREGISTER_CONTEXT_DONE = 0x4600,
>> INTEL_GUC_ACTION_REGISTER_CONTEXT_MULTI_LRC = 0x4601,
>> INTEL_GUC_ACTION_CLIENT_SOFT_RESET = 0x5507,
>> - INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF = 0x550A,
>> + INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF_V1 = 0x550A,
>> + INTEL_GUC_ACTION_SET_DEVICE_ENGINE_UTILIZATION_V2 = 0x550C,
>> + INTEL_GUC_ACTION_SET_FUNCTION_ENGINE_UTILIZATION_V2 = 0x550D,
>> INTEL_GUC_ACTION_STATE_CAPTURE_NOTIFICATION = 0x8002,
>> INTEL_GUC_ACTION_NOTIFY_FLUSH_LOG_BUFFER_TO_FILE = 0x8003,
>> INTEL_GUC_ACTION_NOTIFY_CRASH_DUMP_POSTED = 0x8004,
>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>> b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>> index 6c392bad29c19..e6502ab5f049f 100644
>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>> @@ -226,45 +226,61 @@ struct intel_guc {
>> struct mutex send_mutex;
>>
>> /**
>> - * @timestamp: GT timestamp object that stores a copy of the
>> timestamp
>> - * and adjusts it for overflow using a worker.
>> + * @busy: Data used by the different versions of engine busyness
>> implementations.
>> */
>> - struct {
>> - /**
>> - * @lock: Lock protecting the below fields and the engine
>> stats.
>> - */
>> - spinlock_t lock;
>> -
>> - /**
>> - * @gt_stamp: 64 bit extended value of the GT timestamp.
>> - */
>> - u64 gt_stamp;
>> -
>> - /**
>> - * @ping_delay: Period for polling the GT timestamp for
>> - * overflow.
>> - */
>> - unsigned long ping_delay;
>> -
>> - /**
>> - * @work: Periodic work to adjust GT timestamp, engine and
>> - * context usage for overflows.
>> - */
>> - struct delayed_work work;
>> -
>> + union {
>> /**
>> - * @shift: Right shift value for the gpm timestamp
>> + * @v1: Data used by v1 engine busyness implementation.
>> Mostly a copy
>> + * of the GT timestamp extended to 64 bits and the worker
>> for maintaining it.
>> */
>> - u32 shift;
>> + struct {
>> + /**
>> + * @lock: Lock protecting the below fields and the
>> engine stats.
>> + */
>> + spinlock_t lock;
>> +
>> + /**
>> + * @gt_stamp: 64 bit extended value of the GT timestamp.
>> + */
>> + u64 gt_stamp;
>> +
>> + /**
>> + * @ping_delay: Period for polling the GT timestamp for
>> + * overflow.
>> + */
>> + unsigned long ping_delay;
>> +
>> + /**
>> + * @work: Periodic work to adjust GT timestamp, engine and
>> + * context usage for overflows.
>> + */
>> + struct delayed_work work;
>> +
>> + /**
>> + * @shift: Right shift value for the gpm timestamp
>> + */
>> + u32 shift;
>> +
>> + /**
>> + * @last_stat_jiffies: jiffies at last actual stats
>> collection time
>> + * We use this timestamp to ensure we don't oversample the
>> + * stats because runtime power management events can
>> trigger
>> + * stats collection at much higher rates than required.
>> + */
>> + unsigned long last_stat_jiffies;
>> + } v1;
>>
>> /**
>> - * @last_stat_jiffies: jiffies at last actual stats
>> collection time
>> - * We use this timestamp to ensure we don't oversample the
>> - * stats because runtime power management events can trigger
>> - * stats collection at much higher rates than required.
>> + * @v2: Data used by v2 engine busyness implementation - a
>> memory object
>> + * that is filled in by the GuC and read by the driver.
>> */
>> - unsigned long last_stat_jiffies;
>> - } timestamp;
>> + struct {
>> + /** @device_vma: object allocated to hold the device
>> level busyness data */
>> + struct i915_vma *device_vma;
>> + /** @device_map: access object for @device_vma */
>> + struct iosys_map device_map;
>> + } v2;
>> + } busy;
>>
>> /**
>> * @dead_guc_worker: Asynchronous worker thread for forcing a GuC
>> reset.
>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
>> index 63724e17829a7..1ce595d6816f7 100644
>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
>> @@ -59,7 +59,10 @@ struct __guc_ads_blob {
>> struct guc_ads ads;
>> struct guc_policies policies;
>> struct guc_gt_system_info system_info;
>> - struct guc_engine_usage engine_usage;
>> + union {
>> + struct guc_engine_usage v1;
>> + struct guc_function_observation_data v2;
>> + } engine_usage;
>> /* From here on, location is dynamic! Refer to above diagram. */
>> struct guc_mmio_reg regset[];
>> } __packed;
>> @@ -948,18 +951,62 @@ void intel_guc_ads_reset(struct intel_guc *guc)
>> guc_ads_private_data_reset(guc);
>> }
>>
>> -u32 intel_guc_engine_usage_offset(struct intel_guc *guc)
>> +u32 intel_guc_engine_usage_offset_pf(struct intel_guc *guc)
>> {
>> return intel_guc_ggtt_offset(guc, guc->ads_vma) +
>> offsetof(struct __guc_ads_blob, engine_usage);
>> }
>>
>> -struct iosys_map intel_guc_engine_usage_record_map(struct
>> intel_engine_cs *engine)
>> +struct iosys_map intel_guc_engine_usage_record_map_v1(struct
>> intel_engine_cs *engine)
>> {
>> struct intel_guc *guc = &engine->gt->uc.guc;
>> u8 guc_class = engine_class_to_guc_class(engine->class);
>> size_t offset = offsetof(struct __guc_ads_blob,
>> - engine_usage.engines[guc_class][ilog2(engine->logical_mask)]);
>> + engine_usage.v1.engines[guc_class][ilog2(engine->logical_mask)]);
>>
>> return IOSYS_MAP_INIT_OFFSET(&guc->ads_map, offset);
>> }
>> +
>> +int intel_guc_engine_usage_record_map_v2(struct intel_guc *guc,
>> + struct intel_engine_cs *engine,
>> + u32 guc_vf,
>> + struct iosys_map *engine_map,
>> + struct iosys_map *global_map)
>> +{
>> + size_t offset_global, offset_engine;
>> + struct iosys_map *map;
>> + u32 instance;
>> + u8 guc_class;
>> +
>> + if (engine) {
>
> engine is not being passed as NULL in this patch, so we can drop the
> checks in this function.
>
>> + guc_class = engine_class_to_guc_class(engine->class);
>> + instance = ilog2(engine->logical_mask);
>> + }
>> +
>> + if (guc_vf >= GUC_MAX_VF_COUNT) {
>
> Is it possible to split the code in if/else blocks into seperate
> functions and do away with using guc_vf == ~0U to switch between the 2
> logics.
This was the clearest way of organising it that didn't have checkpatch
complaining about annoying things.
>
>> + if (guc_vf != ~0U) {
>> + guc_err(guc, "Out of range VF in busyness query:
>> 0x%X\n", guc_vf);
>> + return -EINVAL;
>> + }
>> +
>> + map = &guc->busy.v2.device_map;
>> + offset_global = 0;
>> +
>> + if (engine)
>> + offset_engine = offsetof(struct
>> guc_engine_observation_data,
>> + engine_data[guc_class][instance]);
>> + } else {
>> + map = &guc->ads_map;
>> + offset_global = offsetof(struct __guc_ads_blob,
>> + engine_usage.v2.function_data[guc_vf]);
>> + if (engine)
>> + offset_engine = offsetof(struct __guc_ads_blob,
>> +
>> engine_usage.v2.function_data[guc_vf].engine_data[guc_class][instance]);
>
> Recommending splitting the vf id based counter support to a future patch.
>
>> + }
>> +
>> + *global_map = IOSYS_MAP_INIT_OFFSET(map, offset_global);
>> + if (engine)
>> + *engine_map = IOSYS_MAP_INIT_OFFSET(map, offset_engine);
>> +
>> + return 0;
>> +}
>
> <snip>
>
>> +static void __busy_v2_get_engine_usage_record(struct intel_guc *guc,
>> + struct intel_engine_cs *engine,
>> + u64 *_ticks_engine, u64 *_ticks_function,
>> + u64 *_ticks_gt)
>> +{
>> + struct iosys_map rec_map_engine, rec_map_global;
>> + u64 ticks_engine, ticks_function, ticks_gt;
>> + int i = 0, ret;
>> +
>> + ret = intel_guc_engine_usage_record_map_v2(guc, engine, ~0U,
>> + &rec_map_engine, &rec_map_global);
>> + if (ret) {
>> + ticks_engine = 0;
>> + ticks_function = 0;
>> + ticks_gt = 0;
>> + goto done;
>> + }
>> +
>> +#define record_read_engine(map_, field_) \
>> + iosys_map_rd_field(map_, 0, struct guc_engine_data, field_)
>> +#define record_read_global(map_, field_) \
>> + iosys_map_rd_field(map_, 0, struct guc_engine_observation_data,
>> field_)
>> +
>> + do {
>> + if (engine)
>> + ticks_engine = record_read_engine(&rec_map_engine,
>> total_execution_ticks);
>> + ticks_function = record_read_global(&rec_map_global,
>> total_active_ticks);
>> + ticks_gt = record_read_global(&rec_map_global, gt_timestamp);
>> +
>> + if (engine && (record_read_engine(&rec_map_engine,
>> total_execution_ticks) !=
>> + ticks_engine))
>> + continue;
>> +
>
> engine record and global record could use separate functions, maybe like
> __busy_v2_get_engine_usage_record
> __busy_v2_get_global_usage_record
I originally had them split. But there is so much common code that it
was much simpler code-wise to have a single function.
John.
>
> Regards,
> Umesh
>
>
>> + if (record_read_global(&rec_map_global, total_active_ticks)
>> == ticks_function &&
>> + record_read_global(&rec_map_global, gt_timestamp) ==
>> ticks_gt)
>> + break;
>> + } while (++i < 6);
>> +
>> +#undef record_read_engine
>> +#undef record_read_global
>> +
>> +done:
>> + if (_ticks_engine)
>> + *_ticks_engine = ticks_engine;
>> + if (_ticks_function)
>> + *_ticks_function = ticks_function;
>> + if (_ticks_gt)
>> + *_ticks_gt = ticks_gt;
>> +}
>> +
>
> <snip>
More information about the dri-devel
mailing list