[Intel-gfx] [PATCH 1/3] drm/i915/guc: Support new and improved engine busyness

Fri Oct 6 22:28:15 UTC 2023

On 10/3/2023 13:58, Umesh Nerlige Ramappa wrote:
> On Fri, Sep 22, 2023 at 03:25:08PM -0700, John.C.Harrison at Intel.com 
> wrote:
>> From: John Harrison <John.C.Harrison at Intel.com>
>>
>> The GuC has been extended to support a much more friendly engine
>> busyness interface. So partition the old interface into a 'busy_v1'
>> space and add 'busy_v2' support alongside. And if v2 is available, use
>> that in preference to v1. Note that v2 provides extra features over
>> and above v1 which will be exposed via PMU in subsequent patches.
>
> Since we are thinking of using the existing busyness counter to expose 
> the v2 values, we can drop the last sentence from above.
>
>>
>> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
>> ---
>> drivers/gpu/drm/i915/gt/intel_engine_types.h  |   4 +-
>> .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |   4 +-
>> drivers/gpu/drm/i915/gt/uc/intel_guc.h        |  82 ++--
>> drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c    |  55 ++-
>> drivers/gpu/drm/i915/gt/uc/intel_guc_ads.h    |   9 +-
>> drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  23 +-
>> .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 381 ++++++++++++++----
>> 7 files changed, 427 insertions(+), 131 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h 
>> b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>> index a7e6775980043..40fd8f984d64b 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
>> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>> @@ -323,7 +323,7 @@ struct intel_engine_execlists_stats {
>>     ktime_t start;
>> };
>>
>> -struct intel_engine_guc_stats {
>> +struct intel_engine_guc_stats_v1 {
>>     /**
>>      * @running: Active state of the engine when busyness was last 
>> sampled.
>>      */
>> @@ -603,7 +603,7 @@ struct intel_engine_cs {
>>     struct {
>>         union {
>>             struct intel_engine_execlists_stats execlists;
>> -            struct intel_engine_guc_stats guc;
>> +            struct intel_engine_guc_stats_v1 guc_v1;
>>         };
>
> Overall, I would suggest having the renames as a separate patch. Would 
> make the review easier.
>
>>
>>         /**
>> diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h 
>> b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
>> index f359bef046e0b..c190a99a36c38 100644
>> --- a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
>> +++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
>> @@ -137,7 +137,9 @@ enum intel_guc_action {
>>     INTEL_GUC_ACTION_DEREGISTER_CONTEXT_DONE = 0x4600,
>>     INTEL_GUC_ACTION_REGISTER_CONTEXT_MULTI_LRC = 0x4601,
>>     INTEL_GUC_ACTION_CLIENT_SOFT_RESET = 0x5507,
>> -    INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF = 0x550A,
>> +    INTEL_GUC_ACTION_SET_ENG_UTIL_BUFF_V1 = 0x550A,
>> +    INTEL_GUC_ACTION_SET_DEVICE_ENGINE_UTILIZATION_V2 = 0x550C,
>> +    INTEL_GUC_ACTION_SET_FUNCTION_ENGINE_UTILIZATION_V2 = 0x550D,
>>     INTEL_GUC_ACTION_STATE_CAPTURE_NOTIFICATION = 0x8002,
>>     INTEL_GUC_ACTION_NOTIFY_FLUSH_LOG_BUFFER_TO_FILE = 0x8003,
>>     INTEL_GUC_ACTION_NOTIFY_CRASH_DUMP_POSTED = 0x8004,
>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h 
>> b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>> index 6c392bad29c19..e6502ab5f049f 100644
>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>> @@ -226,45 +226,61 @@ struct intel_guc {
>>     struct mutex send_mutex;
>>
>>     /**
>> -     * @timestamp: GT timestamp object that stores a copy of the 
>> timestamp
>> -     * and adjusts it for overflow using a worker.
>> +     * @busy: Data used by the different versions of engine busyness 
>> implementations.
>>      */
>> -    struct {
>> -        /**
>> -         * @lock: Lock protecting the below fields and the engine 
>> stats.
>> -         */
>> -        spinlock_t lock;
>> -
>> -        /**
>> -         * @gt_stamp: 64 bit extended value of the GT timestamp.
>> -         */
>> -        u64 gt_stamp;
>> -
>> -        /**
>> -         * @ping_delay: Period for polling the GT timestamp for
>> -         * overflow.
>> -         */
>> -        unsigned long ping_delay;
>> -
>> -        /**
>> -         * @work: Periodic work to adjust GT timestamp, engine and
>> -         * context usage for overflows.
>> -         */
>> -        struct delayed_work work;
>> -
>> +    union {
>>         /**
>> -         * @shift: Right shift value for the gpm timestamp
>> +         * @v1: Data used by v1 engine busyness implementation. 
>> Mostly a copy
>> +         * of the GT timestamp extended to 64 bits and the worker 
>> for maintaining it.
>>          */
>> -        u32 shift;
>> +        struct {
>> +            /**
>> +             * @lock: Lock protecting the below fields and the 
>> engine stats.
>> +             */
>> +            spinlock_t lock;
>> +
>> +            /**
>> +             * @gt_stamp: 64 bit extended value of the GT timestamp.
>> +             */
>> +            u64 gt_stamp;
>> +
>> +            /**
>> +             * @ping_delay: Period for polling the GT timestamp for
>> +             * overflow.
>> +             */
>> +            unsigned long ping_delay;
>> +
>> +            /**
>> +             * @work: Periodic work to adjust GT timestamp, engine and
>> +             * context usage for overflows.
>> +             */
>> +            struct delayed_work work;
>> +
>> +            /**
>> +             * @shift: Right shift value for the gpm timestamp
>> +             */
>> +            u32 shift;
>> +
>> +            /**
>> +             * @last_stat_jiffies: jiffies at last actual stats 
>> collection time
>> +             * We use this timestamp to ensure we don't oversample the
>> +             * stats because runtime power management events can 
>> trigger
>> +             * stats collection at much higher rates than required.
>> +             */
>> +            unsigned long last_stat_jiffies;
>> +        } v1;
>>
>>         /**
>> -         * @last_stat_jiffies: jiffies at last actual stats 
>> collection time
>> -         * We use this timestamp to ensure we don't oversample the
>> -         * stats because runtime power management events can trigger
>> -         * stats collection at much higher rates than required.
>> +         * @v2: Data used by v2 engine busyness implementation - a 
>> memory object
>> +         * that is filled in by the GuC and read by the driver.
>>          */
>> -        unsigned long last_stat_jiffies;
>> -    } timestamp;
>> +        struct {
>> +            /** @device_vma: object allocated to hold the device 
>> level busyness data */
>> +            struct i915_vma *device_vma;
>> +            /** @device_map: access object for @device_vma */
>> +            struct iosys_map device_map;
>> +        } v2;
>> +    } busy;
>>
>>     /**
>>      * @dead_guc_worker: Asynchronous worker thread for forcing a GuC 
>> reset.
>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c 
>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
>> index 63724e17829a7..1ce595d6816f7 100644
>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
>> @@ -59,7 +59,10 @@ struct __guc_ads_blob {
>>     struct guc_ads ads;
>>     struct guc_policies policies;
>>     struct guc_gt_system_info system_info;
>> -    struct guc_engine_usage engine_usage;
>> +    union {
>> +        struct guc_engine_usage v1;
>> +        struct guc_function_observation_data v2;
>> +    } engine_usage;
>>     /* From here on, location is dynamic! Refer to above diagram. */
>>     struct guc_mmio_reg regset[];
>> } __packed;
>> @@ -948,18 +951,62 @@ void intel_guc_ads_reset(struct intel_guc *guc)
>>     guc_ads_private_data_reset(guc);
>> }
>>
>> -u32 intel_guc_engine_usage_offset(struct intel_guc *guc)
>> +u32 intel_guc_engine_usage_offset_pf(struct intel_guc *guc)
>> {
>>     return intel_guc_ggtt_offset(guc, guc->ads_vma) +
>>         offsetof(struct __guc_ads_blob, engine_usage);
>> }
>>
>> -struct iosys_map intel_guc_engine_usage_record_map(struct 
>> intel_engine_cs *engine)
>> +struct iosys_map intel_guc_engine_usage_record_map_v1(struct 
>> intel_engine_cs *engine)
>> {
>>     struct intel_guc *guc = &engine->gt->uc.guc;
>>     u8 guc_class = engine_class_to_guc_class(engine->class);
>>     size_t offset = offsetof(struct __guc_ads_blob,
>> - engine_usage.engines[guc_class][ilog2(engine->logical_mask)]);
>> + engine_usage.v1.engines[guc_class][ilog2(engine->logical_mask)]);
>>
>>     return IOSYS_MAP_INIT_OFFSET(&guc->ads_map, offset);
>> }
>> +
>> +int intel_guc_engine_usage_record_map_v2(struct intel_guc *guc,
>> +                     struct intel_engine_cs *engine,
>> +                     u32 guc_vf,
>> +                     struct iosys_map *engine_map,
>> +                     struct iosys_map *global_map)
>> +{
>> +    size_t offset_global, offset_engine;
>> +    struct iosys_map *map;
>> +    u32 instance;
>> +    u8 guc_class;
>> +
>> +    if (engine) {
>
> engine is not being passed as NULL in this patch, so we can drop the 
> checks in this function.
>
>> +        guc_class = engine_class_to_guc_class(engine->class);
>> +        instance = ilog2(engine->logical_mask);
>> +    }
>> +
>> +    if (guc_vf >= GUC_MAX_VF_COUNT) {
>
> Is it possible to split the code in if/else blocks into seperate 
> functions and do away with using guc_vf == ~0U to switch between the 2 
> logics.
This was the clearest way of organising it that didn't have checkpatch 
complaining about annoying things.

>
>> +        if (guc_vf != ~0U) {
>> +            guc_err(guc, "Out of range VF in busyness query: 
>> 0x%X\n", guc_vf);
>> +            return -EINVAL;
>> +        }
>> +
>> +        map = &guc->busy.v2.device_map;
>> +        offset_global = 0;
>> +
>> +        if (engine)
>> +            offset_engine = offsetof(struct 
>> guc_engine_observation_data,
>> +                         engine_data[guc_class][instance]);
>> +    } else {
>> +        map = &guc->ads_map;
>> +        offset_global = offsetof(struct __guc_ads_blob,
>> +                     engine_usage.v2.function_data[guc_vf]);
>> +        if (engine)
>> +            offset_engine = offsetof(struct __guc_ads_blob,
>> + 
>> engine_usage.v2.function_data[guc_vf].engine_data[guc_class][instance]);
>
> Recommending splitting the vf id based counter support to a future patch.
>
>> +    }
>> +
>> +    *global_map = IOSYS_MAP_INIT_OFFSET(map, offset_global);
>> +    if (engine)
>> +        *engine_map = IOSYS_MAP_INIT_OFFSET(map, offset_engine);
>> +
>> +    return 0;
>> +}
>
> <snip>
>
>> +static void __busy_v2_get_engine_usage_record(struct intel_guc *guc,
>> +                          struct intel_engine_cs *engine,
>> +                          u64 *_ticks_engine, u64 *_ticks_function,
>> +                          u64 *_ticks_gt)
>> +{
>> +    struct iosys_map rec_map_engine, rec_map_global;
>> +    u64 ticks_engine, ticks_function, ticks_gt;
>> +    int i = 0, ret;
>> +
>> +    ret = intel_guc_engine_usage_record_map_v2(guc, engine, ~0U,
>> +                           &rec_map_engine, &rec_map_global);
>> +    if (ret) {
>> +        ticks_engine = 0;
>> +        ticks_function = 0;
>> +        ticks_gt = 0;
>> +        goto done;
>> +    }
>> +
>> +#define record_read_engine(map_, field_) \
>> +    iosys_map_rd_field(map_, 0, struct guc_engine_data, field_)
>> +#define record_read_global(map_, field_) \
>> +    iosys_map_rd_field(map_, 0, struct guc_engine_observation_data, 
>> field_)
>> +
>> +    do {
>> +        if (engine)
>> +            ticks_engine = record_read_engine(&rec_map_engine, 
>> total_execution_ticks);
>> +        ticks_function = record_read_global(&rec_map_global, 
>> total_active_ticks);
>> +        ticks_gt = record_read_global(&rec_map_global, gt_timestamp);
>> +
>> +        if (engine && (record_read_engine(&rec_map_engine, 
>> total_execution_ticks) !=
>> +                   ticks_engine))
>> +            continue;
>> +
>
> engine record and global record could use separate functions, maybe like
> __busy_v2_get_engine_usage_record
> __busy_v2_get_global_usage_record
I originally had them split. But there is so much common code that it 
was much simpler code-wise to have a single function.

John.

>
> Regards,
> Umesh
>
>
>> +        if (record_read_global(&rec_map_global, total_active_ticks) 
>> == ticks_function &&
>> +            record_read_global(&rec_map_global, gt_timestamp) == 
>> ticks_gt)
>> +            break;
>> +    } while (++i < 6);
>> +
>> +#undef record_read_engine
>> +#undef record_read_global
>> +
>> +done:
>> +    if (_ticks_engine)
>> +        *_ticks_engine = ticks_engine;
>> +    if (_ticks_function)
>> +        *_ticks_function = ticks_function;
>> +    if (_ticks_gt)
>> +        *_ticks_gt = ticks_gt;
>> +}
>> +
>
> <snip>