[Intel-gfx] [PATCH 3/3] drm/i915/mtl: Add counters for engine busyness ticks
John Harrison
john.c.harrison at intel.com
Thu Oct 19 19:56:52 UTC 2023
On 10/19/2023 09:21, Dong, Zhanjun wrote:
> See comments inline below.
>
> Zhanjun
>
> On 2023-09-22 6:25 p.m., John.C.Harrison at Intel.com wrote:
>> From: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
>>
>> In new version of GuC engine busyness, GuC provides engine busyness
>> ticks as a 64 bit counter. Add a new counter to relay this value to the
>> user as is.
>>
>> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
>> Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
>> ---
>> drivers/gpu/drm/i915/gt/intel_engine.h | 1 +
>> drivers/gpu/drm/i915/gt/intel_engine_cs.c | 16 +++++
>> drivers/gpu/drm/i915/gt/intel_engine_types.h | 12 ++++
>> drivers/gpu/drm/i915/gt/intel_engine_user.c | 1 +
>> .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 67 ++++++++++++++-----
>> drivers/gpu/drm/i915/i915_pmu.c | 25 ++++++-
>> drivers/gpu/drm/i915/i915_pmu.h | 2 +-
>> include/uapi/drm/i915_drm.h | 13 +++-
>> 8 files changed, 116 insertions(+), 21 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h
>> b/drivers/gpu/drm/i915/gt/intel_engine.h
>> index b58c30ac8ef02..57af7ec8ecd82 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_engine.h
>> +++ b/drivers/gpu/drm/i915/gt/intel_engine.h
>> @@ -249,6 +249,7 @@ void intel_engine_dump_active_requests(struct
>> list_head *requests,
>> ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine,
>> ktime_t *now);
>> +u64 intel_engine_get_busy_ticks(struct intel_engine_cs *engine);
>> void intel_engine_get_hung_entity(struct intel_engine_cs *engine,
>> struct intel_context **ce, struct i915_request
>> **rq);
>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>> b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>> index 84a75c95f3f7d..1c9ffb1ae9889 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
>> @@ -2426,6 +2426,22 @@ ktime_t intel_engine_get_busy_time(struct
>> intel_engine_cs *engine, ktime_t *now)
>> return engine->busyness(engine, now);
>> }
>> +/**
>> + * intel_engine_get_busy_ticks() - Return current accumulated engine
>> busyness
>> + * ticks
>> + * @engine: engine to report on
>> + *
>> + * Returns accumulated ticks @engine was busy since engine stats
>> were enabled.
>> + */
>> +u64 intel_engine_get_busy_ticks(struct intel_engine_cs *engine)
>> +{
>> + if (!engine->busyness_ticks ||
>> + !(engine->flags & I915_ENGINE_SUPPORTS_TICKS_STATS))
>> + return 0;
>> +
>> + return engine->busyness_ticks(engine);
>> +}
>> +
>> struct intel_context *
>> intel_engine_create_virtual(struct intel_engine_cs **siblings,
>> unsigned int count, unsigned long flags)
>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h
>> b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>> index 40fd8f984d64b..a88d40c74d604 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
>> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
>> @@ -548,6 +548,11 @@ struct intel_engine_cs {
>> ktime_t (*busyness)(struct intel_engine_cs *engine,
>> ktime_t *now);
>> + /*
>> + * Get engine busyness ticks
>> + */
>> + u64 (*busyness_ticks)(struct intel_engine_cs *engine);
>> +
>> struct intel_engine_execlists execlists;
>> /*
>> @@ -574,6 +579,7 @@ struct intel_engine_cs {
>> #define I915_ENGINE_HAS_EU_PRIORITY BIT(10)
>> #define I915_ENGINE_FIRST_RENDER_COMPUTE BIT(11)
>> #define I915_ENGINE_USES_WA_HOLD_CCS_SWITCHOUT BIT(12)
>> +#define I915_ENGINE_SUPPORTS_TICKS_STATS BIT(13)
>> unsigned int flags;
>> /*
>> @@ -649,6 +655,12 @@ intel_engine_supports_stats(const struct
>> intel_engine_cs *engine)
>> return engine->flags & I915_ENGINE_SUPPORTS_STATS;
>> }
>> +static inline bool
>> +intel_engine_supports_tick_stats(const struct intel_engine_cs *engine)
>> +{
>> + return engine->flags & I915_ENGINE_SUPPORTS_TICKS_STATS;
>> +}
>> +
>> static inline bool
>> intel_engine_has_preemption(const struct intel_engine_cs *engine)
>> {
>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_user.c
>> b/drivers/gpu/drm/i915/gt/intel_engine_user.c
>> index dcedff41a825f..69eb610b5ab0a 100644
>> --- a/drivers/gpu/drm/i915/gt/intel_engine_user.c
>> +++ b/drivers/gpu/drm/i915/gt/intel_engine_user.c
>> @@ -100,6 +100,7 @@ static void set_scheduler_caps(struct
>> drm_i915_private *i915)
>> MAP(HAS_PREEMPTION, PREEMPTION),
>> MAP(HAS_SEMAPHORES, SEMAPHORES),
>> MAP(SUPPORTS_STATS, ENGINE_BUSY_STATS),
>> + MAP(SUPPORTS_TICKS_STATS, ENGINE_BUSY_TICKS_STATS),
>> #undef MAP
>> };
>> struct intel_engine_cs *engine;
>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>> index 0c1fee5360777..71749fb9ad35b 100644
>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>> @@ -1289,12 +1289,7 @@ static void
>> busy_v1_guc_update_pm_timestamp(struct intel_guc *guc, ktime_t *now)
>> guc->busy.v1.gt_stamp = ((u64)gt_stamp_hi << 32) | gt_stamp_lo;
>> }
>> -/*
>> - * Unlike the execlist mode of submission total and active times are
>> in terms of
>> - * gt clocks. The *now parameter is retained to return the cpu time
>> at which the
>> - * busyness was sampled.
>> - */
>> -static ktime_t busy_v1_guc_engine_busyness(struct intel_engine_cs
>> *engine, ktime_t *now)
>> +static u64 __busy_v1_guc_engine_busyness_ticks(struct
>> intel_engine_cs *engine, ktime_t *now_out)
>> {
>> struct intel_engine_guc_stats_v1 stats_saved, *stats =
>> &engine->stats.guc_v1;
>> struct i915_gpu_error *gpu_error = &engine->i915->gpu_error;
>> @@ -1304,6 +1299,7 @@ static ktime_t
>> busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime
>> unsigned long flags;
>> u32 reset_count;
>> bool in_reset;
>> + ktime_t now;
>> spin_lock_irqsave(&guc->busy.v1.lock, flags);
>> @@ -1318,7 +1314,7 @@ static ktime_t
>> busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime
>> reset_count = i915_reset_count(gpu_error);
>> in_reset = test_bit(I915_RESET_BACKOFF, >->reset.flags);
>> - *now = ktime_get();
>> + now = ktime_get();
>> /*
>> * The active busyness depends on start_gt_clk and gt_stamp.
>> @@ -1334,7 +1330,7 @@ static ktime_t
>> busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime
>> * start_gt_clk' calculation below for active engines.
>> */
>> busy_v1_guc_update_engine_gt_clks(engine);
>> - busy_v1_guc_update_pm_timestamp(guc, now);
>> + busy_v1_guc_update_pm_timestamp(guc, &now);
>> intel_gt_pm_put_async(gt);
>> if (i915_reset_count(gpu_error) != reset_count) {
>> *stats = stats_saved;
>> @@ -1342,16 +1338,37 @@ static ktime_t
>> busy_v1_guc_engine_busyness(struct intel_engine_cs *engine, ktime
>> }
>> }
>> - total = intel_gt_clock_interval_to_ns(gt, stats->total_gt_clks);
>> + total = stats->total_gt_clks;
>> if (stats->running) {
>> u64 clk = guc->busy.v1.gt_stamp - stats->start_gt_clk;
>> - total += intel_gt_clock_interval_to_ns(gt, clk);
>> + total += clk;
>> }
>> spin_unlock_irqrestore(&guc->busy.v1.lock, flags);
>> - return ns_to_ktime(total);
>> + if (now_out)
>> + *now_out = now;
>> +
>> + return total;
>> +}
>> +
>> +/*
>> + * Unlike the execlist mode of submission total and active times are
>> in terms of
>> + * gt clocks. The *now parameter is retained to return the cpu time
>> at which the
>> + * busyness was sampled.
>> + */
>> +static ktime_t busy_v1_guc_engine_busyness(struct intel_engine_cs
>> *engine, ktime_t *now)
>> +{
>> + u64 ticks = __busy_v1_guc_engine_busyness_ticks(engine, now);
>> + u64 ns = intel_gt_clock_interval_to_ns(engine->gt, ticks);
>> +
>> + return ns_to_ktime(ns);
>> +}
>> +
>> +static u64 busy_v1_guc_engine_busyness_ticks(struct intel_engine_cs
>> *engine)
>> +{
>> + return __busy_v1_guc_engine_busyness_ticks(engine, NULL);
>> }
>> static void busy_v1_guc_enable_worker(struct intel_guc *guc)
>> @@ -1607,6 +1624,16 @@ static ktime_t
>> busy_v2_guc_engine_busyness(struct intel_engine_cs *engine, ktime
>> return ns_to_ktime(total);
>> }
>> +static u64 busy_v2_guc_engine_busyness_ticks(struct
>> intel_engine_cs *engine)
>> +{
>> + struct intel_guc *guc = &engine->gt->uc.guc;
>> + u64 ticks_engine;
>> +
>> + __busy_v2_get_engine_usage_record(guc, engine, &ticks_engine,
>> NULL, NULL);
>> +
>> + return ticks_engine;
>> +}
>> +
>> static u64 busy_v1_intel_guc_total_active_ticks(struct intel_guc *guc)
>> {
>> return guc->busy.v1.gt_stamp;
>> @@ -4480,12 +4507,20 @@ static void guc_default_vfuncs(struct
>> intel_engine_cs *engine)
>> engine->emit_flush = gen12_emit_flush_xcs;
>> }
>> engine->set_default_submission = guc_set_default_submission;
>> - if (GUC_SUBMIT_VER(&engine->gt->uc.guc) < MAKE_GUC_VER(1, 3, 1))
>> - engine->busyness = busy_v1_guc_engine_busyness;
>> - else
>> - engine->busyness = busy_v2_guc_engine_busyness;
>> + if (GUC_SUBMIT_VER(&engine->gt->uc.guc) < MAKE_GUC_VER(1, 3, 1)) {
>> + if (GRAPHICS_VER_FULL(engine->i915) < IP_VER(12, 70))
>> + engine->busyness = busy_v1_guc_engine_busyness;
>> + engine->busyness_ticks = busy_v1_guc_engine_busyness_ticks;
>> + } else {
>> + if (GRAPHICS_VER_FULL(engine->i915) < IP_VER(12, 70))
>> + engine->busyness = busy_v2_guc_engine_busyness;
>
> Do we leave engine->busyness not initialized when >= IP_VER(12, 70)?
These vfuncs are all zero initialised and checked for null before being
called. And the intention was that MTL would not support the old API as
it not a good fit for current hardware. However, that plan was rejected
and the whole v2 implementation is currently shelved for upstream due to
power implications that need to be resolved.
John.
>
>> + engine->busyness_ticks = busy_v2_guc_engine_busyness_ticks;
>> + }
>> +
>> + if (engine->busyness)
>> + engine->flags |= I915_ENGINE_SUPPORTS_STATS;
>> - engine->flags |= I915_ENGINE_SUPPORTS_STATS;
>> + engine->flags |= I915_ENGINE_SUPPORTS_TICKS_STATS;
>> engine->flags |= I915_ENGINE_HAS_PREEMPTION;
>> engine->flags |= I915_ENGINE_HAS_TIMESLICES;
>> diff --git a/drivers/gpu/drm/i915/i915_pmu.c
>> b/drivers/gpu/drm/i915/i915_pmu.c
>> index 4f52636eb4a80..1b859556644f6 100644
>> --- a/drivers/gpu/drm/i915/i915_pmu.c
>> +++ b/drivers/gpu/drm/i915/i915_pmu.c
>> @@ -26,7 +26,8 @@
>> #define ENGINE_SAMPLE_MASK \
>> (BIT(I915_SAMPLE_BUSY) | \
>> BIT(I915_SAMPLE_WAIT) | \
>> - BIT(I915_SAMPLE_SEMA))
>> + BIT(I915_SAMPLE_SEMA) | \
>> + BIT(I915_SAMPLE_BUSY_TICKS))
>> static cpumask_t i915_pmu_cpumask;
>> static unsigned int i915_pmu_target_cpu = -1;
>> @@ -161,8 +162,11 @@ static bool pmu_needs_timer(struct i915_pmu *pmu)
>> * Also there is software busyness tracking available we do not
>> * need the timer for I915_SAMPLE_BUSY counter.
>> */
>> - if (i915->caps.scheduler & I915_SCHEDULER_CAP_ENGINE_BUSY_STATS)
>> + if ((i915->caps.scheduler &
>> I915_SCHEDULER_CAP_ENGINE_BUSY_STATS) ||
>> + (i915->caps.scheduler &
>> I915_SCHEDULER_CAP_ENGINE_BUSY_TICKS_STATS)) {
>> enable &= ~BIT(I915_SAMPLE_BUSY);
>> + enable &= ~BIT(I915_SAMPLE_BUSY_TICKS);
>> + }
>> /*
>> * If some bits remain it means we need the sampling timer
>> running.
>> @@ -362,7 +366,8 @@ static void engine_sample(struct intel_engine_cs
>> *engine, unsigned int period_ns
>> add_sample(&pmu->sample[I915_SAMPLE_SEMA], period_ns);
>> /* No need to sample when busy stats are supported. */
>> - if (intel_engine_supports_stats(engine))
>> + if (intel_engine_supports_stats(engine) ||
>> + intel_engine_supports_tick_stats(engine))
>> return;
>> /*
>> @@ -519,6 +524,13 @@ engine_event_status(struct intel_engine_cs *engine,
>> {
>> switch (sample) {
>> case I915_SAMPLE_BUSY:
>> + if (!intel_engine_supports_stats(engine))
>> + return -ENODEV;
>> + break;
>> + case I915_SAMPLE_BUSY_TICKS:
>> + if (!intel_engine_supports_tick_stats(engine))
>> + return -ENODEV;
>> + break;
>> case I915_SAMPLE_WAIT:
>> break;
>> case I915_SAMPLE_SEMA:
>> @@ -651,6 +663,9 @@ static u64 __i915_pmu_event_read(struct
>> perf_event *event)
>> val = ktime_to_ns(intel_engine_get_busy_time(engine,
>> &unused));
>> + } else if (sample == I915_SAMPLE_BUSY_TICKS &&
>> + intel_engine_supports_tick_stats(engine)) {
>> + val = intel_engine_get_busy_ticks(engine);
>> } else {
>> val = engine->pmu.sample[sample].cur;
>> }
>> @@ -1000,6 +1015,7 @@ create_event_attributes(struct i915_pmu *pmu)
>> __engine_event(I915_SAMPLE_BUSY, "busy"),
>> __engine_event(I915_SAMPLE_SEMA, "sema"),
>> __engine_event(I915_SAMPLE_WAIT, "wait"),
>> + __engine_event(I915_SAMPLE_BUSY_TICKS, "busy-ticks"),
>> };
>> unsigned int count = 0;
>> struct perf_pmu_events_attr *pmu_attr = NULL, *pmu_iter;
>> @@ -1103,6 +1119,9 @@ create_event_attributes(struct i915_pmu *pmu)
>> engine->uabi_instance,
>> engine_events[i].sample));
>> + if (engine_events[i].sample == I915_SAMPLE_BUSY_TICKS)
>> + continue;
>> +
>> str = kasprintf(GFP_KERNEL, "%s-%s.unit",
>> engine->name, engine_events[i].name);
>> if (!str)
>> diff --git a/drivers/gpu/drm/i915/i915_pmu.h
>> b/drivers/gpu/drm/i915/i915_pmu.h
>> index 41af038c37388..72a9c71944f8d 100644
>> --- a/drivers/gpu/drm/i915/i915_pmu.h
>> +++ b/drivers/gpu/drm/i915/i915_pmu.h
>> @@ -49,7 +49,7 @@ enum {
>> (I915_ENGINE_SAMPLE_COUNT + \
>> I915_PMU_MAX_GT * __I915_PMU_TRACKED_EVENT_COUNT)
>> -#define I915_ENGINE_SAMPLE_COUNT (I915_SAMPLE_SEMA + 1)
>> +#define I915_ENGINE_SAMPLE_COUNT (I915_SAMPLE_BUSY_TICKS + 1)
>> struct i915_pmu_sample {
>> u64 cur;
>> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
>> index e26dd27ff4a5f..8ae98c1bda0ea 100644
>> --- a/include/uapi/drm/i915_drm.h
>> +++ b/include/uapi/drm/i915_drm.h
>> @@ -257,7 +257,8 @@ struct i915_engine_class_instance {
>> enum drm_i915_pmu_engine_sample {
>> I915_SAMPLE_BUSY = 0,
>> I915_SAMPLE_WAIT = 1,
>> - I915_SAMPLE_SEMA = 2
>> + I915_SAMPLE_SEMA = 2,
>> + I915_SAMPLE_BUSY_TICKS = 3,
>> };
>> #define I915_PMU_SAMPLE_BITS (4)
>> @@ -274,6 +275,9 @@ enum drm_i915_pmu_engine_sample {
>> #define I915_PMU_ENGINE_BUSY(class, instance) \
>> __I915_PMU_ENGINE(class, instance, I915_SAMPLE_BUSY)
>> +#define I915_PMU_ENGINE_BUSY_TICKS(class, instance) \
>> + __I915_PMU_ENGINE(class, instance, I915_SAMPLE_BUSY_TICKS)
>> +
>> #define I915_PMU_ENGINE_WAIT(class, instance) \
>> __I915_PMU_ENGINE(class, instance, I915_SAMPLE_WAIT)
>> @@ -651,7 +655,14 @@ typedef struct drm_i915_irq_wait {
>> #define I915_SCHEDULER_CAP_PRIORITY (1ul << 1)
>> #define I915_SCHEDULER_CAP_PREEMPTION (1ul << 2)
>> #define I915_SCHEDULER_CAP_SEMAPHORES (1ul << 3)
>> +/*
>> + * BUSY_STATS is deprecated on platforms with GuC based submission
>> and will nt
>> + * be available at all on newer platforms. It has accuracy issues
>> due to the
>> + * conversions from tick counts to wall time.
>> + * BUSY_TICKS_STATS should be used instead.
>> + */
>> #define I915_SCHEDULER_CAP_ENGINE_BUSY_STATS (1ul << 4)
>> +#define I915_SCHEDULER_CAP_ENGINE_BUSY_TICKS_STATS (1ul << 5)
>> /*
>> * Indicates the 2k user priority levels are statically mapped into
>> 3 buckets as
>> * follows:
More information about the Intel-gfx
mailing list