[Intel-gfx] [PATCH] i915/pmu: Wire GuC backend to per-client busyness
Umesh Nerlige Ramappa
umesh.nerlige.ramappa at intel.com
Tue Aug 2 23:38:45 UTC 2022
On Tue, Aug 02, 2022 at 09:41:38AM +0100, Tvrtko Ursulin wrote:
>
>On 01/08/2022 20:02, Umesh Nerlige Ramappa wrote:
>>On Wed, Jul 27, 2022 at 09:48:18AM +0100, Tvrtko Ursulin wrote:
>>>
>>>On 27/07/2022 07:01, Umesh Nerlige Ramappa wrote:
>>>>On Fri, Jun 17, 2022 at 09:00:06AM +0100, Tvrtko Ursulin wrote:
>>>>>
>>>>>On 16/06/2022 23:13, Nerlige Ramappa, Umesh wrote:
>>>>>>From: John Harrison <John.C.Harrison at Intel.com>
>>>>>>
>>>>>>GuC provides engine_id and last_switch_in ticks for an
>>>>>>active context in
>>>>>>the pphwsp. The context image provides a 32 bit total ticks
>>>>>>which is the
>>>>>>accumulated by the context (a.k.a. context[CTX_TIMESTAMP]). This
>>>>>>information is used to calculate the context busyness as follows:
>>>>>>
>>>>>>If the engine_id is valid, then busyness is the sum of
>>>>>>accumulated total
>>>>>>ticks and active ticks. Active ticks is calculated with
>>>>>>current gt time
>>>>>>as reference.
>>>>>>
>>>>>>If engine_id is invalid, busyness is equal to accumulated total ticks.
>>>>>>
>>>>>>Since KMD (CPU) retrieves busyness data from 2 sources - GPU
>>>>>>and GuC, a
>>>>>>potential race was highlighted in an earlier review that can lead to
>>>>>>double accounting of busyness. While the solution to this is a wip,
>>>>>>busyness is still usable for platforms running GuC submission.
>>>>>>
>>>>>>v2: (Tvrtko)
>>>>>>- Use COPS_RUNTIME_ACTIVE_TOTAL
>>>>>>- Add code comment for the race
>>>>>>- Undo local variables initializations
>>>>>>
>>>>>>v3:
>>>>>>- Add support for virtual engines based on
>>>>>> https://patchwork.freedesktop.org/series/105227/
>>>>>>
>>>>>>Signed-off-by: John Harrison <John.C.Harrison at Intel.com>
>>>>>>Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
>>>>>>---
>>>>>> drivers/gpu/drm/i915/gt/intel_context.c | 12 +++-
>>>>>> drivers/gpu/drm/i915/gt/intel_context.h | 6 +-
>>>>>> drivers/gpu/drm/i915/gt/intel_context_types.h | 6 ++
>>>>>> drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h | 5 ++
>>>>>> .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 65
>>>>>>++++++++++++++++++-
>>>>>> drivers/gpu/drm/i915/i915_drm_client.c | 6 +-
>>>>>> 6 files changed, 89 insertions(+), 11 deletions(-)
>>>>>>
>>>>>>diff --git a/drivers/gpu/drm/i915/gt/intel_context.c
>>>>>>b/drivers/gpu/drm/i915/gt/intel_context.c
>>>>>>index 4070cb5711d8..4a84146710e0 100644
>>>>>>--- a/drivers/gpu/drm/i915/gt/intel_context.c
>>>>>>+++ b/drivers/gpu/drm/i915/gt/intel_context.c
>>>>>>@@ -576,16 +576,24 @@ void
>>>>>>intel_context_bind_parent_child(struct intel_context
>>>>>>*parent,
>>>>>> child->parallel.parent = parent;
>>>>>> }
>>>>>>-u64 intel_context_get_total_runtime_ns(const struct
>>>>>>intel_context *ce)
>>>>>>+u64 intel_context_get_total_runtime_ns(struct intel_context *ce)
>>>>>> {
>>>>>> u64 total, active;
>>>>>>+ if (ce->ops->update_stats)
>>>>>>+ ce->ops->update_stats(ce);
>>>>>>+
>>>>>> total = ce->stats.runtime.total;
>>>>>> if (ce->ops->flags & COPS_RUNTIME_CYCLES)
>>>>>> total *= ce->engine->gt->clock_period_ns;
>>>>>> active = READ_ONCE(ce->stats.active);
>>>>>>- if (active)
>>>>>>+ /*
>>>>>>+ * When COPS_RUNTIME_ACTIVE_TOTAL is set for ce->cops,
>>>>>>the backend
>>>>>>+ * already provides the total active time of the
>>>>>>context, so skip this
>>>>>>+ * calculation when this flag is set.
>>>>>>+ */
>>>>>>+ if (active && !(ce->ops->flags & COPS_RUNTIME_ACTIVE_TOTAL))
>>>>>> active = intel_context_clock() - active;
>>>>>> return total + active;
>>>>>>diff --git a/drivers/gpu/drm/i915/gt/intel_context.h
>>>>>>b/drivers/gpu/drm/i915/gt/intel_context.h
>>>>>>index b7d3214d2cdd..5fc7c19ab29b 100644
>>>>>>--- a/drivers/gpu/drm/i915/gt/intel_context.h
>>>>>>+++ b/drivers/gpu/drm/i915/gt/intel_context.h
>>>>>>@@ -56,7 +56,7 @@ static inline bool
>>>>>>intel_context_is_parent(struct intel_context *ce)
>>>>>> return !!ce->parallel.number_children;
>>>>>> }
>>>>>>-static inline bool intel_context_is_pinned(struct intel_context *ce);
>>>>>>+static inline bool intel_context_is_pinned(const struct
>>>>>>intel_context *ce);
>>>>>> static inline struct intel_context *
>>>>>> intel_context_to_parent(struct intel_context *ce)
>>>>>>@@ -116,7 +116,7 @@ static inline int
>>>>>>intel_context_lock_pinned(struct intel_context *ce)
>>>>>> * Returns: true if the context is currently pinned for use
>>>>>>by the GPU.
>>>>>> */
>>>>>> static inline bool
>>>>>>-intel_context_is_pinned(struct intel_context *ce)
>>>>>>+intel_context_is_pinned(const struct intel_context *ce)
>>>>>> {
>>>>>> return atomic_read(&ce->pin_count);
>>>>>> }
>>>>>>@@ -351,7 +351,7 @@ intel_context_clear_nopreempt(struct
>>>>>>intel_context *ce)
>>>>>> clear_bit(CONTEXT_NOPREEMPT, &ce->flags);
>>>>>> }
>>>>>>-u64 intel_context_get_total_runtime_ns(const struct
>>>>>>intel_context *ce);
>>>>>>+u64 intel_context_get_total_runtime_ns(struct intel_context *ce);
>>>>>> u64 intel_context_get_avg_runtime_ns(struct intel_context *ce);
>>>>>> static inline u64 intel_context_clock(void)
>>>>>>diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h
>>>>>>b/drivers/gpu/drm/i915/gt/intel_context_types.h
>>>>>>index 09f82545789f..797bb4242c18 100644
>>>>>>--- a/drivers/gpu/drm/i915/gt/intel_context_types.h
>>>>>>+++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
>>>>>>@@ -38,6 +38,9 @@ struct intel_context_ops {
>>>>>> #define COPS_RUNTIME_CYCLES_BIT 1
>>>>>> #define COPS_RUNTIME_CYCLES BIT(COPS_RUNTIME_CYCLES_BIT)
>>>>>>+#define COPS_RUNTIME_ACTIVE_TOTAL_BIT 2
>>>>>>+#define COPS_RUNTIME_ACTIVE_TOTAL BIT(COPS_RUNTIME_ACTIVE_TOTAL_BIT)
>>>>>>+
>>>>>> int (*alloc)(struct intel_context *ce);
>>>>>> void (*ban)(struct intel_context *ce, struct i915_request *rq);
>>>>>>@@ -55,6 +58,8 @@ struct intel_context_ops {
>>>>>> void (*sched_disable)(struct intel_context *ce);
>>>>>>+ void (*update_stats)(struct intel_context *ce);
>>>>>>+
>>>>>> void (*reset)(struct intel_context *ce);
>>>>>> void (*destroy)(struct kref *kref);
>>>>>>@@ -146,6 +151,7 @@ struct intel_context {
>>>>>> struct ewma_runtime avg;
>>>>>> u64 total;
>>>>>> u32 last;
>>>>>>+ u64 start_gt_clk;
>>>>>> I915_SELFTEST_DECLARE(u32 num_underflow);
>>>>>> I915_SELFTEST_DECLARE(u32 max_underflow);
>>>>>> } runtime;
>>>>>>diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>>>>>b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>>>>>index b3c9a9327f76..6231ad03e4eb 100644
>>>>>>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>>>>>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
>>>>>>@@ -196,6 +196,11 @@ static inline u8
>>>>>>guc_class_to_engine_class(u8 guc_class)
>>>>>> return guc_class_engine_class_map[guc_class];
>>>>>> }
>>>>>>+/* Per context engine usage stats: */
>>>>>>+#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO (0x500 / sizeof(u32))
>>>>>>+#define PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI
>>>>>>(PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO + 1)
>>>>>>+#define PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID
>>>>>>(PPHWSP_GUC_CONTEXT_USAGE_STAMP_HI + 1)
>>>>>>+
>>>>>> /* Work item for submitting workloads into work queue of GuC. */
>>>>>> struct guc_wq_item {
>>>>>> u32 header;
>>>>>>diff --git
>>>>>>a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>>>>>b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>>>>>index 5a1dfacf24ea..cbf3cbb983ce 100644
>>>>>>--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>>>>>+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>>>>>@@ -378,7 +378,7 @@ static inline void
>>>>>>set_context_guc_id_invalid(struct intel_context *ce)
>>>>>> ce->guc_id.id = GUC_INVALID_CONTEXT_ID;
>>>>>> }
>>>>>>-static inline struct intel_guc *ce_to_guc(struct intel_context *ce)
>>>>>>+static inline struct intel_guc *ce_to_guc(const struct
>>>>>>intel_context *ce)
>>>>>> {
>>>>>> return &ce->engine->gt->uc.guc;
>>>>>> }
>>>>>>@@ -1323,13 +1323,16 @@ static void
>>>>>>__update_guc_busyness_stats(struct intel_guc *guc)
>>>>>> spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>>>>> }
>>>>>>+static void __guc_context_update_clks(struct intel_context *ce);
>>>>>> static void guc_timestamp_ping(struct work_struct *wrk)
>>>>>> {
>>>>>> struct intel_guc *guc = container_of(wrk, typeof(*guc),
>>>>>> timestamp.work.work);
>>>>>> struct intel_uc *uc = container_of(guc, typeof(*uc), guc);
>>>>>> struct intel_gt *gt = guc_to_gt(guc);
>>>>>>+ struct intel_context *ce;
>>>>>> intel_wakeref_t wakeref;
>>>>>>+ unsigned long index;
>>>>>> int srcu, ret;
>>>>>> /*
>>>>>>@@ -1343,6 +1346,10 @@ static void guc_timestamp_ping(struct
>>>>>>work_struct *wrk)
>>>>>> with_intel_runtime_pm(>->i915->runtime_pm, wakeref)
>>>>>> __update_guc_busyness_stats(guc);
>>>>>>+ /* adjust context stats for overflow */
>>>>>>+ xa_for_each(&guc->context_lookup, index, ce)
>>>>>>+ __guc_context_update_clks(ce);
>>>>>>+
>>>>>> intel_gt_reset_unlock(gt, srcu);
>>>>>> mod_delayed_work(system_highpri_wq, &guc->timestamp.work,
>>>>>>@@ -1405,6 +1412,56 @@ void intel_guc_busyness_unpark(struct
>>>>>>intel_gt *gt)
>>>>>> guc->timestamp.ping_delay);
>>>>>> }
>>>>>>+static void __guc_context_update_clks(struct intel_context *ce)
>>>>>>+{
>>>>>>+ struct intel_guc *guc = ce_to_guc(ce);
>>>>>>+ struct intel_gt *gt = ce->engine->gt;
>>>>>>+ u32 *pphwsp, last_switch, engine_id;
>>>>>>+ u64 start_gt_clk, active;
>>>>>>+ unsigned long flags;
>>>>>>+ ktime_t unused;
>>>>>>+
>>>>>>+ spin_lock_irqsave(&guc->timestamp.lock, flags);
>>>>>>+
>>>>>>+ /*
>>>>>>+ * GPU updates ce->lrc_reg_state[CTX_TIMESTAMP] when
>>>>>>context is switched
>>>>>>+ * out, however GuC updates PPHWSP offsets below. Hence KMD (CPU)
>>>>>>+ * relies on GuC and GPU for busyness calculations. Due
>>>>>>to this, A
>>>>>>+ * potential race was highlighted in an earlier review
>>>>>>that can lead to
>>>>>>+ * double accounting of busyness. While the solution to
>>>>>>this is a wip,
>>>>>>+ * busyness is still usable for platforms running GuC submission.
>>>>>>+ */
>>>>>>+ pphwsp = ((void *)ce->lrc_reg_state) - LRC_STATE_OFFSET;
>>>>>>+ last_switch =
>>>>>>READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_STAMP_LO]);
>>>>>>+ engine_id =
>>>>>>READ_ONCE(pphwsp[PPHWSP_GUC_CONTEXT_USAGE_ENGINE_ID]);
>>>>>>+
>>>>>>+ guc_update_pm_timestamp(guc, &unused);
>>>>>>+
>>>>>>+ if (engine_id != 0xffffffff && last_switch) {
>>>>>>+ start_gt_clk = READ_ONCE(ce->stats.runtime.start_gt_clk);
>>>>>>+ __extend_last_switch(guc, &start_gt_clk, last_switch);
>>>>>>+ active = intel_gt_clock_interval_to_ns(gt,
>>>>>>guc->timestamp.gt_stamp - start_gt_clk);
>>>>>>+ WRITE_ONCE(ce->stats.runtime.start_gt_clk, start_gt_clk);
>>>>>>+ WRITE_ONCE(ce->stats.active, active);
>>>>>>+ } else {
>>>>>>+ lrc_update_runtime(ce);
>>>>>>+ }
>>>>>>+
>>>>>>+ spin_unlock_irqrestore(&guc->timestamp.lock, flags);
>>>>>>+}
>>>>>>+
>>>>>>+static void guc_context_update_stats(struct intel_context *ce)
>>>>>>+{
>>>>>>+ if (!intel_context_pin_if_active(ce)) {
>>>>>>+ WRITE_ONCE(ce->stats.runtime.start_gt_clk, 0);
>>>>>>+ WRITE_ONCE(ce->stats.active, 0);
>>>>>>+ return;
>>>>>>+ }
>>>>>>+
>>>>>>+ __guc_context_update_clks(ce);
>>>>>>+ intel_context_unpin(ce);
>>>>>>+}
>>>>>>+
>>>>>> static inline bool
>>>>>> submission_disabled(struct intel_guc *guc)
>>>>>> {
>>>>>>@@ -2585,6 +2642,7 @@ static void guc_context_unpin(struct
>>>>>>intel_context *ce)
>>>>>> {
>>>>>> struct intel_guc *guc = ce_to_guc(ce);
>>>>>>+ lrc_update_runtime(ce);
>>>>>> unpin_guc_id(guc, ce);
>>>>>> lrc_unpin(ce);
>>>>>>@@ -3183,6 +3241,7 @@ static void remove_from_context(struct
>>>>>>i915_request *rq)
>>>>>> }
>>>>>> static const struct intel_context_ops guc_context_ops = {
>>>>>>+ .flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
>>>>>> .alloc = guc_context_alloc,
>>>>>> .pre_pin = guc_context_pre_pin,
>>>>>>@@ -3199,6 +3258,8 @@ static const struct intel_context_ops
>>>>>>guc_context_ops = {
>>>>>> .sched_disable = guc_context_sched_disable,
>>>>>>+ .update_stats = guc_context_update_stats,
>>>>>>+
>>>>>> .reset = lrc_reset,
>>>>>> .destroy = guc_context_destroy,
>>>>>>@@ -3432,6 +3493,7 @@ static int
>>>>>>guc_virtual_context_alloc(struct intel_context *ce)
>>>>>> }
>>>>>> static const struct intel_context_ops virtual_guc_context_ops = {
>>>>>>+ .flags = COPS_RUNTIME_CYCLES | COPS_RUNTIME_ACTIVE_TOTAL,
>>>>>> .alloc = guc_virtual_context_alloc,
>>>>>> .pre_pin = guc_virtual_context_pre_pin,
>>>>>>@@ -3447,6 +3509,7 @@ static const struct intel_context_ops
>>>>>>virtual_guc_context_ops = {
>>>>>> .exit = guc_virtual_context_exit,
>>>>>> .sched_disable = guc_context_sched_disable,
>>>>>>+ .update_stats = guc_context_update_stats,
>>>>>
>>>>>There are also virtual_parent_context_ops and
>>>>>virtual_child_context_ops - which means more test coverage is
>>>>>needed..
>>>>
>>>>Trying to come back to this... The
>>>>virtual_parent_context_ops/virtual_child_context_ops are used
>>>>for parallel engines. GuC would only update the pphwsp of the
>>>>parent context with the last_switched_in_time.
>>>>
>>>>In general, how should I report the busyness for a parallel engine?
>>>>
>>>>I would think it is busyness reported by parent context
>>>>multiplied by width.
>>>
>>>That could a reasonable approximation but I can't say for certain.
>>>Depends on the GuC scheduler implementation a bit. Like is
>>>anything preventing child contexts from finishing their useful
>>>work ahead of the parent context, or they are always strictly
>>>scheduled as one entity and child engines are blocked from taking
>>>other workloads until the parent is scheduled out?
>>
>>Correct, if a child finishes the work before parent/siblings for
>>some reason, it cannot take up other work until all siblings are
>>done.
>
>The only problem is that I guess one day that assumption might break
>and we will "never" now. If you have some spare time it would be best
>to add an IGT to verify this assumption, or at least put that work as
>TODO in the backlog?
I added some tests to IGT for parallel engine, but something is missing
in the way I am submitting the batches to the parallel engine. I see
some hangs, haven't had a chance to debug that. Will try to get to it
and then post the updated i915 patches.
Thanks,
Umesh
>
>Regards,
>
>Tvrtko
More information about the Intel-gfx
mailing list