[Intel-gfx] [PATCH 9/9] drm/i915/pmu: Enable legacy PMU events for MTL
Umesh Nerlige Ramappa
umesh.nerlige.ramappa at intel.com
Mon Apr 3 19:16:53 UTC 2023
+ Joonas for inputs
On Thu, Mar 30, 2023 at 11:31:53AM -0700, Umesh Nerlige Ramappa wrote:
>+ Joonas for comments on this
>
>On Thu, Mar 30, 2023 at 02:38:03PM +0100, Tvrtko Ursulin wrote:
>>
>>On 30/03/2023 01:41, Umesh Nerlige Ramappa wrote:
>>>MTL introduces separate GTs for render and media. This complicates the
>>>definition of frequency and rc6 counters for the GPU as a whole since
>>>each GT has an independent counter. The best way to support this change
>>>is to deprecate the GPU-specific counters and create GT-specific
>>>counters, however that just breaks ABI. Since perf tools and scripts may
>>>be decentralized with probably many users, it's hard to deprecate the
>>>legacy counters and have all the users on board with that.
>>>
>>>Re-introduce the legacy counters and support them as min/max of
>>>GT-specific counters as necessary to ensure backwards compatibility.
>>>
>>>I915_PMU_ACTUAL_FREQUENCY - will show max of GT-specific counters
>>>I915_PMU_REQUESTED_FREQUENCY - will show max of GT-specific counters
>>>I915_PMU_INTERRUPTS - no changes since it is GPU specific on all platforms
>>>I915_PMU_RC6_RESIDENCY - will show min of GT-specific counters
>>>I915_PMU_SOFTWARE_GT_AWAKE_TIME - will show max of GT-specific counters
>>
>>IMO max/min games are _very_ low value and probably just confusing.
>
>By value, do you mean ROI or actually that the values would be
>incorrect?
>
>>
>>I am not convinced we need to burden the kernel with this. New
>>platform, new counters.. userspace can just deal with it.
>
>I agree and would prefer to drop this patch. There are some counter
>arguments, I have added Joonas here for comments.
>
>1) an app/script hard-coded with the legacy events would be used on a
>new platform and fail and we should maintain backwards compatibility.
>
>2) the sysfs attributes for rc6/frequency have already adopted an
>aggregate vs gt0/gt1 approach to address that and pmu should have a
>similar solution (or rather, PMU and the sysfs approaches should match
>based on whatever is the approach)
>
>Regards,
>Umesh
>
>>
>>In intel_gpu_top we can do the smarts in maybe default aggregated
>>view (piggy back/extend on engines aggregation via command line '-p'
>>or '1' at runtime). But then it's not min/max but probably
>>normalized by number of gts.
>>
>>Regards,
>>
>>Tvrtko
>>
>>>
>>>Note:
>>>- For deeper debugging of performance issues, tools must be upgraded to
>>> read the GT-specific counters.
>>>- This patch deserves to be separate from the other PMU features so that
>>> it can be easily dropped if legacy events are ever deprecated.
>>>- Internal implementation relies on creating an extra entry in the
>>> arrays used for GT specific counters. Index 0 is empty.
>>> Index 1 through N are mapped to GTs 0 through N - 1.
>>>- User interface will use GT numbers indexed from 0 to specify the GT of
>>> interest.
>>>
>>>Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
>>>---
>>> drivers/gpu/drm/i915/i915_pmu.c | 134 +++++++++++++++++++++++++++-----
>>> drivers/gpu/drm/i915/i915_pmu.h | 2 +-
>>> include/uapi/drm/i915_drm.h | 14 ++--
>>> 3 files changed, 125 insertions(+), 25 deletions(-)
>>>
>>>diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
>>>index 9bd9605d2662..0dc7711c3b4b 100644
>>>--- a/drivers/gpu/drm/i915/i915_pmu.c
>>>+++ b/drivers/gpu/drm/i915/i915_pmu.c
>>>@@ -221,7 +221,7 @@ add_sample_mult(struct i915_pmu *pmu, unsigned int gt_id, int sample, u32 val,
>>> static u64 get_rc6(struct intel_gt *gt)
>>> {
>>> struct drm_i915_private *i915 = gt->i915;
>>>- const unsigned int gt_id = gt->info.id;
>>>+ const unsigned int gt_id = gt->info.id + 1;
>>> struct i915_pmu *pmu = &i915->pmu;
>>> unsigned long flags;
>>> bool awake = false;
>>>@@ -267,24 +267,26 @@ static void init_rc6(struct i915_pmu *pmu)
>>> for_each_gt(gt, i915, i) {
>>> intel_wakeref_t wakeref;
>>>+ const unsigned int gt_id = i + 1;
>>> with_intel_runtime_pm(gt->uncore->rpm, wakeref) {
>>> u64 val = __get_rc6(gt);
>>>- store_sample(pmu, i, __I915_SAMPLE_RC6, val);
>>>- store_sample(pmu, i, __I915_SAMPLE_RC6_LAST_REPORTED,
>>>+ store_sample(pmu, gt_id, __I915_SAMPLE_RC6, val);
>>>+ store_sample(pmu, gt_id, __I915_SAMPLE_RC6_LAST_REPORTED,
>>> val);
>>>- pmu->sleep_last[i] = ktime_get_raw();
>>>+ pmu->sleep_last[gt_id] = ktime_get_raw();
>>> }
>>> }
>>> }
>>> static void park_rc6(struct intel_gt *gt)
>>> {
>>>+ const unsigned int gt_id = gt->info.id + 1;
>>> struct i915_pmu *pmu = >->i915->pmu;
>>>- store_sample(pmu, gt->info.id, __I915_SAMPLE_RC6, __get_rc6(gt));
>>>- pmu->sleep_last[gt->info.id] = ktime_get_raw();
>>>+ store_sample(pmu, gt_id, __I915_SAMPLE_RC6, __get_rc6(gt));
>>>+ pmu->sleep_last[gt_id] = ktime_get_raw();
>>> }
>>> static void __i915_pmu_maybe_start_timer(struct i915_pmu *pmu)
>>>@@ -436,18 +438,18 @@ static void
>>> frequency_sample(struct intel_gt *gt, unsigned int period_ns)
>>> {
>>> struct drm_i915_private *i915 = gt->i915;
>>>- const unsigned int gt_id = gt->info.id;
>>>+ const unsigned int gt_id = gt->info.id + 1;
>>> struct i915_pmu *pmu = &i915->pmu;
>>> struct intel_rps *rps = >->rps;
>>>- if (!frequency_sampling_enabled(pmu, gt_id))
>>>+ if (!frequency_sampling_enabled(pmu, gt->info.id))
>>> return;
>>> /* Report 0/0 (actual/requested) frequency while parked. */
>>> if (!intel_gt_pm_get_if_awake(gt))
>>> return;
>>>- if (pmu->enable & config_mask(__I915_PMU_ACTUAL_FREQUENCY(gt_id))) {
>>>+ if (pmu->enable & config_mask(__I915_PMU_ACTUAL_FREQUENCY(gt->info.id))) {
>>> u32 val;
>>> /*
>>>@@ -467,7 +469,7 @@ frequency_sample(struct intel_gt *gt, unsigned int period_ns)
>>> val, period_ns / 1000);
>>> }
>>>- if (pmu->enable & config_mask(__I915_PMU_REQUESTED_FREQUENCY(gt_id))) {
>>>+ if (pmu->enable & config_mask(__I915_PMU_REQUESTED_FREQUENCY(gt->info.id))) {
>>> add_sample_mult(pmu, gt_id, __I915_SAMPLE_FREQ_REQ,
>>> intel_rps_get_requested_frequency(rps),
>>> period_ns / 1000);
>>>@@ -545,14 +547,15 @@ engine_event_status(struct intel_engine_cs *engine,
>>> static int
>>> config_status(struct drm_i915_private *i915, u64 config)
>>> {
>>>- struct intel_gt *gt = to_gt(i915);
>>>-
>>> unsigned int gt_id = config_gt_id(config);
>>>- unsigned int max_gt_id = HAS_EXTRA_GT_LIST(i915) ? 1 : 0;
>>>+ unsigned int max_gt_id = HAS_EXTRA_GT_LIST(i915) ? 2 : 1;
>>>+ struct intel_gt *gt;
>>> if (gt_id > max_gt_id)
>>> return -ENOENT;
>>>+ gt = !gt_id ? to_gt(i915) : i915->gt[gt_id - 1];
>>>+
>>> switch (config_counter(config)) {
>>> case I915_PMU_ACTUAL_FREQUENCY:
>>> if (IS_VALLEYVIEW(i915) || IS_CHERRYVIEW(i915))
>>>@@ -673,23 +676,58 @@ static u64 __i915_pmu_event_read_other(struct perf_event *event)
>>> const unsigned int gt_id = config_gt_id(event->attr.config);
>>> const u64 config = config_counter(event->attr.config);
>>> struct i915_pmu *pmu = &i915->pmu;
>>>+ struct intel_gt *gt;
>>> u64 val = 0;
>>>+ int i;
>>> switch (config) {
>>> case I915_PMU_ACTUAL_FREQUENCY:
>>>- val = read_sample_us(pmu, gt_id, __I915_SAMPLE_FREQ_ACT);
>>>+ if (gt_id)
>>>+ return read_sample_us(pmu, gt_id, __I915_SAMPLE_FREQ_ACT);
>>>+
>>>+ if (!HAS_EXTRA_GT_LIST(i915))
>>>+ return read_sample_us(pmu, 1, __I915_SAMPLE_FREQ_ACT);
>>>+
>>>+ for_each_gt(gt, i915, i)
>>>+ val = max(val, read_sample_us(pmu, i + 1, __I915_SAMPLE_FREQ_ACT));
>>>+
>>> break;
>>> case I915_PMU_REQUESTED_FREQUENCY:
>>>- val = read_sample_us(pmu, gt_id, __I915_SAMPLE_FREQ_REQ);
>>>+ if (gt_id)
>>>+ return read_sample_us(pmu, gt_id, __I915_SAMPLE_FREQ_REQ);
>>>+
>>>+ if (!HAS_EXTRA_GT_LIST(i915))
>>>+ return read_sample_us(pmu, 1, __I915_SAMPLE_FREQ_REQ);
>>>+
>>>+ for_each_gt(gt, i915, i)
>>>+ val = max(val, read_sample_us(pmu, i + 1, __I915_SAMPLE_FREQ_REQ));
>>>+
>>> break;
>>> case I915_PMU_INTERRUPTS:
>>> val = READ_ONCE(pmu->irq_count);
>>> break;
>>> case I915_PMU_RC6_RESIDENCY:
>>>- val = get_rc6(i915->gt[gt_id]);
>>>+ if (gt_id)
>>>+ return get_rc6(i915->gt[gt_id - 1]);
>>>+
>>>+ if (!HAS_EXTRA_GT_LIST(i915))
>>>+ return get_rc6(i915->gt[0]);
>>>+
>>>+ val = U64_MAX;
>>>+ for_each_gt(gt, i915, i)
>>>+ val = min(val, get_rc6(gt));
>>>+
>>> break;
>>> case I915_PMU_SOFTWARE_GT_AWAKE_TIME:
>>>- val = ktime_to_ns(intel_gt_get_awake_time(to_gt(i915)));
>>>+ if (gt_id)
>>>+ return ktime_to_ns(intel_gt_get_awake_time(i915->gt[gt_id - 1]));
>>>+
>>>+ if (!HAS_EXTRA_GT_LIST(i915))
>>>+ return ktime_to_ns(intel_gt_get_awake_time(i915->gt[0]));
>>>+
>>>+ val = 0;
>>>+ for_each_gt(gt, i915, i)
>>>+ val = max((s64)val, ktime_to_ns(intel_gt_get_awake_time(gt)));
>>> break;
>>> }
>>>@@ -728,11 +766,14 @@ static void i915_pmu_event_read(struct perf_event *event)
>>> static void i915_pmu_enable(struct perf_event *event)
>>> {
>>>+ const unsigned int gt_id = config_gt_id(event->attr.config);
>>> struct drm_i915_private *i915 =
>>> container_of(event->pmu, typeof(*i915), pmu.base);
>>> struct i915_pmu *pmu = &i915->pmu;
>>>+ struct intel_gt *gt;
>>> unsigned long flags;
>>> unsigned int bit;
>>>+ u64 i;
>>> bit = event_bit(event);
>>> if (bit == -1)
>>>@@ -745,12 +786,42 @@ static void i915_pmu_enable(struct perf_event *event)
>>> * the event reference counter.
>>> */
>>> BUILD_BUG_ON(ARRAY_SIZE(pmu->enable_count) != I915_PMU_MASK_BITS);
>>>+ BUILD_BUG_ON(BITS_PER_TYPE(pmu->enable) < I915_PMU_MASK_BITS);
>>> GEM_BUG_ON(bit >= ARRAY_SIZE(pmu->enable_count));
>>> GEM_BUG_ON(pmu->enable_count[bit] == ~0);
>>> pmu->enable |= BIT_ULL(bit);
>>> pmu->enable_count[bit]++;
>>>+ /*
>>>+ * The arrays that i915_pmu maintains are now indexed as
>>>+ *
>>>+ * 0 - aggregate events (a.k.a !gt_id)
>>>+ * 1 - gt0
>>>+ * 2 - gt1
>>>+ *
>>>+ * The same logic applies to event_bit masks. The first set of mask are
>>>+ * for aggregate, followed by gt0 and gt1 masks. The idea here is to
>>>+ * enable the event on all gts if the aggregate event bit is set. This
>>>+ * applies only to the non-engine-events.
>>>+ */
>>>+ if (!gt_id && !is_engine_event(event)) {
>>>+ for_each_gt(gt, i915, i) {
>>>+ u64 counter = config_counter(event->attr.config);
>>>+ u64 config = ((i + 1) << __I915_PMU_GT_SHIFT) | counter;
>>>+ unsigned int bit = config_bit(config);
>>>+
>>>+ if (bit == -1)
>>>+ continue;
>>>+
>>>+ GEM_BUG_ON(bit >= ARRAY_SIZE(pmu->enable_count));
>>>+ GEM_BUG_ON(pmu->enable_count[bit] == ~0);
>>>+
>>>+ pmu->enable |= BIT_ULL(bit);
>>>+ pmu->enable_count[bit]++;
>>>+ }
>>>+ }
>>>+
>>> /*
>>> * Start the sampling timer if needed and not already enabled.
>>> */
>>>@@ -793,6 +864,7 @@ static void i915_pmu_enable(struct perf_event *event)
>>> static void i915_pmu_disable(struct perf_event *event)
>>> {
>>>+ const unsigned int gt_id = config_gt_id(event->attr.config);
>>> struct drm_i915_private *i915 =
>>> container_of(event->pmu, typeof(*i915), pmu.base);
>>> unsigned int bit = event_bit(event);
>>>@@ -822,6 +894,26 @@ static void i915_pmu_disable(struct perf_event *event)
>>> */
>>> if (--engine->pmu.enable_count[sample] == 0)
>>> engine->pmu.enable &= ~BIT(sample);
>>>+ } else if (!gt_id) {
>>>+ struct intel_gt *gt;
>>>+ u64 i;
>>>+
>>>+ for_each_gt(gt, i915, i) {
>>>+ u64 counter = config_counter(event->attr.config);
>>>+ u64 config = ((i + 1) << __I915_PMU_GT_SHIFT) | counter;
>>>+ unsigned int bit = config_bit(config);
>>>+
>>>+ if (bit == -1)
>>>+ continue;
>>>+
>>>+ GEM_BUG_ON(bit >= ARRAY_SIZE(pmu->enable_count));
>>>+ GEM_BUG_ON(pmu->enable_count[bit] == 0);
>>>+
>>>+ if (--pmu->enable_count[bit] == 0) {
>>>+ pmu->enable &= ~BIT_ULL(bit);
>>>+ pmu->timer_enabled &= pmu_needs_timer(pmu, true);
>>>+ }
>>>+ }
>>> }
>>> GEM_BUG_ON(bit >= ARRAY_SIZE(pmu->enable_count));
>>>@@ -1002,7 +1094,11 @@ create_event_attributes(struct i915_pmu *pmu)
>>> const char *name;
>>> const char *unit;
>>> } global_events[] = {
>>>+ __event(0, "actual-frequency", "M"),
>>>+ __event(1, "requested-frequency", "M"),
>>> __event(2, "interrupts", NULL),
>>>+ __event(3, "rc6-residency", "ns"),
>>>+ __event(4, "software-gt-awake-time", "ns"),
>>> };
>>> static const struct {
>>> enum drm_i915_pmu_engine_sample sample;
>>>@@ -1024,7 +1120,7 @@ create_event_attributes(struct i915_pmu *pmu)
>>> /* per gt counters */
>>> for_each_gt(gt, i915, j) {
>>> for (i = 0; i < ARRAY_SIZE(events); i++) {
>>>- u64 config = ___I915_PMU_OTHER(j, events[i].counter);
>>>+ u64 config = ___I915_PMU_OTHER(j + 1, events[i].counter);
>>> if (!config_status(i915, config))
>>> count++;
>>>@@ -1070,7 +1166,7 @@ create_event_attributes(struct i915_pmu *pmu)
>>> /* per gt counters */
>>> for_each_gt(gt, i915, j) {
>>> for (i = 0; i < ARRAY_SIZE(events); i++) {
>>>- u64 config = ___I915_PMU_OTHER(j, events[i].counter);
>>>+ u64 config = ___I915_PMU_OTHER(j + 1, events[i].counter);
>>> char *str;
>>> if (config_status(i915, config))
>>>diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h
>>>index a708e44a227e..a4cc1eb218fc 100644
>>>--- a/drivers/gpu/drm/i915/i915_pmu.h
>>>+++ b/drivers/gpu/drm/i915/i915_pmu.h
>>>@@ -38,7 +38,7 @@ enum {
>>> __I915_NUM_PMU_SAMPLERS
>>> };
>>>-#define I915_PMU_MAX_GTS (4) /* FIXME */
>>>+#define I915_PMU_MAX_GTS (4 + 1) /* FIXME */
>>> /**
>>> * How many different events we track in the global PMU mask.
>>>diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
>>>index bbab7f3dbeb4..18794c30027f 100644
>>>--- a/include/uapi/drm/i915_drm.h
>>>+++ b/include/uapi/drm/i915_drm.h
>>>@@ -290,6 +290,7 @@ enum drm_i915_pmu_engine_sample {
>>> (((__u64)__I915_PMU_ENGINE(0xff, 0xff, 0xf) + 1 + (x)) | \
>>> ((__u64)(gt) << __I915_PMU_GT_SHIFT))
>>>+/* Aggregate from all gts */
>>> #define __I915_PMU_OTHER(x) ___I915_PMU_OTHER(0, x)
>>> #define I915_PMU_ACTUAL_FREQUENCY __I915_PMU_OTHER(0)
>>>@@ -300,11 +301,14 @@ enum drm_i915_pmu_engine_sample {
>>> #define I915_PMU_LAST /* Deprecated - do not use */ I915_PMU_RC6_RESIDENCY
>>>-#define __I915_PMU_ACTUAL_FREQUENCY(gt) ___I915_PMU_OTHER(gt, 0)
>>>-#define __I915_PMU_REQUESTED_FREQUENCY(gt) ___I915_PMU_OTHER(gt, 1)
>>>-#define __I915_PMU_INTERRUPTS(gt) ___I915_PMU_OTHER(gt, 2)
>>>-#define __I915_PMU_RC6_RESIDENCY(gt) ___I915_PMU_OTHER(gt, 3)
>>>-#define __I915_PMU_SOFTWARE_GT_AWAKE_TIME(gt) ___I915_PMU_OTHER(gt, 4)
>>>+/* GT specific counters */
>>>+#define ____I915_PMU_OTHER(gt, x) ___I915_PMU_OTHER(((gt) + 1), x)
>>>+
>>>+#define __I915_PMU_ACTUAL_FREQUENCY(gt) ____I915_PMU_OTHER(gt, 0)
>>>+#define __I915_PMU_REQUESTED_FREQUENCY(gt) ____I915_PMU_OTHER(gt, 1)
>>>+#define __I915_PMU_INTERRUPTS(gt) ____I915_PMU_OTHER(gt, 2)
>>>+#define __I915_PMU_RC6_RESIDENCY(gt) ____I915_PMU_OTHER(gt, 3)
>>>+#define __I915_PMU_SOFTWARE_GT_AWAKE_TIME(gt) ____I915_PMU_OTHER(gt, 4)
>>> /* Each region is a minimum of 16k, and there are at most 255 of them.
>>> */
More information about the Intel-gfx
mailing list