[PATCH v10 4/4] drm/xe/pmu: Add GT frequency events
Belgaumkar, Vinay
vinay.belgaumkar at intel.com
Mon Dec 23 08:18:38 UTC 2024
On 12/20/2024 1:17 PM, Rodrigo Vivi wrote:
> On Thu, Dec 19, 2024 at 05:19:10PM -0800, Vinay Belgaumkar wrote:
>> Define PMU events for GT frequency (actual and requested). This is
>> a port from the i915 driver implementation, where an internal timer
>> is used to aggregate GT frequencies over certain fixed interval.
>> The following PMU events are being added:
>>
>> xe_0000_00_02.0/actual-frequency/ [Kernel PMU event]
>> xe_0000_00_02.0/requested-frequency/ [Kernel PMU event]
>>
>> Standard perf commands can be used to monitor GT frequency-
>> $ perf stat -e xe_0000_00_02.0/requested-frequency,gt_id=0/ -I1000
>>
>> 1.001175175 700 M xe/requested-frequency,gt_id=0/
>> 2.005891881 703 M xe/requested-frequency,gt_id=0/
>> 3.007318169 700 M xe/requested-frequency,gt_id=0/
>>
>> Actual/requested frequencies will be 0 when GT is suspended.
>>
>> v2: Checkpatch fix, moved timer code to this patch
>> v3: Fix kunit issue
>> v4: Checkpatch warning fixes
>> v5: Make PMU events per device (Lucas)
>> v6: Reuse bits from config mask for gt_id (Lucas)
>> v7: Fix bug in pmu_enable (Riana)
>>
>> Cc: Lucas De Marchi <lucas.demarchi at intel.com>
>> Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
>> Reviewed-by: Rodrigo Vivi <rodrigo.vivi at intel.com> #v3
>> Signed-off-by: Vinay Belgaumkar <vinay.belgaumkar at intel.com>
>> ---
>> drivers/gpu/drm/xe/xe_gt.c | 2 +
>> drivers/gpu/drm/xe/xe_pmu.c | 259 +++++++++++++++++++++++++++++-
>> drivers/gpu/drm/xe/xe_pmu.h | 2 +
>> drivers/gpu/drm/xe/xe_pmu_types.h | 26 +++
>> 4 files changed, 288 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
>> index 64e60bcf131a..88ce52c63018 100644
>> --- a/drivers/gpu/drm/xe/xe_gt.c
>> +++ b/drivers/gpu/drm/xe/xe_gt.c
>> @@ -939,6 +939,8 @@ int xe_gt_resume(struct xe_gt *gt)
>>
>> xe_gt_idle_enable_pg(gt);
>>
>> + xe_pmu_resume(gt);
>> +
>> xe_force_wake_put(gt_to_fw(gt), fw_ref);
>> xe_gt_dbg(gt, "resumed\n");
>>
>> diff --git a/drivers/gpu/drm/xe/xe_pmu.c b/drivers/gpu/drm/xe/xe_pmu.c
>> index 1115724a580d..7b0921ef8a1b 100644
>> --- a/drivers/gpu/drm/xe/xe_pmu.c
>> +++ b/drivers/gpu/drm/xe/xe_pmu.c
>> @@ -26,6 +26,8 @@
>> static cpumask_t xe_pmu_cpumask;
>> static unsigned int xe_pmu_target_cpu = -1;
>>
>> +#define FREQUENCY 200
> this is too generic... please add a better name to represent what this is for...
> specially in a case that is adding gt frequency stuff this can get very confusing...
ok, will use sampling_freq or something similar.
>
>> +
>> /**
>> * DOC: Xe PMU (Performance Monitoring Unit)
>> *
>> @@ -65,6 +67,29 @@ static unsigned int xe_pmu_target_cpu = -1;
>> * 2352945
>> *
>> * Each value is roughly a 1000ms increment here as well. This is expected GT residency when idle.
>> + *
>> + * PMU frequency events use a software timer to aggregate GT freq values over the time of capture.
>> + * This allows us to calculate a rough average over the timespan. This is why sysfs is the best way
>> + * to obtain instantaneous frequency if accuracy is intended. Advantage of using PMU is that it
>> + * results in lower CPU utilization compared to dumping sysfs entries repeatedly.
>> + *
>> + * To list GT frequency events, use the following-
>> + *
>> + * $ perf list | grep frequency
>> + * xe_0000_00_02.0/actual-frequency/ [Kernel PMU event]
>> + * xe_0000_00_02.0/requested-frequency/ [Kernel PMU event]
>> + *
>> + * $ perf stat -e xe_0000_00_02.0/requested-frequency,xe_gt_id=0/ -I1000
>> + * time counts unit events
>> + * 1.001189056 1950 M xe_0000_00_02.0/requested-frequency,xe_gt_id=0/
>> + * 2.006388494 1960 M xe_0000_00_02.0/requested-frequency,xe_gt_id=0/
>> + * 3.007930311 1959 M xe_0000_00_02.0/requested-frequency,xe_gt_id=0/
>> + *
>> + * Dumping requested freq from sysfs-
>> + * $ while true; do cat /sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq ; sleep 1; done
>> + * 1950
>> + * 1950
>> + * 1950
>> */
>>
>> static struct xe_pmu *event_to_pmu(struct perf_event *event)
>> @@ -90,6 +115,12 @@ static unsigned int pm_bit(const u64 config)
>> case XE_PMU_C6_RESIDENCY:
>> val = __XE_PMU_C6_RESIDENCY_ENABLED;
>> break;
>> + case XE_PMU_ACTUAL_FREQUENCY:
>> + val = __XE_PMU_ACTUAL_FREQUENCY_ENABLED;
>> + break;
>> + case XE_PMU_REQUESTED_FREQUENCY:
>> + val = __XE_PMU_REQUESTED_FREQUENCY_ENABLED;
>> + break;
>> default:
>> /*
>> * Events that do not require sampling, or tracking state
>> @@ -106,6 +137,22 @@ static unsigned int config_bit(const u64 config)
>> return pm_bit(config);
>> }
>>
>> +static u32 config_mask(const u64 config)
>> +{
>> + unsigned int bit = config_bit(config);
>> +
>> + if (__builtin_constant_p(config))
>> + BUILD_BUG_ON(bit >
>> + BITS_PER_TYPE(typeof_member(struct xe_pmu,
>> + enable)) - 1);
>> + else
>> + WARN_ON_ONCE(bit >
>> + BITS_PER_TYPE(typeof_member(struct xe_pmu,
>> + enable)) - 1);
>> +
>> + return BIT(config_bit(config));
>> +}
>> +
>> static unsigned int event_bit(struct perf_event *event)
>> {
>> return config_bit(event->attr.config);
>> @@ -134,6 +181,10 @@ config_status(struct xe_device *xe, u64 config)
>> if (xe->info.skip_guc_pc)
>> return -ENODEV;
>> break;
>> + case XE_PMU_ACTUAL_FREQUENCY:
>> + fallthrough;
>> + case XE_PMU_REQUESTED_FREQUENCY:
>> + break;
>> default:
>> return -ENOENT;
>> }
>> @@ -198,6 +249,12 @@ store_sample(struct xe_pmu *pmu, unsigned int gt_id, int sample, u64 val)
>> pmu->event_sample[gt_id][sample].cur = val;
>> }
>>
>> +static void
>> +add_sample_mult(struct xe_pmu *pmu, unsigned int gt_id, int sample, u32 val, u32 mul)
>> +{
>> + pmu->event_sample[gt_id][sample].cur += mul_u32_u32(val, mul);
>> +}
>> +
>> static u64 get_c6(struct xe_gt *gt)
>> {
>> struct xe_device *xe = gt_to_xe(gt);
>> @@ -243,6 +300,7 @@ static u64 __xe_pmu_event_read(struct perf_event *event)
>> {
>> struct xe_device *xe =
>> container_of(event->pmu, typeof(*xe), pmu.base);
>> + struct xe_pmu *pmu = &xe->pmu;
>> const u64 config = event->attr.config;
>> const u64 gt_id = config >> __XE_PMU_GT_SHIFT;
>> struct xe_gt *gt = xe_device_get_gt(xe, gt_id);
>> @@ -252,6 +310,18 @@ static u64 __xe_pmu_event_read(struct perf_event *event)
>> case XE_PMU_C6_RESIDENCY:
>> val = get_c6(gt);
>> break;
>> + case XE_PMU_ACTUAL_FREQUENCY:
>> + val =
>> + div_u64(read_sample(pmu, gt_id,
>> + __XE_SAMPLE_FREQ_ACT),
>> + USEC_PER_SEC /* to MHz */);
>> + break;
>> + case XE_PMU_REQUESTED_FREQUENCY:
>> + val =
>> + div_u64(read_sample(pmu, gt_id,
>> + __XE_SAMPLE_FREQ_REQ),
>> + USEC_PER_SEC /* to MHz */);
>> + break;
>> default:
>> drm_warn(>->tile->xe->drm, "unknown pmu event\n");
>> }
>> @@ -280,11 +350,153 @@ static void xe_pmu_event_read(struct perf_event *event)
>> local64_add(new - prev, &event->count);
>> }
>>
>> +static u32 frequency_enabled_mask(void)
>> +{
>> + unsigned int i;
>> + u32 mask = 0;
>> +
>> + for (i = 0; i < XE_PMU_MAX_GT; i++)
>> + mask |= config_mask(__XE_PMU_ACTUAL_FREQUENCY(i)) |
>> + config_mask(__XE_PMU_REQUESTED_FREQUENCY(i));
>> +
>> + return mask;
>> +}
>> +
>> +static bool
>> +frequency_sampling_enabled(struct xe_pmu *pmu, unsigned int gt)
>> +{
>> + return pmu->enable &
>> + (config_mask(__XE_PMU_ACTUAL_FREQUENCY(gt)) |
>> + config_mask(__XE_PMU_REQUESTED_FREQUENCY(gt)));
>> +}
>> +
>> +static void
>> +frequency_sample(struct xe_gt *gt, unsigned int period_ns)
>> +{
>> + struct xe_device *xe = gt_to_xe(gt);
>> + const unsigned int gt_id = gt->info.id;
>> + struct xe_pmu *pmu = &xe->pmu;
>> + bool device_awake;
>> + int ret;
>> + u32 cur_freq;
>> +
>> + if (!frequency_sampling_enabled(pmu, gt_id))
>> + return;
>> +
>> + /* Report 0/0 (actual/requested) frequency while GT is suspended. */
>> + device_awake = xe_pm_runtime_get_if_active(xe);
>> + if (!device_awake)
>> + return;
>> +
>> + if (pmu->enable & config_mask(__XE_PMU_ACTUAL_FREQUENCY(gt_id))) {
>> + u32 val;
>> +
>> + /*
>> + * We take a quick peek here without using forcewake
>> + * so that we don't perturb the system under observation
>> + * (forcewake => !rc6 => increased power use). We expect
>> + * that if the read fails because it is outside of the
>> + * mmio power well, then it will return 0 -- in which
>> + * case we assume the system is running at the intended
>> + * frequency. Fortunately, the read should rarely fail!
>> + */
> I'm just wondering here if we could/should use the latest know valid freq
> when we read 0...?!
When the system is idle, we want the actual freq to be 0. If we use the
last read non-zero value, it will depict incorrect act_freq.
Thanks,
Vinay.
>
>> + val = xe_guc_pc_get_act_freq(>->uc.guc.pc);
>> +
>> + add_sample_mult(pmu, gt_id, __XE_SAMPLE_FREQ_ACT,
>> + val, period_ns / 1000);
>> + }
>> +
>> + if (pmu->enable & config_mask(__XE_PMU_REQUESTED_FREQUENCY(gt_id))) {
>> + ret = xe_guc_pc_get_cur_freq(>->uc.guc.pc, &cur_freq);
>> + if (!ret)
>> + add_sample_mult(pmu, gt_id, __XE_SAMPLE_FREQ_REQ,
>> + cur_freq,
>> + period_ns / 1000);
>> + }
>> +
>> + xe_pm_runtime_put(xe);
>> +}
>> +
>> +static enum hrtimer_restart xe_sample(struct hrtimer *hrtimer)
>> +{
>> + struct xe_pmu *pmu = container_of(hrtimer, struct xe_pmu, timer);
>> + struct xe_device *xe = container_of(pmu, typeof(*xe), pmu);
>> + u64 period = max_t(u64, 10000, NSEC_PER_SEC / FREQUENCY);
>> + unsigned int period_ns;
>> + struct xe_gt *gt;
>> + unsigned int i;
>> + ktime_t now;
>> +
>> + if (!READ_ONCE(pmu->timer_enabled))
>> + return HRTIMER_NORESTART;
>> +
>> + now = ktime_get();
>> + period_ns = ktime_to_ns(ktime_sub(now, pmu->timer_last));
>> + pmu->timer_last = now;
>> +
>> + /*
>> + * Strictly speaking the passed in period may not be 100% accurate for
>> + * all internal calculation, since some amount of time can be spent on
>> + * grabbing the forcewake. However the potential error from timer call-
>> + * back delay greatly dominates this so we keep it simple.
>> + */
>> +
>> + for_each_gt(gt, xe, i) {
>> + if (!(pmu->active_gts & BIT(i)))
>> + continue;
>> + frequency_sample(gt, period_ns);
>> + }
>> +
>> + hrtimer_forward(hrtimer, now, ns_to_ktime(period));
>> +
>> + return HRTIMER_RESTART;
>> +}
>> +
>> +static bool pmu_needs_timer(struct xe_pmu *pmu)
>> +{
>> + u32 enable;
>> +
>> + /*
>> + * Only some counters need the sampling timer.
>> + *
>> + * We start with a bitmask of all currently enabled events.
>> + */
>> + enable = pmu->enable;
>> +
>> + /*
>> + * Mask out all the ones which do not need the timer, or in
>> + * other words keep all the ones that could need the timer.
>> + */
>> + enable &= frequency_enabled_mask();
>> +
>> + /*
>> + * If some bits remain it means we need the sampling timer running.
>> + */
>> + return enable;
>> +}
>> +
>> +static void __xe_pmu_maybe_start_timer(struct xe_pmu *pmu)
>> +{
>> + u64 period = max_t(u64, 10000, NSEC_PER_SEC / FREQUENCY);
>> +
>> + if (!pmu->timer_enabled && pmu_needs_timer(pmu)) {
>> + pmu->timer_enabled = true;
>> + pmu->timer_last = ktime_get();
>> + hrtimer_start_range_ns(&pmu->timer,
>> + ns_to_ktime(period), 0,
>> + HRTIMER_MODE_REL_PINNED);
>> + }
>> +}
>> +
>> static void xe_pmu_enable(struct perf_event *event)
>> {
>> struct xe_pmu *pmu = event_to_pmu(event);
>> + struct xe_device *xe = container_of(pmu, typeof(*xe), pmu);
>> + struct xe_gt *gt;
>> const unsigned int bit = event_bit(event);
>> unsigned long flags;
>> + bool device_awake;
>> + unsigned int i;
>>
>> if (bit == -1)
>> goto update;
>> @@ -302,6 +514,18 @@ static void xe_pmu_enable(struct perf_event *event)
>> pmu->enable |= BIT(bit);
>> pmu->enable_count[bit]++;
>>
>> + /*
>> + * Start the sampling timer if needed and not already enabled.
>> + */
>> + __xe_pmu_maybe_start_timer(pmu);
>> +
>> + device_awake = xe_pm_runtime_get_if_active(xe);
>> + if (device_awake) {
>> + for_each_gt(gt, xe, i)
>> + pmu->active_gts |= BIT(gt->info.id);
>> + xe_pm_runtime_put(xe);
>> + }
>> +
>> raw_spin_unlock_irqrestore(&pmu->lock, flags);
>> update:
>> /*
>> @@ -331,8 +555,10 @@ static void xe_pmu_disable(struct perf_event *event)
>> * Decrement the reference count and clear the enabled
>> * bitmask when the last listener on an event goes away.
>> */
>> - if (--pmu->enable_count[bit] == 0)
>> + if (--pmu->enable_count[bit] == 0) {
>> pmu->enable &= ~BIT(bit);
>> + pmu->timer_enabled &= pmu_needs_timer(pmu);
>> + }
>>
>> raw_spin_unlock_irqrestore(&pmu->lock, flags);
>> }
>> @@ -497,6 +723,8 @@ create_event_attributes(struct xe_pmu *pmu)
>> const char *unit;
>> } events[] = {
>> __event(0, "c6-residency", "ms"),
>> + __event(1, "actual-frequency", "M"),
>> + __event(2, "requested-frequency", "M"),
>> };
>>
>> struct perf_pmu_events_attr *pmu_attr = NULL, *pmu_iter;
>> @@ -705,6 +933,31 @@ void xe_pmu_suspend(struct xe_gt *gt)
>> raw_spin_unlock_irq(&pmu->lock);
>> }
>>
>> +/**
>> + * xe_pmu_resume() - Restart the timer if needed
>> + * @gt: GT object
>> + */
>> +void xe_pmu_resume(struct xe_gt *gt)
>> +{
>> + struct xe_device *xe = gt_to_xe(gt);
>> + struct xe_pmu *pmu = &xe->pmu;
>> +
>> + if (!pmu->base.event_init)
>> + return;
>> +
>> + raw_spin_lock_irq(&pmu->lock);
>> +
>> + /*
>> + * Re-enable sampling timer when GPU goes active.
>> + */
>> + if (pmu->active_gts == 0)
>> + __xe_pmu_maybe_start_timer(pmu);
>> +
>> + pmu->active_gts |= BIT(gt->info.id);
>> +
>> + raw_spin_unlock_irq(&pmu->lock);
>> +}
>> +
>> /**
>> * xe_pmu_unregister() - Remove/cleanup PMU registration
>> * @arg: Ptr to pmu
>> @@ -722,6 +975,8 @@ void xe_pmu_unregister(void *arg)
>>
>> pmu->registered = false;
>>
>> + hrtimer_cancel(&pmu->timer);
>> +
>> xe_pmu_unregister_cpuhp_state(pmu);
>>
>> perf_pmu_unregister(&pmu->base);
>> @@ -769,6 +1024,8 @@ void xe_pmu_register(struct xe_pmu *pmu)
>> return;
>>
>> raw_spin_lock_init(&pmu->lock);
>> + hrtimer_init(&pmu->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
>> + pmu->timer.function = xe_sample;
>> pmu->cpuhp.cpu = -1;
>>
>> pmu->name = kasprintf(GFP_KERNEL,
>> diff --git a/drivers/gpu/drm/xe/xe_pmu.h b/drivers/gpu/drm/xe/xe_pmu.h
>> index 17f5a8d7d45c..fb6e3819d7bc 100644
>> --- a/drivers/gpu/drm/xe/xe_pmu.h
>> +++ b/drivers/gpu/drm/xe/xe_pmu.h
>> @@ -16,12 +16,14 @@ void xe_pmu_exit(void);
>> void xe_pmu_register(struct xe_pmu *pmu);
>> void xe_pmu_unregister(void *arg);
>> void xe_pmu_suspend(struct xe_gt *gt);
>> +void xe_pmu_resume(struct xe_gt *gt);
>> #else
>> static inline int xe_pmu_init(void) { return 0; }
>> static inline void xe_pmu_exit(void) {}
>> static inline void xe_pmu_register(struct xe_pmu *pmu) {}
>> static inline void xe_pmu_unregister(void *arg) {}
>> static inline void xe_pmu_suspend(struct xe_gt *gt) {}
>> +static inline void xe_pmu_resume(struct xe_gt *gt) {}
>> #endif
>>
>> #endif
>> diff --git a/drivers/gpu/drm/xe/xe_pmu_types.h b/drivers/gpu/drm/xe/xe_pmu_types.h
>> index f47a6e1b109c..5d873bae4a0d 100644
>> --- a/drivers/gpu/drm/xe/xe_pmu_types.h
>> +++ b/drivers/gpu/drm/xe/xe_pmu_types.h
>> @@ -12,6 +12,8 @@
>> enum {
>> __XE_SAMPLE_C6,
>> __XE_SAMPLE_C6_LAST_REPORTED,
>> + __XE_SAMPLE_FREQ_ACT,
>> + __XE_SAMPLE_FREQ_REQ,
>> __XE_NUM_PMU_SAMPLERS
>> };
>>
>> @@ -28,7 +30,11 @@ enum {
>> #define __XE_PMU_PM(x) ___XE_PMU_PM(0, x)
>>
>> #define XE_PMU_C6_RESIDENCY __XE_PMU_PM(0)
>> +#define XE_PMU_ACTUAL_FREQUENCY __XE_PMU_PM(1)
>> +#define XE_PMU_REQUESTED_FREQUENCY __XE_PMU_PM(2)
>> #define __XE_PMU_C6_RESIDENCY(gt) ___XE_PMU_PM(gt, 0)
>> +#define __XE_PMU_ACTUAL_FREQUENCY(gt) ___XE_PMU_PM(gt, 1)
>> +#define __XE_PMU_REQUESTED_FREQUENCY(gt) ___XE_PMU_PM(gt, 2)
>>
>> /*
>> * Non-engine events that we need to track enabled-disabled transition and
>> @@ -36,6 +42,8 @@ enum {
>> */
>> enum xe_pmu_tracked_events {
>> __XE_PMU_C6_RESIDENCY_ENABLED,
>> + __XE_PMU_ACTUAL_FREQUENCY_ENABLED,
>> + __XE_PMU_REQUESTED_FREQUENCY_ENABLED,
>> __XE_PMU_TRACKED_EVENT_COUNT, /* count marker */
>> };
>>
>> @@ -116,6 +124,24 @@ struct xe_pmu {
>> * @sleep_last: Last time GT parked for C6 estimation.
>> */
>> ktime_t sleep_last[XE_PMU_MAX_GT];
>> + /**
>> + * @timer: Timer for internal Xe PMU sampling.
>> + */
>> + struct hrtimer timer;
>> + /**
>> + * @timer_last:
>> + *
>> + * Timestamp of the previous timer invocation.
>> + */
>> + ktime_t timer_last;
>> + /**
>> + * @timer_enabled: Should the internal sampling timer be running.
>> + */
>> + bool timer_enabled;
>> + /**
>> + * @active_gts: GT active mask.
>> + */
>> + unsigned int active_gts;
>> };
>>
>> #endif
>> --
>> 2.38.1
>>
More information about the Intel-xe
mailing list