[PATCH] drm/xe/pmu: Add GT frequency events
Lucas De Marchi
lucas.demarchi at intel.com
Fri Jan 31 05:41:59 UTC 2025
On Thu, Jan 30, 2025 at 08:47:28PM -0800, Vinay Belgaumkar wrote:
>Define PMU events for GT frequency (actual and requested). This is
>a port from the i915 driver implementation, where an internal timer
>is used to aggregate GT frequencies over certain fixed interval.
>Following PMU events are being added-
>
> xe_0000_00_02.0/gt-actual-frequency/ [Kernel PMU event]
> xe_0000_00_02.0/gt-requested-frequency/ [Kernel PMU event]
>
>Standard perf commands can be used to monitor GT frequency-
> $ perf stat -e xe_0000_00_02.0/gt-requested-frequency,gt=0/ -I1000
>
> 1.001175175 700 M xe/gt-requested-frequency,gt=0/
> 2.005891881 703 M xe/gt-requested-frequency,gt=0/
> 3.007318169 700 M xe/gt-requested-frequency,gt=0/
>
>Actual frequencies will be reported as 0 when GT is suspended.
>
>Cc: Riana Tauro <riana.tauro at intel.com>
>Cc: Lucas De Marchi <lucas.demarchi at intel.com>
>Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
>Signed-off-by: Vinay Belgaumkar <vinay.belgaumkar at intel.com>
>---
> drivers/gpu/drm/xe/xe_pmu.c | 157 +++++++++++++++++++++++++++++-
> drivers/gpu/drm/xe/xe_pmu_types.h | 26 +++++
> 2 files changed, 181 insertions(+), 2 deletions(-)
>
>diff --git a/drivers/gpu/drm/xe/xe_pmu.c b/drivers/gpu/drm/xe/xe_pmu.c
>index 3910a82328ee..31ce575c834f 100644
>--- a/drivers/gpu/drm/xe/xe_pmu.c
>+++ b/drivers/gpu/drm/xe/xe_pmu.c
>@@ -8,6 +8,7 @@
>
> #include "xe_device.h"
> #include "xe_gt_idle.h"
>+#include "xe_guc_pc.h"
> #include "xe_pm.h"
> #include "xe_pmu.h"
>
>@@ -37,6 +38,12 @@
>
> #define XE_PMU_EVENT_GT_MASK GENMASK_ULL(63, 60)
> #define XE_PMU_EVENT_ID_MASK GENMASK_ULL(11, 0)
>+#define SAMPLING_TIMER_FREQUENCY_HZ 200
>+
>+static struct xe_pmu *event_to_pmu(struct perf_event *event)
>+{
>+ return container_of(event->pmu, struct xe_pmu, base);
>+}
>
> static unsigned int config_to_event_id(u64 config)
> {
>@@ -49,6 +56,8 @@ static unsigned int config_to_gt_id(u64 config)
> }
>
> #define XE_PMU_EVENT_GT_C6_RESIDENCY 0x01
>+#define XE_PMU_EVENT_GT_ACTUAL_FREQUENCY 0x02
>+#define XE_PMU_EVENT_GT_REQUESTED_FREQUENCY 0x04
>
> static struct xe_gt *event_to_gt(struct perf_event *event)
> {
>@@ -113,9 +122,18 @@ static int xe_pmu_event_init(struct perf_event *event)
> return 0;
> }
>
>+static u64 read_sample(struct xe_pmu *pmu, unsigned int gt_id, int sample)
please reserve the word "read" for when we are reading from hw.
get_stored_sample() would be better.
>+{
>+ return pmu->event_sample[gt_id][sample];
this is read by perf and written by the timer, so you will need a lock
to synchronize, since a u64 read/write is not (or may not be) atomic.
>+}
>+
> static u64 __xe_pmu_event_read(struct perf_event *event)
> {
> struct xe_gt *gt = event_to_gt(event);
>+ struct xe_device *xe =
>+ container_of(event->pmu, typeof(*xe), pmu.base);
>+ struct xe_pmu *pmu = &xe->pmu;
>+ u64 gt_id = gt->info.id;
>
> if (!gt)
> return 0;
>@@ -123,6 +141,14 @@ static u64 __xe_pmu_event_read(struct perf_event *event)
> switch (config_to_event_id(event->attr.config)) {
> case XE_PMU_EVENT_GT_C6_RESIDENCY:
> 		return xe_gt_idle_residency_msec(&gt->gtidle);
>+ case XE_PMU_EVENT_GT_ACTUAL_FREQUENCY:
>+ return div_u64(read_sample(pmu, gt_id,
>+ __XE_SAMPLE_FREQ_ACT),
if you index it by the event id, then you can pretty much group them
here. However I'm not sure if the design is correct to store the value
read in one place. What happens when you have 3 clients reading the same
event? You are only storing the hw value in one place and returning it
in all of them. However the old value read is kept per perf_event,
so it will calculate the different deltas... maybe it's ok, but you are
definitely missing the lock for the hw sampling, which will need to be
a raw_spin_lock.
>+ USEC_PER_SEC /* to MHz */);
>+ case XE_PMU_EVENT_GT_REQUESTED_FREQUENCY:
>+ return div_u64(read_sample(pmu, gt_id,
>+ __XE_SAMPLE_FREQ_REQ),
>+ USEC_PER_SEC /* to MHz */);
> }
>
> return 0;
>@@ -154,8 +180,109 @@ static void xe_pmu_event_read(struct perf_event *event)
> xe_pmu_event_update(event);
> }
>
>+#define CONFIG_EVENT_ENABLED(config, event) \
>+ (FIELD_GET(XE_PMU_EVENT_ID_MASK, config) & event)
>+
>+#define CONFIG_EVENT_ENABLED_GT(config, event, gt) \
>+ (CONFIG_EVENT_ENABLED(config, event) && \
>+ (FIELD_GET(XE_PMU_EVENT_GT_MASK, config) == gt))
>+
>+static bool pmu_needs_timer(struct xe_pmu *pmu)
>+{
>+ return CONFIG_EVENT_ENABLED(pmu->enable,
>+ (XE_PMU_EVENT_GT_ACTUAL_FREQUENCY |
>+ XE_PMU_EVENT_GT_REQUESTED_FREQUENCY));
>+}
>+
>+static void xe_pmu_start_timer(struct xe_pmu *pmu)
>+{
>+ u64 period = max_t(u64, 10000, NSEC_PER_SEC / SAMPLING_TIMER_FREQUENCY_HZ);
>+
>+ if (!pmu->timer_enabled && pmu_needs_timer(pmu)) {
>+ pmu->timer_enabled = true;
>+ pmu->timer_last = ktime_get();
>+ hrtimer_start_range_ns(&pmu->timer,
>+ ns_to_ktime(period), 0,
>+ HRTIMER_MODE_REL_PINNED);
>+ }
>+}
>+
>+static void
>+add_sample_mult(struct xe_pmu *pmu, unsigned int gt_id, int sample, u32 val, u32 mul)
>+{
>+ pmu->event_sample[gt_id][sample] += mul_u32_u32(val, mul);
>+}
>+
>+static void
>+frequency_sample(struct xe_gt *gt, unsigned int period_ns)
>+{
>+ struct xe_device *xe = gt_to_xe(gt);
>+ struct xe_pmu *pmu = &xe->pmu;
>+ int ret;
>+ u32 cur_freq;
>+
>+ if (CONFIG_EVENT_ENABLED_GT(pmu->enable, XE_PMU_EVENT_GT_ACTUAL_FREQUENCY, gt->info.id)) {
>+ u32 val;
>+
>+ /* Actual freq will be 0 when GT is in C6 */
>+		val = xe_guc_pc_get_act_freq(&gt->uc.guc.pc);
>+
>+ add_sample_mult(pmu, gt->info.id, __XE_SAMPLE_FREQ_ACT,
>+ val, period_ns / 1000);
>+ }
>+
>+ if (CONFIG_EVENT_ENABLED_GT(pmu->enable, XE_PMU_EVENT_GT_REQUESTED_FREQUENCY,
>+ gt->info.id)) {
>+		ret = xe_guc_pc_get_cur_freq(&gt->uc.guc.pc, &cur_freq);
>+ if (!ret)
>+ add_sample_mult(pmu, gt->info.id, __XE_SAMPLE_FREQ_REQ,
>+ cur_freq,
>+ period_ns / 1000);
>+ }
>+}
>+
>+static enum hrtimer_restart xe_sample(struct hrtimer *hrtimer)
>+{
>+ struct xe_pmu *pmu = container_of(hrtimer, struct xe_pmu, timer);
>+ struct xe_device *xe = container_of(pmu, typeof(*xe), pmu);
>+ u64 period = max_t(u64, 10000, NSEC_PER_SEC / SAMPLING_TIMER_FREQUENCY_HZ);
>+ unsigned int period_ns;
>+ struct xe_gt *gt;
>+ ktime_t now;
>+ unsigned int i;
>+
>+ if (!READ_ONCE(pmu->timer_enabled))
>+ return HRTIMER_NORESTART;
>+
>+ now = ktime_get();
>+ period_ns = ktime_to_ns(ktime_sub(now, pmu->timer_last));
>+ pmu->timer_last = now;
>+
>+ /*
>+ * The passed in period includes the time needed for grabbing the forcewake.
>+ * However, the potential error from timer callback delay greatly dominates
>+ * this so we keep it simple.
>+ */
>+
>+ for_each_gt(gt, xe, i)
>+ frequency_sample(gt, period_ns);
>+
>+ hrtimer_forward(hrtimer, now, ns_to_ktime(period));
>+
>+ return HRTIMER_RESTART;
>+}
>+
> static void xe_pmu_enable(struct perf_event *event)
> {
>+ struct xe_pmu *pmu = event_to_pmu(event);
>+ struct xe_device *xe = container_of(pmu, typeof(*xe), pmu);
>+
>+ /* Save the event config */
>+ pmu->enable |= event->attr.config;
same concern as above... what happens when you have 2 clients reading
the same event? You're storing the same bits here; then when any of them
stops, you stop updating the other as well.
alternative is to have a linked list of active sampling events and use
the event->active_entry to track that, like done in
arch/x86/events/rapl.c.
Lucas De Marchi
>+
>+ /* Start a timer, if needed, to collect samples */
>+ xe_pmu_start_timer(pmu);
>+
> /*
> * Store the current counter value so we can report the correct delta
> * for all listeners. Even when the event was already enabled and has
>@@ -176,15 +303,27 @@ static void xe_pmu_event_start(struct perf_event *event, int flags)
> event->hw.state = 0;
> }
>
>+static void xe_pmu_disable(struct perf_event *event)
>+{
>+ struct xe_device *xe = container_of(event->pmu, typeof(*xe), pmu.base);
>+ struct xe_pmu *pmu = &xe->pmu;
>+
>+ pmu->enable &= ~event->attr.config;
>+ pmu->timer_enabled &= pmu_needs_timer(pmu);
>+}
>+
> static void xe_pmu_event_stop(struct perf_event *event, int flags)
> {
> struct xe_device *xe = container_of(event->pmu, typeof(*xe), pmu.base);
> struct xe_pmu *pmu = &xe->pmu;
>
>- if (pmu->registered)
>+ if (pmu->registered) {
> if (flags & PERF_EF_UPDATE)
> xe_pmu_event_update(event);
>
>+ xe_pmu_disable(event);
>+ }
>+
> event->hw.state = PERF_HES_STOPPED;
> }
>
>@@ -270,6 +409,10 @@ static ssize_t event_attr_show(struct device *dev,
> XE_EVENT_ATTR_GROUP(v_, id_, &pmu_event_ ##v_.attr.attr)
>
> XE_EVENT_ATTR_SIMPLE(gt-c6-residency, gt_c6_residency, XE_PMU_EVENT_GT_C6_RESIDENCY, "ms");
>+XE_EVENT_ATTR_SIMPLE(gt-actual-frequency, gt_actual_frequency,
>+ XE_PMU_EVENT_GT_ACTUAL_FREQUENCY, "Mhz");
>+XE_EVENT_ATTR_SIMPLE(gt-requested-frequency, gt_requested_frequency,
>+ XE_PMU_EVENT_GT_REQUESTED_FREQUENCY, "Mhz");
>
> static struct attribute *pmu_empty_event_attrs[] = {
> /* Empty - all events are added as groups with .attr_update() */
>@@ -283,6 +426,8 @@ static const struct attribute_group pmu_events_attr_group = {
>
> static const struct attribute_group *pmu_events_attr_update[] = {
> &pmu_group_gt_c6_residency,
>+ &pmu_group_gt_actual_frequency,
>+ &pmu_group_gt_requested_frequency,
> NULL,
> };
>
>@@ -290,8 +435,11 @@ static void set_supported_events(struct xe_pmu *pmu)
> {
> struct xe_device *xe = container_of(pmu, typeof(*xe), pmu);
>
>- if (!xe->info.skip_guc_pc)
>+ if (!xe->info.skip_guc_pc) {
> pmu->supported_events |= BIT_ULL(XE_PMU_EVENT_GT_C6_RESIDENCY);
>+ pmu->supported_events |= BIT_ULL(XE_PMU_EVENT_GT_ACTUAL_FREQUENCY);
>+ pmu->supported_events |= BIT_ULL(XE_PMU_EVENT_GT_REQUESTED_FREQUENCY);
>+ }
> }
>
> /**
>@@ -306,6 +454,8 @@ static void xe_pmu_unregister(void *arg)
> if (!pmu->registered)
> return;
>
>+ hrtimer_cancel(&pmu->timer);
>+
> pmu->registered = false;
>
> perf_pmu_unregister(&pmu->base);
>@@ -334,6 +484,9 @@ int xe_pmu_register(struct xe_pmu *pmu)
> if (IS_SRIOV_VF(xe))
> return 0;
>
>+ hrtimer_init(&pmu->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
>+ pmu->timer.function = xe_sample;
>+
> name = kasprintf(GFP_KERNEL, "xe_%s",
> dev_name(xe->drm.dev));
> if (!name)
>diff --git a/drivers/gpu/drm/xe/xe_pmu_types.h b/drivers/gpu/drm/xe/xe_pmu_types.h
>index f5ba4d56622c..8dc235e1b6d3 100644
>--- a/drivers/gpu/drm/xe/xe_pmu_types.h
>+++ b/drivers/gpu/drm/xe/xe_pmu_types.h
>@@ -9,6 +9,12 @@
> #include <linux/perf_event.h>
> #include <linux/spinlock_types.h>
>
>+enum {
>+ __XE_SAMPLE_FREQ_ACT,
>+ __XE_SAMPLE_FREQ_REQ,
>+ __XE_NUM_PMU_SAMPLERS
>+};
>+
> #define XE_PMU_MAX_GT 2
>
> /**
>@@ -34,6 +40,26 @@ struct xe_pmu {
> * @supported_events: Bitmap of supported events, indexed by event id
> */
> u64 supported_events;
>+ /**
>+ * @timer: Timer for sampling GT frequency.
>+ */
>+ struct hrtimer timer;
>+ /**
>+	 * @timer_last: Timestamp of the previous timer invocation.
>+ */
>+ ktime_t timer_last;
>+ /**
>+ * @timer_enabled: Should the internal sampling timer be running.
>+ */
>+ bool timer_enabled;
>+ /**
>+ * @event_sample: Store freq related counters.
>+ */
>+ u64 event_sample[XE_PMU_MAX_GT][__XE_NUM_PMU_SAMPLERS];
>+ /**
>+ * @enable: Store requested PMU event config.
>+ */
>+ u64 enable;
> };
>
> #endif
>--
>2.38.1
>
More information about the Intel-xe
mailing list