[PATCH v3] drm/xe/pmu: Add GT frequency events

Lucas De Marchi lucas.demarchi at intel.com
Sat Mar 1 01:19:05 UTC 2025


On Fri, Feb 28, 2025 at 04:59:56PM -0800, Belgaumkar, Vinay wrote:
>
>On 2/25/2025 9:47 AM, Lucas De Marchi wrote:
>>On Wed, Feb 12, 2025 at 04:02:20PM -0800, Vinay Belgaumkar wrote:
>>>Define PMU events for GT frequency (actual and requested). This is
>>>a port from the i915 driver implementation, where an internal timer
>>>is used to aggregate GT frequencies over certain fixed interval.
>>>Following PMU events are being added-
>>>
>>> xe_0000_00_02.0/gt-actual-frequency/              [Kernel PMU event]
>>> xe_0000_00_02.0/gt-requested-frequency/           [Kernel PMU event]
>>>
>>>Standard perf commands can be used to monitor GT frequency-
>>> $ perf stat -e xe_0000_00_02.0/gt-requested-frequency,gt=0/ -I1000
>>>
>>>    1.001175175                700 M xe/gt-requested-frequency,gt=0/
>>>    2.005891881                703 M xe/gt-requested-frequency,gt=0/
>>>    3.007318169                700 M xe/gt-requested-frequency,gt=0/
>>>
>>>Actual frequencies will be reported as 0 when GT is in C6.
>>>
>>>v2: Use locks while storing/reading samples, keep track of multiple
>>>clients (Lucas) and other general cleanup.
>>>
>>>Cc: Riana Tauro <riana.tauro at intel.com>
>>>Cc: Lucas De Marchi <lucas.demarchi at intel.com>
>>>Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
>>>Signed-off-by: Vinay Belgaumkar <vinay.belgaumkar at intel.com>
>>>---
>>>drivers/gpu/drm/xe/xe_pmu.c       | 211 ++++++++++++++++++++++++++++--
>>>drivers/gpu/drm/xe/xe_pmu_types.h |  29 ++++
>>>2 files changed, 230 insertions(+), 10 deletions(-)
>>>
>>>diff --git a/drivers/gpu/drm/xe/xe_pmu.c b/drivers/gpu/drm/xe/xe_pmu.c
>>>index 3910a82328ee..ed76756403ea 100644
>>>--- a/drivers/gpu/drm/xe/xe_pmu.c
>>>+++ b/drivers/gpu/drm/xe/xe_pmu.c
>>>@@ -8,6 +8,8 @@
>>>
>>>#include "xe_device.h"
>>>#include "xe_gt_idle.h"
>>>+#include "xe_guc_pc.h"
>>>+#include "xe_macros.h"
>>>#include "xe_pm.h"
>>>#include "xe_pmu.h"
>>>
>>>@@ -37,6 +39,17 @@
>>>
>>>#define XE_PMU_EVENT_GT_MASK        GENMASK_ULL(63, 60)
>>>#define XE_PMU_EVENT_ID_MASK        GENMASK_ULL(11, 0)
>>>+#define XE_HRTIMER_INTERVAL_NS        5000000
>>>+
>>>+static struct xe_pmu *event_to_pmu(struct perf_event *event)
>>>+{
>>>+    return container_of(event->pmu, typeof(struct xe_pmu), base);
>>>+}
>>>+
>>>+static struct xe_device *event_to_xe(struct perf_event *event)
>>>+{
>>>+    return container_of(event->pmu, typeof(struct xe_device), 
>>>pmu.base);
>>>+}
>>
>>we shouldn't add refactors like this in the same patch where you are
>>adding a new feature; it creates noise in the patch and makes it harder
>>to see what's going on. Please drop both event_to_pmu() and event_to_xe().
>ok, will add as a separate patch later.
>>
>>>
>>>static unsigned int config_to_event_id(u64 config)
>>>{
>>>@@ -49,10 +62,12 @@ static unsigned int config_to_gt_id(u64 config)
>>>}
>>>
>>>#define XE_PMU_EVENT_GT_C6_RESIDENCY    0x01
>>>+#define XE_PMU_EVENT_GT_ACTUAL_FREQUENCY    0x02
>>>+#define XE_PMU_EVENT_GT_REQUESTED_FREQUENCY    0x03
>>
>>this will need a rebase
>yes.
>>
>>>
>>>static struct xe_gt *event_to_gt(struct perf_event *event)
>>>{
>>>-    struct xe_device *xe = container_of(event->pmu, typeof(*xe), 
>>>pmu.base);
>>>+    struct xe_device *xe = event_to_xe(event);
>>>    u64 gt = config_to_gt_id(event->attr.config);
>>>
>>>    return xe_device_get_gt(xe, gt);
>>>@@ -70,7 +85,7 @@ static bool event_supported(struct xe_pmu *pmu, 
>>>unsigned int gt,
>>>
>>>static void xe_pmu_event_destroy(struct perf_event *event)
>>>{
>>>-    struct xe_device *xe = container_of(event->pmu, typeof(*xe), 
>>>pmu.base);
>>>+    struct xe_device *xe = event_to_xe(event);
>>>
>>>    drm_WARN_ON(&xe->drm, event->parent);
>>>    xe_pm_runtime_put(xe);
>>>@@ -79,7 +94,7 @@ static void xe_pmu_event_destroy(struct 
>>>perf_event *event)
>>>
>>>static int xe_pmu_event_init(struct perf_event *event)
>>>{
>>>-    struct xe_device *xe = container_of(event->pmu, typeof(*xe), 
>>>pmu.base);
>>>+    struct xe_device *xe = event_to_xe(event);
>>>    struct xe_pmu *pmu = &xe->pmu;
>>>    unsigned int id, gt;
>>>
>>>@@ -104,6 +119,8 @@ static int xe_pmu_event_init(struct perf_event 
>>>*event)
>>>    if (has_branch_stack(event))
>>>        return -EOPNOTSUPP;
>>>
>>>+    XE_WARN_ON(id >= XE_PMU_MAX_EVENT_TYPES);
>>
>>I was going to say that we don't need the other ones. But then I noticed
>>XE_PMU_MAX_EVENT_TYPES == 64, which is a lot more than we currently
>>support.
>>
>>I suggested it before, but what's the issue with keeping a linked list of
>>the events, like rapl_pmu::active_list?
>ok, will add the active_list. However, still need the 
>XE_PMU_MAX_EVENT_TYPES, since I am storing the samples in an array 
>based on event_id -  u64 
>event_sample[XE_PMU_MAX_GT][XE_PMU_MAX_EVENT_TYPES].

can't you simply walk the list, updating the values in place? We are
likely going to have one or very few active events:

a) read hw, which may involve locks/forcewake, and get the sample
b)
	raw_spin_lock(&pmu->lock);
		walk the list of active events, update with the new
		sample
	raw_spin_unlock(&pmu->lock);

I didn't try it though... may be missing something.
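
Roughly something like this, just to illustrate it (completely untested;
the xe_pmu_sample struct and the pmu->active_list field below are made up
for the sketch, they are not in the current patch):

struct xe_pmu_sample {
	struct list_head link;		/* entry in a hypothetical pmu->active_list */
	struct perf_event *event;	/* perf event being accumulated */
	u64 value;			/* sum of freq * period products */
};

static void
xe_pmu_update_active_events(struct xe_pmu *pmu, struct xe_gt *gt,
			    unsigned int id, u32 freq, u32 period_ns)
{
	struct xe_pmu_sample *s;
	unsigned long flags;

	/* a) the hw read (possibly grabbing forcewake) already happened
	 * outside the lock; here we only fold the new sample in
	 */
	raw_spin_lock_irqsave(&pmu->lock, flags);
	list_for_each_entry(s, &pmu->active_list, link) {
		if (config_to_gt_id(s->event->attr.config) != gt->info.id ||
		    config_to_event_id(s->event->attr.config) != id)
			continue;
		s->value += mul_u32_u32(freq, period_ns);
	}
	raw_spin_unlock_irqrestore(&pmu->lock, flags);
}

The read side would then look up its own entry and return its value
instead of indexing event_sample[gt][id], and the
XE_PMU_MAX_EVENT_TYPES-sized arrays go away.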

Considering we now even get the forcewake for other events on event
init... does it make sense to still do this with an hrtimer? Are we
trying to smooth the curve out instead of reporting the instantaneous
value?
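
I.e. if the averaging isn't really needed, the read path could return the
instantaneous value and the hrtimer goes away entirely. Untested sketch
for the switch in __xe_pmu_event_read(), reusing the GuC PC helpers this
patch already calls:

	case XE_PMU_EVENT_GT_ACTUAL_FREQUENCY:
		/* 0 when GT is in C6, same as with the sampled version */
		return xe_guc_pc_get_act_freq(&gt->uc.guc.pc);
	case XE_PMU_EVENT_GT_REQUESTED_FREQUENCY: {
		u32 freq;

		return xe_guc_pc_get_cur_freq(&gt->uc.guc.pc, &freq) ? 0 : freq;
	}

That does change the semantics from an accumulated/averaged value to a
point-in-time gauge though, so it's only an option if that's acceptable
for the tools consuming these events.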

Lucas De Marchi

>>
>>>+
>>>    if (!event->parent) {
>>>        drm_dev_get(&xe->drm);
>>>        xe_pm_runtime_get(xe);
>>>@@ -113,16 +130,50 @@ static int xe_pmu_event_init(struct 
>>>perf_event *event)
>>>    return 0;
>>>}
>>>
>>>+static u64
>>>+xe_get_pmu_sample(struct xe_pmu *pmu, unsigned int gt_id, uint32_t id)
>>>+{
>>>+    unsigned long flags;
>>>+    u64 val;
>>>+
>>>+    raw_spin_lock_irqsave(&pmu->lock, flags);
>>>+    val = pmu->event_sample[gt_id][id];
>>>+    raw_spin_unlock_irqrestore(&pmu->lock, flags);
>>>+
>>>+    return val;
>>>+}
>>>+
>>>+static void
>>>+xe_store_pmu_sample(struct xe_pmu *pmu, uint32_t gt_id, uint32_t id,
>>>+            uint32_t val, uint32_t period)
>>>+{
>>>+    unsigned long flags;
>>>+
>>>+    raw_spin_lock_irqsave(&pmu->lock, flags);
>>>+    pmu->event_sample[gt_id][id] += mul_u32_u32(val, period);
>>>+    raw_spin_unlock_irqrestore(&pmu->lock, flags);
>>>+}
>>>+
>>>static u64 __xe_pmu_event_read(struct perf_event *event)
>>>{
>>>    struct xe_gt *gt = event_to_gt(event);
>>>+    struct xe_device *xe = event_to_xe(event);
>>
>>gt_to_xe() avoids the additional helper and you don't even need it.
>>You are interested in the pmu, not xe... so you could just have used
>>
>>    pmu = container_of(event->pmu, typeof(struct xe_pmu), base);
>>
>>... like is done in other places.
>ok.
>>
>>>+    struct xe_pmu *pmu = &xe->pmu;
>>>+    unsigned long id;
>>>
>>>    if (!gt)
>>>        return 0;
>>>
>>>-    switch (config_to_event_id(event->attr.config)) {
>>>+    id = config_to_event_id(event->attr.config);
>>>+
>>>+    switch (id) {
>>>    case XE_PMU_EVENT_GT_C6_RESIDENCY:
>>>        return xe_gt_idle_residency_msec(&gt->gtidle);
>>>+    case XE_PMU_EVENT_GT_ACTUAL_FREQUENCY:
>>>+    case XE_PMU_EVENT_GT_REQUESTED_FREQUENCY:
>>>+        return div_u64(xe_get_pmu_sample(pmu, gt->info.id,
>>>+                   id),
>>
>>keep it on the line above, splitting it doesn't help readability.
>ok.
>>
>>>+                   USEC_PER_SEC /* to MHz */);
>>
>>why is the unit nsec -> usec when storing, couldn't we just use
>>NSEC_PER_SEC here?
>Yup.
>>
>>
>>>    }
>>>
>>>    return 0;
>>>@@ -143,7 +194,7 @@ static void xe_pmu_event_update(struct 
>>>perf_event *event)
>>>
>>>static void xe_pmu_event_read(struct perf_event *event)
>>>{
>>>-    struct xe_device *xe = container_of(event->pmu, typeof(*xe), 
>>>pmu.base);
>>>+    struct xe_device *xe = event_to_xe(event);
>>>    struct xe_pmu *pmu = &xe->pmu;
>>>
>>>    if (!pmu->registered) {
>>>@@ -154,8 +205,105 @@ static void xe_pmu_event_read(struct 
>>>perf_event *event)
>>>    xe_pmu_event_update(event);
>>>}
>>>
>>>+static bool gt_pmu_event_active(struct xe_pmu *pmu, uint32_t id, 
>>>uint32_t gt_id)
>>>+{
>>>+    return pmu->active_mask[gt_id] & BIT_ULL(id);
>>>+}
>>>+
>>>+static bool pmu_needs_timer(struct xe_pmu *pmu)
>>>+{
>>>+    struct xe_device *xe = container_of(pmu, typeof(*xe), pmu);
>>>+    struct xe_gt *gt;
>>>+    int i;
>>>+
>>>+    for_each_gt(gt, xe, i)
>>>+        if (gt_pmu_event_active(pmu, 
>>>XE_PMU_EVENT_GT_ACTUAL_FREQUENCY, i) ||
>>>+            gt_pmu_event_active(pmu, 
>>>XE_PMU_EVENT_GT_REQUESTED_FREQUENCY, i))
>>>+            return true;
>>>+
>>>+    return false;
>>>+}
>>>+
>>>+static void xe_pmu_start_timer(struct xe_pmu *pmu, uint32_t gt_id)
>>>+{
>>>+    /* Timer is per device */
>>>+    if (!pmu->timer_enabled && pmu_needs_timer(pmu)) {
>>>+        pmu->timer_enabled = true;
>>>+        pmu->timer_last = ktime_get();
>>>+        hrtimer_start_range_ns(&pmu->timer,
>>>+                       ns_to_ktime(XE_HRTIMER_INTERVAL_NS), 0,
>>>+                       HRTIMER_MODE_REL_PINNED);
>>>+    }
>>>+}
>>>+
>>>+static void
>>>+xe_sample_gt_frequency(struct xe_gt *gt, uint32_t period_ns)
>>>+{
>>>+    struct xe_device *xe = gt_to_xe(gt);
>>>+    struct xe_pmu *pmu = &xe->pmu;
>>>+    uint32_t act_freq, cur_freq;
>>>+    int ret;
>>>+
>>>+    if (gt_pmu_event_active(pmu, 
>>>XE_PMU_EVENT_GT_ACTUAL_FREQUENCY, gt->info.id)) {
>>>+
>>
>>drop this line
>ok.
>>
>>>+        /* Actual freq will be 0 when GT is in C6 */
>>>+        act_freq = xe_guc_pc_get_act_freq(&gt->uc.guc.pc);
>>>+        xe_store_pmu_sample(pmu, gt->info.id, 
>>>XE_PMU_EVENT_GT_ACTUAL_FREQUENCY,
>>>+                 act_freq, period_ns / NSEC_PER_USEC);
>>>+    }
>>>+
>>>+    if (gt_pmu_event_active(pmu, 
>>>XE_PMU_EVENT_GT_REQUESTED_FREQUENCY, gt->info.id)) {
>>>+        ret = xe_guc_pc_get_cur_freq(&gt->uc.guc.pc, &cur_freq);
>>>+        if (!ret)
>>>+            xe_store_pmu_sample(pmu, gt->info.id, 
>>>XE_PMU_EVENT_GT_REQUESTED_FREQUENCY,
>>>+                     cur_freq, period_ns / NSEC_PER_USEC);
>>>+    }
>>>+}
>>>+
>>>+static enum hrtimer_restart xe_sample(struct hrtimer *hrtimer)
>>>+{
>>>+    struct xe_pmu *pmu = container_of(hrtimer, struct xe_pmu, timer);
>>>+    struct xe_device *xe = container_of(pmu, typeof(*xe), pmu);
>>>+    u64 time_diff_ns;
>>>+    struct xe_gt *gt;
>>>+    ktime_t now;
>>>+    int i;
>>>+
>>>+    if (!READ_ONCE(pmu->timer_enabled))
>>>+        return HRTIMER_NORESTART;
>>>+
>>>+    now = ktime_get();
>>>+    time_diff_ns = ktime_to_ns(ktime_sub(now, pmu->timer_last));
>>>+    pmu->timer_last = now;
>>>+
>>>+    /*
>>>+     * The passed in period includes the time needed for grabbing 
>>>the forcewake.
>>
>>I don't think this is true anymore, is it?
>>
>>xe_sample()
>>  1. calculate time_diff at the beginning of the function
>>  2. for each gt, read out freq samples using the time_diff; this may include
>>     delays on forcewake, but those are not included in the time_diff
>>  3. hrtimer is adjusted to timer_last + period
>yup, makes sense.
>>
>>>+     */
>>>+    for_each_gt(gt, xe, i)
>>>+        xe_sample_gt_frequency(gt, time_diff_ns);
>>>+
>>>+    hrtimer_forward(hrtimer, now, ns_to_ktime(XE_HRTIMER_INTERVAL_NS));
>>>+
>>>+    return HRTIMER_RESTART;
>>>+}
>>>+
>>>static void xe_pmu_enable(struct perf_event *event)
>>>{
>>>+    struct xe_pmu *pmu = event_to_pmu(event);
>>>+    struct xe_gt *gt = event_to_gt(event);
>>>+    uint32_t id = config_to_event_id(event->attr.config);
>>>+    unsigned long flags;
>>>+
>>>+    raw_spin_lock_irqsave(&pmu->lock, flags);
>>>+
>>>+    /* Update event mask */
>>>+    pmu->active_mask[gt->info.id] |= BIT_ULL(id);
>>>+    pmu->n_active[gt->info.id][id]++;
>>>+
>>>+    /* Start a timer, if needed, to collect samples */
>>>+    xe_pmu_start_timer(pmu, gt->info.id);
>>>+
>>>+    raw_spin_unlock_irqrestore(&pmu->lock, flags);
>>>    /*
>>>     * Store the current counter value so we can report the 
>>>correct delta
>>>     * for all listeners. Even when the event was already enabled 
>>>and has
>>>@@ -166,7 +314,7 @@ static void xe_pmu_enable(struct perf_event *event)
>>>
>>>static void xe_pmu_event_start(struct perf_event *event, int flags)
>>>{
>>>-    struct xe_device *xe = container_of(event->pmu, typeof(*xe), 
>>>pmu.base);
>>>+    struct xe_device *xe = event_to_xe(event);
>>>    struct xe_pmu *pmu = &xe->pmu;
>>>
>>>    if (!pmu->registered)
>>>@@ -176,26 +324,53 @@ static void xe_pmu_event_start(struct 
>>>perf_event *event, int flags)
>>>    event->hw.state = 0;
>>>}
>>>
>>>+static void xe_pmu_disable(struct perf_event *event)
>>>+{
>>>+    struct xe_device *xe = event_to_xe(event);
>>>+    struct xe_pmu *pmu = &xe->pmu;
>>>+    unsigned long flags;
>>>+    uint32_t id = config_to_event_id(event->attr.config);
>>>+    uint32_t gt_id = config_to_gt_id(event->attr.config);
>>>+
>>>+    raw_spin_lock_irqsave(&pmu->lock, flags);
>>>+
>>>+    if (--pmu->n_active[gt_id][id] == 0) {
>>>+        pmu->active_mask[gt_id] &= ~BIT_ULL(id);
>>>+        pmu->timer_enabled &= pmu_needs_timer(pmu);
>>>+    }
>>>+
>>>+    raw_spin_unlock_irqrestore(&pmu->lock, flags);
>>>+}
>>>+
>>>static void xe_pmu_event_stop(struct perf_event *event, int flags)
>>>{
>>>-    struct xe_device *xe = container_of(event->pmu, typeof(*xe), 
>>>pmu.base);
>>>+    struct xe_device *xe = event_to_xe(event);
>>>    struct xe_pmu *pmu = &xe->pmu;
>>>+    uint32_t id = config_to_event_id(event->attr.config);
>>>+
>>>+    XE_WARN_ON(id >= XE_PMU_MAX_EVENT_TYPES);
>>
>>if not using a linked list as suggested, please drop this. Instead block
>>the event init from succeeding.
>
>ok.
>
>Thanks,
>
>Vinay.
>
>>
>>>
>>>-    if (pmu->registered)
>>>+    if (pmu->registered) {
>>>        if (flags & PERF_EF_UPDATE)
>>>            xe_pmu_event_update(event);
>>>
>>>+        xe_pmu_disable(event);
>>>+    }
>>>+
>>>    event->hw.state = PERF_HES_STOPPED;
>>>}
>>>
>>>static int xe_pmu_event_add(struct perf_event *event, int flags)
>>>{
>>>-    struct xe_device *xe = container_of(event->pmu, typeof(*xe), 
>>>pmu.base);
>>>+    struct xe_device *xe = event_to_xe(event);
>>>    struct xe_pmu *pmu = &xe->pmu;
>>>+    uint32_t id = config_to_event_id(event->attr.config);
>>>
>>>    if (!pmu->registered)
>>>        return -ENODEV;
>>>
>>>+    XE_WARN_ON(id >= XE_PMU_MAX_EVENT_TYPES);
>>
>>ditto.
>>
>>Lucas De Marchi
>>
>>>+
>>>    if (flags & PERF_EF_START)
>>>        xe_pmu_event_start(event, flags);
>>>
>>>@@ -270,6 +445,10 @@ static ssize_t event_attr_show(struct device *dev,
>>>    XE_EVENT_ATTR_GROUP(v_, id_, &pmu_event_ ##v_.attr.attr)
>>>
>>>XE_EVENT_ATTR_SIMPLE(gt-c6-residency, gt_c6_residency, 
>>>XE_PMU_EVENT_GT_C6_RESIDENCY, "ms");
>>>+XE_EVENT_ATTR_SIMPLE(gt-actual-frequency, gt_actual_frequency,
>>>+             XE_PMU_EVENT_GT_ACTUAL_FREQUENCY, "Mhz");
>>>+XE_EVENT_ATTR_SIMPLE(gt-requested-frequency, gt_requested_frequency,
>>>+             XE_PMU_EVENT_GT_REQUESTED_FREQUENCY, "Mhz");
>>>
>>>static struct attribute *pmu_empty_event_attrs[] = {
>>>    /* Empty - all events are added as groups with .attr_update() */
>>>@@ -283,6 +462,8 @@ static const struct attribute_group 
>>>pmu_events_attr_group = {
>>>
>>>static const struct attribute_group *pmu_events_attr_update[] = {
>>>    &pmu_group_gt_c6_residency,
>>>+    &pmu_group_gt_actual_frequency,
>>>+    &pmu_group_gt_requested_frequency,
>>>    NULL,
>>>};
>>>
>>>@@ -290,8 +471,11 @@ static void set_supported_events(struct 
>>>xe_pmu *pmu)
>>>{
>>>    struct xe_device *xe = container_of(pmu, typeof(*xe), pmu);
>>>
>>>-    if (!xe->info.skip_guc_pc)
>>>+    if (!xe->info.skip_guc_pc) {
>>>        pmu->supported_events |= BIT_ULL(XE_PMU_EVENT_GT_C6_RESIDENCY);
>>>+        pmu->supported_events |= 
>>>BIT_ULL(XE_PMU_EVENT_GT_ACTUAL_FREQUENCY);
>>>+        pmu->supported_events |= 
>>>BIT_ULL(XE_PMU_EVENT_GT_REQUESTED_FREQUENCY);
>>>+    }
>>>}
>>>
>>>/**
>>>@@ -306,6 +490,8 @@ static void xe_pmu_unregister(void *arg)
>>>    if (!pmu->registered)
>>>        return;
>>>
>>>+    hrtimer_cancel(&pmu->timer);
>>>+
>>>    pmu->registered = false;
>>>
>>>    perf_pmu_unregister(&pmu->base);
>>>@@ -334,6 +520,11 @@ int xe_pmu_register(struct xe_pmu *pmu)
>>>    if (IS_SRIOV_VF(xe))
>>>        return 0;
>>>
>>>+    raw_spin_lock_init(&pmu->lock);
>>>+
>>>+    hrtimer_init(&pmu->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
>>>+    pmu->timer.function = xe_sample;
>>>+
>>>    name = kasprintf(GFP_KERNEL, "xe_%s",
>>>             dev_name(xe->drm.dev));
>>>    if (!name)
>>>diff --git a/drivers/gpu/drm/xe/xe_pmu_types.h 
>>>b/drivers/gpu/drm/xe/xe_pmu_types.h
>>>index f5ba4d56622c..cd68cba58693 100644
>>>--- a/drivers/gpu/drm/xe/xe_pmu_types.h
>>>+++ b/drivers/gpu/drm/xe/xe_pmu_types.h
>>>@@ -10,6 +10,7 @@
>>>#include <linux/spinlock_types.h>
>>>
>>>#define XE_PMU_MAX_GT 2
>>>+#define XE_PMU_MAX_EVENT_TYPES    64
>>>
>>>/**
>>> * struct xe_pmu - PMU related data per Xe device
>>>@@ -34,6 +35,34 @@ struct xe_pmu {
>>>     * @supported_events: Bitmap of supported events, indexed by 
>>>event id
>>>     */
>>>    u64 supported_events;
>>>+    /**
>>>+     * @lock: Lock protecting enable mask and sampling
>>>+     */
>>>+    raw_spinlock_t lock;
>>>+    /**
>>>+     * @active_mask: Bit representation of active events
>>>+     */
>>>+    u64 active_mask[XE_PMU_MAX_GT];
>>>+    /**
>>>+     * @n_active: Counts for the enabled events
>>>+     */
>>>+    unsigned int n_active[XE_PMU_MAX_GT][XE_PMU_MAX_EVENT_TYPES];
>>>+    /**
>>>+     * @timer: Timer for sampling GT frequency.
>>>+     */
>>>+    struct hrtimer timer;
>>>+    /**
>>>+     * @timer_last: Timestmap of the previous timer invocation.
>>>+     */
>>>+    ktime_t timer_last;
>>>+    /**
>>>+     * @timer_enabled: Should the internal sampling timer be running.
>>>+     */
>>>+    bool timer_enabled;
>>>+    /**
>>>+     * @event_sample: Store freq related counters.
>>>+     */
>>>+    u64 event_sample[XE_PMU_MAX_GT][XE_PMU_MAX_EVENT_TYPES];
>>>};
>>>
>>>#endif
>>>-- 
>>>2.38.1
>>>

