[PATCH v7 5/5] drm/xe/xe_pmu: Acquire forcewake on event init for engine events

Riana Tauro riana.tauro at intel.com
Fri Feb 21 06:32:02 UTC 2025


Hi Lucas

On 2/21/2025 11:56 AM, Lucas De Marchi wrote:
> On Thu, Feb 20, 2025 at 10:45:46PM -0600, Lucas De Marchi wrote:
>> On Thu, Feb 20, 2025 at 05:14:07PM -0800, Umesh Nerlige Ramappa wrote:
>>> On Thu, Feb 20, 2025 at 03:46:55PM -0600, Lucas De Marchi wrote:
>>>> On Fri, Feb 14, 2025 at 03:38:13PM +0530, Riana Tauro wrote:
>>>>> When the engine events are created, acquire GT forcewake to read gpm
>>>>> timestamp required for the events and release on event destroy. This
>>>>> cannot be done during read due to the raw spinlock held my pmu.
>>>>>
>>>>> v2: remove forcewake counting (Umesh)
>>>>> v3: remove extra space (Umesh)
>>>>>
>>>>> Cc: Rodrigo Vivi <rodrigo.vivi at intel.com>
>>>>> Cc: Himal Prasad Ghimiray <himal.prasad.ghimiray at intel.com>
>>>>> Signed-off-by: Riana Tauro <riana.tauro at intel.com>
>>>>> Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
>>>>> ---
>>>>> drivers/gpu/drm/xe/xe_pmu.c       | 52 +++++++++++++++++++++++++++++--
>>>>> drivers/gpu/drm/xe/xe_pmu_types.h |  4 +++
>>>>> 2 files changed, 54 insertions(+), 2 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/xe/xe_pmu.c b/drivers/gpu/drm/xe/xe_pmu.c
>>>>> index dc89fa6d0ec5..67693d642f5a 100644
>>>>> --- a/drivers/gpu/drm/xe/xe_pmu.c
>>>>> +++ b/drivers/gpu/drm/xe/xe_pmu.c
>>>>> @@ -7,6 +7,7 @@
>>>>> #include <linux/device.h>
>>>>>
>>>>> #include "xe_device.h"
>>>>> +#include "xe_force_wake.h"
>>>>> #include "xe_gt_idle.h"
>>>>> #include "xe_guc_engine_activity.h"
>>>>> #include "xe_hw_engine.h"
>>>>> @@ -102,6 +103,37 @@ static struct xe_hw_engine 
>>>>> *event_to_hwe(struct perf_event *event)
>>>>>     return hwe;
>>>>> }
>>>>>
>>>>> +static bool is_engine_event(u64 config)
>>>>> +{
>>>>> +    unsigned int event_id = config_to_event_id(config);
>>>>> +
>>>>> +    return (event_id == XE_PMU_EVENT_ENGINE_TOTAL_TICKS ||
>>>>> +        event_id == XE_PMU_EVENT_ENGINE_ACTIVE_TICKS);
>>>>> +}
>>>>> +
>>>>> +static bool event_gt_forcewake(struct perf_event *event)
>>>>> +{
>>>>> +    struct xe_device *xe = container_of(event->pmu, typeof(*xe), 
>>>>> pmu.base);
>>>>> +    u64 config = event->attr.config;
>>>>> +    struct xe_pmu *pmu = &xe->pmu;
>>>>> +    struct xe_gt *gt;
>>>>> +    unsigned int fw_ref;
>>>>> +
>>>>> +    if (!is_engine_event(config))
>>>>> +        return true;
>>>>> +
>>>>> +    gt = xe_device_get_gt(xe, config_to_gt_id(config));
>>>>> +
>>>>> +    fw_ref = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT);
>>>>> +    if (!fw_ref)
>>>>> +        return false;
>>>>> +
>>>>> +    if (!pmu->fw_ref)
>>>>> +        pmu->fw_ref = fw_ref;
>>>>
>>>> how this shared fw_ref is supposed to work for multiple
>>>> perf_event_open()?
>>>
>>> Agree, not ideal, but I don't see an issue. This forcewake is only 
>>> being taken for engine-* events and the domain is always XE_FW_GT. 
>>> Looking at xe_force_wake_get(), I see that it returns a mask of 
>>> domains enabled. In this case, it would be the XE_FW_GT. The return 
>>> value is just stored so that the corresponding event destroy can put 
>>> the forcewake.
>>>
>>>>
>>>> fd1 = perf_event_open( ... gt=0 ...);
>>>>
>>>>     event_get_forcewake()
>>>>         pmu->fw_ref = xe_force_wake_get()
>>>>
>>>> fd2 = perf_event_open( ... gt=1 ...);
>>>>
>>>>     event_get_forcewake()
>>>>         // get the forcewake, but don't save the ref
>>>>
>>>> forcewake for gt1 is never put.
>>>
>>> pmu->fw_ref should be identical for all events taking this forcewake.
>>>
>>>>
>>>>
>>>> Or even multiple perf_event_open() for the same gt: we will not handle
>>>> the count correctly.
>>>
>>> The count is actually handled in domain->ref in the forcewake 
>>> implementation and note that forcewake is always taken for every 
>>> engine event that is being initialized and hence always being put for 
>>> every event that is destroyed. This code is not refcounting that.
>>
>> so... we never set pmu->fw_ref back to 0 and any event destroy will 
>> try to
>> put the force wake? That seems a different bug that avoids the bug I
>> was thinking about.
> 
> try calling perf stat multiple times and you end up with this:
> 
> cat  /sys/kernel/debug/dri/0/info | grep "force wake"
> gt0 force wake -1
> gt1 force wake 0
> cat  /sys/kernel/debug/dri/0/info | grep "force wake"
> gt0 force wake -2
> gt1 force wake 0

This would be for non-engine events and engine events with non-engine 
events as leader?
> 
> and
> 
> [ 1614.778054] RIP: 0010:xe_force_wake_put+0xab6/0x1430 [xe]
> [ 1614.778370] Code: ff 41 53 8b 85 f8 fe ff ff 50 ff b5 f0 fe ff ff 44 
> 8b 8d e8 fe ff ff 4c 8b 85 d8 fe ff ff 48 8b 95 d0 fe ff ff e8 4a c3 27 
> df <0f> 0b 4c 89 e2 48 83 c4 60 48 b8 00 00 00 00 00 fc ff df 48 c1
>   ea
> [ 1614.778381] RSP: 0018:ffff88812cb2f7d0 EFLAGS: 00010046
> [ 1614.778395] RAX: 0000000000000000 RBX: ffff888138b000d0 RCX: 
> 0000000000000000
> [ 1614.778405] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 
> 0000000000000000
> [ 1614.778413] RBP: ffff88812cb2f968 R08: 0000000000000000 R09: 
> 0000000000000000
> [ 1614.778421] R10: 0000000000000000 R11: 0000000000000000 R12: 
> ffff888138b000e4
> [ 1614.778429] R13: 0000000000000001 R14: ffff888138b000d0 R15: 
> 00000000ffffffff
> [ 1614.778438] FS:  00007a9b757acd40(0000) GS:ffff8883f0e00000(0000) 
> knlGS:0000000000000000
> [ 1614.778447] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 1614.778456] CR2: 00007fcac269d5e0 CR3: 000000012a8ee003 CR4: 
> 0000000000f72ef0
> [ 1614.778464] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 
> 0000000000000000
> [ 1614.778472] DR3: 0000000000000000 DR6: 00000000ffff07f0 DR7: 
> 0000000000000400
> [ 1614.778480] PKRU: 55555554
> [ 1614.778488] Call Trace:
> [ 1614.778496]  <TASK>
> [ 1614.778506]  ? show_regs+0x6c/0x80
> [ 1614.778528]  ? __warn+0xd2/0x2e0
> [ 1614.778545]  ? xe_force_wake_put+0xab6/0x1430 [xe]
> [ 1614.778867]  ? report_bug+0x282/0x2f0
> [ 1614.778892]  ? handle_bug+0x6e/0xc0
> [ 1614.778906]  ? exc_invalid_op+0x18/0x50
> [ 1614.778919]  ? asm_exc_invalid_op+0x1b/0x20
> [ 1614.778953]  ? xe_force_wake_put+0xab6/0x1430 [xe]
> [ 1614.779318]  ? __pfx_xe_force_wake_put+0x10/0x10 [xe]
> [ 1614.779650]  xe_pmu_event_destroy+0x143/0x240 [xe]
> 
> 
>>
>>>
>>>>
>>>> In summary I think this fw ref needs to be per event... an easy way 
>>>> to do
>>>> that is to use the event->pmu_private field, to be populated on init...
>>>
>>> I am not opposed to that since that makes it future proof so that we 
>>> can indeed have events taking different forcewake domains, but let me 
>>> know if I missed something here since I think this alone should still 
>>> work.
>>>
>>>>
>>>>> +
>>>>> +    return true;
>>>>> +}
>>>>> +
>>>>> static bool event_supported(struct xe_pmu *pmu, unsigned int gt,
>>>>>                 unsigned int id)
>>>>> {
>>>>> @@ -144,6 +176,13 @@ static bool event_param_valid(struct 
>>>>> perf_event *event)
>>>>> static void xe_pmu_event_destroy(struct perf_event *event)
>>>>> {
>>>>>     struct xe_device *xe = container_of(event->pmu, typeof(*xe), 
>>>>> pmu.base);
>>>>> +    struct xe_pmu *pmu = &xe->pmu;
>>>>> +    struct xe_gt *gt;
>>>>> +
>>>>> +    if (pmu->fw_ref) {
>>>>> +        gt = xe_device_get_gt(xe, config_to_gt_id(event- 
>>>>> >attr.config));
>>>>> +        xe_force_wake_put(gt_to_fw(gt), pmu->fw_ref);
>>>>> +    }
>>>>>
>>>>>     drm_WARN_ON(&xe->drm, event->parent);
>>>>>     xe_pm_runtime_put(xe);
>>>>> @@ -183,18 +222,27 @@ static int xe_pmu_event_init(struct 
>>>>> perf_event *event)
>>>>>     if (!event->parent) {
>>>>>         drm_dev_get(&xe->drm);
>>>>>         xe_pm_runtime_get(xe);
>>>>> +        if (!event_gt_forcewake(event)) {
>>>>
>>>> if you group an engine vs non-engine counter, this won't work I think.
>>>> Can we test it?
>>>
>>> When you group events, init is called for each event. From the Xe PMU 
>>> implementation perspective, grouping shouldn't be any different.
>>
>> the event->parent is a shortcut to do this only once per group, but 
>> then you
>> can't look inside the event to decide if you need to take the 
>> forcewake or
>> not because it may be true for one event and false for another. Try
>> creating the events with the group leader being a non-engine event.
>> AFAICS the forcewake will not be taken.... something like (typed here
>> without testing):
>>
>> perf stat  -I1000 -e '{xe_0000_00_02.0/gt-c6- 
>> residency/,xe_0000_00_02.0/engine-active-ticks/}'
> 
> it seems like I mixed up here. Neither of these will have parent
> set - afaics that would be the case for events inherited from one task
> to another which is not the case here.
> 
> Lucas De Marchi
> 
>>
>> Lucas De Marchi
>>
>>>
>>> Regards,
>>> Umesh
>>>
>>>>
>>>> Lucas De Marchi
>>>>
>>>>> +            xe_pm_runtime_put(xe);
>>>>> +            drm_dev_put(&xe->drm);
>>>>> +            return -EINVAL;
>>>>> +        }
>>>>>         event->destroy = xe_pmu_event_destroy;
>>>>>     }
>>>>>
>>>>>     return 0;
>>>>> }
>>>>>
>>>>> -static u64 read_engine_events(struct xe_gt *gt, struct perf_event 
>>>>> *event)
>>>>> +static u64 read_engine_events(struct xe_gt *gt, struct perf_event 
>>>>> *event, u64 prev)
>>>>> {
>>>>>     struct xe_device *xe = gt_to_xe(gt);
>>>>> +    struct xe_pmu *pmu = &xe->pmu;
>>>>>     struct xe_hw_engine *hwe;
>>>>>     u64 val = 0;
>>>>>
>>>>> +    if (!pmu->fw_ref)
>>>>> +        return prev;
>>>>> +
>>>>>     hwe = event_to_hwe(event);
>>>>>     if (!hwe)
>>>>>         drm_warn(&xe->drm, "unknown engine\n");
>>>>> @@ -218,7 +266,7 @@ static u64 __xe_pmu_event_read(struct 
>>>>> perf_event *event, u64 prev)
>>>>>         return xe_gt_idle_residency_msec(&gt->gtidle);
>>>>>     case XE_PMU_EVENT_ENGINE_ACTIVE_TICKS:
>>>>>     case XE_PMU_EVENT_ENGINE_TOTAL_TICKS:
>>>>> -        return read_engine_events(gt, event);
>>>>> +        return read_engine_events(gt, event, prev);
>>>>>     }
>>>>>
>>>>>     return 0;
>>>>> diff --git a/drivers/gpu/drm/xe/xe_pmu_types.h b/drivers/gpu/drm/ 
>>>>> xe/xe_pmu_types.h
>>>>> index f5ba4d56622c..07c4e592106e 100644
>>>>> --- a/drivers/gpu/drm/xe/xe_pmu_types.h
>>>>> +++ b/drivers/gpu/drm/xe/xe_pmu_types.h
>>>>> @@ -30,6 +30,10 @@ struct xe_pmu {
>>>>>      * @name: Name as registered with perf core.
>>>>>      */
>>>>>     const char *name;
>>>>> +    /**
>>>>> +     * @fw_ref: force_wake ref
>>>>> +     */
>>>>> +    unsigned int fw_ref;
>>>>>     /**
>>>>>      * @supported_events: Bitmap of supported events, indexed by 
>>>>> event id
>>>>>      */
>>>>> -- 
>>>>> 2.47.1
>>>>>



More information about the Intel-xe mailing list