[PATCH] perf/x86/rapl: Fix PP1 event for Intel Meteor/Lunar Lake
Lucas De Marchi
lucas.demarchi at intel.com
Thu Feb 20 18:30:56 UTC 2025
On Thu, Feb 20, 2025 at 11:47:21AM -0500, Liang, Kan wrote:
>
>
>On 2025-02-20 10:36 a.m., Lucas De Marchi wrote:
>> On some boots the read of MSR_PP1_ENERGY_STATUS msr returns 0, causing
>> perf_msr_probe() to make the power/events/energy-gpu event non-visible.
>> When that happens, the msr always read 0 until the graphics module (i915
>> for Meteor Lake, xe for Lunar Lake) is loaded. Then it starts returning
>> something different and re-loading the rapl module "fixes" it.
>>
>> This is tested on the following platforms with the fail rates before
>> this patch:
>>
>> Alder Lake S 0/20
>> Arrow Lake H 0/20
>> Lunar Lake M 8/20
>> Meteor Lake U 6/20
>> Raptor Lake P 4/20
>> Raptor Lake S 0/20
>>
>> For those platforms failing, use a separate msr list with .no_check
>> set so it doesn't check the runtime value to create the event - it will
>> just return 0 until the i915/xe module initializes the GPU.
>>
>> The issue https://github.com/ulissesf/qmassa/issues/4 is workarounded by
>> reading the MSR directly since it works after xe is loaded, but the
>> issue with not having the perf event is still there.
>>
>> Closes: https://github.com/ulissesf/qmassa/issues/4
>> Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/4241
>> Signed-off-by: Lucas De Marchi <lucas.demarchi at intel.com
>> ---
>>
>> Maybe a clearer alternative is to just move all the platforms after
>> RAPTORLAKE with a gpu to use the new msr list.
>>
>> arch/x86/events/rapl.c | 26 ++++++++++++++++++++++----
>> 1 file changed, 22 insertions(+), 4 deletions(-)
>>
>> diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
>> index 4952faf03e82d..18e324b8fa82c 100644
>> --- a/arch/x86/events/rapl.c
>> +++ b/arch/x86/events/rapl.c
>> @@ -588,6 +588,14 @@ static struct perf_msr intel_rapl_spr_msrs[] = {
>> [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr, true, RAPL_MSR_MASK },
>> };
>>
>> +static struct perf_msr intel_rapl_mtl_msrs[] = {
>> + [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
>> + [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK },
>> + [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr, false, RAPL_MSR_MASK },
>> + [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr, true, RAPL_MSR_MASK },
>> + [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr, false, RAPL_MSR_MASK },
>> +};
>> +
>> /*
>> * Force to PERF_RAPL_PKG_EVENTS_MAX size due to:
>> * - perf_msr_probe(PERF_RAPL_PKG_EVENTS_MAX)
>> @@ -826,6 +834,16 @@ static struct rapl_model model_spr = {
>> .rapl_pkg_msrs = intel_rapl_spr_msrs,
>> };
>>
>> +static struct rapl_model model_rpl = {
>> + .pkg_events = BIT(PERF_RAPL_PP0) |
>> + BIT(PERF_RAPL_PKG) |
>> + BIT(PERF_RAPL_RAM) |
>> + BIT(PERF_RAPL_PP1) |
>> + BIT(PERF_RAPL_PSYS),
>> + .msr_power_unit = MSR_RAPL_POWER_UNIT,
>> + .rapl_pkg_msrs = intel_rapl_mtl_msrs,
>
>It's better to make the name consistent, e.g., intel_rapl_rpl_msrs.
that's what happens when you decide to test on RPL just before sending
and forget to rename all the variables. Thanks for noticing.
I will rename it on next version if we decide to keep this approach.
Also please let me know what you think about moving arl and other rpl to
model_rpl, too.
thanks
Lucas De Marchi
>
>Thanks,
>Kan
>> +};
>> +
>> static struct rapl_model model_amd_hygon = {
>> .pkg_events = BIT(PERF_RAPL_PKG),
>> .core_events = BIT(PERF_RAPL_CORE),
>> @@ -873,13 +891,13 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = {
>> X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &model_spr),
>> X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &model_spr),
>> X86_MATCH_VFM(INTEL_RAPTORLAKE, &model_skl),
>> - X86_MATCH_VFM(INTEL_RAPTORLAKE_P, &model_skl),
>> + X86_MATCH_VFM(INTEL_RAPTORLAKE_P, &model_rpl),
>> X86_MATCH_VFM(INTEL_RAPTORLAKE_S, &model_skl),
>> - X86_MATCH_VFM(INTEL_METEORLAKE, &model_skl),
>> - X86_MATCH_VFM(INTEL_METEORLAKE_L, &model_skl),
>> + X86_MATCH_VFM(INTEL_METEORLAKE, &model_rpl),
>> + X86_MATCH_VFM(INTEL_METEORLAKE_L, &model_rpl),
>> X86_MATCH_VFM(INTEL_ARROWLAKE_H, &model_skl),
>> X86_MATCH_VFM(INTEL_ARROWLAKE, &model_skl),
>> - X86_MATCH_VFM(INTEL_LUNARLAKE_M, &model_skl),
>> + X86_MATCH_VFM(INTEL_LUNARLAKE_M, &model_rpl),
>> {},
>> };
>> MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);
>
More information about the Intel-xe
mailing list