[PATCH] drm/amd: Don't wake dGPUs while reading sensors
Mario Limonciello
mario.limonciello at amd.com
Fri Aug 23 14:39:44 UTC 2024
On 8/23/2024 09:31, Alex Deucher wrote:
> On Fri, Aug 23, 2024 at 10:13 AM Mario Limonciello
> <mario.limonciello at amd.com> wrote:
>>
>> On 8/23/2024 09:09, Alex Deucher wrote:
>>> On Mon, Aug 19, 2024 at 10:30 PM Mario Limonciello <superm1 at kernel.org> wrote:
>>>>
>>>> From: Mario Limonciello <mario.limonciello at amd.com>
>>>>
>>>> If the dGPU is off, then reading the sysfs files with a sensor monitoring
>>>> application will wake it. Change the behavior to return an error when the
>>>> dGPU is in D3cold.
>>>
>>> I'm a little concerned that this will generate a flurry of bug reports
>>> if this now reports an error. One more comment below.
>>>
>>
>> Do you have a particular app you're worried about, or just a general
>> worry? I've had a lot of people reach out to me complaining about
>> battery life on A+A systems, and it comes down to the use of sensor
>> monitoring software waking the dGPU which people don't seem to expect.
>
> Nothing in particular. Just expecting reports of "I checked my GPU
> temperature and it returned an error. It was working with the last
> kernel."
>
>>
>> I did double check that software like 'sensors', 'mission center' and
>> 'nvtop' don't freak out from this change.
>
> Do we know what other devices do?
I'm not sure. Let me CC Luke Jones, he might know.
> I wonder if it would make more
> sense to have the userspace tools check the runpm state before trying
> to access the device. Of course there are a lot of them and fixing
> them all is non-trivial...
Yes that's another way to go about it. But I've raised a bug with
mission center folks 8 months ago [1] to tell them to stop querying
dGPUs in runtime PM, and Luke Jones also raised it with them 4 months
earlier [2] but it's still not sorted in their project.
[1] https://gitlab.com/mission-center-devs/mission-center/-/issues/143
[2] https://gitlab.com/mission-center-devs/mission-center/-/issues/30
I suspect we'll hit similar road blocks with the dozens of other
softwares that do this. So that's why I was thinking cut off the
snake's head.
>
> Alex
>
>>
>> Here is what 'sensors' shows on my local workstation with this change.
>>
>> amdgpu-pci-6100
>> Adapter: PCI adapter
>> vddgfx: N/A
>> ERROR: Can't get value of subfeature fan1_min: Can't read
>> ERROR: Can't get value of subfeature fan1_max: Can't read
>> fan1: N/A (min = 0 RPM, max = 0 RPM)
>> edge: N/A (crit = +97.0°C, hyst = -273.1°C)
>> ERROR: Can't get value of subfeature power1_cap: Can't read
>> PPT: N/A (cap = 0.00 W)
>>
>>>>
>>>> Signed-off-by: Mario Limonciello <mario.limonciello at amd.com>
>>>> ---
>>>> drivers/gpu/drm/amd/pm/amdgpu_pm.c | 90 +++++++++++++++---------------
>>>> 1 file changed, 45 insertions(+), 45 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
>>>> index c11952a4389bc..d6e38466fbb82 100644
>>>> --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
>>>> +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
>>>> @@ -142,7 +142,7 @@ static ssize_t amdgpu_get_power_dpm_state(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> ret = pm_runtime_get_sync(ddev->dev);
>>>> @@ -173,7 +173,7 @@ static ssize_t amdgpu_set_power_dpm_state(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> if (strncmp("battery", buf, strlen("battery")) == 0)
>>>> @@ -270,7 +270,7 @@ static ssize_t amdgpu_get_power_dpm_force_performance_level(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> ret = pm_runtime_get_sync(ddev->dev);
>>>> @@ -309,7 +309,7 @@ static ssize_t amdgpu_set_power_dpm_force_performance_level(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> if (strncmp("low", buf, strlen("low")) == 0) {
>>>> @@ -371,7 +371,7 @@ static ssize_t amdgpu_get_pp_num_states(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> ret = pm_runtime_get_sync(ddev->dev);
>>>> @@ -409,7 +409,7 @@ static ssize_t amdgpu_get_pp_cur_state(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> ret = pm_runtime_get_sync(ddev->dev);
>>>> @@ -448,7 +448,7 @@ static ssize_t amdgpu_get_pp_force_state(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> if (adev->pm.pp_force_state_enabled)
>>>> @@ -471,7 +471,7 @@ static ssize_t amdgpu_set_pp_force_state(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> adev->pm.pp_force_state_enabled = false;
>>>> @@ -541,7 +541,7 @@ static ssize_t amdgpu_get_pp_table(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> ret = pm_runtime_get_sync(ddev->dev);
>>>> @@ -577,7 +577,7 @@ static ssize_t amdgpu_set_pp_table(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> ret = pm_runtime_get_sync(ddev->dev);
>>>> @@ -760,7 +760,7 @@ static ssize_t amdgpu_set_pp_od_clk_voltage(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> if (count > 127 || count == 0)
>>>> @@ -862,7 +862,7 @@ static ssize_t amdgpu_get_pp_od_clk_voltage(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> ret = pm_runtime_get_sync(ddev->dev);
>>>> @@ -922,7 +922,7 @@ static ssize_t amdgpu_set_pp_features(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> ret = kstrtou64(buf, 0, &featuremask);
>>>> @@ -957,7 +957,7 @@ static ssize_t amdgpu_get_pp_features(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> ret = pm_runtime_get_sync(ddev->dev);
>>>> @@ -1026,7 +1026,7 @@ static ssize_t amdgpu_get_pp_dpm_clock(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> ret = pm_runtime_get_sync(ddev->dev);
>>>> @@ -1095,7 +1095,7 @@ static ssize_t amdgpu_set_pp_dpm_clock(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> ret = amdgpu_read_mask(buf, count, &mask);
>>>> @@ -1280,7 +1280,7 @@ static ssize_t amdgpu_get_pp_sclk_od(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> ret = pm_runtime_get_sync(ddev->dev);
>>>> @@ -1309,7 +1309,7 @@ static ssize_t amdgpu_set_pp_sclk_od(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> ret = kstrtol(buf, 0, &value);
>>>> @@ -1342,7 +1342,7 @@ static ssize_t amdgpu_get_pp_mclk_od(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> ret = pm_runtime_get_sync(ddev->dev);
>>>> @@ -1371,7 +1371,7 @@ static ssize_t amdgpu_set_pp_mclk_od(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> ret = kstrtol(buf, 0, &value);
>>>> @@ -1424,7 +1424,7 @@ static ssize_t amdgpu_get_pp_power_profile_mode(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> ret = pm_runtime_get_sync(ddev->dev);
>>>> @@ -1463,7 +1463,7 @@ static ssize_t amdgpu_set_pp_power_profile_mode(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> tmp[0] = *(buf);
>>>> @@ -1517,7 +1517,7 @@ static int amdgpu_hwmon_get_sensor_generic(struct amdgpu_device *adev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> r = pm_runtime_get_sync(adev_to_drm(adev)->dev);
>>>> @@ -1630,7 +1630,7 @@ static ssize_t amdgpu_get_pcie_bw(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> if (adev->flags & AMD_IS_APU)
>>>> @@ -1673,7 +1673,7 @@ static ssize_t amdgpu_get_unique_id(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> if (adev->unique_id)
>>>> @@ -1846,7 +1846,7 @@ static ssize_t amdgpu_get_pm_metrics(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> ret = pm_runtime_get_sync(ddev->dev);
>>>> @@ -1887,7 +1887,7 @@ static ssize_t amdgpu_get_gpu_metrics(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> ret = pm_runtime_get_sync(ddev->dev);
>>>> @@ -2005,7 +2005,7 @@ static ssize_t amdgpu_set_smartshift_bias(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> r = pm_runtime_get_sync(ddev->dev);
>>>> @@ -2227,7 +2227,7 @@ static ssize_t amdgpu_get_xgmi_plpd_policy(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> mode = amdgpu_dpm_get_xgmi_plpd_mode(adev, &mode_desc);
>>>> @@ -2250,7 +2250,7 @@ static ssize_t amdgpu_set_xgmi_plpd_policy(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> ret = kstrtos32(buf, 0, &mode);
>>>> @@ -2652,7 +2652,7 @@ static ssize_t amdgpu_hwmon_get_pwm1_enable(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> ret = pm_runtime_get_sync(adev_to_drm(adev)->dev);
>>>> @@ -2684,7 +2684,7 @@ static ssize_t amdgpu_hwmon_set_pwm1_enable(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> err = kstrtoint(buf, 10, &value);
>>>> @@ -2742,7 +2742,7 @@ static ssize_t amdgpu_hwmon_set_pwm1(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> err = kstrtou32(buf, 10, &value);
>>>> @@ -2787,7 +2787,7 @@ static ssize_t amdgpu_hwmon_get_pwm1(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> err = pm_runtime_get_sync(adev_to_drm(adev)->dev);
>>>> @@ -2817,7 +2817,7 @@ static ssize_t amdgpu_hwmon_get_fan1_input(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> err = pm_runtime_get_sync(adev_to_drm(adev)->dev);
>>>> @@ -2881,7 +2881,7 @@ static ssize_t amdgpu_hwmon_get_fan1_target(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> err = pm_runtime_get_sync(adev_to_drm(adev)->dev);
>>>> @@ -2912,7 +2912,7 @@ static ssize_t amdgpu_hwmon_set_fan1_target(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> err = kstrtou32(buf, 10, &value);
>>>> @@ -2956,7 +2956,7 @@ static ssize_t amdgpu_hwmon_get_fan1_enable(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> ret = pm_runtime_get_sync(adev_to_drm(adev)->dev);
>>>> @@ -2988,7 +2988,7 @@ static ssize_t amdgpu_hwmon_set_fan1_enable(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> err = kstrtoint(buf, 10, &value);
>>>> @@ -3128,7 +3128,7 @@ static ssize_t amdgpu_hwmon_show_power_cap_generic(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> r = pm_runtime_get_sync(adev_to_drm(adev)->dev);
>>>> @@ -3209,7 +3209,7 @@ static ssize_t amdgpu_hwmon_set_power_cap(struct device *dev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> if (amdgpu_sriov_vf(adev))
>>>> @@ -3663,7 +3663,7 @@ static int amdgpu_retrieve_od_settings(struct amdgpu_device *adev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> ret = pm_runtime_get_sync(adev->dev);
>>>> @@ -3747,7 +3747,7 @@ amdgpu_distribute_custom_od_settings(struct amdgpu_device *adev,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> ret = parse_input_od_command_lines(in_buf,
>>>> @@ -4626,7 +4626,7 @@ static int amdgpu_debugfs_pm_info_show(struct seq_file *m, void *unused)
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>
>>> I'd prefer to keep the current behavior for debugfs.
>>
>> OK. I'll exclude it for debugfs in the next spin.
>>
>>>
>>> Alex
>>>
>>>>
>>>> r = pm_runtime_get_sync(dev->dev);
>>>> @@ -4671,7 +4671,7 @@ static ssize_t amdgpu_pm_prv_buffer_read(struct file *f, char __user *buf,
>>>>
>>>> if (amdgpu_in_reset(adev))
>>>> return -EPERM;
>>>> - if (adev->in_suspend && !adev->in_runpm)
>>>> + if (adev->in_suspend || adev->in_runpm)
>>>> return -EPERM;
>>>>
>>>> ret = amdgpu_dpm_get_smu_prv_buf_details(adev, &smu_prv_buf, &smu_prv_buf_size);
>>>> --
>>>> 2.43.0
>>>>
>>
More information about the amd-gfx
mailing list