[PATCH] drm/amd: Don't wake dGPUs while reading sensors

Alex Deucher alexdeucher at gmail.com
Fri Aug 23 14:31:35 UTC 2024


On Fri, Aug 23, 2024 at 10:13 AM Mario Limonciello
<mario.limonciello at amd.com> wrote:
>
> On 8/23/2024 09:09, Alex Deucher wrote:
> > On Mon, Aug 19, 2024 at 10:30 PM Mario Limonciello <superm1 at kernel.org> wrote:
> >>
> >> From: Mario Limonciello <mario.limonciello at amd.com>
> >>
> >> If the dGPU is off, then reading the sysfs files with a sensor monitoring
> >> application will wake it. Change the behavior to return an error when the
> >> dGPU is in D3cold.
> >
> > I'm a little concerned that this will generate a flurry of bug reports
> > if this now reports an error.  One more comment below.
> >
>
> Do you have a particular app you're worried about, or just a general
> worry?  I've had a lot of people reach out to me complaining about
> battery life on A+A systems, and it comes down to the use of sensor
> monitoring software waking the dGPU which people don't seem to expect.

Nothing in particular.  Just expecting reports of "I checked my GPU
temperature and it returned an error.  It was working with the last
kernel."

>
> I did double check that software like 'sensors', 'mission center' and
> 'nvtop' don't freak out from this change.

Do we know what other devices do?  I wonder if it would make more
sense to have the userspace tools check the runpm state before trying
to access the device.  Of course there are a lot of them and fixing
them all is non-trivial...

Alex

>
> Here is what 'sensors' shows on my local workstation with this change.
>
> amdgpu-pci-6100
> Adapter: PCI adapter
> vddgfx:           N/A
> ERROR: Can't get value of subfeature fan1_min: Can't read
> ERROR: Can't get value of subfeature fan1_max: Can't read
> fan1:             N/A  (min =    0 RPM, max =    0 RPM)
> edge:             N/A  (crit = +97.0°C, hyst = -273.1°C)
> ERROR: Can't get value of subfeature power1_cap: Can't read
> PPT:              N/A  (cap =   0.00 W)
>
> >>
> >> Signed-off-by: Mario Limonciello <mario.limonciello at amd.com>
> >> ---
> >>   drivers/gpu/drm/amd/pm/amdgpu_pm.c | 90 +++++++++++++++---------------
> >>   1 file changed, 45 insertions(+), 45 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
> >> index c11952a4389bc..d6e38466fbb82 100644
> >> --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
> >> +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
> >> @@ -142,7 +142,7 @@ static ssize_t amdgpu_get_power_dpm_state(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(ddev->dev);
> >> @@ -173,7 +173,7 @@ static ssize_t amdgpu_set_power_dpm_state(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          if (strncmp("battery", buf, strlen("battery")) == 0)
> >> @@ -270,7 +270,7 @@ static ssize_t amdgpu_get_power_dpm_force_performance_level(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(ddev->dev);
> >> @@ -309,7 +309,7 @@ static ssize_t amdgpu_set_power_dpm_force_performance_level(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          if (strncmp("low", buf, strlen("low")) == 0) {
> >> @@ -371,7 +371,7 @@ static ssize_t amdgpu_get_pp_num_states(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(ddev->dev);
> >> @@ -409,7 +409,7 @@ static ssize_t amdgpu_get_pp_cur_state(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(ddev->dev);
> >> @@ -448,7 +448,7 @@ static ssize_t amdgpu_get_pp_force_state(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          if (adev->pm.pp_force_state_enabled)
> >> @@ -471,7 +471,7 @@ static ssize_t amdgpu_set_pp_force_state(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          adev->pm.pp_force_state_enabled = false;
> >> @@ -541,7 +541,7 @@ static ssize_t amdgpu_get_pp_table(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(ddev->dev);
> >> @@ -577,7 +577,7 @@ static ssize_t amdgpu_set_pp_table(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(ddev->dev);
> >> @@ -760,7 +760,7 @@ static ssize_t amdgpu_set_pp_od_clk_voltage(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          if (count > 127 || count == 0)
> >> @@ -862,7 +862,7 @@ static ssize_t amdgpu_get_pp_od_clk_voltage(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(ddev->dev);
> >> @@ -922,7 +922,7 @@ static ssize_t amdgpu_set_pp_features(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = kstrtou64(buf, 0, &featuremask);
> >> @@ -957,7 +957,7 @@ static ssize_t amdgpu_get_pp_features(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(ddev->dev);
> >> @@ -1026,7 +1026,7 @@ static ssize_t amdgpu_get_pp_dpm_clock(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(ddev->dev);
> >> @@ -1095,7 +1095,7 @@ static ssize_t amdgpu_set_pp_dpm_clock(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = amdgpu_read_mask(buf, count, &mask);
> >> @@ -1280,7 +1280,7 @@ static ssize_t amdgpu_get_pp_sclk_od(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(ddev->dev);
> >> @@ -1309,7 +1309,7 @@ static ssize_t amdgpu_set_pp_sclk_od(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = kstrtol(buf, 0, &value);
> >> @@ -1342,7 +1342,7 @@ static ssize_t amdgpu_get_pp_mclk_od(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(ddev->dev);
> >> @@ -1371,7 +1371,7 @@ static ssize_t amdgpu_set_pp_mclk_od(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = kstrtol(buf, 0, &value);
> >> @@ -1424,7 +1424,7 @@ static ssize_t amdgpu_get_pp_power_profile_mode(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(ddev->dev);
> >> @@ -1463,7 +1463,7 @@ static ssize_t amdgpu_set_pp_power_profile_mode(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          tmp[0] = *(buf);
> >> @@ -1517,7 +1517,7 @@ static int amdgpu_hwmon_get_sensor_generic(struct amdgpu_device *adev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          r = pm_runtime_get_sync(adev_to_drm(adev)->dev);
> >> @@ -1630,7 +1630,7 @@ static ssize_t amdgpu_get_pcie_bw(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          if (adev->flags & AMD_IS_APU)
> >> @@ -1673,7 +1673,7 @@ static ssize_t amdgpu_get_unique_id(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          if (adev->unique_id)
> >> @@ -1846,7 +1846,7 @@ static ssize_t amdgpu_get_pm_metrics(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(ddev->dev);
> >> @@ -1887,7 +1887,7 @@ static ssize_t amdgpu_get_gpu_metrics(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(ddev->dev);
> >> @@ -2005,7 +2005,7 @@ static ssize_t amdgpu_set_smartshift_bias(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          r = pm_runtime_get_sync(ddev->dev);
> >> @@ -2227,7 +2227,7 @@ static ssize_t amdgpu_get_xgmi_plpd_policy(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          mode = amdgpu_dpm_get_xgmi_plpd_mode(adev, &mode_desc);
> >> @@ -2250,7 +2250,7 @@ static ssize_t amdgpu_set_xgmi_plpd_policy(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = kstrtos32(buf, 0, &mode);
> >> @@ -2652,7 +2652,7 @@ static ssize_t amdgpu_hwmon_get_pwm1_enable(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(adev_to_drm(adev)->dev);
> >> @@ -2684,7 +2684,7 @@ static ssize_t amdgpu_hwmon_set_pwm1_enable(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          err = kstrtoint(buf, 10, &value);
> >> @@ -2742,7 +2742,7 @@ static ssize_t amdgpu_hwmon_set_pwm1(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          err = kstrtou32(buf, 10, &value);
> >> @@ -2787,7 +2787,7 @@ static ssize_t amdgpu_hwmon_get_pwm1(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          err = pm_runtime_get_sync(adev_to_drm(adev)->dev);
> >> @@ -2817,7 +2817,7 @@ static ssize_t amdgpu_hwmon_get_fan1_input(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          err = pm_runtime_get_sync(adev_to_drm(adev)->dev);
> >> @@ -2881,7 +2881,7 @@ static ssize_t amdgpu_hwmon_get_fan1_target(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          err = pm_runtime_get_sync(adev_to_drm(adev)->dev);
> >> @@ -2912,7 +2912,7 @@ static ssize_t amdgpu_hwmon_set_fan1_target(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          err = kstrtou32(buf, 10, &value);
> >> @@ -2956,7 +2956,7 @@ static ssize_t amdgpu_hwmon_get_fan1_enable(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(adev_to_drm(adev)->dev);
> >> @@ -2988,7 +2988,7 @@ static ssize_t amdgpu_hwmon_set_fan1_enable(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          err = kstrtoint(buf, 10, &value);
> >> @@ -3128,7 +3128,7 @@ static ssize_t amdgpu_hwmon_show_power_cap_generic(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          r = pm_runtime_get_sync(adev_to_drm(adev)->dev);
> >> @@ -3209,7 +3209,7 @@ static ssize_t amdgpu_hwmon_set_power_cap(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          if (amdgpu_sriov_vf(adev))
> >> @@ -3663,7 +3663,7 @@ static int amdgpu_retrieve_od_settings(struct amdgpu_device *adev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(adev->dev);
> >> @@ -3747,7 +3747,7 @@ amdgpu_distribute_custom_od_settings(struct amdgpu_device *adev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = parse_input_od_command_lines(in_buf,
> >> @@ -4626,7 +4626,7 @@ static int amdgpu_debugfs_pm_info_show(struct seq_file *m, void *unused)
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >
> > I'd prefer to keep the current behavior for debugfs.
>
> OK.  I'll exclude it for debugfs in the next spin.
>
> >
> > Alex
> >
> >>
> >>          r = pm_runtime_get_sync(dev->dev);
> >> @@ -4671,7 +4671,7 @@ static ssize_t amdgpu_pm_prv_buffer_read(struct file *f, char __user *buf,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = amdgpu_dpm_get_smu_prv_buf_details(adev, &smu_prv_buf, &smu_prv_buf_size);
> >> --
> >> 2.43.0
> >>
>


More information about the amd-gfx mailing list