[Nouveau] [PATCH 03/32] therm: Split return code and value in nvkm_get_temp
Karol Herbst
kherbst at redhat.com
Wed Nov 22 10:38:50 UTC 2017
On Wed, Nov 22, 2017 at 9:29 AM, Martin Peres <martin.peres at free.fr> wrote:
> On 22/11/17 03:42, Karol Herbst wrote:
>> On Wed, Nov 22, 2017 at 1:32 AM, Martin Peres <martin.peres at free.fr> wrote:
>>> On 17/11/17 02:04, Karol Herbst wrote:
>>>> The current hwmon code doesn't check if the returned value was actually an
>>>> error.
>>>>
>>>> Since Kepler temperature sensors are able to report negative values. Those
>>>> negative values are not for error reporting, but rather when you buried
>>>> your GPU in snow somewhere in Antarctica and still want a valid
>>>> temperature to be reported (unverified).
>>>>
>>>> Since Pascal (and maybe earlier) we have sensors with improved precision.
>>>>
>>>> Adjust the nvkm_get_temp method to be able to deal with those changes
>>>> and let hwmon return an error properly.
>>>
>>> Where did you get this information? And if it is the case, then we will
>>> royally screw up the value because we don't even know on how many bits
>>> 0x20400 uses...
>>>
>>> If you are indeed right, I would like to see g84.c's temp_get().
>>>
>>
>> Nvidia uses 0x020460 on Pascal and I thought we had that in envytools already?
>
> Nothing in nvbios.
>
>>
>> [0] 228.412618 MMIO32 R 0x020460 0x20002f20 PTHERM.I2C_SLAVE+0x60 => 0x20002f20
>> [0] 229.413039 MMIO32 R 0x020460 0x20002f10 PTHERM.I2C_SLAVE+0x60 => 0x20002f10
>> [0] 230.416976 MMIO32 R 0x020460 0x20002f68 PTHERM.I2C_SLAVE+0x60 => 0x20002f68
>> [0] 231.417407 MMIO32 R 0x020460 0x20002fe8 PTHERM.I2C_SLAVE+0x60 => 0x20002fe8
>> [0] 232.417742 MMIO32 R 0x020460 0x20002ff0 PTHERM.I2C_SLAVE+0x60 => 0x20002ff0
>> [0] 233.417742 MMIO32 R 0x020460 0x20003020 PTHERM.I2C_SLAVE+0x60 => 0x20003020
>>
>> or nvapeek 0x20400 && nvapeek 0x020460;
>> 00020400: 0000002c
>> 00020460: 20002c60
>
> Cool that they increased the accuracy, but where do you get that the
> value can be negative? And if it is, how many bits are used? 16 or more?
> What are those high bits?
>
I was asking rhyskidd to add stuff to rnndb, because he wrote the
nouveau patch for those sensors, check out for more details:
https://github.com/skeggsb/nouveau/blob/master/drm/nouveau/nvkm/subdev/therm/gp100.c#L27
regarding negative values, we have 0x02044c + 0x020450 for that and I
could imagine that 0x020460 can display negative values as well...
nobody tried for sure, but it seems plausible.
Anyway, moving from u8 to int should make life much easier for us if
we decide to really care about the higher precision. Currently our
code can't handle that for multiple reasons, but this solves one of
those at least.
>>
>>>>
>>>> Signed-off-by: Karol Herbst <karolherbst at gmail.com>
>>>> ---
>>>> drm/nouveau/include/nvkm/subdev/clk.h | 4 ++--
>>>> drm/nouveau/include/nvkm/subdev/therm.h | 2 +-
>>>> drm/nouveau/nouveau_hwmon.c | 15 +++++++++------
>>>> drm/nouveau/nvkm/subdev/clk/base.c | 2 +-
>>>> drm/nouveau/nvkm/subdev/therm/base.c | 19 ++++++++++++++-----
>>>> drm/nouveau/nvkm/subdev/therm/g84.c | 13 ++++++++-----
>>>> drm/nouveau/nvkm/subdev/therm/gp100.c | 9 +++++----
>>>> drm/nouveau/nvkm/subdev/therm/nv40.c | 9 +++------
>>>> drm/nouveau/nvkm/subdev/therm/nv50.c | 9 +++------
>>>> drm/nouveau/nvkm/subdev/therm/priv.h | 4 ++--
>>>> drm/nouveau/nvkm/subdev/therm/temp.c | 16 ++++++++++++----
>>>> 11 files changed, 60 insertions(+), 42 deletions(-)
>>>>
>>>> diff --git a/drm/nouveau/include/nvkm/subdev/clk.h b/drm/nouveau/include/nvkm/subdev/clk.h
>>>> index e5275f74..506f8cc6 100644
>>>> --- a/drm/nouveau/include/nvkm/subdev/clk.h
>>>> +++ b/drm/nouveau/include/nvkm/subdev/clk.h
>>>> @@ -100,7 +100,7 @@ struct nvkm_clk {
>>>> int ustate_dc; /* user-requested (-1 disabled, -2 perfmon) */
>>>> int astate; /* perfmon adjustment (base) */
>>>> int dstate; /* display adjustment (min+) */
>>>> - u8 temp;
>>>> + int temp;
>>>>
>>>> bool allow_reclock;
>>>> #define NVKM_CLK_BOOST_NONE 0x0
>>>> @@ -122,7 +122,7 @@ int nvkm_clk_read(struct nvkm_clk *, enum nv_clk_src);
>>>> int nvkm_clk_ustate(struct nvkm_clk *, int req, int pwr);
>>>> int nvkm_clk_astate(struct nvkm_clk *, int req, int rel, bool wait);
>>>> int nvkm_clk_dstate(struct nvkm_clk *, int req, int rel);
>>>> -int nvkm_clk_tstate(struct nvkm_clk *, u8 temperature);
>>>> +int nvkm_clk_tstate(struct nvkm_clk *, int temperature);
>>>>
>>>> int nv04_clk_new(struct nvkm_device *, int, struct nvkm_clk **);
>>>> int nv40_clk_new(struct nvkm_device *, int, struct nvkm_clk **);
>>>> diff --git a/drm/nouveau/include/nvkm/subdev/therm.h b/drm/nouveau/include/nvkm/subdev/therm.h
>>>> index 9841f076..8c84017f 100644
>>>> --- a/drm/nouveau/include/nvkm/subdev/therm.h
>>>> +++ b/drm/nouveau/include/nvkm/subdev/therm.h
>>>> @@ -86,7 +86,7 @@ struct nvkm_therm {
>>>> int (*attr_set)(struct nvkm_therm *, enum nvkm_therm_attr_type, int);
>>>> };
>>>>
>>>> -int nvkm_therm_temp_get(struct nvkm_therm *);
>>>> +int nvkm_therm_temp_get(struct nvkm_therm *, int *);
>>>> int nvkm_therm_fan_sense(struct nvkm_therm *);
>>>> int nvkm_therm_cstate(struct nvkm_therm *, int, int);
>>>>
>>>> diff --git a/drm/nouveau/nouveau_hwmon.c b/drm/nouveau/nouveau_hwmon.c
>>>> index 7c965648..7486e4af 100644
>>>> --- a/drm/nouveau/nouveau_hwmon.c
>>>> +++ b/drm/nouveau/nouveau_hwmon.c
>>>> @@ -326,8 +326,9 @@ nouveau_temp_is_visible(const void *data, u32 attr, int channel)
>>>> {
>>>> struct nouveau_drm *drm = nouveau_drm((struct drm_device *)data);
>>>> struct nvkm_therm *therm = nvxx_therm(&drm->client.device);
>>>> + int val;
>>>>
>>>> - if (therm && therm->attr_get && nvkm_therm_temp_get(therm) < 0)
>>>> + if (therm && therm->attr_get && nvkm_therm_temp_get(therm, &val))
>>>> return 0;
>>>>
>>>> switch (attr) {
>>>> @@ -421,15 +422,16 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
>>>> struct drm_device *drm_dev = dev_get_drvdata(dev);
>>>> struct nouveau_drm *drm = nouveau_drm(drm_dev);
>>>> struct nvkm_therm *therm = nvxx_therm(&drm->client.device);
>>>> - int ret;
>>>> + int ret = 0;
>>>> + int temp;
>>>>
>>>> if (!therm || !therm->attr_get)
>>>> return -EOPNOTSUPP;
>>>>
>>>> switch (attr) {
>>>> case hwmon_temp_input:
>>>> - ret = nvkm_therm_temp_get(therm);
>>>> - *val = ret < 0 ? ret : (ret * 1000);
>>>> + ret = nvkm_therm_temp_get(therm, &temp);
>>>> + *val = temp * 1000;
>>>> break;
>>>> case hwmon_temp_max:
>>>> *val = therm->attr_get(therm, NVKM_THERM_ATTR_THRS_DOWN_CLK)
>>>> @@ -459,7 +461,7 @@ nouveau_temp_read(struct device *dev, u32 attr, int channel, long *val)
>>>> return -EOPNOTSUPP;
>>>> }
>>>>
>>>> - return 0;
>>>> + return ret;
>>>> }
>>>>
>>>> static int
>>>> @@ -713,6 +715,7 @@ nouveau_hwmon_init(struct drm_device *dev)
>>>> struct device *hwmon_dev;
>>>> int ret = 0;
>>>> int i = 0;
>>>> + int val;
>>>>
>>>> hwmon = drm->hwmon = kzalloc(sizeof(*hwmon), GFP_KERNEL);
>>>> if (!hwmon)
>>>> @@ -720,7 +723,7 @@ nouveau_hwmon_init(struct drm_device *dev)
>>>> hwmon->dev = dev;
>>>>
>>>> if (therm && therm->attr_get && therm->attr_set) {
>>>> - if (nvkm_therm_temp_get(therm) >= 0)
>>>> + if (!nvkm_therm_temp_get(therm, &val))
>>>> special_groups[i++] = &temp1_auto_point_sensor_group;
>>>> if (therm->fan_get && therm->fan_get(therm) >= 0)
>>>> special_groups[i++] = &pwm_fan_sensor_group;
>>>> diff --git a/drm/nouveau/nvkm/subdev/clk/base.c b/drm/nouveau/nvkm/subdev/clk/base.c
>>>> index e4c8d310..0b28dbb9 100644
>>>> --- a/drm/nouveau/nvkm/subdev/clk/base.c
>>>> +++ b/drm/nouveau/nvkm/subdev/clk/base.c
>>>> @@ -540,7 +540,7 @@ nvkm_clk_astate(struct nvkm_clk *clk, int req, int rel, bool wait)
>>>> }
>>>>
>>>> int
>>>> -nvkm_clk_tstate(struct nvkm_clk *clk, u8 temp)
>>>> +nvkm_clk_tstate(struct nvkm_clk *clk, int temp)
>>>> {
>>>> if (clk->temp == temp)
>>>> return 0;
>>>> diff --git a/drm/nouveau/nvkm/subdev/therm/base.c b/drm/nouveau/nvkm/subdev/therm/base.c
>>>> index f27fc6d0..8e5f6f7f 100644
>>>> --- a/drm/nouveau/nvkm/subdev/therm/base.c
>>>> +++ b/drm/nouveau/nvkm/subdev/therm/base.c
>>>> @@ -24,22 +24,26 @@
>>>> #include "priv.h"
>>>>
>>>> int
>>>> -nvkm_therm_temp_get(struct nvkm_therm *therm)
>>>> +nvkm_therm_temp_get(struct nvkm_therm *therm, int *val)
>>>> {
>>>> if (therm->func->temp_get)
>>>> - return therm->func->temp_get(therm);
>>>> + return therm->func->temp_get(therm, val);
>>>> return -ENODEV;
>>>> }
>>>>
>>>> static int
>>>> nvkm_therm_update_trip(struct nvkm_therm *therm)
>>>> {
>>>> + int temp, ret;
>>>> struct nvbios_therm_trip_point *trip = therm->fan->bios.trip,
>>>> *cur_trip = NULL,
>>>> *last_trip = therm->last_trip;
>>>> - u8 temp = therm->func->temp_get(therm);
>>>> u16 duty, i;
>>>>
>>>> + ret = therm->func->temp_get(therm, &temp);
>>>> + if (ret < 0)
>>>> + return ret;
>>>> +
>>>> /* look for the trip point corresponding to the current temperature */
>>>> cur_trip = NULL;
>>>> for (i = 0; i < therm->fan->bios.nr_fan_trip; i++) {
>>>> @@ -67,9 +71,13 @@ static int
>>>> nvkm_therm_compute_linear_duty(struct nvkm_therm *therm, u8 linear_min_temp,
>>>> u8 linear_max_temp)
>>>> {
>>>> - u8 temp = therm->func->temp_get(therm);
>>>> + int temp, ret;
>>>> u16 duty;
>>>>
>>>> + ret = therm->func->temp_get(therm, &temp);
>>>> + if (ret < 0)
>>>> + return ret;
>>>> +
>>>> /* handle the non-linear part first */
>>>> if (temp < linear_min_temp)
>>>> return therm->fan->bios.min_duty;
>>>> @@ -182,6 +190,7 @@ nvkm_therm_fan_mode(struct nvkm_therm *therm, int mode)
>>>> {
>>>> struct nvkm_subdev *subdev = &therm->subdev;
>>>> struct nvkm_device *device = subdev->device;
>>>> + int val;
>>>> static const char *name[] = {
>>>> "disabled",
>>>> "manual",
>>>> @@ -197,7 +206,7 @@ nvkm_therm_fan_mode(struct nvkm_therm *therm, int mode)
>>>> /* do not allow automatic fan management if the thermal sensor is
>>>> * not available */
>>>> if (mode == NVKM_THERM_CTRL_AUTO &&
>>>> - therm->func->temp_get(therm) < 0)
>>>> + therm->func->temp_get(therm, &val))
>>>> return -EINVAL;
>>>>
>>>> if (therm->mode == mode)
>>>> diff --git a/drm/nouveau/nvkm/subdev/therm/g84.c b/drm/nouveau/nvkm/subdev/therm/g84.c
>>>> index 96f8da40..81c0bda8 100644
>>>> --- a/drm/nouveau/nvkm/subdev/therm/g84.c
>>>> +++ b/drm/nouveau/nvkm/subdev/therm/g84.c
>>>> @@ -27,14 +27,15 @@
>>>> #include <subdev/fuse.h>
>>>>
>>>> int
>>>> -g84_temp_get(struct nvkm_therm *therm)
>>>> +g84_temp_get(struct nvkm_therm *therm, int *val)
>>>> {
>>>> struct nvkm_device *device = therm->subdev.device;
>>>>
>>>> - if (nvkm_fuse_read(device->fuse, 0x1a8) == 1)
>>>> - return nvkm_rd32(device, 0x20400);
>>>> - else
>>>> + if (nvkm_fuse_read(device->fuse, 0x1a8) != 1)
>>>> return -ENODEV;
>>>> +
>>>> + *val = nvkm_rd32(device, 0x20400);
>>>> + return 0;
>>>> }
>>>>
>>>> void
>>>> @@ -114,8 +115,10 @@ g84_therm_threshold_hyst_emulation(struct nvkm_therm *therm,
>>>> new_state = NVKM_THERM_THRS_LOWER;
>>>> }
>>>>
>>>> + if (therm->func->temp_get(therm, &cur))
>>>> + return;
>>>> +
>>>> /* fix the state (in case someone reprogrammed the alarms) */
>>>> - cur = therm->func->temp_get(therm);
>>>> if (new_state == NVKM_THERM_THRS_LOWER && cur > thrs->temp)
>>>> new_state = NVKM_THERM_THRS_HIGHER;
>>>> else if (new_state == NVKM_THERM_THRS_HIGHER &&
>>>> diff --git a/drm/nouveau/nvkm/subdev/therm/gp100.c b/drm/nouveau/nvkm/subdev/therm/gp100.c
>>>> index 9f0dea3f..d8206748 100644
>>>> --- a/drm/nouveau/nvkm/subdev/therm/gp100.c
>>>> +++ b/drm/nouveau/nvkm/subdev/therm/gp100.c
>>>> @@ -24,7 +24,7 @@
>>>> #include "priv.h"
>>>>
>>>> static int
>>>> -gp100_temp_get(struct nvkm_therm *therm)
>>>> +gp100_temp_get(struct nvkm_therm *therm, int *val)
>>>> {
>>>> struct nvkm_device *device = therm->subdev.device;
>>>> struct nvkm_subdev *subdev = &therm->subdev;
>>>> @@ -36,9 +36,10 @@ gp100_temp_get(struct nvkm_therm *therm)
>>>> nvkm_trace(subdev, "reading temperature from SHADOWed sensor\n");
>>>>
>>>> /* device valid */
>>>> - if (tsensor & 0x20000000)
>>>> - return (inttemp >> 8);
>>>> - else
>>>> + if (tsensor & 0x20000000) {
>>>> + *val = inttemp >> 8;
>>>> + return 0;
>>>> + } else
>>>> return -ENODEV;
>>>> }
>>>>
>>>> diff --git a/drm/nouveau/nvkm/subdev/therm/nv40.c b/drm/nouveau/nvkm/subdev/therm/nv40.c
>>>> index 2c92ffb5..cfd5b215 100644
>>>> --- a/drm/nouveau/nvkm/subdev/therm/nv40.c
>>>> +++ b/drm/nouveau/nvkm/subdev/therm/nv40.c
>>>> @@ -70,7 +70,7 @@ nv40_sensor_setup(struct nvkm_therm *therm)
>>>> }
>>>>
>>>> static int
>>>> -nv40_temp_get(struct nvkm_therm *therm)
>>>> +nv40_temp_get(struct nvkm_therm *therm, int *val)
>>>> {
>>>> struct nvkm_device *device = therm->subdev.device;
>>>> struct nvbios_therm_sensor *sensor = &therm->bios_sensor;
>>>> @@ -95,11 +95,8 @@ nv40_temp_get(struct nvkm_therm *therm)
>>>> core_temp = core_temp + sensor->offset_num / sensor->offset_den;
>>>> core_temp = core_temp + sensor->offset_constant - 8;
>>>>
>>>> - /* reserve negative temperatures for errors */
>>>> - if (core_temp < 0)
>>>> - core_temp = 0;
>>>> -
>>>> - return core_temp;
>>>> + *val = core_temp;
>>>> + return 0;
>>>> }
>>>>
>>>> static int
>>>> diff --git a/drm/nouveau/nvkm/subdev/therm/nv50.c b/drm/nouveau/nvkm/subdev/therm/nv50.c
>>>> index 9b57b433..62ec4063 100644
>>>> --- a/drm/nouveau/nvkm/subdev/therm/nv50.c
>>>> +++ b/drm/nouveau/nvkm/subdev/therm/nv50.c
>>>> @@ -126,7 +126,7 @@ nv50_sensor_setup(struct nvkm_therm *therm)
>>>> }
>>>>
>>>> static int
>>>> -nv50_temp_get(struct nvkm_therm *therm)
>>>> +nv50_temp_get(struct nvkm_therm *therm, int *val)
>>>> {
>>>> struct nvkm_device *device = therm->subdev.device;
>>>> struct nvbios_therm_sensor *sensor = &therm->bios_sensor;
>>>> @@ -143,11 +143,8 @@ nv50_temp_get(struct nvkm_therm *therm)
>>>> core_temp = core_temp + sensor->offset_num / sensor->offset_den;
>>>> core_temp = core_temp + sensor->offset_constant - 8;
>>>>
>>>> - /* reserve negative temperatures for errors */
>>>> - if (core_temp < 0)
>>>> - core_temp = 0;
>>>> -
>>>> - return core_temp;
>>>> + *val = core_temp;
>>>> + return 0;
>>>> }
>>>>
>>>> static void
>>>> diff --git a/drm/nouveau/nvkm/subdev/therm/priv.h b/drm/nouveau/nvkm/subdev/therm/priv.h
>>>> index 1f46e371..b325ec5f 100644
>>>> --- a/drm/nouveau/nvkm/subdev/therm/priv.h
>>>> +++ b/drm/nouveau/nvkm/subdev/therm/priv.h
>>>> @@ -91,7 +91,7 @@ struct nvkm_therm_func {
>>>> int (*pwm_set)(struct nvkm_therm *, int line, u32, u32);
>>>> int (*pwm_clock)(struct nvkm_therm *, int line);
>>>>
>>>> - int (*temp_get)(struct nvkm_therm *);
>>>> + int (*temp_get)(struct nvkm_therm *, int *);
>>>>
>>>> int (*fan_sense)(struct nvkm_therm *);
>>>>
>>>> @@ -105,7 +105,7 @@ int nv50_fan_pwm_get(struct nvkm_therm *, int, u32 *, u32 *);
>>>> int nv50_fan_pwm_set(struct nvkm_therm *, int, u32, u32);
>>>> int nv50_fan_pwm_clock(struct nvkm_therm *, int);
>>>>
>>>> -int g84_temp_get(struct nvkm_therm *);
>>>> +int g84_temp_get(struct nvkm_therm *, int *);
>>>> void g84_sensor_setup(struct nvkm_therm *);
>>>> void g84_therm_fini(struct nvkm_therm *);
>>>>
>>>> diff --git a/drm/nouveau/nvkm/subdev/therm/temp.c b/drm/nouveau/nvkm/subdev/therm/temp.c
>>>> index ddb2b2c6..e7b8cbe2 100644
>>>> --- a/drm/nouveau/nvkm/subdev/therm/temp.c
>>>> +++ b/drm/nouveau/nvkm/subdev/therm/temp.c
>>>> @@ -86,7 +86,10 @@ nvkm_therm_sensor_event(struct nvkm_therm *therm, enum nvkm_therm_thrs thrs,
>>>> static const char * const thresholds[] = {
>>>> "fanboost", "downclock", "critical", "shutdown"
>>>> };
>>>> - int temperature = therm->func->temp_get(therm);
>>>> + int temperature;
>>>> +
>>>> + if (therm->func->temp_get(therm, &temperature))
>>>> + return;
>>>>
>>>> if (thrs < 0 || thrs > 3)
>>>> return;
>>>> @@ -140,7 +143,10 @@ nvkm_therm_threshold_hyst_polling(struct nvkm_therm *therm,
>>>> {
>>>> enum nvkm_therm_thrs_direction direction;
>>>> enum nvkm_therm_thrs_state prev_state, new_state;
>>>> - int temp = therm->func->temp_get(therm);
>>>> + int temp;
>>>> +
>>>> + if (therm->func->temp_get(therm, &temp))
>>>> + return;
>>>>
>>>> prev_state = nvkm_therm_sensor_get_threshold_state(therm, thrs_name);
>>>>
>>>> @@ -166,6 +172,7 @@ alarm_timer_callback(struct nvkm_alarm *alarm)
>>>> struct nvbios_therm_sensor *sensor = &therm->bios_sensor;
>>>> struct nvkm_timer *tmr = therm->subdev.device->timer;
>>>> unsigned long flags;
>>>> + int val;
>>>>
>>>> spin_lock_irqsave(&therm->sensor.alarm_program_lock, flags);
>>>>
>>>> @@ -185,7 +192,7 @@ alarm_timer_callback(struct nvkm_alarm *alarm)
>>>> spin_unlock_irqrestore(&therm->sensor.alarm_program_lock, flags);
>>>>
>>>> /* schedule the next poll in one second */
>>>> - if (therm->func->temp_get(therm) >= 0)
>>>> + if (!therm->func->temp_get(therm, &val))
>>>> nvkm_timer_alarm(tmr, 1000000000ULL, alarm);
>>>> }
>>>>
>>>> @@ -227,9 +234,10 @@ nvkm_therm_sensor_fini(struct nvkm_therm *therm, bool suspend)
>>>> void
>>>> nvkm_therm_sensor_preinit(struct nvkm_therm *therm)
>>>> {
>>>> + int val;
>>>> const char *sensor_avail = "yes";
>>>>
>>>> - if (therm->func->temp_get(therm) < 0)
>>>> + if (therm->func->temp_get(therm, &val))
>>>> sensor_avail = "no";
>>>>
>>>> nvkm_debug(&therm->subdev, "internal sensor: %s\n", sensor_avail);
>>>>
>>>
>>> _______________________________________________
>>> Nouveau mailing list
>>> Nouveau at lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/nouveau
>
More information about the Nouveau
mailing list