[PATCH 5/6] drm/amd/powerplay: expose current edge and memory temperatures

Alex Deucher alexdeucher at gmail.com
Fri Apr 19 15:19:23 UTC 2019


On Thu, Apr 18, 2019 at 5:03 AM Evan Quan <evan.quan at amd.com> wrote:
>
> Two new hwmon interfaces (temp2_input and temp3_input) are added.
> They are supported on SOC15 dGPUs only.
>
> Change-Id: I935c512bd38e080fb8b6e3164c5e5294baff4e91
> Signed-off-by: Evan Quan <evan.quan at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c        | 45 +++++++++++++++----
>  .../gpu/drm/amd/include/kgd_pp_interface.h    |  2 +
>  .../drm/amd/powerplay/hwmgr/vega10_hwmgr.c    | 12 +++++
>  .../drm/amd/powerplay/hwmgr/vega12_hwmgr.c    | 19 ++++++++
>  .../drm/amd/powerplay/hwmgr/vega20_hwmgr.c    | 18 ++++++++
>  5 files changed, 88 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
> index be33144e2dca..1007307845d8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
> @@ -1434,6 +1434,7 @@ static ssize_t amdgpu_hwmon_show_temp(struct device *dev,
>  {
>         struct amdgpu_device *adev = dev_get_drvdata(dev);
>         struct drm_device *ddev = adev->ddev;
> +       int channel = to_sensor_dev_attr(attr)->index;
>         int r, temp, size = sizeof(temp);
>
>         /* Can't get temperature when the card is off */
> @@ -1441,11 +1442,32 @@ static ssize_t amdgpu_hwmon_show_temp(struct device *dev,
>              (ddev->switch_power_state != DRM_SWITCH_POWER_ON))
>                 return -EINVAL;
>
> -       /* get the temperature */
> -       r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_TEMP,
> -                                  (void *)&temp, &size);
> -       if (r)
> -               return r;
> +       if (channel >= PP_TEMP_MAX)
> +               return -EINVAL;
> +
> +       switch (channel) {
> +       case PP_TEMP_JUNCTION:
> +               /* get current junction temperature */
> +               r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_TEMP,
> +                                          (void *)&temp, &size);
> +               if (r)
> +                       return r;
> +               break;
> +       case PP_TEMP_EDGE:
> +               /* get current edge temperature */
> +               r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_EDGE_TEMP,
> +                                          (void *)&temp, &size);
> +               if (r)
> +                       return r;
> +               break;
> +       case PP_TEMP_MEM:
> +               /* get current memory temperature */
> +               r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_MEM_TEMP,
> +                                          (void *)&temp, &size);
> +               if (r)
> +                       return r;
> +               break;
> +       }
>
>         return snprintf(buf, PAGE_SIZE, "%d\n", temp);
>  }
> @@ -2109,7 +2131,8 @@ static ssize_t amdgpu_hwmon_show_mclk_label(struct device *dev,
>   * - temp[1-3]_label: temperature channel label
>   *   - temp2_label and temp3_label are supported on SOC15 dGPUs only
>   *
> - * - temp1_input: the on die GPU temperature in millidegrees Celsius
> + * - temp[1-3]_input: the on die GPU temperature in millidegrees Celsius
> + *   - temp2_input and temp3_input are supported on SOC15 dGPUs only
>   *
>   * - temp[1-3]_crit: temperature critical max value in millidegrees Celsius
>   *   - temp2_crit and temp3_crit are supported on SOC15 dGPUs only
> @@ -2166,13 +2189,15 @@ static ssize_t amdgpu_hwmon_show_mclk_label(struct device *dev,
>   *
>   */
>
> -static SENSOR_DEVICE_ATTR(temp1_input, S_IRUGO, amdgpu_hwmon_show_temp, NULL, 0);
> +static SENSOR_DEVICE_ATTR(temp1_input, S_IRUGO, amdgpu_hwmon_show_temp, NULL, PP_TEMP_JUNCTION);
>  static SENSOR_DEVICE_ATTR(temp1_crit, S_IRUGO, amdgpu_hwmon_show_temp_thresh, NULL, 0);
>  static SENSOR_DEVICE_ATTR(temp1_crit_hyst, S_IRUGO, amdgpu_hwmon_show_temp_thresh, NULL, 1);
>  static SENSOR_DEVICE_ATTR(temp1_emergency, S_IRUGO, amdgpu_hwmon_show_temp_emergency, NULL, PP_TEMP_JUNCTION);
> +static SENSOR_DEVICE_ATTR(temp2_input, S_IRUGO, amdgpu_hwmon_show_temp, NULL, PP_TEMP_EDGE);
>  static SENSOR_DEVICE_ATTR(temp2_crit, S_IRUGO, amdgpu_hwmon_show_edge_temp_thresh, NULL, 0);
>  static SENSOR_DEVICE_ATTR(temp2_crit_hyst, S_IRUGO, amdgpu_hwmon_show_edge_temp_thresh, NULL, 1);
>  static SENSOR_DEVICE_ATTR(temp2_emergency, S_IRUGO, amdgpu_hwmon_show_temp_emergency, NULL, PP_TEMP_EDGE);
> +static SENSOR_DEVICE_ATTR(temp3_input, S_IRUGO, amdgpu_hwmon_show_temp, NULL, PP_TEMP_MEM);
>  static SENSOR_DEVICE_ATTR(temp3_crit, S_IRUGO, amdgpu_hwmon_show_mem_temp_thresh, NULL, 0);
>  static SENSOR_DEVICE_ATTR(temp3_crit_hyst, S_IRUGO, amdgpu_hwmon_show_mem_temp_thresh, NULL, 1);
>  static SENSOR_DEVICE_ATTR(temp3_emergency, S_IRUGO, amdgpu_hwmon_show_temp_emergency, NULL, PP_TEMP_MEM);
> @@ -2205,8 +2230,10 @@ static struct attribute *hwmon_attributes[] = {
>         &sensor_dev_attr_temp1_input.dev_attr.attr,
>         &sensor_dev_attr_temp1_crit.dev_attr.attr,
>         &sensor_dev_attr_temp1_crit_hyst.dev_attr.attr,
> +       &sensor_dev_attr_temp2_input.dev_attr.attr,
>         &sensor_dev_attr_temp2_crit.dev_attr.attr,
>         &sensor_dev_attr_temp2_crit_hyst.dev_attr.attr,
> +       &sensor_dev_attr_temp3_input.dev_attr.attr,
>         &sensor_dev_attr_temp3_crit.dev_attr.attr,
>         &sensor_dev_attr_temp3_crit_hyst.dev_attr.attr,
>         &sensor_dev_attr_temp1_label.dev_attr.attr,
> @@ -2348,7 +2375,9 @@ static umode_t hwmon_attributes_visible(struct kobject *kobj,
>              attr == &sensor_dev_attr_temp3_crit_hyst.dev_attr.attr ||
>              attr == &sensor_dev_attr_temp1_emergency.dev_attr.attr ||
>              attr == &sensor_dev_attr_temp2_emergency.dev_attr.attr ||
> -            attr == &sensor_dev_attr_temp3_emergency.dev_attr.attr))
> +            attr == &sensor_dev_attr_temp3_emergency.dev_attr.attr ||
> +            attr == &sensor_dev_attr_temp2_input.dev_attr.attr ||
> +            attr == &sensor_dev_attr_temp3_input.dev_attr.attr))
>                 return 0;
>
>
> diff --git a/drivers/gpu/drm/amd/include/kgd_pp_interface.h b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
> index 17324c0d503e..19713ffdb03e 100644
> --- a/drivers/gpu/drm/amd/include/kgd_pp_interface.h
> +++ b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
> @@ -111,6 +111,8 @@ enum amd_pp_sensors {
>         AMDGPU_PP_SENSOR_GPU_LOAD,
>         AMDGPU_PP_SENSOR_GFX_MCLK,
>         AMDGPU_PP_SENSOR_GPU_TEMP,

Add:
AMDGPU_PP_SENSOR_GPU_JUNCTION_TEMP = AMDGPU_PP_SENSOR_GPU_TEMP,
and use that for clarity.  That said, existing ASICs use
AMDGPU_PP_SENSOR_GPU_TEMP for the edge temperature, so I'd instead
suggest making
AMDGPU_PP_SENSOR_GPU_EDGE_TEMP = AMDGPU_PP_SENSOR_GPU_TEMP,
and then adding a new entry for JUNCTION.

> +       AMDGPU_PP_SENSOR_EDGE_TEMP,
> +       AMDGPU_PP_SENSOR_MEM_TEMP,
>         AMDGPU_PP_SENSOR_VCE_POWER,
>         AMDGPU_PP_SENSOR_UVD_POWER,
>         AMDGPU_PP_SENSOR_GPU_POWER,
> diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c
> index 1d78a5ee9523..f4ecbbe854ee 100644
> --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c
> +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c
> @@ -3785,6 +3785,18 @@ static int vega10_read_sensor(struct pp_hwmgr *hwmgr, int idx,
>                 *((uint32_t *)value) = vega10_thermal_get_temperature(hwmgr);

I think vega10_thermal_get_temperature() returns the edge temperature
on Vega10.  Maybe it would be better to either switch to
PPSMC_MSG_GetTemperatureHotspot for AMDGPU_PP_SENSOR_GPU_JUNCTION_TEMP,
or use vega10_thermal_get_temperature() for EDGE.

>                 *size = 4;
>                 break;
> +       case AMDGPU_PP_SENSOR_EDGE_TEMP:
> +               smum_send_msg_to_smc(hwmgr, PPSMC_MSG_GetTemperatureEdge);
> +               *((uint32_t *)value) = smum_get_argument(hwmgr) *
> +                       PP_TEMPERATURE_UNITS_PER_CENTIGRADES;
> +               *size = 4;
> +               break;
> +       case AMDGPU_PP_SENSOR_MEM_TEMP:
> +               smum_send_msg_to_smc(hwmgr, PPSMC_MSG_GetTemperatureHBM);
> +               *((uint32_t *)value) = smum_get_argument(hwmgr) *
> +                       PP_TEMPERATURE_UNITS_PER_CENTIGRADES;
> +               *size = 4;
> +               break;
>         case AMDGPU_PP_SENSOR_UVD_POWER:
>                 *((uint32_t *)value) = data->uvd_power_gated ? 0 : 1;
>                 *size = 4;
> diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega12_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/vega12_hwmgr.c
> index 695ac2875540..86c48cb56f6c 100644
> --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega12_hwmgr.c
> +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega12_hwmgr.c
> @@ -1338,6 +1338,7 @@ static int vega12_read_sensor(struct pp_hwmgr *hwmgr, int idx,
>                               void *value, int *size)
>  {
>         struct vega12_hwmgr *data = (struct vega12_hwmgr *)(hwmgr->backend);
> +       SmuMetrics_t metrics_table;
>         int ret = 0;
>
>         switch (idx) {
> @@ -1360,6 +1361,24 @@ static int vega12_read_sensor(struct pp_hwmgr *hwmgr, int idx,
>                 *((uint32_t *)value) = vega12_thermal_get_temperature(hwmgr);
>                 *size = 4;
>                 break;
> +       case AMDGPU_PP_SENSOR_EDGE_TEMP:
> +               ret = vega12_get_metrics_table(hwmgr, &metrics_table);
> +               if (ret)
> +                       return ret;
> +
> +               *((uint32_t *)value) = metrics_table.TemperatureEdge *
> +                       PP_TEMPERATURE_UNITS_PER_CENTIGRADES;
> +               *size = 4;
> +               break;
> +       case AMDGPU_PP_SENSOR_MEM_TEMP:
> +               ret = vega12_get_metrics_table(hwmgr, &metrics_table);
> +               if (ret)
> +                       return ret;
> +
> +               *((uint32_t *)value) = metrics_table.TemperatureHBM *
> +                       PP_TEMPERATURE_UNITS_PER_CENTIGRADES;
> +               *size = 4;
> +               break;
>         case AMDGPU_PP_SENSOR_UVD_POWER:
>                 *((uint32_t *)value) = data->uvd_power_gated ? 0 : 1;
>                 *size = 4;
> diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c
> index 0c0714862eb8..72a71a002f0b 100644
> --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c
> +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c
> @@ -2142,6 +2142,24 @@ static int vega20_read_sensor(struct pp_hwmgr *hwmgr, int idx,
>                 *((uint32_t *)value) = vega20_thermal_get_temperature(hwmgr);
>                 *size = 4;
>                 break;
> +       case AMDGPU_PP_SENSOR_EDGE_TEMP:
> +               ret = vega20_get_metrics_table(hwmgr, &metrics_table);
> +               if (ret)
> +                       return ret;
> +
> +               *((uint32_t *)value) = metrics_table.TemperatureEdge *
> +                       PP_TEMPERATURE_UNITS_PER_CENTIGRADES;
> +               *size = 4;
> +               break;
> +       case AMDGPU_PP_SENSOR_MEM_TEMP:
> +               ret = vega20_get_metrics_table(hwmgr, &metrics_table);
> +               if (ret)
> +                       return ret;
> +
> +               *((uint32_t *)value) = metrics_table.TemperatureHBM *
> +                       PP_TEMPERATURE_UNITS_PER_CENTIGRADES;
> +               *size = 4;
> +               break;
>         case AMDGPU_PP_SENSOR_UVD_POWER:
>                 *((uint32_t *)value) = data->uvd_power_gated ? 0 : 1;
>                 *size = 4;
> --
> 2.21.0
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


More information about the amd-gfx mailing list