[PATCH 5/6] drm/amd/powerplay: expose current edge and memory temperatures
Alex Deucher
alexdeucher at gmail.com
Fri Apr 19 15:19:23 UTC 2019
On Thu, Apr 18, 2019 at 5:03 AM Evan Quan <evan.quan at amd.com> wrote:
>
> Two new hwmon interfaces (temp2_input and temp3_input) are added.
> They are supported on SOC15 dGPUs only.
>
> Change-Id: I935c512bd38e080fb8b6e3164c5e5294baff4e91
> Signed-off-by: Evan Quan <evan.quan at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c | 45 +++++++++++++++----
> .../gpu/drm/amd/include/kgd_pp_interface.h | 2 +
> .../drm/amd/powerplay/hwmgr/vega10_hwmgr.c | 12 +++++
> .../drm/amd/powerplay/hwmgr/vega12_hwmgr.c | 19 ++++++++
> .../drm/amd/powerplay/hwmgr/vega20_hwmgr.c | 18 ++++++++
> 5 files changed, 88 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
> index be33144e2dca..1007307845d8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
> @@ -1434,6 +1434,7 @@ static ssize_t amdgpu_hwmon_show_temp(struct device *dev,
> {
> struct amdgpu_device *adev = dev_get_drvdata(dev);
> struct drm_device *ddev = adev->ddev;
> + int channel = to_sensor_dev_attr(attr)->index;
> int r, temp, size = sizeof(temp);
>
> /* Can't get temperature when the card is off */
> @@ -1441,11 +1442,32 @@ static ssize_t amdgpu_hwmon_show_temp(struct device *dev,
> (ddev->switch_power_state != DRM_SWITCH_POWER_ON))
> return -EINVAL;
>
> - /* get the temperature */
> - r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_TEMP,
> - (void *)&temp, &size);
> - if (r)
> - return r;
> + if (channel >= PP_TEMP_MAX)
> + return -EINVAL;
> +
> + switch (channel) {
> + case PP_TEMP_JUNCTION:
> + /* get current junction temperature */
> + r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_TEMP,
> + (void *)&temp, &size);
> + if (r)
> + return r;
> + break;
> + case PP_TEMP_EDGE:
> + /* get current edge temperature */
> + r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_EDGE_TEMP,
> + (void *)&temp, &size);
> + if (r)
> + return r;
> + break;
> + case PP_TEMP_MEM:
> + /* get current memory temperature */
> + r = amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_MEM_TEMP,
> + (void *)&temp, &size);
> + if (r)
> + return r;
> + break;
> + }
>
> return snprintf(buf, PAGE_SIZE, "%d\n", temp);
> }
> @@ -2109,7 +2131,8 @@ static ssize_t amdgpu_hwmon_show_mclk_label(struct device *dev,
> * - temp[1-3]_label: temperature channel label
> * - temp2_label and temp3_label are supported on SOC15 dGPUs only
> *
> - * - temp1_input: the on die GPU temperature in millidegrees Celsius
> + * - temp[1-3]_input: the on die GPU temperature in millidegrees Celsius
> + * - temp2_input and temp3_input are supported on SOC15 dGPUs only
> *
> * - temp[1-3]_crit: temperature critical max value in millidegrees Celsius
> * - temp2_crit and temp3_crit are supported on SOC15 dGPUs only
> @@ -2166,13 +2189,15 @@ static ssize_t amdgpu_hwmon_show_mclk_label(struct device *dev,
> *
> */
>
> -static SENSOR_DEVICE_ATTR(temp1_input, S_IRUGO, amdgpu_hwmon_show_temp, NULL, 0);
> +static SENSOR_DEVICE_ATTR(temp1_input, S_IRUGO, amdgpu_hwmon_show_temp, NULL, PP_TEMP_JUNCTION);
> static SENSOR_DEVICE_ATTR(temp1_crit, S_IRUGO, amdgpu_hwmon_show_temp_thresh, NULL, 0);
> static SENSOR_DEVICE_ATTR(temp1_crit_hyst, S_IRUGO, amdgpu_hwmon_show_temp_thresh, NULL, 1);
> static SENSOR_DEVICE_ATTR(temp1_emergency, S_IRUGO, amdgpu_hwmon_show_temp_emergency, NULL, PP_TEMP_JUNCTION);
> +static SENSOR_DEVICE_ATTR(temp2_input, S_IRUGO, amdgpu_hwmon_show_temp, NULL, PP_TEMP_EDGE);
> static SENSOR_DEVICE_ATTR(temp2_crit, S_IRUGO, amdgpu_hwmon_show_edge_temp_thresh, NULL, 0);
> static SENSOR_DEVICE_ATTR(temp2_crit_hyst, S_IRUGO, amdgpu_hwmon_show_edge_temp_thresh, NULL, 1);
> static SENSOR_DEVICE_ATTR(temp2_emergency, S_IRUGO, amdgpu_hwmon_show_temp_emergency, NULL, PP_TEMP_EDGE);
> +static SENSOR_DEVICE_ATTR(temp3_input, S_IRUGO, amdgpu_hwmon_show_temp, NULL, PP_TEMP_MEM);
> static SENSOR_DEVICE_ATTR(temp3_crit, S_IRUGO, amdgpu_hwmon_show_mem_temp_thresh, NULL, 0);
> static SENSOR_DEVICE_ATTR(temp3_crit_hyst, S_IRUGO, amdgpu_hwmon_show_mem_temp_thresh, NULL, 1);
> static SENSOR_DEVICE_ATTR(temp3_emergency, S_IRUGO, amdgpu_hwmon_show_temp_emergency, NULL, PP_TEMP_MEM);
> @@ -2205,8 +2230,10 @@ static struct attribute *hwmon_attributes[] = {
> &sensor_dev_attr_temp1_input.dev_attr.attr,
> &sensor_dev_attr_temp1_crit.dev_attr.attr,
> &sensor_dev_attr_temp1_crit_hyst.dev_attr.attr,
> + &sensor_dev_attr_temp2_input.dev_attr.attr,
> &sensor_dev_attr_temp2_crit.dev_attr.attr,
> &sensor_dev_attr_temp2_crit_hyst.dev_attr.attr,
> + &sensor_dev_attr_temp3_input.dev_attr.attr,
> &sensor_dev_attr_temp3_crit.dev_attr.attr,
> &sensor_dev_attr_temp3_crit_hyst.dev_attr.attr,
> &sensor_dev_attr_temp1_label.dev_attr.attr,
> @@ -2348,7 +2375,9 @@ static umode_t hwmon_attributes_visible(struct kobject *kobj,
> attr == &sensor_dev_attr_temp3_crit_hyst.dev_attr.attr ||
> attr == &sensor_dev_attr_temp1_emergency.dev_attr.attr ||
> attr == &sensor_dev_attr_temp2_emergency.dev_attr.attr ||
> - attr == &sensor_dev_attr_temp3_emergency.dev_attr.attr))
> + attr == &sensor_dev_attr_temp3_emergency.dev_attr.attr ||
> + attr == &sensor_dev_attr_temp2_input.dev_attr.attr ||
> + attr == &sensor_dev_attr_temp3_input.dev_attr.attr))
> return 0;
>
>
> diff --git a/drivers/gpu/drm/amd/include/kgd_pp_interface.h b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
> index 17324c0d503e..19713ffdb03e 100644
> --- a/drivers/gpu/drm/amd/include/kgd_pp_interface.h
> +++ b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
> @@ -111,6 +111,8 @@ enum amd_pp_sensors {
> AMDGPU_PP_SENSOR_GPU_LOAD,
> AMDGPU_PP_SENSOR_GFX_MCLK,
> AMDGPU_PP_SENSOR_GPU_TEMP,
Add:
AMDGPU_PP_SENSOR_GPU_JUNCTION_TEMP = AMDGPU_PP_SENSOR_GPU_TEMP,
and use that for clarity. That said, existing ASICs use
AMDGPU_PP_SENSOR_GPU_TEMP for the edge temperature, so I'd instead
suggest making
AMDGPU_PP_SENSOR_GPU_EDGE_TEMP = AMDGPU_PP_SENSOR_GPU_TEMP,
and then adding a new entry for JUNCTION.
> + AMDGPU_PP_SENSOR_EDGE_TEMP,
> + AMDGPU_PP_SENSOR_MEM_TEMP,
> AMDGPU_PP_SENSOR_VCE_POWER,
> AMDGPU_PP_SENSOR_UVD_POWER,
> AMDGPU_PP_SENSOR_GPU_POWER,
> diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c
> index 1d78a5ee9523..f4ecbbe854ee 100644
> --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c
> +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_hwmgr.c
> @@ -3785,6 +3785,18 @@ static int vega10_read_sensor(struct pp_hwmgr *hwmgr, int idx,
> *((uint32_t *)value) = vega10_thermal_get_temperature(hwmgr);
I think vega10_thermal_get_temperature() returns the edge temperature
on vega10. Maybe it would be better to switch to
PPSMC_MSG_GetTemperatureHotspot for AMDGPU_PP_SENSOR_GPU_JUNCTION_TEMP
or use vega10_thermal_get_temperature() for EDGE.
> *size = 4;
> break;
> + case AMDGPU_PP_SENSOR_EDGE_TEMP:
> + smum_send_msg_to_smc(hwmgr, PPSMC_MSG_GetTemperatureEdge);
> + *((uint32_t *)value) = smum_get_argument(hwmgr) *
> + PP_TEMPERATURE_UNITS_PER_CENTIGRADES;
> + *size = 4;
> + break;
> + case AMDGPU_PP_SENSOR_MEM_TEMP:
> + smum_send_msg_to_smc(hwmgr, PPSMC_MSG_GetTemperatureHBM);
> + *((uint32_t *)value) = smum_get_argument(hwmgr) *
> + PP_TEMPERATURE_UNITS_PER_CENTIGRADES;
> + *size = 4;
> + break;
> case AMDGPU_PP_SENSOR_UVD_POWER:
> *((uint32_t *)value) = data->uvd_power_gated ? 0 : 1;
> *size = 4;
> diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega12_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/vega12_hwmgr.c
> index 695ac2875540..86c48cb56f6c 100644
> --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega12_hwmgr.c
> +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega12_hwmgr.c
> @@ -1338,6 +1338,7 @@ static int vega12_read_sensor(struct pp_hwmgr *hwmgr, int idx,
> void *value, int *size)
> {
> struct vega12_hwmgr *data = (struct vega12_hwmgr *)(hwmgr->backend);
> + SmuMetrics_t metrics_table;
> int ret = 0;
>
> switch (idx) {
> @@ -1360,6 +1361,24 @@ static int vega12_read_sensor(struct pp_hwmgr *hwmgr, int idx,
> *((uint32_t *)value) = vega12_thermal_get_temperature(hwmgr);
> *size = 4;
> break;
> + case AMDGPU_PP_SENSOR_EDGE_TEMP:
> + ret = vega12_get_metrics_table(hwmgr, &metrics_table);
> + if (ret)
> + return ret;
> +
> + *((uint32_t *)value) = metrics_table.TemperatureEdge *
> + PP_TEMPERATURE_UNITS_PER_CENTIGRADES;
> + *size = 4;
> + break;
> + case AMDGPU_PP_SENSOR_MEM_TEMP:
> + ret = vega12_get_metrics_table(hwmgr, &metrics_table);
> + if (ret)
> + return ret;
> +
> + *((uint32_t *)value) = metrics_table.TemperatureHBM *
> + PP_TEMPERATURE_UNITS_PER_CENTIGRADES;
> + *size = 4;
> + break;
> case AMDGPU_PP_SENSOR_UVD_POWER:
> *((uint32_t *)value) = data->uvd_power_gated ? 0 : 1;
> *size = 4;
> diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c
> index 0c0714862eb8..72a71a002f0b 100644
> --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c
> +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c
> @@ -2142,6 +2142,24 @@ static int vega20_read_sensor(struct pp_hwmgr *hwmgr, int idx,
> *((uint32_t *)value) = vega20_thermal_get_temperature(hwmgr);
> *size = 4;
> break;
> + case AMDGPU_PP_SENSOR_EDGE_TEMP:
> + ret = vega20_get_metrics_table(hwmgr, &metrics_table);
> + if (ret)
> + return ret;
> +
> + *((uint32_t *)value) = metrics_table.TemperatureEdge *
> + PP_TEMPERATURE_UNITS_PER_CENTIGRADES;
> + *size = 4;
> + break;
> + case AMDGPU_PP_SENSOR_MEM_TEMP:
> + ret = vega20_get_metrics_table(hwmgr, &metrics_table);
> + if (ret)
> + return ret;
> +
> + *((uint32_t *)value) = metrics_table.TemperatureHBM *
> + PP_TEMPERATURE_UNITS_PER_CENTIGRADES;
> + *size = 4;
> + break;
> case AMDGPU_PP_SENSOR_UVD_POWER:
> *((uint32_t *)value) = data->uvd_power_gated ? 0 : 1;
> *size = 4;
> --
> 2.21.0
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
More information about the amd-gfx mailing list