[PATCH 1/2] drm/amdgpu: added a sysfs interface for thermal throttling

Quan, Evan Evan.Quan at amd.com
Tue Feb 14 09:15:40 UTC 2023


[AMD Official Use Only - General]



> -----Original Message-----
> From: kunliu13 <Kun.Liu2 at amd.com>
> Sent: Tuesday, February 14, 2023 3:54 PM
> To: Limonciello, Mario <Mario.Limonciello at amd.com>; Liang, Richard qi
> <Richardqi.Liang at amd.com>; Yuan, Perry <Perry.Yuan at amd.com>; amd-
> gfx at lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher at amd.com>; Du, Xiaojian
> <Xiaojian.Du at amd.com>; Quan, Evan <Evan.Quan at amd.com>; Liu, Kun
> <Kun.Liu2 at amd.com>
> Subject: [PATCH 1/2] drm/amdgpu: added a sysfs interface for thermal
> throttling
> 
> Add a sysfs interface for thermal throttling so that userspace can get/update
> the thermal limit.
> 
> Jira ID: SWDEV-354511
[Quan, Evan] Please drop this internal link. Other than this, the patch is Reviewed-by: Evan Quan <evan.quan at amd.com>

Evan
> Signed-off-by: Kun Liu <Kun.Liu2 at amd.com>
> 
> Change-Id: I9948cb8966b731d2d74d7aad87cbcdc840dd34c8
> ---
>  .../gpu/drm/amd/include/kgd_pp_interface.h    |  2 +
>  drivers/gpu/drm/amd/pm/amdgpu_dpm.c           | 28 +++++++
>  drivers/gpu/drm/amd/pm/amdgpu_pm.c            | 76
> +++++++++++++++++++
>  drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h       |  3 +
>  drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c     | 24 ++++++
>  drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 12 +++
>  6 files changed, 145 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/include/kgd_pp_interface.h
> b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
> index f3d64c78f..8394464ea 100644
> --- a/drivers/gpu/drm/amd/include/kgd_pp_interface.h
> +++ b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
> @@ -331,6 +331,8 @@ struct amd_pm_funcs {
>  	int (*get_mclk_od)(void *handle);
>  	int (*set_mclk_od)(void *handle, uint32_t value);
>  	int (*read_sensor)(void *handle, int idx, void *value, int *size);
> +	int (*get_apu_thermal_limit)(void *handle, uint32_t *limit);
> +	int (*set_apu_thermal_limit)(void *handle, uint32_t limit);
>  	enum amd_dpm_forced_level (*get_performance_level)(void
> *handle);
>  	enum amd_pm_state_type (*get_current_power_state)(void
> *handle);
>  	int (*get_fan_speed_rpm)(void *handle, uint32_t *rpm);
> diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
> b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
> index 1b300c569..d9a9cf189 100644
> --- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
> +++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
> @@ -438,6 +438,34 @@ int amdgpu_dpm_read_sensor(struct
> amdgpu_device *adev, enum amd_pp_sensors senso
>  	return ret;
>  }
> 
> +int amdgpu_dpm_get_apu_thermal_limit(struct amdgpu_device *adev,
> uint32_t *limit)
> +{
> +	const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
> +	int ret = -EINVAL;
> +
> +	if (pp_funcs && pp_funcs->get_apu_thermal_limit) {
> +		mutex_lock(&adev->pm.mutex);
> +		ret = pp_funcs->get_apu_thermal_limit(adev-
> >powerplay.pp_handle, limit);
> +		mutex_unlock(&adev->pm.mutex);
> +	}
> +
> +	return ret;
> +}
> +
> +int amdgpu_dpm_set_apu_thermal_limit(struct amdgpu_device *adev,
> uint32_t limit)
> +{
> +	const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
> +	int ret = -EINVAL;
> +
> +	if (pp_funcs && pp_funcs->set_apu_thermal_limit) {
> +		mutex_lock(&adev->pm.mutex);
> +		ret = pp_funcs->set_apu_thermal_limit(adev-
> >powerplay.pp_handle, limit);
> +		mutex_unlock(&adev->pm.mutex);
> +	}
> +
> +	return ret;
> +}
> +
>  void amdgpu_dpm_compute_clocks(struct amdgpu_device *adev)
>  {
>  	const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
> diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
> b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
> index 236657eec..99b249e55 100644
> --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
> +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
> @@ -1685,6 +1685,81 @@ static ssize_t
> amdgpu_set_thermal_throttling_logging(struct device *dev,
>  	return count;
>  }
> 
> +/**
> + * DOC: apu_thermal_cap
> + *
> + * The amdgpu driver provides a sysfs API for retrieving/updating the thermal
> + * limit temperature in millidegrees Celsius.
> + *
> + * Reading the file shows the current core limit value.
> + *
> + * Writing an integer to the file sets a new thermal limit. The value
> + * should be between 0 and 100. If the value is less than 0 or greater
> + * than 100, then the write request will be ignored.
> + */
> +static ssize_t amdgpu_get_apu_thermal_cap(struct device *dev,
> +				     struct device_attribute *attr,
> +				     char *buf)
> +{
> +	int ret, size = 0;
> +	u32 limit;
> +	struct drm_device *ddev = dev_get_drvdata(dev);
> +	struct amdgpu_device *adev = drm_to_adev(ddev);
> +
> +	ret = pm_runtime_get_sync(ddev->dev);
> +	if (ret < 0) {
> +		pm_runtime_put_autosuspend(ddev->dev);
> +		return size;
> +	}
> +
> +	ret = amdgpu_dpm_get_apu_thermal_limit(adev, &limit);
> +	if (!ret)
> +		size = sysfs_emit(buf, "%u\n", limit);
> +	else
> +		size = sysfs_emit(buf, "failed to get thermal limit\n");
> +
> +	pm_runtime_mark_last_busy(ddev->dev);
> +	pm_runtime_put_autosuspend(ddev->dev);
> +
> +	return size;
> +}
> +
> +static ssize_t amdgpu_set_apu_thermal_cap(struct device *dev,
> +				     struct device_attribute *attr,
> +				     const char *buf,
> +				     size_t count)
> +{
> +	int ret;
> +	u32 value;
> +	struct drm_device *ddev = dev_get_drvdata(dev);
> +	struct amdgpu_device *adev = drm_to_adev(ddev);
> +
> +	ret = kstrtou32(buf, 10, &value);
> +	if (ret)
> +		return ret;
> +
> +	if (value < 0 || value > 100) {
> +		dev_err(dev, "Invalid argument !\n");
> +		return count;
> +	}
> +
> +	ret = pm_runtime_get_sync(ddev->dev);
> +	if (ret < 0) {
> +		pm_runtime_put_autosuspend(ddev->dev);
> +		return ret;
> +	}
> +
> +	ret = amdgpu_dpm_set_apu_thermal_limit(adev, value);
> +	if (ret)
> +		dev_err(dev, "failed to update thermal limit\n");
> +
> +	pm_runtime_mark_last_busy(ddev->dev);
> +	pm_runtime_put_autosuspend(ddev->dev);
> +
> +	return count;
> +}
> +
> +
>  /**
>   * DOC: gpu_metrics
>   *
> @@ -1937,6 +2012,7 @@ static struct amdgpu_device_attr
> amdgpu_device_attrs[] = {
>  	AMDGPU_DEVICE_ATTR_RW(pp_features,
> 	ATTR_FLAG_BASIC|ATTR_FLAG_ONEVF),
>  	AMDGPU_DEVICE_ATTR_RO(unique_id,
> 	ATTR_FLAG_BASIC|ATTR_FLAG_ONEVF),
>  	AMDGPU_DEVICE_ATTR_RW(thermal_throttling_logging,
> 	ATTR_FLAG_BASIC|ATTR_FLAG_ONEVF),
> +	AMDGPU_DEVICE_ATTR_RW(apu_thermal_cap,
> 	ATTR_FLAG_BASIC|ATTR_FLAG_ONEVF),
>  	AMDGPU_DEVICE_ATTR_RO(gpu_metrics,
> 	ATTR_FLAG_BASIC|ATTR_FLAG_ONEVF),
>  	AMDGPU_DEVICE_ATTR_RO(smartshift_apu_power,
> 	ATTR_FLAG_BASIC,
>  			      .attr_update = ss_power_attr_update),
> diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
> b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
> index cb5b9df78..0cc379ea1 100644
> --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
> +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
> @@ -369,6 +369,9 @@ struct amdgpu_pm {
>  int amdgpu_dpm_read_sensor(struct amdgpu_device *adev, enum
> amd_pp_sensors sensor,
>  			   void *data, uint32_t *size);
> 
> +int amdgpu_dpm_get_apu_thermal_limit(struct amdgpu_device *adev,
> uint32_t *limit);
> +int amdgpu_dpm_set_apu_thermal_limit(struct amdgpu_device *adev,
> uint32_t limit);
> +
>  int amdgpu_dpm_set_powergating_by_smu(struct amdgpu_device *adev,
>  				      uint32_t block_type, bool gate);
> 
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> index 2fa79f892..b612fb6bd 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> @@ -2514,6 +2514,28 @@ static int smu_read_sensor(void *handle,
>  	return ret;
>  }
> 
> +static int smu_get_apu_thermal_limit(void *handle, uint32_t *limit)
> +{
> +	int ret = -EINVAL;
> +	struct smu_context *smu = handle;
> +
> +	if (smu->ppt_funcs && smu->ppt_funcs->get_apu_thermal_limit)
> +		ret = smu->ppt_funcs->get_apu_thermal_limit(smu, limit);
> +
> +	return ret;
> +}
> +
> +static int smu_set_apu_thermal_limit(void *handle, uint32_t limit)
> +{
> +	int ret = -EINVAL;
> +	struct smu_context *smu = handle;
> +
> +	if (smu->ppt_funcs && smu->ppt_funcs->set_apu_thermal_limit)
> +		ret = smu->ppt_funcs->set_apu_thermal_limit(smu, limit);
> +
> +	return ret;
> +}
> +
>  static int smu_get_power_profile_mode(void *handle, char *buf)
>  {
>  	struct smu_context *smu = handle;
> @@ -2998,6 +3020,8 @@ static const struct amd_pm_funcs
> swsmu_pm_funcs = {
>  	.emit_clock_levels       = smu_emit_ppclk_levels,
>  	.force_performance_level = smu_force_performance_level,
>  	.read_sensor             = smu_read_sensor,
> +	.get_apu_thermal_limit       = smu_get_apu_thermal_limit,
> +	.set_apu_thermal_limit       = smu_set_apu_thermal_limit,
>  	.get_performance_level   = smu_get_performance_level,
>  	.get_current_power_state = smu_get_current_power_state,
>  	.get_fan_speed_rpm       = smu_get_fan_speed_rpm,
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> index 3bc4128a2..378d3df4d 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> @@ -721,6 +721,18 @@ struct pptable_funcs {
>  	int (*read_sensor)(struct smu_context *smu, enum
> amd_pp_sensors sensor,
>  			   void *data, uint32_t *size);
> 
> +	/**
> +	 * @get_apu_thermal_limit: get apu core limit from smu
> +	 * &limit: current limit temperature in millidegrees Celsius
> +	 */
> +	int (*get_apu_thermal_limit)(struct smu_context *smu, uint32_t
> *limit);
> +
> +	/**
> +	 * @set_apu_thermal_limit: update all controllers with new limit
> +	 * &limit: limit temperature to be set, in millidegrees Celsius
> +	 */
> +	int (*set_apu_thermal_limit)(struct smu_context *smu, uint32_t
> limit);
> +
>  	/**
>  	 * @pre_display_config_changed: Prepare GPU for a display
> configuration
>  	 *                              change.
> --
> 2.25.1


More information about the amd-gfx mailing list