[PATCH v2 1/6] drm/amd/pm: Add dpm interface for temp metrics

Mon Aug 4 12:13:23 UTC 2025

On 8/4/2025 4:35 PM, Asad Kamal wrote:
> Add dpm interface to get gpuboard/baseboard temperature metrics
> 
> v2: Add temperature metrics support check(Lijo)
> 
> Signed-off-by: Asad Kamal <asad.kamal at amd.com>
> ---
>  .../gpu/drm/amd/include/kgd_pp_interface.h    | 81 +++++++++++++++++++
>  drivers/gpu/drm/amd/pm/amdgpu_dpm.c           | 60 ++++++++++++++
>  drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h       |  4 +
>  3 files changed, 145 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/include/kgd_pp_interface.h b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
> index e2b1ea7467b0..2f7e4b5bebf3 100644
> --- a/drivers/gpu/drm/amd/include/kgd_pp_interface.h
> +++ b/drivers/gpu/drm/amd/include/kgd_pp_interface.h
> @@ -30,6 +30,12 @@ extern const struct amdgpu_ip_block_version smu_v12_0_ip_block;
>  extern const struct amdgpu_ip_block_version smu_v13_0_ip_block;
>  extern const struct amdgpu_ip_block_version smu_v14_0_ip_block;
>  
> +enum smu_temp_metric_type {
> +	SMU_TEMP_METRIC_BASEBOARD,
> +	SMU_TEMP_METRIC_GPUBOARD,
> +	SMU_TEMP_METRIC_MAX,
> +};
> +
>  enum smu_event_type {
>  	SMU_EVENT_RESET_COMPLETE = 0,
>  };
> @@ -496,6 +502,8 @@ struct amd_pm_funcs {
>  	int (*set_df_cstate)(void *handle, enum pp_df_cstate state);
>  	int (*set_xgmi_pstate)(void *handle, uint32_t pstate);
>  	ssize_t (*get_gpu_metrics)(void *handle, void **table);
> +	ssize_t (*get_temp_metrics)(void *handle, enum smu_temp_metric_type type, void *table);
> +	bool (*temp_metrics_is_supported)(void *handle, enum smu_temp_metric_type type);
>  	ssize_t (*get_xcp_metrics)(void *handle, int xcp_id, void *table);
>  	ssize_t (*get_pm_metrics)(void *handle, void *pmmetrics, size_t size);
>  	int (*set_watermarks_for_clock_ranges)(void *handle,
> @@ -1595,6 +1603,79 @@ struct amdgpu_pm_metrics {
>  	uint8_t data[];
>  };
>  
> +enum amdgpu_vr_temp {
> +	AMDGPU_VDDCR_VDD0_TEMP,
> +	AMDGPU_VDDCR_VDD1_TEMP,
> +	AMDGPU_VDDCR_VDD2_TEMP,
> +	AMDGPU_VDDCR_VDD3_TEMP,
> +	AMDGPU_VDDCR_SOC_A_TEMP,
> +	AMDGPU_VDDCR_SOC_C_TEMP,
> +	AMDGPU_VDDCR_SOCIO_A_TEMP,
> +	AMDGPU_VDDCR_SOCIO_C_TEMP,
> +	AMDGPU_VDD_085_HBM_TEMP,
> +	AMDGPU_VDDCR_11_HBM_B_TEMP,
> +	AMDGPU_VDDCR_11_HBM_D_TEMP,
> +	AMDGPU_VDD_USR_TEMP,
> +	AMDGPU_VDDIO_11_E32_TEMP,
> +	AMDGPU_VR_MAX_TEMP_ENTRIES,
> +};
> +
> +enum amdgpu_system_temp {
> +	AMDGPU_UBB_FPGA_TEMP,
> +	AMDGPU_UBB_FRONT_TEMP,
> +	AMDGPU_UBB_BACK_TEMP,
> +	AMDGPU_UBB_OAM7_TEMP,
> +	AMDGPU_UBB_IBC_TEMP,
> +	AMDGPU_UBB_UFPGA_TEMP,
> +	AMDGPU_UBB_OAM1_TEMP,
> +	AMDGPU_OAM_0_1_HSC_TEMP,
> +	AMDGPU_OAM_2_3_HSC_TEMP,
> +	AMDGPU_OAM_4_5_HSC_TEMP,
> +	AMDGPU_OAM_6_7_HSC_TEMP,
> +	AMDGPU_UBB_FPGA_0V72_VR_TEMP,
> +	AMDGPU_UBB_FPGA_3V3_VR_TEMP,
> +	AMDGPU_RETIMER_0_1_2_3_1V2_VR_TEMP,
> +	AMDGPU_RETIMER_4_5_6_7_1V2_VR_TEMP,
> +	AMDGPU_RETIMER_0_1_0V9_VR_TEMP,
> +	AMDGPU_RETIMER_4_5_0V9_VR_TEMP,
> +	AMDGPU_RETIMER_2_3_0V9_VR_TEMP,
> +	AMDGPU_RETIMER_6_7_0V9_VR_TEMP,
> +	AMDGPU_OAM_0_1_2_3_3V3_VR_TEMP,
> +	AMDGPU_OAM_4_5_6_7_3V3_VR_TEMP,
> +	AMDGPU_IBC_HSC_TEMP,
> +	AMDGPU_IBC_TEMP,
> +	AMDGPU_SYSTEM_MAX_TEMP_ENTRIES = 32,
> +};
> +
> +enum amdgpu_node_temp {
> +	AMDGPU_RETIMER_X_TEMP,
> +	AMDGPU_OAM_X_IBC_TEMP,
> +	AMDGPU_OAM_X_IBC_2_TEMP,
> +	AMDGPU_OAM_X_VDD18_VR_TEMP,
> +	AMDGPU_OAM_X_04_HBM_B_VR_TEMP,
> +	AMDGPU_OAM_X_04_HBM_D_VR_TEMP,
> +	AMDGPU_NODE_MAX_TEMP_ENTRIES = 12,
> +};
> +
> +struct amdgpu_gpuboard_temp_metrics_v1_0 {
> +	struct metrics_table_header common_header;
> +	uint16_t label_version;
> +	uint16_t node_id;
> +	uint64_t accumulation_counter;
> +	/* Encoded temperature in Celsius: bits 24:31 are sensor id, bits 0:23 are temp value */
> +	uint32_t node_temp[AMDGPU_NODE_MAX_TEMP_ENTRIES];
> +	uint32_t vr_temp[AMDGPU_VR_MAX_TEMP_ENTRIES];
> +};
> +
> +struct amdgpu_baseboard_temp_metrics_v1_0 {
> +	struct metrics_table_header common_header;
> +	uint16_t label_version;
> +	uint16_t node_id;
> +	uint64_t accumulation_counter;
> +	/* Encoded temperature in Celsius: bits 24:31 are sensor id, bits 0:23 are temp value */
> +	uint32_t system_temp[AMDGPU_SYSTEM_MAX_TEMP_ENTRIES];
> +};
> +
>  struct amdgpu_partition_metrics_v1_0 {
>  	struct metrics_table_header common_header;
>  	/* Current clocks (Mhz) */
> diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
> index 71d986dd7a6e..a5e97582d853 100644
> --- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
> +++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
> @@ -2037,6 +2037,66 @@ int amdgpu_dpm_get_dpm_clock_table(struct amdgpu_device *adev,
>  	return ret;
>  }
>  
> +/**
> + * amdgpu_dpm_get_temp_metrics - Retrieve metrics for a specific temperature
> + * type
> + * @adev: Pointer to the device.
> + * @type: Identifier for the temperature type metrics to be fetched.
> + * @table: Pointer to a buffer where the metrics will be stored. If NULL, the
> + * function returns the size of the metrics structure.
> + *
> + * This function retrieves metrics for a specific temperature type. If the
> + * table parameter is NULL, the function returns the size of the metrics
> + * structure without populating it.
> + *
> + * Return: Size of the metrics structure on success, or a negative error code on failure.
> + */
> +ssize_t amdgpu_dpm_get_temp_metrics(struct amdgpu_device *adev,
> +				    enum smu_temp_metric_type type, void *table)
> +{
> +	const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
> +	int ret = 0;

This initialization can be avoided; ret is always assigned before it is read.
> +
> +	if (!pp_funcs->get_temp_metrics ||
> +	    !amdgpu_dpm_is_temp_metrics_supported(adev, type))
> +		return 0;

Better to return an error code when not supported; the kernel-doc above says failures return a negative error code, but this path returns 0.

Thanks,
Lijo

> +
> +	mutex_lock(&adev->pm.mutex);
> +	ret = pp_funcs->get_temp_metrics(adev->powerplay.pp_handle, type, table);
> +	mutex_unlock(&adev->pm.mutex);
> +
> +	return ret;
> +}
> +
> +/**
> + * amdgpu_dpm_is_temp_metrics_supported - Check whether a specific temperature
> + * metrics type is supported
> + * @adev: Pointer to the device.
> + * @type: Identifier for the temperature type metrics to be fetched.
> + *
> + * This function reports whether the specific temperature metrics type is supported.
> + *
> + * Return: True if the metrics type is supported, false otherwise.
> + */
> +bool amdgpu_dpm_is_temp_metrics_supported(struct amdgpu_device *adev,
> +					  enum smu_temp_metric_type type)
> +{
> +	const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
> +	bool support_temp_metrics = false;
> +
> +	if (!pp_funcs->temp_metrics_is_supported)
> +		return support_temp_metrics;
> +
> +	if (is_support_sw_smu(adev)) {
> +		mutex_lock(&adev->pm.mutex);
> +		support_temp_metrics =
> +			pp_funcs->temp_metrics_is_supported(adev->powerplay.pp_handle, type);
> +		mutex_unlock(&adev->pm.mutex);
> +	}
> +
> +	return support_temp_metrics;
> +}
> +
>  /**
>   * amdgpu_dpm_get_xcp_metrics - Retrieve metrics for a specific compute
>   * partition
> diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
> index 768317ee1486..09962db988d6 100644
> --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
> +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
> @@ -526,6 +526,8 @@ int amdgpu_dpm_set_power_profile_mode(struct amdgpu_device *adev,
>  int amdgpu_dpm_get_gpu_metrics(struct amdgpu_device *adev, void **table);
>  ssize_t amdgpu_dpm_get_xcp_metrics(struct amdgpu_device *adev, int xcp_id,
>  				   void *table);
> +ssize_t amdgpu_dpm_get_temp_metrics(struct amdgpu_device *adev,
> +				    enum smu_temp_metric_type type, void *table);
>  
>  /**
>   * @get_pm_metrics: Get one snapshot of power management metrics from PMFW. The
> @@ -613,5 +615,7 @@ ssize_t amdgpu_dpm_get_pm_policy_info(struct amdgpu_device *adev,
>  int amdgpu_dpm_reset_sdma(struct amdgpu_device *adev, uint32_t inst_mask);
>  bool amdgpu_dpm_reset_sdma_is_supported(struct amdgpu_device *adev);
>  int amdgpu_dpm_reset_vcn(struct amdgpu_device *adev, uint32_t inst_mask);
> +bool amdgpu_dpm_is_temp_metrics_supported(struct amdgpu_device *adev,
> +					  enum smu_temp_metric_type type);
>  
>  #endif