[PATCH 2/2] drm/amd/pm: avoid unintentional shutdown due to temperature momentary fluctuation

Mon Jun 26 11:54:22 UTC 2023

On 6/26/2023 1:17 PM, Evan Quan wrote:
> An intentional delay is added on soft ctf triggered. Then there will
> be a double check for the GPU temperature before taking further
> action. This can avoid unintended shutdown due to temperature
> momentary fluctuation.
> 
> Signed-off-by: Evan Quan <evan.quan at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h           |  3 ++
>   .../gpu/drm/amd/pm/powerplay/amd_powerplay.c  | 48 +++++++++++++++++++
>   .../drm/amd/pm/powerplay/hwmgr/smu_helper.c   | 27 ++++-------
>   drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h  |  2 +
>   drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c     | 34 +++++++++++++
>   drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h |  2 +
>   .../gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c    |  9 +---
>   .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c    |  9 +---
>   8 files changed, 102 insertions(+), 32 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index e459381dc759..5ef1f31e703c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -287,6 +287,9 @@ extern int amdgpu_user_partt_mode;
>   #define AMDGPU_SMARTSHIFT_MAX_BIAS (100)
>   #define AMDGPU_SMARTSHIFT_MIN_BIAS (-100)
>   
> +/* Extra time delay(in ms) to eliminate the influence of temperature momentary fluctuation */
> +#define AMDGPU_SWCTF_EXTRA_DELAY		50

I think a delay of 10-15ms is good enough to filter out any spike.

With that change, the series is
	Reviewed-by: Lijo Lazar <lijo.lazar at amd.com>

Thanks,
Lijo

> +
>   struct amdgpu_xcp_mgr;
>   struct amdgpu_device;
>   struct amdgpu_irq_src;
> diff --git a/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c b/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
> index 11b7b4cffaae..ff360c699171 100644
> --- a/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
> +++ b/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
> @@ -26,6 +26,7 @@
>   #include <linux/gfp.h>
>   #include <linux/slab.h>
>   #include <linux/firmware.h>
> +#include <linux/reboot.h>
>   #include "amd_shared.h"
>   #include "amd_powerplay.h"
>   #include "power_state.h"
> @@ -91,6 +92,45 @@ static int pp_early_init(void *handle)
>   	return 0;
>   }
>   
> +static void pp_swctf_delayed_work_handler(struct work_struct *work)
> +{
> +	struct pp_hwmgr *hwmgr =
> +		container_of(work, struct pp_hwmgr, swctf_delayed_work.work);
> +	struct amdgpu_device *adev = hwmgr->adev;
> +	struct amdgpu_dpm_thermal *range =
> +				&adev->pm.dpm.thermal;
> +	uint32_t gpu_temperature, size;
> +	int ret;
> +
> +	/*
> +	 * If the hotspot/edge temperature is confirmed as below SW CTF setting point
> +	 * after the delay enforced, nothing will be done.
> +	 * Otherwise, a graceful shutdown will be performed to prevent further damage.
> +	 */
> +	if (range->sw_ctf_threshold &&
> +	    hwmgr->hwmgr_func->read_sensor) {
> +		ret = hwmgr->hwmgr_func->read_sensor(hwmgr,
> +						     AMDGPU_PP_SENSOR_HOTSPOT_TEMP,
> +						     &gpu_temperature,
> +						     &size);
> +		/*
> +		 * For some legacy ASICs, hotspot temperature retrieving might be not
> +		 * supported. Check the edge temperature instead then.
> +		 */
> +		if (ret == -EOPNOTSUPP)
> +			ret = hwmgr->hwmgr_func->read_sensor(hwmgr,
> +							     AMDGPU_PP_SENSOR_EDGE_TEMP,
> +							     &gpu_temperature,
> +							     &size);
> +		if (!ret && gpu_temperature / 1000 < range->sw_ctf_threshold)
> +			return;
> +	}
> +
> +	dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
> +	dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
> +	orderly_poweroff(true);
> +}
> +
>   static int pp_sw_init(void *handle)
>   {
>   	struct amdgpu_device *adev = handle;
> @@ -101,6 +141,10 @@ static int pp_sw_init(void *handle)
>   
>   	pr_debug("powerplay sw init %s\n", ret ? "failed" : "successfully");
>   
> +	if (!ret)
> +		INIT_DELAYED_WORK(&hwmgr->swctf_delayed_work,
> +				  pp_swctf_delayed_work_handler);
> +
>   	return ret;
>   }
>   
> @@ -135,6 +179,8 @@ static int pp_hw_fini(void *handle)
>   	struct amdgpu_device *adev = handle;
>   	struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;
>   
> +	cancel_delayed_work_sync(&hwmgr->swctf_delayed_work);
> +
>   	hwmgr_hw_fini(hwmgr);
>   
>   	return 0;
> @@ -221,6 +267,8 @@ static int pp_suspend(void *handle)
>   	struct amdgpu_device *adev = handle;
>   	struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;
>   
> +	cancel_delayed_work_sync(&hwmgr->swctf_delayed_work);
> +
>   	return hwmgr_suspend(hwmgr);
>   }
>   
> diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c
> index bfe80ac0ad8c..d0b1ab6c4523 100644
> --- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c
> +++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c
> @@ -603,21 +603,17 @@ int phm_irq_process(struct amdgpu_device *adev,
>   			   struct amdgpu_irq_src *source,
>   			   struct amdgpu_iv_entry *entry)
>   {
> +	struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;
>   	uint32_t client_id = entry->client_id;
>   	uint32_t src_id = entry->src_id;
>   
>   	if (client_id == AMDGPU_IRQ_CLIENTID_LEGACY) {
>   		if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_LOW_TO_HIGH) {
> -			dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
> -			/*
> -			 * SW CTF just occurred.
> -			 * Try to do a graceful shutdown to prevent further damage.
> -			 */
> -			dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
> -			orderly_poweroff(true);
> -		} else if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_HIGH_TO_LOW)
> +			schedule_delayed_work(&hwmgr->swctf_delayed_work,
> +					      msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));
> +		} else if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_HIGH_TO_LOW) {
>   			dev_emerg(adev->dev, "ERROR: GPU under temperature range detected!\n");
> -		else if (src_id == VISLANDS30_IV_SRCID_GPIO_19) {
> +		} else if (src_id == VISLANDS30_IV_SRCID_GPIO_19) {
>   			dev_emerg(adev->dev, "ERROR: GPU HW Critical Temperature Fault(aka CTF) detected!\n");
>   			/*
>   			 * HW CTF just occurred. Shutdown to prevent further damage.
> @@ -626,15 +622,10 @@ int phm_irq_process(struct amdgpu_device *adev,
>   			orderly_poweroff(true);
>   		}
>   	} else if (client_id == SOC15_IH_CLIENTID_THM) {
> -		if (src_id == 0) {
> -			dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
> -			/*
> -			 * SW CTF just occurred.
> -			 * Try to do a graceful shutdown to prevent further damage.
> -			 */
> -			dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
> -			orderly_poweroff(true);
> -		} else
> +		if (src_id == 0)
> +			schedule_delayed_work(&hwmgr->swctf_delayed_work,
> +					      msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));
> +		else
>   			dev_emerg(adev->dev, "ERROR: GPU under temperature range detected!\n");
>   	} else if (client_id == SOC15_IH_CLIENTID_ROM_SMUIO) {
>   		dev_emerg(adev->dev, "ERROR: GPU HW Critical Temperature Fault(aka CTF) detected!\n");
> diff --git a/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h b/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h
> index f1580a26a850..612d66aeaab9 100644
> --- a/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h
> +++ b/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h
> @@ -811,6 +811,8 @@ struct pp_hwmgr {
>   	bool gfxoff_state_changed_by_workload;
>   	uint32_t pstate_sclk_peak;
>   	uint32_t pstate_mclk_peak;
> +
> +	struct delayed_work swctf_delayed_work;
>   };
>   
>   int hwmgr_early_init(struct pp_hwmgr *hwmgr);
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> index 2aecb8faf407..4062b9b46986 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> @@ -24,6 +24,7 @@
>   
>   #include <linux/firmware.h>
>   #include <linux/pci.h>
> +#include <linux/reboot.h>
>   
>   #include "amdgpu.h"
>   #include "amdgpu_smu.h"
> @@ -1078,6 +1079,34 @@ static void smu_interrupt_work_fn(struct work_struct *work)
>   		smu->ppt_funcs->interrupt_work(smu);
>   }
>   
> +static void smu_swctf_delayed_work_handler(struct work_struct *work)
> +{
> +	struct smu_context *smu =
> +		container_of(work, struct smu_context, swctf_delayed_work.work);
> +	struct smu_temperature_range *range =
> +				&smu->thermal_range;
> +	struct amdgpu_device *adev = smu->adev;
> +	uint32_t hotspot_tmp, size;
> +
> +	/*
> +	 * If the hotspot temperature is confirmed as below SW CTF setting point
> +	 * after the delay enforced, nothing will be done.
> +	 * Otherwise, a graceful shutdown will be performed to prevent further damage.
> +	 */
> +	if (range->software_shutdown_temp &&
> +	    smu->ppt_funcs->read_sensor &&
> +	    !smu->ppt_funcs->read_sensor(smu,
> +					 AMDGPU_PP_SENSOR_HOTSPOT_TEMP,
> +					 &hotspot_tmp,
> +					 &size) &&
> +	    hotspot_tmp / 1000 < range->software_shutdown_temp)
> +		return;
> +
> +	dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
> +	dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
> +	orderly_poweroff(true);
> +}
> +
>   static int smu_sw_init(void *handle)
>   {
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> @@ -1120,6 +1149,9 @@ static int smu_sw_init(void *handle)
>   	smu->smu_dpm.dpm_level = AMD_DPM_FORCED_LEVEL_AUTO;
>   	smu->smu_dpm.requested_dpm_level = AMD_DPM_FORCED_LEVEL_AUTO;
>   
> +	INIT_DELAYED_WORK(&smu->swctf_delayed_work,
> +			  smu_swctf_delayed_work_handler);
> +
>   	ret = smu_smc_table_sw_init(smu);
>   	if (ret) {
>   		dev_err(adev->dev, "Failed to sw init smc table!\n");
> @@ -1600,6 +1632,8 @@ static int smu_smc_hw_cleanup(struct smu_context *smu)
>   		return ret;
>   	}
>   
> +	cancel_delayed_work_sync(&smu->swctf_delayed_work);
> +
>   	ret = smu_disable_dpms(smu);
>   	if (ret) {
>   		dev_err(adev->dev, "Fail to disable dpm features!\n");
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> index 4ce394903c07..18101ec0ae6e 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> @@ -573,6 +573,8 @@ struct smu_context
>   	u32 debug_param_reg;
>   	u32 debug_msg_reg;
>   	u32 debug_resp_reg;
> +
> +	struct delayed_work		swctf_delayed_work;
>   };
>   
>   struct i2c_adapter;
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
> index e1ef88ee1ed3..aa4a5498a12f 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
> @@ -1412,13 +1412,8 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev,
>   	if (client_id == SOC15_IH_CLIENTID_THM) {
>   		switch (src_id) {
>   		case THM_11_0__SRCID__THM_DIG_THERM_L2H:
> -			dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
> -			/*
> -			 * SW CTF just occurred.
> -			 * Try to do a graceful shutdown to prevent further damage.
> -			 */
> -			dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
> -			orderly_poweroff(true);
> +			schedule_delayed_work(&smu->swctf_delayed_work,
> +					      msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));
>   		break;
>   		case THM_11_0__SRCID__THM_DIG_THERM_H2L:
>   			dev_emerg(adev->dev, "ERROR: GPU under temperature range detected\n");
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> index 9e8414037638..4071bfa38727 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> @@ -1353,13 +1353,8 @@ static int smu_v13_0_irq_process(struct amdgpu_device *adev,
>   	if (client_id == SOC15_IH_CLIENTID_THM) {
>   		switch (src_id) {
>   		case THM_11_0__SRCID__THM_DIG_THERM_L2H:
> -			dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
> -			/*
> -			 * SW CTF just occurred.
> -			 * Try to do a graceful shutdown to prevent further damage.
> -			 */
> -			dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
> -			orderly_poweroff(true);
> +			schedule_delayed_work(&smu->swctf_delayed_work,
> +					      msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));
>   			break;
>   		case THM_11_0__SRCID__THM_DIG_THERM_H2L:
>   			dev_emerg(adev->dev, "ERROR: GPU under temperature range detected\n");