[PATCH] drm/amd/pm: add delay to avoid unintened shutdown due to hotspot temperature spark
Feng, Kenneth
Kenneth.Feng at amd.com
Wed May 17 00:33:19 UTC 2023
[AMD Official Use Only - General]
Do we really need this delay on all the ASICs?
Maybe set the default value to 0 is more reasonable?
Thanks.
-----Original Message-----
From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Evan Quan
Sent: Tuesday, May 16, 2023 10:51 AM
To: amd-gfx at lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher at amd.com>; Quan, Evan <Evan.Quan at amd.com>
Subject: [PATCH] drm/amd/pm: add delay to avoid unintened shutdown due to hotspot temperature spark
Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
There will be a double check for the hotspot temperature on delay expired. This can avoid unintended shutdown due to hotspot temperature spark.
Signed-off-by: Evan Quan <evan.quan at amd.com>
--
v1->v2:
- add the proper millidegree Celsius to degree Celsius transform
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 14 ++++++++
drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 34 +++++++++++++++++++
drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 2 ++
.../gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c | 9 ++---
.../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 9 ++---
6 files changed, 55 insertions(+), 14 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 39192eba3ff8..4cd873659365 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -243,6 +243,7 @@ extern int amdgpu_num_kcq; #define AMDGPU_VCNFW_LOG_SIZE (32 * 1024) extern int amdgpu_vcnfw_log; extern int amdgpu_sg_display;
+extern uint amdgpu_ctf_delay;
#define AMDGPU_VM_MAX_NUM_CTX 4096
#define AMDGPU_SG_THRESHOLD (256*1024*1024)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 749eeb9a2976..6c699fefdf92 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -198,6 +198,7 @@ int amdgpu_smartshift_bias; int amdgpu_use_xgmi_p2p = 1; int amdgpu_vcnfw_log; int amdgpu_sg_display = -1; /* auto */
+uint amdgpu_ctf_delay = 50; /* in ms */
static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work);
@@ -973,6 +974,19 @@ MODULE_PARM_DESC(smu_pptable_id,
"specify pptable id to be used (-1 = auto(default) value, 0 = use pptable from vbios, > 0 = soft pptable id)"); module_param_named(smu_pptable_id, amdgpu_smu_pptable_id, int, 0444);
+/**
+ * DOC: ctf_delay (uint)
+ * On SW CTF triggerred, to protect the chip from over-heated and
+possible damage, we usually
+ * trigger a system shutdown. However, considering there may be a
+hotspot temperature spark
+ * momentarily hitting the SW CTF setting point, a delay is added to avoid unintended shutdown.
+ * On the delay expired, the shutdown will be performed if the hotspot
+temp is still
+ * bigger than the SW CTF setting. Otherwise, nothing will be done.
+ * The default setting for the delay is 50ms.
+ */
+MODULE_PARM_DESC(ctf_delay,
+ "the delay(default 50ms) enforced before real action
+taken on ctf triggerred"); module_param_named(ctf_delay,
+amdgpu_ctf_delay, uint, 0444);
+
/* These devices are not supported by amdgpu.
* They are supported by the mach64, r128, radeon drivers
*/
diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index 3c860939031e..71153b335ad9 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -24,6 +24,7 @@
#include <linux/firmware.h>
#include <linux/pci.h>
+#include <linux/reboot.h>
#include "amdgpu.h"
#include "amdgpu_smu.h"
@@ -1070,6 +1071,34 @@ static void smu_interrupt_work_fn(struct work_struct *work)
smu->ppt_funcs->interrupt_work(smu);
}
+static void smu_swctf_delayed_work_handler(struct work_struct *work) {
+ struct smu_context *smu =
+ container_of(work, struct smu_context, swctf_delayed_work.work);
+ struct smu_temperature_range *range =
+ &smu->thermal_range;
+ struct amdgpu_device *adev = smu->adev;
+ uint32_t hotspot_tmp, size;
+
+ /*
+ * If the hotspot temperature is confirmed as below SW CTF setting point
+ * after the delay enforced, nothing will be done.
+ * Otherwise, a graceful shutdown will be performed to prevent further damage.
+ */
+ if (smu->ppt_funcs->read_sensor &&
+ !smu->ppt_funcs->read_sensor(smu,
+ AMDGPU_PP_SENSOR_HOTSPOT_TEMP,
+ &hotspot_tmp,
+ &size) &&
+ range->software_shutdown_temp &&
+ hotspot_tmp / 1000 < range->software_shutdown_temp)
+ return;
+
+ dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
+ dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
+ orderly_poweroff(true);
+}
+
static int smu_sw_init(void *handle)
{
struct amdgpu_device *adev = (struct amdgpu_device *)handle; @@ -1358,6 +1387,9 @@ static int smu_smc_hw_setup(struct smu_context *smu)
return ret;
}
+ INIT_DELAYED_WORK(&smu->swctf_delayed_work,
+ smu_swctf_delayed_work_handler);
+
ret = smu_enable_thermal_alert(smu);
if (ret) {
dev_err(adev->dev, "Failed to enable thermal alert!\n"); @@ -1592,6 +1624,8 @@ static int smu_smc_hw_cleanup(struct smu_context *smu)
return ret;
}
+ cancel_delayed_work_sync(&smu->swctf_delayed_work);
+
ret = smu_disable_dpms(smu);
if (ret) {
dev_err(adev->dev, "Fail to disable dpm features!\n"); diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
index 4ce394903c07..18101ec0ae6e 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
@@ -573,6 +573,8 @@ struct smu_context
u32 debug_param_reg;
u32 debug_msg_reg;
u32 debug_resp_reg;
+
+ struct delayed_work swctf_delayed_work;
};
struct i2c_adapter;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
index e1ef88ee1ed3..4c3c682bf7a0 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
@@ -1412,13 +1412,8 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev,
if (client_id == SOC15_IH_CLIENTID_THM) {
switch (src_id) {
case THM_11_0__SRCID__THM_DIG_THERM_L2H:
- dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
- /*
- * SW CTF just occurred.
- * Try to do a graceful shutdown to prevent further damage.
- */
- dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
- orderly_poweroff(true);
+ schedule_delayed_work(&smu->swctf_delayed_work,
+
+ msecs_to_jiffies(amdgpu_ctf_delay));
break;
case THM_11_0__SRCID__THM_DIG_THERM_H2L:
dev_emerg(adev->dev, "ERROR: GPU under temperature range detected\n"); diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index 0bc0a6e97b5a..a5447119d5f5 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -1377,13 +1377,8 @@ static int smu_v13_0_irq_process(struct amdgpu_device *adev,
if (client_id == SOC15_IH_CLIENTID_THM) {
switch (src_id) {
case THM_11_0__SRCID__THM_DIG_THERM_L2H:
- dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
- /*
- * SW CTF just occurred.
- * Try to do a graceful shutdown to prevent further damage.
- */
- dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
- orderly_poweroff(true);
+ schedule_delayed_work(&smu->swctf_delayed_work,
+
+ msecs_to_jiffies(amdgpu_ctf_delay));
break;
case THM_11_0__SRCID__THM_DIG_THERM_H2L:
dev_emerg(adev->dev, "ERROR: GPU under temperature range detected\n");
--
2.34.1
More information about the amd-gfx
mailing list