[PATCH 2/3] drm/amdgpu: set poison mode for RAS
Tao Zhou
tao.zhou1 at amd.com
Sat Sep 18 08:07:50 UTC 2021
Add RAS poison mode flag and tell PSP RAS TA about the info.
Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 4 ++--
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 28 +++++++++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 5 +++++
3 files changed, 35 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 7d09b28889af..140b94da2f5a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1442,9 +1442,9 @@ static int psp_ras_initialize(struct psp_context *psp)
ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
- if (psp->adev->gmc.xgmi.connected_to_cpu)
+ if (amdgpu_ras_is_poison_enabled(adev))
ras_cmd->ras_in_message.init_flags.poison_mode_en = 1;
- else
+ if (!adev->gmc.xgmi.connected_to_cpu)
ras_cmd->ras_in_message.init_flags.dgpu_mode = 1;
ret = psp_ras_load(psp);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index b5332db4d287..7b7e54fdd785 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2180,6 +2180,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int r;
+ bool df_poison, umc_poison;
if (con)
return 0;
@@ -2249,6 +2250,23 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
goto release_con;
}
+ /* Init poison mode, the default value is false */
+ if (adev->df.funcs &&
+ adev->df.funcs->query_ras_poison_mode &&
+ adev->umc.ras_funcs &&
+ adev->umc.ras_funcs->query_ras_poison_mode) {
+ df_poison =
+ adev->df.funcs->query_ras_poison_mode(adev);
+ umc_poison =
+ adev->umc.ras_funcs->query_ras_poison_mode(adev);
+ /* Only poison is set in both DF and UMC, we can enable it */
+ if (df_poison && umc_poison)
+ con->poison_mode_en = true;
+ else if (df_poison != umc_poison)
+ dev_warn(adev->dev, "Poison setting is inconsistent in DF/UMC(%d:%d)!\n",
+ df_poison, umc_poison);
+ }
+
if (amdgpu_ras_fs_init(adev)) {
r = -EINVAL;
goto release_con;
@@ -2292,6 +2310,16 @@ static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev,
return 0;
}
+bool amdgpu_ras_is_poison_enabled(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+ if (!con)
+ return false;
+
+ return con->poison_mode_en;
+}
+
/* helper function to handle common stuff in ip late init phase */
int amdgpu_ras_late_init(struct amdgpu_device *adev,
struct ras_common_if *ras_block,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 1670467c2054..044bd19b7cce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -345,6 +345,9 @@ struct amdgpu_ras {
/* disable ras error count harvest in recovery */
bool disable_ras_err_cnt_harvest;
+ /* is poison mode */
+ bool poison_mode_en;
+
/* RAS count errors delayed work */
struct delayed_work ras_counte_delay_work;
atomic_t ras_ue_count;
@@ -640,4 +643,6 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev);
int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev);
+bool amdgpu_ras_is_poison_enabled(struct amdgpu_device *adev);
+
#endif
--
2.17.1
More information about the amd-gfx
mailing list