[PATCH v3] drm/amdgpu: Move mca debug mode decision to ras

Zhang, Hawking Hawking.Zhang at amd.com
Thu Nov 16 12:40:26 UTC 2023


[AMD Official Use Only - General]

Reviewed-by: Hawking Zhang <Hawking.Zhang at amd.com>

Regards,
Hawking
-----Original Message-----
From: Lazar, Lijo <Lijo.Lazar at amd.com>
Sent: Friday, November 10, 2023 15:56
To: amd-gfx at lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Deucher, Alexander <Alexander.Deucher at amd.com>; Wang, Yang(Kevin) <KevinYang.Wang at amd.com>
Subject: [PATCH v3] drm/amdgpu: Move mca debug mode decision to ras

Refactor code such that ras block decides the default mca debug mode, and not swsmu block.

By default mca debug mode is set to false.

Signed-off-by: Lijo Lazar <lijo.lazar at amd.com>
---
v3: Default mca debug mode is set to false

v2: Set mca debug mode early before ras block late init as ras query is initiated during late init of ras blocks (KevinYang)

 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c            |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c            | 14 +++++++++++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h            |  2 +-
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c   | 12 ------------
 4 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
index cf33eb219e25..54f2f346579e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
@@ -377,7 +377,7 @@ static int amdgpu_mca_smu_debug_mode_set(void *data, u64 val)
        struct amdgpu_device *adev = (struct amdgpu_device *)data;
        int ret;

-       ret = amdgpu_mca_smu_set_debug_mode(adev, val ? true : false);
+       ret = amdgpu_ras_set_mca_debug_mode(adev, val ? true : false);
        if (ret)
                return ret;

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 84e5987b14e0..6747fbe4feab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3132,6 +3132,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
        if (amdgpu_sriov_vf(adev))
                return 0;

+       amdgpu_ras_set_mca_debug_mode(adev, false);
+
        list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
                if (!node->ras_obj) {
                        dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
@@ -3405,12 +3407,18 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
        return 0;
 }

-void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable)
+int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable)
 {
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+       int ret;

-       if (con)
-               con->is_mca_debug_mode = enable;
+       if (con) {
+               ret = amdgpu_mca_smu_set_debug_mode(adev, enable);
+               if (!ret)
+                       con->is_mca_debug_mode = enable;
+       }
+
+       return ret;
 }

 bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 19161916ac46..6a941eb8fb8f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -773,7 +773,7 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct amdgpu_device *adev);

 int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con);

-void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
+int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
 bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev);
 bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
                                     unsigned int *mode);
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 6cbfb25a05de..f09f56efbdc3 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -1516,7 +1516,6 @@ static int smu_v13_0_6_mca_set_debug_mode(struct smu_context *smu, bool enable)
        if (smu->smc_fw_version < 0x554800)
                return 0;

-       amdgpu_ras_set_mca_debug_mode(smu->adev, enable);
        return smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_ClearMcaOnRead,
                                               enable ? 0 : ClearMcaOnRead_UE_FLAG_MASK | ClearMcaOnRead_CE_POLL_MASK,
                                               NULL);
@@ -2338,16 +2337,6 @@ static int smu_v13_0_6_smu_send_hbm_bad_page_num(struct smu_context *smu,
        return ret;
 }

-static int smu_v13_0_6_post_init(struct smu_context *smu)
-{
-       struct amdgpu_device *adev = smu->adev;
-
-       if (!amdgpu_sriov_vf(adev) && adev->ras_enabled)
-               return smu_v13_0_6_mca_set_debug_mode(smu, false);
-
-       return 0;
-}
-
 static int mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable)
 {
        struct smu_context *smu = adev->powerplay.pp_handle;
@@ -2904,7 +2893,6 @@ static const struct pptable_funcs smu_v13_0_6_ppt_funcs = {
        .i2c_init = smu_v13_0_6_i2c_control_init,
        .i2c_fini = smu_v13_0_6_i2c_control_fini,
        .send_hbm_bad_pages_num = smu_v13_0_6_smu_send_hbm_bad_page_num,
-       .post_init = smu_v13_0_6_post_init,
 };

 void smu_v13_0_6_set_ppt_funcs(struct smu_context *smu)
--
2.25.1



More information about the amd-gfx mailing list