[PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2)

Zhang, Hawking Hawking.Zhang at amd.com
Thu Apr 21 14:54:13 UTC 2022


[AMD Official Use Only]

Hi Tao,

I was thinking more aggressive change - current amdgpu_ras_interrupt_handler only serves as umc poison (poison mode) or uncorrectable error handler (fue mode).

We can still keep it as a unified entry point, but how about check ip block first, then if it is umc, then check poison mode to decide poison creation handling or legacy uncorrectable error handling.

As discussed before, we'll optimize the poison creation handling later. so keeping poison_creation_handler and legacy umc ue handler in separated functions seems right direction to me.

Thoughts?

Regards,
Hawking

-----Original Message-----
From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Tao Zhou
Sent: Wednesday, April 20, 2022 19:30
To: amd-gfx at lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>; Lazar, Lijo <Lijo.Lazar at amd.com>; Ziya, Mohammad zafar <Mohammadzafar.Ziya at amd.com>; Chai, Thomas <YiPeng.Chai at amd.com>
Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
Subject: [PATCH 1/3] drm/amdgpu: add RAS poison creation handler (v2)

Prepare for the implementation of poison consumption handler.

v2: separate umc handler from poison creation.

Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 70 ++++++++++++++++---------
 1 file changed, 44 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 4bbed76b79c8..22477f76913a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1515,12 +1515,45 @@ static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
 /* ras fs end */

 /* ih begin */
+static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj,
+                               struct amdgpu_iv_entry *entry)
+{
+       dev_info(obj->adev->dev,
+               "Poison is created, no user action is needed.\n"); }
+
+static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
+                               struct amdgpu_iv_entry *entry)
+{
+       struct ras_ih_data *data = &obj->ih_data;
+       struct ras_err_data err_data = {0, 0, 0, NULL};
+       int ret;
+
+       if (!data->cb)
+               return;
+
+       /* Let IP handle its data, maybe we need get the output
+        * from the callback to update the error type/count, etc
+        */
+       ret = data->cb(obj->adev, &err_data, entry);
+       /* ue will trigger an interrupt, and in that case
+        * we need do a reset to recovery the whole system.
+        * But leave IP do that recovery, here we just dispatch
+        * the error.
+        */
+       if (ret == AMDGPU_RAS_SUCCESS) {
+               /* these counts could be left as 0 if
+                * some blocks do not count error number
+                */
+               obj->err_data.ue_count += err_data.ue_count;
+               obj->err_data.ce_count += err_data.ce_count;
+       }
+}
+
 static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)  {
        struct ras_ih_data *data = &obj->ih_data;
        struct amdgpu_iv_entry entry;
-       int ret;
-       struct ras_err_data err_data = {0, 0, 0, NULL};

        while (data->rptr != data->wptr) {
                rmb();
@@ -1531,30 +1564,15 @@ static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
                data->rptr = (data->aligned_element_size +
                                data->rptr) % data->ring_size;

-               if (data->cb) {
-                       if (amdgpu_ras_is_poison_mode_supported(obj->adev) &&
-                           obj->head.block == AMDGPU_RAS_BLOCK__UMC)
-                               dev_info(obj->adev->dev,
-                                               "Poison is created, no user action is needed.\n");
-                       else {
-                               /* Let IP handle its data, maybe we need get the output
-                                * from the callback to udpate the error type/count, etc
-                                */
-                               memset(&err_data, 0, sizeof(err_data));
-                               ret = data->cb(obj->adev, &err_data, &entry);
-                               /* ue will trigger an interrupt, and in that case
-                                * we need do a reset to recovery the whole system.
-                                * But leave IP do that recovery, here we just dispatch
-                                * the error.
-                                */
-                               if (ret == AMDGPU_RAS_SUCCESS) {
-                                       /* these counts could be left as 0 if
-                                        * some blocks do not count error number
-                                        */
-                                       obj->err_data.ue_count += err_data.ue_count;
-                                       obj->err_data.ce_count += err_data.ce_count;
-                               }
-                       }
+               if (amdgpu_ras_is_poison_mode_supported(obj->adev)) {
+                       if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
+                               amdgpu_ras_interrupt_poison_creation_handler(obj, &entry);
+               } else {
+                       if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
+                               amdgpu_ras_interrupt_umc_handler(obj, &entry);
+                       else
+                               dev_warn(obj->adev->dev,
+                                       "No RAS interrupt handler for non-UMC block with poison
+disabled.\n");
                }
        }
 }
--
2.35.1



More information about the amd-gfx mailing list