[PATCH 2/5] drm/amdgpu: trigger mode1 reset for RAS RMA status

Wed Jun 5 07:47:36 UTC 2024

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Hawking Zhang <Hawking.Zhang at amd.com>

Regards,
Hawking
-----Original Message-----
From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Tao Zhou
Sent: Friday, May 31, 2024 18:49
To: amd-gfx at lists.freedesktop.org
Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
Subject: [PATCH 2/5] drm/amdgpu: trigger mode1 reset for RAS RMA status

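Check the RAS RMA status in the bad page retirement flow; once the device
is in RMA status, mode1 reset is the only reset method that is triggered.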

v2: fix coding bugs in v1.

Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 28 +++++++++++++++++++-----
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c  |  8 +++----
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c |  4 +++-
 3 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 616dc2387f34..10cbcc0d1a1a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2049,8 +2049,9 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
        struct amdgpu_device *adev = obj->adev;
        struct amdgpu_ras_block_object *block_obj =
                amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

-       if (!block_obj)
+       if (!block_obj || !con)
                return;

        /* both query_poison_status and handle_poison_consumption are optional,
@@ -2073,14 +2074,17 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
        if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
                poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);

-       /* gpu reset is fallback for failed and default cases */
-       if (poison_stat) {
+       /* gpu reset is fallback for failed and default cases.
+        * For RMA case, amdgpu_umc_poison_handler will handle gpu reset.
+        */
+       if (poison_stat && !con->is_rma) {
                dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n",
                                block_obj->ras_comm.name);
                amdgpu_ras_reset_gpu(adev);
-       } else {
-               amdgpu_gfx_poison_consumption_handler(adev, entry);
        }
+
+       if (!poison_stat)
+               amdgpu_gfx_poison_consumption_handler(adev, entry);
 }

 static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj,
@@ -2801,6 +2805,7 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
                                              page_retirement_dwork.work);
        struct amdgpu_device *adev = con->adev;
        struct ras_err_data err_data;
+       unsigned long err_cnt;

        if (amdgpu_in_reset(adev) || atomic_read(&con->in_recovery))
                return;
@@ -2808,9 +2813,13 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
        amdgpu_ras_error_data_init(&err_data);

        amdgpu_umc_handle_bad_pages(adev, &err_data);
+       err_cnt = err_data.err_addr_cnt;

        amdgpu_ras_error_data_fini(&err_data);

+       if (err_cnt && con->is_rma)
+               amdgpu_ras_reset_gpu(adev);
+
        mutex_lock(&con->umc_ecc_log.lock);
        if (radix_tree_tagged(&con->umc_ecc_log.de_page_tree,
                                UMC_ECC_NEW_DETECTED_TAG))
@@ -2867,7 +2876,8 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
        if (poison_msg->pasid_fn)
                poison_msg->pasid_fn(adev, pasid, poison_msg->data);

-       if (reset) {
+       /* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */
+       if (reset && !con->is_rma) {
                flush_delayed_work(&con->page_retirement_dwork);

                con->gpu_reset_flags |= reset;
@@ -3983,6 +3993,12 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
 {
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

+       /* mode1 is the only selection for RMA status */
+       if (ras->is_rma) {
+               ras->gpu_reset_flags = 0;
+               ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+       }
+
        if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
                amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work);
        return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 1dbe69eabb9a..4a72ff8d8d80 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -195,7 +195,8 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
        kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
        amdgpu_umc_handle_bad_pages(adev, ras_error_status);

-       if (err_data->ue_count && reset) {
+       if ((err_data->ue_count || err_data->de_count) &&
+           (reset || (con && con->is_rma))) {
                con->gpu_reset_flags |= reset;
                amdgpu_ras_reset_gpu(adev);
        }
@@ -211,6 +212,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
                .block = AMDGPU_RAS_BLOCK__UMC,
        };
        struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        uint32_t timeout = timeout_ms;

        memset(&err_data, 0, sizeof(err_data));
@@ -243,9 +245,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,

        kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);

-       if (reset) {
-               struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-
+       if (reset || (err_data.err_addr_cnt && con && con->is_rma)) {
                con->gpu_reset_flags |= reset;
                amdgpu_ras_reset_gpu(adev);
        }
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
index 9e7ce1e6bc06..9cd221ed240c 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
@@ -85,6 +85,7 @@ static int gfx_v11_0_3_poison_consumption_handler(struct amdgpu_device *adev,
        if (entry && (entry->client_id == SOC21_IH_CLIENTID_GFX) &&
            (entry->src_id == GFX_11_0_0__SRCID__RLC_GC_FED_INTERRUPT) &&
             !entry->vmid && !entry->pasid) {
+               struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
                uint32_t rlc_status0 = 0;

                rlc_status0 = RREG32_SOC15(GC, 0, regRLC_RLCS_FED_STATUS_0);
@@ -96,7 +97,8 @@ static int gfx_v11_0_3_poison_consumption_handler(struct amdgpu_device *adev,
                        ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
                }

-               amdgpu_ras_reset_gpu(adev);
+               if (con && !con->is_rma)
+                       amdgpu_ras_reset_gpu(adev);
        }

        return 0;
--
2.34.1