[PATCH 1/3] drm/amdgpu: create function to check RAS RMA status

Zhang, Hawking Hawking.Zhang at amd.com
Thu Aug 1 10:22:22 UTC 2024


[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Hawking Zhang <Hawking.Zhang at amd.com>

Regards,
Hawking
-----Original Message-----
From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Tao Zhou
Sent: Thursday, August 1, 2024 18:00
To: amd-gfx at lists.freedesktop.org
Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>
Subject: [PATCH 1/3] drm/amdgpu: create function to check RAS RMA status

For the convenience of calling it globally.

Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 22 ++++++++++++++++------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h  |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c  |  2 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c |  2 +-
 4 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 12ab48f26bd5..0941518f04c2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2153,7 +2153,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
        /* gpu reset is fallback for failed and default cases.
         * For RMA case, amdgpu_umc_poison_handler will handle gpu reset.
         */
-       if (poison_stat && !con->is_rma) {
+       if (poison_stat && !amdgpu_ras_is_rma(adev)) {
                event_id = amdgpu_ras_acquire_event_id(adev, type);
                RAS_EVENT_LOG(adev, event_id,
                              "GPU reset for %s RAS poison consumption is issued!\n",
@@ -2951,7 +2951,7 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)

        amdgpu_ras_error_data_fini(&err_data);

-       if (err_cnt && con->is_rma)
+       if (err_cnt && amdgpu_ras_is_rma(adev))
                amdgpu_ras_reset_gpu(adev);

        amdgpu_ras_schedule_retirement_dwork(con,
@@ -3053,7 +3053,7 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
        }

        /* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */
-       if (reset_flags && !con->is_rma) {
+       if (reset_flags && !amdgpu_ras_is_rma(adev)) {
                if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET)
                        reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
                else if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET)
@@ -3202,7 +3202,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
         * This calling fails when is_rma is true or
         * ret != 0.
         */
-       if (con->is_rma || ret)
+       if (amdgpu_ras_is_rma(adev) || ret)
                goto free;

        if (con->eeprom_control.ras_num_recs) {
@@ -3254,7 +3254,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
         * Except error threshold exceeding case, other failure cases in this
         * function would not fail amdgpu driver init.
         */
-       if (!con->is_rma)
+       if (!amdgpu_ras_is_rma(adev))
                ret = 0;
        else
                ret = -EINVAL;
@@ -4301,7 +4301,7 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

        /* mode1 is the only selection for RMA status */
-       if (ras->is_rma) {
+       if (amdgpu_ras_is_rma(adev)) {
                ras->gpu_reset_flags = 0;
                ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
        }
@@ -4835,3 +4835,13 @@ void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id,

        va_end(args);
 }
+
+bool amdgpu_ras_is_rma(struct amdgpu_device *adev)
+{
+       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+       if (!con)
+               return false;
+
+       return con->is_rma;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 7ddd13d5c06b..25a19760f098 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -972,4 +972,5 @@ __printf(3, 4)
 void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id,
                                const char *fmt, ...);

+bool amdgpu_ras_is_rma(struct amdgpu_device *adev);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 32be258d81e1..9e70a7b3aa64 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -196,7 +196,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
        amdgpu_umc_handle_bad_pages(adev, ras_error_status);

        if ((err_data->ue_count || err_data->de_count) &&
-           (reset || (con && con->is_rma))) {
+           (reset || amdgpu_ras_is_rma(adev))) {
                con->gpu_reset_flags |= reset;
                amdgpu_ras_reset_gpu(adev);
        }
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
index 9cd221ed240c..999bb3cc88b7 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
@@ -97,7 +97,7 @@ static int gfx_v11_0_3_poison_consumption_handler(struct amdgpu_device *adev,
                        ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
                }

-               if (con && !con->is_rma)
+               if (con && !amdgpu_ras_is_rma(adev))
                        amdgpu_ras_reset_gpu(adev);
        }

--
2.34.1



More information about the amd-gfx mailing list