[PATCH Review V2 1/1] drm/amdgpu: Fix ecc irq enable/disable unpaired

Zhang, Hawking Hawking.Zhang at amd.com
Wed Dec 20 08:12:09 UTC 2023


[AMD Official Use Only - General]

+       if (adev->gmc.ecc_irq.funcs)
+               amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
+

This doesn't match with amdgpu_irq_get call for gmc.ecc_irq, where driver checks ras cap to decide whether enabling the interrupt or not (see amdgpu_umc_ras_late_init). We do the same check for amdgpu_irq_put call.

Regards,
Hawking

-----Original Message-----
From: Yang, Stanley <Stanley.Yang at amd.com>
Sent: Tuesday, December 19, 2023 20:48
To: amd-gfx at lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang at amd.com>
Cc: Yang, Stanley <Stanley.Yang at amd.com>
Subject: [PATCH Review V2 1/1] drm/amdgpu: Fix ecc irq enable/disable unpaired

The ecc_irq is disabled while GPU mode2 reset suspending process, but not be enabled during GPU mode2 reset resume process.

Changed from V1:
        only do sdma/gfx ras_late_init in aldebaran_mode2_restore_ip,
        delete amdgpu_ras_late_resume function.

Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/aldebaran.c | 28 +++++++++++++++++++++++++-  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  3 +++  drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c |  4 ++++  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  3 +++
 4 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
index 02f4c6f9d4f6..b60a3c1bd0f2 100644
--- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
@@ -330,6 +330,7 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,  {
        struct list_head *reset_device_list = reset_context->reset_device_list;
        struct amdgpu_device *tmp_adev = NULL;
+       struct amdgpu_ras *con;
        int r;

        if (reset_device_list == NULL)
@@ -355,7 +356,32 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
                 */
                amdgpu_register_gpu_instance(tmp_adev);

-               /* Resume RAS */
+               /* Resume RAS, ecc_irq */
+               con = amdgpu_ras_get_context(tmp_adev);
+               if (!amdgpu_sriov_vf(tmp_adev) && con) {
+                       if (tmp_adev->sdma.ras &&
+                               amdgpu_ras_is_supported(tmp_adev, AMDGPU_RAS_BLOCK__SDMA) &&
+                               tmp_adev->sdma.ras->ras_block.ras_late_init) {
+                               r = tmp_adev->sdma.ras->ras_block.ras_late_init(tmp_adev,
+                                               &tmp_adev->sdma.ras->ras_block.ras_comm);
+                               if (r) {
+                                       dev_err(tmp_adev->dev, "SDMA failed to execute ras_late_init! ret:%d\n", r);
+                                       goto end;
+                               }
+                       }
+
+                       if (tmp_adev->gfx.ras &&
+                               amdgpu_ras_is_supported(tmp_adev, AMDGPU_RAS_BLOCK__GFX) &&
+                               tmp_adev->gfx.ras->ras_block.ras_late_init) {
+                               r = tmp_adev->gfx.ras->ras_block.ras_late_init(tmp_adev,
+                                               &tmp_adev->gfx.ras->ras_block.ras_comm);
+                               if (r) {
+                                       dev_err(tmp_adev->dev, "GFX failed to execute ras_late_init! ret:%d\n", r);
+                                       goto end;
+                               }
+                       }
+               }
+
                amdgpu_ras_resume(tmp_adev);

                /* Update PSP FW topology after reset */ diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 09cbca596bb5..b93a0baeb2d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -1043,6 +1043,9 @@ static int gmc_v10_0_hw_fini(void *handle)

        amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);

+       if (adev->gmc.ecc_irq.funcs)
+               amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
+
        return 0;
 }

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index 416f3e4f0438..e633e60850b3 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -941,6 +941,10 @@ static int gmc_v11_0_hw_fini(void *handle)
        }

        amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
+
+       if (adev->gmc.ecc_irq.funcs)
+               amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
+
        gmc_v11_0_gart_disable(adev);

        return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 205db28a9803..8ac4d5b7fb37 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -2388,6 +2388,9 @@ static int gmc_v9_0_hw_fini(void *handle)

        amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);

+       if (adev->gmc.ecc_irq.funcs)
+               amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
+
        return 0;
 }

--
2.25.1



More information about the amd-gfx mailing list