[PATCH Review V3 1/1] drm/amdgpu: Fix ecc irq enable/disable unpaired

Zhou1, Tao Tao.Zhou1 at amd.com
Thu Dec 21 08:31:10 UTC 2023


[AMD Official Use Only - General]

Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>

> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of
> Stanley.Yang
> Sent: Thursday, December 21, 2023 2:05 PM
> To: amd-gfx at lists.freedesktop.org; Zhang, Hawking <Hawking.Zhang at amd.com>
> Cc: Yang, Stanley <Stanley.Yang at amd.com>
> Subject: [PATCH Review V3 1/1] drm/amdgpu: Fix ecc irq enable/disable unpaired
>
> The ecc_irq is disabled during the GPU mode2 reset suspend process, but is
> not re-enabled during the GPU mode2 reset resume process.
>
> Changed from V1:
>       only do sdma/gfx ras_late_init in aldebaran_mode2_restore_ip
>       delete amdgpu_ras_late_resume function
>
> Changed from V2:
>       check that UMC RAS is supported before calling amdgpu_irq_put on ecc_irq
>
> Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/aldebaran.c | 28 +++++++++++++++++++++++++-
> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  4 ++++
> drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c |  5 +++++
> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  4 ++++
>  4 files changed, 40 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> index 02f4c6f9d4f6..b60a3c1bd0f2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> +++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> @@ -330,6 +330,7 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
>  {
>       struct list_head *reset_device_list = reset_context->reset_device_list;
>       struct amdgpu_device *tmp_adev = NULL;
> +     struct amdgpu_ras *con;
>       int r;
>
>       if (reset_device_list == NULL)
> @@ -355,7 +356,32 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
>                */
>               amdgpu_register_gpu_instance(tmp_adev);
>
> -             /* Resume RAS */
> +             /* Resume RAS, ecc_irq */
> +             con = amdgpu_ras_get_context(tmp_adev);
> +             if (!amdgpu_sriov_vf(tmp_adev) && con) {
> +                     if (tmp_adev->sdma.ras &&
> +                             amdgpu_ras_is_supported(tmp_adev, AMDGPU_RAS_BLOCK__SDMA) &&
> +                             tmp_adev->sdma.ras->ras_block.ras_late_init) {
> +                             r = tmp_adev->sdma.ras->ras_block.ras_late_init(tmp_adev,
> +                                             &tmp_adev->sdma.ras->ras_block.ras_comm);
> +                             if (r) {
> +                                     dev_err(tmp_adev->dev, "SDMA failed to execute ras_late_init! ret:%d\n", r);
> +                                     goto end;
> +                             }
> +                     }
> +
> +                     if (tmp_adev->gfx.ras &&
> +                             amdgpu_ras_is_supported(tmp_adev, AMDGPU_RAS_BLOCK__GFX) &&
> +                             tmp_adev->gfx.ras->ras_block.ras_late_init) {
> +                             r = tmp_adev->gfx.ras->ras_block.ras_late_init(tmp_adev,
> +                                             &tmp_adev->gfx.ras->ras_block.ras_comm);
> +                             if (r) {
> +                                     dev_err(tmp_adev->dev, "GFX failed to execute ras_late_init! ret:%d\n", r);
> +                                     goto end;
> +                             }
> +                     }
> +             }
> +
>               amdgpu_ras_resume(tmp_adev);
>
>               /* Update PSP FW topology after reset */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 09cbca596bb5..4048539205cb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -1043,6 +1043,10 @@ static int gmc_v10_0_hw_fini(void *handle)
>
>       amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
>
> +     if (adev->gmc.ecc_irq.funcs &&
> +             amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
> +             amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
> +
>       return 0;
>  }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> index 416f3e4f0438..e1ca5a599971 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> @@ -941,6 +941,11 @@ static int gmc_v11_0_hw_fini(void *handle)
>       }
>
>       amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
> +
> +     if (adev->gmc.ecc_irq.funcs &&
> +             amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
> +             amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
> +
>       gmc_v11_0_gart_disable(adev);
>
>       return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 205db28a9803..f00e5c8c79b0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -2388,6 +2388,10 @@ static int gmc_v9_0_hw_fini(void *handle)
>
>       amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
>
> +     if (adev->gmc.ecc_irq.funcs &&
> +             amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
> +             amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
> +
>       return 0;
>  }
>
> --
> 2.25.1



More information about the amd-gfx mailing list