[PATCH Review V2 1/1] drm/amdgpu: Fix ecc irq enable/disable unpaired
Yang, Stanley
Stanley.Yang at amd.com
Wed Dec 20 13:14:49 UTC 2023
[AMD Official Use Only - General]
Yes, a RAS capability check should be added before calling amdgpu_irq_put on gmc.ecc_irq, thanks.
Regards,
Stanley
> -----Original Message-----
> From: Zhang, Hawking <Hawking.Zhang at amd.com>
> Sent: Wednesday, December 20, 2023 4:12 PM
> To: Yang, Stanley <Stanley.Yang at amd.com>; amd-gfx at lists.freedesktop.org
> Subject: RE: [PATCH Review V2 1/1] drm/amdgpu: Fix ecc irq enable/disable
> unpaired
>
> [AMD Official Use Only - General]
>
> + if (adev->gmc.ecc_irq.funcs)
> + amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
> +
>
> This doesn't match the amdgpu_irq_get call for gmc.ecc_irq, where the driver
> checks the RAS capability to decide whether to enable the interrupt (see
> amdgpu_umc_ras_late_init). We should do the same check for the amdgpu_irq_put call.
>
> Regards,
> Hawking
>
> -----Original Message-----
> From: Yang, Stanley <Stanley.Yang at amd.com>
> Sent: Tuesday, December 19, 2023 20:48
> To: amd-gfx at lists.freedesktop.org; Zhang, Hawking
> <Hawking.Zhang at amd.com>
> Cc: Yang, Stanley <Stanley.Yang at amd.com>
> Subject: [PATCH Review V2 1/1] drm/amdgpu: Fix ecc irq enable/disable
> unpaired
>
> The ecc_irq is disabled during the GPU mode2 reset suspend process, but is not
> re-enabled during the GPU mode2 reset resume process.
>
> Changed from V1:
> only do sdma/gfx ras_late_init in aldebaran_mode2_restore_ip,
> delete amdgpu_ras_late_resume function.
>
> Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/aldebaran.c | 28 +++++++++++++++++++++++++-
> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  3 +++
> drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c |  4 ++++
> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  3 +++
> 4 files changed, 37 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> index 02f4c6f9d4f6..b60a3c1bd0f2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> +++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> @@ -330,6 +330,7 @@ aldebaran_mode2_restore_hwcontext(struct
> amdgpu_reset_control *reset_ctl, {
> struct list_head *reset_device_list = reset_context->reset_device_list;
> struct amdgpu_device *tmp_adev = NULL;
> + struct amdgpu_ras *con;
> int r;
>
> if (reset_device_list == NULL)
> @@ -355,7 +356,32 @@ aldebaran_mode2_restore_hwcontext(struct
> amdgpu_reset_control *reset_ctl,
> */
> amdgpu_register_gpu_instance(tmp_adev);
>
> - /* Resume RAS */
> + /* Resume RAS, ecc_irq */
> + con = amdgpu_ras_get_context(tmp_adev);
> + if (!amdgpu_sriov_vf(tmp_adev) && con) {
> + if (tmp_adev->sdma.ras &&
> + amdgpu_ras_is_supported(tmp_adev,
> AMDGPU_RAS_BLOCK__SDMA) &&
> + tmp_adev->sdma.ras->ras_block.ras_late_init) {
> + r = tmp_adev->sdma.ras-
> >ras_block.ras_late_init(tmp_adev,
> + &tmp_adev->sdma.ras->ras_block.ras_comm);
> + if (r) {
> + dev_err(tmp_adev->dev, "SDMA failed to execute
> ras_late_init! ret:%d\n", r);
> + goto end;
> + }
> + }
> +
> + if (tmp_adev->gfx.ras &&
> + amdgpu_ras_is_supported(tmp_adev,
> AMDGPU_RAS_BLOCK__GFX) &&
> + tmp_adev->gfx.ras->ras_block.ras_late_init) {
> + r = tmp_adev->gfx.ras->ras_block.ras_late_init(tmp_adev,
> + &tmp_adev->gfx.ras->ras_block.ras_comm);
> + if (r) {
> + dev_err(tmp_adev->dev, "GFX failed to execute
> ras_late_init! ret:%d\n", r);
> + goto end;
> + }
> + }
> + }
> +
> amdgpu_ras_resume(tmp_adev);
>
> /* Update PSP FW topology after reset */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 09cbca596bb5..b93a0baeb2d3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -1043,6 +1043,9 @@ static int gmc_v10_0_hw_fini(void *handle)
>
> amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
>
> + if (adev->gmc.ecc_irq.funcs)
> + amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
> +
> return 0;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> index 416f3e4f0438..e633e60850b3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> @@ -941,6 +941,10 @@ static int gmc_v11_0_hw_fini(void *handle)
> }
>
> amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
> +
> + if (adev->gmc.ecc_irq.funcs)
> + amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
> +
> gmc_v11_0_gart_disable(adev);
>
> return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 205db28a9803..8ac4d5b7fb37 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -2388,6 +2388,9 @@ static int gmc_v9_0_hw_fini(void *handle)
>
> amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
>
> + if (adev->gmc.ecc_irq.funcs)
> + amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
> +
> return 0;
> }
>
> --
> 2.25.1
>
More information about the amd-gfx
mailing list