[PATCH Review V3 1/1] drm/amdgpu: Fix ecc irq enable/disable unpaired

Fri Dec 22 09:08:26 UTC 2023

On 12/21/2023 11:35 AM, Stanley.Yang wrote:
> The ecc_irq is disabled while GPU mode2 reset suspending process,
> but not be enabled during GPU mode2 reset resume process.
> 
> Changed from V1:
> 	only do sdma/gfx ras_late_init in aldebaran_mode2_restore_ip
> 	delete amdgpu_ras_late_resume function
> 
> Changed from V2:
> 	check umc ras supported before put ecc_irq
> 
> Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/aldebaran.c | 28 +++++++++++++++++++++++++-
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  4 ++++
>   drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c |  5 +++++
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  4 ++++
>   4 files changed, 40 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> index 02f4c6f9d4f6..b60a3c1bd0f2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> +++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
> @@ -330,6 +330,7 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
>   {
>   	struct list_head *reset_device_list = reset_context->reset_device_list;
>   	struct amdgpu_device *tmp_adev = NULL;
> +	struct amdgpu_ras *con;
>   	int r;
>   
>   	if (reset_device_list == NULL)
> @@ -355,7 +356,32 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
>   		 */
>   		amdgpu_register_gpu_instance(tmp_adev);
>   
> -		/* Resume RAS */
> +		/* Resume RAS, ecc_irq */
> +		con = amdgpu_ras_get_context(tmp_adev);
> +		if (!amdgpu_sriov_vf(tmp_adev) && con) {
> +			if (tmp_adev->sdma.ras &&
> +				amdgpu_ras_is_supported(tmp_adev, AMDGPU_RAS_BLOCK__SDMA) &&
> +				tmp_adev->sdma.ras->ras_block.ras_late_init) {
> +				r = tmp_adev->sdma.ras->ras_block.ras_late_init(tmp_adev,
> +						&tmp_adev->sdma.ras->ras_block.ras_comm);
> +				if (r) {
> +					dev_err(tmp_adev->dev, "SDMA failed to execute ras_late_init! ret:%d\n", r);
> +					goto end;
> +				}
> +			}
> +
> +			if (tmp_adev->gfx.ras &&
> +				amdgpu_ras_is_supported(tmp_adev, AMDGPU_RAS_BLOCK__GFX) &&
> +				tmp_adev->gfx.ras->ras_block.ras_late_init) {
> +				r = tmp_adev->gfx.ras->ras_block.ras_late_init(tmp_adev,
> +						&tmp_adev->gfx.ras->ras_block.ras_comm);
> +				if (r) {
> +					dev_err(tmp_adev->dev, "GFX failed to execute ras_late_init! ret:%d\n", r);
> +					goto end;
> +				}
> +			}
> +		}

This is the not the only ASIC that supports mode-2 reset.

What is preferred here is a RAS API which doesn't do all these kind of 
ras variable checks to initialize selective ras blocks.

amdgpu_ras_late_init(ras_block_id) or similar. Whatever checks done 
above may be wrapped inside the API. For now, GFX and SDMA are the only 
blocks that need to be inited, but an API gives more flexibility to 
selectively init blocks that are reset.

Thanks,
Lijo

> +
>   		amdgpu_ras_resume(tmp_adev);
>   
>   		/* Update PSP FW topology after reset */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 09cbca596bb5..4048539205cb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -1043,6 +1043,10 @@ static int gmc_v10_0_hw_fini(void *handle)
>   
>   	amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
>   
> +	if (adev->gmc.ecc_irq.funcs &&
> +		amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
> +		amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
> +
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> index 416f3e4f0438..e1ca5a599971 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> @@ -941,6 +941,11 @@ static int gmc_v11_0_hw_fini(void *handle)
>   	}
>   
>   	amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
> +
> +	if (adev->gmc.ecc_irq.funcs &&
> +		amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
> +		amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
> +
>   	gmc_v11_0_gart_disable(adev);
>   
>   	return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 205db28a9803..f00e5c8c79b0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -2388,6 +2388,10 @@ static int gmc_v9_0_hw_fini(void *handle)
>   
>   	amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
>   
> +	if (adev->gmc.ecc_irq.funcs &&
> +		amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
> +		amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
> +
>   	return 0;
>   }
>   
-- 
Regards,
Lijo