[PATCH] drm/amdgpu: Avoid rma causes GPU duplicate reset

Tue Jul 29 03:57:22 UTC 2025

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>

> -----Original Message-----
> From: Sun, Ce(Overlord) <Ce.Sun at amd.com>
> Sent: Tuesday, July 29, 2025 11:23 AM
> To: amd-gfx at lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Zhou1, Tao
> <Tao.Zhou1 at amd.com>; Yang, Stanley <Stanley.Yang at amd.com>; Sun,
> Ce(Overlord) <Ce.Sun at amd.com>
> Subject: [PATCH] drm/amdgpu: Avoid rma causes GPU duplicate reset
>
> Try to ensure the poison creation handler completes in time to set the device RMA value.
>
> Signed-off-by: Ce Sun <cesun102 at amd.com>
> Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 17 ++++++++++-------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  1 +
>  2 files changed, 11 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index ac7099d03e89..eea175874ba0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -3356,7 +3356,6 @@ static void amdgpu_ras_do_page_retirement(struct
> work_struct *work)
>                                             page_retirement_dwork.work);
>       struct amdgpu_device *adev = con->adev;
>       struct ras_err_data err_data;
> -     unsigned long err_cnt;
>
>       /* If gpu reset is ongoing, delay retiring the bad pages */
>       if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) {
> @@ -3368,13 +3367,9 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
>       amdgpu_ras_error_data_init(&err_data);
>
>       amdgpu_umc_handle_bad_pages(adev, &err_data);
> -     err_cnt = err_data.err_addr_cnt;
>
>       amdgpu_ras_error_data_fini(&err_data);
>
> -     if (err_cnt && amdgpu_ras_is_rma(adev))
> -             amdgpu_ras_reset_gpu(adev);
> -
>       amdgpu_ras_schedule_retirement_dwork(con,
>                       AMDGPU_RAS_RETIRE_PAGE_INTERVAL);
>  }
> @@ -3428,6 +3423,9 @@ static int amdgpu_ras_poison_creation_handler(struct
> amdgpu_device *adev,
>       if (total_detect_count)
>               schedule_delayed_work(&ras->page_retirement_dwork, 0);
>
> +     if (amdgpu_ras_is_rma(adev) && atomic_cmpxchg(&ras->rma_in_recovery, 0, 1) == 0)
> +             amdgpu_ras_reset_gpu(adev);
> +
>       return 0;
>  }
>
> @@ -3464,6 +3462,12 @@ static int
> amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
>               reset_flags |= msg.reset;
>       }
>
> +     /*
> +      * Try to ensure poison creation handler is completed first
> +      * to set rma if bad page exceed threshold.
> +      */
> +     flush_delayed_work(&con->page_retirement_dwork);
> +
>       /* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */
>       if (reset_flags && !amdgpu_ras_is_rma(adev)) {
>               if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET)
> @@ -3473,8 +3477,6 @@ static int
> amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
>               else
>                       reset = reset_flags;
>
> -             flush_delayed_work(&con->page_retirement_dwork);
> -
>               con->gpu_reset_flags |= reset;
>               amdgpu_ras_reset_gpu(adev);
>
> @@ -3645,6 +3647,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device
> *adev, bool init_bp_info)
>       mutex_init(&con->recovery_lock);
>       INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
>       atomic_set(&con->in_recovery, 0);
> +     atomic_set(&con->rma_in_recovery, 0);
>       con->eeprom_control.bad_channel_bitmap = 0;
>
>       max_eeprom_records_count =
> amdgpu_ras_eeprom_max_record_count(&con->eeprom_control);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 7f10a7402160..662046ab73ba 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -515,6 +515,7 @@ struct amdgpu_ras {
>       /* gpu recovery */
>       struct work_struct recovery_work;
>       atomic_t in_recovery;
> +     atomic_t rma_in_recovery;
>       struct amdgpu_device *adev;
>       /* error handler data */
>       struct ras_err_handler_data *eh_data;
> --
> 2.34.1