[PATCH] drm/amdgpu: Avoid rma causes GPU duplicate reset
Ce Sun
cesun102 at amd.com
Tue Jul 29 03:23:23 UTC 2025
Try to ensure poison creation handle is completed in time
to set device rma value.
Signed-off-by: Ce Sun <cesun102 at amd.com>
Signed-off-by: Stanley.Yang <Stanley.Yang at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 17 ++++++++++-------
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 +
2 files changed, 11 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ac7099d03e89..eea175874ba0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3356,7 +3356,6 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
page_retirement_dwork.work);
struct amdgpu_device *adev = con->adev;
struct ras_err_data err_data;
- unsigned long err_cnt;
/* If gpu reset is ongoing, delay retiring the bad pages */
if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) {
@@ -3368,13 +3367,9 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
amdgpu_ras_error_data_init(&err_data);
amdgpu_umc_handle_bad_pages(adev, &err_data);
- err_cnt = err_data.err_addr_cnt;
amdgpu_ras_error_data_fini(&err_data);
- if (err_cnt && amdgpu_ras_is_rma(adev))
- amdgpu_ras_reset_gpu(adev);
-
amdgpu_ras_schedule_retirement_dwork(con,
AMDGPU_RAS_RETIRE_PAGE_INTERVAL);
}
@@ -3428,6 +3423,9 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
if (total_detect_count)
schedule_delayed_work(&ras->page_retirement_dwork, 0);
+ if (amdgpu_ras_is_rma(adev) && atomic_cmpxchg(&ras->rma_in_recovery, 0, 1) == 0)
+ amdgpu_ras_reset_gpu(adev);
+
return 0;
}
@@ -3464,6 +3462,12 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
reset_flags |= msg.reset;
}
+ /*
+ * Try to ensure poison creation handler is completed first
+ * to set rma if bad page exceed threshold.
+ */
+ flush_delayed_work(&con->page_retirement_dwork);
+
/* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */
if (reset_flags && !amdgpu_ras_is_rma(adev)) {
if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET)
@@ -3473,8 +3477,6 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
else
reset = reset_flags;
- flush_delayed_work(&con->page_retirement_dwork);
-
con->gpu_reset_flags |= reset;
amdgpu_ras_reset_gpu(adev);
@@ -3645,6 +3647,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info)
mutex_init(&con->recovery_lock);
INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
atomic_set(&con->in_recovery, 0);
+ atomic_set(&con->rma_in_recovery, 0);
con->eeprom_control.bad_channel_bitmap = 0;
max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 7f10a7402160..662046ab73ba 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -515,6 +515,7 @@ struct amdgpu_ras {
/* gpu recovery */
struct work_struct recovery_work;
atomic_t in_recovery;
+ atomic_t rma_in_recovery;
struct amdgpu_device *adev;
/* error handler data */
struct ras_err_handler_data *eh_data;
--
2.34.1
More information about the amd-gfx
mailing list