[PATCH] drm/amdgpu: avoid repeatedly executing gpu ras reset

Tue Jul 9 06:00:42 UTC 2024

When one GPU in an XGMI hive is already performing a RAS reset,
the other GPUs in the hive do not need to schedule recovery work
of their own to reset the GPU again.

Signed-off-by: YiPeng Chai <YiPeng.Chai at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 34226ae010c7..cbb4d6ccc420 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2489,6 +2489,7 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
 	struct amdgpu_device *adev = ras->adev;
 	struct list_head device_list, *device_list_handle =  NULL;
 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
+	struct amdgpu_ras *tmp_ras;
 
 	if (hive) {
 		atomic_set(&hive->ras_recovery, 1);
@@ -2499,11 +2500,19 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
 		 * as part of recovery.
 		 */
 		list_for_each_entry(remote_adev, &hive->device_list,
-				    gmc.xgmi.head)
+				    gmc.xgmi.head) {
+			tmp_ras = amdgpu_ras_get_context(remote_adev);
+			/* When a gpu in hive is performing ras reset, other
+			 * gpus in hive do not need to schedule recovery work
+			 * to reset the gpu.
+			 */
+			atomic_set(&tmp_ras->in_recovery, 1);
+
 			if (amdgpu_ras_get_fed_status(remote_adev)) {
 				amdgpu_ras_set_fed_all(adev, hive, true);
 				break;
 			}
+		}
 	}
 	if (!ras->disable_ras_err_cnt_harvest) {
 
@@ -2556,6 +2565,15 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
 
 		amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
 	}
+
+	if (hive) {
+		list_for_each_entry(remote_adev, &hive->device_list,
+						gmc.xgmi.head) {
+			tmp_ras = amdgpu_ras_get_context(remote_adev);
+			atomic_set(&tmp_ras->in_recovery, 0);
+		}
+	}
+
 	atomic_set(&ras->in_recovery, 0);
 	if (hive) {
 		atomic_set(&hive->ras_recovery, 0);
-- 
2.34.1