[PATCH] drm/amdgpu: avoid repeatedly executing gpu ras reset
YiPeng Chai
YiPeng.Chai at amd.com
Tue Jul 9 06:00:42 UTC 2024
When one GPU in a hive is already performing a RAS reset, the
other GPUs in the hive do not need to schedule their own recovery
work to reset the GPU again.
Signed-off-by: YiPeng Chai <YiPeng.Chai at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 20 +++++++++++++++++++-
1 file changed, 19 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 34226ae010c7..cbb4d6ccc420 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2489,6 +2489,7 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
struct amdgpu_device *adev = ras->adev;
struct list_head device_list, *device_list_handle = NULL;
struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
+ struct amdgpu_ras *tmp_ras;
if (hive) {
atomic_set(&hive->ras_recovery, 1);
@@ -2499,11 +2500,19 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
* as part of recovery.
*/
list_for_each_entry(remote_adev, &hive->device_list,
- gmc.xgmi.head)
+ gmc.xgmi.head) {
+ tmp_ras = amdgpu_ras_get_context(remote_adev);
+ /* When a gpu in hive is performing ras reset, other
+ * gpus in hive do not need to schedule recovery work
+ * to reset the gpu.
+ */
+ atomic_set(&tmp_ras->in_recovery, 1);
+
if (amdgpu_ras_get_fed_status(remote_adev)) {
amdgpu_ras_set_fed_all(adev, hive, true);
break;
}
+ }
}
if (!ras->disable_ras_err_cnt_harvest) {
@@ -2556,6 +2565,15 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
}
+
+ if (hive) {
+ list_for_each_entry(remote_adev, &hive->device_list,
+ gmc.xgmi.head) {
+ tmp_ras = amdgpu_ras_get_context(remote_adev);
+ atomic_set(&tmp_ras->in_recovery, 0);
+ }
+ }
+
atomic_set(&ras->in_recovery, 0);
if (hive) {
atomic_set(&hive->ras_recovery, 0);
--
2.34.1
More information about the amd-gfx
mailing list