[PATCH] drm/amdgpu: Fix the gpu recover deadlock issue in abnormal situations

Fri Jun 6 02:31:10 UTC 2025

An RMA condition occurred on a device whose RAS records exceeded the
threshold. While the device was performing GPU recovery, the reset
domain lock was not released, leaving tasks blocked and resulting in
a kernel panic:

[  630.141619] INFO: task umc_page_retire:9472 blocked for more than 122 seconds.
[  630.157663]       Tainted: G           OE      6.9.0-0_fbk6_brcmrdma11_125_gfecec9d12677 #1
[  630.176213] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  630.193591] task:umc_page_retire state:D stack:0     pid:9472  tgid:9472  ppid:2      flags:0x00004000
[  630.214251] Call Trace:
[  630.219673]  <TASK>
[  630.224326]  __schedule+0x589/0xad0
[  630.232070]  schedule+0x67/0xb0
[  630.239038]  schedule_timeout+0x22/0x100
[  630.247740]  ? __smp_call_single_queue+0x36/0x100
[  630.258195]  ? wake_up_process+0x7df/0x1190
[  630.267477]  ? wait_for_common+0xa4/0x150
[  630.276371]  wait_for_common+0x99/0x150
[  630.284881]  __flush_work.llvm.6727378282878825097+0x20a/0x290
[  630.297826]  ? rcu_work_rcufn+0x20/0x20
[  630.306338]  amdgpu_ras_page_retirement_thread+0x469/0x4e0 [amdgpu]
[  630.320344]  ? amdgpu_ras_do_recovery+0x5f0/0x5f0 [amdgpu]
[  630.332587]  kthread+0xdd/0x120
[  630.339552]  ? __sched_group_set_shares+0x160/0x160
[  630.350373]  ret_from_fork+0x2f/0x40
[  630.358303]  ? __sched_group_set_shares+0x160/0x160
[  630.369128]  ret_from_fork_asm+0x11/0x20
[  630.377831]  </TASK>
[  630.382796] INFO: task kworker/u1536:1:10591 blocked for more than 123 seconds.
[  630.399018]       Tainted: G           OE      6.9.0-0_fbk6_brcmrdma11_125_gfecec9d12677 #1
[  630.417554] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  630.434935] task:kworker/u1536:1 state:D stack:0     pid:10591 tgid:10591 ppid:2      flags:0x00004000
[  630.455594] Workqueue: amdgpu-reset-hive amdgpu_ras_do_recovery [amdgpu]
[  630.470537] Call Trace:
[  630.475958]  <TASK>
[  630.480609]  __schedule+0x589/0xad0
[  630.488347]  schedule+0x67/0xb0
[  630.495313]  schedule_preempt_disabled+0xa/0x10
[  630.505362]  rwsem_down_write_slowpath+0x2ba/0x510
[  630.515994]  down_write+0x2b/0x30
[  630.523346]  amdgpu_device_halt_activities+0xef/0x400 [amdgpu]
[  630.536363]  amdgpu_device_gpu_recover+0x124/0x230 [amdgpu]
[  630.548797]  amdgpu_ras_do_recovery+0x5af/0x5f0 [amdgpu]
[  630.560653]  process_scheduled_works+0x184/0x370
[  630.570900]  worker_thread+0xc6/0x3f0
[  630.579022]  ? __ipv6_chk_addr_and_flags.llvm.7715710786076949193+0x160/0x160
[  630.594857]  ? __ipv6_chk_addr_and_flags.llvm.7715710786076949193+0x160/0x160
[  630.610693]  kthread+0xdd/0x120
[  630.617660]  ? __sched_group_set_shares+0x160/0x160
[  630.628483]  ret_from_fork+0x2f/0x40
[  630.636413]  ? __sched_group_set_shares+0x160/0x160
[  630.647232]  ret_from_fork_asm+0x11/0x20

Signed-off-by: Ce Sun <cesun102 at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2b84df8da61a..6b82b5805e3d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -6305,7 +6305,6 @@ static void amdgpu_device_gpu_resume(struct amdgpu_device *adev,
 	tmp_adev = list_first_entry(device_list, struct amdgpu_device,
 					    reset_list);
 	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
-
 }
 
 
@@ -6396,12 +6395,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	}
 
 	r = amdgpu_device_asic_reset(adev, &device_list, reset_context);
-	if (r)
-		goto end_reset;
 skip_hw_reset:
 	r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled);
-	if (r)
-		goto end_reset;
 skip_sched_resume:
 	amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart);
 end_reset:
@@ -6938,8 +6933,8 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
 		if (hive) {
 			list_for_each_entry(tmp_adev, &device_list, reset_list)
 				amdgpu_device_unset_mp1_state(tmp_adev);
-			amdgpu_device_unlock_reset_domain(adev->reset_domain);
 		}
+		amdgpu_device_unlock_reset_domain(adev->reset_domain);
 	}
 
 	if (hive) {
-- 
2.34.1