[PATCH v2] drm/amdgpu: Fix the gpu recover deadlock issue in abnormal situations
Lazar, Lijo
lijo.lazar at amd.com
Fri Jun 6 05:08:02 UTC 2025
On 6/6/2025 9:24 AM, Ce Sun wrote:
> An RMA condition occurred on a device whose RAS bad-page records exceeded
> the threshold. While the device was performing GPU recovery, the reset
> domain lock was not released, resulting in a kernel panic:
>
> [ 630.141619] INFO: task umc_page_retire:9472 blocked for more than 122 seconds.
> [ 630.157663] Tainted: G OE 6.9.0-0_fbk6_brcmrdma11_125_gfecec9d12677 #1
> [ 630.176213] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [ 630.193591] task:umc_page_retire state:D stack:0 pid:9472 tgid:9472 ppid:2 flags:0x00004000
> [ 630.214251] Call Trace:
> [ 630.219673] <TASK>
> [ 630.224326] __schedule+0x589/0xad0
> [ 630.232070] schedule+0x67/0xb0
> [ 630.239038] schedule_timeout+0x22/0x100
> [ 630.247740] ? __smp_call_single_queue+0x36/0x100
> [ 630.258195] ? wake_up_process+0x7df/0x1190
> [ 630.267477] ? wait_for_common+0xa4/0x150
> [ 630.276371] wait_for_common+0x99/0x150
> [ 630.284881] __flush_work.llvm.6727378282878825097+0x20a/0x290
> [ 630.297826] ? rcu_work_rcufn+0x20/0x20
> [ 630.306338] amdgpu_ras_page_retirement_thread+0x469/0x4e0 [amdgpu]
> [ 630.320344] ? amdgpu_ras_do_recovery+0x5f0/0x5f0 [amdgpu]
> [ 630.332587] kthread+0xdd/0x120
> [ 630.339552] ? __sched_group_set_shares+0x160/0x160
> [ 630.350373] ret_from_fork+0x2f/0x40
> [ 630.358303] ? __sched_group_set_shares+0x160/0x160
> [ 630.369128] ret_from_fork_asm+0x11/0x20
> [ 630.377831] </TASK>
> [ 630.382796] INFO: task kworker/u1536:1:10591 blocked for more than 123 seconds.
> [ 630.399018] Tainted: G OE 6.9.0-0_fbk6_brcmrdma11_125_gfecec9d12677 #1
> [ 630.417554] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [ 630.434935] task:kworker/u1536:1 state:D stack:0 pid:10591 tgid:10591 ppid:2 flags:0x00004000
> [ 630.455594] Workqueue: amdgpu-reset-hive amdgpu_ras_do_recovery [amdgpu]
> [ 630.470537] Call Trace:
> [ 630.475958] <TASK>
> [ 630.480609] __schedule+0x589/0xad0
> [ 630.488347] schedule+0x67/0xb0
> [ 630.495313] schedule_preempt_disabled+0xa/0x10
> [ 630.505362] rwsem_down_write_slowpath+0x2ba/0x510
> [ 630.515994] down_write+0x2b/0x30
> [ 630.523346] amdgpu_device_halt_activities+0xef/0x400 [amdgpu]
> [ 630.536363] amdgpu_device_gpu_recover+0x124/0x230 [amdgpu]
> [ 630.548797] amdgpu_ras_do_recovery+0x5af/0x5f0 [amdgpu]
> [ 630.560653] process_scheduled_works+0x184/0x370
> [ 630.570900] worker_thread+0xc6/0x3f0
> [ 630.579022] ? __ipv6_chk_addr_and_flags.llvm.7715710786076949193+0x160/0x160
> [ 630.594857] ? __ipv6_chk_addr_and_flags.llvm.7715710786076949193+0x160/0x160
> [ 630.610693] kthread+0xdd/0x120
> [ 630.617660] ? __sched_group_set_shares+0x160/0x160
> [ 630.628483] ret_from_fork+0x2f/0x40
> [ 630.636413] ? __sched_group_set_shares+0x160/0x160
> [ 630.647232] ret_from_fork_asm+0x11/0x20
>
> Signed-off-by: Ce Sun <cesun102 at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 9 +++------
> 1 file changed, 3 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 2b84df8da61a..f5e69132bce8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -6305,7 +6305,6 @@ static void amdgpu_device_gpu_resume(struct amdgpu_device *adev,
> tmp_adev = list_first_entry(device_list, struct amdgpu_device,
> reset_list);
> amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
> -
> }
>
>
> @@ -6396,12 +6395,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> }
>
> r = amdgpu_device_asic_reset(adev, &device_list, reset_context);
> - if (r)
> - goto end_reset;
> skip_hw_reset:
> r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled);
> - if (r)
> - goto end_reset;
I don't think it's right to proceed further in case of failure. Please
try the attached patch.
Thanks,
Lijo
> skip_sched_resume:
> amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart);
> end_reset:
> @@ -6938,8 +6933,10 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
> if (hive) {
> list_for_each_entry(tmp_adev, &device_list, reset_list)
> amdgpu_device_unset_mp1_state(tmp_adev);
> - amdgpu_device_unlock_reset_domain(adev->reset_domain);
> }
> + tmp_adev = list_first_entry(&device_list, struct amdgpu_device,
> + reset_list);
> + amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
> }
>
> if (hive) {
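In summary, the flow the attached patch aims for in amdgpu_device_gpu_recover()
looks roughly like the condensed sketch below (drawn from the hunks in the
attachment; the asic reset, scheduler resume and hive cleanup steps are elided):

	/*
	 * Prepare the device list first; if that fails, no lock has been
	 * taken yet, so it is safe to bail out directly.
	 */
	if (amdgpu_device_recovery_prepare(adev, &device_list, hive))
		goto end_reset;

	/* Lock the reset domain only once, for both XGMI hive and single device */
	amdgpu_device_recovery_get_reset_lock(adev, &device_list);

	r = amdgpu_device_halt_activities(adev, job, reset_context, &device_list,
					  hive, need_emergency_restart);
	if (r)
		goto reset_unlock;	/* drop the lock on every failure path */

	/* ... asic reset, scheduler resume, gpu resume ... */

reset_unlock:
	amdgpu_device_recovery_put_reset_lock(adev, &device_list);
end_reset:
	/* hive mutex unlock etc. unchanged */

With the unlock funneled through reset_unlock, a later flush_work() on
amdgpu_ras_do_recovery can no longer block forever behind a reset domain
lock that a failed recovery never released.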
-------------- next part --------------
From a5c81fe086fa1f9c8df087a41d9a529f57d65faf Mon Sep 17 00:00:00 2001
From: Lijo Lazar <lijo.lazar at amd.com>
Date: Fri, 6 Jun 2025 10:29:28 +0530
Subject: [PATCH] drm/amdgpu: Release reset locks during failures
Make sure to release the reset domain lock in case of failures.
Signed-off-by: Lijo Lazar <lijo.lazar at amd.com>
Fixes: 0f936e23cf9d ("drm/amdgpu: refactor amdgpu_device_gpu_recover")
---
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 76 +++++++++++++++-------
1 file changed, 53 insertions(+), 23 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e64969d576a6..0eedcffd2582 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -6057,16 +6057,12 @@ static int amdgpu_device_health_check(struct list_head *device_list_handle)
return ret;
}
-static int amdgpu_device_halt_activities(struct amdgpu_device *adev,
- struct amdgpu_job *job,
- struct amdgpu_reset_context *reset_context,
- struct list_head *device_list,
- struct amdgpu_hive_info *hive,
- bool need_emergency_restart)
+static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
+ struct list_head *device_list,
+ struct amdgpu_hive_info *hive)
{
- struct list_head *device_list_handle = NULL;
struct amdgpu_device *tmp_adev = NULL;
- int i, r = 0;
+ int r;
/*
* Build list of devices to reset.
@@ -6083,26 +6079,54 @@ static int amdgpu_device_halt_activities(struct amdgpu_device *adev,
}
if (!list_is_first(&adev->reset_list, device_list))
list_rotate_to_front(&adev->reset_list, device_list);
- device_list_handle = device_list;
} else {
list_add_tail(&adev->reset_list, device_list);
- device_list_handle = device_list;
}
if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) {
- r = amdgpu_device_health_check(device_list_handle);
+ r = amdgpu_device_health_check(device_list);
if (r)
return r;
}
- /* We need to lock reset domain only once both for XGMI and single device */
- tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
- reset_list);
+ return 0;
+}
+
+static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev,
+ struct list_head *device_list)
+{
+ struct amdgpu_device *tmp_adev = NULL;
+
+ if (list_empty(device_list))
+ return;
+ tmp_adev =
+ list_first_entry(device_list, struct amdgpu_device, reset_list);
amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
+}
- /* block all schedulers and reset given job's ring */
- list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev,
+ struct list_head *device_list)
+{
+ struct amdgpu_device *tmp_adev = NULL;
+
+ if (list_empty(device_list))
+ return;
+ tmp_adev =
+ list_first_entry(device_list, struct amdgpu_device, reset_list);
+ amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
+}
+static int amdgpu_device_halt_activities(
+ struct amdgpu_device *adev, struct amdgpu_job *job,
+ struct amdgpu_reset_context *reset_context,
+ struct list_head *device_list, struct amdgpu_hive_info *hive,
+ bool need_emergency_restart)
+{
+ struct amdgpu_device *tmp_adev = NULL;
+ int i, r = 0;
+
+ /* block all schedulers and reset given job's ring */
+ list_for_each_entry(tmp_adev, device_list, reset_list) {
amdgpu_device_set_mp1_state(tmp_adev);
/*
@@ -6290,11 +6314,6 @@ static void amdgpu_device_gpu_resume(struct amdgpu_device *adev,
amdgpu_ras_set_error_query_ready(tmp_adev, true);
}
-
- tmp_adev = list_first_entry(device_list, struct amdgpu_device,
- reset_list);
- amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
-
}
@@ -6362,10 +6381,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
reset_context->hive = hive;
INIT_LIST_HEAD(&device_list);
+ if (amdgpu_device_recovery_prepare(adev, &device_list, hive))
+ goto end_reset;
+
+ /* We need to lock reset domain only once both for XGMI and single device */
+ amdgpu_device_recovery_get_reset_lock(adev, &device_list);
+
r = amdgpu_device_halt_activities(adev, job, reset_context, &device_list,
hive, need_emergency_restart);
if (r)
- goto end_reset;
+ goto reset_unlock;
if (need_emergency_restart)
goto skip_sched_resume;
@@ -6390,6 +6415,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
goto end_reset;
skip_sched_resume:
amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart);
+reset_unlock:
+ amdgpu_device_recovery_put_reset_lock(adev, &device_list);
end_reset:
if (hive) {
mutex_unlock(&hive->hive_lock);
@@ -6801,6 +6828,8 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
memset(&reset_context, 0, sizeof(reset_context));
INIT_LIST_HEAD(&device_list);
+ amdgpu_device_recovery_prepare(adev, &device_list, hive);
+ amdgpu_device_recovery_get_reset_lock(adev, &device_list);
r = amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list,
hive, false);
if (hive) {
@@ -6918,8 +6947,8 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
if (hive) {
list_for_each_entry(tmp_adev, &device_list, reset_list)
amdgpu_device_unset_mp1_state(tmp_adev);
- amdgpu_device_unlock_reset_domain(adev->reset_domain);
}
+ amdgpu_device_recovery_put_reset_lock(adev, &device_list);
}
if (hive) {
@@ -6965,6 +6994,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
amdgpu_device_sched_resume(&device_list, NULL, NULL);
amdgpu_device_gpu_resume(adev, &device_list, false);
+ amdgpu_device_recovery_put_reset_lock(adev, &device_list);
adev->pcie_reset_ctx.occurs_dpc = false;
if (hive) {
--
2.25.1