<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=us-ascii">
<style type="text/css" style="display:none;"> P {margin-top:0;margin-bottom:0;} </style>
</head>
<body dir="ltr">
<p style="font-family:Calibri;font-size:10pt;color:#0000FF;margin:5pt;font-style:normal;font-weight:normal;text-decoration:none;" align="Left">
[AMD Official Use Only - AMD Internal Distribution Only]<br>
</p>
<br>
<div>
<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">
Ping...</div>
<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">
<br>
</div>
<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">
<br>
</div>
<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">
<br>
</div>
<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">
<br>
</div>
<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">
<br>
</div>
<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">
<br>
</div>
<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">
<br>
</div>
<div class="elementToProof" style="direction: ltr; text-align: left; text-indent: 0px; background-color: white; margin: 0px; font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif, serif, EmojiFont; font-size: 12pt; color: black;">
Regards,</div>
<div class="elementToProof" style="direction: ltr; text-align: left; text-indent: 0px; background-color: white; margin: 0px; font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif, serif, EmojiFont; font-size: 12pt; color: black;">
Ce, Sun</div>
<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">
<br>
</div>
<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">
<br>
</div>
<div id="appendonsend"></div>
<hr style="display:inline-block;width:98%" tabindex="-1">
<div id="divRplyFwdMsg" dir="ltr"><font face="Calibri, sans-serif" style="font-size:11pt" color="#000000"><b>From:</b> Sun, Ce(Overlord) &lt;Ce.Sun@amd.com&gt;<br>
<b>Sent:</b> Friday, June 6, 2025 3:12 PM<br>
<b>To:</b> amd-gfx@lists.freedesktop.org &lt;amd-gfx@lists.freedesktop.org&gt;<br>
<b>Cc:</b> Zhou1, Tao &lt;Tao.Zhou1@amd.com&gt;; Zhang, Hawking &lt;Hawking.Zhang@amd.com&gt;; Chai, Thomas &lt;YiPeng.Chai@amd.com&gt;; Lazar, Lijo &lt;Lijo.Lazar@amd.com&gt;; Sun, Ce(Overlord) &lt;Ce.Sun@amd.com&gt;<br>
<b>Subject:</b> [PATCH] drm/amdgpu: Release reset locks during failures</font>
<div> </div>
</div>
<div class="BodyFragment"><font size="2"><span style="font-size:11pt;">
<div class="PlainText">From: Lijo Lazar &lt;lijo.lazar@amd.com&gt;<br>
<br>
Make sure to release reset domain lock in case of failures.<br>
<br>
Signed-off-by: Lijo Lazar &lt;lijo.lazar@amd.com&gt;<br>
Signed-off-by: Ce Sun &lt;cesun102@amd.com&gt;<br>
<br>
Fixes: 0f936e23cf9d ("drm/amdgpu: refactor amdgpu_device_gpu_recover")<br>
---<br>
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 80 +++++++++++++++-------<br>
 1 file changed, 55 insertions(+), 25 deletions(-)<br>
<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
index 2b84df8da61a..85509cd4cab8 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
@@ -6068,16 +6068,12 @@ static int amdgpu_device_health_check(struct list_head *device_list_handle)<br>
         return ret;<br>
 }<br>
 <br>
-static int amdgpu_device_halt_activities(struct amdgpu_device *adev,<br>
-                             struct amdgpu_job *job,<br>
-                             struct amdgpu_reset_context *reset_context,<br>
-                             struct list_head *device_list,<br>
-                             struct amdgpu_hive_info *hive,<br>
-                             bool need_emergency_restart)<br>
+static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev,<br>
+                                         struct list_head *device_list,<br>
+                                         struct amdgpu_hive_info *hive)<br>
 {<br>
-       struct list_head *device_list_handle =  NULL;<br>
         struct amdgpu_device *tmp_adev = NULL;<br>
-       int i, r = 0;<br>
+       int r;<br>
 <br>
         /*<br>
          * Build list of devices to reset.<br>
@@ -6094,26 +6090,54 @@ static int amdgpu_device_halt_activities(struct amdgpu_device *adev,<br>
                 }<br>
                 if (!list_is_first(&adev->reset_list, device_list))<br>
                         list_rotate_to_front(&adev->reset_list, device_list);<br>
-               device_list_handle = device_list;<br>
         } else {<br>
                 list_add_tail(&adev->reset_list, device_list);<br>
-               device_list_handle = device_list;<br>
         }<br>
 <br>
         if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) {<br>
-               r = amdgpu_device_health_check(device_list_handle);<br>
+               r = amdgpu_device_health_check(device_list);<br>
                 if (r)<br>
                         return r;<br>
         }<br>
 <br>
-       /* We need to lock reset domain only once both for XGMI and single device */<br>
-       tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,<br>
-                                   reset_list);<br>
+       return 0;<br>
+}<br>
+<br>
+static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev,<br>
+                                                 struct list_head *device_list)<br>
+{<br>
+       struct amdgpu_device *tmp_adev = NULL;<br>
+<br>
+       if (list_empty(device_list))<br>
+               return;<br>
+       tmp_adev =<br>
+               list_first_entry(device_list, struct amdgpu_device, reset_list);<br>
         amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);<br>
+}<br>
 <br>
-       /* block all schedulers and reset given job's ring */<br>
-       list_for_each_entry(tmp_adev, device_list_handle, reset_list) {<br>
+static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev,<br>
+                                                 struct list_head *device_list)<br>
+{<br>
+       struct amdgpu_device *tmp_adev = NULL;<br>
 <br>
+       if (list_empty(device_list))<br>
+               return;<br>
+       tmp_adev =<br>
+               list_first_entry(device_list, struct amdgpu_device, reset_list);<br>
+       amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);<br>
+}<br>
+<br>
+static int amdgpu_device_halt_activities(<br>
+       struct amdgpu_device *adev, struct amdgpu_job *job,<br>
+       struct amdgpu_reset_context *reset_context,<br>
+       struct list_head *device_list, struct amdgpu_hive_info *hive,<br>
+       bool need_emergency_restart)<br>
+{<br>
+       struct amdgpu_device *tmp_adev = NULL;<br>
+       int i, r = 0;<br>
+<br>
+       /* block all schedulers and reset given job's ring */<br>
+       list_for_each_entry(tmp_adev, device_list, reset_list) {<br>
                 amdgpu_device_set_mp1_state(tmp_adev);<br>
 <br>
                 /*<br>
@@ -6301,11 +6325,6 @@ static void amdgpu_device_gpu_resume(struct amdgpu_device *adev,<br>
                 amdgpu_ras_set_error_query_ready(tmp_adev, true);<br>
 <br>
         }<br>
-<br>
-       tmp_adev = list_first_entry(device_list, struct amdgpu_device,<br>
-                                           reset_list);<br>
-       amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);<br>
-<br>
 }<br>
 <br>
 <br>
@@ -6376,10 +6395,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,<br>
         reset_context->hive = hive;<br>
         INIT_LIST_HEAD(&device_list);<br>
 <br>
+       if (amdgpu_device_recovery_prepare(adev, &device_list, hive))<br>
+               goto end_reset;<br>
+<br>
+       /* We need to lock reset domain only once both for XGMI and single device */<br>
+       amdgpu_device_recovery_get_reset_lock(adev, &device_list);<br>
+<br>
         r = amdgpu_device_halt_activities(adev, job, reset_context, &device_list,<br>
                                          hive, need_emergency_restart);<br>
         if (r)<br>
-               goto end_reset;<br>
+               goto reset_unlock;<br>
 <br>
         if (need_emergency_restart)<br>
                 goto skip_sched_resume;<br>
@@ -6397,13 +6422,15 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,<br>
 <br>
         r = amdgpu_device_asic_reset(adev, &device_list, reset_context);<br>
         if (r)<br>
-               goto end_reset;<br>
+               goto reset_unlock;<br>
 skip_hw_reset:<br>
         r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled);<br>
         if (r)<br>
-               goto end_reset;<br>
+               goto reset_unlock;<br>
 skip_sched_resume:<br>
         amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart);<br>
+reset_unlock:<br>
+       amdgpu_device_recovery_put_reset_lock(adev, &device_list);<br>
 end_reset:<br>
         if (hive) {<br>
                 mutex_unlock(&hive->hive_lock);<br>
@@ -6821,6 +6848,8 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta<br>
                 memset(&reset_context, 0, sizeof(reset_context));<br>
                 INIT_LIST_HEAD(&device_list);<br>
 <br>
+               amdgpu_device_recovery_prepare(adev, &device_list, hive);<br>
+               amdgpu_device_recovery_get_reset_lock(adev, &device_list);<br>
                 r = amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list,<br>
                                          hive, false);<br>
                 if (hive) {<br>
@@ -6938,8 +6967,8 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)<br>
                 if (hive) {<br>
                         list_for_each_entry(tmp_adev, &device_list, reset_list)<br>
                                 amdgpu_device_unset_mp1_state(tmp_adev);<br>
-                       amdgpu_device_unlock_reset_domain(adev->reset_domain);<br>
                 }<br>
+               amdgpu_device_recovery_put_reset_lock(adev, &device_list);<br>
         }<br>
 <br>
         if (hive) {<br>
@@ -6985,6 +7014,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev)<br>
 <br>
         amdgpu_device_sched_resume(&device_list, NULL, NULL);<br>
         amdgpu_device_gpu_resume(adev, &device_list, false);<br>
+       amdgpu_device_recovery_put_reset_lock(adev, &device_list);<br>
         adev->pcie_reset_ctx.occurs_dpc = false;<br>
 <br>
         if (hive) {<br>
-- <br>
2.34.1<br>
<br>
</div>
</span></font></div>
</div>
</body>
</html>