<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=us-ascii">
</head>
<body>
<p style="font-family:Arial;font-size:10pt;color:#0000FF;margin:5pt;" align="Left">
[AMD Official Use Only - General]<br>
</p>
<br>
<div>
<div>
<div>
<div dir="ltr">
<div dir="ltr">Reviewed-by: Hawking Zhang &lt;Hawking.Zhang@amd.com&gt;</div>
<div dir="ltr"><br>
</div>
<div dir="ltr">Regards,</div>
<div dir="ltr">Hawking</div>
</div>
</div>
<div id="ms-outlook-mobile-signature">
<div><br>
</div>
Get <a href="https://aka.ms/o0ukef">Outlook for iOS</a></div>
</div>
<hr style="display:inline-block;width:98%" tabindex="-1">
<div id="divRplyFwdMsg" dir="ltr"><font face="Calibri, sans-serif" style="font-size:11pt" color="#000000"><b>From:</b> Lazar, Lijo &lt;Lijo.Lazar@amd.com&gt;<br>
<b>Sent:</b> Wednesday, August 3, 2022 7:36:20 PM<br>
<b>To:</b> amd-gfx@lists.freedesktop.org &lt;amd-gfx@lists.freedesktop.org&gt;<br>
<b>Cc:</b> Zhang, Hawking &lt;Hawking.Zhang@amd.com&gt;; Deucher, Alexander &lt;Alexander.Deucher@amd.com&gt;<br>
<b>Subject:</b> [PATCH] drm/amdgpu: Avoid another list of reset devices</font>
<div> </div>
</div>
<div class="BodyFragment"><font size="2"><span style="font-size:11pt;">
<div class="PlainText">A list of devices to be reset is already created in<br>
amdgpu_device_gpu_recover function. Creating another list with the<br>
same nodes is incorrect and not supported in list_head. Instead, pass<br>
the device list as part of reset context.<br>
<br>
Fixes: 9e08564727fc (drm/amdgpu: Refactor mode2 reset logic for v13.0.2)<br>
Signed-off-by: Lijo Lazar &lt;lijo.lazar@amd.com&gt;<br>
---<br>
 drivers/gpu/drm/amd/amdgpu/aldebaran.c     | 45 +++++++---------------<br>
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +<br>
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  |  1 +<br>
 3 files changed, 17 insertions(+), 31 deletions(-)<br>
<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c b/drivers/gpu/drm/amd/amdgpu/aldebaran.c<br>
index c6cc493a5486..2b97b8a96fb4 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c<br>
+++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c<br>
@@ -148,30 +148,22 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,<br>
                               struct amdgpu_reset_context *reset_context)<br>
 {<br>
         struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;<br>
+       struct list_head *reset_device_list = reset_context->reset_device_list;<br>
         struct amdgpu_device *tmp_adev = NULL;<br>
-       struct list_head reset_device_list;<br>
         int r = 0;<br>
 <br>
         dev_dbg(adev->dev, "aldebaran perform hw reset\n");<br>
+<br>
+       if (reset_device_list == NULL)<br>
+               return -EINVAL;<br>
+<br>
         if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2) &&<br>
             reset_context->hive == NULL) {<br>
                 /* Wrong context, return error */<br>
                 return -EINVAL;<br>
         }<br>
 <br>
-       INIT_LIST_HEAD(&reset_device_list);<br>
-       if (reset_context->hive) {<br>
-               list_for_each_entry (tmp_adev,<br>
-                                    &reset_context->hive->device_list,<br>
-                                    gmc.xgmi.head)<br>
-                       list_add_tail(&tmp_adev->reset_list,<br>
-                                     &reset_device_list);<br>
-       } else {<br>
-               list_add_tail(&reset_context->reset_req_dev->reset_list,<br>
-                             &reset_device_list);<br>
-       }<br>
-<br>
-       list_for_each_entry (tmp_adev, &reset_device_list, reset_list) {<br>
+       list_for_each_entry(tmp_adev, reset_device_list, reset_list) {<br>
                 mutex_lock(&tmp_adev->reset_cntl->reset_lock);<br>
                 tmp_adev->reset_cntl->active_reset = AMD_RESET_METHOD_MODE2;<br>
         }<br>
@@ -179,7 +171,7 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,<br>
          * Mode2 reset doesn't need any sync between nodes in XGMI hive, instead launch<br>
          * them together so that they can be completed asynchronously on multiple nodes<br>
          */<br>
-       list_for_each_entry (tmp_adev, &reset_device_list, reset_list) {<br>
+       list_for_each_entry(tmp_adev, reset_device_list, reset_list) {<br>
                 /* For XGMI run all resets in parallel to speed up the process */<br>
                 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {<br>
                         if (!queue_work(system_unbound_wq,<br>
@@ -197,7 +189,7 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,<br>
 <br>
         /* For XGMI wait for all resets to complete before proceed */<br>
         if (!r) {<br>
-               list_for_each_entry (tmp_adev, &reset_device_list, reset_list) {<br>
+               list_for_each_entry(tmp_adev, reset_device_list, reset_list) {<br>
                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {<br>
                                 flush_work(&tmp_adev->reset_cntl->reset_work);<br>
                                 r = tmp_adev->asic_reset_res;<br>
@@ -207,7 +199,7 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,<br>
                 }<br>
         }<br>
 <br>
-       list_for_each_entry (tmp_adev, &reset_device_list, reset_list) {<br>
+       list_for_each_entry(tmp_adev, reset_device_list, reset_list) {<br>
                 mutex_unlock(&tmp_adev->reset_cntl->reset_lock);<br>
                 tmp_adev->reset_cntl->active_reset = AMD_RESET_METHOD_NONE;<br>
         }<br>
@@ -339,10 +331,13 @@ static int<br>
 aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,<br>
                                   struct amdgpu_reset_context *reset_context)<br>
 {<br>
+       struct list_head *reset_device_list = reset_context->reset_device_list;<br>
         struct amdgpu_device *tmp_adev = NULL;<br>
-       struct list_head reset_device_list;<br>
         int r;<br>
 <br>
+       if (reset_device_list == NULL)<br>
+               return -EINVAL;<br>
+<br>
         if (reset_context->reset_req_dev->ip_versions[MP1_HWIP][0] ==<br>
                     IP_VERSION(13, 0, 2) &&<br>
             reset_context->hive == NULL) {<br>
@@ -350,19 +345,7 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,<br>
                 return -EINVAL;<br>
         }<br>
 <br>
-       INIT_LIST_HEAD(&reset_device_list);<br>
-       if (reset_context->hive) {<br>
-               list_for_each_entry (tmp_adev,<br>
-                                    &reset_context->hive->device_list,<br>
-                                    gmc.xgmi.head)<br>
-                       list_add_tail(&tmp_adev->reset_list,<br>
-                                     &reset_device_list);<br>
-       } else {<br>
-               list_add_tail(&reset_context->reset_req_dev->reset_list,<br>
-                             &reset_device_list);<br>
-       }<br>
-<br>
-       list_for_each_entry (tmp_adev, &reset_device_list, reset_list) {<br>
+       list_for_each_entry(tmp_adev, reset_device_list, reset_list) {<br>
                 dev_info(tmp_adev->dev,<br>
                          "GPU reset succeeded, trying to resume\n");<br>
                 r = aldebaran_mode2_restore_ip(tmp_adev);<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
index c4a6fe3070b6..e8a0b19b7398 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
@@ -4742,6 +4742,8 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,<br>
         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,<br>
                                     reset_list);<br>
         amdgpu_reset_reg_dumps(tmp_adev);<br>
+<br>
+       reset_context->reset_device_list = device_list_handle;<br>
         r = amdgpu_reset_perform_reset(tmp_adev, reset_context);<br>
         /* If reset handler not implemented, continue; otherwise return */<br>
         if (r == -ENOSYS)<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h<br>
index 9e55a5d7a825..ffda1560c648 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h<br>
@@ -37,6 +37,7 @@ struct amdgpu_reset_context {<br>
         struct amdgpu_device *reset_req_dev;<br>
         struct amdgpu_job *job;<br>
         struct amdgpu_hive_info *hive;<br>
+       struct list_head *reset_device_list;<br>
         unsigned long flags;<br>
 };<br>
 <br>
-- <br>
2.25.1<br>
<br>
</div>
</span></font></div>
</div>
</body>
</html>