<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=us-ascii">
<style type="text/css" style="display:none;"><!-- P {margin-top:0;margin-bottom:0;} --></style>
</head>
<body dir="ltr">
<div id="divtagdefaultwrapper" style="font-size:12pt;color:#000000;font-family:Calibri,Helvetica,sans-serif;" dir="ltr">
<p style="margin-top:0;margin-bottom:0">Patches 1, 2:</p>
<p style="margin-top:0;margin-bottom:0">Reviewed-by: Alex Deucher &lt;alexander.deucher@amd.com&gt;</p>
<p style="margin-top:0;margin-bottom:0"><br>
</p>
<p style="margin-top:0;margin-bottom:0">Patch 3:</p>
<p style="margin-top:0;margin-bottom:0">Acked-by: Alex Deucher &lt;alexander.deucher@amd.com&gt;<br>
</p>
</div>
<hr style="display:inline-block;width:98%" tabindex="-1">
<div id="divRplyFwdMsg" dir="ltr"><font face="Calibri, sans-serif" style="font-size:11pt" color="#000000"><b>From:</b> amd-gfx &lt;amd-gfx-bounces@lists.freedesktop.org&gt; on behalf of Andrey Grodzovsky &lt;andrey.grodzovsky@amd.com&gt;<br>
<b>Sent:</b> Friday, November 30, 2018 4:41:10 PM<br>
<b>To:</b> amd-gfx@lists.freedesktop.org; Deucher, Alexander; Wang, Ken; ckoenig.leichtzumerken@gmail.com; Xu, Feifei<br>
<b>Cc:</b> Grodzovsky, Andrey<br>
<b>Subject:</b> [PATCH v3 3/3] drm/amdgpu: Implement concurrent asic reset for XGMI.</font>
<div> </div>
</div>
<div class="BodyFragment"><font size="2"><span style="font-size:11pt;">
<div class="PlainText">Use per hive wq to concurrently send reset commands to all nodes<br>
in the hive.<br>
<br>
v2:<br>
Switch to system_highpri_wq after dropping dedicated queue.<br>
Fix non XGMI code path KASAN error.<br>
Stop  the hive reset for each node loop if there<br>
is a reset failure on any of the nodes.<br>
<br>
Signed-off-by: Andrey Grodzovsky &lt;andrey.grodzovsky@amd.com&gt;<br>
---<br>
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 ++<br>
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 44 ++++++++++++++++++++++++++----<br>
 2 files changed, 41 insertions(+), 5 deletions(-)<br>
<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h<br>
index c8ad6bf..6fc023b 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h<br>
@@ -910,7 +910,9 @@ struct amdgpu_device {<br>
         bool                            in_gpu_reset;<br>
         struct mutex  lock_reset;<br>
         struct amdgpu_doorbell_index doorbell_index;<br>
+<br>
         int asic_reset_res;<br>
+       struct work_struct              xgmi_reset_work;<br>
 };<br>
 <br>
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
index bfd286c..9fd9f63 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
@@ -2356,6 +2356,19 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)<br>
         return amdgpu_device_asic_has_dc_support(adev->asic_type);<br>
 }<br>
 <br>
+<br>
+static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)<br>
+{<br>
+       struct amdgpu_device *adev =<br>
+               container_of(__work, struct amdgpu_device, xgmi_reset_work);<br>
+<br>
+       adev->asic_reset_res =  amdgpu_asic_reset(adev);<br>
+       if (adev->asic_reset_res)<br>
+               DRM_WARN("ASIC reset failed with err r, %d for drm dev, %s",<br>
+                        adev->asic_reset_res, adev->ddev->unique);<br>
+}<br>
+<br>
+<br>
 /**<br>
  * amdgpu_device_init - initialize the driver<br>
  *<br>
@@ -2454,6 +2467,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,<br>
         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,<br>
                           amdgpu_device_delay_enable_gfx_off);<br>
 <br>
+       INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);<br>
+<br>
         adev->gfx.gfx_off_req_count = 1;<br>
         adev->pm.ac_power = power_supply_is_system_supplied() > 0 ? true : false;<br>
 <br>
@@ -3331,10 +3346,31 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,<br>
          */<br>
         if (need_full_reset) {<br>
                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {<br>
-                       r = amdgpu_asic_reset(tmp_adev);<br>
-                       if (r)<br>
-                               DRM_WARN("ASIC reset failed with err r, %d for drm dev, %s",<br>
+                       /* For XGMI run all resets in parallel to speed up the process */<br>
+                       if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {<br>
+                               if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))<br>
+                                       r = -EALREADY;<br>
+                       } else<br>
+                               r = amdgpu_asic_reset(tmp_adev);<br>
+<br>
+                       if (r) {<br>
+                               DRM_ERROR("ASIC reset failed with err r, %d for drm dev, %s",<br>
                                          r, tmp_adev->ddev->unique);<br>
+                               break;<br>
+                       }<br>
+               }<br>
+<br>
+               /* For XGMI wait for all PSP resets to complete before proceed */<br>
+               if (!r) {<br>
+                       list_for_each_entry(tmp_adev, device_list_handle,<br>
+                                           gmc.xgmi.head) {<br>
+                               if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {<br>
+                                       flush_work(&tmp_adev->xgmi_reset_work);<br>
+                                       r = tmp_adev->asic_reset_res;<br>
+                                       if (r)<br>
+                                               break;<br>
+                               }<br>
+                       }<br>
                 }<br>
         }<br>
 <br>
@@ -3521,8 +3557,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,<br>
                 if (tmp_adev == adev)<br>
                         continue;<br>
 <br>
-               dev_info(tmp_adev->dev, "GPU reset begin for drm dev %s!\n", adev->ddev->unique);<br>
-<br>
                 amdgpu_device_lock_adev(tmp_adev);<br>
                 r = amdgpu_device_pre_asic_reset(tmp_adev,<br>
                                                  NULL,<br>
-- <br>
2.7.4<br>
<br>
_______________________________________________<br>
amd-gfx mailing list<br>
amd-gfx@lists.freedesktop.org<br>
<a href="https://lists.freedesktop.org/mailman/listinfo/amd-gfx">https://lists.freedesktop.org/mailman/listinfo/amd-gfx</a><br>
</div>
</span></font></div>
</body>
</html>