<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=us-ascii">
<style type="text/css" style="display:none;"><!-- P {margin-top:0;margin-bottom:0;} --></style>
</head>
<body dir="ltr">
<div id="divtagdefaultwrapper" style="font-size:12pt;color:#000000;font-family:Calibri,Helvetica,sans-serif;" dir="ltr">
<p style="margin-top:0;margin-bottom:0">Patches 1, 2:</p>
<p style="margin-top:0;margin-bottom:0">Reviewed-by: Alex Deucher &lt;alexander.deucher@amd.com&gt;</p>
<p style="margin-top:0;margin-bottom:0"><br>
</p>
<p style="margin-top:0;margin-bottom:0">Patch 3:</p>
<p style="margin-top:0;margin-bottom:0">Acked-by: Alex Deucher &lt;alexander.deucher@amd.com&gt;<br>
</p>
</div>
<hr style="display:inline-block;width:98%" tabindex="-1">
<div id="divRplyFwdMsg" dir="ltr"><font face="Calibri, sans-serif" style="font-size:11pt" color="#000000"><b>From:</b> amd-gfx &lt;amd-gfx-bounces@lists.freedesktop.org&gt; on behalf of Andrey Grodzovsky &lt;andrey.grodzovsky@amd.com&gt;<br>
<b>Sent:</b> Friday, November 30, 2018 4:41:10 PM<br>
<b>To:</b> amd-gfx@lists.freedesktop.org; Deucher, Alexander; Wang, Ken; ckoenig.leichtzumerken@gmail.com; Xu, Feifei<br>
<b>Cc:</b> Grodzovsky, Andrey<br>
<b>Subject:</b> [PATCH v3 3/3] drm/amdgpu: Implement concurrent asic reset for XGMI.</font>
<div> </div>
</div>
<div class="BodyFragment"><font size="2"><span style="font-size:11pt;">
<div class="PlainText">Use per hive wq to concurrently send reset commands to all nodes<br>
in the hive.<br>
<br>
v2:<br>
Switch to system_highpri_wq after dropping dedicated queue.<br>
Fix non XGMI code path KASAN error.<br>
Stop the per-node hive reset loop as soon as<br>
the reset fails on any of the nodes.<br>
<br>
Signed-off-by: Andrey Grodzovsky &lt;andrey.grodzovsky@amd.com&gt;<br>
---<br>
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 ++<br>
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 44 ++++++++++++++++++++++++++----<br>
2 files changed, 41 insertions(+), 5 deletions(-)<br>
<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h<br>
index c8ad6bf..6fc023b 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h<br>
@@ -910,7 +910,9 @@ struct amdgpu_device {<br>
bool in_gpu_reset;<br>
struct mutex lock_reset;<br>
struct amdgpu_doorbell_index doorbell_index;<br>
+<br>
int asic_reset_res;<br>
+ struct work_struct xgmi_reset_work;<br>
};<br>
<br>
static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
index bfd286c..9fd9f63 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
@@ -2356,6 +2356,19 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)<br>
return amdgpu_device_asic_has_dc_support(adev->asic_type);<br>
}<br>
<br>
+<br>
+static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)<br>
+{<br>
+ struct amdgpu_device *adev =<br>
+ container_of(__work, struct amdgpu_device, xgmi_reset_work);<br>
+<br>
+ adev->asic_reset_res = amdgpu_asic_reset(adev);<br>
+ if (adev->asic_reset_res)<br>
+ DRM_WARN("ASIC reset failed with err r, %d for drm dev, %s",<br>
+ adev->asic_reset_res, adev->ddev->unique);<br>
+}<br>
+<br>
+<br>
/**<br>
* amdgpu_device_init - initialize the driver<br>
*<br>
@@ -2454,6 +2467,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,<br>
INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,<br>
amdgpu_device_delay_enable_gfx_off);<br>
<br>
+ INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);<br>
+<br>
adev->gfx.gfx_off_req_count = 1;<br>
adev->pm.ac_power = power_supply_is_system_supplied() > 0 ? true : false;<br>
<br>
@@ -3331,10 +3346,31 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,<br>
*/<br>
if (need_full_reset) {<br>
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {<br>
- r = amdgpu_asic_reset(tmp_adev);<br>
- if (r)<br>
- DRM_WARN("ASIC reset failed with err r, %d for drm dev, %s",<br>
+ /* For XGMI run all resets in parallel to speed up the process */<br>
+ if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {<br>
+ if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))<br>
+ r = -EALREADY;<br>
+ } else<br>
+ r = amdgpu_asic_reset(tmp_adev);<br>
+<br>
+ if (r) {<br>
+ DRM_ERROR("ASIC reset failed with err r, %d for drm dev, %s",<br>
r, tmp_adev->ddev->unique);<br>
+ break;<br>
+ }<br>
+ }<br>
+<br>
+ /* For XGMI wait for all PSP resets to complete before proceed */<br>
+ if (!r) {<br>
+ list_for_each_entry(tmp_adev, device_list_handle,<br>
+ gmc.xgmi.head) {<br>
+ if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {<br>
+ flush_work(&tmp_adev->xgmi_reset_work);<br>
+ r = tmp_adev->asic_reset_res;<br>
+ if (r)<br>
+ break;<br>
+ }<br>
+ }<br>
}<br>
}<br>
<br>
@@ -3521,8 +3557,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,<br>
if (tmp_adev == adev)<br>
continue;<br>
<br>
- dev_info(tmp_adev->dev, "GPU reset begin for drm dev %s!\n", adev->ddev->unique);<br>
-<br>
amdgpu_device_lock_adev(tmp_adev);<br>
r = amdgpu_device_pre_asic_reset(tmp_adev,<br>
NULL,<br>
-- <br>
2.7.4<br>
<br>
_______________________________________________<br>
amd-gfx mailing list<br>
amd-gfx@lists.freedesktop.org<br>
<a href="https://lists.freedesktop.org/mailman/listinfo/amd-gfx">https://lists.freedesktop.org/mailman/listinfo/amd-gfx</a><br>
</div>
</span></font></div>
</body>
</html>