[PATCH v3 3/3] drm/amdgpu: Implement concurrent asic reset for XGMI.

Mon Dec 3 14:21:30 UTC 2018

Patches 1, 2:

Reviewed-by: Alex Deucher <alexander.deucher at amd.com>


Patch 3:

Acked-by: Alex Deucher <alexander.deucher at amd.com>

________________________________
From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> on behalf of Andrey Grodzovsky <andrey.grodzovsky at amd.com>
Sent: Friday, November 30, 2018 4:41:10 PM
To: amd-gfx at lists.freedesktop.org; Deucher, Alexander; Wang, Ken; ckoenig.leichtzumerken at gmail.com; Xu, Feifei
Cc: Grodzovsky, Andrey
Subject: [PATCH v3 3/3] drm/amdgpu: Implement concurrent asic reset for XGMI.

Use the system high-priority workqueue to concurrently send reset
commands to all nodes in the hive.

v2:
Switch to system_highpri_wq after dropping dedicated queue.
Fix non XGMI code path KASAN error.
Stop the per-node hive reset loop if there
is a reset failure on any of the nodes.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 44 ++++++++++++++++++++++++++----
 2 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index c8ad6bf..6fc023b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -910,7 +910,9 @@ struct amdgpu_device {
         bool                            in_gpu_reset;
         struct mutex  lock_reset;
         struct amdgpu_doorbell_index doorbell_index;
+
         int asic_reset_res;
+       struct work_struct              xgmi_reset_work;
 };

 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index bfd286c..9fd9f63 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2356,6 +2356,19 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
         return amdgpu_device_asic_has_dc_support(adev->asic_type);
 }

+
+static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
+{
+       struct amdgpu_device *adev =
+               container_of(__work, struct amdgpu_device, xgmi_reset_work);
+
+       adev->asic_reset_res =  amdgpu_asic_reset(adev);
+       if (adev->asic_reset_res)
+               DRM_WARN("ASIC reset failed with err r, %d for drm dev, %s",
+                        adev->asic_reset_res, adev->ddev->unique);
+}
+
+
 /**
  * amdgpu_device_init - initialize the driver
  *
@@ -2454,6 +2467,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
                           amdgpu_device_delay_enable_gfx_off);

+       INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
+
         adev->gfx.gfx_off_req_count = 1;
         adev->pm.ac_power = power_supply_is_system_supplied() > 0 ? true : false;

@@ -3331,10 +3346,31 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
          */
         if (need_full_reset) {
                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-                       r = amdgpu_asic_reset(tmp_adev);
-                       if (r)
-                               DRM_WARN("ASIC reset failed with err r, %d for drm dev, %s",
+                       /* For XGMI run all resets in parallel to speed up the process */
+                       if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
+                               if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
+                                       r = -EALREADY;
+                       } else
+                               r = amdgpu_asic_reset(tmp_adev);
+
+                       if (r) {
+                               DRM_ERROR("ASIC reset failed with err r, %d for drm dev, %s",
                                          r, tmp_adev->ddev->unique);
+                               break;
+                       }
+               }
+
+               /* For XGMI wait for all PSP resets to complete before proceed */
+               if (!r) {
+                       list_for_each_entry(tmp_adev, device_list_handle,
+                                           gmc.xgmi.head) {
+                               if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
+                                       flush_work(&tmp_adev->xgmi_reset_work);
+                                       r = tmp_adev->asic_reset_res;
+                                       if (r)
+                                               break;
+                               }
+                       }
                 }
         }

@@ -3521,8 +3557,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                 if (tmp_adev == adev)
                         continue;

-               dev_info(tmp_adev->dev, "GPU reset begin for drm dev %s!\n", adev->ddev->unique);
-
                 amdgpu_device_lock_adev(tmp_adev);
                 r = amdgpu_device_pre_asic_reset(tmp_adev,
                                                  NULL,
--
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx at lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20181203/2fb077c6/attachment-0001.html>