<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<style type="text/css" style="display:none;"> P {margin-top:0;margin-bottom:0;} </style>
</head>
<body dir="ltr">
<p style="font-family:Arial;font-size:10pt;color:#008000;margin:15pt;font-style:normal;font-weight:normal;text-decoration:none;" align="Left">
[Public]<br>
</p>
<br>
<div>
<div style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);" class="elementToProof">
Acked-by: Alex Deucher <alexander.deucher@amd.com></div>
<div id="appendonsend"></div>
<hr style="display:inline-block;width:98%" tabindex="-1">
<div id="divRplyFwdMsg" dir="ltr"><font face="Calibri, sans-serif" style="font-size:11pt" color="#000000"><b>From:</b> Christian König <ckoenig.leichtzumerken@gmail.com><br>
<b>Sent:</b> Wednesday, January 10, 2024 9:31 AM<br>
<b>To:</b> Lazar, Lijo <Lijo.Lazar@amd.com>; Chai, Thomas <YiPeng.Chai@amd.com>; Deucher, Alexander <Alexander.Deucher@amd.com>; amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org><br>
<b>Subject:</b> [PATCH] drm/amdgpu: revert "Adjust removal control flow for smu v13_0_2"</font>
<div> </div>
</div>
<div class="BodyFragment"><font size="2"><span style="font-size:11pt;">
<div class="PlainText">Calling amdgpu_device_ip_resume_phase1() during shutdown leaves the<br>
HW in an active state and is an unbalanced use of the IP callbacks.<br>
<br>
Using the IP callbacks like this can lead to memory leaks, double<br>
free and imbalanced reference counters.<br>
<br>
Leaving the HW in an active state can lead to DMA accesses to memory now<br>
freed by the driver.<br>
<br>
Both is a complete no-go for driver unload so completely revert the<br>
workaround for now.<br>
<br>
This reverts commit f5c7e7797060255dbc8160734ccc5ad6183c5e04.<br>
<br>
Signed-off-by: Christian König <christian.koenig@amd.com><br>
---<br>
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 32 +---------------------<br>
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    | 32 ----------------------<br>
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  |  1 -<br>
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  1 -<br>
 4 files changed, 1 insertion(+), 65 deletions(-)<br>
<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
index a39c9fea55c4..313316009039 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c<br>
@@ -5232,7 +5232,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,<br>
         struct amdgpu_device *tmp_adev = NULL;<br>
         bool need_full_reset, skip_hw_reset, vram_lost = false;<br>
         int r = 0;<br>
-       bool gpu_reset_for_dev_remove = 0;<br>
 <br>
         /* Try reset handler method first */<br>
         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,<br>
@@ -5252,10 +5251,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,<br>
                 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);<br>
         skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);<br>
 <br>
-       gpu_reset_for_dev_remove =<br>
-               test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&<br>
-                       test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);<br>
-<br>
         /*<br>
          * ASIC reset has to be done on all XGMI hive nodes ASAP<br>
          * to allow proper links negotiation in FW (within 1 sec)<br>
@@ -5298,18 +5293,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,<br>
                 amdgpu_ras_intr_cleared();<br>
         }<br>
 <br>
-       /* Since the mode1 reset affects base ip blocks, the<br>
-        * phase1 ip blocks need to be resumed. Otherwise there<br>
-        * will be a BIOS signature error and the psp bootloader<br>
-        * can't load kdb on the next amdgpu install.<br>
-        */<br>
-       if (gpu_reset_for_dev_remove) {<br>
-               list_for_each_entry(tmp_adev, device_list_handle, reset_list)<br>
-                       amdgpu_device_ip_resume_phase1(tmp_adev);<br>
-<br>
-               goto end;<br>
-       }<br>
-<br>
         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {<br>
                 if (need_full_reset) {<br>
                         /* post card */<br>
@@ -5543,11 +5526,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,<br>
         int i, r = 0;<br>
         bool need_emergency_restart = false;<br>
         bool audio_suspended = false;<br>
-       bool gpu_reset_for_dev_remove = false;<br>
-<br>
-       gpu_reset_for_dev_remove =<br>
-                       test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&<br>
-                               test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);<br>
 <br>
         /*<br>
          * Special case: RAS triggered and full reset isn't supported<br>
@@ -5585,7 +5563,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,<br>
         if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {<br>
                 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {<br>
                         list_add_tail(&tmp_adev->reset_list, &device_list);<br>
-                       if (gpu_reset_for_dev_remove && adev->shutdown)<br>
+                       if (adev->shutdown)<br>
                                 tmp_adev->shutdown = true;<br>
                 }<br>
                 if (!list_is_first(&adev->reset_list, &device_list))<br>
@@ -5670,10 +5648,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,<br>
 <br>
 retry:  /* Rest of adevs pre asic reset from XGMI hive. */<br>
         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {<br>
-               if (gpu_reset_for_dev_remove) {<br>
-                       /* Workaroud for ASICs need to disable SMC first */<br>
-                       amdgpu_device_smu_fini_early(tmp_adev);<br>
-               }<br>
                 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);<br>
                 /*TODO Should we stop ?*/<br>
                 if (r) {<br>
@@ -5705,9 +5679,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,<br>
                 r = amdgpu_do_asic_reset(device_list_handle, reset_context);<br>
                 if (r && r == -EAGAIN)<br>
                         goto retry;<br>
-<br>
-               if (!r && gpu_reset_for_dev_remove)<br>
-                       goto recover_end;<br>
         }<br>
 <br>
 skip_hw_reset:<br>
@@ -5763,7 +5734,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,<br>
                 amdgpu_ras_set_error_query_ready(tmp_adev, true);<br>
         }<br>
 <br>
-recover_end:<br>
         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,<br>
                                             reset_list);<br>
         amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c<br>
index 852cec98ff26..b7de48effe7f 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c<br>
@@ -2336,38 +2336,6 @@ amdgpu_pci_remove(struct pci_dev *pdev)<br>
                 pm_runtime_forbid(dev->dev);<br>
         }<br>
 <br>
-       if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 2) &&<br>
-           !amdgpu_sriov_vf(adev)) {<br>
-               bool need_to_reset_gpu = false;<br>
-<br>
-               if (adev->gmc.xgmi.num_physical_nodes > 1) {<br>
-                       struct amdgpu_hive_info *hive;<br>
-<br>
-                       hive = amdgpu_get_xgmi_hive(adev);<br>
-                       if (hive->device_remove_count == 0)<br>
-                               need_to_reset_gpu = true;<br>
-                       hive->device_remove_count++;<br>
-                       amdgpu_put_xgmi_hive(hive);<br>
-               } else {<br>
-                       need_to_reset_gpu = true;<br>
-               }<br>
-<br>
-               /* Workaround for ASICs need to reset SMU.<br>
-                * Called only when the first device is removed.<br>
-                */<br>
-               if (need_to_reset_gpu) {<br>
-                       struct amdgpu_reset_context reset_context;<br>
-<br>
-                       adev->shutdown = true;<br>
-                       memset(&reset_context, 0, sizeof(reset_context));<br>
-                       reset_context.method = AMD_RESET_METHOD_NONE;<br>
-                       reset_context.reset_req_dev = adev;<br>
-                       set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);<br>
-                       set_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context.flags);<br>
-                       amdgpu_device_gpu_recover(adev, NULL, &reset_context);<br>
-               }<br>
-       }<br>
-<br>
         amdgpu_driver_unload_kms(dev);<br>
 <br>
         /*<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h<br>
index b0335a1c5e90..19899f6b9b2b 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h<br>
@@ -32,7 +32,6 @@ enum AMDGPU_RESET_FLAGS {<br>
 <br>
         AMDGPU_NEED_FULL_RESET = 0,<br>
         AMDGPU_SKIP_HW_RESET = 1,<br>
-       AMDGPU_RESET_FOR_DEVICE_REMOVE = 2,<br>
 };<br>
 <br>
 struct amdgpu_reset_context {<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h<br>
index 6cab882e8061..1592c63b3099 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h<br>
@@ -43,7 +43,6 @@ struct amdgpu_hive_info {<br>
         } pstate;<br>
 <br>
         struct amdgpu_reset_domain *reset_domain;<br>
-       uint32_t device_remove_count;<br>
         atomic_t ras_recovery;<br>
 };<br>
 <br>
-- <br>
2.34.1<br>
<br>
</div>
</span></font></div>
</div>
</body>
</html>