[PATCH] drm/amdgpu: Use driver mode reset for data poison handling

Deucher, Alexander Alexander.Deucher at amd.com
Wed Apr 17 18:59:53 UTC 2024


[Public]

Acked-by: Alex Deucher <alexander.deucher at amd.com>
________________________________
From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> on behalf of Hawking Zhang <Hawking.Zhang at amd.com>
Sent: Tuesday, April 16, 2024 1:56 AM
To: amd-gfx at lists.freedesktop.org <amd-gfx at lists.freedesktop.org>; Zhou1, Tao <Tao.Zhou1 at amd.com>
Cc: Zhang, Hawking <Hawking.Zhang at amd.com>
Subject: [PATCH] drm/amdgpu: Use driver mode reset for data poison handling

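Mode-2 reset is the only reliable method that can get
GC/SDMA back when poison is consumed, while MMHUB requires
mode-1 reset. Drop the queue eviction attempt in the poison
consumption handler and always request the corresponding
GPU reset instead.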

Signed-off-by: Hawking Zhang <Hawking.Zhang at amd.com>
---
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   | 22 +++----------------
 1 file changed, 3 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index c368c70df3f4a..94eb2493103ef 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -144,7 +144,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
                                 uint16_t pasid, uint16_t client_id)
 {
         enum amdgpu_ras_block block = 0;
-       int old_poison, ret = -EINVAL;
+       int old_poison;
         uint32_t reset = 0;
         struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);

@@ -163,17 +163,13 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
         case SOC15_IH_CLIENTID_SE2SH:
         case SOC15_IH_CLIENTID_SE3SH:
         case SOC15_IH_CLIENTID_UTCL2:
-               ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
                 block = AMDGPU_RAS_BLOCK__GFX;
-               if (ret)
-                       reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+               reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
                 break;
         case SOC15_IH_CLIENTID_VMC:
         case SOC15_IH_CLIENTID_VMC1:
-               ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
                 block = AMDGPU_RAS_BLOCK__MMHUB;
-               if (ret)
-                       reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+               reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
                 break;
         case SOC15_IH_CLIENTID_SDMA0:
         case SOC15_IH_CLIENTID_SDMA1:
@@ -189,18 +185,6 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,

         kfd_signal_poison_consumed_event(dev, pasid);

-       /* resetting queue passes, do page retirement without gpu reset
-        * resetting queue fails, fallback to gpu reset solution
-        */
-       if (!ret)
-               dev_warn(dev->adev->dev,
-                       "RAS poison consumption, unmap queue flow succeeded: client id %d\n",
-                       client_id);
-       else
-               dev_warn(dev->adev->dev,
-                       "RAS poison consumption, fall back to gpu reset flow: client id %d\n",
-                       client_id);
-
         amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
 }

--
2.17.1

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20240417/dd9a59ee/attachment.htm>


More information about the amd-gfx mailing list