<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=us-ascii">
<style type="text/css" style="display:none;"> P {margin-top:0;margin-bottom:0;} </style>
</head>
<body dir="ltr">
<p style="font-family:Arial;font-size:10pt;color:#008000;margin:15pt;font-style:normal;font-weight:normal;text-decoration:none;" align="Left">
[Public]<br>
</p>
<br>
<div>
<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">
Acked-by: Alex Deucher &lt;alexander.deucher@amd.com&gt;</div>
<div id="appendonsend"></div>
<hr style="display:inline-block;width:98%" tabindex="-1">
<div id="divRplyFwdMsg" dir="ltr"><font face="Calibri, sans-serif" style="font-size:11pt" color="#000000"><b>From:</b> amd-gfx &lt;amd-gfx-bounces@lists.freedesktop.org&gt; on behalf of Hawking Zhang &lt;Hawking.Zhang@amd.com&gt;<br>
<b>Sent:</b> Tuesday, April 16, 2024 1:56 AM<br>
<b>To:</b> amd-gfx@lists.freedesktop.org &lt;amd-gfx@lists.freedesktop.org&gt;; Zhou1, Tao &lt;Tao.Zhou1@amd.com&gt;<br>
<b>Cc:</b> Zhang, Hawking &lt;Hawking.Zhang@amd.com&gt;<br>
<b>Subject:</b> [PATCH] drm/amdgpu: Use driver mode reset for data poison handling</font>
<div> </div>
</div>
<div class="BodyFragment"><font size="2"><span style="font-size:11pt;">
<div class="PlainText">mode-2 reset is the only reliable method that can get<br>
GC/SDMA back when poison is consumed. mmhub requires<br>
mode-1 reset.<br>
<br>
Signed-off-by: Hawking Zhang &lt;Hawking.Zhang@amd.com&gt;<br>
---<br>
.../gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 22 +++----------------<br>
1 file changed, 3 insertions(+), 19 deletions(-)<br>
<br>
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c<br>
index c368c70df3f4a..94eb2493103ef 100644<br>
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c<br>
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c<br>
@@ -144,7 +144,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,<br>
uint16_t pasid, uint16_t client_id)<br>
{<br>
enum amdgpu_ras_block block = 0;<br>
- int old_poison, ret = -EINVAL;<br>
+ int old_poison;<br>
uint32_t reset = 0;<br>
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);<br>
<br>
@@ -163,17 +163,13 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,<br>
case SOC15_IH_CLIENTID_SE2SH:<br>
case SOC15_IH_CLIENTID_SE3SH:<br>
case SOC15_IH_CLIENTID_UTCL2:<br>
- ret = kfd_dqm_evict_pasid(dev->dqm, pasid);<br>
block = AMDGPU_RAS_BLOCK__GFX;<br>
- if (ret)<br>
- reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;<br>
+ reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;<br>
break;<br>
case SOC15_IH_CLIENTID_VMC:<br>
case SOC15_IH_CLIENTID_VMC1:<br>
- ret = kfd_dqm_evict_pasid(dev->dqm, pasid);<br>
block = AMDGPU_RAS_BLOCK__MMHUB;<br>
- if (ret)<br>
- reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;<br>
+ reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;<br>
break;<br>
case SOC15_IH_CLIENTID_SDMA0:<br>
case SOC15_IH_CLIENTID_SDMA1:<br>
@@ -189,18 +185,6 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,<br>
<br>
kfd_signal_poison_consumed_event(dev, pasid);<br>
<br>
- /* resetting queue passes, do page retirement without gpu reset<br>
- * resetting queue fails, fallback to gpu reset solution<br>
- */<br>
- if (!ret)<br>
- dev_warn(dev->adev->dev,<br>
- "RAS poison consumption, unmap queue flow succeeded: client id %d\n",<br>
- client_id);<br>
- else<br>
- dev_warn(dev->adev->dev,<br>
- "RAS poison consumption, fall back to gpu reset flow: client id %d\n",<br>
- client_id);<br>
-<br>
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);<br>
}<br>
<br>
-- <br>
2.17.1<br>
<br>
</div>
</span></font></div>
</div>
</body>
</html>