<html lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="Generator" content="Microsoft Exchange Server">
<!-- converted from rtf -->
<style><!-- .EmailQuote { margin-left: 1pt; padding-left: 4pt; border-left: #800000 2px solid; } --></style>
</head>
<body>
<font face="Calibri" size="2"><span style="font-size:10pt;">
<div style="padding-right:5pt;padding-left:5pt;"><font color="blue">[AMD Official Use Only - AMD Internal Distribution Only]<br>
</font></div>
<div style="margin-top:5pt;"><font face="Times New Roman" size="3"><span style="font-size:12pt;"><br>
</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">Right. That involves more changes from kfd to amdkfd interface to amdgpu ras interface. And we need to consider re-enabling unmap queue at some point. Let me think more about how to put these together
and make it part of the upcoming ras series. </span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;"> </span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">Regards,<br>
Hawking</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;"> </span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">-----Original Message-----<br>
From: Lazar, Lijo &lt;Lijo.Lazar@amd.com&gt; <br>
Sent: Friday, September 6, 2024 17:47<br>
To: Zhang, Hawking &lt;Hawking.Zhang@amd.com&gt;; amd-gfx@lists.freedesktop.org; Zhou1, Tao &lt;Tao.Zhou1@amd.com&gt;<br>
Subject: Re: [PATCH] drm/amdkfd: Select reset method for poison handling</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;"> </span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;"> </span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;"> </span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">On 9/6/2024 1:42 PM, Hawking Zhang wrote:</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> Driver mode-2 is only supported by relative new smc firmware.</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> </span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> Signed-off-by: Hawking Zhang &lt;<a href="mailto:Hawking.Zhang@amd.com">Hawking.Zhang@amd.com</a>&gt;</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> ---</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 40 +++++++++++++++----</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> 1 file changed, 32 insertions(+), 8 deletions(-)</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> </span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c </span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> index fecdbbab9894..d46a13156ee9 100644</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> @@ -167,11 +167,23 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> case SOC15_IH_CLIENTID_SE3SH:</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> case SOC15_IH_CLIENTID_UTCL2:</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> block = AMDGPU_RAS_BLOCK__GFX;</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> - if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> - amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4))</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> - reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> - else</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) {</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + /* driver mode-2 for gfx poison is only supported by</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + * pmfw 0x00557300 and onwards */</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + if (dev->adev->pm.fw_version &lt; 0x00557300)</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + else</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + } else if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) {</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + /* driver mode-2 for gfx poison is only supported by</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + * pmfw 0x05550C00 and onwards */</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + if (dev->adev->pm.fw_version &lt; 0x05550C00)</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + else</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + } else {</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + }</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;"> </span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">I think it's better to handle this inside amdgpu_ras_do_recovery rather than here.</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;"> </span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">Something like -</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;"> int amdgpu_ras_reset_method_quirk(adev) which returns the right reset method when (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) is set. Or add a few more flags like RAS_SDMA_POISON/RAS_GFX_POISON
and decide the method in amdgpu_ras handling.</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;"> </span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">Thanks,</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">Lijo</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;"> </span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> break;</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> case SOC15_IH_CLIENTID_VMC:</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> case SOC15_IH_CLIENTID_VMC1:</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> @@ -184,11 +196,23 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> case SOC15_IH_CLIENTID_SDMA3:</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> case SOC15_IH_CLIENTID_SDMA4:</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> block = AMDGPU_RAS_BLOCK__SDMA;</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> - if (amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> - amdgpu_ip_version(dev->adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4))</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> - reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> - else</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + if (amdgpu_ip_version(dev->adev, SDMA0_HWIP, 0) == IP_VERSION(4, 4, 2)) {</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + /* driver mode-2 for gfx poison is only supported by</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + * pmfw 0x00557300 and onwards */</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + if (dev->adev->pm.fw_version &lt; 0x00557300)</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + else</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + } else if (amdgpu_ip_version(dev->adev, SDMA0_HWIP, 0) == IP_VERSION(4, 4, 5)) {</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + /* driver mode-2 for gfx poison is only supported by</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + * pmfw 0x05550C00 and onwards */</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + if (dev->adev->pm.fw_version &lt; 0x05550C00)</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + else</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + } else {</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> + }</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> break;</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> default:</span></font></div>
<div><font face="Calibri" size="2"><span style="font-size:11pt;">> dev_warn(dev->adev->dev,</span></font></div>
</span></font>
</body>
</html>