<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<style type="text/css" style="display:none;"> P {margin-top:0;margin-bottom:0;} </style>
</head>
<body dir="ltr">
<p style="font-family:Calibri;font-size:10pt;color:#0000FF;margin:5pt;font-style:normal;font-weight:normal;text-decoration:none;" align="Left">
[AMD Official Use Only - AMD Internal Distribution Only]<br>
</p>
<br>
<div>
<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 11pt; color: rgb(0, 0, 0);">
Signed-off-by: Tony Yi &lt;Tony.Yi@amd.com&gt;</div>
<div id="appendonsend"></div>
<div class="elementToProof"><br>
</div>
<div style="font-family: Calibri, Arial, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">
<br>
</div>
<hr style="display: inline-block; width: 98%;">
<div style="font-family: Calibri, Arial, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">
<b>From:</b> Zhang, Hawking<br>
<b>Sent:</b> Wednesday, April 2, 2025 12:24 AM<br>
<b>To:</b> Skvortsov, Victor; amd-gfx@lists.freedesktop.org<br>
<b>Cc:</b> Luo, Zhigang; Zhou1, Tao; Zhao, Victor; Yi, Tony<br>
<b>Subject:</b> RE: [PATCH v2] drm/amdgpu: Fix CPER error handling on VFs </div>
<div style="font-family: Calibri, Arial, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">
<br>
</div>
<div class="elementToProof" style="font-family: 'Times New Roman'; font-size: 11pt;">
[AMD Official Use Only - AMD Internal Distribution Only]<br>
<br>
Reviewed-by: Hawking Zhang &lt;Hawking.Zhang@amd.com&gt;<br>
<br>
Regards,<br>
Hawking<br>
-----Original Message-----<br>
From: Skvortsov, Victor &lt;Victor.Skvortsov@amd.com&gt;<br>
Sent: Wednesday, April 2, 2025 04:59<br>
To: amd-gfx@lists.freedesktop.org<br>
Cc: Luo, Zhigang &lt;Zhigang.Luo@amd.com&gt;; Zhang, Hawking &lt;Hawking.Zhang@amd.com&gt;; Zhou1, Tao &lt;Tao.Zhou1@amd.com&gt;; Zhao, Victor &lt;Victor.Zhao@amd.com&gt;; Yi, Tony &lt;Tony.Yi@amd.com&gt;; Skvortsov, Victor &lt;Victor.Skvortsov@amd.com&gt;<br>
Subject: [PATCH v2] drm/amdgpu: Fix CPER error handling on VFs<br>
<br>
From: Tony Yi &lt;Tony.Yi@amd.com&gt;<br>
<br>
CPER read will loop infinitely if an error is encountered and the more bit is set. Add error checks to break upon failure.<br>
<br>
v2: added function pointer checks<br>
<br>
Suggested-by: Tony Yi &lt;Tony.Yi@amd.com&gt;<br>
Signed-off-by: Victor Skvortsov &lt;Victor.Skvortsov@amd.com&gt;<br>
---<br>
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 16 ++++++++++++----<br>
1 file changed, 12 insertions(+), 4 deletions(-)<br>
<br>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c<br>
index 0bb8cbe0dcc0..83f3334b3931 100644<br>
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c<br>
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c<br>
@@ -1323,6 +1323,9 @@ static int amdgpu_virt_req_ras_err_count_internal(struct amdgpu_device *adev, bo {<br>
struct amdgpu_virt *virt = &adev->virt;<br>
<br>
+ if (!virt->ops || !virt->ops->req_ras_err_count)<br>
+ return -EOPNOTSUPP;<br>
+<br>
/* Host allows 15 ras telemetry requests per 60 seconds. Afterwhich, the Host<br>
* will ignore incoming guest messages. Ratelimit the guest messages to<br>
* prevent guest self DOS.<br>
@@ -1378,14 +1381,16 @@ amdgpu_virt_write_cpers_to_ring(struct amdgpu_device *adev,<br>
used_size = host_telemetry->header.used_size;<br>
<br>
if (used_size &gt; (AMD_SRIOV_RAS_TELEMETRY_SIZE_KB &lt;&lt; 10))<br>
- return 0;<br>
+ return -EINVAL;<br>
<br>
cper_dump = kmemdup(&host_telemetry->body.cper_dump, used_size, GFP_KERNEL);<br>
if (!cper_dump)<br>
return -ENOMEM;<br>
<br>
- if (checksum != amd_sriov_msg_checksum(cper_dump, used_size, 0, 0))<br>
+ if (checksum != amd_sriov_msg_checksum(cper_dump, used_size, 0, 0)) {<br>
+ ret = -EINVAL;<br>
goto out;<br>
+ }<br>
<br>
*more = cper_dump->more;<br>
<br>
@@ -1425,7 +1430,7 @@ static int amdgpu_virt_req_ras_cper_dump_internal(struct amdgpu_device *adev)<br>
int ret = 0;<br>
uint32_t more = 0;<br>
<br>
- if (!amdgpu_sriov_ras_cper_en(adev))<br>
+ if (!virt->ops || !virt->ops->req_ras_cper_dump)<br>
return -EOPNOTSUPP;<br>
<br>
do {<br>
@@ -1434,7 +1439,7 @@ static int amdgpu_virt_req_ras_cper_dump_internal(struct amdgpu_device *adev)<br>
adev, virt->fw_reserve.ras_telemetry, &more);<br>
else<br>
ret = 0;<br>
- } while (more);<br>
+ } while (more && !ret);<br>
<br>
return ret;<br>
}<br>
@@ -1444,6 +1449,9 @@ int amdgpu_virt_req_ras_cper_dump(struct amdgpu_device *adev, bool force_update)<br>
struct amdgpu_virt *virt = &adev->virt;<br>
int ret = 0;<br>
<br>
+ if (!amdgpu_sriov_ras_cper_en(adev))<br>
+ return -EOPNOTSUPP;<br>
+<br>
if ((__ratelimit(&virt->ras.ras_cper_dump_rs) || force_update) &&<br>
down_read_trylock(&adev->reset_domain->sem)) {<br>
mutex_lock(&virt->ras.ras_telemetry_mutex);<br>
--<br>
2.34.1<br>
<br>
</div>
</div>
</body>
</html>