[PATCH 3/4] drm/amdgpu: add RAS poison handling for MCA

Tao Zhou tao.zhou1 at amd.com
Tue Oct 18 08:31:59 UTC 2022


For MCA poison, if unmap queue fails, gpu reset should be triggered even
no error count is queried.

Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 27 +++++++++++++++----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index e97b1bd343ee..22995f371eae 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -229,18 +229,23 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
 		bool reset)
 {
 	int ret;
-	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
-	struct ras_common_if head = {
-		.block = AMDGPU_RAS_BLOCK__UMC,
-	};
-	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
 
-	ret =
-		amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset);
-
-	if (ret == AMDGPU_RAS_SUCCESS && obj) {
-		obj->err_data.ue_count += err_data->ue_count;
-		obj->err_data.ce_count += err_data->ce_count;
+	if (adev->gmc.xgmi.connected_to_cpu) {
+		ret = amdgpu_umc_poison_handler_mca(adev, ras_error_status, reset);
+	} else {
+		struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
+		struct ras_common_if head = {
+			.block = AMDGPU_RAS_BLOCK__UMC,
+		};
+		struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
+
+		ret =
+			amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset);
+
+		if (ret == AMDGPU_RAS_SUCCESS && obj) {
+			obj->err_data.ue_count += err_data->ue_count;
+			obj->err_data.ce_count += err_data->ce_count;
+		}
 	}
 
 	return ret;
-- 
2.35.1



More information about the amd-gfx mailing list