[PATCH 25/26] drm/amdgpu: support gfx ras error injection and err_cnt query

Alex Deucher alexdeucher at gmail.com
Wed Jul 31 17:58:17 UTC 2019


From: Dennis Li <Dennis.Li at amd.com>

check gfx error count in both ras querry function and
ras interrupt handler.

gfx ras is still disabled by default due to known stability
issue found in gpu reset.

Signed-off-by: Dennis Li <Dennis.Li at amd.com>
Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 19 ++++++++++++++++---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   |  2 ++
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ccd5863bca88..a96b0f17c619 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -600,6 +600,10 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
 		if (adev->umc.funcs->query_ras_error_count)
 			adev->umc.funcs->query_ras_error_count(adev, &err_data);
 		break;
+	case AMDGPU_RAS_BLOCK__GFX:
+		if (adev->gfx.funcs->query_ras_error_count)
+			adev->gfx.funcs->query_ras_error_count(adev, &err_data);
+		break;
 	default:
 		break;
 	}
@@ -637,13 +641,22 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
 	if (!obj)
 		return -EINVAL;
 
-	if (block_info.block_id != TA_RAS_BLOCK__UMC) {
+	switch (info->head.block) {
+	case AMDGPU_RAS_BLOCK__GFX:
+		if (adev->gfx.funcs->ras_error_inject)
+			ret = adev->gfx.funcs->ras_error_inject(adev, info);
+		else
+			ret = -EINVAL;
+		break;
+	case AMDGPU_RAS_BLOCK__UMC:
+		ret = psp_ras_trigger_error(&adev->psp, &block_info);
+		break;
+	default:
 		DRM_INFO("%s error injection is not supported yet\n",
 			 ras_block_str(info->head.block));
-		return -EINVAL;
+		ret = -EINVAL;
 	}
 
-	ret = psp_ras_trigger_error(&adev->psp, &block_info);
 	if (ret)
 		DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n",
 				ras_block_str(info->head.block),
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index d7902e782be4..206176710f79 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -5607,6 +5607,8 @@ static int gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
 {
 	/* TODO ue will trigger an interrupt. */
 	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+	if (adev->gfx.funcs->query_ras_error_count)
+		adev->gfx.funcs->query_ras_error_count(adev, err_data);
 	amdgpu_ras_reset_gpu(adev, 0);
 	return AMDGPU_RAS_UE;
 }
-- 
2.20.1



More information about the amd-gfx mailing list