[PATCH 1/2] drm/amdgpu: support RAS error inject for SRIOV

Tao Zhou tao.zhou1 at amd.com
Wed Aug 31 08:39:24 UTC 2022


In SRIOV, a RAS error injection request is sent to the PF via the mailbox,
so the injection input information must also be transferred to the PF.

Generally, error injection is performed directly on the PF side. For the
RAS poison test, however, the workload is launched on the VF side, so the
VF has to pass the injection information to the PF.

Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 26 ++++++++++++++++--------
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c    | 24 ++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h    |  9 ++++++++
 4 files changed, 53 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ab9ba5a9c33d..498642eb5fb7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1103,15 +1103,25 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
 							  block_info.address);
 	}
 
-	if (info->head.block == AMDGPU_RAS_BLOCK__GFX) {
-		if (block_obj->hw_ops->ras_error_inject)
-			ret = block_obj->hw_ops->ras_error_inject(adev, info);
+	if (!amdgpu_sriov_vf(adev)) {
+		if (info->head.block == AMDGPU_RAS_BLOCK__GFX) {
+			if (block_obj->hw_ops->ras_error_inject)
+				ret = block_obj->hw_ops->ras_error_inject(adev, info);
+		} else {
+			/* If defined special ras_error_inject(e.g: xgmi), implement special ras_error_inject */
+			if (block_obj->hw_ops->ras_error_inject)
+				ret = block_obj->hw_ops->ras_error_inject(adev, &block_info);
+			else  /*If not defined .ras_error_inject, use default ras_error_inject*/
+				ret = psp_ras_trigger_error(&adev->psp, &block_info);
+		}
 	} else {
-		/* If defined special ras_error_inject(e.g: xgmi), implement special ras_error_inject */
-		if (block_obj->hw_ops->ras_error_inject)
-			ret = block_obj->hw_ops->ras_error_inject(adev, &block_info);
-		else  /*If not defined .ras_error_inject, use default ras_error_inject*/
-			ret = psp_ras_trigger_error(&adev->psp, &block_info);
+		if (adev->virt.ops && adev->virt.ops->ras_trigger_error) {
+			adev->virt.ops->ras_trigger_error(adev, &block_info);
+			ret = 0;
+		} else {
+			dev_warn(adev->dev,
+				"No ras_trigger_error interface in SRIOV!\n");
+		}
 	}
 
 	if (ret)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 239f232f9c02..4534e6f70a4b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -84,6 +84,8 @@ struct amdgpu_virt_ops {
 	int (*reset_gpu)(struct amdgpu_device *adev);
 	int (*wait_reset)(struct amdgpu_device *adev);
 	void (*trans_msg)(struct amdgpu_device *adev, u32 req, u32 data1, u32 data2, u32 data3);
+	void (*ras_trigger_error)(struct amdgpu_device *adev,
+				struct ta_ras_trigger_error_input *info);
 };
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index a2f04b249132..3b4c5162a237 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -33,6 +33,7 @@
 #include "mxgpu_ai.h"
 
 #include "amdgpu_reset.h"
+#include "ta_ras_if.h"
 
 static void xgpu_ai_mailbox_send_ack(struct amdgpu_device *adev)
 {
@@ -405,6 +406,28 @@ static int xgpu_ai_request_init_data(struct amdgpu_device *adev)
 	return xgpu_ai_send_access_requests(adev, IDH_REQ_GPU_INIT_DATA);
 }
 
+static void amdgpu_virt_ras_trigger_error(struct amdgpu_device *adev,
+					  struct ta_ras_trigger_error_input *info)
+{
+	uint32_t addr_lo = lower_32_bits(info->address);
+	uint32_t addr_hi = upper_32_bits(info->address);
+	uint32_t data1;
+
+	/* Mailbox data1 layout (the address follows as lo/hi dwords):
+	 * block id : bits[0:4], inject_error_type : bits[5:8]
+	 * sub_block_index : bits[9:17], value : bits[18:19]
+	 */
+	data1 = info->block_id & RAS_BLOCK_ID_MASK;
+	data1 |= (info->inject_error_type & RAS_INJECT_ERROR_TYPE_MASK) <<
+			RAS_INJECT_ERROR_TYPE_SHIFT;
+	data1 |= (info->sub_block_index & RAS_SUB_BLOCK_INDEX_MASK) <<
+			RAS_SUB_BLOCK_INDEX_SHIFT;
+	data1 |= (info->value & RAS_VALUE_MASK) << RAS_VALUE_SHIFT;
+
+	xgpu_ai_mailbox_trans_msg(adev, IDH_RAS_ERROR_INJECT, data1,
+				addr_lo, addr_hi);
+}
+
 const struct amdgpu_virt_ops xgpu_ai_virt_ops = {
 	.req_full_gpu	= xgpu_ai_request_full_gpu_access,
 	.rel_full_gpu	= xgpu_ai_release_full_gpu_access,
@@ -412,4 +435,5 @@ const struct amdgpu_virt_ops xgpu_ai_virt_ops = {
 	.wait_reset = NULL,
 	.trans_msg = xgpu_ai_mailbox_trans_msg,
 	.req_init_data  = xgpu_ai_request_init_data,
+	.ras_trigger_error = amdgpu_virt_ras_trigger_error,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
index fa7e13e0459e..0841d6632328 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
@@ -29,6 +29,14 @@
 #define AI_MAILBOX_POLL_FLR_TIMEDOUT	10000
 #define AI_MAILBOX_POLL_MSG_REP_MAX	11
 
+#define RAS_BLOCK_ID_MASK 0x1f	/* block id: 5 bits, bits[0:4] */
+#define RAS_INJECT_ERROR_TYPE_MASK 0xf	/* inject_error_type: 4 bits */
+#define RAS_INJECT_ERROR_TYPE_SHIFT 5	/* inject_error_type: bits[5:8] */
+#define RAS_SUB_BLOCK_INDEX_MASK 0x1ff	/* sub_block_index: 9 bits */
+#define RAS_SUB_BLOCK_INDEX_SHIFT 9	/* sub_block_index: bits[9:17] */
+#define RAS_VALUE_MASK 0x3	/* value: 2 bits */
+#define RAS_VALUE_SHIFT 18	/* value: bits[18:19] */
+
 enum idh_request {
 	IDH_REQ_GPU_INIT_ACCESS = 1,
 	IDH_REL_GPU_INIT_ACCESS,
@@ -39,6 +47,7 @@ enum idh_request {
 
 	IDH_LOG_VF_ERROR       = 200,
 	IDH_READY_TO_RESET 	= 201,
+	IDH_RAS_ERROR_INJECT = 202,
 };
 
 enum idh_event {
-- 
2.35.1



More information about the amd-gfx mailing list