[PATCH] drm/amdgpu: add ACA error query support for umc_v12_0

Yang Wang kevinyang.wang at amd.com
Wed Apr 17 03:10:03 UTC 2024


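Add ACA error query support for umc_v12_0.

When ACA is enabled, query correctable (CE), uncorrectable (UE) and
deferred errors for the UMC block through the ACA path; otherwise keep
using the existing MCA SMU path for CE and UE. To allow this,
amdgpu_aca_log_ras_error_data() is made non-static and declared in
amdgpu_ras.h.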

Signed-off-by: Yang Wang <kevinyang.wang at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c |  6 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  4 ++++
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c  | 18 ++++++++++++++----
 3 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 352ce16a0963..46b7f0c5cd8a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1268,9 +1268,9 @@ int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk)
 	return 0;
 }
 
-static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
-					 enum aca_error_type type, struct ras_err_data *err_data,
-					 struct ras_query_context *qctx)
+int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
+				  enum aca_error_type type, struct ras_err_data *err_data,
+				  struct ras_query_context *qctx)
 {
 	struct ras_manager *obj;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 8d26989c75c8..487548879c49 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -898,6 +898,10 @@ int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk)
 ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *attr,
 				  struct aca_handle *handle, char *buf, void *data);
 
+int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
+				  enum aca_error_type type, struct ras_err_data *err_data,
+				  struct ras_query_context *qctx);
+
 void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info,
 			struct ras_err_addr *err_addr);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index f69871902233..9f2c46814a4f 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -317,16 +317,26 @@ static int umc_v12_0_err_cnt_init_per_channel(struct amdgpu_device *adev,
 static void umc_v12_0_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
 					void *ras_error_status)
 {
+	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
 	struct ras_query_context qctx;
 
 	memset(&qctx, 0, sizeof(qctx));
 	qctx.event_id = amdgpu_ras_acquire_event_id(adev, amdgpu_ras_intr_triggered() ?
 						    RAS_EVENT_TYPE_ISR : RAS_EVENT_TYPE_INVALID);
 
-	amdgpu_mca_smu_log_ras_error(adev,
-		AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_CE, ras_error_status, &qctx);
-	amdgpu_mca_smu_log_ras_error(adev,
-		AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_UE, ras_error_status, &qctx);
+	if (amdgpu_aca_is_enabled(adev)) {
+		amdgpu_aca_log_ras_error_data(adev, AMDGPU_RAS_BLOCK__UMC, ACA_ERROR_TYPE_CE,
+					      err_data, &qctx);
+		amdgpu_aca_log_ras_error_data(adev, AMDGPU_RAS_BLOCK__UMC, ACA_ERROR_TYPE_UE,
+					      err_data, &qctx);
+		amdgpu_aca_log_ras_error_data(adev, AMDGPU_RAS_BLOCK__UMC, ACA_ERROR_TYPE_DEFERRED,
+					      err_data, &qctx);
+	} else {
+		amdgpu_mca_smu_log_ras_error(adev, AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_CE,
+					     err_data, &qctx);
+		amdgpu_mca_smu_log_ras_error(adev, AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_UE,
+					     err_data, &qctx);
+	}
 }
 
 static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *adev,
-- 
2.34.1



More information about the amd-gfx mailing list