[PATCH 3/7] drm/amdgpu: Support multiple error query modes
Yang Wang
kevinyang.wang at amd.com
Wed Nov 8 12:57:31 UTC 2023
From: Hawking Zhang <Hawking.Zhang at amd.com>
Direct error query mode and firmware error query mode
are supported for now.
Signed-off-by: Hawking Zhang <Hawking.Zhang at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 93 +++++++++++++++++++------
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 8 +++
2 files changed, 78 insertions(+), 23 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index cfb361cd0320..11055e0856bc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1165,13 +1165,53 @@ static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, s
}
}
-/* query/inject/cure begin */
-int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
- struct ras_query_if *info)
+static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
+ struct ras_query_if *info,
+ struct ras_err_data *err_data,
+ unsigned int error_query_mode)
{
+ enum amdgpu_ras_block blk = info ? info->head.block : AMDGPU_RAS_BLOCK_COUNT;
struct amdgpu_ras_block_object *block_obj = NULL;
+
+ if (error_query_mode == AMDGPU_RAS_INVALID_ERROR_QUERY)
+ return -EINVAL;
+
+ if (error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) {
+ if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
+ amdgpu_ras_get_ecc_info(adev, err_data);
+ } else {
+ block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
+ if (!block_obj || !block_obj->hw_ops) {
+ dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
+ get_ras_block_str(&info->head));
+ return -EINVAL;
+ }
+
+ if (block_obj->hw_ops->query_ras_error_count)
+ block_obj->hw_ops->query_ras_error_count(adev, &err_data);
+
+ if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) ||
+ (info->head.block == AMDGPU_RAS_BLOCK__GFX) ||
+ (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) {
+ if (block_obj->hw_ops->query_ras_error_status)
+ block_obj->hw_ops->query_ras_error_status(adev);
+ }
+ }
+ } else {
+ /* FIXME: add code to check return value later */
+ amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data);
+ amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data);
+ }
+
+ return 0;
+}
+
+/* query/inject/cure begin */
+int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info)
+{
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
struct ras_err_data err_data;
+ unsigned int error_query_mode;
int ret;
if (!obj)
@@ -1181,27 +1221,14 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
if (ret)
return ret;
- if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
- amdgpu_ras_get_ecc_info(adev, &err_data);
- } else {
- block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
- if (!block_obj || !block_obj->hw_ops) {
- dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
- get_ras_block_str(&info->head));
- ret = -EINVAL;
- goto out_fini_err_data;
- }
-
- if (block_obj->hw_ops->query_ras_error_count)
- block_obj->hw_ops->query_ras_error_count(adev, &err_data);
+ if (!amdgpu_ras_get_error_query_mode(adev, &error_query_mode))
+ return -EINVAL;
- if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) ||
- (info->head.block == AMDGPU_RAS_BLOCK__GFX) ||
- (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) {
- if (block_obj->hw_ops->query_ras_error_status)
- block_obj->hw_ops->query_ras_error_status(adev);
- }
- }
+ ret = amdgpu_ras_query_error_status_helper(adev, info,
+ &err_data,
+ error_query_mode);
+ if (ret)
+ goto out_fini_err_data;
amdgpu_rasmgr_error_data_statistic_update(obj, &err_data);
@@ -3397,6 +3424,26 @@ bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev)
return true;
}
+bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
+ unsigned int *error_query_mode)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
+
+ if (!con) {
+ *error_query_mode = AMDGPU_RAS_INVALID_ERROR_QUERY;
+ return false;
+ }
+
+ if (mca_funcs && mca_funcs->mca_set_debug_mode)
+ *error_query_mode =
+ (con->is_mca_debug_mode) ? AMDGPU_RAS_DIRECT_ERROR_QUERY : AMDGPU_RAS_FIRMWARE_ERROR_QUERY;
+ else
+ *error_query_mode = AMDGPU_RAS_DIRECT_ERROR_QUERY;
+
+ return true;
+}
+
/* Register each ip ras block into amdgpu ras */
int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
struct amdgpu_ras_block_object *ras_block_obj)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 665414c22ca9..19161916ac46 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -320,6 +320,12 @@ enum amdgpu_ras_ret {
AMDGPU_RAS_PT,
};
+enum amdgpu_ras_error_query_mode {
+ AMDGPU_RAS_INVALID_ERROR_QUERY = 0,
+ AMDGPU_RAS_DIRECT_ERROR_QUERY = 1,
+ AMDGPU_RAS_FIRMWARE_ERROR_QUERY = 2,
+};
+
/* ras error status reisger fields */
#define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG__SHIFT 0x0
#define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG_MASK 0x00000001L
@@ -769,6 +775,8 @@ int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_co
void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev);
+bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
+ unsigned int *mode);
int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
struct amdgpu_ras_block_object *ras_block_obj);
--
2.34.1
More information about the amd-gfx
mailing list