[PATCH 05/14] drm/amdgpu: add amdgpu ras aca query interface
Zhang, Hawking
Hawking.Zhang at amd.com
Wed Jan 3 12:00:09 UTC 2024
[AMD Official Use Only - General]
I assume we are leveraging error_query_mode to differentiate aca path from legacy ras path, right?
But given in-band error reporting is just the start of transition from legacy ras to aca, do we need a flag in amdgpu_aca to indicate whether aca is supported or not? Accordingly, we can initialize the flag in amdgpu_ras_check_supported. it should help us to differentiate aca from legacy ras when implementing other features, thoughts?
Regards,
Hawking
-----Original Message-----
From: Wang, Yang(Kevin) <KevinYang.Wang at amd.com>
Sent: Wednesday, January 3, 2024 16:02
To: amd-gfx at lists.freedesktop.org
Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>; Chai, Thomas <YiPeng.Chai at amd.com>; Wang, Yang(Kevin) <KevinYang.Wang at amd.com>
Subject: [PATCH 05/14] drm/amdgpu: add amdgpu ras aca query interface
use new ACA error query interface to instead of legacy MCA query.
Signed-off-by: Yang Wang <kevinyang.wang at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 88 ++++++++++++++++++++----- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 12 +++-
2 files changed, 79 insertions(+), 21 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 038bd1b17cef..bbae41f86e00 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1168,6 +1168,53 @@ static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, s
}
}
+static struct ras_manager *get_ras_manager(struct amdgpu_device *adev,
+enum amdgpu_ras_block blk) {
+ struct ras_common_if head;
+
+ memset(&head, 0, sizeof(head));
+ head.block = blk;
+
+ return amdgpu_ras_find_obj(adev, &head); }
+
+int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
+ const struct aca_info *aca_info, void *data) {
+ struct ras_manager *obj;
+
+ obj = get_ras_manager(adev, blk);
+ if (!obj)
+ return -EINVAL;
+
+ return amdgpu_aca_add_handle(adev, &obj->aca_handle,
+ras_block_str(blk), aca_info, data); }
+
+int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum
+amdgpu_ras_block blk) {
+ struct ras_manager *obj;
+
+ obj = get_ras_manager(adev, blk);
+ if (!obj)
+ return -EINVAL;
+
+ amdgpu_aca_remove_handle(&obj->aca_handle);
+
+ return 0;
+}
+
+static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
+ enum aca_error_type type, struct ras_err_data *err_data) {
+ struct ras_manager *obj;
+
+ obj = get_ras_manager(adev, blk);
+ if (!obj)
+ return -EINVAL;
+
+ return amdgpu_aca_get_error_data(adev, &obj->aca_handle, type,
+err_data); }
+
static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
struct ras_query_if *info,
struct ras_err_data *err_data,
@@ -1175,6 +1222,7 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev, {
enum amdgpu_ras_block blk = info ? info->head.block : AMDGPU_RAS_BLOCK_COUNT;
struct amdgpu_ras_block_object *block_obj = NULL;
+ int ret;
if (blk == AMDGPU_RAS_BLOCK_COUNT)
return -EINVAL;
@@ -1204,9 +1252,13 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
}
}
} else {
- /* FIXME: add code to check return value later */
- amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data);
- amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data);
+ ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_UE, err_data);
+ if (ret)
+ return ret;
+
+ ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_CE, err_data);
+ if (ret)
+ return ret;
}
return 0;
@@ -1254,7 +1306,7 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, {
struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0);
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
- const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
+ const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs;
struct amdgpu_hive_info *hive;
int hive_ras_recovery = 0;
@@ -1265,7 +1317,7 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
}
if (!amdgpu_ras_is_supported(adev, block) ||
- !amdgpu_ras_get_mca_debug_mode(adev))
+ !amdgpu_ras_get_aca_debug_mode(adev))
return -EOPNOTSUPP;
hive = amdgpu_get_xgmi_hive(adev);
@@ -1277,7 +1329,7 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
/* skip ras error reset in gpu reset */
if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery) ||
hive_ras_recovery) &&
- mca_funcs && mca_funcs->mca_set_debug_mode)
+ smu_funcs && smu_funcs->set_debug_mode)
return -EOPNOTSUPP;
if (block_obj->hw_ops->reset_ras_error_count)
@@ -1773,7 +1825,7 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
}
}
- amdgpu_mca_smu_debugfs_init(adev, dir);
+ amdgpu_aca_smu_debugfs_init(adev, dir);
}
/* debugfs end */
@@ -3138,8 +3190,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
if (amdgpu_sriov_vf(adev))
return 0;
- /* enable MCA debug on APU device */
- amdgpu_ras_set_mca_debug_mode(adev, !!(adev->flags & AMD_IS_APU));
+ /* enable ACA debug on APU device */
+ amdgpu_ras_set_aca_debug_mode(adev, !!(adev->flags & AMD_IS_APU));
list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
if (!node->ras_obj) {
@@ -3422,7 +3474,7 @@ int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable)
if (con) {
ret = amdgpu_mca_smu_set_debug_mode(adev, enable);
if (!ret)
- con->is_mca_debug_mode = enable;
+ con->is_aca_debug_mode = enable;
}
return ret;
@@ -3436,22 +3488,22 @@ int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool enable)
if (con) {
ret = amdgpu_aca_smu_set_debug_mode(adev, enable);
if (!ret)
- con->is_mca_debug_mode = enable;
+ con->is_aca_debug_mode = enable;
}
return ret;
}
-bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev)
+bool amdgpu_ras_get_aca_debug_mode(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
+ const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs;
if (!con)
return false;
- if (mca_funcs && mca_funcs->mca_set_debug_mode)
- return con->is_mca_debug_mode;
+ if (smu_funcs && smu_funcs->set_debug_mode)
+ return con->is_aca_debug_mode;
else
return true;
}
@@ -3460,16 +3512,16 @@ bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
unsigned int *error_query_mode) {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
+ const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs;
if (!con) {
*error_query_mode = AMDGPU_RAS_INVALID_ERROR_QUERY;
return false;
}
- if (mca_funcs && mca_funcs->mca_set_debug_mode)
+ if (smu_funcs && smu_funcs->set_debug_mode)
*error_query_mode =
- (con->is_mca_debug_mode) ? AMDGPU_RAS_DIRECT_ERROR_QUERY : AMDGPU_RAS_FIRMWARE_ERROR_QUERY;
+ (con->is_aca_debug_mode) ? AMDGPU_RAS_DIRECT_ERROR_QUERY :
+AMDGPU_RAS_FIRMWARE_ERROR_QUERY;
else
*error_query_mode = AMDGPU_RAS_DIRECT_ERROR_QUERY;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 408e21c3cc88..2afac9aa381a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -442,7 +442,7 @@ struct amdgpu_ras {
/* Indicates smu whether need update bad channel info */
bool update_channel_flag;
/* Record status of smu mca debug mode */
- bool is_mca_debug_mode;
+ bool is_aca_debug_mode;
/* Record special requirements of gpu reset caller */
uint32_t gpu_reset_flags;
@@ -530,6 +530,8 @@ struct ras_manager {
struct ras_ih_data ih_data;
struct ras_err_data err_data;
+
+ struct aca_handle aca_handle;
};
struct ras_badpage {
@@ -781,9 +783,9 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct amdgpu_device *adev);
int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con);
-int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool enable); int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable); -bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev);
+int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool
+enable); bool amdgpu_ras_get_aca_debug_mode(struct amdgpu_device
+*adev);
bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
unsigned int *mode);
@@ -821,4 +823,8 @@ int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
struct amdgpu_smuio_mcm_config_info *mcm_info,
struct ras_err_addr *err_addr, u64 count);
+int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
+ const struct aca_info *aca_info, void *data); int
+amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block
+blk);
+
#endif
--
2.34.1
More information about the amd-gfx
mailing list