[PATCH v2 06/12] drm/amdgpu: add amdgpu ras aca query interface
Yang Wang
kevinyang.wang at amd.com
Thu Jan 4 11:48:52 UTC 2024
v1:
- add ACA error query interface
v2:
- Add a new helper function to determine whether to use ACA or MCA.
Signed-off-by: Yang Wang <kevinyang.wang at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 8 ++
drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h | 2 +
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 105 ++++++++++++++++++++----
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 12 ++-
4 files changed, 109 insertions(+), 18 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
index a460cde20cf2..2ad06f87de8c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
@@ -594,6 +594,9 @@ int amdgpu_aca_add_handle(struct amdgpu_device *adev, struct aca_handle *handle,
struct amdgpu_aca *aca = &adev->aca;
int ret;
+ if (!amdgpu_aca_is_enabled(adev))
+ return 0;
+
ret = add_aca_handle(adev, &aca->mgr, handle, name, ras_info, data);
if (ret)
return ret;
@@ -634,6 +637,11 @@ static void aca_manager_fini(struct aca_handle_manager *mgr)
remove_aca(handle);
}
+bool amdgpu_aca_is_enabled(struct amdgpu_device *adev)
+{
+ return adev->aca.is_enabled;
+}
+
int amdgpu_aca_init(struct amdgpu_device *adev)
{
struct amdgpu_aca *aca = &adev->aca;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
index bb0a3be72cc8..ada4f6c4660f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h
@@ -175,6 +175,7 @@ struct aca_smu_funcs {
struct amdgpu_aca {
struct aca_handle_manager mgr;
const struct aca_smu_funcs *smu_funcs;
+ bool is_enabled;
};
struct aca_info {
@@ -186,6 +187,7 @@ struct aca_info {
int amdgpu_aca_init(struct amdgpu_device *adev);
void amdgpu_aca_fini(struct amdgpu_device *adev);
void amdgpu_aca_set_smu_funcs(struct amdgpu_device *adev, const struct aca_smu_funcs *smu_funcs);
+bool amdgpu_aca_is_enabled(struct amdgpu_device *adev);
int aca_bank_info_decode(struct aca_bank *bank, struct aca_bank_info *info);
int aca_bank_check_error_codes(struct amdgpu_device *adev, struct aca_bank *bank, int *err_codes, int size);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 7048bf853cf6..602bfe2f682e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1183,6 +1183,53 @@ ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *a
"ce", info.ce_count);
}
+static struct ras_manager *get_ras_manager(struct amdgpu_device *adev, enum amdgpu_ras_block blk)
+{
+ struct ras_common_if head;
+
+ memset(&head, 0, sizeof(head));
+ head.block = blk;
+
+ return amdgpu_ras_find_obj(adev, &head);
+}
+
+int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
+ const struct aca_info *aca_info, void *data)
+{
+ struct ras_manager *obj;
+
+ obj = get_ras_manager(adev, blk);
+ if (!obj)
+ return -EINVAL;
+
+ return amdgpu_aca_add_handle(adev, &obj->aca_handle, ras_block_str(blk), aca_info, data);
+}
+
+int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk)
+{
+ struct ras_manager *obj;
+
+ obj = get_ras_manager(adev, blk);
+ if (!obj)
+ return -EINVAL;
+
+ amdgpu_aca_remove_handle(&obj->aca_handle);
+
+ return 0;
+}
+
+static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
+ enum aca_error_type type, struct ras_err_data *err_data)
+{
+ struct ras_manager *obj;
+
+ obj = get_ras_manager(adev, blk);
+ if (!obj)
+ return -EINVAL;
+
+ return amdgpu_aca_get_error_data(adev, &obj->aca_handle, type, err_data);
+}
+
static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
struct ras_query_if *info,
struct ras_err_data *err_data,
@@ -1190,6 +1237,7 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
{
enum amdgpu_ras_block blk = info ? info->head.block : AMDGPU_RAS_BLOCK_COUNT;
struct amdgpu_ras_block_object *block_obj = NULL;
+ int ret;
if (blk == AMDGPU_RAS_BLOCK_COUNT)
return -EINVAL;
@@ -1219,9 +1267,19 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
}
}
} else {
- /* FIXME: add code to check return value later */
- amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data);
- amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data);
+ if (amdgpu_aca_is_enabled(adev)) {
+ ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_UE, err_data);
+ if (ret)
+ return ret;
+
+ ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_CE, err_data);
+ if (ret)
+ return ret;
+ } else {
+ /* FIXME: add code to check return value later */
+ amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data);
+ amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data);
+ }
}
return 0;
@@ -1270,6 +1328,7 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0);
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
+ const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs;
struct amdgpu_hive_info *hive;
int hive_ras_recovery = 0;
@@ -1280,7 +1339,7 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
}
if (!amdgpu_ras_is_supported(adev, block) ||
- !amdgpu_ras_get_mca_debug_mode(adev))
+ !amdgpu_ras_get_aca_debug_mode(adev))
return -EOPNOTSUPP;
hive = amdgpu_get_xgmi_hive(adev);
@@ -1292,7 +1351,8 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
/* skip ras error reset in gpu reset */
if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery) ||
hive_ras_recovery) &&
- mca_funcs && mca_funcs->mca_set_debug_mode)
+ ((smu_funcs && smu_funcs->set_debug_mode) ||
+ (mca_funcs && mca_funcs->mca_set_debug_mode)))
return -EOPNOTSUPP;
if (block_obj->hw_ops->reset_ras_error_count)
@@ -1788,7 +1848,10 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
}
}
- amdgpu_mca_smu_debugfs_init(adev, dir);
+ if (amdgpu_aca_is_enabled(adev))
+ amdgpu_aca_smu_debugfs_init(adev, dir);
+ else
+ amdgpu_mca_smu_debugfs_init(adev, dir);
}
/* debugfs end */
@@ -2769,6 +2832,9 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 :
adev->ras_hw_enabled & amdgpu_ras_mask;
+
+ /* aca is disabled by default */
+ adev->aca.is_enabled = false;
}
static void amdgpu_ras_counte_dw(struct work_struct *work)
@@ -3153,7 +3219,10 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
if (amdgpu_sriov_vf(adev))
return 0;
- amdgpu_ras_set_mca_debug_mode(adev, false);
+ if (amdgpu_aca_is_enabled(adev))
+ amdgpu_ras_set_aca_debug_mode(adev, false);
+ else
+ amdgpu_ras_set_mca_debug_mode(adev, false);
list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
if (!node->ras_obj) {
@@ -3436,7 +3505,7 @@ int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable)
if (con) {
ret = amdgpu_mca_smu_set_debug_mode(adev, enable);
if (!ret)
- con->is_mca_debug_mode = enable;
+ con->is_aca_debug_mode = enable;
}
return ret;
@@ -3448,24 +3517,29 @@ int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool enable)
int ret = 0;
if (con) {
- ret = amdgpu_aca_smu_set_debug_mode(adev, enable);
+ if (amdgpu_aca_is_enabled(adev))
+ ret = amdgpu_aca_smu_set_debug_mode(adev, enable);
+ else
+ ret = amdgpu_mca_smu_set_debug_mode(adev, enable);
if (!ret)
- con->is_mca_debug_mode = enable;
+ con->is_aca_debug_mode = enable;
}
return ret;
}
-bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev)
+bool amdgpu_ras_get_aca_debug_mode(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs;
const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
if (!con)
return false;
- if (mca_funcs && mca_funcs->mca_set_debug_mode)
- return con->is_mca_debug_mode;
+ if ((amdgpu_aca_is_enabled(adev) && smu_funcs && smu_funcs->set_debug_mode) ||
+ (!amdgpu_aca_is_enabled(adev) && mca_funcs && mca_funcs->mca_set_debug_mode))
+ return con->is_aca_debug_mode;
else
return true;
}
@@ -3475,15 +3549,16 @@ bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
+ const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs;
if (!con) {
*error_query_mode = AMDGPU_RAS_INVALID_ERROR_QUERY;
return false;
}
- if (mca_funcs && mca_funcs->mca_set_debug_mode)
+ if ((smu_funcs && smu_funcs->set_debug_mode) || (mca_funcs && mca_funcs->mca_set_debug_mode))
*error_query_mode =
- (con->is_mca_debug_mode) ? AMDGPU_RAS_DIRECT_ERROR_QUERY : AMDGPU_RAS_FIRMWARE_ERROR_QUERY;
+ (con->is_aca_debug_mode) ? AMDGPU_RAS_DIRECT_ERROR_QUERY : AMDGPU_RAS_FIRMWARE_ERROR_QUERY;
else
*error_query_mode = AMDGPU_RAS_DIRECT_ERROR_QUERY;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 8c487f3bfbf1..4503cc6eec66 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -442,7 +442,7 @@ struct amdgpu_ras {
/* Indicates smu whether need update bad channel info */
bool update_channel_flag;
/* Record status of smu mca debug mode */
- bool is_mca_debug_mode;
+ bool is_aca_debug_mode;
/* Record special requirements of gpu reset caller */
uint32_t gpu_reset_flags;
@@ -530,6 +530,8 @@ struct ras_manager {
struct ras_ih_data ih_data;
struct ras_err_data err_data;
+
+ struct aca_handle aca_handle;
};
struct ras_badpage {
@@ -781,9 +783,9 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct amdgpu_device *adev);
int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con);
-int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool enable);
int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
-bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev);
+int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool enable);
+bool amdgpu_ras_get_aca_debug_mode(struct amdgpu_device *adev);
bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
unsigned int *mode);
@@ -824,4 +826,8 @@ int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *attr,
struct aca_handle *handle, char *buf, void *data);
+int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
+ const struct aca_info *aca_info, void *data);
+int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk);
+
#endif
--
2.34.1
More information about the amd-gfx
mailing list