[PATCH] drm/amdgpu: add ras event id support
Zhou1, Tao
Tao.Zhou1 at amd.com
Thu Mar 14 08:48:33 UTC 2024
[AMD Official Use Only - General]
Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Yang
> Wang
> Sent: Thursday, March 14, 2024 4:12 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Wang, Yang(Kevin) <KevinYang.Wang at amd.com>; Zhang, Hawking
> <Hawking.Zhang at amd.com>
> Subject: [PATCH] drm/amdgpu: add ras event id support
>
> Add amdgpu RAS event ID support to better distinguish different error information
> sources in dmesg logs.
>
> The following logs will be identified by event ID:
> {event_id} interrupt to inform RAS event
> {event_id} ACA logs
> {event_id} error statistics since the current injection/error query
> {event_id} error statistics since GPU load
>
> Signed-off-by: Yang Wang <kevinyang.wang at amd.com>
> Reviewed-by: Hawking Zhang <Hawking.Zhang at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 32 ++--
> drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h | 3 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 203 +++++++++++++++--------
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 30 ++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 1 +
> drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 10 +-
> 6 files changed, 191 insertions(+), 88 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
> index 24ad4b97177b..0734490347db 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
> @@ -210,22 +210,26 @@ int amdgpu_mca_smu_set_debug_mode(struct
> amdgpu_device *adev, bool enable)
> return -EOPNOTSUPP;
> }
>
> -static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev,
> int idx, struct mca_bank_entry *entry)
> +static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev,
> int idx, struct mca_bank_entry *entry,
> + struct ras_query_context *qctx)
> {
> - dev_info(adev->dev, HW_ERR "Accelerator Check Architecture events
> logged\n");
> - dev_info(adev->dev, HW_ERR "aca entry[%02d].STATUS=0x%016llx\n",
> - idx, entry->regs[MCA_REG_IDX_STATUS]);
> - dev_info(adev->dev, HW_ERR "aca entry[%02d].ADDR=0x%016llx\n",
> - idx, entry->regs[MCA_REG_IDX_ADDR]);
> - dev_info(adev->dev, HW_ERR "aca entry[%02d].MISC0=0x%016llx\n",
> - idx, entry->regs[MCA_REG_IDX_MISC0]);
> - dev_info(adev->dev, HW_ERR "aca entry[%02d].IPID=0x%016llx\n",
> - idx, entry->regs[MCA_REG_IDX_IPID]);
> - dev_info(adev->dev, HW_ERR "aca entry[%02d].SYND=0x%016llx\n",
> - idx, entry->regs[MCA_REG_IDX_SYND]);
> + u64 event_id = qctx->event_id;
> +
> + RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check
> Architecture events logged\n");
> + RAS_EVENT_LOG(adev, event_id, HW_ERR "aca
> entry[%02d].STATUS=0x%016llx\n",
> + idx, entry->regs[MCA_REG_IDX_STATUS]);
> + RAS_EVENT_LOG(adev, event_id, HW_ERR "aca
> entry[%02d].ADDR=0x%016llx\n",
> + idx, entry->regs[MCA_REG_IDX_ADDR]);
> + RAS_EVENT_LOG(adev, event_id, HW_ERR "aca
> entry[%02d].MISC0=0x%016llx\n",
> + idx, entry->regs[MCA_REG_IDX_MISC0]);
> + RAS_EVENT_LOG(adev, event_id, HW_ERR "aca
> entry[%02d].IPID=0x%016llx\n",
> + idx, entry->regs[MCA_REG_IDX_IPID]);
> + RAS_EVENT_LOG(adev, event_id, HW_ERR "aca
> entry[%02d].SYND=0x%016llx\n",
> + idx, entry->regs[MCA_REG_IDX_SYND]);
> }
>
> -int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum
> amdgpu_ras_block blk, enum amdgpu_mca_error_type type, struct ras_err_data
> *err_data)
> +int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum
> amdgpu_ras_block blk, enum amdgpu_mca_error_type type,
> + struct ras_err_data *err_data, struct
> ras_query_context *qctx)
> {
> struct amdgpu_smuio_mcm_config_info mcm_info;
> struct ras_err_addr err_addr = {0};
> @@ -244,7 +248,7 @@ int amdgpu_mca_smu_log_ras_error(struct
> amdgpu_device *adev, enum amdgpu_ras_blo
> list_for_each_entry(node, &mca_set.list, node) {
> entry = &node->entry;
>
> - amdgpu_mca_smu_mca_bank_dump(adev, i++, entry);
> + amdgpu_mca_smu_mca_bank_dump(adev, i++, entry, qctx);
>
> count = 0;
> ret = amdgpu_mca_smu_parse_mca_error_count(adev, blk, type,
> entry, &count); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
> index b964110ed1e0..e5bf07ce3451 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
> @@ -169,6 +169,7 @@ void amdgpu_mca_smu_debugfs_init(struct
> amdgpu_device *adev, struct dentry *root void
> amdgpu_mca_bank_set_init(struct mca_bank_set *mca_set); int
> amdgpu_mca_bank_set_add_entry(struct mca_bank_set *mca_set, struct
> mca_bank_entry *entry); void amdgpu_mca_bank_set_release(struct
> mca_bank_set *mca_set); -int amdgpu_mca_smu_log_ras_error(struct
> amdgpu_device *adev, enum amdgpu_ras_block blk, enum
> amdgpu_mca_error_type type, struct ras_err_data *err_data);
> +int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum
> amdgpu_ras_block blk, enum amdgpu_mca_error_type type,
> + struct ras_err_data *err_data, struct
> ras_query_context *qctx);
>
> #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 8ebab6f22e5a..ef87f107c942 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1045,6 +1045,7 @@ static void amdgpu_ras_get_ecc_info(struct
> amdgpu_device *adev, struct ras_err_d static void
> amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
> struct ras_manager *ras_mgr,
> struct ras_err_data *err_data,
> + struct ras_query_context *qctx,
> const char *blk_name,
> bool is_ue,
> bool is_de)
> @@ -1052,27 +1053,28 @@ static void
> amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
> struct amdgpu_smuio_mcm_config_info *mcm_info;
> struct ras_err_node *err_node;
> struct ras_err_info *err_info;
> + u64 event_id = qctx->event_id;
>
> if (is_ue) {
> for_each_ras_error(err_node, err_data) {
> err_info = &err_node->err_info;
> mcm_info = &err_info->mcm_info;
> if (err_info->ue_count) {
> - dev_info(adev->dev, "socket: %d, die: %d, "
> - "%lld new uncorrectable hardware
> errors detected in %s block\n",
> - mcm_info->socket_id,
> - mcm_info->die_id,
> - err_info->ue_count,
> - blk_name);
> + RAS_EVENT_LOG(adev, event_id, "socket: %d,
> die: %d, "
> + "%lld new uncorrectable hardware
> errors detected in %s block\n",
> + mcm_info->socket_id,
> + mcm_info->die_id,
> + err_info->ue_count,
> + blk_name);
> }
> }
>
> for_each_ras_error(err_node, &ras_mgr->err_data) {
> err_info = &err_node->err_info;
> mcm_info = &err_info->mcm_info;
> - dev_info(adev->dev, "socket: %d, die: %d, "
> - "%lld uncorrectable hardware errors detected
> in total in %s block\n",
> - mcm_info->socket_id, mcm_info->die_id,
> err_info->ue_count, blk_name);
> + RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
> + "%lld uncorrectable hardware errors
> detected in total in %s block\n",
> + mcm_info->socket_id, mcm_info->die_id,
> err_info->ue_count,
> +blk_name);
> }
>
> } else {
> @@ -1081,44 +1083,44 @@ static void
> amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
> err_info = &err_node->err_info;
> mcm_info = &err_info->mcm_info;
> if (err_info->de_count) {
> - dev_info(adev->dev, "socket: %d,
> die: %d, "
> - "%lld new deferred hardware
> errors detected in %s block\n",
> - mcm_info->socket_id,
> - mcm_info->die_id,
> - err_info->de_count,
> - blk_name);
> + RAS_EVENT_LOG(adev, event_id,
> "socket: %d, die: %d, "
> + "%lld new deferred
> hardware errors detected in %s block\n",
> + mcm_info->socket_id,
> + mcm_info->die_id,
> + err_info->de_count,
> + blk_name);
> }
> }
>
> for_each_ras_error(err_node, &ras_mgr->err_data) {
> err_info = &err_node->err_info;
> mcm_info = &err_info->mcm_info;
> - dev_info(adev->dev, "socket: %d, die: %d, "
> - "%lld deferred hardware errors
> detected in total in %s block\n",
> - mcm_info->socket_id, mcm_info-
> >die_id,
> - err_info->de_count, blk_name);
> + RAS_EVENT_LOG(adev, event_id, "socket: %d,
> die: %d, "
> + "%lld deferred hardware errors
> detected in total in %s block\n",
> + mcm_info->socket_id, mcm_info-
> >die_id,
> + err_info->de_count, blk_name);
> }
> } else {
> for_each_ras_error(err_node, err_data) {
> err_info = &err_node->err_info;
> mcm_info = &err_info->mcm_info;
> if (err_info->ce_count) {
> - dev_info(adev->dev, "socket: %d,
> die: %d, "
> - "%lld new correctable
> hardware errors detected in %s block\n",
> - mcm_info->socket_id,
> - mcm_info->die_id,
> - err_info->ce_count,
> - blk_name);
> + RAS_EVENT_LOG(adev, event_id,
> "socket: %d, die: %d, "
> + "%lld new correctable
> hardware errors detected in %s block\n",
> + mcm_info->socket_id,
> + mcm_info->die_id,
> + err_info->ce_count,
> + blk_name);
> }
> }
>
> for_each_ras_error(err_node, &ras_mgr->err_data) {
> err_info = &err_node->err_info;
> mcm_info = &err_info->mcm_info;
> - dev_info(adev->dev, "socket: %d, die: %d, "
> - "%lld correctable hardware errors
> detected in total in %s block\n",
> - mcm_info->socket_id, mcm_info-
> >die_id,
> - err_info->ce_count, blk_name);
> + RAS_EVENT_LOG(adev, event_id, "socket: %d,
> die: %d, "
> + "%lld correctable hardware errors
> detected in total in %s block\n",
> + mcm_info->socket_id, mcm_info-
> >die_id,
> + err_info->ce_count, blk_name);
> }
> }
> }
> @@ -1131,77 +1133,79 @@ static inline bool err_data_has_source_info(struct
> ras_err_data *data)
>
> static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
> struct ras_query_if *query_if,
> - struct ras_err_data *err_data)
> + struct ras_err_data *err_data,
> + struct ras_query_context *qctx)
> {
> struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if-
> >head);
> const char *blk_name = get_ras_block_str(&query_if->head);
> + u64 event_id = qctx->event_id;
>
> if (err_data->ce_count) {
> if (err_data_has_source_info(err_data)) {
> - amdgpu_ras_error_print_error_data(adev, ras_mgr,
> err_data,
> + amdgpu_ras_error_print_error_data(adev, ras_mgr,
> err_data, qctx,
> blk_name, false,
> false);
> } else if (!adev->aid_mask &&
> adev->smuio.funcs &&
> adev->smuio.funcs->get_socket_id &&
> adev->smuio.funcs->get_die_id) {
> - dev_info(adev->dev, "socket: %d, die: %d "
> - "%ld correctable hardware errors "
> - "detected in %s block\n",
> - adev->smuio.funcs->get_socket_id(adev),
> - adev->smuio.funcs->get_die_id(adev),
> - ras_mgr->err_data.ce_count,
> - blk_name);
> + RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d "
> + "%ld correctable hardware errors "
> + "detected in %s block\n",
> + adev->smuio.funcs->get_socket_id(adev),
> + adev->smuio.funcs->get_die_id(adev),
> + ras_mgr->err_data.ce_count,
> + blk_name);
> } else {
> - dev_info(adev->dev, "%ld correctable hardware errors "
> - "detected in %s block\n",
> - ras_mgr->err_data.ce_count,
> - blk_name);
> + RAS_EVENT_LOG(adev, event_id, "%ld correctable
> hardware errors "
> + "detected in %s block\n",
> + ras_mgr->err_data.ce_count,
> + blk_name);
> }
> }
>
> if (err_data->ue_count) {
> if (err_data_has_source_info(err_data)) {
> - amdgpu_ras_error_print_error_data(adev, ras_mgr,
> err_data,
> + amdgpu_ras_error_print_error_data(adev, ras_mgr,
> err_data, qctx,
> blk_name, true,
> false);
> } else if (!adev->aid_mask &&
> adev->smuio.funcs &&
> adev->smuio.funcs->get_socket_id &&
> adev->smuio.funcs->get_die_id) {
> - dev_info(adev->dev, "socket: %d, die: %d "
> - "%ld uncorrectable hardware errors "
> - "detected in %s block\n",
> - adev->smuio.funcs->get_socket_id(adev),
> - adev->smuio.funcs->get_die_id(adev),
> - ras_mgr->err_data.ue_count,
> - blk_name);
> + RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d "
> + "%ld uncorrectable hardware errors "
> + "detected in %s block\n",
> + adev->smuio.funcs->get_socket_id(adev),
> + adev->smuio.funcs->get_die_id(adev),
> + ras_mgr->err_data.ue_count,
> + blk_name);
> } else {
> - dev_info(adev->dev, "%ld uncorrectable hardware errors
> "
> - "detected in %s block\n",
> - ras_mgr->err_data.ue_count,
> - blk_name);
> + RAS_EVENT_LOG(adev, event_id, "%ld uncorrectable
> hardware errors "
> + "detected in %s block\n",
> + ras_mgr->err_data.ue_count,
> + blk_name);
> }
> }
>
> if (err_data->de_count) {
> if (err_data_has_source_info(err_data)) {
> - amdgpu_ras_error_print_error_data(adev, ras_mgr,
> err_data,
> + amdgpu_ras_error_print_error_data(adev, ras_mgr,
> err_data, qctx,
> blk_name, false,
> true);
> } else if (!adev->aid_mask &&
> adev->smuio.funcs &&
> adev->smuio.funcs->get_socket_id &&
> adev->smuio.funcs->get_die_id) {
> - dev_info(adev->dev, "socket: %d, die: %d "
> - "%ld deferred hardware errors "
> - "detected in %s block\n",
> - adev->smuio.funcs->get_socket_id(adev),
> - adev->smuio.funcs->get_die_id(adev),
> - ras_mgr->err_data.de_count,
> - blk_name);
> + RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d "
> + "%ld deferred hardware errors "
> + "detected in %s block\n",
> + adev->smuio.funcs->get_socket_id(adev),
> + adev->smuio.funcs->get_die_id(adev),
> + ras_mgr->err_data.de_count,
> + blk_name);
> } else {
> - dev_info(adev->dev, "%ld deferred hardware errors "
> - "detected in %s block\n",
> - ras_mgr->err_data.de_count,
> - blk_name);
> + RAS_EVENT_LOG(adev, event_id, "%ld deferred
> hardware errors "
> + "detected in %s block\n",
> + ras_mgr->err_data.de_count,
> + blk_name);
> }
> }
> }
> @@ -1294,6 +1298,7 @@ ssize_t amdgpu_ras_aca_sysfs_read(struct device
> *dev, struct device_attribute *a static int
> amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
> struct ras_query_if *info,
> struct ras_err_data *err_data,
> + struct ras_query_context
> *qctx,
> unsigned int
> error_query_mode)
> {
> enum amdgpu_ras_block blk = info ? info->head.block :
> AMDGPU_RAS_BLOCK_COUNT; @@ -1338,8 +1343,8 @@ static int
> amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
> return ret;
> } else {
> /* FIXME: add code to check return value later */
> - amdgpu_mca_smu_log_ras_error(adev, blk,
> AMDGPU_MCA_ERROR_TYPE_UE, err_data);
> - amdgpu_mca_smu_log_ras_error(adev, blk,
> AMDGPU_MCA_ERROR_TYPE_CE, err_data);
> + amdgpu_mca_smu_log_ras_error(adev, blk,
> AMDGPU_MCA_ERROR_TYPE_UE, err_data, qctx);
> + amdgpu_mca_smu_log_ras_error(adev, blk,
> AMDGPU_MCA_ERROR_TYPE_CE,
> +err_data, qctx);
> }
> }
>
> @@ -1351,6 +1356,7 @@ int amdgpu_ras_query_error_status(struct
> amdgpu_device *adev, struct ras_query_i {
> struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
> struct ras_err_data err_data;
> + struct ras_query_context qctx;
> unsigned int error_query_mode;
> int ret;
>
> @@ -1364,8 +1370,12 @@ int amdgpu_ras_query_error_status(struct
> amdgpu_device *adev, struct ras_query_i
> if (!amdgpu_ras_get_error_query_mode(adev, &error_query_mode))
> return -EINVAL;
>
> + memset(&qctx, 0, sizeof(qctx));
> + qctx.event_id = amdgpu_ras_acquire_event_id(adev,
> amdgpu_ras_intr_triggered() ?
> + RAS_EVENT_TYPE_ISR :
> RAS_EVENT_TYPE_INVALID);
> ret = amdgpu_ras_query_error_status_helper(adev, info,
> &err_data,
> + &qctx,
> error_query_mode);
> if (ret)
> goto out_fini_err_data;
> @@ -1376,7 +1386,7 @@ int amdgpu_ras_query_error_status(struct
> amdgpu_device *adev, struct ras_query_i
> info->ce_count = obj->err_data.ce_count;
> info->de_count = obj->err_data.de_count;
>
> - amdgpu_ras_error_generate_report(adev, info, &err_data);
> + amdgpu_ras_error_generate_report(adev, info, &err_data, &qctx);
>
> out_fini_err_data:
> amdgpu_ras_error_data_fini(&err_data);
> @@ -3036,6 +3046,31 @@ static int amdgpu_get_ras_schema(struct
> amdgpu_device *adev)
> AMDGPU_RAS_ERROR__PARITY;
> }
>
> +static void ras_event_mgr_init(struct ras_event_manager *mgr) {
> + int i;
> +
> + for (i = 0; i < ARRAY_SIZE(mgr->seqnos); i++)
> + atomic64_set(&mgr->seqnos[i], 0);
> +}
> +
> +static void amdgpu_ras_event_mgr_init(struct amdgpu_device *adev) {
> + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
> +
> + ras->event_mgr = hive ? &hive->event_mgr : &ras->__event_mgr;
> +
> + /* init event manager with node 0 on xgmi system */
> + if (!amdgpu_in_reset(adev)) {
> + if (!hive || adev->gmc.xgmi.node_id == 0)
> + ras_event_mgr_init(ras->event_mgr);
> + }
> +
> + if (hive)
> + amdgpu_put_xgmi_hive(hive);
> +}
> +
> int amdgpu_ras_init(struct amdgpu_device *adev) {
> struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -3356,6
> +3391,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
> if (amdgpu_sriov_vf(adev))
> return 0;
>
> + amdgpu_ras_event_mgr_init(adev);
> +
> if (amdgpu_aca_is_enabled(adev)) {
> if (amdgpu_in_reset(adev))
> r = amdgpu_aca_reset(adev);
> @@ -3472,13 +3509,37 @@ void amdgpu_ras_set_fed(struct amdgpu_device
> *adev, bool status)
> atomic_set(&ras->fed, !!status);
> }
>
> +bool amdgpu_ras_event_id_is_valid(struct amdgpu_device *adev, u64 id) {
> + return !(id & BIT_ULL(63));
> +}
> +
> +u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum
> +ras_event_type type) {
> + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> + u64 id;
> +
> + switch (type) {
> + case RAS_EVENT_TYPE_ISR:
> + id = (u64)atomic64_read(&ras->event_mgr->seqnos[type]);
> + break;
> + case RAS_EVENT_TYPE_INVALID:
> + default:
> + id = BIT_ULL(63) | 0ULL;
> + break;
> + }
> +
> + return id;
> +}
> +
> void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) {
> if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
> struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> + u64 event_id =
> +(u64)atomic64_inc_return(&ras->event_mgr->seqnos[RAS_EVENT_TYPE_ISR]);
>
> - dev_info(adev->dev, "uncorrectable hardware error"
> - "(ERREVENT_ATHUB_INTERRUPT) detected!\n");
> + RAS_EVENT_LOG(adev, event_id, "uncorrectable hardware
> error"
> + "(ERREVENT_ATHUB_INTERRUPT) detected!\n");
>
> ras->gpu_reset_flags |=
> AMDGPU_RAS_GPU_RESET_MODE1_RESET;
> amdgpu_ras_reset_gpu(adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index e0f8ce9d8440..64788ae7d85d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -64,6 +64,14 @@ struct amdgpu_iv_entry;
> /* The high three bits indicates socketid */ #define
> AMDGPU_RAS_GET_FEATURES(val) ((val) &
> ~AMDGPU_RAS_FEATURES_SOCKETID_MASK)
>
> +#define RAS_EVENT_LOG(adev, id, fmt, ...) \
> +do { \
> + if (amdgpu_ras_event_id_is_valid((adev), (id))) \
> + dev_info((adev)->dev, "{%llu}" fmt, (id), ##__VA_ARGS__); \
> + else \
> + dev_info((adev)->dev, fmt, ##__VA_ARGS__); \
> +} while (0)
> +
> enum amdgpu_ras_block {
> AMDGPU_RAS_BLOCK__UMC = 0,
> AMDGPU_RAS_BLOCK__SDMA,
> @@ -419,6 +427,21 @@ struct umc_ecc_info {
> int record_ce_addr_supported;
> };
>
> +enum ras_event_type {
> + RAS_EVENT_TYPE_INVALID = -1,
> + RAS_EVENT_TYPE_ISR = 0,
> + RAS_EVENT_TYPE_COUNT,
> +};
> +
> +struct ras_event_manager {
> + atomic64_t seqnos[RAS_EVENT_TYPE_COUNT]; };
> +
> +struct ras_query_context {
> + enum ras_event_type type;
> + u64 event_id;
> +};
> +
> struct amdgpu_ras {
> /* ras infrastructure */
> /* for ras itself. */
> @@ -479,6 +502,11 @@ struct amdgpu_ras {
> atomic_t page_retirement_req_cnt;
> /* Fatal error detected flag */
> atomic_t fed;
> +
> + /* RAS event manager */
> + struct ras_event_manager __event_mgr;
> + struct ras_event_manager *event_mgr;
> +
> };
>
> struct ras_fs_data {
> @@ -879,4 +907,6 @@ void amdgpu_ras_del_mca_err_addr(struct ras_err_info
> *err_info, void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status);
> bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev);
>
> +bool amdgpu_ras_event_id_is_valid(struct amdgpu_device *adev, u64 id);
> +u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum
> +ras_event_type type);
> #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> index 1592c63b3099..a3bfc16de6d4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> @@ -44,6 +44,7 @@ struct amdgpu_hive_info {
>
> struct amdgpu_reset_domain *reset_domain;
> atomic_t ras_recovery;
> + struct ras_event_manager event_mgr;
> };
>
> struct amdgpu_pcs_ras_field {
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> index 77af4e25ff46..4a02e1f041da 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> @@ -404,10 +404,16 @@ static int umc_v12_0_err_cnt_init_per_channel(struct
> amdgpu_device *adev, static void
> umc_v12_0_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
> void *ras_error_status)
> {
> + struct ras_query_context qctx;
> +
> + memset(&qctx, 0, sizeof(qctx));
> + qctx.event_id = amdgpu_ras_acquire_event_id(adev,
> amdgpu_ras_intr_triggered() ?
> + RAS_EVENT_TYPE_ISR :
> RAS_EVENT_TYPE_INVALID);
> +
> amdgpu_mca_smu_log_ras_error(adev,
> - AMDGPU_RAS_BLOCK__UMC,
> AMDGPU_MCA_ERROR_TYPE_CE, ras_error_status);
> + AMDGPU_RAS_BLOCK__UMC,
> AMDGPU_MCA_ERROR_TYPE_CE, ras_error_status,
> +&qctx);
> amdgpu_mca_smu_log_ras_error(adev,
> - AMDGPU_RAS_BLOCK__UMC,
> AMDGPU_MCA_ERROR_TYPE_UE, ras_error_status);
> + AMDGPU_RAS_BLOCK__UMC,
> AMDGPU_MCA_ERROR_TYPE_UE, ras_error_status,
> +&qctx);
> }
>
> static void umc_v12_0_ecc_info_query_ras_error_address(struct
> amdgpu_device *adev,
> --
> 2.34.1
More information about the amd-gfx
mailing list