[PATCH] drm/amdgpu: add ras event id support

Zhou1, Tao Tao.Zhou1 at amd.com
Thu Mar 14 08:48:33 UTC 2024


[AMD Official Use Only - General]

Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>

> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Yang
> Wang
> Sent: Thursday, March 14, 2024 4:12 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Wang, Yang(Kevin) <KevinYang.Wang at amd.com>; Zhang, Hawking
> <Hawking.Zhang at amd.com>
> Subject: [PATCH] drm/amdgpu: add ras event id support
>
> Add amdgpu RAS event id support to better distinguish between different error
> information sources in dmesg logs.
>
> The following logs will be identified by event id:
>   {event_id} interrupt to inform RAS event
>   {event_id} ACA logs
>   {event_id} error statistics since the current injection/error query
>   {event_id} error statistics since GPU load
>
> Signed-off-by: Yang Wang <kevinyang.wang at amd.com>
> Reviewed-by: Hawking Zhang <Hawking.Zhang at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c  |  32 ++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h  |   3 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 203 +++++++++++++++--------
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h  |  30 ++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h |   1 +
>  drivers/gpu/drm/amd/amdgpu/umc_v12_0.c   |  10 +-
>  6 files changed, 191 insertions(+), 88 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
> index 24ad4b97177b..0734490347db 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
> @@ -210,22 +210,26 @@ int amdgpu_mca_smu_set_debug_mode(struct
> amdgpu_device *adev, bool enable)
>       return -EOPNOTSUPP;
>  }
>
> -static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev,
> int idx, struct mca_bank_entry *entry)
> +static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev,
> int idx, struct mca_bank_entry *entry,
> +                                      struct ras_query_context *qctx)
>  {
> -     dev_info(adev->dev, HW_ERR "Accelerator Check Architecture events
> logged\n");
> -     dev_info(adev->dev, HW_ERR "aca entry[%02d].STATUS=0x%016llx\n",
> -              idx, entry->regs[MCA_REG_IDX_STATUS]);
> -     dev_info(adev->dev, HW_ERR "aca entry[%02d].ADDR=0x%016llx\n",
> -              idx, entry->regs[MCA_REG_IDX_ADDR]);
> -     dev_info(adev->dev, HW_ERR "aca entry[%02d].MISC0=0x%016llx\n",
> -              idx, entry->regs[MCA_REG_IDX_MISC0]);
> -     dev_info(adev->dev, HW_ERR "aca entry[%02d].IPID=0x%016llx\n",
> -              idx, entry->regs[MCA_REG_IDX_IPID]);
> -     dev_info(adev->dev, HW_ERR "aca entry[%02d].SYND=0x%016llx\n",
> -              idx, entry->regs[MCA_REG_IDX_SYND]);
> +     u64 event_id = qctx->event_id;
> +
> +     RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check
> Architecture events logged\n");
> +     RAS_EVENT_LOG(adev, event_id, HW_ERR "aca
> entry[%02d].STATUS=0x%016llx\n",
> +                   idx, entry->regs[MCA_REG_IDX_STATUS]);
> +     RAS_EVENT_LOG(adev, event_id, HW_ERR "aca
> entry[%02d].ADDR=0x%016llx\n",
> +                   idx, entry->regs[MCA_REG_IDX_ADDR]);
> +     RAS_EVENT_LOG(adev, event_id, HW_ERR "aca
> entry[%02d].MISC0=0x%016llx\n",
> +                   idx, entry->regs[MCA_REG_IDX_MISC0]);
> +     RAS_EVENT_LOG(adev, event_id, HW_ERR "aca
> entry[%02d].IPID=0x%016llx\n",
> +                   idx, entry->regs[MCA_REG_IDX_IPID]);
> +     RAS_EVENT_LOG(adev, event_id, HW_ERR "aca
> entry[%02d].SYND=0x%016llx\n",
> +                   idx, entry->regs[MCA_REG_IDX_SYND]);
>  }
>
> -int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum
> amdgpu_ras_block blk, enum amdgpu_mca_error_type type, struct ras_err_data
> *err_data)
> +int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum
> amdgpu_ras_block blk, enum amdgpu_mca_error_type type,
> +                              struct ras_err_data *err_data, struct
> ras_query_context *qctx)
>  {
>       struct amdgpu_smuio_mcm_config_info mcm_info;
>       struct ras_err_addr err_addr = {0};
> @@ -244,7 +248,7 @@ int amdgpu_mca_smu_log_ras_error(struct
> amdgpu_device *adev, enum amdgpu_ras_blo
>       list_for_each_entry(node, &mca_set.list, node) {
>               entry = &node->entry;
>
> -             amdgpu_mca_smu_mca_bank_dump(adev, i++, entry);
> +             amdgpu_mca_smu_mca_bank_dump(adev, i++, entry, qctx);
>
>               count = 0;
>               ret = amdgpu_mca_smu_parse_mca_error_count(adev, blk, type,
> entry, &count); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
> index b964110ed1e0..e5bf07ce3451 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
> @@ -169,6 +169,7 @@ void amdgpu_mca_smu_debugfs_init(struct
> amdgpu_device *adev, struct dentry *root  void
> amdgpu_mca_bank_set_init(struct mca_bank_set *mca_set);  int
> amdgpu_mca_bank_set_add_entry(struct mca_bank_set *mca_set, struct
> mca_bank_entry *entry);  void amdgpu_mca_bank_set_release(struct
> mca_bank_set *mca_set); -int amdgpu_mca_smu_log_ras_error(struct
> amdgpu_device *adev, enum amdgpu_ras_block blk, enum
> amdgpu_mca_error_type type, struct ras_err_data *err_data);
> +int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum
> amdgpu_ras_block blk, enum amdgpu_mca_error_type type,
> +                              struct ras_err_data *err_data, struct
> ras_query_context *qctx);
>
>  #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 8ebab6f22e5a..ef87f107c942 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1045,6 +1045,7 @@ static void amdgpu_ras_get_ecc_info(struct
> amdgpu_device *adev, struct ras_err_d  static void
> amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
>                                             struct ras_manager *ras_mgr,
>                                             struct ras_err_data *err_data,
> +                                           struct ras_query_context *qctx,
>                                             const char *blk_name,
>                                             bool is_ue,
>                                             bool is_de)
> @@ -1052,27 +1053,28 @@ static void
> amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
>       struct amdgpu_smuio_mcm_config_info *mcm_info;
>       struct ras_err_node *err_node;
>       struct ras_err_info *err_info;
> +     u64 event_id = qctx->event_id;
>
>       if (is_ue) {
>               for_each_ras_error(err_node, err_data) {
>                       err_info = &err_node->err_info;
>                       mcm_info = &err_info->mcm_info;
>                       if (err_info->ue_count) {
> -                             dev_info(adev->dev, "socket: %d, die: %d, "
> -                                      "%lld new uncorrectable hardware
> errors detected in %s block\n",
> -                                      mcm_info->socket_id,
> -                                      mcm_info->die_id,
> -                                      err_info->ue_count,
> -                                      blk_name);
> +                             RAS_EVENT_LOG(adev, event_id, "socket: %d,
> die: %d, "
> +                                           "%lld new uncorrectable hardware
> errors detected in %s block\n",
> +                                           mcm_info->socket_id,
> +                                           mcm_info->die_id,
> +                                           err_info->ue_count,
> +                                           blk_name);
>                       }
>               }
>
>               for_each_ras_error(err_node, &ras_mgr->err_data) {
>                       err_info = &err_node->err_info;
>                       mcm_info = &err_info->mcm_info;
> -                     dev_info(adev->dev, "socket: %d, die: %d, "
> -                              "%lld uncorrectable hardware errors detected
> in total in %s block\n",
> -                              mcm_info->socket_id, mcm_info->die_id,
> err_info->ue_count, blk_name);
> +                     RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, "
> +                                   "%lld uncorrectable hardware errors
> detected in total in %s block\n",
> +                                   mcm_info->socket_id, mcm_info->die_id,
> err_info->ue_count,
> +blk_name);
>               }
>
>       } else {
> @@ -1081,44 +1083,44 @@ static void
> amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
>                               err_info = &err_node->err_info;
>                               mcm_info = &err_info->mcm_info;
>                               if (err_info->de_count) {
> -                                     dev_info(adev->dev, "socket: %d,
> die: %d, "
> -                                             "%lld new deferred hardware
> errors detected in %s block\n",
> -                                             mcm_info->socket_id,
> -                                             mcm_info->die_id,
> -                                             err_info->de_count,
> -                                             blk_name);
> +                                     RAS_EVENT_LOG(adev, event_id,
> "socket: %d, die: %d, "
> +                                                   "%lld new deferred
> hardware errors detected in %s block\n",
> +                                                   mcm_info->socket_id,
> +                                                   mcm_info->die_id,
> +                                                   err_info->de_count,
> +                                                   blk_name);
>                               }
>                       }
>
>                       for_each_ras_error(err_node, &ras_mgr->err_data) {
>                               err_info = &err_node->err_info;
>                               mcm_info = &err_info->mcm_info;
> -                             dev_info(adev->dev, "socket: %d, die: %d, "
> -                                     "%lld deferred hardware errors
> detected in total in %s block\n",
> -                                     mcm_info->socket_id, mcm_info-
> >die_id,
> -                                     err_info->de_count, blk_name);
> +                             RAS_EVENT_LOG(adev, event_id, "socket: %d,
> die: %d, "
> +                                           "%lld deferred hardware errors
> detected in total in %s block\n",
> +                                           mcm_info->socket_id, mcm_info-
> >die_id,
> +                                           err_info->de_count, blk_name);
>                       }
>               } else {
>                       for_each_ras_error(err_node, err_data) {
>                               err_info = &err_node->err_info;
>                               mcm_info = &err_info->mcm_info;
>                               if (err_info->ce_count) {
> -                                     dev_info(adev->dev, "socket: %d,
> die: %d, "
> -                                             "%lld new correctable
> hardware errors detected in %s block\n",
> -                                             mcm_info->socket_id,
> -                                             mcm_info->die_id,
> -                                             err_info->ce_count,
> -                                             blk_name);
> +                                     RAS_EVENT_LOG(adev, event_id,
> "socket: %d, die: %d, "
> +                                                   "%lld new correctable
> hardware errors detected in %s block\n",
> +                                                   mcm_info->socket_id,
> +                                                   mcm_info->die_id,
> +                                                   err_info->ce_count,
> +                                                   blk_name);
>                               }
>                       }
>
>                       for_each_ras_error(err_node, &ras_mgr->err_data) {
>                               err_info = &err_node->err_info;
>                               mcm_info = &err_info->mcm_info;
> -                             dev_info(adev->dev, "socket: %d, die: %d, "
> -                                     "%lld correctable hardware errors
> detected in total in %s block\n",
> -                                     mcm_info->socket_id, mcm_info-
> >die_id,
> -                                     err_info->ce_count, blk_name);
> +                             RAS_EVENT_LOG(adev, event_id, "socket: %d,
> die: %d, "
> +                                           "%lld correctable hardware errors
> detected in total in %s block\n",
> +                                           mcm_info->socket_id, mcm_info-
> >die_id,
> +                                           err_info->ce_count, blk_name);
>                       }
>               }
>       }
> @@ -1131,77 +1133,79 @@ static inline bool err_data_has_source_info(struct
> ras_err_data *data)
>
>  static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
>                                            struct ras_query_if *query_if,
> -                                          struct ras_err_data *err_data)
> +                                          struct ras_err_data *err_data,
> +                                          struct ras_query_context *qctx)
>  {
>       struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if-
> >head);
>       const char *blk_name = get_ras_block_str(&query_if->head);
> +     u64 event_id = qctx->event_id;
>
>       if (err_data->ce_count) {
>               if (err_data_has_source_info(err_data)) {
> -                     amdgpu_ras_error_print_error_data(adev, ras_mgr,
> err_data,
> +                     amdgpu_ras_error_print_error_data(adev, ras_mgr,
> err_data, qctx,
>                                                         blk_name, false,
> false);
>               } else if (!adev->aid_mask &&
>                          adev->smuio.funcs &&
>                          adev->smuio.funcs->get_socket_id &&
>                          adev->smuio.funcs->get_die_id) {
> -                     dev_info(adev->dev, "socket: %d, die: %d "
> -                              "%ld correctable hardware errors "
> -                              "detected in %s block\n",
> -                              adev->smuio.funcs->get_socket_id(adev),
> -                              adev->smuio.funcs->get_die_id(adev),
> -                              ras_mgr->err_data.ce_count,
> -                              blk_name);
> +                     RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d "
> +                                   "%ld correctable hardware errors "
> +                                   "detected in %s block\n",
> +                                   adev->smuio.funcs->get_socket_id(adev),
> +                                   adev->smuio.funcs->get_die_id(adev),
> +                                   ras_mgr->err_data.ce_count,
> +                                   blk_name);
>               } else {
> -                     dev_info(adev->dev, "%ld correctable hardware errors "
> -                              "detected in %s block\n",
> -                              ras_mgr->err_data.ce_count,
> -                              blk_name);
> +                     RAS_EVENT_LOG(adev, event_id, "%ld correctable
> hardware errors "
> +                                   "detected in %s block\n",
> +                                   ras_mgr->err_data.ce_count,
> +                                   blk_name);
>               }
>       }
>
>       if (err_data->ue_count) {
>               if (err_data_has_source_info(err_data)) {
> -                     amdgpu_ras_error_print_error_data(adev, ras_mgr,
> err_data,
> +                     amdgpu_ras_error_print_error_data(adev, ras_mgr,
> err_data, qctx,
>                                                         blk_name, true,
> false);
>               } else if (!adev->aid_mask &&
>                          adev->smuio.funcs &&
>                          adev->smuio.funcs->get_socket_id &&
>                          adev->smuio.funcs->get_die_id) {
> -                     dev_info(adev->dev, "socket: %d, die: %d "
> -                              "%ld uncorrectable hardware errors "
> -                              "detected in %s block\n",
> -                              adev->smuio.funcs->get_socket_id(adev),
> -                              adev->smuio.funcs->get_die_id(adev),
> -                              ras_mgr->err_data.ue_count,
> -                              blk_name);
> +                     RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d "
> +                                   "%ld uncorrectable hardware errors "
> +                                   "detected in %s block\n",
> +                                   adev->smuio.funcs->get_socket_id(adev),
> +                                   adev->smuio.funcs->get_die_id(adev),
> +                                   ras_mgr->err_data.ue_count,
> +                                   blk_name);
>               } else {
> -                     dev_info(adev->dev, "%ld uncorrectable hardware errors
> "
> -                              "detected in %s block\n",
> -                              ras_mgr->err_data.ue_count,
> -                              blk_name);
> +                     RAS_EVENT_LOG(adev, event_id, "%ld uncorrectable
> hardware errors "
> +                                   "detected in %s block\n",
> +                                   ras_mgr->err_data.ue_count,
> +                                   blk_name);
>               }
>       }
>
>       if (err_data->de_count) {
>               if (err_data_has_source_info(err_data)) {
> -                     amdgpu_ras_error_print_error_data(adev, ras_mgr,
> err_data,
> +                     amdgpu_ras_error_print_error_data(adev, ras_mgr,
> err_data, qctx,
>                                                         blk_name, false,
> true);
>               } else if (!adev->aid_mask &&
>                          adev->smuio.funcs &&
>                          adev->smuio.funcs->get_socket_id &&
>                          adev->smuio.funcs->get_die_id) {
> -                     dev_info(adev->dev, "socket: %d, die: %d "
> -                              "%ld deferred hardware errors "
> -                              "detected in %s block\n",
> -                              adev->smuio.funcs->get_socket_id(adev),
> -                              adev->smuio.funcs->get_die_id(adev),
> -                              ras_mgr->err_data.de_count,
> -                              blk_name);
> +                     RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d "
> +                                   "%ld deferred hardware errors "
> +                                   "detected in %s block\n",
> +                                   adev->smuio.funcs->get_socket_id(adev),
> +                                   adev->smuio.funcs->get_die_id(adev),
> +                                   ras_mgr->err_data.de_count,
> +                                   blk_name);
>               } else {
> -                     dev_info(adev->dev, "%ld deferred hardware errors "
> -                              "detected in %s block\n",
> -                              ras_mgr->err_data.de_count,
> -                              blk_name);
> +                     RAS_EVENT_LOG(adev, event_id, "%ld deferred
> hardware errors "
> +                                   "detected in %s block\n",
> +                                   ras_mgr->err_data.de_count,
> +                                   blk_name);
>               }
>       }
>  }
> @@ -1294,6 +1298,7 @@ ssize_t amdgpu_ras_aca_sysfs_read(struct device
> *dev, struct device_attribute *a  static int
> amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
>                                               struct ras_query_if *info,
>                                               struct ras_err_data *err_data,
> +                                             struct ras_query_context
> *qctx,
>                                               unsigned int
> error_query_mode)
>  {
>       enum amdgpu_ras_block blk = info ? info->head.block :
> AMDGPU_RAS_BLOCK_COUNT; @@ -1338,8 +1343,8 @@ static int
> amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
>                               return ret;
>               } else {
>                       /* FIXME: add code to check return value later */
> -                     amdgpu_mca_smu_log_ras_error(adev, blk,
> AMDGPU_MCA_ERROR_TYPE_UE, err_data);
> -                     amdgpu_mca_smu_log_ras_error(adev, blk,
> AMDGPU_MCA_ERROR_TYPE_CE, err_data);
> +                     amdgpu_mca_smu_log_ras_error(adev, blk,
> AMDGPU_MCA_ERROR_TYPE_UE, err_data, qctx);
> +                     amdgpu_mca_smu_log_ras_error(adev, blk,
> AMDGPU_MCA_ERROR_TYPE_CE,
> +err_data, qctx);
>               }
>       }
>
> @@ -1351,6 +1356,7 @@ int amdgpu_ras_query_error_status(struct
> amdgpu_device *adev, struct ras_query_i  {
>       struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
>       struct ras_err_data err_data;
> +     struct ras_query_context qctx;
>       unsigned int error_query_mode;
>       int ret;
>
> @@ -1364,8 +1370,12 @@ int amdgpu_ras_query_error_status(struct
> amdgpu_device *adev, struct ras_query_i
>       if (!amdgpu_ras_get_error_query_mode(adev, &error_query_mode))
>               return -EINVAL;
>
> +     memset(&qctx, 0, sizeof(qctx));
> +     qctx.event_id = amdgpu_ras_acquire_event_id(adev,
> amdgpu_ras_intr_triggered() ?
> +                                                RAS_EVENT_TYPE_ISR :
> RAS_EVENT_TYPE_INVALID);
>       ret = amdgpu_ras_query_error_status_helper(adev, info,
>                                                  &err_data,
> +                                                &qctx,
>                                                  error_query_mode);
>       if (ret)
>               goto out_fini_err_data;
> @@ -1376,7 +1386,7 @@ int amdgpu_ras_query_error_status(struct
> amdgpu_device *adev, struct ras_query_i
>       info->ce_count = obj->err_data.ce_count;
>       info->de_count = obj->err_data.de_count;
>
> -     amdgpu_ras_error_generate_report(adev, info, &err_data);
> +     amdgpu_ras_error_generate_report(adev, info, &err_data, &qctx);
>
>  out_fini_err_data:
>       amdgpu_ras_error_data_fini(&err_data);
> @@ -3036,6 +3046,31 @@ static int amdgpu_get_ras_schema(struct
> amdgpu_device *adev)
>                       AMDGPU_RAS_ERROR__PARITY;
>  }
>
> +static void ras_event_mgr_init(struct ras_event_manager *mgr) {
> +     int i;
> +
> +     for (i = 0; i < ARRAY_SIZE(mgr->seqnos); i++)
> +             atomic64_set(&mgr->seqnos[i], 0);
> +}
> +
> +static void amdgpu_ras_event_mgr_init(struct amdgpu_device *adev) {
> +     struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> +     struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
> +
> +     ras->event_mgr = hive ? &hive->event_mgr : &ras->__event_mgr;
> +
> +     /* init event manager with node 0 on xgmi system */
> +     if (!amdgpu_in_reset(adev)) {
> +             if (!hive || adev->gmc.xgmi.node_id == 0)
> +                     ras_event_mgr_init(ras->event_mgr);
> +     }
> +
> +     if (hive)
> +             amdgpu_put_xgmi_hive(hive);
> +}
> +
>  int amdgpu_ras_init(struct amdgpu_device *adev)  {
>       struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -3356,6
> +3391,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
>       if (amdgpu_sriov_vf(adev))
>               return 0;
>
> +     amdgpu_ras_event_mgr_init(adev);
> +
>       if (amdgpu_aca_is_enabled(adev)) {
>               if (amdgpu_in_reset(adev))
>                       r = amdgpu_aca_reset(adev);
> @@ -3472,13 +3509,37 @@ void amdgpu_ras_set_fed(struct amdgpu_device
> *adev, bool status)
>               atomic_set(&ras->fed, !!status);
>  }
>
> +bool amdgpu_ras_event_id_is_valid(struct amdgpu_device *adev, u64 id) {
> +     return !(id & BIT_ULL(63));
> +}
> +
> +u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum
> +ras_event_type type) {
> +     struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> +     u64 id;
> +
> +     switch (type) {
> +     case RAS_EVENT_TYPE_ISR:
> +             id = (u64)atomic64_read(&ras->event_mgr->seqnos[type]);
> +             break;
> +     case RAS_EVENT_TYPE_INVALID:
> +     default:
> +             id = BIT_ULL(63) | 0ULL;
> +             break;
> +     }
> +
> +     return id;
> +}
> +
>  void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)  {
>       if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
>               struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> +             u64 event_id =
> +(u64)atomic64_inc_return(&ras->event_mgr->seqnos[RAS_EVENT_TYPE_ISR]);
>
> -             dev_info(adev->dev, "uncorrectable hardware error"
> -                     "(ERREVENT_ATHUB_INTERRUPT) detected!\n");
> +             RAS_EVENT_LOG(adev, event_id, "uncorrectable hardware
> error"
> +                           "(ERREVENT_ATHUB_INTERRUPT) detected!\n");
>
>               ras->gpu_reset_flags |=
> AMDGPU_RAS_GPU_RESET_MODE1_RESET;
>               amdgpu_ras_reset_gpu(adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index e0f8ce9d8440..64788ae7d85d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -64,6 +64,14 @@ struct amdgpu_iv_entry;
>  /* The high three bits indicates socketid */  #define
> AMDGPU_RAS_GET_FEATURES(val)  ((val) &
> ~AMDGPU_RAS_FEATURES_SOCKETID_MASK)
>
> +#define RAS_EVENT_LOG(adev, id, fmt, ...)                            \
> +do {                                                                 \
> +     if (amdgpu_ras_event_id_is_valid((adev), (id)))                 \
> +         dev_info((adev)->dev, "{%llu}" fmt, (id), ##__VA_ARGS__);   \
> +     else                                                            \
> +         dev_info((adev)->dev, fmt, ##__VA_ARGS__);                  \
> +} while (0)
> +
>  enum amdgpu_ras_block {
>       AMDGPU_RAS_BLOCK__UMC = 0,
>       AMDGPU_RAS_BLOCK__SDMA,
> @@ -419,6 +427,21 @@ struct umc_ecc_info {
>       int record_ce_addr_supported;
>  };
>
> +enum ras_event_type {
> +     RAS_EVENT_TYPE_INVALID = -1,
> +     RAS_EVENT_TYPE_ISR = 0,
> +     RAS_EVENT_TYPE_COUNT,
> +};
> +
> +struct ras_event_manager {
> +     atomic64_t seqnos[RAS_EVENT_TYPE_COUNT]; };
> +
> +struct ras_query_context {
> +     enum ras_event_type type;
> +     u64 event_id;
> +};
> +
>  struct amdgpu_ras {
>       /* ras infrastructure */
>       /* for ras itself. */
> @@ -479,6 +502,11 @@ struct amdgpu_ras {
>       atomic_t page_retirement_req_cnt;
>       /* Fatal error detected flag */
>       atomic_t fed;
> +
> +     /* RAS event manager */
> +     struct ras_event_manager __event_mgr;
> +     struct ras_event_manager *event_mgr;
> +
>  };
>
>  struct ras_fs_data {
> @@ -879,4 +907,6 @@ void amdgpu_ras_del_mca_err_addr(struct ras_err_info
> *err_info,  void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status);
> bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev);
>
> +bool amdgpu_ras_event_id_is_valid(struct amdgpu_device *adev, u64 id);
> +u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum
> +ras_event_type type);
>  #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> index 1592c63b3099..a3bfc16de6d4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> @@ -44,6 +44,7 @@ struct amdgpu_hive_info {
>
>       struct amdgpu_reset_domain *reset_domain;
>       atomic_t ras_recovery;
> +     struct ras_event_manager event_mgr;
>  };
>
>  struct amdgpu_pcs_ras_field {
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> index 77af4e25ff46..4a02e1f041da 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> @@ -404,10 +404,16 @@ static int umc_v12_0_err_cnt_init_per_channel(struct
> amdgpu_device *adev,  static void
> umc_v12_0_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
>                                       void *ras_error_status)
>  {
> +     struct ras_query_context qctx;
> +
> +     memset(&qctx, 0, sizeof(qctx));
> +     qctx.event_id = amdgpu_ras_acquire_event_id(adev,
> amdgpu_ras_intr_triggered() ?
> +                                                 RAS_EVENT_TYPE_ISR :
> RAS_EVENT_TYPE_INVALID);
> +
>       amdgpu_mca_smu_log_ras_error(adev,
> -             AMDGPU_RAS_BLOCK__UMC,
> AMDGPU_MCA_ERROR_TYPE_CE, ras_error_status);
> +             AMDGPU_RAS_BLOCK__UMC,
> AMDGPU_MCA_ERROR_TYPE_CE, ras_error_status,
> +&qctx);
>       amdgpu_mca_smu_log_ras_error(adev,
> -             AMDGPU_RAS_BLOCK__UMC,
> AMDGPU_MCA_ERROR_TYPE_UE, ras_error_status);
> +             AMDGPU_RAS_BLOCK__UMC,
> AMDGPU_MCA_ERROR_TYPE_UE, ras_error_status,
> +&qctx);
>  }
>
>  static void umc_v12_0_ecc_info_query_ras_error_address(struct
> amdgpu_device *adev,
> --
> 2.34.1



More information about the amd-gfx mailing list