[PATCH 05/14] drm/amdgpu: add amdgpu ras aca query interface

Zhou1, Tao Tao.Zhou1 at amd.com
Thu Jan 4 03:17:04 UTC 2024


[AMD Official Use Only - General]

We check debug mode to decide which path is currently selected. The legacy path is still needed even if the ACA framework is supported (when debug mode is enabled).

Regarding "it should help us to differentiate aca from legacy ras when implementing other features": is there a scenario where the aca flag is a must?

Regards,
Tao

> -----Original Message-----
> From: Zhang, Hawking <Hawking.Zhang at amd.com>
> Sent: Wednesday, January 3, 2024 8:00 PM
> To: Wang, Yang(Kevin) <KevinYang.Wang at amd.com>; amd-
> gfx at lists.freedesktop.org
> Cc: Zhou1, Tao <Tao.Zhou1 at amd.com>; Chai, Thomas <YiPeng.Chai at amd.com>
> Subject: RE: [PATCH 05/14] drm/amdgpu: add amdgpu ras aca query interface
>
> [AMD Official Use Only - General]
>
> I assume we are leveraging error_query_mode to differentiate aca path from
> legacy ras path, right?
>
> But given that in-band error reporting is just the start of the transition from
> legacy ras to aca, do we need a flag in amdgpu_aca to indicate whether aca is
> supported or not? Accordingly, we can initialize the flag in
> amdgpu_ras_check_supported. It should help us to differentiate aca from legacy
> ras when implementing other features. Thoughts?
>
> Regards,
> Hawking
>
> -----Original Message-----
> From: Wang, Yang(Kevin) <KevinYang.Wang at amd.com>
> Sent: Wednesday, January 3, 2024 16:02
> To: amd-gfx at lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Zhou1, Tao
> <Tao.Zhou1 at amd.com>; Chai, Thomas <YiPeng.Chai at amd.com>; Wang,
> Yang(Kevin) <KevinYang.Wang at amd.com>
> Subject: [PATCH 05/14] drm/amdgpu: add amdgpu ras aca query interface
>
> Use the new ACA error query interface instead of the legacy MCA query.
>
> Signed-off-by: Yang Wang <kevinyang.wang at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 88 ++++++++++++++++++++-----
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 12 +++-
>  2 files changed, 79 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 038bd1b17cef..bbae41f86e00 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1168,6 +1168,53 @@ static void
> amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, s
>         }
>  }
>
> +static struct ras_manager *get_ras_manager(struct amdgpu_device *adev,
> +enum amdgpu_ras_block blk) {
> +       struct ras_common_if head;
> +
> +       memset(&head, 0, sizeof(head));
> +       head.block = blk;
> +
> +       return amdgpu_ras_find_obj(adev, &head); }
> +
> +int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum
> amdgpu_ras_block blk,
> +                       const struct aca_info *aca_info, void *data) {
> +       struct ras_manager *obj;
> +
> +       obj = get_ras_manager(adev, blk);
> +       if (!obj)
> +               return -EINVAL;
> +
> +       return amdgpu_aca_add_handle(adev, &obj->aca_handle,
> +ras_block_str(blk), aca_info, data); }
> +
> +int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum
> +amdgpu_ras_block blk) {
> +       struct ras_manager *obj;
> +
> +       obj = get_ras_manager(adev, blk);
> +       if (!obj)
> +               return -EINVAL;
> +
> +       amdgpu_aca_remove_handle(&obj->aca_handle);
> +
> +       return 0;
> +}
> +
> +static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum
> amdgpu_ras_block blk,
> +                                        enum aca_error_type type, struct ras_err_data *err_data)
> {
> +       struct ras_manager *obj;
> +
> +       obj = get_ras_manager(adev, blk);
> +       if (!obj)
> +               return -EINVAL;
> +
> +       return amdgpu_aca_get_error_data(adev, &obj->aca_handle, type,
> +err_data); }
> +
>  static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
>                                                 struct ras_query_if *info,
>                                                 struct ras_err_data *err_data, @@ -1175,6 +1222,7
> @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device
> *adev,  {
>         enum amdgpu_ras_block blk = info ? info->head.block :
> AMDGPU_RAS_BLOCK_COUNT;
>         struct amdgpu_ras_block_object *block_obj = NULL;
> +       int ret;
>
>         if (blk == AMDGPU_RAS_BLOCK_COUNT)
>                 return -EINVAL;
> @@ -1204,9 +1252,13 @@ static int
> amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
>                         }
>                 }
>         } else {
> -               /* FIXME: add code to check return value later */
> -               amdgpu_mca_smu_log_ras_error(adev, blk,
> AMDGPU_MCA_ERROR_TYPE_UE, err_data);
> -               amdgpu_mca_smu_log_ras_error(adev, blk,
> AMDGPU_MCA_ERROR_TYPE_CE, err_data);
> +               ret = amdgpu_aca_log_ras_error_data(adev, blk,
> ACA_ERROR_TYPE_UE, err_data);
> +               if (ret)
> +                       return ret;
> +
> +               ret = amdgpu_aca_log_ras_error_data(adev, blk,
> ACA_ERROR_TYPE_CE, err_data);
> +               if (ret)
> +                       return ret;
>         }
>
>         return 0;
> @@ -1254,7 +1306,7 @@ int amdgpu_ras_reset_error_count(struct
> amdgpu_device *adev,  {
>         struct amdgpu_ras_block_object *block_obj =
> amdgpu_ras_get_ras_block(adev, block, 0);
>         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> -       const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
> +       const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs;
>         struct amdgpu_hive_info *hive;
>         int hive_ras_recovery = 0;
>
> @@ -1265,7 +1317,7 @@ int amdgpu_ras_reset_error_count(struct
> amdgpu_device *adev,
>         }
>
>         if (!amdgpu_ras_is_supported(adev, block) ||
> -           !amdgpu_ras_get_mca_debug_mode(adev))
> +           !amdgpu_ras_get_aca_debug_mode(adev))
>                 return -EOPNOTSUPP;
>
>         hive = amdgpu_get_xgmi_hive(adev); @@ -1277,7 +1329,7 @@ int
> amdgpu_ras_reset_error_count(struct amdgpu_device *adev,
>         /* skip ras error reset in gpu reset */
>         if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery) ||
>             hive_ras_recovery) &&
> -           mca_funcs && mca_funcs->mca_set_debug_mode)
> +           smu_funcs && smu_funcs->set_debug_mode)
>                 return -EOPNOTSUPP;
>
>         if (block_obj->hw_ops->reset_ras_error_count)
> @@ -1773,7 +1825,7 @@ void amdgpu_ras_debugfs_create_all(struct
> amdgpu_device *adev)
>                 }
>         }
>
> -       amdgpu_mca_smu_debugfs_init(adev, dir);
> +       amdgpu_aca_smu_debugfs_init(adev, dir);
>  }
>
>  /* debugfs end */
> @@ -3138,8 +3190,8 @@ int amdgpu_ras_late_init(struct amdgpu_device
> *adev)
>         if (amdgpu_sriov_vf(adev))
>                 return 0;
>
> -       /* enable MCA debug on APU device */
> -       amdgpu_ras_set_mca_debug_mode(adev, !!(adev->flags & AMD_IS_APU));
> +       /* enable ACA debug on APU device */
> +       amdgpu_ras_set_aca_debug_mode(adev, !!(adev->flags &
> + AMD_IS_APU));
>
>         list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
>                 if (!node->ras_obj) {
> @@ -3422,7 +3474,7 @@ int amdgpu_ras_set_mca_debug_mode(struct
> amdgpu_device *adev, bool enable)
>         if (con) {
>                 ret = amdgpu_mca_smu_set_debug_mode(adev, enable);
>                 if (!ret)
> -                       con->is_mca_debug_mode = enable;
> +                       con->is_aca_debug_mode = enable;
>         }
>
>         return ret;
> @@ -3436,22 +3488,22 @@ int amdgpu_ras_set_aca_debug_mode(struct
> amdgpu_device *adev, bool enable)
>         if (con) {
>                 ret = amdgpu_aca_smu_set_debug_mode(adev, enable);
>                 if (!ret)
> -                       con->is_mca_debug_mode = enable;
> +                       con->is_aca_debug_mode = enable;
>         }
>
>         return ret;
>  }
>
> -bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev)
> +bool amdgpu_ras_get_aca_debug_mode(struct amdgpu_device *adev)
>  {
>         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> -       const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
> +       const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs;
>
>         if (!con)
>                 return false;
>
> -       if (mca_funcs && mca_funcs->mca_set_debug_mode)
> -               return con->is_mca_debug_mode;
> +       if (smu_funcs && smu_funcs->set_debug_mode)
> +               return con->is_aca_debug_mode;
>         else
>                 return true;
>  }
> @@ -3460,16 +3512,16 @@ bool amdgpu_ras_get_error_query_mode(struct
> amdgpu_device *adev,
>                                      unsigned int *error_query_mode)  {
>         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> -       const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
> +       const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs;
>
>         if (!con) {
>                 *error_query_mode = AMDGPU_RAS_INVALID_ERROR_QUERY;
>                 return false;
>         }
>
> -       if (mca_funcs && mca_funcs->mca_set_debug_mode)
> +       if (smu_funcs && smu_funcs->set_debug_mode)
>                 *error_query_mode =
> -                       (con->is_mca_debug_mode) ?
> AMDGPU_RAS_DIRECT_ERROR_QUERY :
> AMDGPU_RAS_FIRMWARE_ERROR_QUERY;
> +                       (con->is_aca_debug_mode) ?
> AMDGPU_RAS_DIRECT_ERROR_QUERY :
> +AMDGPU_RAS_FIRMWARE_ERROR_QUERY;
>         else
>                 *error_query_mode = AMDGPU_RAS_DIRECT_ERROR_QUERY;
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 408e21c3cc88..2afac9aa381a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -442,7 +442,7 @@ struct amdgpu_ras {
>         /* Indicates smu whether need update bad channel info */
>         bool update_channel_flag;
>         /* Record status of smu mca debug mode */
> -       bool is_mca_debug_mode;
> +       bool is_aca_debug_mode;
>
>         /* Record special requirements of gpu reset caller */
>         uint32_t  gpu_reset_flags;
> @@ -530,6 +530,8 @@ struct ras_manager {
>         struct ras_ih_data ih_data;
>
>         struct ras_err_data err_data;
> +
> +       struct aca_handle aca_handle;
>  };
>
>  struct ras_badpage {
> @@ -781,9 +783,9 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct
> amdgpu_device *adev);
>
>  int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras
> *ras_con);
>
> -int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool
> enable);  int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev,
> bool enable); -bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device
> *adev);
> +int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool
> +enable); bool amdgpu_ras_get_aca_debug_mode(struct amdgpu_device
> +*adev);
>  bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
>                                      unsigned int *mode);
>
> @@ -821,4 +823,8 @@ int amdgpu_ras_error_statistic_ue_count(struct
> ras_err_data *err_data,
>                 struct amdgpu_smuio_mcm_config_info *mcm_info,
>                 struct ras_err_addr *err_addr, u64 count);
>
> +int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum
> amdgpu_ras_block blk,
> +                              const struct aca_info *aca_info, void
> +*data); int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum
> +amdgpu_ras_block blk);
> +
>  #endif
> --
> 2.34.1
>



More information about the amd-gfx mailing list