[PATCH 4/4] drm/amdgpu: avoid dump mca bank log multiple times during ras ISR

Zhou1, Tao Tao.Zhou1 at amd.com
Thu Apr 25 08:30:44 UTC 2024


[AMD Official Use Only - General]

> -----Original Message-----
> From: Wang, Yang(Kevin) <KevinYang.Wang at amd.com>
> Sent: Tuesday, April 23, 2024 4:27 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>; Zhou1, Tao
> <Tao.Zhou1 at amd.com>; Li, Candice <Candice.Li at amd.com>
> Subject: [PATCH 4/4] drm/amdgpu: avoid dump mca bank log multiple times during
> ras ISR
>
> Because the UE valid MCA count is only cleared after a GPU reset, dump the
> MCA log only the first time the MCA banks are fetched after a RAS interrupt.
>
> Signed-off-by: Yang Wang <kevinyang.wang at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 28
> +++++++++++++++++++++++++  drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h |
> 1 +
>  2 files changed, 29 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
> index 264f56fd4f66..b581523fa8d7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
> @@ -229,6 +229,8 @@ int amdgpu_mca_init(struct amdgpu_device *adev)
>       struct mca_bank_cache *mca_cache;
>       int i;
>
> +     atomic_set(&mca->ue_update_flag, 0);
> +
>       for (i = 0; i < ARRAY_SIZE(mca->mca_caches); i++) {
>               mca_cache = &mca->mca_caches[i];
>               mutex_init(&mca_cache->lock);
> @@ -244,6 +246,8 @@ void amdgpu_mca_fini(struct amdgpu_device *adev)
>       struct mca_bank_cache *mca_cache;
>       int i;
>
> +     atomic_set(&mca->ue_update_flag, 0);
> +
>       for (i = 0; i < ARRAY_SIZE(mca->mca_caches); i++) {
>               mca_cache = &mca->mca_caches[i];
>               amdgpu_mca_bank_set_release(&mca_cache->mca_set);
> @@ -325,6 +329,27 @@ static int amdgpu_mca_smu_get_mca_entry(struct
> amdgpu_device *adev, enum amdgpu_
>       return mca_funcs->mca_get_mca_entry(adev, type, idx, entry);  }
>
> +static bool amdgpu_mca_bank_should_update(struct amdgpu_device *adev,
> +enum amdgpu_mca_error_type type) {
> +     struct amdgpu_mca *mca = &adev->mca;
> +     bool ret = true;
> +
> +     /*
> +      * Because the UE Valid MCA count will only be cleared after reset,
> +      * in order to avoid repeated counting of the error count,
> +      * the aca bank is only updated once during the gpu recovery stage.
> +      */
> +     if (type == AMDGPU_MCA_ERROR_TYPE_UE) {
> +             if (amdgpu_ras_intr_triggered())
> +                     ret = atomic_cmpxchg(&mca->ue_update_flag, 0, 1) ==
> 0;
> +             else
> +                     atomic_set(&mca->ue_update_flag, 0);
> +     }
> +
> +     return ret;
> +}
> +
> +

[Tao] This second blank line is redundant; with it removed, the patch is:

Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>

>  static int amdgpu_mca_smu_get_mca_set(struct amdgpu_device *adev, enum
> amdgpu_mca_error_type type, struct mca_bank_set *mca_set,
>                                     struct ras_query_context *qctx)  { @@ -
> 335,6 +360,9 @@ static int amdgpu_mca_smu_get_mca_set(struct
> amdgpu_device *adev, enum amdgpu_mc
>       if (!mca_set)
>               return -EINVAL;
>
> +     if (!amdgpu_mca_bank_should_update(adev, type))
> +             return 0;
> +
>       ret = amdgpu_mca_smu_get_valid_mca_count(adev, type, &count);
>       if (ret)
>               return ret;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
> index 9b97cfa28e05..e80323ff90c1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h
> @@ -93,6 +93,7 @@ struct amdgpu_mca {
>       struct amdgpu_mca_ras mpio;
>       const struct amdgpu_mca_smu_funcs *mca_funcs;
>       struct mca_bank_cache mca_caches[AMDGPU_MCA_ERROR_TYPE_DE];
> +     atomic_t ue_update_flag;
>  };
>
>  enum mca_reg_idx {
> --
> 2.34.1



More information about the amd-gfx mailing list