[PATCH 3/3] drm/amdgpu: add RAS support for VML2 and ATCL2

Alex Deucher alexdeucher at gmail.com
Fri Oct 11 14:12:10 UTC 2019


On Thu, Oct 10, 2019 at 10:50 PM Dennis Li <Dennis.Li at amd.com> wrote:
>
> Add code to query the EDC counts of VML2 & ATCL2
>
> Change-Id: If2c251481ba0a1a34ce3405a85f86d65eecee461
> Signed-off-by: Dennis Li <Dennis.Li at amd.com>

Series is:
Acked-by: Alex Deucher <alexander.deucher at amd.com>

> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 167 ++++++++++++++++++++++++++
>  1 file changed, 167 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 2a95093b85a5..22be6177938e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -6152,6 +6152,171 @@ static int gfx_v9_0_ras_error_inject(struct amdgpu_device *adev,
>         return ret;
>  }
>
> +/*
> + * Names of the VM L2 bank-cache memories, printed when a non-zero EDC
> + * count is found.  gfx_v9_0_query_utc_edc_status() indexes this table
> + * with the value it writes to mmVM_L2_MEM_ECC_INDEX, so the order of the
> + * entries must stay in step with that index.
> + */
> +static const char * const vml2_mems[] = {
> +       "UTC_VML2_BANK_CACHE_0_BIGK_MEM0",
> +       "UTC_VML2_BANK_CACHE_0_BIGK_MEM1",
> +       "UTC_VML2_BANK_CACHE_0_4K_MEM0",
> +       "UTC_VML2_BANK_CACHE_0_4K_MEM1",
> +       "UTC_VML2_BANK_CACHE_1_BIGK_MEM0",
> +       "UTC_VML2_BANK_CACHE_1_BIGK_MEM1",
> +       "UTC_VML2_BANK_CACHE_1_4K_MEM0",
> +       "UTC_VML2_BANK_CACHE_1_4K_MEM1",
> +       "UTC_VML2_BANK_CACHE_2_BIGK_MEM0",
> +       "UTC_VML2_BANK_CACHE_2_BIGK_MEM1",
> +       "UTC_VML2_BANK_CACHE_2_4K_MEM0",
> +       "UTC_VML2_BANK_CACHE_2_4K_MEM1",
> +       "UTC_VML2_BANK_CACHE_3_BIGK_MEM0",
> +       "UTC_VML2_BANK_CACHE_3_BIGK_MEM1",
> +       "UTC_VML2_BANK_CACHE_3_4K_MEM0",
> +       "UTC_VML2_BANK_CACHE_3_4K_MEM1",
> +};
> +
> +/*
> + * Names of the VM L2 walker memories, printed when a non-zero EDC count
> + * is found.  gfx_v9_0_query_utc_edc_status() indexes this table with the
> + * value it writes to mmVM_L2_WALKER_MEM_ECC_INDEX, so the order of the
> + * entries must stay in step with that index.
> + */
> +static const char * const vml2_walker_mems[] = {
> +       "UTC_VML2_CACHE_PDE0_MEM0",
> +       "UTC_VML2_CACHE_PDE0_MEM1",
> +       "UTC_VML2_CACHE_PDE1_MEM0",
> +       "UTC_VML2_CACHE_PDE1_MEM1",
> +       "UTC_VML2_CACHE_PDE2_MEM0",
> +       "UTC_VML2_CACHE_PDE2_MEM1",
> +       "UTC_VML2_RDIF_LOG_FIFO",
> +};
> +
> +/*
> + * Names of the ATC L2 2M-cache memories, printed when a non-zero EDC
> + * count is found.  gfx_v9_0_query_utc_edc_status() indexes this table
> + * with the value it writes to mmATC_L2_CACHE_2M_EDC_INDEX, so the order
> + * of the entries must stay in step with that index.
> + */
> +static const char * const atc_l2_cache_2m_mems[] = {
> +       "UTC_ATCL2_CACHE_2M_BANK0_WAY0_MEM",
> +       "UTC_ATCL2_CACHE_2M_BANK0_WAY1_MEM",
> +       "UTC_ATCL2_CACHE_2M_BANK1_WAY0_MEM",
> +       "UTC_ATCL2_CACHE_2M_BANK1_WAY1_MEM",
> +};
> +
> +/*
> + * Names of the ATC L2 4K-cache memories, printed when a non-zero EDC
> + * count is found.  gfx_v9_0_query_utc_edc_status() indexes this table
> + * with the value it writes to mmATC_L2_CACHE_4K_EDC_INDEX, so the order
> + * of the entries must stay in step with that index.
> + */
> +static const char * const atc_l2_cache_4k_mems[] = {
> +       "UTC_ATCL2_CACHE_4K_BANK0_WAY0_MEM0",
> +       "UTC_ATCL2_CACHE_4K_BANK0_WAY0_MEM1",
> +       "UTC_ATCL2_CACHE_4K_BANK0_WAY0_MEM2",
> +       "UTC_ATCL2_CACHE_4K_BANK0_WAY0_MEM3",
> +       "UTC_ATCL2_CACHE_4K_BANK0_WAY0_MEM4",
> +       "UTC_ATCL2_CACHE_4K_BANK0_WAY0_MEM5",
> +       "UTC_ATCL2_CACHE_4K_BANK0_WAY0_MEM6",
> +       "UTC_ATCL2_CACHE_4K_BANK0_WAY0_MEM7",
> +       "UTC_ATCL2_CACHE_4K_BANK0_WAY1_MEM0",
> +       "UTC_ATCL2_CACHE_4K_BANK0_WAY1_MEM1",
> +       "UTC_ATCL2_CACHE_4K_BANK0_WAY1_MEM2",
> +       "UTC_ATCL2_CACHE_4K_BANK0_WAY1_MEM3",
> +       "UTC_ATCL2_CACHE_4K_BANK0_WAY1_MEM4",
> +       "UTC_ATCL2_CACHE_4K_BANK0_WAY1_MEM5",
> +       "UTC_ATCL2_CACHE_4K_BANK0_WAY1_MEM6",
> +       "UTC_ATCL2_CACHE_4K_BANK0_WAY1_MEM7",
> +       "UTC_ATCL2_CACHE_4K_BANK1_WAY0_MEM0",
> +       "UTC_ATCL2_CACHE_4K_BANK1_WAY0_MEM1",
> +       "UTC_ATCL2_CACHE_4K_BANK1_WAY0_MEM2",
> +       "UTC_ATCL2_CACHE_4K_BANK1_WAY0_MEM3",
> +       "UTC_ATCL2_CACHE_4K_BANK1_WAY0_MEM4",
> +       "UTC_ATCL2_CACHE_4K_BANK1_WAY0_MEM5",
> +       "UTC_ATCL2_CACHE_4K_BANK1_WAY0_MEM6",
> +       "UTC_ATCL2_CACHE_4K_BANK1_WAY0_MEM7",
> +       "UTC_ATCL2_CACHE_4K_BANK1_WAY1_MEM0",
> +       "UTC_ATCL2_CACHE_4K_BANK1_WAY1_MEM1",
> +       "UTC_ATCL2_CACHE_4K_BANK1_WAY1_MEM2",
> +       "UTC_ATCL2_CACHE_4K_BANK1_WAY1_MEM3",
> +       "UTC_ATCL2_CACHE_4K_BANK1_WAY1_MEM4",
> +       "UTC_ATCL2_CACHE_4K_BANK1_WAY1_MEM5",
> +       "UTC_ATCL2_CACHE_4K_BANK1_WAY1_MEM6",
> +       "UTC_ATCL2_CACHE_4K_BANK1_WAY1_MEM7",
> +};
> +
> +/*
> + * gfx_v9_0_query_utc_edc_status - add VML2/ATCL2 EDC counts to err_data
> + * @adev: amdgpu device to query
> + * @err_data: counters to update; every SEC count read is added to
> + *            ce_count and every DED count to ue_count
> + *
> + * Walks four groups of memories (VM L2, VM L2 walker, ATC L2 2M cache and
> + * ATC L2 4K cache).  For each sub-block the group's INDEX register is
> + * written with the sub-block number, the matching CNT register is read,
> + * and any non-zero count is logged together with the sub-block name.
> + *
> + * Each loop is bounded by the size of the name table it indexes, so the
> + * tables and the loops cannot drift apart.
> + *
> + * Always returns 0.
> + */
> +static int gfx_v9_0_query_utc_edc_status(struct amdgpu_device *adev,
> +                                        struct ras_err_data *err_data)
> +{
> +       uint32_t i, data;
> +       uint32_t sec_count, ded_count;
> +
> +       /*
> +        * Before sampling, write 255 to every INDEX register and 0 to the
> +        * matching CNT register.
> +        * NOTE(review): the meaning of index 255 is not visible here -
> +        * confirm against the register specification.
> +        */
> +       WREG32_SOC15(GC, 0, mmVM_L2_MEM_ECC_INDEX, 255);
> +       WREG32_SOC15(GC, 0, mmVM_L2_MEM_ECC_CNT, 0);
> +       WREG32_SOC15(GC, 0, mmVM_L2_WALKER_MEM_ECC_INDEX, 255);
> +       WREG32_SOC15(GC, 0, mmVM_L2_WALKER_MEM_ECC_CNT, 0);
> +       WREG32_SOC15(GC, 0, mmATC_L2_CACHE_2M_EDC_INDEX, 255);
> +       WREG32_SOC15(GC, 0, mmATC_L2_CACHE_2M_EDC_CNT, 0);
> +       WREG32_SOC15(GC, 0, mmATC_L2_CACHE_4K_EDC_INDEX, 255);
> +       WREG32_SOC15(GC, 0, mmATC_L2_CACHE_4K_EDC_CNT, 0);
> +
> +       /* VM L2 bank caches */
> +       for (i = 0; i < ARRAY_SIZE(vml2_mems); i++) {
> +               WREG32_SOC15(GC, 0, mmVM_L2_MEM_ECC_INDEX, i);
> +               data = RREG32_SOC15(GC, 0, mmVM_L2_MEM_ECC_CNT);
> +
> +               sec_count = REG_GET_FIELD(data, VM_L2_MEM_ECC_CNT, SEC_COUNT);
> +               if (sec_count) {
> +                       DRM_INFO("Instance[%d]: SubBlock %s, SEC %d\n", i,
> +                                vml2_mems[i], sec_count);
> +                       err_data->ce_count += sec_count;
> +               }
> +
> +               ded_count = REG_GET_FIELD(data, VM_L2_MEM_ECC_CNT, DED_COUNT);
> +               if (ded_count) {
> +                       DRM_INFO("Instance[%d]: SubBlock %s, DED %d\n", i,
> +                                vml2_mems[i], ded_count);
> +                       err_data->ue_count += ded_count;
> +               }
> +       }
> +
> +       /* VM L2 walker memories */
> +       for (i = 0; i < ARRAY_SIZE(vml2_walker_mems); i++) {
> +               WREG32_SOC15(GC, 0, mmVM_L2_WALKER_MEM_ECC_INDEX, i);
> +               data = RREG32_SOC15(GC, 0, mmVM_L2_WALKER_MEM_ECC_CNT);
> +
> +               sec_count = REG_GET_FIELD(data, VM_L2_WALKER_MEM_ECC_CNT,
> +                                               SEC_COUNT);
> +               if (sec_count) {
> +                       DRM_INFO("Instance[%d]: SubBlock %s, SEC %d\n", i,
> +                                vml2_walker_mems[i], sec_count);
> +                       err_data->ce_count += sec_count;
> +               }
> +
> +               ded_count = REG_GET_FIELD(data, VM_L2_WALKER_MEM_ECC_CNT,
> +                                               DED_COUNT);
> +               if (ded_count) {
> +                       DRM_INFO("Instance[%d]: SubBlock %s, DED %d\n", i,
> +                                vml2_walker_mems[i], ded_count);
> +                       err_data->ue_count += ded_count;
> +               }
> +       }
> +
> +       /* ATC L2 2M cache: only a SEC count is extracted for this group */
> +       for (i = 0; i < ARRAY_SIZE(atc_l2_cache_2m_mems); i++) {
> +               WREG32_SOC15(GC, 0, mmATC_L2_CACHE_2M_EDC_INDEX, i);
> +               data = RREG32_SOC15(GC, 0, mmATC_L2_CACHE_2M_EDC_CNT);
> +
> +               /* SEC count is taken from bits 14:13 of the CNT value */
> +               sec_count = (data & 0x00006000L) >> 0xd;
> +               if (sec_count) {
> +                       DRM_INFO("Instance[%d]: SubBlock %s, SEC %d\n", i,
> +                                atc_l2_cache_2m_mems[i], sec_count);
> +                       err_data->ce_count += sec_count;
> +               }
> +       }
> +
> +       /* ATC L2 4K cache */
> +       for (i = 0; i < ARRAY_SIZE(atc_l2_cache_4k_mems); i++) {
> +               WREG32_SOC15(GC, 0, mmATC_L2_CACHE_4K_EDC_INDEX, i);
> +               data = RREG32_SOC15(GC, 0, mmATC_L2_CACHE_4K_EDC_CNT);
> +
> +               /* SEC count is taken from bits 14:13 of the CNT value */
> +               sec_count = (data & 0x00006000L) >> 0xd;
> +               if (sec_count) {
> +                       DRM_INFO("Instance[%d]: SubBlock %s, SEC %d\n", i,
> +                                atc_l2_cache_4k_mems[i], sec_count);
> +                       err_data->ce_count += sec_count;
> +               }
> +
> +               /* DED count is taken from bits 16:15 of the CNT value */
> +               ded_count = (data & 0x00018000L) >> 0xf;
> +               if (ded_count) {
> +                       DRM_INFO("Instance[%d]: SubBlock %s, DED %d\n", i,
> +                                atc_l2_cache_4k_mems[i], ded_count);
> +                       err_data->ue_count += ded_count;
> +               }
> +       }
> +
> +       /* Leave every INDEX register at 255 again once sampling is done */
> +       WREG32_SOC15(GC, 0, mmVM_L2_MEM_ECC_INDEX, 255);
> +       WREG32_SOC15(GC, 0, mmVM_L2_WALKER_MEM_ECC_INDEX, 255);
> +       WREG32_SOC15(GC, 0, mmATC_L2_CACHE_2M_EDC_INDEX, 255);
> +       WREG32_SOC15(GC, 0, mmATC_L2_CACHE_4K_EDC_INDEX, 255);
> +
> +       return 0;
> +}
> +
>  static int __get_ras_error_count(const struct soc15_reg_entry *reg,
>         uint32_t se_id, uint32_t inst_id, uint32_t value,
>         uint32_t *sec_count, uint32_t *ded_count)
> @@ -6226,6 +6391,8 @@ static int gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
>         gfx_v9_0_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
>         mutex_unlock(&adev->grbm_idx_mutex);
>
> +       gfx_v9_0_query_utc_edc_status(adev, err_data);
> +
>         return 0;
>  }
>
> --
> 2.17.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


More information about the amd-gfx mailing list