[PATCH] drm/amdgpu: handle extra UE register entries for gfx v9_4_3

Tue Oct 31 11:02:16 UTC 2023

[AMD Official Use Only - General]

Would it be better to handle the CE and UE lists separately?
Anyway Reviewed-by: Stanley.Yang <Stanley.Yang at amd.com>

Regards,
Stanley
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of Tao
> Zhou
> Sent: Tuesday, October 31, 2023 3:09 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Chai, Thomas <YiPeng.Chai at amd.com>; Zhou1, Tao
> <Tao.Zhou1 at amd.com>
> Subject: [PATCH] drm/amdgpu: handle extra UE register entries for gfx v9_4_3
>
> The UE register list is larger than the CE list.
>
> Reported-by: yipeng.chai at amd.com
> Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 38
> +++++++++++++++++++++++++
>  1 file changed, 38 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> index 41bbabd9ad4d..046ae95b366a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
> @@ -3799,6 +3799,27 @@ static void
> gfx_v9_4_3_inst_query_ras_err_count(struct amdgpu_device *adev,
>               }
>       }
>
> +     /* handle extra register entries of UE */
> +     for (; i < ARRAY_SIZE(gfx_v9_4_3_ue_reg_list); i++) {
> +             for (j = 0; j < gfx_v9_4_3_ue_reg_list[i].se_num; j++) {
> +                     for (k = 0; k <
> gfx_v9_4_3_ue_reg_list[i].reg_entry.reg_inst; k++) {
> +                             /* no need to select if instance number is 1 */
> +                             if (gfx_v9_4_3_ue_reg_list[i].se_num > 1 ||
> +
>       gfx_v9_4_3_ue_reg_list[i].reg_entry.reg_inst > 1)
> +                                     gfx_v9_4_3_xcc_select_se_sh(adev, j,
> 0, k, xcc_id);
> +
> +
>       amdgpu_ras_inst_query_ras_error_count(adev,
> +
>       &(gfx_v9_4_3_ue_reg_list[i].reg_entry),
> +                                     1,
> +
>       gfx_v9_4_3_ras_mem_list_array[gfx_v9_4_3_ue_reg_list[i].mem_id_t
> ype].mem_id_ent,
> +
>       gfx_v9_4_3_ras_mem_list_array[gfx_v9_4_3_ue_reg_list[i].mem_id_t
> ype].size,
> +                                     GET_INST(GC, xcc_id),
> +
>       AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
> +                                     &ue_count);
> +                     }
> +             }
> +     }
> +
>       gfx_v9_4_3_xcc_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff,
>                       xcc_id);
>       mutex_unlock(&adev->grbm_idx_mutex);
> @@ -3838,6 +3859,23 @@ static void
> gfx_v9_4_3_inst_reset_ras_err_count(struct amdgpu_device *adev,
>               }
>       }
>
> +     /* handle extra register entries of UE */
> +     for (; i < ARRAY_SIZE(gfx_v9_4_3_ue_reg_list); i++) {
> +             for (j = 0; j < gfx_v9_4_3_ue_reg_list[i].se_num; j++) {
> +                     for (k = 0; k <
> gfx_v9_4_3_ue_reg_list[i].reg_entry.reg_inst; k++) {
> +                             /* no need to select if instance number is 1 */
> +                             if (gfx_v9_4_3_ue_reg_list[i].se_num > 1 ||
> +
>       gfx_v9_4_3_ue_reg_list[i].reg_entry.reg_inst > 1)
> +                                     gfx_v9_4_3_xcc_select_se_sh(adev, j,
> 0, k, xcc_id);
> +
> +
>       amdgpu_ras_inst_reset_ras_error_count(adev,
> +
>       &(gfx_v9_4_3_ue_reg_list[i].reg_entry),
> +                                     1,
> +                                     GET_INST(GC, xcc_id));
> +                     }
> +             }
> +     }
> +
>       gfx_v9_4_3_xcc_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff,
>                       xcc_id);
>       mutex_unlock(&adev->grbm_idx_mutex);
> --
> 2.35.1