[PATCH 4/4] drm/amdgpu: read sdma edc counter to clear the counters
Alex Deucher
alexdeucher at gmail.com
Wed Jan 8 16:27:25 UTC 2020
On Wed, Jan 8, 2020 at 11:18 AM Hawking Zhang <Hawking.Zhang at amd.com> wrote:
>
> SDMA edc counter registers were added to the gfx edc counter
> array. When querying gfx error counters in that array, there
> is no way to differentiate the number of sdma instances on
> different asics, which results in a NULL pointer access when
> trying to read the sdma register base address for instances
> greater than 2 on Vega20.
> In addition, this also results in wrong gfx error counters,
> since the sdma edc counters are actually added to them.
> Therefore, sdma edc counter registers should be separated
> from the gfx edc counter register array and only be initialized
> when the driver tries to enable sdma ras.
>
> Change-Id: I206917f9d7b81670a8fed84dc749085ce5a6f678
> Signed-off-by: Hawking Zhang <Hawking.Zhang at amd.com>
Reviewed-by: Alex Deucher <alexander.deucher at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 11 +----------
> drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 7 +++++++
> 2 files changed, 8 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 33d1c57aaaf1..c9ade16bbcc3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -4038,14 +4038,6 @@ static const struct soc15_reg_entry sec_ded_counter_registers[] = {
> { SOC15_REG_ENTRY(GC, 0, mmTCA_EDC_CNT), 0, 1, 2},
> { SOC15_REG_ENTRY(GC, 0, mmSQC_EDC_CNT3), 0, 4, 6},
> { SOC15_REG_ENTRY(HDP, 0, mmHDP_EDC_CNT), 0, 1, 1},
> - { SOC15_REG_ENTRY(SDMA0, 0, mmSDMA0_EDC_COUNTER), 0, 1, 1},
> - { SOC15_REG_ENTRY(SDMA1, 0, mmSDMA1_EDC_COUNTER), 0, 1, 1},
> - { SOC15_REG_ENTRY(SDMA2, 0, mmSDMA2_EDC_COUNTER), 0, 1, 1},
> - { SOC15_REG_ENTRY(SDMA3, 0, mmSDMA3_EDC_COUNTER), 0, 1, 1},
> - { SOC15_REG_ENTRY(SDMA4, 0, mmSDMA4_EDC_COUNTER), 0, 1, 1},
> - { SOC15_REG_ENTRY(SDMA5, 0, mmSDMA5_EDC_COUNTER), 0, 1, 1},
> - { SOC15_REG_ENTRY(SDMA6, 0, mmSDMA6_EDC_COUNTER), 0, 1, 1},
> - { SOC15_REG_ENTRY(SDMA7, 0, mmSDMA7_EDC_COUNTER), 0, 1, 1},
> };
>
> static int gfx_v9_0_do_edc_gds_workarounds(struct amdgpu_device *adev)
> @@ -4109,7 +4101,6 @@ static int gfx_v9_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
> adev->gfx.config.max_sh_per_se;
> int sgpr_work_group_size = 5;
> int gpr_reg_size = compute_dim_x / 16 + 6;
> - int sec_ded_counter_reg_size = adev->sdma.num_instances + 34;
>
> /* only support when RAS is enabled */
> if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
> @@ -4249,7 +4240,7 @@ static int gfx_v9_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
>
> /* read back registers to clear the counters */
> mutex_lock(&adev->grbm_idx_mutex);
> - for (i = 0; i < sec_ded_counter_reg_size; i++) {
> + for (i = 0; i < ARRAY_SIZE(sec_ded_counter_registers); i++) {
> for (j = 0; j < sec_ded_counter_registers[i].se_num; j++) {
> for (k = 0; k < sec_ded_counter_registers[i].instance; k++) {
> gfx_v9_0_select_se_sh(adev, j, 0x0, k);
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> index fd20594b6d6e..f4107f9b75f3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
> @@ -1802,6 +1802,13 @@ static int sdma_v4_0_late_init(void *handle)
> struct ras_ih_if ih_info = {
> .cb = sdma_v4_0_process_ras_data_cb,
> };
> + int i;
> +
> + /* read back edc counter registers to clear the counters */
> + if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) {
> + for (i = 0; i < adev->sdma.num_instances; i++)
> + RREG32_SDMA(i, mmSDMA0_EDC_COUNTER);
> + }
>
> return adev->sdma.funcs->ras_late_init(adev, &ih_info);
> }
> --
> 2.17.1
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
More information about the amd-gfx
mailing list