[PATCH 4/4] drm/amdgpu: add reset_ras_error_count function for HDP

Tue Mar 3 03:02:35 UTC 2020

[AMD Official Use Only - Internal Distribution Only]

The series is:

Reviewed-by: Tao Zhou <tao.zhou1 at amd.com>

> -----Original Message-----
> From: Hawking Zhang <Hawking.Zhang at amd.com>
> Sent: 2020年3月2日 18:34
> To: amd-gfx at lists.freedesktop.org; Clements, John
> <John.Clements at amd.com>; Li, Dennis <Dennis.Li at amd.com>; Chen,
> Guchun <Guchun.Chen at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>;
> Deucher, Alexander <Alexander.Deucher at amd.com>
> Cc: Zhang, Hawking <Hawking.Zhang at amd.com>
> Subject: [PATCH 4/4] drm/amdgpu: add reset_ras_error_count function for
> HDP
> 
> HDP RAS error counters are dirty after a cold reboot. A read operation is
> needed to reset them to 0.
> 
> Signed-off-by: Hawking Zhang <Hawking.Zhang at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  1 +
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c |  1 -
>  drivers/gpu/drm/amd/amdgpu/soc15.c    | 14 ++++++++++++++
>  3 files changed, 15 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index a58b0cf9da51..b735e20888a7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -595,6 +595,7 @@ struct amdgpu_asic_funcs {
>  	/* invalidate hdp read cache */
>  	void (*invalidate_hdp)(struct amdgpu_device *adev,
>  			       struct amdgpu_ring *ring);
> +	void (*reset_hdp_ras_error_count)(struct amdgpu_device *adev);
>  	/* check if the asic needs a full reset of if soft reset will work */
>  	bool (*need_full_reset)(struct amdgpu_device *adev);
>  	/* initialize doorbell layout for specific asic*/
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index b746f26f933c..efd52bcf8785 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -4149,7 +4149,6 @@ static const struct soc15_reg_entry
> gfx_v9_0_edc_counter_regs[] = {
>     { SOC15_REG_ENTRY(GC, 0, mmTCC_EDC_CNT2), 0, 1, 16},
>     { SOC15_REG_ENTRY(GC, 0, mmTCA_EDC_CNT), 0, 1, 2},
>     { SOC15_REG_ENTRY(GC, 0, mmSQC_EDC_CNT3), 0, 4, 6},
> -   { SOC15_REG_ENTRY(HDP, 0, mmHDP_EDC_CNT), 0, 1, 1},
>  };
> 
>  static int gfx_v9_0_do_edc_gds_workarounds(struct amdgpu_device *adev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c
> b/drivers/gpu/drm/amd/amdgpu/soc15.c
> index 4aa5b9c8e43b..6b717691d554 100644
> --- a/drivers/gpu/drm/amd/amdgpu/soc15.c
> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
> @@ -827,6 +827,15 @@ static bool soc15_need_full_reset(struct
> amdgpu_device *adev)
>  	/* change this when we implement soft reset */
>  	return true;
>  }
> +
> +static void vega20_reset_hdp_ras_error_count(struct amdgpu_device *adev)
> +{
> +	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__HDP))
> +		return;
> +	/*read back hdp ras counter to reset it to 0 */
> +	RREG32_SOC15(HDP, 0, mmHDP_EDC_CNT);
> +}
> +
>  static void soc15_get_pcie_usage(struct amdgpu_device *adev, uint64_t
> *count0,
>  				 uint64_t *count1)
>  {
> @@ -994,6 +1003,7 @@ static const struct amdgpu_asic_funcs
> vega20_asic_funcs =
>  	.get_config_memsize = &soc15_get_config_memsize,
>  	.flush_hdp = &soc15_flush_hdp,
>  	.invalidate_hdp = &soc15_invalidate_hdp,
> +	.reset_hdp_ras_error_count = &vega20_reset_hdp_ras_error_count,
>  	.need_full_reset = &soc15_need_full_reset,
>  	.init_doorbell_index = &vega20_doorbell_index_init,
>  	.get_pcie_usage = &vega20_get_pcie_usage,
> @@ -1239,6 +1249,10 @@ static int soc15_common_late_init(void *handle)
>  	if (amdgpu_sriov_vf(adev))
>  		xgpu_ai_mailbox_get_irq(adev);
> 
> +	if (adev->asic_funcs &&
> +	    adev->asic_funcs->reset_hdp_ras_error_count)
> +		adev->asic_funcs->reset_hdp_ras_error_count(adev);
> +
>  	if (adev->nbio.funcs->ras_late_init)
>  		r = adev->nbio.funcs->ras_late_init(adev);
> 
> --
> 2.17.1