[PATCH 2/3] drm/amdgpu: Complete Carrizo EDC (Error Detection and Correction) workarounds.

Deucher, Alexander Alexander.Deucher at amd.com
Tue Jun 6 21:16:34 UTC 2017


> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces at lists.freedesktop.org] On Behalf
> Of David Panariti
> Sent: Tuesday, June 06, 2017 4:33 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Panariti, David
> Subject: [PATCH 2/3] drm/amdgpu: Complete Carrizo EDC (Error Detection
> and Correction) workarounds.
> 
> The workarounds are unconditionally performed on CZs with EDC enabled.
> EDC detects uncorrected ECC errors and uses data poisoning to prevent
> corrupted compute results from being used (read).
> EDC enabled CZs are often used in embedded environments.
> 
> Signed-off-by: David Panariti <David.Panariti at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  13 ++++
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 136
> +++++++++++++++++++++++-----------
>  2 files changed, 107 insertions(+), 42 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 0d64bbba..a6f51eb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -2060,5 +2060,18 @@ int amdgpu_dm_display_resume(struct
> amdgpu_device *adev );
>  static inline int amdgpu_dm_display_resume(struct amdgpu_device *adev)
> { return 0; }
>  #endif
> 
> +/* Carrizo EDC support */
> +struct reg32_counter_name_map {
> +	uint32_t rnmap_addr;	     /* Counter register address */
> +	const char *rnmap_name;	     /* Name of the counter */
> +	size_t rnmap_num_instances;  /* Number of block instances */
> +};
> +
> +#define DEF_mmREG32_NAME_MAP_ELEMENT(reg, num_instances) {
> 	\
> +		.rnmap_addr = mm##reg,				\
> +		.rnmap_name = #reg,				\
> +		.rnmap_num_instances = num_instances		\
> +}
> +
>  #include "amdgpu_object.h"
>  #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index 07172f8..46e766e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -1727,35 +1727,80 @@ static const u32 sgpr2_init_regs[] =
>  	mmCOMPUTE_USER_DATA_9, 0xedcedc09,
>  };
> 
> -static const u32 sec_ded_counter_registers[] =
> -{
> -	mmCPC_EDC_ATC_CNT,
> -	mmCPC_EDC_SCRATCH_CNT,
> -	mmCPC_EDC_UCODE_CNT,
> -	mmCPF_EDC_ATC_CNT,
> -	mmCPF_EDC_ROQ_CNT,
> -	mmCPF_EDC_TAG_CNT,
> -	mmCPG_EDC_ATC_CNT,
> -	mmCPG_EDC_DMA_CNT,
> -	mmCPG_EDC_TAG_CNT,
> -	mmDC_EDC_CSINVOC_CNT,
> -	mmDC_EDC_RESTORE_CNT,
> -	mmDC_EDC_STATE_CNT,
> -	mmGDS_EDC_CNT,
> -	mmGDS_EDC_GRBM_CNT,
> -	mmGDS_EDC_OA_DED,
> -	mmSPI_EDC_CNT,
> -	mmSQC_ATC_EDC_GATCL1_CNT,
> -	mmSQC_EDC_CNT,
> -	mmSQ_EDC_DED_CNT,
> -	mmSQ_EDC_INFO,
> -	mmSQ_EDC_SEC_CNT,
> -	mmTCC_EDC_CNT,
> -	mmTCP_ATC_EDC_GATCL1_CNT,
> -	mmTCP_EDC_CNT,
> -	mmTD_EDC_CNT
> +/* See GRBM_GFX_INDEX, et. al. registers. */
> +static const struct reg32_counter_name_map sec_ded_counter_registers[]
> = {
> +	DEF_mmREG32_NAME_MAP_ELEMENT(SQC_EDC_CNT, 2),
> +
> 	DEF_mmREG32_NAME_MAP_ELEMENT(SQC_ATC_EDC_GATCL1_CN
> T, 2),
> +
> +	DEF_mmREG32_NAME_MAP_ELEMENT(SQ_EDC_SEC_CNT, 8),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(SQ_EDC_DED_CNT, 8),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(TCP_EDC_CNT, 8),
> +
> 	DEF_mmREG32_NAME_MAP_ELEMENT(TCP_ATC_EDC_GATCL1_CN
> T, 8),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(TD_EDC_CNT, 8),
> +
> +	DEF_mmREG32_NAME_MAP_ELEMENT(TCC_EDC_CNT, 4),
> +
> +	DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_ATC_CNT, 1),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_SCRATCH_CNT,
> 1),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_UCODE_CNT, 1),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_ATC_CNT, 1),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_ROQ_CNT, 1),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_TAG_CNT, 1),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_ATC_CNT, 1),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_DMA_CNT, 1),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_TAG_CNT, 1),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_CSINVOC_CNT, 1),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_STATE_CNT, 1),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_RESTORE_CNT, 1),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(GDS_EDC_CNT, 1),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(GDS_EDC_GRBM_CNT, 1),
> +	DEF_mmREG32_NAME_MAP_ELEMENT(SPI_EDC_CNT, 1),
>  };
> 
> +static int gfx_v8_0_edc_clear_counters(struct amdgpu_device *adev)
> +{
> +	int ci, se, sh, i;
> +	uint32_t count;
> +	int r = 0;
> +
> +	mutex_lock(&adev->grbm_idx_mutex);
> +
> +	for (ci = 0; ci < ARRAY_SIZE(sec_ded_counter_registers); ++ci) {
> +		const struct reg32_counter_name_map *cp =
> +			sec_ded_counter_registers + ci;
> +		const char *name = cp->rnmap_name;
> +
> +		for (se = 0; se < adev->gfx.config.max_shader_engines;
> ++se) {
> +			for (sh = 0; sh < adev->gfx.config.max_sh_per_se;
> ++sh) {
> +				for (i = 0; i < cp->rnmap_num_instances; ++i)
> {
> +					gfx_v8_0_select_se_sh(adev, se, sh,
> i);
> +					count = RREG32(cp->rnmap_addr);
> +					count = RREG32(cp->rnmap_addr);
> +					if (count != 0) {
> +						/*
> +						 * EDC workaround failed.
> +						 * If people are interested
> +						 * in EDC at all, they will
> +						 * want to know which
> +						 * counters had problems.
> +						 */
> +						DRM_WARN("EDC counter
> %s is 0x%08x, but should be 0\n.",
> +							 name, count);
> +						r = -EINVAL;
> +						goto ret;
> +					}
> +				}
> +			}
> +		}
> +	}
> +
> +ret:
> +	gfx_v8_0_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
> +	mutex_unlock(&adev->grbm_idx_mutex);
> +
> +	return r;
> +}
> +
>  static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device
> *adev)
>  {
>  	struct amdgpu_ring *ring = &adev->gfx.compute_ring[0];
> @@ -1763,18 +1808,36 @@ static int
> gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
>  	struct dma_fence *f = NULL;
>  	int r, i;
>  	u32 tmp;
> +	u32 dis_bit;
>  	unsigned total_size, vgpr_offset, sgpr_offset;
>  	u64 gpu_addr;
> 
> -	/* only supported on CZ */
> -	if (adev->asic_type != CHIP_CARRIZO)
> +	if (adev->asic_type != CHIP_CARRIZO) {
> +		/* EDC is only supported on CZ */
> +		return 0;
> +	}

No need to add the additional parens.

> +
> +	DRM_INFO("Detected Carrizo.\n");
> +

Drop this debugging print.

> +	tmp = RREG32(mmCC_GC_EDC_CONFIG);
> +	dis_bit = REG_GET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC);
> +	if (dis_bit) {

You can just check the bit directly, e.g.,
if (REG_GET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC)) {

> +		/* On Carrizo, EDC may be permanently disabled by a fuse. */
> +		DRM_INFO("CZ EDC hardware is disabled, GC_EDC_CONFIG:
> 0x%08x.\n",
> +			tmp);

I would reverse the logic here and only print a message if EDC is enabled.

>  		return 0;
> +	}
> 
>  	/* bail if the compute ring is not ready */
>  	if (!ring->ready)
>  		return 0;
> 
> -	tmp = RREG32(mmGB_EDC_MODE);
> +	DRM_INFO("Applying EDC workarounds.\n");
> +

Drop the debugging print.

> +	/*
> +	 * Interested parties can enable EDC using debugfs register reads and
> +	 * writes.
> +	 */
>  	WREG32(mmGB_EDC_MODE, 0);
> 
>  	total_size =
> @@ -1899,18 +1962,7 @@ static int
> gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
>  		goto fail;
>  	}
> 
> -	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 2);
> -	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 1);
> -	WREG32(mmGB_EDC_MODE, tmp);
> -
> -	tmp = RREG32(mmCC_GC_EDC_CONFIG);
> -	tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC, 0) | 1;
> -	WREG32(mmCC_GC_EDC_CONFIG, tmp);
> -
> -
> -	/* read back registers to clear the counters */
> -	for (i = 0; i < ARRAY_SIZE(sec_ded_counter_registers); i++)
> -		RREG32(sec_ded_counter_registers[i]);
> +	gfx_v8_0_edc_clear_counters(adev);
> 
>  fail:
>  	amdgpu_ib_free(adev, &ib, NULL);
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


More information about the amd-gfx mailing list