[PATCH 2/3] drm/amdgpu: Complete Carrizo EDC (Error Detection and Correction) workarounds.
Deucher, Alexander
Alexander.Deucher at amd.com
Tue Jun 6 21:16:34 UTC 2017
> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces at lists.freedesktop.org] On Behalf
> Of David Panariti
> Sent: Tuesday, June 06, 2017 4:33 PM
> To: amd-gfx at lists.freedesktop.org
> Cc: Panariti, David
> Subject: [PATCH 2/3] drm/amdgpu: Complete Carrizo EDC (Error Detection
> and Correction) workarounds.
>
> The workarounds are unconditionally performed on CZs with EDC enabled.
> EDC detects uncorrected ECC errors and uses data poisoning to prevent
> corrupted compute results from being used (read).
> EDC enabled CZs are often used in embedded environments.
>
> Signed-off-by: David Panariti <David.Panariti at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 13 ++++
> drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 136
> +++++++++++++++++++++++-----------
> 2 files changed, 107 insertions(+), 42 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 0d64bbba..a6f51eb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -2060,5 +2060,18 @@ int amdgpu_dm_display_resume(struct
> amdgpu_device *adev );
> static inline int amdgpu_dm_display_resume(struct amdgpu_device *adev)
> { return 0; }
> #endif
>
> +/* Carrizo EDC support */
> +struct reg32_counter_name_map {
> + uint32_t rnmap_addr; /* Counter register address */
> + const char *rnmap_name; /* Name of the counter */
> + size_t rnmap_num_instances; /* Number of block instances */
> +};
> +
> +#define DEF_mmREG32_NAME_MAP_ELEMENT(reg, num_instances) {
> \
> + .rnmap_addr = mm##reg, \
> + .rnmap_name = #reg, \
> + .rnmap_num_instances = num_instances \
> +}
> +
> #include "amdgpu_object.h"
> #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index 07172f8..46e766e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -1727,35 +1727,80 @@ static const u32 sgpr2_init_regs[] =
> mmCOMPUTE_USER_DATA_9, 0xedcedc09,
> };
>
> -static const u32 sec_ded_counter_registers[] =
> -{
> - mmCPC_EDC_ATC_CNT,
> - mmCPC_EDC_SCRATCH_CNT,
> - mmCPC_EDC_UCODE_CNT,
> - mmCPF_EDC_ATC_CNT,
> - mmCPF_EDC_ROQ_CNT,
> - mmCPF_EDC_TAG_CNT,
> - mmCPG_EDC_ATC_CNT,
> - mmCPG_EDC_DMA_CNT,
> - mmCPG_EDC_TAG_CNT,
> - mmDC_EDC_CSINVOC_CNT,
> - mmDC_EDC_RESTORE_CNT,
> - mmDC_EDC_STATE_CNT,
> - mmGDS_EDC_CNT,
> - mmGDS_EDC_GRBM_CNT,
> - mmGDS_EDC_OA_DED,
> - mmSPI_EDC_CNT,
> - mmSQC_ATC_EDC_GATCL1_CNT,
> - mmSQC_EDC_CNT,
> - mmSQ_EDC_DED_CNT,
> - mmSQ_EDC_INFO,
> - mmSQ_EDC_SEC_CNT,
> - mmTCC_EDC_CNT,
> - mmTCP_ATC_EDC_GATCL1_CNT,
> - mmTCP_EDC_CNT,
> - mmTD_EDC_CNT
> +/* See GRBM_GFX_INDEX, et. al. registers. */
> +static const struct reg32_counter_name_map sec_ded_counter_registers[]
> = {
> + DEF_mmREG32_NAME_MAP_ELEMENT(SQC_EDC_CNT, 2),
> +
> DEF_mmREG32_NAME_MAP_ELEMENT(SQC_ATC_EDC_GATCL1_CN
> T, 2),
> +
> + DEF_mmREG32_NAME_MAP_ELEMENT(SQ_EDC_SEC_CNT, 8),
> + DEF_mmREG32_NAME_MAP_ELEMENT(SQ_EDC_DED_CNT, 8),
> + DEF_mmREG32_NAME_MAP_ELEMENT(TCP_EDC_CNT, 8),
> +
> DEF_mmREG32_NAME_MAP_ELEMENT(TCP_ATC_EDC_GATCL1_CN
> T, 8),
> + DEF_mmREG32_NAME_MAP_ELEMENT(TD_EDC_CNT, 8),
> +
> + DEF_mmREG32_NAME_MAP_ELEMENT(TCC_EDC_CNT, 4),
> +
> + DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_ATC_CNT, 1),
> + DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_SCRATCH_CNT,
> 1),
> + DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_UCODE_CNT, 1),
> + DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_ATC_CNT, 1),
> + DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_ROQ_CNT, 1),
> + DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_TAG_CNT, 1),
> + DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_ATC_CNT, 1),
> + DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_DMA_CNT, 1),
> + DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_TAG_CNT, 1),
> + DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_CSINVOC_CNT, 1),
> + DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_STATE_CNT, 1),
> + DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_RESTORE_CNT, 1),
> + DEF_mmREG32_NAME_MAP_ELEMENT(GDS_EDC_CNT, 1),
> + DEF_mmREG32_NAME_MAP_ELEMENT(GDS_EDC_GRBM_CNT, 1),
> + DEF_mmREG32_NAME_MAP_ELEMENT(SPI_EDC_CNT, 1),
> };
>
> +static int gfx_v8_0_edc_clear_counters(struct amdgpu_device *adev)
> +{
> + int ci, se, sh, i;
> + uint32_t count;
> + int r = 0;
> +
> + mutex_lock(&adev->grbm_idx_mutex);
> +
> + for (ci = 0; ci < ARRAY_SIZE(sec_ded_counter_registers); ++ci) {
> + const struct reg32_counter_name_map *cp =
> + sec_ded_counter_registers + ci;
> + const char *name = cp->rnmap_name;
> +
> + for (se = 0; se < adev->gfx.config.max_shader_engines;
> ++se) {
> + for (sh = 0; sh < adev->gfx.config.max_sh_per_se;
> ++sh) {
> + for (i = 0; i < cp->rnmap_num_instances; ++i)
> {
> + gfx_v8_0_select_se_sh(adev, se, sh,
> i);
> + count = RREG32(cp->rnmap_addr);
> + count = RREG32(cp->rnmap_addr);
> + if (count != 0) {
> + /*
> + * EDC workaround failed.
> + * If people are interested
> + * in EDC at all, they will
> + * want to know which
> + * counters had problems.
> + */
> + DRM_WARN("EDC counter
> %s is 0x%08x, but should be 0\n.",
> + name, count);
> + r = -EINVAL;
> + goto ret;
> + }
> + }
> + }
> + }
> + }
> +
> +ret:
> + gfx_v8_0_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
> + mutex_unlock(&adev->grbm_idx_mutex);
> +
> + return r;
> +}
> +
> static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device
> *adev)
> {
> struct amdgpu_ring *ring = &adev->gfx.compute_ring[0];
> @@ -1763,18 +1808,36 @@ static int
> gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
> struct dma_fence *f = NULL;
> int r, i;
> u32 tmp;
> + u32 dis_bit;
> unsigned total_size, vgpr_offset, sgpr_offset;
> u64 gpu_addr;
>
> - /* only supported on CZ */
> - if (adev->asic_type != CHIP_CARRIZO)
> + if (adev->asic_type != CHIP_CARRIZO) {
> + /* EDC is only supported on CZ */
> + return 0;
> + }
No need to add the additional parens.
> +
> + DRM_INFO("Detected Carrizo.\n");
> +
Drop this debugging print.
> + tmp = RREG32(mmCC_GC_EDC_CONFIG);
> + dis_bit = REG_GET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC);
> + if (dis_bit) {
You can just check the bit directly, e.g.,
if (REG_GET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC)) {
> + /* On Carrizo, EDC may be permanently disabled by a fuse. */
> + DRM_INFO("CZ EDC hardware is disabled, GC_EDC_CONFIG:
> 0x%08x.\n",
> + tmp);
I would reverse the logic here and only print a message if EDC is enabled.
> return 0;
> + }
>
> /* bail if the compute ring is not ready */
> if (!ring->ready)
> return 0;
>
> - tmp = RREG32(mmGB_EDC_MODE);
> + DRM_INFO("Applying EDC workarounds.\n");
> +
Drop the debugging print.
> + /*
> + * Interested parties can enable EDC using debugfs register reads and
> + * writes.
> + */
> WREG32(mmGB_EDC_MODE, 0);
>
> total_size =
> @@ -1899,18 +1962,7 @@ static int
> gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
> goto fail;
> }
>
> - tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 2);
> - tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 1);
> - WREG32(mmGB_EDC_MODE, tmp);
> -
> - tmp = RREG32(mmCC_GC_EDC_CONFIG);
> - tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC, 0) | 1;
> - WREG32(mmCC_GC_EDC_CONFIG, tmp);
> -
> -
> - /* read back registers to clear the counters */
> - for (i = 0; i < ARRAY_SIZE(sec_ded_counter_registers); i++)
> - RREG32(sec_ded_counter_registers[i]);
> + gfx_v8_0_edc_clear_counters(adev);
>
> fail:
> amdgpu_ib_free(adev, &ib, NULL);
> --
> 2.7.4
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
More information about the amd-gfx
mailing list