[PATCH 2/3] drm/amdgpu: Complete Carrizo EDC (Error Detection and Correction) workarounds.

David Panariti David.Panariti at amd.com
Tue Jun 6 20:32:51 UTC 2017


The workarounds are unconditionally performed on CZs with EDC enabled.
EDC detects uncorrected ECC errors and uses data poisoning to prevent
corrupted compute results from being used (read).
EDC enabled CZs are often used in embedded environments.

Signed-off-by: David Panariti <David.Panariti at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  13 ++++
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 136 +++++++++++++++++++++++-----------
 2 files changed, 107 insertions(+), 42 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 0d64bbba..a6f51eb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -2060,5 +2060,18 @@ int amdgpu_dm_display_resume(struct amdgpu_device *adev );
 static inline int amdgpu_dm_display_resume(struct amdgpu_device *adev) { return 0; }
 #endif
 
+/* Carrizo EDC support */
+struct reg32_counter_name_map {
+	uint32_t rnmap_addr;	     /* Counter register address */
+	const char *rnmap_name;	     /* Name of the counter */
+	size_t rnmap_num_instances;  /* Number of block instances */
+};
+
+#define DEF_mmREG32_NAME_MAP_ELEMENT(reg, num_instances) {	\
+		.rnmap_addr = mm##reg,				\
+		.rnmap_name = #reg,				\
+		.rnmap_num_instances = num_instances		\
+}
+
 #include "amdgpu_object.h"
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 07172f8..46e766e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -1727,35 +1727,80 @@ static const u32 sgpr2_init_regs[] =
 	mmCOMPUTE_USER_DATA_9, 0xedcedc09,
 };
 
-static const u32 sec_ded_counter_registers[] =
-{
-	mmCPC_EDC_ATC_CNT,
-	mmCPC_EDC_SCRATCH_CNT,
-	mmCPC_EDC_UCODE_CNT,
-	mmCPF_EDC_ATC_CNT,
-	mmCPF_EDC_ROQ_CNT,
-	mmCPF_EDC_TAG_CNT,
-	mmCPG_EDC_ATC_CNT,
-	mmCPG_EDC_DMA_CNT,
-	mmCPG_EDC_TAG_CNT,
-	mmDC_EDC_CSINVOC_CNT,
-	mmDC_EDC_RESTORE_CNT,
-	mmDC_EDC_STATE_CNT,
-	mmGDS_EDC_CNT,
-	mmGDS_EDC_GRBM_CNT,
-	mmGDS_EDC_OA_DED,
-	mmSPI_EDC_CNT,
-	mmSQC_ATC_EDC_GATCL1_CNT,
-	mmSQC_EDC_CNT,
-	mmSQ_EDC_DED_CNT,
-	mmSQ_EDC_INFO,
-	mmSQ_EDC_SEC_CNT,
-	mmTCC_EDC_CNT,
-	mmTCP_ATC_EDC_GATCL1_CNT,
-	mmTCP_EDC_CNT,
-	mmTD_EDC_CNT
+/* See GRBM_GFX_INDEX, et. al. registers. */
+static const struct reg32_counter_name_map sec_ded_counter_registers[] = {
+	DEF_mmREG32_NAME_MAP_ELEMENT(SQC_EDC_CNT, 2),
+	DEF_mmREG32_NAME_MAP_ELEMENT(SQC_ATC_EDC_GATCL1_CNT, 2),
+
+	DEF_mmREG32_NAME_MAP_ELEMENT(SQ_EDC_SEC_CNT, 8),
+	DEF_mmREG32_NAME_MAP_ELEMENT(SQ_EDC_DED_CNT, 8),
+	DEF_mmREG32_NAME_MAP_ELEMENT(TCP_EDC_CNT, 8),
+	DEF_mmREG32_NAME_MAP_ELEMENT(TCP_ATC_EDC_GATCL1_CNT, 8),
+	DEF_mmREG32_NAME_MAP_ELEMENT(TD_EDC_CNT, 8),
+
+	DEF_mmREG32_NAME_MAP_ELEMENT(TCC_EDC_CNT, 4),
+
+	DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_ATC_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_SCRATCH_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_UCODE_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_ATC_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_ROQ_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_TAG_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_ATC_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_DMA_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_TAG_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_CSINVOC_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_STATE_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_RESTORE_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(GDS_EDC_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(GDS_EDC_GRBM_CNT, 1),
+	DEF_mmREG32_NAME_MAP_ELEMENT(SPI_EDC_CNT, 1),
 };
 
+static int gfx_v8_0_edc_clear_counters(struct amdgpu_device *adev)
+{
+	int ci, se, sh, i;
+	uint32_t count;
+	int r = 0;
+
+	mutex_lock(&adev->grbm_idx_mutex);
+
+	for (ci = 0; ci < ARRAY_SIZE(sec_ded_counter_registers); ++ci) {
+		const struct reg32_counter_name_map *cp =
+			sec_ded_counter_registers + ci;
+		const char *name = cp->rnmap_name;
+
+		for (se = 0; se < adev->gfx.config.max_shader_engines; ++se) {
+			for (sh = 0; sh < adev->gfx.config.max_sh_per_se; ++sh) {
+				for (i = 0; i < cp->rnmap_num_instances; ++i) {
+					gfx_v8_0_select_se_sh(adev, se, sh, i);
+					count = RREG32(cp->rnmap_addr);
+					count = RREG32(cp->rnmap_addr);
+					if (count != 0) {
+						/*
+						 * EDC workaround failed.
+						 * If people are interested
+						 * in EDC at all, they will
+						 * want to know which
+						 * counters had problems.
+						 */
+						DRM_WARN("EDC counter %s is 0x%08x, but should be 0\n.",
+							 name, count);
+						r = -EINVAL;
+						goto ret;
+					}
+				}
+			}
+		}
+	}
+
+ret:
+	gfx_v8_0_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
+	mutex_unlock(&adev->grbm_idx_mutex);
+
+	return r;
+}
+
 static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
 {
 	struct amdgpu_ring *ring = &adev->gfx.compute_ring[0];
@@ -1763,18 +1808,36 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
 	struct dma_fence *f = NULL;
 	int r, i;
 	u32 tmp;
+	u32 dis_bit;
 	unsigned total_size, vgpr_offset, sgpr_offset;
 	u64 gpu_addr;
 
-	/* only supported on CZ */
-	if (adev->asic_type != CHIP_CARRIZO)
+	if (adev->asic_type != CHIP_CARRIZO) {
+		/* EDC is only supported on CZ */
+		return 0;
+	}
+
+	DRM_INFO("Detected Carrizo.\n");
+
+	tmp = RREG32(mmCC_GC_EDC_CONFIG);
+	dis_bit = REG_GET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC);
+	if (dis_bit) {
+		/* On Carrizo, EDC may be permanently disabled by a fuse. */
+		DRM_INFO("CZ EDC hardware is disabled, GC_EDC_CONFIG: 0x%08x.\n",
+			tmp);
 		return 0;
+	}
 
 	/* bail if the compute ring is not ready */
 	if (!ring->ready)
 		return 0;
 
-	tmp = RREG32(mmGB_EDC_MODE);
+	DRM_INFO("Applying EDC workarounds.\n");
+
+	/*
+	 * Interested parties can enable EDC using debugfs register reads and
+	 * writes.
+	 */
 	WREG32(mmGB_EDC_MODE, 0);
 
 	total_size =
@@ -1899,18 +1962,7 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
 		goto fail;
 	}
 
-	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 2);
-	tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 1);
-	WREG32(mmGB_EDC_MODE, tmp);
-
-	tmp = RREG32(mmCC_GC_EDC_CONFIG);
-	tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC, 0) | 1;
-	WREG32(mmCC_GC_EDC_CONFIG, tmp);
-
-
-	/* read back registers to clear the counters */
-	for (i = 0; i < ARRAY_SIZE(sec_ded_counter_registers); i++)
-		RREG32(sec_ded_counter_registers[i]);
+	gfx_v8_0_edc_clear_counters(adev);
 
 fail:
 	amdgpu_ib_free(adev, &ib, NULL);
-- 
2.7.4



More information about the amd-gfx mailing list