[Mesa-dev] [PATCH 05/10] r600g: fix CP DMA hazard with index buffer fetches (v3)

Marek Olšák maraeo at gmail.com
Wed Jun 1 18:57:08 UTC 2016


From: Marek Olšák <marek.olsak at amd.com>

v3: use PFP_SYNC_ME on EG-CM (Evergreen through Cayman) only when the kernel
    supports it; otherwise emulate it with MEM_WRITE + WAIT_REG_MEM
---
 src/gallium/drivers/r600/evergreen_hw_context.c | 16 ++++--
 src/gallium/drivers/r600/evergreend.h           |  1 +
 src/gallium/drivers/r600/r600_blit.c            |  2 +-
 src/gallium/drivers/r600/r600_hw_context.c      | 69 ++++++++++++++++++++++++-
 src/gallium/drivers/r600/r600_pipe.h            |  5 +-
 src/gallium/drivers/r600/r600d.h                |  5 ++
 src/gallium/drivers/radeonsi/sid.h              |  2 +-
 7 files changed, 93 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_hw_context.c b/src/gallium/drivers/r600/evergreen_hw_context.c
index f456696..2feb801 100644
--- a/src/gallium/drivers/r600/evergreen_hw_context.c
+++ b/src/gallium/drivers/r600/evergreen_hw_context.c
@@ -85,7 +85,8 @@ void evergreen_dma_copy_buffer(struct r600_context *rctx,
 
 void evergreen_cp_dma_clear_buffer(struct r600_context *rctx,
 				   struct pipe_resource *dst, uint64_t offset,
-				   unsigned size, uint32_t clear_value)
+				   unsigned size, uint32_t clear_value,
+				   enum r600_coherency coher)
 {
 	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 
@@ -117,7 +118,9 @@ void evergreen_cp_dma_clear_buffer(struct r600_context *rctx,
 		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
 		unsigned reloc;
 
-		r600_need_cs_space(rctx, 10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0), FALSE);
+		r600_need_cs_space(rctx,
+				   10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0) +
+				   R600_MAX_PFP_SYNC_ME_DWORDS, FALSE);
 
 		/* Flush the caches for the first copy only. */
 		if (rctx->b.flags) {
@@ -148,9 +151,16 @@ void evergreen_cp_dma_clear_buffer(struct r600_context *rctx,
 		offset += byte_count;
 	}
 
+	/* CP DMA is executed in ME, but index buffers are read by PFP.
+	 * This ensures that ME (CP DMA) is idle before PFP starts fetching
+	 * indices. If we wanted to execute CP DMA in PFP, this packet
+	 * should precede it.
+	 */
+	if (coher == R600_COHERENCY_SHADER)
+		r600_emit_pfp_sync_me(rctx);
+
 	/* Invalidate the read caches. */
 	rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
 			 R600_CONTEXT_INV_VERTEX_CACHE |
 			 R600_CONTEXT_INV_TEX_CACHE;
 }
-
diff --git a/src/gallium/drivers/r600/evergreend.h b/src/gallium/drivers/r600/evergreend.h
index c1c6169..a81b6c5 100644
--- a/src/gallium/drivers/r600/evergreend.h
+++ b/src/gallium/drivers/r600/evergreend.h
@@ -88,6 +88,7 @@
 #define		WAIT_REG_MEM_EQUAL		3
 #define PKT3_MEM_WRITE                         0x3D
 #define PKT3_INDIRECT_BUFFER                   0x32
+#define PKT3_PFP_SYNC_ME		       0x42
 #define PKT3_SURFACE_SYNC                      0x43
 #define PKT3_ME_INITIALIZE                     0x44
 #define PKT3_COND_WRITE                        0x45
diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c
index 282645f..76c3364 100644
--- a/src/gallium/drivers/r600/r600_blit.c
+++ b/src/gallium/drivers/r600/r600_blit.c
@@ -589,7 +589,7 @@ static void r600_clear_buffer(struct pipe_context *ctx, struct pipe_resource *ds
 	if (rctx->screen->b.has_cp_dma &&
 	    rctx->b.chip_class >= EVERGREEN &&
 	    offset % 4 == 0 && size % 4 == 0) {
-		evergreen_cp_dma_clear_buffer(rctx, dst, offset, size, value);
+		evergreen_cp_dma_clear_buffer(rctx, dst, offset, size, value, coher);
 	} else if (rctx->screen->b.has_streamout && offset % 4 == 0 && size % 4 == 0) {
 		union pipe_color_union clear_value;
 		clear_value.ui[0] = value;
diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c
index bbfe620..1ae3f04 100644
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -365,6 +365,66 @@ void r600_begin_new_cs(struct r600_context *ctx)
 	ctx->b.initial_gfx_cs_size = ctx->b.gfx.cs->cdw;
 }
 
+void r600_emit_pfp_sync_me(struct r600_context *rctx)
+{ /* Emit commands into the gfx CS that make PFP wait for ME: the native packet, or an emulation. */
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+
+	if (rctx->b.chip_class >= EVERGREEN &&
+	    rctx->b.screen->info.drm_minor >= 46) { /* native path: Evergreen+ and DRM minor >= 46 */
+		radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
+		radeon_emit(cs, 0); /* the packet's single payload dword, always 0 here */
+	} else {
+		/* Fallback: emulate PFP_SYNC_ME by having ME write a value to
+		 * memory and making PFP poll that memory until it sees the value.
+		 */
+		struct r600_resource *buf = NULL;
+		unsigned offset, reloc;
+		uint64_t va;
+
+		/* Allocate one dword; WAIT_REG_MEM requires a 16-byte aligned address. */
+		u_suballocator_alloc(rctx->b.allocator_zeroed_memory, 4, 16,
+				     &offset, (struct pipe_resource**)&buf);
+		if (!buf) {
+			/* Allocation failed: flush the gfx CS instead. Too heavyweight, but will work. */
+			rctx->b.gfx.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
+			return;
+		}
+
+		reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, buf,
+						  RADEON_USAGE_READWRITE,
+						  RADEON_PRIO_FENCE);
+
+		va = buf->gpu_address + offset;
+		assert(va % 16 == 0);
+
+		/* Write 1 to memory in ME (header + 4 payload dwords). */
+		radeon_emit(cs, PKT3(PKT3_MEM_WRITE, 3, 0));
+		radeon_emit(cs, va); /* address, low dword */
+		radeon_emit(cs, ((va >> 32) & 0xff) | MEM_WRITE_32_BITS); /* address bits 39:32 + 32-bit write flag */
+		radeon_emit(cs, 1); /* value written */
+		radeon_emit(cs, 0); /* second data dword; presumably unused for a 32-bit write - TODO confirm */
+
+		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
+		radeon_emit(cs, reloc); /* NOP payload: buffer-list index of buf */
+
+		/* Wait in PFP until the dword is >= 1 (PFP can only do GEQUAL against memory). */
+		radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
+		radeon_emit(cs, WAIT_REG_MEM_GEQUAL |
+			        WAIT_REG_MEM_MEMORY |
+			        WAIT_REG_MEM_PFP);
+		radeon_emit(cs, va); /* address, low dword */
+		radeon_emit(cs, va >> 32); /* address, high dword */
+		radeon_emit(cs, 1); /* reference value */
+		radeon_emit(cs, 0xffffffff); /* mask */
+		radeon_emit(cs, 4); /* poll interval */
+
+		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); /* this path totals 5 + 2 + 7 + 2 = 16 dwords (R600_MAX_PFP_SYNC_ME_DWORDS) */
+		radeon_emit(cs, reloc); /* NOP payload: buffer-list index of buf */
+
+		r600_resource_reference(&buf, NULL); /* drop this function's reference to buf */
+	}
+}
+
 /* The max number of bytes to copy per packet. */
 #define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8)
 
@@ -408,7 +468,7 @@ void r600_cp_dma_copy_buffer(struct r600_context *rctx,
 
 		r600_need_cs_space(rctx,
 				   10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0) +
-				   3, FALSE);
+				   3 + R600_MAX_PFP_SYNC_ME_DWORDS, FALSE);
 
 		/* Flush the caches for the first copy only. */
 		if (rctx->b.flags) {
@@ -448,6 +508,13 @@ void r600_cp_dma_copy_buffer(struct r600_context *rctx,
 		radeon_set_config_reg(cs, R_008040_WAIT_UNTIL,
 				      S_008040_WAIT_CP_DMA_IDLE(1));
 
+	/* CP DMA is executed in ME, but index buffers are read by PFP.
+	 * This ensures that ME (CP DMA) is idle before PFP starts fetching
+	 * indices. If we wanted to execute CP DMA in PFP, this packet
+	 * should precede it.
+	 */
+	r600_emit_pfp_sync_me(rctx);
+
 	/* Invalidate the read caches. */
 	rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE |
 			 R600_CONTEXT_INV_VERTEX_CACHE |
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index cdb8e82..58ab14c 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -57,6 +57,7 @@
 /* the number of CS dwords for flushing and drawing */
 #define R600_MAX_FLUSH_CS_DWORDS	18
 #define R600_MAX_DRAW_CS_DWORDS		58
+#define R600_MAX_PFP_SYNC_ME_DWORDS	16
 
 #define R600_MAX_USER_CONST_BUFFERS 13
 #define R600_MAX_DRIVER_CONST_BUFFERS 3
@@ -663,13 +664,15 @@ void r600_context_gfx_flush(void *context, unsigned flags,
 void r600_begin_new_cs(struct r600_context *ctx);
 void r600_flush_emit(struct r600_context *ctx);
 void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, boolean count_draw_in);
+void r600_emit_pfp_sync_me(struct r600_context *rctx);
 void r600_cp_dma_copy_buffer(struct r600_context *rctx,
 			     struct pipe_resource *dst, uint64_t dst_offset,
 			     struct pipe_resource *src, uint64_t src_offset,
 			     unsigned size);
 void evergreen_cp_dma_clear_buffer(struct r600_context *rctx,
 				   struct pipe_resource *dst, uint64_t offset,
-				   unsigned size, uint32_t clear_value);
+				   unsigned size, uint32_t clear_value,
+				   enum r600_coherency coher);
 void r600_dma_copy_buffer(struct r600_context *rctx,
 			  struct pipe_resource *dst,
 			  struct pipe_resource *src,
diff --git a/src/gallium/drivers/r600/r600d.h b/src/gallium/drivers/r600/r600d.h
index 24f599e..75d64c1 100644
--- a/src/gallium/drivers/r600/r600d.h
+++ b/src/gallium/drivers/r600/r600d.h
@@ -96,8 +96,13 @@
 #define		COPY_DW_DST_IS_MEM		(1 << 1)
 #define PKT3_WAIT_REG_MEM                      0x3C
 #define		WAIT_REG_MEM_EQUAL		3
+#define		WAIT_REG_MEM_GEQUAL		5
+#define		WAIT_REG_MEM_MEMORY		(1 << 4)
+#define		WAIT_REG_MEM_PFP		(1 << 8)
 #define PKT3_MEM_WRITE                         0x3D
+#define		MEM_WRITE_32_BITS		(1 << 18)
 #define PKT3_INDIRECT_BUFFER                   0x32
+#define PKT3_PFP_SYNC_ME		       0x42 /* EG+ */
 #define PKT3_SURFACE_SYNC                      0x43
 #define PKT3_ME_INITIALIZE                     0x44
 #define PKT3_COND_WRITE                        0x45
diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h
index 25f8cf5..ddbfe00 100644
--- a/src/gallium/drivers/radeonsi/sid.h
+++ b/src/gallium/drivers/radeonsi/sid.h
@@ -135,7 +135,7 @@
 #define		COPY_DATA_DST_SEL(x)		(((unsigned)(x) & 0xf) << 8)
 #define		COPY_DATA_COUNT_SEL		(1 << 16)
 #define		COPY_DATA_WR_CONFIRM		(1 << 20)
-#define PKT3_PFP_SYNC_ME		       0x42 /* r7xx+ */
+#define PKT3_PFP_SYNC_ME		       0x42
 #define PKT3_SURFACE_SYNC                      0x43 /* deprecated on CIK, use ACQUIRE_MEM */
 #define PKT3_ME_INITIALIZE                     0x44 /* not on CIK */
 #define PKT3_COND_WRITE                        0x45
-- 
2.7.4



More information about the mesa-dev mailing list