[Mesa-dev] [PATCH 10/12] radeonsi: do all math in bytes in SI DMA code

Mon Jan 2 22:54:15 UTC 2017

From: Marek Olšák <marek.olsak at amd.com>

---
 src/amd/common/sid.h                  |  6 ++++--
 src/gallium/drivers/radeonsi/si_dma.c | 34 +++++++++++++++++-----------------
 2 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/src/amd/common/sid.h b/src/amd/common/sid.h
index 0a2c616..fc21a18 100644
--- a/src/amd/common/sid.h
+++ b/src/amd/common/sid.h
@@ -9014,22 +9014,24 @@
 #define R_028E34_CB_COLOR7_CLEAR_WORD1                                  0x028E34
 #define R_028E38_CB_COLOR7_DCC_BASE                                     0x028E38 /* VI */
 
 /* SI async DMA packets */
 #define SI_DMA_PACKET(cmd, sub_cmd, n) ((((unsigned)(cmd) & 0xF) << 28) |    \
                                        (((unsigned)(sub_cmd) & 0xFF) << 20) |\
                                        (((unsigned)(n) & 0xFFFFF) << 0))
 /* SI async DMA Packet types */
 #define    SI_DMA_PACKET_WRITE                     0x2
 #define    SI_DMA_PACKET_COPY                      0x3
-#define    SI_DMA_COPY_MAX_SIZE                    0xfffe0
-#define    SI_DMA_COPY_MAX_SIZE_DW                 0xffff8
+#define    SI_DMA_COPY_MAX_BYTE_ALIGNED_SIZE       0xfffe0
+/* The documentation says 0xffff8 is the maximum size in dwords, which is
+ * 0x3fffe0 in bytes. */
+#define    SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE      0x3fffe0
 #define    SI_DMA_COPY_DWORD_ALIGNED               0x00
 #define    SI_DMA_COPY_BYTE_ALIGNED                0x40
 #define    SI_DMA_COPY_TILED                       0x8
 #define    SI_DMA_PACKET_INDIRECT_BUFFER           0x4
 #define    SI_DMA_PACKET_SEMAPHORE                 0x5
 #define    SI_DMA_PACKET_FENCE                     0x6
 #define    SI_DMA_PACKET_TRAP                      0x7
 #define    SI_DMA_PACKET_SRBM_WRITE                0x9
 #define    SI_DMA_PACKET_CONSTANT_FILL             0xd
 #define    SI_DMA_PACKET_NOP                       0xf
diff --git a/src/gallium/drivers/radeonsi/si_dma.c b/src/gallium/drivers/radeonsi/si_dma.c
index 1009bb2..b6aab00 100644
--- a/src/gallium/drivers/radeonsi/si_dma.c
+++ b/src/gallium/drivers/radeonsi/si_dma.c
@@ -30,58 +30,58 @@
 #include "util/u_format.h"
 
 static void si_dma_copy_buffer(struct si_context *ctx,
 				struct pipe_resource *dst,
 				struct pipe_resource *src,
 				uint64_t dst_offset,
 				uint64_t src_offset,
 				uint64_t size)
 {
 	struct radeon_winsys_cs *cs = ctx->b.dma.cs;
-	unsigned i, ncopy, csize, max_csize, sub_cmd, shift;
+	unsigned i, ncopy, count, max_size, sub_cmd, shift;
 	struct r600_resource *rdst = (struct r600_resource*)dst;
 	struct r600_resource *rsrc = (struct r600_resource*)src;
 
 	/* Mark the buffer range of destination as valid (initialized),
 	 * so that transfer_map knows it should wait for the GPU when mapping
 	 * that range. */
 	util_range_add(&rdst->valid_buffer_range, dst_offset,
 		       dst_offset + size);
 
 	dst_offset += rdst->gpu_address;
 	src_offset += rsrc->gpu_address;
 
-	/* see if we use dword or byte copy */
+	/* see whether we should use the dword-aligned or byte-aligned copy */
 	if (!(dst_offset % 4) && !(src_offset % 4) && !(size % 4)) {
-		size >>= 2;
 		sub_cmd = SI_DMA_COPY_DWORD_ALIGNED;
 		shift = 2;
-		max_csize = SI_DMA_COPY_MAX_SIZE_DW;
+		max_size = SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE;
 	} else {
 		sub_cmd = SI_DMA_COPY_BYTE_ALIGNED;
 		shift = 0;
-		max_csize = SI_DMA_COPY_MAX_SIZE;
+		max_size = SI_DMA_COPY_MAX_BYTE_ALIGNED_SIZE;
 	}
-	ncopy = (size / max_csize) + !!(size % max_csize);
 
+	ncopy = DIV_ROUND_UP(size, max_size);
 	r600_need_dma_space(&ctx->b, ncopy * 5, rdst, rsrc);
 
 	for (i = 0; i < ncopy; i++) {
-		csize = size < max_csize ? size : max_csize;
-		radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, csize));
+		count = MIN2(size, max_size);
+		radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd,
+					      count >> shift));
 		radeon_emit(cs, dst_offset);
 		radeon_emit(cs, src_offset);
 		radeon_emit(cs, (dst_offset >> 32UL) & 0xff);
 		radeon_emit(cs, (src_offset >> 32UL) & 0xff);
-		dst_offset += csize << shift;
-		src_offset += csize << shift;
-		size -= csize;
+		dst_offset += count;
+		src_offset += count;
+		size -= count;
 	}
 }
 
 static void si_dma_copy_tile(struct si_context *ctx,
 			     struct pipe_resource *dst,
 			     unsigned dst_level,
 			     unsigned dst_x,
 			     unsigned dst_y,
 			     unsigned dst_z,
 			     struct pipe_resource *src,
@@ -144,31 +144,31 @@ static void si_dma_copy_tile(struct si_context *ctx,
 	bank_w = G_009910_BANK_WIDTH(tile_mode);
 	mt_aspect = G_009910_MACRO_TILE_ASPECT(tile_mode);
 	/* Non-depth modes don't have TILE_SPLIT set. */
 	tile_split = util_logbase2(rtiled->surface.tile_split >> 6);
 	nbanks = G_009910_NUM_BANKS(tile_mode);
 	base += rtiled->resource.gpu_address;
 	addr += rlinear->resource.gpu_address;
 
 	pipe_config = G_009910_PIPE_CONFIG(tile_mode);
 	mt = G_009910_MICRO_TILE_MODE(tile_mode);
-	size = (copy_height * pitch) / 4;
-	ncopy = (size / SI_DMA_COPY_MAX_SIZE_DW) + !!(size % SI_DMA_COPY_MAX_SIZE_DW);
+	size = copy_height * pitch;
+	ncopy = DIV_ROUND_UP(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
 	r600_need_dma_space(&ctx->b, ncopy * 9, &rdst->resource, &rsrc->resource);
 
 	for (i = 0; i < ncopy; i++) {
 		cheight = copy_height;
-		if (((cheight * pitch) / 4) > SI_DMA_COPY_MAX_SIZE_DW) {
-			cheight = (SI_DMA_COPY_MAX_SIZE_DW * 4) / pitch;
+		if (cheight * pitch > SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE) {
+			cheight = SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE / pitch;
 		}
-		size = (cheight * pitch) / 4;
-		radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, size));
+		size = cheight * pitch;
+		radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, size / 4));
 		radeon_emit(cs, base >> 8);
 		radeon_emit(cs, (detile << 31) | (array_mode << 27) |
 				(lbpp << 24) | (bank_h << 21) |
 				(bank_w << 18) | (mt_aspect << 16));
 		radeon_emit(cs, (pitch_tile_max << 0) | ((height - 1) << 16));
 		radeon_emit(cs, (slice_tile_max << 0) | (pipe_config << 26));
 		radeon_emit(cs, (tiled_x << 0) | (tiled_z << 18));
 		radeon_emit(cs, (tiled_y << 0) | (tile_split << 21) | (nbanks << 25) | (mt << 27));
 		radeon_emit(cs, addr & 0xfffffffc);
 		radeon_emit(cs, (addr >> 32UL) & 0xff);
-- 
2.7.4