[Mesa-dev] [PATCH 5/5] radeonsi: implement ARB_draw_indirect

Marek Olšák maraeo at gmail.com
Mon Jul 7 18:37:06 PDT 2014


From: Marek Olšák <marek.olsak at amd.com>

---
 docs/GL3.txt                                 |  4 +-
 docs/relnotes/10.3.html                      |  2 +
 src/gallium/drivers/radeonsi/si_commands.c   | 53 ++++++++++++++++++++
 src/gallium/drivers/radeonsi/si_pipe.c       |  1 +
 src/gallium/drivers/radeonsi/si_state.h      |  7 +++
 src/gallium/drivers/radeonsi/si_state_draw.c | 73 ++++++++++++++++++++++------
 src/gallium/drivers/radeonsi/sid.h           | 11 ++++-
 7 files changed, 132 insertions(+), 19 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 296e14c..15b8c87 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -98,7 +98,7 @@ GL 4.0:
 
   GLSL 4.0                                             not started
   GL_ARB_draw_buffers_blend                            DONE (i965, nv50, nvc0, r600, radeonsi, softpipe)
-  GL_ARB_draw_indirect                                 DONE (i965)
+  GL_ARB_draw_indirect                                 DONE (i965, radeonsi, softpipe, llvmpipe)
   GL_ARB_gpu_shader5                                   started
   - 'precise' qualifier                                DONE
   - Dynamically uniform sampler array indices          started (Chris)
@@ -165,7 +165,7 @@ GL 4.3:
   GL_ARB_framebuffer_no_attachments                    not started
   GL_ARB_internalformat_query2                         not started
   GL_ARB_invalidate_subdata                            DONE (all drivers)
-  GL_ARB_multi_draw_indirect                           DONE (i965)
+  GL_ARB_multi_draw_indirect                           DONE (i965, radeonsi, softpipe, llvmpipe)
   GL_ARB_program_interface_query                       not started
   GL_ARB_robust_buffer_access_behavior                 not started
   GL_ARB_shader_image_size                             not started
diff --git a/docs/relnotes/10.3.html b/docs/relnotes/10.3.html
index 2e718fc..6140ff6 100644
--- a/docs/relnotes/10.3.html
+++ b/docs/relnotes/10.3.html
@@ -45,7 +45,9 @@ Note: some of the new features are only available with certain drivers.
 
 <ul>
 <li>GL_ARB_compressed_texture_pixel_storage on all drivers</li>
+<li>GL_ARB_draw_indirect on radeonsi</li>
 <li>GL_ARB_explicit_uniform_location (all drivers that support GLSL)</li>
+<li>GL_ARB_multi_draw_indirect on radeonsi</li>
 <li>GL_ARB_sample_shading on radeonsi</li>
 <li>GL_ARB_stencil_texturing on nv50, nvc0, r600, and radeonsi</li>
 <li>GL_ARB_texture_cube_map_array on radeonsi</li>
diff --git a/src/gallium/drivers/radeonsi/si_commands.c b/src/gallium/drivers/radeonsi/si_commands.c
index 5ddc40e..2efdeda 100644
--- a/src/gallium/drivers/radeonsi/si_commands.c
+++ b/src/gallium/drivers/radeonsi/si_commands.c
@@ -57,6 +57,89 @@ void si_cmd_draw_index_auto(struct si_pm4_state *pm4, uint32_t count,
 	si_pm4_cmd_end(pm4, predicate);
 }
 
+/*
+ * Emit a PKT3_SET_BASE packet: a literal 1 followed by the low and high
+ * dwords of indirect_va.
+ *
+ * NOTE(review): the 1 is presumably the base index that selects the base
+ * address the *_INDIRECT draw packets apply their offset to -- confirm
+ * against the PM4 packet documentation.
+ */
+static void si_cmd_set_indirect_base(struct si_pm4_state *pm4,
+				     uint64_t indirect_va, bool predicate)
+{
+	si_pm4_cmd_begin(pm4, PKT3_SET_BASE);
+	si_pm4_cmd_add(pm4, 1);
+	si_pm4_cmd_add(pm4, indirect_va);
+	si_pm4_cmd_add(pm4, indirect_va >> 32);
+	si_pm4_cmd_end(pm4, predicate);
+}
+
+/*
+ * Emit a non-indexed indirect draw: PKT3_SET_BASE for the indirect buffer
+ * followed by PKT3_DRAW_INDIRECT with the auto-index source select.
+ *
+ * indirect_va must be 8-byte aligned and indirect_offset a multiple of 4
+ * (both asserted). base_vtx_loc and start_inst_loc are register locations;
+ * the packet receives them as dword indices relative to SI_SH_REG_OFFSET.
+ * predicate is forwarded to si_pm4_cmd_end() for every packet.
+ */
+void si_cmd_draw_indirect(struct si_pm4_state *pm4, uint64_t indirect_va,
+			  uint32_t indirect_offset, uint32_t base_vtx_loc,
+			  uint32_t start_inst_loc, bool predicate)
+{
+	assert(indirect_va % 8 == 0);
+	assert(indirect_offset % 4 == 0);
+
+	si_cmd_set_indirect_base(pm4, indirect_va, predicate);
+
+	si_pm4_cmd_begin(pm4, PKT3_DRAW_INDIRECT);
+	si_pm4_cmd_add(pm4, indirect_offset);
+	si_pm4_cmd_add(pm4, (base_vtx_loc - SI_SH_REG_OFFSET) >> 2);
+	si_pm4_cmd_add(pm4, (start_inst_loc - SI_SH_REG_OFFSET) >> 2);
+	si_pm4_cmd_add(pm4, V_0287F0_DI_SRC_SEL_AUTO_INDEX);
+	si_pm4_cmd_end(pm4, predicate);
+}
+
+/*
+ * Emit an indexed indirect draw: PKT3_SET_BASE for the indirect buffer,
+ * PKT3_INDEX_BASE and PKT3_INDEX_BUFFER_SIZE for the index buffer, then
+ * PKT3_DRAW_INDEX_INDIRECT with the DMA source select.
+ *
+ * indirect_va must be 8-byte aligned, index_va 2-byte aligned and
+ * indirect_offset a multiple of 4 (all asserted). base_vtx_loc and
+ * start_inst_loc are register locations; the packet receives them as dword
+ * indices relative to SI_SH_REG_OFFSET. predicate is forwarded to
+ * si_pm4_cmd_end() for every packet.
+ */
+void si_cmd_draw_index_indirect(struct si_pm4_state *pm4, uint64_t indirect_va,
+				uint64_t index_va, uint32_t index_max_size,
+				uint32_t indirect_offset, uint32_t base_vtx_loc,
+				uint32_t start_inst_loc, bool predicate)
+{
+	assert(indirect_va % 8 == 0);
+	assert(index_va % 2 == 0);
+	assert(indirect_offset % 4 == 0);
+
+	si_cmd_set_indirect_base(pm4, indirect_va, predicate);
+
+	si_pm4_cmd_begin(pm4, PKT3_INDEX_BASE);
+	si_pm4_cmd_add(pm4, index_va);
+	si_pm4_cmd_add(pm4, index_va >> 32);
+	si_pm4_cmd_end(pm4, predicate);
+
+	si_pm4_cmd_begin(pm4, PKT3_INDEX_BUFFER_SIZE);
+	si_pm4_cmd_add(pm4, index_max_size);
+	si_pm4_cmd_end(pm4, predicate);
+
+	si_pm4_cmd_begin(pm4, PKT3_DRAW_INDEX_INDIRECT);
+	si_pm4_cmd_add(pm4, indirect_offset);
+	si_pm4_cmd_add(pm4, (base_vtx_loc - SI_SH_REG_OFFSET) >> 2);
+	si_pm4_cmd_add(pm4, (start_inst_loc - SI_SH_REG_OFFSET) >> 2);
+	si_pm4_cmd_add(pm4, V_0287F0_DI_SRC_SEL_DMA);
+	si_pm4_cmd_end(pm4, predicate);
+}
+
 void si_cmd_surface_sync(struct si_pm4_state *pm4, uint32_t cp_coher_cntl)
 {
 	if (pm4->chip_class >= CIK) {
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 2df6333..9ad1189 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -214,6 +214,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
 	case PIPE_CAP_CUBE_MAP_ARRAY:
 	case PIPE_CAP_SAMPLE_SHADING:
+	case PIPE_CAP_DRAW_INDIRECT:
 		return 1;
 
 	case PIPE_CAP_TEXTURE_MULTISAMPLE:
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index c051d73..e0fd2ff 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -250,6 +250,13 @@ void si_cmd_draw_index_2(struct si_pm4_state *pm4, uint32_t max_size,
 			 uint32_t initiator, bool predicate);
 void si_cmd_draw_index_auto(struct si_pm4_state *pm4, uint32_t count,
 			    uint32_t initiator, bool predicate);
+void si_cmd_draw_indirect(struct si_pm4_state *pm4, uint64_t indirect_va,
+			  uint32_t indirect_offset, uint32_t base_vtx_loc,
+			  uint32_t start_inst_loc, bool predicate);
+void si_cmd_draw_index_indirect(struct si_pm4_state *pm4, uint64_t indirect_va,
+				uint64_t index_va, uint32_t index_max_size,
+				uint32_t indirect_offset, uint32_t base_vtx_loc,
+				uint32_t start_inst_loc, bool predicate);
 void si_cmd_surface_sync(struct si_pm4_state *pm4, uint32_t cp_coher_cntl);
 
 #endif
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index e2b29c3..bac1846 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -783,15 +783,18 @@ static void si_state_draw(struct si_context *sctx,
 	}
 	si_pm4_cmd_end(pm4, sctx->b.predicate_drawing);
 
-	si_pm4_cmd_begin(pm4, PKT3_NUM_INSTANCES);
-	si_pm4_cmd_add(pm4, info->instance_count);
-	si_pm4_cmd_end(pm4, sctx->b.predicate_drawing);
-
 	if (!info->indirect) {
+		si_pm4_cmd_begin(pm4, PKT3_NUM_INSTANCES);
+		si_pm4_cmd_add(pm4, info->instance_count);
+		si_pm4_cmd_end(pm4, sctx->b.predicate_drawing);
+
 		si_pm4_set_reg(pm4, sh_base_reg + SI_SGPR_BASE_VERTEX * 4,
 			       info->indexed ? info->index_bias : info->start);
 		si_pm4_set_reg(pm4, sh_base_reg + SI_SGPR_START_INSTANCE * 4,
 			       info->start_instance);
+	} else {
+		si_pm4_add_bo(pm4, (struct r600_resource *)info->indirect,
+			      RADEON_USAGE_READ, RADEON_PRIO_MIN);
 	}
 
 	if (info->indexed) {
@@ -803,14 +806,35 @@ static void si_state_draw(struct si_context *sctx,
 
 		si_pm4_add_bo(pm4, (struct r600_resource *)ib->buffer, RADEON_USAGE_READ,
 			      RADEON_PRIO_MIN);
-		va += info->start * ib->index_size;
-		si_cmd_draw_index_2(pm4, max_size, va, info->count,
-				    V_0287F0_DI_SRC_SEL_DMA,
-				    sctx->b.predicate_drawing);
+
+		if (info->indirect) {
+			uint64_t indirect_va = r600_resource_va(&sctx->screen->b.b,
+								info->indirect);
+			si_cmd_draw_index_indirect(pm4, indirect_va, va, max_size,
+						   info->indirect_offset,
+						   sh_base_reg + SI_SGPR_BASE_VERTEX * 4,
+						   sh_base_reg + SI_SGPR_START_INSTANCE * 4,
+						   sctx->b.predicate_drawing);
+		} else {
+			va += info->start * ib->index_size;
+			si_cmd_draw_index_2(pm4, max_size, va, info->count,
+					    V_0287F0_DI_SRC_SEL_DMA,
+					    sctx->b.predicate_drawing);
+		}
 	} else {
-		uint32_t initiator = V_0287F0_DI_SRC_SEL_AUTO_INDEX;
-		initiator |= S_0287F0_USE_OPAQUE(!!info->count_from_stream_output);
-		si_cmd_draw_index_auto(pm4, info->count, initiator, sctx->b.predicate_drawing);
+		if (info->indirect) {
+			uint64_t indirect_va = r600_resource_va(&sctx->screen->b.b,
+								info->indirect);
+			si_cmd_draw_indirect(pm4, indirect_va, info->indirect_offset,
+					     sh_base_reg + SI_SGPR_BASE_VERTEX * 4,
+					     sh_base_reg + SI_SGPR_START_INSTANCE * 4,
+					     sctx->b.predicate_drawing);
+		} else {
+			si_cmd_draw_index_auto(pm4, info->count,
+					       V_0287F0_DI_SRC_SEL_AUTO_INDEX |
+					       S_0287F0_USE_OPAQUE(!!info->count_from_stream_output),
+					       sctx->b.predicate_drawing);
+		}
 	}
 
 	si_pm4_set_state(sctx, draw, pm4);
@@ -898,13 +922,32 @@ void si_emit_cache_flush(struct r600_common_context *sctx, struct r600_atom *ato
 
 const struct r600_atom si_atom_cache_flush = { si_emit_cache_flush, 13 }; /* number of CS dwords */
 
+/* Return the draw's start and count; an indirect draw reads them back
+ * from dwords 2 and 0 at indirect_offset in the mapped indirect buffer. */
+static void si_get_draw_start_count(struct si_context *sctx,
+				    const struct pipe_draw_info *info,
+				    unsigned *start, unsigned *count)
+{
+	*start = info->start;
+	*count = info->count;
+	if (info->indirect) {
+		int *args = r600_buffer_map_sync_with_rings(&sctx->b,
+				(struct r600_resource *)info->indirect,
+				PIPE_TRANSFER_READ);
+		args += info->indirect_offset / sizeof(int);
+		*start = args[2];
+		*count = args[0];
+	}
+}
+
 void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct pipe_index_buffer ib = {};
 	uint32_t i;
 
-	if (!info->count && (info->indexed || !info->count_from_stream_output))
+	if (!info->count && !info->indirect &&
+	    (info->indexed || !info->count_from_stream_output))
 		return;
 
 	if (!sctx->ps_shader || !sctx->vs_shader)
@@ -926,8 +969,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 			unsigned out_offset, start, count, start_offset;
 			void *ptr;
 
-			start = info->start;
-			count = info->count;
+			si_get_draw_start_count(sctx, info, &start, &count);
 			start_offset = start * ib.index_size;
 
 			u_upload_alloc(sctx->b.uploader, start_offset, count * 2,
@@ -946,8 +988,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 		} else if (ib.user_buffer && !ib.buffer) {
 			unsigned start, count, start_offset;
 
-			start = info->start;
-			count = info->count;
+			si_get_draw_start_count(sctx, info, &start, &count);
 			start_offset = start * ib.index_size;
 
 			u_upload_data(sctx->b.uploader, start_offset, count * ib.index_size,
diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h
index e3f788e..3241725 100644
--- a/src/gallium/drivers/radeonsi/sid.h
+++ b/src/gallium/drivers/radeonsi/sid.h
@@ -70,18 +70,27 @@
 #define R600_TEXEL_PITCH_ALIGNMENT_MASK        0x7
 
 #define PKT3_NOP                               0x10
+#define PKT3_SET_BASE                          0x11
+#define PKT3_CLEAR_STATE                       0x12
+#define PKT3_INDEX_BUFFER_SIZE                 0x13
 #define PKT3_DISPATCH_DIRECT                   0x15
 #define PKT3_DISPATCH_INDIRECT                 0x16
 #define PKT3_OCCLUSION_QUERY                   0x1F /* new for CIK */
 #define PKT3_SET_PREDICATION                   0x20
 #define PKT3_COND_EXEC                         0x22
 #define PKT3_PRED_EXEC                         0x23
+#define PKT3_DRAW_INDIRECT                     0x24
+#define PKT3_DRAW_INDEX_INDIRECT               0x25
+#define PKT3_INDEX_BASE                        0x26
 #define PKT3_DRAW_INDEX_2                      0x27
 #define PKT3_CONTEXT_CONTROL                   0x28
 #define PKT3_INDEX_TYPE                        0x2A
+#define PKT3_DRAW_INDIRECT_MULTI               0x2C
 #define PKT3_DRAW_INDEX_AUTO                   0x2D
 #define PKT3_DRAW_INDEX_IMMD                   0x2E /* not on CIK */
 #define PKT3_NUM_INSTANCES                     0x2F
+#define PKT3_DRAW_INDEX_MULTI_AUTO             0x30
+#define PKT3_INDIRECT_BUFFER                   0x32
 #define PKT3_STRMOUT_BUFFER_UPDATE             0x34
 #define PKT3_DRAW_INDEX_OFFSET_2               0x35
 #define PKT3_DRAW_PREAMBLE                     0x36 /* new on CIK, required on GFX7.2 and later */
@@ -99,12 +108,12 @@
 #define PKT3_WRITE_DATA_ENGINE_SEL_ME              0
 #define PKT3_WRITE_DATA_ENGINE_SEL_PFP             1
 #define PKT3_WRITE_DATA_ENGINE_SEL_CE              2
+#define PKT3_DRAW_INDEX_INDIRECT_MULTI         0x38
 #define PKT3_MEM_SEMAPHORE                     0x39
 #define PKT3_MPEG_INDEX                        0x3A /* not on CIK */
 #define PKT3_WAIT_REG_MEM                      0x3C
 #define		WAIT_REG_MEM_EQUAL		3
 #define PKT3_MEM_WRITE                         0x3D /* not on CIK */
-#define PKT3_INDIRECT_BUFFER                   0x32
 #define PKT3_COPY_DATA			       0x40
 #define		COPY_DATA_SRC_SEL(x)		((x) & 0xf)
 #define			COPY_DATA_REG		0
-- 
1.9.1



More information about the mesa-dev mailing list