[Mesa-dev] [PATCH v2 24/25] radeonsi: shrink each vertex stream to the actually required size

Nicolai Hähnle nhaehnle at gmail.com
Tue Dec 6 10:48:35 UTC 2016


From: Nicolai Hähnle <nicolai.haehnle at amd.com>

---
 src/gallium/drivers/radeonsi/si_shader.c        | 28 ++++++++++++-------
 src/gallium/drivers/radeonsi/si_state_shaders.c | 37 +++++++++++++++----------
 2 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index b22248e..5604bbb 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -5715,62 +5715,70 @@ static void preload_ring_buffers(struct si_shader_context *ctx)
 		ctx->esgs_ring =
 			build_indexed_load_const(ctx, buf_ptr, offset);
 	}
 
 	if (ctx->shader->is_gs_copy_shader) {
 		LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS);
 
 		ctx->gsvs_ring[0] =
 			build_indexed_load_const(ctx, buf_ptr, offset);
 	} else if (ctx->type == PIPE_SHADER_GEOMETRY) {
+		const struct si_shader_selector *sel = ctx->shader->selector;
 		struct lp_build_context *uint = &ctx->soa.bld_base.uint_bld;
 		LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS);
 		LLVMValueRef base_ring;
 
 		base_ring = build_indexed_load_const(ctx, buf_ptr, offset);
 
 		/* The conceptual layout of the GSVS ring is
 		 *   v0c0 .. vLv0 v0c1 .. vLc1 ..
 		 * but the real memory layout is swizzled across
 		 * threads:
 		 *   t0v0c0 .. t15v0c0 t0v1c0 .. t15v1c0 ... t15vLcL
 		 *   t16v0c0 ..
 		 * Override the buffer descriptor accordingly.
 		 */
 		LLVMTypeRef v2i64 = LLVMVectorType(ctx->i64, 2);
-		unsigned max_gsvs_emit_size = ctx->shader->selector->max_gsvs_emit_size;
-		unsigned num_records;
-
-		num_records = 64;
-		if (ctx->screen->b.chip_class >= VI)
-			num_records *= max_gsvs_emit_size;
+		uint64_t stream_offset = 0;
 
 		for (unsigned stream = 0; stream < 4; ++stream) {
+			unsigned num_components;
+			unsigned stride;
+			unsigned num_records;
 			LLVMValueRef ring, tmp;
 
-			if (!ctx->shader->selector->info.num_stream_output_components[stream])
+			num_components = sel->info.num_stream_output_components[stream];
+			if (!num_components)
 				continue;
 
+			stride = 4 * num_components * sel->gs_max_out_vertices;
+
 			/* Limit on the stride field for <= CIK. */
-			assert(max_gsvs_emit_size < (1 << 14));
+			assert(stride < (1 << 14));
+
+			num_records = 64;
+			if (ctx->screen->b.chip_class >= VI)
+				num_records *= stride;
 
 			ring = LLVMBuildBitCast(builder, base_ring, v2i64, "");
 			tmp = LLVMBuildExtractElement(builder, ring, uint->zero, "");
 			tmp = LLVMBuildAdd(builder, tmp,
 					   LLVMConstInt(ctx->i64,
-							max_gsvs_emit_size * 64 * stream, 0), "");
+							stream_offset, 0), "");
+			stream_offset += stride * 64;
+
 			ring = LLVMBuildInsertElement(builder, ring, tmp, uint->zero, "");
 			ring = LLVMBuildBitCast(builder, ring, ctx->v4i32, "");
 			tmp = LLVMBuildExtractElement(builder, ring, uint->one, "");
 			tmp = LLVMBuildOr(builder, tmp,
 				LLVMConstInt(ctx->i32,
-					     S_008F04_STRIDE(max_gsvs_emit_size) |
+					     S_008F04_STRIDE(stride) |
 					     S_008F04_SWIZZLE_ENABLE(1), 0), "");
 			ring = LLVMBuildInsertElement(builder, ring, tmp, uint->one, "");
 			ring = LLVMBuildInsertElement(builder, ring,
 					LLVMConstInt(ctx->i32, num_records, 0),
 					LLVMConstInt(ctx->i32, 2, 0), "");
 			ring = LLVMBuildInsertElement(builder, ring,
 				LLVMConstInt(ctx->i32,
 					     S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
 					     S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
 					     S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 1e9f5f0..151ed17 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -461,48 +461,55 @@ static uint32_t si_vgt_gs_mode(struct si_shader_selector *sel)
 	}
 
 	return S_028A40_MODE(V_028A40_GS_SCENARIO_G) |
 	       S_028A40_CUT_MODE(cut_mode)|
 	       S_028A40_ES_WRITE_OPTIMIZE(1) |
 	       S_028A40_GS_WRITE_OPTIMIZE(1);
 }
 
 static void si_shader_gs(struct si_shader *shader)
 {
-	unsigned gs_vert_itemsize = shader->selector->gsvs_vertex_size;
-	unsigned gsvs_itemsize = shader->selector->max_gsvs_emit_size >> 2;
-	unsigned gs_num_invocations = shader->selector->gs_num_invocations;
+	struct si_shader_selector *sel = shader->selector;
+	const ubyte *num_components = sel->info.num_stream_output_components;
+	unsigned gs_num_invocations = sel->gs_num_invocations;
 	struct si_pm4_state *pm4;
 	uint64_t va;
-	unsigned max_stream = shader->selector->max_gs_stream;
-
-	/* The GSVS_RING_ITEMSIZE register takes 15 bits */
-	assert(gsvs_itemsize < (1 << 15));
+	unsigned max_stream = sel->max_gs_stream;
+	unsigned offset;
 
 	pm4 = si_get_shader_pm4_state(shader);
 	if (!pm4)
 		return;
 
 	si_pm4_set_reg(pm4, R_028A40_VGT_GS_MODE, si_vgt_gs_mode(shader->selector));
 
-	si_pm4_set_reg(pm4, R_028A60_VGT_GSVS_RING_OFFSET_1, gsvs_itemsize);
-	si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize * ((max_stream >= 2) ? 2 : 1));
-	si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize * ((max_stream >= 3) ? 3 : 1));
+	offset = num_components[0] * sel->gs_max_out_vertices;
+	si_pm4_set_reg(pm4, R_028A60_VGT_GSVS_RING_OFFSET_1, offset);
+	if (max_stream >= 1)
+		offset += num_components[1] * sel->gs_max_out_vertices;
+	si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, offset);
+	if (max_stream >= 2)
+		offset += num_components[2] * sel->gs_max_out_vertices;
+	si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, offset);
+	if (max_stream >= 3)
+		offset += num_components[3] * sel->gs_max_out_vertices;
+	si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, offset);
 
-	si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize * (max_stream + 1));
+	/* The GSVS_RING_ITEMSIZE register takes 15 bits */
+	assert(offset < (1 << 15));
 
 	si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, shader->selector->gs_max_out_vertices);
 
-	si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, gs_vert_itemsize >> 2);
-	si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, (max_stream >= 1) ? gs_vert_itemsize >> 2 : 0);
-	si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, (max_stream >= 2) ? gs_vert_itemsize >> 2 : 0);
-	si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, (max_stream >= 3) ? gs_vert_itemsize >> 2 : 0);
+	si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, num_components[0]);
+	si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, (max_stream >= 1) ? num_components[1] : 0);
+	si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, (max_stream >= 2) ? num_components[2] : 0);
+	si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, (max_stream >= 3) ? num_components[3] : 0);
 
 	si_pm4_set_reg(pm4, R_028B90_VGT_GS_INSTANCE_CNT,
 		       S_028B90_CNT(MIN2(gs_num_invocations, 127)) |
 		       S_028B90_ENABLE(gs_num_invocations > 0));
 
 	va = shader->bo->gpu_address;
 	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
 	si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8);
 	si_pm4_set_reg(pm4, R_00B224_SPI_SHADER_PGM_HI_GS, va >> 40);
 
-- 
2.7.4



More information about the mesa-dev mailing list