Mesa (master): radv: calculate and emit GFX9 GS registers to pipeline state.

Fri Oct 20 05:25:19 UTC 2017

Module: Mesa
Branch: master
Commit: 73749caf0e9bae845d8dd5aed181e4e3cb65c918
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=73749caf0e9bae845d8dd5aed181e4e3cb65c918

Author: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>
Date:   Fri Oct 20 02:24:24 2017 +0200

radv: calculate and emit GFX9 GS registers to pipeline state.

Reviewed-by: Dave Airlie <airlied at redhat.com>

---

 src/amd/vulkan/radv_cmd_buffer.c |  25 ++++++--
 src/amd/vulkan/radv_pipeline.c   | 126 ++++++++++++++++++++++++++++++++++++++-
 src/amd/vulkan/radv_private.h    |   8 +++
 src/amd/vulkan/radv_shader.c     |   6 ++
 4 files changed, 158 insertions(+), 7 deletions(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 3dc356ca8e..f4aa9e9b16 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -864,11 +864,26 @@ radv_emit_geometry_shader(struct radv_cmd_buffer *cmd_buffer,
 	ws->cs_add_buffer(cmd_buffer->cs, gs->bo, 8);
 	radv_emit_prefetch(cmd_buffer, va, gs->code_size);
 
-	radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B220_SPI_SHADER_PGM_LO_GS, 4);
-	radeon_emit(cmd_buffer->cs, va >> 8);
-	radeon_emit(cmd_buffer->cs, va >> 40);
-	radeon_emit(cmd_buffer->cs, gs->rsrc1);
-	radeon_emit(cmd_buffer->cs, gs->rsrc2);
+	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
+		radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B210_SPI_SHADER_PGM_LO_ES, 2);
+		radeon_emit(cmd_buffer->cs, va >> 8);
+		radeon_emit(cmd_buffer->cs, va >> 40);
+
+		radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B228_SPI_SHADER_PGM_RSRC1_GS, 2);
+		radeon_emit(cmd_buffer->cs, gs->rsrc1);
+		radeon_emit(cmd_buffer->cs, gs->rsrc2 |
+		                            S_00B22C_LDS_SIZE(pipeline->graphics.gs.lds_size));
+
+		radeon_set_context_reg(cmd_buffer->cs, R_028A44_VGT_GS_ONCHIP_CNTL, pipeline->graphics.gs.vgt_gs_onchip_cntl);
+		radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP, pipeline->graphics.gs.vgt_gs_max_prims_per_subgroup);
+		radeon_set_context_reg(cmd_buffer->cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE, pipeline->graphics.gs.vgt_esgs_ring_itemsize);
+	} else {
+		radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B220_SPI_SHADER_PGM_LO_GS, 4);
+		radeon_emit(cmd_buffer->cs, va >> 8);
+		radeon_emit(cmd_buffer->cs, va >> 40);
+		radeon_emit(cmd_buffer->cs, gs->rsrc1);
+		radeon_emit(cmd_buffer->cs, gs->rsrc2);
+	}
 
 	radv_emit_hw_vs(cmd_buffer, pipeline, pipeline->gs_copy_shader, &pipeline->gs_copy_shader->info.vs.outinfo);
 
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 21265c45df..eac4456a3f 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -1161,6 +1161,123 @@ radv_compute_vs_key(const VkGraphicsPipelineCreateInfo *pCreateInfo, bool as_es,
 	return key;
 }
 
+
+static void calculate_gfx9_gs_info(const VkGraphicsPipelineCreateInfo *pCreateInfo,
+                                   struct radv_pipeline *pipeline)
+{
+	struct ac_shader_variant_info *gs_info = &pipeline->shaders[MESA_SHADER_GEOMETRY]->info;
+	struct ac_es_output_info *es_info = radv_pipeline_has_tess(pipeline) ?
+		&gs_info->tes.es_info : &gs_info->vs.es_info;
+	unsigned gs_num_invocations = MAX2(gs_info->gs.invocations, 1);
+	bool uses_adjacency;
+	switch(pCreateInfo->pInputAssemblyState->topology) {
+	case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
+	case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
+	case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
+		uses_adjacency = false;
+		break;
+	default:
+		uses_adjacency = false;
+		break;
+	}
+
+	/* All these are in dwords: */
+	/* We can't allow using the whole LDS, because GS waves compete with
+	 * other shader stages for LDS space. */
+	const unsigned max_lds_size = 8 * 1024;
+	const unsigned esgs_itemsize = es_info->esgs_itemsize / 4;
+	unsigned esgs_lds_size;
+
+	/* All these are per subgroup: */
+	const unsigned max_out_prims = 32 * 1024;
+	const unsigned max_es_verts = 255;
+	const unsigned ideal_gs_prims = 64;
+	unsigned max_gs_prims, gs_prims;
+	unsigned min_es_verts, es_verts, worst_case_es_verts;
+
+	assert(gs_num_invocations <= 32); /* GL maximum */
+
+	if (uses_adjacency || gs_num_invocations > 1)
+		max_gs_prims = 127 / gs_num_invocations;
+	else
+		max_gs_prims = 255;
+
+	/* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations.
+	 * Make sure we don't go over the maximum value.
+	 */
+	if (gs_info->gs.vertices_out > 0) {
+		max_gs_prims = MIN2(max_gs_prims,
+				    max_out_prims /
+				    (gs_info->gs.vertices_out * gs_num_invocations));
+	}
+	assert(max_gs_prims > 0);
+
+	/* If the primitive has adjacency, halve the number of vertices
+	 * that will be reused in multiple primitives.
+	 */
+	min_es_verts = gs_info->gs.vertices_in / (uses_adjacency ? 2 : 1);
+
+	gs_prims = MIN2(ideal_gs_prims, max_gs_prims);
+	worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
+
+	/* Compute ESGS LDS size based on the worst case number of ES vertices
+	 * needed to create the target number of GS prims per subgroup.
+	 */
+	esgs_lds_size = esgs_itemsize * worst_case_es_verts;
+
+	/* If total LDS usage is too big, refactor partitions based on ratio
+	 * of ESGS item sizes.
+	 */
+	if (esgs_lds_size > max_lds_size) {
+		/* Our target GS Prims Per Subgroup was too large. Calculate
+		 * the maximum number of GS Prims Per Subgroup that will fit
+		 * into LDS, capped by the maximum that the hardware can support.
+		 */
+		gs_prims = MIN2((max_lds_size / (esgs_itemsize * min_es_verts)),
+				max_gs_prims);
+		assert(gs_prims > 0);
+		worst_case_es_verts = MIN2(min_es_verts * gs_prims,
+					   max_es_verts);
+
+		esgs_lds_size = esgs_itemsize * worst_case_es_verts;
+		assert(esgs_lds_size <= max_lds_size);
+	}
+
+	/* Now calculate remaining ESGS information. */
+	if (esgs_lds_size)
+		es_verts = MIN2(esgs_lds_size / esgs_itemsize, max_es_verts);
+	else
+		es_verts = max_es_verts;
+
+	/* Vertices for adjacency primitives are not always reused, so restore
+	 * it for ES_VERTS_PER_SUBGRP.
+	 */
+	min_es_verts = gs_info->gs.vertices_in;
+
+	/* For normal primitives, the VGT only checks if they are past the ES
+	 * verts per subgroup after allocating a full GS primitive and if they
+	 * are, kick off a new subgroup.  But if those additional ES verts are
+	 * unique (e.g. not reused) we need to make sure there is enough LDS
+	 * space to account for those ES verts beyond ES_VERTS_PER_SUBGRP.
+	 */
+	es_verts -= min_es_verts - 1;
+
+	uint32_t es_verts_per_subgroup = es_verts;
+	uint32_t gs_prims_per_subgroup = gs_prims;
+	uint32_t gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations;
+	uint32_t max_prims_per_subgroup = gs_inst_prims_in_subgroup * gs_info->gs.vertices_out;
+	pipeline->graphics.gs.lds_size = align(esgs_lds_size, 128) / 128;
+	pipeline->graphics.gs.vgt_gs_onchip_cntl =
+	                       S_028A44_ES_VERTS_PER_SUBGRP(es_verts_per_subgroup) |
+	                       S_028A44_GS_PRIMS_PER_SUBGRP(gs_prims_per_subgroup) |
+	                       S_028A44_GS_INST_PRIMS_IN_SUBGRP(gs_inst_prims_in_subgroup);
+	pipeline->graphics.gs.vgt_gs_max_prims_per_subgroup =
+	                       S_028A94_MAX_PRIMS_PER_SUBGROUP(max_prims_per_subgroup);
+	pipeline->graphics.gs.vgt_esgs_ring_itemsize  = esgs_itemsize;
+	assert(max_prims_per_subgroup <= max_out_prims);
+}
+
 static void
 calculate_gs_ring_sizes(struct radv_pipeline *pipeline)
 {
@@ -1194,7 +1311,9 @@ calculate_gs_ring_sizes(struct radv_pipeline *pipeline)
 	esgs_ring_size = align(esgs_ring_size, alignment);
 	gsvs_ring_size = align(gsvs_ring_size, alignment);
 
-	pipeline->graphics.esgs_ring_size = CLAMP(esgs_ring_size, min_esgs_ring_size, max_size);
+	if (pipeline->device->physical_device->rad_info.chip_class <= VI)
+		pipeline->graphics.esgs_ring_size = CLAMP(esgs_ring_size, min_esgs_ring_size, max_size);
+
 	pipeline->graphics.gsvs_ring_size = MIN2(gsvs_ring_size, max_size);
 }
 
@@ -1916,8 +2035,11 @@ radv_pipeline_init(struct radv_pipeline *pipeline,
 
 	pipeline->graphics.vgt_shader_stages_en = stages;
 
-	if (radv_pipeline_has_gs(pipeline))
+	if (radv_pipeline_has_gs(pipeline)) {
 		calculate_gs_ring_sizes(pipeline);
+		if (device->physical_device->rad_info.chip_class >= GFX9)
+			calculate_gfx9_gs_info(pCreateInfo, pipeline);
+	}
 
 	if (radv_pipeline_has_tess(pipeline)) {
 		if (pipeline->graphics.prim == V_008958_DI_PT_PATCH) {
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 10b7983760..6c3a1bbbb3 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -1050,6 +1050,13 @@ struct radv_tessellation_state {
 	uint32_t tf_param;
 };
 
+struct radv_gs_state {
+	uint32_t vgt_gs_onchip_cntl;
+	uint32_t vgt_gs_max_prims_per_subgroup;
+	uint32_t vgt_esgs_ring_itemsize;
+	uint32_t lds_size;
+};
+
 struct radv_vertex_elements_info {
 	uint32_t rsrc_word3[MAX_VERTEX_ATTRIBS];
 	uint32_t format_size[MAX_VERTEX_ATTRIBS];
@@ -1084,6 +1091,7 @@ struct radv_pipeline {
 			struct radv_raster_state raster;
 			struct radv_multisample_state ms;
 			struct radv_tessellation_state tess;
+			struct radv_gs_state gs;
 			uint32_t db_shader_control;
 			uint32_t shader_z_format;
 			unsigned prim;
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index 3688680107..a5e2826401 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -400,6 +400,12 @@ radv_fill_shader_variant(struct radv_device *device,
 	}
 
 	if (device->physical_device->rad_info.chip_class >= GFX9 &&
+	    stage == MESA_SHADER_GEOMETRY) {
+		/* TODO: Figure out how many we actually need. */
+		variant->rsrc1 |= S_00B228_GS_VGPR_COMP_CNT(3);
+		variant->rsrc2 |= S_00B22C_ES_VGPR_COMP_CNT(3) |
+		                  S_00B22C_OC_LDS_EN(1);
+	} else if (device->physical_device->rad_info.chip_class >= GFX9 &&
 	    stage == MESA_SHADER_TESS_CTRL)
 		variant->rsrc1 |= S_00B428_LS_VGPR_COMP_CNT(vgpr_comp_cnt);
 	else