Mesa (master): radv/gfx10: implement NGG GS queries

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Wed Jan 29 17:31:48 UTC 2020


Module: Mesa
Branch: master
Commit: e4752dafede30fbfc93208d9d4091873a8bd5d31
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=e4752dafede30fbfc93208d9d4091873a8bd5d31

Author: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Date:   Mon Jan 13 18:30:50 2020 +0100

radv/gfx10: implement NGG GS queries

The number of generated primitives is only counted by the hardware
if GS uses the legacy path. For NGG GS, we need to accumulate that
value in the NGG GS itself. To achieve that, we use a plain GDS
atomic operation.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3380>

---

 src/amd/vulkan/radv_cmd_buffer.c  | 35 +++++++++++++++++
 src/amd/vulkan/radv_nir_to_llvm.c | 27 +++++++++++++
 src/amd/vulkan/radv_private.h     |  6 ++-
 src/amd/vulkan/radv_query.c       | 79 +++++++++++++++++++++++++++++++++++++--
 src/amd/vulkan/radv_shader.h      |  3 +-
 src/amd/vulkan/radv_shader_args.c |  8 ++++
 src/amd/vulkan/radv_shader_args.h |  3 ++
 7 files changed, 155 insertions(+), 6 deletions(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index f81e5fa91dc..21ef5caa8e5 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -2574,6 +2574,35 @@ radv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer)
 	cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_BUFFER;
 }
 
+static void
+radv_flush_ngg_gs_state(struct radv_cmd_buffer *cmd_buffer)
+{
+	struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
+	struct radv_userdata_info *loc;
+	uint32_t ngg_gs_state = 0;
+	uint32_t base_reg;
+
+	if (!radv_pipeline_has_gs(pipeline) ||
+	    !radv_pipeline_has_ngg(pipeline))
+		return;
+
+	/* By default NGG GS queries are disabled but they are enabled if the
+	 * command buffer has active GDS queries or if it's a secondary command
+	 * buffer that inherits the number of generated primitives.
+	 */
+	if (cmd_buffer->state.active_pipeline_gds_queries ||
+	    (cmd_buffer->state.inherited_pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT))
+		ngg_gs_state = 1;
+
+	loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_GEOMETRY,
+				    AC_UD_NGG_GS_STATE);
+	base_reg = pipeline->user_data_0[MESA_SHADER_GEOMETRY];
+	assert(loc->sgpr_idx != -1);
+
+	radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
+			  ngg_gs_state);
+}
+
 static void
 radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
 {
@@ -2581,6 +2610,7 @@ radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool
 	radv_flush_streamout_descriptors(cmd_buffer);
 	radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS);
 	radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS);
+	radv_flush_ngg_gs_state(cmd_buffer);
 }
 
 struct radv_draw_info {
@@ -3349,6 +3379,9 @@ VkResult radv_BeginCommandBuffer(
 				return result;
 		}
 
+		cmd_buffer->state.inherited_pipeline_statistics =
+			pBeginInfo->pInheritanceInfo->pipelineStatistics;
+
 		radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
 	}
 
@@ -4089,6 +4122,8 @@ void radv_CmdExecuteCommands(
 			primary->tess_rings_needed = true;
 		if (secondary->sample_positions_needed)
 			primary->sample_positions_needed = true;
+		if (secondary->gds_needed)
+			primary->gds_needed = true;
 
 		if (!secondary->state.framebuffer &&
 		    (primary->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)) {
diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c
index 58b679a35ae..422ffa17699 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -3170,6 +3170,33 @@ static void gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx)
 		build_streamout(ctx, &nggso);
 	}
 
+	/* Write shader query data. */
+	tmp = ac_get_arg(&ctx->ac, ctx->args->ngg_gs_state);
+	tmp = LLVMBuildTrunc(builder, tmp, ctx->ac.i1, "");
+	ac_build_ifcc(&ctx->ac, tmp, 5109);
+	tmp = LLVMBuildICmp(builder, LLVMIntULT, tid,
+			    LLVMConstInt(ctx->ac.i32, 4, false), "");
+	ac_build_ifcc(&ctx->ac, tmp, 5110);
+	{
+		tmp = LLVMBuildLoad(builder, ac_build_gep0(&ctx->ac, ctx->gs_ngg_scratch, tid), "");
+
+		ac_llvm_add_target_dep_function_attr(ctx->main_function,
+						     "amdgpu-gds-size", 256);
+
+		LLVMTypeRef gdsptr = LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GDS);
+		LLVMValueRef gdsbase = LLVMBuildIntToPtr(builder, ctx->ac.i32_0, gdsptr, "");
+
+		const char *sync_scope = LLVM_VERSION_MAJOR >= 9 ? "workgroup-one-as" : "workgroup";
+
+		/* Use a plain GDS atomic to accumulate the number of generated
+		 * primitives.
+		 */
+		ac_build_atomic_rmw(&ctx->ac, LLVMAtomicRMWBinOpAdd, gdsbase,
+				    tmp, sync_scope);
+	}
+	ac_build_endif(&ctx->ac, 5110);
+	ac_build_endif(&ctx->ac, 5109);
+
 	/* TODO: culling */
 
 	/* Determine vertex liveness. */
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 4494d595074..ca7d9a084f5 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -1260,6 +1260,7 @@ struct radv_cmd_state {
 	unsigned                                     active_occlusion_queries;
 	bool                                         perfect_occlusion_queries_enabled;
 	unsigned                                     active_pipeline_queries;
+	unsigned                                     active_pipeline_gds_queries;
 	float					     offset_scale;
 	uint32_t                                      trace_id;
 	uint32_t                                      last_ia_multi_vgt_param;
@@ -1275,6 +1276,9 @@ struct radv_cmd_state {
 	int predication_type; /* -1: disabled, 0: normal, 1: inverted */
 	uint64_t predication_va;
 
+	/* Inheritance info. */
+	VkQueryPipelineStatisticFlags inherited_pipeline_statistics;
+
 	bool context_roll_without_scissor_emitted;
 };
 
@@ -1333,7 +1337,7 @@ struct radv_cmd_buffer {
 	uint32_t esgs_ring_size_needed;
 	uint32_t gsvs_ring_size_needed;
 	bool tess_rings_needed;
-	bool gds_needed; /* for GFX10 streamout */
+	bool gds_needed; /* for GFX10 streamout and NGG GS queries */
 	bool gds_oa_needed; /* for GFX10 streamout */
 	bool sample_positions_needed;
 
diff --git a/src/amd/vulkan/radv_query.c b/src/amd/vulkan/radv_query.c
index f59e435e018..6f660c109e6 100644
--- a/src/amd/vulkan/radv_query.c
+++ b/src/amd/vulkan/radv_query.c
@@ -40,6 +40,14 @@
 static const int pipelinestat_block_size = 11 * 8;
 static const unsigned pipeline_statistics_indices[] = {7, 6, 3, 4, 5, 2, 1, 0, 8, 9, 10};
 
+static unsigned
+radv_get_pipeline_statistics_index(const VkQueryPipelineStatisticFlagBits flag)
+{
+	int offset = ffs(flag) - 1;
+	assert(offset < ARRAY_SIZE(pipeline_statistics_indices));
+	return pipeline_statistics_indices[offset];
+}
+
 static nir_ssa_def *nir_test_flag(nir_builder *b, nir_ssa_def *flags, uint32_t flag)
 {
 	return nir_i2b(b, nir_iand(b, flags, nir_imm_int(b, flag)));
@@ -1261,6 +1269,22 @@ static void radv_query_shader(struct radv_cmd_buffer *cmd_buffer,
 	radv_meta_restore(&saved_state, cmd_buffer);
 }
 
+static bool
+radv_query_pool_needs_gds(struct radv_device *device,
+			  struct radv_query_pool *pool)
+{
+	/* The number of primitives generated by geometry shader invocations is
+	 * only counted by the hardware if GS uses the legacy path. When NGG GS
+	 * is used, the hardware can't know the number of generated primitives
+	 * and we have to do it manually inside the shader. To achieve that, the
+	 * driver does a plain GDS atomic to accumulate that value.
+	 * TODO: fix use of NGG GS and non-NGG GS inside the same begin/end
+	 * query.
+	 */
+	return device->physical_device->use_ngg &&
+	       (pool->pipeline_stats_mask & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
+}
+
 VkResult radv_CreateQueryPool(
 	VkDevice                                    _device,
 	const VkQueryPoolCreateInfo*                pCreateInfo,
@@ -1725,6 +1749,7 @@ static unsigned event_type_for_stream(unsigned stream)
 }
 
 static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer,
+			     struct radv_query_pool *pool,
 			     uint64_t va,
 			     VkQueryType query_type,
 			     VkQueryControlFlags flags,
@@ -1776,6 +1801,30 @@ static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer,
 		radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
 		radeon_emit(cs, va);
 		radeon_emit(cs, va >> 32);
+
+		if (radv_query_pool_needs_gds(cmd_buffer->device, pool)) {
+			int idx = radv_get_pipeline_statistics_index(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
+
+			/* Make sure GDS is idle before copying the value. */
+			cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
+							RADV_CMD_FLAG_INV_L2;
+			si_emit_cache_flush(cmd_buffer);
+
+			va += 8 * idx;
+
+			si_cs_emit_write_event_eop(cs,
+						   cmd_buffer->device->physical_device->rad_info.chip_class,
+						   radv_cmd_buffer_uses_mec(cmd_buffer),
+						   V_028A90_PS_DONE, 0,
+						   EOP_DST_SEL_TC_L2,
+						   EOP_DATA_SEL_GDS,
+						   va, EOP_DATA_GDS(0, 1), 0);
+
+			/* Record that the command buffer needs GDS. */
+			cmd_buffer->gds_needed = true;
+
+			cmd_buffer->state.active_pipeline_gds_queries++;
+		}
 		break;
 	case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
 		radeon_check_space(cmd_buffer->device->ws, cs, 4);
@@ -1794,6 +1843,7 @@ static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer,
 }
 
 static void emit_end_query(struct radv_cmd_buffer *cmd_buffer,
+			   struct radv_query_pool *pool,
 			   uint64_t va, uint64_t avail_va,
 			   VkQueryType query_type, uint32_t index)
 {
@@ -1841,6 +1891,27 @@ static void emit_end_query(struct radv_cmd_buffer *cmd_buffer,
 					   EOP_DATA_SEL_VALUE_32BIT,
 					   avail_va, 1,
 					   cmd_buffer->gfx9_eop_bug_va);
+
+		if (radv_query_pool_needs_gds(cmd_buffer->device, pool)) {
+			int idx = radv_get_pipeline_statistics_index(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
+
+			/* Make sure GDS is idle before copying the value. */
+			cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
+							RADV_CMD_FLAG_INV_L2;
+			si_emit_cache_flush(cmd_buffer);
+
+			va += 8 * idx;
+
+			si_cs_emit_write_event_eop(cs,
+						   cmd_buffer->device->physical_device->rad_info.chip_class,
+						   radv_cmd_buffer_uses_mec(cmd_buffer),
+						   V_028A90_PS_DONE, 0,
+						   EOP_DST_SEL_TC_L2,
+						   EOP_DATA_SEL_GDS,
+						   va, EOP_DATA_GDS(0, 1), 0);
+
+			cmd_buffer->state.active_pipeline_gds_queries--;
+		}
 		break;
 	case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
 		radeon_check_space(cmd_buffer->device->ws, cs, 4);
@@ -1884,7 +1955,7 @@ void radv_CmdBeginQueryIndexedEXT(
 
 	va += pool->stride * query;
 
-	emit_begin_query(cmd_buffer, va, pool->type, flags, index);
+	emit_begin_query(cmd_buffer, pool, va, pool->type, flags, index);
 }
 
 void radv_CmdBeginQuery(
@@ -1911,7 +1982,7 @@ void radv_CmdEndQueryIndexedEXT(
 	/* Do not need to add the pool BO to the list because the query must
 	 * currently be active, which means the BO is already in the list.
 	 */
-	emit_end_query(cmd_buffer, va, avail_va, pool->type, index);
+	emit_end_query(cmd_buffer, pool, va, avail_va, pool->type, index);
 
 	/*
 	 * For multiview we have to emit a query for each bit in the mask,
@@ -1928,8 +1999,8 @@ void radv_CmdEndQueryIndexedEXT(
 		for (unsigned i = 1; i < util_bitcount(cmd_buffer->state.subpass->view_mask); i++) {
 			va += pool->stride;
 			avail_va += 4;
-			emit_begin_query(cmd_buffer, va, pool->type, 0, 0);
-			emit_end_query(cmd_buffer, va, avail_va, pool->type, 0);
+			emit_begin_query(cmd_buffer, pool, va, pool->type, 0, 0);
+			emit_end_query(cmd_buffer, pool, va, avail_va, pool->type, 0);
 		}
 	}
 }
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index 131774bd886..b38710e6fcf 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -148,7 +148,8 @@ enum radv_ud_index {
 	AC_UD_INDIRECT_DESCRIPTOR_SETS = 3,
 	AC_UD_VIEW_INDEX = 4,
 	AC_UD_STREAMOUT_BUFFERS = 5,
-	AC_UD_SHADER_START = 6,
+	AC_UD_NGG_GS_STATE = 6,
+	AC_UD_SHADER_START = 7,
 	AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START,
 	AC_UD_VS_BASE_VERTEX_START_INSTANCE,
 	AC_UD_VS_MAX_UD,
diff --git a/src/amd/vulkan/radv_shader_args.c b/src/amd/vulkan/radv_shader_args.c
index 6f40808d825..1b57d402d5c 100644
--- a/src/amd/vulkan/radv_shader_args.c
+++ b/src/amd/vulkan/radv_shader_args.c
@@ -615,6 +615,11 @@ radv_declare_shader_args(struct radv_shader_args *args,
 					   &args->ac.view_index);
 			}
 
+			if (args->options->key.vs_common_out.as_ngg) {
+				ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT,
+					   &args->ngg_gs_state);
+			}
+
 			ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
 				   &args->gs_vtx_offset[0]);
 			ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT,
@@ -742,6 +747,9 @@ radv_declare_shader_args(struct radv_shader_args *args,
 		}
 		if (args->ac.view_index.used)
 			set_loc_shader(args, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
+
+		if (args->ngg_gs_state.used)
+			set_loc_shader(args, AC_UD_NGG_GS_STATE, &user_sgpr_idx, 1);
 		break;
 	case MESA_SHADER_FRAGMENT:
 		break;
diff --git a/src/amd/vulkan/radv_shader_args.h b/src/amd/vulkan/radv_shader_args.h
index 3c7aceb6385..451077a9ede 100644
--- a/src/amd/vulkan/radv_shader_args.h
+++ b/src/amd/vulkan/radv_shader_args.h
@@ -65,6 +65,9 @@ struct radv_shader_args {
 	struct ac_arg streamout_config;
 	struct ac_arg streamout_offset[4];
 
+	/* NGG GS */
+	struct ac_arg ngg_gs_state;
+
 	bool is_gs_copy_shader;
 };
 



More information about the mesa-commit mailing list