[Mesa-dev] [PATCH 2/4] radv/gfx10: add Wave32 support for vertex, tessellation and geometry shaders

Thu Aug 1 08:43:42 UTC 2019

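Wave64 remains the default for these stages. On GFX10 and newer, Wave32
can be enabled with RADV_PERFTEST=gewave32. Legacy (non-NGG) geometry
shaders only support Wave64, so GS_W32_EN is only set for NGG pipelines.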

Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
---
 src/amd/vulkan/radv_debug.h       |  1 +
 src/amd/vulkan/radv_device.c      |  5 +++++
 src/amd/vulkan/radv_nir_to_llvm.c | 13 +++++++------
 src/amd/vulkan/radv_pipeline.c    | 10 +++++++++-
 src/amd/vulkan/radv_private.h     |  1 +
 src/amd/vulkan/radv_shader.c      |  3 ++-
 src/amd/vulkan/radv_shader.h      |  1 +
 7 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/src/amd/vulkan/radv_debug.h b/src/amd/vulkan/radv_debug.h
index 65dbec6e90d..ef5b331d188 100644
--- a/src/amd/vulkan/radv_debug.h
+++ b/src/amd/vulkan/radv_debug.h
@@ -66,6 +66,7 @@ enum {
 	RADV_PERFTEST_TC_COMPAT_CMASK = 0x80,
 	RADV_PERFTEST_CS_WAVE_32     = 0x100,
 	RADV_PERFTEST_PS_WAVE_32     = 0x200,
+	RADV_PERFTEST_GE_WAVE_32     = 0x400,
 };
 
 bool
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index b66b15edf73..fc961040b6e 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -386,6 +386,7 @@ radv_physical_device_init(struct radv_physical_device *device,
 	/* Determine the number of threads per wave for all stages. */
 	device->cs_wave_size = 64;
 	device->ps_wave_size = 64;
+	device->ge_wave_size = 64;
 
 	if (device->rad_info.chip_class >= GFX10) {
 		if (device->instance->perftest_flags & RADV_PERFTEST_CS_WAVE_32)
@@ -394,6 +395,9 @@ radv_physical_device_init(struct radv_physical_device *device,
 		/* For pixel shaders, wave64 is recommanded. */
 		if (device->instance->perftest_flags & RADV_PERFTEST_PS_WAVE_32)
 			device->ps_wave_size = 32;
+
+		if (device->instance->perftest_flags & RADV_PERFTEST_GE_WAVE_32)
+			device->ge_wave_size = 32;
 	}
 
 	radv_physical_device_init_mem_types(device);
@@ -509,6 +513,7 @@ static const struct debug_control radv_perftest_options[] = {
 	{"tccompatcmask", RADV_PERFTEST_TC_COMPAT_CMASK},
 	{"cswave32", RADV_PERFTEST_CS_WAVE_32},
 	{"pswave32", RADV_PERFTEST_PS_WAVE_32},
+	{"gewave32", RADV_PERFTEST_GE_WAVE_32},
 	{NULL, 0}
 };
 
diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c
index bba5849b152..91251aa69bd 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -295,7 +295,7 @@ get_tcs_num_patches(struct radv_shader_context *ctx)
 
 	/* GFX6 bug workaround - limit LS-HS threadgroups to only one wave. */
 	if (ctx->options->chip_class == GFX6) {
-		unsigned one_wave = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp);
+		unsigned one_wave = ctx->options->ge_wave_size / MAX2(num_tcs_input_cp, num_tcs_output_cp);
 		num_patches = MIN2(num_patches, one_wave);
 	}
 	return num_patches;
@@ -3038,7 +3038,8 @@ handle_es_outputs_post(struct radv_shader_context *ctx,
 		LLVMValueRef wave_idx = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 24, 4);
 		vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx,
 					 LLVMBuildMul(ctx->ac.builder, wave_idx,
-						      LLVMConstInt(ctx->ac.i32, 64, false), ""), "");
+						      LLVMConstInt(ctx->ac.i32,
+								   ctx->ac.wave_size, false), ""), "");
 		lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx,
 					LLVMConstInt(ctx->ac.i32, itemsize_dw, 0), "");
 	}
@@ -3140,7 +3141,7 @@ static LLVMValueRef get_thread_id_in_tg(struct radv_shader_context *ctx)
 	LLVMBuilderRef builder = ctx->ac.builder;
 	LLVMValueRef tmp;
 	tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
-			   LLVMConstInt(ctx->ac.i32, 64, false), "");
+			   LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), "");
 	return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), "");
 }
 
@@ -4190,7 +4191,7 @@ ac_setup_rings(struct radv_shader_context *ctx)
 		 */
 		LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2);
 		uint64_t stream_offset = 0;
-		unsigned num_records = 64;
+		unsigned num_records = ctx->ac.wave_size;
 		LLVMValueRef base_ring;
 
 		base_ring =
@@ -4223,7 +4224,7 @@ ac_setup_rings(struct radv_shader_context *ctx)
 			ring = LLVMBuildInsertElement(ctx->ac.builder,
 						      ring, tmp, ctx->ac.i32_0, "");
 
-			stream_offset += stride * 64;
+			stream_offset += stride * ctx->ac.wave_size;
 
 			ring = LLVMBuildBitCast(ctx->ac.builder, ring,
 						ctx->ac.v4i32, "");
@@ -4325,7 +4326,7 @@ radv_nir_shader_wave_size(struct nir_shader *const *shaders, int shader_count,
 		return options->cs_wave_size;
 	else if (shaders[0]->info.stage == MESA_SHADER_FRAGMENT)
 		return options->ps_wave_size;
-	return 64;
+	return options->ge_wave_size;
 }
 
 static
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index dbfe261c982..d722d558a43 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -1782,7 +1782,7 @@ calculate_ngg_info(const VkGraphicsPipelineCreateInfo *pCreateInfo,
 
 	/* Round up towards full wave sizes for better ALU utilization. */
 	if (!max_vert_out_per_gs_instance) {
-		const unsigned wavesize = 64;
+		const unsigned wavesize = pipeline->device->physical_device->ge_wave_size;
 		unsigned orig_max_esverts;
 		unsigned orig_max_gsprims;
 		do {
@@ -4125,6 +4125,14 @@ radv_compute_vgt_shader_stages_en(const struct radv_pipeline *pipeline)
 	if (pipeline->device->physical_device->rad_info.chip_class >= GFX9)
 		stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2);
 
+	if (pipeline->device->physical_device->rad_info.chip_class >= GFX10 &&
+	    pipeline->device->physical_device->ge_wave_size == 32) {
+		/* legacy GS only supports Wave64 */
+		stages |= S_028B54_HS_W32_EN(1) |
+			  S_028B54_GS_W32_EN(radv_pipeline_has_ngg(pipeline)) |
+			  S_028B54_VS_W32_EN(1);
+	}
+
 	return stages;
 }
 
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index a1347060190..1a0b22d63b8 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -304,6 +304,7 @@ struct radv_physical_device {
 	/* Number of threads per wave. */
 	uint8_t ps_wave_size;
 	uint8_t cs_wave_size;
+	uint8_t ge_wave_size;
 
 	/* This is the drivers on-disk cache used as a fallback as opposed to
 	 * the pipeline cache defined by apps.
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index 48ed86c99b1..97fa80b348c 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -675,7 +675,7 @@ radv_get_shader_wave_size(const struct radv_physical_device *pdevice,
 		return pdevice->cs_wave_size;
 	else if (stage == MESA_SHADER_FRAGMENT)
 		return pdevice->ps_wave_size;
-	return 64;
+	return pdevice->ge_wave_size;
 }
 
 static void radv_postprocess_config(const struct radv_physical_device *pdevice,
@@ -1144,6 +1144,7 @@ shader_variant_compile(struct radv_device *device,
 	options->address32_hi = device->physical_device->rad_info.address32_hi;
 	options->cs_wave_size = device->physical_device->cs_wave_size;
 	options->ps_wave_size = device->physical_device->ps_wave_size;
+	options->ge_wave_size = device->physical_device->ge_wave_size;
 
 	if (options->supports_spill)
 		tm_options |= AC_TM_SUPPORTS_SPILL;
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index 0ef49628b5d..0ab7db20181 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -130,6 +130,7 @@ struct radv_nir_compiler_options {
 	uint32_t address32_hi;
 	uint8_t cs_wave_size;
 	uint8_t ps_wave_size;
+	uint8_t ge_wave_size;
 };
 
 enum radv_ud_index {
-- 
2.22.0