[Mesa-dev] [PATCH] radv: add scratch support for spilling.

Dave Airlie airlied at gmail.com
Wed Jan 25 02:32:34 UTC 2017


From: Dave Airlie <airlied at redhat.com>

Currently LLVM 5.0 has support for spilling to a place
pointed to by the user sgprs instead of using relocations.

This is enabled by using the amdgcn-mesa-mesa3d triple.

For compute gfx shaders we spill to a buffer pointed to
by 64-bit address stored in sgprs 0/1.
For other gfx shaders we spill to a buffer pointed to by
the first two dwords of the buffer pointed to in sgprs 0/1.

This patch enables radv to use the llvm support when present.

This fixes Sascha Willems computeshader demo first screen,
and a bunch of CTS tests now pass.

This patch is likely to be in LLVM 4.0 release as well
(fingers crossed) in which case we need to adjust the detection
logic.

Signed-off-by: Dave Airlie <airlied at redhat.com>
---
 src/amd/common/ac_binary.c       |  30 +++++----
 src/amd/common/ac_binary.h       |   4 +-
 src/amd/common/ac_llvm_util.c    |   4 +-
 src/amd/common/ac_llvm_util.h    |   2 +-
 src/amd/common/ac_nir_to_llvm.c  |  14 ++--
 src/amd/common/ac_nir_to_llvm.h  |   6 +-
 src/amd/vulkan/radv_cmd_buffer.c | 137 ++++++++++++++++++++++++++++++++++++++-
 src/amd/vulkan/radv_device.c     |  22 +++++++
 src/amd/vulkan/radv_pipeline.c   |  10 +--
 src/amd/vulkan/radv_private.h    |  13 ++++
 10 files changed, 215 insertions(+), 27 deletions(-)

diff --git a/src/amd/common/ac_binary.c b/src/amd/common/ac_binary.c
index 01cf000..9c66a82 100644
--- a/src/amd/common/ac_binary.c
+++ b/src/amd/common/ac_binary.c
@@ -212,23 +212,28 @@ static const char *scratch_rsrc_dword1_symbol =
 
 void ac_shader_binary_read_config(struct ac_shader_binary *binary,
 				  struct ac_shader_config *conf,
-				  unsigned symbol_offset)
+				  unsigned symbol_offset,
+				  bool supports_spill)
 {
 	unsigned i;
 	const unsigned char *config =
 		ac_shader_binary_config_start(binary, symbol_offset);
 	bool really_needs_scratch = false;
-
+	uint32_t wavesize = 0;
 	/* LLVM adds SGPR spills to the scratch size.
 	 * Find out if we really need the scratch buffer.
 	 */
-	for (i = 0; i < binary->reloc_count; i++) {
-		const struct ac_shader_reloc *reloc = &binary->relocs[i];
+	if (supports_spill) {
+		really_needs_scratch = true;
+	} else {
+		for (i = 0; i < binary->reloc_count; i++) {
+			const struct ac_shader_reloc *reloc = &binary->relocs[i];
 
-		if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
-		    !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
-			really_needs_scratch = true;
-			break;
+			if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
+			    !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
+				really_needs_scratch = true;
+				break;
+			}
 		}
 	}
 
@@ -259,9 +264,7 @@ void ac_shader_binary_read_config(struct ac_shader_binary *binary,
 		case R_0286E8_SPI_TMPRING_SIZE:
 		case R_00B860_COMPUTE_TMPRING_SIZE:
 			/* WAVESIZE is in units of 256 dwords. */
-			if (really_needs_scratch)
-				conf->scratch_bytes_per_wave =
-					G_00B860_WAVESIZE(value) * 256 * 4;
+			wavesize = value;
 			break;
 		case SPILLED_SGPRS:
 			conf->spilled_sgprs = value;
@@ -285,4 +288,9 @@ void ac_shader_binary_read_config(struct ac_shader_binary *binary,
 		if (!conf->spi_ps_input_addr)
 			conf->spi_ps_input_addr = conf->spi_ps_input_ena;
 	}
+
+	if (really_needs_scratch) {
+		/* sgprs spills aren't spilling */
+	        conf->scratch_bytes_per_wave = G_00B860_WAVESIZE(wavesize) * 256 * 4;
+	}
 }
diff --git a/src/amd/common/ac_binary.h b/src/amd/common/ac_binary.h
index 282f33d..06fd855 100644
--- a/src/amd/common/ac_binary.h
+++ b/src/amd/common/ac_binary.h
@@ -27,6 +27,7 @@
 #pragma once
 
 #include <stdint.h>
+#include <stdbool.h>
 
 struct ac_shader_reloc {
 	char name[32];
@@ -85,4 +86,5 @@ void ac_elf_read(const char *elf_data, unsigned elf_size,
 
 void ac_shader_binary_read_config(struct ac_shader_binary *binary,
 				  struct ac_shader_config *conf,
-				  unsigned symbol_offset);
+				  unsigned symbol_offset,
+				  bool supports_spill);
diff --git a/src/amd/common/ac_llvm_util.c b/src/amd/common/ac_llvm_util.c
index 770e3bd..3ba5281 100644
--- a/src/amd/common/ac_llvm_util.c
+++ b/src/amd/common/ac_llvm_util.c
@@ -126,11 +126,11 @@ static const char *ac_get_llvm_processor_name(enum radeon_family family)
 	}
 }
 
-LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family)
+LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family, bool supports_spill)
 {
 	assert(family >= CHIP_TAHITI);
 
-	const char *triple = "amdgcn--";
+	const char *triple = supports_spill ? "amdgcn-mesa-mesa3d" : "amdgcn--";
 	LLVMTargetRef target = ac_get_llvm_target(triple);
 	LLVMTargetMachineRef tm = LLVMCreateTargetMachine(
 	                             target,
diff --git a/src/amd/common/ac_llvm_util.h b/src/amd/common/ac_llvm_util.h
index 802c266..2a5f325 100644
--- a/src/amd/common/ac_llvm_util.h
+++ b/src/amd/common/ac_llvm_util.h
@@ -56,7 +56,7 @@ struct ac_llvm_context {
 	LLVMValueRef fpmath_md_2p5_ulp;
 };
 
-LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family);
+LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family, bool supports_spill);
 
 void ac_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes);
 bool ac_is_sgpr_param(LLVMValueRef param);
diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 26b87e8..43e079e 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -458,10 +458,10 @@ static void create_function(struct nir_to_llvm_context *ctx)
 	    arg_idx, array_params_mask, sgpr_count, ctx->options->unsafe_math);
 	set_llvm_calling_convention(ctx->main_function, ctx->stage);
 
-
 	ctx->shader_info->num_input_sgprs = 0;
 	ctx->shader_info->num_input_vgprs = 0;
 
+	ctx->shader_info->num_user_sgprs = ctx->options->supports_spill ? 2 : 0;
 	for (i = 0; i < user_sgpr_count; i++)
 		ctx->shader_info->num_user_sgprs += llvm_get_type_size(arg_types[i]) / 4;
 
@@ -475,6 +475,10 @@ static void create_function(struct nir_to_llvm_context *ctx)
 
 	arg_idx = 0;
 	user_sgpr_idx = 0;
+
+	set_userdata_location_shader(ctx, AC_UD_SCRATCH, user_sgpr_idx, 2);
+	user_sgpr_idx += 2;
+
 	for (unsigned i = 0; i < num_sets; ++i) {
 		if (ctx->options->layout->set[i].layout->shader_stages & (1 << ctx->stage)) {
 			set_userdata_location(&ctx->shader_info->user_sgprs_locs.descriptor_sets[i], user_sgpr_idx, 2);
@@ -4429,7 +4433,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
 
 	memset(shader_info, 0, sizeof(*shader_info));
 
-	LLVMSetTarget(ctx.module, "amdgcn--");
+	LLVMSetTarget(ctx.module, options->supports_spill ? "amdgcn-mesa-mesa3d" : "amdgcn--");
 	setup_types(&ctx);
 
 	ctx.builder = LLVMCreateBuilderInContext(ctx.context);
@@ -4563,7 +4567,7 @@ static void ac_compile_llvm_module(LLVMTargetMachineRef tm,
 				   struct ac_shader_config *config,
 				   struct ac_shader_variant_info *shader_info,
 				   gl_shader_stage stage,
-				   bool dump_shader)
+				   bool dump_shader, bool supports_spill)
 {
 	if (dump_shader)
 		LLVMDumpModule(llvm_module);
@@ -4577,7 +4581,7 @@ static void ac_compile_llvm_module(LLVMTargetMachineRef tm,
 	if (dump_shader)
 		fprintf(stderr, "disasm:\n%s\n", binary->disasm_string);
 
-	ac_shader_binary_read_config(binary, config, 0);
+	ac_shader_binary_read_config(binary, config, 0, supports_spill);
 
 	LLVMContextRef ctx = LLVMGetModuleContext(llvm_module);
 	LLVMDisposeModule(llvm_module);
@@ -4637,7 +4641,7 @@ void ac_compile_nir_shader(LLVMTargetMachineRef tm,
 	LLVMModuleRef llvm_module = ac_translate_nir_to_llvm(tm, nir, shader_info,
 	                                                     options);
 
-	ac_compile_llvm_module(tm, llvm_module, binary, config, shader_info, nir->stage, dump_shader);
+	ac_compile_llvm_module(tm, llvm_module, binary, config, shader_info, nir->stage, dump_shader, options->supports_spill);
 	switch (nir->stage) {
 	case MESA_SHADER_COMPUTE:
 		for (int i = 0; i < 3; ++i)
diff --git a/src/amd/common/ac_nir_to_llvm.h b/src/amd/common/ac_nir_to_llvm.h
index a57558e..9d66f94 100644
--- a/src/amd/common/ac_nir_to_llvm.h
+++ b/src/amd/common/ac_nir_to_llvm.h
@@ -52,6 +52,7 @@ struct ac_nir_compiler_options {
 	struct radv_pipeline_layout *layout;
 	union ac_shader_variant_key key;
 	bool unsafe_math;
+	bool supports_spill;
 	enum radeon_family family;
 	enum chip_class chip_class;
 };
@@ -64,8 +65,9 @@ struct ac_userdata_info {
 };
 
 enum ac_ud_index {
-	AC_UD_PUSH_CONSTANTS = 0,
-	AC_UD_SHADER_START = 1,
+	AC_UD_SCRATCH = 0,
+	AC_UD_PUSH_CONSTANTS = 1,
+	AC_UD_SHADER_START = 2,
 	AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START,
 	AC_UD_VS_BASE_VERTEX_START_INSTANCE,
 	AC_UD_VS_MAX_UD,
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index c62d275..e904897 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -466,6 +466,13 @@ radv_emit_vertex_shader(struct radv_cmd_buffer *cmd_buffer,
 	va = ws->buffer_get_va(vs->bo);
 	ws->cs_add_buffer(cmd_buffer->cs, vs->bo, 8);
 
+	if (vs->config.scratch_bytes_per_wave) {
+		uint32_t needed = vs->config.scratch_bytes_per_wave * cmd_buffer->device->scratch_waves;
+		if (needed > cmd_buffer->scratch_size_needed)
+			cmd_buffer->scratch_size_needed = needed;
+		cmd_buffer->scratch_needed_mask |= (1 << MESA_SHADER_VERTEX);
+	}
+
 	clip_dist_mask = vs->info.vs.clip_dist_mask;
 	cull_dist_mask = vs->info.vs.cull_dist_mask;
 	total_mask = clip_dist_mask | cull_dist_mask;
@@ -536,6 +543,13 @@ radv_emit_fragment_shader(struct radv_cmd_buffer *cmd_buffer,
 	va = ws->buffer_get_va(ps->bo);
 	ws->cs_add_buffer(cmd_buffer->cs, ps->bo, 8);
 
+	if (ps->config.scratch_bytes_per_wave) {
+		uint32_t needed = ps->config.scratch_bytes_per_wave * cmd_buffer->device->scratch_waves;
+		if (needed > cmd_buffer->scratch_size_needed)
+			cmd_buffer->scratch_size_needed = needed;
+		cmd_buffer->scratch_needed_mask |= (1 << MESA_SHADER_FRAGMENT);
+	}
+
 	radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B020_SPI_SHADER_PGM_LO_PS, 4);
 	radeon_emit(cmd_buffer->cs, va >> 8);
 	radeon_emit(cmd_buffer->cs, va >> 40);
@@ -627,6 +641,15 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer,
 	radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
 			       pipeline->graphics.prim_restart_enable);
 
+	uint32_t max_scratch_bytes_per_wave = 0;
+	max_scratch_bytes_per_wave = MAX2(max_scratch_bytes_per_wave,
+					  pipeline->shaders[MESA_SHADER_VERTEX]->config.scratch_bytes_per_wave);
+	max_scratch_bytes_per_wave = MAX2(max_scratch_bytes_per_wave,
+					  pipeline->shaders[MESA_SHADER_FRAGMENT]->config.scratch_bytes_per_wave);
+
+	radeon_set_context_reg(cmd_buffer->cs, R_0286E8_SPI_TMPRING_SIZE,
+			       S_0286E8_WAVES(cmd_buffer->device->scratch_waves) |
+			       S_0286E8_WAVESIZE(max_scratch_bytes_per_wave >> 10));
 	cmd_buffer->state.emitted_pipeline = pipeline;
 }
 
@@ -1372,6 +1395,13 @@ radv_cmd_buffer_destroy(struct radv_cmd_buffer *cmd_buffer)
 
 	if (cmd_buffer->upload.upload_bo)
 		cmd_buffer->device->ws->buffer_destroy(cmd_buffer->upload.upload_bo);
+
+	if (cmd_buffer->scratch_bo)
+		cmd_buffer->device->ws->buffer_destroy(cmd_buffer->scratch_bo);
+
+	if (cmd_buffer->compute_scratch_bo)
+		cmd_buffer->device->ws->buffer_destroy(cmd_buffer->compute_scratch_bo);
+
 	cmd_buffer->device->ws->cs_destroy(cmd_buffer->cs);
 	vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
 }
@@ -1402,6 +1432,19 @@ static void  radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
 		free(up);
 	}
 
+	if (cmd_buffer->scratch_bo) {
+		cmd_buffer->device->ws->buffer_destroy(cmd_buffer->scratch_bo);
+		cmd_buffer->scratch_bo = NULL;
+	}
+
+	if (cmd_buffer->compute_scratch_bo) {
+		cmd_buffer->device->ws->buffer_destroy(cmd_buffer->compute_scratch_bo);
+		cmd_buffer->compute_scratch_bo = NULL;
+	}
+
+	cmd_buffer->scratch_needed_mask = 0;
+	cmd_buffer->scratch_size_needed = 0;
+	cmd_buffer->compute_scratch_size_needed = 0;
 	if (cmd_buffer->upload.upload_bo)
 		cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs,
 						      cmd_buffer->upload.upload_bo, 8);
@@ -1457,6 +1500,19 @@ VkResult radv_BeginCommandBuffer(
 		default:
 			break;
 		}
+
+		uint32_t pad_word = 0xffff1000U;
+		if (cmd_buffer->device->physical_device->rad_info.gfx_ib_pad_with_type2)
+			pad_word = 0x80000000;
+
+		cmd_buffer->scratch_patch_idx = cmd_buffer->cs->cdw;
+		cmd_buffer->cs_to_patch_scratch = cmd_buffer->cs->buf;
+		for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
+			radeon_emit(cmd_buffer->cs, pad_word);
+			radeon_emit(cmd_buffer->cs, pad_word);
+			radeon_emit(cmd_buffer->cs, pad_word);
+			radeon_emit(cmd_buffer->cs, pad_word);
+		}
 	}
 
 	if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
@@ -1594,6 +1650,70 @@ VkResult radv_EndCommandBuffer(
 
 	if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER)
 		si_emit_cache_flush(cmd_buffer);
+
+	int idx = cmd_buffer->scratch_patch_idx;
+	if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY && cmd_buffer->compute_scratch_size_needed) {
+		cmd_buffer->compute_scratch_bo = cmd_buffer->device->ws->buffer_create(cmd_buffer->device->ws,
+										       cmd_buffer->compute_scratch_size_needed,
+										       4096,
+										       RADEON_DOMAIN_VRAM,
+										       RADEON_FLAG_NO_CPU_ACCESS);
+
+		if (!cmd_buffer->compute_scratch_bo) {
+			cmd_buffer->record_fail = true;
+			return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+		}
+		cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, cmd_buffer->compute_scratch_bo, 8);
+
+		uint64_t scratch_va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->compute_scratch_bo);
+		uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
+			S_008F04_SWIZZLE_ENABLE(1);
+		uint32_t reg_base;
+
+		reg_base = shader_stage_to_user_data_0(MESA_SHADER_COMPUTE);
+		cmd_buffer->cs_to_patch_scratch[idx++] = PKT3(PKT3_SET_SH_REG, 2, 0);
+		cmd_buffer->cs_to_patch_scratch[idx++] = (reg_base - SI_SH_REG_OFFSET) >> 2;
+		cmd_buffer->cs_to_patch_scratch[idx++] = scratch_va;
+		cmd_buffer->cs_to_patch_scratch[idx++] = rsrc1;
+	}
+
+	if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY && cmd_buffer->scratch_size_needed) {
+		cmd_buffer->scratch_bo = cmd_buffer->device->ws->buffer_create(cmd_buffer->device->ws,
+									       cmd_buffer->scratch_size_needed,
+									       4096,
+									       RADEON_DOMAIN_VRAM,
+									       RADEON_FLAG_NO_CPU_ACCESS);
+
+		if (!cmd_buffer->scratch_bo) {
+			cmd_buffer->record_fail = true;
+			return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+		}
+
+		cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, cmd_buffer->scratch_bo, 8);
+
+		uint64_t scratch_va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->scratch_bo);
+		uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
+			S_008F04_SWIZZLE_ENABLE(1);
+
+		uint32_t *ring_ptr;
+		uint32_t ring_offset;
+		radv_cmd_buffer_upload_alloc(cmd_buffer, 4 * 4, 256, &ring_offset,
+					     (void **)&ring_ptr);
+		ring_ptr[0] = scratch_va;
+		ring_ptr[1] = rsrc1;
+		uint64_t va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->upload.upload_bo) + ring_offset;
+
+		radv_foreach_stage(stage, cmd_buffer->scratch_needed_mask) {
+			uint32_t reg_base;
+
+			reg_base = shader_stage_to_user_data_0(stage);
+			cmd_buffer->cs_to_patch_scratch[idx++] = PKT3(PKT3_SET_SH_REG, 2, 0);
+			cmd_buffer->cs_to_patch_scratch[idx++] = (reg_base - SI_SH_REG_OFFSET) >> 2;
+			cmd_buffer->cs_to_patch_scratch[idx++] = va;
+			cmd_buffer->cs_to_patch_scratch[idx++] = va >> 32;
+		}
+	}
+
 	if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs) ||
 	    cmd_buffer->record_fail)
 		return VK_ERROR_OUT_OF_DEVICE_MEMORY;
@@ -1629,9 +1749,16 @@ radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer)
 	radeon_emit(cmd_buffer->cs, compute_shader->rsrc1);
 	radeon_emit(cmd_buffer->cs, compute_shader->rsrc2);
 
+	if (compute_shader->config.scratch_bytes_per_wave) {
+		uint32_t needed = compute_shader->config.scratch_bytes_per_wave * cmd_buffer->device->scratch_waves;
+		if (needed > cmd_buffer->compute_scratch_size_needed)
+			cmd_buffer->compute_scratch_size_needed = needed;
+	}
+
 	/* change these once we have scratch support */
 	radeon_set_sh_reg(cmd_buffer->cs, R_00B860_COMPUTE_TMPRING_SIZE,
-			  S_00B860_WAVES(32) | S_00B860_WAVESIZE(0));
+			  S_00B860_WAVES(cmd_buffer->device->scratch_waves) |
+			  S_00B860_WAVESIZE(compute_shader->config.scratch_bytes_per_wave >> 10));
 
 	radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
 	radeon_emit(cmd_buffer->cs,
@@ -1821,6 +1948,14 @@ void radv_CmdExecuteCommands(
 	for (uint32_t i = 0; i < commandBufferCount; i++) {
 		RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
 
+		if (secondary->scratch_size_needed > primary->scratch_size_needed)
+			primary->scratch_size_needed = secondary->scratch_size_needed;
+
+		if (secondary->compute_scratch_size_needed > primary->compute_scratch_size_needed)
+			primary->compute_scratch_size_needed = secondary->compute_scratch_size_needed;
+
+		primary->scratch_needed_mask |= secondary->scratch_needed_mask;
+
 		primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs);
 	}
 
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 4aa6af2..c465186 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -781,6 +781,13 @@ VkResult radv_CreateDevice(
 		}
 	}
 
+	/* TODO : predicate on LLVM version this goes into */
+#if HAVE_LLVM < 0x0500
+	device->llvm_supports_spill = false;
+#else
+	device->llvm_supports_spill = true;
+#endif
+
 	result = radv_device_init_meta(device);
 	if (result != VK_SUCCESS)
 		goto fail;
@@ -814,6 +821,21 @@ VkResult radv_CreateDevice(
 			goto fail;
 	}
 
+	/* The maximum number of scratch waves. Scratch space isn't divided
+	 * evenly between CUs. The number is only a function of the number of CUs.
+	 * We can decrease the constant to decrease the scratch buffer size.
+	 *
+	 * sctx->scratch_waves must be >= the maximum possible size of
+	 * 1 threadgroup, so that the hw doesn't hang from being unable
+	 * to start any.
+	 *
+	 * The recommended value is 4 per CU at most. Higher numbers don't
+	 * bring much benefit, but they still occupy chip resources (think
+	 * async compute). I've seen ~2% performance difference between 4 and 32.
+	 */
+	uint32_t max_threads_per_block = 2048;
+	device->scratch_waves = MAX2(32 * physical_device->rad_info.num_good_compute_units,
+				     max_threads_per_block / 64);
 	*pDevice = radv_device_to_handle(device);
 	return VK_SUCCESS;
 
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 360b519..060cfbb 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -354,12 +354,13 @@ static void radv_fill_shader_variant(struct radv_device *device,
 				     struct ac_shader_binary *binary,
 				     gl_shader_stage stage)
 {
-	variant->code_size = binary->code_size;
 	bool scratch_enabled = variant->config.scratch_bytes_per_wave > 0;
 	unsigned vgpr_comp_cnt = 0;
 
-	if (scratch_enabled)
-		radv_finishme("shader scratch space");
+	if (scratch_enabled && !device->llvm_supports_spill)
+		radv_finishme("shader scratch support only available with LLVM 5.0");
+
+	variant->code_size = binary->code_size;
 
 	switch (stage) {
 	case MESA_SHADER_VERTEX:
@@ -424,7 +425,8 @@ static struct radv_shader_variant *radv_shader_variant_create(struct radv_device
 	options.unsafe_math = !!(device->debug_flags & RADV_DEBUG_UNSAFE_MATH);
 	options.family = chip_family;
 	options.chip_class = device->physical_device->rad_info.chip_class;
-	tm = ac_create_target_machine(chip_family);
+	options.supports_spill = device->llvm_supports_spill;
+	tm = ac_create_target_machine(chip_family, options.supports_spill);
 	ac_compile_nir_shader(tm, &binary, &variant->config,
 			      &variant->info, shader, &options, dump);
 	LLVMDisposeTargetMachine(tm);
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 0b8f50a..6c746b5 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -485,6 +485,8 @@ struct radv_device {
 
 	uint64_t debug_flags;
 
+	bool llvm_supports_spill;
+	uint32_t scratch_waves;
 	/* MSAA sample locations.
 	 * The first index is the sample index.
 	 * The second index is the coordinate: X, Y. */
@@ -726,6 +728,17 @@ struct radv_cmd_buffer {
 	struct radv_cmd_buffer_upload upload;
 
 	bool record_fail;
+
+	/* for primary cmd buffers */
+	struct radeon_winsys_bo *scratch_bo;
+	struct radeon_winsys_bo *compute_scratch_bo;
+	uint32_t scratch_patch_idx;
+	uint32_t *cs_to_patch_scratch;
+
+	/* for primary + secondary cmd buffers */
+	uint32_t scratch_needed_mask;
+	uint32_t scratch_size_needed;
+	uint32_t compute_scratch_size_needed;
 };
 
 struct radv_image;
-- 
2.7.4



More information about the mesa-dev mailing list