[Mesa-dev] [PATCH 2/2] [RFC] radv: add scratch support for spilling.

Dave Airlie airlied at gmail.com
Mon Oct 10 03:25:00 UTC 2016


From: Dave Airlie <airlied at redhat.com>

This is a bit of a hack due to how LLVM currently handles
spilling in its shader ABI. The LLVM amdgpu backend currently
uses relocations to patch the shader with the address of
the tmpring: the driver resolves the relocations when it
loads the shader.
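
To illustrate, the spill setup LLVM emits looks roughly like this
(a sketch, not exact compiler output; destination registers depend
on register allocation), with a relocation recorded against each
literal dword:

    s_mov_b32 s<a>, SCRATCH_RSRC_DWORD0  ; literal, resolved via reloc
    s_mov_b32 s<b>, SCRATCH_RSRC_DWORD1  ; literal, resolved via reloc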

However, for vulkan this doesn't work so well, for a few reasons:

a) when we build/load the shaders we aren't constructing the
command stream yet, and the same shader could be used in multiple
command streams.

b) compute shaders can be executed on multiple command execution
engines.

So ideally we'd fix LLVM to understand this ABI convention;
possibly we'd fix it so that user SGPRs 0/1 are used (this hack
uses 10/11).

When this patch gets the shader back from LLVM, it patches each
relocation dword to a nop, and patches the preceding mov
instruction to read from SGPR 10 or 11 instead. This usually
works, since the SGPR loads for the spill setup always seem to
be at the start of the shader, so user SGPRs 10/11 haven't been
trashed yet. I'm not 100% sure this will work all the time, but
for now it should let us pass a bunch more CTS tests and make
the Sascha computeshader demo work.
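
After patching, that sequence becomes roughly (again a sketch; bits
7:0 of an SOP1 instruction are the SSRC0 field, where 0xff marks a
literal operand):

    s_mov_b32 s<a>, s10  ; SSRC0 rewritten from 0xff to SGPR 10
    s_nop     0          ; literal dword replaced with 0xbf800000

The command stream then loads the two scratch rsrc dwords into user
SGPRs 10/11 with a SET_SH_REG packet.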

Signed-off-by: Dave Airlie <airlied at redhat.com>
---
 src/amd/common/ac_nir_to_llvm.c  |  38 +++++++++++++++
 src/amd/common/ac_nir_to_llvm.h  |  11 +++--
 src/amd/vulkan/radv_cmd_buffer.c | 102 ++++++++++++++++++++++++++++++++++++++-
 src/amd/vulkan/radv_device.c     |  16 ++++++
 src/amd/vulkan/radv_private.h    |   9 ++++
 5 files changed, 170 insertions(+), 6 deletions(-)
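
As a reviewing aid, the command-stream half boils down to the shape
below (a hypothetical helper, not part of the patch; the real code in
radv_EndCommandBuffer open-codes this per stage, rewriting the 4 NOP
dwords that radv_BeginCommandBuffer reserves for each stage):

static void
patch_scratch_userdata(uint32_t *buf, int idx, uint32_t reg_base,
		       uint64_t scratch_va)
{
	buf[idx++] = PKT3(PKT3_SET_SH_REG, 2, 0);
	buf[idx++] = (reg_base + AC_USERDATA_SCRATCH_OFFSET * 4 -
		      SI_SH_REG_OFFSET) >> 2;
	buf[idx++] = scratch_va; /* user SGPR 10: VA low dword */
	buf[idx++] = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
		     S_008F04_SWIZZLE_ENABLE(1); /* user SGPR 11 */
}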

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 0e70114..4371451 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -343,6 +343,10 @@ static void create_function(struct nir_to_llvm_context *ctx,
 	arg_types[arg_idx++] = const_array(ctx->i8, 1024 * 1024);
 
 	array_count = arg_idx;
+
+	/* scratch rsrc address, passed in user SGPRs 10/11 */
+	arg_types[arg_idx++] = LLVMVectorType(ctx->i32, 2);
+
 	switch (nir->stage) {
 	case MESA_SHADER_COMPUTE:
 		arg_types[arg_idx++] = LLVMVectorType(ctx->i32, 3); /* grid size */
@@ -416,6 +420,7 @@ static void create_function(struct nir_to_llvm_context *ctx,
 
 	ctx->push_constants = LLVMGetParam(ctx->main_function, arg_idx++);
 
+	arg_idx++; /* skip scratch address; it just reserves user SGPRs 10/11 */
 	switch (nir->stage) {
 	case MESA_SHADER_COMPUTE:
 		ctx->num_work_groups =
@@ -4428,6 +4433,37 @@ static void ac_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context)
 	LLVMDisposeMessage(description);
 }
 
+static const char *scratch_rsrc_dword0_symbol =
+	"SCRATCH_RSRC_DWORD0";
+
+static const char *scratch_rsrc_dword1_symbol =
+	"SCRATCH_RSRC_DWORD1";
+
+static void ac_apply_scratch_relocs(struct ac_shader_binary *binary, int sgpr)
+{
+	uint32_t nop_val = 0xbf800000; /* s_nop 0 */
+	unsigned i;
+	for (i = 0; i < binary->reloc_count; i++) {
+		const struct ac_shader_reloc *reloc =
+					&binary->relocs[i];
+		if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
+			uint32_t reg_val = *(uint32_t *)(binary->code + reloc->offset - 4);
+			reg_val &= 0xffffff00; /* clear SSRC0 (0xff = literal) */
+			reg_val |= sgpr; /* read from the user SGPR instead */
+			*(uint32_t *)(binary->code + reloc->offset - 4) = reg_val;
+			util_memcpy_cpu_to_le32(binary->code + reloc->offset,
+						&nop_val, 4);
+		} else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
+			uint32_t reg_val = *(uint32_t *)(binary->code + reloc->offset - 4);
+			reg_val &= 0xffffff00;
+			reg_val |= (sgpr + 1);
+			*(uint32_t *)(binary->code + reloc->offset - 4) = reg_val;
+			util_memcpy_cpu_to_le32(binary->code + reloc->offset,
+						&nop_val, 4);
+		}
+	}
+}
+
 static unsigned ac_llvm_compile(LLVMModuleRef M,
                                 struct ac_shader_binary *binary,
                                 LLVMTargetMachineRef tm)
@@ -4467,6 +4503,8 @@ static unsigned ac_llvm_compile(LLVMModuleRef M,
 	/* Clean up */
 	LLVMDisposeMemoryBuffer(out_buffer);
 
+	/* patch the scratch rsrc relocs to read the vulkan scratch user SGPRs */
+	ac_apply_scratch_relocs(binary, AC_USERDATA_SCRATCH_OFFSET);
 out:
 	return retval;
 }
diff --git a/src/amd/common/ac_nir_to_llvm.h b/src/amd/common/ac_nir_to_llvm.h
index a17caf2..836e38a 100644
--- a/src/amd/common/ac_nir_to_llvm.h
+++ b/src/amd/common/ac_nir_to_llvm.h
@@ -104,14 +104,15 @@ void ac_compile_nir_shader(LLVMTargetMachineRef tm,
 #define AC_USERDATA_DESCRIPTOR_SET_2 4
 #define AC_USERDATA_DESCRIPTOR_SET_3 6
 #define AC_USERDATA_PUSH_CONST_DYN 8
+#define AC_USERDATA_SCRATCH_OFFSET 10
 
-#define AC_USERDATA_VS_VERTEX_BUFFERS 10
-#define AC_USERDATA_VS_BASE_VERTEX 12
-#define AC_USERDATA_VS_START_INSTANCE 13
+#define AC_USERDATA_VS_VERTEX_BUFFERS 12
+#define AC_USERDATA_VS_BASE_VERTEX 14
+#define AC_USERDATA_VS_START_INSTANCE 15
 
-#define AC_USERDATA_PS_SAMPLE_POS 10
+#define AC_USERDATA_PS_SAMPLE_POS 12
 
-#define AC_USERDATA_CS_GRID_SIZE 10
+#define AC_USERDATA_CS_GRID_SIZE 12
 
 #ifdef __cplusplus
 extern "C"
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index d97b29b..fa2db76 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -372,6 +372,13 @@ radv_emit_vertex_shader(struct radv_cmd_buffer *cmd_buffer,
 	va = ws->buffer_get_va(vs->bo);
 	ws->cs_add_buffer(cmd_buffer->cs, vs->bo, 8);
 
+	if (vs->config.scratch_bytes_per_wave) {
+		uint32_t needed = vs->config.scratch_bytes_per_wave * cmd_buffer->device->scratch_waves;
+		if (needed > cmd_buffer->scratch_size_needed)
+			cmd_buffer->scratch_size_needed = needed;
+		cmd_buffer->scratch_needed_mask |= (1 << MESA_SHADER_VERTEX);
+	}
+
 	clip_dist_mask = vs->info.vs.clip_dist_mask;
 	cull_dist_mask = vs->info.vs.cull_dist_mask;
 	total_mask = clip_dist_mask | cull_dist_mask;
@@ -436,6 +443,13 @@ radv_emit_fragment_shader(struct radv_cmd_buffer *cmd_buffer,
 	va = ws->buffer_get_va(ps->bo);
 	ws->cs_add_buffer(cmd_buffer->cs, ps->bo, 8);
 
+	if (ps->config.scratch_bytes_per_wave) {
+		uint32_t needed = ps->config.scratch_bytes_per_wave * cmd_buffer->device->scratch_waves;
+		if (needed > cmd_buffer->scratch_size_needed)
+			cmd_buffer->scratch_size_needed = needed;
+		cmd_buffer->scratch_needed_mask |= (1 << MESA_SHADER_FRAGMENT);
+	}
+
 	radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B020_SPI_SHADER_PGM_LO_PS, 4);
 	radeon_emit(cmd_buffer->cs, va >> 8);
 	radeon_emit(cmd_buffer->cs, va >> 40);
@@ -527,6 +541,15 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer,
 	radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
 			       pipeline->graphics.prim_restart_enable);
 
+	uint32_t max_scratch_bytes_per_wave = 0;
+	max_scratch_bytes_per_wave = MAX2(max_scratch_bytes_per_wave,
+					  pipeline->shaders[MESA_SHADER_VERTEX]->config.scratch_bytes_per_wave);
+	max_scratch_bytes_per_wave = MAX2(max_scratch_bytes_per_wave,
+					  pipeline->shaders[MESA_SHADER_FRAGMENT]->config.scratch_bytes_per_wave);
+
+	radeon_set_context_reg(cmd_buffer->cs, R_0286E8_SPI_TMPRING_SIZE,
+			       S_0286E8_WAVES(cmd_buffer->device->scratch_waves) |
+			       S_0286E8_WAVESIZE(max_scratch_bytes_per_wave >> 10)); /* units of 1024 bytes */
 	cmd_buffer->state.emitted_pipeline = pipeline;
 }
 
@@ -1214,6 +1237,9 @@ radv_cmd_buffer_destroy(struct radv_cmd_buffer *cmd_buffer)
 
 	if (cmd_buffer->upload.upload_bo)
 		cmd_buffer->device->ws->buffer_destroy(cmd_buffer->upload.upload_bo);
+
+	if (cmd_buffer->scratch_bo)
+		cmd_buffer->device->ws->buffer_destroy(cmd_buffer->scratch_bo);
 	cmd_buffer->device->ws->cs_destroy(cmd_buffer->cs);
 	radv_free(&cmd_buffer->pool->alloc, cmd_buffer);
 }
@@ -1244,6 +1270,13 @@ static void  radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
 		free(up);
 	}
 
+	if (cmd_buffer->scratch_bo) {
+		cmd_buffer->device->ws->buffer_destroy(cmd_buffer->scratch_bo);
+		cmd_buffer->scratch_bo = NULL;
+	}
+	cmd_buffer->scratch_needed_mask = 0;
+	cmd_buffer->scratch_size_needed = 0;
+
 	if (cmd_buffer->upload.upload_bo)
 		cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs,
 						      cmd_buffer->upload.upload_bo, 8);
@@ -1283,6 +1316,16 @@ VkResult radv_BeginCommandBuffer(
 		si_init_config(&cmd_buffer->device->instance->physicalDevice, cmd_buffer);
 		radv_set_db_count_control(cmd_buffer);
 		si_emit_cache_flush(cmd_buffer);
+		cmd_buffer->scratch_patch_idx = cmd_buffer->cs->cdw; /* back-patched in EndCommandBuffer */
+		uint32_t pad_word = 0xffff1000U; /* one-dword type-3 NOP */
+		if (cmd_buffer->device->instance->physicalDevice.rad_info.gfx_ib_pad_with_type2)
+			pad_word = 0x80000000; /* SI needs type-2 padding */
+		for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) { /* reserve one SET_SH_REG packet per stage */
+			radeon_emit(cmd_buffer->cs, pad_word);
+			radeon_emit(cmd_buffer->cs, pad_word);
+			radeon_emit(cmd_buffer->cs, pad_word);
+			radeon_emit(cmd_buffer->cs, pad_word);
+		}
 	}
 
 	if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
@@ -1434,6 +1477,51 @@ VkResult radv_EndCommandBuffer(
 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
 
 	si_emit_cache_flush(cmd_buffer);
+
+	if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY && cmd_buffer->scratch_size_needed) {
+		int idx = cmd_buffer->scratch_patch_idx;
+
+		cmd_buffer->scratch_bo = cmd_buffer->device->ws->buffer_create(cmd_buffer->device->ws,
+									       cmd_buffer->scratch_size_needed,
+									       4096,
+									       RADEON_DOMAIN_VRAM,
+									       RADEON_FLAG_NO_CPU_ACCESS);
+
+		if (!cmd_buffer->scratch_bo) {
+			cmd_buffer->record_fail = true;
+			return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+		}
+
+		cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, cmd_buffer->scratch_bo, 8);
+
+		uint64_t scratch_va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->scratch_bo);
+		radv_foreach_stage(stage, cmd_buffer->scratch_needed_mask) {
+			uint32_t reg_base;
+			uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32);
+
+			rsrc1 |= S_008F04_SWIZZLE_ENABLE(1);
+			switch (stage) {
+			case MESA_SHADER_VERTEX:
+				reg_base = R_00B130_SPI_SHADER_USER_DATA_VS_0;
+				break;
+			case MESA_SHADER_FRAGMENT:
+				reg_base = R_00B030_SPI_SHADER_USER_DATA_PS_0;
+				break;
+			case MESA_SHADER_COMPUTE:
+				reg_base = R_00B900_COMPUTE_USER_DATA_0;
+				break;
+			default:
+				radv_finishme("scratch relocs non-compute");
+				exit(-1);
+				break;
+			}
+			reg_base += (AC_USERDATA_SCRATCH_OFFSET * 4);
+			cmd_buffer->cs->buf[idx++] = PKT3(PKT3_SET_SH_REG, 2, 0);
+			cmd_buffer->cs->buf[idx++] = (reg_base - SI_SH_REG_OFFSET) >> 2;
+			cmd_buffer->cs->buf[idx++] = scratch_va; /* user SGPR 10: VA low dword */
+			cmd_buffer->cs->buf[idx++] = rsrc1; /* user SGPR 11: VA high + swizzle */
+		}
+	}
 	if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs) ||
 	    cmd_buffer->record_fail)
 		return VK_ERROR_OUT_OF_DEVICE_MEMORY;
@@ -1468,9 +1556,15 @@
 	radeon_emit(cmd_buffer->cs, compute_shader->rsrc1);
 	radeon_emit(cmd_buffer->cs, compute_shader->rsrc2);
 
+	if (compute_shader->config.scratch_bytes_per_wave) {
+		uint32_t needed = compute_shader->config.scratch_bytes_per_wave * cmd_buffer->device->scratch_waves;
+		if (needed > cmd_buffer->scratch_size_needed)
+			cmd_buffer->scratch_size_needed = needed;
+		cmd_buffer->scratch_needed_mask |= (1 << MESA_SHADER_COMPUTE);
+	}
+
-	/* change these once we have scratch support */
 	radeon_set_sh_reg(cmd_buffer->cs, R_00B860_COMPUTE_TMPRING_SIZE,
-			  S_00B860_WAVES(32) | S_00B860_WAVESIZE(0));
+			  S_00B860_WAVES(cmd_buffer->device->scratch_waves) | S_00B860_WAVESIZE(compute_shader->config.scratch_bytes_per_wave >> 10));
 
 	radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
 	radeon_emit(cmd_buffer->cs,
@@ -1655,6 +1750,11 @@ void radv_CmdExecuteCommands(
 	for (uint32_t i = 0; i < commandBufferCount; i++) {
 		RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
 
+		if (secondary->scratch_size_needed > primary->scratch_size_needed)
+			primary->scratch_size_needed = secondary->scratch_size_needed;
+
+		primary->scratch_needed_mask |= secondary->scratch_needed_mask;
+
 		primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs);
 	}
 
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index 8c59344..1a56a7e 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -609,6 +609,22 @@ VkResult radv_CreateDevice(
 		radv_finishme("DCC fast clears have not been tested\n");
 
 	radv_device_init_msaa(device);
+
+	/* The maximum number of scratch waves. Scratch space isn't divided
+	 * evenly between CUs. The number is only a function of the number of CUs.
+	 * We can decrease the constant to decrease the scratch buffer size.
+	 *
+	 * device->scratch_waves must be >= the maximum possible size of
+	 * 1 threadgroup, so that the hw doesn't hang from being unable
+	 * to start any.
+	 *
+	 * The recommended value is 4 per CU at most. Higher numbers don't
+	 * bring much benefit, but they still occupy chip resources (think
+	 * async compute). I've seen ~2% performance difference between 4 and 32.
+	 */
+	uint32_t max_threads_per_block = 2048;
+	device->scratch_waves = MAX2(32 * physical_device->rad_info.num_good_compute_units,
+				     max_threads_per_block / 64);
 	device->empty_cs = device->ws->cs_create(device->ws, RING_GFX);
 	radeon_emit(device->empty_cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
 	radeon_emit(device->empty_cs, CONTEXT_CONTROL_LOAD_ENABLE(1));
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index e738218..a2e9540 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -564,6 +564,7 @@ struct radv_device {
 	bool allow_fast_clears;
 	bool allow_dcc;
 
+	uint32_t scratch_waves;
 	/* MSAA sample locations.
 	 * The first index is the sample index.
 	 * The second index is the coordinate: X, Y. */
@@ -797,6 +798,14 @@ struct radv_cmd_buffer {
 	struct radv_cmd_buffer_upload upload;
 
 	bool record_fail;
+
+	/* for primary cmd buffers */
+	struct radeon_winsys_bo *scratch_bo;
+	uint32_t scratch_patch_idx;
+
+	/* for primary + secondary cmd buffers */
+	uint32_t scratch_needed_mask;
+	uint32_t scratch_size_needed;
 };
 
 struct radv_image;
-- 
2.5.5


