[Mesa-dev] [PATCH] radv: add scratch support for spilling.

Bas Nieuwenhuizen bas at basnieuwenhuizen.nl
Wed Jan 25 07:43:36 UTC 2017


I'm not sure if using a scratch buffer per command buffer is correct.
AFAIU each ring has a separate counter for the scratch offsets, and if a
command buffer is used in multiple compute rings at the same time, these
separate counters could conflict.

I'd think we need a preamble IB per queue that sets SGPR0/1 for all
relevant stages, and to modify the winsys so that it is submitted in the
same submit ioctl as the application command buffers.
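For illustration, a rough C sketch of that preamble idea (the
scratch_preamble_cs field on radv_queue and the submit-side wiring are
assumptions, not part of the patch; the emit helpers and register
offsets are the ones the patch below already uses):

static void
radv_queue_emit_scratch_preamble(struct radv_queue *queue,
                                 struct radeon_winsys_bo *scratch_bo)
{
        /* assumed: a per-queue CS, created once at queue init */
        struct radeon_winsys_cs *cs = queue->scratch_preamble_cs;
        uint64_t va = queue->device->ws->buffer_get_va(scratch_bo);

        queue->device->ws->cs_add_buffer(cs, scratch_bo, 8);

        /* write the scratch address into user SGPRs 0/1 of each graphics
         * stage; AC_UD_SCRATCH reserves those two SGPRs */
        for (unsigned stage = 0; stage < MESA_SHADER_COMPUTE; stage++) {
                uint32_t reg_base = shader_stage_to_user_data_0(stage);

                radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
                radeon_emit(cs, (reg_base - SI_SH_REG_OFFSET) >> 2);
                radeon_emit(cs, va);
                radeon_emit(cs, va >> 32);
        }

        queue->device->ws->cs_finalize(cs);
        /* the winsys would then submit this CS in the same ioctl, ahead
         * of the application command buffers on this queue */
}

That keeps the scratch bookkeeping per queue rather than per command
buffer, so using one command buffer on several rings at once can't race
on it.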

- Bas

On Tue, Jan 24, 2017, at 18:32, Dave Airlie wrote:
> From: Dave Airlie <airlied at redhat.com>
> 
> Currently LLVM 5.0 has support for spilling to a place
> pointed to by the user sgprs instead of using relocations.
> 
> This is enabled by using the amdgcn-mesa-mesa3d triple.
> 
> For compute shaders we spill to a buffer pointed to by a
> 64-bit address stored in SGPRs 0/1.
> For other graphics shaders we spill to a buffer pointed to by
> the first two dwords of the buffer pointed to by SGPRs 0/1.
> 
> This patch enables radv to use the llvm support when present.
> 
> This fixes the first screen of Sascha Willems' computeshader demo,
> and a bunch of CTS tests now pass.
> 
> The LLVM support is likely to be in the LLVM 4.0 release as well
> (fingers crossed), in which case we will need to adjust the detection
> logic.
> 
> Signed-off-by: Dave Airlie <airlied at redhat.com>
> ---
>  src/amd/common/ac_binary.c       |  30 +++++----
>  src/amd/common/ac_binary.h       |   4 +-
>  src/amd/common/ac_llvm_util.c    |   4 +-
>  src/amd/common/ac_llvm_util.h    |   2 +-
>  src/amd/common/ac_nir_to_llvm.c  |  14 ++--
>  src/amd/common/ac_nir_to_llvm.h  |   6 +-
>  src/amd/vulkan/radv_cmd_buffer.c | 137 ++++++++++++++++++++++++++++++++++++++-
>  src/amd/vulkan/radv_device.c     |  22 +++++++
>  src/amd/vulkan/radv_pipeline.c   |  10 +--
>  src/amd/vulkan/radv_private.h    |  13 ++++
>  10 files changed, 215 insertions(+), 27 deletions(-)
> 
> diff --git a/src/amd/common/ac_binary.c b/src/amd/common/ac_binary.c
> index 01cf000..9c66a82 100644
> --- a/src/amd/common/ac_binary.c
> +++ b/src/amd/common/ac_binary.c
> @@ -212,23 +212,28 @@ static const char *scratch_rsrc_dword1_symbol =
>  
>  void ac_shader_binary_read_config(struct ac_shader_binary *binary,
>  				  struct ac_shader_config *conf,
> -                                 unsigned symbol_offset)
> +                                 unsigned symbol_offset,
> +                                 bool supports_spill)
>  {
>  	unsigned i;
>  	const unsigned char *config =
>  		ac_shader_binary_config_start(binary, symbol_offset);
>  	bool really_needs_scratch = false;
> -
> +       uint32_t wavesize = 0;
>  	/* LLVM adds SGPR spills to the scratch size.
>  	 * Find out if we really need the scratch buffer.
>  	 */
> -       for (i = 0; i < binary->reloc_count; i++) {
> -               const struct ac_shader_reloc *reloc = &binary->relocs[i];
> +       if (supports_spill) {
> +               really_needs_scratch = true;
> +       } else {
> +               for (i = 0; i < binary->reloc_count; i++) {
> +                       const struct ac_shader_reloc *reloc = &binary->relocs[i];
>  
> -               if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
> -                   !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
> -                       really_needs_scratch = true;
> -                       break;
> +                       if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
> +                           !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
> +                               really_needs_scratch = true;
> +                               break;
> +                       }
>  		}
>  	}
>  
> @@ -259,9 +264,7 @@ void ac_shader_binary_read_config(struct ac_shader_binary *binary,
>  		case R_0286E8_SPI_TMPRING_SIZE:
>  		case R_00B860_COMPUTE_TMPRING_SIZE:
>  			/* WAVESIZE is in units of 256 dwords. */
> -                       if (really_needs_scratch)
> -                               conf->scratch_bytes_per_wave =
> -                                       G_00B860_WAVESIZE(value) * 256 * 4;
> +                       wavesize = value;
>  			break;
>  		case SPILLED_SGPRS:
>  			conf->spilled_sgprs = value;
> @@ -285,4 +288,9 @@ void ac_shader_binary_read_config(struct ac_shader_binary *binary,
>  		if (!conf->spi_ps_input_addr)
>  			conf->spi_ps_input_addr = conf->spi_ps_input_ena;
>  	}
> +
> +       if (really_needs_scratch) {
> +               /* sgprs spills aren't spilling */
> +               conf->scratch_bytes_per_wave = G_00B860_WAVESIZE(wavesize) * 256 * 4;
> +       }
>  }
> diff --git a/src/amd/common/ac_binary.h b/src/amd/common/ac_binary.h
> index 282f33d..06fd855 100644
> --- a/src/amd/common/ac_binary.h
> +++ b/src/amd/common/ac_binary.h
> @@ -27,6 +27,7 @@
>  #pragma once
>  
>  #include <stdint.h>
> +#include <stdbool.h>
>  
>  struct ac_shader_reloc {
>  	char name[32];
> @@ -85,4 +86,5 @@ void ac_elf_read(const char *elf_data, unsigned elf_size,
>  
>  void ac_shader_binary_read_config(struct ac_shader_binary *binary,
>  				  struct ac_shader_config *conf,
> -                                 unsigned symbol_offset);
> +                                 unsigned symbol_offset,
> +                                 bool supports_spill);
> diff --git a/src/amd/common/ac_llvm_util.c b/src/amd/common/ac_llvm_util.c
> index 770e3bd..3ba5281 100644
> --- a/src/amd/common/ac_llvm_util.c
> +++ b/src/amd/common/ac_llvm_util.c
> @@ -126,11 +126,11 @@ static const char *ac_get_llvm_processor_name(enum radeon_family family)
>  	}
>  }
>  
> -LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family)
> +LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family, bool supports_spill)
>  {
>  	assert(family >= CHIP_TAHITI);
>  
> -       const char *triple = "amdgcn--";
> +       const char *triple = supports_spill ? "amdgcn-mesa-mesa3d" : "amdgcn--";
>  	LLVMTargetRef target = ac_get_llvm_target(triple);
>  	LLVMTargetMachineRef tm = LLVMCreateTargetMachine(
>  	                             target,
> diff --git a/src/amd/common/ac_llvm_util.h b/src/amd/common/ac_llvm_util.h
> index 802c266..2a5f325 100644
> --- a/src/amd/common/ac_llvm_util.h
> +++ b/src/amd/common/ac_llvm_util.h
> @@ -56,7 +56,7 @@ struct ac_llvm_context {
>  	LLVMValueRef fpmath_md_2p5_ulp;
>  };
>  
> -LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family);
> +LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family, bool supports_spill);
>  
>  void ac_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes);
>  bool ac_is_sgpr_param(LLVMValueRef param);
> diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
> index 26b87e8..43e079e 100644
> --- a/src/amd/common/ac_nir_to_llvm.c
> +++ b/src/amd/common/ac_nir_to_llvm.c
> @@ -458,10 +458,10 @@ static void create_function(struct nir_to_llvm_context *ctx)
>  	    arg_idx, array_params_mask, sgpr_count, ctx->options->unsafe_math);
>  	set_llvm_calling_convention(ctx->main_function, ctx->stage);
>  
> -
>  	ctx->shader_info->num_input_sgprs = 0;
>  	ctx->shader_info->num_input_vgprs = 0;
>  
> +       ctx->shader_info->num_user_sgprs = ctx->options->supports_spill ? 2 : 0;
>  	for (i = 0; i < user_sgpr_count; i++)
>  		ctx->shader_info->num_user_sgprs += llvm_get_type_size(arg_types[i]) / 4;
>  
> @@ -475,6 +475,10 @@ static void create_function(struct nir_to_llvm_context *ctx)
>  
>  	arg_idx = 0;
>  	user_sgpr_idx = 0;
> +
> +       set_userdata_location_shader(ctx, AC_UD_SCRATCH, user_sgpr_idx, 2);
> +       user_sgpr_idx += 2;
> +
>  	for (unsigned i = 0; i < num_sets; ++i) {
>  		if (ctx->options->layout->set[i].layout->shader_stages & (1 << ctx->stage)) {
>  			set_userdata_location(&ctx->shader_info->user_sgprs_locs.descriptor_sets[i], user_sgpr_idx, 2);
> @@ -4429,7 +4433,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
>  
>  	memset(shader_info, 0, sizeof(*shader_info));
>  
> -       LLVMSetTarget(ctx.module, "amdgcn--");
> +       LLVMSetTarget(ctx.module, options->supports_spill ? "amdgcn-mesa-mesa3d" : "amdgcn--");
>  	setup_types(&ctx);
>  
>  	ctx.builder = LLVMCreateBuilderInContext(ctx.context);
> @@ -4563,7 +4567,7 @@ static void ac_compile_llvm_module(LLVMTargetMachineRef tm,
>  				   struct ac_shader_config *config,
>  				   struct ac_shader_variant_info *shader_info,
>  				   gl_shader_stage stage,
> -                                  bool dump_shader)
> +                                  bool dump_shader, bool supports_spill)
>  {
>  	if (dump_shader)
>  		LLVMDumpModule(llvm_module);
> @@ -4577,7 +4581,7 @@ static void ac_compile_llvm_module(LLVMTargetMachineRef tm,
>  	if (dump_shader)
>  		fprintf(stderr, "disasm:\n%s\n", binary->disasm_string);
>  
> -       ac_shader_binary_read_config(binary, config, 0);
> +       ac_shader_binary_read_config(binary, config, 0, supports_spill);
>  
>  	LLVMContextRef ctx = LLVMGetModuleContext(llvm_module);
>  	LLVMDisposeModule(llvm_module);
> @@ -4637,7 +4641,7 @@ void ac_compile_nir_shader(LLVMTargetMachineRef tm,
>  	LLVMModuleRef llvm_module = ac_translate_nir_to_llvm(tm, nir, shader_info,
>  	                                                     options);
>  
> -       ac_compile_llvm_module(tm, llvm_module, binary, config, shader_info, nir->stage, dump_shader);
> +       ac_compile_llvm_module(tm, llvm_module, binary, config, shader_info, nir->stage, dump_shader, options->supports_spill);
>  	switch (nir->stage) {
>  	case MESA_SHADER_COMPUTE:
>  		for (int i = 0; i < 3; ++i)
> diff --git a/src/amd/common/ac_nir_to_llvm.h b/src/amd/common/ac_nir_to_llvm.h
> index a57558e..9d66f94 100644
> --- a/src/amd/common/ac_nir_to_llvm.h
> +++ b/src/amd/common/ac_nir_to_llvm.h
> @@ -52,6 +52,7 @@ struct ac_nir_compiler_options {
>  	struct radv_pipeline_layout *layout;
>  	union ac_shader_variant_key key;
>  	bool unsafe_math;
> +       bool supports_spill;
>  	enum radeon_family family;
>  	enum chip_class chip_class;
>  };
> @@ -64,8 +65,9 @@ struct ac_userdata_info {
>  };
>  
>  enum ac_ud_index {
> -       AC_UD_PUSH_CONSTANTS = 0,
> -       AC_UD_SHADER_START = 1,
> +       AC_UD_SCRATCH = 0,
> +       AC_UD_PUSH_CONSTANTS = 1,
> +       AC_UD_SHADER_START = 2,
>  	AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START,
>  	AC_UD_VS_BASE_VERTEX_START_INSTANCE,
>  	AC_UD_VS_MAX_UD,
> diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
> index c62d275..e904897 100644
> --- a/src/amd/vulkan/radv_cmd_buffer.c
> +++ b/src/amd/vulkan/radv_cmd_buffer.c
> @@ -466,6 +466,13 @@ radv_emit_vertex_shader(struct radv_cmd_buffer *cmd_buffer,
>  	va = ws->buffer_get_va(vs->bo);
>  	ws->cs_add_buffer(cmd_buffer->cs, vs->bo, 8);
>  
> +       if (vs->config.scratch_bytes_per_wave) {
> +               uint32_t needed = vs->config.scratch_bytes_per_wave * cmd_buffer->device->scratch_waves;
> +               if (needed > cmd_buffer->scratch_size_needed)
> +                       cmd_buffer->scratch_size_needed = needed;
> +               cmd_buffer->scratch_needed_mask |= (1 << MESA_SHADER_VERTEX);
> +       }
> +
>  	clip_dist_mask = vs->info.vs.clip_dist_mask;
>  	cull_dist_mask = vs->info.vs.cull_dist_mask;
>  	total_mask = clip_dist_mask | cull_dist_mask;
> @@ -536,6 +543,13 @@ radv_emit_fragment_shader(struct radv_cmd_buffer *cmd_buffer,
>  	va = ws->buffer_get_va(ps->bo);
>  	ws->cs_add_buffer(cmd_buffer->cs, ps->bo, 8);
>  
> +       if (ps->config.scratch_bytes_per_wave) {
> +               uint32_t needed = ps->config.scratch_bytes_per_wave * cmd_buffer->device->scratch_waves;
> +               if (needed > cmd_buffer->scratch_size_needed)
> +                       cmd_buffer->scratch_size_needed = needed;
> +               cmd_buffer->scratch_needed_mask |= (1 << MESA_SHADER_FRAGMENT);
> +       }
> +
>  	radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B020_SPI_SHADER_PGM_LO_PS, 4);
>  	radeon_emit(cmd_buffer->cs, va >> 8);
>  	radeon_emit(cmd_buffer->cs, va >> 40);
> @@ -627,6 +641,15 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer,
>  	radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
>  			       pipeline->graphics.prim_restart_enable);
>  
> +       uint32_t max_scratch_bytes_per_wave = 0;
> +       max_scratch_bytes_per_wave = MAX2(max_scratch_bytes_per_wave,
> +                                         pipeline->shaders[MESA_SHADER_VERTEX]->config.scratch_bytes_per_wave);
> +       max_scratch_bytes_per_wave = MAX2(max_scratch_bytes_per_wave,
> +                                         pipeline->shaders[MESA_SHADER_FRAGMENT]->config.scratch_bytes_per_wave);
> +
> +       radeon_set_context_reg(cmd_buffer->cs, R_0286E8_SPI_TMPRING_SIZE,
> +                              S_0286E8_WAVES(cmd_buffer->device->scratch_waves) |
> +                              S_0286E8_WAVESIZE(max_scratch_bytes_per_wave >> 10));
>  	cmd_buffer->state.emitted_pipeline = pipeline;
>  }
>  
> @@ -1372,6 +1395,13 @@ radv_cmd_buffer_destroy(struct radv_cmd_buffer *cmd_buffer)
>  
>  	if (cmd_buffer->upload.upload_bo)
>  		cmd_buffer->device->ws->buffer_destroy(cmd_buffer->upload.upload_bo);
> +
> +       if (cmd_buffer->scratch_bo)
> +               cmd_buffer->device->ws->buffer_destroy(cmd_buffer->scratch_bo);
> +
> +       if (cmd_buffer->compute_scratch_bo)
> +               cmd_buffer->device->ws->buffer_destroy(cmd_buffer->compute_scratch_bo);
> +
>  	cmd_buffer->device->ws->cs_destroy(cmd_buffer->cs);
>  	vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
>  }
> @@ -1402,6 +1432,19 @@ static void  radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
>  		free(up);
>  	}
>  
> +       if (cmd_buffer->scratch_bo) {
> +               cmd_buffer->device->ws->buffer_destroy(cmd_buffer->scratch_bo);
> +               cmd_buffer->scratch_bo = NULL;
> +       }
> +
> +       if (cmd_buffer->compute_scratch_bo) {
> +               cmd_buffer->device->ws->buffer_destroy(cmd_buffer->compute_scratch_bo);
> +               cmd_buffer->compute_scratch_bo = NULL;
> +       }
> +
> +       cmd_buffer->scratch_needed_mask = 0;
> +       cmd_buffer->scratch_size_needed = 0;
> +       cmd_buffer->compute_scratch_size_needed = 0;
>  	if (cmd_buffer->upload.upload_bo)
>  		cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs,
>  						      cmd_buffer->upload.upload_bo, 8);
> @@ -1457,6 +1500,19 @@ VkResult radv_BeginCommandBuffer(
>  		default:
>  			break;
>  		}
> +
> +               uint32_t pad_word = 0xffff1000U;
> +               if (cmd_buffer->device->physical_device->rad_info.gfx_ib_pad_with_type2)
> +                       pad_word = 0x80000000;
> +
> +               cmd_buffer->scratch_patch_idx = cmd_buffer->cs->cdw;
> +               cmd_buffer->cs_to_patch_scratch = cmd_buffer->cs->buf;
> +               for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
> +                       radeon_emit(cmd_buffer->cs, pad_word);
> +                       radeon_emit(cmd_buffer->cs, pad_word);
> +                       radeon_emit(cmd_buffer->cs, pad_word);
> +                       radeon_emit(cmd_buffer->cs, pad_word);
> +               }
>  	}
>  
>  	if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
> @@ -1594,6 +1650,70 @@ VkResult radv_EndCommandBuffer(
>  
>  	if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER)
>  		si_emit_cache_flush(cmd_buffer);
> +
> +       int idx = cmd_buffer->scratch_patch_idx;
> +       if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY && cmd_buffer->compute_scratch_size_needed) {
> +               cmd_buffer->compute_scratch_bo = cmd_buffer->device->ws->buffer_create(cmd_buffer->device->ws,
> +                                                                                       cmd_buffer->compute_scratch_size_needed,
> +                                                                                       4096,
> +                                                                                       RADEON_DOMAIN_VRAM,
> +                                                                                       RADEON_FLAG_NO_CPU_ACCESS);
> +
> +               if (!cmd_buffer->compute_scratch_bo) {
> +                       cmd_buffer->record_fail = true;
> +                       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
> +               }
> +               cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, cmd_buffer->compute_scratch_bo, 8);
> +
> +               uint64_t scratch_va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->compute_scratch_bo);
> +               uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
> +                       S_008F04_SWIZZLE_ENABLE(1);
> +               uint32_t reg_base;
> +
> +               reg_base = shader_stage_to_user_data_0(MESA_SHADER_COMPUTE);
> +               cmd_buffer->cs_to_patch_scratch[idx++] = PKT3(PKT3_SET_SH_REG, 2, 0);
> +               cmd_buffer->cs_to_patch_scratch[idx++] = (reg_base - SI_SH_REG_OFFSET) >> 2;
> +               cmd_buffer->cs_to_patch_scratch[idx++] = scratch_va;
> +               cmd_buffer->cs_to_patch_scratch[idx++] = rsrc1;
> +       }
> +
> +       if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY && cmd_buffer->scratch_size_needed) {
> +               cmd_buffer->scratch_bo = cmd_buffer->device->ws->buffer_create(cmd_buffer->device->ws,
> +                                                                               cmd_buffer->scratch_size_needed,
> +                                                                               4096,
> +                                                                               RADEON_DOMAIN_VRAM,
> +                                                                               RADEON_FLAG_NO_CPU_ACCESS);
> +
> +               if (!cmd_buffer->scratch_bo) {
> +                       cmd_buffer->record_fail = true;
> +                       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
> +               }
> +
> +               cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, cmd_buffer->scratch_bo, 8);
> +
> +               uint64_t scratch_va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->scratch_bo);
> +               uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
> +                       S_008F04_SWIZZLE_ENABLE(1);
> +
> +               uint32_t *ring_ptr;
> +               uint32_t ring_offset;
> +               radv_cmd_buffer_upload_alloc(cmd_buffer, 4 * 4, 256, &ring_offset,
> +                                            (void **)&ring_ptr);
> +               ring_ptr[0] = scratch_va;
> +               ring_ptr[1] = rsrc1;
> +               uint64_t va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->upload.upload_bo) + ring_offset;
> +
> +               radv_foreach_stage(stage, cmd_buffer->scratch_needed_mask) {
> +                       uint32_t reg_base;
> +
> +                       reg_base = shader_stage_to_user_data_0(stage);
> +                       cmd_buffer->cs_to_patch_scratch[idx++] = PKT3(PKT3_SET_SH_REG, 2, 0);
> +                       cmd_buffer->cs_to_patch_scratch[idx++] = (reg_base - SI_SH_REG_OFFSET) >> 2;
> +                       cmd_buffer->cs_to_patch_scratch[idx++] = va;
> +                       cmd_buffer->cs_to_patch_scratch[idx++] = va >> 32;
> +               }
> +       }
> +
>  	if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs) ||
>  	    cmd_buffer->record_fail)
>  		return VK_ERROR_OUT_OF_DEVICE_MEMORY;
> @@ -1629,9 +1749,16 @@ radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer)
>  	radeon_emit(cmd_buffer->cs, compute_shader->rsrc1);
>  	radeon_emit(cmd_buffer->cs, compute_shader->rsrc2);
>  
> +       if (compute_shader->config.scratch_bytes_per_wave) {
> +               uint32_t needed = compute_shader->config.scratch_bytes_per_wave * cmd_buffer->device->scratch_waves;
> +               if (needed > cmd_buffer->compute_scratch_size_needed)
> +                       cmd_buffer->compute_scratch_size_needed = needed;
> +       }
> +
>  	/* change these once we have scratch support */
>  	radeon_set_sh_reg(cmd_buffer->cs, R_00B860_COMPUTE_TMPRING_SIZE,
> -                         S_00B860_WAVES(32) | S_00B860_WAVESIZE(0));
> +                         S_00B860_WAVES(cmd_buffer->device->scratch_waves) |
> +                         S_00B860_WAVESIZE(compute_shader->config.scratch_bytes_per_wave >> 10));
>  
>  	radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
>  	radeon_emit(cmd_buffer->cs,
> @@ -1821,6 +1948,14 @@ void radv_CmdExecuteCommands(
>  	for (uint32_t i = 0; i < commandBufferCount; i++) {
>  		RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
>  
> +               if (secondary->scratch_size_needed > primary->scratch_size_needed)
> +                       primary->scratch_size_needed = secondary->scratch_size_needed;
> +
> +               if (secondary->compute_scratch_size_needed > primary->compute_scratch_size_needed)
> +                       primary->compute_scratch_size_needed = secondary->compute_scratch_size_needed;
> +
> +               primary->scratch_needed_mask |= secondary->scratch_needed_mask;
> +
>  		primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs);
>  	}
>  
> diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
> index 4aa6af2..c465186 100644
> --- a/src/amd/vulkan/radv_device.c
> +++ b/src/amd/vulkan/radv_device.c
> @@ -781,6 +781,13 @@ VkResult radv_CreateDevice(
>  		}
>  	}
>  
> +       /* TODO : predicate on LLVM version this goes into */
> +#if HAVE_LLVM < 0x0500
> +       device->llvm_supports_spill = false;
> +#else
> +       device->llvm_supports_spill = true;
> +#endif
> +
>  	result = radv_device_init_meta(device);
>  	if (result != VK_SUCCESS)
>  		goto fail;
> @@ -814,6 +821,21 @@ VkResult radv_CreateDevice(
>  			goto fail;
>  	}
>  
> +       /* The maximum number of scratch waves. Scratch space isn't divided
> +        * evenly between CUs. The number is only a function of the number of CUs.
> +        * We can decrease the constant to decrease the scratch buffer size.
> +        *
> +        * sctx->scratch_waves must be >= the maximum possible size of
> +        * 1 threadgroup, so that the hw doesn't hang from being unable
> +        * to start any.
> +        *
> +        * The recommended value is 4 per CU at most. Higher numbers don't
> +        * bring much benefit, but they still occupy chip resources (think
> +        * async compute). I've seen ~2% performance difference between 4 and 32.
> +        */
> +       uint32_t max_threads_per_block = 2048;
> +       device->scratch_waves = MAX2(32 * physical_device->rad_info.num_good_compute_units,
> +                                    max_threads_per_block / 64);
>  	*pDevice = radv_device_to_handle(device);
>  	return VK_SUCCESS;
>  
> diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
> index 360b519..060cfbb 100644
> --- a/src/amd/vulkan/radv_pipeline.c
> +++ b/src/amd/vulkan/radv_pipeline.c
> @@ -354,12 +354,13 @@ static void radv_fill_shader_variant(struct radv_device *device,
>  				     struct ac_shader_binary *binary,
>  				     gl_shader_stage stage)
>  {
> -       variant->code_size = binary->code_size;
>  	bool scratch_enabled = variant->config.scratch_bytes_per_wave > 0;
>  	unsigned vgpr_comp_cnt = 0;
>  
> -       if (scratch_enabled)
> -               radv_finishme("shader scratch space");
> +       if (scratch_enabled && !device->llvm_supports_spill)
> +               radv_finishme("shader scratch support only available with LLVM 5.0");
> +
> +       variant->code_size = binary->code_size;
>  
>  	switch (stage) {
>  	case MESA_SHADER_VERTEX:
> @@ -424,7 +425,8 @@ static struct radv_shader_variant *radv_shader_variant_create(struct radv_device
>  	options.unsafe_math = !!(device->debug_flags & RADV_DEBUG_UNSAFE_MATH);
>  	options.family = chip_family;
>  	options.chip_class = device->physical_device->rad_info.chip_class;
> -       tm = ac_create_target_machine(chip_family);
> +       options.supports_spill = device->llvm_supports_spill;
> +       tm = ac_create_target_machine(chip_family, options.supports_spill);
>  	ac_compile_nir_shader(tm, &binary, &variant->config,
>  			      &variant->info, shader, &options, dump);
>  	LLVMDisposeTargetMachine(tm);
> diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
> index 0b8f50a..6c746b5 100644
> --- a/src/amd/vulkan/radv_private.h
> +++ b/src/amd/vulkan/radv_private.h
> @@ -485,6 +485,8 @@ struct radv_device {
>  
>  	uint64_t debug_flags;
>  
> +       bool llvm_supports_spill;
> +       uint32_t scratch_waves;
>  	/* MSAA sample locations.
>  	 * The first index is the sample index.
>  	 * The second index is the coordinate: X, Y. */
> @@ -726,6 +728,17 @@ struct radv_cmd_buffer {
>  	struct radv_cmd_buffer_upload upload;
>  
>  	bool record_fail;
> +
> +       /* for primary cmd buffers */
> +       struct radeon_winsys_bo *scratch_bo;
> +       struct radeon_winsys_bo *compute_scratch_bo;
> +       uint32_t scratch_patch_idx;
> +       uint32_t *cs_to_patch_scratch;
> +
> +       /* for primary + secondary cmd buffers */
> +       uint32_t scratch_needed_mask;
> +       uint32_t scratch_size_needed;
> +       uint32_t compute_scratch_size_needed;
>  };
>  
>  struct radv_image;
> -- 
> 2.7.4
> 
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev

