[Mesa-dev] [PATCH] radv: add scratch support for spilling.
Bas Nieuwenhuizen
bas at basnieuwenhuizen.nl
Wed Jan 25 07:43:36 UTC 2017
I'm not sure if using a scratch buffer per command buffer is correct.
AFAIU each ring has a separate counter for the scratch offsets, and if a
command buffer is used in multiple compute rings at the same time, these
separate counters could conflict.
I'd think we need a preamble IB per queue that sets SGPR0/1 for all
relevant stages, and modify the winsys so that it is submitted in the
same submit ioctl as the application command buffers.
- Bas
On Tue, Jan 24, 2017, at 18:32, Dave Airlie wrote:
> From: Dave Airlie <airlied at redhat.com>
>
> Currently LLVM 5.0 has support for spilling to a place
> pointed to by the user sgprs instead of using relocations.
>
> This is enabled by using the amdgcn-mesa-mesa3d triple.
>
> For compute gfx shaders we spill to a buffer pointed to
> by 64-bit address stored in sgprs 0/1.
> For other gfx shaders we spill to a buffer pointed to by
> the first two dwords of the buffer pointed to in sgprs 0/1.
>
> This patch enables radv to use the llvm support when present.
>
> This fixes Sascha Willems computeshader demo first screen,
> and a bunch of CTS tests now pass.
>
> This patch is likely to be in LLVM 4.0 release as well
> (fingers crossed) in which case we need to adjust the detection
> logic.
>
> Signed-off-by: Dave Airlie <airlied at redhat.com>
> ---
> src/amd/common/ac_binary.c | 30 +++++----
> src/amd/common/ac_binary.h | 4 +-
> src/amd/common/ac_llvm_util.c | 4 +-
> src/amd/common/ac_llvm_util.h | 2 +-
> src/amd/common/ac_nir_to_llvm.c | 14 ++--
> src/amd/common/ac_nir_to_llvm.h | 6 +-
> src/amd/vulkan/radv_cmd_buffer.c | 137
> ++++++++++++++++++++++++++++++++++++++-
> src/amd/vulkan/radv_device.c | 22 +++++++
> src/amd/vulkan/radv_pipeline.c | 10 +--
> src/amd/vulkan/radv_private.h | 13 ++++
> 10 files changed, 215 insertions(+), 27 deletions(-)
>
> diff --git a/src/amd/common/ac_binary.c b/src/amd/common/ac_binary.c
> index 01cf000..9c66a82 100644
> --- a/src/amd/common/ac_binary.c
> +++ b/src/amd/common/ac_binary.c
> @@ -212,23 +212,28 @@ static const char *scratch_rsrc_dword1_symbol =
>
> void ac_shader_binary_read_config(struct ac_shader_binary *binary,
> struct ac_shader_config *conf,
> - unsigned symbol_offset)
> + unsigned symbol_offset,
> + bool supports_spill)
> {
> unsigned i;
> const unsigned char *config =
> ac_shader_binary_config_start(binary, symbol_offset);
> bool really_needs_scratch = false;
> -
> + uint32_t wavesize = 0;
> /* LLVM adds SGPR spills to the scratch size.
> * Find out if we really need the scratch buffer.
> */
> - for (i = 0; i < binary->reloc_count; i++) {
> - const struct ac_shader_reloc *reloc = &binary->relocs[i];
> + if (supports_spill) {
> + really_needs_scratch = true;
> + } else {
> + for (i = 0; i < binary->reloc_count; i++) {
> + const struct ac_shader_reloc *reloc =
> &binary->relocs[i];
>
> - if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) ||
> - !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
> - really_needs_scratch = true;
> - break;
> + if (!strcmp(scratch_rsrc_dword0_symbol,
> reloc->name) ||
> + !strcmp(scratch_rsrc_dword1_symbol,
> reloc->name)) {
> + really_needs_scratch = true;
> + break;
> + }
> }
> }
>
> @@ -259,9 +264,7 @@ void ac_shader_binary_read_config(struct
> ac_shader_binary *binary,
> case R_0286E8_SPI_TMPRING_SIZE:
> case R_00B860_COMPUTE_TMPRING_SIZE:
> /* WAVESIZE is in units of 256 dwords. */
> - if (really_needs_scratch)
> - conf->scratch_bytes_per_wave =
> - G_00B860_WAVESIZE(value) * 256 *
> 4;
> + wavesize = value;
> break;
> case SPILLED_SGPRS:
> conf->spilled_sgprs = value;
> @@ -285,4 +288,9 @@ void ac_shader_binary_read_config(struct
> ac_shader_binary *binary,
> if (!conf->spi_ps_input_addr)
> conf->spi_ps_input_addr = conf->spi_ps_input_ena;
> }
> +
> + if (really_needs_scratch) {
> + /* sgprs spills aren't spilling */
> + conf->scratch_bytes_per_wave =
> G_00B860_WAVESIZE(wavesize) * 256 * 4;
> + }
> }
> diff --git a/src/amd/common/ac_binary.h b/src/amd/common/ac_binary.h
> index 282f33d..06fd855 100644
> --- a/src/amd/common/ac_binary.h
> +++ b/src/amd/common/ac_binary.h
> @@ -27,6 +27,7 @@
> #pragma once
>
> #include <stdint.h>
> +#include <stdbool.h>
>
> struct ac_shader_reloc {
> char name[32];
> @@ -85,4 +86,5 @@ void ac_elf_read(const char *elf_data, unsigned
> elf_size,
>
> void ac_shader_binary_read_config(struct ac_shader_binary *binary,
> struct ac_shader_config *conf,
> - unsigned symbol_offset);
> + unsigned symbol_offset,
> + bool supports_spill);
> diff --git a/src/amd/common/ac_llvm_util.c
> b/src/amd/common/ac_llvm_util.c
> index 770e3bd..3ba5281 100644
> --- a/src/amd/common/ac_llvm_util.c
> +++ b/src/amd/common/ac_llvm_util.c
> @@ -126,11 +126,11 @@ static const char *ac_get_llvm_processor_name(enum
> radeon_family family)
> }
> }
>
> -LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family)
> +LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family,
> bool supports_spill)
> {
> assert(family >= CHIP_TAHITI);
>
> - const char *triple = "amdgcn--";
> + const char *triple = supports_spill ? "amdgcn-mesa-mesa3d" :
> "amdgcn--";
> LLVMTargetRef target = ac_get_llvm_target(triple);
> LLVMTargetMachineRef tm = LLVMCreateTargetMachine(
> target,
> diff --git a/src/amd/common/ac_llvm_util.h
> b/src/amd/common/ac_llvm_util.h
> index 802c266..2a5f325 100644
> --- a/src/amd/common/ac_llvm_util.h
> +++ b/src/amd/common/ac_llvm_util.h
> @@ -56,7 +56,7 @@ struct ac_llvm_context {
> LLVMValueRef fpmath_md_2p5_ulp;
> };
>
> -LLVMTargetMachineRef ac_create_target_machine(enum radeon_family
> family);
> +LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family,
> bool supports_spill);
>
> void ac_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes);
> bool ac_is_sgpr_param(LLVMValueRef param);
> diff --git a/src/amd/common/ac_nir_to_llvm.c
> b/src/amd/common/ac_nir_to_llvm.c
> index 26b87e8..43e079e 100644
> --- a/src/amd/common/ac_nir_to_llvm.c
> +++ b/src/amd/common/ac_nir_to_llvm.c
> @@ -458,10 +458,10 @@ static void create_function(struct
> nir_to_llvm_context *ctx)
> arg_idx, array_params_mask, sgpr_count, ctx->options->unsafe_math);
> set_llvm_calling_convention(ctx->main_function, ctx->stage);
>
> -
> ctx->shader_info->num_input_sgprs = 0;
> ctx->shader_info->num_input_vgprs = 0;
>
> + ctx->shader_info->num_user_sgprs = ctx->options->supports_spill ?
> 2 : 0;
> for (i = 0; i < user_sgpr_count; i++)
> ctx->shader_info->num_user_sgprs += llvm_get_type_size(arg_types[i]) / 4;
>
> @@ -475,6 +475,10 @@ static void create_function(struct
> nir_to_llvm_context *ctx)
>
> arg_idx = 0;
> user_sgpr_idx = 0;
> +
> + set_userdata_location_shader(ctx, AC_UD_SCRATCH, user_sgpr_idx,
> 2);
> + user_sgpr_idx += 2;
> +
> for (unsigned i = 0; i < num_sets; ++i) {
> if (ctx->options->layout->set[i].layout->shader_stages & (1 << ctx->stage)) {
> set_userdata_location(&ctx->shader_info->user_sgprs_locs.descriptor_sets[i], user_sgpr_idx, 2);
> @@ -4429,7 +4433,7 @@ LLVMModuleRef
> ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
>
> memset(shader_info, 0, sizeof(*shader_info));
>
> - LLVMSetTarget(ctx.module, "amdgcn--");
> + LLVMSetTarget(ctx.module, options->supports_spill ?
> "amdgcn-mesa-mesa3d" : "amdgcn--");
> setup_types(&ctx);
>
> ctx.builder = LLVMCreateBuilderInContext(ctx.context);
> @@ -4563,7 +4567,7 @@ static void
> ac_compile_llvm_module(LLVMTargetMachineRef tm,
> struct ac_shader_config *config,
> struct ac_shader_variant_info *shader_info,
> gl_shader_stage stage,
> - bool dump_shader)
> + bool dump_shader, bool supports_spill)
> {
> if (dump_shader)
> LLVMDumpModule(llvm_module);
> @@ -4577,7 +4581,7 @@ static void
> ac_compile_llvm_module(LLVMTargetMachineRef tm,
> if (dump_shader)
> fprintf(stderr, "disasm:\n%s\n", binary->disasm_string);
>
> - ac_shader_binary_read_config(binary, config, 0);
> + ac_shader_binary_read_config(binary, config, 0, supports_spill);
>
> LLVMContextRef ctx = LLVMGetModuleContext(llvm_module);
> LLVMDisposeModule(llvm_module);
> @@ -4637,7 +4641,7 @@ void ac_compile_nir_shader(LLVMTargetMachineRef tm,
> LLVMModuleRef llvm_module = ac_translate_nir_to_llvm(tm, nir, shader_info,
> options);
>
> - ac_compile_llvm_module(tm, llvm_module, binary, config,
> shader_info, nir->stage, dump_shader);
> + ac_compile_llvm_module(tm, llvm_module, binary, config,
> shader_info, nir->stage, dump_shader, options->supports_spill);
> switch (nir->stage) {
> case MESA_SHADER_COMPUTE:
> for (int i = 0; i < 3; ++i)
> diff --git a/src/amd/common/ac_nir_to_llvm.h
> b/src/amd/common/ac_nir_to_llvm.h
> index a57558e..9d66f94 100644
> --- a/src/amd/common/ac_nir_to_llvm.h
> +++ b/src/amd/common/ac_nir_to_llvm.h
> @@ -52,6 +52,7 @@ struct ac_nir_compiler_options {
> struct radv_pipeline_layout *layout;
> union ac_shader_variant_key key;
> bool unsafe_math;
> + bool supports_spill;
> enum radeon_family family;
> enum chip_class chip_class;
> };
> @@ -64,8 +65,9 @@ struct ac_userdata_info {
> };
>
> enum ac_ud_index {
> - AC_UD_PUSH_CONSTANTS = 0,
> - AC_UD_SHADER_START = 1,
> + AC_UD_SCRATCH = 0,
> + AC_UD_PUSH_CONSTANTS = 1,
> + AC_UD_SHADER_START = 2,
> AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START,
> AC_UD_VS_BASE_VERTEX_START_INSTANCE,
> AC_UD_VS_MAX_UD,
> diff --git a/src/amd/vulkan/radv_cmd_buffer.c
> b/src/amd/vulkan/radv_cmd_buffer.c
> index c62d275..e904897 100644
> --- a/src/amd/vulkan/radv_cmd_buffer.c
> +++ b/src/amd/vulkan/radv_cmd_buffer.c
> @@ -466,6 +466,13 @@ radv_emit_vertex_shader(struct radv_cmd_buffer
> *cmd_buffer,
> va = ws->buffer_get_va(vs->bo);
> ws->cs_add_buffer(cmd_buffer->cs, vs->bo, 8);
>
> + if (vs->config.scratch_bytes_per_wave) {
> + uint32_t needed = vs->config.scratch_bytes_per_wave *
> cmd_buffer->device->scratch_waves;
> + if (needed > cmd_buffer->scratch_size_needed)
> + cmd_buffer->scratch_size_needed = needed;
> + cmd_buffer->scratch_needed_mask |= (1 <<
> MESA_SHADER_VERTEX);
> + }
> +
> clip_dist_mask = vs->info.vs.clip_dist_mask;
> cull_dist_mask = vs->info.vs.cull_dist_mask;
> total_mask = clip_dist_mask | cull_dist_mask;
> @@ -536,6 +543,13 @@ radv_emit_fragment_shader(struct radv_cmd_buffer
> *cmd_buffer,
> va = ws->buffer_get_va(ps->bo);
> ws->cs_add_buffer(cmd_buffer->cs, ps->bo, 8);
>
> + if (ps->config.scratch_bytes_per_wave) {
> + uint32_t needed = ps->config.scratch_bytes_per_wave *
> cmd_buffer->device->scratch_waves;
> + if (needed > cmd_buffer->scratch_size_needed)
> + cmd_buffer->scratch_size_needed = needed;
> + cmd_buffer->scratch_needed_mask |= (1 <<
> MESA_SHADER_FRAGMENT);
> + }
> +
> radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B020_SPI_SHADER_PGM_LO_PS, 4);
> radeon_emit(cmd_buffer->cs, va >> 8);
> radeon_emit(cmd_buffer->cs, va >> 40);
> @@ -627,6 +641,15 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer
> *cmd_buffer,
> radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
> pipeline->graphics.prim_restart_enable);
>
> + uint32_t max_scratch_bytes_per_wave = 0;
> + max_scratch_bytes_per_wave = MAX2(max_scratch_bytes_per_wave,
> +
> pipeline->shaders[MESA_SHADER_VERTEX]->config.scratch_bytes_per_wave);
> + max_scratch_bytes_per_wave = MAX2(max_scratch_bytes_per_wave,
> +
> pipeline->shaders[MESA_SHADER_FRAGMENT]->config.scratch_bytes_per_wave);
> +
> + radeon_set_context_reg(cmd_buffer->cs, R_0286E8_SPI_TMPRING_SIZE,
> +
> S_0286E8_WAVES(cmd_buffer->device->scratch_waves) |
> +
> S_0286E8_WAVESIZE(max_scratch_bytes_per_wave >> 10));
> cmd_buffer->state.emitted_pipeline = pipeline;
> }
>
> @@ -1372,6 +1395,13 @@ radv_cmd_buffer_destroy(struct radv_cmd_buffer
> *cmd_buffer)
>
> if (cmd_buffer->upload.upload_bo)
> cmd_buffer->device->ws->buffer_destroy(cmd_buffer->upload.upload_bo);
> +
> + if (cmd_buffer->scratch_bo)
> +
> cmd_buffer->device->ws->buffer_destroy(cmd_buffer->scratch_bo);
> +
> + if (cmd_buffer->compute_scratch_bo)
> +
> cmd_buffer->device->ws->buffer_destroy(cmd_buffer->compute_scratch_bo);
> +
> cmd_buffer->device->ws->cs_destroy(cmd_buffer->cs);
> vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
> }
> @@ -1402,6 +1432,19 @@ static void radv_reset_cmd_buffer(struct
> radv_cmd_buffer *cmd_buffer)
> free(up);
> }
>
> + if (cmd_buffer->scratch_bo) {
> +
> cmd_buffer->device->ws->buffer_destroy(cmd_buffer->scratch_bo);
> + cmd_buffer->scratch_bo = NULL;
> + }
> +
> + if (cmd_buffer->compute_scratch_bo) {
> +
> cmd_buffer->device->ws->buffer_destroy(cmd_buffer->compute_scratch_bo);
> + cmd_buffer->compute_scratch_bo = NULL;
> + }
> +
> + cmd_buffer->scratch_needed_mask = 0;
> + cmd_buffer->scratch_size_needed = 0;
> + cmd_buffer->compute_scratch_size_needed = 0;
> if (cmd_buffer->upload.upload_bo)
> cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs,
> cmd_buffer->upload.upload_bo, 8);
> @@ -1457,6 +1500,19 @@ VkResult radv_BeginCommandBuffer(
> default:
> break;
> }
> +
> + uint32_t pad_word = 0xffff1000U;
> + if
> (cmd_buffer->device->physical_device->rad_info.gfx_ib_pad_with_type2)
> + pad_word = 0x80000000;
> +
> + cmd_buffer->scratch_patch_idx = cmd_buffer->cs->cdw;
> + cmd_buffer->cs_to_patch_scratch = cmd_buffer->cs->buf;
> + for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
> + radeon_emit(cmd_buffer->cs, pad_word);
> + radeon_emit(cmd_buffer->cs, pad_word);
> + radeon_emit(cmd_buffer->cs, pad_word);
> + radeon_emit(cmd_buffer->cs, pad_word);
> + }
> }
>
> if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
> @@ -1594,6 +1650,70 @@ VkResult radv_EndCommandBuffer(
>
> if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER)
> si_emit_cache_flush(cmd_buffer);
> +
> + int idx = cmd_buffer->scratch_patch_idx;
> + if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
> cmd_buffer->compute_scratch_size_needed) {
> + cmd_buffer->compute_scratch_bo =
> cmd_buffer->device->ws->buffer_create(cmd_buffer->device->ws,
> +
> cmd_buffer->compute_scratch_size_needed,
> +
> 4096,
> +
> RADEON_DOMAIN_VRAM,
> +
> RADEON_FLAG_NO_CPU_ACCESS);
> +
> + if (!cmd_buffer->compute_scratch_bo) {
> + cmd_buffer->record_fail = true;
> + return VK_ERROR_OUT_OF_DEVICE_MEMORY;
> + }
> + cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs,
> cmd_buffer->compute_scratch_bo, 8);
> +
> + uint64_t scratch_va =
> cmd_buffer->device->ws->buffer_get_va(cmd_buffer->compute_scratch_bo);
> + uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >>
> 32) |
> + S_008F04_SWIZZLE_ENABLE(1);
> + uint32_t reg_base;
> +
> + reg_base =
> shader_stage_to_user_data_0(MESA_SHADER_COMPUTE);
> + cmd_buffer->cs_to_patch_scratch[idx++] =
> PKT3(PKT3_SET_SH_REG, 2, 0);
> + cmd_buffer->cs_to_patch_scratch[idx++] = (reg_base -
> SI_SH_REG_OFFSET) >> 2;
> + cmd_buffer->cs_to_patch_scratch[idx++] = scratch_va;
> + cmd_buffer->cs_to_patch_scratch[idx++] = rsrc1;
> + }
> +
> + if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
> cmd_buffer->scratch_size_needed) {
> + cmd_buffer->scratch_bo =
> cmd_buffer->device->ws->buffer_create(cmd_buffer->device->ws,
> +
> cmd_buffer->scratch_size_needed,
> +
> 4096,
> +
> RADEON_DOMAIN_VRAM,
> +
> RADEON_FLAG_NO_CPU_ACCESS);
> +
> + if (!cmd_buffer->scratch_bo) {
> + cmd_buffer->record_fail = true;
> + return VK_ERROR_OUT_OF_DEVICE_MEMORY;
> + }
> +
> + cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs,
> cmd_buffer->scratch_bo, 8);
> +
> + uint64_t scratch_va =
> cmd_buffer->device->ws->buffer_get_va(cmd_buffer->scratch_bo);
> + uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >>
> 32) |
> + S_008F04_SWIZZLE_ENABLE(1);
> +
> + uint32_t *ring_ptr;
> + uint32_t ring_offset;
> + radv_cmd_buffer_upload_alloc(cmd_buffer, 4 * 4, 256,
> &ring_offset,
> + (void **)&ring_ptr);
> + ring_ptr[0] = scratch_va;
> + ring_ptr[1] = rsrc1;
> + uint64_t va =
> cmd_buffer->device->ws->buffer_get_va(cmd_buffer->upload.upload_bo) +
> ring_offset;
> +
> + radv_foreach_stage(stage,
> cmd_buffer->scratch_needed_mask) {
> + uint32_t reg_base;
> +
> + reg_base = shader_stage_to_user_data_0(stage);
> + cmd_buffer->cs_to_patch_scratch[idx++] =
> PKT3(PKT3_SET_SH_REG, 2, 0);
> + cmd_buffer->cs_to_patch_scratch[idx++] =
> (reg_base - SI_SH_REG_OFFSET) >> 2;
> + cmd_buffer->cs_to_patch_scratch[idx++] = va;
> + cmd_buffer->cs_to_patch_scratch[idx++] = va >>
> 32;
> + }
> + }
> +
> if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs) ||
> cmd_buffer->record_fail)
> return VK_ERROR_OUT_OF_DEVICE_MEMORY;
> @@ -1629,9 +1749,16 @@ radv_emit_compute_pipeline(struct radv_cmd_buffer
> *cmd_buffer)
> radeon_emit(cmd_buffer->cs, compute_shader->rsrc1);
> radeon_emit(cmd_buffer->cs, compute_shader->rsrc2);
>
> + if (compute_shader->config.scratch_bytes_per_wave) {
> + uint32_t needed =
> compute_shader->config.scratch_bytes_per_wave *
> cmd_buffer->device->scratch_waves;
> + if (needed > cmd_buffer->compute_scratch_size_needed)
> + cmd_buffer->compute_scratch_size_needed = needed;
> + }
> +
> /* change these once we have scratch support */
> radeon_set_sh_reg(cmd_buffer->cs, R_00B860_COMPUTE_TMPRING_SIZE,
> - S_00B860_WAVES(32) | S_00B860_WAVESIZE(0));
> +
> S_00B860_WAVES(cmd_buffer->device->scratch_waves) |
> +
> S_00B860_WAVESIZE(compute_shader->config.scratch_bytes_per_wave >> 10));
>
> radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
> radeon_emit(cmd_buffer->cs,
> @@ -1821,6 +1948,14 @@ void radv_CmdExecuteCommands(
> for (uint32_t i = 0; i < commandBufferCount; i++) {
> RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
>
> + if (secondary->scratch_size_needed >
> primary->scratch_size_needed)
> + primary->scratch_size_needed =
> secondary->scratch_size_needed;
> +
> + if (secondary->compute_scratch_size_needed >
> primary->compute_scratch_size_needed)
> + primary->compute_scratch_size_needed =
> secondary->compute_scratch_size_needed;
> +
> + primary->scratch_needed_mask |=
> secondary->scratch_needed_mask;
> +
> primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs);
> }
>
> diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
> index 4aa6af2..c465186 100644
> --- a/src/amd/vulkan/radv_device.c
> +++ b/src/amd/vulkan/radv_device.c
> @@ -781,6 +781,13 @@ VkResult radv_CreateDevice(
> }
> }
>
> + /* TODO : predicate on LLVM version this goes into */
> +#if HAVE_LLVM < 0x0500
> + device->llvm_supports_spill = false;
> +#else
> + device->llvm_supports_spill = true;
> +#endif
> +
> result = radv_device_init_meta(device);
> if (result != VK_SUCCESS)
> goto fail;
> @@ -814,6 +821,21 @@ VkResult radv_CreateDevice(
> goto fail;
> }
>
> + /* The maximum number of scratch waves. Scratch space isn't
> divided
> + * evenly between CUs. The number is only a function of the
> number of CUs.
> + * We can decrease the constant to decrease the scratch buffer
> size.
> + *
> + * sctx->scratch_waves must be >= the maximum possible size of
> + * 1 threadgroup, so that the hw doesn't hang from being unable
> + * to start any.
> + *
> + * The recommended value is 4 per CU at most. Higher numbers
> don't
> + * bring much benefit, but they still occupy chip resources
> (think
> + * async compute). I've seen ~2% performance difference between 4
> and 32.
> + */
> + uint32_t max_threads_per_block = 2048;
> + device->scratch_waves = MAX2(32 *
> physical_device->rad_info.num_good_compute_units,
> + max_threads_per_block / 64);
> *pDevice = radv_device_to_handle(device);
> return VK_SUCCESS;
>
> diff --git a/src/amd/vulkan/radv_pipeline.c
> b/src/amd/vulkan/radv_pipeline.c
> index 360b519..060cfbb 100644
> --- a/src/amd/vulkan/radv_pipeline.c
> +++ b/src/amd/vulkan/radv_pipeline.c
> @@ -354,12 +354,13 @@ static void radv_fill_shader_variant(struct
> radv_device *device,
> struct ac_shader_binary *binary,
> gl_shader_stage stage)
> {
> - variant->code_size = binary->code_size;
> bool scratch_enabled = variant->config.scratch_bytes_per_wave > 0;
> unsigned vgpr_comp_cnt = 0;
>
> - if (scratch_enabled)
> - radv_finishme("shader scratch space");
> + if (scratch_enabled && !device->llvm_supports_spill)
> + radv_finishme("shader scratch support only available with
> LLVM 5.0");
> +
> + variant->code_size = binary->code_size;
>
> switch (stage) {
> case MESA_SHADER_VERTEX:
> @@ -424,7 +425,8 @@ static struct radv_shader_variant
> *radv_shader_variant_create(struct radv_device
> options.unsafe_math = !!(device->debug_flags & RADV_DEBUG_UNSAFE_MATH);
> options.family = chip_family;
> options.chip_class = device->physical_device->rad_info.chip_class;
> - tm = ac_create_target_machine(chip_family);
> + options.supports_spill = device->llvm_supports_spill;
> + tm = ac_create_target_machine(chip_family,
> options.supports_spill);
> ac_compile_nir_shader(tm, &binary, &variant->config,
> &variant->info, shader, &options, dump);
> LLVMDisposeTargetMachine(tm);
> diff --git a/src/amd/vulkan/radv_private.h
> b/src/amd/vulkan/radv_private.h
> index 0b8f50a..6c746b5 100644
> --- a/src/amd/vulkan/radv_private.h
> +++ b/src/amd/vulkan/radv_private.h
> @@ -485,6 +485,8 @@ struct radv_device {
>
> uint64_t debug_flags;
>
> + bool llvm_supports_spill;
> + uint32_t scratch_waves;
> /* MSAA sample locations.
> * The first index is the sample index.
> * The second index is the coordinate: X, Y. */
> @@ -726,6 +728,17 @@ struct radv_cmd_buffer {
> struct radv_cmd_buffer_upload upload;
>
> bool record_fail;
> +
> + /* for primary cmd buffers */
> + struct radeon_winsys_bo *scratch_bo;
> + struct radeon_winsys_bo *compute_scratch_bo;
> + uint32_t scratch_patch_idx;
> + uint32_t *cs_to_patch_scratch;
> +
> + /* for primary + secondary cmd buffers */
> + uint32_t scratch_needed_mask;
> + uint32_t scratch_size_needed;
> + uint32_t compute_scratch_size_needed;
> };
>
> struct radv_image;
> --
> 2.7.4
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
More information about the mesa-dev
mailing list