[Mesa-dev] [PATCH 4/4] radeonsi: Enable VGPR spilling for all shader types

Thu Jan 8 10:00:11 PST 2015

On Wed, Jan 7, 2015 at 10:03 PM, Tom Stellard <thomas.stellard at amd.com> wrote:
> ---
>  src/gallium/drivers/radeonsi/si_compute.c       | 40 ++------------
>  src/gallium/drivers/radeonsi/si_shader.c        | 69 ++++++++++++++++++++++++-
>  src/gallium/drivers/radeonsi/si_shader.h        |  7 ++-
>  src/gallium/drivers/radeonsi/si_state_shaders.c | 36 +++++++++++--
>  4 files changed, 108 insertions(+), 44 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
> index 37e5c42..e5f158d 100644
> --- a/src/gallium/drivers/radeonsi/si_compute.c
> +++ b/src/gallium/drivers/radeonsi/si_compute.c
> @@ -42,12 +42,6 @@
>  #define NUM_USER_SGPRS 4
>  #endif
>
> -static const char *scratch_rsrc_dword0_symbol =
> -       "SCRATCH_RSRC_DWORD0";
> -
> -static const char *scratch_rsrc_dword1_symbol =
> -       "SCRATCH_RSRC_DWORD1";
> -
>  struct si_compute {
>         struct si_context *ctx;
>
> @@ -183,35 +177,6 @@ static unsigned compute_num_waves_for_scratch(
>         return scratch_waves;
>  }
>
> -static void apply_scratch_relocs(const struct si_screen *sscreen,
> -                       const struct radeon_shader_binary *binary,
> -                       struct si_shader *shader, uint64_t scratch_va) {
> -       unsigned i;
> -       char *ptr;
> -       uint32_t scratch_rsrc_dword0 = scratch_va & 0xffffffff;
> -       uint32_t scratch_rsrc_dword1 =
> -               S_008F04_BASE_ADDRESS_HI(scratch_va >> 32)
> -               |  S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64);
> -
> -       if (!binary->reloc_count) {
> -               return;
> -       }
> -
> -       ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL,
> -                                       PIPE_TRANSFER_READ_WRITE);
> -       for (i = 0 ; i < binary->reloc_count; i++) {
> -               const struct radeon_shader_reloc *reloc = &binary->relocs[i];
> -               if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
> -                       util_memcpy_cpu_to_le32(ptr + reloc->offset,
> -                               &scratch_rsrc_dword0, 4);
> -               } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
> -                       util_memcpy_cpu_to_le32(ptr + reloc->offset,
> -                               &scratch_rsrc_dword1, 4);
> -               }
> -       }
> -       sscreen->b.ws->buffer_unmap(shader->bo->cs_buf);
> -}
> -
>  static void si_launch_grid(
>                 struct pipe_context *ctx,
>                 const uint *block_layout, const uint *grid_layout,
> @@ -256,7 +221,8 @@ static void si_launch_grid(
>
>  #if HAVE_LLVM >= 0x0306
>         /* Read the config information */
> -       si_shader_binary_read_config(&program->binary, &program->program, pc);
> +       si_shader_binary_read_config(sctx->screen, &program->binary,
> +                                       &program->program, pc);
>  #endif
>
>         /* Upload the kernel arguments */
> @@ -295,7 +261,7 @@ static void si_launch_grid(
>                                 RADEON_PRIO_SHADER_RESOURCE_RW);
>
>                 /* Patch the shader with the scratch buffer address. */
> -               apply_scratch_relocs(sctx->screen,
> +               si_shader_apply_scratch_relocs(sctx->screen,
>                         &program->binary, shader, scratch_buffer_va);
>
>         }
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
> index cf28860..d59e736 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -46,6 +46,12 @@
>
>  #include <errno.h>
>
> +static const char *scratch_rsrc_dword0_symbol =
> +       "SCRATCH_RSRC_DWORD0";
> +
> +static const char *scratch_rsrc_dword1_symbol =
> +       "SCRATCH_RSRC_DWORD1";
> +
>  struct si_shader_output_values
>  {
>         LLVMValueRef values[4];
> @@ -2517,7 +2523,8 @@ static void preload_ring_buffers(struct si_shader_context *si_shader_ctx)
>         }
>  }
>
> -void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
> +void si_shader_binary_read_config(const struct si_screen *sscreen,
> +                               const struct radeon_shader_binary *binary,
>                                 struct si_shader *shader,
>                                 unsigned symbol_offset)
>  {
> @@ -2549,6 +2556,14 @@ void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
>                 case R_0286CC_SPI_PS_INPUT_ENA:
>                         shader->spi_ps_input_ena = value;
>                         break;
> +               case R_0286E8_SPI_TMPRING_SIZE:
> +                       /* XXX: This is the maximum value allowed.  I'm not sure
> +                        * how to compute this for non-cs shaders.
> +                        */
> +                       shader->scratch_waves =
> +                               32 * sscreen->b.info.max_compute_units;
> +                       /* Fall-through */
> +
>                 case R_00B860_COMPUTE_TMPRING_SIZE:
>                         /* WAVESIZE is in units of 256 dwords. */
>                         shader->scratch_bytes_per_wave =
> @@ -2562,6 +2577,36 @@ void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
>         }
>  }
>
> +void si_shader_apply_scratch_relocs(const struct si_screen *sscreen,
> +                       const struct radeon_shader_binary *binary,
> +                       struct si_shader *shader, uint64_t scratch_va) {
> +       unsigned i;
> +       char *ptr;
> +       uint32_t scratch_rsrc_dword0 = scratch_va & 0xffffffff;
> +       uint32_t scratch_rsrc_dword1 =
> +               S_008F04_BASE_ADDRESS_HI(scratch_va >> 32)
> +               |  S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64);
> +
> +       if (!binary->reloc_count) {
> +               return;
> +       }
> +
> +       ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL,
> +                                       PIPE_TRANSFER_READ_WRITE);
> +       for (i = 0 ; i < binary->reloc_count; i++) {
> +               const struct radeon_shader_reloc *reloc = &binary->relocs[i];
> +               if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
> +                       util_memcpy_cpu_to_le32(ptr + reloc->offset,
> +                               &scratch_rsrc_dword0, 4);
> +               } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
> +                       util_memcpy_cpu_to_le32(ptr + reloc->offset,
> +                               &scratch_rsrc_dword1, 4);
> +               }
> +       }
> +       sscreen->b.ws->buffer_unmap(shader->bo->cs_buf);
> +}
> +
> +
>  int si_shader_binary_read(struct si_screen *sscreen,
>                         struct si_shader *shader,
>                         const struct radeon_shader_binary *binary)
> @@ -2582,7 +2627,7 @@ int si_shader_binary_read(struct si_screen *sscreen,
>                 }
>         }
>
> -       si_shader_binary_read_config(binary, shader, 0);
> +       si_shader_binary_read_config(sscreen, binary, shader, 0);
>
>         /* copy new shader */
>         code_size = binary->code_size + binary->rodata_size;
> @@ -2601,6 +2646,7 @@ int si_shader_binary_read(struct si_screen *sscreen,
>                 util_memcpy_cpu_to_le32(ptr, binary->rodata, binary->rodata_size);
>         }
>
> +

Unintentional new line?

>         sscreen->b.ws->buffer_unmap(shader->bo->cs_buf);
>
>         return 0;
> @@ -2621,6 +2667,25 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
>                 return r;
>         }
>         r = si_shader_binary_read(sscreen, shader, &binary);
> +
> +       if (shader->scratch_bytes_per_wave > 0) {
> +               uint64_t scratch_buffer_va;
> +               unsigned scratch_bytes = shader->scratch_bytes_per_wave *
> +                                       shader->scratch_waves;
> +               /* It's possible for different shader variants to have
> +                * different scratch buffer sizes.  The scratch buffer sizes
> +                * won't vary by too much, so allocating double the scratch
> +                * space needed for the first variant should be enough for the
> +                * rest if they need it.
> +                */
> +               shader->scratch_bo = (struct r600_resource*)
> +                               si_resource_create_custom(&sscreen->b.b,
> +                               PIPE_USAGE_DEFAULT, scratch_bytes * 2);
> +               scratch_buffer_va = shader->scratch_bo->gpu_address;
> +               si_shader_apply_scratch_relocs(sscreen, &binary, shader,
> +                       scratch_buffer_va);
> +       }
> +
>         FREE(binary.code);
>         FREE(binary.config);
>         FREE(binary.rodata);
> diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
> index 08e344a..c5b274e 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.h
> +++ b/src/gallium/drivers/radeonsi/si_shader.h
> @@ -147,6 +147,7 @@ struct si_shader {
>         unsigned                        lds_size;
>         unsigned                        spi_ps_input_ena;
>         unsigned                        scratch_bytes_per_wave;
> +       unsigned                        scratch_waves;
>         unsigned                        spi_shader_col_format;
>         unsigned                        spi_shader_z_format;
>         unsigned                        db_shader_control;
> @@ -185,7 +186,11 @@ void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader);
>  unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
>  int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
>                 const struct radeon_shader_binary *binary);
> -void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
> +void si_shader_apply_scratch_relocs(const struct si_screen *sscreen,
> +                       const struct radeon_shader_binary *binary,
> +                       struct si_shader *shader, uint64_t scratch_va);
> +void si_shader_binary_read_config(const struct si_screen *sscreen,
> +                               const struct radeon_shader_binary *binary,
>                                 struct si_shader *shader,
>                                 unsigned symbol_offset);
>
> diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
> index 817a990..b6a0f32 100644
> --- a/src/gallium/drivers/radeonsi/si_state_shaders.c
> +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
> @@ -33,6 +33,22 @@
>  #include "util/u_memory.h"
>  #include "util/u_simple_shaders.h"
>
> +static void si_shader_setup_scratch_buffer(struct si_shader *shader,
> +                                       struct si_pm4_state *pm4) {
> +       if (shader->scratch_bytes_per_wave == 0) {
> +               return;
> +       }
> +
> +       assert(shader->scratch_bo);
> +
> +       si_pm4_set_reg(pm4, R_0286E8_SPI_TMPRING_SIZE,
> +                       S_0286E8_WAVES(shader->scratch_waves)
> +                       | S_0286E8_WAVESIZE(shader->scratch_bytes_per_wave >> 10));

What happens if the VS, GS, and PS all spill VGPRs? Will they share the
scratch buffer? Will the scratch buffer be large enough for all of
them? Will the WAVES and WAVESIZE parameters be the same for all of them?

Marek