[Mesa-dev] [PATCH 4/4] radeonsi: Enable VGPR spilling for all shader types

Thu Jan 8 11:41:19 PST 2015

On Thu, Jan 08, 2015 at 07:51:58PM +0100, Marek Olšák wrote:
> On Thu, Jan 8, 2015 at 7:17 PM, Tom Stellard <tom at stellard.net> wrote:
> > On Thu, Jan 08, 2015 at 07:00:11PM +0100, Marek Olšák wrote:
> >> On Wed, Jan 7, 2015 at 10:03 PM, Tom Stellard <thomas.stellard at amd.com> wrote:
> >> > ---
> >> >  src/gallium/drivers/radeonsi/si_compute.c       | 40 ++------------
> >> >  src/gallium/drivers/radeonsi/si_shader.c        | 69 ++++++++++++++++++++++++-
> >> >  src/gallium/drivers/radeonsi/si_shader.h        |  7 ++-
> >> >  src/gallium/drivers/radeonsi/si_state_shaders.c | 36 +++++++++++--
> >> >  4 files changed, 108 insertions(+), 44 deletions(-)
> >> >
> >> > diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
> >> > index 37e5c42..e5f158d 100644
> >> > --- a/src/gallium/drivers/radeonsi/si_compute.c
> >> > +++ b/src/gallium/drivers/radeonsi/si_compute.c
> >> > @@ -42,12 +42,6 @@
> >> >  #define NUM_USER_SGPRS 4
> >> >  #endif
> >> >
> >> > -static const char *scratch_rsrc_dword0_symbol =
> >> > -       "SCRATCH_RSRC_DWORD0";
> >> > -
> >> > -static const char *scratch_rsrc_dword1_symbol =
> >> > -       "SCRATCH_RSRC_DWORD1";
> >> > -
> >> >  struct si_compute {
> >> >         struct si_context *ctx;
> >> >
> >> > @@ -183,35 +177,6 @@ static unsigned compute_num_waves_for_scratch(
> >> >         return scratch_waves;
> >> >  }
> >> >
> >> > -static void apply_scratch_relocs(const struct si_screen *sscreen,
> >> > -                       const struct radeon_shader_binary *binary,
> >> > -                       struct si_shader *shader, uint64_t scratch_va) {
> >> > -       unsigned i;
> >> > -       char *ptr;
> >> > -       uint32_t scratch_rsrc_dword0 = scratch_va & 0xffffffff;
> >> > -       uint32_t scratch_rsrc_dword1 =
> >> > -               S_008F04_BASE_ADDRESS_HI(scratch_va >> 32)
> >> > -               |  S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64);
> >> > -
> >> > -       if (!binary->reloc_count) {
> >> > -               return;
> >> > -       }
> >> > -
> >> > -       ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL,
> >> > -                                       PIPE_TRANSFER_READ_WRITE);
> >> > -       for (i = 0 ; i < binary->reloc_count; i++) {
> >> > -               const struct radeon_shader_reloc *reloc = &binary->relocs[i];
> >> > -               if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
> >> > -                       util_memcpy_cpu_to_le32(ptr + reloc->offset,
> >> > -                               &scratch_rsrc_dword0, 4);
> >> > -               } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
> >> > -                       util_memcpy_cpu_to_le32(ptr + reloc->offset,
> >> > -                               &scratch_rsrc_dword1, 4);
> >> > -               }
> >> > -       }
> >> > -       sscreen->b.ws->buffer_unmap(shader->bo->cs_buf);
> >> > -}
> >> > -
> >> >  static void si_launch_grid(
> >> >                 struct pipe_context *ctx,
> >> >                 const uint *block_layout, const uint *grid_layout,
> >> > @@ -256,7 +221,8 @@ static void si_launch_grid(
> >> >
> >> >  #if HAVE_LLVM >= 0x0306
> >> >         /* Read the config information */
> >> > -       si_shader_binary_read_config(&program->binary, &program->program, pc);
> >> > +       si_shader_binary_read_config(sctx->screen, &program->binary,
> >> > +                                       &program->program, pc);
> >> >  #endif
> >> >
> >> >         /* Upload the kernel arguments */
> >> > @@ -295,7 +261,7 @@ static void si_launch_grid(
> >> >                                 RADEON_PRIO_SHADER_RESOURCE_RW);
> >> >
> >> >                 /* Patch the shader with the scratch buffer address. */
> >> > -               apply_scratch_relocs(sctx->screen,
> >> > +               si_shader_apply_scratch_relocs(sctx->screen,
> >> >                         &program->binary, shader, scratch_buffer_va);
> >> >
> >> >         }
> >> > diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
> >> > index cf28860..d59e736 100644
> >> > --- a/src/gallium/drivers/radeonsi/si_shader.c
> >> > +++ b/src/gallium/drivers/radeonsi/si_shader.c
> >> > @@ -46,6 +46,12 @@
> >> >
> >> >  #include <errno.h>
> >> >
> >> > +static const char *scratch_rsrc_dword0_symbol =
> >> > +       "SCRATCH_RSRC_DWORD0";
> >> > +
> >> > +static const char *scratch_rsrc_dword1_symbol =
> >> > +       "SCRATCH_RSRC_DWORD1";
> >> > +
> >> >  struct si_shader_output_values
> >> >  {
> >> >         LLVMValueRef values[4];
> >> > @@ -2517,7 +2523,8 @@ static void preload_ring_buffers(struct si_shader_context *si_shader_ctx)
> >> >         }
> >> >  }
> >> >
> >> > -void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
> >> > +void si_shader_binary_read_config(const struct si_screen *sscreen,
> >> > +                               const struct radeon_shader_binary *binary,
> >> >                                 struct si_shader *shader,
> >> >                                 unsigned symbol_offset)
> >> >  {
> >> > @@ -2549,6 +2556,14 @@ void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
> >> >                 case R_0286CC_SPI_PS_INPUT_ENA:
> >> >                         shader->spi_ps_input_ena = value;
> >> >                         break;
> >> > +               case R_0286E8_SPI_TMPRING_SIZE:
> >> > +                       /* XXX: This is the maximum value allowed.  I'm not sure
> >> > +                        * how compute this for non-cs shaders.
> >> > +                        */
> >> > +                       shader->scratch_waves =
> >> > +                               32 * sscreen->b.info.max_compute_units;
> >> > +                       /* Fall-through */
> >> > +
> >> >                 case R_00B860_COMPUTE_TMPRING_SIZE:
> >> >                         /* WAVESIZE is in units of 256 dwords. */
> >> >                         shader->scratch_bytes_per_wave =
> >> > @@ -2562,6 +2577,36 @@ void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
> >> >         }
> >> >  }
> >> >
> >> > +void si_shader_apply_scratch_relocs(const struct si_screen *sscreen,
> >> > +                       const struct radeon_shader_binary *binary,
> >> > +                       struct si_shader *shader, uint64_t scratch_va) {
> >> > +       unsigned i;
> >> > +       char *ptr;
> >> > +       uint32_t scratch_rsrc_dword0 = scratch_va & 0xffffffff;
> >> > +       uint32_t scratch_rsrc_dword1 =
> >> > +               S_008F04_BASE_ADDRESS_HI(scratch_va >> 32)
> >> > +               |  S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64);
> >> > +
> >> > +       if (!binary->reloc_count) {
> >> > +               return;
> >> > +       }
> >> > +
> >> > +       ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL,
> >> > +                                       PIPE_TRANSFER_READ_WRITE);
> >> > +       for (i = 0 ; i < binary->reloc_count; i++) {
> >> > +               const struct radeon_shader_reloc *reloc = &binary->relocs[i];
> >> > +               if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
> >> > +                       util_memcpy_cpu_to_le32(ptr + reloc->offset,
> >> > +                               &scratch_rsrc_dword0, 4);
> >> > +               } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
> >> > +                       util_memcpy_cpu_to_le32(ptr + reloc->offset,
> >> > +                               &scratch_rsrc_dword1, 4);
> >> > +               }
> >> > +       }
> >> > +       sscreen->b.ws->buffer_unmap(shader->bo->cs_buf);
> >> > +}
> >> > +
> >> > +
> >> >  int si_shader_binary_read(struct si_screen *sscreen,
> >> >                         struct si_shader *shader,
> >> >                         const struct radeon_shader_binary *binary)
> >> > @@ -2582,7 +2627,7 @@ int si_shader_binary_read(struct si_screen *sscreen,
> >> >                 }
> >> >         }
> >> >
> >> > -       si_shader_binary_read_config(binary, shader, 0);
> >> > +       si_shader_binary_read_config(sscreen, binary, shader, 0);
> >> >
> >> >         /* copy new shader */
> >> >         code_size = binary->code_size + binary->rodata_size;
> >> > @@ -2601,6 +2646,7 @@ int si_shader_binary_read(struct si_screen *sscreen,
> >> >                 util_memcpy_cpu_to_le32(ptr, binary->rodata, binary->rodata_size);
> >> >         }
> >> >
> >> > +
> >>
> >> Unintentional new line?
> >>
> >> >         sscreen->b.ws->buffer_unmap(shader->bo->cs_buf);
> >> >
> >> >         return 0;
> >> > @@ -2621,6 +2667,25 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
> >> >                 return r;
> >> >         }
> >> >         r = si_shader_binary_read(sscreen, shader, &binary);
> >> > +
> >> > +       if (shader->scratch_bytes_per_wave > 0) {
> >> > +               uint64_t scratch_buffer_va;
> >> > +               unsigned scratch_bytes = shader->scratch_bytes_per_wave *
> >> > +                                       shader->scratch_waves;
> >> > +               /* It's possible for different shader variants to have
> >> > +                * different scratch buffer sizes.  The scratch buffer sizes
> >> > +                * won't vary by too much, so allocating double the scratch
> >> > +                * space need for the first variant should be enough for the
> >> > +                * rest if they need it.
> >> > +                */
> >> > +               shader->scratch_bo = (struct r600_resource*)
> >> > +                               si_resource_create_custom(&sscreen->b.b,
> >> > +                               PIPE_USAGE_DEFAULT, scratch_bytes * 2);
> >> > +               scratch_buffer_va = shader->scratch_bo->gpu_address;
> >> > +               si_shader_apply_scratch_relocs(sscreen, &binary, shader,
> >> > +                       scratch_buffer_va);
> >> > +       }
> >> > +
> >> >         FREE(binary.code);
> >> >         FREE(binary.config);
> >> >         FREE(binary.rodata);
> >> > diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
> >> > index 08e344a..c5b274e 100644
> >> > --- a/src/gallium/drivers/radeonsi/si_shader.h
> >> > +++ b/src/gallium/drivers/radeonsi/si_shader.h
> >> > @@ -147,6 +147,7 @@ struct si_shader {
> >> >         unsigned                        lds_size;
> >> >         unsigned                        spi_ps_input_ena;
> >> >         unsigned                        scratch_bytes_per_wave;
> >> > +       unsigned                        scratch_waves;
> >> >         unsigned                        spi_shader_col_format;
> >> >         unsigned                        spi_shader_z_format;
> >> >         unsigned                        db_shader_control;
> >> > @@ -185,7 +186,11 @@ void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader);
> >> >  unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
> >> >  int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
> >> >                 const struct radeon_shader_binary *binary);
> >> > -void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
> >> > +void si_shader_apply_scratch_relocs(const struct si_screen *sscreen,
> >> > +                       const struct radeon_shader_binary *binary,
> >> > +                       struct si_shader *shader, uint64_t scratch_va);
> >> > +void si_shader_binary_read_config(const struct si_screen *sscreen,
> >> > +                               const struct radeon_shader_binary *binary,
> >> >                                 struct si_shader *shader,
> >> >                                 unsigned symbol_offset);
> >> >
> >> > diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
> >> > index 817a990..b6a0f32 100644
> >> > --- a/src/gallium/drivers/radeonsi/si_state_shaders.c
> >> > +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
> >> > @@ -33,6 +33,22 @@
> >> >  #include "util/u_memory.h"
> >> >  #include "util/u_simple_shaders.h"
> >> >
> >> > +static void si_shader_setup_scratch_buffer(struct si_shader *shader,
> >> > +                                       struct si_pm4_state *pm4) {
> >> > +       if (shader->scratch_bytes_per_wave == 0) {
> >> > +               return;
> >> > +       }
> >> > +
> >> > +       assert(shader->scratch_bo);
> >> > +
> >> > +       si_pm4_set_reg(pm4, R_0286E8_SPI_TMPRING_SIZE,
> >> > +                       S_0286E8_WAVES(shader->scratch_waves)
> >> > +                       | S_0286E8_WAVESIZE(shader->scratch_bytes_per_wave >> 10));
> >>
> >> What happens if all VS, GS, and PS spill VGPRs? Will they share the
> >> scratch buffer? Will the scratch buffer be large enough for all of
> >> them? Will the WAVES and WAVESIZE parameters be the same for all of them?
> >>
> >
> > Each shader will have its own scratch buffer.  WAVES will always be the
> > same, but WAVESIZE depends on the number of VGPRs spilled.
> 
> But SPI_TMPRING_SIZE is shared by all stages, right? Will it not cause
> problems if the register value (especially WAVESIZE) is different for
> ES, GS, VS, and PS? Perhaps WAVESIZE should be set to a maximum of all
> enabled shaders, right?
> 
> Example sequence of state emission:
> - emit VS regs
> - emit SPI_TMPRING_SIZE for VS
> - emit PS regs
> - emit SPI_TMPRING_SIZE for PS
> 
> Assuming VS.SPI_TMPRING_SIZE != PS.SPI_TMPRING_SIZE.
> 

Ok, I see why this is a problem.

If we use the same WAVESIZE for all shader types, then this means that
we either need to use the same scratch buffer for all shader types or
have all the scratch buffers be the same size.

In a pipeline with VS and PS, do we know that all VS threads will
complete before any of the PS threads start?  If not, I think we will
need one scratch buffer per shader type.

No matter which of these solutions we choose, we will have to allocate
the scratch buffer when the shaders are bound to the state, because that
is the only time we will know how big they will need to be.

It seems like the best solution may be to have a global per context
scratch buffer which has its size increased every time a shader that
requires more scratch space is bound to a state.

What do you think?

-Tom