[Mesa-dev] [PATCH] radeonsi: Enable VGPR spilling for all shader types v3

Mon Jan 19 07:07:47 PST 2015

1) It's quite a lot of CPU work, so it shouldn't be done
unconditionally. Please see how si_pm4_state_changed is used to check
if a "shader variant" has been changed. I'd like to have an early exit
path for performance reasons, so:
a) If no shader state has been changed (see si_pm4_state_changed), return.
b) If all current shaders don't use the scratch buffer, return.

2) Please put it in si_state_shaders.c. It's the new place for all
non-trivial shader stuff.

3) Since the CP runs in parallel with the graphics engine, shaders can
be busy while WRITE_DATA is changing their code. (no good can come out
of that) Therefore, the partial flushes (= partial "WAIT_UNTIL") must
be emitted before the WRITE_DATA packet. The other thing is that the
code is missing checking if there is enough space in the CS for the
WRITE_DATA packets. In order to simplify everything, I recommend this
solution:

- Forget about partial flushes, forget about WRITE_DATA.
- If the shader relocs need updating, copy the current shader bo to a
new shader bo using the CPU (map the current shader bo with
read+unsynchronized flags).
- Update the relocs using the CPU in the new shader bo.
- Then replace the current shader bo with the new one and mark the
whole shader state as dirty.

4) Like Michel said, si_begin_new_cs should set some flag that
SPI_TMPRING_SIZE should be emitted and the buffer reloc should be
added.

Marek

On Fri, Jan 16, 2015 at 6:54 PM, Tom Stellard <thomas.stellard at amd.com> wrote:
> v2:
>   - Only emit write SPI_TMPRING_SIZE once per packet.
>   - Use context global scratch buffer.
>
> v3:
>   - Patch shaders using WRITE_DATA packet instead of map/unmap.
>   - Emit ICACHE_FLUSH, CS_PARTIAL_FLUSH, PS_PARTIAL_FLUSH, and
>     VS_PARTIAL_FLUSH when patching shaders.
> ---
>  src/gallium/drivers/radeonsi/si_compute.c       |  44 ++--------
>  src/gallium/drivers/radeonsi/si_pipe.c          |   6 ++
>  src/gallium/drivers/radeonsi/si_pipe.h          |   3 +
>  src/gallium/drivers/radeonsi/si_shader.c        |  68 ++++++++++++++-
>  src/gallium/drivers/radeonsi/si_shader.h        |   9 +-
>  src/gallium/drivers/radeonsi/si_state_draw.c    | 105 ++++++++++++++++++++++++
>  src/gallium/drivers/radeonsi/si_state_shaders.c |  12 ++-
>  7 files changed, 200 insertions(+), 47 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
> index 4427d3b..e407fb9 100644
> --- a/src/gallium/drivers/radeonsi/si_compute.c
> +++ b/src/gallium/drivers/radeonsi/si_compute.c
> @@ -42,12 +42,6 @@
>  #define NUM_USER_SGPRS 4
>  #endif
>
> -static const char *scratch_rsrc_dword0_symbol =
> -       "SCRATCH_RSRC_DWORD0";
> -
> -static const char *scratch_rsrc_dword1_symbol =
> -       "SCRATCH_RSRC_DWORD1";
> -
>  struct si_compute {
>         struct si_context *ctx;
>
> @@ -183,35 +177,6 @@ static unsigned compute_num_waves_for_scratch(
>         return scratch_waves;
>  }
>
> -static void apply_scratch_relocs(const struct si_screen *sscreen,
> -                       const struct radeon_shader_binary *binary,
> -                       struct si_shader *shader, uint64_t scratch_va) {
> -       unsigned i;
> -       char *ptr;
> -       uint32_t scratch_rsrc_dword0 = scratch_va & 0xffffffff;
> -       uint32_t scratch_rsrc_dword1 =
> -               S_008F04_BASE_ADDRESS_HI(scratch_va >> 32)
> -               |  S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64);
> -
> -       if (!binary->reloc_count) {
> -               return;
> -       }
> -
> -       ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL,
> -                                       PIPE_TRANSFER_READ_WRITE);
> -       for (i = 0 ; i < binary->reloc_count; i++) {
> -               const struct radeon_shader_reloc *reloc = &binary->relocs[i];
> -               if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
> -                       util_memcpy_cpu_to_le32(ptr + reloc->offset,
> -                               &scratch_rsrc_dword0, 4);
> -               } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
> -                       util_memcpy_cpu_to_le32(ptr + reloc->offset,
> -                               &scratch_rsrc_dword1, 4);
> -               }
> -       }
> -       sscreen->b.ws->buffer_unmap(shader->bo->cs_buf);
> -}
> -
>  static void si_launch_grid(
>                 struct pipe_context *ctx,
>                 const uint *block_layout, const uint *grid_layout,
> @@ -256,7 +221,8 @@ static void si_launch_grid(
>
>  #if HAVE_LLVM >= 0x0306
>         /* Read the config information */
> -       si_shader_binary_read_config(&program->binary, &program->program, pc);
> +       si_shader_binary_read_config(sctx->screen, &program->binary,
> +                                       &program->program, pc);
>  #endif
>
>         /* Upload the kernel arguments */
> @@ -295,8 +261,10 @@ static void si_launch_grid(
>                                 RADEON_PRIO_SHADER_RESOURCE_RW);
>
>                 /* Patch the shader with the scratch buffer address. */
> -               apply_scratch_relocs(sctx->screen,
> -                       &program->binary, shader, scratch_buffer_va);
> +               si_shader_apply_scratch_relocs(sctx,
> +                       shader, program->binary.relocs,
> +                       program->binary.reloc_count,
> +                       scratch_buffer_va);
>
>         }
>
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
> index e3f8fcf..8e89f1e 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.c
> +++ b/src/gallium/drivers/radeonsi/si_pipe.c
> @@ -46,6 +46,7 @@ static void si_destroy_context(struct pipe_context *context)
>         pipe_resource_reference(&sctx->gsvs_ring, NULL);
>         pipe_resource_reference(&sctx->null_const_buf.buffer, NULL);
>         r600_resource_reference(&sctx->border_color_table, NULL);
> +       r600_resource_reference(&sctx->scratch_buffer, NULL);
>
>         si_pm4_free_state(sctx, sctx->init_config, ~0);
>         si_pm4_delete_state(sctx, gs_rings, sctx->gs_rings);
> @@ -158,6 +159,11 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, void *
>                                      sctx->null_const_buf.buffer->width0, 0, false);
>         }
>
> +       /* XXX: This is the maximum value allowed.  I'm not sure how compute
> +        * this for non-cs shaders.
> +        */
> +       sctx->scratch_waves = 32 * sscreen->b.info.max_compute_units;
> +
>         return &sctx->b.b;
>  fail:
>         si_destroy_context(&sctx->b.b);
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
> index dfb1cd6..6051763 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.h
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> @@ -174,6 +174,7 @@ struct si_context {
>         struct si_buffer_resources      const_buffers[SI_NUM_SHADERS];
>         struct si_buffer_resources      rw_buffers[SI_NUM_SHADERS];
>         struct si_textures_info         samplers[SI_NUM_SHADERS];
> +       struct r600_resource            *scratch_buffer;
>         struct r600_resource            *border_color_table;
>         unsigned                        border_color_offset;
>
> @@ -221,6 +222,8 @@ struct si_context {
>         int                     last_prim;
>         int                     last_multi_vgt_param;
>         int                     last_rast_prim;
> +
> +       unsigned                scratch_waves;
>  };
>
>  /* si_blit.c */
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
> index 81cb2ef..fd5b92b 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -32,6 +32,7 @@
>  #include "gallivm/lp_bld_logic.h"
>  #include "gallivm/lp_bld_arit.h"
>  #include "gallivm/lp_bld_flow.h"
> +#include "radeon/r600_cs.h"
>  #include "radeon/radeon_llvm.h"
>  #include "radeon/radeon_elf_util.h"
>  #include "radeon/radeon_llvm_emit.h"
> @@ -46,6 +47,12 @@
>
>  #include <errno.h>
>
> +static const char *scratch_rsrc_dword0_symbol =
> +       "SCRATCH_RSRC_DWORD0";
> +
> +static const char *scratch_rsrc_dword1_symbol =
> +       "SCRATCH_RSRC_DWORD1";
> +
>  struct si_shader_output_values
>  {
>         LLVMValueRef values[4];
> @@ -2517,7 +2524,8 @@ static void preload_ring_buffers(struct si_shader_context *si_shader_ctx)
>         }
>  }
>
> -void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
> +void si_shader_binary_read_config(const struct si_screen *sscreen,
> +                               const struct radeon_shader_binary *binary,
>                                 struct si_shader *shader,
>                                 unsigned symbol_offset)
>  {
> @@ -2549,6 +2557,7 @@ void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
>                 case R_0286CC_SPI_PS_INPUT_ENA:
>                         shader->spi_ps_input_ena = value;
>                         break;
> +               case R_0286E8_SPI_TMPRING_SIZE:
>                 case R_00B860_COMPUTE_TMPRING_SIZE:
>                         /* WAVESIZE is in units of 256 dwords. */
>                         shader->scratch_bytes_per_wave =
> @@ -2562,6 +2571,54 @@ void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
>         }
>  }
>
> +void si_shader_apply_scratch_relocs(struct si_context *sctx,
> +                       struct si_shader *shader,
> +                       const struct radeon_shader_reloc *relocs,
> +                       unsigned num_relocs, uint64_t scratch_va)
> +{
> +       unsigned i;
> +       struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
> +       uint32_t scratch_rsrc_dword0 = scratch_va & 0xffffffff;
> +       uint32_t scratch_rsrc_dword1 =
> +               S_008F04_BASE_ADDRESS_HI(scratch_va >> 32)
> +               |  S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64);
> +
> +       if (num_relocs == 0) {
> +               return;
> +       }
> +
> +       assert(relocs);
> +
> +       r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, shader->bo,
> +                              RADEON_USAGE_WRITE, RADEON_PRIO_SHADER_DATA);
> +
> +       for (i = 0 ; i < num_relocs; i++) {
> +               const struct radeon_shader_reloc *reloc = &relocs[i];
> +               uint64_t addr = shader->bo->gpu_address + reloc->offset;
> +               uint32_t value;
> +               if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
> +                       value = util_cpu_to_le32(scratch_rsrc_dword0);
> +               } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
> +                       value = util_cpu_to_le32(scratch_rsrc_dword1);
> +               } else {
> +                       continue;
> +               }
> +
> +               radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
> +               radeon_emit(cs, PKT3_WRITE_DATA_DST_SEL(PKT3_WRITE_DATA_DST_SEL_MEM_SYNC) |
> +                                PKT3_WRITE_DATA_WR_CONFIRM |
> +                                PKT3_WRITE_DATA_ENGINE_SEL(PKT3_WRITE_DATA_ENGINE_SEL_ME));
> +               radeon_emit(cs, addr & 0xFFFFFFFFUL);
> +               radeon_emit(cs, (addr >> 32UL) & 0xFFFFFFFFUL);
> +               radeon_emit(cs, value);
> +       }
> +       sctx->b.flags |= SI_CONTEXT_INV_ICACHE |
> +                       SI_CONTEXT_CS_PARTIAL_FLUSH |
> +                       SI_CONTEXT_PS_PARTIAL_FLUSH |
> +                       SI_CONTEXT_VS_PARTIAL_FLUSH;
> +}
> +
> +
>  int si_shader_binary_read(struct si_screen *sscreen,
>                         struct si_shader *shader,
>                         const struct radeon_shader_binary *binary)
> @@ -2582,7 +2639,7 @@ int si_shader_binary_read(struct si_screen *sscreen,
>                 }
>         }
>
> -       si_shader_binary_read_config(binary, shader, 0);
> +       si_shader_binary_read_config(sscreen, binary, shader, 0);
>
>         /* copy new shader */
>         code_size = binary->code_size + binary->rodata_size;
> @@ -2621,7 +2678,10 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
>                 return r;
>         }
>         r = si_shader_binary_read(sscreen, shader, &binary);
> -       radeon_shader_binary_free_members(&binary, true);
> +
> +       shader->relocs = binary.relocs;
> +       shader->num_relocs = binary.reloc_count;
> +       radeon_shader_binary_free_members(&binary, false);
>         return r;
>  }
>
> @@ -2857,6 +2917,6 @@ void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader)
>         if (shader->gs_copy_shader)
>                 si_shader_destroy(ctx, shader->gs_copy_shader);
>
> +       radeon_shader_binary_free_relocs(shader->relocs, shader->num_relocs);
>         r600_resource_reference(&shader->bo, NULL);
> -       r600_resource_reference(&shader->scratch_bo, NULL);
>  }
> diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
> index 08e344a..f33f021 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.h
> +++ b/src/gallium/drivers/radeonsi/si_shader.h
> @@ -34,6 +34,7 @@
>  #include "si_state.h"
>
>  struct radeon_shader_binary;
> +struct radeon_shader_reloc;
>
>  #define SI_SGPR_RW_BUFFERS     0  /* rings (& stream-out, VS only) */
>  #define SI_SGPR_CONST          2
> @@ -142,6 +143,8 @@ struct si_shader {
>         struct si_pm4_state             *pm4;
>         struct r600_resource            *bo;
>         struct r600_resource            *scratch_bo;
> +       struct radeon_shader_reloc      *relocs;
> +       unsigned                        num_relocs;
>         unsigned                        num_sgprs;
>         unsigned                        num_vgprs;
>         unsigned                        lds_size;
> @@ -185,7 +188,11 @@ void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader);
>  unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
>  int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
>                 const struct radeon_shader_binary *binary);
> -void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
> +void si_shader_apply_scratch_relocs(struct si_context *sctx,
> +       struct si_shader *shader, const struct radeon_shader_reloc *relocs,
> +       unsigned num_relocs, uint64_t scratch_va);
> +void si_shader_binary_read_config(const struct si_screen *sscreen,
> +                               const struct radeon_shader_binary *binary,
>                                 struct si_shader *shader,
>                                 unsigned symbol_offset);
>
> diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
> index cd4880b..0cd0492 100644
> --- a/src/gallium/drivers/radeonsi/si_state_draw.c
> +++ b/src/gallium/drivers/radeonsi/si_state_draw.c
> @@ -173,6 +173,110 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx, unsigned mode
>         sctx->last_rast_prim = mode;
>  }
>
> +static void si_update_scratch_buffer(struct si_context *sctx,
> +                                   struct si_shader_selector *sel)
> +{
> +       struct si_shader *shader;
> +       unsigned scratch_bytes;
> +
> +       if (!sel) {
> +               return;
> +       }
> +
> +       shader = sel->current;
> +       scratch_bytes = shader->scratch_bytes_per_wave *
> +                                       sctx->scratch_waves;
> +
> +       /* This shader doesn't need a scratch buffer */
> +       if (scratch_bytes == 0)  {
> +               return;
> +       }
> +
> +       /* This shader is already configured to use the current
> +        * scratch buffer. */
> +       if (shader->scratch_bo == sctx->scratch_buffer) {
> +               return;
> +       }
> +
> +       assert(sctx->scratch_buffer);
> +
> +       si_shader_apply_scratch_relocs(sctx, shader,
> +               shader->relocs, shader->num_relocs,
> +               sctx->scratch_buffer->gpu_address);
> +
> +       shader->scratch_bo = sctx->scratch_buffer;
> +}
> +
> +static unsigned si_get_current_scratch_buffer_size(struct si_context *sctx)
> +{
> +       if (!sctx->scratch_buffer) {
> +               return 0;
> +       }
> +
> +       return sctx->scratch_buffer->b.b.width0;
> +}
> +
> +static unsigned si_get_scratch_buffer_size(struct si_context *sctx,
> +                                       struct si_shader_selector *sel)
> +{
> +       if (!sel) {
> +               return 0;
> +       }
> +
> +       return sel->current->scratch_bytes_per_wave *
> +               sctx->scratch_waves;
> +
> +}
> +
> +static unsigned si_get_max_scratch_size_needed(struct si_context *sctx)
> +{
> +
> +       return MAX3(si_get_scratch_buffer_size(sctx, sctx->ps_shader),
> +                       si_get_scratch_buffer_size(sctx, sctx->gs_shader),
> +                       si_get_scratch_buffer_size(sctx, sctx->vs_shader));
> +}
> +
> +static void si_emit_spi_tmpring_state(struct si_context *sctx)
> +{
> +       struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
> +       unsigned current_scratch_buffer_size =
> +               si_get_current_scratch_buffer_size(sctx);
> +       unsigned scratch_needed_size =
> +               si_get_max_scratch_size_needed(sctx);
> +       unsigned scratch_bytes_per_wave = scratch_needed_size /
> +               sctx->scratch_waves;
> +
> +       if (scratch_needed_size > current_scratch_buffer_size) {
> +               /* Create a bigger scratch buffer */
> +               struct r600_resource *new_scratch_buffer =
> +                               si_resource_create_custom(&sctx->screen->b.b,
> +                                PIPE_USAGE_DEFAULT, scratch_needed_size);
> +
> +               pipe_resource_reference(
> +                               (struct pipe_resource**)&sctx->scratch_buffer,
> +                               &new_scratch_buffer->b.b);
> +       }
> +
> +       /* Update the shaders, so they are using the latest scratch buffer. */
> +       si_update_scratch_buffer(sctx, sctx->ps_shader);
> +       si_update_scratch_buffer(sctx, sctx->gs_shader);
> +       si_update_scratch_buffer(sctx, sctx->vs_shader);
> +
> +       /* The LLVM shader backend should be reporting aligned scratch_sizes. */
> +       assert((scratch_needed_size & ~0x3FF) == scratch_needed_size &&
> +               "scratch size should already be aligned correctly.");
> +
> +       r600_write_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE,
> +                               S_0286E8_WAVES(sctx->scratch_waves) |
> +                               S_0286E8_WAVESIZE(scratch_bytes_per_wave >> 10));
> +
> +       if (scratch_needed_size > 0) {
> +               r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
> +                       sctx->scratch_buffer, RADEON_USAGE_READWRITE,
> +                       RADEON_PRIO_SHADER_RESOURCE_RW);
> +       }
> +}
> +
>  static void si_emit_draw_registers(struct si_context *sctx,
>                                    const struct pipe_draw_info *info,
>                                    const struct pipe_index_buffer *ib)
> @@ -581,6 +685,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
>                 }
>         }
>
> +       si_emit_spi_tmpring_state(sctx);
>         si_pm4_emit_dirty(sctx);
>         si_emit_rasterizer_prim_state(sctx, info->mode);
>         si_emit_draw_registers(sctx, info, &ib);
> diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
> index 817a990..c24573c 100644
> --- a/src/gallium/drivers/radeonsi/si_state_shaders.c
> +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
> @@ -67,7 +67,8 @@ static void si_shader_es(struct si_shader *shader)
>                        S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) |
>                        S_00B328_DX10_CLAMP(shader->dx10_clamp_mode));
>         si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES,
> -                      S_00B32C_USER_SGPR(num_user_sgprs));
> +                      S_00B32C_USER_SGPR(num_user_sgprs) |
> +                      S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
>  }
>
>  static void si_shader_gs(struct si_shader *shader)
> @@ -136,7 +137,8 @@ static void si_shader_gs(struct si_shader *shader)
>                        S_00B228_SGPRS((num_sgprs - 1) / 8) |
>                        S_00B228_DX10_CLAMP(shader->dx10_clamp_mode));
>         si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
> -                      S_00B22C_USER_SGPR(num_user_sgprs));
> +                      S_00B22C_USER_SGPR(num_user_sgprs) |
> +                      S_00B22C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
>  }
>
>  static void si_shader_vs(struct si_shader *shader)
> @@ -216,7 +218,8 @@ static void si_shader_vs(struct si_shader *shader)
>                        S_00B12C_SO_BASE1_EN(!!shader->selector->so.stride[1]) |
>                        S_00B12C_SO_BASE2_EN(!!shader->selector->so.stride[2]) |
>                        S_00B12C_SO_BASE3_EN(!!shader->selector->so.stride[3]) |
> -                      S_00B12C_SO_EN(!!shader->selector->so.num_outputs));
> +                      S_00B12C_SO_EN(!!shader->selector->so.num_outputs) |
> +                      S_00B12C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
>         if (window_space)
>                 si_pm4_set_reg(pm4, R_028818_PA_CL_VTE_CNTL,
>                                S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1));
> @@ -307,7 +310,8 @@ static void si_shader_ps(struct si_shader *shader)
>                        S_00B028_DX10_CLAMP(shader->dx10_clamp_mode));
>         si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS,
>                        S_00B02C_EXTRA_LDS_SIZE(shader->lds_size) |
> -                      S_00B02C_USER_SGPR(num_user_sgprs));
> +                      S_00B02C_USER_SGPR(num_user_sgprs) |
> +                      S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
>  }
>
>  static void si_shader_init_pm4_state(struct si_shader *shader)
> --
> 1.8.5.5
>