[Mesa-dev] [PATCH 4/4] radeonsi: Enable VGPR spilling for all shader types
Alex Deucher
alexdeucher at gmail.com
Thu Jan 8 12:50:31 PST 2015
On Thu, Jan 8, 2015 at 3:37 PM, Marek Olšák <maraeo at gmail.com> wrote:
> On Thu, Jan 8, 2015 at 8:41 PM, Tom Stellard <tom at stellard.net> wrote:
>> On Thu, Jan 08, 2015 at 07:51:58PM +0100, Marek Olšák wrote:
>>> On Thu, Jan 8, 2015 at 7:17 PM, Tom Stellard <tom at stellard.net> wrote:
>>> > On Thu, Jan 08, 2015 at 07:00:11PM +0100, Marek Olšák wrote:
>>> >> On Wed, Jan 7, 2015 at 10:03 PM, Tom Stellard <thomas.stellard at amd.com> wrote:
>>> >> > ---
>>> >> > src/gallium/drivers/radeonsi/si_compute.c | 40 ++------------
>>> >> > src/gallium/drivers/radeonsi/si_shader.c | 69 ++++++++++++++++++++++++-
>>> >> > src/gallium/drivers/radeonsi/si_shader.h | 7 ++-
>>> >> > src/gallium/drivers/radeonsi/si_state_shaders.c | 36 +++++++++++--
>>> >> > 4 files changed, 108 insertions(+), 44 deletions(-)
>>> >> >
>>> >> > diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
>>> >> > index 37e5c42..e5f158d 100644
>>> >> > --- a/src/gallium/drivers/radeonsi/si_compute.c
>>> >> > +++ b/src/gallium/drivers/radeonsi/si_compute.c
>>> >> > @@ -42,12 +42,6 @@
>>> >> > #define NUM_USER_SGPRS 4
>>> >> > #endif
>>> >> >
>>> >> > -static const char *scratch_rsrc_dword0_symbol =
>>> >> > - "SCRATCH_RSRC_DWORD0";
>>> >> > -
>>> >> > -static const char *scratch_rsrc_dword1_symbol =
>>> >> > - "SCRATCH_RSRC_DWORD1";
>>> >> > -
>>> >> > struct si_compute {
>>> >> > struct si_context *ctx;
>>> >> >
>>> >> > @@ -183,35 +177,6 @@ static unsigned compute_num_waves_for_scratch(
>>> >> > return scratch_waves;
>>> >> > }
>>> >> >
>>> >> > -static void apply_scratch_relocs(const struct si_screen *sscreen,
>>> >> > - const struct radeon_shader_binary *binary,
>>> >> > - struct si_shader *shader, uint64_t scratch_va) {
>>> >> > - unsigned i;
>>> >> > - char *ptr;
>>> >> > - uint32_t scratch_rsrc_dword0 = scratch_va & 0xffffffff;
>>> >> > - uint32_t scratch_rsrc_dword1 =
>>> >> > - S_008F04_BASE_ADDRESS_HI(scratch_va >> 32)
>>> >> > - | S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64);
>>> >> > -
>>> >> > - if (!binary->reloc_count) {
>>> >> > - return;
>>> >> > - }
>>> >> > -
>>> >> > - ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL,
>>> >> > - PIPE_TRANSFER_READ_WRITE);
>>> >> > - for (i = 0 ; i < binary->reloc_count; i++) {
>>> >> > - const struct radeon_shader_reloc *reloc = &binary->relocs[i];
>>> >> > - if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
>>> >> > - util_memcpy_cpu_to_le32(ptr + reloc->offset,
>>> >> > - &scratch_rsrc_dword0, 4);
>>> >> > - } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
>>> >> > - util_memcpy_cpu_to_le32(ptr + reloc->offset,
>>> >> > - &scratch_rsrc_dword1, 4);
>>> >> > - }
>>> >> > - }
>>> >> > - sscreen->b.ws->buffer_unmap(shader->bo->cs_buf);
>>> >> > -}
>>> >> > -
>>> >> > static void si_launch_grid(
>>> >> > struct pipe_context *ctx,
>>> >> > const uint *block_layout, const uint *grid_layout,
>>> >> > @@ -256,7 +221,8 @@ static void si_launch_grid(
>>> >> >
>>> >> > #if HAVE_LLVM >= 0x0306
>>> >> > /* Read the config information */
>>> >> > - si_shader_binary_read_config(&program->binary, &program->program, pc);
>>> >> > + si_shader_binary_read_config(sctx->screen, &program->binary,
>>> >> > + &program->program, pc);
>>> >> > #endif
>>> >> >
>>> >> > /* Upload the kernel arguments */
>>> >> > @@ -295,7 +261,7 @@ static void si_launch_grid(
>>> >> > RADEON_PRIO_SHADER_RESOURCE_RW);
>>> >> >
>>> >> > /* Patch the shader with the scratch buffer address. */
>>> >> > - apply_scratch_relocs(sctx->screen,
>>> >> > + si_shader_apply_scratch_relocs(sctx->screen,
>>> >> > &program->binary, shader, scratch_buffer_va);
>>> >> >
>>> >> > }
>>> >> > diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
>>> >> > index cf28860..d59e736 100644
>>> >> > --- a/src/gallium/drivers/radeonsi/si_shader.c
>>> >> > +++ b/src/gallium/drivers/radeonsi/si_shader.c
>>> >> > @@ -46,6 +46,12 @@
>>> >> >
>>> >> > #include <errno.h>
>>> >> >
>>> >> > +static const char *scratch_rsrc_dword0_symbol =
>>> >> > + "SCRATCH_RSRC_DWORD0";
>>> >> > +
>>> >> > +static const char *scratch_rsrc_dword1_symbol =
>>> >> > + "SCRATCH_RSRC_DWORD1";
>>> >> > +
>>> >> > struct si_shader_output_values
>>> >> > {
>>> >> > LLVMValueRef values[4];
>>> >> > @@ -2517,7 +2523,8 @@ static void preload_ring_buffers(struct si_shader_context *si_shader_ctx)
>>> >> > }
>>> >> > }
>>> >> >
>>> >> > -void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
>>> >> > +void si_shader_binary_read_config(const struct si_screen *sscreen,
>>> >> > + const struct radeon_shader_binary *binary,
>>> >> > struct si_shader *shader,
>>> >> > unsigned symbol_offset)
>>> >> > {
>>> >> > @@ -2549,6 +2556,14 @@ void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
>>> >> > case R_0286CC_SPI_PS_INPUT_ENA:
>>> >> > shader->spi_ps_input_ena = value;
>>> >> > break;
>>> >> > + case R_0286E8_SPI_TMPRING_SIZE:
>>> >> > + /* XXX: This is the maximum value allowed. I'm not sure
>>> >> > + * how compute this for non-cs shaders.
>>> >> > + */
>>> >> > + shader->scratch_waves =
>>> >> > + 32 * sscreen->b.info.max_compute_units;
>>> >> > + /* Fall-through */
>>> >> > +
>>> >> > case R_00B860_COMPUTE_TMPRING_SIZE:
>>> >> > /* WAVESIZE is in units of 256 dwords. */
>>> >> > shader->scratch_bytes_per_wave =
>>> >> > @@ -2562,6 +2577,36 @@ void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
>>> >> > }
>>> >> > }
>>> >> >
>>> >> > +void si_shader_apply_scratch_relocs(const struct si_screen *sscreen,
>>> >> > + const struct radeon_shader_binary *binary,
>>> >> > + struct si_shader *shader, uint64_t scratch_va) {
>>> >> > + unsigned i;
>>> >> > + char *ptr;
>>> >> > + uint32_t scratch_rsrc_dword0 = scratch_va & 0xffffffff;
>>> >> > + uint32_t scratch_rsrc_dword1 =
>>> >> > + S_008F04_BASE_ADDRESS_HI(scratch_va >> 32)
>>> >> > + | S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64);
>>> >> > +
>>> >> > + if (!binary->reloc_count) {
>>> >> > + return;
>>> >> > + }
>>> >> > +
>>> >> > + ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL,
>>> >> > + PIPE_TRANSFER_READ_WRITE);
>>> >> > + for (i = 0 ; i < binary->reloc_count; i++) {
>>> >> > + const struct radeon_shader_reloc *reloc = &binary->relocs[i];
>>> >> > + if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
>>> >> > + util_memcpy_cpu_to_le32(ptr + reloc->offset,
>>> >> > + &scratch_rsrc_dword0, 4);
>>> >> > + } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
>>> >> > + util_memcpy_cpu_to_le32(ptr + reloc->offset,
>>> >> > + &scratch_rsrc_dword1, 4);
>>> >> > + }
>>> >> > + }
>>> >> > + sscreen->b.ws->buffer_unmap(shader->bo->cs_buf);
>>> >> > +}
>>> >> > +
>>> >> > +
>>> >> > int si_shader_binary_read(struct si_screen *sscreen,
>>> >> > struct si_shader *shader,
>>> >> > const struct radeon_shader_binary *binary)
>>> >> > @@ -2582,7 +2627,7 @@ int si_shader_binary_read(struct si_screen *sscreen,
>>> >> > }
>>> >> > }
>>> >> >
>>> >> > - si_shader_binary_read_config(binary, shader, 0);
>>> >> > + si_shader_binary_read_config(sscreen, binary, shader, 0);
>>> >> >
>>> >> > /* copy new shader */
>>> >> > code_size = binary->code_size + binary->rodata_size;
>>> >> > @@ -2601,6 +2646,7 @@ int si_shader_binary_read(struct si_screen *sscreen,
>>> >> > util_memcpy_cpu_to_le32(ptr, binary->rodata, binary->rodata_size);
>>> >> > }
>>> >> >
>>> >> > +
>>> >>
>>> >> Unintentional new line?
>>> >>
>>> >> > sscreen->b.ws->buffer_unmap(shader->bo->cs_buf);
>>> >> >
>>> >> > return 0;
>>> >> > @@ -2621,6 +2667,25 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
>>> >> > return r;
>>> >> > }
>>> >> > r = si_shader_binary_read(sscreen, shader, &binary);
>>> >> > +
>>> >> > + if (shader->scratch_bytes_per_wave > 0) {
>>> >> > + uint64_t scratch_buffer_va;
>>> >> > + unsigned scratch_bytes = shader->scratch_bytes_per_wave *
>>> >> > + shader->scratch_waves;
>>> >> > + /* It's possible for different shader variants to have
>>> >> > + * different scratch buffer sizes. The scratch buffer sizes
>>> >> > + * won't vary by too much, so allocating double the scratch
>>> >> > + * space need for the first variant should be enough for the
>>> >> > + * rest if they need it.
>>> >> > + */
>>> >> > + shader->scratch_bo = (struct r600_resource*)
>>> >> > + si_resource_create_custom(&sscreen->b.b,
>>> >> > + PIPE_USAGE_DEFAULT, scratch_bytes * 2);
>>> >> > + scratch_buffer_va = shader->scratch_bo->gpu_address;
>>> >> > + si_shader_apply_scratch_relocs(sscreen, &binary, shader,
>>> >> > + scratch_buffer_va);
>>> >> > + }
>>> >> > +
>>> >> > FREE(binary.code);
>>> >> > FREE(binary.config);
>>> >> > FREE(binary.rodata);
>>> >> > diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
>>> >> > index 08e344a..c5b274e 100644
>>> >> > --- a/src/gallium/drivers/radeonsi/si_shader.h
>>> >> > +++ b/src/gallium/drivers/radeonsi/si_shader.h
>>> >> > @@ -147,6 +147,7 @@ struct si_shader {
>>> >> > unsigned lds_size;
>>> >> > unsigned spi_ps_input_ena;
>>> >> > unsigned scratch_bytes_per_wave;
>>> >> > + unsigned scratch_waves;
>>> >> > unsigned spi_shader_col_format;
>>> >> > unsigned spi_shader_z_format;
>>> >> > unsigned db_shader_control;
>>> >> > @@ -185,7 +186,11 @@ void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader);
>>> >> > unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
>>> >> > int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
>>> >> > const struct radeon_shader_binary *binary);
>>> >> > -void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
>>> >> > +void si_shader_apply_scratch_relocs(const struct si_screen *sscreen,
>>> >> > + const struct radeon_shader_binary *binary,
>>> >> > + struct si_shader *shader, uint64_t scratch_va);
>>> >> > +void si_shader_binary_read_config(const struct si_screen *sscreen,
>>> >> > + const struct radeon_shader_binary *binary,
>>> >> > struct si_shader *shader,
>>> >> > unsigned symbol_offset);
>>> >> >
>>> >> > diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
>>> >> > index 817a990..b6a0f32 100644
>>> >> > --- a/src/gallium/drivers/radeonsi/si_state_shaders.c
>>> >> > +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
>>> >> > @@ -33,6 +33,22 @@
>>> >> > #include "util/u_memory.h"
>>> >> > #include "util/u_simple_shaders.h"
>>> >> >
>>> >> > +static void si_shader_setup_scratch_buffer(struct si_shader *shader,
>>> >> > + struct si_pm4_state *pm4) {
>>> >> > + if (shader->scratch_bytes_per_wave == 0) {
>>> >> > + return;
>>> >> > + }
>>> >> > +
>>> >> > + assert(shader->scratch_bo);
>>> >> > +
>>> >> > + si_pm4_set_reg(pm4, R_0286E8_SPI_TMPRING_SIZE,
>>> >> > + S_0286E8_WAVES(shader->scratch_waves)
>>> >> > + | S_0286E8_WAVESIZE(shader->scratch_bytes_per_wave >> 10));
>>> >>
>>> >> What happens if all VS, GS, and PS spill VGPRs? Will they share the
>>> >> scratch buffer? Will the scratch buffer be large enough for all of
>>> >> them? Will the WAVES and WAVESIZE parameters be the same for all of them?
>>> >>
>>> >
>>> > Each shader will have its own scratch buffer. WAVES will always be the
>>> > same, but WAVESIZE depends on the number of VGPRs spilled.
>>>
>>> But SPI_TMPRING_SIZE is shared by all stages, right? Will it not cause
>>> problems if the register value (especially WAVESIZE) is different for
>>> ES, GS, VS, and PS? Perhaps WAVESIZE should be set to a maximum of all
>>> enabled shaders, right?
>>>
>>> Example sequence of state emission:
>>> - emit VS regs
>>> - emit SPI_TMPRING_SIZE for VS
>>> - emit PS regs
>>> - emit SPI_TMPRING_SIZE for PS
>>>
>>> Assuming VS.SPI_TMPRING_SIZE != PS.SPI_TMPRING_SIZE.
>>>
>>
>> Ok, I see why this is a problem.
>>
>> If we use the same WAVESIZE for all shader types, then this means that
>> we either need to use the same scratch buffer for all shader types or
>> have all the scratch buffers be the same size.
>>
>> In a pipeline with VS and PS, do we know that all VS threads will
>> complete before any of the PS threads start? If not, I think we will
>> need one scratch buffer per shader type.
>
> No idea. Do you happen to know this, Christian? Does the 8-context CP
> stuff have anything to do with this?
>
The CP context stuff is for pipelining state changes (e.g., so you can
have multiple draws in the pipeline at once). On r600 asics there was
a tmp ring available for each shader stage, on SI, there's just one
tmp ring for the whole gfx pipeline (SPI_TMPRING) and one tmp ring for
the compute pipeline (COMPUTE_TMPRING). SPI_TMPRING is a context
register (0x28000+), so it's multi-state pipelined just like the other
context registers. We can only set one buffer for all shader types
enabled in the current state.
Alex
>>
>> No matter which of these solutions we choose, we will have to allocate
>> the scratch buffer when the shaders are bound to the state, because that
>> is the only time we will know how big they will need to be.
>>
>> It seems like the best solution may be to have a global per context
>> scratch buffer which has its size increased every time a shader that
>> requires more scratch space is bound to a state.
>>
>> What do you think?
>
> This sounds very good to me.
>
> It looks like we may need to add a separate state for the register,
> because the shader pm4 states can't be changed after they are built.
> There are 3 options:
> 1) Add si_pm4_state to struct si_states. See e.g. how the "spi" state
> is done. This is not very efficient for non-CSO states though.
> 2) Add an r600_atom. See e.g. how the "clip_regs" state is done.
> 3) Add the register emission to draw_vbo. Don't worry, it will still
> be emitted conditionally. See e.g. how si_emit_rasterizer_prim_state
> is done.
>
> For 1-register states, (3) is probably the best.
>
> Marek
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
More information about the mesa-dev
mailing list