[Mesa-dev] [PATCH 12/20] radeonsi: rework compute scratch buffer

Tue Apr 5 11:35:20 UTC 2016

On Mon, Apr 4, 2016 at 6:51 PM, Marek Olšák <maraeo at gmail.com> wrote:
> On Sat, Apr 2, 2016 at 3:10 PM, Bas Nieuwenhuizen
> <bas at basnieuwenhuizen.nl> wrote:
>> Instead of having a scratch buffer per program, have one per
>> context.
>>
>> Also remove the per-kernel wave count calculation. It only
>> helped when the total number of waves in the dispatch
>> was smaller than sctx->scratch_waves.
>>
>> Signed-off-by: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>
>> ---
>>  src/gallium/drivers/radeonsi/si_compute.c | 136 ++++++++++--------------------
>>  src/gallium/drivers/radeonsi/si_pipe.c    |   1 +
>>  src/gallium/drivers/radeonsi/si_pipe.h    |   2 +
>>  3 files changed, 46 insertions(+), 93 deletions(-)
>>
>> diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
>> index 7320ace..e712b46 100644
>> --- a/src/gallium/drivers/radeonsi/si_compute.c
>> +++ b/src/gallium/drivers/radeonsi/si_compute.c
>> @@ -46,47 +46,6 @@ struct si_compute {
>>         struct pipe_resource *global_buffers[MAX_GLOBAL_BUFFERS];
>>  };
>>
>> -static void init_scratch_buffer(struct si_context *sctx, struct si_compute *program)
>> -{
>> -       unsigned scratch_bytes = 0;
>> -       uint64_t scratch_buffer_va;
>> -       unsigned i;
>> -
>> -       /* Compute the scratch buffer size using the maximum number of waves.
>> -        * This way we don't need to recompute it for each kernel launch. */
>> -       unsigned scratch_waves = 32 * sctx->screen->b.info.num_good_compute_units;
>> -       for (i = 0; i < program->shader.binary.global_symbol_count; i++) {
>> -               unsigned offset =
>> -                               program->shader.binary.global_symbol_offsets[i];
>> -               unsigned scratch_bytes_needed;
>> -
>> -               si_shader_binary_read_config(&program->shader.binary,
>> -                                            &program->shader.config, offset);
>> -               scratch_bytes_needed = program->shader.config.scratch_bytes_per_wave;
>> -               scratch_bytes = MAX2(scratch_bytes, scratch_bytes_needed);
>> -       }
>> -
>> -       if (scratch_bytes == 0)
>> -               return;
>> -
>> -       program->shader.scratch_bo =
>> -                               si_resource_create_custom(sctx->b.b.screen,
>> -                               PIPE_USAGE_DEFAULT,
>> -                               scratch_bytes * scratch_waves);
>> -
>> -       scratch_buffer_va = program->shader.scratch_bo->gpu_address;
>> -
>> -       /* apply_scratch_relocs needs scratch_bytes_per_wave to be set
>> -        * to the maximum bytes needed, so it can compute the stride
>> -        * correctly.
>> -        */
>> -       program->shader.config.scratch_bytes_per_wave = scratch_bytes;
>> -
>> -       /* Patch the shader with the scratch buffer address. */
>> -       si_shader_apply_scratch_relocs(sctx,
>> -                               &program->shader, scratch_buffer_va);
>> -}
>> -
>>  static void *si_create_compute_state(
>>         struct pipe_context *ctx,
>>         const struct pipe_compute_state *cso)
>> @@ -140,11 +99,6 @@ static void *si_create_compute_state(
>>                 code = cso->prog + sizeof(struct pipe_llvm_program_header);
>>
>>                 radeon_elf_read(code, header->num_bytes, &program->shader.binary);
>> -               /* init_scratch_buffer patches the shader code with the scratch address,
>> -               * so we need to call it before si_shader_binary_read() which uploads
>> -               * the shader code to the GPU.
>> -               */
>> -               init_scratch_buffer(sctx, program);
>>                 si_shader_binary_read_config(&program->shader.binary,
>>                              &program->shader.config, 0);
>>         }
>> @@ -189,43 +143,6 @@ static void si_set_global_binding(
>>         }
>>  }
>>
>> -/**
>> - * This function computes the value for R_00B860_COMPUTE_TMPRING_SIZE.WAVES
>> - * /p block_layout is the number of threads in each work group.
>> - * /p grid layout is the number of work groups.
>> - */
>> -static unsigned compute_num_waves_for_scratch(
>> -               const struct radeon_info *info,
>> -               const uint *block_layout,
>> -               const uint *grid_layout)
>> -{
>> -       unsigned num_sh = MAX2(info->max_sh_per_se, 1);
>> -       unsigned num_se = MAX2(info->max_se, 1);
>> -       unsigned num_blocks = 1;
>> -       unsigned threads_per_block = 1;
>> -       unsigned waves_per_block;
>> -       unsigned waves_per_sh;
>> -       unsigned waves;
>> -       unsigned scratch_waves;
>> -       unsigned i;
>> -
>> -       for (i = 0; i < 3; i++) {
>> -               threads_per_block *= block_layout[i];
>> -               num_blocks *= grid_layout[i];
>> -       }
>> -
>> -       waves_per_block = align(threads_per_block, 64) / 64;
>> -       waves = waves_per_block * num_blocks;
>> -       waves_per_sh = align(waves, num_sh * num_se) / (num_sh * num_se);
>> -       scratch_waves = waves_per_sh * num_sh * num_se;
>> -
>> -       if (waves_per_block > waves_per_sh) {
>> -               scratch_waves = waves_per_block * num_sh * num_se;
>> -       }
>> -
>> -       return scratch_waves;
>> -}
>> -
>>  static void si_initialize_compute(struct si_context *sctx)
>>  {
>>         struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
>> @@ -259,6 +176,43 @@ static void si_initialize_compute(struct si_context *sctx)
>>         sctx->cs_shader_state.initialized = true;
>>  }
>>
>> +static bool si_setup_compute_scratch_buffer(struct si_context *sctx,
>> +                                            struct si_shader *shader,
>> +                                            struct si_shader_config *config) {
>
> "{" on the next line please.

With this fixed, patches 10, 12, 15-16, 18 are:

Reviewed-by: Marek Olšák <marek.olsak at amd.com>

Marek