[Mesa-dev] [PATCH 12/20] radeonsi: rework compute scratch buffer

Mon Apr 4 16:51:28 UTC 2016

On Sat, Apr 2, 2016 at 3:10 PM, Bas Nieuwenhuizen
<bas at basnieuwenhuizen.nl> wrote:
> Instead of having a scratch buffer per program, have one per
> context.
>
> Also removed the per-kernel wave count calculations; they
> only helped if the total number of waves in the dispatch
> was smaller than sctx->scratch_waves.
>
> Signed-off-by: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>
> ---
>  src/gallium/drivers/radeonsi/si_compute.c | 136 ++++++++++--------------------
>  src/gallium/drivers/radeonsi/si_pipe.c    |   1 +
>  src/gallium/drivers/radeonsi/si_pipe.h    |   2 +
>  3 files changed, 46 insertions(+), 93 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
> index 7320ace..e712b46 100644
> --- a/src/gallium/drivers/radeonsi/si_compute.c
> +++ b/src/gallium/drivers/radeonsi/si_compute.c
> @@ -46,47 +46,6 @@ struct si_compute {
>         struct pipe_resource *global_buffers[MAX_GLOBAL_BUFFERS];
>  };
>
> -static void init_scratch_buffer(struct si_context *sctx, struct si_compute *program)
> -{
> -       unsigned scratch_bytes = 0;
> -       uint64_t scratch_buffer_va;
> -       unsigned i;
> -
> -       /* Compute the scratch buffer size using the maximum number of waves.
> -        * This way we don't need to recompute it for each kernel launch. */
> -       unsigned scratch_waves = 32 * sctx->screen->b.info.num_good_compute_units;
> -       for (i = 0; i < program->shader.binary.global_symbol_count; i++) {
> -               unsigned offset =
> -                               program->shader.binary.global_symbol_offsets[i];
> -               unsigned scratch_bytes_needed;
> -
> -               si_shader_binary_read_config(&program->shader.binary,
> -                                            &program->shader.config, offset);
> -               scratch_bytes_needed = program->shader.config.scratch_bytes_per_wave;
> -               scratch_bytes = MAX2(scratch_bytes, scratch_bytes_needed);
> -       }
> -
> -       if (scratch_bytes == 0)
> -               return;
> -
> -       program->shader.scratch_bo =
> -                               si_resource_create_custom(sctx->b.b.screen,
> -                               PIPE_USAGE_DEFAULT,
> -                               scratch_bytes * scratch_waves);
> -
> -       scratch_buffer_va = program->shader.scratch_bo->gpu_address;
> -
> -       /* apply_scratch_relocs needs scratch_bytes_per_wave to be set
> -        * to the maximum bytes needed, so it can compute the stride
> -        * correctly.
> -        */
> -       program->shader.config.scratch_bytes_per_wave = scratch_bytes;
> -
> -       /* Patch the shader with the scratch buffer address. */
> -       si_shader_apply_scratch_relocs(sctx,
> -                               &program->shader, scratch_buffer_va);
> -}
> -
>  static void *si_create_compute_state(
>         struct pipe_context *ctx,
>         const struct pipe_compute_state *cso)
> @@ -140,11 +99,6 @@ static void *si_create_compute_state(
>                 code = cso->prog + sizeof(struct pipe_llvm_program_header);
>
>                 radeon_elf_read(code, header->num_bytes, &program->shader.binary);
> -               /* init_scratch_buffer patches the shader code with the scratch address,
> -               * so we need to call it before si_shader_binary_read() which uploads
> -               * the shader code to the GPU.
> -               */
> -               init_scratch_buffer(sctx, program);
>                 si_shader_binary_read_config(&program->shader.binary,
>                              &program->shader.config, 0);
>         }
> @@ -189,43 +143,6 @@ static void si_set_global_binding(
>         }
>  }
>
> -/**
> - * This function computes the value for R_00B860_COMPUTE_TMPRING_SIZE.WAVES
> - * /p block_layout is the number of threads in each work group.
> - * /p grid layout is the number of work groups.
> - */
> -static unsigned compute_num_waves_for_scratch(
> -               const struct radeon_info *info,
> -               const uint *block_layout,
> -               const uint *grid_layout)
> -{
> -       unsigned num_sh = MAX2(info->max_sh_per_se, 1);
> -       unsigned num_se = MAX2(info->max_se, 1);
> -       unsigned num_blocks = 1;
> -       unsigned threads_per_block = 1;
> -       unsigned waves_per_block;
> -       unsigned waves_per_sh;
> -       unsigned waves;
> -       unsigned scratch_waves;
> -       unsigned i;
> -
> -       for (i = 0; i < 3; i++) {
> -               threads_per_block *= block_layout[i];
> -               num_blocks *= grid_layout[i];
> -       }
> -
> -       waves_per_block = align(threads_per_block, 64) / 64;
> -       waves = waves_per_block * num_blocks;
> -       waves_per_sh = align(waves, num_sh * num_se) / (num_sh * num_se);
> -       scratch_waves = waves_per_sh * num_sh * num_se;
> -
> -       if (waves_per_block > waves_per_sh) {
> -               scratch_waves = waves_per_block * num_sh * num_se;
> -       }
> -
> -       return scratch_waves;
> -}
> -
>  static void si_initialize_compute(struct si_context *sctx)
>  {
>         struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
> @@ -259,6 +176,43 @@ static void si_initialize_compute(struct si_context *sctx)
>         sctx->cs_shader_state.initialized = true;
>  }
>
> +static bool si_setup_compute_scratch_buffer(struct si_context *sctx,
> +                                            struct si_shader *shader,
> +                                            struct si_shader_config *config) {

"{" on the next line please.

Marek