[Mesa-dev] [PATCH 12/20] radeonsi: rework compute scratch buffer
Marek Olšák
maraeo at gmail.com
Mon Apr 4 16:51:28 UTC 2016
On Sat, Apr 2, 2016 at 3:10 PM, Bas Nieuwenhuizen
<bas at basnieuwenhuizen.nl> wrote:
> Instead of having a scratch buffer per program, have one per
> context.
>
> This also removes the per-kernel wave count calculations, since
> they only helped when the total number of waves in the dispatch
> was smaller than sctx->scratch_waves.
>
> Signed-off-by: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>
> ---
> src/gallium/drivers/radeonsi/si_compute.c | 136 ++++++++++--------------------
> src/gallium/drivers/radeonsi/si_pipe.c | 1 +
> src/gallium/drivers/radeonsi/si_pipe.h | 2 +
> 3 files changed, 46 insertions(+), 93 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
> index 7320ace..e712b46 100644
> --- a/src/gallium/drivers/radeonsi/si_compute.c
> +++ b/src/gallium/drivers/radeonsi/si_compute.c
> @@ -46,47 +46,6 @@ struct si_compute {
> struct pipe_resource *global_buffers[MAX_GLOBAL_BUFFERS];
> };
>
> -static void init_scratch_buffer(struct si_context *sctx, struct si_compute *program)
> -{
> - unsigned scratch_bytes = 0;
> - uint64_t scratch_buffer_va;
> - unsigned i;
> -
> - /* Compute the scratch buffer size using the maximum number of waves.
> - * This way we don't need to recompute it for each kernel launch. */
> - unsigned scratch_waves = 32 * sctx->screen->b.info.num_good_compute_units;
> - for (i = 0; i < program->shader.binary.global_symbol_count; i++) {
> - unsigned offset =
> - program->shader.binary.global_symbol_offsets[i];
> - unsigned scratch_bytes_needed;
> -
> - si_shader_binary_read_config(&program->shader.binary,
> - &program->shader.config, offset);
> - scratch_bytes_needed = program->shader.config.scratch_bytes_per_wave;
> - scratch_bytes = MAX2(scratch_bytes, scratch_bytes_needed);
> - }
> -
> - if (scratch_bytes == 0)
> - return;
> -
> - program->shader.scratch_bo =
> - si_resource_create_custom(sctx->b.b.screen,
> - PIPE_USAGE_DEFAULT,
> - scratch_bytes * scratch_waves);
> -
> - scratch_buffer_va = program->shader.scratch_bo->gpu_address;
> -
> - /* apply_scratch_relocs needs scratch_bytes_per_wave to be set
> - * to the maximum bytes needed, so it can compute the stride
> - * correctly.
> - */
> - program->shader.config.scratch_bytes_per_wave = scratch_bytes;
> -
> - /* Patch the shader with the scratch buffer address. */
> - si_shader_apply_scratch_relocs(sctx,
> - &program->shader, scratch_buffer_va);
> -}
> -
> static void *si_create_compute_state(
> struct pipe_context *ctx,
> const struct pipe_compute_state *cso)
> @@ -140,11 +99,6 @@ static void *si_create_compute_state(
> code = cso->prog + sizeof(struct pipe_llvm_program_header);
>
> radeon_elf_read(code, header->num_bytes, &program->shader.binary);
> - /* init_scratch_buffer patches the shader code with the scratch address,
> - * so we need to call it before si_shader_binary_read() which uploads
> - * the shader code to the GPU.
> - */
> - init_scratch_buffer(sctx, program);
> si_shader_binary_read_config(&program->shader.binary,
> &program->shader.config, 0);
> }
> @@ -189,43 +143,6 @@ static void si_set_global_binding(
> }
> }
>
> -/**
> - * This function computes the value for R_00B860_COMPUTE_TMPRING_SIZE.WAVES
> - * /p block_layout is the number of threads in each work group.
> - * /p grid layout is the number of work groups.
> - */
> -static unsigned compute_num_waves_for_scratch(
> - const struct radeon_info *info,
> - const uint *block_layout,
> - const uint *grid_layout)
> -{
> - unsigned num_sh = MAX2(info->max_sh_per_se, 1);
> - unsigned num_se = MAX2(info->max_se, 1);
> - unsigned num_blocks = 1;
> - unsigned threads_per_block = 1;
> - unsigned waves_per_block;
> - unsigned waves_per_sh;
> - unsigned waves;
> - unsigned scratch_waves;
> - unsigned i;
> -
> - for (i = 0; i < 3; i++) {
> - threads_per_block *= block_layout[i];
> - num_blocks *= grid_layout[i];
> - }
> -
> - waves_per_block = align(threads_per_block, 64) / 64;
> - waves = waves_per_block * num_blocks;
> - waves_per_sh = align(waves, num_sh * num_se) / (num_sh * num_se);
> - scratch_waves = waves_per_sh * num_sh * num_se;
> -
> - if (waves_per_block > waves_per_sh) {
> - scratch_waves = waves_per_block * num_sh * num_se;
> - }
> -
> - return scratch_waves;
> -}
> -
> static void si_initialize_compute(struct si_context *sctx)
> {
> struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
> @@ -259,6 +176,43 @@ static void si_initialize_compute(struct si_context *sctx)
> sctx->cs_shader_state.initialized = true;
> }
>
> +static bool si_setup_compute_scratch_buffer(struct si_context *sctx,
> + struct si_shader *shader,
> + struct si_shader_config *config) {
"{" on the next line please.
Marek
More information about the mesa-dev mailing list