[Mesa-dev] [PATCH 12/20] radeonsi: rework compute scratch buffer
Marek Olšák
maraeo at gmail.com
Tue Apr 5 11:35:20 UTC 2016
On Mon, Apr 4, 2016 at 6:51 PM, Marek Olšák <maraeo at gmail.com> wrote:
> On Sat, Apr 2, 2016 at 3:10 PM, Bas Nieuwenhuizen
> <bas at basnieuwenhuizen.nl> wrote:
>> Instead of having a scratch buffer per program, have one per
>> context.
>>
>> Also removed the per-kernel wave count calculations; they
>> only helped if the total number of waves in the dispatch
>> was smaller than sctx->scratch_waves.
>>
>> Signed-off-by: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>
>> ---
>> src/gallium/drivers/radeonsi/si_compute.c | 136 ++++++++++--------------------
>> src/gallium/drivers/radeonsi/si_pipe.c | 1 +
>> src/gallium/drivers/radeonsi/si_pipe.h | 2 +
>> 3 files changed, 46 insertions(+), 93 deletions(-)
>>
>> diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
>> index 7320ace..e712b46 100644
>> --- a/src/gallium/drivers/radeonsi/si_compute.c
>> +++ b/src/gallium/drivers/radeonsi/si_compute.c
>> @@ -46,47 +46,6 @@ struct si_compute {
>> struct pipe_resource *global_buffers[MAX_GLOBAL_BUFFERS];
>> };
>>
>> -static void init_scratch_buffer(struct si_context *sctx, struct si_compute *program)
>> -{
>> - unsigned scratch_bytes = 0;
>> - uint64_t scratch_buffer_va;
>> - unsigned i;
>> -
>> - /* Compute the scratch buffer size using the maximum number of waves.
>> - * This way we don't need to recompute it for each kernel launch. */
>> - unsigned scratch_waves = 32 * sctx->screen->b.info.num_good_compute_units;
>> - for (i = 0; i < program->shader.binary.global_symbol_count; i++) {
>> - unsigned offset =
>> - program->shader.binary.global_symbol_offsets[i];
>> - unsigned scratch_bytes_needed;
>> -
>> - si_shader_binary_read_config(&program->shader.binary,
>> - &program->shader.config, offset);
>> - scratch_bytes_needed = program->shader.config.scratch_bytes_per_wave;
>> - scratch_bytes = MAX2(scratch_bytes, scratch_bytes_needed);
>> - }
>> -
>> - if (scratch_bytes == 0)
>> - return;
>> -
>> - program->shader.scratch_bo =
>> - si_resource_create_custom(sctx->b.b.screen,
>> - PIPE_USAGE_DEFAULT,
>> - scratch_bytes * scratch_waves);
>> -
>> - scratch_buffer_va = program->shader.scratch_bo->gpu_address;
>> -
>> - /* apply_scratch_relocs needs scratch_bytes_per_wave to be set
>> - * to the maximum bytes needed, so it can compute the stride
>> - * correctly.
>> - */
>> - program->shader.config.scratch_bytes_per_wave = scratch_bytes;
>> -
>> - /* Patch the shader with the scratch buffer address. */
>> - si_shader_apply_scratch_relocs(sctx,
>> - &program->shader, scratch_buffer_va);
>> -}
>> -
>> static void *si_create_compute_state(
>> struct pipe_context *ctx,
>> const struct pipe_compute_state *cso)
>> @@ -140,11 +99,6 @@ static void *si_create_compute_state(
>> code = cso->prog + sizeof(struct pipe_llvm_program_header);
>>
>> radeon_elf_read(code, header->num_bytes, &program->shader.binary);
>> - /* init_scratch_buffer patches the shader code with the scratch address,
>> - * so we need to call it before si_shader_binary_read() which uploads
>> - * the shader code to the GPU.
>> - */
>> - init_scratch_buffer(sctx, program);
>> si_shader_binary_read_config(&program->shader.binary,
>> &program->shader.config, 0);
>> }
>> @@ -189,43 +143,6 @@ static void si_set_global_binding(
>> }
>> }
>>
>> -/**
>> - * This function computes the value for R_00B860_COMPUTE_TMPRING_SIZE.WAVES
>> - * /p block_layout is the number of threads in each work group.
>> - * /p grid layout is the number of work groups.
>> - */
>> -static unsigned compute_num_waves_for_scratch(
>> - const struct radeon_info *info,
>> - const uint *block_layout,
>> - const uint *grid_layout)
>> -{
>> - unsigned num_sh = MAX2(info->max_sh_per_se, 1);
>> - unsigned num_se = MAX2(info->max_se, 1);
>> - unsigned num_blocks = 1;
>> - unsigned threads_per_block = 1;
>> - unsigned waves_per_block;
>> - unsigned waves_per_sh;
>> - unsigned waves;
>> - unsigned scratch_waves;
>> - unsigned i;
>> -
>> - for (i = 0; i < 3; i++) {
>> - threads_per_block *= block_layout[i];
>> - num_blocks *= grid_layout[i];
>> - }
>> -
>> - waves_per_block = align(threads_per_block, 64) / 64;
>> - waves = waves_per_block * num_blocks;
>> - waves_per_sh = align(waves, num_sh * num_se) / (num_sh * num_se);
>> - scratch_waves = waves_per_sh * num_sh * num_se;
>> -
>> - if (waves_per_block > waves_per_sh) {
>> - scratch_waves = waves_per_block * num_sh * num_se;
>> - }
>> -
>> - return scratch_waves;
>> -}
>> -
>> static void si_initialize_compute(struct si_context *sctx)
>> {
>> struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
>> @@ -259,6 +176,43 @@ static void si_initialize_compute(struct si_context *sctx)
>> sctx->cs_shader_state.initialized = true;
>> }
>>
>> +static bool si_setup_compute_scratch_buffer(struct si_context *sctx,
>> + struct si_shader *shader,
>> + struct si_shader_config *config) {
>
> "{" on the next line please.
With this fixed, patches 10, 12, 15-16, 18 are:
Reviewed-by: Marek Olšák <marek.olsak at amd.com>
Marek
More information about the mesa-dev mailing list