[Mesa-dev] [PATCH 3/4] radeonsi: add max waves / SIMD to shader stats (v2)

Tue Jan 26 09:54:33 PST 2016

On Tue, Jan 26, 2016 at 1:47 AM, Tom Stellard <tom at stellard.net> wrote:
> On Fri, Jan 22, 2016 at 03:18:12PM +0100, Marek Olšák wrote:
>> From: Marek Olšák <marek.olsak at amd.com>
>>
>> v2: account for LDS usage in PS
>>     the limit is per SIMD, not per CU
>> ---
>>  src/gallium/drivers/radeonsi/si_shader.c | 54 +++++++++++++++++++++++++++++---
>>  1 file changed, 49 insertions(+), 5 deletions(-)
>>
>> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
>> index 1bd617f..33c0db6 100644
>> --- a/src/gallium/drivers/radeonsi/si_shader.c
>> +++ b/src/gallium/drivers/radeonsi/si_shader.c
>> @@ -4001,22 +4001,65 @@ static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary
>>
>>  static void si_shader_dump_stats(struct si_screen *sscreen,
>>                                struct si_shader_config *conf,
>> +                              unsigned num_inputs,
>>                                unsigned code_size,
>>                                struct pipe_debug_callback *debug,
>>                                unsigned processor)
>>  {
>> +     unsigned lds_increment = sscreen->b.chip_class >= CIK ? 512 : 256;
>> +     unsigned lds_per_wave = 0;
>> +     unsigned max_simd_waves = 10;
>> +
>> +     /* Compute LDS usage for PS. */
>> +     if (processor == TGSI_PROCESSOR_FRAGMENT) {
>> +             /* The minimum usage per wave is (num_inputs * 36). The maximum
>> +              * usage is (num_inputs * 36 * 16).
>> +              * We can get anything in between and it varies between waves.
>> +              *
>> +              * Other stages don't know the size at compile time or don't
>> +              * allocate LDS per wave, but instead they do it per thread group.
>> +              */
>> +             lds_per_wave = conf->lds_size * lds_increment +
>> +                            align(num_inputs * 36, lds_increment);
>> +     }
>> +
>> +     /* Compute the per-SIMD wave counts. */
>> +     if (conf->num_sgprs) {
>> +             if (sscreen->b.chip_class >= VI)
>> +                     max_simd_waves = MIN2(max_simd_waves, 800 / conf->num_sgprs);
>> +             else
>> +                     max_simd_waves = MIN2(max_simd_waves, 512 / conf->num_sgprs);
>> +     }
>> +
>> +     if (conf->num_vgprs)
>> +             max_simd_waves = MIN2(max_simd_waves, 256 / conf->num_vgprs);
>> +
>> +     /* LDS is 64KB per CU (4 SIMDs), divided into 16KB blocks per SIMD
>> +      * that PS can use.
>> +      */
>> +     if (lds_per_wave)
>> +             max_simd_waves = MIN2(max_simd_waves, 16384 / lds_per_wave);
>> +
>>       if (r600_can_dump_shader(&sscreen->b, processor)) {
>>               fprintf(stderr, "*** SHADER STATS ***\n"
>> -                     "SGPRS: %d\nVGPRS: %d\nCode Size: %d bytes\nLDS: %d blocks\n"
>> -                     "Scratch: %d bytes per wave\n********************\n",
>> +                     "SGPRS: %d\n"
>> +                     "VGPRS: %d\n"
>> +                     "Code Size: %d bytes\n"
>> +                     "LDS: %d blocks\n"
>> +                     "Scratch: %d bytes per wave\n"
>> +                     "Max Waves: %d\n"
>> +                     "********************\n",
>>                       conf->num_sgprs, conf->num_vgprs, code_size,
>> -                     conf->lds_size, conf->scratch_bytes_per_wave);
>> +                     conf->lds_size, conf->scratch_bytes_per_wave,
>> +                     max_simd_waves);
>>       }
>>
>>       pipe_debug_message(debug, SHADER_INFO,
>> -                        "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d LDS: %d Scratch: %d",
>> +                        "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
>> +                        "LDS: %d Scratch: %d Max Waves: %d",
>>                          conf->num_sgprs, conf->num_vgprs, code_size,
>> -                        conf->lds_size, conf->scratch_bytes_per_wave);
>> +                        conf->lds_size, conf->scratch_bytes_per_wave,
>> +                        max_simd_waves);
>>  }
>>
>>  void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
>> @@ -4027,6 +4070,7 @@ void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
>>                       si_shader_dump_disassembly(&shader->binary, debug);
>>
>>       si_shader_dump_stats(sscreen, &shader->config,
>> +                            shader->selector->info.num_inputs,
>
> Clover is segfaulting here because shader->selector is NULL for compute
> shaders.

OK, I'll push a fix in a moment.

Marek