[Mesa-dev] [PATCH] radv/gfx10: add Wave32 support for compute shaders

Tue Jul 30 17:17:34 UTC 2019

r-b

On Tue, Jul 30, 2019 at 6:29 PM Samuel Pitoiset
<samuel.pitoiset at gmail.com> wrote:
>
> It can be enabled with RADV_PERFTEST=cswave32.
>
> Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
> ---
>  src/amd/vulkan/radv_debug.h       |  1 +
>  src/amd/vulkan/radv_device.c      | 12 +++++++++++-
>  src/amd/vulkan/radv_nir_to_llvm.c | 14 +++++++++++++-
>  src/amd/vulkan/radv_pipeline.c    |  3 ++-
>  src/amd/vulkan/radv_private.h     |  3 +++
>  src/amd/vulkan/radv_shader.c      | 25 ++++++++++++++++++++++---
>  src/amd/vulkan/radv_shader.h      |  1 +
>  7 files changed, 53 insertions(+), 6 deletions(-)
>
> diff --git a/src/amd/vulkan/radv_debug.h b/src/amd/vulkan/radv_debug.h
> index 723fabda57f..6414e882676 100644
> --- a/src/amd/vulkan/radv_debug.h
> +++ b/src/amd/vulkan/radv_debug.h
> @@ -64,6 +64,7 @@ enum {
>         RADV_PERFTEST_BO_LIST        =  0x20,
>         RADV_PERFTEST_SHADER_BALLOT  =  0x40,
>         RADV_PERFTEST_TC_COMPAT_CMASK = 0x80,
> +       RADV_PERFTEST_CS_WAVE_32     = 0x100,
>  };
>
>  bool
> diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
> index 65e3ccf91ad..29be192443a 100644
> --- a/src/amd/vulkan/radv_device.c
> +++ b/src/amd/vulkan/radv_device.c
> @@ -383,6 +383,14 @@ radv_physical_device_init(struct radv_physical_device *device,
>
>         device->use_shader_ballot = device->instance->perftest_flags & RADV_PERFTEST_SHADER_BALLOT;
>
> +       /* Determine the number of threads per wave for all stages. */
> +       device->cs_wave_size = 64;
> +
> +       if (device->rad_info.chip_class >= GFX10) {
> +               if (device->instance->perftest_flags & RADV_PERFTEST_CS_WAVE_32)
> +                       device->cs_wave_size = 32;
> +       }
> +
>         radv_physical_device_init_mem_types(device);
>         radv_fill_device_extension_table(device, &device->supported_extensions);
>
> @@ -494,6 +502,7 @@ static const struct debug_control radv_perftest_options[] = {
>         {"bolist", RADV_PERFTEST_BO_LIST},
>         {"shader_ballot", RADV_PERFTEST_SHADER_BALLOT},
>         {"tccompatcmask", RADV_PERFTEST_TC_COMPAT_CMASK},
> +       {"cswave32", RADV_PERFTEST_CS_WAVE_32},
>         {NULL, 0}
>  };
>
> @@ -1930,7 +1939,8 @@ VkResult radv_CreateDevice(
>         device->scratch_waves = MAX2(32 * physical_device->rad_info.num_good_compute_units,
>                                      max_threads_per_block / 64);
>
> -       device->dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1);
> +       device->dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1) |
> +                                    S_00B800_CS_W32_EN(device->physical_device->cs_wave_size == 32);
>
>         if (device->physical_device->rad_info.chip_class >= GFX7) {
>                 /* If the KMD allows it (there is a KMD hw register for it),
> diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c
> index 020c6d17771..feaab8f6370 100644
> --- a/src/amd/vulkan/radv_nir_to_llvm.c
> +++ b/src/amd/vulkan/radv_nir_to_llvm.c
> @@ -4317,6 +4317,15 @@ static void declare_esgs_ring(struct radv_shader_context *ctx)
>         LLVMSetAlignment(ctx->esgs_ring, 64 * 1024);
>  }
>
> +static uint8_t
> +radv_nir_shader_wave_size(struct nir_shader *const *shaders, int shader_count,
> +                         const struct radv_nir_compiler_options *options)
> +{
> +       if (shaders[0]->info.stage == MESA_SHADER_COMPUTE)
> +               return options->cs_wave_size;
> +       return 64;
> +}
> +
>  static
>  LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
>                                         struct nir_shader *const *shaders,
> @@ -4333,8 +4342,11 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
>                 options->unsafe_math ? AC_FLOAT_MODE_UNSAFE_FP_MATH :
>                                        AC_FLOAT_MODE_DEFAULT;
>
> +       uint8_t wave_size = radv_nir_shader_wave_size(shaders,
> +                                                     shader_count, options);
> +
>         ac_llvm_context_init(&ctx.ac, ac_llvm, options->chip_class,
> -                            options->family, float_mode, 64);
> +                            options->family, float_mode, wave_size);
>         ctx.context = ctx.ac.context;
>
>         radv_nir_shader_info_init(&shader_info->info);
> diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
> index 583b600dfdd..6b8b7bbe25a 100644
> --- a/src/amd/vulkan/radv_pipeline.c
> +++ b/src/amd/vulkan/radv_pipeline.c
> @@ -4648,7 +4648,8 @@ radv_compute_generate_pm4(struct radv_pipeline *pipeline)
>         threads_per_threadgroup = compute_shader->info.cs.block_size[0] *
>                                   compute_shader->info.cs.block_size[1] *
>                                   compute_shader->info.cs.block_size[2];
> -       waves_per_threadgroup = DIV_ROUND_UP(threads_per_threadgroup, 64);
> +       waves_per_threadgroup = DIV_ROUND_UP(threads_per_threadgroup,
> +                                            device->physical_device->cs_wave_size);
>
>         if (device->physical_device->rad_info.chip_class >= GFX10 &&
>             waves_per_threadgroup == 1)
> diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
> index 466f0288399..559cb3b336d 100644
> --- a/src/amd/vulkan/radv_private.h
> +++ b/src/amd/vulkan/radv_private.h
> @@ -334,6 +334,9 @@ struct radv_physical_device {
>         /* Whether DISABLE_CONSTANT_ENCODE_REG is supported. */
>         bool has_dcc_constant_encode;
>
> +       /* Number of threads per wave. */
> +       uint8_t cs_wave_size;
> +
>         /* This is the drivers on-disk cache used as a fallback as opposed to
>          * the pipeline cache defined by apps.
>          */
> diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
> index 0c3e375ee5e..0d2e5ae836a 100644
> --- a/src/amd/vulkan/radv_shader.c
> +++ b/src/amd/vulkan/radv_shader.c
> @@ -623,6 +623,16 @@ radv_get_shader_binary_size(size_t code_size)
>         return code_size + DEBUGGER_NUM_MARKERS * 4;
>  }
>
> +static uint8_t
> +radv_get_shader_wave_size(const struct radv_physical_device *pdevice,
> +                         gl_shader_stage stage)
> +{
> +       if (stage == MESA_SHADER_COMPUTE)
> +               return pdevice->cs_wave_size;
> +
> +       return 64;
> +}
> +
>  static void radv_postprocess_config(const struct radv_physical_device *pdevice,
>                                     const struct ac_shader_config *config_in,
>                                     const struct radv_shader_variant_info *info,
> @@ -630,6 +640,7 @@ static void radv_postprocess_config(const struct radv_physical_device *pdevice,
>                                     struct ac_shader_config *config_out)
>  {
>         bool scratch_enabled = config_in->scratch_bytes_per_wave > 0;
> +       uint8_t wave_size = radv_get_shader_wave_size(pdevice, stage);
>         unsigned vgpr_comp_cnt = 0;
>         unsigned num_input_vgprs = info->num_input_vgprs;
>
> @@ -699,7 +710,8 @@ static void radv_postprocess_config(const struct radv_physical_device *pdevice,
>                             S_00B12C_SO_BASE3_EN(!!info->info.so.strides[3]) |
>                             S_00B12C_SO_EN(!!info->info.so.num_outputs);
>
> -       config_out->rsrc1 = S_00B848_VGPRS((num_vgprs - 1) / 4) |
> +       config_out->rsrc1 = S_00B848_VGPRS((num_vgprs - 1) /
> +                                          (wave_size == 32 ? 8 : 4)) |
>                             S_00B848_DX10_CLAMP(1) |
>                             S_00B848_FLOAT_MODE(config_out->float_mode);
>
> @@ -965,10 +977,15 @@ radv_shader_variant_create(struct radv_device *device,
>                         if (binary->variant_info.is_ngg)
>                                 sym->size -= 32;
>                 }
> +
> +               uint8_t wave_size =
> +                       radv_get_shader_wave_size(device->physical_device,
> +                                                 binary->stage);
> +
>                 struct ac_rtld_open_info open_info = {
>                         .info = &device->physical_device->rad_info,
>                         .shader_type = binary->stage,
> -                       .wave_size = 64,
> +                       .wave_size = wave_size,
>                         .num_parts = 1,
>                         .elf_ptrs = &elf_data,
>                         .elf_sizes = &elf_size,
> @@ -1080,6 +1097,7 @@ shader_variant_compile(struct radv_device *device,
>         options->check_ir = device->instance->debug_flags & RADV_DEBUG_CHECKIR;
>         options->tess_offchip_block_dw_size = device->tess_offchip_block_dw_size;
>         options->address32_hi = device->physical_device->rad_info.address32_hi;
> +       options->cs_wave_size = device->physical_device->cs_wave_size;
>
>         if (options->supports_spill)
>                 tm_options |= AC_TM_SUPPORTS_SPILL;
> @@ -1229,6 +1247,7 @@ generate_shader_stats(struct radv_device *device,
>  {
>         enum chip_class chip_class = device->physical_device->rad_info.chip_class;
>         unsigned lds_increment = chip_class >= GFX7 ? 512 : 256;
> +       uint8_t wave_size = radv_get_shader_wave_size(device->physical_device, stage);
>         struct ac_shader_config *conf;
>         unsigned max_simd_waves;
>         unsigned lds_per_wave = 0;
> @@ -1245,7 +1264,7 @@ generate_shader_stats(struct radv_device *device,
>                 unsigned max_workgroup_size =
>                         radv_nir_get_max_workgroup_size(chip_class, stage, variant->nir);
>                 lds_per_wave = (conf->lds_size * lds_increment) /
> -                              DIV_ROUND_UP(max_workgroup_size, 64);
> +                              DIV_ROUND_UP(max_workgroup_size, wave_size);
>         }
>
>         if (conf->num_sgprs)
> diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
> index fea0d1c8df1..966949fae4f 100644
> --- a/src/amd/vulkan/radv_shader.h
> +++ b/src/amd/vulkan/radv_shader.h
> @@ -139,6 +139,7 @@ struct radv_nir_compiler_options {
>         enum chip_class chip_class;
>         uint32_t tess_offchip_block_dw_size;
>         uint32_t address32_hi;
> +       uint8_t cs_wave_size;
>  };
>
>  enum radv_ud_index {
> --
> 2.22.0
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev