[Mesa-dev] [PATCH 2/2] i965: Implement ARB_compute_variable_group_size.

Fri Jun 1 22:44:05 UTC 2018

On Fri, Jun 1, 2018 at 6:21 PM, Plamena Manolova
<plamena.n.manolova at gmail.com> wrote:
> This patch adds the implentation of ARB_compute_variable_group_size
> for i965. We do this by storing the group size in a buffer surface,
> similarly to the work group number.
>
> Signed-off-by: Plamena Manolova <plamena.n.manolova at gmail.com>
> ---
>  docs/features.txt                                |  2 +-
>  docs/relnotes/18.2.0.html                        |  1 +
>  src/compiler/nir/nir_lower_system_values.c       | 14 ++++
>  src/intel/compiler/brw_compiler.h                |  2 +
>  src/intel/compiler/brw_fs.cpp                    | 45 ++++++++----
>  src/intel/compiler/brw_fs_nir.cpp                | 20 ++++++
>  src/intel/compiler/brw_nir_lower_cs_intrinsics.c | 87 +++++++++++++++++-------
>  src/mesa/drivers/dri/i965/brw_compute.c          | 25 ++++++-
>  src/mesa/drivers/dri/i965/brw_context.h          |  1 +
>  src/mesa/drivers/dri/i965/brw_cs.c               |  4 ++
>  src/mesa/drivers/dri/i965/brw_wm_surface_state.c | 27 +++++++-
>  src/mesa/drivers/dri/i965/intel_extensions.c     |  1 +
>  12 files changed, 187 insertions(+), 42 deletions(-)
>
> diff --git a/docs/features.txt b/docs/features.txt
> index ed4050cf98..7c3c856d73 100644
> --- a/docs/features.txt
> +++ b/docs/features.txt
> @@ -298,7 +298,7 @@ Khronos, ARB, and OES extensions that are not part of any OpenGL or OpenGL ES ve
>
>    GL_ARB_bindless_texture                               DONE (nvc0, radeonsi)
>    GL_ARB_cl_event                                       not started
> -  GL_ARB_compute_variable_group_size                    DONE (nvc0, radeonsi)
> +  GL_ARB_compute_variable_group_size                    DONE (nvc0, radeonsi, i965)
>    GL_ARB_ES3_2_compatibility                            DONE (i965/gen8+)
>    GL_ARB_fragment_shader_interlock                      DONE (i965)
>    GL_ARB_gpu_shader_int64                               DONE (i965/gen8+, nvc0, radeonsi, softpipe, llvmpipe)
> diff --git a/docs/relnotes/18.2.0.html b/docs/relnotes/18.2.0.html
> index a3f44a29dc..4ceeb7471f 100644
> --- a/docs/relnotes/18.2.0.html
> +++ b/docs/relnotes/18.2.0.html
> @@ -45,6 +45,7 @@ Note: some of the new features are only available with certain drivers.
>
>  <ul>
>  <li>GL_ARB_fragment_shader_interlock on i965</li>
> +<li>GL_ARB_compute_variable_group_size on i965</li>
>  </ul>
>
>  <h2>Bug fixes</h2>
> diff --git a/src/compiler/nir/nir_lower_system_values.c b/src/compiler/nir/nir_lower_system_values.c
> index 487da04262..0af6d69426 100644
> --- a/src/compiler/nir/nir_lower_system_values.c
> +++ b/src/compiler/nir/nir_lower_system_values.c
> @@ -57,6 +57,15 @@ convert_block(nir_block *block, nir_builder *b)
>            *    gl_WorkGroupID * gl_WorkGroupSize + gl_LocalInvocationID"
>            */
>
> +
> +          /*
> +           * If the local work group size is variable we can't lower the global
> +           * invocation id here.
> +           */
> +          if (b->shader->info.cs.local_size_variable) {
> +             break;
> +          }
> +

There appears to be some tabs vs spaces thing here.

>           nir_const_value local_size;
>           memset(&local_size, 0, sizeof(local_size));
>           local_size.u32[0] = b->shader->info.cs.local_size[0];
> @@ -102,6 +111,11 @@ convert_block(nir_block *block, nir_builder *b)
>        }
>
>        case SYSTEM_VALUE_LOCAL_GROUP_SIZE: {
> +         /* If the local work group size is variable we can't lower it here */
> +         if (b->shader->info.cs.local_size_variable) {
> +            break;
> +         }
> +
>           nir_const_value local_size;
>           memset(&local_size, 0, sizeof(local_size));
>           local_size.u32[0] = b->shader->info.cs.local_size[0];
> diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
> index 8b4e6fe2e2..f54952c28f 100644
> --- a/src/intel/compiler/brw_compiler.h
> +++ b/src/intel/compiler/brw_compiler.h
> @@ -759,6 +759,7 @@ struct brw_cs_prog_data {
>     unsigned threads;
>     bool uses_barrier;
>     bool uses_num_work_groups;
> +   bool uses_variable_group_size;
>
>     struct {
>        struct brw_push_const_block cross_thread;
> @@ -771,6 +772,7 @@ struct brw_cs_prog_data {
>         * surface indices the CS-specific surfaces
>         */
>        uint32_t work_groups_start;
> +      uint32_t work_group_size_start;
>        /** @} */
>     } binding_table;
>  };
> diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
> index d67c0a4192..28730af47b 100644
> --- a/src/intel/compiler/brw_fs.cpp
> +++ b/src/intel/compiler/brw_fs.cpp
> @@ -7228,18 +7228,32 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
>                 int shader_time_index,
>                 char **error_str)
>  {
> -   prog_data->local_size[0] = src_shader->info.cs.local_size[0];
> -   prog_data->local_size[1] = src_shader->info.cs.local_size[1];
> -   prog_data->local_size[2] = src_shader->info.cs.local_size[2];
> -   unsigned local_workgroup_size =
> -      src_shader->info.cs.local_size[0] * src_shader->info.cs.local_size[1] *
> -      src_shader->info.cs.local_size[2];
> -
> -   unsigned min_dispatch_width =
> -      DIV_ROUND_UP(local_workgroup_size, compiler->devinfo->max_cs_threads);
> -   min_dispatch_width = MAX2(8, min_dispatch_width);
> -   min_dispatch_width = util_next_power_of_two(min_dispatch_width);
> -   assert(min_dispatch_width <= 32);
> +   unsigned min_dispatch_width;
> +
> +   if (!src_shader->info.cs.local_size_variable) {
> +      unsigned local_workgroup_size =
> +         src_shader->info.cs.local_size[0] * src_shader->info.cs.local_size[1] *
> +         src_shader->info.cs.local_size[2];
> +
> +      min_dispatch_width =
> +         DIV_ROUND_UP(local_workgroup_size, compiler->devinfo->max_cs_threads);
> +      min_dispatch_width = MAX2(8, min_dispatch_width);
> +      min_dispatch_width = util_next_power_of_two(min_dispatch_width);
> +      assert(min_dispatch_width <= 32);
> +
> +      prog_data->local_size[0] = src_shader->info.cs.local_size[0];
> +      prog_data->local_size[1] = src_shader->info.cs.local_size[1];
> +      prog_data->local_size[2] = src_shader->info.cs.local_size[2];
> +      prog_data->uses_variable_group_size = false;
> +   } else {
> +      /*
> +       * If the local work group size is variable we have to use a dispatch
> +       * width of 32 here, since at this point we don't know the actual size of
> +       * the workload.
> +       */
> +      min_dispatch_width = 32;

Is that a good idea? You are able to specify a different maximum when
using a variable size (MAX_COMPUTE_VARIABLE_GROUP_INVOCATIONS_ARB)
s.t. this is 16 (or even 8, although that may be too few for practical
use) -- that way you would just set the max to 768 or whatever on
gen8+.

> +      prog_data->uses_variable_group_size = true;
> +   }
>
>     fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL;
>     cfg_t *cfg = NULL;

As for the rest of it, I don't know enough, but you seem to be doing a
lot of divisions and mods in the shader. These tend to be expensive
ops -- I wonder if there's a way to alleviate some of that.

  -ilia