[Mesa-dev] [PATCH 2/2] i965: Implement ARB_compute_variable_group_size.
Ilia Mirkin
imirkin at alum.mit.edu
Fri Jun 1 22:44:05 UTC 2018
On Fri, Jun 1, 2018 at 6:21 PM, Plamena Manolova
<plamena.n.manolova at gmail.com> wrote:
> This patch adds the implentation of ARB_compute_variable_group_size
> for i965. We do this by storing the group size in a buffer surface,
> similarly to the work group number.
>
> Signed-off-by: Plamena Manolova <plamena.n.manolova at gmail.com>
> ---
> docs/features.txt | 2 +-
> docs/relnotes/18.2.0.html | 1 +
> src/compiler/nir/nir_lower_system_values.c | 14 ++++
> src/intel/compiler/brw_compiler.h | 2 +
> src/intel/compiler/brw_fs.cpp | 45 ++++++++----
> src/intel/compiler/brw_fs_nir.cpp | 20 ++++++
> src/intel/compiler/brw_nir_lower_cs_intrinsics.c | 87 +++++++++++++++++-------
> src/mesa/drivers/dri/i965/brw_compute.c | 25 ++++++-
> src/mesa/drivers/dri/i965/brw_context.h | 1 +
> src/mesa/drivers/dri/i965/brw_cs.c | 4 ++
> src/mesa/drivers/dri/i965/brw_wm_surface_state.c | 27 +++++++-
> src/mesa/drivers/dri/i965/intel_extensions.c | 1 +
> 12 files changed, 187 insertions(+), 42 deletions(-)
>
> diff --git a/docs/features.txt b/docs/features.txt
> index ed4050cf98..7c3c856d73 100644
> --- a/docs/features.txt
> +++ b/docs/features.txt
> @@ -298,7 +298,7 @@ Khronos, ARB, and OES extensions that are not part of any OpenGL or OpenGL ES ve
>
> GL_ARB_bindless_texture DONE (nvc0, radeonsi)
> GL_ARB_cl_event not started
> - GL_ARB_compute_variable_group_size DONE (nvc0, radeonsi)
> + GL_ARB_compute_variable_group_size DONE (nvc0, radeonsi, i965)
> GL_ARB_ES3_2_compatibility DONE (i965/gen8+)
> GL_ARB_fragment_shader_interlock DONE (i965)
> GL_ARB_gpu_shader_int64 DONE (i965/gen8+, nvc0, radeonsi, softpipe, llvmpipe)
> diff --git a/docs/relnotes/18.2.0.html b/docs/relnotes/18.2.0.html
> index a3f44a29dc..4ceeb7471f 100644
> --- a/docs/relnotes/18.2.0.html
> +++ b/docs/relnotes/18.2.0.html
> @@ -45,6 +45,7 @@ Note: some of the new features are only available with certain drivers.
>
> <ul>
> <li>GL_ARB_fragment_shader_interlock on i965</li>
> +<li>GL_ARB_compute_variable_group_size on i965</li>
> </ul>
>
> <h2>Bug fixes</h2>
> diff --git a/src/compiler/nir/nir_lower_system_values.c b/src/compiler/nir/nir_lower_system_values.c
> index 487da04262..0af6d69426 100644
> --- a/src/compiler/nir/nir_lower_system_values.c
> +++ b/src/compiler/nir/nir_lower_system_values.c
> @@ -57,6 +57,15 @@ convert_block(nir_block *block, nir_builder *b)
> * gl_WorkGroupID * gl_WorkGroupSize + gl_LocalInvocationID"
> */
>
> +
> + /*
> + * If the local work group size is variable we can't lower the global
> + * invocation id here.
> + */
> + if (b->shader->info.cs.local_size_variable) {
> + break;
> + }
> +
There appears to be some tabs vs spaces thing here.
> nir_const_value local_size;
> memset(&local_size, 0, sizeof(local_size));
> local_size.u32[0] = b->shader->info.cs.local_size[0];
> @@ -102,6 +111,11 @@ convert_block(nir_block *block, nir_builder *b)
> }
>
> case SYSTEM_VALUE_LOCAL_GROUP_SIZE: {
> + /* If the local work group size is variable we can't lower it here */
> + if (b->shader->info.cs.local_size_variable) {
> + break;
> + }
> +
> nir_const_value local_size;
> memset(&local_size, 0, sizeof(local_size));
> local_size.u32[0] = b->shader->info.cs.local_size[0];
> diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
> index 8b4e6fe2e2..f54952c28f 100644
> --- a/src/intel/compiler/brw_compiler.h
> +++ b/src/intel/compiler/brw_compiler.h
> @@ -759,6 +759,7 @@ struct brw_cs_prog_data {
> unsigned threads;
> bool uses_barrier;
> bool uses_num_work_groups;
> + bool uses_variable_group_size;
>
> struct {
> struct brw_push_const_block cross_thread;
> @@ -771,6 +772,7 @@ struct brw_cs_prog_data {
> * surface indices the CS-specific surfaces
> */
> uint32_t work_groups_start;
> + uint32_t work_group_size_start;
> /** @} */
> } binding_table;
> };
> diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
> index d67c0a4192..28730af47b 100644
> --- a/src/intel/compiler/brw_fs.cpp
> +++ b/src/intel/compiler/brw_fs.cpp
> @@ -7228,18 +7228,32 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
> int shader_time_index,
> char **error_str)
> {
> - prog_data->local_size[0] = src_shader->info.cs.local_size[0];
> - prog_data->local_size[1] = src_shader->info.cs.local_size[1];
> - prog_data->local_size[2] = src_shader->info.cs.local_size[2];
> - unsigned local_workgroup_size =
> - src_shader->info.cs.local_size[0] * src_shader->info.cs.local_size[1] *
> - src_shader->info.cs.local_size[2];
> -
> - unsigned min_dispatch_width =
> - DIV_ROUND_UP(local_workgroup_size, compiler->devinfo->max_cs_threads);
> - min_dispatch_width = MAX2(8, min_dispatch_width);
> - min_dispatch_width = util_next_power_of_two(min_dispatch_width);
> - assert(min_dispatch_width <= 32);
> + unsigned min_dispatch_width;
> +
> + if (!src_shader->info.cs.local_size_variable) {
> + unsigned local_workgroup_size =
> + src_shader->info.cs.local_size[0] * src_shader->info.cs.local_size[1] *
> + src_shader->info.cs.local_size[2];
> +
> + min_dispatch_width =
> + DIV_ROUND_UP(local_workgroup_size, compiler->devinfo->max_cs_threads);
> + min_dispatch_width = MAX2(8, min_dispatch_width);
> + min_dispatch_width = util_next_power_of_two(min_dispatch_width);
> + assert(min_dispatch_width <= 32);
> +
> + prog_data->local_size[0] = src_shader->info.cs.local_size[0];
> + prog_data->local_size[1] = src_shader->info.cs.local_size[1];
> + prog_data->local_size[2] = src_shader->info.cs.local_size[2];
> + prog_data->uses_variable_group_size = false;
> + } else {
> + /*
> + * If the local work group size is variable we have to use a dispatch
> + * width of 32 here, since at this point we don't know the actual size of
> + * the workload.
> + */
> + min_dispatch_width = 32;
Is that a good idea? You are able to specify a different maximum when
using a variable size (MAX_COMPUTE_VARIABLE_GROUP_INVOCATIONS_ARB)
s.t. this is 16 (or even 8, although that may be too few for practical
use) -- that way you would just set the max to 768 or whatever on
gen8+.
> + prog_data->uses_variable_group_size = true;
> + }
>
> fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL;
> cfg_t *cfg = NULL;
As for the rest of it, I don't know enough, but you seem to be doing a
lot of divisions and mods in the shader. These tend to be expensive
ops -- I wonder if there's a way to alleviate some of that.
-ilia
More information about the mesa-dev
mailing list