[Mesa-dev] [PATCH v2 2/2] i965: Implement ARB_compute_variable_group_size.

Jordan Justen jordan.l.justen at intel.com
Fri Jun 8 07:27:59 UTC 2018


On 2018-06-07 08:34:26, Plamena Manolova wrote:
> This patch adds the implementation of ARB_compute_variable_group_size
> for i965. We do this by storing the group size in a buffer surface,
> similarly to the work group number.
> 
> v2: Fix some indentation inconsistencies (Jordan, Ilia)
>     Do DIV_ROUND_UP correctly in brw_nir_lower_cs_intrinsics.c (Jordan)
>     Use alphabetical order in features.txt (Matt)
>     Set the extension constants properly in brw_context.c
> 
> Signed-off-by: Plamena Manolova <plamena.n.manolova at gmail.com>
> ---
>  docs/features.txt                                |  2 +-
>  docs/relnotes/18.2.0.html                        |  1 +
>  src/compiler/nir/nir_lower_system_values.c       | 13 ++++
>  src/intel/compiler/brw_compiler.h                |  2 +
>  src/intel/compiler/brw_fs.cpp                    | 45 ++++++++----
>  src/intel/compiler/brw_fs_nir.cpp                | 20 ++++++
>  src/intel/compiler/brw_nir_lower_cs_intrinsics.c | 88 +++++++++++++++++-------
>  src/mesa/drivers/dri/i965/brw_compute.c          | 25 ++++++-
>  src/mesa/drivers/dri/i965/brw_context.c          |  6 ++
>  src/mesa/drivers/dri/i965/brw_context.h          |  1 +
>  src/mesa/drivers/dri/i965/brw_cs.c               |  4 ++
>  src/mesa/drivers/dri/i965/brw_wm_surface_state.c | 27 +++++++-
>  src/mesa/drivers/dri/i965/intel_extensions.c     |  1 +
>  13 files changed, 193 insertions(+), 42 deletions(-)
> 
> diff --git a/docs/features.txt b/docs/features.txt
> index ed4050cf98..81b6663288 100644
> --- a/docs/features.txt
> +++ b/docs/features.txt
> @@ -298,7 +298,7 @@ Khronos, ARB, and OES extensions that are not part of any OpenGL or OpenGL ES ve
> 
>    GL_ARB_bindless_texture                               DONE (nvc0, radeonsi)
>    GL_ARB_cl_event                                       not started
> -  GL_ARB_compute_variable_group_size                    DONE (nvc0, radeonsi)
> +  GL_ARB_compute_variable_group_size                    DONE (i965, nvc0, radeonsi)
>    GL_ARB_ES3_2_compatibility                            DONE (i965/gen8+)
>    GL_ARB_fragment_shader_interlock                      DONE (i965)
>    GL_ARB_gpu_shader_int64                               DONE (i965/gen8+, nvc0, radeonsi, softpipe, llvmpipe)
> diff --git a/docs/relnotes/18.2.0.html b/docs/relnotes/18.2.0.html
> index 0db37b620d..7475a56633 100644
> --- a/docs/relnotes/18.2.0.html
> +++ b/docs/relnotes/18.2.0.html
> @@ -52,6 +52,7 @@ Note: some of the new features are only available with certain drivers.
> 
>  <ul>
>  <li>GL_ARB_fragment_shader_interlock on i965</li>
> +<li>GL_ARB_compute_variable_group_size on i965</li>
>  </ul>
> 
>  <h2>Bug fixes</h2>
> diff --git a/src/compiler/nir/nir_lower_system_values.c b/src/compiler/nir/nir_lower_system_values.c
> index 487da04262..7ab005b000 100644
> --- a/src/compiler/nir/nir_lower_system_values.c
> +++ b/src/compiler/nir/nir_lower_system_values.c
> @@ -57,6 +57,14 @@ convert_block(nir_block *block, nir_builder *b)
>            *    gl_WorkGroupID * gl_WorkGroupSize + gl_LocalInvocationID"
>            */
> 
> +         /*
> +          * If the local work group size is variable we can't lower the global
> +          * invocation id here.
> +          */
> +         if (b->shader->info.cs.local_size_variable) {
> +            break;
> +         }
> +
>           nir_const_value local_size;
>           memset(&local_size, 0, sizeof(local_size));
>           local_size.u32[0] = b->shader->info.cs.local_size[0];
> @@ -102,6 +110,11 @@ convert_block(nir_block *block, nir_builder *b)
>        }
> 
>        case SYSTEM_VALUE_LOCAL_GROUP_SIZE: {
> +         /* If the local work group size is variable we can't lower it here */
> +         if (b->shader->info.cs.local_size_variable) {
> +            break;
> +         }
> +
>           nir_const_value local_size;
>           memset(&local_size, 0, sizeof(local_size));
>           local_size.u32[0] = b->shader->info.cs.local_size[0];
> diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
> index 8b4e6fe2e2..f54952c28f 100644
> --- a/src/intel/compiler/brw_compiler.h
> +++ b/src/intel/compiler/brw_compiler.h
> @@ -759,6 +759,7 @@ struct brw_cs_prog_data {
>     unsigned threads;
>     bool uses_barrier;
>     bool uses_num_work_groups;
> +   bool uses_variable_group_size;
> 
>     struct {
>        struct brw_push_const_block cross_thread;
> @@ -771,6 +772,7 @@ struct brw_cs_prog_data {
>         * surface indices the CS-specific surfaces
>         */
>        uint32_t work_groups_start;
> +      uint32_t work_group_size_start;
>        /** @} */
>     } binding_table;
>  };
> diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
> index d67c0a4192..28730af47b 100644
> --- a/src/intel/compiler/brw_fs.cpp
> +++ b/src/intel/compiler/brw_fs.cpp
> @@ -7228,18 +7228,32 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
>                 int shader_time_index,
>                 char **error_str)
>  {
> -   prog_data->local_size[0] = src_shader->info.cs.local_size[0];
> -   prog_data->local_size[1] = src_shader->info.cs.local_size[1];
> -   prog_data->local_size[2] = src_shader->info.cs.local_size[2];
> -   unsigned local_workgroup_size =
> -      src_shader->info.cs.local_size[0] * src_shader->info.cs.local_size[1] *
> -      src_shader->info.cs.local_size[2];
> -
> -   unsigned min_dispatch_width =
> -      DIV_ROUND_UP(local_workgroup_size, compiler->devinfo->max_cs_threads);
> -   min_dispatch_width = MAX2(8, min_dispatch_width);
> -   min_dispatch_width = util_next_power_of_two(min_dispatch_width);
> -   assert(min_dispatch_width <= 32);

I think based on Ilia's feedback (see more comments below), we might
be able to support simd16 rather than simd32 on many platforms.

With the changes suggested below, what if we computed
local_workgroup_size like this:

   unsigned local_workgroup_size;
   if (!src_shader->info.cs.local_size_variable) {
      local_workgroup_size =
         src_shader->info.cs.local_size[0] * src_shader->info.cs.local_size[1] *
         src_shader->info.cs.local_size[2];
   } else {
      local_workgroup_size = ctx->Const.MaxComputeVariableGroupInvocations;
   }

And then use the same old code to find min_dispatch_width?

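To spell out the second half of that: the old min_dispatch_width
calculation would stay as-is and just consume the capped value.
Rough, untested sketch (note brw_compile_cs doesn't get a gl_context,
so the cap would have to reach the compiler some way other than
ctx->Const):

   unsigned min_dispatch_width =
      DIV_ROUND_UP(local_workgroup_size, compiler->devinfo->max_cs_threads);
   min_dispatch_width = MAX2(8, min_dispatch_width);
   min_dispatch_width = util_next_power_of_two(min_dispatch_width);
   assert(min_dispatch_width <= 32);
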
I think we might then be able to use simd8 or simd16 on most
platforms. (simd32 can be slow if there is a lot of register usage
in the program.)

> +   unsigned min_dispatch_width;
> +
> +   if (!src_shader->info.cs.local_size_variable) {
> +      unsigned local_workgroup_size =
> +         src_shader->info.cs.local_size[0] * src_shader->info.cs.local_size[1] *
> +         src_shader->info.cs.local_size[2];
> +
> +      min_dispatch_width =
> +         DIV_ROUND_UP(local_workgroup_size, compiler->devinfo->max_cs_threads);
> +      min_dispatch_width = MAX2(8, min_dispatch_width);
> +      min_dispatch_width = util_next_power_of_two(min_dispatch_width);
> +      assert(min_dispatch_width <= 32);
> +
> +      prog_data->local_size[0] = src_shader->info.cs.local_size[0];
> +      prog_data->local_size[1] = src_shader->info.cs.local_size[1];
> +      prog_data->local_size[2] = src_shader->info.cs.local_size[2];
> +      prog_data->uses_variable_group_size = false;
> +   } else {
> +      /*
> +       * If the local work group size is variable we have to use a dispatch
> +       * width of 32 here, since at this point we don't know the actual size of
> +       * the workload.
> +       */
> +      min_dispatch_width = 32;
> +      prog_data->uses_variable_group_size = true;
> +   }
> 
>     fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL;
>     cfg_t *cfg = NULL;
> @@ -7324,7 +7338,12 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
>           }
>        } else {
>           cfg = v32->cfg;
> -         cs_set_simd_size(prog_data, 32);
> +         if (!src_shader->info.cs.local_size_variable) {
> +            cs_set_simd_size(prog_data, 32);
> +         } else {
> +            prog_data->simd_size = 32;
> +            prog_data->threads = compiler->devinfo->max_cs_threads;
> +         }
>           cs_fill_push_const_info(compiler->devinfo, prog_data);
>           promoted_constants = v32->promoted_constants;
>        }
> diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
> index 166da0aa6d..c4948c2347 100644
> --- a/src/intel/compiler/brw_fs_nir.cpp
> +++ b/src/intel/compiler/brw_fs_nir.cpp
> @@ -3766,6 +3766,26 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
>        break;
>     }
> 
> +   case nir_intrinsic_load_local_group_size: {
> +      const unsigned surface =
> +         cs_prog_data->binding_table.work_group_size_start;
> +
> +      fs_reg surf_index = brw_imm_ud(surface);
> +      brw_mark_surface_used(prog_data, surface);
> +
> +      /* Read the 3 GLuint components of gl_LocalGroupSizeARB */
> +      for (unsigned i = 0; i < 3; i++) {
> +         fs_reg read_result =
> +            emit_untyped_read(bld, surf_index,
> +                              brw_imm_ud(i << 2),
> +                              1 /* dims */, 1 /* size */,
> +                              BRW_PREDICATE_NONE);
> +         read_result.type = dest.type;
> +         bld.MOV(dest, read_result);
> +         dest = offset(dest, bld, 1);
> +      }
> +      break;
> +   }
>     default:
>        nir_emit_intrinsic(bld, instr);
>        break;
> diff --git a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
> index bfbdea0e8f..096e86db19 100644
> --- a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
> +++ b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
> @@ -58,10 +58,12 @@ lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,
>            *       cs_thread_local_id + subgroup_invocation;
>            */
>           nir_ssa_def *subgroup_id;
> -         if (state->local_workgroup_size <= state->dispatch_width)
> +         if ((state->local_workgroup_size <= state->dispatch_width) &&
> +             !state->nir->info.cs.local_size_variable) {
>              subgroup_id = nir_imm_int(b, 0);
> -         else
> +         } else {
>              subgroup_id = nir_load_subgroup_id(b);
> +         }
> 
>           nir_ssa_def *thread_local_id =
>              nir_imul(b, subgroup_id, nir_imm_int(b, state->dispatch_width));
> @@ -84,43 +86,81 @@ lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,
>            *        (gl_WorkGroupSize.x * gl_WorkGroupSize.y)) %
>            *       gl_WorkGroupSize.z;
>            */
> -         unsigned *size = nir->info.cs.local_size;
> -
>           nir_ssa_def *local_index = nir_load_local_invocation_index(b);
> -
> -         nir_const_value uvec3;
> -         memset(&uvec3, 0, sizeof(uvec3));
> -         uvec3.u32[0] = 1;
> -         uvec3.u32[1] = size[0];
> -         uvec3.u32[2] = size[0] * size[1];
> -         nir_ssa_def *div_val = nir_build_imm(b, 3, 32, uvec3);
> -         uvec3.u32[0] = size[0];
> -         uvec3.u32[1] = size[1];
> -         uvec3.u32[2] = size[2];
> -         nir_ssa_def *mod_val = nir_build_imm(b, 3, 32, uvec3);
> -
> -         sysval = nir_umod(b, nir_udiv(b, local_index, div_val), mod_val);
> +         if (!state->nir->info.cs.local_size_variable) {
> +            unsigned *size = nir->info.cs.local_size;
> +
> +            nir_const_value uvec3;
> +            memset(&uvec3, 0, sizeof(uvec3));
> +            uvec3.u32[0] = 1;
> +            uvec3.u32[1] = size[0];
> +            uvec3.u32[2] = size[0] * size[1];
> +            nir_ssa_def *div_val = nir_build_imm(b, 3, 32, uvec3);
> +            uvec3.u32[0] = size[0];
> +            uvec3.u32[1] = size[1];
> +            uvec3.u32[2] = size[2];
> +            nir_ssa_def *mod_val = nir_build_imm(b, 3, 32, uvec3);
> +
> +            sysval = nir_umod(b, nir_udiv(b, local_index, div_val), mod_val);
> +         } else {
> +            nir_ssa_def *group_size_xyz = nir_load_local_group_size(b);
> +            nir_ssa_def *group_size_x = nir_channel(b, group_size_xyz, 0);
> +            nir_ssa_def *group_size_y = nir_channel(b, group_size_xyz, 1);
> +            nir_ssa_def *group_size_z = nir_channel(b, group_size_xyz, 2);
> +            nir_ssa_def *result[3];
> +            result[0] = nir_umod(b, local_index, group_size_x);
> +            result[1] = nir_umod(b, nir_udiv(b, local_index, group_size_x),
> +               group_size_y);
> +            result[2] = nir_umod(b, nir_udiv(b, local_index,
> +               nir_imul(b, group_size_x, group_size_y)), group_size_z);
> +
> +            sysval = nir_vec(b, result, 3);
> +         }
>           break;
>        }
> 
>        case nir_intrinsic_load_subgroup_id:
> -         if (state->local_workgroup_size > 8)
> +         if (state->local_workgroup_size > 8 ||
> +             state->nir->info.cs.local_size_variable) {
>              continue;
> +         }
> 
>           /* For small workgroup sizes, we know subgroup_id will be zero */
>           sysval = nir_imm_int(b, 0);
>           break;
> 
>        case nir_intrinsic_load_num_subgroups: {
> -         unsigned local_workgroup_size =
> -            nir->info.cs.local_size[0] * nir->info.cs.local_size[1] *
> -            nir->info.cs.local_size[2];
> -         unsigned num_subgroups =
> -            DIV_ROUND_UP(local_workgroup_size, state->dispatch_width);
> -         sysval = nir_imm_int(b, num_subgroups);
> +         if (!state->nir->info.cs.local_size_variable) {
> +            unsigned num_subgroups;
> +            unsigned local_workgroup_size =
> +               nir->info.cs.local_size[0] * nir->info.cs.local_size[1] *
> +               nir->info.cs.local_size[2];
> +            num_subgroups =
> +               DIV_ROUND_UP(local_workgroup_size, state->dispatch_width);
> +            sysval = nir_imm_int(b, num_subgroups);
> +         } else {
> +            nir_ssa_def *group_size_xyz = nir_load_local_group_size(b);
> +            nir_ssa_def *group_size_x = nir_channel(b, group_size_xyz, 0);
> +            nir_ssa_def *group_size_y = nir_channel(b, group_size_xyz, 1);
> +            nir_ssa_def *group_size_z = nir_channel(b, group_size_xyz, 2);
> +            nir_ssa_def *group_size = nir_imul(b, group_size_x, nir_imul(b,
> +               group_size_y, group_size_z));
> +            nir_ssa_def *dispatch_width = nir_imm_int(b,
> +               state->dispatch_width - 1);
> +
> +            sysval = nir_udiv(b, group_size, dispatch_width);
> +         }
>           break;
>        }
> 
> +      case nir_intrinsic_load_global_invocation_id: {
> +         nir_ssa_def *group_id = nir_load_work_group_id(b);
> +         nir_ssa_def *local_id = nir_load_local_invocation_id(b);
> +         nir_ssa_def *group_size = nir_load_local_group_size(b);
> +
> +         sysval = nir_iadd(b, nir_imul(b, group_id, group_size), local_id);
> +         break;
> +      }
>        default:
>           continue;
>        }
> diff --git a/src/mesa/drivers/dri/i965/brw_compute.c b/src/mesa/drivers/dri/i965/brw_compute.c
> index de08fc3ac1..7949e0ff51 100644
> --- a/src/mesa/drivers/dri/i965/brw_compute.c
> +++ b/src/mesa/drivers/dri/i965/brw_compute.c
> @@ -121,8 +121,11 @@ brw_emit_gpgpu_walker(struct brw_context *brw)
>     }
> 
>     const unsigned simd_size = prog_data->simd_size;
> -   unsigned group_size = prog_data->local_size[0] *
> -      prog_data->local_size[1] * prog_data->local_size[2];
> +   unsigned group_size = brw->compute.group_size != NULL ?
> +      brw->compute.group_size[0] * brw->compute.group_size[1] *
> +         brw->compute.group_size[2] : prog_data->local_size[0] *
> +            prog_data->local_size[1] * prog_data->local_size[2];
> +
>     unsigned thread_width_max =
>        (group_size + simd_size - 1) / simd_size;
> 
> @@ -229,6 +232,7 @@ brw_dispatch_compute(struct gl_context *ctx, const GLuint *num_groups) {
> 
>     brw->compute.num_work_groups_bo = NULL;
>     brw->compute.num_work_groups = num_groups;
> +   brw->compute.group_size = NULL;
>     ctx->NewDriverState |= BRW_NEW_CS_WORK_GROUPS;
> 
>     brw_dispatch_compute_common(ctx);
> @@ -248,6 +252,22 @@ brw_dispatch_compute_indirect(struct gl_context *ctx, GLintptr indirect)
>     brw->compute.num_work_groups_bo = bo;
>     brw->compute.num_work_groups_offset = indirect;
>     brw->compute.num_work_groups = indirect_group_counts;
> +   brw->compute.group_size = NULL;
> +   ctx->NewDriverState |= BRW_NEW_CS_WORK_GROUPS;
> +
> +   brw_dispatch_compute_common(ctx);
> +}
> +
> +static void
> +brw_dispatch_compute_group_size(struct gl_context *ctx,
> +                                const GLuint *num_groups,
> +                                const GLuint *group_size)
> +{
> +   struct brw_context *brw = brw_context(ctx);
> +
> +   brw->compute.num_work_groups_bo = NULL;
> +   brw->compute.num_work_groups = num_groups;
> +   brw->compute.group_size = group_size;
>     ctx->NewDriverState |= BRW_NEW_CS_WORK_GROUPS;
> 
>     brw_dispatch_compute_common(ctx);
> @@ -258,4 +278,5 @@ brw_init_compute_functions(struct dd_function_table *functions)
>  {
>     functions->DispatchCompute = brw_dispatch_compute;
>     functions->DispatchComputeIndirect = brw_dispatch_compute_indirect;
> +   functions->DispatchComputeGroupSize = brw_dispatch_compute_group_size;
>  }
> diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
> index 9ced230ec1..25d354e155 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.c
> +++ b/src/mesa/drivers/dri/i965/brw_context.c
> @@ -766,6 +766,12 @@ brw_initialize_cs_context_constants(struct brw_context *brw)
>     ctx->Const.MaxComputeWorkGroupSize[2] = max_invocations;
>     ctx->Const.MaxComputeWorkGroupInvocations = max_invocations;
>     ctx->Const.MaxComputeSharedMemorySize = 64 * 1024;
> +
> +   /* ARB_compute_variable_group_size constants */
> +   ctx->Const.MaxComputeVariableGroupSize[0] = max_invocations;
> +   ctx->Const.MaxComputeVariableGroupSize[1] = max_invocations;
> +   ctx->Const.MaxComputeVariableGroupSize[2] = max_invocations;
> +   ctx->Const.MaxComputeVariableGroupInvocations = max_invocations;

I think Ilia's feedback was a bit different. If you look at:

https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_compute_variable_group_size.txt

Notice that MAX_COMPUTE_VARIABLE_GROUP_SIZE_ARB and
MAX_COMPUTE_VARIABLE_GROUP_INVOCATIONS_ARB are only required to be at
least 512.

Can we try something like this? (hope I got it right! :)

   const uint32_t max_var_invocations =
      (max_threads >= 64 ? 8 : (max_threads >= 32 ? 16 : 32)) * max_threads;

   /* ARB_compute_variable_group_size constants */
   ctx->Const.MaxComputeVariableGroupSize[0] = max_var_invocations;
   ctx->Const.MaxComputeVariableGroupSize[1] = max_var_invocations;
   ctx->Const.MaxComputeVariableGroupSize[2] = max_var_invocations;
   ctx->Const.MaxComputeVariableGroupInvocations = max_var_invocations;

On gen7, max_threads should be 64, so we could use simd8 and still
get a size of 512 invocations. We might still end up with simd16 when
there is no register spilling, but the lower cap allows simd8 to be
used when there is register pressure.

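To make the arithmetic explicit (my numbers, so worth double-checking):
with max_threads = 64 the cap above works out to

   8 * 64 = 512

invocations, i.e. exactly the required minimum, and feeding 512 back
into the old min_dispatch_width code gives

   DIV_ROUND_UP(512, 64) = 8

so simd8 stays legal. A much larger cap like max_invocations would
push min_dispatch_width up towards 32 on the same hardware.
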
Ilia also had some feedback about nir_intrinsic_load_num_subgroups,
but I haven't had a chance to look closer at it yet.

-Jordan

>  }
> 
>  /**
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
> index 2613b9fda2..0fb533c369 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -931,6 +931,7 @@ struct brw_context
>        struct brw_bo *num_work_groups_bo;
>        GLintptr num_work_groups_offset;
>        const GLuint *num_work_groups;
> +      const GLuint *group_size;
>     } compute;
> 
>     struct {
> diff --git a/src/mesa/drivers/dri/i965/brw_cs.c b/src/mesa/drivers/dri/i965/brw_cs.c
> index e3f8fc67a4..007273390b 100644
> --- a/src/mesa/drivers/dri/i965/brw_cs.c
> +++ b/src/mesa/drivers/dri/i965/brw_cs.c
> @@ -43,6 +43,10 @@ assign_cs_binding_table_offsets(const struct gen_device_info *devinfo,
>     prog_data->binding_table.work_groups_start = next_binding_table_offset;
>     next_binding_table_offset++;
> 
> +   /* May not be used if the work group size is not variable. */
> +   prog_data->binding_table.work_group_size_start = next_binding_table_offset;
> +   next_binding_table_offset++;
> +
>     brw_assign_common_binding_table_offsets(devinfo, prog, &prog_data->base,
>                                             next_binding_table_offset);
>  }
> diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
> index 73cae9ef7c..fa8851e2b4 100644
> --- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
> +++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
> @@ -1634,7 +1634,7 @@ const struct brw_tracked_state brw_wm_image_surfaces = {
>  };
> 
>  static void
> -brw_upload_cs_work_groups_surface(struct brw_context *brw)
> +brw_upload_cs_variable_surfaces(struct brw_context *brw)
>  {
>     struct gl_context *ctx = &brw->ctx;
>     /* _NEW_PROGRAM */
> @@ -1671,6 +1671,29 @@ brw_upload_cs_work_groups_surface(struct brw_context *brw)
>                                      RELOC_WRITE);
>        brw->ctx.NewDriverState |= BRW_NEW_SURFACES;
>     }
> +
> +   if (prog && cs_prog_data->uses_variable_group_size) {
> +      const unsigned surf_idx =
> +         cs_prog_data->binding_table.work_group_size_start;
> +      uint32_t *surf_offset = &brw->cs.base.surf_offset[surf_idx];
> +      struct brw_bo *bo;
> +      uint32_t bo_offset;
> +
> +      bo = NULL;
> +      brw_upload_data(&brw->upload,
> +                      (void *)brw->compute.group_size,
> +                      3 * sizeof(GLuint),
> +                      sizeof(GLuint),
> +                      &bo,
> +                      &bo_offset);
> +
> +      brw_emit_buffer_surface_state(brw, surf_offset,
> +                                    bo, bo_offset,
> +                                    ISL_FORMAT_RAW,
> +                                    3 * sizeof(GLuint), 1,
> +                                    RELOC_WRITE);
> +      brw->ctx.NewDriverState |= BRW_NEW_SURFACES;
> +   }
>  }
> 
>  const struct brw_tracked_state brw_cs_work_groups_surface = {
> @@ -1678,5 +1701,5 @@ const struct brw_tracked_state brw_cs_work_groups_surface = {
>        .brw = BRW_NEW_CS_PROG_DATA |
>               BRW_NEW_CS_WORK_GROUPS
>     },
> -   .emit = brw_upload_cs_work_groups_surface,
> +   .emit = brw_upload_cs_variable_surfaces,
>  };
> diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c
> index 5a9369d7b4..f213360ed8 100644
> --- a/src/mesa/drivers/dri/i965/intel_extensions.c
> +++ b/src/mesa/drivers/dri/i965/intel_extensions.c
> @@ -258,6 +258,7 @@ intelInitExtensions(struct gl_context *ctx)
>              ctx->Extensions.ARB_compute_shader = true;
>              ctx->Extensions.ARB_ES3_1_compatibility =
>                 devinfo->gen >= 8 || devinfo->is_haswell;
> +            ctx->Extensions.ARB_compute_variable_group_size = true;
>           }
> 
>           if (can_do_predicate_writes(brw->screen)) {
> --
> 2.11.0

