[Mesa-dev] [PATCH v2 2/2] i965: Implement ARB_compute_variable_group_size.

Thu Jun 7 15:34:26 UTC 2018

This patch adds the implementation of ARB_compute_variable_group_size
for i965. We do this by storing the group size in a buffer surface,
similarly to the work group number.

v2: Fix some indentation inconsistencies (Jordan, Ilia)
    Do DIV_ROUND_UP correctly in brw_nir_lower_cs_intrinsics.c (Jordan)
    Use alphabetical order in features.txt (Matt)
    Set the extension constants properly in brw_context.c

Signed-off-by: Plamena Manolova <plamena.n.manolova at gmail.com>
---
 docs/features.txt                                |  2 +-
 docs/relnotes/18.2.0.html                        |  1 +
 src/compiler/nir/nir_lower_system_values.c       | 13 ++++
 src/intel/compiler/brw_compiler.h                |  2 +
 src/intel/compiler/brw_fs.cpp                    | 45 ++++++++----
 src/intel/compiler/brw_fs_nir.cpp                | 20 ++++++
 src/intel/compiler/brw_nir_lower_cs_intrinsics.c | 88 +++++++++++++++++-------
 src/mesa/drivers/dri/i965/brw_compute.c          | 25 ++++++-
 src/mesa/drivers/dri/i965/brw_context.c          |  6 ++
 src/mesa/drivers/dri/i965/brw_context.h          |  1 +
 src/mesa/drivers/dri/i965/brw_cs.c               |  4 ++
 src/mesa/drivers/dri/i965/brw_wm_surface_state.c | 27 +++++++-
 src/mesa/drivers/dri/i965/intel_extensions.c     |  1 +
 13 files changed, 193 insertions(+), 42 deletions(-)

diff --git a/docs/features.txt b/docs/features.txt
index ed4050cf98..81b6663288 100644
--- a/docs/features.txt
+++ b/docs/features.txt
@@ -298,7 +298,7 @@ Khronos, ARB, and OES extensions that are not part of any OpenGL or OpenGL ES ve

   GL_ARB_bindless_texture                               DONE (nvc0, radeonsi)
   GL_ARB_cl_event                                       not started
-  GL_ARB_compute_variable_group_size                    DONE (nvc0, radeonsi)
+  GL_ARB_compute_variable_group_size                    DONE (i965, nvc0, radeonsi)
   GL_ARB_ES3_2_compatibility                            DONE (i965/gen8+)
   GL_ARB_fragment_shader_interlock                      DONE (i965)
   GL_ARB_gpu_shader_int64                               DONE (i965/gen8+, nvc0, radeonsi, softpipe, llvmpipe)
diff --git a/docs/relnotes/18.2.0.html b/docs/relnotes/18.2.0.html
index 0db37b620d..7475a56633 100644
--- a/docs/relnotes/18.2.0.html
+++ b/docs/relnotes/18.2.0.html
@@ -52,6 +52,7 @@ Note: some of the new features are only available with certain drivers.

 <ul>
 <li>GL_ARB_fragment_shader_interlock on i965</li>
+<li>GL_ARB_compute_variable_group_size on i965</li>
 </ul>

 <h2>Bug fixes</h2>
diff --git a/src/compiler/nir/nir_lower_system_values.c b/src/compiler/nir/nir_lower_system_values.c
index 487da04262..7ab005b000 100644
--- a/src/compiler/nir/nir_lower_system_values.c
+++ b/src/compiler/nir/nir_lower_system_values.c
@@ -57,6 +57,14 @@ convert_block(nir_block *block, nir_builder *b)
           *    gl_WorkGroupID * gl_WorkGroupSize + gl_LocalInvocationID"
           */

+         /*
+          * If the local work group size is variable we can't lower the global
+          * invocation id here.
+          */
+         if (b->shader->info.cs.local_size_variable) {
+            break;
+         }
+
          nir_const_value local_size;
          memset(&local_size, 0, sizeof(local_size));
          local_size.u32[0] = b->shader->info.cs.local_size[0];
@@ -102,6 +110,11 @@ convert_block(nir_block *block, nir_builder *b)
       }

       case SYSTEM_VALUE_LOCAL_GROUP_SIZE: {
+         /* If the local work group size is variable we can't lower it here */
+         if (b->shader->info.cs.local_size_variable) {
+            break;
+         }
+
          nir_const_value local_size;
          memset(&local_size, 0, sizeof(local_size));
          local_size.u32[0] = b->shader->info.cs.local_size[0];
diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index 8b4e6fe2e2..f54952c28f 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -759,6 +759,7 @@ struct brw_cs_prog_data {
    unsigned threads;
    bool uses_barrier;
    bool uses_num_work_groups;
+   bool uses_variable_group_size;

    struct {
       struct brw_push_const_block cross_thread;
@@ -771,6 +772,7 @@ struct brw_cs_prog_data {
        * surface indices the CS-specific surfaces
        */
       uint32_t work_groups_start;
+      uint32_t work_group_size_start;
       /** @} */
    } binding_table;
 };
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index d67c0a4192..28730af47b 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -7228,18 +7228,32 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
                int shader_time_index,
                char **error_str)
 {
-   prog_data->local_size[0] = src_shader->info.cs.local_size[0];
-   prog_data->local_size[1] = src_shader->info.cs.local_size[1];
-   prog_data->local_size[2] = src_shader->info.cs.local_size[2];
-   unsigned local_workgroup_size =
-      src_shader->info.cs.local_size[0] * src_shader->info.cs.local_size[1] *
-      src_shader->info.cs.local_size[2];
-
-   unsigned min_dispatch_width =
-      DIV_ROUND_UP(local_workgroup_size, compiler->devinfo->max_cs_threads);
-   min_dispatch_width = MAX2(8, min_dispatch_width);
-   min_dispatch_width = util_next_power_of_two(min_dispatch_width);
-   assert(min_dispatch_width <= 32);
+   unsigned min_dispatch_width;
+
+   if (!src_shader->info.cs.local_size_variable) {
+      unsigned local_workgroup_size =
+         src_shader->info.cs.local_size[0] * src_shader->info.cs.local_size[1] *
+         src_shader->info.cs.local_size[2];
+
+      min_dispatch_width =
+         DIV_ROUND_UP(local_workgroup_size, compiler->devinfo->max_cs_threads);
+      min_dispatch_width = MAX2(8, min_dispatch_width);
+      min_dispatch_width = util_next_power_of_two(min_dispatch_width);
+      assert(min_dispatch_width <= 32);
+
+      prog_data->local_size[0] = src_shader->info.cs.local_size[0];
+      prog_data->local_size[1] = src_shader->info.cs.local_size[1];
+      prog_data->local_size[2] = src_shader->info.cs.local_size[2];
+      prog_data->uses_variable_group_size = false;
+   } else {
+      /*
+       * If the local work group size is variable we have to use a dispatch
+       * width of 32 here, since at this point we don't know the actual size of
+       * the workload.
+       */
+      min_dispatch_width = 32;
+      prog_data->uses_variable_group_size = true;
+   }

    fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL;
    cfg_t *cfg = NULL;
@@ -7324,7 +7338,12 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
          }
       } else {
          cfg = v32->cfg;
-         cs_set_simd_size(prog_data, 32);
+         if (!src_shader->info.cs.local_size_variable) {
+            cs_set_simd_size(prog_data, 32);
+         } else {
+            prog_data->simd_size = 32;
+            prog_data->threads = compiler->devinfo->max_cs_threads;
+         }
          cs_fill_push_const_info(compiler->devinfo, prog_data);
          promoted_constants = v32->promoted_constants;
       }
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 166da0aa6d..c4948c2347 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -3766,6 +3766,26 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
       break;
    }

+   case nir_intrinsic_load_local_group_size: {
+      const unsigned surface =
+         cs_prog_data->binding_table.work_group_size_start;
+
+      fs_reg surf_index = brw_imm_ud(surface);
+      brw_mark_surface_used(prog_data, surface);
+
+      /* Read the 3 GLuint components of gl_NumWorkGroups */
+      for (unsigned i = 0; i < 3; i++) {
+         fs_reg read_result =
+            emit_untyped_read(bld, surf_index,
+                              brw_imm_ud(i << 2),
+                              1 /* dims */, 1 /* size */,
+                              BRW_PREDICATE_NONE);
+         read_result.type = dest.type;
+         bld.MOV(dest, read_result);
+         dest = offset(dest, bld, 1);
+      }
+      break;
+   }
    default:
       nir_emit_intrinsic(bld, instr);
       break;
diff --git a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
index bfbdea0e8f..096e86db19 100644
--- a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
+++ b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
@@ -58,10 +58,12 @@ lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,
           *       cs_thread_local_id + subgroup_invocation;
           */
          nir_ssa_def *subgroup_id;
-         if (state->local_workgroup_size <= state->dispatch_width)
+         if ((state->local_workgroup_size <= state->dispatch_width) &&
+             !state->nir->info.cs.local_size_variable) {
             subgroup_id = nir_imm_int(b, 0);
-         else
+         } else {
             subgroup_id = nir_load_subgroup_id(b);
+         }

          nir_ssa_def *thread_local_id =
             nir_imul(b, subgroup_id, nir_imm_int(b, state->dispatch_width));
@@ -84,43 +86,81 @@ lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,
           *        (gl_WorkGroupSize.x * gl_WorkGroupSize.y)) %
           *       gl_WorkGroupSize.z;
           */
-         unsigned *size = nir->info.cs.local_size;
-
          nir_ssa_def *local_index = nir_load_local_invocation_index(b);
-
-         nir_const_value uvec3;
-         memset(&uvec3, 0, sizeof(uvec3));
-         uvec3.u32[0] = 1;
-         uvec3.u32[1] = size[0];
-         uvec3.u32[2] = size[0] * size[1];
-         nir_ssa_def *div_val = nir_build_imm(b, 3, 32, uvec3);
-         uvec3.u32[0] = size[0];
-         uvec3.u32[1] = size[1];
-         uvec3.u32[2] = size[2];
-         nir_ssa_def *mod_val = nir_build_imm(b, 3, 32, uvec3);
-
-         sysval = nir_umod(b, nir_udiv(b, local_index, div_val), mod_val);
+         if (!state->nir->info.cs.local_size_variable) {
+            unsigned *size = nir->info.cs.local_size;
+
+            nir_const_value uvec3;
+            memset(&uvec3, 0, sizeof(uvec3));
+            uvec3.u32[0] = 1;
+            uvec3.u32[1] = size[0];
+            uvec3.u32[2] = size[0] * size[1];
+            nir_ssa_def *div_val = nir_build_imm(b, 3, 32, uvec3);
+            uvec3.u32[0] = size[0];
+            uvec3.u32[1] = size[1];
+            uvec3.u32[2] = size[2];
+            nir_ssa_def *mod_val = nir_build_imm(b, 3, 32, uvec3);
+
+            sysval = nir_umod(b, nir_udiv(b, local_index, div_val), mod_val);
+         } else {
+            nir_ssa_def *group_size_xyz = nir_load_local_group_size(b);
+            nir_ssa_def *group_size_x = nir_channel(b, group_size_xyz, 0);
+            nir_ssa_def *group_size_y = nir_channel(b, group_size_xyz, 1);
+            nir_ssa_def *group_size_z = nir_channel(b, group_size_xyz, 2);
+            nir_ssa_def *result[3];
+            result[0] = nir_umod(b, local_index, group_size_x);
+            result[1] = nir_umod(b, nir_udiv(b, local_index, group_size_x),
+               group_size_y);
+            result[2] = nir_umod(b, nir_udiv(b, local_index,
+               nir_umul_high(b, group_size_x, group_size_y)), group_size_z);
+
+            sysval = nir_vec(b, result, 3);
+         }
          break;
       }

       case nir_intrinsic_load_subgroup_id:
-         if (state->local_workgroup_size > 8)
+         if (state->local_workgroup_size > 8 ||
+             state->nir->info.cs.local_size_variable) {
             continue;
+        }

          /* For small workgroup sizes, we know subgroup_id will be zero */
          sysval = nir_imm_int(b, 0);
          break;

       case nir_intrinsic_load_num_subgroups: {
-         unsigned local_workgroup_size =
-            nir->info.cs.local_size[0] * nir->info.cs.local_size[1] *
-            nir->info.cs.local_size[2];
-         unsigned num_subgroups =
-            DIV_ROUND_UP(local_workgroup_size, state->dispatch_width);
-         sysval = nir_imm_int(b, num_subgroups);
+         if (!state->nir->info.cs.local_size_variable) {
+            unsigned num_subgroups;
+            unsigned local_workgroup_size =
+               nir->info.cs.local_size[0] * nir->info.cs.local_size[1] *
+               nir->info.cs.local_size[2];
+            num_subgroups =
+               DIV_ROUND_UP(local_workgroup_size, state->dispatch_width);
+            sysval = nir_imm_int(b, num_subgroups);
+         } else {
+            nir_ssa_def *group_size_xyz = nir_load_local_group_size(b);
+            nir_ssa_def *group_size_x = nir_channel(b, group_size_xyz, 0);
+            nir_ssa_def *group_size_y = nir_channel(b, group_size_xyz, 1);
+            nir_ssa_def *group_size_z = nir_channel(b, group_size_xyz, 2);
+            nir_ssa_def *group_size = nir_imul(b, group_size_x, nir_imul(b,
+               group_size_y, group_size_z));
+            nir_ssa_def *dispatch_width = nir_imm_int(b,
+               state->dispatch_width - 1);
+
+            sysval = nir_udiv(b, group_size, dispatch_width);
+         }
          break;
       }

+      case nir_intrinsic_load_global_invocation_id: {
+         nir_ssa_def *group_id = nir_load_work_group_id(b);
+         nir_ssa_def *local_id = nir_load_local_invocation_id(b);
+         nir_ssa_def *group_size = nir_load_local_group_size(b);
+
+         sysval = nir_iadd(b, nir_imul(b, group_id, group_size), local_id);
+         break;
+      }
       default:
          continue;
       }
diff --git a/src/mesa/drivers/dri/i965/brw_compute.c b/src/mesa/drivers/dri/i965/brw_compute.c
index de08fc3ac1..7949e0ff51 100644
--- a/src/mesa/drivers/dri/i965/brw_compute.c
+++ b/src/mesa/drivers/dri/i965/brw_compute.c
@@ -121,8 +121,11 @@ brw_emit_gpgpu_walker(struct brw_context *brw)
    }

    const unsigned simd_size = prog_data->simd_size;
-   unsigned group_size = prog_data->local_size[0] *
-      prog_data->local_size[1] * prog_data->local_size[2];
+   unsigned group_size = brw->compute.group_size != NULL ?
+      brw->compute.group_size[0] * brw->compute.group_size[1] *
+         brw->compute.group_size[2] : prog_data->local_size[0] *
+            prog_data->local_size[1] * prog_data->local_size[2];
+
    unsigned thread_width_max =
       (group_size + simd_size - 1) / simd_size;

@@ -229,6 +232,7 @@ brw_dispatch_compute(struct gl_context *ctx, const GLuint *num_groups) {

    brw->compute.num_work_groups_bo = NULL;
    brw->compute.num_work_groups = num_groups;
+   brw->compute.group_size = NULL;
    ctx->NewDriverState |= BRW_NEW_CS_WORK_GROUPS;

    brw_dispatch_compute_common(ctx);
@@ -248,6 +252,22 @@ brw_dispatch_compute_indirect(struct gl_context *ctx, GLintptr indirect)
    brw->compute.num_work_groups_bo = bo;
    brw->compute.num_work_groups_offset = indirect;
    brw->compute.num_work_groups = indirect_group_counts;
+   brw->compute.group_size = NULL;
+   ctx->NewDriverState |= BRW_NEW_CS_WORK_GROUPS;
+
+   brw_dispatch_compute_common(ctx);
+}
+
+static void
+brw_dispatch_compute_group_size(struct gl_context *ctx,
+                                const GLuint *num_groups,
+                                const GLuint *group_size)
+{
+   struct brw_context *brw = brw_context(ctx);
+
+   brw->compute.num_work_groups_bo = NULL;
+   brw->compute.num_work_groups = num_groups;
+   brw->compute.group_size = group_size;
    ctx->NewDriverState |= BRW_NEW_CS_WORK_GROUPS;

    brw_dispatch_compute_common(ctx);
@@ -258,4 +278,5 @@ brw_init_compute_functions(struct dd_function_table *functions)
 {
    functions->DispatchCompute = brw_dispatch_compute;
    functions->DispatchComputeIndirect = brw_dispatch_compute_indirect;
+   functions->DispatchComputeGroupSize = brw_dispatch_compute_group_size;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 9ced230ec1..25d354e155 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -766,6 +766,12 @@ brw_initialize_cs_context_constants(struct brw_context *brw)
    ctx->Const.MaxComputeWorkGroupSize[2] = max_invocations;
    ctx->Const.MaxComputeWorkGroupInvocations = max_invocations;
    ctx->Const.MaxComputeSharedMemorySize = 64 * 1024;
+
+   /* ARB_compute_variable_group_size constants */
+   ctx->Const.MaxComputeVariableGroupSize[0] = max_invocations;
+   ctx->Const.MaxComputeVariableGroupSize[1] = max_invocations;
+   ctx->Const.MaxComputeVariableGroupSize[2] = max_invocations;
+   ctx->Const.MaxComputeVariableGroupInvocations = max_invocations;
 }

 /**
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 2613b9fda2..0fb533c369 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -931,6 +931,7 @@ struct brw_context
       struct brw_bo *num_work_groups_bo;
       GLintptr num_work_groups_offset;
       const GLuint *num_work_groups;
+      const GLuint *group_size;
    } compute;

    struct {
diff --git a/src/mesa/drivers/dri/i965/brw_cs.c b/src/mesa/drivers/dri/i965/brw_cs.c
index e3f8fc67a4..007273390b 100644
--- a/src/mesa/drivers/dri/i965/brw_cs.c
+++ b/src/mesa/drivers/dri/i965/brw_cs.c
@@ -43,6 +43,10 @@ assign_cs_binding_table_offsets(const struct gen_device_info *devinfo,
    prog_data->binding_table.work_groups_start = next_binding_table_offset;
    next_binding_table_offset++;

+   /* May not be used if the work group size is not variable. */
+   prog_data->binding_table.work_group_size_start = next_binding_table_offset;
+   next_binding_table_offset++;
+
    brw_assign_common_binding_table_offsets(devinfo, prog, &prog_data->base,
                                            next_binding_table_offset);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index 73cae9ef7c..fa8851e2b4 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -1634,7 +1634,7 @@ const struct brw_tracked_state brw_wm_image_surfaces = {
 };

 static void
-brw_upload_cs_work_groups_surface(struct brw_context *brw)
+brw_upload_cs_variable_surfaces(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
    /* _NEW_PROGRAM */
@@ -1671,6 +1671,29 @@ brw_upload_cs_work_groups_surface(struct brw_context *brw)
                                     RELOC_WRITE);
       brw->ctx.NewDriverState |= BRW_NEW_SURFACES;
    }
+
+   if (prog && cs_prog_data->uses_variable_group_size) {
+      const unsigned surf_idx =
+         cs_prog_data->binding_table.work_group_size_start;
+      uint32_t *surf_offset = &brw->cs.base.surf_offset[surf_idx];
+      struct brw_bo *bo;
+      uint32_t bo_offset;
+
+      bo = NULL;
+      brw_upload_data(&brw->upload,
+                     (void *)brw->compute.group_size,
+                      3 * sizeof(GLuint),
+                      sizeof(GLuint),
+                      &bo,
+                      &bo_offset);
+
+      brw_emit_buffer_surface_state(brw, surf_offset,
+                                    bo, bo_offset,
+                                    ISL_FORMAT_RAW,
+                                    3 * sizeof(GLuint), 1,
+                                    RELOC_WRITE);
+      brw->ctx.NewDriverState |= BRW_NEW_SURFACES;
+   }
 }

 const struct brw_tracked_state brw_cs_work_groups_surface = {
@@ -1678,5 +1701,5 @@ const struct brw_tracked_state brw_cs_work_groups_surface = {
       .brw = BRW_NEW_CS_PROG_DATA |
              BRW_NEW_CS_WORK_GROUPS
    },
-   .emit = brw_upload_cs_work_groups_surface,
+   .emit = brw_upload_cs_variable_surfaces,
 };
diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c
index 5a9369d7b4..f213360ed8 100644
--- a/src/mesa/drivers/dri/i965/intel_extensions.c
+++ b/src/mesa/drivers/dri/i965/intel_extensions.c
@@ -258,6 +258,7 @@ intelInitExtensions(struct gl_context *ctx)
             ctx->Extensions.ARB_compute_shader = true;
             ctx->Extensions.ARB_ES3_1_compatibility =
                devinfo->gen >= 8 || devinfo->is_haswell;
+            ctx->Extensions.ARB_compute_variable_group_size = true;
          }

          if (can_do_predicate_writes(brw->screen)) {
--
2.11.0