<div dir="ltr">Thank you for reviewing this, Jordan!<br><br><div class="gmail_quote"><div dir="ltr">On Fri, 1 Jun 2018 at 23:45, Jordan Justen <<a href="mailto:jordan.l.justen@intel.com">jordan.l.justen@intel.com</a>> wrote:<br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">On 2018-06-01 15:21:34, Plamena Manolova wrote:<br>
> This patch adds the implementation of ARB_compute_variable_group_size<br>
> for i965. We do this by storing the group size in a buffer surface,<br>
> similarly to the work group number.<br>
> <br>
> Signed-off-by: Plamena Manolova <<a href="mailto:plamena.n.manolova@gmail.com" target="_blank">plamena.n.manolova@gmail.com</a>><br>
> ---<br>
> docs/features.txt | 2 +-<br>
> docs/relnotes/18.2.0.html | 1 +<br>
> src/compiler/nir/nir_lower_system_values.c | 14 ++++<br>
> src/intel/compiler/brw_compiler.h | 2 +<br>
> src/intel/compiler/brw_fs.cpp | 45 ++++++++----<br>
> src/intel/compiler/brw_fs_nir.cpp | 20 ++++++<br>
> src/intel/compiler/brw_nir_lower_cs_intrinsics.c | 87 +++++++++++++++++-------<br>
> src/mesa/drivers/dri/i965/brw_compute.c | 25 ++++++-<br>
> src/mesa/drivers/dri/i965/brw_context.h | 1 +<br>
> src/mesa/drivers/dri/i965/brw_cs.c | 4 ++<br>
> src/mesa/drivers/dri/i965/brw_wm_surface_state.c | 27 +++++++-<br>
> src/mesa/drivers/dri/i965/intel_extensions.c | 1 +<br>
> 12 files changed, 187 insertions(+), 42 deletions(-)<br>
> <br>
> diff --git a/docs/features.txt b/docs/features.txt<br>
> index ed4050cf98..7c3c856d73 100644<br>
> --- a/docs/features.txt<br>
> +++ b/docs/features.txt<br>
> @@ -298,7 +298,7 @@ Khronos, ARB, and OES extensions that are not part of any OpenGL or OpenGL ES ve<br>
> <br>
> GL_ARB_bindless_texture DONE (nvc0, radeonsi)<br>
> GL_ARB_cl_event not started<br>
> - GL_ARB_compute_variable_group_size DONE (nvc0, radeonsi)<br>
> + GL_ARB_compute_variable_group_size DONE (nvc0, radeonsi, i965)<br>
> GL_ARB_ES3_2_compatibility DONE (i965/gen8+)<br>
> GL_ARB_fragment_shader_interlock DONE (i965)<br>
> GL_ARB_gpu_shader_int64 DONE (i965/gen8+, nvc0, radeonsi, softpipe, llvmpipe)<br>
> diff --git a/docs/relnotes/18.2.0.html b/docs/relnotes/18.2.0.html<br>
> index a3f44a29dc..4ceeb7471f 100644<br>
> --- a/docs/relnotes/18.2.0.html<br>
> +++ b/docs/relnotes/18.2.0.html<br>
> @@ -45,6 +45,7 @@ Note: some of the new features are only available with certain drivers.<br>
> <br>
> <ul><br>
> <li>GL_ARB_fragment_shader_interlock on i965</li><br>
> +<li>GL_ARB_compute_variable_group_size on i965</li><br>
> </ul><br>
> <br>
> <h2>Bug fixes</h2><br>
> diff --git a/src/compiler/nir/nir_lower_system_values.c b/src/compiler/nir/nir_lower_system_values.c<br>
> index 487da04262..0af6d69426 100644<br>
> --- a/src/compiler/nir/nir_lower_system_values.c<br>
> +++ b/src/compiler/nir/nir_lower_system_values.c<br>
> @@ -57,6 +57,15 @@ convert_block(nir_block *block, nir_builder *b)<br>
> * gl_WorkGroupID * gl_WorkGroupSize + gl_LocalInvocationID"<br>
> */<br>
> <br>
> +<br>
<br>
Extra line.<br>
<br>
> + /*<br>
> + * If the local work group size is variable we can't lower the global<br>
> + * invocation id here.<br>
> + */<br>
> + if (b->shader->info.cs.local_size_variable) {<br>
> + break;<br>
> + }<br>
> +<br>
<br>
The indent looks off here. One extra space?<br>
<br>
> nir_const_value local_size;<br>
> memset(&local_size, 0, sizeof(local_size));<br>
> local_size.u32[0] = b->shader->info.cs.local_size[0];<br>
> @@ -102,6 +111,11 @@ convert_block(nir_block *block, nir_builder *b)<br>
> }<br>
> <br>
> case SYSTEM_VALUE_LOCAL_GROUP_SIZE: {<br>
> + /* If the local work group size is variable we can't lower it here */<br>
> + if (b->shader->info.cs.local_size_variable) {<br>
> + break;<br>
> + }<br>
> +<br>
> nir_const_value local_size;<br>
> memset(&local_size, 0, sizeof(local_size));<br>
> local_size.u32[0] = b->shader->info.cs.local_size[0];<br>
> diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h<br>
> index 8b4e6fe2e2..f54952c28f 100644<br>
> --- a/src/intel/compiler/brw_compiler.h<br>
> +++ b/src/intel/compiler/brw_compiler.h<br>
> @@ -759,6 +759,7 @@ struct brw_cs_prog_data {<br>
> unsigned threads;<br>
> bool uses_barrier;<br>
> bool uses_num_work_groups;<br>
> + bool uses_variable_group_size;<br>
> <br>
> struct {<br>
> struct brw_push_const_block cross_thread;<br>
> @@ -771,6 +772,7 @@ struct brw_cs_prog_data {<br>
> * surface indices the CS-specific surfaces<br>
> */<br>
> uint32_t work_groups_start;<br>
> + uint32_t work_group_size_start;<br>
> /** @} */<br>
> } binding_table;<br>
> };<br>
> diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp<br>
> index d67c0a4192..28730af47b 100644<br>
> --- a/src/intel/compiler/brw_fs.cpp<br>
> +++ b/src/intel/compiler/brw_fs.cpp<br>
> @@ -7228,18 +7228,32 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,<br>
> int shader_time_index,<br>
> char **error_str)<br>
> {<br>
> - prog_data->local_size[0] = src_shader->info.cs.local_size[0];<br>
> - prog_data->local_size[1] = src_shader->info.cs.local_size[1];<br>
> - prog_data->local_size[2] = src_shader->info.cs.local_size[2];<br>
> - unsigned local_workgroup_size =<br>
> - src_shader->info.cs.local_size[0] * src_shader->info.cs.local_size[1] *<br>
> - src_shader->info.cs.local_size[2];<br>
> -<br>
> - unsigned min_dispatch_width =<br>
> - DIV_ROUND_UP(local_workgroup_size, compiler->devinfo->max_cs_threads);<br>
> - min_dispatch_width = MAX2(8, min_dispatch_width);<br>
> - min_dispatch_width = util_next_power_of_two(min_dispatch_width);<br>
> - assert(min_dispatch_width <= 32);<br>
> + unsigned min_dispatch_width;<br>
> +<br>
> + if (!src_shader->info.cs.local_size_variable) {<br>
> + unsigned local_workgroup_size =<br>
> + src_shader->info.cs.local_size[0] * src_shader->info.cs.local_size[1] *<br>
> + src_shader->info.cs.local_size[2];<br>
> +<br>
> + min_dispatch_width =<br>
> + DIV_ROUND_UP(local_workgroup_size, compiler->devinfo->max_cs_threads);<br>
> + min_dispatch_width = MAX2(8, min_dispatch_width);<br>
> + min_dispatch_width = util_next_power_of_two(min_dispatch_width);<br>
> + assert(min_dispatch_width <= 32);<br>
> +<br>
> + prog_data->local_size[0] = src_shader->info.cs.local_size[0];<br>
> + prog_data->local_size[1] = src_shader->info.cs.local_size[1];<br>
> + prog_data->local_size[2] = src_shader->info.cs.local_size[2];<br>
> + prog_data->uses_variable_group_size = false;<br>
> + } else {<br>
> + /*<br>
> + * If the local work group size is variable we have to use a dispatch<br>
> + * width of 32 here, since at this point we don't know the actual size of<br>
> + * the workload.<br>
> + */<br>
> + min_dispatch_width = 32;<br>
<br>
If we find cases where this leads to bad perf, we could look into<br>
generating other sizes too. If they end up using a smaller size, then<br>
we might want to run the simd8 or simd16 instead if there is spilling.<br></blockquote><div> </div><div>I can have a go at making a patch for that if you think it'll improve performance.</div><div><br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
<br>
But, this seems like a good first step.<br>
<br>
> + prog_data->uses_variable_group_size = true;<br>
> + }<br>
> <br>
> fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL;<br>
> cfg_t *cfg = NULL;<br>
> @@ -7324,7 +7338,12 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,<br>
> }<br>
> } else {<br>
> cfg = v32->cfg;<br>
> - cs_set_simd_size(prog_data, 32);<br>
> + if (!src_shader->info.cs.local_size_variable) {<br>
> + cs_set_simd_size(prog_data, 32);<br>
> + } else {<br>
> + prog_data->simd_size = 32;<br>
> + prog_data->threads = compiler->devinfo->max_cs_threads;<br>
> + }<br>
> cs_fill_push_const_info(compiler->devinfo, prog_data);<br>
> promoted_constants = v32->promoted_constants;<br>
> }<br>
> diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp<br>
> index 166da0aa6d..c4948c2347 100644<br>
> --- a/src/intel/compiler/brw_fs_nir.cpp<br>
> +++ b/src/intel/compiler/brw_fs_nir.cpp<br>
> @@ -3766,6 +3766,26 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,<br>
> break;<br>
> }<br>
> <br>
> + case nir_intrinsic_load_local_group_size: {<br>
> + const unsigned surface =<br>
> + cs_prog_data->binding_table.work_group_size_start;<br>
> +<br>
> + fs_reg surf_index = brw_imm_ud(surface);<br>
> + brw_mark_surface_used(prog_data, surface);<br>
> +<br>
> + /* Read the 3 GLuint components of gl_LocalGroupSize */<br>
> + for (unsigned i = 0; i < 3; i++) {<br>
> + fs_reg read_result =<br>
> + emit_untyped_read(bld, surf_index,<br>
> + brw_imm_ud(i << 2),<br>
> + 1 /* dims */, 1 /* size */,<br>
> + BRW_PREDICATE_NONE);<br>
> + read_result.type = dest.type;<br>
> + bld.MOV(dest, read_result);<br>
> + dest = offset(dest, bld, 1);<br>
> + }<br>
> + break;<br>
> + }<br>
> default:<br>
> nir_emit_intrinsic(bld, instr);<br>
> break;<br>
> diff --git a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c<br>
> index bfbdea0e8f..e9c9d34502 100644<br>
> --- a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c<br>
> +++ b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c<br>
> @@ -58,10 +58,12 @@ lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,<br>
> * cs_thread_local_id + subgroup_invocation;<br>
> */<br>
> nir_ssa_def *subgroup_id;<br>
> - if (state->local_workgroup_size <= state->dispatch_width)<br>
> + if ((state->local_workgroup_size <= state->dispatch_width) &&<br>
> + !state->nir->info.cs.local_size_variable) {<br>
> subgroup_id = nir_imm_int(b, 0);<br>
> - else<br>
> + } else {<br>
> subgroup_id = nir_load_subgroup_id(b);<br>
> + }<br>
> <br>
> nir_ssa_def *thread_local_id =<br>
> nir_imul(b, subgroup_id, nir_imm_int(b, state->dispatch_width));<br>
> @@ -84,43 +86,80 @@ lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,<br>
> * (gl_WorkGroupSize.x * gl_WorkGroupSize.y)) %<br>
> * gl_WorkGroupSize.z;<br>
> */<br>
> - unsigned *size = nir->info.cs.local_size;<br>
> -<br>
> nir_ssa_def *local_index = nir_load_local_invocation_index(b);<br>
> -<br>
> - nir_const_value uvec3;<br>
> - memset(&uvec3, 0, sizeof(uvec3));<br>
> - uvec3.u32[0] = 1;<br>
> - uvec3.u32[1] = size[0];<br>
> - uvec3.u32[2] = size[0] * size[1];<br>
> - nir_ssa_def *div_val = nir_build_imm(b, 3, 32, uvec3);<br>
> - uvec3.u32[0] = size[0];<br>
> - uvec3.u32[1] = size[1];<br>
> - uvec3.u32[2] = size[2];<br>
> - nir_ssa_def *mod_val = nir_build_imm(b, 3, 32, uvec3);<br>
> -<br>
> - sysval = nir_umod(b, nir_udiv(b, local_index, div_val), mod_val);<br>
> + if (!state->nir->info.cs.local_size_variable) {<br>
> + unsigned *size = nir->info.cs.local_size;<br>
> +<br>
> + nir_const_value uvec3;<br>
> + memset(&uvec3, 0, sizeof(uvec3));<br>
> + uvec3.u32[0] = 1;<br>
> + uvec3.u32[1] = size[0];<br>
> + uvec3.u32[2] = size[0] * size[1];<br>
> + nir_ssa_def *div_val = nir_build_imm(b, 3, 32, uvec3);<br>
> + uvec3.u32[0] = size[0];<br>
> + uvec3.u32[1] = size[1];<br>
> + uvec3.u32[2] = size[2];<br>
> + nir_ssa_def *mod_val = nir_build_imm(b, 3, 32, uvec3);<br>
> +<br>
> + sysval = nir_umod(b, nir_udiv(b, local_index, div_val), mod_val);<br>
> + } else {<br>
> + nir_ssa_def *group_size_xyz = nir_load_local_group_size(b);<br>
> + nir_ssa_def *group_size_x = nir_channel(b, group_size_xyz, 0);<br>
> + nir_ssa_def *group_size_y = nir_channel(b, group_size_xyz, 1);<br>
> + nir_ssa_def *group_size_z = nir_channel(b, group_size_xyz, 2);<br>
> + nir_ssa_def *result[3];<br>
> + result[0] = nir_umod(b, local_index, group_size_x);<br>
> + result[1] = nir_umod(b, nir_udiv(b, local_index, group_size_x),<br>
> + group_size_y);<br>
> + result[2] = nir_umod(b, nir_udiv(b, local_index,<br>
> + nir_umul_high(b, group_size_x, group_size_y)), group_size_z);<br>
> +<br>
> + sysval = nir_vec(b, result, 3);<br>
> + }<br>
> break;<br>
> }<br>
> <br>
> case nir_intrinsic_load_subgroup_id:<br>
> - if (state->local_workgroup_size > 8)<br>
> + if (state->local_workgroup_size > 8 ||<br>
> + state->nir->info.cs.local_size_variable) {<br>
> continue;<br>
> + }<br>
> <br>
> /* For small workgroup sizes, we know subgroup_id will be zero */<br>
> sysval = nir_imm_int(b, 0);<br>
> break;<br>
> <br>
> case nir_intrinsic_load_num_subgroups: {<br>
> - unsigned local_workgroup_size =<br>
> - nir->info.cs.local_size[0] * nir->info.cs.local_size[1] *<br>
> - nir->info.cs.local_size[2];<br>
> - unsigned num_subgroups =<br>
> - DIV_ROUND_UP(local_workgroup_size, state->dispatch_width);<br>
> - sysval = nir_imm_int(b, num_subgroups);<br>
> + if (!state->nir->info.cs.local_size_variable) {<br>
> + unsigned num_subgroups;<br>
> + unsigned local_workgroup_size =<br>
> + nir->info.cs.local_size[0] * nir->info.cs.local_size[1] *<br>
> + nir->info.cs.local_size[2];<br>
> + num_subgroups =<br>
> + DIV_ROUND_UP(local_workgroup_size, state->dispatch_width);<br>
> + sysval = nir_imm_int(b, num_subgroups);<br>
> + } else {<br>
> + nir_ssa_def *dispatch_width = nir_imm_int(b, state->dispatch_width);<br>
> + nir_ssa_def *group_size_xyz = nir_load_local_group_size(b);<br>
> + nir_ssa_def *group_size_x = nir_channel(b, group_size_xyz, 0);<br>
> + nir_ssa_def *group_size_y = nir_channel(b, group_size_xyz, 1);<br>
> + nir_ssa_def *group_size_z = nir_channel(b, group_size_xyz, 2);<br>
> + nir_ssa_def *group_size = nir_imul(b, group_size_x, nir_imul(b,<br>
> + group_size_y, group_size_z));<br>
> +<br>
> + sysval = nir_udiv(b, group_size, dispatch_width);<br>
<br>
I guess for DIV_ROUND_UP like above, you'd want to add<br>
(dispatch_width - 1) before the udiv.<br>
<br>
-Jordan<br>
<br>
> + }<br>
> break;<br>
> }<br>
> <br>
> + case nir_intrinsic_load_global_invocation_id: {<br>
> + nir_ssa_def *group_id = nir_load_work_group_id(b);<br>
> + nir_ssa_def *local_id = nir_load_local_invocation_id(b);<br>
> + nir_ssa_def *group_size = nir_load_local_group_size(b);<br>
> +<br>
> + sysval = nir_iadd(b, nir_imul(b, group_id, group_size), local_id);<br>
> + break;<br>
> + }<br>
> default:<br>
> continue;<br>
> }<br>
> diff --git a/src/mesa/drivers/dri/i965/brw_compute.c b/src/mesa/drivers/dri/i965/brw_compute.c<br>
> index de08fc3ac1..7949e0ff51 100644<br>
> --- a/src/mesa/drivers/dri/i965/brw_compute.c<br>
> +++ b/src/mesa/drivers/dri/i965/brw_compute.c<br>
> @@ -121,8 +121,11 @@ brw_emit_gpgpu_walker(struct brw_context *brw)<br>
> }<br>
> <br>
> const unsigned simd_size = prog_data->simd_size;<br>
> - unsigned group_size = prog_data->local_size[0] *<br>
> - prog_data->local_size[1] * prog_data->local_size[2];<br>
> + unsigned group_size = brw->compute.group_size != NULL ?<br>
> + brw->compute.group_size[0] * brw->compute.group_size[1] *<br>
> + brw->compute.group_size[2] : prog_data->local_size[0] *<br>
> + prog_data->local_size[1] * prog_data->local_size[2];<br>
> +<br>
> unsigned thread_width_max =<br>
> (group_size + simd_size - 1) / simd_size;<br>
> <br>
> @@ -229,6 +232,7 @@ brw_dispatch_compute(struct gl_context *ctx, const GLuint *num_groups) {<br>
> <br>
> brw->compute.num_work_groups_bo = NULL;<br>
> brw->compute.num_work_groups = num_groups;<br>
> + brw->compute.group_size = NULL;<br>
> ctx->NewDriverState |= BRW_NEW_CS_WORK_GROUPS;<br>
> <br>
> brw_dispatch_compute_common(ctx);<br>
> @@ -248,6 +252,22 @@ brw_dispatch_compute_indirect(struct gl_context *ctx, GLintptr indirect)<br>
> brw->compute.num_work_groups_bo = bo;<br>
> brw->compute.num_work_groups_offset = indirect;<br>
> brw->compute.num_work_groups = indirect_group_counts;<br>
> + brw->compute.group_size = NULL;<br>
> + ctx->NewDriverState |= BRW_NEW_CS_WORK_GROUPS;<br>
> +<br>
> + brw_dispatch_compute_common(ctx);<br>
> +}<br>
> +<br>
> +static void<br>
> +brw_dispatch_compute_group_size(struct gl_context *ctx,<br>
> + const GLuint *num_groups,<br>
> + const GLuint *group_size)<br>
> +{<br>
> + struct brw_context *brw = brw_context(ctx);<br>
> +<br>
> + brw->compute.num_work_groups_bo = NULL;<br>
> + brw->compute.num_work_groups = num_groups;<br>
> + brw->compute.group_size = group_size;<br>
> ctx->NewDriverState |= BRW_NEW_CS_WORK_GROUPS;<br>
> <br>
> brw_dispatch_compute_common(ctx);<br>
> @@ -258,4 +278,5 @@ brw_init_compute_functions(struct dd_function_table *functions)<br>
> {<br>
> functions->DispatchCompute = brw_dispatch_compute;<br>
> functions->DispatchComputeIndirect = brw_dispatch_compute_indirect;<br>
> + functions->DispatchComputeGroupSize = brw_dispatch_compute_group_size;<br>
> }<br>
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h<br>
> index 2613b9fda2..0fb533c369 100644<br>
> --- a/src/mesa/drivers/dri/i965/brw_context.h<br>
> +++ b/src/mesa/drivers/dri/i965/brw_context.h<br>
> @@ -931,6 +931,7 @@ struct brw_context<br>
> struct brw_bo *num_work_groups_bo;<br>
> GLintptr num_work_groups_offset;<br>
> const GLuint *num_work_groups;<br>
> + const GLuint *group_size;<br>
> } compute;<br>
> <br>
> struct {<br>
> diff --git a/src/mesa/drivers/dri/i965/brw_cs.c b/src/mesa/drivers/dri/i965/brw_cs.c<br>
> index e3f8fc67a4..007273390b 100644<br>
> --- a/src/mesa/drivers/dri/i965/brw_cs.c<br>
> +++ b/src/mesa/drivers/dri/i965/brw_cs.c<br>
> @@ -43,6 +43,10 @@ assign_cs_binding_table_offsets(const struct gen_device_info *devinfo,<br>
> prog_data->binding_table.work_groups_start = next_binding_table_offset;<br>
> next_binding_table_offset++;<br>
> <br>
> + /* May not be used if the work group size is not variable. */<br>
> + prog_data->binding_table.work_group_size_start = next_binding_table_offset;<br>
> + next_binding_table_offset++;<br>
> +<br>
> brw_assign_common_binding_table_offsets(devinfo, prog, &prog_data->base,<br>
> next_binding_table_offset);<br>
> }<br>
> diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c<br>
> index 73cae9ef7c..fa8851e2b4 100644<br>
> --- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c<br>
> +++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c<br>
> @@ -1634,7 +1634,7 @@ const struct brw_tracked_state brw_wm_image_surfaces = {<br>
> };<br>
> <br>
> static void<br>
> -brw_upload_cs_work_groups_surface(struct brw_context *brw)<br>
> +brw_upload_cs_variable_surfaces(struct brw_context *brw)<br>
> {<br>
> struct gl_context *ctx = &brw->ctx;<br>
> /* _NEW_PROGRAM */<br>
> @@ -1671,6 +1671,29 @@ brw_upload_cs_work_groups_surface(struct brw_context *brw)<br>
> RELOC_WRITE);<br>
> brw->ctx.NewDriverState |= BRW_NEW_SURFACES;<br>
> }<br>
> +<br>
> + if (prog && cs_prog_data->uses_variable_group_size) {<br>
> + const unsigned surf_idx =<br>
> + cs_prog_data->binding_table.work_group_size_start;<br>
> + uint32_t *surf_offset = &brw->cs.base.surf_offset[surf_idx];<br>
> + struct brw_bo *bo;<br>
> + uint32_t bo_offset;<br>
> +<br>
> + bo = NULL;<br>
> + brw_upload_data(&brw->upload,<br>
> + (void *)brw->compute.group_size,<br>
> + 3 * sizeof(GLuint),<br>
> + sizeof(GLuint),<br>
> + &bo,<br>
> + &bo_offset);<br>
> +<br>
> + brw_emit_buffer_surface_state(brw, surf_offset,<br>
> + bo, bo_offset,<br>
> + ISL_FORMAT_RAW,<br>
> + 3 * sizeof(GLuint), 1,<br>
> + RELOC_WRITE);<br>
> + brw->ctx.NewDriverState |= BRW_NEW_SURFACES;<br>
> + }<br>
> }<br>
> <br>
> const struct brw_tracked_state brw_cs_work_groups_surface = {<br>
> @@ -1678,5 +1701,5 @@ const struct brw_tracked_state brw_cs_work_groups_surface = {<br>
> .brw = BRW_NEW_CS_PROG_DATA |<br>
> BRW_NEW_CS_WORK_GROUPS<br>
> },<br>
> - .emit = brw_upload_cs_work_groups_surface,<br>
> + .emit = brw_upload_cs_variable_surfaces,<br>
> };<br>
> diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c<br>
> index 5a9369d7b4..f213360ed8 100644<br>
> --- a/src/mesa/drivers/dri/i965/intel_extensions.c<br>
> +++ b/src/mesa/drivers/dri/i965/intel_extensions.c<br>
> @@ -258,6 +258,7 @@ intelInitExtensions(struct gl_context *ctx)<br>
> ctx->Extensions.ARB_compute_shader = true;<br>
> ctx->Extensions.ARB_ES3_1_compatibility =<br>
> devinfo->gen >= 8 || devinfo->is_haswell;<br>
> + ctx->Extensions.ARB_compute_variable_group_size = true;<br>
> }<br>
> <br>
> if (can_do_predicate_writes(brw->screen)) {<br>
> -- <br>
> 2.11.0<br>
> <br>
</blockquote></div></div>