<div dir="ltr"><br><div class="gmail_extra"><br><div class="gmail_quote">On Fri, May 27, 2016 at 2:46 PM, Jordan Justen <span dir="ltr"><<a href="mailto:jordan.l.justen@intel.com" target="_blank">jordan.l.justen@intel.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div class="HOEnZb"><div class="h5">On 2016-05-27 14:23:39, Jason Ekstrand wrote:<br>
>    On Fri, May 27, 2016 at 11:24 AM, Jordan Justen<br>
>    <<a href="mailto:jordan.l.justen@intel.com">jordan.l.justen@intel.com</a>> wrote:<br>
><br>
>      This thread ID uniform will be used to compute the<br>
>      gl_LocalInvocationIndex and gl_LocalInvocationID values.<br>
><br>
>      It is important for this uniform to be added in the last push constant<br>
>      register. fs_visitor::assign_constant_locations is updated to make<br>
>      sure this happens.<br>
><br>
>      The reason this is important is that the cross-thread push constant<br>
>      registers are loaded first, and the per-thread push constant registers<br>
>      are loaded after that. (Broadwell adds another push constant upload<br>
>      mechanism which reverses this order, but we are ignoring this for<br>
>      now.)<br>
><br>
>      v2:<br>
>       * Add variable in intrinsics lowering pass<br>
>       * Make sure the ID is pushed last in assign_constant_locations, and<br>
>         that we save a spot for the ID in the push constants<br>
><br>
>      Signed-off-by: Jordan Justen <<a href="mailto:jordan.l.justen@intel.com">jordan.l.justen@intel.com</a>><br>
>      ---<br>
>       src/mesa/drivers/dri/i965/brw_compiler.h |  1 +<br>
>       src/mesa/drivers/dri/i965/brw_fs.cpp     | 49<br>
>      +++++++++++++++++++++++++++++---<br>
>       2 files changed, 46 insertions(+), 4 deletions(-)<br>
><br>
>      diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h<br>
>      b/src/mesa/drivers/dri/i965/brw_compiler.h<br>
>      index a8fb486..f8379bc 100644<br>
>      --- a/src/mesa/drivers/dri/i965/brw_compiler.h<br>
>      +++ b/src/mesa/drivers/dri/i965/brw_compiler.h<br>
>      @@ -433,6 +433,7 @@ struct brw_cs_prog_data {<br>
>          bool uses_barrier;<br>
>          bool uses_num_work_groups;<br>
>          unsigned local_invocation_id_regs;<br>
>      +   int thread_local_id_index;<br>
><br>
>          struct {<br>
>             /** @{<br>
>      diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp<br>
>      b/src/mesa/drivers/dri/i965/brw_fs.cpp<br>
>      index bb2caa5..82b6781 100644<br>
>      --- a/src/mesa/drivers/dri/i965/brw_fs.cpp<br>
>      +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp<br>
>      @@ -2086,6 +2086,10 @@ fs_visitor::assign_constant_locations()<br>
>          bool contiguous[uniforms];<br>
>          memset(contiguous, 0, sizeof(contiguous));<br>
><br>
>      +   int thread_local_id_index =<br>
>      +      (stage == MESA_SHADER_COMPUTE) ?<br>
>      +      ((brw_cs_prog_data*)stage_prog_data)->thread_local_id_index : -1;<br>
>      +<br>
>          /* First, we walk through the instructions and do two things:<br>
>           *<br>
>           *  1) Figure out which uniforms are live.<br>
>      @@ -2130,6 +2134,9 @@ fs_visitor::assign_constant_locations()<br>
>             }<br>
>          }<br>
><br>
>      +   if (thread_local_id_index >= 0 && !is_live[thread_local_id_index])<br>
>      +      thread_local_id_index = -1;<br>
>      +<br>
>          /* Only allow 16 registers (128 uniform components) as push<br>
>      constants.<br>
>           *<br>
>           * Just demote the end of the list.  We could probably do better<br>
>      @@ -2158,6 +2165,9 @@ fs_visitor::assign_constant_locations()<br>
><br>
>          int chunk_start = -1;<br>
><br>
>      +   /* We may need to save a slot for the thread ID */<br>
>      +   unsigned int saved_slots = thread_local_id_index >= 0 ? 1 : 0;<br>
>      +<br>
>          /* First push 64-bit uniforms to ensure they are properly aligned */<br>
>          for (unsigned u = 0; u < uniforms; u++) {<br>
>             if (!is_live[u] || !is_live_64bit[u])<br>
>      @@ -2166,8 +2176,8 @@ fs_visitor::assign_constant_locations()<br>
>             set_push_pull_constant_loc(u, &chunk_start, contiguous[u],<br>
>                                        push_constant_loc, pull_constant_loc,<br>
>                                        &num_push_constants,<br>
>      &num_pull_constants,<br>
>      -                                 max_push_components, max_chunk_size,<br>
>      -                                 stage_prog_data);<br>
>      +                                 max_push_components - saved_slots,<br>
>      +                                 max_chunk_size, stage_prog_data);<br>
><br>
>    This seems a bit heavy-handed.  I don't think we need to subtract<br>
>    saved_slots from max_push_components.  It's just a heuristic and if we end<br>
>    up burning one extra register, oh well.  Some day, we could try and make<br>
>    it smarter but I think "max_push_components -= 1" is probably as good as<br>
>    anything for that.<br>
><br>
><br>
>          }<br>
><br>
>      @@ -2176,13 +2186,29 @@ fs_visitor::assign_constant_locations()<br>
>             if (!is_live[u] || is_live_64bit[u])<br>
>                continue;<br>
><br>
>      +      /* Skip thread_local_id_index to put it in the last push<br>
>      register. */<br>
>      +      if (thread_local_id_index == (int)u)<br>
>      +         continue;<br>
>      +<br>
>      +      set_push_pull_constant_loc(u, &chunk_start, contiguous[u],<br>
>      +                                 push_constant_loc, pull_constant_loc,<br>
>      +                                 &num_push_constants,<br>
>      &num_pull_constants,<br>
>      +                                 max_push_components - saved_slots,<br>
>      +                                 max_chunk_size, stage_prog_data);<br>
>      +   }<br>
>      +<br>
>      +   if (thread_local_id_index >= 0) {<br>
>      +      /* Add the CS thread ID uniform at the end */<br>
>      +      unsigned u = thread_local_id_index;<br>
>             set_push_pull_constant_loc(u, &chunk_start, contiguous[u],<br>
>                                        push_constant_loc, pull_constant_loc,<br>
>                                        &num_push_constants,<br>
>      &num_pull_constants,<br>
>      -                                 max_push_components, max_chunk_size,<br>
>      -                                 stage_prog_data);<br>
>      +                                 max_push_components,<br>
>      +                                 max_chunk_size, stage_prog_data);<br>
>      +      assert(push_constant_loc[u] >= 0);<br>
><br>
>    Why not just<br>
><br>
>    if (thread_local_id_index >= 0)<br>
>        push_constant_loc[thread_local_id_index] = num_push_constants++<br>
><br>
>    Seems a lot better than calling a really complicated helper whose one job<br>
>    is to decide whether or not to push/pull something when we *always* want<br>
>    push.<br>
><br>
<br>
</div></div>Yeah, that sounds good.<br>
<div><div class="h5"><br>
><br>
>          }<br>
><br>
>      +<br>
><br>
>    Extra line<br>
><br>
><br>
>          /* As the uniforms are going to be reordered, take the data from a<br>
>      temporary<br>
>           * copy of the original param[].<br>
>           */<br>
>      @@ -2201,6 +2227,7 @@ fs_visitor::assign_constant_locations()<br>
>           * push_constant_loc[i] <= i and we can do it in one smooth loop<br>
>      without<br>
>           * having to make a copy.<br>
>           */<br>
>      +   int new_thread_local_id_index = -1;<br>
>          for (unsigned int i = 0; i < uniforms; i++) {<br>
>             const gl_constant_value *value = param[i];<br>
><br>
>      @@ -2208,9 +2235,15 @@ fs_visitor::assign_constant_locations()<br>
>                stage_prog_data->pull_param[pull_constant_loc[i]] = value;<br>
>             } else if (push_constant_loc[i] != -1) {<br>
>                stage_prog_data->param[push_constant_loc[i]] = value;<br>
>      +         if (thread_local_id_index == (int)i)<br>
>      +            new_thread_local_id_index = push_constant_loc[i];<br>
>             }<br>
>          }<br>
>          ralloc_free(param);<br>
>      +<br>
>      +   if (stage == MESA_SHADER_COMPUTE)<br>
>      +      ((brw_cs_prog_data*)stage_prog_data)->thread_local_id_index =<br>
>      +         new_thread_local_id_index;<br>
>       }<br>
><br>
>       /**<br>
>      @@ -6185,6 +6218,14 @@ brw_compile_cs(const struct brw_compiler<br>
>      *compiler, void *log_data,<br>
>             shader->info.cs.local_size[0] * shader->info.cs.local_size[1] *<br>
>             shader->info.cs.local_size[2];<br>
><br>
>      +   prog_data->thread_local_id_index = -1;<br>
>      +   nir_foreach_variable(var, &shader->uniforms) {<br>
>      +      if (strcmp(var->name, "cs_thread_local_id") == 0) {<br>
><br>
>    Hrm... this seems a bit ugly...  I'll think about it a bit.<br>
><br>
<br>
</div></div>Yes, it is ugly! :( I used to have this in two places in v1, but Ken<br>
had a suggestion that fixed one of those.<br>
<br>
One idea that I had was that it would be nice if nir_shader let a<br>
driver store a bit of driver specific context data associated with the<br>
shader. (Ie, add a void * associated with the shader.)<br>
<br>
Then in the early phase of driver nir code before prog_data, we could<br>
still have some driver specific data associated with the shader. This<br>
would allow that code to save off the index after creating the<br>
variable, or something similar.<br>
<br>
Or, perhaps you have another idea?<span class="HOEnZb"><font color="#888888"><br></font></span></blockquote><div><br></div><div>Assuming we go with my suggestion of making brw_nir_lower_intrinsics take a thread_id_location parameter, we could just call it in brw_compile_cs before brw_postprocess_nir.  Then we would already have the location and there would be no need for this hack at all.  That's the best I've been able to come up with so far and I think it works pretty well in theory.  That said, an attempt to implement it may indicate otherwise. :-)<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><span class="HOEnZb"><font color="#888888">
-Jordan<br>
</font></span><div class="HOEnZb"><div class="h5"><br>
><br>
>      +         prog_data->thread_local_id_index = var->data.driver_location /<br>
>      4;<br>
>      +         break;<br>
>      +      }<br>
>      +   }<br>
>      +<br>
>          unsigned max_cs_threads = compiler->devinfo->max_cs_threads;<br>
>          unsigned simd_required = DIV_ROUND_UP(local_workgroup_size,<br>
>      max_cs_threads);<br>
>      --<br>
>      2.8.1<br>
><br>
>      _______________________________________________<br>
>      mesa-dev mailing list<br>
>      <a href="mailto:mesa-dev@lists.freedesktop.org">mesa-dev@lists.freedesktop.org</a><br>
>      <a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev" rel="noreferrer" target="_blank">https://lists.freedesktop.org/mailman/listinfo/mesa-dev</a><br>
</div></div></blockquote></div><br></div></div>