<div dir="ltr"><br><div class="gmail_extra"><br><div class="gmail_quote">On Fri, May 27, 2016 at 11:24 AM, Jordan Justen <span dir="ltr"><<a href="mailto:jordan.l.justen@intel.com" target="_blank">jordan.l.justen@intel.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">This thread ID uniform will be used to compute the<br>
gl_LocalInvocationIndex and gl_LocalInvocationID values.<br>
<br>
It is important for this uniform to be added in the last push constant<br>
register. fs_visitor::assign_constant_locations is updated to make<br>
sure this happens.<br>
<br>
The reason this is important is that the cross-thread push constant<br>
registers are loaded first, and the per-thread push constant registers<br>
are loaded after that. (Broadwell adds another push constant upload<br>
mechanism which reverses this order, but we are ignoring this for<br>
now.)<br>
<br>
v2:<br>
 * Add variable in intrinsics lowering pass<br>
 * Make sure the ID is pushed last in assign_constant_locations, and<br>
   that we save a spot for the ID in the push constants<br>
<br>
Signed-off-by: Jordan Justen <<a href="mailto:jordan.l.justen@intel.com">jordan.l.justen@intel.com</a>><br>
---<br>
 src/mesa/drivers/dri/i965/brw_compiler.h |  1 +<br>
 src/mesa/drivers/dri/i965/brw_fs.cpp     | 49 +++++++++++++++++++++++++++++---<br>
 2 files changed, 46 insertions(+), 4 deletions(-)<br>
<br>
diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h<br>
index a8fb486..f8379bc 100644<br>
--- a/src/mesa/drivers/dri/i965/brw_compiler.h<br>
+++ b/src/mesa/drivers/dri/i965/brw_compiler.h<br>
@@ -433,6 +433,7 @@ struct brw_cs_prog_data {<br>
    bool uses_barrier;<br>
    bool uses_num_work_groups;<br>
    unsigned local_invocation_id_regs;<br>
+   int thread_local_id_index;<br>
<br>
    struct {<br>
       /** @{<br>
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp<br>
index bb2caa5..82b6781 100644<br>
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp<br>
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp<br>
@@ -2086,6 +2086,10 @@ fs_visitor::assign_constant_locations()<br>
    bool contiguous[uniforms];<br>
    memset(contiguous, 0, sizeof(contiguous));<br>
<br>
+   int thread_local_id_index =<br>
+      (stage == MESA_SHADER_COMPUTE) ?<br>
+      ((brw_cs_prog_data*)stage_prog_data)->thread_local_id_index : -1;<br>
+<br>
    /* First, we walk through the instructions and do two things:<br>
     *<br>
     *  1) Figure out which uniforms are live.<br>
@@ -2130,6 +2134,9 @@ fs_visitor::assign_constant_locations()<br>
       }<br>
    }<br>
<br>
+   if (thread_local_id_index >= 0 && !is_live[thread_local_id_index])<br>
+      thread_local_id_index = -1;<br>
+<br>
    /* Only allow 16 registers (128 uniform components) as push constants.<br>
     *<br>
     * Just demote the end of the list.  We could probably do better<br>
@@ -2158,6 +2165,9 @@ fs_visitor::assign_constant_locations()<br>
<br>
    int chunk_start = -1;<br>
<br>
+   /* We may need to save a slot for the thread ID */<br>
+   unsigned int saved_slots = thread_local_id_index >= 0 ? 1 : 0;<br>
+<br>
    /* First push 64-bit uniforms to ensure they are properly aligned */<br>
    for (unsigned u = 0; u < uniforms; u++) {<br>
       if (!is_live[u] || !is_live_64bit[u])<br>
@@ -2166,8 +2176,8 @@ fs_visitor::assign_constant_locations()<br>
       set_push_pull_constant_loc(u, &chunk_start, contiguous[u],<br>
                                  push_constant_loc, pull_constant_loc,<br>
                                  &num_push_constants, &num_pull_constants,<br>
-                                 max_push_components, max_chunk_size,<br>
-                                 stage_prog_data);<br>
+                                 max_push_components - saved_slots,<br>
+                                 max_chunk_size, stage_prog_data);<br></blockquote><div><br></div><div>This seems a bit heavy-handed.  I don't think we need to subtract saved_slots from max_push_components.  It's just a heuristic and if we end up burning one extra register, oh well.  Some day, we could try and make it smarter but I think "max_push_components -= 1" is probably as good as anything for that.<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
<br>
    }<br>
<br>
@@ -2176,13 +2186,29 @@ fs_visitor::assign_constant_locations()<br>
       if (!is_live[u] || is_live_64bit[u])<br>
          continue;<br>
<br>
+      /* Skip thread_local_id_index to put it in the last push register. */<br>
+      if (thread_local_id_index == (int)u)<br>
+         continue;<br>
+<br>
+      set_push_pull_constant_loc(u, &chunk_start, contiguous[u],<br>
+                                 push_constant_loc, pull_constant_loc,<br>
+                                 &num_push_constants, &num_pull_constants,<br>
+                                 max_push_components - saved_slots,<br>
+                                 max_chunk_size, stage_prog_data);<br>
+   }<br>
+<br>
+   if (thread_local_id_index >= 0) {<br>
+      /* Add the CS thread ID uniform at the end */<br>
+      unsigned u = thread_local_id_index;<br>
       set_push_pull_constant_loc(u, &chunk_start, contiguous[u],<br>
                                  push_constant_loc, pull_constant_loc,<br>
                                  &num_push_constants, &num_pull_constants,<br>
-                                 max_push_components, max_chunk_size,<br>
-                                 stage_prog_data);<br>
+                                 max_push_components,<br>
+                                 max_chunk_size, stage_prog_data);<br>
+      assert(push_constant_loc[u] >= 0);<br></blockquote><div><br></div><div>Why not just<br><br></div><div>if (thread_local_id_index >= 0)<br></div><div>    push_constant_loc[thread_local_id_index] = num_push_constants++<br><br></div><div>Seems a lot better than calling a really complicated helper whose one job is to decide whether or not to push/pull something when we *always* want push.<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
    }<br>
<br>
+<br></blockquote><div><br></div><div>Extra line<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
    /* As the uniforms are going to be reordered, take the data from a temporary<br>
     * copy of the original param[].<br>
     */<br>
@@ -2201,6 +2227,7 @@ fs_visitor::assign_constant_locations()<br>
     * push_constant_loc[i] <= i and we can do it in one smooth loop without<br>
     * having to make a copy.<br>
     */<br>
+   int new_thread_local_id_index = -1;<br>
    for (unsigned int i = 0; i < uniforms; i++) {<br>
       const gl_constant_value *value = param[i];<br>
<br>
@@ -2208,9 +2235,15 @@ fs_visitor::assign_constant_locations()<br>
          stage_prog_data->pull_param[pull_constant_loc[i]] = value;<br>
       } else if (push_constant_loc[i] != -1) {<br>
          stage_prog_data->param[push_constant_loc[i]] = value;<br>
+         if (thread_local_id_index == (int)i)<br>
+            new_thread_local_id_index = push_constant_loc[i];<br>
       }<br>
    }<br>
    ralloc_free(param);<br>
+<br>
+   if (stage == MESA_SHADER_COMPUTE)<br>
+      ((brw_cs_prog_data*)stage_prog_data)->thread_local_id_index =<br>
+         new_thread_local_id_index;<br>
 }<br>
<br>
 /**<br>
@@ -6185,6 +6218,14 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,<br>
       shader->info.cs.local_size[0] * shader->info.cs.local_size[1] *<br>
       shader->info.cs.local_size[2];<br>
<br>
+   prog_data->thread_local_id_index = -1;<br>
+   nir_foreach_variable(var, &shader->uniforms) {<br>
+      if (strcmp(var->name, "cs_thread_local_id") == 0) {<br></blockquote><div><br></div><div>Hrm... this seems a bit ugly...  I'll think about it a bit.<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+         prog_data->thread_local_id_index = var->data.driver_location / 4;<br>
+         break;<br>
+      }<br>
+   }<br>
+<br>
    unsigned max_cs_threads = compiler->devinfo->max_cs_threads;<br>
    unsigned simd_required = DIV_ROUND_UP(local_workgroup_size, max_cs_threads);<br>
<span class="HOEnZb"><font color="#888888"><br>
--<br>
2.8.1<br>
<br>
_______________________________________________<br>
mesa-dev mailing list<br>
<a href="mailto:mesa-dev@lists.freedesktop.org">mesa-dev@lists.freedesktop.org</a><br>
<a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev" rel="noreferrer" target="_blank">https://lists.freedesktop.org/mailman/listinfo/mesa-dev</a><br>
</font></span></blockquote></div><br></div></div>