<div dir="ltr"><br><div class="gmail_extra"><br><div class="gmail_quote">On Fri, May 27, 2016 at 11:24 AM, Jordan Justen <span dir="ltr"><<a href="mailto:jordan.l.justen@intel.com" target="_blank">jordan.l.justen@intel.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">This thread ID uniform will be used to compute the<br>
gl_LocalInvocationIndex and gl_LocalInvocationID values.<br>
<br>
It is important for this uniform to be added in the last push constant<br>
register. fs_visitor::assign_constant_locations is updated to make<br>
sure this happens.<br>
<br>
The reason this is important is that the cross-thread push constant<br>
registers are loaded first, and the per-thread push constant registers<br>
are loaded after that. (Broadwell adds another push constant upload<br>
mechanism which reverses this order, but we are ignoring this for<br>
now.)<br>
<br>
v2:<br>
* Add variable in intrinsics lowering pass<br>
* Make sure the ID is pushed last in assign_constant_locations, and<br>
that we save a spot for the ID in the push constants<br>
<br>
Signed-off-by: Jordan Justen <<a href="mailto:jordan.l.justen@intel.com">jordan.l.justen@intel.com</a>><br>
---<br>
src/mesa/drivers/dri/i965/brw_compiler.h | 1 +<br>
src/mesa/drivers/dri/i965/brw_fs.cpp | 49 +++++++++++++++++++++++++++++---<br>
2 files changed, 46 insertions(+), 4 deletions(-)<br>
<br>
diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h<br>
index a8fb486..f8379bc 100644<br>
--- a/src/mesa/drivers/dri/i965/brw_compiler.h<br>
+++ b/src/mesa/drivers/dri/i965/brw_compiler.h<br>
@@ -433,6 +433,7 @@ struct brw_cs_prog_data {<br>
bool uses_barrier;<br>
bool uses_num_work_groups;<br>
unsigned local_invocation_id_regs;<br>
+ int thread_local_id_index;<br>
<br>
struct {<br>
/** @{<br>
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp<br>
index bb2caa5..82b6781 100644<br>
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp<br>
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp<br>
@@ -2086,6 +2086,10 @@ fs_visitor::assign_constant_locations()<br>
bool contiguous[uniforms];<br>
memset(contiguous, 0, sizeof(contiguous));<br>
<br>
+ int thread_local_id_index =<br>
+ (stage == MESA_SHADER_COMPUTE) ?<br>
+ ((brw_cs_prog_data*)stage_prog_data)->thread_local_id_index : -1;<br>
+<br>
/* First, we walk through the instructions and do two things:<br>
*<br>
* 1) Figure out which uniforms are live.<br>
@@ -2130,6 +2134,9 @@ fs_visitor::assign_constant_locations()<br>
}<br>
}<br>
<br>
+ if (thread_local_id_index >= 0 && !is_live[thread_local_id_index])<br>
+ thread_local_id_index = -1;<br>
+<br>
/* Only allow 16 registers (128 uniform components) as push constants.<br>
*<br>
* Just demote the end of the list. We could probably do better<br>
@@ -2158,6 +2165,9 @@ fs_visitor::assign_constant_locations()<br>
<br>
int chunk_start = -1;<br>
<br>
+ /* We may need to save a slot for the thread ID */<br>
+ unsigned int saved_slots = thread_local_id_index >= 0 ? 1 : 0;<br>
+<br>
/* First push 64-bit uniforms to ensure they are properly aligned */<br>
for (unsigned u = 0; u < uniforms; u++) {<br>
if (!is_live[u] || !is_live_64bit[u])<br>
@@ -2166,8 +2176,8 @@ fs_visitor::assign_constant_locations()<br>
set_push_pull_constant_loc(u, &chunk_start, contiguous[u],<br>
push_constant_loc, pull_constant_loc,<br>
&num_push_constants, &num_pull_constants,<br>
- max_push_components, max_chunk_size,<br>
- stage_prog_data);<br>
+ max_push_components - saved_slots,<br>
+ max_chunk_size, stage_prog_data);<br></blockquote><div><br></div><div>This seems a bit heavy-handed. I don't think we need to subtract saved_slots from max_push_components. It's just a heuristic and if we end up burning one extra register, oh well. Some day, we could try and make it smarter but I think "max_push_components -= 1" is probably as good as anything for that.<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
<br>
}<br>
<br>
@@ -2176,13 +2186,29 @@ fs_visitor::assign_constant_locations()<br>
if (!is_live[u] || is_live_64bit[u])<br>
continue;<br>
<br>
+ /* Skip thread_local_id_index to put it in the last push register. */<br>
+ if (thread_local_id_index == (int)u)<br>
+ continue;<br>
+<br>
+ set_push_pull_constant_loc(u, &chunk_start, contiguous[u],<br>
+ push_constant_loc, pull_constant_loc,<br>
+ &num_push_constants, &num_pull_constants,<br>
+ max_push_components - saved_slots,<br>
+ max_chunk_size, stage_prog_data);<br>
+ }<br>
+<br>
+ if (thread_local_id_index >= 0) {<br>
+ /* Add the CS thread ID uniform at the end */<br>
+ unsigned u = thread_local_id_index;<br>
set_push_pull_constant_loc(u, &chunk_start, contiguous[u],<br>
push_constant_loc, pull_constant_loc,<br>
&num_push_constants, &num_pull_constants,<br>
- max_push_components, max_chunk_size,<br>
- stage_prog_data);<br>
+ max_push_components,<br>
+ max_chunk_size, stage_prog_data);<br>
+ assert(push_constant_loc[u] >= 0);<br></blockquote><div><br></div><div>Why not just<br><br></div><div>if (thread_local_id_index >= 0)<br></div><div> push_constant_loc[thread_local_id_index] = num_push_constants++<br><br></div><div>Seems a lot better than calling a really complicated helper whose one job is to decide whether or not to push/pull something when we *always* want push.<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
}<br>
<br>
+<br></blockquote><div><br></div><div>Extra line<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
/* As the uniforms are going to be reordered, take the data from a temporary<br>
* copy of the original param[].<br>
*/<br>
@@ -2201,6 +2227,7 @@ fs_visitor::assign_constant_locations()<br>
* push_constant_loc[i] <= i and we can do it in one smooth loop without<br>
* having to make a copy.<br>
*/<br>
+ int new_thread_local_id_index = -1;<br>
for (unsigned int i = 0; i < uniforms; i++) {<br>
const gl_constant_value *value = param[i];<br>
<br>
@@ -2208,9 +2235,15 @@ fs_visitor::assign_constant_locations()<br>
stage_prog_data->pull_param[pull_constant_loc[i]] = value;<br>
} else if (push_constant_loc[i] != -1) {<br>
stage_prog_data->param[push_constant_loc[i]] = value;<br>
+ if (thread_local_id_index == (int)i)<br>
+ new_thread_local_id_index = push_constant_loc[i];<br>
}<br>
}<br>
ralloc_free(param);<br>
+<br>
+ if (stage == MESA_SHADER_COMPUTE)<br>
+ ((brw_cs_prog_data*)stage_prog_data)->thread_local_id_index =<br>
+ new_thread_local_id_index;<br>
}<br>
<br>
/**<br>
@@ -6185,6 +6218,14 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,<br>
shader->info.cs.local_size[0] * shader->info.cs.local_size[1] *<br>
shader->info.cs.local_size[2];<br>
<br>
+ prog_data->thread_local_id_index = -1;<br>
+ nir_foreach_variable(var, &shader->uniforms) {<br>
+ if (strcmp(var->name, "cs_thread_local_id") == 0) {<br></blockquote><div><br></div><div>Hrm... this seems a bit ugly... I'll think about it a bit.<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+ prog_data->thread_local_id_index = var->data.driver_location / 4;<br>
+ break;<br>
+ }<br>
+ }<br>
+<br>
unsigned max_cs_threads = compiler->devinfo->max_cs_threads;<br>
unsigned simd_required = DIV_ROUND_UP(local_workgroup_size, max_cs_threads);<br>
<span class="HOEnZb"><font color="#888888"><br>
--<br>
2.8.1<br>
<br>
_______________________________________________<br>
mesa-dev mailing list<br>
<a href="mailto:mesa-dev@lists.freedesktop.org">mesa-dev@lists.freedesktop.org</a><br>
<a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev" rel="noreferrer" target="_blank">https://lists.freedesktop.org/mailman/listinfo/mesa-dev</a><br>
</font></span></blockquote></div><br></div></div>