[Mesa-dev] [PATCH v2 05/13] i965: Track and place CS thread ID uniform

Jordan Justen jordan.l.justen at intel.com
Fri May 27 18:24:30 UTC 2016


This thread ID uniform will be used to compute the
gl_LocalInvocationIndex and gl_LocalInvocationID values.

It is important for this uniform to be added in the last push constant
register. fs_visitor::assign_constant_locations is updated to make
sure this happens.

The reason this is important is that the cross-thread push constant
registers are loaded first, and the per-thread push constant registers
are loaded after that. (Broadwell adds another push constant upload
mechanism which reverses this order, but we are ignoring this for
now.)

v2:
 * Add variable in intrinsics lowering pass
 * Make sure the ID is pushed last in assign_constant_locations, and
   that we save a spot for the ID in the push constants

Signed-off-by: Jordan Justen <jordan.l.justen at intel.com>
---
 src/mesa/drivers/dri/i965/brw_compiler.h |  1 +
 src/mesa/drivers/dri/i965/brw_fs.cpp     | 49 +++++++++++++++++++++++++++++---
 2 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h
index a8fb486..f8379bc 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.h
+++ b/src/mesa/drivers/dri/i965/brw_compiler.h
@@ -433,6 +433,7 @@ struct brw_cs_prog_data {
    bool uses_barrier;
    bool uses_num_work_groups;
    unsigned local_invocation_id_regs;
+   int thread_local_id_index;
 
    struct {
       /** @{
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index bb2caa5..82b6781 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2086,6 +2086,10 @@ fs_visitor::assign_constant_locations()
    bool contiguous[uniforms];
    memset(contiguous, 0, sizeof(contiguous));
 
+   int thread_local_id_index =
+      (stage == MESA_SHADER_COMPUTE) ?
+      ((brw_cs_prog_data*)stage_prog_data)->thread_local_id_index : -1;
+
    /* First, we walk through the instructions and do two things:
     *
     *  1) Figure out which uniforms are live.
@@ -2130,6 +2134,9 @@ fs_visitor::assign_constant_locations()
       }
    }
 
+   if (thread_local_id_index >= 0 && !is_live[thread_local_id_index])
+      thread_local_id_index = -1;
+
    /* Only allow 16 registers (128 uniform components) as push constants.
     *
     * Just demote the end of the list.  We could probably do better
@@ -2158,6 +2165,9 @@ fs_visitor::assign_constant_locations()
 
    int chunk_start = -1;
 
+   /* We may need to save a slot for the thread ID */
+   unsigned int saved_slots = thread_local_id_index >= 0 ? 1 : 0;
+
    /* First push 64-bit uniforms to ensure they are properly aligned */
    for (unsigned u = 0; u < uniforms; u++) {
       if (!is_live[u] || !is_live_64bit[u])
@@ -2166,8 +2176,8 @@ fs_visitor::assign_constant_locations()
       set_push_pull_constant_loc(u, &chunk_start, contiguous[u],
                                  push_constant_loc, pull_constant_loc,
                                  &num_push_constants, &num_pull_constants,
-                                 max_push_components, max_chunk_size,
-                                 stage_prog_data);
+                                 max_push_components - saved_slots,
+                                 max_chunk_size, stage_prog_data);
 
    }
 
@@ -2176,13 +2186,29 @@ fs_visitor::assign_constant_locations()
       if (!is_live[u] || is_live_64bit[u])
          continue;
 
+      /* Skip thread_local_id_index to put it in the last push register. */
+      if (thread_local_id_index == (int)u)
+         continue;
+
+      set_push_pull_constant_loc(u, &chunk_start, contiguous[u],
+                                 push_constant_loc, pull_constant_loc,
+                                 &num_push_constants, &num_pull_constants,
+                                 max_push_components - saved_slots,
+                                 max_chunk_size, stage_prog_data);
+   }
+
+   if (thread_local_id_index >= 0) {
+      /* Add the CS thread ID uniform at the end */
+      unsigned u = thread_local_id_index;
       set_push_pull_constant_loc(u, &chunk_start, contiguous[u],
                                  push_constant_loc, pull_constant_loc,
                                  &num_push_constants, &num_pull_constants,
-                                 max_push_components, max_chunk_size,
-                                 stage_prog_data);
+                                 max_push_components,
+                                 max_chunk_size, stage_prog_data);
+      assert(push_constant_loc[u] >= 0);
    }
 
+
    /* As the uniforms are going to be reordered, take the data from a temporary
     * copy of the original param[].
     */
@@ -2201,6 +2227,7 @@ fs_visitor::assign_constant_locations()
     * push_constant_loc[i] <= i and we can do it in one smooth loop without
     * having to make a copy.
     */
+   int new_thread_local_id_index = -1;
    for (unsigned int i = 0; i < uniforms; i++) {
       const gl_constant_value *value = param[i];
 
@@ -2208,9 +2235,15 @@ fs_visitor::assign_constant_locations()
          stage_prog_data->pull_param[pull_constant_loc[i]] = value;
       } else if (push_constant_loc[i] != -1) {
          stage_prog_data->param[push_constant_loc[i]] = value;
+         if (thread_local_id_index == (int)i)
+            new_thread_local_id_index = push_constant_loc[i];
       }
    }
    ralloc_free(param);
+
+   if (stage == MESA_SHADER_COMPUTE)
+      ((brw_cs_prog_data*)stage_prog_data)->thread_local_id_index =
+         new_thread_local_id_index;
 }
 
 /**
@@ -6185,6 +6218,14 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
       shader->info.cs.local_size[0] * shader->info.cs.local_size[1] *
       shader->info.cs.local_size[2];
 
+   prog_data->thread_local_id_index = -1;
+   nir_foreach_variable(var, &shader->uniforms) {
+      if (strcmp(var->name, "cs_thread_local_id") == 0) {
+         prog_data->thread_local_id_index = var->data.driver_location / 4;
+         break;
+      }
+   }
+
    unsigned max_cs_threads = compiler->devinfo->max_cs_threads;
    unsigned simd_required = DIV_ROUND_UP(local_workgroup_size, max_cs_threads);
 
-- 
2.8.1



More information about the mesa-dev mailing list