[Mesa-dev] [PATCH v2 05/13] i965: Track and place CS thread ID uniform

Fri May 27 21:23:39 UTC 2016

On Fri, May 27, 2016 at 11:24 AM, Jordan Justen <jordan.l.justen at intel.com>
wrote:

> This thread ID uniform will be used to compute the
> gl_LocalInvocationIndex and gl_LocalInvocationID values.
>
> It is important for this uniform to be added in the last push constant
> register. fs_visitor::assign_constant_locations is updated to make
> sure this happens.
>
> The reason this is important is that the cross-thread push constant
> registers are loaded first, and the per-thread push constant registers
> are loaded after that. (Broadwell adds another push constant upload
> mechanism which reverses this order, but we are ignoring this for
> now.)
>
> v2:
>  * Add variable in intrinsics lowering pass
>  * Make sure the ID is pushed last in assign_constant_locations, and
>    that we save a spot for the ID in the push constants
>
> Signed-off-by: Jordan Justen <jordan.l.justen at intel.com>
> ---
>  src/mesa/drivers/dri/i965/brw_compiler.h |  1 +
>  src/mesa/drivers/dri/i965/brw_fs.cpp     | 49
> +++++++++++++++++++++++++++++---
>  2 files changed, 46 insertions(+), 4 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h
> b/src/mesa/drivers/dri/i965/brw_compiler.h
> index a8fb486..f8379bc 100644
> --- a/src/mesa/drivers/dri/i965/brw_compiler.h
> +++ b/src/mesa/drivers/dri/i965/brw_compiler.h
> @@ -433,6 +433,7 @@ struct brw_cs_prog_data {
>     bool uses_barrier;
>     bool uses_num_work_groups;
>     unsigned local_invocation_id_regs;
> +   int thread_local_id_index;
>
>     struct {
>        /** @{
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp
> b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index bb2caa5..82b6781 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -2086,6 +2086,10 @@ fs_visitor::assign_constant_locations()
>     bool contiguous[uniforms];
>     memset(contiguous, 0, sizeof(contiguous));
>
> +   int thread_local_id_index =
> +      (stage == MESA_SHADER_COMPUTE) ?
> +      ((brw_cs_prog_data*)stage_prog_data)->thread_local_id_index : -1;
> +
>     /* First, we walk through the instructions and do two things:
>      *
>      *  1) Figure out which uniforms are live.
> @@ -2130,6 +2134,9 @@ fs_visitor::assign_constant_locations()
>        }
>     }
>
> +   if (thread_local_id_index >= 0 && !is_live[thread_local_id_index])
> +      thread_local_id_index = -1;
> +
>     /* Only allow 16 registers (128 uniform components) as push constants.
>      *
>      * Just demote the end of the list.  We could probably do better
> @@ -2158,6 +2165,9 @@ fs_visitor::assign_constant_locations()
>
>     int chunk_start = -1;
>
> +   /* We may need to save a slot for the thread ID */
> +   unsigned int saved_slots = thread_local_id_index >= 0 ? 1 : 0;
> +
>     /* First push 64-bit uniforms to ensure they are properly aligned */
>     for (unsigned u = 0; u < uniforms; u++) {
>        if (!is_live[u] || !is_live_64bit[u])
> @@ -2166,8 +2176,8 @@ fs_visitor::assign_constant_locations()
>        set_push_pull_constant_loc(u, &chunk_start, contiguous[u],
>                                   push_constant_loc, pull_constant_loc,
>                                   &num_push_constants, &num_pull_constants,
> -                                 max_push_components, max_chunk_size,
> -                                 stage_prog_data);
> +                                 max_push_components - saved_slots,
> +                                 max_chunk_size, stage_prog_data);
>

This seems a bit heavy-handed.  I don't think we need to subtract
saved_slots from max_push_components.  It's just a heuristic and if we end
up burning one extra register, oh well.  Some day, we could try and make it
smarter but I think "max_push_components -= 1" is probably as good as
anything for that.

>
>     }
>
> @@ -2176,13 +2186,29 @@ fs_visitor::assign_constant_locations()
>        if (!is_live[u] || is_live_64bit[u])
>           continue;
>
> +      /* Skip thread_local_id_index to put it in the last push register.
> */
> +      if (thread_local_id_index == (int)u)
> +         continue;
> +
> +      set_push_pull_constant_loc(u, &chunk_start, contiguous[u],
> +                                 push_constant_loc, pull_constant_loc,
> +                                 &num_push_constants, &num_pull_constants,
> +                                 max_push_components - saved_slots,
> +                                 max_chunk_size, stage_prog_data);
> +   }
> +
> +   if (thread_local_id_index >= 0) {
> +      /* Add the CS thread ID uniform at the end */
> +      unsigned u = thread_local_id_index;
>        set_push_pull_constant_loc(u, &chunk_start, contiguous[u],
>                                   push_constant_loc, pull_constant_loc,
>                                   &num_push_constants, &num_pull_constants,
> -                                 max_push_components, max_chunk_size,
> -                                 stage_prog_data);
> +                                 max_push_components,
> +                                 max_chunk_size, stage_prog_data);
> +      assert(push_constant_loc[u] >= 0);
>

Why not just

if (thread_local_id_index >= 0)
    push_constant_loc[thread_local_id_index] = num_push_constants++

Seems a lot better than calling a really complicated helper whose one job
is to decide whether or not to push/pull something when we *always* want
push.

>     }
>
> +
>

Extra line

>     /* As the uniforms are going to be reordered, take the data from a
> temporary
>      * copy of the original param[].
>      */
> @@ -2201,6 +2227,7 @@ fs_visitor::assign_constant_locations()
>      * push_constant_loc[i] <= i and we can do it in one smooth loop
> without
>      * having to make a copy.
>      */
> +   int new_thread_local_id_index = -1;
>     for (unsigned int i = 0; i < uniforms; i++) {
>        const gl_constant_value *value = param[i];
>
> @@ -2208,9 +2235,15 @@ fs_visitor::assign_constant_locations()
>           stage_prog_data->pull_param[pull_constant_loc[i]] = value;
>        } else if (push_constant_loc[i] != -1) {
>           stage_prog_data->param[push_constant_loc[i]] = value;
> +         if (thread_local_id_index == (int)i)
> +            new_thread_local_id_index = push_constant_loc[i];
>        }
>     }
>     ralloc_free(param);
> +
> +   if (stage == MESA_SHADER_COMPUTE)
> +      ((brw_cs_prog_data*)stage_prog_data)->thread_local_id_index =
> +         new_thread_local_id_index;
>  }
>
>  /**
> @@ -6185,6 +6218,14 @@ brw_compile_cs(const struct brw_compiler *compiler,
> void *log_data,
>        shader->info.cs.local_size[0] * shader->info.cs.local_size[1] *
>        shader->info.cs.local_size[2];
>
> +   prog_data->thread_local_id_index = -1;
> +   nir_foreach_variable(var, &shader->uniforms) {
> +      if (strcmp(var->name, "cs_thread_local_id") == 0) {
>

Hrm... this seems a bit ugly...  I'll think about it a bit.

> +         prog_data->thread_local_id_index = var->data.driver_location / 4;
> +         break;
> +      }
> +   }
> +
>     unsigned max_cs_threads = compiler->devinfo->max_cs_threads;
>     unsigned simd_required = DIV_ROUND_UP(local_workgroup_size,
> max_cs_threads);
>
> --
> 2.8.1
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20160527/cfa8537d/attachment.html>