[Mesa-dev] [PATCH 05/12] i965/cs: Split out helper for building local id payload

Kristian Høgsberg krh at bitplanet.net
Thu Oct 8 11:00:11 PDT 2015


On Thu, Oct 8, 2015 at 1:34 AM, Pohjolainen, Topi
<topi.pohjolainen at intel.com> wrote:
> On Wed, Oct 07, 2015 at 07:11:45AM -0700, Kristian H?gsberg Kristensen wrote:
>> The initial motivation for this patch was to avoid calling
>> brw_cs_prog_local_id_payload_dwords() in gen7_cs_state.c from the
>> compiler. This commit ends up refactoring things a bit more so as to
>> split out the logic to build the local id payload to brw_fs.cpp. This
>> moves the payload building closer to the compiler code that uses the
>> payload layout and makes it avaiable to other users of the compiler.
>                                   ^
>
>                               available

Thanks, fixed.

>>
>> Signed-off-by: Kristian Høgsberg Kristensen <krh at bitplanet.net>
>> ---
>>  src/mesa/drivers/dri/i965/brw_context.h   |  1 +
>>  src/mesa/drivers/dri/i965/brw_cs.h        |  5 +-
>>  src/mesa/drivers/dri/i965/brw_fs.cpp      | 68 ++++++++++++++++++++++++--
>>  src/mesa/drivers/dri/i965/gen7_cs_state.c | 80 ++++---------------------------
>>  4 files changed, 76 insertions(+), 78 deletions(-)
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
>> index 0a29a69..1869f28 100644
>> --- a/src/mesa/drivers/dri/i965/brw_context.h
>> +++ b/src/mesa/drivers/dri/i965/brw_context.h
>> @@ -484,6 +484,7 @@ struct brw_cs_prog_data {
>>     unsigned simd_size;
>>     bool uses_barrier;
>>     bool uses_num_work_groups;
>> +   unsigned local_invocation_id_regs;
>>
>>     struct {
>>        /** @{
>> diff --git a/src/mesa/drivers/dri/i965/brw_cs.h b/src/mesa/drivers/dri/i965/brw_cs.h
>> index 0c0ed2b..c07eb6c 100644
>> --- a/src/mesa/drivers/dri/i965/brw_cs.h
>> +++ b/src/mesa/drivers/dri/i965/brw_cs.h
>> @@ -48,8 +48,9 @@ brw_cs_emit(struct brw_context *brw,
>>              struct gl_shader_program *prog,
>>              unsigned *final_assembly_size);
>>
>> -unsigned
>> -brw_cs_prog_local_id_payload_dwords(unsigned dispatch_width);
>> +void
>> +brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data,
>> +                             void *buffer, uint32_t threads, uint32_t stride);
>>
>>  #ifdef __cplusplus
>>  }
>> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
>> index 1dee888..b4125aa 100644
>> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
>> @@ -4718,20 +4718,43 @@ fs_visitor::setup_vs_payload()
>>     payload.num_regs = 2;
>>  }
>>
>> +/**
>> + * We are building the local ID push constant data using the simplest possible
>> + * method. We simply push the local IDs directly as they should appear in the
>> + * registers for the uvec3 gl_LocalInvocationID variable.
>> + *
>> + * Therefore, for SIMD8, we use 3 full registers, and for SIMD16 we use 6
>> + * registers worth of push constant space.
>> + *
>> + * Note: Any updates to brw_cs_prog_local_id_payload_dwords,
>> + * fill_local_id_payload or fs_visitor::emit_cs_local_invocation_id_setup need
>> + * to coordinated.
>> + *
>> + * FINISHME: There are a few easy optimizations to consider.
>> + *
>> + * 1. If gl_WorkGroupSize x, y or z is 1, we can just use zero, and there is
>> + *    no need for using push constant space for that dimension.
>> + *
>> + * 2. Since GL_MAX_COMPUTE_WORK_GROUP_SIZE is currently 1024 or less, we can
>> + *    easily use 16-bit words rather than 32-bit dwords in the push constant
>> + *    data.
>> + *
>> + * 3. If gl_WorkGroupSize x, y or z is small, then we can use bytes for
>> + *    conveying the data, and thereby reduce push constant usage.
>> + *
>> + */
>>  void
>>  fs_visitor::setup_cs_payload()
>>  {
>>     assert(devinfo->gen >= 7);
>> +   brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
>>
>>     payload.num_regs = 1;
>>
>>     if (nir->info.system_values_read & SYSTEM_BIT_LOCAL_INVOCATION_ID) {
>> -      const unsigned local_id_dwords =
>> -         brw_cs_prog_local_id_payload_dwords(dispatch_width);
>> -      assert((local_id_dwords & 0x7) == 0);
>> -      const unsigned local_id_regs = local_id_dwords / 8;
>> +      prog_data->local_invocation_id_regs = dispatch_width * 3 / 8;
>
> Subtle change in functionality here. Before this was fixed to 3 without
> taking the dispatch width into account. Might be worth to note in the commit
> message.

brw_cs_prog_local_id_payload_dwords() used to return 3 *
dispatch_width, which the code above then divided by 8. Did I miss
sommething?

Kristian

>>        payload.local_invocation_id_reg = payload.num_regs;
>> -      payload.num_regs += local_id_regs;
>> +      payload.num_regs += prog_data->local_invocation_id_regs;
>>     }
>>  }
>>
>> @@ -5171,6 +5194,41 @@ brw_wm_fs_emit(struct brw_context *brw,
>>     return g.get_assembly(final_assembly_size);
>>  }
>>
>> +void
>> +brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data,
>> +                             void *buffer, uint32_t threads, uint32_t stride)
>> +{
>> +   if (prog_data->local_invocation_id_regs == 0)
>> +      return;
>> +
>> +   /* stride should be an integer number of registers, that is, a multiple of
>> +    * 32 bytes. */
>> +   assert(stride % 32 == 0);
>> +
>> +   unsigned x = 0, y = 0, z = 0;
>> +   for (unsigned t = 0; t < threads; t++) {
>> +      uint32_t *param = (uint32_t *) buffer + stride * t / 4;
>> +
>> +      for (unsigned i = 0; i < prog_data->simd_size; i++) {
>> +         param[0 * prog_data->simd_size + i] = x;
>> +         param[1 * prog_data->simd_size + i] = y;
>> +         param[2 * prog_data->simd_size + i] = z;
>> +
>> +         x++;
>> +         if (x == prog_data->local_size[0]) {
>> +            x = 0;
>> +            y++;
>> +            if (y == prog_data->local_size[1]) {
>> +               y = 0;
>> +               z++;
>> +               if (z == prog_data->local_size[2])
>> +                  z = 0;
>> +            }
>> +         }
>> +      }
>> +   }
>> +}
>> +
>>  fs_reg *
>>  fs_visitor::emit_cs_local_invocation_id_setup()
>>  {
>> diff --git a/src/mesa/drivers/dri/i965/gen7_cs_state.c b/src/mesa/drivers/dri/i965/gen7_cs_state.c
>> index 5edc4fc..6aeb0cb 100644
>> --- a/src/mesa/drivers/dri/i965/gen7_cs_state.c
>> +++ b/src/mesa/drivers/dri/i965/gen7_cs_state.c
>> @@ -70,10 +70,8 @@ brw_upload_cs_state(struct brw_context *brw)
>>
>>     unsigned local_id_dwords = 0;
>>
>> -   if (prog->SystemValuesRead & SYSTEM_BIT_LOCAL_INVOCATION_ID) {
>> -      local_id_dwords =
>> -         brw_cs_prog_local_id_payload_dwords(cs_prog_data->simd_size);
>> -   }
>> +   if (prog->SystemValuesRead & SYSTEM_BIT_LOCAL_INVOCATION_ID)
>> +      local_id_dwords = cs_prog_data->local_invocation_id_regs * 8;
>>
>>     unsigned push_constant_data_size =
>>        (prog_data->nr_params + local_id_dwords) * sizeof(gl_constant_value);
>> @@ -191,63 +189,6 @@ const struct brw_tracked_state brw_cs_state = {
>>
>>
>>  /**
>> - * We are building the local ID push constant data using the simplest possible
>> - * method. We simply push the local IDs directly as they should appear in the
>> - * registers for the uvec3 gl_LocalInvocationID variable.
>> - *
>> - * Therefore, for SIMD8, we use 3 full registers, and for SIMD16 we use 6
>> - * registers worth of push constant space.
>> - *
>> - * Note: Any updates to brw_cs_prog_local_id_payload_dwords,
>> - * fill_local_id_payload or fs_visitor::emit_cs_local_invocation_id_setup need
>> - * to coordinated.
>> - *
>> - * FINISHME: There are a few easy optimizations to consider.
>> - *
>> - * 1. If gl_WorkGroupSize x, y or z is 1, we can just use zero, and there is
>> - *    no need for using push constant space for that dimension.
>> - *
>> - * 2. Since GL_MAX_COMPUTE_WORK_GROUP_SIZE is currently 1024 or less, we can
>> - *    easily use 16-bit words rather than 32-bit dwords in the push constant
>> - *    data.
>> - *
>> - * 3. If gl_WorkGroupSize x, y or z is small, then we can use bytes for
>> - *    conveying the data, and thereby reduce push constant usage.
>> - *
>> - */
>> -unsigned
>> -brw_cs_prog_local_id_payload_dwords(unsigned dispatch_width)
>> -{
>> -   return 3 * dispatch_width;
>> -}
>> -
>> -
>> -static void
>> -fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data,
>> -                      void *buffer, unsigned *x, unsigned *y, unsigned *z)
>> -{
>> -   uint32_t *param = (uint32_t *)buffer;
>> -   for (unsigned i = 0; i < cs_prog_data->simd_size; i++) {
>> -      param[0 * cs_prog_data->simd_size + i] = *x;
>> -      param[1 * cs_prog_data->simd_size + i] = *y;
>> -      param[2 * cs_prog_data->simd_size + i] = *z;
>> -
>> -      (*x)++;
>> -      if (*x == cs_prog_data->local_size[0]) {
>> -         *x = 0;
>> -         (*y)++;
>> -         if (*y == cs_prog_data->local_size[1]) {
>> -            *y = 0;
>> -            (*z)++;
>> -            if (*z == cs_prog_data->local_size[2])
>> -               *z = 0;
>> -         }
>> -      }
>> -   }
>> -}
>> -
>> -
>> -/**
>>   * Creates a region containing the push constants for the CS on gen7+.
>>   *
>>   * Push constants are constant values (such as GLSL uniforms) that are
>> @@ -269,10 +210,8 @@ brw_upload_cs_push_constants(struct brw_context *brw,
>>        (struct brw_stage_prog_data*) cs_prog_data;
>>     unsigned local_id_dwords = 0;
>>
>> -   if (prog->SystemValuesRead & SYSTEM_BIT_LOCAL_INVOCATION_ID) {
>> -      local_id_dwords =
>> -         brw_cs_prog_local_id_payload_dwords(cs_prog_data->simd_size);
>> -   }
>> +   if (prog->SystemValuesRead & SYSTEM_BIT_LOCAL_INVOCATION_ID)
>> +      local_id_dwords = cs_prog_data->local_invocation_id_regs * 8;
>>
>>     /* Updates the ParamaterValues[i] pointers for all parameters of the
>>      * basic type of PROGRAM_STATE_VAR.
>> @@ -302,14 +241,13 @@ brw_upload_cs_push_constants(struct brw_context *brw,
>>
>>        STATIC_ASSERT(sizeof(gl_constant_value) == sizeof(float));
>>
>> +      brw_cs_fill_local_id_payload(cs_prog_data, param, threads,
>> +                                   reg_aligned_constant_size);
>> +
>>        /* _NEW_PROGRAM_CONSTANTS */
>> -      unsigned x = 0, y = 0, z = 0;
>>        for (t = 0; t < threads; t++) {
>> -         gl_constant_value *next_param = &param[t * param_aligned_count];
>> -         if (local_id_dwords > 0) {
>> -            fill_local_id_payload(cs_prog_data, (void*)next_param, &x, &y, &z);
>> -            next_param += local_id_dwords;
>> -         }
>> +         gl_constant_value *next_param =
>> +            &param[t * param_aligned_count + local_id_dwords];
>>           for (i = 0; i < prog_data->nr_params; i++) {
>>              next_param[i] = *prog_data->param[i];
>>           }
>> --
>> 2.4.3
>>
>> _______________________________________________
>> mesa-dev mailing list
>> mesa-dev at lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/mesa-dev


More information about the mesa-dev mailing list