[Mesa-dev] [PATCH 56/59] i965/fs: align access to double-based uniforms in push constant buffer

Pohjolainen, Topi topi.pohjolainen at intel.com
Mon May 2 13:28:18 UTC 2016


On Fri, Apr 29, 2016 at 01:29:53PM +0200, Samuel Iglesias Gons?lvez wrote:
> When there is a mix of definitions of uniforms with 32-bit or 64-bit
> data type sizes, the driver ends up doing misaligned access to double
> based variables in the push constant buffer.
> 
> To fix this, the driver adds padding when needed in the push constant
> buffer and takes it into account to avoid exceeding its limits and to
> update the GRF registers to access uniform variables.
> 
> v2 (Iago):
>   - Renamed some variables for clarity.
>   - Replace the boolean array of paddings that tracked if each param was
>     padded by an integer array that keeps count of accumnulated padding.
>     This allows us to remove a couple of loops that had to compute that.
>   - Reworked things a bit so we can get rid of the nr_paddings field in
>     brw_stage_prog_data.
>   - Use rzalloc_array instead or ralloc_array and memset.
>   - Fixed wrong indentation.
> 
> Signed-off-by: Samuel Iglesias Gonsálvez <siglesias at igalia.com>
> Signed-off-by: Iago Toral Quiroga <itoral at igalia.com>
> ---
>  src/mesa/drivers/dri/i965/brw_compiler.h        |  8 ++++
>  src/mesa/drivers/dri/i965/brw_fs.cpp            | 50 ++++++++++++++++++++++---
>  src/mesa/drivers/dri/i965/brw_program.c         |  1 +
>  src/mesa/drivers/dri/i965/brw_wm.c              |  2 +
>  src/mesa/drivers/dri/i965/gen6_constant_state.c | 12 ++++--
>  5 files changed, 64 insertions(+), 9 deletions(-)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h
> index 5807305..bc15bf3 100644
> --- a/src/mesa/drivers/dri/i965/brw_compiler.h
> +++ b/src/mesa/drivers/dri/i965/brw_compiler.h
> @@ -333,6 +333,14 @@ struct brw_stage_prog_data {
>     } binding_table;
>  
>     GLuint nr_params;       /**< number of float params/constants */
> +   /**
> +    * Accumulated padding (in 32-bit units) in the push constant buffer for
> +    * each param. If param_padding[n] = P, it means that params 0..n required
> +    * a total accumulated padding of P 32-bit slots. This is useful to compute
> +    * the actual location, including padding, for each param.
> +    */
> +   uint32_t *param_padding;
> +
>     GLuint nr_pull_params;
>     unsigned nr_image_params;
>  
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index 53f709b..5aa6ded 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -1571,7 +1571,8 @@ fs_visitor::assign_curb_setup()
>              int uniform_nr = inst->src[i].nr + inst->src[i].reg_offset;
>              int constant_nr;
>              if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
> -               constant_nr = push_constant_loc[uniform_nr];
> +               constant_nr = push_constant_loc[uniform_nr] +
> +                             stage_prog_data->param_padding[uniform_nr];
>              } else {
>                 /* Section 5.11 of the OpenGL 4.1 spec says:
>                  * "Out-of-bounds reads return undefined values, which include
> @@ -2010,7 +2011,10 @@ fs_visitor::assign_constant_locations()
>        return;
>  
>     bool is_live[uniforms];
> +   bool is_64bit[uniforms];
> +
>     memset(is_live, 0, sizeof(is_live));
> +   memset(is_64bit, 0, sizeof(is_64bit));
>  
>     /* For each uniform slot, a value of true indicates that the given slot and
>      * the next slot must remain contiguous.  This is used to keep us from
> @@ -2047,8 +2051,11 @@ fs_visitor::assign_constant_locations()
>              is_live[last] = true;
>           } else {
>              if (constant_nr >= 0 && constant_nr < (int) uniforms) {
> -               for (int j = 0; j < inst->regs_read(i); j++)
> +               for (int j = 0; j < inst->regs_read(i); j++) {
>                    is_live[constant_nr + j] = true;
> +                  if (type_sz(inst->src[i].type) == 8)
> +                     is_64bit[constant_nr + j] = true;
> +               }
>              }
>           }
>        }
> @@ -2072,14 +2079,18 @@ fs_visitor::assign_constant_locations()
>  
>     unsigned int num_push_constants = 0;
>     unsigned int num_pull_constants = 0;
> +   unsigned num_paddings = 0;
> +   bool is_64bit_aligned = true;
>  
>     push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
>     pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
> +   stage_prog_data->param_padding = rzalloc_array(NULL, uint32_t, uniforms);
>  
>     int chunk_start = -1;
>     for (unsigned u = 0; u < uniforms; u++) {
>        push_constant_loc[u] = -1;
>        pull_constant_loc[u] = -1;
> +      stage_prog_data->param_padding[u] = num_paddings;
>  
>        if (!is_live[u])
>           continue;
> @@ -2094,17 +2105,44 @@ fs_visitor::assign_constant_locations()
>         */
>        if (!contiguous[u]) {
>           unsigned chunk_size = u - chunk_start + 1;
> +         unsigned total_push_components = num_push_constants + num_paddings;
> +         bool push_32_bits_constant = !is_64bit[u] &&
> +            total_push_components + chunk_size <= max_push_components;
> +         /* Verify if the 64 bits constant can be saved into the push constant
> +          * buffer. Take into account that it might need padding.
> +          */
> +         bool push_64_bits_constant = is_64bit[u] &&
> +            (total_push_components + chunk_size + 1 + !is_64bit_aligned) <=
> +            max_push_components;
>  
>           /* Decide whether we should push or pull this parameter.  In the
>            * Vulkan driver, push constants are explicitly exposed via the API
>            * so we push everything.  In GL, we only push small arrays.
>            */
>           if (stage_prog_data->pull_param == NULL ||
> -             (num_push_constants + chunk_size <= max_push_components &&
> +             ((push_32_bits_constant || push_64_bits_constant) &&
>                chunk_size <= max_chunk_size)) {
> -            assert(num_push_constants + chunk_size <= max_push_components);
> -            for (unsigned j = chunk_start; j <= u; j++)
> +            assert(total_push_components + chunk_size <= max_push_components);
> +
> +            for (unsigned j = chunk_start; j <= u; j++) {
> +               /* If it is a double and it is not aligned to 64 bits, add padding.
> +                * It is already aligned, don't add padding.
> +                */
> +               if (push_64_bits_constant) {
> +                  if (!is_64bit_aligned) {
> +                     num_paddings++;
> +                     stage_prog_data->param_padding[j] = num_paddings;
> +                     is_64bit_aligned = true;
> +                  }
> +               } else {
> +                  /* We are pushing a 32 bits constant. Is the next push constant
> +                   * aligned 64 bits?
> +                   */
> +                  is_64bit_aligned = !is_64bit_aligned;
> +               }
> +
>                 push_constant_loc[j] = num_push_constants++;
> +            }

I am probably greatly over simplifying but I have to ask. Could we simply
add a hole in the push file by increasing the counter? After all it is
push_constant_loc[] that controls in the end the offset in the instructions
(after running assign_curb_setup()) and tells the loader where to copy the
data from the uniform storage. Something of this sort:

        if (!contiguous[u]) {
            unsigned chunk_size = u - chunk_start + 1;

+           if (is_64bit[u] && num_push_constants % 2)
+              num_push_constants++;

            /* Decide whether we should push or pull this parameter.  In the
             * Vulkan driver, push constants are explicitly exposed via the API
             * so we push everything.  In GL, we only push small arrays.
             */
            if (stage_prog_data->pull_param == NULL ||
                (num_push_constants + chunk_size <= max_push_components &&
                 chunk_size <= max_chunk_size)) {
               assert(num_push_constants + chunk_size <= max_push_components);
               for (unsigned j = chunk_start; j <= u; j++)
                  push_constant_loc[j] = num_push_constants++;


>           } else {
>              for (unsigned j = chunk_start; j <= u; j++)
>                 pull_constant_loc[j] = num_pull_constants++;
> @@ -2114,7 +2152,7 @@ fs_visitor::assign_constant_locations()
>        }
>     }
>  
> -   stage_prog_data->nr_params = num_push_constants;
> +   stage_prog_data->nr_params = num_push_constants + num_paddings;
>     stage_prog_data->nr_pull_params = num_pull_constants;
>  
>     /* Up until now, the param[] array has been indexed by reg + reg_offset
> diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
> index 3112c0c..2d145cd 100644
> --- a/src/mesa/drivers/dri/i965/brw_program.c
> +++ b/src/mesa/drivers/dri/i965/brw_program.c
> @@ -555,6 +555,7 @@ brw_stage_prog_data_free(const void *p)
>     struct brw_stage_prog_data *prog_data = (struct brw_stage_prog_data *)p;
>  
>     ralloc_free(prog_data->param);
> +   ralloc_free(prog_data->param_padding);
>     ralloc_free(prog_data->pull_param);
>     ralloc_free(prog_data->image_param);
>  }
> diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
> index dbc626c..025d484 100644
> --- a/src/mesa/drivers/dri/i965/brw_wm.c
> +++ b/src/mesa/drivers/dri/i965/brw_wm.c
> @@ -103,6 +103,8 @@ brw_codegen_wm_prog(struct brw_context *brw,
>     param_count += 2 * ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits;
>     prog_data.base.param =
>        rzalloc_array(NULL, const gl_constant_value *, param_count);
> +   prog_data.base.param_padding =
> +      rzalloc_array(NULL, uint32_t, param_count);
>     prog_data.base.pull_param =
>        rzalloc_array(NULL, const gl_constant_value *, param_count);
>     prog_data.base.image_param =
> diff --git a/src/mesa/drivers/dri/i965/gen6_constant_state.c b/src/mesa/drivers/dri/i965/gen6_constant_state.c
> index 6c0c32b..3d0caf2 100644
> --- a/src/mesa/drivers/dri/i965/gen6_constant_state.c
> +++ b/src/mesa/drivers/dri/i965/gen6_constant_state.c
> @@ -135,7 +135,9 @@ gen6_upload_push_constants(struct brw_context *brw,
>           _mesa_load_state_parameters(ctx, prog->Parameters);
>  
>        gl_constant_value *param;
> -      int i;
> +      int i, p;
> +      gl_constant_value zero;
> +      zero.u = 0;
>  
>        param = brw_state_batch(brw, type,
>                                prog_data->nr_params * sizeof(gl_constant_value),
> @@ -149,8 +151,12 @@ gen6_upload_push_constants(struct brw_context *brw,
>         * side effect of dereferencing uniforms, so _NEW_PROGRAM_CONSTANTS
>         * wouldn't be set for them.
>         */
> -      for (i = 0; i < prog_data->nr_params; i++) {
> -         param[i] = *prog_data->param[i];
> +      for (i = 0, p = 0; i < prog_data->nr_params; i++) {
> +         if (p > 0 && prog_data->param_padding &&
> +             prog_data->param_padding[p] > prog_data->param_padding[p - 1]) {
> +            param[i++] = zero;
> +         }
> +         param[i] = *prog_data->param[p++];
>        }
>  
>        if (0) {
> -- 
> 2.5.0
> 
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev


More information about the mesa-dev mailing list