[Mesa-dev] [PATCH 56/59] i965/fs: align access to double-based uniforms in push constant buffer

Samuel Iglesias Gonsálvez siglesias at igalia.com
Fri Apr 29 11:29:53 UTC 2016


When there is a mix of definitions of uniforms with 32-bit or 64-bit
data type sizes, the driver ends up doing misaligned access to double
based variables in the push constant buffer.

To fix this, the driver adds padding when needed in the push constant
buffer and takes it into account to avoid exceeding its limits and to
update the GRF registers to access uniform variables.

v2 (Iago):
  - Renamed some variables for clarity.
  - Replace the boolean array of paddings that tracked if each param was
    padded by an integer array that keeps count of accumnulated padding.
    This allows us to remove a couple of loops that had to compute that.
  - Reworked things a bit so we can get rid of the nr_paddings field in
    brw_stage_prog_data.
  - Use rzalloc_array instead or ralloc_array and memset.
  - Fixed wrong indentation.

Signed-off-by: Samuel Iglesias Gonsálvez <siglesias at igalia.com>
Signed-off-by: Iago Toral Quiroga <itoral at igalia.com>
---
 src/mesa/drivers/dri/i965/brw_compiler.h        |  8 ++++
 src/mesa/drivers/dri/i965/brw_fs.cpp            | 50 ++++++++++++++++++++++---
 src/mesa/drivers/dri/i965/brw_program.c         |  1 +
 src/mesa/drivers/dri/i965/brw_wm.c              |  2 +
 src/mesa/drivers/dri/i965/gen6_constant_state.c | 12 ++++--
 5 files changed, 64 insertions(+), 9 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h
index 5807305..bc15bf3 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.h
+++ b/src/mesa/drivers/dri/i965/brw_compiler.h
@@ -333,6 +333,14 @@ struct brw_stage_prog_data {
    } binding_table;
 
    GLuint nr_params;       /**< number of float params/constants */
+   /**
+    * Accumulated padding (in 32-bit units) in the push constant buffer for
+    * each param. If param_padding[n] = P, it means that params 0..n required
+    * a total accumulated padding of P 32-bit slots. This is useful to compute
+    * the actual location, including padding, for each param.
+    */
+   uint32_t *param_padding;
+
    GLuint nr_pull_params;
    unsigned nr_image_params;
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 53f709b..5aa6ded 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -1571,7 +1571,8 @@ fs_visitor::assign_curb_setup()
             int uniform_nr = inst->src[i].nr + inst->src[i].reg_offset;
             int constant_nr;
             if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
-               constant_nr = push_constant_loc[uniform_nr];
+               constant_nr = push_constant_loc[uniform_nr] +
+                             stage_prog_data->param_padding[uniform_nr];
             } else {
                /* Section 5.11 of the OpenGL 4.1 spec says:
                 * "Out-of-bounds reads return undefined values, which include
@@ -2010,7 +2011,10 @@ fs_visitor::assign_constant_locations()
       return;
 
    bool is_live[uniforms];
+   bool is_64bit[uniforms];
+
    memset(is_live, 0, sizeof(is_live));
+   memset(is_64bit, 0, sizeof(is_64bit));
 
    /* For each uniform slot, a value of true indicates that the given slot and
     * the next slot must remain contiguous.  This is used to keep us from
@@ -2047,8 +2051,11 @@ fs_visitor::assign_constant_locations()
             is_live[last] = true;
          } else {
             if (constant_nr >= 0 && constant_nr < (int) uniforms) {
-               for (int j = 0; j < inst->regs_read(i); j++)
+               for (int j = 0; j < inst->regs_read(i); j++) {
                   is_live[constant_nr + j] = true;
+                  if (type_sz(inst->src[i].type) == 8)
+                     is_64bit[constant_nr + j] = true;
+               }
             }
          }
       }
@@ -2072,14 +2079,18 @@ fs_visitor::assign_constant_locations()
 
    unsigned int num_push_constants = 0;
    unsigned int num_pull_constants = 0;
+   unsigned num_paddings = 0;
+   bool is_64bit_aligned = true;
 
    push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
    pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+   stage_prog_data->param_padding = rzalloc_array(NULL, uint32_t, uniforms);
 
    int chunk_start = -1;
    for (unsigned u = 0; u < uniforms; u++) {
       push_constant_loc[u] = -1;
       pull_constant_loc[u] = -1;
+      stage_prog_data->param_padding[u] = num_paddings;
 
       if (!is_live[u])
          continue;
@@ -2094,17 +2105,44 @@ fs_visitor::assign_constant_locations()
        */
       if (!contiguous[u]) {
          unsigned chunk_size = u - chunk_start + 1;
+         unsigned total_push_components = num_push_constants + num_paddings;
+         bool push_32_bits_constant = !is_64bit[u] &&
+            total_push_components + chunk_size <= max_push_components;
+         /* Verify if the 64 bits constant can be saved into the push constant
+          * buffer. Take into account that it might need padding.
+          */
+         bool push_64_bits_constant = is_64bit[u] &&
+            (total_push_components + chunk_size + 1 + !is_64bit_aligned) <=
+            max_push_components;
 
          /* Decide whether we should push or pull this parameter.  In the
           * Vulkan driver, push constants are explicitly exposed via the API
           * so we push everything.  In GL, we only push small arrays.
           */
          if (stage_prog_data->pull_param == NULL ||
-             (num_push_constants + chunk_size <= max_push_components &&
+             ((push_32_bits_constant || push_64_bits_constant) &&
               chunk_size <= max_chunk_size)) {
-            assert(num_push_constants + chunk_size <= max_push_components);
-            for (unsigned j = chunk_start; j <= u; j++)
+            assert(total_push_components + chunk_size <= max_push_components);
+
+            for (unsigned j = chunk_start; j <= u; j++) {
+               /* If it is a double and it is not aligned to 64 bits, add padding.
+                * It is already aligned, don't add padding.
+                */
+               if (push_64_bits_constant) {
+                  if (!is_64bit_aligned) {
+                     num_paddings++;
+                     stage_prog_data->param_padding[j] = num_paddings;
+                     is_64bit_aligned = true;
+                  }
+               } else {
+                  /* We are pushing a 32 bits constant. Is the next push constant
+                   * aligned 64 bits?
+                   */
+                  is_64bit_aligned = !is_64bit_aligned;
+               }
+
                push_constant_loc[j] = num_push_constants++;
+            }
          } else {
             for (unsigned j = chunk_start; j <= u; j++)
                pull_constant_loc[j] = num_pull_constants++;
@@ -2114,7 +2152,7 @@ fs_visitor::assign_constant_locations()
       }
    }
 
-   stage_prog_data->nr_params = num_push_constants;
+   stage_prog_data->nr_params = num_push_constants + num_paddings;
    stage_prog_data->nr_pull_params = num_pull_constants;
 
    /* Up until now, the param[] array has been indexed by reg + reg_offset
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index 3112c0c..2d145cd 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -555,6 +555,7 @@ brw_stage_prog_data_free(const void *p)
    struct brw_stage_prog_data *prog_data = (struct brw_stage_prog_data *)p;
 
    ralloc_free(prog_data->param);
+   ralloc_free(prog_data->param_padding);
    ralloc_free(prog_data->pull_param);
    ralloc_free(prog_data->image_param);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index dbc626c..025d484 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -103,6 +103,8 @@ brw_codegen_wm_prog(struct brw_context *brw,
    param_count += 2 * ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits;
    prog_data.base.param =
       rzalloc_array(NULL, const gl_constant_value *, param_count);
+   prog_data.base.param_padding =
+      rzalloc_array(NULL, uint32_t, param_count);
    prog_data.base.pull_param =
       rzalloc_array(NULL, const gl_constant_value *, param_count);
    prog_data.base.image_param =
diff --git a/src/mesa/drivers/dri/i965/gen6_constant_state.c b/src/mesa/drivers/dri/i965/gen6_constant_state.c
index 6c0c32b..3d0caf2 100644
--- a/src/mesa/drivers/dri/i965/gen6_constant_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_constant_state.c
@@ -135,7 +135,9 @@ gen6_upload_push_constants(struct brw_context *brw,
          _mesa_load_state_parameters(ctx, prog->Parameters);
 
       gl_constant_value *param;
-      int i;
+      int i, p;
+      gl_constant_value zero;
+      zero.u = 0;
 
       param = brw_state_batch(brw, type,
                               prog_data->nr_params * sizeof(gl_constant_value),
@@ -149,8 +151,12 @@ gen6_upload_push_constants(struct brw_context *brw,
        * side effect of dereferencing uniforms, so _NEW_PROGRAM_CONSTANTS
        * wouldn't be set for them.
        */
-      for (i = 0; i < prog_data->nr_params; i++) {
-         param[i] = *prog_data->param[i];
+      for (i = 0, p = 0; i < prog_data->nr_params; i++) {
+         if (p > 0 && prog_data->param_padding &&
+             prog_data->param_padding[p] > prog_data->param_padding[p - 1]) {
+            param[i++] = zero;
+         }
+         param[i] = *prog_data->param[p++];
       }
 
       if (0) {
-- 
2.5.0



More information about the mesa-dev mailing list