[Mesa-dev] [PATCH v2 56/59] i965/fs: push first double-based uniforms in push constant buffer
Jordan Justen
jordan.l.justen at intel.com
Sat May 7 07:22:05 UTC 2016
On 2016-05-05 23:56:08, Samuel Iglesias Gonsálvez wrote:
> When there is a mix of definitions of uniforms with 32-bit or 64-bit
> data type sizes, the driver ends up doing misaligned access to double
> based variables in the push constant buffer.
>
> To fix this, this patch pushes first all the 64-bit variables and
> then the rest. Then, all the variables would be aligned to
> its data type size.
>
> Signed-off-by: Samuel Iglesias Gonsálvez <siglesias at igalia.com>
> ---
> src/mesa/drivers/dri/i965/brw_fs.cpp | 113 +++++++++++++++++++++++++----------
> 1 file changed, 83 insertions(+), 30 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index 65aa3c7..7eaf1b4 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -39,6 +39,7 @@
> #include "brw_program.h"
> #include "brw_dead_control_flow.h"
> #include "compiler/glsl_types.h"
> +#include "program/prog_parameter.h"
>
> using namespace brw;
>
> @@ -2004,6 +2005,45 @@ fs_visitor::compact_virtual_grfs()
> return progress;
> }
>
> +static void
> +set_push_pull_constant_loc(unsigned uniform, int &chunk_start, bool contiguous,
> + int *push_constant_loc, int *pull_constant_loc,
> + unsigned &num_push_constants,
> + unsigned &num_pull_constants,
> + const unsigned max_push_components,
> + const unsigned max_chunk_size,
> + struct brw_stage_prog_data *stage_prog_data)
> +{
> + /* This is the first live uniform in the chunk */
> + if (chunk_start < 0)
> + chunk_start = uniform;
> +
> + /* If this element does not need to be contiguous with the next, we
> + * split at this point and everthing between chunk_start and u forms a
> + * single chunk.
> + */
> + if (!contiguous) {
> + unsigned chunk_size = uniform - chunk_start + 1;
> +
> + /* Decide whether we should push or pull this parameter. In the
> + * Vulkan driver, push constants are explicitly exposed via the API
> + * so we push everything. In GL, we only push small arrays.
> + */
> + if (stage_prog_data->pull_param == NULL ||
> + (num_push_constants + chunk_size <= max_push_components &&
> + chunk_size <= max_chunk_size)) {
> + assert(num_push_constants + chunk_size <= max_push_components);
> + for (unsigned j = chunk_start; j <= uniform; j++)
> + push_constant_loc[j] = num_push_constants++;
> + } else {
> + for (unsigned j = chunk_start; j <= uniform; j++)
> + pull_constant_loc[j] = num_pull_constants++;
> + }
> +
> + chunk_start = -1;
> + }
> +}
> +
> /**
> * Assign UNIFORM file registers to either push constants or pull constants.
> *
> @@ -2022,6 +2062,8 @@ fs_visitor::assign_constant_locations()
>
> bool is_live[uniforms];
> memset(is_live, 0, sizeof(is_live));
> + bool is_live_64bit[uniforms];
> + memset(is_live_64bit, 0, sizeof(is_live_64bit));
>
> /* For each uniform slot, a value of true indicates that the given slot and
> * the next slot must remain contiguous. This is used to keep us from
> @@ -2054,14 +2096,21 @@ fs_visitor::assign_constant_locations()
> for (unsigned j = constant_nr; j < last; j++) {
> is_live[j] = true;
> contiguous[j] = true;
> + if (type_sz(inst->src[i].type) == 8) {
> + is_live_64bit[j] = true;
> + }
> }
> is_live[last] = true;
> } else {
> if (constant_nr >= 0 && constant_nr < (int) uniforms) {
> int regs_read = inst->components_read(i) *
> type_sz(inst->src[i].type) / 4;
> - for (int j = 0; j < regs_read; j++)
> + for (int j = 0; j < regs_read; j++) {
> is_live[constant_nr + j] = true;
> + if (type_sz(inst->src[i].type) == 8) {
> + is_live_64bit[constant_nr + j] = true;
> + }
> + }
> }
> }
> }
> @@ -2090,43 +2139,46 @@ fs_visitor::assign_constant_locations()
> pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
>
> int chunk_start = -1;
> +
> +
> + /* First push 64-bit uniforms */
Maybe add "to ensure they are properly aligned"?
> for (unsigned u = 0; u < uniforms; u++) {
> - push_constant_loc[u] = -1;
> + if (!is_live[u] || !is_live_64bit[u])
> + continue;
> +
> pull_constant_loc[u] = -1;
> + push_constant_loc[u] = -1;
>
> - if (!is_live[u])
> - continue;
> + set_push_pull_constant_loc(u, chunk_start, contiguous[u],
> + push_constant_loc, pull_constant_loc,
> + num_push_constants, num_pull_constants,
> + max_push_components, max_chunk_size,
> + stage_prog_data);
>
> - /* This is the first live uniform in the chunk */
> - if (chunk_start < 0)
> - chunk_start = u;
> + }
>
> - /* If this element does not need to be contiguous with the next, we
> - * split at this point and everthing between chunk_start and u forms a
> - * single chunk.
> - */
> - if (!contiguous[u]) {
> - unsigned chunk_size = u - chunk_start + 1;
> + /* Then push the rest of uniforms */
> + for (unsigned u = 0; u < uniforms; u++) {
> + if (!is_live[u] || is_live_64bit[u])
> + continue;
>
> - /* Decide whether we should push or pull this parameter. In the
> - * Vulkan driver, push constants are explicitly exposed via the API
> - * so we push everything. In GL, we only push small arrays.
> - */
> - if (stage_prog_data->pull_param == NULL ||
> - (num_push_constants + chunk_size <= max_push_components &&
> - chunk_size <= max_chunk_size)) {
> - assert(num_push_constants + chunk_size <= max_push_components);
> - for (unsigned j = chunk_start; j <= u; j++)
> - push_constant_loc[j] = num_push_constants++;
> - } else {
> - for (unsigned j = chunk_start; j <= u; j++)
> - pull_constant_loc[j] = num_pull_constants++;
> - }
> + pull_constant_loc[u] = -1;
> + push_constant_loc[u] = -1;
>
> - chunk_start = -1;
> - }
> + set_push_pull_constant_loc(u, chunk_start, contiguous[u],
> + push_constant_loc, pull_constant_loc,
> + num_push_constants, num_pull_constants,
> + max_push_components, max_chunk_size,
> + stage_prog_data);
> }
>
> + /* As the uniforms are going to be reordered, take the data from a temporal
temporary
> + * copy of the original param[].
> + */
> + gl_constant_value **param = rzalloc_array(mem_ctx, gl_constant_value*,
> + stage_prog_data->nr_params);
Since we always freeing it below, should we use NULL instead of
mem_ctx?
I think don't need the 'z' version since we use memcpy next.
Reviewed-by: Jordan Justen <jordan.l.justen at intel.com>
> + memcpy(param, stage_prog_data->param,
> + sizeof(gl_constant_value*) * stage_prog_data->nr_params);
> stage_prog_data->nr_params = num_push_constants;
> stage_prog_data->nr_pull_params = num_pull_constants;
>
> @@ -2139,7 +2191,7 @@ fs_visitor::assign_constant_locations()
> * having to make a copy.
> */
> for (unsigned int i = 0; i < uniforms; i++) {
> - const gl_constant_value *value = stage_prog_data->param[i];
> + const gl_constant_value *value = param[i];
>
> if (pull_constant_loc[i] != -1) {
> stage_prog_data->pull_param[pull_constant_loc[i]] = value;
> @@ -2147,6 +2199,7 @@ fs_visitor::assign_constant_locations()
> stage_prog_data->param[push_constant_loc[i]] = value;
> }
> }
> + ralloc_free(param);
> }
>
> /**
> --
> 2.5.0
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
More information about the mesa-dev
mailing list