[Mesa-dev] [PATCH v2 22/25] nir/i965: use two slots from inputs_read for dvec3/dvec4 vertex input attributes

Jason Ekstrand jason at jlekstrand.net
Wed Jan 4 15:06:19 UTC 2017


On Jan 4, 2017 5:46 AM, "Juan A. Suarez Romero" <jasuarez at igalia.com> wrote:

On Tue, 2017-01-03 at 14:41 -0800, Jason Ekstrand wrote:

I made a few pretty trivial comments.  With those addressed,

Reviewed-by: Jason Ekstrand <jason at jlekstrand.net>

On Dec 16, 2016 8:55 AM, "Juan A. Suarez Romero" <jasuarez at igalia.com>
wrote:

So far, input_reads was a bitmap tracking which vertex input locations
were being used.

In OpenGL, an attribute bigger than a vec4 (like a dvec3 or dvec4)
consumes just one location, any other small attribute. So we mark the
proper bit in inputs_read, and also the same bit in double_inputs_read
if the attribute is a dvec3/dvec4.

But in Vulkan, this is slightly different: a dvec3/dvec4 attribute
consumes two locations, not just one. And hence two bits would be marked
in inputs_read for the same vertex input attribute.

To avoid handling two different situations in NIR, we just choose the
latest one: in OpenGL, when creating NIR from GLSL/IR, any dvec3/dvec4
vertex input attribute is marked with two bits in the inputs_read bitmap
(and also in the double_inputs_read), and following attributes are
adjusted accordingly.

As example, if in our GLSL/IR shader we have three attributes:

layout(location = 0) vec3  attr0;
layout(location = 1) dvec4 attr1;
layout(location = 2) dvec3 attr2;

then in our NIR shader we put attr0 in location 0, attr1 in locations 1
and 2, and attr2 in location 3.


attr2 goes in locations 3 *and* 4, correct?

Checking carefully, basically we are using slots rather than locations
in NIR.

When emitting the vertices, we do a inverse map to know the
corresponding location for each slot.

v2 (Jason):
- use two slots from inputs_read for dvec3/dvec4 NIR from GLSL/IR.
---
 src/compiler/glsl/glsl_to_nir.cpp            | 28 +++++++++++++
 src/compiler/nir/nir_gather_info.c           | 48 ++++++++++-----------
 src/intel/vulkan/genX_pipeline.c             | 63
+++++++++++++++++-----------
 src/mesa/drivers/dri/i965/brw_draw_upload.c  | 11 +++--
 src/mesa/drivers/dri/i965/brw_fs.cpp         | 13 ------
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp |  3 +-
 src/mesa/drivers/dri/i965/brw_nir.c          |  6 +--
 src/mesa/drivers/dri/i965/brw_nir.h          |  1 -
 src/mesa/drivers/dri/i965/brw_vec4.cpp       | 11 +++--
 9 files changed, 106 insertions(+), 78 deletions(-)

diff --git a/src/compiler/glsl/glsl_to_nir.cpp
b/src/compiler/glsl/glsl_to_nir.cpp
index 4debc37..0814dad 100644
--- a/src/compiler/glsl/glsl_to_nir.cpp
+++ b/src/compiler/glsl/glsl_to_nir.cpp
@@ -129,6 +129,19 @@ private:

 } /* end of anonymous namespace */

+static void
+nir_remap_attributes(nir_shader *shader)
+{
+   nir_foreach_variable(var, &shader->inputs) {
+      var->data.location += _mesa_bitcount_64(shader->info->double_inputs_read
&
+
BITFIELD64_MASK(var->data.location));
+   }
+
+   /* Once the remap is done, reset double_inputs_read, so later it will
have
+    * which location/slots are doubles */
+   shader->info->double_inputs_read = 0;
+}
+
 nir_shader *
 glsl_to_nir(const struct gl_shader_program *shader_prog,
             gl_shader_stage stage,
@@ -146,6 +159,13 @@ glsl_to_nir(const struct gl_shader_program
*shader_prog,

    nir_lower_constant_initializers(shader, (nir_variable_mode)~0);

+   /* Remap the locations to slots so those requiring two slots will occupy
+    * two locations. For instance, if we have in the IR code a dvec3 attr0
in
+    * location 0 and vec4 attr1 in location 1, in NIR attr0 will use
+    * locations/slots 0 and 1, and attr1 will use location/slot 2 */
+   if (shader->stage == MESA_SHADER_VERTEX)
+      nir_remap_attributes(shader);
+
    shader->info->name = ralloc_asprintf(shader, "GLSL%d",
shader_prog->Name);
    if (shader_prog->Label)
       shader->info->label = ralloc_strdup(shader, shader_prog->Label);
@@ -315,6 +335,14 @@ nir_visitor::visit(ir_variable *ir)
       } else {
          var->data.mode = nir_var_shader_in;
       }
+
+      /* Mark all the locations that require two slots */
+      if (glsl_type_is_dual_slot(glsl_without_array(var->type))) {
+         for (uint i = 0; i < glsl_count_attribute_slots(var->type, true);
i++) {
+            uint64_t bitfield = BITFIELD64_BIT(var->data.location + i);
+            shader->info->double_inputs_read |= bitfield;
+         }
+      }
       break;

    case ir_var_shader_out:
diff --git a/src/compiler/nir/nir_gather_info.c
b/src/compiler/nir/nir_gather_info.c
index 07c9949..35a1ce4 100644
--- a/src/compiler/nir/nir_gather_info.c
+++ b/src/compiler/nir/nir_gather_info.c
@@ -53,11 +53,6 @@ set_io_mask(nir_shader *shader, nir_variable *var, int
offset, int len)
          else
             shader->info->inputs_read |= bitfield;

-         /* double inputs read is only for vertex inputs */
-         if (shader->stage == MESA_SHADER_VERTEX &&
-             glsl_type_is_dual_slot(glsl_without_array(var->type)))
-            shader->info->double_inputs_read |= bitfield;
-
          if (shader->stage == MESA_SHADER_FRAGMENT) {
             shader->info->fs.uses_sample_qualifier |= var->data.sample;
          }
@@ -83,26 +78,21 @@ static void
 mark_whole_variable(nir_shader *shader, nir_variable *var)
 {
    const struct glsl_type *type = var->type;
-   bool is_vertex_input = false;

    if (nir_is_per_vertex_io(var, shader->stage)) {
       assert(glsl_type_is_array(type));
       type = glsl_get_array_element(type);
    }

-   if (shader->stage == MESA_SHADER_VERTEX &&
-       var->data.mode == nir_var_shader_in)
-      is_vertex_input = true;
-
    const unsigned slots =
       var->data.compact ? DIV_ROUND_UP(glsl_get_length(type), 4)
-                        : glsl_count_attribute_slots(type,
is_vertex_input);
+                        : glsl_count_attribute_slots(type, false);

    set_io_mask(shader, var, 0, slots);
 }

 static unsigned
-get_io_offset(nir_deref_var *deref, bool is_vertex_input)
+get_io_offset(nir_deref_var *deref)
 {
    unsigned offset = 0;

@@ -117,7 +107,7 @@ get_io_offset(nir_deref_var *deref, bool
is_vertex_input)
             return -1;
          }

-         offset += glsl_count_attribute_slots(tail->type, is_vertex_input)
*
+         offset += glsl_count_attribute_slots(tail->type, false) *
             deref_array->base_offset;
       }
       /* TODO: we can get the offset for structs here see nir_lower_io() */
@@ -163,12 +153,7 @@ try_mask_partial_io(nir_shader *shader, nir_deref_var
*deref)
       return false;
    }

-   bool is_vertex_input = false;
-   if (shader->stage == MESA_SHADER_VERTEX &&
-       var->data.mode == nir_var_shader_in)
-      is_vertex_input = true;
-
-   unsigned offset = get_io_offset(deref, is_vertex_input);
+   unsigned offset = get_io_offset(deref);
    if (offset == -1)
       return false;

@@ -184,8 +169,7 @@ try_mask_partial_io(nir_shader *shader, nir_deref_var
*deref)
    }

    /* double element width for double types that takes two slots */
-   if (!is_vertex_input &&
-       glsl_type_is_dual_slot(glsl_without_array(type))) {
+   if (glsl_type_is_dual_slot(glsl_without_array(type))) {
       elem_width *= 2;
    }

@@ -220,13 +204,27 @@ gather_intrinsic_info(nir_intrinsic_instr *instr,
nir_shader *shader)
    case nir_intrinsic_interp_var_at_sample:
    case nir_intrinsic_interp_var_at_offset:
    case nir_intrinsic_load_var:
-   case nir_intrinsic_store_var:
-      if (instr->variables[0]->var->data.mode == nir_var_shader_in ||
-          instr->variables[0]->var->data.mode == nir_var_shader_out) {
+   case nir_intrinsic_store_var: {
+      nir_variable *var = instr->variables[0]->var;
+
+      if (var->data.mode == nir_var_shader_in ||
+          var->data.mode == nir_var_shader_out) {
          if (!try_mask_partial_io(shader, instr->variables[0]))
-            mark_whole_variable(shader, instr->variables[0]->var);
+            mark_whole_variable(shader, var);
+
+         /* We need to track which input_reads bits correspond to a
+          * dvec3/dvec4 input attribute */
+         if (shader->stage == MESA_SHADER_VERTEX &&
+             var->data.mode == nir_var_shader_in &&
+             glsl_type_is_dual_slot(glsl_without_array(var->type))) {
+            for (uint i = 0; i < glsl_count_attribute_slots(var->type,
false); i++) {
+               int idx = var->data.location + i;
+               shader->info->double_inputs_read |= BITFIELD64_BIT(idx);
+            }
+         }
       }
       break;
+   }

    case nir_intrinsic_load_draw_id:
    case nir_intrinsic_load_front_face:
diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeli
ne.c
index 845d020..7b94959 100644
--- a/src/intel/vulkan/genX_pipeline.c
+++ b/src/intel/vulkan/genX_pipeline.c
@@ -33,26 +33,33 @@
 static uint32_t
 vertex_element_comp_control(enum isl_format format, unsigned comp)
 {
-   uint8_t bits;
    switch (comp) {
-   case 0: bits = isl_format_layouts[format].channels.r.bits; break;
-   case 1: bits = isl_format_layouts[format].channels.g.bits; break;
-   case 2: bits = isl_format_layouts[format].channels.b.bits; break;
-   case 3: bits = isl_format_layouts[format].channels.a.bits; break;
-   default: unreachable("Invalid component");
-   }
-
-   if (bits) {
-      return VFCOMP_STORE_SRC;
-   } else if (comp < 3) {
-      return VFCOMP_STORE_0;
-   } else if (isl_format_layouts[format].channels.r.type == ISL_UINT ||
-            isl_format_layouts[format].channels.r.type == ISL_SINT) {
-      assert(comp == 3);
-      return VFCOMP_STORE_1_INT;
-   } else {
-      assert(comp == 3);
-      return VFCOMP_STORE_1_FP;
+   case 0:
+      return isl_format_layouts[format].channels.r.bits ?
+         VFCOMP_STORE_SRC : VFCOMP_STORE_0;
+   case 1:
+      return isl_format_layouts[format].channels.g.bits ?
+         VFCOMP_STORE_SRC : VFCOMP_STORE_0;
+   case 2:
+      return isl_format_layouts[format].channels.b.bits ?
+         VFCOMP_STORE_SRC : ((isl_format_layouts[format].channels.r.type
== ISL_RAW) ?
+                             VFCOMP_NOSTORE : VFCOMP_STORE_0);


Given all the line wrapping, I think it would be clearer to just have an if
ladder here

+   case 3:
+      if (isl_format_layouts[format].channels.a.bits)
+         return VFCOMP_STORE_SRC;
+      else


Please use braces when one side of the if/else is multiple lines.

+         switch (isl_format_layouts[format].channels.r.type) {
+         case ISL_RAW:
+            return isl_format_layouts[format].channels.b.bits ?
+               VFCOMP_STORE_0 : VFCOMP_NOSTORE;


This seems a bit odd.  Mind explaining what's going on here?


Yes. When emitting 64-bit components, we either write 128 or 256 bits,
using VFCOMP_STORE_0 to pad output properly. We write 256 bits if we need
to emit Blue and/or Alpha components.

If we only need to write 128bits then we use VFCOMP_NOSTORE for the 3rd and
4th components.

In above code, we already know we are not emitting Alpha (the first
condition in "case 3" is false). If we neither need to emit Blue component,
then we only need to write 128 bits, so we return NOSTORE.

But if Blue is required, then we need to write 256bits, so we return
VFCOMP_STORE_0 to pad the output.


Maybe this would be clearer:  leave it structured the way it was with the
"bits" temporary and add a new case to the if ladder right after the first
one that is

} else if (comp >= 2 && !isl_format_layouts[format].channels.b.bits &&
          isl_format_layouts[format].channels.r.type == ISL_RAW) {
   /* comment about writing 128-bit chunks */
   return VFCOMP_NOSTORE;

+         case ISL_UINT:
+         case ISL_SINT:
+            return VFCOMP_STORE_1_INT;
+         default:
+            return VFCOMP_STORE_1_FP;
+         }
+   default:
+      unreachable("Invalid component");
    }
 }

@@ -64,8 +71,10 @@ emit_vertex_input(struct anv_pipeline *pipeline,

    /* Pull inputs_read out of the VS prog data */
    const uint64_t inputs_read = vs_prog_data->inputs_read;
+   const uint64_t double_inputs_read = vs_prog_data->double_inputs_read;
    assert((inputs_read & ((1 << VERT_ATTRIB_GENERIC0) - 1)) == 0);
    const uint32_t elements = inputs_read >> VERT_ATTRIB_GENERIC0;
+   const uint32_t elements_double = double_inputs_read >>
VERT_ATTRIB_GENERIC0;

 #if GEN_GEN >= 8
    /* On BDW+, we only need to allocate space for base ids.  Setting up
@@ -83,13 +92,16 @@ emit_vertex_input(struct anv_pipeline *pipeline,
                                 vs_prog_data->uses_baseinstance;
 #endif

-   uint32_t elem_count = __builtin_popcount(elements) + needs_svgs_elem;
-   if (elem_count == 0)
+   uint32_t elem_count = __builtin_popcount(elements) -
+      DIV_ROUND_UP(__builtin_popcount(elements_double), 2);


This had better be divisible by 2...


Yes, it is divisible by 2. Totally forgot to remove the DIV_ROUND_UP() in
the series. We'll fix it. Thanks for the catch up!

+
+   uint32_t total_elems = elem_count + needs_svgs_elem;
+   if (total_elems == 0)
       return;

    uint32_t *p;

-   const uint32_t num_dwords = 1 + elem_count * 2;
+   const uint32_t num_dwords = 1 + total_elems * 2;
    p = anv_batch_emitn(&pipeline->batch, num_dwords,
                        GENX(3DSTATE_VERTEX_ELEMENTS));
    memset(p + 1, 0, (num_dwords - 1) * 4);
@@ -107,7 +119,10 @@ emit_vertex_input(struct anv_pipeline *pipeline,
       if ((elements & (1 << desc->location)) == 0)
          continue; /* Binding unused */

-      uint32_t slot = __builtin_popcount(elements & ((1 << desc->location)
- 1));
+      uint32_t slot =
+         __builtin_popcount(elements & ((1 << desc->location) - 1)) -
+         DIV_ROUND_UP(__builtin_popcount(elements_double &
+                                        ((1 << desc->location) -1)), 2);

       struct GENX(VERTEX_ELEMENT_STATE) element = {
          .VertexBufferIndex = desc->binding,
@@ -137,7 +152,7 @@ emit_vertex_input(struct anv_pipeline *pipeline,
 #endif
    }

-   const uint32_t id_slot = __builtin_popcount(elements);
+   const uint32_t id_slot = elem_count;
    if (needs_svgs_elem) {
       /* From the Broadwell PRM for the 3D_Vertex_Component_Control enum:
        *    "Within a VERTEX_ELEMENT_STATE structure, if a Component
diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c
b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index b138cb7..c3655d8 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -472,11 +472,16 @@ brw_prepare_vertices(struct brw_context *brw)
    /* Accumulate the list of enabled arrays. */
    brw->vb.nr_enabled = 0;
    while (vs_inputs) {
-      GLuint index = ffsll(vs_inputs) - 1;
+      GLuint first = ffsll(vs_inputs) - 1;
+      GLuint index =
+         first - DIV_ROUND_UP(_mesa_bitcount_64
(vs_prog_data->double_inputs_read &
+                                                BITFIELD64_MASK(first)),
2);
       struct brw_vertex_element *input = &brw->vb.inputs[index];
       input->is_dual_slot = brw->gen >= 8 &&
-         (vs_prog_data->double_inputs_read & BITFIELD64_BIT(index)) != 0;
-      vs_inputs &= ~BITFIELD64_BIT(index);
+         (vs_prog_data->double_inputs_read & BITFIELD64_BIT(first)) != 0;
+      vs_inputs &= ~BITFIELD64_BIT(first);
+      if (input->is_dual_slot)
+         vs_inputs &= ~BITFIELD64_BIT(first + 1);
       brw->vb.enabled[brw->vb.nr_enabled++] = input;
    }

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp
b/src/mesa/drivers/dri/i965/brw_fs.cpp
index c218f56..b383d98 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -492,19 +492,6 @@ type_size_scalar(const struct glsl_type *type)
    return 0;
 }

-/* Attribute arrays are loaded as one vec4 per element (or matrix column),
- * except for double-precision types, which are loaded as one dvec4.
- */
-extern "C" int
-type_size_vs_input(const struct glsl_type *type)
-{
-   if (type->is_double()) {
-      return type_size_dvec4(type);
-   } else {
-      return type_size_vec4(type);
-   }
-}
-
 /**
  * Create a MOV to read the timestamp register.
  *
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 14415bd..5c356d6 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -36,8 +36,7 @@ fs_reg *
 fs_visitor::emit_vs_system_value(int location)
 {
    fs_reg *reg = new(this->mem_ctx)
-      fs_reg(ATTR, 4 * (_mesa_bitcount_64(nir->info->inputs_read) +
-                        _mesa_bitcount_64(nir->info->double_inputs_read)),
+      fs_reg(ATTR, 4 * _mesa_bitcount_64(nir->info->inputs_read),
              BRW_REGISTER_TYPE_D);
    struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(prog_data);

diff --git a/src/mesa/drivers/dri/i965/brw_nir.c
b/src/mesa/drivers/dri/i965/brw_nir.c
index 763e3ec..24334c8 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -113,9 +113,7 @@ remap_vs_attrs(nir_block *block, shader_info *nir_info)
          int attr = intrin->const_index[0];
          int slot = _mesa_bitcount_64(nir_info->inputs_read &
                                       BITFIELD64_MASK(attr));
-         int dslot = _mesa_bitcount_64(nir_info->double_inputs_read &
-                                       BITFIELD64_MASK(attr));
-         intrin->const_index[0] = 4 * (slot + dslot);
+         intrin->const_index[0] = 4 * slot;
       }
    }
    return true;
@@ -204,7 +202,7 @@ brw_nir_lower_vs_inputs(nir_shader *nir,
     * loaded as one vec4 or dvec4 per element (or matrix column),
depending on
     * whether it is a double-precision type or not.
     */
-   nir_lower_io(nir, nir_var_shader_in, type_size_vs_input, 0);
+   nir_lower_io(nir, nir_var_shader_in, type_size_vec4, 0);

    /* This pass needs actual constants */
    nir_opt_constant_folding(nir);
diff --git a/src/mesa/drivers/dri/i965/brw_nir.h
b/src/mesa/drivers/dri/i965/brw_nir.h
index 3c774d0..04c7ef8 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.h
+++ b/src/mesa/drivers/dri/i965/brw_nir.h
@@ -34,7 +34,6 @@ extern "C" {
 int type_size_scalar(const struct glsl_type *type);
 int type_size_vec4(const struct glsl_type *type);
 int type_size_dvec4(const struct glsl_type *type);
-int type_size_vs_input(const struct glsl_type *type);

 static inline int
 type_size_scalar_bytes(const struct glsl_type *type)
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp
b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index b9e592f..852e88f 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -2127,7 +2127,7 @@ brw_compile_vs(const struct brw_compiler *compiler,
void *log_data,
       ((1 << shader->info->cull_distance_array_size) - 1) <<
       shader->info->clip_distance_array_size;

-   unsigned nr_attributes = _mesa_bitcount_64(prog_data->inputs_read);
+   unsigned nr_attribute_slots = _mesa_bitcount_64(prog_data->inputs_read);

    /* gl_VertexID and gl_InstanceID are system values, but arrive via an
     * incoming vertex attribute.  So, add an extra slot.
@@ -2137,18 +2137,17 @@ brw_compile_vs(const struct brw_compiler *compiler,
void *log_data,
         BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE) |
         BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) |
         BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))) {
-      nr_attributes++;
+      nr_attribute_slots++;
    }

    /* gl_DrawID has its very own vec4 */
    if (shader->info->system_values_read &
        BITFIELD64_BIT(SYSTEM_VALUE_DRAW_ID)) {
-      nr_attributes++;
+      nr_attribute_slots++;
    }

-   unsigned nr_attribute_slots =
-      nr_attributes +
-      _mesa_bitcount_64(shader->info->double_inputs_read);
+   unsigned nr_attributes = nr_attribute_slots -
+      DIV_ROUND_UP(_mesa_bitcount_64(shader->info->double_inputs_read), 2);

    /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB
Entry
     * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode.  Empirically, in
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20170104/4760c632/attachment-0001.html>


More information about the mesa-dev mailing list