[Mesa-dev] [PATCH 05/23] nir: Introduce new nir_intrinsic_load_per_vertex_input intrinsics.

Wed Sep 30 00:58:09 PDT 2015

Geometry and tessellation shaders process multiple vertices; their
inputs are arrays indexed by the vertex number.  While GLSL makes
this look like a normal array, it can be very different behind the
scenes.

On Intel hardware, all inputs for a particular vertex are stored
together - as if they were grouped into a single struct.  This means
that consecutive elements of these top-level arrays are not contiguous.
In fact, they may sometimes be in completely disjoint memory segments.

NIR's existing load_input intrinsics are awkward for this case, as they
distill everything down to a single offset.  We'd much rather keep the
vertex ID separate, but build up an offset as normal beyond that.

This patch introduces new nir_intrinsic_load_per_vertex_input
intrinsics to handle this case.  They work like ordinary load_input
intrinsics, but have an extra source (src[0]) which represents the
outermost array index.

Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
---
 src/glsl/nir/nir_intrinsics.h                 |  1 +
 src/glsl/nir/nir_lower_io.c                   | 36 +++++++++++++++--
 src/glsl/nir/nir_print.c                      |  2 +
 src/mesa/drivers/dri/i965/brw_nir.c           | 13 +++++-
 src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp | 58 +++++++++++----------------
 5 files changed, 69 insertions(+), 41 deletions(-)

Jason and I talked about this on IRC a while back, and we both agreed that
having a separate "vertex" source made a lot of sense.  At the time, we'd
talked about adding the extra source to the regular load_input intrinsics,
and initializing it to 0 for the VS (which kind of makes sense), as well as
the FS (which makes virtually no sense).

One downside to that is that every input load would have an extra pointless
load_const associated with it.  Another is that we'd have to edit existing
uses of load_input to know about the new source, which is more churn.

So, I decided to make a new intrinsic instead.  The original makes sense for
stages that have a single set of inputs (vertex, fragment), while the new one
makes sense for stages that process multiple objects (geometry, tessellation).

Discuss :)

I suspect for tessellation we may want to add load_patch_input intrinsics
as well, but I'm not sure yet.  I wanted to prove this idea in the GS first.

diff --git a/src/glsl/nir/nir_intrinsics.h b/src/glsl/nir/nir_intrinsics.h
index ac4c2ba..263d8c1 100644
--- a/src/glsl/nir/nir_intrinsics.h
+++ b/src/glsl/nir/nir_intrinsics.h
@@ -228,6 +228,7 @@ SYSTEM_VALUE(num_work_groups, 3, 0)
 LOAD(uniform, 0, 2, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
 LOAD(ubo, 1, 1, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
 LOAD(input, 0, 1, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
+LOAD(per_vertex_input, 1, 1, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
 LOAD(ssbo, 1, 1, NIR_INTRINSIC_CAN_ELIMINATE)
 
 /*
diff --git a/src/glsl/nir/nir_lower_io.c b/src/glsl/nir/nir_lower_io.c
index 3b0a03a..59deccb 100644
--- a/src/glsl/nir/nir_lower_io.c
+++ b/src/glsl/nir/nir_lower_io.c
@@ -136,12 +136,15 @@ load_op(struct lower_io_state *state,
       case MESA_SHADER_VERTEX:
       case MESA_SHADER_TESS_CTRL:
       case MESA_SHADER_TESS_EVAL:
-      case MESA_SHADER_GEOMETRY:
       case MESA_SHADER_FRAGMENT:
       case MESA_SHADER_COMPUTE:
          op = has_indirect ? nir_intrinsic_load_input_indirect :
                              nir_intrinsic_load_input;
          break;
+      case MESA_SHADER_GEOMETRY:
+         op = has_indirect ? nir_intrinsic_load_per_vertex_input_indirect :
+                             nir_intrinsic_load_per_vertex_input;
+         break;
       }
       break;
    case nir_var_uniform:
@@ -187,8 +190,30 @@ nir_lower_io_block(nir_block *block, void *void_state)
          load->num_components = intrin->num_components;
 
          nir_src indirect;
-         unsigned offset = get_io_offset(&intrin->variables[0]->deref,
-                                         &intrin->instr, &indirect, state);
+         nir_ssa_def *vertex = NULL;
+         nir_deref *deref = &intrin->variables[0]->deref;
+
+         /* For per-vertex input arrays (i.e. geometry shader inputs), treat
+          * the outermost array dereference as a separate vertex index.
+          * Process the dereference tail normally.
+          */
+         if (load->intrinsic == nir_intrinsic_load_per_vertex_input ||
+             load->intrinsic == nir_intrinsic_load_per_vertex_input_indirect) {
+            deref = deref->child;
+            assert(deref->deref_type == nir_deref_type_array);
+            nir_deref_array *deref_array = nir_deref_as_array(deref);
+
+            nir_builder *b = &state->builder;
+            b->cursor = nir_before_instr(instr);
+            vertex = nir_imm_int(b, deref_array->base_offset);
+            if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
+               vertex = nir_iadd(b, vertex,
+                                 nir_ssa_for_src(b, deref_array->indirect, 1));
+            }
+         }
+
+         unsigned offset =
+            get_io_offset(deref, &intrin->instr, &indirect, state);
 
          unsigned location = intrin->variables[0]->var->data.driver_location;
          if (mode == nir_var_uniform) {
@@ -198,8 +223,11 @@ nir_lower_io_block(nir_block *block, void *void_state)
             load->const_index[0] = location + offset;
          }
 
+         if (vertex)
+            load->src[0] = nir_src_for_ssa(vertex);
+
          if (has_indirect)
-            load->src[0] = indirect;
+            load->src[vertex ? 1 : 0] = indirect;
 
          if (intrin->dest.is_ssa) {
             nir_ssa_dest_init(&load->instr, &load->dest,
diff --git a/src/glsl/nir/nir_print.c b/src/glsl/nir/nir_print.c
index a19aa8b..f40c5df 100644
--- a/src/glsl/nir/nir_print.c
+++ b/src/glsl/nir/nir_print.c
@@ -443,6 +443,8 @@ print_intrinsic_instr(nir_intrinsic_instr *instr, print_state *state)
       break;
    case nir_intrinsic_load_input:
    case nir_intrinsic_load_input_indirect:
+   case nir_intrinsic_load_per_vertex_input:
+   case nir_intrinsic_load_per_vertex_input_indirect:
       var_list = &state->shader->inputs;
       break;
    case nir_intrinsic_store_output:
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index 2812fd7..64763e0 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -32,8 +32,17 @@ brw_nir_lower_inputs(nir_shader *nir,
                      const struct gl_program *prog,
                      bool is_scalar)
 {
-   nir_assign_var_locations(&nir->inputs, &nir->num_inputs,
-                            is_scalar ? type_size_scalar : type_size_vec4);
+   switch (nir->stage) {
+   case MESA_SHADER_GEOMETRY:
+      foreach_list_typed(nir_variable, var, node, &nir->inputs) {
+         var->data.driver_location = var->data.location;
+      }
+      break;
+   default:
+      nir_assign_var_locations(&nir->inputs, &nir->num_inputs,
+                               is_scalar ? type_size_scalar : type_size_vec4);
+      break;
+   }
 }
 
 static void
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp
index 64a90e5..91280de 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp
@@ -29,41 +29,6 @@ namespace brw {
 void
 vec4_gs_visitor::nir_setup_inputs(nir_shader *shader)
 {
-   nir_inputs = ralloc_array(mem_ctx, src_reg, shader->num_inputs);
-
-   foreach_list_typed(nir_variable, var, node, &shader->inputs) {
-      int offset = var->data.driver_location;
-      if (var->type->base_type == GLSL_TYPE_ARRAY) {
-         /* Geometry shader inputs are arrays, but they use an unusual array
-          * layout: instead of all array elements for a given geometry shader
-          * input being stored consecutively, all geometry shader inputs are
-          * interleaved into one giant array. At this stage of compilation, we
-          * assume that the stride of the array is BRW_VARYING_SLOT_COUNT.
-          * Later, setup_attributes() will remap our accesses to the actual
-          * input array.
-          */
-         assert(var->type->length > 0);
-         int length = var->type->length;
-         int size = type_size_vec4(var->type) / length;
-         for (int i = 0; i < length; i++) {
-            int location = var->data.location + i * BRW_VARYING_SLOT_COUNT;
-            for (int j = 0; j < size; j++) {
-               src_reg src = src_reg(ATTR, location + j, var->type);
-               src = retype(src, brw_type_for_base_type(var->type));
-               nir_inputs[offset] = src;
-               offset++;
-            }
-         }
-      } else {
-         int size = type_size_vec4(var->type);
-         for (int i = 0; i < size; i++) {
-            src_reg src = src_reg(ATTR, var->data.location + i, var->type);
-            src = retype(src, brw_type_for_base_type(var->type));
-            nir_inputs[offset] = src;
-            offset++;
-         }
-      }
-   }
 }
 
 void
@@ -96,6 +61,29 @@ vec4_gs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
    src_reg src;
 
    switch (instr->intrinsic) {
+   case nir_intrinsic_load_per_vertex_input_indirect:
+      assert(!"EmitNoIndirectInput should prevent this.");
+   case nir_intrinsic_load_per_vertex_input: {
+      /* The EmitNoIndirectInput flag guarantees our vertex index will
+       * be constant.  We should handle indirects someday.
+       */
+      nir_const_value *vertex = nir_src_as_const_value(instr->src[0]);
+
+      /* Make up a type...we have no way of knowing... */
+      const glsl_type *const type = glsl_type::ivec(instr->num_components);
+
+      src = src_reg(ATTR, BRW_VARYING_SLOT_COUNT * vertex->u[0] +
+                          instr->const_index[0], type);
+      dest = get_nir_dest(instr->dest, src.type);
+      dest.writemask = brw_writemask_for_size(instr->num_components);
+      emit(MOV(dest, src));
+      break;
+   }
+
+   case nir_intrinsic_load_input:
+   case nir_intrinsic_load_input_indirect:
+      unreachable("nir_lower_io should have produced per_vertex intrinsics");
+
    case nir_intrinsic_emit_vertex_with_counter: {
       this->vertex_count =
          retype(get_nir_src(instr->src[0], 1), BRW_REGISTER_TYPE_UD);
-- 
2.5.3