[Mesa-dev] [PATCH 2/3] nir: add varying component packing helpers

Wed Oct 18 11:22:11 UTC 2017

This packing pass is relatively simple: it only packs scalar
components. To be effective, it relies on nir_lower_io_to_scalar_early()
having already split the majority of varyings into scalar components.

We may wish to add matrix and array support at some point; however,
a matrix/array splitting pass would also take care of the majority
of cases.
---
 src/compiler/nir/nir.h                 |   2 +
 src/compiler/nir/nir_linking_helpers.c | 237 +++++++++++++++++++++++++++++++++
 2 files changed, 239 insertions(+)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 70c23c2db99..e3de6c3dfe8 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -2423,6 +2423,8 @@ void nir_assign_var_locations(struct exec_list *var_list, unsigned *size,
 
 /* Some helpers to do very simple linking */
 bool nir_remove_unused_varyings(nir_shader *producer, nir_shader *consumer);
+void nir_compact_varyings(nir_shader *producer, nir_shader *consumer,
+                          bool default_to_smooth_interp);
 
 typedef enum {
    /* If set, this forces all non-flat fragment shader inputs to be
diff --git a/src/compiler/nir/nir_linking_helpers.c b/src/compiler/nir/nir_linking_helpers.c
index 5591f9be820..3de30650a2e 100644
--- a/src/compiler/nir/nir_linking_helpers.c
+++ b/src/compiler/nir/nir_linking_helpers.c
@@ -150,3 +150,240 @@ nir_remove_unused_varyings(nir_shader *producer, nir_shader *consumer)
 
    return progress;
 }
+
+/* Return the interpolation mode to use for var when comparing varyings.
+ *
+ * This is var->data.interpolation, except that INTERP_MODE_NONE is reported
+ * as INTERP_MODE_SMOOTH when default_to_smooth_interp is set.
+ */
+static uint8_t
+get_interp_type(nir_variable *var, bool default_to_smooth_interp)
+{
+   if (default_to_smooth_interp && var->data.interpolation == INTERP_MODE_NONE)
+      return INTERP_MODE_SMOOTH;
+
+   return var->data.interpolation;
+}
+
+/* Record, for every generic varying slot covered by the variables in
+ * var_list, which components are occupied and which interpolation type the
+ * slot uses.
+ *
+ * comps and interp_type are indexed by (location - VARYING_SLOT_VAR0). Only
+ * variables whose first slot lies in VARYING_SLOT_VAR0 ..
+ * VARYING_SLOT_VAR0 + 31 are considered. Component bits are OR'ed into
+ * comps, so the function can be called once per stage on the same arrays;
+ * interp_type entries are overwritten by the last variable seen for a slot.
+ *
+ * stage is the stage owning var_list; it is only used to recognise
+ * per-vertex arrays, whose outer array level is stripped before slots are
+ * counted.
+ *
+ * NOTE: no bounds check is done for a variable that starts inside the range
+ * but whose slots extend past index 31.
+ */
+static void
+get_slot_component_masks_and_interp_types(struct exec_list *var_list,
+                                          uint8_t *comps, uint8_t *interp_type,
+                                          gl_shader_stage stage,
+                                          bool default_to_smooth_interp)
+{
+   nir_foreach_variable_safe(var, var_list) {
+      assert(var->data.location >= 0);
+
+      /* Only record things that aren't built-ins.
+       * TODO: add TES patch support.
+       */
+      if (var->data.location >= VARYING_SLOT_VAR0 &&
+          var->data.location - VARYING_SLOT_VAR0 < 32) {
+
+         const struct glsl_type *type = var->type;
+         if (nir_is_per_vertex_io(var, stage)) {
+            assert(glsl_type_is_array(type));
+            type = glsl_get_array_element(type);
+         }
+
+         unsigned location = var->data.location - VARYING_SLOT_VAR0;
+         unsigned elements =
+            glsl_get_vector_elements(glsl_without_array(type));
+
+         /* Every element of a 64-bit type takes up two components. */
+         unsigned total_comps =
+            glsl_type_is_64bit(glsl_without_array(type)) ? elements * 2 :
+                                                           elements;
+
+         bool dual_slot = glsl_type_is_dual_slot(glsl_without_array(type));
+         unsigned slots = glsl_count_attribute_slots(type, false);
+
+         /* Number of components a dual-slot type spills into its second
+          * slot. It is computed while visiting the first (even) slot and
+          * consumed while visiting the following (odd) one, so it must
+          * survive across loop iterations.
+          */
+         unsigned comps_slot2 = 0;
+         for (unsigned i = 0; i < slots; i++) {
+            interp_type[location + i] =
+               get_interp_type(var, default_to_smooth_interp);
+
+            if (dual_slot) {
+               if (i & 1) {
+                  comps[location + i] |= ((1 << comps_slot2) - 1);
+               } else {
+                  unsigned num_comps = 4 - var->data.location_frac;
+                  comps_slot2 = total_comps - num_comps;
+
+                  /* Assume ARB_enhanced_layouts packing rules for doubles */
+                  assert(var->data.location_frac == 0 ||
+                         var->data.location_frac == 2);
+                  assert(comps_slot2 <= 4);
+
+                  comps[location + i] |=
+                     ((1 << num_comps) - 1) << var->data.location_frac;
+               }
+            } else {
+               comps[location + i] |=
+                  ((1 << total_comps) - 1) << var->data.location_frac;
+            }
+         }
+      }
+   }
+}
+
+/* Destination of one remapped varying component. */
+struct varying_loc {
+   uint8_t component;   /* new data.location_frac of the variable */
+   uint32_t location;   /* new data.location; 0 means "not remapped" */
+};
+
+/* Apply a relocation table to every generic varying in var_list.
+ *
+ * remap is indexed by [location - VARYING_SLOT_VAR0][location_frac]. An
+ * entry whose location field is 0 leaves the variable where it is; any
+ * other entry supplies the variable's new data.location and
+ * data.location_frac. Variables outside VARYING_SLOT_VAR0 ..
+ * VARYING_SLOT_VAR0 + 31 are never touched.
+ */
+static void
+remap_slots_and_components(struct exec_list *var_list,
+                           struct varying_loc (*remap)[4])
+{
+   nir_foreach_variable(var, var_list) {
+      assert(var->data.location >= 0);
+
+      /* Only remap things that aren't built-ins */
+      if (var->data.location < VARYING_SLOT_VAR0 ||
+          var->data.location - VARYING_SLOT_VAR0 >= 32)
+         continue;
+
+      unsigned location = var->data.location - VARYING_SLOT_VAR0;
+      struct varying_loc *new_loc = &remap[location][var->data.location_frac];
+
+      /* A zero location marks a table entry that was never filled in. */
+      if (new_loc->location) {
+         var->data.location = new_loc->location;
+         var->data.location_frac = new_loc->component;
+      }
+   }
+}
+
+/* Move scalar varyings into free components of other slots that use the
+ * same interpolation type, filling each destination slot from component 0
+ * upwards. This will make it easier to fill the empty components with
+ * components from a different slot in a following pass.
+ *
+ * Only consumer->inputs is scanned to decide what moves where; the resulting
+ * remap table is then applied to both consumer->inputs and
+ * producer->outputs.
+ *
+ * comps holds the used-component mask of each slot, indexed by
+ * (location - VARYING_SLOT_VAR0), and is updated in place as varyings are
+ * moved. interp_type holds the interpolation type of each slot with the same
+ * indexing and is only read. default_to_smooth_interp is forwarded to
+ * get_interp_type().
+ */
+static void
+compact_components(nir_shader *producer, nir_shader *consumer, uint8_t *comps,
+                   uint8_t *interp_type, bool default_to_smooth_interp)
+{
+   struct exec_list *input_list = &consumer->inputs;
+   struct exec_list *output_list = &producer->outputs;
+   struct varying_loc remap[32][4] = {{{0}, {0}}};
+
+   /* Create a cursor for each interpolation type. A cursor is the index of
+    * the next candidate destination slot for that type; it is never reset,
+    * so it only moves forward over the whole pass.
+    * NOTE(review): assumes get_interp_type() only returns values below 4 --
+    * confirm against the interpolation mode enum.
+    */
+   unsigned cursor[4] = {0};
+
+   /* We only need to pass over one stage and we choose the consumer as it seems
+    * to cause a larger reduction in instruction counts (tested on i965).
+    *
+    * TODO: However since it's possible to have more outputs than the consumer
+    * has inputs, it might be better to pack based on the outputs in some
+    * instances. More investigation could be useful here.
+    */
+   nir_foreach_variable(var, input_list) {
+
+      /* Only remap things that aren't builtins.
+       * TODO: add TES patch support.
+       */
+      if (var->data.location >= VARYING_SLOT_VAR0 &&
+          var->data.location - VARYING_SLOT_VAR0 < 32) {
+
+         const struct glsl_type *type = var->type;
+         if (nir_is_per_vertex_io(var, consumer->stage)) {
+            assert(glsl_type_is_array(type));
+            type = glsl_get_array_element(type);
+         }
+
+         /* Skip types that require more complex packing handling.
+          * TODO: add support for these types.
+          */
+         if (glsl_type_is_array(type) ||
+             glsl_type_is_dual_slot(type) ||
+             glsl_type_is_matrix(type) ||
+             glsl_type_is_struct(type) ||
+             glsl_type_is_64bit(type))
+            continue;
+
+         /* We ignore complex types above and all other vector types should
+          * have been split into scalar variables by the lower_io_to_scalar
+          * pass. The only exception should be OpenGL xfb varyings.
+          */
+         if (glsl_get_vector_elements(type) != 1)
+            continue;
+
+         unsigned location = var->data.location - VARYING_SLOT_VAR0;
+         uint8_t used_comps = comps[location];
+
+         /* If there are no empty components there is nothing more for us to do.
+          */
+         if (used_comps == 0xf)
+            continue;
+
+         bool found_new_offset = false;
+         uint8_t interp = get_interp_type(var, default_to_smooth_interp);
+         for (; cursor[interp] < 32; cursor[interp]++) {
+
+            /* The cursor has reached the varying's own slot: stop searching
+             * and leave the varying where it is.
+             */
+            if (cursor[interp] == location)
+               break;
+
+            /* We can only pack varyings with matching interpolation types */
+            if (interp_type[cursor[interp]] != interp)
+               continue;
+
+            uint8_t cursor_used_comps = comps[cursor[interp]];
+
+            /* If the slot is empty just skip it for now, compact_var_list()
+             * can be called after this function to remove empty slots for us.
+             * TODO: finish implementing compact_var_list() requires array and
+             * matrix splitting.
+             */
+            if (!cursor_used_comps)
+               continue;
+
+            uint8_t unused_comps = ~cursor_used_comps; /* only bits 0-3 are tested */
+
+            /* Take the lowest free component of the candidate slot. */
+            for (unsigned i = 0; i < 4; i++) {
+               uint8_t new_var_comps = 1 << i;
+               if (unused_comps & new_var_comps) {
+                  remap[location][var->data.location_frac].component = i;
+                  remap[location][var->data.location_frac].location =
+                     cursor[interp] + VARYING_SLOT_VAR0;
+
+                  found_new_offset = true;
+
+                  /* Turn off the mask for the component we are remapping and
+                   * mark the destination component as used. This is skipped
+                   * when the source bit is already clear.
+                   */
+                  if (comps[location] & 1 << var->data.location_frac) {
+                     comps[location] ^= 1 << var->data.location_frac;
+                     comps[cursor[interp]] |= new_var_comps;
+                  }
+                  break;
+               }
+            }
+
+            /* On success the cursor stays on the destination slot, so the
+             * next varying of this type tries the same slot first.
+             */
+            if (found_new_offset)
+               break;
+         }
+      }
+   }
+
+   remap_slots_and_components(input_list, remap);
+   remap_slots_and_components(output_list, remap);
+}
+
+/* Pack the scalar generic varyings passed from producer to consumer into
+ * the unused components of other generic varying slots.
+ *
+ * We assume that this has been called more-or-less directly after
+ * remove_unused_varyings, i.e. that all of the varyings that we aren't
+ * going to be using have already been completely removed. The set of
+ * occupied slot components is therefore the OR of the producer's outputs
+ * and the consumer's inputs; this accounts for varyings which one side may
+ * need to read/write even if the other doesn't. This can happen if, for
+ * instance, an array is used indirectly from one side causing it to be
+ * unsplittable but directly from the other.
+ *
+ * If default_to_smooth_interp is set, INTERP_MODE_NONE is treated as
+ * INTERP_MODE_SMOOTH when interpolation types are compared.
+ */
+void
+nir_compact_varyings(nir_shader *producer, nir_shader *consumer,
+                     bool default_to_smooth_interp)
+{
+   /* One entry per generic slot, indexed by location - VARYING_SLOT_VAR0 */
+   uint8_t slot_comps[32] = {0};
+   uint8_t slot_interp[32] = {0};
+
+   assert(producer->stage != MESA_SHADER_FRAGMENT);
+   assert(consumer->stage != MESA_SHADER_VERTEX);
+
+   /* Producer first, consumer second: component masks accumulate, while the
+    * interpolation type of a slot ends up being the one recorded last.
+    */
+   get_slot_component_masks_and_interp_types(&producer->outputs, slot_comps,
+                                             slot_interp, producer->stage,
+                                             default_to_smooth_interp);
+   get_slot_component_masks_and_interp_types(&consumer->inputs, slot_comps,
+                                             slot_interp, consumer->stage,
+                                             default_to_smooth_interp);
+
+   compact_components(producer, consumer, slot_comps, slot_interp,
+                      default_to_smooth_interp);
+}
-- 
2.13.6