[Mesa-dev] [RFC 2/2] nir: rewrite varying component packing

Tue Dec 4 03:53:39 UTC 2018

There are three reasons for the rewrite.

1. Adding support for packing tess patch varyings in a sane way.

2. Making use of qsort, which makes the code much easier to
   follow.

3. Adding a crude live range analysis for deciding which components
   should be packed together.

We could simplify get_unmoveable_components_masks() a bit more, as
we just skip these slots if we come across them, but I've left it for
now in case we want to pack components in these slots in the future.

vkshader-db RADV (VEGA):

Totals from affected shaders:
SGPRS: 103384 -> 103880 (0.48 %)
VGPRS: 70384 -> 70072 (-0.44 %)
Spilled SGPRs: 116 -> 112 (-3.45 %)
Spilled VGPRs: 0 -> 0 (0.00 %)
Private memory VGPRs: 0 -> 0 (0.00 %)
Scratch size: 0 -> 0 (0.00 %) dwords per thread
Code Size: 4875040 -> 4905848 (0.63 %) bytes
LDS: 0 -> 0 (0.00 %) blocks
Max Waves: 18777 -> 18861 (0.45 %)
Wait states: 0 -> 0 (0.00 %)
---
 src/compiler/nir/nir_linking_helpers.c | 368 +++++++++++++++++--------
 1 file changed, 259 insertions(+), 109 deletions(-)

diff --git a/src/compiler/nir/nir_linking_helpers.c b/src/compiler/nir/nir_linking_helpers.c
index 845aba5c87..bf31509cf0 100644
--- a/src/compiler/nir/nir_linking_helpers.c
+++ b/src/compiler/nir/nir_linking_helpers.c
@@ -224,22 +224,43 @@ get_interp_loc(nir_variable *var)
       return INTERPOLATE_LOC_CENTER;
 }
 
+static bool
+is_packing_supported_for_type(const struct glsl_type *type)
+{
+   /* Skip types that require more complex packing handling.
+    * TODO: add support for these types?
+    */
+   if (glsl_type_is_array(type) ||
+       glsl_type_is_dual_slot(type) ||
+       glsl_type_is_matrix(type) ||
+       glsl_type_is_struct(type) ||
+       glsl_type_is_64bit(type))
+      return false;
+
+   /* We ignore complex types above and all other vector types should
+    * have been split into scalar variables by the lower_io_to_scalar
+    * pass. The only exception should be OpenGL xfb varyings.
+    */
+   if (glsl_get_vector_elements(type) != 1)
+      return false;
+
+   return true; /* single-element type with none of the exclusions above */
+}
+
+/* For varyings that cannot be packed, this function marks the components we
+ * cannot move.
+ */
 static void
-get_slot_component_masks_and_interp_types(struct exec_list *var_list,
-                                          uint8_t *comps,
-                                          uint8_t *interp_type,
-                                          uint8_t *interp_loc,
-                                          gl_shader_stage stage,
-                                          bool default_to_smooth_interp)
+get_unmoveable_components_masks(struct exec_list *var_list, uint8_t *comps,
+                                gl_shader_stage stage,
+                                bool default_to_smooth_interp)
 {
    nir_foreach_variable_safe(var, var_list) {
       assert(var->data.location >= 0);
 
-      /* Only remap things that aren't built-ins.
-       * TODO: add TES patch support.
-       */
+      /* Only remap things that aren't built-ins. */
       if (var->data.location >= VARYING_SLOT_VAR0 &&
-          var->data.location - VARYING_SLOT_VAR0 < 32) {
+          var->data.location - VARYING_SLOT_VAR0 < MAX_VARYINGS_INCL_PATCH) {
 
          const struct glsl_type *type = var->type;
          if (nir_is_per_vertex_io(var, stage)) {
@@ -247,6 +268,12 @@ get_slot_component_masks_and_interp_types(struct exec_list *var_list,
             type = glsl_get_array_element(type);
          }
 
+         /* If we can pack this varying then don't mark the components as
+          * used.
+          */
+         if (is_packing_supported_for_type(type))
+            continue;
+
          unsigned location = var->data.location - VARYING_SLOT_VAR0;
          unsigned elements =
             glsl_get_vector_elements(glsl_without_array(type));
@@ -255,10 +282,6 @@ get_slot_component_masks_and_interp_types(struct exec_list *var_list,
          unsigned slots = glsl_count_attribute_slots(type, false);
          unsigned comps_slot2 = 0;
          for (unsigned i = 0; i < slots; i++) {
-            interp_type[location + i] =
-               get_interp_type(var, type, default_to_smooth_interp);
-            interp_loc[location + i] = get_interp_loc(var);
-
             if (dual_slot) {
                if (i & 1) {
                   comps[location + i] |= ((1 << comps_slot2) - 1);
@@ -390,32 +413,83 @@ remap_slots_and_components(struct exec_list *var_list, gl_shader_stage stage,
    *p_out_slots_read = out_slots_read_tmp[1];
 }
 
-/* If there are empty components in the slot compact the remaining components
- * as close to component 0 as possible. This will make it easier to fill the
- * empty components with components from a different slot in a following pass.
- */
+struct varying_component {
+   nir_variable *var;         /* input variable this entry describes */
+   unsigned first_block_use;  /* lowest block index using a load of it */
+   unsigned last_block_use;   /* highest block index using a load of it */
+   uint8_t interp_type;       /* value returned by get_interp_type() */
+   uint8_t interp_loc;        /* value returned by get_interp_loc() */
+   bool is_patch;             /* copy of var->data.patch */
+   bool initialised;          /* set once a block use has been recorded */
+};
+
+static int
+cmp_varying_component(const void *comp1_v, const void *comp2_v)
+{
+   struct varying_component *comp1 = (struct varying_component *) comp1_v;
+   struct varying_component *comp2 = (struct varying_component *) comp2_v;
+
+   /* We want patches to be ordered at the end of the array */
+   if (comp1->is_patch != comp2->is_patch)
+      return comp1->is_patch ? 1 : -1;
+
+   /* We can only pack varyings with matching interpolation types so group
+    * them together.
+    */
+   if (comp1->interp_type != comp2->interp_type)
+      return comp1->interp_type - comp2->interp_type;
+
+   /* Interpolation loc must match also. */
+   if (comp1->interp_loc != comp2->interp_loc)
+      return comp1->interp_loc - comp2->interp_loc;
+
+   /* Attempt to reduce register pressure with crude live range analysis */
+   if (comp1->first_block_use != comp2->first_block_use)
+      return comp1->first_block_use - comp2->first_block_use;
+   if (comp1->last_block_use != comp2->last_block_use)
+      return comp1->last_block_use - comp2->last_block_use;
+
+   /* Fall back to the original location. NOTE(review): assumes var is set */
+   return comp1->var->data.location - comp2->var->data.location;
+}
+
 static void
-compact_components(nir_shader *producer, nir_shader *consumer, uint8_t *comps,
-                   uint8_t *interp_type, uint8_t *interp_loc,
-                   bool default_to_smooth_interp)
+set_block_use(struct varying_component *vc_info, nir_src *src,
+              bool is_if_condition)
 {
-   struct exec_list *input_list = &consumer->inputs;
-   struct exec_list *output_list = &producer->outputs;
-   struct varying_loc remap[MAX_VARYINGS_INCL_PATCH][4] = {{{0}, {0}}};
+   nir_block *blk
+      = nir_cursor_current_block(nir_before_src(src, is_if_condition));
+
+   if (vc_info->initialised) {
+      if (vc_info->first_block_use > blk->index)
+         vc_info->first_block_use = blk->index;
+
+      if (vc_info->last_block_use < blk->index)
+         vc_info->last_block_use = blk->index;
+   } else {
+      vc_info->first_block_use = blk->index;
+      vc_info->last_block_use = blk->index;
+      vc_info->initialised = true;
+   }
+}
 
-   /* Create a cursor for each interpolation type */
-   unsigned cursor[4] = {0};
+static void
+gather_varying_component_info(nir_shader *consumer,
+                              struct varying_component **varying_comp_info,
+                              unsigned *varying_comp_info_size,
+                              bool default_to_smooth_interp)
+{
+   unsigned store_varying_info_idx[MAX_VARYINGS_INCL_PATCH][4] = {0};
+   unsigned num_of_comps_to_pack = 0;
 
-   /* We only need to pass over one stage and we choose the consumer as it seems
-    * to cause a larger reduction in instruction counts (tested on i965).
+   /* Count the number of varyings that can be packed and create a mapping
+    * of those varyings to 1-based indices in the array we pass to qsort.
     */
-   nir_foreach_variable(var, input_list) {
+   nir_foreach_variable(var, &consumer->inputs) {
 
-      /* Only remap things that aren't builtins.
-       * TODO: add TES patch support.
-       */
+      /* Only remap things that aren't builtins. */
       if (var->data.location >= VARYING_SLOT_VAR0 &&
-          var->data.location - VARYING_SLOT_VAR0 < 32) {
+          var->data.location - VARYING_SLOT_VAR0 < MAX_VARYINGS_INCL_PATCH) {
 
          /* We can't repack xfb varyings. */
          if (var->data.always_active_io)
@@ -427,88 +501,159 @@ compact_components(nir_shader *producer, nir_shader *consumer, uint8_t *comps,
             type = glsl_get_array_element(type);
          }
 
-         /* Skip types that require more complex packing handling.
-          * TODO: add support for these types.
-          */
-         if (glsl_type_is_array(type) ||
-             glsl_type_is_dual_slot(type) ||
-             glsl_type_is_matrix(type) ||
-             glsl_type_is_struct(type) ||
-             glsl_type_is_64bit(type))
+         if (!is_packing_supported_for_type(type))
             continue;
 
-         /* We ignore complex types above and all other vector types should
-          * have been split into scalar variables by the lower_io_to_scalar
-          * pass. The only exeption should by OpenGL xfb varyings.
-          */
-         if (glsl_get_vector_elements(type) != 1)
+         unsigned loc = var->data.location - VARYING_SLOT_VAR0;
+         store_varying_info_idx[loc][var->data.location_frac] =
+            ++num_of_comps_to_pack;
+      }
+   }
+
+   *varying_comp_info_size = num_of_comps_to_pack;
+   *varying_comp_info = rzalloc_array(NULL, struct varying_component,
+                                      num_of_comps_to_pack);
+
+   nir_function_impl *impl = nir_shader_get_entrypoint(consumer);
+
+   /* Walk over the shader and populate the varying component info array */
+   nir_foreach_block(block, impl) {
+      nir_foreach_instr(instr, block) {
+         if (instr->type != nir_instr_type_intrinsic)
             continue;
 
-         unsigned location = var->data.location - VARYING_SLOT_VAR0;
-         uint8_t used_comps = comps[location];
+         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+         if (intr->intrinsic != nir_intrinsic_load_deref)
+            continue;
 
-         /* If there are no empty components there is nothing more for us to do.
-          */
-         if (used_comps == 0xf)
+         nir_variable *in_var =
+            nir_deref_instr_get_variable(nir_src_as_deref(intr->src[0]));
+
+         if (in_var->data.mode != nir_var_shader_in)
             continue;
 
-         bool found_new_offset = false;
-         uint8_t interp = get_interp_type(var, type, default_to_smooth_interp);
-         for (; cursor[interp] < 32; cursor[interp]++) {
-            uint8_t cursor_used_comps = comps[cursor[interp]];
+         /* We only remap things that aren't builtins. */
+         if (in_var->data.location < VARYING_SLOT_VAR0)
+            continue;
 
-            /* We couldn't find anywhere to pack the varying continue on. */
-            if (cursor[interp] == location &&
-                (var->data.location_frac == 0 ||
-                 cursor_used_comps & ((1 << (var->data.location_frac)) - 1)))
-               break;
+         unsigned location = in_var->data.location - VARYING_SLOT_VAR0;
+         if (location >= MAX_VARYINGS_INCL_PATCH)
+            continue;
 
-            /* We can only pack varyings with matching interpolation types */
-            if (interp_type[cursor[interp]] != interp)
-               continue;
+         unsigned var_info_idx =
+            store_varying_info_idx[location][in_var->data.location_frac];
+         if (!var_info_idx)
+            continue;
 
-            /* Interpolation loc must match also.
-             * TODO: i965 can handle these if they don't match, but the
-             * radeonsi nir backend handles everything as vec4s and so expects
-             * this to be the same for all components. We could make this
-             * check driver specfific or drop it if NIR ever become the only
-             * radeonsi backend.
-             */
-            if (interp_loc[cursor[interp]] != get_interp_loc(var))
-               continue;
+         struct varying_component *vc_info =
+            &(*varying_comp_info)[var_info_idx-1];
 
-            /* If the slot is empty just skip it for now, compact_var_list()
-             * can be called after this function to remove empty slots for us.
-             * TODO: finish implementing compact_var_list() requires array and
-             * matrix splitting.
-             */
-            if (!cursor_used_comps)
-               continue;
+         if (!vc_info->initialised) {
+            const struct glsl_type *type = in_var->type;
+            if (nir_is_per_vertex_io(in_var, consumer->info.stage)) {
+               assert(glsl_type_is_array(type));
+               type = glsl_get_array_element(type);
+            }
 
-            uint8_t unused_comps = ~cursor_used_comps;
+            vc_info->var = in_var;
+            vc_info->interp_type =
+               get_interp_type(in_var, type, default_to_smooth_interp);
+            vc_info->interp_loc = get_interp_loc(in_var);
+            vc_info->is_patch = in_var->data.patch;
+         }
 
-            for (unsigned i = 0; i < 4; i++) {
-               uint8_t new_var_comps = 1 << i;
-               if (unused_comps & new_var_comps) {
-                  remap[location][var->data.location_frac].component = i;
-                  remap[location][var->data.location_frac].location =
-                     cursor[interp] + VARYING_SLOT_VAR0;
+         nir_foreach_use(src, &intr->dest.ssa) {
+            set_block_use(vc_info, src, false);
+         }
 
-                  found_new_offset = true;
+         nir_foreach_if_use(src, &intr->dest.ssa) {
+            set_block_use(vc_info, src, true);
+         }
+      }
+   }
+}
 
-                  /* Turn off the mask for the component we are remapping */
-                  if (comps[location] & 1 << var->data.location_frac) {
-                     comps[location] ^= 1 << var->data.location_frac;
-                     comps[cursor[interp]] |= new_var_comps;
-                  }
-                  break;
-               }
-            }
+/* If there are empty components in the slot compact the remaining components
+ * as close to component 0 as possible. This will make it easier to fill the
+ * empty components with components from a different slot in a following pass.
+ */
+static void
+compact_components(nir_shader *producer, nir_shader *consumer, uint8_t *comps,
+                   bool default_to_smooth_interp)
+{
+   struct exec_list *input_list = &consumer->inputs;
+   struct exec_list *output_list = &producer->outputs;
+   struct varying_loc remap[MAX_VARYINGS_INCL_PATCH][4] = {{{0}, {0}}};
+   struct varying_component *varying_comp_info;
+   unsigned varying_comp_info_size;
 
-            if (found_new_offset)
-               break;
+   /* Gather varying component info */
+   gather_varying_component_info(consumer, &varying_comp_info,
+                                 &varying_comp_info_size,
+                                 default_to_smooth_interp);
+
+   /* Sort varying components. This attempts to reduce register pressure by
+    * packing components with a similar live range.
+    */
+   qsort(varying_comp_info, varying_comp_info_size,
+         sizeof(struct varying_component), cmp_varying_component);
+
+   unsigned cursor = 0;
+   unsigned comp = 0;
+   struct varying_component *prev_info = NULL;
+
+   /* Set the remap array based on the sorted components */
+   for (unsigned i = 0; i < varying_comp_info_size; i++ ) {
+      struct varying_component *info = &varying_comp_info[i];
+
+      /* Set the cursor to start at MAX_VARYING (32) for patches */
+      if (info->is_patch) {
+         if (!prev_info)
+            cursor = MAX_VARYING;
+         else if (info->is_patch != prev_info->is_patch)
+            cursor = MAX_VARYING;
+      }
+
+      for (; cursor < MAX_VARYINGS_INCL_PATCH; cursor++) {
+         if (comps[cursor]) {
+            comp = 0;
+            continue;
          }
+
+         /* We can only pack varyings with matching interpolation types */
+         if (prev_info && comp != 0 &&
+             info->interp_type != prev_info->interp_type) {
+            comp = 0;
+            continue;
+         }
+
+         /* Interpolation loc must match also.
+          * TODO: i965 can handle these if they don't match, but the
+          * radeonsi nir backend handles everything as vec4s and so expects
+          * this to be the same for all components. We could make this
+          * check driver specific or drop it if NIR ever becomes the only
+          * radeonsi backend.
+          */
+         if (prev_info && comp != 0 &&
+             info->interp_loc != prev_info->interp_loc) {
+            comp = 0;
+            continue;
+         }
+
+         unsigned location = info->var->data.location - VARYING_SLOT_VAR0;
+         remap[location][info->var->data.location_frac].component = comp++;
+         remap[location][info->var->data.location_frac].location =
+            cursor + VARYING_SLOT_VAR0;
+
+         if (comp == 4) {
+            cursor++;
+            comp = 0;
+         }
+
+         break;
       }
+
+      prev_info = info;
    }
 
    uint64_t zero = 0;
@@ -537,24 +682,29 @@ void
 nir_compact_varyings(nir_shader *producer, nir_shader *consumer,
                      bool default_to_smooth_interp)
 {
+   nir_function_impl *p_impl = nir_shader_get_entrypoint(producer);
+   nir_function_impl *c_impl = nir_shader_get_entrypoint(consumer);
+
+   nir_metadata_require(p_impl, nir_metadata_block_index);
+   nir_metadata_require(c_impl, nir_metadata_block_index);
+
    assert(producer->info.stage != MESA_SHADER_FRAGMENT);
    assert(consumer->info.stage != MESA_SHADER_VERTEX);
 
-   uint8_t comps[32] = {0};
-   uint8_t interp_type[32] = {0};
-   uint8_t interp_loc[32] = {0};
+   uint8_t unmoveable_comps[MAX_VARYINGS_INCL_PATCH] = {0};
 
-   get_slot_component_masks_and_interp_types(&producer->outputs, comps,
-                                             interp_type, interp_loc,
-                                             producer->info.stage,
-                                             default_to_smooth_interp);
-   get_slot_component_masks_and_interp_types(&consumer->inputs, comps,
-                                             interp_type, interp_loc,
-                                             consumer->info.stage,
-                                             default_to_smooth_interp);
+   get_unmoveable_components_masks(&producer->outputs, unmoveable_comps,
+                                   producer->info.stage,
+                                   default_to_smooth_interp);
+   get_unmoveable_components_masks(&consumer->inputs, unmoveable_comps,
+                                   consumer->info.stage,
+                                   default_to_smooth_interp);
 
-   compact_components(producer, consumer, comps, interp_type, interp_loc,
+   compact_components(producer, consumer, unmoveable_comps,
                       default_to_smooth_interp);
+
+   nir_metadata_preserve(p_impl, nir_metadata_block_index);
+   nir_metadata_preserve(c_impl, nir_metadata_block_index);
 }
 
 /*
-- 
2.19.1