[Mesa-dev] [PATCH v4] nir: add varying component packing helpers

Thu Nov 30 02:54:59 UTC 2017

Tested-by: Dieter Nützel <Dieter at nuetzel-hh.de>

Dieter

Am 30.11.2017 01:20, schrieb Timothy Arceri:
> v2: update shader info input/output masks when packing components
> v3: make sure interpolation loc matches, this is required for the
>     radeonsi NIR backend.
> v4: 33dca36f4f28 fixed nir_gather_info to update outputs_read
>     correctly; make sure we also adjust this correctly when
>     packing components.
> 
> Reviewed-by: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl> (v1)
> Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com> (v3)
> ---
>  src/compiler/nir/nir.h                 |   2 +
>  src/compiler/nir/nir_linking_helpers.c | 330 
> +++++++++++++++++++++++++++++++++
>  2 files changed, 332 insertions(+)
> 
> diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
> index 4c5d976a60d..83858afe148 100644
> --- a/src/compiler/nir/nir.h
> +++ b/src/compiler/nir/nir.h
> @@ -2452,20 +2452,22 @@ void nir_lower_io_to_temporaries(nir_shader 
> *shader,
>                                   nir_function_impl *entrypoint,
>                                   bool outputs, bool inputs);
> 
>  void nir_shader_gather_info(nir_shader *shader, nir_function_impl 
> *entrypoint);
> 
>  void nir_assign_var_locations(struct exec_list *var_list, unsigned 
> *size,
>                                int (*type_size)(const struct glsl_type 
> *));
> 
>  /* Some helpers to do very simple linking */
>  bool nir_remove_unused_varyings(nir_shader *producer, nir_shader 
> *consumer);
> +void nir_compact_varyings(nir_shader *producer, nir_shader *consumer,
> +                          bool default_to_smooth_interp);
> 
>  typedef enum {
>     /* If set, this forces all non-flat fragment shader inputs to be
>      * interpolated as if with the "sample" qualifier.  This requires
>      * nir_shader_compiler_options::use_interpolated_input_intrinsics.
>      */
>     nir_lower_io_force_sample_interpolation = (1 << 1),
>  } nir_lower_io_options;
>  bool nir_lower_io(nir_shader *shader,
>                    nir_variable_mode modes,
> diff --git a/src/compiler/nir/nir_linking_helpers.c
> b/src/compiler/nir/nir_linking_helpers.c
> index 4d709c1b3c5..9f0122d4519 100644
> --- a/src/compiler/nir/nir_linking_helpers.c
> +++ b/src/compiler/nir/nir_linking_helpers.c
> @@ -166,10 +166,340 @@ nir_remove_unused_varyings(nir_shader
> *producer, nir_shader *consumer)
> 
>     bool progress = false;
>     progress = remove_unused_io_vars(producer, &producer->outputs, 
> read,
>                                      patches_read);
> 
>     progress = remove_unused_io_vars(consumer, &consumer->inputs, 
> written,
>                                      patches_written) || progress;
> 
>     return progress;
>  }
> +
> +static uint8_t
> +get_interp_type(nir_variable *var, bool default_to_smooth_interp)
> +{
> +   if (var->data.interpolation != INTERP_MODE_NONE)
> +      return var->data.interpolation;
> +   else if (default_to_smooth_interp)
> +      return INTERP_MODE_SMOOTH;
> +   else
> +      return INTERP_MODE_NONE;
> +}
> +
> +#define INTERPOLATE_LOC_SAMPLE 0
> +#define INTERPOLATE_LOC_CENTROID 1
> +#define INTERPOLATE_LOC_CENTER 2
> +
> +static uint8_t
> +get_interp_loc(nir_variable *var)
> +{
> +   if (var->data.sample)
> +      return INTERPOLATE_LOC_SAMPLE;
> +   else if (var->data.centroid)
> +      return INTERPOLATE_LOC_CENTROID;
> +   else
> +      return INTERPOLATE_LOC_CENTER;
> +}
> +
> +static void
> +get_slot_component_masks_and_interp_types(struct exec_list *var_list,
> +                                          uint8_t *comps,
> +                                          uint8_t *interp_type,
> +                                          uint8_t *interp_loc,
> +                                          gl_shader_stage stage,
> +                                          bool 
> default_to_smooth_interp)
> +{
> +   nir_foreach_variable_safe(var, var_list) {
> +      assert(var->data.location >= 0);
> +
> +      /* Only remap things that aren't built-ins.
> +       * TODO: add TES patch support.
> +       */
> +      if (var->data.location >= VARYING_SLOT_VAR0 &&
> +          var->data.location - VARYING_SLOT_VAR0 < 32) {
> +
> +         const struct glsl_type *type = var->type;
> +         if (nir_is_per_vertex_io(var, stage)) {
> +            assert(glsl_type_is_array(type));
> +            type = glsl_get_array_element(type);
> +         }
> +
> +         unsigned location = var->data.location - VARYING_SLOT_VAR0;
> +         unsigned elements =
> +            glsl_get_vector_elements(glsl_without_array(type));
> +
> +         bool dual_slot = 
> glsl_type_is_dual_slot(glsl_without_array(type));
> +         unsigned slots = glsl_count_attribute_slots(type, false);
> +         unsigned comps_slot2 = 0;
> +         for (unsigned i = 0; i < slots; i++) {
> +            interp_type[location + i] =
> +               get_interp_type(var, default_to_smooth_interp);
> +            interp_loc[location + i] = get_interp_loc(var);
> +
> +            if (dual_slot) {
> +               if (i & 1) {
> +                  comps[location + i] |= ((1 << comps_slot2) - 1);
> +               } else {
> +                  unsigned num_comps = 4 - var->data.location_frac;
> +                  comps_slot2 = (elements * 2) - num_comps;
> +
> +                  /* Assume ARB_enhanced_layouts packing rules for 
> doubles */
> +                  assert(var->data.location_frac == 0 ||
> +                         var->data.location_frac == 2);
> +                  assert(comps_slot2 <= 4);
> +
> +                  comps[location + i] |=
> +                     ((1 << num_comps) - 1) << 
> var->data.location_frac;
> +               }
> +            } else {
> +               comps[location + i] |=
> +                  ((1 << elements) - 1) << var->data.location_frac;
> +            }
> +         }
> +      }
> +   }
> +}
> +
> +struct varying_loc
> +{
> +   uint8_t component;
> +   uint32_t location;
> +};
> +
> +static void
> +remap_slots_and_components(struct exec_list *var_list, gl_shader_stage 
> stage,
> +                           struct varying_loc (*remap)[4],
> +                           uint64_t *slots_used, uint64_t 
> *out_slots_read)
> + {
> +   uint64_t out_slots_read_tmp = 0;
> +
> +   /* We don't touch builtins so just copy the bitmask */
> +   uint64_t slots_used_tmp =
> +      *slots_used & (((uint64_t)1 << (VARYING_SLOT_VAR0 - 1)) - 1);
> +
> +   nir_foreach_variable(var, var_list) {
> +      assert(var->data.location >= 0);
> +
> +      /* Only remap things that aren't built-ins */
> +      if (var->data.location >= VARYING_SLOT_VAR0 &&
> +          var->data.location - VARYING_SLOT_VAR0 < 32) {
> +         assert(var->data.location - VARYING_SLOT_VAR0 < 32);
> +         assert(remap[var->data.location - VARYING_SLOT_VAR0] >= 0);
> +
> +         const struct glsl_type *type = var->type;
> +         if (nir_is_per_vertex_io(var, stage)) {
> +            assert(glsl_type_is_array(type));
> +            type = glsl_get_array_element(type);
> +         }
> +
> +         unsigned num_slots = glsl_count_attribute_slots(type, false);
> +         bool used_across_stages = false;
> +         bool outputs_read = false;
> +
> +         unsigned location = var->data.location - VARYING_SLOT_VAR0;
> +         struct varying_loc *new_loc =
> &remap[location][var->data.location_frac];
> +         if (new_loc->location) {
> +            uint64_t slots = (((uint64_t)1 << num_slots) - 1) <<
> var->data.location;
> +            if (slots & *slots_used)
> +               used_across_stages = true;
> +
> +            if (slots & *out_slots_read)
> +               outputs_read = true;
> +
> +            var->data.location = new_loc->location;
> +            var->data.location_frac = new_loc->component;
> +         }
> +
> +         if (var->data.always_active_io) {
> +            /* We can't apply link time optimisations (specifically 
> array
> +             * splitting) to these so we need to copy the existing 
> mask
> +             * otherwise we will mess up the mask for things like 
> partially
> +             * marked arrays.
> +             */
> +            if (used_across_stages) {
> +               slots_used_tmp |=
> +                  *slots_used & (((uint64_t)1 << num_slots) - 1) <<
> var->data.location;
> +            }
> +
> +            if (outputs_read) {
> +               out_slots_read_tmp |=
> +                  *out_slots_read & (((uint64_t)1 << num_slots) - 1)
> << var->data.location;
> +            }
> +
> +         } else {
> +            for (unsigned i = 0; i < num_slots; i++) {
> +               if (used_across_stages)
> +                  slots_used_tmp |= (uint64_t)1 << (var->data.location 
> + i);
> +
> +               if (outputs_read)
> +                  out_slots_read_tmp |= (uint64_t)1 <<
> (var->data.location + i);
> +            }
> +         }
> +      }
> +   }
> +
> +   *slots_used = slots_used_tmp;
> +   *out_slots_read = out_slots_read_tmp;
> +}
> +
> +/* If there are empty components in the slot, compact the remaining 
> components
> + * as close to component 0 as possible. This will make it easier to 
> fill the
> + * empty components with components from a different slot in a 
> following pass.
> + */
> +static void
> +compact_components(nir_shader *producer, nir_shader *consumer, uint8_t 
> *comps,
> +                   uint8_t *interp_type, uint8_t *interp_loc,
> +                   bool default_to_smooth_interp)
> +{
> +   struct exec_list *input_list = &consumer->inputs;
> +   struct exec_list *output_list = &producer->outputs;
> +   struct varying_loc remap[32][4] = {{{0}, {0}}};
> +
> +   /* Create a cursor for each interpolation type */
> +   unsigned cursor[4] = {0};
> +
> +   /* We only need to pass over one stage and we choose the consumer
> as it seems
> +    * to cause a larger reduction in instruction counts (tested on 
> i965).
> +    */
> +   nir_foreach_variable(var, input_list) {
> +
> +      /* Only remap things that aren't builtins.
> +       * TODO: add TES patch support.
> +       */
> +      if (var->data.location >= VARYING_SLOT_VAR0 &&
> +          var->data.location - VARYING_SLOT_VAR0 < 32) {
> +
> +         /* We can't repack xfb varyings. */
> +         if (var->data.always_active_io)
> +            continue;
> +
> +         const struct glsl_type *type = var->type;
> +         if (nir_is_per_vertex_io(var, consumer->info.stage)) {
> +            assert(glsl_type_is_array(type));
> +            type = glsl_get_array_element(type);
> +         }
> +
> +         /* Skip types that require more complex packing handling.
> +          * TODO: add support for these types.
> +          */
> +         if (glsl_type_is_array(type) ||
> +             glsl_type_is_dual_slot(type) ||
> +             glsl_type_is_matrix(type) ||
> +             glsl_type_is_struct(type) ||
> +             glsl_type_is_64bit(type))
> +            continue;
> +
> +         /* We ignore complex types above and all other vector types 
> should
> +          * have been split into scalar variables by the 
> lower_io_to_scalar
> +          * pass. The only exception should be OpenGL xfb varyings.
> +          */
> +         if (glsl_get_vector_elements(type) != 1)
> +            continue;
> +
> +         unsigned location = var->data.location - VARYING_SLOT_VAR0;
> +         uint8_t used_comps = comps[location];
> +
> +         /* If there are no empty components there is nothing more
> for us to do.
> +          */
> +         if (used_comps == 0xf)
> +            continue;
> +
> +         bool found_new_offset = false;
> +         uint8_t interp = get_interp_type(var, 
> default_to_smooth_interp);
> +         for (; cursor[interp] < 32; cursor[interp]++) {
> +            uint8_t cursor_used_comps = comps[cursor[interp]];
> +
> +            /* We couldn't find anywhere to pack the varying, so move 
on. */
> +            if (cursor[interp] == location &&
> +                (var->data.location_frac == 0 ||
> +                 cursor_used_comps & ((1 << (var->data.location_frac)) 
> - 1)))
> +               break;
> +
> +            /* We can only pack varyings with matching interpolation 
> types */
> +            if (interp_type[cursor[interp]] != interp)
> +               continue;
> +
> +            /* Interpolation loc must match also.
> +             * TODO: i965 can handle these if they don't match, but 
> the
> +             * radeonsi nir backend handles everything as vec4s and so 
> expects
> +             * this to be the same for all components. We could make 
> this
> +             * check driver specific or drop it if NIR ever becomes 
> the only
> +             * radeonsi backend.
> +             */
> +            if (interp_loc[cursor[interp]] != get_interp_loc(var))
> +               continue;
> +
> +            /* If the slot is empty just skip it for now, 
> compact_var_list()
> +             * can be called after this function to remove empty slots 
> for us.
> +             * TODO: finish implementing compact_var_list(); this requires 
array and
> +             * matrix splitting.
> +             */
> +            if (!cursor_used_comps)
> +               continue;
> +
> +            uint8_t unused_comps = ~cursor_used_comps;
> +
> +            for (unsigned i = 0; i < 4; i++) {
> +               uint8_t new_var_comps = 1 << i;
> +               if (unused_comps & new_var_comps) {
> +                  remap[location][var->data.location_frac].component = 
> i;
> +                  remap[location][var->data.location_frac].location =
> +                     cursor[interp] + VARYING_SLOT_VAR0;
> +
> +                  found_new_offset = true;
> +
> +                  /* Turn off the mask for the component we are 
> remapping */
> +                  if (comps[location] & 1 << var->data.location_frac) 
> {
> +                     comps[location] ^= 1 << var->data.location_frac;
> +                     comps[cursor[interp]] |= new_var_comps;
> +                  }
> +                  break;
> +               }
> +            }
> +
> +            if (found_new_offset)
> +               break;
> +         }
> +      }
> +   }
> +
> +   uint64_t zero = 0;
> +   remap_slots_and_components(input_list, consumer->info.stage, remap,
> +                              &consumer->info.inputs_read, &zero);
> +   remap_slots_and_components(output_list, producer->info.stage, 
> remap,
> +                              &producer->info.outputs_written,
> +                              &producer->info.outputs_read);
> +}
> +
> +/* We assume that this has been called more-or-less directly after
> + * remove_unused_varyings.  At this point, all of the varyings that we
> + * aren't going to be using have been completely removed and the
> + * inputs_read and outputs_written fields in nir_shader_info reflect
> + * this.  Therefore, the total set of valid slots is the OR of the two
> + * sets of varyings;  this accounts for varyings which one side may 
> need
> + * to read/write even if the other doesn't.  This can happen if, for
> + * instance, an array is used indirectly from one side causing it to 
> be
> + * unsplittable but directly from the other.
> + */
> +void
> +nir_compact_varyings(nir_shader *producer, nir_shader *consumer,
> +                     bool default_to_smooth_interp)
> +{
> +   assert(producer->info.stage != MESA_SHADER_FRAGMENT);
> +   assert(consumer->info.stage != MESA_SHADER_VERTEX);
> +
> +   uint8_t comps[32] = {0};
> +   uint8_t interp_type[32] = {0};
> +   uint8_t interp_loc[32] = {0};
> +
> +   get_slot_component_masks_and_interp_types(&producer->outputs, 
> comps,
> +                                             interp_type, interp_loc,
> +                                             producer->info.stage,
> +                                             
> default_to_smooth_interp);
> +   get_slot_component_masks_and_interp_types(&consumer->inputs, comps,
> +                                             interp_type, interp_loc,
> +                                             consumer->info.stage,
> +                                             
> default_to_smooth_interp);
> +
> +   compact_components(producer, consumer, comps, interp_type, 
> interp_loc,
> +                      default_to_smooth_interp);
> +}