[Mesa-dev] [PATCH 4/5] glsl: Vectorize multiple scalar assignments

Thu Jan 9 11:28:23 PST 2014

On 01/08/2014 12:43 PM, Matt Turner wrote:
> Reduces vertex shader instruction counts in DOTA2 by 6.42%, L4D2 by
> 4.61%, and CS:GO by 5.71%.
> 
> total instructions in shared programs: 1500153 -> 1498191 (-0.13%)
> instructions in affected programs:     59919 -> 57957 (-3.27%)
> ---
>  src/glsl/Makefile.sources       |   1 +
>  src/glsl/glsl_parser_extras.cpp |   4 +
>  src/glsl/ir_optimization.h      |   1 +
>  src/glsl/opt_vectorize.cpp      | 319 ++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 325 insertions(+)
>  create mode 100644 src/glsl/opt_vectorize.cpp
> 
> diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
> index 2e81ded..e69c1ac 100644
> --- a/src/glsl/Makefile.sources
> +++ b/src/glsl/Makefile.sources
> @@ -99,6 +99,7 @@ LIBGLSL_FILES = \
>  	$(GLSL_SRCDIR)/opt_structure_splitting.cpp \
>  	$(GLSL_SRCDIR)/opt_swizzle_swizzle.cpp \
>  	$(GLSL_SRCDIR)/opt_tree_grafting.cpp \
> +	$(GLSL_SRCDIR)/opt_vectorize.cpp \
>  	$(GLSL_SRCDIR)/s_expression.cpp \
>  	$(GLSL_SRCDIR)/strtod.c
>  
> diff --git a/src/glsl/glsl_parser_extras.cpp b/src/glsl/glsl_parser_extras.cpp
> index c759569..3db7eaa 100644
> --- a/src/glsl/glsl_parser_extras.cpp
> +++ b/src/glsl/glsl_parser_extras.cpp
> @@ -1594,6 +1594,10 @@ do_common_optimization(exec_list *ir, bool linked,
>     if (options->OptimizeForAOS && !linked)
>        progress = opt_flip_matrices(ir) || progress;
>  
> +   if (linked && options->OptimizeForAOS) {
> +      progress = do_vectorize(ir) || progress;
> +   }
> +
>     if (linked)
>        progress = do_dead_code(ir, uniform_locations_assigned) || progress;
>     else
> diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h
> index 3ca9f57..055d655 100644
> --- a/src/glsl/ir_optimization.h
> +++ b/src/glsl/ir_optimization.h
> @@ -98,6 +98,7 @@ bool do_mat_op_to_vec(exec_list *instructions);
>  bool do_noop_swizzle(exec_list *instructions);
>  bool do_structure_splitting(exec_list *instructions);
>  bool do_swizzle_swizzle(exec_list *instructions);
> +bool do_vectorize(exec_list *instructions);
>  bool do_tree_grafting(exec_list *instructions);
>  bool do_vec_index_to_cond_assign(exec_list *instructions);
>  bool do_vec_index_to_swizzle(exec_list *instructions);
> diff --git a/src/glsl/opt_vectorize.cpp b/src/glsl/opt_vectorize.cpp
> new file mode 100644
> index 0000000..9ca811a
> --- /dev/null
> +++ b/src/glsl/opt_vectorize.cpp
> @@ -0,0 +1,319 @@
> +/*
> + * Copyright © 2013 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> + * DEALINGS IN THE SOFTWARE.
> + */
> +
> +/**
> + * \file opt_vectorize.cpp
> + *
> + * Combines scalar assignments of the same expression (modulo swizzle) to
> + * multiple channels of the same variable into a single vectorized expression
> + * and assignment.
> + *
> + * Many generated shaders contain scalarized code. That is, they contain
> + *
> + * r1.x = log2(v0.x);
> + * r1.y = log2(v0.y);
> + * r1.z = log2(v0.z);
> + *
> + * rather than
> + *
> + * r1.xyz = log2(v0.xyz);
> + *
> + * We look for consecutive assignments of the same expression (modulo swizzle)
> + * to each channel of the same variable.
> + *
> + * For instance, we want to convert these three scalar operations
> + *
> + * (assign (x) (var_ref r1) (expression float log2 (swiz x (var_ref v0))))
> + * (assign (y) (var_ref r1) (expression float log2 (swiz y (var_ref v0))))
> + * (assign (z) (var_ref r1) (expression float log2 (swiz z (var_ref v0))))
> + *
> + * into a single vector operation
> + *
> + * (assign (xyz) (var_ref r1) (expression vec3 log2 (swiz xyz (var_ref v0))))

I think it's worth adding a note that this pass only attempts to combine
assignments that are sequential, i.e. directly adjacent in the instruction
stream.  The above example gets fully vectorized, but this interleaved
sequence would not:

(assign (x) (var_ref r1) (expression float log2 (swiz x (var_ref v0))))
(assign (x) (var_ref r2) (expression float log2 (swiz y (var_ref v0))))
(assign (y) (var_ref r1) (expression float log2 (swiz z (var_ref v0))))
(assign (y) (var_ref r2) (expression float log2 (swiz w (var_ref v0))))

I think this will also break on code like

(assign (x) (var_ref r1) (expression float log2 (swiz w (var_ref r1))))
(assign (y) (var_ref r1) (expression float log2 (swiz z (var_ref r1))))
# r1.xy have different values now.
(assign (z) (var_ref r1) (expression float log2 (swiz y (var_ref r1))))
(assign (w) (var_ref r1) (expression float log2 (swiz x (var_ref r1))))

Maybe just skip assignments where the LHS variable also appears in the RHS
for now?  Or does the write_mask_matches_swizzle check already take care of
this?

> + */
> +
> +#include "ir.h"
> +#include "ir_visitor.h"
> +#include "ir_optimization.h"
> +#include "glsl_types.h"
> +#include "program/prog_instruction.h"
> +
> +namespace {
> +/**
> + * Hierarchical visitor that tracks single-channel assignments in a
> + * four-entry, per-channel table and merges them in try_vectorize().
> + */
> +class ir_vectorize_visitor : public ir_hierarchical_visitor {
> +public:
> +   void clear() /* Resets all tracking state; ::progress is left untouched. */
> +   {
> +      assignment[0] = NULL;
> +      assignment[1] = NULL;
> +      assignment[2] = NULL;
> +      assignment[3] = NULL;
> +      current_assignment = NULL;
> +      last_assignment = NULL;
> +      channels = 0;
> +      has_swizzle = false;
> +   }
> +
> +   ir_vectorize_visitor()
> +   {
> +      clear();
> +      progress = false;
> +   }
> +
> +   virtual ir_visitor_status visit_enter(ir_assignment *);
> +   virtual ir_visitor_status visit_enter(ir_swizzle *);
> +
> +   virtual ir_visitor_status visit_leave(ir_assignment *);
> +
> +   void try_vectorize();
> +
> +   ir_assignment *assignment[4]; /* saved assignment for each channel slot */
> +   ir_assignment *current_assignment, *last_assignment; /* being visited / most recently saved */
> +   unsigned channels; /* number of assignments saved since the last clear() */
> +   bool has_swizzle; /* a swizzle matching the current write mask was seen */
> +
> +   bool progress; /* set by try_vectorize() once assignments were combined */
> +};
> +
> +} /* unnamed namespace */
> +
> +/**
> + * Rewrites the swizzles and types of a right-hand side of an assignment.
> + *
> + * From the example above, this function would be called (by visit_tree()) on
> + * the nodes of the tree (expression float log2 (swiz z   (var_ref v0))),
> + * rewriting it into     (expression vec3  log2 (swiz xyz (var_ref v0))).
> + *
> + * The function modifies only ir_expressions and ir_swizzles; every other
> + * node type is left untouched. For expressions it sets a new type and
> + * swizzles any scalar dereferences into appropriately sized vector
> + * arguments. For example, if combining
> + *
> + * (assign (x) (var_ref r1) (expression float + (swiz x (var_ref v0)) (var_ref v1)))
> + * (assign (y) (var_ref r1) (expression float + (swiz y (var_ref v0)) (var_ref v1)))
> + *
> + * where v1 is a scalar, rewrite_swizzle() would insert a swizzle on
> + * (var_ref v1) such that the final result was
> + *
> + * (assign (xy) (var_ref r1) (expression vec2 + (swiz xy (var_ref v0))
> + *                                              (swiz xx (var_ref v1))))
> + *
> + * For swizzles, it sets a new type, and if the variable being swizzled is a
> + * vector it overwrites the swizzle mask with the ir_swizzle_mask passed as the
> + * data parameter. If the swizzled variable is scalar, then the swizzle was
> + * added by an earlier call to rewrite_swizzle() on an expression, so the
> + * mask should not be modified.
> + *
> + * \param ir    Node currently being visited.
> + * \param data  Pointer to the ir_swizzle_mask to apply; its num_components
> + *              field gives the width of the rewritten types.
> + *
> + * NOTE(review): the expression type is widened unconditionally -- confirm
> + * this is valid for every operation that can reach here (e.g. ones whose
> + * result width is not tied to their operand width).
> + */
> +static void
> +rewrite_swizzle(ir_instruction *ir, void *data)
> +{
> +   ir_swizzle_mask *mask = (ir_swizzle_mask *)data;
> +
> +   switch (ir->ir_type) {
> +   case ir_type_swizzle: {
> +      ir_swizzle *swz = (ir_swizzle *)ir;
> +      if (swz->val->type->is_vector()) { /* scalar sources keep their mask */
> +         swz->mask = *mask;
> +      }
> +      swz->type = glsl_type::get_instance(swz->type->base_type,
> +                                          mask->num_components, 1);
> +      break;
> +   }
> +   case ir_type_expression: {
> +      ir_expression *expr = (ir_expression *)ir;
> +      expr->type = glsl_type::get_instance(expr->type->base_type,
> +                                           mask->num_components, 1);
> +      for (unsigned i = 0; i < 4; i++) { /* assumes operands[] has 4 slots -- TODO confirm */
> +         if (expr->operands[i]) {
> +            ir_dereference *deref = expr->operands[i]->as_dereference();
> +            if (deref && deref->type->is_scalar()) { /* broadcast scalar operand */
> +               expr->operands[i] = new(ir) ir_swizzle(deref, 0, 0, 0, 0,
> +                                                      mask->num_components);
> +            }
> +         }
> +      }
> +      break;
> +   }
> +   default:
> +      break;
> +   }
> +}
> +
> +/**
> + * Attempt to vectorize the previously saved assignments, and clear them from
> + * consideration.
> + *
> + * If the assignments are able to be combined (an assignment was saved and
> + * more than one channel was collected), it modifies in-place the last
> + * assignment seen to be an equivalent vector form of the scalar assignments.
> + * It then removes the other now obsolete scalar assignments and sets
> + * ::progress. The tracking state is cleared in every case.
> + *
> + * NOTE(review): the swizzle mask handed to rewrite_swizzle() is always built
> + * from the initializer {0, 1, 2, 3, channels, 0}, whereas the write mask is
> + * built from whichever ::assignment[] slots are occupied. If the saved
> + * channels are not the leading ones (e.g. only x and z), the two look like
> + * they disagree -- confirm.
> + */
> +void
> +ir_vectorize_visitor::try_vectorize()
> +{
> +   if (this->last_assignment && this->channels > 1) {
> +      ir_swizzle_mask mask = {0, 1, 2, 3, channels, 0};
> +
> +      visit_tree(this->last_assignment->rhs, rewrite_swizzle, &mask);
> +
> +      this->last_assignment->write_mask = 0; /* rebuilt from the occupied slots below */
> +
> +      for (unsigned i = 0; i < 4; i++) {
> +         if (this->assignment[i]) {
> +            this->last_assignment->write_mask |= 1 << i;
> +
> +            if (this->assignment[i] != this->last_assignment) { /* the survivor stays in the list */
> +               this->assignment[i]->remove();
> +            }
> +         }
> +      }
> +
> +      this->progress = true;
> +   }
> +   clear();
> +}
> +
> +/**
> + * Returns whether the write mask is a single channel.
> + */
> +static bool
> +single_channel_write_mask(unsigned write_mask)
> +{
> +   return write_mask != 0 && (write_mask & (write_mask - 1)) == 0;

Maybe

   return _mesa_bitcount(write_mask) == 1;

That will be faster (and smaller) on CPUs with a popcount instruction,
but worse everywhere else...  Of course, GCC may be smart enough to
generate that for you.  Dunno.

> +}
> +
> +/**
> + * Translates single-channeled write mask to single-channeled swizzle.
> + *
> + * \param write_mask  Exactly one of WRITEMASK_X/Y/Z/W; any other value falls
> + *                    out of the switch and trips the assert below.
> + * \return The SWIZZLE_X/Y/Z/W value paired with that channel.
> + */
> +static unsigned
> +write_mask_to_swizzle(unsigned write_mask)
> +{
> +   switch (write_mask) {
> +   case WRITEMASK_X: return SWIZZLE_X;
> +   case WRITEMASK_Y: return SWIZZLE_Y;
> +   case WRITEMASK_Z: return SWIZZLE_Z;
> +   case WRITEMASK_W: return SWIZZLE_W;
> +   }
> +   assert(!"not reached");
> +   unreachable();
> +}
> +
> +/**
> + * Returns whether a single-channeled write mask matches a swizzle.
> + *
> + * Only the first component of the swizzle (mask.x) is compared against the
> + * channel named by the write mask.
> + *
> + * \param write_mask  Write mask to test; a value that is not exactly one of
> + *                    WRITEMASK_X/Y/Z/W never matches.
> + * \param swz         Swizzle whose first component is examined.
> + */
> +static bool
> +write_mask_matches_swizzle(unsigned write_mask,
> +                           const ir_swizzle *swz)
> +{
> +   return ((write_mask == WRITEMASK_X && swz->mask.x == SWIZZLE_X) ||
> +           (write_mask == WRITEMASK_Y && swz->mask.x == SWIZZLE_Y) ||
> +           (write_mask == WRITEMASK_Z && swz->mask.x == SWIZZLE_Z) ||
> +           (write_mask == WRITEMASK_W && swz->mask.x == SWIZZLE_W));
> +}
> +
> +/**
> + * Upon entering an ir_assignment, attempt to vectorize the currently tracked
> + * assignments if the current assignment is not suitable. Keep a pointer to
> + * the current assignment.
> + *
> + * The tracked assignments are flushed when the new assignment is conditional,
> + * four channels have already been collected, it does not write exactly one
> + * channel, or its left- or right-hand side differs from the last saved
> + * assignment's. The right-hand sides are compared with
> + * equals(rhs, ir_type_swizzle), presumably so that swizzle nodes are ignored
> + * ("modulo swizzle") -- TODO confirm against the equals() implementation.
> + *
> + * NOTE(review): ::current_assignment is set even when the assignment just
> + * failed the checks above (e.g. it is conditional), so visit_leave() can
> + * still save it -- confirm this is intended.
> + *
> + * \return visit_continue in every case.
> + */
> +ir_visitor_status
> +ir_vectorize_visitor::visit_enter(ir_assignment *ir)
> +{
> +   ir_dereference *lhs = this->last_assignment != NULL ?
> +                         this->last_assignment->lhs : NULL;
> +   ir_rvalue *rhs = this->last_assignment != NULL ?
> +                    this->last_assignment->rhs : NULL;
> +
> +   if (ir->condition ||
> +       this->channels >= 4 ||
> +       !single_channel_write_mask(ir->write_mask) ||
> +       (lhs && !ir->lhs->equals(lhs)) ||
> +       (rhs && !ir->rhs->equals(rhs, ir_type_swizzle))) {
> +      try_vectorize();
> +   }
> +
> +   this->current_assignment = ir;
> +
> +   return visit_continue;
> +}
> +
> +/**
> + * Upon entering an ir_swizzle, set ::has_swizzle if we're visiting from an
> + * ir_assignment (i.e., that ::current_assignment is set) and the swizzle mask
> + * matches the current assignment's write mask.
> + *
> + * If the write mask doesn't match the swizzle mask, remove the current
> + * assignment from further consideration. Because visit_leave() requires both
> + * ::has_swizzle and ::current_assignment, one mismatching swizzle rejects the
> + * assignment even if a matching swizzle was seen before it.
> + *
> + * \return visit_continue in every case.
> + */
> +ir_visitor_status
> +ir_vectorize_visitor::visit_enter(ir_swizzle *ir)
> +{
> +   if (this->current_assignment) {
> +      if (write_mask_matches_swizzle(this->current_assignment->write_mask, ir)) {
> +         this->has_swizzle = true;
> +      } else {
> +         this->current_assignment = NULL;
> +      }
> +   }
> +   return visit_continue;
> +}
> +
> +/**
> + * Upon leaving an ir_assignment, save a pointer to it in ::assignment[] if
> + * the swizzle mask(s) found were appropriate. Also save a pointer in
> + * ::last_assignment so that we can compare future assignments with it.
> + *
> + * Finally, clear ::current_assignment and ::has_swizzle.
> + *
> + * NOTE(review): a second saved assignment to the same channel overwrites the
> + * ::assignment[] slot while ::channels is still incremented -- confirm that
> + * cannot happen, or that try_vectorize() copes with it.
> + *
> + * \return visit_continue in every case.
> + */
> +ir_visitor_status
> +ir_vectorize_visitor::visit_leave(ir_assignment *ir)
> +{
> +   if (this->has_swizzle && this->current_assignment) {
> +      assert(this->current_assignment == ir);
> +
> +      unsigned channel = write_mask_to_swizzle(this->current_assignment->write_mask);
> +      this->assignment[channel] = ir; /* slot index is the channel's swizzle value */
> +      this->channels++;
> +
> +      this->last_assignment = this->current_assignment;
> +   }
> +   this->current_assignment = NULL;
> +   this->has_swizzle = false;
> +   return visit_continue;
> +}
> +
> +/**
> + * Combines scalar assignments of the same expression (modulo swizzle) to
> + * multiple channels of the same variable into a single vectorized expression
> + * and assignment.
> + *
> + * \param instructions  IR list to run the pass over.
> + * \return true if at least one group of assignments was combined.
> + */
> +bool
> +do_vectorize(exec_list *instructions)
> +{
> +   ir_vectorize_visitor v;
> +
> +   v.run(instructions);
> +
> +   /* Flush the assignments still being tracked when the walk ends; during
> +    * the walk a flush only happens on entering a later assignment.
> +    */
> +   v.try_vectorize();
> +
> +   return v.progress;
> +}
>