[Mesa-dev] [PATCH] Add an ir_triop_mid3 expression and lower it to mins and maxs.

Tue Apr 29 11:57:26 PDT 2014

On Tue, Apr 29, 2014 at 6:01 AM, Petri Latvala <petri.latvala at intel.com> wrote:
> If mid3 is called with two constants, the resulting IR was two maxes and three
> mins, when one max and one min would have sufficed. Make mid3() produce an
> ir_expression with ir_triop_mid3 (new ir_expression operation) and lower it in
> a lower_instructions pass to the needed amount of mins and maxs.
>
> Tested on i965/Haswell.
>
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=76861
> Signed-off-by: Petri Latvala <petri.latvala at intel.com>
> ---
>
> For the record, tested this with the following shader:
>
>
> #extension GL_AMD_shader_trinary_minmax : require
>
> uniform float zero;
> uniform float one;
> uniform float middle;
>
> float test_all_constants()
> {
>         return mid3(0.0, 1.0, 0.5);
> }
>
> float test_two_constants()
> {
>         return mid3(0.5, one, 0.0);
> }
>
> float test_one_constant()
> {
>         return mid3(one, zero, 0.5);
> }
>
> float test_no_constants()
> {
>         return mid3(middle, one, zero);
> }
>
> void main()
> {
>         float r = test_all_constants();
>         float g = test_two_constants();
>         float b = test_one_constant();
>         float a = test_no_constants();
>
>         gl_FragColor = vec4(r, g, b, a);
> }

Cool. Please submit this as a piglit test.

>
> total instructions in shared programs: 61 -> 57 (-6.56%)
> instructions in affected programs:     56 -> 52 (-7.14%)
>
>
> Existing piglit tests didn't stress the two-constants case at all so
> no results from there. Other than all tests passing, naturally.
>
>
>
>  src/glsl/builtin_functions.cpp                     |   2 +-
>  src/glsl/ir.cpp                                    |   2 +
>  src/glsl/ir.h                                      |   9 +-
>  src/glsl/ir_constant_expression.cpp                |  22 ++++
>  src/glsl/ir_optimization.h                         |   1 +
>  src/glsl/ir_validate.cpp                           |   6 ++
>  src/glsl/lower_instructions.cpp                    | 112 +++++++++++++++++++++
>  .../dri/i965/brw_fs_channel_expressions.cpp        |   1 +
>  src/mesa/drivers/dri/i965/brw_fs_visitor.cpp       |   6 ++
>  src/mesa/drivers/dri/i965/brw_shader.cpp           |   3 +-
>  src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp     |   3 +
>  src/mesa/main/macros.h                             |   3 +
>  src/mesa/program/ir_to_mesa.cpp                    |   5 +
>  src/mesa/state_tracker/st_glsl_to_tgsi.cpp         |   2 +
>  14 files changed, 174 insertions(+), 3 deletions(-)
>
> diff --git a/src/glsl/builtin_functions.cpp b/src/glsl/builtin_functions.cpp
> index 3991f9d..12bbfe0 100644
> --- a/src/glsl/builtin_functions.cpp
> +++ b/src/glsl/builtin_functions.cpp
> @@ -4260,7 +4260,7 @@ builtin_builder::_mid3(const glsl_type *type)
>     ir_variable *z = in_var(type, "z");
>     MAKE_SIG(type, shader_trinary_minmax, 3, x, y, z);
>
> -   ir_expression *mid3 = max2(min2(x, y), max2(min2(x, z), min2(y, z)));
> +   ir_expression *mid3 = expr(ir_triop_mid3, x, y, z);

Add a mid3 function to ir_builder and use it here.

>     body.emit(ret(mid3));
>
>     return sig;
> diff --git a/src/glsl/ir.cpp b/src/glsl/ir.cpp
> index 1a18b47..bc585d6 100644
> --- a/src/glsl/ir.cpp
> +++ b/src/glsl/ir.cpp
> @@ -436,6 +436,7 @@ ir_expression::ir_expression(int op, ir_rvalue *op0, ir_rvalue *op1,
>     case ir_triop_lrp:
>     case ir_triop_bitfield_extract:
>     case ir_triop_vector_insert:
> +   case ir_triop_mid3:
>        this->type = op0->type;
>        break;
>
> @@ -566,6 +567,7 @@ static const char *const operator_strs[] = {
>     "bfi",
>     "bitfield_extract",
>     "vector_insert",
> +   "mid3",
>     "bitfield_insert",
>     "vector",
>  };
> diff --git a/src/glsl/ir.h b/src/glsl/ir.h
> index 6c7c60a..399a4ce 100644
> --- a/src/glsl/ir.h
> +++ b/src/glsl/ir.h
> @@ -1390,9 +1390,16 @@ enum ir_expression_operation {
>     ir_triop_vector_insert,
>
>     /**
> +    * \name Yield the per-component median of three values, part of AMD_shader_trinary_minmax.
> +    */
> +   /*@{*/
> +   ir_triop_mid3,
> +   /*@}*/
> +
> +   /**
>      * A sentinel marking the last of the ternary operations.
>      */
> -   ir_last_triop = ir_triop_vector_insert,
> +   ir_last_triop = ir_triop_mid3,
>
>     ir_quadop_bitfield_insert,
>
> diff --git a/src/glsl/ir_constant_expression.cpp b/src/glsl/ir_constant_expression.cpp
> index 8afe8f7..1d4c0e5 100644
> --- a/src/glsl/ir_constant_expression.cpp
> +++ b/src/glsl/ir_constant_expression.cpp
> @@ -1575,6 +1575,28 @@ ir_expression::constant_expression_value(struct hash_table *variable_context)
>        break;
>     }
>
> +   case ir_triop_mid3: {
> +      assert(op[0]->type == op[1]->type);
> +      assert(op[0]->type == op[2]->type);
> +
> +      for (unsigned c = 0; c < components; c++) {
> +        switch (op[0]->type->base_type) {
> +        case GLSL_TYPE_UINT:
> +           data.u[c] = MID3(op[0]->value.u[c], op[1]->value.u[c], op[2]->value.u[c]);
> +           break;
> +        case GLSL_TYPE_INT:
> +           data.i[c] = MID3(op[0]->value.i[c], op[1]->value.i[c], op[2]->value.i[c]);
> +           break;
> +        case GLSL_TYPE_FLOAT:
> +           data.f[c] = MID3(op[0]->value.f[c], op[1]->value.f[c], op[2]->value.f[c]);
> +           break;
> +        default:
> +           assert(0);
> +        }
> +      }
> +      break;
> +   }
> +
>     case ir_quadop_bitfield_insert: {
>        int offset = op[2]->value.i[0];
>        int bits = op[3]->value.i[0];
> diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h
> index 40bb613..bea5ba0 100644
> --- a/src/glsl/ir_optimization.h
> +++ b/src/glsl/ir_optimization.h
> @@ -38,6 +38,7 @@
>  #define INT_DIV_TO_MUL_RCP 0x40
>  #define BITFIELD_INSERT_TO_BFM_BFI 0x80
>  #define LDEXP_TO_ARITH     0x100
> +#define MID3_TO_MIN_MAX    0x200
>
>  /**
>   * \see class lower_packing_builtins_visitor
> diff --git a/src/glsl/ir_validate.cpp b/src/glsl/ir_validate.cpp
> index 71defc8..67c711b 100644
> --- a/src/glsl/ir_validate.cpp
> +++ b/src/glsl/ir_validate.cpp
> @@ -553,6 +553,12 @@ ir_validate::visit_leave(ir_expression *ir)
>        assert(ir->type == ir->operands[0]->type);
>        break;
>
> +   case ir_triop_mid3:
> +      assert(ir->operands[0]->type == ir->type);
> +      assert(ir->operands[1]->type == ir->type);
> +      assert(ir->operands[2]->type == ir->type);
> +      break;
> +
>     case ir_quadop_bitfield_insert:
>        assert(ir->operands[0]->type == ir->type);
>        assert(ir->operands[1]->type == ir->type);
> diff --git a/src/glsl/lower_instructions.cpp b/src/glsl/lower_instructions.cpp
> index 49316d0..f42e217 100644
> --- a/src/glsl/lower_instructions.cpp
> +++ b/src/glsl/lower_instructions.cpp
> @@ -39,6 +39,7 @@
>   * - MOD_TO_FRACT
>   * - LDEXP_TO_ARITH
>   * - BITFIELD_INSERT_TO_BFM_BFI
> + * - MID3_TO_MIN_MAX
>   *
>   * SUB_TO_ADD_NEG:
>   * ---------------
> @@ -94,6 +95,11 @@
>   * Many GPUs implement the bitfieldInsert() built-in from ARB_gpu_shader_5
>   * with a pair of instructions.
>   *
> + * MID3_TO_MIN_MAX:
> + * ----------------
> + * Many GPUs don't have native a mid3 instructions. For these GPUs, convert
> + * ir_triop_mid3(x, y, z) to max(min(x, y), max(min(x, z), min(y, z))).
> + *
>   */
>
>  #include "main/core.h" /* for M_LOG2E */
> @@ -127,6 +133,7 @@ private:
>     void log_to_log2(ir_expression *);
>     void bitfield_insert_to_bfm_bfi(ir_expression *);
>     void ldexp_to_arith(ir_expression *);
> +   void mid3_to_min_max(ir_expression *);
>  };
>
>  } /* anonymous namespace */
> @@ -436,6 +443,106 @@ lower_instructions_visitor::ldexp_to_arith(ir_expression *ir)
>     this->progress = true;
>  }
>
> +void
> +lower_instructions_visitor::mid3_to_min_max(ir_expression *ir)
> +{
> +   /* Translates
> +    *    mid3 x y z
> +    * into
> +    *    max(min(x, y), max(min(x, z), min(y, z)))
> +    *
> +    * If two of the operands are constants, instead translate to
> +    *    clamp(x, y, z)
> +    * or rather,
> +    *    min(max(x, y), z)
> +    *
> +    * where y and z contain the lower and higher vector components of the
> +    * constants, respectively.
> +    *
> +    * If all three operands are constants, the former translation is done, and
> +    * constant folding optimization will handle it.
> +    */
> +
> +   assert(ir->operation == ir_triop_mid3);
> +   assert(ir->get_num_operands() == 3);
> +
> +   ir_rvalue *nonconst = NULL;
> +   ir_constant *constants[3] = { 0 };
> +   unsigned num_constants = 0;
> +
> +   for (unsigned i = 0; i < 3; ++i) {
> +      if (ir_constant *c = ir->operands[i]->constant_expression_value()) {
> +         constants[num_constants++] = c;
> +      } else {
> +         nonconst = ir->operands[i];
> +      }
> +   }
> +
> +   if (num_constants == 2) {
> +      ir_constant_data data[2];
> +
> +      memset(&data, 0, sizeof(data));
> +
> +      assert(nonconst != NULL);
> +      assert(constants[0]->type == constants[1]->type);
> +      assert(constants[0]->type == ir->type);
> +
> +      for (unsigned i = 0; i < constants[0]->type->components(); ++i) {
> +         switch (constants[0]->type->base_type) {
> +         case GLSL_TYPE_UINT:
> +            data[0].u[i] = MIN2(constants[0]->value.u[i], constants[1]->value.u[i]);
> +            data[1].u[i] = MAX2(constants[0]->value.u[i], constants[1]->value.u[i]);
> +            break;
> +         case GLSL_TYPE_INT:
> +            data[0].i[i] = MIN2(constants[0]->value.i[i], constants[1]->value.i[i]);
> +            data[1].i[i] = MAX2(constants[0]->value.i[i], constants[1]->value.i[i]);
> +            break;
> +         case GLSL_TYPE_FLOAT:
> +            data[0].f[i] = MIN2(constants[0]->value.f[i], constants[1]->value.f[i]);
> +            data[1].f[i] = MAX2(constants[0]->value.f[i], constants[1]->value.f[i]);
> +            break;
> +         default:
> +            /* unreachable */
> +            assert(0);
> +         }
> +      }
> +
> +      /* c1 is the lower valued constant, c2 is the higher */
> +      ir_constant *c1 = new(ir) ir_constant(ir->type, &data[0]);
> +      ir_constant *c2 = new(ir) ir_constant(ir->type, &data[1]);
> +
> +      ir_expression *exprmax = new(ir) ir_expression(ir_binop_max, nonconst, c1);
> +      ir->operation = ir_binop_min;
> +      ir->operands[0] = exprmax;
> +      ir->operands[1] = c2;
> +
> +      this->progress = true;
> +      return;
> +   }
> +
> +   ir_rvalue *x = ir->operands[0];
> +   ir_rvalue *y = ir->operands[1];
> +   ir_rvalue *z = ir->operands[2];
> +
> +   ir_rvalue *x2 = x->clone(ir, NULL);
> +   ir_rvalue *y2 = y->clone(ir, NULL);
> +   ir_rvalue *z2 = z->clone(ir, NULL);
> +
> +   ir_expression *firstmin = new(ir) ir_expression(ir_binop_min, x, y);
> +   ir_expression *secondmin = new(ir) ir_expression(ir_binop_min, x2, z);
> +   ir_expression *thirdmin = new(ir) ir_expression(ir_binop_min, y2, z2);
> +
> +   ir_expression *secondmax = new(ir) ir_expression(ir_binop_max, secondmin, thirdmin);
> +
> +   ir->operation = ir_binop_max;
> +   ir->operands[0] = firstmin;
> +   ir->operands[1] = secondmax;
> +
> +   this->progress = true;
> +   return;
> +}

Wouldn't it be simpler to detect constant arguments in opt_algebraic
and do the optimization there, and just perform the standard lowering
here? It seems cleaner to me.

I don't think we generate different code based on the arguments in any
other lowering pass.