[Mesa-dev] [PATCH] Add an ir_triop_mid3 expression and lower it to mins and maxs.

Tue Apr 29 06:01:58 PDT 2014

When mid3 was called with two constants, the resulting IR was two maxes and
three mins, where one max and one min would have sufficed. Make mid3() produce
an ir_expression with ir_triop_mid3 (a new ir_expression operation) and lower
it in a lower_instructions pass to the needed number of mins and maxes.

Tested on i965/Haswell.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=76861
Signed-off-by: Petri Latvala <petri.latvala at intel.com>
---

For the record, tested this with the following shader:


#extension GL_AMD_shader_trinary_minmax : require

uniform float zero;
uniform float one;
uniform float middle;

float test_all_constants()
{
        return mid3(0.0, 1.0, 0.5);
}

float test_two_constants()
{
        return mid3(0.5, one, 0.0);
}

float test_one_constant()
{
        return mid3(one, zero, 0.5);
}

float test_no_constants()
{
        return mid3(middle, one, zero);
}

void main()
{
        float r = test_all_constants();
        float g = test_two_constants();
        float b = test_one_constant();
        float a = test_no_constants();

        gl_FragColor = vec4(r, g, b, a);
}


total instructions in shared programs: 61 -> 57 (-6.56%)
instructions in affected programs:     56 -> 52 (-7.14%)


Existing piglit tests didn't exercise the two-constants case at all, so there
are no results from there, other than all tests still passing, naturally.



 src/glsl/builtin_functions.cpp                     |   2 +-
 src/glsl/ir.cpp                                    |   2 +
 src/glsl/ir.h                                      |   9 +-
 src/glsl/ir_constant_expression.cpp                |  22 ++++
 src/glsl/ir_optimization.h                         |   1 +
 src/glsl/ir_validate.cpp                           |   6 ++
 src/glsl/lower_instructions.cpp                    | 112 +++++++++++++++++++++
 .../dri/i965/brw_fs_channel_expressions.cpp        |   1 +
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp       |   6 ++
 src/mesa/drivers/dri/i965/brw_shader.cpp           |   3 +-
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp     |   3 +
 src/mesa/main/macros.h                             |   3 +
 src/mesa/program/ir_to_mesa.cpp                    |   5 +
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp         |   2 +
 14 files changed, 174 insertions(+), 3 deletions(-)

diff --git a/src/glsl/builtin_functions.cpp b/src/glsl/builtin_functions.cpp
index 3991f9d..12bbfe0 100644
--- a/src/glsl/builtin_functions.cpp
+++ b/src/glsl/builtin_functions.cpp
@@ -4260,7 +4260,7 @@ builtin_builder::_mid3(const glsl_type *type)
    ir_variable *z = in_var(type, "z");
    MAKE_SIG(type, shader_trinary_minmax, 3, x, y, z);
 
-   ir_expression *mid3 = max2(min2(x, y), max2(min2(x, z), min2(y, z)));
+   ir_expression *mid3 = expr(ir_triop_mid3, x, y, z);
    body.emit(ret(mid3));
 
    return sig;
diff --git a/src/glsl/ir.cpp b/src/glsl/ir.cpp
index 1a18b47..bc585d6 100644
--- a/src/glsl/ir.cpp
+++ b/src/glsl/ir.cpp
@@ -436,6 +436,7 @@ ir_expression::ir_expression(int op, ir_rvalue *op0, ir_rvalue *op1,
    case ir_triop_lrp:
    case ir_triop_bitfield_extract:
    case ir_triop_vector_insert:
+   case ir_triop_mid3:
       this->type = op0->type;
       break;
 
@@ -566,6 +567,7 @@ static const char *const operator_strs[] = {
    "bfi",
    "bitfield_extract",
    "vector_insert",
+   "mid3",
    "bitfield_insert",
    "vector",
 };
diff --git a/src/glsl/ir.h b/src/glsl/ir.h
index 6c7c60a..399a4ce 100644
--- a/src/glsl/ir.h
+++ b/src/glsl/ir.h
@@ -1390,9 +1390,16 @@ enum ir_expression_operation {
    ir_triop_vector_insert,
 
    /**
+    * \name Yield the per-component median of three values, part of AMD_shader_trinary_minmax.
+    */
+   /*@{*/
+   ir_triop_mid3,
+   /*@}*/
+
+   /**
     * A sentinel marking the last of the ternary operations.
     */
-   ir_last_triop = ir_triop_vector_insert,
+   ir_last_triop = ir_triop_mid3,
 
    ir_quadop_bitfield_insert,
 
diff --git a/src/glsl/ir_constant_expression.cpp b/src/glsl/ir_constant_expression.cpp
index 8afe8f7..1d4c0e5 100644
--- a/src/glsl/ir_constant_expression.cpp
+++ b/src/glsl/ir_constant_expression.cpp
@@ -1575,6 +1575,28 @@ ir_expression::constant_expression_value(struct hash_table *variable_context)
       break;
    }
 
+   case ir_triop_mid3: {
+      assert(op[0]->type == op[1]->type);
+      assert(op[0]->type == op[2]->type);
+
+      for (unsigned c = 0; c < components; c++) {
+	 switch (op[0]->type->base_type) {
+	 case GLSL_TYPE_UINT:
+	    data.u[c] = MID3(op[0]->value.u[c], op[1]->value.u[c], op[2]->value.u[c]);
+	    break;
+	 case GLSL_TYPE_INT:
+	    data.i[c] = MID3(op[0]->value.i[c], op[1]->value.i[c], op[2]->value.i[c]);
+	    break;
+	 case GLSL_TYPE_FLOAT:
+	    data.f[c] = MID3(op[0]->value.f[c], op[1]->value.f[c], op[2]->value.f[c]);
+	    break;
+	 default:
+	    assert(0);
+	 }
+      }
+      break;
+   }
+
    case ir_quadop_bitfield_insert: {
       int offset = op[2]->value.i[0];
       int bits = op[3]->value.i[0];
diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h
index 40bb613..bea5ba0 100644
--- a/src/glsl/ir_optimization.h
+++ b/src/glsl/ir_optimization.h
@@ -38,6 +38,7 @@
 #define INT_DIV_TO_MUL_RCP 0x40
 #define BITFIELD_INSERT_TO_BFM_BFI 0x80
 #define LDEXP_TO_ARITH     0x100
+#define MID3_TO_MIN_MAX    0x200
 
 /**
  * \see class lower_packing_builtins_visitor
diff --git a/src/glsl/ir_validate.cpp b/src/glsl/ir_validate.cpp
index 71defc8..67c711b 100644
--- a/src/glsl/ir_validate.cpp
+++ b/src/glsl/ir_validate.cpp
@@ -553,6 +553,12 @@ ir_validate::visit_leave(ir_expression *ir)
       assert(ir->type == ir->operands[0]->type);
       break;
 
+   case ir_triop_mid3:
+      assert(ir->operands[0]->type == ir->type);
+      assert(ir->operands[1]->type == ir->type);
+      assert(ir->operands[2]->type == ir->type);
+      break;
+
    case ir_quadop_bitfield_insert:
       assert(ir->operands[0]->type == ir->type);
       assert(ir->operands[1]->type == ir->type);
diff --git a/src/glsl/lower_instructions.cpp b/src/glsl/lower_instructions.cpp
index 49316d0..f42e217 100644
--- a/src/glsl/lower_instructions.cpp
+++ b/src/glsl/lower_instructions.cpp
@@ -39,6 +39,7 @@
  * - MOD_TO_FRACT
  * - LDEXP_TO_ARITH
  * - BITFIELD_INSERT_TO_BFM_BFI
+ * - MID3_TO_MIN_MAX
  *
  * SUB_TO_ADD_NEG:
  * ---------------
@@ -94,6 +95,11 @@
  * Many GPUs implement the bitfieldInsert() built-in from ARB_gpu_shader_5
  * with a pair of instructions.
  *
+ * MID3_TO_MIN_MAX:
+ * ----------------
+ * Many GPUs don't have a native mid3 instruction. For these GPUs, convert
+ * ir_triop_mid3(x, y, z) to max(min(x, y), max(min(x, z), min(y, z))).
+ *
  */
 
 #include "main/core.h" /* for M_LOG2E */
@@ -127,6 +133,7 @@ private:
    void log_to_log2(ir_expression *);
    void bitfield_insert_to_bfm_bfi(ir_expression *);
    void ldexp_to_arith(ir_expression *);
+   void mid3_to_min_max(ir_expression *);
 };
 
 } /* anonymous namespace */
@@ -436,6 +443,106 @@ lower_instructions_visitor::ldexp_to_arith(ir_expression *ir)
    this->progress = true;
 }
 
+void
+lower_instructions_visitor::mid3_to_min_max(ir_expression *ir)
+{
+   /* Translates
+    *    mid3 x y z
+    * into
+    *    max(min(x, y), max(min(x, z), min(y, z)))
+    *
+    * If two of the operands are constants, instead translate to
+    *    clamp(x, y, z)
+    * or rather,
+    *    min(max(x, y), z)
+    *
+    * where y and z hold the per-component minimum and maximum of the two
+    * constants, respectively.
+    *
+    * If all three operands are constants, the general min/max translation is
+    * used, and the constant folding optimization will handle it.
+    */
+
+   assert(ir->operation == ir_triop_mid3);
+   assert(ir->get_num_operands() == 3);
+
+   ir_rvalue *nonconst = NULL;
+   ir_constant *constants[3] = { 0 };
+   unsigned num_constants = 0;
+
+   for (unsigned i = 0; i < 3; ++i) {
+      if (ir_constant *c = ir->operands[i]->constant_expression_value()) {
+         constants[num_constants++] = c;
+      } else {
+         nonconst = ir->operands[i];
+      }
+   }
+
+   if (num_constants == 2) {
+      ir_constant_data data[2];
+
+      memset(&data, 0, sizeof(data));
+
+      assert(nonconst != NULL);
+      assert(constants[0]->type == constants[1]->type);
+      assert(constants[0]->type == ir->type);
+
+      for (unsigned i = 0; i < constants[0]->type->components(); ++i) {
+         switch (constants[0]->type->base_type) {
+         case GLSL_TYPE_UINT:
+            data[0].u[i] = MIN2(constants[0]->value.u[i], constants[1]->value.u[i]);
+            data[1].u[i] = MAX2(constants[0]->value.u[i], constants[1]->value.u[i]);
+            break;
+         case GLSL_TYPE_INT:
+            data[0].i[i] = MIN2(constants[0]->value.i[i], constants[1]->value.i[i]);
+            data[1].i[i] = MAX2(constants[0]->value.i[i], constants[1]->value.i[i]);
+            break;
+         case GLSL_TYPE_FLOAT:
+            data[0].f[i] = MIN2(constants[0]->value.f[i], constants[1]->value.f[i]);
+            data[1].f[i] = MAX2(constants[0]->value.f[i], constants[1]->value.f[i]);
+            break;
+         default:
+            /* unreachable */
+            assert(0);
+         }
+      }
+
+      /* c1 is the lower valued constant, c2 is the higher */
+      ir_constant *c1 = new(ir) ir_constant(ir->type, &data[0]);
+      ir_constant *c2 = new(ir) ir_constant(ir->type, &data[1]);
+
+      ir_expression *exprmax = new(ir) ir_expression(ir_binop_max, nonconst, c1);
+      ir->operation = ir_binop_min;
+      ir->operands[0] = exprmax;
+      ir->operands[1] = c2;
+
+      this->progress = true;
+      return;
+   }
+
+   ir_rvalue *x = ir->operands[0];
+   ir_rvalue *y = ir->operands[1];
+   ir_rvalue *z = ir->operands[2];
+
+   ir_rvalue *x2 = x->clone(ir, NULL);
+   ir_rvalue *y2 = y->clone(ir, NULL);
+   ir_rvalue *z2 = z->clone(ir, NULL);
+
+   ir_expression *firstmin = new(ir) ir_expression(ir_binop_min, x, y);
+   ir_expression *secondmin = new(ir) ir_expression(ir_binop_min, x2, z);
+   ir_expression *thirdmin = new(ir) ir_expression(ir_binop_min, y2, z2);
+
+   ir_expression *secondmax = new(ir) ir_expression(ir_binop_max, secondmin, thirdmin);
+
+   ir->operation = ir_binop_max;
+   ir->operands[0] = firstmin;
+   ir->operands[1] = secondmax;
+
+   this->progress = true;
+   return;
+}
+
+
 ir_visitor_status
 lower_instructions_visitor::visit_leave(ir_expression *ir)
 {
@@ -482,6 +589,11 @@ lower_instructions_visitor::visit_leave(ir_expression *ir)
          ldexp_to_arith(ir);
       break;
 
+   case ir_triop_mid3:
+      if (lowering(MID3_TO_MIN_MAX))
+         mid3_to_min_max(ir);
+      break;
+
    default:
       return visit_continue;
    }
diff --git a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
index ae5bc56..db33b68 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
@@ -410,6 +410,7 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
    case ir_binop_ldexp:
    case ir_binop_vector_extract:
    case ir_triop_vector_insert:
+   case ir_triop_mid3:
    case ir_quadop_bitfield_insert:
    case ir_quadop_vector:
       assert(!"should have been lowered");
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 2aa3acd..7488c8e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -808,7 +808,13 @@ fs_visitor::visit(ir_expression *ir)
       inst = emit(BRW_OPCODE_SEL, this->result, op[1], op[2]);
       inst->predicate = BRW_PREDICATE_NORMAL;
       break;
+
+   case ir_triop_mid3:
+      assert(!"not reached: should be handled by "
+              "lower_instructions::mid3_to_min_max");
+      break;
    }
+
 }
 
 void
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 6e74803..604fdb7 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -154,7 +154,8 @@ brw_link_shader(struct gl_context *ctx, struct gl_shader_program *shProg)
 			 EXP_TO_EXP2 |
 			 LOG_TO_LOG2 |
                          bitfield_insert |
-                         LDEXP_TO_ARITH);
+                         LDEXP_TO_ARITH |
+                         MID3_TO_MIN_MAX);
 
       /* Pre-gen6 HW can only nest if-statements 16 deep.  Beyond this,
        * if-statements need to be flattened.
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 8fa0aee..0aa8975 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -1721,6 +1721,9 @@ vec4_visitor::visit(ir_expression *ir)
    case ir_binop_ldexp:
       assert(!"not reached: should be handled by ldexp_to_arith()");
       break;
+   case ir_triop_mid3:
+      assert(!"not reached: should be handled by mid3_to_min_max()");
+      break;
    }
 }
 
diff --git a/src/mesa/main/macros.h b/src/mesa/main/macros.h
index 5228c3a..12ad287 100644
--- a/src/mesa/main/macros.h
+++ b/src/mesa/main/macros.h
@@ -678,6 +678,9 @@ INTERP_4F(GLfloat t, GLfloat dst[4], const GLfloat out[4], const GLfloat in[4])
 #define MIN3( A, B, C ) ((A) < (B) ? MIN2(A, C) : MIN2(B, C))
 #define MAX3( A, B, C ) ((A) > (B) ? MAX2(A, C) : MAX2(B, C))
 
+/** Median of three values: */
+#define MID3( A, B, C ) ((A) < (B) ? CLAMP(C, A, B) : CLAMP(C, B, A))
+
 static inline unsigned
 minify(unsigned value, unsigned levels)
 {
diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp
index 59cf123..88e2073 100644
--- a/src/mesa/program/ir_to_mesa.cpp
+++ b/src/mesa/program/ir_to_mesa.cpp
@@ -1450,6 +1450,10 @@ ir_to_mesa_visitor::visit(ir_expression *ir)
       emit(ir, OPCODE_LRP, result_dst, op[2], op[1], op[0]);
       break;
 
+   case ir_triop_mid3:
+      assert(!"not reached: should be handled by mid3_to_min_max");
+      break;
+
    case ir_binop_vector_extract:
    case ir_binop_bfm:
    case ir_triop_fma:
@@ -3002,6 +3006,7 @@ _mesa_ir_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
 	 do_mat_op_to_vec(ir);
 	 lower_instructions(ir, (MOD_TO_FRACT | DIV_TO_MUL_RCP | EXP_TO_EXP2
 				 | LOG_TO_LOG2 | INT_DIV_TO_MUL_RCP
+				 | MID3_TO_MIN_MAX
 				 | ((options->EmitNoPow) ? POW_TO_EXP2 : 0)));
 
 	 progress = do_lower_jumps(ir, true, true, options->EmitNoMainReturn, options->EmitNoCont, options->EmitNoLoops) || progress;
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index d1c3856..4a61b86 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -2001,6 +2001,7 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
    case ir_binop_ldexp:
    case ir_binop_carry:
    case ir_binop_borrow:
+   case ir_triop_mid3:
       /* This operation is not supported, or should have already been handled.
        */
       assert(!"Invalid ir opcode in glsl_to_tgsi_visitor::visit()");
@@ -5396,6 +5397,7 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
                          EXP_TO_EXP2 |
                          LOG_TO_LOG2 |
                          LDEXP_TO_ARITH |
+                         MID3_TO_MIN_MAX |
                          (options->EmitNoPow ? POW_TO_EXP2 : 0) |
                          (!ctx->Const.NativeIntegers ? INT_DIV_TO_MUL_RCP : 0));
 
-- 
1.9.0