[Mesa-dev] [PATCH] Add an ir_triop_mid3 expression and lower it to mins and maxs.
Matt Turner
mattst88 at gmail.com
Tue Apr 29 11:57:26 PDT 2014
On Tue, Apr 29, 2014 at 6:01 AM, Petri Latvala <petri.latvala at intel.com> wrote:
> If mid3 is called with two constants, the resulting IR was two maxes and three
> mins, when one max and one min would have sufficed. Make mid3() produce an
> ir_expression with ir_triop_mid3 (new ir_expression operation) and lower it in
> a lower_instructions pass to the needed amount of mins and maxs.
>
> Tested on i965/Haswell.
>
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=76861
> Signed-off-by: Petri Latvala <petri.latvala at intel.com>
> ---
>
> For the record, tested this with the following shader:
>
>
> #extension GL_AMD_shader_trinary_minmax : require
>
> uniform float zero;
> uniform float one;
> uniform float middle;
>
> float test_all_constants()
> {
> return mid3(0.0, 1.0, 0.5);
> }
>
> float test_two_constants()
> {
> return mid3(0.5, one, 0.0);
> }
>
> float test_one_constant()
> {
> return mid3(one, zero, 0.5);
> }
>
> float test_no_constants()
> {
> return mid3(middle, one, zero);
> }
>
> void main()
> {
> float r = test_all_constants();
> float g = test_two_constants();
> float b = test_one_constant();
> float a = test_no_constants();
>
> gl_FragColor = vec4(r, g, b, a);
> }
Cool. Please submit this as a piglit test.
>
> total instructions in shared programs: 61 -> 57 (-6.56%)
> instructions in affected programs: 56 -> 52 (-7.14%)
>
>
> Existing piglit tests didn't stress the two-constants case at all so
> no results from there. Other than all tests passing, naturally.
>
>
>
> src/glsl/builtin_functions.cpp | 2 +-
> src/glsl/ir.cpp | 2 +
> src/glsl/ir.h | 9 +-
> src/glsl/ir_constant_expression.cpp | 22 ++++
> src/glsl/ir_optimization.h | 1 +
> src/glsl/ir_validate.cpp | 6 ++
> src/glsl/lower_instructions.cpp | 112 +++++++++++++++++++++
> .../dri/i965/brw_fs_channel_expressions.cpp | 1 +
> src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 6 ++
> src/mesa/drivers/dri/i965/brw_shader.cpp | 3 +-
> src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 3 +
> src/mesa/main/macros.h | 3 +
> src/mesa/program/ir_to_mesa.cpp | 5 +
> src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 2 +
> 14 files changed, 174 insertions(+), 3 deletions(-)
>
> diff --git a/src/glsl/builtin_functions.cpp b/src/glsl/builtin_functions.cpp
> index 3991f9d..12bbfe0 100644
> --- a/src/glsl/builtin_functions.cpp
> +++ b/src/glsl/builtin_functions.cpp
> @@ -4260,7 +4260,7 @@ builtin_builder::_mid3(const glsl_type *type)
> ir_variable *z = in_var(type, "z");
> MAKE_SIG(type, shader_trinary_minmax, 3, x, y, z);
>
> - ir_expression *mid3 = max2(min2(x, y), max2(min2(x, z), min2(y, z)));
> + ir_expression *mid3 = expr(ir_triop_mid3, x, y, z);
Add a mid3 function to ir_builder and use it here.
> body.emit(ret(mid3));
>
> return sig;
> diff --git a/src/glsl/ir.cpp b/src/glsl/ir.cpp
> index 1a18b47..bc585d6 100644
> --- a/src/glsl/ir.cpp
> +++ b/src/glsl/ir.cpp
> @@ -436,6 +436,7 @@ ir_expression::ir_expression(int op, ir_rvalue *op0, ir_rvalue *op1,
> case ir_triop_lrp:
> case ir_triop_bitfield_extract:
> case ir_triop_vector_insert:
> + case ir_triop_mid3:
> this->type = op0->type;
> break;
>
> @@ -566,6 +567,7 @@ static const char *const operator_strs[] = {
> "bfi",
> "bitfield_extract",
> "vector_insert",
> + "mid3",
> "bitfield_insert",
> "vector",
> };
> diff --git a/src/glsl/ir.h b/src/glsl/ir.h
> index 6c7c60a..399a4ce 100644
> --- a/src/glsl/ir.h
> +++ b/src/glsl/ir.h
> @@ -1390,9 +1390,16 @@ enum ir_expression_operation {
> ir_triop_vector_insert,
>
> /**
> + * \name Yield the per-component median of three values, part of AMD_shader_trinary_minmax.
> + */
> + /*@{*/
> + ir_triop_mid3,
> + /*@}*/
> +
> + /**
> * A sentinel marking the last of the ternary operations.
> */
> - ir_last_triop = ir_triop_vector_insert,
> + ir_last_triop = ir_triop_mid3,
>
> ir_quadop_bitfield_insert,
>
> diff --git a/src/glsl/ir_constant_expression.cpp b/src/glsl/ir_constant_expression.cpp
> index 8afe8f7..1d4c0e5 100644
> --- a/src/glsl/ir_constant_expression.cpp
> +++ b/src/glsl/ir_constant_expression.cpp
> @@ -1575,6 +1575,28 @@ ir_expression::constant_expression_value(struct hash_table *variable_context)
> break;
> }
>
> + case ir_triop_mid3: {
> + assert(op[0]->type == op[1]->type);
> + assert(op[0]->type == op[2]->type);
> +
> + for (unsigned c = 0; c < components; c++) {
> + switch (op[0]->type->base_type) {
> + case GLSL_TYPE_UINT:
> + data.u[c] = MID3(op[0]->value.u[c], op[1]->value.u[c], op[2]->value.u[c]);
> + break;
> + case GLSL_TYPE_INT:
> + data.i[c] = MID3(op[0]->value.i[c], op[1]->value.i[c], op[2]->value.i[c]);
> + break;
> + case GLSL_TYPE_FLOAT:
> + data.f[c] = MID3(op[0]->value.f[c], op[1]->value.f[c], op[2]->value.f[c]);
> + break;
> + default:
> + assert(0);
> + }
> + }
> + break;
> + }
> +
> case ir_quadop_bitfield_insert: {
> int offset = op[2]->value.i[0];
> int bits = op[3]->value.i[0];
> diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h
> index 40bb613..bea5ba0 100644
> --- a/src/glsl/ir_optimization.h
> +++ b/src/glsl/ir_optimization.h
> @@ -38,6 +38,7 @@
> #define INT_DIV_TO_MUL_RCP 0x40
> #define BITFIELD_INSERT_TO_BFM_BFI 0x80
> #define LDEXP_TO_ARITH 0x100
> +#define MID3_TO_MIN_MAX 0x200
>
> /**
> * \see class lower_packing_builtins_visitor
> diff --git a/src/glsl/ir_validate.cpp b/src/glsl/ir_validate.cpp
> index 71defc8..67c711b 100644
> --- a/src/glsl/ir_validate.cpp
> +++ b/src/glsl/ir_validate.cpp
> @@ -553,6 +553,12 @@ ir_validate::visit_leave(ir_expression *ir)
> assert(ir->type == ir->operands[0]->type);
> break;
>
> + case ir_triop_mid3:
> + assert(ir->operands[0]->type == ir->type);
> + assert(ir->operands[1]->type == ir->type);
> + assert(ir->operands[2]->type == ir->type);
> + break;
> +
> case ir_quadop_bitfield_insert:
> assert(ir->operands[0]->type == ir->type);
> assert(ir->operands[1]->type == ir->type);
> diff --git a/src/glsl/lower_instructions.cpp b/src/glsl/lower_instructions.cpp
> index 49316d0..f42e217 100644
> --- a/src/glsl/lower_instructions.cpp
> +++ b/src/glsl/lower_instructions.cpp
> @@ -39,6 +39,7 @@
> * - MOD_TO_FRACT
> * - LDEXP_TO_ARITH
> * - BITFIELD_INSERT_TO_BFM_BFI
> + * - MID3_TO_MIN_MAX
> *
> * SUB_TO_ADD_NEG:
> * ---------------
> @@ -94,6 +95,11 @@
> * Many GPUs implement the bitfieldInsert() built-in from ARB_gpu_shader_5
> * with a pair of instructions.
> *
> + * MID3_TO_MIN_MAX:
> + * ----------------
> + * Many GPUs don't have native a mid3 instructions. For these GPUs, convert
> + * ir_triop_mid3(x, y, z) to max(min(x, y), max(min(x, z), min(y, z))).
> + *
> */
>
> #include "main/core.h" /* for M_LOG2E */
> @@ -127,6 +133,7 @@ private:
> void log_to_log2(ir_expression *);
> void bitfield_insert_to_bfm_bfi(ir_expression *);
> void ldexp_to_arith(ir_expression *);
> + void mid3_to_min_max(ir_expression *);
> };
>
> } /* anonymous namespace */
> @@ -436,6 +443,106 @@ lower_instructions_visitor::ldexp_to_arith(ir_expression *ir)
> this->progress = true;
> }
>
> +void
> +lower_instructions_visitor::mid3_to_min_max(ir_expression *ir)
> +{
> + /* Translates
> + * mid3 x y z
> + * into
> + * max(min(x, y), max(min(x, z), min(y, z)))
> + *
> + * If two of the operands are constants, instead translate to
> + * clamp(x, y, z)
> + * or rather,
> + * min(max(x, y), z)
> + *
> + * where y and z contain the lower and higher vector components of the
> + * constants, respectively.
> + *
> + * If all three operands are constants, the former translation is done, and
> + * constant folding optimization will handle it.
> + */
> +
> + assert(ir->operation == ir_triop_mid3);
> + assert(ir->get_num_operands() == 3);
> +
> + ir_rvalue *nonconst = NULL;
> + ir_constant *constants[3] = { 0 };
> + unsigned num_constants = 0;
> +
> + for (unsigned i = 0; i < 3; ++i) {
> + if (ir_constant *c = ir->operands[i]->constant_expression_value()) {
> + constants[num_constants++] = c;
> + } else {
> + nonconst = ir->operands[i];
> + }
> + }
> +
> + if (num_constants == 2) {
> + ir_constant_data data[2];
> +
> + memset(&data, 0, sizeof(data));
> +
> + assert(nonconst != NULL);
> + assert(constants[0]->type == constants[1]->type);
> + assert(constants[0]->type == ir->type);
> +
> + for (unsigned i = 0; i < constants[0]->type->components(); ++i) {
> + switch (constants[0]->type->base_type) {
> + case GLSL_TYPE_UINT:
> + data[0].u[i] = MIN2(constants[0]->value.u[i], constants[1]->value.u[i]);
> + data[1].u[i] = MAX2(constants[0]->value.u[i], constants[1]->value.u[i]);
> + break;
> + case GLSL_TYPE_INT:
> + data[0].i[i] = MIN2(constants[0]->value.i[i], constants[1]->value.i[i]);
> + data[1].i[i] = MAX2(constants[0]->value.i[i], constants[1]->value.i[i]);
> + break;
> + case GLSL_TYPE_FLOAT:
> + data[0].f[i] = MIN2(constants[0]->value.f[i], constants[1]->value.f[i]);
> + data[1].f[i] = MAX2(constants[0]->value.f[i], constants[1]->value.f[i]);
> + break;
> + default:
> + /* unreachable */
> + assert(0);
> + }
> + }
> +
> + /* c1 is the lower valued constant, c2 is the higher */
> + ir_constant *c1 = new(ir) ir_constant(ir->type, &data[0]);
> + ir_constant *c2 = new(ir) ir_constant(ir->type, &data[1]);
> +
> + ir_expression *exprmax = new(ir) ir_expression(ir_binop_max, nonconst, c1);
> + ir->operation = ir_binop_min;
> + ir->operands[0] = exprmax;
> + ir->operands[1] = c2;
> +
> + this->progress = true;
> + return;
> + }
> +
> + ir_rvalue *x = ir->operands[0];
> + ir_rvalue *y = ir->operands[1];
> + ir_rvalue *z = ir->operands[2];
> +
> + ir_rvalue *x2 = x->clone(ir, NULL);
> + ir_rvalue *y2 = y->clone(ir, NULL);
> + ir_rvalue *z2 = z->clone(ir, NULL);
> +
> + ir_expression *firstmin = new(ir) ir_expression(ir_binop_min, x, y);
> + ir_expression *secondmin = new(ir) ir_expression(ir_binop_min, x2, z);
> + ir_expression *thirdmin = new(ir) ir_expression(ir_binop_min, y2, z2);
> +
> + ir_expression *secondmax = new(ir) ir_expression(ir_binop_max, secondmin, thirdmin);
> +
> + ir->operation = ir_binop_max;
> + ir->operands[0] = firstmin;
> + ir->operands[1] = secondmax;
> +
> + this->progress = true;
> + return;
> +}
Wouldn't it be simpler to detect constant arguments in opt_algebraic
and do the optimization there, and just perform the standard lowering
here? It seems cleaner to me.
I don't think we generate different code based on the arguments in any
other lowering pass.
More information about the mesa-dev
mailing list