[Mesa-dev] [PATCH 3/5] nir: add new constant folding infrastructure

Mon Jan 19 17:01:18 PST 2015

On Mon, Jan 19, 2015 at 4:04 PM, Jason Ekstrand <jason at jlekstrand.net> wrote:
> I've got some specific comments below, but I want to make some more general
> comments here.  I like this in principle: having all the opcodes
> self-documenting is wonderful.  However, I'm not terribly happy with the way
> it worked out.  A lot of the codegen stuff is very confusing and it's not at
> all obvious what's going on.  I'll give it some thought and see if I can
> come up with a good way to clean it up.
>
> On Jan 16, 2015 3:46 PM, "Connor Abbott" <cwabbott0 at gmail.com> wrote:
>>
>> Add a required field to the Opcode class, const_expr, that contains an
>> expression or statement that computes the result of the opcode given known
>> constant inputs. Then take those const_expr's and expand them into a
>> function
>> that takes an opcode and an array of constant inputs and spits out the
>> constant
>> result. This means that when adding opcodes, there's one less place to
>> update,
>> and almost all the opcodes are self-documenting since the information on
>> how to
>> compute the result is right next to the definition.
>>
>> The helper functions in nir_constant_expressions.c were taken from
>> ir_constant_expressions.cpp.
>>
>> v2: use Python formatting and get rid of regex's
>> Signed-off-by: Connor Abbott <cwabbott0 at gmail.com>
>> ---
>>  src/glsl/Makefile.am                     |  10 +-
>>  src/glsl/Makefile.sources                |   3 +-
>>  src/glsl/nir/.gitignore                  |   1 +
>>  src/glsl/nir/nir_constant_expressions.h  |  32 ++
>>  src/glsl/nir/nir_constant_expressions.py | 320 ++++++++++++++++++
>>  src/glsl/nir/nir_opcodes.py              | 562
>> +++++++++++++++++++++----------
>>  6 files changed, 740 insertions(+), 188 deletions(-)
>>  create mode 100644 src/glsl/nir/nir_constant_expressions.h
>>  create mode 100644 src/glsl/nir/nir_constant_expressions.py
>>
>> diff --git a/src/glsl/Makefile.am b/src/glsl/Makefile.am
>> index b2fe16a..51036b7 100644
>> --- a/src/glsl/Makefile.am
>> +++ b/src/glsl/Makefile.am
>> @@ -210,7 +210,8 @@ BUILT_SOURCES =
>> \
>>         glcpp/glcpp-lex.c                               \
>>         nir/nir_opt_algebraic.c                         \
>>         nir/nir_opcodes.h                               \
>> -       nir/nir_opcodes.c
>> +       nir/nir_opcodes.c                               \
>> +       nir/nir_constant_expressions.c
>>  CLEANFILES =                                           \
>>         glcpp/glcpp-parse.h                             \
>>         glsl_parser.h                                   \
>> @@ -236,3 +237,10 @@ nir/nir_opcodes.c: nir/nir_opcodes.py
>> nir/nir_opcodes_c.py
>>         $(PYTHON2) $(PYTHON_FLAGS) $(srcdir)/nir/nir_opcodes_c.py > $@
>>
>>  nir/nir.h: nir/nir_opcodes.h
>> +
>> +nir/nir_constant_expressions.c: nir/nir_opcodes.py
>> nir/nir_constant_expressions.py nir/nir_constant_expressions.h
>> +       $(AM_V_GEN)set -e;
>> \
>> +       $(MKDIR_P) `dirname $@`;
>> \
>> +       $(PYTHON2) $(PYTHON_FLAGS)
>> $(srcdir)/nir/nir_constant_expressions.py > $@.tmp;  \
>> +       mv $@.tmp $@;
>> +
>> diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
>> index 03b4f2e..9dd1a56 100644
>> --- a/src/glsl/Makefile.sources
>> +++ b/src/glsl/Makefile.sources
>> @@ -16,7 +16,8 @@ LIBGLCPP_GENERATED_FILES = \
>>  NIR_GENERATED_FILES = \
>>         $(GLSL_BUILDDIR)/nir/nir_opt_algebraic.c \
>>         $(GLSL_BUILDDIR)/nir/nir_opcodes.h \
>> -       $(GLSL_BUILDDIR)/nir/nir_opcodes.c
>> +       $(GLSL_BUILDDIR)/nir/nir_opcodes.c \
>> +       $(GLSL_BUILDDIR)/nir/nir_constant_expressions.c
>>
>>  NIR_FILES = \
>>         $(GLSL_SRCDIR)/nir/nir.c \
>> diff --git a/src/glsl/nir/.gitignore b/src/glsl/nir/.gitignore
>> index 4c28193..261f64f 100644
>> --- a/src/glsl/nir/.gitignore
>> +++ b/src/glsl/nir/.gitignore
>> @@ -1,3 +1,4 @@
>>  nir_opt_algebraic.c
>>  nir_opcodes.c
>>  nir_opcodes.h
>> +nir_constant_expressions.c
>> diff --git a/src/glsl/nir/nir_constant_expressions.h
>> b/src/glsl/nir/nir_constant_expressions.h
>> new file mode 100644
>> index 0000000..4ca09be
>> --- /dev/null
>> +++ b/src/glsl/nir/nir_constant_expressions.h
>> @@ -0,0 +1,32 @@
>> +/*
>> + * Copyright © 2014 Connor Abbott
>> + *
>> + * Permission is hereby granted, free of charge, to any person obtaining
>> a
>> + * copy of this software and associated documentation files (the
>> "Software"),
>> + * to deal in the Software without restriction, including without
>> limitation
>> + * the rights to use, copy, modify, merge, publish, distribute,
>> sublicense,
>> + * and/or sell copies of the Software, and to permit persons to whom the
>> + * Software is furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice (including the
>> next
>> + * paragraph) shall be included in all copies or substantial portions of
>> the
>> + * Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
>> EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
>> MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT
>> SHALL
>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
>> OTHER
>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
>> ARISING
>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
>> DEALINGS
>> + * IN THE SOFTWARE.
>> + *
>> + * Authors:
>> + *    Connor Abbott (cwabbott0 at gmail.com)
>> + *
>> + */
>> +
>> +#include "nir.h"
>> +
>> +nir_const_value nir_eval_const_opcode(nir_op op, unsigned num_components,
>> +                                      nir_const_value *src);
>> +
>> diff --git a/src/glsl/nir/nir_constant_expressions.py
>> b/src/glsl/nir/nir_constant_expressions.py
>> new file mode 100644
>> index 0000000..8498fc3
>> --- /dev/null
>> +++ b/src/glsl/nir/nir_constant_expressions.py
>> @@ -0,0 +1,320 @@
>> +#! /usr/bin/env python
>> +#
>> +# Copyright (C) 2014 Connor Abbott
>> +#
>> +# Permission is hereby granted, free of charge, to any person obtaining a
>> +# copy of this software and associated documentation files (the
>> "Software"),
>> +# to deal in the Software without restriction, including without
>> limitation
>> +# the rights to use, copy, modify, merge, publish, distribute,
>> sublicense,
>> +# and/or sell copies of the Software, and to permit persons to whom the
>> +# Software is furnished to do so, subject to the following conditions:
>> +#
>> +# The above copyright notice and this permission notice (including the
>> next
>> +# paragraph) shall be included in all copies or substantial portions of
>> the
>> +# Software.
>> +#
>> +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
>> OR
>> +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
>> MERCHANTABILITY,
>> +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT
>> SHALL
>> +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
>> OTHER
>> +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
>> +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
>> DEALINGS
>> +# IN THE SOFTWARE.
>> +#
>> +# Authors:
>> +#    Connor Abbott (cwabbott0 at gmail.com)
>> +
>> +from nir_opcodes import opcodes
>> +from mako.template import Template
>> +
>> +# the const_expr string for each opcode has a few shortcuts - most only
>> have
>> +# an expression, and the "dst = (expr);" is implied. In addition,
>> +# per-component inputs and outputs are referenced without any subscripts,
>> so
>> +# we need to create the implicit for-loop for per-component opcodes. In
>> +# addition, we need to expand out the src0, src1, etc. with actual
>> sources
>> +# with the appropriate type using the union.
>> +
>> +def wr(string, wrap_bool):
>> +   if wrap_bool:
>> +      return "((" + string + ") ? NIR_TRUE : NIR_FALSE)"
>> +   return string
>> +
>> +class Operand(object):
>> +   def __init__(self, name, type_, is_src):
>> +      if type_ == "bool" or type_ == "unsigned":
>> +         prefix = "u"
>> +      elif type_ == "int":
>> +         prefix = "i"
>> +      else:
>> +         prefix = "f"
>> +
>> +      wrap_bool = is_src and type_ == "bool"
>> +
>> +      def wr(string, wrap=wrap_bool):
>> +         if wrap:
>> +            return "(" + string + " != NIR_FALSE)"
>> +         return string
>
> Wow, this is confusing... You redefine a global function locally to do
> something similar but very different.  Then give it two parameters one of
> which has a default value that comes from the argument of the parent
> function.  On top of that, it's never used with its argument so the argument
> isn't needed at all.  I think we can do better than this...

Whoops, I meant to delete the global version of wr() above :/ Also, I
can rename it to wrap() if you want. I added the argument because I
thought it was necessary, but it turns out Python does support a weak
form of closures (it won't keep them alive once you escape the context
of the outer function), so the extra argument isn't actually necessary.
I'll fix this, plus a similar thing in unop_reduce() and binop_reduce().

>
>> +
>> +      self.name = wr(name + "." + prefix + "[_i]")
>
> Calling that "name" seems kind of odd.

Yeah, I called it that because I was thinking of it as the "name" of
the operand since calling str() on the object would return it, but it
seems like "per_component" would probably be a better name.

>
>> +      self.x = wr(name + "." + prefix + "[0]")
>> +      self.y = wr(name + "." + prefix + "[1]")
>> +      self.z = wr(name + "." + prefix + "[2]")
>> +      self.w = wr(name + "." + prefix + "[3]")
>> +
>> +   def __str__(self):
>> +      return self.name
>> +
>> +def expand_constexpr(opcode):
>> +   const_expr = opcode.const_expr
>> +
>> +   if "dst" not in const_expr:
>> +      if opcode.output_type == "bool":
>> +         # For convenience, insert the conversion to unsigned.
>> +         # Note that we don't do this for things that aren't expressions.
>> +         const_expr = "(" + const_expr + ") ? NIR_TRUE : NIR_FALSE"
>> +
>> +      if opcode.output_size == 0:
>> +         const_expr = "{dst} = " + const_expr + ";"
>> +      else:
>> +         # for non-per-component opcodes, assume we broadcast to all
>> components
>> +         const_expr = "\n".join(
>> +               "{dst." + "xyzw"[i] + "} = " + const_expr + ";"
>> +                     for i in range(opcode.output_size))
>> +
>> +   replacement_dict = {
>> +      "src" + str(i) : Operand("src[" + str(i) + "]",
>> opcode.input_types[i], True)
>> +         for i in range(opcode.num_inputs)
>> +   }
>> +
>> +   replacement_dict["dst"] = Operand("dst", opcode.output_type, False)
>> +
>> +   const_expr = const_expr.format(**replacement_dict)
>> +
>> +   if opcode.output_size == 0:
>> +      const_expr = "for (unsigned _i = 0; _i < num_components; _i++) {" +
>> const_expr + "}"
>> +
>> +   return const_expr
>
> I can't help but think that the above is far more confusing than needed.
> Also, I'd rather do as much of this in the mako as possible.  That's why we
> are using a templating language after all.

Is there anything in Mako that would really help us here? The things
that need to be done are:

- replacing a pure expression with "dst = ..." or "dst = (...) ?
NIR_TRUE : NIR_FALSE" for bools
- adding the for loop for per-component things
- replacing e.g. ${src0.x} with src[0].f[0] for floats or (src[0].u[0]
!= NIR_FALSE) for bools

The problem with Mako is that the only thing it offers is the ability
to do fancier formatting in the source string, when we don't really
need that: we want the source string to be as simple as possible. It's
the transformations on the source string that are more complicated, so
what we're doing is closer to text substitution than formatting. That
being said, if you can come up with something better than this or the
pile of regexes that we had before, I'm all ears.

>
>> +
>> +
>> +const_exprs = {name : expand_constexpr(opcode)
>> +                 for name, opcode in opcodes.iteritems()}
>> +
>> +template = Template("""
>> +#include <math.h>
>> +#include "main/core.h"
>> +#include "nir_constant_expressions.h"
>> +
>> +#if defined(_MSC_VER) && (_MSC_VER < 1800)
>> +static int isnormal(double x)
>> +{
>> +   return _fpclass(x) == _FPCLASS_NN || _fpclass(x) == _FPCLASS_PN;
>> +}
>> +#elif defined(__SUNPRO_CC)
>> +#include <ieeefp.h>
>> +static int isnormal(double x)
>> +{
>> +   return fpclass(x) == FP_NORMAL;
>> +}
>> +#endif
>> +
>> +#if defined(_MSC_VER)
>> +static double copysign(double x, double y)
>> +{
>> +   return _copysign(x, y);
>> +}
>> +#endif
>> +
>> +/**
>> + * Evaluate one component of packSnorm4x8.
>> + */
>> +static uint8_t
>> +pack_snorm_1x8(float x)
>> +{
>> +    /* From section 8.4 of the GLSL 4.30 spec:
>> +     *
>> +     *    packSnorm4x8
>> +     *    ------------
>> +     *    The conversion for component c of v to fixed point is done as
>> +     *    follows:
>> +     *
>> +     *      packSnorm4x8: round(clamp(c, -1, +1) * 127.0)
>> +     *
>> +     * We must first cast the float to an int, because casting a negative
>> +     * float to a uint is undefined.
>> +     */
>> +   return (uint8_t) (int8_t)
>> +          _mesa_round_to_even(CLAMP(x, -1.0f, +1.0f) * 127.0f);
>> +}
>> +
>> +/**
>> + * Evaluate one component of packSnorm2x16.
>> + */
>> +static uint16_t
>> +pack_snorm_1x16(float x)
>> +{
>> +    /* From section 8.4 of the GLSL ES 3.00 spec:
>> +     *
>> +     *    packSnorm2x16
>> +     *    -------------
>> +     *    The conversion for component c of v to fixed point is done as
>> +     *    follows:
>> +     *
>> +     *      packSnorm2x16: round(clamp(c, -1, +1) * 32767.0)
>> +     *
>> +     * We must first cast the float to an int, because casting a negative
>> +     * float to a uint is undefined.
>> +     */
>> +   return (uint16_t) (int16_t)
>> +          _mesa_round_to_even(CLAMP(x, -1.0f, +1.0f) * 32767.0f);
>> +}
>> +
>> +/**
>> + * Evaluate one component of unpackSnorm4x8.
>> + */
>> +static float
>> +unpack_snorm_1x8(uint8_t u)
>> +{
>> +    /* From section 8.4 of the GLSL 4.30 spec:
>> +     *
>> +     *    unpackSnorm4x8
>> +     *    --------------
>> +     *    The conversion for unpacked fixed-point value f to floating
>> point is
>> +     *    done as follows:
>> +     *
>> +     *       unpackSnorm4x8: clamp(f / 127.0, -1, +1)
>> +     */
>> +   return CLAMP((int8_t) u / 127.0f, -1.0f, +1.0f);
>> +}
>> +
>> +/**
>> + * Evaluate one component of unpackSnorm2x16.
>> + */
>> +static float
>> +unpack_snorm_1x16(uint16_t u)
>> +{
>> +    /* From section 8.4 of the GLSL ES 3.00 spec:
>> +     *
>> +     *    unpackSnorm2x16
>> +     *    ---------------
>> +     *    The conversion for unpacked fixed-point value f to floating
>> point is
>> +     *    done as follows:
>> +     *
>> +     *       unpackSnorm2x16: clamp(f / 32767.0, -1, +1)
>> +     */
>> +   return CLAMP((int16_t) u / 32767.0f, -1.0f, +1.0f);
>> +}
>> +
>> +/**
>> + * Evaluate one component packUnorm4x8.
>> + */
>> +static uint8_t
>> +pack_unorm_1x8(float x)
>> +{
>> +    /* From section 8.4 of the GLSL 4.30 spec:
>> +     *
>> +     *    packUnorm4x8
>> +     *    ------------
>> +     *    The conversion for component c of v to fixed point is done as
>> +     *    follows:
>> +     *
>> +     *       packUnorm4x8: round(clamp(c, 0, +1) * 255.0)
>> +     */
>> +   return (uint8_t) _mesa_round_to_even(CLAMP(x, 0.0f, 1.0f) * 255.0f);
>> +}
>> +
>> +/**
>> + * Evaluate one component packUnorm2x16.
>> + */
>> +static uint16_t
>> +pack_unorm_1x16(float x)
>> +{
>> +    /* From section 8.4 of the GLSL ES 3.00 spec:
>> +     *
>> +     *    packUnorm2x16
>> +     *    -------------
>> +     *    The conversion for component c of v to fixed point is done as
>> +     *    follows:
>> +     *
>> +     *       packUnorm2x16: round(clamp(c, 0, +1) * 65535.0)
>> +     */
>> +   return (uint16_t) _mesa_round_to_even(CLAMP(x, 0.0f, 1.0f) *
>> 65535.0f);
>> +}
>> +
>> +/**
>> + * Evaluate one component of unpackUnorm4x8.
>> + */
>> +static float
>> +unpack_unorm_1x8(uint8_t u)
>> +{
>> +    /* From section 8.4 of the GLSL 4.30 spec:
>> +     *
>> +     *    unpackUnorm4x8
>> +     *    --------------
>> +     *    The conversion for unpacked fixed-point value f to floating
>> point is
>> +     *    done as follows:
>> +     *
>> +     *       unpackUnorm4x8: f / 255.0
>> +     */
>> +   return (float) u / 255.0f;
>> +}
>> +
>> +/**
>> + * Evaluate one component of unpackUnorm2x16.
>> + */
>> +static float
>> +unpack_unorm_1x16(uint16_t u)
>> +{
>> +    /* From section 8.4 of the GLSL ES 3.00 spec:
>> +     *
>> +     *    unpackUnorm2x16
>> +     *    ---------------
>> +     *    The conversion for unpacked fixed-point value f to floating
>> point is
>> +     *    done as follows:
>> +     *
>> +     *       unpackUnorm2x16: f / 65535.0
>> +     */
>> +   return (float) u / 65535.0f;
>> +}
>> +
>> +/**
>> + * Evaluate one component of packHalf2x16.
>> + */
>> +static uint16_t
>> +pack_half_1x16(float x)
>> +{
>> +   return _mesa_float_to_half(x);
>> +}
>> +
>> +/**
>> + * Evaluate one component of unpackHalf2x16.
>> + */
>> +static float
>> +unpack_half_1x16(uint16_t u)
>> +{
>> +   return _mesa_half_to_float(u);
>> +}
>> +
>> +nir_const_value
>> +nir_eval_const_opcode(nir_op op, unsigned num_components,
>> +                      nir_const_value *src)
>> +{
>> +   nir_const_value dst = {
>> +      .u = {0, 0, 0, 0}
>> +   };
>> +
>> +   switch (op) {
>> +% for name, const_expr in sorted(const_exprs.iteritems()):
>> +   case nir_op_${name}: {
>> +      ${const_expr}
>> +      break;
>> +   }
>> +% endfor
>> +   case nir_num_opcodes: unreachable("shouldn't get here");
>> +   }
>> +
>> +   return dst;
>> +}
>> +""")
>> +
>> +print template.render(const_exprs=const_exprs)
>> +
>> diff --git a/src/glsl/nir/nir_opcodes.py b/src/glsl/nir/nir_opcodes.py
>> index fa2f563..6f3c5ba 100644
>> --- a/src/glsl/nir/nir_opcodes.py
>> +++ b/src/glsl/nir/nir_opcodes.py
>> @@ -24,6 +24,7 @@
>>  # Authors:
>>  #    Connor Abbott (cwabbott0 at gmail.com)
>>
>> +
>>  # Class that represents all the information we have about the opcode
>>  # NOTE: this must be kept in sync with nir_op_info
>>
>> @@ -34,7 +35,7 @@ class Opcode:
>>     # algebraic_properties is a space-seperated string,
>>     # where nir_op_is_ is prepended before each entry
>>     def __init__(self, name, output_size, output_type, input_sizes,
>> -                input_types, algebraic_properties):
>> +                input_types, algebraic_properties, const_expr):
>>        assert(isinstance(name, str))
>>        assert(isinstance(output_size, int))
>>        assert(isinstance(output_type, str))
>> @@ -43,6 +44,7 @@ class Opcode:
>>        assert(isinstance(input_types, list))
>>        assert(isinstance(input_types[0], str))
>>        assert(isinstance(algebraic_properties, str))
>> +      assert(isinstance(const_expr, str))
>>        assert(len(input_sizes) == len(input_types))
>>        assert(0 <= output_size <= 4)
>>        for size in input_sizes:
>> @@ -56,6 +58,7 @@ class Opcode:
>>        self.input_sizes = input_sizes
>>        self.input_types = input_types
>>        self.algebraic_properties = algebraic_properties
>> +      self.const_expr = const_expr
>>
>>  # helper variables for strings
>>  tfloat = "float"
>> @@ -70,178 +73,290 @@ associative = "associative "
>>  opcodes = {}
>>
>>  def opcode(name, output_size, output_type, input_sizes, input_types,
>> -           algebraic_properties):
>> +           algebraic_properties, const_expr):
>>     assert(name not in opcodes)
>>     opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
>> -                          input_types, algebraic_properties)
>> -
>> -def unop_convert(name, in_type, out_type):
>> -   opcode(name, 0, out_type, [0], [in_type], "")
>> -
>> -def unop(name, ty):
>> -   opcode(name, 0, ty, [0], [ty], "")
>> -
>> -def unop_horiz(name, output_size, output_type, input_size, input_type):
>> -   opcode(name, output_size, output_type, [input_size], [input_type], "")
>> -
>> -def unop_reduce(name, output_size, output_type, input_type):
>> -   unop_horiz(name + "2", output_size, output_type, 2, input_type)
>> -   unop_horiz(name + "3", output_size, output_type, 3, input_type)
>> -   unop_horiz(name + "4", output_size, output_type, 4, input_type)
>> +                          input_types, algebraic_properties, const_expr)
>> +
>> +def unop_convert(name, in_type, out_type, const_expr):
>> +   opcode(name, 0, out_type, [0], [in_type], "", const_expr)
>> +
>> +def unop(name, ty, const_expr):
>> +   opcode(name, 0, ty, [0], [ty], "", const_expr)
>> +
>> +def unop_horiz(name, output_size, output_type, input_size, input_type,
>> +               const_expr):
>> +   opcode(name, output_size, output_type, [input_size], [input_type], "",
>> +          const_expr)
>> +
>> +def unop_reduce(name, output_size, output_type, input_type,
>> prereduce_expr,
>> +                reduce_expr, final_expr):
>> +   def prereduce(src, expr=prereduce_expr):
>> +      return "(" + expr.format(src=src) + ")"
>> +   def final(src, expr=final_expr):
>> +      return expr.format(src="(" + src + ")")
>> +   def reduce_(src0, src1, expr=reduce_expr):
>> +      return expr.format(src0=src0, src1=src1)
>> +   src0 = prereduce("{src0.x}")
>> +   src1 = prereduce("{src0.y}")
>> +   src2 = prereduce("{src0.z}")
>> +   src3 = prereduce("{src0.w}")
>> +   unop_horiz(name + "2", output_size, output_type, 2, input_type,
>> +              final(reduce_(src0, src1)))
>> +   unop_horiz(name + "3", output_size, output_type, 3, input_type,
>> +              final(reduce_(reduce_(src0, src1), src2)))
>> +   unop_horiz(name + "4", output_size, output_type, 4, input_type,
>> +              final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
>
> I really don't like the way this worked out.  That said, I can't come up
> with anything better at the moment, so I won't complain too much.

Meh, I think it's ok... the only thing is the extra arguments to
prereduce(), final(), and reduce_() that we can get rid of.

>
>>
>>  # These two move instructions differ in what modifiers they support and
>> what
>>  # the negate modifier means. Otherwise, they are identical.
>> -unop("fmov", tfloat)
>> -unop("imov", tint)
>> -
>> -unop("ineg", tint)
>> -unop("fneg", tfloat)
>> -unop("inot", tint) # invert every bit of the integer
>> -unop("fnot", tfloat) # (src == 0.0) ? 1.0 : 0.0
>> -unop("fsign", tfloat)
>> -unop("isign", tint)
>> -unop("iabs", tint)
>> -unop("fabs", tfloat)
>> -unop("fsat", tfloat)
>> -unop("frcp", tfloat)
>> -unop("frsq", tfloat)
>> -unop("fsqrt", tfloat)
>> -unop("fexp", tfloat) # < e^x
>> -unop("flog", tfloat) # log base e
>> -unop("fexp2", tfloat)
>> -unop("flog2", tfloat)
>> -unop_convert("f2i", tfloat, tint) # Float-to-integer conversion.
>> -unop_convert("f2u", tfloat, tunsigned) # Float-to-unsigned conversion
>> -unop_convert("i2f", tint, tfloat) # Integer-to-float conversion.
>> -unop_convert("f2b", tfloat, tbool) # Float-to-boolean conversion
>> -unop_convert("b2f", tbool, tfloat) # Boolean-to-float conversion
>> -unop_convert("i2b", tint, tbool) # int-to-boolean conversion
>> -unop_convert("b2i", tbool, tint) # Boolean-to-int conversion
>> -unop_convert("u2f", tunsigned, tfloat) #Unsigned-to-float conversion.
>> -
>> -unop_reduce("bany", 1, tbool, tbool) # returns ~0 if any component of
>> src[0] != 0
>> -unop_reduce("ball", 1, tbool, tbool) # returns ~0 if all components of
>> src[0] != 0
>> -unop_reduce("fany", 1, tfloat, tfloat) # returns 1.0 if any component of
>> src[0] != 0
>> -unop_reduce("fall", 1, tfloat, tfloat) # returns 1.0 if all components of
>> src[0] != 0
>> +unop("fmov", tfloat, "{src0}")
>> +unop("imov", tint, "{src0}")
>> +
>> +unop("ineg", tint, "-{src0}")
>> +unop("fneg", tfloat, "-{src0}")
>> +unop("inot", tint, "~{src0}") # invert every bit of the integer
>> +unop("fnot", tfloat, "({src0} == 0.0f) ? 1.0f : 0.0f")
>> +unop("fsign", tfloat, "({src0} == 0.0f) ? 0.0f : (({src0} > 0.0f) ? 1.0f
>> : -1.0f)")
>> +unop("isign", tint, "({src0} == 0) ? 0 : ((src > 0) ? 1 : -1)")
>> +unop("iabs", tint, "abs({src0})")
>> +unop("fabs", tfloat, "fabsf({src0})")
>> +unop("fsat", tfloat, "({src0} > 1.0f) ? 1.0f : (({src0} <= 0.0f) ? 0.0f :
>> {src0})")
>> +unop("frcp", tfloat, "1.0f / {src0}")
>> +unop("frsq", tfloat, "1.0f / sqrtf({src0})")
>> +unop("fsqrt", tfloat, "sqrtf({src0})")
>> +unop("fexp", tfloat, "expf({src0})") # < e^x
>> +unop("flog", tfloat, "logf({src0})") # log base e
>> +unop("fexp2", tfloat, "exp2f({src0})")
>> +unop("flog2", tfloat, "log2f({src0})")
>> +unop_convert("f2i", tfloat, tint, "{src0}") # Float-to-integer
>> conversion.
>> +unop_convert("f2u", tfloat, tunsigned, "{src0}") # Float-to-unsigned
>> conversion
>> +unop_convert("i2f", tint, tfloat, "{src0}") # Integer-to-float
>> conversion.
>> +# Float-to-boolean conversion
>> +unop_convert("f2b", tfloat, tbool, "{src0} == 0.0f")
>> +# Boolean-to-float conversion
>> +unop_convert("b2f", tbool, tfloat, "{src0} ? 1.0f : 0.0f")
>> +# Int-to-boolean conversion
>> +unop_convert("i2b", tint, tbool, "{src0} == 0")
>> +unop_convert("b2i", tbool, tint, "{src0} ? 0 : -1") # Boolean-to-int
>> conversion
>> +unop_convert("u2f", tunsigned, tfloat, "{src0}") #Unsigned-to-float
>> conversion.
>> +
>> +unop_reduce("bany", 1, tbool, tbool, "{src}", "{src0} || {src1}",
>> "{src}")
>> +unop_reduce("ball", 1, tbool, tbool, "{src}", "{src0} && {src1}",
>> "{src}")
>> +unop_reduce("fany", 1, tfloat, tfloat, "{src} != 0.0f", "{src0} ||
>> {src1}",
>> +            "{src} ? 1.0f : 0.0f")
>> +unop_reduce("fall", 1, tfloat, tfloat, "{src} != 0.0f", "{src0} &&
>> {src1}",
>> +            "{src} ? 1.0f : 0.0f")
>>
>>  # Unary floating-point rounding operations.
>>
>>
>> -unop("ftrunc", tfloat)
>> -unop("fceil", tfloat)
>> -unop("ffloor", tfloat)
>> -unop("ffract", tfloat)
>> -unop("fround_even", tfloat)
>> +unop("ftrunc", tfloat, "truncf({src0})")
>> +unop("fceil", tfloat, "ceilf({src0})")
>> +unop("ffloor", tfloat, "floorf({src0})")
>> +unop("ffract", tfloat, "{src0} - floorf({src0})")
>> +unop("fround_even", tfloat, "_mesa_round_to_even({src0})")
>>
>>
>>  # Trigonometric operations.
>>
>>
>> -unop("fsin", tfloat)
>> -unop("fcos", tfloat)
>> -unop("fsin_reduced", tfloat)
>> -unop("fcos_reduced", tfloat)
>> +unop("fsin", tfloat, "sinf({src0})")
>> +unop("fcos", tfloat, "cosf({src0})")
>> +unop("fsin_reduced", tfloat, "sinf({src0})")
>> +unop("fcos_reduced", tfloat, "cosf({src0})")
>>
>>
>>  # Partial derivatives.
>>
>>
>> -unop("fddx", tfloat)
>> -unop("fddy", tfloat)
>> -unop("fddx_fine", tfloat)
>> -unop("fddy_fine", tfloat)
>> -unop("fddx_coarse", tfloat)
>> -unop("fddy_coarse", tfloat)
>> +unop("fddx", tfloat, "0.0f") # the derivative of a constant is 0.
>> +unop("fddy", tfloat, "0.0f")
>> +unop("fddx_fine", tfloat, "0.0f")
>> +unop("fddy_fine", tfloat, "0.0f")
>> +unop("fddx_coarse", tfloat, "0.0f")
>> +unop("fddy_coarse", tfloat, "0.0f")
>>
>>
>>  # Floating point pack and unpack operations.
>>
>> -
>> -unop_horiz("pack_snorm_2x16", 1, tunsigned, 2, tfloat)
>> -unop_horiz("pack_snorm_4x8", 1, tunsigned, 4, tfloat)
>> -unop_horiz("pack_unorm_2x16", 1, tunsigned, 2, tfloat)
>> -unop_horiz("pack_unorm_4x8", 1, tunsigned, 4, tfloat)
>> -unop_horiz("pack_half_2x16", 1, tunsigned, 2, tfloat)
>> -unop_horiz("unpack_snorm_2x16", 2, tfloat, 1, tunsigned)
>> -unop_horiz("unpack_snorm_4x8", 4, tfloat, 1, tunsigned)
>> -unop_horiz("unpack_unorm_2x16", 2, tfloat, 1, tunsigned)
>> -unop_horiz("unpack_unorm_4x8", 4, tfloat, 1, tunsigned)
>> -unop_horiz("unpack_half_2x16", 2, tfloat, 1, tunsigned)
>> +def pack_2x16(fmt):
>> +   unop_horiz("pack_" + fmt + "_2x16", 1, tunsigned, 2, tfloat, """
>> +{dst.x} = (uint32_t) pack_fmt_1x16({src0.x});
>> +{dst.x} |= ((uint32_t) pack_fmt_1x16({src0.y})) << 16;
>> +""".replace("fmt", fmt))
>> +
>> +def pack_4x8(fmt):
>> +   unop_horiz("pack_" + fmt + "_4x8", 1, tunsigned, 4, tfloat, """
>> +{dst.x} = (uint32_t) pack_fmt_1x8({src0.x});
>> +{dst.x} |= ((uint32_t) pack_fmt_1x8({src0.y})) << 8;
>> +{dst.x} |= ((uint32_t) pack_fmt_1x8({src0.z})) << 16;
>> +{dst.x} |= ((uint32_t) pack_fmt_1x8({src0.w})) << 24;
>> +""".replace("fmt", fmt))
>> +
>> +def unpack_2x16(fmt):
>> +   unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat, 1, tunsigned, """
>> +{dst.x} = unpack_fmt_1x16((uint16_t)({src0.x} & 0xffff));
>> +{dst.y} = unpack_fmt_1x16((uint16_t)({src0.x} << 16));
>> +""".replace("fmt", fmt))
>> +
>> +def unpack_4x8(fmt):
>> +   unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat, 1, tunsigned, """
>> +{dst.x} = unpack_fmt_1x8((uint8_t)({src0.x} & 0xff));
>> +{dst.y} = unpack_fmt_1x8((uint8_t)(({src0.x} >> 8) & 0xff));
>> +{dst.z} = unpack_fmt_1x8((uint8_t)(({src0.x} >> 16) & 0xff));
>> +{dst.w} = unpack_fmt_1x8((uint8_t)({src0.x} >> 24));
>> +""".replace("fmt", fmt))
>> +
>> +
>> +pack_2x16("snorm")
>> +pack_4x8("snorm")
>> +pack_2x16("unorm")
>> +pack_4x8("unorm")
>> +pack_2x16("half")
>> +unpack_2x16("snorm")
>> +unpack_4x8("snorm")
>> +unpack_2x16("unorm")
>> +unpack_4x8("unorm")
>> +unpack_2x16("half")
>>
>>
>>  # Lowered floating point unpacking operations.
>>
>>
>> -unop_horiz("unpack_half_2x16_split_x", 1, tfloat, 1, tunsigned)
>> -unop_horiz("unpack_half_2x16_split_y", 1, tfloat, 1, tunsigned)
>> +unop_horiz("unpack_half_2x16_split_x", 1, tfloat, 1, tunsigned, """
>> +{dst.x} = unpack_half_1x16((uint16_t)({src0.x} & 0xffff));
>> +""")
>> +unop_horiz("unpack_half_2x16_split_y", 1, tfloat, 1, tunsigned, """
>> +{dst.y} = unpack_half_1x16((uint16_t)({src0.x} >> 16));
>> +""")
>>
>>
>>  # Bit operations, part of ARB_gpu_shader5.
>>
>>
>> -unop("bitfield_reverse", tunsigned)
>> -unop("bit_count", tunsigned)
>> -unop_convert("ufind_msb", tunsigned, tint)
>> -unop("ifind_msb", tint)
>> -unop("find_lsb", tint)
>> +unop("bitfield_reverse", tunsigned, """
>> +/* we're not winning any awards for speed here, but that's ok */
>> +{dst} = 0;
>> +for (unsigned bit = 0; bit < 32; bit++)
>> +   {dst} |= (({src0} >> bit) & 1) << (31 - bit);
>> +""")
>> +unop("bit_count", tunsigned, """
>> +{dst} = 0;
>> +for (unsigned bit = 0; bit < 32; bit++) {{
>> +   if (({src0} >> bit) & 1)
>> +      {dst}++;
>> +}}
>> +""")
>> +
>> +unop_convert("ufind_msb", tunsigned, tint, """
>> +{dst} = -1;
>> +for (int bit = 31; bit > 0; bit--) {{
>> +   if (({src0} >> bit) & 1) {{
>> +      {dst} = bit;
>> +      break;
>> +   }}
>> +}}
>> +""")
>> +
>> +unop("ifind_msb", tint, """
>> +{dst} = -1;
>> +for (int bit = 31; bit >= 0; bit--) {{
>> +   /* If src0 < 0, we're looking for the first 0 bit.
>> +    * if src0 >= 0, we're looking for the first 1 bit.
>> +    */
>> +   if (((({src0} >> bit) & 1) && ({src0} >= 0)) ||
>> +      (!(({src0} >> bit) & 1) && ({src0} < 0))) {{
>> +      {dst} = bit;
>> +      break;
>> +   }}
>> +}}
>> +""")
>> +
>> +unop("find_lsb", tint, """
>> +{dst} = -1;
>> +for (unsigned bit = 0; bit < 32; bit++) {{
>> +   if (({src0} >> bit) & 1) {{
>> +      {dst} = bit;
>> +      break;
>> +   }}
>> +}}
>> +""")
>
> We do have helpers for most of the above.  I was moving them onto util but
> got sidetracked.  I should rework those patches.

Sure, and while we're at it we can remove the hand-written versions in
the GLSL IR constant folding code as well.

>
>>
>>
>>  for i in range(1, 5):
>>     for j in range(1, 5):
>> -      unop_horiz("fnoise" + str(i) + "_" + str(j), i, tfloat, j, tfloat)
>> +      unop_horiz("fnoise" + str(i) + "_" + str(j), i, tfloat, j, tfloat,
>> +                 "0.0f")
>>
>> -def binop_convert(name, out_type, in_type, alg_props):
>> -   opcode(name, 0, out_type, [0, 0], [in_type, in_type], alg_props)
>> +def binop_convert(name, out_type, in_type, alg_props, const_expr):
>> +   opcode(name, 0, out_type, [0, 0], [in_type, in_type], alg_props,
>> const_expr)
>>
>> -def binop(name, ty, alg_props):
>> -   binop_convert(name, ty, ty, alg_props)
>> +def binop(name, ty, alg_props, const_expr):
>> +   binop_convert(name, ty, ty, alg_props, const_expr)
>>
>> -def binop_compare(name, ty, alg_props):
>> -   binop_convert(name, ty, tbool, alg_props)
>> +def binop_compare(name, ty, alg_props, const_expr):
>> +   binop_convert(name, tbool, ty, alg_props, const_expr)
>>
>>  def binop_horiz(name, out_size, out_type, src1_size, src1_type,
>> src2_size,
>> -                src2_type):
>> -   opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type,
>> src2_type], "")
>> -
>> -def binop_reduce(name, output_size, output_type, src_type):
>> -   opcode(name + "2",output_size, output_type,
>> -          [2, 2], [src_type, src_type], commutative)
>> +                src2_type, const_expr):
>> +   opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type,
>> src2_type],
>> +          "", const_expr)
>> +
>> +def binop_reduce(name, output_size, output_type, src_type,
>> prereduce_expr,
>> +                 reduce_expr, final_expr):
>> +   def final(src, expr=final_expr):
>> +      return expr.format(src= "(" + src + ")")
>> +   def reduce_(src0, src1, expr=reduce_expr):
>> +      return expr.format(src0=src0, src1=src1)
>> +   def prereduce(src0, src1, expr=prereduce_expr):
>> +      return "(" + expr.format(src0=src0, src1=src1) + ")"
>> +   src0 = prereduce("{src0.x}", "{src1.x}")
>> +   src1 = prereduce("{src0.y}", "{src1.y}")
>> +   src2 = prereduce("{src0.z}", "{src1.z}")
>> +   src3 = prereduce("{src0.w}", "{src1.w}")
>> +   opcode(name + "2", output_size, output_type,
>> +          [2, 2], [src_type, src_type], commutative,
>> +          final(reduce_(src0, src1)))
>>     opcode(name + "3", output_size, output_type,
>> -          [3, 3], [src_type, src_type], commutative)
>> +          [3, 3], [src_type, src_type], commutative,
>> +          final(reduce_(reduce_(src0, src1), src2)))
>>     opcode(name + "4", output_size, output_type,
>> -          [4, 4], [src_type, src_type], commutative)
>> +          [4, 4], [src_type, src_type], commutative,
>> +          final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
>>
>> -binop("fadd", tfloat, commutative + associative)
>> -binop("iadd", tint, commutative + associative)
>> -binop("fsub", tfloat, "")
>> -binop("isub", tint, "")
>> +binop("fadd", tfloat, commutative + associative, "{src0} + {src1}")
>> +binop("iadd", tint, commutative + associative, "{src0} + {src1}")
>> +binop("fsub", tfloat, "", "{src0} - {src1}")
>> +binop("isub", tint, "", "{src0} - {src1}")
>>
>> -binop("fmul", tfloat, commutative + associative)
>> +binop("fmul", tfloat, commutative + associative, "{src0} * {src1}")
>>  # low 32-bits of signed/unsigned integer multiply
>> -binop("imul", tint, commutative + associative)
>> +binop("imul", tint, commutative + associative, "{src0} * {src1}")
>>  # high 32-bits of signed integer multiply
>> -binop("imul_high", tint, commutative)
>> +binop("imul_high", tint, commutative,
>> +      "(int32_t)(((int64_t) {src0} * (int64_t) {src1}) >> 32)")
>>  # high 32-bits of unsigned integer multiply
>> -binop("umul_high", tunsigned, commutative)
>> +binop("umul_high", tunsigned, commutative,
>> +      "(uint32_t)(((uint64_t) {src0} * (uint64_t) {src1}) >> 32)")
>>
>> -binop("fdiv", tfloat, "")
>> -binop("idiv", tint, "")
>> -binop("udiv", tunsigned, "")
>> +binop("fdiv", tfloat, "", "{src0} / {src1}")
>> +binop("idiv", tint, "", "{src0} / {src1}")
>> +binop("udiv", tunsigned, "", "{src0} / {src1}")
>>
>>  # returns a boolean representing the carry resulting from the addition of
>>  # the two unsigned arguments.
>>
>> -binop_convert("uadd_carry", tbool, tunsigned,
>> -              commutative)
>> +binop_convert("uadd_carry", tbool, tunsigned, commutative, "{src0} +
>> {src1} < {src0}")
>>
>>  # returns a boolean representing the borrow resulting from the
>> subtraction
>>  # of the two unsigned arguments.
>>
>> -binop_convert("usub_borrow", tbool, tunsigned, "")
>> +binop_convert("usub_borrow", tbool, tunsigned, "", "{src1} < {src0}")
>>
>> -binop("fmod", tfloat, "")
>> -binop("umod", tunsigned, "")
>> +binop("fmod", tfloat, "", "{src0} - {src1} * floorf({src0} / {src1})")
>> +binop("umod", tunsigned, "", "{src1} == 0 ? 0 : {src0} % {src1}")
>>
>>  #
>>  # Comparisons
>> @@ -250,41 +365,47 @@ binop("umod", tunsigned, "")
>>
>>  # these integer-aware comparisons return a boolean (0 or ~0)
>>
>> -binop_compare("flt", tfloat, "")
>> -binop_compare("fge", tfloat, "")
>> -binop_compare("feq", tfloat, commutative)
>> -binop_compare("fne", tfloat, commutative)
>> -binop_compare("ilt", tint, "")
>> -binop_compare("ige", tint, "")
>> -binop_compare("ieq", tint, commutative)
>> -binop_compare("ine", tint, commutative)
>> -binop_compare("ult", tunsigned, "")
>> -binop_compare("uge", tunsigned, "")
>> +binop_compare("flt", tfloat, "", "{src0} < {src1}")
>> +binop_compare("fge", tfloat, "", "{src0} >= {src1}")
>> +binop_compare("feq", tfloat, commutative, "{src0} == {src1}")
>> +binop_compare("fne", tfloat, commutative, "{src0} != {src1}")
>> +binop_compare("ilt", tint, "", "{src0} < {src1}")
>> +binop_compare("ige", tint, "", "{src0} >= {src1}")
>> +binop_compare("ieq", tint, commutative, "{src0} == {src1}")
>> +binop_compare("ine", tint, commutative, "{src0} != {src1}")
>> +binop_compare("ult", tunsigned, "", "{src0} < {src1}")
>> +binop_compare("uge", tunsigned, "", "{src0} >= {src1}")
>>
>>  # integer-aware GLSL-style comparisons that compare floats and ints
>>
>> -binop_reduce("ball_fequal",  1, tbool, tfloat)
>> -binop_reduce("bany_fnequal", 1, tbool, tfloat)
>> -binop_reduce("ball_iequal",  1, tbool, tint)
>> -binop_reduce("bany_inequal", 1, tbool, tint)
>> +binop_reduce("ball_fequal",  1, tbool, tfloat, "{src0} == {src1}",
>> +             "{src0} && {src1}", "{src}")
>> +binop_reduce("bany_fnequal", 1, tbool, tfloat, "{src0} != {src1}",
>> +             "{src0} || {src1}", "{src}")
>> +binop_reduce("ball_iequal",  1, tbool, tint, "{src0} == {src0}",
>> +             "{src0} && {src1}", "{src}")
>> +binop_reduce("bany_inequal", 1, tbool, tint, "{src0} != {src1}",
>> +             "{src0} || {src1}", "{src}")
>>
>>  # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
>>
>> -binop_reduce("fall_equal",  1, tfloat, tfloat)
>> -binop_reduce("fany_nequal", 1, tfloat, tfloat)
>> +binop_reduce("fall_equal",  1, tfloat, tfloat, "{src0} == {src1}",
>> +             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
>> +binop_reduce("fany_nequal", 1, tfloat, tfloat, "{src0} != {src1}",
>> +             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
>>
>>  # These comparisons for integer-less hardware return 1.0 and 0.0 for true
>>  # and false respectively
>>
>> -binop("slt", tfloat, "") # Set on Less Than
>> -binop("sge", tfloat, "") # Set on Greater Than or Equal
>> -binop("seq", tfloat, commutative) # Set on Equal
>> -binop("sne", tfloat, commutative) # Set on Not Equal
>> +binop("slt", tfloat, "", "({src0} < {src1}) ? 1.0f : 0.0f") # Set on Less
>> Than
>> +binop("sge", tfloat, "", "({src0} >= {src1}) ? 1.0f : 0.0f") # Set on
>> Greater or Equal
>> +binop("seq", tfloat, commutative, "({src0} == {src1}) ? 1.0f : 0.0f") #
>> Set on Equal
>> +binop("sne", tfloat, commutative, "({src0} != {src1}) ? 1.0f : 0.0f") #
>> Set on Not Equal
>>
>>
>> -binop("ishl", tint, "")
>> -binop("ishr", tint, "")
>> -binop("ushr", tunsigned, "")
>> +binop("ishl", tint, "", "{src0} << {src1}")
>> +binop("ishr", tint, "", "{src0} >> {src1}")
>> +binop("ushr", tunsigned, "", "{src0} >> {src1}")
>>
>>  # bitwise logic operators
>>  #
>> @@ -292,9 +413,9 @@ binop("ushr", tunsigned, "")
>>  # integers.
>>
>>
>> -binop("iand", tunsigned, commutative + associative)
>> -binop("ior", tunsigned, commutative + associative)
>> -binop("ixor", tunsigned, commutative + associative)
>> +binop("iand", tunsigned, commutative + associative, "{src0} & {src1}")
>> +binop("ior", tunsigned, commutative + associative, "{src0} | {src1}")
>> +binop("ixor", tunsigned, commutative + associative, "{src0} ^ {src1}")
>>
>>
>>  # floating point logic operators
>> @@ -302,42 +423,60 @@ binop("ixor", tunsigned, commutative + associative)
>>  # These use (src != 0.0) for testing the truth of the input, and output
>> 1.0
>>  # for true and 0.0 for false
>>
>> -binop("fand", tfloat, commutative)
>> -binop("for", tfloat, commutative)
>> -binop("fxor", tfloat, commutative)
>> -
>> -binop_reduce("fdot", 1, tfloat, tfloat)
>> -
>> -binop("fmin", tfloat, commutative + associative)
>> -binop("imin", tint, commutative + associative)
>> -binop("umin", tunsigned, commutative + associative)
>> -binop("fmax", tfloat, commutative + associative)
>> -binop("imax", tint, commutative + associative)
>> -binop("umax", tunsigned, commutative + associative)
>> -
>> -binop("fpow", tfloat, "")
>> -
>> -binop_horiz("pack_half_2x16_split", 1, tunsigned, 1, tfloat, 1, tfloat)
>> -
>> -binop("bfm", tunsigned, "")
>> -
>> -binop("ldexp", tunsigned, "")
>> +binop("fand", tfloat, commutative,
>> +      "(({src0} != 0.0f) && ({src1} != 0.0f)) ? 1.0f : 0.0f")
>> +binop("for", tfloat, commutative,
>> +      "(({src0} != 0.0f) || ({src1} != 0.0f)) ? 1.0f : 0.0f")
>> +binop("fxor", tfloat, commutative,
>> +      "({src0} != 0.0f && {src1} == 0.0f) || ({src0} == 0.0f && {src1} !=
>> 0.0f) ? 1.0f : 0.0f")
>> +
>> +binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} +
>> {src1}",
>> +             "{src}")
>> +
>> +binop("fmin", tfloat, commutative + associative, "fminf({src0}, {src1})")
>> +binop("imin", tint, commutative + associative, "{src1} > {src0} ? {src0}
>> : {src1}")
>> +binop("umin", tunsigned, commutative + associative, "{src1} > {src0} ?
>> {src0} : {src1}")
>> +binop("fmax", tfloat, commutative + associative, "fmaxf({src0}, {src1})")
>> +binop("imax", tint, commutative + associative, "{src1} > {src0} ? {src1}
>> : {src0}")
>> +binop("umax", tunsigned, commutative + associative, "{src1} > {src0} ?
>> {src1} : {src0}")
>> +
>> +binop("fpow", tfloat, "", "powf({src0}, {src1})")
>> +
>> +binop_horiz("pack_half_2x16_split", 1, tunsigned, 1, tfloat, 1, tfloat,
>> +            "pack_half_1x16({src0.x}) | (pack_half_1x16({src1.x}) <<
>> 16)")
>> +
>> +binop_convert("bfm", tunsigned, tint, "", """
>> +int offset = {src0}, bits = {src1};
>> +if (offset < 0 || bits < 0 || offset + bits > 32)
>> +   {dst} = 0; /* undefined per the spec */
>> +else
>> +   {dst} = ((1 << bits)- 1) << offset;
>> +""")
>> +
>> +opcode("ldexp", 0, tunsigned, [0, 0], [tfloat, tint], "", """
>> +{dst} = ldexp({src0}, {src1});
>> +/* flush denormals to zero. */
>> +if (!isnormal({dst}))
>> +   {dst} = copysign(0.0f, {src0});
>> +""")
>>
>>  # Combines the first component of each input to make a 2-component
>> vector.
>>
>> -binop_horiz("vec2", 2, tunsigned, 1, tunsigned, 1, tunsigned)
>> +binop_horiz("vec2", 2, tunsigned, 1, tunsigned, 1, tunsigned, """
>> +{dst.x} = {src0.x};
>> +{dst.y} = {src1.x};
>> +""")
>>
>> -def triop(name, ty):
>> -   opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], "")
>> -def triop_horiz(name, output_size, src1_size, src2_size, src3_size):
>> +def triop(name, ty, const_expr):
>> +   opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], "", const_expr)
>> +def triop_horiz(name, output_size, src1_size, src2_size, src3_size,
>> const_expr):
>>     opcode(name, output_size, tunsigned,
>>     [src1_size, src2_size, src3_size],
>> -   [tunsigned, tunsigned, tunsigned], "")
>> +   [tunsigned, tunsigned, tunsigned], "", const_expr)
>>
>> -# fma(a, b, c) = (a# b) + c
>> -triop("ffma", tfloat)
>> +triop("ffma", tfloat, "{src0} * {src1} + {src2}")
>>
>> -triop("flrp", tfloat)
>> +triop("flrp", tfloat, "{src0} * (1 - {src2}) + {src1} * {src2}")
>>
>>  # Conditional Select
>>  #
>> @@ -346,32 +485,83 @@ triop("flrp", tfloat)
>>  # bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
>>
>>
>> -triop("fcsel", tfloat)
>> +triop("fcsel", tfloat, "({src0} != 0.0f) ? {src1} : {src2}")
>>  opcode("bcsel", 0, tunsigned, [0, 0, 0],
>> -       [tbool, tunsigned, tunsigned], "")
>> -
>> -triop("bfi", tunsigned)
>> -
>> -triop("ubitfield_extract", tunsigned)
>> -opcode("ibitfield_extract", 0, tint, [0, 0, 0],
>> -       [tint, tunsigned, tunsigned], "")
>> +      [tbool, tunsigned, tunsigned], "", "{src0} ? {src1} : {src2}")
>> +
>> +triop("bfi", tunsigned, """
>> +unsigned mask = {src0}, insert = {src1} & mask, base = {src2};
>> +if (mask == 0) {{
>> +   {dst} = base;
>> +}} else {{
>> +   unsigned tmp = mask;
>> +   while (!(tmp & 1)) {{
>> +      tmp >>= 1;
>> +      insert <<= 1;
>> +   }}
>> +   {dst} = (base & ~mask) | insert;
>> +}}
>> +""")
>> +
>> +opcode("ubitfield_extract", 0, tunsigned,
>> +       [0, 1, 1], [tunsigned, tint, tint], "", """
>> +unsigned base = {src0};
>> +int offset = {src1.x}, bits = {src2.x};
>> +if (bits == 0) {{
>> +   {dst} = 0;
>> +}} else if (bits < 0 || offset < 0 || offset + bits > 32) {{
>> +   {dst} = 0; /* undefined per the spec */
>> +}} else {{
>> +   {dst} = (base >> offset) & ((1 << bits) - 1);
>> +}}
>> +""")
>> +opcode("ibitfield_extract", 0, tint,
>> +       [0, 1, 1], [tint, tint, tint], "", """
>> +int base = {src0};
>> +int offset = {src1.x}, bits = {src2.x};
>> +if (bits == 0) {{
>> +   {dst} = 0;
>> +}} else if (offset < 0 || bits < 0 || offset + bits > 32) {{
>> +   {dst} = 0;
>> +}} else {{
>> +   {dst} = (base << (32 - offset - bits)) >> offset; /* use
>> sign-extending shift */
>> +}}
>> +""")
>>
>>  # Combines the first component of each input to make a 3-component
>> vector.
>>
>> -triop_horiz("vec3", 3, 1, 1, 1)
>> +triop_horiz("vec3", 3, 1, 1, 1, """
>> +{dst.x} = {src0.x};
>> +{dst.y} = {src1.x};
>> +{dst.z} = {src2.x};
>> +""")
>>
>> -def quadop(name):
>> -   opcode(name, 0, tunsigned, [0, 0, 0, 0],
>> -          [tunsigned, tunsigned, tunsigned, tunsigned],
>> -          "")
>> -def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
>> src4_size):
>> +def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
>> +                 src4_size, const_expr):
>>     opcode(name, output_size, tunsigned,
>>            [src1_size, src2_size, src3_size, src4_size],
>>            [tunsigned, tunsigned, tunsigned, tunsigned],
>> -          "")
>> -
>> -quadop("bitfield_insert")
>> -
>> -quadop_horiz("vec4", 4, 1, 1, 1, 1)
>> +          "", const_expr)
>> +
>> +opcode("bitfield_insert", 0, tunsigned, [0, 0, 1, 1],
>> +       [tunsigned, tunsigned, tint, tint], "", """
>> +unsigned base = {src0}, insert = {src1};
>> +int offset = {src2.x}, bits = {src3.x};
>> +if (bits == 0) {{
>> +   {dst} = 0;
>> +}} else if (offset < 0 || bits < 0 || bits + offset > 32) {{
>> +   {dst} = 0;
>> +}} else {{
>> +   unsigned mask = ((1 << bits) - 1) << offset;
>> +   {dst} = (base & ~mask) | ((insert << bits) & mask);
>> +}}
>> +""")
>> +
>> +quadop_horiz("vec4", 4, 1, 1, 1, 1, """
>> +{dst.x} = {src0.x};
>> +{dst.y} = {src1.x};
>> +{dst.z} = {src2.x};
>> +{dst.w} = {src3.x};
>> +""")
>>
>>
>> --
>> 2.1.0
>>
>> _______________________________________________
>> mesa-dev mailing list
>> mesa-dev at lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/mesa-dev