[Mesa-dev] [PATCH v2 3/5] nir: add new constant folding infrastructure

Thu Jan 22 20:32:15 PST 2015

Add a required field to the Opcode class, const_expr, that contains an
expression or statement that computes the result of the opcode given known
constant inputs. Then take those const_expr's and expand them into a function
that takes an opcode and an array of constant inputs and spits out the constant
result. This means that when adding opcodes, there's one less place to update,
and almost all the opcodes are self-documenting since the information on how to
compute the result is right next to the definition.

The helper functions in nir_constant_expressions.c were taken from
ir_constant_expressions.cpp.

v2: use Python string formatting and get rid of the regexes
v2.5:
- fixup Makefile changes
- remove unused global wr(), rename wr() to wrap() (Jason)
- remove optional arguments in wr() and a few functions in unop_reduce()
  and binop_reduce() that were unnecessary

Signed-off-by: Connor Abbott <cwabbott0 at gmail.com>
---
 src/glsl/Makefile.am                     |   8 +-
 src/glsl/Makefile.sources                |   3 +-
 src/glsl/nir/.gitignore                  |   1 +
 src/glsl/nir/nir_constant_expressions.h  |  32 ++
 src/glsl/nir/nir_constant_expressions.py | 324 ++++++++++++++++++
 src/glsl/nir/nir_opcodes.py              | 562 +++++++++++++++++++++----------
 6 files changed, 743 insertions(+), 187 deletions(-)
 create mode 100644 src/glsl/nir/nir_constant_expressions.h
 create mode 100644 src/glsl/nir/nir_constant_expressions.py

diff --git a/src/glsl/Makefile.am b/src/glsl/Makefile.am
index 8474b70..408db6e 100644
--- a/src/glsl/Makefile.am
+++ b/src/glsl/Makefile.am
@@ -219,7 +219,8 @@ BUILT_SOURCES =						\
 	glcpp/glcpp-lex.c				\
 	nir/nir_opt_algebraic.c				\
 	nir/nir_opcodes.h				\
-	nir/nir_opcodes.c
+	nir/nir_opcodes.c				\
+	nir/nir_constant_expressions.c
 CLEANFILES =						\
 	glcpp/glcpp-parse.h				\
 	glsl_parser.h					\
@@ -245,3 +246,8 @@ nir/nir_opcodes.c: nir/nir_opcodes.py nir/nir_opcodes_c.py
 	$(PYTHON2) $(PYTHON_FLAGS) $(srcdir)/nir/nir_opcodes_c.py > $@
 
 nir/nir.h: nir/nir_opcodes.h
+
+nir/nir_constant_expressions.c: nir/nir_opcodes.py nir/nir_constant_expressions.py nir/nir_constant_expressions.h
+	$(MKDIR_P) nir;							\
+	$(PYTHON2) $(PYTHON_FLAGS) $(srcdir)/nir/nir_constant_expressions.py > $@
+
diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
index 56299eb..e803882 100644
--- a/src/glsl/Makefile.sources
+++ b/src/glsl/Makefile.sources
@@ -16,7 +16,8 @@ LIBGLCPP_GENERATED_FILES = \
 NIR_GENERATED_FILES = \
 	$(GLSL_BUILDDIR)/nir/nir_opt_algebraic.c \
 	$(GLSL_BUILDDIR)/nir/nir_opcodes.h \
-	$(GLSL_BUILDDIR)/nir/nir_opcodes.c
+	$(GLSL_BUILDDIR)/nir/nir_opcodes.c \
+	$(GLSL_BUILDDIR)/nir/nir_constant_expressions.c
 
 NIR_FILES = \
 	$(GLSL_SRCDIR)/nir/nir.c \
diff --git a/src/glsl/nir/.gitignore b/src/glsl/nir/.gitignore
index 4c28193..261f64f 100644
--- a/src/glsl/nir/.gitignore
+++ b/src/glsl/nir/.gitignore
@@ -1,3 +1,4 @@
 nir_opt_algebraic.c
 nir_opcodes.c
 nir_opcodes.h
+nir_constant_expressions.c
diff --git a/src/glsl/nir/nir_constant_expressions.h b/src/glsl/nir/nir_constant_expressions.h
new file mode 100644
index 0000000..4ca09be
--- /dev/null
+++ b/src/glsl/nir/nir_constant_expressions.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright © 2014 Connor Abbott
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Connor Abbott (cwabbott0 at gmail.com)
+ *
+ */
+
+#include "nir.h"
+
+nir_const_value nir_eval_const_opcode(nir_op op, unsigned num_components,
+                                      nir_const_value *src);
+
diff --git a/src/glsl/nir/nir_constant_expressions.py b/src/glsl/nir/nir_constant_expressions.py
new file mode 100644
index 0000000..d2698b3
--- /dev/null
+++ b/src/glsl/nir/nir_constant_expressions.py
@@ -0,0 +1,324 @@
+#! /usr/bin/env python
+#
+# Copyright (C) 2014 Connor Abbott
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+#
+# Authors:
+#    Connor Abbott (cwabbott0 at gmail.com)
+
+from nir_opcodes import opcodes
+from mako.template import Template
+
+# the const_expr string for each opcode has a few shortcuts - most only have
+# an expression, and the "dst = (expr);" is implied. In addition,
+# per-component inputs and outputs are referenced without any subscripts, so
+# we need to create the implicit for-loop for per-component opcodes. In
+# addition, we need to expand out the src0, src1, etc. with actual sources
+# with the appropriate type using the union.
+
+class Operand(object):
+   """
+   This class is a helper used for formatting the constant expressions. We create
+   one for each operand that can be used: dst, src0, src1, etc. Calling str() on
+   the object itself returns the per-component channel, so that e.g. ${src0}
+   expands to the per-component channel of src[0], and we create x, y, z, and w
+   members so that e.g. ${src0.x} will return the 0th channel of src[0]. We need
+   to know the type of the operand so that we can use the right element of the
+   union.
+   """
+   def __init__(self, name, type_, is_src):
+      if type_ == "bool" or type_ == "unsigned":
+         prefix = "u"
+      elif type_ == "int":
+         prefix = "i"
+      else:
+         prefix = "f"
+
+      wrap_bool = is_src and type_ == "bool"
+
+      def wrap(string):
+         if wrap_bool:
+            return "(" + string + " != NIR_FALSE)"
+         return string
+
+      self.per_component = wrap(name + "." + prefix + "[_i]")
+      self.x = wrap(name + "." + prefix + "[0]")
+      self.y = wrap(name + "." + prefix + "[1]")
+      self.z = wrap(name + "." + prefix + "[2]")
+      self.w = wrap(name + "." + prefix + "[3]")
+
+   def __str__(self):
+      return self.per_component
+
+def expand_constexpr(opcode):
+   const_expr = opcode.const_expr
+
+   if "dst" not in const_expr:
+      if opcode.output_type == "bool":
+         # For convenience, insert the conversion to unsigned.
+         # Note that we don't do this for things that aren't expressions.
+         const_expr = "(" + const_expr + ") ? NIR_TRUE : NIR_FALSE"
+
+      if opcode.output_size == 0:
+         const_expr = "{dst} = " + const_expr + ";"
+      else:
+         # for non-per-component opcodes, assume we broadcast to all components
+         const_expr = "\n".join(
+               "{dst." + "xyzw"[i] + "} = " + const_expr + ";"
+                     for i in range(opcode.output_size))
+
+   replacement_dict = {
+      "src" + str(i) : Operand("src[" + str(i) + "]", opcode.input_types[i], True)
+         for i in range(opcode.num_inputs)
+   }
+
+   replacement_dict["dst"] = Operand("dst", opcode.output_type, False)
+
+   const_expr = const_expr.format(**replacement_dict)
+
+   if opcode.output_size == 0:
+      const_expr = "for (unsigned _i = 0; _i < num_components; _i++) {" + const_expr + "}"
+
+   return const_expr
+
+
+const_exprs = {name : expand_constexpr(opcode)
+                 for name, opcode in opcodes.iteritems()}
+
+template = Template("""
+#include <math.h>
+#include "main/core.h"
+#include "nir_constant_expressions.h"
+
+#if defined(_MSC_VER) && (_MSC_VER < 1800)
+static int isnormal(double x)
+{
+   return _fpclass(x) == _FPCLASS_NN || _fpclass(x) == _FPCLASS_PN;
+}
+#elif defined(__SUNPRO_CC)
+#include <ieeefp.h>
+static int isnormal(double x)
+{
+   return fpclass(x) == FP_NORMAL;
+}
+#endif
+
+#if defined(_MSC_VER)
+static double copysign(double x, double y)
+{
+   return _copysign(x, y);
+}
+#endif
+
+/**
+ * Evaluate one component of packSnorm4x8.
+ */
+static uint8_t
+pack_snorm_1x8(float x)
+{
+    /* From section 8.4 of the GLSL 4.30 spec:
+     *
+     *    packSnorm4x8
+     *    ------------
+     *    The conversion for component c of v to fixed point is done as
+     *    follows:
+     *
+     *      packSnorm4x8: round(clamp(c, -1, +1) * 127.0)
+     *
+     * We must first cast the float to an int, because casting a negative
+     * float to a uint is undefined.
+     */
+   return (uint8_t) (int8_t)
+          _mesa_round_to_even(CLAMP(x, -1.0f, +1.0f) * 127.0f);
+}
+
+/**
+ * Evaluate one component of packSnorm2x16.
+ */
+static uint16_t
+pack_snorm_1x16(float x)
+{
+    /* From section 8.4 of the GLSL ES 3.00 spec:
+     *
+     *    packSnorm2x16
+     *    -------------
+     *    The conversion for component c of v to fixed point is done as
+     *    follows:
+     *
+     *      packSnorm2x16: round(clamp(c, -1, +1) * 32767.0)
+     *
+     * We must first cast the float to an int, because casting a negative
+     * float to a uint is undefined.
+     */
+   return (uint16_t) (int16_t)
+          _mesa_round_to_even(CLAMP(x, -1.0f, +1.0f) * 32767.0f);
+}
+
+/**
+ * Evaluate one component of unpackSnorm4x8.
+ */
+static float
+unpack_snorm_1x8(uint8_t u)
+{
+    /* From section 8.4 of the GLSL 4.30 spec:
+     *
+     *    unpackSnorm4x8
+     *    --------------
+     *    The conversion for unpacked fixed-point value f to floating point is
+     *    done as follows:
+     *
+     *       unpackSnorm4x8: clamp(f / 127.0, -1, +1)
+     */
+   return CLAMP((int8_t) u / 127.0f, -1.0f, +1.0f);
+}
+
+/**
+ * Evaluate one component of unpackSnorm2x16.
+ */
+static float
+unpack_snorm_1x16(uint16_t u)
+{
+    /* From section 8.4 of the GLSL ES 3.00 spec:
+     *
+     *    unpackSnorm2x16
+     *    ---------------
+     *    The conversion for unpacked fixed-point value f to floating point is
+     *    done as follows:
+     *
+     *       unpackSnorm2x16: clamp(f / 32767.0, -1, +1)
+     */
+   return CLAMP((int16_t) u / 32767.0f, -1.0f, +1.0f);
+}
+
+/**
+ * Evaluate one component packUnorm4x8.
+ */
+static uint8_t
+pack_unorm_1x8(float x)
+{
+    /* From section 8.4 of the GLSL 4.30 spec:
+     *
+     *    packUnorm4x8
+     *    ------------
+     *    The conversion for component c of v to fixed point is done as
+     *    follows:
+     *
+     *       packUnorm4x8: round(clamp(c, 0, +1) * 255.0)
+     */
+   return (uint8_t) _mesa_round_to_even(CLAMP(x, 0.0f, 1.0f) * 255.0f);
+}
+
+/**
+ * Evaluate one component packUnorm2x16.
+ */
+static uint16_t
+pack_unorm_1x16(float x)
+{
+    /* From section 8.4 of the GLSL ES 3.00 spec:
+     *
+     *    packUnorm2x16
+     *    -------------
+     *    The conversion for component c of v to fixed point is done as
+     *    follows:
+     *
+     *       packUnorm2x16: round(clamp(c, 0, +1) * 65535.0)
+     */
+   return (uint16_t) _mesa_round_to_even(CLAMP(x, 0.0f, 1.0f) * 65535.0f);
+}
+
+/**
+ * Evaluate one component of unpackUnorm4x8.
+ */
+static float
+unpack_unorm_1x8(uint8_t u)
+{
+    /* From section 8.4 of the GLSL 4.30 spec:
+     *
+     *    unpackUnorm4x8
+     *    --------------
+     *    The conversion for unpacked fixed-point value f to floating point is
+     *    done as follows:
+     *
+     *       unpackUnorm4x8: f / 255.0
+     */
+   return (float) u / 255.0f;
+}
+
+/**
+ * Evaluate one component of unpackUnorm2x16.
+ */
+static float
+unpack_unorm_1x16(uint16_t u)
+{
+    /* From section 8.4 of the GLSL ES 3.00 spec:
+     *
+     *    unpackUnorm2x16
+     *    ---------------
+     *    The conversion for unpacked fixed-point value f to floating point is
+     *    done as follows:
+     *
+     *       unpackUnorm2x16: f / 65535.0
+     */
+   return (float) u / 65535.0f;
+}
+
+/**
+ * Evaluate one component of packHalf2x16.
+ */
+static uint16_t
+pack_half_1x16(float x)
+{
+   return _mesa_float_to_half(x);
+}
+
+/**
+ * Evaluate one component of unpackHalf2x16.
+ */
+static float
+unpack_half_1x16(uint16_t u)
+{
+   return _mesa_half_to_float(u);
+}
+
+nir_const_value
+nir_eval_const_opcode(nir_op op, unsigned num_components,
+                      nir_const_value *src)
+{
+   nir_const_value dst = {
+      .u = {0, 0, 0, 0}
+   };
+
+   switch (op) {
+% for name, const_expr in sorted(const_exprs.iteritems()):
+   case nir_op_${name}: {
+      ${const_expr}
+      break;
+   }
+% endfor
+   case nir_num_opcodes: unreachable("shouldn't get here");
+   }
+
+   return dst;
+}
+""")
+
+print template.render(const_exprs=const_exprs)
+
diff --git a/src/glsl/nir/nir_opcodes.py b/src/glsl/nir/nir_opcodes.py
index 4f09459..7176a2b 100644
--- a/src/glsl/nir/nir_opcodes.py
+++ b/src/glsl/nir/nir_opcodes.py
@@ -24,6 +24,7 @@
 # Authors:
 #    Connor Abbott (cwabbott0 at gmail.com)
 
+
 # Class that represents all the information we have about the opcode
 # NOTE: this must be kept in sync with nir_op_info
 
@@ -40,6 +41,9 @@ class Opcode(object):
       - input_types is a list of types
       - algebraic_properties is a space-seperated string, where nir_op_is_ is
         prepended before each entry
+      - const_expr is an expression or series of statements that computes the
+        constant value of the opcode given the constant values of its inputs.
+        See nir_constant_expressions.py.
       """
       assert isinstance(name, str)
       assert isinstance(output_size, int)
@@ -49,6 +53,7 @@ class Opcode(object):
       assert isinstance(input_types, list)
       assert isinstance(input_types[0], str)
       assert isinstance(algebraic_properties, str)
+      assert isinstance(const_expr, str)
       assert len(input_sizes) == len(input_types)
       assert 0 <= output_size <= 4
       for size in input_sizes:
@@ -62,6 +67,7 @@ class Opcode(object):
       self.input_sizes = input_sizes
       self.input_types = input_types
       self.algebraic_properties = algebraic_properties
+      self.const_expr = const_expr
 
 # helper variables for strings
 tfloat = "float"
@@ -76,178 +82,289 @@ associative = "associative "
 opcodes = {}
 
 def opcode(name, output_size, output_type, input_sizes, input_types,
-           algebraic_properties):
+           algebraic_properties, const_expr):
    assert name not in opcodes
    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
-                          input_types, algebraic_properties)
-
-def unop_convert(name, in_type, out_type):
-   opcode(name, 0, out_type, [0], [in_type], "")
-
-def unop(name, ty):
-   opcode(name, 0, ty, [0], [ty], "")
-
-def unop_horiz(name, output_size, output_type, input_size, input_type):
-   opcode(name, output_size, output_type, [input_size], [input_type], "")
-
-def unop_reduce(name, output_size, output_type, input_type):
-   unop_horiz(name + "2", output_size, output_type, 2, input_type)
-   unop_horiz(name + "3", output_size, output_type, 3, input_type)
-   unop_horiz(name + "4", output_size, output_type, 4, input_type)
+                          input_types, algebraic_properties, const_expr)
+
+def unop_convert(name, in_type, out_type, const_expr):
+   opcode(name, 0, out_type, [0], [in_type], "", const_expr)
+
+def unop(name, ty, const_expr):
+   opcode(name, 0, ty, [0], [ty], "", const_expr)
+
+def unop_horiz(name, output_size, output_type, input_size, input_type,
+               const_expr):
+   opcode(name, output_size, output_type, [input_size], [input_type], "",
+          const_expr)
+
+def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
+                reduce_expr, final_expr):
+   def prereduce(src):
+      return "(" + prereduce_expr.format(src=src) + ")"
+   def final(src):
+      return final_expr.format(src="(" + src + ")")
+   def reduce_(src0, src1):
+      return reduce_expr.format(src0=src0, src1=src1)
+   src0 = prereduce("{src0.x}")
+   src1 = prereduce("{src0.y}")
+   src2 = prereduce("{src0.z}")
+   src3 = prereduce("{src0.w}")
+   unop_horiz(name + "2", output_size, output_type, 2, input_type,
+              final(reduce_(src0, src1)))
+   unop_horiz(name + "3", output_size, output_type, 3, input_type,
+              final(reduce_(reduce_(src0, src1), src2)))
+   unop_horiz(name + "4", output_size, output_type, 4, input_type,
+              final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
    
 
 # These two move instructions differ in what modifiers they support and what
 # the negate modifier means. Otherwise, they are identical.
-unop("fmov", tfloat)
-unop("imov", tint)
-
-unop("ineg", tint)
-unop("fneg", tfloat)
-unop("inot", tint) # invert every bit of the integer
-unop("fnot", tfloat) # (src == 0.0) ? 1.0 : 0.0 
-unop("fsign", tfloat)
-unop("isign", tint)
-unop("iabs", tint)
-unop("fabs", tfloat)
-unop("fsat", tfloat)
-unop("frcp", tfloat)
-unop("frsq", tfloat)
-unop("fsqrt", tfloat)
-unop("fexp", tfloat) # < e^x
-unop("flog", tfloat) # log base e
-unop("fexp2", tfloat)
-unop("flog2", tfloat)
-unop_convert("f2i", tfloat, tint) # Float-to-integer conversion.
-unop_convert("f2u", tfloat, tunsigned) # Float-to-unsigned conversion
-unop_convert("i2f", tint, tfloat) # Integer-to-float conversion.
-unop_convert("f2b", tfloat, tbool) # Float-to-boolean conversion
-unop_convert("b2f", tbool, tfloat) # Boolean-to-float conversion
-unop_convert("i2b", tint, tbool) # int-to-boolean conversion
-unop_convert("b2i", tbool, tint) # Boolean-to-int conversion
-unop_convert("u2f", tunsigned, tfloat) #Unsigned-to-float conversion.
-
-unop_reduce("bany", 1, tbool, tbool) # returns ~0 if any component of src[0] != 0
-unop_reduce("ball", 1, tbool, tbool) # returns ~0 if all components of src[0] != 0
-unop_reduce("fany", 1, tfloat, tfloat) # returns 1.0 if any component of src[0] != 0
-unop_reduce("fall", 1, tfloat, tfloat) # returns 1.0 if all components of src[0] != 0
+unop("fmov", tfloat, "{src0}")
+unop("imov", tint, "{src0}")
+
+unop("ineg", tint, "-{src0}")
+unop("fneg", tfloat, "-{src0}")
+unop("inot", tint, "~{src0}") # invert every bit of the integer
+unop("fnot", tfloat, "({src0} == 0.0f) ? 1.0f : 0.0f") 
+unop("fsign", tfloat, "({src0} == 0.0f) ? 0.0f : (({src0} > 0.0f) ? 1.0f : -1.0f)")
+unop("isign", tint, "({src0} == 0) ? 0 : (({src0} > 0) ? 1 : -1)")
+unop("iabs", tint, "abs({src0})")
+unop("fabs", tfloat, "fabsf({src0})")
+unop("fsat", tfloat, "({src0} > 1.0f) ? 1.0f : (({src0} <= 0.0f) ? 0.0f : {src0})")
+unop("frcp", tfloat, "1.0f / {src0}")
+unop("frsq", tfloat, "1.0f / sqrtf({src0})")
+unop("fsqrt", tfloat, "sqrtf({src0})")
+unop("fexp", tfloat, "expf({src0})") # < e^x
+unop("flog", tfloat, "logf({src0})") # log base e
+unop("fexp2", tfloat, "exp2f({src0})")
+unop("flog2", tfloat, "log2f({src0})")
+unop_convert("f2i", tfloat, tint, "{src0}") # Float-to-integer conversion.
+unop_convert("f2u", tfloat, tunsigned, "{src0}") # Float-to-unsigned conversion
+unop_convert("i2f", tint, tfloat, "{src0}") # Integer-to-float conversion.
+# Float-to-boolean conversion: true iff the input is non-zero
+unop_convert("f2b", tfloat, tbool, "{src0} != 0.0f")
+# Boolean-to-float conversion
+unop_convert("b2f", tbool, tfloat, "{src0} ? 1.0f : 0.0f")
+# Int-to-boolean conversion: true iff the input is non-zero
+unop_convert("i2b", tint, tbool, "{src0} != 0")
+unop_convert("b2i", tbool, tint, "{src0} ? 1 : 0") # Boolean-to-int conversion
+unop_convert("u2f", tunsigned, tfloat, "{src0}") #Unsigned-to-float conversion.
+
+unop_reduce("bany", 1, tbool, tbool, "{src}", "{src0} || {src1}", "{src}")
+unop_reduce("ball", 1, tbool, tbool, "{src}", "{src0} && {src1}", "{src}")
+unop_reduce("fany", 1, tfloat, tfloat, "{src} != 0.0f", "{src0} || {src1}",
+            "{src} ? 1.0f : 0.0f")
+unop_reduce("fall", 1, tfloat, tfloat, "{src} != 0.0f", "{src0} && {src1}",
+            "{src} ? 1.0f : 0.0f")
 
 # Unary floating-point rounding operations.
 
 
-unop("ftrunc", tfloat)
-unop("fceil", tfloat)
-unop("ffloor", tfloat)
-unop("ffract", tfloat)
-unop("fround_even", tfloat)
+unop("ftrunc", tfloat, "truncf({src0})")
+unop("fceil", tfloat, "ceilf({src0})")
+unop("ffloor", tfloat, "floorf({src0})")
+unop("ffract", tfloat, "{src0} - floorf({src0})")
+unop("fround_even", tfloat, "_mesa_round_to_even({src0})")
 
 
 # Trigonometric operations.
 
 
-unop("fsin", tfloat)
-unop("fcos", tfloat)
-unop("fsin_reduced", tfloat)
-unop("fcos_reduced", tfloat)
+unop("fsin", tfloat, "sinf({src0})")
+unop("fcos", tfloat, "cosf({src0})")
+unop("fsin_reduced", tfloat, "sinf({src0})")
+unop("fcos_reduced", tfloat, "cosf({src0})")
 
 
 # Partial derivatives.
 
 
-unop("fddx", tfloat)
-unop("fddy", tfloat)
-unop("fddx_fine", tfloat)
-unop("fddy_fine", tfloat)
-unop("fddx_coarse", tfloat)
-unop("fddy_coarse", tfloat)
+unop("fddx", tfloat, "0.0f") # the derivative of a constant is 0.
+unop("fddy", tfloat, "0.0f")
+unop("fddx_fine", tfloat, "0.0f")
+unop("fddy_fine", tfloat, "0.0f")
+unop("fddx_coarse", tfloat, "0.0f")
+unop("fddy_coarse", tfloat, "0.0f")
 
 
 # Floating point pack and unpack operations.
 
-
-unop_horiz("pack_snorm_2x16", 1, tunsigned, 2, tfloat)
-unop_horiz("pack_snorm_4x8", 1, tunsigned, 4, tfloat)
-unop_horiz("pack_unorm_2x16", 1, tunsigned, 2, tfloat)
-unop_horiz("pack_unorm_4x8", 1, tunsigned, 4, tfloat)
-unop_horiz("pack_half_2x16", 1, tunsigned, 2, tfloat)
-unop_horiz("unpack_snorm_2x16", 2, tfloat, 1, tunsigned)
-unop_horiz("unpack_snorm_4x8", 4, tfloat, 1, tunsigned)
-unop_horiz("unpack_unorm_2x16", 2, tfloat, 1, tunsigned)
-unop_horiz("unpack_unorm_4x8", 4, tfloat, 1, tunsigned)
-unop_horiz("unpack_half_2x16", 2, tfloat, 1, tunsigned)
+def pack_2x16(fmt):
+   unop_horiz("pack_" + fmt + "_2x16", 1, tunsigned, 2, tfloat, """
+{dst.x} = (uint32_t) pack_fmt_1x16({src0.x});
+{dst.x} |= ((uint32_t) pack_fmt_1x16({src0.y})) << 16;
+""".replace("fmt", fmt))
+
+def pack_4x8(fmt):
+   unop_horiz("pack_" + fmt + "_4x8", 1, tunsigned, 4, tfloat, """
+{dst.x} = (uint32_t) pack_fmt_1x8({src0.x});
+{dst.x} |= ((uint32_t) pack_fmt_1x8({src0.y})) << 8;
+{dst.x} |= ((uint32_t) pack_fmt_1x8({src0.z})) << 16;
+{dst.x} |= ((uint32_t) pack_fmt_1x8({src0.w})) << 24;
+""".replace("fmt", fmt))
+
+def unpack_2x16(fmt):
+   unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat, 1, tunsigned, """
+{dst.x} = unpack_fmt_1x16((uint16_t)({src0.x} & 0xffff));
+{dst.y} = unpack_fmt_1x16((uint16_t)({src0.x} >> 16));
+""".replace("fmt", fmt))
+
+def unpack_4x8(fmt):
+   unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat, 1, tunsigned, """
+{dst.x} = unpack_fmt_1x8((uint8_t)({src0.x} & 0xff));
+{dst.y} = unpack_fmt_1x8((uint8_t)(({src0.x} >> 8) & 0xff));
+{dst.z} = unpack_fmt_1x8((uint8_t)(({src0.x} >> 16) & 0xff));
+{dst.w} = unpack_fmt_1x8((uint8_t)({src0.x} >> 24));
+""".replace("fmt", fmt))
+
+
+pack_2x16("snorm")
+pack_4x8("snorm")
+pack_2x16("unorm")
+pack_4x8("unorm")
+pack_2x16("half")
+unpack_2x16("snorm")
+unpack_4x8("snorm")
+unpack_2x16("unorm")
+unpack_4x8("unorm")
+unpack_2x16("half")
 
 
 # Lowered floating point unpacking operations.
 
 
-unop_horiz("unpack_half_2x16_split_x", 1, tfloat, 1, tunsigned)
-unop_horiz("unpack_half_2x16_split_y", 1, tfloat, 1, tunsigned)
+unop_horiz("unpack_half_2x16_split_x", 1, tfloat, 1, tunsigned, """
+{dst.x} = unpack_half_1x16((uint16_t)({src0.x} & 0xffff));
+""")
+unop_horiz("unpack_half_2x16_split_y", 1, tfloat, 1, tunsigned, """
+{dst.x} = unpack_half_1x16((uint16_t)({src0.x} >> 16));
+""")
 
 
 # Bit operations, part of ARB_gpu_shader5.
 
 
-unop("bitfield_reverse", tunsigned)
-unop("bit_count", tunsigned)
-unop_convert("ufind_msb", tunsigned, tint)
-unop("ifind_msb", tint)
-unop("find_lsb", tint)
+unop("bitfield_reverse", tunsigned, """
+/* we're not winning any awards for speed here, but that's ok */
+{dst} = 0;
+for (unsigned bit = 0; bit < 32; bit++)
+   {dst} |= (({src0} >> bit) & 1) << (31 - bit);
+""")
+unop("bit_count", tunsigned, """
+{dst} = 0;
+for (unsigned bit = 0; bit < 32; bit++) {{
+   if (({src0} >> bit) & 1)
+      {dst}++;
+}}
+""")
+
+unop_convert("ufind_msb", tunsigned, tint, """
+{dst} = -1;
+for (int bit = 31; bit >= 0; bit--) {{
+   if (({src0} >> bit) & 1) {{
+      {dst} = bit;
+      break;
+   }}
+}}
+""")
+
+unop("ifind_msb", tint, """
+{dst} = -1;
+for (int bit = 31; bit >= 0; bit--) {{
+   /* If src0 < 0, we're looking for the first 0 bit.
+    * if src0 >= 0, we're looking for the first 1 bit.
+    */
+   if (((({src0} >> bit) & 1) && ({src0} >= 0)) ||
+      (!(({src0} >> bit) & 1) && ({src0} < 0))) {{
+      {dst} = bit;
+      break;
+   }}
+}}
+""")
+
+unop("find_lsb", tint, """
+{dst} = -1;
+for (unsigned bit = 0; bit < 32; bit++) {{
+   if (({src0} >> bit) & 1) {{
+      {dst} = bit;
+      break;
+   }}
+}}
+""")
 
 
 for i in xrange(1, 5):
    for j in xrange(1, 5):
-      unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat)
+      unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
 
-def binop_convert(name, out_type, in_type, alg_props):
-   opcode(name, 0, out_type, [0, 0], [in_type, in_type], alg_props)
+def binop_convert(name, out_type, in_type, alg_props, const_expr):
+   opcode(name, 0, out_type, [0, 0], [in_type, in_type], alg_props, const_expr)
 
-def binop(name, ty, alg_props):
-   binop_convert(name, ty, ty, alg_props)
+def binop(name, ty, alg_props, const_expr):
+   binop_convert(name, ty, ty, alg_props, const_expr)
 
-def binop_compare(name, ty, alg_props):
-   binop_convert(name, ty, tbool, alg_props)
+def binop_compare(name, ty, alg_props, const_expr):
+   binop_convert(name, tbool, ty, alg_props, const_expr)
 
 def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
-                src2_type):
-   opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type], "")
-
-def binop_reduce(name, output_size, output_type, src_type):
-   opcode(name + "2",output_size, output_type,
-          [2, 2], [src_type, src_type], commutative)
+                src2_type, const_expr):
+   opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
+          "", const_expr)
+
+def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
+                 reduce_expr, final_expr):
+   def final(src):
+      return final_expr.format(src= "(" + src + ")")
+   def reduce_(src0, src1):
+      return reduce_expr.format(src0=src0, src1=src1)
+   def prereduce(src0, src1):
+      return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
+   src0 = prereduce("{src0.x}", "{src1.x}")
+   src1 = prereduce("{src0.y}", "{src1.y}")
+   src2 = prereduce("{src0.z}", "{src1.z}")
+   src3 = prereduce("{src0.w}", "{src1.w}")
+   opcode(name + "2", output_size, output_type,
+          [2, 2], [src_type, src_type], commutative,
+          final(reduce_(src0, src1)))
    opcode(name + "3", output_size, output_type,
-          [3, 3], [src_type, src_type], commutative)
+          [3, 3], [src_type, src_type], commutative,
+          final(reduce_(reduce_(src0, src1), src2)))
    opcode(name + "4", output_size, output_type,
-          [4, 4], [src_type, src_type], commutative)
+          [4, 4], [src_type, src_type], commutative,
+          final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 
-binop("fadd", tfloat, commutative + associative)
-binop("iadd", tint, commutative + associative)
-binop("fsub", tfloat, "")
-binop("isub", tint, "")
+binop("fadd", tfloat, commutative + associative, "{src0} + {src1}")
+binop("iadd", tint, commutative + associative, "{src0} + {src1}")
+binop("fsub", tfloat, "", "{src0} - {src1}")
+binop("isub", tint, "", "{src0} - {src1}")
 
-binop("fmul", tfloat, commutative + associative)
+binop("fmul", tfloat, commutative + associative, "{src0} * {src1}")
 # low 32-bits of signed/unsigned integer multiply
-binop("imul", tint, commutative + associative)
+binop("imul", tint, commutative + associative, "{src0} * {src1}")
 # high 32-bits of signed integer multiply
-binop("imul_high", tint, commutative)
+binop("imul_high", tint, commutative,
+      "(int32_t)(((int64_t) {src0} * (int64_t) {src1}) >> 32)")
 # high 32-bits of unsigned integer multiply
-binop("umul_high", tunsigned, commutative)
+binop("umul_high", tunsigned, commutative,
+      "(uint32_t)(((uint64_t) {src0} * (uint64_t) {src1}) >> 32)")
 
-binop("fdiv", tfloat, "")
-binop("idiv", tint, "")
-binop("udiv", tunsigned, "")
+binop("fdiv", tfloat, "", "{src0} / {src1}")
+binop("idiv", tint, "", "{src1} == 0 ? 0 : {src0} / {src1}")
+binop("udiv", tunsigned, "", "{src1} == 0 ? 0 : {src0} / {src1}")
 
 # returns a boolean representing the carry resulting from the addition of
 # the two unsigned arguments.
 
-binop_convert("uadd_carry", tbool, tunsigned,
-              commutative)
+binop_convert("uadd_carry", tbool, tunsigned, commutative, "{src0} + {src1} < {src0}")
 
 # returns a boolean representing the borrow resulting from the subtraction
 # of the two unsigned arguments.
 
-binop_convert("usub_borrow", tbool, tunsigned, "")
+binop_convert("usub_borrow", tbool, tunsigned, "", "{src0} < {src1}")
 
-binop("fmod", tfloat, "")
-binop("umod", tunsigned, "")
+binop("fmod", tfloat, "", "{src0} - {src1} * floorf({src0} / {src1})")
+binop("umod", tunsigned, "", "{src1} == 0 ? 0 : {src0} % {src1}")
 
 #
 # Comparisons
@@ -256,41 +373,47 @@ binop("umod", tunsigned, "")
 
 # these integer-aware comparisons return a boolean (0 or ~0)
 
-binop_compare("flt", tfloat, "")
-binop_compare("fge", tfloat, "")
-binop_compare("feq", tfloat, commutative)
-binop_compare("fne", tfloat, commutative)
-binop_compare("ilt", tint, "")
-binop_compare("ige", tint, "")
-binop_compare("ieq", tint, commutative)
-binop_compare("ine", tint, commutative)
-binop_compare("ult", tunsigned, "")
-binop_compare("uge", tunsigned, "")
+binop_compare("flt", tfloat, "", "{src0} < {src1}")
+binop_compare("fge", tfloat, "", "{src0} >= {src1}")
+binop_compare("feq", tfloat, commutative, "{src0} == {src1}")
+binop_compare("fne", tfloat, commutative, "{src0} != {src1}")
+binop_compare("ilt", tint, "", "{src0} < {src1}")
+binop_compare("ige", tint, "", "{src0} >= {src1}")
+binop_compare("ieq", tint, commutative, "{src0} == {src1}")
+binop_compare("ine", tint, commutative, "{src0} != {src1}")
+binop_compare("ult", tunsigned, "", "{src0} < {src1}")
+binop_compare("uge", tunsigned, "", "{src0} >= {src1}")
 
 # integer-aware GLSL-style comparisons that compare floats and ints
 
-binop_reduce("ball_fequal",  1, tbool, tfloat)
-binop_reduce("bany_fnequal", 1, tbool, tfloat)
-binop_reduce("ball_iequal",  1, tbool, tint)
-binop_reduce("bany_inequal", 1, tbool, tint)
+binop_reduce("ball_fequal",  1, tbool, tfloat, "{src0} == {src1}",
+             "{src0} && {src1}", "{src}")
+binop_reduce("bany_fnequal", 1, tbool, tfloat, "{src0} != {src1}",
+             "{src0} || {src1}", "{src}")
+binop_reduce("ball_iequal",  1, tbool, tint, "{src0} == {src1}",
+             "{src0} && {src1}", "{src}")
+binop_reduce("bany_inequal", 1, tbool, tint, "{src0} != {src1}",
+             "{src0} || {src1}", "{src}")
 
 # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
 
-binop_reduce("fall_equal",  1, tfloat, tfloat)
-binop_reduce("fany_nequal", 1, tfloat, tfloat)
+binop_reduce("fall_equal",  1, tfloat, tfloat, "{src0} == {src1}",
+             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
+binop_reduce("fany_nequal", 1, tfloat, tfloat, "{src0} != {src1}",
+             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
 
 # These comparisons for integer-less hardware return 1.0 and 0.0 for true
 # and false respectively
 
-binop("slt", tfloat, "") # Set on Less Than
-binop("sge", tfloat, "") # Set on Greater Than or Equal
-binop("seq", tfloat, commutative) # Set on Equal
-binop("sne", tfloat, commutative) # Set on Not Equal
+binop("slt", tfloat, "", "({src0} < {src1}) ? 1.0f : 0.0f") # Set on Less Than
+binop("sge", tfloat, "", "({src0} >= {src1}) ? 1.0f : 0.0f") # Set on Greater or Equal
+binop("seq", tfloat, commutative, "({src0} == {src1}) ? 1.0f : 0.0f") # Set on Equal
+binop("sne", tfloat, commutative, "({src0} != {src1}) ? 1.0f : 0.0f") # Set on Not Equal
 
 
-binop("ishl", tint, "")
-binop("ishr", tint, "")
-binop("ushr", tunsigned, "")
+binop("ishl", tint, "", "{src0} << {src1}")
+binop("ishr", tint, "", "{src0} >> {src1}")
+binop("ushr", tunsigned, "", "{src0} >> {src1}")
 
 # bitwise logic operators
 #
@@ -298,9 +421,9 @@ binop("ushr", tunsigned, "")
 # integers.
 
 
-binop("iand", tunsigned, commutative + associative)
-binop("ior", tunsigned, commutative + associative)
-binop("ixor", tunsigned, commutative + associative)
+binop("iand", tunsigned, commutative + associative, "{src0} & {src1}")
+binop("ior", tunsigned, commutative + associative, "{src0} | {src1}")
+binop("ixor", tunsigned, commutative + associative, "{src0} ^ {src1}")
 
 
 # floating point logic operators
@@ -308,42 +431,60 @@ binop("ixor", tunsigned, commutative + associative)
 # These use (src != 0.0) for testing the truth of the input, and output 1.0
 # for true and 0.0 for false
 
-binop("fand", tfloat, commutative)
-binop("for", tfloat, commutative)
-binop("fxor", tfloat, commutative)
-
-binop_reduce("fdot", 1, tfloat, tfloat)
-
-binop("fmin", tfloat, "")
-binop("imin", tint, commutative + associative)
-binop("umin", tunsigned, commutative + associative)
-binop("fmax", tfloat, "")
-binop("imax", tint, commutative + associative)
-binop("umax", tunsigned, commutative + associative)
-
-binop("fpow", tfloat, "")
-
-binop_horiz("pack_half_2x16_split", 1, tunsigned, 1, tfloat, 1, tfloat)
-
-binop("bfm", tunsigned, "")
-
-binop("ldexp", tunsigned, "")
+binop("fand", tfloat, commutative,
+      "(({src0} != 0.0f) && ({src1} != 0.0f)) ? 1.0f : 0.0f")
+binop("for", tfloat, commutative,
+      "(({src0} != 0.0f) || ({src1} != 0.0f)) ? 1.0f : 0.0f")
+binop("fxor", tfloat, commutative,
+      "({src0} != 0.0f && {src1} == 0.0f) || ({src0} == 0.0f && {src1} != 0.0f) ? 1.0f : 0.0f")
+
+binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
+             "{src}")
+
+binop("fmin", tfloat, "", "fminf({src0}, {src1})")
+binop("imin", tint, commutative + associative, "{src1} > {src0} ? {src0} : {src1}")
+binop("umin", tunsigned, commutative + associative, "{src1} > {src0} ? {src0} : {src1}")
+binop("fmax", tfloat, "", "fmaxf({src0}, {src1})")
+binop("imax", tint, commutative + associative, "{src1} > {src0} ? {src1} : {src0}")
+binop("umax", tunsigned, commutative + associative, "{src1} > {src0} ? {src1} : {src0}")
+
+binop("fpow", tfloat, "", "powf({src0}, {src1})")
+
+binop_horiz("pack_half_2x16_split", 1, tunsigned, 1, tfloat, 1, tfloat,
+            "pack_half_1x16({src0.x}) | (pack_half_1x16({src1.x}) << 16)")
+
+binop_convert("bfm", tunsigned, tint, "", """
+int offset = {src0}, bits = {src1};
+if (offset < 0 || bits < 0 || offset + bits > 32)
+   {dst} = 0; /* undefined per the spec */
+else
+   {dst} = ((1ull << bits) - 1) << offset; /* 64-bit: bits or offset may be 32 */
+""")
+
+opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint], "", """
+{dst} = ldexpf({src0}, {src1});
+/* flush denormals to zero. */
+if (!isnormal({dst}))
+   {dst} = copysignf(0.0f, {src0});
+""")
 
 # Combines the first component of each input to make a 2-component vector.
 
-binop_horiz("vec2", 2, tunsigned, 1, tunsigned, 1, tunsigned)
+binop_horiz("vec2", 2, tunsigned, 1, tunsigned, 1, tunsigned, """
+{dst.x} = {src0.x};
+{dst.y} = {src1.x};
+""")
 
-def triop(name, ty):
-   opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], "")
-def triop_horiz(name, output_size, src1_size, src2_size, src3_size):
+def triop(name, ty, const_expr):
+   opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], "", const_expr)
+def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    opcode(name, output_size, tunsigned,
    [src1_size, src2_size, src3_size],
-   [tunsigned, tunsigned, tunsigned], "")
+   [tunsigned, tunsigned, tunsigned], "", const_expr)
 
-# fma(a, b, c) = (a# b) + c
-triop("ffma", tfloat)
+triop("ffma", tfloat, "{src0} * {src1} + {src2}")
 
-triop("flrp", tfloat)
+triop("flrp", tfloat, "{src0} * (1 - {src2}) + {src1} * {src2}")
 
 # Conditional Select
 #
@@ -352,32 +493,83 @@ triop("flrp", tfloat)
 # bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
 
 
-triop("fcsel", tfloat)
+triop("fcsel", tfloat, "({src0} != 0.0f) ? {src1} : {src2}")
 opcode("bcsel", 0, tunsigned, [0, 0, 0],
-       [tbool, tunsigned, tunsigned], "")
-
-triop("bfi", tunsigned)
-
-triop("ubitfield_extract", tunsigned)
-opcode("ibitfield_extract", 0, tint, [0, 0, 0],
-       [tint, tunsigned, tunsigned], "")
+      [tbool, tunsigned, tunsigned], "", "{src0} ? {src1} : {src2}")
+
+triop("bfi", tunsigned, """
+unsigned mask = {src0}, insert = {src1} & mask, base = {src2};
+if (mask == 0) {{
+   {dst} = base;
+}} else {{
+   unsigned tmp = mask;
+   while (!(tmp & 1)) {{
+      tmp >>= 1;
+      insert <<= 1;
+   }}
+   {dst} = (base & ~mask) | insert;
+}}
+""")
+
+opcode("ubitfield_extract", 0, tunsigned,
+       [0, 1, 1], [tunsigned, tint, tint], "", """
+unsigned base = {src0};
+int offset = {src1.x}, bits = {src2.x};
+if (bits == 0) {{
+   {dst} = 0;
+}} else if (bits < 0 || offset < 0 || offset + bits > 32) {{
+   {dst} = 0; /* undefined per the spec */
+}} else {{
+   {dst} = (base >> offset) & ((1ull << bits) - 1); /* 64-bit mask: bits may be 32 */
+}}
+""")
+opcode("ibitfield_extract", 0, tint,
+       [0, 1, 1], [tint, tint, tint], "", """
+int base = {src0};
+int offset = {src1.x}, bits = {src2.x};
+if (bits == 0) {{
+   {dst} = 0;
+}} else if (offset < 0 || bits < 0 || offset + bits > 32) {{
+   {dst} = 0; /* undefined per the spec */
+}} else {{
+   {dst} = (int32_t)((uint32_t)base << (32 - offset - bits)) >> (32 - bits); /* field to top, then sign-extending shift */
+}}
+""")
 
 # Combines the first component of each input to make a 3-component vector.
 
-triop_horiz("vec3", 3, 1, 1, 1)
+triop_horiz("vec3", 3, 1, 1, 1, """
+{dst.x} = {src0.x};
+{dst.y} = {src1.x};
+{dst.z} = {src2.x};
+""")
 
-def quadop(name):
-   opcode(name, 0, tunsigned, [0, 0, 0, 0],
-          [tunsigned, tunsigned, tunsigned, tunsigned],
-          "")
-def quadop_horiz(name, output_size, src1_size, src2_size, src3_size, src4_size):
+def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
+                 src4_size, const_expr):
    opcode(name, output_size, tunsigned,
           [src1_size, src2_size, src3_size, src4_size],
           [tunsigned, tunsigned, tunsigned, tunsigned],
-          "")
-
-quadop("bitfield_insert")
-
-quadop_horiz("vec4", 4, 1, 1, 1, 1)
+          "", const_expr)
+
+opcode("bitfield_insert", 0, tunsigned, [0, 0, 1, 1],
+       [tunsigned, tunsigned, tint, tint], "", """
+unsigned base = {src0}, insert = {src1};
+int offset = {src2.x}, bits = {src3.x};
+if (bits == 0) {{
+   {dst} = base; /* inserting zero bits leaves base unchanged */
+}} else if (offset < 0 || bits < 0 || bits + offset > 32) {{
+   {dst} = 0; /* undefined per the spec */
+}} else {{
+   unsigned mask = ((1ull << bits) - 1) << offset; /* 64-bit: bits may be 32 */
+   {dst} = (base & ~mask) | ((insert << offset) & mask);
+}}
+""")
+
+quadop_horiz("vec4", 4, 1, 1, 1, 1, """
+{dst.x} = {src0.x};
+{dst.y} = {src1.x};
+{dst.z} = {src2.x};
+{dst.w} = {src3.x};
+""")
 
 
-- 
2.1.0