[Mesa-dev] [PATCH 2/4] nir: add new constant folding infrastructure

Jason Ekstrand jason at jlekstrand.net
Fri Jan 23 13:43:55 PST 2015


Add a required field to the Opcode class, const_expr, that contains an
expression or statement that computes the result of the opcode given known
constant inputs. Then take those const_expr's and expand them into a function
that takes an opcode and an array of constant inputs and spits out the constant
result. This means that when adding opcodes, there's one less place to update,
and almost all the opcodes are self-documenting since the information on how to
compute the result is right next to the definition.

The helper functions in nir_constant_expressions.c were taken from
ir_constant_expressions.cpp.

v3 Jason Ekstrand <jason.ekstrand at iastate.edu>
 - Use mako to generate one function per opcode instead of doing piles of
   string splicing

Signed-off-by: Jason Ekstrand <jason.ekstrand at intel.com>
---
 src/glsl/Makefile.am                     |   5 +
 src/glsl/Makefile.sources                |   1 +
 src/glsl/nir/.gitignore                  |   1 +
 src/glsl/nir/nir_constant_expressions.h  |  31 ++
 src/glsl/nir/nir_constant_expressions.py | 319 ++++++++++++++++++
 src/glsl/nir/nir_opcodes.py              | 562 +++++++++++++++++++++----------
 6 files changed, 735 insertions(+), 184 deletions(-)
 create mode 100644 src/glsl/nir/nir_constant_expressions.h
 create mode 100644 src/glsl/nir/nir_constant_expressions.py

diff --git a/src/glsl/Makefile.am b/src/glsl/Makefile.am
index 59dda5f..e145cb2 100644
--- a/src/glsl/Makefile.am
+++ b/src/glsl/Makefile.am
@@ -217,6 +217,7 @@ BUILT_SOURCES =						\
 	glsl_lexer.cpp					\
 	glcpp/glcpp-parse.c				\
 	glcpp/glcpp-lex.c				\
+	nir/nir_constant_expressions.c			\
 	nir/nir_opcodes.c				\
 	nir/nir_opcodes.h				\
 	nir/nir_opt_algebraic.c
@@ -232,6 +233,10 @@ dist-hook:
 	$(RM) glcpp/tests/*.out
 	$(RM) glcpp/tests/subtest*/*.out
 
+nir/nir_constant_expressions.c: nir/nir_opcodes.py nir/nir_constant_expressions.py nir/nir_constant_expressions.h
+	$(MKDIR_P) nir &&						\
+	$(PYTHON2) $(PYTHON_FLAGS) $(srcdir)/nir/nir_constant_expressions.py > $@ || ($(RM) $@; false)
+
 nir/nir_opcodes.h: nir/nir_opcodes.py nir/nir_opcodes_h.py
 	$(MKDIR_P) nir;							\
 	$(PYTHON2) $(PYTHON_FLAGS) $(srcdir)/nir/nir_opcodes_h.py > $@
diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
index dc1c55d..dd76c44 100644
--- a/src/glsl/Makefile.sources
+++ b/src/glsl/Makefile.sources
@@ -14,6 +14,7 @@ LIBGLCPP_GENERATED_FILES = \
 	$(GLSL_BUILDDIR)/glcpp/glcpp-parse.c
 
 NIR_GENERATED_FILES = \
+	$(GLSL_BUILDDIR)/nir/nir_constant_expressions.c \
 	$(GLSL_BUILDDIR)/nir/nir_opcodes.c \
 	$(GLSL_BUILDDIR)/nir/nir_opcodes.h \
 	$(GLSL_BUILDDIR)/nir/nir_opt_algebraic.c
diff --git a/src/glsl/nir/.gitignore b/src/glsl/nir/.gitignore
index 4c28193..261f64f 100644
--- a/src/glsl/nir/.gitignore
+++ b/src/glsl/nir/.gitignore
@@ -1,3 +1,4 @@
 nir_opt_algebraic.c
 nir_opcodes.c
 nir_opcodes.h
+nir_constant_expressions.c
diff --git a/src/glsl/nir/nir_constant_expressions.h b/src/glsl/nir/nir_constant_expressions.h
new file mode 100644
index 0000000..97997f2
--- /dev/null
+++ b/src/glsl/nir/nir_constant_expressions.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright © 2014 Connor Abbott
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Connor Abbott (cwabbott0 at gmail.com)
+ *
+ */
+
+#include "nir.h"
+
+nir_const_value nir_eval_const_opcode(nir_op op, unsigned num_components,
+                                      nir_const_value *src);
diff --git a/src/glsl/nir/nir_constant_expressions.py b/src/glsl/nir/nir_constant_expressions.py
new file mode 100644
index 0000000..09c766a
--- /dev/null
+++ b/src/glsl/nir/nir_constant_expressions.py
@@ -0,0 +1,319 @@
+#! /usr/bin/python2
+template = """\
+/*
+ * Copyright (C) 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Jason Ekstrand (jason at jlekstrand.net)
+ */
+
+#include <math.h>
+#include "main/core.h"
+#include "nir_constant_expressions.h"
+
+#if defined(_MSC_VER) && (_MSC_VER < 1800)
+static int isnormal(double x)
+{
+   return _fpclass(x) == _FPCLASS_NN || _fpclass(x) == _FPCLASS_PN;
+}
+#elif defined(__SUNPRO_CC)
+#include <ieeefp.h>
+static int isnormal(double x)
+{
+   return fpclass(x) == FP_NORMAL;
+}
+#endif
+
+#if defined(_MSC_VER)
+static double copysign(double x, double y)
+{
+   return _copysign(x, y);
+}
+#endif
+
+/**
+ * Evaluate one component of packSnorm4x8.
+ */
+static uint8_t
+pack_snorm_1x8(float x)
+{
+    /* From section 8.4 of the GLSL 4.30 spec:
+     *
+     *    packSnorm4x8
+     *    ------------
+     *    The conversion for component c of v to fixed point is done as
+     *    follows:
+     *
+     *      packSnorm4x8: round(clamp(c, -1, +1) * 127.0)
+     *
+     * We must first cast the float to an int, because casting a negative
+     * float to a uint is undefined.
+     */
+   return (uint8_t) (int8_t)
+          _mesa_round_to_even(CLAMP(x, -1.0f, +1.0f) * 127.0f);
+}
+
+/**
+ * Evaluate one component of packSnorm2x16.
+ */
+static uint16_t
+pack_snorm_1x16(float x)
+{
+    /* From section 8.4 of the GLSL ES 3.00 spec:
+     *
+     *    packSnorm2x16
+     *    -------------
+     *    The conversion for component c of v to fixed point is done as
+     *    follows:
+     *
+     *      packSnorm2x16: round(clamp(c, -1, +1) * 32767.0)
+     *
+     * We must first cast the float to an int, because casting a negative
+     * float to a uint is undefined.
+     */
+   return (uint16_t) (int16_t)
+          _mesa_round_to_even(CLAMP(x, -1.0f, +1.0f) * 32767.0f);
+}
+
+/**
+ * Evaluate one component of unpackSnorm4x8.
+ */
+static float
+unpack_snorm_1x8(uint8_t u)
+{
+    /* From section 8.4 of the GLSL 4.30 spec:
+     *
+     *    unpackSnorm4x8
+     *    --------------
+     *    The conversion for unpacked fixed-point value f to floating point is
+     *    done as follows:
+     *
+     *       unpackSnorm4x8: clamp(f / 127.0, -1, +1)
+     */
+   return CLAMP((int8_t) u / 127.0f, -1.0f, +1.0f);
+}
+
+/**
+ * Evaluate one component of unpackSnorm2x16.
+ */
+static float
+unpack_snorm_1x16(uint16_t u)
+{
+    /* From section 8.4 of the GLSL ES 3.00 spec:
+     *
+     *    unpackSnorm2x16
+     *    ---------------
+     *    The conversion for unpacked fixed-point value f to floating point is
+     *    done as follows:
+     *
+     *       unpackSnorm2x16: clamp(f / 32767.0, -1, +1)
+     */
+   return CLAMP((int16_t) u / 32767.0f, -1.0f, +1.0f);
+}
+
+/**
+ * Evaluate one component packUnorm4x8.
+ */
+static uint8_t
+pack_unorm_1x8(float x)
+{
+    /* From section 8.4 of the GLSL 4.30 spec:
+     *
+     *    packUnorm4x8
+     *    ------------
+     *    The conversion for component c of v to fixed point is done as
+     *    follows:
+     *
+     *       packUnorm4x8: round(clamp(c, 0, +1) * 255.0)
+     */
+   return (uint8_t) _mesa_round_to_even(CLAMP(x, 0.0f, 1.0f) * 255.0f);
+}
+
+/**
+ * Evaluate one component packUnorm2x16.
+ */
+static uint16_t
+pack_unorm_1x16(float x)
+{
+    /* From section 8.4 of the GLSL ES 3.00 spec:
+     *
+     *    packUnorm2x16
+     *    -------------
+     *    The conversion for component c of v to fixed point is done as
+     *    follows:
+     *
+     *       packUnorm2x16: round(clamp(c, 0, +1) * 65535.0)
+     */
+   return (uint16_t) _mesa_round_to_even(CLAMP(x, 0.0f, 1.0f) * 65535.0f);
+}
+
+/**
+ * Evaluate one component of unpackUnorm4x8.
+ */
+static float
+unpack_unorm_1x8(uint8_t u)
+{
+    /* From section 8.4 of the GLSL 4.30 spec:
+     *
+     *    unpackUnorm4x8
+     *    --------------
+     *    The conversion for unpacked fixed-point value f to floating point is
+     *    done as follows:
+     *
+     *       unpackUnorm4x8: f / 255.0
+     */
+   return (float) u / 255.0f;
+}
+
+/**
+ * Evaluate one component of unpackUnorm2x16.
+ */
+static float
+unpack_unorm_1x16(uint16_t u)
+{
+    /* From section 8.4 of the GLSL ES 3.00 spec:
+     *
+     *    unpackUnorm2x16
+     *    ---------------
+     *    The conversion for unpacked fixed-point value f to floating point is
+     *    done as follows:
+     *
+     *       unpackUnorm2x16: f / 65535.0
+     */
+   return (float) u / 65535.0f;
+}
+
+/**
+ * Evaluate one component of packHalf2x16.
+ */
+static uint16_t
+pack_half_1x16(float x)
+{
+   return _mesa_float_to_half(x);
+}
+
+/**
+ * Evaluate one component of unpackHalf2x16.
+ */
+static float
+unpack_half_1x16(uint16_t u)
+{
+   return _mesa_half_to_float(u);
+}
+
+% for type in ["float", "int", "unsigned", "bool"]:
+struct ${type}_vec {
+   ${type} x;
+   ${type} y;
+   ${type} z;
+   ${type} w;
+};
+% endfor
+
+% for name, op in sorted(opcodes.iteritems()):
+static nir_const_value
+evaluate_${name}(unsigned num_components, nir_const_value *_src)
+{
+   nir_const_value _dst_val = { { {0, 0, 0, 0} } };
+
+% for j in range(op.num_inputs):
+   % if op.input_sizes[j] == 0:
+      <% continue %>
+   %endif
+
+   struct ${op.input_types[j]}_vec src${j} = {
+   % for k in range(op.input_sizes[j]):
+      % if op.input_types[j] == "bool":
+         _src[${j}].u[${k}] != 0,
+      % else:
+         _src[${j}].${op.input_types[j][:1]}[${k}],
+      % endif
+   % endfor
+   };
+% endfor
+
+% if op.output_size == 0:
+   for (unsigned _i = 0; _i < num_components; _i++) {
+   % for j in range(op.num_inputs):
+      % if op.input_sizes[j] != 0:
+         <% continue %>
+      % elif op.input_types[j] == "bool":
+         bool src${j} = _src[${j}].u[_i] != 0;
+      % else:
+         ${op.input_types[j]} src${j} = _src[${j}].${op.input_types[j][:1]}[_i];
+      % endif
+   % endfor
+
+   % if "dst" in op.const_expr:
+      ${op.output_type} dst;
+      ${op.const_expr}
+   % else:
+      ${op.output_type} dst = ${op.const_expr};
+   % endif
+
+   % if op.output_type == "bool":
+      _dst_val.u[_i] = dst ? NIR_TRUE : NIR_FALSE;
+   % else:
+      _dst_val.${op.output_type[:1]}[_i] = dst;
+   % endif
+   }
+% else:
+   struct ${op.output_type}_vec dst = { 0 }; /* lanes const_expr leaves unset read as 0 */
+   % if "dst" in op.const_expr:
+      ${op.const_expr}
+   % else:
+      dst.x = ${op.const_expr};
+   % endif
+
+   % for k in range(op.output_size):
+      % if op.output_type == "bool":
+         _dst_val.u[${k}] = dst.${"xyzw"[k]} ? NIR_TRUE : NIR_FALSE;
+      % else:
+         _dst_val.${op.output_type[:1]}[${k}] = dst.${"xyzw"[k]};
+      % endif
+   % endfor
+% endif
+
+   return _dst_val;
+}
+% endfor
+
+nir_const_value
+nir_eval_const_opcode(nir_op op, unsigned num_components,
+                      nir_const_value *src)
+{
+   switch (op) {
+% for name in sorted(opcodes.keys()):
+   case nir_op_${name}: {
+      return evaluate_${name}(num_components, src);
+      break;
+   }
+% endfor
+   default:
+      unreachable("shouldn't get here");
+   }
+}"""
+
+from nir_opcodes import opcodes
+from mako.template import Template
+
+print Template(template).render(opcodes=opcodes)
diff --git a/src/glsl/nir/nir_opcodes.py b/src/glsl/nir/nir_opcodes.py
index 5bafbb0..edc8116 100644
--- a/src/glsl/nir/nir_opcodes.py
+++ b/src/glsl/nir/nir_opcodes.py
@@ -24,6 +24,7 @@
 # Authors:
 #    Connor Abbott (cwabbott0 at gmail.com)
 
+
 # Class that represents all the information we have about the opcode
 # NOTE: this must be kept in sync with nir_op_info
 
@@ -32,7 +33,7 @@ class Opcode(object):
    NOTE: this must be kept in sync with nir_op_info
    """
    def __init__(self, name, output_size, output_type, input_sizes,
-                input_types, algebraic_properties):
+                input_types, algebraic_properties, const_expr):
       """Parameters:
 
       - name is the name of the opcode (prepend nir_op_ for the enum name)
@@ -40,6 +41,9 @@ class Opcode(object):
       - input_types is a list of types
       - algebraic_properties is a space-seperated string, where nir_op_is_ is
         prepended before each entry
+      - const_expr is an expression or series of statements that computes the
+        constant value of the opcode given the constant values of its inputs.
+        See nir_constant_expressions.py.
       """
       assert isinstance(name, str)
       assert isinstance(output_size, int)
@@ -49,6 +53,7 @@ class Opcode(object):
       assert isinstance(input_types, list)
       assert isinstance(input_types[0], str)
       assert isinstance(algebraic_properties, str)
+      assert isinstance(const_expr, str)
       assert len(input_sizes) == len(input_types)
       assert 0 <= output_size <= 4
       for size in input_sizes:
@@ -62,6 +67,7 @@ class Opcode(object):
       self.input_sizes = input_sizes
       self.input_types = input_types
       self.algebraic_properties = algebraic_properties
+      self.const_expr = const_expr
 
 # helper variables for strings
 tfloat = "float"
@@ -76,178 +82,289 @@ associative = "associative "
 opcodes = {}
 
 def opcode(name, output_size, output_type, input_sizes, input_types,
-           algebraic_properties):
+           algebraic_properties, const_expr):
    assert name not in opcodes
    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
-                          input_types, algebraic_properties)
-
-def unop_convert(name, in_type, out_type):
-   opcode(name, 0, out_type, [0], [in_type], "")
-
-def unop(name, ty):
-   opcode(name, 0, ty, [0], [ty], "")
-
-def unop_horiz(name, output_size, output_type, input_size, input_type):
-   opcode(name, output_size, output_type, [input_size], [input_type], "")
-
-def unop_reduce(name, output_size, output_type, input_type):
-   unop_horiz(name + "2", output_size, output_type, 2, input_type)
-   unop_horiz(name + "3", output_size, output_type, 3, input_type)
-   unop_horiz(name + "4", output_size, output_type, 4, input_type)
+                          input_types, algebraic_properties, const_expr)
+
+def unop_convert(name, in_type, out_type, const_expr):
+   opcode(name, 0, out_type, [0], [in_type], "", const_expr)
+
+def unop(name, ty, const_expr):
+   opcode(name, 0, ty, [0], [ty], "", const_expr)
+
+def unop_horiz(name, output_size, output_type, input_size, input_type,
+               const_expr):
+   opcode(name, output_size, output_type, [input_size], [input_type], "",
+          const_expr)
+
+def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
+                reduce_expr, final_expr):
+   def prereduce(src):
+      return "(" + prereduce_expr.format(src=src) + ")"
+   def final(src):
+      return final_expr.format(src="(" + src + ")")
+   def reduce_(src0, src1):
+      return reduce_expr.format(src0=src0, src1=src1)
+   src0 = prereduce("src0.x")
+   src1 = prereduce("src0.y")
+   src2 = prereduce("src0.z")
+   src3 = prereduce("src0.w")
+   unop_horiz(name + "2", output_size, output_type, 2, input_type,
+              final(reduce_(src0, src1)))
+   unop_horiz(name + "3", output_size, output_type, 3, input_type,
+              final(reduce_(reduce_(src0, src1), src2)))
+   unop_horiz(name + "4", output_size, output_type, 4, input_type,
+              final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 
 
 # These two move instructions differ in what modifiers they support and what
 # the negate modifier means. Otherwise, they are identical.
-unop("fmov", tfloat)
-unop("imov", tint)
-
-unop("ineg", tint)
-unop("fneg", tfloat)
-unop("inot", tint) # invert every bit of the integer
-unop("fnot", tfloat) # (src == 0.0) ? 1.0 : 0.0
-unop("fsign", tfloat)
-unop("isign", tint)
-unop("iabs", tint)
-unop("fabs", tfloat)
-unop("fsat", tfloat)
-unop("frcp", tfloat)
-unop("frsq", tfloat)
-unop("fsqrt", tfloat)
-unop("fexp", tfloat) # < e^x
-unop("flog", tfloat) # log base e
-unop("fexp2", tfloat)
-unop("flog2", tfloat)
-unop_convert("f2i", tfloat, tint) # Float-to-integer conversion.
-unop_convert("f2u", tfloat, tunsigned) # Float-to-unsigned conversion
-unop_convert("i2f", tint, tfloat) # Integer-to-float conversion.
-unop_convert("f2b", tfloat, tbool) # Float-to-boolean conversion
-unop_convert("b2f", tbool, tfloat) # Boolean-to-float conversion
-unop_convert("i2b", tint, tbool) # int-to-boolean conversion
-unop_convert("b2i", tbool, tint) # Boolean-to-int conversion
-unop_convert("u2f", tunsigned, tfloat) #Unsigned-to-float conversion.
-
-unop_reduce("bany", 1, tbool, tbool) # returns ~0 if any component of src[0] != 0
-unop_reduce("ball", 1, tbool, tbool) # returns ~0 if all components of src[0] != 0
-unop_reduce("fany", 1, tfloat, tfloat) # returns 1.0 if any component of src[0] != 0
-unop_reduce("fall", 1, tfloat, tfloat) # returns 1.0 if all components of src[0] != 0
+unop("fmov", tfloat, "src0")
+unop("imov", tint, "src0")
+
+unop("ineg", tint, "-src0")
+unop("fneg", tfloat, "-src0")
+unop("inot", tint, "~src0") # invert every bit of the integer
+unop("fnot", tfloat, "(src0 == 0.0f) ? 1.0f : 0.0f")
+unop("fsign", tfloat, "(src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f)")
+unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
+unop("iabs", tint, "abs(src0)")
+unop("fabs", tfloat, "fabsf(src0)")
+unop("fsat", tfloat, "(src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0)")
+unop("frcp", tfloat, "1.0f / src0")
+unop("frsq", tfloat, "1.0f / sqrtf(src0)")
+unop("fsqrt", tfloat, "sqrtf(src0)")
+unop("fexp", tfloat, "expf(src0)") # < e^x
+unop("flog", tfloat, "logf(src0)") # log base e
+unop("fexp2", tfloat, "exp2f(src0)")
+unop("flog2", tfloat, "log2f(src0)")
+unop_convert("f2i", tfloat, tint, "src0") # Float-to-integer conversion.
+unop_convert("f2u", tfloat, tunsigned, "src0") # Float-to-unsigned conversion
+unop_convert("i2f", tint, tfloat, "src0") # Integer-to-float conversion.
+# Float-to-boolean conversion: any non-zero float is true
+unop_convert("f2b", tfloat, tbool, "src0 != 0.0f")
+# Boolean-to-float conversion
+unop_convert("b2f", tbool, tfloat, "src0 ? 1.0f : 0.0f")
+# Int-to-boolean conversion: any non-zero integer is true
+unop_convert("i2b", tint, tbool, "src0 != 0")
+unop_convert("b2i", tbool, tint, "src0 ? 1 : 0") # Boolean-to-int conversion
+unop_convert("u2f", tunsigned, tfloat, "src0") #Unsigned-to-float conversion.
+
+unop_reduce("bany", 1, tbool, tbool, "{src}", "{src0} || {src1}", "{src}")
+unop_reduce("ball", 1, tbool, tbool, "{src}", "{src0} && {src1}", "{src}")
+unop_reduce("fany", 1, tfloat, tfloat, "{src} != 0.0f", "{src0} || {src1}",
+            "{src} ? 1.0f : 0.0f")
+unop_reduce("fall", 1, tfloat, tfloat, "{src} != 0.0f", "{src0} && {src1}",
+            "{src} ? 1.0f : 0.0f")
 
 # Unary floating-point rounding operations.
 
 
-unop("ftrunc", tfloat)
-unop("fceil", tfloat)
-unop("ffloor", tfloat)
-unop("ffract", tfloat)
-unop("fround_even", tfloat)
+unop("ftrunc", tfloat, "truncf(src0)")
+unop("fceil", tfloat, "ceilf(src0)")
+unop("ffloor", tfloat, "floorf(src0)")
+unop("ffract", tfloat, "src0 - floorf(src0)")
+unop("fround_even", tfloat, "_mesa_round_to_even(src0)")
 
 
 # Trigonometric operations.
 
 
-unop("fsin", tfloat)
-unop("fcos", tfloat)
-unop("fsin_reduced", tfloat)
-unop("fcos_reduced", tfloat)
+unop("fsin", tfloat, "sinf(src0)")
+unop("fcos", tfloat, "cosf(src0)")
+unop("fsin_reduced", tfloat, "sinf(src0)")
+unop("fcos_reduced", tfloat, "cosf(src0)")
 
 
 # Partial derivatives.
 
 
-unop("fddx", tfloat)
-unop("fddy", tfloat)
-unop("fddx_fine", tfloat)
-unop("fddy_fine", tfloat)
-unop("fddx_coarse", tfloat)
-unop("fddy_coarse", tfloat)
+unop("fddx", tfloat, "0.0f") # the derivative of a constant is 0.
+unop("fddy", tfloat, "0.0f")
+unop("fddx_fine", tfloat, "0.0f")
+unop("fddy_fine", tfloat, "0.0f")
+unop("fddx_coarse", tfloat, "0.0f")
+unop("fddy_coarse", tfloat, "0.0f")
 
 
 # Floating point pack and unpack operations.
 
-
-unop_horiz("pack_snorm_2x16", 1, tunsigned, 2, tfloat)
-unop_horiz("pack_snorm_4x8", 1, tunsigned, 4, tfloat)
-unop_horiz("pack_unorm_2x16", 1, tunsigned, 2, tfloat)
-unop_horiz("pack_unorm_4x8", 1, tunsigned, 4, tfloat)
-unop_horiz("pack_half_2x16", 1, tunsigned, 2, tfloat)
-unop_horiz("unpack_snorm_2x16", 2, tfloat, 1, tunsigned)
-unop_horiz("unpack_snorm_4x8", 4, tfloat, 1, tunsigned)
-unop_horiz("unpack_unorm_2x16", 2, tfloat, 1, tunsigned)
-unop_horiz("unpack_unorm_4x8", 4, tfloat, 1, tunsigned)
-unop_horiz("unpack_half_2x16", 2, tfloat, 1, tunsigned)
+def pack_2x16(fmt):
+   unop_horiz("pack_" + fmt + "_2x16", 1, tunsigned, 2, tfloat, """
+dst.x = (uint32_t) pack_fmt_1x16(src0.x);
+dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
+""".replace("fmt", fmt))
+
+def pack_4x8(fmt):
+   unop_horiz("pack_" + fmt + "_4x8", 1, tunsigned, 4, tfloat, """
+dst.x = (uint32_t) pack_fmt_1x8(src0.x);
+dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
+dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
+dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
+""".replace("fmt", fmt))
+
+def unpack_2x16(fmt):
+   unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat, 1, tunsigned, """
+dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
+dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
+""".replace("fmt", fmt)) # x decodes the low 16 bits, y the high 16 bits
+
+def unpack_4x8(fmt):
+   unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat, 1, tunsigned, """
+dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
+dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
+dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
+dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
+""".replace("fmt", fmt))
+
+
+pack_2x16("snorm")
+pack_4x8("snorm")
+pack_2x16("unorm")
+pack_4x8("unorm")
+pack_2x16("half")
+unpack_2x16("snorm")
+unpack_4x8("snorm")
+unpack_2x16("unorm")
+unpack_4x8("unorm")
+unpack_2x16("half")
 
 
 # Lowered floating point unpacking operations.
 
 
-unop_horiz("unpack_half_2x16_split_x", 1, tfloat, 1, tunsigned)
-unop_horiz("unpack_half_2x16_split_y", 1, tfloat, 1, tunsigned)
+unop_horiz("unpack_half_2x16_split_x", 1, tfloat, 1, tunsigned, """
+dst.x = unpack_half_1x16((uint16_t)(src0.x & 0xffff));
+""")
+unop_horiz("unpack_half_2x16_split_y", 1, tfloat, 1, tunsigned, """
+dst.x = unpack_half_1x16((uint16_t)(src0.x >> 16));
+""") # output_size is 1, so dst.x is the only lane copied into the result
 
 
 # Bit operations, part of ARB_gpu_shader5.
 
 
-unop("bitfield_reverse", tunsigned)
-unop("bit_count", tunsigned)
-unop_convert("ufind_msb", tunsigned, tint)
-unop("ifind_msb", tint)
-unop("find_lsb", tint)
+unop("bitfield_reverse", tunsigned, """
+/* we're not winning any awards for speed here, but that's ok */
+dst = 0;
+for (unsigned bit = 0; bit < 32; bit++)
+   dst |= ((src0 >> bit) & 1) << (31 - bit);
+""")
+unop("bit_count", tunsigned, """
+dst = 0;
+for (unsigned bit = 0; bit < 32; bit++) {{
+   if ((src0 >> bit) & 1)
+      dst++;
+}}
+""")
+
+unop_convert("ufind_msb", tunsigned, tint, """
+dst = -1;
+for (int bit = 31; bit >= 0; bit--) {{
+   if ((src0 >> bit) & 1) {{
+      dst = bit;
+      break;
+   }}
+}}
+""") # bit 0 must be tested too, otherwise an input of 1 yields -1 instead of 0
+
+unop("ifind_msb", tint, """
+dst = -1;
+for (int bit = 31; bit >= 0; bit--) {{
+   /* If src0 < 0, we're looking for the first 0 bit.
+    * if src0 >= 0, we're looking for the first 1 bit.
+    */
+   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
+      (!((src0 >> bit) & 1) && (src0 < 0))) {{
+      dst = bit;
+      break;
+   }}
+}}
+""")
+
+unop("find_lsb", tint, """
+dst = -1;
+for (unsigned bit = 0; bit < 32; bit++) {{
+   if ((src0 >> bit) & 1) {{
+      dst = bit;
+      break;
+   }}
+}}
+""")
 
 
 for i in xrange(1, 5):
    for j in xrange(1, 5):
-      unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat)
+      unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
 
-def binop_convert(name, out_type, in_type, alg_props):
-   opcode(name, 0, out_type, [0, 0], [in_type, in_type], alg_props)
+def binop_convert(name, out_type, in_type, alg_props, const_expr):
+   opcode(name, 0, out_type, [0, 0], [in_type, in_type], alg_props, const_expr)
 
-def binop(name, ty, alg_props):
-   binop_convert(name, ty, ty, alg_props)
+def binop(name, ty, alg_props, const_expr):
+   binop_convert(name, ty, ty, alg_props, const_expr)
 
-def binop_compare(name, ty, alg_props):
-   binop_convert(name, ty, tbool, alg_props)
+def binop_compare(name, ty, alg_props, const_expr):
+   binop_convert(name, tbool, ty, alg_props, const_expr)
 
 def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
-                src2_type):
-   opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type], "")
-
-def binop_reduce(name, output_size, output_type, src_type):
-   opcode(name + "2",output_size, output_type,
-          [2, 2], [src_type, src_type], commutative)
+                src2_type, const_expr):
+   opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
+          "", const_expr)
+
+def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
+                 reduce_expr, final_expr):
+   def final(src):
+      return final_expr.format(src= "(" + src + ")")
+   def reduce_(src0, src1):
+      return reduce_expr.format(src0=src0, src1=src1)
+   def prereduce(src0, src1):
+      return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
+   src0 = prereduce("src0.x", "src1.x")
+   src1 = prereduce("src0.y", "src1.y")
+   src2 = prereduce("src0.z", "src1.z")
+   src3 = prereduce("src0.w", "src1.w")
+   opcode(name + "2", output_size, output_type,
+          [2, 2], [src_type, src_type], commutative,
+          final(reduce_(src0, src1)))
    opcode(name + "3", output_size, output_type,
-          [3, 3], [src_type, src_type], commutative)
+          [3, 3], [src_type, src_type], commutative,
+          final(reduce_(reduce_(src0, src1), src2)))
    opcode(name + "4", output_size, output_type,
-          [4, 4], [src_type, src_type], commutative)
+          [4, 4], [src_type, src_type], commutative,
+          final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
 
-binop("fadd", tfloat, commutative + associative)
-binop("iadd", tint, commutative + associative)
-binop("fsub", tfloat, "")
-binop("isub", tint, "")
+binop("fadd", tfloat, commutative + associative, "src0 + src1")
+binop("iadd", tint, commutative + associative, "src0 + src1")
+binop("fsub", tfloat, "", "src0 - src1")
+binop("isub", tint, "", "src0 - src1")
 
-binop("fmul", tfloat, commutative + associative)
+binop("fmul", tfloat, commutative + associative, "src0 * src1")
 # low 32-bits of signed/unsigned integer multiply
-binop("imul", tint, commutative + associative)
+binop("imul", tint, commutative + associative, "src0 * src1")
 # high 32-bits of signed integer multiply
-binop("imul_high", tint, commutative)
+binop("imul_high", tint, commutative,
+      "(int32_t)(((int64_t) src0 * (int64_t) src1) >> 32)")
 # high 32-bits of unsigned integer multiply
-binop("umul_high", tunsigned, commutative)
+binop("umul_high", tunsigned, commutative,
+      "(uint32_t)(((uint64_t) src0 * (uint64_t) src1) >> 32)")
 
-binop("fdiv", tfloat, "")
-binop("idiv", tint, "")
-binop("udiv", tunsigned, "")
+binop("fdiv", tfloat, "", "src0 / src1")
+binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)") # guard like umod: x / 0 must not trap
+binop("udiv", tunsigned, "", "src1 == 0 ? 0 : (src0 / src1)")
 
 # returns a boolean representing the carry resulting from the addition of
 # the two unsigned arguments.
 
-binop_convert("uadd_carry", tbool, tunsigned,
-              commutative)
+binop_convert("uadd_carry", tbool, tunsigned, commutative, "src0 + src1 < src0")
 
 # returns a boolean representing the borrow resulting from the subtraction
 # of the two unsigned arguments.
 
-binop_convert("usub_borrow", tbool, tunsigned, "")
+binop_convert("usub_borrow", tbool, tunsigned, "", "src0 < src1") # src0 - src1 borrows iff src0 < src1
 
-binop("fmod", tfloat, "")
-binop("umod", tunsigned, "")
+binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
+binop("umod", tunsigned, "", "src1 == 0 ? 0 : src0 % src1")
 
 #
 # Comparisons
@@ -256,41 +373,47 @@ binop("umod", tunsigned, "")
 
 # these integer-aware comparisons return a boolean (0 or ~0)
 
-binop_compare("flt", tfloat, "")
-binop_compare("fge", tfloat, "")
-binop_compare("feq", tfloat, commutative)
-binop_compare("fne", tfloat, commutative)
-binop_compare("ilt", tint, "")
-binop_compare("ige", tint, "")
-binop_compare("ieq", tint, commutative)
-binop_compare("ine", tint, commutative)
-binop_compare("ult", tunsigned, "")
-binop_compare("uge", tunsigned, "")
+binop_compare("flt", tfloat, "", "src0 < src1")
+binop_compare("fge", tfloat, "", "src0 >= src1")
+binop_compare("feq", tfloat, commutative, "src0 == src1")
+binop_compare("fne", tfloat, commutative, "src0 != src1")
+binop_compare("ilt", tint, "", "src0 < src1")
+binop_compare("ige", tint, "", "src0 >= src1")
+binop_compare("ieq", tint, commutative, "src0 == src1")
+binop_compare("ine", tint, commutative, "src0 != src1")
+binop_compare("ult", tunsigned, "", "src0 < src1")
+binop_compare("uge", tunsigned, "", "src0 >= src1")
 
 # integer-aware GLSL-style comparisons that compare floats and ints
 
-binop_reduce("ball_fequal",  1, tbool, tfloat)
-binop_reduce("bany_fnequal", 1, tbool, tfloat)
-binop_reduce("ball_iequal",  1, tbool, tint)
-binop_reduce("bany_inequal", 1, tbool, tint)
+binop_reduce("ball_fequal",  1, tbool, tfloat, "{src0} == {src1}",
+             "{src0} && {src1}", "{src}")
+binop_reduce("bany_fnequal", 1, tbool, tfloat, "{src0} != {src1}",
+             "{src0} || {src1}", "{src}")
+binop_reduce("ball_iequal",  1, tbool, tint, "{src0} == {src1}",
+             "{src0} && {src1}", "{src}")
+binop_reduce("bany_inequal", 1, tbool, tint, "{src0} != {src1}",
+             "{src0} || {src1}", "{src}")
 
 # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
 
-binop_reduce("fall_equal",  1, tfloat, tfloat)
-binop_reduce("fany_nequal", 1, tfloat, tfloat)
+binop_reduce("fall_equal",  1, tfloat, tfloat, "{src0} == {src1}",
+             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
+binop_reduce("fany_nequal", 1, tfloat, tfloat, "{src0} != {src1}",
+             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
 
 # These comparisons for integer-less hardware return 1.0 and 0.0 for true
 # and false respectively
 
-binop("slt", tfloat, "") # Set on Less Than
-binop("sge", tfloat, "") # Set on Greater Than or Equal
-binop("seq", tfloat, commutative) # Set on Equal
-binop("sne", tfloat, commutative) # Set on Not Equal
+binop("slt", tfloat, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
+binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
+binop("seq", tfloat, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
+binop("sne", tfloat, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
 
 
-binop("ishl", tint, "")
-binop("ishr", tint, "")
-binop("ushr", tunsigned, "")
+binop("ishl", tint, "", "src0 << src1")
+binop("ishr", tint, "", "src0 >> src1")
+binop("ushr", tunsigned, "", "src0 >> src1")
 
 # bitwise logic operators
 #
@@ -298,9 +421,9 @@ binop("ushr", tunsigned, "")
 # integers.
 
 
-binop("iand", tunsigned, commutative + associative)
-binop("ior", tunsigned, commutative + associative)
-binop("ixor", tunsigned, commutative + associative)
+binop("iand", tunsigned, commutative + associative, "src0 & src1")
+binop("ior", tunsigned, commutative + associative, "src0 | src1")
+binop("ixor", tunsigned, commutative + associative, "src0 ^ src1")
 
 
 # floating point logic operators
@@ -308,42 +431,60 @@ binop("ixor", tunsigned, commutative + associative)
 # These use (src != 0.0) for testing the truth of the input, and output 1.0
 # for true and 0.0 for false
 
-binop("fand", tfloat, commutative)
-binop("for", tfloat, commutative)
-binop("fxor", tfloat, commutative)
-
-binop_reduce("fdot", 1, tfloat, tfloat)
-
-binop("fmin", tfloat, "")
-binop("imin", tint, commutative + associative)
-binop("umin", tunsigned, commutative + associative)
-binop("fmax", tfloat, "")
-binop("imax", tint, commutative + associative)
-binop("umax", tunsigned, commutative + associative)
-
-binop("fpow", tfloat, "")
-
-binop_horiz("pack_half_2x16_split", 1, tunsigned, 1, tfloat, 1, tfloat)
-
-binop("bfm", tunsigned, "")
-
-binop("ldexp", tunsigned, "")
+binop("fand", tfloat, commutative,
+      "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
+binop("for", tfloat, commutative,
+      "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
+binop("fxor", tfloat, commutative,
+      "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")
+
+binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
+             "{src}")
+
+binop("fmin", tfloat, "", "fminf(src0, src1)")
+binop("imin", tint, commutative + associative, "src1 > src0 ? src0 : src1")
+binop("umin", tunsigned, commutative + associative, "src1 > src0 ? src0 : src1")
+binop("fmax", tfloat, "", "fmaxf(src0, src1)")
+binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
+binop("umax", tunsigned, commutative + associative, "src1 > src0 ? src1 : src0")
+
+binop("fpow", tfloat, "", "powf(src0, src1)")
+
+binop_horiz("pack_half_2x16_split", 1, tunsigned, 1, tfloat, 1, tfloat,
+            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
+
+binop_convert("bfm", tunsigned, tint, "", """
+int offset = src0, bits = src1;
+if (offset < 0 || bits < 0 || bits > 32 - offset)
+   dst = 0; /* undefined per the spec; this test cannot overflow */
+else
+   dst = ((1ull << bits) - 1) << offset; /* 64-bit: bits == 32 is defined */
+""")
+
+opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint], "", """
+dst = ldexpf(src0, src1);
+/* flush non-normal results (denormal, zero, inf, NaN) to signed zero. */
+if (!isnormal(dst))
+   dst = copysignf(0.0f, src0);
+""")
 
 # Combines the first component of each input to make a 2-component vector.
 
-binop_horiz("vec2", 2, tunsigned, 1, tunsigned, 1, tunsigned)
+binop_horiz("vec2", 2, tunsigned, 1, tunsigned, 1, tunsigned, """
+dst.x = src0.x;
+dst.y = src1.x;
+""")
 
-def triop(name, ty):
-   opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], "")
-def triop_horiz(name, output_size, src1_size, src2_size, src3_size):
+def triop(name, ty, const_expr):
+   opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], "", const_expr)
+def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    opcode(name, output_size, tunsigned,
    [src1_size, src2_size, src3_size],
-   [tunsigned, tunsigned, tunsigned], "")
+   [tunsigned, tunsigned, tunsigned], "", const_expr)
 
-# fma(a, b, c) = (a# b) + c
-triop("ffma", tfloat)
+triop("ffma", tfloat, "src0 * src1 + src2")
 
-triop("flrp", tfloat)
+triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")
 
 # Conditional Select
 #
@@ -352,30 +493,83 @@ triop("flrp", tfloat)
 # bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
 
 
-triop("fcsel", tfloat)
+triop("fcsel", tfloat, "(src0 != 0.0f) ? src1 : src2")
 opcode("bcsel", 0, tunsigned, [0, 0, 0],
-       [tbool, tunsigned, tunsigned], "")
-
-triop("bfi", tunsigned)
-
-triop("ubitfield_extract", tunsigned)
-opcode("ibitfield_extract", 0, tint, [0, 0, 0],
-       [tint, tunsigned, tunsigned], "")
+      [tbool, tunsigned, tunsigned], "", "src0 ? src1 : src2")
+
+triop("bfi", tunsigned, """
+unsigned mask = src0, insert = src1 & mask, base = src2;
+if (mask == 0) {{
+   dst = base;
+}} else {{
+   unsigned tmp = mask;
+   while (!(tmp & 1)) {{
+      tmp >>= 1;
+      insert <<= 1;
+   }}
+   dst = (base & ~mask) | insert;
+}}
+""")
+
+opcode("ubitfield_extract", 0, tunsigned,
+       [0, 1, 1], [tunsigned, tint, tint], "", """
+unsigned base = src0;
+int offset = src1.x, bits = src2.x;
+if (bits == 0) {{
+   dst = 0;
+}} else if (bits < 0 || offset < 0 || bits > 32 - offset) {{
+   dst = 0; /* undefined per the spec; this test cannot overflow */
+}} else {{
+   dst = (base >> offset) & ((1ull << bits) - 1); /* 64-bit: bits == 32 ok */
+}}
+""")
+opcode("ibitfield_extract", 0, tint,
+       [0, 1, 1], [tint, tint, tint], "", """
+int base = src0;
+int offset = src1.x, bits = src2.x;
+if (bits == 0) {{
+   dst = 0;
+}} else if (offset < 0 || bits < 0 || bits > 32 - offset) {{
+   dst = 0; /* undefined per the spec; this test cannot overflow */
+}} else {{
+   dst = (int)((unsigned)base << (32 - offset - bits)) >> (32 - bits); /* field top bit to bit 31, then sign-extending shift down to bit 0 */
+}}
+""")
 
 # Combines the first component of each input to make a 3-component vector.
 
-triop_horiz("vec3", 3, 1, 1, 1)
+triop_horiz("vec3", 3, 1, 1, 1, """
+dst.x = src0.x;
+dst.y = src1.x;
+dst.z = src2.x;
+""")
 
-def quadop(name):
-   opcode(name, 0, tunsigned, [0, 0, 0, 0],
-          [tunsigned, tunsigned, tunsigned, tunsigned],
-          "")
-def quadop_horiz(name, output_size, src1_size, src2_size, src3_size, src4_size):
+def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
+                 src4_size, const_expr):
    opcode(name, output_size, tunsigned,
           [src1_size, src2_size, src3_size, src4_size],
           [tunsigned, tunsigned, tunsigned, tunsigned],
-          "")
+          "", const_expr)
+
+opcode("bitfield_insert", 0, tunsigned, [0, 0, 1, 1],
+       [tunsigned, tunsigned, tint, tint], "", """
+unsigned base = src0, insert = src1;
+int offset = src2.x, bits = src3.x;
+if (bits == 0) {{
+   dst = base; /* inserting zero bits leaves base unchanged */
+}} else if (offset < 0 || bits < 0 || bits > 32 - offset) {{
+   dst = 0; /* undefined per the spec; this test cannot overflow */
+}} else {{
+   unsigned mask = ((1ull << bits) - 1) << offset; /* 64-bit: bits == 32 ok */
+   dst = (base & ~mask) | ((insert << offset) & mask);
+}}
+""")
+
+quadop_horiz("vec4", 4, 1, 1, 1, 1, """
+dst.x = src0.x;
+dst.y = src1.x;
+dst.z = src2.x;
+dst.w = src3.x;
+""")
 
-quadop("bitfield_insert")
 
-quadop_horiz("vec4", 4, 1, 1, 1, 1)
-- 
2.2.1



More information about the mesa-dev mailing list