Mesa (master): gallium/util: add fast half float conversion functions

Thu Apr 1 11:33:26 UTC 2010

Module: Mesa
Branch: master
Commit: 3ff175d6de89ad92d167362355501f99d06f0f97
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=3ff175d6de89ad92d167362355501f99d06f0f97

Author: Luca Barbieri <luca at luca-barbieri.com>
Date:   Wed Mar 24 18:12:45 2010 +0100

gallium/util: add fast half float conversion functions

This adds a fast half float conversion facility to Gallium.

Mesa already contains such a facility, but using a much worse algorithm.

This one is an implementation of
www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf
and uses a branch-less algorithm with some lookup tables small enough
to fit in the L1 cache.

Ideally, Mesa should start using these functions too, but I'm not sure
how to arrange that with the current build system.

A new "u_gctors.cpp" is added that defines a global C++ constructor
which initializes the conversion lookup tables at library init.

---

 src/gallium/auxiliary/Makefile          |    4 +
 src/gallium/auxiliary/util/u_gctors.cpp |   17 ++++
 src/gallium/auxiliary/util/u_half.c     |  123 +++++++++++++++++++++++++++++++
 src/gallium/auxiliary/util/u_half.h     |   55 ++++++++++++++
 4 files changed, 199 insertions(+), 0 deletions(-)

diff --git a/src/gallium/auxiliary/Makefile b/src/gallium/auxiliary/Makefile
index 4c62992..14c0fb1 100644
--- a/src/gallium/auxiliary/Makefile
+++ b/src/gallium/auxiliary/Makefile
@@ -110,6 +110,7 @@ C_SOURCES = \
 	util/u_format_table.c \
 	util/u_format_tests.c \
 	util/u_gen_mipmap.c \
+	util/u_half.c \
 	util/u_handle_table.c \
 	util/u_hash_table.c \
 	util/u_hash.c \
@@ -138,6 +139,9 @@ C_SOURCES = \
 	#vl/vl_csc.c \
 	#vl/vl_shader_build.c \
 
+CPP_SOURCES = \
+	util/u_gctors.cpp
+
 GALLIVM_SOURCES = \
         gallivm/lp_bld_alpha.c \
         gallivm/lp_bld_arit.c \
diff --git a/src/gallium/auxiliary/util/u_gctors.cpp b/src/gallium/auxiliary/util/u_gctors.cpp
new file mode 100644
index 0000000..9ea9819
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_gctors.cpp
@@ -0,0 +1,17 @@
+/* This file uses the C++ global constructor mechanism to automatically
+   initialize global data.
+
+   __attribute__((constructor)) would allow doing this in C, but is GCC-only.
+*/
+
+extern "C" void util_half_init_tables(void); /* defined in util/u_half.c */
+
+struct util_gctor_t
+{
+	util_gctor_t() /* runs when the static instance below is constructed */
+	{
+		util_half_init_tables();
+	}
+};
+
+static struct util_gctor_t util_gctor; /* exists only so that the constructor runs */
diff --git a/src/gallium/auxiliary/util/u_half.c b/src/gallium/auxiliary/util/u_half.c
new file mode 100644
index 0000000..8865acb
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_half.c
@@ -0,0 +1,123 @@
+#include "util/u_half.h"
+
+/* see www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf
+ * "Fast Half Float Conversions" by Jeroen van der Zijp, Nov 2008
+ */
+
+/* Note that using a 64K * 4 table is a terrible idea since it will not fit
+ * in the L1 cache and will massively pollute the L2 cache as well
+ *
+ * These should instead fit in the L1 cache.
+ *
+ * TODO: we could use a denormal bias table instead of the mantissa/offset
+ * tables: this would reduce the L1 cache usage from 8704 to 2304 bytes
+ * but would involve more computation
+ *
+ * Note however that if denormals are never encountered, the L1 cache usage
+ * is only about 4608 bytes anyway.
+ */
+uint32_t util_half_to_float_mantissa_table[2048]; /* [0,1024): zero/denormals, [1024,2048): all other exponents */
+uint32_t util_half_to_float_exponent_table[64]; /* indexed by h >> 10; entries 32..63 have the float sign bit set */
+uint32_t util_half_to_float_offset_table[64]; /* indexed by h >> 10; 0 or 1024, picks a half of the mantissa table */
+uint16_t util_float_to_half_base_table[512]; /* indexed by v >> 23; entries 256..511 have 0x8000 set */
+uint8_t util_float_to_half_shift_table[512]; /* indexed by v >> 23; right shift applied to the float mantissa */
+
+/* Fills all five conversion tables; called by u_gctors.cpp, which defines the prototype itself */
+void util_half_init_tables(void);
+
+void util_half_init_tables(void)
+{
+	int i;
+
+	/* zero: contributes no mantissa bits */
+	util_half_to_float_mantissa_table[0] = 0;
+
+	/* denormals: precompute normalized float bits for each non-zero 10-bit mantissa */
+	for(i = 1; i < 1024; ++i) {
+		unsigned int m = i << 13; /* mantissa moved up to float mantissa position */
+		unsigned int e = 0; /* exponent adjustment, kept in float bit position */
+
+		/* Normalize number: shift left until bit 23 is set, lowering the exponent each step */
+		while(!(m & 0x00800000)) {
+			e -= 0x00800000;
+			m<<=1;
+		}
+		m &= ~0x00800000; /* clear bit 23, the leading 1 found above */
+		e+= 0x38800000; /* add exponent field 113 (0x71 << 23) */
+		util_half_to_float_mantissa_table[i] = m | e;
+	}
+
+	/* non-zero exponents: mantissa bits only, the exponent table supplies the rest */
+	for(i = 1024; i < 2048; ++i)
+		util_half_to_float_mantissa_table[i] = ((i-1024)<<13);
+
+	/* positive zero or denormals */
+	util_half_to_float_exponent_table[0] = 0;
+
+	/* positive normals: half exponent i plus 112 (0x38000000 is 112 << 23) */
+	for(i = 1; i <= 30; ++i)
+		util_half_to_float_exponent_table[i] = 0x38000000 + (i << 23);
+
+	/* positive infinity/NaN */
+	util_half_to_float_exponent_table[31] = 0x7f800000;
+
+	/* negative zero or denormals */
+	util_half_to_float_exponent_table[32] = 0x80000000;
+
+	/* negative normals: as the positive entries, with the sign bit 0x80000000 set */
+	for(i = 33; i <= 62; ++i)
+		util_half_to_float_exponent_table[i] = 0xb8000000 + ((i - 32) << 23);
+
+	/* negative infinity/NaN */
+	util_half_to_float_exponent_table[63] = 0xff800000;
+
+	/* positive zero or denormals */
+	util_half_to_float_offset_table[0] = 0;
+
+	/* positive normals and infinity/NaN */
+	for(i = 1; i < 32; ++i)
+		util_half_to_float_offset_table[i] = 1024;
+
+	/* negative zero or denormals */
+	util_half_to_float_offset_table[32] = 0;
+
+	/* negative normals and infinity/NaN */
+	for(i = 33; i < 64; ++i)
+		util_half_to_float_offset_table[i] = 1024;
+
+
+
+	/* very small numbers mapping to zero; below, 127 + i is the table index and shift 24 drops the whole mantissa */
+	for(i = -127; i < -24; ++i) {
+		util_float_to_half_base_table[127 + i] = 0;
+		util_float_to_half_shift_table[127 + i] = 24;
+	}
+
+	/* small numbers mapping to denormals */
+	for(i = -24; i < -14; ++i) {
+		util_float_to_half_base_table[127 + i] = 0x0400 >> (-14 - i);
+		util_float_to_half_shift_table[127 + i] = -i - 1;
+	}
+
+	/* normal numbers: half exponent field is i + 15, top 10 mantissa bits are kept */
+	for(i = -14; i < 16; ++i) {
+		util_float_to_half_base_table[127 + i] = (i + 15) << 10;
+		util_float_to_half_shift_table[127 + i] = 13;
+	}
+
+	/* large numbers mapping to infinity (shift 24 drops the whole mantissa) */
+	for(i = 16; i < 128; ++i) {
+		util_float_to_half_base_table[127 + i] = 0x7c00;
+		util_float_to_half_shift_table[127 + i] = 24;
+	}
+
+	/* infinity and NaNs (shift 13 keeps only the top 10 mantissa bits) */
+	util_float_to_half_base_table[255] = 0x7c00;
+	util_float_to_half_shift_table[255] = 13;
+
+	/* negative numbers: copy the entries above, with the half sign bit 0x8000 set in base */
+	for(i = 0; i < 256; ++i) {
+		util_float_to_half_base_table[256 + i] = util_float_to_half_base_table[i] | 0x8000;
+		util_float_to_half_shift_table[256 + i] = util_float_to_half_shift_table[i];
+	}
+}
diff --git a/src/gallium/auxiliary/util/u_half.h b/src/gallium/auxiliary/util/u_half.h
new file mode 100644
index 0000000..464d43d
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_half.h
@@ -0,0 +1,55 @@
+#ifndef U_HALF_H
+#define U_HALF_H
+
+#include "pipe/p_compiler.h"
+
+extern uint32_t util_half_to_float_mantissa_table[2048]; /* all five tables are defined in u_half.c */
+extern uint32_t util_half_to_float_exponent_table[64]; /* and filled by util_half_init_tables() */
+extern uint32_t util_half_to_float_offset_table[64]; /* indexed by h >> 10 */
+extern uint16_t util_float_to_half_base_table[512]; /* indexed by v >> 23 */
+extern uint8_t util_float_to_half_shift_table[512]; /* indexed by v >> 23 */
+
+/*
+ * Note that if the half float is a signaling NaN, the x87 FPU will turn
+ * it into a quiet NaN immediately upon loading into a float.
+ *
+ * Additionally, denormals may be flushed to zero.
+ *
+ * To avoid this, use the floatui functions instead of the float ones
+ * when just doing conversion rather than computation on the resulting
+ * floats.
+ */
+
+static INLINE uint32_t
+util_half_to_floatui(half h) /* half bits -> float bits, using table lookups and one add */
+{
+	unsigned exp = h >> 10; /* table row; assumes half is a 16-bit unsigned type so exp < 64 - TODO confirm, typedef not shown here */
+	return util_half_to_float_mantissa_table[util_half_to_float_offset_table[exp] + (h & 0x3ff)]
+		+ util_half_to_float_exponent_table[exp];
+}
+
+static INLINE float
+util_half_to_float(half h) /* as util_half_to_floatui(), with the bits returned as a float */
+{
+	union {float f; uint32_t v;} r; /* union used to reinterpret the uint32_t bits as a float */
+	r.v = util_half_to_floatui(h);
+	return r.f;
+}
+
+static INLINE half
+util_floatui_to_half(uint32_t v) /* float bits -> half bits; shifted-out mantissa bits are truncated, not rounded */
+{
+	unsigned signexp = v >> 23; /* 9-bit table index: the bits above the 23-bit mantissa */
+	return util_float_to_half_base_table[signexp]
+		+ ((v & 0x007fffff) >> util_float_to_half_shift_table[signexp]); /* NOTE(review): a NaN with payload only in the low 13 bits yields 0x7c00 (infinity) */
+}
+
+static INLINE half
+util_float_to_half(float f) /* as util_floatui_to_half(), taking a float instead of its raw bits */
+{
+	union {float f; uint32_t v;} i; /* union used to read the float's bit pattern */
+	i.f = f;
+	return util_floatui_to_half(i.v);
+}
+
+#endif /* U_HALF_H */