[Mesa-dev] [PATCH (gles3) 15/20] glsl: Add lowering pass for GLSL ES 3.00 pack/unpack operations (v2)
Chad Versace
chad.versace at linux.intel.com
Mon Jan 21 00:49:27 PST 2013
Lower them to arithmetic and bit manipulation expressions.
v2:
- Rewrite using ir_builder. [for idr]
- In lowering packHalf2x16, don't truncate subnormal float16 values to zero.
And round to even rather than to zero. [for stereotype441]
CC: Ian Romanick <idr at freedesktop.org>
CC: Paul Berry <stereotype441 at gmail.com>
Signed-off-by: Chad Versace <chad.versace at linux.intel.com>
---
src/glsl/Makefile.sources | 1 +
src/glsl/ir_optimization.h | 20 +
src/glsl/lower_packing_builtins.cpp | 1043 +++++++++++++++++++++++++++++++++++
3 files changed, 1064 insertions(+)
create mode 100644 src/glsl/lower_packing_builtins.cpp
diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
index 2227c5e..ef71715 100644
--- a/src/glsl/Makefile.sources
+++ b/src/glsl/Makefile.sources
@@ -59,6 +59,7 @@ LIBGLSL_FILES = \
$(GLSL_SRCDIR)/lower_mat_op_to_vec.cpp \
$(GLSL_SRCDIR)/lower_noise.cpp \
$(GLSL_SRCDIR)/lower_packed_varyings.cpp \
+ $(GLSL_SRCDIR)/lower_packing_builtins.cpp \
$(GLSL_SRCDIR)/lower_texture_projection.cpp \
$(GLSL_SRCDIR)/lower_variable_index_to_cond_assign.cpp \
$(GLSL_SRCDIR)/lower_vec_index_to_cond_assign.cpp \
diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h
index 6b95191..ac90b87 100644
--- a/src/glsl/ir_optimization.h
+++ b/src/glsl/ir_optimization.h
@@ -37,6 +37,25 @@
#define MOD_TO_FRACT 0x20
#define INT_DIV_TO_MUL_RCP 0x40
+/**
+ * \see class lower_packing_builtins_visitor
+ */
+enum lower_packing_builtins_op {
+ LOWER_PACK_UNPACK_NONE = 0x0000,
+
+ LOWER_PACK_SNORM_2x16 = 0x0001,
+ LOWER_UNPACK_SNORM_2x16 = 0x0002,
+
+ LOWER_PACK_UNORM_2x16 = 0x0004,
+ LOWER_UNPACK_UNORM_2x16 = 0x0008,
+
+ LOWER_PACK_HALF_2x16 = 0x0010,
+ LOWER_UNPACK_HALF_2x16 = 0x0020,
+
+ LOWER_PACK_HALF_2x16_TO_SPLIT = 0x0040,
+ LOWER_UNPACK_HALF_2x16_TO_SPLIT = 0x0080,
+};
+
bool do_common_optimization(exec_list *ir, bool linked,
bool uniform_locations_assigned,
unsigned max_unroll_iterations);
@@ -74,6 +93,7 @@ bool lower_variable_index_to_cond_assign(exec_list *instructions,
bool lower_quadop_vector(exec_list *instructions, bool dont_lower_swz);
bool lower_clip_distance(gl_shader *shader);
void lower_output_reads(exec_list *instructions);
+bool lower_packing_builtins(exec_list *instructions, int op_mask);
void lower_ubo_reference(struct gl_shader *shader, exec_list *instructions);
void lower_packed_varyings(void *mem_ctx, unsigned location_base,
unsigned locations_used, ir_variable_mode mode,
diff --git a/src/glsl/lower_packing_builtins.cpp b/src/glsl/lower_packing_builtins.cpp
new file mode 100644
index 0000000..49176cc
--- /dev/null
+++ b/src/glsl/lower_packing_builtins.cpp
@@ -0,0 +1,1043 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "ir.h"
+#include "ir_builder.h"
+#include "ir_optimization.h"
+#include "ir_rvalue_visitor.h"
+
+namespace {
+
+using namespace ir_builder;
+
+/**
+ * A visitor that lowers built-in floating-point pack/unpack expressions
+ * such packSnorm2x16.
+ */
+class lower_packing_builtins_visitor : public ir_rvalue_visitor {
+public:
+ /**
+ * \param op_mask is a bitmask of `enum lower_packing_builtins_op`
+ */
+ explicit lower_packing_builtins_visitor(int op_mask)
+ : op_mask(op_mask),
+ progress(false)
+ {
+ /* Mutually exclusive options. */
+ assert(!((op_mask & LOWER_PACK_HALF_2x16) &&
+ (op_mask & LOWER_PACK_HALF_2x16_TO_SPLIT)));
+
+ assert(!((op_mask & LOWER_UNPACK_HALF_2x16) &&
+ (op_mask & LOWER_UNPACK_HALF_2x16_TO_SPLIT)));
+ }
+
+ virtual ~lower_packing_builtins_visitor()
+ {
+ assert(factory.instructions == NULL ||
+ factory.instructions->is_empty());
+
+ delete factory.instructions;
+ }
+
+ bool get_progress() { return progress; }
+
+ void handle_rvalue(ir_rvalue **rvalue)
+ {
+ if (!*rvalue)
+ return;
+
+ ir_expression *expr = (*rvalue)->as_expression();
+ if (!expr)
+ return;
+
+ enum lower_packing_builtins_op lowering_op =
+ choose_lowering_op(expr->operation);
+
+ if (lowering_op == LOWER_PACK_UNPACK_NONE)
+ return;
+
+ setup_factory(ralloc_parent(expr));
+
+ ir_rvalue *op0 = expr->operands[0];
+ ralloc_steal(factory.mem_ctx, op0);
+
+ switch (lowering_op) {
+ case LOWER_PACK_SNORM_2x16:
+ *rvalue = lower_pack_snorm_2x16(op0);
+ break;
+ case LOWER_PACK_UNORM_2x16:
+ *rvalue = lower_pack_unorm_2x16(op0);
+ break;
+ case LOWER_PACK_HALF_2x16:
+ *rvalue = lower_pack_half_2x16(op0);
+ break;
+ case LOWER_PACK_HALF_2x16_TO_SPLIT:
+ *rvalue = split_pack_half_2x16(op0);
+ break;
+ case LOWER_UNPACK_SNORM_2x16:
+ *rvalue = lower_unpack_snorm_2x16(op0);
+ break;
+ case LOWER_UNPACK_UNORM_2x16:
+ *rvalue = lower_unpack_unorm_2x16(op0);
+ break;
+ case LOWER_UNPACK_HALF_2x16:
+ *rvalue = lower_unpack_half_2x16(op0);
+ break;
+ case LOWER_UNPACK_HALF_2x16_TO_SPLIT:
+ *rvalue = split_unpack_half_2x16(op0);
+ break;
+ case LOWER_PACK_UNPACK_NONE:
+ assert(!"not reached");
+ break;
+ }
+
+ teardown_factory();
+ progress = true;
+ }
+
+private:
+ const int op_mask;
+ bool progress;
+ ir_factory factory;
+
+ /**
+ * Determine the needed lowering operation by filtering \a expr_op
+ * through \ref op_mask.
+ */
+ enum lower_packing_builtins_op
+ choose_lowering_op(ir_expression_operation expr_op)
+ {
+ /* C++ regards int and enum as fundamentally different types.
+ * So, we can't simply return from each case; we must cast the return
+ * value.
+ */
+ int result;
+
+ switch (expr_op) {
+ case ir_unop_pack_snorm_2x16:
+ result = op_mask & LOWER_PACK_SNORM_2x16;
+ break;
+ case ir_unop_pack_unorm_2x16:
+ result = op_mask & LOWER_PACK_UNORM_2x16;
+ break;
+ case ir_unop_pack_half_2x16:
+ result = op_mask & (LOWER_PACK_HALF_2x16 | LOWER_PACK_HALF_2x16_TO_SPLIT);
+ break;
+ case ir_unop_unpack_snorm_2x16:
+ result = op_mask & LOWER_UNPACK_SNORM_2x16;
+ break;
+ case ir_unop_unpack_unorm_2x16:
+ result = op_mask & LOWER_UNPACK_UNORM_2x16;
+ break;
+ case ir_unop_unpack_half_2x16:
+ result = op_mask & (LOWER_UNPACK_HALF_2x16 | LOWER_UNPACK_HALF_2x16_TO_SPLIT);
+ break;
+ default:
+ result = LOWER_PACK_UNPACK_NONE;
+ break;
+ }
+
+ return static_cast<enum lower_packing_builtins_op>(result);
+ }
+
+ void
+ setup_factory(void *mem_ctx)
+ {
+ assert(factory.mem_ctx == NULL);
+ factory.mem_ctx = mem_ctx;
+
+ /* Avoid making a new list for each call to handle_rvalue(). Make a
+ * single list and reuse it.
+ */
+ if (factory.instructions == NULL) {
+ factory.instructions = new(NULL) exec_list();
+ } else {
+ assert(factory.instructions->is_empty());
+ }
+ }
+
+ void
+ teardown_factory()
+ {
+ base_ir->insert_before(factory.instructions);
+ assert(factory.instructions->is_empty());
+ factory.mem_ctx = NULL;
+ }
+
+ template <typename T>
+ ir_constant*
+ constant(T x)
+ {
+ return factory.constant(x);
+ }
+
+ /**
+ * \brief Pack two uint16's into a single uint32.
+ *
+ * Interpret the given uvec2 as a uint16 pair. Pack the pair into a uint32
+ * where the least significant bits specify the first element of the pair.
+ * Return the uint32.
+ */
+ ir_rvalue*
+ pack_uvec2_to_uint(ir_rvalue *uvec2_rval)
+ {
+ assert(uvec2_rval->type == glsl_type::uvec2_type);
+
+ /* uvec2 u = UVEC2_RVAL; */
+ ir_variable *u = factory.make_temp(glsl_type::uvec2_type,
+ "tmp_pack_uvec2_to_uint");
+ factory.emit(assign(u, uvec2_rval));
+
+ /* return (u.y << 16) | (u.x & 0xffff); */
+ return bit_or(lshift(swizzle_y(u), constant(16u)),
+ bit_and(swizzle_x(u), constant(0xffffu)));
+ }
+
+ /**
+ * \brief Unpack a uint32 into two uint16's.
+ *
+ * Interpret the given uint32 as a uint16 pair where the uint32's least
+ * significant bits specify the pair's first element. Return the uint16
+ * pair as a uvec2.
+ */
+ ir_rvalue*
+ unpack_uint_to_uvec2(ir_rvalue *uint_rval)
+ {
+ assert(uint_rval->type == glsl_type::uint_type);
+
+ /* uint u = UINT_RVAL; */
+ ir_variable *u = factory.make_temp(glsl_type::uint_type,
+ "tmp_unpack_uint_to_uvec2_u");
+ factory.emit(assign(u, uint_rval));
+
+ /* uvec2 u2; */
+ ir_variable *u2 = factory.make_temp(glsl_type::uvec2_type,
+ "tmp_unpack_uint_to_uvec2_u2");
+
+ /* u2.x = u & 0xffffu; */
+ factory.emit(assign(u2, bit_and(u, constant(0xffffu)), WRITEMASK_X));
+
+ /* u2.y = u >> 16u; */
+ factory.emit(assign(u2, rshift(u, constant(16u)), WRITEMASK_Y));
+
+ return deref(u2).val;
+ }
+
+ /**
+ * \brief Lower a packSnorm2x16 expression.
+ *
+ * \param vec2_rval is packSnorm2x16's input
+ * \return packSnorm2x16's output as a uint rvalue
+ */
+ ir_rvalue*
+ lower_pack_snorm_2x16(ir_rvalue *vec2_rval)
+ {
+ /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
+ *
+ * hihgp uint packSnorm2x16(vec2 v)
+ * --------------------------------
+ * First, converts each component of the normalized floating-point value
+ * v into 16-bit integer values. Then, the results are packed into the
+ * returned 32-bit unsigned integer.
+ *
+ * The conversion for component c of v to fixed point is done as
+ * follows:
+ *
+ * packSnorm2x16: round(clamp(c, -1, +1) * 32767.0)
+ *
+ * The first component of the vector will be written to the least
+ * significant bits of the output; the last component will be written to
+ * the most significant bits.
+ *
+ * This function generates IR that approximates the following pseudo-GLSL:
+ *
+ * return pack_uvec2_to_uint(
+ * uvec2(ivec2(
+ * round(clamp(VEC2_RVALUE, -1.0f, 1.0f) * 32767.0f))));
+ *
+ * It is necessary to first convert the vec2 to ivec2 rather than directly
+ * converting vec2 to uvec2 because the latter conversion is undefined.
+ * From page 56 (62 of pdf) of the GLSL ES 3.00 spec: "It is undefined to
+ * convert a negative floating point value to an uint".
+ */
+ assert(vec2_rval->type == glsl_type::vec2_type);
+
+ ir_rvalue *result = pack_uvec2_to_uint(
+ i2u(f2i(round_even(mul(clamp(vec2_rval,
+ constant(-1.0f),
+ constant(1.0f)),
+ constant(32767.0f))))));
+
+ assert(result->type == glsl_type::uint_type);
+ return result;
+ }
+
+ /**
+ * \brief Lower an unpackSnorm2x16 expression.
+ *
+ * \param uint_rval is unpackSnorm2x16's input
+ * \return unpackSnorm2x16's output as a vec2 rvalue
+ */
+ ir_rvalue*
+ lower_unpack_snorm_2x16(ir_rvalue *uint_rval)
+ {
+ /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
+ *
+ * highp vec2 unpackSnorm2x16 (highp uint p)
+ * -----------------------------------------
+ * First, unpacks a single 32-bit unsigned integer p into a pair of
+ * 16-bit unsigned integers. Then, each component is converted to
+ * a normalized floating-point value to generate the returned
+ * two-component vector.
+ *
+ * The conversion for unpacked fixed-point value f to floating point is
+ * done as follows:
+ *
+ * unpackSnorm2x16: clamp(f / 32767.0, -1,+1)
+ *
+ * The first component of the returned vector will be extracted from the
+ * least significant bits of the input; the last component will be
+ * extracted from the most significant bits.
+ *
+ * This function generates IR that approximates the following pseudo-GLSL:
+ *
+ * return clamp(
+ * ((ivec2(unpack_uint_to_uvec2(UINT_RVALUE)) << 16) >> 16) / 32767.0f,
+ * -1.0f, 1.0f);
+ *
+ * The above IR may appear unnecessarily complex, but the intermediate
+ * conversion to ivec2 and the bit shifts are necessary to correctly unpack
+ * negative floats.
+ *
+ * To see why, consider packing and then unpacking vec2(-1.0, 0.0).
+ * packSnorm2x16 encodes -1.0 as the int16 0xffff. During unpacking, we
+ * place that int16 into an int32, which results in the *positive* integer
+ * 0x0000ffff. The int16's sign bit becomes, in the int32, the rather
+ * unimportant bit 16. We must now extend the int16's sign bit into bits
+ * 17-32, which is accomplished by left-shifting then right-shifting.
+ */
+
+ assert(uint_rval->type == glsl_type::uint_type);
+
+ ir_rvalue *result =
+ clamp(div(i2f(rshift(lshift(u2i(unpack_uint_to_uvec2(uint_rval)),
+ constant(16)),
+ constant(16u))),
+ constant(32767.0f)),
+ constant(-1.0f),
+ constant(1.0f));
+
+ assert(result->type == glsl_type::vec2_type);
+ return result;
+ }
+
+ /**
+ * \brief Lower a packUnorm2x16 expression.
+ *
+ * \param vec2_rval is packUnorm2x16's input
+ * \return packUnorm2x16's as a uint rvalue
+ */
+ ir_rvalue*
+ lower_pack_unorm_2x16(ir_rvalue *vec2_rval)
+ {
+ /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
+ *
+ * highp uint packUnorm2x16 (vec2 v)
+ * ---------------------------------
+ * First, converts each component of the normalized floating-point value
+ * v into 16-bit integer values. Then, the results are packed into the
+ * returned 32-bit unsigned integer.
+ *
+ * The conversion for component c of v to fixed point is done as
+ * follows:
+ *
+ * packUnorm2x16: round(clamp(c, 0, +1) * 65535.0)
+ *
+ * The first component of the vector will be written to the least
+ * significant bits of the output; the last component will be written to
+ * the most significant bits.
+ *
+ * This function generates IR that approximates the following pseudo-GLSL:
+ *
+ * return pack_uvec2_to_uint(uvec2(
+ * round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 65535.0f)));
+ *
+ * Here it is safe to directly convert the vec2 to uvec2 because the the
+ * vec2 has been clamped to a non-negative range.
+ */
+
+ assert(vec2_rval->type == glsl_type::vec2_type);
+
+ ir_rvalue *result = pack_uvec2_to_uint(
+ f2u(round_even(mul(saturate(vec2_rval), constant(65535.0f)))));
+
+ assert(result->type == glsl_type::uint_type);
+ return result;
+ }
+
+ /**
+ * \brief Lower an unpackUnorm2x16 expression.
+ *
+ * \param uint_rval is unpackUnorm2x16's input
+ * \return unpackUnorm2x16's output as a vec2 rvalue
+ */
+ ir_rvalue*
+ lower_unpack_unorm_2x16(ir_rvalue *uint_rval)
+ {
+ /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
+ *
+ * highp vec2 unpackUnorm2x16 (highp uint p)
+ * -----------------------------------------
+ * First, unpacks a single 32-bit unsigned integer p into a pair of
+ * 16-bit unsigned integers. Then, each component is converted to
+ * a normalized floating-point value to generate the returned
+ * two-component vector.
+ *
+ * The conversion for unpacked fixed-point value f to floating point is
+ * done as follows:
+ *
+ * unpackUnorm2x16: f / 65535.0
+ *
+ * The first component of the returned vector will be extracted from the
+ * least significant bits of the input; the last component will be
+ * extracted from the most significant bits.
+ *
+ * This function generates IR that approximates the following pseudo-GLSL:
+ *
+ * return vec2(unpack_uint_to_uvec2(UINT_RVALUE)) / 65535.0;
+ */
+
+ assert(uint_rval->type == glsl_type::uint_type);
+
+ ir_rvalue *result = div(u2f(unpack_uint_to_uvec2(uint_rval)),
+ constant(65535.0f));
+
+ assert(result->type == glsl_type::vec2_type);
+ return result;
+ }
+
+ /**
+ * \brief Lower the component-wise calculation of packHalf2x16.
+ *
+ * \param f_rval is one component of packHafl2x16's input
+ * \param e_rval is the unshifted exponent bits of f_rval
+ * \param m_rval is the unshifted mantissa bits of f_rval
+ *
+ * \return a uint rvalue that encodes a float16 in its lower 16 bits
+ */
+ ir_rvalue*
+ pack_half_1x16_nosign(ir_rvalue *f_rval,
+ ir_rvalue *e_rval,
+ ir_rvalue *m_rval)
+ {
+ assert(e_rval->type == glsl_type::uint_type);
+ assert(m_rval->type == glsl_type::uint_type);
+
+ /* uint u16; */
+ ir_variable *u16 = factory.make_temp(glsl_type::uint_type,
+ "tmp_pack_half_1x16_u16");
+
+ /* float f = FLOAT_RVAL; */
+ ir_variable *f = factory.make_temp(glsl_type::float_type,
+ "tmp_pack_half_1x16_f");
+ factory.emit(assign(f, f_rval));
+
+ /* uint e = E_RVAL; */
+ ir_variable *e = factory.make_temp(glsl_type::uint_type,
+ "tmp_pack_half_1x16_e");
+ factory.emit(assign(e, e_rval));
+
+ /* uint m = M_RVAL; */
+ ir_variable *m = factory.make_temp(glsl_type::uint_type,
+ "tmp_pack_half_1x16_m");
+ factory.emit(assign(m, m_rval));
+
+ /* Preliminaries
+ * -------------
+ *
+ * For a float16, the bit layout is:
+ *
+ * sign: 15
+ * exponent: 10:14
+ * mantissa: 0:9
+ *
+ * Let f16 be a float16 value. The sign, exponent, and mantissa
+ * determine its value thus:
+ *
+ * if e16 = 0 and m16 = 0, then zero: (-1)^s16 * 0 (1)
+ * if e16 = 0 and m16!= 0, then subnormal: (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10) (2)
+ * if 0 < e16 < 31, then normal: (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
+ * if e16 = 31 and m16 = 0, then infinite: (-1)^s16 * inf (4)
+ * if e16 = 31 and m16 != 0, then NaN (5)
+ *
+ * where 0 <= m16 < 2^10.
+ *
+ * For a float32, the bit layout is:
+ *
+ * sign: 31
+ * exponent: 23:30
+ * mantissa: 0:22
+ *
+ * Let f32 be a float32 value. The sign, exponent, and mantissa
+ * determine its value thus:
+ *
+ * if e32 = 0 and m32 = 0, then zero: (-1)^s * 0 (10)
+ * if e32 = 0 and m32 != 0, then subnormal: (-1)^s * 2^(e32 - 126) * (m32 / 2^23) (11)
+ * if 0 < e32 < 255, then normal: (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
+ * if e32 = 255 and m32 = 0, then infinite: (-1)^s * inf (13)
+ * if e32 = 255 and m32 != 0, then NaN (14)
+ *
+ * where 0 <= m32 < 2^23.
+ *
+ * The minimum and maximum normal float16 values are
+ *
+ * min_norm16 = 2^(1 - 15) * (1 + 0 / 2^10) = 2^(-14) (20)
+ * max_norm16 = 2^(30 - 15) * (1 + 1023 / 2^10) (21)
+ *
+ * The step at max_norm16 is
+ *
+ * max_step16 = 2^5 (22)
+ *
+ * Observe that the float16 boundary values in equations 20-21 lie in the
+ * range of normal float32 values.
+ *
+ *
+ * Rounding Behavior
+ * -----------------
+ * Not all float32 values can be exactly represented as a float16. We
+ * round all such intermediate float32 values to the nearest float16; if
+ * the float32 is exactly between to float16 values, we round to the one
+ * with an even mantissa. This rounding behavior has several benefits:
+ *
+ * - It has no sign bias.
+ *
+ * - It reproduces the behavior of real hardware: opcode F32TO16 in Intel's
+ * GPU ISA.
+ *
+ * - By reproducing the behavior of the GPU (at least on Intel hardware),
+ * compile-time evaluation of constant packHalf2x16 GLSL expressions will
+ * result in the same value as if the expression were executed on the
+ * GPU.
+ *
+ * Calculation
+ * -----------
+ * Our task is to compute s16, e16, m16 given f32. Since this function
+ * ignores the sign bit, assume that s32 = s16 = 0. There are several
+ * cases consider.
+ */
+
+ factory.emit(
+
+ /* Case 1) f32 is NaN
+ *
+ * The resultant f16 will also be NaN.
+ */
+
+ /* if (e32 == 255 && m32 != 0) { */
+ if_tree(logic_and(equal(e, constant(0xffu << 23u)),
+ logic_not(equal(m, constant(0u)))),
+
+ assign(u16, constant(0x7fffu)),
+
+ /* Case 2) f32 lies in the range [0, min_norm16).
+ *
+ * The resultant float16 will be either zero, subnormal, or normal.
+ *
+ * Solving
+ *
+ * f32 = min_norm16 (30)
+ *
+ * gives
+ *
+ * e32 = 113 and m32 = 0 (31)
+ *
+ * Therefore this case occurs if and only if
+ *
+ * e32 < 113 (32)
+ */
+
+ /* } else if (e32 < 113) { */
+ if_tree(less(e, constant(113u << 23u)),
+
+ /* u16 = uint(round_to_even(abs(f32) * float(1u << 24u))); */
+ assign(u16, f2u(round_even(mul(expr(ir_unop_abs, f),
+ constant((float) (1 << 24)))))),
+
+ /* Case 3) f32 lies in the range
+ * [min_norm16, max_norm16 + max_step16).
+ *
+ * The resultant float16 will be either normal or infinite.
+ *
+ * Solving
+ *
+ * f32 = max_norm16 + max_step16 (40)
+ * = 2^15 * (1 + 1023 / 2^10) + 2^5 (41)
+ * = 2^16 (42)
+ * gives
+ *
+ * e32 = 142 and m32 = 0 (43)
+ *
+ * We already solved the boundary condition f32 = min_norm16 above
+ * in equation 31. Therefore this case occurs if and only if
+ *
+ * 113 <= e32 and e32 < 142
+ */
+
+ /* } else if (e32 < 142) { */
+ if_tree(lequal(e, constant(142u << 23u)),
+
+ /* The addition below handles the case where the mantissa rounds
+ * up to 1024 and bumps the exponent.
+ *
+ * u16 = ((e - (112u << 23u)) >> 13u)
+ * + round_to_even((float(m) / (1u << 13u));
+ */
+ assign(u16, add(rshift(sub(e, constant(112u << 23u)),
+ constant(13u)),
+ f2u(round_even(
+ div(u2f(m), constant((float) (1 << 13))))))),
+
+ /* Case 4) f32 lies in the range [max_norm16 + max_step16, inf].
+ *
+ * The resultant float16 will be infinite.
+ *
+ * The cases above caught all float32 values in the range
+ * [0, max_norm16 + max_step16), so this is the fall-through case.
+ */
+
+ /* } else { */
+
+ assign(u16, constant(31u << 10u))))));
+
+ /* } */
+
+ return deref(u16).val;
+ }
+
+ /**
+ * \brief Lower a packHalf2x16 expression.
+ *
+ * \param vec2_rval is packHalf2x16's input
+ * \return packHalf2x16's output as a uint rvalue
+ */
+ ir_rvalue*
+ lower_pack_half_2x16(ir_rvalue *vec2_rval)
+ {
+ /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
+ *
+ * highp uint packHalf2x16 (mediump vec2 v)
+ * ----------------------------------------
+ * Returns an unsigned integer obtained by converting the components of
+ * a two-component floating-point vector to the 16-bit floating-point
+ * representation found in the OpenGL ES Specification, and then packing
+ * these two 16-bit integers into a 32-bit unsigned integer.
+ *
+ * The first vector component specifies the 16 least- significant bits
+ * of the result; the second component specifies the 16 most-significant
+ * bits.
+ */
+
+ assert(vec2_rval->type == glsl_type::vec2_type);
+
+ /* vec2 f = VEC2_RVAL; */
+ ir_variable *f = factory.make_temp(glsl_type::vec2_type,
+ "tmp_pack_half_2x16_f");
+ factory.emit(assign(f, vec2_rval));
+
+ /* uvec2 f32 = bitcast_f2u(f); */
+ ir_variable *f32 = factory.make_temp(glsl_type::uvec2_type,
+ "tmp_pack_half_2x16_f32");
+ factory.emit(assign(f32, expr(ir_unop_bitcast_f2u, f)));
+
+ /* uvec2 f16; */
+ ir_variable *f16 = factory.make_temp(glsl_type::uvec2_type,
+ "tmp_pack_half_2x16_f16");
+
+ /* Get f32's unshifted exponent bits.
+ *
+ * uvec2 e = f32 & 0x7f800000u;
+ */
+ ir_variable *e = factory.make_temp(glsl_type::uvec2_type,
+ "tmp_pack_half_2x16_e");
+ factory.emit(assign(e, bit_and(f32, constant(0x7f800000u))));
+
+ /* Get f32's unshifted mantissa bits.
+ *
+ * uvec2 m = f32 & 0x007fffffu;
+ */
+ ir_variable *m = factory.make_temp(glsl_type::uvec2_type,
+ "tmp_pack_half_2x16_m");
+ factory.emit(assign(m, bit_and(f32, constant(0x007fffffu))));
+
+ /* Set f16's exponent and mantissa bits.
+ *
+ * f16.x = pack_half_1x16_nosign(e.x, m.x);
+ * f16.y = pack_half_1y16_nosign(e.y, m.y);
+ */
+ factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_x(f),
+ swizzle_x(e),
+ swizzle_x(m)),
+ WRITEMASK_X));
+ factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_y(f),
+ swizzle_y(e),
+ swizzle_y(m)),
+ WRITEMASK_Y));
+
+ /* Set f16's sign bits.
+ *
+ * f16 |= (f32 & (1u << 31u) >> 16u;
+ */
+ factory.emit(
+ assign(f16, bit_or(f16,
+ rshift(bit_and(f32, constant(1u << 31u)),
+ constant(16u)))));
+
+
+ /* return (f16.y << 16u) | f16.x; */
+ ir_rvalue *result = bit_or(lshift(swizzle_y(f16),
+ constant(16u)),
+ swizzle_x(f16));
+
+ assert(result->type == glsl_type::uint_type);
+ return result;
+ }
+
+ /**
+ * \brief Split packHalf2x16's vec2 operand into two floats.
+ *
+ * \param vec2_rval is packHalf2x16's input
+ * \return a uint rvalue
+ *
+ * Some code generators, such as the i965 fragment shader, require that all
+ * vector expressions be lowered to a sequence of scalar expressions.
+ * However, packHalf2x16 cannot be scalarized by the same mechanism as
+ * a true vector operation because its input and output have a differing
+ * number of vector components.
+ *
+ * This method scalarizes packHalf2x16 by transforming it from an unary
+ * operation having vector input to a binary operation having scalar input.
+ * That is, it transforms
+ *
+ * packHalf2x16(VEC2_RVAL);
+ *
+ * into
+ *
+ * vec2 v = VEC2_RVAL;
+ * return packHalf2x16_split(v.x, v.y);
+ */
+ ir_rvalue*
+ split_pack_half_2x16(ir_rvalue *vec2_rval)
+ {
+ assert(vec2_rval->type == glsl_type::vec2_type);
+
+ ir_variable *v = factory.make_temp(glsl_type::vec2_type,
+ "tmp_split_pack_half_2x16_v");
+ factory.emit(assign(v, vec2_rval));
+
+ return expr(ir_binop_pack_half_2x16_split, swizzle_x(v), swizzle_y(v));
+ }
+
+ /**
+ * \brief Lower the component-wise calculation of unpackHalf2x16.
+ *
+ * Given a uint that encodes a float16 in its lower 16 bits, this function
+ * returns a uint that encodes a float32 with the same value. The sign bit
+ * of the float16 is ignored.
+ *
+ * \param e_rval is the unshifted exponent bits of a float16
+ * \param m_rval is the unshifted mantissa bits of a float16
+ * \param a uint rvalue that encodes a float32
+ */
+ ir_rvalue*
+ unpack_half_1x16_nosign(ir_rvalue *e_rval, ir_rvalue *m_rval)
+ {
+ assert(e_rval->type == glsl_type::uint_type);
+ assert(m_rval->type == glsl_type::uint_type);
+
+ /* uint u32; */
+ ir_variable *u32 = factory.make_temp(glsl_type::uint_type,
+ "tmp_unpack_half_1x16_u32");
+
+ /* uint e = E_RVAL; */
+ ir_variable *e = factory.make_temp(glsl_type::uint_type,
+ "tmp_unpack_half_1x16_e");
+ factory.emit(assign(e, e_rval));
+
+ /* uint m = M_RVAL; */
+ ir_variable *m = factory.make_temp(glsl_type::uint_type,
+ "tmp_unpack_half_1x16_m");
+ factory.emit(assign(m, m_rval));
+
+ /* Preliminaries
+ * -------------
+ *
+ * For a float16, the bit layout is:
+ *
+ * sign: 15
+ * exponent: 10:14
+ * mantissa: 0:9
+ *
+ * Let f16 be a float16 value. The sign, exponent, and mantissa
+ * determine its value thus:
+ *
+ * if e16 = 0 and m16 = 0, then zero: (-1)^s16 * 0 (1)
+ * if e16 = 0 and m16!= 0, then subnormal: (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10) (2)
+ * if 0 < e16 < 31, then normal: (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
+ * if e16 = 31 and m16 = 0, then infinite: (-1)^s16 * inf (4)
+ * if e16 = 31 and m16 != 0, then NaN (5)
+ *
+ * where 0 <= m16 < 2^10.
+ *
+ * For a float32, the bit layout is:
+ *
+ * sign: 31
+ * exponent: 23:30
+ * mantissa: 0:22
+ *
+ * Let f32 be a float32 value. The sign, exponent, and mantissa
+ * determine its value thus:
+ *
+ * if e32 = 0 and m32 = 0, then zero: (-1)^s * 0 (10)
+ * if e32 = 0 and m32 != 0, then subnormal: (-1)^s * 2^(e32 - 126) * (m32 / 2^23) (11)
+ * if 0 < e32 < 255, then normal: (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
+ * if e32 = 255 and m32 = 0, then infinite: (-1)^s * inf (13)
+ * if e32 = 255 and m32 != 0, then NaN (14)
+ *
+ * where 0 <= m32 < 2^23.
+ *
+ * Calculation
+ * -----------
+ * Our task is to compute s32, e32, m32 given f16. Since this function
+ * ignores the sign bit, assume that s32 = s16 = 0. There are several
+ * cases consider.
+ */
+
+ factory.emit(
+
+ /* Case 1) f16 is zero or subnormal.
+ *
+ * The simplest method of calcuating f32 in this case is
+ *
+ * f32 = f16 (20)
+ * = 2^(-14) * (m16 / 2^10) (21)
+ * = m16 / 2^(-24) (22)
+ */
+
+ /* if (e16 == 0) { */
+ if_tree(equal(e, constant(0u)),
+
+ /* u32 = bitcast_f2u(float(m) / float(1 << 24)); */
+ assign(u32, expr(ir_unop_bitcast_f2u,
+ div(u2f(m), constant((float)(1 << 24))))),
+
+ /* Case 2) f16 is normal.
+ *
+ * The equation
+ *
+ * f32 = f16 (30)
+ * 2^(e32 - 127) * (1 + m32 / 2^23) = (31)
+ * 2^(e16 - 15) * (1 + m16 / 2^10)
+ *
+ * can be decomposed into two
+ *
+ * 2^(e32 - 127) = 2^(e16 - 15) (32)
+ * 1 + m32 / 2^23 = 1 + m16 / 2^10 (33)
+ *
+ * which solve to
+ *
+ * e32 = e16 + 112 (34)
+ * m32 = m16 * 2^13 (35)
+ */
+
+ /* } else if (e16 < 31)) { */
+ if_tree(less(e, constant(31u << 10u)),
+
+ /* u32 = ((e << 13) + (112 << 23))
+ * | (m << 13);
+ */
+ assign(u32, bit_or(add(lshift(e, constant(13u)),
+ constant(112u << 23u)),
+ lshift(m, constant(13u)))),
+
+ /* Case 3) f16 is infinite. */
+ if_tree(equal(m, constant(0u)),
+
+ assign(u32, constant(255u << 23u)),
+
+ /* Case 4) f16 is NaN. */
+ /* } else { */
+
+ assign(u32, constant(0x7fffffffu))))));
+
+ /* } */
+
+ return deref(u32).val;
+ }
+
+ /**
+ * \brief Lower an unpackHalf2x16 expression.
+ *
+ * \param uint_rval is unpackHalf2x16's input
+ * \return unpackHalf2x16's output as a vec2 rvalue
+ */
+ ir_rvalue*
+ lower_unpack_half_2x16(ir_rvalue *uint_rval)
+ {
+ /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
+ *
+ * mediump vec2 unpackHalf2x16 (highp uint v)
+ * ------------------------------------------
+ * Returns a two-component floating-point vector with components
+ * obtained by unpacking a 32-bit unsigned integer into a pair of 16-bit
+ * values, interpreting those values as 16-bit floating-point numbers
+ * according to the OpenGL ES Specification, and converting them to
+ * 32-bit floating-point values.
+ *
+ * The first component of the vector is obtained from the
+ * 16 least-significant bits of v; the second component is obtained
+ * from the 16 most-significant bits of v.
+ */
+ assert(uint_rval->type == glsl_type::uint_type);
+
+ /* uint u = RVALUE;
+ * uvec2 f16 = uvec2(u.x & 0xffff, u.y >> 16);
+ */
+ ir_variable *f16 = factory.make_temp(glsl_type::uvec2_type,
+ "tmp_unpack_half_2x16_f16");
+ factory.emit(assign(f16, unpack_uint_to_uvec2(uint_rval)));
+
+ /* uvec2 f32; */
+ ir_variable *f32 = factory.make_temp(glsl_type::uvec2_type,
+ "tmp_unpack_half_2x16_f32");
+
+ /* Get f16's unshifted exponent bits.
+ *
+ * uvec2 e = f16 & 0x7c00u;
+ */
+ ir_variable *e = factory.make_temp(glsl_type::uvec2_type,
+ "tmp_unpack_half_2x16_e");
+ factory.emit(assign(e, bit_and(f16, constant(0x7c00u))));
+
+ /* Get f16's unshifted mantissa bits.
+ *
+ * uvec2 m = f16 & 0x03ffu;
+ */
+ ir_variable *m = factory.make_temp(glsl_type::uvec2_type,
+ "tmp_unpack_half_2x16_m");
+ factory.emit(assign(m, bit_and(f16, constant(0x03ffu))));
+
+ /* Set f32's exponent and mantissa bits.
+ *
+ * f32.x = unpack_half_1x16_nosign(e.x, m.x);
+ * f32.y = unpack_half_1x16_nosign(e.y, m.y);
+ */
+ factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_x(e),
+ swizzle_x(m)),
+ WRITEMASK_X));
+ factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_y(e),
+ swizzle_y(m)),
+ WRITEMASK_Y));
+
+ /* Set f32's sign bit.
+ *
+ * f32 |= (f16 & 0x8000u) << 16u;
+ */
+ factory.emit(assign(f32, bit_or(f32,
+ lshift(bit_and(f16,
+ constant(0x8000u)),
+ constant(16u)))));
+
+ /* return bitcast_u2f(f32); */
+ ir_rvalue *result = expr(ir_unop_bitcast_u2f, f32);
+ assert(result->type == glsl_type::vec2_type);
+ return result;
+ }
+
+ /**
+ * \brief Split unpackHalf2x16 into two operations.
+ *
+ * \param uint_rval is unpackHalf2x16's input
+ * \return a vec2 rvalue
+ *
+ * Some code generators, such as the i965 fragment shader, require that all
+ * vector expressions be lowered to a sequence of scalar expressions.
+ * However, unpackHalf2x16 cannot be scalarized by the same method as
+ * a true vector operation because the number of components of its input
+ * and output differ.
+ *
+ * This method scalarizes unpackHalf2x16 by transforming it from a single
+ * operation having vec2 output to a pair of operations each having float
+ * output. That is, it transforms
+ *
+ * unpackHalf2x16(UINT_RVAL)
+ *
+ * into
+ *
+ * uint u = UINT_RVAL;
+ * vec2 v;
+ *
+ * v.x = unpackHalf2x16_split_x(u);
+ * v.y = unpackHalf2x16_split_y(u);
+ *
+ * return v;
+ */
+ ir_rvalue*
+ split_unpack_half_2x16(ir_rvalue *uint_rval)
+ {
+ assert(uint_rval->type == glsl_type::uint_type);
+
+ /* uint u = uint_rval; */
+ ir_variable *u = factory.make_temp(glsl_type::uint_type,
+ "tmp_split_unpack_half_2x16_u");
+ factory.emit(assign(u, uint_rval));
+
+ /* vec2 v; */
+ ir_variable *v = factory.make_temp(glsl_type::vec2_type,
+ "tmp_split_unpack_half_2x16_v");
+
+ /* v.x = unpack_half_2x16_split_x(u); */
+ factory.emit(assign(v, expr(ir_unop_unpack_half_2x16_split_x, u),
+ WRITEMASK_X));
+
+ /* v.y = unpack_half_2x16_split_y(u); */
+ factory.emit(assign(v, expr(ir_unop_unpack_half_2x16_split_y, u),
+ WRITEMASK_Y));
+
+ return deref(v).val;
+ }
+};
+
+} // namespace anonymous
+
+/**
+ * \brief Lower the builtin packing functions.
+ *
+ * \param op_mask is a bitmask of `enum lower_packing_builtins_op`.
+ */
+bool
+lower_packing_builtins(exec_list *instructions, int op_mask)
+{
+ lower_packing_builtins_visitor v(op_mask);
+ visit_list_elements(&v, instructions, true);
+ return v.get_progress();
+}
--
1.8.1.1
More information about the mesa-dev
mailing list