[Mesa-dev] [PATCH 05/10] glsl: Add lowering pass for GLSL ES 3.00 pack/unpack operations

Ian Romanick idr at freedesktop.org
Thu Jan 10 10:36:33 PST 2013


On 01/10/2013 12:10 AM, Chad Versace wrote:
> Lower them to arithmetic and bit manipulation expressions.
>
> Signed-off-by: Chad Versace <chad.versace at linux.intel.com>
> ---
>   src/glsl/Makefile.sources           |    1 +
>   src/glsl/ir_optimization.h          |   18 +
>   src/glsl/lower_packing_builtins.cpp | 1566 +++++++++++++++++++++++++++++++++++
>   3 files changed, 1585 insertions(+)
>   create mode 100644 src/glsl/lower_packing_builtins.cpp
>
> diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
> index d984c5c..4aecdd7 100644
> --- a/src/glsl/Makefile.sources
> +++ b/src/glsl/Makefile.sources
> @@ -58,6 +58,7 @@ LIBGLSL_FILES = \
>   	$(GLSL_SRCDIR)/lower_mat_op_to_vec.cpp \
>   	$(GLSL_SRCDIR)/lower_noise.cpp \
>   	$(GLSL_SRCDIR)/lower_packed_varyings.cpp \
> +	$(GLSL_SRCDIR)/lower_packing_builtins.cpp \
>   	$(GLSL_SRCDIR)/lower_texture_projection.cpp \
>   	$(GLSL_SRCDIR)/lower_variable_index_to_cond_assign.cpp \
>   	$(GLSL_SRCDIR)/lower_vec_index_to_cond_assign.cpp \
> diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h
> index 6b95191..39d3a74 100644
> --- a/src/glsl/ir_optimization.h
> +++ b/src/glsl/ir_optimization.h
> @@ -37,6 +37,23 @@
>   #define MOD_TO_FRACT       0x20
>   #define INT_DIV_TO_MUL_RCP 0x40
>
> +/**
> + * \see class lower_packing_builtins_visitor
> + */
> +enum lower_packing_builtins_op {
> +   LOWER_PACK_SNORM_2x16                = 0x0001,
> +   LOWER_UNPACK_SNORM_2x16              = 0x0002,
> +
> +   LOWER_PACK_UNORM_2x16                = 0x0004,
> +   LOWER_UNPACK_UNORM_2x16              = 0x0008,
> +
> +   LOWER_PACK_HALF_2x16                 = 0x0010,
> +   LOWER_UNPACK_HALF_2x16               = 0x0020,
> +
> +   LOWER_PACK_HALF_2x16_TO_SPLIT        = 0x0040,
> +   LOWER_UNPACK_HALF_2x16_TO_SPLIT      = 0x0080,
> +};
> +
>   bool do_common_optimization(exec_list *ir, bool linked,
>   			    bool uniform_locations_assigned,
>   			    unsigned max_unroll_iterations);
> @@ -74,6 +91,7 @@ bool lower_variable_index_to_cond_assign(exec_list *instructions,
>   bool lower_quadop_vector(exec_list *instructions, bool dont_lower_swz);
>   bool lower_clip_distance(gl_shader *shader);
>   void lower_output_reads(exec_list *instructions);
> +bool lower_packing_builtins(exec_list *instructions, int op_mask);
>   void lower_ubo_reference(struct gl_shader *shader, exec_list *instructions);
>   void lower_packed_varyings(void *mem_ctx, unsigned location_base,
>                              unsigned locations_used, ir_variable_mode mode,
> diff --git a/src/glsl/lower_packing_builtins.cpp b/src/glsl/lower_packing_builtins.cpp
> new file mode 100644
> index 0000000..cd84084
> --- /dev/null
> +++ b/src/glsl/lower_packing_builtins.cpp
> @@ -0,0 +1,1566 @@
> +/*
> + * Copyright © 2012 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
> + * DEALINGS IN THE SOFTWARE.
> + */
> +
> +#include "ir.h"
> +#include "ir_optimization.h"
> +#include "ir_rvalue_visitor.h"
> +
> +namespace {
> +
> +/**
> + * A visitor that lowers the built-in floating-point packing and unpacking
> + * operations to arithmetic and bit manipulation expressions.
> + */
> +class lower_packing_builtins_visitor : public ir_rvalue_visitor {
> +public:
> +   /**
> +    * \param op_mask is a bitmask of `enum lower_packing_builtins_op`
> +    */
> +   explicit lower_packing_builtins_visitor(int op_mask) :
> +      op_mask(op_mask),
> +      progress(false),
> +      prev_inserted_instruction(NULL)
> +   {
> +      /* Mutually exclusive options. */
> +      assert(!((op_mask & LOWER_PACK_HALF_2x16) &&
> +               (op_mask & LOWER_PACK_HALF_2x16_TO_SPLIT)));
> +
> +      assert(!((op_mask & LOWER_UNPACK_HALF_2x16) &&
> +               (op_mask & LOWER_UNPACK_HALF_2x16_TO_SPLIT)));
> +
> +      return;
> +   }
> +
> +   virtual ~lower_packing_builtins_visitor() {}
> +
> +   bool get_progress() { return progress; }
> +
> +   void handle_rvalue(ir_rvalue **rvalue)
> +   {
> +      if (!*rvalue)
> +	 return;
> +
> +      ir_expression *expr = (*rvalue)->as_expression();
> +      if (!expr)
> +	 return;
> +
> +      void *mem_ctx = ralloc_parent(*rvalue);
> +      ir_rvalue *op0 = expr->operands[0];
> +      reset_instruction_insertion();
> +
> +      switch (expr->operation) {
> +      case ir_unop_pack_snorm_2x16:
> +         assert((*rvalue)->type == glsl_type::uint_type);
> +         assert(op0->type == glsl_type::vec2_type);

Aren't all of these assertions already handled by ir_validate?

> +         if (op_mask & LOWER_PACK_SNORM_2x16) {
> +            *rvalue = lower_pack_snorm_2x16(mem_ctx, op0);
> +         }
> +         break;
> +      case ir_unop_pack_unorm_2x16:
> +         assert((*rvalue)->type == glsl_type::uint_type);
> +         assert(op0->type == glsl_type::vec2_type);
> +         if (op_mask & LOWER_PACK_UNORM_2x16) {
> +            *rvalue = lower_pack_unorm_2x16(mem_ctx, op0);
> +         }
> +         break;
> +      case ir_unop_pack_half_2x16:
> +         assert((*rvalue)->type == glsl_type::uint_type);
> +         assert(op0->type == glsl_type::vec2_type);
> +         if (op_mask & LOWER_PACK_HALF_2x16) {
> +            *rvalue = pack_half_2x16(mem_ctx, op0);
> +         } else if (op_mask & LOWER_PACK_HALF_2x16_TO_SPLIT) {
> +            *rvalue = split_pack_half_2x16(mem_ctx, op0);
> +         }
> +         break;
> +      case ir_unop_unpack_snorm_2x16:
> +         assert((*rvalue)->type == glsl_type::vec2_type);
> +         assert(op0->type == glsl_type::uint_type);
> +         if (op_mask & LOWER_UNPACK_SNORM_2x16) {
> +            *rvalue = lower_unpack_snorm_2x16(mem_ctx, op0);
> +         }
> +         break;
> +      case ir_unop_unpack_unorm_2x16:
> +         assert((*rvalue)->type == glsl_type::vec2_type);
> +         assert(op0->type == glsl_type::uint_type);
> +         if (op_mask & LOWER_UNPACK_UNORM_2x16) {
> +            *rvalue = lower_unpack_unorm_2x16(mem_ctx, op0);
> +         }
> +         break;
> +      case ir_unop_unpack_half_2x16:
> +         assert((*rvalue)->type == glsl_type::vec2_type);
> +         assert(op0->type == glsl_type::uint_type);
> +         if (op_mask & LOWER_UNPACK_HALF_2x16) {
> +            *rvalue = lower_unpack_half_2x16(mem_ctx, op0);
> +         } else if (op_mask & LOWER_UNPACK_HALF_2x16_TO_SPLIT) {
> +            *rvalue = split_unpack_half_2x16(mem_ctx, op0);
> +         }
> +         break;
> +      default:
> +         return;
> +      }
> +
> +      progress = true;
> +   }
> +
> +private:
> +   int op_mask;
> +   bool progress;
> +   ir_instruction *prev_inserted_instruction;
> +
> +   void
> +   reset_instruction_insertion()
> +   {
> +      prev_inserted_instruction = NULL;
> +   }
> +
> +   void
> +   insert_instruction(ir_instruction *ir)
> +   {
> +      assert(ir != NULL);
> +
> +      if (prev_inserted_instruction == NULL) {
> +         base_ir->insert_before(ir);
> +      } else {
> +         prev_inserted_instruction->insert_after(ir);
> +      }
> +
> +      prev_inserted_instruction = ir;
> +   }
> +
> +   /**
> +    * \brief Pack two uint16's into a single uint32.
> +    *
> +    * Interpret the given uvec2 as a uint16 pair. Pack the pair into a uint32
> +    * where the least significant bits specify the first element of the pair.
> +    * Return the uint32 as a uint rvalue.
> +    *
> +    * This function generates IR that approximates the following GLSL:
> +    *
> +    *    uvec2 *u = UVEC2_RVAL;
> +    *    return (u.y << 16) | (u.x & 0xffff);
> +    */
> +   ir_rvalue*
> +   pack_uvec2_to_uint(void *mem_ctx, ir_rvalue *uvec2_rval)
> +   {
> +      assert(uvec2_rval->type == glsl_type::uvec2_type);
> +
> +      /* uvec2 u = uvec2_rval; */
> +      ir_variable *u2 =
> +         new(mem_ctx) ir_variable(glsl_type::uvec2_type,
> +                                  "tmp_pack_uvec2_to_uint",
> +                                  ir_var_temporary);
> +      insert_instruction(u2);
> +      insert_instruction(
> +         new(mem_ctx) ir_assignment(
> +            new(mem_ctx) ir_dereference_variable(u2),
> +            uvec2_rval));
> +
> +      /* return (u.y << 16) | (u.x & 0xffff); */
> +      return
> +         new(mem_ctx) ir_expression(ir_binop_bit_or,
> +           new(mem_ctx) ir_expression(ir_binop_lshift,
> +             new(mem_ctx) ir_swizzle(
> +                new(mem_ctx) ir_dereference_variable(u2),
> +                1, 0, 0, 0, 1),
> +             new(mem_ctx) ir_constant(16u)),
> +           new(mem_ctx) ir_expression(ir_binop_bit_and,
> +             new(mem_ctx) ir_swizzle(
> +                new(mem_ctx) ir_dereference_variable(u2),
> +                0, 0, 0, 0, 1),
> +             new(mem_ctx) ir_constant(0xffffu)));

Reading this just turned my brain to mush.  I can't imagine what writing 
it did to yours. :)  ir_builder, perhaps?  You may need to add a couple 
methods (lsr, lsl, etc.), but that doesn't seem like a bad thing...

> +   }
> +
> +   /**
> +    * \brief Unpack a uint32 to two uint16's.
> +    *
> +    * Interpret the given uint32 as a uint16 pair where the uint32's least
> +    * significant bits specify the pair's first element. Return the uint16
> +    * pair as a uvec2 rvalue.
> +    *
> +    * This function generates IR that approximates the following GLSL:
> +    *
> +    *    uint u = UINT_RVAL;
> +    *    return uvec2(u & 0xffffu, u >> 16u);
> +    */
> +   ir_rvalue*
> +   unpack_uint_to_uvec2(void *mem_ctx, ir_rvalue *uint_rval)
> +   {
> +      assert(uint_rval->type == glsl_type::uint_type);
> +
> +      /* uint u; */
> +      ir_variable *u =
> +         new(mem_ctx) ir_variable(glsl_type::uint_type,
> +                                  "tmp_unpack_uint_to_uvec2_u",
> +                                  ir_var_temporary);
> +      insert_instruction(u);
> +
> +      /* u = uint_rval; */
> +      insert_instruction(
> +         new(mem_ctx) ir_assignment(
> +           new(mem_ctx) ir_dereference_variable(u),
> +           uint_rval));
> +
> +      /* uvec2 u2; */
> +      ir_variable *u2 =
> +         new(mem_ctx) ir_variable(glsl_type::uvec2_type,
> +                                  "tmp_unpack_uint_to_uvec2_u2",
> +                                  ir_var_temporary);
> +      insert_instruction(u2);
> +
> +
> +      /* u2.x = u & 0xffffu; */
> +      insert_instruction(
> +         new(mem_ctx) ir_assignment(
> +            new(mem_ctx) ir_dereference_variable(u2),
> +              new(mem_ctx) ir_expression(
> +                ir_binop_bit_and,
> +                new(mem_ctx) ir_dereference_variable(u),
> +                new(mem_ctx) ir_constant(0xffffu)),
> +            NULL /*condition*/,
> +            1 /*write mask*/));
> +
> +      /* u2.y = u >> 16u; */
> +      insert_instruction(
> +         new(mem_ctx) ir_assignment(
> +            new(mem_ctx) ir_dereference_variable(u2),
> +              new(mem_ctx) ir_expression(
> +                ir_binop_rshift,
> +                new(mem_ctx) ir_dereference_variable(u),
> +                new(mem_ctx) ir_constant(16u)),
> +            NULL /*condition*/,
> +            2 /*write mask*/));
> +
> +      return new(mem_ctx) ir_dereference_variable(u2);
> +   }
> +
> +   /* \brief Lower packSnorm2x16.
> +    *
> +    * From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
> +    *
> +    *    highp uint packSnorm2x16(vec2 v)
> +    *    --------------------------------
> +    *    First, converts each component of the normalized floating-point value
> +    *    v into 16-bit integer values. Then, the results are packed into the
> +    *    returned 32-bit unsigned integer.
> +    *
> +    *    The conversion for component c of v to fixed point is done as
> +    *    follows:
> +    *
> +    *       packSnorm2x16: round(clamp(c, -1, +1) * 32767.0)
> +    *
> +    *    The first component of the vector will be written to the least
> +    *    significant bits of the output; the last component will be written to
> +    *    the most significant bits.
> +    *
> +    * This function generates IR that approximates the following GLSL:
> +    *
> +    *     return pack_uvec2_to_uint(
> +    *         uvec2(ivec2(
> +    *           round(clamp(RVALUE, -1.0f, 1.0f) * 32767.0f))));
> +    *
> +    * It is necessary to first convert the vec2 to ivec2 rather than directly
> +    * converting vec2 to uvec2 because the latter conversion is undefined.
> +    * From page 56 (62 of pdf) of the GLSL ES 3.00 spec: "It is undefined to
> +    * convert a negative floating point value to an uint".
> +    */
> +   ir_rvalue*
> +   lower_pack_snorm_2x16(void *mem_ctx, ir_rvalue *vec2_rval)
> +   {
> +      assert(vec2_rval->type == glsl_type::vec2_type);
> +
> +      ir_rvalue *result =
> +         pack_uvec2_to_uint(
> +            mem_ctx,
> +            new(mem_ctx) ir_expression(ir_unop_i2u,
> +               new(mem_ctx) ir_expression(ir_unop_f2i,
> +                  new(mem_ctx) ir_expression(ir_unop_round_even,
> +                     new(mem_ctx) ir_expression(ir_binop_mul,
> +                       new(mem_ctx) ir_expression(ir_binop_min,
> +                          new(mem_ctx) ir_expression(ir_binop_max,
> +                            vec2_rval,
> +                            new(mem_ctx) ir_constant(-1.0f)),
> +                          new(mem_ctx) ir_constant(1.0f)),
> +                       new(mem_ctx) ir_constant(32767.0f))))));
> +
> +      assert(result->type == glsl_type::uint_type);
> +      return result;
> +   }
> +
> +   /* \brief Lower unpackSnorm2x16.
> +    *
> +    * From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
> +    *
> +    *    highp vec2 unpackSnorm2x16 (highp uint p)
> +    *    -----------------------------------------
> +    *    First, unpacks a single 32-bit unsigned integer p into a pair of
> +    *    16-bit unsigned integers. Then, each component is converted to
> +    *    a normalized floating-point value to generate the returned
> +    *    two-component vector.
> +    *
> +    *    The conversion for unpacked fixed-point value f to floating point is
> +    *    done as follows:
> +    *
> +    *       unpackSnorm2x16: clamp(f / 32767.0, -1,+1)
> +    *
> +    *    The first component of the returned vector will be extracted from the
> +    *    least significant bits of the input; the last component will be
> +    *    extracted from the most significant bits.
> +    *
> +    * This function generates IR that approximates the following GLSL:
> +    *
> +    *    return clamp(
> +    *       ((ivec2(unpack_uint_to_uvec2(RVALUE)) << 16) >> 16) / 32767.0f,
> +    *       -1.0f, 1.0f);
> +    *
> +    * The above IR may appear unnecessarily complex, but the intermediate
> +    * conversion to ivec2 and the bit shifts are necessary to correctly unpack
> +    * negative floats.
> +    *
> +    * To see why, consider packing and then unpacking vec2(-1.0, 0.0).
> +    * packSnorm2x16 encodes -1.0 as the int16 0x8001 (that is, -32767).
> +    * During unpacking, we place that int16 into an int32, which results in
> +    * the *positive* integer 0x00008001.  The int16's sign bit becomes, in
> +    * the int32, the rather unimportant bit 15. We must now extend the
> +    * int16's sign bit into bits 16-31, which is accomplished by
> +    * left-shifting then right-shifting.
> +    */
> +   ir_rvalue*
> +   lower_unpack_snorm_2x16(void *mem_ctx, ir_rvalue *uint_rval)
> +   {
> +      assert(uint_rval->type == glsl_type::uint_type);
> +
> +      ir_rvalue *result =
> +         new(mem_ctx) ir_expression(ir_binop_min,
> +            new(mem_ctx) ir_expression(ir_binop_max,
> +               new(mem_ctx) ir_expression(ir_binop_div,
> +                  new(mem_ctx) ir_expression(ir_unop_i2f,
> +                     new(mem_ctx) ir_expression(ir_binop_rshift,
> +                        new(mem_ctx) ir_expression(ir_binop_lshift,
> +                           new(mem_ctx) ir_expression(ir_unop_u2i,
> +                             unpack_uint_to_uvec2(mem_ctx, uint_rval)),
> +                           new(mem_ctx) ir_constant(16)),
> +                        new(mem_ctx) ir_constant(16))),
> +              new(mem_ctx) ir_constant(32767.0f)),
> +            new(mem_ctx) ir_constant(-1.0f)),
> +          new(mem_ctx) ir_constant(1.0f));
> +
> +      assert(result->type == glsl_type::vec2_type);
> +      return result;
> +   }
> +
> +   /* \brief Lower packUnorm2x16.
> +    *
> +    * From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
> +    *
> +    *    highp uint packUnorm2x16 (vec2 v)
> +    *    ---------------------------------
> +    *    First, converts each component of the normalized floating-point value
> +    *    v into 16-bit integer values. Then, the results are packed into the
> +    *    returned 32-bit unsigned integer.
> +    *
> +    *    The conversion for component c of v to fixed point is done as
> +    *    follows:
> +    *
> +    *       packUnorm2x16: round(clamp(c, 0, +1) * 65535.0)
> +    *
> +    *    The first component of the vector will be written to the least
> +    *    significant bits of the output; the last component will be written to
> +    *    the most significant bits.
> +    *
> +    * This function generates IR that approximates the following GLSL:
> +    *
> +    *     return pack_uvec2_to_uint(uvec2(
> +    *                round(clamp(RVALUE, 0.0f, 1.0f) * 65535.0f)));
> +    *
> +    * Here it is safe to directly convert the vec2 to uvec2 because the
> +    * vec2 has been clamped to a non-negative range.
> +    */
> +   ir_rvalue*
> +   lower_pack_unorm_2x16(void *mem_ctx, ir_rvalue *vec2_rval)
> +   {
> +      assert(vec2_rval->type == glsl_type::vec2_type);
> +
> +      ir_rvalue *result =
> +         pack_uvec2_to_uint(
> +            mem_ctx,
> +            new(mem_ctx) ir_expression(ir_unop_f2u,
> +               new(mem_ctx) ir_expression(ir_unop_round_even,
> +                  new(mem_ctx) ir_expression(ir_binop_mul,
> +                    new(mem_ctx) ir_expression(ir_binop_min,
> +                       new(mem_ctx) ir_expression(ir_binop_max,
> +                         vec2_rval,
> +                         new (mem_ctx) ir_constant(0.0f)),
> +                       new(mem_ctx) ir_constant(1.0f)),
> +                    new(mem_ctx) ir_constant(65535.0f)))));
> +
> +      assert(result->type == glsl_type::uint_type);
> +      return result;
> +   }
> +
> +   /* \brief Lower unpackUnorm2x16.
> +    *
> +    * From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
> +    *
> +    *    highp vec2 unpackUnorm2x16 (highp uint p)
> +    *    -----------------------------------------
> +    *    First, unpacks a single 32-bit unsigned integer p into a pair of
> +    *    16-bit unsigned integers. Then, each component is converted to
> +    *    a normalized floating-point value to generate the returned
> +    *    two-component vector.
> +    *
> +    *    The conversion for unpacked fixed-point value f to floating point is
> +    *    done as follows:
> +    *
> +    *       unpackUnorm2x16: f / 65535.0
> +    *
> +    *    The first component of the returned vector will be extracted from the
> +    *    least significant bits of the input; the last component will be
> +    *    extracted from the most significant bits.
> +    *
> +    * This function generates IR that approximates the following GLSL:
> +    *
> +    *     return vec2(unpack_uint_to_uvec2(RVALUE)) / 65535.0;
> +    */
> +   ir_rvalue*
> +   lower_unpack_unorm_2x16(void *mem_ctx, ir_rvalue *uint_rval)
> +   {
> +      assert(uint_rval->type == glsl_type::uint_type);
> +
> +      ir_rvalue *result =
> +         new(mem_ctx) ir_expression(ir_binop_div,
> +           new(mem_ctx) ir_expression(ir_unop_u2f,
> +              unpack_uint_to_uvec2(mem_ctx, uint_rval)),
> +           new(mem_ctx) ir_constant(65535.0f));
> +
> +      assert(result->type == glsl_type::vec2_type);
> +      return result;
> +   }
> +
> +   /* \brief Lower the component-wise calculation of packHalf2x16.
> +    *
> +    * Given the non-shifted exponent and mantissa bits of a 32-bit float,
> +    * this function calculates the corresponding exponent and mantissa bits
> +    * of the corresponding 16-bit float by constructing IR that approximates
> +    * the body of the following function:
> +    *
> +    *    uint
> +    *    pack_half_1x16_nosign(uint float32_exponent_bits,
> +    *                          uint float32_mantissa_bits)
> +    *    {
> +    *       uint float16;
> +    *
> +    *       float16 = float16_exponent_bits(float32_exponent_bits,
> +    *                                       float32_mantissa_bits);
> +    *       float16 |= float16_mantissa_bits(float32_exponent_bits,
> +    *                                        float32_mantissa_bits);
> +    *
> +    *       return float16;
> +    *    }
> +    *
> +    * What follows is a detailed discussion of the calculation.
> +    *
> +    * For a float16, the bit layout is:
> +    *
> +    *   sign:     15
> +    *   exponent: 10:14
> +    *   mantissa: 0:9
> +    *
> +    * The sign, exponent, and mantissa of a float16 determine its value thus:
> +    *
> +    *   if e = 0 and m = 0, then zero:       (-1)^s * 0                           (1)
> +    *   if e = 0 and m != 0, then subnormal: (-1)^s * 2^(e - 14) * (m / 2^10)     (2)
> +    *   if 0 < e < 31, then normal:          (-1)^s * 2^(e - 15) * (1 + m / 2^10) (3)
> +    *   if e = 31 and m = 0, then infinite:  (-1)^s * inf                         (4)
> +    *   if e = 31 and m != 0, then NaN                                            (5)
> +    *
> +    * where 0 <= m < 2^10. The minimum and maximum normal float16 values are
> +    *
> +    *   min_norm16 = 2^(1 - 15) * (1 + 0 / 2^10) = 2^(-14)                        (6)
> +    *   max_norm16 = 2^(30 - 15) * (1 + 1023 / 2^10)                              (7)
> +    *
> +    * For a float32, the bit layout is:
> +    *
> +    *   sign: 31
> +    *   exponent: 23:30
> +    *   mantissa: 0:22
> +    *
> +    * The sign, exponent, and mantissa of a float32 determine its value thus:
> +    *
> +    *   if e = 0 and m = 0, then zero:        (-1)^s * 0                             (8)
> +    *   if e = 0 and m != 0, then subnormal:  (-1)^s * 2^(e - 126) * (m / 2^23)      (9)
> +    *   if 0 < e < 255, then normal:          (-1)^s * 2^(e - 127) * (1 + m / 2^23) (10)
> +    *   if e = 255 and m = 0, then infinite:  (-1)^s * inf                          (11)
> +    *   if e = 255 and m != 0, then NaN                                             (12)
> +    *
> +    * where 0 <= m < 2^23.
> +    *
> +    * Let f32 be a float32. Let s32, e32, and m32 be the value of s, e, and
> +    * m in equations 8-12. Let f16 be the float16 which is the conversion of
> +    * f32. Likewise, let s16, e16, and m16 be the value of s, e, and m in
> +    * equations 1-5. Since this function ignores the sign bit, let
> +    * s32 = s16 = 0.
> +    *
> +    * To calculate e16 and m16, let's consider separately each of the classes
> +    * (equations 1-5) to which the resultant f16 may belong.
> +    *
> +    * # Case 1: f16 is zero
> +    *
> +    *     This case occurs when f32 is zero or when f32 < min_norm16.
> +    *     Calculating the bits in this case is trivial.
> +    *
> +    *       e16 = 0
> +    *       m16 = 0
> +    *
> +    * # Case 2: f16 is subnormal
> +    *
> +    *     Below in case 3, we choose to round toward zero any float32 that
> +    *     cannot be exactly represented as a float16. We also follow that
> +    *     convention here in case 2. As a consequence, we convert f32 to
> +    *     a subnormal float16 if and only if
> +    *
> +    *       f32 < min_norm16                                 (13)
> +    *
> +    *     (It would be valid to choose a different rounding method when
> +    *     max_subnorm16 < f32 < min_norm16, but that would introduce
> +    *     additional complexity into calculations).
> +    *
> +    *     Calculating the boundary values of e32 and m32 for which
> +    *     f32 < min_norm16 gives
> +    *
> +    *                                    f32 < min_norm16    (14)
> +    *       2^(e32 - 127) * (1 + m32 / 2^23) < 2^(-14)       (15)
> +    *                          2^(e32 - 127) < 2^(-14)       (16)
> +    *                                    e32 < 113           (17)
> +    *
> +    *     We must now calculate e16 and m16. However, to do so for subnormal
> +    *     float16 values requires iterating over the bits of f32's mantissa in
> +    *     order to find the position of the most significant enabled bit.
> +    *     Considering that packHalf2x16, on supported hardware where it need
> +    *     not be lowered, generates only a small number of instructions, to
> +    *     generate a loop when lowering packHalf2x16 might produce unexpected,
> +    *     and unacceptable, overhead during shader execution.
> +    *
> +    *     Luckily, the GLSL ES 3.00 and GLSL 4.10 specs allow us to truncate
> +    *     subnormal floats to zero. From section 4.5.1 "Range and Precision"
> +    *     of the two specs:
> +    *
> +    *       Any subnormal (denormalized) value input into a shader or
> +    *       potentially generated by any operation in a shader can be flushed
> +    *       to 0.
> +    *
> +    *     Therefore we choose
> +    *
> +    *       e16 = 0
> +    *       m16 = 0
> +    *
> +    * # Case 3: f16 is normal
> +    *
> +    *     We choose to convert f32 to a normal float16 if and only if
> +    *
> +    *       min_normal16 <= f32 <= max_normal16                           (18)
> +    *
> +    *     From similar calculations as those in equations 14-17, the left-hand
> +    *     inequality resolves to
> +    *
> +    *       113 <= e32                                                    (19)
> +    *
> +    *     Calculating the boundary values of e32 and m32 for the right-hand
> +    *     inequality gives
> +    *
> +    *       f32 <= max_normal16                                                 (20)
> +    *       2^(e32 - 127) * (1 + m32 / 2^23) <= 2^15 * (1 + 1023 / 2^10)        (21)
> +    *
> +    *       2^(e32 - 127) < 2^15 or                                             (22)
> +    *          (2^(e32 - 127) = 2^15 and 1 + m32 / 2^23 <= 1 + 1023 / 2^10)
> +    *
> +    *       e32 < 142 or (e32 = 142 and m32 <= 2^13 * 1023)                     (23)
> +    *
> +    *     Combining equations 19 and 23 gives
> +    *
> +    *       113 <= e32 < 142 or (e32 = 142 and m32 <= 2^13 * 1023)              (24)
> +    *
> +    *     Now that we have determined the float32 range that converts to
> +    *     a normal float16, we must calculate e16 and m16.
> +    *
> +    *       2^(e32 - 127) * (1 + m32 / 2^23) = 2^(e16 - 15) * (1 + m16 / 2^10)  (25)
> +    *
> +    *     Solving equation 25 for e16 and m16 gives
> +    *
> +    *       e16 = e32 - 112                                               (26)
> +    *       m16 = m32 / 2^13                                              (27)
> +    *
> +    *     However, not all float32 values can be exactly represented as
> +    *     a float16. As a consequence, m16 in equation 27 has the potential to
> +    *     have a fractional part (consider m32 = 1). We choose to round all
> +    *     such float32 values downward to the nearest float16. To accomplish
> +    *     this, ignore all but the most 10 significant bits of m32:
> +    *
> +    *       m16 = (m32 % 2^13) / 2^13                                     (28)
> +    *
> +    * # Case 4: f16 is infinite
> +    *
> +    *     The resultant f16 is infinite if and only if f32 exceeds the bounds
> +    *     in equation 23. That is, if
> +    *
> +    *       (e32 = 142 and m32 > 2^13 * 1023) or (e32 > 142)              (29)
> +    *
> +    *     In this case, the choice for e16 and m16 are trivial:
> +    *
> +    *       e16 = 31
> +    *       m16 = 0
> +    *
> +    * # Case 5: f16 is NaN
> +    *
> +    *     The resulting f16 is NaN if and only if f32 is NaN, which occurs when
> +    *
> +    *       e32 = 255
> +    *       m32 != 0
> +    *
> +    *     Any choice of m16 != 0 suffices. We choose m16 where all bits are set:
> +    *
> +    *       e16 = 31
> +    *       m16 = 0x03ff
> +    *
> +    * We can therefore calculate f32 with the following C:
> +    *
> +    *   uint32_t f32 = GIVEN;
> +    *   uint16_t f16;
> +    *
> +    *   uint32_t e32 = (f32 >> 23) & 0xff;
> +    *   uint32_t m32 = f32 & 0x007fffff;
> +    *
> +    *   uint16_t e16;
> +    *   uint16_t m16;
> +    *
> +    *   if (e32 == 255 && m32 != 0) {
> +    *      // f32 is NaN
> +    *      e16 = 31;
> +    *      m16 = 0x03ff;
> +    *   } else if (e32 < 113) {
> +    *      // f32 converts to a zero or subnormal float16.
> +    *      // flush subnormal float16's to zero.
> +    *      e16 = 0;
> +    *      m16 = 0;
> +    *   } else if (e32 < 142 || (e32 == 142 && m32 <= (1023 << 13))) {
> +    *      // f32 converts to a normal float16
> +    *      e16 = e32 - 112;
> +    *      m16 = m32 >> 13;
> +    *   } else {
> +    *      // f32 lies outside of the range of finite float16 values
> +    *      e16 = 31;
> +    *      m16 = 0;
> +    *   }
> +    *
> +    * However, instruction execution is expensive on the GPU, and this
> +    * compiler's quality of optimization is less than ideal. Below is
> +    * a C implementation that should execute a near-minimal number of
> +    * instructions.
> +    *
> +    *   uint32_t f32 = GIVEN;
> +    *   uint16_t f16;
> +    *
> +    *   // Get f32's unshifted exponent and mantissa bits.
> +    *   uint32_t e = f32 & 0x7f800000;
> +    *   uint32_t m = f32 & 0x007fffff;
> +    *
> +    *   if (e == (255 << 23) && m != 0) {
> +    *     // f32 is NaN
> +    *     f16 = 0x7fff;
> +    *   } else if (e < (113 << 23)) {
> +    *     // f32 converts to a zero or subnormal float16.
> +    *     // flush subnormal float16's to zero.
> +    *     f16 = 0;
> +    *   } else if (e < (142 << 23) || (e == (142 << 23) && m <= (1023 << 13))) {
> +    *     // f32 converts to a normal float16
> +    *     f16 = (m >> 13);
> +    *     f16 |= (e - (112 << 23)) >> 13;
> +    *   } else {
> +    *     // f32 lies outside of the range of finite float16 values
> +    *     f16 = (31 << 10);
> +    *   }
> +    *
> +    * The IR constructed by this function is a translation of the above
> +    * optimized C, where e and m are provided by the caller.
> +    */
> +   ir_rvalue*
> +   pack_half_1x16_nosign(void *mem_ctx,
> +                         ir_rvalue *e_rval,
> +                         ir_rvalue *m_rval)
> +   {
> +       assert(e_rval->type == glsl_type::uint_type);
> +       assert(m_rval->type == glsl_type::uint_type);
> +
> +       /* uint f16; */
> +       ir_variable *f16 =
> +          new(mem_ctx) ir_variable(glsl_type::uint_type,
> +                                   "tmp_pack_half_1x16_f16",
> +                                   ir_var_temporary);
> +       insert_instruction(f16);
> +
> +       /* uint e = E_RVAL; */
> +       ir_variable *e =
> +          new(mem_ctx) ir_variable(glsl_type::uint_type,
> +                                   "tmp_pack_half_1x16_e",
> +                                   ir_var_temporary);
> +       insert_instruction(e);
> +       insert_instruction(
> +          new(mem_ctx) ir_assignment(
> +             new(mem_ctx) ir_dereference_variable(e),
> +             e_rval));
> +
> +       /* uint m = M_RVAL; */
> +       ir_variable *m =
> +          new(mem_ctx) ir_variable(glsl_type::uint_type,
> +                                   "tmp_pack_half_1x16_m",
> +                                   ir_var_temporary);
> +       insert_instruction(m);
> +       insert_instruction(
> +          new(mem_ctx) ir_assignment(
> +             new(mem_ctx) ir_dereference_variable(m),
> +             m_rval));
> +
> +       /*
> +        * if (e == (255 << 23) && m != 0) {
> +        *   // f32 is NaN
> +        *   f16 = 0x7fff;
> +        * }
> +        */
> +       ir_if *if_nan =
> +          new(mem_ctx) ir_if(
> +             new(mem_ctx) ir_expression(ir_binop_logic_and,
> +                new(mem_ctx) ir_expression(ir_binop_equal,
> +                   new(mem_ctx) ir_dereference_variable(e),
> +                   new(mem_ctx) ir_constant(255u << 23u)),
> +                new(mem_ctx) ir_expression(ir_unop_logic_not,
> +                   new(mem_ctx) ir_expression(ir_binop_equal,
> +                      new(mem_ctx) ir_dereference_variable(m),
> +                      new(mem_ctx) ir_constant(0u)))));
> +       insert_instruction(if_nan);
> +       if_nan->then_instructions.push_tail(
> +          new(mem_ctx) ir_assignment(
> +             new(mem_ctx) ir_dereference_variable(f16),
> +             new(mem_ctx) ir_constant(0x7fffu)));
> +
> +       /*
> +        * else if (e < (113 << 23)) {
> +        *   // f32 converts to a zero or subnormal float16.
> +        *   // flush subnormal float16's to zero.
> +        *   f16 = 0;
> +        * }
> +        */
> +       ir_if *if_zero =
> +          new(mem_ctx) ir_if(
> +             new(mem_ctx) ir_expression(ir_binop_less,
> +                   new(mem_ctx) ir_dereference_variable(e),
> +                   new(mem_ctx) ir_constant(113u << 23u)));
> +       if_nan->else_instructions.push_tail(if_zero);
> +       if_zero->then_instructions.push_tail(
> +          new(mem_ctx) ir_assignment(
> +             new(mem_ctx) ir_dereference_variable(f16),
> +                new(mem_ctx) ir_constant(0u)));
> +
> +       /*
> +        * else if (e < (142 << 23) || (e == (142 << 23) && m <= (1023 << 13))) {
> +        *   // f32 converts to a normal float16
> +        *   f16 = (m >> 13);
> +        *   f16 |= (e - (112 << 23)) >> 13;
> +        * }
> +        */
> +       ir_if *if_normal =
> +          new(mem_ctx) ir_if(
> +             new(mem_ctx) ir_expression(ir_binop_logic_or,
> +                new(mem_ctx) ir_expression(ir_binop_less,
> +                   new(mem_ctx) ir_dereference_variable(e),
> +                   new(mem_ctx) ir_constant(142u << 23u)),
> +                new(mem_ctx) ir_expression(ir_binop_logic_and,
> +                   new(mem_ctx) ir_expression(ir_binop_equal,
> +                      new(mem_ctx) ir_dereference_variable(e),
> +                      new(mem_ctx) ir_constant(142u << 23u)),
> +                   new(mem_ctx) ir_expression(ir_binop_lequal,
> +                      new(mem_ctx) ir_dereference_variable(m),
> +                      new(mem_ctx) ir_constant(1023u << 13u)))));
> +       if_zero->else_instructions.push_tail(if_normal);
> +       if_normal->then_instructions.push_tail(
> +          new(mem_ctx) ir_assignment(
> +             new(mem_ctx) ir_dereference_variable(f16),
> +             new(mem_ctx) ir_expression(ir_binop_rshift,
> +                new(mem_ctx) ir_dereference_variable(m),
> +                new(mem_ctx) ir_constant(13u))));
> +       if_normal->then_instructions.push_tail(
> +          new(mem_ctx) ir_assignment(
> +             new(mem_ctx) ir_dereference_variable(f16),
> +             new(mem_ctx) ir_expression(ir_binop_bit_or,
> +                new(mem_ctx) ir_dereference_variable(f16),
> +                new(mem_ctx) ir_expression(ir_binop_rshift,
> +                   new(mem_ctx) ir_expression(ir_binop_sub,
> +                      new(mem_ctx) ir_dereference_variable(e),
> +                      new(mem_ctx) ir_constant(112u << 23u)),
> +                   new(mem_ctx) ir_constant(13u)))));
> +
> +       /*
> +        * else {
> +        *   // f32 lies outside of the range of finite float16 values
> +        *   f16 = (31 << 10);
> +        * }
> +        */
> +       if_normal->else_instructions.push_tail(
> +          new(mem_ctx) ir_assignment(
> +             new(mem_ctx) ir_dereference_variable(f16),
> +             new(mem_ctx) ir_constant(31u << 10u)));
> +
> +       /* return f16; */
> +       return new(mem_ctx) ir_dereference_variable(f16);
> +   }
> +
> +   /** \brief Lower packHalf2x16.
> +    *
> +    * From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
> +    *
> +    *    highp uint packHalf2x16 (mediump vec2 v)
> +    *    ----------------------------------------
> +    *    Returns an unsigned integer obtained by converting the components of
> +    *    a two-component floating-point vector to the 16-bit floating-point
> +    *    representation found in the OpenGL ES Specification, and then packing
> +    *    these two 16-bit integers into a 32-bit unsigned integer.
> +    *
> +    *    The first vector component specifies the 16 least-significant bits
> +    *    of the result; the second component specifies the 16 most-significant
> +    *    bits.
> +    *
> +    * This function constructs IR that approximates the following GLSL:
> +    *
> +    *   uvec2 f32 = bitcast_f2u(VEC2_RVALUE);
> +    *   uvec2 f16;
> +    *
> +    *   // Get f32's unshifted exponent and mantissa bits.
> +    *   uvec2 e = f32 & 0x7f800000u;
> +    *   uvec2 m = f32 & 0x007fffffu;
> +    *
> +    *   // Set f16's exponent and mantissa bits.
> +    *   f16.x = pack_half_1x16_nosign(e.x, m.x);
> +    *   f16.y = pack_half_1x16_nosign(e.y, m.y);
> +    *
> +    *   // Set f16's sign bits.
> +    *   f16 |= (f32 & (1u << 31u)) >> 16u;
> +    *
> +    *   return (f16.y << 16u) | f16.x;
> +    */
> +   ir_rvalue*
> +   pack_half_2x16(void *mem_ctx, ir_rvalue *vec2_rval)
> +   {
> +      assert(vec2_rval->type == glsl_type::vec2_type);
> +
> +      /* uvec2 f32 = bitcast_f2u(RVALUE); */
> +      ir_variable *f32 =
> +         new(mem_ctx) ir_variable(glsl_type::uvec2_type,
> +                                  "tmp_pack_half_2x16_f32",
> +                                  ir_var_temporary);
> +      insert_instruction(f32);
> +      insert_instruction(
> +         new(mem_ctx) ir_assignment(
> +            new(mem_ctx) ir_dereference_variable(f32),
> +            new(mem_ctx) ir_expression(ir_unop_bitcast_f2u,
> +               vec2_rval)));
> +
> +      /* uvec2 f16; */
> +      ir_variable *f16 =
> +         new(mem_ctx) ir_variable(glsl_type::uvec2_type,
> +                                  "tmp_pack_half_2x16_f16",
> +                                  ir_var_temporary);
> +      insert_instruction(f16);
> +
> +      /* Get f32's unshifted exponent bits.
> +       *
> +       *   uvec2 e = f32 & 0x7f800000u;
> +       */
> +      ir_variable *e =
> +         new(mem_ctx) ir_variable(glsl_type::uvec2_type,
> +                                  "tmp_pack_half_2x16_e",
> +                                  ir_var_temporary);
> +      insert_instruction(e);
> +      insert_instruction(
> +         new(mem_ctx) ir_assignment(
> +            new(mem_ctx) ir_dereference_variable(e),
> +            new(mem_ctx) ir_expression(ir_binop_bit_and,
> +               new(mem_ctx) ir_dereference_variable(f32),
> +               new (mem_ctx) ir_constant(0x7f800000u))));
> +
> +      /* Get f32's unshifted mantissa bits.
> +       *
> +       *   uvec2 m = f32 & 0x007fffffu;
> +       */
> +      ir_variable *m =
> +         new(mem_ctx) ir_variable(glsl_type::uvec2_type,
> +                                  "tmp_pack_half_2x16_m",
> +                                  ir_var_temporary);
> +      insert_instruction(m);
> +      insert_instruction(
> +         new(mem_ctx) ir_assignment(
> +            new(mem_ctx) ir_dereference_variable(m),
> +            new(mem_ctx) ir_expression(ir_binop_bit_and,
> +               new(mem_ctx) ir_dereference_variable(f32),
> +               new (mem_ctx) ir_constant(0x007fffffu))));
> +
> +      /* Set f16's exponent and mantissa bits.
> +       *
> +       *   f16.x = pack_half_1x16_nosign(e.x, m.x);
> +       *   f16.y = pack_half_1x16_nosign(e.y, m.y);
> +       */
> +      insert_instruction(
> +         new(mem_ctx) ir_assignment(
> +            new(mem_ctx) ir_dereference_variable(f16),
> +            pack_half_1x16_nosign(mem_ctx,
> +               new(mem_ctx) ir_swizzle(
> +                  new(mem_ctx) ir_dereference_variable(e),
> +                  0, 0, 0, 0, 1),
> +               new(mem_ctx) ir_swizzle(
> +                  new(mem_ctx) ir_dereference_variable(m),
> +                  0, 0, 0, 0, 1)),
> +            NULL /*condition*/,
> +            1 /*write mask*/));
> +      insert_instruction(
> +         new(mem_ctx) ir_assignment(
> +            new(mem_ctx) ir_dereference_variable(f16),
> +            pack_half_1x16_nosign(mem_ctx,
> +               new(mem_ctx) ir_swizzle(
> +                  new(mem_ctx) ir_dereference_variable(e),
> +                  1, 0, 0, 0, 1),
> +               new(mem_ctx) ir_swizzle(
> +                  new(mem_ctx) ir_dereference_variable(m),
> +                  1, 0, 0, 0, 1)),
> +            NULL /*condition*/,
> +            2 /*write mask*/));
> +
> +      /* Set f16's sign bits.
> +       *
> +       *   f16 |= (f32 & (1u << 31u)) >> 16u;
> +       */
> +      insert_instruction(
> +         new(mem_ctx) ir_assignment(
> +            new(mem_ctx) ir_dereference_variable(f16),
> +            new(mem_ctx) ir_expression(ir_binop_bit_or,
> +               glsl_type::uvec2_type,
> +               new(mem_ctx) ir_dereference_variable(f16),
> +               new(mem_ctx) ir_expression(ir_binop_rshift,
> +                  new(mem_ctx) ir_expression(ir_binop_bit_and,
> +                     new(mem_ctx) ir_dereference_variable(f32),
> +                     new(mem_ctx) ir_constant(1u << 31u)),
> +                  new(mem_ctx) ir_constant(16u)))));
> +
> +
> +      /* return (f16.y << 16u) | f16.x; */
> +      ir_rvalue *result =
> +         new(mem_ctx) ir_expression(ir_binop_bit_or,
> +            new(mem_ctx) ir_expression(ir_binop_lshift,
> +               new(mem_ctx) ir_swizzle(
> +                  new(mem_ctx) ir_dereference_variable(f16),
> +                  1, 0, 0, 0, 1),
> +               new(mem_ctx) ir_constant(16u)),
> +            new(mem_ctx) ir_swizzle(
> +               new(mem_ctx) ir_dereference_variable(f16),
> +               0, 0, 0, 0, 1));
> +
> +      assert(result->type == glsl_type::uint_type);
> +      return result;
> +   }
> +
> +   /**
> +    * \brief Split packHalf2x16's vec2 operand into two floats.
> +    *
> +    * Some code generators, such as the i965 fragment shader, require that all
> +    * vector expressions be lowered to a sequence of scalar expressions.
> +    * However, packHalf2x16 cannot be scalarized by the same method as a true
> +    * vector operation because the number of components of its input and
> +    * output differ.
> +    *
> +    * This method scalarizes packHalf2x16 by transforming it from a unary
> +    * operation having vector input to a binary operation having scalar input.
> +    * That is, it transforms
> +    *
> +    *    packHalf2x16(VEC2_RVAL);
> +    *
> +    * into
> +    *
> +    *    vec2 v = VEC2_RVAL;
> +    *    return packHalf2x16_split(v.x, v.y);
> +    */
> +   ir_rvalue*
> +   split_pack_half_2x16(void *mem_ctx, ir_rvalue *vec2_rval)
> +   {
> +      assert(vec2_rval->type == glsl_type::vec2_type);
> +
> +      /* vec2 v = vec2_rval; */
> +      ir_variable *v =
> +         new(mem_ctx) ir_variable(glsl_type::vec2_type,
> +                                  "tmp_split_pack_half_2x16_v",
> +                                  ir_var_temporary);
> +      insert_instruction(v);
> +      insert_instruction(
> +         new(mem_ctx) ir_assignment(
> +            new(mem_ctx) ir_dereference_variable(v),
> +            vec2_rval));
> +
> +      /* return packHalf2x16_split(v.x, v.y); */
> +      return
> +         new(mem_ctx) ir_expression(ir_binop_pack_half_2x16_split,
> +            glsl_type::uint_type,
> +            new(mem_ctx) ir_swizzle(
> +               new(mem_ctx) ir_dereference_variable(v),
> +               0, 0, 0, 0, 1),
> +            new(mem_ctx) ir_swizzle(
> +               new(mem_ctx) ir_dereference_variable(v),
> +               1, 0, 0, 0, 1));
> +   }
> +
> +   /**
> +    * \brief Lower the component-wise calculation of unpackHalf2x16.
> +    *
> +    * Given the non-shifted exponent and mantissa bits of a 16-bit float, this
> +    * function calculates the corresponding exponent and mantissa bits of the
> +    * equivalent 32-bit float by constructing IR that approximates the body
> +    * of the following function:
> +    *
> +    *    uint
> +    *    unpack_half_1x16_nosign(uint float16_exponent_bits,
> +    *                            uint float16_mantissa_bits)
> +    *    {
> +    *       uint float32;
> +    *
> +    *       float32 = float32_exponent_bits(float16_exponent_bits,
> +    *                                        float16_mantissa_bits);
> +    *       float32 |= float32_mantissa_bits(float16_exponent_bits,
> +    *                                        float16_mantissa_bits);
> +    *
> +    *       return float32;
> +    *    }
> +    *
> +    * What follows is a detailed discussion of the calculation.
> +    *
> +    * For a float16, the bit layout is:
> +    *
> +    *   sign:     15
> +    *   exponent: 10:14
> +    *   mantissa: 0:9
> +    *
> +    * The sign, exponent, and mantissa of a float16 determine its value thus:
> +    *
> +    *   if e = 0 and m = 0, then zero:       (-1)^s * 0                           (1)
> +    *   if e = 0 and m != 0, then subnormal: (-1)^s * 2^(e - 14) * (m / 2^10)     (2)
> +    *   if 0 < e < 31, then normal:          (-1)^s * 2^(e - 15) * (1 + m / 2^10) (3)
> +    *   if e = 31 and m = 0, then infinite:  (-1)^s * inf                         (4)
> +    *   if e = 31 and m != 0, then NaN                                            (5)
> +    *
> +    * where 0 <= m < 2^10 .
> +    *
> +    * For a float32, the bit layout is:
> +    *
> +    *   sign:     31
> +    *   exponent: 23:30
> +    *   mantissa: 0:22
> +    *
> +    * The sign, exponent, and mantissa of a float32 determine its value thus:
> +    *
> +    *   if e = 0 and m = 0, then zero:        (-1)^s * 0                             (6)
> +    *   if e = 0 and m != 0, then subnormal:  (-1)^s * 2^(e - 126) * (m / 2^23)      (7)
> +    *   if 0 < e < 255, then normal:          (-1)^s * 2^(e - 127) * (1 + m / 2^23)  (8)
> +    *   if e = 255 and m = 0, then infinite:  (-1)^s * inf                           (9)
> +    *   if e = 255 and m != 0, then NaN                                             (10)
> +    *
> +    * where 0 <= m < 2^23 .
> +    *
> +    * Let f16 and f32 be a float16 and float32 with equal value. Let s16, e16,
> +    * and m16 be the value of s, e, and m in equations 1-5. Likewise, let s32,
> +    * e32, and m32 be those in equations 6-10.  Since this function ignores
> +    * the sign bit, let s16 = s32 = 0.
> +    *
> +    * To calculate e32 and m32, let's consider separately each of the classes
> +    * (equations 1-5) to which f16 may belong.
> +    *
> +    * # Case 1: f16 is zero
> +    *
> +    *     Calculating the bits in this case is trivial.
> +    *
> +    *       e32 := 0
> +    *       m32 := 0
> +    *
> +    * # Case 2: f16 is subnormal
> +    *
> +    *     To calculate the e32 and m32 in this case requires iterating
> +    *     over the bits of the float16's mantissa in order to find the
> +    *     position of the most significant enabled bit. Considering that
> +    *     unpackHalf2x16, on supported hardware where it need not be lowered,
> +    *     generates only a small number of instructions, it would result in
> +    *     unexpected overhead during shader execution if we generated a loop
> +    *     when lowering unpackHalf2x16.
> +    *
> +    *     Luckily, the GLSL ES 3.00 and GLSL 4.10 specs allow us to truncate
> +    *     subnormal floats to zero. From section 4.5.1 "Range and Precision"
> +    *     of the two specs:
> +    *
> +    *       Any subnormal (denormalized) value input into a shader or
> +    *       potentially generated by any operation in a shader can be flushed
> +    *       to 0.
> +    *
> +    *     Therefore we choose
> +    *
> +    *       e32 = 0
> +    *       m32 = 0
> +    *
> +    * # Case 3: f16 is normal
> +    *
> +    *     The normal range of float16 is a subset of the normal range of
> +    *     float32. Therefore we calculate e32 as
> +    *
> +    *       2^(e32 - 127) = 2^(e16 - 15)
> +    *                 e32 = e16 + 112
> +    *
> +    *     as m32 as
> +    *
> +    *       1 + m32 / 2^23 = 1 + m16 / 2^10
> +    *                  m32 = 2^13 * m16
> +    *
> +    * # Case 4: f16 is infinite
> +    *
> +    *    Calculating the bits in this case is trivial.
> +    *
> +    *      e32 = 255
> +    *      m32 = 0
> +    *
> +    * # Case 5: f16 is NaN
> +    *
> +    *    Any choice of m32 != 0 suffices. We choose m32 where all bits are
> +    *    set:
> +    *
> +    *       e32 = 255
> +    *       m32 = 0x7fffff (23 bits)
> +    *
> +    * We can therefore calculate f32 with the following C:
> +    *
> +    *   uint16_t f16 = GIVEN;
> +    *   uint32_t f32;
> +    *
> +    *   uint32_t e16 = (f16 >> 10) & 0x1f;
> +    *   uint32_t m16 = f16 & 0x03ff;
> +    *
> +    *   uint32_t e32;
> +    *   uint32_t m32;
> +    *
> +    *   if (e16 == 0) {
> +    *     // f16 is zero or subnormal.
> +    *     // flush subnormal float16's to zero.
> +    *     e32 = 0;
> +    *     m32 = 0;
> +    *   } else if (e16 < 31) {
> +    *     // f16 is normal
> +    *     e32 = e16 + 112;
> +    *     m32 = m16 << 13;
> +    *   } else if (e16 == 31 && m16 == 0) {
> +    *     // f16 is infinite
> +    *     e32 = 255;
> +    *     m32 = 0;
> +    *   } else {
> +    *     // f16 is NaN
> +    *     e32 = 255;
> +    *     m32 = 0x007fffff;
> +    *   }
> +    *
> +    *   f32 = (e32 << 23) | m32;
> +    *   return f32;
> +    *
> +    * However, instruction execution is expensive on the GPU, and this
> +    * compiler's quality of optimization is less than ideal. Below is
> +    * a C implementation that should execute a near-minimal number of
> +    * instructions.
> +    *
> +    *   uint16_t f16 = GIVEN;
> +    *   uint32_t f32;
> +    *
> +    *   // Get f16's unshifted exponent and mantissa bits.
> +    *   uint32_t e = f16 & 0x7c00;
> +    *   uint32_t m = f16 & 0x03ff;
> +    *
> +    *   if (e == 0) {
> +    *     // f16 is zero or subnormal
> +    *     f32 = 0;
> +    *   } else if (e < (31 << 10)) {
> +    *     // f16 is normal
> +    *     f32 = (m << 13);
> +    *     f32 |= (e << 13) + (112 << 23);
> +    *   } else if (m == 0) {
> +    *     // f16 is infinite
> +    *     f32 = (255 << 23);
> +    *   } else {
> +    *     // f16 is NaN
> +    *     f32 = 0x7fffffff;
> +    *   }
> +    *
> +    *   return f32;
> +    *
> +    * The IR constructed by this function is a translation of the above
> +    * optimized C, where e and m are provided by the caller.
> +    */
> +   ir_rvalue*
> +   unpack_half_1x16_nosign(void *mem_ctx,
> +                           ir_rvalue *e_rval,
> +                           ir_rvalue *m_rval)
> +   {
> +      assert(e_rval->type == glsl_type::uint_type);
> +      assert(m_rval->type == glsl_type::uint_type);
> +
> +      /* uint f32; */
> +      ir_variable *f32 =
> +         new(mem_ctx) ir_variable(glsl_type::uint_type,
> +                                  "tmp_unpack_half_1x16_f32",
> +                                  ir_var_temporary);
> +      insert_instruction(f32);
> +
> +      /* uint e = E_RVAL; */
> +      ir_variable *e =
> +         new(mem_ctx) ir_variable(glsl_type::uint_type,
> +                                  "tmp_unpack_half_1x16_e",
> +                                  ir_var_temporary);
> +      insert_instruction(e);
> +      insert_instruction(
> +         new(mem_ctx) ir_assignment(
> +            new(mem_ctx) ir_dereference_variable(e),
> +            e_rval));
> +
> +      /* uint m = M_RVAL; */
> +      ir_variable *m =
> +         new(mem_ctx) ir_variable(glsl_type::uint_type,
> +                                  "tmp_unpack_half_1x16_m",
> +                                  ir_var_temporary);
> +      insert_instruction(m);
> +      insert_instruction(
> +         new(mem_ctx) ir_assignment(
> +            new(mem_ctx) ir_dereference_variable(m),
> +            m_rval));
> +
> +      /*
> +       * if (e == 0) {
> +       *   // f16 is zero or subnormal
> +       *   f32 = 0;
> +       * }
> +       */
> +      ir_if *if_zero =
> +         new(mem_ctx) ir_if(
> +            new(mem_ctx) ir_expression(ir_binop_equal,
> +               new(mem_ctx) ir_dereference_variable(e),
> +               new(mem_ctx) ir_constant(0u)));
> +      insert_instruction(if_zero);
> +      if_zero->then_instructions.push_tail(
> +         new(mem_ctx) ir_assignment(
> +            new(mem_ctx) ir_dereference_variable(f32),
> +            new(mem_ctx) ir_constant(0u)));
> +
> +      /*
> +       * else if (e < (31 << 10)) {
> +       *    // f16 is normal
> +       *    f32 = (m << 13);
> +       *    f32 |= (e << 13) + (112 << 23);
> +       * }
> +       */
> +      ir_if *if_normal =
> +         new(mem_ctx) ir_if(
> +            new(mem_ctx) ir_expression(ir_binop_less,
> +                  new(mem_ctx) ir_dereference_variable(e),
> +                  new(mem_ctx) ir_constant(31u << 10u)));
> +      if_zero->else_instructions.push_tail(if_normal);
> +      if_normal->then_instructions.push_tail(
> +         new(mem_ctx) ir_assignment(
> +            new(mem_ctx) ir_dereference_variable(f32),
> +            new(mem_ctx) ir_expression(ir_binop_lshift,
> +               new(mem_ctx) ir_dereference_variable(m),
> +               new(mem_ctx) ir_constant(13u))));
> +      if_normal->then_instructions.push_tail(
> +         new(mem_ctx) ir_assignment(
> +            new(mem_ctx) ir_dereference_variable(f32),
> +            new(mem_ctx) ir_expression(ir_binop_bit_or,
> +               new(mem_ctx) ir_dereference_variable(f32),
> +               new(mem_ctx) ir_expression(ir_binop_add,
> +                  new(mem_ctx) ir_expression(ir_binop_lshift,
> +                     new(mem_ctx) ir_dereference_variable(e),
> +                     new(mem_ctx) ir_constant(13u)),
> +                  new(mem_ctx) ir_constant(112u << 23u)))));
> +
> +      /*
> +       * else if (m == 0) {
> +       *    // f16 is infinite
> +       *    f32 = (255 << 23);
> +       * }
> +       */
> +      ir_if *if_infinite =
> +         new(mem_ctx) ir_if(
> +            new(mem_ctx) ir_expression(ir_binop_equal,
> +                  new(mem_ctx) ir_dereference_variable(m),
> +                  new(mem_ctx) ir_constant(0u)));
> +      if_normal->else_instructions.push_tail(if_infinite);
> +      if_infinite->then_instructions.push_tail(
> +         new(mem_ctx) ir_assignment(
> +            new(mem_ctx) ir_dereference_variable(f32),
> +            new(mem_ctx) ir_constant(255u << 23u)));
> +
> +      /*
> +       * else {
> +       *    // f16 is NaN
> +       *    f32 = 0x7fffffff;
> +       * }
> +       */
> +      if_infinite->else_instructions.push_tail(
> +         new(mem_ctx) ir_assignment(
> +            new(mem_ctx) ir_dereference_variable(f32),
> +            new(mem_ctx) ir_constant(0x7fffffffu)));
> +
> +      /* return f32; */
> +      return new(mem_ctx) ir_dereference_variable(f32);
> +   }
> +
> +   /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
> +    *
> +    *    mediump vec2 unpackHalf2x16 (highp uint v)
> +    *    ------------------------------------------
> +    *    Returns a two-component floating-point vector with components
> +    *    obtained by unpacking a 32-bit unsigned integer into a pair of 16-bit
> +    *    values, interpreting those values as 16-bit floating-point numbers
> +    *    according to the OpenGL ES Specification, and converting them to
> +    *    32-bit floating-point values.
> +    *
> +    *    The first component of the vector is obtained from the
> +    *    16 least-significant bits of v; the second component is obtained from
> +    *    the 16 most-significant bits of v.
> +    *
> +    * This function constructs IR that approximates the following GLSL:
> +    *
> +    *   uint u = RVALUE;
> +    *   uvec2 f16 = uvec2(u & 0xffff, u >> 16);
> +    *   uvec2 f32;
> +    *
> +    *   // Get f16's unshifted exponent and mantissa bits.
> +    *   uvec2 e = f16 & 0x7c00u;
> +    *   uvec2 m = f16 & 0x03ffu;
> +    *
> +    *   // Set f32's exponent and mantissa bits.
> +    *   f32.x = unpack_half_1x16_nosign(e.x, m.x);
> +    *   f32.y = unpack_half_1x16_nosign(e.y, m.y);
> +    *
> +    *   // Set f32's sign bit.
> +    *   f32 |= (f16 & 0x8000u) << 16u;
> +    *
> +    *   return bitcast_u2f(f32);
> +    */
> +   ir_rvalue*
> +   lower_unpack_half_2x16(void *mem_ctx, ir_rvalue *uint_rval)
> +   {
> +      assert(uint_rval->type == glsl_type::uint_type);
> +
> +      /* uint u = RVALUE;
> +       * uvec2 f16 = uvec2(u & 0xffff, u >> 16);
> +       */
> +      ir_variable *f16 =
> +         new(mem_ctx) ir_variable(glsl_type::uvec2_type,
> +                                  "tmp_unpack_half_2x16_f16",
> +                                  ir_var_temporary);
> +      insert_instruction(f16);
> +      insert_instruction(
> +         new(mem_ctx) ir_assignment(
> +            new(mem_ctx) ir_dereference_variable(f16),
> +            unpack_uint_to_uvec2(mem_ctx, uint_rval)));
> +
> +      /* uvec2 f32; */
> +      ir_variable *f32 =
> +         new(mem_ctx) ir_variable(glsl_type::uvec2_type,
> +                                  "tmp_unpack_half_2x16_f32",
> +                                  ir_var_temporary);
> +      insert_instruction(f32);
> +
> +      /* Get f16's unshifted exponent bits.
> +       *
> +       *    uvec2 e = f16 & 0x7c00u;
> +       */
> +      ir_variable *e =
> +         new(mem_ctx) ir_variable(glsl_type::uvec2_type,
> +                                  "tmp_unpack_half_2x16_e",
> +                                  ir_var_temporary);
> +      insert_instruction(e);
> +      insert_instruction(
> +         new(mem_ctx) ir_assignment(
> +            new(mem_ctx) ir_dereference_variable(e),
> +            new(mem_ctx) ir_expression(ir_binop_bit_and,
> +               new(mem_ctx) ir_dereference_variable(f16),
> +               new(mem_ctx) ir_constant(0x7c00u))));
> +
> +      /* Get f16's unshifted mantissa bits.
> +       *
> +       *    uvec2 m = f16 & 0x03ffu;
> +       */
> +      ir_variable *m =
> +         new(mem_ctx) ir_variable(glsl_type::uvec2_type,
> +                                  "tmp_unpack_half_2x16_m",
> +                                  ir_var_temporary);
> +      insert_instruction(m);
> +      insert_instruction(
> +         new(mem_ctx) ir_assignment(
> +            new(mem_ctx) ir_dereference_variable(m),
> +            new(mem_ctx) ir_expression(ir_binop_bit_and,
> +               new(mem_ctx) ir_dereference_variable(f16),
> +               new(mem_ctx) ir_constant(0x03ffu))));
> +
> +      /* Set f32's exponent and mantissa bits.
> +       *
> +       *   f32.x = unpack_half_1x16_nosign(e.x, m.x);
> +       *   f32.y = unpack_half_1x16_nosign(e.y, m.y);
> +       */
> +      insert_instruction(
> +         new(mem_ctx) ir_assignment(
> +            new(mem_ctx) ir_dereference_variable(f32),
> +            unpack_half_1x16_nosign(mem_ctx,
> +               new(mem_ctx) ir_swizzle(
> +                  new (mem_ctx) ir_dereference_variable(e),
> +                  0, 0, 0, 0, 1),
> +               new(mem_ctx) ir_swizzle(
> +                  new(mem_ctx) ir_dereference_variable(m),
> +                  0, 0, 0, 0, 1)),
> +             NULL /*condition*/,
> +             1 /*write mask*/));
> +      insert_instruction(
> +         new(mem_ctx) ir_assignment(
> +            new(mem_ctx) ir_dereference_variable(f32),
> +            unpack_half_1x16_nosign(mem_ctx,
> +               new(mem_ctx) ir_swizzle(
> +                  new (mem_ctx) ir_dereference_variable(e),
> +                  1, 0, 0, 0, 1),
> +               new(mem_ctx) ir_swizzle(
> +                  new(mem_ctx) ir_dereference_variable(m),
> +                  1, 0, 0, 0, 1)),
> +             NULL /*condition*/,
> +             2 /*write mask*/));
> +
> +      /* Set f32's sign bit.
> +       *
> +       *    f32 |= (f16 & 0x8000u) << 16u;
> +       */
> +      insert_instruction(
> +         new(mem_ctx) ir_assignment(
> +            new(mem_ctx) ir_dereference_variable(f32),
> +            new(mem_ctx) ir_expression(ir_binop_bit_or,
> +               glsl_type::uvec2_type,
> +               new(mem_ctx) ir_dereference_variable(f32),
> +               new(mem_ctx) ir_expression(ir_binop_lshift,
> +                  new(mem_ctx) ir_expression(ir_binop_bit_and,
> +                     new(mem_ctx) ir_dereference_variable(f16),
> +                     new(mem_ctx) ir_constant(0x8000u)),
> +                  new(mem_ctx) ir_constant(16u)))));
> +
> +      ir_rvalue *result =
> +         new(mem_ctx) ir_expression(ir_unop_bitcast_u2f,
> +            new(mem_ctx) ir_dereference_variable(f32));
> +
> +      assert(result->type == glsl_type::vec2_type);
> +      return result;
> +   }
> +
> +   /**
> +    * \brief Split unpackHalf2x16 into two operations.
> +    *
> +    * Some code generators, such as the i965 fragment shader, require that all
> +    * vector expressions be lowered to a sequence of scalar expressions.
> +    * However, unpackHalf2x16 cannot be scalarized by the same method as
> +    * a true vector operation because the number of components of its input
> +    * and output differ.
> +    *
> +    * This method scalarizes unpackHalf2x16 by transforming it from a single
> +    * operation having vec2 output to a pair of operations each having float
> +    * output. That is, it transforms
> +    *
> +    *   unpackHalf2x16(UINT_RVAL)
> +    *
> +    * into
> +    *
> +    *   uint u = UINT_RVAL;
> +    *   vec2 v;
> +    *
> +    *   v.x = unpackHalf2x16_split_x(u);
> +    *   v.y = unpackHalf2x16_split_y(u);
> +    *
> +    *   return v;
> +    */
> +   ir_rvalue*
> +   split_unpack_half_2x16(void *mem_ctx, ir_rvalue *uint_rval)
> +   {
> +      assert(uint_rval->type == glsl_type::uint_type);
> +
> +      /* uint u = uint_rval; */
> +      ir_variable *u =
> +         new(mem_ctx) ir_variable(glsl_type::uint_type,
> +                                  "tmp_split_unpack_half_2x16_u",
> +                                  ir_var_temporary);
> +      insert_instruction(u);
> +      insert_instruction(
> +         new(mem_ctx) ir_assignment(
> +            new(mem_ctx) ir_dereference_variable(u),
> +            uint_rval));
> +
> +      /* vec2 v; */
> +      ir_variable *v =
> +         new(mem_ctx) ir_variable(glsl_type::vec2_type,
> +                                  "tmp_split_unpack_half_2x16_v",
> +                                  ir_var_temporary);
> +      insert_instruction(v);
> +
> +      /* v.x = unpack_half_2x16_split_x(u); */
> +      insert_instruction(
> +         new(mem_ctx) ir_assignment(
> +            new(mem_ctx) ir_dereference_variable(v),
> +            new(mem_ctx) ir_expression(ir_unop_unpack_half_2x16_split_x,
> +               glsl_type::float_type,
> +               new(mem_ctx) ir_dereference_variable(u)),
> +            NULL /*condition*/,
> +            1 /*write mask*/));
> +
> +      /* v.y = unpack_half_2x16_split_y(u); */
> +      insert_instruction(
> +         new(mem_ctx) ir_assignment(
> +            new(mem_ctx) ir_dereference_variable(v),
> +            new(mem_ctx) ir_expression(ir_unop_unpack_half_2x16_split_y,
> +               glsl_type::float_type,
> +               new(mem_ctx) ir_dereference_variable(u)),
> +            NULL /*condition*/,
> +            2 /*write mask*/));
> +
> +      /* return v; */
> +      return new(mem_ctx) ir_dereference_variable(v);
> +   }
> +};
> +
> +} // namespace anonymous
> +
> +/**
> + * \brief Lower the builtin packing functions.
> + *
> + * \param op_mask is a bitmask of `enum lower_packing_builtins_op`.
> + */
> +bool
> +lower_packing_builtins(exec_list *instructions, int op_mask)
> +{
> +   lower_packing_builtins_visitor v(op_mask);
> +   visit_list_elements(&v, instructions, true);
> +   return v.get_progress();
> +}
>



More information about the mesa-dev mailing list