<div dir="ltr">On 24 January 2013 19:47, Matt Turner <span dir="ltr"><<a href="mailto:mattst88@gmail.com" target="_blank">mattst88@gmail.com</a>></span> wrote:<br><div class="gmail_extra"><div class="gmail_quote"><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
Lower them to arithmetic and bit manipulation expressions.<br>
---<br>
 src/glsl/ir_optimization.h          |    6 +<br>
 src/glsl/lower_packing_builtins.cpp |  279 +++++++++++++++++++++++++++++++++++<br>
 2 files changed, 285 insertions(+), 0 deletions(-)<br>
<br>
diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h<br>
index ac90b87..8f33018 100644<br>
--- a/src/glsl/ir_optimization.h<br>
+++ b/src/glsl/ir_optimization.h<br>
@@ -54,6 +54,12 @@ enum lower_packing_builtins_op {<br>
<br>
    LOWER_PACK_HALF_2x16_TO_SPLIT        = 0x0040,<br>
    LOWER_UNPACK_HALF_2x16_TO_SPLIT      = 0x0080,<br>
+<br>
+   LOWER_PACK_SNORM_4x8                 = 0x0100,<br>
+   LOWER_UNPACK_SNORM_4x8               = 0x0200,<br>
+<br>
+   LOWER_PACK_UNORM_4x8                 = 0x0400,<br>
+   LOWER_UNPACK_UNORM_4x8               = 0x0800,<br>
 };<br>
<br>
 bool do_common_optimization(exec_list *ir, bool linked,<br>
diff --git a/src/glsl/lower_packing_builtins.cpp b/src/glsl/lower_packing_builtins.cpp<br>
index 49176cc..aa6765f 100644<br>
--- a/src/glsl/lower_packing_builtins.cpp<br>
+++ b/src/glsl/lower_packing_builtins.cpp<br>
@@ -85,9 +85,15 @@ public:<br>
       case LOWER_PACK_SNORM_2x16:<br>
          *rvalue = lower_pack_snorm_2x16(op0);<br>
          break;<br>
+      case LOWER_PACK_SNORM_4x8:<br>
+         *rvalue = lower_pack_snorm_4x8(op0);<br>
+         break;<br>
       case LOWER_PACK_UNORM_2x16:<br>
          *rvalue = lower_pack_unorm_2x16(op0);<br>
          break;<br>
+      case LOWER_PACK_UNORM_4x8:<br>
+         *rvalue = lower_pack_unorm_4x8(op0);<br>
+         break;<br>
       case LOWER_PACK_HALF_2x16:<br>
          *rvalue = lower_pack_half_2x16(op0);<br>
          break;<br>
@@ -97,9 +103,15 @@ public:<br>
       case LOWER_UNPACK_SNORM_2x16:<br>
          *rvalue = lower_unpack_snorm_2x16(op0);<br>
          break;<br>
+      case LOWER_UNPACK_SNORM_4x8:<br>
+         *rvalue = lower_unpack_snorm_4x8(op0);<br>
+         break;<br>
       case LOWER_UNPACK_UNORM_2x16:<br>
          *rvalue = lower_unpack_unorm_2x16(op0);<br>
          break;<br>
+      case LOWER_UNPACK_UNORM_4x8:<br>
+         *rvalue = lower_unpack_unorm_4x8(op0);<br>
+         break;<br>
       case LOWER_UNPACK_HALF_2x16:<br>
          *rvalue = lower_unpack_half_2x16(op0);<br>
          break;<br>
@@ -137,18 +149,30 @@ private:<br>
       case ir_unop_pack_snorm_2x16:<br>
          result = op_mask & LOWER_PACK_SNORM_2x16;<br>
          break;<br>
+      case ir_unop_pack_snorm_4x8:<br>
+         result = op_mask & LOWER_PACK_SNORM_4x8;<br>
+         break;<br>
       case ir_unop_pack_unorm_2x16:<br>
          result = op_mask & LOWER_PACK_UNORM_2x16;<br>
          break;<br>
+      case ir_unop_pack_unorm_4x8:<br>
+         result = op_mask & LOWER_PACK_UNORM_4x8;<br>
+         break;<br>
       case ir_unop_pack_half_2x16:<br>
          result = op_mask & (LOWER_PACK_HALF_2x16 | LOWER_PACK_HALF_2x16_TO_SPLIT);<br>
          break;<br>
       case ir_unop_unpack_snorm_2x16:<br>
          result = op_mask & LOWER_UNPACK_SNORM_2x16;<br>
          break;<br>
+      case ir_unop_unpack_snorm_4x8:<br>
+         result = op_mask & LOWER_UNPACK_SNORM_4x8;<br>
+         break;<br>
       case ir_unop_unpack_unorm_2x16:<br>
          result = op_mask & LOWER_UNPACK_UNORM_2x16;<br>
          break;<br>
+      case ir_unop_unpack_unorm_4x8:<br>
+         result = op_mask & LOWER_UNPACK_UNORM_4x8;<br>
+         break;<br>
       case ir_unop_unpack_half_2x16:<br>
          result = op_mask & (LOWER_UNPACK_HALF_2x16 | LOWER_UNPACK_HALF_2x16_TO_SPLIT);<br>
          break;<br>
@@ -214,6 +238,30 @@ private:<br>
    }<br>
<br>
    /**<br>
+    * \brief Pack four uint8's into a single uint32.<br>
+    *<br>
+    * Interpret the given uvec4 as a uint8 quad. Pack the quad into a uint32<br>
+    * where the least significant bits specify the first element of the quad.<br>
+    * Return the uint32.<br>
+    */<br>
+   ir_rvalue*<br>
+   pack_uvec4_to_uint(ir_rvalue *uvec4_rval)<br>
+   {<br>
+      assert(uvec4_rval->type == glsl_type::uvec4_type);<br>
+<br>
+      /* uvec4 u = UVEC4_RVAL; */<br>
+      ir_variable *u = factory.make_temp(glsl_type::uvec4_type,<br>
+                                          "tmp_pack_uvec4_to_uint");<br>
+      factory.emit(assign(u, uvec4_rval));<br></blockquote><div><br></div><div>Rather than do four scalar bit_and(..., constant(0xffu)) instructions below, how about changing the above line to:<br><br>factory.emit(assign(u, bit_and(uvec4_rval, constant(0xffu))));<br>
<br></div><div>That way we take advantage of vector processing in the GPU to do all four bit_ands at once.<br><br></div><div>With that fixed (as well as the copy/paste errors Ian spotted), this patch is:<br><br>Reviewed-by: Paul Berry <<a href="mailto:stereotype441@gmail.com">stereotype441@gmail.com</a>><br>
</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+<br>
+      /* return ((u.w & 0xff) << 24) | ((u.z & 0xff) << 16) | ((u.y & 0xff) << 8) | (u.x & 0xff); */<br>
+      return bit_or(bit_or(lshift(bit_and(swizzle_w(u), constant(0xffu)), constant(24u)),<br>
+                           lshift(bit_and(swizzle_z(u), constant(0xffu)), constant(16u))),<br>
+                    bit_or(lshift(bit_and(swizzle_y(u), constant(0xffu)), constant(8u)),<br>
+                           bit_and(swizzle_x(u), constant(0xffu))));<br>
+   }<br>
+<br>
+   /**<br>
     * \brief Unpack a uint32 into two uint16's.<br>
     *<br>
     * Interpret the given uint32 as a uint16 pair where the uint32's least<br>
@@ -244,6 +292,44 @@ private:<br>
    }<br>
<br>
    /**<br>
+    * \brief Unpack a uint32 into four uint8's.<br>
+    *<br>
+    * Interpret the given uint32 as a uint8 quad where the uint32's least<br>
+    * significant bits specify the quad's first element. Return the uint8<br>
+    * quad as a uvec4.<br>
+    */<br>
+   ir_rvalue*<br>
+   unpack_uint_to_uvec4(ir_rvalue *uint_rval)<br>
+   {<br>
+      assert(uint_rval->type == glsl_type::uint_type);<br>
+<br>
+      /* uint u = UINT_RVAL; */<br>
+      ir_variable *u = factory.make_temp(glsl_type::uint_type,<br>
+                                          "tmp_unpack_uint_to_uvec4_u");<br>
+      factory.emit(assign(u, uint_rval));<br>
+<br>
+      /* uvec4 u4; */<br>
+      ir_variable *u4 = factory.make_temp(glsl_type::uvec4_type,<br>
+                                           "tmp_unpack_uint_to_uvec4_u4");<br>
+<br>
+      /* u4.x = u & 0xffu; */<br>
+      factory.emit(assign(u4, bit_and(u, constant(0xffu)), WRITEMASK_X));<br>
+<br>
+      /* u4.y = (u >> 8u) & 0xffu; */<br>
+      factory.emit(assign(u4, bit_and(rshift(u, constant(8u)),<br>
+                                      constant(0xffu)), WRITEMASK_Y));<br>
+<br>
+      /* u4.z = (u >> 16u) & 0xffu; */<br>
+      factory.emit(assign(u4, bit_and(rshift(u, constant(16u)),<br>
+                                      constant(0xffu)), WRITEMASK_Z));<br>
+<br>
+      /* u4.w = (u >> 24u) */<br>
+      factory.emit(assign(u4, rshift(u, constant(24u)), WRITEMASK_W));<br>
+<br>
+      return deref(u4).val;<br>
+   }<br>
+<br>
+   /**<br>
     * \brief Lower a packSnorm2x16 expression.<br>
     *<br>
     * \param vec2_rval is packSnorm2x16's input<br>
@@ -293,6 +379,55 @@ private:<br>
    }<br>
<br>
    /**<br>
+    * \brief Lower a packSnorm4x8 expression.<br>
+    *<br>
+    * \param vec4_rval is packSnorm4x8's input<br>
+    * \return packSnorm4x8's output as a uint rvalue<br>
+    */<br>
+   ir_rvalue*<br>
+   lower_pack_snorm_4x8(ir_rvalue *vec4_rval)<br>
+   {<br>
+      /* From page 137 (143 of pdf) of the GLSL 4.30 spec:<br>
+       *<br>
+       *    highp uint packSnorm4x8(vec4 v)<br>
+       *    -------------------------------<br>
+       *    First, converts each component of the normalized floating-point value<br>
+       *    v into 8-bit integer values. Then, the results are packed into the<br>
+       *    returned 32-bit unsigned integer.<br>
+       *<br>
+       *    The conversion for component c of v to fixed point is done as<br>
+       *    follows:<br>
+       *<br>
+       *       packSnorm4x8: round(clamp(c, -1, +1) * 127.0)<br>
+       *<br>
+       *    The first component of the vector will be written to the least<br>
+       *    significant bits of the output; the last component will be written to<br>
+       *    the most significant bits.<br>
+       *<br>
+       * This function generates IR that approximates the following pseudo-GLSL:<br>
+       *<br>
+       *     return pack_uvec4_to_uint(<br>
+       *         uvec4(ivec4(<br>
+       *           round(clamp(VEC4_RVALUE, -1.0f, 1.0f) * 127.0f))));<br>
+       *<br>
+       * It is necessary to first convert the vec4 to ivec4 rather than directly<br>
+       * converting vec4 to uvec4 because the latter conversion is undefined.<br>
+       * From page 87 (93 of pdf) of the GLSL 4.30 spec: "It is undefined to<br>
+       * convert a negative floating point value to an uint".<br>
+       */<br>
+      assert(vec4_rval->type == glsl_type::vec4_type);<br>
+<br>
+      ir_rvalue *result = pack_uvec4_to_uint(<br>
+            i2u(f2i(round_even(mul(clamp(vec4_rval,<br>
+                                         constant(-1.0f),<br>
+                                         constant(1.0f)),<br>
+                                   constant(127.0f))))));<br>
+<br>
+      assert(result->type == glsl_type::uint_type);<br>
+      return result;<br>
+   }<br>
+<br>
+   /**<br>
     * \brief Lower an unpackSnorm2x16 expression.<br>
     *<br>
     * \param uint_rval is unpackSnorm2x16's input<br>
@@ -352,6 +487,65 @@ private:<br>
    }<br>
<br>
    /**<br>
+    * \brief Lower an unpackSnorm4x8 expression.<br>
+    *<br>
+    * \param uint_rval is unpackSnorm4x8's input<br>
+    * \return unpackSnorm4x8's output as a vec4 rvalue<br>
+    */<br>
+   ir_rvalue*<br>
+   lower_unpack_snorm_4x8(ir_rvalue *uint_rval)<br>
+   {<br>
+      /* From page 137 (143 of pdf) of the GLSL 4.30 spec:<br>
+       *<br>
+       *    highp vec4 unpackSnorm4x8 (highp uint p)<br>
+       *    ----------------------------------------<br>
+       *    First, unpacks a single 32-bit unsigned integer p into four<br>
+       *    8-bit unsigned integers. Then, each component is converted to<br>
+       *    a normalized floating-point value to generate the returned<br>
+       *    four-component vector.<br>
+       *<br>
+       *    The conversion for unpacked fixed-point value f to floating point is<br>
+       *    done as follows:<br>
+       *<br>
+       *       unpackSnorm4x8: clamp(f / 127.0, -1, +1)<br>
+       *<br>
+       *    The first component of the returned vector will be extracted from the<br>
+       *    least significant bits of the input; the last component will be<br>
+       *    extracted from the most significant bits.<br>
+       *<br>
+       * This function generates IR that approximates the following pseudo-GLSL:<br>
+       *<br>
+       *    return clamp(<br>
+       *       ((ivec4(unpack_uint_to_uvec4(UINT_RVALUE)) << 24) >> 24) / 127.0f,<br>
+       *       -1.0f, 1.0f);<br>
+       *<br>
+       * The above IR may appear unnecessarily complex, but the intermediate<br>
+       * conversion to ivec4 and the bit shifts are necessary to correctly unpack<br>
+       * negative floats.<br>
+       *<br>
+       * To see why, consider packing and then unpacking vec4(-1.0, 0.0, 0.0,<br>
+       * 0.0). packSnorm4x8 encodes -1.0 as the int8 0x81. During unpacking, we<br>
+       * place that int8 into an int32, which results in the *positive* integer<br>
+       * 0x00000081.  The int8's sign bit becomes, in the int32, the rather<br>
+       * unimportant bit 8. We must now extend the int8's sign bit into bits<br>
+       * 9-32, which is accomplished by left-shifting then right-shifting.<br>
+       */<br>
+<br>
+      assert(uint_rval->type == glsl_type::uint_type);<br>
+<br>
+      ir_rvalue *result =<br>
+        clamp(div(i2f(rshift(lshift(u2i(unpack_uint_to_uvec4(uint_rval)),<br>
+                                    constant(24u)),<br>
+                             constant(24u))),<br>
+                  constant(127.0f)),<br>
+              constant(-1.0f),<br>
+              constant(1.0f));<br>
+<br>
+      assert(result->type == glsl_type::vec4_type);<br>
+      return result;<br>
+   }<br>
+<br>
+   /**<br>
     * \brief Lower a packUnorm2x16 expression.<br>
     *<br>
     * \param vec2_rval is packUnorm2x16's input<br>
@@ -396,6 +590,50 @@ private:<br>
    }<br>
<br>
    /**<br>
+    * \brief Lower a packUnorm4x8 expression.<br>
+    *<br>
+    * \param vec4_rval is packUnorm4x8's input<br>
+    * \return packUnorm4x8's output as a uint rvalue<br>
+    */<br>
+   ir_rvalue*<br>
+   lower_pack_unorm_4x8(ir_rvalue *vec4_rval)<br>
+   {<br>
+      /* From page 137 (143 of pdf) of the GLSL 4.30 spec:<br>
+       *<br>
+       *    highp uint packUnorm4x8 (vec4 v)<br>
+       *    --------------------------------<br>
+       *    First, converts each component of the normalized floating-point value<br>
+       *    v into 8-bit integer values. Then, the results are packed into the<br>
+       *    returned 32-bit unsigned integer.<br>
+       *<br>
+       *    The conversion for component c of v to fixed point is done as<br>
+       *    follows:<br>
+       *<br>
+       *       packUnorm4x8: round(clamp(c, 0, +1) * 255.0)<br>
+       *<br>
+       *    The first component of the vector will be written to the least<br>
+       *    significant bits of the output; the last component will be written to<br>
+       *    the most significant bits.<br>
+       *<br>
+       * This function generates IR that approximates the following pseudo-GLSL:<br>
+       *<br>
+       *     return pack_uvec4_to_uint(uvec4(<br>
+       *                round(clamp(VEC4_RVALUE, 0.0f, 1.0f) * 255.0f)));<br>
+       *<br>
+       * Here it is safe to directly convert the vec4 to uvec4 because the<br>
+       * vec4 has been clamped to a non-negative range.<br>
+       */<br>
+<br>
+      assert(vec4_rval->type == glsl_type::vec4_type);<br>
+<br>
+      ir_rvalue *result = pack_uvec4_to_uint(<br>
+         f2u(round_even(mul(saturate(vec4_rval), constant(255.0f)))));<br>
+<br>
+      assert(result->type == glsl_type::uint_type);<br>
+      return result;<br>
+   }<br>
+<br>
+   /**<br>
     * \brief Lower an unpackUnorm2x16 expression.<br>
     *<br>
     * \param uint_rval is unpackUnorm2x16's input<br>
@@ -437,6 +675,47 @@ private:<br>
    }<br>
<br>
    /**<br>
+    * \brief Lower an unpackUnorm4x8 expression.<br>
+    *<br>
+    * \param uint_rval is unpackUnorm4x8's input<br>
+    * \return unpackUnorm4x8's output as a vec4 rvalue<br>
+    */<br>
+   ir_rvalue*<br>
+   lower_unpack_unorm_4x8(ir_rvalue *uint_rval)<br>
+   {<br>
+      /* From page 137 (143 of pdf) of the GLSL 4.30 spec:<br>
+       *<br>
+       *    highp vec4 unpackUnorm4x8 (highp uint p)<br>
+       *    ----------------------------------------<br>
+       *    First, unpacks a single 32-bit unsigned integer p into four<br>
+       *    8-bit unsigned integers. Then, each component is converted to<br>
+       *    a normalized floating-point value to generate the returned<br>
+       *    four-component vector.<br>
+       *<br>
+       *    The conversion for unpacked fixed-point value f to floating point is<br>
+       *    done as follows:<br>
+       *<br>
+       *       unpackUnorm4x8: f / 255.0<br>
+       *<br>
+       *    The first component of the returned vector will be extracted from the<br>
+       *    least significant bits of the input; the last component will be<br>
+       *    extracted from the most significant bits.<br>
+       *<br>
+       * This function generates IR that approximates the following pseudo-GLSL:<br>
+       *<br>
+       *     return vec4(unpack_uint_to_uvec4(UINT_RVALUE)) / 255.0;<br>
+       */<br>
+<br>
+      assert(uint_rval->type == glsl_type::uint_type);<br>
+<br>
+      ir_rvalue *result = div(u2f(unpack_uint_to_uvec4(uint_rval)),<br>
+                              constant(255.0f));<br>
+<br>
+      assert(result->type == glsl_type::vec4_type);<br>
+      return result;<br>
+   }<br>
+<br>
+   /**<br>
     * \brief Lower the component-wise calculation of packHalf2x16.<br>
     *<br>
     * \param f_rval is one component of packHafl2x16's input<br>
<span class="HOEnZb"><font color="#888888">--<br>
1.7.8.6<br>
<br>
_______________________________________________<br>
mesa-dev mailing list<br>
<a href="mailto:mesa-dev@lists.freedesktop.org">mesa-dev@lists.freedesktop.org</a><br>
<a href="http://lists.freedesktop.org/mailman/listinfo/mesa-dev" target="_blank">http://lists.freedesktop.org/mailman/listinfo/mesa-dev</a><br>
</font></span></blockquote></div><br></div></div>