[Mesa-dev] [PATCH (gles3) 19/20] i965/vs/gen7: Emit code for GLSL ES 3.00 pack/unpack operations

Mon Jan 21 00:49:31 PST 2013

Signed-off-by: Chad Versace <chad.versace at linux.intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4.h           |   3 +
 src/mesa/drivers/dri/i965/brw_vec4_emit.cpp    |   8 ++
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 155 +++++++++++++++++++++++++
 3 files changed, 166 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index e65b92c..43d0454 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -431,6 +431,9 @@ public:
    void emit_math(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1);
    src_reg fix_math_operand(src_reg src);
 
+   void emit_pack_half_2x16(dst_reg dst, src_reg src0);
+   void emit_unpack_half_2x16(dst_reg dst, src_reg src0);
+
    void swizzle_result(ir_texture *ir, src_reg orig_val, int sampler);
 
    void emit_ndc_computation();
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
index 747edc2..e395ada 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
@@ -808,6 +808,14 @@ vec4_generator::generate_code(exec_list *instructions)
 	 brw_DP2(p, dst, src[0], src[1]);
 	 break;
 
+      case BRW_OPCODE_F32TO16:
+         brw_F32TO16(p, dst, src[0]);
+         break;
+
+      case BRW_OPCODE_F16TO32:
+         brw_F16TO32(p, dst, src[0]);
+         break;
+
       case BRW_OPCODE_IF:
 	 if (inst->src[0].file != BAD_FILE) {
 	    /* The instruction has an embedded compare (only allowed on gen6) */
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index ebf8990..b5f1aae 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -348,6 +348,143 @@ vec4_visitor::emit_math(enum opcode opcode,
 }
 
 void
+vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
+{
+   if (intel->gen < 7)
+      assert(!"ir_unop_pack_half_2x16 should be lowered");
+
+   /* uint dst; */
+   assert(dst.type == BRW_REGISTER_TYPE_UD);
+
+   /* vec2 src0; */
+   assert(src0.type == BRW_REGISTER_TYPE_F);
+
+   /* uvec2 tmp;
+    *
+    * The PRM lists the destination type of f32to16 as W.  However, I've
+    * experimentally confirmed on gen7 that it must be a 32-bit size, such as
+    * UD, in align16 mode.
+    */
+   dst_reg tmp_dst(this, glsl_type::uvec2_type);
+   src_reg tmp_src(tmp_dst);
+
+   /* tmp.xy = f32to16(src0); */
+   tmp_dst.writemask = WRITEMASK_XY;
+   emit(new(mem_ctx) vec4_instruction(this, BRW_OPCODE_F32TO16,
+                                      tmp_dst, src0));
+
+   /* The result's high 16 bits are in the low 16 bits of the temporary
+    * register's Y channel.  The result's low 16 bits are in the low 16 bits
+    * of the X channel.
+    *
+    * In experiments on gen7 I've found the that, in the temporary register,
+    * the hight 16 bits of the X and Y channels are zeros. This is critical
+    * for the SHL and OR instructions below to work as expected.
+    */
+
+   /* dst = tmp.y << 16; */
+   tmp_src.swizzle = SWIZZLE_Y;
+   emit(new(mem_ctx) vec4_instruction(this, BRW_OPCODE_SHL,
+                                      dst, tmp_src, src_reg(16u)));
+   /* dst |= tmp.x; */
+   tmp_src.swizzle = SWIZZLE_X;
+   emit(new(mem_ctx) vec4_instruction(this, BRW_OPCODE_OR,
+                                      dst, src_reg(dst), tmp_src));
+
+
+   /* Idea for reducing the above number of registers and instructions
+    * ----------------------------------------------------------------
+    *
+    * It should be possible to remove the temporary register and replace the
+    * SHL and OR instructions above with a single MOV instruction mode in
+    * align1 mode that uses clever register region addressing. (It is
+    * impossible to specify the necessary register regions in align16 mode).
+    * Unfortunately, it is difficult to emit an align1 instruction here.
+    *
+    * In particular, I want to do this:
+    *
+    *   # Give dst the form:
+    *   #
+    *   #    w z          y          x w z          y          x
+    *   #  |0|0|0x0000hhhh|0x0000llll|0|0|0x0000hhhh|0x0000llll|
+    *   #
+    *   f32to16(8) dst<1>.xy:UD src<4;4,1>:F {align16}
+    *
+    *   # Transform dst into the form of packHalf2x16's output.
+    *   #
+    *   #    w z          y          x w z          y          x
+    *   #  |0|0|0x00000000|0xhhhhllll|0|0|0x00000000|0xhhhhllll|
+    *   #
+    *   # Use width=2 in order to move the Y channel's high 16 bits
+    *   # into the low 16 bits, thus clearing the Y channel to zero.
+    *   #
+    *   mov(4) dst.1<1>:UW dst.2<8;2,1>:UW {align1}
+    */
+}
+
+void
+vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
+{
+   if (intel->gen < 7)
+      assert(!"ir_unop_unpack_half_2x16 should be lowered");
+
+   /* vec2 dst; */
+   assert(dst.type == BRW_REGISTER_TYPE_F);
+
+   /* uint src0; */
+   assert(src0.type == BRW_REGISTER_TYPE_UD);
+
+   /* uvec2 tmp; */
+   dst_reg tmp_dst(this, glsl_type::uvec2_type);
+   src_reg tmp_src(tmp_dst);
+
+   /* tmp.x = src0 & 0xffffu; */
+   tmp_dst.writemask = WRITEMASK_X;
+   emit(new(mem_ctx) vec4_instruction(this, BRW_OPCODE_AND,
+                                      tmp_dst, src0, src_reg(0xffffu)));
+
+   /* tmp.y = src0 >> 16u; */
+   tmp_dst.writemask = WRITEMASK_Y;
+   emit(new(mem_ctx) vec4_instruction(this, BRW_OPCODE_SHR,
+                                      tmp_dst, src0, src_reg(16u)));
+
+   /* dst = f16to32(tmp); */
+   dst.writemask = WRITEMASK_XY;
+   emit(new(mem_ctx) vec4_instruction(this, BRW_OPCODE_F16TO32,
+                                      dst, tmp_src));
+
+   /* Idea for reducing the above number of registers and instructions
+    * ----------------------------------------------------------------
+    *
+    * It should be possible to remove the temporary register and replace the
+    * SHR and AND instructions above with a single MOV instruction mode in
+    * align1 mode that uses clever register region addressing. (It is
+    * impossible to specify the necessary register regions in align16 mode).
+    * Unfortunately, it is difficult to emit an align1 instruction here.
+    *
+    * In particular, I want to do this:
+    *
+    *   # Now, src has the form of unpackHalf2x16's input:
+    *   #
+    *   #    w z          y          x w z          y          x
+    *   #  |0|0|0x00000000|0xhhhhllll|0|0|0x00000000|0xhhhhllll|
+    *
+    *   # Transform src into a form consumable by f16to32:
+    *   #
+    *   #    w z          y          x w z          y          x
+    *   #  |0|0|0x0000hhhh|0x0000llll|0|0|0x0000hhhh|0x0000llll|
+    *   #
+    *   # Use dst as the scratch register.
+    *   #
+    *   mov(2) dst.2<1>:UW dst.1<8;1,1>:UW {align1}
+    *
+    *   # Give dst the form of unpackHalf2x16's output:
+    *   #
+    *   f16to32(4) dst<1>.xy:F src<4;4,1>:UD {align16}
+    */
+}
+
+void
 vec4_visitor::visit_instructions(const exec_list *list)
 {
    foreach_list(node, list) {
@@ -1469,6 +1606,24 @@ vec4_visitor::visit(ir_expression *ir)
    case ir_quadop_vector:
       assert(!"not reached: should be handled by lower_quadop_vector");
       break;
+
+   case ir_unop_pack_half_2x16:
+      emit_pack_half_2x16(result_dst, op[0]);
+      break;
+   case ir_unop_unpack_half_2x16:
+      emit_unpack_half_2x16(result_dst, op[0]);
+      break;
+   case ir_unop_pack_snorm_2x16:
+   case ir_unop_pack_unorm_2x16:
+   case ir_unop_unpack_snorm_2x16:
+   case ir_unop_unpack_unorm_2x16:
+      assert(!"not reached: should be handled by lower_packing_builtins");
+      break;
+   case ir_unop_unpack_half_2x16_split_x:
+   case ir_unop_unpack_half_2x16_split_y:
+   case ir_binop_pack_half_2x16_split:
+           assert(!"not reached: should not occur in vertex shader");
+           break;
    }
 }
 
-- 
1.8.1.1