[Mesa-dev] [PATCH 1/2] i965/vs/gen7: Emit code for GLSL ES 3.00 pack/unpack operations (v3)

Thu Jan 24 14:00:21 PST 2013

FIXME: This patch emits VS code that violates documented hardware restrictions
and then relies on undocumented behavior that results from that violation.
This patch passes all tests, but should be fixed ASAP to conform to the
hardware documentation.

v2: Explain undocumented hardware behavior. Improve comments.
v3: Use ALU1 helper methods F32TO16() and F16TO32(). [for anholt]

CC: Eric Anholt <eric at anholt.net>
CC: Paul Berry <stereotype441 at gmail.com>
Reviewed-by: Ian Romanick <ian.d.romanick at intel.com> (v1)
Signed-off-by: Chad Versace <chad.versace at linux.intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4.h           |   5 +
 src/mesa/drivers/dri/i965/brw_vec4_emit.cpp    |   8 ++
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 133 +++++++++++++++++++++++++
 3 files changed, 146 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index e65b92c..86921a0 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -358,6 +358,8 @@ public:
    vec4_instruction *RNDE(dst_reg dst, src_reg src0);
    vec4_instruction *RNDZ(dst_reg dst, src_reg src0);
    vec4_instruction *FRC(dst_reg dst, src_reg src0);
+   vec4_instruction *F32TO16(dst_reg dst, src_reg src0);
+   vec4_instruction *F16TO32(dst_reg dst, src_reg src0);
    vec4_instruction *ADD(dst_reg dst, src_reg src0, src_reg src1);
    vec4_instruction *MUL(dst_reg dst, src_reg src0, src_reg src1);
    vec4_instruction *MACH(dst_reg dst, src_reg src0, src_reg src1);
@@ -431,6 +433,9 @@ public:
    void emit_math(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1);
    src_reg fix_math_operand(src_reg src);
 
+   void emit_pack_half_2x16(dst_reg dst, src_reg src0);
+   void emit_unpack_half_2x16(dst_reg dst, src_reg src0);
+
    void swizzle_result(ir_texture *ir, src_reg orig_val, int sampler);
 
    void emit_ndc_computation();
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
index 747edc2..e395ada 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
@@ -808,6 +808,14 @@ vec4_generator::generate_code(exec_list *instructions)
 	 brw_DP2(p, dst, src[0], src[1]);
 	 break;
 
+      case BRW_OPCODE_F32TO16:
+         brw_F32TO16(p, dst, src[0]);
+         break;
+
+      case BRW_OPCODE_F16TO32:
+         brw_F16TO32(p, dst, src[0]);
+         break;
+
       case BRW_OPCODE_IF:
 	 if (inst->src[0].file != BAD_FILE) {
 	    /* The instruction has an embedded compare (only allowed on gen6) */
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index ebf8990..4ec77a7 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -113,6 +113,8 @@ ALU1(FRC)
 ALU1(RNDD)
 ALU1(RNDE)
 ALU1(RNDZ)
+ALU1(F32TO16)
+ALU1(F16TO32)
 ALU2(ADD)
 ALU2(MUL)
 ALU2(MACH)
@@ -348,6 +350,119 @@ vec4_visitor::emit_math(enum opcode opcode,
 }
 
 void
+vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
+{
+   if (intel->gen < 7)
+      assert(!"ir_unop_pack_half_2x16 should be lowered");
+
+   assert(dst.type == BRW_REGISTER_TYPE_UD);
+   assert(src0.type == BRW_REGISTER_TYPE_F);
+
+   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
+    *
+    *   Because this instruction does not have a 16-bit floating-point type,
+    *   the destination data type must be Word (W).
+    *
+    *   The destination must be DWord-aligned and specify a horizontal stride
+    *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
+    *   each destination channel and the upper word is not modified.
+    *
+    * The above restriction implies that the f32to16 instruction must use
+    * align1 mode, because only in align1 mode is it possible to specify
+    * horizontal stride.  We choose here to defy the hardware docs and emit
+    * align16 instructions.
+    *
+    * (I [chadv] did attempt to emit align1 instructions for VS f32to16
+    * instructions. I was partially successful in that the code passed all
+    * tests.  However, the code was dubiously correct and fragile, and the
+    * tests were not harsh enough to probe that frailty. Not trusting the
+    * code, I chose instead to remain in align16 mode in defiance of the hw
+    * docs).
+    *
+    * I've [chadv] experimentally confirmed that, on gen7 hardware and the
+    * simulator, emitting a f32to16 in align16 mode with UD as destination
+    * data type is safe. The behavior differs from that specified in the PRM
+    * in that the upper word of each destination channel is cleared to 0.
+    */
+
+   dst_reg tmp_dst(this, glsl_type::uvec2_type);
+   src_reg tmp_src(tmp_dst);
+
+#if 0
+   /* Verify the undocumented behavior on which the following instructions
+    * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
+    * then the result of the bit-or instruction below will be incorrect.
+    *
+    * You should inspect the disasm output in order to verify that the MOV is
+    * not optimized away.
+    */
+   emit(MOV(tmp_dst, src_reg(0x12345678u)));
+#endif
+
+   /* Give tmp the form below, where "." means untouched.
+    *
+    *     w z          y          x w z          y          x
+    *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
+    *
+    * That the upper word of each write-channel be 0 is required for the
+    * following bit-shift and bit-or instructions to work. Note that this
+    * relies on the undocumented hardware behavior mentioned above.
+    */
+   tmp_dst.writemask = WRITEMASK_XY;
+   emit(F32TO16(tmp_dst, src0));
+
+   /* Give the write-channels of dst the form:
+    *   0xhhhh0000
+    */
+   tmp_src.swizzle = SWIZZLE_Y;
+   emit(SHL(dst, tmp_src, src_reg(16u)));
+
+   /* Finally, give the write-channels of dst the form of packHalf2x16's
+    * output:
+    *   0xhhhhllll
+    */
+   tmp_src.swizzle = SWIZZLE_X;
+   emit(OR(dst, src_reg(dst), tmp_src));
+}
+
+void
+vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
+{
+   if (intel->gen < 7)
+      assert(!"ir_unop_unpack_half_2x16 should be lowered");
+
+   assert(dst.type == BRW_REGISTER_TYPE_F);
+   assert(src0.type == BRW_REGISTER_TYPE_UD);
+
+   /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
+    *
+    *   Because this instruction does not have a 16-bit floating-point type,
+    *   the source data type must be Word (W). The destination type must be
+    *   F (Float).
+    *
+    * To use W as the source data type, we must adjust horizontal strides,
+    * which is only possible in align1 mode. All my [chadv] attempts at
+    * emitting align1 instructions for unpackHalf2x16 failed to pass the
+    * Piglit tests, so I gave up.
+    *
+    * I've verified that, on gen7 hardware and the simulator, it is safe to
+    * emit f16to32 in align16 mode with UD as source data type.
+    */
+
+   dst_reg tmp_dst(this, glsl_type::uvec2_type);
+   src_reg tmp_src(tmp_dst);
+
+   tmp_dst.writemask = WRITEMASK_X;
+   emit(AND(tmp_dst, src0, src_reg(0xffffu)));
+
+   tmp_dst.writemask = WRITEMASK_Y;
+   emit(SHR(tmp_dst, src0, src_reg(16u)));
+
+   dst.writemask = WRITEMASK_XY;
+   emit(F16TO32(dst, tmp_src));
+}
+
+void
 vec4_visitor::visit_instructions(const exec_list *list)
 {
    foreach_list(node, list) {
@@ -1469,6 +1584,24 @@ vec4_visitor::visit(ir_expression *ir)
    case ir_quadop_vector:
       assert(!"not reached: should be handled by lower_quadop_vector");
       break;
+
+   case ir_unop_pack_half_2x16:
+      emit_pack_half_2x16(result_dst, op[0]);
+      break;
+   case ir_unop_unpack_half_2x16:
+      emit_unpack_half_2x16(result_dst, op[0]);
+      break;
+   case ir_unop_pack_snorm_2x16:
+   case ir_unop_pack_unorm_2x16:
+   case ir_unop_unpack_snorm_2x16:
+   case ir_unop_unpack_unorm_2x16:
+      assert(!"not reached: should be handled by lower_packing_builtins");
+      break;
+   case ir_unop_unpack_half_2x16_split_x:
+   case ir_unop_unpack_half_2x16_split_y:
+   case ir_binop_pack_half_2x16_split:
+           assert(!"not reached: should not occur in vertex shader");
+           break;
    }
 }
 
-- 
1.8.1.1