[Mesa-dev] [PATCH 09/10] i965/vs/gen7: Emit code for GLSL ES 3.00 pack/unpack operations
Chad Versace
chad.versace at linux.intel.com
Thu Jan 10 00:10:27 PST 2013
Signed-off-by: Chad Versace <chad.versace at linux.intel.com>
---
src/mesa/drivers/dri/i965/brw_vec4.h | 3 +
src/mesa/drivers/dri/i965/brw_vec4_emit.cpp | 8 ++
src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 156 ++++++++++++++++++++++++-
3 files changed, 166 insertions(+), 1 deletion(-)
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index e65b92c..43d0454 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -431,6 +431,9 @@ public:
void emit_math(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1);
src_reg fix_math_operand(src_reg src);
+ void emit_pack_half_2x16(dst_reg dst, src_reg src0);
+ void emit_unpack_half_2x16(dst_reg dst, src_reg src0);
+
void swizzle_result(ir_texture *ir, src_reg orig_val, int sampler);
void emit_ndc_computation();
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
index 9fa742d..a38bb02 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
@@ -803,6 +803,14 @@ vec4_generator::generate_code(exec_list *instructions)
brw_DP2(p, dst, src[0], src[1]);
break;
+ case BRW_OPCODE_F32TO16:
+ brw_F32TO16(p, dst, src[0]);
+ break;
+
+ case BRW_OPCODE_F16TO32:
+ brw_F16TO32(p, dst, src[0]);
+ break;
+
case BRW_OPCODE_IF:
if (inst->src[0].file != BAD_FILE) {
/* The instruction has an embedded compare (only allowed on gen6) */
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 02feff6..96376c4 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -348,6 +348,143 @@ vec4_visitor::emit_math(enum opcode opcode,
}
void
+vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
+{
+ if (intel->gen < 7)
+ assert(!"ir_unop_pack_half_2x16 should be lowered");
+
+ /* uint dst; receives the packed result (high half from Y, low from X). */
+ assert(dst.type == BRW_REGISTER_TYPE_UD);
+
+ /* vec2 src0; the two floats that are converted with f32to16. */
+ assert(src0.type == BRW_REGISTER_TYPE_F);
+
+ /* uvec2 tmp;
+ *
+ * The PRM lists the destination type of f32to16 as W. However, I've
+ * experimentally confirmed on gen7 that it must be a 32-bit size, such as
+ * UD, in align16 mode.
+ */
+ dst_reg tmp_dst(this, glsl_type::uvec2_type);
+ src_reg tmp_src(tmp_dst);
+
+ /* tmp.xy = f32to16(src0); */
+ tmp_dst.writemask = WRITEMASK_XY;
+ emit(new(mem_ctx) vec4_instruction(this, BRW_OPCODE_F32TO16,
+ tmp_dst, src0));
+
+ /* The result's high 16 bits are in the low 16 bits of the temporary
+ * register's Y channel. The result's low 16 bits are in the low 16 bits
+ * of the X channel.
+ *
+ * In experiments on gen7 I've found that, in the temporary register,
+ * the high 16 bits of the X and Y channels are zeros. This is critical
+ * for the SHL and OR instructions below to work as expected.
+ */
+
+ /* dst = tmp.y << 16; */
+ tmp_src.swizzle = SWIZZLE_Y;
+ emit(new(mem_ctx) vec4_instruction(this, BRW_OPCODE_SHL,
+ dst, tmp_src, src_reg(16u)));
+ /* dst |= tmp.x; */
+ tmp_src.swizzle = SWIZZLE_X;
+ emit(new(mem_ctx) vec4_instruction(this, BRW_OPCODE_OR,
+ dst, src_reg(dst), tmp_src));
+
+
+ /* Idea for reducing the above number of registers and instructions
+ * ----------------------------------------------------------------
+ *
+ * It should be possible to remove the temporary register and replace the
+ * SHL and OR instructions above with a single MOV instruction in
+ * align1 mode that uses clever register region addressing. (It is
+ * impossible to specify the necessary register regions in align16 mode).
+ * Unfortunately, it is difficult to emit an align1 instruction here.
+ *
+ * In particular, I want to do this:
+ *
+ * # Give dst the form:
+ * #
+ * # w z y x w z y x
+ * # |0|0|0x0000hhhh|0x0000llll|0|0|0x0000hhhh|0x0000llll|
+ * #
+ * f32to16(8) dst<1>.xy:UD src<4;4,1>:F {align16}
+ *
+ * # Transform dst into the form of packHalf2x16's output.
+ * #
+ * # w z y x w z y x
+ * # |0|0|0x00000000|0xhhhhllll|0|0|0x00000000|0xhhhhllll|
+ * #
+ * # Use width=2 in order to move the Y channel's high 16 bits
+ * # into the low 16 bits, thus clearing the Y channel to zero.
+ * #
+ * mov(4) dst.1<1>:UW dst.2<8;2,1>:UW {align1}
+ */
+}
+
+void
+vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
+{
+ if (intel->gen < 7)
+ assert(!"ir_unop_unpack_half_2x16 should be lowered");
+
+ /* vec2 dst; receives the two floats produced by f16to32 (XY only). */
+ assert(dst.type == BRW_REGISTER_TYPE_F);
+
+ /* uint src0; the packed value, split into its low and high 16 bits. */
+ assert(src0.type == BRW_REGISTER_TYPE_UD);
+
+ /* uvec2 tmp; */
+ dst_reg tmp_dst(this, glsl_type::uvec2_type);
+ src_reg tmp_src(tmp_dst);
+
+ /* tmp.x = src0 & 0xffffu; */
+ tmp_dst.writemask = WRITEMASK_X;
+ emit(new(mem_ctx) vec4_instruction(this, BRW_OPCODE_AND,
+ tmp_dst, src0, src_reg(0xffffu)));
+
+ /* tmp.y = src0 >> 16u; */
+ tmp_dst.writemask = WRITEMASK_Y;
+ emit(new(mem_ctx) vec4_instruction(this, BRW_OPCODE_SHR,
+ tmp_dst, src0, src_reg(16u)));
+
+ /* dst.xy = f16to32(tmp); */
+ dst.writemask = WRITEMASK_XY;
+ emit(new(mem_ctx) vec4_instruction(this, BRW_OPCODE_F16TO32,
+ dst, tmp_src));
+
+ /* Idea for reducing the above number of registers and instructions
+ * ----------------------------------------------------------------
+ *
+ * It should be possible to remove the temporary register and replace the
+ * SHR and AND instructions above with a single MOV instruction in
+ * align1 mode that uses clever register region addressing. (It is
+ * impossible to specify the necessary register regions in align16 mode).
+ * Unfortunately, it is difficult to emit an align1 instruction here.
+ *
+ * In particular, I want to do this:
+ *
+ * # Now, src has the form of unpackHalf2x16's input:
+ * #
+ * # w z y x w z y x
+ * # |0|0|0x00000000|0xhhhhllll|0|0|0x00000000|0xhhhhllll|
+ * #
+ * # Transform src into a form consumable by f16to32:
+ * #
+ * # w z y x w z y x
+ * # |0|0|0x0000hhhh|0x0000llll|0|0|0x0000hhhh|0x0000llll|
+ * #
+ * # Use dst as the scratch register.
+ * #
+ * mov(2) dst.2<1>:UW dst.1<8;1,1>:UW {align1}
+ *
+ * # Give dst the form of unpackHalf2x16's output:
+ * #
+ * f16to32(4) dst<1>.xy:F src<4;4,1>:UD {align16}
+ */
+}
+
+void
vec4_visitor::visit_instructions(const exec_list *list)
{
foreach_list(node, list) {
@@ -1385,7 +1522,6 @@ vec4_visitor::visit(ir_expression *ir)
case ir_unop_round_even:
emit(RNDE(result_dst, op[0]));
break;
-
case ir_binop_min:
emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
break;
@@ -1469,6 +1605,24 @@ vec4_visitor::visit(ir_expression *ir)
case ir_quadop_vector:
assert(!"not reached: should be handled by lower_quadop_vector");
break;
+
+ case ir_unop_pack_half_2x16:
+ emit_pack_half_2x16(result_dst, op[0]);
+ break;
+ case ir_unop_unpack_half_2x16:
+ emit_unpack_half_2x16(result_dst, op[0]);
+ break;
+ case ir_unop_pack_snorm_2x16:
+ case ir_unop_pack_unorm_2x16:
+ case ir_unop_unpack_snorm_2x16:
+ case ir_unop_unpack_unorm_2x16:
+ assert(!"not reached: should be handled by lower_packing_builtins");
+ break;
+ case ir_unop_unpack_half_2x16_split_x:
+ case ir_unop_unpack_half_2x16_split_y:
+ case ir_binop_pack_half_2x16_split:
+ assert(!"not reached: should not occur in vertex shader");
+ break;
}
}
--
1.8.1
More information about the mesa-dev
mailing list