[Mesa-dev] [PATCH] i965/fs/gen7: Emit code for GLSL 3.00 pack/unpack operations (v3)
Chad Versace
chad.versace at linux.intel.com
Wed Jan 23 17:27:47 PST 2013
v2: Remove lewd comment. [for idr]
v3: - Optimize away tmp register for packHalf2x16. [for anholt, paul]
- Improve comments. [for anholt, paul]
- Reduce near-duplicate code by removing vec4_visitor emit_pack/unpack
methods. [for chadv]
CC: Eric Anholt <eric at anholt.net>
CC: Paul Berry Paul Berry <stereotype441 at gmail.com>
Reviewed-by: Ian Romanick <ian.d.romanick at intel.com> (v2)
Signed-off-by: Chad Versace <chad.versace at linux.intel.com>
---
src/mesa/drivers/dri/i965/brw_defines.h | 3 +
src/mesa/drivers/dri/i965/brw_fs.h | 8 ++
.../dri/i965/brw_fs_channel_expressions.cpp | 12 +++
src/mesa/drivers/dri/i965/brw_fs_emit.cpp | 106 ++++++++++++++++++++-
src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 19 +++-
5 files changed, 145 insertions(+), 3 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index e2f1e65..79cc12f 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -726,6 +726,9 @@ enum opcode {
FS_OPCODE_MOV_DISPATCH_TO_FLAGS,
FS_OPCODE_DISCARD_JUMP,
FS_OPCODE_SET_GLOBAL_OFFSET,
+ FS_OPCODE_PACK_HALF_2x16_SPLIT,
+ FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X,
+ FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y,
VS_OPCODE_URB_WRITE,
VS_OPCODE_SCRATCH_READ,
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index b47b0d0..d332502 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -542,6 +542,14 @@ private:
struct brw_reg offset);
void generate_discard_jump(fs_inst *inst);
+ void generate_pack_half_2x16_split(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg x,
+ struct brw_reg y);
+ void generate_unpack_half_2x16_split(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg src);
+
void patch_discard_jumps_to_fb_writes();
struct brw_context *brw;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
index 58521ee..e19da51 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
@@ -342,9 +342,21 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
assert(!"not yet supported");
break;
+ case ir_unop_pack_snorm_2x16:
+ case ir_unop_pack_unorm_2x16:
+ case ir_unop_pack_half_2x16:
+ case ir_unop_unpack_snorm_2x16:
+ case ir_unop_unpack_unorm_2x16:
+ case ir_unop_unpack_half_2x16:
case ir_quadop_vector:
assert(!"should have been lowered");
break;
+
+ case ir_unop_unpack_half_2x16_split_x:
+ case ir_unop_unpack_half_2x16_split_y:
+ case ir_binop_pack_half_2x16_split:
+ assert("!not reached: expression operates on scalars only");
+ break;
}
ir->remove();
diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
index 324e665..9b54796 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
@@ -923,6 +923,96 @@ fs_generator::generate_set_global_offset(fs_inst *inst,
}
void
+fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg x,
+ struct brw_reg y)
+{
+ assert(intel->gen >= 7);
+ assert(dst.type == BRW_REGISTER_TYPE_UD);
+ assert(x.type = BRW_REGISTER_TYPE_F);
+ assert(y.type = BRW_REGISTER_TYPE_F);
+
+ /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
+ *
+ * Because this instruction does not have a 16-bit floating-point type,
+ * the destination data type must be Word (W).
+ *
+ * The destination must be DWord-aligned and specify a horizontal stride
+ * (HorzStride) of 2. The 16-bit result is stored in the lower word of
+ * each destination channel and the upper word is not modified.
+ */
+
+ /* Give each 32-bit channel of dst the form below , where "." means
+ * unchanged.
+ * 0x....hhhh
+ *
+ * Per the PRM, change the source data type to W. To compensate for
+ * halving the data type width, double the horizontal stride. (The
+ * BRW_*_STRIDE enums are defined so that incrementing the field doubles
+ * the real stride).
+ */
+ dst.type = BRW_REGISTER_TYPE_W;
+ if (dst.hstride != 0)
+ ++dst.hstride;
+ brw_F32TO16(p, dst, y);
+
+ /* Now the form:
+ * 0xhhhh0000
+ */
+ dst.type = BRW_REGISTER_TYPE_UD;
+ if (dst.hstride != 0)
+ --dst.hstride;
+ brw_SHL(p, dst, dst, brw_imm_ud(16u));
+
+ /* And, finally the form of packHalf2x16's output:
+ * 0xhhhhllll
+ */
+ dst.type = BRW_REGISTER_TYPE_W;
+ if (dst.hstride != 0)
+ ++dst.hstride;
+ brw_F32TO16(p, dst, x);
+}
+
+void
+fs_generator::generate_unpack_half_2x16_split(fs_inst *inst,
+ struct brw_reg dst,
+ struct brw_reg src)
+{
+ assert(intel->gen >= 7);
+ assert(dst.type == BRW_REGISTER_TYPE_F);
+ assert(src.type == BRW_REGISTER_TYPE_UD);
+
+ /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
+ *
+ * Because this instruction does not have a 16-bit floating-point type,
+ * the source data type must be Word (W). The destination type must be
+ * F (Float).
+ */
+
+ /* Per the PRM, change the source data type to W. To compensate for halving
+ * the data type width, double the strides. (The BRW_*_STRIDE enums are
+ * defined so that incrementing the field doubles the real stride).
+ */
+ src.type = BRW_REGISTER_TYPE_W;
+ if (src.vstride > 0)
+ ++src.vstride;
+ if (src.hstride > 0)
+ ++src.hstride;
+
+ /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
+ * For the Y case, we wish to access only the upper word; therefore
+ * a 16-bit subregister offset is needed.
+ */
+ assert(inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X ||
+ inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y);
+ if (inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y)
+ src.subnr += 2;
+
+ brw_F16TO32(p, dst, src);
+}
+
+void
fs_generator::generate_code(exec_list *instructions)
{
int last_native_insn_offset = p->next_insn_offset;
@@ -1082,7 +1172,12 @@ fs_generator::generate_code(exec_list *instructions)
case BRW_OPCODE_SHL:
brw_SHL(p, dst, src[0], src[1]);
break;
-
+ case BRW_OPCODE_F32TO16:
+ brw_F32TO16(p, dst, src[0]);
+ break;
+ case BRW_OPCODE_F16TO32:
+ brw_F16TO32(p, dst, src[0]);
+ break;
case BRW_OPCODE_CMP:
brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
break;
@@ -1229,6 +1324,15 @@ fs_generator::generate_code(exec_list *instructions)
generate_set_global_offset(inst, dst, src[0], src[1]);
break;
+ case FS_OPCODE_PACK_HALF_2x16_SPLIT:
+ generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
+ break;
+
+ case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
+ case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
+ generate_unpack_half_2x16_split(inst, dst, src[0]);
+ break;
+
default:
if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
_mesa_problem(ctx, "Unsupported opcode `%s' in FS",
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 5885989..2011fe3 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -538,7 +538,20 @@ fs_visitor::visit(ir_expression *ir)
BRW_CONDITIONAL_L : BRW_CONDITIONAL_GE,
this->result, op[0], op[1]);
break;
-
+ case ir_unop_pack_snorm_2x16:
+ case ir_unop_pack_unorm_2x16:
+ case ir_unop_unpack_snorm_2x16:
+ case ir_unop_unpack_unorm_2x16:
+ case ir_unop_unpack_half_2x16:
+ case ir_unop_pack_half_2x16:
+ assert(!"not reached: should be handled by lower_packing_builtins");
+ break;
+ case ir_unop_unpack_half_2x16_split_x:
+ emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, this->result, op[0]);
+ break;
+ case ir_unop_unpack_half_2x16_split_y:
+ emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, this->result, op[0]);
+ break;
case ir_binop_pow:
emit_math(SHADER_OPCODE_POW, this->result, op[0], op[1]);
break;
@@ -566,7 +579,9 @@ fs_visitor::visit(ir_expression *ir)
else
inst = emit(SHR(this->result, op[0], op[1]));
break;
-
+ case ir_binop_pack_half_2x16_split:
+ emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, this->result, op[0], op[1]);
+ break;
case ir_binop_ubo_load:
/* This IR node takes a constant uniform block and a constant or
* variable byte offset within the block and loads a vector from that.
--
1.8.1.1
More information about the mesa-dev
mailing list