[Mesa-dev] [RFC PATCH] nir: Add extract_byte operation.
Matt Turner
mattst88 at gmail.com
Wed Dec 2 13:07:25 PST 2015
Two shaders that appear in Unigine benchmarks (Heaven and Valley) unpack
three bytes from an integer and convert each into a float:
float((val >> 16u) & 0xffu)
float((val >> 8u) & 0xffu)
float((val >> 0u) & 0xffu)
Instead of shifting, masking, and type converting like this:
shr(8) g15<1>UD g25<8,8,1>UD 0x00000010UD
and(8) g16<1>UD g15<8,8,1>UD 0x000000ffUD
mov(8) g17<1>F g16<8,8,1>UD
shr(8) g18<1>UD g25<8,8,1>UD 0x00000008UD
and(8) g19<1>UD g18<8,8,1>UD 0x000000ffUD
mov(8) g20<1>F g19<8,8,1>UD
and(8) g21<1>UD g25<8,8,1>UD 0x000000ffUD
mov(8) g22<1>F g21<8,8,1>UD
i965 can simply extract a byte and convert to float in a single
instruction:
mov(8) g17<1>F g25.2<16,4,4>UB
mov(8) g20<1>F g25.1<16,4,4>UB
mov(8) g22<1>F g25.0<16,4,4>UB
Decreases the number of instructions and cycles in the two programs by:
#1706: 3728 -> 3363 instructions (-9.79%), 9594 -> 9180 cycles (-4.32%)
#1721: 4027 -> 3662 instructions (-9.06%), 10264 -> 9572 cycles (-6.74%)
---
This is dependent on Connor's outstanding i965 scheduling patches and
requires some benchmark data.
Presumably we'll need a lower_extract_byte flag, or maybe we get lucky
and both vc4 and freedreno are capable of handling this opcode like i965.
I should probably split the nir and i965 pieces into separate patches.
TODO: test the i965/vec4 code.
src/glsl/nir/nir_opcodes.py | 4 ++
src/glsl/nir/nir_opt_algebraic.py | 6 +++
src/mesa/drivers/dri/i965/brw_defines.h | 6 +++
src/mesa/drivers/dri/i965/brw_fs.h | 2 +
src/mesa/drivers/dri/i965/brw_fs_cse.cpp | 1 +
src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 6 +++
src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 47 ++++++++++++++++++++++++
src/mesa/drivers/dri/i965/brw_shader.cpp | 2 +
src/mesa/drivers/dri/i965/brw_vec4_cse.cpp | 1 +
src/mesa/drivers/dri/i965/brw_vec4_generator.cpp | 8 ++++
src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 6 +++
11 files changed, 89 insertions(+)
diff --git a/src/glsl/nir/nir_opcodes.py b/src/glsl/nir/nir_opcodes.py
index 37d3dfc..1d85ec9 100644
--- a/src/glsl/nir/nir_opcodes.py
+++ b/src/glsl/nir/nir_opcodes.py
@@ -540,6 +540,10 @@ dst.x = src0.x;
dst.y = src1.x;
""")
+# Byte extraction
+binop_convert("extract_byte", tuint, tuint, "", "(unsigned)src0 >> (src1 * 8) & 0xff")
+
+
def triop(name, ty, const_expr):
opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], "", const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
diff --git a/src/glsl/nir/nir_opt_algebraic.py b/src/glsl/nir/nir_opt_algebraic.py
index 6aa8b1f..a236ac6 100644
--- a/src/glsl/nir/nir_opt_algebraic.py
+++ b/src/glsl/nir/nir_opt_algebraic.py
@@ -202,6 +202,12 @@ optimizations = [
(('f2i', ('ftrunc', a)), ('f2i', a)),
(('f2u', ('ftrunc', a)), ('f2u', a)),
+ # Byte extraction
+ (('iand', 0xff, ('ushr', a, 24)), ('extract_byte', a, 3)),
+ (('iand', 0xff, ('ushr', a, 16)), ('extract_byte', a, 2)),
+ (('iand', 0xff, ('ushr', a, 8)), ('extract_byte', a, 1)),
+ (('iand', 0xff, a), ('extract_byte', a, 0)),
+
# Subtracts
(('fsub', a, ('fsub', 0.0, b)), ('fadd', a, b)),
(('isub', a, ('isub', 0, b)), ('iadd', a, b)),
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index ade3ede..dc9069d 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -1079,6 +1079,12 @@ enum opcode {
*/
SHADER_OPCODE_BROADCAST,
+ /**
+ * Pick the byte from its first source register given by the index
+ * specified as second source.
+ */
+ SHADER_OPCODE_EXTRACT_BYTE,
+
VEC4_OPCODE_MOV_BYTES,
VEC4_OPCODE_PACK_BYTES,
VEC4_OPCODE_UNPACK_UNIFORM,
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 5729fdf..718ebf3 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -263,6 +263,8 @@ public:
void emit_percomp(const brw::fs_builder &bld, const fs_inst &inst,
unsigned wr_mask);
+ bool optimize_extract_byte_to_float(nir_alu_instr *instr,
+ const fs_reg &result);
bool optimize_frontfacing_ternary(nir_alu_instr *instr,
const fs_reg &result);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
index 3b65a38..c496036 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
@@ -78,6 +78,7 @@ is_expression(const fs_visitor *v, const fs_inst *const inst)
case FS_OPCODE_LINTERP:
case SHADER_OPCODE_FIND_LIVE_CHANNEL:
case SHADER_OPCODE_BROADCAST:
+ case SHADER_OPCODE_EXTRACT_BYTE:
case SHADER_OPCODE_MOV_INDIRECT:
return true;
case SHADER_OPCODE_RCP:
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index c25da07..b090824 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -2193,6 +2193,12 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
brw_broadcast(p, dst, src[0], src[1]);
break;
+ case SHADER_OPCODE_EXTRACT_BYTE:
+ brw_MOV(p, dst, stride(suboffset(retype(src[0], BRW_REGISTER_TYPE_UB),
+ src[1].ud),
+ 16, 4, 4));
+ break;
+
case FS_OPCODE_SET_SAMPLE_ID:
generate_set_sample_id(inst, dst, src[0], src[1]);
break;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 15bd98f..d6e414c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -453,6 +453,43 @@ fs_visitor::nir_emit_instr(nir_instr *instr)
}
}
+/**
+ * Peephole for i2f/u2f: when instr's first source is the SSA result of a
+ * nir_op_extract_byte, emit one SHADER_OPCODE_EXTRACT_BYTE that reads the
+ * extract's source and writes result, then return true; else return false.
+ * nir_op_extract_byte returns an unsigned, but i965 can extract a byte and do
+ * type conversion in a single instruction.
+ */
+bool
+fs_visitor::optimize_extract_byte_to_float(nir_alu_instr *instr,
+ const fs_reg &result)
+{
+ if (!instr->src[0].src.is_ssa ||
+ !instr->src[0].src.ssa->parent_instr)
+ return false; /* no SSA producer to inspect */
+
+ if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
+ return false; /* producer is not an ALU instruction */
+
+ nir_alu_instr *src0 =
+ nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
+
+ if (src0->op != nir_op_extract_byte)
+ return false; /* producer is some other ALU op */
+
+ nir_const_value *byte = nir_src_as_const_value(src0->src[1].src);
+ assert(byte != NULL && byte->u[0] <= 3); /* byte index must be <= 3 */
+
+ fs_reg op0 = get_nir_src(src0->src[0].src);
+ op0.type = brw_type_for_nir_type(nir_op_infos[src0->op].input_types[0]);
+ op0 = offset(op0, bld, src0->src[0].swizzle[0]); /* first swizzle only */
+
+ set_saturate(instr->dest.saturate,
+ bld.emit(SHADER_OPCODE_EXTRACT_BYTE,
+ result, op0, brw_imm_ud(byte->u[0])));
+ return true; /* emitted here; the caller skips its own MOV */
+}
+
bool
fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
const fs_reg &result)
@@ -624,6 +661,9 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
switch (instr->op) {
case nir_op_i2f:
case nir_op_u2f:
+ if (optimize_extract_byte_to_float(instr, result))
+ return;
+
inst = bld.MOV(result, op[0]);
inst->saturate = instr->dest.saturate;
break;
@@ -1036,6 +1076,13 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
inst->predicate = BRW_PREDICATE_NORMAL;
break;
+ case nir_op_extract_byte: {
+ nir_const_value *byte = nir_src_as_const_value(instr->src[1].src);
+ bld.emit(SHADER_OPCODE_EXTRACT_BYTE,
+ result, op[0], brw_imm_ud(byte->u[0]));
+ break;
+ }
+
default:
unreachable("unhandled instruction");
}
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 5a6752b..b0f0e5a 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -423,6 +423,8 @@ brw_instruction_name(enum opcode op)
case SHADER_OPCODE_BROADCAST:
return "broadcast";
+ case SHADER_OPCODE_EXTRACT_BYTE:
+ return "extract_byte";
case VEC4_OPCODE_MOV_BYTES:
return "mov_bytes";
case VEC4_OPCODE_PACK_BYTES:
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
index 85cbf24..c344de7 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
@@ -75,6 +75,7 @@ is_expression(const vec4_instruction *const inst)
case VEC4_OPCODE_UNPACK_UNIFORM:
case SHADER_OPCODE_FIND_LIVE_CHANNEL:
case SHADER_OPCODE_BROADCAST:
+ case SHADER_OPCODE_EXTRACT_BYTE:
return true;
case SHADER_OPCODE_RCP:
case SHADER_OPCODE_RSQ:
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index acf9286..9bd4ae2 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -1463,6 +1463,14 @@ generate_code(struct brw_codegen *p,
brw_broadcast(p, dst, src[0], src[1]);
break;
+ case SHADER_OPCODE_EXTRACT_BYTE:
+ brw_set_default_access_mode(p, BRW_ALIGN_1);
+ brw_MOV(p, dst, stride(suboffset(retype(src[0], BRW_REGISTER_TYPE_UB),
+ src[1].ud),
+ 16, 4, 4));
+ brw_set_default_access_mode(p, BRW_ALIGN_16);
+ break;
+
case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
generate_unpack_flags(p, dst);
break;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 4aed60e..622bd8c 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -1473,6 +1473,12 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
break;
}
+ case nir_op_extract_byte: {
+ nir_const_value *byte = nir_src_as_const_value(instr->src[1].src);
+ emit(SHADER_OPCODE_EXTRACT_BYTE, dst, op[0], brw_imm_ud(byte->u[0]));
+ break;
+ }
+
case nir_op_fabs:
case nir_op_iabs:
case nir_op_fneg:
--
2.4.9
More information about the mesa-dev
mailing list