[Mesa-dev] [RFC PATCH] nir: Add extract_byte operation.

Matt Turner mattst88 at gmail.com
Wed Dec 2 13:07:25 PST 2015


Two shaders that appear in Unigine benchmarks (Heaven and Valley) unpack
three bytes from an integer and convert each into a float:

   float((val >> 16u) & 0xffu)
   float((val >>  8u) & 0xffu)
   float((val >>  0u) & 0xffu)

Instead of shifting, masking, and type converting like this:

   shr(8)          g15<1>UD        g25<8,8,1>UD    0x00000010UD
   and(8)          g16<1>UD        g15<8,8,1>UD    0x000000ffUD
   mov(8)          g17<1>F         g16<8,8,1>UD

   shr(8)          g18<1>UD        g25<8,8,1>UD    0x00000008UD
   and(8)          g19<1>UD        g18<8,8,1>UD    0x000000ffUD
   mov(8)          g20<1>F         g19<8,8,1>UD

   and(8)          g21<1>UD        g25<8,8,1>UD    0x000000ffUD
   mov(8)          g22<1>F         g21<8,8,1>UD

i965 can simply extract a byte and convert to float in a single
instruction:

   mov(8)          g17<1>F         g25.2<16,4,4>UB
   mov(8)          g20<1>F         g25.1<16,4,4>UB
   mov(8)          g22<1>F         g25.0<16,4,4>UB

Decreases the number of instructions and cycles in the two programs by:

 #1706: 3728 -> 3363 instructions (-9.79%), 9594 -> 9180 cycles (-4.32%)
 #1721: 4027 -> 3662 instructions (-9.06%), 10264 -> 9572 cycles (-6.74%)
---
This is dependent on Connor's outstanding i965 scheduling patches and
requires some benchmark data.

Presumably we'll need a lower_extract_byte flag, or maybe we get lucky
and both vc4 and freedreno are capable of handling this opcode like i965.

I should probably split the nir and i965 pieces into separate patches.

TODO: test the i965/vec4 code.

 src/glsl/nir/nir_opcodes.py                      |  4 ++
 src/glsl/nir/nir_opt_algebraic.py                |  6 +++
 src/mesa/drivers/dri/i965/brw_defines.h          |  6 +++
 src/mesa/drivers/dri/i965/brw_fs.h               |  2 +
 src/mesa/drivers/dri/i965/brw_fs_cse.cpp         |  1 +
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp   |  6 +++
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp         | 47 ++++++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_shader.cpp         |  2 +
 src/mesa/drivers/dri/i965/brw_vec4_cse.cpp       |  1 +
 src/mesa/drivers/dri/i965/brw_vec4_generator.cpp |  8 ++++
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp       |  6 +++
 11 files changed, 89 insertions(+)

diff --git a/src/glsl/nir/nir_opcodes.py b/src/glsl/nir/nir_opcodes.py
index 37d3dfc..1d85ec9 100644
--- a/src/glsl/nir/nir_opcodes.py
+++ b/src/glsl/nir/nir_opcodes.py
@@ -540,6 +540,10 @@ dst.x = src0.x;
 dst.y = src1.x;
 """)
 
+# Byte extraction
+binop_convert("extract_byte", tuint, tuint, "", "(unsigned)src0 >> (src1 * 8) & 0xff")
+
+
 def triop(name, ty, const_expr):
    opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], "", const_expr)
 def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
diff --git a/src/glsl/nir/nir_opt_algebraic.py b/src/glsl/nir/nir_opt_algebraic.py
index 6aa8b1f..a236ac6 100644
--- a/src/glsl/nir/nir_opt_algebraic.py
+++ b/src/glsl/nir/nir_opt_algebraic.py
@@ -202,6 +202,12 @@ optimizations = [
    (('f2i', ('ftrunc', a)), ('f2i', a)),
    (('f2u', ('ftrunc', a)), ('f2u', a)),
 
+   # Byte extraction
+   (('iand', 0xff, ('ushr', a, 24)), ('extract_byte', a, 3)),
+   (('iand', 0xff, ('ushr', a, 16)), ('extract_byte', a, 2)),
+   (('iand', 0xff, ('ushr', a,  8)), ('extract_byte', a, 1)),
+   (('iand', 0xff, a), ('extract_byte', a, 0)),
+
    # Subtracts
    (('fsub', a, ('fsub', 0.0, b)), ('fadd', a, b)),
    (('isub', a, ('isub', 0, b)), ('iadd', a, b)),
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index ade3ede..dc9069d 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -1079,6 +1079,12 @@ enum opcode {
     */
    SHADER_OPCODE_BROADCAST,
 
+   /**
+    * Extract the byte of the first source register selected by the index
+    * given as the second source.
+    */
+   SHADER_OPCODE_EXTRACT_BYTE,
+
    VEC4_OPCODE_MOV_BYTES,
    VEC4_OPCODE_PACK_BYTES,
    VEC4_OPCODE_UNPACK_UNIFORM,
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 5729fdf..718ebf3 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -263,6 +263,8 @@ public:
    void emit_percomp(const brw::fs_builder &bld, const fs_inst &inst,
                      unsigned wr_mask);
 
+   bool optimize_extract_byte_to_float(nir_alu_instr *instr,
+                                       const fs_reg &result);
    bool optimize_frontfacing_ternary(nir_alu_instr *instr,
                                      const fs_reg &result);
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
index 3b65a38..c496036 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
@@ -78,6 +78,7 @@ is_expression(const fs_visitor *v, const fs_inst *const inst)
    case FS_OPCODE_LINTERP:
    case SHADER_OPCODE_FIND_LIVE_CHANNEL:
    case SHADER_OPCODE_BROADCAST:
+   case SHADER_OPCODE_EXTRACT_BYTE:
    case SHADER_OPCODE_MOV_INDIRECT:
       return true;
    case SHADER_OPCODE_RCP:
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index c25da07..b090824 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -2193,6 +2193,12 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
          brw_broadcast(p, dst, src[0], src[1]);
          break;
 
+      case SHADER_OPCODE_EXTRACT_BYTE:
+         brw_MOV(p, dst, stride(suboffset(retype(src[0], BRW_REGISTER_TYPE_UB),
+                                          src[1].ud),
+                                16, 4, 4));
+         break;
+
       case FS_OPCODE_SET_SAMPLE_ID:
          generate_set_sample_id(inst, dst, src[0], src[1]);
          break;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 15bd98f..d6e414c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -453,6 +453,43 @@ fs_visitor::nir_emit_instr(nir_instr *instr)
    }
 }
 
+/**
+ * Peephole used by i2f and u2f: when instr's first source is the SSA result
+ * of a nir_op_extract_byte, emit one SHADER_OPCODE_EXTRACT_BYTE into result.
+ *
+ * nir_op_extract_byte returns an unsigned, but i965 can extract a byte and
+ * do type conversion in a single instruction.  Returns true if it emitted.
+ */
+bool
+fs_visitor::optimize_extract_byte_to_float(nir_alu_instr *instr,
+                                           const fs_reg &result)
+{
+   if (!instr->src[0].src.is_ssa ||
+       !instr->src[0].src.ssa->parent_instr)
+      return false; /* no SSA producer to inspect */
+
+   if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
+      return false; /* producer is not an ALU instruction */
+
+   nir_alu_instr *src0 =
+      nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
+
+   if (src0->op != nir_op_extract_byte)
+      return false; /* caller falls back to its normal path */
+
+   nir_const_value *byte = nir_src_as_const_value(src0->src[1].src);
+   assert(byte != NULL && byte->u[0] <= 3); /* index: constant, at most 3 */
+
+   fs_reg op0 = get_nir_src(src0->src[0].src);
+   op0.type = brw_type_for_nir_type(nir_op_infos[src0->op].input_types[0]);
+   op0 = offset(op0, bld, src0->src[0].swizzle[0]);
+
+   set_saturate(instr->dest.saturate,
+                bld.emit(SHADER_OPCODE_EXTRACT_BYTE,
+                         result, op0, brw_imm_ud(byte->u[0])));
+   return true;
+}
+
 bool
 fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
                                          const fs_reg &result)
@@ -624,6 +661,9 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
    switch (instr->op) {
    case nir_op_i2f:
    case nir_op_u2f:
+      if (optimize_extract_byte_to_float(instr, result))
+         return;
+
       inst = bld.MOV(result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
@@ -1036,6 +1076,13 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
       inst->predicate = BRW_PREDICATE_NORMAL;
       break;
 
+   case nir_op_extract_byte: {
+      nir_const_value *byte = nir_src_as_const_value(instr->src[1].src);
+      bld.emit(SHADER_OPCODE_EXTRACT_BYTE,
+               result, op[0], brw_imm_ud(byte->u[0]));
+      break;
+   }
+
    default:
       unreachable("unhandled instruction");
    }
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 5a6752b..b0f0e5a 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -423,6 +423,8 @@ brw_instruction_name(enum opcode op)
    case SHADER_OPCODE_BROADCAST:
       return "broadcast";
 
+   case SHADER_OPCODE_EXTRACT_BYTE:
+      return "extract_byte";
    case VEC4_OPCODE_MOV_BYTES:
       return "mov_bytes";
    case VEC4_OPCODE_PACK_BYTES:
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
index 85cbf24..c344de7 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
@@ -75,6 +75,7 @@ is_expression(const vec4_instruction *const inst)
    case VEC4_OPCODE_UNPACK_UNIFORM:
    case SHADER_OPCODE_FIND_LIVE_CHANNEL:
    case SHADER_OPCODE_BROADCAST:
+   case SHADER_OPCODE_EXTRACT_BYTE:
       return true;
    case SHADER_OPCODE_RCP:
    case SHADER_OPCODE_RSQ:
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index acf9286..9bd4ae2 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -1463,6 +1463,14 @@ generate_code(struct brw_codegen *p,
          brw_broadcast(p, dst, src[0], src[1]);
          break;
 
+      case SHADER_OPCODE_EXTRACT_BYTE:
+         brw_set_default_access_mode(p, BRW_ALIGN_1);
+         brw_MOV(p, dst, stride(suboffset(retype(src[0], BRW_REGISTER_TYPE_UB),
+                                          src[1].ud),
+                                16, 4, 4));
+         brw_set_default_access_mode(p, BRW_ALIGN_16);
+         break;
+
       case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
          generate_unpack_flags(p, dst);
          break;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 4aed60e..622bd8c 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -1473,6 +1473,12 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       break;
    }
 
+   case nir_op_extract_byte: {
+      nir_const_value *byte = nir_src_as_const_value(instr->src[1].src);
+      emit(SHADER_OPCODE_EXTRACT_BYTE, dst, op[0], brw_imm_ud(byte->u[0]));
+      break;
+   }
+
    case nir_op_fabs:
    case nir_op_iabs:
    case nir_op_fneg:
-- 
2.4.9



More information about the mesa-dev mailing list