[Mesa-dev] [PATCH 21/21] i965/vec4: Introduce VEC4 IR builder.

Tue Apr 28 10:08:37 PDT 2015

See "i965/fs: Introduce FS IR builder." for the rationale.
---
 src/mesa/drivers/dri/i965/Makefile.sources   |   1 +
 src/mesa/drivers/dri/i965/brw_vec4_builder.h | 664 +++++++++++++++++++++++++++
 2 files changed, 665 insertions(+)
 create mode 100644 src/mesa/drivers/dri/i965/brw_vec4_builder.h

diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index 20cbdb2..5bb6f06 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -110,6 +110,7 @@ i965_FILES = \
 	brw_urb.c \
 	brw_util.c \
 	brw_util.h \
+	brw_vec4_builder.h \
 	brw_vec4_copy_propagation.cpp \
 	brw_vec4.cpp \
 	brw_vec4_cse.cpp \
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_builder.h b/src/mesa/drivers/dri/i965/brw_vec4_builder.h
new file mode 100644
index 0000000..8c4f222
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_vec4_builder.h
@@ -0,0 +1,664 @@
+/* -*- c++ -*- */
+/*
+ * Copyright © 2010-2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BRW_VEC4_BUILDER_H
+#define BRW_VEC4_BUILDER_H
+
+#include "brw_ir_vec4.h"
+#include "brw_ir_allocator.h"
+#include "brw_context.h"
+
+namespace brw {
+   /**
+    * Toolbox to assemble a VEC4 IR program out of individual instructions.
+    *
+    * This object is meant to have an interface consistent with
+    * brw::fs_builder in order to enable generic FS/VEC4 programming.  They
+    * cannot be fully interchangeable because brw::fs_builder generates scalar
+    * code while brw::vec4_builder generates vector code.  For a drop-in
+    * replacement of brw::fs_builder see brw::svec4_builder.
+    */
+   class vec4_builder {
+   public:
+      /** Type used in this IR to represent a source of an instruction. */
+      typedef brw::src_reg src_reg;
+
+      /** Type used in this IR to represent the destination of an instruction. */
+      typedef brw::dst_reg dst_reg;
+
+      /** Type used in this IR to represent an instruction. */
+      typedef vec4_instruction instruction;
+
+      /** We can build scalar instructions. */
+      typedef vec4_builder scalar_builder;
+
+      /** We can build vector instructions too. */
+      typedef vec4_builder vector_builder;
+
+      /**
+       * Construct a vec4_builder appending instructions at the end of the
+       * list \p instructions.  \p alloc provides book-keeping of virtual
+       * registers allocated through the builder.  The debug annotation and
+       * base IR start out as NULL until set_annotation()/set_base_ir().
+       */
+      vec4_builder(const brw_device_info *devinfo, void *mem_ctx,
+                   simple_allocator &alloc, exec_list &instructions) :
+         devinfo(devinfo), mem_ctx(mem_ctx),
+         alloc(&alloc), block(NULL),
+         cursor((exec_node *)&instructions.tail),
+         current_annotation(NULL), base_ir(NULL)
+      {
+      }
+
+      /**
+       * Return a copy of this builder that inserts instructions before
+       * \p cursor in basic block \p block.  All other code generation
+       * parameters are inherited from this.
+       */
+      vec4_builder
+      at(bblock_t *block, instruction *cursor) const
+      {
+         vec4_builder positioned(*this);
+         positioned.cursor = cursor;
+         positioned.block = block;
+         return positioned;
+      }
+
+      /**
+       * Construct a scalar builder inheriting all code generation parameters
+       * from this.  scalar_builder is vec4_builder, so this is a plain copy.
+       */
+      vec4_builder
+      scalar() const
+      {
+         return *this;
+      }
+
+      /**
+       * Construct a vector builder inheriting all code generation parameters
+       * from this.  vector_builder is vec4_builder, so this is a plain copy.
+       */
+      vec4_builder
+      vector() const
+      {
+         return *this;
+      }
+
+      /**
+       * Construct a builder of half-SIMD-width instructions inheriting other
+       * code generation parameters from this.  No-op: \p i is ignored.
+       */
+      const vec4_builder &
+      half(unsigned i) const
+      {
+         return *this;
+      }
+
+      /**
+       * Get the SIMD width in use, which is always 8 for this builder.
+       */
+      unsigned
+      dispatch_width() const
+      {
+         return 8;
+      }
+
+      /**
+       * No-op returning \p pred.  See brw::svec4_builder::reduced_predicate().
+       */
+      static brw_predicate
+      reduced_predicate(brw_predicate pred)
+      {
+         return pred;
+      }
+
+      /**
+       * Allocate a fresh GRF of natural vector size (four for this IR) and
+       * SIMD width, retyped to \p type.  \p n is the amount of space to
+       * allocate in dispatch_width units (each of which is just enough space
+       * for four logical components in this IR).
+       */
+      dst_reg
+      natural_reg(enum brw_reg_type type, unsigned n = 1) const
+      {
+         const unsigned size = n * DIV_ROUND_UP(type_sz(type), 4);
+         const dst_reg reg = dst_reg(GRF, alloc->allocate(size));
+         return retype(reg, type);
+      }
+
+      /**
+       * Create a register of natural vector size and SIMD width using array
+       * \p reg as storage.  All four components are write-enabled.
+       */
+      dst_reg
+      natural_reg(const ::array_reg &reg) const
+      {
+         return dst_reg(reg, WRITEMASK_XYZW);
+      }
+
+      /**
+       * Allocate a single-component (X only) register of natural SIMD width.
+       */
+      dst_reg
+      scalar_reg(brw_reg_type type) const
+      {
+         const dst_reg full = natural_reg(type);
+         return writemask(full, WRITEMASK_X);
+      }
+
+      /**
+       * Allocate a raw chunk of memory from the virtual GRF file with no
+       * special vector size or SIMD width.  \p n is given in units of 32B
+       * registers.
+       */
+      ::array_reg
+      array_reg(enum brw_reg_type type, unsigned n) const
+      {
+         const dst_reg base = dst_reg(GRF, alloc->allocate(n));
+         const dst_reg typed = retype(base, type);
+         return ::array_reg(typed, n);
+      }
+
+      /**
+       * Create a dispatch_width()-wide null register of floating type.
+       */
+      dst_reg
+      null_reg_f() const
+      {
+         const struct brw_reg null_vec = brw_null_vec(dispatch_width());
+         return dst_reg(retype(null_vec, BRW_REGISTER_TYPE_F));
+      }
+
+      /**
+       * Create a dispatch_width()-wide null register of signed integer type.
+       */
+      dst_reg
+      null_reg_d() const
+      {
+         const struct brw_reg null_vec = brw_null_vec(dispatch_width());
+         return dst_reg(retype(null_vec, BRW_REGISTER_TYPE_D));
+      }
+
+      /**
+       * Create a dispatch_width()-wide null register of unsigned integer type.
+       */
+      dst_reg
+      null_reg_ud() const
+      {
+         const struct brw_reg null_vec = brw_null_vec(dispatch_width());
+         return dst_reg(retype(null_vec, BRW_REGISTER_TYPE_UD));
+      }
+
+      /**
+       * Insert a mem_ctx-allocated copy of \p inst into the program.
+       */
+      instruction *
+      emit(const instruction &inst) const
+      {
+         return emit(new(mem_ctx) instruction(inst));
+      }
+
+      /**
+       * Create and insert a nullary control instruction (no dst or sources).
+       */
+      instruction *
+      emit(enum opcode opcode) const
+      {
+         return emit(instruction(opcode));
+      }
+
+      /**
+       * Create and insert a nullary (source-less) instruction writing \p dst.
+       */
+      instruction *
+      emit(enum opcode opcode, const dst_reg &dst) const
+      {
+         return emit(instruction(opcode, dst));
+      }
+
+      /**
+       * Create and insert a unary instruction into the program.  Unary math
+       * opcodes go through fix_math_operand() and fix_math_instruction().
+       */
+      instruction *
+      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
+      {
+         switch (opcode) {
+         case SHADER_OPCODE_RCP:
+         case SHADER_OPCODE_RSQ:
+         case SHADER_OPCODE_SQRT:
+         case SHADER_OPCODE_EXP2:
+         case SHADER_OPCODE_LOG2:
+         case SHADER_OPCODE_SIN:
+         case SHADER_OPCODE_COS:
+            break;
+         default:
+            return emit(instruction(opcode, dst, src0));
+         }
+         const src_reg arg0 = fix_math_operand(src0);
+         return fix_math_instruction(emit(instruction(opcode, dst, arg0)));
+      }
+
+      /**
+       * Create and insert a binary instruction, applying the math workarounds.
+       */
+      instruction *
+      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
+           const src_reg &src1) const
+      {
+         switch (opcode) {
+         case SHADER_OPCODE_POW:
+         case SHADER_OPCODE_INT_QUOTIENT:
+         case SHADER_OPCODE_INT_REMAINDER:
+            break;
+         default:
+            return emit(instruction(opcode, dst, src0, src1));
+         }
+         /* Sequence the fix-ups: argument evaluation order is unspecified. */
+         const src_reg op0 = fix_math_operand(src0);
+         const src_reg op1 = fix_math_operand(src1);
+         return fix_math_instruction(emit(instruction(opcode, dst, op0, op1)));
+      }
+
+      /**
+       * Create and insert a ternary instruction, fixing up 3-source operands.
+       */
+      instruction *
+      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
+           const src_reg &src1, const src_reg &src2) const
+      {
+         switch (opcode) {
+         case BRW_OPCODE_BFE:
+         case BRW_OPCODE_BFI2:
+         case BRW_OPCODE_MAD:
+         case BRW_OPCODE_LRP:
+            break; /* Fixed up below: argument evaluation is unsequenced. */
+         default:
+            return emit(instruction(opcode, dst, src0, src1, src2));
+         }
+         const src_reg op0 = fix_3src_operand(src0);
+         const src_reg op1 = fix_3src_operand(src1);
+         const src_reg op2 = fix_3src_operand(src2);
+         return emit(instruction(opcode, dst, op0, op1, op2));
+      }
+
+      /**
+       * Insert the preallocated instruction \p inst before the cursor.
+       */
+      instruction *
+      emit(instruction *inst) const
+      {
+         inst->ir = base_ir;
+         inst->annotation = current_annotation;
+
+         if (block == NULL)
+            cursor->insert_before(inst);
+         else
+            static_cast<instruction *>(cursor)->insert_before(block, inst);
+
+         return inst;
+      }
+
+      /**
+       * Select \p src0 if the comparison of both sources with the given
+       * conditional mod evaluates to true, otherwise select \p src1.  Generally
+       * useful to get the minimum or maximum of two values.
+       */
+      void
+      emit_minmax(const dst_reg &dst, const src_reg &src0,
+                  const src_reg &src1, brw_conditional_mod mod) const
+      {
+         if (devinfo->gen >= 6) {
+            /* Sequence the fix-ups: argument evaluation order is unspecified. */
+            const src_reg op0 = fix_unsigned_negate(src0);
+            const src_reg op1 = fix_unsigned_negate(src1);
+            exec_condmod(mod, SEL(dst, op0, op1));
+         } else {
+            CMP(null_reg_d(), src0, src1, mod);
+            exec_predicate(BRW_PREDICATE_NORMAL, SEL(dst, src0, src1));
+         }
+      }
+
+      /**
+       * Copy any live channel from \p src to the first channel of \p dst.
+       */
+      void
+      emit_uniformize(const dst_reg &dst, const src_reg &src) const
+      {
+         const dst_reg chan_index = scalar_reg(BRW_REGISTER_TYPE_UD);
+         /* Wrap the index in src_reg explicitly, like every other use here. */
+         emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index)
+            ->force_writemask_all = true;
+         emit(SHADER_OPCODE_BROADCAST, dst, src, src_reg(chan_index))
+            ->force_writemask_all = true;
+      }
+
+      /**
+       * Assorted arithmetic ops, one BRW_OPCODE_* emit() wrapper per macro use.
+       * @{
+       */
+#define ALU1(op)                                        \
+      instruction *                                     \
+      op(const dst_reg &dst, const src_reg &src0) const \
+      {                                                 \
+         return emit(BRW_OPCODE_##op, dst, src0);       \
+      }
+
+#define ALU2(op)                                                        \
+      instruction *                                                     \
+      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
+      {                                                                 \
+         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
+      }
+
+#define ALU2_ACC(op)                                                    \
+      instruction *                                                     \
+      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
+      {                                                                 \
+         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
+         inst->writes_accumulator = true;                               \
+         return inst;                                                   \
+      }
+
+#define ALU3(op)                                                        \
+      instruction *                                                     \
+      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
+         const src_reg &src2) const                                     \
+      {                                                                 \
+         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
+      }
+
+      ALU2(ADD)
+      ALU2_ACC(ADDC)
+      ALU2(AND)
+      ALU2(ASR)
+      ALU2(AVG)
+      ALU3(BFE)
+      ALU2(BFI1)
+      ALU3(BFI2)
+      ALU1(BFREV)
+      ALU1(CBIT)
+      ALU2(CMPN)
+      ALU3(CSEL)
+      ALU2(DP2)
+      ALU2(DP3)
+      ALU2(DP4)
+      ALU2(DPH)
+      ALU1(F16TO32)
+      ALU1(F32TO16)
+      ALU1(FBH)
+      ALU1(FBL)
+      ALU1(FRC)
+      ALU2(LINE)
+      ALU1(LZD)
+      ALU2(MAC)
+      ALU2_ACC(MACH)
+      ALU3(MAD)
+      ALU1(MOV)
+      ALU2(MUL)
+      ALU1(NOT)
+      ALU2(OR)
+      ALU2(PLN)
+      ALU1(RNDD)
+      ALU1(RNDE)
+      ALU1(RNDU)
+      ALU1(RNDZ)
+      ALU2(SAD2)
+      ALU2_ACC(SADA2)
+      ALU2(SEL)
+      ALU2(SHL)
+      ALU2(SHR)
+      ALU2_ACC(SUBB)
+      ALU2(XOR)
+
+#undef ALU3
+#undef ALU2_ACC
+#undef ALU2
+#undef ALU1
+      /** @} */
+
+      /**
+       * CMP: Sets the low bit of the destination channels with the result
+       * of the comparison, while the upper bits are undefined, and updates
+       * the flag register with the packed 16 bits of the result.
+       */
+      instruction *
+      CMP(dst_reg dst, const src_reg &src0, const src_reg &src1,
+          brw_conditional_mod condition) const
+      {
+         /* Take the instruction:
+          *
+          * CMP null<d> src0<f> src1<f>
+          *
+          * Original gen4 does type conversion to the destination type before
+          * comparison, producing garbage results for floating point comparisons.
+          * gen5 does the comparison on the execution type (resolved source types),
+          * so dst type doesn't matter.  gen6 does comparison and then uses the
+          * result as if it was the dst type with no conversion, which happens to
+          * mostly work out for float-interpreted-as-int since our comparisons are
+          * for >0, =0, <0.
+          */
+         if (devinfo->gen == 4)
+            dst = retype(dst, src0.type);
+
+         /* Sequence the fix-ups: argument evaluation order is unspecified. */
+         const src_reg op0 = fix_unsigned_negate(src0);
+         const src_reg op1 = fix_unsigned_negate(src1);
+         return exec_condmod(condition, emit(BRW_OPCODE_CMP, dst, op0, op1));
+      }
+
+      /**
+       * Gen4 predicated IF: emit a BRW_OPCODE_IF instruction and pass it
+       * through exec_predicate() together with \p predicate.
+       */
+      instruction *
+      IF(brw_predicate predicate) const
+      {
+         return exec_predicate(predicate, emit(BRW_OPCODE_IF));
+      }
+
+      /**
+       * Gen6 IF with embedded comparison.
+       */
+      instruction *
+      IF(const src_reg &src0, const src_reg &src1,
+         brw_conditional_mod condition) const
+      {
+         assert(devinfo->gen == 6);
+         /* Sequence the fix-ups: argument evaluation order is unspecified. */
+         const src_reg op0 = fix_unsigned_negate(src0);
+         const src_reg op1 = fix_unsigned_negate(src1);
+         return exec_condmod(condition,
+                             emit(BRW_OPCODE_IF, null_reg_d(), op0, op1));
+      }
+
+      /**
+       * Emit a linear interpolation of \p x and \p y with weight \p a,
+       * i.e. x*(1-a) + y*a.
+       */
+      instruction *
+      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
+          const src_reg &a) const
+      {
+         if (devinfo->gen >= 6) {
+            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
+             * we need to reorder the operands.
+             */
+            return emit(BRW_OPCODE_LRP, dst, a, y, x);
+         }
+
+         /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
+         const dst_reg ya = natural_reg(dst.type);
+         const dst_reg inv_a = natural_reg(dst.type);
+         const dst_reg x_inv_a = natural_reg(dst.type);
+
+         MUL(ya, y, a);
+         ADD(inv_a, negate(a), src_reg(1.0f));
+         MUL(x_inv_a, x, src_reg(inv_a));
+         return ADD(dst, src_reg(x_inv_a), src_reg(ya));
+      }
+
+      /**
+       * Collect a number of registers in a contiguous range of registers.
+       */
+      instruction *
+      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src, unsigned sources) const
+      {
+         /* FIXME -- Add support for the LOAD_PAYLOAD instruction in the VEC4
+          * back-end.  Until then emit one MOV per valid source and return NULL.
+          */
+         for (unsigned i = 0; i < sources; ++i) {
+            if (src[i].file == BAD_FILE)
+               continue;
+            exec_all(MOV(offset(dst, i), src[i]));
+         }
+         return NULL;
+      }
+
+      /**
+       * Debug annotation info copied into each instruction by emit().
+       * @{
+       */
+      void
+      set_annotation(const char *s) {
+         current_annotation = s;
+      }
+
+      void
+      set_base_ir(const void *ir) {
+         base_ir = ir;
+      }
+      /** @} */
+
+      const brw_device_info *const devinfo;
+
+   protected:
+      /**
+       * Workaround for negation of UD registers.  See comment in
+       * fs_generator::generate_code() for the details.  Negated UD sources
+       * are copied into a temporary by a MOV; others are returned as-is.
+       */
+      src_reg
+      fix_unsigned_negate(const src_reg &src) const
+      {
+         if (src.type != BRW_REGISTER_TYPE_UD || !src.negate)
+            return src;
+
+         const dst_reg resolved = natural_reg(BRW_REGISTER_TYPE_UD);
+         MOV(resolved, src);
+         return src_reg(resolved);
+      }
+
+      /**
+       * Workaround for register access modes not supported by the ternary
+       * instruction encoding.
+       */
+      src_reg
+      fix_3src_operand(const src_reg &src) const
+      {
+         /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
+          * able to use vertical stride of zero to replicate the vec4 uniform, like
+          *
+          *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
+          *
+          * But you can't, since vertical stride is always four in three-source
+          * instructions. Instead, insert a MOV instruction to do the replication so
+          * that the three-source instruction can consume it.
+          */
+
+         /* Only immediates and non-single-value-swizzle uniforms need the MOV. */
+         const bool needs_mov =
+            src.file == IMM ||
+            (src.file == UNIFORM && !brw_is_single_value_swizzle(src.swizzle));
+         if (!needs_mov)
+            return src;
+
+         const dst_reg replicated = natural_reg(src.type);
+         MOV(replicated, src);
+         return src_reg(replicated);
+      }
+
+      /**
+       * Workaround for register access modes not supported by the math
+       * instruction: returns \p src itself or a temporary holding a copy.
+       */
+      src_reg
+      fix_math_operand(const src_reg &src) const
+      {
+         /* The gen6 math instruction ignores the source modifiers --
+          * swizzle, abs, negate, and at least some parts of the register
+          * region description.
+          *
+          * Rather than trying to enumerate all these cases, *always* expand the
+          * operand to a temp GRF for gen6.
+          *
+          * For gen7, keep the operand as-is, except if immediate, which gen7 still
+          * can't use.
+          */
+         if (devinfo->gen == 6 || (devinfo->gen == 7 && src.file == IMM)) {
+            const dst_reg tmp = natural_reg(src.type);
+            MOV(tmp, src);
+            return src_reg(tmp);
+         } else {
+            return src;
+         }
+      }
+
+      /**
+       * Workaround other weirdness of the math instruction.
+       */
+      instruction *
+      fix_math_instruction(instruction *instr) const
+      {
+         if (devinfo->gen < 6) {
+            /* Before gen6 set up base_mrf/mlen, one register per source. */
+            instr->base_mrf = 1;
+            instr->mlen = (instr->src[1].file == BAD_FILE ? 1 : 2);
+         } else if (devinfo->gen == 6 &&
+                    instr->dst.writemask != WRITEMASK_XYZW) {
+            /* Write a fresh temporary and MOV it into the original dst. */
+            const dst_reg full = natural_reg(instr->dst.type);
+            MOV(instr->dst, src_reg(full));
+            instr->dst = full;
+         }
+         return instr;
+      }
+
+      void *const mem_ctx;
+
+      simple_allocator *alloc;
+      bblock_t *block;
+      exec_node *cursor;
+
+      /**
+       * Debug annotation info.
+       * @{
+       */
+      const char *current_annotation;
+      const void *base_ir;
+      /** @} */
+   };
+}
+
+#endif
-- 
2.3.5