[Mesa-dev] [PATCH 1/5] i965: Add writes_accumulator flag

Juha-Pekka Heikkila juhapekka.heikkila at gmail.com
Fri Apr 4 06:51:59 PDT 2014


Our hardware has an "accumulator" register, which can be used to store
intermediate results across multiple instructions.  Many instructions
can implicitly write a value to the accumulator in addition to their
normal destination register.  This is enabled by the "AccWrEn" flag.

This patch introduces a new flag, inst->writes_accumulator, which
allows us to express the AccWrEn notion in the IR.  It also creates a
n ALU2_ACC macro to easily define emitters for instructions that
implicitly write the accumulator.

Previously, we only supported implicit accumulator writes from the
ADDC, SUBB, and MACH instructions.  We always enabled them on those
instructions, and left them disabled for other instructions.

To take advantage of the MAC (multiply-accumulate) instruction, we
need to be able to set AccWrEn on other types of instructions.

Signed-off-by: Juha-Pekka Heikkila <juhapekka.heikkila at gmail.com>
Cc: Matt Turner <mattst88 at gmail.com>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp               | 34 ++++++++++++-----
 src/mesa/drivers/dri/i965/brw_fs.h                 |  1 +
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp     |  7 +---
 .../drivers/dri/i965/brw_schedule_instructions.cpp | 44 ++++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_shader.h             |  1 +
 src/mesa/drivers/dri/i965/brw_vec4.cpp             | 33 ++++++++++------
 src/mesa/drivers/dri/i965/brw_vec4.h               |  2 +
 src/mesa/drivers/dri/i965/brw_vec4_generator.cpp   |  7 +---
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp     | 17 +++++++--
 9 files changed, 110 insertions(+), 36 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 713e477..65db469 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -64,6 +64,8 @@ fs_inst::init()
 
    /* This will be the case for almost all instructions. */
    this->regs_written = 1;
+
+   this->writes_accumulator = false;
 }
 
 fs_inst::fs_inst()
@@ -151,6 +153,15 @@ fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
    }
 
+#define ALU2_ACC(op)                                                    \
+   fs_inst *                                                            \
+   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
+   {                                                                    \
+      fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
+      inst->writes_accumulator = true;                                  \
+      return inst;                                                      \
+   }
+
 #define ALU3(op)                                                        \
    fs_inst *                                                            \
    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
@@ -166,7 +177,7 @@ ALU1(RNDE)
 ALU1(RNDZ)
 ALU2(ADD)
 ALU2(MUL)
-ALU2(MACH)
+ALU2_ACC(MACH)
 ALU2(AND)
 ALU2(OR)
 ALU2(XOR)
@@ -182,8 +193,8 @@ ALU1(FBH)
 ALU1(FBL)
 ALU1(CBIT)
 ALU3(MAD)
-ALU2(ADDC)
-ALU2(SUBB)
+ALU2_ACC(ADDC)
+ALU2_ACC(SUBB)
 ALU2(SEL)
 
 /** Gen4 predicated IF. */
@@ -501,6 +512,14 @@ fs_reg::is_valid_3src() const
    return file == GRF || file == UNIFORM;
 }
 
+bool
+fs_reg::is_accumulator() const
+{
+   return file == HW_REG &&
+          fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
+          fixed_hw_reg.nr == BRW_ARF_ACCUMULATOR;
+}
+
 int
 fs_visitor::type_size(const struct glsl_type *type)
 {
@@ -2107,16 +2126,11 @@ fs_visitor::dead_code_eliminate()
              * accumulator as a side-effect. Instead just set the destination
              * to the null register to free it.
              */
-            switch (inst->opcode) {
-            case BRW_OPCODE_ADDC:
-            case BRW_OPCODE_SUBB:
-            case BRW_OPCODE_MACH:
+            if (inst->writes_accumulator) {
                inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
-               break;
-            default:
+            } else {
                inst->remove();
                progress = true;
-               break;
             }
          }
       }
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 0d064f6..306f18c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -83,6 +83,7 @@ public:
    bool is_null() const;
    bool is_valid_3src() const;
    bool is_contiguous() const;
+   bool is_accumulator() const;
 
    fs_reg &apply_stride(unsigned stride);
    /** Smear a channel of the reg to all channels. */
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index e590bdf..1cf35b4 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -1411,6 +1411,7 @@ fs_generator::generate_code(exec_list *instructions, FILE *dump_file)
       brw_set_flag_reg(p, 0, inst->flag_subreg);
       brw_set_saturate(p, inst->saturate);
       brw_set_mask_control(p, inst->force_writemask_all);
+      brw_set_acc_write_control(p, inst->writes_accumulator);
 
       if (inst->force_uncompressed || dispatch_width == 8) {
 	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
@@ -1434,9 +1435,7 @@ fs_generator::generate_code(exec_list *instructions, FILE *dump_file)
 	 brw_AVG(p, dst, src[0], src[1]);
 	 break;
       case BRW_OPCODE_MACH:
-	 brw_set_acc_write_control(p, 1);
 	 brw_MACH(p, dst, src[0], src[1]);
-	 brw_set_acc_write_control(p, 0);
 	 break;
 
       case BRW_OPCODE_MAD:
@@ -1540,15 +1539,11 @@ fs_generator::generate_code(exec_list *instructions, FILE *dump_file)
          break;
       case BRW_OPCODE_ADDC:
          assert(brw->gen >= 7);
-         brw_set_acc_write_control(p, 1);
          brw_ADDC(p, dst, src[0], src[1]);
-         brw_set_acc_write_control(p, 0);
          break;
       case BRW_OPCODE_SUBB:
          assert(brw->gen >= 7);
-         brw_set_acc_write_control(p, 1);
          brw_SUBB(p, dst, src[0], src[1]);
-         brw_set_acc_write_control(p, 0);
          break;
 
       case BRW_OPCODE_BFE:
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index a951459..92f82fd 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -758,6 +758,7 @@ fs_instruction_scheduler::calculate_deps()
    schedule_node *last_fixed_grf_write = NULL;
    int reg_width = v->dispatch_width / 8;
 
+   schedule_node *last_accumulator_write = NULL;
    /* The last instruction always needs to still be the last
     * instruction.  Either it's flow control (IF, ELSE, ENDIF, DO,
     * WHILE) and scheduling other things after it would disturb the
@@ -822,6 +823,10 @@ fs_instruction_scheduler::calculate_deps()
 	 add_dep(last_conditional_mod[inst->flag_subreg], n);
       }
 
+      if (inst->writes_accumulator || inst->dst.is_accumulator()) {
+         add_dep(last_accumulator_write, n);
+      }
+
       /* write-after-write deps. */
       if (inst->dst.file == GRF) {
          if (post_reg_alloc) {
@@ -869,6 +874,14 @@ fs_instruction_scheduler::calculate_deps()
 	 add_dep(last_conditional_mod[inst->flag_subreg], n, 0);
 	 last_conditional_mod[inst->flag_subreg] = n;
       }
+
+      if (inst->writes_accumulator || inst->dst.is_accumulator()) {
+         /* Instruction uses accumulator register which need to be preserved
+          * across opcodes.
+          */
+         add_dep(last_accumulator_write, n);
+         last_accumulator_write = n;
+      }
    }
 
    /* bottom-to-top dependencies: WAR */
@@ -876,6 +889,7 @@ fs_instruction_scheduler::calculate_deps()
    memset(last_mrf_write, 0, sizeof(last_mrf_write));
    memset(last_conditional_mod, 0, sizeof(last_conditional_mod));
    last_fixed_grf_write = NULL;
+   last_accumulator_write = NULL;
 
    exec_node *node;
    exec_node *prev;
@@ -928,6 +942,10 @@ fs_instruction_scheduler::calculate_deps()
 	 add_dep(n, last_conditional_mod[inst->flag_subreg]);
       }
 
+      if (inst->writes_accumulator || inst->dst.is_accumulator()) {
+         add_dep(n, last_accumulator_write);
+      }
+
       /* Update the things this instruction wrote, so earlier reads
        * can mark this as WAR dependency.
        */
@@ -972,6 +990,10 @@ fs_instruction_scheduler::calculate_deps()
       if (inst->writes_flag()) {
 	 last_conditional_mod[inst->flag_subreg] = n;
       }
+
+      if (inst->writes_accumulator || inst->dst.is_accumulator()) {
+         last_accumulator_write = n;
+      }
    }
 }
 
@@ -988,6 +1010,7 @@ vec4_instruction_scheduler::calculate_deps()
     */
    schedule_node *last_fixed_grf_write = NULL;
 
+   schedule_node *last_accumulator_write = NULL;
    /* The last instruction always needs to still be the last instruction.
     * Either it's flow control (IF, ELSE, ENDIF, DO, WHILE) and scheduling
     * other things after it would disturb the basic block, or it's the EOT
@@ -1039,6 +1062,10 @@ vec4_instruction_scheduler::calculate_deps()
          add_dep(last_conditional_mod, n);
       }
 
+      if (inst->writes_accumulator || inst->dst.is_accumulator()) {
+         add_dep(last_accumulator_write, n);
+      }
+
       /* write-after-write deps. */
       if (inst->dst.file == GRF) {
          add_dep(last_grf_write[inst->dst.reg], n);
@@ -1064,6 +1091,14 @@ vec4_instruction_scheduler::calculate_deps()
          add_dep(last_conditional_mod, n, 0);
          last_conditional_mod = n;
       }
+
+      if (inst->writes_accumulator || inst->dst.is_accumulator()) {
+         /* Instruction uses accumulator register which need to be preserved
+          * across opcodes.
+          */
+         add_dep(last_accumulator_write, n);
+         last_accumulator_write = n;
+      }
    }
 
    /* bottom-to-top dependencies: WAR */
@@ -1071,6 +1106,7 @@ vec4_instruction_scheduler::calculate_deps()
    memset(last_mrf_write, 0, sizeof(last_mrf_write));
    last_conditional_mod = NULL;
    last_fixed_grf_write = NULL;
+   last_accumulator_write = NULL;
 
    exec_node *node;
    exec_node *prev;
@@ -1109,6 +1145,10 @@ vec4_instruction_scheduler::calculate_deps()
          add_dep(n, last_conditional_mod);
       }
 
+      if (inst->writes_accumulator || inst->dst.is_accumulator()) {
+         add_dep(n, last_accumulator_write);
+      }
+
       /* Update the things this instruction wrote, so earlier reads
        * can mark this as WAR dependency.
        */
@@ -1132,6 +1172,10 @@ vec4_instruction_scheduler::calculate_deps()
       if (inst->writes_flag()) {
          last_conditional_mod = n;
       }
+
+      if (inst->writes_accumulator || inst->dst.is_accumulator()) {
+         last_accumulator_write = n;
+      }
    }
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h
index 086d042..e0f6a06 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.h
+++ b/src/mesa/drivers/dri/i965/brw_shader.h
@@ -59,6 +59,7 @@ public:
 
    uint8_t predicate;
    bool predicate_inverse;
+   bool writes_accumulator; /**< instruction implicitly writes accumulator */
 };
 
 enum instruction_scheduler_mode {
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index a86d42c..9720352 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -151,6 +151,15 @@ src_reg::src_reg(dst_reg reg)
                                 swizzles[2], swizzles[3]);
 }
 
+bool
+src_reg::is_accumulator() const
+{
+   return file == HW_REG &&
+          fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
+          fixed_hw_reg.nr == BRW_ARF_ACCUMULATOR;
+}
+
+
 void
 dst_reg::init()
 {
@@ -221,6 +230,14 @@ dst_reg::is_null() const
 }
 
 bool
+dst_reg::is_accumulator() const
+{
+   return file == HW_REG &&
+          fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
+          fixed_hw_reg.nr == BRW_ARF_ACCUMULATOR;
+}
+
+bool
 vec4_instruction::is_send_from_grf()
 {
    switch (opcode) {
@@ -330,19 +347,13 @@ try_eliminate_instruction(vec4_instruction *inst, int new_writemask,
        * accumulator as a side-effect. Instead just set the destination
        * to the null register to free it.
        */
-      switch (inst->opcode) {
-      case BRW_OPCODE_ADDC:
-      case BRW_OPCODE_SUBB:
-      case BRW_OPCODE_MACH:
+
+      if (inst->writes_accumulator || inst->writes_flag()) {
          inst->dst = dst_reg(retype(brw_null_reg(), inst->dst.type));
-         break;
-      default:
-         if (inst->writes_flag()) {
-            inst->dst = dst_reg(retype(brw_null_reg(), inst->dst.type));
-         } else {
-            inst->remove();
-         }
+      } else {
+         inst->remove();
       }
+
       return true;
    } else if (inst->dst.writemask != new_writemask) {
       switch (inst->opcode) {
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index 159a5bd..b3549a5 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -128,6 +128,7 @@ public:
    bool equals(src_reg *r);
    bool is_zero() const;
    bool is_one() const;
+   bool is_accumulator() const;
 
    src_reg(class vec4_visitor *v, const struct glsl_type *type);
 
@@ -195,6 +196,7 @@ public:
    explicit dst_reg(src_reg reg);
 
    bool is_null() const;
+   bool is_accumulator() const;
 
    int writemask; /**< Bitfield of WRITEMASK_[XYZW] */
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index a74514f..5f85d31 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -971,9 +971,7 @@ vec4_generator::generate_vec4_instruction(vec4_instruction *instruction,
       brw_MUL(p, dst, src[0], src[1]);
       break;
    case BRW_OPCODE_MACH:
-      brw_set_acc_write_control(p, 1);
       brw_MACH(p, dst, src[0], src[1]);
-      brw_set_acc_write_control(p, 0);
       break;
 
    case BRW_OPCODE_MAD:
@@ -1077,15 +1075,11 @@ vec4_generator::generate_vec4_instruction(vec4_instruction *instruction,
       break;
    case BRW_OPCODE_ADDC:
       assert(brw->gen >= 7);
-      brw_set_acc_write_control(p, 1);
       brw_ADDC(p, dst, src[0], src[1]);
-      brw_set_acc_write_control(p, 0);
       break;
    case BRW_OPCODE_SUBB:
       assert(brw->gen >= 7);
-      brw_set_acc_write_control(p, 1);
       brw_SUBB(p, dst, src[0], src[1]);
-      brw_set_acc_write_control(p, 0);
       break;
 
    case BRW_OPCODE_BFE:
@@ -1317,6 +1311,7 @@ vec4_generator::generate_code(exec_list *instructions)
       brw_set_predicate_inverse(p, inst->predicate_inverse);
       brw_set_saturate(p, inst->saturate);
       brw_set_mask_control(p, inst->force_writemask_all);
+      brw_set_acc_write_control(p, inst->writes_accumulator);
 
       unsigned pre_emit_nr_insn = p->nr_insn;
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index c73e58d..2600114 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -42,6 +42,7 @@ vec4_instruction::vec4_instruction(vec4_visitor *v,
    this->force_writemask_all = false;
    this->no_dd_clear = false;
    this->no_dd_check = false;
+   this->writes_accumulator = false;
    this->conditional_mod = BRW_CONDITIONAL_NONE;
    this->sampler = 0;
    this->texture_offset = 0;
@@ -124,6 +125,16 @@ vec4_visitor::emit(enum opcode opcode)
 					   src0, src1);			\
    }
 
+#define ALU2_ACC(op)							\
+   vec4_instruction *							\
+   vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)		\
+   {									\
+      vec4_instruction *inst = new(mem_ctx) vec4_instruction(this,     \
+                       BRW_OPCODE_##op, dst, src0, src1);		\
+      inst->writes_accumulator = true;                                 \
+      return inst;                                                     \
+   }
+
 #define ALU3(op)							\
    vec4_instruction *							\
    vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
@@ -143,7 +154,7 @@ ALU1(F32TO16)
 ALU1(F16TO32)
 ALU2(ADD)
 ALU2(MUL)
-ALU2(MACH)
+ALU2_ACC(MACH)
 ALU2(AND)
 ALU2(OR)
 ALU2(XOR)
@@ -162,8 +173,8 @@ ALU1(FBH)
 ALU1(FBL)
 ALU1(CBIT)
 ALU3(MAD)
-ALU2(ADDC)
-ALU2(SUBB)
+ALU2_ACC(ADDC)
+ALU2_ACC(SUBB)
 
 /** Gen4 predicated IF. */
 vec4_instruction *
-- 
1.8.1.2



More information about the mesa-dev mailing list