[Mesa-dev] [PATCH 7/7] i965/fs: Add empirically-determined instruction latencies for gen7.

Eric Anholt eric at anholt.net
Fri Dec 7 14:58:18 PST 2012


The limited performance testing I've done on this hasn't shown any
statistically significant differences yet.
---
 .../dri/i965/brw_fs_schedule_instructions.cpp      |  150 +++++++++++++++++++-
 1 file changed, 147 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp
index 3623c13..f3f0079 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp
@@ -57,7 +57,7 @@ static bool debug = false;
 class schedule_node : public exec_node
 {
 public:
-   schedule_node(fs_inst *inst)
+   schedule_node(fs_inst *inst, int gen)
    {
       this->inst = inst;
       this->child_array_size = 0;
@@ -67,10 +67,14 @@ public:
       this->parent_count = 0;
       this->unblocked_time = 0;
 
-      set_latency_gen4();
+      if (gen >= 7)
+         set_latency_gen7();
+      else
+         set_latency_gen4();
    }
 
    void set_latency_gen4();
+   void set_latency_gen7();
 
    fs_inst *inst;
    schedule_node **children;
@@ -120,6 +124,146 @@ schedule_node::set_latency_gen4()
    }
 }
 
+void
+schedule_node::set_latency_gen7()
+{
+   switch (inst->opcode) {
+   case BRW_OPCODE_MAD:
+      /* 3 cycles (though this is sometimes reported as 4 cycles, depending on
+       * the register numbers in the sources):
+       * mad(8) g4<1>F g2.2<4,1,1>F.x  g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
+       *
+       * 20 cycles:
+       * mad(8) g4<1>F g2.2<4,1,1>F.x  g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
+       * mov(8) null   g4<4,4,1>F                      { align16 WE_normal 1Q };
+       */
+      latency = 17;
+      break;
+
+   case SHADER_OPCODE_RCP:
+      /* 2 cycles:
+       * math inv(8) g4<1>F g2<0,1,0>F      null            { align1 WE_normal 1Q };
+       *
+       * 18 cycles:
+       * math inv(8) g4<1>F g2<0,1,0>F      null            { align1 WE_normal 1Q };
+       * mov(8)      null   g4<8,8,1>F                      { align1 WE_normal 1Q };
+       *
+       * Same for exp2, log2, rsq, sqrt, sin, cos.
+       */
+      latency = 16;
+      break;
+
+   case SHADER_OPCODE_POW:
+      /* 2 cycles:
+       * math pow(8) g4<1>F g2<0,1,0>F      g2.1<0,1,0>F    { align1 WE_normal 1Q };
+       *
+       * 26 cycles:
+       * math pow(8) g4<1>F g2<0,1,0>F      g2.1<0,1,0>F    { align1 WE_normal 1Q };
+       * mov(8)      null   g4<8,8,1>F                      { align1 WE_normal 1Q };
+       */
+      latency = 24;
+      break;
+
+   case SHADER_OPCODE_TEX:
+   case SHADER_OPCODE_TXD:
+   case SHADER_OPCODE_TXF:
+   case SHADER_OPCODE_TXL:
+   case SHADER_OPCODE_TXS:
+      /* 18 cycles:
+       * mov(8)  g115<1>F   0F                              { align1 WE_normal 1Q };
+       * mov(8)  g114<1>F   0F                              { align1 WE_normal 1Q };
+       * send(8) g4<1>UW    g114<8,8,1>F
+       *   sampler (10, 0, 0, 1) mlen 2 rlen 4              { align1 WE_normal 1Q };
+       *
+       * 697 +/-49 cycles (min 610, n=26):
+       * mov(8)  g115<1>F   0F                              { align1 WE_normal 1Q };
+       * mov(8)  g114<1>F   0F                              { align1 WE_normal 1Q };
+       * send(8) g4<1>UW    g114<8,8,1>F
+       *   sampler (10, 0, 0, 1) mlen 2 rlen 4              { align1 WE_normal 1Q };
+       * mov(8)  null       g4<8,8,1>F                      { align1 WE_normal 1Q };
+       *
+       * So the latency on our first texture load of the batchbuffer takes
+       * ~700 cycles, since the caches are cold at that point.
+       *
+       * 840 +/- 92 cycles (min 720, n=25):
+       * mov(8)  g115<1>F   0F                              { align1 WE_normal 1Q };
+       * mov(8)  g114<1>F   0F                              { align1 WE_normal 1Q };
+       * send(8) g4<1>UW    g114<8,8,1>F
+       *   sampler (10, 0, 0, 1) mlen 2 rlen 4              { align1 WE_normal 1Q };
+       * mov(8)  null       g4<8,8,1>F                      { align1 WE_normal 1Q };
+       * send(8) g4<1>UW    g114<8,8,1>F
+       *   sampler (10, 0, 0, 1) mlen 2 rlen 4              { align1 WE_normal 1Q };
+       * mov(8)  null       g4<8,8,1>F                      { align1 WE_normal 1Q };
+       *
+       * On the second load, it takes just an extra ~140 cycles, and after
+       * accounting for the 14 cycles of the MOV's latency, that makes ~130.
+       *
+       * 683 +/- 49 cycles (min = 602, n=47):
+       * mov(8)  g115<1>F   0F                              { align1 WE_normal 1Q };
+       * mov(8)  g114<1>F   0F                              { align1 WE_normal 1Q };
+       * send(8) g4<1>UW    g114<8,8,1>F
+       *   sampler (10, 0, 0, 1) mlen 2 rlen 4              { align1 WE_normal 1Q };
+       * send(8) g50<1>UW   g114<8,8,1>F
+       *   sampler (10, 0, 0, 1) mlen 2 rlen 4              { align1 WE_normal 1Q };
+       * mov(8)  null       g4<8,8,1>F                      { align1 WE_normal 1Q };
+       *
+       * The unit appears to be pipelined, since this matches up with the
+       * cache-cold case, despite there being two loads here.  If you replace
+       * the g4 in the MOV to null with g50, it's still 693 +/- 52 (n=39).
+       *
+       * So, take some number between the cache-hot 140 cycles and the
+       * cache-cold 700 cycles.  No particular tuning was done on this.
+       *
+       * I haven't done significant testing of the non-TEX opcodes.  TXL at
+       * least looked about the same as TEX.
+       */
+      latency = 200;
+      break;
+
+   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
+   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
+      /* testing using varying-index pull constants:
+       *
+       * 16 cycles:
+       * mov(8)  g4<1>D  g2.1<0,1,0>F                    { align1 WE_normal 1Q };
+       * send(8) g4<1>F  g4<8,8,1>D
+       *   data (9, 2, 3) mlen 1 rlen 1                  { align1 WE_normal 1Q };
+       *
+       * ~480 cycles:
+       * mov(8)  g4<1>D  g2.1<0,1,0>F                    { align1 WE_normal 1Q };
+       * send(8) g4<1>F  g4<8,8,1>D
+       *   data (9, 2, 3) mlen 1 rlen 1                  { align1 WE_normal 1Q };
+       * mov(8)  null    g4<8,8,1>F                      { align1 WE_normal 1Q };
+       *
+       * ~620 cycles:
+       * mov(8)  g4<1>D  g2.1<0,1,0>F                    { align1 WE_normal 1Q };
+       * send(8) g4<1>F  g4<8,8,1>D
+       *   data (9, 2, 3) mlen 1 rlen 1                  { align1 WE_normal 1Q };
+       * mov(8)  null    g4<8,8,1>F                      { align1 WE_normal 1Q };
+       * send(8) g4<1>F  g4<8,8,1>D
+       *   data (9, 2, 3) mlen 1 rlen 1                  { align1 WE_normal 1Q };
+       * mov(8)  null    g4<8,8,1>F                      { align1 WE_normal 1Q };
+       *
+       * So, if it's cache-hot, it's about 140.  If it's cache cold, it's
+       * about 460.  We expect to mostly be cache hot, so pick something more
+       * in that direction.
+       */
+      latency = 200;
+      break;
+
+   default:
+      /* 2 cycles:
+       * mul(8) g4<1>F g2<0,1,0>F      0.5F            { align1 WE_normal 1Q };
+       *
+       * 16 cycles:
+       * mul(8) g4<1>F g2<0,1,0>F      0.5F            { align1 WE_normal 1Q };
+       * mov(8) null   g4<8,8,1>F                      { align1 WE_normal 1Q };
+       */
+      latency = 14;
+      break;
+   }
+}
+
 class instruction_scheduler {
 public:
    instruction_scheduler(fs_visitor *v, void *mem_ctx, int grf_count,
@@ -159,7 +303,7 @@ public:
 void
 instruction_scheduler::add_inst(fs_inst *inst)
 {
-   schedule_node *n = new(mem_ctx) schedule_node(inst);
+   schedule_node *n = new(mem_ctx) schedule_node(inst, v->intel->gen);
 
    assert(!inst->is_head_sentinel());
    assert(!inst->is_tail_sentinel());
-- 
1.7.10.4



More information about the mesa-dev mailing list