[Mesa-dev] [PATCH 7/7] i965/fs: Add empirically-determined instruction latencies for gen7.

Sun Dec 9 14:45:32 PST 2012

On 12/07/2012 02:58 PM, Eric Anholt wrote:
> The limited performance testing I've done on this hasn't shown any
> statistically significant differences yet.
> ---
>   .../dri/i965/brw_fs_schedule_instructions.cpp      |  150 +++++++++++++++++++-
>   1 file changed, 147 insertions(+), 3 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp
> index 3623c13..f3f0079 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp
> @@ -57,7 +57,7 @@ static bool debug = false;
>   class schedule_node : public exec_node
>   {
>   public:
> -   schedule_node(fs_inst *inst)
> +   schedule_node(fs_inst *inst, int gen)
>      {
>         this->inst = inst;
>         this->child_array_size = 0;
> @@ -67,10 +67,14 @@ public:
>         this->parent_count = 0;
>         this->unblocked_time = 0;
>
> -      set_latency_gen4();
> +      if (gen >= 7)
> +         set_latency_gen7();
> +      else
> +         set_latency_gen4();
>      }
>
>      void set_latency_gen4();
> +   void set_latency_gen7();
>
>      fs_inst *inst;
>      schedule_node **children;
> @@ -120,6 +124,146 @@ schedule_node::set_latency_gen4()
>      }
>   }
>
> +void
> +schedule_node::set_latency_gen7()
> +{
> +   switch (inst->opcode) {
> +   case BRW_OPCODE_MAD:
> +      /* 3 cycles (this is said to be 4 cycles sometimes depending on the
> +       * register numbers in the sources):
> +       * mad(8) g4<1>F g2.2<4,1,1>F.x  g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
> +       *
> +       * 20 cycles:
> +       * mad(8) g4<1>F g2.2<4,1,1>F.x  g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
> +       * mov(8) null   g4<4,4,1>F                      { align16 WE_normal 1Q };
> +       */
> +      latency = 17;
> +      break;
> +
> +   case SHADER_OPCODE_RCP:
> +      /* 2 cycles:
> +       * math inv(8) g4<1>F g2<0,1,0>F      null            { align1 WE_normal 1Q };
> +       *
> +       * 18 cycles:
> +       * math inv(8) g4<1>F g2<0,1,0>F      null            { align1 WE_normal 1Q };
> +       * mov(8)      null   g4<8,8,1>F                      { align1 WE_normal 1Q };
> +       *
> +       * Same for exp2, log2, rsq, sqrt, sin, cos.
> +       */
> +      latency = 16;
> +      break;
> +
> +   case SHADER_OPCODE_POW:
> +      /* 2 cycles:
> +       * math pow(8) g4<1>F g2<0,1,0>F      g2.1<0,1,0>F    { align1 WE_normal 1Q };
> +       *
> +       * 26 cycles:
> +       * math pow(8) g4<1>F g2<0,1,0>F      g2.1<0,1,0>F    { align1 WE_normal 1Q };
> +       * mov(8)      null   g4<8,8,1>F                      { align1 WE_normal 1Q };
> +       */
> +      latency = 24;
> +      break;
> +
> +   case SHADER_OPCODE_TEX:
> +   case SHADER_OPCODE_TXD:
> +   case SHADER_OPCODE_TXF:
> +   case SHADER_OPCODE_TXL:
> +   case SHADER_OPCODE_TXS:
> +      /* 18 cycles:
> +       * mov(8)  g115<1>F   0F                              { align1 WE_normal 1Q };
> +       * mov(8)  g114<1>F   0F                              { align1 WE_normal 1Q };
> +       * send(8) g4<1>UW    g114<8,8,1>F
> +       *   sampler (10, 0, 0, 1) mlen 2 rlen 4              { align1 WE_normal 1Q };
> +       *
> +       * 697 +/-49 cycles (min 610, n=26):
> +       * mov(8)  g115<1>F   0F                              { align1 WE_normal 1Q };
> +       * mov(8)  g114<1>F   0F                              { align1 WE_normal 1Q };
> +       * send(8) g4<1>UW    g114<8,8,1>F
> +       *   sampler (10, 0, 0, 1) mlen 2 rlen 4              { align1 WE_normal 1Q };
> +       * mov(8)  null       g4<8,8,1>F                      { align1 WE_normal 1Q };
> +       *
> +       * So the latency on our first texture load of the batchbuffer takes
> +       * ~700 cycles, since the caches are cold at that point.
> +       *
> +       * 840 +/- 92 cycles (min 720, n=25):
> +       * mov(8)  g115<1>F   0F                              { align1 WE_normal 1Q };
> +       * mov(8)  g114<1>F   0F                              { align1 WE_normal 1Q };
> +       * send(8) g4<1>UW    g114<8,8,1>F
> +       *   sampler (10, 0, 0, 1) mlen 2 rlen 4              { align1 WE_normal 1Q };
> +       * mov(8)  null       g4<8,8,1>F                      { align1 WE_normal 1Q };
> +       * send(8) g4<1>UW    g114<8,8,1>F
> +       *   sampler (10, 0, 0, 1) mlen 2 rlen 4              { align1 WE_normal 1Q };
> +       * mov(8)  null       g4<8,8,1>F                      { align1 WE_normal 1Q };
> +       *
> +       * On the second load, it takes just an extra ~140 cycles, and after
> +       * accounting for the 14 cycles of the MOV's latency, that makes ~130.
> +       *
> +       * 683 +/- 49 cycles (min = 602, n=47):
> +       * mov(8)  g115<1>F   0F                              { align1 WE_normal 1Q };
> +       * mov(8)  g114<1>F   0F                              { align1 WE_normal 1Q };
> +       * send(8) g4<1>UW    g114<8,8,1>F
> +       *   sampler (10, 0, 0, 1) mlen 2 rlen 4              { align1 WE_normal 1Q };
> +       * send(8) g50<1>UW   g114<8,8,1>F
> +       *   sampler (10, 0, 0, 1) mlen 2 rlen 4              { align1 WE_normal 1Q };
> +       * mov(8)  null       g4<8,8,1>F                      { align1 WE_normal 1Q };
> +       *
> +       * The unit appears to be pipelined, since this matches up with the
> +       * cache-cold case, despite there being two loads here.  If you replace
> +       * the g4 in the MOV to null with g50, it's still 693 +/- 52 (n=39).
> +       *
> +       * So, take some number between the cache-hot 140 cycles and the
> +       * cache-cold 700 cycles.  No particular tuning was done on this.
> +       *
> +       * I haven't done significant testing of the non-TEX opcodes.  TXL at
> +       * least looked about the same as TEX.
> +       */
> +      latency = 200;
> +      break;
> +
> +   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
> +   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
> +      /* testing using varying-index pull constants:
> +       *
> +       * 16 cycles:
> +       * mov(8)  g4<1>D  g2.1<0,1,0>F                    { align1 WE_normal 1Q };
> +       * send(8) g4<1>F  g4<8,8,1>D
> +       *   data (9, 2, 3) mlen 1 rlen 1                  { align1 WE_normal 1Q };
> +       *
> +       * ~480 cycles:
> +       * mov(8)  g4<1>D  g2.1<0,1,0>F                    { align1 WE_normal 1Q };
> +       * send(8) g4<1>F  g4<8,8,1>D
> +       *   data (9, 2, 3) mlen 1 rlen 1                  { align1 WE_normal 1Q };
> +       * mov(8)  null    g4<8,8,1>F                      { align1 WE_normal 1Q };
> +       *
> +       * ~620 cycles:
> +       * mov(8)  g4<1>D  g2.1<0,1,0>F                    { align1 WE_normal 1Q };
> +       * send(8) g4<1>F  g4<8,8,1>D
> +       *   data (9, 2, 3) mlen 1 rlen 1                  { align1 WE_normal 1Q };
> +       * mov(8)  null    g4<8,8,1>F                      { align1 WE_normal 1Q };
> +       * send(8) g4<1>F  g4<8,8,1>D
> +       *   data (9, 2, 3) mlen 1 rlen 1                  { align1 WE_normal 1Q };
> +       * mov(8)  null    g4<8,8,1>F                      { align1 WE_normal 1Q };
> +       *
> +       * So, if it's cache-hot, it's about 140.  If it's cache cold, it's
> +       * about 460.  We expect to mostly be cache hot, so pick something more
> +       * in that direction.
> +       */
> +      latency = 200;
> +      break;

Painful.  Your "we expect to mostly be cache hot" comment makes sense,
except that Ivy Bridge's caches perform terribly when the same cacheline
is accessed again within 16 cycles or so.

I'd really love to see some timing data on using LD messages (to make use
of the L1 and L2 caches).  See my old patch that we couldn't justify:

i965/fs: Use the sampler for FS pull constant loading on Ivybridge.

> +   default:
> +      /* 2 cycles:
> +       * mul(8) g4<1>F g2<0,1,0>F      0.5F            { align1 WE_normal 1Q };
> +       *
> +       * 16 cycles:
> +       * mul(8) g4<1>F g2<0,1,0>F      0.5F            { align1 WE_normal 1Q };
> +       * mov(8) null   g4<8,8,1>F                      { align1 WE_normal 1Q };
> +       */
> +      latency = 14;
> +      break;
> +   }
> +}
> +
>   class instruction_scheduler {
>   public:
>      instruction_scheduler(fs_visitor *v, void *mem_ctx, int grf_count,
> @@ -159,7 +303,7 @@ public:
>   void
>   instruction_scheduler::add_inst(fs_inst *inst)
>   {
> -   schedule_node *n = new(mem_ctx) schedule_node(inst);
> +   schedule_node *n = new(mem_ctx) schedule_node(inst, v->intel->gen);
>
>      assert(!inst->is_head_sentinel());
>      assert(!inst->is_tail_sentinel());
>