Mesa (master): i965/gen7: Add instruction latency estimates for untyped atomics and reads.

Francisco Jerez currojerez at kemper.freedesktop.org
Mon Nov 4 20:15:08 UTC 2013


Module: Mesa
Branch: master
Commit: 35fe7ed7d3c45f5263ae42bcedecc00ba6adf91d
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=35fe7ed7d3c45f5263ae42bcedecc00ba6adf91d

Author: Francisco Jerez <currojerez at riseup.net>
Date:   Fri Nov  1 11:29:13 2013 -0700

i965/gen7: Add instruction latency estimates for untyped atomics and reads.

The latency information has been obtained empirically from
measurements taken on Haswell and Ivy Bridge.

Acked-by: Paul Berry <stereotype441 at gmail.com>
Reviewed-by: Matt Turner <mattst88 at gmail.com>

---

 .../drivers/dri/i965/brw_schedule_instructions.cpp |   41 ++++++++++++++++++++
 1 files changed, 41 insertions(+), 0 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index fe0de08..5a42513 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -352,6 +352,47 @@ schedule_node::set_latency_gen7(bool is_haswell)
        * then around 140.  Presumably this is cache hit vs miss.
        */
      latency = 50;
+      break;
+
+   case SHADER_OPCODE_UNTYPED_ATOMIC:
+      /* Test code:
+       *   mov(8)    g112<1>ud       0x00000000ud       { align1 WE_all 1Q };
+       *   mov(1)    g112.7<1>ud     g1.7<0,1,0>ud      { align1 WE_all };
+       *   mov(8)    g113<1>ud       0x00000000ud       { align1 WE_normal 1Q };
+       *   send(8)   g4<1>ud         g112<8,8,1>ud
+       *             data (38, 5, 6) mlen 2 rlen 1      { align1 WE_normal 1Q };
+       *
+       * Running it 100 times as fragment shader on a 128x128 quad
+       * gives an average latency of 13867 cycles per atomic op,
+       * standard deviation 3%.  Note that this is a rather
+       * pessimistic estimate, the actual latency in cases with few
+       * collisions between threads and favorable pipelining has been
+       * seen to be reduced by a factor of 100.
+       */
+      latency = 14000;
+      break;
+
+   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+      /* Test code:
+       *   mov(8)    g112<1>UD       0x00000000UD       { align1 WE_all 1Q };
+       *   mov(1)    g112.7<1>UD     g1.7<0,1,0>UD      { align1 WE_all };
+       *   mov(8)    g113<1>UD       0x00000000UD       { align1 WE_normal 1Q };
+       *   send(8)   g4<1>UD         g112<8,8,1>UD
+       *             data (38, 6, 5) mlen 2 rlen 1      { align1 WE_normal 1Q };
+       *   .
+       *   . [repeats 8 times]
+       *   .
+       *   mov(8)    g112<1>UD       0x00000000UD       { align1 WE_all 1Q };
+       *   mov(1)    g112.7<1>UD     g1.7<0,1,0>UD      { align1 WE_all };
+       *   mov(8)    g113<1>UD       0x00000000UD       { align1 WE_normal 1Q };
+       *   send(8)   g4<1>UD         g112<8,8,1>UD
+       *             data (38, 6, 5) mlen 2 rlen 1      { align1 WE_normal 1Q };
+       *
+       * Running it 100 times as fragment shader on a 128x128 quad
+       * gives an average latency of 583 cycles per surface read,
+       * standard deviation 0.9%.
+       */
+      latency = is_haswell ? 300 : 600;
       break;
 
    default:
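
For context, brw_schedule_instructions.cpp implements a list scheduler, and per-opcode latency estimates like the ones added in this commit feed its critical-path heuristic. Below is a minimal, self-contained C++ sketch (not Mesa's actual code; the node and compute_delay names, the toy dependency graph, and the 16-cycle ALU figure are illustrative assumptions) showing how a latency-weighted critical path causes a long-latency operation such as an untyped atomic to be ranked for issue as early as possible.

   #include <algorithm>
   #include <vector>

   /* Illustrative only; not the Mesa scheduler.  Each node carries a
    * latency estimate (in cycles) for the instruction producing its
    * value, plus the nodes that consume that value.
    */
   struct node {
      int latency = 0;
      int delay = -1;                 /* memoized critical path to the end */
      std::vector<node *> children;
   };

   /* Longest latency-weighted path from n to any leaf.  A 14000-cycle
    * untyped atomic anywhere on a path dominates the result, so a greedy
    * scheduler that ranks ready instructions by this delay will tend to
    * start the atomic early and hide its latency behind other work.
    */
   static int compute_delay(node *n)
   {
      if (n->delay >= 0)
         return n->delay;

      int worst = 0;
      for (node *c : n->children)
         worst = std::max(worst, compute_delay(c));

      n->delay = n->latency + worst;
      return n->delay;
   }

   int main()
   {
      node use;                                    /* common consumer */
      node atomic_op;  atomic_op.latency = 14000;  /* estimate from this commit */
      node mul_op;     mul_op.latency = 16;        /* hypothetical cheap ALU op */

      atomic_op.children.push_back(&use);
      mul_op.children.push_back(&use);

      /* The atomic dominates the critical path and would be issued first. */
      return compute_delay(&atomic_op) > compute_delay(&mul_op) ? 0 : 1;
   }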



