<div dir="ltr">On 21 October 2013 17:48, Eric Anholt <span dir="ltr"><<a href="mailto:eric@anholt.net" target="_blank">eric@anholt.net</a>></span> wrote:<br><div class="gmail_extra"><div class="gmail_quote"><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">
This avoids a lot of message setup we had to do otherwise.  Improves<br>
GLB2.7 performance with register spilling force enabled by 1.6442% +/-<br>
0.553218% (n=4).<br>
---<br>
 src/mesa/drivers/dri/i965/brw_defines.h            |  7 ++++<br>
 src/mesa/drivers/dri/i965/brw_eu.h                 |  5 +++<br>
 src/mesa/drivers/dri/i965/brw_eu_emit.c            | 41 ++++++++++++++++++++++<br>
 src/mesa/drivers/dri/i965/brw_fs.h                 |  1 +<br>
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp     | 10 ++++++<br>
 src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp  | 21 +++++++----<br>
 .../drivers/dri/i965/brw_schedule_instructions.cpp | 12 +++++++<br>
 src/mesa/drivers/dri/i965/brw_shader.cpp           |  2 ++<br>
 8 files changed, 93 insertions(+), 6 deletions(-)<br>
<br>
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h<br>
index 72a0891..276ab44 100644<br>
--- a/src/mesa/drivers/dri/i965/brw_defines.h<br>
+++ b/src/mesa/drivers/dri/i965/brw_defines.h<br>
@@ -776,6 +776,7 @@ enum opcode {<br>
<br>
    SHADER_OPCODE_GEN4_SCRATCH_READ,<br>
    SHADER_OPCODE_GEN4_SCRATCH_WRITE,<br>
+   SHADER_OPCODE_GEN7_SCRATCH_READ,<br>
<br>
    FS_OPCODE_DDX,<br>
    FS_OPCODE_DDY,<br>
@@ -1135,6 +1136,12 @@ enum brw_message_target {<br>
 #define GEN7_DATAPORT_DC_BYTE_SCATTERED_WRITE                       12<br>
 #define GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE                      13<br>
<br>
+#define GEN7_DATAPORT_SCRATCH_READ                            ((1 << 18) | \<br>
+                                                               (0 << 17))<br>
+#define GEN7_DATAPORT_SCRATCH_WRITE                           ((1 << 18) | \<br>
+                                                               (1 << 17))<br>
+#define GEN7_DATAPORT_SCRATCH_NUM_REGS_SHIFT                        12<br>
+<br>
 /* HSW */<br>
 #define HSW_DATAPORT_DC_PORT0_OWORD_BLOCK_READ                      0<br>
 #define HSW_DATAPORT_DC_PORT0_UNALIGNED_OWORD_BLOCK_READ            1<br>
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h<br>
index 072310d..a307948 100644<br>
--- a/src/mesa/drivers/dri/i965/brw_eu.h<br>
+++ b/src/mesa/drivers/dri/i965/brw_eu.h<br>
@@ -379,6 +379,11 @@ void brw_oword_block_write_scratch(struct brw_compile *p,<br>
                                   int num_regs,<br>
                                   GLuint offset);<br>
<br>
+void gen7_block_read_scratch(struct brw_compile *p,<br>
+                             struct brw_reg dest,<br>
+                             int num_regs,<br>
+                             GLuint offset);<br>
+<br>
 void brw_shader_time_add(struct brw_compile *p,<br>
                          struct brw_reg payload,<br>
                          uint32_t surf_index);<br>
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c<br>
index 8efd679..accf324 100644<br>
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c<br>
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c<br>
@@ -2055,6 +2055,47 @@ brw_oword_block_read_scratch(struct brw_compile *p,<br>
    }<br>
 }<br>
<br>
+void<br>
+gen7_block_read_scratch(struct brw_compile *p,<br>
+                        struct brw_reg dest,<br>
+                        int num_regs,<br>
+                        GLuint offset)<br>
+{<br>
+   dest = retype(dest, BRW_REGISTER_TYPE_UW);<br>
+<br>
+   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);<br>
+<br>
+   assert(insn->header.predicate_control == 0);<br></blockquote><div><br></div><div>Can we please use BRW_PREDICATE_NONE instead of 0 here?<br></div><div> </div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">

+   insn->header.compression_control = BRW_COMPRESSION_NONE;<br>
+<br>
+   brw_set_dest(p, insn, dest);<br>
+<br>
+   /* The HW requires that the header is present; this is to get the g0.5<br>
+    * scratch offset.<br>
+    */<br>
+   bool header_present = true;<br>
+   brw_set_src0(p, insn, brw_vec8_grf(0, 0));<br>
+<br>
+   brw_set_message_descriptor(p, insn,<br>
+                              GEN7_SFID_DATAPORT_DATA_CACHE,<br>
+                              1, /* mlen: just g0 */<br>
+                              num_regs,<br>
+                              header_present,<br>
+                              false);<br>
+<br>
+   insn->bits3.ud |= GEN7_DATAPORT_SCRATCH_READ;<br>
+<br>
+   assert(num_regs == 1 || num_regs == 2 || num_regs == 4);<br>
+   insn->bits3.ud |= (num_regs - 1) << GEN7_DATAPORT_SCRATCH_NUM_REGS_SHIFT;<br>
+<br>
+   /* The "HWORD" unit in the docs just happens to mean "the size of a<br>
+    * register"<br>
+    */<br></blockquote><div><br></div><div>I had to search the docs for a while to understand what this comment was referring to.  How about saying something like this instead?<br><br>   /* According to the docs, offset is "A 12-bit HWord offset into the memory<br>
    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD<br>    * is 32 bytes, which happens to be the size of a register.<br>    */<br><br></div><div> </div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">

+   offset /= REG_SIZE;<br>
+   assert(offset < (1 << 12));<br>
+   insn->bits3.ud |= offset;<br>
+}<br>
+<br>
 /**<br>
  * Read a float[4] vector from the data port Data Cache (const buffer).<br>
  * Location (in buffer) should be a multiple of 16.<br>
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h<br>
index f9c87c7..432f3df 100644<br>
--- a/src/mesa/drivers/dri/i965/brw_fs.h<br>
+++ b/src/mesa/drivers/dri/i965/brw_fs.h<br>
@@ -521,6 +521,7 @@ private:<br>
                      bool negate_value);<br>
    void generate_scratch_write(fs_inst *inst, struct brw_reg src);<br>
    void generate_scratch_read(fs_inst *inst, struct brw_reg dst);<br>
+   void generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst);<br>
    void generate_uniform_pull_constant_load(fs_inst *inst, struct brw_reg dst,<br>
                                             struct brw_reg index,<br>
                                             struct brw_reg offset);<br>
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp<br>
index 6aebc41..fa8bccd 100644<br>
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp<br>
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp<br>
@@ -732,6 +732,12 @@ fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)<br>
 }<br>
<br>
 void<br>
+fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst)<br>
+{<br>
+   gen7_block_read_scratch(p, dst, dispatch_width / 8, inst->offset);<br>
+}<br>
+<br>
+void<br>
 fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,<br>
                                                   struct brw_reg dst,<br>
                                                   struct brw_reg index,<br>
@@ -1517,6 +1523,10 @@ fs_generator::generate_code(exec_list *instructions)<br>
         generate_scratch_read(inst, dst);<br>
         break;<br>
<br>
+      case SHADER_OPCODE_GEN7_SCRATCH_READ:<br>
+        generate_scratch_read_gen7(inst, dst);<br>
+        break;<br>
+<br>
       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:<br>
         generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);<br>
         break;<br>
diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp<br>
index 75090a6..a78c1af 100644<br>
--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp<br>
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp<br>
@@ -527,17 +527,25 @@ fs_visitor::emit_unspill(fs_inst *inst, fs_reg dst, uint32_t spill_offset,<br>
                          int count)<br>
 {<br>
    for (int i = 0; i < count; i++) {<br>
+      /* The gen7 descriptor-based offset is 12 bits of HWORD units. */<br>
+      bool gen7_read = brw->gen >= 7 && spill_offset < (1 << 12) * REG_SIZE;<br>
+<br>
       fs_inst *unspill_inst =<br>
-         new(mem_ctx) fs_inst(SHADER_OPCODE_GEN4_SCRATCH_READ, dst);<br>
+         new(mem_ctx) fs_inst(gen7_read ?<br>
+                              SHADER_OPCODE_GEN7_SCRATCH_READ :<br>
+                              SHADER_OPCODE_GEN4_SCRATCH_READ,<br>
+                              dst);<br>
       unspill_inst->offset = spill_offset;<br>
       unspill_inst->ir = inst->ir;<br>
       unspill_inst->annotation = inst->annotation;<br>
<br>
-      /* Choose a MRF that won't conflict with an MRF that's live across the<br>
-       * spill.  Nothing else will make it up to MRF 14/15.<br>
-       */<br>
-      unspill_inst->base_mrf = 14;<br>
-      unspill_inst->mlen = 1; /* header contains offset */<br>
+      if (!gen7_read) {<br>
+         /* Choose a MRF that won't conflict with an MRF that's live across<br>
+          * the spill.  Nothing else will make it up to MRF 14/15.<br>
+          */<br>
+         unspill_inst->base_mrf = 14;<br>
+         unspill_inst->mlen = 1; /* header contains offset */<br>
+      }<br>
       inst->insert_before(unspill_inst);<br>
<br>
       dst.reg_offset++;<br>
@@ -605,6 +613,7 @@ fs_visitor::choose_spill_reg(struct ra_graph *g)<br>
         break;<br>
<br>
       case SHADER_OPCODE_GEN4_SCRATCH_READ:<br>
+      case SHADER_OPCODE_GEN7_SCRATCH_READ:<br>
         if (inst->dst.file == GRF)<br>
            no_spill[inst->dst.reg] = true;<br>
         break;<br>
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp<br>
index 9a480b4..a4e0be3 100644<br>
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp<br>
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp<br>
@@ -329,6 +329,18 @@ schedule_node::set_latency_gen7(bool is_haswell)<br>
       latency = 200;<br>
       break;<br>
<br>
+   case SHADER_OPCODE_GEN7_SCRATCH_READ:<br>
+      /* Testing a load from offset 0, that had been previously written:<br>
+       *<br>
+       * send(8) g114<1>UW g0<8,8,1>F data (0, 0, 0) mlen 1 rlen 1 { align1 WE_normal 1Q };<br>
+       * mov(8)  null      g114<8,8,1>F { align1 WE_normal 1Q };<br>
+       *<br>
+       * The cycles spent seemed to be grouped around 40-50 (as low as 38),<br>
+       * then around 140.  Presumably this is cache hit vs miss.<br>
+       */<br>
+      latency = 50;<br>
+      break;<br>
+<br>
    default:<br>
       /* 2 cycles:<br>
        * mul(8) g4<1>F g2<0,1,0>F      0.5F            { align1 WE_normal 1Q };<br>
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp<br>
index 19421ba..81930ae 100644<br>
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp<br>
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp<br>
@@ -442,6 +442,8 @@ brw_instruction_name(enum opcode op)<br>
       return "gen4_scratch_read";<br>
    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:<br>
       return "gen4_scratch_write";<br>
+   case SHADER_OPCODE_GEN7_SCRATCH_READ:<br>
+      return "gen7_scratch_read";<br>
<br>
    case FS_OPCODE_DDX:<br>
       return "ddx";<br>
<span class=""><font color="#888888">--<br>
1.8.4.rc3<br></font></span></blockquote><div><br></div><div>With those changes, this patch is:<br><br>Reviewed-by: Paul Berry <<a href="mailto:stereotype441@gmail.com">stereotype441@gmail.com</a>><br><br></div><div>
I sent a comment on patch 2 earlier.  The remainder are:<br><br>Reviewed-by: Paul Berry <<a href="mailto:stereotype441@gmail.com">stereotype441@gmail.com</a>><br></div></div></div></div>