Mesa (master): i965: Add optimization pass to let us use the replicate data message

Fri Aug 15 18:28:19 UTC 2014

Module: Mesa
Branch: master
Commit: f9dc7aabb3273d6d8a54c6778a5695a8527f4454
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=f9dc7aabb3273d6d8a54c6778a5695a8527f4454

Author: Kristian Høgsberg <krh at bitplanet.net>
Date:   Mon Jul  7 15:27:17 2014 -0700

i965: Add optimization pass to let us use the replicate data message

The data port has a SIMD16 'replicate data' message, which lets us write
the same color for all 16 pixels by sending the four floats in the
lower half of a register instead of sending 4 times 16 identical
component values in 8 registers.

The message comes with a lot of restrictions and could be made generally
useful by recognizing when those restrictions are satisfied.  For now,
this lets us enable the optimization when we know it's safe, but we don't
enable it by default.  The optimization works for simple color clear shaders
only, but does recognize and support multiple render targets.

Signed-off-by: Kristian Høgsberg <krh at bitplanet.net>
Acked-by: Kenneth Graunke <kenneth at whitecape.org>

---

 src/mesa/drivers/dri/i965/brw_context.h        |    1 +
 src/mesa/drivers/dri/i965/brw_defines.h        |    1 +
 src/mesa/drivers/dri/i965/brw_fs.cpp           |   97 ++++++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_fs.h             |    3 +
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp |    5 +-
 5 files changed, 106 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 1933182..a8f5d0f 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1034,6 +1034,7 @@ struct brw_context
    bool has_negative_rhw_bug;
    bool has_pln;
    bool no_simd8;
+   bool use_rep_send;
 
    /**
     * Some versions of Gen hardware don't do centroid interpolation correctly
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 88f2057..248a866 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -850,6 +850,7 @@ enum opcode {
     */
    FS_OPCODE_FB_WRITE = 128,
    FS_OPCODE_BLORP_FB_WRITE,
+   FS_OPCODE_REP_FB_WRITE,
    SHADER_OPCODE_RCP,
    SHADER_OPCODE_RSQ,
    SHADER_OPCODE_SQRT,
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 565189b..f1d3fb8 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2284,6 +2284,100 @@ fs_visitor::compute_to_mrf()
 }
 
 /**
+ * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
+ * instructions to FS_OPCODE_REP_FB_WRITE.
+ */
+void
+fs_visitor::try_rep_send()
+{
+   int i, count;
+   fs_inst *start = NULL;
+
+   /* From the Ivybridge PRM, Volume 4 Part 1, section 3.9.11.2
+    * ("Message Descriptor - Render Target Write"):
+    *
+    * "SIMD16_REPDATA message must not be used in SIMD8 pixel-shaders."
+    */
+   if (dispatch_width != 16)
+      return;
+
+   /* The constant color write message can't handle anything but the 4 color
+    * values, so bail if the shader writes depth, the payload has
+    * aa_dest_stencil_reg or dest_depth_reg set, or dual_src_output is used.
+    * It also requires that the render target be tiled, which might not be the
+    * case for some EGLImage paths or if we some day do rendering to PBOs.
+    */
+   if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH) ||
+       payload.aa_dest_stencil_reg ||
+       payload.dest_depth_reg ||
+       dual_src_output.file != BAD_FILE)
+      return;
+
+   /* The optimization is implemented as one pass through the instruction
+    * list.  We keep track of the most recent block of MOVs into sequential
+    * MRFs from single, sequential float registers (i.e. uniforms).  Then when
+    * we find an FB_WRITE opcode, we see if the payload registers match the
+    * destination registers in our block of MOVs.
+    */
+   count = 0;
+   foreach_in_list_safe(fs_inst, inst, &this->instructions) {
+      if (count == 0)
+         start = inst;
+      if (inst->opcode == BRW_OPCODE_MOV &&
+	  inst->dst.file == MRF &&
+          inst->dst.reg == start->dst.reg + 2 * count &&
+          inst->src[0].file == HW_REG &&
+          inst->src[0].reg_offset == start->src[0].reg_offset + count) {
+         if (count == 0)
+            start = inst;
+         count++;
+      }
+
+      if (inst->opcode == FS_OPCODE_FB_WRITE &&
+          count == 4 &&
+          (inst->base_mrf == start->dst.reg ||
+           (inst->base_mrf + 2 == start->dst.reg && inst->header_present))) {
+         fs_inst *mov = MOV(start->dst, start->src[0]);
+
+         /* Make a MOV that moves the four floats into the replicated write
+          * payload.  Since we're running at the very end of code generation
+          * we can use hw registers and generate the stride and offsets we
+          * need for this MOV.  We use the first of the eight registers
+          * allocated for the SIMD16 payload for the four floats.
+          */
+         mov->dst.fixed_hw_reg =
+            brw_vec4_reg(BRW_MESSAGE_REGISTER_FILE,
+                         start->dst.reg, 0);
+         mov->dst.file = HW_REG;
+         mov->dst.type = mov->dst.fixed_hw_reg.type;
+
+         mov->src[0].fixed_hw_reg =
+            brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
+         mov->src[0].file = HW_REG;
+         mov->src[0].type = mov->src[0].fixed_hw_reg.type;
+         mov->force_writemask_all = true;
+         mov->dst.type = BRW_REGISTER_TYPE_F;
+
+         /* Remove the 4 instructions after mov (the MOVs, if contiguous). */
+         start->insert_before(mov);
+         for (i = 0; i < 4; i++)
+            mov->next->remove();
+
+         /* Finally, adjust the message length and set the opcode to
+          * REP_FB_WRITE for the send, so that the generator will use the
+          * replicated data message type.  Then reset count so we'll start
+          * looking for a new block in case we're in a MRT shader.
+          */
+         inst->opcode = FS_OPCODE_REP_FB_WRITE;
+         inst->mlen -= 7;
+         count = 0;
+      }
+   }
+
+   return;
+}
+
+/**
  * Walks through basic blocks, looking for repeated MRF writes and
  * removing the later ones.
  */
@@ -3226,6 +3320,9 @@ fs_visitor::run()
       prog_data->total_scratch = brw_get_scratch_size(last_scratch);
    }
 
+   if (brw->use_rep_send)
+      try_rep_send();
+
    if (dispatch_width == 8)
       prog_data->reg_blocks = brw_register_blocks(grf_used);
    else
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 0f8fb2d..9e5b5d7 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -355,6 +355,8 @@ public:
    void lower_uniform_pull_constant_loads();
    bool lower_load_payload();
 
+   void try_rep_send();
+
    void push_force_uncompressed();
    void pop_force_uncompressed();
 
@@ -590,6 +592,7 @@ private:
                       GLuint nr);
    void generate_fb_write(fs_inst *inst);
    void generate_blorp_fb_write(fs_inst *inst);
+   void generate_rep_fb_write(fs_inst *inst);
    void generate_pixel_xy(struct brw_reg dst, bool is_x);
    void generate_linterp(fs_inst *inst, struct brw_reg dst,
 			 struct brw_reg *src);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 1cf5a88..a243003 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -115,7 +115,9 @@ fs_generator::fire_fb_write(fs_inst *inst,
       brw_pop_insn_state(p);
    }
 
-   if (prog_data->dual_src_blend)
+   if (inst->opcode == FS_OPCODE_REP_FB_WRITE)
+      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
+   else if (prog_data->dual_src_blend)
       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
    else if (dispatch_width == 16)
       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
@@ -1839,6 +1841,7 @@ fs_generator::generate_code(exec_list *instructions)
 	 generate_varying_pull_constant_load_gen7(inst, dst, src[0], src[1]);
 	 break;
 
+      case FS_OPCODE_REP_FB_WRITE:
       case FS_OPCODE_FB_WRITE:
 	 generate_fb_write(inst);
 	 break;