[Mesa-dev] [PATCH] i965/fs: Combine tex/fb_write operations (opt)

Ben Widawsky benjamin.widawsky at intel.com
Sun Feb 8 13:59:57 PST 2015


Certain platforms support the ability to sample from a texture, and write it out
to the file RT - thus saving a costly send instructions (and before Topi's
recent patch, excess MOVs),

On BSW, the performance data isn't quite what I was hoping for. Statistically,
it is an improvement. Several microbenchmarks improve between 0-2%, and Egypt
improves by similar amounts. No benchmarks regress with n=10. BDW and SKL also
support this, testing would be welcome.

The ministat with n=10
Egypt:
   1.94611% +/- 1.35665%

Egpyt Offscreen:
   1.2429% +/- 0.834759%

v2: Modify the algorithm. Instead of iterating in reverse through blocks and
insts, since the last block/inst is the only thing which can benefit.
Rebased on top of Ken's patching modifying is_last_send

Cc: Kenneth Graunke <kenneth at whitecape.org>
Cc: Jason Ekstrand <jason.ekstrand at intel.com>
Signed-off-by: Ben Widawsky <ben at bwidawsk.net>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp               | 101 +++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_fs.h                 |   3 +
 .../dri/i965/brw_fs_dead_code_eliminate.cpp        |   2 +-
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp     |  12 +++
 4 files changed, 117 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 78feee2..4b8fb4a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2452,6 +2452,106 @@ fs_visitor::opt_algebraic()
    return progress;
 }
 
+/* Set up the sampler message to also have EOT set */
+void
+fs_visitor::combine_send_tex(struct bblock_t *block,
+                             fs_inst *tex_inst,
+                             fs_inst *fb_write)
+{
+   assert((tex_inst->offset & (0xff << 24)) == 0);
+   assert(!tex_inst->eot); /* We can't get here twice */
+
+   fs_inst *old_load_payload = (fs_inst *) tex_inst->prev;
+   /* It's unlikely yet possible the load_payload is further above */
+   if (old_load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD) {
+      perf_debug("Potential send collapse skipped: Where's LOAD_PAYLOAD?");
+      return;
+   }
+
+   tex_inst->offset |= fb_write->target << 24;
+   tex_inst->eot = true;
+   fb_write->remove(block);
+
+   /* The texture instruction with EOT requires a header. If one is already
+    * present, there is nothing to do.
+    */
+   if (tex_inst->header_present)
+      return;
+
+   /* If a header is not present, we need to create a new LOAD_PAYLOAD command
+    * with the same sources and a space saved for the header. Using a new
+    * destination register not only makes sure we have enough space, but it will
+    * make sure the dead code eliminator kills the old command.
+    */
+   tex_inst->mlen++;
+   tex_inst->header_present = true;
+
+   fs_reg send_header = vgrf(old_load_payload->sources + 1);
+   fs_reg *new_sources =
+      ralloc_array(mem_ctx, fs_reg, old_load_payload->sources + 1);
+
+   new_sources[0] = fs_reg();
+   for (int i = 0; i < old_load_payload->sources; i++)
+      new_sources[i+1] = old_load_payload->src[i];
+
+   /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
+    * requires a lot of information about the sources to appropriately figure
+    * out the number of registers needed to be used. Given this stage in our
+    * optimization, we may not have the appropriate GRFs required by
+    * LOAD_PAYLOAD at this point (copy propogation). Therefore, we need to
+    * manually emit the instruction.
+    *
+    * jekstrand: "The helper is too automagic for what you're doing."
+    */
+   fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
+                                                    old_load_payload->exec_size,
+                                                    send_header,
+                                                    new_sources,
+                                                    old_load_payload->sources + 1);
+
+   new_load_payload->regs_written = old_load_payload->regs_written + 1;
+
+   tex_inst->insert_before(block, new_load_payload);
+   tex_inst->src[0] = send_header;
+
+}
+
+/* Try to collapse two sends into one.
+ *
+ * The first send should be sampling from a texture, the second send should be
+ * writing out the returned information to the RT.
+ */
+bool
+fs_visitor::opt_sampler_eot()
+{
+   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
+
+   if (brw->gen < 9 && !brw->is_cherryview)
+      return false;
+
+   if (key->nr_color_regions != 1)
+      return false;
+
+   fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
+   fs_inst *tex_inst = (fs_inst *) fb_write->prev;
+
+   if (tex_inst->is_head_sentinel())
+      return false;
+
+   /* It will potentially be multiple iterations of optimizations before we get
+    * the condition we wish to optimize for.
+    */
+   if (!tex_inst->is_tex())
+      return false;
+
+   assert(fb_write->eot);
+   assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
+
+   combine_send_tex(cfg->blocks[cfg->num_blocks - 1], tex_inst, fb_write);
+
+   return true;
+}
+
 bool
 fs_visitor::opt_register_renaming()
 {
@@ -3609,6 +3709,7 @@ fs_visitor::optimize()
       OPT(opt_peephole_predicated_break);
       OPT(opt_cmod_propagation);
       OPT(dead_code_eliminate);
+      OPT(opt_sampler_eot);
       OPT(opt_peephole_sel);
       OPT(dead_control_flow_eliminate, this);
       OPT(opt_register_renaming);
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index b95e2c0..c9f5ab0 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -459,6 +459,9 @@ public:
    bool compute_to_mrf();
    bool dead_code_eliminate();
    bool remove_duplicate_mrf_writes();
+
+   void combine_send_tex(bblock_t *block, fs_inst *tex_inst, fs_inst *fb_write);
+   bool opt_sampler_eot();
    bool virtual_grf_interferes(int a, int b);
    void schedule_instructions(instruction_scheduler_mode mode);
    void insert_gen4_send_dependency_workarounds();
diff --git a/src/mesa/drivers/dri/i965/brw_fs_dead_code_eliminate.cpp b/src/mesa/drivers/dri/i965/brw_fs_dead_code_eliminate.cpp
index d66808b..e5e52c8 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_dead_code_eliminate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_dead_code_eliminate.cpp
@@ -52,7 +52,7 @@ fs_visitor::dead_code_eliminate()
              sizeof(BITSET_WORD));
 
       foreach_inst_in_block_reverse(fs_inst, inst, block) {
-         if (inst->dst.file == GRF && !inst->has_side_effects()) {
+         if (!inst->eot && inst->dst.file == GRF && !inst->has_side_effects()) {
             bool result_live = false;
 
             if (inst->regs_written == 1) {
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 8cd36f8..9f2ebaf 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -503,6 +503,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
    int rlen = 4;
    uint32_t simd_mode;
    uint32_t return_format;
+   bool is_combined_send = inst->eot;
 
    switch (dst.type) {
    case BRW_REGISTER_TYPE_D:
@@ -662,6 +663,11 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
       dst = vec16(dst);
    }
 
+   if (is_combined_send) {
+      assert(brw->gen >= 9 || brw->is_cherryview);
+      rlen = 0;
+   }
+
    assert(brw->gen < 7 || !inst->header_present ||
           src.file == BRW_GENERAL_REGISTER_FILE);
 
@@ -777,6 +783,12 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
        * so has already done marking.
        */
    }
+
+   if (is_combined_send) {
+      brw_inst_set_eot(brw, brw_last_inst, true);
+      brw_set_dest(p, brw_last_inst, brw_null_reg());
+      brw_inst_set_opcode(brw, brw_last_inst, BRW_OPCODE_SENDC);
+   }
 }
 
 
-- 
2.3.0



More information about the mesa-dev mailing list