[Mesa-dev] [PATCH 41/47] i965/fs: Add reuse_16bit_conversions_register optimization

Thu Aug 24 13:54:56 UTC 2017

When dealing with HF/U/UW, it is usual having a register with a
F/D/UD, and then convert it to HF/U/UW, and not use again the F/D/UD
value. In those cases it would be possible to reuse the register where
the F value is initially stored instead of having two. Take also into
account that when operating with HF/U/UW, you would need to use the
full register (so stride 2). Packs/unpacks would be only useful when
loading/storing several HF/W/UW.

Note that no instruction is removed. The main benefict is reducing the
amoung of registers used, so the pressure on the register allocator is
decreased with big shaders.

Possibly this could be integrated into an existing optimization, at it
is even already done by the register allocator, but was far easier to
write and cleaner to read as a separate optimization.

We found this issue when dealing with some Vulkan CTS tests that
needed several minutes to compile. Most of the time was spent on the
register allocator.

Right now the optimization only handles 32 to 16 bit conversion. It
could be possible to do the equivalent for 16 to 32 bit too, but in
practice, we didn't need it.
---
 src/intel/compiler/brw_fs.cpp | 77 +++++++++++++++++++++++++++++++++++++++++++
 src/intel/compiler/brw_fs.h   |  1 +
 2 files changed, 78 insertions(+)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index b6013a5ce85..1342150b44e 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -39,6 +39,7 @@
 #include "compiler/glsl_types.h"
 #include "compiler/nir/nir_builder.h"
 #include "program/prog_parameter.h"
+#include "brw_fs_live_variables.h"
 
 using namespace brw;
 
@@ -3133,6 +3134,81 @@ fs_visitor::remove_extra_rounding_modes()
    return progress;
 }
 
+/**
+ * When dealing with HF/W/UW, it is usual having a register with a F/D/UD, and
+ * then convert it to HF/W/UW, and not use again the F/D/UD value. In those
+ * cases it would be possible to reuse the register where the F value is
+ * initially stored instead of having two. Take also into account that when
+ * operating with HF/W/UW, you would need to use the full register (so stride
+ * 2). Packs/unpacks would be only useful when loading/storing several
+ * HF/W/UWs.
+ *
+ * So something like this:
+ *  mov(8) vgrf14<2>:HF, vgrf39:F
+ *
+ * Became:
+ *  mov(8) vgrf39<2>:HF, vgrf39:F
+ *
+ * Note that no instruction is removed. The main benefict is reducing the
+ * amoung of registers used, so the pressure on the register allocator is
+ * decreased with big shaders.
+ */
+bool
+fs_visitor::reuse_16bit_conversions_vgrf()
+{
+   bool progress = false;
+   int ip = -1;
+
+   calculate_live_intervals();
+
+   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
+      ip++;
+
+      if (inst->dst.file != VGRF || inst->src[0].file != VGRF)
+         continue;
+
+      if (inst->opcode != BRW_OPCODE_MOV)
+         continue;
+
+      if (type_sz(inst->dst.type) != 2 || inst->dst.stride != 2 ||
+          type_sz(inst->src[0].type) != 4 || inst->src[0].stride != 1) {
+         continue;
+      }
+
+      int src_reg = inst->src[0].nr;
+      int src_offset = inst->src[0].offset;
+      unsigned src_var = live_intervals->var_from_vgrf[src_reg];
+      int src_end = live_intervals->end[src_var];
+      int dst_reg = inst->dst.nr;
+
+      if (src_end > ip)
+         continue;
+
+      foreach_block_and_inst(block, fs_inst, scan_inst, cfg) {
+         if (scan_inst->dst.file == VGRF &&
+             scan_inst->dst.nr == dst_reg) {
+            scan_inst->dst.nr = src_reg;
+            scan_inst->dst.offset = src_offset;
+            progress = true;
+         }
+
+         for (int i = 0; i < scan_inst->sources; i++) {
+            if (scan_inst->src[i].file == VGRF &&
+                scan_inst->src[i].nr == dst_reg) {
+               scan_inst->src[i].nr = src_reg;
+               scan_inst->src[i].offset = src_offset;
+               progress = true;
+            }
+         }
+      }
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
 
 static void
 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
@@ -5829,6 +5905,7 @@ fs_visitor::optimize()
 
    OPT(opt_drop_redundant_mov_to_flags);
    OPT(remove_extra_rounding_modes);
+   OPT(reuse_16bit_conversions_vgrf);
 
    do {
       progress = false;
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index b9476e69edb..2685f748b87 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -151,6 +151,7 @@ public:
    bool dead_code_eliminate();
    bool remove_duplicate_mrf_writes();
    bool remove_extra_rounding_modes();
+   bool reuse_16bit_conversions_vgrf();
 
    bool opt_sampler_eot();
    bool virtual_grf_interferes(int a, int b);
-- 
2.11.0