[Mesa-dev] [PATCH] i965/fs: Recognize constants can be loaded by subtracting from 1.0.

Matt Turner mattst88 at gmail.com
Tue Mar 1 06:06:21 UTC 2016


Some shaders from Synmark contain this loop:

   for (float i = 0.02; i < 0.9; i += 0.11)

and in its body it uses both i and (1.0 - i). All 16 immediates are
promoted to registers (they're used by 3-src MAD instructions). By
recognizing that we can load 8 of these into a single register and then
subtract that from 1.0 to produce the other 8 values, we can load the 16
values in 9 instructions rather than 16:

   total instructions in shared programs: 7121101 -> 7120611 (-0.01%)
   instructions in affected programs: 12915 -> 12425 (-3.79%)
   helped: 70

More importantly, because of the false dependencies between the mov(1)
instructions, instruction scheduling is not able to schedule enough work
after texture operations. With this patch, the mov(1)s and the add(8)
are emitted together and avoid the scheduling problems:

   total cycles in shared programs: 65013218 -> 64895688 (-0.18%)
   cycles in affected programs: 378700 -> 261170 (-31.04%)
   helped: 70

Synmark's OglPSPom improves by 2.42872% +/- 0.158891% (n=120) on a
Haswell GT3e.
---
I've had this in a branch for more than a year now. Might as well send it.

 .../drivers/dri/i965/brw_fs_combine_constants.cpp  | 93 ++++++++++++++++++++++
 1 file changed, 93 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp b/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
index d7a1456..f272263 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
@@ -133,6 +133,9 @@ struct imm {
     */
    bool must_promote;
 
+   /** Whether this constant has been inserted via a different method. */
+   bool inserted;
+
    uint16_t first_use_ip;
    uint16_t last_use_ip;
 };
@@ -168,6 +171,90 @@ new_imm(struct table *table, void *mem_ctx)
 }
 
 /**
+ * Recognizes when a group of 8 immediates can be loaded by subtracting other
+ * constants we're already loading from 1.0.
+ */
+static void
+try_recognize_1_minus_x(fs_visitor *v, struct table *table)
+{
+   struct {
+      int a_index, b_index;
+      float a_val, b_val;
+
+      bool valid;
+      bool found;
+   } info[table->len] = {0};
+
+   int c = 0;
+   bblock_t *block = NULL;
+
+   for (int i = 0; i < table->len; i++) {
+      /* Skip some values that aren't useful to include this in optimization */
+      if (table->imm[i].val == 0.0f ||
+          table->imm[i].val == 0.5f ||
+          table->imm[i].val == 1.0f)
+         continue;
+
+      for (int j = 0; j < table->len; j++) {
+         if (table->imm[i].val == 1.0f - table->imm[j].val) {
+            if (info[j].found)
+               continue;
+
+            if (block == NULL)
+               block = table->imm[i].block;
+            if (block != table->imm[i].block)
+               continue;
+
+            info[i].a_index = i;
+            info[i].b_index = j;
+            info[i].a_val = table->imm[i].val;
+            info[i].b_val = table->imm[j].val;
+            info[i].valid = true;
+
+            info[i].found = true;
+            info[j].found = true;
+            c++;
+         }
+      }
+   }
+
+   if (c == 8) {
+      backend_instruction *inst = block->first_non_control_flow_inst();
+      const fs_builder bld = v->bld.at(block, inst).exec_all().group(1, 0);
+
+      fs_reg first(VGRF, v->alloc.allocate(v->dispatch_width / 8));
+      fs_reg second(VGRF, v->alloc.allocate(v->dispatch_width / 8));
+      first.stride = 0;
+
+      for (int i = 0; i < table->len; i++) {
+         if (!info[i].valid)
+            continue;
+
+         fs_inst *mov = bld.MOV(first, brw_imm_f(info[i].a_val));
+         mov->no_dd_clear = first.subreg_offset != 28;
+         mov->no_dd_check = first.subreg_offset != 0;
+
+         int a = info[i].a_index;
+         int b = info[i].b_index;
+
+         table->imm[a].inserted = true;
+         table->imm[a].nr = first.nr;
+         table->imm[a].subreg_offset = first.subreg_offset;
+
+         table->imm[b].inserted = true;
+         table->imm[b].nr = second.nr;
+         table->imm[b].subreg_offset = first.subreg_offset;
+
+         first.subreg_offset += sizeof(float);
+      }
+
+      first.stride = 1;
+      first.subreg_offset = 0;
+      bld.group(8, 0).ADD(second, negate(first), brw_imm_f(1.0f));
+   }
+}
+
+/**
  * Comparator used for sorting an array of imm structures.
  *
  * We sort by basic block number, then last use IP, then first use IP (least
@@ -241,6 +328,7 @@ fs_visitor::opt_combine_constants()
             imm->val = val;
             imm->uses_by_coissue = could_coissue(devinfo, inst);
             imm->must_promote = must_promote_imm(devinfo, inst);
+            imm->inserted = false;
             imm->first_use_ip = ip;
             imm->last_use_ip = ip;
          }
@@ -267,11 +355,16 @@ fs_visitor::opt_combine_constants()
    if (cfg->num_blocks != 1)
       qsort(table.imm, table.len, sizeof(struct imm), compare);
 
+   try_recognize_1_minus_x(this, &table);
+
    /* Insert MOVs to load the constant values into GRFs. */
    fs_reg reg(VGRF, alloc.allocate(1));
    reg.stride = 0;
    for (int i = 0; i < table.len; i++) {
       struct imm *imm = &table.imm[i];
+      if (imm->inserted)
+         continue;
+
       /* Insert it either before the instruction that generated the immediate
        * or after the last non-control flow instruction of the common ancestor.
        */
-- 
2.4.10



More information about the mesa-dev mailing list