[Mesa-dev] [PATCH 1/3] i965/fs: Emit better b2f of an expression on pre-SNB

Thu Feb 5 22:55:51 PST 2015

From: Ian Romanick <ian.d.romanick at intel.com>

On platforms that do not natively generate 0u and ~0u for Boolean
results, b2f expressions that look like

   f = b2f(expr cmp 0)

will generate better code by pretending the expression is

    f = ir_triop_csel(0.0, 1.0, expr cmp 0)

This is because the last instruction of "expr" can generate the
condition code for the "cmp 0".  This avoids having to do the "-(b & 1)"
trick to generate 0u or ~0u for the Boolean result.  This means code like

    mov(16)         g16<1>F         1F
    mul.ge.f0(16)   null            g6<8,8,1>F      g14<8,8,1>F
    (+f0) sel(16)   m6<1>F          g16<8,8,1>F     0F

will be generated instead of

    mul(16)         g2<1>F          g12<8,8,1>F     g4<8,8,1>F
    cmp.ge.f0(16)   g2<1>D          g4<8,8,1>F      0F
    and(16)         g4<1>D          g2<8,8,1>D      1D
    and(16)         m6<1>D          -g4<8,8,1>D     0x3f800000UD

When the comparison is either == 0.0 or != 0.0 it would seem that using
the knowledge that the true (or false) case already results in zero
would allow better code generation by possibly avoiding a load-immediate
instruction.  Some experimentation showed this to not be the case.

Shader-db results:

GM45 (0x2A42):
total instructions in shared programs: 3542437 -> 3542267 (-0.00%)
instructions in affected programs:     32947 -> 32777 (-0.52%)
helped:                                118
HURT:                                  0
GAINED:                                0
LOST:                                  0

Iron Lake (0x0046):
total instructions in shared programs: 4864785 -> 4864611 (-0.00%)
instructions in affected programs:     33094 -> 32920 (-0.53%)
helped:                                122
HURT:                                  0
GAINED:                                0
LOST:                                  0

No change on other platforms.

Signed-off-by: Ian Romanick <ian.d.romanick at intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs.h           |  1 +
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 74 ++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 84e0b9e..cae55f4 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -533,6 +533,7 @@ public:
                  const fs_reg &a);
    void emit_minmax(enum brw_conditional_mod conditionalmod, const fs_reg &dst,
                     const fs_reg &src0, const fs_reg &src1);
+   bool try_emit_b2f_of_comparison(ir_expression *ir);
    bool try_emit_saturate(ir_expression *ir);
    bool try_emit_line(ir_expression *ir);
    bool try_emit_mad(ir_expression *ir);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index f5d7383..2f74716 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -483,6 +483,75 @@ fs_visitor::try_emit_mad(ir_expression *ir)
    return true;
 }
 
+bool
+fs_visitor::try_emit_b2f_of_comparison(ir_expression *ir)
+{
+   /* On platforms that do not natively generate 0u and ~0u for Boolean
+    * results, b2f expressions that look like
+    *
+    *     f = b2f(expr cmp 0)
+    *
+    * will generate better code by pretending the expression is
+    *
+    *     f = ir_triop_csel(0.0, 1.0, expr cmp 0)
+    *
+    * This is because the last instruction of "expr" can generate the
+    * condition code for the "cmp 0".  This avoids having to do the "-(b & 1)"
+    * trick to generate 0u or ~0u for the Boolean result.  This means code like
+    *
+    *     mov(16)         g16<1>F         1F
+    *     mul.ge.f0(16)   null            g6<8,8,1>F      g14<8,8,1>F
+    *     (+f0) sel(16)   m6<1>F          g16<8,8,1>F     0F
+    *
+    * will be generated instead of
+    *
+    *     mul(16)         g2<1>F          g12<8,8,1>F     g4<8,8,1>F
+    *     cmp.ge.f0(16)   g2<1>D          g4<8,8,1>F      0F
+    *     and(16)         g4<1>D          g2<8,8,1>D      1D
+    *     and(16)         m6<1>D          -g4<8,8,1>D     0x3f800000UD
+    *
+    * When the comparison is either == 0.0 or != 0.0 it would seem that using
+    * the knowledge that the true (or false) case already results in zero
+    * would allow better code generation by possibly avoiding a load-immediate
+    * instruction.  Some experimentation showed this to not be the case.
+    */
+   ir_expression *const cmp = ir->operands[0]->as_expression();
+   if (cmp == NULL || cmp->get_num_operands() != 2)
+      return false;   /* b2f operand is not a two-operand expression */
+
+   unsigned i;
+   for (i = 0; i < 2; i++) {
+      ir_constant *c = cmp->operands[i]->as_constant();
+      if (c == NULL)
+         continue;
+
+      /* The operands are not expected to both be constants, and the rewrite
+       * only applies to a comparison against zero.  So the first constant
+       * found decides the outcome: if it is not zero, reject the whole
+       * expression without emitting anything.
+       */
+      if (!c->is_zero())
+         return false;
+
+      ir_expression *expr = cmp->operands[i ^ 1]->as_expression();
+      if (expr != NULL)   /* operand i ^ 1 is the side opposite the zero */
+         break;
+   }
+
+   if (i == 2)   /* loop finished: no (expression, zero) operand pair */
+      return false;
+
+   emit_bool_to_cond_code(cmp);   /* presumably sets the flag; confirm */
+
+   fs_reg temp = vgrf(ir->type);
+   emit(MOV(temp, fs_reg(1.0f)));   /* 1.0 source of the SEL below */
+
+   this->result = vgrf(ir->type);
+   fs_inst *inst = emit(SEL(this->result, temp, fs_reg(0.0f)));
+   inst->predicate = BRW_PREDICATE_NORMAL;
+
+   return true;   /* instructions emitted; value is in this->result */
+}
+
 static int
 pack_pixel_offset(float x)
 {
@@ -647,6 +716,11 @@ fs_visitor::visit(ir_expression *ir)
       inst->predicate = BRW_PREDICATE_NORMAL;
       return;
 
+   case ir_unop_b2f:
+      if (brw->gen <= 5 && try_emit_b2f_of_comparison(ir))
+         return;
+      break;
+
    case ir_unop_interpolate_at_centroid:
    case ir_binop_interpolate_at_offset:
    case ir_binop_interpolate_at_sample:
-- 
2.1.0