[Mesa-dev] [PATCH 2/5] i965/fs: Emit better b2f of an expression on GEN4 and GEN5

Tapani Pälli tapani.palli at intel.com
Wed Mar 18 22:31:10 PDT 2015


Reviewed-by: Tapani Pälli <tapani.palli at intel.com>

On 03/11/2015 10:44 PM, Ian Romanick wrote:
> From: Ian Romanick <ian.d.romanick at intel.com>
>
> On platforms that do not natively generate 0u and ~0u for Boolean
> results, b2f expressions that look like
>
>     f = b2f(expr cmp 0)
>
> will generate better code by pretending the expression is
>
>      f = ir_triop_csel(0.0, 1.0, expr cmp 0)
>
> This is because the last instruction of "expr" can generate the
> condition code for the "cmp 0".  This avoids having to do the "-(b & 1)"
> trick to generate 0u or ~0u for the Boolean result.  This means code like
>
>      mov(16)         g16<1>F         1F
>      mul.ge.f0(16)   null            g6<8,8,1>F      g14<8,8,1>F
>      (+f0) sel(16)   m6<1>F          g16<8,8,1>F     0F
>
> will be generated instead of
>
>      mul(16)         g2<1>F          g12<8,8,1>F     g4<8,8,1>F
>      cmp.ge.f0(16)   g2<1>D          g4<8,8,1>F      0F
>      and(16)         g4<1>D          g2<8,8,1>D      1D
>      and(16)         m6<1>D          -g4<8,8,1>D     0x3f800000UD
>
> v2: When the comparison is either == 0.0 or != 0.0, use the knowledge
> that the true (or false) case already results in zero to allow better
> code generation by possibly avoiding a load-immediate instruction.
>
> v3: Apply the optimization even when neither operand of the comparison is zero.
>
> Shader-db results:
>
> GM45 (0x2A42):
> total instructions in shared programs: 3551002 -> 3550829 (-0.00%)
> instructions in affected programs:     33269 -> 33096 (-0.52%)
> helped:                                121
>
> Iron Lake (0x0046):
> total instructions in shared programs: 4993327 -> 4993146 (-0.00%)
> instructions in affected programs:     34199 -> 34018 (-0.53%)
> helped:                                129
>
> No change on other platforms.
>
> Signed-off-by: Ian Romanick <ian.d.romanick at intel.com>
> Cc: Tapani Palli <tapani.palli at intel.com>
> ---
>   src/mesa/drivers/dri/i965/brw_fs.h           |   2 +
>   src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 101 +++++++++++++++++++++++++--
>   2 files changed, 99 insertions(+), 4 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
> index d9d5858..075e90c 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.h
> +++ b/src/mesa/drivers/dri/i965/brw_fs.h
> @@ -307,6 +307,7 @@ public:
>                    const fs_reg &a);
>      void emit_minmax(enum brw_conditional_mod conditionalmod, const fs_reg &dst,
>                       const fs_reg &src0, const fs_reg &src1);
> +   bool try_emit_b2f_of_comparison(ir_expression *ir);
>      bool try_emit_saturate(ir_expression *ir);
>      bool try_emit_line(ir_expression *ir);
>      bool try_emit_mad(ir_expression *ir);
> @@ -317,6 +318,7 @@ public:
>      bool opt_saturate_propagation();
>      bool opt_cmod_propagation();
>      void emit_bool_to_cond_code(ir_rvalue *condition);
> +   void emit_bool_to_cond_code_of_reg(ir_expression *expr, fs_reg op[3]);
>      void emit_if_gen6(ir_if *ir);
>      void emit_unspill(bblock_t *block, fs_inst *inst, fs_reg reg,
>                        uint32_t spill_offset, int count);
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> index 3025a9d..3d79796 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> @@ -475,6 +475,87 @@ fs_visitor::try_emit_mad(ir_expression *ir)
>      return true;
>   }
>   
> +bool
> +fs_visitor::try_emit_b2f_of_comparison(ir_expression *ir)
> +{
> +   /* On platforms that do not natively generate 0u and ~0u for Boolean
> +    * results, b2f expressions that look like
> +    *
> +    *     f = b2f(expr cmp 0)
> +    *
> +    * will generate better code by pretending the expression is
> +    *
> +    *     f = ir_triop_csel(0.0, 1.0, expr cmp 0)
> +    *
> +    * This is because the last instruction of "expr" can generate the
> +    * condition code for the "cmp 0".  This avoids having to do the "-(b & 1)"
> +    * trick to generate 0u or ~0u for the Boolean result.  This means code like
> +    *
> +    *     mov(16)         g16<1>F         1F
> +    *     mul.ge.f0(16)   null            g6<8,8,1>F      g14<8,8,1>F
> +    *     (+f0) sel(16)   m6<1>F          g16<8,8,1>F     0F
> +    *
> +    * will be generated instead of
> +    *
> +    *     mul(16)         g2<1>F          g12<8,8,1>F     g4<8,8,1>F
> +    *     cmp.ge.f0(16)   g2<1>D          g4<8,8,1>F      0F
> +    *     and(16)         g4<1>D          g2<8,8,1>D      1D
> +    *     and(16)         m6<1>D          -g4<8,8,1>D     0x3f800000UD
> +    *
> +    * When the comparison is either == 0.0 or != 0.0 using the knowledge that
> +    * the true (or false) case already results in zero would allow better code
> +    * generation by possibly avoiding a load-immediate instruction.
> +    */
> +   ir_expression *cmp = ir->operands[0]->as_expression();
> +   if (cmp == NULL)
> +      return false;
> +
> +   if (cmp->operation == ir_binop_equal || cmp->operation == ir_binop_nequal) {
> +      for (unsigned i = 0; i < 2; i++) {
> +         ir_constant *c = cmp->operands[i]->as_constant();
> +         if (c == NULL || !c->is_zero())
> +            continue;
> +
> +         ir_expression *expr = cmp->operands[i ^ 1]->as_expression();
> +         if (expr != NULL) {
> +            fs_reg op[2];
> +
> +            for (unsigned j = 0; j < 2; j++) {
> +               cmp->operands[j]->accept(this);
> +               op[j] = this->result;
> +
> +               resolve_ud_negate(&op[j]);
> +            }
> +
> +            emit_bool_to_cond_code_of_reg(cmp, op);
> +
> +            /* In this case we know when the condition is true, op[i ^ 1]
> +             * contains zero.  Invert the predicate, use op[i ^ 1] as src0,
> +             * and immediate 1.0f as src1.
> +             */
> +            this->result = vgrf(ir->type);
> +            op[i ^ 1].type = BRW_REGISTER_TYPE_F;
> +
> +            fs_inst *inst = emit(SEL(this->result, op[i ^ 1], fs_reg(1.0f)));
> +            inst->predicate = BRW_PREDICATE_NORMAL;
> +            inst->predicate_inverse = cmp->operation == ir_binop_equal;
> +            return true;
> +         }
> +      }
> +   }
> +
> +   emit_bool_to_cond_code(cmp);
> +
> +   fs_reg temp = vgrf(ir->type);
> +   emit(MOV(temp, fs_reg(1.0f)));
> +
> +   this->result = vgrf(ir->type);
> +   fs_inst *inst = emit(SEL(this->result, temp, fs_reg(0.0f)));
> +   inst->predicate = BRW_PREDICATE_NORMAL;
> +
> +   return true;
> +}
> +
>   static int
>   pack_pixel_offset(float x)
>   {
> @@ -639,6 +720,11 @@ fs_visitor::visit(ir_expression *ir)
>         inst->predicate = BRW_PREDICATE_NORMAL;
>         return;
>   
> +   case ir_unop_b2f:
> +      if (brw->gen <= 5 && try_emit_b2f_of_comparison(ir))
> +         return;
> +      break;
> +
>      case ir_unop_interpolate_at_centroid:
>      case ir_binop_interpolate_at_offset:
>      case ir_binop_interpolate_at_sample:
> @@ -2525,7 +2611,6 @@ fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
>      }
>   
>      fs_reg op[3];
> -   fs_inst *inst;
>   
>      assert(expr->get_num_operands() <= 3);
>      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
> @@ -2537,6 +2622,14 @@ fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
>         resolve_ud_negate(&op[i]);
>      }
>   
> +   emit_bool_to_cond_code_of_reg(expr, op);
> +}
> +
> +void
> +fs_visitor::emit_bool_to_cond_code_of_reg(ir_expression *expr, fs_reg op[3])
> +{
> +   fs_inst *inst;
> +
>      switch (expr->operation) {
>      case ir_unop_logic_not:
>         inst = emit(AND(reg_null_d, op[0], fs_reg(1)));
> @@ -2545,7 +2638,7 @@ fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
>   
>      case ir_binop_logic_xor:
>         if (brw->gen <= 5) {
> -         fs_reg temp = vgrf(ir->type);
> +         fs_reg temp = vgrf(expr->type);
>            emit(XOR(temp, op[0], op[1]));
>            inst = emit(AND(reg_null_d, temp, fs_reg(1)));
>         } else {
> @@ -2556,7 +2649,7 @@ fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
>   
>      case ir_binop_logic_or:
>         if (brw->gen <= 5) {
> -         fs_reg temp = vgrf(ir->type);
> +         fs_reg temp = vgrf(expr->type);
>            emit(OR(temp, op[0], op[1]));
>            inst = emit(AND(reg_null_d, temp, fs_reg(1)));
>         } else {
> @@ -2567,7 +2660,7 @@ fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
>   
>      case ir_binop_logic_and:
>         if (brw->gen <= 5) {
> -         fs_reg temp = vgrf(ir->type);
> +         fs_reg temp = vgrf(expr->type);
>            emit(AND(temp, op[0], op[1]));
>            inst = emit(AND(reg_null_d, temp, fs_reg(1)));
>         } else {



More information about the mesa-dev mailing list