[Mesa-dev] [PATCH 2/5] i965/fs: Emit better b2f of an expression on GEN4 and GEN5
Tapani Pälli
tapani.palli at intel.com
Mon Mar 16 04:54:39 PDT 2015
Hi Ian;
Is there some particular Piglit test case that hits this path, and is it
possible with gen > 5 (by removing the gen check)? I've tried this with a
handcrafted shader_test and also with shader-db, and cannot hit the
conditions for the changes to take effect. It would be nice to be able to
run it, examine the changes, and understand this better.
On 03/11/2015 10:44 PM, Ian Romanick wrote:
> From: Ian Romanick <ian.d.romanick at intel.com>
>
> On platforms that do not natively generate 0u and ~0u for Boolean
> results, b2f expressions that look like
>
> f = b2f(expr cmp 0)
>
> will generate better code by pretending the expression is
>
> f = ir_triop_csel(0.0, 1.0, expr cmp 0)
>
> This is because the last instruction of "expr" can generate the
> condition code for the "cmp 0". This avoids having to do the "-(b & 1)"
> trick to generate 0u or ~0u for the Boolean result. This means code like
>
> mov(16) g16<1>F 1F
> mul.ge.f0(16) null g6<8,8,1>F g14<8,8,1>F
> (+f0) sel(16) m6<1>F g16<8,8,1>F 0F
>
> will be generated instead of
>
> mul(16) g2<1>F g12<8,8,1>F g4<8,8,1>F
> cmp.ge.f0(16) g2<1>D g4<8,8,1>F 0F
> and(16) g4<1>D g2<8,8,1>D 1D
> and(16) m6<1>D -g4<8,8,1>D 0x3f800000UD
>
> v2: When the comparison is either == 0.0 or != 0.0, use the knowledge
> that the true (or false) case already results in zero. This allows better
> code generation by possibly avoiding a load-immediate instruction.
>
> v3: Apply the optimization even when neither operand of the comparison is zero.
>
> Shader-db results:
>
> GM45 (0x2A42):
> total instructions in shared programs: 3551002 -> 3550829 (-0.00%)
> instructions in affected programs: 33269 -> 33096 (-0.52%)
> helped: 121
>
> Iron Lake (0x0046):
> total instructions in shared programs: 4993327 -> 4993146 (-0.00%)
> instructions in affected programs: 34199 -> 34018 (-0.53%)
> helped: 129
>
> No change on other platforms.
>
> Signed-off-by: Ian Romanick <ian.d.romanick at intel.com>
> Cc: Tapani Palli <tapani.palli at intel.com>
> ---
> src/mesa/drivers/dri/i965/brw_fs.h | 2 +
> src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 101 +++++++++++++++++++++++++--
> 2 files changed, 99 insertions(+), 4 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
> index d9d5858..075e90c 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.h
> +++ b/src/mesa/drivers/dri/i965/brw_fs.h
> @@ -307,6 +307,7 @@ public:
> const fs_reg &a);
> void emit_minmax(enum brw_conditional_mod conditionalmod, const fs_reg &dst,
> const fs_reg &src0, const fs_reg &src1);
> + bool try_emit_b2f_of_comparison(ir_expression *ir);
> bool try_emit_saturate(ir_expression *ir);
> bool try_emit_line(ir_expression *ir);
> bool try_emit_mad(ir_expression *ir);
> @@ -317,6 +318,7 @@ public:
> bool opt_saturate_propagation();
> bool opt_cmod_propagation();
> void emit_bool_to_cond_code(ir_rvalue *condition);
> + void emit_bool_to_cond_code_of_reg(ir_expression *expr, fs_reg op[3]);
> void emit_if_gen6(ir_if *ir);
> void emit_unspill(bblock_t *block, fs_inst *inst, fs_reg reg,
> uint32_t spill_offset, int count);
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> index 3025a9d..3d79796 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> @@ -475,6 +475,87 @@ fs_visitor::try_emit_mad(ir_expression *ir)
> return true;
> }
>
> +bool
> +fs_visitor::try_emit_b2f_of_comparison(ir_expression *ir)
> +{
> + /* On platforms that do not natively generate 0u and ~0u for Boolean
> + * results, b2f expressions that look like
> + *
> + * f = b2f(expr cmp 0)
> + *
> + * will generate better code by pretending the expression is
> + *
> + * f = ir_triop_csel(0.0, 1.0, expr cmp 0)
> + *
> + * This is because the last instruction of "expr" can generate the
> + * condition code for the "cmp 0". This avoids having to do the "-(b & 1)"
> + * trick to generate 0u or ~0u for the Boolean result. This means code like
> + *
> + * mov(16) g16<1>F 1F
> + * mul.ge.f0(16) null g6<8,8,1>F g14<8,8,1>F
> + * (+f0) sel(16) m6<1>F g16<8,8,1>F 0F
> + *
> + * will be generated instead of
> + *
> + * mul(16) g2<1>F g12<8,8,1>F g4<8,8,1>F
> + * cmp.ge.f0(16) g2<1>D g4<8,8,1>F 0F
> + * and(16) g4<1>D g2<8,8,1>D 1D
> + * and(16) m6<1>D -g4<8,8,1>D 0x3f800000UD
> + *
> + * When the comparison is either == 0.0 or != 0.0 using the knowledge that
> + * the true (or false) case already results in zero would allow better code
> + * generation by possibly avoiding a load-immediate instruction.
> + */
> + ir_expression *cmp = ir->operands[0]->as_expression();
> + if (cmp == NULL)
> + return false;
> +
> + if (cmp->operation == ir_binop_equal || cmp->operation == ir_binop_nequal) {
> + for (unsigned i = 0; i < 2; i++) {
> + ir_constant *c = cmp->operands[i]->as_constant();
> + if (c == NULL || !c->is_zero())
> + continue;
> +
> + ir_expression *expr = cmp->operands[i ^ 1]->as_expression();
> + if (expr != NULL) {
> + fs_reg op[2];
> +
> + for (unsigned j = 0; j < 2; j++) {
> + cmp->operands[j]->accept(this);
> + op[j] = this->result;
> +
> + resolve_ud_negate(&op[j]);
> + }
> +
> + emit_bool_to_cond_code_of_reg(cmp, op);
> +
> + /* In this case we know when the condition is true, op[i ^ 1]
> + * contains zero. Invert the predicate, use op[i ^ 1] as src0,
> + * and immediate 1.0f as src1.
> + */
> + this->result = vgrf(ir->type);
> + op[i ^ 1].type = BRW_REGISTER_TYPE_F;
> +
> + fs_inst *inst = emit(SEL(this->result, op[i ^ 1], fs_reg(1.0f)));
> + inst->predicate = BRW_PREDICATE_NORMAL;
> + inst->predicate_inverse = cmp->operation == ir_binop_equal;
> + return true;
> + }
> + }
> + }
> +
> + emit_bool_to_cond_code(cmp);
> +
> + fs_reg temp = vgrf(ir->type);
> + emit(MOV(temp, fs_reg(1.0f)));
> +
> + this->result = vgrf(ir->type);
> + fs_inst *inst = emit(SEL(this->result, temp, fs_reg(0.0f)));
> + inst->predicate = BRW_PREDICATE_NORMAL;
> +
> + return true;
> +}
> +
> static int
> pack_pixel_offset(float x)
> {
> @@ -639,6 +720,11 @@ fs_visitor::visit(ir_expression *ir)
> inst->predicate = BRW_PREDICATE_NORMAL;
> return;
>
> + case ir_unop_b2f:
> + if (brw->gen <= 5 && try_emit_b2f_of_comparison(ir))
> + return;
> + break;
> +
> case ir_unop_interpolate_at_centroid:
> case ir_binop_interpolate_at_offset:
> case ir_binop_interpolate_at_sample:
> @@ -2525,7 +2611,6 @@ fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
> }
>
> fs_reg op[3];
> - fs_inst *inst;
>
> assert(expr->get_num_operands() <= 3);
> for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
> @@ -2537,6 +2622,14 @@ fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
> resolve_ud_negate(&op[i]);
> }
>
> + emit_bool_to_cond_code_of_reg(expr, op);
> +}
> +
> +void
> +fs_visitor::emit_bool_to_cond_code_of_reg(ir_expression *expr, fs_reg op[3])
> +{
> + fs_inst *inst;
> +
> switch (expr->operation) {
> case ir_unop_logic_not:
> inst = emit(AND(reg_null_d, op[0], fs_reg(1)));
> @@ -2545,7 +2638,7 @@ fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
>
> case ir_binop_logic_xor:
> if (brw->gen <= 5) {
> - fs_reg temp = vgrf(ir->type);
> + fs_reg temp = vgrf(expr->type);
> emit(XOR(temp, op[0], op[1]));
> inst = emit(AND(reg_null_d, temp, fs_reg(1)));
> } else {
> @@ -2556,7 +2649,7 @@ fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
>
> case ir_binop_logic_or:
> if (brw->gen <= 5) {
> - fs_reg temp = vgrf(ir->type);
> + fs_reg temp = vgrf(expr->type);
> emit(OR(temp, op[0], op[1]));
> inst = emit(AND(reg_null_d, temp, fs_reg(1)));
> } else {
> @@ -2567,7 +2660,7 @@ fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
>
> case ir_binop_logic_and:
> if (brw->gen <= 5) {
> - fs_reg temp = vgrf(ir->type);
> + fs_reg temp = vgrf(expr->type);
> emit(AND(temp, op[0], op[1]));
> inst = emit(AND(reg_null_d, temp, fs_reg(1)));
> } else {
>
More information about the mesa-dev
mailing list