[Mesa-dev] [PATCH 09/10] i965/fs: Add a peephole pass to combine ADD with ADDC/SUBB.
Ian Romanick
idr at freedesktop.org
Tue Sep 24 14:41:27 PDT 2013
For our own edification, we should add some feedback in the
INTEL_DEBUG=perf case. If there is any case that an ADDC (or SUBB) from
the frontend doesn't get merged with an add, we should generate a perf
warning. This probably indicates a failing of the optimization pass.
That could probably be a follow-on patch...
On 09/23/2013 04:13 PM, Matt Turner wrote:
> ---
> src/mesa/drivers/dri/i965/brw_fs.h | 1 +
> src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 101 +++++++++++++++++++++++++++
> 2 files changed, 102 insertions(+)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
> index a56f561..d776b5a 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.h
> +++ b/src/mesa/drivers/dri/i965/brw_fs.h
> @@ -373,6 +373,7 @@ public:
> fs_reg src0, fs_reg src1);
> bool try_emit_saturate(ir_expression *ir);
> bool try_emit_mad(ir_expression *ir, int mul_arg);
> + void try_combine_add_with_addc_subb();
> void try_replace_with_sel();
> void emit_bool_to_cond_code(ir_rvalue *condition);
> void emit_if_gen6(ir_if *ir);
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> index 23cdc2e..531ef52 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> @@ -313,9 +313,104 @@ fs_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
> return true;
> }
>
> +/**
> + * The addition and carry in the uaddCarry() built-in function are implemented
> + * separately as ir_binop_add and ir_binop_carry respectively. i965 generates
> + * ADDC and a MOV from the accumulator for the carry.
> + *
> + * The generated code for uaddCarry(uint x, uint y, out uint carry) would look
> + * like this:
> + *
> + * addc null, x, y
> + * mov carry, acc0
> + * add sum, x, y
> + *
> + * This peephole pass optimizes this into
> + *
> + * addc sum, x, y
> + * mov carry, acc0
> + *
> + * usubBorrow() works in the same fashion.
> + */
> +void
> +fs_visitor::try_combine_add_with_addc_subb()
> +{
> + /* ADDC/SUBB was introduced in gen7. */
> + if (brw->gen < 7)
> + return;
> +
> + fs_inst *add_inst = (fs_inst *) instructions.get_tail();
> + assert(add_inst->opcode == BRW_OPCODE_ADD);
> +
> + /* ADDC/SUBB only operates on UD. */
> + if (add_inst->dst.type != BRW_REGISTER_TYPE_UD ||
> + add_inst->src[0].type != BRW_REGISTER_TYPE_UD ||
> + add_inst->src[1].type != BRW_REGISTER_TYPE_UD)
> + return;
> +
> + bool found = false;
> + fs_inst *match = (fs_inst *) add_inst->prev;
> + /* The ADDC should appear within 8 instructions of ADD for a vec4. SUBB
> + * should appear farther away because of the extra MOV negates.
> + */
> + for (int i = 0; i < 16; i++, match = (fs_inst *) match->prev) {
> + if (match->is_head_sentinel())
> + return;
> +
> + /* Look for an ADDC/SUBB instruction whose destination is the null
> + * register (ir_binop_carry emits ADDC with null destination; same for
> + * ir_binop_borrow with SUBB) and whose sources are identical to those
> + * of the ADD.
> + */
> + if (match->opcode != BRW_OPCODE_ADDC && match->opcode != BRW_OPCODE_SUBB)
> + continue;
> +
> + /* Only look for newly emitted ADDC/SUBB with null destination. */
> + if (match->dst.file != ARF || match->dst.reg != BRW_ARF_NULL)
> + continue;
> +
> + fs_reg *src0 = &add_inst->src[0];
> + fs_reg *src1 = &add_inst->src[1];
> +
> + /* For SUBB, the ADD's second source will contain a negate modifier
> + * which at this point will be in the form of a
> + *
> + * MOV dst, -src
> + *
> + * instruction, so src[1].file will be GRF, even if it's a uniform push
> + * constant.
> + */
> + if (match->src[1].reg != add_inst->src[1].reg) {
> + /* The negating MOV should be immediately before the ADD. */
> + fs_inst *mov_inst = (fs_inst *) add_inst->prev;
> + if (mov_inst->opcode != BRW_OPCODE_MOV)
> + continue;
> +
> + src1 = &mov_inst->src[0];
> + }
> +
> + /* If everything matches, we're done. */
> + if (match->src[0].file == src0->file &&
> + match->src[1].file == src1->file &&
> + match->src[0].reg == src0->reg &&
> + match->src[1].reg == src1->reg &&
> + match->src[0].reg_offset == src0->reg_offset &&
> + match->src[1].reg_offset == src1->reg_offset) {
> + found = true;
> + break;
> + }
> + }
> +
> + if (found) {
> + match->dst = add_inst->dst;
> + add_inst->remove();
> + }
> +}
> +
> void
> fs_visitor::visit(ir_expression *ir)
> {
> + static bool emitted_addc_or_subb = false;
> unsigned int operand;
> fs_reg op[3], temp;
> fs_inst *inst;
> @@ -415,6 +510,8 @@ fs_visitor::visit(ir_expression *ir)
>
> case ir_binop_add:
> emit(ADD(this->result, op[0], op[1]));
> + if (emitted_addc_or_subb)
> + try_combine_add_with_addc_subb();
> break;
> case ir_binop_sub:
> assert(!"not reached: should be handled by ir_sub_to_add_neg");
> @@ -451,6 +548,8 @@ fs_visitor::visit(ir_expression *ir)
> if (brw->gen >= 7 && dispatch_width == 16)
> fail("16-wide explicit accumulator operands unsupported\n");
>
> + emitted_addc_or_subb = true;
> +
> struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
>
> emit(ADDC(reg_null_ud, op[0], op[1]));
> @@ -461,6 +560,8 @@ fs_visitor::visit(ir_expression *ir)
> if (brw->gen >= 7 && dispatch_width == 16)
> fail("16-wide explicit accumulator operands unsupported\n");
>
> + emitted_addc_or_subb = true;
> +
> struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
>
> emit(SUBB(reg_null_ud, op[0], op[1]));
>
More information about the mesa-dev
mailing list