[Mesa-dev] [PATCH 3/3] i965/vs: Avoid the MUL/MACH/MOV sequence for small integer multiplies.

Fri Jun 7 22:42:08 PDT 2013

On 06/07/2013 06:55 PM, Eric Anholt wrote:
> We do a lot of multiplies by 3 or 4 for skinning shaders, and we can avoid
> the sequence if we just move them into the right argument of the MUL.
>
> On SNB, this means reliably putting a constant in a position where it
> can't be constant folded, but that's still better than MUL/MACH/MOV.
>
> Improves GLB 2.7 trex performance by 0.788648% +/- 0.23865% (n=29/30)
> ---
>   src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 50 +++++++++++++++++++-------
>   1 file changed, 37 insertions(+), 13 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
> index 451f7d5..3c453eb 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
> @@ -1313,6 +1313,20 @@ vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
>      }
>   }
>
> +static bool
> +is_16bit_constant(ir_rvalue *rvalue)
> +{
> +   ir_constant *constant = rvalue->as_constant();
> +   if (!constant)
> +      return false;
> +
> +   if (constant->type != glsl_type::int_type &&
> +       constant->type != glsl_type::uint_type)
> +      return false;
> +
> +   return constant->value.u[0] < (1 << 16);
> +}
> +
>   void
>   vec4_visitor::visit(ir_expression *ir)
>   {
> @@ -1472,19 +1486,29 @@ vec4_visitor::visit(ir_expression *ir)
>
>      case ir_binop_mul:
>         if (ir->type->is_integer()) {
> -	 /* For integer multiplication, the MUL uses the low 16 bits
> -	  * of one of the operands (src0 on gen6, src1 on gen7).  The
> -	  * MACH accumulates in the contribution of the upper 16 bits
> -	  * of that operand.
> -	  *
> -	  * FINISHME: Emit just the MUL if we know an operand is small
> -	  * enough.
> -	  */
> -	 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
> -
> -	 emit(MUL(acc, op[0], op[1]));
> -	 emit(MACH(dst_null_d(), op[0], op[1]));
> -	 emit(MOV(result_dst, src_reg(acc)));
> +	 /* For integer multiplication, the MUL uses the low 16 bits of one of
> +	  * the operands (src0 on gen6, src1 on gen7).  The MACH accumulates
> +	  * in the contribution of the upper 16 bits of that operand.  If we
> +	  * can determine that one of the args is in the low 16 bits, though,
> +	  * we can just emit a single MUL.
> +          */
> +         if (is_16bit_constant(ir->operands[0])) {
> +            if (intel->gen == 6)
> +               emit(MUL(result_dst, op[0], op[1]));
> +            else
> +               emit(MUL(result_dst, op[1], op[0]));

This will take the IVB path on Gen4-5, which is wrong.  Gen4-6 use the 
low 16-bits of src0, while Gen7 uses src1.

> +         } else if (is_16bit_constant(ir->operands[1])) {
> +            if (intel->gen == 6)
> +               emit(MUL(result_dst, op[1], op[0]));
> +            else
> +               emit(MUL(result_dst, op[0], op[1]));

Ditto.  Assuming you fix that (and the comment and commit message), this is:
Reviewed-by: Kenneth Graunke <kenneth at whitecape.org>

(and hey, it looks like we both wrote the VS MAD code in the same week! 
  I just hadn't sent it out since it didn't seem to help the apps I was 
looking at.  ah well...)

> +         } else {
> +            struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
> +
> +            emit(MUL(acc, op[0], op[1]));
> +            emit(MACH(dst_null_d(), op[0], op[1]));
> +            emit(MOV(result_dst, src_reg(acc)));
> +         }
>         } else {
>   	 emit(MUL(result_dst, op[0], op[1]));
>         }