[Mesa-dev] [PATCH 6/9] i965/fs: Use the LRP instruction for ir_triop_lrp when possible.

Tue Feb 26 09:51:09 PST 2013

On 02/19/2013 05:03 PM, Matt Turner wrote:
> From: Kenneth Graunke <kenneth at whitecape.org>
>
> v2 [mattst88]:
>     - Add BRW_OPCODE_LRP to list of CSE-able expressions.
>     - Fix op_var[] array size.
>     - Rename arguments to emit_lrp to (x, y, a) to clear confusion.
>     - Add LRP function to brw_fs.cpp/.h.
>     - Corrected comment about LRP instruction arguments in emit_lrp

Nice changes.

Signed-off-by: Kenneth Graunke <kenneth at whitecape.org>
Signed-off-by: Matt Turner <mattst88 at gmail.com>

A few comments below...we'll need a v3 of this...

> Reviewed-by: Matt Turner <mattst88 at gmail.com>
> ---
>   src/mesa/drivers/dri/i965/brw_fs.cpp               |    8 ++++
>   src/mesa/drivers/dri/i965/brw_fs.h                 |    2 +
>   .../dri/i965/brw_fs_channel_expressions.cpp        |   16 ++++++++-
>   src/mesa/drivers/dri/i965/brw_fs_cse.cpp           |    1 +
>   src/mesa/drivers/dri/i965/brw_fs_emit.cpp          |   15 +++++++--
>   src/mesa/drivers/dri/i965/brw_fs_visitor.cpp       |   35 ++++++++++++++++++--
>   src/mesa/drivers/dri/i965/brw_shader.cpp           |    2 +-
>   7 files changed, 71 insertions(+), 8 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index c1ccd92..bdb6616 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -146,6 +146,13 @@ fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
>         return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
>      }
>
> +#define ALU3(op)                                                        \
> +   fs_inst *                                                            \
> +   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
> +   {                                                                    \
> +      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
> +   }
> +
>   ALU1(NOT)
>   ALU1(MOV)
>   ALU1(FRC)
> @@ -161,6 +168,7 @@ ALU2(XOR)
>   ALU2(SHL)
>   ALU2(SHR)
>   ALU2(ASR)
> +ALU3(LRP)
>
>   /** Gen4 predicated IF. */
>   fs_inst *
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
> index d5ebd51..9c1b359 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.h
> +++ b/src/mesa/drivers/dri/i965/brw_fs.h
> @@ -285,6 +285,7 @@ public:
>      fs_inst *IF(fs_reg src0, fs_reg src1, uint32_t condition);
>      fs_inst *CMP(fs_reg dst, fs_reg src0, fs_reg src1,
>                   uint32_t condition);
> +   fs_inst *LRP(fs_reg dst, fs_reg a, fs_reg y, fs_reg x);
>      fs_inst *DEP_RESOLVE_MOV(int grf);
>
>      int type_size(const struct glsl_type *type);
> @@ -360,6 +361,7 @@ public:
>      fs_reg fix_math_operand(fs_reg src);
>      fs_inst *emit_math(enum opcode op, fs_reg dst, fs_reg src0);
>      fs_inst *emit_math(enum opcode op, fs_reg dst, fs_reg src0, fs_reg src1);
> +   void emit_lrp(fs_reg dst, fs_reg x, fs_reg y, fs_reg a);
>      void emit_minmax(uint32_t conditionalmod, fs_reg dst,
>                       fs_reg src0, fs_reg src1);
>      bool try_emit_saturate(ir_expression *ir);
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
> index ea06225..30d8d9b 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
> @@ -135,7 +135,7 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
>      ir_expression *expr = ir->rhs->as_expression();
>      bool found_vector = false;
>      unsigned int i, vector_elements = 1;
> -   ir_variable *op_var[2];
> +   ir_variable *op_var[3];
>
>      if (!expr)
>         return visit_continue;
> @@ -342,6 +342,20 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
>         assert(!"not yet supported");
>         break;
>
> +   case ir_triop_lrp:
> +      for (i = 0; i < vector_elements; i++) {
> +	 ir_rvalue *op0 = get_element(op_var[0], i);
> +	 ir_rvalue *op1 = get_element(op_var[1], i);
> +	 ir_rvalue *op2 = get_element(op_var[2], i);
> +
> +	 assign(ir, i, new(mem_ctx) ir_expression(expr->operation,
> +						  element_type,
> +						  op0,
> +						  op1,
> +						  op2));
> +      }
> +      break;
> +
>      case ir_unop_pack_snorm_2x16:
>      case ir_unop_pack_snorm_4x8:
>      case ir_unop_pack_unorm_2x16:
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
> index 70c143a..0b74d2e 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
> @@ -66,6 +66,7 @@ is_expression(const fs_inst *const inst)
>      case BRW_OPCODE_LINE:
>      case BRW_OPCODE_PLN:
>      case BRW_OPCODE_MAD:
> +   case BRW_OPCODE_LRP:
>      case FS_OPCODE_CINTERP:
>      case FS_OPCODE_LINTERP:
>         return true;
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
> index 3d1f3b3..38d6332 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
> @@ -1082,18 +1082,27 @@ fs_generator::generate_code(exec_list *instructions)
>   	 break;
>
>         case BRW_OPCODE_MAD:
> +      case BRW_OPCODE_LRP: {
> +         struct brw_instruction *(*brw_inst)(struct brw_compile *p,
> +					     struct brw_reg dest,
> +					     struct brw_reg src0,
> +					     struct brw_reg src1,
> +					     struct brw_reg src2);
> +	 brw_inst = (inst->opcode == BRW_OPCODE_MAD) ?
> +			brw_MAD : brw_LRP;

Clever use of function pointers.  I think I'd prefer duplicating the 
code, though...seeing what looks like a prototype here is kind of 
jarring.  But if other people prefer it this way, I'll go along...it's 
not a strong preference.

>   	 brw_set_access_mode(p, BRW_ALIGN_16);
>   	 if (dispatch_width == 16) {
>   	    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
> -	    brw_MAD(p, dst, src[0], src[1], src[2]);
> +	    brw_inst(p, dst, src[0], src[1], src[2]);
>   	    brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
> -	    brw_MAD(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
> +	    brw_inst(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
>   	    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
>   	 } else {
> -	    brw_MAD(p, dst, src[0], src[1], src[2]);
> +	    brw_inst(p, dst, src[0], src[1], src[2]);
>   	 }
>   	 brw_set_access_mode(p, BRW_ALIGN_1);
>   	 break;
> +      }
>
>         case BRW_OPCODE_FRC:
>   	 brw_FRC(p, dst, src[0]);
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> index d4f6fc9..7c03f6b 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> @@ -199,6 +199,30 @@ fs_visitor::visit(ir_dereference_array *ir)
>   }
>
>   void
> +fs_visitor::emit_lrp(fs_reg dst, fs_reg x, fs_reg y, fs_reg a)
> +{
> +   if (intel->gen < 6 || x.file == IMM || y.file == IMM || a.file == IMM) {

I was wondering whether these should be != GRF instead of == IMM...but I 
guess that would preclude the uniform file.  It does work on uniforms?

> +      /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
> +      fs_reg y_times_a           = fs_reg(this, glsl_type::float_type);
> +      fs_reg one_minus_a         = fs_reg(this, glsl_type::float_type);
> +      fs_reg x_times_one_minus_a = fs_reg(this, glsl_type::float_type);
> +
> +      emit(MUL(y_times_a, y, a));
> +
> +      a.negate = !a.negate;
> +      emit(ADD(one_minus_a, fs_reg(1.0f), a));
> +      emit(MUL(x_times_one_minus_a, x, one_minus_a));
> +
> +      emit(ADD(dst, x_times_one_minus_a, y_times_a));
> +   } else {
> +      /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
> +       * we need to reorder the operands.
> +       */
> +      emit(LRP(dst, a, y, x));
> +   }
> +}
> +
> +void
>   fs_visitor::emit_minmax(uint32_t conditionalmod, fs_reg dst,
>                           fs_reg src0, fs_reg src1)
>   {
> @@ -291,10 +315,10 @@ void
>   fs_visitor::visit(ir_expression *ir)
>   {
>      unsigned int operand;
> -   fs_reg op[2], temp;
> +   fs_reg op[3], temp;
>      fs_inst *inst;
>
> -   assert(ir->get_num_operands() <= 2);
> +   assert(ir->get_num_operands() <= 3);
>
>      if (try_emit_saturate(ir))
>         return;
> @@ -586,7 +610,7 @@ fs_visitor::visit(ir_expression *ir)
>      case ir_binop_pack_half_2x16_split:
>         emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, this->result, op[0], op[1]);
>         break;
> -   case ir_binop_ubo_load:
> +   case ir_binop_ubo_load: {
>         /* This IR node takes a constant uniform block and a constant or
>          * variable byte offset within the block and loads a vector from that.
>          */
> @@ -662,6 +686,11 @@ fs_visitor::visit(ir_expression *ir)
>         result.reg_offset = 0;
>         break;
>      }
> +
> +   case ir_triop_lrp:
> +      emit_lrp(this->result, op[0], op[1], op[2]);
> +      break;
> +   }
>   }
>
>   void
> diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
> index 9ab18cc..6965d72 100644
> --- a/src/mesa/drivers/dri/i965/brw_shader.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
> @@ -156,7 +156,7 @@ brw_link_shader(struct gl_context *ctx, struct gl_shader_program *shProg)
>   			 SUB_TO_ADD_NEG |
>   			 EXP_TO_EXP2 |
>   			 LOG_TO_LOG2 |
> -			 LRP_TO_ARITH);
> +			 (stage == MESA_SHADER_FRAGMENT ? 0 : LRP_TO_ARITH));

Doesn't this need to include a gen check as well?  Perhaps:

const int lrp_to_arith = 0;
if (intel->gen < 6 || stage != MESA_SHADER_FRAGMENT)
     lrp_to_arith = LRP_TO_ARITH;

then use lrp_to_arith here.

>
>         /* Pre-gen6 HW can only nest if-statements 16 deep.  Beyond this,
>          * if-statements need to be flattened.
>