[Mesa-dev] [PATCH 3/4] mesa/st: implement new bit manipulation opcodes

Fri Apr 25 14:23:50 PDT 2014

Am 25.04.2014 19:41, schrieb Ilia Mirkin:
> Also pipe through [IU]MUL_HI, MAD, and lower ldexp. This provides
> coverage of all new ARB_gpu_shader5 functions except uaddCarry,
> usubBorrow and interpolateAt*.
> 
> Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
> ---
>  src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 70 ++++++++++++++++++++++--------
>  1 file changed, 53 insertions(+), 17 deletions(-)
> 
> diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
> index e87e761..108ad29 100644
> --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
> +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
> @@ -241,7 +241,7 @@ public:
>  
>     unsigned op;
>     st_dst_reg dst;
> -   st_src_reg src[3];
> +   st_src_reg src[4];
>     /** Pointer to the ir source this tree came from for debugging */
>     ir_instruction *ir;
>     GLboolean cond_update;
> @@ -411,7 +411,12 @@ public:
>     glsl_to_tgsi_instruction *emit(ir_instruction *ir, unsigned op,
>          		        st_dst_reg dst,
>          		        st_src_reg src0, st_src_reg src1, st_src_reg src2);
> -   
> +
> +   glsl_to_tgsi_instruction *emit(ir_instruction *ir, unsigned op,
> +                                  st_dst_reg dst,
> +                                  st_src_reg src0, st_src_reg src1,
> +                                  st_src_reg src2, st_src_reg src3);
> +
>     unsigned get_opcode(ir_instruction *ir, unsigned op,
>                      st_dst_reg dst,
>                      st_src_reg src0, st_src_reg src1);
> @@ -524,8 +529,9 @@ num_inst_src_regs(unsigned opcode)
>  
>  glsl_to_tgsi_instruction *
>  glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
> -        		 st_dst_reg dst,
> -        		 st_src_reg src0, st_src_reg src1, st_src_reg src2)
> +                           st_dst_reg dst,
> +                           st_src_reg src0, st_src_reg src1,
> +                           st_src_reg src2, st_src_reg src3)
>  {
>     glsl_to_tgsi_instruction *inst = new(mem_ctx) glsl_to_tgsi_instruction();
>     int num_reladdr = 0, i;
> @@ -540,7 +546,9 @@ glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
>     num_reladdr += src0.reladdr != NULL || src0.reladdr2 != NULL;
>     num_reladdr += src1.reladdr != NULL || src1.reladdr2 != NULL;
>     num_reladdr += src2.reladdr != NULL || src2.reladdr2 != NULL;
> +   num_reladdr += src3.reladdr != NULL || src3.reladdr2 != NULL;
>  
> +   reladdr_to_temp(ir, &src3, &num_reladdr);
>     reladdr_to_temp(ir, &src2, &num_reladdr);
>     reladdr_to_temp(ir, &src1, &num_reladdr);
>     reladdr_to_temp(ir, &src0, &num_reladdr);
> @@ -556,6 +564,7 @@ glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
>     inst->src[0] = src0;
>     inst->src[1] = src1;
>     inst->src[2] = src2;
> +   inst->src[3] = src3;
>     inst->ir = ir;
>     inst->dead_mask = 0;
>  
> @@ -577,7 +586,7 @@ glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
>        }
>     }
>     else {
> -      for (i=0; i<3; i++) {
> +      for (i=0; i<4; i++) {
>           if(inst->src[i].reladdr) {
>              switch(inst->src[i].file) {
>              case PROGRAM_STATE_VAR:
> @@ -600,12 +609,19 @@ glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
>     return inst;
>  }
>  
> +glsl_to_tgsi_instruction *
> +glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
> +                           st_dst_reg dst, st_src_reg src0,
> +                           st_src_reg src1, st_src_reg src2)
> +{
> +   return emit(ir, op, dst, src0, src1, src2, undef_src);
> +}
>  
>  glsl_to_tgsi_instruction *
>  glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
>          		 st_dst_reg dst, st_src_reg src0, st_src_reg src1)
>  {
> -   return emit(ir, op, dst, src0, src1, undef_src);
> +   return emit(ir, op, dst, src0, src1, undef_src, undef_src);
>  }
>  
>  glsl_to_tgsi_instruction *
> @@ -613,13 +629,13 @@ glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
>          		 st_dst_reg dst, st_src_reg src0)
>  {
>     assert(dst.writemask != 0);
> -   return emit(ir, op, dst, src0, undef_src, undef_src);
> +   return emit(ir, op, dst, src0, undef_src, undef_src, undef_src);
>  }
>  
>  glsl_to_tgsi_instruction *
>  glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op)
>  {
> -   return emit(ir, op, undef_dst, undef_src, undef_src, undef_src);
> +   return emit(ir, op, undef_dst, undef_src, undef_src, undef_src, undef_src);
>  }
>  
>  /**
> @@ -690,7 +706,10 @@ glsl_to_tgsi_visitor::get_opcode(ir_instruction *ir, unsigned op,
>  
>        case2fi(SSG, ISSG);
>        case3(ABS, IABS, IABS);
> -      
> +
> +      case2iu(IBFE, UBFE);
> +      case2iu(IMSB, UMSB);
> +      case2iu(IMUL_HI, UMUL_HI);
>        default: break;
>     }
>     
> @@ -1934,6 +1953,30 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
>           emit(ir, TGSI_OPCODE_CMP, result_dst, op[0], op[1], op[2]);
>        }
>        break;
> +   case ir_triop_bitfield_extract:
> +      emit(ir, TGSI_OPCODE_IBFE, result_dst, op[0], op[1], op[2]);
> +      break;
> +   case ir_quadop_bitfield_insert:
> +      emit(ir, TGSI_OPCODE_BFI, result_dst, op[0], op[1], op[2], op[3]);
> +      break;
> +   case ir_unop_bitfield_reverse:
> +      emit(ir, TGSI_OPCODE_BREV, result_dst, op[0]);
> +      break;
> +   case ir_unop_bit_count:
> +      emit(ir, TGSI_OPCODE_POPC, result_dst, op[0]);
> +      break;
> +   case ir_unop_find_msb:
> +      emit(ir, TGSI_OPCODE_IMSB, result_dst, op[0]);
> +      break;
> +   case ir_unop_find_lsb:
> +      emit(ir, TGSI_OPCODE_LSB, result_dst, op[0]);
> +      break;
> +   case ir_binop_imul_high:
> +      emit(ir, TGSI_OPCODE_IMUL_HI, result_dst, op[0], op[1]);
> +      break;
> +   case ir_triop_fma:
> +      emit(ir, TGSI_OPCODE_MAD, result_dst, op[0], op[1], op[2]);
fma without the precise qualifier isn't all that useful (and is indeed just
mad). Well, actually MAD is not strictly specified in gallium, but
personally I believe it should be allowed to be either a fused or an unfused
operation (I am not entirely sure if that's ok for d3d10, as
it doesn't specify anything) - the reason being that some chips may not
have a (single-instruction) unfused mad but just fma (I'm of course
thinking about x86 with the fma extension), so the driver should use
whatever is faster. This also matches what opencl mad allows (if you
specifically need unfused mad in opencl, just use mul+add).
fma, however, usually really means a fused operation. This will require a
new opcode (though apparently in glsl it only matters with the precise
qualifier), but I guess in the meantime this sloppy translation is ok
(maybe add a comment noting that MAD is not guaranteed to be fused?).

> +      break;
>     case ir_unop_pack_snorm_2x16:
>     case ir_unop_pack_unorm_2x16:
>     case ir_unop_pack_half_2x16:
> @@ -1947,22 +1990,14 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
>     case ir_unop_unpack_snorm_4x8:
>     case ir_unop_unpack_unorm_4x8:
>     case ir_binop_pack_half_2x16_split:
> -   case ir_unop_bitfield_reverse:
> -   case ir_unop_bit_count:
> -   case ir_unop_find_msb:
> -   case ir_unop_find_lsb:
>     case ir_binop_bfm:
> -   case ir_triop_fma:
>     case ir_triop_bfi:
> -   case ir_triop_bitfield_extract:
> -   case ir_quadop_bitfield_insert:
>     case ir_quadop_vector:
>     case ir_binop_vector_extract:
>     case ir_triop_vector_insert:
>     case ir_binop_ldexp:
>     case ir_binop_carry:
>     case ir_binop_borrow:
> -   case ir_binop_imul_high:
>        /* This operation is not supported, or should have already been handled.
>         */
>        assert(!"Invalid ir opcode in glsl_to_tgsi_visitor::visit()");
> @@ -5357,6 +5392,7 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
>                           DIV_TO_MUL_RCP |
>                           EXP_TO_EXP2 |
>                           LOG_TO_LOG2 |
> +                         LDEXP_TO_ARITH |
>                           (options->EmitNoPow ? POW_TO_EXP2 : 0) |
>                           (!ctx->Const.NativeIntegers ? INT_DIV_TO_MUL_RCP : 0));
>  
> 

Otherwise looks good to me