[Mesa-dev] [RFC 4/5] intel/compiler: Implement unordered comparisons

Tue Nov 27 18:55:24 UTC 2018

On 11/22/2018 10:46 AM, Jason Ekstrand wrote:
> The vec4 path has only been compile-tested as there's no easy way to
> generate a vec4 shader with an unordered equality.
> ---
>  src/intel/compiler/brw_compiler.c   |  3 +++
>  src/intel/compiler/brw_fs_nir.cpp   | 20 +++++++++++++-------
>  src/intel/compiler/brw_vec4_nir.cpp | 21 +++++++++++++++++++++
>  3 files changed, 37 insertions(+), 7 deletions(-)
> 
> diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c
> index fe632c5badc..f9e8fa09a34 100644
> --- a/src/intel/compiler/brw_compiler.c
> +++ b/src/intel/compiler/brw_compiler.c
> @@ -42,6 +42,9 @@
>     .lower_fdiv = true,                                                        \
>     .lower_flrp64 = true,                                                      \
>     .lower_ldexp = true,                                                       \
> +   .lower_fltu = true,                                                        \
> +   .lower_fgeu = true,                                                        \

Can't we use cmpn.l and cmpn.ge for these?  On some platforms it has to
be split into two SIMD8 instructions, but that seems better than the
lowering... or maybe just use those on BDW+?

> +   .lower_fne_to_fequ = true,                                                 \
>     .lower_cs_local_id_from_index = true,                                      \
>     .lower_device_index_to_zero = true,                                        \
>     .native_integers = true,                                                   \
> diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
> index a62d521bb5d..eba3611e447 100644
> --- a/src/intel/compiler/brw_fs_nir.cpp
> +++ b/src/intel/compiler/brw_fs_nir.cpp
> @@ -1049,6 +1049,8 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
>     case nir_op_flt:
>     case nir_op_fge:
>     case nir_op_feq:
> +   case nir_op_fne:
> +   case nir_op_fequ:
>     case nir_op_fneu: {
>        fs_reg dest = result;
>  
> @@ -1056,26 +1058,30 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
>        if (bit_size != 32)
>           dest = bld.vgrf(op[0].type, 1);
>  
> -      brw_conditional_mod cond;
>        switch (instr->op) {
>        case nir_op_flt:
> -         cond = BRW_CONDITIONAL_L;
> +         bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_L);
>           break;
>        case nir_op_fge:
> -         cond = BRW_CONDITIONAL_GE;
> +         bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_GE);
>           break;
>        case nir_op_feq:
> -         cond = BRW_CONDITIONAL_Z;
> +         bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_Z);
> +         break;
> +      case nir_op_fequ:
> +         bld.CMP(dest, op[0], op[0], BRW_CONDITIONAL_NZ);
> +         set_predicate_inv(BRW_PREDICATE_NORMAL, true, /* inverse */
> +                           bld.CMP(dest, op[1], op[1], BRW_CONDITIONAL_NZ));
> +         set_predicate_inv(BRW_PREDICATE_NORMAL, true, /* inverse */
> +                           bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_Z));

OpFUnordEqual is (isnan(a) || isnan(b) || a == b), and that's literally
what this code does.  It seems like this could be implemented (perhaps
as a lowering?) as (OpFUnordGreaterThanEqual(a,b) &&
OpFUnordGreaterThanEqual(b, a)) via cmpn.ge.  Using this same technique,
that would be 2 instructions (on BDW+) instead of 3.  HSW has to do cmpn
as two SIMD8 instructions, so this may be better there.  Dunno.

>           break;
>        case nir_op_fneu:
> -         cond = BRW_CONDITIONAL_NZ;
> +         bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_NZ);
>           break;
>        default:
>           unreachable("bad opcode");
>        }
>  
> -      bld.CMP(dest, op[0], op[1], cond);
> -
>        if (bit_size > 32) {
>           bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
>        } else if(bit_size < 32) {
> diff --git a/src/intel/compiler/brw_vec4_nir.cpp b/src/intel/compiler/brw_vec4_nir.cpp
> index f7f46f5034c..32559e1aade 100644
> --- a/src/intel/compiler/brw_vec4_nir.cpp
> +++ b/src/intel/compiler/brw_vec4_nir.cpp
> @@ -1366,6 +1366,27 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
>        break;
>     }
>  
> +   case nir_op_fequ: {
> +      dst_reg cmp_res = dst;
> +      if (nir_src_bit_size(instr->src[0].src) == 64)
> +         cmp_res = dst_reg(this, glsl_type::dvec4_type);
> +
> +      vec4_instruction *inst;
> +      inst = emit(CMP(cmp_res, op[0], op[0], BRW_CONDITIONAL_NZ));
> +      inst = emit(CMP(cmp_res, op[1], op[1], BRW_CONDITIONAL_NZ));
> +      inst->predicate = BRW_PREDICATE_NORMAL;
> +      inst->predicate_inverse = true;
> +      inst = emit(CMP(cmp_res, op[0], op[1], BRW_CONDITIONAL_Z));
> +      inst->predicate = BRW_PREDICATE_NORMAL;
> +      inst->predicate_inverse = true;
> +
> +      if (nir_src_bit_size(instr->src[0].src) == 64) {
> +         dst_reg cmp_res32 = dst_reg(this, glsl_type::bvec4_type);
> +         emit(VEC4_OPCODE_PICK_LOW_32BIT, cmp_res32, src_reg(cmp_res));
> +         emit(MOV(dst, src_reg(cmp_res32)));
> +      }
> +   }
> +
>     case nir_op_ball_iequal2:
>     case nir_op_ball_iequal3:
>     case nir_op_ball_iequal4:
>