[Mesa-dev] [RFC 4/5] intel/compiler: Implement unordered comparisons
Jason Ekstrand
jason at jlekstrand.net
Tue Nov 27 19:43:32 UTC 2018
On Tue, Nov 27, 2018 at 12:55 PM Ian Romanick <idr at freedesktop.org> wrote:
> On 11/22/2018 10:46 AM, Jason Ekstrand wrote:
> > The vec4 path has only been compile-tested as there's no easy way to
> > generate a vec4 shader with an unordered equality.
> > ---
> > src/intel/compiler/brw_compiler.c | 3 +++
> > src/intel/compiler/brw_fs_nir.cpp | 20 +++++++++++++-------
> > src/intel/compiler/brw_vec4_nir.cpp | 21 +++++++++++++++++++++
> > 3 files changed, 37 insertions(+), 7 deletions(-)
> >
> > diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c
> > index fe632c5badc..f9e8fa09a34 100644
> > --- a/src/intel/compiler/brw_compiler.c
> > +++ b/src/intel/compiler/brw_compiler.c
> > @@ -42,6 +42,9 @@
> > .lower_fdiv = true, \
> > .lower_flrp64 = true, \
> > .lower_ldexp = true, \
> > + .lower_fltu = true, \
> > + .lower_fgeu = true, \
>
> Can't we use cmpn.l and cmpn.ge for these? On some platforms it has to
> be split into two SIMD8 instructions, but that seems better than the
> lowering... or maybe just use those on BDW+?
>
I didn't know about CMPN! I guess that would do the trick, though I suppose
we'd have a pile of back-end work before we could optimize it. I think the
back-end usually gets rid of the inot by flipping the cmod, but maybe not.
> > + .lower_fne_to_fequ = true, \
> > .lower_cs_local_id_from_index = true, \
> > .lower_device_index_to_zero = true, \
> > .native_integers = true, \
> > diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
> > index a62d521bb5d..eba3611e447 100644
> > --- a/src/intel/compiler/brw_fs_nir.cpp
> > +++ b/src/intel/compiler/brw_fs_nir.cpp
> > @@ -1049,6 +1049,8 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
> > case nir_op_flt:
> > case nir_op_fge:
> > case nir_op_feq:
> > + case nir_op_fne:
> > + case nir_op_fequ:
> > case nir_op_fneu: {
> > fs_reg dest = result;
> >
> > @@ -1056,26 +1058,30 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
> > if (bit_size != 32)
> > dest = bld.vgrf(op[0].type, 1);
> >
> > - brw_conditional_mod cond;
> > switch (instr->op) {
> > case nir_op_flt:
> > - cond = BRW_CONDITIONAL_L;
> > + bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_L);
> > break;
> > case nir_op_fge:
> > - cond = BRW_CONDITIONAL_GE;
> > + bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_GE);
> > break;
> > case nir_op_feq:
> > - cond = BRW_CONDITIONAL_Z;
> > + bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_Z);
> > + break;
> > + case nir_op_fequ:
> > + bld.CMP(dest, op[0], op[0], BRW_CONDITIONAL_NZ);
> > + set_predicate_inv(BRW_PREDICATE_NORMAL, true, /* inverse */
> > + bld.CMP(dest, op[1], op[1], BRW_CONDITIONAL_NZ));
> > + set_predicate_inv(BRW_PREDICATE_NORMAL, true, /* inverse */
> > + bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_Z));
>
> OpFUnordEqual is (isnan(a) || isnan(b) || a == b), and that's literally
> what this code does. It seems like this could be implemented (perhaps
> as a lowering?) as (OpFUnordGreaterThanEqual(a,b) &&
> OpFUnordGreaterThanEqual(b, a)) via cmpn.ge. Using this same technique,
> that would be 2 instructions (on BDW+) instead of 3. HSW has to do cmpn
> as two SIMD8 instructions, so this may be better there. Dunno.
>
That's a good idea. Unfortunately, CMPN doesn't modify Z or NZ. We could
also lower fequ(x, y) to inot(fne(x, y)) and implement fne(x, y) as (x < y)
|| (x > y), which wouldn't require CMPN.
--Jason
> > break;
> > case nir_op_fneu:
> > - cond = BRW_CONDITIONAL_NZ;
> > + bld.CMP(dest, op[0], op[1], BRW_CONDITIONAL_NZ);
> > break;
> > default:
> > unreachable("bad opcode");
> > }
> >
> > - bld.CMP(dest, op[0], op[1], cond);
> > -
> > if (bit_size > 32) {
> > bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
> > } else if(bit_size < 32) {
> > diff --git a/src/intel/compiler/brw_vec4_nir.cpp b/src/intel/compiler/brw_vec4_nir.cpp
> > index f7f46f5034c..32559e1aade 100644
> > --- a/src/intel/compiler/brw_vec4_nir.cpp
> > +++ b/src/intel/compiler/brw_vec4_nir.cpp
> > @@ -1366,6 +1366,27 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
> > break;
> > }
> >
> > + case nir_op_fequ: {
> > + dst_reg cmp_res = dst;
> > + if (nir_src_bit_size(instr->src[0].src) == 64)
> > + cmp_res = dst_reg(this, glsl_type::dvec4_type);
> > +
> > + vec4_instruction *inst;
> > + inst = emit(CMP(cmp_res, op[0], op[0], BRW_CONDITIONAL_NZ));
> > + inst = emit(CMP(cmp_res, op[1], op[1], BRW_CONDITIONAL_NZ));
> > + inst->predicate = BRW_PREDICATE_NORMAL;
> > + inst->predicate_inverse = true;
> > + inst = emit(CMP(cmp_res, op[0], op[1], BRW_CONDITIONAL_Z));
> > + inst->predicate = BRW_PREDICATE_NORMAL;
> > + inst->predicate_inverse = true;
> > +
> > + if (nir_src_bit_size(instr->src[0].src) == 64) {
> > + dst_reg cmp_res32 = dst_reg(this, glsl_type::bvec4_type);
> > + emit(VEC4_OPCODE_PICK_LOW_32BIT, cmp_res32, src_reg(cmp_res));
> > + emit(MOV(dst, src_reg(cmp_res32)));
> > + }
> > + break;
> > + }
> > +
> > case nir_op_ball_iequal2:
> > case nir_op_ball_iequal3:
> > case nir_op_ball_iequal4:
> >
>
>