[Mesa-dev] [PATCH 20/20] nir: Narrow unnecessary 64-bit operations to 32-bits

Wed Jul 19 20:37:31 UTC 2017

On Mon, Jul 17, 2017 at 8:14 PM, Connor Abbott <cwabbott0 at gmail.com> wrote:
> On Thu, Jul 6, 2017 at 4:48 PM, Matt Turner <mattst88 at gmail.com> wrote:
>> If we know the high bits are zero, we can just do a 32-bit comparison on
>> the low bytes instead.
>> ---
>>  src/compiler/nir/nir_opt_algebraic.py | 14 +++++++++-
>>  src/compiler/nir/nir_search_helpers.h | 48 +++++++++++++++++++++++++++++++++++
>>  2 files changed, 61 insertions(+), 1 deletion(-)
>>
>> diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
>> index df5854270c..a9c3e80929 100644
>> --- a/src/compiler/nir/nir_opt_algebraic.py
>> +++ b/src/compiler/nir/nir_opt_algebraic.py
>> @@ -44,7 +44,7 @@ d = 'd'
>>  # however, be used for backend-requested lowering operations as those need to
>>  # happen regardless of precision.
>>  #
>> -# Variable names are specified as "[#]name[@type][(cond)]" where "#" inicates
>> +# Variable names are specified as "[#]name[@type][(cond)]" where "#" indicates
>>  # that the given variable will only match constants and the type indicates that
>>  # the given variable will only match values from ALU instructions with the
>>  # given output type, and (cond) specifies an additional condition function
>> @@ -144,6 +144,16 @@ optimizations = [
>>     (('inot', ('ieq', a, b)), ('ine', a, b)),
>>     (('inot', ('ine', a, b)), ('ieq', a, b)),
>>
>> +   # Unnecessary 64-bit comparisons
>> +   (('ieq', 'a at 64(fits_in_32_bits)', 'b at 64(fits_in_32_bits)'), ('ieq', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b))),
>> +   (('ine', 'a at 64(fits_in_32_bits)', 'b at 64(fits_in_32_bits)'), ('ine', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b))),
>> +   (('ilt', 'a at 64(fits_in_32_bits)', 'b at 64(fits_in_32_bits)'), ('ilt', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b))),
>> +   (('ige', 'a at 64(fits_in_32_bits)', 'b at 64(fits_in_32_bits)'), ('ige', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b))),
>> +   (('ult', 'a at 64(fits_in_32_bits)', 'b at 64(fits_in_32_bits)'), ('ult', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b))),
>> +   (('uge', 'a at 64(fits_in_32_bits)', 'b at 64(fits_in_32_bits)'), ('uge', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b))),
>> +
>> +   (('iand', 'a at 64(fits_in_32_bits)', 'b at 64'), ('pack_64_2x32_split', ('iand', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)), 0)),
>> +
>>     # 0.0 >= b2f(a)
>>     # b2f(a) <= 0.0
>>     # b2f(a) == 0.0 because b2f(a) can only be 0 or 1
>> @@ -315,6 +325,8 @@ optimizations = [
>>     (('pack_64_2x32_split', ('unpack_64_2x32_split_x', a),
>>                             ('unpack_64_2x32_split_y', a)), a),
>>
>> +   (('unpack_64_2x32_split_y', 'a(fits_in_32_bits)'), 0),
>> +
>>     # Byte extraction
>>     (('ushr', a, 24), ('extract_u8', a, 3), '!options->lower_extract_byte'),
>>     (('iand', 0xff, ('ushr', a, 16)), ('extract_u8', a, 2), '!options->lower_extract_byte'),
>> diff --git a/src/compiler/nir/nir_search_helpers.h b/src/compiler/nir/nir_search_helpers.h
>> index 200f2471f8..c29ea5b9dd 100644
>> --- a/src/compiler/nir/nir_search_helpers.h
>> +++ b/src/compiler/nir/nir_search_helpers.h
>> @@ -115,6 +115,54 @@ is_zero_to_one(nir_alu_instr *instr, unsigned src, unsigned num_components,
>>  }
>>
>>  static inline bool
>> +fits_in_32_bits(nir_alu_instr *instr, unsigned src, unsigned num_components,
>> +                const uint8_t *swizzle)
>> +{
>> +   if (instr->src[src].src.is_ssa &&
>> +       instr->src[src].src.ssa->parent_instr->type == nir_instr_type_alu) {
>> +      nir_alu_instr *parent_instr =
>> +         nir_instr_as_alu(instr->src[src].src.ssa->parent_instr);
>> +
>> +      switch (parent_instr->op) {
>> +      case nir_op_pack_64_2x32_split: {
>> +         nir_const_value *val =
>> +            nir_src_as_const_value(parent_instr->src[1].src);
>> +
>> +         if (val && val->u32[0] == 0)
>> +            return true;
>
> I think you need to check all the components here, or at least all the
> components that are used by parent_instr.

True.

>> +         break;
>> +      }
>> +      default:
>> +         break;
>> +      }
>> +
>> +      return false;
>> +   }
>> +
>> +   nir_const_value *val = nir_src_as_const_value(instr->src[src].src);
>> +
>> +   if (!val)
>> +      return false;
>> +
>> +   for (unsigned i = 0; i < num_components; i++) {
>> +      switch (nir_op_infos[instr->op].input_types[src]) {
>> +      case nir_type_int:
>> +         if (val->i64[swizzle[i]] != (int)val->i64[swizzle[i]])
>> +            return false;
>> +         break;
>> +      case nir_type_uint:
>> +         if (val->u64[swizzle[i]] != (unsigned)val->u64[swizzle[i]])
>> +            return false;
>> +         break;
>> +      default:
>> +         return false;
>> +      }
>> +   }
>> +
>> +   return true;
>> +}
>
> This function seems to be doing two subtly different things:
>
> 1. If the instruction defining the source is nir_op_pack_64_2x32_split
> or the source type is nir_type_uint, then it checks if the high 32
> bits are 0.
> 2. Otherwise, it checks that the value can be replaced by a 32-bit
> value that's sign extended to 64 bits, that is, whether the high 32
> bits are the same as bit 31.
>
> The problem is that if the source type is nir_type_int, then this
> function checks two different things depending on whether the source
> is a constant or the result of nir_op_pack_64_2x32_split. Based on how
> you're using it, it seems like you really want to check whether the
> high bits are the same as bit 31 in this case, so need to bail out of
> the first 'if' if the source type isn't nir_type_uint.

You are right. I gave this a try, and it breaks the optimization,
because the cases I'm looking to optimize are (ine, 0,
pack_64_2x32(ballot(...), 0)). Since we use "ine", so the type comes
back as nir_type_int. In that case I have

    case nir_type_int:
       if (!hi_bits || !lo_bits ||
           hi_bits->i32[i] != (lo_bits->i32[i] >> 31))
          return false;
       break;

but lo_bits is not const, so it fails.

Any suggestions?

I don't think I really had any reason for the sign-extending case, FWIW.