[Mesa-dev] [PATCH v3 4/7] nir: Add a pass for lowering integer division by constants

Mon Oct 29 16:59:45 UTC 2018

Hi Jason,

thx for doing this pass, I was about to do the same (then we'd have 3 :P).
I'm not completely sure, but it looks like you're implementation is 
based on "Division by Invariant Integers using Multiplication" from T. 
Granlund and P. L. Montgomery? I think, it should be mentioned then.
Although some things are done a bit different (like idiv by pow2)...

Do you also plan to do runtime-invariants or even general lowering of 
udiv and friends? What do you think about some kind of ircp_scaled 
opcode to get the scaled 1/d multiplicator for some specified bit-size? 
The idea would be that for architectures without udiv support, they'd 
only have to implement the first part of the lowering while this pass 
does the second.

 >
 > It's a reasonably well-known fact in the world of compilers that integer
 > divisions by constants can be replaced by a multiply, an add, and some
 > shifts.  This commit adds such an optimization to NIR for easiest case
 > of udiv.  Other division operations will be added in following commits.
 > In order to provide some additional driver control, the pass takes a
 > minimum bit size to optimize.
 > ---
 >   src/compiler/Makefile.sources         |   1 +
 >   src/compiler/nir/meson.build          |   1 +
 >   src/compiler/nir/nir.h                |   2 +
 >   src/compiler/nir/nir_opt_idiv_const.c | 215 ++++++++++++++++++++++++++
 >   4 files changed, 219 insertions(+)
 >   create mode 100644 src/compiler/nir/nir_opt_idiv_const.c
 >
 > diff --git a/src/compiler/Makefile.sources 
b/src/compiler/Makefile.sources
 > index b65bb9b80b9..e44bce85ad9 100644
 > --- a/src/compiler/Makefile.sources
 > +++ b/src/compiler/Makefile.sources
 > @@ -278,6 +278,7 @@ NIR_FILES = \
 >       nir/nir_opt_find_array_copies.c \
 >       nir/nir_opt_gcm.c \
 >       nir/nir_opt_global_to_local.c \
 > +    nir/nir_opt_idiv_const.c \
 >       nir/nir_opt_if.c \
 >       nir/nir_opt_intrinsics.c \
 >       nir/nir_opt_loop_unroll.c \
 > diff --git a/src/compiler/nir/meson.build b/src/compiler/nir/meson.build
 > index d8f65640004..63f31e09870 100644
 > --- a/src/compiler/nir/meson.build
 > +++ b/src/compiler/nir/meson.build
 > @@ -162,6 +162,7 @@ files_libnir = files(
 >     'nir_opt_find_array_copies.c',
 >     'nir_opt_gcm.c',
 >     'nir_opt_global_to_local.c',
 > +  'nir_opt_idiv_const.c',
 >     'nir_opt_if.c',
 >     'nir_opt_intrinsics.c',
 >     'nir_opt_large_constants.c',
 > diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
 > index 96b437e7c82..4991c8c604c 100644
 > --- a/src/compiler/nir/nir.h
 > +++ b/src/compiler/nir/nir.h
 > @@ -3081,6 +3081,8 @@ bool nir_opt_find_array_copies(nir_shader *shader);
 >
 >   bool nir_opt_gcm(nir_shader *shader, bool value_number);
 >
 > +bool nir_opt_idiv_const(nir_shader *shader, unsigned min_bit_size);
 > +
 >   bool nir_opt_if(nir_shader *shader);
 >
 >   bool nir_opt_intrinsics(nir_shader *shader);
 > diff --git a/src/compiler/nir/nir_opt_idiv_const.c
 > b/src/compiler/nir/nir_opt_idiv_const.c
 > new file mode 100644
 > index 00000000000..7fa739161ba
 > --- /dev/null
 > +++ b/src/compiler/nir/nir_opt_idiv_const.c
 > @@ -0,0 +1,215 @@
 > +/*
 > + * Copyright © 2018 Intel Corporation
 > + *
 > + * Permission is hereby granted, free of charge, to any person 
obtaining a
 > + * copy of this software and associated documentation files (the
 > "Software"),
 > + * to deal in the Software without restriction, including without
 > limitation
 > + * the rights to use, copy, modify, merge, publish, distribute,
 > sublicense,
 > + * and/or sell copies of the Software, and to permit persons to whom the
 > + * Software is furnished to do so, subject to the following conditions:
 > + *
 > + * The above copyright notice and this permission notice (including the
 > next
 > + * paragraph) shall be included in all copies or substantial portions
 > of the
 > + * Software.
 > + *
 > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 > EXPRESS OR
 > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 > MERCHANTABILITY,
 > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT
 > SHALL
 > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 > OTHER
 > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 
ARISING
 > + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 > DEALINGS
 > + * IN THE SOFTWARE.
 > + */
 > +
 > +#include "nir.h"
 > +#include "nir_builder.h"
 > +#include "util/fast_idiv_by_const.h"
 > +#include "util/u_math.h"
 > +
 > +static nir_ssa_def *
 > +build_udiv(nir_builder *b, nir_ssa_def *n, uint64_t d)
 > +{
 > +   if (d == 0) {
 > +      return nir_imm_intN_t(b, 0, n->bit_size);
 > +   } else if (util_is_power_of_two_or_zero64(d)) {
 > +      return nir_ushr(b, n, nir_imm_int(b, util_logbase2_64(d)));
 > +   } else {
 > +      struct util_fast_udiv_info m =
 > +         util_compute_fast_udiv_info(d, n->bit_size, n->bit_size);
 > +
 > +      if (m.pre_shift)
 > +         n = nir_ushr(b, n, nir_imm_int(b, m.pre_shift));
 > +      if (m.increment)
 > +         n = nir_uadd_sat(b, n, nir_imm_intN_t(b, m.increment,
 > n->bit_size));

Does Intel have an actual instruction for nir_uadd_sat?
Here, your algorithm is a bit different from the paper, and if there is 
no actual hardware for this instruction, I'd suppose to leave it out. 
Anyway, codegen for hardware without support for uadd_sat would look 
different, so it might be worth to offer a second path here.
I think, lowering uadd_sat afterwards could would give worse code.

 > +      n = nir_umul_high(b, n, nir_imm_intN_t(b, m.multiplier,
 > n->bit_size));
 > +      if (m.post_shift)
 > +         n = nir_ushr(b, n, nir_imm_int(b, m.post_shift));
 > +
 > +      return n;
 > +   }
 > +}
 > +
 > +static nir_ssa_def *
 > +build_umod(nir_builder *b, nir_ssa_def *n, uint64_t d)
 > +{
 > +   if (d == 0) {
 > +      return nir_imm_intN_t(b, 0, n->bit_size);
 > +   } else if (util_is_power_of_two_or_zero64(d)) {
 > +      return nir_iand(b, n, nir_imm_intN_t(b, d - 1, n->bit_size));
 > +   } else {
 > +      return nir_isub(b, n, nir_imul(b, build_udiv(b, n, d),
 > +                                        nir_imm_intN_t(b, d,
 > n->bit_size)));
 > +   }
 > +}
 > +

What about imod? :)

 > +static nir_ssa_def *
 > +build_idiv(nir_builder *b, nir_ssa_def *n, int64_t d)
 > +{
 > +   if (d == 0) {
 > +      return nir_imm_intN_t(b, 0, n->bit_size);
 > +   } else if (d == 1) {
 > +      return n;
 > +   } else if (d == -1) {
 > +      return nir_ineg(b, n);
 > +   } else if (util_is_power_of_two_or_zero64(d)) {
 > +      uint64_t abs_d = d < 0 ? -d : d;
 > +      nir_ssa_def *uq = nir_ishr(b, n, nir_imm_int(b,
 > util_logbase2_64(abs_d)));
 > +      nir_ssa_def *n_neg = nir_ilt(b, n, nir_imm_intN_t(b, 0,
 > n->bit_size));
 > +      nir_ssa_def *neg = d < 0 ? nir_inot(b, n_neg) : n_neg;
 > +      return nir_bcsel(b, neg, nir_ineg(b, uq), uq);
 > +   } else {
 > +      struct util_fast_sdiv_info m =
 > +         util_compute_fast_sdiv_info(d, n->bit_size);
 > +
 > +      nir_ssa_def *res =
 > +         nir_imul_high(b, n, nir_imm_intN_t(b, m.multiplier,
 > n->bit_size));
 > +      if (d > 0 && m.multiplier < 0)
 > +         res = nir_iadd(b, res, n);
 > +      if (d < 0 && m.multiplier > 0)
 > +         res = nir_isub(b, res, n);
 > +      if (m.shift)
 > +         res = nir_ishr(b, res, nir_imm_int(b, m.shift));
 > +      res = nir_iadd(b, res, nir_ushr(b, res, nir_imm_int(b,
 > n->bit_size - 1)));
 > +
 > +      return res;
 > +   }
 > +}
 > +
 > +static bool
 > +nir_opt_idiv_const_instr(nir_builder *b, nir_alu_instr *alu)
 > +{
 > +   assert(alu->dest.dest.is_ssa);
 > +   assert(alu->src[0].src.is_ssa && alu->src[1].src.is_ssa);
 > +
 > +   nir_const_value *const_denom = 
nir_src_as_const_value(alu->src[1].src);
 > +   if (!const_denom)
 > +      return false;
 > +
 > +   unsigned bit_size = alu->src[1].src.ssa->bit_size;
 > +
 > +   b->cursor = nir_before_instr(&alu->instr);
 > +
 > +   nir_ssa_def *q[4];
 > +   for (unsigned comp = 0; comp < alu->dest.dest.ssa.num_components;
 > comp++) {
 > +      /* Get the numerator for the channel */
 > +      nir_ssa_def *n = nir_channel(b, alu->src[0].src.ssa,
 > +                                   alu->src[0].swizzle[comp]);
 > +
 > +      /* Get the denominator for the channel */
 > +      int64_t d;
 > +      switch (bit_size) {
 > +      case 8:
 > +         d = const_denom->i8[alu->src[1].swizzle[comp]];
 > +         break;
 > +      case 16:
 > +         d = const_denom->i16[alu->src[1].swizzle[comp]];
 > +         break;
 > +      case 32:
 > +         d = const_denom->i32[alu->src[1].swizzle[comp]];
 > +         break;
 > +      case 64:
 > +         d = const_denom->i64[alu->src[1].swizzle[comp]];
 > +         break;
 > +      default:
 > +         unreachable("Invalid bit size");
 > +      }
 > +
 > +      nir_alu_type d_type = nir_op_infos[alu->op].input_types[1];
 > +      if (nir_alu_type_get_base_type(d_type) == nir_type_uint) {
 > +         /* The code above sign-extended.  If we're lowering an
 > unsigned op,
 > +          * we need to mask it off to the correct number of bits so 
that a
 > +          * cast to uint64_t will do the right thing.
 > +          */
 > +         if (bit_size < 64)
 > +            d &= (1ull << bit_size) - 1;
 > +      }
 > +
 > +      switch (alu->op) {
 > +      case nir_op_udiv:
 > +         q[comp] = build_udiv(b, n, d);
 > +         break;
 > +      case nir_op_idiv:
 > +         q[comp] = build_idiv(b, n, d);
 > +         break;
 > +      case nir_op_umod:
 > +         q[comp] = build_umod(b, n, d);
 > +         break;
 > +      default:
 > +         unreachable("Unknown integer division op");
 > +      }
 > +   }
 > +
 > +   nir_ssa_def *qvec = nir_vec(b, q, alu->dest.dest.ssa.num_components);
 > +   nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(qvec));
 > +   nir_instr_remove(&alu->instr);
 > +
 > +   return true;
 > +}
 > +
 > +static bool
 > +nir_opt_idiv_const_impl(nir_function_impl *impl, unsigned min_bit_size)
 > +{
 > +   bool progress = false;
 > +
 > +   nir_builder b;
 > +   nir_builder_init(&b, impl);
 > +
 > +   nir_foreach_block(block, impl) {
 > +      nir_foreach_instr_safe(instr, block) {
 > +         if (instr->type != nir_instr_type_alu)
 > +            continue;
 > +
 > +         nir_alu_instr *alu = nir_instr_as_alu(instr);
 > +         if (alu->op != nir_op_udiv &&
 > +             alu->op != nir_op_idiv &&
 > +             alu->op != nir_op_umod)
 > +            continue;
 > +
 > +         assert(alu->dest.dest.is_ssa);
 > +         if (alu->dest.dest.ssa.bit_size < min_bit_size)
 > +            continue;
 > +
 > +         progress |= nir_opt_idiv_const_instr(&b, alu);
 > +      }
 > +   }
 > +
 > +   if (progress) {
 > +      nir_metadata_preserve(impl, nir_metadata_block_index |
 > +                                  nir_metadata_dominance);
 > +   }
 > +
 > +   return progress;
 > +}
 > +
 > +bool
 > +nir_opt_idiv_const(nir_shader *shader, unsigned min_bit_size)
 > +{
 > +   bool progress = false;
 > +
 > +   nir_foreach_function(function, shader) {
 > +      if (function->impl)
 > +         progress |= nir_opt_idiv_const_impl(function->impl,
 > min_bit_size);
 > +   }
 > +
 > +   return progress;
 > +}

Kind regards,
Daniel