[Mesa-dev] [RFC 06/10] nir/lower_double_ops: lower mul()

Elie Tournier tournier.elie at gmail.com
Wed Apr 12 22:43:15 UTC 2017


Signed-off-by: Elie Tournier <elie.tournier at collabora.com>
---
 src/compiler/nir/nir.h                  |   3 +-
 src/compiler/nir/nir_lower_double_ops.c | 749 ++++++++++++++++++++++++++++++++
 src/intel/compiler/brw_nir.c            |   3 +-
 3 files changed, 753 insertions(+), 2 deletions(-)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 89d5dd8e1b..58045e3d42 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -2573,7 +2573,8 @@ typedef enum {
    nir_lower_dneg = (1 << 10),
    nir_lower_dsign = (1 << 11),
    nir_lower_deq = (1 << 12),
-   nir_lower_dlt = (1 << 13)
+   nir_lower_dlt = (1 << 13),
+   nir_lower_dmul = (1 << 14)
 } nir_lower_doubles_options;
 
 bool nir_lower_doubles(nir_shader *shader, nir_lower_doubles_options options);
diff --git a/src/compiler/nir/nir_lower_double_ops.c b/src/compiler/nir/nir_lower_double_ops.c
index 38743206a8..807fa18fc1 100644
--- a/src/compiler/nir/nir_lower_double_ops.c
+++ b/src/compiler/nir/nir_lower_double_ops.c
@@ -36,6 +36,20 @@
  * - 32-bit integer and floating point arithmetic
  */
 
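+/* IEEE-754 binary64 layout used by the helpers below:
+ *
+ *   bit 63     : sign
+ *   bits 52-62 : biased exponent (11 bits, bias 0x3FF)
+ *   bits 0-51  : fraction
+ *
+ * Everything is done on the two 32-bit halves of the value, so the fraction
+ * splits into a 20-bit high part (bits 0-19 of the high word) and a 32-bit
+ * low part.
+ */
+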
+/* Creates a double with the sign bit set to a given integer value */
+static nir_ssa_def *
+set_sign(nir_builder *b, nir_ssa_def *src, nir_ssa_def *sign)
+{
+   /* Split into bits 0-31 and 32-63 */
+   nir_ssa_def *lo = nir_unpack_64_2x32_split_x(b, src);
+   nir_ssa_def *hi = nir_unpack_64_2x32_split_y(b, src);
+
+   /* The sign is bit 63, or bit 31 of the high word */
+   nir_ssa_def *new_hi = nir_bfi(b, nir_imm_int(b, 0x80000000), sign, hi);
+   /* recombine */
+   return nir_pack_64_2x32_split(b, lo, new_hi);
+}
+
 static nir_ssa_def *
 get_sign(nir_builder *b, nir_ssa_def *src)
 {
@@ -73,6 +87,57 @@ get_exponent(nir_builder *b, nir_ssa_def *src)
 }
 
 static nir_ssa_def *
+set_frac_hi(nir_builder *b, nir_ssa_def *src, nir_ssa_def *frac_hi)
+{
+   /* Split into bits 0-31 and 32-63 */
+   nir_ssa_def *lo = nir_unpack_64_2x32_split_x(b, src);
+   nir_ssa_def *hi = nir_unpack_64_2x32_split_y(b, src);
+
+   /* The frac_hi is bits 32-51, or bits 0-19 of the high word */
+   nir_ssa_def *new_hi = nir_bfi(b, nir_imm_int(b, 0x000FFFFF), frac_hi, hi);
+   /* recombine */
+   return nir_pack_64_2x32_split(b, lo, new_hi);
+}
+
+static nir_ssa_def *
+get_frac_hi(nir_builder *b, nir_ssa_def *src)
+{
+   /* get bits 32-63 */
+   nir_ssa_def *hi = nir_unpack_64_2x32_split_y(b, src);
+
+   /* extract bits 0-19 of the high word */
+   return nir_ubitfield_extract(b, hi, nir_imm_int(b, 0), nir_imm_int(b, 20));
+}
+
+static nir_ssa_def *
+set_frac_lo(nir_builder *b, nir_ssa_def *src, nir_ssa_def *frac_lo)
+{
+   nir_ssa_def *hi = nir_unpack_64_2x32_split_y(b, src);
+   /* recombine */
+   return nir_pack_64_2x32_split(b, frac_lo, hi);
+}
+
+static nir_ssa_def *
+get_frac_lo(nir_builder *b, nir_ssa_def *src)
+{
+   /* get bits 0-31 */
+   return nir_unpack_64_2x32_split_x(b, src);
+}
+
+static nir_ssa_def *
+pack_fp64(nir_builder *b, nir_ssa_def *z_si,
+                          nir_ssa_def *z_exp,
+                          nir_ssa_def *z_frac_hi, nir_ssa_def *z_frac_lo)
+{
+   nir_ssa_def *z = nir_imm_double(b, 0.0);
+   z = set_sign(b, z, z_si);
+   z = set_exponent(b, z, z_exp);
+   z = set_frac_hi(b, z, z_frac_hi);
+   z = set_frac_lo(b, z, z_frac_lo);
+   return z;
+}
+
+static nir_ssa_def *
 is_nan(nir_builder *b, nir_ssa_def *src)
 {
    nir_ssa_def *src_lo = nir_unpack_64_2x32_split_x(b, src);
@@ -90,6 +155,247 @@ is_nan(nir_builder *b, nir_ssa_def *src)
                                          nir_imm_int(b, 0x000FFFFF))));
 }
 
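+/* A signaling NaN has the maximum exponent (0x7FF), bit 51 (the quiet bit)
+ * clear and a nonzero fraction; (src_hi >> 19) & 0xFFF == 0xFFE tests the
+ * exponent and the quiet bit in a single compare.
+ */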
+static nir_ssa_def *
+is_signaling_nan(nir_builder *b, nir_ssa_def *src)
+{
+   nir_ssa_def *src_lo = nir_unpack_64_2x32_split_x(b, src);
+   nir_ssa_def *src_hi = nir_unpack_64_2x32_split_y(b, src);
+
+   /* return (((src_hi>>19) & 0xFFF) == 0xFFE ) &&
+    *    (src_lo || (src_hi & 0x0007FFFF));
+    */
+   return nir_iand(b,
+                   nir_ieq(b,
+                           nir_iand(b,
+                                    nir_ishr(b, src_hi, nir_imm_int(b, 19)),
+                                    nir_imm_int(b, 0xFFF)),
+                           nir_imm_int(b, 0xFFE)),
+                   nir_ior(b, src_lo, nir_iand(b,
+                                               src_hi,
+                                               nir_imm_int(b, 0x0007FFFF))));
+}
+
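+/* Selects which NaN to propagate for a binary operation and quiets it by
+ * setting bit 51.  A signaling NaN in `x' defers to a NaN in `y'.
+ */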
+static nir_ssa_def *
+propagate_fp64_nan(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
+{
+   nir_ssa_def *x_is_nan = is_nan(b, x);
+   nir_ssa_def *x_is_signaling_nan = is_signaling_nan(b, x);
+   nir_ssa_def *y_is_nan = is_nan(b, y);
+
+   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
+   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
+   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
+   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);
+
+   x_hi = nir_ior(b, x_hi, nir_imm_int(b, 0x00080000));
+   y_hi = nir_ior(b, y_hi, nir_imm_int(b, 0x00080000));
+   x =  nir_pack_64_2x32_split(b, x_lo, x_hi);
+   y =  nir_pack_64_2x32_split(b, y_lo, y_hi);
+
+   return nir_bcsel(b,
+                    x_is_nan,
+                    nir_bcsel(b,
+                              nir_iand(b, x_is_signaling_nan, y_is_nan),
+                              y,
+                              x),
+                    y);
+}
+
+/* Shifts the 64-bit value formed by concatenating `src_hi' and `src_lo' left
+ * by the number of bits given in `count'.  Any bits shifted off are lost.
+ * The value of `count' must be less than 32.  The result is broken into two
+ * 32-bit pieces which are stored at the locations pointed to by
+ * `z0Ptr' and `z1Ptr'.
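+ * For example, shifting 0x00000001:0x80000000 left by 4 gives
+ * 0x00000018:0x00000000.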
+ */
+static void
+short_shl64(nir_builder *b, nir_ssa_def *src_hi, nir_ssa_def *src_lo,
+                            nir_ssa_def *count,
+                            nir_ssa_def **z0Ptr, nir_ssa_def **z1Ptr)
+{
+   /* z1Ptr = src_lo << count */
+   *z1Ptr = nir_ishl(b, src_lo, count);
+
+   /* z0Ptr = (count == 0) ? src_hi
+    *                      : (src_hi << count) | (src_lo >> ((-count) & 31))
+    */
+   *z0Ptr = nir_bcsel(b,
+                      nir_ieq(b, count, nir_imm_int(b, 0)),
+                      src_hi,
+                      nir_ior(b,
+                              nir_ishl(b, src_hi, count),
+                              nir_ushr(b, src_lo,
+                                          nir_iand(b,
+                                                   nir_ineg(b, count),
+                                                   nir_imm_int(b, 31)))));
+}
+
+/* Shifts the 96-bit value formed by concatenating `src_0', `src_1', and `src_2'
+ * right by 32 _plus_ the number of bits given in `count'.  The shifted result
+ * is at most 64 nonzero bits; these are broken into two 32-bit pieces which are
+ * stored at the locations pointed to by `z0Ptr' and `z1Ptr'.  The bits shifted
+ * off form a third 32-bit result as follows:  The _last_ bit shifted off is
+ * the most-significant bit of the extra result, and the other 31 bits of the
+ * extra result are all zero if and only if _all_but_the_last_ bits shifted off
+ * were all zero.  This extra result is stored in the location pointed to by
+ * `z2Ptr'.  The value of `count' can be arbitrarily large.
+ * (This routine makes more sense if `src_0', `src_1', and `src_2' are considered
+ * to form a fixed-point value with binary point between `src_1' and `src_2'.
+ * This fixed-point value is shifted right by the number of bits given in `count',
+ * and the integer part of the result is returned at the locations pointed to
+ * by `z0Ptr' and `z1Ptr'.  The fractional part of the result may be slightly
+ * corrupted as described above, and is returned at the location pointed to by
+ * `z2Ptr'.)
+ */
+static void
+shift64_extra_right_jamming(nir_builder *b,
+                            nir_ssa_def *src_0,
+                            nir_ssa_def *src_1,
+                            nir_ssa_def *src_2,
+                            nir_ssa_def *count,
+                            nir_ssa_def **z0Ptr,
+                            nir_ssa_def **z1Ptr,
+                            nir_ssa_def **z2Ptr)
+{
+   nir_ssa_def *int_0 = nir_imm_int(b, 0);
+   nir_ssa_def *int_31 = nir_imm_int(b, 31);
+   nir_ssa_def *int_32 = nir_imm_int(b, 32);
+   nir_ssa_def *int_64 = nir_imm_int(b, 64);
+
+   nir_ssa_def *neg_count = nir_iand(b,
+                                     nir_ineg(b, count),
+                                     int_31);
+
+   nir_ssa_def *is_count_0 = nir_ieq(b, count, int_0);
+   nir_ssa_def *is_count_lt32 = nir_ilt(b, count, int_32);
+   nir_ssa_def *is_count_32 = nir_ieq(b, count, int_32);
+   nir_ssa_def *is_count_lt64 = nir_ilt(b, count, int_64);
+
+   *z0Ptr = nir_bcsel(b,
+                      is_count_0,
+                      src_0,
+                      nir_bcsel(b,
+                                is_count_lt32,
+                                nir_ushr(b, src_0, count),
+                                int_0));
+
+   *z1Ptr =
+      nir_bcsel(b,
+                is_count_0,
+                src_1,
+                nir_bcsel(b,
+                          is_count_lt32,
+                          nir_ior(b,
+                                  nir_ishl(b, src_0, neg_count),
+                                  nir_ushr(b, src_1, count)),
+                          nir_bcsel(b,
+                                    is_count_32,
+                                    src_0,
+                                    nir_bcsel(b,
+                                              is_count_lt64,
+                                              nir_ushr(b,
+                                                       src_0,
+                                                       nir_iand(b, count,
+                                                                   int_31)),
+                                              int_0))));
+
+   *z2Ptr =
+      nir_bcsel(b,
+                is_count_0,
+                src_2,
+                nir_bcsel(b,
+                          is_count_lt32,
+                          nir_ior(b, nir_ishl(b, src_1, neg_count),
+                                     nir_ine(b, src_2, int_0)),
+                          nir_bcsel(b,
+                                    is_count_32,
+                                    nir_ior(b, src_1,
+                                               nir_ine(b, src_2, int_0)),
+                                    nir_ior(b,
+                                            nir_bcsel(b,
+                                                      is_count_lt64,
+                                                      nir_ishl(b, src_0,
+                                                                  neg_count),
+                                                      nir_bcsel(b,
+                                                                nir_ieq(b,
+                                                                        count,
+                                                                        int_64),
+                                                                src_0,
+                                                                nir_ine(b,
+                                                                        src_0,
+                                                                        int_0))),
+                                            nir_ine(b,
+                                                    nir_ior(b, src_2, src_1),
+                                                    int_0)))));
+}
+
+/* Returns the number of leading 0 bits before the most-significant 1 bit of
+ * `src'.  If `src' is zero, 32 is returned.
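+ * (nir_ufind_msb returns -1 when `src' is zero, so 31 - (-1) = 32.)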
+ */
+static nir_ssa_def *
+count_leading_zeros(nir_builder *b, nir_ssa_def *src)
+{
+   return nir_isub(b, nir_imm_int(b, 31), nir_ufind_msb(b, src));
+}
+
+/* Normalizes the subnormal double-precision floating-point value represented
+ * by the denormalized significand formed by the concatenation of `frac_hi' and
+ * `frac_lo'.  The normalized exponent is stored at the location pointed to by
+ * `zExpPtr'.  The most significant 21 bits of the normalized significand are
+ * stored at the location pointed to by `zFrac0Ptr', and the least significant
+ * 32 bits of the normalized significand are stored at the location pointed to
+ * by `zFrac1Ptr'.
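+ * Both the frac_hi == 0 and frac_hi != 0 paths are computed and the result
+ * selected with bcsel, since this lowering is written branch-free.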
+ */
+static void
+normalize_fp64_subnormal(nir_builder *b,
+                         nir_ssa_def *frac_hi,
+                         nir_ssa_def *frac_lo,
+                         nir_ssa_def **zExpPtr,
+                         nir_ssa_def **zFrac0Ptr,
+                         nir_ssa_def **zFrac1Ptr)
+{
+   nir_ssa_def *shift_count_hi = nir_isub(b,
+                                          count_leading_zeros(b, frac_hi),
+                                          nir_imm_int(b, 11));
+   nir_ssa_def *shift_count_lo = nir_isub(b,
+                                          count_leading_zeros(b, frac_lo),
+                                          nir_imm_int(b, 11));
+
+   short_shl64(b, frac_hi, frac_lo, shift_count_hi, zFrac0Ptr, zFrac1Ptr);
+
+   nir_ssa_def *is_frac_hi_zero = nir_ieq(b, frac_hi, nir_imm_int(b, 0));
+   nir_ssa_def *is_shift_count_lo_neg = nir_ilt(b,
+                                                shift_count_lo,
+                                                nir_imm_int(b, 0));
+
+   *zFrac0Ptr = nir_bcsel(b,
+                          is_frac_hi_zero,
+                          nir_bcsel(b,
+                                    is_shift_count_lo_neg,
+                                    nir_ushr(b, frac_lo,
+                                                nir_ineg(b, shift_count_lo)),
+                                    nir_ishl(b, frac_lo, shift_count_lo)),
+                          *zFrac0Ptr);
+
+   *zFrac1Ptr = nir_bcsel(b,
+                          is_frac_hi_zero,
+                          nir_bcsel(b,
+                                    is_shift_count_lo_neg,
+                                    nir_ishl(b, frac_lo,
+                                                nir_iand(b,
+                                                         shift_count_lo,
+                                                         nir_imm_int(b, 31))),
+                                    nir_imm_int(b, 0)),
+                          *zFrac1Ptr);
+
+   *zExpPtr = nir_bcsel(b,
+                        is_frac_hi_zero,
+                        nir_isub(b,
+                                 nir_ineg(b, shift_count_lo),
+                                 nir_imm_int(b, 31)),
+                        nir_isub(b, nir_imm_int(b, 1), shift_count_hi));
+}
+
+
 /* Return infinity with the sign of the given source which is +/-0 */
 
 static nir_ssa_def *
@@ -147,6 +453,265 @@ lt64(nir_builder *b, nir_ssa_def *x_hi, nir_ssa_def *x_lo,
    return nir_ior(b, lt_hi, nir_iand(b, eq_hi, lt_lo));
 }
 
+/* Adds the 64-bit value formed by concatenating `x_hi' and `x_lo' to the 64-bit
+ * value formed by concatenating `y_hi' and `y_lo'.  Addition is modulo 2^64, so
+ * any carry out is lost.  The result is broken into two 32-bit pieces which
+ * are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
+ */
+static void
+add64(nir_builder *b,
+      nir_ssa_def *x_hi, nir_ssa_def *x_lo,
+      nir_ssa_def *y_hi, nir_ssa_def *y_lo,
+      nir_ssa_def **z0Ptr, nir_ssa_def **z1Ptr)
+{
+   nir_ssa_def *z = nir_iadd(b, x_lo, y_lo);
+   *z1Ptr = z;
+   /* (z < x_lo) is the carry out of the low word */
+   *z0Ptr = nir_iadd(b, x_hi,
+                        nir_iadd(b, y_hi,
+                                    nir_b2i(b, nir_ult(b, z, x_lo))));
+}
+
+/* Multiplies `x' by `y' to obtain a 64-bit product.  The product is broken
+ * into two 32-bit pieces which are stored at the locations pointed to by
+ * `z0Ptr' and `z1Ptr'.
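+ *
+ * With x = x_hi * 2^16 + x_lo and y = y_hi * 2^16 + y_lo, the product is
+ * x*y = (x_hi*y_hi << 32) + ((x_lo*y_hi + x_hi*y_lo) << 16) + x_lo*y_lo,
+ * with the carries between the 32-bit partial products propagated by hand.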
+ */
+static void
+mul32_to_64(nir_builder *b,
+            nir_ssa_def *x, nir_ssa_def *y,
+            nir_ssa_def **z0Ptr, nir_ssa_def **z1Ptr)
+{
+   /* Split each operand into 16-bit halves; there are no 16-bit types here,
+    * so mask and shift explicitly.
+    */
+   nir_ssa_def *x_lo = nir_iand(b, x, nir_imm_int(b, 0xFFFF));
+   nir_ssa_def *x_hi = nir_ushr(b, x, nir_imm_int(b, 16));
+   nir_ssa_def *y_lo = nir_iand(b, y, nir_imm_int(b, 0xFFFF));
+   nir_ssa_def *y_hi = nir_ushr(b, y, nir_imm_int(b, 16));
+   nir_ssa_def *z1 = nir_imul(b, x_lo, y_lo);
+   nir_ssa_def *z_mid_x = nir_imul(b, x_lo, y_hi);
+   nir_ssa_def *z_mid_y = nir_imul(b, x_hi, y_lo);
+   nir_ssa_def *z0 = nir_imul(b, x_hi, y_hi);
+   z_mid_x = nir_iadd(b, z_mid_x, z_mid_y);
+   /* The carry out of the middle sum and its high half go into z0 */
+   z0 = nir_iadd(b, z0,
+                    nir_iadd(b,
+                             nir_ishl(b,
+                                      nir_b2i(b, nir_ult(b, z_mid_x, z_mid_y)),
+                                      nir_imm_int(b, 16)),
+                             nir_ushr(b, z_mid_x, nir_imm_int(b, 16))));
+   z_mid_x = nir_ishl(b, z_mid_x, nir_imm_int(b, 16));
+   z1 = nir_iadd(b, z1, z_mid_x);
+   z0 = nir_iadd(b, z0, nir_b2i(b, nir_ult(b, z1, z_mid_x)));
+   *z1Ptr = z1;
+   *z0Ptr = z0;
+}
+
+/* Multiplies the 64-bit value formed by concatenating `x_hi' and `x_lo' to the
+ * 64-bit value formed by concatenating `y_hi' and `y_lo' to obtain a 128-bit
+ * product.  The product is broken into four 32-bit pieces which are stored at
+ * the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
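+ * This is schoolbook multiplication in base 2^32: four 32x32 -> 64-bit
+ * partial products accumulated with carries via add64().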
+ */
+static void
+mul64_to_128(nir_builder *b,
+      nir_ssa_def *x_hi, nir_ssa_def *x_lo,
+      nir_ssa_def *y_hi, nir_ssa_def *y_lo,
+      nir_ssa_def **z0Ptr, nir_ssa_def **z1Ptr,
+      nir_ssa_def **z2Ptr, nir_ssa_def **z3Ptr)
+{
+   nir_ssa_def *z0;
+   nir_ssa_def *z1;
+   nir_ssa_def *z2;
+   nir_ssa_def *z3;
+   nir_ssa_def *more1;
+   nir_ssa_def *more2;
+
+   mul32_to_64(b, x_lo, y_lo, &z2, &z3);
+   mul32_to_64(b, x_lo, y_hi, &z1, &more2);
+   add64(b, z1, more2, nir_imm_int(b, 0), z2, &z1, &z2);
+   mul32_to_64(b, x_hi, y_hi, &z0, &more1);
+   add64(b, z0, more1, nir_imm_int(b, 0), z1, &z0, &z1);
+   mul32_to_64(b, x_hi, y_lo, &more1, &more2);
+   add64(b, more1, more2, nir_imm_int(b, 0), z2, &more1, &z2);
+   add64(b, z0, z1, nir_imm_int(b, 0), more1, &z0, &z1);
+
+   *z3Ptr = z3;
+   *z2Ptr = z2;
+   *z1Ptr = z1;
+   *z0Ptr = z0;
+}
+
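+/* Applies the rounding increment to the significand `z_frac_0':`z_frac_1'
+ * and packs the result.  On an exact tie with round-to-nearest-even the
+ * LSB is cleared; an all-zero unrounded significand packs with a zero
+ * exponent.
+ */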
+static nir_ssa_def *
+round_pack(nir_builder *b,
+           nir_ssa_def *increment,
+           nir_ssa_def *round_nearest_even,
+           nir_ssa_def *z_si,
+           nir_ssa_def *z_exp,
+           nir_ssa_def *z_frac_0,
+           nir_ssa_def *z_frac_1,
+           nir_ssa_def *z_frac_2)
+{
+   nir_ssa_def *zero = nir_imm_int(b, 0);
+   nir_ssa_def *one = nir_imm_int(b, 1);
+
+   nir_ssa_def *z_frac_0_incr;
+   nir_ssa_def *z_frac_1_incr;
+   add64(b, z_frac_0, z_frac_1, zero, one, &z_frac_0_incr, &z_frac_1_incr);
+   /* On an exact tie ((z_frac_2 + z_frac_2) == 0 with the MSB set) under
+    * round-to-nearest-even, clear the LSB to round to even.
+    */
+   z_frac_1_incr = nir_iand(b,
+                            z_frac_1_incr,
+                            nir_inot(b,
+                                     nir_iand(b,
+                                              nir_iand(b,
+                                                       nir_ieq(b,
+                                                               nir_iadd(b,
+                                                                        z_frac_2,
+                                                                        z_frac_2),
+                                                               zero),
+                                                       round_nearest_even),
+                                              one)));
+
+   return nir_bcsel(b,
+                    nir_ine(b, increment, zero),
+                    pack_fp64(b, z_si,
+                                 z_exp,
+                                 z_frac_0_incr, z_frac_1_incr),
+                    nir_bcsel(b,
+                              nir_ieq(b,
+                                      nir_ior(b,
+                                              nir_ior(b, z_frac_0, z_frac_1),
+                                              z_frac_2),
+                                      zero),
+                              pack_fp64(b, z_si, zero, z_frac_0, z_frac_1),
+                              pack_fp64(b, z_si, z_exp, z_frac_0, z_frac_1)));
+}
+
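+/* Rounds and packs a double from a sign, an exponent and a 96-bit extended
+ * significand (`z_frac_0':`z_frac_1' plus the sticky/round word `z_frac_2'),
+ * handling overflow to infinity or to the largest finite value (depending
+ * on the rounding mode and sign) and underflow to a subnormal, in the style
+ * of SoftFloat's roundAndPackFloat64().
+ */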
+static nir_ssa_def *
+round_pack_fp64(nir_builder *b,
+                nir_ssa_def *z_si,
+                nir_ssa_def *z_exp,
+                nir_ssa_def *z_frac_0,
+                nir_ssa_def *z_frac_1,
+                nir_ssa_def *z_frac_2)
+{
+   nir_ssa_def *zero = nir_imm_int(b, 0);
+
+   /* Available rounding modes; only round-to-nearest-even is used for now */
+   nir_ssa_def *float_round_nearest_even = nir_imm_int(b, 0);
+   nir_ssa_def *float_round_to_zero = nir_imm_int(b, 1);
+   nir_ssa_def *float_round_down = nir_imm_int(b, 2);
+   nir_ssa_def *float_round_up = nir_imm_int(b, 3);
+   nir_ssa_def *rounding_mode = float_round_nearest_even;
+
+   nir_ssa_def *round_nearest_even = nir_ieq(b,
+                                             rounding_mode,
+                                             float_round_nearest_even);
+
+   /* With round-to-nearest-even, increment when the discarded bits are at
+    * least half a ULP; otherwise the increment depends on the rounding
+    * direction and the sign.
+    */
+   nir_ssa_def *increment =
+      nir_bcsel(b,
+                nir_ieq(b, round_nearest_even, zero),
+                nir_bcsel(b,
+                          nir_ieq(b, rounding_mode, float_round_to_zero),
+                          zero,
+                          nir_bcsel(b,
+                                    nir_ine(b, z_si, zero),
+                                    nir_iand(b,
+                                             nir_ieq(b, rounding_mode,
+                                                        float_round_down),
+                                             z_frac_2),
+                                    nir_iand(b,
+                                             nir_ieq(b, rounding_mode,
+                                                        float_round_up),
+                                             z_frac_2))),
+                nir_ilt(b, z_frac_2, zero));
+
+   /* Overflow: z_exp is above 0x7FD, or equals 0x7FD and the increment
+    * would carry out of the significand.
+    */
+   nir_ssa_def *cond_1 =
+      nir_ior(b,
+              nir_ilt(b, nir_imm_int(b, 0x7FD), z_exp),
+              nir_iand(b,
+                       nir_ieq(b, z_exp, nir_imm_int(b, 0x7FD)),
+                       nir_iand(b,
+                                nir_iand(b,
+                                         nir_ieq(b,
+                                                 z_frac_0,
+                                                 nir_imm_int(b, 0x001FFFFF)),
+                                         nir_ieq(b,
+                                                 z_frac_1,
+                                                 nir_imm_int(b, 0xFFFFFFFF))),
+                                increment)));
+
+   /* On overflow, these rounding modes clamp to the largest finite value
+    * instead of returning infinity.
+    */
+   nir_ssa_def *cond_2 =
+      nir_ior(b,
+              nir_ieq(b, rounding_mode, float_round_to_zero),
+              nir_ior(b,
+                      nir_iand(b,
+                               nir_ine(b, z_si, zero),
+                               nir_ieq(b, rounding_mode, float_round_up)),
+                      nir_iand(b,
+                               nir_ieq(b, z_si, zero),
+                               nir_ieq(b, rounding_mode, float_round_down))));
+
+   /* Underflow (z_exp < 0): denormalize by shifting right with jamming */
+   nir_ssa_def *z_frac_0_jam;
+   nir_ssa_def *z_frac_1_jam;
+   nir_ssa_def *z_frac_2_jam;
+   shift64_extra_right_jamming(b,
+                               z_frac_0, z_frac_1, z_frac_2,
+                               nir_ineg(b, z_exp),
+                               &z_frac_0_jam, &z_frac_1_jam, &z_frac_2_jam);
+
+   nir_ssa_def *increment_1 = nir_iand(b,
+                                       nir_ieq(b, rounding_mode,
+                                                  float_round_down),
+                                       z_frac_2_jam);
+   nir_ssa_def *increment_2 = nir_iand(b,
+                                       nir_ieq(b, rounding_mode,
+                                                  float_round_up),
+                                       z_frac_2_jam);
+   /* The unsigned compare also routes z_exp < 0 (underflow) into this
+    * branch.
+    */
+   return
+      nir_bcsel(b,
+                nir_uge(b, z_exp, nir_imm_int(b, 0x7FD)),
+                nir_bcsel(b,
+                          cond_1,
+                          nir_bcsel(b,
+                                    cond_2,
+                                    pack_fp64(b, z_si,
+                                                 nir_imm_int(b, 0x7FE),
+                                                 nir_imm_int(b, 0x000FFFFF),
+                                                 nir_imm_int(b, 0xFFFFFFFF)),
+                                    pack_fp64(b, z_si,
+                                                 nir_imm_int(b, 0x7FF),
+                                                 zero, zero)),
+                          nir_bcsel(b,
+                                    nir_ilt(b, z_exp, zero),
+                                    nir_bcsel(b,
+                                              nir_ine(b, round_nearest_even, zero),
+                                              round_pack(b,
+                                                         nir_ilt(b,
+                                                                 z_frac_2_jam,
+                                                                 zero),
+                                                         round_nearest_even,
+                                                         z_si,
+                                                         zero,
+                                                         z_frac_0_jam,
+                                                         z_frac_1_jam,
+                                                         z_frac_2_jam),
+                                              nir_bcsel(b,
+                                                        nir_ine(b, z_si, zero),
+                                                        round_pack(b,
+                                                                   increment_1,
+                                                                   round_nearest_even,
+                                                                   z_si,
+                                                                   zero,
+                                                                   z_frac_0_jam,
+                                                                   z_frac_1_jam,
+                                                                   z_frac_2_jam),
+                                                        round_pack(b,
+                                                                   increment_2,
+                                                                   round_nearest_even,
+                                                                   z_si,
+                                                                   zero,
+                                                                   z_frac_0_jam,
+                                                                   z_frac_1_jam,
+                                                                   z_frac_2_jam))),
+                                    round_pack(b, increment,
+                                                  round_nearest_even,
+                                                  z_si,
+                                                  z_exp,
+                                                  z_frac_0, z_frac_1, z_frac_2))),
+               round_pack(b, increment,
+                             round_nearest_even,
+                             z_si,
+                             z_exp,
+                             z_frac_0, z_frac_1, z_frac_2));
+}
+
 static nir_ssa_def *
 lower_rcp(nir_builder *b, nir_ssa_def *src)
 {
@@ -613,6 +1178,178 @@ lower_flt64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
                                            lt64(b, x_hi, x_lo, y_hi, y_lo))));
 }
 
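+/* Multiplies the two significands.  x's hidden bit is made explicit; y's
+ * significand is pre-shifted left by 12 to align the binary point, and y's
+ * hidden bit is accounted for by adding x's significand into the top 64
+ * bits of the 128-bit product.  If the product reaches 2.0, it is shifted
+ * right one bit (with jamming) and the exponent is incremented before
+ * rounding.
+ */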
+static nir_ssa_def *
+mul64(nir_builder *b,
+      nir_ssa_def *z_si,
+      nir_ssa_def *x_frac_hi, nir_ssa_def *x_frac_lo, nir_ssa_def *x_exp,
+      nir_ssa_def *y_frac_hi, nir_ssa_def *y_frac_lo, nir_ssa_def *y_exp)
+{
+   nir_ssa_def *z_frac_0;
+   nir_ssa_def *z_frac_1;
+   nir_ssa_def *z_frac_2;
+   nir_ssa_def *z_frac_3;
+
+   nir_ssa_def *z_frac_0_shift;
+   nir_ssa_def *z_frac_1_shift;
+   nir_ssa_def *z_frac_2_shift;
+
+   nir_ssa_def *z_exp = nir_isub(b,
+                                 nir_iadd(b, x_exp, y_exp),
+                                 nir_imm_int(b, 0x400));
+   x_frac_hi = nir_ior(b,
+                       x_frac_hi,
+                       nir_imm_int(b, 0x00100000));
+   short_shl64(b,
+               y_frac_hi, y_frac_lo,
+               nir_imm_int(b, 12),
+               &y_frac_hi, &y_frac_lo);
+   mul64_to_128(b, x_frac_hi, x_frac_lo,
+                   y_frac_hi, y_frac_lo,
+                   &z_frac_0, &z_frac_1, &z_frac_2, &z_frac_3);
+   add64(b, z_frac_0, z_frac_1,
+            x_frac_hi, x_frac_lo,
+            &z_frac_0, &z_frac_1);
+
+   z_frac_2 = nir_ior(b, z_frac_2, nir_ine(b, z_frac_3, nir_imm_int(b, 0)));
+
+   shift64_extra_right_jamming(b, z_frac_0, z_frac_1, z_frac_2,
+                                  nir_imm_int(b, 1),
+                                  &z_frac_0_shift,
+                                  &z_frac_1_shift,
+                                  &z_frac_2_shift);
+
+   return
+      nir_bcsel(b,
+                nir_uge(b, z_frac_0, nir_imm_int(b, 0x00200000)),
+                round_pack_fp64(b, z_si,
+                                   nir_iadd(b, z_exp, nir_imm_int(b, 1)),
+                                   z_frac_0_shift,
+                                   z_frac_1_shift,
+                                   z_frac_2_shift),
+                round_pack_fp64(b, z_si, z_exp, z_frac_0, z_frac_1, z_frac_2));
+}
+
+static nir_ssa_def *
+lower_fmul64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
+{
+   nir_ssa_def *x_si = get_sign(b, x);
+   nir_ssa_def *x_exp = get_exponent(b, x);
+   nir_ssa_def *x_frac_lo = get_frac_lo(b, x);
+   nir_ssa_def *x_frac_hi = get_frac_hi(b, x);
+   nir_ssa_def *y_si = get_sign(b, y);
+   nir_ssa_def *y_exp = get_exponent(b, y);
+   nir_ssa_def *y_frac_lo = get_frac_lo(b, y);
+   nir_ssa_def *y_frac_hi = get_frac_hi(b, y);
+
+   nir_ssa_def *z_si = nir_ixor(b, x_si, y_si);
+   nir_ssa_def *x_frac = nir_ior(b, x_frac_hi, x_frac_lo);
+   nir_ssa_def *x_exp_frac = nir_ior(b, x_exp, x_frac);
+   nir_ssa_def *y_frac = nir_ior(b, y_frac_hi, y_frac_lo);
+   nir_ssa_def *y_exp_frac = nir_ior(b, y_exp, y_frac);
+   nir_ssa_def *y_nan = nir_iand(b, nir_ieq(b, y_exp, nir_imm_int(b, 0x7FF)),
+                                    nir_ine(b, y_frac, nir_imm_int(b, 0)));
+
+   nir_ssa_def *zero = nir_imm_int(b, 0);
+
+   /* Result of NaN, Inf and subnormal multiply */
+   nir_ssa_def *propagate_nan = propagate_fp64_nan(b, x, y);
+
+   nir_ssa_def *pack_inf_fp64 = pack_fp64(b,
+                                          z_si,
+                                          nir_imm_int(b, 0x7FF),
+                                          zero,
+                                          zero);
+
+   nir_ssa_def *pack_zero_fp64 = pack_fp64(b,
+                                           z_si,
+                                           zero,
+                                           zero,
+                                           zero);
+
+   nir_ssa_def *default_nan =
+      nir_pack_64_2x32_split(b,
+                             nir_imm_int(b, 0xFFFFFFFF),
+                             nir_imm_int(b, 0xFFFFFFFF));
+
+   nir_ssa_def *x_exp_sub = x_exp;
+   nir_ssa_def *x_frac_hi_sub = x_frac_hi;
+   nir_ssa_def *x_frac_lo_sub = x_frac_lo;
+   normalize_fp64_subnormal(b,
+                            x_frac_hi, x_frac_lo,
+                            &x_exp_sub,
+                            &x_frac_hi_sub, &x_frac_lo_sub);
+   nir_ssa_def *normalize_x = mul64(b, z_si,
+                                       x_frac_hi_sub, x_frac_lo_sub, x_exp_sub,
+                                       y_frac_hi, y_frac_lo, y_exp);
+
+   nir_ssa_def *y_exp_sub = y_exp;
+   nir_ssa_def *y_frac_hi_sub = y_frac_hi;
+   nir_ssa_def *y_frac_lo_sub = y_frac_lo;
+   normalize_fp64_subnormal(b,
+                            y_frac_hi, y_frac_lo,
+                            &y_exp_sub,
+                            &y_frac_hi_sub, &y_frac_lo_sub);
+   nir_ssa_def *normalize_y = mul64(b, z_si,
+                                       x_frac_hi, x_frac_lo, x_exp,
+                                       y_frac_hi_sub, y_frac_lo_sub, y_exp_sub);
+
+   /*
+    * Handle the exceptional cases before computing the multiplication.
+    *
+    * If x * Inf or Inf * y, return Inf.
+    * If Inf * Inf, return Inf.
+    * If Inf * 0, return the default NaN (0xFFFFFFFFFFFFFFFF).
+    *
+    * If x * NaN or NaN * y, propagate the NaN.
+    * If NaN * NaN, select the correct NaN to propagate.
+    *
+    * If x or y is equal to 0, return 0.
+    *
+    * If x or y is subnormal (exponent == 0 and significand != 0),
+    * normalize it first and then do the multiplication.
+    */
+
+   return
+      nir_bcsel(b,
+                nir_ieq(b, x_exp, nir_imm_int(b, 0x7FF)),
+                nir_bcsel(b,
+                          nir_ior(b, nir_ine(b, x_frac, zero), y_nan),
+                          propagate_nan,
+                          nir_bcsel(b,
+                                    nir_ieq(b, y_exp_frac, zero),
+                                    default_nan,
+                                    pack_inf_fp64)),
+                nir_bcsel(b,
+                          nir_ieq(b, y_exp, nir_imm_int(b, 0x7FF)),
+                          nir_bcsel(b,
+                                    nir_ine(b, y_frac, zero),
+                                    propagate_nan,
+                                    nir_bcsel(b,
+                                              nir_ieq(b, x_exp_frac, zero),
+                                              default_nan,
+                                              pack_inf_fp64)),
+                          nir_bcsel(b,
+                                    nir_ieq(b, x_exp, zero),
+                                    nir_bcsel(b,
+                                              nir_ieq(b, x_frac, zero),
+                                              pack_zero_fp64,
+                                              normalize_x),
+                                    nir_bcsel(b,
+                                              nir_ieq(b, y_exp, zero),
+                                              nir_bcsel(b,
+                                                        nir_ieq(b, y_frac,
+                                                                   zero),
+                                                        pack_zero_fp64,
+                                                        normalize_y),
+                                              mul64(b,
+                                                    z_si,
+                                                    x_frac_hi, x_frac_lo,
+                                                    x_exp,
+                                                    y_frac_hi, y_frac_lo,
+                                                    y_exp)))));
+}
+
 static bool
 lower_doubles_instr(nir_alu_instr *instr, nir_lower_doubles_options options)
 {
@@ -691,6 +1428,11 @@ lower_doubles_instr(nir_alu_instr *instr, nir_lower_doubles_options options)
          return false;
       break;
 
+   case nir_op_fmul:
+      if (!(options & nir_lower_dmul))
+         return false;
+      break;
+
    default:
       return false;
    }
@@ -763,6 +1505,13 @@ lower_doubles_instr(nir_alu_instr *instr, nir_lower_doubles_options options)
    }
       break;
 
+   case nir_op_fmul: {
+      nir_ssa_def *src1 = nir_fmov_alu(&bld, instr->src[1],
+                                      instr->dest.dest.ssa.num_components);
+      result = lower_fmul64(&bld, src, src1);
+   }
+      break;
+
    default:
       unreachable("unhandled opcode");
    }
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index 374230a89b..9dc745d327 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -514,7 +514,8 @@ nir_optimize(nir_shader *nir, const struct brw_compiler *compiler,
                              nir_lower_dneg |
                              nir_lower_dsign |
                              nir_lower_deq |
-                             nir_lower_dlt);
+                             nir_lower_dlt |
+                             nir_lower_dmul);
       OPT(nir_lower_64bit_pack);
    } while (progress);
 
-- 
2.11.0


