[Mesa-dev] [RFC 06/10] nir/lower_double_ops: lower mul()
Elie Tournier
tournier.elie at gmail.com
Wed Apr 12 22:43:15 UTC 2017
Signed-off-by: Elie Tournier <elie.tournier at collabora.com>
---
src/compiler/nir/nir.h | 3 +-
src/compiler/nir/nir_lower_double_ops.c | 749 ++++++++++++++++++++++++++++++++
src/intel/compiler/brw_nir.c | 3 +-
3 files changed, 753 insertions(+), 2 deletions(-)
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 89d5dd8e1b..58045e3d42 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -2573,7 +2573,8 @@ typedef enum {
nir_lower_dneg = (1 << 10),
nir_lower_dsign = (1 << 11),
nir_lower_deq = (1 << 12),
- nir_lower_dlt = (1 << 13)
+ nir_lower_dlt = (1 << 13),
+ nir_lower_dmul = (1 << 14)
} nir_lower_doubles_options;
bool nir_lower_doubles(nir_shader *shader, nir_lower_doubles_options options);
diff --git a/src/compiler/nir/nir_lower_double_ops.c b/src/compiler/nir/nir_lower_double_ops.c
index 38743206a8..807fa18fc1 100644
--- a/src/compiler/nir/nir_lower_double_ops.c
+++ b/src/compiler/nir/nir_lower_double_ops.c
@@ -36,6 +36,20 @@
* - 32-bit integer and floating point arithmetic
*/
+/* Creates a double with the sign bit set to a given integer value */
+static nir_ssa_def *
+set_sign(nir_builder *b, nir_ssa_def *src, nir_ssa_def *sign)
+{
+ /* Split into bits 0-31 and 32-63 */
+ nir_ssa_def *lo = nir_unpack_64_2x32_split_x(b, src);
+ nir_ssa_def *hi = nir_unpack_64_2x32_split_y(b, src);
+
+ /* The sign is bit 63, i.e. bit 31 of the high word */
+ nir_ssa_def *new_hi = nir_bfi(b, nir_imm_int(b, 0x80000000), sign, hi);
+ /* recombine */
+ return nir_pack_64_2x32_split(b, lo, new_hi);
+}
+
static nir_ssa_def *
get_sign(nir_builder *b, nir_ssa_def *src)
{
@@ -73,6 +87,57 @@ get_exponent(nir_builder *b, nir_ssa_def *src)
}
static nir_ssa_def *
+set_frac_hi(nir_builder *b, nir_ssa_def *src, nir_ssa_def *frac_hi)
+{
+ /* Split into bits 0-31 and 32-63 */
+ nir_ssa_def *lo = nir_unpack_64_2x32_split_x(b, src);
+ nir_ssa_def *hi = nir_unpack_64_2x32_split_y(b, src);
+
+ /* The frac_hi is bits 32-51, i.e. bits 0-19 of the high word */
+ nir_ssa_def *new_hi = nir_bfi(b, nir_imm_int(b, 0x000FFFFF), frac_hi, hi);
+ /* recombine */
+ return nir_pack_64_2x32_split(b, lo, new_hi);
+}
+
+static nir_ssa_def *
+get_frac_hi(nir_builder *b, nir_ssa_def *src)
+{
+ /* get bits 32-63 */
+ nir_ssa_def *hi = nir_unpack_64_2x32_split_y(b, src);
+
+ /* extract bits 0-19 of the high word */
+ return nir_ubitfield_extract(b, hi, nir_imm_int(b, 0), nir_imm_int(b, 20));
+}
+
+static nir_ssa_def *
+set_frac_lo(nir_builder *b, nir_ssa_def *src, nir_ssa_def *frac_lo)
+{
+ nir_ssa_def *hi = nir_unpack_64_2x32_split_y(b, src);
+ /* recombine */
+ return nir_pack_64_2x32_split(b, frac_lo, hi);
+}
+
+static nir_ssa_def *
+get_frac_lo(nir_builder *b, nir_ssa_def *src)
+{
+ /* get bits 0-31 */
+ return nir_unpack_64_2x32_split_x(b, src);
+}
+
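+/* Packs a sign bit, an 11-bit biased exponent and the 52-bit fraction
+ * (split into its upper 20 and lower 32 bits) into an IEEE-754 binary64
+ * value.  For example, sign = 0, exp = 0x3FF, frac_hi = frac_lo = 0
+ * packs to 0x3FF0000000000000, i.e. 1.0.
+ */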
+static nir_ssa_def *
+pack_fp64(nir_builder *b, nir_ssa_def *z_si,
+ nir_ssa_def *z_exp,
+ nir_ssa_def *z_frac_hi, nir_ssa_def *z_frac_lo)
+{
+ nir_ssa_def *z = nir_imm_double(b, 0.0);
+ z = set_sign(b, z, z_si);
+ z = set_exponent(b, z, z_exp);
+ z = set_frac_hi(b, z, z_frac_hi);
+ z = set_frac_lo(b, z, z_frac_lo);
+ return z;
+}
+
+static nir_ssa_def *
is_nan(nir_builder *b, nir_ssa_def *src)
{
nir_ssa_def *src_lo = nir_unpack_64_2x32_split_x(b, src);
@@ -90,6 +155,247 @@ is_nan(nir_builder *b, nir_ssa_def *src)
nir_imm_int(b, 0x000FFFFF))));
}
+static nir_ssa_def *
+is_signaling_nan(nir_builder *b, nir_ssa_def *src)
+{
+ nir_ssa_def *src_lo = nir_unpack_64_2x32_split_x(b, src);
+ nir_ssa_def *src_hi = nir_unpack_64_2x32_split_y(b, src);
+
+ /* return (((src_hi>>19) & 0xFFF) == 0xFFE ) &&
+ * (src_lo || (src_hi & 0x0007FFFF));
+ */
+ return nir_iand(b,
+ nir_ieq(b,
+ nir_iand(b,
+ nir_ishr(b, src_hi, nir_imm_int(b, 19)),
+ nir_imm_int(b, 0xFFF)),
+ nir_imm_int(b, 0xFFE)),
+ nir_ior(b, src_lo, nir_iand(b,
+ src_hi,
+ nir_imm_int(b, 0x0007FFFF))));
+}
+
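+/* Selects which NaN to return when at least one of `x' and `y' is NaN.
+ * Both operands are first quieted by setting the most-significant fraction
+ * bit (0x00080000 in the high word).  If `x' is NaN it is returned, unless
+ * it is a signaling NaN and `y' is also NaN, in which case `y' wins;
+ * otherwise `y' is returned.
+ */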
+static nir_ssa_def *
+propagate_fp64_nan(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
+{
+ nir_ssa_def *x_is_nan = is_nan(b, x);
+ nir_ssa_def *x_is_signaling_nan = is_signaling_nan(b, x);
+ nir_ssa_def *y_is_nan = is_nan(b, y);
+
+ nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
+ nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
+ nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
+ nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);
+
+ x_hi = nir_ior(b, x_hi, nir_imm_int(b, 0x00080000));
+ y_hi = nir_ior(b, y_hi, nir_imm_int(b, 0x00080000));
+ x = nir_pack_64_2x32_split(b, x_lo, x_hi);
+ y = nir_pack_64_2x32_split(b, y_lo, y_hi);
+
+ return nir_bcsel(b,
+ x_is_nan,
+ nir_bcsel(b,
+ nir_iand(b, x_is_signaling_nan, y_is_nan),
+ y,
+ x),
+ y);
+}
+
+/* Shifts the 64-bit value formed by concatenating `src_hi' and `src_lo' left
+ * by the number of bits given in `count'. Any bits shifted off are lost.
+ * The value of `count' must be less than 32. The result is broken into two
+ * 32-bit pieces which are stored at the locations pointed to by
+ * `z0Ptr' and `z1Ptr'.
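+ * For example, with `src_hi' = 0x00000001, `src_lo' = 0x80000000 and
+ * `count' = 4, the results are z0 = 0x00000018 and z1 = 0x00000000.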
+ */
+static void
+short_shl64(nir_builder *b, nir_ssa_def *src_hi, nir_ssa_def *src_lo,
+ nir_ssa_def *count,
+ nir_ssa_def **z0Ptr, nir_ssa_def **z1Ptr)
+{
+ /* z1Ptr = src_lo << count */
+ *z1Ptr = nir_ishl(b, src_lo, count);
+
+ /* z0Ptr = (count == 0) ? src_hi
+ * : (src_hi << count) | (src_lo >> ((-count) & 31))
+ */
+ *z0Ptr = nir_bcsel(b,
+ nir_ieq(b, count, nir_imm_int(b, 0)),
+ src_hi,
+ nir_ior(b,
+ nir_ishl(b, src_hi, count),
+ nir_ushr(b, src_lo,
+ nir_iand(b,
+ nir_ineg(b, count),
+ nir_imm_int(b, 31)))));
+}
+
+/* Shifts the 96-bit value formed by concatenating `src_0', `src_1', and `src_2'
+ * right by 32 _plus_ the number of bits given in `count'. The shifted result
+ * is at most 64 nonzero bits; these are broken into two 32-bit pieces which are
+ * stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
+ * off form a third 32-bit result as follows: The _last_ bit shifted off is
+ * the most-significant bit of the extra result, and the other 31 bits of the
+ * extra result are all zero if and only if _all_but_the_last_ bits shifted off
+ * were all zero. This extra result is stored in the location pointed to by
+ * `z2Ptr'. The value of `count' can be arbitrarily large.
+ * (This routine makes more sense if `src_0', `src_1', and `src_2' are considered
+ * to form a fixed-point value with binary point between `src_1' and `src_2'.
+ * This fixed-point value is shifted right by the number of bits given in `count',
+ * and the integer part of the result is returned at the locations pointed to
+ * by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
+ * corrupted as described above, and is returned at the location pointed to by
+ * `z2Ptr'.)
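+ * For example, with (`src_0', `src_1', `src_2') = (0, 0x00000003, 0) and
+ * `count' = 1, the results are z0 = 0, z1 = 0x00000001 and
+ * z2 = 0x80000000: the single 1 bit shifted off becomes the
+ * most-significant bit of the extra result.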
+ */
+static void
+shift64_extra_right_jamming(nir_builder *b,
+ nir_ssa_def *src_0,
+ nir_ssa_def *src_1,
+ nir_ssa_def *src_2,
+ nir_ssa_def *count,
+ nir_ssa_def **z0Ptr,
+ nir_ssa_def **z1Ptr,
+ nir_ssa_def **z2Ptr)
+{
+ nir_ssa_def *int_0 = nir_imm_int(b, 0);
+ nir_ssa_def *int_31 = nir_imm_int(b, 31);
+ nir_ssa_def *int_32 = nir_imm_int(b, 32);
+ nir_ssa_def *int_64 = nir_imm_int(b, 64);
+
+ nir_ssa_def *neg_count = nir_iand(b,
+ nir_ineg(b, count),
+ int_31);
+
+ nir_ssa_def *is_count_0 = nir_ieq(b, count, int_0);
+ nir_ssa_def *is_count_lt32 = nir_ilt(b, count, int_32);
+ nir_ssa_def *is_count_32 = nir_ieq(b, count, int_32);
+ nir_ssa_def *is_count_lt64 = nir_ilt(b, count, int_64);
+
+ *z0Ptr = nir_bcsel(b,
+ is_count_0,
+ src_0,
+ nir_bcsel(b,
+ is_count_lt32,
+ nir_ushr(b, src_0, count),
+ int_0));
+
+ *z1Ptr =
+ nir_bcsel(b,
+ is_count_0,
+ src_1,
+ nir_bcsel(b,
+ is_count_lt32,
+ nir_ior(b,
+ nir_ishl(b, src_0, neg_count),
+ nir_ushr(b, src_1, count)),
+ nir_bcsel(b,
+ is_count_32,
+ src_0,
+ nir_bcsel(b,
+ is_count_lt64,
+ nir_ushr(b,
+ src_0,
+ nir_iand(b, count,
+ int_31)),
+ int_0))));
+
+ *z2Ptr =
+ nir_bcsel(b,
+ is_count_0,
+ src_2,
+ nir_bcsel(b,
+ is_count_lt32,
+ nir_ior(b, nir_ishl(b, src_1, neg_count),
+ nir_ine(b, src_2, int_0)),
+ nir_bcsel(b,
+ is_count_32,
+ nir_ior(b, src_1,
+ nir_ine(b, src_2, int_0)),
+ nir_ior(b,
+ nir_bcsel(b,
+ is_count_lt64,
+ nir_ishl(b, src_0,
+ neg_count),
+ nir_bcsel(b,
+ nir_ieq(b,
+ count,
+ int_64),
+ src_0,
+ nir_ine(b,
+ src_0,
+ int_0))),
+ nir_ine(b,
+ nir_ior(b, src_2, src_1),
+ int_0)))));
+}
+
+/* Returns the number of leading 0 bits before the most-significant 1 bit of
+ * `src'. If `src' is zero, 32 is returned.
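+ * (nir_ufind_msb returns -1 for a zero source, so the subtraction below
+ * yields 31 - (-1) = 32 in that case.)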
+ */
+static nir_ssa_def *
+count_leading_zeros(nir_builder *b, nir_ssa_def *src)
+{
+ return nir_isub(b, nir_imm_int(b, 31), nir_ufind_msb(b, src));
+}
+
+/* Normalizes the subnormal double-precision floating-point value represented
+ * by the denormalized significand formed by the concatenation of `frac_hi' and
+ * `frac_lo'. The normalized exponent is stored at the location pointed to by
+ * `zExpPtr'. The most significant 21 bits of the normalized significand are
+ * stored at the location pointed to by `zFrac0Ptr', and the least significant
+ * 32 bits of the normalized significand are stored at the location pointed to
+ * by `zFrac1Ptr'.
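+ * For example, the smallest subnormal significand (frac_hi = 0, frac_lo = 1)
+ * yields zFrac0 = 0x00100000, zFrac1 = 0 and zExp = -51.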
+ */
+static void
+normalize_fp64_subnormal(nir_builder *b,
+ nir_ssa_def *frac_hi,
+ nir_ssa_def *frac_lo,
+ nir_ssa_def **zExpPtr,
+ nir_ssa_def **zFrac0Ptr,
+ nir_ssa_def **zFrac1Ptr)
+{
+ nir_ssa_def *shift_count_hi = nir_isub(b,
+ count_leading_zeros(b, frac_hi),
+ nir_imm_int(b, 11));
+ nir_ssa_def *shift_count_lo = nir_isub(b,
+ count_leading_zeros(b, frac_lo),
+ nir_imm_int(b, 11));
+
+ short_shl64(b, frac_hi, frac_lo, shift_count_hi, zFrac0Ptr, zFrac1Ptr);
+
+ nir_ssa_def *is_frac_hi_zero = nir_ieq(b, frac_hi, nir_imm_int(b, 0));
+ nir_ssa_def *is_shift_count_lo_neg = nir_ilt(b,
+ shift_count_lo,
+ nir_imm_int(b, 0));
+
+ *zFrac0Ptr = nir_bcsel(b,
+ is_frac_hi_zero,
+ nir_bcsel(b,
+ is_shift_count_lo_neg,
+ nir_ishr(b, frac_lo,
+ nir_ineg(b, shift_count_lo)),
+ nir_ishl(b, frac_lo, shift_count_lo)),
+ *zFrac0Ptr);
+
+ *zFrac1Ptr = nir_bcsel(b,
+ is_frac_hi_zero,
+ nir_bcsel(b,
+ is_shift_count_lo_neg,
+ nir_ishl(b, frac_lo,
+ nir_iand(b,
+ shift_count_lo,
+ nir_imm_int(b, 31))),
+ nir_imm_int(b, 0)),
+ *zFrac1Ptr);
+
+ *zExpPtr = nir_bcsel(b,
+ is_frac_hi_zero,
+ nir_isub(b,
+ nir_ineg(b, shift_count_lo),
+ nir_imm_int(b, 31)),
+ nir_isub(b, nir_imm_int(b, 1), shift_count_hi));
+}
+
+
/* Return infinity with the sign of the given source which is +/-0 */
static nir_ssa_def *
@@ -147,6 +453,265 @@ lt64(nir_builder *b, nir_ssa_def *x_hi, nir_ssa_def *x_lo,
return nir_ior(b, lt_hi, nir_iand(b, eq_hi, lt_lo));
}
+/* Adds the 64-bit value formed by concatenating `x_hi' and `x_lo' to the 64-bit
+ * value formed by concatenating `y_hi' and `y_lo'. Addition is modulo 2^64, so
+ * any carry out is lost. The result is broken into two 32-bit pieces which
+ * are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
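+ * For example, adding 0x00000000FFFFFFFF and 0x0000000000000001 produces
+ * z0 = 0x00000001 and z1 = 0x00000000: the carry out of the low words
+ * propagates into the high words.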
+ */
+static void
+add64(nir_builder *b,
+ nir_ssa_def *x_hi, nir_ssa_def *x_lo,
+ nir_ssa_def *y_hi, nir_ssa_def *y_lo,
+ nir_ssa_def **z0Ptr, nir_ssa_def **z1Ptr)
+{
+ nir_ssa_def *z = nir_iadd(b, x_lo, y_lo);
+ *z1Ptr = z;
+ *z0Ptr = nir_iadd(b, x_hi,
+ nir_iadd(b, y_hi, nir_b2i(b, nir_ult(b, z, x_lo))));
+}
+
+/* Multiplies `x' by `y' to obtain a 64-bit product. The product is broken
+ * into two 32-bit pieces which are stored at the locations pointed to by
+ * `z0Ptr' and `z1Ptr'.
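+ * For example, multiplying 0x00010000 by 0x00010000 (2^16 * 2^16) produces
+ * z0 = 0x00000001 and z1 = 0x00000000, i.e. 2^32.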
+ */
+static void
+mul32_to_64(nir_builder *b,
+ nir_ssa_def *x, nir_ssa_def *y,
+ nir_ssa_def **z0Ptr, nir_ssa_def **z1Ptr)
+{
+ /* Split each operand into 16-bit halves and form the four partial
+ * products with 32-bit integer multiplies.
+ */
+ nir_ssa_def *x_lo = nir_iand(b, x, nir_imm_int(b, 0xFFFF));
+ nir_ssa_def *x_hi = nir_ushr(b, x, nir_imm_int(b, 16));
+ nir_ssa_def *y_lo = nir_iand(b, y, nir_imm_int(b, 0xFFFF));
+ nir_ssa_def *y_hi = nir_ushr(b, y, nir_imm_int(b, 16));
+ nir_ssa_def *z1 = nir_imul(b, x_lo, y_lo);
+ nir_ssa_def *z_mid_x = nir_imul(b, x_lo, y_hi);
+ nir_ssa_def *z_mid_y = nir_imul(b, x_hi, y_lo);
+ nir_ssa_def *z0 = nir_imul(b, x_hi, y_hi);
+ z_mid_x = nir_iadd(b, z_mid_x, z_mid_y);
+ /* A carry out of the middle sum is worth 2^48, i.e. bit 16 of z0 */
+ z0 = nir_iadd(b, z0,
+ nir_iadd(b,
+ nir_ishl(b,
+ nir_b2i(b, nir_ult(b, z_mid_x, z_mid_y)),
+ nir_imm_int(b, 16)),
+ nir_ushr(b, z_mid_x, nir_imm_int(b, 16))));
+ z_mid_x = nir_ishl(b, z_mid_x, nir_imm_int(b, 16));
+ z1 = nir_iadd(b, z1, z_mid_x);
+ z0 = nir_iadd(b, z0, nir_b2i(b, nir_ult(b, z1, z_mid_x)));
+ *z1Ptr = z1;
+ *z0Ptr = z0;
+}
+
+/* Multiplies the 64-bit value formed by concatenating `x_hi' and `x_lo' to the
+ * 64-bit value formed by concatenating `y_hi' and `y_lo' to obtain a 128-bit
+ * product. The product is broken into four 32-bit pieces which are stored at
+ * the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
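+ * For example, squaring 2^32 (x_hi = y_hi = 0x00000001, x_lo = y_lo = 0)
+ * produces (z0, z1, z2, z3) = (0, 1, 0, 0), i.e. 2^64.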
+ */
+static void
+mul64_to_128(nir_builder *b,
+ nir_ssa_def *x_hi, nir_ssa_def *x_lo,
+ nir_ssa_def *y_hi, nir_ssa_def *y_lo,
+ nir_ssa_def **z0Ptr, nir_ssa_def **z1Ptr,
+ nir_ssa_def **z2Ptr, nir_ssa_def **z3Ptr)
+{
+ nir_ssa_def *z0;
+ nir_ssa_def *z1;
+ nir_ssa_def *z2;
+ nir_ssa_def *z3;
+ nir_ssa_def *more1;
+ nir_ssa_def *more2;
+
+ mul32_to_64(b, x_lo, y_lo, &z2, &z3);
+ mul32_to_64(b, x_lo, y_hi, &z1, &more2);
+ add64(b, z1, more2, nir_imm_int(b, 0), z2, &z1, &z2);
+ mul32_to_64(b, x_hi, y_hi, &z0, &more1);
+ add64(b, z0, more1, nir_imm_int(b, 0), z1, &z0, &z1);
+ mul32_to_64(b, x_hi, y_lo, &more1, &more2);
+ add64(b, more1, more2, nir_imm_int(b, 0), z2, &more1, &z2);
+ add64(b, z0, z1, nir_imm_int(b, 0), more1, &z0, &z1);
+
+ *z3Ptr = z3;
+ *z2Ptr = z2;
+ *z1Ptr = z1;
+ *z0Ptr = z0;
+}
+
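+/* Final rounding step: when `increment' is set, add one ulp to the 52-bit
+ * significand, clearing its low bit again on an exact tie under
+ * round-to-nearest-even; otherwise force the exponent to zero whenever the
+ * significand is entirely zero.  The result is then packed with pack_fp64().
+ */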
+static nir_ssa_def *
+round_pack(nir_builder *b,
+ nir_ssa_def *increment,
+ nir_ssa_def *round_nearest_even,
+ nir_ssa_def *z_si,
+ nir_ssa_def *z_exp,
+ nir_ssa_def *z_frac_0,
+ nir_ssa_def *z_frac_1,
+ nir_ssa_def *z_frac_2)
+{
+ nir_ssa_def *zero = nir_imm_int(b, 0);
+ nir_ssa_def *one = nir_imm_int(b, 1);
+
+ nir_ssa_def *z_frac_0_incr;
+ nir_ssa_def *z_frac_1_incr;
+ add64(b, z_frac_0, z_frac_1, zero, one, &z_frac_0_incr, &z_frac_1_incr);
+ /* Clear the low bit on an exact tie so we round to even */
+ z_frac_1_incr = nir_iand(b,
+ z_frac_1_incr,
+ nir_inot(b, nir_b2i(b, nir_iand(b,
+ nir_ieq(b,
+ nir_iadd(b, z_frac_2, z_frac_2),
+ zero),
+ round_nearest_even))));
+
+ return nir_bcsel(b,
+ nir_ine(b, increment, zero),
+ pack_fp64(b, z_si,
+ z_exp,
+ z_frac_0_incr, z_frac_1_incr),
+ nir_bcsel(b,
+ nir_ieq(b, nir_ior(b, z_frac_0, z_frac_1), zero),
+ pack_fp64(b, z_si, zero, z_frac_0, z_frac_1),
+ pack_fp64(b, z_si, z_exp, z_frac_0, z_frac_1)));
+}
+
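+/* Rounds the extended significand `z_frac_0':`z_frac_1' (with extra/sticky
+ * word `z_frac_2') and packs the result.  The rounding mode is hard-coded
+ * to round-to-nearest-even below.  Results that overflow become infinity
+ * (or the largest finite value, depending on sign and rounding mode), and
+ * negative exponents are handled by right-jamming the significand into the
+ * subnormal range before the final pack.
+ */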
+static nir_ssa_def *
+round_pack_fp64(nir_builder *b,
+ nir_ssa_def *z_si,
+ nir_ssa_def *z_exp,
+ nir_ssa_def *z_frac_0,
+ nir_ssa_def *z_frac_1,
+ nir_ssa_def *z_frac_2)
+{
+ nir_ssa_def *zero = nir_imm_int(b, 0);
+
+ /* Rounding mode available */
+ nir_ssa_def *float_round_nearest_even = nir_imm_int(b, 0);
+ nir_ssa_def *float_round_to_zero = nir_imm_int(b, 1);
+ nir_ssa_def *float_round_down = nir_imm_int(b, 2);
+ nir_ssa_def *float_round_up = nir_imm_int(b, 3);
+ nir_ssa_def *rounding_mode = float_round_nearest_even;
+
+ nir_ssa_def *round_nearest_even = nir_ieq(b,
+ rounding_mode,
+ float_round_nearest_even);
+
+ nir_ssa_def *increment =
+ nir_bcsel(b,
+ nir_ine(b, round_nearest_even, zero),
+ nir_ilt(b, z_frac_2, zero),
+ nir_bcsel(b,
+ nir_ieq(b, rounding_mode, float_round_to_zero),
+ zero,
+ nir_bcsel(b,
+ nir_ine(b, z_si, zero),
+ nir_iand(b,
+ nir_ieq(b, rounding_mode,
+ float_round_down),
+ z_frac_2),
+ nir_iand(b,
+ nir_ieq(b, rounding_mode,
+ float_round_up),
+ z_frac_2))));
+
+ /* Condition for the bcsel */
+ nir_ssa_def *cond_1 =
+ nir_ior(b,
+ nir_ilt(b, nir_imm_int(b, 0x7FD), z_exp),
+ nir_iand(b,
+ nir_ieq(b, z_exp, nir_imm_int(b, 0x7FD)),
+ nir_iand(b,
+ nir_iand(b,
+ nir_ieq(b,
+ z_frac_0,
+ nir_imm_int(b, 0x001FFFFF)),
+ nir_ieq(b,
+ z_frac_1,
+ nir_imm_int(b, 0xFFFFFFFF))),
+ increment)));
+
+ nir_ssa_def *cond_2 =
+ nir_ior(b,
+ nir_ieq(b, rounding_mode, float_round_to_zero),
+ nir_ior(b,
+ nir_iand(b,
+ z_si,
+ nir_ieq(b, rounding_mode, float_round_up)),
+ nir_iand(b,
+ nir_ine(b, z_si, zero),
+ nir_ieq(b, rounding_mode, float_round_down))));
+
+ /* Right jamming if (z_exp < 0) */
+ nir_ssa_def *z_frac_0_jam;
+ nir_ssa_def *z_frac_1_jam;
+ nir_ssa_def *z_frac_2_jam;
+ shift64_extra_right_jamming(b,
+ z_frac_0, z_frac_1, z_frac_2,
+ nir_ineg(b, z_exp),
+ &z_frac_0_jam, &z_frac_1_jam, &z_frac_2_jam);
+
+ nir_ssa_def *increment_1 = nir_iand(b,
+ nir_ieq(b, rounding_mode,
+ float_round_down),
+ z_frac_2_jam);
+ nir_ssa_def *increment_2 = nir_iand(b,
+ nir_ieq(b, rounding_mode,
+ float_round_up),
+ z_frac_2_jam);
+ return
+ nir_bcsel(b,
+ nir_uge(b, nir_iand(b, z_exp, nir_imm_int(b, 0xFFFF)),
+ nir_imm_int(b, 0x7FD)),
+ nir_bcsel(b,
+ cond_1,
+ nir_bcsel(b,
+ cond_2,
+ pack_fp64(b, z_si,
+ nir_imm_int(b, 0x7FE),
+ nir_imm_int(b, 0x000FFFFF),
+ nir_imm_int(b, 0xFFFFFFFF)),
+ pack_fp64(b, z_si,
+ nir_imm_int(b, 0x7FF),
+ zero, zero)),
+ nir_bcsel(b,
+ nir_ilt(b, z_exp, zero),
+ nir_bcsel(b,
+ nir_ine(b, round_nearest_even, zero),
+ round_pack(b,
+ nir_ilt(b,
+ z_frac_2_jam,
+ zero),
+ round_nearest_even,
+ z_si,
+ zero,
+ z_frac_0_jam,
+ z_frac_1_jam,
+ z_frac_2_jam),
+ nir_bcsel(b,
+ nir_ine(b, z_si, zero),
+ round_pack(b,
+ increment_1,
+ round_nearest_even,
+ z_si,
+ zero,
+ z_frac_0_jam,
+ z_frac_1_jam,
+ z_frac_2_jam),
+ round_pack(b,
+ increment_2,
+ round_nearest_even,
+ z_si,
+ zero,
+ z_frac_0_jam,
+ z_frac_1_jam,
+ z_frac_2_jam))),
+ round_pack(b, increment,
+ round_nearest_even,
+ z_si,
+ z_exp,
+ z_frac_0, z_frac_1, z_frac_2))),
+ round_pack(b, increment,
+ round_nearest_even,
+ z_si,
+ z_exp,
+ z_frac_0, z_frac_1, z_frac_2));
+}
+
static nir_ssa_def *
lower_rcp(nir_builder *b, nir_ssa_def *src)
{
@@ -613,6 +1178,178 @@ lower_flt64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
lt64(b, x_hi, x_lo, y_hi, y_lo))));
}
+static nir_ssa_def *
+mul64(nir_builder *b,
+ nir_ssa_def *z_si,
+ nir_ssa_def *x_frac_hi, nir_ssa_def *x_frac_lo, nir_ssa_def *x_exp,
+ nir_ssa_def *y_frac_hi, nir_ssa_def *y_frac_lo, nir_ssa_def *y_exp)
+{
+ nir_ssa_def *z_frac_0;
+ nir_ssa_def *z_frac_1;
+ nir_ssa_def *z_frac_2;
+ nir_ssa_def *z_frac_3;
+
+ nir_ssa_def *z_frac_0_shift;
+ nir_ssa_def *z_frac_1_shift;
+ nir_ssa_def *z_frac_2_shift;
+
+ nir_ssa_def *z_exp = nir_isub(b,
+ nir_iadd(b, x_exp, y_exp),
+ nir_imm_int(b, 0x400));
+ x_frac_hi = nir_ior(b,
+ x_frac_hi,
+ nir_imm_int(b, 0x00100000));
+ short_shl64(b,
+ y_frac_hi, y_frac_lo,
+ nir_imm_int(b, 12),
+ &y_frac_hi, &y_frac_lo);
+ mul64_to_128(b, x_frac_hi, x_frac_lo,
+ y_frac_hi, y_frac_lo,
+ &z_frac_0, &z_frac_1, &z_frac_2, &z_frac_3);
+ add64(b, z_frac_0, z_frac_1,
+ x_frac_hi, x_frac_lo,
+ &z_frac_0, &z_frac_1);
+
+ z_frac_2 = nir_ior(b, z_frac_2, nir_ine(b, z_frac_3, nir_imm_int(b, 0)));
+
+ shift64_extra_right_jamming(b, z_frac_0, z_frac_1, z_frac_2,
+ nir_imm_int(b, 1),
+ &z_frac_0_shift,
+ &z_frac_1_shift,
+ &z_frac_2_shift);
+
+ return
+ nir_bcsel(b,
+ nir_uge(b, z_frac_0, nir_imm_int(b, 0x00200000)),
+ round_pack_fp64(b, z_si,
+ nir_iadd(b, z_exp, nir_imm_int(b, 1)),
+ z_frac_0_shift,
+ z_frac_1_shift,
+ z_frac_2_shift),
+ round_pack_fp64(b, z_si, z_exp, z_frac_0, z_frac_1, z_frac_2));
+}
+
+static nir_ssa_def *
+lower_fmul64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
+{
+ nir_ssa_def *x_si = get_sign(b, x);
+ nir_ssa_def *x_exp = get_exponent(b, x);
+ nir_ssa_def *x_frac_lo = get_frac_lo(b, x);
+ nir_ssa_def *x_frac_hi = get_frac_hi(b, x);
+ nir_ssa_def *y_si = get_sign(b, y);
+ nir_ssa_def *y_exp = get_exponent(b, y);
+ nir_ssa_def *y_frac_lo = get_frac_lo(b, y);
+ nir_ssa_def *y_frac_hi = get_frac_hi(b, y);
+
+ nir_ssa_def *z_si = nir_ixor(b, x_si, y_si);
+ nir_ssa_def *x_frac = nir_ior(b, x_frac_hi, x_frac_lo);
+ nir_ssa_def *x_exp_frac = nir_ior(b, x_exp, x_frac);
+ nir_ssa_def *y_frac = nir_ior(b, y_frac_hi, y_frac_lo);
+ nir_ssa_def *y_exp_frac = nir_ior(b, y_exp, y_frac);
+ nir_ssa_def *y_nan = nir_iand(b, nir_ieq(b, y_exp, nir_imm_int(b, 0x7FF)),
+ y_frac);
+
+ nir_ssa_def *zero = nir_imm_int(b, 0);
+
+ /* Result of NaN, Inf and subnormal multiply */
+ nir_ssa_def *propagate_nan = propagate_fp64_nan(b, x, y);
+
+ nir_ssa_def *pack_inf_fp64 = pack_fp64(b,
+ z_si,
+ nir_imm_int(b, 0x7FF),
+ zero,
+ zero);
+
+ nir_ssa_def *pack_zero_fp64 = pack_fp64(b,
+ z_si,
+ zero,
+ zero,
+ zero);
+
+ nir_ssa_def *default_nan =
+ nir_pack_64_2x32_split(b,
+ nir_imm_int(b, 0xFFFFFFFF),
+ nir_imm_int(b, 0xFFFFFFFF));
+
+ nir_ssa_def *x_exp_sub = x_exp;
+ nir_ssa_def *x_frac_hi_sub = x_frac_hi;
+ nir_ssa_def *x_frac_lo_sub = x_frac_lo;
+ normalize_fp64_subnormal(b,
+ x_frac_hi, x_frac_lo,
+ &x_exp_sub,
+ &x_frac_hi_sub, &x_frac_lo_sub);
+ nir_ssa_def *normalize_x = mul64(b, z_si,
+ x_frac_hi_sub, x_frac_lo_sub, x_exp_sub,
+ y_frac_hi, y_frac_lo, y_exp);
+
+ nir_ssa_def *y_exp_sub = y_exp;
+ nir_ssa_def *y_frac_hi_sub = y_frac_hi;
+ nir_ssa_def *y_frac_lo_sub = y_frac_lo;
+ normalize_fp64_subnormal(b,
+ y_frac_hi, y_frac_lo,
+ &y_exp_sub,
+ &y_frac_hi_sub, &y_frac_lo_sub);
+ nir_ssa_def *normalize_y = mul64(b, z_si,
+ x_frac_hi, x_frac_lo, x_exp,
+ y_frac_hi_sub, y_frac_lo_sub, y_exp_sub);
+
+ /*
+ * Handle the different exceptions before computing the multiply.
+ *
+ * If x * Inf or Inf * y, return Inf.
+ * If Inf * Inf, return Inf.
+ * If Inf * 0, we return a default NaN (0xFFFFFFFFFFFFFFFF)
+ *
+ * If x * NaN or NaN * y, we propagate the NaN.
+ * If NaN * NaN, we select the correct NaN to propagate.
+ *
+ * If x or y is equal to 0, we return 0.
+ *
+ * If x or y is a subnormal (exponent == 0 and significand != 0),
+ * we normalize that input and then perform the multiply.
+ */
+
+ return
+ nir_bcsel(b,
+ nir_ieq(b, x_exp, nir_imm_int(b, 0x7FF)),
+ nir_bcsel(b,
+ nir_ior(b, x_frac, y_nan),
+ propagate_nan,
+ nir_bcsel(b,
+ nir_ieq(b, y_exp_frac, zero),
+ default_nan,
+ pack_inf_fp64)),
+ nir_bcsel(b,
+ nir_ieq(b, y_exp, nir_imm_int(b, 0x7FF)),
+ nir_bcsel(b,
+ y_frac,
+ propagate_nan,
+ nir_bcsel(b,
+ nir_ieq(b, x_exp_frac, zero),
+ default_nan,
+ pack_inf_fp64)),
+ nir_bcsel(b,
+ nir_ieq(b, x_exp, zero),
+ nir_bcsel(b,
+ nir_ieq(b, x_frac, zero),
+ pack_zero_fp64,
+ normalize_x),
+ nir_bcsel(b,
+ nir_ieq(b, y_exp, zero),
+ nir_bcsel(b,
+ nir_ieq(b, y_frac,
+ zero),
+ pack_zero_fp64,
+ normalize_y),
+ mul64(b,
+ z_si,
+ x_frac_hi, x_frac_lo,
+ x_exp,
+ y_frac_hi, y_frac_lo,
+ y_exp)))));
+
+}
+
static bool
lower_doubles_instr(nir_alu_instr *instr, nir_lower_doubles_options options)
{
@@ -691,6 +1428,11 @@ lower_doubles_instr(nir_alu_instr *instr, nir_lower_doubles_options options)
return false;
break;
+ case nir_op_fmul:
+ if (!(options & nir_lower_dmul))
+ return false;
+ break;
+
default:
return false;
}
@@ -763,6 +1505,13 @@ lower_doubles_instr(nir_alu_instr *instr, nir_lower_doubles_options options)
}
break;
+ case nir_op_fmul: {
+ nir_ssa_def *src1 = nir_fmov_alu(&bld, instr->src[1],
+ instr->dest.dest.ssa.num_components);
+ result = lower_fmul64(&bld, src, src1);
+ }
+ break;
+
default:
unreachable("unhandled opcode");
}
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index 374230a89b..9dc745d327 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -514,7 +514,8 @@ nir_optimize(nir_shader *nir, const struct brw_compiler *compiler,
nir_lower_dneg |
nir_lower_dsign |
nir_lower_deq |
- nir_lower_dlt);
+ nir_lower_dlt |
+ nir_lower_dmul);
OPT(nir_lower_64bit_pack);
} while (progress);
--
2.11.0