<div dir="ltr"><div class="gmail_quote"><div dir="ltr">On Sun, Oct 14, 2018 at 5:12 PM Matt Turner <<a href="mailto:mattst88@gmail.com">mattst88@gmail.com</a>> wrote:<br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">---<br>
 src/compiler/nir/nir.h             |   1 +<br>
 src/compiler/nir/nir_lower_int64.c | 142 +++++++++++++++++++++++++++++++++++++<br>
 2 files changed, 143 insertions(+)<br>
<br>
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h<br>
index 12cbd030e21..2c477126acc 100644<br>
--- a/src/compiler/nir/nir.h<br>
+++ b/src/compiler/nir/nir.h<br>
@@ -3001,6 +3001,7 @@ typedef enum {<br>
    nir_lower_ineg64    = (1 << 7),<br>
    nir_lower_logic64   = (1 << 8),<br>
    nir_lower_minmax64  = (1 << 9),<br>
+   nir_lower_shift64   = (1 << 10),<br>
 } nir_lower_int64_options;<br>
<br>
 bool nir_lower_int64(nir_shader *shader, nir_lower_int64_options options);<br>
diff --git a/src/compiler/nir/nir_lower_int64.c b/src/compiler/nir/nir_lower_int64.c<br>
index 9cdc8a9d592..25882d3a858 100644<br>
--- a/src/compiler/nir/nir_lower_int64.c<br>
+++ b/src/compiler/nir/nir_lower_int64.c<br>
@@ -90,6 +90,138 @@ lower_ixor64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)<br>
                                     nir_ixor(b, x_hi, y_hi));<br>
 }<br>
<br>
+static nir_ssa_def *<br>
+lower_ishl64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)<br>
+{<br>
+   /* Implemented as<br>
+    *<br>
+    * uint64_t lshift(uint64_t x, int c)<br>
+    * {<br>
+    *    if (c == 0) return x;<br>
+    *<br>
+    *    uint32_t lo = LO(x), hi = HI(x);<br>
+    *<br>
+    *    if (c < 32) {<br>
+    *       uint32_t lo_shifted = lo << (c & 0x1f);<br>
+    *       uint32_t hi_shifted = hi << (c & 0x1f);<br>
+    *       uint32_t lo_shifted_hi = lo >> (abs(32 - c) & 0x1f);<br></blockquote><div><br></div><div>Why the abs and the &?  It's already predicated on c < 32, and negative or out-of-bounds shifts already have undefined results.<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+    *       return pack_64(lo_shifted, hi_shifted | lo_shifted_hi);<br>
+    *    } else {<br>
+    *       uint32_t lo_shifted_hi = lo << (abs(32 - c) & 0x1f);<br>
+    *       return pack_64(0, lo_shifted_hi);<br>
+    *    }<br>
+    * }<br>
+    */<br>
+   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);<br>
+   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);<br>
+<br>
+   nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, nir_imm_int(b, -32)));<br></blockquote><div><br></div><div>This is iabs(c - 32) (which yields the same result but isn't the same expression) and doesn't have the & 0x1f.</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+   nir_ssa_def *lo_shifted = nir_ishl(b, x_lo, y);<br>
+   nir_ssa_def *hi_shifted = nir_ishl(b, x_hi, y);<br></blockquote><div><br></div><div>In general, all of the "& 0x1f" masks are missing from the code.  While not having them works on
 i965, there's no guarantee it works in general.  Maybe we should add 
them in and have an i965-specific optimization to delete them again?  
Maybe it's ok to just not have them.  In any case, the code down here 
should match the code above or there should be a very good comment 
saying why it doesn't.</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+   nir_ssa_def *lo_shifted_hi = nir_ushr(b, x_lo, reverse_count);<br>
+<br>
+   nir_ssa_def *res_if_lt_32 =<br>
+      nir_pack_64_2x32_split(b, lo_shifted,<br>
+                                nir_ior(b, hi_shifted, lo_shifted_hi));<br>
+   nir_ssa_def *res_if_ge_32 =<br>
+      nir_pack_64_2x32_split(b, nir_imm_int(b, 0),<br>
+                                nir_ishl(b, x_lo, reverse_count));<br>
+<br>
+   return nir_bcsel(b,<br>
+                    nir_ieq(b, y, nir_imm_int(b, 0)), x,<br>
+                    nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)),<br>
+                                 res_if_ge_32, res_if_lt_32));<br>
+}<br>
+<br>
+static nir_ssa_def *<br>
+lower_ishr64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)<br>
+{<br>
+   /* Implemented as<br>
+    *<br>
+    * uint64_t arshift(uint64_t x, int c)<br>
+    * {<br>
+    *    if (c == 0) return x;<br>
+    *<br>
+    *    uint32_t lo = LO(x);<br>
+    *    int32_t  hi = HI(x);<br>
+    *<br>
+    *    if (c < 32) {<br>
+    *       uint32_t lo_shifted = lo >> (c & 0x1f);<br>
+    *       uint32_t hi_shifted = hi >> (c & 0x1f);<br>
+    *       uint32_t hi_shifted_lo = hi << (abs(32 - c) & 0x1f);<br>
+    *       return pack_64(hi_shifted, hi_shifted_lo | lo_shifted);<br>
+    *    } else {<br>
+    *       uint32_t hi_shifted = hi >> 31;<br>
+    *       uint32_t hi_shifted_lo = hi >> (abs(32 - c) & 0x1f);<br>
+    *       return pack_64(hi_shifted, hi_shifted_lo);<br>
+    *    }<br>
+    * }<br>
+    */<br>
+   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);<br>
+   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);<br>
+<br>
+   nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, nir_imm_int(b, -32)));<br>
+   nir_ssa_def *lo_shifted = nir_ushr(b, x_lo, y);<br>
+   nir_ssa_def *hi_shifted = nir_ishr(b, x_hi, y);<br>
+   nir_ssa_def *hi_shifted_lo = nir_ishl(b, x_hi, reverse_count);<br>
+<br>
+   nir_ssa_def *res_if_lt_32 =<br>
+      nir_pack_64_2x32_split(b, nir_ior(b, lo_shifted, hi_shifted_lo),<br>
+                                hi_shifted);<br>
+   nir_ssa_def *res_if_ge_32 =<br>
+      nir_pack_64_2x32_split(b, nir_ishr(b, x_hi, reverse_count),<br>
+                                nir_ishr(b, x_hi, nir_imm_int(b, 31)));<br>
+<br>
+   return nir_bcsel(b,<br>
+                    nir_ieq(b, y, nir_imm_int(b, 0)), x,<br>
+                    nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)),<br>
+                                 res_if_ge_32, res_if_lt_32));<br>
+}<br>
+<br>
+static nir_ssa_def *<br>
+lower_ushr64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)<br>
+{<br>
+   /* Implemented as<br>
+    *<br>
+    * uint64_t rshift(uint64_t x, int c)<br>
+    * {<br>
+    *    if (c == 0) return x;<br>
+    *<br>
+    *    uint32_t lo = LO(x), hi = HI(x);<br>
+    *<br>
+    *    if (c < 32) {<br>
+    *       uint32_t lo_shifted = lo >> (c & 0x1f);<br>
+    *       uint32_t hi_shifted = hi >> (c & 0x1f);<br>
+    *       uint32_t hi_shifted_lo = hi << (abs(32 - c) & 0x1f);<br>
+    *       return pack_64(hi_shifted, hi_shifted_lo | lo_shifted);<br>
+    *    } else {<br>
+    *       uint32_t hi_shifted_lo = hi >> (abs(32 - c) & 0x1f);<br>
+    *       return pack_64(0, hi_shifted_lo);<br>
+    *    }<br>
+    * }<br>
+    */<br>
+<br>
+   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);<br>
+   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);<br>
+<br>
+   nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, nir_imm_int(b, -32)));<br>
+   nir_ssa_def *lo_shifted = nir_ushr(b, x_lo, y);<br>
+   nir_ssa_def *hi_shifted = nir_ushr(b, x_hi, y);<br>
+   nir_ssa_def *hi_shifted_lo = nir_ishl(b, x_hi, reverse_count);<br>
+<br>
+   nir_ssa_def *res_if_lt_32 =<br>
+      nir_pack_64_2x32_split(b, nir_ior(b, lo_shifted, hi_shifted_lo),<br>
+                                hi_shifted);<br>
+   nir_ssa_def *res_if_ge_32 =<br>
+      nir_pack_64_2x32_split(b, nir_ushr(b, x_hi, reverse_count),<br>
+                                nir_imm_int(b, 0));<br>
+<br>
+   return nir_bcsel(b,<br>
+                    nir_ieq(b, y, nir_imm_int(b, 0)), x,<br>
+                    nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)),<br>
+                                 res_if_ge_32, res_if_lt_32));<br>
+}<br>
+<br>
 static nir_ssa_def *<br>
 lower_iadd64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)<br>
 {<br>
@@ -430,6 +562,10 @@ opcode_to_options_mask(nir_op opcode)<br>
    case nir_op_ixor:<br>
    case nir_op_inot:<br>
       return nir_lower_logic64;<br>
+   case nir_op_ishl:<br>
+   case nir_op_ishr:<br>
+   case nir_op_ushr:<br>
+      return nir_lower_shift64;<br>
    default:<br>
       return 0;<br>
    }<br>
@@ -492,6 +628,12 @@ lower_int64_alu_instr(nir_builder *b, nir_alu_instr *alu)<br>
       return lower_ixor64(b, src[0], src[1]);<br>
    case nir_op_inot:<br>
       return lower_inot64(b, src[0]);<br>
+   case nir_op_ishl:<br>
+      return lower_ishl64(b, src[0], src[1]);<br>
+   case nir_op_ishr:<br>
+      return lower_ishr64(b, src[0], src[1]);<br>
+   case nir_op_ushr:<br>
+      return lower_ushr64(b, src[0], src[1]);<br>
    default:<br>
       unreachable("Invalid ALU opcode to lower");<br>
    }<br>
-- <br>
2.16.4<br>
<br>
_______________________________________________<br>
mesa-dev mailing list<br>
<a href="mailto:mesa-dev@lists.freedesktop.org" target="_blank">mesa-dev@lists.freedesktop.org</a><br>
<a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev" rel="noreferrer" target="_blank">https://lists.freedesktop.org/mailman/listinfo/mesa-dev</a><br>
</blockquote></div></div>