<div dir="ltr"><div>I haven't read it all in detail but I have a strong suspicion that this is probably broken for TCS outputs.  In the TCS, we write values out immediately and we have to be able to write single components.  This is because TCS outputs also act as a sort of SLM for tessellation shaders where each thread can write its own data and then you barrier and then any thread can read the output of any other thread.  I think we can make this work but it's going to be rather annoying and will take careful thought and lots of testing.<br><br></div>--Jason<br></div><div class="gmail_extra"><br><div class="gmail_quote">On Thu, Oct 12, 2017 at 11:38 AM, Jose Maria Casanova Crespo <span dir="ltr"><<a href="mailto:jmcasanova@igalia.com" target="_blank">jmcasanova@igalia.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">Enables the support of 16-bit types on load_input and<br>
store_outputs intrinsics intra-stages.<br>
<br>
The approach is based on re-using the 32-bit URB reads<br>
and writes between stages, shuffling pairs of 16-bit values into<br>
32-bit values at the store_output intrinsic and un-shuffling the values<br>
at load_input.<br>
<br>
shuffle_32bit_load_result_to_<wbr>16bit_data and<br>
shuffle_16bit_data_for_32bit_<wbr>write are implemented in a similar<br>
way to the analogous functions for handling 64-bit types.<br>
---<br>
 src/intel/compiler/brw_fs.h       |  11 ++++<br>
 src/intel/compiler/brw_fs_nir.<wbr>cpp | 119 ++++++++++++++++++++++++++++++<wbr>+++++++-<br>
 2 files changed, 129 insertions(+), 1 deletion(-)<br>
<br>
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h<br>
index b9476e69ed..90ada3ef4b 100644<br>
--- a/src/intel/compiler/brw_fs.h<br>
+++ b/src/intel/compiler/brw_fs.h<br>
@@ -498,6 +498,17 @@ void shuffle_64bit_data_for_32bit_<wbr>write(const brw::fs_builder &bld,<br>
                                         const fs_reg &dst,<br>
                                         const fs_reg &src,<br>
                                         uint32_t components);<br>
+<br>
+void shuffle_32bit_load_result_to_<wbr>16bit_data(const brw::fs_builder &bld,<br>
+                                             const fs_reg &dst,<br>
+                                             const fs_reg &src,<br>
+                                             uint32_t components);<br>
+<br>
+void shuffle_16bit_data_for_32bit_<wbr>write(const brw::fs_builder &bld,<br>
+                                        const fs_reg &dst,<br>
+                                        const fs_reg &src,<br>
+                                        uint32_t components);<br>
+<br>
 fs_reg setup_imm_df(const brw::fs_builder &bld,<br>
                     double v);<br>
<br>
diff --git a/src/intel/compiler/brw_fs_<wbr>nir.cpp b/src/intel/compiler/brw_fs_<wbr>nir.cpp<br>
index 83ff0607a7..9c694a1c53 100644<br>
--- a/src/intel/compiler/brw_fs_<wbr>nir.cpp<br>
+++ b/src/intel/compiler/brw_fs_<wbr>nir.cpp<br>
@@ -2124,12 +2124,17 @@ fs_visitor::emit_gs_input_<wbr>load(const fs_reg &dst,<br>
       first_component = first_component / 2;<br>
    }<br>
<br>
+   if (type_sz(dst.type) == 2) {<br>
+      num_components = DIV_ROUND_UP(num_components, 2);<br>
+      tmp_dst = bld.vgrf(BRW_REGISTER_TYPE_F, num_components);<br>
+   }<br>
+<br>
    for (unsigned iter = 0; iter < num_iterations; iter++) {<br>
       if (offset_const) {<br>
          /* Constant indexing - use global offset. */<br>
          if (first_component != 0) {<br>
             unsigned read_components = num_components + first_component;<br>
-            fs_reg tmp = bld.vgrf(dst.type, read_components);<br>
+            fs_reg tmp = bld.vgrf(tmp_dst.type, read_components);<br>
             inst = bld.emit(SHADER_OPCODE_URB_<wbr>READ_SIMD8, tmp, icp_handle);<br>
             inst->size_written = read_components *<br>
                                  tmp.component_size(inst->exec_<wbr>size);<br>
@@ -2179,6 +2184,11 @@ fs_visitor::emit_gs_input_<wbr>load(const fs_reg &dst,<br>
             bld.MOV(offset(dst, bld, iter * 2 + c), offset(tmp_dst, bld, c));<br>
       }<br>
<br>
+      if (type_sz(dst.type) == 2) {<br>
+         shuffle_32bit_load_result_to_<wbr>16bit_data(<br>
+            bld, dst, retype(tmp_dst, BRW_REGISTER_TYPE_F), orig_num_components);<br>
+      }<br>
+<br>
       if (num_iterations > 1) {<br>
          num_components = orig_num_components - 2;<br>
          if(offset_const) {<br>
@@ -2484,6 +2494,11 @@ fs_visitor::nir_emit_tcs_<wbr>intrinsic(const fs_builder &bld,<br>
          dst = tmp;<br>
       }<br>
<br>
+      if (type_sz(dst.type) == 2) {<br>
+         num_components = DIV_ROUND_UP(num_components, 2);<br>
+         dst = bld.vgrf(BRW_REGISTER_TYPE_F, num_components);<br>
+      }<br>
+<br>
       for (unsigned iter = 0; iter < num_iterations; iter++) {<br>
          if (indirect_offset.file == BAD_FILE) {<br>
             /* Constant indexing - use global offset. */<br>
@@ -2539,6 +2554,11 @@ fs_visitor::nir_emit_tcs_<wbr>intrinsic(const fs_builder &bld,<br>
             }<br>
          }<br>
<br>
+         if (type_sz(orig_dst.type) == 2) {<br>
+            shuffle_32bit_load_result_to_<wbr>16bit_data(<br>
+               bld, orig_dst, dst, instr->num_components);<br>
+         }<br>
+<br>
          /* Copy the temporary to the destination to deal with writemasking.<br>
           *<br>
           * Also attempt to deal with gl_PointSize being in the .w component.<br>
@@ -2629,6 +2649,8 @@ fs_visitor::nir_emit_tcs_<wbr>intrinsic(const fs_builder &bld,<br>
       fs_reg value = get_nir_src(instr->src[0]);<br>
       bool is_64bit = (instr->src[0].is_ssa ?<br>
          instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_<wbr>size) == 64;<br>
+      bool is_16bit = (instr->src[0].is_ssa ?<br>
+         instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_<wbr>size) == 16;<br>
       fs_reg indirect_offset = get_indirect_offset(instr);<br>
       unsigned imm_offset = instr->const_index[0];<br>
       unsigned swiz = BRW_SWIZZLE_XYZW;<br>
@@ -2659,6 +2681,10 @@ fs_visitor::nir_emit_tcs_<wbr>intrinsic(const fs_builder &bld,<br>
             num_iterations = 2;<br>
             iter_components = 2;<br>
          }<br>
+      } else {<br>
+         if (is_16bit) {<br>
+            iter_components = DIV_ROUND_UP(num_components, 2);<br>
+         }<br>
       }<br>
<br>
       /* 64-bit data needs to me shuffled before we can write it to the URB.<br>
@@ -2711,6 +2737,12 @@ fs_visitor::nir_emit_tcs_<wbr>intrinsic(const fs_builder &bld,<br>
                continue;<br>
<br>
             if (!is_64bit) {<br>
+               if (is_16bit) {<br>
+                  shuffle_16bit_data_for_32bit_<wbr>write(bld,<br>
+                     retype(offset(value,bld,BRW_<wbr>GET_SWZ(swiz, i)), BRW_REGISTER_TYPE_F),<br>
+                     retype(offset(value,bld,BRW_<wbr>GET_SWZ(swiz, i)), BRW_REGISTER_TYPE_HF),<br>
+                     2);<br>
+               }<br>
                srcs[header_regs + i + first_component] =<br>
                   offset(value, bld, BRW_GET_SWZ(swiz, i));<br>
             } else {<br>
@@ -2862,6 +2894,11 @@ fs_visitor::nir_emit_tes_<wbr>intrinsic(const fs_builder &bld,<br>
             dest = tmp;<br>
          }<br>
<br>
+         if (type_sz(dest.type) == 2) {<br>
+            num_components = DIV_ROUND_UP(num_components, 2);<br>
+            dest = bld.vgrf(BRW_REGISTER_TYPE_F, num_components);<br>
+         }<br>
+<br>
          for (unsigned iter = 0; iter < num_iterations; iter++) {<br>
             const fs_reg srcs[] = {<br>
                retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),<br>
@@ -2904,6 +2941,11 @@ fs_visitor::nir_emit_tes_<wbr>intrinsic(const fs_builder &bld,<br>
                }<br>
             }<br>
<br>
+            if (type_sz(dest.type) == 2) {<br>
+               shuffle_32bit_load_result_to_<wbr>16bit_data(<br>
+                  bld, orig_dest, retype(dest, BRW_REGISTER_TYPE_F), num_components);<br>
+            }<br>
+<br>
             /* If we are loading double data and we need a second read message<br>
              * adjust the offset<br>
              */<br>
@@ -3257,6 +3299,13 @@ fs_visitor::nir_emit_fs_<wbr>intrinsic(const fs_builder &bld,<br>
          num_components *= 2;<br>
       }<br>
<br>
+      fs_reg orig_dest = dest;<br>
+      if (nir_dest_bit_size(instr-><wbr>dest) == 16) {<br>
+         type = BRW_REGISTER_TYPE_F;<br>
+         num_components = DIV_ROUND_UP(num_components, 2);<br>
+         dest = bld.vgrf(type, num_components);<br>
+      }<br>
+<br>
       for (unsigned int i = 0; i < num_components; i++) {<br>
          struct brw_reg interp = interp_reg(base, component + i);<br>
          interp = suboffset(interp, 3);<br>
@@ -3270,6 +3319,12 @@ fs_visitor::nir_emit_fs_<wbr>intrinsic(const fs_builder &bld,<br>
                                                  retype(dest, type),<br>
                                                  instr->num_components);<br>
       }<br>
+      if (nir_dest_bit_size(instr-><wbr>dest) == 16) {<br>
+         shuffle_32bit_load_result_to_<wbr>16bit_data(bld,<br>
+                                                 orig_dest,<br>
+                                                 retype(dest, type),<br>
+                                                 instr->num_components);<br>
+      }<br>
       break;<br>
    }<br>
<br>
@@ -4182,6 +4237,7 @@ fs_visitor::nir_emit_<wbr>intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr<br>
       unsigned first_component = nir_intrinsic_component(instr)<wbr>;<br>
       unsigned bit_size = instr->src[0].is_ssa ?<br>
          instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_<wbr>size;<br>
+<br>
       if (bit_size == 64) {<br>
          fs_reg tmp =<br>
             fs_reg(VGRF, alloc.allocate(2 * num_components),<br>
@@ -4192,6 +4248,16 @@ fs_visitor::nir_emit_<wbr>intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr<br>
          num_components *= 2;<br>
       }<br>
<br>
+      if (bit_size == 16) {<br>
+         fs_reg tmp =<br>
+            fs_reg(VGRF, alloc.allocate(DIV_ROUND_UP(<wbr>num_components, 2)),<br>
+                   BRW_REGISTER_TYPE_F);<br>
+         shuffle_16bit_data_for_32bit_<wbr>write(<br>
+            bld, tmp, retype(src, BRW_REGISTER_TYPE_HF), num_components);<br>
+         src = retype(tmp, src.type);<br>
+         num_components = DIV_ROUND_UP(num_components, 2);<br>
+      }<br>
+<br>
       for (unsigned j = 0; j < num_components; j++) {<br>
          bld.MOV(offset(new_dest, bld, j + first_component),<br>
                  offset(src, bld, j));<br>
@@ -4815,6 +4881,33 @@ shuffle_32bit_load_result_to_<wbr>64bit_data(const fs_builder &bld,<br>
    }<br>
 }<br>
<br>
+void<br>
+shuffle_32bit_load_result_to_<wbr>16bit_data(const fs_builder &bld,<br>
+                                        const fs_reg &dst,<br>
+                                        const fs_reg &src,<br>
+                                        uint32_t components)<br>
+{<br>
+   assert(type_sz(src.type) == 4);<br>
+   assert(type_sz(dst.type) == 2);<br>
+<br>
+   fs_reg tmp = retype(bld.vgrf(src.type), dst.type);<br>
+<br>
+   for (unsigned i = 0; i < components; i++) {<br>
+      const fs_reg component_i = subscript(offset(src, bld, i / 2), dst.type, i % 2);<br>
+<br>
+      bld.MOV(offset(tmp, bld, i % 2), component_i);<br>
+<br>
+      if (i % 2) {<br>
+         bld.MOV(offset(dst, bld, i -1), offset(tmp, bld, 0));<br>
+         bld.MOV(offset(dst, bld, i), offset(tmp, bld, 1));<br>
+      }<br>
+   }<br>
+   if (components % 2) {<br>
+      bld.MOV(offset(dst, bld, components - 1), tmp);<br>
+   }<br>
+}<br>
+<br>
+<br>
 /**<br>
  * This helper does the inverse operation of<br>
  * SHUFFLE_32BIT_LOAD_RESULT_TO_<wbr>64BIT_DATA.<br>
@@ -4849,6 +4942,30 @@ shuffle_64bit_data_for_32bit_<wbr>write(const fs_builder &bld,<br>
    }<br>
 }<br>
<br>
+void<br>
+shuffle_16bit_data_for_32bit_<wbr>write(const fs_builder &bld,<br>
+                                   const fs_reg &dst,<br>
+                                   const fs_reg &src,<br>
+                                   uint32_t components)<br>
+{<br>
+   assert(type_sz(src.type) == 2);<br>
+   assert(type_sz(dst.type) == 4);<br>
+<br>
+   fs_reg tmp = bld.vgrf(dst.type);<br>
+<br>
+   for (unsigned i = 0; i < components; i++) {<br>
+      const fs_reg component_i = offset(src, bld, i);<br>
+      bld.MOV(subscript(tmp, src.type, i % 2), component_i);<br>
+      if (i % 2) {<br>
+         bld.MOV(offset(dst, bld, i / 2), tmp);<br>
+      }<br>
+      if (components % 2) {<br>
+         bld.MOV(offset(dst, bld, components / 2), tmp);<br>
+      }<br>
+   }<br>
+}<br>
+<br>
+<br>
 fs_reg<br>
 setup_imm_df(const fs_builder &bld, double v)<br>
 {<br>
<span class="HOEnZb"><font color="#888888">--<br>
2.13.6<br>
<br>
______________________________<wbr>_________________<br>
mesa-dev mailing list<br>
<a href="mailto:mesa-dev@lists.freedesktop.org">mesa-dev@lists.freedesktop.org</a><br>
<a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev" rel="noreferrer" target="_blank">https://lists.freedesktop.org/<wbr>mailman/listinfo/mesa-dev</a><br>
</font></span></blockquote></div><br></div>