<div dir="ltr"><div class="gmail_extra"><div class="gmail_quote">On Tue, Feb 27, 2018 at 5:27 AM, Jose Maria Casanova Crespo <span dir="ltr">&lt;<a href="mailto:jmcasanova@igalia.com" target="_blank">jmcasanova@igalia.com</a>&gt;</span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">16-bit load_ubo/ssbo operations that call do_untyped_vector_read don't<br>
guarantee that offsets are multiple of 4-bytes as required by untyped_read<br>
message. This happens for example in the case of f16mat3x3 when the<br>
VK_KHR_relaxed_block_layout is enabled.<br>
<br>
Vector reads when we have non-constant offsets are implemented with<br>
multiple byte_scattered_read messages that do not require 32-bit aligned offsets.<br>
<br>
Now for all constant offsets we can use the untyped_read_surface message.<br>
In the case of constant offsets not aligned to 32-bits, we calculate a<br>
start offset 32-bit aligned and use the shuffle_32bit_load_result_to_<wbr>16bit_data<br>
function and the first_component parameter to skip the copy of the unneeded<br>
component.<br>
<br>
v2: Always use untyped_read_surface messages when we have constant offsets.<br>
    (Jason Ekstrand)<br>
---<br>
 src/intel/compiler/brw_fs_nir.<wbr>cpp | 54 +++++++++++++++++++++++++++++-<wbr>---------<br>
 1 file changed, 40 insertions(+), 14 deletions(-)<br>
<br>
diff --git a/src/intel/compiler/brw_fs_<wbr>nir.cpp b/src/intel/compiler/brw_fs_<wbr>nir.cpp<br>
index 5567433a19e..affb242668a 100644<br>
--- a/src/intel/compiler/brw_fs_<wbr>nir.cpp<br>
+++ b/src/intel/compiler/brw_fs_<wbr>nir.cpp<br>
@@ -2304,28 +2304,54 @@ do_untyped_vector_read(const fs_builder &bld,<br>
 {<br>
    if (type_sz(dest.type) <= 2) {<br>
       assert(dest.stride == 1);<br>
+      boolean is_const_offset = offset_reg.file == BRW_IMMEDIATE_VALUE;<br>
<br>
-      if (num_components > 1) {<br>
-         /* Pairs of 16-bit components can be read with untyped read, for 16-bit<br>
-          * vec3 4th component is ignored.<br>
+      if (is_const_offset) {<br>
+         uint32_t start = offset_reg.ud & ~3;<br>
+         uint32_t end = offset_reg.ud + num_components * type_sz(dest.type);<br>
+         end = ALIGN(end, 4);<br>
+         assert (end - start <= 16);<br>
+<br>
+         /* At this point we have 16-bit component/s that have constant<br>
+          * offset aligned to 4-bytes that can be read with untyped_reads.<br>
+          * untyped_read message requires 32-bit aligned offsets.<br>
           */<br>
+         unsigned first_component = (offset_reg.ud & 3) / type_sz(dest.type);<br>
+         unsigned num_components_32bit =<br>
+            DIV_ROUND_UP(first_component + num_components, 4 / type_sz(dest.type));<br></blockquote><div><br></div><div>This could also be (end - start) / 4.  You don't even need DIV_ROUND_UP since both start and end are 4-aligned.<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+<br>
          fs_reg read_result =<br>
-            emit_untyped_read(bld, surf_index, offset_reg,<br>
-                              1 /* dims */, DIV_ROUND_UP(num_components, 2),<br>
+            emit_untyped_read(bld, surf_index, brw_imm_ud(start),<br>
+                              1 /* dims */,<br>
+                              num_components_32bit,<br>
                               BRW_PREDICATE_NONE);<br>
          shuffle_32bit_load_result_to_<wbr>16bit_data(bld,<br>
                retype(dest, BRW_REGISTER_TYPE_W),<br>
                retype(read_result, BRW_REGISTER_TYPE_D),<br>
-               num_components, 0);<br>
+               num_components, first_component);<br>
       } else {<br>
-         assert(num_components == 1);<br>
-         /* scalar 16-bit are read using one byte_scattered_read message */<br>
-         fs_reg read_result =<br>
-            emit_byte_scattered_read(bld, surf_index, offset_reg,<br>
-                                     1 /* dims */, 1,<br>
-                                     type_sz(dest.type) * 8 /* bit_size */,<br>
-                                     BRW_PREDICATE_NONE);<br>
-         bld.MOV(dest, subscript(read_result, dest.type, 0));<br>
+         fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD)<wbr>;<br>
+         bld.MOV(read_offset, offset_reg);<br>
+         unsigned first_component = 0;<br>
+         unsigned pending_components = num_components;<br>
+         while (pending_components > 0) {<br></blockquote><div><br></div><div>Now that we're not doing anything crazy, this can be a simple for loop:<br><br></div><div>for (unsigned i = 0; i < num_components; i++) {<br></div><div>   if (i == 0)<br></div><div>      bld.MOV(read_offset, offset_reg);<br></div><div>   else<br></div><div>      bld.ADD(read_offset, offset_reg, brw_imm_ud(i * type_sz(dest.type)));<br><br></div><div>   fs_reg read_result =<br></div><div>      emit_byte_scattered_read(...);<br></div><div>   bld.MOV(offset(dest, bld, i), subscript(read_result, dest.type, 0));<br>}<br></div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
+            /* Non constant offsets are not guaranteed to be aligned 32-bits<br>
+             * so they are read using one byte_scattered_read message<br>
+             * for each component.<br>
+             */<br>
+            fs_reg read_result =<br>
+               emit_byte_scattered_read(bld, surf_index, read_offset,<br>
+                                        1 /* dims */, 1,<br>
+                                        type_sz(dest.type) * 8 /* bit_size */,<br>
+                                        BRW_PREDICATE_NONE);<br>
+            shuffle_32bit_load_result_to_<wbr>16bit_data(bld,<br>
+               retype(offset(dest, bld, first_component), BRW_REGISTER_TYPE_W),<br>
+               retype(read_result, BRW_REGISTER_TYPE_D),<br>
+               1, 0);<br>
+            pending_components--;<br>
+            first_component ++;<br>
+            bld.ADD(read_offset, offset_reg, brw_imm_ud(2 * first_component));<br>
+         }<br>
       }<br>
    } else if (type_sz(dest.type) == 4) {<br>
       fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,<br>
<span class="HOEnZb"><font color="#888888">--<br>
2.14.3<br>
<br>
______________________________<wbr>_________________<br>
mesa-dev mailing list<br>
<a href="mailto:mesa-dev@lists.freedesktop.org">mesa-dev@lists.freedesktop.org</a><br>
<a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev" rel="noreferrer" target="_blank">https://lists.freedesktop.org/<wbr>mailman/listinfo/mesa-dev</a><br>
</font></span></blockquote></div><br></div></div>