<div dir="ltr">Reviewed-by: Jason Ekstrand <<a href="mailto:jason@jlekstrand.net">jason@jlekstrand.net</a>><br></div><div class="gmail_extra"><br><div class="gmail_quote">On Wed, Feb 28, 2018 at 5:38 AM, Jose Maria Casanova Crespo <span dir="ltr"><<a href="mailto:jmcasanova@igalia.com" target="_blank">jmcasanova@igalia.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">16-bit load_ubo/ssbo operations that call do_untyped_vector_read don't<br>
guarantee that offsets are a multiple of 4 bytes, as required by the untyped_read<br>
message. This happens, for example, in the case of f16mat3x3 when<br>
VK_KHR_relaxed_block_layout is enabled.<br>
<br>
Vector reads with non-constant offsets are implemented with<br>
multiple byte_scattered_read messages, which do not require 32-bit aligned offsets.<br>
<br>
Now for all constant offsets we can use the untyped_read_surface message.<br>
In the case of constant offsets not aligned to 32-bits, we calculate a<br>
32-bit aligned start offset and use the shuffle_32bit_load_result_to_<wbr>16bit_data<br>
function and the first_component parameter to skip the copy of the unneeded<br>
component.<br>
<br>
v2: (Jason Ekstrand)<br>
    Always use untyped_read_surface messages when we have constant offsets.<br>
<br>
v3: (Jason Ekstrand)<br>
    Simplify loop for reads with non constant offsets.<br>
    Use end - start to calculate the number of 32-bit components to read with<br>
    constant offsets.<br>
---<br>
 src/intel/compiler/brw_fs_nir.<wbr>cpp | 51 ++++++++++++++++++++++++++++--<wbr>---------<br>
 1 file changed, 37 insertions(+), 14 deletions(-)<br>
<br>
diff --git a/src/intel/compiler/brw_fs_<wbr>nir.cpp b/src/intel/compiler/brw_fs_<wbr>nir.cpp<br>
index 0d1ab5b01c..3f077b3c91 100644<br>
--- a/src/intel/compiler/brw_fs_<wbr>nir.cpp<br>
+++ b/src/intel/compiler/brw_fs_<wbr>nir.cpp<br>
@@ -2304,28 +2304,51 @@ do_untyped_vector_read(const fs_builder &bld,<br>
 {<br>
    if (type_sz(dest.type) <= 2) {<br>
       assert(dest.stride == 1);<br>
+      boolean is_const_offset = offset_reg.file == BRW_IMMEDIATE_VALUE;<br>
<br>
-      if (num_components > 1) {<br>
-         /* Pairs of 16-bit components can be read with untyped read, for 16-bit<br>
-          * vec3 4th component is ignored.<br>
+      if (is_const_offset) {<br>
+         uint32_t start = offset_reg.ud & ~3;<br>
+         uint32_t end = offset_reg.ud + num_components * type_sz(dest.type);<br>
+         end = ALIGN(end, 4);<br>
+         assert (end - start <= 16);<br>
+<br>
+         /* At this point we have 16-bit component/s that have constant<br>
+          * offset aligned to 4-bytes that can be read with untyped_reads.<br>
+          * untyped_read message requires 32-bit aligned offsets.<br>
           */<br>
+         unsigned first_component = (offset_reg.ud & 3) / type_sz(dest.type);<br>
+         unsigned num_components_32bit = (end - start) / 4;<br>
+<br>
          fs_reg read_result =<br>
-            emit_untyped_read(bld, surf_index, offset_reg,<br>
-                              1 /* dims */, DIV_ROUND_UP(num_components, 2),<br>
+            emit_untyped_read(bld, surf_index, brw_imm_ud(start),<br>
+                              1 /* dims */,<br>
+                              num_components_32bit,<br>
                               BRW_PREDICATE_NONE);<br>
          shuffle_32bit_load_result_to_<wbr>16bit_data(bld,<br>
                retype(dest, BRW_REGISTER_TYPE_W),<br>
                retype(read_result, BRW_REGISTER_TYPE_D),<br>
-               0, num_components);<br>
+               first_component, num_components);<br>
       } else {<br>
-         assert(num_components == 1);<br>
-         /* scalar 16-bit are read using one byte_scattered_read message */<br>
-         fs_reg read_result =<br>
-            emit_byte_scattered_read(bld, surf_index, offset_reg,<br>
-                                     1 /* dims */, 1,<br>
-                                     type_sz(dest.type) * 8 /* bit_size */,<br>
-                                     BRW_PREDICATE_NONE);<br>
-         bld.MOV(dest, subscript(read_result, dest.type, 0));<br>
+         fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD)<wbr>;<br>
+         for (unsigned i = 0; i < num_components; i++) {<br>
+            if (i == 0) {<br>
+               bld.MOV(read_offset, offset_reg);<br>
+            } else {<br>
+               bld.ADD(read_offset, offset_reg,<br>
+                       brw_imm_ud(i * type_sz(dest.type)));<br>
+            }<br>
+            /* Non constant offsets are not guaranteed to be aligned 32-bits<br>
+             * so they are read using one byte_scattered_read message<br>
+             * for each component.<br>
+             */<br>
+            fs_reg read_result =<br>
+               emit_byte_scattered_read(bld, surf_index, read_offset,<br>
+                                        1 /* dims */, 1,<br>
+                                        type_sz(dest.type) * 8 /* bit_size */,<br>
+                                        BRW_PREDICATE_NONE);<br>
+            bld.MOV(offset(dest, bld, i),<br>
+                    subscript (read_result, dest.type, 0));<br>
+         }<br>
       }<br>
    } else if (type_sz(dest.type) == 4) {<br>
       fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,<br>
<span class="HOEnZb"><font color="#888888">--<br>
2.16.1<br>
<br>
</font></span></blockquote></div><br></div>