[Mesa-dev] [PATCH 1/2] intel/fs: Read multiple 8/16-bit components with byte_scattered_read

Wed Jul 25 21:24:57 UTC 2018

We use the byte_scattered_read message because it allows reading from
offsets that are not 32-bit aligned. Until now we were reading only one
component per message.

By using a 32-bit bit size with byte_scattered_read we can read up to two
16-bit components or four 8-bit components with a single message per
iteration.

The same applies to a 16-bit bit size when reading two 8-bit components.
In the case of an int8 vec3, we read the three components with a single
32-bit read and ignore the padding byte.

Cc: Jason Ekstrand <jason at jlekstrand.net>
---
 src/intel/compiler/brw_fs_nir.cpp | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 9b11b5fbd01..a1f946708ed 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -2415,24 +2415,34 @@ do_untyped_vector_read(const fs_builder &bld,
                                  num_components);
       } else {
          fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
-         for (unsigned i = 0; i < num_components; i++) {
-            if (i == 0) {
+         unsigned iters = DIV_ROUND_UP(type_sz(dest.type) * num_components, 4);
+         for (unsigned it = 0; it < iters; it++) {
+            if (it == 0) {
                bld.MOV(read_offset, offset_reg);
             } else {
-               bld.ADD(read_offset, offset_reg,
-                       brw_imm_ud(i * type_sz(dest.type)));
+               bld.ADD(read_offset, offset_reg, brw_imm_ud(4 * it));
             }
+            unsigned iter_components = MIN2(4 / type_sz(dest.type),
+                                            num_components);
+            num_components -= iter_components;
+            /* We adjust bitsize_read to hold as many components as we can in
+             * the same read message. We use a 32-bit read for an 8-bit vec3
+             * and ignore the last (padding) component.
+             */
+            unsigned bitsize_read = util_next_power_of_two(8 * iter_components *
+                                                           type_sz(dest.type));
             /* Non constant offsets are not guaranteed to be aligned 32-bits
-             * so they are read using one byte_scattered_read message
-             * for each component.
+             * for 8/16-bit components. We use byte_scattered_read to read
+             * one or more components, up to 4 bytes per iteration.
              */
             fs_reg read_result =
                emit_byte_scattered_read(bld, surf_index, read_offset,
                                         1 /* dims */, 1,
-                                        type_sz(dest.type) * 8 /* bit_size */,
+                                        bitsize_read,
                                         BRW_PREDICATE_NONE);
-            bld.MOV(offset(dest, bld, i),
-                    subscript (read_result, dest.type, 0));
+            shuffle_from_32bit_read(bld, offset(dest, bld,
+                                                it * 4 / type_sz(dest.type)),
+                                    read_result, 0, iter_components);
          }
       }
    } else if (type_sz(dest.type) == 4) {
-- 
2.17.1