[Mesa-dev] [PATCH 2/2] intel/fs: Write multiple 8/16-bit components with byte_scattered_write

Wed Jul 25 21:24:58 UTC 2018

We also pack the maximum number of 8/16-bit components that fit into a
single byte_scattered_write message.

Comments have been rewritten to adapt them to the 8-bit case.
---
 src/intel/compiler/brw_fs_nir.cpp | 66 ++++++++++++++++++-------------
 1 file changed, 38 insertions(+), 28 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index a1f946708ed..7259acb862e 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4263,6 +4263,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
          fs_reg write_src = offset(val_reg, bld, first_component);
 
          nir_const_value *const_offset = nir_src_as_const_value(instr->src[2]);
+         bool use_scattered_write = false;
 
          if (type_size > 4) {
             /* We can't write more than 2 64-bit components at once. Limit
@@ -4273,29 +4274,38 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
             write_src = shuffle_for_32bit_write(bld, write_src, 0,
                                                 num_components);
          } else if (type_size < 4) {
-            /* For 16-bit types we pack two consecutive values into a 32-bit
-             * word and use an untyped write message. For single values or not
-             * 32-bit-aligned we need to use byte-scattered writes because
-             * untyped writes works with 32-bit components with 32-bit
-             * alignment. byte_scattered_write messages only support one
-             * 16-bit component at a time. As VK_KHR_relaxed_block_layout
-             * could be enabled we can not guarantee that not constant offsets
-             * to be 32-bit aligned for 16-bit types. For example an array, of
-             * 16-bit vec3 with array element stride of 6.
+            /* For 8/16-bit types we pack consecutive values into a 32-bit
+             * type and use an untyped write message. When the size is not a
+             * multiple of 4 bytes or the offset is not 32-bit aligned we need
+             * to use byte-scattered writes because they don't require 32-bit
+             * components or 32-bit offset alignment. We can pack multiple
+             * 8/16-bit components into one 8/16/32-bit component used by the
+             * byte_scattered_write message.
+             *
+             * As VK_KHR_relaxed_block_layout could be requested, and it is
+             * core in VK 1.1, we cannot guarantee non-constant offsets to be
+             * 32-bit aligned for 8/16-bit types. For example, a 16-bit vec3
+             * may begin at offset 2 in a structure.
              *
              * In the case of 32-bit aligned constant offsets if there is
-             * a 3-components vector we submit one untyped-write message
+             * a 16-bit vec3 we submit one untyped-write message
              * of 32-bit (first two components), and one byte-scattered
              * write message (the last component).
              */
-
-            if ( !const_offset || ((const_offset->u32[0] +
-                                   type_size * first_component) % 4)) {
-               /* If we use a .yz writemask we also need to emit 2
-                * byte-scattered write messages because of y-component not
-                * being aligned to 32-bit.
+            if (!const_offset || ((const_offset->u32[0] +
+                                   type_size * first_component) % 4) ||
+                num_components * type_size < 4) {
+               /* If we don't have a constant offset, or the constant offset
+                * is not 32-bit aligned, or we are writing less than 32 bits,
+                * we use byte_scattered_write with the maximum number of
+                * components we can pack exactly into one 8/16/32-bit component.
+                * So for an int8 vec3 we have to split it into two writes: one
+                * 16-bit and one 8-bit.
                 */
-               num_components = 1;
+               use_scattered_write = true;
+               num_components = MIN2(4 / type_size, num_components);
+               if (num_components == 3)
+                  num_components = 2;
             } else if (num_components * type_size > 4 &&
                        (num_components * type_size % 4)) {
                /* If the pending components size is not a multiple of 4 bytes
@@ -4303,13 +4313,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
                 * length == 1 with byte_scattered_write.
                 */
                num_components -= (num_components * type_size % 4) / type_size;
-            } else if (num_components * type_size < 4) {
-               num_components = 1;
             }
             /* For num_components == 1 we are also shuffling the component
-             * because byte scattered writes of 16-bit need values to be dword
-             * aligned. Shuffling only one component would be the same as
-             * striding it.
+             * because byte scattered writes of 8/16-bit need values to be
+             * 32-bit aligned.
              */
             write_src = shuffle_for_32bit_write(bld, write_src, 0,
                                                 num_components);
@@ -4327,16 +4334,19 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
                     brw_imm_ud(type_size * first_component));
          }
 
-         if (type_size < 4 && num_components == 1) {
-            /* Untyped Surface messages have a fixed 32-bit size, so we need
-             * to rely on byte scattered in order to write 16-bit elements.
-             * The byte_scattered_write message needs that every written 16-bit
-             * type to be aligned 32-bits (stride=2).
+         if (use_scattered_write) {
+            assert(num_components * bit_size <= 32);
+            assert(util_is_power_of_two_nonzero(num_components * bit_size));
+            /* Untyped Surface messages have a fixed 32-bit size and are
+             * limited to 32-bit-aligned offsets, so we rely on
+             * byte_scattered_write in order to write 8/16-bit elements. We
+             * pack multiple 8/16-bit components into one single message,
+             * using the suitable bit size, which is limited to 8/16/32 bits.
              */
             emit_byte_scattered_write(bld, surf_index, offset_reg,
                                       write_src,
                                       1 /* dims */, 1,
-                                      bit_size,
+                                      bit_size * num_components,
                                       BRW_PREDICATE_NONE);
          } else {
             assert(num_components * type_size <= 16);
-- 
2.17.1