[Mesa-dev] [PATCH v3 39/43] i965/fs: Enable 16-bit load_ubo with sampler

Jose Maria Casanova Crespo jmcasanova at igalia.com
Thu Oct 12 18:38:28 UTC 2017


load_ubo uses 32-bit loads because uniform surfaces are defined with
a 32-bit surface format. So when reading 16-bit components with the
sampler we need to unshuffle the two 16-bit components packed in each
32-bit component.
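
For reference, a minimal C sketch of the unshuffle this implies
(unpack_half is an illustrative helper, not a driver function; it
assumes the packing used here, with component 0 in the low 16 bits of
each dword):

   #include <stdint.h>

   /* Extract 16-bit component "comp" from the 32-bit dwords returned
    * by the sampler: each dword packs a pair of 16-bit components.
    */
   static uint16_t
   unpack_half(const uint32_t *dwords, unsigned comp)
   {
      uint32_t dword = dwords[comp / 2];             /* dword holding the pair */
      return (uint16_t)(dword >> (16 * (comp % 2))); /* low or high half */
   }

This mirrors the offset math in the brw_fs.cpp hunk below: for a
16-bit component at byte offset const_offset, (const_offset & 0x7) / 4
selects the dword and (const_offset & 0x7) / 2 % 2 * 2 selects byte 0
or 2 within it.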

Using the sampler avoids the byte_scattered_read message, which needs
one message per component (e.g. four messages for a 16-bit vec4 where
the sampler needs only one) and is expected to be slower.

On SKL+ we take advantage of a hardware feature that automatically
defines a channel mask based on the rlen value, so we only use half
of the registers and don't need a header in the payload.
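
A rough sketch of the response-length math this implies, mirroring the
generator change below (sampler_rlen is an illustrative helper, not a
driver function):

   /* A 32-bit sampler read returns exec_size/8 GRFs per component
    * times 4 components; on SKL+ a 16-bit read needs only half of
    * that, and the reduced rlen makes the hardware derive an xy
    * channel mask so the unused register halves are never written.
    */
   static unsigned
   sampler_rlen(unsigned exec_size, unsigned type_size, unsigned gen)
   {
      unsigned rlen = (exec_size / 8) * 4; /* 8 for SIMD16, 4 for SIMD8 */
      if (type_size == 2 && gen >= 9)
         rlen /= 2;                        /* 4 for SIMD16, 2 for SIMD8 */
      return rlen;
   }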
---
 src/intel/compiler/brw_fs.cpp           | 31 +++++++++++++++++++++++++++----
 src/intel/compiler/brw_fs_generator.cpp | 10 ++++++++--
 src/intel/compiler/brw_fs_nir.cpp       | 11 +++--------
 3 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 1d6fbdd06a..45608c1e47 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -185,9 +185,17 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
     * a double this means we are only loading 2 elements worth of data.
     * We also want to use a 32-bit data type for the dst of the load operation
     * so other parts of the driver don't get confused about the size of the
-    * result.
+    * result. In the case of 16-bit data we only need half of the 32-bit
+    * components on SKL+, as we take advantage of the message return size
+    * to define an xy channel mask.
     */
-   fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
+   fs_reg vec4_result;
+   if (type_sz(dst.type) == 2 && (devinfo->gen >= 9)) {
+      vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
+      vec4_result = retype(vec4_result, BRW_REGISTER_TYPE_HF);
+   } else {
+      vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
+   }
    fs_inst *inst = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
                             vec4_result, surf_index, vec4_offset);
    inst->size_written = 4 * vec4_result.component_size(inst->exec_size);
@@ -198,8 +206,23 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
    }
 
    vec4_result.type = dst.type;
-   bld.MOV(dst, offset(vec4_result, bld,
-                       (const_offset & 0xf) / type_sz(vec4_result.type)));
+
+   if (type_sz(dst.type) == 2) {
+      /* 16-bit types need to be unshuffled as each pair of 16-bit components
+       * is packed in a 32-bit component, because we use a 32-bit format
+       * in the uniform surface that is read by the sampler.
+       * TODO: On BDW+ mark when a uniform has a 16-bit type so we could set
+       * up a 16-bit surface format and use the 16-bit return format at the
+       * sampler.
+       */
+      vec4_result.stride = 2;
+      bld.MOV(dst, byte_offset(offset(vec4_result, bld,
+                                      (const_offset & 0x7) / 4),
+                               (const_offset & 0x7) / 2 % 2 * 2));
+   } else {
+      bld.MOV(dst, offset(vec4_result, bld,
+                          (const_offset & 0xf) / type_sz(vec4_result.type)));
+   }
 }
 
 /**
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index c766e72ecf..83852107ce 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -1366,12 +1366,18 @@ fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
    uint32_t simd_mode, rlen, mlen;
    if (inst->exec_size == 16) {
       mlen = 2;
-      rlen = 8;
+      if (type_sz(dst.type) == 2 && (devinfo->gen >= 9))
+         rlen = 4;
+      else
+         rlen = 8;
       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
    } else {
       assert(inst->exec_size == 8);
       mlen = 1;
-      rlen = 4;
+      if (type_sz(dst.type) == 2 && (devinfo->gen >= 9))
+         rlen = 2;
+      else
+         rlen = 4;
       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
    }
 
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index c07b3e4d8d..774bd97968 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4056,14 +4056,9 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       if (const_offset == NULL) {
          fs_reg base_offset = retype(get_nir_src(instr->src[1]),
                                      BRW_REGISTER_TYPE_UD);
-         if (type_sz(dest.type) == 2) {
-            do_untyped_vector_read(bld, dest, surf_index, base_offset,
-                                   instr->num_components);
-         } else {
-            for (int i = 0; i < instr->num_components; i++)
-               VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index,
-                                          base_offset, i * type_sz(dest.type));
-         }
+         for (int i = 0; i < instr->num_components; i++)
+            VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index,
+                                       base_offset, i * type_sz(dest.type));
       } else {
          /* Even if we are loading doubles, a pull constant load will load
           * a 32-bit vec4, so should only reserve vgrf space for that. If we
-- 
2.13.6


