Mesa (main): intel/compiler/fs: Add support for 16-bit sampler msg payload

Tue Nov 23 07:08:04 UTC 2021

Module: Mesa
Branch: main
Commit: 0374b56faa175bbd4bbf2617767773b680f4e25f
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=0374b56faa175bbd4bbf2617767773b680f4e25f

Author: Topi Pohjolainen <topi.pohjolainen at intel.com>
Date:   Tue Jul  7 22:26:08 2020 -0700

intel/compiler/fs: Add support for 16-bit sampler msg payload

For SIMD8 half float payload, each component takes a full register, so
we can use existing LOAD_PAYLOAD infrastruture for required padding by
alternating plain 8-wide half float vector and null vector.

Also this patch removes an unwanted assertion from
opt_copy_propagation_local for LOAD_PAYLOAD.

Reviewed-by: Sagar Ghuge <sagar.ghuge at intel.com>
Reviewed-by: Francisco Jerez <currojerez at riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11766>

---

 src/intel/compiler/brw_fs.cpp                  | 103 ++++++++++++++++++-------
 src/intel/compiler/brw_fs_copy_propagation.cpp |   1 -
 2 files changed, 77 insertions(+), 27 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 3629bde7a9f..4101b7830e4 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -5221,16 +5221,23 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
                                 const fs_reg &surface_handle,
                                 const fs_reg &sampler_handle,
                                 const fs_reg &tg4_offset,
+                                unsigned payload_type_bit_size,
                                 unsigned coord_components,
                                 unsigned grad_components)
 {
    const intel_device_info *devinfo = bld.shader->devinfo;
+   const enum brw_reg_type payload_type =
+      brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_F);
+   const enum brw_reg_type payload_unsigned_type =
+      brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_UD);
+   const enum brw_reg_type payload_signed_type =
+      brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_D);
    const brw_stage_prog_data *prog_data = bld.shader->stage_prog_data;
    unsigned reg_width = bld.dispatch_width() / 8;
    unsigned header_size = 0, length = 0;
    fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE];
    for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
-      sources[i] = bld.vgrf(BRW_REGISTER_TYPE_F);
+      sources[i] = bld.vgrf(payload_type);
 
    /* We must have exactly one of surface/sampler and surface/sampler_handle */
    assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
@@ -5369,23 +5376,23 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
       coordinate_done = true;
       break;
    case SHADER_OPCODE_TXS:
-      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod);
+      bld.MOV(retype(sources[length], payload_unsigned_type), lod);
       length++;
       break;
    case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
       /* We need an LOD; just use 0 */
-      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
+      bld.MOV(retype(sources[length], payload_unsigned_type), brw_imm_ud(0));
       length++;
       break;
    case SHADER_OPCODE_TXF:
       /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
        * On Gfx9 they are u, v, lod, r
        */
-      bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D), coordinate);
+      bld.MOV(retype(sources[length++], payload_signed_type), coordinate);
 
       if (devinfo->ver >= 9) {
          if (coord_components >= 2) {
-            bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D),
+            bld.MOV(retype(sources[length], payload_signed_type),
                     offset(coordinate, bld, 1));
          } else {
             sources[length] = brw_imm_d(0);
@@ -5396,12 +5403,12 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
       if (devinfo->ver >= 9 && lod.is_zero()) {
          op = SHADER_OPCODE_TXF_LZ;
       } else {
-         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
+         bld.MOV(retype(sources[length], payload_signed_type), lod);
          length++;
       }
 
       for (unsigned i = devinfo->ver >= 9 ? 2 : 1; i < coord_components; i++)
-         bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
+         bld.MOV(retype(sources[length++], payload_signed_type),
                  offset(coordinate, bld, i));
 
       coordinate_done = true;
@@ -5414,20 +5421,19 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
       if (op == SHADER_OPCODE_TXF_UMS ||
           op == SHADER_OPCODE_TXF_CMS ||
           op == SHADER_OPCODE_TXF_CMS_W) {
-         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
-         length++;
+         bld.MOV(retype(sources[length++], payload_unsigned_type), sample_index);
       }
 
       if (op == SHADER_OPCODE_TXF_CMS || op == SHADER_OPCODE_TXF_CMS_W) {
          /* Data from the multisample control surface. */
-         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
+         bld.MOV(retype(sources[length], payload_unsigned_type), mcs);
          length++;
 
          /* On Gfx9+ we'll use ld2dms_w instead which has two registers for
           * the MCS data.
           */
          if (op == SHADER_OPCODE_TXF_CMS_W) {
-            bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD),
+            bld.MOV(retype(sources[length], payload_unsigned_type),
                     mcs.file == IMM ?
                     mcs :
                     offset(mcs, bld, 1));
@@ -5439,7 +5445,7 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
        * texture coordinates.
        */
       for (unsigned i = 0; i < coord_components; i++)
-         bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
+         bld.MOV(retype(sources[length++], payload_signed_type),
                  offset(coordinate, bld, i));
 
       coordinate_done = true;
@@ -5450,7 +5456,7 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
          bld.MOV(sources[length++], offset(coordinate, bld, i));
 
       for (unsigned i = 0; i < 2; i++) /* offu, offv */
-         bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
+         bld.MOV(retype(sources[length++], payload_signed_type),
                  offset(tg4_offset, bld, i));
 
       if (coord_components == 3) /* r if present */
@@ -5465,7 +5471,8 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
    /* Set up the coordinate (except for cases where it was done above) */
    if (!coordinate_done) {
       for (unsigned i = 0; i < coord_components; i++)
-         bld.MOV(sources[length++], offset(coordinate, bld, i));
+         bld.MOV(retype(sources[length++], payload_type),
+                 offset(coordinate, bld, i));
    }
 
    if (min_lod.file != BAD_FILE) {
@@ -5477,15 +5484,27 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
       bld.MOV(sources[length++], min_lod);
    }
 
-   unsigned mlen;
-   if (reg_width == 2)
-      mlen = length * reg_width - header_size;
-   else
-      mlen = length * reg_width;
-
-   const fs_reg src_payload = fs_reg(VGRF, bld.shader->alloc.allocate(mlen),
-                                     BRW_REGISTER_TYPE_F);
-   bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);
+   const fs_reg src_payload =
+      fs_reg(VGRF, bld.shader->alloc.allocate(length * reg_width),
+                                              BRW_REGISTER_TYPE_F);
+   /* In case of 16-bit payload each component takes one full register in
+    * both SIMD8H and SIMD16H modes. In both cases one reg can hold 16
+    * elements. In SIMD8H case hardware simply expects the components to be
+    * padded (i.e., aligned on reg boundary).
+    */
+   fs_inst *load_payload_inst =
+      emit_load_payload_with_padding(bld, src_payload, sources, length,
+                                     header_size, REG_SIZE);
+   unsigned mlen = load_payload_inst->size_written / REG_SIZE;
+   unsigned simd_mode = 0;
+   if (payload_type_bit_size == 16) {
+      assert(devinfo->ver >= 11);
+      simd_mode = inst->exec_size <= 8 ? GFX10_SAMPLER_SIMD_MODE_SIMD8H :
+                                         GFX10_SAMPLER_SIMD_MODE_SIMD16H;
+   } else {
+      simd_mode = inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
+                                         BRW_SAMPLER_SIMD_MODE_SIMD16;
+   }
 
    /* Generate the SEND. */
    inst->opcode = SHADER_OPCODE_SEND;
@@ -5494,9 +5513,6 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
 
    const unsigned msg_type =
       sampler_msg_type(devinfo, op, inst->shadow_compare);
-   const unsigned simd_mode =
-      inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
-                             BRW_SAMPLER_SIMD_MODE_SIMD16;
 
    uint32_t base_binding_table_index;
    switch (op) {
@@ -5599,6 +5615,34 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
    assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
 }
 
+static unsigned
+get_sampler_msg_payload_type_bit_size(const fs_reg *src)
+{
+   unsigned src_type_size = 0;
+
+   /* All sources need to have the same size, therefore seek the first valid
+    * and take the size from there.
+    */
+   for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
+      if (src[i].file != BAD_FILE) {
+         src_type_size = brw_reg_type_to_size(src[i].type);
+         break;
+      }
+   }
+
+   assert(src_type_size == 2 || src_type_size == 4);
+
+#ifndef NDEBUG
+   /* Make sure all sources agree. */
+   for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
+      assert(src[i].file == BAD_FILE ||
+             brw_reg_type_to_size(src[i].type) == src_type_size);
+   }
+#endif
+
+   return src_type_size * 8;
+}
+
 static void
 lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
 {
@@ -5621,12 +5665,19 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
    const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
 
    if (devinfo->ver >= 7) {
+      const unsigned msg_payload_type_bit_size =
+         get_sampler_msg_payload_type_bit_size(inst->src);
+
+      /* 16-bit payloads are available only on gfx11+ */
+      assert(msg_payload_type_bit_size != 16 || devinfo->ver >= 11);
+
       lower_sampler_logical_send_gfx7(bld, inst, op, coordinate,
                                       shadow_c, lod, lod2, min_lod,
                                       sample_index,
                                       mcs, surface, sampler,
                                       surface_handle, sampler_handle,
                                       tg4_offset,
+                                      msg_payload_type_bit_size,
                                       coord_components, grad_components);
    } else if (devinfo->ver >= 5) {
       lower_sampler_logical_send_gfx5(bld, inst, op, coordinate,
diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp
index 167b7e6a69f..c68bf7ca957 100644
--- a/src/intel/compiler/brw_fs_copy_propagation.cpp
+++ b/src/intel/compiler/brw_fs_copy_propagation.cpp
@@ -1026,7 +1026,6 @@ fs_visitor::opt_copy_propagation_local(void *copy_prop_ctx, bblock_t *block,
          int offset = 0;
          for (int i = 0; i < inst->sources; i++) {
             int effective_width = i < inst->header_size ? 8 : inst->exec_size;
-            assert(effective_width * type_sz(inst->src[i].type) % REG_SIZE == 0);
             const unsigned size_written = effective_width *
                                           type_sz(inst->src[i].type);
             if (inst->src[i].file == VGRF ||