Mesa (main): intel/fs: Make logical URB read instructions more like other logical instructions

Tue Jul 26 17:46:09 UTC 2022

Module: Mesa
Branch: main
Commit: 1b17f8fc5a3fdb6e6b2a7b54e3cc319ea1fe9a5f
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=1b17f8fc5a3fdb6e6b2a7b54e3cc319ea1fe9a5f

Author: Ian Romanick <ian.d.romanick at intel.com>
Date:   Thu Jul 14 11:57:03 2022 -0700

intel/fs: Make logical URB read instructions more like other logical instructions

No shader-db changes on any Intel platform

Fossil-db results:

Tiger Lake
Instructions in all programs: 156926440 -> 156926470 (+0.0%)
Instructions hurt: 15

Cycles in all programs: 7513099349 -> 7513099402 (+0.0%)
Cycles hurt: 15

Ice Lake and Skylake had similar results. (Ice Lake shown)
Cycles in all programs: 9099036492 -> 9099036489 (-0.0%)
Cycles helped: 1

Reviewed-by: Kenneth Graunke <kenneth at whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17605>

---

 src/intel/compiler/brw_fs.cpp                  |  2 -
 src/intel/compiler/brw_fs_nir.cpp              | 85 +++++++++++++++-----------
 src/intel/compiler/brw_lower_logical_sends.cpp | 23 ++++---
 src/intel/compiler/brw_mesh.cpp                | 16 ++---
 4 files changed, 73 insertions(+), 53 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 34a88ac89e2..efaa7e461c7 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -902,8 +902,6 @@ fs_inst::size_read(int arg) const
       break;
 
    case FS_OPCODE_FB_READ:
-   case SHADER_OPCODE_URB_READ_LOGICAL:
-   case SHADER_OPCODE_URB_READ_PER_SLOT_LOGICAL:
    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
       if (arg == 0)
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 35a50e838a8..319567c9bdb 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -2619,11 +2619,15 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
    fs_reg indirect_offset = get_nir_src(offset_src);
 
    if (nir_src_is_const(offset_src)) {
+      fs_reg srcs[URB_LOGICAL_NUM_SRCS];
+      srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
+
       /* Constant indexing - use global offset. */
       if (first_component != 0) {
          unsigned read_components = num_components + first_component;
          fs_reg tmp = bld.vgrf(dst.type, read_components);
-         inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, icp_handle);
+         inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, srcs,
+                         ARRAY_SIZE(srcs));
          inst->size_written = read_components *
                               tmp.component_size(inst->exec_size);
          for (unsigned i = 0; i < num_components; i++) {
@@ -2631,7 +2635,8 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
                     offset(tmp, bld, i + first_component));
          }
       } else {
-         inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, icp_handle);
+         inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, srcs,
+                         ARRAY_SIZE(srcs));
          inst->size_written = num_components *
                               dst.component_size(inst->exec_size);
       }
@@ -2639,14 +2644,16 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
       inst->mlen = 1;
    } else {
       /* Indirect indexing - use per-slot offsets as well. */
-      const fs_reg srcs[] = { icp_handle, indirect_offset };
       unsigned read_components = num_components + first_component;
       fs_reg tmp = bld.vgrf(dst.type, read_components);
-      fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-      bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
+
+      fs_reg srcs[URB_LOGICAL_NUM_SRCS];
+      srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
+      srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
+
       if (first_component != 0) {
          inst = bld.emit(SHADER_OPCODE_URB_READ_PER_SLOT_LOGICAL, tmp,
-                         payload);
+                         srcs, ARRAY_SIZE(srcs));
          inst->size_written = read_components *
                               tmp.component_size(inst->exec_size);
          for (unsigned i = 0; i < num_components; i++) {
@@ -2654,7 +2661,8 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
                     offset(tmp, bld, i + first_component));
          }
       } else {
-         inst = bld.emit(SHADER_OPCODE_URB_READ_PER_SLOT_LOGICAL, dst, payload);
+         inst = bld.emit(SHADER_OPCODE_URB_READ_PER_SLOT_LOGICAL, dst,
+                         srcs, ARRAY_SIZE(srcs));
          inst->size_written = num_components *
                               dst.component_size(inst->exec_size);
       }
@@ -2923,38 +2931,42 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
       unsigned num_components = instr->num_components;
       unsigned first_component = nir_intrinsic_component(instr);
 
+      fs_reg srcs[URB_LOGICAL_NUM_SRCS];
+      srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
+
       if (indirect_offset.file == BAD_FILE) {
          /* Constant indexing - use global offset. */
          if (first_component != 0) {
             unsigned read_components = num_components + first_component;
             fs_reg tmp = bld.vgrf(dst.type, read_components);
-            inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, icp_handle);
+            inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, srcs,
+                            ARRAY_SIZE(srcs));
             for (unsigned i = 0; i < num_components; i++) {
                bld.MOV(offset(dst, bld, i),
                        offset(tmp, bld, i + first_component));
             }
          } else {
-            inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, icp_handle);
+            inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, srcs,
+                            ARRAY_SIZE(srcs));
          }
          inst->offset = imm_offset;
          inst->mlen = 1;
       } else {
          /* Indirect indexing - use per-slot offsets as well. */
-         const fs_reg srcs[] = { icp_handle, indirect_offset };
-         fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-         bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
+         srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
+
          if (first_component != 0) {
             unsigned read_components = num_components + first_component;
             fs_reg tmp = bld.vgrf(dst.type, read_components);
             inst = bld.emit(SHADER_OPCODE_URB_READ_PER_SLOT_LOGICAL, tmp,
-                            payload);
+                            srcs, ARRAY_SIZE(srcs));
             for (unsigned i = 0; i < num_components; i++) {
                bld.MOV(offset(dst, bld, i),
                        offset(tmp, bld, i + first_component));
             }
          } else {
             inst = bld.emit(SHADER_OPCODE_URB_READ_PER_SLOT_LOGICAL, dst,
-                            payload);
+                            srcs, ARRAY_SIZE(srcs));
          }
          inst->offset = imm_offset;
          inst->mlen = 2;
@@ -2993,12 +3005,15 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
          bld.MOV(patch_handle, output_handles);
 
          {
+            fs_reg srcs[URB_LOGICAL_NUM_SRCS];
+            srcs[URB_LOGICAL_SRC_HANDLE] = patch_handle;
+
             if (first_component != 0) {
                unsigned read_components =
                   instr->num_components + first_component;
                fs_reg tmp = bld.vgrf(dst.type, read_components);
                inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
-                               patch_handle);
+                               srcs, ARRAY_SIZE(srcs));
                inst->size_written = read_components * REG_SIZE;
                for (unsigned i = 0; i < instr->num_components; i++) {
                   bld.MOV(offset(dst, bld, i),
@@ -3006,7 +3021,7 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
                }
             } else {
                inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst,
-                               patch_handle);
+                               srcs, ARRAY_SIZE(srcs));
                inst->size_written = instr->num_components * REG_SIZE;
             }
             inst->offset = imm_offset;
@@ -3014,15 +3029,16 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
          }
       } else {
          /* Indirect indexing - use per-slot offsets as well. */
-         const fs_reg srcs[] = { output_handles, indirect_offset };
-         fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-         bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
+         fs_reg srcs[URB_LOGICAL_NUM_SRCS];
+         srcs[URB_LOGICAL_SRC_HANDLE] = output_handles;
+         srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
+
          if (first_component != 0) {
             unsigned read_components =
                instr->num_components + first_component;
             fs_reg tmp = bld.vgrf(dst.type, read_components);
             inst = bld.emit(SHADER_OPCODE_URB_READ_PER_SLOT_LOGICAL, tmp,
-                            payload);
+                            srcs, ARRAY_SIZE(srcs));
             inst->size_written = read_components * REG_SIZE;
             for (unsigned i = 0; i < instr->num_components; i++) {
                bld.MOV(offset(dst, bld, i),
@@ -3030,7 +3046,7 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
             }
          } else {
             inst = bld.emit(SHADER_OPCODE_URB_READ_PER_SLOT_LOGICAL, dst,
-                            payload);
+                            srcs, ARRAY_SIZE(srcs));
             inst->size_written = instr->num_components * REG_SIZE;
          }
          inst->offset = imm_offset;
@@ -3151,18 +3167,16 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
                     (imm_offset / 2) + 1);
          } else {
             /* Replicate the patch handle to all enabled channels */
-            const fs_reg srcs[] = {
-               retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)
-            };
-            fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
-            bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0);
+            fs_reg srcs[URB_LOGICAL_NUM_SRCS];
+            srcs[URB_LOGICAL_SRC_HANDLE] =
+               retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
 
             if (first_component != 0) {
                unsigned read_components =
                   instr->num_components + first_component;
                fs_reg tmp = bld.vgrf(dest.type, read_components);
                inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
-                               patch_handle);
+                               srcs, ARRAY_SIZE(srcs));
                inst->size_written = read_components * REG_SIZE;
                for (unsigned i = 0; i < instr->num_components; i++) {
                   bld.MOV(offset(dest, bld, i),
@@ -3170,7 +3184,7 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
                }
             } else {
                inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dest,
-                               patch_handle);
+                               srcs, ARRAY_SIZE(srcs));
                inst->size_written = instr->num_components * REG_SIZE;
             }
             inst->mlen = 1;
@@ -3184,26 +3198,25 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
           * two double components.
           */
          unsigned num_components = instr->num_components;
-         const fs_reg srcs[] = {
-            retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
-            indirect_offset
-         };
-         fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-         bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
+
+         fs_reg srcs[URB_LOGICAL_NUM_SRCS];
+         srcs[URB_LOGICAL_SRC_HANDLE] =
+            retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
+         srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
 
          if (first_component != 0) {
             unsigned read_components =
                 num_components + first_component;
             fs_reg tmp = bld.vgrf(dest.type, read_components);
             inst = bld.emit(SHADER_OPCODE_URB_READ_PER_SLOT_LOGICAL, tmp,
-                            payload);
+                            srcs, ARRAY_SIZE(srcs));
             for (unsigned i = 0; i < num_components; i++) {
                bld.MOV(offset(dest, bld, i),
                        offset(tmp, bld, i + first_component));
             }
          } else {
             inst = bld.emit(SHADER_OPCODE_URB_READ_PER_SLOT_LOGICAL, dest,
-                            payload);
+                            srcs, ARRAY_SIZE(srcs));
          }
          inst->mlen = 2;
          inst->offset = imm_offset;
diff --git a/src/intel/compiler/brw_lower_logical_sends.cpp b/src/intel/compiler/brw_lower_logical_sends.cpp
index 0ebc9984b1e..2ccd083f576 100644
--- a/src/intel/compiler/brw_lower_logical_sends.cpp
+++ b/src/intel/compiler/brw_lower_logical_sends.cpp
@@ -36,13 +36,24 @@ lower_urb_read_logical_send(const fs_builder &bld, fs_inst *inst,
 {
    const intel_device_info *devinfo = bld.shader->devinfo;
 
-
    assert(inst->size_written % REG_SIZE == 0);
-   assert(inst->src[0].type == BRW_REGISTER_TYPE_UD);
-   assert(inst->src[0].file == FIXED_GRF || inst->src[0].file == VGRF);
+   assert(inst->header_size == 0);
+
+   fs_reg *payload_sources = new fs_reg[inst->mlen];
+   fs_reg payload = fs_reg(VGRF, bld.shader->alloc.allocate(inst->mlen),
+                           BRW_REGISTER_TYPE_F);
+
+   unsigned header_size = 0;
+   payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
+   if (per_slot_present)
+      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];
+
+   bld.LOAD_PAYLOAD(payload, payload_sources, inst->mlen, header_size);
+
+   delete [] payload_sources;
 
    inst->opcode = SHADER_OPCODE_SEND;
-   inst->header_size = 1;
+   inst->header_size = header_size;
 
    inst->sfid = BRW_SFID_URB;
    inst->desc = brw_urb_desc(devinfo,
@@ -55,13 +66,11 @@ lower_urb_read_logical_send(const fs_builder &bld, fs_inst *inst,
    inst->ex_mlen = 0;
    inst->send_is_volatile = true;
 
-   fs_reg tmp = inst->src[0];
-
    inst->resize_sources(4);
 
    inst->src[0] = brw_imm_ud(0); /* desc */
    inst->src[1] = brw_imm_ud(0); /* ex_desc */
-   inst->src[2] = tmp;
+   inst->src[2] = payload;
    inst->src[3] = brw_null_reg();
 }
 
diff --git a/src/intel/compiler/brw_mesh.cpp b/src/intel/compiler/brw_mesh.cpp
index 6a8872cebe9..65f6164de00 100644
--- a/src/intel/compiler/brw_mesh.cpp
+++ b/src/intel/compiler/brw_mesh.cpp
@@ -1037,8 +1037,11 @@ emit_urb_direct_reads(const fs_builder &bld, nir_intrinsic_instr *instr,
 
    fs_builder ubld8 = bld.group(8, 0).exec_all();
    fs_reg data = ubld8.vgrf(BRW_REGISTER_TYPE_UD, num_regs);
+   fs_reg srcs[URB_LOGICAL_NUM_SRCS];
+   srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
 
-   fs_inst *inst = ubld8.emit(SHADER_OPCODE_URB_READ_LOGICAL, data, urb_handle);
+   fs_inst *inst = ubld8.emit(SHADER_OPCODE_URB_READ_LOGICAL, data,
+                              srcs, ARRAY_SIZE(srcs));
    inst->mlen = 1;
    inst->offset = urb_global_offset;
    assert(inst->offset < 2048);
@@ -1093,17 +1096,14 @@ emit_urb_indirect_reads(const fs_builder &bld, nir_intrinsic_instr *instr,
 
          bld8.SHR(off, off, brw_imm_ud(2));
 
-         fs_reg payload_srcs[2];
-         payload_srcs[0] = urb_handle;
-         payload_srcs[1] = off;
-
-         fs_reg payload = bld8.vgrf(BRW_REGISTER_TYPE_UD, 2);
-         bld8.LOAD_PAYLOAD(payload, payload_srcs, 2, 2);
+         fs_reg srcs[URB_LOGICAL_NUM_SRCS];
+         srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
+         srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off;
 
          fs_reg data = bld8.vgrf(BRW_REGISTER_TYPE_UD, 4);
 
          fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_READ_PER_SLOT_LOGICAL,
-                                   data, payload);
+                                   data, srcs, ARRAY_SIZE(srcs));
          inst->mlen = 2;
          inst->offset = 0;
          inst->size_written = 4 * REG_SIZE;