Mesa (main): intel/fs: Make logical URB write instructions more like other logical instructions

Tue Jul 26 17:46:09 UTC 2022

Module: Mesa
Branch: main
Commit: 349a040f684cc5c6b80d40a4edbefa410e91034d
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=349a040f684cc5c6b80d40a4edbefa410e91034d

Author: Ian Romanick <ian.d.romanick at intel.com>
Date:   Tue Jul 12 15:32:01 2022 -0700

intel/fs: Make logical URB write instructions more like other logical instructions

The changes to fs_visitor::validate() helped track down a place where I
initially forgot to convert a message to the new sources layout.  This
had caused a different validation failure in
dEQP-GLES31.functional.tessellation.tesscoord.triangles_equal_spacing,
but this was not detected until after SENDs were lowered.

Tiger Lake, Ice Lake, and Skylake had similar results. (Ice Lake shown)
total instructions in shared programs: 19951145 -> 19951133 (<.01%)
instructions in affected programs: 2429 -> 2417 (-0.49%)
helped: 8 / HURT: 0

total cycles in shared programs: 858904152 -> 858862331 (<.01%)
cycles in affected programs: 5702652 -> 5660831 (-0.73%)
helped: 2138 / HURT: 1255

Broadwell
total cycles in shared programs: 904869459 -> 904835501 (<.01%)
cycles in affected programs: 7686744 -> 7652786 (-0.44%)
helped: 2861 / HURT: 2050

Tiger Lake, Ice Lake, and Skylake had similar results. (Ice Lake shown)
Instructions in all programs: 141442369 -> 141442032 (-0.0%)
Instructions helped: 337

Cycles in all programs: 9099270231 -> 9099036492 (-0.0%)
Cycles helped: 40661
Cycles hurt: 28606

Reviewed-by: Kenneth Graunke <kenneth at whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17605>

---

 src/intel/compiler/brw_eu_defines.h            | 11 ++++
 src/intel/compiler/brw_fs.cpp                  | 47 ++++++++--------
 src/intel/compiler/brw_fs_nir.cpp              | 77 +++++++++++++-------------
 src/intel/compiler/brw_fs_validate.cpp         | 14 +++++
 src/intel/compiler/brw_fs_visitor.cpp          | 53 ++++++++++--------
 src/intel/compiler/brw_lower_logical_sends.cpp | 25 +++++++--
 src/intel/compiler/brw_mesh.cpp                | 70 +++++++++++------------
 7 files changed, 176 insertions(+), 121 deletions(-)

diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h
index da7c09c96f2..fecb3273d86 100644
--- a/src/intel/compiler/brw_eu_defines.h
+++ b/src/intel/compiler/brw_eu_defines.h
@@ -950,6 +950,17 @@ enum rt_logical_srcs {
    RT_LOGICAL_NUM_SRCS
 };
 
+enum urb_logical_srcs {
+   URB_LOGICAL_SRC_HANDLE,
+   URB_LOGICAL_SRC_PER_SLOT_OFFSETS,
+   URB_LOGICAL_SRC_CHANNEL_MASK,
+   /** Data to be written.  BAD_FILE for reads. */
+   URB_LOGICAL_SRC_DATA,
+
+   URB_LOGICAL_NUM_SRCS
+};
+
+
 #ifdef __cplusplus
 /**
  * Allow brw_urb_write_flags enums to be ORed together.
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 9e5ede1dc48..34a88ac89e2 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -863,6 +863,17 @@ fs_inst::components_read(unsigned i) const
          return 1;
    }
 
+   case SHADER_OPCODE_URB_WRITE_LOGICAL:
+   case SHADER_OPCODE_URB_WRITE_PER_SLOT_LOGICAL:
+   case SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL:
+   case SHADER_OPCODE_URB_WRITE_MASKED_PER_SLOT_LOGICAL:
+      if (i == URB_LOGICAL_SRC_DATA)
+         return mlen - 1 -
+            unsigned(src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE) -
+            unsigned(src[URB_LOGICAL_SRC_CHANNEL_MASK].file != BAD_FILE);
+      else
+         return 1;
+
    default:
       return 1;
    }
@@ -891,10 +902,6 @@ fs_inst::size_read(int arg) const
       break;
 
    case FS_OPCODE_FB_READ:
-   case SHADER_OPCODE_URB_WRITE_LOGICAL:
-   case SHADER_OPCODE_URB_WRITE_PER_SLOT_LOGICAL:
-   case SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL:
-   case SHADER_OPCODE_URB_WRITE_MASKED_PER_SLOT_LOGICAL:
    case SHADER_OPCODE_URB_READ_LOGICAL:
    case SHADER_OPCODE_URB_READ_PER_SLOT_LOGICAL:
    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
@@ -1546,17 +1553,17 @@ fs_visitor::emit_gs_thread_end()
             break;
          }
       }
-      fs_reg hdr = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
-      abld.MOV(hdr, fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)));
-      inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef, hdr);
+      fs_reg srcs[URB_LOGICAL_NUM_SRCS];
+      srcs[URB_LOGICAL_SRC_HANDLE] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
+      inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
+                       srcs, ARRAY_SIZE(srcs));
       inst->mlen = 1;
    } else {
-      fs_reg payload = abld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-      fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
-      sources[0] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
-      sources[1] = this->final_gs_vertex_count;
-      abld.LOAD_PAYLOAD(payload, sources, 2, 2);
-      inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef, payload);
+      fs_reg srcs[URB_LOGICAL_NUM_SRCS];
+      srcs[URB_LOGICAL_SRC_HANDLE] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
+      srcs[URB_LOGICAL_SRC_DATA] = this->final_gs_vertex_count;
+      inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
+                       srcs, ARRAY_SIZE(srcs));
       inst->mlen = 2;
    }
    inst->eot = true;
@@ -6676,16 +6683,12 @@ fs_visitor::run_tcs()
    }
 
    /* Emit EOT write; set TR DS Cache bit */
-   fs_reg srcs[3] = {
-      fs_reg(get_tcs_output_urb_handle()),
-      fs_reg(brw_imm_ud(WRITEMASK_X << 16)),
-      fs_reg(brw_imm_ud(0)),
-   };
-   fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
-   bld.LOAD_PAYLOAD(payload, srcs, 3, 2);
-
+   fs_reg srcs[URB_LOGICAL_NUM_SRCS];
+   srcs[URB_LOGICAL_SRC_HANDLE] = get_tcs_output_urb_handle();
+   srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(WRITEMASK_X << 16);
+   srcs[URB_LOGICAL_SRC_DATA] = brw_imm_ud(0);
    fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL,
-                            bld.null_reg_ud(), payload);
+                            reg_undef, srcs, ARRAY_SIZE(srcs));
    inst->mlen = 3;
    inst->eot = true;
 
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 87aff871e78..35a50e838a8 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -2341,27 +2341,27 @@ fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count)
    }
 
    /* Store the control data bits in the message payload and send it. */
-   unsigned mlen = 2;
-   if (channel_mask.file != BAD_FILE)
-      mlen += 4; /* channel masks, plus 3 extra copies of the data */
-   if (per_slot_offset.file != BAD_FILE)
-      mlen++;
-
-   fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
-   fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen);
-   unsigned i = 0;
-   sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
-   if (per_slot_offset.file != BAD_FILE)
-      sources[i++] = per_slot_offset;
-   if (channel_mask.file != BAD_FILE)
-      sources[i++] = channel_mask;
-   while (i < mlen) {
-      sources[i++] = this->control_data_bits;
-   }
-
-   abld.LOAD_PAYLOAD(payload, sources, mlen, mlen);
-   fs_inst *inst = abld.emit(opcode, reg_undef, payload);
-   inst->mlen = mlen;
+   const unsigned header_size = 1 + unsigned(channel_mask.file != BAD_FILE) +
+      unsigned(per_slot_offset.file != BAD_FILE);
+
+   /* If there are channel masks, add 3 extra copies of the data. */
+   const unsigned length = 1 + 3 * unsigned(channel_mask.file != BAD_FILE);
+
+   fs_reg sources[4];
+
+   for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
+      sources[i] = this->control_data_bits;
+
+   fs_reg srcs[URB_LOGICAL_NUM_SRCS];
+   srcs[URB_LOGICAL_SRC_HANDLE] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
+   srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offset;
+   srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = channel_mask;
+   srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, alloc.allocate(length),
+                                       BRW_REGISTER_TYPE_F);
+   abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0);
+
+   fs_inst *inst = abld.emit(opcode, reg_undef, srcs, ARRAY_SIZE(srcs));
+   inst->mlen = header_size + length;
    /* We need to increment Global Offset by 256-bits to make room for
     * Broadwell's extra "Vertex Count" payload at the beginning of the
     * URB entry.  Since this is an OWord message, Global Offset is counted
@@ -3046,15 +3046,6 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
       fs_reg indirect_offset = get_indirect_offset(instr);
       unsigned imm_offset = instr->const_index[0];
       unsigned mask = instr->const_index[1];
-      unsigned header_regs = 0;
-      struct brw_reg output_handles = get_tcs_output_urb_handle();
-
-      fs_reg srcs[7];
-      srcs[header_regs++] = output_handles;
-
-      if (indirect_offset.file != BAD_FILE) {
-         srcs[header_regs++] = indirect_offset;
-      }
 
       if (mask == 0)
          break;
@@ -3068,8 +3059,9 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
       unsigned first_component = nir_intrinsic_component(instr);
       mask = mask << first_component;
 
+      fs_reg mask_reg;
       if (mask != WRITEMASK_XYZW) {
-         srcs[header_regs++] = brw_imm_ud(mask << 16);
+         mask_reg = brw_imm_ud(mask << 16);
          opcode = indirect_offset.file != BAD_FILE ?
             SHADER_OPCODE_URB_WRITE_MASKED_PER_SLOT_LOGICAL :
             SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL;
@@ -3079,21 +3071,30 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
             SHADER_OPCODE_URB_WRITE_LOGICAL;
       }
 
+      fs_reg sources[4];
+
       for (unsigned i = 0; i < num_components; i++) {
          if (!(mask & (1 << (i + first_component))))
             continue;
 
-         srcs[header_regs + i + first_component] = offset(value, bld, i);
+         sources[i + first_component] = offset(value, bld, i);
       }
 
-      unsigned mlen = header_regs + num_components + first_component;
-      fs_reg payload =
-         bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
-      bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs);
+      unsigned header_size = 1 + unsigned(indirect_offset.file != BAD_FILE) +
+         unsigned(mask != WRITEMASK_XYZW);
+      const unsigned length = num_components + first_component;
+
+      fs_reg srcs[URB_LOGICAL_NUM_SRCS];
+      srcs[URB_LOGICAL_SRC_HANDLE] = get_tcs_output_urb_handle();
+      srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
+      srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask_reg;
+      srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, alloc.allocate(length),
+                                          BRW_REGISTER_TYPE_F);
+      bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0);
 
-      fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload);
+      fs_inst *inst = bld.emit(opcode, reg_undef, srcs, ARRAY_SIZE(srcs));
       inst->offset = imm_offset;
-      inst->mlen = mlen;
+      inst->mlen = header_size + length;
       break;
    }
 
diff --git a/src/intel/compiler/brw_fs_validate.cpp b/src/intel/compiler/brw_fs_validate.cpp
index 75a794fd794..3fb071086f6 100644
--- a/src/intel/compiler/brw_fs_validate.cpp
+++ b/src/intel/compiler/brw_fs_validate.cpp
@@ -43,6 +43,20 @@ fs_visitor::validate()
 {
 #ifndef NDEBUG
    foreach_block_and_inst (block, fs_inst, inst, cfg) {
+      if (inst->opcode == SHADER_OPCODE_URB_WRITE_LOGICAL) {
+         const unsigned header_size = 1 +
+            unsigned(inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE) +
+            unsigned(inst->src[URB_LOGICAL_SRC_CHANNEL_MASK].file != BAD_FILE);
+
+         unsigned data_size = 0;
+         for (unsigned i = header_size, j = 0; i < inst->mlen; i++, j++) {
+            fsv_assert(type_sz(offset(inst->src[URB_LOGICAL_SRC_DATA], bld, j).type) == 4);
+            data_size++;
+         }
+
+         fsv_assert(header_size + data_size == inst->mlen);
+      }
+
       if (inst->dst.file == VGRF) {
          fsv_assert(inst->dst.offset / REG_SIZE + regs_written(inst) <=
                     alloc.sizes[inst->dst.nr]);
diff --git a/src/intel/compiler/brw_fs_visitor.cpp b/src/intel/compiler/brw_fs_visitor.cpp
index 1a6c42f2715..3ced049d101 100644
--- a/src/intel/compiler/brw_fs_visitor.cpp
+++ b/src/intel/compiler/brw_fs_visitor.cpp
@@ -935,22 +935,15 @@ fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count)
       if (length == 8 || (length > 0 && slot == last_slot))
          flush = true;
       if (flush) {
-         fs_reg *payload_sources =
-            ralloc_array(mem_ctx, fs_reg, length + header_size);
-         fs_reg payload = fs_reg(VGRF, alloc.allocate(length + header_size),
-                                 BRW_REGISTER_TYPE_F);
-         payload_sources[0] = urb_handle;
+         fs_reg srcs[URB_LOGICAL_NUM_SRCS];
 
-         if (opcode == SHADER_OPCODE_URB_WRITE_PER_SLOT_LOGICAL)
-            payload_sources[1] = per_slot_offsets;
+         srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
+         srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offsets;
+         srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, alloc.allocate(length),
+                                             BRW_REGISTER_TYPE_F);
+         abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0);
 
-         memcpy(&payload_sources[header_size], sources,
-                length * sizeof sources[0]);
-
-         abld.LOAD_PAYLOAD(payload, payload_sources, length + header_size,
-                           header_size);
-
-         fs_inst *inst = abld.emit(opcode, reg_undef, payload);
+         fs_inst *inst = abld.emit(opcode, reg_undef, srcs, ARRAY_SIZE(srcs));
 
          /* For ICL WA 1805992985 one needs additional write in the end. */
          if (devinfo->ver == 11 && stage == MESA_SHADER_TESS_EVAL)
@@ -985,10 +978,17 @@ fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count)
       if (stage == MESA_SHADER_GEOMETRY)
          return;
 
-      fs_reg payload = fs_reg(VGRF, alloc.allocate(2), BRW_REGISTER_TYPE_UD);
-      bld.exec_all().MOV(payload, urb_handle);
+      fs_reg uniform_urb_handle = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
+      fs_reg payload = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
+
+      bld.exec_all().MOV(uniform_urb_handle, urb_handle);
 
-      fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef, payload);
+      fs_reg srcs[URB_LOGICAL_NUM_SRCS];
+      srcs[URB_LOGICAL_SRC_HANDLE] = uniform_urb_handle;
+      srcs[URB_LOGICAL_SRC_DATA] = payload;
+
+      fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
+                               srcs, ARRAY_SIZE(srcs));
       inst->eot = true;
       inst->mlen = 2;
       inst->offset = 1;
@@ -1002,14 +1002,16 @@ fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count)
     * all 8 lanes must valid.
     */
    if (devinfo->ver == 11 && stage == MESA_SHADER_TESS_EVAL) {
-      fs_reg payload = fs_reg(VGRF, alloc.allocate(6), BRW_REGISTER_TYPE_UD);
+      fs_reg uniform_urb_handle = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
+      fs_reg uniform_mask = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
+      fs_reg payload = fs_reg(VGRF, alloc.allocate(4), BRW_REGISTER_TYPE_UD);
 
       /* Workaround requires all 8 channels (lanes) to be valid. This is
        * understood to mean they all need to be alive. First trick is to find
        * a live channel and copy its urb handle for all the other channels to
        * make sure all handles are valid.
        */
-      bld.exec_all().MOV(payload, bld.emit_uniformize(urb_handle));
+      bld.exec_all().MOV(uniform_urb_handle, bld.emit_uniformize(urb_handle));
 
       /* Second trick is to use masked URB write where one can tell the HW to
        * actually write data only for selected channels even though all are
@@ -1025,14 +1027,19 @@ fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count)
        * 4 slots data. All are explicitly zeros in order to to keep the MBZ
        * area written as zeros.
        */
-      bld.exec_all().MOV(offset(payload, bld, 1), brw_imm_ud(0x10000u));
+      bld.exec_all().MOV(uniform_mask, brw_imm_ud(0x10000u));
+      bld.exec_all().MOV(offset(payload, bld, 0), brw_imm_ud(0u));
+      bld.exec_all().MOV(offset(payload, bld, 1), brw_imm_ud(0u));
       bld.exec_all().MOV(offset(payload, bld, 2), brw_imm_ud(0u));
       bld.exec_all().MOV(offset(payload, bld, 3), brw_imm_ud(0u));
-      bld.exec_all().MOV(offset(payload, bld, 4), brw_imm_ud(0u));
-      bld.exec_all().MOV(offset(payload, bld, 5), brw_imm_ud(0u));
+
+      fs_reg srcs[URB_LOGICAL_NUM_SRCS];
+      srcs[URB_LOGICAL_SRC_HANDLE] = uniform_urb_handle;
+      srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = uniform_mask;
+      srcs[URB_LOGICAL_SRC_DATA] = payload;
 
       fs_inst *inst = bld.exec_all().emit(SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL,
-                                          reg_undef, payload);
+                                          reg_undef, srcs, ARRAY_SIZE(srcs));
       inst->eot = true;
       inst->mlen = 6;
       inst->offset = 0;
diff --git a/src/intel/compiler/brw_lower_logical_sends.cpp b/src/intel/compiler/brw_lower_logical_sends.cpp
index e1845a4fc34..0ebc9984b1e 100644
--- a/src/intel/compiler/brw_lower_logical_sends.cpp
+++ b/src/intel/compiler/brw_lower_logical_sends.cpp
@@ -73,8 +73,27 @@ lower_urb_write_logical_send(const fs_builder &bld, fs_inst *inst,
 
    assert(inst->header_size == 0);
 
+   fs_reg *payload_sources = new fs_reg[inst->mlen];
+   fs_reg payload = fs_reg(VGRF, bld.shader->alloc.allocate(inst->mlen),
+                           BRW_REGISTER_TYPE_F);
+
+   unsigned header_size = 0;
+   payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
+   if (per_slot_present)
+      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];
+
+   if (channel_mask_present)
+      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_CHANNEL_MASK];
+
+   for (unsigned i = header_size, j = 0; i < inst->mlen; i++, j++)
+      payload_sources[i] = offset(inst->src[URB_LOGICAL_SRC_DATA], bld, j);
+
+   bld.LOAD_PAYLOAD(payload, payload_sources, inst->mlen, header_size);
+
+   delete [] payload_sources;
+
    inst->opcode = SHADER_OPCODE_SEND;
-   inst->header_size = 1;
+   inst->header_size = header_size;
    inst->dst = brw_null_reg();
 
    inst->sfid = BRW_SFID_URB;
@@ -88,13 +107,11 @@ lower_urb_write_logical_send(const fs_builder &bld, fs_inst *inst,
    inst->ex_mlen = 0;
    inst->send_has_side_effects = true;
 
-   fs_reg tmp = inst->src[0];
-
    inst->resize_sources(4);
 
    inst->src[0] = brw_imm_ud(0); /* desc */
    inst->src[1] = brw_imm_ud(0); /* ex_desc */
-   inst->src[2] = tmp;
+   inst->src[2] = payload;
    inst->src[3] = brw_null_reg();
 }
 
diff --git a/src/intel/compiler/brw_mesh.cpp b/src/intel/compiler/brw_mesh.cpp
index d9828923c9e..6a8872cebe9 100644
--- a/src/intel/compiler/brw_mesh.cpp
+++ b/src/intel/compiler/brw_mesh.cpp
@@ -892,25 +892,25 @@ emit_urb_direct_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
       for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
          fs_builder bld8 = bld.group(8, q);
 
-         fs_reg payload_srcs[6];
-         unsigned p = 0;
-
-         payload_srcs[p++] = urb_handle;
-         payload_srcs[p++] = brw_imm_ud(first_mask << 16);
-         const unsigned header_size = p;
+         fs_reg payload_srcs[4];
+         unsigned length = 0;
 
          for (unsigned i = 0; i < comp_shift; i++)
-            payload_srcs[p++] = reg_undef;
+            payload_srcs[length++] = reg_undef;
 
          for (unsigned c = 0; c < first_comps; c++)
-            payload_srcs[p++] = quarter(offset(src, bld, c), q);
+            payload_srcs[length++] = quarter(offset(src, bld, c), q);
 
-         fs_reg payload = bld8.vgrf(BRW_REGISTER_TYPE_UD, p);
-         bld8.LOAD_PAYLOAD(payload, payload_srcs, p, header_size);
+         fs_reg srcs[URB_LOGICAL_NUM_SRCS];
+         srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
+         srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(first_mask << 16);
+         srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length),
+                                             BRW_REGISTER_TYPE_F);
+         bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
 
          fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL,
-                                   reg_undef, payload);
-         inst->mlen = p;
+                                   reg_undef, srcs, ARRAY_SIZE(srcs));
+         inst->mlen = 2 + length;
          inst->offset = urb_global_offset;
          assert(inst->offset < 2048);
       }
@@ -923,22 +923,22 @@ emit_urb_direct_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
       for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
          fs_builder bld8 = bld.group(8, q);
 
-         fs_reg payload_srcs[6];
-         unsigned p = 0;
-
-         payload_srcs[p++] = urb_handle;
-         payload_srcs[p++] = brw_imm_ud(second_mask << 16);
-         const unsigned header_size = p;
+         fs_reg payload_srcs[4];
+         unsigned length = 0;
 
          for (unsigned c = 0; c < second_comps; c++)
-            payload_srcs[p++] = quarter(offset(src, bld, c + first_comps), q);
+            payload_srcs[length++] = quarter(offset(src, bld, c + first_comps), q);
 
-         fs_reg payload = bld8.vgrf(BRW_REGISTER_TYPE_UD, p);
-         bld8.LOAD_PAYLOAD(payload, payload_srcs, p, header_size);
+         fs_reg srcs[URB_LOGICAL_NUM_SRCS];
+         srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
+         srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(second_mask << 16);
+         srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length),
+                                             BRW_REGISTER_TYPE_F);
+         bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
 
          fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_MASKED_LOGICAL,
-                                   reg_undef, payload);
-         inst->mlen = p;
+                                   reg_undef, srcs, ARRAY_SIZE(srcs));
+         inst->mlen = 2 + length;
          inst->offset = urb_global_offset;
          assert(inst->offset < 2048);
       }
@@ -988,21 +988,23 @@ emit_urb_indirect_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
 
          bld8.SHR(off, off, brw_imm_ud(2));
 
-         fs_reg payload_srcs[7];
-         int x = 0;
-         payload_srcs[x++] = urb_handle;
-         payload_srcs[x++] = off;
-         payload_srcs[x++] = mask;
+         fs_reg payload_srcs[4];
+         unsigned length = 0;
 
          for (unsigned j = 0; j < 4; j++)
-            payload_srcs[x++] = quarter(src_comp, q);
+            payload_srcs[length++] = quarter(src_comp, q);
 
-         fs_reg payload = bld8.vgrf(BRW_REGISTER_TYPE_UD, x);
-         bld8.LOAD_PAYLOAD(payload, payload_srcs, x, 3);
+         fs_reg srcs[URB_LOGICAL_NUM_SRCS];
+         srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
+         srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off;
+         srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask;
+         srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length),
+                                             BRW_REGISTER_TYPE_F);
+         bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
 
-         fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_MASKED_PER_SLOT_LOGICAL,
-                                   reg_undef, payload);
-         inst->mlen = x;
+         fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_MASKED_PER_SLOT_LOGICAL,
+                                   reg_undef, srcs, ARRAY_SIZE(srcs));
+         inst->mlen = 3 + length;
          inst->offset = 0;
       }
    }