[Mesa-stable] [PATCH] intel/fs: Set up sampler message headers in the visitor on gen7+

Thu Mar 1 05:49:19 UTC 2018

This gives the scheduler visibility into the headers which should
improve scheduling.  More importantly, however, it lets the scheduler
know that the header gets written.  As-is, the scheduler thinks that a
texture instruction only reads it's payload and is unaware that it may
write to the first register so it may reorder it with respect to a read
from that register.  This is causing issues in a couple of Dota 2 vertex
shaders.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104923
Cc: mesa-stable at lists.freedesktop.org
---
 src/intel/compiler/brw_fs.cpp           | 40 +++++++++++++++++++++++++++++----
 src/intel/compiler/brw_fs_generator.cpp | 21 +++--------------
 2 files changed, 39 insertions(+), 22 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 113f62c..ab8cc89 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -4192,17 +4192,15 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
        op == SHADER_OPCODE_SAMPLEINFO ||
        is_high_sampler(devinfo, sampler)) {
       /* For general texture offsets (no txf workaround), we need a header to
-       * put them in.  Note that we're only reserving space for it in the
-       * message payload as it will be initialized implicitly by the
-       * generator.
+       * put them in.
        *
        * TG4 needs to place its channel select in the header, for interaction
        * with ARB_texture_swizzle.  The sampler index is only 4-bits, so for
        * larger sampler numbers we need to offset the Sampler State Pointer in
        * the header.
        */
+      fs_reg header = retype(sources[0], BRW_REGISTER_TYPE_UD);
       header_size = 1;
-      sources[0] = fs_reg();
       length++;
 
       /* If we're requesting fewer than four channels worth of response,
@@ -4214,6 +4212,40 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
          unsigned mask = ~((1 << (regs_written(inst) / reg_width)) - 1) & 0xf;
          inst->offset |= mask << 12;
       }
+
+      /* Build the actual header */
+      const fs_builder ubld = bld.exec_all().group(8, 0);
+      const fs_builder ubld1 = ubld.group(1, 0);
+      ubld.MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+      if (inst->offset) {
+         ubld1.MOV(component(header, 2), brw_imm_ud(inst->offset));
+      } else if (bld.shader->stage != MESA_SHADER_VERTEX &&
+                 bld.shader->stage != MESA_SHADER_FRAGMENT) {
+         /* The vertex and fragment stages have g0.2 set to 0, so
+          * header0.2 is 0 when g0 is copied. Other stages may not, so we
+          * must set it to 0 to avoid setting undesirable bits in the
+          * message.
+          */
+         ubld1.MOV(component(header, 2), brw_imm_ud(0));
+      }
+
+      if (is_high_sampler(devinfo, sampler)) {
+         if (sampler.file == BRW_IMMEDIATE_VALUE) {
+            assert(sampler.ud >= 16);
+            const int sampler_state_size = 16; /* 16 bytes */
+
+            ubld1.ADD(component(header, 3),
+                      retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
+                      brw_imm_ud(16 * (sampler.ud / 16) * sampler_state_size));
+         } else {
+            fs_reg tmp = ubld1.vgrf(BRW_REGISTER_TYPE_UD);
+            ubld1.AND(tmp, sampler, brw_imm_ud(0x0f0));
+            ubld1.SHL(tmp, tmp, brw_imm_ud(4));
+            ubld1.ADD(component(header, 3),
+                      retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
+                      tmp);
+         }
+      }
    }
 
    if (shadow_c.file != BAD_FILE) {
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index b59c09f..a5a821a 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -1001,19 +1001,13 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
     * we need to set it up explicitly and load the offset bitfield.
     * Otherwise, we can use an implied move from g0 to the first message reg.
     */
-   if (inst->header_size != 0) {
+   if (inst->header_size != 0 && devinfo->gen < 7) {
       if (devinfo->gen < 6 && !inst->offset) {
          /* Set up an implied move from g0 to the MRF. */
          src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
       } else {
-         struct brw_reg header_reg;
-
-         if (devinfo->gen >= 7) {
-            header_reg = src;
-         } else {
-            assert(inst->base_mrf != -1);
-            header_reg = brw_message_reg(inst->base_mrf);
-         }
+         assert(inst->base_mrf != -1);
+         struct brw_reg header_reg = brw_message_reg(inst->base_mrf);
 
          brw_push_insn_state(p);
          brw_set_default_exec_size(p, BRW_EXECUTE_8);
@@ -1027,17 +1021,8 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
             /* Set the offset bits in DWord 2. */
             brw_MOV(p, get_element_ud(header_reg, 2),
                        brw_imm_ud(inst->offset));
-         } else if (stage != MESA_SHADER_VERTEX &&
-                    stage != MESA_SHADER_FRAGMENT) {
-            /* The vertex and fragment stages have g0.2 set to 0, so
-             * header0.2 is 0 when g0 is copied. Other stages may not, so we
-             * must set it to 0 to avoid setting undesirable bits in the
-             * message.
-             */
-            brw_MOV(p, get_element_ud(header_reg, 2), brw_imm_ud(0));
          }
 
-         brw_adjust_sampler_state_pointer(p, header_reg, sampler_index);
          brw_pop_insn_state(p);
       }
    }
-- 
2.5.0.400.gff86faf