[Mesa-dev] [PATCH 1/4] i965/fs/skl+: Use ld_lz when the LOD parameter is zero

Matt Turner mattst88 at gmail.com
Wed May 4 06:04:31 UTC 2016


From: Neil Roberts <neil at linux.intel.com>

Add an optimisation pass that recognises LD sampler messages whose LOD
parameter is either a constant zero or not given, and replaces them with
the LD_LZ message type. LD_LZ behaves the same as LD except that the LOD
is hardcoded to zero and so doesn't need to be in the message. This
benefits shaders using texelFetch with 3 coordinates: with plain LD the
LOD parameter can't be optimised out of the message because it comes
before the r coordinate.

[mattst88]: Does not affect anything in shader-db.

Reviewed-by: Matt Turner <mattst88 at gmail.com>
---
 src/mesa/drivers/dri/i965/brw_defines.h            |  2 +
 src/mesa/drivers/dri/i965/brw_disasm.c             |  1 +
 src/mesa/drivers/dri/i965/brw_fs.cpp               | 97 ++++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_fs.h                 |  1 +
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp     |  5 ++
 .../drivers/dri/i965/brw_schedule_instructions.cpp |  1 +
 src/mesa/drivers/dri/i965/brw_shader.cpp           |  3 +
 7 files changed, 110 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 60b696c..e23f372 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -977,6 +977,7 @@ enum opcode {
    SHADER_OPCODE_TXD_LOGICAL,
    SHADER_OPCODE_TXF,
    SHADER_OPCODE_TXF_LOGICAL,
+   SHADER_OPCODE_TXF_LZ,
    SHADER_OPCODE_TXL,
    SHADER_OPCODE_TXL_LOGICAL,
    SHADER_OPCODE_TXS,
@@ -1636,6 +1637,7 @@ enum brw_message_target {
 #define GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO   17
 #define GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C 18
 #define HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE 20
+#define GEN9_SAMPLER_MESSAGE_SAMPLE_LD_LZ        26
 #define GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W     28
 #define GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS       29
 #define GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS       30
diff --git a/src/mesa/drivers/dri/i965/brw_disasm.c b/src/mesa/drivers/dri/i965/brw_disasm.c
index 1778419..046e1b8 100644
--- a/src/mesa/drivers/dri/i965/brw_disasm.c
+++ b/src/mesa/drivers/dri/i965/brw_disasm.c
@@ -550,6 +550,7 @@ static const char *const gen5_sampler_msg_type[] = {
    [GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO]   = "gather4_po",
    [GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C] = "gather4_po_c",
    [HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE] = "sample_d_c",
+   [GEN9_SAMPLER_MESSAGE_SAMPLE_LD_LZ]        = "ld_lz",
    [GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W]     = "ld2dms_w",
    [GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS]       = "ld_mcs",
    [GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS]       = "ld2dms",
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 18760dd..15df298 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -949,6 +949,7 @@ fs_visitor::implied_mrf_writes(fs_inst *inst)
    case FS_OPCODE_TXB:
    case SHADER_OPCODE_TXD:
    case SHADER_OPCODE_TXF:
+   case SHADER_OPCODE_TXF_LZ:
    case SHADER_OPCODE_TXF_CMS:
    case SHADER_OPCODE_TXF_CMS_W:
    case SHADER_OPCODE_TXF_MCS:
@@ -2482,6 +2483,100 @@ fs_visitor::opt_zero_samples()
    return progress;
 }
 
+static bool
+lod_source_is_zero(const fs_inst *send_inst)
+{
+   int reg_offset = send_inst->exec_size / 8 * 2 + send_inst->header_size;
+   const fs_reg src = byte_offset(send_inst->src[0], reg_offset * 32);
+
+   /* Look for the last instruction that writes to the source */
+   foreach_inst_in_block_reverse_starting_from(const fs_inst, inst, send_inst) {
+      if (inst->overwrites_reg(src)) {
+         return (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD &&
+                 inst->src[inst->header_size + 2].is_zero());
+      }
+   }
+
+   return false;
+}
+
+/**
+ * Replace LD sampler messages that have a zero LOD with LD_LZ. This
+ * message type is available on Gen9+. It helps texelFetch with three
+ * coordinates because the LOD no longer has to be sent before the r one.
+ */
+bool
+fs_visitor::opt_ld_lz()
+{
+   if (devinfo->gen < 9)
+      return false;
+
+   bool progress = false;
+
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      if (inst->opcode != SHADER_OPCODE_TXF)
+         continue;
+
+      /* If the LOD parameter is not sent or is a constant zero then we can
+       * change the instruction.
+       */
+      bool lod_included = (inst->mlen - inst->header_size >=
+                           inst->exec_size / 8 * 3);
+      if (lod_included && !lod_source_is_zero(inst))
+         continue;
+
+      inst->opcode = SHADER_OPCODE_TXF_LZ;
+
+      if (lod_included) {
+         inst->mlen -= inst->exec_size / 8;
+
+         /* If the r coordinate is included then we need a new LOAD_PAYLOAD
+          * instruction which has it in the right place.
+          */
+         if (inst->mlen - inst->header_size >= inst->exec_size / 8 * 3) {
+            const fs_builder ibld(this, block, inst);
+            fs_reg send_header = fs_reg(VGRF, alloc.allocate(inst->mlen),
+                                        BRW_REGISTER_TYPE_F);
+            int n_sources = ((inst->mlen - inst->header_size) *
+                             8 / inst->exec_size +
+                             inst->header_size);
+            fs_reg *new_sources = ralloc_array(mem_ctx, fs_reg, n_sources);
+
+            for (int i = 0; i < n_sources; i++) {
+               int j;
+               if (i >= inst->header_size + 2)
+                  j = i + 1;
+               else
+                  j = i;
+               new_sources[i] = offset(inst->src[0], ibld, j);
+            }
+
+            /* The LOAD_PAYLOAD helper is not used for the same reasons given
+             * in fs_visitor::opt_sample_eot.
+             */
+            fs_inst *new_load_payload =
+               new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
+                                    inst->exec_size,
+                                    send_header,
+                                    new_sources,
+                                    n_sources);
+
+            new_load_payload->regs_written = inst->mlen;
+            new_load_payload->header_size = inst->header_size;
+            inst->insert_before(block, new_load_payload);
+            inst->src[0] = send_header;
+         }
+      }
+
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
 /**
  * Optimize sample messages which are followed by the final RT write.
  *
@@ -4156,6 +4251,7 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
 
       coordinate_done = true;
       break;
+
    case SHADER_OPCODE_TXF_CMS:
    case SHADER_OPCODE_TXF_CMS_W:
    case SHADER_OPCODE_TXF_UMS:
@@ -5329,6 +5425,7 @@ fs_visitor::optimize()
       OPT(opt_redundant_discard_jumps);
       OPT(opt_saturate_propagation);
       OPT(opt_zero_samples);
+      OPT(opt_ld_lz);
       OPT(register_coalesce);
       OPT(compute_to_mrf);
       OPT(eliminate_find_live_channel);
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index ba6bd3f..66b39dc 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -227,6 +227,7 @@ public:
    bool opt_saturate_propagation();
    bool opt_cmod_propagation();
    bool opt_zero_samples();
+   bool opt_ld_lz();
    void emit_unspill(bblock_t *block, fs_inst *inst, fs_reg reg,
                      uint32_t spill_offset, int count);
    void emit_spill(bblock_t *block, fs_inst *inst, fs_reg reg,
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 8654ca4..0516d28 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -807,6 +807,10 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
       case SHADER_OPCODE_TXF:
 	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
 	 break;
+      case SHADER_OPCODE_TXF_LZ:
+         assert(devinfo->gen >= 9);
+         msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD_LZ;
+         break;
       case SHADER_OPCODE_TXF_CMS_W:
          assert(devinfo->gen >= 9);
          msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
@@ -2115,6 +2119,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
       case FS_OPCODE_TXB:
       case SHADER_OPCODE_TXD:
       case SHADER_OPCODE_TXF:
+      case SHADER_OPCODE_TXF_LZ:
       case SHADER_OPCODE_TXF_CMS:
       case SHADER_OPCODE_TXF_CMS_W:
       case SHADER_OPCODE_TXF_UMS:
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index 8d92584..557811c 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -215,6 +215,7 @@ schedule_node::set_latency_gen7(bool is_haswell)
    case SHADER_OPCODE_TEX:
    case SHADER_OPCODE_TXD:
    case SHADER_OPCODE_TXF:
+   case SHADER_OPCODE_TXF_LZ:
    case SHADER_OPCODE_TXL:
       /* 18 cycles:
        * mov(8)  g115<1>F   0F                         { align1 WE_normal 1Q };
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 068244b..69f62d9 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -213,6 +213,8 @@ brw_instruction_name(const struct brw_device_info *devinfo, enum opcode op)
       return "txf";
    case SHADER_OPCODE_TXF_LOGICAL:
       return "txf_logical";
+   case SHADER_OPCODE_TXF_LZ:
+      return "txf_lz";
    case SHADER_OPCODE_TXL:
       return "txl";
    case SHADER_OPCODE_TXL_LOGICAL:
@@ -749,6 +751,7 @@ backend_instruction::is_tex() const
            opcode == FS_OPCODE_TXB ||
            opcode == SHADER_OPCODE_TXD ||
            opcode == SHADER_OPCODE_TXF ||
+           opcode == SHADER_OPCODE_TXF_LZ ||
            opcode == SHADER_OPCODE_TXF_CMS ||
            opcode == SHADER_OPCODE_TXF_CMS_W ||
            opcode == SHADER_OPCODE_TXF_UMS ||
-- 
2.7.3



More information about the mesa-dev mailing list