[Mesa-dev] [PATCH 4/7] i965/fs: Set up gen7 UBO loads as sends from GRFs.

Eric Anholt eric at anholt.net
Fri Dec 7 14:58:15 PST 2012


This gives the instruction scheduler a chance to schedule between the
loads, whereas before it was restricted due to the dependencies between
the MRFs for setting them up.

For one shader in gles3conform, compilation goes from being stuck in register
allocation for as long as anybody's bothered to leave it running, down to
23 seconds, thanks to the LIFO scheduling.
---
 src/mesa/drivers/dri/i965/brw_defines.h      |    2 +
 src/mesa/drivers/dri/i965/brw_fs.cpp         |    4 +-
 src/mesa/drivers/dri/i965/brw_fs.h           |    8 +++
 src/mesa/drivers/dri/i965/brw_fs_emit.cpp    |   75 ++++++++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp |   32 ++++++++---
 5 files changed, 114 insertions(+), 7 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 5e00b40..a8bee2e 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -677,9 +677,11 @@ enum opcode {
    FS_OPCODE_SPILL,
    FS_OPCODE_UNSPILL,
    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+   FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7,
    FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
    FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
    FS_OPCODE_MOV_DISPATCH_TO_FLAGS,
+   FS_OPCODE_SET_GLOBAL_OFFSET,
 
    VS_OPCODE_URB_WRITE,
    VS_OPCODE_SCRATCH_READ,
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 4d8f06c..8dd2139 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -330,7 +330,9 @@ fs_inst::is_math()
 bool
 fs_inst::is_send_from_grf()
 {
-   return opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
+   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
+           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7 &&
+            src[1].file == GRF)); /* gen7 UBO load: payload is src[1] */
 }
 
 bool
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 0dab222..0ce43d0 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -505,6 +505,10 @@ private:
    void generate_uniform_pull_constant_load(fs_inst *inst, struct brw_reg dst,
                                             struct brw_reg index,
                                             struct brw_reg offset);
+   void generate_uniform_pull_constant_load_gen7(fs_inst *inst,
+                                                 struct brw_reg dst,
+                                                 struct brw_reg surf_index,
+                                                 struct brw_reg offset);
    void generate_varying_pull_constant_load(fs_inst *inst, struct brw_reg dst,
                                             struct brw_reg index);
    void generate_varying_pull_constant_load_gen7(fs_inst *inst,
@@ -512,6 +516,10 @@ private:
                                                  struct brw_reg index,
                                                  struct brw_reg offset);
    void generate_mov_dispatch_to_flags();
+   void generate_set_global_offset(fs_inst *inst,
+                                   struct brw_reg dst,
+                                   struct brw_reg src,
+                                   struct brw_reg offset);
 
    struct brw_context *brw;
    struct intel_context *intel;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
index 4e8b44e..f211870 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
@@ -649,6 +649,44 @@ fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
 }
 
 void
+fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
+                                                       struct brw_reg dst,
+                                                       struct brw_reg index,
+                                                       struct brw_reg offset)
+{
+   assert(inst->mlen == 0);
+   /* No MRF payload here (mlen == 0): the message is sent from a GRF. */
+   assert(index.file == BRW_IMMEDIATE_VALUE &&
+	  index.type == BRW_REGISTER_TYPE_UD);
+   uint32_t surf_index = index.dw1.ud;
+   /* The message payload (src0 of the SEND below) must live in a GRF. */
+   assert(offset.file == BRW_GENERAL_REGISTER_FILE);
+   /* Allocate the SEND with compression off and BRW_MASK_DISABLE set. */
+   brw_push_insn_state(p);
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+   brw_set_mask_control(p, BRW_MASK_DISABLE);
+   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_pop_insn_state(p);
+   /* NOTE(review): the gen < 6 branch below looks dead on a gen7-only op. */
+   brw_set_dest(p, send, dst);
+   brw_set_src0(p, send, offset);
+   if (intel->gen < 6)
+      send->header.destreg__conditionalmod = inst->base_mrf;
+   /* 2-OWord block read from the data cache, with a message header. */
+   uint32_t msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
+   uint32_t msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ;
+   bool header_present = true;
+   brw_set_dp_read_message(p, send,
+                           surf_index,
+                           msg_control,
+                           msg_type,
+                           BRW_DATAPORT_READ_TARGET_DATA_CACHE,
+                           1,
+                           header_present,
+                           1);
+}
+
+void
 fs_generator::generate_varying_pull_constant_load(fs_inst *inst,
                                                   struct brw_reg dst,
                                                   struct brw_reg index)
@@ -831,6 +869,35 @@ brw_reg_from_fs_reg(fs_reg *reg)
    return brw_reg;
 }
 
+/**
+ * Sets dword 2 (sub-register index 2) of a vgrf for gen7+ message setup.
+ *
+ * For setting up gen7 messages in VGRFs, we need to be able to set that one
+ * dword for some payloads where in the MRF world we'd have just used
+ * brw_message_reg().  We don't want to bake it into the send message's code
+ * generation because that means we don't get a chance to schedule the
+ * instructions.  src must name the same register as dst; value is immediate.
+ */
+void
+fs_generator::generate_set_global_offset(fs_inst *inst,
+                                         struct brw_reg dst,
+                                         struct brw_reg src,
+                                         struct brw_reg value)
+{
+   /* We use a matching src and dst to get the information on how this
+    * instruction works exposed to various optimization passes that would
+    * otherwise treat it as completely overwriting the dst.
+    */
+   assert(src.file == dst.file && src.nr == dst.nr);
+   assert(value.file == BRW_IMMEDIATE_VALUE);
+   /* MOV into a vec1 region, uncompressed and with BRW_MASK_DISABLE set. */
+   brw_push_insn_state(p);
+   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+   brw_set_mask_control(p, BRW_MASK_DISABLE);
+   brw_MOV(p, retype(brw_vec1_reg(dst.file, dst.nr, 2), value.type), value);
+   brw_pop_insn_state(p);
+}
+
 void
 fs_generator::generate_code(exec_list *instructions)
 {
@@ -1108,6 +1175,10 @@ fs_generator::generate_code(exec_list *instructions)
 	 generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
 	 break;
 
+      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
+	 generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]);
+	 break;
+
       case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
 	 generate_varying_pull_constant_load(inst, dst, src[0]);
 	 break;
@@ -1128,6 +1199,10 @@ fs_generator::generate_code(exec_list *instructions)
          brw_shader_time_add(p, inst->base_mrf, SURF_INDEX_WM_SHADER_TIME);
          break;
 
+      case FS_OPCODE_SET_GLOBAL_OFFSET:
+         generate_set_global_offset(inst, dst, src[0], src[1]);
+         break;
+
       default:
 	 if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
 	    _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index f1c6860..69626d7 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -581,12 +581,32 @@ fs_visitor::visit(ir_expression *ir)
       if (const_offset) {
          fs_reg packed_consts = fs_reg(this, glsl_type::float_type);
          packed_consts.type = result.type;
-         fs_inst *pull = emit(fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
-                                      packed_consts,
-                                      surf_index,
-                                      fs_reg(const_offset->value.u[0])));
-         pull->base_mrf = 14;
-         pull->mlen = 1;
+
+         if (intel->gen >= 7) {
+            fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] / 16);
+            fs_reg payload = fs_reg(this, glsl_type::uint_type);
+            struct brw_reg g0 = retype(brw_vec8_grf(0, 0),
+                                       BRW_REGISTER_TYPE_UD);
+            fs_inst *setup = emit(MOV(payload, fs_reg(g0)));
+            setup->force_writemask_all = true;
+            /* We don't need the second half of this vgrf to be filled with g1
+             * in the 16-wide case, but if we use force_uncompressed then live
+             * variable analysis won't consider this a def!
+             */
+
+            emit(FS_OPCODE_SET_GLOBAL_OFFSET, payload,
+                 payload, const_offset_reg);
+            emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7, packed_consts,
+                 surf_index, payload);
+         } else {
+            fs_reg const_offset_reg = fs_reg(const_offset->value.u[0]);
+            fs_inst *pull = emit(fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+                                         packed_consts,
+                                         surf_index,
+                                         const_offset_reg));
+            pull->base_mrf = 14;
+            pull->mlen = 1;
+         }
 
          packed_consts.smear = const_offset->value.u[0] % 16 / 4;
          for (int i = 0; i < ir->type->vector_elements; i++) {
-- 
1.7.10.4



More information about the mesa-dev mailing list