[Mesa-dev] [PATCH 6/7] i965/fs: Fetch one cacheline of pull constants at a time.

Francisco Jerez currojerez at riseup.net
Sat Jan 17 15:04:08 PST 2015


Asking the DC for less than one cacheline (4 owords) of data for uniform pull
constants is suboptimal because the DC itself cannot request less than that
from L3, resulting in wasted bandwidth, unnecessary message dispatch overhead
and exacerbating the L3 serialization bug on IVB.  Improves performance of
pull constants on all generations I've tested so far.  On BDW and BSW the FPS
of a microbenchmark increases up to 5-6x, see the third column of the table in
"i965/fs: Switch to the constant cache for uniform pull constants." for more
detailed numbers.

Going up to 8 oword blocks would improve performance of pull constants even
more, but at the cost of some additional bandwidth and register pressure, so
I'd rather do that as a follow-up together with some on-demand mechanism to
calculate the block size based on the number of constants actually used by the
shader.

Currently untested on Gen4-5.
---
 src/mesa/drivers/dri/i965/brw_eu_emit.c        | 10 ++++----
 src/mesa/drivers/dri/i965/brw_fs.cpp           | 33 ++++++++++++++++----------
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 13 +++++-----
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp       | 25 +++++++++++--------
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp   | 27 ++++++++++++---------
 5 files changed, 62 insertions(+), 46 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 7829878..b30db88 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -2194,7 +2194,7 @@ gen7_block_read_scratch(struct brw_compile *p,
 }
 
 /**
- * Read a float[4] vector from the data port constant cache.
+ * Read four float[4] vectors from the data port constant cache.
  * Location (in buffer) should be a multiple of 16.
  * Used for fetching shader constants.
  */
@@ -2231,8 +2231,8 @@ void brw_oword_block_read(struct brw_compile *p,
 
    brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
 
-   /* cast dest to a uword[8] vector */
-   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
+   /* cast dest to a dword[16] vector */
+   dest = retype(vec16(dest), BRW_REGISTER_TYPE_UD);
 
    brw_set_dest(p, insn, dest);
    if (brw->gen >= 6) {
@@ -2245,12 +2245,12 @@ void brw_oword_block_read(struct brw_compile *p,
    brw_set_dp_read_message(p,
 			   insn,
 			   bind_table_index,
-			   BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
+			   BRW_DATAPORT_OWORD_BLOCK_4_OWORDS,
 			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
 			   target_cache,
 			   1, /* msg_length */
                            true, /* header_present */
-			   1); /* response_length (1 reg, 2 owords!) */
+			   2); /* response_length (2 regs, 4 owords!) */
 
    brw_pop_insn_state(p);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index d585a67..3c41e01 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2261,29 +2261,40 @@ fs_visitor::demote_pull_constants()
          current_annotation = inst->annotation;
 
          fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
-         fs_reg dst = fs_reg(this, glsl_type::float_type);
 
          /* Generate a pull load into dst. */
          if (inst->src[i].reladdr) {
+            const fs_reg dst = fs_reg(this, glsl_type::float_type);
             exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
                                                         surf_index,
                                                         *inst->src[i].reladdr,
                                                         pull_index);
             inst->insert_before(block, &list);
+
+            /* Rewrite the instruction to use the temporary VGRF. */
+            inst->src[i].file = GRF;
             inst->src[i].reladdr = NULL;
+            inst->src[i].reg = dst.reg;
+            inst->src[i].reg_offset = 0;
          } else {
-            fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
+            const unsigned num_regs = 2; /* Fetch 4 owords at a time. */
+            const unsigned base = (pull_index * 4) & ~(32 * num_regs - 1);
+            const fs_reg dst(GRF, virtual_grf_alloc(num_regs),
+                             BRW_REGISTER_TYPE_F, dispatch_width);
             fs_inst *pull =
-               new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
-                                    dst, surf_index, offset);
+               new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+                                    dst, surf_index, fs_reg(base));
+            pull->force_writemask_all = true;
+            pull->regs_written = num_regs;
             inst->insert_before(block, pull);
-            inst->src[i].set_smear(pull_index & 3);
+
+            /* Rewrite the instruction to use the temporary VGRF. */
+            inst->src[i].file = GRF;
+            inst->src[i].reg = dst.reg;
+            inst->src[i].reg_offset = pull_index / 8 % num_regs;
+            inst->src[i].set_smear(pull_index & 7);
          }
 
-         /* Rewrite the instruction to use the temporary VGRF. */
-         inst->src[i].file = GRF;
-         inst->src[i].reg = dst.reg;
-         inst->src[i].reg_offset = 0;
          inst->src[i].width = dispatch_width;
       }
    }
@@ -3010,10 +3021,6 @@ fs_visitor::lower_uniform_pull_constant_loads()
          setup->annotation = inst->annotation;
          inst->insert_before(block, setup);
 
-         /* Similarly, this will only populate the first 4 channels of the
-          * result register (since we only use smear values from 0-3), but we
-          * don't tell the optimizer.
-          */
          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
          inst->src[1] = payload;
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 3916c1a..b1fca41 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -1013,21 +1013,20 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
       const uint32_t surf_index = index.dw1.ud;
 
       brw_push_insn_state(p);
-      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
       brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
       brw_pop_insn_state(p);
 
-      brw_set_dest(p, send, vec8(dst));
+      brw_set_dest(p, send, vec16(dst));
       brw_set_src0(p, send, payload);
       brw_set_dp_read_message(p, send,
                               surf_index,
-                              BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
+                              BRW_DATAPORT_OWORD_BLOCK_4_OWORDS,
                               GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
                               GEN6_SFID_DATAPORT_CONSTANT_CACHE,
                               1, /* mlen */
                               true, /* header */
-                              1); /* rlen */
+                              2); /* rlen */
 
       brw_mark_surface_used(prog_data, surf_index);
 
@@ -1049,12 +1048,12 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
       brw_inst *insn_or = brw_next_insn(p, BRW_OPCODE_OR);
       brw_set_dp_read_message(p, insn_or,
                               0, /* surface */
-                              BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
+                              BRW_DATAPORT_OWORD_BLOCK_4_OWORDS,
                               GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
                               GEN6_SFID_DATAPORT_CONSTANT_CACHE,
                               1, /* mlen */
                               true, /* header */
-                              1); /* rlen */
+                              2); /* rlen */
       brw_inst_set_exec_size(p->brw, insn_or, BRW_EXECUTE_1);
       brw_inst_set_src1_reg_type(p->brw, insn_or, BRW_REGISTER_TYPE_UD);
       brw_set_src0(p, insn_or, addr);
@@ -1062,7 +1061,7 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
 
       /* dst = send(payload, a0.0) */
       brw_inst *insn_send = brw_next_insn(p, BRW_OPCODE_SEND);
-      brw_set_dest(p, insn_send, vec8(dst));
+      brw_set_dest(p, insn_send, vec16(dst));
       brw_set_src0(p, insn_send, payload);
       brw_set_indirect_send_descriptor(p, insn_send,
                                        GEN6_SFID_DATAPORT_CONSTANT_CACHE, addr);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index e8f398a..5268246 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -1418,22 +1418,27 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
             emit(VARYING_PULL_CONSTANT_LOAD(offset(dest, i), surf_index,
                                             base_offset, vec4_offset + i));
       } else {
-         fs_reg packed_consts = fs_reg(this, glsl_type::float_type);
-         packed_consts.type = dest.type;
-
-         fs_reg const_offset_reg((unsigned) instr->const_index[0] & ~15);
-         emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts,
-              surf_index, const_offset_reg);
+         const unsigned num_regs = 2; /* Fetch 4 owords at a time. */
+         const unsigned base = instr->const_index[0] & ~(32 * num_regs - 1);
+         const unsigned delta = instr->const_index[0] & (32 * num_regs - 1);
+         fs_reg packed_consts(GRF, virtual_grf_alloc(num_regs),
+                              dest.type, dispatch_width);
+         fs_inst *pull = emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+                              packed_consts, surf_index, fs_reg(base));
+         pull->regs_written = num_regs;
+         pull->force_writemask_all = true;
 
          for (unsigned i = 0; i < instr->num_components; i++) {
-            packed_consts.set_smear(instr->const_index[0] % 16 / 4 + i);
+            fs_reg src = byte_offset(packed_consts,
+                                     delta + type_sz(dest.type) * i);
+            src.stride = 0;
 
             /* The std140 packing rules don't allow vectors to cross 16-byte
-             * boundaries, and a reg is 32 bytes.
+             * boundaries, and we fetch 64 bytes.
              */
-            assert(packed_consts.subreg_offset < 32);
+            assert((unsigned)src.reg_offset < num_regs);
 
-            emit(MOV(dest, packed_consts));
+            emit(MOV(dest, src));
             dest.reg_offset++;
          }
       }
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 8aafbef..a916f42 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1135,28 +1135,33 @@ fs_visitor::visit(ir_expression *ir)
       }
 
       if (const_offset) {
-         fs_reg packed_consts = fs_reg(this, glsl_type::float_type);
-         packed_consts.type = result.type;
-
-         fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] & ~15);
-         emit(new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
-                                   packed_consts, surf_index, const_offset_reg));
+         const unsigned num_regs = 2; /* Fetch 4 owords at a time. */
+         const unsigned base = const_offset->value.u[0] & ~(32 * num_regs - 1);
+         const unsigned delta = const_offset->value.u[0] & (32 * num_regs - 1);
+         const fs_reg packed_consts(GRF, virtual_grf_alloc(num_regs),
+                                    result.type, dispatch_width);
+         fs_inst *pull = emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+                              packed_consts, surf_index, fs_reg(base));
+         pull->regs_written = num_regs;
+         pull->force_writemask_all = true;
 
          for (int i = 0; i < ir->type->vector_elements; i++) {
-            packed_consts.set_smear(const_offset->value.u[0] % 16 / 4 + i);
+            fs_reg src = byte_offset(packed_consts,
+                                     delta + type_sz(result.type) * i);
+            src.stride = 0;
 
             /* The std140 packing rules don't allow vectors to cross 16-byte
-             * boundaries, and a reg is 32 bytes.
+             * boundaries, and we fetch 64 bytes.
              */
-            assert(packed_consts.subreg_offset < 32);
+            assert((unsigned)src.reg_offset < num_regs);
 
             /* UBO bools are any nonzero value.  We consider bools to be
              * values with the low bit set to 1.  Convert them using CMP.
              */
             if (ir->type->base_type == GLSL_TYPE_BOOL) {
-               emit(CMP(result, packed_consts, fs_reg(0u), BRW_CONDITIONAL_NZ));
+               emit(CMP(result, src, fs_reg(0u), BRW_CONDITIONAL_NZ));
             } else {
-               emit(MOV(result, packed_consts));
+               emit(MOV(result, src));
             }
 
             result = offset(result, 1);
-- 
2.1.3



More information about the mesa-dev mailing list