[Mesa-dev] [PATCH v2 5/5] i965/vec4: allow partial DF register spilling

Wed Jul 19 13:51:13 UTC 2017

v2:
- Enable spilling for partial DF reads/writes on HSW+

Signed-off-by: Samuel Iglesias Gonsálvez <siglesias at igalia.com>
---
 src/intel/compiler/brw_vec4_reg_allocate.cpp | 54 ++++++++++++++++++++--------
 1 file changed, 40 insertions(+), 14 deletions(-)

diff --git a/src/intel/compiler/brw_vec4_reg_allocate.cpp b/src/intel/compiler/brw_vec4_reg_allocate.cpp
index a6f1070ebd..3ad18b12bb 100644
--- a/src/intel/compiler/brw_vec4_reg_allocate.cpp
+++ b/src/intel/compiler/brw_vec4_reg_allocate.cpp
@@ -411,17 +411,21 @@ vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill)
                spill_costs[inst->src[i].nr] +=
                   loop_scale * spill_cost_for_type(inst->src[i].type);
                if (inst->src[i].reladdr ||
-                   inst->src[i].offset >= REG_SIZE)
+                   (inst->src[i].offset >= REG_SIZE &&
+                    (type_sz(inst->src[i].type) != 8 ||
+                     !(inst->src[i].offset == 32 && inst->group == 4))))
                   no_spill[inst->src[i].nr] = true;
 
-               /* We don't support unspills of partial DF reads.
+               /* For execsize == 8, our 64-bit unspills are implemented with
+                * two 32-bit scratch messages, each one reading that for both
+                * SIMD4x2 threads that we need to shuffle into correct 64-bit
+                * data. Ensure that we are reading data for both threads.
                 *
-                * Our 64-bit unspills are implemented with two 32-bit scratch
-                * messages, each one reading that for both SIMD4x2 threads that
-                * we need to shuffle into correct 64-bit data. Ensure that we
-                * are reading data for both threads.
+                * For execsize == 4, it is similar but using 1-Oword block
+                * read messages and we don't need to shuffle data.
                 */
-               if (type_sz(inst->src[i].type) == 8 && inst->exec_size != 8)
+               if (type_sz(inst->src[i].type) == 8 &&
+                   inst->exec_size != 8 && inst->exec_size != 4)
                   no_spill[inst->src[i].nr] = true;
             }
 
@@ -439,16 +443,21 @@ vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill)
       if (inst->dst.file == VGRF && !no_spill[inst->dst.nr]) {
          spill_costs[inst->dst.nr] +=
             loop_scale * spill_cost_for_type(inst->dst.type);
-         if (inst->dst.reladdr || inst->dst.offset >= REG_SIZE)
+         if (inst->dst.reladdr ||
+             (inst->dst.offset >= REG_SIZE &&
+              (type_sz(inst->dst.type) != 8 ||
+               !(inst->dst.offset == 32 && inst->group == 4))))
             no_spill[inst->dst.nr] = true;
 
-         /* We don't support spills of partial DF writes.
+         /* For execsize == 8, our 64-bit spills are implemented with two
+          * 32-bit scratch messages, each one writing that for both SIMD4x2
+          * threads. Ensure that we are writing data for both threads.
           *
-          * Our 64-bit spills are implemented with two 32-bit scratch messages,
-          * each one writing that for both SIMD4x2 threads. Ensure that we
-          * are writing data for both threads.
+          * For execsize == 4, it is similar but using 1-Oword block
+          * write messages.
           */
-         if (type_sz(inst->dst.type) == 8 && inst->exec_size != 8)
+         if (type_sz(inst->dst.type) == 8 &&
+             inst->exec_size != 8 && inst->exec_size != 4)
             no_spill[inst->dst.nr] = true;
 
          /* We can't spill registers that mix 32-bit and 64-bit access (that
@@ -514,11 +523,25 @@ vec4_visitor::spill_reg(int spill_reg_nr)
 
    /* Generate spill/unspill instructions for the objects being spilled. */
    int scratch_reg = -1;
+   bool do_partial_df_scratch_read = false;
    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
       for (unsigned int i = 0; i < 3; i++) {
          if (inst->src[i].file == VGRF && inst->src[i].nr == spill_reg_nr) {
+            /* DF scratch reads are not actual partial reads because we are
+             * going to read both GRFs in the first read instruction.
+             * Because of that, we will skip scratch read of the other splitted
+             * instruction (if any), as it can reuse the read value. We check
+             * the value of done_scratch_read to know if we need to do scratch
+             * read or not.
+             */
+            bool do_df_scratch_read = devinfo->gen >= 7 &&
+               type_sz(inst->src[i].type) == 8 &&
+               (inst->exec_size != 4 || do_partial_df_scratch_read);
+
             if (scratch_reg == -1 ||
-                !can_use_scratch_for_source(inst, i, scratch_reg, false)) {
+                (!can_use_scratch_for_source(inst, i, scratch_reg,
+                                             do_partial_df_scratch_read) &&
+                 (do_df_scratch_read || type_sz(inst->src[i].type) != 8))) {
                /* We need to unspill anyway so make sure we read the full vec4
                 * in any case. This way, the cached register can be reused
                 * for consecutive instructions that read different channels of
@@ -532,6 +555,7 @@ vec4_visitor::spill_reg(int spill_reg_nr)
                emit_scratch_read(block, inst,
                                  dst_reg(temp), inst->src[i], spill_offset, false);
                temp.offset = inst->src[i].offset;
+               do_partial_df_scratch_read = false;
             }
             assert(scratch_reg != -1);
             inst->src[i].nr = scratch_reg;
@@ -541,6 +565,8 @@ vec4_visitor::spill_reg(int spill_reg_nr)
       if (inst->dst.file == VGRF && inst->dst.nr == spill_reg_nr) {
          emit_scratch_write(block, inst, spill_offset, false);
          scratch_reg = inst->dst.nr;
+         if (type_sz(inst->dst.type) == 8 && inst->exec_size == 4)
+            do_partial_df_scratch_read = true;
       }
    }
 
-- 
2.11.0