Mesa (master): aco: improve FLAT/GLOBAL scheduling

Fri Nov 29 18:07:21 UTC 2019

Module: Mesa
Branch: master
Commit: 389ee819c04f3375358d0253bdb1f6094f2423c6
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=389ee819c04f3375358d0253bdb1f6094f2423c6

Author: Rhys Perry <pendingchaos02 at gmail.com>
Date:   Wed Nov 27 17:27:36 2019 +0000

aco: improve FLAT/GLOBAL scheduling

Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
Reviewed-by: Daniel Schürmann <daniel at schuermann.dev>

---

 src/amd/compiler/aco_instruction_selection.cpp |  2 ++
 src/amd/compiler/aco_ir.h                      |  7 ++++--
 src/amd/compiler/aco_opcodes.py                |  1 +
 src/amd/compiler/aco_print_ir.cpp              |  1 +
 src/amd/compiler/aco_scheduler.cpp             | 33 ++++++++++++++++----------
 5 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 60963060dea..2bced09cf97 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -4644,6 +4644,7 @@ void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr)
       flat->operands[1] = Operand(s1);
       flat->glc = glc;
       flat->dlc = dlc;
+      flat->barrier = barrier_buffer;
 
       if (dst.type() == RegType::sgpr) {
          Temp vec = bld.tmp(RegType::vgpr, dst.size());
@@ -4765,6 +4766,7 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
       flat->dlc = false;
       flat->offset = offset;
       flat->disable_wqm = true;
+      flat->barrier = barrier_buffer;
       ctx->program->needs_exact = true;
       ctx->block->instructions.emplace_back(std::move(flat));
    }
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 60f06393aa5..4073086662a 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -850,7 +850,9 @@ struct FLAT_instruction : public Instruction {
    bool dlc; /* NAVI: device level coherent */
    bool lds;
    bool nv;
-   bool disable_wqm;
+   bool disable_wqm; /* Require an exec mask without helper invocations */
+   bool can_reorder;
+   barrier_interaction barrier;
 };
 
 struct Export_instruction : public Instruction {
@@ -972,7 +974,8 @@ constexpr barrier_interaction get_barrier_interaction(Instruction* instr)
       return static_cast<MIMG_instruction*>(instr)->barrier;
    case Format::FLAT:
    case Format::GLOBAL:
-      return barrier_buffer;
+   case Format::SCRATCH:
+      return static_cast<FLAT_instruction*>(instr)->barrier;
    case Format::DS:
       return barrier_shared;
    default:
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
index 5f74998a421..a4b02507eda 100644
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -133,6 +133,7 @@ class Format(Enum):
                  ('bool', 'bound_ctrl', 'false')]
       elif self in [Format.FLAT, Format.GLOBAL, Format.SCRATCH]:
          return [('uint16_t', 'offset', 0),
+                 ('bool', 'can_reorder', 'true'),
                  ('bool', 'glc', 'false'),
                  ('bool', 'slc', 'false'),
                  ('bool', 'lds', 'false'),
diff --git a/src/amd/compiler/aco_print_ir.cpp b/src/amd/compiler/aco_print_ir.cpp
index 5ced1d2d7bb..780980a8c69 100644
--- a/src/amd/compiler/aco_print_ir.cpp
+++ b/src/amd/compiler/aco_print_ir.cpp
@@ -373,6 +373,7 @@ static void print_instr_format_specific(struct Instruction *instr, FILE *output)
          fprintf(output, " nv");
       if (flat->disable_wqm)
          fprintf(output, " disable_wqm");
+      print_barrier_reorder(flat->can_reorder, flat->barrier, output);
       break;
    }
    case Format::MTBUF: {
diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp
index eb0bb0d93e9..5c164703ebf 100644
--- a/src/amd/compiler/aco_scheduler.cpp
+++ b/src/amd/compiler/aco_scheduler.cpp
@@ -138,6 +138,11 @@ bool can_move_instr(aco_ptr<Instruction>& instr, Instruction* current, int movin
          case Format::MIMG:
             can_reorder = static_cast<MIMG_instruction*>(current)->can_reorder;
             break;
+         case Format::FLAT:
+         case Format::GLOBAL:
+         case Format::SCRATCH:
+            can_reorder = static_cast<FLAT_instruction*>(current)->can_reorder;
+            break;
          default:
             break;
          }
@@ -186,7 +191,7 @@ bool can_reorder(Instruction* candidate)
    case Format::FLAT:
    case Format::GLOBAL:
    case Format::SCRATCH:
-      return false;
+      return static_cast<FLAT_instruction*>(candidate)->can_reorder;
    default:
       return true;
    }
@@ -483,6 +488,7 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
       assert(candidate_idx >= 0);
       aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
       bool can_reorder_candidate = can_reorder(candidate.get());
+      bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal();
 
       /* break when encountering another VMEM instruction, logical_start or barriers */
       if (!can_reorder_smem && candidate->format == Format::SMEM && !can_reorder_candidate)
@@ -501,8 +507,10 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
       register_pressure_indep.update(register_demand[candidate_idx]);
 
       bool part_of_clause = false;
-      if (candidate->isVMEM()) {
-         bool same_resource = candidate->operands[1].tempId() == current->operands[1].tempId();
+      if (current->isVMEM() == candidate->isVMEM()) {
+         bool same_resource = true;
+         if (current->isVMEM())
+            same_resource = candidate->operands[1].tempId() == current->operands[1].tempId();
          bool can_reorder = can_reorder_vmem || can_reorder_candidate;
          int grab_dist = clause_insert_idx - candidate_idx;
          /* We can't easily tell how much this will decrease the def-to-use
@@ -511,7 +519,7 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
       }
 
       /* if current depends on candidate, add additional dependencies and continue */
-      bool can_move_down = !candidate->isVMEM() || part_of_clause;
+      bool can_move_down = !is_vmem || part_of_clause;
       bool writes_exec = false;
       for (const Definition& def : candidate->definitions) {
          if (def.isTemp() && ctx.depends_on[def.tempId()])
@@ -540,7 +548,7 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
          }
          register_pressure_clause.update(register_demand[candidate_idx]);
          can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
-         can_reorder_vmem &= !candidate->isVMEM() || can_reorder_candidate;
+         can_reorder_vmem &= !is_vmem || can_reorder_candidate;
          continue;
       }
 
@@ -575,7 +583,7 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
          }
          register_pressure_clause.update(register_demand[candidate_idx]);
          can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
-         can_reorder_vmem &= !candidate->isVMEM() || can_reorder_candidate;
+         can_reorder_vmem &= !is_vmem || can_reorder_candidate;
          continue;
       }
 
@@ -636,6 +644,7 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
       assert(candidate_idx < (int) block->instructions.size());
       aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
       bool can_reorder_candidate = can_reorder(candidate.get());
+      bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal();
 
       if (candidate->opcode == aco_opcode::p_logical_end)
          break;
@@ -651,7 +660,7 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
       bool is_dependency = false;
       if (candidate->format == Format::SMEM)
          is_dependency = !can_reorder_smem && !can_reorder_candidate;
-      if (candidate->isVMEM())
+      if (is_vmem)
          is_dependency = !can_reorder_vmem && !can_reorder_candidate;
       for (const Operand& op : candidate->operands) {
          if (op.isTemp() && ctx.depends_on[op.tempId()]) {
@@ -676,7 +685,7 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
          }
          /* update flag whether we can reorder other memory instructions */
          can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
-         can_reorder_vmem &= !candidate->isVMEM() || can_reorder_candidate;
+         can_reorder_vmem &= !is_vmem || can_reorder_candidate;
 
          if (!found_dependency) {
             insert_idx = candidate_idx;
@@ -686,7 +695,7 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
             continue;
          }
 
-      } else if (candidate->isVMEM()) {
+      } else if (is_vmem) {
          /* don't move up dependencies of other VMEM instructions */
          for (const Definition& def : candidate->definitions) {
             if (def.isTemp())
@@ -717,7 +726,7 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
                ctx.RAR_dependencies[op.tempId()] = true;
          }
          can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
-         can_reorder_vmem &= !candidate->isVMEM() || can_reorder_candidate;
+         can_reorder_vmem &= !is_vmem || can_reorder_candidate;
          continue;
       }
 
@@ -783,7 +792,7 @@ void schedule_position_export(sched_ctx& ctx, Block* block,
          break;
       if (candidate->opcode == aco_opcode::p_exit_early_if)
          break;
-      if (candidate->isVMEM() || candidate->format == Format::SMEM)
+      if (candidate->isVMEM() || candidate->format == Format::SMEM || candidate->isFlatOrGlobal())
          break;
       if (!can_move_instr(candidate, current, moving_interaction))
          break;
@@ -876,7 +885,7 @@ void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_v
       if (current->definitions.empty())
          continue;
 
-      if (current->isVMEM())
+      if (current->isVMEM() || current->isFlatOrGlobal())
          schedule_VMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
       if (current->format == Format::SMEM)
          schedule_SMEM(ctx, block, live_vars.register_demand[block->index], current, idx);