Mesa (master): aco: add vmem/smem score statistic

Fri Apr 3 12:23:10 UTC 2020

Module: Mesa
Branch: master
Commit: 507956ed04fcdcfd44419d1b16f032e1d81d0dcb
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=507956ed04fcdcfd44419d1b16f032e1d81d0dcb

Author: Rhys Perry <pendingchaos02 at gmail.com>
Date:   Wed Dec  4 14:41:18 2019 +0000

aco: add vmem/smem score statistic

This isn't perfect (for example, changes might not be too meaningful when
comparing shaders with different control flow) but it should be useful for
evaluating scheduler changes.

Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
Acked-by: Bas Nieuwenhuizen <bas at basnieuwenhuizen.nl>
Reviewed-by: Daniel Schürmann <daniel at schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/2965>

---

 src/amd/compiler/aco_insert_waitcnt.cpp | 160 ++++++++++++++++++++++++++++++--
 src/amd/compiler/aco_interface.cpp      |   2 +
 src/amd/compiler/aco_ir.h               |   2 +
 3 files changed, 155 insertions(+), 9 deletions(-)

diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp
index 09556d232b5..c0a93e3a929 100644
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@@ -25,6 +25,7 @@
 #include <algorithm>
 #include <map>
 #include <stack>
+#include <math.h>
 
 #include "aco_ir.h"
 #include "vulkan/radv_shader.h"
@@ -66,6 +67,7 @@ enum wait_event : uint16_t {
    event_gds_gpr_lock = 1 << 9,
    event_vmem_gpr_lock = 1 << 10,
    event_sendmsg = 1 << 11,
+   num_events = 12,
 };
 
 enum counter_type : uint8_t {
@@ -73,6 +75,7 @@ enum counter_type : uint8_t {
    counter_lgkm = 1 << 1,
    counter_vm = 1 << 2,
    counter_vs = 1 << 3,
+   num_counters = 4,
 };
 
 static const uint16_t exp_events = event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock;
@@ -105,6 +108,21 @@ uint8_t get_counters_for_event(wait_event ev)
    }
 }
 
+/* Inverse of get_counters_for_event(): returns the file-level *_events mask
+ * that belongs to the given counter. Any value that is not one of the four
+ * counters yields an empty mask. */
+uint16_t get_events_for_counter(counter_type ctr)
+{
+   uint16_t events = 0;
+
+   if (ctr == counter_exp)
+      events = exp_events;
+   else if (ctr == counter_lgkm)
+      events = lgkm_events;
+   else if (ctr == counter_vm)
+      events = vm_events;
+   else if (ctr == counter_vs)
+      events = vs_events;
+
+   return events;
+}
+
 struct wait_imm {
    static const uint8_t unset_counter = 0xff;
 
@@ -251,6 +269,13 @@ struct wait_ctx {
 
    std::map<PhysReg,wait_entry> gpr_map;
 
+   /* used for vmem/smem scores */
+   bool collect_statistics;
+   Instruction *gen_instr;
+   std::map<Instruction *, unsigned> unwaited_instrs[num_counters];
+   std::map<PhysReg,std::set<Instruction *>> reg_instrs[num_counters];
+   std::vector<unsigned> wait_distances[num_events];
+
    wait_ctx() {}
    wait_ctx(Program *program_)
            : program(program_),
@@ -298,8 +323,53 @@ struct wait_ctx {
          barrier_events[i] |= other->barrier_events[i];
       }
 
+      /* these are used for statistics, so don't update "changed" */
+      for (unsigned i = 0; i < num_counters; i++) {
+         for (std::pair<Instruction *, unsigned> instr : other->unwaited_instrs[i]) {
+            auto pos = unwaited_instrs[i].find(instr.first);
+            if (pos == unwaited_instrs[i].end())
+               unwaited_instrs[i].insert(instr);
+            else
+               pos->second = std::min(pos->second, instr.second);
+         }
+         /* don't use a foreach loop to avoid copies */
+         for (auto it = other->reg_instrs[i].begin(); it != other->reg_instrs[i].end(); ++it)
+            reg_instrs[i][it->first].insert(it->second.begin(), it->second.end());
+      }
+
       return changed;
    }
+
+   /* Removes "counter" from "entry". When statistics are being collected and
+    * the entry still uses that counter, every instruction recorded for "reg"
+    * under this counter that is still unwaited first has its current distance
+    * appended to wait_distances, once per event of the entry that belongs to
+    * the counter, and is then dropped from the unwaited set. The register's
+    * instruction list for this counter is emptied afterwards. */
+   void wait_and_remove_from_entry(PhysReg reg, wait_entry& entry, counter_type counter) {
+      if (collect_statistics && (entry.counters & counter)) {
+         const unsigned idx = ffs(counter) - 1;
+         std::map<Instruction *, unsigned>& pending = unwaited_instrs[idx];
+         std::set<Instruction *>& tracked = reg_instrs[idx][reg];
+
+         for (Instruction *tracked_instr : tracked) {
+            auto found = pending.find(tracked_instr);
+            if (found != pending.end()) {
+               const unsigned distance = found->second;
+
+               /* u_bit_scan consumes the mask, so start from a fresh copy
+                * for every instruction */
+               unsigned remaining = entry.events & get_events_for_counter(counter);
+               while (remaining)
+                  wait_distances[u_bit_scan(&remaining)].push_back(distance);
+
+               pending.erase(found);
+            }
+         }
+         tracked.clear();
+      }
+
+      entry.remove_counter(counter);
+   }
+
+   /* Increments by one the distance stored for every instruction in
+    * unwaited_instrs, for each counter. */
+   void advance_unwaited_instrs()
+   {
+      for (auto& pending : unwaited_instrs) {
+         for (auto& instr_and_distance : pending)
+            instr_and_distance.second++;
+      }
+   }
 };
 
 wait_imm check_instr(Instruction* instr, wait_ctx& ctx)
@@ -477,13 +547,13 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx)
       while (it != ctx.gpr_map.end())
       {
          if (imm.exp != wait_imm::unset_counter && imm.exp <= it->second.imm.exp)
-            it->second.remove_counter(counter_exp);
+            ctx.wait_and_remove_from_entry(it->first, it->second, counter_exp);
          if (imm.vm != wait_imm::unset_counter && imm.vm <= it->second.imm.vm)
-            it->second.remove_counter(counter_vm);
+            ctx.wait_and_remove_from_entry(it->first, it->second, counter_vm);
          if (imm.lgkm != wait_imm::unset_counter && imm.lgkm <= it->second.imm.lgkm)
-            it->second.remove_counter(counter_lgkm);
+            ctx.wait_and_remove_from_entry(it->first, it->second, counter_lgkm);
          if (imm.lgkm != wait_imm::unset_counter && imm.vs <= it->second.imm.vs)
-            it->second.remove_counter(counter_vs);
+            ctx.wait_and_remove_from_entry(it->first, it->second, counter_vs);
          if (!it->second.counters)
             it = ctx.gpr_map.erase(it);
          else
@@ -619,6 +689,16 @@ void insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event
       if (!it.second)
          it.first->second.join(new_entry);
    }
+
+   if (ctx.collect_statistics) {
+      unsigned counters_todo = counters;
+      while (counters_todo) {
+         unsigned i = u_bit_scan(&counters_todo);
+         ctx.unwaited_instrs[i].insert(std::make_pair(ctx.gen_instr, 0u));
+         for (unsigned j = 0; j < rc.size(); j++)
+            ctx.reg_instrs[i][PhysReg{reg.reg+j}].insert(ctx.gen_instr);
+      }
+   }
 }
 
 void insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event)
@@ -758,11 +838,15 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx)
    std::vector<aco_ptr<Instruction>> new_instructions;
 
    wait_imm queued_imm;
+
+   ctx.collect_statistics = program->collect_statistics;
+
    for (aco_ptr<Instruction>& instr : block.instructions) {
       bool is_wait = !parse_wait_instr(ctx, instr.get()).empty();
 
       queued_imm.combine(kill(instr.get(), ctx));
 
+      ctx.gen_instr = instr.get();
       gen(instr.get(), ctx);
 
       if (instr->format != Format::PSEUDO_BARRIER && !is_wait) {
@@ -771,6 +855,9 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx)
             queued_imm = wait_imm();
          }
          new_instructions.emplace_back(std::move(instr));
+
+         if (ctx.collect_statistics)
+            ctx.advance_unwaited_instrs();
       }
    }
 
@@ -782,12 +869,58 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx)
 
 } /* end namespace */
 
+/* Computes a single score from the wait distances recorded in
+ * ctx[0] .. ctx[num_ctx - 1] for every event whose bit is set in event_mask.
+ *
+ * Each distance is first damped if it exceeds a per-event threshold, then
+ * raised to the power of 0.1. The results are averaged, the average is raised
+ * back to the power of 10, multiplied by 10 and rounded.
+ *
+ * Returns 0 if no wait distance was recorded for the requested events.
+ */
+static uint32_t calculate_score(unsigned num_ctx, wait_ctx *ctx, uint32_t event_mask)
+{
+   double result = 0.0;
+   unsigned num_waits = 0;
+   while (event_mask) {
+      unsigned event_index = u_bit_scan(&event_mask);
+
+      /* for many events, excessive distances provide little benefit, so
+       * decrease the score in that case. These parameters only depend on
+       * the event, so select them once per event. */
+      double threshold = INFINITY;
+      double inv_strength = 0.000001;
+      switch (1 << event_index) {
+      case event_smem:
+         threshold = 70.0;
+         inv_strength = 75.0;
+         break;
+      case event_vmem:
+      case event_vmem_store:
+      case event_flat:
+         threshold = 230.0;
+         inv_strength = 150.0;
+         break;
+      case event_lds:
+         threshold = 16.0;
+         break;
+      default:
+         break;
+      }
+
+      for (unsigned i = 0; i < num_ctx; i++) {
+         for (unsigned dist : ctx[i].wait_distances[event_index]) {
+            double score = dist;
+            if (score > threshold) {
+               score -= threshold;
+               score = threshold + score / (1.0 + score / inv_strength);
+            }
+
+            /* we don't want increases in high scores to hide decreases in low scores,
+             * so raise to the power of 0.1 before averaging. */
+            result += pow(score, 0.1);
+            num_waits++;
+         }
+      }
+   }
+
+   /* without any recorded wait the average would be 0.0 / 0 = NaN, and
+    * converting NaN to uint32_t is undefined behaviour */
+   if (num_waits == 0)
+      return 0;
+
+   return round(pow(result / num_waits, 10.0) * 10.0);
+}
+
 void insert_wait_states(Program* program)
 {
    /* per BB ctx */
    std::vector<bool> done(program->blocks.size());
    wait_ctx in_ctx[program->blocks.size()];
    wait_ctx out_ctx[program->blocks.size()];
+
    for (unsigned i = 0; i < program->blocks.size(); i++)
       in_ctx[i] = wait_ctx(program);
    std::stack<unsigned> loop_header_indices;
@@ -817,13 +950,15 @@ void insert_wait_states(Program* program)
       for (unsigned b : current.logical_preds)
          changed |= ctx.join(&out_ctx[b], true);
 
-      in_ctx[current.index] = ctx;
-
-      if (done[current.index] && !changed)
+      if (done[current.index] && !changed) {
+         in_ctx[current.index] = std::move(ctx);
          continue;
+      } else {
+         in_ctx[current.index] = ctx;
+      }
 
       if (current.instructions.empty()) {
-         out_ctx[current.index] = ctx;
+         out_ctx[current.index] = std::move(ctx);
          continue;
       }
 
@@ -832,7 +967,14 @@ void insert_wait_states(Program* program)
 
       handle_block(program, current, ctx);
 
-      out_ctx[current.index] = ctx;
+      out_ctx[current.index] = std::move(ctx);
+   }
+
+   if (program->collect_statistics) {
+      program->statistics[statistic_vmem_score] =
+         calculate_score(program->blocks.size(), out_ctx, event_vmem | event_flat | event_vmem_store);
+      program->statistics[statistic_smem_score] =
+         calculate_score(program->blocks.size(), out_ctx, event_smem);
    }
 }
 
diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp
index 104436d33c6..19c98c8196c 100644
--- a/src/amd/compiler/aco_interface.cpp
+++ b/src/amd/compiler/aco_interface.cpp
@@ -62,6 +62,8 @@ static radv_compiler_statistic_info statistic_infos[] = {
    [aco::statistic_cycles] = {"Busy Cycles", "Estimate of busy cycles"},
    [aco::statistic_vmem_clauses] = {"VMEM Clause", "Number of VMEM clauses (includes 1-sized clauses)"},
    [aco::statistic_smem_clauses] = {"SMEM Clause", "Number of SMEM clauses (includes 1-sized clauses)"},
+   [aco::statistic_vmem_score] = {"VMEM Score", "Average VMEM def-use distances"},
+   [aco::statistic_smem_score] = {"SMEM Score", "Average SMEM def-use distances"},
    [aco::statistic_sgpr_presched] = {"Pre-Sched SGPRs", "SGPR usage before scheduling"},
    [aco::statistic_vgpr_presched] = {"Pre-Sched VGPRs", "VGPR usage before scheduling"},
 };
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index c6213e0c04e..a62525016fb 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -1227,6 +1227,8 @@ enum statistic {
    statistic_cycles,
    statistic_vmem_clauses,
    statistic_smem_clauses,
+   statistic_vmem_score,
+   statistic_smem_score,
    statistic_sgpr_presched,
    statistic_vgpr_presched,
    num_statistics