Mesa (main): ir3: Introduce systall metric and new helper functions

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Fri Jan 7 15:02:56 UTC 2022


Module: Mesa
Branch: main
Commit: 7e60978d30a8c5ac6d16b7b28163b183f9263825
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=7e60978d30a8c5ac6d16b7b28163b183f9263825

Author: Connor Abbott <cwabbott0 at gmail.com>
Date:   Fri Dec 17 17:40:02 2021 +0100

ir3: Introduce systall metric and new helper functions

Add new centralized functions which will replace the various places we
hardcode 10 for the number of (ss) nops, add numbers for soft (sy) nops
based on similar computerator experiments with ldc, sam, and ldib (the
most common (sy) producers), and add a "systall" metric which is
analogous to sstall. This also fixes some cases where we'd erroneously
count ldl* as (sy) producers instead of (ss) producers when calculating
sstall.

This only switches over the metric reporting to the new functions, so
there is no behavior change. The following commit will switch over
the rest of the compiler.

While we're at it, remove max_sun as it's never set.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14246>

---

 src/freedreno/ir3/ir3.c                         | 18 ++++-
 src/freedreno/ir3/ir3.h                         | 98 +++++++++++++++++++++++++
 src/freedreno/ir3/ir3_legalize.c                |  6 +-
 src/freedreno/ir3/ir3_shader.c                  |  6 +-
 src/freedreno/ir3/ir3_shader.h                  |  1 -
 src/freedreno/vulkan/tu_pipeline.c              |  8 ++
 src/gallium/drivers/freedreno/ir3/ir3_gallium.c |  4 +-
 7 files changed, 126 insertions(+), 15 deletions(-)

diff --git a/src/freedreno/ir3/ir3.c b/src/freedreno/ir3/ir3.c
index 162a393110e..45f26fbb5fc 100644
--- a/src/freedreno/ir3/ir3.c
+++ b/src/freedreno/ir3/ir3.c
@@ -249,7 +249,7 @@ ir3_collect_info(struct ir3_shader_variant *v)
    info->sizedwords = info->size / 4;
 
    foreach_block (block, &shader->block_list) {
-      int sfu_delay = 0;
+      int sfu_delay = 0, mem_delay = 0;
 
       foreach_instr (instr, &block->instr_list) {
 
@@ -307,15 +307,25 @@ ir3_collect_info(struct ir3_shader_variant *v)
             sfu_delay = 0;
          }
 
-         if (instr->flags & IR3_INSTR_SY)
+         if (instr->flags & IR3_INSTR_SY) {
             info->sy++;
+            info->systall += mem_delay;
+            mem_delay = 0;
+         }
 
-         if (is_sfu(instr)) {
-            sfu_delay = 10;
+         if (is_ss_producer(instr)) {
+            sfu_delay = soft_ss_delay(instr);
          } else {
             int n = MIN2(sfu_delay, 1 + instr->repeat + instr->nop);
             sfu_delay -= n;
          }
+
+         if (is_sy_producer(instr)) {
+            mem_delay = soft_sy_delay(instr, shader);
+         } else {
+            int n = MIN2(mem_delay, 1 + instr->repeat + instr->nop);
+            mem_delay -= n;
+         }
       }
    }
 
diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index 3fec81d85b7..e1d6399e435 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -78,6 +78,8 @@ struct ir3_info {
 
    /* estimate of number of cycles stalled on (ss) */
    uint16_t sstall;
+   /* estimate of number of cycles stalled on (sy) */
+   uint16_t systall;
 
    uint16_t last_baryf; /* instruction # of last varying fetch */
 
@@ -1655,6 +1657,102 @@ unsigned ir3_delayslots_with_repeat(struct ir3_instruction *assigner,
 unsigned ir3_delay_calc(struct ir3_block *block,
                         struct ir3_instruction *instr, bool mergedregs);
 
+/* estimated (ss)/(sy) delay calculation */
+
+static inline bool
+is_local_mem_load(struct ir3_instruction *instr)
+{
+   return instr->opc == OPC_LDL || instr->opc == OPC_LDLV ||
+      instr->opc == OPC_LDLW;
+}
+
+/* Does this instruction need (ss) to wait for its result? */
+static inline bool
+is_ss_producer(struct ir3_instruction *instr)
+{
+   return is_sfu(instr) || is_local_mem_load(instr);
+}
+
+/* The soft delay for approximating the cost of (ss). */
+static inline unsigned
+soft_ss_delay(struct ir3_instruction *instr)
+{
+   /* On a6xx, it takes the number of delay slots to get a SFU result back (ie.
+    * using nop's instead of (ss) is:
+    *
+    *     8 - single warp
+    *     9 - two warps
+    *    10 - four warps
+    *
+    * and so on. Not quite sure where it tapers out (ie. how many warps share an
+    * SFU unit). But 10 seems like a reasonable # to choose:
+    */
+   return 10;
+}
+
+static inline bool
+is_sy_producer(struct ir3_instruction *instr)
+{
+   return is_tex_or_prefetch(instr) ||
+      (is_load(instr) && !is_local_mem_load(instr)) ||
+      is_atomic(instr->opc);
+}
+
+static inline unsigned
+soft_sy_delay(struct ir3_instruction *instr, struct ir3 *shader)
+{
+   /* TODO: this is just an optimistic guess, we can do better post-RA.
+    */
+   bool double_wavesize =
+      shader->type == MESA_SHADER_FRAGMENT ||
+      shader->type == MESA_SHADER_COMPUTE;
+
+   unsigned components = reg_elems(instr->dsts[0]);
+
+   /* These numbers come from counting the number of delay slots to get
+    * cat5/cat6 results back using nops instead of (sy). Note that these numbers
+    * are with the result preloaded to cache by loading it before in the same
+    * shader - uncached results are much larger.
+    *
+    * Note: most ALU instructions can't complete at the full doubled rate, so
+    * they take 2 cycles. The only exception is fp16 instructions with no
+    * built-in conversions. Therefore divide the latency by 2.
+    *
+    * TODO: Handle this properly in the scheduler and remove this.
+    */
+   if (instr->opc == OPC_LDC) {
+      if (double_wavesize)
+         return (21 + 8 * components) / 2;
+      else
+         return 18 + 4 * components;
+   } else if (is_tex_or_prefetch(instr)) {
+      if (double_wavesize) {
+         switch (components) {
+         case 1: return 58 / 2;
+         case 2: return 60 / 2;
+         case 3: return 77 / 2;
+         case 4: return 79 / 2;
+         default: unreachable("bad number of components");
+         }
+      } else {
+         switch (components) {
+         case 1: return 51;
+         case 2: return 53;
+         case 3: return 62;
+         case 4: return 64;
+         default: unreachable("bad number of components");
+         }
+      }
+   } else {
+      /* TODO: measure other cat6 opcodes like ldg */
+      if (double_wavesize)
+         return (172 + components) / 2;
+      else
+         return 109 + components;
+   }
+}
+
+
 /* unreachable block elimination: */
 bool ir3_remove_unreachable(struct ir3 *ir);
 
diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c
index eaa393bc1a2..bf8906f79b5 100644
--- a/src/freedreno/ir3/ir3_legalize.c
+++ b/src/freedreno/ir3/ir3_legalize.c
@@ -264,11 +264,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
          ir3_NOP(block)->flags |= IR3_INSTR_SS;
          last_input_needs_ss = false;
       } else if (is_load(n)) {
-         /* seems like ldlv needs (ss) bit instead??  which is odd but
-          * makes a bunch of flat-varying tests start working on a4xx.
-          */
-         if ((n->opc == OPC_LDLV) || (n->opc == OPC_LDL) ||
-             (n->opc == OPC_LDLW))
+         if (is_local_mem_load(n))
             regmask_set(&state->needs_ss, n->dsts[0]);
          else
             regmask_set(&state->needs_sy, n->dsts[0]);
diff --git a/src/freedreno/ir3/ir3_shader.c b/src/freedreno/ir3/ir3_shader.c
index c81ff1ed77d..965d20db5e5 100644
--- a/src/freedreno/ir3/ir3_shader.c
+++ b/src/freedreno/ir3/ir3_shader.c
@@ -790,9 +790,9 @@ ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out)
 
    fprintf(
       out,
-      "; %s prog %d/%d: %u sstall, %u (ss), %u (sy), %d max_sun, %d loops\n",
-      type, so->shader->id, so->id, so->info.sstall, so->info.ss, so->info.sy,
-      so->max_sun, so->loops);
+      "; %s prog %d/%d: %u sstall, %u (ss), %u systall, %u (sy), %d loops\n",
+      type, so->shader->id, so->id, so->info.sstall, so->info.ss,
+      so->info.systall, so->info.sy, so->loops);
 
    /* print shader type specific info: */
    switch (so->type) {
diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h
index 6dc005ea9f3..f4588512a15 100644
--- a/src/freedreno/ir3/ir3_shader.h
+++ b/src/freedreno/ir3/ir3_shader.h
@@ -538,7 +538,6 @@ struct ir3_shader_variant {
     */
    unsigned branchstack;
 
-   unsigned max_sun;
    unsigned loops;
 
    /* the instructions length is in units of instruction groups
diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c
index 6694913d6d1..2749a4cfa6e 100644
--- a/src/freedreno/vulkan/tu_pipeline.c
+++ b/src/freedreno/vulkan/tu_pipeline.c
@@ -3584,6 +3584,14 @@ tu_GetPipelineExecutableStatisticsKHR(
       stat->value.u64 = exe->stats.sstall;
    }
 
+   vk_outarray_append(&out, stat) {
+      WRITE_STR(stat->name, "Estimated cycles stalled on SY");
+      WRITE_STR(stat->description,
+                "A better metric to estimate the impact of SY syncs.");
+      stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
+      stat->value.u64 = exe->stats.systall;
+   }
+
    for (int i = 0; i < ARRAY_SIZE(exe->stats.instrs_per_cat); i++) {
       vk_outarray_append(&out, stat) {
          WRITE_STR(stat->name, "cat%d instructions", i);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
index 06ea8fec32c..92c6527c894 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c
@@ -85,7 +85,7 @@ dump_shader_info(struct ir3_shader_variant *v,
       "%s shader: %u inst, %u nops, %u non-nops, %u mov, %u cov, "
       "%u dwords, %u last-baryf, %u half, %u full, %u constlen, "
       "%u cat0, %u cat1, %u cat2, %u cat3, %u cat4, %u cat5, %u cat6, %u cat7, "
-      "%u stp, %u ldp, %u sstall, %u (ss), %u (sy), %d waves, %d max_sun, "
+      "%u stp, %u ldp, %u sstall, %u (ss), %u systall, %u (sy), %d waves, "
       "%d loops\n",
       ir3_shader_stage(v), v->info.instrs_count, v->info.nops_count,
       v->info.instrs_count - v->info.nops_count, v->info.mov_count,
@@ -96,7 +96,7 @@ dump_shader_info(struct ir3_shader_variant *v,
       v->info.instrs_per_cat[4], v->info.instrs_per_cat[5],
       v->info.instrs_per_cat[6], v->info.instrs_per_cat[7],
       v->info.stp_count, v->info.ldp_count, v->info.sstall,
-      v->info.ss, v->info.sy, v->info.max_waves, v->max_sun, v->loops);
+      v->info.ss, v->info.systall, v->info.sy, v->info.max_waves, v->loops);
 }
 
 static void



More information about the mesa-commit mailing list