Mesa (master): broadcom/compiler: let QPUs stall on TMU input/config overflows

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Thu Feb 4 11:09:20 UTC 2021


Module: Mesa
Branch: master
Commit: 6630825dcfb384a17947c9f98f1cab3c157d2c0b
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=6630825dcfb384a17947c9f98f1cab3c157d2c0b

Author: Iago Toral Quiroga <itoral at igalia.com>
Date:   Wed Feb  3 09:14:00 2021 +0100

broadcom/compiler: let QPUs stall on TMU input/config overflows

We have been trying to avoid this by tracking fifo usages in the driver and
flushing all outstanding TMU sequences if we overflowed any of these, however,
this is actually not the most efficient strategy. Instead, we would like to
flush only enough operations to get things going again, which is better for
pipelining. Doing that in the driver would require some additional work, but
thankfully, it is not required, since this seems to be what the hardware does
automatically, so we can just remove overflow tracking for these two fifos
and enjoy the benefits.

This also further improves shader-db stats:

total instructions in shared programs: 8975062 -> 8955145 (-0.22%)
instructions in affected programs: 1637624 -> 1617707 (-1.22%)
helped: 4050
HURT: 2241
Instructions are helped.

total threads in shared programs: 236802 -> 237042 (0.10%)
threads in affected programs: 252 -> 492 (95.24%)
helped: 122
HURT: 2
Threads are helped.

total sfu-stalls in shared programs: 19901 -> 19592 (-1.55%)
sfu-stalls in affected programs: 4744 -> 4435 (-6.51%)
helped: 1248
HURT: 1051
Sfu-stalls are helped.

total inst-and-stalls in shared programs: 8994963 -> 8974737 (-0.22%)
inst-and-stalls in affected programs: 1636184 -> 1615958 (-1.24%)
helped: 4050
HURT: 2239
Inst-and-stalls are helped.

Reviewed-by: Alejandro Piñeiro <apinheiro at igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8825>

---

 src/broadcom/compiler/nir_to_vir.c   | 56 ++++++++++++++++--------------------
 src/broadcom/compiler/v3d40_tex.c    | 10 +++----
 src/broadcom/compiler/v3d_compiler.h | 27 +++++++++++++----
 3 files changed, 50 insertions(+), 43 deletions(-)

diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 48e9c9b9255..ce56181f154 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -203,25 +203,23 @@ v3d_general_tmu_op(nir_intrinsic_instr *instr)
 }
 
 /**
- * Checks if pipelining a new TMU operation requiring 'components' LDTMUs and
- * 'writes' TMU register writes would overflow any of the TMU fifos.
+ * Checks if pipelining a new TMU operation requiring 'components' LDTMUs
+ * would overflow the Output TMU fifo.
+ *
+ * It is not allowed to overflow the Output fifo, however, we can overflow
+ * Input and Config fifos. Doing that makes the shader stall, but only for as
+ * long as it needs to be able to continue so it is better for pipelining to
+ * let the QPU stall on these if needed than trying to emit TMU flushes in the
+ * driver.
  */
 bool
-ntq_tmu_fifo_overflow(struct v3d_compile *c,
-                      uint32_t components,
-                      uint32_t writes)
+ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components)
 {
-        if (c->tmu.input_fifo_size + writes > 16 / c->threads)
+        if (c->tmu.flush_count >= MAX_TMU_QUEUE_SIZE)
                 return true;
 
-        /* Output and Config fifos are only involved with TMU lookups */
-        if (components > 0 &&
-            (c->tmu.config_fifo_size + 1 > 8 / c->threads ||
-             c->tmu.output_fifo_size + components > 16 / c->threads)) {
-                return true;
-        }
-
-        return false;
+        return components > 0 &&
+               c->tmu.output_fifo_size + components > 16 / c->threads;
 }
 
 /**
@@ -254,8 +252,6 @@ ntq_flush_tmu(struct v3d_compile *c)
                 }
         }
 
-        c->tmu.input_fifo_size = 0;
-        c->tmu.config_fifo_size = 0;
         c->tmu.output_fifo_size = 0;
         c->tmu.flush_count = 0;
         _mesa_set_clear(c->tmu.outstanding_regs, NULL);
@@ -269,15 +265,12 @@ ntq_flush_tmu(struct v3d_compile *c)
 void
 ntq_add_pending_tmu_flush(struct v3d_compile *c,
                           nir_dest *dest,
-                          uint32_t component_mask,
-                          uint32_t tmu_writes)
+                          uint32_t component_mask)
 {
         const uint32_t num_components = util_bitcount(component_mask);
-        assert(!ntq_tmu_fifo_overflow(c, num_components, tmu_writes));
+        assert(!ntq_tmu_fifo_overflow(c, num_components));
 
-        c->tmu.input_fifo_size += tmu_writes;
         if (num_components > 0) {
-                c->tmu.config_fifo_size += 1;
                 c->tmu.output_fifo_size += num_components;
                 if (!dest->is_ssa)
                         _mesa_set_add(c->tmu.outstanding_regs, dest->reg.reg);
@@ -544,14 +537,14 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
         }
 
         /* We are ready to emit TMU register writes now, but before we actually
-         * emit them we need to know the amount of writes we will require
-         * and we need to flush outstanding TMU operations if any of the writes
-         * reads from the result of an outstanding TMU operation before we emit
-         * any of the writes for the current operation to avoid corrupting its
-         * TMU sequence. To do this we run this logic twice, the first time
-         * it will count register writes and flush pending TMU requests if
-         * necessary due to a dependency, and the second one will emit the
-         * actual TMU writes.
+         * emit them we need to flush outstanding TMU operations if any of our
+         * writes reads from the result of an outstanding TMU operation before
+         * we start the TMU sequence for this operation, since otherwise the
+         * flush could happen in the middle of the TMU sequence we are about to
+         * emit, which is illegal. To do this we run this logic twice, the
+         * first time it will count required register writes and flush pending
+         * TMU requests if necessary due to a dependency, and the second one
+         * will emit the actual TMU writes.
          */
         const uint32_t dest_components = nir_intrinsic_dest_components(instr);
         uint32_t base_const_offset = const_offset;
@@ -623,7 +616,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
                                 /* If pipelining this TMU operation would
                                  * overflow TMU fifos, we need to flush.
                                  */
-                                if (ntq_tmu_fifo_overflow(c, dest_components, tmu_writes))
+                                if (ntq_tmu_fifo_overflow(c, dest_components))
                                         ntq_flush_tmu(c);
                         } else {
                                 /* Delay emission of the thread switch and
@@ -633,8 +626,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
                                 const uint32_t component_mask =
                                         (1 << dest_components) - 1;
                                 ntq_add_pending_tmu_flush(c, &instr->dest,
-                                                          component_mask,
-                                                          tmu_writes);
+                                                          component_mask);
                         }
                 }
         } while (is_store && writemask != 0);
diff --git a/src/broadcom/compiler/v3d40_tex.c b/src/broadcom/compiler/v3d40_tex.c
index 2c733c215a9..40092fb3f47 100644
--- a/src/broadcom/compiler/v3d40_tex.c
+++ b/src/broadcom/compiler/v3d40_tex.c
@@ -262,7 +262,7 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
         */
         const unsigned dest_components =
            util_bitcount(p0_unpacked.return_words_of_texture_data);
-        if (ntq_tmu_fifo_overflow(c, dest_components, tmu_writes))
+        if (ntq_tmu_fifo_overflow(c, dest_components))
                 ntq_flush_tmu(c);
 
         /* Process tex sources emitting corresponding TMU writes */
@@ -380,8 +380,7 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
         }
 
         ntq_add_pending_tmu_flush(c, &instr->dest,
-                                  p0_unpacked.return_words_of_texture_data,
-                                  tmu_writes);
+                                  p0_unpacked.return_words_of_texture_data);
 }
 
 static uint32_t
@@ -591,7 +590,7 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c,
        /* If pipelining this TMU operation would overflow TMU fifos, we need
         * to flush any outstanding TMU operations.
         */
-        if (ntq_tmu_fifo_overflow(c, instr_return_channels, tmu_writes))
+        if (ntq_tmu_fifo_overflow(c, instr_return_channels))
                 ntq_flush_tmu(c);
 
         vir_WRTMUC(c, QUNIFORM_IMAGE_TMU_CONFIG_P0, p0_packed);
@@ -603,6 +602,5 @@ v3d40_vir_emit_image_load_store(struct v3d_compile *c,
         vir_image_emit_register_writes(c, instr, atomic_add_replaced, NULL);
 
         ntq_add_pending_tmu_flush(c, &instr->dest,
-                                  p0_unpacked.return_words_of_texture_data,
-                                  tmu_writes);
+                                  p0_unpacked.return_words_of_texture_data);
 }
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index 5353fe0a1c3..ca380f6b0c8 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -42,6 +42,25 @@
 #include "qpu/qpu_instr.h"
 #include "pipe/p_state.h"
 
+/**
+ * Maximum number of outstanding TMU operations we can queue for execution.
+ *
+ * This is mostly limited by the size of the TMU fifos. The Input and Config
+ * fifos can stall, but we prefer that to injecting TMU flushes manually
+ * in the driver, so we can ignore these, but we can't overflow the Output fifo,
+ * which has 16 / threads per-thread entries, meaning that the maximum number
+ * of outstanding LDTMUs we can ever have is 8, for a 2-way threaded shader.
+ * This means that at most we can have 8 outstanding TMU loads, if each load
+ * is just one component.
+ *
+ * NOTE: we could actually have a larger value here because TMU stores don't
+ * consume any entries in the Output fifo (so we could have any number of
+ * outstanding stores) and the driver keeps track of used Output fifo entries
+ * and will flush if we ever need more than 8, but since loads are much more
+ * common than stores, it is probably not worth it.
+ */
+#define MAX_TMU_QUEUE_SIZE 8
+
 struct nir_builder;
 
 struct v3d_fs_inputs {
@@ -573,15 +592,13 @@ struct v3d_compile {
                  */
                 struct set *outstanding_regs;
 
-                uint32_t input_fifo_size;
-                uint32_t config_fifo_size;
                 uint32_t output_fifo_size;
 
                 struct {
                         nir_dest *dest;
                         uint8_t num_components;
                         uint8_t component_mask;
-                } flush[8]; /* 16 entries / 2 threads for input/output fifos */
+                } flush[MAX_TMU_QUEUE_SIZE];
                 uint32_t flush_count;
         } tmu;
 
@@ -943,9 +960,9 @@ uint8_t vir_channels_written(struct qinst *inst);
 struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i);
 void ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
                     struct qreg result);
-bool ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components, uint32_t writes);
+bool ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components);
 void ntq_add_pending_tmu_flush(struct v3d_compile *c, nir_dest *dest,
-                               uint32_t component_mask, uint32_t tmu_writes);
+                               uint32_t component_mask);
 void ntq_flush_tmu(struct v3d_compile *c);
 void vir_emit_thrsw(struct v3d_compile *c);
 



More information about the mesa-commit mailing list