Mesa (master): broadcom/compiler: support pipelining of tex instructions

Thu Feb 4 11:09:20 UTC 2021

Module: Mesa
Branch: master
Commit: be45960d3e23cfa9a96b5f15d64f8936be0f8f28
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=be45960d3e23cfa9a96b5f15d64f8936be0f8f28

Author: Iago Toral Quiroga <itoral at igalia.com>
Date:   Wed Jan 27 09:45:52 2021 +0100

broadcom/compiler: support pipelining of tex instructions

This follows the same approach already used for general TMU instructions:
we reuse the existing infrastructure to first count the required register
writes and flush any outstanding TMU dependencies, and only then emit the
actual writes. This requires moving the code that decides which register
writes are needed into a helper.

We also need to start tracking a component mask, instead of just the number
of components, for the data we read back from a particular TMU operation,
since a texture operation may return a non-contiguous subset of channels.

v2: update tmu_writes for V3D_QPU_WADDR_TMUOFF

Reviewed-by: Alejandro Piñeiro <apinheiro at igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8825>

---

 src/broadcom/compiler/nir_to_vir.c   |  31 ++--
 src/broadcom/compiler/v3d33_tex.c    |   6 +-
 src/broadcom/compiler/v3d40_tex.c    | 324 ++++++++++++++++++++++-------------
 src/broadcom/compiler/v3d_compiler.h |   6 +-
 4 files changed, 231 insertions(+), 136 deletions(-)

diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 2bf591fbac5..2ed84146735 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -206,8 +206,10 @@ v3d_general_tmu_op(nir_intrinsic_instr *instr)
  * Checks if pipelining a new TMU operation requiring 'components' LDTMUs and
  * 'writes' TMU register writes would overflow any of the TMU fifos.
  */
-static bool
-tmu_fifo_overflow(struct v3d_compile *c, uint32_t components, uint32_t writes)
+bool
+ntq_tmu_fifo_overflow(struct v3d_compile *c,
+                      uint32_t components,
+                      uint32_t writes)
 {
         if (c->tmu.input_fifo_size + writes > 16 / c->threads)
                 return true;
@@ -236,13 +238,15 @@ ntq_flush_tmu(struct v3d_compile *c)
 
         bool emitted_tmuwt = false;
         for (int i = 0; i < c->tmu.flush_count; i++) {
-                if (c->tmu.flush[i].num_components > 0) {
+                if (c->tmu.flush[i].component_mask > 0) {
                         nir_dest *dest = c->tmu.flush[i].dest;
                         assert(dest);
 
-                        for (int j = 0; j < c->tmu.flush[i].num_components; j++) {
-                                ntq_store_dest(c, dest, j,
-                                               vir_MOV(c, vir_LDTMU(c)));
+                        for (int j = 0; j < 4; j++) {
+                                if (c->tmu.flush[i].component_mask & (1 << j)) {
+                                        ntq_store_dest(c, dest, j,
+                                                       vir_MOV(c, vir_LDTMU(c)));
+                                }
                         }
                 } else if (!emitted_tmuwt) {
                         vir_TMUWT(c);
@@ -262,13 +266,14 @@ ntq_flush_tmu(struct v3d_compile *c)
  * is reponsible for ensuring that doing this doesn't overflow the TMU fifos,
  * and more specifically, the output fifo, since that can't stall.
  */
-static void
+void
 ntq_add_pending_tmu_flush(struct v3d_compile *c,
                           nir_dest *dest,
-                          uint32_t num_components,
+                          uint32_t component_mask,
                           uint32_t tmu_writes)
 {
-        assert(!tmu_fifo_overflow(c, num_components, tmu_writes));
+        const uint32_t num_components = util_bitcount(component_mask);
+        assert(!ntq_tmu_fifo_overflow(c, num_components, tmu_writes));
 
         c->tmu.input_fifo_size += tmu_writes;
         if (num_components > 0) {
@@ -279,7 +284,7 @@ ntq_add_pending_tmu_flush(struct v3d_compile *c,
         }
 
         c->tmu.flush[c->tmu.flush_count].dest = dest;
-        c->tmu.flush[c->tmu.flush_count].num_components = num_components;
+        c->tmu.flush[c->tmu.flush_count].component_mask = component_mask;
         c->tmu.flush_count++;
 }
 
@@ -615,15 +620,17 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
                                 /* If pipelining this TMU operation would
                                  * overflow TMU fifos, we need to flush.
                                  */
-                                if (tmu_fifo_overflow(c, dest_components, tmu_writes))
+                                if (ntq_tmu_fifo_overflow(c, dest_components, tmu_writes))
                                         ntq_flush_tmu(c);
                         } else {
                                 /* Delay emission of the thread switch and
                                  * LDTMU/TMUWT until we really need to do it to
                                  * improve pipelining.
                                  */
+                                const uint32_t component_mask =
+                                        (1 << dest_components) - 1;
                                 ntq_add_pending_tmu_flush(c, &instr->dest,
-                                                          dest_components,
+                                                          component_mask,
                                                           tmu_writes);
                         }
                 }
diff --git a/src/broadcom/compiler/v3d33_tex.c b/src/broadcom/compiler/v3d33_tex.c
index 386453289c3..b933635f6fe 100644
--- a/src/broadcom/compiler/v3d33_tex.c
+++ b/src/broadcom/compiler/v3d33_tex.c
@@ -33,7 +33,11 @@
 void
 v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
 {
-        /* FIXME: allow tex pipelining */
+        /* FIXME: We don't bother implementing pipelining for texture reads
+         * for any pre 4.x hardware. It should be straightforward to do but
+         * we are not really testing or even targeting this hardware at
+         * present.
+         */
         ntq_flush_tmu(c);
 
         unsigned unit = instr->texture_index;
diff --git a/src/broadcom/compiler/v3d40_tex.c b/src/broadcom/compiler/v3d40_tex.c
index f999c8b8619..73a1d539aab 100644
--- a/src/broadcom/compiler/v3d40_tex.c
+++ b/src/broadcom/compiler/v3d40_tex.c
@@ -39,7 +39,8 @@ vir_TMU_WRITE(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val,
          */
         vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val);
 
-        (*tmu_writes)++;
+        if (tmu_writes)
+                (*tmu_writes)++;
 }
 
 static void
@@ -58,124 +59,184 @@ static const struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked_default = {
         .op = V3D_TMU_OP_REGULAR,
 };
 
-void
-v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
+/**
+ * If 'tmu_writes' is not NULL, then it just counts required register writes,
+ * otherwise, it emits the actual register writes.
+ *
+ * It is important to notice that emitting register writes for the current
+ * TMU operation may trigger a TMU flush, since it is possible that any
+ * of the inputs required for the register writes is the result of a pending
+ * TMU operation. If that happens we need to make sure that it doesn't happen
+ * in the middle of the TMU register writes for the current TMU operation,
+ * which is why we always call ntq_get_src() even if we are only interested in
+ * register write counts.
+ */
+static void
+handle_tex_src(struct v3d_compile *c,
+               nir_tex_instr *instr,
+               unsigned src_idx,
+               unsigned non_array_components,
+               struct V3D41_TMU_CONFIG_PARAMETER_2 *p2_unpacked,
+               struct qreg *s_out,
+               unsigned *tmu_writes)
 {
-        /* FIXME: allow tex pipelining */
-        ntq_flush_tmu(c);
-
-        unsigned texture_idx = instr->texture_index;
-        unsigned sampler_idx = instr->sampler_index;
-
-        int tmu_writes = 0;
-
-        struct V3D41_TMU_CONFIG_PARAMETER_0 p0_unpacked = {
-        };
-
-        assert(instr->op != nir_texop_lod || c->devinfo->ver >= 42);
-
-        struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked = {
-                .op = V3D_TMU_OP_REGULAR,
-
-                .gather_mode = instr->op == nir_texop_tg4,
-                .gather_component = instr->component,
-
-                .coefficient_mode = instr->op == nir_texop_txd,
-
-                .disable_autolod = instr->op == nir_texop_tg4
-        };
-
-        int non_array_components =
-           instr->op != nir_texop_lod ?
-           instr->coord_components - instr->is_array :
-           instr->coord_components;
+        /* Either we are calling this just to count required TMU writes, or we
+         * are calling this to emit the actual TMU writes.
+         */
+        assert(tmu_writes || (s_out && p2_unpacked));
 
         struct qreg s;
+        switch (instr->src[src_idx].src_type) {
+        case nir_tex_src_coord:
+                /* S triggers the lookup, so save it for the end. */
+                s = ntq_get_src(c, instr->src[src_idx].src, 0);
+                if (tmu_writes)
+                        (*tmu_writes)++;
+                else
+                        *s_out = s;
+
+                if (non_array_components > 1) {
+                        struct qreg src =
+                                ntq_get_src(c, instr->src[src_idx].src, 1);
+                        if (tmu_writes)
+                                (*tmu_writes)++;
+                        else
+                                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUT, src, NULL);
+                }
 
-        for (unsigned i = 0; i < instr->num_srcs; i++) {
-                switch (instr->src[i].src_type) {
-                case nir_tex_src_coord:
-                        /* S triggers the lookup, so save it for the end. */
-                        s = ntq_get_src(c, instr->src[i].src, 0);
-
-                        if (non_array_components > 1) {
-                                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUT,
-                                              ntq_get_src(c, instr->src[i].src,
-                                                          1), &tmu_writes);
-                        }
-                        if (non_array_components > 2) {
-                                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUR,
-                                              ntq_get_src(c, instr->src[i].src,
-                                                          2), &tmu_writes);
-                        }
-
-                        if (instr->is_array) {
-                                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUI,
-                                              ntq_get_src(c, instr->src[i].src,
-                                                          instr->coord_components - 1),
-                                              &tmu_writes);
-                        }
-                        break;
+                if (non_array_components > 2) {
+                        struct qreg src =
+                                ntq_get_src(c, instr->src[src_idx].src, 2);
+                        if (tmu_writes)
+                                (*tmu_writes)++;
+                        else
+                                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUR, src, NULL);
+                }
 
-                case nir_tex_src_bias:
-                        vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUB,
-                                      ntq_get_src(c, instr->src[i].src, 0),
-                                      &tmu_writes);
-                        break;
+                if (instr->is_array) {
+                        struct qreg src =
+                                ntq_get_src(c, instr->src[src_idx].src,
+                                            instr->coord_components - 1);
+                        if (tmu_writes)
+                                (*tmu_writes)++;
+                        else
+                                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUI, src, NULL);
+                }
+                break;
 
-                case nir_tex_src_lod:
-                        vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUB,
-                                      ntq_get_src(c, instr->src[i].src, 0),
-                                      &tmu_writes);
+        case nir_tex_src_bias: {
+                struct qreg src = ntq_get_src(c, instr->src[src_idx].src, 0);
+                if (tmu_writes)
+                        (*tmu_writes)++;
+                else
+                        vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUB, src, NULL);
+                break;
+        }
 
-                        /* With texel fetch automatic LOD is already disabled,
-                         * and disable_autolod must not be enabled. For
-                         * non-cubes we can use the register TMUSLOD, that
-                         * implicitly sets disable_autolod.
-                         */
-                        if (instr->op != nir_texop_txf &&
-                            instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
-                                p2_unpacked.disable_autolod = true;
-                        }
-                        break;
+        case nir_tex_src_lod: {
+                struct qreg src = ntq_get_src(c, instr->src[src_idx].src, 0);
+                if (tmu_writes) {
+                        (*tmu_writes)++;
+                } else {
+                         vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUB, src, NULL);
+
+                         /* With texel fetch automatic LOD is already disabled,
+                          * and disable_autolod must not be enabled. For
+                          * non-cubes we can use the register TMUSLOD, that
+                          * implicitly sets disable_autolod.
+                          */
+                          if (instr->op != nir_texop_txf &&
+                              instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
+                                  p2_unpacked->disable_autolod = true;
+                          }
+               }
+               break;
+        }
 
-                case nir_tex_src_comparator:
-                        vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUDREF,
-                                      ntq_get_src(c, instr->src[i].src, 0),
-                                      &tmu_writes);
-                        break;
+        case nir_tex_src_comparator: {
+                struct qreg src = ntq_get_src(c, instr->src[src_idx].src, 0);
+                if (tmu_writes)
+                        (*tmu_writes)++;
+                else
+                        vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUDREF, src , NULL);
+                break;
+        }
 
-                case nir_tex_src_offset: {
-                        if (nir_src_is_const(instr->src[i].src)) {
-                                p2_unpacked.offset_s = nir_src_comp_as_int(instr->src[i].src, 0);
+        case nir_tex_src_offset: {
+                bool is_const_offset = nir_src_is_const(instr->src[src_idx].src);
+                if (is_const_offset) {
+                        if (!tmu_writes) {
+                                p2_unpacked->offset_s =
+                                        nir_src_comp_as_int(instr->src[src_idx].src, 0);
                                 if (non_array_components >= 2)
-                                        p2_unpacked.offset_t =
-                                                nir_src_comp_as_int(instr->src[i].src, 1);
+                                        p2_unpacked->offset_t =
+                                                nir_src_comp_as_int(instr->src[src_idx].src, 1);
                                 if (non_array_components >= 3)
-                                        p2_unpacked.offset_r =
-                                                nir_src_comp_as_int(instr->src[i].src, 2);
-                        } else {
+                                        p2_unpacked->offset_r =
+                                                nir_src_comp_as_int(instr->src[src_idx].src, 2);
+                        }
+                } else {
+                        struct qreg src_0 =
+                                ntq_get_src(c, instr->src[src_idx].src, 0);
+                        struct qreg src_1 =
+                                ntq_get_src(c, instr->src[src_idx].src, 1);
+                        if (!tmu_writes) {
                                 struct qreg mask = vir_uniform_ui(c, 0xf);
                                 struct qreg x, y, offset;
 
-                                x = vir_AND(c, ntq_get_src(c, instr->src[i].src,
-                                                           0), mask);
-                                y = vir_AND(c, ntq_get_src(c, instr->src[i].src,
-                                                           1), mask);
+                                x = vir_AND(c, src_0, mask);
+                                y = vir_AND(c, src_1, mask);
                                 offset = vir_OR(c, x,
-                                                vir_SHL(c, y,
-                                                        vir_uniform_ui(c, 4)));
+                                                vir_SHL(c, y, vir_uniform_ui(c, 4)));
 
-                                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUOFF,
-                                              offset, &tmu_writes);
+                                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUOFF, offset, NULL);
+                        } else {
+                                (*tmu_writes)++;
                         }
-                        break;
                 }
+                break;
+        }
 
-                default:
-                        unreachable("unknown texture source");
-                }
+        default:
+                unreachable("unknown texture source");
+        }
+}
+
+static void
+vir_tex_handle_srcs(struct v3d_compile *c,
+                    nir_tex_instr *instr,
+                    struct V3D41_TMU_CONFIG_PARAMETER_2 *p2_unpacked,
+                    struct qreg *s,
+                    unsigned *tmu_writes)
+{
+        unsigned non_array_components = instr->op != nir_texop_lod ?
+                instr->coord_components - instr->is_array :
+                instr->coord_components;
+
+        for (unsigned i = 0; i < instr->num_srcs; i++) {
+                handle_tex_src(c, instr, i, non_array_components,
+                               p2_unpacked, s, tmu_writes);
         }
+}
+
+static unsigned
+get_required_tmu_writes(struct v3d_compile *c, nir_tex_instr *instr)
+{
+        unsigned tmu_writes = 0;
+        vir_tex_handle_srcs(c, instr, NULL, NULL, &tmu_writes);
+        return tmu_writes;
+}
+
+void
+v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
+{
+        assert(instr->op != nir_texop_lod || c->devinfo->ver >= 42);
+
+        unsigned texture_idx = instr->texture_index;
+        unsigned sampler_idx = instr->sampler_index;
+
+        struct V3D41_TMU_CONFIG_PARAMETER_0 p0_unpacked = {
+        };
 
         /* Limit the number of channels returned to both how many the NIR
          * instruction writes and how many the instruction could produce.
@@ -184,9 +245,36 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
                 instr->dest.is_ssa ?
                 nir_ssa_def_components_read(&instr->dest.ssa) :
                 (1 << instr->dest.reg.reg->num_components) - 1;
-
         assert(p0_unpacked.return_words_of_texture_data != 0);
 
+        struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked = {
+                .op = V3D_TMU_OP_REGULAR,
+                .gather_mode = instr->op == nir_texop_tg4,
+                .gather_component = instr->component,
+                .coefficient_mode = instr->op == nir_texop_txd,
+                .disable_autolod = instr->op == nir_texop_tg4
+        };
+
+        const unsigned tmu_writes = get_required_tmu_writes(c, instr);
+
+        /* The input FIFO has 16 slots across all threads so if we require
+         * more than that we need to lower thread count.
+         */
+        while (tmu_writes > 16 / c->threads)
+                c->threads /= 2;
+
+       /* If pipelining this TMU operation would overflow TMU fifos, we need
+        * to flush any outstanding TMU operations.
+        */
+        const unsigned dest_components =
+           util_bitcount(p0_unpacked.return_words_of_texture_data);
+        if (ntq_tmu_fifo_overflow(c, dest_components, tmu_writes))
+                ntq_flush_tmu(c);
+
+        /* Process tex sources emitting corresponding TMU writes */
+        struct qreg s = { };
+        vir_tex_handle_srcs(c, instr, &p2_unpacked, &s, NULL);
+
         uint32_t p0_packed;
         V3D41_TMU_CONFIG_PARAMETER_0_pack(NULL,
                                           (uint8_t *)&p0_packed,
@@ -216,15 +304,15 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
          * itself, we still need to add the sampler configuration
          * parameter if the output is 32 bit
          */
-        bool output_type_32_bit = (c->key->sampler[sampler_idx].return_size == 32 &&
-                                   !instr->is_shadow);
+        bool output_type_32_bit =
+                c->key->sampler[sampler_idx].return_size == 32 &&
+                !instr->is_shadow;
 
-        /*
-         * p1 is optional, but we can skip it only if p2 can be skipped too
-         */
+        /* p1 is optional, but we can skip it only if p2 can be skipped too */
         bool needs_p2_config =
                 (instr->op == nir_texop_lod ||
-                 memcmp(&p2_unpacked, &p2_unpacked_default, sizeof(p2_unpacked)) != 0);
+                 memcmp(&p2_unpacked, &p2_unpacked_default,
+                        sizeof(p2_unpacked)) != 0);
 
         /* To handle the cases were we can't just use p1_unpacked_default */
         bool non_default_p1_config = nir_tex_instr_need_sampler(instr) ||
@@ -285,29 +373,21 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
         if (needs_p2_config)
                 vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);
 
+        /* Emit retiring TMU write */
         if (instr->op == nir_texop_txf) {
                 assert(instr->sampler_dim != GLSL_SAMPLER_DIM_CUBE);
-                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s, &tmu_writes);
+                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s, NULL);
         } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
-                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s, &tmu_writes);
+                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s, NULL);
         } else if (instr->op == nir_texop_txl) {
-                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s, &tmu_writes);
+                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s, NULL);
         } else {
-                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s, &tmu_writes);
+                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s, NULL);
         }
 
-        vir_emit_thrsw(c);
-
-        /* The input FIFO has 16 slots across all threads, so make sure we
-         * don't overfill our allocation.
-         */
-        while (tmu_writes > 16 / c->threads)
-                c->threads /= 2;
-
-        for (int i = 0; i < 4; i++) {
-                if (p0_unpacked.return_words_of_texture_data & (1 << i))
-                        ntq_store_dest(c, &instr->dest, i, vir_LDTMU(c));
-        }
+        ntq_add_pending_tmu_flush(c, &instr->dest,
+                                  p0_unpacked.return_words_of_texture_data,
+                                  tmu_writes);
 }
 
 static uint32_t
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index d617168ddd5..d75a6203ba9 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -579,7 +579,8 @@ struct v3d_compile {
 
                 struct {
                         nir_dest *dest;
-                        uint32_t num_components;
+                        uint8_t num_components;
+                        uint8_t component_mask;
                 } flush[8]; /* 16 entries / 2 threads for input/output fifos */
                 uint32_t flush_count;
         } tmu;
@@ -936,6 +937,9 @@ uint8_t vir_channels_written(struct qinst *inst);
 struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i);
 void ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
                     struct qreg result);
+bool ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components, uint32_t writes);
+void ntq_add_pending_tmu_flush(struct v3d_compile *c, nir_dest *dest,
+                               uint32_t component_mask, uint32_t tmu_writes);
 void ntq_flush_tmu(struct v3d_compile *c);
 void vir_emit_thrsw(struct v3d_compile *c);