Mesa (main): ir3: Use stp/ldp base offset for {load,store}_scratch

Wed Oct 20 15:39:01 UTC 2021

Module: Mesa
Branch: main
Commit: e7599f09a149347b74bf0913cdd86d64e0ae9d20
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=e7599f09a149347b74bf0913cdd86d64e0ae9d20

Author: Connor Abbott <cwabbott0 at gmail.com>
Date:   Mon Oct 11 16:43:10 2021 +0200

ir3: Use stp/ldp base offset for {load,store}_scratch

When we have a series of loads/stores we were creating a constant for
each one, which isn't great. Furthermore, because the nir pass puts the
offset constant at the top of the shader, it resulted in extra register
pressure and spilling when that happened inside a loop. Fix this by
using the base/offset form of stp and ldp.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13307>

---

 src/freedreno/ci/deqp-freedreno-a630-skips.txt |  2 +-
 src/freedreno/ir3/ir3_compiler_nir.c           | 33 ++++++++++++++++++++++----
 2 files changed, 30 insertions(+), 5 deletions(-)

diff --git a/src/freedreno/ci/deqp-freedreno-a630-skips.txt b/src/freedreno/ci/deqp-freedreno-a630-skips.txt
index d95dac8a996..f9cab0aa952 100644
--- a/src/freedreno/ci/deqp-freedreno-a630-skips.txt
+++ b/src/freedreno/ci/deqp-freedreno-a630-skips.txt
@@ -29,4 +29,4 @@ KHR-GLES31.core.shader_image_load_store.basic-allFormats-store-fs
 # causes a hangcheck timeout on a630:
 # msm ae00000.mdss: [drm:hangcheck_handler] *ERROR* A630: hangcheck detected gpu lockup rb 0!
 dEQP-VK.graphicsfuzz.spv-stable-maze-flatten-copy-composite
-dEQP-VK.graphicsfuzz.spv-stable-pillars-volatile-nontemporal-store
+spill-dEQP-VK.graphicsfuzz.spv-stable-pillars-volatile-nontemporal-store
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
index 036c2e3bc59..0a7e00c0f71 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -1064,6 +1064,28 @@ emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
    return atomic;
 }
 
+static void
+stp_ldp_offset(struct ir3_context *ctx, nir_src *src,
+               struct ir3_instruction **offset, int32_t *base)
+{
+   struct ir3_block *b = ctx->block;
+
+   if (nir_src_is_const(*src)) {
+      unsigned src_offset = nir_src_as_uint(*src);
+      /* The base offset field is only 13 bits, and it's signed. Try to make the
+       * offset constant whenever the original offsets are similar, to avoid
+       * creating too many constants in the final shader.
+       */
+      *base = ((int32_t) src_offset << (32 - 13)) >> (32 - 13);
+      uint32_t offset_val = src_offset - *base;
+      *offset = create_immed(b, offset_val);
+   } else {
+      /* TODO: match on nir_iadd with a constant that fits */
+      *base = 0;
+      *offset = ir3_get_src(ctx, src)[0];
+   }
+}
+
 /* src[] = { offset }. */
 static void
 emit_intrinsic_load_scratch(struct ir3_context *ctx, nir_intrinsic_instr *intr,
@@ -1071,10 +1093,11 @@ emit_intrinsic_load_scratch(struct ir3_context *ctx, nir_intrinsic_instr *intr,
 {
    struct ir3_block *b = ctx->block;
    struct ir3_instruction *ldp, *offset;
+   int32_t base;
 
-   offset = ir3_get_src(ctx, &intr->src[0])[0];
+   stp_ldp_offset(ctx, &intr->src[0], &offset, &base);
 
-   ldp = ir3_LDP(b, offset, 0, create_immed(b, 0), 0,
+   ldp = ir3_LDP(b, offset, 0, create_immed(b, base), 0,
                  create_immed(b, intr->num_components), 0);
 
    ldp->cat6.type = utype_dst(intr->dest);
@@ -1094,9 +1117,11 @@ emit_intrinsic_store_scratch(struct ir3_context *ctx, nir_intrinsic_instr *intr)
    struct ir3_instruction *stp, *offset;
    struct ir3_instruction *const *value;
    unsigned wrmask, ncomp;
+   int32_t base;
 
    value = ir3_get_src(ctx, &intr->src[0]);
-   offset = ir3_get_src(ctx, &intr->src[1])[0];
+
+   stp_ldp_offset(ctx, &intr->src[1], &offset, &base);
 
    wrmask = nir_intrinsic_write_mask(intr);
    ncomp = ffs(~wrmask) - 1;
@@ -1105,7 +1130,7 @@ emit_intrinsic_store_scratch(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 
    stp = ir3_STP(b, offset, 0, ir3_create_collect(b, value, ncomp), 0,
                  create_immed(b, ncomp), 0);
-   stp->cat6.dst_offset = 0;
+   stp->cat6.dst_offset = base;
    stp->cat6.type = utype_src(intr->src[0]);
    stp->barrier_class = IR3_BARRIER_PRIVATE_W;
    stp->barrier_conflict = IR3_BARRIER_PRIVATE_R | IR3_BARRIER_PRIVATE_W;