Mesa (main): nir: Add a .base field to nir_load_ubo_vec4.

Wed Jan 19 22:59:54 UTC 2022

Module: Mesa
Branch: main
Commit: 700d2fbd0ab3bde9b87a28b14addaac9bb4520eb
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=700d2fbd0ab3bde9b87a28b14addaac9bb4520eb

Author: Emma Anholt <emma at anholt.net>
Date:   Thu Dec 23 15:21:30 2021 -0800

nir: Add a .base field to nir_load_ubo_vec4.

This lets nir-to-tgsi fold the constant offset of addressing calculations
into the CONST[] reference, which is important for D3D9-era compatibility:
HW of that age has limited uniform space, and if we do the addressing math
as math in the shader for dynamic indexing, the nir_load_consts end up
taking up uniforms we don't have available.

r300:
total instructions in shared programs: 1279699 -> 1279167 (-0.04%)
instructions in affected programs: 134796 -> 134264 (-0.39%)
total temps in shared programs: 213912 -> 213736 (-0.08%)
temps in affected programs: 2166 -> 1990 (-8.13%)
total consts in shared programs: 953237 -> 952973 (-0.03%)
consts in affected programs: 45980 -> 45716 (-0.57%)

Acked-by: Timur Kristóf <timur.kristof at gmail.com>
Reviewed-by: Matt Turner <mattst88 at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14309>

---

 src/compiler/nir/nir.h                           |  3 +++
 src/compiler/nir/nir_intrinsics.py               |  4 ++--
 src/compiler/nir/nir_lower_uniforms_to_ubo.c     |  3 +--
 src/compiler/nir/nir_opt_offsets.c               |  2 ++
 src/freedreno/ir3/ir3_compiler_nir.c             |  5 +++++
 src/gallium/auxiliary/nir/nir_to_tgsi.c          |  1 +
 src/gallium/drivers/r600/sfn/sfn_shader_base.cpp | 10 ++++++----
 7 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 0c5b82cfd3c..aae86579bc7 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -5257,6 +5257,9 @@ typedef struct {
    /** nir_load_uniform max base offset */
    uint32_t uniform_max;
 
+   /** nir_load_ubo_vec4 max base offset */
+   uint32_t ubo_vec4_max;
+
    /** nir_var_mem_shared max base offset */
    uint32_t shared_max;
 
diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 24f47f6279b..8f59443f8ea 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -934,8 +934,8 @@ def load(name, src_comp, indices=[], flags=[]):
 load("uniform", [1], [BASE, RANGE, DEST_TYPE], [CAN_ELIMINATE, CAN_REORDER])
 # src[] = { buffer_index, offset }.
 load("ubo", [-1, 1], [ACCESS, ALIGN_MUL, ALIGN_OFFSET, RANGE_BASE, RANGE], flags=[CAN_ELIMINATE, CAN_REORDER])
-# src[] = { buffer_index, offset in vec4 units }
-load("ubo_vec4", [-1, 1], [ACCESS, COMPONENT], flags=[CAN_ELIMINATE, CAN_REORDER])
+# src[] = { buffer_index, offset in vec4 units }.  base is also in vec4 units.
+load("ubo_vec4", [-1, 1], [ACCESS, BASE, COMPONENT], flags=[CAN_ELIMINATE, CAN_REORDER])
 # src[] = { offset }.
 load("input", [1], [BASE, COMPONENT, DEST_TYPE, IO_SEMANTICS], [CAN_ELIMINATE, CAN_REORDER])
 # src[] = { vertex_id, offset }.
diff --git a/src/compiler/nir/nir_lower_uniforms_to_ubo.c b/src/compiler/nir/nir_lower_uniforms_to_ubo.c
index 12f835c3f0e..8fefcfd8f33 100644
--- a/src/compiler/nir/nir_lower_uniforms_to_ubo.c
+++ b/src/compiler/nir/nir_lower_uniforms_to_ubo.c
@@ -65,8 +65,7 @@ lower_instr(nir_intrinsic_instr *instr, nir_builder *b, bool dword_packed, bool
           */
          assert(!dword_packed);
          load_result = nir_load_ubo_vec4(b, instr->num_components, instr->dest.ssa.bit_size,
-                                         ubo_idx,
-                                         nir_iadd_imm(b, uniform_offset, nir_intrinsic_base(instr)));
+                                         ubo_idx, uniform_offset, .base=nir_intrinsic_base(instr));
       } else {
          /* For PIPE_CAP_PACKED_UNIFORMS, the uniforms are packed with the
           * base/offset in dword units instead of vec4 units.
diff --git a/src/compiler/nir/nir_opt_offsets.c b/src/compiler/nir/nir_opt_offsets.c
index 58cfa98a1ac..0d10fc93e46 100644
--- a/src/compiler/nir/nir_opt_offsets.c
+++ b/src/compiler/nir/nir_opt_offsets.c
@@ -135,6 +135,8 @@ process_instr(nir_builder *b, nir_instr *instr, void *s)
    switch (intrin->intrinsic) {
    case nir_intrinsic_load_uniform:
       return try_fold_load_store(b, intrin, state, 0, state->options->uniform_max);
+   case nir_intrinsic_load_ubo_vec4:
+      return try_fold_load_store(b, intrin, state, 1, state->options->ubo_vec4_max);
    case nir_intrinsic_load_shared:
    case nir_intrinsic_load_shared_ir3:
       return try_fold_load_store(b, intrin, state, 0, state->options->shared_max);
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
index 17449664e40..9611e743e81 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -873,6 +873,11 @@ emit_intrinsic_load_ubo_ldc(struct ir3_context *ctx, nir_intrinsic_instr *intr,
 {
    struct ir3_block *b = ctx->block;
 
+   /* This is only generated for us by nir_lower_ubo_vec4, which leaves base =
+    * 0.
+    */
+   assert(nir_intrinsic_base(intr) == 0);
+
    unsigned ncomp = intr->num_components;
    struct ir3_instruction *offset = ir3_get_src(ctx, &intr->src[1])[0];
    struct ir3_instruction *idx = ir3_get_src(ctx, &intr->src[0])[0];
diff --git a/src/gallium/auxiliary/nir/nir_to_tgsi.c b/src/gallium/auxiliary/nir/nir_to_tgsi.c
index 6281c197edf..df76d50af51 100644
--- a/src/gallium/auxiliary/nir/nir_to_tgsi.c
+++ b/src/gallium/auxiliary/nir/nir_to_tgsi.c
@@ -1338,6 +1338,7 @@ ntt_emit_load_ubo(struct ntt_compile *c, nir_intrinsic_instr *instr)
       /* !PIPE_CAP_LOAD_CONSTBUF: Just emit it as a vec4 reference to the const
        * file.
        */
+      src.Index = nir_intrinsic_base(instr);
 
       if (nir_src_is_const(instr->src[1])) {
          src.Index += ntt_src_as_uint(c, instr->src[1]);
diff --git a/src/gallium/drivers/r600/sfn/sfn_shader_base.cpp b/src/gallium/drivers/r600/sfn/sfn_shader_base.cpp
index ed3f1b57a31..3f8ef12db9f 100644
--- a/src/gallium/drivers/r600/sfn/sfn_shader_base.cpp
+++ b/src/gallium/drivers/r600/sfn/sfn_shader_base.cpp
@@ -872,6 +872,7 @@ bool ShaderFromNirProcessor::emit_load_ubo_vec4(nir_intrinsic_instr* instr)
 {
    auto bufid = nir_src_as_const_value(instr->src[0]);
    auto buf_offset = nir_src_as_const_value(instr->src[1]);
+   auto base = nir_intrinsic_base(instr);
 
    if (!buf_offset) {
       /* TODO: if buf_offset is constant then this can also be solved by using the CF indes
@@ -892,11 +893,11 @@ bool ShaderFromNirProcessor::emit_load_ubo_vec4(nir_intrinsic_instr* instr)
 
       FetchInstruction *ir;
       if (bufid) {
-         ir = new FetchInstruction(vc_fetch, no_index_offset, trgt, addr, 0,
+         ir = new FetchInstruction(vc_fetch, no_index_offset, trgt, addr, base,
                                               1 + bufid->u32, nullptr, bim_none);
       } else {
          PValue bufid = from_nir(instr->src[0], 0, 0);
-         ir = new FetchInstruction(vc_fetch, no_index_offset, trgt, addr, 0,
+         ir = new FetchInstruction(vc_fetch, no_index_offset, trgt, addr, base,
                                               1, bufid, bim_zero);
       }
       ir->set_dest_swizzle(swz);
@@ -905,6 +906,7 @@ bool ShaderFromNirProcessor::emit_load_ubo_vec4(nir_intrinsic_instr* instr)
       return true;
    }
 
+   uint32_t offset = 512 + base + buf_offset->u32;
 
    if (bufid) {
       int buf_cmp = nir_intrinsic_component(instr);
@@ -912,7 +914,7 @@ bool ShaderFromNirProcessor::emit_load_ubo_vec4(nir_intrinsic_instr* instr)
       for (unsigned i = 0; i < nir_dest_num_components(instr->dest); ++i) {
          int cmp = buf_cmp + i;
          assert(cmp < 4);
-         auto u = PValue(new UniformValue(512 +  buf_offset->u32, cmp, bufid->u32 + 1));
+         auto u = PValue(new UniformValue(offset, cmp, bufid->u32 + 1));
          if (instr->dest.is_ssa)
             load_preloaded_value(instr->dest, i, u);
          else {
@@ -930,7 +932,7 @@ bool ShaderFromNirProcessor::emit_load_ubo_vec4(nir_intrinsic_instr* instr)
       auto kc_id = from_nir(instr->src[0], 0);
       for (unsigned i = 0; i < nir_dest_num_components(instr->dest); ++i) {
          int cmp = buf_cmp + i;
-         auto u = PValue(new UniformValue(512 +  buf_offset->u32, cmp, kc_id));
+         auto u = PValue(new UniformValue(offset, cmp, kc_id));
          if (instr->dest.is_ssa)
             load_preloaded_value(instr->dest, i, u);
          else {