Mesa (main): zink: move all 64-32bit shader load rewriting to nir pass

Wed Oct 27 17:23:38 UTC 2021

Module: Mesa
Branch: main
Commit: 150d6ee97e374b5f520fc1ec3817a8a09c4b80fc
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=150d6ee97e374b5f520fc1ec3817a8a09c4b80fc

Author: Mike Blumenkrantz <michael.blumenkrantz at gmail.com>
Date:   Wed Oct 20 10:02:08 2021 -0400

zink: move all 64-32bit shader load rewriting to nir pass

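This also enables natural 64-bit loads on drivers that support them
(shaderInt64); on drivers that do not, 64-bit loads are rewritten in the
nir pass as 2x32 loads that are packed back into a 64-bit value.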

Reviewed-by: Dave Airlie <airlied at redhat.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13484>

---

 .../drivers/zink/nir_to_spirv/nir_to_spirv.c       | 61 ++++------------------
 src/gallium/drivers/zink/zink_compiler.c           | 54 ++++++++++++++++---
 2 files changed, 57 insertions(+), 58 deletions(-)

diff --git a/src/gallium/drivers/zink/nir_to_spirv/nir_to_spirv.c b/src/gallium/drivers/zink/nir_to_spirv/nir_to_spirv.c
index e3b8a27a42b..30f4b6bd267 100644
--- a/src/gallium/drivers/zink/nir_to_spirv/nir_to_spirv.c
+++ b/src/gallium/drivers/zink/nir_to_spirv/nir_to_spirv.c
@@ -49,10 +49,10 @@ struct ntv_context {
    gl_shader_stage stage;
    const struct zink_so_info *so_info;
 
-   SpvId ubos[PIPE_MAX_CONSTANT_BUFFERS][3]; //8, 16, 32
+   SpvId ubos[PIPE_MAX_CONSTANT_BUFFERS][5]; //8, 16, 32, unused, 64
    nir_variable *ubo_vars[PIPE_MAX_CONSTANT_BUFFERS];
 
-   SpvId ssbos[PIPE_MAX_SHADER_BUFFERS][3]; //8, 16, 32
+   SpvId ssbos[PIPE_MAX_SHADER_BUFFERS][5]; //8, 16, 32, unused, 64
    nir_variable *ssbo_vars[PIPE_MAX_SHADER_BUFFERS];
    SpvId image_types[PIPE_MAX_SAMPLERS];
    SpvId images[PIPE_MAX_SAMPLERS];
@@ -1915,9 +1915,9 @@ emit_load_bo(struct ntv_context *ctx, nir_intrinsic_instr *intr)
    bool ssbo = intr->intrinsic == nir_intrinsic_load_ssbo;
    assert(const_block_index); // no dynamic indexing for now
 
-   unsigned idx = 0;
    unsigned bit_size = nir_dest_bit_size(intr->dest);
-   idx = MIN2(bit_size, 32) >> 4;
+   assert(bit_size <= 64);
+   unsigned idx = bit_size >> 4;
    if (ssbo) {
       assert(idx < ARRAY_SIZE(ctx->ssbos[0]));
       if (!ctx->ssbos[const_block_index->u32][idx])
@@ -1928,15 +1928,12 @@ emit_load_bo(struct ntv_context *ctx, nir_intrinsic_instr *intr)
          emit_bo(ctx, ctx->ubo_vars[const_block_index->u32], nir_dest_bit_size(intr->dest));
    }
    SpvId bo = ssbo ? ctx->ssbos[const_block_index->u32][idx] : ctx->ubos[const_block_index->u32][idx];
-   SpvId uint_type = get_uvec_type(ctx, MIN2(bit_size, 32), 1);
+   SpvId uint_type = get_uvec_type(ctx, bit_size, 1);
    SpvId one = emit_uint_const(ctx, 32, 1);
 
    /* number of components being loaded */
    unsigned num_components = nir_dest_num_components(intr->dest);
-   /* we need to grab 2x32 to fill the 64bit value */
-   if (bit_size == 64)
-      num_components *= 2;
-   SpvId constituents[NIR_MAX_VEC_COMPONENTS * 2];
+   SpvId constituents[NIR_MAX_VEC_COMPONENTS];
    SpvId result;
 
    /* destination type for the load */
@@ -1950,7 +1947,7 @@ emit_load_bo(struct ntv_context *ctx, nir_intrinsic_instr *intr)
    /* our generated uniform has a memory layout like
     *
     * struct {
-    *    uint base[array_size];
+    *    uintN base[array_size];
     * };
     *
     * first, access 'base'
@@ -1983,18 +1980,6 @@ emit_load_bo(struct ntv_context *ctx, nir_intrinsic_instr *intr)
       offset = emit_binop(ctx, SpvOpIAdd, uint_type, offset, one);
    }
 
-   /* if we're loading a 64bit value, we have to reassemble all the u32 values we've loaded into u64 values
-    * by creating uvec2 composites and bitcasting them to u64 values
-    */
-   if (bit_size == 64) {
-      num_components /= 2;
-      type = get_uvec_type(ctx, 64, num_components);
-      SpvId u64_type = get_uvec_type(ctx, 64, 1);
-      for (unsigned i = 0; i < num_components; i++) {
-         constituents[i] = spirv_builder_emit_composite_construct(&ctx->builder, get_uvec_type(ctx, 32, 2), constituents + i * 2, 2);
-         constituents[i] = emit_bitcast(ctx, u64_type, constituents[i]);
-      }
-   }
    /* if loading more than 1 value, reassemble the results into the desired type,
     * otherwise just use the loaded result
     */
@@ -2194,7 +2179,6 @@ emit_load_shared(struct ntv_context *ctx, nir_intrinsic_instr *intr)
    SpvId dest_type = get_dest_type(ctx, &intr->dest, nir_type_uint);
    unsigned num_components = nir_dest_num_components(intr->dest);
    unsigned bit_size = nir_dest_bit_size(intr->dest);
-   bool qword = bit_size == 64;
    SpvId uint_type = get_uvec_type(ctx, 32, 1);
    SpvId ptr_type = spirv_builder_type_pointer(&ctx->builder,
                                                SpvStorageClassWorkgroup,
@@ -2203,17 +2187,10 @@ emit_load_shared(struct ntv_context *ctx, nir_intrinsic_instr *intr)
    SpvId constituents[NIR_MAX_VEC_COMPONENTS];
    /* need to convert array -> vec */
    for (unsigned i = 0; i < num_components; i++) {
-      SpvId parts[2];
-      for (unsigned j = 0; j < 1 + !!qword; j++) {
-         SpvId member = spirv_builder_emit_access_chain(&ctx->builder, ptr_type,
-                                                        ctx->shared_block_var, &offset, 1);
-         parts[j] = spirv_builder_emit_load(&ctx->builder, uint_type, member);
-         offset = emit_binop(ctx, SpvOpIAdd, uint_type, offset, emit_uint_const(ctx, 32, 1));
-      }
-      if (qword)
-         constituents[i] = spirv_builder_emit_composite_construct(&ctx->builder, get_uvec_type(ctx, 64, 1), parts, 2);
-      else
-         constituents[i] = parts[0];
+      SpvId member = spirv_builder_emit_access_chain(&ctx->builder, ptr_type,
+                                                     ctx->shared_block_var, &offset, 1);
+      constituents[i] = spirv_builder_emit_load(&ctx->builder, uint_type, member);
+      offset = emit_binop(ctx, SpvOpIAdd, uint_type, offset, emit_uint_const(ctx, 32, 1));
    }
    SpvId result;
    if (num_components > 1)
@@ -2258,15 +2235,11 @@ emit_store_shared(struct ntv_context *ctx, nir_intrinsic_instr *intr)
 static void
 emit_load_push_const(struct ntv_context *ctx, nir_intrinsic_instr *intr)
 {
-   unsigned bit_size = nir_dest_bit_size(intr->dest);
    SpvId uint_type = get_uvec_type(ctx, 32, 1);
    SpvId load_type = get_uvec_type(ctx, 32, 1);
 
    /* number of components being loaded */
    unsigned num_components = nir_dest_num_components(intr->dest);
-   /* we need to grab 2x32 to fill the 64bit value */
-   if (bit_size == 64)
-      num_components *= 2;
    SpvId constituents[NIR_MAX_VEC_COMPONENTS * 2];
    SpvId result;
 
@@ -2298,18 +2271,6 @@ emit_load_push_const(struct ntv_context *ctx, nir_intrinsic_instr *intr)
       offset = emit_binop(ctx, SpvOpIAdd, uint_type, offset, one);
    }
 
-   /* if we're loading a 64bit value, we have to reassemble all the u32 values we've loaded into u64 values
-    * by creating uvec2 composites and bitcasting them to u64 values
-    */
-   if (bit_size == 64) {
-      num_components /= 2;
-      type = get_uvec_type(ctx, 64, num_components);
-      SpvId u64_type = get_uvec_type(ctx, 64, 1);
-      for (unsigned i = 0; i < num_components; i++) {
-         constituents[i] = spirv_builder_emit_composite_construct(&ctx->builder, get_uvec_type(ctx, 32, 2), constituents + i * 2, 2);
-         constituents[i] = emit_bitcast(ctx, u64_type, constituents[i]);
-      }
-   }
    /* if loading more than 1 value, reassemble the results into the desired type,
     * otherwise just use the loaded result
     */
diff --git a/src/gallium/drivers/zink/zink_compiler.c b/src/gallium/drivers/zink/zink_compiler.c
index be08bf811aa..73443e27933 100644
--- a/src/gallium/drivers/zink/zink_compiler.c
+++ b/src/gallium/drivers/zink/zink_compiler.c
@@ -640,16 +640,54 @@ decompose_attribs(nir_shader *nir, uint32_t decomposed_attrs, uint32_t decompose
 static bool
 rewrite_bo_access_instr(nir_builder *b, nir_instr *instr, void *data)
 {
+   struct zink_screen *screen = data;
+   const bool has_int64 = screen->info.feats.features.shaderInt64;
    if (instr->type != nir_instr_type_intrinsic)
       return false;
    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+   b->cursor = nir_before_instr(instr);
    switch (intr->intrinsic) {
    case nir_intrinsic_load_ssbo:
-   case nir_intrinsic_load_ubo:
-   case nir_intrinsic_load_ubo_vec4:
-      b->cursor = nir_before_instr(instr);
-      nir_instr_rewrite_src_ssa(instr, &intr->src[1], nir_udiv_imm(b, intr->src[1].ssa, MIN2(nir_dest_bit_size(intr->dest), 32) / 8));
+   case nir_intrinsic_load_ubo: {
+      /* ubo0 can have unaligned 64bit loads, particularly for bindless texture ids */
+      bool force_2x32 = intr->intrinsic == nir_intrinsic_load_ubo &&
+                        nir_src_as_uint(intr->src[0]) == 0 &&
+                        nir_dest_bit_size(intr->dest) == 64 &&
+                        nir_intrinsic_align_offset(intr) % 8 != 0;
+      nir_instr_rewrite_src_ssa(instr, &intr->src[1], nir_udiv_imm(b, intr->src[1].ssa,
+                                (force_2x32 ? 32 : nir_dest_bit_size(intr->dest)) / 8));
+      /* if 64bit isn't supported, 64bit loads definitely aren't supported, so rewrite as 2x32 with cast and pray */
+      if (force_2x32 || (nir_dest_bit_size(intr->dest) == 64 && !has_int64)) {
+         /* this is always scalarized */
+         assert(intr->dest.ssa.num_components == 1);
+         /* rewrite as 2x32 */
+         nir_ssa_def *load;
+         if (intr->intrinsic == nir_intrinsic_load_ssbo)
+            load = nir_load_ssbo(b, 2, 32, intr->src[0].ssa, intr->src[1].ssa, .align_mul = 4, .align_offset = 0);
+         else
+            load = nir_load_ubo(b, 2, 32, intr->src[0].ssa, intr->src[1].ssa, .align_mul = 4, .align_offset = 0, .range = 4);
+         nir_intrinsic_set_access(nir_instr_as_intrinsic(load->parent_instr), nir_intrinsic_access(intr));
+         /* cast back to 64bit */
+         nir_ssa_def *casted = nir_pack_64_2x32(b, load);
+         nir_ssa_def_rewrite_uses(&intr->dest.ssa, casted);
+         nir_instr_remove(instr);
+      }
       return true;
+   }
+   case nir_intrinsic_load_shared:
+      /* if 64bit isn't supported, 64bit loads definitely aren't supported, so rewrite as 2x32 with cast and pray */
+      if (nir_dest_bit_size(intr->dest) == 64 && !has_int64) {
+         /* this is always scalarized */
+         assert(intr->dest.ssa.num_components == 1);
+         /* rewrite as 2x32 */
+         nir_ssa_def *load = nir_load_shared(b, 2, 32, intr->src[0].ssa, .align_mul = 4, .align_offset = 0);
+         /* cast back to 64bit */
+         nir_ssa_def *casted = nir_pack_64_2x32(b, load);
+         nir_ssa_def_rewrite_uses(&intr->dest.ssa, casted);
+         nir_instr_remove(instr);
+         return true;
+      }
+      break;
    case nir_intrinsic_store_ssbo:
    default:
       break;
@@ -658,9 +696,9 @@ rewrite_bo_access_instr(nir_builder *b, nir_instr *instr, void *data)
 }
 
 static bool
-rewrite_bo_access(nir_shader *shader)
+rewrite_bo_access(nir_shader *shader, struct zink_screen *screen)
 {
-   return nir_shader_instructions_pass(shader, rewrite_bo_access_instr, nir_metadata_dominance, NULL);
+   return nir_shader_instructions_pass(shader, rewrite_bo_access_instr, nir_metadata_dominance, screen);
 }
 
 static void
@@ -900,7 +938,7 @@ zink_shader_compile(struct zink_screen *screen, struct zink_shader *zs, nir_shad
    }
    if (screen->driconf.inline_uniforms) {
       NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_shared);
-      NIR_PASS_V(nir, rewrite_bo_access);
+      NIR_PASS_V(nir, rewrite_bo_access, screen);
    }
    if (inlined_uniforms) {
       optimize_nir(nir);
@@ -1417,7 +1455,7 @@ zink_shader_create(struct zink_screen *screen, struct nir_shader *nir,
    /* run in compile if there could be inlined uniforms */
    if (!screen->driconf.inline_uniforms) {
       NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_shared);
-      NIR_PASS_V(nir, rewrite_bo_access);
+      NIR_PASS_V(nir, rewrite_bo_access, screen);
    }
 
    if (zink_debug & ZINK_DEBUG_NIR) {