Mesa (main): zink: rewrite 64bit shader i/o as 32bit

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Fri Jun 10 02:49:40 UTC 2022


Module: Mesa
Branch: main
Commit: 5b2f850425e4c841529ff8339a1fbc5a26e9ff8c
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=5b2f850425e4c841529ff8339a1fbc5a26e9ff8c

Author: Mike Blumenkrantz <michael.blumenkrantz at gmail.com>
Date:   Wed May 18 10:11:41 2022 -0400

zink: rewrite 64bit shader i/o as 32bit

zink can't use lower_io, so this all has to be done manually and in
excruciating depth and detail

fixes (tu):
KHR-Single-GL46.arrays_of_arrays_gl.InteractionFunctionCalls2
KHR-GL46.gpu_shader_fp64.fp64.named_uniform_blocks
KHR-GL46.gpu_shader_fp64.fp64.varyings
KHR-GL46.vertex_attrib_binding.advanced-bindingUpdate
KHR-Single-GL46.enhanced_layouts.varying_array_components
KHR-Single-GL46.enhanced_layouts.varying_array_locations
KHR-Single-GL46.enhanced_layouts.varying_components
KHR-Single-GL46.enhanced_layouts.varying_locations
KHR-Single-GL46.enhanced_layouts.xfb_explicit_location
dEQP-GLES3.functional.transform_feedback.basic_types.interleaved.lines.highp_mat3x4
dEQP-GLES3.functional.transform_feedback.basic_types.interleaved.lines.mediump_vec3
dEQP-GLES3.functional.transform_feedback.basic_types.interleaved.points.mediump_vec4
dEQP-GLES3.functional.transform_feedback.basic_types.separate.points.highp_vec3

Reviewed-by: Adam Jackson <ajax at redhat.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16669>

---
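
The core of the pass is per-component: every 64-bit input/output component is
split into two 32-bit components before the store and reassembled after the
load, which the patch expresses with nir_unpack_64_2x32/nir_pack_64_2x32. As a
standalone illustration of that bit-level split (illustration only, not part
of the patch):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
   /* one 64-bit shader i/o component, e.g. a double varying */
   double src = 1.5;
   uint64_t bits;
   memcpy(&bits, &src, sizeof(bits));

   /* the unpack_64_2x32 step: one 64-bit value -> two 32-bit halves */
   uint32_t lo = (uint32_t)bits;
   uint32_t hi = (uint32_t)(bits >> 32);

   /* the pack_64_2x32 step on the load side: reassemble the halves */
   uint64_t repacked = ((uint64_t)hi << 32) | lo;
   double dst;
   memcpy(&dst, &repacked, sizeof(dst));

   assert(dst == src);
   printf("%g -> 0x%08x 0x%08x -> %g\n", src, lo, hi, dst);
   return 0;
}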

 src/gallium/drivers/zink/ci/zink-tu-a630-fails.txt |  12 -
 src/gallium/drivers/zink/ci/zink-tu-a630-skips.txt |   4 +-
 src/gallium/drivers/zink/zink_compiler.c           | 320 +++++++++++++++++++++
 3 files changed, 322 insertions(+), 14 deletions(-)

diff --git a/src/gallium/drivers/zink/ci/zink-tu-a630-fails.txt b/src/gallium/drivers/zink/ci/zink-tu-a630-fails.txt
index bbf8562f9d7..0b90047a3fb 100644
--- a/src/gallium/drivers/zink/ci/zink-tu-a630-fails.txt
+++ b/src/gallium/drivers/zink/ci/zink-tu-a630-fails.txt
@@ -173,8 +173,6 @@ KHR-GL46.gpu_shader_fp64.builtin.trunc_dvec2,Fail
 KHR-GL46.gpu_shader_fp64.builtin.trunc_dvec3,Fail
 KHR-GL46.gpu_shader_fp64.builtin.trunc_dvec4,Fail
 KHR-GL46.gpu_shader_fp64.fp64.max_uniform_components,Fail
-KHR-GL46.gpu_shader_fp64.fp64.named_uniform_blocks,Crash
-KHR-GL46.gpu_shader_fp64.fp64.varyings,Crash
 KHR-GL46.limits.max_fragment_input_components,Fail
 KHR-GL46.packed_pixels.pbo_rectangle.r11f_g11f_b10f,Fail
 KHR-GL46.packed_pixels.pbo_rectangle.r16,Fail
@@ -235,19 +233,13 @@ KHR-GL46.texture_cube_map_array.color_depth_attachments,Fail
 KHR-GL46.texture_cube_map_array.image_op_geometry_sh,Fail
 KHR-GL46.texture_view.view_classes,Fail
 KHR-GL46.vertex_attrib_64bit.vao,Fail
-KHR-GL46.vertex_attrib_binding.advanced-bindingUpdate,Crash
 KHR-Single-GL46.arrays_of_arrays_gl.AtomicUsage,Fail
 KHR-Single-GL46.arrays_of_arrays_gl.SubroutineFunctionCalls2,Crash
 KHR-Single-GL46.enhanced_layouts.ssb_member_offset_and_align,Fail
 KHR-Single-GL46.enhanced_layouts.uniform_block_member_offset_and_align,Fail
-KHR-Single-GL46.enhanced_layouts.varying_array_components,Crash
-KHR-Single-GL46.enhanced_layouts.varying_array_locations,Crash
-KHR-Single-GL46.enhanced_layouts.varying_components,Crash
-KHR-Single-GL46.enhanced_layouts.varying_locations,Crash
 KHR-Single-GL46.enhanced_layouts.varying_structure_locations,Crash
 KHR-Single-GL46.enhanced_layouts.xfb_capture_inactive_output_block_member,Fail
 KHR-Single-GL46.enhanced_layouts.xfb_capture_struct,Fail
-KHR-Single-GL46.enhanced_layouts.xfb_explicit_location,Crash
 KHR-Single-GL46.enhanced_layouts.xfb_global_buffer,Fail
 KHR-Single-GL46.enhanced_layouts.xfb_override_qualifiers_with_api,Fail
 KHR-Single-GL46.enhanced_layouts.xfb_stride,Fail
@@ -259,10 +251,6 @@ dEQP-GLES3.functional.occlusion_query.scissor_depth_clear_stencil_clear,Fail
 dEQP-GLES3.functional.occlusion_query.scissor_stencil_clear,Fail
 dEQP-GLES3.functional.occlusion_query.stencil_clear,Fail
 dEQP-GLES3.functional.shaders.loops.for_uniform_iterations.unconditional_continue_fragment,Crash
-dEQP-GLES3.functional.transform_feedback.basic_types.interleaved.lines.highp_mat3x4,Fail
-dEQP-GLES3.functional.transform_feedback.basic_types.interleaved.lines.mediump_vec3,Fail
-dEQP-GLES3.functional.transform_feedback.basic_types.interleaved.points.mediump_vec4,Fail
-dEQP-GLES3.functional.transform_feedback.basic_types.separate.points.highp_vec3,Fail
 dEQP-GLES31.functional.blend_equation_advanced.barrier.colorburn,Fail
 dEQP-GLES31.functional.blend_equation_advanced.barrier.colordodge,Fail
 dEQP-GLES31.functional.blend_equation_advanced.barrier.darken,Fail
diff --git a/src/gallium/drivers/zink/ci/zink-tu-a630-skips.txt b/src/gallium/drivers/zink/ci/zink-tu-a630-skips.txt
index fef05d14ab3..fb03e671d01 100644
--- a/src/gallium/drivers/zink/ci/zink-tu-a630-skips.txt
+++ b/src/gallium/drivers/zink/ci/zink-tu-a630-skips.txt
@@ -1,2 +1,2 @@
-# #6490
-KHR-Single-GL46.arrays_of_arrays_gl.InteractionFunctionCalls2
+# takes forever, but passes
+KHR-GL46.texture_swizzle.smoke
diff --git a/src/gallium/drivers/zink/zink_compiler.c b/src/gallium/drivers/zink/zink_compiler.c
index 01f08899dfb..aeb2eff55e0 100644
--- a/src/gallium/drivers/zink/zink_compiler.c
+++ b/src/gallium/drivers/zink/zink_compiler.c
@@ -1339,6 +1339,323 @@ zink_compiler_assign_io(nir_shader *producer, nir_shader *consumer)
    optimize_nir(nir);
 }
 
+/* all types that hit this function contain something that is 64bit */
+static const struct glsl_type *
+rewrite_64bit_type(nir_shader *nir, const struct glsl_type *type, nir_variable *var)
+{
+   if (glsl_type_is_array(type)) {
+      const struct glsl_type *child = glsl_get_array_element(type);
+      unsigned elements = glsl_get_aoa_size(type);
+      unsigned stride = glsl_get_explicit_stride(type);
+      return glsl_array_type(rewrite_64bit_type(nir, child, var), elements, stride);
+   }
+   /* rewrite structs recursively */
+   if (glsl_type_is_struct_or_ifc(type)) {
+      unsigned nmembers = glsl_get_length(type);
+      struct glsl_struct_field *fields = rzalloc_array(nir, struct glsl_struct_field, nmembers * 2);
+      unsigned xfb_offset = 0;
+      for (unsigned i = 0; i < nmembers; i++) {
+         const struct glsl_struct_field *f = glsl_get_struct_field_data(type, i);
+         fields[i] = *f;
+         xfb_offset += glsl_get_component_slots(fields[i].type) * 4;
+         if (i < nmembers - 1 && xfb_offset % 8 &&
+             glsl_type_contains_64bit(glsl_get_struct_field(type, i + 1))) {
+            var->data.is_xfb = true;
+         }
+         fields[i].type = rewrite_64bit_type(nir, f->type, var);
+      }
+      return glsl_struct_type(fields, nmembers, glsl_get_type_name(type), glsl_struct_type_is_packed(type));
+   }
+   if (!glsl_type_is_64bit(type))
+      return type;
+   enum glsl_base_type base_type;
+   switch (glsl_get_base_type(type)) {
+   case GLSL_TYPE_UINT64:
+      base_type = GLSL_TYPE_UINT;
+      break;
+   case GLSL_TYPE_INT64:
+      base_type = GLSL_TYPE_INT;
+      break;
+   case GLSL_TYPE_DOUBLE:
+      base_type = GLSL_TYPE_FLOAT;
+      break;
+   default:
+      unreachable("unknown 64-bit vertex attribute format!");
+   }
+   if (glsl_type_is_scalar(type))
+      return glsl_vector_type(base_type, 2);
+   unsigned num_components;
+   if (glsl_type_is_matrix(type)) {
+      /* align to vec4 size: matrices with dvec3 columns are treated as arrays of vec4-aligned dvec3s */
+      unsigned vec_components = glsl_get_vector_elements(type);
+      if (vec_components == 3)
+         vec_components = 4;
+      num_components = vec_components * 2 * glsl_get_matrix_columns(type);
+   } else {
+      num_components = glsl_get_vector_elements(type) * 2;
+      if (num_components <= 4)
+         return glsl_vector_type(base_type, num_components);
+   }
+   /* dvec3/dvec4/dmatX: rewrite as struct { vec4, vec4, vec4, ... [vec2] } */
+   struct glsl_struct_field fields[8] = {0};
+   unsigned remaining = num_components;
+   unsigned nfields = 0;
+   for (unsigned i = 0; remaining; i++, remaining -= MIN2(4, remaining), nfields++) {
+      assert(i < ARRAY_SIZE(fields));
+      fields[i].name = "";
+      fields[i].offset = i * 16;
+      fields[i].type = glsl_vector_type(base_type, MIN2(4, remaining));
+   }
+   char buf[64];
+   snprintf(buf, sizeof(buf), "struct(%s)", glsl_get_type_name(type));
+   return glsl_struct_type(fields, nfields, buf, true);
+}
+
+static const struct glsl_type *
+deref_is_matrix(nir_deref_instr *deref)
+{
+   if (glsl_type_is_matrix(deref->type))
+      return deref->type;
+   nir_deref_instr *parent = nir_deref_instr_parent(deref);
+   if (parent)
+      return deref_is_matrix(parent);
+   return NULL;
+}
+
+/* rewrite all input/output variables using 32bit types and load/stores */
+static bool
+lower_64bit_vars(nir_shader *shader)
+{
+   bool progress = false;
+   struct hash_table *derefs = _mesa_hash_table_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
+   struct set *deletes = _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
+   nir_foreach_variable_with_modes(var, shader, nir_var_shader_in | nir_var_shader_out) {
+      if (!glsl_type_contains_64bit(var->type))
+         continue;
+      var->type = rewrite_64bit_type(shader, var->type, var);
+      /* once type is rewritten, rewrite all loads and stores */
+      nir_foreach_function(function, shader) {
+         bool func_progress = false;
+         if (!function->impl)
+            continue;
+         nir_builder b;
+         nir_builder_init(&b, function->impl);
+         nir_foreach_block(block, function->impl) {
+            nir_foreach_instr_safe(instr, block) {
+               switch (instr->type) {
+               case nir_instr_type_deref: {
+                  nir_deref_instr *deref = nir_instr_as_deref(instr);
+                  if (!(deref->modes & (nir_var_shader_in | nir_var_shader_out)))
+                     continue;
+                  if (nir_deref_instr_get_variable(deref) != var)
+                     continue;
+
+                  /* matrix types are special: store the original deref type for later use */
+                  const struct glsl_type *matrix = deref_is_matrix(deref);
+                  nir_deref_instr *parent = nir_deref_instr_parent(deref);
+                  if (!matrix) {
+                     /* if this isn't a direct matrix deref, it may be a matrix row deref */
+                     hash_table_foreach(derefs, he) {
+                        /* propagate parent matrix type to row deref */
+                        if (he->key == parent)
+                           matrix = he->data;
+                     }
+                  }
+                  if (matrix)
+                     _mesa_hash_table_insert(derefs, deref, (void*)matrix);
+                  if (deref->deref_type == nir_deref_type_var)
+                     deref->type = var->type;
+                  else
+                     deref->type = rewrite_64bit_type(shader, deref->type, var);
+               }
+               break;
+               case nir_instr_type_intrinsic: {
+                  nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+                  if (intr->intrinsic != nir_intrinsic_store_deref &&
+                      intr->intrinsic != nir_intrinsic_load_deref)
+                     break;
+                  if (nir_intrinsic_get_var(intr, 0) != var)
+                     break;
+                  if ((intr->intrinsic == nir_intrinsic_store_deref && intr->src[1].ssa->bit_size != 64) ||
+                      (intr->intrinsic == nir_intrinsic_load_deref && intr->dest.ssa.bit_size != 64))
+                     break;
+                  b.cursor = nir_before_instr(instr);
+                  nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
+                  unsigned num_components = intr->num_components * 2;
+                  nir_ssa_def *comp[NIR_MAX_VEC_COMPONENTS];
+                  /* this is the stored matrix type from the deref */
+                  struct hash_entry *he = _mesa_hash_table_search(derefs, deref);
+                  const struct glsl_type *matrix = he ? he->data : NULL;
+                  func_progress = true;
+                  if (intr->intrinsic == nir_intrinsic_store_deref) {
+                     /* first, unpack the src data to 32bit vec2 components */
+                     for (unsigned i = 0; i < intr->num_components; i++) {
+                        nir_ssa_def *ssa = nir_unpack_64_2x32(&b, nir_channel(&b, intr->src[1].ssa, i));
+                        comp[i * 2] = nir_channel(&b, ssa, 0);
+                        comp[i * 2 + 1] = nir_channel(&b, ssa, 1);
+                     }
+                     unsigned wrmask = nir_intrinsic_write_mask(intr);
+                     unsigned mask = 0;
+                     /* expand writemask for doubled components */
+                     for (unsigned i = 0; i < intr->num_components; i++) {
+                        if (wrmask & BITFIELD_BIT(i))
+                           mask |= BITFIELD_BIT(i * 2) | BITFIELD_BIT(i * 2 + 1);
+                     }
+                     if (matrix) {
+                        /* matrix types always come from array (row) derefs */
+                        assert(deref->deref_type == nir_deref_type_array);
+                        nir_deref_instr *var_deref = nir_deref_instr_parent(deref);
+                        /* let optimization clean up consts later */
+                        nir_ssa_def *index = deref->arr.index.ssa;
+                        /* this might be an indirect array index:
+                         * - iterate over matrix columns
+                         * - add if blocks for each column
+                         * - perform the store in the block
+                         */
+                        for (unsigned idx = 0; idx < glsl_get_matrix_columns(matrix); idx++) {
+                           nir_push_if(&b, nir_ieq_imm(&b, index, idx));
+                           unsigned vec_components = glsl_get_vector_elements(matrix);
+                           /* always clamp dvec3 to 4 components */
+                           if (vec_components == 3)
+                              vec_components = 4;
+                           unsigned start_component = idx * vec_components * 2;
+                           /* struct member */
+                           unsigned member = start_component / 4;
+                           /* number of components remaining */
+                           unsigned remaining = num_components;
+                           for (unsigned i = 0; i < num_components; member++) {
+                              if (!(mask & BITFIELD_BIT(i)))
+                                 continue;
+                              assert(member < glsl_get_length(var_deref->type));
+                              /* deref the rewritten struct to the appropriate vec4/vec2 */
+                              nir_deref_instr *strct = nir_build_deref_struct(&b, var_deref, member);
+                              unsigned incr = MIN2(remaining, 4);
+                              /* assemble the write component vec */
+                              nir_ssa_def *val = nir_vec(&b, &comp[i], incr);
+                              /* use the number of components being written as the writemask */
+                              if (glsl_get_vector_elements(strct->type) > val->num_components)
+                                 val = nir_pad_vector(&b, val, glsl_get_vector_elements(strct->type));
+                              nir_store_deref(&b, strct, val, BITFIELD_MASK(incr));
+                              remaining -= incr;
+                              i += incr;
+                           }
+                           nir_pop_if(&b, NULL);
+                        }
+                        _mesa_set_add(deletes, &deref->instr);
+                     } else if (num_components <= 4) {
+                        /* simple store case: just write out the components */
+                        nir_ssa_def *dest = nir_vec(&b, comp, num_components);
+                        nir_store_deref(&b, deref, dest, mask);
+                     } else {
+                        /* writing > 4 components: access the struct and write to the appropriate vec4 members */
+                        for (unsigned i = 0; num_components; i++, num_components -= MIN2(num_components, 4)) {
+                           if (!(mask & BITFIELD_MASK(4)))
+                              continue;
+                           nir_deref_instr *strct = nir_build_deref_struct(&b, deref, i);
+                           nir_ssa_def *dest = nir_vec(&b, &comp[i * 4], MIN2(num_components, 4));
+                           if (glsl_get_vector_elements(strct->type) > dest->num_components)
+                              dest = nir_pad_vector(&b, dest, glsl_get_vector_elements(strct->type));
+                           nir_store_deref(&b, strct, dest, mask & BITFIELD_MASK(4));
+                           mask >>= 4;
+                        }
+                     }
+                  } else {
+                     nir_ssa_def *dest = NULL;
+                     if (matrix) {
+                        /* matrix types always come from array (row) derefs */
+                        assert(deref->deref_type == nir_deref_type_array);
+                        nir_deref_instr *var_deref = nir_deref_instr_parent(deref);
+                        /* let optimization clean up consts later */
+                        nir_ssa_def *index = deref->arr.index.ssa;
+                        /* this might be an indirect array index:
+                         * - iterate over matrix columns
+                         * - add if blocks for each column
+                         * - phi the loads using the array index
+                         */
+                        unsigned cols = glsl_get_matrix_columns(matrix);
+                        nir_ssa_def *dests[4];
+                        for (unsigned idx = 0; idx < cols; idx++) {
+                           /* don't add an if for the final row: this will be handled in the else */
+                           if (idx < cols - 1)
+                              nir_push_if(&b, nir_ieq_imm(&b, index, idx));
+                           unsigned vec_components = glsl_get_vector_elements(matrix);
+                           /* always clamp dvec3 to 4 components */
+                           if (vec_components == 3)
+                              vec_components = 4;
+                           unsigned start_component = idx * vec_components * 2;
+                           /* struct member */
+                           unsigned member = start_component / 4;
+                           /* number of components remaining */
+                           unsigned remaining = num_components;
+                           /* component index */
+                           unsigned comp_idx = 0;
+                           for (unsigned i = 0; i < num_components; member++) {
+                              assert(member < glsl_get_length(var_deref->type));
+                              nir_deref_instr *strct = nir_build_deref_struct(&b, var_deref, member);
+                              nir_ssa_def *load = nir_load_deref(&b, strct);
+                              unsigned incr = MIN2(remaining, 4);
+                              /* repack the loads to 64bit */
+                              for (unsigned c = 0; c < incr / 2; c++, comp_idx++)
+                                 comp[comp_idx] = nir_pack_64_2x32(&b, nir_channels(&b, load, BITFIELD_RANGE(c * 2, 2)));
+                              remaining -= incr;
+                              i += incr;
+                           }
+                           dest = dests[idx] = nir_vec(&b, comp, intr->num_components);
+                           if (idx < cols - 1)
+                              nir_push_else(&b, NULL);
+                        }
+                        /* loop over all the if blocks that were made, pop them, and phi the loaded+packed results */
+                        for (unsigned idx = cols - 1; idx >= 1; idx--) {
+                           nir_pop_if(&b, NULL);
+                           dest = nir_if_phi(&b, dests[idx - 1], dest);
+                        }
+                        _mesa_set_add(deletes, &deref->instr);
+                     } else if (num_components <= 4) {
+                        /* simple load case */
+                        nir_ssa_def *load = nir_load_deref(&b, deref);
+                        /* pack 32bit loads into 64bit: this will automagically get optimized out later */
+                        for (unsigned i = 0; i < intr->num_components; i++) {
+                           comp[i] = nir_pack_64_2x32(&b, nir_channels(&b, load, BITFIELD_RANGE(i * 2, 2)));
+                        }
+                        dest = nir_vec(&b, comp, intr->num_components);
+                     } else {
+                        /* loading > 4 components: access the struct and load the appropriate vec4 members */
+                        for (unsigned i = 0; i < 2; i++, num_components -= 4) {
+                           nir_deref_instr *strct = nir_build_deref_struct(&b, deref, i);
+                           nir_ssa_def *load = nir_load_deref(&b, strct);
+                           comp[i * 2] = nir_pack_64_2x32(&b, nir_channels(&b, load, BITFIELD_MASK(2)));
+                           if (num_components > 2)
+                              comp[i * 2 + 1] = nir_pack_64_2x32(&b, nir_channels(&b, load, BITFIELD_RANGE(2, 2)));
+                        }
+                        dest = nir_vec(&b, comp, intr->num_components);
+                     }
+                     nir_ssa_def_rewrite_uses_after(&intr->dest.ssa, dest, instr);
+                  }
+                  _mesa_set_add(deletes, instr);
+                  break;
+               }
+               break;
+               default: break;
+               }
+            }
+         }
+         if (func_progress)
+            nir_metadata_preserve(function->impl, nir_metadata_none);
+         /* derefs must be queued for deletion to avoid deleting the same deref repeatedly */
+         set_foreach_remove(deletes, he)
+            nir_instr_remove((void*)he->key);
+      }
+      progress = true;
+   }
+   ralloc_free(deletes);
+   ralloc_free(derefs);
+   if (progress) {
+      nir_lower_alu_to_scalar(shader, filter_64_bit_instr, NULL);
+      nir_lower_phis_to_scalar(shader, false);
+   }
+   return progress;
+}
+
 static void
 zink_shader_dump(void *words, size_t size, const char *file)
 {
@@ -2395,6 +2712,9 @@ zink_shader_create(struct zink_screen *screen, struct nir_shader *nir,
    NIR_PASS(bindless_lowered, nir, lower_bindless, &bindless);
    ret->bindless |= bindless_lowered;
 
+   if (!screen->info.feats.features.shaderInt64)
+      NIR_PASS_V(nir, lower_64bit_vars);
+
    ret->nir = nir;
    if (so_info && nir->info.outputs_written && nir->info.has_transform_feedback_varyings)
       update_so_info(ret, so_info, nir->info.outputs_written, have_psiz);
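
To make the rewrite_64bit_type() mapping above easier to follow, here is a
standalone sketch of how the doubled 32-bit component counts get chunked into
the replacement vec4/vec2 struct members (illustration only, not Mesa code;
the MIN2-based chunking mirrors the loop in the patch):

#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))

/* rows/cols describe the original 64-bit GLSL type; cols > 1 means a matrix */
static void
print_rewrite(const char *name, unsigned rows, unsigned cols)
{
   unsigned vec_components = rows;
   /* matrix columns with 3 components are padded to vec4-sized slots */
   if (cols > 1 && vec_components == 3)
      vec_components = 4;
   /* every 64-bit component becomes two 32-bit components */
   unsigned num_components = vec_components * 2 * cols;

   printf("%-6s -> ", name);
   if (num_components <= 4) {
      printf("vec%u\n", num_components);
      return;
   }
   printf("struct {");
   for (unsigned remaining = num_components; remaining; remaining -= MIN2(4, remaining))
      printf(" vec%u;", MIN2(4, remaining));
   printf(" }\n");
}

int main(void)
{
   print_rewrite("double", 1, 1);   /* vec2 */
   print_rewrite("dvec2",  2, 1);   /* vec4 */
   print_rewrite("dvec3",  3, 1);   /* struct { vec4; vec2; } */
   print_rewrite("dvec4",  4, 1);   /* struct { vec4; vec4; } */
   print_rewrite("dmat3",  3, 3);   /* struct of six vec4s */
   return 0;
}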



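A companion detail from the store_deref path above: because each 64-bit
component now occupies two 32-bit components, the store's writemask is widened
in the same pattern. A minimal sketch of that expansion (again illustration
only, not Mesa code):

#include <assert.h>
#include <stdio.h>

/* one original writemask bit covers one 64-bit component; after lowering it
 * must cover the two 32-bit components that replaced it */
static unsigned
expand_64bit_writemask(unsigned wrmask, unsigned num_components)
{
   unsigned mask = 0;
   for (unsigned i = 0; i < num_components; i++) {
      if (wrmask & (1u << i))
         mask |= (1u << (i * 2)) | (1u << (i * 2 + 1));
   }
   return mask;
}

int main(void)
{
   /* dvec2 store writing only component 1 (0b10) -> 32-bit mask 0b1100 */
   assert(expand_64bit_writemask(0x2, 2) == 0xc);
   /* dvec4 store writing all components (0b1111) -> 0b11111111 */
   assert(expand_64bit_writemask(0xf, 4) == 0xff);
   printf("writemask expansion ok\n");
   return 0;
}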