Mesa (main): gallivm: Optimize single-invocation SSBO stores.

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Tue Jul 26 19:29:24 UTC 2022


Module: Mesa
Branch: main
Commit: bd8740da77c191e1da7c93ff0e42df333840212f
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=bd8740da77c191e1da7c93ff0e42df333840212f

Author: Emma Anholt <emma at anholt.net>
Date:   Sun Jul 24 08:11:49 2022 -0700

gallivm: Optimize single-invocation SSBO stores.

The CTS does a lot of 1x1x1 compute shaders (all that stuff like
dEQP-GLES31.functional.shaders.builtin_functions.precision.mul.highp_compute.scalar)
which finish with store_ssbos.  Instead of doing the invocation loop in
that case (which LLVM has to later unroll), just emit the single
invocation's store.

Fixes timeouts running
dEQP-GLES31.functional.ssbo.layout.random.all_shared_buffer.36, which does
a spectacular number of SSBO stores in a long 1x1x1 compute shader.
Reduces its runtime on llvmpipe from 66s to 29s locally, and on virgl from
1:38 to 43s.  On virgl,
dEQP-GLES31.functional.ssbo.layout.random.nested_structs_arrays_instance_arrays.22
goes down to 7 seconds.

Fixes: #6797
Reviewed-by: Dave Airlie <airlied at redhat.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17730>

---

 src/gallium/auxiliary/gallivm/lp_bld_nir.c         |  6 ++--
 src/gallium/auxiliary/gallivm/lp_bld_nir.h         |  1 +
 src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c     | 36 ++++++++++++++++++++++
 src/gallium/drivers/llvmpipe/ci/llvmpipe-skips.txt |  1 -
 src/gallium/drivers/virgl/ci/virgl-gl-fails.txt    |  3 ++
 src/gallium/drivers/virgl/ci/virgl-gl-skips.txt    |  2 +-
 src/gallium/drivers/virgl/ci/virgl-gles-fails.txt  |  3 ++
 src/gallium/drivers/virgl/ci/virgl-gles-flakes.txt |  6 ----
 src/gallium/drivers/virgl/ci/virgl-gles-skips.txt  |  4 +++
 9 files changed, 52 insertions(+), 10 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_nir.c b/src/gallium/auxiliary/gallivm/lp_bld_nir.c
index 858002f5d15..3cf110f533a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_nir.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_nir.c
@@ -1550,10 +1550,11 @@ visit_store_ssbo(struct lp_build_nir_context *bld_base,
    LLVMValueRef val = get_src(bld_base, instr->src[0]);
    LLVMValueRef idx = cast_type(bld_base, get_src(bld_base, instr->src[1]), nir_type_uint, 32);
    LLVMValueRef offset = get_src(bld_base, instr->src[2]);
+   bool index_and_offset_are_uniform = nir_src_is_always_uniform(instr->src[1]) && nir_src_is_always_uniform(instr->src[2]);
    int writemask = instr->const_index[0];
    int nc = nir_src_num_components(instr->src[0]);
    int bitsize = nir_src_bit_size(instr->src[0]);
-   bld_base->store_mem(bld_base, writemask, nc, bitsize, idx, offset, val);
+   bld_base->store_mem(bld_base, writemask, nc, bitsize, index_and_offset_are_uniform, idx, offset, val);
 }
 
 
@@ -1821,10 +1822,11 @@ visit_shared_store(struct lp_build_nir_context *bld_base,
 {
    LLVMValueRef val = get_src(bld_base, instr->src[0]);
    LLVMValueRef offset = get_src(bld_base, instr->src[1]);
+   bool offset_is_uniform = nir_src_is_always_uniform(instr->src[1]);
    int writemask = instr->const_index[1];
    int nc = nir_src_num_components(instr->src[0]);
    int bitsize = nir_src_bit_size(instr->src[0]);
-   bld_base->store_mem(bld_base, writemask, nc, bitsize, NULL, offset, val);
+   bld_base->store_mem(bld_base, writemask, nc, bitsize, offset_is_uniform, NULL, offset, val);
 }
 
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_nir.h b/src/gallium/auxiliary/gallivm/lp_bld_nir.h
index 6c40d982ad1..13236719a1c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_nir.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_nir.h
@@ -115,6 +115,7 @@ struct lp_build_nir_context
                     LLVMValueRef index, LLVMValueRef offset, LLVMValueRef result[NIR_MAX_VEC_COMPONENTS]);
    void (*store_mem)(struct lp_build_nir_context *bld_base,
                      unsigned writemask, unsigned nc, unsigned bit_size,
+                     bool index_and_offset_are_uniform,
                      LLVMValueRef index, LLVMValueRef offset, LLVMValueRef dst);
 
    void (*atomic_mem)(struct lp_build_nir_context *bld_base,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c
index a7a1cf5d800..a7b2af8a8c6 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c
@@ -1296,6 +1296,7 @@ static void emit_store_mem(struct lp_build_nir_context *bld_base,
                            unsigned writemask,
                            unsigned nc,
                            unsigned bit_size,
+                           bool index_and_offset_are_uniform,
                            LLVMValueRef index,
                            LLVMValueRef offset,
                            LLVMValueRef dst)
@@ -1310,6 +1311,41 @@ static void emit_store_mem(struct lp_build_nir_context *bld_base,
 
    offset = lp_build_shr_imm(uint_bld, offset, shift_val);
 
+   /* If the address is uniform, then just store the value from the first
+    * channel instead of making LLVM unroll the invocation loop.
+    */
+   if (index_and_offset_are_uniform && invocation_0_must_be_active(bld_base)) {
+      LLVMValueRef ssbo_limit;
+      LLVMValueRef mem_ptr = mem_access_base_pointer(bld_base, store_bld, bit_size, index,
+                                                     lp_build_const_int32(gallivm, 0), &ssbo_limit);
+
+      offset = LLVMBuildExtractElement(gallivm->builder, offset, lp_build_const_int32(gallivm, 0), "");
+
+      for (unsigned c = 0; c < nc; c++) {
+         if (!(writemask & (1u << c)))
+            continue;
+
+         /* Pick out invocation 0's value. */
+         LLVMValueRef val = (nc == 1) ? dst : LLVMBuildExtractValue(builder, dst, c, "");
+         LLVMValueRef value_ptr = LLVMBuildExtractElement(gallivm->builder, val,
+                                                         lp_build_const_int32(gallivm, 0), "");
+         value_ptr = LLVMBuildBitCast(gallivm->builder, value_ptr, store_bld->elem_type, "");
+
+         LLVMValueRef chan_offset = LLVMBuildAdd(builder, offset, lp_build_const_int32(gallivm, c), "");
+
+         /* If storing outside the SSBO, we need to skip the store instead. */
+         if (ssbo_limit) {
+            struct lp_build_if_state ifthen;
+            lp_build_if(&ifthen, gallivm, lp_offset_in_range(bld_base, chan_offset, ssbo_limit));
+            lp_build_pointer_set(builder, mem_ptr, chan_offset, value_ptr);
+            lp_build_endif(&ifthen);
+         } else {
+            lp_build_pointer_set(builder, mem_ptr, chan_offset, value_ptr);
+         }
+      }
+      return;
+   }
+
    LLVMValueRef exec_mask = mask_vec(bld_base);
    LLVMValueRef cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, exec_mask, uint_bld->zero, "");
    struct lp_build_loop_state loop_state;
diff --git a/src/gallium/drivers/llvmpipe/ci/llvmpipe-skips.txt b/src/gallium/drivers/llvmpipe/ci/llvmpipe-skips.txt
index 50c6669ad97..dabe47e4340 100644
--- a/src/gallium/drivers/llvmpipe/ci/llvmpipe-skips.txt
+++ b/src/gallium/drivers/llvmpipe/ci/llvmpipe-skips.txt
@@ -13,7 +13,6 @@ KHR-GL45.texture_size_promotion.functional
 KHR-GL45.texture_swizzle.functional
 KHR-GL45.texture_swizzle.smoke
 KHR-GLES31.core.arrays_of_arrays.InteractionFunctionCalls2
-dEQP-GLES31.functional.ssbo.layout.random.all_shared_buffer.36
 arb_pipeline_statistics_query-comp
 gl-1.0-blend-func
 
diff --git a/src/gallium/drivers/virgl/ci/virgl-gl-fails.txt b/src/gallium/drivers/virgl/ci/virgl-gl-fails.txt
index 2b7f51ea990..2133432f82f 100644
--- a/src/gallium/drivers/virgl/ci/virgl-gl-fails.txt
+++ b/src/gallium/drivers/virgl/ci/virgl-gl-fails.txt
@@ -36,7 +36,10 @@ dEQP-GLES31.functional.shaders.sample_variables.sample_mask_in.bit_count_per_two
 dEQP-GLES31.functional.shaders.sample_variables.sample_mask_in.bit_count_per_two_samples.multisample_rbo_2,Fail
 dEQP-GLES31.functional.shaders.sample_variables.sample_mask_in.bit_count_per_two_samples.multisample_texture_1,Fail
 dEQP-GLES31.functional.shaders.sample_variables.sample_mask_in.bit_count_per_two_samples.multisample_texture_2,Fail
+
+# Times out waiting for >15s compile on the host side.
 dEQP-GLES31.functional.ssbo.layout.random.all_shared_buffer.36,Fail
+dEQP-GLES31.functional.ssbo.layout.random.nested_structs_arrays_instance_arrays.22,Fail
 
 KHR-GL30.transform_feedback.api_errors_test,Fail
 KHR-GL32.transform_feedback_overflow_query_ARB.advanced-single-stream-interleaved-attribs,Fail
diff --git a/src/gallium/drivers/virgl/ci/virgl-gl-skips.txt b/src/gallium/drivers/virgl/ci/virgl-gl-skips.txt
index 4c3e6f27fa9..5fc9f2d6e2c 100644
--- a/src/gallium/drivers/virgl/ci/virgl-gl-skips.txt
+++ b/src/gallium/drivers/virgl/ci/virgl-gl-skips.txt
@@ -5,7 +5,7 @@
 # Sometimes crashes, e.g. https://gitlab.freedesktop.org/kusma/mesa/-/jobs/4109419
 dEQP-GLES31.functional.compute.basic.empty
 
-# too slow.
+# too slow (>15s compile on host causes timeouts that make for flakes)
 dEQP-GLES31.functional.ssbo.layout.random.all_shared_buffer.36
 dEQP-GLES31.functional.ssbo.layout.random.nested_structs_arrays_instance_arrays.22
 
diff --git a/src/gallium/drivers/virgl/ci/virgl-gles-fails.txt b/src/gallium/drivers/virgl/ci/virgl-gles-fails.txt
index bc664b233bc..68db7b58f53 100644
--- a/src/gallium/drivers/virgl/ci/virgl-gles-fails.txt
+++ b/src/gallium/drivers/virgl/ci/virgl-gles-fails.txt
@@ -26,8 +26,11 @@ dEQP-GLES31.functional.shaders.sample_variables.sample_mask_in.bit_count_per_two
 dEQP-GLES31.functional.shaders.sample_variables.sample_mask_in.bit_count_per_two_samples.multisample_rbo_2,Fail
 dEQP-GLES31.functional.shaders.sample_variables.sample_mask_in.bit_count_per_two_samples.multisample_texture_1,Fail
 dEQP-GLES31.functional.shaders.sample_variables.sample_mask_in.bit_count_per_two_samples.multisample_texture_2,Fail
+
+# Times out waiting for >15s compile on the host side.
 dEQP-GLES31.functional.ssbo.layout.random.all_shared_buffer.36,Fail
 dEQP-GLES31.functional.ssbo.layout.random.nested_structs_arrays_instance_arrays.22,Fail
+
 KHR-GL30.glsl_noperspective.functionaltest,Fail
 
 KHR-GL30.transform_feedback.api_errors_test,Fail
diff --git a/src/gallium/drivers/virgl/ci/virgl-gles-flakes.txt b/src/gallium/drivers/virgl/ci/virgl-gles-flakes.txt
index 8fec80970e3..a40530cf5d9 100644
--- a/src/gallium/drivers/virgl/ci/virgl-gles-flakes.txt
+++ b/src/gallium/drivers/virgl/ci/virgl-gles-flakes.txt
@@ -12,9 +12,3 @@ dEQP-GLES2.functional.clipping.point.wide_point_clip_viewport_corner
 dEQP-GLES3.functional.clipping.point.wide_point_clip
 dEQP-GLES3.functional.clipping.point.wide_point_clip_viewport_center
 dEQP-GLES3.functional.clipping.point.wide_point_clip_viewport_corner
-
-dEQP-GLES31.functional.ssbo.layout.random.arrays_of_arrays.1
-
-dEQP-GLES31.functional.ssbo.layout.unsized_struct_array.single_buffer.std430_instance_array
-dEQP-GLES31.functional.ssbo.layout.unsized_struct_array.per_block_buffer.std430_instance_array
-dEQP-GLES31.functional.ssbo.layout.unsized_struct_array.single_buffer.std140_instance_array
diff --git a/src/gallium/drivers/virgl/ci/virgl-gles-skips.txt b/src/gallium/drivers/virgl/ci/virgl-gles-skips.txt
index d47f896e062..fd7e4d1848d 100644
--- a/src/gallium/drivers/virgl/ci/virgl-gles-skips.txt
+++ b/src/gallium/drivers/virgl/ci/virgl-gles-skips.txt
@@ -4,5 +4,9 @@
 
 KHR-GL32.packed_pixels.varied_rectangle.depth*
 
+# too slow (>15s compile on host causes timeouts that make for flakes)
+dEQP-GLES31.functional.ssbo.layout.random.all_shared_buffer.36
+dEQP-GLES31.functional.ssbo.layout.random.nested_structs_arrays_instance_arrays.22
+
 # Sometimes crashes, e.g. https://gitlab.freedesktop.org/kusma/mesa/-/jobs/4109419
 dEQP-GLES31.functional.compute.basic.empty



More information about the mesa-commit mailing list