Mesa (main): intel/compiler: Add helpers to select SIMD for compute shaders

Tue Oct 26 18:06:20 UTC 2021

Module: Mesa
Branch: main
Commit: 7558340ebb688a9607f0cbc05f8bad56d77b19fe
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=7558340ebb688a9607f0cbc05f8bad56d77b19fe

Author: Caio Marcelo de Oliveira Filho <caio.oliveira at intel.com>
Date:   Wed Oct  6 22:37:42 2021 -0700

intel/compiler: Add helpers to select SIMD for compute shaders

Clean up the logic and move it to functions that work with prog_data
attributes to select the right SIMD.  This shouldn't change any
behavior compared to the original.

Having it extracted will allow reuse by Task/Mesh and make it easier
to write tests.

Reviewed-by: Kenneth Graunke <kenneth at whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13249>

---

 src/intel/compiler/brw_private.h           |  54 ++++++
 src/intel/compiler/brw_simd_selection.c    | 163 +++++++++++++++++
 src/intel/compiler/meson.build             |   3 +
 src/intel/compiler/test_simd_selection.cpp | 282 +++++++++++++++++++++++++++++
 4 files changed, 502 insertions(+)

diff --git a/src/intel/compiler/brw_private.h b/src/intel/compiler/brw_private.h
new file mode 100644
index 00000000000..d166a29e0d5
--- /dev/null
+++ b/src/intel/compiler/brw_private.h
@@ -0,0 +1,54 @@
+/* -*- c++ -*- */
+/*
+ * Copyright © 2021 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BRW_PRIVATE_H
+#define BRW_PRIVATE_H
+
+#include "brw_compiler.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned brw_required_dispatch_width(const struct shader_info *info,
+                                     enum brw_subgroup_size_type subgroup_size_type);
+
+bool brw_simd_should_compile(void *mem_ctx,
+                             unsigned simd,
+                             const struct intel_device_info *devinfo,
+                             struct brw_cs_prog_data *prog_data,
+                             unsigned required_dispatch_width,
+                             const char **error);
+
+void brw_simd_mark_compiled(unsigned simd,
+                            struct brw_cs_prog_data *prog_data,
+                            bool spilled);
+
+int brw_simd_select(const struct brw_cs_prog_data *prog_data);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif // BRW_PRIVATE_H
diff --git a/src/intel/compiler/brw_simd_selection.c b/src/intel/compiler/brw_simd_selection.c
new file mode 100644
index 00000000000..551e882e1a8
--- /dev/null
+++ b/src/intel/compiler/brw_simd_selection.c
@@ -0,0 +1,163 @@
+/*
+ * Copyright © 2021 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_private.h"
+#include "compiler/shader_info.h"
+#include "intel/dev/intel_debug.h"
+#include "intel/dev/intel_device_info.h"
+#include "util/ralloc.h"
+
+unsigned
+brw_required_dispatch_width(const struct shader_info *info,
+                            enum brw_subgroup_size_type subgroup_size_type)
+{
+   unsigned required = 0;
+
+   if ((int)subgroup_size_type >= (int)BRW_SUBGROUP_SIZE_REQUIRE_8) {
+      assert(gl_shader_stage_uses_workgroup(info->stage));
+      /* These enum values are expressly chosen to be equal to the subgroup
+       * size that they require.
+       */
+      required = (unsigned)subgroup_size_type;
+   }
+
+   if (gl_shader_stage_is_compute(info->stage) && info->cs.subgroup_size > 0) {
+      assert(required == 0 || required == info->cs.subgroup_size);
+      required = info->cs.subgroup_size;
+   }
+
+   return required;
+}
+
+static inline bool
+test_bit(unsigned mask, unsigned bit) {
+   return mask & (1u << bit);
+}
+
+bool
+brw_simd_should_compile(void *mem_ctx,
+                        unsigned simd,
+                        const struct intel_device_info *devinfo,
+                        struct brw_cs_prog_data *prog_data,
+                        unsigned required,
+                        const char **error)
+
+{
+   assert(!test_bit(prog_data->prog_mask, simd));
+   assert(error);
+
+   const unsigned width = 8u << simd;
+
+   /* For shaders with variable size workgroup, we will always compile all the
+    * variants, since the choice will happen only at dispatch time.
+    */
+   const bool workgroup_size_variable = prog_data->local_size[0] == 0;
+
+   if (!workgroup_size_variable) {
+      if (test_bit(prog_data->prog_spilled, simd)) {
+         *error = ralloc_asprintf(
+            mem_ctx, "SIMD%u skipped because would spill", width);
+         return false;
+      }
+
+      const unsigned workgroup_size = prog_data->local_size[0] *
+                                      prog_data->local_size[1] *
+                                      prog_data->local_size[2];
+
+      unsigned max_threads = devinfo->max_cs_workgroup_threads;
+
+      if (required && required != width) {
+         *error = ralloc_asprintf(
+            mem_ctx, "SIMD%u skipped because required dispatch width is %u",
+            width, required);
+         return false;
+      }
+
+      /* TODO: Ignore SIMD larger than workgroup if previous SIMD already passed. */
+
+      if (DIV_ROUND_UP(workgroup_size, width) > max_threads) {
+         *error = ralloc_asprintf(
+            mem_ctx, "SIMD%u can't fit all %u invocations in %u threads",
+            width, workgroup_size, max_threads);
+         return false;
+      }
+
+      /* The SIMD32 is only enabled for cases it is needed unless forced.
+       *
+       * TODO: Use performance_analysis and drop this rule.
+       */
+      if (width == 32) {
+         if (!INTEL_DEBUG(DEBUG_DO32) && prog_data->prog_mask) {
+            *error = ralloc_strdup(
+               mem_ctx, "SIMD32 skipped because not required");
+            return false;
+         }
+      }
+   }
+
+   const bool env_skip[3] = {
+      INTEL_DEBUG(DEBUG_NO8),
+      INTEL_DEBUG(DEBUG_NO16),
+      INTEL_DEBUG(DEBUG_NO32),
+   };
+
+   if (unlikely(env_skip[simd])) {
+      *error = ralloc_asprintf(
+         mem_ctx, "SIMD%u skipped because INTEL_DEBUG=no%u",
+         width, width);
+      return false;
+   }
+
+   return true;
+}
+
+void
+brw_simd_mark_compiled(unsigned simd, struct brw_cs_prog_data *prog_data, bool spilled)
+{
+   assert(!test_bit(prog_data->prog_mask, simd));
+
+   prog_data->prog_mask |= 1u << simd;
+
+   /* If a SIMD spilled, all the larger ones would spill too. */
+   if (spilled) {
+      for (unsigned i = simd; i < 3; i++)
+         prog_data->prog_spilled |= 1u << i;
+   }
+}
+
+int
+brw_simd_select(const struct brw_cs_prog_data *prog_data)
+{
+   assert((prog_data->prog_mask & ~0x7u) == 0);
+   const unsigned not_spilled_mask =
+      prog_data->prog_mask & ~prog_data->prog_spilled;
+
+   /* Util functions index bits from 1 instead of 0, adjust before return. */
+
+   if (not_spilled_mask)
+      return util_last_bit(not_spilled_mask) - 1;
+   else if (prog_data->prog_mask)
+      return ffs(prog_data->prog_mask) - 1;
+   else
+      return -1;
+}
diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build
index 10c9cff703c..dadb75d43e5 100644
--- a/src/intel/compiler/meson.build
+++ b/src/intel/compiler/meson.build
@@ -98,6 +98,7 @@ libintel_compiler_files = files(
   'brw_nir_clamp_image_1d_2d_array_sizes.c',
   'brw_packed_float.c',
   'brw_predicated_break.cpp',
+  'brw_private.h',
   'brw_reg.h',
   'brw_reg_type.c',
   'brw_reg_type.h',
@@ -105,6 +106,7 @@ libintel_compiler_files = files(
   'brw_schedule_instructions.cpp',
   'brw_shader.cpp',
   'brw_shader.h',
+  'brw_simd_selection.c',
   'brw_vec4_builder.h',
   'brw_vec4_cmod_propagation.cpp',
   'brw_vec4_copy_propagation.cpp',
@@ -169,6 +171,7 @@ if with_tests
         'test_fs_copy_propagation.cpp',
         'test_fs_saturate_propagation.cpp',
         'test_fs_scoreboard.cpp',
+        'test_simd_selection.cpp',
         'test_vec4_cmod_propagation.cpp',
         'test_vec4_copy_propagation.cpp',
         'test_vec4_dead_code_eliminate.cpp',
diff --git a/src/intel/compiler/test_simd_selection.cpp b/src/intel/compiler/test_simd_selection.cpp
new file mode 100644
index 00000000000..f1be0bf185f
--- /dev/null
+++ b/src/intel/compiler/test_simd_selection.cpp
@@ -0,0 +1,282 @@
+/*
+ * Copyright © 2021 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+
+#include "brw_private.h"
+#include "compiler/shader_info.h"
+#include "intel/dev/intel_debug.h"
+#include "intel/dev/intel_device_info.h"
+#include "util/ralloc.h"
+
+#include <gtest/gtest.h>
+
+enum {
+   SIMD8  = 0,
+   SIMD16 = 1,
+   SIMD32 = 2,
+};
+
+const bool spilled = true;
+const bool not_spilled = false;
+
+class SIMDSelectionTest : public ::testing::Test {
+protected:
+   SIMDSelectionTest() {
+      mem_ctx = ralloc_context(NULL);
+      devinfo = rzalloc(mem_ctx, intel_device_info);
+      prog_data = rzalloc(mem_ctx, struct brw_cs_prog_data);
+      required_dispatch_width = 0;
+   }
+
+   ~SIMDSelectionTest() {
+      ralloc_free(mem_ctx);
+   };
+
+   bool should_compile(unsigned simd) {
+      return brw_simd_should_compile(mem_ctx, simd, devinfo, prog_data,
+                                     required_dispatch_width, &error[simd]);
+   }
+
+   void *mem_ctx;
+   intel_device_info *devinfo;
+   struct brw_cs_prog_data *prog_data;
+   const char *error[3];
+   unsigned required_dispatch_width;
+};
+
+class SIMDSelectionCS : public SIMDSelectionTest {
+protected:
+   SIMDSelectionCS() {
+      prog_data->base.stage = MESA_SHADER_COMPUTE;
+      prog_data->local_size[0] = 32;
+      prog_data->local_size[1] = 1;
+      prog_data->local_size[2] = 1;
+
+      devinfo->max_cs_workgroup_threads = 64;
+   }
+};
+
+TEST_F(SIMDSelectionCS, DefaultsToSIMD16)
+{
+   ASSERT_TRUE(should_compile(SIMD8));
+   brw_simd_mark_compiled(SIMD8, prog_data, not_spilled);
+   ASSERT_TRUE(should_compile(SIMD16));
+   brw_simd_mark_compiled(SIMD16, prog_data, not_spilled);
+   ASSERT_FALSE(should_compile(SIMD32));
+
+   ASSERT_EQ(brw_simd_select(prog_data), SIMD16);
+}
+
+TEST_F(SIMDSelectionCS, TooBigFor16)
+{
+   prog_data->local_size[0] = devinfo->max_cs_workgroup_threads;
+   prog_data->local_size[1] = 32;
+   prog_data->local_size[2] = 1;
+
+   ASSERT_FALSE(should_compile(SIMD8));
+   ASSERT_FALSE(should_compile(SIMD16));
+   ASSERT_TRUE(should_compile(SIMD32));
+   brw_simd_mark_compiled(SIMD32, prog_data, spilled);
+
+   ASSERT_EQ(brw_simd_select(prog_data), SIMD32);
+}
+
+TEST_F(SIMDSelectionCS, WorkgroupSize1)
+{
+   prog_data->local_size[0] = 1;
+   prog_data->local_size[1] = 1;
+   prog_data->local_size[2] = 1;
+
+   ASSERT_TRUE(should_compile(SIMD8));
+   brw_simd_mark_compiled(SIMD8, prog_data, not_spilled);
+   ASSERT_TRUE(should_compile(SIMD16));
+   brw_simd_mark_compiled(SIMD16, prog_data, not_spilled);
+   ASSERT_FALSE(should_compile(SIMD32));
+
+   ASSERT_EQ(brw_simd_select(prog_data), SIMD16);
+}
+
+TEST_F(SIMDSelectionCS, WorkgroupSize8)
+{
+   prog_data->local_size[0] = 8;
+   prog_data->local_size[1] = 1;
+   prog_data->local_size[2] = 1;
+
+   ASSERT_TRUE(should_compile(SIMD8));
+   brw_simd_mark_compiled(SIMD8, prog_data, not_spilled);
+   ASSERT_TRUE(should_compile(SIMD16));
+   brw_simd_mark_compiled(SIMD16, prog_data, not_spilled);
+   ASSERT_FALSE(should_compile(SIMD32));
+
+   ASSERT_EQ(brw_simd_select(prog_data), SIMD16);
+}
+
+TEST_F(SIMDSelectionCS, WorkgroupSizeVariable)
+{
+   prog_data->local_size[0] = 0;
+   prog_data->local_size[1] = 0;
+   prog_data->local_size[2] = 0;
+
+   ASSERT_TRUE(should_compile(SIMD8));
+   brw_simd_mark_compiled(SIMD8, prog_data, not_spilled);
+   ASSERT_TRUE(should_compile(SIMD16));
+   brw_simd_mark_compiled(SIMD16, prog_data, not_spilled);
+   ASSERT_TRUE(should_compile(SIMD32));
+   brw_simd_mark_compiled(SIMD32, prog_data, not_spilled);
+
+   ASSERT_EQ(prog_data->prog_mask, 1u << SIMD8 | 1u << SIMD16 | 1u << SIMD32);
+}
+
+TEST_F(SIMDSelectionCS, WorkgroupSizeVariableSpilled)
+{
+   prog_data->local_size[0] = 0;
+   prog_data->local_size[1] = 0;
+   prog_data->local_size[2] = 0;
+
+   ASSERT_TRUE(should_compile(SIMD8));
+   brw_simd_mark_compiled(SIMD8, prog_data, spilled);
+   ASSERT_TRUE(should_compile(SIMD16));
+   brw_simd_mark_compiled(SIMD16, prog_data, spilled);
+   ASSERT_TRUE(should_compile(SIMD32));
+   brw_simd_mark_compiled(SIMD32, prog_data, spilled);
+
+   ASSERT_EQ(prog_data->prog_mask, 1u << SIMD8 | 1u << SIMD16 | 1u << SIMD32);
+}
+
+TEST_F(SIMDSelectionCS, SpillAtSIMD8)
+{
+   ASSERT_TRUE(should_compile(SIMD8));
+   brw_simd_mark_compiled(SIMD8, prog_data, spilled);
+   ASSERT_FALSE(should_compile(SIMD16));
+   ASSERT_FALSE(should_compile(SIMD32));
+
+   ASSERT_EQ(brw_simd_select(prog_data), SIMD8);
+}
+
+TEST_F(SIMDSelectionCS, SpillAtSIMD16)
+{
+   ASSERT_TRUE(should_compile(SIMD8));
+   brw_simd_mark_compiled(SIMD8, prog_data, not_spilled);
+   ASSERT_TRUE(should_compile(SIMD16));
+   brw_simd_mark_compiled(SIMD16, prog_data, spilled);
+   ASSERT_FALSE(should_compile(SIMD32));
+
+   ASSERT_EQ(brw_simd_select(prog_data), SIMD8);
+}
+
+TEST_F(SIMDSelectionCS, EnvironmentVariable32)
+{
+   intel_debug |= DEBUG_DO32;
+
+   ASSERT_TRUE(should_compile(SIMD8));
+   brw_simd_mark_compiled(SIMD8, prog_data, not_spilled);
+   ASSERT_TRUE(should_compile(SIMD16));
+   brw_simd_mark_compiled(SIMD16, prog_data, not_spilled);
+   ASSERT_TRUE(should_compile(SIMD32));
+   brw_simd_mark_compiled(SIMD32, prog_data, not_spilled);
+
+   ASSERT_EQ(brw_simd_select(prog_data), SIMD32);
+}
+
+TEST_F(SIMDSelectionCS, EnvironmentVariable32ButSpills)
+{
+   intel_debug |= DEBUG_DO32;
+
+   ASSERT_TRUE(should_compile(SIMD8));
+   brw_simd_mark_compiled(SIMD8, prog_data, not_spilled);
+   ASSERT_TRUE(should_compile(SIMD16));
+   brw_simd_mark_compiled(SIMD16, prog_data, not_spilled);
+   ASSERT_TRUE(should_compile(SIMD32));
+   brw_simd_mark_compiled(SIMD32, prog_data, spilled);
+
+   ASSERT_EQ(brw_simd_select(prog_data), SIMD16);
+}
+
+TEST_F(SIMDSelectionCS, Require8)
+{
+   required_dispatch_width = 8;
+
+   ASSERT_TRUE(should_compile(SIMD8));
+   brw_simd_mark_compiled(SIMD8, prog_data, not_spilled);
+   ASSERT_FALSE(should_compile(SIMD16));
+   ASSERT_FALSE(should_compile(SIMD32));
+
+   ASSERT_EQ(brw_simd_select(prog_data), SIMD8);
+}
+
+TEST_F(SIMDSelectionCS, Require8ErrorWhenNotCompile)
+{
+   required_dispatch_width = 8;
+
+   ASSERT_TRUE(should_compile(SIMD8));
+   ASSERT_FALSE(should_compile(SIMD16));
+   ASSERT_FALSE(should_compile(SIMD32));
+
+   ASSERT_EQ(brw_simd_select(prog_data), -1);
+}
+
+TEST_F(SIMDSelectionCS, Require16)
+{
+   required_dispatch_width = 16;
+
+   ASSERT_FALSE(should_compile(SIMD8));
+   ASSERT_TRUE(should_compile(SIMD16));
+   brw_simd_mark_compiled(SIMD16, prog_data, not_spilled);
+   ASSERT_FALSE(should_compile(SIMD32));
+
+   ASSERT_EQ(brw_simd_select(prog_data), SIMD16);
+}
+
+TEST_F(SIMDSelectionCS, Require16ErrorWhenNotCompile)
+{
+   required_dispatch_width = 16;
+
+   ASSERT_FALSE(should_compile(SIMD8));
+   ASSERT_TRUE(should_compile(SIMD16));
+   ASSERT_FALSE(should_compile(SIMD32));
+
+   ASSERT_EQ(brw_simd_select(prog_data), -1);
+}
+
+TEST_F(SIMDSelectionCS, Require32)
+{
+   required_dispatch_width = 32;
+
+   ASSERT_FALSE(should_compile(SIMD8));
+   ASSERT_FALSE(should_compile(SIMD16));
+   ASSERT_TRUE(should_compile(SIMD32));
+   brw_simd_mark_compiled(SIMD32, prog_data, not_spilled);
+
+   ASSERT_EQ(brw_simd_select(prog_data), SIMD32);
+}
+
+TEST_F(SIMDSelectionCS, Require32ErrorWhenNotCompile)
+{
+   required_dispatch_width = 32;
+
+   ASSERT_FALSE(should_compile(SIMD8));
+   ASSERT_FALSE(should_compile(SIMD16));
+   ASSERT_TRUE(should_compile(SIMD32));
+
+   ASSERT_EQ(brw_simd_select(prog_data), -1);
+}