Mesa (main): intel/compiler: Use SIMD selection helpers for variable workgroup size

Tue Oct 26 18:06:20 UTC 2021

Module: Mesa
Branch: main
Commit: 4e7b71e00cb9ae27dff4f022fab58e14ea75fa2d
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=4e7b71e00cb9ae27dff4f022fab58e14ea75fa2d

Author: Caio Marcelo de Oliveira Filho <caio.oliveira at intel.com>
Date:   Mon Oct 11 07:49:40 2021 -0700

intel/compiler: Use SIMD selection helpers for variable workgroup size

Variable workgroup size works by compiling as much SIMD variants as
possible and then selecting the right one during dispatch (when the
actual workgroup size is passed to us).

Instead of replicating the logic in a separate function, reuse the
same logic for regular SIMD selection.  And move function for that
together with the remaining simd selection functions.

Reviewed-by: Kenneth Graunke <kenneth at whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13249>

---

 src/intel/compiler/brw_fs.cpp              | 42 ++------------
 src/intel/compiler/brw_private.h           |  4 ++
 src/intel/compiler/brw_simd_selection.c    | 39 +++++++++++++
 src/intel/compiler/test_simd_selection.cpp | 89 ++++++++++++++++++++++++++++++
 4 files changed, 138 insertions(+), 36 deletions(-)

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index c66b3cd8ff6..ccd2f039340 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -10193,40 +10193,6 @@ brw_compile_cs(const struct brw_compiler *compiler,
    return ret;
 }
 
-static unsigned
-brw_cs_simd_size_for_group_size(const struct intel_device_info *devinfo,
-                                const struct brw_cs_prog_data *cs_prog_data,
-                                unsigned group_size)
-{
-   const unsigned mask = cs_prog_data->prog_mask;
-   assert(mask != 0);
-
-   static const unsigned simd8  = 1 << 0;
-   static const unsigned simd16 = 1 << 1;
-   static const unsigned simd32 = 1 << 2;
-
-   if (INTEL_DEBUG(DEBUG_DO32) && (mask & simd32))
-      return 32;
-
-   const uint32_t max_threads = devinfo->max_cs_workgroup_threads;
-
-   if ((mask & simd8) && group_size <= 8 * max_threads) {
-      /* Prefer SIMD16 if can do without spilling.  Matches logic in
-       * brw_simd_selection.cpp.
-       */
-      if ((mask & simd16) && (~cs_prog_data->prog_spilled & simd16))
-         return 16;
-      return 8;
-   }
-
-   if ((mask & simd16) && group_size <= 16 * max_threads)
-      return 16;
-
-   assert(mask & simd32);
-   assert(group_size <= 32 * max_threads);
-   return 32;
-}
-
 struct brw_cs_dispatch_info
 brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
                          const struct brw_cs_prog_data *prog_data,
@@ -10238,9 +10204,13 @@ brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
       override_local_size ? override_local_size :
                             prog_data->local_size;
 
+   const int simd =
+      override_local_size ? brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes) :
+                            brw_simd_select(prog_data);
+   assert(simd >= 0 && simd < 3);
+
    info.group_size = sizes[0] * sizes[1] * sizes[2];
-   info.simd_size =
-      brw_cs_simd_size_for_group_size(devinfo, prog_data, info.group_size);
+   info.simd_size = 8u << simd;
    info.threads = DIV_ROUND_UP(info.group_size, info.simd_size);
 
    const uint32_t remainder = info.group_size & (info.simd_size - 1);
diff --git a/src/intel/compiler/brw_private.h b/src/intel/compiler/brw_private.h
index d166a29e0d5..c4334ce3ff7 100644
--- a/src/intel/compiler/brw_private.h
+++ b/src/intel/compiler/brw_private.h
@@ -47,6 +47,10 @@ void brw_simd_mark_compiled(unsigned simd,
 
 int brw_simd_select(const struct brw_cs_prog_data *prog_data);
 
+int brw_simd_select_for_workgroup_size(const struct intel_device_info *devinfo,
+                                       const struct brw_cs_prog_data *prog_data,
+                                       const unsigned *sizes);
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
diff --git a/src/intel/compiler/brw_simd_selection.c b/src/intel/compiler/brw_simd_selection.c
index 551e882e1a8..2ecf568ad57 100644
--- a/src/intel/compiler/brw_simd_selection.c
+++ b/src/intel/compiler/brw_simd_selection.c
@@ -161,3 +161,42 @@ brw_simd_select(const struct brw_cs_prog_data *prog_data)
    else
       return -1;
 }
+
+int
+brw_simd_select_for_workgroup_size(const struct intel_device_info *devinfo,
+                                   const struct brw_cs_prog_data *prog_data,
+                                   const unsigned *sizes)
+{
+   assert(sizes);
+
+   if (prog_data->local_size[0] == sizes[0] &&
+       prog_data->local_size[1] == sizes[1] &&
+       prog_data->local_size[2] == sizes[2])
+      return brw_simd_select(prog_data);
+
+   void *mem_ctx = ralloc_context(NULL);
+
+   struct brw_cs_prog_data cloned = *prog_data;
+   for (unsigned i = 0; i < 3; i++)
+      cloned.local_size[i] = sizes[i];
+
+   cloned.prog_mask = 0;
+   cloned.prog_spilled = 0;
+
+   const char *error[3] = {0};
+
+   for (unsigned simd = 0; simd < 3; simd++) {
+      /* We are not recompiling, so use original results of prog_mask and
+       * prog_spilled as they will already contain all possible compilations.
+       */
+      if (brw_simd_should_compile(mem_ctx, simd, devinfo, &cloned,
+                                  0 /* required_dispatch_width */, &error[simd]) &&
+          test_bit(prog_data->prog_mask, simd)) {
+         brw_simd_mark_compiled(simd, &cloned, test_bit(prog_data->prog_spilled, simd));
+      }
+   }
+
+   ralloc_free(mem_ctx);
+
+   return brw_simd_select(&cloned);
+}
diff --git a/src/intel/compiler/test_simd_selection.cpp b/src/intel/compiler/test_simd_selection.cpp
index f1be0bf185f..7344c57aca4 100644
--- a/src/intel/compiler/test_simd_selection.cpp
+++ b/src/intel/compiler/test_simd_selection.cpp
@@ -145,6 +145,15 @@ TEST_F(SIMDSelectionCS, WorkgroupSizeVariable)
    brw_simd_mark_compiled(SIMD32, prog_data, not_spilled);
 
    ASSERT_EQ(prog_data->prog_mask, 1u << SIMD8 | 1u << SIMD16 | 1u << SIMD32);
+
+   const unsigned wg_8_1_1[] = { 8, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_8_1_1), SIMD16);
+
+   const unsigned wg_16_1_1[] = { 16, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_16_1_1), SIMD16);
+
+   const unsigned wg_32_1_1[] = { 32, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_32_1_1), SIMD16);
 }
 
 TEST_F(SIMDSelectionCS, WorkgroupSizeVariableSpilled)
@@ -161,6 +170,86 @@ TEST_F(SIMDSelectionCS, WorkgroupSizeVariableSpilled)
    brw_simd_mark_compiled(SIMD32, prog_data, spilled);
 
    ASSERT_EQ(prog_data->prog_mask, 1u << SIMD8 | 1u << SIMD16 | 1u << SIMD32);
+
+   const unsigned wg_8_1_1[] = { 8, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_8_1_1), SIMD8);
+
+   const unsigned wg_16_1_1[] = { 16, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_16_1_1), SIMD8);
+
+   const unsigned wg_32_1_1[] = { 32, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_32_1_1), SIMD8);
+}
+
+TEST_F(SIMDSelectionCS, WorkgroupSizeVariableNoSIMD8)
+{
+   prog_data->local_size[0] = 0;
+   prog_data->local_size[1] = 0;
+   prog_data->local_size[2] = 0;
+
+   ASSERT_TRUE(should_compile(SIMD8));
+   ASSERT_TRUE(should_compile(SIMD16));
+   brw_simd_mark_compiled(SIMD16, prog_data, not_spilled);
+   ASSERT_TRUE(should_compile(SIMD32));
+   brw_simd_mark_compiled(SIMD32, prog_data, not_spilled);
+
+   ASSERT_EQ(prog_data->prog_mask, 1u << SIMD16 | 1u << SIMD32);
+
+   const unsigned wg_8_1_1[] = { 8, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_8_1_1), SIMD16);
+
+   const unsigned wg_16_1_1[] = { 16, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_16_1_1), SIMD16);
+
+   const unsigned wg_32_1_1[] = { 32, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_32_1_1), SIMD16);
+}
+
+TEST_F(SIMDSelectionCS, WorkgroupSizeVariableNoSIMD16)
+{
+   prog_data->local_size[0] = 0;
+   prog_data->local_size[1] = 0;
+   prog_data->local_size[2] = 0;
+
+   ASSERT_TRUE(should_compile(SIMD8));
+   brw_simd_mark_compiled(SIMD8, prog_data, not_spilled);
+   ASSERT_TRUE(should_compile(SIMD16));
+   ASSERT_TRUE(should_compile(SIMD32));
+   brw_simd_mark_compiled(SIMD32, prog_data, not_spilled);
+
+   ASSERT_EQ(prog_data->prog_mask, 1u << SIMD8 | 1u << SIMD32);
+
+   const unsigned wg_8_1_1[] = { 8, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_8_1_1), SIMD8);
+
+   const unsigned wg_16_1_1[] = { 16, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_16_1_1), SIMD8);
+
+   const unsigned wg_32_1_1[] = { 32, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_32_1_1), SIMD8);
+}
+
+TEST_F(SIMDSelectionCS, WorkgroupSizeVariableNoSIMD8NoSIMD16)
+{
+   prog_data->local_size[0] = 0;
+   prog_data->local_size[1] = 0;
+   prog_data->local_size[2] = 0;
+
+   ASSERT_TRUE(should_compile(SIMD8));
+   ASSERT_TRUE(should_compile(SIMD16));
+   ASSERT_TRUE(should_compile(SIMD32));
+   brw_simd_mark_compiled(SIMD32, prog_data, not_spilled);
+
+   ASSERT_EQ(prog_data->prog_mask, 1u << SIMD32);
+
+   const unsigned wg_8_1_1[] = { 8, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_8_1_1), SIMD32);
+
+   const unsigned wg_16_1_1[] = { 16, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_16_1_1), SIMD32);
+
+   const unsigned wg_32_1_1[] = { 32, 1, 1 };
+   ASSERT_EQ(brw_simd_select_for_workgroup_size(devinfo, prog_data, wg_32_1_1), SIMD32);
 }
 
 TEST_F(SIMDSelectionCS, SpillAtSIMD8)