Mesa (master): clover: Calculate optimal work group size when it' s not specified by the user.

Mon Nov 4 20:15:08 UTC 2013

Module: Mesa
Branch: master
Commit: bf045bf9b409c47019fa7d9c859eaf8d50dd7032
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=bf045bf9b409c47019fa7d9c859eaf8d50dd7032

Author: Francisco Jerez <currojerez at riseup.net>
Date:   Mon Nov  4 11:26:13 2013 -0800

clover: Calculate optimal work group size when it's not specified by the user.

Inspired by a patch sent to the mailing list by Tom Stellard, but
using a different algorithm to calculate the optimal block size that
has been found to be considerably more effective.

Reviewed-by: Tom Stellard <thomas.stellard at amd.com>

---

 src/gallium/state_trackers/clover/Makefile.sources |    1 +
 src/gallium/state_trackers/clover/api/kernel.cpp   |   41 ++++--
 src/gallium/state_trackers/clover/core/kernel.cpp  |   11 ++-
 src/gallium/state_trackers/clover/core/kernel.hpp  |    7 +-
 src/gallium/state_trackers/clover/util/factor.hpp  |  131 ++++++++++++++++++++
 5 files changed, 176 insertions(+), 15 deletions(-)

diff --git a/src/gallium/state_trackers/clover/Makefile.sources b/src/gallium/state_trackers/clover/Makefile.sources
index e551677..520f52f 100644
--- a/src/gallium/state_trackers/clover/Makefile.sources
+++ b/src/gallium/state_trackers/clover/Makefile.sources
@@ -4,6 +4,7 @@ CPP_SOURCES := \
 	util/algorithm.hpp \
 	util/compat.cpp \
 	util/compat.hpp \
+	util/factor.hpp \
 	util/functional.hpp \
 	util/lazy.hpp \
 	util/pointer.hpp \
diff --git a/src/gallium/state_trackers/clover/api/kernel.cpp b/src/gallium/state_trackers/clover/api/kernel.cpp
index f2f1969..3367363 100644
--- a/src/gallium/state_trackers/clover/api/kernel.cpp
+++ b/src/gallium/state_trackers/clover/api/kernel.cpp
@@ -159,7 +159,7 @@ clGetKernelWorkGroupInfo(cl_kernel d_kern, cl_device_id d_dev,
       break;
 
    case CL_KERNEL_COMPILE_WORK_GROUP_SIZE:
-      buf.as_vector<size_t>() = kern.block_size();
+      buf.as_vector<size_t>() = kern.required_block_size();
       break;
 
    case CL_KERNEL_LOCAL_MEM_SIZE:
@@ -220,6 +220,24 @@ namespace {
       if (!d_grid_size || any_of(is_zero(), grid_size))
          throw error(CL_INVALID_GLOBAL_WORK_SIZE);
 
+      return grid_size;
+   }
+
+   std::vector<size_t>
+   validate_grid_offset(const command_queue &q, cl_uint dims,
+                        const size_t *d_grid_offset) {
+      if (d_grid_offset)
+         return range(d_grid_offset, dims);
+      else
+         return std::vector<size_t>(dims, 0);
+   }
+
+   std::vector<size_t>
+   validate_block_size(const command_queue &q, const kernel &kern,
+                       cl_uint dims, const size_t *d_grid_size,
+                       const size_t *d_block_size) {
+      auto grid_size = range(d_grid_size, dims);
+
       if (d_block_size) {
          auto block_size = range(d_block_size, dims);
 
@@ -233,15 +251,12 @@ namespace {
          if (fold(multiplies(), 1u, block_size) >
              q.dev.max_threads_per_block())
             throw error(CL_INVALID_WORK_GROUP_SIZE);
-      }
-   }
 
-   std::vector<size_t>
-   pad_vector(const size_t *p, unsigned n, size_t x) {
-      if (p)
-         return { p, p + n };
-      else
-         return { n, x };
+         return block_size;
+
+      } else {
+         return kern.optimal_block_size(q, grid_size);
+      }
    }
 }
 
@@ -254,13 +269,13 @@ clEnqueueNDRangeKernel(cl_command_queue d_q, cl_kernel d_kern,
    auto &q = obj(d_q);
    auto &kern = obj(d_kern);
    auto deps = objs<wait_list_tag>(d_deps, num_deps);
+   auto grid_size = validate_grid_size(q, dims, d_grid_size);
+   auto grid_offset = validate_grid_offset(q, dims, d_grid_offset);
+   auto block_size = validate_block_size(q, kern, dims,
+                                         d_grid_size, d_block_size);
 
    validate_common(q, kern, deps);
-   validate_grid(q, dims, d_grid_size, d_block_size);
 
-   auto grid_offset = pad_vector(d_grid_offset, dims, 0);
-   auto grid_size = pad_vector(d_grid_size, dims, 1);
-   auto block_size = pad_vector(d_block_size, dims, 1);
    hard_event *hev = new hard_event(
       q, CL_COMMAND_NDRANGE_KERNEL, deps,
       [=, &kern, &q](event &) {
diff --git a/src/gallium/state_trackers/clover/core/kernel.cpp b/src/gallium/state_trackers/clover/core/kernel.cpp
index 4670046..58780d6 100644
--- a/src/gallium/state_trackers/clover/core/kernel.cpp
+++ b/src/gallium/state_trackers/clover/core/kernel.cpp
@@ -22,6 +22,7 @@
 
 #include "core/kernel.hpp"
 #include "core/resource.hpp"
+#include "util/factor.hpp"
 #include "util/u_math.h"
 #include "pipe/p_context.h"
 
@@ -126,7 +127,15 @@ kernel::name() const {
 }
 
 std::vector<size_t>
-kernel::block_size() const {
+kernel::optimal_block_size(const command_queue &q,
+                           const std::vector<size_t> &grid_size) const {
+   return factor::find_grid_optimal_factor<size_t>(
+      q.dev.max_threads_per_block(), q.dev.max_block_size(),
+      grid_size);
+}
+
+std::vector<size_t>
+kernel::required_block_size() const {
    return { 0, 0, 0 };
 }
 
diff --git a/src/gallium/state_trackers/clover/core/kernel.hpp b/src/gallium/state_trackers/clover/core/kernel.hpp
index 4bcc3c7..f42e199 100644
--- a/src/gallium/state_trackers/clover/core/kernel.hpp
+++ b/src/gallium/state_trackers/clover/core/kernel.hpp
@@ -121,7 +121,12 @@ namespace clover {
       size_t mem_private() const;
 
       const std::string &name() const;
-      std::vector<size_t> block_size() const;
+
+      std::vector<size_t>
+      optimal_block_size(const command_queue &q,
+                         const std::vector<size_t> &grid_size) const;
+      std::vector<size_t>
+      required_block_size() const;
 
       argument_range args();
       const_argument_range args() const;
diff --git a/src/gallium/state_trackers/clover/util/factor.hpp b/src/gallium/state_trackers/clover/util/factor.hpp
new file mode 100644
index 0000000..76d3bfe
--- /dev/null
+++ b/src/gallium/state_trackers/clover/util/factor.hpp
@@ -0,0 +1,131 @@
+//
+// Copyright 2013 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+
+#ifndef CLOVER_UTIL_FACTOR_HPP
+#define CLOVER_UTIL_FACTOR_HPP
+
+#include "util/range.hpp"
+
+namespace clover {
+   namespace factor {
+      ///
+      /// Calculate all prime integer factors of \p x.
+      ///
+      /// If \p limit is non-zero, terminate early as soon as enough
+      /// factors have been collected to reach the product \p limit.
+      ///
+      template<typename T>
+      std::vector<T>
+      find_integer_prime_factors(T x, T limit = 0)
+      {
+         const T max_d = (limit > 0 && limit < x ? limit : x);
+         const T min_x = x / max_d;
+         std::vector<T> factors;
+
+         for (T d = 2; d <= max_d && x > min_x; d++) {
+            if (x % d == 0) {
+               for (; x % d == 0; x /= d);
+               factors.push_back(d);
+            }
+         }
+
+         return factors;
+      }
+
+      namespace detail {
+         ///
+         /// Walk the power set of prime factors of the n-dimensional
+         /// integer array \p grid subject to the constraints given by
+         /// \p limits.
+         ///
+         template<typename T>
+         std::pair<T, std::vector<T>>
+         next_grid_factor(const std::pair<T, std::vector<T>> &limits,
+                          const std::vector<T> &grid,
+                          const std::vector<std::vector<T>> &factors,
+                          std::pair<T, std::vector<T>> block,
+                          unsigned d = 0, unsigned i = 0) {
+            if (d >= factors.size()) {
+               // We're done.
+               return {};
+
+            } else if (i >= factors[d].size()) {
+               // We're done with this grid dimension, try the next.
+               return next_grid_factor(limits, grid, factors,
+                                       std::move(block), d + 1, 0);
+
+            } else {
+               T f = factors[d][i];
+
+               // Try the next power of this factor.
+               block.first *= f;
+               block.second[d] *= f;
+
+               if (block.first <= limits.first &&
+                   block.second[d] <= limits.second[d] &&
+                   grid[d] % block.second[d] == 0) {
+                  // We've found a valid grid divisor.
+                  return block;
+
+               } else {
+                  // Overflow, back off to the zeroth power,
+                  while (block.second[d] % f == 0) {
+                     block.second[d] /= f;
+                     block.first /= f;
+                  }
+
+                  // ...and carry to the next factor.
+                  return next_grid_factor(limits, grid, factors,
+                                          std::move(block), d, i + 1);
+               }
+            }
+         }
+      }
+
+      ///
+      /// Find the divisor of the integer array \p grid that gives the
+      /// highest possible product not greater than \p product_limit
+      /// subject to the constraints given by \p coord_limit.
+      ///
+      template<typename T>
+      std::vector<T>
+      find_grid_optimal_factor(T product_limit,
+                               const std::vector<T> &coord_limit,
+                               const std::vector<T> &grid) {
+         const std::vector<std::vector<T>> factors =
+            map(find_integer_prime_factors<T>, grid, coord_limit);
+         const auto limits = std::make_pair(product_limit, coord_limit);
+         auto best = std::make_pair(T(1), std::vector<T>(grid.size(), T(1)));
+
+         for (auto block = best;
+              block.first != 0 && best.first != product_limit;
+              block = detail::next_grid_factor(limits, grid, factors, block)) {
+            if (block.first > best.first)
+               best = block;
+         }
+
+         return best.second;
+      }
+   }
+}
+
+#endif