[Mesa-dev] [PATCH] clover: Calculate the optimal work group size when local_size is NULL
Tom Stellard
tom at stellard.net
Tue Oct 29 19:20:53 CET 2013
From: Tom Stellard <thomas.stellard at amd.com>
This results in huge performance improvements for applications like
pyrit, which depend on the implementation to determine the optimal
work group size.
---
src/gallium/state_trackers/clover/api/kernel.cpp | 9 ++++++++-
src/gallium/state_trackers/clover/core/device.cpp | 19 +++++++++++++++++++
src/gallium/state_trackers/clover/core/device.hpp | 2 ++
3 files changed, 29 insertions(+), 1 deletion(-)
diff --git a/src/gallium/state_trackers/clover/api/kernel.cpp b/src/gallium/state_trackers/clover/api/kernel.cpp
index 13113a2..93cfe98 100644
--- a/src/gallium/state_trackers/clover/api/kernel.cpp
+++ b/src/gallium/state_trackers/clover/api/kernel.cpp
@@ -276,10 +276,17 @@ clEnqueueNDRangeKernel(cl_command_queue q, cl_kernel kern,
cl_event *ev) try {
auto grid_offset = opt_vector(pgrid_offset, dims, 0);
auto grid_size = opt_vector(pgrid_size, dims, 1);
- auto block_size = opt_vector(pblock_size, dims, 1);
kernel_validate(q, kern, dims, pgrid_offset, pgrid_size, pblock_size,
num_deps, deps, ev);
+ std::vector<size_t> block_size;
+ if (pblock_size) {
+ block_size = opt_vector(pblock_size, dims, 1);
+ } else {
+ std::vector<size_t> optimal_block_size =
+ q->dev.optimal_block_size(grid_size.data(), dims);
+ block_size = opt_vector(optimal_block_size.data(), dims, 1);
+ }
hard_event *hev = new hard_event(
*q, CL_COMMAND_NDRANGE_KERNEL, { deps, deps + num_deps },
diff --git a/src/gallium/state_trackers/clover/core/device.cpp b/src/gallium/state_trackers/clover/core/device.cpp
index 94faeee..710e175 100644
--- a/src/gallium/state_trackers/clover/core/device.cpp
+++ b/src/gallium/state_trackers/clover/core/device.cpp
@@ -169,6 +169,25 @@ _cl_device_id::max_block_size() const {
return { v.begin(), v.end() };
}
+std::vector<size_t>
+device::optimal_block_size(const size_t *grid_size, unsigned dims) const {
+
+ size_t max_threads = max_threads_per_block();
+ std::vector<size_t> max_size = max_block_size();
+ size_t threads = 1;
+ std::vector<size_t> optimal;
+
+ for (unsigned i = 0; i < dims; i++) {
+ size_t dim_size = max_size[i];
+ while (dim_size * threads > max_threads || (grid_size[i] % dim_size != 0)) {
+ dim_size >>= 1;
+ }
+ threads *= dim_size;
+ optimal.push_back(dim_size);
+ }
+ return optimal;
+}
+
std::string
_cl_device_id::device_name() const {
return pipe->get_name(pipe);
diff --git a/src/gallium/state_trackers/clover/core/device.hpp b/src/gallium/state_trackers/clover/core/device.hpp
index c6e50db..227c558 100644
--- a/src/gallium/state_trackers/clover/core/device.hpp
+++ b/src/gallium/state_trackers/clover/core/device.hpp
@@ -62,6 +62,8 @@ public:
cl_ulong max_mem_alloc_size() const;
std::vector<size_t> max_block_size() const;
+ std::vector<size_t> optimal_block_size(const size_t *grid_size,
+ unsigned dims) const;
std::string device_name() const;
std::string vendor_name() const;
enum pipe_shader_ir ir_format() const;
--
1.8.1.4
More information about the mesa-dev
mailing list