[Beignet] [PATCH v2] GBE: Add support for kernel attribute reqd_work_group_size.
Zhigang Gong
zhigang.gong at intel.com
Mon Nov 11 23:14:47 PST 2013
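When a kernel has the __attribute__((reqd_work_group_size(X, Y, Z))) qualifier,
it only accepts that work-group size: clEnqueueNDRangeKernel fails with CL_INVALID_WORK_GROUP_SIZE for any other local size, and the required size is reported through CL_KERNEL_COMPILE_WORK_GROUP_SIZE.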
v2: add binary load/store support.
Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
backend/src/backend/program.cpp | 17 ++++++++++++++++-
backend/src/backend/program.h | 4 ++++
backend/src/backend/program.hpp | 13 +++++++++++++
backend/src/ir/function.hpp | 10 ++++++++--
backend/src/llvm/llvm_gen_backend.cpp | 26 ++++++++++++++++++++++++++
src/cl_api.c | 11 +++++++++++
src/cl_device_id.c | 12 ++++++++----
src/cl_device_id.h | 1 -
src/cl_kernel.c | 2 ++
src/cl_kernel.h | 2 ++
10 files changed, 90 insertions(+), 8 deletions(-)
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index a889da9..80761fa 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -122,6 +122,7 @@ namespace gbe {
Kernel *kernel = this->compileKernel(unit, name);
kernel->setSamplerSet(pair.second->getSamplerSet());
kernel->setImageSet(pair.second->getImageSet());
+ kernel->setCompileWorkGroupSize(pair.second->getCompileWorkGroupSize());
kernels.insert(std::make_pair(name, kernel));
}
return true;
@@ -250,7 +251,9 @@ namespace gbe {
OUT_UPDATE_SZ(scratchSize);
OUT_UPDATE_SZ(useSLM);
OUT_UPDATE_SZ(slmSize);
-
+ OUT_UPDATE_SZ(compile_wg_sz[0]);
+ OUT_UPDATE_SZ(compile_wg_sz[1]);
+ OUT_UPDATE_SZ(compile_wg_sz[2]);
/* samplers. */
if (samplerSet) {
has_samplerset = 1;
@@ -340,6 +343,9 @@ namespace gbe {
IN_UPDATE_SZ(scratchSize);
IN_UPDATE_SZ(useSLM);
IN_UPDATE_SZ(slmSize);
+ IN_UPDATE_SZ(compile_wg_sz[0]);
+ IN_UPDATE_SZ(compile_wg_sz[1]);
+ IN_UPDATE_SZ(compile_wg_sz[2]);
IN_UPDATE_SZ(has_samplerset);
if (has_samplerset) {
@@ -417,6 +423,7 @@ namespace gbe {
outs << spaces_nl << " scratchSize: " << scratchSize << "\n";
outs << spaces_nl << " useSLM: " << useSLM << "\n";
outs << spaces_nl << " slmSize: " << slmSize << "\n";
+ outs << spaces_nl << " compile_wg_sz: " << compile_wg_sz[0] << compile_wg_sz[1] << compile_wg_sz[2] << "\n";
outs << spaces_nl << " Argument Number is " << argNum << "\n";
for (uint32_t i = 0; i < argNum; i++) {
@@ -772,6 +779,12 @@ namespace gbe {
kernel->getSamplerData(samplers);
}
+ static void kernelGetCompileWorkGroupSize(gbe_kernel gbeKernel, size_t wg_size[3]) {
+ if (gbeKernel == NULL) return;
+ const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
+ kernel->getCompileWorkGroupSize(wg_size);
+ }
+
static size_t kernelGetImageSize(gbe_kernel gbeKernel) {
if (gbeKernel == NULL) return 0;
const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
@@ -826,6 +839,7 @@ GBE_EXPORT_SYMBOL gbe_kernel_use_slm_cb *gbe_kernel_use_slm = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_slm_size_cb *gbe_kernel_get_slm_size = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_sampler_size_cb *gbe_kernel_get_sampler_size = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_sampler_data_cb *gbe_kernel_get_sampler_data = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_compile_wg_size_cb *gbe_kernel_get_compile_wg_size = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_image_size_cb *gbe_kernel_get_image_size = NULL;
GBE_EXPORT_SYMBOL gbe_kernel_get_image_data_cb *gbe_kernel_get_image_data = NULL;
GBE_EXPORT_SYMBOL gbe_set_image_base_index_cb *gbe_set_image_base_index = NULL;
@@ -862,6 +876,7 @@ namespace gbe
gbe_kernel_get_slm_size = gbe::kernelGetSLMSize;
gbe_kernel_get_sampler_size = gbe::kernelGetSamplerSize;
gbe_kernel_get_sampler_data = gbe::kernelGetSamplerData;
+ gbe_kernel_get_compile_wg_size = gbe::kernelGetCompileWorkGroupSize;
gbe_kernel_get_image_size = gbe::kernelGetImageSize;
gbe_kernel_get_image_data = gbe::kernelGetImageData;
gbe_get_image_base_index = gbe::getImageBaseIndex;
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index 2640b65..e574764 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -149,6 +149,10 @@ extern gbe_kernel_get_sampler_size_cb *gbe_kernel_get_sampler_size;
typedef void (gbe_kernel_get_sampler_data_cb)(gbe_kernel gbeKernel, uint32_t *samplers);
extern gbe_kernel_get_sampler_data_cb *gbe_kernel_get_sampler_data;
+/*! Get the work group size required by the kernel's reqd_work_group_size attribute */
+typedef void (gbe_kernel_get_compile_wg_size_cb)(gbe_kernel gbeKernel, size_t wg_sz[3]);
+extern gbe_kernel_get_compile_wg_size_cb *gbe_kernel_get_compile_wg_size;
+
/*! Destroy and deallocate the given program */
typedef void (gbe_program_delete_cb)(gbe_program);
extern gbe_program_delete_cb *gbe_program_delete;
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index dd76210..1aa8696 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -132,6 +132,18 @@ namespace gbe {
void setImageSet(ir::ImageSet * from) {
imageSet = from;
}
+ /*! Set compile work group size */
+ void setCompileWorkGroupSize(const size_t wg_sz[3]) {
+ compile_wg_sz[0] = wg_sz[0];
+ compile_wg_sz[1] = wg_sz[1];
+ compile_wg_sz[2] = wg_sz[2];
+ }
+ /*! Get compile work group size */
+ void getCompileWorkGroupSize (size_t wg_sz[3]) const {
+ wg_sz[0] = compile_wg_sz[0];
+ wg_sz[1] = compile_wg_sz[1];
+ wg_sz[2] = compile_wg_sz[2];
+ }
/*! Get defined image size */
size_t getImageSize(void) const { return imageSet->getDataSize(); }
/*! Get defined image value array */
@@ -181,6 +193,7 @@ namespace gbe {
Context *ctx; //!< Save context after compiler to alloc constant buffer curbe
ir::SamplerSet *samplerSet;//!< Copy from the corresponding function.
ir::ImageSet *imageSet; //!< Copy from the corresponding function.
+ size_t compile_wg_sz[3]; //!< required work group size by kernel attribute.
GBE_CLASS(Kernel); //!< Use custom allocators
};
diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp
index 84d2504..33a87f7 100644
--- a/backend/src/ir/function.hpp
+++ b/backend/src/ir/function.hpp
@@ -310,6 +310,10 @@ namespace ir {
SamplerSet* getSamplerSet(void) const {return samplerSet; }
/*! Get image set in this function */
ImageSet* getImageSet(void) const {return imageSet; }
+ /*! Set required work group size. */
+ void setCompileWorkGroupSize(size_t x, size_t y, size_t z) { compile_wg_sz[0] = x; compile_wg_sz[1] = y; compile_wg_sz[2] = z; }
+ /*! Get required work group size. */
+ const size_t *getCompileWorkGroupSize(void) const {return compile_wg_sz;}
private:
friend class Context; //!< Can freely modify a function
std::string name; //!< Function name
@@ -326,8 +330,10 @@ namespace ir {
uint32_t simdWidth; //!< 8 or 16 if forced, 0 otherwise
bool useSLM; //!< Is SLM required?
uint32_t slmSize; //!< local variable size inside kernel function
- SamplerSet *samplerSet; //!< samplers used in this function.
- ImageSet* imageSet; //!< Image set in this function's arguments..
+ SamplerSet *samplerSet; //!< samplers used in this function.
+ ImageSet* imageSet; //!< Image set in this function's arguments..
+ size_t compile_wg_sz[3]; //!< required work group size specified by
+ // __attribute__((reqd_work_group_size(X, Y, Z))).
GBE_CLASS(Function); //!< Use custom allocator
};
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index d620d44..428b098 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -1056,6 +1056,32 @@ namespace gbe
{
GBE_ASSERTM(F.hasStructRetAttr() == false,
"Returned value for kernel functions is forbidden");
+
+ // Loop over the kernel metadata to set the required work group size.
+ NamedMDNode *clKernelMetaDatas = TheModule->getNamedMetadata("opencl.kernels");
+ size_t reqd_wg_sz[3] = {0, 0, 0};
+ for(uint i = 0; i < clKernelMetaDatas->getNumOperands(); i++)
+ {
+ MDNode *node = clKernelMetaDatas->getOperand(i);
+ if (node->getOperand(0) != &F) continue;
+ while(node->getNumOperands() > 1) {
+ MDNode *attrNode = dyn_cast_or_null<MDNode>(node->getOperand(1));
+ if (attrNode == NULL) break;
+ MDString *attrName = dyn_cast_or_null<MDString>(attrNode->getOperand(0));
+ if (attrName && attrName->getString() == "reqd_work_group_size") {
+ GBE_ASSERT(attrNode->getNumOperands() == 4);
+ ConstantInt *x = dyn_cast<ConstantInt>(attrNode->getOperand(1));
+ ConstantInt *y = dyn_cast<ConstantInt>(attrNode->getOperand(2));
+ ConstantInt *z = dyn_cast<ConstantInt>(attrNode->getOperand(3));
+ GBE_ASSERT(x && y && z);
+ reqd_wg_sz[0] = x->getZExtValue();
+ reqd_wg_sz[1] = y->getZExtValue();
+ reqd_wg_sz[2] = z->getZExtValue();
+ }
+ break;
+ }
+ }
+ ctx.getFunction().setCompileWorkGroupSize(reqd_wg_sz[0], reqd_wg_sz[1], reqd_wg_sz[2]);
// Loop over the arguments and output registers for them
if (!F.arg_empty()) {
uint32_t argID = 0;
diff --git a/src/cl_api.c b/src/cl_api.c
index d15354b..59c47d3 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -2413,6 +2413,7 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
goto error;
}
+
/* XXX No event right now */
//FATAL_IF(num_events_in_wait_list > 0, "Events are not supported");
//FATAL_IF(event_wait_list != NULL, "Events are not supported");
@@ -2428,6 +2429,16 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
for (i = 0; i < work_dim; ++i)
fixed_global_off[i] = global_work_offset[i];
+ if (kernel->compile_wg_sz[0] || kernel->compile_wg_sz[1] || kernel->compile_wg_sz[2]) {
+ if (fixed_local_sz[0] != kernel->compile_wg_sz[0]
+ || fixed_local_sz[1] != kernel->compile_wg_sz[1]
+ || fixed_local_sz[2] != kernel->compile_wg_sz[2])
+ {
+ err = CL_INVALID_WORK_GROUP_SIZE;
+ goto error;
+ }
+ }
+
/* Do device specific checks are enqueue the kernel */
err = cl_command_queue_ND_range(command_queue,
kernel,
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index acc91e9..1124d30 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -42,7 +42,6 @@ static struct _cl_device_id intel_ivb_gt2_device = {
.max_work_group_size = 1024,
.max_clock_frequency = 1000,
.wg_sz = 1024,
- .compile_wg_sz = {0},
#include "cl_gen7_device.h"
};
@@ -54,7 +53,6 @@ static struct _cl_device_id intel_ivb_gt1_device = {
.max_work_group_size = 512,
.max_clock_frequency = 1000,
.wg_sz = 512,
- .compile_wg_sz = {0},
#include "cl_gen7_device.h"
};
@@ -67,7 +65,6 @@ static struct _cl_device_id intel_hsw_device = {
.max_work_group_size = 512,
.max_clock_frequency = 1000,
.wg_sz = 512,
- .compile_wg_sz = {0},
#include "cl_gen75_device.h"
};
@@ -290,7 +287,6 @@ cl_get_kernel_workgroup_info(cl_kernel kernel,
switch (param_name) {
DECL_FIELD(WORK_GROUP_SIZE, wg_sz)
- DECL_FIELD(COMPILE_WORK_GROUP_SIZE, compile_wg_sz)
DECL_FIELD(PREFERRED_WORK_GROUP_SIZE_MULTIPLE, preferred_wg_sz_mul)
case CL_KERNEL_LOCAL_MEM_SIZE:
if (param_value_size < sizeof(cl_ulong))
@@ -299,6 +295,14 @@ cl_get_kernel_workgroup_info(cl_kernel kernel,
*param_value_size_ret = sizeof(cl_ulong);
*(cl_ulong*)param_value = gbe_kernel_get_slm_size(kernel->opaque) + kernel->local_mem_sz;
return CL_SUCCESS;
+ case CL_KERNEL_COMPILE_WORK_GROUP_SIZE:
+ if (param_value_size < sizeof(kernel->compile_wg_sz))
+ return CL_INVALID_VALUE;
+ if (param_value_size_ret != NULL)
+ *param_value_size_ret = sizeof(kernel->compile_wg_sz);
+ memcpy(param_value, kernel->compile_wg_sz, sizeof(kernel->compile_wg_sz));
+ return CL_SUCCESS;
+
default: return CL_INVALID_VALUE;
};
}
diff --git a/src/cl_device_id.h b/src/cl_device_id.h
index 56ffd33..4ece26c 100644
--- a/src/cl_device_id.h
+++ b/src/cl_device_id.h
@@ -95,7 +95,6 @@ struct _cl_device_id {
size_t built_in_kernels_sz;
/* Kernel specific info that we're assigning statically */
size_t wg_sz;
- size_t compile_wg_sz[3];
size_t preferred_wg_sz_mul;
};
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index 4ba1c11..9a2a737 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -230,6 +230,7 @@ cl_kernel_setup(cl_kernel k, gbe_kernel opaque)
assert(k->sampler_sz <= GEN_MAX_SAMPLERS);
if (k->sampler_sz > 0)
gbe_kernel_get_sampler_data(k->opaque, k->samplers);
+ gbe_kernel_get_compile_wg_size(k->opaque, k->compile_wg_sz);
/* Get image data & size */
k->image_sz = gbe_kernel_get_image_size(k->opaque);
assert(k->sampler_sz <= GEN_MAX_SURFACES);
@@ -263,6 +264,7 @@ cl_kernel_dup(cl_kernel from)
to->curbe_sz = from->curbe_sz;
to->sampler_sz = from->sampler_sz;
to->image_sz = from->image_sz;
+ memcpy(to->compile_wg_sz, from->compile_wg_sz, sizeof(from->compile_wg_sz));
if (to->sampler_sz)
memcpy(to->samplers, from->samplers, to->sampler_sz * sizeof(uint32_t));
if (to->image_sz) {
diff --git a/src/cl_kernel.h b/src/cl_kernel.h
index acb7206..608ed8e 100644
--- a/src/cl_kernel.h
+++ b/src/cl_kernel.h
@@ -57,6 +57,8 @@ struct _cl_kernel {
struct ImageInfo *images; /* images defined in kernel args */
size_t image_sz; /* image count in kernel args */
cl_ulong local_mem_sz; /* local memory size specified in kernel args. */
+ size_t compile_wg_sz[3]; /* Required workgroup size by the
+ __attribute__((reqd_work_group_size(X, Y, Z))) qualifier. */
cl_argument *args; /* To track argument setting */
uint32_t arg_n:31; /* Number of arguments */
uint32_t ref_its_program:1; /* True only for the user kernel (created by clCreateKernel) */
--
1.7.9.5
More information about the Beignet
mailing list