[Beignet] [PATCH v2] GBE: Add support for kernel attribute reqd_work_group_size.

Zhigang Gong zhigang.gong at intel.com
Mon Nov 11 23:14:47 PST 2013


When a kernel has __attribute__((reqd_work_group_size(X, Y, Z))) qualifier,
the kernel will only accept that group size.

v2: add binary load/store support.

Signed-off-by: Zhigang Gong <zhigang.gong at intel.com>
---
 backend/src/backend/program.cpp       |   17 ++++++++++++++++-
 backend/src/backend/program.h         |    4 ++++
 backend/src/backend/program.hpp       |   13 +++++++++++++
 backend/src/ir/function.hpp           |   10 ++++++++--
 backend/src/llvm/llvm_gen_backend.cpp |   26 ++++++++++++++++++++++++++
 src/cl_api.c                          |   11 +++++++++++
 src/cl_device_id.c                    |   12 ++++++++----
 src/cl_device_id.h                    |    1 -
 src/cl_kernel.c                       |    2 ++
 src/cl_kernel.h                       |    2 ++
 10 files changed, 90 insertions(+), 8 deletions(-)

diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index a889da9..80761fa 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -122,6 +122,7 @@ namespace gbe {
       Kernel *kernel = this->compileKernel(unit, name);
       kernel->setSamplerSet(pair.second->getSamplerSet());
       kernel->setImageSet(pair.second->getImageSet());
+      kernel->setCompileWorkGroupSize(pair.second->getCompileWorkGroupSize());
       kernels.insert(std::make_pair(name, kernel));
     }
     return true;
@@ -250,7 +251,9 @@ namespace gbe {
     OUT_UPDATE_SZ(scratchSize);
     OUT_UPDATE_SZ(useSLM);
     OUT_UPDATE_SZ(slmSize);
-
+    OUT_UPDATE_SZ(compile_wg_sz[0]);
+    OUT_UPDATE_SZ(compile_wg_sz[1]);
+    OUT_UPDATE_SZ(compile_wg_sz[2]);
     /* samplers. */
     if (samplerSet) {
       has_samplerset = 1;
@@ -340,6 +343,9 @@ namespace gbe {
     IN_UPDATE_SZ(scratchSize);
     IN_UPDATE_SZ(useSLM);
     IN_UPDATE_SZ(slmSize);
+    IN_UPDATE_SZ(compile_wg_sz[0]);
+    IN_UPDATE_SZ(compile_wg_sz[1]);
+    IN_UPDATE_SZ(compile_wg_sz[2]);
 
     IN_UPDATE_SZ(has_samplerset);
     if (has_samplerset) {
@@ -417,6 +423,7 @@ namespace gbe {
     outs << spaces_nl << "  scratchSize: " << scratchSize << "\n";
     outs << spaces_nl << "  useSLM: " << useSLM << "\n";
     outs << spaces_nl << "  slmSize: " << slmSize << "\n";
+    outs << spaces_nl << "  compile_wg_sz: " << compile_wg_sz[0] << compile_wg_sz[1] << compile_wg_sz[2] << "\n";
 
     outs << spaces_nl << "  Argument Number is " << argNum << "\n";
     for (uint32_t i = 0; i < argNum; i++) {
@@ -772,6 +779,12 @@ namespace gbe {
     kernel->getSamplerData(samplers);
   }
 
+  static void kernelGetCompileWorkGroupSize(gbe_kernel gbeKernel, size_t wg_size[3]) {
+    if (gbeKernel == NULL) return;
+    const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
+    kernel->getCompileWorkGroupSize(wg_size);
+  }
+
   static size_t kernelGetImageSize(gbe_kernel gbeKernel) {
     if (gbeKernel == NULL) return 0;
     const gbe::Kernel *kernel = (const gbe::Kernel*) gbeKernel;
@@ -826,6 +839,7 @@ GBE_EXPORT_SYMBOL gbe_kernel_use_slm_cb *gbe_kernel_use_slm = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_slm_size_cb *gbe_kernel_get_slm_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_sampler_size_cb *gbe_kernel_get_sampler_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_sampler_data_cb *gbe_kernel_get_sampler_data = NULL;
+GBE_EXPORT_SYMBOL gbe_kernel_get_compile_wg_size_cb *gbe_kernel_get_compile_wg_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_image_size_cb *gbe_kernel_get_image_size = NULL;
 GBE_EXPORT_SYMBOL gbe_kernel_get_image_data_cb *gbe_kernel_get_image_data = NULL;
 GBE_EXPORT_SYMBOL gbe_set_image_base_index_cb *gbe_set_image_base_index = NULL;
@@ -862,6 +876,7 @@ namespace gbe
       gbe_kernel_get_slm_size = gbe::kernelGetSLMSize;
       gbe_kernel_get_sampler_size = gbe::kernelGetSamplerSize;
       gbe_kernel_get_sampler_data = gbe::kernelGetSamplerData;
+      gbe_kernel_get_compile_wg_size = gbe::kernelGetCompileWorkGroupSize;
       gbe_kernel_get_image_size = gbe::kernelGetImageSize;
       gbe_kernel_get_image_data = gbe::kernelGetImageData;
       gbe_get_image_base_index = gbe::getImageBaseIndex;
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index 2640b65..e574764 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -149,6 +149,10 @@ extern gbe_kernel_get_sampler_size_cb *gbe_kernel_get_sampler_size;
 typedef void (gbe_kernel_get_sampler_data_cb)(gbe_kernel gbeKernel, uint32_t *samplers);
 extern gbe_kernel_get_sampler_data_cb *gbe_kernel_get_sampler_data;
 
+/*! Get the content of defined samplers */
+typedef void (gbe_kernel_get_compile_wg_size_cb)(gbe_kernel gbeKernel, size_t wg_sz[3]);
+extern gbe_kernel_get_compile_wg_size_cb *gbe_kernel_get_compile_wg_size;
+
 /*! Destroy and deallocate the given program */
 typedef void (gbe_program_delete_cb)(gbe_program);
 extern gbe_program_delete_cb *gbe_program_delete;
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index dd76210..1aa8696 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -132,6 +132,18 @@ namespace gbe {
     void setImageSet(ir::ImageSet * from) {
       imageSet = from;
     }
+    /*! Set compile work group size */
+    void setCompileWorkGroupSize(const size_t wg_sz[3]) {
+       compile_wg_sz[0] = wg_sz[0];
+       compile_wg_sz[1] = wg_sz[1];
+       compile_wg_sz[2] = wg_sz[2];
+    }
+    /*! Get compile work group size */
+    void getCompileWorkGroupSize (size_t wg_sz[3]) const {
+       wg_sz[0] = compile_wg_sz[0];
+       wg_sz[1] = compile_wg_sz[1];
+       wg_sz[2] = compile_wg_sz[2];
+    }
     /*! Get defined image size */
     size_t getImageSize(void) const { return imageSet->getDataSize(); }
     /*! Get defined image value array */
@@ -181,6 +193,7 @@ namespace gbe {
     Context *ctx;              //!< Save context after compiler to alloc constant buffer curbe
     ir::SamplerSet *samplerSet;//!< Copy from the corresponding function.
     ir::ImageSet *imageSet;    //!< Copy from the corresponding function.
+    size_t compile_wg_sz[3];   //!< required work group size by kernel attribute.
     GBE_CLASS(Kernel);         //!< Use custom allocators
   };
 
diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp
index 84d2504..33a87f7 100644
--- a/backend/src/ir/function.hpp
+++ b/backend/src/ir/function.hpp
@@ -310,6 +310,10 @@ namespace ir {
     SamplerSet* getSamplerSet(void) const {return samplerSet; }
     /*! Get image set in this function */
     ImageSet* getImageSet(void) const {return imageSet; }
+    /*! Set required work group size. */
+    void setCompileWorkGroupSize(size_t x, size_t y, size_t z) { compile_wg_sz[0] = x; compile_wg_sz[1] = y; compile_wg_sz[2] = z; }
+    /*! Get required work group size. */
+    const size_t *getCompileWorkGroupSize(void) const {return compile_wg_sz;}
   private:
     friend class Context;           //!< Can freely modify a function
     std::string name;               //!< Function name
@@ -326,8 +330,10 @@ namespace ir {
     uint32_t simdWidth;             //!< 8 or 16 if forced, 0 otherwise
     bool useSLM;                    //!< Is SLM required?
     uint32_t slmSize;               //!< local variable size inside kernel function
-    SamplerSet *samplerSet;          //!< samplers used in this function.
-    ImageSet* imageSet;              //!< Image set in this function's arguments..
+    SamplerSet *samplerSet;         //!< samplers used in this function.
+    ImageSet* imageSet;             //!< Image set in this function's arguments..
+    size_t compile_wg_sz[3];        //!< required work group size specified by
+                                    //   __attribute__((reqd_work_group_size(X, Y, Z))).
     GBE_CLASS(Function);            //!< Use custom allocator
   };
 
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index d620d44..428b098 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -1056,6 +1056,32 @@ namespace gbe
   {
     GBE_ASSERTM(F.hasStructRetAttr() == false,
                 "Returned value for kernel functions is forbidden");
+
+    // Loop over the kernel metadatas to set the required work group size.
+    NamedMDNode *clKernelMetaDatas = TheModule->getNamedMetadata("opencl.kernels");
+    size_t reqd_wg_sz[3] = {0, 0, 0};
+    for(uint i = 0; i < clKernelMetaDatas->getNumOperands(); i++)
+    {
+      MDNode *node = clKernelMetaDatas->getOperand(i);
+      if (node->getOperand(0) != &F) continue;
+      while(node->getNumOperands() > 1) {
+        MDNode *attrNode = dyn_cast_or_null<MDNode>(node->getOperand(1));
+        if (attrNode == NULL) break;
+        MDString *attrName = dyn_cast_or_null<MDString>(attrNode->getOperand(0));
+        if (attrName && attrName->getString() == "reqd_work_group_size") {
+          GBE_ASSERT(attrNode->getNumOperands() == 4);
+          ConstantInt *x = dyn_cast<ConstantInt>(attrNode->getOperand(1));
+          ConstantInt *y = dyn_cast<ConstantInt>(attrNode->getOperand(2));
+          ConstantInt *z = dyn_cast<ConstantInt>(attrNode->getOperand(3));
+          GBE_ASSERT(x && y && z);
+          reqd_wg_sz[0] = x->getZExtValue();
+          reqd_wg_sz[1] = y->getZExtValue();
+          reqd_wg_sz[2] = z->getZExtValue();
+        }
+        break;
+      }
+    }
+    ctx.getFunction().setCompileWorkGroupSize(reqd_wg_sz[0], reqd_wg_sz[1], reqd_wg_sz[2]);
     // Loop over the arguments and output registers for them
     if (!F.arg_empty()) {
       uint32_t argID = 0;
diff --git a/src/cl_api.c b/src/cl_api.c
index d15354b..59c47d3 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -2413,6 +2413,7 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
     goto error;
   }
 
+
   /* XXX No event right now */
   //FATAL_IF(num_events_in_wait_list > 0, "Events are not supported");
   //FATAL_IF(event_wait_list != NULL, "Events are not supported");
@@ -2428,6 +2429,16 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
     for (i = 0; i < work_dim; ++i)
       fixed_global_off[i] = global_work_offset[i];
 
+  if (kernel->compile_wg_sz[0] || kernel->compile_wg_sz[1] || kernel->compile_wg_sz[2]) {
+    if (fixed_local_sz[0] != kernel->compile_wg_sz[0]
+        || fixed_local_sz[1] != kernel->compile_wg_sz[1]
+        || fixed_local_sz[2] != kernel->compile_wg_sz[2])
+    {
+        err = CL_INVALID_WORK_GROUP_SIZE;
+        goto error;
+    }
+  }
+
   /* Do device specific checks are enqueue the kernel */
   err = cl_command_queue_ND_range(command_queue,
                                   kernel,
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index acc91e9..1124d30 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -42,7 +42,6 @@ static struct _cl_device_id intel_ivb_gt2_device = {
   .max_work_group_size = 1024,
   .max_clock_frequency = 1000,
   .wg_sz = 1024,
-  .compile_wg_sz = {0},	
 #include "cl_gen7_device.h"
 };
 
@@ -54,7 +53,6 @@ static struct _cl_device_id intel_ivb_gt1_device = {
   .max_work_group_size = 512,
   .max_clock_frequency = 1000,
   .wg_sz = 512,
-  .compile_wg_sz = {0},	
 #include "cl_gen7_device.h"
 };
 
@@ -67,7 +65,6 @@ static struct _cl_device_id intel_hsw_device = {
   .max_work_group_size = 512,
   .max_clock_frequency = 1000,
   .wg_sz = 512,
-  .compile_wg_sz = {0},	
 #include "cl_gen75_device.h"
 };
 
@@ -290,7 +287,6 @@ cl_get_kernel_workgroup_info(cl_kernel kernel,
 
   switch (param_name) {
     DECL_FIELD(WORK_GROUP_SIZE, wg_sz)
-    DECL_FIELD(COMPILE_WORK_GROUP_SIZE, compile_wg_sz)
     DECL_FIELD(PREFERRED_WORK_GROUP_SIZE_MULTIPLE, preferred_wg_sz_mul)
     case CL_KERNEL_LOCAL_MEM_SIZE:
       if (param_value_size < sizeof(cl_ulong))
@@ -299,6 +295,14 @@ cl_get_kernel_workgroup_info(cl_kernel kernel,
         *param_value_size_ret = sizeof(cl_ulong);
       *(cl_ulong*)param_value = gbe_kernel_get_slm_size(kernel->opaque) + kernel->local_mem_sz;
       return CL_SUCCESS;
+    case CL_KERNEL_COMPILE_WORK_GROUP_SIZE:
+      if (param_value_size < sizeof(kernel->compile_wg_sz))
+        return CL_INVALID_VALUE;
+      if (param_value_size_ret != NULL)
+        *param_value_size_ret = sizeof(kernel->compile_wg_sz);
+      memcpy(param_value, kernel->compile_wg_sz, sizeof(kernel->compile_wg_sz));
+      return CL_SUCCESS;
+
     default: return CL_INVALID_VALUE;
   };
 }
diff --git a/src/cl_device_id.h b/src/cl_device_id.h
index 56ffd33..4ece26c 100644
--- a/src/cl_device_id.h
+++ b/src/cl_device_id.h
@@ -95,7 +95,6 @@ struct _cl_device_id {
   size_t built_in_kernels_sz;
   /* Kernel specific info that we're assigning statically */
   size_t wg_sz;
-  size_t compile_wg_sz[3];
   size_t preferred_wg_sz_mul;
 };
 
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index 4ba1c11..9a2a737 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -230,6 +230,7 @@ cl_kernel_setup(cl_kernel k, gbe_kernel opaque)
   assert(k->sampler_sz <= GEN_MAX_SAMPLERS);
   if (k->sampler_sz > 0)
     gbe_kernel_get_sampler_data(k->opaque, k->samplers);
+  gbe_kernel_get_compile_wg_size(k->opaque, k->compile_wg_sz);
   /* Get image data & size */
   k->image_sz = gbe_kernel_get_image_size(k->opaque);
   assert(k->sampler_sz <= GEN_MAX_SURFACES);
@@ -263,6 +264,7 @@ cl_kernel_dup(cl_kernel from)
   to->curbe_sz = from->curbe_sz;
   to->sampler_sz = from->sampler_sz;
   to->image_sz = from->image_sz;
+  memcpy(to->compile_wg_sz, from->compile_wg_sz, sizeof(from->compile_wg_sz));
   if (to->sampler_sz)
     memcpy(to->samplers, from->samplers, to->sampler_sz * sizeof(uint32_t));
   if (to->image_sz) {
diff --git a/src/cl_kernel.h b/src/cl_kernel.h
index acb7206..608ed8e 100644
--- a/src/cl_kernel.h
+++ b/src/cl_kernel.h
@@ -57,6 +57,8 @@ struct _cl_kernel {
   struct ImageInfo *images;   /* images defined in kernel args */
   size_t image_sz;            /* image count in kernel args */
   cl_ulong local_mem_sz;      /* local memory size specified in kernel args. */
+  size_t compile_wg_sz[3];    /* Required workgroup size by __attribute__((reqd_work_gro
+                                 up_size(X, Y, Z))) qualifier.*/
   cl_argument *args;          /* To track argument setting */
   uint32_t arg_n:31;          /* Number of arguments */
   uint32_t ref_its_program:1; /* True only for the user kernel (created by clCreateKernel) */
-- 
1.7.9.5



More information about the Beignet mailing list