[Beignet] [PATCH] GBE: Add two builtin functions get_work_dim / get_global_offset.

Zhigang Gong zhigang.gong at linux.intel.com
Thu May 30 21:56:57 PDT 2013


Signed-off-by: Zhigang Gong <zhigang.gong at linux.intel.com>
---
 backend/src/backend/context.cpp            | 2 ++
 backend/src/backend/gen_reg_allocation.cpp | 1 +
 backend/src/backend/program.h              | 1 +
 backend/src/ir/profile.cpp                 | 1 +
 backend/src/ir/profile.hpp                 | 3 ++-
 backend/src/llvm/llvm_gen_backend.cpp      | 2 ++
 backend/src/llvm/llvm_gen_ocl_function.hxx | 1 +
 backend/src/ocl_stdlib.h                   | 9 +++++++--
 src/cl_api.c                               | 1 +
 src/cl_command_queue.c                     | 5 +++--
 src/cl_command_queue.h                     | 1 +
 src/cl_command_queue_gen7.c                | 5 ++++-
 12 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index af1f579..48160de 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -430,6 +430,7 @@ namespace gbe
         INSERT_REG(goffset0, GLOBAL_OFFSET_X, 1)
         INSERT_REG(goffset1, GLOBAL_OFFSET_Y, 1)
         INSERT_REG(goffset2, GLOBAL_OFFSET_Z, 1)
+        INSERT_REG(workdim, WORK_DIM, 1)
         INSERT_REG(numgroup0, GROUP_NUM_X, 1)
         INSERT_REG(numgroup1, GROUP_NUM_Y, 1)
         INSERT_REG(numgroup2, GROUP_NUM_Z, 1)
@@ -621,6 +622,7 @@ namespace gbe
         reg == ir::ocl::goffset0  ||
         reg == ir::ocl::goffset1  ||
         reg == ir::ocl::goffset2  ||
+        reg == ir::ocl::workdim   ||
         reg == ir::ocl::constoffst)
       return true;
     return false;
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index 8c9f358..469be12 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -518,6 +518,7 @@ namespace gbe
     allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_X, ocl::goffset0);
     allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_Y, ocl::goffset1);
     allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_Z, ocl::goffset2);
+    allocatePayloadReg(GBE_CURBE_WORK_DIM, ocl::workdim);
     allocatePayloadReg(GBE_CURBE_GROUP_NUM_X, ocl::numgroup0);
     allocatePayloadReg(GBE_CURBE_GROUP_NUM_Y, ocl::numgroup1);
     allocatePayloadReg(GBE_CURBE_GROUP_NUM_Z, ocl::numgroup2);
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index f178f8b..f36bfbf 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -69,6 +69,7 @@ enum gbe_curbe_type {
   GBE_CURBE_GROUP_NUM_X,
   GBE_CURBE_GROUP_NUM_Y,
   GBE_CURBE_GROUP_NUM_Z,
+  GBE_CURBE_WORK_DIM,
   GBE_CURBE_GLOBAL_CONSTANT_OFFSET,
   GBE_CURBE_GLOBAL_CONSTANT_DATA,
   GBE_CURBE_IMAGE_INFO,
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index c1dc650..99cd06c 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -76,6 +76,7 @@ namespace ir {
       DECL_NEW_REG(FAMILY_DWORD, barrierid);
       DECL_NEW_REG(FAMILY_DWORD, threadn);
       DECL_NEW_REG(FAMILY_DWORD, constoffst);
+      DECL_NEW_REG(FAMILY_DWORD, workdim);
     }
 #undef DECL_NEW_REG
 
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index 32dd149..4b0ef5e 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -64,7 +64,8 @@ namespace ir {
     static const Register barrierid = Register(20);// barrierid
     static const Register threadn = Register(21);  // number of threads
     static const Register constoffst = Register(22); // offset of global constant array's curbe
-    static const uint32_t regNum = 23;             // number of special registers
+    static const Register workdim = Register(23);  // work dimention.
+    static const uint32_t regNum = 24;             // number of special registers
     extern const char *specialRegMean[];           // special register name.
   } /* namespace ocl */
 
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index db7d714..ad295a8 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -1645,6 +1645,8 @@ namespace gbe
         regTranslator.newScalarProxy(ir::ocl::goffset1, dst); break;
       case GEN_OCL_GET_GLOBAL_OFFSET2:
         regTranslator.newScalarProxy(ir::ocl::goffset2, dst); break;
+      case GEN_OCL_GET_WORK_DIM:
+        regTranslator.newScalarProxy(ir::ocl::workdim, dst); break;
       case GEN_OCL_COS:
       case GEN_OCL_SIN:
       case GEN_OCL_SQR:
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 0524744..6cd7298 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -16,6 +16,7 @@ DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_SIZE2, __gen_ocl_get_global_size2)
 DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_OFFSET0, __gen_ocl_get_global_offset0)
 DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_OFFSET1, __gen_ocl_get_global_offset1)
 DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_OFFSET2, __gen_ocl_get_global_offset2)
+DECL_LLVM_GEN_FUNCTION(GET_WORK_DIM, __gen_ocl_get_work_dim)
 
 // Math function
 DECL_LLVM_GEN_FUNCTION(ABS, __gen_ocl_fabs)
diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h
index 613b844..144e78c 100644
--- a/backend/src/ocl_stdlib.h
+++ b/backend/src/ocl_stdlib.h
@@ -346,8 +346,11 @@ DEC(16);
 /////////////////////////////////////////////////////////////////////////////
 // Work Items functions (see 6.11.1 of OCL 1.1 spec)
 /////////////////////////////////////////////////////////////////////////////
-// TODO get_global_offset
-// TODO get_work_dim
+
+PURE CONST uint __gen_ocl_get_work_dim(void);
+INLINE uint get_work_dim(void) {
+  return __gen_ocl_get_work_dim(); 
+}
 
 #define DECL_INTERNAL_WORK_ITEM_FN(NAME) \
 PURE CONST unsigned int __gen_ocl_##NAME##0(void); \
@@ -357,6 +360,7 @@ DECL_INTERNAL_WORK_ITEM_FN(get_group_id)
 DECL_INTERNAL_WORK_ITEM_FN(get_local_id)
 DECL_INTERNAL_WORK_ITEM_FN(get_local_size)
 DECL_INTERNAL_WORK_ITEM_FN(get_global_size)
+DECL_INTERNAL_WORK_ITEM_FN(get_global_offset)
 DECL_INTERNAL_WORK_ITEM_FN(get_num_groups)
 #undef DECL_INTERNAL_WORK_ITEM_FN
 
@@ -371,6 +375,7 @@ DECL_PUBLIC_WORK_ITEM_FN(get_group_id)
 DECL_PUBLIC_WORK_ITEM_FN(get_local_id)
 DECL_PUBLIC_WORK_ITEM_FN(get_local_size)
 DECL_PUBLIC_WORK_ITEM_FN(get_global_size)
+DECL_PUBLIC_WORK_ITEM_FN(get_global_offset)
 DECL_PUBLIC_WORK_ITEM_FN(get_num_groups)
 #undef DECL_PUBLIC_WORK_ITEM_FN
 
diff --git a/src/cl_api.c b/src/cl_api.c
index 9c5943b..5ef95b7 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -1453,6 +1453,7 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
   /* Do device specific checks are enqueue the kernel */
   err = cl_command_queue_ND_range(command_queue,
                                   kernel,
+                                  work_dim,
                                   fixed_global_off,
                                   fixed_global_sz,
                                   fixed_local_sz);
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 1a37c78..a3987d8 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -356,7 +356,7 @@ error:
 }
 #endif
 
-extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, const size_t *, const size_t *, const size_t *);
+extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, uint32_t, const size_t *, const size_t *, const size_t *);
 
 static cl_int
 cl_kernel_check_args(cl_kernel k)
@@ -371,6 +371,7 @@ cl_kernel_check_args(cl_kernel k)
 LOCAL cl_int
 cl_command_queue_ND_range(cl_command_queue queue,
                           cl_kernel k,
+                          const uint32_t work_dim,
                           const size_t *global_wk_off,
                           const size_t *global_wk_sz,
                           const size_t *local_wk_sz)
@@ -394,7 +395,7 @@ cl_command_queue_ND_range(cl_command_queue queue,
 #endif /* USE_FULSIM */
 
   if (ver == 7 || ver == 75)
-    TRY (cl_command_queue_ND_range_gen7, queue, k, global_wk_off, global_wk_sz, local_wk_sz);
+    TRY (cl_command_queue_ND_range_gen7, queue, k, work_dim, global_wk_off, global_wk_sz, local_wk_sz);
   else
     FATAL ("Unknown Gen Device");
 
diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
index f0c00f4..5a792a2 100644
--- a/src/cl_command_queue.h
+++ b/src/cl_command_queue.h
@@ -54,6 +54,7 @@ extern void cl_command_queue_add_ref(cl_command_queue);
 /* Map ND range kernel from OCL API */
 extern cl_int cl_command_queue_ND_range(cl_command_queue queue,
                                         cl_kernel ker,
+                                        const uint32_t work_dim,
                                         const size_t *global_work_offset,
                                         const size_t *global_work_size,
                                         const size_t *local_work_size);
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 770af4a..ea9b583 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -98,6 +98,7 @@ error:
 /* Will return the total amount of slm used */
 static int32_t
 cl_curbe_fill(cl_kernel ker,
+              const uint32_t work_dim,
               const size_t *global_wk_off,
               const size_t *global_wk_sz,
               const size_t *local_wk_sz,
@@ -120,6 +121,7 @@ cl_curbe_fill(cl_kernel ker,
   UPLOAD(GBE_CURBE_GROUP_NUM_Y, global_wk_sz[1]/local_wk_sz[1]);
   UPLOAD(GBE_CURBE_GROUP_NUM_Z, global_wk_sz[2]/local_wk_sz[2]);
   UPLOAD(GBE_CURBE_THREAD_NUM, thread_n);
+  UPLOAD(GBE_CURBE_WORK_DIM, work_dim); 
   UPLOAD(GBE_CURBE_GLOBAL_CONSTANT_OFFSET, gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_GLOBAL_CONSTANT_DATA, 0) + 32);
 #undef UPLOAD
 
@@ -185,6 +187,7 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
 LOCAL cl_int
 cl_command_queue_ND_range_gen7(cl_command_queue queue,
                                cl_kernel ker,
+                               const uint32_t work_dim,
                                const size_t *global_wk_off,
                                const size_t *global_wk_sz,
                                const size_t *local_wk_sz)
@@ -214,7 +217,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
 
   /* Curbe step 1: fill the constant buffer data shared by all threads */
   if (ker->curbe)
-    kernel.slm_sz = cl_curbe_fill(ker, global_wk_off, global_wk_sz, local_wk_sz, thread_n);
+    kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz, local_wk_sz, thread_n);
 
   /* Setup the kernel */
   cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32);
-- 
1.7.11.7



More information about the Beignet mailing list