[Beignet] [PATCH 1/2] Backend: Refine get_enqueued_local_size and get_local_size

Xiuli Pan xiuli.pan at intel.com
Tue Mar 15 23:52:45 UTC 2016


From: Pan Xiuli <xiuli.pan at intel.com>

Use curbe register for these two size.

Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
---
 backend/src/backend/gen_insn_selection.cpp |  7 +++--
 backend/src/backend/program.h              |  3 ++
 backend/src/ir/profile.cpp                 |  4 +++
 backend/src/ir/profile.hpp                 | 47 ++++++++++++++++--------------
 backend/src/libocl/src/ocl_workitem.cl     | 17 ++++-------
 backend/src/llvm/llvm_gen_backend.cpp      |  6 ++++
 backend/src/llvm/llvm_gen_ocl_function.hxx |  3 ++
 src/cl_command_queue_gen7.c                |  6 +++-
 8 files changed, 57 insertions(+), 36 deletions(-)

diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 90e15c3..77614b6 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -3423,8 +3423,11 @@ namespace gbe
           reg == ir::ocl::lid1 ||
           reg == ir::ocl::lid2 ||
           reg == ir::ocl::lsize0 ||
-          reg == ir::ocl::lsize1||
-          reg == ir::ocl::lsize2)
+          reg == ir::ocl::lsize1 ||
+          reg == ir::ocl::lsize2 ||
+          reg == ir::ocl::enqlsize0 ||
+          reg == ir::ocl::enqlsize1 ||
+          reg == ir::ocl::enqlsize2)
         return true;
       else
         return false;
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index 4dd3ae3..a690e3d 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -83,6 +83,9 @@ enum gbe_curbe_type {
   GBE_CURBE_LOCAL_SIZE_X,
   GBE_CURBE_LOCAL_SIZE_Y,
   GBE_CURBE_LOCAL_SIZE_Z,
+  GBE_CURBE_ENQUEUED_LOCAL_SIZE_X,
+  GBE_CURBE_ENQUEUED_LOCAL_SIZE_Y,
+  GBE_CURBE_ENQUEUED_LOCAL_SIZE_Z,
   GBE_CURBE_GLOBAL_SIZE_X,
   GBE_CURBE_GLOBAL_SIZE_Y,
   GBE_CURBE_GLOBAL_SIZE_Z,
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index ce5e8e7..0907d76 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -35,6 +35,7 @@ namespace ir {
         "group_id_0", "group_id_1", "group_id_2",
         "num_groups_0", "num_groups_1", "num_groups_2",
         "local_size_0", "local_size_1", "local_size_2",
+        "enqueued_local_size_0", "enqueued_local_size_1", "enqueued_local_size_2",
         "global_size_0", "global_size_1", "global_size_2",
         "global_offset_0", "global_offset_1", "global_offset_2",
         "stack_pointer", "stack_buffer",
@@ -71,6 +72,9 @@ namespace ir {
       DECL_NEW_REG(FAMILY_DWORD, lsize0, 1, GBE_CURBE_LOCAL_SIZE_X);
       DECL_NEW_REG(FAMILY_DWORD, lsize1, 1, GBE_CURBE_LOCAL_SIZE_Y);
       DECL_NEW_REG(FAMILY_DWORD, lsize2, 1, GBE_CURBE_LOCAL_SIZE_Z);
+      DECL_NEW_REG(FAMILY_DWORD, enqlsize0, 1, GBE_CURBE_ENQUEUED_LOCAL_SIZE_X);
+      DECL_NEW_REG(FAMILY_DWORD, enqlsize1, 1, GBE_CURBE_ENQUEUED_LOCAL_SIZE_Y);
+      DECL_NEW_REG(FAMILY_DWORD, enqlsize2, 1, GBE_CURBE_ENQUEUED_LOCAL_SIZE_Z);
       DECL_NEW_REG(FAMILY_DWORD, gsize0, 1, GBE_CURBE_GLOBAL_SIZE_X);
       DECL_NEW_REG(FAMILY_DWORD, gsize1, 1, GBE_CURBE_GLOBAL_SIZE_Y);
       DECL_NEW_REG(FAMILY_DWORD, gsize2, 1, GBE_CURBE_GLOBAL_SIZE_Z);
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index 35b49e3..b093adf 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -53,28 +53,31 @@ namespace ir {
     static const Register lsize0 = Register(9);    // get_local_size(0)
     static const Register lsize1 = Register(10);   // get_local_size(1)
     static const Register lsize2 = Register(11);   // get_local_size(2)
-    static const Register gsize0 = Register(12);   // get_global_size(0)
-    static const Register gsize1 = Register(13);   // get_global_size(1)
-    static const Register gsize2 = Register(14);   // get_global_size(2)
-    static const Register goffset0 = Register(15); // get_global_offset(0)
-    static const Register goffset1 = Register(16); // get_global_offset(1)
-    static const Register goffset2 = Register(17); // get_global_offset(2)
-    static const Register stackptr = Register(18); // stack pointer
-    static const Register stackbuffer = Register(19); // stack buffer base address.
-    static const Register blockip = Register(20);  // blockip
-    static const Register barrierid = Register(21);// barrierid
-    static const Register threadn = Register(22);  // number of threads
-    static const Register workdim = Register(23);  // work dimention.
-    static const Register zero = Register(24);     //  scalar register holds zero.
-    static const Register one = Register(25);     //  scalar register holds one. 
-    static const Register retVal = Register(26);   // helper register to do data flow analysis.
-    static const Register printfbptr = Register(27); // printf buffer address .
-    static const Register printfiptr = Register(28); // printf index buffer address.
-    static const Register dwblockip = Register(29);  // blockip
-    static const Register threadid = Register(30); // the thread id of this thread.
-    static const Register constant_addrspace = Register(31);  // starting address of program-scope constant
-    static const Register stacksize = Register(32); // stack buffer total size
-    static const uint32_t regNum = 33;             // number of special registers
+    static const Register enqlsize0 = Register(12);    // get_local_size(0)
+    static const Register enqlsize1 = Register(13);   // get_local_size(1)
+    static const Register enqlsize2 = Register(14);   // get_local_size(2)
+    static const Register gsize0 = Register(15);   // get_global_size(0)
+    static const Register gsize1 = Register(16);   // get_global_size(1)
+    static const Register gsize2 = Register(17);   // get_global_size(2)
+    static const Register goffset0 = Register(18); // get_global_offset(0)
+    static const Register goffset1 = Register(19); // get_global_offset(1)
+    static const Register goffset2 = Register(20); // get_global_offset(2)
+    static const Register stackptr = Register(21); // stack pointer
+    static const Register stackbuffer = Register(22); // stack buffer base address.
+    static const Register blockip = Register(23);  // blockip
+    static const Register barrierid = Register(24);// barrierid
+    static const Register threadn = Register(25);  // number of threads
+    static const Register workdim = Register(26);  // work dimention.
+    static const Register zero = Register(27);     //  scalar register holds zero.
+    static const Register one = Register(28);     //  scalar register holds one.
+    static const Register retVal = Register(29);   // helper register to do data flow analysis.
+    static const Register printfbptr = Register(30); // printf buffer address .
+    static const Register printfiptr = Register(31); // printf index buffer address.
+    static const Register dwblockip = Register(32);  // blockip
+    static const Register threadid = Register(33); // the thread id of this thread.
+    static const Register constant_addrspace = Register(34);  // starting address of program-scope constant
+    static const Register stacksize = Register(35); // stack buffer total size
+    static const uint32_t regNum = 36;             // number of special registers
     extern const char *specialRegMean[];           // special register name.
   } /* namespace ocl */
 
diff --git a/backend/src/libocl/src/ocl_workitem.cl b/backend/src/libocl/src/ocl_workitem.cl
index 235f12b..dc8fa6d 100644
--- a/backend/src/libocl/src/ocl_workitem.cl
+++ b/backend/src/libocl/src/ocl_workitem.cl
@@ -30,6 +30,7 @@ PURE CONST unsigned int __gen_ocl_##NAME##1(void); \
 PURE CONST unsigned int __gen_ocl_##NAME##2(void);
 DECL_INTERNAL_WORK_ITEM_FN(get_group_id)
 DECL_INTERNAL_WORK_ITEM_FN(get_local_id)
+DECL_INTERNAL_WORK_ITEM_FN(get_enqueued_local_size)
 DECL_INTERNAL_WORK_ITEM_FN(get_local_size)
 DECL_INTERNAL_WORK_ITEM_FN(get_global_size)
 DECL_INTERNAL_WORK_ITEM_FN(get_global_offset)
@@ -46,6 +47,7 @@ OVERLOADABLE unsigned NAME(unsigned int dim) {             \
 
 DECL_PUBLIC_WORK_ITEM_FN(get_group_id, 0)
 DECL_PUBLIC_WORK_ITEM_FN(get_local_id, 0)
+DECL_PUBLIC_WORK_ITEM_FN(get_enqueued_local_size, 1)
 DECL_PUBLIC_WORK_ITEM_FN(get_local_size, 1)
 DECL_PUBLIC_WORK_ITEM_FN(get_global_size, 1)
 DECL_PUBLIC_WORK_ITEM_FN(get_global_offset, 0)
@@ -53,14 +55,7 @@ DECL_PUBLIC_WORK_ITEM_FN(get_num_groups, 1)
 #undef DECL_PUBLIC_WORK_ITEM_FN
 
 OVERLOADABLE uint get_global_id(uint dim) {
-  return get_local_id(dim) + get_local_size(dim) * get_group_id(dim) + get_global_offset(dim);
-}
-
-OVERLOADABLE uint get_enqueued_local_size (uint dimindx)
-{
-  //TODO: should be different with get_local_size when support
-  //non-uniform work-group size
-  return get_local_size(dimindx);
+  return get_local_id(dim) + get_enqueued_local_size(dim) * get_group_id(dim) + get_global_offset(dim);
 }
 
 OVERLOADABLE uint get_global_linear_id(void)
@@ -80,8 +75,8 @@ OVERLOADABLE uint get_local_linear_id(void)
 {
   uint dim = __gen_ocl_get_work_dim();
   if (dim == 1) return get_local_id(0);
-  else if (dim == 2) return get_local_id(1) * get_local_size (0) + get_local_id(0);
-  else if (dim == 3) return (get_local_id(2) * get_local_size(1) * get_local_size(0)) +
-                            (get_local_id(1) * get_local_size(0)) + get_local_id(0);
+  else if (dim == 2) return get_local_id(1) * get_enqueued_local_size(0) + get_local_id(0);
+  else if (dim == 3) return (get_local_id(2) * get_enqueued_local_size(1) * get_local_size(0)) +
+                            (get_local_id(1) * get_enqueued_local_size(0)) + get_local_id(0);
   else return 0;
 }
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index cca1781..4a98678 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -3507,6 +3507,12 @@ namespace gbe
         regTranslator.newScalarProxy(ir::ocl::lsize1, dst); break;
       case GEN_OCL_GET_LOCAL_SIZE2:
         regTranslator.newScalarProxy(ir::ocl::lsize2, dst); break;
+      case GEN_OCL_GET_ENQUEUED_LOCAL_SIZE0:
+        regTranslator.newScalarProxy(ir::ocl::enqlsize0, dst); break;
+      case GEN_OCL_GET_ENQUEUED_LOCAL_SIZE1:
+        regTranslator.newScalarProxy(ir::ocl::enqlsize1, dst); break;
+      case GEN_OCL_GET_ENQUEUED_LOCAL_SIZE2:
+        regTranslator.newScalarProxy(ir::ocl::enqlsize2, dst); break;
       case GEN_OCL_GET_GLOBAL_SIZE0:
         regTranslator.newScalarProxy(ir::ocl::gsize0, dst); break;
       case GEN_OCL_GET_GLOBAL_SIZE1:
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 7bd59fc..09feb1a 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -10,6 +10,9 @@ DECL_LLVM_GEN_FUNCTION(GET_NUM_GROUPS2, __gen_ocl_get_num_groups2)
 DECL_LLVM_GEN_FUNCTION(GET_LOCAL_SIZE0, __gen_ocl_get_local_size0)
 DECL_LLVM_GEN_FUNCTION(GET_LOCAL_SIZE1, __gen_ocl_get_local_size1)
 DECL_LLVM_GEN_FUNCTION(GET_LOCAL_SIZE2, __gen_ocl_get_local_size2)
+DECL_LLVM_GEN_FUNCTION(GET_ENQUEUED_LOCAL_SIZE0, __gen_ocl_get_enqueued_local_size0)
+DECL_LLVM_GEN_FUNCTION(GET_ENQUEUED_LOCAL_SIZE1, __gen_ocl_get_enqueued_local_size1)
+DECL_LLVM_GEN_FUNCTION(GET_ENQUEUED_LOCAL_SIZE2, __gen_ocl_get_enqueued_local_size2)
 DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_SIZE0, __gen_ocl_get_global_size0)
 DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_SIZE1, __gen_ocl_get_global_size1)
 DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_SIZE2, __gen_ocl_get_global_size2)
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 38cf56b..6bfacbf 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -221,6 +221,7 @@ cl_curbe_fill(cl_kernel ker,
               const size_t *global_wk_off,
               const size_t *global_wk_sz,
               const size_t *local_wk_sz,
+              const size_t *enqueued_local_wk_sz,
               size_t thread_n)
 {
   int32_t offset;
@@ -230,6 +231,9 @@ cl_curbe_fill(cl_kernel ker,
   UPLOAD(GBE_CURBE_LOCAL_SIZE_X, local_wk_sz[0]);
   UPLOAD(GBE_CURBE_LOCAL_SIZE_Y, local_wk_sz[1]);
   UPLOAD(GBE_CURBE_LOCAL_SIZE_Z, local_wk_sz[2]);
+  UPLOAD(GBE_CURBE_ENQUEUED_LOCAL_SIZE_X, enqueued_local_wk_sz[0]);
+  UPLOAD(GBE_CURBE_ENQUEUED_LOCAL_SIZE_Y, enqueued_local_wk_sz[1]);
+  UPLOAD(GBE_CURBE_ENQUEUED_LOCAL_SIZE_Z, enqueued_local_wk_sz[2]);
   UPLOAD(GBE_CURBE_GLOBAL_SIZE_X, global_wk_sz[0]);
   UPLOAD(GBE_CURBE_GLOBAL_SIZE_Y, global_wk_sz[1]);
   UPLOAD(GBE_CURBE_GLOBAL_SIZE_Z, global_wk_sz[2]);
@@ -374,7 +378,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   }
   /* Curbe step 1: fill the constant urb buffer data shared by all threads */
   if (ker->curbe) {
-    kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz, local_wk_sz, thread_n);
+    kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz,local_wk_sz ,local_wk_sz, thread_n);
     if (kernel.slm_sz > ker->program->ctx->device->local_mem_size) {
       fprintf(stderr, "Beignet: Out of shared local memory %d.\n", kernel.slm_sz);
       return CL_OUT_OF_RESOURCES;
-- 
2.5.0



More information about the Beignet mailing list