[Beignet] [PATCH 2/4] GBE: Implement to_local/private/global() function

xionghu.luo at intel.com xionghu.luo at intel.com
Mon Jan 18 19:29:07 PST 2016


From: Ruiling Song <ruiling.song at intel.com>

To avoid a zero address in local memory, 4 bytes are reserved. This is temporary and will be fixed later.

Signed-off-by: Ruiling Song <ruiling.song at intel.com>
---
 backend/src/backend/program.h              |  1 +
 backend/src/ir/profile.cpp                 |  4 +++-
 backend/src/ir/profile.hpp                 |  3 ++-
 backend/src/libocl/include/ocl_misc.h      |  6 ++++++
 backend/src/libocl/src/ocl_misc.cl         | 21 +++++++++++++++++++++
 backend/src/llvm/llvm_gen_backend.cpp      | 22 ++++++++++++++++++++++
 backend/src/llvm/llvm_gen_ocl_function.hxx |  1 +
 src/cl_command_queue_gen7.c                | 12 +++++++++---
 8 files changed, 65 insertions(+), 5 deletions(-)

diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index 03150bc..0eece8f 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -100,6 +100,7 @@ enum gbe_curbe_type {
   GBE_CURBE_DW_BLOCK_IP,
   GBE_CURBE_THREAD_NUM,
   GBE_CURBE_CONSTANT_ADDRSPACE,
+  GBE_CURBE_STACK_SIZE,
   GBE_GEN_REG,
 };
 
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index 0699167..4f28e34 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -44,7 +44,8 @@ namespace ir {
         "retVal",
         "printf_buffer_pointer", "printf_index_buffer_pointer",
         "dwblockip",
-        "constant_addrspace_start"
+        "constant_addrspace_start",
+        "stack_size"
     };
 
 #if GBE_DEBUG
@@ -88,6 +89,7 @@ namespace ir {
       DECL_NEW_REG(FAMILY_QWORD, printfiptr, 1, GBE_CURBE_PRINTF_INDEX_POINTER);
       DECL_NEW_REG(FAMILY_DWORD, dwblockip, 0, GBE_CURBE_DW_BLOCK_IP);
       DECL_NEW_REG(FAMILY_QWORD, constant_addrspace, 1, GBE_CURBE_CONSTANT_ADDRSPACE);
+      DECL_NEW_REG(FAMILY_QWORD, stacksize, 1, GBE_CURBE_STACK_SIZE);
     }
 #undef DECL_NEW_REG
 
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index 79761d4..f348e0d 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -72,7 +72,8 @@ namespace ir {
     static const Register printfiptr = Register(28); // printf index buffer address.
     static const Register dwblockip = Register(29);  // blockip
     static const Register constant_addrspace = Register(30);  // starting address of program-scope constant
-    static const uint32_t regNum = 31;             // number of special registers
+    static const Register stacksize = Register(31); // stack buffer total size
+    static const uint32_t regNum = 32;             // number of special registers
     extern const char *specialRegMean[];           // special register name.
   } /* namespace ocl */
 
diff --git a/backend/src/libocl/include/ocl_misc.h b/backend/src/libocl/include/ocl_misc.h
index 359025b..dba821d 100644
--- a/backend/src/libocl/include/ocl_misc.h
+++ b/backend/src/libocl/include/ocl_misc.h
@@ -137,4 +137,10 @@ struct time_stamp {
 };
 
 struct time_stamp __gen_ocl_get_timestamp(void);
+bool __gen_ocl_in_local(size_t p);
+bool __gen_ocl_in_private(size_t p);
+
+local void *to_local(generic void *p);
+global void *to_global(generic void *p);
+private void *to_private(generic void *p);
 #endif
diff --git a/backend/src/libocl/src/ocl_misc.cl b/backend/src/libocl/src/ocl_misc.cl
index 7f40054..232534d 100644
--- a/backend/src/libocl/src/ocl_misc.cl
+++ b/backend/src/libocl/src/ocl_misc.cl
@@ -229,3 +229,24 @@ struct time_stamp __gen_ocl_get_timestamp(void) {
 
   return val;
 };
+bool __gen_ocl_in_local(size_t p) { /* true iff 0 < p < 64*1024 */
+  bool cond1 = p > 0; /* address 0 never counts as local */
+  bool cond2 = p < 64*1024; /* exclusive upper bound of the tested range */
+  return cond1 && cond2;
+}
+
+local void *to_local(generic void *p) { /* cast generic pointer to local, or NULL */
+  bool cond = __gen_ocl_in_local((size_t)p); /* range test on the raw address value */
+  return cond ? (local void*)p : NULL; /* NULL when the address fails the local test */
+}
+private void *to_private(generic void *p) { /* cast generic pointer to private, or NULL */
+  bool cond = __gen_ocl_in_private((size_t)p); /* declared in ocl_misc.h; no body in this .cl file */
+  return cond ? (private void*)p : NULL; /* NULL when the address fails the private test */
+}
+
+global void *to_global(generic void *p) { /* cast generic pointer to global, or NULL */
+  bool cond1 = __gen_ocl_in_local((size_t)p); /* address tests as local */
+  bool cond2 = __gen_ocl_in_private((size_t)p); /* address tests as private */
+  bool cond = cond1 || cond2; /* global is decided by exclusion: neither local nor private */
+  return !cond ? (global void*)p : NULL; /* NULL when the address is local or private */
+}
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index d23a598..2e0bedc 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2580,6 +2580,8 @@ namespace gbe
         const Constant *c = v.getInitializer();
         Type *ty = c->getType();
         uint32_t oldSlm = f.getSLMSize();
+        // FIXME temporary reserve 4 bytes to avoid 0 address
+        if (oldSlm == 0) oldSlm = 4;
         uint32_t align = 8 * getAlignmentByte(unit, ty);
         uint32_t padding = getPadding(oldSlm*8, align);
 
@@ -3604,6 +3606,7 @@ namespace gbe
       case GEN_OCL_SIMD_SIZE:
       case GEN_OCL_READ_TM:
       case GEN_OCL_REGION:
+      case GEN_OCL_IN_PRIVATE:
       case GEN_OCL_SIMD_ID:
       case GEN_OCL_SIMD_SHUFFLE:
       case GEN_OCL_WORK_GROUP_ALL:
@@ -3969,6 +3972,25 @@ namespace gbe
             ctx.READ_ARF(ir::TYPE_U32, dst, ir::ARF_TM);
             break;
           }
+          case GEN_OCL_IN_PRIVATE:
+          {
+            const ir::Register dst = this->getRegister(&I);
+            uint32_t stackSize = ctx.getFunction().getStackSize();
+            if (stackSize == 0) {
+              ctx.MOV(ir::TYPE_BOOL, dst, ir::ocl::zero);
+            } else {
+              ir::Register cmp0 = ctx.reg(ir::FAMILY_BOOL);
+              ir::Register cmp1 = ctx.reg(ir::FAMILY_BOOL);
+              const ir::Register src0 = this->getRegister(*AI);
+              ir::Register tmp = ctx.reg(ir::FAMILY_QWORD);
+
+              ctx.GE(ir::TYPE_U64, cmp0, src0, ir::ocl::stackbuffer);
+              ctx.ADD(ir::TYPE_U64, tmp, ir::ocl::stackbuffer, ir::ocl::stacksize);
+              ctx.LT(ir::TYPE_U64, cmp1, src0, tmp);
+              ctx.AND(ir::TYPE_BOOL, dst, cmp0, cmp1);
+            }
+            break;
+          }
           case GEN_OCL_REGION:
           {
             const ir::Register dst = this->getRegister(&I);
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 0849f1e..92d4ea3 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -167,6 +167,7 @@ DECL_LLVM_GEN_FUNCTION(SIMD_SHUFFLE, intel_sub_group_shuffle)
 
 DECL_LLVM_GEN_FUNCTION(READ_TM, __gen_ocl_read_tm)
 DECL_LLVM_GEN_FUNCTION(REGION, __gen_ocl_region)
+DECL_LLVM_GEN_FUNCTION(IN_PRIVATE, __gen_ocl_in_private)
 
 // printf function
 DECL_LLVM_GEN_FUNCTION(PRINTF, __gen_ocl_printf)
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 61ffe7e..eba3445 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -263,7 +263,7 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
   const int32_t per_lane_stack_sz = ker->stack_size;
   const int32_t value = GBE_CURBE_EXTRA_ARGUMENT;
   const int32_t sub_value = GBE_STACK_BUFFER;
-  const int32_t offset = interp_kernel_get_curbe_offset(ker->opaque, value, sub_value);
+  const int32_t offset_stack_buffer = interp_kernel_get_curbe_offset(ker->opaque, value, sub_value);
   int32_t stack_sz = per_lane_stack_sz;
 
   /* No stack required for this kernel */
@@ -273,7 +273,7 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
   /* The stack size is given for *each* SIMD lane. So, we accordingly compute
    * the size we need for the complete machine
    */
-  assert(offset >= 0);
+  assert(offset_stack_buffer >= 0);
   stack_sz *= interp_kernel_get_simd_width(ker->opaque);
   stack_sz *= device->max_compute_unit * ctx->device->max_thread_per_unit;
   /* Because HSW calc stack offset per thread is relative with half slice, when
@@ -282,7 +282,13 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
    */
   if(cl_driver_get_ver(ctx->drv) == 75)
     stack_sz *= 4;
-  cl_gpgpu_set_stack(gpgpu, offset, stack_sz, BTI_PRIVATE);
+
+  const int32_t offset_stack_size = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_STACK_SIZE, 0);
+  if (offset_stack_size >= 0) {
+    *(uint64_t *)(ker->curbe + offset_stack_size) = stack_sz;
+  }
+
+  cl_gpgpu_set_stack(gpgpu, offset_stack_buffer, stack_sz, BTI_PRIVATE);
 }
 
 static int
-- 
2.4.1



More information about the Beignet mailing list