[Beignet] [PATCH 2/4] GBE: Implement to_local/private/global() function
xionghu.luo at intel.com
xionghu.luo at intel.com
Mon Jan 18 19:29:07 PST 2016
From: Ruiling Song <ruiling.song at intel.com>
To avoid a zero address in local memory, 4 bytes are reserved. This will be fixed later.
Signed-off-by: Ruiling Song <ruiling.song at intel.com>
---
backend/src/backend/program.h | 1 +
backend/src/ir/profile.cpp | 4 +++-
backend/src/ir/profile.hpp | 3 ++-
backend/src/libocl/include/ocl_misc.h | 6 ++++++
backend/src/libocl/src/ocl_misc.cl | 21 +++++++++++++++++++++
backend/src/llvm/llvm_gen_backend.cpp | 22 ++++++++++++++++++++++
backend/src/llvm/llvm_gen_ocl_function.hxx | 1 +
src/cl_command_queue_gen7.c | 12 +++++++++---
8 files changed, 65 insertions(+), 5 deletions(-)
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index 03150bc..0eece8f 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -100,6 +100,7 @@ enum gbe_curbe_type {
GBE_CURBE_DW_BLOCK_IP,
GBE_CURBE_THREAD_NUM,
GBE_CURBE_CONSTANT_ADDRSPACE,
+ GBE_CURBE_STACK_SIZE,
GBE_GEN_REG,
};
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index 0699167..4f28e34 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -44,7 +44,8 @@ namespace ir {
"retVal",
"printf_buffer_pointer", "printf_index_buffer_pointer",
"dwblockip",
- "constant_addrspace_start"
+ "constant_addrspace_start",
+ "stack_size"
};
#if GBE_DEBUG
@@ -88,6 +89,7 @@ namespace ir {
DECL_NEW_REG(FAMILY_QWORD, printfiptr, 1, GBE_CURBE_PRINTF_INDEX_POINTER);
DECL_NEW_REG(FAMILY_DWORD, dwblockip, 0, GBE_CURBE_DW_BLOCK_IP);
DECL_NEW_REG(FAMILY_QWORD, constant_addrspace, 1, GBE_CURBE_CONSTANT_ADDRSPACE);
+ DECL_NEW_REG(FAMILY_QWORD, stacksize, 1, GBE_CURBE_STACK_SIZE);
}
#undef DECL_NEW_REG
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index 79761d4..f348e0d 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -72,7 +72,8 @@ namespace ir {
static const Register printfiptr = Register(28); // printf index buffer address.
static const Register dwblockip = Register(29); // blockip
static const Register constant_addrspace = Register(30); // starting address of program-scope constant
- static const uint32_t regNum = 31; // number of special registers
+ static const Register stacksize = Register(31); // stack buffer total size
+ static const uint32_t regNum = 32; // number of special registers
extern const char *specialRegMean[]; // special register name.
} /* namespace ocl */
diff --git a/backend/src/libocl/include/ocl_misc.h b/backend/src/libocl/include/ocl_misc.h
index 359025b..dba821d 100644
--- a/backend/src/libocl/include/ocl_misc.h
+++ b/backend/src/libocl/include/ocl_misc.h
@@ -137,4 +137,10 @@ struct time_stamp {
};
struct time_stamp __gen_ocl_get_timestamp(void);
+bool __gen_ocl_in_local(size_t p);
+bool __gen_ocl_in_private(size_t p);
+
+local void *to_local(generic void *p);
+global void *to_global(generic void *p);
+private void *to_private(generic void *p);
#endif
diff --git a/backend/src/libocl/src/ocl_misc.cl b/backend/src/libocl/src/ocl_misc.cl
index 7f40054..232534d 100644
--- a/backend/src/libocl/src/ocl_misc.cl
+++ b/backend/src/libocl/src/ocl_misc.cl
@@ -229,3 +229,24 @@ struct time_stamp __gen_ocl_get_timestamp(void) {
return val;
};
+bool __gen_ocl_in_local(size_t p) { /* Returns true when the address value p satisfies 0 < p < 64*1024. */
+ bool cond1 = p > 0; /* address 0 never counts as local; the commit message says 4 bytes are reserved to avoid a zero local address */
+ bool cond2 = p < 64*1024; /* NOTE(review): 64*1024 is presumably the local memory size limit -- confirm */
+ return cond1 && cond2;
+}
+
+local void *to_local(generic void *p) { /* Returns p as a local pointer when __gen_ocl_in_local() accepts its address, otherwise NULL. */
+ bool cond = __gen_ocl_in_local((size_t)p); /* the range test is done on the pointer's integer value */
+ return cond ? (local void*)p : NULL;
+}
+private void *to_private(generic void *p) { /* Returns p as a private pointer when __gen_ocl_in_private() accepts its address, otherwise NULL. */
+ bool cond = __gen_ocl_in_private((size_t)p); /* __gen_ocl_in_private is only declared in ocl_misc.h; this patch maps it to GEN_OCL_IN_PRIVATE in the backend */
+ return cond ? (private void*)p : NULL;
+}
+
+global void *to_global(generic void *p) { /* Returns p as a global pointer when it is neither local nor private, otherwise NULL. */
+ bool cond1 = __gen_ocl_in_local((size_t)p); /* true when the address passes the local range test */
+ bool cond2 = __gen_ocl_in_private((size_t)p); /* true when the address passes the private test */
+ bool cond = cond1 || cond2; /* global is decided by elimination: there is no direct global-range test here */
+ return !cond ? (global void*)p : NULL;
+}
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index d23a598..2e0bedc 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -2580,6 +2580,8 @@ namespace gbe
const Constant *c = v.getInitializer();
Type *ty = c->getType();
uint32_t oldSlm = f.getSLMSize();
+ // FIXME: temporarily reserve 4 bytes to avoid address 0
+ if (oldSlm == 0) oldSlm = 4;
uint32_t align = 8 * getAlignmentByte(unit, ty);
uint32_t padding = getPadding(oldSlm*8, align);
@@ -3604,6 +3606,7 @@ namespace gbe
case GEN_OCL_SIMD_SIZE:
case GEN_OCL_READ_TM:
case GEN_OCL_REGION:
+ case GEN_OCL_IN_PRIVATE:
case GEN_OCL_SIMD_ID:
case GEN_OCL_SIMD_SHUFFLE:
case GEN_OCL_WORK_GROUP_ALL:
@@ -3969,6 +3972,25 @@ namespace gbe
ctx.READ_ARF(ir::TYPE_U32, dst, ir::ARF_TM);
break;
}
+ case GEN_OCL_IN_PRIVATE: // Emits IR for __gen_ocl_in_private(p): a boolean result written to dst.
+ {
+ const ir::Register dst = this->getRegister(&I); // register that receives the boolean result of the call
+ uint32_t stackSize = ctx.getFunction().getStackSize();
+ if (stackSize == 0) { // the function reports no stack, so the result is constant false
+ ctx.MOV(ir::TYPE_BOOL, dst, ir::ocl::zero);
+ } else {
+ ir::Register cmp0 = ctx.reg(ir::FAMILY_BOOL); // holds the lower-bound test result
+ ir::Register cmp1 = ctx.reg(ir::FAMILY_BOOL); // holds the upper-bound test result
+ const ir::Register src0 = this->getRegister(*AI); // the address argument being tested
+ ir::Register tmp = ctx.reg(ir::FAMILY_QWORD); // 64-bit temporary for stackbuffer + stacksize
+
+ ctx.GE(ir::TYPE_U64, cmp0, src0, ir::ocl::stackbuffer); // cmp0 = src0 >= stackbuffer (assumes dst, lhs, rhs operand order -- confirm)
+ ctx.ADD(ir::TYPE_U64, tmp, ir::ocl::stackbuffer, ir::ocl::stacksize); // tmp = stackbuffer + stacksize; stacksize is the special register this patch adds
+ ctx.LT(ir::TYPE_U64, cmp1, src0, tmp); // cmp1 = src0 < tmp
+ ctx.AND(ir::TYPE_BOOL, dst, cmp0, cmp1); // dst is true only when both bound tests hold
+ }
+ break;
+ }
case GEN_OCL_REGION:
{
const ir::Register dst = this->getRegister(&I);
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 0849f1e..92d4ea3 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -167,6 +167,7 @@ DECL_LLVM_GEN_FUNCTION(SIMD_SHUFFLE, intel_sub_group_shuffle)
DECL_LLVM_GEN_FUNCTION(READ_TM, __gen_ocl_read_tm)
DECL_LLVM_GEN_FUNCTION(REGION, __gen_ocl_region)
+DECL_LLVM_GEN_FUNCTION(IN_PRIVATE, __gen_ocl_in_private)
// printf function
DECL_LLVM_GEN_FUNCTION(PRINTF, __gen_ocl_printf)
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 61ffe7e..eba3445 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -263,7 +263,7 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
const int32_t per_lane_stack_sz = ker->stack_size;
const int32_t value = GBE_CURBE_EXTRA_ARGUMENT;
const int32_t sub_value = GBE_STACK_BUFFER;
- const int32_t offset = interp_kernel_get_curbe_offset(ker->opaque, value, sub_value);
+ const int32_t offset_stack_buffer = interp_kernel_get_curbe_offset(ker->opaque, value, sub_value);
int32_t stack_sz = per_lane_stack_sz;
/* No stack required for this kernel */
@@ -273,7 +273,7 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
/* The stack size is given for *each* SIMD lane. So, we accordingly compute
* the size we need for the complete machine
*/
- assert(offset >= 0);
+ assert(offset_stack_buffer >= 0);
stack_sz *= interp_kernel_get_simd_width(ker->opaque);
stack_sz *= device->max_compute_unit * ctx->device->max_thread_per_unit;
/* Because HSW calc stack offset per thread is relative with half slice, when
@@ -282,7 +282,13 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
*/
if(cl_driver_get_ver(ctx->drv) == 75)
stack_sz *= 4;
- cl_gpgpu_set_stack(gpgpu, offset, stack_sz, BTI_PRIVATE);
+
+ const int32_t offset_stack_size = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_STACK_SIZE, 0);
+ if (offset_stack_size >= 0) {
+ *(uint64_t *)(ker->curbe + offset_stack_size) = stack_sz;
+ }
+
+ cl_gpgpu_set_stack(gpgpu, offset_stack_buffer, stack_sz, BTI_PRIVATE);
}
static int
--
2.4.1
More information about the Beignet
mailing list