[Beignet] [PATCH 1/2] Backend: Refine get_enqueued_local_size and get_local_size
Xiuli Pan
xiuli.pan at intel.com
Tue Mar 15 23:52:45 UTC 2016
From: Pan Xiuli <xiuli.pan at intel.com>
Use curbe register for these two size.
Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
---
backend/src/backend/gen_insn_selection.cpp | 7 +++--
backend/src/backend/program.h | 3 ++
backend/src/ir/profile.cpp | 4 +++
backend/src/ir/profile.hpp | 47 ++++++++++++++++--------------
backend/src/libocl/src/ocl_workitem.cl | 17 ++++-------
backend/src/llvm/llvm_gen_backend.cpp | 6 ++++
backend/src/llvm/llvm_gen_ocl_function.hxx | 3 ++
src/cl_command_queue_gen7.c | 6 +++-
8 files changed, 57 insertions(+), 36 deletions(-)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 90e15c3..77614b6 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -3423,8 +3423,11 @@ namespace gbe
reg == ir::ocl::lid1 ||
reg == ir::ocl::lid2 ||
reg == ir::ocl::lsize0 ||
- reg == ir::ocl::lsize1||
- reg == ir::ocl::lsize2)
+ reg == ir::ocl::lsize1 ||
+ reg == ir::ocl::lsize2 ||
+ reg == ir::ocl::enqlsize0 ||
+ reg == ir::ocl::enqlsize1 ||
+ reg == ir::ocl::enqlsize2)
return true;
else
return false;
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index 4dd3ae3..a690e3d 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -83,6 +83,9 @@ enum gbe_curbe_type {
GBE_CURBE_LOCAL_SIZE_X,
GBE_CURBE_LOCAL_SIZE_Y,
GBE_CURBE_LOCAL_SIZE_Z,
+ GBE_CURBE_ENQUEUED_LOCAL_SIZE_X,
+ GBE_CURBE_ENQUEUED_LOCAL_SIZE_Y,
+ GBE_CURBE_ENQUEUED_LOCAL_SIZE_Z,
GBE_CURBE_GLOBAL_SIZE_X,
GBE_CURBE_GLOBAL_SIZE_Y,
GBE_CURBE_GLOBAL_SIZE_Z,
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index ce5e8e7..0907d76 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -35,6 +35,7 @@ namespace ir {
"group_id_0", "group_id_1", "group_id_2",
"num_groups_0", "num_groups_1", "num_groups_2",
"local_size_0", "local_size_1", "local_size_2",
+ "enqueued_local_size_0", "enqueued_local_size_1", "enqueued_local_size_2",
"global_size_0", "global_size_1", "global_size_2",
"global_offset_0", "global_offset_1", "global_offset_2",
"stack_pointer", "stack_buffer",
@@ -71,6 +72,9 @@ namespace ir {
DECL_NEW_REG(FAMILY_DWORD, lsize0, 1, GBE_CURBE_LOCAL_SIZE_X);
DECL_NEW_REG(FAMILY_DWORD, lsize1, 1, GBE_CURBE_LOCAL_SIZE_Y);
DECL_NEW_REG(FAMILY_DWORD, lsize2, 1, GBE_CURBE_LOCAL_SIZE_Z);
+ DECL_NEW_REG(FAMILY_DWORD, enqlsize0, 1, GBE_CURBE_ENQUEUED_LOCAL_SIZE_X);
+ DECL_NEW_REG(FAMILY_DWORD, enqlsize1, 1, GBE_CURBE_ENQUEUED_LOCAL_SIZE_Y);
+ DECL_NEW_REG(FAMILY_DWORD, enqlsize2, 1, GBE_CURBE_ENQUEUED_LOCAL_SIZE_Z);
DECL_NEW_REG(FAMILY_DWORD, gsize0, 1, GBE_CURBE_GLOBAL_SIZE_X);
DECL_NEW_REG(FAMILY_DWORD, gsize1, 1, GBE_CURBE_GLOBAL_SIZE_Y);
DECL_NEW_REG(FAMILY_DWORD, gsize2, 1, GBE_CURBE_GLOBAL_SIZE_Z);
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index 35b49e3..b093adf 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -53,28 +53,31 @@ namespace ir {
static const Register lsize0 = Register(9); // get_local_size(0)
static const Register lsize1 = Register(10); // get_local_size(1)
static const Register lsize2 = Register(11); // get_local_size(2)
- static const Register gsize0 = Register(12); // get_global_size(0)
- static const Register gsize1 = Register(13); // get_global_size(1)
- static const Register gsize2 = Register(14); // get_global_size(2)
- static const Register goffset0 = Register(15); // get_global_offset(0)
- static const Register goffset1 = Register(16); // get_global_offset(1)
- static const Register goffset2 = Register(17); // get_global_offset(2)
- static const Register stackptr = Register(18); // stack pointer
- static const Register stackbuffer = Register(19); // stack buffer base address.
- static const Register blockip = Register(20); // blockip
- static const Register barrierid = Register(21);// barrierid
- static const Register threadn = Register(22); // number of threads
- static const Register workdim = Register(23); // work dimention.
- static const Register zero = Register(24); // scalar register holds zero.
- static const Register one = Register(25); // scalar register holds one.
- static const Register retVal = Register(26); // helper register to do data flow analysis.
- static const Register printfbptr = Register(27); // printf buffer address .
- static const Register printfiptr = Register(28); // printf index buffer address.
- static const Register dwblockip = Register(29); // blockip
- static const Register threadid = Register(30); // the thread id of this thread.
- static const Register constant_addrspace = Register(31); // starting address of program-scope constant
- static const Register stacksize = Register(32); // stack buffer total size
- static const uint32_t regNum = 33; // number of special registers
+ static const Register enqlsize0 = Register(12); // get_local_size(0)
+ static const Register enqlsize1 = Register(13); // get_local_size(1)
+ static const Register enqlsize2 = Register(14); // get_local_size(2)
+ static const Register gsize0 = Register(15); // get_global_size(0)
+ static const Register gsize1 = Register(16); // get_global_size(1)
+ static const Register gsize2 = Register(17); // get_global_size(2)
+ static const Register goffset0 = Register(18); // get_global_offset(0)
+ static const Register goffset1 = Register(19); // get_global_offset(1)
+ static const Register goffset2 = Register(20); // get_global_offset(2)
+ static const Register stackptr = Register(21); // stack pointer
+ static const Register stackbuffer = Register(22); // stack buffer base address.
+ static const Register blockip = Register(23); // blockip
+ static const Register barrierid = Register(24);// barrierid
+ static const Register threadn = Register(25); // number of threads
+ static const Register workdim = Register(26); // work dimention.
+ static const Register zero = Register(27); // scalar register holds zero.
+ static const Register one = Register(28); // scalar register holds one.
+ static const Register retVal = Register(29); // helper register to do data flow analysis.
+ static const Register printfbptr = Register(30); // printf buffer address .
+ static const Register printfiptr = Register(31); // printf index buffer address.
+ static const Register dwblockip = Register(32); // blockip
+ static const Register threadid = Register(33); // the thread id of this thread.
+ static const Register constant_addrspace = Register(34); // starting address of program-scope constant
+ static const Register stacksize = Register(35); // stack buffer total size
+ static const uint32_t regNum = 36; // number of special registers
extern const char *specialRegMean[]; // special register name.
} /* namespace ocl */
diff --git a/backend/src/libocl/src/ocl_workitem.cl b/backend/src/libocl/src/ocl_workitem.cl
index 235f12b..dc8fa6d 100644
--- a/backend/src/libocl/src/ocl_workitem.cl
+++ b/backend/src/libocl/src/ocl_workitem.cl
@@ -30,6 +30,7 @@ PURE CONST unsigned int __gen_ocl_##NAME##1(void); \
PURE CONST unsigned int __gen_ocl_##NAME##2(void);
DECL_INTERNAL_WORK_ITEM_FN(get_group_id)
DECL_INTERNAL_WORK_ITEM_FN(get_local_id)
+DECL_INTERNAL_WORK_ITEM_FN(get_enqueued_local_size)
DECL_INTERNAL_WORK_ITEM_FN(get_local_size)
DECL_INTERNAL_WORK_ITEM_FN(get_global_size)
DECL_INTERNAL_WORK_ITEM_FN(get_global_offset)
@@ -46,6 +47,7 @@ OVERLOADABLE unsigned NAME(unsigned int dim) { \
DECL_PUBLIC_WORK_ITEM_FN(get_group_id, 0)
DECL_PUBLIC_WORK_ITEM_FN(get_local_id, 0)
+DECL_PUBLIC_WORK_ITEM_FN(get_enqueued_local_size, 1)
DECL_PUBLIC_WORK_ITEM_FN(get_local_size, 1)
DECL_PUBLIC_WORK_ITEM_FN(get_global_size, 1)
DECL_PUBLIC_WORK_ITEM_FN(get_global_offset, 0)
@@ -53,14 +55,7 @@ DECL_PUBLIC_WORK_ITEM_FN(get_num_groups, 1)
#undef DECL_PUBLIC_WORK_ITEM_FN
OVERLOADABLE uint get_global_id(uint dim) {
- return get_local_id(dim) + get_local_size(dim) * get_group_id(dim) + get_global_offset(dim);
-}
-
-OVERLOADABLE uint get_enqueued_local_size (uint dimindx)
-{
- //TODO: should be different with get_local_size when support
- //non-uniform work-group size
- return get_local_size(dimindx);
+ return get_local_id(dim) + get_enqueued_local_size(dim) * get_group_id(dim) + get_global_offset(dim);
}
OVERLOADABLE uint get_global_linear_id(void)
@@ -80,8 +75,8 @@ OVERLOADABLE uint get_local_linear_id(void)
{
uint dim = __gen_ocl_get_work_dim();
if (dim == 1) return get_local_id(0);
- else if (dim == 2) return get_local_id(1) * get_local_size (0) + get_local_id(0);
- else if (dim == 3) return (get_local_id(2) * get_local_size(1) * get_local_size(0)) +
- (get_local_id(1) * get_local_size(0)) + get_local_id(0);
+ else if (dim == 2) return get_local_id(1) * get_enqueued_local_size(0) + get_local_id(0);
+ else if (dim == 3) return (get_local_id(2) * get_enqueued_local_size(1) * get_local_size(0)) +
+ (get_local_id(1) * get_enqueued_local_size(0)) + get_local_id(0);
else return 0;
}
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index cca1781..4a98678 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -3507,6 +3507,12 @@ namespace gbe
regTranslator.newScalarProxy(ir::ocl::lsize1, dst); break;
case GEN_OCL_GET_LOCAL_SIZE2:
regTranslator.newScalarProxy(ir::ocl::lsize2, dst); break;
+ case GEN_OCL_GET_ENQUEUED_LOCAL_SIZE0:
+ regTranslator.newScalarProxy(ir::ocl::enqlsize0, dst); break;
+ case GEN_OCL_GET_ENQUEUED_LOCAL_SIZE1:
+ regTranslator.newScalarProxy(ir::ocl::enqlsize1, dst); break;
+ case GEN_OCL_GET_ENQUEUED_LOCAL_SIZE2:
+ regTranslator.newScalarProxy(ir::ocl::enqlsize2, dst); break;
case GEN_OCL_GET_GLOBAL_SIZE0:
regTranslator.newScalarProxy(ir::ocl::gsize0, dst); break;
case GEN_OCL_GET_GLOBAL_SIZE1:
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 7bd59fc..09feb1a 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -10,6 +10,9 @@ DECL_LLVM_GEN_FUNCTION(GET_NUM_GROUPS2, __gen_ocl_get_num_groups2)
DECL_LLVM_GEN_FUNCTION(GET_LOCAL_SIZE0, __gen_ocl_get_local_size0)
DECL_LLVM_GEN_FUNCTION(GET_LOCAL_SIZE1, __gen_ocl_get_local_size1)
DECL_LLVM_GEN_FUNCTION(GET_LOCAL_SIZE2, __gen_ocl_get_local_size2)
+DECL_LLVM_GEN_FUNCTION(GET_ENQUEUED_LOCAL_SIZE0, __gen_ocl_get_enqueued_local_size0)
+DECL_LLVM_GEN_FUNCTION(GET_ENQUEUED_LOCAL_SIZE1, __gen_ocl_get_enqueued_local_size1)
+DECL_LLVM_GEN_FUNCTION(GET_ENQUEUED_LOCAL_SIZE2, __gen_ocl_get_enqueued_local_size2)
DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_SIZE0, __gen_ocl_get_global_size0)
DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_SIZE1, __gen_ocl_get_global_size1)
DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_SIZE2, __gen_ocl_get_global_size2)
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 38cf56b..6bfacbf 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -221,6 +221,7 @@ cl_curbe_fill(cl_kernel ker,
const size_t *global_wk_off,
const size_t *global_wk_sz,
const size_t *local_wk_sz,
+ const size_t *enqueued_local_wk_sz,
size_t thread_n)
{
int32_t offset;
@@ -230,6 +231,9 @@ cl_curbe_fill(cl_kernel ker,
UPLOAD(GBE_CURBE_LOCAL_SIZE_X, local_wk_sz[0]);
UPLOAD(GBE_CURBE_LOCAL_SIZE_Y, local_wk_sz[1]);
UPLOAD(GBE_CURBE_LOCAL_SIZE_Z, local_wk_sz[2]);
+ UPLOAD(GBE_CURBE_ENQUEUED_LOCAL_SIZE_X, enqueued_local_wk_sz[0]);
+ UPLOAD(GBE_CURBE_ENQUEUED_LOCAL_SIZE_Y, enqueued_local_wk_sz[1]);
+ UPLOAD(GBE_CURBE_ENQUEUED_LOCAL_SIZE_Z, enqueued_local_wk_sz[2]);
UPLOAD(GBE_CURBE_GLOBAL_SIZE_X, global_wk_sz[0]);
UPLOAD(GBE_CURBE_GLOBAL_SIZE_Y, global_wk_sz[1]);
UPLOAD(GBE_CURBE_GLOBAL_SIZE_Z, global_wk_sz[2]);
@@ -374,7 +378,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
}
/* Curbe step 1: fill the constant urb buffer data shared by all threads */
if (ker->curbe) {
- kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz, local_wk_sz, thread_n);
+ kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz,local_wk_sz ,local_wk_sz, thread_n);
if (kernel.slm_sz > ker->program->ctx->device->local_mem_size) {
fprintf(stderr, "Beignet: Out of shared local memory %d.\n", kernel.slm_sz);
return CL_OUT_OF_RESOURCES;
--
2.5.0
More information about the Beignet
mailing list