[Beignet] [PATCH] GBE: Add two builtin functions get_work_dim / get_global_offset.
Zhigang Gong
zhigang.gong at linux.intel.com
Thu May 30 21:56:57 PDT 2013
Signed-off-by: Zhigang Gong <zhigang.gong at linux.intel.com>
---
backend/src/backend/context.cpp | 2 ++
backend/src/backend/gen_reg_allocation.cpp | 1 +
backend/src/backend/program.h | 1 +
backend/src/ir/profile.cpp | 1 +
backend/src/ir/profile.hpp | 3 ++-
backend/src/llvm/llvm_gen_backend.cpp | 2 ++
backend/src/llvm/llvm_gen_ocl_function.hxx | 1 +
backend/src/ocl_stdlib.h | 9 +++++++--
src/cl_api.c | 1 +
src/cl_command_queue.c | 5 +++--
src/cl_command_queue.h | 1 +
src/cl_command_queue_gen7.c | 5 ++++-
12 files changed, 26 insertions(+), 6 deletions(-)
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index af1f579..48160de 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -430,6 +430,7 @@ namespace gbe
INSERT_REG(goffset0, GLOBAL_OFFSET_X, 1)
INSERT_REG(goffset1, GLOBAL_OFFSET_Y, 1)
INSERT_REG(goffset2, GLOBAL_OFFSET_Z, 1)
+ INSERT_REG(workdim, WORK_DIM, 1)
INSERT_REG(numgroup0, GROUP_NUM_X, 1)
INSERT_REG(numgroup1, GROUP_NUM_Y, 1)
INSERT_REG(numgroup2, GROUP_NUM_Z, 1)
@@ -621,6 +622,7 @@ namespace gbe
reg == ir::ocl::goffset0 ||
reg == ir::ocl::goffset1 ||
reg == ir::ocl::goffset2 ||
+ reg == ir::ocl::workdim ||
reg == ir::ocl::constoffst)
return true;
return false;
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index 8c9f358..469be12 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -518,6 +518,7 @@ namespace gbe
allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_X, ocl::goffset0);
allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_Y, ocl::goffset1);
allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_Z, ocl::goffset2);
+ allocatePayloadReg(GBE_CURBE_WORK_DIM, ocl::workdim);
allocatePayloadReg(GBE_CURBE_GROUP_NUM_X, ocl::numgroup0);
allocatePayloadReg(GBE_CURBE_GROUP_NUM_Y, ocl::numgroup1);
allocatePayloadReg(GBE_CURBE_GROUP_NUM_Z, ocl::numgroup2);
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index f178f8b..f36bfbf 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -69,6 +69,7 @@ enum gbe_curbe_type {
GBE_CURBE_GROUP_NUM_X,
GBE_CURBE_GROUP_NUM_Y,
GBE_CURBE_GROUP_NUM_Z,
+ GBE_CURBE_WORK_DIM,
GBE_CURBE_GLOBAL_CONSTANT_OFFSET,
GBE_CURBE_GLOBAL_CONSTANT_DATA,
GBE_CURBE_IMAGE_INFO,
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index c1dc650..99cd06c 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -76,6 +76,7 @@ namespace ir {
DECL_NEW_REG(FAMILY_DWORD, barrierid);
DECL_NEW_REG(FAMILY_DWORD, threadn);
DECL_NEW_REG(FAMILY_DWORD, constoffst);
+ DECL_NEW_REG(FAMILY_DWORD, workdim);
}
#undef DECL_NEW_REG
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index 32dd149..4b0ef5e 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -64,7 +64,8 @@ namespace ir {
static const Register barrierid = Register(20);// barrierid
static const Register threadn = Register(21); // number of threads
static const Register constoffst = Register(22); // offset of global constant array's curbe
- static const uint32_t regNum = 23; // number of special registers
+ static const Register workdim = Register(23); // work dimention.
+ static const uint32_t regNum = 24; // number of special registers
extern const char *specialRegMean[]; // special register name.
} /* namespace ocl */
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index db7d714..ad295a8 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -1645,6 +1645,8 @@ namespace gbe
regTranslator.newScalarProxy(ir::ocl::goffset1, dst); break;
case GEN_OCL_GET_GLOBAL_OFFSET2:
regTranslator.newScalarProxy(ir::ocl::goffset2, dst); break;
+ case GEN_OCL_GET_WORK_DIM:
+ regTranslator.newScalarProxy(ir::ocl::workdim, dst); break;
case GEN_OCL_COS:
case GEN_OCL_SIN:
case GEN_OCL_SQR:
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 0524744..6cd7298 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -16,6 +16,7 @@ DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_SIZE2, __gen_ocl_get_global_size2)
DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_OFFSET0, __gen_ocl_get_global_offset0)
DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_OFFSET1, __gen_ocl_get_global_offset1)
DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_OFFSET2, __gen_ocl_get_global_offset2)
+DECL_LLVM_GEN_FUNCTION(GET_WORK_DIM, __gen_ocl_get_work_dim)
// Math function
DECL_LLVM_GEN_FUNCTION(ABS, __gen_ocl_fabs)
diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h
index 613b844..144e78c 100644
--- a/backend/src/ocl_stdlib.h
+++ b/backend/src/ocl_stdlib.h
@@ -346,8 +346,11 @@ DEC(16);
/////////////////////////////////////////////////////////////////////////////
// Work Items functions (see 6.11.1 of OCL 1.1 spec)
/////////////////////////////////////////////////////////////////////////////
-// TODO get_global_offset
-// TODO get_work_dim
+
+PURE CONST uint __gen_ocl_get_work_dim(void);
+INLINE uint get_work_dim(void) {
+ return __gen_ocl_get_work_dim();
+}
#define DECL_INTERNAL_WORK_ITEM_FN(NAME) \
PURE CONST unsigned int __gen_ocl_##NAME##0(void); \
@@ -357,6 +360,7 @@ DECL_INTERNAL_WORK_ITEM_FN(get_group_id)
DECL_INTERNAL_WORK_ITEM_FN(get_local_id)
DECL_INTERNAL_WORK_ITEM_FN(get_local_size)
DECL_INTERNAL_WORK_ITEM_FN(get_global_size)
+DECL_INTERNAL_WORK_ITEM_FN(get_global_offset)
DECL_INTERNAL_WORK_ITEM_FN(get_num_groups)
#undef DECL_INTERNAL_WORK_ITEM_FN
@@ -371,6 +375,7 @@ DECL_PUBLIC_WORK_ITEM_FN(get_group_id)
DECL_PUBLIC_WORK_ITEM_FN(get_local_id)
DECL_PUBLIC_WORK_ITEM_FN(get_local_size)
DECL_PUBLIC_WORK_ITEM_FN(get_global_size)
+DECL_PUBLIC_WORK_ITEM_FN(get_global_offset)
DECL_PUBLIC_WORK_ITEM_FN(get_num_groups)
#undef DECL_PUBLIC_WORK_ITEM_FN
diff --git a/src/cl_api.c b/src/cl_api.c
index 9c5943b..5ef95b7 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -1453,6 +1453,7 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
/* Do device specific checks are enqueue the kernel */
err = cl_command_queue_ND_range(command_queue,
kernel,
+ work_dim,
fixed_global_off,
fixed_global_sz,
fixed_local_sz);
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 1a37c78..a3987d8 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -356,7 +356,7 @@ error:
}
#endif
-extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, const size_t *, const size_t *, const size_t *);
+extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, uint32_t, const size_t *, const size_t *, const size_t *);
static cl_int
cl_kernel_check_args(cl_kernel k)
@@ -371,6 +371,7 @@ cl_kernel_check_args(cl_kernel k)
LOCAL cl_int
cl_command_queue_ND_range(cl_command_queue queue,
cl_kernel k,
+ const uint32_t work_dim,
const size_t *global_wk_off,
const size_t *global_wk_sz,
const size_t *local_wk_sz)
@@ -394,7 +395,7 @@ cl_command_queue_ND_range(cl_command_queue queue,
#endif /* USE_FULSIM */
if (ver == 7 || ver == 75)
- TRY (cl_command_queue_ND_range_gen7, queue, k, global_wk_off, global_wk_sz, local_wk_sz);
+ TRY (cl_command_queue_ND_range_gen7, queue, k, work_dim, global_wk_off, global_wk_sz, local_wk_sz);
else
FATAL ("Unknown Gen Device");
diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
index f0c00f4..5a792a2 100644
--- a/src/cl_command_queue.h
+++ b/src/cl_command_queue.h
@@ -54,6 +54,7 @@ extern void cl_command_queue_add_ref(cl_command_queue);
/* Map ND range kernel from OCL API */
extern cl_int cl_command_queue_ND_range(cl_command_queue queue,
cl_kernel ker,
+ const uint32_t work_dim,
const size_t *global_work_offset,
const size_t *global_work_size,
const size_t *local_work_size);
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 770af4a..ea9b583 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -98,6 +98,7 @@ error:
/* Will return the total amount of slm used */
static int32_t
cl_curbe_fill(cl_kernel ker,
+ const uint32_t work_dim,
const size_t *global_wk_off,
const size_t *global_wk_sz,
const size_t *local_wk_sz,
@@ -120,6 +121,7 @@ cl_curbe_fill(cl_kernel ker,
UPLOAD(GBE_CURBE_GROUP_NUM_Y, global_wk_sz[1]/local_wk_sz[1]);
UPLOAD(GBE_CURBE_GROUP_NUM_Z, global_wk_sz[2]/local_wk_sz[2]);
UPLOAD(GBE_CURBE_THREAD_NUM, thread_n);
+ UPLOAD(GBE_CURBE_WORK_DIM, work_dim);
UPLOAD(GBE_CURBE_GLOBAL_CONSTANT_OFFSET, gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_GLOBAL_CONSTANT_DATA, 0) + 32);
#undef UPLOAD
@@ -185,6 +187,7 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
LOCAL cl_int
cl_command_queue_ND_range_gen7(cl_command_queue queue,
cl_kernel ker,
+ const uint32_t work_dim,
const size_t *global_wk_off,
const size_t *global_wk_sz,
const size_t *local_wk_sz)
@@ -214,7 +217,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
/* Curbe step 1: fill the constant buffer data shared by all threads */
if (ker->curbe)
- kernel.slm_sz = cl_curbe_fill(ker, global_wk_off, global_wk_sz, local_wk_sz, thread_n);
+ kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz, local_wk_sz, thread_n);
/* Setup the kernel */
cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32);
--
1.7.11.7
More information about the Beignet
mailing list