[Beignet] [PATCH] GBE: Add two builtin functions get_work_dim / get_global_offset.
Sun, Yi
yi.sun at intel.com
Tue Jun 4 02:30:26 PDT 2013
Great, this patch works well based on latest code.
And I'll send the refine the test case for it later.
Thanks
--Sun, Yi
On Fri, 2013-05-31 at 12:56 +0800, Zhigang Gong wrote:
> Signed-off-by: Zhigang Gong <zhigang.gong at linux.intel.com>
> ---
> backend/src/backend/context.cpp | 2 ++
> backend/src/backend/gen_reg_allocation.cpp | 1 +
> backend/src/backend/program.h | 1 +
> backend/src/ir/profile.cpp | 1 +
> backend/src/ir/profile.hpp | 3 ++-
> backend/src/llvm/llvm_gen_backend.cpp | 2 ++
> backend/src/llvm/llvm_gen_ocl_function.hxx | 1 +
> backend/src/ocl_stdlib.h | 9 +++++++--
> src/cl_api.c | 1 +
> src/cl_command_queue.c | 5 +++--
> src/cl_command_queue.h | 1 +
> src/cl_command_queue_gen7.c | 5 ++++-
> 12 files changed, 26 insertions(+), 6 deletions(-)
>
> diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
> index af1f579..48160de 100644
> --- a/backend/src/backend/context.cpp
> +++ b/backend/src/backend/context.cpp
> @@ -430,6 +430,7 @@ namespace gbe
> INSERT_REG(goffset0, GLOBAL_OFFSET_X, 1)
> INSERT_REG(goffset1, GLOBAL_OFFSET_Y, 1)
> INSERT_REG(goffset2, GLOBAL_OFFSET_Z, 1)
> + INSERT_REG(workdim, WORK_DIM, 1)
> INSERT_REG(numgroup0, GROUP_NUM_X, 1)
> INSERT_REG(numgroup1, GROUP_NUM_Y, 1)
> INSERT_REG(numgroup2, GROUP_NUM_Z, 1)
> @@ -621,6 +622,7 @@ namespace gbe
> reg == ir::ocl::goffset0 ||
> reg == ir::ocl::goffset1 ||
> reg == ir::ocl::goffset2 ||
> + reg == ir::ocl::workdim ||
> reg == ir::ocl::constoffst)
> return true;
> return false;
> diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
> index 8c9f358..469be12 100644
> --- a/backend/src/backend/gen_reg_allocation.cpp
> +++ b/backend/src/backend/gen_reg_allocation.cpp
> @@ -518,6 +518,7 @@ namespace gbe
> allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_X, ocl::goffset0);
> allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_Y, ocl::goffset1);
> allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_Z, ocl::goffset2);
> + allocatePayloadReg(GBE_CURBE_WORK_DIM, ocl::workdim);
> allocatePayloadReg(GBE_CURBE_GROUP_NUM_X, ocl::numgroup0);
> allocatePayloadReg(GBE_CURBE_GROUP_NUM_Y, ocl::numgroup1);
> allocatePayloadReg(GBE_CURBE_GROUP_NUM_Z, ocl::numgroup2);
> diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
> index f178f8b..f36bfbf 100644
> --- a/backend/src/backend/program.h
> +++ b/backend/src/backend/program.h
> @@ -69,6 +69,7 @@ enum gbe_curbe_type {
> GBE_CURBE_GROUP_NUM_X,
> GBE_CURBE_GROUP_NUM_Y,
> GBE_CURBE_GROUP_NUM_Z,
> + GBE_CURBE_WORK_DIM,
> GBE_CURBE_GLOBAL_CONSTANT_OFFSET,
> GBE_CURBE_GLOBAL_CONSTANT_DATA,
> GBE_CURBE_IMAGE_INFO,
> diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
> index c1dc650..99cd06c 100644
> --- a/backend/src/ir/profile.cpp
> +++ b/backend/src/ir/profile.cpp
> @@ -76,6 +76,7 @@ namespace ir {
> DECL_NEW_REG(FAMILY_DWORD, barrierid);
> DECL_NEW_REG(FAMILY_DWORD, threadn);
> DECL_NEW_REG(FAMILY_DWORD, constoffst);
> + DECL_NEW_REG(FAMILY_DWORD, workdim);
> }
> #undef DECL_NEW_REG
>
> diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
> index 32dd149..4b0ef5e 100644
> --- a/backend/src/ir/profile.hpp
> +++ b/backend/src/ir/profile.hpp
> @@ -64,7 +64,8 @@ namespace ir {
> static const Register barrierid = Register(20);// barrierid
> static const Register threadn = Register(21); // number of threads
> static const Register constoffst = Register(22); // offset of global constant array's curbe
> - static const uint32_t regNum = 23; // number of special registers
> + static const Register workdim = Register(23); // work dimention.
> + static const uint32_t regNum = 24; // number of special registers
> extern const char *specialRegMean[]; // special register name.
> } /* namespace ocl */
>
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
> index db7d714..ad295a8 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -1645,6 +1645,8 @@ namespace gbe
> regTranslator.newScalarProxy(ir::ocl::goffset1, dst); break;
> case GEN_OCL_GET_GLOBAL_OFFSET2:
> regTranslator.newScalarProxy(ir::ocl::goffset2, dst); break;
> + case GEN_OCL_GET_WORK_DIM:
> + regTranslator.newScalarProxy(ir::ocl::workdim, dst); break;
> case GEN_OCL_COS:
> case GEN_OCL_SIN:
> case GEN_OCL_SQR:
> diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
> index 0524744..6cd7298 100644
> --- a/backend/src/llvm/llvm_gen_ocl_function.hxx
> +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
> @@ -16,6 +16,7 @@ DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_SIZE2, __gen_ocl_get_global_size2)
> DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_OFFSET0, __gen_ocl_get_global_offset0)
> DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_OFFSET1, __gen_ocl_get_global_offset1)
> DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_OFFSET2, __gen_ocl_get_global_offset2)
> +DECL_LLVM_GEN_FUNCTION(GET_WORK_DIM, __gen_ocl_get_work_dim)
>
> // Math function
> DECL_LLVM_GEN_FUNCTION(ABS, __gen_ocl_fabs)
> diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h
> index 613b844..144e78c 100644
> --- a/backend/src/ocl_stdlib.h
> +++ b/backend/src/ocl_stdlib.h
> @@ -346,8 +346,11 @@ DEC(16);
> /////////////////////////////////////////////////////////////////////////////
> // Work Items functions (see 6.11.1 of OCL 1.1 spec)
> /////////////////////////////////////////////////////////////////////////////
> -// TODO get_global_offset
> -// TODO get_work_dim
> +
> +PURE CONST uint __gen_ocl_get_work_dim(void);
> +INLINE uint get_work_dim(void) {
> + return __gen_ocl_get_work_dim();
> +}
>
> #define DECL_INTERNAL_WORK_ITEM_FN(NAME) \
> PURE CONST unsigned int __gen_ocl_##NAME##0(void); \
> @@ -357,6 +360,7 @@ DECL_INTERNAL_WORK_ITEM_FN(get_group_id)
> DECL_INTERNAL_WORK_ITEM_FN(get_local_id)
> DECL_INTERNAL_WORK_ITEM_FN(get_local_size)
> DECL_INTERNAL_WORK_ITEM_FN(get_global_size)
> +DECL_INTERNAL_WORK_ITEM_FN(get_global_offset)
> DECL_INTERNAL_WORK_ITEM_FN(get_num_groups)
> #undef DECL_INTERNAL_WORK_ITEM_FN
>
> @@ -371,6 +375,7 @@ DECL_PUBLIC_WORK_ITEM_FN(get_group_id)
> DECL_PUBLIC_WORK_ITEM_FN(get_local_id)
> DECL_PUBLIC_WORK_ITEM_FN(get_local_size)
> DECL_PUBLIC_WORK_ITEM_FN(get_global_size)
> +DECL_PUBLIC_WORK_ITEM_FN(get_global_offset)
> DECL_PUBLIC_WORK_ITEM_FN(get_num_groups)
> #undef DECL_PUBLIC_WORK_ITEM_FN
>
> diff --git a/src/cl_api.c b/src/cl_api.c
> index 9c5943b..5ef95b7 100644
> --- a/src/cl_api.c
> +++ b/src/cl_api.c
> @@ -1453,6 +1453,7 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
> /* Do device specific checks are enqueue the kernel */
> err = cl_command_queue_ND_range(command_queue,
> kernel,
> + work_dim,
> fixed_global_off,
> fixed_global_sz,
> fixed_local_sz);
> diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
> index 1a37c78..a3987d8 100644
> --- a/src/cl_command_queue.c
> +++ b/src/cl_command_queue.c
> @@ -356,7 +356,7 @@ error:
> }
> #endif
>
> -extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, const size_t *, const size_t *, const size_t *);
> +extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, uint32_t, const size_t *, const size_t *, const size_t *);
>
> static cl_int
> cl_kernel_check_args(cl_kernel k)
> @@ -371,6 +371,7 @@ cl_kernel_check_args(cl_kernel k)
> LOCAL cl_int
> cl_command_queue_ND_range(cl_command_queue queue,
> cl_kernel k,
> + const uint32_t work_dim,
> const size_t *global_wk_off,
> const size_t *global_wk_sz,
> const size_t *local_wk_sz)
> @@ -394,7 +395,7 @@ cl_command_queue_ND_range(cl_command_queue queue,
> #endif /* USE_FULSIM */
>
> if (ver == 7 || ver == 75)
> - TRY (cl_command_queue_ND_range_gen7, queue, k, global_wk_off, global_wk_sz, local_wk_sz);
> + TRY (cl_command_queue_ND_range_gen7, queue, k, work_dim, global_wk_off, global_wk_sz, local_wk_sz);
> else
> FATAL ("Unknown Gen Device");
>
> diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
> index f0c00f4..5a792a2 100644
> --- a/src/cl_command_queue.h
> +++ b/src/cl_command_queue.h
> @@ -54,6 +54,7 @@ extern void cl_command_queue_add_ref(cl_command_queue);
> /* Map ND range kernel from OCL API */
> extern cl_int cl_command_queue_ND_range(cl_command_queue queue,
> cl_kernel ker,
> + const uint32_t work_dim,
> const size_t *global_work_offset,
> const size_t *global_work_size,
> const size_t *local_work_size);
> diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
> index 770af4a..ea9b583 100644
> --- a/src/cl_command_queue_gen7.c
> +++ b/src/cl_command_queue_gen7.c
> @@ -98,6 +98,7 @@ error:
> /* Will return the total amount of slm used */
> static int32_t
> cl_curbe_fill(cl_kernel ker,
> + const uint32_t work_dim,
> const size_t *global_wk_off,
> const size_t *global_wk_sz,
> const size_t *local_wk_sz,
> @@ -120,6 +121,7 @@ cl_curbe_fill(cl_kernel ker,
> UPLOAD(GBE_CURBE_GROUP_NUM_Y, global_wk_sz[1]/local_wk_sz[1]);
> UPLOAD(GBE_CURBE_GROUP_NUM_Z, global_wk_sz[2]/local_wk_sz[2]);
> UPLOAD(GBE_CURBE_THREAD_NUM, thread_n);
> + UPLOAD(GBE_CURBE_WORK_DIM, work_dim);
> UPLOAD(GBE_CURBE_GLOBAL_CONSTANT_OFFSET, gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_GLOBAL_CONSTANT_DATA, 0) + 32);
> #undef UPLOAD
>
> @@ -185,6 +187,7 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
> LOCAL cl_int
> cl_command_queue_ND_range_gen7(cl_command_queue queue,
> cl_kernel ker,
> + const uint32_t work_dim,
> const size_t *global_wk_off,
> const size_t *global_wk_sz,
> const size_t *local_wk_sz)
> @@ -214,7 +217,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
>
> /* Curbe step 1: fill the constant buffer data shared by all threads */
> if (ker->curbe)
> - kernel.slm_sz = cl_curbe_fill(ker, global_wk_off, global_wk_sz, local_wk_sz, thread_n);
> + kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz, local_wk_sz, thread_n);
>
> /* Setup the kernel */
> cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32);
More information about the Beignet
mailing list