[Beignet] [PATCH 1/2] Backend: Refine get_enqueued_local_size and get_local_size
Yang, Rong R
rong.r.yang at intel.com
Thu Mar 17 08:00:15 UTC 2016
LGTM, pushed.
> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Xiuli Pan
> Sent: Wednesday, March 16, 2016 7:53
> To: beignet at lists.freedesktop.org
> Cc: Pan, Xiuli <xiuli.pan at intel.com>
> Subject: [Beignet] [PATCH 1/2] Backend: Refine get_enqueued_local_size
> and get_local_size
>
> From: Pan Xiuli <xiuli.pan at intel.com>
>
> Use curbe register for these two size.
>
> Signed-off-by: Pan Xiuli <xiuli.pan at intel.com>
> ---
> backend/src/backend/gen_insn_selection.cpp | 7 +++--
> backend/src/backend/program.h | 3 ++
> backend/src/ir/profile.cpp | 4 +++
> backend/src/ir/profile.hpp | 47 ++++++++++++++++--------------
> backend/src/libocl/src/ocl_workitem.cl | 17 ++++-------
> backend/src/llvm/llvm_gen_backend.cpp | 6 ++++
> backend/src/llvm/llvm_gen_ocl_function.hxx | 3 ++
> src/cl_command_queue_gen7.c | 6 +++-
> 8 files changed, 57 insertions(+), 36 deletions(-)
>
> diff --git a/backend/src/backend/gen_insn_selection.cpp
> b/backend/src/backend/gen_insn_selection.cpp
> index 90e15c3..77614b6 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -3423,8 +3423,11 @@ namespace gbe
> reg == ir::ocl::lid1 ||
> reg == ir::ocl::lid2 ||
> reg == ir::ocl::lsize0 ||
> - reg == ir::ocl::lsize1||
> - reg == ir::ocl::lsize2)
> + reg == ir::ocl::lsize1 ||
> + reg == ir::ocl::lsize2 ||
> + reg == ir::ocl::enqlsize0 ||
> + reg == ir::ocl::enqlsize1 ||
> + reg == ir::ocl::enqlsize2)
> return true;
> else
> return false;
> diff --git a/backend/src/backend/program.h
> b/backend/src/backend/program.h index 4dd3ae3..a690e3d 100644
> --- a/backend/src/backend/program.h
> +++ b/backend/src/backend/program.h
> @@ -83,6 +83,9 @@ enum gbe_curbe_type {
> GBE_CURBE_LOCAL_SIZE_X,
> GBE_CURBE_LOCAL_SIZE_Y,
> GBE_CURBE_LOCAL_SIZE_Z,
> + GBE_CURBE_ENQUEUED_LOCAL_SIZE_X,
> + GBE_CURBE_ENQUEUED_LOCAL_SIZE_Y,
> + GBE_CURBE_ENQUEUED_LOCAL_SIZE_Z,
> GBE_CURBE_GLOBAL_SIZE_X,
> GBE_CURBE_GLOBAL_SIZE_Y,
> GBE_CURBE_GLOBAL_SIZE_Z,
> diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp index
> ce5e8e7..0907d76 100644
> --- a/backend/src/ir/profile.cpp
> +++ b/backend/src/ir/profile.cpp
> @@ -35,6 +35,7 @@ namespace ir {
> "group_id_0", "group_id_1", "group_id_2",
> "num_groups_0", "num_groups_1", "num_groups_2",
> "local_size_0", "local_size_1", "local_size_2",
> + "enqueued_local_size_0", "enqueued_local_size_1",
> + "enqueued_local_size_2",
> "global_size_0", "global_size_1", "global_size_2",
> "global_offset_0", "global_offset_1", "global_offset_2",
> "stack_pointer", "stack_buffer", @@ -71,6 +72,9 @@ namespace ir {
> DECL_NEW_REG(FAMILY_DWORD, lsize0, 1, GBE_CURBE_LOCAL_SIZE_X);
> DECL_NEW_REG(FAMILY_DWORD, lsize1, 1, GBE_CURBE_LOCAL_SIZE_Y);
> DECL_NEW_REG(FAMILY_DWORD, lsize2, 1, GBE_CURBE_LOCAL_SIZE_Z);
> + DECL_NEW_REG(FAMILY_DWORD, enqlsize0, 1,
> GBE_CURBE_ENQUEUED_LOCAL_SIZE_X);
> + DECL_NEW_REG(FAMILY_DWORD, enqlsize1, 1,
> GBE_CURBE_ENQUEUED_LOCAL_SIZE_Y);
> + DECL_NEW_REG(FAMILY_DWORD, enqlsize2, 1,
> + GBE_CURBE_ENQUEUED_LOCAL_SIZE_Z);
> DECL_NEW_REG(FAMILY_DWORD, gsize0, 1,
> GBE_CURBE_GLOBAL_SIZE_X);
> DECL_NEW_REG(FAMILY_DWORD, gsize1, 1,
> GBE_CURBE_GLOBAL_SIZE_Y);
> DECL_NEW_REG(FAMILY_DWORD, gsize2, 1,
> GBE_CURBE_GLOBAL_SIZE_Z); diff --git a/backend/src/ir/profile.hpp
> b/backend/src/ir/profile.hpp index 35b49e3..b093adf 100644
> --- a/backend/src/ir/profile.hpp
> +++ b/backend/src/ir/profile.hpp
> @@ -53,28 +53,31 @@ namespace ir {
> static const Register lsize0 = Register(9); // get_local_size(0)
> static const Register lsize1 = Register(10); // get_local_size(1)
> static const Register lsize2 = Register(11); // get_local_size(2)
> - static const Register gsize0 = Register(12); // get_global_size(0)
> - static const Register gsize1 = Register(13); // get_global_size(1)
> - static const Register gsize2 = Register(14); // get_global_size(2)
> - static const Register goffset0 = Register(15); // get_global_offset(0)
> - static const Register goffset1 = Register(16); // get_global_offset(1)
> - static const Register goffset2 = Register(17); // get_global_offset(2)
> - static const Register stackptr = Register(18); // stack pointer
> - static const Register stackbuffer = Register(19); // stack buffer base
> address.
> - static const Register blockip = Register(20); // blockip
> - static const Register barrierid = Register(21);// barrierid
> - static const Register threadn = Register(22); // number of threads
> - static const Register workdim = Register(23); // work dimention.
> - static const Register zero = Register(24); // scalar register holds zero.
> - static const Register one = Register(25); // scalar register holds one.
> - static const Register retVal = Register(26); // helper register to do data
> flow analysis.
> - static const Register printfbptr = Register(27); // printf buffer address .
> - static const Register printfiptr = Register(28); // printf index buffer address.
> - static const Register dwblockip = Register(29); // blockip
> - static const Register threadid = Register(30); // the thread id of this thread.
> - static const Register constant_addrspace = Register(31); // starting
> address of program-scope constant
> - static const Register stacksize = Register(32); // stack buffer total size
> - static const uint32_t regNum = 33; // number of special registers
> + static const Register enqlsize0 = Register(12); // get_local_size(0)
> + static const Register enqlsize1 = Register(13); // get_local_size(1)
> + static const Register enqlsize2 = Register(14); // get_local_size(2)
> + static const Register gsize0 = Register(15); // get_global_size(0)
> + static const Register gsize1 = Register(16); // get_global_size(1)
> + static const Register gsize2 = Register(17); // get_global_size(2)
> + static const Register goffset0 = Register(18); // get_global_offset(0)
> + static const Register goffset1 = Register(19); // get_global_offset(1)
> + static const Register goffset2 = Register(20); // get_global_offset(2)
> + static const Register stackptr = Register(21); // stack pointer
> + static const Register stackbuffer = Register(22); // stack buffer base
> address.
> + static const Register blockip = Register(23); // blockip
> + static const Register barrierid = Register(24);// barrierid
> + static const Register threadn = Register(25); // number of threads
> + static const Register workdim = Register(26); // work dimention.
> + static const Register zero = Register(27); // scalar register holds zero.
> + static const Register one = Register(28); // scalar register holds one.
> + static const Register retVal = Register(29); // helper register to do data
> flow analysis.
> + static const Register printfbptr = Register(30); // printf buffer address .
> + static const Register printfiptr = Register(31); // printf index buffer
> address.
> + static const Register dwblockip = Register(32); // blockip
> + static const Register threadid = Register(33); // the thread id of this thread.
> + static const Register constant_addrspace = Register(34); // starting
> address of program-scope constant
> + static const Register stacksize = Register(35); // stack buffer total size
> + static const uint32_t regNum = 36; // number of special registers
> extern const char *specialRegMean[]; // special register name.
> } /* namespace ocl */
>
> diff --git a/backend/src/libocl/src/ocl_workitem.cl
> b/backend/src/libocl/src/ocl_workitem.cl
> index 235f12b..dc8fa6d 100644
> --- a/backend/src/libocl/src/ocl_workitem.cl
> +++ b/backend/src/libocl/src/ocl_workitem.cl
> @@ -30,6 +30,7 @@ PURE CONST unsigned int
> __gen_ocl_##NAME##1(void); \ PURE CONST unsigned int
> __gen_ocl_##NAME##2(void);
> DECL_INTERNAL_WORK_ITEM_FN(get_group_id)
> DECL_INTERNAL_WORK_ITEM_FN(get_local_id)
> +DECL_INTERNAL_WORK_ITEM_FN(get_enqueued_local_size)
> DECL_INTERNAL_WORK_ITEM_FN(get_local_size)
> DECL_INTERNAL_WORK_ITEM_FN(get_global_size)
> DECL_INTERNAL_WORK_ITEM_FN(get_global_offset)
> @@ -46,6 +47,7 @@ OVERLOADABLE unsigned NAME(unsigned int dim)
> { \
>
> DECL_PUBLIC_WORK_ITEM_FN(get_group_id, 0)
> DECL_PUBLIC_WORK_ITEM_FN(get_local_id, 0)
> +DECL_PUBLIC_WORK_ITEM_FN(get_enqueued_local_size, 1)
> DECL_PUBLIC_WORK_ITEM_FN(get_local_size, 1)
> DECL_PUBLIC_WORK_ITEM_FN(get_global_size, 1)
> DECL_PUBLIC_WORK_ITEM_FN(get_global_offset, 0) @@ -53,14 +55,7 @@
> DECL_PUBLIC_WORK_ITEM_FN(get_num_groups, 1) #undef
> DECL_PUBLIC_WORK_ITEM_FN
>
> OVERLOADABLE uint get_global_id(uint dim) {
> - return get_local_id(dim) + get_local_size(dim) * get_group_id(dim) +
> get_global_offset(dim); -}
> -
> -OVERLOADABLE uint get_enqueued_local_size (uint dimindx) -{
> - //TODO: should be different with get_local_size when support
> - //non-uniform work-group size
> - return get_local_size(dimindx);
> + return get_local_id(dim) + get_enqueued_local_size(dim) *
> + get_group_id(dim) + get_global_offset(dim);
> }
>
> OVERLOADABLE uint get_global_linear_id(void) @@ -80,8 +75,8 @@
> OVERLOADABLE uint get_local_linear_id(void) {
> uint dim = __gen_ocl_get_work_dim();
> if (dim == 1) return get_local_id(0);
> - else if (dim == 2) return get_local_id(1) * get_local_size (0) +
> get_local_id(0);
> - else if (dim == 3) return (get_local_id(2) * get_local_size(1) *
> get_local_size(0)) +
> - (get_local_id(1) * get_local_size(0)) + get_local_id(0);
> + else if (dim == 2) return get_local_id(1) *
> + get_enqueued_local_size(0) + get_local_id(0); else if (dim == 3) return
> (get_local_id(2) * get_enqueued_local_size(1) * get_local_size(0)) +
> + (get_local_id(1) *
> + get_enqueued_local_size(0)) + get_local_id(0);
> else return 0;
> }
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp
> b/backend/src/llvm/llvm_gen_backend.cpp
> index cca1781..4a98678 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -3507,6 +3507,12 @@ namespace gbe
> regTranslator.newScalarProxy(ir::ocl::lsize1, dst); break;
> case GEN_OCL_GET_LOCAL_SIZE2:
> regTranslator.newScalarProxy(ir::ocl::lsize2, dst); break;
> + case GEN_OCL_GET_ENQUEUED_LOCAL_SIZE0:
> + regTranslator.newScalarProxy(ir::ocl::enqlsize0, dst); break;
> + case GEN_OCL_GET_ENQUEUED_LOCAL_SIZE1:
> + regTranslator.newScalarProxy(ir::ocl::enqlsize1, dst); break;
> + case GEN_OCL_GET_ENQUEUED_LOCAL_SIZE2:
> + regTranslator.newScalarProxy(ir::ocl::enqlsize2, dst); break;
> case GEN_OCL_GET_GLOBAL_SIZE0:
> regTranslator.newScalarProxy(ir::ocl::gsize0, dst); break;
> case GEN_OCL_GET_GLOBAL_SIZE1:
> diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx
> b/backend/src/llvm/llvm_gen_ocl_function.hxx
> index 7bd59fc..09feb1a 100644
> --- a/backend/src/llvm/llvm_gen_ocl_function.hxx
> +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
> @@ -10,6 +10,9 @@ DECL_LLVM_GEN_FUNCTION(GET_NUM_GROUPS2,
> __gen_ocl_get_num_groups2)
> DECL_LLVM_GEN_FUNCTION(GET_LOCAL_SIZE0, __gen_ocl_get_local_size0)
> DECL_LLVM_GEN_FUNCTION(GET_LOCAL_SIZE1, __gen_ocl_get_local_size1)
> DECL_LLVM_GEN_FUNCTION(GET_LOCAL_SIZE2, __gen_ocl_get_local_size2)
> +DECL_LLVM_GEN_FUNCTION(GET_ENQUEUED_LOCAL_SIZE0,
> +__gen_ocl_get_enqueued_local_size0)
> +DECL_LLVM_GEN_FUNCTION(GET_ENQUEUED_LOCAL_SIZE1,
> +__gen_ocl_get_enqueued_local_size1)
> +DECL_LLVM_GEN_FUNCTION(GET_ENQUEUED_LOCAL_SIZE2,
> +__gen_ocl_get_enqueued_local_size2)
> DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_SIZE0,
> __gen_ocl_get_global_size0)
> DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_SIZE1,
> __gen_ocl_get_global_size1)
> DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_SIZE2,
> __gen_ocl_get_global_size2) diff --git a/src/cl_command_queue_gen7.c
> b/src/cl_command_queue_gen7.c index 38cf56b..6bfacbf 100644
> --- a/src/cl_command_queue_gen7.c
> +++ b/src/cl_command_queue_gen7.c
> @@ -221,6 +221,7 @@ cl_curbe_fill(cl_kernel ker,
> const size_t *global_wk_off,
> const size_t *global_wk_sz,
> const size_t *local_wk_sz,
> + const size_t *enqueued_local_wk_sz,
> size_t thread_n)
> {
> int32_t offset;
> @@ -230,6 +231,9 @@ cl_curbe_fill(cl_kernel ker,
> UPLOAD(GBE_CURBE_LOCAL_SIZE_X, local_wk_sz[0]);
> UPLOAD(GBE_CURBE_LOCAL_SIZE_Y, local_wk_sz[1]);
> UPLOAD(GBE_CURBE_LOCAL_SIZE_Z, local_wk_sz[2]);
> + UPLOAD(GBE_CURBE_ENQUEUED_LOCAL_SIZE_X,
> enqueued_local_wk_sz[0]);
> + UPLOAD(GBE_CURBE_ENQUEUED_LOCAL_SIZE_Y,
> enqueued_local_wk_sz[1]);
> + UPLOAD(GBE_CURBE_ENQUEUED_LOCAL_SIZE_Z,
> enqueued_local_wk_sz[2]);
> UPLOAD(GBE_CURBE_GLOBAL_SIZE_X, global_wk_sz[0]);
> UPLOAD(GBE_CURBE_GLOBAL_SIZE_Y, global_wk_sz[1]);
> UPLOAD(GBE_CURBE_GLOBAL_SIZE_Z, global_wk_sz[2]); @@ -374,7
> +378,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue
> queue,
> }
> /* Curbe step 1: fill the constant urb buffer data shared by all threads */
> if (ker->curbe) {
> - kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz,
> local_wk_sz, thread_n);
> + kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off,
> + global_wk_sz,local_wk_sz ,local_wk_sz, thread_n);
> if (kernel.slm_sz > ker->program->ctx->device->local_mem_size) {
> fprintf(stderr, "Beignet: Out of shared local memory %d.\n",
> kernel.slm_sz);
> return CL_OUT_OF_RESOURCES;
> --
> 2.5.0
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet
mailing list