[Beignet] [PATCH] GBE: Add two builtin functions get_work_dim / get_global_offset.

Sun, Yi yi.sun at intel.com
Tue Jun 4 02:30:26 PDT 2013


Great, this patch works well based on latest code.
And I'll send the refine the test case for it later.

Thanks
  --Sun, Yi

On Fri, 2013-05-31 at 12:56 +0800, Zhigang Gong wrote:
> Signed-off-by: Zhigang Gong <zhigang.gong at linux.intel.com>
> ---
>  backend/src/backend/context.cpp            | 2 ++
>  backend/src/backend/gen_reg_allocation.cpp | 1 +
>  backend/src/backend/program.h              | 1 +
>  backend/src/ir/profile.cpp                 | 1 +
>  backend/src/ir/profile.hpp                 | 3 ++-
>  backend/src/llvm/llvm_gen_backend.cpp      | 2 ++
>  backend/src/llvm/llvm_gen_ocl_function.hxx | 1 +
>  backend/src/ocl_stdlib.h                   | 9 +++++++--
>  src/cl_api.c                               | 1 +
>  src/cl_command_queue.c                     | 5 +++--
>  src/cl_command_queue.h                     | 1 +
>  src/cl_command_queue_gen7.c                | 5 ++++-
>  12 files changed, 26 insertions(+), 6 deletions(-)
> 
> diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
> index af1f579..48160de 100644
> --- a/backend/src/backend/context.cpp
> +++ b/backend/src/backend/context.cpp
> @@ -430,6 +430,7 @@ namespace gbe
>          INSERT_REG(goffset0, GLOBAL_OFFSET_X, 1)
>          INSERT_REG(goffset1, GLOBAL_OFFSET_Y, 1)
>          INSERT_REG(goffset2, GLOBAL_OFFSET_Z, 1)
> +        INSERT_REG(workdim, WORK_DIM, 1)
>          INSERT_REG(numgroup0, GROUP_NUM_X, 1)
>          INSERT_REG(numgroup1, GROUP_NUM_Y, 1)
>          INSERT_REG(numgroup2, GROUP_NUM_Z, 1)
> @@ -621,6 +622,7 @@ namespace gbe
>          reg == ir::ocl::goffset0  ||
>          reg == ir::ocl::goffset1  ||
>          reg == ir::ocl::goffset2  ||
> +        reg == ir::ocl::workdim   ||
>          reg == ir::ocl::constoffst)
>        return true;
>      return false;
> diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
> index 8c9f358..469be12 100644
> --- a/backend/src/backend/gen_reg_allocation.cpp
> +++ b/backend/src/backend/gen_reg_allocation.cpp
> @@ -518,6 +518,7 @@ namespace gbe
>      allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_X, ocl::goffset0);
>      allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_Y, ocl::goffset1);
>      allocatePayloadReg(GBE_CURBE_GLOBAL_OFFSET_Z, ocl::goffset2);
> +    allocatePayloadReg(GBE_CURBE_WORK_DIM, ocl::workdim);
>      allocatePayloadReg(GBE_CURBE_GROUP_NUM_X, ocl::numgroup0);
>      allocatePayloadReg(GBE_CURBE_GROUP_NUM_Y, ocl::numgroup1);
>      allocatePayloadReg(GBE_CURBE_GROUP_NUM_Z, ocl::numgroup2);
> diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
> index f178f8b..f36bfbf 100644
> --- a/backend/src/backend/program.h
> +++ b/backend/src/backend/program.h
> @@ -69,6 +69,7 @@ enum gbe_curbe_type {
>    GBE_CURBE_GROUP_NUM_X,
>    GBE_CURBE_GROUP_NUM_Y,
>    GBE_CURBE_GROUP_NUM_Z,
> +  GBE_CURBE_WORK_DIM,
>    GBE_CURBE_GLOBAL_CONSTANT_OFFSET,
>    GBE_CURBE_GLOBAL_CONSTANT_DATA,
>    GBE_CURBE_IMAGE_INFO,
> diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
> index c1dc650..99cd06c 100644
> --- a/backend/src/ir/profile.cpp
> +++ b/backend/src/ir/profile.cpp
> @@ -76,6 +76,7 @@ namespace ir {
>        DECL_NEW_REG(FAMILY_DWORD, barrierid);
>        DECL_NEW_REG(FAMILY_DWORD, threadn);
>        DECL_NEW_REG(FAMILY_DWORD, constoffst);
> +      DECL_NEW_REG(FAMILY_DWORD, workdim);
>      }
>  #undef DECL_NEW_REG
>  
> diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
> index 32dd149..4b0ef5e 100644
> --- a/backend/src/ir/profile.hpp
> +++ b/backend/src/ir/profile.hpp
> @@ -64,7 +64,8 @@ namespace ir {
>      static const Register barrierid = Register(20);// barrierid
>      static const Register threadn = Register(21);  // number of threads
>      static const Register constoffst = Register(22); // offset of global constant array's curbe
> -    static const uint32_t regNum = 23;             // number of special registers
> +    static const Register workdim = Register(23);  // work dimention.
> +    static const uint32_t regNum = 24;             // number of special registers
>      extern const char *specialRegMean[];           // special register name.
>    } /* namespace ocl */
>  
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
> index db7d714..ad295a8 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -1645,6 +1645,8 @@ namespace gbe
>          regTranslator.newScalarProxy(ir::ocl::goffset1, dst); break;
>        case GEN_OCL_GET_GLOBAL_OFFSET2:
>          regTranslator.newScalarProxy(ir::ocl::goffset2, dst); break;
> +      case GEN_OCL_GET_WORK_DIM:
> +        regTranslator.newScalarProxy(ir::ocl::workdim, dst); break;
>        case GEN_OCL_COS:
>        case GEN_OCL_SIN:
>        case GEN_OCL_SQR:
> diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
> index 0524744..6cd7298 100644
> --- a/backend/src/llvm/llvm_gen_ocl_function.hxx
> +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
> @@ -16,6 +16,7 @@ DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_SIZE2, __gen_ocl_get_global_size2)
>  DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_OFFSET0, __gen_ocl_get_global_offset0)
>  DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_OFFSET1, __gen_ocl_get_global_offset1)
>  DECL_LLVM_GEN_FUNCTION(GET_GLOBAL_OFFSET2, __gen_ocl_get_global_offset2)
> +DECL_LLVM_GEN_FUNCTION(GET_WORK_DIM, __gen_ocl_get_work_dim)
>  
>  // Math function
>  DECL_LLVM_GEN_FUNCTION(ABS, __gen_ocl_fabs)
> diff --git a/backend/src/ocl_stdlib.h b/backend/src/ocl_stdlib.h
> index 613b844..144e78c 100644
> --- a/backend/src/ocl_stdlib.h
> +++ b/backend/src/ocl_stdlib.h
> @@ -346,8 +346,11 @@ DEC(16);
>  /////////////////////////////////////////////////////////////////////////////
>  // Work Items functions (see 6.11.1 of OCL 1.1 spec)
>  /////////////////////////////////////////////////////////////////////////////
> -// TODO get_global_offset
> -// TODO get_work_dim
> +
> +PURE CONST uint __gen_ocl_get_work_dim(void);
> +INLINE uint get_work_dim(void) {
> +  return __gen_ocl_get_work_dim(); 
> +}
>  
>  #define DECL_INTERNAL_WORK_ITEM_FN(NAME) \
>  PURE CONST unsigned int __gen_ocl_##NAME##0(void); \
> @@ -357,6 +360,7 @@ DECL_INTERNAL_WORK_ITEM_FN(get_group_id)
>  DECL_INTERNAL_WORK_ITEM_FN(get_local_id)
>  DECL_INTERNAL_WORK_ITEM_FN(get_local_size)
>  DECL_INTERNAL_WORK_ITEM_FN(get_global_size)
> +DECL_INTERNAL_WORK_ITEM_FN(get_global_offset)
>  DECL_INTERNAL_WORK_ITEM_FN(get_num_groups)
>  #undef DECL_INTERNAL_WORK_ITEM_FN
>  
> @@ -371,6 +375,7 @@ DECL_PUBLIC_WORK_ITEM_FN(get_group_id)
>  DECL_PUBLIC_WORK_ITEM_FN(get_local_id)
>  DECL_PUBLIC_WORK_ITEM_FN(get_local_size)
>  DECL_PUBLIC_WORK_ITEM_FN(get_global_size)
> +DECL_PUBLIC_WORK_ITEM_FN(get_global_offset)
>  DECL_PUBLIC_WORK_ITEM_FN(get_num_groups)
>  #undef DECL_PUBLIC_WORK_ITEM_FN
>  
> diff --git a/src/cl_api.c b/src/cl_api.c
> index 9c5943b..5ef95b7 100644
> --- a/src/cl_api.c
> +++ b/src/cl_api.c
> @@ -1453,6 +1453,7 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
>    /* Do device specific checks are enqueue the kernel */
>    err = cl_command_queue_ND_range(command_queue,
>                                    kernel,
> +                                  work_dim,
>                                    fixed_global_off,
>                                    fixed_global_sz,
>                                    fixed_local_sz);
> diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
> index 1a37c78..a3987d8 100644
> --- a/src/cl_command_queue.c
> +++ b/src/cl_command_queue.c
> @@ -356,7 +356,7 @@ error:
>  }
>  #endif
>  
> -extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, const size_t *, const size_t *, const size_t *);
> +extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, uint32_t, const size_t *, const size_t *, const size_t *);
>  
>  static cl_int
>  cl_kernel_check_args(cl_kernel k)
> @@ -371,6 +371,7 @@ cl_kernel_check_args(cl_kernel k)
>  LOCAL cl_int
>  cl_command_queue_ND_range(cl_command_queue queue,
>                            cl_kernel k,
> +                          const uint32_t work_dim,
>                            const size_t *global_wk_off,
>                            const size_t *global_wk_sz,
>                            const size_t *local_wk_sz)
> @@ -394,7 +395,7 @@ cl_command_queue_ND_range(cl_command_queue queue,
>  #endif /* USE_FULSIM */
>  
>    if (ver == 7 || ver == 75)
> -    TRY (cl_command_queue_ND_range_gen7, queue, k, global_wk_off, global_wk_sz, local_wk_sz);
> +    TRY (cl_command_queue_ND_range_gen7, queue, k, work_dim, global_wk_off, global_wk_sz, local_wk_sz);
>    else
>      FATAL ("Unknown Gen Device");
>  
> diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
> index f0c00f4..5a792a2 100644
> --- a/src/cl_command_queue.h
> +++ b/src/cl_command_queue.h
> @@ -54,6 +54,7 @@ extern void cl_command_queue_add_ref(cl_command_queue);
>  /* Map ND range kernel from OCL API */
>  extern cl_int cl_command_queue_ND_range(cl_command_queue queue,
>                                          cl_kernel ker,
> +                                        const uint32_t work_dim,
>                                          const size_t *global_work_offset,
>                                          const size_t *global_work_size,
>                                          const size_t *local_work_size);
> diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
> index 770af4a..ea9b583 100644
> --- a/src/cl_command_queue_gen7.c
> +++ b/src/cl_command_queue_gen7.c
> @@ -98,6 +98,7 @@ error:
>  /* Will return the total amount of slm used */
>  static int32_t
>  cl_curbe_fill(cl_kernel ker,
> +              const uint32_t work_dim,
>                const size_t *global_wk_off,
>                const size_t *global_wk_sz,
>                const size_t *local_wk_sz,
> @@ -120,6 +121,7 @@ cl_curbe_fill(cl_kernel ker,
>    UPLOAD(GBE_CURBE_GROUP_NUM_Y, global_wk_sz[1]/local_wk_sz[1]);
>    UPLOAD(GBE_CURBE_GROUP_NUM_Z, global_wk_sz[2]/local_wk_sz[2]);
>    UPLOAD(GBE_CURBE_THREAD_NUM, thread_n);
> +  UPLOAD(GBE_CURBE_WORK_DIM, work_dim); 
>    UPLOAD(GBE_CURBE_GLOBAL_CONSTANT_OFFSET, gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_GLOBAL_CONSTANT_DATA, 0) + 32);
>  #undef UPLOAD
>  
> @@ -185,6 +187,7 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
>  LOCAL cl_int
>  cl_command_queue_ND_range_gen7(cl_command_queue queue,
>                                 cl_kernel ker,
> +                               const uint32_t work_dim,
>                                 const size_t *global_wk_off,
>                                 const size_t *global_wk_sz,
>                                 const size_t *local_wk_sz)
> @@ -214,7 +217,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
>  
>    /* Curbe step 1: fill the constant buffer data shared by all threads */
>    if (ker->curbe)
> -    kernel.slm_sz = cl_curbe_fill(ker, global_wk_off, global_wk_sz, local_wk_sz, thread_n);
> +    kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz, local_wk_sz, thread_n);
>  
>    /* Setup the kernel */
>    cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32);



More information about the Beignet mailing list