[Beignet] [PATCH] Remove global offset need divide by local size restriction.

Wed Sep 25 18:08:40 PDT 2013

This is a workaround. And I'm afraid it may don't work as you expected.
For example, a 1D task has 1024 work item. And the offset is 512.
We set the local group size to 16. As you hard coded the work offset to
0 for the GPGPU walker, so the gpgpu walker will dispatch 1024/16 = 64
work group/threads to do the job.
The first 32 threads will work as you want. But the last 32 threads will
do bad thing. Right? Any thoughts?

On Mon, Sep 23, 2013 at 02:04:08PM +0800, Yang Rong wrote:
> Set to global offset to 0 in walker, and add global offset when get_global_id.
> 
> Signed-off-by: Yang Rong <rong.r.yang at intel.com>
> ---
>  backend/src/ocl_stdlib.tmpl.h | 2 +-
>  src/cl_api.c                  | 7 -------
>  src/intel/intel_gpgpu.c       | 6 +++---
>  3 files changed, 4 insertions(+), 11 deletions(-)
> 
> diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
> index 9b76ba1..4921ee4 100644
> --- a/backend/src/ocl_stdlib.tmpl.h
> +++ b/backend/src/ocl_stdlib.tmpl.h
> @@ -588,7 +588,7 @@ DECL_PUBLIC_WORK_ITEM_FN(get_num_groups, 1)
>  #undef DECL_PUBLIC_WORK_ITEM_FN
>  
>  INLINE uint get_global_id(uint dim) {
> -  return get_local_id(dim) + get_local_size(dim) * get_group_id(dim);
> +  return get_local_id(dim) + get_local_size(dim) * get_group_id(dim) + get_global_offset(dim);
>  }
>  
>  /////////////////////////////////////////////////////////////////////////////
> diff --git a/src/cl_api.c b/src/cl_api.c
> index b60160b..c19b80a 100644
> --- a/src/cl_api.c
> +++ b/src/cl_api.c
> @@ -2264,19 +2264,12 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
>      goto error;
>    }
>  
> -  /* Check offset values. We add a non standard restriction. The offsets must
> -   * also be evenly divided by the local sizes
> -   */
>    if (global_work_offset != NULL)
>      for (i = 0; i < work_dim; ++i) {
>        if (UNLIKELY(~0LL - global_work_offset[i] > global_work_size[i])) {
>          err = CL_INVALID_GLOBAL_OFFSET;
>          goto error;
>        }
> -      if (UNLIKELY(local_work_size != NULL && global_work_offset[i] % local_work_size[i])) {
> -        err = CL_INVALID_GLOBAL_OFFSET;
> -        goto error;
> -      }
>      }
>  
>    /* Local sizes must be non-null and divide global sizes */
> diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
> index 7b82b76..44f44ef 100644
> --- a/src/intel/intel_gpgpu.c
> +++ b/src/intel/intel_gpgpu.c
> @@ -886,11 +886,11 @@ intel_gpgpu_walker(intel_gpgpu_t *gpgpu,
>      OUT_BATCH(gpgpu->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */
>    else
>      OUT_BATCH(gpgpu->batch, (0 << 30) | (thread_n-1)); /* SIMD8  | thread max */
> -  OUT_BATCH(gpgpu->batch, global_wk_off[0]);
> +  OUT_BATCH(gpgpu->batch, 0);
>    OUT_BATCH(gpgpu->batch, global_wk_dim[0]);
> -  OUT_BATCH(gpgpu->batch, global_wk_off[1]);
> +  OUT_BATCH(gpgpu->batch, 0);
>    OUT_BATCH(gpgpu->batch, global_wk_dim[1]);
> -  OUT_BATCH(gpgpu->batch, global_wk_off[2]);
> +  OUT_BATCH(gpgpu->batch, 0);
>    OUT_BATCH(gpgpu->batch, global_wk_dim[2]);
>    OUT_BATCH(gpgpu->batch, right_mask);
>    OUT_BATCH(gpgpu->batch, ~0x0);                     /* we always set height as 1, so set bottom mask as all 1*/
> -- 
> 1.8.1.2
> 
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet