[Beignet] [PATCH 2/3] [opencl-1.2] Implement the clEnqueueFillBuffer API.

Mon Apr 28 22:57:38 PDT 2014

Some minor comments as below:

On Wed, Apr 23, 2014 at 04:35:25PM +0800, junyan.he at inbox.com wrote:
> From: Junyan He <junyan.he at linux.intel.com>
> 
> We use the floatn's assigment to do the copy.
> 128 pattern size is according to double16, and because
> the double problem on our platform, we use to float16
> to handle this.
> unaligned cases is not optimized now, just use the char
> assigment.
> 
> Signed-off-by: Junyan He <junyan.he at linux.intel.com>
> ---
>  src/cl_api.c     |  78 ++++++++++++++++++++++++++++++++
>  src/cl_context.c | 133 ++++++++++++++++++++++++++++++++++++++++++-------------
>  src/cl_context.h |   8 ++++
>  src/cl_enqueue.c |   1 +
>  src/cl_enqueue.h |   1 +
>  src/cl_event.c   |   1 +
>  src/cl_mem.c     | 102 ++++++++++++++++++++++++++++++++++++++++++
>  src/cl_mem.h     |   3 ++
>  8 files changed, 295 insertions(+), 32 deletions(-)
> 
> diff --git a/src/cl_api.c b/src/cl_api.c
> index 1543ff4..be94bcb 100644
> --- a/src/cl_api.c
> +++ b/src/cl_api.c
> @@ -1592,6 +1592,84 @@ error:
>  }
>  
>  cl_int
> +clEnqueueFillBuffer(cl_command_queue   command_queue,
> +                    cl_mem             buffer,
> +                    const void *       pattern,
> +                    size_t             pattern_size,
> +                    size_t             offset,
> +                    size_t             size,
> +                    cl_uint            num_events_in_wait_list,
> +                    const cl_event *   event_wait_list,
> +                    cl_event *         event)
> +{
> +  cl_int err = CL_SUCCESS;
> +  enqueue_data *data, no_wait_data = { 0 };
> +  static size_t valid_sz[] = {1, 2, 4, 8, 16, 32, 64, 128};
> +  int i = 0;
> +
> +  CHECK_QUEUE(command_queue);
> +  CHECK_MEM(buffer);
> +
> +  if (command_queue->ctx != buffer->ctx) {
> +    err = CL_INVALID_CONTEXT;
> +    goto error;
> +  }
> +
> +  if (offset < 0 || offset + size > buffer->size) {
> +    err = CL_INVALID_VALUE;
> +    goto error;
> +  }
> +
> +  if (pattern == NULL) {
> +    err = CL_INVALID_VALUE;
> +    goto error;
> +  }
> +
> +  for (i = 0; i < sizeof(valid_sz)/sizeof(size_t); i++) {
coding style issue, we'd better to use "sizeof(valid_sz) / sizeof(size_t)"
rather than the above compact style. I noticed you mixed two styles in the
same patch, please fix it in the new version.

> +    if (valid_sz[i] == pattern_size)
> +      break;
> +  }
> +  if (i == sizeof(valid_sz)/sizeof(size_t)) {
> +    err = CL_INVALID_VALUE;
> +    goto error;
> +  }
> +
> +  if (offset%pattern_size || size%pattern_size) {
> +    err = CL_INVALID_VALUE;
> +    goto error;
> +  }
> +
> +  err = cl_mem_fill(command_queue, pattern, pattern_size, buffer, offset, size);
> +  if (err) {
> +    goto error;
> +  }
> +
> +  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
> +
> +  data = &no_wait_data;
> +  data->type = EnqueueFillBuffer;
> +  data->queue = command_queue;
> +
> +  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
> +                   event, data, CL_COMMAND_FILL_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
> +    if (event && (*event)->type != CL_COMMAND_USER
> +        && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
> +      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
> +    }
> +
> +    err = cl_command_queue_flush(command_queue);
> +  }
> +
> +  if(b_output_kernel_perf)
> +    time_end(command_queue->ctx, "beignet internal kernel : cl_fill_buffer", command_queue);
> +
> +  return 0;
> +
> + error:
> +  return err;
> +}
> +
> +cl_int
>  clEnqueueCopyBuffer(cl_command_queue     command_queue,
>                      cl_mem               src_buffer,
>                      cl_mem               dst_buffer,
> diff --git a/src/cl_context.c b/src/cl_context.c
> index 8190e6a..e2dba65 100644
> --- a/src/cl_context.c
> +++ b/src/cl_context.c
> @@ -1,4 +1,4 @@
> -/* 
> +/*
>   * Copyright © 2012 Intel Corporation
>   *
>   * This library is free software; you can redistribute it and/or
> @@ -188,6 +188,7 @@ error:
>  LOCAL void
>  cl_context_delete(cl_context ctx)
>  {
> +  int i = 0;
>    if (UNLIKELY(ctx == NULL))
>      return;
>  
> @@ -195,6 +196,18 @@ cl_context_delete(cl_context ctx)
>    if (atomic_dec(&ctx->ref_n) > 1)
>      return;
>  
> +  /* delete the internal programs. */
> +  for (i = CL_ENQUEUE_COPY_BUFFER_ALIGN4; i < CL_INTERNAL_KERNEL_MAX; i++) {
Use i = 0 here may be better or define CL_INTERNAL_KERNEL_MIN to 0 and use that macro
instead of using a specific enum number.
> +    if (ctx->internel_kernels[i]) {
> +      cl_kernel_delete(ctx->internel_kernels[i]);
> +      ctx->internel_kernels[i] = NULL;
> +
> +      assert(ctx->internal_prgs[i]);
> +      cl_program_delete(ctx->internal_prgs[i]);
> +      ctx->internal_prgs[i] = NULL;
> +    }
> +  }
> +
>    /* All object lists should have been freed. Otherwise, the reference counter
>     * of the context cannot be 0
>     */
> @@ -252,47 +265,103 @@ cl_context_get_static_kernel(cl_context ctx, cl_int index, const char * str_kern
>  {
>    cl_int ret;
>    if (!ctx->internal_prgs[index])
> -  {
> -    size_t length = strlen(str_kernel) + 1;
> -    ctx->internal_prgs[index] = cl_program_create_from_source(ctx, 1, &str_kernel, &length, NULL);
> -
> -    if (!ctx->internal_prgs[index])
> -      return NULL;
> -
> -    ret = cl_program_build(ctx->internal_prgs[index], str_option);
> -    if (ret != CL_SUCCESS)
> -      return NULL;
> -
> -    ctx->internal_prgs[index]->is_built = 1;
> -
> -    ctx->internel_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
> -  }
> +    {
> +      size_t length = strlen(str_kernel) + 1;
> +      ctx->internal_prgs[index] = cl_program_create_from_source(ctx, 1, &str_kernel, &length, NULL);
> +
> +      if (!ctx->internal_prgs[index])
> +        return NULL;
> +
> +      ret = cl_program_build(ctx->internal_prgs[index], str_option);
> +      if (ret != CL_SUCCESS)
> +        return NULL;
> +
> +      ctx->internal_prgs[index]->is_built = 1;
> +
> +      /* All CL_ENQUEUE_FILL_BUFFER_ALIGN16_xxx use the same program, different kernel. */
> +      if (index >= CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 && index <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
> +        int i = CL_ENQUEUE_FILL_BUFFER_ALIGN8_8;
> +        for (; i <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64; i++) {
> +          if (index != i) {
> +            assert(ctx->internal_prgs[i] == NULL);
> +            assert(ctx->internel_kernels[i] == NULL);
> +            cl_program_add_ref(ctx->internal_prgs[index]);
> +            ctx->internal_prgs[i] = ctx->internal_prgs[index];
> +          }
> +
> +          if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_8) {
> +            ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
> +                                                                "__cl_fill_region_align8_2", NULL);
> +          } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_16) {
> +            ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
> +                                                                "__cl_fill_region_align8_4", NULL);
> +          } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_32) {
> +            ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
> +                                                                "__cl_fill_region_align8_8", NULL);
> +          } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
> +            ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
> +                                                                "__cl_fill_region_align8_16", NULL);
> +          } else
> +            assert(0);
> +        }
> +      } else {
> +        ctx->internel_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
> +      }
> +    }
>  
>    return ctx->internel_kernels[index];
>  }
>  
>  cl_kernel
>  cl_context_get_static_kernel_form_bin(cl_context ctx, cl_int index,
> -                  const char * str_kernel, size_t size, const char * str_option)
> +                                      const char * str_kernel, size_t size, const char * str_option)
>  {
>    cl_int ret;
>    cl_int binary_status = CL_SUCCESS;
>    if (!ctx->internal_prgs[index])
> -  {
> -    ctx->internal_prgs[index] = cl_program_create_from_binary(ctx, 1, &ctx->device,
> -      &size, (const unsigned char **)&str_kernel, &binary_status, &ret);
> -
> -    if (!ctx->internal_prgs[index])
> -      return NULL;
> -
> -    ret = cl_program_build(ctx->internal_prgs[index], str_option);
> -    if (ret != CL_SUCCESS)
> -      return NULL;
> -
> -    ctx->internal_prgs[index]->is_built = 1;
> -
> -    ctx->internel_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
> -  }
> +    {
> +      ctx->internal_prgs[index] = cl_program_create_from_binary(ctx, 1, &ctx->device,
> +                                                                &size, (const unsigned char **)&str_kernel, &binary_status, &ret);
> +
> +      if (!ctx->internal_prgs[index])
> +        return NULL;
> +
> +      ret = cl_program_build(ctx->internal_prgs[index], str_option);
> +      if (ret != CL_SUCCESS)
> +        return NULL;
> +
> +      ctx->internal_prgs[index]->is_built = 1;
> +
> +      /* All CL_ENQUEUE_FILL_BUFFER_ALIGN16_xxx use the same program, different kernel. */
> +      if (index >= CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 && index <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
> +        int i = CL_ENQUEUE_FILL_BUFFER_ALIGN8_8;
> +        for (; i <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64; i++) {
> +          if (index != i) {
> +            assert(ctx->internal_prgs[i] == NULL);
> +            assert(ctx->internel_kernels[i] == NULL);
> +            cl_program_add_ref(ctx->internal_prgs[index]);
> +            ctx->internal_prgs[i] = ctx->internal_prgs[index];
> +          }
> +
> +          if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_8) {
> +            ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
> +                                                                "__cl_fill_region_align8_2", NULL);
> +          } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_16) {
> +            ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
> +                                                                "__cl_fill_region_align8_4", NULL);
> +          } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_32) {
> +            ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
> +                                                                "__cl_fill_region_align8_8", NULL);
> +          } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
> +            ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
> +                                                                "__cl_fill_region_align8_16", NULL);
> +          } else
> +            assert(0);
> +        }
> +      } else {
> +        ctx->internel_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
> +      }
> +    }
>  
>    return ctx->internel_kernels[index];
>  }
> diff --git a/src/cl_context.h b/src/cl_context.h
> index 782a9af..6eb6c9e 100644
> --- a/src/cl_context.h
> +++ b/src/cl_context.h
> @@ -54,6 +54,14 @@ enum _cl_internal_ker_type {
>    CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_1,   //copy image 3d tobuffer
>    CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_0,   //copy buffer to image 2d
>    CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_1,   //copy buffer to image 3d
> +  CL_ENQUEUE_FILL_BUFFER_UNALIGN,      //fill buffer with 1 aligne pattern, pattern size=1
> +  CL_ENQUEUE_FILL_BUFFER_ALIGN2,       //fill buffer with 2 aligne pattern, pattern size=2
> +  CL_ENQUEUE_FILL_BUFFER_ALIGN4,       //fill buffer with 4 aligne pattern, pattern size=4
> +  CL_ENQUEUE_FILL_BUFFER_ALIGN8_8,     //fill buffer with 8 aligne pattern, pattern size=8
> +  CL_ENQUEUE_FILL_BUFFER_ALIGN8_16,    //fill buffer with 16 aligne pattern, pattern size=16
> +  CL_ENQUEUE_FILL_BUFFER_ALIGN8_32,    //fill buffer with 16 aligne pattern, pattern size=32
> +  CL_ENQUEUE_FILL_BUFFER_ALIGN8_64,    //fill buffer with 16 aligne pattern, pattern size=64
> +  CL_ENQUEUE_FILL_BUFFER_ALIGN128,     //fill buffer with 128 aligne pattern, pattern size=128
>    CL_INTERNAL_KERNEL_MAX
>  };
>  
> diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
> index 330d230..d44fa66 100644
> --- a/src/cl_enqueue.c
> +++ b/src/cl_enqueue.c
> @@ -412,6 +412,7 @@ cl_int cl_enqueue_handle(cl_event event, enqueue_data* data)
>      case EnqueueCopyBufferToImage:
>      case EnqueueCopyImageToBuffer:
>      case EnqueueNDRangeKernel:
> +    case EnqueueFillBuffer:
>        cl_gpgpu_event_resume((cl_gpgpu_event)data->ptr);
>        return CL_SUCCESS;
>      case EnqueueNativeKernel:
> diff --git a/src/cl_enqueue.h b/src/cl_enqueue.h
> index 1d3ae5f..cb612eb 100644
> --- a/src/cl_enqueue.h
> +++ b/src/cl_enqueue.h
> @@ -41,6 +41,7 @@ typedef enum {
>    EnqueueNDRangeKernel,
>    EnqueueNativeKernel,
>    EnqueueMarker,
> +  EnqueueFillBuffer,
>    EnqueueInvalid
>  } enqueue_type;
>  
> diff --git a/src/cl_event.c b/src/cl_event.c
> index 727ee1f..ae5bfaf 100644
> --- a/src/cl_event.c
> +++ b/src/cl_event.c
> @@ -33,6 +33,7 @@ cl_event_is_gpu_command_type(cl_command_type type)
>  {
>    switch(type) {
>      case CL_COMMAND_COPY_BUFFER:
> +    case CL_COMMAND_FILL_BUFFER:
>      case CL_COMMAND_COPY_IMAGE:
>      case CL_COMMAND_COPY_IMAGE_TO_BUFFER:
>      case CL_COMMAND_COPY_BUFFER_TO_IMAGE:
> diff --git a/src/cl_mem.c b/src/cl_mem.c
> index 44482f7..0f7dc23 100644
> --- a/src/cl_mem.c
> +++ b/src/cl_mem.c
> @@ -902,6 +902,108 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
>  }
>  
>  LOCAL cl_int
> +cl_mem_fill(cl_command_queue queue, const void * pattern, size_t pattern_size,
> +            cl_mem buffer, size_t offset, size_t size)
> +{
> +  cl_int ret = CL_SUCCESS;
> +  cl_kernel ker = NULL;
> +  size_t global_off[] = {0,0,0};
> +  size_t global_sz[] = {1,1,1};
> +  size_t local_sz[] = {1,1,1};
> +  char pattern_comb[4];
> +  int is_128 = 0;
> +  const void * pattern1 = NULL;
> +
> +  assert(offset%pattern_size == 0);
> +  assert(size%pattern_size == 0);
> +
> +  if (!size)
> +    return ret;
> +
> +  if (pattern_size == 128) {
> +    /* 128 is according to pattern of double16, but double works not very
> +       well on some platform. We use two float16 to handle this. */
> +    extern char cl_internal_fill_buf_align128_str[];
> +    extern int cl_internal_fill_buf_align128_str_size;
> +
> +    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN128,
> +               cl_internal_fill_buf_align128_str, (size_t)cl_internal_fill_buf_align128_str_size, NULL);
> +    is_128 = 1;
> +    pattern_size = pattern_size/2;
> +    pattern1 = pattern + pattern_size;
> +    size = size/2;
> +  } else if (pattern_size % 8 == 0) { /* Handle the 8 16 32 64 cases here. */
> +    extern char cl_internal_fill_buf_align8_str[];
> +    extern int cl_internal_fill_buf_align8_str_size;
> +    int order = ffs(pattern_size/8) - 1;
> +
> +    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 + order,
> +               cl_internal_fill_buf_align8_str, (size_t)cl_internal_fill_buf_align8_str_size, NULL);
> +  } else if (pattern_size == 4) { /* Handle the 8 16 32 64 cases here. */
                                     incorrect comments here?
> +    extern char cl_internal_fill_buf_align4_str[];
> +    extern int cl_internal_fill_buf_align4_str_size;
> +
> +    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN4,
> +               cl_internal_fill_buf_align4_str, (size_t)cl_internal_fill_buf_align4_str_size, NULL);
Is it a typo for the function name? cl_context_get_static_kernel_from_bin ?
And IMO, the API design is better to change to just pass in a kernel name string as key. And the binary
buffer and the binary size should be keep as privately internal data. considering the following prototype.

cl_context_get_static_kernel_from_bin(queue->ctx, "cl_internal_fill_buf_align4_str", NULL);

The index number and the cl_internal_fill_buf_align4_str and cl_internal_fill_buf_align4_str_size
should be managed internally.

The last comment is that we could leverage vector load/store as much as possible here.

No matter the pattern size is, for example, if the pattern size is 1 char, and the size is multiple of 128,
and the offset is 64 byte align. Then we can easily promote it to use a int2 load/store. If it is multiple
of 256, then we can use int4 load/store.

What's your opinion?