[Beignet] [PATCH 2/3] [opencl-1.2] Implement the clEnqueueFillBuffer API.

junyan.he at inbox.com junyan.he at inbox.com
Wed Apr 23 01:35:25 PDT 2014


From: Junyan He <junyan.he at linux.intel.com>

We use the floatn's assigment to do the copy.
128 pattern size is according to double16, and because
the double problem on our platform, we use to float16
to handle this.
unaligned cases is not optimized now, just use the char
assigment.

Signed-off-by: Junyan He <junyan.he at linux.intel.com>
---
 src/cl_api.c     |  78 ++++++++++++++++++++++++++++++++
 src/cl_context.c | 133 ++++++++++++++++++++++++++++++++++++++++++-------------
 src/cl_context.h |   8 ++++
 src/cl_enqueue.c |   1 +
 src/cl_enqueue.h |   1 +
 src/cl_event.c   |   1 +
 src/cl_mem.c     | 102 ++++++++++++++++++++++++++++++++++++++++++
 src/cl_mem.h     |   3 ++
 8 files changed, 295 insertions(+), 32 deletions(-)

diff --git a/src/cl_api.c b/src/cl_api.c
index 1543ff4..be94bcb 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -1592,6 +1592,84 @@ error:
 }
 
 cl_int
+clEnqueueFillBuffer(cl_command_queue   command_queue,
+                    cl_mem             buffer,
+                    const void *       pattern,
+                    size_t             pattern_size,
+                    size_t             offset,
+                    size_t             size,
+                    cl_uint            num_events_in_wait_list,
+                    const cl_event *   event_wait_list,
+                    cl_event *         event)
+{
+  cl_int err = CL_SUCCESS;
+  enqueue_data *data, no_wait_data = { 0 };
+  static size_t valid_sz[] = {1, 2, 4, 8, 16, 32, 64, 128};
+  int i = 0;
+
+  CHECK_QUEUE(command_queue);
+  CHECK_MEM(buffer);
+
+  if (command_queue->ctx != buffer->ctx) {
+    err = CL_INVALID_CONTEXT;
+    goto error;
+  }
+
+  if (offset < 0 || offset + size > buffer->size) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (pattern == NULL) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  for (i = 0; i < sizeof(valid_sz)/sizeof(size_t); i++) {
+    if (valid_sz[i] == pattern_size)
+      break;
+  }
+  if (i == sizeof(valid_sz)/sizeof(size_t)) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  if (offset%pattern_size || size%pattern_size) {
+    err = CL_INVALID_VALUE;
+    goto error;
+  }
+
+  err = cl_mem_fill(command_queue, pattern, pattern_size, buffer, offset, size);
+  if (err) {
+    goto error;
+  }
+
+  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
+
+  data = &no_wait_data;
+  data->type = EnqueueFillBuffer;
+  data->queue = command_queue;
+
+  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
+                   event, data, CL_COMMAND_FILL_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
+    if (event && (*event)->type != CL_COMMAND_USER
+        && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
+      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
+    }
+
+    err = cl_command_queue_flush(command_queue);
+  }
+
+  if(b_output_kernel_perf)
+    time_end(command_queue->ctx, "beignet internal kernel : cl_fill_buffer", command_queue);
+
+  return 0;
+
+ error:
+  return err;
+}
+
+cl_int
 clEnqueueCopyBuffer(cl_command_queue     command_queue,
                     cl_mem               src_buffer,
                     cl_mem               dst_buffer,
diff --git a/src/cl_context.c b/src/cl_context.c
index 8190e6a..e2dba65 100644
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -1,4 +1,4 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
@@ -188,6 +188,7 @@ error:
 LOCAL void
 cl_context_delete(cl_context ctx)
 {
+  int i = 0;
   if (UNLIKELY(ctx == NULL))
     return;
 
@@ -195,6 +196,18 @@ cl_context_delete(cl_context ctx)
   if (atomic_dec(&ctx->ref_n) > 1)
     return;
 
+  /* delete the internal programs. */
+  for (i = CL_ENQUEUE_COPY_BUFFER_ALIGN4; i < CL_INTERNAL_KERNEL_MAX; i++) {
+    if (ctx->internel_kernels[i]) {
+      cl_kernel_delete(ctx->internel_kernels[i]);
+      ctx->internel_kernels[i] = NULL;
+
+      assert(ctx->internal_prgs[i]);
+      cl_program_delete(ctx->internal_prgs[i]);
+      ctx->internal_prgs[i] = NULL;
+    }
+  }
+
   /* All object lists should have been freed. Otherwise, the reference counter
    * of the context cannot be 0
    */
@@ -252,47 +265,103 @@ cl_context_get_static_kernel(cl_context ctx, cl_int index, const char * str_kern
 {
   cl_int ret;
   if (!ctx->internal_prgs[index])
-  {
-    size_t length = strlen(str_kernel) + 1;
-    ctx->internal_prgs[index] = cl_program_create_from_source(ctx, 1, &str_kernel, &length, NULL);
-
-    if (!ctx->internal_prgs[index])
-      return NULL;
-
-    ret = cl_program_build(ctx->internal_prgs[index], str_option);
-    if (ret != CL_SUCCESS)
-      return NULL;
-
-    ctx->internal_prgs[index]->is_built = 1;
-
-    ctx->internel_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
-  }
+    {
+      size_t length = strlen(str_kernel) + 1;
+      ctx->internal_prgs[index] = cl_program_create_from_source(ctx, 1, &str_kernel, &length, NULL);
+
+      if (!ctx->internal_prgs[index])
+        return NULL;
+
+      ret = cl_program_build(ctx->internal_prgs[index], str_option);
+      if (ret != CL_SUCCESS)
+        return NULL;
+
+      ctx->internal_prgs[index]->is_built = 1;
+
+      /* All CL_ENQUEUE_FILL_BUFFER_ALIGN16_xxx use the same program, different kernel. */
+      if (index >= CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 && index <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
+        int i = CL_ENQUEUE_FILL_BUFFER_ALIGN8_8;
+        for (; i <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64; i++) {
+          if (index != i) {
+            assert(ctx->internal_prgs[i] == NULL);
+            assert(ctx->internel_kernels[i] == NULL);
+            cl_program_add_ref(ctx->internal_prgs[index]);
+            ctx->internal_prgs[i] = ctx->internal_prgs[index];
+          }
+
+          if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_8) {
+            ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                                "__cl_fill_region_align8_2", NULL);
+          } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_16) {
+            ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                                "__cl_fill_region_align8_4", NULL);
+          } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_32) {
+            ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                                "__cl_fill_region_align8_8", NULL);
+          } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
+            ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                                "__cl_fill_region_align8_16", NULL);
+          } else
+            assert(0);
+        }
+      } else {
+        ctx->internel_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
+      }
+    }
 
   return ctx->internel_kernels[index];
 }
 
 cl_kernel
 cl_context_get_static_kernel_form_bin(cl_context ctx, cl_int index,
-                  const char * str_kernel, size_t size, const char * str_option)
+                                      const char * str_kernel, size_t size, const char * str_option)
 {
   cl_int ret;
   cl_int binary_status = CL_SUCCESS;
   if (!ctx->internal_prgs[index])
-  {
-    ctx->internal_prgs[index] = cl_program_create_from_binary(ctx, 1, &ctx->device,
-      &size, (const unsigned char **)&str_kernel, &binary_status, &ret);
-
-    if (!ctx->internal_prgs[index])
-      return NULL;
-
-    ret = cl_program_build(ctx->internal_prgs[index], str_option);
-    if (ret != CL_SUCCESS)
-      return NULL;
-
-    ctx->internal_prgs[index]->is_built = 1;
-
-    ctx->internel_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
-  }
+    {
+      ctx->internal_prgs[index] = cl_program_create_from_binary(ctx, 1, &ctx->device,
+                                                                &size, (const unsigned char **)&str_kernel, &binary_status, &ret);
+
+      if (!ctx->internal_prgs[index])
+        return NULL;
+
+      ret = cl_program_build(ctx->internal_prgs[index], str_option);
+      if (ret != CL_SUCCESS)
+        return NULL;
+
+      ctx->internal_prgs[index]->is_built = 1;
+
+      /* All CL_ENQUEUE_FILL_BUFFER_ALIGN16_xxx use the same program, different kernel. */
+      if (index >= CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 && index <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
+        int i = CL_ENQUEUE_FILL_BUFFER_ALIGN8_8;
+        for (; i <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64; i++) {
+          if (index != i) {
+            assert(ctx->internal_prgs[i] == NULL);
+            assert(ctx->internel_kernels[i] == NULL);
+            cl_program_add_ref(ctx->internal_prgs[index]);
+            ctx->internal_prgs[i] = ctx->internal_prgs[index];
+          }
+
+          if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_8) {
+            ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                                "__cl_fill_region_align8_2", NULL);
+          } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_16) {
+            ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                                "__cl_fill_region_align8_4", NULL);
+          } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_32) {
+            ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                                "__cl_fill_region_align8_8", NULL);
+          } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
+            ctx->internel_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
+                                                                "__cl_fill_region_align8_16", NULL);
+          } else
+            assert(0);
+        }
+      } else {
+        ctx->internel_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
+      }
+    }
 
   return ctx->internel_kernels[index];
 }
diff --git a/src/cl_context.h b/src/cl_context.h
index 782a9af..6eb6c9e 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -54,6 +54,14 @@ enum _cl_internal_ker_type {
   CL_ENQUEUE_COPY_IMAGE_TO_BUFFER_1,   //copy image 3d tobuffer
   CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_0,   //copy buffer to image 2d
   CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_1,   //copy buffer to image 3d
+  CL_ENQUEUE_FILL_BUFFER_UNALIGN,      //fill buffer with 1 aligne pattern, pattern size=1
+  CL_ENQUEUE_FILL_BUFFER_ALIGN2,       //fill buffer with 2 aligne pattern, pattern size=2
+  CL_ENQUEUE_FILL_BUFFER_ALIGN4,       //fill buffer with 4 aligne pattern, pattern size=4
+  CL_ENQUEUE_FILL_BUFFER_ALIGN8_8,     //fill buffer with 8 aligne pattern, pattern size=8
+  CL_ENQUEUE_FILL_BUFFER_ALIGN8_16,    //fill buffer with 16 aligne pattern, pattern size=16
+  CL_ENQUEUE_FILL_BUFFER_ALIGN8_32,    //fill buffer with 16 aligne pattern, pattern size=32
+  CL_ENQUEUE_FILL_BUFFER_ALIGN8_64,    //fill buffer with 16 aligne pattern, pattern size=64
+  CL_ENQUEUE_FILL_BUFFER_ALIGN128,     //fill buffer with 128 aligne pattern, pattern size=128
   CL_INTERNAL_KERNEL_MAX
 };
 
diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
index 330d230..d44fa66 100644
--- a/src/cl_enqueue.c
+++ b/src/cl_enqueue.c
@@ -412,6 +412,7 @@ cl_int cl_enqueue_handle(cl_event event, enqueue_data* data)
     case EnqueueCopyBufferToImage:
     case EnqueueCopyImageToBuffer:
     case EnqueueNDRangeKernel:
+    case EnqueueFillBuffer:
       cl_gpgpu_event_resume((cl_gpgpu_event)data->ptr);
       return CL_SUCCESS;
     case EnqueueNativeKernel:
diff --git a/src/cl_enqueue.h b/src/cl_enqueue.h
index 1d3ae5f..cb612eb 100644
--- a/src/cl_enqueue.h
+++ b/src/cl_enqueue.h
@@ -41,6 +41,7 @@ typedef enum {
   EnqueueNDRangeKernel,
   EnqueueNativeKernel,
   EnqueueMarker,
+  EnqueueFillBuffer,
   EnqueueInvalid
 } enqueue_type;
 
diff --git a/src/cl_event.c b/src/cl_event.c
index 727ee1f..ae5bfaf 100644
--- a/src/cl_event.c
+++ b/src/cl_event.c
@@ -33,6 +33,7 @@ cl_event_is_gpu_command_type(cl_command_type type)
 {
   switch(type) {
     case CL_COMMAND_COPY_BUFFER:
+    case CL_COMMAND_FILL_BUFFER:
     case CL_COMMAND_COPY_IMAGE:
     case CL_COMMAND_COPY_IMAGE_TO_BUFFER:
     case CL_COMMAND_COPY_BUFFER_TO_IMAGE:
diff --git a/src/cl_mem.c b/src/cl_mem.c
index 44482f7..0f7dc23 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -902,6 +902,108 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
 }
 
 LOCAL cl_int
+cl_mem_fill(cl_command_queue queue, const void * pattern, size_t pattern_size,
+            cl_mem buffer, size_t offset, size_t size)
+{
+  cl_int ret = CL_SUCCESS;
+  cl_kernel ker = NULL;
+  size_t global_off[] = {0,0,0};
+  size_t global_sz[] = {1,1,1};
+  size_t local_sz[] = {1,1,1};
+  char pattern_comb[4];
+  int is_128 = 0;
+  const void * pattern1 = NULL;
+
+  assert(offset%pattern_size == 0);
+  assert(size%pattern_size == 0);
+
+  if (!size)
+    return ret;
+
+  if (pattern_size == 128) {
+    /* 128 is according to pattern of double16, but double works not very
+       well on some platform. We use two float16 to handle this. */
+    extern char cl_internal_fill_buf_align128_str[];
+    extern int cl_internal_fill_buf_align128_str_size;
+
+    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN128,
+               cl_internal_fill_buf_align128_str, (size_t)cl_internal_fill_buf_align128_str_size, NULL);
+    is_128 = 1;
+    pattern_size = pattern_size/2;
+    pattern1 = pattern + pattern_size;
+    size = size/2;
+  } else if (pattern_size % 8 == 0) { /* Handle the 8 16 32 64 cases here. */
+    extern char cl_internal_fill_buf_align8_str[];
+    extern int cl_internal_fill_buf_align8_str_size;
+    int order = ffs(pattern_size/8) - 1;
+
+    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 + order,
+               cl_internal_fill_buf_align8_str, (size_t)cl_internal_fill_buf_align8_str_size, NULL);
+  } else if (pattern_size == 4) { /* Handle the 8 16 32 64 cases here. */
+    extern char cl_internal_fill_buf_align4_str[];
+    extern int cl_internal_fill_buf_align4_str_size;
+
+    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN4,
+               cl_internal_fill_buf_align4_str, (size_t)cl_internal_fill_buf_align4_str_size, NULL);
+  } else if (size >= 4 && size % 4 == 0 && offset % 4 == 0) {
+    /* The unaligned case. But if copy size and offset are aligned to 4, we can fake
+       the pattern with the pattern duplication fill in. */
+    assert(pattern_size == 1 || pattern_size == 2);
+    extern char cl_internal_fill_buf_align4_str[];
+    extern int cl_internal_fill_buf_align4_str_size;
+
+    if (pattern_size == 2) {
+      memcpy(pattern_comb, pattern, sizeof(char)*2);
+      memcpy(pattern_comb + 2, pattern, sizeof(char)*2);
+    } else {
+      pattern_comb[0] = pattern_comb[1] = pattern_comb[2]
+        = pattern_comb[3] = *(char *)pattern;
+    }
+
+    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN4,
+               cl_internal_fill_buf_align4_str, (size_t)cl_internal_fill_buf_align4_str_size, NULL);
+    pattern_size = 4;
+    pattern = pattern_comb;
+  }
+  //TODO: Unaligned cases, we may need to optimize it as cl_mem_copy, using mask in kernel
+  //functions. This depend on the usage but now we just use aligned 1 and 2.
+  else if (pattern_size == 2) {
+    extern char cl_internal_fill_buf_align2_str[];
+    extern int cl_internal_fill_buf_align2_str_size;
+    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN2,
+               cl_internal_fill_buf_align2_str, (size_t)cl_internal_fill_buf_align2_str_size, NULL);
+  } else if (pattern_size == 1) {
+    extern char cl_internal_fill_buf_unalign_str[];
+    extern int cl_internal_fill_buf_unalign_str_size;
+    ker = cl_context_get_static_kernel_form_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_UNALIGN,
+               cl_internal_fill_buf_unalign_str, (size_t)cl_internal_fill_buf_unalign_str_size, NULL);
+  } else
+    assert(0);
+
+  if (!ker)
+    return CL_OUT_OF_RESOURCES;
+
+  size = size/pattern_size;
+  offset = offset/pattern_size;
+
+  if (size < LOCAL_SZ_0) {
+    local_sz[0] = 1;
+  } else {
+    local_sz[0] = LOCAL_SZ_0;
+  }
+  global_sz[0] = ((size + LOCAL_SZ_0 - 1)/LOCAL_SZ_0)*LOCAL_SZ_0;
+  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &buffer);
+  cl_kernel_set_arg(ker, 1, pattern_size, pattern);
+  cl_kernel_set_arg(ker, 2, sizeof(cl_uint), &offset);
+  cl_kernel_set_arg(ker, 3, sizeof(cl_uint), &size);
+  if (is_128)
+    cl_kernel_set_arg(ker, 4, pattern_size, pattern1);
+
+  ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
+  return ret;
+}
+
+LOCAL cl_int
 cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
                        const size_t *src_origin, const size_t *dst_origin, const size_t *region,
                        size_t src_row_pitch, size_t src_slice_pitch,
diff --git a/src/cl_mem.h b/src/cl_mem.h
index 3f3c54c..afa48fe 100644
--- a/src/cl_mem.h
+++ b/src/cl_mem.h
@@ -181,6 +181,9 @@ extern void cl_mem_add_ref(cl_mem);
 extern cl_int cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
               size_t src_offset, size_t dst_offset, size_t cb);
 
+extern cl_int cl_mem_fill(cl_command_queue queue, const void * pattern, size_t pattern_size,
+              cl_mem buffer, size_t offset, size_t size);
+
 /* api clEnqueueCopyBufferRect help function */
 extern cl_int cl_mem_copy_buffer_rect(cl_command_queue, cl_mem, cl_mem,
                                      const size_t *, const size_t *, const size_t *,
-- 
1.8.3.2



More information about the Beignet mailing list