[Beignet] [PATCH 6/9 V3] Modify all event-related functions to use the new event handling.

Yang, Rong R rong.r.yang at intel.com
Wed Sep 28 08:22:53 UTC 2016


The patchset has been pushed, thanks. It makes the event handling much clearer.
The utest profiling_exec fails because cl_event_get_timestamp hasn't been implemented yet; Junyan will implement it later.
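
For reference, profiling_exec just exercises the standard OpenCL profiling
queries, so it should pass once the timestamps are recorded. A minimal
host-side sketch of that path (assumes the queue was created with
CL_QUEUE_PROFILING_ENABLE, and that queue, kernel and gsz are set up
elsewhere):

    cl_event ev;
    cl_ulong start = 0, end = 0;

    /* Enqueue a kernel and wait for it to complete. */
    clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &gsz, NULL, 0, NULL, &ev);
    clWaitForEvents(1, &ev);

    /* Until the driver records the event timestamps, these queries
       return CL_PROFILING_INFO_NOT_AVAILABLE. */
    clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START,
                            sizeof(start), &start, NULL);
    clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END,
                            sizeof(end), &end, NULL);
    printf("kernel time: %lu ns\n", (unsigned long)(end - start));
    clReleaseEvent(ev);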

> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> junyan.he at inbox.com
> Sent: Monday, September 26, 2016 16:00
> To: beignet at lists.freedesktop.org
> Subject: [Beignet] [PATCH 6/9 V3] Modify all event-related functions to use
> the new event handling.
> 
> From: Junyan He <junyan.he at intel.com>
> 
> Rewrite cl_event, and modify all the event functions
> to use the new event handling. Events now cooperate
> with the command queue's worker thread.
> 
> v2:
>   Fix a logic problem when event creation fails.
> 
> V3:
>   Default the enqueue callback to a no-op, to handle
>   enqueue commands that have nothing to do.
> 
> Signed-off-by: Junyan He <junyan.he at intel.com>
> ---
>  src/CMakeLists.txt             |    5 +
>  src/cl_api.c                   | 1888 +---------------------------------------
>  src/cl_api_kernel.c            |   27 +-
>  src/cl_command_queue.c         |   98 ++-
>  src/cl_command_queue.h         |    7 +-
>  src/cl_command_queue_enqueue.c |    8 +-
>  src/cl_command_queue_gen7.c    |   21 +-
>  src/cl_enqueue.c               |  502 ++++++-----
>  src/cl_enqueue.h               |   44 +-
>  src/cl_event.c                 | 1067 ++++++++++-------------
>  src/cl_event.h                 |  146 ++--
>  src/cl_mem.c                   |  118 ++-
>  src/cl_mem.h                   |   29 +-
>  13 files changed, 1074 insertions(+), 2886 deletions(-)
> 
> diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
> index 009d057..626b43f 100644
> --- a/src/CMakeLists.txt
> +++ b/src/CMakeLists.txt
> @@ -68,6 +68,10 @@ set(OPENCL_SRC
>      ${KERNEL_STR_FILES}
>      cl_base_object.c
>      cl_api.c
> +    cl_api_mem.c
> +    cl_api_kernel.c
> +    cl_api_command_queue.c
> +    cl_api_event.c
>      cl_alloc.c
>      cl_kernel.c
>      cl_program.c
> @@ -85,6 +89,7 @@ set(OPENCL_SRC
>      cl_command_queue.c
>      cl_command_queue.h
>      cl_command_queue_gen7.c
> +    cl_command_queue_enqueue.c
>      cl_thread.c
>      cl_driver.h
>      cl_driver.cpp
> diff --git a/src/cl_api.c b/src/cl_api.c
> index a2fee15..f8c48de 100644
> --- a/src/cl_api.c
> +++ b/src/cl_api.c
> @@ -67,92 +67,6 @@ typedef intptr_t cl_device_partition_property;
>  	  return RET; \
>  	} while(0)
> 
> -static inline cl_int
> -handle_events(cl_command_queue queue, cl_int num, const cl_event *wait_list,
> -              cl_event* event, enqueue_data* data, cl_command_type type)
> -{
> -  cl_int status = cl_event_wait_events(num, wait_list, queue);
> -  cl_event e = NULL;
> -  if(event != NULL || status == CL_ENQUEUE_EXECUTE_DEFER) {
> -    e = cl_event_new(queue->ctx, queue, type, event!=NULL);
> -
> -    /* if need profiling, add the submit timestamp here. */
> -    if (e->type != CL_COMMAND_USER &&
> -	    e->queue->props & CL_QUEUE_PROFILING_ENABLE) {
> -	cl_event_get_timestamp(e, CL_PROFILING_COMMAND_QUEUED);
> -	cl_event_get_queued_cpu_timestamp(e);
> -    }
> -
> -    if(event != NULL)
> -      *event = e;
> -    if(status == CL_ENQUEUE_EXECUTE_DEFER) {
> -      cl_event_new_enqueue_callback(e, data, num, wait_list);
> -    }
> -  }
> -  set_current_event(queue, e);
> -  return status;
> -}
> -
> -/* The following code checking overlap is from Appendix of openCL spec 1.1 */
> -cl_bool check_copy_overlap(const size_t src_offset[3],
> -                           const size_t dst_offset[3],
> -                           const size_t region[3],
> -                           size_t row_pitch, size_t slice_pitch)
> -{
> -  const size_t src_min[] = {src_offset[0], src_offset[1], src_offset[2]};
> -  const size_t src_max[] = {src_offset[0] + region[0],
> -                            src_offset[1] + region[1],
> -                            src_offset[2] + region[2]};
> -  const size_t dst_min[] = {dst_offset[0], dst_offset[1], dst_offset[2]};
> -  const size_t dst_max[] = {dst_offset[0] + region[0],
> -                            dst_offset[1] + region[1],
> -                            dst_offset[2] + region[2]};
> -  // Check for overlap
> -  cl_bool overlap = CL_TRUE;
> -  unsigned i;
> -  size_t dst_start = dst_offset[2] * slice_pitch +
> -                     dst_offset[1] * row_pitch + dst_offset[0];
> -  size_t dst_end = dst_start + (region[2] * slice_pitch +
> -                   region[1] * row_pitch + region[0]);
> -  size_t src_start = src_offset[2] * slice_pitch +
> -                     src_offset[1] * row_pitch + src_offset[0];
> -  size_t src_end = src_start + (region[2] * slice_pitch +
> -                   region[1] * row_pitch + region[0]);
> -
> -  for (i=0; i != 3; ++i) {
> -    overlap = overlap && (src_min[i] < dst_max[i])
> -                      && (src_max[i] > dst_min[i]);
> -  }
> -
> -  if (!overlap) {
> -    size_t delta_src_x = (src_offset[0] + region[0] > row_pitch) ?
> -                          src_offset[0] + region[0] - row_pitch : 0;
> -    size_t delta_dst_x = (dst_offset[0] + region[0] > row_pitch) ?
> -                          dst_offset[0] + region[0] - row_pitch : 0;
> -    if ( (delta_src_x > 0 && delta_src_x > dst_offset[0]) ||
> -         (delta_dst_x > 0 && delta_dst_x > src_offset[0]) ) {
> -      if ( (src_start <= dst_start && dst_start < src_end) ||
> -           (dst_start <= src_start && src_start < dst_end) )
> -        overlap = CL_TRUE;
> -    }
> -    if (region[2] > 1) {
> -      size_t src_height = slice_pitch / row_pitch;
> -      size_t dst_height = slice_pitch / row_pitch;
> -      size_t delta_src_y = (src_offset[1] + region[1] > src_height) ?
> -                            src_offset[1] + region[1] - src_height : 0;
> -      size_t delta_dst_y = (dst_offset[1] + region[1] > dst_height) ?
> -                            dst_offset[1] + region[1] - dst_height : 0;
> -      if ( (delta_src_y > 0 && delta_src_y > dst_offset[1]) ||
> -           (delta_dst_y > 0 && delta_dst_y > src_offset[1]) ) {
> -        if ( (src_start <= dst_start && dst_start < src_end) ||
> -             (dst_start <= src_start && src_start < dst_end) )
> -          overlap = CL_TRUE;
> -      }
> -    }
> -  }
> -  return overlap;
> -}
> -
>  static cl_int
>  cl_check_device_type(cl_device_type device_type)
>  {
> @@ -448,16 +362,6 @@ error:
>  }
> 
>  cl_int
> -clReleaseCommandQueue(cl_command_queue command_queue)
> -{
> -  cl_int err = CL_SUCCESS;
> -  CHECK_QUEUE (command_queue);
> -  cl_command_queue_delete(command_queue);
> -error:
> -  return err;
> -}
> -
> -cl_int
>  clGetCommandQueueInfo(cl_command_queue       command_queue,
>                        cl_command_queue_info  param_name,
>                        size_t                 param_value_size,
> @@ -1369,26 +1273,6 @@ clGetKernelSubGroupInfoKHR(cl_kernel kernel,
>  }
> 
>  cl_int
> -clWaitForEvents(cl_uint          num_events,
> -                const cl_event * event_list)
> -{
> -  cl_int err = CL_SUCCESS;
> -  cl_context ctx = NULL;
> -
> -  if(num_events > 0 && event_list)
> -    ctx = event_list[0]->ctx;
> -
> -  TRY(cl_event_check_waitlist, num_events, event_list, NULL, ctx);
> -
> -  while(cl_event_wait_events(num_events, event_list, NULL) == CL_ENQUEUE_EXECUTE_DEFER) {
> -    usleep(8000);       //sleep 8ms to wait other thread
> -  }
> -
> -error:
> -  return err;
> -}
> -
> -cl_int
>  clGetEventInfo(cl_event      event,
>                 cl_event_info param_name,
>                 size_t        param_value_size,
> @@ -1403,9 +1287,9 @@ clGetEventInfo(cl_event      event,
>    } else if (param_name == CL_EVENT_CONTEXT) {
>      FILL_GETINFO_RET (cl_context, 1, &event->ctx, CL_SUCCESS);
>    } else if (param_name == CL_EVENT_COMMAND_TYPE) {
> -    FILL_GETINFO_RET (cl_command_type, 1, &event->type, CL_SUCCESS);
> +    FILL_GETINFO_RET (cl_command_type, 1, &event->event_type, CL_SUCCESS);
>    } else if (param_name == CL_EVENT_COMMAND_EXECUTION_STATUS) {
> -    cl_event_update_status(event, 0);
> +    cl_event_get_status(event);
>      FILL_GETINFO_RET (cl_int, 1, &event->status, CL_SUCCESS);
>    } else if (param_name == CL_EVENT_REFERENCE_COUNT) {
>      cl_uint ref = CL_OBJECT_GET_REF(event);
> @@ -1419,22 +1303,6 @@ error:
> 
>  }
> 
> -cl_event
> -clCreateUserEvent(cl_context context,
> -                  cl_int *   errcode_ret)
> -{
> -  cl_int err = CL_SUCCESS;
> -  cl_event event = NULL;
> -  CHECK_CONTEXT(context);
> -
> -  TRY_ALLOC(event, cl_event_new(context, NULL, CL_COMMAND_USER, CL_TRUE));
> -
> -error:
> -  if(errcode_ret)
> -    *errcode_ret = err;
> -  return event;
> -}
> -
>  cl_int
>  clRetainEvent(cl_event  event)
>  {
> @@ -1459,48 +1327,6 @@ error:
>    return err;
>  }
> 
> -cl_int
> -clSetUserEventStatus(cl_event    event,
> -                     cl_int      execution_status)
> -{
> -  cl_int err = CL_SUCCESS;
> -
> -  CHECK_EVENT(event);
> -  if(execution_status > CL_COMPLETE) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -  if(event->status != CL_SUBMITTED) {
> -    err = CL_INVALID_OPERATION;
> -    goto error;
> -  }
> -
> -  cl_event_set_status(event, execution_status);
> -error:
> -  return err;
> -}
> -
> -cl_int
> -clSetEventCallback(cl_event     event,
> -                   cl_int       command_exec_callback_type,
> -                   void (CL_CALLBACK * pfn_notify) (cl_event, cl_int, void *),
> -                   void *       user_data)
> -{
> -  cl_int err = CL_SUCCESS;
> -
> -  CHECK_EVENT(event);
> -  if((pfn_notify == NULL) ||
> -    (command_exec_callback_type > CL_SUBMITTED) ||
> -    (command_exec_callback_type < CL_COMPLETE)) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -  err = cl_event_set_callback(event, command_exec_callback_type, pfn_notify, user_data);
> -
> -error:
> -  return err;
> -
> -}
> 
>  cl_int
>  clGetEventProfilingInfo(cl_event             event,
> @@ -1513,9 +1339,9 @@ clGetEventProfilingInfo(cl_event             event,
>    cl_ulong ret_val;
> 
>    CHECK_EVENT(event);
> -  cl_event_update_status(event, 0);
> +  //cl_event_update_status(event, 0);
> 
> -  if (event->type == CL_COMMAND_USER ||
> +  if (event->event_type == CL_COMMAND_USER ||
>        !(event->queue->props & CL_QUEUE_PROFILING_ENABLE) ||
>            event->status != CL_COMPLETE) {
>      err = CL_PROFILING_INFO_NOT_AVAILABLE;
> @@ -1552,1712 +1378,6 @@ error:
>    return err;
>  }
> 
> -cl_int
> -clFlush(cl_command_queue command_queue)
> -{
> -  /* have nothing to do now, as currently
> -   * clEnqueueNDRangeKernel will flush at
> -   * the end of each calling. we may need
> -   * to optimize it latter.*/
> -  return 0;
> -}
> -
> -cl_int
> -clFinish(cl_command_queue command_queue)
> -{
> -  cl_int err = CL_SUCCESS;
> -
> -  CHECK_QUEUE (command_queue);
> -
> -#ifdef HAS_CMRT
> -  if (command_queue->cmrt_event != NULL)
> -    return cmrt_wait_for_task_finished(command_queue);
> -#endif
> -
> -  err = cl_command_queue_finish(command_queue);
> -
> -error:
> -  return err;
> -}
> -
> -cl_int
> -clEnqueueReadBuffer(cl_command_queue command_queue,
> -                    cl_mem           buffer,
> -                    cl_bool          blocking_read,
> -                    size_t           offset,
> -                    size_t           size,
> -                    void *           ptr,
> -                    cl_uint          num_events_in_wait_list,
> -                    const cl_event * event_wait_list,
> -                    cl_event *       event)
> -{
> -  cl_int err = CL_SUCCESS;
> -  enqueue_data *data, defer_enqueue_data = { 0 };
> -  CHECK_QUEUE(command_queue);
> -  CHECK_MEM(buffer);
> -  if (command_queue->ctx != buffer->ctx) {
> -     err = CL_INVALID_CONTEXT;
> -     goto error;
> -  }
> -
> -  if (!ptr || !size || offset + size > buffer->size) {
> -     err = CL_INVALID_VALUE;
> -     goto error;
> -  }
> -
> -  if (buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
> -     err = CL_INVALID_OPERATION;
> -     goto error;
> -  }
> -
> -  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
> -
> -  data = &defer_enqueue_data;
> -  data->type    = EnqueueReadBuffer;
> -  data->mem_obj = buffer;
> -  data->ptr     = ptr;
> -  data->offset  = offset;
> -  data->size    = size;
> -
> -  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
> -                   event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
> -    err = cl_enqueue_handle(event ? *event : NULL, data);
> -    if(event) cl_event_set_status(*event, CL_COMPLETE);
> -  }
> -
> -error:
> -  return err;
> -}
> -
> -cl_int
> -clEnqueueReadBufferRect(cl_command_queue command_queue,
> -                        cl_mem           buffer,
> -                        cl_bool          blocking_read,
> -                        const size_t *   buffer_origin,
> -                        const size_t *   host_origin,
> -                        const size_t *   region,
> -                        size_t           buffer_row_pitch,
> -                        size_t           buffer_slice_pitch,
> -                        size_t           host_row_pitch,
> -                        size_t           host_slice_pitch,
> -                        void *           ptr,
> -                        cl_uint          num_events_in_wait_list,
> -                        const cl_event * event_wait_list,
> -                        cl_event *       event)
> -{
> -  cl_int err = CL_SUCCESS;
> -  enqueue_data *data, no_wait_data = { 0 };
> -
> -  CHECK_QUEUE(command_queue);
> -  CHECK_MEM(buffer);
> -
> -  if (command_queue->ctx != buffer->ctx) {
> -    err = CL_INVALID_CONTEXT;
> -    goto error;
> -  }
> -
> -  if (buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
> -     err = CL_INVALID_OPERATION;
> -     goto error;
> -  }
> -
> -  if (!ptr || !region || region[0] == 0 || region[1] == 0 || region[2] == 0) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  if(buffer_row_pitch == 0)
> -    buffer_row_pitch = region[0];
> -  if(buffer_slice_pitch == 0)
> -    buffer_slice_pitch = region[1] * buffer_row_pitch;
> -
> -  if(host_row_pitch == 0)
> -    host_row_pitch = region[0];
> -  if(host_slice_pitch == 0)
> -    host_slice_pitch = region[1] * host_row_pitch;
> -
> -  if (buffer_row_pitch < region[0] ||
> -      host_row_pitch < region[0]) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  if ((buffer_slice_pitch < region[1] * buffer_row_pitch || buffer_slice_pitch % buffer_row_pitch != 0 ) ||
> -      (host_slice_pitch < region[1] * host_row_pitch || host_slice_pitch % host_row_pitch != 0 )) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  if ((buffer_origin[2] + region[2] - 1) * buffer_slice_pitch
> -         + (buffer_origin[1] + region[1] - 1) * buffer_row_pitch
> -         + buffer_origin[0] + region[0] > buffer->size) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
> -
> -  data = &no_wait_data;
> -  data->type        = EnqueueReadBufferRect;
> -  data->mem_obj     = buffer;
> -  data->ptr         = ptr;
> -  data->origin[0]   = buffer_origin[0]; data->origin[1] = buffer_origin[1]; data->origin[2] = buffer_origin[2];
> -  data->host_origin[0]  = host_origin[0]; data->host_origin[1] = host_origin[1]; data->host_origin[2] = host_origin[2];
> -  data->region[0]   = region[0];  data->region[1] = region[1];  data->region[2] = region[2];
> -  data->row_pitch   = buffer_row_pitch;
> -  data->slice_pitch = buffer_slice_pitch;
> -  data->host_row_pitch   = host_row_pitch;
> -  data->host_slice_pitch = host_slice_pitch;
> -
> -  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
> -                   event, data, CL_COMMAND_READ_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
> -    err = cl_enqueue_handle(event ? *event : NULL, data);
> -    if(event) cl_event_set_status(*event, CL_COMPLETE);
> -  }
> -
> - error:
> -  return err;
> -}
> -
> -cl_int
> -clEnqueueWriteBuffer(cl_command_queue    command_queue,
> -                     cl_mem              buffer,
> -                     cl_bool             blocking_write,
> -                     size_t              offset,
> -                     size_t              size,
> -                     const void *        ptr,
> -                     cl_uint             num_events_in_wait_list,
> -                     const cl_event *    event_wait_list,
> -                     cl_event *          event)
> -{
> -  cl_int err = CL_SUCCESS;
> -  enqueue_data *data, no_wait_data = { 0 };
> -
> -  CHECK_QUEUE(command_queue);
> -  CHECK_MEM(buffer);
> -  if (command_queue->ctx != buffer->ctx) {
> -    err = CL_INVALID_CONTEXT;
> -    goto error;
> -  }
> -
> -  if (!ptr || !size || offset + size > buffer->size) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  if (buffer->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) {
> -    err = CL_INVALID_OPERATION;
> -    goto error;
> -  }
> -
> -  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
> -
> -  data = &no_wait_data;
> -  data->type      = EnqueueWriteBuffer;
> -  data->mem_obj   = buffer;
> -  data->const_ptr = ptr;
> -  data->offset    = offset;
> -  data->size      = size;
> -
> -  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
> -                   event, data, CL_COMMAND_WRITE_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
> -    err = cl_enqueue_handle(event ? *event : NULL, data);
> -    if(event) cl_event_set_status(*event, CL_COMPLETE);
> -  }
> -
> - error:
> -  return err;
> -}
> -
> -cl_int
> -clEnqueueWriteBufferRect(cl_command_queue     command_queue,
> -                         cl_mem               buffer,
> -                         cl_bool              blocking_write,
> -                         const size_t *       buffer_origin,
> -                         const size_t *       host_origin,
> -                         const size_t *       region,
> -                         size_t               buffer_row_pitch,
> -                         size_t               buffer_slice_pitch,
> -                         size_t               host_row_pitch,
> -                         size_t               host_slice_pitch,
> -                         const void *         ptr,
> -                         cl_uint              num_events_in_wait_list,
> -                         const cl_event *     event_wait_list,
> -                         cl_event *           event)
> -{
> -  cl_int err = CL_SUCCESS;
> -  enqueue_data *data, no_wait_data = { 0 };
> -
> -  CHECK_QUEUE(command_queue);
> -  CHECK_MEM(buffer);
> -
> -  if (command_queue->ctx != buffer->ctx) {
> -    err = CL_INVALID_CONTEXT;
> -    goto error;
> -  }
> -
> -  if (buffer->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) {
> -    err = CL_INVALID_OPERATION;
> -    goto error;
> -  }
> -
> -  if (!ptr || !region || region[0] == 0 || region[1] == 0 || region[2] == 0) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  if(buffer_row_pitch == 0)
> -    buffer_row_pitch = region[0];
> -  if(buffer_slice_pitch == 0)
> -    buffer_slice_pitch = region[1] * buffer_row_pitch;
> -
> -  if(host_row_pitch == 0)
> -    host_row_pitch = region[0];
> -  if(host_slice_pitch == 0)
> -    host_slice_pitch = region[1] * host_row_pitch;
> -
> -  if (buffer_row_pitch < region[0] ||
> -      host_row_pitch < region[0]) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  if ((buffer_slice_pitch < region[1] * buffer_row_pitch || buffer_slice_pitch % buffer_row_pitch != 0 ) ||
> -      (host_slice_pitch < region[1] * host_row_pitch || host_slice_pitch % host_row_pitch != 0 )) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  if ((buffer_origin[2] + region[2] - 1) * buffer_slice_pitch
> -         + (buffer_origin[1] + region[1] - 1) * buffer_row_pitch
> -         + buffer_origin[0] + region[0] > buffer->size) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
> -
> -  data = &no_wait_data;
> -  data->type        = EnqueueWriteBufferRect;
> -  data->mem_obj     = buffer;
> -  data->const_ptr   = ptr;
> -  data->origin[0]   = buffer_origin[0]; data->origin[1] = buffer_origin[1]; data->origin[2] = buffer_origin[2];
> -  data->host_origin[0]  = host_origin[0]; data->host_origin[1] = host_origin[1]; data->host_origin[2] = host_origin[2];
> -  data->region[0]   = region[0];  data->region[1] = region[1];  data->region[2] = region[2];
> -  data->row_pitch   = buffer_row_pitch;
> -  data->slice_pitch = buffer_slice_pitch;
> -  data->host_row_pitch   = host_row_pitch;
> -  data->host_slice_pitch = host_slice_pitch;
> -
> -  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
> -                   event, data, CL_COMMAND_WRITE_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
> -    err = cl_enqueue_handle(event ? *event : NULL, data);
> -    if(event) cl_event_set_status(*event, CL_COMPLETE);
> -  }
> -
> -error:
> -  return err;
> -}
> -
> -cl_int
> -clEnqueueFillImage(cl_command_queue   command_queue,
> -                   cl_mem             image,
> -                   const void *       fill_color,
> -                   const size_t *     porigin,
> -                   const size_t *     pregion,
> -                   cl_uint            num_events_in_wait_list,
> -                   const cl_event *   event_wait_list,
> -                   cl_event *         event)
> -{
> -  cl_int err = CL_SUCCESS;
> -  enqueue_data *data, no_wait_data = { 0 };
> -
> -  CHECK_QUEUE(command_queue);
> -  CHECK_IMAGE(image, src_image);
> -  FIXUP_IMAGE_REGION(src_image, pregion, region);
> -  FIXUP_IMAGE_ORIGIN(src_image, porigin, origin);
> -
> -  if (command_queue->ctx != image->ctx) {
> -    err = CL_INVALID_CONTEXT;
> -    goto error;
> -  }
> -
> -  if (fill_color == NULL) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  if (!origin || !region || origin[0] + region[0] > src_image->w || origin[1] + region[1] > src_image->h || origin[2] + region[2] > src_image->depth) {
> -     err = CL_INVALID_VALUE;
> -     goto error;
> -  }
> -
> -  if (src_image->image_type == CL_MEM_OBJECT_IMAGE2D && (origin[2] != 0 || region[2] != 1)){
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  if (src_image->image_type == CL_MEM_OBJECT_IMAGE1D && (origin[2] != 0 ||origin[1] != 0 || region[2] != 1 || region[1] != 1)){
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  err = cl_image_fill(command_queue, fill_color, src_image, origin, region);
> -  if (err) {
> -    goto error;
> -  }
> -
> -  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, image->ctx);
> -
> -  data = &no_wait_data;
> -  data->type = EnqueueFillImage;
> -  data->queue = command_queue;
> -
> -  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
> -                   event, data, CL_COMMAND_FILL_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
> -    if (event && (*event)->type != CL_COMMAND_USER
> -        && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
> -      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
> -    }
> -
> -    err = cl_command_queue_flush(command_queue);
> -  }
> -
> -  if(b_output_kernel_perf)
> -    time_end(command_queue->ctx, "beignet internal kernel : cl_fill_image", "", command_queue);
> -
> -  return 0;
> -
> - error:
> -  return err;
> -}
> -
> -cl_int
> -clEnqueueFillBuffer(cl_command_queue   command_queue,
> -                    cl_mem             buffer,
> -                    const void *       pattern,
> -                    size_t             pattern_size,
> -                    size_t             offset,
> -                    size_t             size,
> -                    cl_uint            num_events_in_wait_list,
> -                    const cl_event *   event_wait_list,
> -                    cl_event *         event)
> -{
> -  cl_int err = CL_SUCCESS;
> -  enqueue_data *data, no_wait_data = { 0 };
> -  static size_t valid_sz[] = {1, 2, 4, 8, 16, 32, 64, 128};
> -  int i = 0;
> -
> -  CHECK_QUEUE(command_queue);
> -  CHECK_MEM(buffer);
> -
> -  if (command_queue->ctx != buffer->ctx) {
> -    err = CL_INVALID_CONTEXT;
> -    goto error;
> -  }
> -
> -  if (offset + size > buffer->size) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  if (pattern == NULL) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  for (i = 0; i < sizeof(valid_sz) / sizeof(size_t); i++) {
> -    if (valid_sz[i] == pattern_size)
> -      break;
> -  }
> -  if (i == sizeof(valid_sz) / sizeof(size_t)) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  if (offset % pattern_size || size % pattern_size) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  err = cl_mem_fill(command_queue, pattern, pattern_size, buffer, offset, size);
> -  if (err) {
> -    goto error;
> -  }
> -
> -  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
> -
> -  data = &no_wait_data;
> -  data->type = EnqueueFillBuffer;
> -  data->queue = command_queue;
> -
> -  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
> -                   event, data, CL_COMMAND_FILL_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
> -    if (event && (*event)->type != CL_COMMAND_USER
> -        && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
> -      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
> -    }
> -
> -    err = cl_command_queue_flush(command_queue);
> -  }
> -
> -  if(b_output_kernel_perf)
> -    time_end(command_queue->ctx, "beignet internal kernel : cl_fill_buffer", "", command_queue);
> -
> -  return 0;
> -
> - error:
> -  return err;
> -}
> -
> -cl_int
> -clEnqueueCopyBuffer(cl_command_queue     command_queue,
> -                    cl_mem               src_buffer,
> -                    cl_mem               dst_buffer,
> -                    size_t               src_offset,
> -                    size_t               dst_offset,
> -                    size_t               cb,
> -                    cl_uint              num_events_in_wait_list,
> -                    const cl_event *     event_wait_list,
> -                    cl_event *           event)
> -{
> -  cl_int err = CL_SUCCESS;
> -  enqueue_data *data, no_wait_data = { 0 };
> -
> -  CHECK_QUEUE(command_queue);
> -  CHECK_MEM(src_buffer);
> -  CHECK_MEM(dst_buffer);
> -
> -  if (command_queue->ctx != src_buffer->ctx) {
> -    err = CL_INVALID_CONTEXT;
> -    goto error;
> -  }
> -
> -  if (command_queue->ctx != dst_buffer->ctx) {
> -    err = CL_INVALID_CONTEXT;
> -    goto error;
> -  }
> -
> -  if (src_offset + cb > src_buffer->size) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -  if (dst_offset + cb > dst_buffer->size) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  /* Check overlap */
> -  if (src_buffer == dst_buffer
> -         && (src_offset <= dst_offset && dst_offset <= src_offset + cb - 1)
> -         && (dst_offset <= src_offset && src_offset <= dst_offset + cb - 1)) {
> -    err = CL_MEM_COPY_OVERLAP;
> -    goto error;
> -  }
> -
> -  /* Check sub overlap */
> -  if (src_buffer->type == CL_MEM_SUBBUFFER_TYPE && dst_buffer->type == CL_MEM_SUBBUFFER_TYPE ) {
> -    struct _cl_mem_buffer* src_b = (struct _cl_mem_buffer*)src_buffer;
> -    struct _cl_mem_buffer* dst_b = (struct _cl_mem_buffer*)dst_buffer;
> -    size_t src_sub_offset = src_b->sub_offset;
> -    size_t dst_sub_offset = dst_b->sub_offset;
> -
> -    if ((src_offset + src_sub_offset <= dst_offset + dst_sub_offset
> -          && dst_offset + dst_sub_offset <= src_offset + src_sub_offset + cb - 1)
> -     && (dst_offset + dst_sub_offset <= src_offset + src_sub_offset
> -          && src_offset + src_sub_offset <= dst_offset + dst_sub_offset + cb - 1)) {
> -      err = CL_MEM_COPY_OVERLAP;
> -      goto error;
> -    }
> -  }
> -
> -  err = cl_mem_copy(command_queue, src_buffer, dst_buffer, src_offset, dst_offset, cb);
> -
> -  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_buffer->ctx);
> -
> -  data = &no_wait_data;
> -  data->type = EnqueueCopyBuffer;
> -  data->queue = command_queue;
> -
> -  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
> -                   event, data, CL_COMMAND_COPY_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
> -    if (event && (*event)->type != CL_COMMAND_USER
> -            && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
> -      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
> -    }
> -
> -    err = cl_command_queue_flush(command_queue);
> -  }
> -
> -  if(b_output_kernel_perf)
> -	  time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy", "", command_queue);
> -
> -  return 0;
> -
> -error:
> -  return err;
> -}
> -
> -cl_int
> -clEnqueueCopyBufferRect(cl_command_queue     command_queue,
> -                        cl_mem               src_buffer,
> -                        cl_mem               dst_buffer,
> -                        const size_t *       src_origin,
> -                        const size_t *       dst_origin,
> -                        const size_t *       region,
> -                        size_t               src_row_pitch,
> -                        size_t               src_slice_pitch,
> -                        size_t               dst_row_pitch,
> -                        size_t               dst_slice_pitch,
> -                        cl_uint              num_events_in_wait_list,
> -                        const cl_event *     event_wait_list,
> -                        cl_event *           event)
> -{
> -  cl_int err = CL_SUCCESS;
> -  enqueue_data *data, no_wait_data = { 0 };
> -
> -  CHECK_QUEUE(command_queue);
> -  CHECK_MEM(src_buffer);
> -  CHECK_MEM(dst_buffer);
> -
> -  if ((command_queue->ctx != src_buffer->ctx) ||
> -      (command_queue->ctx != dst_buffer->ctx)) {
> -    err = CL_INVALID_CONTEXT;
> -    goto error;
> -  }
> -
> -  if (!region || region[0] == 0 || region[1] == 0 || region[2] == 0) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  if(src_row_pitch == 0)
> -    src_row_pitch = region[0];
> -  if(src_slice_pitch == 0)
> -    src_slice_pitch = region[1] * src_row_pitch;
> -
> -  if(dst_row_pitch == 0)
> -    dst_row_pitch = region[0];
> -  if(dst_slice_pitch == 0)
> -    dst_slice_pitch = region[1] * dst_row_pitch;
> -
> -  if (src_row_pitch < region[0] ||
> -      dst_row_pitch < region[0]) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  if ((src_slice_pitch < region[1] * src_row_pitch || src_slice_pitch % src_row_pitch != 0 ) ||
> -      (dst_slice_pitch < region[1] * dst_row_pitch || dst_slice_pitch % dst_row_pitch != 0 )) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  if ((src_origin[2] + region[2] - 1) * src_slice_pitch
> -        + (src_origin[1] + region[1] - 1) * src_row_pitch
> -        + src_origin[0] + region[0] > src_buffer->size
> -      ||(dst_origin[2] + region[2] - 1) * dst_slice_pitch
> -          + (dst_origin[1] + region[1] - 1) * dst_row_pitch
> -          + dst_origin[0] + region[0] > dst_buffer->size) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  if (src_buffer == dst_buffer && (src_row_pitch != dst_row_pitch || src_slice_pitch != dst_slice_pitch)) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  if (src_buffer == dst_buffer &&
> -      check_copy_overlap(src_origin, dst_origin, region, src_row_pitch, src_slice_pitch)) {
> -    err = CL_MEM_COPY_OVERLAP;
> -    goto error;
> -  }
> -
> -  cl_mem_copy_buffer_rect(command_queue, src_buffer, dst_buffer, src_origin, dst_origin, region,
> -                          src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch);
> -
> -  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_buffer->ctx);
> -
> -  data = &no_wait_data;
> -  data->type = EnqueueCopyBufferRect;
> -  data->queue = command_queue;
> -
> -  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
> -                   event, data, CL_COMMAND_COPY_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) {
> -    if (event && (*event)->type != CL_COMMAND_USER
> -            && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
> -      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
> -    }
> -
> -    err = cl_command_queue_flush(command_queue);
> -  }
> -
> -  if(b_output_kernel_perf)
> -    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_rect", "", command_queue);
> -
> -error:
> -  return err;
> -}
> -
> -cl_int
> -clEnqueueReadImage(cl_command_queue      command_queue,
> -                   cl_mem                mem,
> -                   cl_bool               blocking_read,
> -                   const size_t *        porigin,
> -                   const size_t *        pregion,
> -                   size_t                row_pitch,
> -                   size_t                slice_pitch,
> -                   void *                ptr,
> -                   cl_uint               num_events_in_wait_list,
> -                   const cl_event *      event_wait_list,
> -                   cl_event *            event)
> -{
> -  cl_int err = CL_SUCCESS;
> -  enqueue_data *data, no_wait_data = { 0 };
> -
> -  CHECK_QUEUE(command_queue);
> -  CHECK_IMAGE(mem, image);
> -  FIXUP_IMAGE_REGION(image, pregion, region);
> -  FIXUP_IMAGE_ORIGIN(image, porigin, origin);
> -  if (command_queue->ctx != mem->ctx) {
> -     err = CL_INVALID_CONTEXT;
> -     goto error;
> -  }
> -
> -  if (!origin || !region || origin[0] + region[0] > image->w || origin[1] + region[1] > image->h || origin[2] + region[2] > image->depth) {
> -     err = CL_INVALID_VALUE;
> -     goto error;
> -  }
> -
> -  if (!row_pitch)
> -    row_pitch = image->bpp*region[0];
> -  else if (row_pitch < image->bpp*region[0]) {
> -     err = CL_INVALID_VALUE;
> -     goto error;
> -  }
> -
> -  if (image->slice_pitch) {
> -    if (!slice_pitch)
> -      slice_pitch = row_pitch*region[1];
> -    else if (slice_pitch < row_pitch*region[1]) {
> -      err = CL_INVALID_VALUE;
> -      goto error;
> -    }
> -  }
> -  else if (slice_pitch) {
> -     err = CL_INVALID_VALUE;
> -     goto error;
> -  }
> -
> -  if (!ptr) {
> -     err = CL_INVALID_VALUE;
> -     goto error;
> -  }
> -
> -  if (mem->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) {
> -     err = CL_INVALID_OPERATION;
> -     goto error;
> -  }
> -
> -  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, mem->ctx);
> -
> -  data = &no_wait_data;
> -  data->type        = EnqueueReadImage;
> -  data->mem_obj     = mem;
> -  data->ptr         = ptr;
> -  data->origin[0]   = origin[0];  data->origin[1] = origin[1];  data->origin[2] = origin[2];
> -  data->region[0]   = region[0];  data->region[1] = region[1];  data->region[2] = region[2];
> -  data->row_pitch   = row_pitch;
> -  data->slice_pitch = slice_pitch;
> -
> -  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
> -                   event, data, CL_COMMAND_READ_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
> -    err = cl_enqueue_handle(event ? *event : NULL, data);
> -    if(event) cl_event_set_status(*event, CL_COMPLETE);
> -  }
> -
> -error:
> -  return err;
> -}
> -
> -cl_int
> -clEnqueueWriteImage(cl_command_queue     command_queue,
> -                    cl_mem               mem,
> -                    cl_bool              blocking_write,
> -                    const size_t *       porigin,
> -                    const size_t *       pregion,
> -                    size_t               row_pitch,
> -                    size_t               slice_pitch,
> -                    const void *         ptr,
> -                    cl_uint              num_events_in_wait_list,
> -                    const cl_event *     event_wait_list,
> -                    cl_event *           event)
> -{
> -  cl_int err = CL_SUCCESS;
> -  enqueue_data *data, no_wait_data = { 0 };
> -
> -  CHECK_QUEUE(command_queue);
> -  CHECK_IMAGE(mem, image);
> -  FIXUP_IMAGE_REGION(image, pregion, region);
> -  FIXUP_IMAGE_ORIGIN(image, porigin, origin);
> -  if (command_queue->ctx != mem->ctx) {
> -    err = CL_INVALID_CONTEXT;
> -    goto error;
> -  }
> -
> -  if (!origin || !region || origin[0] + region[0] > image->w || origin[1] + region[1] > image->h || origin[2] + region[2] > image->depth) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  if (!row_pitch)
> -    row_pitch = image->bpp*region[0];
> -  else if (row_pitch < image->bpp*region[0]) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  if (image->slice_pitch) {
> -    if (!slice_pitch)
> -      slice_pitch = row_pitch*region[1];
> -    else if (slice_pitch < row_pitch*region[1]) {
> -      err = CL_INVALID_VALUE;
> -      goto error;
> -    }
> -  }
> -  else if (slice_pitch) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  if (!ptr) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  if (mem->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) {
> -    err = CL_INVALID_OPERATION;
> -    goto error;
> -  }
> -
> -  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, mem->ctx);
> -
> -  data = &no_wait_data;
> -  data->type        = EnqueueWriteImage;
> -  data->mem_obj     = mem;
> -  data->const_ptr   = ptr;
> -  data->origin[0]   = origin[0];  data->origin[1] = origin[1];  data->origin[2] = origin[2];
> -  data->region[0]   = region[0];  data->region[1] = region[1];  data->region[2] = region[2];
> -  data->row_pitch   = row_pitch;
> -  data->slice_pitch = slice_pitch;
> -
> -  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
> -                   event, data, CL_COMMAND_WRITE_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
> -    err = cl_enqueue_handle(event ? *event : NULL, data);
> -    if(event) cl_event_set_status(*event, CL_COMPLETE);
> -  }
> -
> -error:
> -  return err;
> -}
> -
> -cl_int
> -clEnqueueCopyImage(cl_command_queue      command_queue,
> -                   cl_mem                src_mem,
> -                   cl_mem                dst_mem,
> -                   const size_t *        psrc_origin,
> -                   const size_t *        pdst_origin,
> -                   const size_t *        pregion,
> -                   cl_uint               num_events_in_wait_list,
> -                   const cl_event *      event_wait_list,
> -                   cl_event *            event)
> -{
> -  cl_int err = CL_SUCCESS;
> -  enqueue_data *data, no_wait_data = { 0 };
> -  cl_bool overlap = CL_TRUE;
> -  cl_int i = 0;
> -
> -  CHECK_QUEUE(command_queue);
> -  CHECK_IMAGE(src_mem, src_image);
> -  CHECK_IMAGE(dst_mem, dst_image);
> -  FIXUP_IMAGE_REGION(src_image, pregion, region);
> -  FIXUP_IMAGE_ORIGIN(src_image, psrc_origin, src_origin);
> -  FIXUP_IMAGE_ORIGIN(dst_image, pdst_origin, dst_origin);
> -  if (command_queue->ctx != src_mem->ctx ||
> -      command_queue->ctx != dst_mem->ctx) {
> -    err = CL_INVALID_CONTEXT;
> -    goto error;
> -  }
> -
> -  if (src_image->fmt.image_channel_order != dst_image->fmt.image_channel_order ||
> -      src_image->fmt.image_channel_data_type != dst_image->fmt.image_channel_data_type) {
> -    err = CL_IMAGE_FORMAT_MISMATCH;
> -    goto error;
> -  }
> -
> -  if (!src_origin || !region || src_origin[0] + region[0] > src_image->w ||
> -      src_origin[1] + region[1] > src_image->h || src_origin[2] + region[2] > src_image->depth) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  if (!dst_origin || !region || dst_origin[0] + region[0] > dst_image->w ||
> -      dst_origin[1] + region[1] > dst_image->h || dst_origin[2] + region[2] > dst_image->depth) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  if ((src_image->image_type == CL_MEM_OBJECT_IMAGE2D && (src_origin[2] != 0 || region[2] != 1)) ||
> -      (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D && (dst_origin[2] != 0 || region[2] != 1))) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  if (src_image == dst_image) {
> -    for(i = 0; i < 3; i++)
> -      overlap = overlap && (src_origin[i] < dst_origin[i] + region[i])
> -                        && (dst_origin[i] < src_origin[i] + region[i]);
> -    if(overlap == CL_TRUE) {
> -      err = CL_MEM_COPY_OVERLAP;
> -      goto error;
> -    }
> -  }
> -
> -  cl_mem_kernel_copy_image(command_queue, src_image, dst_image, src_origin, dst_origin, region);
> -
> -  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_mem->ctx);
> -
> -  data = &no_wait_data;
> -  data->type = EnqueueCopyImage;
> -  data->queue = command_queue;
> -
> -  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
> -                   event, data, CL_COMMAND_COPY_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
> -    if (event && (*event)->type != CL_COMMAND_USER
> -            && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
> -      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
> -    }
> -
> -    err = cl_command_queue_flush(command_queue);
> -  }
> -
> -  if(b_output_kernel_perf)
> -    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_kernel_copy_image", "", command_queue);
> -
> -error:
> -  return err;
> -}
> -
> -cl_int
> -clEnqueueCopyImageToBuffer(cl_command_queue  command_queue,
> -                           cl_mem            src_mem,
> -                           cl_mem            dst_buffer,
> -                           const size_t *    psrc_origin,
> -                           const size_t *    pregion,
> -                           size_t            dst_offset,
> -                           cl_uint           num_events_in_wait_list,
> -                           const cl_event *  event_wait_list,
> -                           cl_event *        event)
> -{
> -  cl_int err = CL_SUCCESS;
> -  enqueue_data *data, no_wait_data = { 0 };
> -
> -  CHECK_QUEUE(command_queue);
> -  CHECK_IMAGE(src_mem, src_image);
> -  CHECK_MEM(dst_buffer);
> -  FIXUP_IMAGE_REGION(src_image, pregion, region);
> -  FIXUP_IMAGE_ORIGIN(src_image, psrc_origin, src_origin);
> -  if (command_queue->ctx != src_mem->ctx ||
> -      command_queue->ctx != dst_buffer->ctx) {
> -    err = CL_INVALID_CONTEXT;
> -    goto error;
> -  }
> -
> -  if (dst_offset + region[0]*region[1]*region[2]*src_image->bpp > dst_buffer->size) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  if (!src_origin || !region || src_origin[0] + region[0] > src_image->w ||
> -      src_origin[1] + region[1] > src_image->h || src_origin[2] + region[2] > src_image->depth) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  if (src_image->image_type == CL_MEM_OBJECT_IMAGE2D && (src_origin[2] != 0 || region[2] != 1)) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  cl_mem_copy_image_to_buffer(command_queue, src_image, dst_buffer, src_origin, dst_offset, region);
> -
> -  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_mem->ctx);
> -
> -  data = &no_wait_data;
> -  data->type = EnqueueCopyImageToBuffer;
> -  data->queue = command_queue;
> -
> -  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
> -                   event, data, CL_COMMAND_COPY_IMAGE_TO_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
> -    if (event && (*event)->type != CL_COMMAND_USER
> -            && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
> -      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
> -    }
> -
> -    err = cl_command_queue_flush(command_queue);
> -  }
> -
> -  if(b_output_kernel_perf)
> -    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_image_to_buffer", "", command_queue);
> -
> -error:
> -  return err;
> -}
> -
> -cl_int
> -clEnqueueCopyBufferToImage(cl_command_queue  command_queue,
> -                           cl_mem            src_buffer,
> -                           cl_mem            dst_mem,
> -                           size_t            src_offset,
> -                           const size_t *    pdst_origin,
> -                           const size_t *    pregion,
> -                           cl_uint           num_events_in_wait_list,
> -                           const cl_event *  event_wait_list,
> -                           cl_event *        event)
> -{
> -  cl_int err = CL_SUCCESS;
> -  enqueue_data *data, no_wait_data = { 0 };
> -
> -  CHECK_QUEUE(command_queue);
> -  CHECK_MEM(src_buffer);
> -  CHECK_IMAGE(dst_mem, dst_image);
> -  FIXUP_IMAGE_REGION(dst_image, pregion, region);
> -  FIXUP_IMAGE_ORIGIN(dst_image, pdst_origin, dst_origin);
> -  if (command_queue->ctx != src_buffer->ctx ||
> -      command_queue->ctx != dst_mem->ctx) {
> -    err = CL_INVALID_CONTEXT;
> -    goto error;
> -  }
> -
> -  if (src_offset + region[0]*region[1]*region[2]*dst_image->bpp > src_buffer->size) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  if (!dst_origin || !region || dst_origin[0] + region[0] > dst_image->w ||
> -      dst_origin[1] + region[1] > dst_image->h || dst_origin[2] + region[2] > dst_image->depth) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D && (dst_origin[2] != 0 || region[2] != 1)) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  cl_mem_copy_buffer_to_image(command_queue, src_buffer, dst_image, src_offset, dst_origin, region);
> -
> -  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, dst_mem->ctx);
> -
> -  data = &no_wait_data;
> -  data->type = EnqueueCopyBufferToImage;
> -  data->queue = command_queue;
> -
> -  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
> -                   event, data, CL_COMMAND_COPY_BUFFER_TO_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
> -    if (event && (*event)->type != CL_COMMAND_USER
> -            && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
> -      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
> -    }
> -
> -    err = cl_command_queue_flush(command_queue);
> -  }
> -
> -  if(b_output_kernel_perf)
> -    time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_to_image", "", command_queue);
> -
> -error:
> -  return err;
> -}
> -
> -static cl_int _cl_map_mem(cl_mem mem, void *ptr, void **mem_ptr,
> -                          size_t offset, size_t size,
> -                          const size_t *origin, const size_t *region)
> -{
> -  cl_int slot = -1;
> -  int err = CL_SUCCESS;
> -  size_t sub_offset = 0;
> -
> -  if(mem->type == CL_MEM_SUBBUFFER_TYPE) {
> -    struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
> -    sub_offset = buffer->sub_offset;
> -  }
> -
> -  ptr = (char*)ptr + offset + sub_offset;
> -  if(mem->flags & CL_MEM_USE_HOST_PTR) {
> -    assert(mem->host_ptr);
> -    //only calc ptr here, will do memcpy in enqueue
> -    *mem_ptr = (char *)mem->host_ptr + offset + sub_offset;
> -  } else {
> -    *mem_ptr = ptr;
> -  }
> -  /* Record the mapped address. */
> -  if (!mem->mapped_ptr_sz) {
> -    mem->mapped_ptr_sz = 16;
> -    mem->mapped_ptr = (cl_mapped_ptr *)malloc(
> -          sizeof(cl_mapped_ptr) * mem->mapped_ptr_sz);
> -    if (!mem->mapped_ptr) {
> -      cl_mem_unmap_auto(mem);
> -      err = CL_OUT_OF_HOST_MEMORY;
> -      goto error;
> -    }
> -    memset(mem->mapped_ptr, 0, mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
> -    slot = 0;
> -  } else {
> -   int i = 0;
> -    for (; i < mem->mapped_ptr_sz; i++) {
> -      if (mem->mapped_ptr[i].ptr == NULL) {
> -        slot = i;
> -        break;
> -      }
> -   }
> -    if (i == mem->mapped_ptr_sz) {
> -      cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc(
> -          sizeof(cl_mapped_ptr) * mem->mapped_ptr_sz * 2);
> -      if (!new_ptr) {
> -        cl_mem_unmap_auto(mem);
> -        err = CL_OUT_OF_HOST_MEMORY;
> -        goto error;
> -      }
> -      memset(new_ptr, 0, 2 * mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
> -      memcpy(new_ptr, mem->mapped_ptr,
> -             mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
> -      slot = mem->mapped_ptr_sz;
> -      mem->mapped_ptr_sz *= 2;
> -      free(mem->mapped_ptr);
> -      mem->mapped_ptr = new_ptr;
> -    }
> -  }
> -  assert(slot != -1);
> -  mem->mapped_ptr[slot].ptr = *mem_ptr;
> -  mem->mapped_ptr[slot].v_ptr = ptr;
> -  mem->mapped_ptr[slot].size = size;
> -  if(origin) {
> -    assert(region);
> -    mem->mapped_ptr[slot].origin[0] = origin[0];
> -    mem->mapped_ptr[slot].origin[1] = origin[1];
> -    mem->mapped_ptr[slot].origin[2] = origin[2];
> -    mem->mapped_ptr[slot].region[0] = region[0];
> -    mem->mapped_ptr[slot].region[1] = region[1];
> -    mem->mapped_ptr[slot].region[2] = region[2];
> -  }
> -  mem->map_ref++;
> -error:
> -  if (err != CL_SUCCESS)
> -    *mem_ptr = NULL;
> -  return err;
> -}
> -
> -void *
> -clEnqueueMapBuffer(cl_command_queue  command_queue,
> -                   cl_mem            buffer,
> -                   cl_bool           blocking_map,
> -                   cl_map_flags      map_flags,
> -                   size_t            offset,
> -                   size_t            size,
> -                   cl_uint           num_events_in_wait_list,
> -                   const cl_event *  event_wait_list,
> -                   cl_event *        event,
> -                   cl_int *          errcode_ret)
> -{
> -  cl_int err = CL_SUCCESS;
> -  void *ptr = NULL;
> -  void *mem_ptr = NULL;
> -  enqueue_data *data, no_wait_data = { 0 };
> -
> -  CHECK_QUEUE(command_queue);
> -  CHECK_MEM(buffer);
> -  if (command_queue->ctx != buffer->ctx) {
> -    err = CL_INVALID_CONTEXT;
> -    goto error;
> -  }
> -
> -  if (!size || offset + size > buffer->size) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  if ((map_flags & CL_MAP_READ &&
> -       buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) ||
> -      (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION) &&
> -       buffer->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)))
> -  {
> -    err = CL_INVALID_OPERATION;
> -    goto error;
> -  }
> -
> -#ifdef HAS_CMRT
> -  if (command_queue->cmrt_event != NULL)
> -    cmrt_wait_for_task_finished(command_queue);
> -#endif
> -
> -  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx);
> -
> -  data = &no_wait_data;
> -  data->type        = EnqueueMapBuffer;
> -  data->mem_obj     = buffer;
> -  data->offset      = offset;
> -  data->size        = size;
> -  data->ptr         = ptr;
> -  data->unsync_map  = 1;
> -  if (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION))
> -    data->write_map = 1;
> -
> -  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
> -                   event, data, CL_COMMAND_MAP_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
> -    data->unsync_map = 0;
> -    err = cl_enqueue_handle(event ? *event : NULL, data);
> -    if (err != CL_SUCCESS)
> -      goto error;
> -    ptr = data->ptr;
> -    if(event) cl_event_set_status(*event, CL_COMPLETE);
> -  } else {
> -    if (buffer->is_userptr)
> -      ptr = buffer->host_ptr;
> -    else {
> -      if ((ptr = cl_mem_map_gtt_unsync(buffer)) == NULL) {
> -        err = CL_MAP_FAILURE;
> -        goto error;
> -      }
> -    }
> -  }
> -  err = _cl_map_mem(buffer, ptr, &mem_ptr, offset, size, NULL, NULL);
> -  if (err != CL_SUCCESS)
> -    goto error;
> -
> -error:
> -  if (errcode_ret)
> -    *errcode_ret = err;
> -  return mem_ptr;
> -}
> -
> -void *
> -clEnqueueMapImage(cl_command_queue   command_queue,
> -                  cl_mem             mem,
> -                  cl_bool            blocking_map,
> -                  cl_map_flags       map_flags,
> -                  const size_t *     porigin,
> -                  const size_t *     pregion,
> -                  size_t *           image_row_pitch,
> -                  size_t *           image_slice_pitch,
> -                  cl_uint            num_events_in_wait_list,
> -                  const cl_event *   event_wait_list,
> -                  cl_event *         event,
> -                  cl_int *           errcode_ret)
> -{
> -  cl_int err = CL_SUCCESS;
> -  void *ptr  = NULL;
> -  void *mem_ptr = NULL;
> -  size_t offset = 0;
> -  enqueue_data *data, no_wait_data = { 0 };
> -
> -  CHECK_QUEUE(command_queue);
> -  CHECK_IMAGE(mem, image);
> -  FIXUP_IMAGE_REGION(image, pregion, region);
> -  FIXUP_IMAGE_ORIGIN(image, porigin, origin);
> -  if (command_queue->ctx != mem->ctx) {
> -    err = CL_INVALID_CONTEXT;
> -    goto error;
> -  }
> -
> -  if (!origin || !region || origin[0] + region[0] > image->w || origin[1] + region[1] > image->h || origin[2] + region[2] > image->depth) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  if (!image_row_pitch || (image->slice_pitch && !image_slice_pitch)) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  if ((map_flags & CL_MAP_READ &&
> -       mem->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) ||
> -      (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION) &&
> -       mem->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)))
> -  {
> -    err = CL_INVALID_OPERATION;
> -    goto error;
> -  }
> -
> -#ifdef HAS_CMRT
> -  if (command_queue->cmrt_event != NULL)
> -    cmrt_wait_for_task_finished(command_queue);
> -#endif
> -
> -  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, mem->ctx);
> -
> -  data = &no_wait_data;
> -  data->type        = EnqueueMapImage;
> -  data->mem_obj     = mem;
> -  data->origin[0]   = origin[0];  data->origin[1] = origin[1];  data->origin[2] = origin[2];
> -  data->region[0]   = region[0];  data->region[1] = region[1];  data->region[2] = region[2];
> -  data->ptr         = ptr;
> -  data->unsync_map  = 1;
> -  if (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION))
> -    data->write_map = 1;
> -
> -  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
> -                   event, data, CL_COMMAND_MAP_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
> -    data->unsync_map = 0;
> -    err = cl_enqueue_handle(event ? *event : NULL, data);
> -    if (err != CL_SUCCESS)
> -      goto error;
> -    ptr = data->ptr;
> -    if(event) cl_event_set_status(*event, CL_COMPLETE);
> -  } else {
> -    if ((ptr = cl_mem_map_gtt_unsync(mem)) == NULL) {
> -      err = CL_MAP_FAILURE;
> -      goto error;
> -    }
> -  }
> -
> -  if(mem->flags & CL_MEM_USE_HOST_PTR) {
> -    if (image_slice_pitch)
> -      *image_slice_pitch = image->host_slice_pitch;
> -    *image_row_pitch = image->host_row_pitch;
> -
> -    offset = image->bpp*origin[0] + image->host_row_pitch*origin[1] + image->host_slice_pitch*origin[2];
> -  } else {
> -    if (image_slice_pitch)
> -      *image_slice_pitch = image->slice_pitch;
> -    if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
> -      *image_row_pitch = image->slice_pitch;
> -    else
> -      *image_row_pitch = image->row_pitch;
> -
> -    offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2];
> -  }
> -  err = _cl_map_mem(mem, ptr, &mem_ptr, offset, 0, origin, region);
> -
> -error:
> -  if (errcode_ret)
> -    *errcode_ret = err;
> -  return mem_ptr; //TODO: map and unmap first
> -}
> -
> -cl_int
> -clEnqueueUnmapMemObject(cl_command_queue  command_queue,
> -                        cl_mem            memobj,
> -                        void *            mapped_ptr,
> -                        cl_uint           num_events_in_wait_list,
> -                        const cl_event *  event_wait_list,
> -                        cl_event *        event)
> -{
> -  cl_int err = CL_SUCCESS;
> -  enqueue_data *data, no_wait_data = { 0 };
> -
> -  CHECK_QUEUE(command_queue);
> -  CHECK_MEM(memobj);
> -  if (command_queue->ctx != memobj->ctx) {
> -    err = CL_INVALID_CONTEXT;
> -    goto error;
> -  }
> -
> -  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, memobj->ctx);
> -
> -  data = &no_wait_data;
> -  data->type        = EnqueueUnmapMemObject;
> -  data->mem_obj     = memobj;
> -  data->ptr         = mapped_ptr;
> -
> -  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
> -                   event, data, CL_COMMAND_UNMAP_MEM_OBJECT) == CL_ENQUEUE_EXECUTE_IMM) {
> -    err = cl_enqueue_handle(event ? *event : NULL, data);
> -    if(event) cl_event_set_status(*event, CL_COMPLETE);
> -  }
> -
> -error:
> -  return err;
> -}
> -
> -cl_int
> -clEnqueueMigrateMemObjects(cl_command_queue        command_queue,
> -                           cl_uint                 num_mem_objects,
> -                           const cl_mem *          mem_objects,
> -                           cl_mem_migration_flags  flags,
> -                           cl_uint                 num_events_in_wait_list,
> -                           const cl_event *        event_wait_list,
> -                           cl_event *              event)
> -{
> -  /* So far, we just support 1 device and no subdevice. So all the command queues
> -     belong to the same context. There is no need to migrate the mem objects by now. */
> -  cl_int err = CL_SUCCESS;
> -  cl_uint i = 0;
> -  enqueue_data *data, defer_enqueue_data = { 0 };
> -
> -  if (!flags & CL_MIGRATE_MEM_OBJECT_HOST)
> -    CHECK_QUEUE(command_queue);
> -
> -  if (num_mem_objects == 0 || mem_objects == NULL) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  if (flags && flags & ~(CL_MIGRATE_MEM_OBJECT_HOST |
> -                         CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED)) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  for (i = 0; i < num_mem_objects; i++) {
> -    CHECK_MEM(mem_objects[i]);
> -    if (mem_objects[i]->ctx != command_queue->ctx) {
> -      err = CL_INVALID_CONTEXT;
> -      goto error;
> -    }
> -  }
> -
> -  /* really nothing to do, fill the event. */
> -  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx);
> -  data = &defer_enqueue_data;
> -  data->type = EnqueueMigrateMemObj;
> -
> -  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
> -                   event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
> -    err = cl_enqueue_handle(event ? *event : NULL, data);
> -    if(event) cl_event_set_status(*event, CL_COMPLETE);
> -  }
> -
> -error:
> -  return err;
> -}
> -
> -cl_int
> -clEnqueueNDRangeKernel(cl_command_queue  command_queue,
> -                       cl_kernel         kernel,
> -                       cl_uint           work_dim,
> -                       const size_t *    global_work_offset,
> -                       const size_t *    global_work_size,
> -                       const size_t *    local_work_size,
> -                       cl_uint           num_events_in_wait_list,
> -                       const cl_event *  event_wait_list,
> -                       cl_event *        event)
> -{
> -  size_t fixed_global_off[] = {0,0,0};
> -  size_t fixed_global_sz[] = {1,1,1};
> -  size_t fixed_local_sz[] = {1,1,1};
> -  cl_int err = CL_SUCCESS;
> -  cl_uint i;
> -  enqueue_data *data, no_wait_data = { 0 };
> -
> -  CHECK_QUEUE(command_queue);
> -  CHECK_KERNEL(kernel);
> -
> -  /* Check number of dimensions we have */
> -  if (UNLIKELY(work_dim == 0 || work_dim > 3)) {
> -    err = CL_INVALID_WORK_DIMENSION;
> -    goto error;
> -  }
> -
> -  /* We need a work size per dimension */
> -  if (UNLIKELY(global_work_size == NULL)) {
> -    err = CL_INVALID_GLOBAL_WORK_SIZE;
> -    goto error;
> -  }
> -
> -  if (kernel->vme) {
> -    if (work_dim != 2) {
> -      err = CL_INVALID_WORK_DIMENSION;
> -      goto error;
> -    }
> -    if (local_work_size != NULL) {
> -      err = CL_INVALID_WORK_GROUP_SIZE;
> -      goto error;
> -    }
> -  }
> -
> -  if (global_work_offset != NULL)
> -    for (i = 0; i < work_dim; ++i) {
> -      if (UNLIKELY(global_work_offset[i] + global_work_size[i] > (size_t)-1)) {
> -        err = CL_INVALID_GLOBAL_OFFSET;
> -        goto error;
> -      }
> -    }
> -
> -  /* Local sizes must be non-null and divide global sizes */
> -  if (local_work_size != NULL)
> -    for (i = 0; i < work_dim; ++i)
> -      if (UNLIKELY(local_work_size[i] == 0 || global_work_size[i] % local_work_size[i])) {
> -        err = CL_INVALID_WORK_GROUP_SIZE;
> -        goto error;
> -      }
> -
> -  /* Queue and kernel must share the same context */
> -  assert(kernel->program);
> -  if (command_queue->ctx != kernel->program->ctx) {
> -    err = CL_INVALID_CONTEXT;
> -    goto error;
> -  }
> -
> -#ifdef HAS_CMRT
> -  if (kernel->cmrt_kernel != NULL) {
> -    err = cmrt_enqueue(command_queue, kernel, global_work_size, local_work_size);
> -    goto error;
> -  }
> -#endif
> -
> -  /* XXX No event right now */
> -  //FATAL_IF(num_events_in_wait_list > 0, "Events are not supported");
> -  //FATAL_IF(event_wait_list != NULL, "Events are not supported");
> -  //FATAL_IF(event != NULL, "Events are not supported");
> -
> -  if (local_work_size != NULL) {
> -    for (i = 0; i < work_dim; ++i)
> -      fixed_local_sz[i] = local_work_size[i];
> -  } else {
> -    if (kernel->vme) {
> -        fixed_local_sz[0] = 16;
> -        fixed_local_sz[1] = 1;
> -    } else {
> -      uint j, maxDimSize = 64 /* from 64? */, maxGroupSize = 256; //MAX_WORK_GROUP_SIZE may too large
> -      size_t realGroupSize = 1;
> -      for (i = 0; i< work_dim; i++) {
> -        for (j = maxDimSize; j > 1; j--) {
> -          if (global_work_size[i] % j == 0 && j <= maxGroupSize) {
> -            fixed_local_sz[i] = j;
> -            maxGroupSize = maxGroupSize /j;
> -            maxDimSize = maxGroupSize > maxDimSize ? maxDimSize : maxGroupSize;
> -            break;  //choose next work_dim
> -          }
> -        }
> -        realGroupSize *= fixed_local_sz[i];
> -      }
> -
> -      //in a loop of conformance test (such as test_api repeated_setup_cleanup), in each loop:
> -      //create a new context, a new command queue, and uses 'globalsize[0]=1000, localsize=NULL' to enqueu kernel
> -      //it triggers the following message for many times.
> -      //to avoid too many messages, only print it for the first time of the process.
> -      //just use static variable since it doesn't matter to print a few times at multi-thread case.
> -      static int warn_no_good_localsize = 1;
> -      if (realGroupSize % 8 != 0 && warn_no_good_localsize) {
> -        warn_no_good_localsize = 0;
> -        DEBUGP(DL_WARNING, "unable to find good values for local_work_size[i], please provide local_work_size[] explicitly, you can find good values with trial-and-error method.");
> -      }
> -    }
> -  }
> -
> -  if (kernel->vme) {
> -    fixed_global_sz[0] = (global_work_size[0]+15) / 16 * 16;
> -    fixed_global_sz[1] = (global_work_size[1]+15) / 16;
> -  } else {
> -    for (i = 0; i < work_dim; ++i)
> -      fixed_global_sz[i] = global_work_size[i];
> -  }
> -  if (global_work_offset != NULL)
> -    for (i = 0; i < work_dim; ++i)
> -      fixed_global_off[i] = global_work_offset[i];
> -
> -  if (kernel->compile_wg_sz[0] || kernel->compile_wg_sz[1] || kernel->compile_wg_sz[2]) {
> -    if (fixed_local_sz[0] != kernel->compile_wg_sz[0]
> -        || fixed_local_sz[1] != kernel->compile_wg_sz[1]
> -        || fixed_local_sz[2] != kernel->compile_wg_sz[2])
> -    {
> -        err = CL_INVALID_WORK_GROUP_SIZE;
> -        goto error;
> -    }
> -  }
> -
> -  /* Do device specific checks are enqueue the kernel */
> -  err = cl_command_queue_ND_range(command_queue,
> -                                  kernel,
> -                                  work_dim,
> -                                  fixed_global_off,
> -                                  fixed_global_sz,
> -                                  fixed_local_sz);
> -  if(err != CL_SUCCESS)
> -    goto error;
> -
> -  data = &no_wait_data;
> -  data->type = EnqueueNDRangeKernel;
> -  data->queue = command_queue;
> -
> -  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx);
> -  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
> -                   event, data, CL_COMMAND_NDRANGE_KERNEL) == CL_ENQUEUE_EXECUTE_IMM) {
> -    if (event && (*event)->type != CL_COMMAND_USER
> -            && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) {
> -      cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT);
> -    }
> -
> -    err = cl_command_queue_flush(command_queue);
> -  }
> -
> -error:
> -  if(b_output_kernel_perf)
> -  {
> -    if(kernel->program->build_opts != NULL)
> -      time_end(command_queue->ctx, cl_kernel_get_name(kernel), kernel->program->build_opts, command_queue);
> -    else
> -      time_end(command_queue->ctx, cl_kernel_get_name(kernel), "", command_queue);
> -  }
> -
> -  return err;
> -}
> -
> -cl_int
> -clEnqueueTask(cl_command_queue   command_queue,
> -              cl_kernel          kernel,
> -              cl_uint            num_events_in_wait_list,
> -              const cl_event *   event_wait_list,
> -              cl_event *         event)
> -{
> -  const size_t global_size[3] = {1, 0, 0};
> -  const size_t local_size[3]  = {1, 0, 0};
> -
> -  return clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global_size, local_size,
> -                                num_events_in_wait_list, event_wait_list, event);
> -}
> -
> -cl_int
> -clEnqueueNativeKernel(cl_command_queue   command_queue,
> -                      void (*user_func)(void *),
> -                      void *             args,
> -                      size_t             cb_args,
> -                      cl_uint            num_mem_objects,
> -                      const cl_mem *     mem_list,
> -                      const void **      args_mem_loc,
> -                      cl_uint            num_events_in_wait_list,
> -                      const cl_event *   event_wait_list,
> -                      cl_event *         event)
> -{
> -  cl_int err = CL_SUCCESS;
> -  void *new_args = NULL;
> -  enqueue_data *data, no_wait_data = { 0 };
> -  cl_int i;
> -
> -  if(user_func == NULL ||
> -    (args == NULL && cb_args > 0) ||
> -    (args == NULL && num_mem_objects ==0) ||
> -    (args != NULL && cb_args == 0) ||
> -    (num_mem_objects > 0 && (mem_list == NULL || args_mem_loc == NULL)) ||
> -    (num_mem_objects == 0 && (mem_list != NULL || args_mem_loc != NULL))) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  //Per spec, need copy args
> -  if (cb_args)
> -  {
> -    new_args = malloc(cb_args);
> -    if (!new_args)
> -    {
> -      err = CL_OUT_OF_HOST_MEMORY;
> -      goto error;
> -    }
> -    memcpy(new_args, args, cb_args);
> -
> -    for (i=0; i<num_mem_objects; ++i)
> -    {
> -      CHECK_MEM(mem_list[i]);
> -      args_mem_loc[i] = new_args + (args_mem_loc[i] - args);  //change to new args
> -    }
> -  }
> -
> -  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx);
> -
> -  data = &no_wait_data;
> -  data->type        = EnqueueNativeKernel;
> -  data->mem_list    = mem_list;
> -  data->ptr         = new_args;
> -  data->size        = cb_args;
> -  data->offset      = (size_t)num_mem_objects;
> -  data->const_ptr   = args_mem_loc;
> -  data->user_func   = user_func;
> -
> -  if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
> -                   event, data, CL_COMMAND_NATIVE_KERNEL) == CL_ENQUEUE_EXECUTE_IMM) {
> -    err = cl_enqueue_handle(event ? *event : NULL, data);
> -    if(event) cl_event_set_status(*event, CL_COMPLETE);
> -  }
> -
> -error:
> -  return err;
> -}
> -
> -cl_int
> -clEnqueueMarker(cl_command_queue command_queue,
> -    cl_event *event)
> -{
> -  cl_int err = CL_SUCCESS;
> -  CHECK_QUEUE(command_queue);
> -  if(event == NULL) {
> -    err = CL_INVALID_VALUE;
> -    goto error;
> -  }
> -
> -  cl_event_marker_with_wait_list(command_queue, 0, NULL, event);
> -error:
> -  return err;
> -}
> -
> -cl_int
> -clEnqueueMarkerWithWaitList(cl_command_queue command_queue,
> -    cl_uint num_events_in_wait_list,
> -    const cl_event *event_wait_list,
> -    cl_event *event)
> -{
> -  cl_int err = CL_SUCCESS;
> -  CHECK_QUEUE(command_queue);
> -
> -  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx);
> -
> -  cl_event_marker_with_wait_list(command_queue, num_events_in_wait_list, event_wait_list, event);
> -error:
> -  return err;
> -}
> -
> -cl_int
> -clEnqueueWaitForEvents(cl_command_queue  command_queue,
> -                       cl_uint           num_events,
> -                       const cl_event *  event_list)
> -{
> -  cl_int err = CL_SUCCESS;
> -  CHECK_QUEUE(command_queue);
> -  err = clWaitForEvents(num_events, event_list);
> -
> -error:
> -  return err;
> -}
> -
> -cl_int
> -clEnqueueBarrier(cl_command_queue  command_queue)
> -{
> -  cl_int err = CL_SUCCESS;
> -  CHECK_QUEUE(command_queue);
> -
> -  cl_event_barrier_with_wait_list(command_queue, 0, NULL, NULL);
> -
> -error:
> -  return err;
> -}
> -
> -cl_int
> -clEnqueueBarrierWithWaitList(cl_command_queue command_queue,
> -    cl_uint num_events_in_wait_list,
> -    const cl_event *event_wait_list,
> -    cl_event *event)
> -{
> -  cl_int err = CL_SUCCESS;
> -  CHECK_QUEUE(command_queue);
> -
> -  TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx);
> -
> -  cl_event_barrier_with_wait_list(command_queue, num_events_in_wait_list, event_wait_list, event);
> -error:
> -  return err;
> -}
> -
>  #define EXTFUNC(x)                      \
>    if (strcmp(#x, func_name) == 0)       \
>      return (void *)x;
> diff --git a/src/cl_api_kernel.c b/src/cl_api_kernel.c
> index a1075d7..ef494e6 100644
> --- a/src/cl_api_kernel.c
> +++ b/src/cl_api_kernel.c
> @@ -130,10 +130,19 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
>            }
>            realGroupSize *= fixed_local_sz[i];
>          }
> -        if (realGroupSize % 8 != 0)
> +
> +        //in a loop of a conformance test (such as test_api repeated_setup_cleanup), each iteration
> +        //creates a new context and a new command queue, and uses 'globalsize[0]=1000, localsize=NULL' to enqueue the kernel.
> +        //that triggers the following message many times.
> +        //to avoid too many messages, only print it the first time in the process.
> +        //just use a static variable, since it doesn't matter if it prints a few times in the multi-thread case.
> +        static int warn_no_good_localsize = 1;
> +        if (realGroupSize % 8 != 0 && warn_no_good_localsize) {
> +          warn_no_good_localsize = 0;
>            DEBUGP(DL_WARNING, "unable to find good values for
> local_work_size[i], please provide\n"
> -                             " local_work_size[] explicitly, you can find good values
> with\n"
> -                             " trial-and-error method.");
> +                 " local_work_size[] explicitly, you can find good values with\n"
> +                 " trial-and-error method.");
> +        }
>        }
>      }
> 
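About the fallback above: when the caller passes localsize=NULL, the driver greedily picks, per dimension, the largest divisor of global_work_size[i] that fits the remaining group-size budget, and now warns only once per process when the resulting group size is not a multiple of 8 (presumably the minimum SIMD width here). A compact sketch of that search under the same 64/256 limits; the names are illustrative, not the patched code:

    /* Pick lsz[i]: the largest j <= max_dim dividing gsz[i] that keeps
     * the whole work group within max_group threads. */
    static void
    pick_local_size(const size_t *gsz, size_t *lsz, unsigned dims)
    {
      unsigned max_dim = 64, max_group = 256;
      for (unsigned i = 0; i < dims; i++) {
        lsz[i] = 1;
        for (unsigned j = max_dim; j > 1; j--) {
          if (gsz[i] % j == 0 && j <= max_group) {
            lsz[i] = j;
            max_group /= j;
            if (max_dim > max_group)
              max_dim = max_group;
            break; /* next dimension */
          }
        }
      }
    }

For globalsize[0]=1000 this picks lsz[0]=50, whose product is not a multiple of 8, which is exactly the case the one-shot warning covers.
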
> @@ -253,10 +262,10 @@ clEnqueueNativeKernel(cl_command_queue command_queue,
> 
>      //Per spec, need copy args
>      if (cb_args) {
> -      new_args = CL_MALLOC(cb_args);
> +      new_args = cl_malloc(cb_args);
>        if (num_mem_objects) {
> -        new_args_mem_loc = CL_MALLOC(sizeof(void *) * num_mem_objects);
> -        new_mem_list = CL_MALLOC(sizeof(cl_mem) * num_mem_objects);
> +        new_args_mem_loc = cl_malloc(sizeof(void *) * num_mem_objects);
> +        new_mem_list = cl_malloc(sizeof(cl_mem) * num_mem_objects);
>          memcpy(new_mem_list, mem_list, sizeof(cl_mem) * num_mem_objects);
>        }
> 
> @@ -320,11 +329,11 @@ clEnqueueNativeKernel(cl_command_queue command_queue,
> 
>    if (err != CL_SUCCESS) {
>      if (new_args)
> -      CL_FREE(new_args);
> +      cl_free(new_args);
>      if (new_mem_list)
> -      CL_FREE(new_mem_list);
> +      cl_free(new_mem_list);
>      if (new_args_mem_loc)
> -      CL_FREE(new_args_mem_loc);
> +      cl_free(new_args_mem_loc);
>    }
> 
>    if (err == CL_SUCCESS && event) {
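On the arg copy in this function: clEnqueueNativeKernel must snapshot `args` because the application may reuse its buffer once the call returns, and every args_mem_loc[i] points *into* that buffer, so each entry has to be rebased onto the private copy (which is also why the new code allocates new_args_mem_loc instead of writing through the caller's const array, as the old cl_api.c version did). A sketch of the rebase with hypothetical names:

    /* Rebase interior pointers from the caller's args block onto our
     * private copy; loc[i] points somewhere inside old_args. */
    static void
    rebase_mem_locs(void **loc, cl_uint n,
                    const char *old_args, char *new_args)
    {
      cl_uint i;
      for (i = 0; i < n; i++)
        loc[i] = new_args + ((const char *)loc[i] - old_args);
    }
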
> diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
> index 8d3c6b0..54a487c 100644
> --- a/src/cl_command_queue.c
> +++ b/src/cl_command_queue.c
> @@ -45,14 +45,16 @@ cl_command_queue_new(cl_context ctx)
>    assert(ctx);
>    TRY_ALLOC_NO_ERR (queue, CALLOC(struct _cl_command_queue));
>    CL_OBJECT_INIT_BASE(queue, CL_OBJECT_COMMAND_QUEUE_MAGIC);
> +  cl_command_queue_init_enqueue(queue);
> 
> -  queue->cmrt_event = NULL;
>    if ((queue->thread_data = cl_thread_data_create()) == NULL) {
>      goto error;
>    }
> 
>    /* Append the command queue in the list */
>    cl_context_add_queue(ctx, queue);
> +  queue->ctx = ctx;
> +  queue->cmrt_event = NULL;
> 
>  exit:
>    return queue;
> @@ -69,6 +71,8 @@ cl_command_queue_delete(cl_command_queue queue)
>    if (CL_OBJECT_DEC_REF(queue) > 1)
>      return;
> 
> +  cl_command_queue_destroy_enqueue(queue);
> +
>  #ifdef HAS_CMRT
>    if (queue->cmrt_event != NULL)
>      cmrt_destroy_event(queue);
> @@ -76,7 +80,7 @@ cl_command_queue_delete(cl_command_queue queue)
> 
>    // If there is a list of valid events, we need to give them
>    // a chance to call the call-back function.
> -  cl_event_update_last_events(queue,1);
> +  //cl_event_update_last_events(queue,1);
> 
>    cl_thread_data_destroy(queue);
>    queue->thread_data = NULL;
> @@ -112,10 +116,9 @@ set_image_info(char *curbe,
>  }
> 
>  LOCAL cl_int
> -cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
> +cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k, cl_gpgpu gpgpu)
>  {
>    uint32_t i;
> -  GET_QUEUE_THREAD_GPGPU(queue);
> 
>    for (i = 0; i < k->image_sz; i++) {
>      int id = k->images[i].arg_idx;
> @@ -149,9 +152,9 @@ cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
>  }
> 
>  LOCAL cl_int
> -cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
> +cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k, cl_gpgpu gpgpu)
>  {
> -  GET_QUEUE_THREAD_GPGPU(queue);
> +  //GET_QUEUE_THREAD_GPGPU(queue);
> 
>    /* Bind all user buffers (given by clSetKernelArg) */
>    uint32_t i;
> @@ -175,7 +178,8 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
>    return CL_SUCCESS;
>  }
> 
> -extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, uint32_t, const size_t *, const size_t *, const size_t *);
> +extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, cl_event,
> +                                             uint32_t, const size_t *, const size_t *, const size_t *);
> 
>  static cl_int
>  cl_kernel_check_args(cl_kernel k)
> @@ -190,6 +194,7 @@ cl_kernel_check_args(cl_kernel k)
>  LOCAL cl_int
>  cl_command_queue_ND_range(cl_command_queue queue,
>                            cl_kernel k,
> +                          cl_event event,
>                            const uint32_t work_dim,
>                            const size_t *global_wk_off,
>                            const size_t *global_wk_sz,
> @@ -203,8 +208,10 @@ cl_command_queue_ND_range(cl_command_queue queue,
>    /* Check that the user did not forget any argument */
>    TRY (cl_kernel_check_args, k);
> 
> +
>    if (ver == 7 || ver == 75 || ver == 8 || ver == 9)
> -    TRY (cl_command_queue_ND_range_gen7, queue, k, work_dim, global_wk_off, global_wk_sz, local_wk_sz);
> +    TRY (cl_command_queue_ND_range_gen7, queue, k, event,
> +         work_dim, global_wk_off, global_wk_sz, local_wk_sz);
>    else
>      FATAL ("Unknown Gen Device");
> 
> @@ -213,7 +220,7 @@ error:
>  }
> 
>  LOCAL int
> -cl_command_queue_flush_gpgpu(cl_command_queue queue, cl_gpgpu gpgpu)
> +cl_command_queue_flush_gpgpu(cl_gpgpu gpgpu)
>  {
>    void* printf_info = cl_gpgpu_get_printf_info(gpgpu);
>    void* profiling_info;
> @@ -246,15 +253,15 @@ cl_command_queue_flush(cl_command_queue queue)
>  {
>    int err;
>    GET_QUEUE_THREAD_GPGPU(queue);
> -  err = cl_command_queue_flush_gpgpu(queue, gpgpu);
> +  err = cl_command_queue_flush_gpgpu(gpgpu);
>    // We now keep a list of uncompleted events and check if they compelte
>    // every flush. This can make sure all events created have chance to be
>    // update status, so the callback functions or reference can be handled.
> -  cl_event_update_last_events(queue,0);
> +  //cl_event_update_last_events(queue,0);
> 
>    cl_event current_event = get_current_event(queue);
>    if (current_event && err == CL_SUCCESS) {
> -    err = cl_event_flush(current_event);
> +    //err = cl_event_flush(current_event);
>      set_current_event(queue, NULL);
>    }
>    cl_invalid_thread_gpgpu(queue);
> @@ -265,7 +272,7 @@ LOCAL cl_int
>  cl_command_queue_finish(cl_command_queue queue)
>  {
>    cl_gpgpu_sync(cl_get_thread_batch_buf(queue));
> -  cl_event_update_last_events(queue,1);
> +  //cl_event_update_last_events(queue,1);
>    return CL_SUCCESS;
>  }
> 
> @@ -337,72 +344,69 @@ cl_command_queue_remove_event(cl_command_queue queue, cl_event event)
>    queue->wait_events_num -= 1;
>  }
> 
> -#define DEFAULT_WAIT_EVENTS_SIZE  16
>  LOCAL void
>  cl_command_queue_insert_barrier_event(cl_command_queue queue,
> cl_event event)
>  {
> -  cl_int i=0;
> -  cl_event *new_list;
> +  cl_int i = 0;
> +
> +  cl_event_add_ref(event);
> 
>    assert(queue != NULL);
> -  if(queue->barrier_events == NULL) {
> -    queue->barrier_events_size = DEFAULT_WAIT_EVENTS_SIZE;
> -    TRY_ALLOC_NO_ERR (queue->barrier_events, CALLOC_ARRAY(cl_event, queue->barrier_events_size));
> +  CL_OBJECT_LOCK(queue);
> +
> +  if (queue->barrier_events == NULL) {
> +    queue->barrier_events_size = 4;
> +    queue->barrier_events = cl_calloc(queue->barrier_events_size, sizeof(cl_event));
> +    assert(queue->barrier_events);
>    }
> 
> -  for(i=0; i<queue->barrier_events_num; i++) {
> -    if(queue->barrier_events[i] == event)
> -      return;   //is in the barrier_events, need to insert
> +  for (i = 0; i<queue->barrier_events_num; i++) {
> +    assert(queue->barrier_events[i] != event);
>    }
> 
>    if(queue->barrier_events_num < queue->barrier_events_size) {
>      queue->barrier_events[queue->barrier_events_num++] = event;
> +    CL_OBJECT_UNLOCK(queue);
>      return;
>    }
> 
> -  //barrier_events_num == barrier_events_size, array is full
> +  /* Array is full, double expand. */
>    queue->barrier_events_size *= 2;
> -  TRY_ALLOC_NO_ERR (new_list, CALLOC_ARRAY(cl_event, queue->barrier_events_size));
> -  memcpy(new_list, queue->barrier_events, sizeof(cl_event)*queue->barrier_events_num);
> -  cl_free(queue->barrier_events);
> -  queue->barrier_events = new_list;
> -  queue->barrier_events[queue->barrier_events_num++] = event;
> -  return;
> +  queue->barrier_events = cl_realloc(queue->barrier_events,
> +                                     queue->barrier_events_size * sizeof(cl_event));
> +  assert(queue->barrier_events);
> 
> -exit:
> +  queue->barrier_events[queue->barrier_events_num++] = event;
> +  CL_OBJECT_UNLOCK(queue);
>    return;
> -error:
> -  if(queue->barrier_events)
> -    cl_free(queue->barrier_events);
> -  queue->barrier_events = NULL;
> -  queue->barrier_events_size = 0;
> -  queue->barrier_events_num = 0;
> -  goto exit;
> -
>  }
> 
>  LOCAL void
>  cl_command_queue_remove_barrier_event(cl_command_queue queue, cl_event event)
>  {
> -  cl_int i=0;
> +  cl_int i = 0;
> +  assert(queue != NULL);
> 
> -  if(queue->barrier_events_num == 0)
> -    return;
> +  CL_OBJECT_LOCK(queue);
> 
> -  for(i=0; i<queue->barrier_events_num; i++) {
> +  assert(queue->barrier_events_num > 0);
> +  assert(queue->barrier_events);
> +
> +  for(i = 0; i < queue->barrier_events_num; i++) {
>      if(queue->barrier_events[i] == event)
>        break;
>    }
> +  assert(i < queue->barrier_events_num); // Must find it.
> 
> -  if(i == queue->barrier_events_num)
> -    return;
> -
> -  if(i == queue->barrier_events_num - 1) {
> +  if(i == queue->barrier_events_num - 1) { // The last one.
>      queue->barrier_events[i] = NULL;
>    } else {
> -    for(; i<queue->barrier_events_num-1; i++) {
> +    for(; i < queue->barrier_events_num - 1; i++) { // Move forward.
>        queue->barrier_events[i] = queue->barrier_events[i+1];
>      }
>    }
>    queue->barrier_events_num -= 1;
> +  CL_OBJECT_UNLOCK(queue);
> +
> +  cl_event_delete(event);
>  }
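The insert/remove pair now pins the event with add_ref/delete and does the array surgery under CL_OBJECT_LOCK, which closes the race the old unlocked version had. One caveat worth flagging: cl_realloc's result is assigned straight back to queue->barrier_events and only checked with assert(), so in a release build an allocation failure goes unnoticed. A minimal sketch of the safer append-with-doubling shape (hypothetical helper, plain realloc to stay self-contained):

    /* Append ev to a doubling array; report CL_OUT_OF_HOST_MEMORY
     * instead of asserting, keeping the old array valid on failure. */
    static cl_int
    append_event(cl_event **arr, cl_uint *num, cl_uint *size, cl_event ev)
    {
      if (*num == *size) {
        cl_uint nsz = *size ? *size * 2 : 4;
        cl_event *tmp = realloc(*arr, nsz * sizeof(cl_event));
        if (tmp == NULL)
          return CL_OUT_OF_HOST_MEMORY; /* *arr is still intact */
        *arr = tmp;
        *size = nsz;
      }
      (*arr)[(*num)++] = ev;
      return CL_SUCCESS;
    }
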
> diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
> index 34886f8..470cafb 100644
> --- a/src/cl_command_queue.h
> +++ b/src/cl_command_queue.h
> @@ -81,6 +81,7 @@ extern void cl_command_queue_add_ref(cl_command_queue);
>  /* Map ND range kernel from OCL API */
>  extern cl_int cl_command_queue_ND_range(cl_command_queue queue,
>                                          cl_kernel ker,
> +                                        cl_event event,
>                                          const uint32_t work_dim,
>                                          const size_t *global_work_offset,
>                                          const size_t *global_work_size,
> @@ -93,16 +94,16 @@ extern cl_int cl_command_queue_set_report_buffer(cl_command_queue, cl_mem);
>  extern cl_int cl_command_queue_flush(cl_command_queue);
> 
>  /* Flush for the specified gpgpu */
> -extern int cl_command_queue_flush_gpgpu(cl_command_queue, cl_gpgpu);
> +extern int cl_command_queue_flush_gpgpu(cl_gpgpu);
> 
>  /* Wait for the completion of the command queue */
>  extern cl_int cl_command_queue_finish(cl_command_queue);
> 
>  /* Bind all the surfaces in the GPGPU state */
> -extern cl_int cl_command_queue_bind_surface(cl_command_queue, cl_kernel);
> +extern cl_int cl_command_queue_bind_surface(cl_command_queue, cl_kernel, cl_gpgpu);
> 
>  /* Bind all the image surfaces in the GPGPU state */
> -extern cl_int cl_command_queue_bind_image(cl_command_queue, cl_kernel);
> +extern cl_int cl_command_queue_bind_image(cl_command_queue, cl_kernel, cl_gpgpu);
> 
>  /* Insert a user event to command's wait_events */
>  extern void cl_command_queue_insert_event(cl_command_queue, cl_event);
> diff --git a/src/cl_command_queue_enqueue.c b/src/cl_command_queue_enqueue.c
> index 1848d50..7bc6dd3 100644
> --- a/src/cl_command_queue_enqueue.c
> +++ b/src/cl_command_queue_enqueue.c
> @@ -18,7 +18,7 @@
>   */
> 
>  #include "cl_command_queue.h"
> -#include "cl_event_new.h"
> +#include "cl_event.h"
>  #include "cl_alloc.h"
>  #include <stdio.h>
> 
> @@ -203,7 +203,7 @@ cl_command_queue_record_in_queue_events(cl_command_queue queue, cl_uint *list_nu
>    }
>    assert(event_num > 0);
> 
> -  enqueued_list = CL_CALLOC(event_num, sizeof(cl_event));
> +  enqueued_list = cl_calloc(event_num, sizeof(cl_event));
>    assert(enqueued_list);
> 
>    i = 0;
> @@ -265,7 +265,7 @@ cl_command_queue_wait_flush(cl_command_queue queue)
>      cl_event_delete(enqueued_list[i]);
>    }
>    if (enqueued_list)
> -    CL_FREE(enqueued_list);
> +    cl_free(enqueued_list);
> 
>    return CL_SUCCESS;
>  }
> @@ -315,7 +315,7 @@ cl_command_queue_wait_finish(cl_command_queue queue)
>      cl_event_delete(enqueued_list[i]);
>    }
>    if (enqueued_list)
> -    CL_FREE(enqueued_list);
> +    cl_free(enqueued_list);
> 
>    return CL_SUCCESS;
>  }
> diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
> index b6a5920..5ad3b8b 100644
> --- a/src/cl_command_queue_gen7.c
> +++ b/src/cl_command_queue_gen7.c
> @@ -23,6 +23,7 @@
>  #include "cl_kernel.h"
>  #include "cl_device_id.h"
>  #include "cl_mem.h"
> +#include "cl_event.h"
>  #include "cl_utils.h"
>  #include "cl_alloc.h"
> 
> @@ -123,12 +124,12 @@ error:
>  }
> 
>  static int
> -cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
> +cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker, cl_gpgpu gpgpu)
>  {
>    /* calculate constant buffer size
>     * we need raw_size & aligned_size
>     */
> -  GET_QUEUE_THREAD_GPGPU(queue);
> +  //GET_QUEUE_THREAD_GPGPU(queue);
>    int32_t arg;
>    size_t offset = 0;
>    uint32_t raw_size = 0, aligned_size =0;
> @@ -331,12 +332,14 @@ cl_alloc_printf(cl_gpgpu gpgpu, cl_kernel ker, void* printf_info, int printf_num
>  LOCAL cl_int
>  cl_command_queue_ND_range_gen7(cl_command_queue queue,
>                                 cl_kernel ker,
> +                               cl_event event,
>                                 const uint32_t work_dim,
>                                 const size_t *global_wk_off,
>                                 const size_t *global_wk_sz,
>                                 const size_t *local_wk_sz)
>  {
> -  GET_QUEUE_THREAD_GPGPU(queue);
> +  //GET_QUEUE_THREAD_GPGPU(queue);
> +  cl_gpgpu gpgpu = cl_gpgpu_new(queue->ctx->drv);
>    cl_context ctx = queue->ctx;
>    char *final_curbe = NULL;  /* Includes them and one sub-buffer per group */
>    cl_gpgpu_kernel kernel;
> @@ -403,9 +406,9 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
>    }
> 
>    /* Bind user buffers */
> -  cl_command_queue_bind_surface(queue, ker);
> +  cl_command_queue_bind_surface(queue, ker, gpgpu);
>    /* Bind user images */
> -  if(UNLIKELY(err = cl_command_queue_bind_image(queue, ker) != CL_SUCCESS))
> +  if(UNLIKELY(err = cl_command_queue_bind_image(queue, ker, gpgpu) != CL_SUCCESS))
>      return err;
>    /* Bind all samplers */
>    if (ker->vme)
> @@ -419,7 +422,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
>    /* Bind a stack if needed */
>    cl_bind_stack(gpgpu, ker);
> 
> -  if (cl_upload_constant_buffer(queue, ker) != 0)
> +  if (cl_upload_constant_buffer(queue, ker, gpgpu) != 0)
>      goto error;
> 
>    cl_gpgpu_states_setup(gpgpu, &kernel);
> @@ -440,7 +443,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
>    batch_sz = cl_kernel_compute_batch_sz(ker);
>    if (cl_gpgpu_batch_reset(gpgpu, batch_sz) != 0)
>      goto error;
> -  cl_set_thread_batch_buf(queue, cl_gpgpu_ref_batch_buf(gpgpu));
> +  //cl_set_thread_batch_buf(queue, cl_gpgpu_ref_batch_buf(gpgpu));
>    cl_gpgpu_batch_start(gpgpu);
> 
>    /* Issue the GPGPU_WALKER command */
> @@ -448,6 +451,10 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
> 
>    /* Close the batch buffer and submit it */
>    cl_gpgpu_batch_end(gpgpu, 0);
> +
> +  event->exec_data.gpgpu = gpgpu;
> +  event->exec_data.type = EnqueueNDRangeKernel;
> +
>    return CL_SUCCESS;
> 
>  error:
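The important change in this file is ownership: each ND range now creates its own cl_gpgpu with cl_gpgpu_new() and parks it in event->exec_data instead of borrowing the queue's thread-local state, so the batch buffer's lifetime follows the event. If I read the series right, the queue worker then drives it through the status transitions, roughly like this (a caller-side sketch, not code from the patch):

    /* Hypothetical worker-side sequence for one enqueued ND range. */
    static void
    advance_ndrange_event(cl_event e)
    {
      cl_enqueue_handle(&e->exec_data, CL_SUBMITTED); /* flush the batch */
      cl_enqueue_handle(&e->exec_data, CL_COMPLETE);  /* sync, free gpgpu */
    }

One thing to double check: the error paths before `return CL_SUCCESS` leave the freshly created gpgpu alive, so they rely on cl_enqueue_delete() running for failed enqueues to avoid leaking it.
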
> diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
> index 54c0ffa..ac29ebe 100644
> --- a/src/cl_enqueue.c
> +++ b/src/cl_enqueue.c
> @@ -16,93 +16,101 @@
>   *
>   * Author: Rong Yang <rong.r.yang at intel.com>
>   */
> -#include <stdio.h>
> -#include <string.h>
> -#include <assert.h>
> -#include <pthread.h>
> 
> +//#include "cl_image.h"
>  #include "cl_enqueue.h"
> -#include "cl_image.h"
>  #include "cl_driver.h"
>  #include "cl_event.h"
>  #include "cl_command_queue.h"
>  #include "cl_utils.h"
> +#include "cl_alloc.h"
> +#include <stdio.h>
> +#include <string.h>
> +#include <assert.h>
> +#include <pthread.h>
> 
> -
> -cl_int cl_enqueue_read_buffer(enqueue_data* data)
> +static cl_int
> +cl_enqueue_read_buffer(enqueue_data *data, cl_int status)
>  {
>    cl_int err = CL_SUCCESS;
>    cl_mem mem = data->mem_obj;
> +
> +  if (status != CL_COMPLETE)
> +    return err;
> +
>    assert(mem->type == CL_MEM_BUFFER_TYPE ||
>           mem->type == CL_MEM_SUBBUFFER_TYPE);
> -  struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
> +  struct _cl_mem_buffer *buffer = (struct _cl_mem_buffer *)mem;
>    //cl_buffer_get_subdata sometime is very very very slow in linux kernel, in skl and chv,
>    //and it is randomly. So temporary disable it, use map/copy/unmap to read.
>    //Should re-enable it after find root cause.
>    if (0 && !mem->is_userptr) {
>      if (cl_buffer_get_subdata(mem->bo, data->offset + buffer->sub_offset,
> -			       data->size, data->ptr) != 0)
> +                              data->size, data->ptr) != 0)
>        err = CL_MAP_FAILURE;
>    } else {
> -    void* src_ptr = cl_mem_map_auto(mem, 0);
> +    void *src_ptr = cl_mem_map_auto(mem, 0);
>      if (src_ptr == NULL)
>        err = CL_MAP_FAILURE;
>      else {
>        //sometimes, application invokes read buffer, instead of map buffer, even if userptr is enabled
>        //memcpy is not necessary for this case
> -      if (data->ptr != (char*)src_ptr + data->offset + buffer->sub_offset)
> -        memcpy(data->ptr, (char*)src_ptr + data->offset + buffer->sub_offset, data->size);
> +      if (data->ptr != (char *)src_ptr + data->offset + buffer->sub_offset)
> +        memcpy(data->ptr, (char *)src_ptr + data->offset + buffer->sub_offset, data->size);
>        cl_mem_unmap_auto(mem);
>      }
>    }
>    return err;
>  }
> 
> -cl_int cl_enqueue_read_buffer_rect(enqueue_data* data)
> +static cl_int
> +cl_enqueue_read_buffer_rect(enqueue_data *data, cl_int status)
>  {
>    cl_int err = CL_SUCCESS;
> -  void* src_ptr;
> -  void* dst_ptr;
> +  void *src_ptr;
> +  void *dst_ptr;
> 
> -  const size_t* origin = data->origin;
> -  const size_t* host_origin = data->host_origin;
> -  const size_t* region = data->region;
> +  const size_t *origin = data->origin;
> +  const size_t *host_origin = data->host_origin;
> +  const size_t *region = data->region;
> 
>    cl_mem mem = data->mem_obj;
> +
> +  if (status != CL_COMPLETE)
> +    return err;
> +
>    assert(mem->type == CL_MEM_BUFFER_TYPE ||
>           mem->type == CL_MEM_SUBBUFFER_TYPE);
> -  struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
> +  struct _cl_mem_buffer *buffer = (struct _cl_mem_buffer *)mem;
> 
>    if (!(src_ptr = cl_mem_map_auto(mem, 0))) {
>      err = CL_MAP_FAILURE;
>      goto error;
>    }
> 
> -   size_t offset = origin[0] + data->row_pitch*origin[1] + data->slice_pitch*origin[2];
> -   src_ptr = (char*)src_ptr + offset +  buffer->sub_offset;
> -
> -   offset = host_origin[0] + data->host_row_pitch*host_origin[1] + data->host_slice_pitch*host_origin[2];
> -   dst_ptr = (char *)data->ptr + offset;
> -
> -   if (data->row_pitch == region[0] && data->row_pitch == data->host_row_pitch &&
> -       (region[2] == 1 || (data->slice_pitch == region[0]*region[1] && data->slice_pitch == data->host_slice_pitch)))
> -   {
> -     memcpy(dst_ptr, src_ptr, region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]);
> -   }
> -   else {
> -     cl_uint y, z;
> -     for (z = 0; z < region[2]; z++) {
> -       const char* src = src_ptr;
> -       char* dst = dst_ptr;
> -       for (y = 0; y < region[1]; y++) {
> -         memcpy(dst, src, region[0]);
> -         src += data->row_pitch;
> -         dst += data->host_row_pitch;
> -       }
> -       src_ptr = (char*)src_ptr + data->slice_pitch;
> -       dst_ptr = (char*)dst_ptr + data->host_slice_pitch;
> -     }
> -   }
> +  size_t offset = origin[0] + data->row_pitch * origin[1] + data->slice_pitch * origin[2];
> +  src_ptr = (char *)src_ptr + offset + buffer->sub_offset;
> +
> +  offset = host_origin[0] + data->host_row_pitch * host_origin[1] + data->host_slice_pitch * host_origin[2];
> +  dst_ptr = (char *)data->ptr + offset;
> +
> +  if (data->row_pitch == region[0] && data->row_pitch == data->host_row_pitch &&
> +      (region[2] == 1 || (data->slice_pitch == region[0] * region[1] && data->slice_pitch == data->host_slice_pitch))) {
> +    memcpy(dst_ptr, src_ptr, region[2] == 1 ? data->row_pitch * region[1] : data->slice_pitch * region[2]);
> +  } else {
> +    cl_uint y, z;
> +    for (z = 0; z < region[2]; z++) {
> +      const char *src = src_ptr;
> +      char *dst = dst_ptr;
> +      for (y = 0; y < region[1]; y++) {
> +        memcpy(dst, src, region[0]);
> +        src += data->row_pitch;
> +        dst += data->host_row_pitch;
> +      }
> +      src_ptr = (char *)src_ptr + data->slice_pitch;
> +      dst_ptr = (char *)dst_ptr + data->host_slice_pitch;
> +    }
> +  }
> 
>    err = cl_mem_unmap_auto(mem);
> 
> @@ -110,75 +118,80 @@ error:
>    return err;
>  }
> 
> -cl_int cl_enqueue_write_buffer(enqueue_data *data)
> +static cl_int
> +cl_enqueue_write_buffer(enqueue_data *data, cl_int status)
>  {
>    cl_int err = CL_SUCCESS;
>    cl_mem mem = data->mem_obj;
>    assert(mem->type == CL_MEM_BUFFER_TYPE ||
>           mem->type == CL_MEM_SUBBUFFER_TYPE);
> -  struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
> +  struct _cl_mem_buffer *buffer = (struct _cl_mem_buffer *)mem;
> +
> +  if (status != CL_COMPLETE)
> +    return err;
> 
>    if (mem->is_userptr) {
> -    void* dst_ptr = cl_mem_map_auto(mem, 1);
> +    void *dst_ptr = cl_mem_map_auto(mem, 1);
>      if (dst_ptr == NULL)
>        err = CL_MAP_FAILURE;
>      else {
> -      memcpy((char*)dst_ptr + data->offset + buffer->sub_offset, data->const_ptr, data->size);
> +      memcpy((char *)dst_ptr + data->offset + buffer->sub_offset, data->const_ptr, data->size);
>        cl_mem_unmap_auto(mem);
>      }
> -  }
> -  else {
> +  } else {
>      if (cl_buffer_subdata(mem->bo, data->offset + buffer->sub_offset,
> -			   data->size, data->const_ptr) != 0)
> +                          data->size, data->const_ptr) != 0)
>        err = CL_MAP_FAILURE;
>    }
> 
>    return err;
>  }
> 
> -cl_int cl_enqueue_write_buffer_rect(enqueue_data *data)
> +static cl_int
> +cl_enqueue_write_buffer_rect(enqueue_data *data, cl_int status)
>  {
>    cl_int err = CL_SUCCESS;
> -  void* src_ptr;
> -  void* dst_ptr;
> +  void *src_ptr;
> +  void *dst_ptr;
> 
> -  const size_t* origin = data->origin;
> -  const size_t* host_origin = data->host_origin;
> -  const size_t* region = data->region;
> +  const size_t *origin = data->origin;
> +  const size_t *host_origin = data->host_origin;
> +  const size_t *region = data->region;
> 
>    cl_mem mem = data->mem_obj;
>    assert(mem->type == CL_MEM_BUFFER_TYPE ||
>           mem->type == CL_MEM_SUBBUFFER_TYPE);
> -  struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
> +  struct _cl_mem_buffer *buffer = (struct _cl_mem_buffer *)mem;
> +
> +  if (status != CL_COMPLETE)
> +    return err;
> 
>    if (!(dst_ptr = cl_mem_map_auto(mem, 1))) {
>      err = CL_MAP_FAILURE;
>      goto error;
>    }
> 
> -  size_t offset = origin[0] + data->row_pitch*origin[1] + data->slice_pitch*origin[2];
> +  size_t offset = origin[0] + data->row_pitch * origin[1] + data->slice_pitch * origin[2];
>    dst_ptr = (char *)dst_ptr + offset + buffer->sub_offset;
> 
> -  offset = host_origin[0] + data->host_row_pitch*host_origin[1] + data->host_slice_pitch*host_origin[2];
> -  src_ptr = (char*)data->const_ptr + offset;
> +  offset = host_origin[0] + data->host_row_pitch * host_origin[1] + data->host_slice_pitch * host_origin[2];
> +  src_ptr = (char *)data->const_ptr + offset;
> 
>    if (data->row_pitch == region[0] && data->row_pitch == data->host_row_pitch &&
> -      (region[2] == 1 || (data->slice_pitch == region[0]*region[1] && data->slice_pitch == data->host_slice_pitch)))
> -  {
> -    memcpy(dst_ptr, src_ptr, region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]);
> -  }
> -  else {
> +      (region[2] == 1 || (data->slice_pitch == region[0] * region[1] && data->slice_pitch == data->host_slice_pitch))) {
> +    memcpy(dst_ptr, src_ptr, region[2] == 1 ? data->row_pitch * region[1] : data->slice_pitch * region[2]);
> +  } else {
>      cl_uint y, z;
>      for (z = 0; z < region[2]; z++) {
> -      const char* src = src_ptr;
> -      char* dst = dst_ptr;
> +      const char *src = src_ptr;
> +      char *dst = dst_ptr;
>        for (y = 0; y < region[1]; y++) {
>          memcpy(dst, src, region[0]);
>          src += data->host_row_pitch;
>          dst += data->row_pitch;
>        }
> -      src_ptr = (char*)src_ptr + data->host_slice_pitch;
> -      dst_ptr = (char*)dst_ptr + data->slice_pitch;
> +      src_ptr = (char *)src_ptr + data->host_slice_pitch;
> +      dst_ptr = (char *)dst_ptr + data->slice_pitch;
>      }
>    }
> 
> @@ -188,16 +201,19 @@ error:
>    return err;
>  }
> 
> -
> -cl_int cl_enqueue_read_image(enqueue_data *data)
> +static cl_int
> +cl_enqueue_read_image(enqueue_data *data, cl_int status)
>  {
>    cl_int err = CL_SUCCESS;
> -  void* src_ptr;
> +  void *src_ptr;
> 
>    cl_mem mem = data->mem_obj;
>    CHECK_IMAGE(mem, image);
> -  const size_t* origin = data->origin;
> -  const size_t* region = data->region;
> +  const size_t *origin = data->origin;
> +  const size_t *region = data->region;
> +
> +  if (status != CL_COMPLETE)
> +    return err;
> 
>    if (!(src_ptr = cl_mem_map_auto(mem, 0))) {
>      err = CL_MAP_FAILURE;
> @@ -208,40 +224,42 @@ cl_int cl_enqueue_read_image(enqueue_data *data)
>    src_ptr = (char*)src_ptr + offset;
> 
>    if (!origin[0] && region[0] == image->w && data->row_pitch == image->row_pitch &&
> -      (region[2] == 1 || (!origin[1] && region[1] == image->h && data->slice_pitch == image->slice_pitch)))
> -  {
> -    memcpy(data->ptr, src_ptr, region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]);
> -  }
> -  else {
> +      (region[2] == 1 || (!origin[1] && region[1] == image->h && data->slice_pitch == image->slice_pitch))) {
> +    memcpy(data->ptr, src_ptr, region[2] == 1 ? data->row_pitch * region[1] : data->slice_pitch * region[2]);
> +  } else {
>      cl_uint y, z;
>      for (z = 0; z < region[2]; z++) {
> -      const char* src = src_ptr;
> -      char* dst = data->ptr;
> +      const char *src = src_ptr;
> +      char *dst = data->ptr;
>        for (y = 0; y < region[1]; y++) {
> -        memcpy(dst, src, image->bpp*region[0]);
> +        memcpy(dst, src, image->bpp * region[0]);
>          src += image->row_pitch;
>          dst += data->row_pitch;
>        }
> -      src_ptr = (char*)src_ptr + image->slice_pitch;
> -      data->ptr = (char*)data->ptr + data->slice_pitch;
> +      src_ptr = (char *)src_ptr + image->slice_pitch;
> +      data->ptr = (char *)data->ptr + data->slice_pitch;
>      }
>    }
> 
> - err = cl_mem_unmap_auto(mem);
> +  err = cl_mem_unmap_auto(mem);
> 
>  error:
>    return err;
> -
>  }
> 
> -cl_int cl_enqueue_write_image(enqueue_data *data)
> +static cl_int
> +cl_enqueue_write_image(enqueue_data *data, cl_int status)
>  {
>    cl_int err = CL_SUCCESS;
> -  void* dst_ptr;
> +  void *dst_ptr;
> 
>    cl_mem mem = data->mem_obj;
> +
>    CHECK_IMAGE(mem, image);
> 
> +  if (status != CL_COMPLETE)
> +    return err;
> +
>    if (!(dst_ptr = cl_mem_map_auto(mem, 1))) {
>      err = CL_MAP_FAILURE;
>      goto error;
> @@ -255,45 +273,57 @@ cl_int cl_enqueue_write_image(enqueue_data *data)
> 
>  error:
>    return err;
> -
>  }
> 
> -cl_int cl_enqueue_map_buffer(enqueue_data *data)
> +static cl_int
> +cl_enqueue_map_buffer(enqueue_data *data, cl_int status)
>  {
>    void *ptr = NULL;
>    cl_int err = CL_SUCCESS;
>    cl_mem mem = data->mem_obj;
>    assert(mem->type == CL_MEM_BUFFER_TYPE ||
>           mem->type == CL_MEM_SUBBUFFER_TYPE);
> -  struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
> +  struct _cl_mem_buffer *buffer = (struct _cl_mem_buffer *)mem;
> 
> -  if (mem->is_userptr)
> -    ptr = cl_mem_map_auto(mem, data->write_map ? 1 : 0);
> -  else {
> -    if(data->unsync_map == 1)
> -      //because using unsync map in clEnqueueMapBuffer, so force use map_gtt here
> -      ptr = cl_mem_map_gtt(mem);
> -    else
> +  if (status == CL_SUBMITTED) {
> +    if (buffer->base.is_userptr) {
> +      ptr = buffer->base.host_ptr;
> +    } else {
> +      if ((ptr = cl_mem_map_gtt_unsync(&buffer->base)) == NULL) {
> +        err = CL_MAP_FAILURE;
> +        return err;
> +      }
> +    }
> +    data->ptr = ptr;
> +  } else if (status == CL_COMPLETE) {
> +    if (mem->is_userptr)
>        ptr = cl_mem_map_auto(mem, data->write_map ? 1 : 0);
> -  }
> +    else {
> +      if (data->unsync_map == 1)
> +        //because using unsync map in clEnqueueMapBuffer, so force use map_gtt here
> +        ptr = cl_mem_map_gtt(mem);
> +      else
> +        ptr = cl_mem_map_auto(mem, data->write_map ? 1 : 0);
> +    }
> 
> -  if (ptr == NULL) {
> -    err = CL_MAP_FAILURE;
> -    goto error;
> -  }
> -  data->ptr = ptr;
> +    if (ptr == NULL) {
> +      err = CL_MAP_FAILURE;
> +      return err;
> +    }
> +    data->ptr = ptr;
> 
> -  if((mem->flags & CL_MEM_USE_HOST_PTR) && !mem->is_userptr) {
> -    assert(mem->host_ptr);
> -    ptr = (char*)ptr + data->offset + buffer->sub_offset;
> -    memcpy(mem->host_ptr + data->offset + buffer->sub_offset, ptr, data->size);
> +    if ((mem->flags & CL_MEM_USE_HOST_PTR) && !mem->is_userptr) {
> +      assert(mem->host_ptr);
> +      ptr = (char *)ptr + data->offset + buffer->sub_offset;
> +      memcpy(mem->host_ptr + data->offset + buffer->sub_offset, ptr, data->size);
> +    }
>    }
> 
> -error:
>    return err;
>  }
> 
> -cl_int cl_enqueue_map_image(enqueue_data *data)
> +static cl_int
> +cl_enqueue_map_image(enqueue_data *data, cl_int status)
>  {
>    cl_int err = CL_SUCCESS;
>    cl_mem mem = data->mem_obj;
> @@ -301,46 +331,59 @@ cl_int cl_enqueue_map_image(enqueue_data *data)
>    size_t row_pitch = 0;
>    CHECK_IMAGE(mem, image);
> 
> -  if(data->unsync_map == 1)
> -    //because using unsync map in clEnqueueMapBuffer, so force use map_gtt here
> -    ptr = cl_mem_map_gtt(mem);
> -  else
> -    ptr = cl_mem_map_auto(mem, data->write_map ? 1 : 0);
> +  if (status == CL_SUBMITTED) {
> +    if ((ptr = cl_mem_map_gtt_unsync(mem)) == NULL) {
> +      err = CL_MAP_FAILURE;
> +      goto error;
> +    }
> +    data->ptr = ptr;
> +  } else if (status == CL_COMPLETE) {
> +    if (data->unsync_map == 1)
> +      //because using unsync map in clEnqueueMapBuffer, so force use map_gtt here
> +      ptr = cl_mem_map_gtt(mem);
> +    else
> +      ptr = cl_mem_map_auto(mem, data->write_map ? 1 : 0);
> +
> +    if (ptr == NULL) {
> +      err = CL_MAP_FAILURE;
> +      goto error;
> +    }
> 
> -  if (ptr == NULL) {
> -    err = CL_MAP_FAILURE;
> -    goto error;
> -  }
> -  data->ptr = (char*)ptr + image->offset;
> -  if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
> -    row_pitch = image->slice_pitch;
> -  else
> -    row_pitch = image->row_pitch;
> -
> -  if(mem->flags & CL_MEM_USE_HOST_PTR) {
> -    assert(mem->host_ptr);
> -    if (!mem->is_userptr)
> -      //src and dst need add offset in function cl_mem_copy_image_region
> -      cl_mem_copy_image_region(data->origin, data->region,
> -                             mem->host_ptr, image->host_row_pitch, image->host_slice_pitch,
> -                             data->ptr, row_pitch, image->slice_pitch, image, CL_TRUE, CL_TRUE);
> +    data->ptr = (char*)ptr + image->offset;
> +    if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
> +      row_pitch = image->slice_pitch;
> +    else
> +      row_pitch = image->row_pitch;
> +
> +    if(mem->flags & CL_MEM_USE_HOST_PTR) {
> +      assert(mem->host_ptr);
> +      if (!mem->is_userptr)
> +        //src and dst need add offset in function cl_mem_copy_image_region
> +        cl_mem_copy_image_region(data->origin, data->region,
> +                                 mem->host_ptr, image->host_row_pitch, image->host_slice_pitch,
> +                                 data->ptr, row_pitch, image->slice_pitch, image, CL_TRUE, CL_TRUE);
> +    }
>    }
> 
>  error:
>    return err;
>  }
> 
> -cl_int cl_enqueue_unmap_mem_object(enqueue_data *data)
> +static cl_int
> +cl_enqueue_unmap_mem_object(enqueue_data *data, cl_int status)
>  {
>    cl_int err = CL_SUCCESS;
>    int i, j;
>    size_t mapped_size = 0;
>    size_t origin[3], region[3];
> -  void * v_ptr = NULL;
> -  void * mapped_ptr = data->ptr;
> +  void *v_ptr = NULL;
> +  void *mapped_ptr = data->ptr;
>    cl_mem memobj = data->mem_obj;
>    size_t row_pitch = 0;
> 
> +  if (status != CL_COMPLETE)
> +    return err;
> +
>    assert(memobj->mapped_ptr_sz >= memobj->map_ref);
>    INVALID_VALUE_IF(!mapped_ptr);
>    for (i = 0; i < memobj->mapped_ptr_sz; i++) {
> @@ -348,7 +391,7 @@ cl_int cl_enqueue_unmap_mem_object(enqueue_data *data)
>        memobj->mapped_ptr[i].ptr = NULL;
>        mapped_size = memobj->mapped_ptr[i].size;
>        v_ptr = memobj->mapped_ptr[i].v_ptr;
> -      for(j=0; j<3; j++) {
> +      for (j = 0; j < 3; j++) {
>          region[j] = memobj->mapped_ptr[i].region[j];
>          origin[j] = memobj->mapped_ptr[i].origin[j];
>          memobj->mapped_ptr[i].region[j] = 0;
> @@ -364,10 +407,10 @@ cl_int cl_enqueue_unmap_mem_object(enqueue_data *data)
>    INVALID_VALUE_IF(i == memobj->mapped_ptr_sz);
> 
>    if (memobj->flags & CL_MEM_USE_HOST_PTR) {
> -    if(memobj->type == CL_MEM_BUFFER_TYPE ||
> -       memobj->type == CL_MEM_SUBBUFFER_TYPE) {
> +    if (memobj->type == CL_MEM_BUFFER_TYPE ||
> +        memobj->type == CL_MEM_SUBBUFFER_TYPE) {
>        assert(mapped_ptr >= memobj->host_ptr &&
> -        mapped_ptr + mapped_size <= memobj->host_ptr + memobj->size);
> +             mapped_ptr + mapped_size <= memobj->host_ptr + memobj->size);
>        /* Sync the data. */
>        if (!memobj->is_userptr)
>          memcpy(v_ptr, mapped_ptr, mapped_size);
> @@ -381,8 +424,8 @@ cl_int cl_enqueue_unmap_mem_object(enqueue_data *data)
>        if (!memobj->is_userptr)
>          //v_ptr have added offset, host_ptr have not added offset.
>          cl_mem_copy_image_region(origin, region, v_ptr, row_pitch, image->slice_pitch,
> -                               memobj->host_ptr, image->host_row_pitch, image->host_slice_pitch,
> -                               image, CL_FALSE, CL_TRUE);
> +                                 memobj->host_ptr, image->host_row_pitch, image->host_slice_pitch,
> +                                 image, CL_FALSE, CL_TRUE);
>      }
>    } else {
>      assert(v_ptr == mapped_ptr);
> @@ -391,24 +434,24 @@ cl_int cl_enqueue_unmap_mem_object(enqueue_data *data)
>    cl_mem_unmap_auto(memobj);
> 
>    /* shrink the mapped slot. */
> -  if (memobj->mapped_ptr_sz/2 > memobj->map_ref) {
> +  if (memobj->mapped_ptr_sz / 2 > memobj->map_ref) {
>      int j = 0;
>      cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc(
> -                             sizeof(cl_mapped_ptr) * (memobj->mapped_ptr_sz/2));
> +      sizeof(cl_mapped_ptr) * (memobj->mapped_ptr_sz / 2));
>      if (!new_ptr) {
>        /* Just do nothing. */
>        goto error;
>      }
> -    memset(new_ptr, 0, (memobj->mapped_ptr_sz/2) * sizeof(cl_mapped_ptr));
> +    memset(new_ptr, 0, (memobj->mapped_ptr_sz / 2) * sizeof(cl_mapped_ptr));
> 
>      for (i = 0; i < memobj->mapped_ptr_sz; i++) {
>        if (memobj->mapped_ptr[i].ptr) {
>          new_ptr[j] = memobj->mapped_ptr[i];
>          j++;
> -        assert(j < memobj->mapped_ptr_sz/2);
> +        assert(j < memobj->mapped_ptr_sz / 2);
>        }
>      }
> -    memobj->mapped_ptr_sz = memobj->mapped_ptr_sz/2;
> +    memobj->mapped_ptr_sz = memobj->mapped_ptr_sz / 2;
>      free(memobj->mapped_ptr);
>      memobj->mapped_ptr = new_ptr;
>    }
> @@ -417,7 +460,8 @@ error:
>    return err;
>  }
> 
> -cl_int cl_enqueue_native_kernel(enqueue_data *data)
> +static cl_int
> +cl_enqueue_native_kernel(enqueue_data *data, cl_int status)
>  {
>    cl_int err = CL_SUCCESS;
>    cl_uint num_mem_objects = (cl_uint)data->offset;
> @@ -425,18 +469,19 @@ cl_int cl_enqueue_native_kernel(enqueue_data
> *data)
>    const void **args_mem_loc = (const void **)data->const_ptr;
>    cl_uint i;
> 
> -  for (i=0; i<num_mem_objects; ++i)
> -  {
> -      const cl_mem buffer = mem_list[i];
> -      CHECK_MEM(buffer);
> +  if (status != CL_COMPLETE)
> +    return err;
> +
> +  for (i = 0; i < num_mem_objects; ++i) {
> +    const cl_mem buffer = mem_list[i];
> +    CHECK_MEM(buffer);
> 
> -      *((void **)args_mem_loc[i]) = cl_mem_map_auto(buffer, 0);
> +    *((void **)args_mem_loc[i]) = cl_mem_map_auto(buffer, 0);
>    }
>    data->user_func(data->ptr);
> 
> -  for (i=0; i<num_mem_objects; ++i)
> -  {
> -      cl_mem_unmap_auto(mem_list[i]);
> +  for (i = 0; i < num_mem_objects; ++i) {
> +    cl_mem_unmap_auto(mem_list[i]);
>    }
> 
>    free(data->ptr);
> @@ -444,46 +489,115 @@ error:
>    return err;
>  }
> 
> -cl_int cl_enqueue_handle(cl_event event, enqueue_data* data)
> +static cl_int
> +cl_enqueue_ndrange(enqueue_data *data, cl_int status)
>  {
> -  /* if need profiling, add the submit timestamp here. */
> -  if (event && event->type != CL_COMMAND_USER
> -           && event->queue->props & CL_QUEUE_PROFILING_ENABLE) {
> -    cl_event_get_timestamp(event, CL_PROFILING_COMMAND_SUBMIT);
> +  cl_int err = CL_SUCCESS;
> +
> +  if (status == CL_SUBMITTED) {
> +    err = cl_command_queue_flush_gpgpu(data->gpgpu);
> +  } else if (status == CL_COMPLETE) {
> +    void *batch_buf = cl_gpgpu_ref_batch_buf(data->gpgpu);
> +    cl_gpgpu_sync(batch_buf);
> +    cl_gpgpu_unref_batch_buf(batch_buf);
> +    /* Finished, we can release the gpgpu now. */
> +    cl_gpgpu_delete(data->gpgpu);
> +    data->gpgpu = NULL;
>    }
> 
> -  switch(data->type) {
> -    case EnqueueReadBuffer:
> -      return cl_enqueue_read_buffer(data);
> -    case EnqueueReadBufferRect:
> -      return cl_enqueue_read_buffer_rect(data);
> -    case EnqueueWriteBuffer:
> -      return cl_enqueue_write_buffer(data);
> -    case EnqueueWriteBufferRect:
> -      return cl_enqueue_write_buffer_rect(data);
> -    case EnqueueReadImage:
> -      return cl_enqueue_read_image(data);
> -    case EnqueueWriteImage:
> -      return cl_enqueue_write_image(data);
> -    case EnqueueMapBuffer:
> -      return cl_enqueue_map_buffer(data);
> -    case EnqueueMapImage:
> -      return cl_enqueue_map_image(data);
> -    case EnqueueUnmapMemObject:
> -      return cl_enqueue_unmap_mem_object(data);
> -    case EnqueueCopyBufferRect:
> -    case EnqueueCopyBuffer:
> -    case EnqueueCopyImage:
> -    case EnqueueCopyBufferToImage:
> -    case EnqueueCopyImageToBuffer:
> -    case EnqueueNDRangeKernel:
> -    case EnqueueFillBuffer:
> -    case EnqueueFillImage:
> -      return cl_event_flush(event);
> -    case EnqueueNativeKernel:
> -      return cl_enqueue_native_kernel(data);
> -    case EnqueueMigrateMemObj:
> -    default:
> -      return CL_SUCCESS;
> +  return err;
> +}
> +
> +static cl_int
> +cl_enqueue_marker_or_barrier(enqueue_data *data, cl_int status)
> +{
> +  return CL_COMPLETE;
> +}
> +
> +LOCAL void
> +cl_enqueue_delete(enqueue_data *data)
> +{
> +  if (data == NULL)
> +    return;
> +
> +  if (data->type == EnqueueCopyBufferRect ||
> +      data->type == EnqueueCopyBuffer ||
> +      data->type == EnqueueCopyImage ||
> +      data->type == EnqueueCopyBufferToImage ||
> +      data->type == EnqueueCopyImageToBuffer ||
> +      data->type == EnqueueNDRangeKernel ||
> +      data->type == EnqueueFillBuffer ||
> +      data->type == EnqueueFillImage) {
> +    if (data->gpgpu) {
> +      cl_gpgpu_delete(data->gpgpu);
> +      data->gpgpu = NULL;
> +    }
> +    return;
> +  }
> +
> +  if (data->type == EnqueueNativeKernel) {
> +    if (data->mem_list) {
> +      cl_free((void*)data->mem_list);
> +      data->mem_list = NULL;
> +    }
> +    if (data->ptr) {
> +      cl_free((void*)data->ptr);
> +      data->ptr = NULL;
> +    }
> +    if (data->const_ptr) {
> +      cl_free((void*)data->const_ptr);
> +      data->const_ptr = NULL;
> +    }
> +  }
> +}
> +
> +LOCAL cl_int
> +cl_enqueue_handle(enqueue_data *data, cl_int status)
> +{
> +  /* if need profiling, add the submit timestamp here. */
> +  //  if (event && event->event_type != CL_COMMAND_USER &&
> +  //      event->queue->props & CL_QUEUE_PROFILING_ENABLE) {
> +  //    cl_event_get_timestamp(event,
> CL_PROFILING_COMMAND_SUBMIT);
> +  //  }
> +
> +  switch (data->type) {
> +  case EnqueueReturnSuccesss:
> +    return CL_SUCCESS;
> +  case EnqueueReadBuffer:
> +    return cl_enqueue_read_buffer(data, status);
> +  case EnqueueReadBufferRect:
> +    return cl_enqueue_read_buffer_rect(data, status);
> +  case EnqueueWriteBuffer:
> +    return cl_enqueue_write_buffer(data, status);
> +  case EnqueueWriteBufferRect:
> +    return cl_enqueue_write_buffer_rect(data, status);
> +  case EnqueueReadImage:
> +    return cl_enqueue_read_image(data, status);
> +  case EnqueueWriteImage:
> +    return cl_enqueue_write_image(data, status);
> +  case EnqueueMapBuffer:
> +    return cl_enqueue_map_buffer(data, status);
> +  case EnqueueMapImage:
> +    return cl_enqueue_map_image(data, status);
> +  case EnqueueUnmapMemObject:
> +    return cl_enqueue_unmap_mem_object(data, status);
> +  case EnqueueMarker:
> +  case EnqueueBarrier:
> +    return cl_enqueue_marker_or_barrier(data, status);
> +  case EnqueueCopyBufferRect:
> +  case EnqueueCopyBuffer:
> +  case EnqueueCopyImage:
> +  case EnqueueCopyBufferToImage:
> +  case EnqueueCopyImageToBuffer:
> +  case EnqueueNDRangeKernel:
> +  case EnqueueFillBuffer:
> +  case EnqueueFillImage:
> +    //return cl_event_flush(event);
> +    return cl_enqueue_ndrange(data, status);
> +  case EnqueueNativeKernel:
> +    return cl_enqueue_native_kernel(data, status);
> +  case EnqueueMigrateMemObj:
> +  default:
> +    return CL_SUCCESS;
>    }
>  }
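
A note for readers following the new flow: the added status argument makes cl_enqueue_handle re-entrant per state transition, so the queue worker can call it once when an event moves to CL_SUBMITTED or CL_RUNNING and again at CL_COMPLETE. A minimal sketch of that contract (the helper below is illustrative only, not part of the patch):

  /* Hypothetical driver: push one event's exec_data to target_status.
     Mirrors what cl_event_exec() does later in this patch. */
  static cl_int drive_to(cl_event e, cl_int target_status)
  {
    cl_int err = cl_enqueue_handle(&e->exec_data, target_status);
    if (err != CL_SUCCESS)
      return cl_event_set_status(e, err); /* record the failure */
    return cl_event_set_status(e, target_status);
  }
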
> diff --git a/src/cl_enqueue.h b/src/cl_enqueue.h
> index 09305af..f8fff9d 100644
> --- a/src/cl_enqueue.h
> +++ b/src/cl_enqueue.h
> @@ -24,7 +24,8 @@
>  #include "CL/cl.h"
> 
>  typedef enum {
> -  EnqueueReadBuffer = 0,
> +  EnqueueReturnSuccesss = 0, /* For some cases there is nothing to do, just return CL_SUCCESS. */
> +  EnqueueReadBuffer,
>    EnqueueReadBufferRect,
>    EnqueueWriteBuffer,
>    EnqueueWriteBufferRect,
> @@ -49,26 +50,29 @@ typedef enum {
>  } enqueue_type;
> 
>  typedef struct _enqueue_data {
> -  enqueue_type      type;             /* Command type */
> -  cl_mem            mem_obj;          /* Enqueue's cl_mem */
> -  cl_command_queue  queue;            /* Command queue */
> -  size_t            offset;           /* Mem object's offset */
> -  size_t            size;             /* Size */
> -  size_t            origin[3];        /* Origin */
> -  size_t            host_origin[3];   /* Origin */
> -  size_t            region[3];        /* Region */
> -  size_t            row_pitch;        /* Row pitch */
> -  size_t            slice_pitch;      /* Slice pitch */
> -  size_t            host_row_pitch;   /* Host row pitch, used in read/write buffer rect */
> -  size_t            host_slice_pitch; /* Host slice pitch, used in read/write buffer rect */
> -  const void *      const_ptr;        /* Const ptr for memory read */
> -  void *            ptr;              /* Ptr for write and return value */
> -  const cl_mem*     mem_list;         /* mem_list of clEnqueueNativeKernel */
> -  uint8_t           unsync_map;       /* Indicate the clEnqueueMapBuffer/Image is unsync map */
> -  uint8_t           write_map;        /* Indicate if the clEnqueueMapBuffer is write enable */
> -  void (*user_func)(void *);          /* pointer to a host-callable user function */
> +  enqueue_type type;         /* Command type */
> +  cl_mem mem_obj;            /* Enqueue's cl_mem */
> +  cl_command_queue queue;    /* Command queue */
> +  size_t offset;             /* Mem object's offset */
> +  size_t size;               /* Size */
> +  size_t origin[3];          /* Origin */
> +  size_t host_origin[3];     /* Host origin */
> +  size_t region[3];          /* Region */
> +  size_t row_pitch;          /* Row pitch */
> +  size_t slice_pitch;        /* Slice pitch */
> +  size_t host_row_pitch;     /* Host row pitch, used in read/write buffer rect */
> +  size_t host_slice_pitch;   /* Host slice pitch, used in read/write buffer rect */
> +  const void *const_ptr;     /* Const ptr for memory read */
> +  void *ptr;                 /* Ptr for write and return value */
> +  const cl_mem *mem_list;    /* mem_list of clEnqueueNativeKernel */
> +  uint8_t unsync_map;        /* Indicates an unsynchronized clEnqueueMapBuffer/Image */
> +  uint8_t write_map;         /* Indicates whether the clEnqueueMapBuffer is write-enabled */
> +  void (*user_func)(void *); /* pointer to a host-callable user function */
> +  cl_gpgpu gpgpu;
>  } enqueue_data;
> 
>  /* Do real enqueue commands */
> -cl_int cl_enqueue_handle(cl_event event, enqueue_data* data);
> +extern cl_int cl_enqueue_handle(enqueue_data *data, cl_int status);
> +extern void cl_enqueue_delete(enqueue_data *data);
> +
>  #endif /* __CL_ENQUEUE_H__ */
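
For context, with this header the API layer no longer executes anything itself; it only records the request in exec_data for the worker to replay. A sketch of what a read-buffer enqueue would stash (field names are from the struct above; the wrapper function itself is made up):

  #include <string.h>
  #include "cl_enqueue.h"

  /* Hypothetical: describe a buffer read for later replay by the worker. */
  static void record_read_buffer(enqueue_data *data, cl_command_queue queue,
                                 cl_mem buffer, size_t offset, size_t size,
                                 void *host_ptr)
  {
    memset(data, 0, sizeof(*data));
    data->type = EnqueueReadBuffer; /* dispatched to cl_enqueue_read_buffer */
    data->queue = queue;
    data->mem_obj = buffer;
    data->offset = offset;
    data->size = size;
    data->ptr = host_ptr;           /* destination of the read */
  }
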
> diff --git a/src/cl_event.c b/src/cl_event.c
> index 6c7c2e0..4acd619 100644
> --- a/src/cl_event.c
> +++ b/src/cl_event.c
> @@ -14,750 +14,615 @@
>   * You should have received a copy of the GNU Lesser General Public
>   * License along with this library. If not, see <http://www.gnu.org/licenses/>.
>   *
> - * Author: Rong Yang <rong.r.yang at intel.com>
>   */
> 
>  #include "cl_event.h"
>  #include "cl_context.h"
> -#include "cl_utils.h"
> -#include "cl_alloc.h"
> -#include "cl_khr_icd.h"
> -#include "cl_kernel.h"
>  #include "cl_command_queue.h"
> -
> -#include <assert.h>
> +#include "cl_alloc.h"
> +#include <string.h>
>  #include <stdio.h>
> 
> -void cl_event_update_last_events(cl_command_queue queue, int wait)
> +LOCAL cl_int
> +cl_event_get_timestamp(cl_event event, cl_profiling_info param_name)
>  {
> -  cl_event last_event = get_last_event(queue);
> -  if(!last_event) return;
> -  cl_event next, now;
> -  now = last_event;
> -  while(now){
> -    next = now->last_next;//get next first in case set status maintain it
> -    cl_event_update_status(now,wait);//update event status
> -    now = next;
> -  }
> +  // TODO:
> +  return CL_INVALID_VALUE;
>  }
> 
> -void cl_event_insert_last_events(cl_command_queue queue,cl_event event)
> +LOCAL cl_ulong
> +cl_event_get_timestamp_delta(cl_ulong start_timestamp, cl_ulong end_timestamp)
>  {
> -  if(!event) return;
> -  cl_event last_event = get_last_event(queue);
> -  if(last_event){
> -    cl_event now = last_event;
> -    while(now->last_next)
> -      now = now->last_next;
> -    now->last_next = event;
> -    event->last_prev = now;
> +  cl_ulong ret_val;
> +
> +  if (end_timestamp > start_timestamp) {
> +    ret_val = end_timestamp - start_timestamp;
> +  } else {
> +    /* If the start timestamp is greater than the end timestamp, set the return value to max. */
> +    ret_val = ((cl_ulong)1 << 32);
>    }
> -  else set_last_event(queue,event);
> +
> +  return ret_val;
>  }
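
Worth noting: the fallback here is (cl_ulong)1 << 32, i.e. about 4.29e9 ns, not the maximum cl_ulong, so a wrapped timer shows up as a ~4.3 s delta. For example:

  cl_ulong d1 = cl_event_get_timestamp_delta(100, 350); /* 250 */
  cl_ulong d2 = cl_event_get_timestamp_delta(350, 100); /* 1ULL << 32 */
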
> 
> -static inline cl_bool
> -cl_event_is_gpu_command_type(cl_command_type type)
> +LOCAL cl_ulong
> +cl_event_get_start_timestamp(cl_event event)
>  {
> -  switch(type) {
> -    case CL_COMMAND_COPY_BUFFER:
> -    case CL_COMMAND_FILL_BUFFER:
> -    case CL_COMMAND_COPY_IMAGE:
> -    case CL_COMMAND_COPY_IMAGE_TO_BUFFER:
> -    case CL_COMMAND_COPY_BUFFER_TO_IMAGE:
> -    case CL_COMMAND_COPY_BUFFER_RECT:
> -    case CL_COMMAND_TASK:
> -    case CL_COMMAND_NDRANGE_KERNEL:
> -      return CL_TRUE;
> -    default:
> -      return CL_FALSE;
> -  }
> +  cl_ulong ret_val;
> +
> +  ret_val = cl_event_get_timestamp_delta(event->timestamp[0], event->timestamp[2]);
> +
> +  return ret_val;
>  }
> 
> -int cl_event_flush(cl_event event)
> +LOCAL cl_ulong
> +cl_event_get_end_timestamp(cl_event event)
>  {
> -  int err = CL_SUCCESS;
> -  if(!event) {
> -    err = CL_INVALID_VALUE;
> -    return err;
> -  }
> +  cl_ulong ret_val;
> 
> -  assert(event->gpgpu_event != NULL);
> -  if (event->gpgpu) {
> -    err = cl_command_queue_flush_gpgpu(event->queue, event->gpgpu);
> -    cl_gpgpu_delete(event->gpgpu);
> -    event->gpgpu = NULL;
> -  }
> -  cl_gpgpu_event_flush(event->gpgpu_event);
> -  cl_event_insert_last_events(event->queue,event);
> -  return err;
> +  ret_val = cl_event_get_timestamp_delta(event->timestamp[0], event->timestamp[3]);
> +
> +  return ret_val;
>  }
> 
> -cl_event cl_event_new(cl_context ctx, cl_command_queue queue, cl_command_type type, cl_bool emplict)
> +LOCAL void
> +cl_event_add_ref(cl_event event)
>  {
> -  cl_event event = NULL;
> -  GET_QUEUE_THREAD_GPGPU(queue);
> +  assert(event);
> +  CL_OBJECT_INC_REF(event);
> +}
> 
> -  /* Allocate and inialize the structure itself */
> -  TRY_ALLOC_NO_ERR (event, CALLOC(struct _cl_event));
> -  CL_OBJECT_INIT_BASE(event, CL_OBJECT_EVENT_MAGIC);
> +LOCAL cl_int
> +cl_event_get_status(cl_event event)
> +{
> +  cl_int ret;
> +
> +  assert(event);
> +  CL_OBJECT_LOCK(event);
> +  ret = event->status;
> +  CL_OBJECT_UNLOCK(event);
> +  return ret;
> +}
> +
> +static cl_event
> +cl_event_new(cl_context ctx, cl_command_queue queue, cl_command_type type,
> +             cl_uint num_events, cl_event *event_list)
> +{
> +  cl_event e = cl_calloc(1, sizeof(_cl_event));
> +  if (e == NULL)
> +    return NULL;
> +
> +  CL_OBJECT_INIT_BASE(e, CL_OBJECT_EVENT_MAGIC);
> 
>    /* Append the event in the context event list */
> -  cl_context_add_event(ctx, event);
> -
> -  /* Initialize all members and create GPGPU event object */
> -  event->queue = queue;
> -  event->type  = type;
> -  event->gpgpu_event = NULL;
> -  if(type == CL_COMMAND_USER) {
> -    event->status = CL_SUBMITTED;
> +  cl_context_add_event(ctx, e);
> +  e->queue = queue;
> +
> +  list_init(&e->callbacks);
> +  list_init(&e->enqueue_node);
> +
> +  assert(type >= CL_COMMAND_NDRANGE_KERNEL && type <= CL_COMMAND_FILL_IMAGE);
> +  e->event_type = type;
> +  if (type == CL_COMMAND_USER) {
> +    e->status = CL_SUBMITTED;
> +  } else {
> +    e->status = CL_QUEUED;
>    }
> -  else {
> -    event->status = CL_QUEUED;
> -    if(cl_event_is_gpu_command_type(event->type))
> -      event->gpgpu_event = cl_gpgpu_event_new(gpgpu);
> +
> +  if (type == CL_COMMAND_USER) {
> +    assert(queue == NULL);
>    }
> -  cl_event_add_ref(event);       //dec when complete
> -  event->user_cb = NULL;
> -  event->enqueue_cb = NULL;
> -  event->waits_head = NULL;
> -  event->emplict = emplict;
> -
> -exit:
> -  return event;
> -error:
> -  cl_event_delete(event);
> -  event = NULL;
> -  goto exit;
> +
> +  e->depend_events = event_list;
> +  e->depend_event_num = num_events;
> +  return e;
>  }
> 
> -void cl_event_delete(cl_event event)
> +LOCAL void
> +cl_event_delete(cl_event event)
>  {
> +  int i;
> +  cl_event_user_callback cb;
> +
>    if (UNLIKELY(event == NULL))
>      return;
> 
> -  cl_event_update_status(event, 0);
> -
>    if (CL_OBJECT_DEC_REF(event) > 1)
>      return;
> 
> -  /* Call all user's callback if haven't execute */
> -  cl_event_call_callback(event, CL_COMPLETE, CL_TRUE); // CL_COMPLETE status will force all callbacks that are not executed to run
> +  cl_enqueue_delete(&event->exec_data);
> 
> -  /* delete gpgpu event object */
> -  if(event->gpgpu_event)
> -    cl_gpgpu_event_delete(event->gpgpu_event);
> +  assert(list_empty(&event->enqueue_node));
> 
> -  /* Remove it from the list */
> -  cl_context_remove_event(event->ctx, event);
> +  if (event->depend_events) {
> +    assert(event->depend_event_num);
> +    for (i = 0; i < event->depend_event_num; i++) {
> +      cl_event_delete(event->depend_events[i]);
> +    }
> +    cl_free(event->depend_events);
> +  }
> 
> -  if (event->gpgpu) {
> -    fprintf(stderr, "Warning: a event is deleted with a pending enqueued task.\n");
> -    cl_gpgpu_delete(event->gpgpu);
> -    event->gpgpu = NULL;
> +  /* Free all the callbacks. Last ref, no need to lock. */
> +  while (!list_empty(&event->callbacks)) {
> +    cb = list_entry(event->callbacks.next, _cl_event_user_callback, node);
> +    list_del(&cb->node);
> +    cl_free(cb);
>    }
> 
> +  /* Remove it from the list */
> +  assert(event->ctx);
> +  cl_context_remove_event(event->ctx, event);
> +
>    CL_OBJECT_DESTROY_BASE(event);
>    cl_free(event);
>  }
> 
> -void cl_event_add_ref(cl_event event)
> +LOCAL cl_event
> +cl_event_create(cl_context ctx, cl_command_queue queue, cl_uint num_events,
> +                const cl_event *event_list, cl_command_type type, cl_int *errcode_ret)
>  {
> -  assert(event);
> -  CL_OBJECT_INC_REF(event);
> -}
> +  cl_event e = NULL;
> +  cl_event *depend_events = NULL;
> +  cl_int err = CL_SUCCESS;
> +  cl_uint total_events = 0;
> +  int i;
> 
> -cl_int cl_event_set_callback(cl_event event ,
> -                                  cl_int command_exec_callback_type,
> -                                  EVENT_NOTIFY pfn_notify,
> -                                  void* user_data)
> -{
> -  assert(event);
> -  assert(pfn_notify);
> +  assert(ctx);
> 
> -  cl_int err = CL_SUCCESS;
> -  user_callback *cb;
> -  TRY_ALLOC(cb, CALLOC(user_callback));
> -
> -  cb->pfn_notify  = pfn_notify;
> -  cb->user_data   = user_data;
> -  cb->status      = command_exec_callback_type;
> -  cb->executed    = CL_FALSE;
> -
> -
> -  // It is possible that the event enqueued is already completed.
> -  // clEnqueueReadBuffer can be synchronous and when the callback
> -  // is registered after, it still needs to get executed.
> -  CL_OBJECT_LOCK(event); // Thread safety required: operations on the event->status can be made from many different threads
> -  if(event->status <= command_exec_callback_type) {
> -    /* Call user callback */
> -    CL_OBJECT_UNLOCK(event); // pfn_notify can call clFunctions that use the event_lock and from here it's not required
> -    cb->pfn_notify(event, event->status, cb->user_data);
> -    cl_free(cb);
> -  } else {
> -    // Enqueue to callback list
> -    cb->next        = event->user_cb;
> -    event->user_cb  = cb;
> -    CL_OBJECT_UNLOCK(event);
> -  }
> +  do {
> +    if (event_list)
> +      assert(num_events);
> 
> -exit:
> -  return err;
> -error:
> -  err = CL_OUT_OF_HOST_MEMORY;
> -  cl_free(cb);
> -  goto exit;
> -};
> -
> -cl_int cl_event_check_waitlist(cl_uint num_events_in_wait_list,
> -                                    const cl_event *event_wait_list,
> -                                    cl_event *event,cl_context ctx)
> -{
> -  cl_int err = CL_SUCCESS;
> -  cl_int i;
> -  /* check the event_wait_list and num_events_in_wait_list */
> -  if((event_wait_list == NULL) &&
> -     (num_events_in_wait_list > 0))
> -    goto error;
> -
> -  if ((event_wait_list != NULL) &&
> -      (num_events_in_wait_list == 0)){
> -    goto error;
> -  }
> +    if (queue == NULL) {
> +      assert(type == CL_COMMAND_USER);
> +      assert(event_list == NULL);
> +      assert(num_events == 0);
> 
> -  /* check the event and context */
> -  for(i=0; i<num_events_in_wait_list; i++) {
> -    CHECK_EVENT(event_wait_list[i]);
> -    if(event_wait_list[i]->status < CL_COMPLETE) {
> -      err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
> -      goto exit;
> -    }
> -    if(event && event == &event_wait_list[i])
> -      goto error;
> -    if(event_wait_list[i]->ctx != ctx) {
> -      err = CL_INVALID_CONTEXT;
> -      goto exit;
> -    }
> -  }
> +      e = cl_event_new(ctx, queue, type, 0, NULL);
> +      if (e == NULL) {
> +        err = CL_OUT_OF_HOST_MEMORY;
> +        break;
> +      }
> +    } else {
> +      CL_OBJECT_LOCK(queue);
> +      total_events = queue->barrier_events_num + num_events;
> +
> +      if (total_events) {
> +        depend_events = cl_calloc(total_events, sizeof(cl_event));
> +        if (depend_events == NULL) {
> +          CL_OBJECT_UNLOCK(queue);
> +          err = CL_OUT_OF_HOST_MEMORY;
> +          break;
> +        }
> +      }
> 
> -exit:
> -  return err;
> -error:
> -  err = CL_INVALID_EVENT_WAIT_LIST;  //reset error
> -  goto exit;
> -}
> +      /* Add all the barrier events as depend events. */
> +      for (i = 0; i < queue->barrier_events_num; i++) {
> +        assert(CL_EVENT_IS_BARRIER(queue->barrier_events[i]));
> +        cl_event_add_ref(queue->barrier_events[i]);
> +        depend_events[num_events + i] = queue->barrier_events[i];
> +      }
> 
> -cl_int cl_event_wait_events(cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
> -                            cl_command_queue queue)
> -{
> -  cl_int i;
> +      CL_OBJECT_UNLOCK(queue);
> 
> -  /* Check whether wait user events */
> -  for(i=0; i<num_events_in_wait_list; i++) {
> -    if(event_wait_list[i]->status <= CL_COMPLETE)
> -      continue;
> +      for (i = 0; i < num_events; i++) {
> +        assert(event_list[i]);
> +        assert(event_list[i]->ctx == ctx);
> +        assert(CL_OBJECT_IS_EVENT(event_list[i]));
> +        cl_event_add_ref(event_list[i]);
> +        depend_events[i] = event_list[i];
> +      }
> 
> -    /* Need wait on user event, return and do enqueue defer */
> -    if((event_wait_list[i]->type == CL_COMMAND_USER) ||
> -       (event_wait_list[i]->enqueue_cb &&
> -       (event_wait_list[i]->enqueue_cb->wait_user_events != NULL))){
> -      return CL_ENQUEUE_EXECUTE_DEFER;
> -    }
> -  }
> +      if (depend_events)
> +        assert(total_events);
> 
> -  if(queue && queue->barrier_events_num )
> -      return CL_ENQUEUE_EXECUTE_DEFER;
> +      e = cl_event_new(ctx, queue, type, total_events, depend_events);
> +      if (e == NULL) {
> +        err = CL_OUT_OF_HOST_MEMORY;
> +        break;
> +      }
> +      depend_events = NULL;
> +    }
> +  } while (0);
> 
> -  /* Non user events or all user event finished, wait all enqueue events finish */
> -  for(i=0; i<num_events_in_wait_list; i++) {
> -    if(event_wait_list[i]->status <= CL_COMPLETE)
> -      continue;
> +  if (err != CL_SUCCESS) {
> +    if (depend_events) {
> +      for (i = 0; i < total_events; i++) {
> +        cl_event_delete(depend_events[i]);
> +      }
> +      cl_free(depend_events);
> +    }
> 
> -    //enqueue callback haven't finish, in another thread, wait
> -    if(event_wait_list[i]->enqueue_cb != NULL)
> -      return CL_ENQUEUE_EXECUTE_DEFER;
> -    if(event_wait_list[i]->gpgpu_event)
> -      cl_gpgpu_event_update_status(event_wait_list[i]->gpgpu_event, 1);
> -    cl_event_set_status(event_wait_list[i], CL_COMPLETE);  //Execute user's callback
> +    // If depend_events was handed to the event, creation must have succeeded.
> +    assert(e->depend_events == NULL);
> +    cl_event_delete(e);
>    }
> -  return CL_ENQUEUE_EXECUTE_IMM;
> +
> +  if (errcode_ret)
> +    *errcode_ret = err;
> +
> +  return e;
>  }
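
To make the ownership transfer concrete: the caller's wait list occupies the first num_events slots and the queue's barrier events are appended after it, all with a reference taken, e.g. for num_events = 2 and one pending barrier:

  /* Illustrative layout built above:
     depend_events[0] = event_wait_list[0];
     depend_events[1] = event_wait_list[1];
     depend_events[2] = queue->barrier_events[0];  (index num_events + 0)
     total_events     = 3;  all entries ref-counted, released in cl_event_delete */
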
> 
> -void cl_event_new_enqueue_callback(cl_event event,
> -                                            enqueue_data *data,
> -                                            cl_uint num_events_in_wait_list,
> -                                            const cl_event *event_wait_list)
> +LOCAL cl_int
> +cl_event_set_callback(cl_event event, cl_int exec_type, cl_event_notify_cb pfn_notify, void *user_data)
>  {
> -  enqueue_callback *cb, *node;
> -  user_event *user_events, *u_ev;
> -  cl_command_queue queue = event ? event->queue : NULL;
> -  cl_int i;
>    cl_int err = CL_SUCCESS;
> +  cl_event_user_callback cb;
> +  cl_bool exec_imm = CL_FALSE;
> 
> -  /* Allocate and initialize the structure itself */
> -  TRY_ALLOC_NO_ERR (cb, CALLOC(enqueue_callback));
> -  cb->num_events = 0;
> -  TRY_ALLOC_NO_ERR (cb->wait_list, CALLOC_ARRAY(cl_event, num_events_in_wait_list));
> -  for(i=0; i<num_events_in_wait_list; i++) {
> -    //user event will insert to cb->wait_user_events, need not in wait list, avoid ref twice
> -    if(event_wait_list[i]->type != CL_COMMAND_USER) {
> -      cb->wait_list[cb->num_events++] = event_wait_list[i];
> -      cl_event_add_ref(event_wait_list[i]);  //add defer enqueue's wait event reference
> -    }
> -  }
> -  cb->event = event;
> -  cb->next = NULL;
> -  cb->wait_user_events = NULL;
> -
> -  if(queue && queue->barrier_events_num > 0) {
> -    for(i=0; i<queue->barrier_events_num; i++) {
> -      /* Insert the enqueue_callback to user event list */
> -      node = queue->wait_events[i]->waits_head;
> -      if(node == NULL)
> -        queue->wait_events[i]->waits_head = cb;
> -      else{
> -        while((node != cb) && node->next)
> -          node = node->next;
> -        if(node == cb)   //wait on dup user event
> -          continue;
> -        node->next = cb;
> -      }
> +  assert(event);
> +  assert(pfn_notify);
> 
> -      /* Insert the user event to enqueue_callback's wait_user_events */
> -      TRY(cl_event_insert_user_event, &cb->wait_user_events, queue->wait_events[i]);
> -      cl_event_add_ref(queue->wait_events[i]);
> +  do {
> +    cb = cl_calloc(1, sizeof(_cl_event_user_callback));
> +    if (cb == NULL) {
> +      err = CL_OUT_OF_HOST_MEMORY;
> +      break;
>      }
> -  }
> 
> -  /* Find out all user events that in event_wait_list wait */
> -  for(i=0; i<num_events_in_wait_list; i++) {
> -    if(event_wait_list[i]->status <= CL_COMPLETE)
> -      continue;
> -
> -    if(event_wait_list[i]->type == CL_COMMAND_USER) {
> -      /* Insert the enqueue_callback to user event list */
> -      node = event_wait_list[i]->waits_head;
> -      if(node == NULL)
> -        event_wait_list[i]->waits_head = cb;
> -      else {
> -        while((node != cb) && node->next)
> -          node = node->next;
> -        if(node == cb)   //wait on dup user event
> -          continue;
> -        node->next = cb;
> -      }
> -      /* Insert the user event to enqueue_callback's wait_user_events */
> -      TRY(cl_event_insert_user_event, &cb->wait_user_events, event_wait_list[i]);
> -      cl_event_add_ref(event_wait_list[i]);
> -      if(queue)
> -        cl_command_queue_insert_event(queue, event_wait_list[i]);
> -      if(queue && data->type == EnqueueBarrier){
> -        cl_command_queue_insert_barrier_event(queue, event_wait_list[i]);
> -      }
> -    } else if(event_wait_list[i]->enqueue_cb != NULL) {
> -      user_events = event_wait_list[i]->enqueue_cb->wait_user_events;
> -      while(user_events != NULL) {
> -        /* Insert the enqueue_callback to user event's  waits_tail */
> -        node = user_events->event->waits_head;
> -        if(node == NULL)
> -          event_wait_list[i]->waits_head = cb;
> -        else{
> -          while((node != cb) && node->next)
> -            node = node->next;
> -          if(node == cb) {  //wait on dup user event
> -            user_events = user_events->next;
> -            continue;
> -          }
> -          node->next = cb;
> -        }
> -
> -        /* Insert the user event to enqueue_callback's wait_user_events */
> -        TRY(cl_event_insert_user_event, &cb->wait_user_events, user_events->event);
> -        cl_event_add_ref(user_events->event);
> -        if(queue)
> -          cl_command_queue_insert_event(event->queue, user_events->event);
> -        if(queue && data->type == EnqueueBarrier){
> -          cl_command_queue_insert_barrier_event(event->queue, user_events->event);
> -        }
> -        user_events = user_events->next;
> -      }
> -    }
> -  }
> -  if(event != NULL && event->queue != NULL && event->gpgpu_event != NULL) {
> -    event->gpgpu = cl_thread_gpgpu_take(event->queue);
> -    data->ptr = (void *)event->gpgpu_event;
> -  }
> -  cb->data = *data;
> -  if(event)
> -    event->enqueue_cb = cb;
> -
> -exit:
> -  return;
> -error:
> -  if(cb) {
> -    while(cb->wait_user_events) {
> -      u_ev = cb->wait_user_events;
> -      cb->wait_user_events = cb->wait_user_events->next;
> -      cl_event_delete(u_ev->event);
> -      cl_free(u_ev);
> +    list_init(&cb->node);
> +    cb->pfn_notify = pfn_notify;
> +    cb->user_data = user_data;
> +    cb->status = exec_type;
> +    cb->executed = CL_FALSE;
> +
> +    CL_OBJECT_LOCK(event);
> +    if (event->status > exec_type) {
> +      list_add_tail(&cb->node, &event->callbacks);
> +      cb = NULL;
> +    } else {
> +      /* The state has already OK, call it immediately. */
> +      exec_imm = CL_TRUE;
>      }
> -    for(i=0; i<cb->num_events; i++) {
> -      if(cb->wait_list[i]) {
> -        cl_event_delete(cb->wait_list[i]);
> -      }
> -    }
> -    cl_free(cb);
> -  }
> -  goto exit;
> -}
> +    CL_OBJECT_UNLOCK(event);
> 
> -void cl_event_call_callback(cl_event event, cl_int status, cl_bool free_cb) {
> -  user_callback *user_cb = NULL;
> -  user_callback *queue_cb = NULL; // For thread safety, we create a queue that holds user_callback's pfn_notify contents
> -  user_callback *temp_cb = NULL;
> -  user_cb = event->user_cb;
> -  CL_OBJECT_LOCK(event);
> -  while(user_cb) {
> -    if(user_cb->status >= status
> -        && user_cb->executed == CL_FALSE) { // Added check to not execute a callback when it was already handled
> -      user_cb->executed = CL_TRUE;
> -      temp_cb = cl_malloc(sizeof(user_callback));
> -      if(!temp_cb) {
> -        break; // Out of memory
> -      }
> -      temp_cb->pfn_notify = user_cb->pfn_notify; // Minor struct copy to call ppfn_notify out of the pthread_mutex
> -      temp_cb->user_data = user_cb->user_data;
> -      if(free_cb) {
> -        cl_free(user_cb);
> -      }
> -      if(!queue_cb) {
> -        queue_cb = temp_cb;
> -        queue_cb->next = NULL;
> -      } else { // Enqueue First
> -        temp_cb->next = queue_cb;
> -        queue_cb = temp_cb;
> -      }
> +    if (exec_imm) {
> +      cb->pfn_notify(event, event->status, cb->user_data);
>      }
> -    user_cb = user_cb->next;
> -  }
> -  CL_OBJECT_UNLOCK(event);
> 
> -  // Calling the callbacks outside of the event_lock is required because the callback can call cl_api functions and get deadlocked
> -  while(queue_cb) { // For each callback queued, actually execute the callback
> -    queue_cb->pfn_notify(event, event->status, queue_cb->user_data);
> -    temp_cb = queue_cb;
> -    queue_cb = queue_cb->next;
> -    cl_free(temp_cb);
> -  }
> +  } while (0);
> +
> +  if (cb)
> +    cl_free(cb);
> +
> +  return err;
>  }
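
Semantics here match clSetEventCallback: if the event has already reached the requested execution status, the callback fires immediately on the calling thread; otherwise it is queued on the event. A small usage sketch (the callback body is made up):

  #include <stdio.h>
  #include "cl_event.h"

  static void CL_CALLBACK on_done(cl_event e, cl_int status, void *user_data)
  {
    printf("event %p finished with status %d\n", (void *)e, status);
  }

  /* May invoke on_done synchronously if the event is already complete. */
  static cl_int watch(cl_event event)
  {
    return cl_event_set_callback(event, CL_COMPLETE, on_done, NULL);
  }
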
> 
> -void cl_event_set_status(cl_event event, cl_int status)
> +LOCAL cl_int
> +cl_event_set_status(cl_event event, cl_int status)
>  {
> -  cl_int ret, i;
> -  cl_event evt;
> +  list_head tmp_callbacks;
> +  list_head *n;
> +  list_head *pos;
> +  cl_bool notify_queue = CL_FALSE;
> +  cl_event_user_callback cb;
> +
> +  assert(event);
> 
>    CL_OBJECT_LOCK(event);
> -  if(status >= event->status) {
> +  if (event->status <= CL_COMPLETE) { // Already set to error or completed
>      CL_OBJECT_UNLOCK(event);
> -    return;
> +    return CL_INVALID_OPERATION;
>    }
> -  if(event->status <= CL_COMPLETE) {
> -    event->status = status;    //have done enqueue before or doing in another thread
> -    CL_OBJECT_UNLOCK(event);
> -    return;
> +
> +  if (CL_EVENT_IS_USER(event)) {
> +    assert(event->status != CL_RUNNING && event->status != CL_QUEUED);
> +  } else {
> +    assert(event->queue); // Must belong to some queue.
>    }
> 
> -  if(status <= CL_COMPLETE) {
> -    if(event->enqueue_cb) {
> -      if(status == CL_COMPLETE) {
> -        cl_enqueue_handle(event, &event->enqueue_cb->data);
> -        if(event->gpgpu_event)
> -          cl_gpgpu_event_update_status(event->gpgpu_event, 1);  //now set
> complet, need refine
> -      } else {
> -        if(event->gpgpu_event) {
> -          // Error then cancel the enqueued event.
> -          cl_gpgpu_delete(event->gpgpu);
> -          event->gpgpu = NULL;
> -        }
> -      }
> +  if (status >= event->status) { // Should never go back.
> +    CL_OBJECT_UNLOCK(event);
> +    return CL_INVALID_OPERATION;
> +  }
> 
> -      event->status = status;  //Change the event status after enqueue and befor unlock
> +  event->status = status;
> 
> +  /* Call all the callbacks. */
> +  if (!list_empty(&event->callbacks)) {
> +    do {
> +      status = event->status;
> +      list_init(&tmp_callbacks);
> +      list_replace(&event->callbacks, &tmp_callbacks);
> +      list_init(&event->callbacks);
> +      /* Call all the callbacks without lock. */
>        CL_OBJECT_UNLOCK(event);
> -      for(i=0; i<event->enqueue_cb->num_events; i++)
> -        cl_event_delete(event->enqueue_cb->wait_list[i]);
> +
> +      list_for_each_safe(pos, n, &tmp_callbacks)
> +      {
> +        cb = list_entry(pos, _cl_event_user_callback, node);
> +
> +        assert(cb->executed == CL_FALSE);
> +
> +        if (cb->status < status)
> +          continue;
> +
> +        list_del(&cb->node);
> +        cb->executed = CL_TRUE;
> +        cb->pfn_notify(event, status, cb->user_data);
> +        cl_free(cb);
> +      }
> +
>        CL_OBJECT_LOCK(event);
> 
> -      if(event->enqueue_cb->wait_list)
> -        cl_free(event->enqueue_cb->wait_list);
> -      cl_free(event->enqueue_cb);
> -      event->enqueue_cb = NULL;
> -    }
> +      // Set back the uncalled callbacks.
> +      list_splice_tail(&tmp_callbacks, &event->callbacks);
> +
> +      /* The status may have changed while we were unlocked; check again. */
> +    } while (status != event->status);
>    }
> -  if(event->status >= status)  //maybe changed in other threads
> -    event->status = status;
> +
> +  /* Wake up all waiters on the status change. */
> +  CL_OBJECT_NOTIFY_COND(event);
> +
> +  if (event->status <= CL_COMPLETE) {
> +    notify_queue = CL_TRUE;
> +  }
> +
>    CL_OBJECT_UNLOCK(event);
> 
> -  /* Call user callback */
> -  cl_event_call_callback(event, status, CL_FALSE);
> +  /* Need to notify all the command queues within the same context. */
> +  if (notify_queue) {
> +    cl_command_queue *q_list = NULL;
> +    cl_uint queue_num = 0;
> +    int i = 0;
> +    int cookie = 0;
> +
> +    /* First, we need to remove it from the queue's barrier list. */
> +    if (CL_EVENT_IS_BARRIER(event)) {
> +      assert(event->queue);
> +      cl_command_queue_remove_barrier_event(event->queue, event);
> +    }
> 
> -  if(event->type == CL_COMMAND_USER) {
> -    /* Check all defer enqueue */
> -    enqueue_callback *cb, *enqueue_cb = event->waits_head;
> -    while(enqueue_cb) {
> -      /* Remove this user event in enqueue_cb, update the header if needed. */
> -      cl_event_remove_user_event(&enqueue_cb->wait_user_events, event);
> -      cl_event_delete(event);
> +    /* Then, notify all the queues within the same context. */
> +    CL_OBJECT_LOCK(event->ctx);
> +    do {
> +      queue_num = event->ctx->queue_num;
> +      cookie = event->ctx->queue_cookie;
> +
> +      if (queue_num > 0) {
> +        q_list = cl_calloc(queue_num, sizeof(cl_command_queue));
> +        assert(q_list);
> +        i = 0;
> +        list_for_each(pos, &event->ctx->queues)
> +        {
> +          q_list[i] = (cl_command_queue)(list_entry(pos, _cl_base_object, node));
> +          assert(i < queue_num);
> +          i++;
> +        }
> 
> -      /* Still wait on other user events */
> -      if(enqueue_cb->wait_user_events != NULL) {
> -        enqueue_cb = enqueue_cb->next;
> -        continue;
> -      }
> +        CL_OBJECT_UNLOCK(event->ctx); // Update status without context lock.
> 
> -      //remove user event frome enqueue_cb's ctx
> -      cl_command_queue_remove_event(enqueue_cb->event->queue, event);
> -      cl_command_queue_remove_barrier_event(enqueue_cb->event->queue, event);
> -
> -      /* All user events complete, now wait enqueue events */
> -      ret = cl_event_wait_events(enqueue_cb->num_events, enqueue_cb->wait_list,
> -          enqueue_cb->event->queue);
> -      assert(ret != CL_ENQUEUE_EXECUTE_DEFER);
> -      ret = ~ret;
> -      cb = enqueue_cb;
> -      enqueue_cb = enqueue_cb->next;
> -
> -      /* Call the pending operation */
> -      evt = cb->event;
> -      /* TODO: if this event wait on several events, one event's
> -         status is error, the others is complete, what's the status
> -         of this event? Can't find the description in OpenCL spec.
> -         Simply update to latest finish wait event.*/
> -      cl_event_set_status(cb->event, status);
> -      if(evt->emplict == CL_FALSE) {
> -        cl_event_delete(evt);
> +        for (i = 0; i < queue_num; i++) {
> +          cl_command_queue_notify(q_list[i]);
> +        }
> +
> +        CL_OBJECT_LOCK(event->ctx); // Lock again.
> +      } else {
> +        /* No queue? Just do nothing. */
>        }
> -    }
> -    event->waits_head = NULL;
> -  }
> 
> -  if(event->status <= CL_COMPLETE){
> -    /* Maintain the last_list when event completed*/
> -    if (event->last_prev)
> -      event->last_prev->last_next = event->last_next;
> -    if (event->last_next)
> -      event->last_next->last_prev = event->last_prev;
> -    if(event->queue && get_last_event(event->queue) == event)
> -      set_last_event(event->queue, event->last_next);
> -    event->last_prev = NULL;
> -    event->last_next = NULL;
> -    cl_event_delete(event);
> +    } while (cookie != event->ctx->queue_cookie); // Some queue may have been added while we were unlocked.
> +    CL_OBJECT_UNLOCK(event->ctx);
> +
> +    if (q_list)
> +      cl_free(q_list);
>    }
> +
> +  return CL_SUCCESS;
>  }
> 
> -void cl_event_update_status(cl_event event, int wait)
> +LOCAL cl_int
> +cl_event_wait_for_event_ready(const cl_event event)
>  {
> -  if(event->status <= CL_COMPLETE)
> -    return;
> -  if((event->gpgpu_event) &&
> -     (cl_gpgpu_event_update_status(event->gpgpu_event, wait) == command_complete))
> -    cl_event_set_status(event, CL_COMPLETE);
> +  assert(CL_OBJECT_IS_EVENT(event));
> +  return cl_event_wait_for_events_list(event->depend_event_num, event->depend_events);
>  }
> 
> -cl_int cl_event_marker_with_wait_list(cl_command_queue queue,
> -                cl_uint num_events_in_wait_list,
> -                const cl_event *event_wait_list,
> -                cl_event* event)
> +LOCAL cl_int
> +cl_event_wait_for_events_list(cl_uint num_events, const cl_event *event_list)
>  {
> -  enqueue_data data = { 0 };
> +  int i;
>    cl_event e;
> +  cl_int ret = CL_SUCCESS;
> 
> -  e = cl_event_new(queue->ctx, queue, CL_COMMAND_MARKER, CL_TRUE);
> -  if(e == NULL)
> -    return CL_OUT_OF_HOST_MEMORY;
> -
> -  if(event != NULL ){
> -    *event = e;
> -  }
> +  for (i = 0; i < num_events; i++) {
> +    e = event_list[i];
> +    assert(e);
> +    assert(CL_OBJECT_IS_EVENT(e));
> 
> -//enqueues a marker command which waits for either a list of events to complete, or if the list is
> -//empty it waits for all commands previously enqueued in command_queue to complete before it  completes.
> -  if(num_events_in_wait_list > 0){
> -    if(cl_event_wait_events(num_events_in_wait_list, event_wait_list, queue) == CL_ENQUEUE_EXECUTE_DEFER) {
> -      data.type = EnqueueMarker;
> -      cl_event_new_enqueue_callback(event?*event:NULL, &data, num_events_in_wait_list, event_wait_list);
> -      return CL_SUCCESS;
> +    CL_OBJECT_LOCK(e);
> +    while (e->status > CL_COMPLETE) {
> +      CL_OBJECT_WAIT_ON_COND(e);
>      }
> -  } else if(queue->wait_events_num > 0) {
> -    data.type = EnqueueMarker;
> -    cl_event_new_enqueue_callback(event?*event:NULL, &data, queue->wait_events_num, queue->wait_events);
> -    return CL_SUCCESS;
> +    /* If some error happened, return the error. */
> +    if (e->status < CL_COMPLETE) {
> +      ret = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
> +    }
> +    CL_OBJECT_UNLOCK(e);
>    }
> 
> -  cl_event_update_last_events(queue,1);
> -
> -  cl_event_set_status(e, CL_COMPLETE);
> -  return CL_SUCCESS;
> +  return ret;
>  }
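
With this in place, a host-side wait reduces to building the list and blocking on each event's condition variable; roughly (hedged sketch, error code per the function above):

  /* Hypothetical wrapper: block until both events settle. Returns
     CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST if either failed. */
  static cl_int wait_two(cl_event a, cl_event b)
  {
    cl_event list[2] = { a, b };
    return cl_event_wait_for_events_list(2, list);
  }
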
> 
> -cl_int cl_event_barrier_with_wait_list(cl_command_queue queue,
> -                cl_uint num_events_in_wait_list,
> -                const cl_event *event_wait_list,
> -                cl_event* event)
> +LOCAL cl_int
> +cl_event_check_waitlist(cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
> +                        cl_event *event, cl_context ctx)
>  {
> -  enqueue_data data = { 0 };
> -  cl_event e;
> -
> -  e = cl_event_new(queue->ctx, queue, CL_COMMAND_BARRIER, CL_TRUE);
> -  if(e == NULL)
> -    return CL_OUT_OF_HOST_MEMORY;
> +  cl_int err = CL_SUCCESS;
> +  cl_int i;
> 
> -  if(event != NULL ){
> -    *event = e;
> -  }
> -//enqueues a barrier command which waits for either a list of events to complete, or if the list is
> -//empty it waits for all commands previously enqueued in command_queue to complete before it  completes.
> -  if(num_events_in_wait_list > 0){
> -    if(cl_event_wait_events(num_events_in_wait_list, event_wait_list, queue) == CL_ENQUEUE_EXECUTE_DEFER) {
> -      data.type = EnqueueBarrier;
> -      cl_event_new_enqueue_callback(e, &data, num_events_in_wait_list, event_wait_list);
> -      return CL_SUCCESS;
> +  do {
> +    /* check the event_wait_list and num_events_in_wait_list */
> +    if ((event_wait_list == NULL) && (num_events_in_wait_list > 0)) {
> +      err = CL_INVALID_EVENT_WAIT_LIST;
> +      break;
>      }
> -  } else if(queue->wait_events_num > 0) {
> -    data.type = EnqueueBarrier;
> -    cl_event_new_enqueue_callback(e, &data, queue->wait_events_num, queue->wait_events);
> -    return CL_SUCCESS;
> -  }
> 
> -  cl_event_update_last_events(queue,1);
> +    if ((event_wait_list != NULL) && (num_events_in_wait_list == 0)) {
> +      err = CL_INVALID_EVENT_WAIT_LIST;
> +      break;
> +    }
> 
> -  cl_event_set_status(e, CL_COMPLETE);
> -  return CL_SUCCESS;
> -}
> +    /* check the event and context */
> +    for (i = 0; i < num_events_in_wait_list; i++) {
> +      if (event_wait_list[i] == NULL || !CL_OBJECT_IS_EVENT(event_wait_list[i])) {
> +        err = CL_INVALID_EVENT;
> +        break;
> +      }
> 
> -cl_ulong cl_event_get_cpu_timestamp(cl_ulong *cpu_time)
> -{
> -  struct timespec ts;
> +      if (cl_event_get_status(event_wait_list[i]) < CL_COMPLETE) {
> +        err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
> +        break;
> +      }
> 
> - if(clock_gettime(CLOCK_MONOTONIC_RAW,&ts) != 0){
> -  printf("CPU Timmer error\n");
> -  return CL_FALSE;
> -  }
> -  *cpu_time = (1000000000.0) * (cl_ulong) ts.tv_sec + (cl_ulong) ts.tv_nsec;
> +      if (event == event_wait_list + i) { /* Pointer of element of the wait list */
> +        err = CL_INVALID_EVENT_WAIT_LIST;
> +        break;
> +      }
> 
> -  return CL_SUCCESS;
> -}
> +      /* check all belong to same context. */
> +      if (ctx == NULL) {
> +        ctx = event_wait_list[i]->ctx;
> +      }
> +      if (event_wait_list[i]->ctx != ctx) {
> +        err = CL_INVALID_CONTEXT;
> +        break;
> +      }
> +    }
> 
> -cl_int cl_event_get_queued_cpu_timestamp(cl_event event)
> -{
> -  cl_int ret_val;
> +    if (err != CL_SUCCESS)
> +      break;
> 
> -  ret_val = cl_event_get_cpu_timestamp(&event->queued_timestamp);
> +  } while (0);
> 
> -  return ret_val;
> +  return err;
>  }
> 
> -cl_ulong cl_event_get_timestamp_delta(cl_ulong start_timestamp,cl_ulong end_timestamp)
> +LOCAL void
> +cl_event_exec(cl_event event, cl_int exec_status)
>  {
> -  cl_ulong ret_val;
> +  /* We are MT safe here, no one should call this
> +     at the same time. No need to lock */
> +  cl_int ret = CL_SUCCESS;
> +  cl_int status = cl_event_get_status(event);
> +  cl_int depend_status;
> 
> -  if(end_timestamp > start_timestamp){
> -   ret_val = end_timestamp - start_timestamp;
> -   }
> -  else {
> -   /*if start time stamp is greater than end timstamp then set ret value to max*/
> -   ret_val = ((cl_ulong) 1 << 32);
> +  if (status < CL_COMPLETE || status <= exec_status) {
> +    return;
>    }
> 
> -  return ret_val;
> -}
> -
> -cl_ulong cl_event_get_start_timestamp(cl_event event)
> -{
> -  cl_ulong ret_val;
> +  depend_status = cl_event_is_ready(event);
> +  assert(depend_status <= CL_COMPLETE);
> +  if (depend_status < CL_COMPLETE) { // Error happened, cancel exec.
> +    ret = cl_event_set_status(event, depend_status);
> +    return;
> +  }
> 
> -   ret_val = cl_event_get_timestamp_delta(event->timestamp[0],event->timestamp[2]);
> +  /* Do the according thing based on event type. */
> +  ret = cl_enqueue_handle(&event->exec_data, exec_status);
> 
> -  return ret_val;
> +  if (ret != CL_SUCCESS) {
> +    assert(ret < 0);
> +    DEBUGP(DL_WARNING, "Exec event %p error, type is %d, error status is %d",
> +           event, event->event_type, ret);
> +    ret = cl_event_set_status(event, ret);
> +    assert(ret == CL_SUCCESS);
> +  } else {
> +    ret = cl_event_set_status(event, exec_status);
> +    assert(ret == CL_SUCCESS);
> +  }
>  }
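
Together with cl_event_is_ready below, the worker's inner loop is essentially: skip events whose dependencies are still pending, and hand the ready ones to cl_event_exec, which propagates dependency errors itself. Illustrative only (the real loop lives in the command queue worker):

  static void process_one(cl_event e)
  {
    if (cl_event_is_ready(e) > CL_COMPLETE)
      return;                      /* a dependency is still pending */
    cl_event_exec(e, CL_COMPLETE); /* run it and mark it complete */
  }
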
> 
> -cl_ulong cl_event_get_end_timestamp(cl_event event)
> +/* 0 means ready, >0 means not ready, <0 means error. */
> +LOCAL cl_int
> +cl_event_is_ready(cl_event event)
>  {
> - cl_ulong ret_val;
> -
> -  ret_val = cl_event_get_timestamp_delta(event->timestamp[0],event->timestamp[3]);
> +  int i;
> +  int status;
> 
> -  return ret_val;
> -}
> +  for (i = 0; i < event->depend_event_num; i++) {
> +    status = cl_event_get_status(event->depend_events[i]);
> 
> -cl_int cl_event_get_timestamp(cl_event event, cl_profiling_info param_name)
> -{
> -  cl_ulong ret_val = 0;
> -  GET_QUEUE_THREAD_GPGPU(event->queue);
> -
> -  if (!event->gpgpu_event) {
> -    cl_gpgpu_event_get_gpu_cur_timestamp(gpgpu, &ret_val);
> -    event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
> -    return CL_SUCCESS;
> +    if (status != CL_COMPLETE) {
> +      return status;
> +    }
>    }
> 
> -  if(param_name == CL_PROFILING_COMMAND_SUBMIT ||
> -         param_name == CL_PROFILING_COMMAND_QUEUED) {
> -    cl_gpgpu_event_get_gpu_cur_timestamp(gpgpu, &ret_val);
> -    event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
> -    return CL_SUCCESS;
> -  } else if(param_name == CL_PROFILING_COMMAND_START) {
> -    cl_gpgpu_event_get_exec_timestamp(gpgpu, event->gpgpu_event, 0, &ret_val);
> -    event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
> -    return CL_SUCCESS;
> -  } else if (param_name == CL_PROFILING_COMMAND_END) {
> -    cl_gpgpu_event_get_exec_timestamp(gpgpu, event->gpgpu_event, 1, &ret_val);
> -    event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val;
> -    return CL_SUCCESS;
> -  }
> -  return CL_INVALID_VALUE;
> +  return CL_COMPLETE;
>  }
> 
> -cl_int cl_event_insert_user_event(user_event** p_u_ev, cl_event event)
> +LOCAL cl_event
> +cl_event_create_marker_or_barrier(cl_command_queue queue, cl_uint num_events_in_wait_list,
> +                                  const cl_event *event_wait_list, cl_bool is_barrier, cl_int *error)
>  {
> -  user_event * u_iter = *p_u_ev;
> -  user_event * u_ev;
> -
> -  while(u_iter)
> -  {
> -    if(u_iter->event == event)
> -      return CL_SUCCESS;
> -    u_iter = u_iter->next;
> -  }
> +  cl_event e = NULL;
> +  cl_int err = CL_SUCCESS;
> +  cl_command_type type = CL_COMMAND_MARKER;
> +  enqueue_type eq_type = EnqueueMarker;
> 
> -  TRY_ALLOC_NO_ERR (u_ev, CALLOC(user_event));
> -  u_ev->event = event;
> -  u_ev->next = *p_u_ev;
> -  *p_u_ev = u_ev;
> +  if (is_barrier) {
> +    type = CL_COMMAND_BARRIER;
> +    eq_type = EnqueueBarrier;
> +  }
> 
> +  if (event_wait_list) {
> +    assert(num_events_in_wait_list > 0);
> 
> -  return CL_SUCCESS;
> -error:
> -  return CL_FALSE;
> -}
> +    e = cl_event_create(queue->ctx, queue, num_events_in_wait_list,
> +                        event_wait_list, type, &err);
> +    if (err != CL_SUCCESS) {
> +      *error = err;
> +      return NULL;
> +    }
> +  } else { /* The marker depends on all events in the queue now. */
> +    cl_command_queue_enqueue_worker worker = &queue->worker;
> +    cl_uint i;
> +    cl_uint event_num;
> +    cl_event *depend_events;
> +
> +    CL_OBJECT_LOCK(queue);
> +
> +    /* First, wait for the command queue to retire all events currently executing. */
> +    while (1) {
> +      if (worker->quit) { // already destroy the queue?
> +        CL_OBJECT_UNLOCK(queue);
> +        *error = CL_INVALID_COMMAND_QUEUE;
> +        return NULL;
> +      }
> 
> -cl_int cl_event_remove_user_event(user_event** p_u_ev, cl_event event)
> -{
> -  user_event * u_iter = *p_u_ev;
> -  user_event * u_prev = *p_u_ev;
> -
> -  while(u_iter){
> -    if(u_iter->event == event ){
> -      if(u_iter == *p_u_ev){
> -        *p_u_ev = u_iter->next;
> -      }else{
> -        u_prev->next = u_iter->next;
> +      if (worker->in_exec_status != CL_COMPLETE) {
> +        CL_OBJECT_WAIT_ON_COND(queue);
> +        continue;
>        }
> -      cl_free(u_iter);
> +
>        break;
>      }
> -    u_prev = u_iter;
> -    u_iter = u_iter->next;
> +
> +    event_num = 0;
> +    depend_events = NULL;
> +    if (!list_empty(&worker->enqueued_events)) {
> +      depend_events = cl_command_queue_record_in_queue_events(queue, &event_num);
> +    }
> +
> +    CL_OBJECT_UNLOCK(queue);
> +
> +    e = cl_event_create(queue->ctx, queue, event_num, depend_events, type, &err);
> +
> +    for (i = 0; i < event_num; i++) { //unref the temporary events
> +      cl_event_delete(depend_events[i]);
> +    }
> +    if (depend_events)
> +      cl_free(depend_events);
> +
> +    if (err != CL_SUCCESS) {
> +      *error = err;
> +      return NULL;
> +    }
>    }
> 
> -  return CL_SUCCESS;
> +  e->exec_data.type = eq_type;
> +  *error = CL_SUCCESS;
> +  return e;
>  }
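
From the API layer, clEnqueueMarkerWithWaitList/clEnqueueBarrierWithWaitList can then be thin wrappers over this helper; roughly (a sketch that ignores the hand-off of the event to the queue worker; ownership handling here is assumed, not taken from the patch):

  /* Hypothetical wrapper resembling clEnqueueBarrierWithWaitList. */
  static cl_int enqueue_barrier(cl_command_queue queue, cl_uint n,
                                const cl_event *wait_list, cl_event *event_out)
  {
    cl_int err = CL_SUCCESS;
    cl_event e = cl_event_create_marker_or_barrier(queue, n, wait_list,
                                                   CL_TRUE /* barrier */, &err);
    if (err != CL_SUCCESS)
      return err;
    if (event_out)
      *event_out = e;     /* caller keeps the reference */
    else
      cl_event_delete(e); /* drop ours if the caller does not want it */
    return CL_SUCCESS;
  }
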
> diff --git a/src/cl_event.h b/src/cl_event.h
> index 85cd53e..f67299c 100644
> --- a/src/cl_event.h
> +++ b/src/cl_event.h
> @@ -14,111 +14,75 @@
>   * You should have received a copy of the GNU Lesser General Public
>   * License along with this library. If not, see <http://www.gnu.org/licenses/>.
>   *
> - * Author: Benjamin Segovia <benjamin.segovia at intel.com>
>   */
> 
> -#ifndef __CL_EVENT_H__
> -#define __CL_EVENT_H__
> +#ifndef __CL_EVENT_H_
> +#define __CL_EVENT_H_
> 
>  #include <semaphore.h>
> 
>  #include "cl_base_object.h"
> -#include "cl_driver.h"
>  #include "cl_enqueue.h"
>  #include "CL/cl.h"
> 
> -#define CL_ENQUEUE_EXECUTE_IMM   0
> -#define CL_ENQUEUE_EXECUTE_DEFER 1
> +typedef void(CL_CALLBACK *cl_event_notify_cb)(cl_event event, cl_int event_command_exec_status, void *user_data);
> 
> -typedef struct _user_event {
> -  cl_event            event;   /* The user event */
> -  struct _user_event* next;    /* Next user event in list */
> -} user_event;
> +typedef struct _cl_event_user_callback {
> +  cl_int status;                 /* The execution status */
> +  cl_bool executed;              /* Indicates whether the callback has been called */
> +  cl_event_notify_cb pfn_notify; /* Callback function */
> +  void *user_data;               /* Callback user data */
> +  list_head node;                /* Event callback list node */
> +} _cl_event_user_callback;
> 
> -typedef struct _enqueue_callback {
> -  cl_event           event;            /* The event relative this enqueue callback */
> -  enqueue_data       data;             /* Hold all enqueue callback's infomation */
> -  cl_uint            num_events;       /* num events in wait list */
> -  cl_event*          wait_list;        /* All event wait list this callback wait on */
> -  user_event*        wait_user_events; /* The head of user event list the callback wait on */
> -  struct _enqueue_callback*  next;     /* The  next enqueue callback in wait list */
> -} enqueue_callback;
> +typedef _cl_event_user_callback *cl_event_user_callback;
> 
> -typedef void (CL_CALLBACK *EVENT_NOTIFY)(cl_event event, cl_int event_command_exec_status, void *user_data);
> -
> -typedef struct _user_callback {
> -  cl_int            status;     /* The execution status */
> -  cl_bool           executed;   /* Indicat the callback function been called or not */
> -  EVENT_NOTIFY      pfn_notify; /* Callback function */
> -  void*             user_data;  /* Callback user data */
> -  struct _user_callback*    next;       /* Next event callback in list */
> -} user_callback;
> -
> -struct _cl_event {
> -  _cl_base_object    base;
> -  cl_context         ctx;         /* The context associated with event */
> -  cl_command_queue   queue;       /* The command queue associated with event */
> -  cl_command_type    type;        /* The command type associated with event */
> -  cl_int             status;      /* The execution status */
> -  cl_gpgpu           gpgpu;       /* Current gpgpu, owned by this structure. */
> -  cl_gpgpu_event     gpgpu_event; /* The event object communicate with hardware */
> -  user_callback*     user_cb;     /* The event callback functions */
> -  enqueue_callback*  enqueue_cb;  /* This event's enqueue */
> -  enqueue_callback*  waits_head;  /* The head of enqueues list wait on this event */
> -  cl_bool            emplict;     /* Identify this event whether created by api emplict*/
> -  cl_ulong           timestamp[4];/* The time stamps for profiling. */
> -  cl_ulong	     queued_timestamp;
> -  cl_event   last_next, last_prev;/* We need a list to monitor untouchable api event*/
> -};
> +typedef struct _cl_event {
> +  _cl_base_object base;
> +  cl_context ctx;             /* The context associated with event */
> +  cl_command_queue queue;     /* The command queue associated with event */
> +  cl_command_type event_type; /* Event type. */
> +  cl_bool is_barrier;         /* Is this event a barrier */
> +  cl_int status;              /* The execution status */
> +  cl_event *depend_events;    /* The events must complete before this. */
> +  cl_uint depend_event_num;   /* The depend events number. */
> +  list_head callbacks;        /* The event callback functions */
> +  list_head enqueue_node;     /* The node in the enqueue list. */
> +  cl_ulong timestamp[4];      /* The time stamps for profiling. */
> +  cl_ulong queued_timestamp;
> +  enqueue_data exec_data; /* Context for executing this event. */
> +} _cl_event;
> 
>  #define CL_OBJECT_EVENT_MAGIC 0x8324a9f810ebf90fLL
> -#define CL_OBJECT_IS_EVENT(obj) (((cl_base_object)obj)->magic == CL_OBJECT_EVENT_MAGIC)
> +#define CL_OBJECT_IS_EVENT(obj) ((obj &&                           \
> +         ((cl_base_object)obj)->magic == CL_OBJECT_EVENT_MAGIC &&  \
> +         CL_OBJECT_GET_REF(obj) >= 1))
> +
> +#define CL_EVENT_IS_MARKER(E) (E->event_type == CL_COMMAND_MARKER)
> +#define CL_EVENT_IS_BARRIER(E) (E->event_type == CL_COMMAND_BARRIER)
> +#define CL_EVENT_IS_USER(E) (E->event_type == CL_COMMAND_USER)
> 
>  /* Create a new event object */
> -cl_event cl_event_new(cl_context, cl_command_queue, cl_command_type, cl_bool);
> -/* Unref the object and delete it if no more reference on it */
> -void cl_event_delete(cl_event);
> -/* Add one more reference to this object */
> -void cl_event_add_ref(cl_event);
> -/* Register a user callback function for specific commond execution status */
> -cl_int cl_event_set_callback(cl_event, cl_int, EVENT_NOTIFY, void *);
> -/* Execute the event's callback if the event's status supersedes the callback's status. Free the callback if specified */
> -void cl_event_call_callback(cl_event event, cl_int status, cl_bool free_cb);
> -/* Check events wait list for enqueue commonds */
> -cl_int cl_event_check_waitlist(cl_uint, const cl_event *, cl_event *, cl_context);
> -/* Wait the all events in wait list complete */
> -cl_int cl_event_wait_events(cl_uint, const cl_event *, cl_command_queue);
> -/* New a enqueue suspend task */
> -void cl_event_new_enqueue_callback(cl_event, enqueue_data *, cl_uint, const cl_event *);
> -/* Set the event status and call all callbacks */
> -void cl_event_set_status(cl_event, cl_int);
> -/* Check and update event status */
> -void cl_event_update_status(cl_event, cl_int);
> -/* Create the marker event */
> -cl_int cl_event_marker_with_wait_list(cl_command_queue, cl_uint, const cl_event *,  cl_event*);
> -/* Create the barrier event */
> -cl_int cl_event_barrier_with_wait_list(cl_command_queue, cl_uint, const cl_event *,  cl_event*);
> -/* Get the cpu time */
> -cl_ulong cl_event_get_cpu_timestamp(cl_ulong *cpu_time);
> -/*Get the cpu time for queued*/
> -cl_int cl_event_get_queued_cpu_timestamp(cl_event event);
> -/*get timestamp delate between end and start*/
> -cl_ulong cl_event_get_timestamp_delta(cl_ulong start_timestamp,cl_ulong end_timestamp);
> -/*Get start time stamp*/
> -cl_ulong cl_event_get_start_timestamp(cl_event event);
> -/*Get end time stamp*/
> -cl_ulong cl_event_get_end_timestamp(cl_event event);
> -/* Do the event profiling */
> -cl_int cl_event_get_timestamp(cl_event event, cl_profiling_info param_name);
> -/* insert the user event */
> -cl_int cl_event_insert_user_event(user_event** p_u_ev, cl_event event);
> -/* remove the user event */
> -cl_int cl_event_remove_user_event(user_event** p_u_ev, cl_event event);
> -/* flush the event's pending gpgpu batch buffer and notify driver this gpgpu event has been flushed. */
> -cl_int cl_event_flush(cl_event event);
> -/* monitor or block wait all events in the last_event list */
> -void cl_event_update_last_events(cl_command_queue queuet, int wait);
> -/* insert the event into the last_event list in queue */
> -void cl_event_insert_last_events(cl_command_queue queue, cl_event event);
> +extern cl_event cl_event_create(cl_context ctx, cl_command_queue queue, cl_uint num_events,
> +                                const cl_event *event_list, cl_command_type type, cl_int *errcode_ret);
> +extern cl_int cl_event_check_waitlist(cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
> +                                      cl_event* event, cl_context ctx);
> +extern void cl_event_exec(cl_event event, cl_int exec_status);
> +/* 0 means ready, >0 means not ready, <0 means error. */
> +extern cl_int cl_event_is_ready(cl_event event);
> +extern cl_int cl_event_get_status(cl_event event);
> +extern void cl_event_add_ref(cl_event event);
> +extern void cl_event_delete(cl_event event);
> +extern cl_int cl_event_set_status(cl_event event, cl_int status);
> +extern cl_int cl_event_set_callback(cl_event event, cl_int exec_type,
> +                                    cl_event_notify_cb pfn_notify, void *user_data);
> +extern cl_int cl_event_wait_for_events_list(cl_uint num_events, const cl_event *event_list);
> +extern cl_int cl_event_wait_for_event_ready(cl_event event);
> +extern cl_ulong cl_event_get_timestamp_delta(cl_ulong start_timestamp, cl_ulong end_timestamp);
> +extern cl_ulong cl_event_get_start_timestamp(cl_event event);
> +extern cl_ulong cl_event_get_end_timestamp(cl_event event);
> +extern cl_int cl_event_get_timestamp(cl_event event, cl_profiling_info param_name);
> +extern cl_event cl_event_create_marker_or_barrier(cl_command_queue queue, cl_uint num_events_in_wait_list,
> +                                                  const cl_event *event_wait_list, cl_bool is_barrier,
> +                                                  cl_int* error);
>  #endif /* __CL_EVENT_H__ */
> -
> diff --git a/src/cl_mem.c b/src/cl_mem.c
> index 06a4d5a..333ffc9 100644
> --- a/src/cl_mem.c
> +++ b/src/cl_mem.c
> @@ -28,6 +28,7 @@
>  #include "cl_kernel.h"
>  #include "cl_command_queue.h"
>  #include "cl_cmrt.h"
> +#include "cl_enqueue.h"
> 
>  #include "CL/cl.h"
>  #include "CL/cl_intel.h"
> @@ -1264,7 +1265,7 @@ cl_mem_add_ref(cl_mem mem)
>  #define LOCAL_SZ_2   4
> 
>  LOCAL cl_int
> -cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
> +cl_mem_copy(cl_command_queue queue, cl_event event, cl_mem src_buf, cl_mem dst_buf,
>              size_t src_offset, size_t dst_offset, size_t cb)
>  {
>    cl_int ret = CL_SUCCESS;
> @@ -1317,7 +1318,7 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
>      cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
>      cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset);
>      cl_kernel_set_arg(ker, 4, sizeof(int), &cb);
> -    ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
> +    ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz);
>      cl_kernel_delete(ker);
>      return ret;
>    }
> @@ -1358,7 +1359,7 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
>      cl_kernel_set_arg(ker, 4, sizeof(int), &dw_num);
>      cl_kernel_set_arg(ker, 5, sizeof(int), &first_mask);
>      cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask);
> -    ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
> +    ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz);
>      cl_kernel_delete(ker);
>      return ret;
>    }
> @@ -1388,7 +1389,7 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
>      cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask);
>      cl_kernel_set_arg(ker, 7, sizeof(int), &shift);
>      cl_kernel_set_arg(ker, 8, sizeof(int), &dw_mask);
> -    ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
> +    ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz);
>      cl_kernel_delete(ker);
>      return ret;
>    }
> @@ -1420,7 +1421,7 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
>      cl_kernel_set_arg(ker, 7, sizeof(int), &shift);
>      cl_kernel_set_arg(ker, 8, sizeof(int), &dw_mask);
>      cl_kernel_set_arg(ker, 9, sizeof(int), &src_less);
> -    ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
> +    ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz);
>      cl_kernel_delete(ker);
>      return ret;
>    }
> @@ -1493,13 +1494,13 @@ cl_image_fill(cl_command_queue queue, const void * pattern, struct _cl_mem_image
>    cl_kernel_set_arg(ker, 6, sizeof(cl_int), &origin[1]);
>    cl_kernel_set_arg(ker, 7, sizeof(cl_int), &origin[2]);
> 
> -  ret = cl_command_queue_ND_range(queue, ker, 3, global_off, global_sz, local_sz);
> +  ret = cl_command_queue_ND_range(queue, ker, NULL, 3, global_off, global_sz, local_sz);
>    cl_kernel_delete(ker);
>    return ret;
>  }
> 
>  LOCAL cl_int
> -cl_mem_fill(cl_command_queue queue, const void * pattern, size_t pattern_size,
> +cl_mem_fill(cl_command_queue queue, cl_event e, const void * pattern, size_t pattern_size,
>              cl_mem buffer, size_t offset, size_t size)
>  {
>    cl_int ret = CL_SUCCESS;
> @@ -1596,13 +1597,13 @@ cl_mem_fill(cl_command_queue queue, const void * pattern, size_t pattern_size,
>    if (is_128)
>      cl_kernel_set_arg(ker, 4, pattern_size, pattern1);
> 
> -  ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
> +  ret = cl_command_queue_ND_range(queue, ker, e, 1, global_off, global_sz, local_sz);
>    cl_kernel_delete(ker);
>    return ret;
>  }
> 
>  LOCAL cl_int
> -cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
> +cl_mem_copy_buffer_rect(cl_command_queue queue, cl_event event, cl_mem src_buf, cl_mem dst_buf,
>                         const size_t *src_origin, const size_t *dst_origin, const size_t *region,
>                         size_t src_row_pitch, size_t src_slice_pitch,
>                         size_t dst_row_pitch, size_t dst_slice_pitch) {
> @@ -1617,7 +1618,7 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
>      cl_int src_offset = src_origin[2]*src_slice_pitch + src_origin[1]*src_row_pitch + src_origin[0];
>      cl_int dst_offset = dst_origin[2]*dst_slice_pitch + dst_origin[1]*dst_row_pitch + dst_origin[0];
>      cl_int size = region[0]*region[1]*region[2];
> -    ret = cl_mem_copy(queue, src_buf, dst_buf,src_offset, dst_offset, size);
> +    ret = cl_mem_copy(queue, NULL, src_buf, dst_buf,src_offset, dst_offset, size);
>      return ret;
>    }
> 
> @@ -1669,14 +1670,15 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
>    cl_kernel_set_arg(ker, 9, sizeof(cl_int), &dst_row_pitch);
>    cl_kernel_set_arg(ker, 10, sizeof(cl_int), &dst_slice_pitch);
> 
> -  ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
> +  ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz);
>    cl_kernel_delete(ker);
>    return ret;
>  }
> 
>  LOCAL cl_int
> -cl_mem_kernel_copy_image(cl_command_queue queue, struct _cl_mem_image* src_image, struct _cl_mem_image* dst_image,
> -                         const size_t *src_origin, const size_t *dst_origin, const size_t *region) {
> +cl_mem_kernel_copy_image(cl_command_queue queue, cl_event event, struct _cl_mem_image* src_image,
> +                         struct _cl_mem_image* dst_image, const size_t *src_origin,
> +                         const size_t *dst_origin, const size_t *region) {
>    cl_int ret;
>    cl_kernel ker = NULL;
>    size_t global_off[] = {0,0,0};
> @@ -1817,7 +1819,7 @@ cl_mem_kernel_copy_image(cl_command_queue queue, struct _cl_mem_image* src_image
>    cl_kernel_set_arg(ker, 9, sizeof(cl_int), &dst_origin[1]);
>    cl_kernel_set_arg(ker, 10, sizeof(cl_int), &dst_origin[2]);
> 
> -  ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
> +  ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz);
> 
>  fail:
> 
> @@ -1830,7 +1832,7 @@ fail:
>  }
> 
>  LOCAL cl_int
> -cl_mem_copy_image_to_buffer(cl_command_queue queue, struct _cl_mem_image* image, cl_mem buffer,
> +cl_mem_copy_image_to_buffer(cl_command_queue queue, cl_event event, struct _cl_mem_image* image, cl_mem buffer,
>                           const size_t *src_origin, const size_t dst_offset, const size_t *region) {
>    cl_int ret;
>    cl_kernel ker = NULL;
> @@ -1919,7 +1921,7 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, struct _cl_mem_image* image,
>    cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_origin[2]);
>    cl_kernel_set_arg(ker, 8, sizeof(cl_int), &kn_dst_offset);
> 
> -  ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
> +  ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz);
> 
>  fail:
> 
> @@ -1933,7 +1935,7 @@ fail:
> 
> 
>  LOCAL cl_int
> -cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_mem_image* image,
> +cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_event event, cl_mem buffer, struct _cl_mem_image* image,
>                           const size_t src_offset, const size_t *dst_origin, const size_t *region) {
>    cl_int ret;
>    cl_kernel ker = NULL;
> @@ -2019,7 +2021,7 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me
>    cl_kernel_set_arg(ker, 7, sizeof(cl_int), &dst_origin[2]);
>    cl_kernel_set_arg(ker, 8, sizeof(cl_int), &kn_src_offset);
> 
> -  ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz);
> +  ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz);
>    cl_kernel_delete(ker);
> 
>    image->intel_fmt = intel_fmt;
> @@ -2308,3 +2310,83 @@ error:
>    mem = NULL;
>    goto exit;
>  }
> +
> +LOCAL cl_int
> +cl_mem_record_map_mem(cl_mem mem, void *ptr, void **mem_ptr, size_t offset,
> +                      size_t size, const size_t *origin, const size_t *region)
> +{
> +  // TODO: Need to add MT safe logic.
> +
> +  cl_int slot = -1;
> +  int err = CL_SUCCESS;
> +  size_t sub_offset = 0;
> +
> +  if(mem->type == CL_MEM_SUBBUFFER_TYPE) {
> +    struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
> +    sub_offset = buffer->sub_offset;
> +  }
> +
> +  ptr = (char*)ptr + offset + sub_offset;
> +  if(mem->flags & CL_MEM_USE_HOST_PTR) {
> +    assert(mem->host_ptr);
> +    //only calc ptr here, will do memcpy in enqueue
> +    *mem_ptr = (char *)mem->host_ptr + offset + sub_offset;
> +  } else {
> +    *mem_ptr = ptr;
> +  }
> +  /* Record the mapped address. */
> +  if (!mem->mapped_ptr_sz) {
> +    mem->mapped_ptr_sz = 16;
> +    mem->mapped_ptr = (cl_mapped_ptr *)malloc(
> +        sizeof(cl_mapped_ptr) * mem->mapped_ptr_sz);
> +    if (!mem->mapped_ptr) {
> +      cl_mem_unmap_auto(mem);
> +      err = CL_OUT_OF_HOST_MEMORY;
> +      goto error;
> +    }
> +    memset(mem->mapped_ptr, 0, mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
> +    slot = 0;
> +  } else {
> +    int i = 0;
> +    for (; i < mem->mapped_ptr_sz; i++) {
> +      if (mem->mapped_ptr[i].ptr == NULL) {
> +        slot = i;
> +        break;
> +      }
> +    }
> +    if (i == mem->mapped_ptr_sz) {
> +      cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc(
> +          sizeof(cl_mapped_ptr) * mem->mapped_ptr_sz * 2);
> +      if (!new_ptr) {
> +        cl_mem_unmap_auto(mem);
> +        err = CL_OUT_OF_HOST_MEMORY;
> +        goto error;
> +      }
> +      memset(new_ptr, 0, 2 * mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
> +      memcpy(new_ptr, mem->mapped_ptr,
> +          mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
> +      slot = mem->mapped_ptr_sz;
> +      mem->mapped_ptr_sz *= 2;
> +      free(mem->mapped_ptr);
> +      mem->mapped_ptr = new_ptr;
> +    }
> +  }
> +  assert(slot != -1);
> +  mem->mapped_ptr[slot].ptr = *mem_ptr;
> +  mem->mapped_ptr[slot].v_ptr = ptr;
> +  mem->mapped_ptr[slot].size = size;
> +  if(origin) {
> +    assert(region);
> +    mem->mapped_ptr[slot].origin[0] = origin[0];
> +    mem->mapped_ptr[slot].origin[1] = origin[1];
> +    mem->mapped_ptr[slot].origin[2] = origin[2];
> +    mem->mapped_ptr[slot].region[0] = region[0];
> +    mem->mapped_ptr[slot].region[1] = region[1];
> +    mem->mapped_ptr[slot].region[2] = region[2];
> +  }
> +  mem->map_ref++;
> +error:
> +  if (err != CL_SUCCESS)
> +    *mem_ptr = NULL;
> +  return err;
> +}
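
On the MT-safety TODO at the top of cl_mem_record_map_mem: once map
operations can complete on the queue's worker thread, the mapped_ptr
bookkeeping can race with the API thread. One option would be to serialize
it with a per-object mutex. A self-contained sketch of the same
grow-by-doubling slot table under a lock (all type and field names below
are illustrative stand-ins, not from the patch):

    #include <pthread.h>
    #include <stdlib.h>
    #include <string.h>

    typedef struct { void *ptr; } mapped_slot;  /* stand-in for cl_mapped_ptr */

    typedef struct {
      pthread_mutex_t map_lock;  /* proposed: guards the fields below */
      mapped_slot *mapped_ptr;
      int mapped_ptr_sz;
      int map_ref;
    } mem_stub;                  /* stand-in for _cl_mem */

    /* Returns the slot index, or -1 if the table could not grow. */
    static int record_map_locked(mem_stub *mem, void *p)
    {
      int slot = -1, i;
      pthread_mutex_lock(&mem->map_lock);
      if (mem->mapped_ptr_sz == 0) {
        mem->mapped_ptr = calloc(16, sizeof(mapped_slot));
        if (mem->mapped_ptr) {
          mem->mapped_ptr_sz = 16;
          slot = 0;
        }
      } else {
        for (i = 0; i < mem->mapped_ptr_sz; i++)
          if (mem->mapped_ptr[i].ptr == NULL) { slot = i; break; }
        if (slot < 0) {  /* no free slot: double the table */
          mapped_slot *n = calloc(2 * mem->mapped_ptr_sz, sizeof(mapped_slot));
          if (n) {
            memcpy(n, mem->mapped_ptr, mem->mapped_ptr_sz * sizeof(mapped_slot));
            free(mem->mapped_ptr);
            slot = mem->mapped_ptr_sz;
            mem->mapped_ptr_sz *= 2;
            mem->mapped_ptr = n;
          }
        }
      }
      if (slot >= 0) {  /* record only if we secured a slot */
        mem->mapped_ptr[slot].ptr = p;
        mem->map_ref++;
      }
      pthread_mutex_unlock(&mem->map_lock);
      return slot;     /* -1 maps to CL_OUT_OF_HOST_MEMORY upstream */
    }
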
> diff --git a/src/cl_mem.h b/src/cl_mem.h
> index 9bb5c47..82f30f6 100644
> --- a/src/cl_mem.h
> +++ b/src/cl_mem.h
> @@ -101,7 +101,17 @@ typedef  struct _cl_mem {
>  } _cl_mem;
> 
>  #define CL_OBJECT_MEM_MAGIC 0x381a27b9ee6504dfLL
> -#define CL_OBJECT_IS_MEM(obj) (((cl_base_object)obj)->magic == CL_OBJECT_MEM_MAGIC)
> +#define CL_OBJECT_IS_MEM(obj) ((obj &&                           \
> +         ((cl_base_object)obj)->magic == CL_OBJECT_MEM_MAGIC &&  \
> +         CL_OBJECT_GET_REF(obj) >= 1))
> +#define CL_OBJECT_IS_IMAGE(mem) ((mem &&                           \
> +         ((cl_base_object)mem)->magic == CL_OBJECT_MEM_MAGIC &&    \
> +         CL_OBJECT_GET_REF(mem) >= 1 &&                            \
> +         mem->type >= CL_MEM_IMAGE_TYPE))
> +#define CL_OBJECT_IS_BUFFER(mem) ((mem &&                          \
> +         ((cl_base_object)mem)->magic == CL_OBJECT_MEM_MAGIC &&    \
> +         CL_OBJECT_GET_REF(mem) >= 1 &&                            \
> +         mem->type < CL_MEM_IMAGE_TYPE))
> 
>  struct _cl_mem_image {
>    _cl_mem base;
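
The strengthened checks above mean the object macros now reject NULL,
wrong-magic, and already-freed (zero refcount) objects in a single test,
and CL_OBJECT_IS_IMAGE / CL_OBJECT_IS_BUFFER additionally discriminate on
the mem type. The intended use at an API entry point looks roughly like
this (illustrative function, not from the patch):

    cl_int
    some_image_api(cl_mem mem)
    {
      if (!CL_OBJECT_IS_IMAGE(mem))
        return CL_INVALID_MEM_OBJECT;  /* NULL, freed, or a buffer */
      /* safe to treat mem as struct _cl_mem_image from here on */
      return CL_SUCCESS;
    }

One nit: the macros expand their argument several times without
parenthesizing it, so they should only be handed a plain lvalue, never an
expression with side effects.
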
> @@ -221,30 +231,30 @@ extern void cl_mem_gl_delete(struct _cl_mem_gl_image *);
>  extern void cl_mem_add_ref(cl_mem);
> 
>  /* api clEnqueueCopyBuffer help function */
> -extern cl_int cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
> +extern cl_int cl_mem_copy(cl_command_queue queue, cl_event event, cl_mem src_buf, cl_mem dst_buf,
>                size_t src_offset, size_t dst_offset, size_t cb);
> 
> -extern cl_int cl_mem_fill(cl_command_queue queue, const void * pattern, size_t pattern_size,
> +extern cl_int cl_mem_fill(cl_command_queue queue, cl_event e, const void * pattern, size_t pattern_size,
>                cl_mem buffer, size_t offset, size_t size);
> 
>  extern cl_int cl_image_fill(cl_command_queue queue, const void * pattern, struct _cl_mem_image*,
>                                      const size_t *, const size_t *);
> 
>  /* api clEnqueueCopyBufferRect help function */
> -extern cl_int cl_mem_copy_buffer_rect(cl_command_queue, cl_mem, cl_mem,
> +extern cl_int cl_mem_copy_buffer_rect(cl_command_queue, cl_event event, cl_mem, cl_mem,
>                                       const size_t *, const size_t *, const size_t *,
>                                       size_t, size_t, size_t, size_t);
> 
>  /* api clEnqueueCopyImage help function */
> -extern cl_int cl_mem_kernel_copy_image(cl_command_queue, struct _cl_mem_image*, struct _cl_mem_image*,
> -                                       const size_t *, const size_t *, const size_t *);
> +extern cl_int cl_mem_kernel_copy_image(cl_command_queue, cl_event event, struct _cl_mem_image*,
> +                                       struct _cl_mem_image*, const size_t *, const size_t *, const size_t *);
> 
>  /* api clEnqueueCopyImageToBuffer help function */
> -extern cl_int cl_mem_copy_image_to_buffer(cl_command_queue, struct _cl_mem_image*, cl_mem,
> +extern cl_int cl_mem_copy_image_to_buffer(cl_command_queue, cl_event, struct _cl_mem_image*, cl_mem,
>                                            const size_t *, const size_t, const size_t *);
> 
>  /* api clEnqueueCopyBufferToImage help function */
> -extern cl_int cl_mem_copy_buffer_to_image(cl_command_queue, cl_mem, struct _cl_mem_image*,
> +extern cl_int cl_mem_copy_buffer_to_image(cl_command_queue, cl_event, cl_mem, struct _cl_mem_image*,
>                                            const size_t, const size_t *, const size_t *);
> 
>  /* Directly map a memory object */
> @@ -318,5 +328,8 @@ extern cl_mem cl_mem_new_image_from_fd(cl_context ctx,
>                                         size_t row_pitch,
>                                         cl_int *errcode);
> 
> +extern cl_int cl_mem_record_map_mem(cl_mem mem, void *ptr, void **mem_ptr, size_t offset,
> +                      size_t size, const size_t *origin, const size_t *region);
> +
>  #endif /* __CL_MEM_H__ */
> 
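
For context on the last new export: the map entry points are expected to
call cl_mem_record_map_mem once the driver has produced the mapped pointer,
so that clEnqueueUnmapMemObject can later find the slot again. Sketched for
the buffer case (variable names are illustrative):

    /* inside the clEnqueueMapBuffer path, after the driver map
       returned 'ptr' for 'buffer': */
    void *mem_ptr = NULL;
    err = cl_mem_record_map_mem(buffer, ptr, &mem_ptr, offset, size,
                                NULL /* origin: none for buffers */,
                                NULL /* region */);
    if (err != CL_SUCCESS)
      return err;  /* CL_OUT_OF_HOST_MEMORY when the slot table
                      cannot grow */
    /* mem_ptr is the pointer handed back to the application. */
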
> --
> 2.7.4
> 