[Beignet] [PATCH 4/9 newRT] Modify runtime code to support ELF and multi devices.

junyan.he at inbox.com junyan.he at inbox.com
Sat Apr 1 09:43:29 UTC 2017


From: Junyan He <junyan.he at intel.com>

This is a very important modification point in the runtime. After this
point, the runtime can be totally separated from the compiler.
The compiler acts as a backend and generates an ELF
format binary for execution. The runtime just loads the binary kernel
and sets up the execution environment for it. This can
be very useful: we can switch backends at will, and we can also
unload the compiler on some embedded systems, where resources
may be very limited.
Multi-device cases are also considered. Though we have
just one GEN device now, we may have multiple devices in the future;
heterogeneous device support is an important part of the CL
spec.

TODO:
OpenCL 2.0 features are not supported yet.

Signed-off-by: Junyan He <junyan.he at intel.com>
---
 src/CMakeLists.txt             |   11 +-
 src/cl_api.c                   |  391 +-------
 src/cl_api_command_queue.c     |    4 +-
 src/cl_api_context.c           |    8 +-
 src/cl_api_device_id.c         |    4 +-
 src/cl_api_kernel.c            |  329 ++++---
 src/cl_api_mem.c               |   25 +-
 src/cl_api_program.c           |  569 ++++++++++--
 src/cl_command_queue.c         |  203 +----
 src/cl_command_queue.h         |   21 +-
 src/cl_context.c               |  379 ++++----
 src/cl_context.h               |  162 +---
 src/cl_device_enqueue.c        |   11 +
 src/cl_device_id.c             | 1922 ++++++++--------------------------------
 src/cl_device_id.h             |  182 ++--
 src/cl_driver.h                |    6 -
 src/cl_enqueue.c               |   17 +-
 src/cl_enqueue.h               |    2 +-
 src/cl_kernel.c                |  817 ++++++++---------
 src/cl_kernel.h                |  165 ++--
 src/cl_mem.c                   |  824 +----------------
 src/cl_program.c               | 1581 +++++++++++++++++----------------
 src/cl_program.h               |  151 +---
 src/cl_sampler.c               |   61 +-
 src/gen/cl_command_queue_gen.c |  107 ++-
 src/gen/cl_device_id_gen.c     |   27 +-
 src/gen/cl_gen.h               |  106 ++-
 src/gen/cl_gen_device_common.h |   12 +-
 src/gen/intel_driver.c         |   13 +-
 src/gen/intel_driver.h         |    1 +
 30 files changed, 3035 insertions(+), 5076 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 91a772f..33b2e8d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -84,6 +84,7 @@ set(OPENCL_SRC
     cl_sampler.c
     cl_accelerator_intel.c
     cl_event.c
+    cl_compiler.c
     cl_enqueue.c
     cl_image.c
     cl_mem.c
@@ -95,7 +96,6 @@ set(OPENCL_SRC
     cl_command_queue.h
     cl_device_enqueue.c
     cl_device_enqueue.h
-    cl_command_queue_gen7.c
     cl_command_queue_enqueue.c
     cl_utils.c
     cl_driver.h
@@ -103,7 +103,14 @@ set(OPENCL_SRC
     cl_driver_defs.c
     gen/intel_batchbuffer.c
     gen/intel_driver.c
-    intel/intel_gpgpu.c
+    gen/cl_device_id_gen.c
+    gen/cl_kernel_gen.c
+    gen/cl_program_gen.c
+    gen/cl_context_gen.c
+    gen/cl_command_queue_gen.c
+    gen/cl_mem_gen.c
+    gen/cl_image_gen.c
+    gen/cl_compiler_gen.c
     performance.c)
 
 if (X11_FOUND)
diff --git a/src/cl_api.c b/src/cl_api.c
index f72533f..397b941 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -703,225 +703,6 @@ error:
   return sampler;
 }
 
-cl_program
-clCreateProgramWithSource(cl_context     context,
-                          cl_uint        count,
-                          const char **  strings,
-                          const size_t * lengths,
-                          cl_int *       errcode_ret)
-{
-  cl_program program = NULL;
-  cl_int err = CL_SUCCESS;
-  cl_uint i;
-
-  CHECK_CONTEXT (context);
-  INVALID_VALUE_IF (count == 0);
-  INVALID_VALUE_IF (strings == NULL);
-  for(i = 0; i < count; i++) {
-    if(UNLIKELY(strings[i] == NULL)) {
-      err = CL_INVALID_VALUE;
-      goto error;
-    }
-  }
-  program = cl_program_create_from_source(context,
-                                          count,
-                                          strings,
-                                          lengths,
-                                          &err);
-error:
-  if (errcode_ret)
-    *errcode_ret = err;
-  return program;
-}
-
-cl_program
-clCreateProgramWithBinary(cl_context             context,
-                          cl_uint                num_devices,
-                          const cl_device_id *   devices,
-                          const size_t *         lengths,
-                          const unsigned char ** binaries,
-                          cl_int *               binary_status,
-                          cl_int *               errcode_ret)
-{
-  cl_program program = NULL;
-  cl_int err = CL_SUCCESS;
-
-  CHECK_CONTEXT (context);
-  program = cl_program_create_from_binary(context,
-                                          num_devices,
-                                          devices,
-                                          lengths,
-                                          binaries,
-                                          binary_status,
-                                          &err);
-error:
-  if (errcode_ret)
-    *errcode_ret = err;
-  return program;
-}
-
-cl_program
-clCreateProgramWithBuiltInKernels(cl_context           context,
-                                  cl_uint              num_devices,
-                                  const cl_device_id * device_list,
-                                  const char *         kernel_names,
-                                  cl_int *             errcode_ret)
-{
-  cl_program program = NULL;
-  cl_int err = CL_SUCCESS;
-
-  CHECK_CONTEXT (context);
-  INVALID_VALUE_IF (kernel_names == NULL);
-  program = cl_program_create_with_built_in_kernles(context,
-                                                    num_devices,
-                                                    device_list,
-                                                    kernel_names,
-                                                    &err);
-error:
-  if (errcode_ret)
-    *errcode_ret = err;
-  return program;
-}
-
-cl_int
-clRetainProgram(cl_program program)
-{
-  cl_int err = CL_SUCCESS;
-  CHECK_PROGRAM (program);
-  cl_program_add_ref(program);
-error:
-  return err;
-}
-
-cl_int
-clReleaseProgram(cl_program program)
-{
-  cl_int err = CL_SUCCESS;
-  CHECK_PROGRAM (program);
-  cl_program_delete(program);
-error:
-  return err;
-}
-
-cl_int
-clBuildProgram(cl_program            program,
-               cl_uint               num_devices,
-               const cl_device_id *  device_list,
-               const char *          options,
-               void (CL_CALLBACK *pfn_notify) (cl_program, void*),
-               void *                user_data)
-{
-  cl_int err = CL_SUCCESS;
-  CHECK_PROGRAM(program);
-  INVALID_VALUE_IF (num_devices > 1);
-  INVALID_VALUE_IF (num_devices == 0 && device_list != NULL);
-  INVALID_VALUE_IF (num_devices != 0 && device_list == NULL);
-  INVALID_VALUE_IF (pfn_notify  == 0 && user_data   != NULL);
-
-  /* Everything is easy. We only support one device anyway */
-  if (num_devices != 0) {
-    assert(program->ctx);
-    err = cl_devices_list_include_check(program->ctx->device_num,
-                                        program->ctx->devices, num_devices, device_list);
-    if (err)
-      goto error;
-  }
-
-  assert(program->source_type == FROM_LLVM ||
-         program->source_type == FROM_SOURCE ||
-         program->source_type == FROM_LLVM_SPIR ||
-         program->source_type == FROM_BINARY ||
-         program->source_type == FROM_CMRT);
-  if((err = cl_program_build(program, options)) != CL_SUCCESS) {
-    goto error;
-  }
-  program->is_built = CL_TRUE;
-
-  if (pfn_notify) pfn_notify(program, user_data);
-
-error:
-  return err;
-}
-
-cl_int
-clCompileProgram(cl_program            program ,
-                 cl_uint               num_devices ,
-                 const cl_device_id *  device_list ,
-                 const char *          options ,
-                 cl_uint               num_input_headers ,
-                 const cl_program *    input_headers ,
-                 const char **         header_include_names ,
-                 void (CL_CALLBACK *   pfn_notify )(cl_program, void *),
-                 void *                user_data )
-{
-  cl_int err = CL_SUCCESS;
-  CHECK_PROGRAM(program);
-  INVALID_VALUE_IF (num_devices > 1);
-  INVALID_VALUE_IF (num_devices == 0 && device_list != NULL);
-  INVALID_VALUE_IF (num_devices != 0 && device_list == NULL);
-  INVALID_VALUE_IF (pfn_notify  == 0 && user_data   != NULL);
-  INVALID_VALUE_IF (num_input_headers == 0 && input_headers != NULL);
-  INVALID_VALUE_IF (num_input_headers != 0 && input_headers == NULL);
-
-  /* Everything is easy. We only support one device anyway */
-  if (num_devices != 0) {
-    assert(program->ctx);
-    err = cl_devices_list_include_check(program->ctx->device_num,
-                                        program->ctx->devices, num_devices, device_list);
-    if (err)
-      goto error;
-  }
-
-  /* TODO support create program from binary */
-  assert(program->source_type == FROM_LLVM ||
-      program->source_type == FROM_SOURCE ||
-      program->source_type == FROM_LLVM_SPIR ||
-      program->source_type == FROM_BINARY);
-  if((err = cl_program_compile(program, num_input_headers, input_headers, header_include_names, options)) != CL_SUCCESS) {
-    goto error;
-  }
-  program->is_built = CL_TRUE;
-
-  if (pfn_notify) pfn_notify(program, user_data);
-
-error:
-  return err;
-}
-
-cl_program
-clLinkProgram(cl_context            context,
-              cl_uint               num_devices,
-              const cl_device_id *  device_list,
-              const char *          options,
-              cl_uint               num_input_programs,
-              const cl_program *    input_programs,
-              void (CL_CALLBACK *   pfn_notify)(cl_program  program, void * user_data),
-              void *                user_data,
-              cl_int *              errcode_ret)
-{
-  cl_int err = CL_SUCCESS;
-  cl_program program = NULL;
-  CHECK_CONTEXT (context);
-  INVALID_VALUE_IF (num_devices > 1);
-  INVALID_VALUE_IF (num_devices == 0 && device_list != NULL);
-  INVALID_VALUE_IF (num_devices != 0 && device_list == NULL);
-  INVALID_VALUE_IF (pfn_notify  == 0 && user_data   != NULL);
-  INVALID_VALUE_IF (num_input_programs == 0 && input_programs != NULL);
-  INVALID_VALUE_IF (num_input_programs != 0 && input_programs == NULL);
-  INVALID_VALUE_IF (num_input_programs == 0 && input_programs == NULL);
-
-  program = cl_program_link(context, num_input_programs, input_programs, options, &err);
-
-  if(program) program->is_built = CL_TRUE;
-
-  if (pfn_notify) pfn_notify(program, user_data);
-
-error:
-  if (errcode_ret)
-    *errcode_ret = err;
-  return program;
-}
-
 cl_int
 clUnloadCompiler(void)
 {
@@ -934,95 +715,6 @@ clUnloadPlatformCompiler(cl_platform_id platform)
   return CL_SUCCESS;
 }
 
-cl_kernel
-clCreateKernel(cl_program   program,
-               const char * kernel_name,
-               cl_int *     errcode_ret)
-{
-  cl_kernel kernel = NULL;
-  cl_int err = CL_SUCCESS;
-
-  CHECK_PROGRAM (program);
-  if (program->ker_n <= 0) {
-    err = CL_INVALID_PROGRAM_EXECUTABLE;
-    goto error;
-  }
-  INVALID_VALUE_IF (kernel_name == NULL);
-  kernel = cl_program_create_kernel(program, kernel_name, &err);
-
-error:
-  if (errcode_ret)
-    *errcode_ret = err;
-  return kernel;
-}
-
-cl_int
-clCreateKernelsInProgram(cl_program      program,
-                         cl_uint         num_kernels,
-                         cl_kernel *     kernels,
-                         cl_uint *       num_kernels_ret)
-{
-  cl_int err = CL_SUCCESS;
-
-  CHECK_PROGRAM (program);
-  if (program->ker_n <= 0) {
-    err = CL_INVALID_PROGRAM_EXECUTABLE;
-    goto error;
-  }
-  if (kernels && num_kernels < program->ker_n) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-
-  if(num_kernels_ret)
-    *num_kernels_ret = program->ker_n;
-
-  if(kernels)
-    err = cl_program_create_kernels_in_program(program, kernels);
-
-error:
-  return err;
-}
-
-cl_int
-clRetainKernel(cl_kernel kernel)
-{
-  cl_int err = CL_SUCCESS;
-  CHECK_KERNEL(kernel);
-  cl_kernel_add_ref(kernel);
-error:
-  return err;
-}
-
-cl_int
-clReleaseKernel(cl_kernel kernel)
-{
-  cl_int err = CL_SUCCESS;
-  CHECK_KERNEL(kernel);
-  cl_kernel_delete(kernel);
-error:
-  return err;
-}
-
-cl_int
-clSetKernelArg(cl_kernel     kernel,
-               cl_uint       arg_index,
-               size_t        arg_size,
-               const void *  arg_value)
-{
-  cl_int err = CL_SUCCESS;
-  CHECK_KERNEL(kernel);
-
-#ifdef HAS_CMRT
-  if (kernel->cmrt_kernel != NULL)
-    err = cmrt_set_kernel_arg(kernel, arg_index, arg_size, arg_value);
-  else
-#endif
-    err = cl_kernel_set_arg(kernel, arg_index, arg_size, arg_value);
-error:
-  return err;
-}
-
 cl_int
 clSetKernelArgSVMPointer(cl_kernel kernel,
                           cl_uint arg_index,
@@ -1063,74 +755,6 @@ error:
   return err;
 }
 
-cl_int clGetKernelArgInfo(cl_kernel kernel, cl_uint arg_index, cl_kernel_arg_info param_name,
-        size_t param_value_size, void *param_value, size_t *param_value_size_ret)
-{
-  cl_int err = CL_SUCCESS;
-  CHECK_KERNEL(kernel);
-
-  if(kernel->program->build_opts == NULL ||
-        strstr(kernel->program->build_opts,"-cl-kernel-arg-info") == NULL ) {
-    err = CL_KERNEL_ARG_INFO_NOT_AVAILABLE;
-    goto error;
-  }
-  if (param_name != CL_KERNEL_ARG_ADDRESS_QUALIFIER
-          && param_name != CL_KERNEL_ARG_ACCESS_QUALIFIER
-          && param_name != CL_KERNEL_ARG_TYPE_NAME
-          && param_name != CL_KERNEL_ARG_TYPE_QUALIFIER
-          && param_name != CL_KERNEL_ARG_NAME) {
-    err = CL_INVALID_VALUE;
-    goto error;
-  }
-
-  if (arg_index >= kernel->arg_n) {
-    err = CL_INVALID_ARG_INDEX;
-    goto error;
-  }
-
-  err = cl_get_kernel_arg_info(kernel, arg_index, param_name, param_value_size,
-          param_value, param_value_size_ret);
-
-error:
-  return err;
-}
-
-cl_int
-clGetKernelWorkGroupInfo(cl_kernel                   kernel,
-                         cl_device_id                device,
-                         cl_kernel_work_group_info   param_name,
-                         size_t                      param_value_size,
-                         void *                      param_value,
-                         size_t *                    param_value_size_ret)
-{
-  return cl_get_kernel_workgroup_info(kernel,
-                                      device,
-                                      param_name,
-                                      param_value_size,
-                                      param_value,
-                                      param_value_size_ret);
-}
-
-cl_int
-clGetKernelSubGroupInfoKHR(cl_kernel                   kernel,
-                          cl_device_id                device,
-                          cl_kernel_work_group_info   param_name,
-                          size_t                      input_value_size,
-                          const void *                input_value,
-                          size_t                      param_value_size,
-                          void *                      param_value,
-                          size_t *                    param_value_size_ret)
-{
-  return cl_get_kernel_subgroup_info(kernel,
-                                     device,
-                                     param_name,
-                                     input_value_size,
-                                     input_value,
-                                     param_value_size,
-                                     param_value,
-                                     param_value_size_ret);
-}
-
 cl_int
 clRetainEvent(cl_event  event)
 {
@@ -1182,7 +806,7 @@ cl_mem clCreatePipe (cl_context context,
     err = CL_INVALID_PIPE_SIZE;
     goto error;
   }
-  if ((err = cl_get_device_info(context->devices[0],
+  if ((err = cl_device_get_info(context->devices[0],
                                 CL_DEVICE_PIPE_MAX_PACKET_SIZE,
                                 sizeof(device_max_size),
                                 &device_max_size,
@@ -1280,8 +904,8 @@ clGetExtensionFunctionAddressForPlatform(cl_platform_id platform,
 cl_int
 clReportUnfreedIntel(void)
 {
-  return CL_SUCCESS;
-  //return cl_report_unfreed();
+  CL_ALLOC_REPORT_UNFREED();
+  return 0;
 }
 
 void*
@@ -1353,7 +977,14 @@ error:
 cl_int
 clGetGenVersionIntel(cl_device_id device, cl_int *ver)
 {
-  return cl_device_get_version(device, ver);
+  if (ver == NULL)
+    return CL_INVALID_VALUE;
+
+  if (!CL_OBJECT_IS_DEVICE(device)) {
+    return CL_INVALID_DEVICE;
+  }
+
+  return cl_device_get_version_gen(device, ver);
 }
 
 cl_program
diff --git a/src/cl_api_command_queue.c b/src/cl_api_command_queue.c
index b1aee12..c4132ed 100644
--- a/src/cl_api_command_queue.c
+++ b/src/cl_api_command_queue.c
@@ -50,7 +50,7 @@ clCreateCommandQueue(cl_context context,
       break;
     }
 
-    queue = cl_create_command_queue(context, device, properties, 0, &err);
+    queue = cl_command_queue_create(context, device, properties, 0, &err);
   } while (0);
 
   if (errcode_ret)
@@ -142,7 +142,7 @@ clCreateCommandQueueWithProperties(cl_context context,
       break;
     }
 
-    queue = cl_create_command_queue(context, device, prop, queue_sz, &err);
+    queue = cl_command_queue_create(context, device, prop, queue_sz, &err);
   } while (0);
 
   if (errcode_ret)
diff --git a/src/cl_api_context.c b/src/cl_api_context.c
index fa1be08..1519258 100644
--- a/src/cl_api_context.c
+++ b/src/cl_api_context.c
@@ -52,7 +52,7 @@ clCreateContext(const cl_context_properties *properties,
     if (err != CL_SUCCESS)
       break;
 
-    context = cl_create_context(properties, num_devices, devices, pfn_notify, user_data, &err);
+    context = cl_context_create(properties, num_devices, devices, pfn_notify, user_data, &err);
   } while (0);
 
   if (errcode_ret)
@@ -87,17 +87,17 @@ clCreateContextFromType(const cl_context_properties *properties,
     }
 
     /* Get the devices num first. */
-    err = cl_get_device_ids(NULL, device_type, 0, NULL, &num_devices);
+    err = cl_device_get_ids(NULL, device_type, 0, NULL, &num_devices);
     if (err != CL_SUCCESS)
       break;
 
     assert(num_devices > 0);
     devices = CL_MALLOC(num_devices * sizeof(cl_device_id));
-    err = cl_get_device_ids(NULL, device_type, num_devices, &devices[0], &num_devices);
+    err = cl_device_get_ids(NULL, device_type, num_devices, &devices[0], &num_devices);
     if (err != CL_SUCCESS)
       break;
 
-    context = cl_create_context(properties, num_devices, devices, pfn_notify, user_data, &err);
+    context = cl_context_create(properties, num_devices, devices, pfn_notify, user_data, &err);
   } while (0);
 
   if (devices)
diff --git a/src/cl_api_device_id.c b/src/cl_api_device_id.c
index 4ffef78..68bbf92 100644
--- a/src/cl_api_device_id.c
+++ b/src/cl_api_device_id.c
@@ -40,7 +40,7 @@ clGetDeviceIDs(cl_platform_id platform,
   if ((device_type & valid_type) == 0)
     return CL_INVALID_DEVICE_TYPE;
 
-  return cl_get_device_ids(platform, device_type, num_entries, devices, num_devices);
+  return cl_device_get_ids(platform, device_type, num_entries, devices, num_devices);
 }
 
 cl_int
@@ -54,7 +54,7 @@ clGetDeviceInfo(cl_device_id device,
     return CL_INVALID_DEVICE;
   }
 
-  return cl_get_device_info(device, param_name, param_value_size,
+  return cl_device_get_info(device, param_name, param_value_size,
                             param_value, param_value_size_ret);
 }
 
diff --git a/src/cl_api_kernel.c b/src/cl_api_kernel.c
index ce4d7b8..ab46f9c 100644
--- a/src/cl_api_kernel.c
+++ b/src/cl_api_kernel.c
@@ -22,11 +22,54 @@
 #include "cl_event.h"
 #include "cl_context.h"
 #include "cl_program.h"
+#include "cl_device_id.h"
 #include "cl_alloc.h"
 #include "CL/cl.h"
 #include <stdio.h>
 #include <string.h>
 
+cl_kernel
+clCreateKernel(cl_program program,
+               const char *kernel_name,
+               cl_int *errcode_ret)
+{
+  cl_kernel kernel = NULL;
+  cl_int err = CL_SUCCESS;
+
+  do {
+    if (!CL_OBJECT_IS_PROGRAM(program)) {
+      err = CL_INVALID_PROGRAM;
+      break;
+    }
+
+    if (kernel_name == NULL) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+
+    kernel = cl_kernel_create(program, kernel_name, &err);
+  } while (0);
+
+  if (errcode_ret)
+    *errcode_ret = err;
+  return kernel;
+}
+
+cl_int
+clSetKernelArg(cl_kernel kernel,
+               cl_uint arg_index,
+               size_t arg_size,
+               const void *arg_value)
+{
+  cl_int err = CL_SUCCESS;
+  if (!CL_OBJECT_IS_KERNEL(kernel)) {
+    return CL_INVALID_KERNEL;
+  }
+
+  err = cl_kernel_set_arg(kernel, arg_index, arg_size, arg_value);
+  return err;
+}
+
 cl_int
 clGetKernelInfo(cl_kernel kernel,
                 cl_kernel_info param_name,
@@ -39,6 +82,7 @@ clGetKernelInfo(cl_kernel kernel,
   const char *str = NULL;
   cl_int ref;
   cl_uint n;
+  char null_attr = 0;
 
   if (!CL_OBJECT_IS_KERNEL(kernel)) {
     return CL_INVALID_KERNEL;
@@ -59,11 +103,13 @@ clGetKernelInfo(cl_kernel kernel,
     src_ptr = &ref;
     src_size = sizeof(cl_int);
   } else if (param_name == CL_KERNEL_FUNCTION_NAME) {
-    str = cl_kernel_get_name(kernel);
+    str = kernel->name;
     src_ptr = str;
     src_size = strlen(str) + 1;
   } else if (param_name == CL_KERNEL_ATTRIBUTES) {
-    str = cl_kernel_get_attributes(kernel);
+    str = kernel->kernel_attr;
+    if (str == NULL)
+      str = &null_attr;
     src_ptr = str;
     src_size = strlen(str) + 1;
   } else {
@@ -116,17 +162,6 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
       break;
     }
 
-    if (kernel->vme) {
-      if (work_dim != 2) {
-        err = CL_INVALID_WORK_DIMENSION;
-        break;
-      }
-      if (local_work_size != NULL) {
-        err = CL_INVALID_WORK_GROUP_SIZE;
-        break;
-      }
-    }
-
     if (global_work_offset != NULL) {
       for (i = 0; i < work_dim; ++i) {
         if (UNLIKELY(global_work_offset[i] + global_work_size[i] > (size_t)-1)) {
@@ -147,46 +182,36 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
       for (i = 0; i < work_dim; ++i)
         fixed_local_sz[i] = local_work_size[i];
     } else {
-      if (kernel->vme) {
-        fixed_local_sz[0] = 16;
-        fixed_local_sz[1] = 1;
-      } else {
-        uint j, maxDimSize = 64 /* from 64? */, maxGroupSize = 256; //MAX_WORK_GROUP_SIZE may too large
-        size_t realGroupSize = 1;
-        for (i = 0; i < work_dim; i++) {
-          for (j = maxDimSize; j > 1; j--) {
-            if (global_work_size[i] % j == 0 && j <= maxGroupSize) {
-              fixed_local_sz[i] = j;
-              maxGroupSize = maxGroupSize / j;
-              maxDimSize = maxGroupSize > maxDimSize ? maxDimSize : maxGroupSize;
-              break; //choose next work_dim
-            }
+      uint j, maxDimSize = 64 /* from 64? */, maxGroupSize = 256; //MAX_WORK_GROUP_SIZE may too large
+      size_t realGroupSize = 1;
+      for (i = 0; i < work_dim; i++) {
+        for (j = maxDimSize; j > 1; j--) {
+          if (global_work_size[i] % j == 0 && j <= maxGroupSize) {
+            fixed_local_sz[i] = j;
+            maxGroupSize = maxGroupSize / j;
+            maxDimSize = maxGroupSize > maxDimSize ? maxDimSize : maxGroupSize;
+            break; //choose next work_dim
           }
-          realGroupSize *= fixed_local_sz[i];
         }
+        realGroupSize *= fixed_local_sz[i];
+      }
 
-        //in a loop of conformance test (such as test_api repeated_setup_cleanup), in each loop:
-        //create a new context, a new command queue, and uses 'globalsize[0]=1000, localsize=NULL' to enqueu kernel
-        //it triggers the following message for many times.
-        //to avoid too many messages, only print it for the first time of the process.
-        //just use static variable since it doesn't matter to print a few times at multi-thread case.
-        static int warn_no_good_localsize = 1;
-        if (realGroupSize % 8 != 0 && warn_no_good_localsize) {
-          warn_no_good_localsize = 0;
-          DEBUGP(DL_WARNING, "unable to find good values for local_work_size[i], please provide\n"
-                             " local_work_size[] explicitly, you can find good values with\n"
-                             " trial-and-error method.");
-        }
+      //in a loop of conformance test (such as test_api repeated_setup_cleanup), in each loop:
+      //create a new context, a new command queue, and uses 'globalsize[0]=1000, localsize=NULL' to enqueu kernel
+      //it triggers the following message for many times.
+      //to avoid too many messages, only print it for the first time of the process.
+      //just use static variable since it doesn't matter to print a few times at multi-thread case.
+      static int warn_no_good_localsize = 1;
+      if (realGroupSize % 8 != 0 && warn_no_good_localsize) {
+        warn_no_good_localsize = 0;
+        DEBUGP(DL_WARNING, "unable to find good values for local_work_size[i], please provide\n"
+                           " local_work_size[] explicitly, you can find good values with\n"
+                           " trial-and-error method.");
       }
     }
 
-    if (kernel->vme) {
-      fixed_global_sz[0] = (global_work_size[0] + 15) / 16 * 16;
-      fixed_global_sz[1] = (global_work_size[1] + 15) / 16;
-    } else {
-      for (i = 0; i < work_dim; ++i)
-        fixed_global_sz[i] = global_work_size[i];
-    }
+    for (i = 0; i < work_dim; ++i)
+      fixed_global_sz[i] = global_work_size[i];
 
     if (global_work_offset != NULL)
       for (i = 0; i < work_dim; ++i)
@@ -207,80 +232,33 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
       break;
     }
 
-    int i, j, k;
-    const size_t global_wk_sz_div[3] = {
-      fixed_global_sz[0] / fixed_local_sz[0] * fixed_local_sz[0],
-      fixed_global_sz[1] / fixed_local_sz[1] * fixed_local_sz[1],
-      fixed_global_sz[2] / fixed_local_sz[2] * fixed_local_sz[2]};
-
-    const size_t global_wk_sz_rem[3] = {
-      fixed_global_sz[0] % fixed_local_sz[0],
-      fixed_global_sz[1] % fixed_local_sz[1],
-      fixed_global_sz[2] % fixed_local_sz[2]};
-    cl_uint count;
-    count = global_wk_sz_rem[0] ? 2 : 1;
-    count *= global_wk_sz_rem[1] ? 2 : 1;
-    count *= global_wk_sz_rem[2] ? 2 : 1;
-
-    const size_t *global_wk_all[2] = {global_wk_sz_div, global_wk_sz_rem};
-    /* Go through the at most 8 cases and euque if there is work items left */
-    for (i = 0; i < 2; i++) {
-      for (j = 0; j < 2; j++) {
-        for (k = 0; k < 2; k++) {
-          size_t global_wk_sz_use[3] = {global_wk_all[k][0], global_wk_all[j][1], global_wk_all[i][2]};
-          size_t global_dim_off[3] = {
-            k * global_wk_sz_div[0] / fixed_local_sz[0],
-            j * global_wk_sz_div[1] / fixed_local_sz[1],
-            i * global_wk_sz_div[2] / fixed_local_sz[2]};
-          size_t local_wk_sz_use[3] = {
-            k ? global_wk_sz_rem[0] : fixed_local_sz[0],
-            j ? global_wk_sz_rem[1] : fixed_local_sz[1],
-            i ? global_wk_sz_rem[2] : fixed_local_sz[2]};
-          if (local_wk_sz_use[0] == 0 || local_wk_sz_use[1] == 0 || local_wk_sz_use[2] == 0)
-            continue;
-
-          e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
-                              event_wait_list, CL_COMMAND_NDRANGE_KERNEL, &err);
-          if (err != CL_SUCCESS) {
-            break;
-          }
+    e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list,
+                        event_wait_list, CL_COMMAND_NDRANGE_KERNEL, &err);
+    if (err != CL_SUCCESS) {
+      break;
+    }
 
-          /* Do device specific checks are enqueue the kernel */
-          err = cl_command_queue_ND_range(command_queue, kernel, e, work_dim,
-                                          fixed_global_off, global_dim_off, fixed_global_sz,
-                                          global_wk_sz_use, fixed_local_sz, local_wk_sz_use);
-          if (err != CL_SUCCESS) {
-            break;
-          }
-          e->exec_data.mid_event_of_enq = (count > 1);
-          count--;
+    /* Do device specific checks are enqueue the kernel */
+    err = cl_command_queue_ND_range(command_queue, kernel, e, work_dim,
+                                    fixed_global_off, fixed_global_sz, fixed_local_sz);
+    if (err != CL_SUCCESS) {
+      break;
+    }
 
-          /* We will flush the ndrange if no event depend. Else we will add it to queue list.
+    /* We will flush the ndrange if no event depend. Else we will add it to queue list.
              The finish or Complete status will always be done in queue list. */
-          event_status = cl_event_is_ready(e);
-          if (event_status < CL_COMPLETE) { // Error happend, cancel.
-            err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
-            break;
-          }
-
-          err = cl_event_exec(e, (event_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED), CL_FALSE);
-          if (err != CL_SUCCESS) {
-            break;
-          }
-
-          cl_command_queue_enqueue_event(command_queue, e);
+    event_status = cl_event_is_ready(e);
+    if (event_status < CL_COMPLETE) { // Error happend, cancel.
+      err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+      break;
+    }
 
-          if (e->exec_data.mid_event_of_enq)
-            cl_event_delete(e);
-        }
-        if (err != CL_SUCCESS) {
-          break;
-        }
-      }
-      if (err != CL_SUCCESS) {
-        break;
-      }
+    err = cl_event_exec(e, (event_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED), CL_FALSE);
+    if (err != CL_SUCCESS) {
+      break;
     }
+
+    cl_command_queue_enqueue_event(command_queue, e);
   } while (0);
 
   if (err == CL_SUCCESS && event) {
@@ -420,3 +398,122 @@ clEnqueueNativeKernel(cl_command_queue command_queue,
 
   return err;
 }
+
+cl_int
+clRetainKernel(cl_kernel kernel)
+{
+  if (!CL_OBJECT_IS_KERNEL(kernel)) {
+    return CL_INVALID_KERNEL;
+  }
+
+  cl_kernel_add_ref(kernel);
+  return CL_SUCCESS;
+}
+
+cl_int
+clReleaseKernel(cl_kernel kernel)
+{
+  if (!CL_OBJECT_IS_KERNEL(kernel)) {
+    return CL_INVALID_KERNEL;
+  }
+
+  cl_kernel_delete(kernel);
+  return CL_SUCCESS;
+}
+
+cl_int clGetKernelArgInfo(cl_kernel kernel,
+                          cl_uint arg_index,
+                          cl_kernel_arg_info param_name,
+                          size_t param_value_size,
+                          void *param_value,
+                          size_t *param_value_size_ret)
+{
+  if (!CL_OBJECT_IS_KERNEL(kernel)) {
+    return CL_INVALID_KERNEL;
+  }
+
+  if (kernel->program->build_opts == NULL ||
+      strstr(kernel->program->build_opts, "-cl-kernel-arg-info") == NULL) {
+    return CL_KERNEL_ARG_INFO_NOT_AVAILABLE;
+  }
+  if (param_name != CL_KERNEL_ARG_ADDRESS_QUALIFIER && param_name != CL_KERNEL_ARG_ACCESS_QUALIFIER &&
+      param_name != CL_KERNEL_ARG_TYPE_NAME && param_name != CL_KERNEL_ARG_TYPE_QUALIFIER &&
+      param_name != CL_KERNEL_ARG_NAME) {
+    return CL_INVALID_VALUE;
+  }
+
+  if (arg_index >= kernel->arg_n) {
+    return CL_INVALID_ARG_INDEX;
+  }
+
+  return cl_kernel_get_argument_info(kernel, arg_index, param_name, param_value_size,
+                                     param_value, param_value_size_ret);
+}
+
+cl_int
+clGetKernelWorkGroupInfo(cl_kernel kernel,
+                         cl_device_id device,
+                         cl_kernel_work_group_info param_name,
+                         size_t param_value_size,
+                         void *param_value,
+                         size_t *param_value_size_ret)
+{
+  cl_int err = CL_SUCCESS;
+  if (!CL_OBJECT_IS_KERNEL(kernel)) {
+    return CL_INVALID_KERNEL;
+  }
+
+  if (device) {
+    err = cl_devices_list_check(1, (const cl_device_id *)&device);
+    if (err != CL_SUCCESS)
+      return err;
+
+    err = cl_devices_list_include_check(kernel->program->ctx->device_num,
+                                        kernel->program->ctx->devices, 1,
+                                        (const cl_device_id *)&device);
+    if (err != CL_SUCCESS)
+      return err;
+  } else {
+    if (kernel->each_device_num != 1)
+      return CL_INVALID_DEVICE;
+  }
+
+  return cl_kernel_get_workgroup_info(kernel, device, param_name, param_value_size,
+                                      param_value, param_value_size_ret);
+}
+
+cl_int
+clGetKernelSubGroupInfoKHR(cl_kernel kernel,
+                           cl_device_id device,
+                           cl_kernel_work_group_info param_name,
+                           size_t input_value_size,
+                           const void *input_value,
+                           size_t param_value_size,
+                           void *param_value,
+                           size_t *param_value_size_ret)
+{
+  cl_int err = CL_SUCCESS;
+  if (!CL_OBJECT_IS_KERNEL(kernel)) {
+    return CL_INVALID_KERNEL;
+  }
+
+  if (device == NULL) {
+    if (kernel->program->ctx->device_num > 1)
+      return CL_INVALID_DEVICE;
+
+    device = kernel->program->ctx->devices[0];
+  } else {
+    err = cl_devices_list_check(1, (const cl_device_id *)&device);
+    if (err != CL_SUCCESS)
+      return err;
+
+    err = cl_devices_list_include_check(kernel->program->ctx->device_num,
+                                        kernel->program->ctx->devices, 1,
+                                        (const cl_device_id *)&device);
+    if (err != CL_SUCCESS)
+      return err;
+  }
+
+  return cl_kernel_get_subgroup_info(kernel, device, param_name, input_value_size, input_value,
+                                     param_value_size, param_value, param_value_size_ret);
+}
diff --git a/src/cl_api_mem.c b/src/cl_api_mem.c
index 0d19bf8..d842349 100644
--- a/src/cl_api_mem.c
+++ b/src/cl_api_mem.c
@@ -20,6 +20,7 @@
 #include "cl_enqueue.h"
 #include "cl_command_queue.h"
 #include "cl_event.h"
+#include "cl_device_id.h"
 #include "CL/cl.h"
 
 cl_int
@@ -1004,7 +1005,7 @@ clEnqueueCopyBuffer(cl_command_queue command_queue,
       break;
     }
 
-    err = cl_mem_copy(command_queue, e, src_buffer, dst_buffer, src_offset, dst_offset, cb);
+    err = command_queue->device->api.mem_copy(command_queue, e, src_buffer, dst_buffer, src_offset, dst_offset, cb);
     if (err != CL_SUCCESS) {
       break;
     }
@@ -1195,8 +1196,9 @@ clEnqueueCopyBufferRect(cl_command_queue command_queue,
       break;
     }
 
-    err = cl_mem_copy_buffer_rect(command_queue, e, src_buffer, dst_buffer, src_origin, dst_origin, region,
-                                  src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch);
+    err = command_queue->device->api.mem_copy_rect(command_queue, e, src_buffer, dst_buffer,
+                                                   src_origin, dst_origin, region, src_row_pitch,
+                                                   src_slice_pitch, dst_row_pitch, dst_slice_pitch);
     if (err != CL_SUCCESS) {
       break;
     }
@@ -1295,7 +1297,8 @@ clEnqueueFillBuffer(cl_command_queue command_queue,
       break;
     }
 
-    err = cl_mem_fill(command_queue, e, pattern, pattern_size, buffer, offset, size);
+    err = command_queue->device->api.mem_fill(command_queue, e, pattern, pattern_size,
+                                              buffer, offset, size);
     if (err) {
       break;
     }
@@ -2049,8 +2052,8 @@ clEnqueueCopyImage(cl_command_queue command_queue,
       break;
     }
 
-    err = cl_mem_kernel_copy_image(command_queue, e, src_image, dst_image,
-                                   src_origin, dst_origin, region);
+    err = command_queue->device->api.image_copy(command_queue, e, src_mem, dst_mem,
+                                                src_origin, dst_origin, region);
     if (err != CL_SUCCESS) {
       break;
     }
@@ -2159,8 +2162,8 @@ clEnqueueCopyImageToBuffer(cl_command_queue command_queue,
       break;
     }
 
-    err = cl_mem_copy_image_to_buffer(command_queue, e, src_image, dst_buffer,
-                                      src_origin, dst_offset, region);
+    err = command_queue->device->api.copy_image_to_buffer(command_queue, e, src_mem, dst_buffer,
+                                                          src_origin, dst_offset, region);
     if (err != CL_SUCCESS) {
       break;
     }
@@ -2270,8 +2273,8 @@ clEnqueueCopyBufferToImage(cl_command_queue command_queue,
       break;
     }
 
-    err = cl_mem_copy_buffer_to_image(command_queue, e, src_buffer, dst_image,
-                                      src_offset, dst_origin, region);
+    err = command_queue->device->api.copy_buffer_to_image(command_queue, e, src_buffer, dst_mem,
+                                                          src_offset, dst_origin, region);
 
     if (err != CL_SUCCESS) {
       break;
@@ -2382,7 +2385,7 @@ clEnqueueFillImage(cl_command_queue command_queue,
       break;
     }
 
-    err = cl_image_fill(command_queue, e, fill_color, image, origin, region);
+    err = command_queue->device->api.image_fill(command_queue, e, fill_color, mem, origin, region);
     if (err != CL_SUCCESS) {
       break;
     }
diff --git a/src/cl_api_program.c b/src/cl_api_program.c
index d68f29f..67fe3b3 100644
--- a/src/cl_api_program.c
+++ b/src/cl_api_program.c
@@ -15,9 +15,10 @@
  * License along with this library. If not, see <http://www.gnu.org/licenses/>.
  *
  */
+#include "cl_alloc.h"
 #include "cl_context.h"
-#include "cl_program.h"
 #include "cl_device_id.h"
+#include "cl_program.h"
 #include <string.h>
 
 cl_int
@@ -32,6 +33,7 @@ clGetProgramInfo(cl_program program,
   const char *ret_str = "";
   cl_int ref;
   cl_uint num_dev, kernels_num;
+  cl_int i;
 
   if (!CL_OBJECT_IS_PROGRAM(program)) {
     return CL_INVALID_PROGRAM;
@@ -45,14 +47,19 @@ clGetProgramInfo(cl_program program,
     src_ptr = &program->ctx;
     src_size = sizeof(cl_context);
   } else if (param_name == CL_PROGRAM_NUM_DEVICES) {
-    num_dev = program->ctx->device_num; // Just 1 dev now.
+    num_dev = program->ctx->device_num;
     src_ptr = &num_dev;
     src_size = sizeof(cl_uint);
   } else if (param_name == CL_PROGRAM_DEVICES) {
     src_ptr = program->ctx->devices;
     src_size = program->ctx->device_num * sizeof(cl_device_id);
   } else if (param_name == CL_PROGRAM_NUM_KERNELS) {
-    kernels_num = program->ker_n;
+    cl_int err;
+    kernels_num = 0;
+    err = cl_program_get_kernel_names(program, &kernels_num, 0, NULL, NULL);
+    if (err != CL_SUCCESS)
+      return err;
+
     src_ptr = &kernels_num;
     src_size = sizeof(cl_uint);
   } else if (param_name == CL_PROGRAM_SOURCE) {
@@ -64,52 +71,54 @@ clGetProgramInfo(cl_program program,
       src_size = strlen(program->source) + 1;
     }
   } else if (param_name == CL_PROGRAM_KERNEL_NAMES) {
-    // TODO: need to refine this.
-    cl_program_get_kernel_names(program, param_value_size, (char *)param_value, param_value_size_ret);
-    return CL_SUCCESS;
+    return cl_program_get_kernel_names(program, NULL, param_value_size,
+                                       (char *)param_value, param_value_size_ret);
   } else if (param_name == CL_PROGRAM_BINARY_SIZES) {
-    if (program->binary == NULL) {
-      if (program->binary_type == CL_PROGRAM_BINARY_TYPE_EXECUTABLE) {
-        program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 0);
-      } else if (program->binary_type == CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT) {
-        program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 1);
-      } else if (program->binary_type == CL_PROGRAM_BINARY_TYPE_LIBRARY) {
-        program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 2);
-      } else {
-        return CL_INVALID_BINARY;
-      }
-    }
+    cl_program_for_device pd = NULL;
+    size_t *bin_sz = param_value;
+
+    if (param_value && param_value_size < program->ctx->device_num * sizeof(size_t))
+      return CL_INVALID_VALUE;
 
-    if (program->binary == NULL || program->binary_sz == 0) {
-      return CL_OUT_OF_RESOURCES;
-    }
-    src_ptr = &program->binary_sz;
-    src_size = sizeof(size_t);
-  } else if (param_name == CL_PROGRAM_BINARIES) {
     if (param_value_size_ret)
-      *param_value_size_ret = sizeof(void *);
-    if (!param_value)
-      return CL_SUCCESS;
-
-    /* param_value points to an array of n
-       pointers allocated by the caller */
-    if (program->binary == NULL) {
-      if (program->binary_type == CL_PROGRAM_BINARY_TYPE_EXECUTABLE) {
-        program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 0);
-      } else if (program->binary_type == CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT) {
-        program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 1);
-      } else if (program->binary_type == CL_PROGRAM_BINARY_TYPE_LIBRARY) {
-        program->binary_sz = compiler_program_serialize_to_binary(program->opaque, &program->binary, 2);
-      } else {
-        return CL_INVALID_BINARY;
+      *param_value_size_ret = program->ctx->device_num * sizeof(size_t);
+
+    if (param_value) {
+      for (i = 0; i < program->ctx->device_num; i++) {
+        DEV_PRIVATE_DATA(program, program->ctx->devices[i], pd);
+        if (pd->binary == NULL || pd->binary_type == CL_PROGRAM_BINARY_TYPE_NONE) {
+          bin_sz[i] = 0;
+        } else {
+          bin_sz[i] = pd->binary_sz;
+        }
       }
     }
+    return CL_SUCCESS;
+  } else if (param_name == CL_PROGRAM_BINARIES) {
+    cl_program_for_device pd = NULL;
+    char **bin_ptr = param_value;
 
-    if (program->binary == NULL || program->binary_sz == 0) {
-      return CL_OUT_OF_RESOURCES;
-    }
+    if (param_value && param_value_size < program->ctx->device_num * sizeof(char *))
+      return CL_INVALID_VALUE;
+
+    if (param_value_size_ret)
+      *param_value_size_ret = program->ctx->device_num * sizeof(char *);
 
-    memcpy(*((void **)param_value), program->binary, program->binary_sz);
+    bin_ptr = param_value;
+    if (param_value) {
+      for (i = 0; i < program->ctx->device_num; i++) {
+        if (bin_ptr[i] == NULL)
+          continue;
+
+        DEV_PRIVATE_DATA(program, program->ctx->devices[i], pd);
+
+        if (pd->binary == NULL || pd->binary_type == CL_PROGRAM_BINARY_TYPE_NONE) {
+          bin_ptr[i][0] = 0;
+        } else {
+          memcpy(bin_ptr[i], pd->binary, pd->binary_sz);
+        }
+      }
+    }
     return CL_SUCCESS;
   } else {
     return CL_INVALID_VALUE;
@@ -120,6 +129,28 @@ clGetProgramInfo(cl_program program,
 }
 
 cl_int
+clRetainProgram(cl_program program)
+{
+  if (!CL_OBJECT_IS_PROGRAM(program)) {
+    return CL_INVALID_PROGRAM;
+  }
+
+  cl_program_add_ref(program);
+  return CL_SUCCESS;
+}
+
+/* Release entry point: passes a valid program to cl_program_delete(). */
+cl_int
+clReleaseProgram(cl_program program)
+{
+  if (!CL_OBJECT_IS_PROGRAM(program))
+    return CL_INVALID_PROGRAM;
+
+  cl_program_delete(program);
+  return CL_SUCCESS;
+}
+
+cl_int
 clGetProgramBuildInfo(cl_program program,
                       cl_device_id device,
                       cl_program_build_info param_name,
@@ -130,38 +161,55 @@ clGetProgramBuildInfo(cl_program program,
   const void *src_ptr = NULL;
   size_t src_size = 0;
   const char *ret_str = "";
-  size_t global_size;
+  cl_int err = CL_SUCCESS;
+  size_t result = 0; /* value of size-typed queries answered by the device */
+  cl_program_for_device pd; /* set by DEV_PRIVATE_DATA() below */
 
   if (!CL_OBJECT_IS_PROGRAM(program)) {
     return CL_INVALID_PROGRAM;
   }
 
-  cl_int err = cl_devices_list_include_check(program->ctx->device_num,
-                                             program->ctx->devices, 1, &device);
+  if (device == NULL)
+    return CL_INVALID_DEVICE;
+
+  err = cl_devices_list_check(1, (const cl_device_id *)&device);
   if (err != CL_SUCCESS)
     return err;
 
+  err = cl_devices_list_include_check(program->ctx->device_num, program->ctx->devices,
+                                      1, (const cl_device_id *)&device); /* result must be kept */
+  if (err != CL_SUCCESS)
+    return err;
+
+  DEV_PRIVATE_DATA(program, device, pd);
+
   if (param_name == CL_PROGRAM_BUILD_STATUS) {
     src_ptr = &program->build_status;
     src_size = sizeof(cl_build_status);
   } else if (param_name == CL_PROGRAM_BUILD_OPTIONS) {
-    if (program->is_built && program->build_opts) {
+    if (program->build_status != CL_BUILD_NONE && program->build_opts) {
       ret_str = program->build_opts;
     }
     src_ptr = ret_str;
     src_size = strlen(ret_str) + 1;
   } else if (param_name == CL_PROGRAM_BUILD_LOG) {
-    src_ptr = program->build_log;
-    src_size = program->build_log_sz + 1;
+    src_ptr = pd->build_log;
+    src_size = pd->build_log_sz + 1;
   } else if (param_name == CL_PROGRAM_BINARY_TYPE) {
-    src_ptr = &program->binary_type;
+    src_ptr = &pd->binary_type;
     src_size = sizeof(cl_uint);
   } else if (param_name == CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE) {
-    global_size = 0;
-    if (program->is_built)
-      global_size = cl_program_get_global_variable_size(program);
-    src_ptr = &global_size;
-    src_size = sizeof(global_size);
+    if (program->build_status != CL_BUILD_NONE) {
+      err = device->api.get_program_info(device, program, param_name, &result);
+    } else {
+      result = 0;
+    }
+
+    if (err != CL_SUCCESS)
+      return err; /* report the query's error code, not the size value */
+
+    src_ptr = &result;
+    src_size = sizeof(result);
   } else {
     return CL_INVALID_VALUE;
   }
@@ -169,3 +217,416 @@ clGetProgramBuildInfo(cl_program program,
   return cl_get_info_helper(src_ptr, src_size,
                             param_value, param_value_size, param_value_size_ret);
 }
+
+/* Create a program object in `context` from `count` source strings.
+ * A NULL `lengths` array, or a zero entry in it, means the matching string
+ * is NUL-terminated. The strings are concatenated into program->source.
+ * Returns NULL and reports the error through `errcode_ret` on failure. */
+cl_program
+clCreateProgramWithSource(cl_context context,
+                          cl_uint count,
+                          const char **strings,
+                          const size_t *lengths,
+                          cl_int *errcode_ret)
+{
+  cl_program program = NULL;
+  cl_int err = CL_SUCCESS;
+  cl_uint i;
+  size_t *lens = NULL; /* size_t, so long sources are not truncated */
+  size_t len_total = 0;
+  char *p = NULL;
+
+  do {
+    if (!CL_OBJECT_IS_CONTEXT(context)) {
+      err = CL_INVALID_CONTEXT;
+      break;
+    }
+
+    if (count == 0 || strings == NULL) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+
+    for (i = 0; i < count; i++) {
+      if (strings[i] == NULL) {
+        err = CL_INVALID_VALUE;
+        break;
+      }
+    }
+    if (err != CL_SUCCESS)
+      break;
+
+    program = cl_program_new(context);
+    if (program == NULL) {
+      err = CL_OUT_OF_HOST_MEMORY;
+      break;
+    }
+
+    lens = CL_CALLOC(count, sizeof(size_t));
+    if (lens == NULL) {
+      err = CL_OUT_OF_HOST_MEMORY;
+      break;
+    }
+
+    /* Resolve every string length once, then size the merged buffer. */
+    for (i = 0; i < count; ++i) {
+      if (lengths == NULL || lengths[i] == 0)
+        lens[i] = strlen(strings[i]);
+      else
+        lens[i] = lengths[i];
+      len_total += lens[i];
+    }
+
+    program->source = CL_CALLOC(len_total + 1, sizeof(char));
+    if (program->source == NULL) {
+      err = CL_OUT_OF_HOST_MEMORY;
+      break;
+    }
+
+    p = program->source;
+    for (i = 0; i < count; ++i) {
+      memcpy(p, strings[i], lens[i]);
+      p += lens[i];
+    }
+    *p = '\0';
+
+    program->source_sz = len_total + 1;
+  } while (0);
+
+  if (err != CL_SUCCESS && program) {
+    /* Never hand a deleted program back to the caller. */
+    cl_program_delete(program);
+    program = NULL;
+  }
+
+  CL_FREE(lens);
+
+  if (errcode_ret)
+    *errcode_ret = err;
+  return program;
+}
+
+/* Create a program in `context` from one pre-built binary per listed device.
+ *
+ * devices/num_devices  devices the binaries target; checked with
+ *                      cl_devices_list_check() and against the context list.
+ * lengths/binaries     per-device binary sizes and data; a NULL array, NULL
+ *                      binary or zero length yields CL_INVALID_VALUE.
+ * binary_status        optional per-device result array.
+ * errcode_ret          optional overall result.
+ *
+ * The program is built by cl_program_create_from_binary(). */
+cl_program
+clCreateProgramWithBinary(cl_context context,
+                          cl_uint num_devices,
+                          const cl_device_id *devices,
+                          const size_t *lengths,
+                          const unsigned char **binaries,
+                          cl_int *binary_status,
+                          cl_int *errcode_ret)
+{
+  cl_program program = NULL;
+  cl_int err = CL_SUCCESS;
+  cl_uint i; /* unsigned, like the num_devices bound it is compared with */
+
+  do {
+    if (!CL_OBJECT_IS_CONTEXT(context)) {
+      err = CL_INVALID_CONTEXT;
+      break;
+    }
+
+    if (devices == NULL || num_devices == 0) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+
+    err = cl_devices_list_check(num_devices, devices);
+    if (err != CL_SUCCESS)
+      break;
+
+    err = cl_devices_list_include_check(context->device_num, context->devices, num_devices, devices);
+    if (err != CL_SUCCESS)
+      break;
+
+    /* Flag each device whose binary is missing or empty. */
+    for (i = 0; i < num_devices; i++) {
+      if (binaries == NULL || lengths == NULL || binaries[i] == NULL || lengths[i] == 0) {
+        err = CL_INVALID_VALUE;
+        if (binary_status)
+          binary_status[i] = CL_INVALID_VALUE;
+      } else if (binary_status) {
+        binary_status[i] = CL_SUCCESS;
+      }
+    }
+    if (err != CL_SUCCESS)
+      break;
+
+    program = cl_program_create_from_binary(context, num_devices, devices, lengths,
+                                            binaries, binary_status, &err);
+  } while (0);
+
+  if (errcode_ret)
+    *errcode_ret = err;
+  return program;
+}
+
+/* Build `program` with the compiler `options` for the devices in
+ * `device_list`; argument validation happens here and the build itself is
+ * delegated to cl_program_build().
+ * `pfn_notify`, when set, is invoked before returning, whatever the outcome.
+ * Returns the validation or build error code. */
+cl_int
+clBuildProgram(cl_program program,
+               cl_uint num_devices,
+               const cl_device_id *device_list,
+               const char *options,
+               void(CL_CALLBACK *pfn_notify)(cl_program, void *),
+               void *user_data)
+{
+  cl_int err = CL_SUCCESS;
+
+  do {
+    if (!CL_OBJECT_IS_PROGRAM(program)) {
+      err = CL_INVALID_PROGRAM;
+      break;
+    }
+
+    if ((num_devices == 0 && device_list != NULL) ||
+        (num_devices != 0 && device_list == NULL)) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+
+    if (pfn_notify == NULL && user_data != NULL) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+
+    /* Listed devices are checked once, then matched against the context. */
+    if (device_list) {
+      assert(program->ctx);
+      err = cl_devices_list_check(num_devices, device_list);
+      if (err != CL_SUCCESS)
+        break;
+
+      err = cl_devices_list_include_check(program->ctx->device_num,
+                                          program->ctx->devices, num_devices, device_list);
+      if (err != CL_SUCCESS)
+        break;
+    }
+
+    err = cl_program_build(program, options, num_devices, device_list);
+  } while (0);
+
+  if (pfn_notify)
+    pfn_notify(program, user_data);
+
+  return err;
+}
+
+/* Compile entry point: validates the arguments and delegates the actual
+ * work to cl_program_compile().
+ *
+ * program                      program object to compile.
+ * num_devices/device_list      optional target devices; when given they are
+ *                              checked against the program's context.
+ * options                      option string forwarded unchanged.
+ * num_input_headers/input_headers/header_include_names
+ *                              header programs and names, forwarded unchanged.
+ * pfn_notify/user_data         optional callback, invoked before returning
+ *                              whatever the outcome.  Returns the error code. */
+cl_int
+clCompileProgram(cl_program program,
+                 cl_uint num_devices,
+                 const cl_device_id *device_list,
+                 const char *options,
+                 cl_uint num_input_headers,
+                 const cl_program *input_headers,
+                 const char **header_include_names,
+                 void(CL_CALLBACK *pfn_notify)(cl_program, void *),
+                 void *user_data)
+{
+  cl_int err = CL_SUCCESS;
+
+  do {
+    if (!CL_OBJECT_IS_PROGRAM(program)) {
+      err = CL_INVALID_PROGRAM;
+      break;
+    }
+
+    /* A list pointer and its element count must be both set or both unset. */
+    if ((num_devices == 0) != (device_list == NULL)) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+
+    /* User data without a callback to receive it is rejected. */
+    if (pfn_notify == 0 && user_data != NULL) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+
+    /* Same pairing rule for the header array and its count. */
+    if ((num_input_headers == 0) != (input_headers == NULL)) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+
+    /* The second device check only runs when the first one passed. */
+    if (device_list) {
+      assert(program->ctx);
+      err = cl_devices_list_check(num_devices, device_list);
+      if (err == CL_SUCCESS)
+        err = cl_devices_list_include_check(program->ctx->device_num,
+                                            program->ctx->devices, num_devices, device_list);
+      if (err != CL_SUCCESS)
+        break;
+    }
+
+    err = cl_program_compile(program, num_input_headers, input_headers,
+                             header_include_names, options, num_devices, device_list);
+  } while (0);
+
+  if (pfn_notify)
+    pfn_notify(program, user_data);
+  return err;
+}
+
+/* Link entry point: validates the arguments and delegates the actual work
+ * to cl_program_link(), whose result is returned.
+ *
+ * context                      context handed to cl_program_link().
+ * num_devices/device_list      optional target devices; when given they are
+ *                              checked against the devices of `context`.
+ * options                      option string forwarded unchanged.
+ * num_input_programs/input_programs
+ *                              programs to link; at least one is required.
+ * pfn_notify/user_data         optional callback, invoked before returning.
+ * errcode_ret                  optional location for the error code. */
+cl_program
+clLinkProgram(cl_context context,
+              cl_uint num_devices,
+              const cl_device_id *device_list,
+              const char *options,
+              cl_uint num_input_programs,
+              const cl_program *input_programs,
+              void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
+              void *user_data,
+              cl_int *errcode_ret)
+{
+  cl_program program = NULL;
+  cl_int err = CL_SUCCESS;
+  cl_uint i = 0;
+
+  do {
+    if (!CL_OBJECT_IS_CONTEXT(context)) {
+      err = CL_INVALID_CONTEXT;
+      break;
+    }
+
+    /* User data without a callback to receive it is rejected. */
+    if (pfn_notify == 0 && user_data != NULL) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+
+    /* An empty or missing input program list is always rejected. */
+    if (num_input_programs == 0 || input_programs == NULL) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+
+    /* The device list and its count must be both set or both unset. */
+    if ((num_devices == 0) != (device_list == NULL)) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+
+    /* The second device check only runs when the first one passed. */
+    if (device_list) {
+      err = cl_devices_list_check(num_devices, device_list);
+      if (err == CL_SUCCESS)
+        err = cl_devices_list_include_check(context->device_num,
+                                            context->devices, num_devices, device_list);
+      if (err != CL_SUCCESS)
+        break;
+    }
+
+    /* Stop at the first input that is not a program object. */
+    for (i = 0; err == CL_SUCCESS && i < num_input_programs; i++) {
+      if (!CL_OBJECT_IS_PROGRAM(input_programs[i]))
+        err = CL_INVALID_PROGRAM;
+    }
+    if (err != CL_SUCCESS)
+      break;
+
+    program = cl_program_link(context, num_devices, device_list,
+                              num_input_programs, input_programs, options, &err);
+
+  } while (0);
+
+  if (pfn_notify)
+    pfn_notify(program, user_data);
+
+  if (errcode_ret)
+    *errcode_ret = err;
+  return program;
+}
+
+/* Entry point for creating the kernels of `program`. The program handle and
+ * the output arguments are checked here (at least one of `kernels` and
+ * `num_kernels_ret` must be given); the work itself is delegated to
+ * cl_program_create_kernels_in_program(). */
+cl_int
+clCreateKernelsInProgram(cl_program program,
+                         cl_uint num_kernels,
+                         cl_kernel *kernels,
+                         cl_uint *num_kernels_ret)
+{
+  if (!CL_OBJECT_IS_PROGRAM(program))
+    return CL_INVALID_PROGRAM;
+
+  if (kernels == NULL && num_kernels_ret == NULL)
+    return CL_INVALID_VALUE;
+
+  return cl_program_create_kernels_in_program(program, num_kernels, kernels, num_kernels_ret);
+}
+
+/* Validate the arguments (device list and kernel names are mandatory), then
+ * let cl_program_create_with_built_in_kernles() create the program. */
+cl_program
+clCreateProgramWithBuiltInKernels(cl_context context,
+                                  cl_uint num_devices,
+                                  const cl_device_id *device_list,
+                                  const char *kernel_names,
+                                  cl_int *errcode_ret)
+{
+  cl_program program = NULL;
+  cl_int err = CL_SUCCESS;
+
+  do {
+    if (!CL_OBJECT_IS_CONTEXT(context)) {
+      err = CL_INVALID_CONTEXT;
+      break;
+    }
+
+    if (device_list == NULL || num_devices == 0 || kernel_names == NULL || kernel_names[0] == 0) {
+      err = CL_INVALID_VALUE;
+      break;
+    }
+
+    err = cl_devices_list_check(num_devices, device_list);
+    if (err == CL_SUCCESS)
+      err = cl_devices_list_include_check(context->device_num,
+                                          context->devices, num_devices, device_list);
+    if (err != CL_SUCCESS)
+      break;
+
+    program = cl_program_create_with_built_in_kernles(context, num_devices, device_list,
+                                                      kernel_names, &err);
+  } while (0);
+
+  if (errcode_ret)
+    *errcode_ret = err;
+  return program;
+}
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 1b21375..2386723 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -58,13 +58,13 @@ cl_command_queue_new(cl_context ctx)
 }
 
 LOCAL cl_command_queue
-cl_create_command_queue(cl_context ctx, cl_device_id device, cl_command_queue_properties properties,
+cl_command_queue_create(cl_context ctx, cl_device_id device, cl_command_queue_properties properties,
                         cl_uint queue_size, cl_int *errcode_ret)
 {
   cl_command_queue queue = cl_command_queue_new(ctx);
   if (queue == NULL) {
     *errcode_ret = CL_OUT_OF_HOST_MEMORY;
     return NULL;
   }
 
   queue->props = properties;
@@ -82,14 +81,9 @@ cl_command_queue_delete(cl_command_queue queue)
   if (CL_OBJECT_DEC_REF(queue) > 1)
     return;
 
-  /* Before we destroy the queue, we should make sure all
-     the commands in the queue are finished. */
-  cl_command_queue_wait_finish(queue);
-  cl_context_remove_queue(queue->ctx, queue);
-
   cl_command_queue_destroy_enqueue(queue);
 
-  cl_mem_delete(queue->perf);
+  cl_context_remove_queue(queue->ctx, queue);
   if (queue->barrier_events) {
     CL_FREE(queue->barrier_events);
   }
@@ -103,128 +97,6 @@ cl_command_queue_add_ref(cl_command_queue queue)
   CL_OBJECT_INC_REF(queue);
 }
 
-static void
-set_image_info(char *curbe,
-               struct ImageInfo * image_info,
-               struct _cl_mem_image *image)
-{
-  if (image_info->wSlot >= 0)
-    *(uint32_t*)(curbe + image_info->wSlot) = image->w;
-  if (image_info->hSlot >= 0)
-    *(uint32_t*)(curbe + image_info->hSlot) = image->h;
-  if (image_info->depthSlot >= 0)
-    *(uint32_t*)(curbe + image_info->depthSlot) = image->depth;
-  if (image_info->channelOrderSlot >= 0)
-    *(uint32_t*)(curbe + image_info->channelOrderSlot) = image->fmt.image_channel_order;
-  if (image_info->dataTypeSlot >= 0)
-    *(uint32_t*)(curbe + image_info->dataTypeSlot) = image->fmt.image_channel_data_type;
-}
-
-LOCAL cl_int
-cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k, cl_gpgpu gpgpu, uint32_t *max_bti)
-{
-  uint32_t i;
-
-  for (i = 0; i < k->image_sz; i++) {
-    int id = k->images[i].arg_idx;
-    struct _cl_mem_image *image;
-    assert(interp_kernel_get_arg_type(k->opaque, id) == GBE_ARG_IMAGE);
-
-    image = cl_mem_image(k->args[id].mem);
-    set_image_info(k->curbe, &k->images[i], image);
-    if(*max_bti < k->images[i].idx)
-      *max_bti = k->images[i].idx;
-    if(k->vme){
-      if( (image->fmt.image_channel_order != CL_R) || (image->fmt.image_channel_data_type != CL_UNORM_INT8) )
-        return CL_IMAGE_FORMAT_NOT_SUPPORTED;
-      cl_gpgpu_bind_image_for_vme(gpgpu, k->images[i].idx, image->base.bo, image->offset + k->args[id].mem->offset,
-                          image->intel_fmt, image->image_type, image->bpp,
-                          image->w, image->h, image->depth,
-                          image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
-    }
-    else
-      cl_gpgpu_bind_image(gpgpu, k->images[i].idx, image->base.bo, image->offset + k->args[id].mem->offset,
-                          image->intel_fmt, image->image_type, image->bpp,
-                          image->w, image->h, image->depth,
-                          image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
-    // TODO, this workaround is for GEN7/GEN75 only, we may need to do it in the driver layer
-    // on demand.
-    if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
-      cl_gpgpu_bind_image(gpgpu, k->images[i].idx + BTI_WORKAROUND_IMAGE_OFFSET, image->base.bo, image->offset + k->args[id].mem->offset,
-                          image->intel_fmt, image->image_type, image->bpp,
-                          image->w, image->h, image->depth,
-                          image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
-  }
-  return CL_SUCCESS;
-}
-
-LOCAL cl_int
-cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k, cl_gpgpu gpgpu, uint32_t *max_bti)
-{
-  /* Bind all user buffers (given by clSetKernelArg) */
-  uint32_t i, bti;
-  uint32_t ocl_version = interp_kernel_get_ocl_version(k->opaque);
-  enum gbe_arg_type arg_type; /* kind of argument */
-  for (i = 0; i < k->arg_n; ++i) {
-    int32_t offset; // location of the address in the curbe
-    arg_type = interp_kernel_get_arg_type(k->opaque, i);
-    if (!(arg_type == GBE_ARG_GLOBAL_PTR ||
-          (arg_type == GBE_ARG_CONSTANT_PTR && ocl_version >= 200) ||
-          arg_type == GBE_ARG_PIPE) ||
-        !k->args[i].mem)
-      continue;
-    offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, i);
-    if (offset < 0)
-      continue;
-    bti = interp_kernel_get_arg_bti(k->opaque, i);
-    if(*max_bti < bti)
-      *max_bti = bti;
-    if (k->args[i].mem->type == CL_MEM_SUBBUFFER_TYPE) {
-      struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)k->args[i].mem;
-      cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, k->args[i].mem->offset + buffer->sub_offset, k->args[i].mem->size, bti);
-    } else {
-      size_t mem_offset = 0; //
-      if(k->args[i].is_svm) {
-        mem_offset = (size_t)k->args[i].ptr - (size_t)k->args[i].mem->host_ptr;
-      }
-      cl_gpgpu_bind_buf(gpgpu, k->args[i].mem->bo, offset, k->args[i].mem->offset + mem_offset, k->args[i].mem->size, bti);
-    }
-  }
-  return CL_SUCCESS;
-}
-
-LOCAL cl_int
-cl_command_queue_bind_exec_info(cl_command_queue queue, cl_kernel k, cl_gpgpu gpgpu, uint32_t *max_bti)
-{
-  uint32_t i;
-  size_t mem_offset, bti = *max_bti;
-  cl_mem mem;
-  int32_t offset = interp_kernel_get_curbe_size(k->opaque);
-
-  for (i = 0; i < k->exec_info_n; i++) {
-    void *ptr = k->exec_info[i];
-    mem = cl_context_get_svm_from_ptr(k->program->ctx, ptr);
-    if(mem == NULL)
-      mem = cl_context_get_mem_from_ptr(k->program->ctx, ptr);
-
-    if (mem) {
-      mem_offset = (size_t)ptr - (size_t)mem->host_ptr;
-      /* only need realloc in surface state, don't need realloc in curbe */
-      cl_gpgpu_bind_buf(gpgpu, mem->bo, offset + i * sizeof(ptr), mem->offset + mem_offset, mem->size, bti++);
-      if(bti == BTI_WORKAROUND_IMAGE_OFFSET)
-        bti = *max_bti + BTI_WORKAROUND_IMAGE_OFFSET;
-      assert(bti < BTI_MAX_ID);
-    }
-  }
-  *max_bti = bti;
-
-  return CL_SUCCESS;
-}
-
-extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, cl_event, 
-                                             uint32_t, const size_t *, const size_t *,const size_t *,
-                                             const size_t *, const size_t *, const size_t *);
-
 static cl_int
 cl_kernel_check_args(cl_kernel k)
 {
@@ -241,64 +113,21 @@ cl_command_queue_ND_range(cl_command_queue queue,
                           cl_event event,
                           const uint32_t work_dim,
                           const size_t *global_wk_off,
-                          const size_t *global_dim_off,
                           const size_t *global_wk_sz,
-                          const size_t *global_wk_sz_use,
-                          const size_t *local_wk_sz,
-                          const size_t *local_wk_sz_use)
+                          const size_t *local_wk_sz)
 {
-  if(b_output_kernel_perf)
-    time_start(queue->ctx, cl_kernel_get_name(k), queue);
-  const int32_t ver = cl_driver_get_ver(queue->ctx->drv);
   cl_int err = CL_SUCCESS;
 
   /* Check that the user did not forget any argument */
-  TRY (cl_kernel_check_args, k);
-
-
-  if (ver == 7 || ver == 75 || ver == 8 || ver == 9)
-    //TRY (cl_command_queue_ND_range_gen7, queue, k, work_dim, global_wk_off, global_wk_sz, local_wk_sz);
-    TRY (cl_command_queue_ND_range_gen7, queue, k, event, work_dim,
-                                global_wk_off, global_dim_off, global_wk_sz,
-                                global_wk_sz_use, local_wk_sz, local_wk_sz_use);
-
-  else
-    FATAL ("Unknown Gen Device");
+  err = cl_kernel_check_args(k);
+  if (err != CL_SUCCESS)
+    return err;
 
-error:
+  err = queue->device->api.ND_range_kernel(queue, k, event, work_dim,
+                                                   global_wk_off, global_wk_sz, local_wk_sz);
   return err;
 }
 
-LOCAL int
-cl_command_queue_flush_gpgpu(cl_gpgpu gpgpu)
-{
-  void* printf_info = cl_gpgpu_get_printf_info(gpgpu);
-  void* profiling_info;
-
-  if (cl_gpgpu_flush(gpgpu) < 0)
-    return CL_OUT_OF_RESOURCES;
-
-  if (printf_info && interp_get_printf_num(printf_info)) {
-    void *addr = cl_gpgpu_map_printf_buffer(gpgpu);
-    interp_output_printf(printf_info, addr);
-    cl_gpgpu_unmap_printf_buffer(gpgpu);
-  }
-
-  if (printf_info) {
-    interp_release_printf_info(printf_info);
-    cl_gpgpu_set_printf_info(gpgpu, NULL);
-  }
-
-  /* If have profiling info, output it. */
-  profiling_info = cl_gpgpu_get_profiling_info(gpgpu);
-  if (profiling_info) {
-    interp_output_profiling(profiling_info, cl_gpgpu_map_profiling_buffer(gpgpu));
-    cl_gpgpu_unmap_profiling_buffer(gpgpu);
-  }
-
-  return CL_SUCCESS;
-}
-
 LOCAL void
 cl_command_queue_insert_barrier_event(cl_command_queue queue, cl_event event)
 {
@@ -315,11 +144,11 @@ cl_command_queue_insert_barrier_event(cl_command_queue queue, cl_event event)
     assert(queue->barrier_events);
   }
 
-  for (i = 0; i<queue->barrier_events_num; i++) {
+  for (i = 0; i < queue->barrier_events_num; i++) {
     assert(queue->barrier_events[i] != event);
   }
 
-  if(queue->barrier_events_num < queue->barrier_events_size) {
+  if (queue->barrier_events_num < queue->barrier_events_size) {
     queue->barrier_events[queue->barrier_events_num++] = event;
     CL_OBJECT_UNLOCK(queue);
     return;
@@ -347,21 +176,21 @@ cl_command_queue_remove_barrier_event(cl_command_queue queue, cl_event event)
   assert(queue->barrier_events_num > 0);
   assert(queue->barrier_events);
 
-  for(i = 0; i < queue->barrier_events_num; i++) {
-    if(queue->barrier_events[i] == event)
+  for (i = 0; i < queue->barrier_events_num; i++) {
+    if (queue->barrier_events[i] == event)
       break;
   }
   assert(i < queue->barrier_events_num); // Must find it.
 
-  if(i == queue->barrier_events_num - 1) { // The last one.
+  if (i == queue->barrier_events_num - 1) { // The last one.
     queue->barrier_events[i] = NULL;
   } else {
-    for(; i < queue->barrier_events_num - 1; i++) { // Move forward.
-      queue->barrier_events[i] = queue->barrier_events[i+1];
+    for (; i < queue->barrier_events_num - 1; i++) { // Move forward.
+      queue->barrier_events[i] = queue->barrier_events[i + 1];
     }
   }
   queue->barrier_events_num -= 1;
   CL_OBJECT_UNLOCK(queue);
-  
+
   cl_event_delete(event);
 }
diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
index 9f6ff39..17d55b8 100644
--- a/src/cl_command_queue.h
+++ b/src/cl_command_queue.h
@@ -60,7 +60,7 @@ typedef struct _cl_command_queue {
 
 /* Allocate and initialize a new command queue. Also insert it in the list of
  * command queue in the associated context */
-extern cl_command_queue cl_create_command_queue(cl_context, cl_device_id,
+extern cl_command_queue cl_command_queue_create(cl_context, cl_device_id,
                                                 cl_command_queue_properties, cl_uint, cl_int*);
 /* Destroy and deallocate the command queue */
 extern void cl_command_queue_delete(cl_command_queue);
@@ -71,24 +71,15 @@ extern cl_int cl_command_queue_ND_range(cl_command_queue queue,
                                         cl_kernel ker,
                                         cl_event event,
                                         const uint32_t work_dim,
-                                        const size_t *global_wk_off,
-                                        const size_t *global_dim_off,
-                                        const size_t *global_wk_sz,
-                                        const size_t *global_wk_sz_use,
-                                        const size_t *local_wk_sz,
-                                        const size_t *local_wk_sz_use);
-
+                                        const size_t *global_work_offset,
+                                        const size_t *global_work_size,
+                                        const size_t *local_work_size);
 /* The memory object where to report the performance */
 extern cl_int cl_command_queue_set_report_buffer(cl_command_queue, cl_mem);
-/* Flush for the specified gpgpu */
-extern int cl_command_queue_flush_gpgpu(cl_gpgpu);
 /* Bind all the surfaces in the GPGPU state */
 extern cl_int cl_command_queue_bind_surface(cl_command_queue, cl_kernel, cl_gpgpu, uint32_t *);
 /* Bind all the image surfaces in the GPGPU state */
 extern cl_int cl_command_queue_bind_image(cl_command_queue, cl_kernel, cl_gpgpu, uint32_t *);
-/* Bind all exec info to bind table */
-extern cl_int cl_command_queue_bind_exec_info(cl_command_queue, cl_kernel, cl_gpgpu, uint32_t *);
-
 /* Insert a user event to command's wait_events */
 extern void cl_command_queue_insert_event(cl_command_queue, cl_event);
 /* Remove a user event from command's wait_events */
@@ -104,5 +95,9 @@ extern cl_int cl_command_queue_wait_flush(cl_command_queue queue);
 /* Note: Must call this function with queue's lock. */
 extern cl_event *cl_command_queue_record_in_queue_events(cl_command_queue queue, cl_uint *list_num);
 
+/* Flush for the specified gpgpu */
+extern cl_int cl_command_queue_flush_gpgpu(void *gpu);
+extern cl_int cl_command_queue_finish_gpgpu(void *gpu);
+extern void cl_command_queue_delete_gpgpu(void *gpgpu);
 #endif /* __CL_COMMAND_QUEUE_H__ */
 
diff --git a/src/cl_context.c b/src/cl_context.c
index c54760f..13952d7 100644
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -14,7 +14,6 @@
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library. If not, see <http://www.gnu.org/licenses/>.
  *
- * Author: Benjamin Segovia <benjamin.segovia at intel.com>
  */
 
 #include "cl_platform_id.h"
@@ -41,7 +40,8 @@
 #include <string.h>
 
 LOCAL void
-cl_context_add_queue(cl_context ctx, cl_command_queue queue) {
+cl_context_add_queue(cl_context ctx, cl_command_queue queue)
+{
   assert(queue->ctx == NULL);
   cl_context_add_ref(ctx);
 
@@ -57,7 +57,8 @@ cl_context_add_queue(cl_context ctx, cl_command_queue queue) {
 }
 
 LOCAL void
-cl_context_remove_queue(cl_context ctx, cl_command_queue queue) {
+cl_context_remove_queue(cl_context ctx, cl_command_queue queue)
+{
   assert(queue->ctx == ctx);
 
   CL_OBJECT_LOCK(ctx);
@@ -73,7 +74,8 @@ cl_context_remove_queue(cl_context ctx, cl_command_queue queue) {
 }
 
 LOCAL void
-cl_context_add_mem(cl_context ctx, cl_mem mem) {
+cl_context_add_mem(cl_context ctx, cl_mem mem)
+{
   assert(mem->ctx == NULL);
   cl_context_add_ref(ctx);
 
@@ -86,7 +88,8 @@ cl_context_add_mem(cl_context ctx, cl_mem mem) {
 }
 
 LOCAL void
-cl_context_remove_mem(cl_context ctx, cl_mem mem) {
+cl_context_remove_mem(cl_context ctx, cl_mem mem)
+{
   assert(mem->ctx == ctx);
   CL_OBJECT_LOCK(ctx);
   list_node_del(&mem->base.node);
@@ -98,7 +101,8 @@ cl_context_remove_mem(cl_context ctx, cl_mem mem) {
 }
 
 LOCAL void
-cl_context_add_sampler(cl_context ctx, cl_sampler sampler) {
+cl_context_add_sampler(cl_context ctx, cl_sampler sampler)
+{
   assert(sampler->ctx == NULL);
   cl_context_add_ref(ctx);
 
@@ -111,7 +115,8 @@ cl_context_add_sampler(cl_context ctx, cl_sampler sampler) {
 }
 
 LOCAL void
-cl_context_remove_sampler(cl_context ctx, cl_sampler sampler) {
+cl_context_remove_sampler(cl_context ctx, cl_sampler sampler)
+{
   assert(sampler->ctx == ctx);
   CL_OBJECT_LOCK(ctx);
   list_node_del(&sampler->base.node);
@@ -123,7 +128,8 @@ cl_context_remove_sampler(cl_context ctx, cl_sampler sampler) {
 }
 
 LOCAL void
-cl_context_add_event(cl_context ctx, cl_event event) {
+cl_context_add_event(cl_context ctx, cl_event event)
+{
   assert(event->ctx == NULL);
   cl_context_add_ref(ctx);
 
@@ -136,7 +142,8 @@ cl_context_add_event(cl_context ctx, cl_event event) {
 }
 
 LOCAL void
-cl_context_remove_event(cl_context ctx, cl_event event) {
+cl_context_remove_event(cl_context ctx, cl_event event)
+{
   assert(event->ctx == ctx);
   CL_OBJECT_LOCK(ctx);
   list_node_del(&event->base.node);
@@ -148,7 +155,8 @@ cl_context_remove_event(cl_context ctx, cl_event event) {
 }
 
 LOCAL void
-cl_context_add_program(cl_context ctx, cl_program program) {
+cl_context_add_program(cl_context ctx, cl_program program)
+{
   assert(program->ctx == NULL);
   cl_context_add_ref(ctx);
 
@@ -161,7 +169,8 @@ cl_context_add_program(cl_context ctx, cl_program program) {
 }
 
 LOCAL void
-cl_context_remove_program(cl_context ctx, cl_program program) {
+cl_context_remove_program(cl_context ctx, cl_program program)
+{
   assert(program->ctx == ctx);
   CL_OBJECT_LOCK(ctx);
   list_node_del(&program->base.node);
@@ -172,17 +181,16 @@ cl_context_remove_program(cl_context ctx, cl_program program) {
   program->ctx = NULL;
 }
 
-
-#define CHECK(var) \
-  if (var) \
-    return CL_INVALID_PROPERTY; \
-  else \
-    var = 1;
-
 static cl_int
 cl_context_properties_process(const cl_context_properties *prop,
-                              struct _cl_context_prop *cl_props, cl_uint * prop_len)
+                              struct _cl_context_prop *cl_props, cl_uint *prop_len)
 {
+#define CHECK(var)              \
+  if (var)                      \
+    return CL_INVALID_PROPERTY; \
+  else                          \
+    var = 1;
+
   int set_cl_context_platform = 0,
       set_cl_gl_context_khr = 0,
       set_cl_egl_display_khr = 0,
@@ -195,40 +203,39 @@ cl_context_properties_process(const cl_context_properties *prop,
   cl_props->platform_id = 0;
 
   if (prop == NULL)
-    goto exit;
-
+    goto error;
 
-  while(*prop) {
+  while (*prop) {
     switch (*prop) {
     case CL_CONTEXT_PLATFORM:
-      CHECK (set_cl_context_platform);
+      CHECK(set_cl_context_platform);
       cl_props->platform_id = *(prop + 1);
-      if (UNLIKELY((cl_platform_id) cl_props->platform_id != cl_get_platform_default())) {
+      if (UNLIKELY((cl_platform_id)cl_props->platform_id != cl_get_platform_default())) {
         err = CL_INVALID_PLATFORM;
         goto error;
       }
       break;
     case CL_GL_CONTEXT_KHR:
-      CHECK (set_cl_gl_context_khr);
+      CHECK(set_cl_gl_context_khr);
       cl_props->gl_context = *(prop + 1);
       break;
     case CL_EGL_DISPLAY_KHR:
-      CHECK (set_cl_egl_display_khr);
+      CHECK(set_cl_egl_display_khr);
       cl_props->gl_type = CL_GL_EGL_DISPLAY;
       cl_props->egl_display = *(prop + 1);
       break;
     case CL_GLX_DISPLAY_KHR:
-      CHECK (set_cl_glx_display_khr);
+      CHECK(set_cl_glx_display_khr);
       cl_props->gl_type = CL_GL_GLX_DISPLAY;
       cl_props->glx_display = *(prop + 1);
       break;
     case CL_WGL_HDC_KHR:
-      CHECK (set_cl_wgl_hdc_khr);
+      CHECK(set_cl_wgl_hdc_khr);
       cl_props->gl_type = CL_GL_WGL_HDC;
       cl_props->wgl_hdc = *(prop + 1);
       break;
     case CL_CGL_SHAREGROUP_KHR:
-      CHECK (set_cl_cgl_sharegroup_khr);
+      CHECK(set_cl_cgl_sharegroup_khr);
       cl_props->gl_type = CL_GL_CGL_SHAREGROUP;
       cl_props->cgl_sharegroup = *(prop + 1);
       break;
@@ -240,20 +247,104 @@ cl_context_properties_process(const cl_context_properties *prop,
     *prop_len += 2;
   }
   (*prop_len)++;
-exit:
+
 error:
   return err;
+
+#undef CHECK
+}
+
+/* Allocate a context holding a private copy of the dev_num entries of all_dev and
+ * one per-device context for each of them. Returns the new context, or NULL on failure. */
+static cl_context
+cl_context_new(struct _cl_context_prop *props, cl_uint dev_num, cl_device_id *all_dev)
+{
+  cl_uint i;
+  cl_context ctx = CL_CALLOC(1, sizeof(_cl_context));
+  if (ctx == NULL)
+    return NULL;
+
+  CL_OBJECT_INIT_BASE(ctx, CL_OBJECT_CONTEXT_MAGIC);
+  ctx->props = *props;
+  list_init(&ctx->queues);
+  list_init(&ctx->mem_objects);
+  list_init(&ctx->samplers);
+  list_init(&ctx->events);
+  list_init(&ctx->programs);
+  ctx->queue_modify_disable = 0;
+
+  ctx->device_num = dev_num;
+  ctx->devices = CL_MALLOC(dev_num * sizeof(cl_device_id));
+  if (ctx->devices == NULL)
+    goto error;
+  memcpy(ctx->devices, all_dev, dev_num * sizeof(cl_device_id));
+
+  ctx->each_device = CL_CALLOC(dev_num, sizeof(cl_context_for_device));
+  if (ctx->each_device == NULL)
+    goto error;
+  ctx->each_device_num = dev_num;
+
+  for (i = 0; i < dev_num; i++) {
+    ctx->each_device[i] = (ctx->devices[i]->api.context_new)(ctx->devices[i], ctx);
+    if (ctx->each_device[i] == NULL)
+      goto error;
+  }
+  return ctx;
+
+error:
+  /* Release everything acquired so far (devices and each_device used to leak here);
+   * each_device slots never filled are still NULL from CL_CALLOC. NOTE(review): the
+   * base set up by CL_OBJECT_INIT_BASE is not torn down, as before -- confirm OK. */
+  if (ctx->each_device) {
+    for (i = 0; i < dev_num; i++) {
+      if (ctx->each_device[i])
+        (ctx->devices[i]->api.context_delete)(ctx->each_device[i]->device, ctx);
+    }
+    CL_FREE(ctx->each_device);
+  }
+  if (ctx->devices)
+    CL_FREE(ctx->devices);
+  CL_FREE(ctx);
+  return NULL;
 }
 
+LOCAL void
+cl_context_delete(cl_context ctx)
+{
+  int i = 0;
+  if (ctx == NULL)
+    return;
+
+  /* Other references still exist; only the final release destroys the context. */
+  if (CL_OBJECT_DEC_REF(ctx) > 1)
+    return;
+
+  assert(ctx->devices);
+  for (i = 0; i < ctx->each_device_num; i++) {
+    (ctx->each_device[i]->device->api.context_delete)(ctx->each_device[i]->device, ctx);
+  }
+  CL_FREE(ctx->each_device);
+  ctx->each_device = NULL;
 
+  if (ctx->prop_user) {
+    CL_FREE(ctx->prop_user);
+    ctx->prop_user = NULL;
+  }
+
+  CL_FREE(ctx->devices);
+  ctx->devices = NULL;
+
+  CL_OBJECT_DESTROY_BASE(ctx);
+  CL_FREE(ctx);
+}
 
 LOCAL cl_context
-cl_create_context(const cl_context_properties *  properties,
-                  cl_uint                        num_devices,
-                  const cl_device_id *           devices,
-                  void (CL_CALLBACK * pfn_notify) (const char*, const void*, size_t, void*),
-                  void *                         user_data,
-                  cl_int *                       errcode_ret)
+cl_context_create(const cl_context_properties *properties,
+                  cl_uint num_devices,
+                  const cl_device_id *devices,
+                  void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),
+                  void *user_data,
+                  cl_int *errcode_ret)
 {
   /* cl_platform_id platform = NULL; */
   struct _cl_context_prop props;
@@ -261,23 +352,24 @@ cl_create_context(const cl_context_properties *  properties,
   cl_int err = CL_SUCCESS;
   cl_uint prop_len = 0;
   cl_uint dev_num = 0;
-  cl_device_id* all_dev = NULL;
+  cl_device_id *all_dev = NULL;
   cl_uint i, j;
 
-  /* XXX */
-  FATAL_IF (num_devices != 1, "Only one device is supported");
+  assert(num_devices > 0);
 
   /* Check that we are getting the right platform */
-  if (UNLIKELY(((err = cl_context_properties_process(properties, &props, &prop_len)) != CL_SUCCESS)))
-    goto error;
+  if ((err = cl_context_properties_process(properties, &props, &prop_len)) != CL_SUCCESS) {
+    *errcode_ret = err;
+    return NULL;
+  }
 
-  /* Filter out repeated device. */
-  assert(num_devices > 0);
   all_dev = CL_CALLOC(num_devices, sizeof(cl_device_id));
   if (all_dev == NULL) {
     *errcode_ret = CL_OUT_OF_HOST_MEMORY;
     return NULL;
   }
+
+  /* Filter out repeated device. */
   for (i = 0; i < num_devices; i++) {
     for (j = 0; j < i; j++) {
       if (devices[j] == devices[i]) {
@@ -292,101 +384,41 @@ cl_create_context(const cl_context_properties *  properties,
     all_dev[dev_num] = devices[i];
     dev_num++;
   }
-  assert(dev_num == 1); // TODO: multi devices later.
 
   /* We are good */
-  if (UNLIKELY((ctx = cl_context_new(&props, dev_num, all_dev)) == NULL)) {
-    CL_FREE(all_dev);
-    err = CL_OUT_OF_HOST_MEMORY;
-    goto error;
+  ctx = cl_context_new(&props, dev_num, all_dev);
+  CL_FREE(all_dev);
+  if (ctx == NULL) {
+    *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+    return NULL;
   }
 
-  if(properties != NULL && prop_len > 0) {
-    TRY_ALLOC (ctx->prop_user, CL_CALLOC(prop_len, sizeof(cl_context_properties)));
-    memcpy(ctx->prop_user, properties, sizeof(cl_context_properties)*prop_len);
+  if (properties != NULL && prop_len > 0) {
+    ctx->prop_user = CL_MALLOC(sizeof(cl_context_properties) * prop_len);
+    if (ctx->prop_user == NULL) {
+      cl_context_delete(ctx);
+      *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+      return NULL;
+    }
+    memcpy(ctx->prop_user, properties, sizeof(cl_context_properties) * prop_len);
   }
-  ctx->prop_len = prop_len;
-  /* cl_context_new will use all_dev. */
-  all_dev = NULL;
 
+  ctx->prop_len = prop_len;
   /* Save the user callback and user data*/
   ctx->pfn_notify = pfn_notify;
   ctx->user_data = user_data;
-  cl_driver_set_atomic_flag(ctx->drv, ctx->devices[0]->atomic_test_result);
-
-exit:
-  if (errcode_ret != NULL)
-    *errcode_ret = err;
-  return ctx;
-error:
-  cl_context_delete(ctx);
-  ctx = NULL;
-  goto exit;
-}
-
-LOCAL cl_context
-cl_context_new(struct _cl_context_prop *props, cl_uint dev_num, cl_device_id* all_dev)
-{
-  cl_context ctx = NULL;
-
-  TRY_ALLOC_NO_ERR (ctx, CL_CALLOC(1, sizeof(struct _cl_context)));
-  CL_OBJECT_INIT_BASE(ctx, CL_OBJECT_CONTEXT_MAGIC);
-  ctx->devices = all_dev;
-  ctx->device_num = dev_num;
-  list_init(&ctx->queues);
-  list_init(&ctx->mem_objects);
-  list_init(&ctx->samplers);
-  list_init(&ctx->events);
-  list_init(&ctx->programs);
-  ctx->queue_modify_disable = CL_FALSE;
-  TRY_ALLOC_NO_ERR (ctx->drv, cl_driver_new(props));
-  ctx->props = *props;
-  ctx->ver = cl_driver_get_ver(ctx->drv);
-
-exit:
-  return ctx;
-error:
-  cl_context_delete(ctx);
-  ctx = NULL;
-  goto exit;
-}
-
-LOCAL void
-cl_context_delete(cl_context ctx)
-{
-  int i = 0;
-  if (UNLIKELY(ctx == NULL))
-    return;
-
-  /* We are not done yet */
-  if (CL_OBJECT_DEC_REF(ctx) > 1)
-    return;
 
-  /* delete the internal programs. */
-  for (i = CL_INTERNAL_KERNEL_MIN; i < CL_INTERNAL_KERNEL_MAX; i++) {
-    if (ctx->internal_kernels[i]) {
-      cl_kernel_delete(ctx->internal_kernels[i]);
-      ctx->internal_kernels[i] = NULL;
-
-      assert(ctx->internal_prgs[i]);
-      cl_program_delete(ctx->internal_prgs[i]);
-      ctx->internal_prgs[i] = NULL;
-    }
-
-    if (ctx->built_in_kernels[i]) {
-      cl_kernel_delete(ctx->built_in_kernels[i]);
-      ctx->built_in_kernels[i] = NULL;
+  for (i = 0; i < ctx->device_num; i++) {
+    err = (ctx->devices[i]->api.context_create)(ctx->devices[i], ctx);
+    if (err != CL_SUCCESS) {
+      *errcode_ret = err;
+      cl_context_delete(ctx);
+      return NULL;
     }
   }
 
-  cl_program_delete(ctx->built_in_prgs);
-  ctx->built_in_prgs = NULL;
-
-  CL_FREE(ctx->prop_user);
-  CL_FREE(ctx->devices);
-  cl_driver_delete(ctx->drv);
-  CL_OBJECT_DESTROY_BASE(ctx);
-  CL_FREE(ctx);
+  *errcode_ret = err;
+  return ctx;
 }
 
 LOCAL void
@@ -402,98 +434,23 @@ cl_context_get_bufmgr(cl_context ctx)
   return cl_driver_get_bufmgr(ctx->drv);
 }
 
-cl_kernel
-cl_context_get_static_kernel_from_bin(cl_context ctx, cl_int index,
-                  const char * str_kernel, size_t size, const char * str_option)
-{
-  cl_int ret;
-  cl_int binary_status = CL_SUCCESS;
-  cl_kernel ker;
-
-  CL_OBJECT_TAKE_OWNERSHIP(ctx, 1);
-  if (ctx->internal_prgs[index] == NULL) {
-    ctx->internal_prgs[index] = cl_program_create_from_binary(ctx, 1, &ctx->devices[0],
-      &size, (const unsigned char **)&str_kernel, &binary_status, &ret);
-
-    if (!ctx->internal_prgs[index]) {
-      ker = NULL;
-      goto unlock;
-    }
-    ret = cl_program_build(ctx->internal_prgs[index], str_option);
-    if (ret != CL_SUCCESS) {
-      ker = NULL;
-      goto unlock;
-    }
-
-    ctx->internal_prgs[index]->is_built = 1;
-
-    /* All CL_ENQUEUE_FILL_BUFFER_ALIGN16_xxx use the same program, different kernel. */
-    if (index >= CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 && index <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
-      int i = CL_ENQUEUE_FILL_BUFFER_ALIGN8_8;
-      for (; i <= CL_ENQUEUE_FILL_BUFFER_ALIGN8_64; i++) {
-        if (index != i) {
-          assert(ctx->internal_prgs[i] == NULL);
-          assert(ctx->internal_kernels[i] == NULL);
-          cl_program_add_ref(ctx->internal_prgs[index]);
-          ctx->internal_prgs[i] = ctx->internal_prgs[index];
-        }
-
-        if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_8) {
-          ctx->internal_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
-                                                              "__cl_fill_region_align8_2", NULL);
-        } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_16) {
-          ctx->internal_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
-                                                              "__cl_fill_region_align8_4", NULL);
-        } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_32) {
-          ctx->internal_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
-                                                              "__cl_fill_region_align8_8", NULL);
-        } else if (i == CL_ENQUEUE_FILL_BUFFER_ALIGN8_64) {
-          ctx->internal_kernels[i] = cl_program_create_kernel(ctx->internal_prgs[index],
-                                                              "__cl_fill_region_align8_16", NULL);
-        } else
-          assert(0);
-      }
-    } else {
-      ctx->internal_kernels[index] = cl_kernel_dup(ctx->internal_prgs[index]->ker[0]);
-    }
-  }
-  ker = ctx->internal_kernels[index];
-
-unlock:
-  CL_OBJECT_RELEASE_OWNERSHIP(ctx);
-  return cl_kernel_dup(ker);
-}
-
-
 cl_mem
-cl_context_get_svm_from_ptr(cl_context ctx, const void * p)
+cl_context_get_svm_from_ptr(cl_context ctx, const void *p)
 {
   struct list_node *pos;
   cl_mem buf;
 
-  list_for_each (pos, (&ctx->mem_objects)) {
+  list_for_each(pos, (&ctx->mem_objects))
+  {
     buf = (cl_mem)list_entry(pos, _cl_base_object, node);
-    if(buf->host_ptr == NULL) continue;
-    if(buf->is_svm == 0) continue;
-    if(buf->type != CL_MEM_SVM_TYPE) continue;
-    if((size_t)buf->host_ptr <= (size_t)p &&
-       (size_t)p < ((size_t)buf->host_ptr + buf->size))
-      return buf;
-  }
-  return NULL;
-}
-
-cl_mem
-cl_context_get_mem_from_ptr(cl_context ctx, const void * p)
-{
-  struct list_node *pos;
-  cl_mem buf;
-
-  list_for_each (pos, (&ctx->mem_objects)) {
-    buf = (cl_mem)list_entry(pos, _cl_base_object, node);
-    if(buf->host_ptr == NULL) continue;
-    if((size_t)buf->host_ptr <= (size_t)p &&
-       (size_t)p < ((size_t)buf->host_ptr + buf->size))
+    if (buf->host_ptr == NULL)
+      continue;
+    if (buf->is_svm == 0)
+      continue;
+    if (buf->type != CL_MEM_SVM_TYPE)
+      continue;
+    if ((size_t)buf->host_ptr <= (size_t)p &&
+        (size_t)p < ((size_t)buf->host_ptr + buf->size))
       return buf;
   }
   return NULL;
diff --git a/src/cl_context.h b/src/cl_context.h
index 4812afd..0aeb080 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -14,7 +14,6 @@
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library. If not, see <http://www.gnu.org/licenses/>.
  *
- * Author: Benjamin Segovia <benjamin.segovia at intel.com>
  */
 
 #ifndef __CL_CONTEXT_H__
@@ -29,9 +28,6 @@
 #include <stdint.h>
 #include <pthread.h>
 
-/* DRI device created at create context */
-struct intel_driver;
-
 enum _cl_gl_context_type {
   CL_GL_NOSHARE,
   CL_GL_EGL_DISPLAY,
@@ -40,49 +36,7 @@ enum _cl_gl_context_type {
   CL_GL_CGL_SHAREGROUP
 };
 
-enum _cl_internal_ker_type {
-  CL_INTERNAL_KERNEL_MIN = 0,
-  CL_ENQUEUE_COPY_BUFFER_ALIGN4 = 0,
-  CL_ENQUEUE_COPY_BUFFER_ALIGN16,
-  CL_ENQUEUE_COPY_BUFFER_UNALIGN_SAME_OFFSET,
-  CL_ENQUEUE_COPY_BUFFER_UNALIGN_DST_OFFSET,
-  CL_ENQUEUE_COPY_BUFFER_UNALIGN_SRC_OFFSET,
-  CL_ENQUEUE_COPY_BUFFER_RECT,
-  CL_ENQUEUE_COPY_BUFFER_RECT_ALIGN4,
-  CL_ENQUEUE_COPY_IMAGE_1D_TO_1D,             //copy image 1d to image 1d
-  CL_ENQUEUE_COPY_IMAGE_2D_TO_2D,             //copy image 2d to image 2d
-  CL_ENQUEUE_COPY_IMAGE_3D_TO_2D,             //copy image 3d to image 2d
-  CL_ENQUEUE_COPY_IMAGE_2D_TO_3D,             //copy image 2d to image 3d
-  CL_ENQUEUE_COPY_IMAGE_3D_TO_3D,             //copy image 3d to image 3d
-  CL_ENQUEUE_COPY_IMAGE_2D_TO_2D_ARRAY,       //copy image 2d to image 2d array
-  CL_ENQUEUE_COPY_IMAGE_1D_ARRAY_TO_1D_ARRAY, //copy image 1d array to image 1d array
-  CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_2D_ARRAY, //copy image 2d array to image 2d array
-  CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_2D,       //copy image 2d array to image 2d
-  CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_3D,       //copy image 2d array to image 3d
-  CL_ENQUEUE_COPY_IMAGE_3D_TO_2D_ARRAY,       //copy image 3d to image 2d array
-  CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER,   //copy image 2d to buffer
-  CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN16,
-  CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER,   //copy image 3d tobuffer
-  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D,   //copy buffer to image 2d
-  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN16,
-  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D,   //copy buffer to image 3d
-  CL_ENQUEUE_FILL_BUFFER_UNALIGN,      //fill buffer with 1 aligne pattern, pattern size=1
-  CL_ENQUEUE_FILL_BUFFER_ALIGN2,       //fill buffer with 2 aligne pattern, pattern size=2
-  CL_ENQUEUE_FILL_BUFFER_ALIGN4,       //fill buffer with 4 aligne pattern, pattern size=4
-  CL_ENQUEUE_FILL_BUFFER_ALIGN8_8,     //fill buffer with 8 aligne pattern, pattern size=8
-  CL_ENQUEUE_FILL_BUFFER_ALIGN8_16,    //fill buffer with 16 aligne pattern, pattern size=16
-  CL_ENQUEUE_FILL_BUFFER_ALIGN8_32,    //fill buffer with 16 aligne pattern, pattern size=32
-  CL_ENQUEUE_FILL_BUFFER_ALIGN8_64,    //fill buffer with 16 aligne pattern, pattern size=64
-  CL_ENQUEUE_FILL_BUFFER_ALIGN128,     //fill buffer with 128 aligne pattern, pattern size=128
-  CL_ENQUEUE_FILL_IMAGE_1D,             //fill image 1d
-  CL_ENQUEUE_FILL_IMAGE_1D_ARRAY,       //fill image 1d array
-  CL_ENQUEUE_FILL_IMAGE_2D,             //fill image 2d
-  CL_ENQUEUE_FILL_IMAGE_2D_ARRAY,       //fill image 2d array
-  CL_ENQUEUE_FILL_IMAGE_3D,             //fill image 3d
-  CL_INTERNAL_KERNEL_MAX
-};
-
-struct _cl_context_prop {
+typedef struct _cl_context_prop {
   cl_context_properties platform_id;
   enum _cl_gl_context_type gl_type;
   cl_context_properties gl_context;
@@ -92,50 +46,53 @@ struct _cl_context_prop {
     cl_context_properties wgl_hdc;
     cl_context_properties cgl_sharegroup;
   };
-};
+} _cl_context_prop;
+
+#define IS_EGL_CONTEXT(ctx) (ctx->props.gl_type == CL_GL_EGL_DISPLAY)
+#define EGL_DISP(ctx) (EGLDisplay)(ctx->props.egl_display)
+#define EGL_CTX(ctx) (EGLContext)(ctx->props.gl_context)
 
-#define IS_EGL_CONTEXT(ctx)  (ctx->props.gl_type == CL_GL_EGL_DISPLAY)
-#define EGL_DISP(ctx)   (EGLDisplay)(ctx->props.egl_display)
-#define EGL_CTX(ctx)    (EGLContext)(ctx->props.gl_context)
-/* Encapsulate the whole device */
-struct _cl_context {
+typedef struct _cl_context_for_device {
+  cl_device_id device; /* Point to the device it belong to */
+} _cl_context_for_device;
+typedef _cl_context_for_device *cl_context_for_device;
+
+typedef struct _cl_context {
   _cl_base_object base;
-  cl_driver drv;                    /* Handles HW or simulator */
-  cl_device_id* devices;            /* All devices belong to this context */
-  cl_uint device_num;               /* Devices number of this context */
-  list_head queues;                 /* All command queues currently allocated */
-  cl_uint queue_num;                /* All queue number currently allocated */
+  cl_device_id *devices;  /* All devices belong to this context */
+  cl_uint device_num;     /* Devices number of this context */
+  list_head queues;       /* All command queues currently allocated */
+  cl_uint queue_num;      /* All queue number currently allocated */
   cl_uint queue_modify_disable;     /* Temp disable queue list change. */
-  list_head mem_objects;            /* All memory object currently allocated */
-  cl_uint mem_object_num;           /* All memory number currently allocated */
-  list_head samplers;               /* All sampler object currently allocated */
-  cl_uint sampler_num;              /* All sampler number currently allocated */
-  list_head events;                 /* All event object currently allocated */
-  cl_uint event_num;                /* All event number currently allocated */
-  list_head programs;               /* All programs currently allocated */
-  cl_uint program_num;              /* All program number currently allocated */
-
-  cl_accelerator_intel accels;      /* All accelerator_intel object currently allocated */
-  cl_program internal_prgs[CL_INTERNAL_KERNEL_MAX];
-                                    /* All programs internal used, for example clEnqueuexxx api use */
-  cl_kernel  internal_kernels[CL_INTERNAL_KERNEL_MAX];
-                                    /* All kernels  for clenqueuexxx api, for example clEnqueuexxx api use */
-  cl_program built_in_prgs;  /*all built-in kernels belongs to this program only*/
-  cl_kernel  built_in_kernels[CL_INTERNAL_KERNEL_MAX];
-  uint32_t ver;                     /* Gen version */
+  list_head mem_objects;  /* All memory object currently allocated */
+  cl_uint mem_object_num; /* All memory number currently allocated */
+  list_head samplers;     /* All sampler object currently allocated */
+  cl_uint sampler_num;    /* All sampler number currently allocated */
+  list_head events;       /* All event object currently allocated */
+  cl_uint event_num;      /* All event number currently allocated */
+  list_head programs;     /* All programs currently allocated */
+  cl_uint program_num;    /* All program number currently allocated */
+
   struct _cl_context_prop props;
-  cl_context_properties * prop_user; /* a copy of user passed context properties when create context */
-  cl_uint                 prop_len;  /* count of the properties */
-  void (CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *);
-                                     /* User's callback when error occur in context */
-  void *user_data;                   /* A pointer to user supplied data */
+  cl_context_properties *prop_user; /* a copy of user passed context properties when create context */
+  cl_uint prop_len;                 /* count of the properties */
 
-};
+  cl_uint each_device_num;            /* Each device number */
+  cl_context_for_device *each_device; /* Context content interpreted by device */
+
+  void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *);
+  /* User's callback, invoked when an error occurs in the context */
+  void *user_data; /* A pointer to user supplied data */
+
+  // TODO: Delete later
+  void *drv;
+  cl_accelerator_intel accels; /* All accelerator_intel object currently allocated */
+} _cl_context;
 
 #define CL_OBJECT_CONTEXT_MAGIC 0x20BBCADE993134AALL
-#define CL_OBJECT_IS_CONTEXT(obj) ((obj &&                           \
-         ((cl_base_object)obj)->magic == CL_OBJECT_CONTEXT_MAGIC &&  \
-         CL_OBJECT_GET_REF(obj) >= 1))
+#define CL_OBJECT_IS_CONTEXT(obj) ((obj &&                                                     \
+                                    ((cl_base_object)obj)->magic == CL_OBJECT_CONTEXT_MAGIC && \
+                                    CL_OBJECT_GET_REF(obj) >= 1))
 
 extern void cl_context_add_queue(cl_context ctx, cl_command_queue queue);
 extern void cl_context_remove_queue(cl_context ctx, cl_command_queue queue);
@@ -147,44 +104,17 @@ extern void cl_context_add_event(cl_context ctx, cl_event sampler);
 extern void cl_context_remove_event(cl_context ctx, cl_event sampler);
 extern void cl_context_add_program(cl_context ctx, cl_program program);
 extern void cl_context_remove_program(cl_context ctx, cl_program program);
-
-/* Implement OpenCL function */
-extern cl_context cl_create_context(const cl_context_properties*,
-                                    cl_uint,
-                                    const cl_device_id*,
-                                    void (CL_CALLBACK * pfn_notify) (const char*, const void*, size_t, void*),
-                                    void *,
-                                    cl_int*);
-
-/* Allocate and initialize a context */
-extern cl_context cl_context_new(struct _cl_context_prop *prop, cl_uint dev_num, cl_device_id* all_dev);
-
-/* Destroy and deallocate a context */
+extern cl_context cl_context_create(const cl_context_properties *, cl_uint, const cl_device_id *,
+                                    void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),
+                                    void *, cl_int *);
 extern void cl_context_delete(cl_context);
-
-/* Increment the context reference counter */
 extern void cl_context_add_ref(cl_context);
+extern cl_mem cl_context_get_svm_from_ptr(cl_context ctx, const void *p);
 
-/* Enqueue a ND Range kernel */
-extern cl_int cl_context_ND_kernel(cl_context,
-                                   cl_command_queue,
-                                   cl_kernel,
-                                   cl_uint,
-                                   const size_t*,
-                                   const size_t*,
-                                   const size_t*);
 
 /* Used for allocation */
 extern cl_buffer_mgr cl_context_get_bufmgr(cl_context ctx);
-
 /* Get the internal used kernel from binary*/
 extern cl_kernel cl_context_get_static_kernel_from_bin(cl_context ctx, cl_int index,
-                  const char * str_kernel, size_t size, const char * str_option);
-
-/* Get the SVM from pointer, return NULL if pointer is not from SVM */
-extern cl_mem cl_context_get_svm_from_ptr(cl_context ctx, const void *p);
-/* Get the mem from pointer, return NULL if pointer is not from mem*/
-extern cl_mem cl_context_get_mem_from_ptr(cl_context ctx, const void *p);
-
+                                                       const char *str_kernel, size_t size, const char *str_option);
 #endif /* __CL_CONTEXT_H__ */
-
diff --git a/src/cl_device_enqueue.c b/src/cl_device_enqueue.c
index 5d55c22..527fc98 100644
--- a/src/cl_device_enqueue.c
+++ b/src/cl_device_enqueue.c
@@ -28,6 +28,9 @@
 
 LOCAL cl_int
 cl_device_enqueue_fix_offset(cl_kernel ker) {
+
+return CL_SUCCESS; /* Stub: always succeeds; the old body below is compiled out by #if 0 (NOTE(review): presumably the "2.0 features" TODO - confirm). */
+#if 0
   uint32_t i;
   void *ptr;
   cl_mem mem;
@@ -52,11 +55,14 @@ cl_device_enqueue_fix_offset(cl_kernel ker) {
     }
   }
   return 0;
+#endif
 }
 
 LOCAL cl_int
 cl_device_enqueue_bind_buffer(cl_gpgpu gpgpu, cl_kernel ker, uint32_t *max_bti, cl_gpgpu_kernel *kernel)
 {
+return CL_SUCCESS; /* Stub: always succeeds and ignores all arguments; the old body below is compiled out by #if 0. */
+#if 0
   int32_t value = GBE_CURBE_ENQUEUE_BUF_POINTER;
   int32_t offset = interp_kernel_get_curbe_offset(ker->opaque, value, 0);
   size_t buf_size = 32 * 1024 * 1024;  //fix 32M
@@ -81,6 +87,7 @@ cl_device_enqueue_bind_buffer(cl_gpgpu gpgpu, cl_kernel ker, uint32_t *max_bti,
     cl_gpgpu_set_kernel(gpgpu, ker);
   }
   return 0;
+#endif
 }
 
 typedef struct ndrange_info_t {
@@ -110,6 +117,9 @@ typedef struct Block_literal {
 LOCAL cl_int
 cl_device_enqueue_parse_result(cl_command_queue queue, cl_gpgpu gpgpu)
 {
+return CL_SUCCESS; /* Stub: always succeeds and ignores queue/gpgpu; the old body below is compiled out by #if 0. */
+
+#if 0
   cl_mem mem;
   int size, type, dim, i;
   const char * kernel_name;
@@ -198,4 +208,5 @@ cl_device_enqueue_parse_result(cl_command_queue queue, cl_gpgpu gpgpu)
   cl_mem_unmap_auto(mem);
   cl_kernel_delete(ker);
   return 0;
+#endif
 }
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index d4f4208..2fd3c6e 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -19,1568 +19,424 @@
 
 #include "cl_platform_id.h"
 #include "cl_device_id.h"
-#include "cl_internals.h"
 #include "cl_utils.h"
-#include "cl_driver.h"
-#include "cl_device_data.h"
-#include "cl_khr_icd.h"
-#include "CL/cl.h"
 #include "CL/cl_ext.h"
-#include "CL/cl_intel.h"
-#include "cl_gbe_loader.h"
-#include "cl_alloc.h"
-
-#include <assert.h>
-#include <stdio.h>
 #include <string.h>
-#include <stdlib.h>
-#include <sys/sysinfo.h>
 
 #ifndef CL_VERSION_1_2
 #define CL_DEVICE_BUILT_IN_KERNELS 0x103F
 #endif
 
-static struct _cl_device_id intel_ivb_gt2_device = {
-  .max_compute_unit = 16,
-  .max_thread_per_unit = 8,
-  .sub_slice_count = 2,
-  .max_work_item_sizes = {512, 512, 512},
-  .max_work_group_size = 512,
-  .max_clock_frequency = 1000,
-#include "cl_gen7_device.h"
-};
-
-static struct _cl_device_id intel_ivb_gt1_device = {
-  .max_compute_unit = 6,
-  .max_thread_per_unit = 6,
-  .sub_slice_count = 1,
-  .max_work_item_sizes = {256, 256, 256},
-  .max_work_group_size = 256,
-  .max_clock_frequency = 1000,
-#include "cl_gen7_device.h"
-};
-
-static struct _cl_device_id intel_baytrail_t_device = {
-  .max_compute_unit = 4,
-  .max_thread_per_unit = 8,
-  .sub_slice_count = 1,
-  .max_work_item_sizes = {256, 256, 256},
-  .max_work_group_size = 256,
-  .max_clock_frequency = 1000,
-#include "cl_gen7_device.h"
-};
-
-/* XXX we clone IVB for HSW now */
-static struct _cl_device_id intel_hsw_gt1_device = {
-  .max_compute_unit = 10,
-  .max_thread_per_unit = 7,
-  .sub_slice_count = 1,
-  .max_work_item_sizes = {512, 512, 512},
-  .max_work_group_size = 512,
-  .max_clock_frequency = 1000,
-#include "cl_gen75_device.h"
-};
-
-static struct _cl_device_id intel_hsw_gt2_device = {
-  .max_compute_unit = 20,
-  .max_thread_per_unit = 7,
-  .sub_slice_count = 2,
-  .max_work_item_sizes = {512, 512, 512},
-  .max_work_group_size = 512,
-  .max_clock_frequency = 1000,
-#include "cl_gen75_device.h"
-};
-
-static struct _cl_device_id intel_hsw_gt3_device = {
-  .max_compute_unit = 40,
-  .max_thread_per_unit = 7,
-  .sub_slice_count = 4,
-  .max_work_item_sizes = {512, 512, 512},
-  .max_work_group_size = 512,
-  .max_clock_frequency = 1000,
-#include "cl_gen75_device.h"
-};
-
-/* XXX we clone IVB for HSW now */
-static struct _cl_device_id intel_brw_gt1_device = {
-  .max_compute_unit = 12,
-  .max_thread_per_unit = 7,
-  .sub_slice_count = 2,
-  .max_work_item_sizes = {512, 512, 512},
-  .max_work_group_size = 512,
-  .max_clock_frequency = 1000,
-#include "cl_gen8_device.h"
-};
-
-static struct _cl_device_id intel_brw_gt2_device = {
-  .max_compute_unit = 24,
-  .max_thread_per_unit = 7,
-  .sub_slice_count = 3,
-  .max_work_item_sizes = {512, 512, 512},
-  .max_work_group_size = 512,
-  .max_clock_frequency = 1000,
-#include "cl_gen8_device.h"
-};
-
-static struct _cl_device_id intel_brw_gt3_device = {
-  .max_compute_unit = 48,
-  .max_thread_per_unit = 7,
-  .sub_slice_count = 6,
-  .max_work_item_sizes = {512, 512, 512},
-  .max_work_group_size = 512,
-  .max_clock_frequency = 1000,
-#include "cl_gen8_device.h"
-};
-
-//Cherryview has the same pciid, must get the max_compute_unit and max_thread_per_unit from drm
-static struct _cl_device_id intel_chv_device = {
-  .max_compute_unit = 8,
-  .max_thread_per_unit = 7,
-  .sub_slice_count = 2,
-  .max_work_item_sizes = {512, 512, 512},
-  .max_work_group_size = 512,
-  .max_clock_frequency = 1000,
-#include "cl_gen75_device.h"
-};
-
-/* XXX we clone brw now */
-static struct _cl_device_id intel_skl_gt1_device = {
-  .max_compute_unit = 6,
-  .max_thread_per_unit = 7,
-  .sub_slice_count = 2,
-  .max_work_item_sizes = {512, 512, 512},
-  .max_work_group_size = 512,
-  .max_clock_frequency = 1000,
-#include "cl_gen9_device.h"
-};
-
-static struct _cl_device_id intel_skl_gt2_device = {
-  .max_compute_unit = 24,
-  .max_thread_per_unit = 7,
-  .sub_slice_count = 3,
-  .max_work_item_sizes = {512, 512, 512},
-  .max_work_group_size = 512,
-  .max_clock_frequency = 1000,
-#include "cl_gen9_device.h"
-};
-
-static struct _cl_device_id intel_skl_gt3_device = {
-  .max_compute_unit = 48,
-  .max_thread_per_unit = 7,
-  .sub_slice_count = 6,
-  .max_work_item_sizes = {512, 512, 512},
-  .max_work_group_size = 512,
-  .max_clock_frequency = 1000,
-#include "cl_gen9_device.h"
-};
-
-static struct _cl_device_id intel_skl_gt4_device = {
-  .max_compute_unit = 72,
-  .max_thread_per_unit = 7,
-  .sub_slice_count = 9,
-  .max_work_item_sizes = {512, 512, 512},
-  .max_work_group_size = 512,
-  .max_clock_frequency = 1000,
-#include "cl_gen9_device.h"
-};
-
-static struct _cl_device_id intel_bxt18eu_device = {
-  .max_compute_unit = 18,
-  .max_thread_per_unit = 6,
-  .sub_slice_count = 3,
-  .max_work_item_sizes = {512, 512, 512},
-  .max_work_group_size = 512,
-  .max_clock_frequency = 1000,
-#include "cl_gen9_device.h"
-};
-
-static struct _cl_device_id intel_bxt12eu_device = {
-  .max_compute_unit = 12,
-  .max_thread_per_unit = 6,
-  .sub_slice_count = 2,
-  .max_work_item_sizes = {512, 512, 512},
-  .max_work_group_size = 512,
-  .max_clock_frequency = 1000,
-#include "cl_gen9_device.h"
-};
-
-static struct _cl_device_id intel_kbl_gt1_device = {
-  .max_compute_unit = 12,
-  .max_thread_per_unit = 7,
-  .sub_slice_count = 2,
-  .max_work_item_sizes = {512, 512, 512},
-  .max_work_group_size = 512,
-  .max_clock_frequency = 1000,
-#include "cl_gen9_device.h"
-};
-
-static struct _cl_device_id intel_kbl_gt15_device = {
-  .max_compute_unit = 18,
-  .max_thread_per_unit = 7,
-  .sub_slice_count = 3,
-  .max_work_item_sizes = {512, 512, 512},
-  .max_work_group_size = 512,
-  .max_clock_frequency = 1000,
-#include "cl_gen9_device.h"
-};
-
-static struct _cl_device_id intel_kbl_gt2_device = {
-  .max_compute_unit = 24,
-  .max_thread_per_unit = 7,
-  .sub_slice_count = 3,
-  .max_work_item_sizes = {512, 512, 512},
-  .max_work_group_size = 512,
-  .max_clock_frequency = 1000,
-#include "cl_gen9_device.h"
-};
-
-static struct _cl_device_id intel_kbl_gt3_device = {
-  .max_compute_unit = 48,
-  .max_thread_per_unit = 7,
-  .sub_slice_count = 6,
-  .max_work_item_sizes = {512, 512, 512},
-  .max_work_group_size = 512,
-  .max_clock_frequency = 1000,
-#include "cl_gen9_device.h"
-};
-
-static struct _cl_device_id intel_kbl_gt4_device = {
-  .max_compute_unit = 72,
-  .max_thread_per_unit = 7,
-  .sub_slice_count = 9,
-  .max_work_item_sizes = {512, 512, 512},
-  .max_work_group_size = 512,
-  .max_clock_frequency = 1000,
-#include "cl_gen9_device.h"
-};
-
-LOCAL cl_device_id
-cl_get_gt_device(cl_device_type device_type)
+LOCAL cl_int
+cl_device_get_ids(cl_platform_id platform, cl_device_type device_type, cl_uint num_entries,
+                  cl_device_id *devices, cl_uint *num_devices)
 {
-  cl_device_id ret = NULL;
-  const int device_id = cl_driver_get_device_id();
   cl_device_id device = NULL;
 
-  //cl_get_gt_device only return GPU type device.
-  if (((CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_DEFAULT) & device_type) == 0)
-    return NULL;
-
-#define DECL_INFO_STRING(BREAK, STRUCT, FIELD, STRING) \
-    STRUCT.FIELD = STRING; \
-    STRUCT.JOIN(FIELD,_sz) = sizeof(STRING); \
-    device = &STRUCT; \
-    goto BREAK;
-
-  switch (device_id) {
-    case PCI_CHIP_HASWELL_D1:
-      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell GT1 Desktop");
-    case PCI_CHIP_HASWELL_D2:
-      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell GT2 Desktop");
-    case PCI_CHIP_HASWELL_D3:
-      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell GT3 Desktop");
-    case PCI_CHIP_HASWELL_S1:
-      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell GT1 Server");
-    case PCI_CHIP_HASWELL_S2:
-      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell GT2 Server");
-    case PCI_CHIP_HASWELL_S3:
-      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell GT3 Server");
-    case PCI_CHIP_HASWELL_M1:
-      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell GT1 Mobile");
-    case PCI_CHIP_HASWELL_M2:
-      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell GT2 Mobile");
-    case PCI_CHIP_HASWELL_M3:
-      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell GT3 Mobile");
-    case PCI_CHIP_HASWELL_B1:
-      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell GT1 reserved");
-    case PCI_CHIP_HASWELL_B2:
-      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell GT2 reserved");
-    case PCI_CHIP_HASWELL_B3:
-      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell GT3 reserved");
-    case PCI_CHIP_HASWELL_E1:
-      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell GT1 reserved");
-    case PCI_CHIP_HASWELL_E2:
-      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell GT2 reserved");
-    case PCI_CHIP_HASWELL_E3:
-      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell GT3 reserved");
-    case PCI_CHIP_HASWELL_SDV_D1:
-      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell"
-                                                           " Software Development Vehicle device GT1 Desktop");
-    case PCI_CHIP_HASWELL_SDV_D2:
-      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell"
-                                                           " Software Development Vehicle device GT2 Desktop");
-    case PCI_CHIP_HASWELL_SDV_D3:
-      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell"
-                                                           " Software Development Vehicle device GT3 Desktop");
-    case PCI_CHIP_HASWELL_SDV_S1:
-      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell"
-                                                           " Software Development Vehicle device GT1 Server");
-    case PCI_CHIP_HASWELL_SDV_S2:
-      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell"
-                                                           " Software Development Vehicle device GT2 Server");
-    case PCI_CHIP_HASWELL_SDV_S3:
-      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell"
-                                                           " Software Development Vehicle device GT3 Server");
-    case PCI_CHIP_HASWELL_SDV_M1:
-      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell"
-                                                           " Software Development Vehicle device GT1 Mobile");
-    case PCI_CHIP_HASWELL_SDV_M2:
-      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell"
-                                                           " Software Development Vehicle device GT2 Mobile");
-    case PCI_CHIP_HASWELL_SDV_M3:
-      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell"
-                                                           " Software Development Vehicle device GT3 Mobile");
-    case PCI_CHIP_HASWELL_SDV_B1:
-      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell"
-                                                           " Software Development Vehicle device GT1 reserved");
-    case PCI_CHIP_HASWELL_SDV_B2:
-      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell"
-                                                           " Software Development Vehicle device GT2 reserved");
-    case PCI_CHIP_HASWELL_SDV_B3:
-      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell"
-                                                           " Software Development Vehicle device GT3 reserved");
-    case PCI_CHIP_HASWELL_SDV_E1:
-      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell"
-                                                           " Software Development Vehicle device GT1 reserved");
-    case PCI_CHIP_HASWELL_SDV_E2:
-      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell"
-                                                           " Software Development Vehicle device GT2 reserved");
-    case PCI_CHIP_HASWELL_SDV_E3:
-      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell"
-                                                           " Software Development Vehicle device GT3 reserved");
-    case PCI_CHIP_HASWELL_ULT_D1:
-      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT1 Desktop");
-    case PCI_CHIP_HASWELL_ULT_D2:
-      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT2 Desktop");
-    case PCI_CHIP_HASWELL_ULT_D3:
-      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT3 Desktop");
-    case PCI_CHIP_HASWELL_ULT_S1:
-      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT1 Server");
-    case PCI_CHIP_HASWELL_ULT_S2:
-      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT2 Server");
-    case PCI_CHIP_HASWELL_ULT_S3:
-      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT3 Server");
-    case PCI_CHIP_HASWELL_ULT_M1:
-      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT1 Mobile");
-    case PCI_CHIP_HASWELL_ULT_M2:
-      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile");
-    case PCI_CHIP_HASWELL_ULT_M3:
-      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT3 Mobile");
-    case PCI_CHIP_HASWELL_ULT_B1:
-      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT1 reserved");
-    case PCI_CHIP_HASWELL_ULT_B2:
-      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT2 reserved");
-    case PCI_CHIP_HASWELL_ULT_B3:
-      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT3 reserved");
-    case PCI_CHIP_HASWELL_ULT_E1:
-      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT1 reserved");
-    case PCI_CHIP_HASWELL_ULT_E2:
-      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT2 reserved");
-    case PCI_CHIP_HASWELL_ULT_E3:
-      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell Ultrabook GT3 reserved");
-
-	/* CRW */
-    case PCI_CHIP_HASWELL_CRW_D1:
-      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell CRW GT1 Desktop");
-    case PCI_CHIP_HASWELL_CRW_D2:
-      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell CRW GT2 Desktop");
-    case PCI_CHIP_HASWELL_CRW_D3:
-      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell CRW GT3 Desktop");
-    case PCI_CHIP_HASWELL_CRW_S1:
-      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell CRW GT1 Server");
-    case PCI_CHIP_HASWELL_CRW_S2:
-      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell CRW GT2 Server");
-    case PCI_CHIP_HASWELL_CRW_S3:
-      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell CRW GT3 Server");
-    case PCI_CHIP_HASWELL_CRW_M1:
-      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell CRW GT1 Mobile");
-    case PCI_CHIP_HASWELL_CRW_M2:
-      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell CRW GT2 Mobile");
-    case PCI_CHIP_HASWELL_CRW_M3:
-      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell CRW GT3 Mobile");
-    case PCI_CHIP_HASWELL_CRW_B1:
-      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell CRW GT1 reserved");
-    case PCI_CHIP_HASWELL_CRW_B2:
-      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell CRW GT2 reserved");
-    case PCI_CHIP_HASWELL_CRW_B3:
-      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell CRW GT3 reserved");
-    case PCI_CHIP_HASWELL_CRW_E1:
-      DECL_INFO_STRING(has_break, intel_hsw_gt1_device, name, "Intel(R) HD Graphics Haswell CRW GT1 reserved");
-    case PCI_CHIP_HASWELL_CRW_E2:
-      DECL_INFO_STRING(has_break, intel_hsw_gt2_device, name, "Intel(R) HD Graphics Haswell CRW GT2 reserved");
-    case PCI_CHIP_HASWELL_CRW_E3:
-      DECL_INFO_STRING(has_break, intel_hsw_gt3_device, name, "Intel(R) HD Graphics Haswell CRW GT3 reserved");
-has_break:
-      device->device_id = device_id;
-      device->platform = cl_get_platform_default();
-      ret = device;
-      cl_intel_platform_get_default_extension(ret);
-      break;
-
-    case PCI_CHIP_IVYBRIDGE_GT1:
-      DECL_INFO_STRING(ivb_gt1_break, intel_ivb_gt1_device, name, "Intel(R) HD Graphics IvyBridge GT1");
-    case PCI_CHIP_IVYBRIDGE_M_GT1:
-      DECL_INFO_STRING(ivb_gt1_break, intel_ivb_gt1_device, name, "Intel(R) HD Graphics IvyBridge M GT1");
-    case PCI_CHIP_IVYBRIDGE_S_GT1:
-      DECL_INFO_STRING(ivb_gt1_break, intel_ivb_gt1_device, name, "Intel(R) HD Graphics IvyBridge S GT1");
-ivb_gt1_break:
-      intel_ivb_gt1_device.device_id = device_id;
-      intel_ivb_gt1_device.platform = cl_get_platform_default();
-      ret = &intel_ivb_gt1_device;
-      cl_intel_platform_get_default_extension(ret);
-      cl_intel_platform_enable_extension(ret, cl_intel_motion_estimation_ext_id);
-      break;
-
-    case PCI_CHIP_IVYBRIDGE_GT2:
-      DECL_INFO_STRING(ivb_gt2_break, intel_ivb_gt2_device, name, "Intel(R) HD Graphics IvyBridge GT2");
-    case PCI_CHIP_IVYBRIDGE_M_GT2:
-      DECL_INFO_STRING(ivb_gt2_break, intel_ivb_gt2_device, name, "Intel(R) HD Graphics IvyBridge M GT2");
-    case PCI_CHIP_IVYBRIDGE_S_GT2:
-      DECL_INFO_STRING(ivb_gt2_break, intel_ivb_gt2_device, name, "Intel(R) HD Graphics IvyBridge S GT2");
-ivb_gt2_break:
-      intel_ivb_gt2_device.device_id = device_id;
-      intel_ivb_gt2_device.platform = cl_get_platform_default();
-      ret = &intel_ivb_gt2_device;
-      cl_intel_platform_get_default_extension(ret);
-      cl_intel_platform_enable_extension(ret, cl_intel_motion_estimation_ext_id);
-      break;
-
-    case PCI_CHIP_BAYTRAIL_T:
-      DECL_INFO_STRING(baytrail_t_device_break, intel_baytrail_t_device, name, "Intel(R) HD Graphics Bay Trail-T");
-baytrail_t_device_break:
-      intel_baytrail_t_device.device_id = device_id;
-      intel_baytrail_t_device.platform = cl_get_platform_default();
-      ret = &intel_baytrail_t_device;
-      cl_intel_platform_get_default_extension(ret);
-      cl_intel_platform_enable_extension(ret, cl_intel_motion_estimation_ext_id);
-      break;
-
-    case PCI_CHIP_BROADWLL_M_GT1:
-      DECL_INFO_STRING(brw_gt1_break, intel_brw_gt1_device, name, "Intel(R) HD Graphics BroadWell Mobile GT1");
-    case PCI_CHIP_BROADWLL_D_GT1:
-      DECL_INFO_STRING(brw_gt1_break, intel_brw_gt1_device, name, "Intel(R) HD Graphics BroadWell U-Processor GT1");
-    case PCI_CHIP_BROADWLL_S_GT1:
-      DECL_INFO_STRING(brw_gt1_break, intel_brw_gt1_device, name, "Intel(R) HD Graphics BroadWell Server GT1");
-    case PCI_CHIP_BROADWLL_W_GT1:
-      DECL_INFO_STRING(brw_gt1_break, intel_brw_gt1_device, name, "Intel(R) HD Graphics BroadWell Workstation GT1");
-    case PCI_CHIP_BROADWLL_U_GT1:
-      DECL_INFO_STRING(brw_gt1_break, intel_brw_gt1_device, name, "Intel(R) HD Graphics BroadWell ULX GT1");
-brw_gt1_break:
-      /* For Gen8 and later, half float is suppported and we will enable cl_khr_fp16. */
-      intel_brw_gt1_device.device_id = device_id;
-      intel_brw_gt1_device.platform = cl_get_platform_default();
-      ret = &intel_brw_gt1_device;
-      cl_intel_platform_get_default_extension(ret);
-#ifdef ENABLE_FP64
-      cl_intel_platform_enable_extension(ret, cl_khr_fp64_ext_id);
-#endif
-      cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
-      break;
-
-    case PCI_CHIP_BROADWLL_M_GT2:
-      DECL_INFO_STRING(brw_gt2_break, intel_brw_gt2_device, name, "Intel(R) HD Graphics 5600 BroadWell Mobile GT2");
-    case PCI_CHIP_BROADWLL_D_GT2:
-      DECL_INFO_STRING(brw_gt2_break, intel_brw_gt2_device, name, "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2");
-    case PCI_CHIP_BROADWLL_S_GT2:
-      DECL_INFO_STRING(brw_gt2_break, intel_brw_gt2_device, name, "Intel(R) HD Graphics BroadWell Server GT2");
-    case PCI_CHIP_BROADWLL_W_GT2:
-      DECL_INFO_STRING(brw_gt2_break, intel_brw_gt2_device, name, "Intel(R) HD Graphics BroadWell Workstation GT2");
-    case PCI_CHIP_BROADWLL_U_GT2:
-      DECL_INFO_STRING(brw_gt2_break, intel_brw_gt2_device, name, "Intel(R) HD Graphics 5300 BroadWell ULX GT2");
-brw_gt2_break:
-      intel_brw_gt2_device.device_id = device_id;
-      intel_brw_gt2_device.platform = cl_get_platform_default();
-      ret = &intel_brw_gt2_device;
-      cl_intel_platform_get_default_extension(ret);
-#ifdef ENABLE_FP64
-      cl_intel_platform_enable_extension(ret, cl_khr_fp64_ext_id);
-#endif
-      cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
-      break;
-
-    case PCI_CHIP_BROADWLL_M_GT3:
-      DECL_INFO_STRING(brw_gt3_break, intel_brw_gt3_device, name, "Intel(R) Iris Pro Graphics 6200 BroadWell Mobile GT3");
-    case PCI_CHIP_BROADWLL_D_GT3:
-      DECL_INFO_STRING(brw_gt3_break, intel_brw_gt3_device, name, "Intel(R) HD Graphics 6000 BroadWell U-Processor GT3");
-    case PCI_CHIP_BROADWLL_UI_GT3:
-      DECL_INFO_STRING(brw_gt3_break, intel_brw_gt3_device, name, "Intel(R) Iris Graphics 6100 BroadWell U-Processor GT3");
-    case PCI_CHIP_BROADWLL_S_GT3:
-      DECL_INFO_STRING(brw_gt3_break, intel_brw_gt3_device, name, "Intel(R) Iris Pro Graphics P6300 BroadWell Server GT3");
-    case PCI_CHIP_BROADWLL_W_GT3:
-      DECL_INFO_STRING(brw_gt3_break, intel_brw_gt3_device, name, "Intel(R) HD Graphics BroadWell Workstation GT3");
-    case PCI_CHIP_BROADWLL_U_GT3:
-      DECL_INFO_STRING(brw_gt3_break, intel_brw_gt3_device, name, "Intel(R) HD Graphics BroadWell ULX GT3");
-brw_gt3_break:
-      intel_brw_gt3_device.device_id = device_id;
-      intel_brw_gt3_device.platform = cl_get_platform_default();
-      ret = &intel_brw_gt3_device;
-      cl_intel_platform_get_default_extension(ret);
-#ifdef ENABLE_FP64
-      cl_intel_platform_enable_extension(ret, cl_khr_fp64_ext_id);
-#endif
-      cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
-      break;
-
-    case PCI_CHIP_CHV_0:
-    case PCI_CHIP_CHV_1:
-    case PCI_CHIP_CHV_2:
-    case PCI_CHIP_CHV_3:
-      DECL_INFO_STRING(chv_break, intel_chv_device, name, "Intel(R) HD Graphics Cherryview");
-chv_break:
-      intel_chv_device.device_id = device_id;
-      intel_chv_device.platform = cl_get_platform_default();
-      ret = &intel_chv_device;
-      cl_intel_platform_get_default_extension(ret);
-#ifdef ENABLE_FP64
-      cl_intel_platform_enable_extension(ret, cl_khr_fp64_ext_id);
-#endif
-      cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
-      break;
-
-
-    case PCI_CHIP_SKYLAKE_ULT_GT1:
-      DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device, name, "Intel(R) HD Graphics Skylake ULT GT1");
-    case PCI_CHIP_SKYLAKE_ULX_GT1:
-      DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device, name, "Intel(R) HD Graphics Skylake ULX GT1");
-    case PCI_CHIP_SKYLAKE_DT_GT1:
-      DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device, name, "Intel(R) HD Graphics Skylake Desktop GT1");
-    case PCI_CHIP_SKYLAKE_HALO_GT1:
-      DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device, name, "Intel(R) HD Graphics Skylake Halo GT1");
-    case PCI_CHIP_SKYLAKE_SRV_GT1:
-      DECL_INFO_STRING(skl_gt1_break, intel_skl_gt1_device, name, "Intel(R) HD Graphics Skylake Server GT1");
-skl_gt1_break:
-      intel_skl_gt1_device.device_id = device_id;
-      intel_skl_gt1_device.platform = cl_get_platform_default();
-      ret = &intel_skl_gt1_device;
-#ifdef ENABLE_FP64
-      cl_intel_platform_enable_extension(ret, cl_khr_fp64_ext_id);
-#endif
-      cl_intel_platform_get_default_extension(ret);
-      cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
-      break;
-
-    case PCI_CHIP_SKYLAKE_ULT_GT2:
-      DECL_INFO_STRING(skl_gt2_break, intel_skl_gt2_device, name, "Intel(R) HD Graphics Skylake ULT GT2");
-    case PCI_CHIP_SKYLAKE_ULT_GT2F:
-      DECL_INFO_STRING(skl_gt2_break, intel_skl_gt2_device, name, "Intel(R) HD Graphics Skylake ULT GT2F");
-    case PCI_CHIP_SKYLAKE_ULX_GT2:
-      DECL_INFO_STRING(skl_gt2_break, intel_skl_gt2_device, name, "Intel(R) HD Graphics Skylake ULX GT2");
-    case PCI_CHIP_SKYLAKE_DT_GT2:
-      DECL_INFO_STRING(skl_gt2_break, intel_skl_gt2_device, name, "Intel(R) HD Graphics Skylake Desktop GT2");
-    case PCI_CHIP_SKYLAKE_HALO_GT2:
-      DECL_INFO_STRING(skl_gt2_break, intel_skl_gt2_device, name, "Intel(R) HD Graphics Skylake Halo GT2");
-    case PCI_CHIP_SKYLAKE_SRV_GT2:
-      DECL_INFO_STRING(skl_gt2_break, intel_skl_gt2_device, name, "Intel(R) HD Graphics Skylake Server GT2");
-    case PCI_CHIP_SKYLAKE_WKS_GT2:
-      DECL_INFO_STRING(skl_gt2_break, intel_skl_gt2_device, name, "Intel(R) HD Graphics Skylake Workstation GT2");
-skl_gt2_break:
-      intel_skl_gt2_device.device_id = device_id;
-      intel_skl_gt2_device.platform = cl_get_platform_default();
-      ret = &intel_skl_gt2_device;
-#ifdef ENABLE_FP64
-      cl_intel_platform_enable_extension(ret, cl_khr_fp64_ext_id);
-#endif
-      cl_intel_platform_get_default_extension(ret);
-      cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
-      break;
-
-    case PCI_CHIP_SKYLAKE_ULT_GT3:
-      DECL_INFO_STRING(skl_gt3_break, intel_skl_gt3_device, name, "Intel(R) HD Graphics Skylake ULT GT3");
-    case PCI_CHIP_SKYLAKE_HALO_GT3:
-      DECL_INFO_STRING(skl_gt3_break, intel_skl_gt3_device, name, "Intel(R) HD Graphics Skylake Halo GT3");
-    case PCI_CHIP_SKYLAKE_SRV_GT3:
-      DECL_INFO_STRING(skl_gt3_break, intel_skl_gt3_device, name, "Intel(R) HD Graphics Skylake Server GT3");
-    case PCI_CHIP_SKYLAKE_MEDIA_SRV_GT3:
-      DECL_INFO_STRING(skl_gt3_break, intel_skl_gt3_device, name, "Intel(R) HD Graphics Skylake Media Server GT3");
-skl_gt3_break:
-      intel_skl_gt3_device.device_id = device_id;
-      intel_skl_gt3_device.platform = cl_get_platform_default();
-      ret = &intel_skl_gt3_device;
-      cl_intel_platform_get_default_extension(ret);
-#ifdef ENABLE_FP64
-      cl_intel_platform_enable_extension(ret, cl_khr_fp64_ext_id);
-#endif
-      cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
-      break;
-
-    case PCI_CHIP_SKYLAKE_DT_GT4:
-      DECL_INFO_STRING(skl_gt4_break, intel_skl_gt4_device, name, "Intel(R) HD Graphics Skylake Desktop GT4");
-    case PCI_CHIP_SKYLAKE_HALO_GT4:
-      DECL_INFO_STRING(skl_gt4_break, intel_skl_gt4_device, name, "Intel(R) HD Graphics Skylake Halo GT4");
-    case PCI_CHIP_SKYLAKE_SRV_GT4:
-      DECL_INFO_STRING(skl_gt4_break, intel_skl_gt4_device, name, "Intel(R) HD Graphics Skylake Server GT4");
-    case PCI_CHIP_SKYLAKE_WKS_GT4:
-      DECL_INFO_STRING(skl_gt4_break, intel_skl_gt4_device, name, "Intel(R) HD Graphics Skylake Workstation GT4");
-skl_gt4_break:
-      intel_skl_gt4_device.device_id = device_id;
-      intel_skl_gt4_device.platform = cl_get_platform_default();
-      ret = &intel_skl_gt4_device;
-#ifdef ENABLE_FP64
-      cl_intel_platform_enable_extension(ret, cl_khr_fp64_ext_id);
-#endif
-      cl_intel_platform_get_default_extension(ret);
-      cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
-      break;
-
-    case PCI_CHIP_BROXTON_0:
-      DECL_INFO_STRING(bxt18eu_break, intel_bxt18eu_device, name, "Intel(R) HD Graphics Broxton 0");
-    case PCI_CHIP_BROXTON_2:
-      DECL_INFO_STRING(bxt18eu_break, intel_bxt18eu_device, name, "Intel(R) HD Graphics Broxton 2");
-bxt18eu_break:
-      intel_bxt18eu_device.device_id = device_id;
-      intel_bxt18eu_device.platform = cl_get_platform_default();
-      ret = &intel_bxt18eu_device;
-      cl_intel_platform_get_default_extension(ret);
-      cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
-      break;
-
-    case PCI_CHIP_BROXTON_1:
-      DECL_INFO_STRING(bxt12eu_break, intel_bxt12eu_device, name, "Intel(R) HD Graphics Broxton 1");
-    case PCI_CHIP_BROXTON_3:
-      DECL_INFO_STRING(bxt12eu_break, intel_bxt12eu_device, name, "Intel(R) HD Graphics Broxton 3");
-bxt12eu_break:
-      intel_bxt12eu_device.device_id = device_id;
-      intel_bxt12eu_device.platform = cl_get_platform_default();
-      ret = &intel_bxt12eu_device;
-      cl_intel_platform_get_default_extension(ret);
-      cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
-      break;
-
-    case PCI_CHIP_KABYLAKE_ULT_GT1:
-      DECL_INFO_STRING(kbl_gt1_break, intel_kbl_gt1_device, name, "Intel(R) HD Graphics Kabylake ULT GT1");
-    case PCI_CHIP_KABYLAKE_DT_GT1:
-      DECL_INFO_STRING(kbl_gt1_break, intel_kbl_gt1_device, name, "Intel(R) HD Graphics Kabylake Desktop GT1");
-    case PCI_CHIP_KABYLAKE_HALO_GT1:
-      DECL_INFO_STRING(kbl_gt1_break, intel_kbl_gt1_device, name, "Intel(R) HD Graphics Kabylake Halo GT1");
-    case PCI_CHIP_KABYLAKE_ULX_GT1:
-      DECL_INFO_STRING(kbl_gt1_break, intel_kbl_gt1_device, name, "Intel(R) HD Graphics Kabylake ULX GT1");
-    case PCI_CHIP_KABYLAKE_SRV_GT1:
-      DECL_INFO_STRING(kbl_gt1_break, intel_kbl_gt1_device, name, "Intel(R) HD Graphics Kabylake Server GT1");
-kbl_gt1_break:
-      intel_kbl_gt1_device.device_id = device_id;
-      intel_kbl_gt1_device.platform = cl_get_platform_default();
-      ret = &intel_kbl_gt1_device;
-#ifdef ENABLE_FP64
-      cl_intel_platform_enable_extension(ret, cl_khr_fp64_ext_id);
-#endif
-      cl_intel_platform_get_default_extension(ret);
-      cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
-      break;
-
-    case PCI_CHIP_KABYLAKE_ULT_GT15:
-      DECL_INFO_STRING(kbl_gt15_break, intel_kbl_gt15_device, name, "Intel(R) HD Graphics Kabylake ULT GT1.5");
-    case PCI_CHIP_KABYLAKE_DT_GT15:
-      DECL_INFO_STRING(kbl_gt15_break, intel_kbl_gt15_device, name, "Intel(R) HD Graphics Kabylake Desktop GT1.5");
-    case PCI_CHIP_KABYLAKE_HALO_GT15:
-      DECL_INFO_STRING(kbl_gt15_break, intel_kbl_gt15_device, name, "Intel(R) HD Graphics Kabylake Halo GT1.5");
-    case PCI_CHIP_KABYLAKE_ULX_GT15:
-      DECL_INFO_STRING(kbl_gt15_break, intel_kbl_gt15_device, name, "Intel(R) HD Graphics Kabylake ULX GT1.5");
-kbl_gt15_break:
-      intel_kbl_gt15_device.device_id = device_id;
-      intel_kbl_gt15_device.platform = cl_get_platform_default();
-      ret = &intel_kbl_gt15_device;
-#ifdef ENABLE_FP64
-      cl_intel_platform_enable_extension(ret, cl_khr_fp64_ext_id);
-#endif
-      cl_intel_platform_get_default_extension(ret);
-      cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
-      break;
-
-    case PCI_CHIP_KABYLAKE_ULT_GT2:
-    case PCI_CHIP_KABYLAKE_ULT_GT2_1:
-      DECL_INFO_STRING(kbl_gt2_break, intel_kbl_gt2_device, name, "Intel(R) HD Graphics Kabylake ULT GT2");
-    case PCI_CHIP_KABYLAKE_DT_GT2:
-      DECL_INFO_STRING(kbl_gt2_break, intel_kbl_gt2_device, name, "Intel(R) HD Graphics Kabylake Desktop GT2");
-    case PCI_CHIP_KABYLAKE_HALO_GT2:
-      DECL_INFO_STRING(kbl_gt2_break, intel_kbl_gt2_device, name, "Intel(R) HD Graphics Kabylake Halo GT2");
-    case PCI_CHIP_KABYLAKE_ULX_GT2:
-      DECL_INFO_STRING(kbl_gt2_break, intel_kbl_gt2_device, name, "Intel(R) HD Graphics Kabylake ULX GT2");
-    case PCI_CHIP_KABYLAKE_SRV_GT2:
-      DECL_INFO_STRING(kbl_gt2_break, intel_kbl_gt2_device, name, "Intel(R) HD Graphics Kabylake Server GT2");
-    case PCI_CHIP_KABYLAKE_WKS_GT2:
-      DECL_INFO_STRING(kbl_gt2_break, intel_kbl_gt2_device, name, "Intel(R) HD Graphics Kabylake Workstation GT2");
-kbl_gt2_break:
-      intel_kbl_gt2_device.device_id = device_id;
-      intel_kbl_gt2_device.platform = cl_get_platform_default();
-      ret = &intel_kbl_gt2_device;
-#ifdef ENABLE_FP64
-      cl_intel_platform_enable_extension(ret, cl_khr_fp64_ext_id);
-#endif
-      cl_intel_platform_get_default_extension(ret);
-      cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
-      break;
-
-    case PCI_CHIP_KABYLAKE_ULT_GT3:
-    case PCI_CHIP_KABYLAKE_ULT_GT3_1:
-    case PCI_CHIP_KABYLAKE_ULT_GT3_2:
-      DECL_INFO_STRING(kbl_gt3_break, intel_kbl_gt3_device, name, "Intel(R) HD Graphics Kabylake ULT GT3");
-kbl_gt3_break:
-      intel_kbl_gt3_device.device_id = device_id;
-      intel_kbl_gt3_device.platform = cl_get_platform_default();
-      ret = &intel_kbl_gt3_device;
-#ifdef ENABLE_FP64
-      cl_intel_platform_enable_extension(ret, cl_khr_fp64_ext_id);
-#endif
-      cl_intel_platform_get_default_extension(ret);
-      cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
-      break;
-
-    case PCI_CHIP_KABYLAKE_HALO_GT4:
-      DECL_INFO_STRING(kbl_gt4_break, intel_kbl_gt4_device, name, "Intel(R) HD Graphics Kabylake ULT GT4");
-kbl_gt4_break:
-      intel_kbl_gt4_device.device_id = device_id;
-      intel_kbl_gt4_device.platform = cl_get_platform_default();
-      ret = &intel_kbl_gt4_device;
-#ifdef ENABLE_FP64
-      cl_intel_platform_enable_extension(ret, cl_khr_fp64_ext_id);
-#endif
-      cl_intel_platform_get_default_extension(ret);
-      cl_intel_platform_enable_extension(ret, cl_khr_fp16_ext_id);
-      break;
-
-    case PCI_CHIP_SANDYBRIDGE_BRIDGE:
-    case PCI_CHIP_SANDYBRIDGE_GT1:
-    case PCI_CHIP_SANDYBRIDGE_GT2:
-    case PCI_CHIP_SANDYBRIDGE_GT2_PLUS:
-    case PCI_CHIP_SANDYBRIDGE_BRIDGE_M:
-    case PCI_CHIP_SANDYBRIDGE_M_GT1:
-    case PCI_CHIP_SANDYBRIDGE_M_GT2:
-    case PCI_CHIP_SANDYBRIDGE_M_GT2_PLUS:
-    case PCI_CHIP_SANDYBRIDGE_BRIDGE_S:
-    case PCI_CHIP_SANDYBRIDGE_S_GT:
-      // Intel(R) HD Graphics SandyBridge not supported yet
-      ret = NULL;
-      break;
-    default:
-      printf("cl_get_gt_device(): error, unknown device: %x\n", device_id);
-  }
-
-  if (ret == NULL)
-    return NULL;
-
-  CL_OBJECT_INIT_BASE(ret, CL_OBJECT_DEVICE_MAGIC);
-  if (!CompilerSupported()) {
-    ret->compiler_available = CL_FALSE;
-    //ret->linker_available = CL_FALSE;
-    ret->profile = "EMBEDDED_PROFILE";
-    ret->profile_sz = strlen(ret->profile) + 1;
-  }
-
-  /* Apply any driver-dependent updates to the device info */
-  cl_driver_update_device_info(ret);
-
-  #define toMB(size) (size)&(UINT64_MAX<<20)
-  /* Get the global_mem_size and max_mem_alloc size from
-   * driver, system ram and hardware*/
-  struct sysinfo info;
-  if (sysinfo(&info) == 0) {
-    uint64_t totalgpumem = ret->global_mem_size;
-	uint64_t maxallocmem = ret->max_mem_alloc_size;
-    uint64_t totalram = info.totalram * info.mem_unit;
-	/* In case to keep system stable we just use half
-	 * of the raw as global mem */
-    ret->global_mem_size = toMB((totalram / 2 > totalgpumem) ?
-                            totalgpumem: totalram / 2);
-	/* The hardware has some limit about the alloc size
-	 * and the excution of kernel need some global mem
-	 * so we now make sure single mem does not use much
-	 * than 3/4 global mem*/
-    ret->max_mem_alloc_size = toMB((ret->global_mem_size * 3 / 4 > maxallocmem) ?
-                              maxallocmem: ret->global_mem_size * 3 / 4);
-  }
-
-  return ret;
-}
-
-/* Runs a small kernel to check that the device works; returns
- * SELF_TEST_PASS: for success.
- * SELF_TEST_SLM_FAIL: for SLM results mismatch;
- * SELF_TEST_ATOMIC_FAIL: for hsw enqueue  kernel failure to not enable atomics in L3.
- * SELF_TEST_OTHER_FAIL: other fail like runtime API fail.*/
-LOCAL cl_self_test_res
-cl_self_test(cl_device_id device, cl_self_test_res atomic_in_l3_flag)
-{
-  cl_int status;
-  cl_context ctx;
-  cl_command_queue queue;
-  cl_program program;
-  cl_kernel kernel;
-  cl_mem buffer;
-  cl_event kernel_finished;
-  size_t n = 3;
-  cl_int test_data[3] = {3, 7, 5};
-  const char* kernel_source = "__kernel void self_test(__global int *buf) {"
-  "  __local int tmp[3];"
-  "  tmp[get_local_id(0)] = buf[get_local_id(0)];"
-  "  barrier(CLK_LOCAL_MEM_FENCE);"
-  "  buf[get_global_id(0)] = tmp[2 - get_local_id(0)] + buf[get_global_id(0)];"
-  "}"; // using __local to catch the "no SLM on Haswell" problem
-  static int tested = 0;
-  static cl_self_test_res ret = SELF_TEST_OTHER_FAIL;
-  if (tested != 0)
-    return ret;
-  tested = 1;
-  ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &status);
-  if(!ctx)
-    return ret;
-  cl_driver_set_atomic_flag(ctx->drv, atomic_in_l3_flag);
-  if (status == CL_SUCCESS) {
-    queue = clCreateCommandQueueWithProperties(ctx, device, 0, &status);
-    if (status == CL_SUCCESS) {
-      program = clCreateProgramWithSource(ctx, 1, &kernel_source, NULL, &status);
-      if (status == CL_SUCCESS) {
-        status = clBuildProgram(program, 1, &device, "", NULL, NULL);
-        if (status == CL_SUCCESS) {
-          kernel = clCreateKernel(program, "self_test", &status);
-          if (status == CL_SUCCESS) {
-            buffer = clCreateBuffer(ctx, CL_MEM_COPY_HOST_PTR, n*4, test_data, &status);
-            if (status == CL_SUCCESS) {
-              status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffer);
-              if (status == CL_SUCCESS) {
-                status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &n, &n, 0, NULL, &kernel_finished);
-                if (status == CL_SUCCESS) {
-                  status = clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, n*4, test_data, 1, &kernel_finished, NULL);
-                  if (status == CL_SUCCESS) {
-                    if (test_data[0] == 8 && test_data[1] == 14 && test_data[2] == 8){
-                      ret = SELF_TEST_PASS;
-                    } else {
-                      ret = SELF_TEST_SLM_FAIL;
-                      printf("Beignet: self-test failed: (3, 7, 5) + (5, 7, 3) returned (%i, %i, %i)\n"
-                             "See README.md or http://www.freedesktop.org/wiki/Software/Beignet/\n",
-                             test_data[0], test_data[1], test_data[2]);
-
-                    }
-                  }
-                } else{
-                  ret = SELF_TEST_ATOMIC_FAIL;
-                  // Atomic fail need to test SLM again with atomic in L3 feature disabled.
-                  tested = 0;
-                }
-                clReleaseEvent(kernel_finished);
-              }
-            }
-            clReleaseMemObject(buffer);
-          }
-          clReleaseKernel(kernel);
-        }
-      }
-      clReleaseProgram(program);
-    }
-    clReleaseCommandQueue(queue);
-  }
-  clReleaseContext(ctx);
-  return ret;
-}
-
-LOCAL cl_int
-cl_get_device_ids(cl_platform_id    platform,
-                  cl_device_type    device_type,
-                  cl_uint           num_entries,
-                  cl_device_id *    devices,
-                  cl_uint *         num_devices)
-{
-  cl_device_id device;
+  if (device_type & (CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_DEFAULT))
+    device = cl_device_get_id_gen(platform);
 
   /* Do we have a usable device? */
-  device = cl_get_gt_device(device_type);
-  if (device) {
-    cl_self_test_res ret = cl_self_test(device, SELF_TEST_PASS);
-    if (ret == SELF_TEST_ATOMIC_FAIL) {
-      device->atomic_test_result = ret;
-      ret = cl_self_test(device, ret);
-      printf("Beignet: warning - disable atomic in L3 feature.\n");
-    }
-
-    if(ret == SELF_TEST_SLM_FAIL) {
-      int disable_self_test = 0;
-      // can't use BVAR (backend/src/sys/cvar.hpp) here as it's C++
-      const char *env = getenv("OCL_IGNORE_SELF_TEST");
-      if (env != NULL) {
-        sscanf(env, "%i", &disable_self_test);
-      }
-      if (disable_self_test) {
-        printf("Beignet: Warning - overriding self-test failure\n");
-      } else {
-        printf("Beignet: disabling non-working device\n");
-        device = 0;
-      }
-    }
-  }
-  if (!device) {
-    if (num_devices)
-      *num_devices = 0;
-    if (devices)
-      *devices = 0;
+  if (device == NULL)
     return CL_DEVICE_NOT_FOUND;
-  } else {
-    if (num_devices)
-      *num_devices = 1;
-    if (devices) {
-      *devices = device;
-    }
-    return CL_SUCCESS;
-  }
-}
 
-LOCAL cl_bool is_gen_device(cl_device_id device) {
-  return device == &intel_ivb_gt1_device ||
-         device == &intel_ivb_gt2_device ||
-         device == &intel_baytrail_t_device ||
-         device == &intel_hsw_gt1_device ||
-         device == &intel_hsw_gt2_device ||
-         device == &intel_hsw_gt3_device ||
-         device == &intel_brw_gt1_device ||
-         device == &intel_brw_gt2_device ||
-         device == &intel_brw_gt3_device ||
-         device == &intel_chv_device ||
-         device == &intel_skl_gt1_device ||
-         device == &intel_skl_gt2_device ||
-         device == &intel_skl_gt3_device ||
-         device == &intel_skl_gt4_device ||
-         device == &intel_bxt18eu_device ||
-         device == &intel_bxt12eu_device ||
-         device == &intel_kbl_gt1_device ||
-         device == &intel_kbl_gt15_device ||
-         device == &intel_kbl_gt2_device ||
-         device == &intel_kbl_gt3_device ||
-         device == &intel_kbl_gt4_device;
+  if (devices)
+    devices[0] = device;
+  if (num_devices)
+    *num_devices = 1;
+  return CL_SUCCESS;
 }
 
 LOCAL cl_int
-cl_get_device_info(cl_device_id     device,
-                   cl_device_info   param_name,
-                   size_t           param_value_size,
-                   void *           param_value,
-                   size_t *         param_value_size_ret)
+cl_device_get_info(cl_device_id device, cl_device_info param_name, size_t param_value_size,
+                   void *param_value, size_t *param_value_size_ret)
 {
   const void *src_ptr = NULL;
   size_t src_size = 0;
   cl_int dev_ref;
 
-  // We now just support gen devices.
-  if (UNLIKELY(is_gen_device(device) == CL_FALSE))
-    return CL_INVALID_DEVICE;
-
   /* Find the correct parameter */
   switch (param_name) {
-    case CL_DEVICE_TYPE:
-      src_ptr = &device->device_type;
-      src_size = sizeof(device->device_type);
-      break;
-    case CL_DEVICE_VENDOR_ID:
-      src_ptr = &device->vendor_id;
-      src_size = sizeof(device->vendor_id);
-      break;
-    case CL_DEVICE_MAX_COMPUTE_UNITS:
-      src_ptr = &device->max_compute_unit;
-      src_size = sizeof(device->max_compute_unit);
-      break;
-    case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:
-      src_ptr = &device->max_work_item_dimensions;
-      src_size = sizeof(device->max_work_item_dimensions);
-      break;
-    case CL_DEVICE_MAX_WORK_ITEM_SIZES:
-      src_ptr = &device->max_work_item_sizes;
-      src_size = sizeof(device->max_work_item_sizes);
-      break;
-    case CL_DEVICE_MAX_WORK_GROUP_SIZE:
-      src_ptr = &device->max_work_group_size;
-      src_size = sizeof(device->max_work_group_size);
-      break;
-    case CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR:
-      src_ptr = &device->preferred_vector_width_char;
-      src_size = sizeof(device->preferred_vector_width_char);
-      break;
-    case CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT:
-      src_ptr = &device->preferred_vector_width_short;
-      src_size = sizeof(device->preferred_vector_width_short);
-      break;
-    case CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT:
-      src_ptr = &device->preferred_vector_width_int;
-      src_size = sizeof(device->preferred_vector_width_int);
-      break;
-    case CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG:
-      src_ptr = &device->preferred_vector_width_long;
-      src_size = sizeof(device->preferred_vector_width_long);
-      break;
-    case CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT:
-      src_ptr = &device->preferred_vector_width_float;
-      src_size = sizeof(device->preferred_vector_width_float);
-      break;
-    case CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE:
-      src_ptr = &device->preferred_vector_width_double;
-      src_size = sizeof(device->preferred_vector_width_double);
-      break;
-    case CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF:
-      src_ptr = &device->preferred_vector_width_half;
-      src_size = sizeof(device->preferred_vector_width_half);
-      break;
-    case CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR:
-      src_ptr = &device->native_vector_width_char;
-      src_size = sizeof(device->native_vector_width_char);
-      break;
-    case CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT:
-      src_ptr = &device->native_vector_width_short;
-      src_size = sizeof(device->native_vector_width_short);
-      break;
-    case CL_DEVICE_NATIVE_VECTOR_WIDTH_INT:
-      src_ptr = &device->native_vector_width_int;
-      src_size = sizeof(device->native_vector_width_int);
-      break;
-    case CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG:
-      src_ptr = &device->native_vector_width_long;
-      src_size = sizeof(device->native_vector_width_long);
-      break;
-    case CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT:
-      src_ptr = &device->native_vector_width_float;
-      src_size = sizeof(device->native_vector_width_float);
-      break;
-    case CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE:
-      src_ptr = &device->native_vector_width_double;
-      src_size = sizeof(device->native_vector_width_double);
-      break;
-    case CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF:
-      src_ptr = &device->native_vector_width_half;
-      src_size = sizeof(device->native_vector_width_half);
-      break;
-    case CL_DEVICE_MAX_CLOCK_FREQUENCY:
-      src_ptr = &device->max_clock_frequency;
-      src_size = sizeof(device->max_clock_frequency);
-      break;
-    case CL_DEVICE_ADDRESS_BITS:
-      src_ptr = &device->address_bits;
-      src_size = sizeof(device->address_bits);
-      break;
-    case CL_DEVICE_MAX_MEM_ALLOC_SIZE:
-      src_ptr = &device->max_mem_alloc_size;
-      src_size = sizeof(device->max_mem_alloc_size);
-      break;
-    case CL_DEVICE_IMAGE_SUPPORT:
-      src_ptr = &device->image_support;
-      src_size = sizeof(device->image_support);
-      break;
-    case CL_DEVICE_MAX_READ_IMAGE_ARGS:
-      src_ptr = &device->max_read_image_args;
-      src_size = sizeof(device->max_read_image_args);
-      break;
-    case CL_DEVICE_MAX_WRITE_IMAGE_ARGS:
-      src_ptr = &device->max_write_image_args;
-      src_size = sizeof(device->max_write_image_args);
-      break;
-    case CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS:
-      src_ptr = &device->max_read_write_image_args;
-      src_size = sizeof(device->max_read_write_image_args);
-      break;
-    case CL_DEVICE_IMAGE_MAX_ARRAY_SIZE:
-      src_ptr = &device->image_max_array_size;
-      src_size = sizeof(device->image_max_array_size);
-      break;
-    case CL_DEVICE_IMAGE2D_MAX_WIDTH:
-      src_ptr = &device->image2d_max_width;
-      src_size = sizeof(device->image2d_max_width);
-      break;
-    case CL_DEVICE_IMAGE2D_MAX_HEIGHT:
-      src_ptr = &device->image2d_max_height;
-      src_size = sizeof(device->image2d_max_height);
-      break;
-    case CL_DEVICE_IMAGE3D_MAX_WIDTH:
-      src_ptr = &device->image3d_max_width;
-      src_size = sizeof(device->image3d_max_width);
-      break;
-    case CL_DEVICE_IMAGE3D_MAX_HEIGHT:
-      src_ptr = &device->image3d_max_height;
-      src_size = sizeof(device->image3d_max_height);
-      break;
-    case CL_DEVICE_IMAGE3D_MAX_DEPTH:
-      src_ptr = &device->image3d_max_depth;
-      src_size = sizeof(device->image3d_max_depth);
-      break;
-    case CL_DEVICE_MAX_SAMPLERS:
-      src_ptr = &device->max_samplers;
-      src_size = sizeof(device->max_samplers);
-      break;
-    case CL_DEVICE_MAX_PARAMETER_SIZE:
-      src_ptr = &device->max_parameter_size;
-      src_size = sizeof(device->max_parameter_size);
-      break;
-    case CL_DEVICE_MEM_BASE_ADDR_ALIGN:
-      src_ptr = &device->mem_base_addr_align;
-      src_size = sizeof(device->mem_base_addr_align);
-      break;
-    case CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE:
-      src_ptr = &device->min_data_type_align_size;
-      src_size = sizeof(device->min_data_type_align_size);
-      break;
-    case CL_DEVICE_MAX_PIPE_ARGS:
-      src_ptr = &device->max_pipe_args;
-      src_size = sizeof(device->max_pipe_args);
-      break;
-    case CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS:
-      src_ptr = &device->pipe_max_active_reservations;
-      src_size = sizeof(device->pipe_max_active_reservations);
-      break;
-    case CL_DEVICE_PIPE_MAX_PACKET_SIZE:
-      src_ptr = &device->pipe_max_packet_siz;
-      src_size = sizeof(device->pipe_max_packet_siz);
-      break;
-    case CL_DEVICE_SINGLE_FP_CONFIG:
-      src_ptr = &device->single_fp_config;
-      src_size = sizeof(device->single_fp_config);
-      break;
-    case CL_DEVICE_HALF_FP_CONFIG:
-      src_ptr = &device->half_fp_config;
-      src_size = sizeof(device->half_fp_config);
-      break;
-    case CL_DEVICE_DOUBLE_FP_CONFIG:
-      src_ptr = &device->double_fp_config;
-      src_size = sizeof(device->double_fp_config);
-      break;
-    case CL_DEVICE_GLOBAL_MEM_CACHE_TYPE:
-      src_ptr = &device->global_mem_cache_type;
-      src_size = sizeof(device->global_mem_cache_type);
-      break;
-    case CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE:
-      src_ptr = &device->global_mem_cache_line_size;
-      src_size = sizeof(device->global_mem_cache_line_size);
-      break;
-    case CL_DEVICE_GLOBAL_MEM_CACHE_SIZE:
-      src_ptr = &device->global_mem_cache_size;
-      src_size = sizeof(device->global_mem_cache_size);
-      break;
-    case CL_DEVICE_GLOBAL_MEM_SIZE:
-      src_ptr = &device->global_mem_size;
-      src_size = sizeof(device->global_mem_size);
-      break;
-    case CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:
-      src_ptr = &device->max_constant_buffer_size;
-      src_size = sizeof(device->max_constant_buffer_size);
-      break;
-    case CL_DEVICE_IMAGE_MAX_BUFFER_SIZE:
-      src_ptr = &device->image_mem_size;
-      src_size = sizeof(device->image_mem_size);
-      break;
-    case CL_DEVICE_MAX_CONSTANT_ARGS:
-      src_ptr = &device->max_constant_args;
-      src_size = sizeof(device->max_constant_args);
-      break;
-    case CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE:
-      src_ptr = &device->max_global_variable_size;
-      src_size = sizeof(device->max_global_variable_size);
-      break;
-    case CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE:
-      src_ptr = &device->global_variable_preferred_total_size;
-      src_size = sizeof(device->global_variable_preferred_total_size);
-      break;
-    case CL_DEVICE_LOCAL_MEM_TYPE:
-      src_ptr = &device->local_mem_type;
-      src_size = sizeof(device->local_mem_type);
-      break;
-    case CL_DEVICE_LOCAL_MEM_SIZE:
-      src_ptr = &device->local_mem_size;
-      src_size = sizeof(device->local_mem_size);
-      break;
-    case CL_DEVICE_ERROR_CORRECTION_SUPPORT:
-      src_ptr = &device->error_correction_support;
-      src_size = sizeof(device->error_correction_support);
-      break;
-    case CL_DEVICE_HOST_UNIFIED_MEMORY:
-      src_ptr = &device->host_unified_memory;
-      src_size = sizeof(device->host_unified_memory);
-      break;
-    case CL_DEVICE_PROFILING_TIMER_RESOLUTION:
-      src_ptr = &device->profiling_timer_resolution;
-      src_size = sizeof(device->profiling_timer_resolution);
-      break;
-    case CL_DEVICE_ENDIAN_LITTLE:
-      src_ptr = &device->endian_little;
-      src_size = sizeof(device->endian_little);
-      break;
-    case CL_DEVICE_AVAILABLE:
-      src_ptr = &device->available;
-      src_size = sizeof(device->available);
-      break;
-    case CL_DEVICE_COMPILER_AVAILABLE:
-      src_ptr = &device->compiler_available;
-      src_size = sizeof(device->compiler_available);
-      break;
-    case CL_DEVICE_LINKER_AVAILABLE:
-      src_ptr = &device->linker_available;
-      src_size = sizeof(device->linker_available);
-      break;
-    case CL_DEVICE_EXECUTION_CAPABILITIES:
-      src_ptr = &device->execution_capabilities;
-      src_size = sizeof(device->execution_capabilities);
-      break;
-    case CL_DEVICE_QUEUE_PROPERTIES:
-      src_ptr = &device->queue_properties;
-      src_size = sizeof(device->queue_properties);
-      break;
-    case CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES:
-      src_ptr = &device->queue_on_device_properties;
-      src_size = sizeof(device->queue_on_device_properties);
-      break;
-    case CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE:
-      src_ptr = &device->queue_on_device_preferred_size;
-      src_size = sizeof(device->queue_on_device_preferred_size);
-      break;
-    case CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE:
-      src_ptr = &device->queue_on_device_max_size;
-      src_size = sizeof(device->queue_on_device_max_size);
-      break;
-    case CL_DEVICE_MAX_ON_DEVICE_QUEUES:
-      src_ptr = &device->max_on_device_queues;
-      src_size = sizeof(device->max_on_device_queues);
-      break;
-    case CL_DEVICE_MAX_ON_DEVICE_EVENTS:
-      src_ptr = &device->max_on_device_events;
-      src_size = sizeof(device->max_on_device_events);
-      break;
-    case CL_DEVICE_PLATFORM:
-      src_ptr = &device->platform;
-      src_size = sizeof(device->platform);
-      break;
-    case CL_DEVICE_PRINTF_BUFFER_SIZE:
-      src_ptr = &device->printf_buffer_size;
-      src_size = sizeof(device->printf_buffer_size);
-      break;
-    case CL_DEVICE_PREFERRED_INTEROP_USER_SYNC:
-      src_ptr = &device->interop_user_sync;
-      src_size = sizeof(device->interop_user_sync);
-      break;
-    case CL_DEVICE_NAME:
-      src_ptr = device->name;
-      src_size = device->name_sz;
-      break;
-    case CL_DEVICE_VENDOR:
-      src_ptr = device->vendor;
-      src_size = device->vendor_sz;
-      break;
-    case CL_DEVICE_VERSION:
-      src_ptr = device->version;
-      src_size = device->version_sz;
-      break;
-    case CL_DEVICE_PROFILE:
-      src_ptr = device->profile;
-      src_size = device->profile_sz;
-      break;
-    case CL_DEVICE_OPENCL_C_VERSION:
-      src_ptr = device->opencl_c_version;
-      src_size = device->opencl_c_version_sz;
-      break;
-    case CL_DEVICE_SPIR_VERSIONS:
-      src_ptr = device->spir_versions;
-      src_size = device->spir_versions_sz;
-      break;
-    case CL_DEVICE_EXTENSIONS:
-      src_ptr = device->extensions;
-      src_size = device->extensions_sz;
-      break;
-    case CL_DEVICE_BUILT_IN_KERNELS:
-      src_ptr = device->built_in_kernels;
-      src_size = device->built_in_kernels_sz;
-      break;
-    case CL_DEVICE_PARENT_DEVICE:
-      src_ptr = &device->parent_device;
-      src_size = sizeof(device->parent_device);
-      break;
-    case CL_DEVICE_PARTITION_MAX_SUB_DEVICES:
-      src_ptr = &device->partition_max_sub_device;
-      src_size = sizeof(device->partition_max_sub_device);
-      break;
-    case CL_DEVICE_PARTITION_PROPERTIES:
-      src_ptr = &device->partition_property;
-      src_size = sizeof(device->partition_property);
-      break;
-    case CL_DEVICE_PARTITION_AFFINITY_DOMAIN:
-      src_ptr = &device->affinity_domain;
-      src_size = sizeof(device->affinity_domain);
-      break;
-    case CL_DEVICE_PARTITION_TYPE:
-      src_ptr = &device->partition_type;
-      src_size = sizeof(device->partition_type);
-      break;
-    case CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT:
-      src_ptr = &device->preferred_platform_atomic_alignment;
-      src_size = sizeof(device->preferred_platform_atomic_alignment);
-      break;
-    case CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT:
-      src_ptr = &device->preferred_global_atomic_alignment;
-      src_size = sizeof(device->preferred_global_atomic_alignment);
-      break;
-    case CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT:
-      src_ptr = &device->preferred_local_atomic_alignment;
-      src_size = sizeof(device->preferred_local_atomic_alignment);
-      break;
-    case CL_DEVICE_IMAGE_PITCH_ALIGNMENT:
-      src_ptr = &device->image_pitch_alignment;
-      src_size = sizeof(device->image_pitch_alignment);
-      break;
-    case CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT:
-      src_ptr = &device->image_base_address_alignment;
-      src_size = sizeof(device->image_base_address_alignment);
-      break;
-    case CL_DEVICE_SVM_CAPABILITIES:
-      src_ptr = &device->svm_capabilities;
-      src_size = sizeof(device->svm_capabilities);
-      break;
-    case CL_DEVICE_REFERENCE_COUNT:
-      {
-        dev_ref = CL_OBJECT_GET_REF(device);
-        src_ptr = &dev_ref;
-        src_size = sizeof(cl_int);
-        break;
-      }
-    case CL_DRIVER_VERSION:
-      src_ptr = device->driver_version;
-      src_size = device->driver_version_sz;
-      break;
-
-    default:
-      return CL_INVALID_VALUE;
+  case CL_DEVICE_TYPE:
+    src_ptr = &device->device_type;
+    src_size = sizeof(device->device_type);
+    break;
+  case CL_DEVICE_VENDOR_ID:
+    src_ptr = &device->vendor_id;
+    src_size = sizeof(device->vendor_id);
+    break;
+  case CL_DEVICE_MAX_COMPUTE_UNITS:
+    src_ptr = &device->max_compute_unit;
+    src_size = sizeof(device->max_compute_unit);
+    break;
+  case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:
+    src_ptr = &device->max_work_item_dimensions;
+    src_size = sizeof(device->max_work_item_dimensions);
+    break;
+  case CL_DEVICE_MAX_WORK_ITEM_SIZES:
+    src_ptr = &device->max_work_item_sizes;
+    src_size = sizeof(device->max_work_item_sizes);
+    break;
+  case CL_DEVICE_MAX_WORK_GROUP_SIZE:
+    src_ptr = &device->max_work_group_size;
+    src_size = sizeof(device->max_work_group_size);
+    break;
+  case CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR:
+    src_ptr = &device->preferred_vector_width_char;
+    src_size = sizeof(device->preferred_vector_width_char);
+    break;
+  case CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT:
+    src_ptr = &device->preferred_vector_width_short;
+    src_size = sizeof(device->preferred_vector_width_short);
+    break;
+  case CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT:
+    src_ptr = &device->preferred_vector_width_int;
+    src_size = sizeof(device->preferred_vector_width_int);
+    break;
+  case CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG:
+    src_ptr = &device->preferred_vector_width_long;
+    src_size = sizeof(device->preferred_vector_width_long);
+    break;
+  case CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT:
+    src_ptr = &device->preferred_vector_width_float;
+    src_size = sizeof(device->preferred_vector_width_float);
+    break;
+  case CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE:
+    src_ptr = &device->preferred_vector_width_double;
+    src_size = sizeof(device->preferred_vector_width_double);
+    break;
+  case CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF:
+    src_ptr = &device->preferred_vector_width_half;
+    src_size = sizeof(device->preferred_vector_width_half);
+    break;
+  case CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR:
+    src_ptr = &device->native_vector_width_char;
+    src_size = sizeof(device->native_vector_width_char);
+    break;
+  case CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT:
+    src_ptr = &device->native_vector_width_short;
+    src_size = sizeof(device->native_vector_width_short);
+    break;
+  case CL_DEVICE_NATIVE_VECTOR_WIDTH_INT:
+    src_ptr = &device->native_vector_width_int;
+    src_size = sizeof(device->native_vector_width_int);
+    break;
+  case CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG:
+    src_ptr = &device->native_vector_width_long;
+    src_size = sizeof(device->native_vector_width_long);
+    break;
+  case CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT:
+    src_ptr = &device->native_vector_width_float;
+    src_size = sizeof(device->native_vector_width_float);
+    break;
+  case CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE:
+    src_ptr = &device->native_vector_width_double;
+    src_size = sizeof(device->native_vector_width_double);
+    break;
+  case CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF:
+    src_ptr = &device->native_vector_width_half;
+    src_size = sizeof(device->native_vector_width_half);
+    break;
+  case CL_DEVICE_MAX_CLOCK_FREQUENCY:
+    src_ptr = &device->max_clock_frequency;
+    src_size = sizeof(device->max_clock_frequency);
+    break;
+  case CL_DEVICE_ADDRESS_BITS:
+    src_ptr = &device->address_bits;
+    src_size = sizeof(device->address_bits);
+    break;
+  case CL_DEVICE_MAX_MEM_ALLOC_SIZE:
+    src_ptr = &device->max_mem_alloc_size;
+    src_size = sizeof(device->max_mem_alloc_size);
+    break;
+  case CL_DEVICE_IMAGE_SUPPORT:
+    src_ptr = &device->image_support;
+    src_size = sizeof(device->image_support);
+    break;
+  case CL_DEVICE_MAX_READ_IMAGE_ARGS:
+    src_ptr = &device->max_read_image_args;
+    src_size = sizeof(device->max_read_image_args);
+    break;
+  case CL_DEVICE_MAX_WRITE_IMAGE_ARGS:
+    src_ptr = &device->max_write_image_args;
+    src_size = sizeof(device->max_write_image_args);
+    break;
+  case CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS:
+    src_ptr = &device->max_read_write_image_args;
+    src_size = sizeof(device->max_read_write_image_args);
+    break;
+  case CL_DEVICE_IMAGE_MAX_ARRAY_SIZE:
+    src_ptr = &device->image_max_array_size;
+    src_size = sizeof(device->image_max_array_size);
+    break;
+  case CL_DEVICE_IMAGE2D_MAX_WIDTH:
+    src_ptr = &device->image2d_max_width;
+    src_size = sizeof(device->image2d_max_width);
+    break;
+  case CL_DEVICE_IMAGE2D_MAX_HEIGHT:
+    src_ptr = &device->image2d_max_height;
+    src_size = sizeof(device->image2d_max_height);
+    break;
+  case CL_DEVICE_IMAGE3D_MAX_WIDTH:
+    src_ptr = &device->image3d_max_width;
+    src_size = sizeof(device->image3d_max_width);
+    break;
+  case CL_DEVICE_IMAGE3D_MAX_HEIGHT:
+    src_ptr = &device->image3d_max_height;
+    src_size = sizeof(device->image3d_max_height);
+    break;
+  case CL_DEVICE_IMAGE3D_MAX_DEPTH:
+    src_ptr = &device->image3d_max_depth;
+    src_size = sizeof(device->image3d_max_depth);
+    break;
+  case CL_DEVICE_MAX_SAMPLERS:
+    src_ptr = &device->max_samplers;
+    src_size = sizeof(device->max_samplers);
+    break;
+  case CL_DEVICE_MAX_PARAMETER_SIZE:
+    src_ptr = &device->max_parameter_size;
+    src_size = sizeof(device->max_parameter_size);
+    break;
+  case CL_DEVICE_MEM_BASE_ADDR_ALIGN:
+    src_ptr = &device->mem_base_addr_align;
+    src_size = sizeof(device->mem_base_addr_align);
+    break;
+  case CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE:
+    src_ptr = &device->min_data_type_align_size;
+    src_size = sizeof(device->min_data_type_align_size);
+    break;
+  case CL_DEVICE_MAX_PIPE_ARGS:
+    src_ptr = &device->max_pipe_args;
+    src_size = sizeof(device->max_pipe_args);
+    break;
+  case CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS:
+    src_ptr = &device->pipe_max_active_reservations;
+    src_size = sizeof(device->pipe_max_active_reservations);
+    break;
+  case CL_DEVICE_PIPE_MAX_PACKET_SIZE:
+    src_ptr = &device->pipe_max_packet_siz;
+    src_size = sizeof(device->pipe_max_packet_siz);
+    break;
+  case CL_DEVICE_SINGLE_FP_CONFIG:
+    src_ptr = &device->single_fp_config;
+    src_size = sizeof(device->single_fp_config);
+    break;
+  case CL_DEVICE_HALF_FP_CONFIG:
+    src_ptr = &device->half_fp_config;
+    src_size = sizeof(device->half_fp_config);
+    break;
+  case CL_DEVICE_DOUBLE_FP_CONFIG:
+    src_ptr = &device->double_fp_config;
+    src_size = sizeof(device->double_fp_config);
+    break;
+  case CL_DEVICE_GLOBAL_MEM_CACHE_TYPE:
+    src_ptr = &device->global_mem_cache_type;
+    src_size = sizeof(device->global_mem_cache_type);
+    break;
+  case CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE:
+    src_ptr = &device->global_mem_cache_line_size;
+    src_size = sizeof(device->global_mem_cache_line_size);
+    break;
+  case CL_DEVICE_GLOBAL_MEM_CACHE_SIZE:
+    src_ptr = &device->global_mem_cache_size;
+    src_size = sizeof(device->global_mem_cache_size);
+    break;
+  case CL_DEVICE_GLOBAL_MEM_SIZE:
+    src_ptr = &device->global_mem_size;
+    src_size = sizeof(device->global_mem_size);
+    break;
+  case CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:
+    src_ptr = &device->max_constant_buffer_size;
+    src_size = sizeof(device->max_constant_buffer_size);
+    break;
+  case CL_DEVICE_IMAGE_MAX_BUFFER_SIZE:
+    src_ptr = &device->image_mem_size;
+    src_size = sizeof(device->image_mem_size);
+    break;
+  case CL_DEVICE_MAX_CONSTANT_ARGS:
+    src_ptr = &device->max_constant_args;
+    src_size = sizeof(device->max_constant_args);
+    break;
+  case CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE:
+    src_ptr = &device->max_global_variable_size;
+    src_size = sizeof(device->max_global_variable_size);
+    break;
+  case CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE:
+    src_ptr = &device->global_variable_preferred_total_size;
+    src_size = sizeof(device->global_variable_preferred_total_size);
+    break;
+  case CL_DEVICE_LOCAL_MEM_TYPE:
+    src_ptr = &device->local_mem_type;
+    src_size = sizeof(device->local_mem_type);
+    break;
+  case CL_DEVICE_LOCAL_MEM_SIZE:
+    src_ptr = &device->local_mem_size;
+    src_size = sizeof(device->local_mem_size);
+    break;
+  case CL_DEVICE_ERROR_CORRECTION_SUPPORT:
+    src_ptr = &device->error_correction_support;
+    src_size = sizeof(device->error_correction_support);
+    break;
+  case CL_DEVICE_HOST_UNIFIED_MEMORY:
+    src_ptr = &device->host_unified_memory;
+    src_size = sizeof(device->host_unified_memory);
+    break;
+  case CL_DEVICE_PROFILING_TIMER_RESOLUTION:
+    src_ptr = &device->profiling_timer_resolution;
+    src_size = sizeof(device->profiling_timer_resolution);
+    break;
+  case CL_DEVICE_ENDIAN_LITTLE:
+    src_ptr = &device->endian_little;
+    src_size = sizeof(device->endian_little);
+    break;
+  case CL_DEVICE_AVAILABLE:
+    src_ptr = &device->available;
+    src_size = sizeof(device->available);
+    break;
+  case CL_DEVICE_COMPILER_AVAILABLE:
+    src_ptr = &device->compiler_available;
+    src_size = sizeof(device->compiler_available);
+    break;
+  case CL_DEVICE_LINKER_AVAILABLE:
+    src_ptr = &device->linker_available;
+    src_size = sizeof(device->linker_available);
+    break;
+  case CL_DEVICE_EXECUTION_CAPABILITIES:
+    src_ptr = &device->execution_capabilities;
+    src_size = sizeof(device->execution_capabilities);
+    break;
+  case CL_DEVICE_QUEUE_PROPERTIES:
+    src_ptr = &device->queue_properties;
+    src_size = sizeof(device->queue_properties);
+    break;
+  case CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES:
+    src_ptr = &device->queue_on_device_properties;
+    src_size = sizeof(device->queue_on_device_properties);
+    break;
+  case CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE:
+    src_ptr = &device->queue_on_device_preferred_size;
+    src_size = sizeof(device->queue_on_device_preferred_size);
+    break;
+  case CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE:
+    src_ptr = &device->queue_on_device_max_size;
+    src_size = sizeof(device->queue_on_device_max_size);
+    break;
+  case CL_DEVICE_MAX_ON_DEVICE_QUEUES:
+    src_ptr = &device->max_on_device_queues;
+    src_size = sizeof(device->max_on_device_queues);
+    break;
+  case CL_DEVICE_MAX_ON_DEVICE_EVENTS:
+    src_ptr = &device->max_on_device_events;
+    src_size = sizeof(device->max_on_device_events);
+    break;
+  case CL_DEVICE_PLATFORM:
+    src_ptr = &device->platform;
+    src_size = sizeof(device->platform);
+    break;
+  case CL_DEVICE_PRINTF_BUFFER_SIZE:
+    src_ptr = &device->printf_buffer_size;
+    src_size = sizeof(device->printf_buffer_size);
+    break;
+  case CL_DEVICE_PREFERRED_INTEROP_USER_SYNC:
+    src_ptr = &device->interop_user_sync;
+    src_size = sizeof(device->interop_user_sync);
+    break;
+  case CL_DEVICE_NAME:
+    src_ptr = device->name;
+    src_size = device->name_sz;
+    break;
+  case CL_DEVICE_VENDOR:
+    src_ptr = device->vendor;
+    src_size = device->vendor_sz;
+    break;
+  case CL_DEVICE_VERSION:
+    src_ptr = device->version;
+    src_size = device->version_sz;
+    break;
+  case CL_DEVICE_PROFILE:
+    src_ptr = device->profile;
+    src_size = device->profile_sz;
+    break;
+  case CL_DEVICE_OPENCL_C_VERSION:
+    src_ptr = device->opencl_c_version;
+    src_size = device->opencl_c_version_sz;
+    break;
+  case CL_DEVICE_SPIR_VERSIONS:
+    src_ptr = device->spir_versions;
+    src_size = device->spir_versions_sz;
+    break;
+  case CL_DEVICE_EXTENSIONS:
+    src_ptr = device->extensions;
+    src_size = device->extensions_sz;
+    break;
+  case CL_DEVICE_BUILT_IN_KERNELS:
+    src_ptr = device->built_in_kernels;
+    if (src_ptr)
+      src_size = strlen(device->built_in_kernels) + 1;
+    else
+      src_size = 0;
+    break;
+  case CL_DEVICE_PARENT_DEVICE:
+    src_ptr = &device->parent_device;
+    src_size = sizeof(device->parent_device);
+    break;
+  case CL_DEVICE_PARTITION_MAX_SUB_DEVICES:
+    src_ptr = &device->partition_max_sub_device;
+    src_size = sizeof(device->partition_max_sub_device);
+    break;
+  case CL_DEVICE_PARTITION_PROPERTIES:
+    src_ptr = &device->partition_property;
+    src_size = sizeof(device->partition_property);
+    break;
+  case CL_DEVICE_PARTITION_AFFINITY_DOMAIN:
+    src_ptr = &device->affinity_domain;
+    src_size = sizeof(device->affinity_domain);
+    break;
+  case CL_DEVICE_PARTITION_TYPE:
+    src_ptr = &device->partition_type;
+    src_size = sizeof(device->partition_type);
+    break;
+  case CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT:
+    src_ptr = &device->preferred_platform_atomic_alignment;
+    src_size = sizeof(device->preferred_platform_atomic_alignment);
+    break;
+  case CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT:
+    src_ptr = &device->preferred_global_atomic_alignment;
+    src_size = sizeof(device->preferred_global_atomic_alignment);
+    break;
+  case CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT:
+    src_ptr = &device->preferred_local_atomic_alignment;
+    src_size = sizeof(device->preferred_local_atomic_alignment);
+    break;
+  case CL_DEVICE_IMAGE_PITCH_ALIGNMENT:
+    src_ptr = &device->image_pitch_alignment;
+    src_size = sizeof(device->image_pitch_alignment);
+    break;
+  case CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT:
+    src_ptr = &device->image_base_address_alignment;
+    src_size = sizeof(device->image_base_address_alignment);
+    break;
+  case CL_DEVICE_SVM_CAPABILITIES:
+    src_ptr = &device->svm_capabilities;
+    src_size = sizeof(device->svm_capabilities);
+    break;
+  case CL_DEVICE_REFERENCE_COUNT: {
+    dev_ref = CL_OBJECT_GET_REF(device);
+    src_ptr = &dev_ref;
+    src_size = sizeof(cl_int);
+    break;
   }
+  case CL_DRIVER_VERSION:
+    src_ptr = device->driver_version;
+    src_size = device->driver_version_sz;
+    break;
 
-  return cl_get_info_helper(src_ptr, src_size,
-                            param_value, param_value_size, param_value_size_ret);
-}
-
-LOCAL cl_int
-cl_device_get_version(cl_device_id device, cl_int *ver)
-{
-  if (UNLIKELY(is_gen_device(device) == CL_FALSE))
-    return CL_INVALID_DEVICE;
-  if (ver == NULL)
-    return CL_SUCCESS;
-  if (device == &intel_ivb_gt1_device || 
-      device == &intel_ivb_gt2_device ||
-      device == &intel_baytrail_t_device) {
-    *ver = 7;
-  } else if (device == &intel_hsw_gt1_device || device == &intel_hsw_gt2_device
-        || device == &intel_hsw_gt3_device) {
-    *ver = 75;
-  } else if (device == &intel_brw_gt1_device || device == &intel_brw_gt2_device
-        || device == &intel_brw_gt3_device || device == &intel_chv_device) {
-    *ver = 8;
-  } else if (device == &intel_skl_gt1_device || device == &intel_skl_gt2_device
-        || device == &intel_skl_gt3_device || device == &intel_skl_gt4_device
-        || device == &intel_bxt18eu_device || device == &intel_bxt12eu_device || device == &intel_kbl_gt1_device
-        || device == &intel_kbl_gt2_device || device == &intel_kbl_gt3_device
-        || device == &intel_kbl_gt4_device || device == &intel_kbl_gt15_device) {
-    *ver = 9;
-  } else
+  default:
     return CL_INVALID_VALUE;
-
-  return CL_SUCCESS;
-}
-#undef DECL_FIELD
-
-#define _DECL_FIELD(FIELD)                                 \
-      if (param_value && param_value_size < sizeof(FIELD)) \
-        return CL_INVALID_VALUE;                           \
-      if (param_value_size_ret != NULL)                    \
-        *param_value_size_ret = sizeof(FIELD);             \
-      if (param_value)                                     \
-        memcpy(param_value, &FIELD, sizeof(FIELD));        \
-        return CL_SUCCESS;
-
-#define DECL_FIELD(CASE,FIELD)                             \
-  case JOIN(CL_KERNEL_,CASE):                              \
-  _DECL_FIELD(FIELD)
-
-#include "cl_kernel.h"
-#include "cl_program.h"
-static int
-cl_check_builtin_kernel_dimension(cl_kernel kernel, cl_device_id device)
-{
-  const char * n = cl_kernel_get_name(kernel);
-  const char * builtin_kernels_2d = "__cl_copy_image_2d_to_2d;__cl_copy_image_2d_to_buffer;__cl_copy_buffer_to_image_2d;__cl_fill_image_2d;__cl_fill_image_2d_array;";
-  const char * builtin_kernels_3d = "__cl_copy_image_3d_to_2d;__cl_copy_image_2d_to_3d;__cl_copy_image_3d_to_3d;__cl_copy_image_3d_to_buffer;__cl_copy_buffer_to_image_3d;__cl_fill_image_3d";
-    if (n == NULL || !strstr(device->built_in_kernels, n)){
-      return 0;
-    }else if(strstr(builtin_kernels_2d, n)){
-      return 2;
-    }else if(strstr(builtin_kernels_3d, n)){
-      return 3;
-    }else
-      return 1;
-
-}
-
-LOCAL size_t
-cl_get_kernel_max_wg_sz(cl_kernel kernel)
-{
-  size_t work_group_size, thread_cnt;
-  int simd_width = interp_kernel_get_simd_width(kernel->opaque);
-  int device_id = kernel->program->ctx->devices[0]->device_id;
-  if (!interp_kernel_use_slm(kernel->opaque)) {
-    if (!IS_BAYTRAIL_T(device_id) || simd_width == 16)
-      work_group_size = simd_width * 64;
-    else
-      work_group_size = kernel->program->ctx->devices[0]->max_compute_unit *
-                        kernel->program->ctx->devices[0]->max_thread_per_unit * simd_width;
-  } else {
-    thread_cnt = kernel->program->ctx->devices[0]->max_compute_unit *
-                 kernel->program->ctx->devices[0]->max_thread_per_unit / kernel->program->ctx->devices[0]->sub_slice_count;
-    if(thread_cnt > 64)
-      thread_cnt = 64;
-    work_group_size = thread_cnt * simd_width;
   }
-  if(work_group_size > kernel->program->ctx->devices[0]->max_work_group_size)
-    work_group_size = kernel->program->ctx->devices[0]->max_work_group_size;
-  return work_group_size;
-}
-
-LOCAL cl_int
-cl_get_kernel_workgroup_info(cl_kernel kernel,
-                             cl_device_id device,
-                             cl_kernel_work_group_info param_name,
-                             size_t param_value_size,
-                             void* param_value,
-                             size_t* param_value_size_ret)
-{
-  int err = CL_SUCCESS;
-  int dimension = 0;
-  CHECK_KERNEL(kernel);
-  if (device == NULL)
-    device = kernel->program->ctx->devices[0];
-  if (UNLIKELY(is_gen_device(device) == CL_FALSE))
-    return CL_INVALID_DEVICE;
 
-  switch (param_name) {
-    case CL_KERNEL_WORK_GROUP_SIZE:
-    {
-      if (param_value && param_value_size < sizeof(size_t))
-        return CL_INVALID_VALUE;
-      if (param_value_size_ret != NULL)
-        *param_value_size_ret = sizeof(size_t);
-      if (param_value) {
-        size_t work_group_size = cl_get_kernel_max_wg_sz(kernel);
-        *(size_t*)param_value = work_group_size;
-        return CL_SUCCESS;
-      }
-    }
-    case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE:
-    {
-      if (param_value && param_value_size < sizeof(size_t))
-        return CL_INVALID_VALUE;
-      if (param_value_size_ret != NULL)
-        *param_value_size_ret = sizeof(size_t);
-      if (param_value)
-        *(size_t*)param_value = interp_kernel_get_simd_width(kernel->opaque);
-      return CL_SUCCESS;
-    }
-    case CL_KERNEL_LOCAL_MEM_SIZE:
-    {
-      size_t local_mem_sz =  interp_kernel_get_slm_size(kernel->opaque) + kernel->local_mem_sz;
-      _DECL_FIELD(local_mem_sz)
-    }
-    DECL_FIELD(COMPILE_WORK_GROUP_SIZE, kernel->compile_wg_sz)
-    DECL_FIELD(PRIVATE_MEM_SIZE, kernel->stack_size)
-    case CL_KERNEL_GLOBAL_WORK_SIZE:
-      dimension = cl_check_builtin_kernel_dimension(kernel, device);
-      if ( !dimension ) return CL_INVALID_VALUE;
-      if (param_value_size_ret != NULL)
-        *param_value_size_ret = sizeof(device->max_1d_global_work_sizes);
-      if (param_value) {
-        if (dimension == 1) {
-          memcpy(param_value, device->max_1d_global_work_sizes, sizeof(device->max_1d_global_work_sizes));
-        }else if(dimension == 2){
-          memcpy(param_value, device->max_2d_global_work_sizes, sizeof(device->max_2d_global_work_sizes));
-        }else if(dimension == 3){
-          memcpy(param_value, device->max_3d_global_work_sizes, sizeof(device->max_3d_global_work_sizes));
-        }else
-          return CL_INVALID_VALUE;
-
-        return CL_SUCCESS;
-      }
-      return CL_SUCCESS;
-    default:
-      return CL_INVALID_VALUE;
-  };
-
-error:
-  return err;
-}
-
-LOCAL cl_int
-cl_get_kernel_subgroup_info(cl_kernel kernel,
-                            cl_device_id device,
-                            cl_kernel_work_group_info param_name,
-                            size_t input_value_size,
-                            const void* input_value,
-                            size_t param_value_size,
-                            void* param_value,
-                            size_t* param_value_size_ret)
-{
-  int err = CL_SUCCESS;
-  if(device != NULL)
-    if (kernel->program->ctx->devices[0] != device)
-      return CL_INVALID_DEVICE;
-
-  CHECK_KERNEL(kernel);
-  switch (param_name) {
-    case CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR:
-    {
-      int i, dim = 0;
-      size_t local_sz = 1;
-      if (param_value && param_value_size < sizeof(size_t))
-        return CL_INVALID_VALUE;
-      if (param_value_size_ret != NULL)
-        *param_value_size_ret = sizeof(size_t);
-      switch (input_value_size)
-      {
-        case sizeof(size_t)*1:
-        case sizeof(size_t)*2:
-        case sizeof(size_t)*3:
-          dim = input_value_size/sizeof(size_t);
-          break;
-        default: return CL_INVALID_VALUE;
-      }
-      if (input_value == NULL )
-        return CL_INVALID_VALUE;
-      for(i = 0; i < dim; i++)
-        local_sz *= ((size_t*)input_value)[i];
-      if (param_value) {
-        size_t simd_sz = cl_kernel_get_simd_width(kernel);
-        size_t sub_group_size = local_sz >= simd_sz? simd_sz : local_sz;
-        *(size_t*)param_value = sub_group_size;
-        return CL_SUCCESS;
-      }
-      break;
-    }
-    case CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR:
-    {
-      int i, dim = 0;
-      size_t local_sz = 1;
-      if (param_value && param_value_size < sizeof(size_t))
-        return CL_INVALID_VALUE;
-      if (param_value_size_ret != NULL)
-        *param_value_size_ret = sizeof(size_t);
-      switch (input_value_size)
-      {
-        case sizeof(size_t)*1:
-        case sizeof(size_t)*2:
-        case sizeof(size_t)*3:
-          dim = input_value_size/sizeof(size_t);
-          break;
-        default: return CL_INVALID_VALUE;
-      }
-      if (input_value == NULL )
-        return CL_INVALID_VALUE;
-      for(i = 0; i < dim; i++)
-        local_sz *= ((size_t*)input_value)[i];
-      if (param_value) {
-        size_t simd_sz = cl_kernel_get_simd_width(kernel);
-        size_t sub_group_num = (local_sz + simd_sz - 1) / simd_sz;
-        *(size_t*)param_value = sub_group_num;
-        return CL_SUCCESS;
-      }
-      break;
-    }
-    default:
-      return CL_INVALID_VALUE;
-  };
-
-error:
-  return err;
+  return cl_get_info_helper(src_ptr, src_size,
+                            param_value, param_value_size, param_value_size_ret);
 }
 
 LOCAL cl_int
@@ -1607,7 +463,7 @@ cl_devices_list_check(cl_uint num_devices, const cl_device_id *devices)
     }
 
     // TODO: We now just support Gen Device.
-    if (devices[i] != cl_get_gt_device(devices[i]->device_type)) {
+    if (devices[i] != cl_device_get_id_gen(devices[i]->platform)) {
       return CL_INVALID_DEVICE;
     }
   }
diff --git a/src/cl_device_id.h b/src/cl_device_id.h
index b3136e7..ca4604c 100644
--- a/src/cl_device_id.h
+++ b/src/cl_device_id.h
@@ -27,80 +27,80 @@
 #include "cl_device_api.h"
 
 /* Store complete information about the device */
-struct _cl_device_id {
+typedef struct _cl_device_id {
   _cl_base_object base;
   cl_device_type device_type;
-  cl_uint  device_id;
-  cl_uint  vendor_id;
-  cl_uint  max_compute_unit;               // maximum EU number
-  cl_uint  max_thread_per_unit;            // maximum EU threads per EU.
-  cl_uint  sub_slice_count;                // Device's sub slice count
-  cl_uint  max_work_item_dimensions;       // should be 3.
-  size_t   max_work_item_sizes[3];         // equal to maximum work group size.
-  size_t   max_work_group_size;            // maximum work group size under simd16 mode.
-  size_t   max_1d_global_work_sizes[3];       // maximum 1d global work size for builtin kernels.
-  size_t   max_2d_global_work_sizes[3];       // maximum 2d global work size for builtin kernels.
-  size_t   max_3d_global_work_sizes[3];       // maximum 3d global work size for builtin kernels.
-  cl_uint  preferred_vector_width_char;
-  cl_uint  preferred_vector_width_short;
-  cl_uint  preferred_vector_width_int;
-  cl_uint  preferred_vector_width_long;
-  cl_uint  preferred_vector_width_float;
-  cl_uint  preferred_vector_width_double;
-  cl_uint  preferred_vector_width_half;
-  cl_uint  native_vector_width_char;
-  cl_uint  native_vector_width_short;
-  cl_uint  native_vector_width_int;
-  cl_uint  native_vector_width_long;
-  cl_uint  native_vector_width_float;
-  cl_uint  native_vector_width_double;
-  cl_uint  native_vector_width_half;
-  cl_uint  max_clock_frequency;
-  cl_uint  address_bits;
-  cl_ulong max_mem_alloc_size;
-  cl_device_svm_capabilities  svm_capabilities;
+  cl_uint device_id;
+  cl_uint vendor_id;
+  cl_uint max_compute_unit;           // maximum EU number
+  cl_uint max_thread_per_unit;        // maximum EU threads per EU.
+  cl_uint sub_slice_count;            // Device's sub slice count
+  cl_uint max_work_item_dimensions;   // should be 3.
+  size_t max_work_item_sizes[3];      // equal to maximum work group size.
+  size_t max_work_group_size;         // maximum work group size under simd16 mode.
+  size_t max_1d_global_work_sizes[3]; // maximum 1d global work size for builtin kernels.
+  size_t max_2d_global_work_sizes[3]; // maximum 2d global work size for builtin kernels.
+  size_t max_3d_global_work_sizes[3]; // maximum 3d global work size for builtin kernels.
+  cl_uint preferred_vector_width_char;
+  cl_uint preferred_vector_width_short;
+  cl_uint preferred_vector_width_int;
+  cl_uint preferred_vector_width_long;
+  cl_uint preferred_vector_width_float;
+  cl_uint preferred_vector_width_double;
+  cl_uint preferred_vector_width_half;
+  cl_uint native_vector_width_char;
+  cl_uint native_vector_width_short;
+  cl_uint native_vector_width_int;
+  cl_uint native_vector_width_long;
+  cl_uint native_vector_width_float;
+  cl_uint native_vector_width_double;
+  cl_uint native_vector_width_half;
+  cl_uint max_clock_frequency;
+  cl_uint address_bits;
+  cl_ulong max_mem_alloc_size;      // reported by CL_DEVICE_MAX_MEM_ALLOC_SIZE, which is a cl_ulong query; size_t would shrink it to 4 bytes on 32-bit hosts.
+  cl_device_svm_capabilities svm_capabilities;
   cl_uint preferred_platform_atomic_alignment;
   cl_uint preferred_global_atomic_alignment;
   cl_uint preferred_local_atomic_alignment;
-  cl_bool  image_support;
-  cl_uint  max_read_image_args;
-  cl_uint  max_write_image_args;
-  cl_uint  max_read_write_image_args;
-  size_t   image2d_max_width;
-  size_t   image_max_array_size;
-  size_t   image2d_max_height;
-  size_t   image3d_max_width;
-  size_t   image3d_max_height;
-  size_t   image3d_max_depth;
-  size_t   image_mem_size;
-  cl_uint  max_samplers;
-  size_t   max_parameter_size;
-  cl_uint  mem_base_addr_align;
-  cl_uint  min_data_type_align_size;
-  cl_uint  max_pipe_args;
-  cl_uint  pipe_max_active_reservations;
-  cl_uint  pipe_max_packet_siz;
+  cl_bool image_support;
+  cl_uint max_read_image_args;
+  cl_uint max_write_image_args;
+  cl_uint max_read_write_image_args;
+  size_t image2d_max_width;
+  size_t image_max_array_size;
+  size_t image2d_max_height;
+  size_t image3d_max_width;
+  size_t image3d_max_height;
+  size_t image3d_max_depth;
+  size_t image_mem_size;
+  cl_uint max_samplers;
+  size_t max_parameter_size;
+  cl_uint mem_base_addr_align;
+  cl_uint min_data_type_align_size;
+  cl_uint max_pipe_args;
+  cl_uint pipe_max_active_reservations;
+  cl_uint pipe_max_packet_siz;
   cl_device_fp_config single_fp_config;
   cl_device_fp_config half_fp_config;
   cl_device_fp_config double_fp_config;
   cl_device_mem_cache_type global_mem_cache_type;
-  cl_uint  global_mem_cache_line_size;
+  cl_uint global_mem_cache_line_size;
   cl_ulong global_mem_cache_size;
   cl_ulong global_mem_size;
   cl_ulong max_constant_buffer_size;
-  cl_uint  max_constant_args;
-  size_t  max_global_variable_size;
-  size_t  global_variable_preferred_total_size;
+  cl_uint max_constant_args;
+  size_t max_global_variable_size;
+  size_t global_variable_preferred_total_size;
   cl_device_local_mem_type local_mem_type;
   cl_ulong local_mem_size;
   cl_ulong scratch_mem_size;
-  cl_bool  error_correction_support;
-  cl_bool  host_unified_memory;
-  size_t   profiling_timer_resolution;
-  cl_bool  endian_little;
-  cl_bool  available;
-  cl_bool  compiler_available;
-  cl_bool  linker_available;
+  cl_bool error_correction_support;
+  cl_bool host_unified_memory;
+  size_t profiling_timer_resolution;
+  cl_bool endian_little;
+  cl_bool available;
+  cl_bool compiler_available;
+  cl_bool linker_available;
   cl_device_exec_capabilities execution_capabilities;
   cl_command_queue_properties queue_properties;
   cl_command_queue_properties queue_on_host_properties;
@@ -120,7 +120,6 @@ struct _cl_device_id {
   const char extensions[EXTENSTION_LENGTH];
   const char *driver_version;
   const char *spir_versions;
-  const char *built_in_kernels;
   size_t name_sz;
   size_t vendor_sz;
   size_t version_sz;
@@ -132,9 +131,9 @@ struct _cl_device_id {
   size_t built_in_kernels_sz;
   /* SubDevice specific info */
   cl_device_id parent_device;
-  cl_uint      partition_max_sub_device;
+  cl_uint partition_max_sub_device;
   cl_device_partition_property partition_property[3];
-  cl_device_affinity_domain    affinity_domain;
+  cl_device_affinity_domain affinity_domain;
   cl_device_partition_property partition_type[3];
   uint32_t atomic_test_result;
   cl_uint image_pitch_alignment;
@@ -142,55 +141,28 @@ struct _cl_device_id {
 
   _cl_device_api api;
   _cl_compiler compiler;
-
-  //inited as NULL, created only when cmrt kernel is used
-  void* cmrt_device;  //realtype: CmDevice*
-};
+  const char *built_in_kernels;
+  const char *built_in_kernels_binary;
+  size_t built_in_kernels_binary_sz;
+} _cl_device_id;
 
 #define CL_OBJECT_DEVICE_MAGIC 0x2acaddcca8853c52LL
-#define CL_OBJECT_IS_DEVICE(obj) ((obj &&                           \
-         ((cl_base_object)obj)->magic == CL_OBJECT_DEVICE_MAGIC &&  \
-         CL_OBJECT_GET_REF(obj) >= 1))
-
-/* Get a device from the given platform */
-extern cl_int cl_get_device_ids(cl_platform_id    platform,
-                                cl_device_type    device_type,
-                                cl_uint           num_entries,
-                                cl_device_id *    devices,
-                                cl_uint *         num_devices);
+#define CL_OBJECT_IS_DEVICE(obj) (((obj) && /* non-NULL, device magic, ref count >= 1 */      \
+                                   ((cl_base_object)(obj))->magic == CL_OBJECT_DEVICE_MAGIC && \
+                                   CL_OBJECT_GET_REF(obj) >= 1)) /* NOTE: evaluates obj more than once; pass side-effect-free expressions. */
 
 /* Get the intel GPU device we currently have in this machine (if any) */
-extern cl_device_id cl_get_gt_device(cl_device_type device_type);
-
+extern cl_device_id cl_device_get_id_gen(cl_platform_id platform);
+extern cl_int cl_device_get_version_gen(cl_device_id device, cl_int *ver);
+extern void cl_device_gen_cleanup(void);
+/* Get a device from the given platform */
+extern cl_int cl_device_get_ids(cl_platform_id platform, cl_device_type device_type,
+                                cl_uint num_entries, cl_device_id *devices, cl_uint *num_devices);
 /* Provide info about the device */
-extern cl_int cl_get_device_info(cl_device_id     device,
-                                 cl_device_info   param_name,
-                                 size_t           param_value_size,
-                                 void *           param_value,
-                                 size_t *         param_value_size_ret);
-
-extern cl_int cl_get_kernel_workgroup_info(cl_kernel kernel,
-                                           cl_device_id     device,
-                                           cl_kernel_work_group_info   param_name,
-                                           size_t           param_value_size,
-                                           void *           param_value,
-                                           size_t *         param_value_size_ret);
-
-extern cl_int cl_get_kernel_subgroup_info(cl_kernel kernel,
-                                          cl_device_id     device,
-                                          cl_kernel_work_group_info   param_name,
-                                          size_t           input_value_size,
-                                          const void *     input_value,
-                                          size_t           param_value_size,
-                                          void *           param_value,
-                                          size_t *         param_value_size_ret);
+extern cl_int cl_device_get_info(cl_device_id device, cl_device_info param_name, size_t param_value_size,
+                                 void *param_value, size_t *param_value_size_ret);
 /* Returns the Gen device ID */
-extern cl_int cl_device_get_version(cl_device_id device, cl_int *ver);
-extern size_t cl_get_kernel_max_wg_sz(cl_kernel);
-
 extern cl_int cl_devices_list_check(cl_uint num_devices, const cl_device_id *devices);
 extern cl_int cl_devices_list_include_check(cl_uint num_devices, const cl_device_id *devices,
-                                        cl_uint num_to_check, const cl_device_id *devices_to_check);
-
+                                            cl_uint num_to_check, const cl_device_id *devices_to_check);
 #endif /* __CL_DEVICE_ID_H__ */
-
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 07c5f7f..8f6b3c3 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -55,12 +55,6 @@ extern cl_driver_get_ver_cb *cl_driver_get_ver;
 typedef void (cl_driver_enlarge_stack_size_cb)(cl_driver, int32_t*);
 extern cl_driver_enlarge_stack_size_cb *cl_driver_enlarge_stack_size;
 
-typedef enum cl_self_test_res{
-  SELF_TEST_PASS = 0,
-  SELF_TEST_SLM_FAIL  = 1,
-  SELF_TEST_ATOMIC_FAIL = 2,
-  SELF_TEST_OTHER_FAIL = 3,
-} cl_self_test_res;
 /* Set the atomic enable/disable flag in the driver */
 typedef void (cl_driver_set_atomic_flag_cb)(cl_driver, int);
 extern cl_driver_set_atomic_flag_cb *cl_driver_set_atomic_flag;
diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
index 933b2b7..15292f2 100644
--- a/src/cl_enqueue.c
+++ b/src/cl_enqueue.c
@@ -24,7 +24,6 @@
 #include "cl_command_queue.h"
 #include "cl_utils.h"
 #include "cl_alloc.h"
-#include "cl_device_enqueue.h"
 #include <stdio.h>
 #include <string.h>
 #include <assert.h>
@@ -439,7 +438,7 @@ cl_enqueue_unmap_mem_object(enqueue_data *data, cl_int status)
   /* shrink the mapped slot. */
   if (memobj->mapped_ptr_sz / 2 > memobj->map_ref) {
     int j = 0;
-    cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc(
+    cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)CL_MALLOC(
       sizeof(cl_mapped_ptr) * (memobj->mapped_ptr_sz / 2));
     if (!new_ptr) {
       /* Just do nothing. */
@@ -455,7 +454,7 @@ cl_enqueue_unmap_mem_object(enqueue_data *data, cl_int status)
       }
     }
     memobj->mapped_ptr_sz = memobj->mapped_ptr_sz / 2;
-    free(memobj->mapped_ptr);
+    CL_FREE(memobj->mapped_ptr);
     memobj->mapped_ptr = new_ptr;
   }
 
@@ -569,16 +568,8 @@ cl_enqueue_ndrange(enqueue_data *data, cl_int status)
 
   if (status == CL_SUBMITTED) {
     err = cl_command_queue_flush_gpgpu(data->gpgpu);
-    //if it is the last ndrange of an cl enqueue api,
-    //check the device enqueue information.
-    if (data->mid_event_of_enq == 0) {
-      assert(data->queue);
-      cl_device_enqueue_parse_result(data->queue, data->gpgpu);
-    }
   } else if (status == CL_COMPLETE) {
-    void *batch_buf = cl_gpgpu_ref_batch_buf(data->gpgpu);
-    cl_gpgpu_sync(batch_buf);
-    cl_gpgpu_unref_batch_buf(batch_buf);
+    err = cl_command_queue_finish_gpgpu(data->gpgpu);
   }
 
   return err;
@@ -605,7 +596,7 @@ cl_enqueue_delete(enqueue_data *data)
       data->type == EnqueueFillBuffer ||
       data->type == EnqueueFillImage) {
     if (data->gpgpu) {
-      cl_gpgpu_delete(data->gpgpu);
+      cl_command_queue_delete_gpgpu(data->gpgpu);
       data->gpgpu = NULL;
     }
     return;
diff --git a/src/cl_enqueue.h b/src/cl_enqueue.h
index 50a54fc..ced5e6e 100644
--- a/src/cl_enqueue.h
+++ b/src/cl_enqueue.h
@@ -77,7 +77,7 @@ typedef struct _enqueue_data {
                                  cl_uint num_svm_pointers,
                                  void *svm_pointers[],
                                  void *user_data);  /* pointer to pfn_free_func of clEnqueueSVMFree */
-  cl_gpgpu gpgpu;
+  void* gpgpu;
   cl_bool mid_event_of_enq;  /* For non-uniform ndrange, one enqueue have a sequence event, the
                                 last event need to parse device enqueue information.
                                 0 : last event; 1: non-last event */
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index 867231d..60fb1dc 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -36,320 +36,145 @@
 #include <stdint.h>
 #include <assert.h>
 
-LOCAL void
-cl_kernel_delete(cl_kernel k)
+static void
+cl_kernel_arg_delete(cl_argument *arg)
 {
-  uint32_t i;
-  if (k == NULL) return;
+  assert(arg);
 
-#ifdef HAS_CMRT
-  if (k->cmrt_kernel != NULL) {
-    cmrt_destroy_kernel(k);
-    CL_OBJECT_DESTROY_BASE(k);
-    CL_FREE(k);
+  if (arg->is_set == CL_FALSE) {
     return;
   }
-#endif
 
-  /* We are not done with the kernel */
-  if (CL_OBJECT_DEC_REF(k) > 1)
+  if (arg->arg_type != ArgTypeValue && arg->arg_type != ArgTypeStruct) {
     return;
-
-  /* Release one reference on all bos we own */
-  if (k->bo)       cl_buffer_unreference(k->bo);
-  /* This will be true for kernels created by clCreateKernel */
-  if (k->ref_its_program) cl_program_delete(k->program);
-  /* Release the curbe if allocated */
-  if (k->curbe) CL_FREE(k->curbe);
-  /* Release the argument array if required */
-  if (k->args) {
-    for (i = 0; i < k->arg_n; ++i)
-      if (k->args[i].mem != NULL)
-        cl_mem_delete(k->args[i].mem);
-    CL_FREE(k->args);
   }
-  if (k->image_sz)
-    CL_FREE(k->images);
-
-  if (k->exec_info)
-    CL_FREE(k->exec_info);
-
-  if (k->device_enqueue_ptr)
-    cl_mem_svm_delete(k->program->ctx, k->device_enqueue_ptr);
-  if (k->device_enqueue_infos)
-    CL_FREE(k->device_enqueue_infos);
-
-  CL_OBJECT_DESTROY_BASE(k);
-
-  CL_FREE(k);
-}
-
-LOCAL cl_kernel
-cl_kernel_new(cl_program p)
-{
-  cl_kernel k = NULL;
-  TRY_ALLOC_NO_ERR (k, CL_CALLOC(1, sizeof(struct _cl_kernel)));
-  CL_OBJECT_INIT_BASE(k, CL_OBJECT_KERNEL_MAGIC);
-  k->program = p;
-  k->cmrt_kernel = NULL;
-
-exit:
-  return k;
-error:
-  cl_kernel_delete(k);
-  k = NULL;
-  goto exit;
-}
-
-LOCAL const char*
-cl_kernel_get_name(cl_kernel k)
-{
-  if (UNLIKELY(k == NULL)) return NULL;
-  return interp_kernel_get_name(k->opaque);
-}
 
-LOCAL const char*
-cl_kernel_get_attributes(cl_kernel k)
-{
-  if (UNLIKELY(k == NULL)) return NULL;
-  return interp_kernel_get_attributes(k->opaque);
-}
+  if (arg->arg_type == ArgTypeValue && arg->arg_size > sizeof(cl_double)) {
+    CL_FREE(arg->val.val_ptr);
+  } else if (arg->arg_type == ArgTypeStruct) {
+    CL_FREE(arg->val.val_ptr);
+  }
 
-LOCAL void
-cl_kernel_add_ref(cl_kernel k)
-{
-  CL_OBJECT_INC_REF(k);
+  arg->is_set = CL_FALSE;
+  return;
 }
 
 LOCAL cl_int
-cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
+cl_kernel_set_arg(cl_kernel kernel, cl_uint index, size_t sz, const void *value)
 {
-  int32_t offset;            /* where to patch */
-  enum gbe_arg_type arg_type; /* kind of argument */
-  size_t arg_sz;              /* size of the argument */
-  cl_mem mem = NULL;          /* for __global, __constant and image arguments */
-  cl_context ctx = k->program->ctx;
+  int i;
+  cl_argument *arg = NULL;
 
-  if (UNLIKELY(index >= k->arg_n))
+  if (index >= kernel->arg_n)
     return CL_INVALID_ARG_INDEX;
-  arg_type = interp_kernel_get_arg_type(k->opaque, index);
-  arg_sz = interp_kernel_get_arg_size(k->opaque, index);
-
-  if (k->vme && index == 0) {
-    //the best method is to return the arg type of GBE_ARG_ACCELERATOR_INTEL
-    //but it is not straightforward since clang does not support it now
-    //the easy way is to consider typedef accelerator_intel_t as a struct,
-    //this easy way makes the size mismatched, so use another size check method.
-    if (sz != sizeof(cl_accelerator_intel) || arg_sz != sizeof(cl_motion_estimation_desc_intel))
-      return CL_INVALID_ARG_SIZE;
-    cl_accelerator_intel* accel = (cl_accelerator_intel*)value;
-    if ((*accel)->type != CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL)
-      return CL_INVALID_ACCELERATOR_TYPE_INTEL;
-  } else {
-    if (UNLIKELY(arg_type != GBE_ARG_LOCAL_PTR && arg_sz != sz)) {
-      if (arg_type != GBE_ARG_SAMPLER ||
-          (arg_type == GBE_ARG_SAMPLER && sz != sizeof(cl_sampler)))
-        return CL_INVALID_ARG_SIZE;
+
+  for (i = 0; i < kernel->arg_n; i++) {
+    if (kernel->args[i].arg_no == index) {
+      arg = &kernel->args[i];
+      break;
     }
   }
+  assert(arg);
 
-  if(UNLIKELY(arg_type == GBE_ARG_LOCAL_PTR && sz == 0))
-    return CL_INVALID_ARG_SIZE;
-  if(arg_type == GBE_ARG_VALUE) {
-    if(UNLIKELY(value == NULL))
-      return CL_INVALID_ARG_VALUE;
-  } else if(arg_type == GBE_ARG_LOCAL_PTR) {
-    if(UNLIKELY(value != NULL))
-      return CL_INVALID_ARG_VALUE;
-  } else if(arg_type == GBE_ARG_SAMPLER) {
-    if (UNLIKELY(value == NULL))
-      return CL_INVALID_ARG_VALUE;
+  if (arg->is_set) {
+    cl_kernel_arg_delete(arg);
+  }
+
+  /* Local mem is special, the size is the local mem's size to be allocated. */
+  if (arg->arg_type == ArgTypePointer && arg->arg_addrspace == AddressSpaceLocal) {
+    if (sz == 0)
+      return CL_INVALID_ARG_SIZE;
 
-    cl_sampler s = *(cl_sampler*)value;
-    if(!CL_OBJECT_IS_SAMPLER(s))
-      return CL_INVALID_SAMPLER;
-  } else {
-    // should be image, GLOBAL_PTR, CONSTANT_PTR
-    if (UNLIKELY(value == NULL && (arg_type == GBE_ARG_IMAGE ||
-            arg_type == GBE_ARG_PIPE)))
+    if (value != NULL)
       return CL_INVALID_ARG_VALUE;
-    if(value != NULL)
-      mem = *(cl_mem*)value;
-    if(arg_type == GBE_ARG_PIPE) {
-      _cl_mem_pipe* pipe= cl_mem_pipe(mem);
-      size_t type_size = (size_t)interp_kernel_get_arg_info(k->opaque, index,5);
-      if(pipe->packet_size != type_size)
-          return CL_INVALID_ARG_VALUE;
-    }
-    if(value != NULL && mem) {
-      if(CL_SUCCESS != cl_mem_is_valid(mem, ctx))
-        return CL_INVALID_MEM_OBJECT;
 
-      if (UNLIKELY((arg_type == GBE_ARG_IMAGE && !IS_IMAGE(mem))
-         || (arg_type != GBE_ARG_IMAGE && IS_IMAGE(mem))))
-          return CL_INVALID_ARG_VALUE;
-    }
+    arg->val_size = sz;
+    arg->is_set = CL_TRUE;
+    return CL_SUCCESS;
   }
 
-  /* Copy the structure or the value directly into the curbe */
-  if (arg_type == GBE_ARG_VALUE) {
-    if (k->vme && index == 0) {
-      cl_accelerator_intel accel;
-      memcpy(&accel, value, sz);
-      offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
-      if (offset >= 0) {
-        assert(offset + sz <= k->curbe_sz);
-        memcpy(k->curbe + offset, &(accel->desc.me), arg_sz);
-      }
-      k->args[index].local_sz = 0;
-      k->args[index].is_set = 1;
-      k->args[index].mem = NULL;
-      k->accel = accel;
-      return CL_SUCCESS;
+  if (sz != arg->arg_size)
+    return CL_INVALID_ARG_SIZE;
+
+  /* For constant and global mem, expect a valid cl_mem; a NULL value or NULL cl_mem is stored as NULL. */
+  if (arg->arg_type == ArgTypePointer) {
+    assert(arg->arg_addrspace != AddressSpaceLocal);
+    if (value == NULL || *((cl_mem *)value) == NULL) {
+      arg->val.val_mem = NULL;
     } else {
-      offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
-      if (offset >= 0) {
-        assert(offset + sz <= k->curbe_sz);
-        memcpy(k->curbe + offset, value, sz);
-      }
-      k->args[index].local_sz = 0;
-      k->args[index].is_set = 1;
-      k->args[index].mem = NULL;
-      return CL_SUCCESS;
+      if (!CL_OBJECT_IS_MEM(*(cl_mem *)value))
+        return CL_INVALID_MEM_OBJECT;
+      arg->val.val_mem = *(cl_mem *)value;
     }
-  }
 
-  /* For a local pointer just save the size */
-  if (arg_type == GBE_ARG_LOCAL_PTR) {
-    k->args[index].local_sz = sz;
-    k->args[index].is_set = 1;
-    k->args[index].mem = NULL;
+    arg->val_size = sizeof(cl_mem);
+    arg->is_set = CL_TRUE;
     return CL_SUCCESS;
   }
 
-  /* Is it a sampler*/
-  if (arg_type == GBE_ARG_SAMPLER) {
-    cl_sampler sampler;
-    memcpy(&sampler, value, sz);
-    k->args[index].local_sz = 0;
-    k->args[index].is_set = 1;
-    k->args[index].mem = NULL;
-    k->args[index].sampler = sampler;
-    cl_set_sampler_arg_slot(k, index, sampler);
-    offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
-    if (offset >= 0) {
-      assert(offset + 4 <= k->curbe_sz);
-      memcpy(k->curbe + offset, &sampler->clkSamplerValue, 4);
-    }
+  /* Image, sampler, value and struct arguments are all read through value, so it must not be NULL. */
+  if (value == NULL)
+    return CL_INVALID_ARG_VALUE;
+  /* For image, we should have a valid cl_mem object. */
+  if (arg->arg_type == ArgTypeImage) {
+    if (!CL_OBJECT_IS_MEM(*(cl_mem *)value))
+      return CL_INVALID_MEM_OBJECT;
+    arg->val.val_mem = *(cl_mem *)value;
+    arg->val_size = sizeof(cl_mem);
+    arg->is_set = CL_TRUE;
     return CL_SUCCESS;
   }
 
-  if(value != NULL)
-    mem = *(cl_mem*) value;
+  /* For sampler, we should have a valid cl_sampler object. */
+  if (arg->arg_type == ArgTypeSampler) {
+    if (!CL_OBJECT_IS_SAMPLER(*(cl_sampler *)value))
+      return CL_INVALID_SAMPLER;
 
-  if(value == NULL || mem == NULL) {
-    /* for buffer object GLOBAL_PTR CONSTANT_PTR, it maybe NULL */
-    int32_t offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
-    if (offset >= 0)
-      *((uint32_t *)(k->curbe + offset)) = 0;
-    assert(arg_type == GBE_ARG_GLOBAL_PTR || arg_type == GBE_ARG_CONSTANT_PTR);
+    arg->val.val_sampler = *(cl_sampler *)value;
+    arg->val_size = sizeof(cl_sampler);
+    arg->is_set = CL_TRUE;
+    return CL_SUCCESS;
+  }
 
-    if (k->args[index].mem)
-      cl_mem_delete(k->args[index].mem);
-    k->args[index].mem = NULL;
-    k->args[index].is_set = 1;
-    k->args[index].local_sz = 0;
+  if (arg->arg_type == ArgTypeValue && arg->arg_size <= sizeof(cl_double)) {
+    memcpy(&arg->val, value, sz);
+    arg->is_set = CL_TRUE;
+    arg->val_size = arg->arg_size;
     return CL_SUCCESS;
   }
 
-  mem = *(cl_mem*) value;
+  arg->val.val_ptr = CL_MALLOC(sz);
+  if (arg->val.val_ptr == NULL)
+    return CL_OUT_OF_HOST_MEMORY;
 
-  cl_mem_add_ref(mem);
-  if (k->args[index].mem)
-    cl_mem_delete(k->args[index].mem);
-  k->args[index].mem = mem;
-  k->args[index].is_set = 1;
-  k->args[index].is_svm = mem->is_svm;
-  if(mem->is_svm)
-    k->args[index].ptr = mem->host_ptr;
-  k->args[index].local_sz = 0;
-  k->args[index].bti = interp_kernel_get_arg_bti(k->opaque, index);
+  memcpy(arg->val.val_ptr, value, sz);
+  arg->val_size = arg->arg_size;
+  arg->is_set = CL_TRUE;
   return CL_SUCCESS;
 }
 
-
-LOCAL cl_int
-cl_kernel_set_arg_svm_pointer(cl_kernel k, cl_uint index, const void *value)
-{
-  enum gbe_arg_type arg_type; /* kind of argument */
-  //size_t arg_sz;              /* size of the argument */
-  cl_context ctx = k->program->ctx;
-  cl_mem mem= cl_context_get_svm_from_ptr(ctx, value);
-
-  if (UNLIKELY(index >= k->arg_n))
-    return CL_INVALID_ARG_INDEX;
-  arg_type = interp_kernel_get_arg_type(k->opaque, index);
-  //arg_sz = interp_kernel_get_arg_size(k->opaque, index);
-
-  if(arg_type != GBE_ARG_GLOBAL_PTR && arg_type != GBE_ARG_CONSTANT_PTR )
-    return CL_INVALID_ARG_VALUE;
-
-  if(mem == NULL)
-    return CL_INVALID_ARG_VALUE;
-
-  cl_mem_add_ref(mem);
-  if (k->args[index].mem)
-    cl_mem_delete(k->args[index].mem);
-
-  k->args[index].ptr = (void *)value;
-  k->args[index].mem = mem;
-  k->args[index].is_set = 1;
-  k->args[index].is_svm = 1;
-  k->args[index].local_sz = 0;
-  k->args[index].bti = interp_kernel_get_arg_bti(k->opaque, index);
-  return 0;
-}
-
-LOCAL cl_int
-cl_kernel_set_exec_info(cl_kernel k, size_t n, const void *value)
-{
-  cl_int err = CL_SUCCESS;
-  assert(k != NULL);
-
-  if (n == 0) return err;
-  TRY_ALLOC(k->exec_info, CL_CALLOC(n, 1));
-  memcpy(k->exec_info, value, n);
-  k->exec_info_n = n / sizeof(void *);
-
-error:
-  return err;
-}
-
 LOCAL int
-cl_get_kernel_arg_info(cl_kernel k, cl_uint arg_index, cl_kernel_arg_info param_name,
-                       size_t param_value_size, void *param_value, size_t *param_value_size_ret)
+cl_kernel_get_argument_info(cl_kernel k, cl_uint arg_index, cl_kernel_arg_info param_name,
+                            size_t param_value_size, void *param_value, size_t *param_value_size_ret)
 {
   assert(k != NULL);
-  void *ret_info = interp_kernel_get_arg_info(k->opaque, arg_index,
-                           param_name - CL_KERNEL_ARG_ADDRESS_QUALIFIER);
-  uint32_t arg_type = interp_kernel_get_arg_type(k->opaque, arg_index);
   int str_len = 0;
-  cl_kernel_arg_type_qualifier type_qual = CL_KERNEL_ARG_TYPE_NONE;
 
   switch (param_name) {
   case CL_KERNEL_ARG_ADDRESS_QUALIFIER:
     if (param_value_size_ret)
       *param_value_size_ret = sizeof(cl_kernel_arg_address_qualifier);
-    if (!param_value) return CL_SUCCESS;
+    if (!param_value)
+      return CL_SUCCESS;
+
     if (param_value_size < sizeof(cl_kernel_arg_address_qualifier))
       return CL_INVALID_VALUE;
-    if ((size_t)ret_info == 0) {
-      *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ADDRESS_PRIVATE;
-    } else if ((size_t)ret_info == 1 || (size_t)ret_info == 4) {
+
+    if (k->args[arg_index].arg_addrspace == AddressSpaceGlobal) {
       *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ADDRESS_GLOBAL;
-    } else if ((size_t)ret_info == 2) {
+    } else if (k->args[arg_index].arg_addrspace == AddressSpaceConstant) {
       *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ADDRESS_CONSTANT;
-    } else if ((size_t)ret_info == 3) {
+    } else if (k->args[arg_index].arg_addrspace == AddressSpaceLocal) {
       *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ADDRESS_LOCAL;
     } else {
       /* If no address qualifier is specified, the default address qualifier
@@ -359,52 +184,61 @@ cl_get_kernel_arg_info(cl_kernel k, cl_uint arg_index, cl_kernel_arg_info param_
     return CL_SUCCESS;
 
   case CL_KERNEL_ARG_ACCESS_QUALIFIER:
+    if (k->args[arg_index].arg_access_qualifier == 0)
+      return CL_KERNEL_ARG_INFO_NOT_AVAILABLE;
     if (param_value_size_ret)
       *param_value_size_ret = sizeof(cl_kernel_arg_access_qualifier);
-    if (!param_value) return CL_SUCCESS;
+    if (!param_value)
+      return CL_SUCCESS;
     if (param_value_size < sizeof(cl_kernel_arg_access_qualifier))
       return CL_INVALID_VALUE;
-    if (!strcmp((char*)ret_info, "write_only")) {
-      *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
-    } else if (!strcmp((char*)ret_info, "read_only")) {
-      *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ACCESS_READ_ONLY;
-    } else if (!strcmp((char*)ret_info, "read_write")) {
-      *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ACCESS_READ_WRITE;
-    } else {
-      *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ACCESS_NONE;
-    }
+    *(cl_kernel_arg_address_qualifier *)param_value = k->args[arg_index].arg_access_qualifier;
     return CL_SUCCESS;
 
   case CL_KERNEL_ARG_TYPE_NAME:
+    if (k->args[arg_index].arg_type_name == NULL)
+      return CL_KERNEL_ARG_INFO_NOT_AVAILABLE;
+    str_len = strlen(k->args[arg_index].arg_type_name);
+    if (param_value_size_ret)
+      *param_value_size_ret = str_len + 1;
+    if (!param_value)
+      return CL_SUCCESS;
+    if (param_value_size < str_len + 1)
+      return CL_INVALID_VALUE;
+
+    memcpy(param_value, k->args[arg_index].arg_type_name, str_len);
+    ((char *)param_value)[str_len] = 0;
+    return CL_SUCCESS;
+
   case CL_KERNEL_ARG_NAME:
-    str_len = strlen(ret_info);
+    if (k->args[arg_index].arg_name == NULL)
+      return CL_KERNEL_ARG_INFO_NOT_AVAILABLE;
+    str_len = strlen(k->args[arg_index].arg_name);
     if (param_value_size_ret)
       *param_value_size_ret = str_len + 1;
-    if (!param_value) return CL_SUCCESS;
+    if (!param_value)
+      return CL_SUCCESS;
     if (param_value_size < str_len + 1)
       return CL_INVALID_VALUE;
-    memcpy(param_value, ret_info, str_len);
+
+    memcpy(param_value, k->args[arg_index].arg_name, str_len);
     ((char *)param_value)[str_len] = 0;
     return CL_SUCCESS;
 
   case CL_KERNEL_ARG_TYPE_QUALIFIER:
+    if ((k->args[arg_index].arg_type_qualifier &
+        (~(CL_KERNEL_ARG_TYPE_NONE | CL_KERNEL_ARG_TYPE_CONST |
+         CL_KERNEL_ARG_TYPE_RESTRICT | CL_KERNEL_ARG_TYPE_VOLATILE |
+         CL_KERNEL_ARG_TYPE_PIPE))) != 0)
+      return CL_KERNEL_ARG_INFO_NOT_AVAILABLE;
     if (param_value_size_ret)
       *param_value_size_ret = sizeof(cl_kernel_arg_type_qualifier);
-    if (!param_value) return CL_SUCCESS;
+    if (!param_value)
+      return CL_SUCCESS;
     if (param_value_size < sizeof(cl_kernel_arg_type_qualifier))
       return CL_INVALID_VALUE;
-    if (strstr((char*)ret_info, "const") &&
-         (arg_type == GBE_ARG_GLOBAL_PTR   ||
-          arg_type == GBE_ARG_CONSTANT_PTR ||
-          arg_type == GBE_ARG_LOCAL_PTR))
-      type_qual = type_qual | CL_KERNEL_ARG_TYPE_CONST;
-    if (strstr((char*)ret_info, "volatile"))
-      type_qual = type_qual | CL_KERNEL_ARG_TYPE_VOLATILE;
-    if (strstr((char*)ret_info, "restrict"))
-      type_qual = type_qual | CL_KERNEL_ARG_TYPE_RESTRICT;
-    if (strstr((char*)ret_info, "pipe"))
-      type_qual = CL_KERNEL_ARG_TYPE_PIPE;
-    *(cl_kernel_arg_type_qualifier *)param_value = type_qual;
+
+    *(cl_kernel_arg_type_qualifier *)param_value = k->args[arg_index].arg_type_qualifier;
     return CL_SUCCESS;
 
   default:
@@ -414,145 +248,320 @@ cl_get_kernel_arg_info(cl_kernel k, cl_uint arg_index, cl_kernel_arg_info param_
   return CL_SUCCESS;
 }
 
-LOCAL uint32_t
-cl_kernel_get_simd_width(cl_kernel k)
+/* Allocate kernel `name` of program p plus one per-device kernel via api.kernel_new; NULL on failure. */
+LOCAL cl_kernel
+cl_kernel_new(cl_program p, const char *name)
 {
-  assert(k != NULL);
-  return interp_kernel_get_simd_width(k->opaque);
+  cl_kernel k = CL_CALLOC(1, sizeof(struct _cl_kernel));
+  cl_int err = CL_SUCCESS;
+  cl_uint i;
+
+  if (k == NULL)
+    return NULL;
+
+  CL_OBJECT_INIT_BASE(k, CL_OBJECT_KERNEL_MAGIC);
+  k->program = p;
+
+  k->name = CL_CALLOC(1, strlen(name) + 1);
+  if (k->name == NULL) {
+    CL_FREE(k);
+    return NULL;
+  }
+  memcpy(k->name, name, strlen(name) + 1);
+
+  k->each_device = CL_CALLOC(p->each_device_num, sizeof(cl_kernel_for_device));
+  if (k->each_device == NULL) {
+    CL_FREE(k->name);
+    CL_FREE(k);
+    return NULL;
+  }
+  k->each_device_num = p->each_device_num;
+
+  for (i = 0; i < k->each_device_num; i++) {
+    k->each_device[i] = (p->each_device[i]->device->api.kernel_new)(p->each_device[i]->device, k);
+    if (k->each_device[i] == NULL) {
+      err = CL_OUT_OF_HOST_MEMORY;
+      break;
+    }
+  }
+  if (err != CL_SUCCESS) {
+    for (i = 0; i < k->each_device_num; i++) {
+      if (k->each_device[i])
+        (p->each_device[i]->device->api.kernel_delete)(p->each_device[i]->device, k);
+    }
+    CL_FREE(k->each_device); /* release the per-device array itself, not only its entries */
+    CL_FREE(k->name);
+    CL_FREE(k);
+    return NULL;
+  }
+
+  /* Add it to program's user kernels list. */
+  cl_program_add_ref(p);
+  CL_OBJECT_LOCK(p);
+  list_add_tail(&p->kernels, &k->base.node);
+  p->ker_n++;
+  CL_OBJECT_UNLOCK(p);
+  return k;
 }
 
 LOCAL void
-cl_kernel_setup(cl_kernel k, gbe_kernel opaque)
+cl_kernel_delete(cl_kernel k)
 {
-  cl_context ctx = k->program->ctx;
-  cl_buffer_mgr bufmgr = cl_context_get_bufmgr(ctx);
-
-  if(k->bo != NULL)
-    cl_buffer_unreference(k->bo);
-
-  /* Allocate the gen code here */
-  const uint32_t code_sz = interp_kernel_get_code_size(opaque);
-  const char *code = interp_kernel_get_code(opaque);
-  k->bo = cl_buffer_alloc(bufmgr, "CL kernel", code_sz, 64u);
-  k->arg_n = interp_kernel_get_arg_num(opaque);
-
-  /* Upload the code */
-  cl_buffer_subdata(k->bo, 0, code_sz, code);
-  k->opaque = opaque;
-
-  const char* kname = cl_kernel_get_name(k);
-  if (kname != NULL &&
-      strncmp(kname, "block_motion_estimate_intel", sizeof("block_motion_estimate_intel")) == 0)
-    k->vme = 1;
-  else
-    k->vme = 0;
-
-  /* Create the curbe */
-  k->curbe_sz = interp_kernel_get_curbe_size(k->opaque);
-
-  /* Get sampler data & size */
-  k->sampler_sz = interp_kernel_get_sampler_size(k->opaque);
-  assert(k->sampler_sz <= GEN_MAX_SAMPLERS);
-  if (k->sampler_sz > 0)
-    interp_kernel_get_sampler_data(k->opaque, k->samplers);
-  interp_kernel_get_compile_wg_size(k->opaque, k->compile_wg_sz);
-  k->stack_size = interp_kernel_get_stack_size(k->opaque);
-  /* Get image data & size */
-  k->image_sz = interp_kernel_get_image_size(k->opaque);
-  assert(k->sampler_sz <= GEN_MAX_SURFACES);
-  assert(k->image_sz <= ctx->devices[0]->max_read_image_args + ctx->devices[0]->max_write_image_args);
-  if (k->image_sz > 0) {
-    TRY_ALLOC_NO_ERR(k->images, CL_CALLOC(k->image_sz, sizeof(k->images[0])));
-    interp_kernel_get_image_data(k->opaque, k->images);
-  } else
-    k->images = NULL;
-  return;
-error:
-  cl_buffer_unreference(k->bo);
-  k->bo = NULL;
+  uint32_t i;
+  if (k == NULL)
+    return;
+
+  /* We are not done with the kernel */
+  if (CL_OBJECT_DEC_REF(k) > 1)
+    return;
+
+  CL_OBJECT_LOCK(k->program);
+  list_node_del(&k->base.node);
+  k->program->ker_n--;
+  CL_OBJECT_UNLOCK(k->program);
+  cl_program_delete(k->program);
+
+  if (k->name)
+    CL_FREE(k->name);
+  k->name = NULL;
+
+  if (k->kernel_attr)
+    CL_FREE(k->kernel_attr);
+  k->kernel_attr = NULL;
+
+  for (i = 0; i < k->each_device_num; i++) {
+    (k->each_device[i]->device->api.kernel_delete)(k->each_device[i]->device, k);
+  }
+  CL_FREE(k->each_device);
+
+  if (k->args) {
+    for (i = 0; i < k->arg_n; i++) {
+      if (k->args[i].arg_name)
+        CL_FREE(k->args[i].arg_name);
+      if (k->args[i].arg_type_name)
+        CL_FREE(k->args[i].arg_type_name);
+      cl_kernel_arg_delete(&k->args[i]);
+    }
+
+    CL_FREE(k->args);
+    k->args = NULL;
+  }
+
+  CL_OBJECT_DESTROY_BASE(k);
+  CL_FREE(k);
+}
+
+LOCAL void
+cl_kernel_add_ref(cl_kernel k)
+{
+  CL_OBJECT_INC_REF(k);
 }
 
 LOCAL cl_kernel
-cl_kernel_dup(cl_kernel from)
+cl_kernel_create(cl_program p, const char *name, cl_int *errcode_ret)
 {
-  cl_kernel to = NULL;
+  cl_kernel kernel = NULL;
+  cl_uint i, j;
+  cl_int err;
+  int someone_created;
+  cl_bool find;
+
+  assert(p->each_device);
+  assert(name);
+
+  if (CL_OBJECT_TAKE_OWNERSHIP(p, CL_FALSE) == CL_FALSE) {
+    *errcode_ret = CL_INVALID_OPERATION;
+    return NULL;
+  }
+
+  if (p->build_status != CL_BUILD_SUCCESS) {
+    *errcode_ret = CL_INVALID_PROGRAM_EXECUTABLE;
+    return NULL;
+  }
+
+  /* Need to find it in at least one device's program */
+  find = CL_FALSE;
+  for (i = 0; i < p->each_device_num; i++) {
+    for (j = 0; j < p->each_device[i]->kernel_num; j++) {
+      if (strcmp(p->each_device[i]->kernel_names[j], name) == 0)
+        find = CL_TRUE;
+    }
+  }
+  if (find == CL_FALSE) {
+    *errcode_ret = CL_INVALID_KERNEL_NAME;
+    CL_OBJECT_RELEASE_OWNERSHIP(p);
+    return NULL;
+  }
+
+  kernel = cl_kernel_new(p, name);
+  CL_OBJECT_RELEASE_OWNERSHIP(p);
+  if (kernel == NULL) {
+    *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+    return NULL;
+  }
+
+  someone_created = 0;
+  for (i = 0; i < p->each_device_num; i++) {
+    err = (p->each_device[i]->device->api.kernel_create)(p->each_device[i]->device, kernel);
+    if (err == CL_INVALID_KERNEL_DEFINITION) { // Conflict kernel define, can not go on
+      *errcode_ret = CL_INVALID_KERNEL_DEFINITION;
+      break;
+    }
+
+    if (err == CL_SUCCESS) { // Once success, this kernel can be created
+      someone_created = 1;
+    }
+  }
 
-  if (UNLIKELY(from == NULL))
+  if (*errcode_ret != CL_SUCCESS) {
+    cl_kernel_delete(kernel);
     return NULL;
-  TRY_ALLOC_NO_ERR (to, CL_CALLOC(1, sizeof(struct _cl_kernel)));
-  CL_OBJECT_INIT_BASE(to, CL_OBJECT_KERNEL_MAGIC);
-  to->bo = from->bo;
-  to->opaque = from->opaque;
-  to->vme = from->vme;
-  to->program = from->program;
-  to->arg_n = from->arg_n;
-  to->curbe_sz = from->curbe_sz;
-  to->sampler_sz = from->sampler_sz;
-  to->image_sz = from->image_sz;
-  to->exec_info_n = from->exec_info_n;
-  memcpy(to->compile_wg_sz, from->compile_wg_sz, sizeof(from->compile_wg_sz));
-  to->stack_size = from->stack_size;
-  if (to->sampler_sz)
-    memcpy(to->samplers, from->samplers, to->sampler_sz * sizeof(uint32_t));
-  if (to->image_sz) {
-    TRY_ALLOC_NO_ERR(to->images, CL_CALLOC(to->image_sz, sizeof(to->images[0])));
-    memcpy(to->images, from->images, to->image_sz * sizeof(to->images[0]));
-  } else
-    to->images = NULL;
-  if (to->exec_info_n) { /* Must always 0 here */
-    TRY_ALLOC_NO_ERR(to->exec_info, CL_CALLOC(to->exec_info_n, sizeof(void *)));
-    memcpy(to->exec_info, from->exec_info, to->exec_info_n * sizeof(void *));
-  }
-  TRY_ALLOC_NO_ERR(to->args, CL_CALLOC(to->arg_n, sizeof(cl_argument)));
-  if (to->curbe_sz) TRY_ALLOC_NO_ERR(to->curbe, CL_CALLOC(1, to->curbe_sz));
-
-  /* Retain the bos */
-  if (from->bo)       cl_buffer_reference(from->bo);
-
-  /* We retain the program destruction since this kernel (user allocated)
-   * depends on the program for some of its pointers
-   */
-  assert(from->program);
-  cl_program_add_ref(from->program);
-  to->ref_its_program = CL_TRUE;
-
-exit:
-  return to;
-error:
-  cl_kernel_delete(to);
-  to = NULL;
-  goto exit;
+  }
+  if (someone_created == 0) {
+    assert(err != CL_SUCCESS);
+    *errcode_ret = err;
+    cl_kernel_delete(kernel);
+    return NULL;
+  }
+
+  *errcode_ret = CL_SUCCESS;
+  return kernel;
 }
 
 LOCAL cl_int
-cl_kernel_work_group_sz(cl_kernel ker,
-                        const size_t *local_wk_sz,
-                        uint32_t wk_dim,
-                        size_t *wk_grp_sz)
+cl_kernel_get_workgroup_info(cl_kernel kernel, cl_device_id device, cl_kernel_work_group_info param_name,
+                             size_t param_value_size, void *param_value, size_t *param_value_size_ret)
 {
+  const void *src_ptr = NULL;
+  size_t src_size = 0;
+  size_t parameter_data = 0;
+  size_t wk_size[3] = {0, 0, 0};
   cl_int err = CL_SUCCESS;
-  size_t sz = 0;
-  cl_uint i;
 
-  for (i = 0; i < wk_dim; ++i) {
-    const uint32_t required_sz = interp_kernel_get_required_work_group_size(ker->opaque, i);
-    if (required_sz != 0 && required_sz != local_wk_sz[i]) {
-      err = CL_INVALID_WORK_ITEM_SIZE;
-      goto error;
-    }
+  if (device == NULL) {
+    assert(kernel->each_device_num == 1);
+    device = kernel->each_device[0]->device;
   }
-  sz = local_wk_sz[0];
-  for (i = 1; i < wk_dim; ++i)
-    sz *= local_wk_sz[i];
 
-  if (sz > cl_get_kernel_max_wg_sz(ker)) {
-    err = CL_INVALID_WORK_ITEM_SIZE;
-    goto error;
+  switch (param_name) {
+  case CL_KERNEL_WORK_GROUP_SIZE:
+  case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE:
+  case CL_KERNEL_LOCAL_MEM_SIZE:
+  case CL_KERNEL_PRIVATE_MEM_SIZE: {
+    err = device->api.get_kernel_info(device, kernel, param_name, &parameter_data);
+    src_ptr = &parameter_data;
+    src_size = sizeof(size_t);
+    break;
   }
+  case CL_KERNEL_COMPILE_WORK_GROUP_SIZE: {
+    src_ptr = kernel->compile_wg_sz;
+    src_size = sizeof(size_t) * 3;
+    break;
+  }
+  case CL_KERNEL_GLOBAL_WORK_SIZE: {
+    err = device->api.get_kernel_info(device, kernel, param_name, wk_size);
+    src_ptr = wk_size;
+    src_size = sizeof(size_t) * 3;
+    break;
+  }
+  default:
+    return CL_INVALID_VALUE;
+  }
+
+  if (err != CL_SUCCESS)
+    return err;
 
-error:
-  if (wk_grp_sz) *wk_grp_sz = sz;
-  return err;
+  return cl_get_info_helper(src_ptr, src_size,
+                            param_value, param_value_size, param_value_size_ret);
 }
 
+/* Validate the local work size handed in through input_value (it must hold
+ * 1, 2 or 3 size_t entries) and store the product of the dimensions in
+ * *local_sz. Returns CL_INVALID_VALUE for a bad size or a NULL pointer. */
+static cl_int
+cl_kernel_subgroup_local_size(size_t input_value_size, const void *input_value,
+                              size_t *local_sz)
+{
+  size_t i, dim;
+
+  switch (input_value_size) {
+  case sizeof(size_t) * 1:
+  case sizeof(size_t) * 2:
+  case sizeof(size_t) * 3:
+    dim = input_value_size / sizeof(size_t);
+    break;
+  default:
+    return CL_INVALID_VALUE;
+  }
+
+  if (input_value == NULL)
+    return CL_INVALID_VALUE;
+
+  *local_sz = 1;
+  for (i = 0; i < dim; i++)
+    *local_sz *= ((const size_t *)input_value)[i];
+  return CL_SUCCESS;
+}
 
+/* Answer CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR and
+ * CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR for the local work size given in
+ * input_value; any other param_name yields CL_INVALID_VALUE. A NULL device
+ * selects the kernel's only device. The result is copied out through
+ * cl_get_info_helper(). */
+LOCAL cl_int
+cl_kernel_get_subgroup_info(cl_kernel kernel, cl_device_id device, cl_kernel_work_group_info param_name,
+                            size_t input_value_size, const void *input_value, size_t param_value_size,
+                            void *param_value, size_t *param_value_size_ret)
+{
+  size_t parameter_data = 0;
+  size_t local_sz = 0;
+  size_t prefer_sz = 0;
+  cl_int err;
+
+  if (param_name != CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR &&
+      param_name != CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR)
+    return CL_INVALID_VALUE;
+
+  err = cl_kernel_subgroup_local_size(input_value_size, input_value, &local_sz);
+  if (err != CL_SUCCESS)
+    return err;
+
+  /* Same convention as cl_kernel_get_workgroup_info(): a NULL device means
+   * the single device this kernel belongs to. */
+  if (device == NULL) {
+    assert(kernel->each_device_num == 1);
+    device = kernel->each_device[0]->device;
+  }
+
+  /* Both queries are derived from the device's maximum sub-group size. */
+  err = device->api.get_kernel_info(device, kernel,
+                                    CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR, &prefer_sz);
+  if (err != CL_SUCCESS)
+    return err;
+
+  if (param_name == CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR) {
+    /* min(local_sz, prefer_sz): never report more than the work-group holds. */
+    parameter_data = local_sz >= prefer_sz ? prefer_sz : local_sz;
+  } else {
+    /* A device reporting a zero sub-group size would make us divide by zero. */
+    if (prefer_sz == 0)
+      return CL_INVALID_OPERATION;
+
+    /* Round up: a partially filled sub-group still counts as one. */
+    parameter_data = (local_sz + prefer_sz - 1) / prefer_sz;
+  }
+
+  return cl_get_info_helper(&parameter_data, sizeof(size_t),
+                            param_value, param_value_size, param_value_size_ret);
+}
+
+LOCAL cl_int
+cl_kernel_set_arg_svm_pointer(cl_kernel k, cl_uint index, const void *value)
+{
+  return CL_INVALID_OPERATION; /* stub: SVM is an OpenCL 2.0 feature this runtime does not support yet */
+}
+LOCAL cl_int
+cl_kernel_set_exec_info(cl_kernel k, size_t n, const void *value)
+{
+  return CL_INVALID_OPERATION; /* stub: exec info is an OpenCL 2.0 (SVM) feature, not supported yet */
+}
diff --git a/src/cl_kernel.h b/src/cl_kernel.h
index 4690c0b..07d8996 100644
--- a/src/cl_kernel.h
+++ b/src/cl_kernel.h
@@ -30,21 +30,46 @@
 #include <stdint.h>
 #include <stdlib.h>
 
-/* This is the kernel as it is interfaced by the compiler */
-struct _gbe_kernel;
+typedef enum cl_address_space_type {
+  AddressSpacePrivate = 0,
+  AddressSpaceGlobal = 1,
+  AddressSpaceConstant = 2,
+  AddressSpaceLocal = 3,
+} cl_address_space_type;
+
+typedef enum cl_arg_type {
+  ArgTypeInvalid = 0,
+  ArgTypeValue, // int, float, double, long, etc
+  ArgTypeStruct,
+  ArgTypePointer,
+  ArgTypeImage,
+  ArgTypeSampler,
+} cl_arg_type;
 
-/* We need to save buffer data for relocation and binding and we must figure out
- * if all arguments are properly set
- */
 typedef struct cl_argument {
-  cl_mem mem;           /* For image and regular buffers */
-  cl_sampler sampler;   /* For sampler. */
-  cl_accelerator_intel accel;
-  unsigned char bti;
-  void *ptr;            /* SVM ptr value. */
-  uint32_t local_sz:30; /* For __local size specification */
-  uint32_t is_set:1;    /* All args must be set before NDRange */
-  uint32_t is_svm:1;    /* Indicate this argument is SVMPointer */
+  cl_arg_type arg_type; /* kind of argument, selects how cl_kernel_set_arg() stores the value */
+  cl_uint arg_no; /* argument index; cl_kernel_set_arg() looks the entry up by this number */
+  cl_uint arg_size; // size in bytes; the size passed when setting must match, except for __local pointers
+  cl_address_space_type arg_addrspace; /* reported for CL_KERNEL_ARG_ADDRESS_QUALIFIER */
+  cl_uint arg_type_qualifier; /* CL_KERNEL_ARG_TYPE_* bits, reported for CL_KERNEL_ARG_TYPE_QUALIFIER */
+  cl_uint arg_access_qualifier; /* reported for CL_KERNEL_ARG_ACCESS_QUALIFIER; 0 means not available */
+  char *arg_name; /* NULL when not available; freed by cl_kernel_delete() */
+  char *arg_type_name; /* NULL when not available; freed by cl_kernel_delete() */
+
+  union { /* the value most recently stored by cl_kernel_set_arg() */
+    cl_char val_char;
+    cl_short val_short;
+    cl_int val_int;
+    cl_long val_long;
+    cl_half val_half;
+    cl_float val_float;
+    cl_double val_double;
+    cl_sampler val_sampler; /* used for ArgTypeSampler */
+    cl_mem val_mem; /* used for ArgTypeImage and non-local ArgTypePointer (may be NULL for pointers) */
+    void *val_ptr; /* heap copy for ArgTypeStruct and for ArgTypeValue wider than cl_double */
+  } val;
+  cl_uint val_size; /* bytes held in val; for __local pointers, the local memory size requested */
+  cl_bool is_set; /* All args must be set before NDRange */
 } cl_argument;
 
 typedef struct _cl_kernel_for_device {
@@ -55,93 +80,45 @@ typedef struct _cl_kernel_for_device {
 typedef _cl_kernel_for_device *cl_kernel_for_device;
 
 /* One OCL function */
-struct _cl_kernel {
+typedef struct _cl_kernel {
   _cl_base_object base;
-  cl_buffer bo;               /* The code itself */
-  cl_program program;         /* Owns this structure (and pointers) */
-  gbe_kernel opaque;          /* (Opaque) compiler structure for the OCL kernel */
-  cl_accelerator_intel accel;     /* accelerator */
-  char *curbe;                /* One curbe per kernel */
-  size_t curbe_sz;            /* Size of it */
-  uint32_t samplers[GEN_MAX_SAMPLERS]; /* samplers defined in kernel & kernel args */
-  size_t sampler_sz;          /* sampler size defined in kernel & kernel args. */
-  struct ImageInfo *images;   /* images defined in kernel args */
-  size_t image_sz;            /* image count in kernel args */
-  cl_ulong local_mem_sz;      /* local memory size specified in kernel args. */
-  size_t compile_wg_sz[3];    /* Required workgroup size by __attribute__((reqd_work_gro
-                                 up_size(X, Y, Z))) qualifier.*/
-  size_t global_work_sz[3];    /* maximum global size that can be used to execute a kernel
-                                (i.e. global_work_size argument to clEnqueueNDRangeKernel.)*/
-  size_t stack_size;          /* stack size per work item. */
-  cl_argument *args;          /* To track argument setting */
-  uint32_t arg_n:30;          /* Number of arguments */
-  uint32_t ref_its_program:1; /* True only for the user kernel (created by clCreateKernel) */
-  uint32_t vme:1;             /* True only if it is a built-in kernel for VME */
-
-  void* cmrt_kernel;          /* CmKernel* */
-  uint32_t exec_info_n;       /* The kernel's exec info count */
-  void** exec_info;             /* The kernel's exec info */
-  cl_bool useDeviceEnqueue;     /* kernel use device enqueue */
-  void* device_enqueue_ptr;     /* device_enqueue buffer*/
-  uint32_t device_enqueue_info_n; /* count of parent kernel's arguments buffers, as child enqueues' exec info */
-  void** device_enqueue_infos;   /* parent kernel's arguments buffers, as child enqueues' exec info   */
-};
+  cl_program program;                /* Point back to program */
+  char *name;                        /* The kernel name */
+  cl_argument *args;                 /* All the arguments */
+  cl_uint arg_n;                     /* Number of arguments */
+  size_t compile_wg_sz[3];           /* Required workgroup size by
+                                        __attribute__((reqd_work_group_size(X, Y, Z))) qualifier */
+  char *kernel_attr;                 /* The kernel attribute */
+  cl_uint each_device_num;           /* Presumably the number of entries in each_device -- TODO confirm */
+  cl_kernel_for_device *each_device; /* Array of per-device kernel handles, each interpreted by its device */
+
+  uint32_t exec_info_n; /* The kernel's exec info count */
+  void **exec_info;     /* The kernel's exec info */
+} _cl_kernel;
 
 #define CL_OBJECT_KERNEL_MAGIC 0x1234567890abedefLL
-#define CL_OBJECT_IS_KERNEL(obj) ((obj &&                           \
-         ((cl_base_object)obj)->magic == CL_OBJECT_KERNEL_MAGIC &&  \
-         CL_OBJECT_GET_REF(obj) >= 1))
+#define CL_OBJECT_IS_KERNEL(obj) (((obj) &&                                                    \
+                                   ((cl_base_object)(obj))->magic == CL_OBJECT_KERNEL_MAGIC && \
+                                   CL_OBJECT_GET_REF(obj) >= 1)) /* non-NULL, kernel magic matches, ref count >= 1; obj is parenthesized so expression arguments bind correctly */
 
-/* Allocate an empty kernel */
-extern cl_kernel cl_kernel_new(cl_program);
-
-/* Destroy and deallocate an empty kernel */
+extern cl_kernel cl_kernel_create(cl_program p, const char *name, cl_int *errcode_ret);
+extern cl_kernel cl_kernel_new(cl_program, const char *name);
 extern void cl_kernel_delete(cl_kernel);
-
-/* Setup the kernel with the given GBE Kernel */
-extern void cl_kernel_setup(cl_kernel k, gbe_kernel opaque);
-
-/* Get the kernel name */
-extern const char *cl_kernel_get_name(cl_kernel k);
-
-/* Get the kernel attributes*/
-extern const char *cl_kernel_get_attributes(cl_kernel k);
-
-/* Get the simd width as used in the code */
-extern uint32_t cl_kernel_get_simd_width(cl_kernel k);
-
-/* When a kernel is created from outside, we just duplicate the structure we
- * have internally and give it back to the user
- */
-extern cl_kernel cl_kernel_dup(cl_kernel);
-
-/* Add one more reference on the kernel object */
 extern void cl_kernel_add_ref(cl_kernel);
+extern int cl_kernel_set_arg(cl_kernel, uint32_t arg_index, size_t arg_size, const void *arg_value);
+extern int cl_kernel_get_argument_info(cl_kernel k, cl_uint arg_index, cl_kernel_arg_info param_name,
+                                       size_t param_value_size, void *param_value, size_t *param_value_size_ret);
+extern cl_int cl_kernel_get_workgroup_info(cl_kernel kernel, cl_device_id device,
+                                           cl_kernel_work_group_info param_name, size_t param_value_size,
+                                           void *param_value, size_t *param_value_size_ret);
+extern cl_int cl_kernel_get_subgroup_info(cl_kernel kernel, cl_device_id device,
+                                          cl_kernel_work_group_info param_name, size_t input_value_size,
+                                          const void *input_value, size_t param_value_size,
+                                          void *param_value, size_t *param_value_size_ret); /* NOTE(review): param_name is typed cl_kernel_work_group_info; the public clGetKernelSubGroupInfo takes cl_kernel_sub_group_info -- confirm this is intended */
 
 /* Set the argument before kernel execution */
-extern int cl_kernel_set_arg(cl_kernel,
-                             uint32_t    arg_index,
-                             size_t      arg_size,
-                             const void *arg_value);
-extern int cl_kernel_set_arg_svm_pointer(cl_kernel,
-                                            uint32_t arg_index,
-                                            const void *arg_value);
-extern cl_int cl_kernel_set_exec_info(cl_kernel k,
-                                      size_t n,
-                                      const void *value);
-
-/* Get the argument information */
-extern int cl_get_kernel_arg_info(cl_kernel k, cl_uint arg_index,
-                                  cl_kernel_arg_info param_name,
-                                  size_t param_value_size, void *param_value,
-                                  size_t *param_value_size_ret);
-
-/* Compute and check the work group size from the user provided local size */
-extern cl_int
-cl_kernel_work_group_sz(cl_kernel ker,
-                        const size_t *local_wk_sz,
-                        cl_uint wk_dim,
-                        size_t *wk_grp_sz);
-
+extern int cl_kernel_set_arg_svm_pointer(cl_kernel, uint32_t arg_index, const void *arg_value);
+extern cl_int cl_kernel_set_exec_info(cl_kernel k, size_t n, const void *value);
+extern cl_int cl_kernel_work_group_sz(cl_kernel ker, const size_t *local_wk_sz,
+                                      cl_uint wk_dim, size_t *wk_grp_sz);
 #endif /* __CL_KERNEL_H__ */
-
diff --git a/src/cl_mem.c b/src/cl_mem.c
index f0cccb8..362bccd 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -168,7 +168,7 @@ cl_mem_allocate(enum cl_mem_type type,
     if (ctx->devices[0]->host_unified_memory) {
       int page_size = getpagesize();
       int cacheline_size = 0;
-      cl_get_device_info(ctx->devices[0], CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(cacheline_size), &cacheline_size, NULL);
+      cl_device_get_info(ctx->devices[0], CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(cacheline_size), &cacheline_size, NULL);
 
       if (type == CL_MEM_BUFFER_TYPE) {
         if (flags & CL_MEM_USE_HOST_PTR) {
@@ -222,7 +222,7 @@ cl_mem_allocate(enum cl_mem_type type,
       // if create image from USE_HOST_PTR buffer, the buffer's base address need be aligned.
       if(buffer->is_userptr) {
         int base_alignement = 0;
-        cl_get_device_info(ctx->devices[0], CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT, sizeof(base_alignement), &base_alignement, NULL);
+        cl_device_get_info(ctx->devices[0], CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT, sizeof(base_alignement), &base_alignement, NULL);
         if(ALIGN((unsigned long)buffer->host_ptr, base_alignement) != (unsigned long)buffer->host_ptr) {
           err = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR;
           goto error;
@@ -339,7 +339,7 @@ cl_mem_new_buffer(cl_context ctx,
     goto error;
   }
 
-  if ((err = cl_get_device_info(ctx->devices[0],
+  if ((err = cl_device_get_info(ctx->devices[0],
                                 CL_DEVICE_MAX_MEM_ALLOC_SIZE,
                                 sizeof(max_mem_size),
                                 &max_mem_size,
@@ -575,7 +575,7 @@ void* cl_mem_svm_allocate(cl_context ctx, cl_svm_mem_flags flags,
   if(UNLIKELY(alignment & (alignment - 1)))
     return NULL;
 
-  if ((err = cl_get_device_info(ctx->devices[0],
+  if ((err = cl_device_get_info(ctx->devices[0],
                                  CL_DEVICE_MAX_MEM_ALLOC_SIZE,
                                  sizeof(max_mem_size),
                                  &max_mem_size,
@@ -877,7 +877,7 @@ _cl_mem_new_image(cl_context ctx,
   uint8_t enableUserptr = 0;
   if (enable_true_hostptr && ctx->devices[0]->host_unified_memory && data != NULL && (flags & CL_MEM_USE_HOST_PTR)) {
     int cacheline_size = 0;
-    cl_get_device_info(ctx->devices[0], CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(cacheline_size), &cacheline_size, NULL);
+    cl_device_get_info(ctx->devices[0], CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(cacheline_size), &cacheline_size, NULL);
     if (ALIGN((unsigned long)data, cacheline_size) == (unsigned long)data &&
         ALIGN(h, cl_buffer_get_tiling_align(ctx, CL_NO_TILE, 1)) == h &&
         ALIGN(h * pitch * depth, cacheline_size) == h * pitch * depth && //h and pitch should same as aligned_h and aligned_pitch if enable userptr
@@ -1054,7 +1054,7 @@ _cl_mem_new_image_from_buffer(cl_context ctx,
     goto error;
   }
 
-  if ((err = cl_get_device_info(ctx->devices[0],
+  if ((err = cl_device_get_info(ctx->devices[0],
                                 CL_DEVICE_IMAGE_MAX_BUFFER_SIZE,
                                 sizeof(max_size),
                                 &max_size,
@@ -1262,7 +1262,7 @@ cl_mem_delete(cl_mem mem)
   }
 
   if (mem->mapped_ptr)
-    free(mem->mapped_ptr);
+    CL_FREE(mem->mapped_ptr);
 
   /* Iff we are sub, do nothing for bo release. */
   if (mem->type == CL_MEM_SUBBUFFER_TYPE) {
@@ -1306,810 +1306,6 @@ cl_mem_add_ref(cl_mem mem)
   CL_OBJECT_INC_REF(mem);
 }
 
-#define LOCAL_SZ_0   16
-#define LOCAL_SZ_1   4
-#define LOCAL_SZ_2   4
-
-LOCAL cl_int
-cl_mem_copy(cl_command_queue queue, cl_event event, cl_mem src_buf, cl_mem dst_buf,
-            size_t src_offset, size_t dst_offset, size_t cb)
-{
-  cl_int ret = CL_SUCCESS;
-  cl_kernel ker = NULL;
-  size_t global_off[] = {0,0,0};
-  size_t global_sz[] = {1,1,1};
-  size_t local_sz[] = {1,1,1};
-  const unsigned int masks[4] = {0xffffffff, 0x0ff, 0x0ffff, 0x0ffffff};
-  int aligned = 0;
-  int dw_src_offset = src_offset/4;
-  int dw_dst_offset = dst_offset/4;
-
-  if (!cb)
-    return ret;
-
-  /* We use one kernel to copy the data. The kernel is lazily created. */
-  assert(src_buf->ctx == dst_buf->ctx);
-
-  /* All 16 bytes aligned, fast and easy one. */
-  if((cb % 16 == 0) && (src_offset % 16 == 0) && (dst_offset % 16 == 0)) {
-    extern char cl_internal_copy_buf_align16_str[];
-    extern size_t cl_internal_copy_buf_align16_str_size;
-
-    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_ALIGN16,
-             cl_internal_copy_buf_align16_str, (size_t)cl_internal_copy_buf_align16_str_size, NULL);
-    cb = cb/16;
-    aligned = 1;
-  } else if ((cb % 4 == 0) && (src_offset % 4 == 0) && (dst_offset % 4 == 0)) { /* all Dword aligned.*/
-    extern char cl_internal_copy_buf_align4_str[];
-    extern size_t cl_internal_copy_buf_align4_str_size;
-
-    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_ALIGN4,
-             cl_internal_copy_buf_align4_str, (size_t)cl_internal_copy_buf_align4_str_size, NULL);
-    cb = cb/4;
-    aligned = 1;
-  }
-
-  if (aligned) {
-    if (!ker)
-      return CL_OUT_OF_RESOURCES;
-
-    if (cb < LOCAL_SZ_0) {
-      local_sz[0] = 1;
-    } else {
-      local_sz[0] = LOCAL_SZ_0;
-    }
-    global_sz[0] = ((cb + LOCAL_SZ_0 - 1)/LOCAL_SZ_0)*LOCAL_SZ_0;
-    cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
-    cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset);
-    cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
-    cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset);
-    cl_kernel_set_arg(ker, 4, sizeof(int), &cb);
-    ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off,
-                                    global_off, global_sz, global_sz, local_sz, local_sz);
-    cl_kernel_delete(ker);
-    return ret;
-  }
-
-  /* Now handle the unaligned cases. */
-  int dw_num = ((dst_offset % 4 + cb) + 3) / 4;
-  unsigned int first_mask = dst_offset % 4 == 0 ? 0x0 : masks[dst_offset % 4];
-  unsigned int last_mask = masks[(dst_offset + cb) % 4];
-  /* handle the very small range copy. */
-  if (cb < 4 && dw_num == 1) {
-    first_mask = first_mask | ~last_mask;
-  }
-
-  if (cb < LOCAL_SZ_0) {
-    local_sz[0] = 1;
-  } else {
-    local_sz[0] = LOCAL_SZ_0;
-  }
-  global_sz[0] = ((dw_num + LOCAL_SZ_0 - 1)/LOCAL_SZ_0)*LOCAL_SZ_0;
-
-  if (src_offset % 4 == dst_offset % 4) {
-    /* Src and dst has the same unaligned offset, just handle the
-       header and tail. */
-    extern char cl_internal_copy_buf_unalign_same_offset_str[];
-    extern size_t cl_internal_copy_buf_unalign_same_offset_str_size;
-
-    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_UNALIGN_SAME_OFFSET,
-             cl_internal_copy_buf_unalign_same_offset_str,
-             (size_t)cl_internal_copy_buf_unalign_same_offset_str_size, NULL);
-
-    if (!ker)
-      return CL_OUT_OF_RESOURCES;
-
-    cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
-    cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset);
-    cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
-    cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset);
-    cl_kernel_set_arg(ker, 4, sizeof(int), &dw_num);
-    cl_kernel_set_arg(ker, 5, sizeof(int), &first_mask);
-    cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask);
-    ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off,
-                                    global_off, global_sz, global_sz, local_sz, local_sz);
-    cl_kernel_delete(ker);
-    return ret;
-  }
-
-  /* Dst's offset < Src's offset, so one dst dword need two sequential src dwords to fill it. */
-  if (dst_offset % 4 < src_offset % 4) {
-    extern char cl_internal_copy_buf_unalign_dst_offset_str[];
-    extern size_t cl_internal_copy_buf_unalign_dst_offset_str_size;
-
-    int align_diff = src_offset % 4 - dst_offset % 4;
-    unsigned int dw_mask = masks[align_diff];
-    int shift = align_diff * 8;
-
-    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_UNALIGN_DST_OFFSET,
-             cl_internal_copy_buf_unalign_dst_offset_str,
-             (size_t)cl_internal_copy_buf_unalign_dst_offset_str_size, NULL);
-
-    if (!ker)
-      return CL_OUT_OF_RESOURCES;
-
-    cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
-    cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset);
-    cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
-    cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset);
-    cl_kernel_set_arg(ker, 4, sizeof(int), &dw_num);
-    cl_kernel_set_arg(ker, 5, sizeof(int), &first_mask);
-    cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask);
-    cl_kernel_set_arg(ker, 7, sizeof(int), &shift);
-    cl_kernel_set_arg(ker, 8, sizeof(int), &dw_mask);
-    ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off,
-                                    global_off, global_sz, global_sz, local_sz, local_sz);
-    cl_kernel_delete(ker);
-    return ret;
-  }
-
-  /* Dst's offset > Src's offset, so one dst dword need two sequential src - and src to fill it. */
-  if (dst_offset % 4 > src_offset % 4) {
-    extern char cl_internal_copy_buf_unalign_src_offset_str[];
-    extern size_t cl_internal_copy_buf_unalign_src_offset_str_size;
-
-    int align_diff = dst_offset % 4 - src_offset % 4;
-    unsigned int dw_mask = masks[4 - align_diff];
-    int shift = align_diff * 8;
-    int src_less = !(src_offset % 4) && !((src_offset + cb) % 4);
-
-    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_UNALIGN_SRC_OFFSET,
-             cl_internal_copy_buf_unalign_src_offset_str,
-             (size_t)cl_internal_copy_buf_unalign_src_offset_str_size, NULL);
-
-    if (!ker)
-      return CL_OUT_OF_RESOURCES;
-
-    cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
-    cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset);
-    cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf);
-    cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset);
-    cl_kernel_set_arg(ker, 4, sizeof(int), &dw_num);
-    cl_kernel_set_arg(ker, 5, sizeof(int), &first_mask);
-    cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask);
-    cl_kernel_set_arg(ker, 7, sizeof(int), &shift);
-    cl_kernel_set_arg(ker, 8, sizeof(int), &dw_mask);
-    cl_kernel_set_arg(ker, 9, sizeof(int), &src_less);
-    ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off,
-                                    global_off, global_sz, global_sz, local_sz, local_sz);
-    cl_kernel_delete(ker);
-    return ret;
-  }
-
-  /* no case can hanldle? */
-  assert(0);
-
-  return ret;
-}
-
-LOCAL cl_int
-cl_image_fill(cl_command_queue queue, cl_event e, const void * pattern, struct _cl_mem_image* src_image,
-           const size_t * origin, const size_t * region)
-{
-  cl_int ret = CL_SUCCESS;
-  cl_kernel ker = NULL;
-  size_t global_off[] = {0,0,0};
-  size_t global_sz[] = {1,1,1};
-  size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
-  uint32_t savedIntelFmt = src_image->intel_fmt;
-
-
-  if(region[1] == 1) local_sz[1] = 1;
-  if(region[2] == 1) local_sz[2] = 1;
-  global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
-  global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
-  global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
-
-  if(src_image->image_type == CL_MEM_OBJECT_IMAGE1D) {
-    extern char cl_internal_fill_image_1d_str[];
-    extern size_t cl_internal_fill_image_1d_str_size;
-
-    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_IMAGE_1D,
-        cl_internal_fill_image_1d_str, (size_t)cl_internal_fill_image_1d_str_size, NULL);
-  }else if(src_image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
-    extern char cl_internal_fill_image_1d_array_str[];
-    extern size_t cl_internal_fill_image_1d_array_str_size;
-
-    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_IMAGE_1D_ARRAY,
-        cl_internal_fill_image_1d_array_str, (size_t)cl_internal_fill_image_1d_array_str_size, NULL);
-  }else if(src_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
-    extern char cl_internal_fill_image_2d_str[];
-    extern size_t cl_internal_fill_image_2d_str_size;
-
-    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_IMAGE_2D,
-        cl_internal_fill_image_2d_str, (size_t)cl_internal_fill_image_2d_str_size, NULL);
-  }else if(src_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
-    extern char cl_internal_fill_image_2d_array_str[];
-    extern size_t cl_internal_fill_image_2d_array_str_size;
-
-    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_IMAGE_2D_ARRAY,
-        cl_internal_fill_image_2d_array_str, (size_t)cl_internal_fill_image_2d_array_str_size, NULL);
-  }else if(src_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
-    extern char cl_internal_fill_image_3d_str[];
-    extern size_t cl_internal_fill_image_3d_str_size;
-
-    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_IMAGE_3D,
-        cl_internal_fill_image_3d_str, (size_t)cl_internal_fill_image_3d_str_size, NULL);
-  }else{
-    return CL_IMAGE_FORMAT_NOT_SUPPORTED;
-  }
-
-  if (!ker)
-    return CL_OUT_OF_RESOURCES;
-
-  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_image);
-  if(src_image->fmt.image_channel_order >= CL_sRGBA) {
-#define RGB2sRGB(linear)  ( linear <= 0.0031308f )? ( 12.92f * linear ):( 1.055f * powf( linear, 1.0f/2.4f ) - 0.055f);
-    cl_image_format fmt;
-    float newpattern[4] = {0.0,0.0,0.0,((float*)pattern)[3]};
-    int i;
-    for(i = 0;i < 3; i++){
-      if(src_image->fmt.image_channel_order == CL_sRGBA) {
-        newpattern[i] = RGB2sRGB(((float*)pattern)[i]);
-      } else
-        newpattern[2-i] = RGB2sRGB(((float*)pattern)[i]);
-    }
-    cl_kernel_set_arg(ker, 1, sizeof(float)*4, newpattern);
-    fmt.image_channel_order = CL_RGBA;
-    fmt.image_channel_data_type = CL_UNORM_INT8;
-    src_image->intel_fmt = cl_image_get_intel_format(&fmt);
-#undef RGB2sRGB
-  } else
-    cl_kernel_set_arg(ker, 1, sizeof(float)*4, pattern);
-  cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region[0]);
-  cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
-  cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
-  cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin[0]);
-  cl_kernel_set_arg(ker, 6, sizeof(cl_int), &origin[1]);
-  cl_kernel_set_arg(ker, 7, sizeof(cl_int), &origin[2]);
-
-  ret = cl_command_queue_ND_range(queue, ker, e, 3, global_off,
-                                  global_off, global_sz, global_sz, local_sz, local_sz);
-  cl_kernel_delete(ker);
-  src_image->intel_fmt = savedIntelFmt;
-  return ret;
-}
-
-LOCAL cl_int
-cl_mem_fill(cl_command_queue queue, cl_event e, const void * pattern, size_t pattern_size,
-            cl_mem buffer, size_t offset, size_t size)
-{
-  cl_int ret = CL_SUCCESS;
-  cl_kernel ker = NULL;
-  size_t global_off[] = {0,0,0};
-  size_t global_sz[] = {1,1,1};
-  size_t local_sz[] = {1,1,1};
-  char pattern_comb[4];
-  int is_128 = 0;
-  const void * pattern1 = NULL;
-
-  assert(offset % pattern_size == 0);
-  assert(size % pattern_size == 0);
-
-  if (!size)
-    return ret;
-
-  if (pattern_size == 128) {
-    /* 128 is according to pattern of double16, but double works not very
-       well on some platform. We use two float16 to handle this. */
-    extern char cl_internal_fill_buf_align128_str[];
-    extern size_t cl_internal_fill_buf_align128_str_size;
-
-    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN128,
-               cl_internal_fill_buf_align128_str, (size_t)cl_internal_fill_buf_align128_str_size, NULL);
-    is_128 = 1;
-    pattern_size = pattern_size / 2;
-    pattern1 = pattern + pattern_size;
-    size = size / 2;
-  } else if (pattern_size % 8 == 0) { /* Handle the 8 16 32 64 cases here. */
-    extern char cl_internal_fill_buf_align8_str[];
-    extern size_t cl_internal_fill_buf_align8_str_size;
-    int order = ffs(pattern_size / 8) - 1;
-
-    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 + order,
-               cl_internal_fill_buf_align8_str, (size_t)cl_internal_fill_buf_align8_str_size, NULL);
-  } else if (pattern_size == 4) {
-    extern char cl_internal_fill_buf_align4_str[];
-    extern size_t cl_internal_fill_buf_align4_str_size;
-
-    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN4,
-               cl_internal_fill_buf_align4_str, (size_t)cl_internal_fill_buf_align4_str_size, NULL);
-  } else if (size >= 4 && size % 4 == 0 && offset % 4 == 0) {
-    /* The unaligned case. But if copy size and offset are aligned to 4, we can fake
-       the pattern with the pattern duplication fill in. */
-    assert(pattern_size == 1 || pattern_size == 2);
-    extern char cl_internal_fill_buf_align4_str[];
-    extern size_t cl_internal_fill_buf_align4_str_size;
-
-    if (pattern_size == 2) {
-      memcpy(pattern_comb, pattern, sizeof(char)*2);
-      memcpy(pattern_comb + 2, pattern, sizeof(char)*2);
-    } else {
-      pattern_comb[0] = pattern_comb[1] = pattern_comb[2]
-        = pattern_comb[3] = *(char *)pattern;
-    }
-
-    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN4,
-               cl_internal_fill_buf_align4_str, (size_t)cl_internal_fill_buf_align4_str_size, NULL);
-    pattern_size = 4;
-    pattern = pattern_comb;
-  }
-  //TODO: Unaligned cases, we may need to optimize it as cl_mem_copy, using mask in kernel
-  //functions. This depend on the usage but now we just use aligned 1 and 2.
-  else if (pattern_size == 2) {
-    extern char cl_internal_fill_buf_align2_str[];
-    extern size_t cl_internal_fill_buf_align2_str_size;
-    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN2,
-               cl_internal_fill_buf_align2_str, (size_t)cl_internal_fill_buf_align2_str_size, NULL);
-  } else if (pattern_size == 1) {
-    extern char cl_internal_fill_buf_unalign_str[];
-    extern size_t cl_internal_fill_buf_unalign_str_size;
-    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_UNALIGN,
-               cl_internal_fill_buf_unalign_str, (size_t)cl_internal_fill_buf_unalign_str_size, NULL);
-  } else
-    assert(0);
-
-  if (!ker)
-    return CL_OUT_OF_RESOURCES;
-
-  size = size / pattern_size;
-  offset = offset / pattern_size;
-
-  if (size < LOCAL_SZ_0) {
-    local_sz[0] = 1;
-  } else {
-    local_sz[0] = LOCAL_SZ_0;
-  }
-  global_sz[0] = ((size + LOCAL_SZ_0 - 1) / LOCAL_SZ_0) * LOCAL_SZ_0;
-  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &buffer);
-  cl_kernel_set_arg(ker, 1, pattern_size, pattern);
-  cl_kernel_set_arg(ker, 2, sizeof(cl_uint), &offset);
-  cl_kernel_set_arg(ker, 3, sizeof(cl_uint), &size);
-  if (is_128)
-    cl_kernel_set_arg(ker, 4, pattern_size, pattern1);
-
-  ret = cl_command_queue_ND_range(queue, ker, e, 1, global_off,
-                                  global_off, global_sz, global_sz, local_sz, local_sz);
-  cl_kernel_delete(ker);
-  return ret;
-}
-
-LOCAL cl_int
-cl_mem_copy_buffer_rect(cl_command_queue queue, cl_event event, cl_mem src_buf, cl_mem dst_buf,
-                       const size_t *src_origin, const size_t *dst_origin, const size_t *region,
-                       size_t src_row_pitch, size_t src_slice_pitch,
-                       size_t dst_row_pitch, size_t dst_slice_pitch) {
-  cl_int ret;
-  cl_kernel ker;
-  size_t global_off[] = {0,0,0};
-  size_t global_sz[] = {1,1,1};
-  size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_1};
-  // the src and dst mem rect is continuous, the copy is degraded to buf copy
-  if((region[0] == dst_row_pitch) && (region[0] == src_row_pitch) &&
-  (region[1] * src_row_pitch == src_slice_pitch) && (region[1] * dst_row_pitch == dst_slice_pitch)){
-    cl_int src_offset = src_origin[2]*src_slice_pitch + src_origin[1]*src_row_pitch + src_origin[0];
-    cl_int dst_offset = dst_origin[2]*dst_slice_pitch + dst_origin[1]*dst_row_pitch + dst_origin[0];
-    cl_int size = region[0]*region[1]*region[2];
-    ret = cl_mem_copy(queue, NULL, src_buf, dst_buf,src_offset, dst_offset, size);
-    return ret;
-  }
-
-  if(region[1] == 1) local_sz[1] = 1;
-  if(region[2] == 1) local_sz[2] = 1;
-  global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
-  global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
-  global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
-  cl_int src_offset = src_origin[2]*src_slice_pitch + src_origin[1]*src_row_pitch + src_origin[0];
-  cl_int dst_offset = dst_origin[2]*dst_slice_pitch + dst_origin[1]*dst_row_pitch + dst_origin[0];
-
-  /* We use one kernel to copy the data. The kernel is lazily created. */
-  assert(src_buf->ctx == dst_buf->ctx);
-
-  /* setup the kernel and run. */
-  size_t region0 = region[0];
-  if( (src_offset % 4== 0) && (dst_offset % 4== 0) && (src_row_pitch % 4== 0) && (dst_row_pitch % 4== 0)
-  && (src_slice_pitch % 4== 0) && (dst_slice_pitch % 4== 0) && (region0 % 4 == 0) ){
-    extern char cl_internal_copy_buf_rect_align4_str[];
-    extern size_t cl_internal_copy_buf_rect_align4_str_size;
-    region0 /= 4;
-    src_offset /= 4;
-    dst_offset /= 4;
-    src_row_pitch /= 4;
-    dst_row_pitch /= 4;
-    src_slice_pitch /= 4;
-    dst_slice_pitch /= 4;
-    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_RECT_ALIGN4,
-    cl_internal_copy_buf_rect_align4_str, (size_t)cl_internal_copy_buf_rect_align4_str_size, NULL);
-  }else{
-    extern char cl_internal_copy_buf_rect_str[];
-    extern size_t cl_internal_copy_buf_rect_str_size;
-    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_RECT,
-    cl_internal_copy_buf_rect_str, (size_t)cl_internal_copy_buf_rect_str_size, NULL);
-  }
-
-  if (!ker)
-    return CL_OUT_OF_RESOURCES;
-
-  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
-  cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &dst_buf);
-  cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region0);
-  cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
-  cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
-  cl_kernel_set_arg(ker, 5, sizeof(cl_int), &src_offset);
-  cl_kernel_set_arg(ker, 6, sizeof(cl_int), &dst_offset);
-  cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_row_pitch);
-  cl_kernel_set_arg(ker, 8, sizeof(cl_int), &src_slice_pitch);
-  cl_kernel_set_arg(ker, 9, sizeof(cl_int), &dst_row_pitch);
-  cl_kernel_set_arg(ker, 10, sizeof(cl_int), &dst_slice_pitch);
-
-  ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off,
-                                  global_off, global_sz, global_sz, local_sz, local_sz);
-  cl_kernel_delete(ker);
-  return ret;
-}
-
-LOCAL cl_int
-cl_mem_kernel_copy_image(cl_command_queue queue, cl_event event, struct _cl_mem_image* src_image,
-                         struct _cl_mem_image* dst_image, const size_t *src_origin,
-                         const size_t *dst_origin, const size_t *region) {
-  cl_int ret;
-  cl_kernel ker = NULL;
-  size_t global_off[] = {0,0,0};
-  size_t global_sz[] = {1,1,1};
-  size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
-  uint32_t fixupDataType;
-  uint32_t savedIntelFmt;
-
-  if(region[1] == 1) local_sz[1] = 1;
-  if(region[2] == 1) local_sz[2] = 1;
-  global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
-  global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
-  global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
-
-  switch (src_image->fmt.image_channel_data_type) {
-    case CL_SNORM_INT8:
-    case CL_UNORM_INT8:  fixupDataType = CL_UNSIGNED_INT8; break;
-    case CL_HALF_FLOAT:
-    case CL_SNORM_INT16:
-    case CL_UNORM_INT16: fixupDataType = CL_UNSIGNED_INT16; break;
-    case CL_FLOAT:       fixupDataType = CL_UNSIGNED_INT32; break;
-    default:
-      fixupDataType = 0;
-  }
-
-  if (fixupDataType) {
-    cl_image_format fmt;
-    if (src_image->fmt.image_channel_order != CL_BGRA &&
-        src_image->fmt.image_channel_order != CL_sBGRA &&
-        src_image->fmt.image_channel_order != CL_sRGBA)
-      fmt.image_channel_order = src_image->fmt.image_channel_order;
-    else
-      fmt.image_channel_order = CL_RGBA;
-    fmt.image_channel_data_type = fixupDataType;
-    savedIntelFmt = src_image->intel_fmt;
-    src_image->intel_fmt = cl_image_get_intel_format(&fmt);
-    dst_image->intel_fmt = src_image->intel_fmt;
-  }
-
-  /* We use one kernel to copy the data. The kernel is lazily created. */
-  assert(src_image->base.ctx == dst_image->base.ctx);
-
-  /* setup the kernel and run. */
-  if(src_image->image_type == CL_MEM_OBJECT_IMAGE1D) {
-    if(dst_image->image_type == CL_MEM_OBJECT_IMAGE1D) {
-      extern char cl_internal_copy_image_1d_to_1d_str[];
-      extern size_t cl_internal_copy_image_1d_to_1d_str_size;
-
-      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_1D_TO_1D,
-          cl_internal_copy_image_1d_to_1d_str, (size_t)cl_internal_copy_image_1d_to_1d_str_size, NULL);
-    }
-  } else if(src_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
-    if(dst_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
-      extern char cl_internal_copy_image_2d_to_2d_str[];
-      extern size_t cl_internal_copy_image_2d_to_2d_str_size;
-
-      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_TO_2D,
-          cl_internal_copy_image_2d_to_2d_str, (size_t)cl_internal_copy_image_2d_to_2d_str_size, NULL);
-    } else if(dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
-      extern char cl_internal_copy_image_2d_to_3d_str[];
-      extern size_t cl_internal_copy_image_2d_to_3d_str_size;
-
-      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_TO_3D,
-          cl_internal_copy_image_2d_to_3d_str, (size_t)cl_internal_copy_image_2d_to_3d_str_size, NULL);
-    } else if(dst_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
-      extern char cl_internal_copy_image_2d_to_2d_array_str[];
-      extern size_t cl_internal_copy_image_2d_to_2d_array_str_size;
-
-      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_TO_2D_ARRAY,
-          cl_internal_copy_image_2d_to_2d_array_str, (size_t)cl_internal_copy_image_2d_to_2d_array_str_size, NULL);
-    }
-  } else if(src_image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
-    if(dst_image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
-      extern char cl_internal_copy_image_1d_array_to_1d_array_str[];
-      extern size_t cl_internal_copy_image_1d_array_to_1d_array_str_size;
-
-      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_1D_ARRAY_TO_1D_ARRAY,
-          cl_internal_copy_image_1d_array_to_1d_array_str,
-          (size_t)cl_internal_copy_image_1d_array_to_1d_array_str_size, NULL);
-    }
-  } else if(src_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
-    if(dst_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
-      extern char cl_internal_copy_image_2d_array_to_2d_array_str[];
-      extern size_t cl_internal_copy_image_2d_array_to_2d_array_str_size;
-
-      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_2D_ARRAY,
-          cl_internal_copy_image_2d_array_to_2d_array_str,
-          (size_t)cl_internal_copy_image_2d_array_to_2d_array_str_size, NULL);
-    } else if(dst_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
-      extern char cl_internal_copy_image_2d_array_to_2d_str[];
-      extern size_t cl_internal_copy_image_2d_array_to_2d_str_size;
-
-      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_2D,
-          cl_internal_copy_image_2d_array_to_2d_str,
-          (size_t)cl_internal_copy_image_2d_array_to_2d_str_size, NULL);
-    } else if(dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
-      extern char cl_internal_copy_image_2d_array_to_3d_str[];
-      extern size_t cl_internal_copy_image_2d_array_to_3d_str_size;
-
-      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_3D,
-          cl_internal_copy_image_2d_array_to_3d_str,
-          (size_t)cl_internal_copy_image_2d_array_to_3d_str_size, NULL);
-    }
-  } else if(src_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
-    if(dst_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
-      extern char cl_internal_copy_image_3d_to_2d_str[];
-      extern size_t cl_internal_copy_image_3d_to_2d_str_size;
-
-      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_3D_TO_2D,
-          cl_internal_copy_image_3d_to_2d_str, (size_t)cl_internal_copy_image_3d_to_2d_str_size, NULL);
-    } else if(dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
-      extern char cl_internal_copy_image_3d_to_3d_str[];
-      extern size_t cl_internal_copy_image_3d_to_3d_str_size;
-
-      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_3D_TO_3D,
-          cl_internal_copy_image_3d_to_3d_str, (size_t)cl_internal_copy_image_3d_to_3d_str_size, NULL);
-    } else if(dst_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
-      extern char cl_internal_copy_image_3d_to_2d_array_str[];
-      extern size_t cl_internal_copy_image_3d_to_2d_array_str_size;
-
-      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_3D_TO_2D_ARRAY,
-          cl_internal_copy_image_3d_to_2d_array_str, (size_t)cl_internal_copy_image_3d_to_2d_array_str_size, NULL);
-    }
-  }
-
-  if (!ker) {
-    ret = CL_OUT_OF_RESOURCES;
-    goto fail;
-  }
-
-  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_image);
-  cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &dst_image);
-  cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region[0]);
-  cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
-  cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
-  cl_kernel_set_arg(ker, 5, sizeof(cl_int), &src_origin[0]);
-  cl_kernel_set_arg(ker, 6, sizeof(cl_int), &src_origin[1]);
-  cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_origin[2]);
-  cl_kernel_set_arg(ker, 8, sizeof(cl_int), &dst_origin[0]);
-  cl_kernel_set_arg(ker, 9, sizeof(cl_int), &dst_origin[1]);
-  cl_kernel_set_arg(ker, 10, sizeof(cl_int), &dst_origin[2]);
-
-  ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off,
-                                  global_off, global_sz, global_sz, local_sz, local_sz);
-
-fail:
-
-  cl_kernel_delete(ker);
-  if (fixupDataType) {
-    src_image->intel_fmt = savedIntelFmt;
-    dst_image->intel_fmt = savedIntelFmt;
-  }
-  return ret;
-}
-
-LOCAL cl_int
-cl_mem_copy_image_to_buffer(cl_command_queue queue, cl_event event, struct _cl_mem_image* image, cl_mem buffer,
-                         const size_t *src_origin, const size_t dst_offset, const size_t *region) {
-  cl_int ret;
-  cl_kernel ker = NULL;
-  size_t global_off[] = {0,0,0};
-  size_t global_sz[] = {1,1,1};
-  size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
-  uint32_t intel_fmt, bpp;
-  cl_image_format fmt;
-  size_t origin0, region0;
-  size_t kn_dst_offset;
-  int align16 = 0;
-  size_t align_size = 1;
-  size_t w_saved;
-
-  if(region[1] == 1) local_sz[1] = 1;
-  if(region[2] == 1) local_sz[2] = 1;
-  global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
-  global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
-  global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
-
-  /* We use one kernel to copy the data. The kernel is lazily created. */
-  assert(image->base.ctx == buffer->ctx);
-
-  intel_fmt = image->intel_fmt;
-  bpp = image->bpp;
-  w_saved = image->w;
-  region0 = region[0] * bpp;
-  kn_dst_offset = dst_offset;
-  if((image->image_type == CL_MEM_OBJECT_IMAGE2D) && ((image->w * image->bpp) % 16 == 0) &&
-      ((src_origin[0] * bpp) % 16 == 0) && (region0 % 16 == 0) && (dst_offset % 16 == 0)){
-    fmt.image_channel_order = CL_RGBA;
-    fmt.image_channel_data_type = CL_UNSIGNED_INT32;
-    align16 = 1;
-    align_size = 16;
-  }
-  else{
-    fmt.image_channel_order = CL_R;
-    fmt.image_channel_data_type = CL_UNSIGNED_INT8;
-    align_size = 1;
-  }
-  image->intel_fmt = cl_image_get_intel_format(&fmt);
-  image->w = (image->w * image->bpp) / align_size;
-  image->bpp = align_size;
-  region0 = (region[0] * bpp) / align_size;
-  origin0 = (src_origin[0] * bpp) / align_size;
-  kn_dst_offset /= align_size;
-  global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
-
-  /* setup the kernel and run. */
-  if(image->image_type == CL_MEM_OBJECT_IMAGE2D) {
-    if(align16){
-      extern char cl_internal_copy_image_2d_to_buffer_align16_str[];
-      extern size_t cl_internal_copy_image_2d_to_buffer_align16_str_size;
-
-      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN16,
-                cl_internal_copy_image_2d_to_buffer_align16_str,
-                (size_t)cl_internal_copy_image_2d_to_buffer_align16_str_size, NULL);
-    }
-    else{
-      extern char cl_internal_copy_image_2d_to_buffer_str[];
-      extern size_t cl_internal_copy_image_2d_to_buffer_str_size;
-
-      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER,
-          cl_internal_copy_image_2d_to_buffer_str, (size_t)cl_internal_copy_image_2d_to_buffer_str_size, NULL);
-    }
-  }else if(image->image_type == CL_MEM_OBJECT_IMAGE3D) {
-    extern char cl_internal_copy_image_3d_to_buffer_str[];
-    extern size_t cl_internal_copy_image_3d_to_buffer_str_size;
-
-    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER,
-          cl_internal_copy_image_3d_to_buffer_str, (size_t)cl_internal_copy_image_3d_to_buffer_str_size, NULL);
-  }
-
-  if (!ker) {
-    ret = CL_OUT_OF_RESOURCES;
-    goto fail;
-  }
-
-  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &image);
-  cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &buffer);
-  cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region0);
-  cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
-  cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
-  cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin0);
-  cl_kernel_set_arg(ker, 6, sizeof(cl_int), &src_origin[1]);
-  cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_origin[2]);
-  cl_kernel_set_arg(ker, 8, sizeof(cl_int), &kn_dst_offset);
-
-  ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off,
-                                  global_off, global_sz, global_sz, local_sz, local_sz);
-
-fail:
-
-  cl_kernel_delete(ker);
-  image->intel_fmt = intel_fmt;
-  image->bpp = bpp;
-  image->w = w_saved;
-
-  return ret;
-}
-
-
-LOCAL cl_int
-cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_event event, cl_mem buffer, struct _cl_mem_image* image,
-                         const size_t src_offset, const size_t *dst_origin, const size_t *region) {
-  cl_int ret;
-  cl_kernel ker = NULL;
-  size_t global_off[] = {0,0,0};
-  size_t global_sz[] = {1,1,1};
-  size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
-  uint32_t intel_fmt, bpp;
-  cl_image_format fmt;
-  size_t origin0, region0;
-  size_t kn_src_offset;
-  int align16 = 0;
-  size_t align_size = 1;
-  size_t w_saved = 0;
-
-  if(region[1] == 1) local_sz[1] = 1;
-  if(region[2] == 1) local_sz[2] = 1;
-  global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
-  global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
-  global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];
-
-  /* We use one kernel to copy the data. The kernel is lazily created. */
-  assert(image->base.ctx == buffer->ctx);
-
-  intel_fmt = image->intel_fmt;
-  bpp = image->bpp;
-  w_saved = image->w;
-  region0 = region[0] * bpp;
-  kn_src_offset = src_offset;
-  if((image->image_type == CL_MEM_OBJECT_IMAGE2D) && ((image->w * image->bpp) % 16 == 0) &&
-      ((dst_origin[0] * bpp) % 16 == 0) && (region0 % 16 == 0) && (src_offset % 16 == 0)){
-    fmt.image_channel_order = CL_RGBA;
-    fmt.image_channel_data_type = CL_UNSIGNED_INT32;
-    align16 = 1;
-    align_size = 16;
-  }
-  else{
-    fmt.image_channel_order = CL_R;
-    fmt.image_channel_data_type = CL_UNSIGNED_INT8;
-    align_size = 1;
-  }
-  image->intel_fmt = cl_image_get_intel_format(&fmt);
-  image->w = (image->w * image->bpp) / align_size;
-  image->bpp = align_size;
-  region0 = (region[0] * bpp) / align_size;
-  origin0 = (dst_origin[0] * bpp) / align_size;
-  kn_src_offset /= align_size;
-  global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
-
-  /* setup the kernel and run. */
-  if(image->image_type == CL_MEM_OBJECT_IMAGE2D) {
-    if(align16){
-      extern char cl_internal_copy_buffer_to_image_2d_align16_str[];
-      extern size_t cl_internal_copy_buffer_to_image_2d_align16_str_size;
-
-      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN16,
-                cl_internal_copy_buffer_to_image_2d_align16_str,
-                (size_t)cl_internal_copy_buffer_to_image_2d_align16_str_size, NULL);
-    }
-    else{
-      extern char cl_internal_copy_buffer_to_image_2d_str[];
-      extern size_t cl_internal_copy_buffer_to_image_2d_str_size;
-
-      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D,
-          cl_internal_copy_buffer_to_image_2d_str, (size_t)cl_internal_copy_buffer_to_image_2d_str_size, NULL);
-    }
-  }else if(image->image_type == CL_MEM_OBJECT_IMAGE3D) {
-      extern char cl_internal_copy_buffer_to_image_3d_str[];
-      extern size_t cl_internal_copy_buffer_to_image_3d_str_size;
-
-      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D,
-          cl_internal_copy_buffer_to_image_3d_str, (size_t)cl_internal_copy_buffer_to_image_3d_str_size, NULL);
-  }
-  if (!ker)
-    return CL_OUT_OF_RESOURCES;
-
-  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &image);
-  cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &buffer);
-  cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region0);
-  cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
-  cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
-  cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin0);
-  cl_kernel_set_arg(ker, 6, sizeof(cl_int), &dst_origin[1]);
-  cl_kernel_set_arg(ker, 7, sizeof(cl_int), &dst_origin[2]);
-  cl_kernel_set_arg(ker, 8, sizeof(cl_int), &kn_src_offset);
-
-  ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off,
-                                  global_off, global_sz, global_sz, local_sz, local_sz);
-  cl_kernel_delete(ker);
-
-  image->intel_fmt = intel_fmt;
-  image->bpp = bpp;
-  image->w = w_saved;
-
-  return ret;
-}
-
-
 LOCAL void*
 cl_mem_map(cl_mem mem, int write)
 {
@@ -2415,7 +1611,7 @@ cl_mem_record_map_mem(cl_mem mem, void *ptr, void **mem_ptr, size_t offset,
   /* Record the mapped address. */
   if (!mem->mapped_ptr_sz) {
     mem->mapped_ptr_sz = 16;
-    mem->mapped_ptr = (cl_mapped_ptr *)malloc(
+    mem->mapped_ptr = (cl_mapped_ptr *)CL_MALLOC(
         sizeof(cl_mapped_ptr) * mem->mapped_ptr_sz);
     if (!mem->mapped_ptr) {
       cl_mem_unmap_auto(mem);
@@ -2433,7 +1629,7 @@ cl_mem_record_map_mem(cl_mem mem, void *ptr, void **mem_ptr, size_t offset,
       }
     }
     if (i == mem->mapped_ptr_sz) {
-      cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc(
+      cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)CL_MALLOC(
           sizeof(cl_mapped_ptr) * mem->mapped_ptr_sz * 2);
       if (!new_ptr) {
         cl_mem_unmap_auto(mem);
@@ -2445,7 +1641,7 @@ cl_mem_record_map_mem(cl_mem mem, void *ptr, void **mem_ptr, size_t offset,
           mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
       slot = mem->mapped_ptr_sz;
       mem->mapped_ptr_sz *= 2;
-      free(mem->mapped_ptr);
+      CL_FREE(mem->mapped_ptr);
       mem->mapped_ptr = new_ptr;
     }
   }
diff --git a/src/cl_program.c b/src/cl_program.c
index c090bba..4dc0b5e 100644
--- a/src/cl_program.c
+++ b/src/cl_program.c
@@ -14,7 +14,6 @@
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library. If not, see <http://www.gnu.org/licenses/>.
  *
- * Author: Benjamin Segovia <benjamin.segovia at intel.com>
  */
 
 #include "cl_kernel.h"
@@ -39,1003 +38,1019 @@
 #include <sys/stat.h>
 #include <libgen.h>
 
-static void
-cl_program_release_sources(cl_program p)
+LOCAL cl_program
+cl_program_new(cl_context ctx)
 {
-  if (p->source) {
-    CL_FREE(p->source);
-    p->source = NULL;
+  cl_program p = NULL;
+  int i;
+  cl_int err = CL_SUCCESS;
+
+  /* Allocate the structure */
+  p = CL_CALLOC(1, sizeof(struct _cl_program));
+  if (p == NULL)
+    return NULL;
+
+  CL_OBJECT_INIT_BASE(p, CL_OBJECT_PROGRAM_MAGIC);
+  list_init(&p->kernels);
+
+  p->each_device = CL_CALLOC(ctx->device_num, sizeof(cl_program_for_device));
+  if (p->each_device == NULL) {
+    CL_FREE(p);
+    return NULL;
+  }
+  p->each_device_num = ctx->device_num;
+
+  for (i = 0; i < ctx->device_num; i++) {
+    p->each_device[i] = (ctx->devices[i]->api.program_new)(ctx->devices[i], p);
+    if (p->each_device[i] == NULL) {
+      err = CL_OUT_OF_HOST_MEMORY;
+      break;
+    }
+  }
+
+  if (err != CL_SUCCESS) {
+    for (i = 0; i < ctx->device_num; i++) {
+      if (p->each_device[i])
+        (ctx->devices[i]->api.program_delete)(p->each_device[i]->device, p);
+    }
+    CL_FREE(p);
+    return NULL;
   }
+
+  p->build_status = CL_BUILD_NONE;
+  /* The program also belongs to its context */
+  cl_context_add_program(ctx, p);
+  return p;
 }
 
-static void
-cl_program_release_binary(cl_program p)
+LOCAL void
+cl_program_add_ref(cl_program p)
 {
-  if (p->binary) {
-    CL_FREE(p->binary);
-    p->binary = NULL;
-  }
+  assert(p);
+  CL_OBJECT_INC_REF(p);
 }
 
 LOCAL void
 cl_program_delete(cl_program p)
 {
-  uint32_t ref, i;
+  cl_uint i = 0;
 
   if (p == NULL)
     return;
 
   /* We are not done with it yet */
-  if ((ref = CL_OBJECT_DEC_REF(p)) > 1) return;
+  if (CL_OBJECT_DEC_REF(p) > 1)
+    return;
 
-  /* Destroy the sources and binary if still allocated */
-  cl_program_release_sources(p);
-  cl_program_release_binary(p);
+  /* Remove it from the context list */
+  if (p->ctx)
+    cl_context_remove_program(p->ctx, p);
 
-  /* Release the build options. */
-  if (p->build_opts) {
-    CL_FREE(p->build_opts);
-    p->build_opts = NULL;
-  }
+  assert(p->ker_n == 0);
+  assert(list_empty(&p->kernels));
 
-  if (p->build_log) {
-    free(p->build_log);
-    p->build_log = NULL;
+  for (i = 0; i < p->each_device_num; i++) {
+    if (p->each_device[i]->binary) {
+      assert(p->each_device[i]->binary_sz > 0);
+      CL_FREE(p->each_device[i]->binary);
+      p->each_device[i]->binary = NULL;
+      p->each_device[i]->binary_sz = 0;
+    }
   }
 
-#ifdef HAS_CMRT
-  if (p->cmrt_program != NULL)
-    cmrt_destroy_program(p);
-  else
-#endif
-  {
-    CL_FREE(p->bin);               /* Free the blob */
-    for (i = 0; i < p->ker_n; ++i) /* Free the kernels */
-      cl_kernel_delete(p->ker[i]);
-    CL_FREE(p->ker);
+  for (i = 0; i < p->each_device_num; i++) {
+    (p->each_device[i]->device->api.program_delete)(p->each_device[i]->device, p);
   }
+  CL_FREE(p->each_device);
 
-  if (p->global_data_ptr)
-    cl_buffer_unreference(p->global_data);
-  CL_FREE(p->global_data_ptr);
-
-  /* Remove it from the list */
-  cl_context_remove_program(p->ctx, p);
+  if (p->build_opts) {
+    CL_FREE(p->build_opts);
+    p->build_opts = NULL;
+  }
 
-  /* Free the program as allocated by the compiler */
-  if (p->opaque) {
-    if (CompilerSupported())
-      //For static variables release, gbeLoader may have been released, so
-      //compiler_program_clean_llvm_resource and interp_program_delete may be NULL.
-      if(compiler_program_clean_llvm_resource)
-        compiler_program_clean_llvm_resource(p->opaque);
-    if(interp_program_delete)
-      interp_program_delete(p->opaque);
+  /* Destroy the source if still allocated */
+  if (p->source) {
+    CL_FREE(p->source);
+    p->source = NULL;
   }
 
   CL_OBJECT_DESTROY_BASE(p);
   CL_FREE(p);
 }
 
-LOCAL cl_program
-cl_program_new(cl_context ctx)
+/* Before doing the real work, check whether the OpenCL C version of the
+   context's first device satisfies the one requested via -cl-std= */
+static int check_cl_version_option(cl_program p, const char *options)
 {
-  cl_program p = NULL;
+  const char *s = NULL;
+  int ver1 = 0;
+  int ver2 = 0;
+  char version_str[64] = {0};
 
-  /* Allocate the structure */
-  TRY_ALLOC_NO_ERR (p, CL_CALLOC(1, sizeof(struct _cl_program)));
-  CL_OBJECT_INIT_BASE(p, CL_OBJECT_PROGRAM_MAGIC);
-  p->build_status = CL_BUILD_NONE;
-  p->cmrt_program = NULL;
-  p->build_log = calloc(BUILD_LOG_MAX_SIZE, sizeof(char));
-  if (p->build_log)
-    p->build_log_max_sz = BUILD_LOG_MAX_SIZE;
+  if (options && (s = strstr(options, "-cl-std="))) {
 
-  /* The queue also belongs to its context */
-  cl_context_add_program(ctx, p);
+    if (s + strlen("-cl-std=CLX.X") > options + strlen(options)) {
+      return 0;
+    }
 
-exit:
-  return p;
-error:
-  cl_program_delete(p);
-  goto exit;
-}
+    if (s[8] != 'C' || s[9] != 'L' || s[10] > '9' || s[10] < '0' ||
+        s[11] != '.' || s[12] > '9' || s[12] < '0') {
+      return 0;
+    }
 
-LOCAL void
-cl_program_add_ref(cl_program p)
-{
-  assert(p);
-  CL_OBJECT_INC_REF(p);
-}
+    ver1 = (s[10] - '0') * 10 + (s[12] - '0');
 
-static cl_int
-cl_program_load_gen_program(cl_program p)
-{
-  cl_int err = CL_SUCCESS;
-  uint32_t i;
+    if (cl_device_get_info(p->ctx->devices[0], CL_DEVICE_OPENCL_C_VERSION,
+                           sizeof(version_str), version_str, NULL) != CL_SUCCESS)
+      return 0;
 
-  assert(p->opaque != NULL);
-  p->ker_n = interp_program_get_kernel_num(p->opaque);
+    assert(strstr(version_str, "OpenCL") && version_str[0] == 'O');
+    ver2 = (version_str[9] - '0') * 10 + (version_str[11] - '0');
 
-  /* Allocate the kernel array */
-  TRY_ALLOC (p->ker, CL_CALLOC(p->ker_n, sizeof(cl_kernel)));
+    if (ver2 < ver1)
+      return 0;
 
-  for (i = 0; i < p->ker_n; ++i) {
-    const gbe_kernel opaque = interp_program_get_kernel(p->opaque, i);
-    assert(opaque != NULL);
-    TRY_ALLOC (p->ker[i], cl_kernel_new(p));
-    cl_kernel_setup(p->ker[i], opaque);
+    return 1;
   }
 
-error:
-  return err;
+  return 1;
 }
 
-#define BINARY_HEADER_LENGTH 5
-
-static const unsigned char binary_type_header[BHI_MAX][BINARY_HEADER_LENGTH]=  \
-                                              {{'B','C', 0xC0, 0xDE},
-                                               {1, 'B', 'C', 0xC0, 0xDE},
-                                               {2, 'B', 'C', 0xC0, 0xDE},
-                                               {1, 'G','E', 'N', 'C'},
-                                               {'C','I', 'S', 'A'},
-                                               };
-
-LOCAL cl_bool headerCompare(const unsigned char *BufPtr, BINARY_HEADER_INDEX index)
+static cl_uint
+cl_program_get_kernel_num(cl_program p)
 {
-  bool matched = true;
-  int length = (index == BHI_SPIR || index == BHI_CMRT) ? BINARY_HEADER_LENGTH -1 :BINARY_HEADER_LENGTH;
-  int i = 0;
-  if(index == BHI_GEN_BINARY)
-    i = 1;
-  for (; i < length; ++i)
-  {
-    matched = matched && (BufPtr[i] == binary_type_header[index][i]);
-  }
-  if(index == BHI_GEN_BINARY && matched) {
-    if(BufPtr[0] != binary_type_header[index][0]) {
-      DEBUGP(DL_WARNING, "Beignet binary format have been changed, please generate binary again.\n");
-      matched = false;
-    }
-  }
-  return matched;
+  cl_uint num;
+  CL_OBJECT_LOCK(p);
+  num = p->ker_n;
+  CL_OBJECT_UNLOCK(p);
+  return num;
 }
 
-#define isSPIR(BufPtr)      headerCompare(BufPtr, BHI_SPIR)
-#define isLLVM_C_O(BufPtr)  headerCompare(BufPtr, BHI_COMPIRED_OBJECT)
-#define isLLVM_LIB(BufPtr)  headerCompare(BufPtr, BHI_LIBRARY)
-#define isGenBinary(BufPtr) headerCompare(BufPtr, BHI_GEN_BINARY)
-#define isCMRT(BufPtr)      headerCompare(BufPtr, BHI_CMRT)
-
-static cl_int get_program_global_data(cl_program prog) {
-//OpenCL 1.2 would never call this function, and OpenCL 2.0 alwasy HAS_BO_SET_SOFTPIN.
-#ifdef HAS_BO_SET_SOFTPIN
-  cl_buffer_mgr bufmgr = NULL;
-  bufmgr = cl_context_get_bufmgr(prog->ctx);
-  assert(bufmgr);
-  size_t const_size = interp_program_get_global_constant_size(prog->opaque);
-  if (const_size == 0) return CL_SUCCESS;
-
-  int page_size = getpagesize();
-  size_t alignedSz = ALIGN(const_size, page_size);
-  char * p = (char*)CL_MEMALIGN(page_size, alignedSz);
-  prog->global_data_ptr = p;
-  interp_program_get_global_constant_data(prog->opaque, (char*)p);
-
-  prog->global_data = cl_buffer_alloc_userptr(bufmgr, "program global data", p, alignedSz, 0);
-  cl_buffer_set_softpin_offset(prog->global_data, (size_t)p);
-  cl_buffer_set_bo_use_full_range(prog->global_data, 1);
-
-  uint32_t reloc_count = interp_program_get_global_reloc_count(prog->opaque);
-  if (reloc_count > 0) {
-    uint32_t x;
-    struct RelocEntry {int refOffset; int defOffset;};
-    char *temp = (char*) malloc(reloc_count *sizeof(int)*2);
-    interp_program_get_global_reloc_table(prog->opaque, temp);
-    for (x = 0; x < reloc_count; x++) {
-      int ref_offset = ((struct RelocEntry *)temp)[x].refOffset;
-      *(uint64_t*)&(p[ref_offset]) = ((struct RelocEntry *)temp)[x].defOffset + (uint64_t)p;
+static cl_int
+cl_program_check_rebuild(cl_program p, cl_bool just_compile,
+                         cl_uint num_devices, const cl_device_id *device_list)
+{
+  cl_device_id device;
+  cl_program_for_device pd = NULL;
+  cl_uint i;
+  cl_uint j;
+
+  if (p->source == NULL) // If there is no source, e.g. created from binary, we never rebuild
+    return CL_SUCCESS;
+
+  for (i = 0; i < num_devices; i++) {
+    device = device_list[i];
+    for (j = 0; j < p->each_device_num; j++) {
+      if (device == p->each_device[j]->device) {
+        pd = p->each_device[j];
+        break;
+      }
+    }
+    assert(pd);
+
+    if (pd->binary_type == CL_PROGRAM_BINARY_TYPE_EXECUTABLE ||
+        (just_compile && pd->binary_type == CL_PROGRAM_BINARY_TYPE_LIBRARY) ||
+        (just_compile && pd->binary_type == CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT)) {
+      assert(pd->binary);
+      CL_FREE(pd->binary);
+      pd->binary_sz = 0;
+      (device->api.program_delete)(device, p);
+      p->each_device[j] = (device->api.program_new)(device, p);
+      if (p->each_device[j] == NULL)
+        return CL_OUT_OF_HOST_MEMORY;
     }
-    free(temp);
-  }
-#if 0
-  int x = 0;
-  for (x = 0; x < const_size; x++) {
-    printf("offset %d data: %x\n", x, (unsigned)p[x]);
   }
-#endif
-#endif
-  return CL_SUCCESS;
-}
 
-LOCAL size_t cl_program_get_global_variable_size(cl_program prog) {
-  return interp_program_get_global_constant_size(prog->opaque);
+  return CL_SUCCESS;
 }
 
-LOCAL cl_program
-cl_program_create_from_binary(cl_context             ctx,
-                              cl_uint                num_devices,
-                              const cl_device_id *   devices,
-                              const size_t *         lengths,
-                              const unsigned char ** binaries,
-                              cl_int *               binary_status,
-                              cl_int *               errcode_ret)
+LOCAL cl_int
+cl_program_build(cl_program p, const char *options, cl_uint num_devices,
+                 const cl_device_id *device_list)
 {
-  cl_program program = NULL;
-  cl_int err = CL_SUCCESS;
+  cl_bool build_ret = CL_FALSE;
+  cl_device_id device;
+  cl_int ret = CL_SUCCESS;
+  cl_uint i;
+  cl_program_for_device pd = NULL;
 
-  assert(ctx);
-  INVALID_DEVICE_IF (num_devices != 1);
-  INVALID_DEVICE_IF (devices == NULL);
-  INVALID_DEVICE_IF (devices[0] != ctx->devices[0]);
-  INVALID_VALUE_IF (binaries == NULL);
-  INVALID_VALUE_IF (lengths == NULL);
-
-  if (binaries[0] == NULL) {
-    err = CL_INVALID_VALUE;
-    if (binary_status)
-      binary_status[0] = CL_INVALID_VALUE;
-    goto error;
+  if (device_list == NULL) {
+    assert(num_devices == 0);
+    num_devices = p->ctx->device_num;
+    device_list = p->ctx->devices;
   }
 
-  //need at least 4 bytes to check the binary type.
-  if (lengths[0] == 0 || lengths[0] < 4) {
-    err = CL_INVALID_VALUE;
-    if (binary_status)
-      binary_status[0] = CL_INVALID_VALUE;
-    goto error;
-  }
+  if (!check_cl_version_option(p, options))
+    return CL_INVALID_BUILD_OPTIONS;
 
-  program = cl_program_new(ctx);
-  if (UNLIKELY(program == NULL)) {
-      err = CL_OUT_OF_HOST_MEMORY;
-      goto error;
-  }
+  if (cl_program_get_kernel_num(p) > 0)
+    return CL_INVALID_OPERATION;
 
-  TRY_ALLOC(program->binary, CL_CALLOC(lengths[0], sizeof(char)));
-  memcpy(program->binary, binaries[0], lengths[0]);
-  program->binary_sz = lengths[0];
-  program->source_type = FROM_BINARY;
-
-  if (isCMRT((unsigned char*)program->binary)) {
-    program->source_type = FROM_CMRT;
-  }else if(isSPIR((unsigned char*)program->binary)) {
-    char* typed_binary;
-    TRY_ALLOC(typed_binary, CL_CALLOC(lengths[0]+1, sizeof(char)));
-    memcpy(typed_binary+1, binaries[0], lengths[0]);
-    *typed_binary = 1;
-    program->opaque = compiler_program_new_from_llvm_binary(program->ctx->devices[0]->device_id, typed_binary, program->binary_sz+1);
-    CL_FREE(typed_binary);
-    if (UNLIKELY(program->opaque == NULL)) {
-      err = CL_INVALID_PROGRAM;
-      goto error;
-    }
+  if (CL_OBJECT_TAKE_OWNERSHIP(p, CL_FALSE) == CL_FALSE)
+    return CL_INVALID_OPERATION;
 
-    program->source_type = FROM_LLVM_SPIR;
-    program->binary_type = CL_PROGRAM_BINARY_TYPE_INTERMEDIATE;
-  }else if(isLLVM_C_O((unsigned char*)program->binary) || isLLVM_LIB((unsigned char*)program->binary)) {
-    if(*program->binary == BHI_COMPIRED_OBJECT){
-      program->binary_type = CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;
-    }else if(*program->binary == BHI_LIBRARY){
-      program->binary_type = CL_PROGRAM_BINARY_TYPE_LIBRARY;
-    }else{
-      err= CL_INVALID_BINARY;
-      goto error;
-    }
-    program->opaque = compiler_program_new_from_llvm_binary(program->ctx->devices[0]->device_id, program->binary, program->binary_sz);
+  if (options) {
+    if (p->build_opts == NULL || strcmp(options, p->build_opts) != 0) {
+      if (p->build_opts) {
+        CL_FREE(p->build_opts);
+        p->build_opts = NULL;
+      }
+      p->build_opts = CL_CALLOC(strlen(options) + 1, sizeof(char));
+      if (p->build_opts == NULL) {
+        return CL_OUT_OF_HOST_MEMORY;
+      }
 
-    if (UNLIKELY(program->opaque == NULL)) {
-      err = CL_INVALID_PROGRAM;
-      goto error;
+      memcpy(p->build_opts, options, strlen(options));
     }
-    program->source_type = FROM_LLVM;
   }
-  else if (isGenBinary((unsigned char*)program->binary)) {
-    program->opaque = interp_program_new_from_binary(program->ctx->devices[0]->device_id, program->binary, program->binary_sz);
-    if (UNLIKELY(program->opaque == NULL)) {
-      DEBUGP(DL_ERROR, "Incompatible binary, please delete the binary and generate again.");
-      err = CL_INVALID_PROGRAM;
-      goto error;
-    }
 
-    /* Create all the kernels */
-    TRY (cl_program_load_gen_program, program);
-    program->binary_type = CL_PROGRAM_BINARY_TYPE_EXECUTABLE;
+  if (options == NULL && p->build_opts) {
+    CL_FREE(p->build_opts);
+    p->build_opts = NULL;
   }
-  else {
-    err= CL_INVALID_BINARY;
-    goto error;
+
+  if (p->build_status < CL_BUILD_NONE) {
+    CL_OBJECT_RELEASE_OWNERSHIP(p);
+    return CL_INVALID_OPERATION;
   }
 
-  if (binary_status)
-    binary_status[0] = CL_SUCCESS;
+  ret = cl_program_check_rebuild(p, CL_FALSE, num_devices, device_list);
+  if (ret != CL_SUCCESS) {
+    CL_OBJECT_RELEASE_OWNERSHIP(p);
+    return ret;
+  }
 
-exit:
-  if (errcode_ret)
-    *errcode_ret = err;
-  return program;
-error:
-  cl_program_delete(program);
-  program = NULL;
-  goto exit;
+  /* Begin to build for each device */
+  p->build_status = CL_BUILD_IN_PROGRESS;
+  for (i = 0; i < num_devices; i++) {
+    device = device_list[i];
+    DEV_PRIVATE_DATA(p, device, pd);
 
-  return CL_SUCCESS;
-}
+    ret = cl_compiler_check_available(device);
+    if (ret != CL_SUCCESS)
+      break;
 
-LOCAL cl_program
-cl_program_create_with_built_in_kernles(cl_context     ctx,
-                                  cl_uint              num_devices,
-                                  const cl_device_id * devices,
-                                  const char *         kernel_names,
-                                  cl_int *             errcode_ret)
-{
-  cl_int err = CL_SUCCESS;
+    if ((device->compiler.check_compiler_option)(options) == CL_FALSE) {
+      ret = CL_INVALID_BUILD_OPTIONS;
+      break;
+    }
 
-  assert(ctx);
-  INVALID_DEVICE_IF (num_devices != 1);
-  INVALID_DEVICE_IF (devices == NULL);
-  INVALID_DEVICE_IF (devices[0] != ctx->devices[0]);
-
-  cl_int binary_status = CL_SUCCESS;
-  extern char cl_internal_built_in_kernel_str[];
-  extern size_t cl_internal_built_in_kernel_str_size;
-  char* p_built_in_kernel_str =cl_internal_built_in_kernel_str;
-
-  ctx->built_in_prgs = cl_program_create_from_binary(ctx, 1,
-                                                          &ctx->devices[0],
-                                                          (size_t*)&cl_internal_built_in_kernel_str_size,
-                                                          (const unsigned char **)&p_built_in_kernel_str,
-                                                          &binary_status, &err);
-  if (!ctx->built_in_prgs)
-    return NULL;
+    if (pd->binary_type == CL_PROGRAM_BINARY_TYPE_NONE) { // Build from source on shot
+      assert(pd->binary_sz == 0);
+      assert(pd->binary == NULL);
 
-  err = cl_program_build(ctx->built_in_prgs, NULL);
-  if (err != CL_SUCCESS)
-    return NULL;
+      if (p->source == NULL) {
+        ret = CL_INVALID_OPERATION;
+        break;
+      }
 
-  ctx->built_in_prgs->is_built = 1;
+      assert(p->source_sz > 0);
 
-  char delims[] = ";";
-  char* saveptr = NULL;
-  char* local_kernel_names;
-  char* kernel = NULL;
-  char* matched_kernel;
-  int i = 0;
+      build_ret = (device->compiler.build_program)(device->device_id, p->source, p->source_sz, p->build_opts,
+                                                   pd->build_log_max_sz, pd->build_log, &pd->build_log_sz,
+                                                   &pd->binary, &pd->binary_sz);
+      if (build_ret == CL_FALSE) {
+        if (pd->build_log_sz > 0 && strstr(pd->build_log, "error: error reading 'options'"))
+          ret = CL_INVALID_COMPILER_OPTIONS;
+        else
+          ret = CL_COMPILE_PROGRAM_FAILURE;
 
-  //copy the content to local_kernel_names to protect the kernel_names.
-  TRY_ALLOC(local_kernel_names, CL_CALLOC(strlen(kernel_names)+1, sizeof(char) ) );
-  memcpy(local_kernel_names, kernel_names, strlen(kernel_names)+1);
-
-  kernel = strtok_r( local_kernel_names, delims , &saveptr);
-  while( kernel != NULL ) {
-    matched_kernel = strstr(ctx->devices[0]->built_in_kernels, kernel);
-    if(matched_kernel){
-      for (i = 0; i < ctx->built_in_prgs->ker_n; ++i) {
-        assert(ctx->built_in_prgs->ker[i]);
-        const char *ker_name = cl_kernel_get_name(ctx->built_in_prgs->ker[i]);
-        if (ker_name != NULL && strcmp(ker_name, kernel) == 0) {
-          break;
-        }
+        break;
       }
 
-      ctx->built_in_kernels[i] = cl_program_create_kernel(ctx->built_in_prgs, kernel, NULL);
+      CL_REGISTER_ALLOC_PTR(pd->binary, pd->binary_sz);
+    } else if (pd->binary_type == CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT) { // Build from IR binary
+      char *tmp_binary = pd->binary;
+      size_t tmp_binary_sz = pd->binary_sz;
+      assert(tmp_binary_sz != 0);
+      assert(tmp_binary != NULL);
+      pd->binary = NULL;
+      pd->binary_sz = 0;
+
+      build_ret = (device->compiler.link_program)(device->device_id, 1, &tmp_binary, &tmp_binary_sz, p->build_opts,
+                                                  pd->build_log_max_sz, pd->build_log, &pd->build_log_sz,
+                                                  &pd->binary, &pd->binary_sz);
+      CL_FREE(tmp_binary);
+
+      if (build_ret == CL_FALSE) {
+        if (pd->build_log_sz > 0 && strstr(pd->build_log, "error: error reading 'options'"))
+          ret = CL_INVALID_COMPILER_OPTIONS;
+        else
+          ret = CL_COMPILE_PROGRAM_FAILURE;
+
+        break;
+      }
+
+      CL_REGISTER_ALLOC_PTR(pd->binary, pd->binary_sz);
+    } else if (pd->binary_type == CL_PROGRAM_BINARY_TYPE_EXECUTABLE) { // Already a exec format
+      // rebuild will handle the created from source logic. If created from binary, we need to do nothing
+      continue;
+    } else {
+      ret = CL_BUILD_PROGRAM_FAILURE;
+      break;
     }
-    kernel = strtok_r((char*)saveptr , delims, &saveptr );
+
+    pd->binary_type = CL_PROGRAM_BINARY_TYPE_EXECUTABLE;
+
+    ret = (device->api.program_load_binary)(device, p);
+    if (ret != CL_SUCCESS)
+      break;
   }
 
-  CL_FREE(local_kernel_names);
+  assert(p->ker_n == 0); // No kernels generated
 
-exit:
-  if (errcode_ret)
-    *errcode_ret = err;
-  return ctx->built_in_prgs;
-error:
-  goto exit;
+  //  ret = cl_program_build_kernels_gen(device, p);
+  if (ret != CL_SUCCESS) {
+    p->build_status = CL_BUILD_ERROR;
+    CL_OBJECT_RELEASE_OWNERSHIP(p);
+    return ret;
+  }
 
-  return CL_SUCCESS;
+  p->build_status = CL_BUILD_SUCCESS;
+  CL_OBJECT_RELEASE_OWNERSHIP(p);
+  return ret;
 }
 
-LOCAL cl_program
-cl_program_create_from_llvm(cl_context ctx,
-                            cl_uint num_devices,
-                            const cl_device_id *devices,
-                            const char *file_name,
-                            cl_int *errcode_ret)
+LOCAL cl_int
+cl_program_compile(cl_program p, cl_uint num_input_headers, const cl_program *input_headers,
+                   const char **header_include_names, const char *options,
+                   cl_uint num_devices, const cl_device_id *device_list)
 {
-  cl_program program = NULL;
   cl_int err = CL_SUCCESS;
+  cl_bool build_ret = CL_FALSE;
+  int i = 0;
+  const char **headers = NULL;
+  size_t *header_lengths = NULL;
+  cl_device_id device;
+  cl_program_for_device pd = NULL;
+
+  if (device_list == NULL) {
+    assert(num_devices == 0);
+    num_devices = p->ctx->device_num;
+    device_list = p->ctx->devices;
+  }
 
-  assert(ctx);
-  INVALID_DEVICE_IF (num_devices != 1);
-  INVALID_DEVICE_IF (devices == NULL);
-  INVALID_DEVICE_IF (devices[0] != ctx->devices[0]);
-  INVALID_VALUE_IF (file_name == NULL);
+  if (!check_cl_version_option(p, options))
+    return CL_INVALID_COMPILER_OPTIONS;
 
-  program = cl_program_new(ctx);
-  if (UNLIKELY(program == NULL)) {
-      err = CL_OUT_OF_HOST_MEMORY;
-      goto error;
-  }
+  if (cl_program_get_kernel_num(p) > 0)
+    return CL_INVALID_OPERATION;
 
-  program->opaque = compiler_program_new_from_llvm(ctx->devices[0]->device_id, file_name, NULL, NULL, NULL, program->build_log_max_sz, program->build_log, &program->build_log_sz, 1, NULL);
-  if (UNLIKELY(program->opaque == NULL)) {
-    err = CL_INVALID_PROGRAM;
-    goto error;
+  if (CL_OBJECT_TAKE_OWNERSHIP(p, CL_FALSE) == CL_FALSE)
+    return CL_INVALID_OPERATION;
+
+  if (p->build_status < CL_BUILD_NONE) { // Already did something?
+    CL_OBJECT_RELEASE_OWNERSHIP(p);
+    return CL_INVALID_OPERATION;
   }
 
-  /* Create all the kernels */
-  TRY (cl_program_load_gen_program, program);
-  program->source_type = FROM_LLVM;
+  err = cl_program_check_rebuild(p, CL_TRUE, num_devices, device_list);
+  if (err != CL_SUCCESS) {
+    CL_OBJECT_RELEASE_OWNERSHIP(p);
+    return err;
+  }
 
-exit:
-  if (errcode_ret)
-    *errcode_ret = err;
-  return program;
-error:
-  cl_program_delete(program);
-  program = NULL;
-  goto exit;
-}
+  if (options) {
+    if (p->build_opts == NULL || strcmp(options, p->build_opts) != 0) {
+      if (p->build_opts) {
+        CL_FREE(p->build_opts);
+        p->build_opts = NULL;
+      }
+      p->build_opts = CL_CALLOC(strlen(options) + 1, sizeof(char));
+      if (p->build_opts == NULL) {
+        CL_OBJECT_RELEASE_OWNERSHIP(p);
+        return CL_OUT_OF_HOST_MEMORY;
+      }
 
-LOCAL cl_program
-cl_program_create_from_source(cl_context ctx,
-                              cl_uint count,
-                              const char **strings,
-                              const size_t *lengths,
-                              cl_int *errcode_ret)
+      memcpy(p->build_opts, options, strlen(options));
+    }
+  }
 
-{
-  cl_program program = NULL;
-  cl_int err = CL_SUCCESS;
-  cl_uint i;
-  int32_t * lens = NULL;
-  int32_t len_total = 0;
-  assert(ctx);
-  char * p = NULL;
-  // the real compilation step will be done at build time since we do not have
-  // yet the compilation options
-  program = cl_program_new(ctx);
-  if (UNLIKELY(program == NULL)) {
-      err = CL_OUT_OF_HOST_MEMORY;
-      goto error;
+  if (options == NULL && p->build_opts) {
+    CL_FREE(p->build_opts);
+    p->build_opts = NULL;
   }
 
-  TRY_ALLOC (lens, CL_CALLOC(count, sizeof(int32_t)));
-  for (i = 0; i < (int) count; ++i) {
-    size_t len;
-    if (lengths == NULL || lengths[i] == 0)
-      len = strlen(strings[i]);
-    else
-      len = lengths[i];
-    lens[i] = len;
-    len_total += len;
+  if (p->source == NULL) { // No source can build
+    p->build_status = CL_BUILD_ERROR;
+    CL_OBJECT_RELEASE_OWNERSHIP(p);
+    return CL_INVALID_OPERATION;
   }
-  TRY_ALLOC(program->source, CL_CALLOC(len_total+1, sizeof(char)));
-  p = program->source;
-  for (i = 0; i < (int) count; ++i) {
-    memcpy(p, strings[i], lens[i]);
-    p += lens[i];
+
+  if (num_input_headers) {
+    headers = CL_CALLOC(num_input_headers, sizeof(void *));
+    if (headers == NULL) {
+      CL_OBJECT_RELEASE_OWNERSHIP(p);
+      return CL_OUT_OF_HOST_MEMORY;
+    }
+    header_lengths = CL_CALLOC(num_input_headers, sizeof(size_t));
+    if (header_lengths == NULL) {
+      CL_FREE(headers);
+      CL_OBJECT_RELEASE_OWNERSHIP(p);
+      return CL_OUT_OF_HOST_MEMORY;
+    }
   }
-  *p = '\0';
 
-  program->source_type = FROM_SOURCE;
-  program->binary_type = CL_PROGRAM_BINARY_TYPE_NONE;
+  p->build_status = CL_BUILD_IN_PROGRESS;
 
-exit:
-  CL_FREE(lens);
-  lens = NULL;
-  if (errcode_ret)
-    *errcode_ret = err;
-  return program;
-error:
-  cl_program_delete(program);
-  program = NULL;
-  goto exit;
-}
+  for (i = 0; i < num_devices; i++) {
+    device = device_list[i];
+    DEV_PRIVATE_DATA(p, device, pd);
 
-/* Before we do the real work, we need to check whether our platform
-   cl version can meet -cl-std= */
-static int check_cl_version_option(cl_program p, const char* options) {
-  const char* s = NULL;
-  int ver1 = 0;
-  int ver2 = 0;
-  char version_str[64] = {0};
+    assert(pd->binary_sz == 0);
+    assert(pd->binary == NULL);
 
-  if (options && (s = strstr(options, "-cl-std="))) {
+    if (pd->binary_type == CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT ||
+        pd->binary_type == CL_PROGRAM_BINARY_TYPE_LIBRARY ||
+        pd->binary_type == CL_PROGRAM_BINARY_TYPE_EXECUTABLE)
+      continue;
 
-    if (s + strlen("-cl-std=CLX.X") > options + strlen(options)) {
-      return 0;
+    err = cl_compiler_check_available(device);
+    if (err != CL_SUCCESS)
+      break;
+
+    if ((device->compiler.check_compiler_option)(options) == CL_FALSE) {
+      err = CL_INVALID_COMPILER_OPTIONS;
+      break;
     }
 
-    if (s[8] != 'C' || s[9] != 'L' || s[10] > '9' || s[10] < '0' || s[11] != '.'
-        || s[12] > '9' || s[12] < '0') {
-      return 0;
+    for (cl_uint h = 0; h < num_input_headers; h++) { // own index: 'i' belongs to the enclosing device loop
+      headers[h] = input_headers[h]->source;
+      header_lengths[h] = input_headers[h]->source_sz;
     }
 
-    ver1 = (s[10] - '0') * 10 + (s[12] - '0');
+    build_ret = (device->compiler.compile_program)(
+      device->device_id, p->source, p->source_sz, headers, header_lengths,
+      header_include_names, num_input_headers, p->build_opts, pd->build_log_max_sz,
+      pd->build_log, &pd->build_log_sz, &pd->binary, &pd->binary_sz);
 
-    if (cl_get_device_info(p->ctx->devices[0], CL_DEVICE_OPENCL_C_VERSION, sizeof(version_str),
-                                  version_str, NULL) != CL_SUCCESS)
-      return 0;
+    if (build_ret == CL_FALSE) {
+      if (pd->build_log_sz > 0 && strstr(pd->build_log, "error: error reading 'options'"))
+        err = CL_INVALID_COMPILER_OPTIONS;
+      else
+        err = CL_COMPILE_PROGRAM_FAILURE;
 
-    assert(strstr(version_str, "OpenCL") && version_str[0] == 'O');
-    ver2 = (version_str[9] - '0') * 10 + (version_str[11] - '0');
+      break;
+    } else {
+      pd->binary_type = CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;
+    }
 
-    if (ver2 < ver1)
-      return 0;
+    CL_REGISTER_ALLOC_PTR(pd->binary, pd->binary_sz);
+  }
 
-    return 1;
+  if (err == CL_SUCCESS) {
+    p->build_status = CL_BUILD_SUCCESS;
+  } else {
+    p->build_status = CL_BUILD_ERROR;
   }
 
-  return 1;
+  if (headers)
+    CL_FREE(headers);
+  if (header_lengths)
+    CL_FREE(header_lengths);
+
+  CL_OBJECT_RELEASE_OWNERSHIP(p);
+  return err;
 }
 
-LOCAL cl_int
-cl_program_build(cl_program p, const char *options)
+cl_program
+cl_program_link(cl_context context, cl_uint num_devices, const cl_device_id *device_list,
+                cl_uint num_input_programs, const cl_program *input_programs, const char *options,
+                cl_int *errcode_ret)
 {
+  cl_program p = NULL;
   cl_int err = CL_SUCCESS;
-  int i = 0;
-  int copyed = 0;
+  cl_uint i = 0;
+  cl_uint j = 0;
+  cl_uint k = 0;
+  cl_bool build_ret = CL_FALSE;
+  char **binary = NULL;
+  size_t *binary_size = NULL;
+  cl_device_id device;
+  cl_program_for_device pd = NULL;
+  cl_program_for_device pdi = NULL;
+  cl_bool build_for_lib = CL_FALSE;
+
+  if (device_list == NULL) {
+    assert(num_devices == 0);
+    num_devices = context->device_num;
+    device_list = context->devices;
+  }
+
+  p = cl_program_new(context);
+  if (p == NULL) {
+    *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+    return NULL;
+  }
 
-  if (CL_OBJECT_GET_REF(p) > 1) {
-    err = CL_INVALID_OPERATION;
-    goto error;
+  if (options && options[0]) {
+    p->build_opts = CL_CALLOC(strlen(options) + 1, sizeof(char));
+    if (p->build_opts == NULL) {
+      cl_program_delete(p);
+      *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+      return NULL;
+    }
+    memcpy(p->build_opts, options, strlen(options) + 1);
   }
 
-#if HAS_CMRT
-  if (p->source_type == FROM_CMRT) {
-    //only here we begins to invoke cmrt
-    //break spec to return other errors such as CL_DEVICE_NOT_FOUND
-    err = cmrt_build_program(p, options);
-    if (err == CL_SUCCESS) {
-      p->build_status = CL_BUILD_SUCCESS;
-      p->binary_type = CL_PROGRAM_BINARY_TYPE_EXECUTABLE;
-      return CL_SUCCESS;
-    } else
-      goto error;
+  binary = CL_CALLOC(num_input_programs, sizeof(char *));
+  if (binary == NULL) {
+    cl_program_delete(p);
+    *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+    return NULL;
   }
-#endif
+  binary_size = CL_CALLOC(num_input_programs, sizeof(size_t));
+  if (binary_size == NULL) {
+    cl_program_delete(p);
+    CL_FREE(binary);
+    *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+    return NULL;
+  }
+
+  if (p->build_opts && strstr(p->build_opts, "-create-library"))
+    build_for_lib = CL_TRUE;
+
+  p->build_status = CL_BUILD_IN_PROGRESS;
 
-  if (!check_cl_version_option(p, options)) {
-    err = CL_BUILD_PROGRAM_FAILURE;
-    goto error;
+  for (k = 0; k < num_input_programs; k++) {
+    if (CL_OBJECT_TAKE_OWNERSHIP(input_programs[k], CL_FALSE) == CL_FALSE)
+      break;
   }
-  if (options) {
-    if(p->build_opts == NULL || strcmp(options, p->build_opts) != 0) {
-      if(p->build_opts) {
-        CL_FREE(p->build_opts);
-        p->build_opts = NULL;
-      }
-      TRY_ALLOC (p->build_opts, CL_CALLOC(strlen(options) + 1, sizeof(char)));
-      memcpy(p->build_opts, options, strlen(options));
+  if (k != num_input_programs) { // Some one not ready
+    for (i = 0; i < k; i++) {
+      CL_OBJECT_RELEASE_OWNERSHIP(input_programs[i]);
     }
+    cl_program_delete(p);
+    CL_FREE(binary);
+    CL_FREE(binary_size);
+    *errcode_ret = CL_INVALID_VALUE;
+    return NULL;
   }
 
-  if (options == NULL && p->build_opts) {
-    CL_FREE(p->build_opts);
-    p->build_opts = NULL;
-  }
+  for (j = 0; j < num_devices; j++) {
+    device = device_list[j];
+    DEV_PRIVATE_DATA(p, device, pd);
 
-  if (p->source_type == FROM_SOURCE) {
-    if (!CompilerSupported()) {
-      err = CL_COMPILER_NOT_AVAILABLE;
-      goto error;
+    err = cl_compiler_check_available(device);
+    if (err != CL_SUCCESS)
+      break;
+
+    if ((device->compiler.check_compiler_option)(options) == CL_FALSE) {
+      err = CL_INVALID_LINKER_OPTIONS;
+      break;
     }
 
-    p->opaque = compiler_program_new_from_source(p->ctx->devices[0]->device_id, p->source, p->build_log_max_sz, options, p->build_log, &p->build_log_sz);
-    if (UNLIKELY(p->opaque == NULL)) {
-      if (p->build_log_sz > 0 && strstr(p->build_log, "error: error reading 'options'"))
-        err = CL_INVALID_BUILD_OPTIONS;
-      else
-        err = CL_BUILD_PROGRAM_FAILURE;
-      goto error;
+    for (i = 0; i < num_input_programs; i++) {
+      DEV_PRIVATE_DATA(input_programs[i], device, pdi);
+
+      if (input_programs[i]->build_status < CL_BUILD_NONE) {
+        err = CL_INVALID_OPERATION;
+        break;
+      }
+
+      if ((pdi->binary_type != CL_PROGRAM_BINARY_TYPE_LIBRARY) &&
+          (pdi->binary_type != CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT)) {
+        err = CL_INVALID_OPERATION;
+        break;
+      }
     }
+    if (err != CL_SUCCESS)
+      break;
 
-    /* Create all the kernels */
-    TRY (cl_program_load_gen_program, p);
-  } else if (p->source_type == FROM_LLVM || p->source_type == FROM_LLVM_SPIR) {
-    if (!CompilerSupported()) {
-      err = CL_COMPILER_NOT_AVAILABLE;
-      goto error;
+    for (i = 0; i < num_input_programs; i++) {
+      DEV_PRIVATE_DATA(input_programs[i], device, pdi);
+      binary[i] = pdi->binary;
+      binary_size[i] = pdi->binary_sz;
     }
 
-    compiler_program_build_from_llvm(p->opaque, p->build_log_max_sz, p->build_log, &p->build_log_sz, options);
-    if (UNLIKELY(p->opaque == NULL)) {
-      if (p->build_log_sz > 0 && strstr(p->build_log, "error: error reading 'options'"))
-        err = CL_INVALID_BUILD_OPTIONS;
+    build_ret = (device->compiler.link_program)(device->device_id, num_input_programs, binary, binary_size,
+                                                options, pd->build_log_max_sz, pd->build_log, &pd->build_log_sz,
+                                                &pd->binary, &pd->binary_sz);
+
+    if (build_ret == CL_FALSE) { // link stage failed: report linker error codes, not compile ones
+      if (pd->build_log_sz > 0 && strstr(pd->build_log, "error: error reading 'options'"))
+        err = CL_INVALID_LINKER_OPTIONS;
       else
-        err = CL_BUILD_PROGRAM_FAILURE;
-      goto error;
+        err = CL_LINK_PROGRAM_FAILURE;
+
+      break;
     }
-    /* Create all the kernels */
-    TRY (cl_program_load_gen_program, p);
-  } else if (p->source_type == FROM_BINARY && p->binary_type != CL_PROGRAM_BINARY_TYPE_EXECUTABLE) {
-    p->opaque = interp_program_new_from_binary(p->ctx->devices[0]->device_id, p->binary, p->binary_sz);
-    if (UNLIKELY(p->opaque == NULL)) {
-      err = CL_BUILD_PROGRAM_FAILURE;
-      goto error;
+
+    CL_REGISTER_ALLOC_PTR(pd->binary, pd->binary_sz);
+
+    if (build_for_lib) { // Create a lib, no further work
+      pd->binary_type = CL_PROGRAM_BINARY_TYPE_LIBRARY;
+      continue;
     }
 
-    /* Create all the kernels */
-    TRY (cl_program_load_gen_program, p);
+    pd->binary_type = CL_PROGRAM_BINARY_TYPE_EXECUTABLE;
+    err = (device->api.program_load_binary)(device, p);
+    if (err != CL_SUCCESS)
+      break;
   }
-  p->binary_type = CL_PROGRAM_BINARY_TYPE_EXECUTABLE;
 
-  for (i = 0; i < p->ker_n; i ++) {
-    const gbe_kernel opaque = interp_program_get_kernel(p->opaque, i);
-    p->bin_sz += interp_kernel_get_code_size(opaque);
+  for (k = 0; k < num_input_programs; k++) {
+    CL_OBJECT_RELEASE_OWNERSHIP(input_programs[k]);
   }
 
-  TRY_ALLOC (p->bin, CL_CALLOC(p->bin_sz, sizeof(char)));
-  for (i = 0; i < p->ker_n; i ++) {
-    const gbe_kernel opaque = interp_program_get_kernel(p->opaque, i);
-    size_t sz = interp_kernel_get_code_size(opaque);
-
-    memcpy(p->bin + copyed, interp_kernel_get_code(opaque), sz);
-    copyed += sz;
+  if (binary)
+    CL_FREE(binary);
+  if (binary_size)
+    CL_FREE(binary_size);
+  if (err != CL_SUCCESS) {
+    cl_program_delete(p);
+    p = NULL;
+  } else {
+    p->build_status = CL_BUILD_SUCCESS;
   }
-  if ((err = get_program_global_data(p)) != CL_SUCCESS)
-    goto error;
-
-  p->is_built = 1;
-  p->build_status = CL_BUILD_SUCCESS;
-  return CL_SUCCESS;
 
-error:
-  p->build_status = CL_BUILD_ERROR;
-  return err;
+  *errcode_ret = err;
+  return p;
 }
 
-cl_program
-cl_program_link(cl_context            context,
-                cl_uint               num_input_programs,
-                const cl_program *    input_programs,
-                const char *          options,
-                cl_int*               errcode_ret)
+LOCAL cl_int
+cl_program_get_kernel_names(cl_program p, cl_uint *kerne_num, size_t size, char *names, size_t *name_ret)
 {
-  cl_program p = NULL;
-  cl_int err = CL_SUCCESS;
-  cl_int i = 0;
-  int copyed = 0;
-  cl_bool ret = 0;
-  int avialable_program = 0;
-  //Although we don't use options, but still need check options
-  if(!compiler_program_check_opt(options)) {
-    err = CL_INVALID_LINKER_OPTIONS;
-    goto error;
+  /* This function will get all possible kernel names, at least one device has it */
+  char **known_kernel_list = NULL;
+  int known_kernel_num = 0;
+  int i, j, k;
+  int total_sz = 0;
+  char *ptr;
+
+  if (CL_OBJECT_TAKE_OWNERSHIP(p, CL_FALSE) == CL_FALSE)
+    return CL_INVALID_OPERATION;
+
+  if (p->build_status != CL_BUILD_SUCCESS) {
+    CL_OBJECT_RELEASE_OWNERSHIP(p);
+    return CL_INVALID_PROGRAM_EXECUTABLE;
   }
-  const char kernel_arg_option[] = "-cl-kernel-arg-info";
-  cl_bool option_exist = CL_TRUE;
-  for(i = 0; i < num_input_programs; i++) {
-    //num_input_programs >0 and input_programs MUST not NULL, so compare with input_programs[0] directly.
-    if(input_programs[i]->binary_type == CL_PROGRAM_BINARY_TYPE_LIBRARY ||
-       input_programs[i]->binary_type == CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT ||
-       input_programs[i]->binary_type == CL_PROGRAM_BINARY_TYPE_INTERMEDIATE) {
-      avialable_program++;
-    }
-    if(input_programs[i]->build_opts == NULL || strstr(input_programs[i]->build_opts, kernel_arg_option) == NULL ) {
-      option_exist = CL_FALSE;
+
+  for (i = 0; i < p->each_device_num; i++) {
+    if (p->each_device[i]->kernel_names == NULL)
+      continue;
+
+    if (known_kernel_list == NULL) {
+      assert(known_kernel_num == 0);
+      known_kernel_num = p->each_device[i]->kernel_num;
+      known_kernel_list = CL_CALLOC(known_kernel_num, sizeof(char *));
+      if (known_kernel_list == NULL)
+        goto ERROR;
+
+      memcpy(known_kernel_list, p->each_device[i]->kernel_names,
+             p->each_device[i]->kernel_num * sizeof(char *));
+      continue;
     }
-  }
 
-  //None of program contain a compilerd binary or library.
-  if(avialable_program == 0) {
-    goto done;
-  }
+    /* Compare the same name */
+    for (j = 0; j < p->each_device[i]->kernel_num; j++) {
+      k = 0;
+      for (; k < known_kernel_num; k++) {
+        if (strcmp(known_kernel_list[k], p->each_device[i]->kernel_names[j]) == 0)
+          break;
+      }
 
-  //Must all of program contain a compilerd binary or library.
-  if(avialable_program < num_input_programs) {
-    err = CL_INVALID_OPERATION;
-    goto error;
-  }
+      if (k == known_kernel_num) { // Append a new one
+        char **grown = CL_REALLOC(known_kernel_list, (known_kernel_num + 1) * sizeof(char *));
+        if (grown == NULL) // old list must stay reachable so ERROR can free it (assumes realloc semantics)
+          goto ERROR;
 
-  p = cl_program_new(context);
-  if (UNLIKELY(p == NULL)) {
-      err = CL_OUT_OF_HOST_MEMORY;
-      goto error;
+        known_kernel_list = grown;
+        known_kernel_list[known_kernel_num++] = p->each_device[i]->kernel_names[j];
+      }
+    }
   }
 
-  if(option_exist) {
-      TRY_ALLOC (p->build_opts, CL_CALLOC(strlen(kernel_arg_option) + 1, sizeof(char)));
-      memcpy(p->build_opts, kernel_arg_option, strlen(kernel_arg_option));
-  }
+  assert(known_kernel_num > 0);
 
-  if (!check_cl_version_option(p, options)) {
-    err = CL_BUILD_PROGRAM_FAILURE;
-    goto error;
+  if (kerne_num) {
+    *kerne_num = known_kernel_num;
   }
 
-  p->opaque = compiler_program_new_gen_program(context->devices[0]->device_id, NULL, NULL, NULL);
-  for(i = 0; i < num_input_programs; i++) {
-    // if program create with llvm binary, need deserilize first to get module.
-    if(input_programs[i])
-      ret = compiler_program_link_program(p->opaque, input_programs[i]->opaque,
-                                          p->build_log_max_sz, p->build_log, &p->build_log_sz);
-    if (UNLIKELY(ret)) {
-      err = CL_LINK_PROGRAM_FAILURE;
-      goto error;
-    }
+  if (names == NULL && name_ret == NULL) {
+    CL_OBJECT_RELEASE_OWNERSHIP(p);
+    CL_FREE(known_kernel_list);
+    return CL_SUCCESS;
   }
 
-  if(options && strstr(options, "-create-library")){
-    p->binary_type = CL_PROGRAM_BINARY_TYPE_LIBRARY;
-    goto done;
-  }else{
-    p->binary_type = CL_PROGRAM_BINARY_TYPE_EXECUTABLE;
+  total_sz = 0;
+  for (i = 0; i < known_kernel_num; i++) {
+    assert(known_kernel_list[i] && known_kernel_list[i][0] != 0);
+    total_sz += strlen(known_kernel_list[i]) + 1;
   }
 
-  compiler_program_build_from_llvm(p->opaque, p->build_log_max_sz, p->build_log, &p->build_log_sz, options);
+  if (name_ret)
+    *name_ret = total_sz;
 
-  /* Create all the kernels */
-  TRY (cl_program_load_gen_program, p);
-
-  for (i = 0; i < p->ker_n; i ++) {
-    const gbe_kernel opaque = interp_program_get_kernel(p->opaque, i);
-    p->bin_sz += interp_kernel_get_code_size(opaque);
+  if (names && size < total_sz) {
+    CL_OBJECT_RELEASE_OWNERSHIP(p);
+    CL_FREE(known_kernel_list);
+    return CL_INVALID_VALUE;
   }
 
-  TRY_ALLOC (p->bin, CL_CALLOC(p->bin_sz, sizeof(char)));
-  for (i = 0; i < p->ker_n; i ++) {
-    const gbe_kernel opaque = interp_program_get_kernel(p->opaque, i);
-    size_t sz = interp_kernel_get_code_size(opaque);
+  if (names) {
+    ptr = names;
+    for (i = 0; i < known_kernel_num; i++) {
+      memcpy(ptr, known_kernel_list[i], strlen(known_kernel_list[i]));
+      ptr[strlen(known_kernel_list[i])] = ';';
+      ptr = ptr + strlen(known_kernel_list[i]) + 1;
+    }
 
-    memcpy(p->bin + copyed, interp_kernel_get_code(opaque), sz);
-    copyed += sz;
+    names[total_sz - 1] = 0;
+    assert(ptr - names == total_sz);
   }
 
-  if ((err = get_program_global_data(p)) != CL_SUCCESS)
-    goto error;
+  CL_OBJECT_RELEASE_OWNERSHIP(p);
+  CL_FREE(known_kernel_list);
+  return CL_SUCCESS;
 
-done:
-  if(p) p->is_built = 1;
-  if(p) p->build_status = CL_BUILD_SUCCESS;
-  if (errcode_ret)
-    *errcode_ret = err;
-  return p;
+ERROR:
+  if (known_kernel_list)
+    CL_FREE(known_kernel_list);
 
-error:
-  if(p) p->build_status = CL_BUILD_ERROR;
-  if (errcode_ret)
-    *errcode_ret = err;
-  return p;
+  CL_OBJECT_RELEASE_OWNERSHIP(p);
+  return CL_OUT_OF_HOST_MEMORY;
 }
 
-#define FILE_PATH_LENGTH  1024
-LOCAL cl_int
-cl_program_compile(cl_program            p,
-                   cl_uint               num_input_headers,
-                   const cl_program *    input_headers,
-                   const char **         header_include_names,
-                   const char*           options)
+LOCAL cl_program
+cl_program_create_from_binary(cl_context ctx, cl_uint num_devices, const cl_device_id *devices,
+                              const size_t *lengths, const unsigned char **binaries,
+                              cl_int *binary_status, cl_int *errcode_ret)
 {
+  cl_program program = NULL;
+  cl_program_for_device pd = NULL;
   cl_int err = CL_SUCCESS;
-  int i = 0;
+  cl_uint i;
 
-  if (CL_OBJECT_GET_REF(p) > 1) {
-    err = CL_INVALID_OPERATION;
-    goto error;
-  }
+  assert(ctx);
 
-  if (!check_cl_version_option(p, options)) {
-    err = CL_BUILD_PROGRAM_FAILURE;
-    goto error;
+  program = cl_program_new(ctx);
+  if (program == NULL) {
+    *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+    return NULL;
   }
 
-  if (options) {
-    if(p->build_opts == NULL || strcmp(options, p->build_opts) != 0) {
-      if(p->build_opts) {
-        CL_FREE(p->build_opts);
-        p->build_opts = NULL;
+  for (i = 0; i < num_devices; i++) {
+    DEV_PRIVATE_DATA(program, devices[i], pd);
+    pd->binary = CL_MALLOC(lengths[i] * sizeof(char));
+    if (pd->binary == NULL) {
+      if (binary_status) {
+        binary_status[i] = CL_INVALID_VALUE; // Just set to this, no other err kind for it
       }
-      TRY_ALLOC (p->build_opts, CL_CALLOC(strlen(options) + 1, sizeof(char)));
-      memcpy(p->build_opts, options, strlen(options));
+
+      err = CL_OUT_OF_HOST_MEMORY;
+      break;
+    }
+    memcpy(pd->binary, binaries[i], lengths[i]);
+    pd->binary_sz = lengths[i];
+    err = (devices[i]->api.program_load_binary)(devices[i], program);
+    if (binary_status) {
+      binary_status[i] = err;
     }
-  }
 
-  if (options == NULL && p->build_opts) {
-    CL_FREE(p->build_opts);
-    p->build_opts = NULL;
+    if (err != CL_SUCCESS) {
+      break;
+    }
+
+    assert(pd->binary_type != CL_PROGRAM_BINARY_TYPE_NONE); // Must be something
   }
 
-#if defined(__ANDROID__)
-  char temp_header_template[]= "/data/local/tmp/beignet.XXXXXX";
-#else
-  char temp_header_template[]= "/tmp/beignet.XXXXXX";
-#endif
+  *errcode_ret = err;
+  if (err != CL_SUCCESS) {
+    cl_program_delete(program);
+    return NULL;
+  }
+  return program;
+}
 
-  char* temp_header_path = mkdtemp(temp_header_template);
+LOCAL cl_program
+cl_program_create_with_built_in_kernles(cl_context ctx,
+                                        cl_uint num_devices,
+                                        const cl_device_id *devices,
+                                        const char *kernel_names,
+                                        cl_int *errcode_ret)
+{
 
-  if (p->source_type == FROM_SOURCE) {
+  cl_uint i;
+  char *required_names = NULL;
+  char *name = NULL;
+  char *ptr;
+  int find;
+  cl_int err = CL_SUCCESS;
+  size_t *lengths;
+  const unsigned char **binaries;
+  cl_program prog = NULL;
 
-    if (!CompilerSupported()) {
-      err = CL_COMPILER_NOT_AVAILABLE;
-      goto error;
-    }
+  assert(ctx);
 
-    //write the headers to /tmp/beignet.XXXXXX for include.
-    for (i = 0; i < num_input_headers; i++) {
-      if(header_include_names[i] == NULL || input_headers[i] == NULL)
-        continue;
-      char temp_path[FILE_PATH_LENGTH]="";
-      strncat(temp_path, temp_header_path, strlen(temp_header_path));
-      strncat(temp_path, "/", 1);
-      strncat(temp_path, header_include_names[i], strlen(header_include_names[i]));
-      if(strlen(temp_path) >= FILE_PATH_LENGTH - 1 ) {
-        err = CL_COMPILE_PROGRAM_FAILURE;
-        goto error;
+  required_names = CL_MALLOC(strlen(kernel_names) + 1);
+  if (required_names == NULL) {
+    *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+    return NULL;
+  }
+  memcpy(required_names, kernel_names, strlen(kernel_names) + 1);
+
+  name = strtok(required_names, ";");
+  assert(name);
+
+  // check whether all kernel names are included in all devices's binary
+  while (name) {
+    for (i = 0; i < num_devices; i++) {
+      if (devices[i]->built_in_kernels == NULL) {
+        CL_FREE(required_names);
+        *errcode_ret = CL_INVALID_VALUE;
+        return NULL;
       }
-      temp_path[strlen(temp_path)+1] = '\0';
-      char* dirc = strdup(temp_path);
-      char* dir = dirname(dirc);
-      mkdir(dir, 0755);
-      if(access(dir, R_OK|W_OK) != 0){
-        err = CL_COMPILE_PROGRAM_FAILURE;
-        goto error;
+
+      ptr = NULL;
+      find = 0;
+
+      ptr = strstr(devices[i]->built_in_kernels, name);
+      while (ptr != NULL) {
+        /* Need to be whole match */
+        if (ptr != devices[i]->built_in_kernels && *(ptr - 1) != ';') { // Not the frist one
+          ptr = strstr(ptr, name);
+          continue;
+        }
+
+        if (ptr[strlen(name)] != ';' && ptr[strlen(name)] != 0) {
+          ptr = strstr(ptr, name);
+          continue;
+        }
+
+        find = 1;
+        break;
       }
-      free(dirc);
 
-      FILE* pfile = fopen(temp_path, "wb");
-      if(pfile){
-        fwrite(input_headers[i]->source, strlen(input_headers[i]->source), 1, pfile);
-        fclose(pfile);
-      }else{
-        err = CL_COMPILE_PROGRAM_FAILURE;
-        goto error;
+      if (find == 0) {
+        CL_FREE(required_names);
+        *errcode_ret = CL_INVALID_VALUE;
+        return NULL;
       }
     }
 
-    p->opaque = compiler_program_compile_from_source(p->ctx->devices[0]->device_id, p->source, temp_header_path,
-        p->build_log_max_sz, options, p->build_log, &p->build_log_sz);
+    name = strtok(NULL, ";");
+  }
 
-    char rm_path[255]="rm ";
-    strncat(rm_path, temp_header_path, strlen(temp_header_path));
-    strncat(rm_path, " -rf", 4);
-    int temp = system(rm_path);
+  CL_FREE(required_names);
 
-    if(temp){
-      assert(0);
-    }
+  /* OK, all kernels' name supported, create program */
+  lengths = CL_CALLOC(num_devices, sizeof(size_t));
+  if (lengths == NULL) {
+    *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+    return NULL;
+  }
+  binaries = CL_CALLOC(num_devices, sizeof(char *));
+  if (binaries == NULL) {
+    CL_FREE(lengths);
+    *errcode_ret = CL_OUT_OF_HOST_MEMORY;
+    return NULL;
+  }
 
-    if (UNLIKELY(p->opaque == NULL)) {
-      if (p->build_log_sz > 0 && strstr(p->build_log, "error: error reading 'options'"))
-        err = CL_INVALID_COMPILER_OPTIONS;
-      else
-        err = CL_COMPILE_PROGRAM_FAILURE;
-      goto error;
-    }
+  for (i = 0; i < num_devices; i++) {
+    lengths[i] = devices[i]->built_in_kernels_binary_sz;
+    binaries[i] = (unsigned char *)devices[i]->built_in_kernels_binary;
+  }
 
-    /* Create all the kernels */
-    p->binary_type = CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;
-  }else if(p->source_type == FROM_BINARY){
-    err = CL_INVALID_OPERATION;
-    return err;
+  prog = cl_program_create_from_binary(ctx, num_devices, devices,
+                                       lengths, binaries, NULL, &err);
+
+  CL_FREE(lengths);
+  CL_FREE(binaries);
+
+  if (prog) {
+    assert(err == CL_SUCCESS);
+    err = cl_program_build(prog, NULL, num_devices, devices);
+    if (err != CL_SUCCESS) {
+      cl_program_delete(prog);
+      prog = NULL;
+    }
   }
 
-  p->is_built = 1;
-  p->build_status = CL_BUILD_SUCCESS;
-  return CL_SUCCESS;
+  *errcode_ret = err;
+  return prog;
+}
 
-error:
-  p->build_status = CL_BUILD_ERROR;
-  return err;
+LOCAL cl_program
+cl_program_create_from_llvm(cl_context ctx,
+                            cl_uint num_devices,
+                            const cl_device_id *devices,
+                            const char *file_name,
+                            cl_int *errcode_ret)
+{
+  return NULL; /* Stub: no program is created and *errcode_ret is never written. */
 }
 
-LOCAL cl_kernel
-cl_program_create_kernel(cl_program p, const char *name, cl_int *errcode_ret)
+LOCAL cl_int
+cl_program_create_kernels_in_program(cl_program program, cl_uint num_kernels,
+                                     cl_kernel *kernels, cl_uint *num_kernels_ret)
 {
-  cl_kernel from = NULL, to = NULL;
   cl_int err = CL_SUCCESS;
-  uint32_t i = 0;
-
-#ifdef HAS_CMRT
-  if (p->cmrt_program != NULL) {
-    void* cmrt_kernel = cmrt_create_kernel(p, name);
-    if (cmrt_kernel != NULL) {
-      to = cl_kernel_new(p);
-      to->cmrt_kernel = cmrt_kernel;
-      goto exit;
-    } else {
-      err = CL_INVALID_KERNEL_NAME;
-      goto error;
-    }
-  }
-#endif
-
-  /* Find the program first */
-  for (i = 0; i < p->ker_n; ++i) {
-    assert(p->ker[i]);
-    const char *ker_name = cl_kernel_get_name(p->ker[i]);
-    if (ker_name != NULL && strcmp(ker_name, name) == 0) {
-      from = p->ker[i];
+  char **known_kernel_list = NULL;
+  int known_kernel_num = 0;
+  int all_kernel_num = 0; /* stays 0 (no executable) when the program has no device entry */
+  int i, j, k;
+  /* Intersect the kernel names of all devices, then create one cl_kernel per common name. */
+  if (CL_OBJECT_TAKE_OWNERSHIP(program, CL_FALSE) == CL_FALSE)
+    return CL_INVALID_OPERATION;
+  /* A name missing on any device is set to NULL in known_kernel_list and no longer counted. */
+  for (i = 0; i < program->each_device_num; i++) {
+    if (program->each_device[i]->kernel_names == NULL) {
+      all_kernel_num = 0;
       break;
     }
-  }
-
-  /* We were not able to find this named kernel */
-  if (UNLIKELY(from == NULL)) {
-    err = CL_INVALID_KERNEL_NAME;
-    goto error;
-  }
 
-  TRY_ALLOC(to, cl_kernel_dup(from));
+    if (known_kernel_list == NULL) {
+      assert(known_kernel_num == 0);
+      known_kernel_num = program->each_device[i]->kernel_num;
+      known_kernel_list = CL_CALLOC(known_kernel_num, sizeof(char *));
+      if (known_kernel_list == NULL) {
+        CL_OBJECT_RELEASE_OWNERSHIP(program);
+        return CL_OUT_OF_HOST_MEMORY;
+      }
 
-exit:
-  if (errcode_ret)
-    *errcode_ret = err;
-  return to;
-error:
-  cl_kernel_delete(to);
-  to = NULL;
-  goto exit;
-}
+      all_kernel_num = known_kernel_num;
+      memcpy(known_kernel_list, program->each_device[i]->kernel_names,
+             program->each_device[i]->kernel_num * sizeof(char *));
+      continue;
+    }
 
-LOCAL cl_int
-cl_program_create_kernels_in_program(cl_program p, cl_kernel* ker)
-{
-  int i = 0;
+    /* Find kernel names available for all devices. all_kernel_num is a running count:
+       it must not be reset here, or names dropped by an earlier device get counted again. */
+    for (k = 0; k < known_kernel_num; k++) {
+      if (known_kernel_list[k] == NULL)
+        continue;
 
-  if(ker == NULL)
-    return CL_SUCCESS;
+      for (j = 0; j < program->each_device[i]->kernel_num; j++) {
+        if (strcmp(known_kernel_list[k], program->each_device[i]->kernel_names[j]) == 0)
+          break;
+      }
 
-  for (i = 0; i < p->ker_n; ++i) {
-    TRY_ALLOC_NO_ERR(ker[i], cl_kernel_dup(p->ker[i]));
+      if (j == program->each_device[i]->kernel_num) { // Not found
+        known_kernel_list[k] = NULL;
+        all_kernel_num--;
+      }
+    }
   }
 
-  return CL_SUCCESS;
+  if (all_kernel_num == 0) {
+    if (known_kernel_list)
+      CL_FREE(known_kernel_list);
 
-error:
-  do {
-    cl_kernel_delete(ker[i]);
-    ker[i--] = NULL;
-  } while(i > 0);
+    CL_OBJECT_RELEASE_OWNERSHIP(program);
+    return CL_INVALID_PROGRAM_EXECUTABLE;
+  }
 
-  return CL_OUT_OF_HOST_MEMORY;
-}
+  assert(known_kernel_list);
+  if (kernels && all_kernel_num > num_kernels) {
+    CL_OBJECT_RELEASE_OWNERSHIP(program);
+    CL_FREE(known_kernel_list);
+    return CL_INVALID_VALUE;
+  }
 
-LOCAL void
-cl_program_get_kernel_names(cl_program p, size_t size, char *names, size_t *size_ret)
-{
-  int i = 0;
-  const char *ker_name = NULL;
-  size_t len = 0;
-  if(size_ret) *size_ret = 0;
+  if (num_kernels_ret)
+    *num_kernels_ret = all_kernel_num;
 
-  if(p->ker == NULL) {
-    return;
+  if (kernels == NULL) { // Query only: the count was reported above, nothing is created
+    CL_OBJECT_RELEASE_OWNERSHIP(program);
+    CL_FREE(known_kernel_list);
+    return CL_SUCCESS;
   }
+  CL_OBJECT_RELEASE_OWNERSHIP(program);
 
-  ker_name = cl_kernel_get_name(p->ker[0]);
-  if (ker_name != NULL)
-    len = strlen(ker_name);
-  else
-    len = 0;
-  if(names && ker_name) {
-    strncpy(names, ker_name, size - 1);
-    names[size - 1] = '\0';
-    if(size < len - 1) {
-      if(size_ret) *size_ret = size;
-      return;
-    }
-    size = size - len - 1;  //sub \0
+  /* Create each kernel */
+  j = 0;
+  for (i = 0; i < known_kernel_num; i++) {
+    if (known_kernel_list[i] == NULL)
+      continue;
+
+    kernels[j] = cl_kernel_create(program, known_kernel_list[i], &err);
+    if (err != CL_SUCCESS)
+      break;
+
+    j++;
   }
-  if(size_ret) *size_ret = len + 1;  //add NULL
-
-  for (i = 1; i < p->ker_n; ++i) {
-    ker_name = cl_kernel_get_name(p->ker[i]);
-    if (ker_name != NULL)
-      len = strlen(ker_name);
-    else
-      len = 0;
-    if(names && ker_name) {
-      strncat(names, ";", size);
-      if(size >= 1)
-        strncat(names, ker_name, size - 1);
-      if(size < len + 1) {
-        if(size_ret) *size_ret = size;
-        break;
-      }
-      size = size - len - 1;
+  /* Roll back on failure: delete the j kernels created before the failing one. */
+  if (err != CL_SUCCESS) {
+    for (i = 0; i < j; i++) {
+      assert(kernels[i] != NULL);
+      cl_kernel_delete(kernels[i]);
+      kernels[i] = NULL;
     }
-    if(size_ret) *size_ret += len + 1; //add ';'
   }
+
+  CL_FREE(known_kernel_list);
+  return err;
+}
+
+/* Some programs are used internally and need to stay cached all the time.
+   NOT MT-safe; the caller must take care of the program's reference and device list.
+   This kind of program cannot be built or compiled. */
+LOCAL void
+cl_program_take_out_of_context(cl_program p)
+{
+  assert(p->ctx);
+
+  /* Remove it from the context list */
+  cl_context_remove_program(p->ctx, p);
 }
diff --git a/src/cl_program.h b/src/cl_program.h
index 4afa553..c94c6f3 100644
--- a/src/cl_program.h
+++ b/src/cl_program.h
@@ -14,7 +14,6 @@
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library. If not, see <http://www.gnu.org/licenses/>.
  *
- * Author: Benjamin Segovia <benjamin.segovia at intel.com>
  */
 
 #ifndef __CL_PROGRAM_H__
@@ -45,131 +44,57 @@ typedef struct _cl_program_for_device {
 } _cl_program_for_device;
 typedef _cl_program_for_device *cl_program_for_device;
 
-// This is the structure ouput by the compiler
-struct _gbe_program;
-
-enum {
-  FROM_SOURCE = 0,
-  FROM_LLVM = 1,
-  FROM_BINARY = 2,
-  FROM_LLVM_SPIR = 3,
-  FROM_CMRT = 4,
-};
-
-typedef enum _BINARY_HEADER_INDEX {
-  BHI_SPIR = 0,
-  BHI_COMPIRED_OBJECT = 1,
-  BHI_LIBRARY = 2,
-  BHI_GEN_BINARY = 3,
-  BHI_CMRT = 4,
-  BHI_MAX,
-}BINARY_HEADER_INDEX;
-
 /* This maps an OCL file containing some kernels */
 struct _cl_program {
   _cl_base_object base;
-  gbe_program opaque;     /* (Opaque) program as ouput by the compiler */
-  cl_kernel *ker;         /* All kernels included by the OCL file */
-  cl_program prev, next;  /* We chain the programs together */
-  cl_context ctx;         /* Its parent context */
-  cl_buffer  global_data;
-  char * global_data_ptr;
-  char *bin;              /* The program copied verbatim */
-  size_t bin_sz;          /* Its size in memory */
-  char *source;           /* Program sources */
-  char *binary;           /* Program binary. */
-  size_t binary_sz;       /* The binary size. */
-  uint32_t binary_type;   /* binary type: COMPILED_OBJECT(LLVM IR), LIBRARY(LLVM IR with option "-create-library"), or EXECUTABLE(GEN binary). */
-                          /* ext binary type: BINARY_TYPE_INTERMIDIATE. */
-  uint32_t ker_n;         /* Number of declared kernels */
-  uint32_t source_type:3; /* Built from binary, source, CMRT or LLVM*/
-  uint32_t is_built:1;    /* Did we call clBuildProgram on it? */
-  int32_t build_status;   /* build status. */
-  char *build_opts;       /* The build options for this program */
-  size_t build_log_max_sz; /*build log maximum size in byte.*/
-  char *build_log;         /* The build log for this program. */
-  size_t build_log_sz;    /* The actual build log size.*/
-
-  void* cmrt_program;      /* real type: CmProgram* */
+  cl_context ctx;                     /* Its parent context */
+  char *source;                       /* Program sources */
+  size_t source_sz;                   /* The source size. */
+  cl_uint each_device_num;            /* Number of entries in each_device */
+  cl_program_for_device *each_device; /* Per-device program content, each_device_num entries */
+  char *build_opts;                   /* The build options for this program */
+  cl_int build_status;                /* build status. */
+  list_head kernels;                  /* All kernels belonging to this program. */
+  cl_uint ker_n;                      /* Number of declared kernels */
 };
 
 #define CL_OBJECT_PROGRAM_MAGIC 0x34562ab12789cdefLL
-#define CL_OBJECT_IS_PROGRAM(obj) ((obj &&                           \
-         ((cl_base_object)obj)->magic == CL_OBJECT_PROGRAM_MAGIC &&  \
-         CL_OBJECT_GET_REF(obj) >= 1))
+#define CL_OBJECT_IS_PROGRAM(obj) ((obj &&                                                     \
+                                    ((cl_base_object)obj)->magic == CL_OBJECT_PROGRAM_MAGIC && \
+                                    CL_OBJECT_GET_REF(obj) >= 1))
 
-/* Create a empty program */
 extern cl_program cl_program_new(cl_context);
-
-/* Destroy and deallocate an empty kernel */
 extern void cl_program_delete(cl_program);
-
-/* Add one more reference to the object (to defer its deletion) */
 extern void cl_program_add_ref(cl_program);
-
-/* Create a kernel for the OCL user */
-extern cl_kernel cl_program_create_kernel(cl_program, const char*, cl_int*);
-
-/* creates kernel objects for all kernel functions in program. */
-extern cl_int cl_program_create_kernels_in_program(cl_program, cl_kernel*);
-
-/* Create a program from OCL source */
-extern cl_program
-cl_program_create_from_source(cl_context ctx,
-                              cl_uint count,
-                              const char **strings,
-                              const size_t *lengths,
-                              cl_int *errcode_ret);
-
-/* Directly create a program from a blob */
-extern cl_program
-cl_program_create_from_binary(cl_context             context,
-                              cl_uint                num_devices,
-                              const cl_device_id *   devices,
-                              const size_t *         lengths,
-                              const unsigned char ** binaries,
-                              cl_int *               binary_status,
-                              cl_int *               errcode_ret);
+extern cl_int cl_program_create_kernels_in_program(cl_program program, cl_uint num_kernels,
+                                                   cl_kernel *kernels, cl_uint *num_kernels_ret);
+extern cl_program cl_program_create_from_binary(cl_context ctx, cl_uint num_devices,
+                                                const cl_device_id *devices, const size_t *lengths,
+                                                const unsigned char **binaries, cl_int *binary_status,
+                                                cl_int *errcode_ret);
+extern cl_int cl_program_build(cl_program p, const char *options, cl_uint num_devices, const cl_device_id *device_list);
+extern cl_int cl_program_compile(cl_program p, cl_uint num_input_headers, const cl_program *input_headers,
+                                 const char **header_include_names, const char *options,
+                                 cl_uint num_devices, const cl_device_id *device_list);
+extern cl_program cl_program_link(cl_context context, cl_uint num_devices, const cl_device_id *device_list,
+                                  cl_uint num_input_programs, const cl_program *input_programs,
+                                  const char *options, cl_int *errcode_ret);
+extern cl_int cl_program_get_kernel_names(cl_program p, cl_uint *kerne_num, size_t size, char *names, size_t *name_ret);
+extern cl_program cl_program_create_with_built_in_kernles(cl_context context, cl_uint num_devices,
+                                                          const cl_device_id *device_list, const char *kernel_names,
+                                                          cl_int *errcode_ret);
+extern void cl_program_take_out_of_context(cl_program p);
 
 /* Create a program with built-in kernels*/
-extern cl_program
-cl_program_create_with_built_in_kernles(cl_context     context,
-                                  cl_uint              num_devices,
-                                  const cl_device_id * device_list,
-                                  const char *         kernel_names,
-                                  cl_int *             errcode_ret);
 /* Directly create a program from a LLVM source file */
 extern cl_program
-cl_program_create_from_llvm(cl_context             context,
-                            cl_uint                num_devices,
-                            const cl_device_id *   devices,
-                            const char *           fileName,
-                            cl_int *               errcode_ret);
+cl_program_create_from_llvm(cl_context context,
+                            cl_uint num_devices,
+                            const cl_device_id *devices,
+                            const char *fileName,
+                            cl_int *errcode_ret);
 
-/* Build the program as specified by OCL */
-extern cl_int
-cl_program_build(cl_program p, const char* options);
-/* Compile the program as specified by OCL */
-extern cl_int
-cl_program_compile(cl_program            p,
-                   cl_uint               num_input_headers,
-                   const cl_program *    input_headers,
-                   const char **         header_include_names,
-                   const char*           options);
-/* link the program as specified by OCL */
-extern cl_program
-cl_program_link(cl_context            context,
-                cl_uint               num_input_programs,
-                const cl_program *    input_programs,
-                const char *          options,
-                cl_int*               errcode_ret);
-/* Get the kernel names in program */
-extern void
-cl_program_get_kernel_names(cl_program p,
-                            size_t size,
-                            char *names,
-                            size_t *size_ret);
-extern size_t
-cl_program_get_global_variable_size(cl_program p);
-#endif /* __CL_PROGRAM_H__ */
+/* Create a kernel for the OCL user */
+extern cl_kernel cl_program_create_user_kernel(cl_program, const char *, cl_int *);
 
+#endif /* __CL_PROGRAM_H__ */
diff --git a/src/cl_sampler.c b/src/cl_sampler.c
index 69d90e6..87406c3 100644
--- a/src/cl_sampler.c
+++ b/src/cl_sampler.c
@@ -26,48 +26,41 @@
 
 #include <assert.h>
 
-uint32_t cl_to_clk(cl_bool normalized_coords,
-                   cl_addressing_mode address,
-                   cl_filter_mode filter)
+static uint32_t
+sampler_cl_to_clk(cl_bool normalized_coords, cl_addressing_mode address, cl_filter_mode filter)
 {
   int clk_address = CLK_ADDRESS_NONE;
   int clk_filter = CLK_FILTER_NEAREST;
   switch (address) {
-  case CL_ADDRESS_NONE: clk_address = CLK_ADDRESS_NONE; break;
-  case CL_ADDRESS_CLAMP: clk_address = CLK_ADDRESS_CLAMP; break;
-  case CL_ADDRESS_CLAMP_TO_EDGE: clk_address = CLK_ADDRESS_CLAMP_TO_EDGE; break;
-  case CL_ADDRESS_REPEAT: clk_address = CLK_ADDRESS_REPEAT; break;
-  case CL_ADDRESS_MIRRORED_REPEAT: clk_address = CLK_ADDRESS_MIRRORED_REPEAT; break;
+  case CL_ADDRESS_NONE: /* each CL_ADDRESS_* value maps to the CLK_ADDRESS_* constant of the same name */
+    clk_address = CLK_ADDRESS_NONE;
+    break;
+  case CL_ADDRESS_CLAMP:
+    clk_address = CLK_ADDRESS_CLAMP;
+    break;
+  case CL_ADDRESS_CLAMP_TO_EDGE:
+    clk_address = CLK_ADDRESS_CLAMP_TO_EDGE;
+    break;
+  case CL_ADDRESS_REPEAT:
+    clk_address = CLK_ADDRESS_REPEAT;
+    break;
+  case CL_ADDRESS_MIRRORED_REPEAT:
+    clk_address = CLK_ADDRESS_MIRRORED_REPEAT;
+    break;
   default:
     assert(0);
   }
-  switch(filter) {
-  case CL_FILTER_NEAREST: clk_filter = CLK_FILTER_NEAREST; break;
-  case CL_FILTER_LINEAR: clk_filter = CLK_FILTER_LINEAR; break;
+  switch (filter) { /* same name-for-name mapping for the filter mode */
+  case CL_FILTER_NEAREST:
+    clk_filter = CLK_FILTER_NEAREST;
+    break;
+  case CL_FILTER_LINEAR:
+    clk_filter = CLK_FILTER_LINEAR;
+    break;
   default:
     assert(0);
   }
-  return (clk_address << __CLK_ADDRESS_BASE)
-         | (normalized_coords << __CLK_NORMALIZED_BASE)
-         | (clk_filter);
-}
-
-#define IS_SAMPLER_ARG(v) (v & __CLK_SAMPLER_ARG_KEY_BIT)
-#define SAMPLER_ARG_ID(v) ((v & __CLK_SAMPLER_ARG_MASK) >> __CLK_SAMPLER_ARG_BASE)
-int cl_set_sampler_arg_slot(cl_kernel k, int index, cl_sampler sampler)
-{
-  int slot_id;
-  for(slot_id = 0; slot_id < k->sampler_sz; slot_id++)
-  {
-    if (IS_SAMPLER_ARG(k->samplers[slot_id])) {
-     if (SAMPLER_ARG_ID(k->samplers[slot_id]) == index) {
-       k->samplers[slot_id] = (k->samplers[slot_id] & (~__CLK_SAMPLER_MASK))
-                              | sampler->clkSamplerValue;
-       return slot_id;
-     }
-    }
-  }
-  return -1;
+  return (clk_address << __CLK_ADDRESS_BASE) | (normalized_coords << __CLK_NORMALIZED_BASE) | (clk_filter); /* all three packed into one value */
 }
 
 LOCAL cl_sampler
@@ -91,8 +84,7 @@ cl_create_sampler(cl_context ctx, cl_bool normalized_coords, cl_addressing_mode
   /* Append the sampler in the context sampler list */
   cl_context_add_sampler(ctx, sampler);
 
-  // TODO: May move it to other place, it's not a common sampler logic.
-  sampler->clkSamplerValue = cl_to_clk(normalized_coords, address, filter);
+  sampler->clkSamplerValue = sampler_cl_to_clk(normalized_coords, address, filter);
 
   *errcode_ret = CL_SUCCESS;
   return sampler;
@@ -117,4 +109,3 @@ cl_sampler_add_ref(cl_sampler sampler)
   assert(sampler);
   CL_OBJECT_INC_REF(sampler);
 }
-
diff --git a/src/gen/cl_command_queue_gen.c b/src/gen/cl_command_queue_gen.c
index c2f3888..8bbfe2c 100644
--- a/src/gen/cl_command_queue_gen.c
+++ b/src/gen/cl_command_queue_gen.c
@@ -176,11 +176,15 @@ gen_gpgpu_setup_curbe(cl_kernel kernel, cl_kernel_gen kernel_gen, gen_gpgpu *gpu
     if (kernel->args[i].arg_type == ArgTypePointer &&
         kernel->args[i].arg_addrspace == AddressSpaceLocal) { // SLM setting
       assert(kernel->args[i].val_size > 0);
-      assert(kernel->args[i].arg_size == sizeof(uint32_t));
+      assert(kernel->args[i].arg_size == sizeof(uint32_t) || kernel->args[i].arg_size == sizeof(uint64_t));
       assert(kernel_gen->arg_extra_info[i].arg_align > 0);
       // Need to be aligned address
       slm_offset = ALIGN(slm_offset, kernel_gen->arg_extra_info[i].arg_align);
-      *((uint32_t *)(curbe + kernel_gen->arg_extra_info[i].arg_offset)) = slm_offset;
+      if (kernel->args[i].arg_size == sizeof(uint32_t)) {
+        *((uint32_t *)(curbe + kernel_gen->arg_extra_info[i].arg_offset)) = slm_offset;
+      } else {
+        *((uint64_t *)(curbe + kernel_gen->arg_extra_info[i].arg_offset)) = slm_offset;
+      }
       slm_offset += kernel->args[i].val_size;
       continue;
     }
@@ -381,7 +385,7 @@ gen_setup_constant_buffer(cl_kernel kernel, cl_kernel_gen kernel_gen, gen_gpgpu
 
   if (prog_gen->rodata) {
     const_buf_size = prog_gen->rodata_data->d_size;
-    aligned_const_buf_size = ALIGN(const_buf_size, 4);
+    aligned_const_buf_size = ALIGN(const_buf_size, 8);
   } else {
     /* Reserve 8 bytes to get rid of 0 address */
     aligned_const_buf_size = 8;
@@ -421,28 +425,41 @@ gen_setup_constant_buffer(cl_kernel kernel, cl_kernel_gen kernel_gen, gen_gpgpu
   /* upload the global constant data, in rodata */
   if (prog_gen->rodata && prog_gen->rodata_data->d_size > 0) {
     memcpy(const_buf_addr, prog_gen->rodata_data->d_buf, prog_gen->rodata_data->d_size);
-    addr_offset += prog_gen->rodata_data->d_size;
+    addr_offset = prog_gen->rodata_data->d_size;
+    addr_offset = ALIGN(addr_offset, 8);
   } else {
     addr_offset = 8;
   }
 
   /* Upload constant ptr content */
   for (i = 0; i < kernel->arg_n; i++) {
+    cl_uint ptr_val = 0;
+
     if (kernel->args[i].arg_type != ArgTypePointer)
       continue;
     if (kernel->args[i].arg_addrspace != AddressSpaceConstant)
       continue;
 
+    assert(kernel_gen->arg_extra_info[i].arg_align > 0);
     addr_offset = ALIGN(addr_offset, kernel_gen->arg_extra_info[i].arg_align);
-
-    /* Set curbe */
-    *(uint32_t *)(gpu->thread.curbe + kernel_gen->arg_extra_info[i].arg_offset) = addr_offset;
+    assert(kernel->args[i].arg_size == sizeof(uint32_t) || kernel->args[i].arg_size == sizeof(uint64_t));
 
     mem = (cl_mem)kernel->args[i].val.val_ptr;
-    drm_intel_bo_map((drm_intel_bo *)mem->bo, 1);
-    memcpy(const_buf_addr + addr_offset, ((drm_intel_bo *)(mem->bo))->virtual, mem->size);
-    drm_intel_bo_unmap((drm_intel_bo *)mem->bo);
-    addr_offset += mem->size;
+    if (mem) {
+      drm_intel_bo_map((drm_intel_bo *)mem->bo, 1);
+      memcpy(const_buf_addr + addr_offset, ((drm_intel_bo *)(mem->bo))->virtual, mem->size);
+      drm_intel_bo_unmap((drm_intel_bo *)mem->bo);
+      ptr_val = addr_offset;
+      addr_offset += mem->size;
+      addr_offset = ALIGN(addr_offset, kernel_gen->arg_extra_info[i].arg_align);
+    }
+
+    /* Set curbe */
+    if (kernel->args[i].arg_size == sizeof(uint32_t)) {
+      *(uint32_t *)(gpu->thread.curbe + kernel_gen->arg_extra_info[i].arg_offset) = ptr_val;
+    } else {
+      *(uint64_t *)(gpu->thread.curbe + kernel_gen->arg_extra_info[i].arg_offset) = ptr_val;
+    }
   }
 
   drm_intel_bo_unmap(gpu->mem.const_bo);
@@ -641,12 +658,12 @@ gen_gpu_compute_batch_sz(cl_kernel k)
 
 /* This is a very important function. It is responsible for loading and setting GPU
    execution context based on the cl_kernel and kernel's arguments. */
-LOCAL cl_int
-cl_command_queue_ND_range_gen(cl_command_queue queue, cl_kernel kernel, cl_event event,
-                              const uint32_t work_dim, const size_t *global_wk_off,
-                              const size_t *global_dim_off, const size_t *global_wk_sz,
-                              const size_t *global_wk_sz_use, const size_t *local_wk_sz,
-                              const size_t *local_wk_sz_use)
+static cl_int
+cl_command_queue_ND_range_gen_once(cl_command_queue queue, cl_kernel kernel, cl_event event,
+                                   const uint32_t work_dim, const size_t *global_wk_off,
+                                   const size_t *global_dim_off, const size_t *global_wk_sz,
+                                   const size_t *global_wk_sz_use, const size_t *local_wk_sz,
+                                   const size_t *local_wk_sz_use)
 {
   cl_int ret = CL_SUCCESS;
   gen_gpgpu *gpu = NULL;
@@ -785,6 +802,62 @@ cl_command_queue_ND_range_gen(cl_command_queue queue, cl_kernel kernel, cl_event
   return ret;
 }
 
+LOCAL cl_int
+cl_command_queue_ND_range_gen(cl_command_queue queue, cl_kernel ker, cl_event event,
+                              const uint32_t work_dim, const size_t *global_wk_off,
+                              const size_t *global_wk_sz, const size_t *local_wk_sz)
+{
+  /* Handles non-uniform work group sizes: the global size need not be a multiple of the local size. */
+  cl_int err = CL_SUCCESS;
+  int i, j, k;
+  const size_t global_wk_sz_div[3] = {
+    global_wk_sz[0] / local_wk_sz[0] * local_wk_sz[0],
+    global_wk_sz[1] / local_wk_sz[1] * local_wk_sz[1],
+    global_wk_sz[2] / local_wk_sz[2] * local_wk_sz[2]};
+  /* NOTE(review): local_wk_sz[] is used as a divisor, so all three entries must be non-zero - confirm callers. */
+  const size_t global_wk_sz_rem[3] = {
+    global_wk_sz[0] % local_wk_sz[0],
+    global_wk_sz[1] % local_wk_sz[1],
+    global_wk_sz[2] % local_wk_sz[2]};
+  /* [0] = part covered by whole work groups (div), [1] = leftover work items (rem). */
+  const size_t *global_wk_all[2] = {global_wk_sz_div, global_wk_sz_rem};
+  /* Go through the at most 8 cases and enqueue if there are work items left */
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 2; j++) {
+      for (k = 0; k < 2; k++) {
+        size_t global_wk_sz_use[3] = {global_wk_all[k][0], global_wk_all[j][1], global_wk_all[i][2]};
+        size_t global_dim_off[3] = {
+          k * global_wk_sz_div[0] / local_wk_sz[0],
+          j * global_wk_sz_div[1] / local_wk_sz[1],
+          i * global_wk_sz_div[2] / local_wk_sz[2]};
+        size_t local_wk_sz_use[3] = {
+          k ? global_wk_sz_rem[0] : local_wk_sz[0],
+          j ? global_wk_sz_rem[1] : local_wk_sz[1],
+          i ? global_wk_sz_rem[2] : local_wk_sz[2]};
+        if (local_wk_sz_use[0] == 0 || local_wk_sz_use[1] == 0 || local_wk_sz_use[2] == 0)
+          continue;
+        /* Combinations with a zero local size in any dimension were skipped above; enqueue the rest. */
+        err = cl_command_queue_ND_range_gen_once(queue, ker, event, work_dim, global_wk_off, global_dim_off,
+                                                 global_wk_sz, global_wk_sz_use, local_wk_sz, local_wk_sz_use);
+        if (err != CL_SUCCESS)
+          return err;
+
+        /* TODO: need to handle events for multiple enqueue, now is a workaround for uniform group size */
+        if (!(global_wk_sz_rem[0] == 0 && global_wk_sz_rem[1] == 0 && global_wk_sz_rem[2] == 0))
+          err = cl_command_queue_wait_flush(queue);
+        if (err != CL_SUCCESS)
+          return err;
+      }
+      if (work_dim < 2)
+        break;
+    }
+    if (work_dim < 3)
+      break;
+  }
+
+  return err;
+}
+
 LOCAL int
 cl_command_queue_flush_gpgpu(void *gpgpu)
 {
diff --git a/src/gen/cl_device_id_gen.c b/src/gen/cl_device_id_gen.c
index 35e9025..456c7c4 100644
--- a/src/gen/cl_device_id_gen.c
+++ b/src/gen/cl_device_id_gen.c
@@ -35,7 +35,7 @@ static _cl_device_api __gen_device_api = {
   .kernel_delete = cl_kernel_delete_gen,
   .kernel_create = cl_kernel_create_gen,
   .get_kernel_info = cl_kernel_get_info_gen,
-  .ND_range_kernel = cl_command_queue_ND_range_gen_wrap,
+  .ND_range_kernel = cl_command_queue_ND_range_gen,
   .mem_copy = cl_mem_copy_gen,
   .mem_fill = cl_mem_fill_gen,
   .mem_copy_rect = cl_mem_copy_buffer_rect_gen,
@@ -881,7 +881,7 @@ static struct _cl_device_id_gen device_gen;
 static cl_device_id __gen_device = NULL;
 
 LOCAL cl_device_id
-cl_get_device_id_gen(cl_platform_id platform)
+cl_device_get_id_gen(cl_platform_id platform)
 {
   static int inited = 0;
   cl_device_id dev = NULL;
@@ -950,7 +950,7 @@ cl_get_device_id_gen(cl_platform_id platform)
 }
 
 LOCAL void
-cl_device_id_gen_cleanup(void)
+cl_device_gen_cleanup(void)
 {
   int i;
   cl_device_id_gen gen_dev = NULL;
@@ -972,3 +972,24 @@ cl_device_id_gen_cleanup(void)
     }
   }
 }
+
+LOCAL cl_int
+cl_device_get_version_gen(cl_device_id device, cl_int *ver)
+{
+  if (device != __gen_device) /* only the device held in __gen_device is accepted */
+    return CL_INVALID_DEVICE;
+  /* Store in *ver the generation matched by the IS_GEN* macros: 7, 75, 8 or 9. */
+  if (IS_GEN7(device->device_id)) {
+    *ver = 7;
+  } else if (IS_GEN75(device->device_id)) {
+    *ver = 75;
+  } else if (IS_GEN8(device->device_id)) {
+    *ver = 8;
+  } else if (IS_GEN9(device->device_id)) {
+    *ver = 9;
+  } else {
+    return CL_INVALID_VALUE; /* no macro matched; *ver is left untouched */
+  }
+
+  return CL_SUCCESS;
+}
diff --git a/src/gen/cl_gen.h b/src/gen/cl_gen.h
index 6cdc405..d04a644 100644
--- a/src/gen/cl_gen.h
+++ b/src/gen/cl_gen.h
@@ -24,6 +24,7 @@
 #include "cl_utils.h"
 #include "cl_alloc.h"
 #include "cl_platform_id.h"
+#include "cl_device_id.h"
 #include "cl_mem.h"
 #include "cl_image.h"
 #include "cl_device_id.h"
@@ -35,6 +36,62 @@
 #include <gelf.h>
 #include <string.h>
 
+/*************************************** Device ******************************************/
+enum cl_internal_kernel_type_gen { // All internal kernel types for gen
+  CL_INTERNAL_KERNEL_MIN = 0,
+  CL_ENQUEUE_COPY_BUFFER_ALIGN4 = 0,
+  CL_ENQUEUE_COPY_BUFFER_ALIGN16,
+  CL_ENQUEUE_COPY_BUFFER_UNALIGN_SAME_OFFSET,
+  CL_ENQUEUE_COPY_BUFFER_UNALIGN_DST_OFFSET,
+  CL_ENQUEUE_COPY_BUFFER_UNALIGN_SRC_OFFSET,
+  CL_ENQUEUE_COPY_BUFFER_RECT,
+  CL_ENQUEUE_COPY_BUFFER_RECT_ALIGN4,
+  CL_ENQUEUE_COPY_IMAGE_1D_TO_1D,             //copy image 1d to image 1d
+  CL_ENQUEUE_COPY_IMAGE_2D_TO_2D,             //copy image 2d to image 2d
+  CL_ENQUEUE_COPY_IMAGE_3D_TO_2D,             //copy image 3d to image 2d
+  CL_ENQUEUE_COPY_IMAGE_2D_TO_3D,             //copy image 2d to image 3d
+  CL_ENQUEUE_COPY_IMAGE_3D_TO_3D,             //copy image 3d to image 3d
+  CL_ENQUEUE_COPY_IMAGE_2D_TO_2D_ARRAY,       //copy image 2d to image 2d array
+  CL_ENQUEUE_COPY_IMAGE_1D_ARRAY_TO_1D_ARRAY, //copy image 1d array to image 1d array
+  CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_2D_ARRAY, //copy image 2d array to image 2d array
+  CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_2D,       //copy image 2d array to image 2d
+  CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_3D,       //copy image 2d array to image 3d
+  CL_ENQUEUE_COPY_IMAGE_3D_TO_2D_ARRAY,       //copy image 3d to image 2d array
+  CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER,         //copy image 2d to buffer
+  CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN16,
+  CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER, //copy image 3d to buffer
+  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D, //copy buffer to image 2d
+  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN16,
+  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D, //copy buffer to image 3d
+  CL_ENQUEUE_FILL_BUFFER_UNALIGN,     //fill buffer with 1 aligned pattern, pattern size=1
+  CL_ENQUEUE_FILL_BUFFER_ALIGN2,      //fill buffer with 2 aligned pattern, pattern size=2
+  CL_ENQUEUE_FILL_BUFFER_ALIGN4,      //fill buffer with 4 aligned pattern, pattern size=4
+  CL_ENQUEUE_FILL_BUFFER_ALIGN8_8,    //fill buffer with 8 aligned pattern, pattern size=8
+  CL_ENQUEUE_FILL_BUFFER_ALIGN8_16,   //fill buffer with 16 aligned pattern, pattern size=16
+  CL_ENQUEUE_FILL_BUFFER_ALIGN8_32,   //fill buffer with 16 aligned pattern, pattern size=32
+  CL_ENQUEUE_FILL_BUFFER_ALIGN8_64,   //fill buffer with 16 aligned pattern, pattern size=64
+  CL_ENQUEUE_FILL_BUFFER_ALIGN128,    //fill buffer with 128 aligned pattern, pattern size=128
+  CL_ENQUEUE_FILL_IMAGE_1D,           //fill image 1d
+  CL_ENQUEUE_FILL_IMAGE_1D_ARRAY,     //fill image 1d array
+  CL_ENQUEUE_FILL_IMAGE_2D,           //fill image 2d
+  CL_ENQUEUE_FILL_IMAGE_2D_ARRAY,     //fill image 2d array
+  CL_ENQUEUE_FILL_IMAGE_3D,           //fill image 3d
+  CL_INTERNAL_KERNEL_MAX              //number of internal kernel types (first value is 0)
+};
+
+typedef struct _cl_device_id_gen {
+  _cl_device_id base;
+  /* Programs and kernels used internally, for example by the clEnqueuexxx APIs */
+  cl_program internal_program[CL_INTERNAL_KERNEL_MAX];
+  cl_kernel internal_kernels[CL_INTERNAL_KERNEL_MAX];
+} _cl_device_id_gen;
+typedef _cl_device_id_gen *cl_device_id_gen;
+
+extern char *cl_internal_built_in_kernel_str_kernels;
+extern char *cl_internal_built_in_kernel_str;
+extern size_t cl_internal_built_in_kernel_str_size;
+extern cl_device_id cl_device_get_id_gen(cl_platform_id platform);
+
 /*********************************** Kernel *****************************************/
 /* Special virtual registers for OpenCL */
 typedef enum cl_gen_virt_reg {
@@ -166,13 +223,6 @@ extern cl_int cl_command_queue_ND_range_gen(cl_command_queue queue, cl_kernel ke
 extern cl_int cl_compiler_load_gen(cl_device_id device);
 extern cl_int cl_compiler_unload_gen(cl_device_id device);
 
-/************************************* Device *******************************************/
-extern char *cl_internal_built_in_kernel_str_kernels;
-extern char *cl_internal_built_in_kernel_str;
-extern size_t cl_internal_built_in_kernel_str_size;
-
-extern cl_device_id cl_get_device_id_gen(cl_platform_id platform);
-
 /*************************************** Mem *******************************************/
 extern cl_int cl_mem_copy_gen(cl_command_queue queue, cl_event event, cl_mem src_buf, cl_mem dst_buf,
                               size_t src_offset, size_t dst_offset, size_t cb);
@@ -197,48 +247,6 @@ extern cl_int cl_mem_copy_buffer_to_image_gen(cl_command_queue queue, cl_event e
                                               const size_t *dst_origin, const size_t *region);
 
 /*********************************** Context *****************************************/
-enum cl_internal_kernel_type_gen { // All internal kernel types for gen
-  CL_INTERNAL_KERNEL_MIN = 0,
-  CL_ENQUEUE_COPY_BUFFER_ALIGN4 = 0,
-  CL_ENQUEUE_COPY_BUFFER_ALIGN16,
-  CL_ENQUEUE_COPY_BUFFER_UNALIGN_SAME_OFFSET,
-  CL_ENQUEUE_COPY_BUFFER_UNALIGN_DST_OFFSET,
-  CL_ENQUEUE_COPY_BUFFER_UNALIGN_SRC_OFFSET,
-  CL_ENQUEUE_COPY_BUFFER_RECT,
-  CL_ENQUEUE_COPY_BUFFER_RECT_ALIGN4,
-  CL_ENQUEUE_COPY_IMAGE_1D_TO_1D,             //copy image 1d to image 1d
-  CL_ENQUEUE_COPY_IMAGE_2D_TO_2D,             //copy image 2d to image 2d
-  CL_ENQUEUE_COPY_IMAGE_3D_TO_2D,             //copy image 3d to image 2d
-  CL_ENQUEUE_COPY_IMAGE_2D_TO_3D,             //copy image 2d to image 3d
-  CL_ENQUEUE_COPY_IMAGE_3D_TO_3D,             //copy image 3d to image 3d
-  CL_ENQUEUE_COPY_IMAGE_2D_TO_2D_ARRAY,       //copy image 2d to image 2d array
-  CL_ENQUEUE_COPY_IMAGE_1D_ARRAY_TO_1D_ARRAY, //copy image 1d array to image 1d array
-  CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_2D_ARRAY, //copy image 2d array to image 2d array
-  CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_2D,       //copy image 2d array to image 2d
-  CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_3D,       //copy image 2d array to image 3d
-  CL_ENQUEUE_COPY_IMAGE_3D_TO_2D_ARRAY,       //copy image 3d to image 2d array
-  CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER,         //copy image 2d to buffer
-  CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN16,
-  CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER, //copy image 3d tobuffer
-  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D, //copy buffer to image 2d
-  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN16,
-  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D, //copy buffer to image 3d
-  CL_ENQUEUE_FILL_BUFFER_UNALIGN,     //fill buffer with 1 aligne pattern, pattern size=1
-  CL_ENQUEUE_FILL_BUFFER_ALIGN2,      //fill buffer with 2 aligne pattern, pattern size=2
-  CL_ENQUEUE_FILL_BUFFER_ALIGN4,      //fill buffer with 4 aligne pattern, pattern size=4
-  CL_ENQUEUE_FILL_BUFFER_ALIGN8_8,    //fill buffer with 8 aligne pattern, pattern size=8
-  CL_ENQUEUE_FILL_BUFFER_ALIGN8_16,   //fill buffer with 16 aligne pattern, pattern size=16
-  CL_ENQUEUE_FILL_BUFFER_ALIGN8_32,   //fill buffer with 16 aligne pattern, pattern size=32
-  CL_ENQUEUE_FILL_BUFFER_ALIGN8_64,   //fill buffer with 16 aligne pattern, pattern size=64
-  CL_ENQUEUE_FILL_BUFFER_ALIGN128,    //fill buffer with 128 aligne pattern, pattern size=128
-  CL_ENQUEUE_FILL_IMAGE_1D,           //fill image 1d
-  CL_ENQUEUE_FILL_IMAGE_1D_ARRAY,     //fill image 1d array
-  CL_ENQUEUE_FILL_IMAGE_2D,           //fill image 2d
-  CL_ENQUEUE_FILL_IMAGE_2D_ARRAY,     //fill image 2d array
-  CL_ENQUEUE_FILL_IMAGE_3D,           //fill image 3d
-  CL_INTERNAL_KERNEL_MAX
-};
-
 typedef struct _cl_context_gen {
   _cl_context_for_device ctx_base; /* Point to the device it belong to */
   intel_driver_t *drv;             /* Handles HW or simulator */
diff --git a/src/gen/cl_gen_device_common.h b/src/gen/cl_gen_device_common.h
index ca774e3..9fef422 100644
--- a/src/gen/cl_gen_device_common.h
+++ b/src/gen/cl_gen_device_common.h
@@ -17,7 +17,17 @@
  * Author: Benjamin Segovia <benjamin.segovia at intel.com>
  */
 
-/* Common fields for both all GT devices (IVB / SNB) */
+#undef LIBCL_VERSION_STRING
+#undef LIBCL_C_VERSION_STRING
+#ifdef GEN9_DEVICE
+#define LIBCL_VERSION_STRING GEN9_LIBCL_VERSION_STRING
+#define LIBCL_C_VERSION_STRING GEN9_LIBCL_C_VERSION_STRING
+#else
+#define LIBCL_VERSION_STRING NONGEN9_LIBCL_VERSION_STRING
+#define LIBCL_C_VERSION_STRING NONGEN9_LIBCL_C_VERSION_STRING
+#endif
+
+/* Common fields for all GT devices */
 .device_type = CL_DEVICE_TYPE_GPU,
 .device_id=0,/* == device_id (set when requested) */
 .vendor_id = INTEL_VENDOR_ID,
diff --git a/src/gen/intel_driver.c b/src/gen/intel_driver.c
index 5161bee..2f62b22 100644
--- a/src/gen/intel_driver.c
+++ b/src/gen/intel_driver.c
@@ -371,6 +371,12 @@ intel_driver_open(intel_driver_t *intel, cl_context_prop props)
   return CL_SUCCESS;
 }
 
+LOCAL void
+intel_driver_set_atomic_flag(intel_driver_t *drv, int atomic_flag)
+{
+  drv->atomic_test_result = atomic_flag; /* Store the flag on the driver; drv is dereferenced unchecked, so it must be non-NULL */
+}
+
 LOCAL intel_driver_t *
 intel_driver_create(cl_context_prop props)
 {
@@ -1001,12 +1007,6 @@ intel_driver_enlarge_stack_size(struct intel_driver *drv, int32_t *stack_size)
     *stack_size = *stack_size * 2;
 }
 
-static void
-intel_driver_set_atomic_flag(intel_driver_t *drv, int atomic_flag)
-{
-  drv->atomic_test_result = atomic_flag;
-}
-
 LOCAL void
 intel_setup_callbacks(void)
 {
@@ -1050,5 +1050,4 @@ intel_setup_callbacks(void)
   cl_buffer_get_buffer_from_fd = (cl_buffer_get_buffer_from_fd_cb *)intel_share_buffer_from_fd;
   cl_buffer_get_image_from_fd = (cl_buffer_get_image_from_fd_cb *)intel_share_image_from_fd;
   cl_buffer_disable_reuse = (cl_buffer_disable_reuse_cb *)drm_intel_bo_disable_reuse;
-  intel_set_gpgpu_callbacks(intel_get_device_id());
 }
diff --git a/src/gen/intel_driver.h b/src/gen/intel_driver.h
index bb517d5..825eebf 100644
--- a/src/gen/intel_driver.h
+++ b/src/gen/intel_driver.h
@@ -123,6 +123,7 @@ typedef struct intel_driver {
 
 extern void intel_driver_lock_hardware(intel_driver_t *);
 extern void intel_driver_unlock_hardware(intel_driver_t *);
+extern void intel_driver_set_atomic_flag(intel_driver_t *drv, int atomic_flag);
 extern intel_driver_t *intel_driver_create(cl_context_prop props);
 extern void intel_driver_delete(intel_driver_t *driver);
 extern void intel_update_device_info(cl_device_id device);
-- 
2.7.4



More information about the Beignet mailing list