[Beignet] [PATCH v2 2/4] Add extensions intel_accelerator and basic intel_motion_estimation.

Chuanbo Weng chuanbo.weng at intel.com
Sun Sep 6 22:01:30 PDT 2015


v2:
1. Upload only the first vme_state.
2. Remove duplicated code in check_opt1_extension.
3. Check the image format before calling cl_gpgpu_bind_image_for_vme.
4. Fix an error in getting the motion vector: since this kernel is assumed
   to run in SIMD16 mode, dword 0 of grf 1 should be
   __gen_ocl_region(8,vme_result.s0), not
   __gen_ocl_region(0,vme_result.s1).

Signed-off-by: Guo Yejun <yejun.guo at intel.com>
Signed-off-by: Chuanbo Weng <chuanbo.weng at intel.com>
---
 include/CL/cl_ext.h                                | 103 ++++++++++
 src/CMakeLists.txt                                 |   4 +-
 src/cl_accelerator_intel.c                         |  86 ++++++++
 src/cl_accelerator_intel.h                         |  29 +++
 src/cl_api.c                                       | 106 +++++++++-
 src/cl_command_queue.c                             |  16 +-
 src/cl_command_queue_gen7.c                        |   5 +-
 src/cl_context.c                                   |   1 +
 src/cl_context.h                                   |   3 +
 src/cl_driver.h                                    |  21 ++
 src/cl_driver_defs.c                               |   2 +
 src/cl_extensions.c                                |   4 +-
 src/cl_extensions.h                                |   8 +
 src/cl_gen7_device.h                               |   5 +-
 src/cl_gt_device.h                                 |   6 +-
 src/cl_internals.h                                 |   1 +
 src/cl_kernel.c                                    |  55 ++++-
 src/cl_kernel.h                                    |   6 +-
 src/cl_utils.h                                     |  12 ++
 src/intel/intel_gpgpu.c                            | 218 +++++++++++++++++++-
 src/intel/intel_structs.h                          | 120 +++++++++++
 .../cl_internal_block_motion_estimate_intel.cl     | 224 +++++++++++++++++++++
 .../builtin_kernel_block_motion_estimate_intel.cpp | 109 ++++++++++
 23 files changed, 1113 insertions(+), 31 deletions(-)
 create mode 100644 src/cl_accelerator_intel.c
 create mode 100644 src/cl_accelerator_intel.h
 create mode 100644 src/kernels/cl_internal_block_motion_estimate_intel.cl
 create mode 100644 utests/builtin_kernel_block_motion_estimate_intel.cpp
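
For reference, a minimal host-side sketch of the new entry points, modeled on
the utest added at the end of this patch. It is only an illustration: error
handling is omitted, and ctx/device are placeholders for an already-created
context and device.

  cl_motion_estimation_desc_intel desc = {
    CL_ME_MB_TYPE_16x16_INTEL,
    CL_ME_SUBPIXEL_MODE_INTEGER_INTEL,
    CL_ME_SAD_ADJUST_MODE_NONE_INTEL,
    CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL
  };
  cl_int err = CL_SUCCESS;
  cl_accelerator_intel accel =
    clCreateAcceleratorINTEL(ctx, CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL,
                             sizeof(desc), &desc, &err);

  /* The VME kernel is exposed as a built-in kernel. */
  cl_program prog = clCreateProgramWithBuiltInKernels(ctx, 1, &device,
                        "block_motion_estimate_intel", &err);
  cl_kernel krn = clCreateKernel(prog, "block_motion_estimate_intel", &err);

  /* Argument 0 is the accelerator; src/ref images must be CL_R / CL_UNORM_INT8. */
  clSetKernelArg(krn, 0, sizeof(cl_accelerator_intel), &accel);
  /* ... set the src/ref images and the motion vector buffer as in the utest ... */

  clReleaseAcceleratorINTEL(accel);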

diff --git a/include/CL/cl_ext.h b/include/CL/cl_ext.h
index 710bea8..0a66d70 100644
--- a/include/CL/cl_ext.h
+++ b/include/CL/cl_ext.h
@@ -184,6 +184,109 @@ typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /
 #define CL_PRINTF_CALLBACK_ARM                      0x40B0
 #define CL_PRINTF_BUFFERSIZE_ARM                    0x40B1
 
+/*********************************
+* cl_intel_accelerator extension *
+*********************************/
+#define cl_intel_accelerator 1
+#define cl_intel_motion_estimation 1
+
+typedef struct _cl_accelerator_intel*     cl_accelerator_intel;
+typedef cl_uint                           cl_accelerator_type_intel;
+typedef cl_uint                           cl_accelerator_info_intel;
+
+typedef struct _cl_motion_estimation_desc_intel {
+    cl_uint mb_block_type;
+    cl_uint subpixel_mode;
+    cl_uint sad_adjust_mode;
+    cl_uint search_path_type;
+} cl_motion_estimation_desc_intel;
+
+/* Error Codes */
+#define CL_INVALID_ACCELERATOR_INTEL            -1094
+#define CL_INVALID_ACCELERATOR_TYPE_INTEL       -1095
+#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL -1096
+#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL -1097
+
+/* Deprecated Error Codes */
+#define CL_INVALID_ACCELERATOR_INTEL_DEPRECATED            -6000
+#define CL_INVALID_ACCELERATOR_TYPE_INTEL_DEPRECATED       -6001
+#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL_DEPRECATED -6002
+#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL_DEPRECATED -6003
+
+/* cl_accelerator_type_intel */
+#define CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL     0x0
+
+/* cl_accelerator_info_intel */
+#define CL_ACCELERATOR_DESCRIPTOR_INTEL                 0x4090
+#define CL_ACCELERATOR_REFERENCE_COUNT_INTEL            0x4091
+#define CL_ACCELERATOR_CONTEXT_INTEL                    0x4092
+#define CL_ACCELERATOR_TYPE_INTEL                       0x4093
+
+/* cl_motion_estimation_desc_intel flags */
+#define CL_ME_MB_TYPE_16x16_INTEL                       0x0
+#define CL_ME_MB_TYPE_8x8_INTEL                         0x1
+#define CL_ME_MB_TYPE_4x4_INTEL                         0x2
+
+#define CL_ME_SUBPIXEL_MODE_INTEGER_INTEL               0x0
+#define CL_ME_SUBPIXEL_MODE_HPEL_INTEL                  0x1
+#define CL_ME_SUBPIXEL_MODE_QPEL_INTEL                  0x2
+
+#define CL_ME_SAD_ADJUST_MODE_NONE_INTEL                0x0
+#define CL_ME_SAD_ADJUST_MODE_HAAR_INTEL                0x1
+
+#define CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL              0x0
+#define CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL              0x1
+#define CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL            0x5
+
+extern CL_API_ENTRY cl_accelerator_intel CL_API_CALL
+clCreateAcceleratorINTEL(
+    cl_context                  /* context */,
+    cl_accelerator_type_intel   /* accelerator_type */,
+    size_t                      /* descriptor_size */,
+    const void*                 /* descriptor */,
+    cl_int*                     /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_accelerator_intel
+    (CL_API_CALL *clCreateAcceleratorINTEL_fn)(
+    cl_context                  /* context */,
+    cl_accelerator_type_intel   /* accelerator_type */,
+    size_t                      /* descriptor_size */,
+    const void*                 /* descriptor */,
+    cl_int*                     /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetAcceleratorInfoINTEL
+(
+    cl_accelerator_intel        /* accelerator */,
+    cl_accelerator_info_intel   /* param_name */,
+    size_t                      /* param_value_size */,
+    void*                       /* param_value */,
+    size_t*                     /* param_value_size_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int
+    (CL_API_CALL *clGetAcceleratorInfoINTEL_fn)(
+    cl_accelerator_intel        /* accelerator */,
+    cl_accelerator_info_intel   /* param_name */,
+    size_t                      /* param_value_size */,
+    void*                       /* param_value */,
+    size_t*                     /* param_value_size_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainAcceleratorINTEL(
+    cl_accelerator_intel        /* accelerator */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int
+    (CL_API_CALL *clRetainAcceleratorINTEL_fn)(
+    cl_accelerator_intel        /* accelerator */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseAcceleratorINTEL(
+    cl_accelerator_intel        /* accelerator */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int
+    (CL_API_CALL *clReleaseAcceleratorINTEL_fn)(
+    cl_accelerator_intel        /* accelerator */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
 #ifdef CL_VERSION_1_1
    /***********************************
     * cl_ext_device_fission extension *
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 40a9afb..c917e76 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -56,7 +56,8 @@ cl_internal_fill_buf_align8 cl_internal_fill_buf_align4
 cl_internal_fill_buf_align2 cl_internal_fill_buf_unalign
 cl_internal_fill_buf_align128 cl_internal_fill_image_1d
 cl_internal_fill_image_1d_array cl_internal_fill_image_2d
-cl_internal_fill_image_2d_array cl_internal_fill_image_3d)
+cl_internal_fill_image_2d_array cl_internal_fill_image_3d
+cl_internal_block_motion_estimate_intel)
 set (BUILT_IN_NAME  cl_internal_built_in_kernel)
 MakeBuiltInKernelStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" "${KERNEL_NAMES}")
 MakeKernelBinStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" "${KERNEL_NAMES}")
@@ -70,6 +71,7 @@ set(OPENCL_SRC
     cl_program.c
     cl_gbe_loader.cpp
     cl_sampler.c
+    cl_accelerator_intel.c
     cl_event.c
     cl_enqueue.c
     cl_image.c
diff --git a/src/cl_accelerator_intel.c b/src/cl_accelerator_intel.c
new file mode 100644
index 0000000..cda8963
--- /dev/null
+++ b/src/cl_accelerator_intel.c
@@ -0,0 +1,86 @@
+#include "cl_context.h"
+#include "cl_accelerator_intel.h"
+#include "cl_utils.h"
+#include "cl_alloc.h"
+#include "cl_khr_icd.h"
+#include "cl_kernel.h"
+
+#include <assert.h>
+
+LOCAL cl_accelerator_intel
+cl_accelerator_intel_new(cl_context ctx,
+                         cl_accelerator_type_intel accel_type,
+                         size_t desc_sz,
+                         const void* desc,
+                         cl_int* errcode_ret)
+{
+  cl_accelerator_intel accel = NULL;
+  cl_int err = CL_SUCCESS;
+
+  /* Allocate and initialize the structure itself */
+  TRY_ALLOC(accel, CALLOC(struct _cl_accelerator_intel));
+  SET_ICD(accel->dispatch)
+  accel->ref_n = 1;
+  accel->magic = CL_MAGIC_ACCELERATOR_INTEL_HEADER;
+
+  if (accel_type != CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL) {
+    err = CL_INVALID_ACCELERATOR_TYPE_INTEL;
+    goto error;
+  }
+  accel->type = accel_type;
+
+  if (desc == NULL) {   // TODO: also validate the fields inside desc
+    err = CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL;
+    goto error;
+  }
+  accel->desc.me = *(cl_motion_estimation_desc_intel*)desc;
+
+  /* Append the accelerator_intel to the context's accelerator_intel list */
+  /* TODO: is this really needed? */
+  pthread_mutex_lock(&ctx->accelerator_intel_lock);
+    accel->next = ctx->accels;
+    if (ctx->accels != NULL)
+      ctx->accels->prev = accel;
+    ctx->accels = accel;
+  pthread_mutex_unlock(&ctx->accelerator_intel_lock);
+
+  accel->ctx = ctx;
+  cl_context_add_ref(ctx);
+
+exit:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return accel;
+error:
+  cl_accelerator_intel_delete(accel);
+  accel = NULL;
+  goto exit;
+}
+
+LOCAL void
+cl_accelerator_intel_add_ref(cl_accelerator_intel accel)
+{
+  atomic_inc(&accel->ref_n);
+}
+
+LOCAL void
+cl_accelerator_intel_delete(cl_accelerator_intel accel)
+{
+  if (UNLIKELY(accel == NULL))
+    return;
+  if (atomic_dec(&accel->ref_n) > 1)
+    return;
+
+  /* Remove the accelerator_intel from the context's accelerator_intel list */
+  pthread_mutex_lock(&accel->ctx->accelerator_intel_lock);
+    if (accel->prev)
+      accel->prev->next = accel->next;
+    if (accel->next)
+      accel->next->prev = accel->prev;
+    if (accel->ctx->accels == accel)
+      accel->ctx->accels = accel->next;
+  pthread_mutex_unlock(&accel->ctx->accelerator_intel_lock);
+
+  cl_context_delete(accel->ctx);
+  cl_free(accel);
+}
diff --git a/src/cl_accelerator_intel.h b/src/cl_accelerator_intel.h
new file mode 100644
index 0000000..cecfd2a
--- /dev/null
+++ b/src/cl_accelerator_intel.h
@@ -0,0 +1,29 @@
+#ifndef __CL_ACCELERATOR_INTEL_H__
+#define __CL_ACCELERATOR_INTEL_H__
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+#include <stdint.h>
+
+struct _cl_accelerator_intel {
+  DEFINE_ICD(dispatch)
+  uint64_t magic;            /* To identify it as an accelerator_intel object */
+  volatile int ref_n;        /* This object is reference counted */
+  cl_accelerator_intel prev, next;     /* We chain in the context; why chain? */
+  cl_context ctx;            /* Context it belongs to */
+  cl_accelerator_type_intel type;
+  union {
+    cl_motion_estimation_desc_intel me;
+  }desc;                     /* save desc before we decide how to handle it */
+};
+
+cl_accelerator_intel cl_accelerator_intel_new(cl_context ctx,
+                         cl_accelerator_type_intel accel_type,
+                         size_t desc_sz,
+                         const void* desc,
+                         cl_int* errcode_ret);
+
+void cl_accelerator_intel_add_ref(cl_accelerator_intel accel);
+void cl_accelerator_intel_delete(cl_accelerator_intel accel);
+
+#endif
diff --git a/src/cl_api.c b/src/cl_api.c
index dbbcbb0..6da6b9e 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -28,6 +28,7 @@
 #include "cl_mem.h"
 #include "cl_image.h"
 #include "cl_sampler.h"
+#include "cl_accelerator_intel.h"
 #include "cl_alloc.h"
 #include "cl_utils.h"
 
@@ -2910,6 +2911,17 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
     goto error;
   }
 
+  if (kernel->vme) {
+    if (work_dim != 2) {
+      err = CL_INVALID_WORK_DIMENSION;
+      goto error;
+    }
+    if (local_work_size != NULL) {
+      err = CL_INVALID_WORK_GROUP_SIZE;
+      goto error;
+    }
+  }
+
   if (global_work_offset != NULL)
     for (i = 0; i < work_dim; ++i) {
       if (UNLIKELY(global_work_offset[i] + global_work_size[i] > (size_t)-1)) {
@@ -2943,22 +2955,31 @@ clEnqueueNDRangeKernel(cl_command_queue  command_queue,
     for (i = 0; i < work_dim; ++i)
       fixed_local_sz[i] = local_work_size[i];
   } else {
-    uint j, maxDimSize = 64 /* from 64? */, maxGroupSize = 256; //MAX_WORK_GROUP_SIZE may too large
-    for (i = 0; i< work_dim; i++) {
-      for (j = maxDimSize; j > 1; j--) {
-        if (global_work_size[i] % j == 0 && j <= maxGroupSize) {
-          fixed_local_sz[i] = j;
-          maxGroupSize = maxGroupSize /j;
-          maxDimSize = maxGroupSize > maxDimSize ? maxDimSize : maxGroupSize;
-          break;  //choose next work_dim
+    if (kernel->vme) {
+        fixed_local_sz[0] = 16;
+        fixed_local_sz[1] = 1;
+    } else {
+      uint j, maxDimSize = 64 /* from 64? */, maxGroupSize = 256; //MAX_WORK_GROUP_SIZE may too large
+      for (i = 0; i< work_dim; i++) {
+        for (j = maxDimSize; j > 1; j--) {
+          if (global_work_size[i] % j == 0 && j <= maxGroupSize) {
+            fixed_local_sz[i] = j;
+            maxGroupSize = maxGroupSize /j;
+            maxDimSize = maxGroupSize > maxDimSize ? maxDimSize : maxGroupSize;
+            break;  //choose next work_dim
+          }
         }
       }
     }
   }
 
-  if (global_work_size != NULL)
+  if (kernel->vme) {
+    fixed_global_sz[0] = (global_work_size[0]+15) / 16 * 16;
+    fixed_global_sz[1] = (global_work_size[1]+15) / 16;
+  } else {
     for (i = 0; i < work_dim; ++i)
       fixed_global_sz[i] = global_work_size[i];
+  }
   if (global_work_offset != NULL)
     for (i = 0; i < work_dim; ++i)
       fixed_global_off[i] = global_work_offset[i];
@@ -3187,6 +3208,10 @@ internal_clGetExtensionFunctionAddress(const char *func_name)
   EXTFUNC(clCreateBufferFromLibvaIntel)
   EXTFUNC(clCreateImageFromLibvaIntel)
   EXTFUNC(clGetMemObjectFdIntel)
+  EXTFUNC(clCreateAcceleratorINTEL)
+  EXTFUNC(clRetainAcceleratorINTEL)
+  EXTFUNC(clReleaseAcceleratorINTEL)
+  EXTFUNC(clGetAcceleratorInfoINTEL)
   return NULL;
 }
 
@@ -3355,3 +3380,66 @@ clGetMemObjectFdIntel(cl_context context,
 error:
   return err;
 }
+
+cl_accelerator_intel
+clCreateAcceleratorINTEL(cl_context context,
+                         cl_accelerator_type_intel accel_type,
+                         size_t desc_sz,
+                         const void* desc,
+                         cl_int* errcode_ret)
+{
+  cl_accelerator_intel accel = NULL;
+  cl_int err = CL_SUCCESS;
+  CHECK_CONTEXT(context);
+  accel = cl_accelerator_intel_new(context, accel_type, desc_sz, desc, &err);
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return accel;
+}
+
+cl_int
+clRetainAcceleratorINTEL(cl_accelerator_intel accel)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_ACCELERATOR_INTEL(accel);
+  cl_accelerator_intel_add_ref(accel);
+error:
+  return err;
+}
+
+cl_int
+clReleaseAcceleratorINTEL(cl_accelerator_intel accel)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_ACCELERATOR_INTEL(accel);
+  cl_accelerator_intel_delete(accel);
+error:
+  return err;
+}
+
+cl_int
+clGetAcceleratorInfoINTEL(cl_accelerator_intel           accel,
+                            cl_accelerator_info_intel    param_name,
+                            size_t                       param_value_size,
+                            void*                        param_value,
+                            size_t*                      param_value_size_ret)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_ACCELERATOR_INTEL(accel);
+
+  if (param_name == CL_ACCELERATOR_REFERENCE_COUNT_INTEL) {
+    FILL_GETINFO_RET (cl_uint, 1, (cl_uint*)&accel->ref_n, CL_SUCCESS);
+  } else if (param_name == CL_ACCELERATOR_CONTEXT_INTEL) {
+    FILL_GETINFO_RET (cl_context, 1, &accel->ctx, CL_SUCCESS);
+  } else if (param_name == CL_ACCELERATOR_TYPE_INTEL) {
+    FILL_GETINFO_RET (cl_uint, 1, &accel->type, CL_SUCCESS);
+  } else if (param_name == CL_ACCELERATOR_DESCRIPTOR_INTEL) {
+    FILL_GETINFO_RET (cl_motion_estimation_desc_intel, 1, &(accel->desc.me), CL_SUCCESS);
+  } else{
+    return CL_INVALID_VALUE;
+  }
+
+error:
+  return err;
+}
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 4b92311..7cf743f 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -140,10 +140,18 @@ cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
 
     image = cl_mem_image(k->args[id].mem);
     set_image_info(k->curbe, &k->images[i], image);
-    cl_gpgpu_bind_image(gpgpu, k->images[i].idx, image->base.bo, image->offset,
-                        image->intel_fmt, image->image_type, image->bpp,
-                        image->w, image->h, image->depth,
-                        image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
+    if(k->vme){
+      assert((image->fmt.image_channel_order == CL_R) && (image->fmt.image_channel_data_type == CL_UNORM_INT8));
+      cl_gpgpu_bind_image_for_vme(gpgpu, k->images[i].idx, image->base.bo, image->offset,
+                          image->intel_fmt, image->image_type, image->bpp,
+                          image->w, image->h, image->depth,
+                          image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
+    }
+    else
+      cl_gpgpu_bind_image(gpgpu, k->images[i].idx, image->base.bo, image->offset,
+                          image->intel_fmt, image->image_type, image->bpp,
+                          image->w, image->h, image->depth,
+                          image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
     // TODO, this workaround is for GEN7/GEN75 only, we may need to do it in the driver layer
     // on demand.
     if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 0e60528..d64bfba 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -361,7 +361,10 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   /* Bind user images */
   cl_command_queue_bind_image(queue, ker);
   /* Bind all samplers */
-  cl_gpgpu_bind_sampler(gpgpu, ker->samplers, ker->sampler_sz);
+  if (ker->vme)
+    cl_gpgpu_bind_vme_state(gpgpu, ker->accel);
+  else
+    cl_gpgpu_bind_sampler(gpgpu, ker->samplers, ker->sampler_sz);
 
   if (cl_gpgpu_set_scratch(gpgpu, scratch_sz) != 0)
     goto error;
diff --git a/src/cl_context.c b/src/cl_context.c
index c45e0aa..a6bde7d 100644
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -177,6 +177,7 @@ cl_context_new(struct _cl_context_prop *props)
   pthread_mutex_init(&ctx->queue_lock, NULL);
   pthread_mutex_init(&ctx->buffer_lock, NULL);
   pthread_mutex_init(&ctx->sampler_lock, NULL);
+  pthread_mutex_init(&ctx->accelerator_intel_lock, NULL);
 
 exit:
   return ctx;
diff --git a/src/cl_context.h b/src/cl_context.h
index ef94823..489e5d7 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -21,6 +21,7 @@
 #define __CL_CONTEXT_H__
 
 #include "CL/cl.h"
+#include "CL/cl_ext.h"
 #include "cl_internals.h"
 #include "cl_driver.h"
 #include "cl_khr_icd.h"
@@ -107,11 +108,13 @@ struct _cl_context {
   cl_program programs;              /* All programs currently allocated */
   cl_mem buffers;                   /* All memory object currently allocated */
   cl_sampler samplers;              /* All sampler object currently allocated */
+  cl_accelerator_intel accels;      /* All accelerator_intel object currently allocated */
   cl_event   events;                /* All event object currently allocated */
   pthread_mutex_t queue_lock;       /* To allocate and deallocate queues */
   pthread_mutex_t program_lock;     /* To allocate and deallocate programs */
   pthread_mutex_t buffer_lock;      /* To allocate and deallocate buffers */
   pthread_mutex_t sampler_lock;     /* To allocate and deallocate samplers */
+  pthread_mutex_t accelerator_intel_lock;     /* To allocate and deallocate accelerator_intel */
   pthread_mutex_t event_lock;       /* To allocate and deallocate events */
   cl_program internal_prgs[CL_INTERNAL_KERNEL_MAX];
                                     /* All programs internal used, for example clEnqueuexxx api use */
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 1ab4dff..48049a5 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -23,9 +23,11 @@
 #include <stdint.h>
 #include <stdlib.h>
 #include "cl_driver_type.h"
+#include "CL/cl_ext.h"
 /* Various limitations we should remove actually */
 #define GEN_MAX_SURFACES 256
 #define GEN_MAX_SAMPLERS 16
+#define GEN_MAX_VME_STATES 8
 
 /**************************************************************************
  * cl_driver:
@@ -145,6 +147,9 @@ extern cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf;
 typedef void (cl_gpgpu_bind_sampler_cb)(cl_gpgpu, uint32_t *samplers, size_t sampler_sz);
 extern cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler;
 
+typedef void (cl_gpgpu_bind_vme_state_cb)(cl_gpgpu, cl_accelerator_intel accel);
+extern cl_gpgpu_bind_vme_state_cb *cl_gpgpu_bind_vme_state;
+
 /* get the default cache control value. */
 typedef uint32_t (cl_gpgpu_get_cache_ctrl_cb)();
 extern cl_gpgpu_get_cache_ctrl_cb *cl_gpgpu_get_cache_ctrl;
@@ -165,6 +170,22 @@ typedef void (cl_gpgpu_bind_image_cb)(cl_gpgpu state,
 
 extern cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image;
 
+typedef void (cl_gpgpu_bind_image_for_vme_cb)(cl_gpgpu state,
+                                              uint32_t id,
+                                              cl_buffer obj_bo,
+                                              uint32_t obj_bo_offset,
+                                              uint32_t format,
+                                              uint32_t type,
+                                              uint32_t bpp,
+                                              int32_t w,
+                                              int32_t h,
+                                              int32_t depth,
+                                              int32_t pitch,
+                                              int32_t slice_pitch,
+                                              cl_gpgpu_tiling tiling);
+
+extern cl_gpgpu_bind_image_for_vme_cb *cl_gpgpu_bind_image_for_vme;
+
 /* Setup a stack */
 typedef void (cl_gpgpu_set_stack_cb)(cl_gpgpu, uint32_t offset, uint32_t size, uint32_t cchint);
 extern cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack;
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index b77acdc..4472373 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -69,6 +69,7 @@ LOCAL cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf = NULL;
 LOCAL cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack = NULL;
 LOCAL cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch = NULL;
 LOCAL cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image = NULL;
 LOCAL cl_gpgpu_bind_image_for_vme_cb *cl_gpgpu_bind_image_for_vme = NULL;
 LOCAL cl_gpgpu_get_cache_ctrl_cb *cl_gpgpu_get_cache_ctrl = NULL;
 LOCAL cl_gpgpu_state_init_cb *cl_gpgpu_state_init = NULL;
 LOCAL cl_gpgpu_alloc_constant_buffer_cb * cl_gpgpu_alloc_constant_buffer = NULL;
@@ -82,6 +83,7 @@ LOCAL cl_gpgpu_batch_end_cb *cl_gpgpu_batch_end = NULL;
 LOCAL cl_gpgpu_flush_cb *cl_gpgpu_flush = NULL;
 LOCAL cl_gpgpu_walker_cb *cl_gpgpu_walker = NULL;
 LOCAL cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler = NULL;
+LOCAL cl_gpgpu_bind_vme_state_cb *cl_gpgpu_bind_vme_state = NULL;
 LOCAL cl_gpgpu_event_new_cb *cl_gpgpu_event_new = NULL;
 LOCAL cl_gpgpu_event_update_status_cb *cl_gpgpu_event_update_status = NULL;
 LOCAL cl_gpgpu_event_flush_cb *cl_gpgpu_event_flush = NULL;
diff --git a/src/cl_extensions.c b/src/cl_extensions.c
index 3eb303f..dbddb58 100644
--- a/src/cl_extensions.c
+++ b/src/cl_extensions.c
@@ -63,7 +63,9 @@ check_gl_extension(cl_extensions_t *extensions) {
 void
 check_intel_extension(cl_extensions_t *extensions)
 {
-  /* Should put those map/unmap extensions here. */
+  int id;
+  for(id = INTEL_EXT_START_ID; id <= INTEL_EXT_END_ID; id++)
+    extensions->extensions[id].base.ext_enabled = 1;
 }
 
 void
diff --git a/src/cl_extensions.h b/src/cl_extensions.h
index b4544e2..27041c3 100644
--- a/src/cl_extensions.h
+++ b/src/cl_extensions.h
@@ -23,6 +23,10 @@
   DECL_EXT(khr_spir) \
   DECL_EXT(khr_icd)
 
+#define DECL_INTEL_EXTENSIONS \
+  DECL_EXT(intel_accelerator) \
+  DECL_EXT(intel_motion_estimation)
+
 #define DECL_GL_EXTENSIONS \
   DECL_EXT(khr_gl_sharing)\
   DECL_EXT(khr_gl_event)\
@@ -37,6 +41,7 @@
 #define DECL_ALL_EXTENSIONS \
   DECL_BASE_EXTENSIONS \
   DECL_OPT1_EXTENSIONS \
+  DECL_INTEL_EXTENSIONS \
   DECL_GL_EXTENSIONS \
   DECL_D3D_EXTENSIONS
 
@@ -54,6 +59,8 @@ cl_khr_extension_id_max
 #define BASE_EXT_END_ID EXT_ID(khr_fp64)
 #define OPT1_EXT_START_ID EXT_ID(khr_int64_base_atomics)
 #define OPT1_EXT_END_ID EXT_ID(khr_icd)
+#define INTEL_EXT_START_ID EXT_ID(intel_accelerator)
+#define INTEL_EXT_END_ID EXT_ID(intel_motion_estimation)
 #define GL_EXT_START_ID EXT_ID(khr_gl_sharing)
 #define GL_EXT_END_ID EXT_ID(khr_gl_msaa_sharing)
 
@@ -75,6 +82,7 @@ struct EXT_STRUCT_NAME(name) { \
 
 DECL_BASE_EXTENSIONS
 DECL_OPT1_EXTENSIONS
+DECL_INTEL_EXTENSIONS
 DECL_D3D_EXTENSIONS
 DECL_GL_EXTENSIONS
 #undef DECL_EXT
diff --git a/src/cl_gen7_device.h b/src/cl_gen7_device.h
index 4ad5d96..648e84e 100644
--- a/src/cl_gen7_device.h
+++ b/src/cl_gen7_device.h
@@ -25,5 +25,8 @@
 .local_mem_size = 64 << 10,
 .scratch_mem_size = 12 << 10,
 
+//Temporarily defined so that the built-in kernel block_motion_estimate_intel is only exported for Gen7.
+//This will be removed once HSW+ is also supported.
+#define GEN7_DEVICE
 #include "cl_gt_device.h"
-
+#undef GEN7_DEVICE
diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
index bd87cc4..d0fd95a 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -116,7 +116,11 @@ DECL_INFO_STRING(built_in_kernels, "__cl_copy_region_align4;"
                                    "__cl_fill_image_1d_array;"
                                    "__cl_fill_image_2d;"
                                    "__cl_fill_image_2d_array;"
-                                   "__cl_fill_image_3d;")
+                                   "__cl_fill_image_3d;"
+#ifdef GEN7_DEVICE
+                                   "block_motion_estimate_intel;"
+#endif
+                                   )
 
 DECL_INFO_STRING(driver_version, LIBCL_DRIVER_VERSION_STRING)
 #undef DECL_INFO_STRING
diff --git a/src/cl_internals.h b/src/cl_internals.h
index cb3fc23..9aeb8c1 100644
--- a/src/cl_internals.h
+++ b/src/cl_internals.h
@@ -31,6 +31,7 @@
 #define CL_MAGIC_EVENT_HEADER     0x8324a9c810ebf90fLL
 #define CL_MAGIC_MEM_HEADER       0x381a27b9ce6504dfLL
 #define CL_MAGIC_DEAD_HEADER      0xdeaddeaddeaddeadLL
+#define CL_MAGIC_ACCELERATOR_INTEL_HEADER   0x7c6a08c9a7ac3e3fLL
 
 #endif /* __CL_INTERNALS_H__ */
 
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index 286e57c..ce3c4b2 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -27,6 +27,7 @@
 #include "cl_khr_icd.h"
 #include "CL/cl.h"
 #include "cl_sampler.h"
+#include "cl_accelerator_intel.h"
 
 #include <stdio.h>
 #include <string.h>
@@ -113,10 +114,22 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
   arg_type = interp_kernel_get_arg_type(k->opaque, index);
   arg_sz = interp_kernel_get_arg_size(k->opaque, index);
 
-  if (UNLIKELY(arg_type != GBE_ARG_LOCAL_PTR && arg_sz != sz)) {
-    if (arg_type != GBE_ARG_SAMPLER ||
-        (arg_type == GBE_ARG_SAMPLER && sz != sizeof(cl_sampler)))
+  if (k->vme && index == 0) {
+    //The best approach would be to return an arg type of GBE_ARG_ACCELERATOR_INTEL,
+    //but that is not straightforward since clang does not support it yet.
+    //The easy way is to treat the accelerator_intel_t typedef as a struct,
+    //which makes the sizes mismatch, so a different size check is used here.
+    if (sz != sizeof(cl_accelerator_intel) || arg_sz != sizeof(cl_motion_estimation_desc_intel))
       return CL_INVALID_ARG_SIZE;
+    cl_accelerator_intel* accel = (cl_accelerator_intel*)value;
+    if ((*accel)->type != CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL)
+      return CL_INVALID_ACCELERATOR_TYPE_INTEL;
+  } else {
+    if (UNLIKELY(arg_type != GBE_ARG_LOCAL_PTR && arg_sz != sz)) {
+      if (arg_type != GBE_ARG_SAMPLER ||
+          (arg_type == GBE_ARG_SAMPLER && sz != sizeof(cl_sampler)))
+        return CL_INVALID_ARG_SIZE;
+    }
   }
 
   if(UNLIKELY(arg_type == GBE_ARG_LOCAL_PTR && sz == 0))
@@ -150,15 +163,30 @@ cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
     }
   }
 
+  memset(&k->args[index], 0, sizeof(k->args[index]));
+
   /* Copy the structure or the value directly into the curbe */
   if (arg_type == GBE_ARG_VALUE) {
-    offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
-    assert(offset + sz <= k->curbe_sz);
-    memcpy(k->curbe + offset, value, sz);
-    k->args[index].local_sz = 0;
-    k->args[index].is_set = 1;
-    k->args[index].mem = NULL;
-    return CL_SUCCESS;
+    if (k->vme && index == 0) {
+      cl_accelerator_intel accel;
+      memcpy(&accel, value, sz);
+      offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
+      assert(offset + sz <= k->curbe_sz);
+      memcpy(k->curbe + offset, &(accel->desc.me), arg_sz);
+      k->args[index].local_sz = 0;
+      k->args[index].is_set = 1;
+      k->args[index].mem = NULL;
+      k->accel = accel;
+      return CL_SUCCESS;
+    } else {
+      offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
+      assert(offset + sz <= k->curbe_sz);
+      memcpy(k->curbe + offset, value, sz);
+      k->args[index].local_sz = 0;
+      k->args[index].is_set = 1;
+      k->args[index].mem = NULL;
+      return CL_SUCCESS;
+    }
   }
 
   /* For a local pointer just save the size */
@@ -327,6 +355,12 @@ cl_kernel_setup(cl_kernel k, gbe_kernel opaque)
   cl_buffer_subdata(k->bo, 0, code_sz, code);
   k->opaque = opaque;
 
+  const char* kname = cl_kernel_get_name(k);
+  if (strncmp(kname, "block_motion_estimate_intel", sizeof("block_motion_estimate_intel")) == 0)
+    k->vme = 1;
+  else
+    k->vme = 0;
+
   /* Create the curbe */
   k->curbe_sz = interp_kernel_get_curbe_size(k->opaque);
 
@@ -363,6 +397,7 @@ cl_kernel_dup(cl_kernel from)
   SET_ICD(to->dispatch)
   to->bo = from->bo;
   to->opaque = from->opaque;
+  to->vme = from->vme;
   to->ref_n = 1;
   to->magic = CL_MAGIC_KERNEL_HEADER;
   to->program = from->program;
diff --git a/src/cl_kernel.h b/src/cl_kernel.h
index 140bbb1..7f59162 100644
--- a/src/cl_kernel.h
+++ b/src/cl_kernel.h
@@ -24,6 +24,7 @@
 #include "cl_driver.h"
 #include "cl_gbe_loader.h"
 #include "CL/cl.h"
+#include "CL/cl_ext.h"
 
 #include <stdint.h>
 #include <stdlib.h>
@@ -37,6 +38,7 @@ struct _gbe_kernel;
 typedef struct cl_argument {
   cl_mem mem;           /* For image and regular buffers */
   cl_sampler sampler;   /* For sampler. */
+  cl_accelerator_intel accel;
   unsigned char bti;
   uint32_t local_sz:31; /* For __local size specification */
   uint32_t is_set:1;    /* All args must be set before NDRange */
@@ -50,6 +52,7 @@ struct _cl_kernel {
   cl_buffer bo;               /* The code itself */
   cl_program program;         /* Owns this structure (and pointers) */
   gbe_kernel opaque;          /* (Opaque) compiler structure for the OCL kernel */
+  cl_accelerator_intel accel;     /* Accelerator bound as argument 0 of a VME built-in kernel */
   char *curbe;                /* One curbe per kernel */
   size_t curbe_sz;            /* Size of it */
   uint32_t samplers[GEN_MAX_SAMPLERS]; /* samplers defined in kernel & kernel args */
@@ -63,8 +66,9 @@ struct _cl_kernel {
                                 (i.e. global_work_size argument to clEnqueueNDRangeKernel.)*/
   size_t stack_size;          /* stack size per work item. */
   cl_argument *args;          /* To track argument setting */
-  uint32_t arg_n:31;          /* Number of arguments */
+  uint32_t arg_n:30;          /* Number of arguments */
   uint32_t ref_its_program:1; /* True only for the user kernel (created by clCreateKernel) */
+  uint32_t vme:1;             /* True only if it is a built-in kernel for VME */
 };
 
 /* Allocate an empty kernel */
diff --git a/src/cl_utils.h b/src/cl_utils.h
index 28fdef6..ee9d614 100644
--- a/src/cl_utils.h
+++ b/src/cl_utils.h
@@ -202,6 +202,18 @@ do {                                                        \
   }                                                         \
 } while (0)
 
+#define CHECK_ACCELERATOR_INTEL(ACCELERATOR_INTEL)                              \
+do {                                                                            \
+  if (UNLIKELY(ACCELERATOR_INTEL == NULL)) {                                    \
+    err = CL_INVALID_ACCELERATOR_INTEL;                                         \
+    goto error;                                                                 \
+  }                                                                             \
+  if (UNLIKELY(ACCELERATOR_INTEL->magic != CL_MAGIC_ACCELERATOR_INTEL_HEADER)) {\
+    err = CL_INVALID_ACCELERATOR_INTEL;                                         \
+    goto error;                                                                 \
+  }                                                                             \
+} while (0)
+
 #define CHECK_KERNEL(KERNEL)                                \
 do {                                                        \
   if (UNLIKELY(KERNEL == NULL)) {                           \
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 901bd98..1b776a6 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -38,6 +38,7 @@
 #include "cl_alloc.h"
 #include "cl_utils.h"
 #include "cl_sampler.h"
+#include "cl_accelerator_intel.h"
 
 #ifndef CL_VERSION_1_2
 #define CL_MEM_OBJECT_IMAGE1D                       0x10F4
@@ -941,10 +942,12 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
   gpgpu->aux_offset.idrt_offset = size_aux;
   size_aux += MAX_IF_DESC * sizeof(struct gen6_interface_descriptor);
 
-  //sampler state must be 32 bytes aligned
+  //sampler state and vme state share the same buffer,
+  //which must be 32 bytes aligned
   size_aux = ALIGN(size_aux, 32);
   gpgpu->aux_offset.sampler_state_offset = size_aux;
-  size_aux += GEN_MAX_SAMPLERS * sizeof(gen6_sampler_state_t);
+  size_aux += MAX(GEN_MAX_SAMPLERS * sizeof(gen6_sampler_state_t),
+                  GEN_MAX_VME_STATES * sizeof(gen7_vme_state_t));
 
   //sampler border color state must be 32 bytes aligned
   size_aux = ALIGN(size_aux, 32);
@@ -985,6 +988,22 @@ intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *gpgpu, int32_t index, dri_bo* obj_
                     obj_bo);
 }
 
+static void
+intel_gpgpu_set_buf_reloc_for_vme_gen7(intel_gpgpu_t *gpgpu, int32_t index, dri_bo* obj_bo, uint32_t obj_bo_offset)
+{
+  surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
+  heap->binding_table[index] = offsetof(surface_heap_t, surface) +
+                               index * sizeof(gen7_surface_state_t);
+  dri_bo_emit_reloc(gpgpu->aux_buf.bo,
+                    I915_GEM_DOMAIN_RENDER,
+                    I915_GEM_DOMAIN_RENDER,
+                    obj_bo_offset,
+                    gpgpu->aux_offset.surface_heap_offset +
+                    heap->binding_table[index] +
+                    offsetof(gen7_media_surface_state_t, ss0),
+                    obj_bo);
+}
+
 static dri_bo*
 intel_gpgpu_alloc_constant_buffer(intel_gpgpu_t *gpgpu, uint32_t size, uint8_t bti)
 {
@@ -1201,6 +1220,55 @@ intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
 }
 
 static void
+intel_gpgpu_bind_image_for_vme_gen7(intel_gpgpu_t *gpgpu,
+                                    uint32_t index,
+                                    dri_bo* obj_bo,
+                                    uint32_t obj_bo_offset,
+                                    uint32_t format,
+                                    cl_mem_object_type type,
+                                    uint32_t bpp,
+                                    int32_t w,
+                                    int32_t h,
+                                    int32_t depth,
+                                    int32_t pitch,
+                                    int32_t slice_pitch,
+                                    int32_t tiling)
+{
+  surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
+  gen7_media_surface_state_t *ss = (gen7_media_surface_state_t *) &heap->surface[index * sizeof(gen7_surface_state_t)];
+
+  memset(ss, 0, sizeof(*ss));
+  ss->ss0.base_addr = obj_bo->offset + obj_bo_offset;
+  ss->ss1.uv_offset_v_direction = 0;
+  ss->ss1.pic_struct = 0;
+  ss->ss1.width = w - 1;
+  ss->ss1.height = h - 1;
+  if (tiling == GPGPU_NO_TILE) {
+    ss->ss2.tile_mode = 0;
+  }
+  else if (tiling == GPGPU_TILE_X){
+    ss->ss2.tile_mode = 2;
+  }
+  else if (tiling == GPGPU_TILE_Y){
+    ss->ss2.tile_mode = 3;
+  }
+  ss->ss2.half_pitch_for_chroma = 0;
+  ss->ss2.surface_pitch = pitch - 1;
+  ss->ss2.surface_object_control_state = cl_gpgpu_get_cache_ctrl();
+  ss->ss2.interleave_chroma = 0;
+  ss->ss2.surface_format = 12; //Y8_UNORM
+  ss->ss3.y_offset_for_u = 0;
+  ss->ss3.x_offset_for_u = 0;
+  ss->ss4.y_offset_for_v = 0;
+  ss->ss4.x_offset_for_v = 0;
+
+  intel_gpgpu_set_buf_reloc_for_vme_gen7(gpgpu, index, obj_bo, obj_bo_offset);
+
+  assert(index < GEN_MAX_SURFACES);
+}
+
+
+static void
 intel_gpgpu_bind_image_gen75(intel_gpgpu_t *gpgpu,
                               uint32_t index,
                               dri_bo* obj_bo,
@@ -1636,6 +1704,150 @@ int translate_wrap_mode(uint32_t cl_address_mode, int using_nearest)
    }
 }
 
+static void intel_gpgpu_insert_vme_state_gen7(intel_gpgpu_t *gpgpu, cl_accelerator_intel accel, uint32_t index)
+{
+    gen7_vme_state_t* vme = (gen7_vme_state_t*)(gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.sampler_state_offset)  + index;
+    memset(vme, 0, sizeof(*vme));
+    gen7_vme_search_path_state_t* sp = vme->sp;
+
+    if(accel->desc.me.search_path_type == CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL){
+      sp[0].dw0.SPD_0_X = 0;
+      sp[0].dw0.SPD_0_Y = 0;
+      sp[0].dw0.SPD_1_X = 0;
+      sp[0].dw0.SPD_1_Y = 0;
+      sp[0].dw0.SPD_2_X = 0;
+      sp[0].dw0.SPD_2_Y = 0;
+      sp[0].dw0.SPD_3_X = 0;
+      sp[0].dw0.SPD_3_Y = 0;
+    }
+    else if(accel->desc.me.search_path_type == CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL){
+      sp[0].dw0.SPD_0_X = 1;
+      sp[0].dw0.SPD_0_Y = 0;
+      sp[0].dw0.SPD_1_X = 0;
+      sp[0].dw0.SPD_1_Y = 1;
+      sp[0].dw0.SPD_2_X = -1;
+      sp[0].dw0.SPD_2_Y = 0;
+      sp[0].dw0.SPD_3_X = 0;
+      sp[0].dw0.SPD_3_Y = 0;
+    }
+    else if(accel->desc.me.search_path_type == CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL){
+      sp[0].dw0.SPD_0_X = 1;
+      sp[0].dw0.SPD_0_Y = 0;
+      sp[0].dw0.SPD_1_X = 1;
+      sp[0].dw0.SPD_1_Y = 0;
+      sp[0].dw0.SPD_2_X = 1;
+      sp[0].dw0.SPD_2_Y = 0;
+      sp[0].dw0.SPD_3_X = 1;
+      sp[0].dw0.SPD_3_Y = 0;
+
+      sp[1].dw0.SPD_0_X = 1;
+      sp[1].dw0.SPD_0_Y = 0;
+      sp[1].dw0.SPD_1_X = 1;
+      sp[1].dw0.SPD_1_Y = 0;
+      sp[1].dw0.SPD_2_X = 1;
+      sp[1].dw0.SPD_2_Y = 0;
+      sp[1].dw0.SPD_3_X = 0;
+      sp[1].dw0.SPD_3_Y = 1;
+
+      sp[2].dw0.SPD_0_X = -1;
+      sp[2].dw0.SPD_0_Y = 0;
+      sp[2].dw0.SPD_1_X = -1;
+      sp[2].dw0.SPD_1_Y = 0;
+      sp[2].dw0.SPD_2_X = -1;
+      sp[2].dw0.SPD_2_Y = 0;
+      sp[2].dw0.SPD_3_X = -1;
+      sp[2].dw0.SPD_3_Y = 0;
+
+      sp[3].dw0.SPD_0_X = -1;
+      sp[3].dw0.SPD_0_Y = 0;
+      sp[3].dw0.SPD_1_X = -1;
+      sp[3].dw0.SPD_1_Y = 0;
+      sp[3].dw0.SPD_2_X = -1;
+      sp[3].dw0.SPD_2_Y = 0;
+      sp[3].dw0.SPD_3_X = 0;
+      sp[3].dw0.SPD_3_Y = 1;
+
+      sp[4].dw0.SPD_0_X = 1;
+      sp[4].dw0.SPD_0_Y = 0;
+      sp[4].dw0.SPD_1_X = 1;
+      sp[4].dw0.SPD_1_Y = 0;
+      sp[4].dw0.SPD_2_X = 1;
+      sp[4].dw0.SPD_2_Y = 0;
+      sp[4].dw0.SPD_3_X = 1;
+      sp[4].dw0.SPD_3_Y = 0;
+
+      sp[5].dw0.SPD_0_X = 1;
+      sp[5].dw0.SPD_0_Y = 0;
+      sp[5].dw0.SPD_1_X = 1;
+      sp[5].dw0.SPD_1_Y = 0;
+      sp[5].dw0.SPD_2_X = 1;
+      sp[5].dw0.SPD_2_Y = 0;
+      sp[5].dw0.SPD_3_X = 0;
+      sp[5].dw0.SPD_3_Y = 1;
+
+      sp[6].dw0.SPD_0_X = -1;
+      sp[6].dw0.SPD_0_Y = 0;
+      sp[6].dw0.SPD_1_X = -1;
+      sp[6].dw0.SPD_1_Y = 0;
+      sp[6].dw0.SPD_2_X = -1;
+      sp[6].dw0.SPD_2_Y = 0;
+      sp[6].dw0.SPD_3_X = -1;
+      sp[6].dw0.SPD_3_Y = 0;
+
+      sp[7].dw0.SPD_0_X = -1;
+      sp[7].dw0.SPD_0_Y = 0;
+      sp[7].dw0.SPD_1_X = -1;
+      sp[7].dw0.SPD_1_Y = 0;
+      sp[7].dw0.SPD_2_X = -1;
+      sp[7].dw0.SPD_2_Y = 0;
+      sp[7].dw0.SPD_3_X = 0;
+      sp[7].dw0.SPD_3_Y = 1;
+
+      sp[8].dw0.SPD_0_X = 1;
+      sp[8].dw0.SPD_0_Y = 0;
+      sp[8].dw0.SPD_1_X = 1;
+      sp[8].dw0.SPD_1_Y = 0;
+      sp[8].dw0.SPD_2_X = 1;
+      sp[8].dw0.SPD_2_Y = 0;
+      sp[8].dw0.SPD_3_X = 1;
+      sp[8].dw0.SPD_3_Y = 0;
+
+      sp[9].dw0.SPD_0_X = 1;
+      sp[9].dw0.SPD_0_Y = 0;
+      sp[9].dw0.SPD_1_X = 1;
+      sp[9].dw0.SPD_1_Y = 0;
+      sp[9].dw0.SPD_2_X = 1;
+      sp[9].dw0.SPD_2_Y = 0;
+      sp[9].dw0.SPD_3_X = 0;
+      sp[9].dw0.SPD_3_Y = 1;
+
+      sp[10].dw0.SPD_0_X = -1;
+      sp[10].dw0.SPD_0_Y = 0;
+      sp[10].dw0.SPD_1_X = -1;
+      sp[10].dw0.SPD_1_Y = 0;
+      sp[10].dw0.SPD_2_X = -1;
+      sp[10].dw0.SPD_2_Y = 0;
+      sp[10].dw0.SPD_3_X = -1;
+      sp[10].dw0.SPD_3_Y = 0;
+
+      sp[11].dw0.SPD_0_X = -1;
+      sp[11].dw0.SPD_0_Y = 0;
+      sp[11].dw0.SPD_1_X = -1;
+      sp[11].dw0.SPD_1_Y = 0;
+      sp[11].dw0.SPD_2_X = -1;
+      sp[11].dw0.SPD_2_Y = 0;
+      sp[11].dw0.SPD_3_X = 0;
+      sp[11].dw0.SPD_3_Y = 0;
+    }
+}
+
+static void
+intel_gpgpu_bind_vme_state_gen7(intel_gpgpu_t *gpgpu, cl_accelerator_intel accel)
+{
+  /* Only the first vme_state is uploaded for now. */
+  intel_gpgpu_insert_vme_state_gen7(gpgpu, accel, 0);
+}
+
 static void
 intel_gpgpu_insert_sampler_gen7(intel_gpgpu_t *gpgpu, uint32_t index, uint32_t clk_sampler)
 {
@@ -2141,6 +2353,7 @@ intel_set_gpgpu_callbacks(int device_id)
   cl_gpgpu_batch_end = (cl_gpgpu_batch_end_cb *) intel_gpgpu_batch_end;
   cl_gpgpu_flush = (cl_gpgpu_flush_cb *) intel_gpgpu_flush;
   cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler_gen7;
+  cl_gpgpu_bind_vme_state = (cl_gpgpu_bind_vme_state_cb *) intel_gpgpu_bind_vme_state_gen7;
   cl_gpgpu_set_scratch = (cl_gpgpu_set_scratch_cb *) intel_gpgpu_set_scratch;
   cl_gpgpu_event_new = (cl_gpgpu_event_new_cb *)intel_gpgpu_event_new;
   cl_gpgpu_event_flush = (cl_gpgpu_event_flush_cb *)intel_gpgpu_event_flush;
@@ -2218,6 +2431,7 @@ intel_set_gpgpu_callbacks(int device_id)
   }
   else if (IS_IVYBRIDGE(device_id)) {
     cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen7;
+    cl_gpgpu_bind_image_for_vme = (cl_gpgpu_bind_image_for_vme_cb *) intel_gpgpu_bind_image_for_vme_gen7;
     if (IS_BAYTRAIL_T(device_id)) {
       intel_gpgpu_set_L3 = intel_gpgpu_set_L3_baytrail;
       intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_baytrail;
diff --git a/src/intel/intel_structs.h b/src/intel/intel_structs.h
index fd6a82b..c112a16 100644
--- a/src/intel/intel_structs.h
+++ b/src/intel/intel_structs.h
@@ -381,6 +381,57 @@ typedef struct gen8_surface_state
   } ss15;
 } gen8_surface_state_t;
 
+typedef struct gen7_media_surface_state
+{
+  struct {
+    uint32_t base_addr;
+  } ss0;
+
+  struct {
+    uint32_t uv_offset_v_direction:2;
+    uint32_t pic_struct:2;
+    uint32_t width:14;
+    uint32_t height:14;
+  } ss1;
+
+  struct {
+    uint32_t tile_mode:2;
+    uint32_t half_pitch_for_chroma:1;
+    uint32_t surface_pitch:18;
+    uint32_t pad1:1;
+    uint32_t surface_object_control_state:4;
+    uint32_t pad0:1;
+    uint32_t interleave_chroma:1;
+    uint32_t surface_format:4;
+  } ss2;
+
+  struct {
+    uint32_t y_offset_for_u:14;
+    uint32_t pad1:2;
+    uint32_t x_offset_for_u:14;
+    uint32_t pad0:2;
+  } ss3;
+
+  struct {
+    uint32_t y_offset_for_v:15;
+    uint32_t pad1:1;
+    uint32_t x_offset_for_v:14;
+    uint32_t pad0:2;
+  } ss4;
+
+  struct {
+    uint32_t pad0;
+  } ss5;
+
+  struct {
+    uint32_t pad0;
+  } ss6;
+
+  struct {
+    uint32_t pad0;
+  } ss7;
+} gen7_media_surface_state_t;
+
 typedef union gen_surface_state
 {
   gen7_surface_state_t gen7_surface_state;
@@ -555,6 +606,75 @@ typedef struct gen8_pipe_control
   } dw5;
 } gen8_pipe_control_t;
 
+#define GEN7_NUM_VME_SEARCH_PATH_STATES 14
+#define GEN7_NUM_VME_RD_LUT_SETS 4
+
+typedef struct gen7_vme_search_path_state
+{
+    struct {
+        uint32_t SPD_0_X : BITFIELD_RANGE(0, 3);        //search path distance
+        uint32_t SPD_0_Y : BITFIELD_RANGE(4, 7);
+        uint32_t SPD_1_X : BITFIELD_RANGE(8, 11);
+        uint32_t SPD_1_Y : BITFIELD_RANGE(12, 15);
+        uint32_t SPD_2_X : BITFIELD_RANGE(16, 19);
+        uint32_t SPD_2_Y : BITFIELD_RANGE(20, 23);
+        uint32_t SPD_3_X : BITFIELD_RANGE(24, 27);
+        uint32_t SPD_3_Y : BITFIELD_RANGE(28, 31);
+    }dw0;
+}gen7_vme_search_path_state_t;
+
+typedef struct gen7_vme_rd_lut_set
+{
+    struct {
+        uint32_t LUT_MbMode_0 : BITFIELD_RANGE(0, 7);
+        uint32_t LUT_MbMode_1 : BITFIELD_RANGE(8, 15);
+        uint32_t LUT_MbMode_2 : BITFIELD_RANGE(16, 23);
+        uint32_t LUT_MbMode_3 : BITFIELD_RANGE(24, 31);
+    }dw0;
+
+    struct {
+        uint32_t LUT_MbMode_4 : BITFIELD_RANGE(0, 7);
+        uint32_t LUT_MbMode_5 : BITFIELD_RANGE(8, 15);
+        uint32_t LUT_MbMode_6 : BITFIELD_RANGE(16, 23);
+        uint32_t LUT_MbMode_7 : BITFIELD_RANGE(24, 31);
+    }dw1;
+
+    struct {
+        uint32_t LUT_MV_0 : BITFIELD_RANGE(0, 7);
+        uint32_t LUT_MV_1 : BITFIELD_RANGE(8, 15);
+        uint32_t LUT_MV_2 : BITFIELD_RANGE(16, 23);
+        uint32_t LUT_MV_3 : BITFIELD_RANGE(24, 31);
+    }dw2;
+
+    struct {
+        uint32_t LUT_MV_4 : BITFIELD_RANGE(0, 7);
+        uint32_t LUT_MV_5 : BITFIELD_RANGE(8, 15);
+        uint32_t LUT_MV_6 : BITFIELD_RANGE(16, 23);
+        uint32_t LUT_MV_7 : BITFIELD_RANGE(24, 31);
+    }dw3;
+}gen7_vme_rd_lut_set_t;
+
+typedef struct gen7_vme_state
+{
+    gen7_vme_search_path_state_t sp[GEN7_NUM_VME_SEARCH_PATH_STATES];
+
+    struct {
+        uint32_t LUT_MbMode_8_0 : BITFIELD_RANGE(0, 7);
+        uint32_t LUT_MbMode_9_0 : BITFIELD_RANGE(8, 15);
+        uint32_t LUT_MbMode_8_1 : BITFIELD_RANGE(16, 23);
+        uint32_t LUT_MbMode_9_1 : BITFIELD_RANGE(24, 31);
+    }dw14;
+
+    struct {
+        uint32_t LUT_MbMode_8_2 : BITFIELD_RANGE(0, 7);
+        uint32_t LUT_MbMode_9_2 : BITFIELD_RANGE(8, 15);
+        uint32_t LUT_MbMode_8_3 : BITFIELD_RANGE(16, 23);
+        uint32_t LUT_MbMode_9_3 : BITFIELD_RANGE(24, 31);
+    }dw15;
+
+    gen7_vme_rd_lut_set_t lut[GEN7_NUM_VME_RD_LUT_SETS];
+}gen7_vme_state_t;
+
 typedef struct gen6_sampler_state
 {
   struct {
diff --git a/src/kernels/cl_internal_block_motion_estimate_intel.cl b/src/kernels/cl_internal_block_motion_estimate_intel.cl
new file mode 100644
index 0000000..51ce048
--- /dev/null
+++ b/src/kernels/cl_internal_block_motion_estimate_intel.cl
@@ -0,0 +1,224 @@
+typedef struct _motion_estimation_desc_intel {
+  uint mb_block_type;
+  uint subpixel_mode;
+  uint sad_adjust_mode;
+  uint search_path_type;
+} accelerator_intel_t;
+
+__kernel __attribute__((reqd_work_group_size(16,1,1)))
+void block_motion_estimate_intel(accelerator_intel_t accel,
+                                 __read_only  image2d_t src_image,
+                                 __read_only  image2d_t ref_image,
+                                 __global short2 * prediction_motion_vector_buffer,
+                                 __global short2 * motion_vector_buffer,
+                                 __global ushort * residuals){
+
+  uint src_grf0_dw7;
+  uint src_grf0_dw6;
+  uint src_grf0_dw5;
+  uint src_grf0_dw4;
+  uint src_grf0_dw3;
+  uint src_grf0_dw2;
+  uint src_grf0_dw1;
+  uint src_grf0_dw0;
+  uint src_grf1_dw7;
+  uint src_grf1_dw6;
+  uint src_grf1_dw5;
+  uint src_grf1_dw4;
+  uint src_grf1_dw3;
+  uint src_grf1_dw2;
+  uint src_grf1_dw1;
+  uint src_grf1_dw0;
+  uint src_grf2_dw7;
+  uint src_grf2_dw6;
+  uint src_grf2_dw5;
+  uint src_grf2_dw4;
+  uint src_grf2_dw3;
+  uint src_grf2_dw2;
+  uint src_grf2_dw1;
+  uint src_grf2_dw0;
+  uint src_grf3_dw7;
+  uint src_grf3_dw6;
+  uint src_grf3_dw5;
+  uint src_grf3_dw4;
+  uint src_grf3_dw3;
+  uint src_grf3_dw2;
+  uint src_grf3_dw1;
+  uint src_grf3_dw0;
+  uint src_grf4_dw7;
+  uint src_grf4_dw6;
+  uint src_grf4_dw5;
+  uint src_grf4_dw4;
+  uint src_grf4_dw3;
+  uint src_grf4_dw2;
+  uint src_grf4_dw1;
+  uint src_grf4_dw0;
+
+  uint8 vme_result = (uint8)(0, 0, 0, 0, 0, 0, 0, 0);
+
+  int lgid_x = get_group_id(0);
+  int lgid_y = get_group_id(1);
+
+  uint2 srcCoord = 0;
+
+  srcCoord.x = lgid_x * 16;
+  srcCoord.y = lgid_y * 16;
+
+  //CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL
+  if(accel.search_path_type == 0x0){
+    //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id?);
+    src_grf0_dw5 =   (20 << 24)         | (20 << 16)        | (0 << 8)       | (0);
+    //src_grf0_dw1 = (Ref1Y << 16)  | (Ref1X);
+    src_grf0_dw1 =   0xfffefffe;
+    //src_grf0_dw0 = (Ref0Y << 16)  | (Ref0X);
+    src_grf0_dw0 =   0xfffefffe;
+    //src_grf1_dw2 = (Start1Y << 28)                  | (Start1X << 24)                | (Start0Y << 20)
+    src_grf1_dw2 =   (0 << 28)                        | (0 << 24)                      | (0 << 20)
+                   //| (Start0X << 16)               | (Max_Num_SU << 8)              | (LenSP);
+                     | (0 << 16)                     | (2 << 8)                       | (2);
+  }
+  //CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL
+  else if(accel.search_path_type == 0x1){
+    src_grf0_dw5 =   (24 << 24)         | (24 << 16)        | (0 << 8)       | (0);
+    src_grf0_dw1 =   0xfffcfffc;
+    src_grf0_dw0 =   0xfffcfffc;
+    src_grf1_dw2 =   (0 << 28)                        | (0 << 24)                      | (0 << 20)
+                     | (0 << 16)                     | (48 << 8)                       | (48);
+  }
+  //CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL
+  else if(accel.search_path_type == 0x5){
+    src_grf0_dw5 =   (40 << 24)         | (48 << 16)        | (0 << 8)       | (0);
+    src_grf0_dw1 =   0xfff4fff0;
+    src_grf0_dw0 =   0xfff4fff0;
+    src_grf1_dw2 =   (0 << 28)                        | (0 << 24)                      | (0 << 20)
+                     | (0 << 16)                     | (48 << 8)                       | (48);
+  }
+
+  //src_grf0_dw7 = Debug;
+  src_grf0_dw7 = 0;
+  //src_grf0_dw6 = Debug;
+  src_grf0_dw6 = 0;
+  //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id?);
+  //src_grf0_dw4 = Ignored;
+  src_grf0_dw4 = 0;
+  //src_grf0_dw3 = (Reserved << 31)                 | (Sub_Mb_Part_Mask << 24)       | (Intra_SAD << 22)
+  src_grf0_dw3 =   (0 << 31)                 | (0x7e << 24)                   | (0 << 22)
+                 //| (Inter_SAD << 20)             | (BB_Skip_Enabled << 19)        | (Reserverd << 18)
+                   | (0 << 20)                     | (0 << 19)                      | (0 << 18)
+                 //| (Dis_Aligned_Src_Fetch << 17) | (Dis_Aligned_Ref_Fetch << 16)  | (Dis_Field_Cache_Alloc << 15)
+                   | (0 << 17)                     | (0 << 16)                      | (0 << 15)
+                 //| (Skip_Type << 14)             | (Sub_Pel_Mode << 12)           | (Dual_Search_Path_Opt << 11)
+                   | (0 << 14)                     | (0 << 12)                      | (0 << 11)
+                 //| (Search_Ctrl << 8)            | (Ref_Access << 7)              | (SrcAccess << 6)
+                   | (0 << 8)                      | (0 << 7)                       | (0 << 6)
+                 //| (Mb_Type_Remap << 4)          | (Reserved_Workaround << 3)     | (Reserved_Workaround << 2)
+                   | (0 << 4)                      | (0 << 3)                       | (0 << 2)
+                 //| (Src_Size);
+                   | (0);
+
+  //src_grf0_dw2 = (SrcY << 16) | (SrcX);
+  src_grf0_dw2 = (srcCoord.y << 16)  | (srcCoord.x);
+  //src_grf0_dw1 = (Ref1Y << 16)  | (Ref1X);
+  //src_grf0_dw0 = (Ref0Y << 16)  | (Ref0X);
+  /*src_grf1_dw7 = (Skip_Center_Mask << 24)         | (Reserved << 22)               | (Ref1_Field_Polarity << 21)
+                 | (Ref0_Field_Polarity << 20)   | (Src_Field_Polarity << 19)     | (Bilinear_Enable << 18)
+                 | (MV_Cost_Scale_Factor << 16)  | (Mb_Intra_Struct << 8)         | (Intra_Corner_Swap << 7)
+                 | (Non_Skip_Mode_Added << 6)    | (Non_Skip_ZMv_Added << 5)      | (IntraPartMask);*/
+  src_grf1_dw7 = 0;
+  //src_grf1_dw6 = Reserved;
+  src_grf1_dw6 = 0;
+  /*src_grf1_dw5 = (Cost_Center1Y << 16)  | (Cost_Center1X);
+  src_grf1_dw4 = (Cost_Center0Y << 16)  | (Cost_Center0X);
+  src_grf1_dw3 = (Ime_Too_Good << 24 )  | (Ime_Too_Bad << 16)  | (Part_Tolerance_Thrhd << 8) | (FBPrunThrhd);*/
+  src_grf1_dw5 = 0;
+  src_grf1_dw4 = 0;
+  src_grf1_dw3 = 0;
+  //src_grf1_dw2 = (Start1Y << 28)                  | (Start1X << 24)                | (Start0Y << 20)
+                 //| (Start0X << 16)               | (Max_Num_SU << 8)              | (LenSP);
+  /*src_grf1_dw1 = (RepartEn << 31)                 | (FBPrunEn << 30)               | (AdaptiveValidationControl << 29)
+                 | (Uni_Mix_Disable << 28)       | (Bi_Sub_Mb_Part_Mask << 24)    | (Reserverd << 22)
+                 | (Bi_Weight << 16)             | (Reserved << 6)                | (MaxNumMVs);*/
+  src_grf1_dw1 = (0 << 24) | (2);
+  /*src_grf1_dw0 = (Early_Ime_Stop << 24)           | (Early_Fme_Success << 16)      | (Skip_Success << 8)
+                 | (T8x8_Flag_For_Inter_En << 7) | (Quit_Inter_En << 6)           | (Early_Ime_Success_En << 5)
+                 | (Early_Success_En << 4)       | (Part_Candidate_En << 3)       | (Bi_Mix_Dis << 2)
+                 | (Adaptive_En  << 1)           | (SkipModeEn);*/
+  src_grf1_dw0 = 0;
+  /*src_grf2_dw7 = Ref1_SkipCenter_3_Delta_XY;
+  src_grf2_dw6 = Ref0_SkipCenter_3_Delta_XY;
+  src_grf2_dw5 = Ref1_SkipCenter_2_Delta_XY;
+  src_grf2_dw4 = Ref0_SkipCenter_3_Delta_XY;
+  src_grf2_dw3 = Ref1_SkipCenter_1_Delta_XY;
+  src_grf2_dw2 = Ref0_SkipCenter_1_Delta_XY;
+  src_grf2_dw1 = Ref1_SkipCenter_0_Delta_XY;
+  src_grf2_dw0 = (Ref0_Skip_Center_0_Delta_Y << 16)  | (Ref0_Skip_Center_0_Delta_X);
+  src_grf3_dw7 = Neighbor pixel Luma value [23, -1] to [20, -1];
+  src_grf3_dw6 = Neighbor pixel Luma value [19, -1] to [16, -1];
+  src_grf3_dw5 = Neighbor pixel Luma value [15, -1] to [12, -1];
+  src_grf3_dw4 = Neighbor pixel Luma value [11, -1] to [8, -1];
+  src_grf3_dw3 = Neighbor pixel Luma value [7, -1] to [4, -1];
+  src_grf3_dw2 = (Neighbor pixel Luma value [3, -1] << 24)    | (Neighbor pixel Luma value [2, -1] << 16)
+                 | (Neighbor pixel Luma value [1, -1] << 8)  | (Neighbor pixel Luma value [0, -1]);
+  //src_grf3_dw1 = (?)  | (Reserved)  | ((Intra_16x16_Mode_Mask);
+  src_grf3_dw0 = (Reserved<<25)  | (Intra_16x16_Mode_Mask << 16)  | (Reserved)  | (Intra_16x16_Mode_Mask);
+  src_grf4_dw7 = Reserved;
+  src_grf4_dw6 = Reserved;
+  src_grf4_dw5 = Reserved;
+  src_grf4_dw4 = (Intra_MxM_Pred_Mode_B15 << 28)    | (Intra_MxM_Pred_Mode_B14 << 24)  | (Intra_MxM_Pred_Mode_B11 << 20)
+                 | (Intra_MxM_Pred_Mode_B10 << 16) | (Intra_MxM_Pred_Mode_A15 << 12)  | (Intra_MxM_Pred_Mode_A13 << 8)
+                 | (Intra_MxM_Pred_Mode_A7 << 4)   | (Intra_MxM_Pred_Mode_A5);
+  //src_grf4_dw3 = (?)  | (Neighbor pixel Luma value [-1, 14] to [-1, 12]);
+  src_grf4_dw2 = Neighbor pixel Luma value [-1, 11] to [-1, 8];
+  src_grf4_dw1 = Neighbor pixel Luma value [-1, 7] to [-1, 4];
+  src_grf4_dw0 = (Neighbor pixel Luma value [-1, 3] << 24)    | (Neighbor pixel Luma value [-1, 2] << 16)
+                 | (Neighbor pixel Luma value [-1, 1] << 8)  | (Neighbor pixel Luma value [-1, 0]);*/
+  src_grf2_dw7 = 0;
+  src_grf2_dw6 = 0;
+  src_grf2_dw5 = 0;
+  src_grf2_dw4 = 0;
+  src_grf2_dw3 = 0;
+  src_grf2_dw2 = 0;
+  src_grf2_dw1 = 0;
+  src_grf2_dw0 = 0;
+  src_grf3_dw7 = 0;
+  src_grf3_dw6 = 0;
+  src_grf3_dw5 = 0;
+  src_grf3_dw4 = 0;
+  src_grf3_dw3 = 0;
+  src_grf3_dw2 = 0;
+  src_grf3_dw1 = 0;
+  src_grf3_dw0 = 0;
+  src_grf4_dw7 = 0;
+  src_grf4_dw6 = 0;
+  src_grf4_dw5 = 0;
+  src_grf4_dw4 = 0;
+  src_grf4_dw3 = 0;
+  src_grf4_dw2 = 0;
+  src_grf4_dw1 = 0;
+  src_grf4_dw0 = 0;
+
+  vme_result = __gen_ocl_vme(src_image, ref_image,
+                src_grf0_dw7, src_grf0_dw6, src_grf0_dw5, src_grf0_dw4,
+                src_grf0_dw3, src_grf0_dw2, src_grf0_dw1, src_grf0_dw0,
+                src_grf1_dw7, src_grf1_dw6, src_grf1_dw5, src_grf1_dw4,
+                src_grf1_dw3, src_grf1_dw2, src_grf1_dw1, src_grf1_dw0,
+                src_grf2_dw7, src_grf2_dw6, src_grf2_dw5, src_grf2_dw4,
+                src_grf2_dw3, src_grf2_dw2, src_grf2_dw1, src_grf2_dw0,
+                src_grf3_dw7, src_grf3_dw6, src_grf3_dw5, src_grf3_dw4,
+                src_grf3_dw3, src_grf3_dw2, src_grf3_dw1, src_grf3_dw0,
+                src_grf4_dw7, src_grf4_dw6, src_grf4_dw5, src_grf4_dw4,
+                src_grf4_dw3, src_grf4_dw2, src_grf4_dw1, src_grf4_dw0,
+                //msg_type, vme_search_path_lut, lut_sub,
+                1, 0, 0);
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  int lid_x = get_local_id(0);
+  uint write_back_grf1_dw0 = __gen_ocl_region(8, vme_result.s0);
+  short2 val = as_short2( write_back_grf1_dw0 );
+  int index = lgid_y * get_num_groups(0) + lgid_x;
+  if( lid_x == 0 ){
+    motion_vector_buffer[index] = val;
+  }
+
+}
diff --git a/utests/builtin_kernel_block_motion_estimate_intel.cpp b/utests/builtin_kernel_block_motion_estimate_intel.cpp
new file mode 100644
index 0000000..12bcb7d
--- /dev/null
+++ b/utests/builtin_kernel_block_motion_estimate_intel.cpp
@@ -0,0 +1,109 @@
+#include "utest_helper.hpp"
+#include <string.h>
+
+void builtin_kernel_block_motion_estimate_intel(void)
+{
+  char* built_in_kernel_names;
+  size_t built_in_kernels_size;
+  cl_int err = CL_SUCCESS;
+  size_t ret_sz;
+
+  OCL_CALL (clGetDeviceInfo, device, CL_DEVICE_BUILT_IN_KERNELS, 0, 0, &built_in_kernels_size);
+  built_in_kernel_names = (char *)malloc(built_in_kernels_size * sizeof(char));
+  OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_BUILT_IN_KERNELS, built_in_kernels_size, (void*)built_in_kernel_names, &ret_sz);
+  OCL_ASSERT(ret_sz == built_in_kernels_size);
+
+  if (strstr(built_in_kernel_names, "block_motion_estimate_intel") == NULL)
+  {
+    free(built_in_kernel_names);
+    return;
+  }
+
+  cl_program built_in_prog = clCreateProgramWithBuiltInKernels(ctx, 1, &device, built_in_kernel_names, &err);
+  OCL_ASSERT(built_in_prog != NULL);
+  kernel = clCreateKernel(built_in_prog, "block_motion_estimate_intel",  &err);
+  OCL_ASSERT(kernel != NULL);
+
+  cl_motion_estimation_desc_intel vmedesc = {CL_ME_MB_TYPE_16x16_INTEL,       //0x0
+                                          CL_ME_SUBPIXEL_MODE_INTEGER_INTEL,  //0x0
+                                          CL_ME_SAD_ADJUST_MODE_NONE_INTEL,   //0x0
+                                          CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL //0x5
+                                          };
+  cl_accelerator_intel accel = clCreateAcceleratorINTEL(ctx, CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL, sizeof(cl_motion_estimation_desc_intel), &vmedesc, &err);
+  OCL_ASSERT(accel != NULL);
+
+  const size_t w = 71; //80
+  const size_t h = 41; //48
+
+  cl_image_format format;
+  cl_image_desc desc;
+
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
+  uint8_t* image_data1 = (uint8_t *)malloc(w * h);    //src
+  uint8_t* image_data2 = (uint8_t *)malloc(w * h);    //ref
+  for (size_t j = 0; j < h; j++) {
+    for (size_t i = 0; i < w; i++) {
+      if (i >= 32 && i <= 47 && j >= 16 && j <= 31)
+        image_data2[w * j + i] = image_data1[w * j + i] = 100;
+      else
+        image_data2[w * j + i] = image_data1[w * j + i] = 0;
+    }
+  }
+
+  format.image_channel_order = CL_R;
+  format.image_channel_data_type = CL_UNORM_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = 0;
+  OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, &format, &desc, image_data1);        //src
+  OCL_CREATE_IMAGE(buf[1], CL_MEM_COPY_HOST_PTR, &format, &desc, image_data2);        //ref
+
+  const size_t mv = (80/16) * (48/16);
+  OCL_CREATE_BUFFER(buf[2], 0, mv * sizeof(int) * 4, NULL);
+
+  OCL_SET_ARG(0, sizeof(cl_accelerator_intel), &accel);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(3, sizeof(cl_mem), NULL);
+  OCL_SET_ARG(4, sizeof(cl_mem), &buf[2]);
+  OCL_SET_ARG(5, sizeof(cl_mem), NULL);
+
+  globals[0] = w;
+  globals[1] = h;
+  OCL_CALL(clEnqueueNDRangeKernel, queue, kernel, 2, NULL, globals, NULL, 0, NULL, NULL);
+
+  OCL_MAP_BUFFER(2);
+  short expected[] = {-64, -48,
+                    -64, -48,
+                    -64, -48,
+                    -64, -48,
+                    -64, -48,
+                    -64, -48,
+                    -64, -48,
+                    0, 0,
+                    0, -48,
+                    -64, -48,
+                    -64, -48,
+                    -64, -48,
+                    -64, -48,
+                    0, -48,
+                    -64, -48};
+  short* res = (short*)buf_data[2];
+  for (uint32_t j = 0; j < mv; ++j) {
+    OCL_ASSERT(res[j * 2 + 0] == expected[j * 2 + 0]);
+    OCL_ASSERT(res[j * 2 + 1] == expected[j * 2 + 1]);
+  }
+  OCL_UNMAP_BUFFER(2);
+
+  clReleaseAcceleratorINTEL(accel);
+  clReleaseKernel(kernel);
+  clReleaseProgram(built_in_prog);
+  free(built_in_kernel_names);
+  free(image_data1);
+  free(image_data2);
+}
+
+MAKE_UTEST_FROM_FUNCTION(builtin_kernel_block_motion_estimate_intel);
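
A short enqueue-side note, matching the checks added in cl_api.c above: for the
VME built-in kernel, work_dim must be 2 and local_work_size must be NULL; the
driver forces a 16x1 local size and rounds the global size so that each work
group processes one 16x16 macroblock. A hedged sketch, continuing from the
snippet after the diffstat (queue, krn, width and height are placeholders):

  /* Global size is the source image size in pixels. */
  size_t global[2] = { width, height };
  clEnqueueNDRangeKernel(queue, krn, 2, NULL, global, NULL, 0, NULL, NULL);

  /* The kernel writes one short2 motion vector per 16x16 macroblock, in
   * row-major macroblock order (see motion_vector_buffer[index] in the
   * kernel above); the utest simply compares the raw short components. */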
-- 
1.9.1


