[Beignet] [PATCH 54/57] Add all internal kernels for GEN device.

junyan.he at inbox.com junyan.he at inbox.com
Sun Jun 11 05:50:40 UTC 2017


From: Junyan He <junyan.he at intel.com>

Add these kernels are used as builtin kernels. They also help to
do some clEnqueueXXX job such as mem_copy, image_fill, etc.

Signed-off-by: Junyan He <junyan.he at intel.com>
---
 .../cl_internal_block_motion_estimate_intel.cl     | 369 +++++++++++++++++++++
 .../gen/kernels/cl_internal_copy_buf_align16.cl    |  12 +
 runtime/gen/kernels/cl_internal_copy_buf_align4.cl |   8 +
 runtime/gen/kernels/cl_internal_copy_buf_rect.cl   |  15 +
 .../kernels/cl_internal_copy_buf_rect_align4.cl    |  15 +
 .../cl_internal_copy_buf_unalign_dst_offset.cl     |  28 ++
 .../cl_internal_copy_buf_unalign_same_offset.cl    |  19 ++
 .../cl_internal_copy_buf_unalign_src_offset.cl     |  29 ++
 .../kernels/cl_internal_copy_buffer_to_image_2d.cl |  18 +
 .../cl_internal_copy_buffer_to_image_2d_align16.cl |  18 +
 .../kernels/cl_internal_copy_buffer_to_image_3d.cl |  19 ++
 .../cl_internal_copy_image_1d_array_to_1d_array.cl |  21 ++
 .../gen/kernels/cl_internal_copy_image_1d_to_1d.cl |  19 ++
 .../cl_internal_copy_image_2d_array_to_2d.cl       |  21 ++
 .../cl_internal_copy_image_2d_array_to_2d_array.cl |  23 ++
 .../cl_internal_copy_image_2d_array_to_3d.cl       |  23 ++
 .../gen/kernels/cl_internal_copy_image_2d_to_2d.cl |  21 ++
 .../cl_internal_copy_image_2d_to_2d_array.cl       |  21 ++
 .../gen/kernels/cl_internal_copy_image_2d_to_3d.cl |  22 ++
 .../kernels/cl_internal_copy_image_2d_to_buffer.cl |  19 ++
 .../cl_internal_copy_image_2d_to_buffer_align16.cl |  19 ++
 .../gen/kernels/cl_internal_copy_image_3d_to_2d.cl |  22 ++
 .../cl_internal_copy_image_3d_to_2d_array.cl       |  23 ++
 .../gen/kernels/cl_internal_copy_image_3d_to_3d.cl |  23 ++
 .../kernels/cl_internal_copy_image_3d_to_buffer.cl |  22 ++
 .../gen/kernels/cl_internal_fill_buf_align128.cl   |   9 +
 runtime/gen/kernels/cl_internal_fill_buf_align2.cl |   8 +
 runtime/gen/kernels/cl_internal_fill_buf_align4.cl |   8 +
 runtime/gen/kernels/cl_internal_fill_buf_align8.cl |  14 +
 .../gen/kernels/cl_internal_fill_buf_unalign.cl    |   8 +
 runtime/gen/kernels/cl_internal_fill_image_1d.cl   |  14 +
 .../gen/kernels/cl_internal_fill_image_1d_array.cl |  15 +
 runtime/gen/kernels/cl_internal_fill_image_2d.cl   |  15 +
 .../gen/kernels/cl_internal_fill_image_2d_array.cl |  16 +
 runtime/gen/kernels/cl_internal_fill_image_3d.cl   |  16 +
 35 files changed, 972 insertions(+)
 create mode 100644 runtime/gen/kernels/cl_internal_block_motion_estimate_intel.cl
 create mode 100644 runtime/gen/kernels/cl_internal_copy_buf_align16.cl
 create mode 100644 runtime/gen/kernels/cl_internal_copy_buf_align4.cl
 create mode 100644 runtime/gen/kernels/cl_internal_copy_buf_rect.cl
 create mode 100644 runtime/gen/kernels/cl_internal_copy_buf_rect_align4.cl
 create mode 100644 runtime/gen/kernels/cl_internal_copy_buf_unalign_dst_offset.cl
 create mode 100644 runtime/gen/kernels/cl_internal_copy_buf_unalign_same_offset.cl
 create mode 100644 runtime/gen/kernels/cl_internal_copy_buf_unalign_src_offset.cl
 create mode 100644 runtime/gen/kernels/cl_internal_copy_buffer_to_image_2d.cl
 create mode 100644 runtime/gen/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl
 create mode 100644 runtime/gen/kernels/cl_internal_copy_buffer_to_image_3d.cl
 create mode 100644 runtime/gen/kernels/cl_internal_copy_image_1d_array_to_1d_array.cl
 create mode 100644 runtime/gen/kernels/cl_internal_copy_image_1d_to_1d.cl
 create mode 100644 runtime/gen/kernels/cl_internal_copy_image_2d_array_to_2d.cl
 create mode 100644 runtime/gen/kernels/cl_internal_copy_image_2d_array_to_2d_array.cl
 create mode 100644 runtime/gen/kernels/cl_internal_copy_image_2d_array_to_3d.cl
 create mode 100644 runtime/gen/kernels/cl_internal_copy_image_2d_to_2d.cl
 create mode 100644 runtime/gen/kernels/cl_internal_copy_image_2d_to_2d_array.cl
 create mode 100644 runtime/gen/kernels/cl_internal_copy_image_2d_to_3d.cl
 create mode 100644 runtime/gen/kernels/cl_internal_copy_image_2d_to_buffer.cl
 create mode 100644 runtime/gen/kernels/cl_internal_copy_image_2d_to_buffer_align16.cl
 create mode 100644 runtime/gen/kernels/cl_internal_copy_image_3d_to_2d.cl
 create mode 100644 runtime/gen/kernels/cl_internal_copy_image_3d_to_2d_array.cl
 create mode 100644 runtime/gen/kernels/cl_internal_copy_image_3d_to_3d.cl
 create mode 100644 runtime/gen/kernels/cl_internal_copy_image_3d_to_buffer.cl
 create mode 100644 runtime/gen/kernels/cl_internal_fill_buf_align128.cl
 create mode 100644 runtime/gen/kernels/cl_internal_fill_buf_align2.cl
 create mode 100644 runtime/gen/kernels/cl_internal_fill_buf_align4.cl
 create mode 100644 runtime/gen/kernels/cl_internal_fill_buf_align8.cl
 create mode 100644 runtime/gen/kernels/cl_internal_fill_buf_unalign.cl
 create mode 100644 runtime/gen/kernels/cl_internal_fill_image_1d.cl
 create mode 100644 runtime/gen/kernels/cl_internal_fill_image_1d_array.cl
 create mode 100644 runtime/gen/kernels/cl_internal_fill_image_2d.cl
 create mode 100644 runtime/gen/kernels/cl_internal_fill_image_2d_array.cl
 create mode 100644 runtime/gen/kernels/cl_internal_fill_image_3d.cl

diff --git a/runtime/gen/kernels/cl_internal_block_motion_estimate_intel.cl b/runtime/gen/kernels/cl_internal_block_motion_estimate_intel.cl
new file mode 100644
index 0000000..7216229
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_block_motion_estimate_intel.cl
@@ -0,0 +1,369 @@
+typedef struct _motion_estimation_desc_intel {
+  uint mb_block_type;
+  uint subpixel_mode;
+  uint sad_adjust_mode;
+  uint search_path_type;
+} accelerator_intel_t;
+
+__kernel __attribute__((reqd_work_group_size(16,1,1)))
+void block_motion_estimate_intel(accelerator_intel_t accel,
+                                 __read_only  image2d_t src_image,
+                                 __read_only  image2d_t ref_image,
+                                 __global short2 * prediction_motion_vector_buffer,
+                                 __global short2 * motion_vector_buffer,
+                                 __global ushort * residuals){
+
+  uint src_grf0_dw7;
+  uint src_grf0_dw6;
+  uint src_grf0_dw5;
+  uint src_grf0_dw4;
+  uint src_grf0_dw3;
+  uint src_grf0_dw2;
+  uint src_grf0_dw1;
+  uint src_grf0_dw0;
+  uint src_grf1_dw7;
+  uint src_grf1_dw6;
+  uint src_grf1_dw5;
+  uint src_grf1_dw4;
+  uint src_grf1_dw3;
+  uint src_grf1_dw2;
+  uint src_grf1_dw1;
+  uint src_grf1_dw0;
+  uint src_grf2_dw7;
+  uint src_grf2_dw6;
+  uint src_grf2_dw5;
+  uint src_grf2_dw4;
+  uint src_grf2_dw3;
+  uint src_grf2_dw2;
+  uint src_grf2_dw1;
+  uint src_grf2_dw0;
+  uint src_grf3_dw7;
+  uint src_grf3_dw6;
+  uint src_grf3_dw5;
+  uint src_grf3_dw4;
+  uint src_grf3_dw3;
+  uint src_grf3_dw2;
+  uint src_grf3_dw1;
+  uint src_grf3_dw0;
+  uint src_grf4_dw7;
+  uint src_grf4_dw6;
+  uint src_grf4_dw5;
+  uint src_grf4_dw4;
+  uint src_grf4_dw3;
+  uint src_grf4_dw2;
+  uint src_grf4_dw1;
+  uint src_grf4_dw0;
+
+  uint8 vme_result = (0);
+
+  int lgid_x = get_group_id(0);
+  int lgid_y = get_group_id(1);
+
+  int num_groups_x = get_num_groups(0);
+  int index = lgid_y * num_groups_x + lgid_x;
+
+  uint2 srcCoord = 0;
+  short2 predict_mv = 0;
+  if(prediction_motion_vector_buffer != NULL){
+    predict_mv = prediction_motion_vector_buffer[index];
+    predict_mv.x = predict_mv.x / 4;
+    predict_mv.y = predict_mv.y / 4;
+  }
+
+  srcCoord.x = lgid_x * 16;
+  srcCoord.y = lgid_y * 16;
+
+  //CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL
+  if(accel.search_path_type == 0x0){
+    //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id);
+    src_grf0_dw5 =   (20 << 24)         | (20 << 16)        | (0 << 8)       | (0);
+    //src_grf0_dw1 = (Ref1Y << 16)  | (Ref1X);
+    src_grf0_dw1 =   ((-2 + predict_mv.y) << 16 ) | ((-2 + predict_mv.x) & 0x0000ffff);
+    //src_grf0_dw0 = (Ref0Y << 16)  | (Ref0X);
+    src_grf0_dw0 =   ((-2 + predict_mv.y) << 16 ) | ((-2 + predict_mv.x) & 0x0000ffff);
+    //src_grf1_dw2 = (Start1Y << 28)                  | (Start1X << 24)                | (Start0Y << 20)
+    src_grf1_dw2 =   (0 << 28)                        | (0 << 24)                      | (0 << 20)
+                   //| (Start0X << 16)               | (Max_Num_SU << 8)              | (LenSP);
+                     | (0 << 16)                     | (2 << 8)                       | (2);
+  }
+  //CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL
+  else if(accel.search_path_type == 0x1){
+    src_grf0_dw5 =   (24 << 24)         | (24 << 16)        | (0 << 8)       | (0);
+    src_grf0_dw1 =   ((-4 + predict_mv.y) << 16 ) | ((-4 + predict_mv.x) & 0x0000ffff);
+    src_grf0_dw0 =   ((-4 + predict_mv.y) << 16 ) | ((-4 + predict_mv.x) & 0x0000ffff);
+    src_grf1_dw2 =   (0 << 28)                        | (0 << 24)                      | (0 << 20)
+                     | (0 << 16)                     | (48 << 8)                       | (48);
+  }
+  //CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL
+  else if(accel.search_path_type == 0x5){
+    src_grf0_dw5 =   (40 << 24)         | (48 << 16)        | (0 << 8)       | (0);
+    src_grf0_dw1 =   ((-12 + predict_mv.y) << 16 ) | ((-16 + predict_mv.x) & 0x0000ffff);
+    src_grf0_dw0 =   ((-12 + predict_mv.y) << 16 ) | ((-16 +  + predict_mv.x) & 0x0000ffff);
+    src_grf1_dw2 =   (0 << 28)                        | (0 << 24)                      | (0 << 20)
+                     | (0 << 16)                     | (48 << 8)                       | (48);
+  }
+
+  /*Deal with mb_block_type & sad_adjust_mode & subpixel_mode*/
+  uchar sub_mb_part_mask = 0;
+  //CL_ME_MB_TYPE_16x16_INTEL
+  if(accel.mb_block_type == 0x0)
+    sub_mb_part_mask = 0x7e;
+  //CL_ME_MB_TYPE_8x8_INTEL
+  else if(accel.mb_block_type == 0x1)
+    sub_mb_part_mask = 0x77;
+  //CL_ME_MB_TYPE_4x4_INTEL
+  else if(accel.mb_block_type == 0x2)
+    sub_mb_part_mask = 0x3f;
+
+  uchar inter_sad = 0;
+  //CL_ME_SAD_ADJUST_MODE_NONE_INTEL
+  if(accel.sad_adjust_mode == 0x0)
+    inter_sad = 0;
+  //CL_ME_SAD_ADJUST_MODE_HAAR_INTEL
+  else if(accel.sad_adjust_mode == 0x1)
+    inter_sad = 2;
+
+  uchar sub_pel_mode = 0;
+  //CL_ME_SUBPIXEL_MODE_INTEGER_INTEL
+  if(accel.subpixel_mode == 0x0)
+    sub_pel_mode = 0;
+  //CL_ME_SUBPIXEL_MODE_HPEL_INTEL
+  else if(accel.subpixel_mode == 0x1)
+    sub_pel_mode = 1;
+  //CL_ME_SUBPIXEL_MODE_QPEL_INTEL
+  else if(accel.subpixel_mode == 0x2)
+    sub_pel_mode = 3;
+
+  //src_grf0_dw3 = (Reserved << 31)                | (Sub_Mb_Part_Mask << 24)       | (Intra_SAD << 22)
+  src_grf0_dw3 =   (0 << 31)                       | (sub_mb_part_mask << 24)       | (0 << 22)
+                 //| (Inter_SAD << 20)             | (BB_Skip_Enabled << 19)        | (Reserverd << 18)
+                   | (inter_sad << 20)             | (0 << 19)                      | (0 << 18)
+                 //| (Dis_Aligned_Src_Fetch << 17) | (Dis_Aligned_Ref_Fetch << 16)  | (Dis_Field_Cache_Alloc << 15)
+                   | (0 << 17)                     | (0 << 16)                      | (0 << 15)
+                 //| (Skip_Type << 14)             | (Sub_Pel_Mode << 12)           | (Dual_Search_Path_Opt << 11)
+                   | (0 << 14)                     | (sub_pel_mode << 12)           | (0 << 11)
+                 //| (Search_Ctrl << 8)            | (Ref_Access << 7)              | (SrcAccess << 6)
+                   | (0 << 8)                      | (0 << 7)                       | (0 << 6)
+                 //| (Mb_Type_Remap << 4)          | (Reserved_Workaround << 3)     | (Reserved_Workaround << 2)
+                   | (0 << 4)                      | (0 << 3)                       | (0 << 2)
+                 //| (Src_Size);
+                   | (0);
+
+
+  //src_grf0_dw7 = Debug;
+  src_grf0_dw7 = 0;
+  //src_grf0_dw6 = Debug;
+  src_grf0_dw6 = 0;
+  //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id?);
+  //src_grf0_dw4 = Ignored;
+  src_grf0_dw4 = 0;
+
+  //src_grf0_dw2 = (SrcY << 16) | (SrcX);
+  src_grf0_dw2 = (srcCoord.y << 16)  | (srcCoord.x);
+  //src_grf0_dw1 = (Ref1Y << 16)  | (Ref1X);
+  //src_grf0_dw0 = (Ref0Y << 16)  | (Ref0X);
+  /*src_grf1_dw7 = (Skip_Center_Mask << 24)         | (Reserved << 22)               | (Ref1_Field_Polarity << 21)
+                 | (Ref0_Field_Polarity << 20)   | (Src_Field_Polarity << 19)     | (Bilinear_Enable << 18)
+                 | (MV_Cost_Scale_Factor << 16)  | (Mb_Intra_Struct << 8)         | (Intra_Corner_Swap << 7)
+                 | (Non_Skip_Mode_Added << 6)    | (Non_Skip_ZMv_Added << 5)      | (IntraPartMask);*/
+  src_grf1_dw7 = 0;
+  //src_grf1_dw6 = Reserved;
+  src_grf1_dw6 = 0;
+  /*src_grf1_dw5 = (Cost_Center1Y << 16)  | (Cost_Center1X);
+  src_grf1_dw4 = (Cost_Center0Y << 16)  | (Cost_Center0X);
+  src_grf1_dw3 = (Ime_Too_Good << 24 )  | (Ime_Too_Bad << 16)  | (Part_Tolerance_Thrhd << 8) | (FBPrunThrhd);*/
+  src_grf1_dw5 = 0;
+  src_grf1_dw4 = 0;
+  src_grf1_dw3 = 0;
+  //src_grf1_dw2 = (Start1Y << 28)                  | (Start1X << 24)                | (Start0Y << 20)
+                 //| (Start0X << 16)               | (Max_Num_SU << 8)              | (LenSP);
+  /*src_grf1_dw1 = (RepartEn << 31)                 | (FBPrunEn << 30)               | (AdaptiveValidationControl << 29)
+                 | (Uni_Mix_Disable << 28)       | (Bi_Sub_Mb_Part_Mask << 24)    | (Reserverd << 22)
+                 | (Bi_Weight << 16)             | (Reserved << 6)                | (MaxNumMVs);*/
+  //src_grf1_dw1 = (0 << 24) | (2);
+  src_grf1_dw1 = (0 << 24) | (16);
+  /*src_grf1_dw0 = (Early_Ime_Stop << 24)           | (Early_Fme_Success << 16)      | (Skip_Success << 8)
+                 | (T8x8_Flag_For_Inter_En << 7) | (Quit_Inter_En << 6)           | (Early_Ime_Success_En << 5)
+                 | (Early_Success_En << 4)       | (Part_Candidate_En << 3)       | (Bi_Mix_Dis << 2)
+                 | (Adaptive_En  << 1)           | (SkipModeEn);*/
+  src_grf1_dw0 = 0;
+  /*src_grf2_dw7 = Ref1_SkipCenter_3_Delta_XY;
+  src_grf2_dw6 = Ref0_SkipCenter_3_Delta_XY;
+  src_grf2_dw5 = Ref1_SkipCenter_2_Delta_XY;
+  src_grf2_dw4 = Ref0_SkipCenter_3_Delta_XY;
+  src_grf2_dw3 = Ref1_SkipCenter_1_Delta_XY;
+  src_grf2_dw2 = Ref0_SkipCenter_1_Delta_XY;
+  src_grf2_dw1 = Ref1_SkipCenter_0_Delta_XY;
+  src_grf2_dw0 = (Ref0_Skip_Center_0_Delta_Y << 16)  | (Ref0_Skip_Center_0_Delta_X);
+  src_grf3_dw7 = Neighbor pixel Luma value [23, -1] to [20, -1];
+  src_grf3_dw6 = Neighbor pixel Luma value [19, -1] to [16, -1];
+  src_grf3_dw5 = Neighbor pixel Luma value [15, -1] to [12, -1];
+  src_grf3_dw4 = Neighbor pixel Luma value [11, -1] to [8, -1];
+  src_grf3_dw3 = Neighbor pixel Luma value [7, -1] to [4, -1];
+  src_grf3_dw2 = (Neighbor pixel Luma value [3, -1] << 24)    | (Neighbor pixel Luma value [2, -1] << 16)
+                 | (Neighbor pixel Luma value [1, -1] << 8)  | (Neighbor pixel Luma value [0, -1]);
+  //src_grf3_dw1 = (?)  | (Reserved)  | ((Intra_16x16_Mode_Mask);
+  src_grf3_dw0 = (Reserved<<25)  | (Intra_16x16_Mode_Mask << 16)  | (Reserved)  | (Intra_16x16_Mode_Mask);
+  src_grf4_dw7 = Reserved;
+  src_grf4_dw6 = Reserved;
+  src_grf4_dw5 = Reserved;
+  src_grf4_dw4 = (Intra_MxM_Pred_Mode_B15 << 28)    | (Intra_MxM_Pred_Mode_B14 << 24)  | (Intra_MxM_Pred_Mode_B11 << 20)
+                 | (Intra_MxM_Pred_Mode_B10 << 16) | (Intra_MxM_Pred_Mode_A15 << 12)  | (Intra_MxM_Pred_Mode_A13 << 8)
+                 | (Intra_MxM_Pred_Mode_A7 << 4)   | (Intra_MxM_Pred_Mode_A5);
+  //src_grf4_dw3 = (?)  | (Neighbor pixel Luma value [-1, 14] to [-1, 12]);
+  src_grf4_dw2 = Neighbor pixel Luma value [-1, 11] to [-1, 8];
+  src_grf4_dw1 = Neighbor pixel Luma value [-1, 7] to [-1, 4];
+  src_grf4_dw0 = (Neighbor pixel Luma value [-1, 3] << 24)    | (Neighbor pixel Luma value [-1, 2] << 16)
+                 | (Neighbor pixel Luma value [-1, 1] << 8)  | (Neighbor pixel Luma value [-1, 0]);*/
+  src_grf2_dw7 = 0;
+  src_grf2_dw6 = 0;
+  src_grf2_dw5 = 0;
+  src_grf2_dw4 = 0;
+  src_grf2_dw3 = 0;
+  src_grf2_dw2 = 0;
+  src_grf2_dw1 = 0;
+  src_grf2_dw0 = 0;
+  src_grf3_dw7 = 0;
+  src_grf3_dw6 = 0;
+  src_grf3_dw5 = 0;
+  src_grf3_dw4 = 0;
+  src_grf3_dw3 = 0;
+  src_grf3_dw2 = 0;
+  src_grf3_dw1 = 0;
+  src_grf3_dw0 = 0;
+  src_grf4_dw7 = 0;
+  src_grf4_dw6 = 0;
+  src_grf4_dw5 = 0;
+  src_grf4_dw4 = 0;
+  src_grf4_dw3 = 0;
+  src_grf4_dw2 = 0;
+  src_grf4_dw1 = 0;
+  src_grf4_dw0 = 0;
+
+  int lid_x = get_local_id(0);
+
+  vme_result = __gen_ocl_vme(src_image, ref_image,
+                src_grf0_dw7, src_grf0_dw6, src_grf0_dw5, src_grf0_dw4,
+                src_grf0_dw3, src_grf0_dw2, src_grf0_dw1, src_grf0_dw0,
+                src_grf1_dw7, src_grf1_dw6, src_grf1_dw5, src_grf1_dw4,
+                src_grf1_dw3, src_grf1_dw2, src_grf1_dw1, src_grf1_dw0,
+                src_grf2_dw7, src_grf2_dw6, src_grf2_dw5, src_grf2_dw4,
+                src_grf2_dw3, src_grf2_dw2, src_grf2_dw1, src_grf2_dw0,
+                src_grf3_dw7, src_grf3_dw6, src_grf3_dw5, src_grf3_dw4,
+                src_grf3_dw3, src_grf3_dw2, src_grf3_dw1, src_grf3_dw0,
+                src_grf4_dw7, src_grf4_dw6, src_grf4_dw5, src_grf4_dw4,
+                src_grf4_dw3, src_grf4_dw2, src_grf4_dw1, src_grf4_dw0,
+                //msg_type, vme_search_path_lut, lut_sub,
+                1, 0, 0);
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  short2 mv[16];
+  ushort res[16];
+
+  uint write_back_dwx;
+  uint simd_width = get_max_sub_group_size();
+
+  /* In simd 8 mode, one kernel variable 'uint' map to 8 dword.
+   * In simd 16 mode, one kernel variable 'uint' map to 16 dword.
+   * That's why we should treat simd8 and simd16 differently when
+   * use __gen_ocl_region.
+   * */
+  if(simd_width == 8){
+    write_back_dwx = __gen_ocl_region(0, vme_result.s1);
+    mv[0] = as_short2( write_back_dwx );
+
+    if(accel.mb_block_type > 0x0){
+      for(int i = 2, j = 1; j < 4; i += 2, j++){
+        write_back_dwx = __gen_ocl_region(i, vme_result.s1);
+        mv[j] = as_short2( write_back_dwx );
+      }
+      if(accel.mb_block_type > 0x1){
+        for(int i = 0, j = 4; j < 8; i += 2, j++){
+          write_back_dwx = __gen_ocl_region(i, vme_result.s2);
+          mv[j] = as_short2( write_back_dwx );
+        }
+        for(int i = 0, j = 8; j < 12; i += 2, j++){
+          write_back_dwx = __gen_ocl_region(i, vme_result.s3);
+          mv[j] = as_short2( write_back_dwx );
+        }
+        for(int i = 0, j = 12; j < 16; i += 2, j++){
+          write_back_dwx = __gen_ocl_region(i, vme_result.s4);
+          mv[j] = as_short2( write_back_dwx );
+        }
+      }
+    }
+    ushort2 temp_res;
+    for(int i = 0; i < 8; i++){
+      write_back_dwx = __gen_ocl_region(i, vme_result.s5);
+      temp_res = as_ushort2(write_back_dwx);
+      res[i*2] = temp_res.s0;
+      res[i*2+1] = temp_res.s1;
+    }
+  }
+  else if(simd_width == 16){
+    write_back_dwx = __gen_ocl_region(0 + 8, vme_result.s0);
+    mv[0] = as_short2( write_back_dwx );
+
+    if(accel.mb_block_type > 0x0){
+      for(int i = 2, j = 1; j < 4; i += 2, j++){
+        write_back_dwx = __gen_ocl_region(i + 8, vme_result.s0);
+        mv[j] = as_short2( write_back_dwx );
+      }
+      if(accel.mb_block_type > 0x1){
+        for(int i = 0, j = 4; j < 8; i += 2, j++){
+          write_back_dwx = __gen_ocl_region(i, vme_result.s1);
+          mv[j] = as_short2( write_back_dwx );
+        }
+        for(int i = 0, j = 8; j < 12; i += 2, j++){
+          write_back_dwx = __gen_ocl_region(i + 8, vme_result.s1);
+          mv[j] = as_short2( write_back_dwx );
+        }
+        for(int i = 0, j = 12; j < 16; i += 2, j++){
+          write_back_dwx = __gen_ocl_region(i, vme_result.s2);
+          mv[j] = as_short2( write_back_dwx );
+        }
+      }
+    }
+    ushort2 temp_res;
+    for(int i = 0; i < 8; i++){
+      write_back_dwx = __gen_ocl_region(i + 8, vme_result.s2);
+      temp_res = as_ushort2(write_back_dwx);
+      res[i*2] = temp_res.s0;
+      res[i*2+1] = temp_res.s1;
+    }
+  }
+
+  int mv_index;
+
+  //CL_ME_MB_TYPE_16x16_INTEL
+  if(accel.mb_block_type == 0x0){
+    mv_index = index * 1;
+    if( lid_x == 0 ){
+      motion_vector_buffer[mv_index] = mv[lid_x];
+      if(residuals)
+        residuals[mv_index] = 2 * res[lid_x];
+    }
+  }
+  //CL_ME_MB_TYPE_8x8_INTEL
+  else if(accel.mb_block_type == 0x1){
+    if(lid_x < 4){
+      mv_index = lgid_y * num_groups_x * 4 + lgid_x * 2;
+      mv_index = mv_index + num_groups_x * 2 * (lid_x / 2) + (lid_x % 2);
+      motion_vector_buffer[mv_index] = mv[lid_x];
+      if(residuals)
+        residuals[mv_index] = 2 * res[lid_x];
+    }
+  }
+  //CL_ME_MB_TYPE_4x4_INTEL
+  else if(accel.mb_block_type == 0x2){
+    if(lid_x < 16){
+      mv_index = lgid_y * num_groups_x * 16 + lgid_x * 4;
+      mv_index = mv_index + num_groups_x * 4 * (lid_x / 4) + (lid_x % 4);
+      motion_vector_buffer[mv_index] = mv[lid_x];
+      if(residuals)
+        residuals[mv_index] = 2 * res[lid_x];
+    }
+  }
+
+}
diff --git a/runtime/gen/kernels/cl_internal_copy_buf_align16.cl b/runtime/gen/kernels/cl_internal_copy_buf_align16.cl
new file mode 100644
index 0000000..1abb4e9
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_copy_buf_align16.cl
@@ -0,0 +1,12 @@
+kernel void __cl_copy_region_align16 ( global float* src, unsigned int src_offset,
+                                      global float* dst, unsigned int dst_offset,
+				      unsigned int size)
+{
+    int i = get_global_id(0) * 4;
+    if (i < size*4) {
+        dst[i+dst_offset] = src[i+src_offset];
+        dst[i+dst_offset + 1] = src[i+src_offset + 1];
+        dst[i+dst_offset + 2] = src[i+src_offset + 2];
+        dst[i+dst_offset + 3] = src[i+src_offset + 3];
+    }
+}
diff --git a/runtime/gen/kernels/cl_internal_copy_buf_align4.cl b/runtime/gen/kernels/cl_internal_copy_buf_align4.cl
new file mode 100644
index 0000000..27174ca
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_copy_buf_align4.cl
@@ -0,0 +1,8 @@
+kernel void __cl_copy_region_align4 ( global float* src, unsigned int src_offset,
+                                     global float* dst, unsigned int dst_offset,
+				     unsigned int size)
+{
+    int i = get_global_id(0);
+    if (i < size)
+        dst[i+dst_offset] = src[i+src_offset];
+}
diff --git a/runtime/gen/kernels/cl_internal_copy_buf_rect.cl b/runtime/gen/kernels/cl_internal_copy_buf_rect.cl
new file mode 100644
index 0000000..71e7484
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_copy_buf_rect.cl
@@ -0,0 +1,15 @@
+kernel void __cl_copy_buffer_rect ( global char* src, global char* dst,
+                                          unsigned int region0, unsigned int region1, unsigned int region2,
+                                          unsigned int src_offset, unsigned int dst_offset,
+                                          unsigned int src_row_pitch, unsigned int src_slice_pitch,
+                                          unsigned int dst_row_pitch, unsigned int dst_slice_pitch)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  src_offset += k * src_slice_pitch + j * src_row_pitch + i;
+  dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i;
+  dst[dst_offset] = src[src_offset];
+}
diff --git a/runtime/gen/kernels/cl_internal_copy_buf_rect_align4.cl b/runtime/gen/kernels/cl_internal_copy_buf_rect_align4.cl
new file mode 100644
index 0000000..fbfe7b2
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_copy_buf_rect_align4.cl
@@ -0,0 +1,15 @@
+kernel void __cl_copy_buffer_rect_align4 ( global int* src, global int* dst,
+                                          unsigned int region0, unsigned int region1, unsigned int region2,
+                                          unsigned int src_offset, unsigned int dst_offset,
+                                          unsigned int src_row_pitch, unsigned int src_slice_pitch,
+                                          unsigned int dst_row_pitch, unsigned int dst_slice_pitch)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  src_offset += k * src_slice_pitch + j * src_row_pitch + i;
+  dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i;
+  dst[dst_offset] = src[src_offset];
+}
diff --git a/runtime/gen/kernels/cl_internal_copy_buf_unalign_dst_offset.cl b/runtime/gen/kernels/cl_internal_copy_buf_unalign_dst_offset.cl
new file mode 100644
index 0000000..e02d0e5
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_copy_buf_unalign_dst_offset.cl
@@ -0,0 +1,28 @@
+kernel void __cl_copy_region_unalign_dst_offset ( global int* src, unsigned int src_offset,
+                                     global int* dst, unsigned int dst_offset,
+				     unsigned int size,
+				     unsigned int first_mask, unsigned int last_mask,
+				     unsigned int shift, unsigned int dw_mask)
+{
+    int i = get_global_id(0);
+    unsigned int tmp = 0;
+
+    if (i > size -1)
+        return;
+
+    /* last dw, need to be careful, not to overflow the source. */
+    if ((i == size - 1) && ((last_mask & (~(~dw_mask >> shift))) == 0)) {
+        tmp = ((src[src_offset + i] & ~dw_mask) >> shift);
+    } else {
+        tmp = ((src[src_offset + i] & ~dw_mask) >> shift)
+             | ((src[src_offset + i + 1] & dw_mask) << (32 - shift));
+    }
+
+    if (i == 0) {
+        dst[dst_offset] = (dst[dst_offset] & first_mask) | (tmp & (~first_mask));
+    } else if (i == size - 1) {
+        dst[i+dst_offset] = (tmp & last_mask) | (dst[i+dst_offset] & (~last_mask));
+    } else {
+        dst[i+dst_offset] = tmp;
+    }
+}
diff --git a/runtime/gen/kernels/cl_internal_copy_buf_unalign_same_offset.cl b/runtime/gen/kernels/cl_internal_copy_buf_unalign_same_offset.cl
new file mode 100644
index 0000000..83b6e97
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_copy_buf_unalign_same_offset.cl
@@ -0,0 +1,19 @@
+kernel void __cl_copy_region_unalign_same_offset ( global int* src, unsigned int src_offset,
+                                     global int* dst, unsigned int dst_offset,
+				     unsigned int size,
+				     unsigned int first_mask, unsigned int last_mask)
+{
+    int i = get_global_id(0);
+    if (i > size -1)
+       return;
+
+    if (i == 0) {
+        dst[dst_offset] = (dst[dst_offset] & first_mask)
+             | (src[src_offset] & (~first_mask));
+    } else if (i == size - 1) {
+        dst[i+dst_offset] = (src[i+src_offset] & last_mask)
+            | (dst[i+dst_offset] & (~last_mask));
+    } else {
+        dst[i+dst_offset] = src[i+src_offset];
+    }
+}
diff --git a/runtime/gen/kernels/cl_internal_copy_buf_unalign_src_offset.cl b/runtime/gen/kernels/cl_internal_copy_buf_unalign_src_offset.cl
new file mode 100644
index 0000000..ce0aa1d
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_copy_buf_unalign_src_offset.cl
@@ -0,0 +1,29 @@
+kernel void __cl_copy_region_unalign_src_offset ( global int* src, unsigned int src_offset,
+                                     global int* dst, unsigned int dst_offset,
+				     unsigned int size,
+				     unsigned int first_mask, unsigned int last_mask,
+				     unsigned int shift, unsigned int dw_mask, int src_less)
+{
+    int i = get_global_id(0);
+    unsigned int tmp = 0;
+
+    if (i > size -1)
+        return;
+
+    if (i == 0) {
+        tmp = ((src[src_offset + i] & dw_mask) << shift);
+    } else if (src_less && i == size - 1) { // not exceed the bound of source
+        tmp = ((src[src_offset + i - 1] & ~dw_mask) >> (32 - shift));
+    } else {
+        tmp = ((src[src_offset + i - 1] & ~dw_mask) >> (32 - shift))
+             | ((src[src_offset + i] & dw_mask) << shift);
+    }
+
+    if (i == 0) {
+        dst[dst_offset] = (dst[dst_offset] & first_mask) | (tmp & (~first_mask));
+    } else if (i == size - 1) {
+        dst[i+dst_offset] = (tmp & last_mask) | (dst[i+dst_offset] & (~last_mask));
+    } else {
+        dst[i+dst_offset] = tmp;
+    }
+}
diff --git a/runtime/gen/kernels/cl_internal_copy_buffer_to_image_2d.cl b/runtime/gen/kernels/cl_internal_copy_buffer_to_image_2d.cl
new file mode 100644
index 0000000..e162393
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_copy_buffer_to_image_2d.cl
@@ -0,0 +1,18 @@
+kernel void __cl_copy_buffer_to_image_2d(__write_only image2d_t image, global uchar* buffer,
+                                        unsigned int region0, unsigned int region1, unsigned int region2,
+                                        unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2,
+                                        unsigned int src_offset)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  uint4 color = (uint4)(0);
+  int2 dst_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  dst_coord.x = dst_origin0 + i;
+  dst_coord.y = dst_origin1 + j;
+  src_offset += (k * region1 + j) * region0 + i;
+  color.x = buffer[src_offset];
+  write_imageui(image, dst_coord, color);
+}
diff --git a/runtime/gen/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl b/runtime/gen/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl
new file mode 100644
index 0000000..5b32cd5
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_copy_buffer_to_image_2d_align16.cl
@@ -0,0 +1,18 @@
+kernel void __cl_copy_buffer_to_image_2d_align16(__write_only image2d_t image, global uint4* buffer,
+                                        unsigned int region0, unsigned int region1, unsigned int region2,
+                                        unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2,
+                                        unsigned int src_offset)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  uint4 color = (uint4)(0);
+  int2 dst_coord;
+  if((i >= region0) || (j>= region1))
+    return;
+  dst_coord.x = dst_origin0 + i;
+  dst_coord.y = dst_origin1 + j;
+  src_offset += j * region0 + i;
+  color = buffer[src_offset];
+  write_imageui(image, dst_coord, color);
+}
+
diff --git a/runtime/gen/kernels/cl_internal_copy_buffer_to_image_3d.cl b/runtime/gen/kernels/cl_internal_copy_buffer_to_image_3d.cl
new file mode 100644
index 0000000..5f0e890
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_copy_buffer_to_image_3d.cl
@@ -0,0 +1,19 @@
+kernel void __cl_copy_buffer_to_image_3d(__write_only image3d_t image, global uchar* buffer,
+                                        unsigned int region0, unsigned int region1, unsigned int region2,
+                                        unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2,
+                                        unsigned int src_offset)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  uint4 color = (uint4)(0);
+  int4 dst_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  dst_coord.x = dst_origin0 + i;
+  dst_coord.y = dst_origin1 + j;
+  dst_coord.z = dst_origin2 + k;
+  src_offset += (k * region1 + j) * region0 + i;
+  color.x = buffer[src_offset];
+  write_imageui(image, dst_coord, color);
+}
diff --git a/runtime/gen/kernels/cl_internal_copy_image_1d_array_to_1d_array.cl b/runtime/gen/kernels/cl_internal_copy_image_1d_array_to_1d_array.cl
new file mode 100644
index 0000000..0c7c6e2
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_copy_image_1d_array_to_1d_array.cl
@@ -0,0 +1,21 @@
+kernel void __cl_copy_image_1d_array_to_1d_array(__read_only image1d_array_t src_image, __write_only image1d_array_t dst_image,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+                             unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2)
+{
+  int i = get_global_id(0);
+  int k = get_global_id(2);
+  int4 color;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+  int2 src_coord;
+  int2 dst_coord;
+  if((i >= region0) || (k>=region2))
+    return;
+
+  src_coord.x = src_origin0 + i;
+  src_coord.y = src_origin2 + k;
+  dst_coord.x = dst_origin0 + i;
+  dst_coord.y = dst_origin2 + k;
+  color = read_imagei(src_image, sampler, src_coord);
+  write_imagei(dst_image, dst_coord, color);
+}
diff --git a/runtime/gen/kernels/cl_internal_copy_image_1d_to_1d.cl b/runtime/gen/kernels/cl_internal_copy_image_1d_to_1d.cl
new file mode 100644
index 0000000..dca82b2
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_copy_image_1d_to_1d.cl
@@ -0,0 +1,19 @@
+kernel void __cl_copy_image_1d_to_1d(__read_only image1d_t src_image, __write_only image1d_t dst_image,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+                             unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int4 color;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+  int src_coord;
+  int dst_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  src_coord = src_origin0 + i;
+  dst_coord = dst_origin0 + i;
+  color = read_imagei(src_image, sampler, src_coord);
+  write_imagei(dst_image, dst_coord, color);
+}
diff --git a/runtime/gen/kernels/cl_internal_copy_image_2d_array_to_2d.cl b/runtime/gen/kernels/cl_internal_copy_image_2d_array_to_2d.cl
new file mode 100644
index 0000000..89e36c0
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_copy_image_2d_array_to_2d.cl
@@ -0,0 +1,21 @@
+kernel void __cl_copy_image_2d_array_to_2d(__read_only image2d_array_t src_image, __write_only image2d_t dst_image,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+                             unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int4 color;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+  int4 src_coord;
+  int2 dst_coord;
+  if((i >= region0) || (j>= region1))
+    return;
+  src_coord.x = src_origin0 + i;
+  src_coord.y = src_origin1 + j;
+  src_coord.z = src_origin2;
+  dst_coord.x = dst_origin0 + i;
+  dst_coord.y = dst_origin1 + j;
+  color = read_imagei(src_image, sampler, src_coord);
+  write_imagei(dst_image, dst_coord, color);
+}
diff --git a/runtime/gen/kernels/cl_internal_copy_image_2d_array_to_2d_array.cl b/runtime/gen/kernels/cl_internal_copy_image_2d_array_to_2d_array.cl
new file mode 100644
index 0000000..3653660
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_copy_image_2d_array_to_2d_array.cl
@@ -0,0 +1,23 @@
+kernel void __cl_copy_image_2d_array_to_2d_array(__read_only image2d_array_t src_image, __write_only image2d_array_t dst_image,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+                             unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int4 color;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+  int4 src_coord;
+  int4 dst_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  src_coord.x = src_origin0 + i;
+  src_coord.y = src_origin1 + j;
+  src_coord.z = src_origin2 + k;
+  dst_coord.x = dst_origin0 + i;
+  dst_coord.y = dst_origin1 + j;
+  dst_coord.z = dst_origin2 + k;
+  color = read_imagei(src_image, sampler, src_coord);
+  write_imagei(dst_image, dst_coord, color);
+}
diff --git a/runtime/gen/kernels/cl_internal_copy_image_2d_array_to_3d.cl b/runtime/gen/kernels/cl_internal_copy_image_2d_array_to_3d.cl
new file mode 100644
index 0000000..424f6b5
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_copy_image_2d_array_to_3d.cl
@@ -0,0 +1,23 @@
+kernel void __cl_copy_image_2d_array_to_3d(__read_only image2d_array_t src_image, __write_only image3d_t dst_image,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+                             unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int4 color;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+  int4 src_coord;
+  int4 dst_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  src_coord.x = src_origin0 + i;
+  src_coord.y = src_origin1 + j;
+  src_coord.z = src_origin2 + k;
+  dst_coord.x = dst_origin0 + i;
+  dst_coord.y = dst_origin1 + j;
+  dst_coord.z = dst_origin2 + k;
+  color = read_imagei(src_image, sampler, src_coord);
+  write_imagei(dst_image, dst_coord, color);
+}
diff --git a/runtime/gen/kernels/cl_internal_copy_image_2d_to_2d.cl b/runtime/gen/kernels/cl_internal_copy_image_2d_to_2d.cl
new file mode 100644
index 0000000..c5eaab1
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_copy_image_2d_to_2d.cl
@@ -0,0 +1,21 @@
+kernel void __cl_copy_image_2d_to_2d(__read_only image2d_t src_image, __write_only image2d_t dst_image,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+                             unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int4 color;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+  int2 src_coord;
+  int2 dst_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  src_coord.x = src_origin0 + i;
+  src_coord.y = src_origin1 + j;
+  dst_coord.x = dst_origin0 + i;
+  dst_coord.y = dst_origin1 + j;
+  color = read_imagei(src_image, sampler, src_coord);
+  write_imagei(dst_image, dst_coord, color);
+}
diff --git a/runtime/gen/kernels/cl_internal_copy_image_2d_to_2d_array.cl b/runtime/gen/kernels/cl_internal_copy_image_2d_to_2d_array.cl
new file mode 100644
index 0000000..4384f01
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_copy_image_2d_to_2d_array.cl
@@ -0,0 +1,21 @@
+kernel void __cl_copy_image_2d_to_2d_array(__read_only image2d_t src_image, __write_only image2d_array_t dst_image,
+                                          unsigned int region0, unsigned int region1, unsigned int region2,
+                                          unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+                                          unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int4 color;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+  int2 src_coord;
+  int4 dst_coord;
+  if((i >= region0) || (j>= region1))
+    return;
+  src_coord.x = src_origin0 + i;
+  src_coord.y = src_origin1 + j;
+  dst_coord.x = dst_origin0 + i;
+  dst_coord.y = dst_origin1 + j;
+  dst_coord.z = dst_origin2;
+  color = read_imagei(src_image, sampler, src_coord);
+  write_imagei(dst_image, dst_coord, color);
+}
diff --git a/runtime/gen/kernels/cl_internal_copy_image_2d_to_3d.cl b/runtime/gen/kernels/cl_internal_copy_image_2d_to_3d.cl
new file mode 100644
index 0000000..4c73a74
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_copy_image_2d_to_3d.cl
@@ -0,0 +1,22 @@
+kernel void __cl_copy_image_2d_to_3d(__read_only image2d_t src_image, __write_only image3d_t dst_image,
+                                         unsigned int region0, unsigned int region1, unsigned int region2,
+                                         unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+                                         unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int4 color;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+  int2 src_coord;
+  int4 dst_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  src_coord.x = src_origin0 + i;
+  src_coord.y = src_origin1 + j;
+  dst_coord.x = dst_origin0 + i;
+  dst_coord.y = dst_origin1 + j;
+  dst_coord.z = dst_origin2 + k;
+  color = read_imagei(src_image, sampler, src_coord);
+  write_imagei(dst_image, dst_coord, color);
+}
diff --git a/runtime/gen/kernels/cl_internal_copy_image_2d_to_buffer.cl b/runtime/gen/kernels/cl_internal_copy_image_2d_to_buffer.cl
new file mode 100644
index 0000000..b6c352e
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_copy_image_2d_to_buffer.cl
@@ -0,0 +1,19 @@
+kernel void __cl_copy_image_2d_to_buffer( __read_only image2d_t image, global uchar* buffer,
+                                        unsigned int region0, unsigned int region1, unsigned int region2,
+                                        unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+                                        unsigned int dst_offset)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  uint4 color;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+  int2 src_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  src_coord.x = src_origin0 + i;
+  src_coord.y = src_origin1 + j;
+  color = read_imageui(image, sampler, src_coord);
+  dst_offset += (k * region1 + j) * region0 + i;
+  buffer[dst_offset] = color.x;
+}
diff --git a/runtime/gen/kernels/cl_internal_copy_image_2d_to_buffer_align16.cl b/runtime/gen/kernels/cl_internal_copy_image_2d_to_buffer_align16.cl
new file mode 100644
index 0000000..a32e5f2
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_copy_image_2d_to_buffer_align16.cl
@@ -0,0 +1,19 @@
+kernel void __cl_copy_image_2d_to_buffer_align16( __read_only image2d_t image, global uint4* buffer,
+                                        unsigned int region0, unsigned int region1, unsigned int region2,
+                                        unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+                                        unsigned int dst_offset)
+{
+
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  if((i >= region0) || (j>= region1))
+    return;
+  uint4 color;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+  int2 src_coord;
+  src_coord.x = src_origin0 + i;
+  src_coord.y = src_origin1 + j;
+  color = read_imageui(image, sampler, src_coord);
+
+  *(buffer + dst_offset + region0*j + i) = color;
+}
diff --git a/runtime/gen/kernels/cl_internal_copy_image_3d_to_2d.cl b/runtime/gen/kernels/cl_internal_copy_image_3d_to_2d.cl
new file mode 100644
index 0000000..e0effa0
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_copy_image_3d_to_2d.cl
@@ -0,0 +1,22 @@
+kernel void __cl_copy_image_3d_to_2d(__read_only image3d_t src_image, __write_only image2d_t dst_image,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+                             unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int4 color;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+  int4 src_coord;
+  int2 dst_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  src_coord.x = src_origin0 + i;
+  src_coord.y = src_origin1 + j;
+  src_coord.z = src_origin2 + k;
+  dst_coord.x = dst_origin0 + i;
+  dst_coord.y = dst_origin1 + j;
+  color = read_imagei(src_image, sampler, src_coord);
+  write_imagei(dst_image, dst_coord, color);
+}
diff --git a/runtime/gen/kernels/cl_internal_copy_image_3d_to_2d_array.cl b/runtime/gen/kernels/cl_internal_copy_image_3d_to_2d_array.cl
new file mode 100644
index 0000000..8041a32
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_copy_image_3d_to_2d_array.cl
@@ -0,0 +1,23 @@
+kernel void __cl_copy_image_3d_to_2d_array(__read_only image3d_t src_image, __write_only image2d_array_t dst_image,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+                             unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int4 color;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+  int4 src_coord;
+  int4 dst_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  src_coord.x = src_origin0 + i;
+  src_coord.y = src_origin1 + j;
+  src_coord.z = src_origin2 + k;
+  dst_coord.x = dst_origin0 + i;
+  dst_coord.y = dst_origin1 + j;
+  dst_coord.z = dst_origin2 + k;
+  color = read_imagei(src_image, sampler, src_coord);
+  write_imagei(dst_image, dst_coord, color);
+}
diff --git a/runtime/gen/kernels/cl_internal_copy_image_3d_to_3d.cl b/runtime/gen/kernels/cl_internal_copy_image_3d_to_3d.cl
new file mode 100644
index 0000000..de80a0a
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_copy_image_3d_to_3d.cl
@@ -0,0 +1,23 @@
+kernel void __cl_copy_image_3d_to_3d(__read_only image3d_t src_image, __write_only image3d_t dst_image,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+                             unsigned int dst_origin0, unsigned int dst_origin1, unsigned int dst_origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int4 color;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+  int4 src_coord;
+  int4 dst_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  src_coord.x = src_origin0 + i;
+  src_coord.y = src_origin1 + j;
+  src_coord.z = src_origin2 + k;
+  dst_coord.x = dst_origin0 + i;
+  dst_coord.y = dst_origin1 + j;
+  dst_coord.z = dst_origin2 + k;
+  color = read_imagei(src_image, sampler, src_coord);
+  write_imagei(dst_image, dst_coord, color);
+}
diff --git a/runtime/gen/kernels/cl_internal_copy_image_3d_to_buffer.cl b/runtime/gen/kernels/cl_internal_copy_image_3d_to_buffer.cl
new file mode 100644
index 0000000..dcfc8a2
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_copy_image_3d_to_buffer.cl
@@ -0,0 +1,22 @@
+#define IMAGE_TYPE image3d_t
+#define COORD_TYPE int4
+kernel void __cl_copy_image_3d_to_buffer ( __read_only IMAGE_TYPE image, global uchar* buffer,
+                                        unsigned int region0, unsigned int region1, unsigned int region2,
+                                        unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2,
+                                        unsigned int dst_offset)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  uint4 color;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
+  COORD_TYPE src_coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  src_coord.x = src_origin0 + i;
+  src_coord.y = src_origin1 + j;
+  src_coord.z = src_origin2 + k;
+  color = read_imageui(image, sampler, src_coord);
+  dst_offset += (k * region1 + j) * region0 + i;
+  buffer[dst_offset] = color.x;
+}
diff --git a/runtime/gen/kernels/cl_internal_fill_buf_align128.cl b/runtime/gen/kernels/cl_internal_fill_buf_align128.cl
new file mode 100644
index 0000000..552820c
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_fill_buf_align128.cl
@@ -0,0 +1,9 @@
+kernel void __cl_fill_region_align128 ( global float16* dst, float16 pattern0,
+                                        unsigned int offset, unsigned int size, float16 pattern1)
+{
+    int i = get_global_id(0);
+    if (i < size) {
+        dst[i*2+offset] = pattern0;
+        dst[i*2+offset+1] = pattern1;
+    }
+}
diff --git a/runtime/gen/kernels/cl_internal_fill_buf_align2.cl b/runtime/gen/kernels/cl_internal_fill_buf_align2.cl
new file mode 100644
index 0000000..0b9a4cf
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_fill_buf_align2.cl
@@ -0,0 +1,8 @@
+kernel void __cl_fill_region_align2 ( global char2 * dst, char2 pattern,
+			             unsigned int offset, unsigned int size)
+{
+    int i = get_global_id(0);
+    if (i < size) {
+        dst[i+offset] = pattern;
+    }
+}
diff --git a/runtime/gen/kernels/cl_internal_fill_buf_align4.cl b/runtime/gen/kernels/cl_internal_fill_buf_align4.cl
new file mode 100644
index 0000000..aefd92f
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_fill_buf_align4.cl
@@ -0,0 +1,8 @@
+kernel void __cl_fill_region_align4 ( global float* dst, float pattern,
+			             unsigned int offset, unsigned int size)
+{
+    int i = get_global_id(0);
+    if (i < size) {
+        dst[i+offset] = pattern;
+    }
+}
diff --git a/runtime/gen/kernels/cl_internal_fill_buf_align8.cl b/runtime/gen/kernels/cl_internal_fill_buf_align8.cl
new file mode 100644
index 0000000..edaff77
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_fill_buf_align8.cl
@@ -0,0 +1,14 @@
+#define COMPILER_ABS_FUNC_N(N) \
+    kernel void __cl_fill_region_align8_##N ( global float##N* dst, float##N pattern, \
+                                              unsigned int offset, unsigned int size) { \
+         int i = get_global_id(0); \
+         if (i < size) { \
+             dst[i+offset] = pattern; \
+         }  \
+    }
+
+
+COMPILER_ABS_FUNC_N(2)
+COMPILER_ABS_FUNC_N(4)
+COMPILER_ABS_FUNC_N(8)
+COMPILER_ABS_FUNC_N(16)
diff --git a/runtime/gen/kernels/cl_internal_fill_buf_unalign.cl b/runtime/gen/kernels/cl_internal_fill_buf_unalign.cl
new file mode 100644
index 0000000..90762b0
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_fill_buf_unalign.cl
@@ -0,0 +1,8 @@
+kernel void __cl_fill_region_unalign ( global char * dst, char pattern,
+			               unsigned int offset, unsigned int size)
+{
+    int i = get_global_id(0);
+    if (i < size) {
+        dst[i+offset] = pattern;
+    }
+}
diff --git a/runtime/gen/kernels/cl_internal_fill_image_1d.cl b/runtime/gen/kernels/cl_internal_fill_image_1d.cl
new file mode 100644
index 0000000..b3b0cbf
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_fill_image_1d.cl
@@ -0,0 +1,14 @@
+kernel void __cl_fill_image_1d( __write_only image1d_t image, float4 pattern,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int origin0, unsigned int origin1, unsigned int origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  coord = origin0 + i;
+  write_imagef(image, coord, pattern);
+
+}
diff --git a/runtime/gen/kernels/cl_internal_fill_image_1d_array.cl b/runtime/gen/kernels/cl_internal_fill_image_1d_array.cl
new file mode 100644
index 0000000..f1eb241
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_fill_image_1d_array.cl
@@ -0,0 +1,15 @@
+kernel void __cl_fill_image_1d_array( __write_only image1d_array_t image, float4 pattern,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int origin0, unsigned int origin1, unsigned int origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int2 coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  coord.x = origin0 + i;
+  coord.y = origin2 + k;
+  write_imagef(image, coord, pattern);
+
+}
diff --git a/runtime/gen/kernels/cl_internal_fill_image_2d.cl b/runtime/gen/kernels/cl_internal_fill_image_2d.cl
new file mode 100644
index 0000000..0e29f3e
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_fill_image_2d.cl
@@ -0,0 +1,15 @@
+kernel void __cl_fill_image_2d( __write_only image2d_t image, float4 pattern,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int origin0, unsigned int origin1, unsigned int origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int2 coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  coord.x = origin0 + i;
+  coord.y = origin1 + j;
+  write_imagef(image, coord, pattern);
+
+}
diff --git a/runtime/gen/kernels/cl_internal_fill_image_2d_array.cl b/runtime/gen/kernels/cl_internal_fill_image_2d_array.cl
new file mode 100644
index 0000000..f29c9e7
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_fill_image_2d_array.cl
@@ -0,0 +1,16 @@
+kernel void __cl_fill_image_2d_array( __write_only image2d_array_t image, float4 pattern,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int origin0, unsigned int origin1, unsigned int origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int4 coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  coord.x = origin0 + i;
+  coord.y = origin1 + j;
+  coord.z = origin2 + k;
+  write_imagef(image, coord, pattern);
+
+}
diff --git a/runtime/gen/kernels/cl_internal_fill_image_3d.cl b/runtime/gen/kernels/cl_internal_fill_image_3d.cl
new file mode 100644
index 0000000..042b8ab
--- /dev/null
+++ b/runtime/gen/kernels/cl_internal_fill_image_3d.cl
@@ -0,0 +1,16 @@
+kernel void __cl_fill_image_3d( __write_only image3d_t image, float4 pattern,
+                             unsigned int region0, unsigned int region1, unsigned int region2,
+                             unsigned int origin0, unsigned int origin1, unsigned int origin2)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  int4 coord;
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  coord.x = origin0 + i;
+  coord.y = origin1 + j;
+  coord.z = origin2 + k;
+  write_imagef(image, coord, pattern);
+
+}
-- 
2.7.4





More information about the Beignet mailing list