[Beignet] [PATCH] Full support of cl_intel_motion_estimation extension.
Yang, Rong R
rong.r.yang at intel.com
Mon Nov 16 19:31:10 PST 2015
LGTM, pushed, thanks.
> -----Original Message-----
> From: Beignet [mailto:beignet-bounces at lists.freedesktop.org] On Behalf Of
> Chuanbo Weng
> Sent: Wednesday, November 11, 2015 17:22
> To: beignet at lists.freedesktop.org
> Cc: Weng, Chuanbo
> Subject: [Beignet] [PATCH] Full support of cl_intel_motion_estimation
> extension.
>
> The following items are supported in this commit:
> 1. Return residuals.
> 2. All types of mb_block_type, subpixel_mode, sad_adjust_mode in
> cl_motion_estimation_desc_intel.
> After this commit, cl_intel_motion_estimation is fully supported.
>
> Signed-off-by: Chuanbo Weng <chuanbo.weng at intel.com>
> ---
> docs/howto/video-motion-estimation-howto.mdwn | 26 +--
> .../cl_internal_block_motion_estimate_intel.cl | 199
> +++++++++++++++++----
> 2 files changed, 175 insertions(+), 50 deletions(-)
>
> diff --git a/docs/howto/video-motion-estimation-howto.mdwn
> b/docs/howto/video-motion-estimation-howto.mdwn
> index d9edc9b..8deaa61 100644
> --- a/docs/howto/video-motion-estimation-howto.mdwn
> +++ b/docs/howto/video-motion-estimation-howto.mdwn
> @@ -1,21 +1,15 @@
> Video Motion Vector HowTo
> ==========================
>
> -Beignet now supports cl_intel_accelerator and part of
> cl_intel_motion_estimation, which -are Khronos official extensions. It
> provides a hardware acceleration of video motion
> +Beignet now supports cl_intel_accelerator and
> +cl_intel_motion_estimation, which are Khronos official extensions. It
> +provides a hardware acceleration of video motion
> vector to users.
>
> -Supported hardware platform and limitation
> -------------------------------------------
> +Supported hardware platform
> +---------------------------
>
> -Only 3rd Generation Intel Core Processors is supported for vme now. And
> now we just -implement this part of cl_intel_motion_estimation for motion
> vector computation(residuals -can not be returned yet) on 3rd Generation
> Intel Core Processors:
> - mb_block_type = CL_ME_MB_TYPE_16x16_INTEL
> - subpixel_mode = CL_ME_SUBPIXEL_MODE_INTEGER_INTEL
> - search_path_type = CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL /
> CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL
> - / CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL
> -We will fully support cl_intel_motion_estimation in the future.
> +Only 3rd Generation Intel Core Processors is supported for vme now. We
> +will consider to support more platforms if necessary.
>
> Steps
> -----
> @@ -23,15 +17,13 @@ Steps
> In order to use video motion estimation provided by Beignet in your
> program, please follow the steps as below:
>
> -- Create a cl_accelerator_intel object using extension API
> clCreateAcceleratorINTEL, with
> - the following parameters:
> +- Create a cl_accelerator_intel object using extension API
> +clCreateAcceleratorINTEL, like
> + this:
> _accelerator_type_intel accelerator_type =
> CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL;
> cl_motion_estimation_desc_intel vmedesc =
> {CL_ME_MB_TYPE_16x16_INTEL,
> CL_ME_SUBPIXEL_MODE_INTEGER_INTEL,
> CL_ME_SAD_ADJUST_MODE_NONE_INTEL,
> - CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL(
> - or CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL
> - or CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL)
> +
> + CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL
> };
>
> - Invoke clCreateProgramWithBuiltInKernels to create a program object with
> built-in kernels diff --git
> a/src/kernels/cl_internal_block_motion_estimate_intel.cl
> b/src/kernels/cl_internal_block_motion_estimate_intel.cl
> index 5a22338..1f28f4e 100644
> --- a/src/kernels/cl_internal_block_motion_estimate_intel.cl
> +++ b/src/kernels/cl_internal_block_motion_estimate_intel.cl
> @@ -59,23 +59,28 @@ void
> block_motion_estimate_intel(accelerator_intel_t accel,
> int lgid_x = get_group_id(0);
> int lgid_y = get_group_id(1);
>
> + int num_groups_x = get_num_groups(0); int index = lgid_y *
> + num_groups_x + lgid_x;
> +
> uint2 srcCoord = 0;
> + short2 predict_mv = 0;
> + if(prediction_motion_vector_buffer != NULL){
> + predict_mv = prediction_motion_vector_buffer[index];
> + predict_mv.x = predict_mv.x / 4;
> + predict_mv.y = predict_mv.y / 4;
> + }
>
> srcCoord.x = lgid_x * 16;
> srcCoord.y = lgid_y * 16;
>
> - //TODO: This line of code is just to workaround a curbe related bug caused
> by commit 061d214a6fc2876a0e24e094f87f2a172984bc23
> - //After fix, this line should be removed.
> - src_grf0_dw5 = accel.mb_block_type;
> -
> //CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL
> if(accel.search_path_type == 0x0){
> - //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8)
> | (Dispatch_Id?);
> + //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored
> + << 8) | (Dispatch_Id);
> src_grf0_dw5 = (20 << 24) | (20 << 16) | (0 << 8) | (0);
> //src_grf0_dw1 = (Ref1Y << 16) | (Ref1X);
> - src_grf0_dw1 = 0xfffefffe;
> + src_grf0_dw1 = ((-2 + predict_mv.y) << 16 ) | ((-2 + predict_mv.x) &
> 0x0000ffff);
> //src_grf0_dw0 = (Ref0Y << 16) | (Ref0X);
> - src_grf0_dw0 = 0xfffefffe;
> + src_grf0_dw0 = ((-2 + predict_mv.y) << 16 ) | ((-2 + predict_mv.x) &
> 0x0000ffff);
> //src_grf1_dw2 = (Start1Y << 28) | (Start1X << 24) |
> (Start0Y << 20)
> src_grf1_dw2 = (0 << 28) | (0 << 24) | (0 << 20)
> //| (Start0X << 16) | (Max_Num_SU << 8) | (LenSP);
> @@ -84,35 +89,59 @@ void
> block_motion_estimate_intel(accelerator_intel_t accel,
> //CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL
> else if(accel.search_path_type == 0x1){
> src_grf0_dw5 = (24 << 24) | (24 << 16) | (0 << 8) | (0);
> - src_grf0_dw1 = 0xfffcfffc;
> - src_grf0_dw0 = 0xfffcfffc;
> + src_grf0_dw1 = ((-4 + predict_mv.y) << 16 ) | ((-4 + predict_mv.x) &
> 0x0000ffff);
> + src_grf0_dw0 = ((-4 + predict_mv.y) << 16 ) | ((-4 + predict_mv.x) &
> 0x0000ffff);
> src_grf1_dw2 = (0 << 28) | (0 << 24) | (0 << 20)
> | (0 << 16) | (48 << 8) | (48);
> }
> //CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL
> else if(accel.search_path_type == 0x5){
> src_grf0_dw5 = (40 << 24) | (48 << 16) | (0 << 8) | (0);
> - src_grf0_dw1 = 0xfff4fff0;
> - src_grf0_dw0 = 0xfff4fff0;
> + src_grf0_dw1 = ((-12 + predict_mv.y) << 16 ) | ((-16 + predict_mv.x) &
> 0x0000ffff);
> + src_grf0_dw0 = ((-12 + predict_mv.y) << 16 ) | ((-16 + + predict_mv.x) &
> 0x0000ffff);
> src_grf1_dw2 = (0 << 28) | (0 << 24) | (0 << 20)
> | (0 << 16) | (48 << 8) | (48);
> }
>
> - //src_grf0_dw7 = Debug;
> - src_grf0_dw7 = 0;
> - //src_grf0_dw6 = Debug;
> - src_grf0_dw6 = 0;
> - //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) |
> (Dispatch_Id?);
> - //src_grf0_dw4 = Ignored;
> - src_grf0_dw4 = 0;
> - //src_grf0_dw3 = (Reserved << 31) | (Sub_Mb_Part_Mask << 24)
> | (Intra_SAD << 22)
> - src_grf0_dw3 = (0 << 31) | (0x7e << 24) | (0 << 22)
> + /*Deal with mb_block_type & sad_adjust_mode & subpixel_mode*/
> uchar
> + sub_mb_part_mask = 0; //CL_ME_MB_TYPE_16x16_INTEL
> + if(accel.mb_block_type == 0x0)
> + sub_mb_part_mask = 0x7e;
> + //CL_ME_MB_TYPE_8x8_INTEL
> + else if(accel.mb_block_type == 0x1)
> + sub_mb_part_mask = 0x77;
> + //CL_ME_MB_TYPE_4x4_INTEL
> + else if(accel.mb_block_type == 0x2)
> + sub_mb_part_mask = 0x3f;
> +
> + uchar inter_sad = 0;
> + //CL_ME_SAD_ADJUST_MODE_NONE_INTEL
> + if(accel.sad_adjust_mode == 0x0)
> + inter_sad = 0;
> + //CL_ME_SAD_ADJUST_MODE_HAAR_INTEL
> + else if(accel.sad_adjust_mode == 0x1)
> + inter_sad = 2;
> +
> + uchar sub_pel_mode = 0;
> + //CL_ME_SUBPIXEL_MODE_INTEGER_INTEL
> + if(accel.subpixel_mode == 0x0)
> + sub_pel_mode = 0;
> + //CL_ME_SUBPIXEL_MODE_HPEL_INTEL
> + else if(accel.subpixel_mode == 0x1)
> + sub_pel_mode = 1;
> + //CL_ME_SUBPIXEL_MODE_QPEL_INTEL
> + else if(accel.subpixel_mode == 0x2)
> + sub_pel_mode = 3;
> +
> + //src_grf0_dw3 = (Reserved << 31) | (Sub_Mb_Part_Mask << 24)
> | (Intra_SAD << 22)
> + src_grf0_dw3 = (0 << 31) | (sub_mb_part_mask << 24) | (0
> << 22)
> //| (Inter_SAD << 20) | (BB_Skip_Enabled << 19) |
> (Reserverd << 18)
> - | (0 << 20) | (0 << 19) | (0 << 18)
> + | (inter_sad << 20) | (0 << 19) | (0 << 18)
> //| (Dis_Aligned_Src_Fetch << 17) | (Dis_Aligned_Ref_Fetch << 16)
> | (Dis_Field_Cache_Alloc << 15)
> | (0 << 17) | (0 << 16) | (0 << 15)
> //| (Skip_Type << 14) | (Sub_Pel_Mode << 12) |
> (Dual_Search_Path_Opt << 11)
> - | (0 << 14) | (0 << 12) | (0 << 11)
> + | (0 << 14) | (sub_pel_mode << 12) | (0 << 11)
> //| (Search_Ctrl << 8) | (Ref_Access << 7) | (SrcAccess
> << 6)
> | (0 << 8) | (0 << 7) | (0 << 6)
> //| (Mb_Type_Remap << 4) | (Reserved_Workaround << 3) |
> (Reserved_Workaround << 2)
> @@ -120,6 +149,15 @@ void
> block_motion_estimate_intel(accelerator_intel_t accel,
> //| (Src_Size);
> | (0);
>
> +
> + //src_grf0_dw7 = Debug;
> + src_grf0_dw7 = 0;
> + //src_grf0_dw6 = Debug;
> + src_grf0_dw6 = 0;
> + //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored <<
> + 8) | (Dispatch_Id?);
> + //src_grf0_dw4 = Ignored;
> + src_grf0_dw4 = 0;
> +
> //src_grf0_dw2 = (SrcY << 16) | (SrcX);
> src_grf0_dw2 = (srcCoord.y << 16) | (srcCoord.x);
> //src_grf0_dw1 = (Ref1Y << 16) | (Ref1X); @@ -142,7 +180,8 @@ void
> block_motion_estimate_intel(accelerator_intel_t accel,
> /*src_grf1_dw1 = (RepartEn << 31) | (FBPrunEn << 30) |
> (AdaptiveValidationControl << 29)
> | (Uni_Mix_Disable << 28) | (Bi_Sub_Mb_Part_Mask << 24) |
> (Reserverd << 22)
> | (Bi_Weight << 16) | (Reserved << 6) |
> (MaxNumMVs);*/
> - src_grf1_dw1 = (0 << 24) | (2);
> + //src_grf1_dw1 = (0 << 24) | (2);
> + src_grf1_dw1 = (0 << 24) | (16);
> /*src_grf1_dw0 = (Early_Ime_Stop << 24) | (Early_Fme_Success << 16)
> | (Skip_Success << 8)
> | (T8x8_Flag_For_Inter_En << 7) | (Quit_Inter_En << 6) |
> (Early_Ime_Success_En << 5)
> | (Early_Success_En << 4) | (Part_Candidate_En << 3) |
> (Bi_Mix_Dis << 2)
> @@ -201,6 +240,8 @@ void
> block_motion_estimate_intel(accelerator_intel_t accel,
> src_grf4_dw1 = 0;
> src_grf4_dw0 = 0;
>
> + int lid_x = get_local_id(0);
> +
> vme_result = __gen_ocl_vme(src_image, ref_image,
> src_grf0_dw7, src_grf0_dw6, src_grf0_dw5, src_grf0_dw4,
> src_grf0_dw3, src_grf0_dw2, src_grf0_dw1, src_grf0_dw0, @@ -
> 217,17 +258,109 @@ void block_motion_estimate_intel(accelerator_intel_t
> accel,
>
> barrier(CLK_LOCAL_MEM_FENCE);
>
> - int lid_x = get_local_id(0);
> + short2 mv[16];
> + ushort res[16];
> +
> + uint write_back_dwx;
> uint simd_width = get_sub_group_size();
> - uint write_back_grf1_dw0;
> - if(simd_width == 8)
> - write_back_grf1_dw0 = __gen_ocl_region(0, vme_result.s1);
> - else if(simd_width == 16)
> - write_back_grf1_dw0 = __gen_ocl_region(8, vme_result.s0);
> - short2 val = as_short2( write_back_grf1_dw0 );
> - int index = lgid_y * get_num_groups(0) + lgid_x;
> - if( lid_x == 0 ){
> - motion_vector_buffer[index] = val;
> +
> + /* In simd 8 mode, one kernel variable 'uint' map to 8 dword.
> + * In simd 16 mode, one kernel variable 'uint' map to 16 dword.
> + * That's why we should treat simd8 and simd16 differently when
> + * use __gen_ocl_region.
> + * */
> + if(simd_width == 8){
> + write_back_dwx = __gen_ocl_region(0, vme_result.s1);
> + mv[0] = as_short2( write_back_dwx );
> +
> + if(accel.mb_block_type > 0x0){
> + for(int i = 2, j = 1; j < 4; i += 2, j++){
> + write_back_dwx = __gen_ocl_region(i, vme_result.s1);
> + mv[j] = as_short2( write_back_dwx );
> + }
> + if(accel.mb_block_type > 0x1){
> + for(int i = 0, j = 4; j < 8; i += 2, j++){
> + write_back_dwx = __gen_ocl_region(i, vme_result.s2);
> + mv[j] = as_short2( write_back_dwx );
> + }
> + for(int i = 0, j = 8; j < 12; i += 2, j++){
> + write_back_dwx = __gen_ocl_region(i, vme_result.s3);
> + mv[j] = as_short2( write_back_dwx );
> + }
> + for(int i = 0, j = 12; j < 16; i += 2, j++){
> + write_back_dwx = __gen_ocl_region(i, vme_result.s4);
> + mv[j] = as_short2( write_back_dwx );
> + }
> + }
> + }
> + ushort2 temp_res;
> + for(int i = 0; i < 8; i++){
> + write_back_dwx = __gen_ocl_region(i, vme_result.s5);
> + temp_res = as_ushort2(write_back_dwx);
> + res[i*2] = temp_res.s0;
> + res[i*2+1] = temp_res.s1;
> + }
> + }
> + else if(simd_width == 16){
> + write_back_dwx = __gen_ocl_region(0 + 8, vme_result.s0);
> + mv[0] = as_short2( write_back_dwx );
> +
> + if(accel.mb_block_type > 0x0){
> + for(int i = 2, j = 1; j < 4; i += 2, j++){
> + write_back_dwx = __gen_ocl_region(i + 8, vme_result.s0);
> + mv[j] = as_short2( write_back_dwx );
> + }
> + if(accel.mb_block_type > 0x1){
> + for(int i = 0, j = 4; j < 8; i += 2, j++){
> + write_back_dwx = __gen_ocl_region(i, vme_result.s1);
> + mv[j] = as_short2( write_back_dwx );
> + }
> + for(int i = 0, j = 8; j < 12; i += 2, j++){
> + write_back_dwx = __gen_ocl_region(i + 8, vme_result.s1);
> + mv[j] = as_short2( write_back_dwx );
> + }
> + for(int i = 0, j = 12; j < 16; i += 2, j++){
> + write_back_dwx = __gen_ocl_region(i, vme_result.s2);
> + mv[j] = as_short2( write_back_dwx );
> + }
> + }
> + }
> + ushort2 temp_res;
> + for(int i = 0; i < 8; i++){
> + write_back_dwx = __gen_ocl_region(i + 8, vme_result.s2);
> + temp_res = as_ushort2(write_back_dwx);
> + res[i*2] = temp_res.s0;
> + res[i*2+1] = temp_res.s1;
> + }
> + }
> +
> + int mv_index;
> +
> + //CL_ME_MB_TYPE_16x16_INTEL
> + if(accel.mb_block_type == 0x0){
> + mv_index = index * 1;
> + if( lid_x == 0 ){
> + motion_vector_buffer[mv_index] = mv[lid_x];
> + residuals[mv_index] = 2 * res[lid_x];
> + }
> + }
> + //CL_ME_MB_TYPE_8x8_INTEL
> + else if(accel.mb_block_type == 0x1){
> + if(lid_x < 4){
> + mv_index = lgid_y * num_groups_x * 4 + lgid_x * 2;
> + mv_index = mv_index + num_groups_x * 2 * (lid_x / 2) + (lid_x % 2);
> + motion_vector_buffer[mv_index] = mv[lid_x];
> + residuals[mv_index] = 2 * res[lid_x];
> + }
> + }
> + //CL_ME_MB_TYPE_4x4_INTEL
> + else if(accel.mb_block_type == 0x2){
> + if(lid_x < 16){
> + mv_index = lgid_y * num_groups_x * 16 + lgid_x * 4;
> + mv_index = mv_index + num_groups_x * 4 * (lid_x / 4) + (lid_x % 4);
> + motion_vector_buffer[mv_index] = mv[lid_x];
> + residuals[mv_index] = 2 * res[lid_x];
> + }
> }
>
> }
> --
> 1.9.1
>
> _______________________________________________
> Beignet mailing list
> Beignet at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
More information about the Beignet
mailing list