[Libva] [PATCH 1/5] VME pipeline for HEVC

Qu,Pengfei Pengfei.Qu at intel.com
Mon Jan 5 19:33:26 PST 2015


Signed-off-by: Qu,Pengfei <Pengfei.Qu at intel.com>
---
 src/gen6_mfc_common.c | 688 ++++++++++++++++++++++++-------------
 src/gen6_vme.h        |  19 ++
 src/gen9_vme.c        | 928 ++++++++++++++++++++++++++++++++++++++------------
 3 files changed, 1180 insertions(+), 455 deletions(-)

diff --git a/src/gen6_mfc_common.c b/src/gen6_mfc_common.c
index fe41dac..a69f00a 100644
--- a/src/gen6_mfc_common.c
+++ b/src/gen6_mfc_common.c
@@ -631,8 +631,8 @@ VAStatus intel_mfc_avc_prepare(VADriverContextP ctx,
     int height_in_mbs = pSequenceParameter->picture_height_in_mbs;
 
     if (IS_GEN6(i965->intel.device_info)) {
-	/* On the SNB it should be fixed to 128 for the DMV buffer */
-	width_in_mbs = 128;
+        /* On the SNB it should be fixed to 128 for the DMV buffer */
+        width_in_mbs = 128;
     }
 
     for (j = 0; j < encode_state->num_slice_params_ext && enable_avc_ildb == 0; j++) {
@@ -735,7 +735,7 @@ VAStatus intel_mfc_avc_prepare(VADriverContextP ctx,
             break;
         }
     }
-	
+
     mfc_context->uncompressed_picture_source.bo = encode_state->input_yuv_object->bo;
     dri_bo_reference(mfc_context->uncompressed_picture_source.bo);
 
@@ -807,7 +807,7 @@ int intel_format_lutvalue(int value, int max)
     if (temp1 > temp2)
         ret = max;
     return ret;
-	
+
 }
 
 
@@ -842,40 +842,40 @@ void intel_vme_update_mbmv_cost(VADriverContextP ctx,
 
     
     if (encoder_context->rate_control_mode == VA_RC_CQP)
-	qp = pic_param->pic_init_qp + slice_param->slice_qp_delta;
+        qp = pic_param->pic_init_qp + slice_param->slice_qp_delta;
     else
-	qp = mfc_context->bit_rate_control_context[slice_type].QpPrimeY;
-  
+        qp = mfc_context->bit_rate_control_context[slice_type].QpPrimeY;
+
     if (vme_state_message == NULL)
-	return;
- 
+        return;
+
     assert(qp <= QP_MAX); 
     lambda = intel_lambda_qp(qp);
     if (slice_type == SLICE_TYPE_I) {
-	vme_state_message[MODE_INTRA_16X16] = 0;
-	m_cost = lambda * 4;
-	vme_state_message[MODE_INTRA_8X8] = intel_format_lutvalue(m_cost, 0x8f);
-	m_cost = lambda * 16; 
-	vme_state_message[MODE_INTRA_4X4] = intel_format_lutvalue(m_cost, 0x8f);
-	m_cost = lambda * 3;
-	vme_state_message[MODE_INTRA_NONPRED] = intel_format_lutvalue(m_cost, 0x6f);
+        vme_state_message[MODE_INTRA_16X16] = 0;
+        m_cost = lambda * 4;
+        vme_state_message[MODE_INTRA_8X8] = intel_format_lutvalue(m_cost, 0x8f);
+        m_cost = lambda * 16; 
+        vme_state_message[MODE_INTRA_4X4] = intel_format_lutvalue(m_cost, 0x8f);
+        m_cost = lambda * 3;
+        vme_state_message[MODE_INTRA_NONPRED] = intel_format_lutvalue(m_cost, 0x6f);
     } else {
-    	m_cost = 0;
-	vme_state_message[MODE_INTER_MV0] = intel_format_lutvalue(m_cost, 0x6f);
-	for (j = 1; j < 3; j++) {
+        m_cost = 0;
+        vme_state_message[MODE_INTER_MV0] = intel_format_lutvalue(m_cost, 0x6f);
+        for (j = 1; j < 3; j++) {
             m_costf = (log2f((float)(j + 1)) + 1.718f) * lambda;
             m_cost = (int)m_costf;
             vme_state_message[MODE_INTER_MV0 + j] = intel_format_lutvalue(m_cost, 0x6f);
-   	}
-    	mv_count = 3;
-    	for (j = 4; j <= 64; j *= 2) {
+        }
+        mv_count = 3;
+        for (j = 4; j <= 64; j *= 2) {
             m_costf = (log2f((float)(j + 1)) + 1.718f) * lambda;
             m_cost = (int)m_costf;
             vme_state_message[MODE_INTER_MV0 + mv_count] = intel_format_lutvalue(m_cost, 0x6f);
             mv_count++;
-	}
+        }
 
-	if (qp <= 25) {
+        if (qp <= 25) {
             vme_state_message[MODE_INTRA_16X16] = 0x4a;
             vme_state_message[MODE_INTRA_8X8] = 0x4a;
             vme_state_message[MODE_INTRA_4X4] = 0x4a;
@@ -887,17 +887,17 @@ void intel_vme_update_mbmv_cost(VADriverContextP ctx,
             vme_state_message[MODE_INTER_4X4] = 0x4a;
             vme_state_message[MODE_INTER_BWD] = 0x2a;
             return;
-	}
-	m_costf = lambda * 10;
-	vme_state_message[MODE_INTRA_16X16] = intel_format_lutvalue(m_cost, 0x8f);
-	m_cost = lambda * 14;
-	vme_state_message[MODE_INTRA_8X8] = intel_format_lutvalue(m_cost, 0x8f);
-	m_cost = lambda * 24; 
-	vme_state_message[MODE_INTRA_4X4] = intel_format_lutvalue(m_cost, 0x8f);
-	m_costf = lambda * 3.5;
-	m_cost = m_costf;
-	vme_state_message[MODE_INTRA_NONPRED] = intel_format_lutvalue(m_cost, 0x6f);
-    	if (slice_type == SLICE_TYPE_P) {
+        }
+        m_costf = lambda * 10;
+        vme_state_message[MODE_INTRA_16X16] = intel_format_lutvalue(m_cost, 0x8f);
+        m_cost = lambda * 14;
+        vme_state_message[MODE_INTRA_8X8] = intel_format_lutvalue(m_cost, 0x8f);
+        m_cost = lambda * 24; 
+        vme_state_message[MODE_INTRA_4X4] = intel_format_lutvalue(m_cost, 0x8f);
+        m_costf = lambda * 3.5;
+        m_cost = m_costf;
+        vme_state_message[MODE_INTRA_NONPRED] = intel_format_lutvalue(m_cost, 0x6f);
+        if (slice_type == SLICE_TYPE_P) {
             m_costf = lambda * 2.5;
             m_cost = m_costf;
             vme_state_message[MODE_INTER_16X16] = intel_format_lutvalue(m_cost, 0x8f);
@@ -915,7 +915,7 @@ void intel_vme_update_mbmv_cost(VADriverContextP ctx,
             vme_state_message[MODE_INTER_4X4] = intel_format_lutvalue(m_cost, 0x6f);
             /* BWD is not used in P-frame */
             vme_state_message[MODE_INTER_BWD] = 0;
-	} else {
+        } else {
             m_costf = lambda * 2.5;
             m_cost = m_costf;
             vme_state_message[MODE_INTER_16X16] = intel_format_lutvalue(m_cost, 0x8f);
@@ -934,7 +934,7 @@ void intel_vme_update_mbmv_cost(VADriverContextP ctx,
             m_costf = lambda * 1.5;
             m_cost = m_costf;
             vme_state_message[MODE_INTER_BWD] = intel_format_lutvalue(m_cost, 0x6f);
-	}
+        }
     }
 }
 
@@ -961,7 +961,7 @@ gen7_vme_scoreboard_init(VADriverContextP ctx, struct gen6_vme_context *vme_cont
     vme_context->gpe_context.vfe_desc6.scoreboard1.delta_y1 = -1;
     vme_context->gpe_context.vfe_desc6.scoreboard1.delta_x2 = 1;
     vme_context->gpe_context.vfe_desc6.scoreboard1.delta_y2 = -1;
-	
+
     vme_context->gpe_context.vfe_desc7.dword = 0;
     return;
 }
@@ -974,7 +974,7 @@ static inline int loop_in_bounds(int x_index, int y_index, int first_mb, int num
         return -1;
     if (y_index < 0 || y_index >= mb_height)
         return -1;
-	
+
     mb_index = y_index * mb_width + x_index;
     if (mb_index < first_mb || mb_index > (first_mb + num_mb))
         return -1;
@@ -1000,103 +1000,103 @@ gen7_vme_walker_fill_vme_batchbuffer(VADriverContextP ctx,
     command_ptr = vme_context->vme_batchbuffer.bo->virtual;
 
     for (s = 0; s < encode_state->num_slice_params_ext; s++) {
-	VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[s]->buffer;
-	int first_mb = pSliceParameter->macroblock_address;
-	int num_mb = pSliceParameter->num_macroblocks;
-	unsigned int mb_intra_ub, score_dep;
-	int x_outer, y_outer, x_inner, y_inner;
-	int xtemp_outer = 0;
-
-	x_outer = first_mb % mb_width;
-	y_outer = first_mb / mb_width;
-	mb_row = y_outer;
-				 
-	for (; x_outer < (mb_width -2 ) && !loop_in_bounds(x_outer, y_outer, first_mb, num_mb, mb_width, mb_height); ) {
-	    x_inner = x_outer;
-	    y_inner = y_outer;
-	    for (; !loop_in_bounds(x_inner, y_inner, first_mb, num_mb, mb_width, mb_height);) {
-		mb_intra_ub = 0;
-		score_dep = 0;
-		if (x_inner != 0) {
-		    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
-		    score_dep |= MB_SCOREBOARD_A; 
-		}
-		if (y_inner != mb_row) {
-		    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
-		    score_dep |= MB_SCOREBOARD_B;
-		    if (x_inner != 0)
-			mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
-		    if (x_inner != (mb_width -1)) {
-			mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
-			score_dep |= MB_SCOREBOARD_C;
+        VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[s]->buffer;
+        int first_mb = pSliceParameter->macroblock_address;
+        int num_mb = pSliceParameter->num_macroblocks;
+        unsigned int mb_intra_ub, score_dep;
+        int x_outer, y_outer, x_inner, y_inner;
+        int xtemp_outer = 0;
+
+        x_outer = first_mb % mb_width;
+        y_outer = first_mb / mb_width;
+        mb_row = y_outer;
+
+        for (; x_outer < (mb_width -2 ) && !loop_in_bounds(x_outer, y_outer, first_mb, num_mb, mb_width, mb_height); ) {
+            x_inner = x_outer;
+            y_inner = y_outer;
+            for (; !loop_in_bounds(x_inner, y_inner, first_mb, num_mb, mb_width, mb_height);) {
+                mb_intra_ub = 0;
+                score_dep = 0;
+                if (x_inner != 0) {
+                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
+                    score_dep |= MB_SCOREBOARD_A; 
+                }
+                if (y_inner != mb_row) {
+                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
+                    score_dep |= MB_SCOREBOARD_B;
+                    if (x_inner != 0)
+                        mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
+                    if (x_inner != (mb_width -1)) {
+                        mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
+                        score_dep |= MB_SCOREBOARD_C;
                     }
-		}
-							
-            	*command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
-		*command_ptr++ = kernel;
-		*command_ptr++ = USE_SCOREBOARD;
-		/* Indirect data */
-		*command_ptr++ = 0;
-		/* the (X, Y) term of scoreboard */
-		*command_ptr++ = ((y_inner << 16) | x_inner);
-		*command_ptr++ = score_dep;
-		/*inline data */
-		*command_ptr++ = (mb_width << 16 | y_inner << 8 | x_inner);
-		*command_ptr++ = ((1 << 18) | (1 << 16) | transform_8x8_mode_flag | (mb_intra_ub << 8));
-		x_inner -= 2;
-		y_inner += 1;
-	    }
-	    x_outer += 1;
-	}
-
-	xtemp_outer = mb_width - 2;
-	if (xtemp_outer < 0)
+                }
+
+                *command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
+                *command_ptr++ = kernel;
+                *command_ptr++ = USE_SCOREBOARD;
+                /* Indirect data */
+                *command_ptr++ = 0;
+                /* the (X, Y) term of scoreboard */
+                *command_ptr++ = ((y_inner << 16) | x_inner);
+                *command_ptr++ = score_dep;
+                /*inline data */
+                *command_ptr++ = (mb_width << 16 | y_inner << 8 | x_inner);
+                *command_ptr++ = ((1 << 18) | (1 << 16) | transform_8x8_mode_flag | (mb_intra_ub << 8));
+                x_inner -= 2;
+                y_inner += 1;
+            }
+            x_outer += 1;
+        }
+
+        xtemp_outer = mb_width - 2;
+        if (xtemp_outer < 0)
             xtemp_outer = 0;
-	x_outer = xtemp_outer;
-	y_outer = first_mb / mb_width;
-	for (;!loop_in_bounds(x_outer, y_outer, first_mb, num_mb, mb_width, mb_height); ) { 
-	    y_inner = y_outer;
-	    x_inner = x_outer;
-	    for (; !loop_in_bounds(x_inner, y_inner, first_mb, num_mb, mb_width, mb_height);) {
-	    	mb_intra_ub = 0;
-		score_dep = 0;
-		if (x_inner != 0) {
-		    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
-		    score_dep |= MB_SCOREBOARD_A; 
-		}
-		if (y_inner != mb_row) {
-		    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
-		    score_dep |= MB_SCOREBOARD_B;
-		    if (x_inner != 0)
-			mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
-
-		    if (x_inner != (mb_width -1)) {
-			mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
-			score_dep |= MB_SCOREBOARD_C;
+        x_outer = xtemp_outer;
+        y_outer = first_mb / mb_width;
+        for (;!loop_in_bounds(x_outer, y_outer, first_mb, num_mb, mb_width, mb_height); ) { 
+            y_inner = y_outer;
+            x_inner = x_outer;
+            for (; !loop_in_bounds(x_inner, y_inner, first_mb, num_mb, mb_width, mb_height);) {
+                mb_intra_ub = 0;
+                score_dep = 0;
+                if (x_inner != 0) {
+                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
+                    score_dep |= MB_SCOREBOARD_A; 
+                }
+                if (y_inner != mb_row) {
+                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
+                    score_dep |= MB_SCOREBOARD_B;
+                    if (x_inner != 0)
+                        mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
+
+                    if (x_inner != (mb_width -1)) {
+                        mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
+                        score_dep |= MB_SCOREBOARD_C;
                     }
-		}
-
-            	*command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
-		*command_ptr++ = kernel;
-		*command_ptr++ = USE_SCOREBOARD;
-		/* Indirect data */
-		*command_ptr++ = 0;
-		/* the (X, Y) term of scoreboard */
-		*command_ptr++ = ((y_inner << 16) | x_inner);
-		*command_ptr++ = score_dep;
-		/*inline data */
-		*command_ptr++ = (mb_width << 16 | y_inner << 8 | x_inner);
-		*command_ptr++ = ((1 << 18) | (1 << 16) | transform_8x8_mode_flag | (mb_intra_ub << 8));
-
-		x_inner -= 2;
-		y_inner += 1;
-	    }
-	    x_outer++;
-	    if (x_outer >= mb_width) {
-		y_outer += 1;
-		x_outer = xtemp_outer;
-	    }		
-	}
+                }
+
+                *command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
+                *command_ptr++ = kernel;
+                *command_ptr++ = USE_SCOREBOARD;
+                /* Indirect data */
+                *command_ptr++ = 0;
+                /* the (X, Y) term of scoreboard */
+                *command_ptr++ = ((y_inner << 16) | x_inner);
+                *command_ptr++ = score_dep;
+                /*inline data */
+                *command_ptr++ = (mb_width << 16 | y_inner << 8 | x_inner);
+                *command_ptr++ = ((1 << 18) | (1 << 16) | transform_8x8_mode_flag | (mb_intra_ub << 8));
+
+                x_inner -= 2;
+                y_inner += 1;
+            }
+            x_outer++;
+            if (x_outer >= mb_width) {
+                y_outer += 1;
+                x_outer = xtemp_outer;
+            }		
+        }
     }
 
     *command_ptr++ = 0;
@@ -1270,7 +1270,7 @@ void intel_vme_mpeg2_state_setup(VADriverContextP ctx,
          */
         vme_state_message[MODE_INTRA_16X16] = intel_format_lutvalue(m_cost, 0x8f);
         vme_state_message[MODE_INTER_16X16] = intel_format_lutvalue(m_cost, 0x8f);
-			
+
         vme_state_message[MODE_INTER_16X8] = 0;
         vme_state_message[MODE_INTER_8X8] = 0;
         vme_state_message[MODE_INTER_8X4] = 0;
@@ -1300,105 +1300,105 @@ gen7_vme_mpeg2_walker_fill_vme_batchbuffer(VADriverContextP ctx,
     command_ptr = vme_context->vme_batchbuffer.bo->virtual;
 
     {
-	unsigned int mb_intra_ub, score_dep;
-	int x_outer, y_outer, x_inner, y_inner;
-	int xtemp_outer = 0;
-	int first_mb = 0;
-	int num_mb = mb_width * mb_height;
-
-	x_outer = 0;
-	y_outer = 0;
-	
-				 
-	for (; x_outer < (mb_width -2 ) && !loop_in_bounds(x_outer, y_outer, first_mb, num_mb, mb_width, mb_height); ) {
-	    x_inner = x_outer;
-	    y_inner = y_outer;
-	    for (; !loop_in_bounds(x_inner, y_inner, first_mb, num_mb, mb_width, mb_height);) {
-		mb_intra_ub = 0;
-		score_dep = 0;
-		if (x_inner != 0) {
-		    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
-		    score_dep |= MB_SCOREBOARD_A; 
-		}
-		if (y_inner != 0) {
-		    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
-		    score_dep |= MB_SCOREBOARD_B;
-
-		    if (x_inner != 0)
-			mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
-
-		    if (x_inner != (mb_width -1)) {
-			mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
-			score_dep |= MB_SCOREBOARD_C;
-		    }
-		}
-							
-            	*command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
-		*command_ptr++ = kernel;
-		*command_ptr++ = MPEG2_SCOREBOARD;
-		/* Indirect data */
-		*command_ptr++ = 0;
-		/* the (X, Y) term of scoreboard */
-		*command_ptr++ = ((y_inner << 16) | x_inner);
-		*command_ptr++ = score_dep;
-		/*inline data */
-		*command_ptr++ = (mb_width << 16 | y_inner << 8 | x_inner);
-		*command_ptr++ = ((1 << 18) | (1 << 16) | (mb_intra_ub << 8));
-		x_inner -= 2;
-		y_inner += 1;
-	    }
-	    x_outer += 1;
-	}
-
-	xtemp_outer = mb_width - 2;
-	if (xtemp_outer < 0)
+        unsigned int mb_intra_ub, score_dep;
+        int x_outer, y_outer, x_inner, y_inner;
+        int xtemp_outer = 0;
+        int first_mb = 0;
+        int num_mb = mb_width * mb_height;
+
+        x_outer = 0;
+        y_outer = 0;
+
+
+        for (; x_outer < (mb_width -2 ) && !loop_in_bounds(x_outer, y_outer, first_mb, num_mb, mb_width, mb_height); ) {
+            x_inner = x_outer;
+            y_inner = y_outer;
+            for (; !loop_in_bounds(x_inner, y_inner, first_mb, num_mb, mb_width, mb_height);) {
+                mb_intra_ub = 0;
+                score_dep = 0;
+                if (x_inner != 0) {
+                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
+                    score_dep |= MB_SCOREBOARD_A; 
+                }
+                if (y_inner != 0) {
+                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
+                    score_dep |= MB_SCOREBOARD_B;
+
+                    if (x_inner != 0)
+                        mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
+
+                    if (x_inner != (mb_width -1)) {
+                        mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
+                        score_dep |= MB_SCOREBOARD_C;
+                    }
+                }
+
+                *command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
+                *command_ptr++ = kernel;
+                *command_ptr++ = MPEG2_SCOREBOARD;
+                /* Indirect data */
+                *command_ptr++ = 0;
+                /* the (X, Y) term of scoreboard */
+                *command_ptr++ = ((y_inner << 16) | x_inner);
+                *command_ptr++ = score_dep;
+                /*inline data */
+                *command_ptr++ = (mb_width << 16 | y_inner << 8 | x_inner);
+                *command_ptr++ = ((1 << 18) | (1 << 16) | (mb_intra_ub << 8));
+                x_inner -= 2;
+                y_inner += 1;
+            }
+            x_outer += 1;
+        }
+
+        xtemp_outer = mb_width - 2;
+        if (xtemp_outer < 0)
             xtemp_outer = 0;
-	x_outer = xtemp_outer;
-	y_outer = 0;
-	for (;!loop_in_bounds(x_outer, y_outer, first_mb, num_mb, mb_width, mb_height); ) { 
-	    y_inner = y_outer;
-	    x_inner = x_outer;
-	    for (; !loop_in_bounds(x_inner, y_inner, first_mb, num_mb, mb_width, mb_height);) {
-	    	mb_intra_ub = 0;
-		score_dep = 0;
-		if (x_inner != 0) {
-		    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
-		    score_dep |= MB_SCOREBOARD_A; 
-		}
-		if (y_inner != 0) {
-		    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
-		    score_dep |= MB_SCOREBOARD_B;
-
-		    if (x_inner != 0)
-			mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
-
-		    if (x_inner != (mb_width -1)) {
-			mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
-			score_dep |= MB_SCOREBOARD_C;
-		    }
-		}
-
-            	*command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
-		*command_ptr++ = kernel;
-		*command_ptr++ = MPEG2_SCOREBOARD;
-		/* Indirect data */
-		*command_ptr++ = 0;
-		/* the (X, Y) term of scoreboard */
-		*command_ptr++ = ((y_inner << 16) | x_inner);
-		*command_ptr++ = score_dep;
-		/*inline data */
-		*command_ptr++ = (mb_width << 16 | y_inner << 8 | x_inner);
-		*command_ptr++ = ((1 << 18) | (1 << 16) | (mb_intra_ub << 8));
-
-		x_inner -= 2;
-		y_inner += 1;
-	    }
-	    x_outer++;
-	    if (x_outer >= mb_width) {
-		y_outer += 1;
-		x_outer = xtemp_outer;
-	    }		
-	}
+        x_outer = xtemp_outer;
+        y_outer = 0;
+        for (;!loop_in_bounds(x_outer, y_outer, first_mb, num_mb, mb_width, mb_height); ) { 
+            y_inner = y_outer;
+            x_inner = x_outer;
+            for (; !loop_in_bounds(x_inner, y_inner, first_mb, num_mb, mb_width, mb_height);) {
+                mb_intra_ub = 0;
+                score_dep = 0;
+                if (x_inner != 0) {
+                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
+                    score_dep |= MB_SCOREBOARD_A; 
+                }
+                if (y_inner != 0) {
+                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
+                    score_dep |= MB_SCOREBOARD_B;
+
+                    if (x_inner != 0)
+                        mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
+
+                    if (x_inner != (mb_width -1)) {
+                        mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
+                        score_dep |= MB_SCOREBOARD_C;
+                    }
+                }
+
+                *command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
+                *command_ptr++ = kernel;
+                *command_ptr++ = MPEG2_SCOREBOARD;
+                /* Indirect data */
+                *command_ptr++ = 0;
+                /* the (X, Y) term of scoreboard */
+                *command_ptr++ = ((y_inner << 16) | x_inner);
+                *command_ptr++ = score_dep;
+                /*inline data */
+                *command_ptr++ = (mb_width << 16 | y_inner << 8 | x_inner);
+                *command_ptr++ = ((1 << 18) | (1 << 16) | (mb_intra_ub << 8));
+
+                x_inner -= 2;
+                y_inner += 1;
+            }
+            x_outer++;
+            if (x_outer >= mb_width) {
+                y_outer += 1;
+                x_outer = xtemp_outer;
+            }		
+        }
     }
 
     *command_ptr++ = 0;
@@ -1406,7 +1406,7 @@ gen7_vme_mpeg2_walker_fill_vme_batchbuffer(VADriverContextP ctx,
 
     dri_bo_unmap(vme_context->vme_batchbuffer.bo);
     return;
-}
+}
 
 static int
 avc_temporal_find_surface(VAPictureH264 *curr_pic,
@@ -1652,3 +1652,225 @@ void intel_avc_slice_insert_packed_data(VADriverContextP ctx,
     return;
 }
 
+/* HEVC: pick the entry of ref_list[0..num_pictures) temporally nearest to curr_pic.
+ * dir == 0 accepts only a smaller pic_order_cnt, dir != 0 only a larger one.
+ * Scanning stops at the first invalid entry; returns the index, or -1 if none. */
+static int
+hevc_temporal_find_surface(VAPictureHEVC *curr_pic,
+                           VAPictureHEVC *ref_list,
+                           int num_pictures,
+                           int dir)
+{
+    int best_idx = -1;
+    int best_dist = 0x7FFFFFFF;
+    int n;
+
+    for (n = 0; n < num_pictures; n++) {
+        const VAPictureHEVC *ref = &ref_list[n];
+        int dist;
+
+        if ((ref->flags & VA_PICTURE_HEVC_INVALID) || ref->picture_id == VA_INVALID_SURFACE)
+            break;
+        dist = curr_pic->pic_order_cnt - ref->pic_order_cnt;
+        if (dir)
+            dist = -dist;
+        if (dist <= 0 || dist >= best_dist)
+            continue;
+        best_dist = dist;
+        best_idx = n;
+    }
+    return best_idx;
+}
+/*
+ * Choose the HEVC reference picture VME uses for list_index (0 selects
+ * ref_pic_list0, anything else ref_pic_list1) from the first slice and bind it
+ * through vme_source_surface_state() at surface_index. The result is recorded in
+ * vme_context (used_references, used_reference_objects, ref_index_in_mb); all
+ * three are cleared when no surface with a buffer object could be found.
+ */
+void
+intel_hevc_vme_reference_state(VADriverContextP ctx,
+                               struct encode_state *encode_state,
+                               struct intel_encoder_context *encoder_context,
+                               int list_index,
+                               int surface_index,
+                               void (* vme_source_surface_state)(
+                                   VADriverContextP ctx,
+                                   int index,
+                                   struct object_surface *obj_surface,
+                                   struct intel_encoder_context *encoder_context))
+{
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+    struct object_surface *obj_surface = NULL;
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    VASurfaceID ref_surface_id;
+    VAEncPictureParameterBufferHEVC *pic_param = (VAEncPictureParameterBufferHEVC *)encode_state->pic_param_ext->buffer;
+    VAEncSliceParameterBufferHEVC *slice_param = (VAEncSliceParameterBufferHEVC *)encode_state->slice_params_ext[0]->buffer;
+    VAPictureHEVC *ref_list;
+    int max_num_references, ref_idx;
+
+    if (list_index == 0) {
+        max_num_references = pic_param->num_ref_idx_l0_default_active_minus1 + 1;
+        ref_list = slice_param->ref_pic_list0;
+    } else {
+        max_num_references = pic_param->num_ref_idx_l1_default_active_minus1 + 1;
+        ref_list = slice_param->ref_pic_list1;
+    }
+
+    if (max_num_references == 1) {
+        if (list_index == 0) {
+            ref_surface_id = slice_param->ref_pic_list0[0].picture_id;
+            vme_context->used_references[0] = &slice_param->ref_pic_list0[0];
+        } else {
+            ref_surface_id = slice_param->ref_pic_list1[0].picture_id;
+            vme_context->used_references[1] = &slice_param->ref_pic_list1[0];
+        }
+
+        if (ref_surface_id != VA_INVALID_SURFACE)
+            obj_surface = SURFACE(ref_surface_id);
+
+        /* slice entry unusable: fall back to reference_objects[]/reference_frames[] */
+        if (!obj_surface || !obj_surface->bo) {
+            obj_surface = encode_state->reference_objects[list_index];
+            vme_context->used_references[list_index] = &pic_param->reference_frames[list_index];
+        }
+
+        ref_idx = 0;
+    } else {
+        /* select the reference frame in temporal space */
+        ref_idx = hevc_temporal_find_surface(&pic_param->decoded_curr_pic, ref_list, max_num_references, list_index == 1);
+
+        /* -1 means nothing qualified: ref_list must never be indexed with it */
+        if (ref_idx >= 0) {
+            ref_surface_id = ref_list[ref_idx].picture_id;
+            if (ref_surface_id != VA_INVALID_SURFACE) /* otherwise warning later */
+                obj_surface = SURFACE(ref_surface_id);
+            vme_context->used_references[list_index] = &ref_list[ref_idx];
+        }
+    }
+
+    if (obj_surface && obj_surface->bo) {
+        assert(ref_idx >= 0);
+        vme_context->used_reference_objects[list_index] = obj_surface;
+        vme_source_surface_state(ctx, surface_index, obj_surface, encoder_context);
+        vme_context->ref_index_in_mb[list_index] = (ref_idx << 24 | ref_idx << 16 | ref_idx << 8 | ref_idx);
+    } else {
+        vme_context->used_reference_objects[list_index] = NULL;
+        vme_context->used_references[list_index] = NULL;
+        vme_context->ref_index_in_mb[list_index] = 0;
+    }
+}
+
+/*
+ * Fill the mode and motion-vector cost entries of vme_context->vme_state_message
+ * for HEVC from the QP of the first slice (pic_init_qp + slice_qp_delta).
+ * Does nothing when no state message buffer is attached.
+ */
+void intel_vme_hevc_update_mbmv_cost(VADriverContextP ctx,
+                                     struct encode_state *encode_state,
+                                     struct intel_encoder_context *encoder_context)
+{
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+    VAEncPictureParameterBufferHEVC *pic_param = (VAEncPictureParameterBufferHEVC *)encode_state->pic_param_ext->buffer;
+    VAEncSliceParameterBufferHEVC *slice_param = (VAEncSliceParameterBufferHEVC *)encode_state->slice_params_ext[0]->buffer;
+    int qp, m_cost, j, mv_count;
+    uint8_t *vme_state_message = (uint8_t *)(vme_context->vme_state_message);
+    float   lambda, m_costf;
+
+    /* here no SI SP slice for HEVC, do not need slice fixup */
+    int slice_type = slice_param->slice_type;
+
+    /* TODO: QP always comes from the picture/slice parameters; CBR is still to do */
+    qp = pic_param->pic_init_qp + slice_param->slice_qp_delta;
+
+    if (vme_state_message == NULL)
+        return;
+
+    assert(qp <= QP_MAX);
+    lambda = intel_lambda_qp(qp);
+    if (slice_type == SLICE_TYPE_I) {
+        vme_state_message[MODE_INTRA_16X16] = 0;
+        m_cost = lambda * 4;
+        vme_state_message[MODE_INTRA_8X8] = intel_format_lutvalue(m_cost, 0x8f);
+        m_cost = lambda * 16;
+        vme_state_message[MODE_INTRA_4X4] = intel_format_lutvalue(m_cost, 0x8f);
+        m_cost = lambda * 3;
+        vme_state_message[MODE_INTRA_NONPRED] = intel_format_lutvalue(m_cost, 0x6f);
+    } else {
+        m_cost = 0;
+        vme_state_message[MODE_INTER_MV0] = intel_format_lutvalue(m_cost, 0x6f);
+        for (j = 1; j < 3; j++) {
+            m_costf = (log2f((float)(j + 1)) + 1.718f) * lambda;
+            m_cost = (int)m_costf;
+            vme_state_message[MODE_INTER_MV0 + j] = intel_format_lutvalue(m_cost, 0x6f);
+        }
+        mv_count = 3;
+        for (j = 4; j <= 64; j *= 2) {
+            m_costf = (log2f((float)(j + 1)) + 1.718f) * lambda;
+            m_cost = (int)m_costf;
+            vme_state_message[MODE_INTER_MV0 + mv_count] = intel_format_lutvalue(m_cost, 0x6f);
+            mv_count++;
+        }
+
+        if (qp <= 25) {
+            vme_state_message[MODE_INTRA_16X16] = 0x4a;
+            vme_state_message[MODE_INTRA_8X8] = 0x4a;
+            vme_state_message[MODE_INTRA_4X4] = 0x4a;
+            vme_state_message[MODE_INTRA_NONPRED] = 0x4a;
+            vme_state_message[MODE_INTER_16X16] = 0x4a;
+            vme_state_message[MODE_INTER_16X8] = 0x4a;
+            vme_state_message[MODE_INTER_8X8] = 0x4a;
+            vme_state_message[MODE_INTER_8X4] = 0x4a;
+            vme_state_message[MODE_INTER_4X4] = 0x4a;
+            vme_state_message[MODE_INTER_BWD] = 0x2a;
+            return;
+        }
+        m_cost = lambda * 10;
+        vme_state_message[MODE_INTRA_16X16] = intel_format_lutvalue(m_cost, 0x8f);
+        m_cost = lambda * 14;
+        vme_state_message[MODE_INTRA_8X8] = intel_format_lutvalue(m_cost, 0x8f);
+        m_cost = lambda * 24;
+        vme_state_message[MODE_INTRA_4X4] = intel_format_lutvalue(m_cost, 0x8f);
+        m_costf = lambda * 3.5;
+        m_cost = m_costf;
+        vme_state_message[MODE_INTRA_NONPRED] = intel_format_lutvalue(m_cost, 0x6f);
+        if (slice_type == SLICE_TYPE_P) {
+            m_costf = lambda * 2.5;
+            m_cost = m_costf;
+            vme_state_message[MODE_INTER_16X16] = intel_format_lutvalue(m_cost, 0x8f);
+            m_costf = lambda * 4;
+            m_cost = m_costf;
+            vme_state_message[MODE_INTER_16X8] = intel_format_lutvalue(m_cost, 0x8f);
+            m_costf = lambda * 1.5;
+            m_cost = m_costf;
+            vme_state_message[MODE_INTER_8X8] = intel_format_lutvalue(m_cost, 0x6f);
+            m_costf = lambda * 3;
+            m_cost = m_costf;
+            vme_state_message[MODE_INTER_8X4] = intel_format_lutvalue(m_cost, 0x6f);
+            m_costf = lambda * 5;
+            m_cost = m_costf;
+            vme_state_message[MODE_INTER_4X4] = intel_format_lutvalue(m_cost, 0x6f);
+            /* BWD is not used in P-frame */
+            vme_state_message[MODE_INTER_BWD] = 0;
+        } else {
+            m_costf = lambda * 2.5;
+            m_cost = m_costf;
+            vme_state_message[MODE_INTER_16X16] = intel_format_lutvalue(m_cost, 0x8f);
+            m_costf = lambda * 5.5;
+            m_cost = m_costf;
+            vme_state_message[MODE_INTER_16X8] = intel_format_lutvalue(m_cost, 0x8f);
+            m_costf = lambda * 3.5;
+            m_cost = m_costf;
+            vme_state_message[MODE_INTER_8X8] = intel_format_lutvalue(m_cost, 0x6f);
+            m_costf = lambda * 5.0;
+            m_cost = m_costf;
+            vme_state_message[MODE_INTER_8X4] = intel_format_lutvalue(m_cost, 0x6f);
+            m_costf = lambda * 6.5;
+            m_cost = m_costf;
+            vme_state_message[MODE_INTER_4X4] = intel_format_lutvalue(m_cost, 0x6f);
+            m_costf = lambda * 1.5;
+            m_cost = m_costf;
+            vme_state_message[MODE_INTER_BWD] = intel_format_lutvalue(m_cost, 0x6f);
+        }
+    }
+}
diff --git a/src/gen6_vme.h b/src/gen6_vme.h
index bc62c14..c9d6b48 100644
--- a/src/gen6_vme.h
+++ b/src/gen6_vme.h
@@ -83,6 +83,7 @@ struct gen6_vme_context
                                            unsigned long surface_state_offset);
     void *vme_state_message;
     unsigned int h264_level;
+    unsigned int hevc_level;
     unsigned int video_coding_type;
     unsigned int vme_kernel_sum;
     unsigned int mpeg2_level;
@@ -174,6 +175,24 @@ intel_avc_vme_reference_state(VADriverContextP ctx,
                                   struct object_surface *obj_surface,
                                   struct intel_encoder_context *encoder_context));
 
+/* HEVC VME helpers: intel_hevc_vme_reference_state parallels intel_avc_vme_reference_state above; intel_vme_hevc_update_mbmv_cost parallels intel_vme_update_mbmv_cost */
+void
+intel_hevc_vme_reference_state(VADriverContextP ctx,
+                              struct encode_state *encode_state,
+                              struct intel_encoder_context *encoder_context,
+                              int list_index,
+                              int surface_index,
+                              void (* vme_source_surface_state)(
+                                  VADriverContextP ctx,
+                                  int index,
+                                  struct object_surface *obj_surface,
+                                  struct intel_encoder_context *encoder_context));
+
+void intel_vme_hevc_update_mbmv_cost(VADriverContextP ctx,
+                                struct encode_state *encode_state,
+                                struct intel_encoder_context *encoder_context);
+
+
 extern Bool gen8_vme_context_init(VADriverContextP ctx, struct intel_encoder_context *encoder_context);
 
 extern Bool gen9_vme_context_init(VADriverContextP ctx, struct intel_encoder_context *encoder_context);
diff --git a/src/gen9_vme.c b/src/gen9_vme.c
index b4310f2..9239645 100644
--- a/src/gen9_vme.c
+++ b/src/gen9_vme.c
@@ -51,13 +51,13 @@
 
 #define VME_INTRA_SHADER        0
 #define VME_INTER_SHADER        1
-#define VME_BINTER_SHADER	2
+#define VME_BINTER_SHADER       2
 
 #define CURBE_ALLOCATION_SIZE   37              /* in 256-bit */
 #define CURBE_TOTAL_DATA_LENGTH (4 * 32)        /* in byte, it should be less than or equal to CURBE_ALLOCATION_SIZE * 32 */
 #define CURBE_URB_ENTRY_LENGTH  4               /* in 256-bit, it should be less than or equal to CURBE_TOTAL_DATA_LENGTH / 32 */
 
-#define VME_MSG_LENGTH		32
+#define VME_MSG_LENGTH          32
 
 static const uint32_t gen9_vme_intra_frame[][4] = {
 #include "shaders/vme/intra_frame_gen9.g9b"
@@ -120,6 +120,43 @@ static struct i965_kernel gen9_vme_mpeg2_kernels[] = {
     },
 };
 
+/* HEVC: kernel table indexed by VME_*_SHADER, built from the gen9 binaries under shaders/vme/ (intra_frame_gen9.g9b is the same file gen9_vme_intra_frame includes) */
+
+static const uint32_t gen9_vme_hevc_intra_frame[][4] = {
+#include "shaders/vme/intra_frame_gen9.g9b"
+};
+
+static const uint32_t gen9_vme_hevc_inter_frame[][4] = {
+#include "shaders/vme/inter_frame_gen9.g9b"
+};
+
+static const uint32_t gen9_vme_hevc_inter_bframe[][4] = {
+#include "shaders/vme/inter_bframe_gen9.g9b"
+};
+
+static struct i965_kernel gen9_vme_hevc_kernels[] = {
+    {
+        "VME Intra Frame",
+        VME_INTRA_SHADER, /*index*/
+        gen9_vme_hevc_intra_frame,
+        sizeof(gen9_vme_hevc_intra_frame),
+        NULL
+    },
+    {
+        "VME inter Frame",
+        VME_INTER_SHADER,
+        gen9_vme_hevc_inter_frame,
+        sizeof(gen9_vme_hevc_inter_frame),
+        NULL
+    },
+    {
+        "VME inter BFrame",
+        VME_BINTER_SHADER,
+        gen9_vme_hevc_inter_bframe,
+        sizeof(gen9_vme_hevc_inter_bframe),
+        NULL
+    }
+};
 /* only used for VME source surface state */
 static void
 gen9_vme_source_surface_state(VADriverContextP ctx,
@@ -226,13 +263,13 @@ gen9_vme_output_vme_batchbuffer_setup(VADriverContextP ctx,
                                                    "VME batchbuffer",
                                                    vme_context->vme_batchbuffer.num_blocks * vme_context->vme_batchbuffer.size_block,
                                                    0x1000);
-	/*
+    /*
     vme_context->vme_buffer_suface_setup(ctx,
                                          &vme_context->gpe_context,
                                          &vme_context->vme_batchbuffer,
                                          BINDING_TABLE_OFFSET(index),
                                          SURFACE_STATE_OFFSET(index));
-	*/
+    */
 }
 
 static VAStatus
@@ -251,15 +288,15 @@ gen9_vme_surface_setup(VADriverContextP ctx,
     gen9_vme_media_chroma_source_surface_state(ctx, 6, obj_surface, encoder_context);
 
     if (!is_intra) {
-	VAEncSliceParameterBufferH264 *slice_param = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
-	int slice_type;
+        VAEncSliceParameterBufferH264 *slice_param = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
+        int slice_type;
 
-	slice_type = intel_avc_enc_slice_type_fixup(slice_param->slice_type);
-	assert(slice_type != SLICE_TYPE_I && slice_type != SLICE_TYPE_SI);
+        slice_type = intel_avc_enc_slice_type_fixup(slice_param->slice_type);
+        assert(slice_type != SLICE_TYPE_I && slice_type != SLICE_TYPE_SI);
 
-	intel_avc_vme_reference_state(ctx, encode_state, encoder_context, 0, 1, gen9_vme_source_surface_state);
+        intel_avc_vme_reference_state(ctx, encode_state, encoder_context, 0, 1, gen9_vme_source_surface_state);
 
-	if (slice_type == SLICE_TYPE_B)
+        if (slice_type == SLICE_TYPE_B)
             intel_avc_vme_reference_state(ctx, encode_state, encoder_context, 1, 2, gen9_vme_source_surface_state);
     }
 
@@ -330,6 +367,13 @@ static VAStatus gen9_vme_constant_setup(VADriverContextP ctx,
         }
     } else if (encoder_context->codec == CODEC_MPEG2) {
         mv_num = 2;
+    }else if (encoder_context->codec == CODEC_HEVC) {
+        if (vme_context->hevc_level >= 30*3) {
+            mv_num = 16;
+
+            if (vme_context->hevc_level >= 31*3)
+                mv_num = 8;
+        }/* use the avc level setting */
     }
 
     vme_state_message[31] = mv_num;
@@ -388,107 +432,107 @@ gen9wa_vme_walker_fill_vme_batchbuffer(VADriverContextP ctx,
     command_ptr = vme_context->vme_batchbuffer.bo->virtual;
 
     for (s = 0; s < encode_state->num_slice_params_ext; s++) {
-	VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[s]->buffer;
-	int first_mb = pSliceParameter->macroblock_address;
-	int num_mb = pSliceParameter->num_macroblocks;
-	unsigned int mb_intra_ub, score_dep;
-	int x_outer, y_outer, x_inner, y_inner;
-	int xtemp_outer = 0;
-
-	x_outer = first_mb % mb_width;
-	y_outer = first_mb / mb_width;
-	mb_row = y_outer;
-
-	for (; x_outer < (mb_width -2 ) && !loop_in_bounds(x_outer, y_outer, first_mb, num_mb, mb_width, mb_height); ) {
-	    x_inner = x_outer;
-	    y_inner = y_outer;
-	    for (; !loop_in_bounds(x_inner, y_inner, first_mb, num_mb, mb_width, mb_height);) {
-		mb_intra_ub = 0;
-		score_dep = 0;
-		if (x_inner != 0) {
-		    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
-		    score_dep |= MB_SCOREBOARD_A;
+        VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[s]->buffer;
+        int first_mb = pSliceParameter->macroblock_address;
+        int num_mb = pSliceParameter->num_macroblocks;
+        unsigned int mb_intra_ub, score_dep;
+        int x_outer, y_outer, x_inner, y_inner;
+        int xtemp_outer = 0;
+
+        x_outer = first_mb % mb_width;
+        y_outer = first_mb / mb_width;
+        mb_row = y_outer;
+
+        for (; x_outer < (mb_width -2 ) && !loop_in_bounds(x_outer, y_outer, first_mb, num_mb, mb_width, mb_height); ) {
+            x_inner = x_outer;
+            y_inner = y_outer;
+            for (; !loop_in_bounds(x_inner, y_inner, first_mb, num_mb, mb_width, mb_height);) {
+                mb_intra_ub = 0;
+                score_dep = 0;
+                if (x_inner != 0) {
+                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
+                    score_dep |= MB_SCOREBOARD_A;
                 }
-		if (y_inner != mb_row) {
-		    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
-		    score_dep |= MB_SCOREBOARD_B;
-		    if (x_inner != 0)
-			mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
-		    if (x_inner != (mb_width -1)) {
-			mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
-			score_dep |= MB_SCOREBOARD_C;
+                if (y_inner != mb_row) {
+                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
+                    score_dep |= MB_SCOREBOARD_B;
+                    if (x_inner != 0)
+                        mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
+                    if (x_inner != (mb_width -1)) {
+                        mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
+                        score_dep |= MB_SCOREBOARD_C;
                     }
-		}
-
-		*command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
-		*command_ptr++ = kernel;
-		*command_ptr++ = USE_SCOREBOARD;
-		/* Indirect data */
-		*command_ptr++ = 0;
-		/* the (X, Y) term of scoreboard */
-		*command_ptr++ = ((y_inner << 16) | x_inner);
-		*command_ptr++ = score_dep;
-		/*inline data */
-		*command_ptr++ = (mb_width << 16 | y_inner << 8 | x_inner);
-		*command_ptr++ = ((1 << 18) | (1 << 16) | transform_8x8_mode_flag | (mb_intra_ub << 8));
+                }
+
+                *command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
+                *command_ptr++ = kernel;
+                *command_ptr++ = USE_SCOREBOARD;
+                /* Indirect data */
+                *command_ptr++ = 0;
+                /* the (X, Y) term of scoreboard */
+                *command_ptr++ = ((y_inner << 16) | x_inner);
+                *command_ptr++ = score_dep;
+                /*inline data */
+                *command_ptr++ = (mb_width << 16 | y_inner << 8 | x_inner);
+                *command_ptr++ = ((1 << 18) | (1 << 16) | transform_8x8_mode_flag | (mb_intra_ub << 8));
                 *command_ptr++ = CMD_MEDIA_STATE_FLUSH;
                 *command_ptr++ = 0;
 
-		x_inner -= 2;
-		y_inner += 1;
-	    }
-	    x_outer += 1;
-	}
+                x_inner -= 2;
+                y_inner += 1;
+            }
+            x_outer += 1;
+        }
 
-	xtemp_outer = mb_width - 2;
-	if (xtemp_outer < 0)
+        xtemp_outer = mb_width - 2;
+        if (xtemp_outer < 0)
             xtemp_outer = 0;
-	x_outer = xtemp_outer;
-	y_outer = first_mb / mb_width;
-	for (;!loop_in_bounds(x_outer, y_outer, first_mb, num_mb, mb_width, mb_height); ) {
+        x_outer = xtemp_outer;
+        y_outer = first_mb / mb_width;
+        for (;!loop_in_bounds(x_outer, y_outer, first_mb, num_mb, mb_width, mb_height); ) {
             y_inner = y_outer;
-	    x_inner = x_outer;
-	    for (; !loop_in_bounds(x_inner, y_inner, first_mb, num_mb, mb_width, mb_height);) {
-		mb_intra_ub = 0;
-		score_dep = 0;
-		if (x_inner != 0) {
-		    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
-		    score_dep |= MB_SCOREBOARD_A;
+            x_inner = x_outer;
+            for (; !loop_in_bounds(x_inner, y_inner, first_mb, num_mb, mb_width, mb_height);) {
+                mb_intra_ub = 0;
+                score_dep = 0;
+                if (x_inner != 0) {
+                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
+                    score_dep |= MB_SCOREBOARD_A;
                 }
-		if (y_inner != mb_row) {
-		    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
-		    score_dep |= MB_SCOREBOARD_B;
-		    if (x_inner != 0)
-			mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
-
-		    if (x_inner != (mb_width -1)) {
-			mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
-			score_dep |= MB_SCOREBOARD_C;
+                if (y_inner != mb_row) {
+                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
+                    score_dep |= MB_SCOREBOARD_B;
+                    if (x_inner != 0)
+                        mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
+
+                    if (x_inner != (mb_width -1)) {
+                        mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
+                        score_dep |= MB_SCOREBOARD_C;
                     }
-		}
-
-		*command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
-		*command_ptr++ = kernel;
-		*command_ptr++ = USE_SCOREBOARD;
-		/* Indirect data */
-		*command_ptr++ = 0;
-		/* the (X, Y) term of scoreboard */
-		*command_ptr++ = ((y_inner << 16) | x_inner);
-		*command_ptr++ = score_dep;
-		/*inline data */
-		*command_ptr++ = (mb_width << 16 | y_inner << 8 | x_inner);
-		*command_ptr++ = ((1 << 18) | (1 << 16) | transform_8x8_mode_flag | (mb_intra_ub << 8));
+                }
+
+                *command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
+                *command_ptr++ = kernel;
+                *command_ptr++ = USE_SCOREBOARD;
+                /* Indirect data */
+                *command_ptr++ = 0;
+                /* the (X, Y) term of scoreboard */
+                *command_ptr++ = ((y_inner << 16) | x_inner);
+                *command_ptr++ = score_dep;
+                /*inline data */
+                *command_ptr++ = (mb_width << 16 | y_inner << 8 | x_inner);
+                *command_ptr++ = ((1 << 18) | (1 << 16) | transform_8x8_mode_flag | (mb_intra_ub << 8));
 
                 *command_ptr++ = CMD_MEDIA_STATE_FLUSH;
                 *command_ptr++ = 0;
-		x_inner -= 2;
-		y_inner += 1;
-	    }
-	    x_outer++;
-	    if (x_outer >= mb_width) {
-		y_outer += 1;
-		x_outer = xtemp_outer;
-	    }
+                x_inner -= 2;
+                y_inner += 1;
+            }
+            x_outer++;
+            if (x_outer >= mb_width) {
+                y_outer += 1;
+                x_outer = xtemp_outer;
+            }
         }
     }
 
@@ -519,34 +563,34 @@ gen9_vme_fill_vme_batchbuffer(VADriverContextP ctx,
         int slice_mb_begin = pSliceParameter->macroblock_address;
         int slice_mb_number = pSliceParameter->num_macroblocks;
         unsigned int mb_intra_ub;
-	int slice_mb_x = pSliceParameter->macroblock_address % mb_width;
+        int slice_mb_x = pSliceParameter->macroblock_address % mb_width;
         for (i = 0; i < slice_mb_number;  ) {
             int mb_count = i + slice_mb_begin;
             mb_x = mb_count % mb_width;
             mb_y = mb_count / mb_width;
-	    mb_intra_ub = 0;
-	    if (mb_x != 0) {
-		mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
-	    }
-	    if (mb_y != 0) {
-		mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
-		if (mb_x != 0)
+            mb_intra_ub = 0;
+            if (mb_x != 0) {
+                mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
+            }
+            if (mb_y != 0) {
+                mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
+                if (mb_x != 0)
                     mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
-		if (mb_x != (mb_width -1))
+                if (mb_x != (mb_width -1))
                     mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
-	    }
-	    if (i < mb_width) {
-		if (i == 0)
+            }
+            if (i < mb_width) {
+                if (i == 0)
                     mb_intra_ub &= ~(INTRA_PRED_AVAIL_FLAG_AE);
-		mb_intra_ub &= ~(INTRA_PRED_AVAIL_FLAG_BCD_MASK);
-		if ((i == (mb_width - 1)) && slice_mb_x) {
+                mb_intra_ub &= ~(INTRA_PRED_AVAIL_FLAG_BCD_MASK);
+                if ((i == (mb_width - 1)) && slice_mb_x) {
                     mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
-		}
-	    }
+                }
+            }
 
-	    if ((i == mb_width) && slice_mb_x) {
-		mb_intra_ub &= ~(INTRA_PRED_AVAIL_FLAG_D);
-	    }
+            if ((i == mb_width) && slice_mb_x) {
+                mb_intra_ub &= ~(INTRA_PRED_AVAIL_FLAG_D);
+            }
             *command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
             *command_ptr++ = kernel;
             *command_ptr++ = 0;
@@ -670,7 +714,7 @@ static VAStatus gen9_vme_prepare(VADriverContextP ctx,
 
     if (!vme_context->h264_level ||
         (vme_context->h264_level != pSequenceParameter->level_idc)) {
-	vme_context->h264_level = pSequenceParameter->level_idc;
+            vme_context->h264_level = pSequenceParameter->level_idc;
     }
 
     intel_vme_update_mbmv_cost(ctx, encode_state, encoder_context);
@@ -837,109 +881,109 @@ gen9wa_vme_mpeg2_walker_fill_vme_batchbuffer(VADriverContextP ctx,
     command_ptr = vme_context->vme_batchbuffer.bo->virtual;
 
     {
-	unsigned int mb_intra_ub, score_dep;
-	int x_outer, y_outer, x_inner, y_inner;
-	int xtemp_outer = 0;
-	int first_mb = 0;
-	int num_mb = mb_width * mb_height;
-
-	x_outer = 0;
-	y_outer = 0;
-
-	for (; x_outer < (mb_width -2 ) && !loop_in_bounds(x_outer, y_outer, first_mb, num_mb, mb_width, mb_height); ) {
-	    x_inner = x_outer;
-	    y_inner = y_outer;
-	    for (; !loop_in_bounds(x_inner, y_inner, first_mb, num_mb, mb_width, mb_height);) {
-		mb_intra_ub = 0;
-		score_dep = 0;
-		if (x_inner != 0) {
-		    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
-		    score_dep |= MB_SCOREBOARD_A;
+        unsigned int mb_intra_ub, score_dep;
+        int x_outer, y_outer, x_inner, y_inner;
+        int xtemp_outer = 0;
+        int first_mb = 0;
+        int num_mb = mb_width * mb_height;
+
+        x_outer = 0;
+        y_outer = 0;
+
+        for (; x_outer < (mb_width -2 ) && !loop_in_bounds(x_outer, y_outer, first_mb, num_mb, mb_width, mb_height); ) {
+            x_inner = x_outer;
+            y_inner = y_outer;
+            for (; !loop_in_bounds(x_inner, y_inner, first_mb, num_mb, mb_width, mb_height);) {
+                mb_intra_ub = 0;
+                score_dep = 0;
+                if (x_inner != 0) {
+                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
+                    score_dep |= MB_SCOREBOARD_A;
                 }
-		if (y_inner != 0) {
-		    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
-		    score_dep |= MB_SCOREBOARD_B;
-
-		    if (x_inner != 0)
-			mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
-
-		    if (x_inner != (mb_width -1)) {
-			mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
-			score_dep |= MB_SCOREBOARD_C;
-		    }
-		}
-
-		*command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
-		*command_ptr++ = kernel;
-		*command_ptr++ = MPEG2_SCOREBOARD;
-		/* Indirect data */
-		*command_ptr++ = 0;
-		/* the (X, Y) term of scoreboard */
-		*command_ptr++ = ((y_inner << 16) | x_inner);
-		*command_ptr++ = score_dep;
-		/*inline data */
-		*command_ptr++ = (mb_width << 16 | y_inner << 8 | x_inner);
-		*command_ptr++ = ((1 << 18) | (1 << 16) | (mb_intra_ub << 8));
+                if (y_inner != 0) {
+                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
+                    score_dep |= MB_SCOREBOARD_B;
+
+                    if (x_inner != 0)
+                        mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
+
+                    if (x_inner != (mb_width -1)) {
+                        mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
+                        score_dep |= MB_SCOREBOARD_C;
+                    }
+                }
+
+                *command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
+                *command_ptr++ = kernel;
+                *command_ptr++ = MPEG2_SCOREBOARD;
+                /* Indirect data */
+                *command_ptr++ = 0;
+                /* the (X, Y) term of scoreboard */
+                *command_ptr++ = ((y_inner << 16) | x_inner);
+                *command_ptr++ = score_dep;
+                /*inline data */
+                *command_ptr++ = (mb_width << 16 | y_inner << 8 | x_inner);
+                *command_ptr++ = ((1 << 18) | (1 << 16) | (mb_intra_ub << 8));
                 *command_ptr++ = CMD_MEDIA_STATE_FLUSH;
                 *command_ptr++ = 0;
 
-		x_inner -= 2;
-		y_inner += 1;
-	    }
-	    x_outer += 1;
-	}
+                x_inner -= 2;
+                y_inner += 1;
+            }
+            x_outer += 1;
+        }
 
-	xtemp_outer = mb_width - 2;
-	if (xtemp_outer < 0)
+        xtemp_outer = mb_width - 2;
+        if (xtemp_outer < 0)
             xtemp_outer = 0;
-	x_outer = xtemp_outer;
-	y_outer = 0;
-	for (;!loop_in_bounds(x_outer, y_outer, first_mb, num_mb, mb_width, mb_height); ) {
+        x_outer = xtemp_outer;
+        y_outer = 0;
+        for (;!loop_in_bounds(x_outer, y_outer, first_mb, num_mb, mb_width, mb_height); ) {
             y_inner = y_outer;
-	    x_inner = x_outer;
-	    for (; !loop_in_bounds(x_inner, y_inner, first_mb, num_mb, mb_width, mb_height);) {
-		mb_intra_ub = 0;
-		score_dep = 0;
-		if (x_inner != 0) {
-		    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
-		    score_dep |= MB_SCOREBOARD_A;
+            x_inner = x_outer;
+            for (; !loop_in_bounds(x_inner, y_inner, first_mb, num_mb, mb_width, mb_height);) {
+                mb_intra_ub = 0;
+                score_dep = 0;
+                if (x_inner != 0) {
+                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
+                    score_dep |= MB_SCOREBOARD_A;
+                }
+                if (y_inner != 0) {
+                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
+                    score_dep |= MB_SCOREBOARD_B;
+
+                    if (x_inner != 0)
+                        mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
+
+                    if (x_inner != (mb_width -1)) {
+                        mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
+                        score_dep |= MB_SCOREBOARD_C;
+                    }
                 }
-		if (y_inner != 0) {
-		    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
-		    score_dep |= MB_SCOREBOARD_B;
-
-		    if (x_inner != 0)
-			mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
-
-		    if (x_inner != (mb_width -1)) {
-			mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
-			score_dep |= MB_SCOREBOARD_C;
-		    }
-		}
-
-		*command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
-		*command_ptr++ = kernel;
-		*command_ptr++ = MPEG2_SCOREBOARD;
-		/* Indirect data */
-		*command_ptr++ = 0;
-		/* the (X, Y) term of scoreboard */
-		*command_ptr++ = ((y_inner << 16) | x_inner);
-		*command_ptr++ = score_dep;
-		/*inline data */
-		*command_ptr++ = (mb_width << 16 | y_inner << 8 | x_inner);
-		*command_ptr++ = ((1 << 18) | (1 << 16) | (mb_intra_ub << 8));
+
+                *command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
+                *command_ptr++ = kernel;
+                *command_ptr++ = MPEG2_SCOREBOARD;
+                /* Indirect data */
+                *command_ptr++ = 0;
+                /* the (X, Y) term of scoreboard */
+                *command_ptr++ = ((y_inner << 16) | x_inner);
+                *command_ptr++ = score_dep;
+                /*inline data */
+                *command_ptr++ = (mb_width << 16 | y_inner << 8 | x_inner);
+                *command_ptr++ = ((1 << 18) | (1 << 16) | (mb_intra_ub << 8));
 
                 *command_ptr++ = CMD_MEDIA_STATE_FLUSH;
                 *command_ptr++ = 0;
-		x_inner -= 2;
-		y_inner += 1;
-	    }
-	    x_outer++;
-	    if (x_outer >= mb_width) {
-		y_outer += 1;
-		x_outer = xtemp_outer;
+                x_inner -= 2;
+                y_inner += 1;
+            }
+            x_outer++;
+            if (x_outer >= mb_width) {
+                y_outer += 1;
+                x_outer = xtemp_outer;
             }
-	}
+        }
     }
 
     *command_ptr++ = MI_BATCH_BUFFER_END;
@@ -1038,23 +1082,23 @@ gen9_vme_mpeg2_pipeline_programing(VADriverContextP ctx,
     VAEncPictureParameterBufferMPEG2 *pic_param = NULL;
 
     for (s = 0; s < encode_state->num_slice_params_ext; s++) {
-	int j;
+        int j;
         VAEncSliceParameterBufferMPEG2 *slice_param = (VAEncSliceParameterBufferMPEG2 *)encode_state->slice_params_ext[s]->buffer;
 
         for (j = 0; j < encode_state->slice_params_ext[s]->num_elements; j++) {
-	    if (slice_param->macroblock_address % width_in_mbs) {
-		allow_hwscore = false;
-		break;
-	    }
-	}
+            if (slice_param->macroblock_address % width_in_mbs) {
+                allow_hwscore = false;
+                break;
+            }
+        }
     }
 
     pic_param = (VAEncPictureParameterBufferMPEG2 *)encode_state->pic_param_ext->buffer;
     if (pic_param->picture_type == VAEncPictureTypeIntra) {
-	allow_hwscore = false;
-	kernel_shader = VME_INTRA_SHADER;
+        allow_hwscore = false;
+        kernel_shader = VME_INTRA_SHADER;
     } else {
-	kernel_shader = VME_INTER_SHADER;
+        kernel_shader = VME_INTER_SHADER;
     }
 
     if (allow_hwscore)
@@ -1064,7 +1108,7 @@ gen9_vme_mpeg2_pipeline_programing(VADriverContextP ctx,
                                                    kernel_shader,
                                                    encoder_context);
     else
-	gen9_vme_mpeg2_fill_vme_batchbuffer(ctx,
+        gen9_vme_mpeg2_fill_vme_batchbuffer(ctx,
                                             encode_state,
                                             width_in_mbs, height_in_mbs,
                                             is_intra ? VME_INTRA_SHADER : VME_INTER_SHADER,
@@ -1100,7 +1144,7 @@ gen9_vme_mpeg2_prepare(VADriverContextP ctx,
 
     if ((!vme_context->mpeg2_level) ||
         (vme_context->mpeg2_level != (seq_param->sequence_extension.bits.profile_and_level_indication & MPEG2_LEVEL_MASK))) {
-	vme_context->mpeg2_level = seq_param->sequence_extension.bits.profile_and_level_indication & MPEG2_LEVEL_MASK;
+            vme_context->mpeg2_level = seq_param->sequence_extension.bits.profile_and_level_indication & MPEG2_LEVEL_MASK;
     }
 
     /*Setup all the memory object*/
@@ -1130,6 +1174,440 @@ gen9_vme_mpeg2_pipeline(VADriverContextP ctx,
     return VA_STATUS_SUCCESS;
 }
 
+/* HEVC */
+
+static void
+gen9_vme_hevc_output_buffer_setup(VADriverContextP ctx,
+                             struct encode_state *encode_state,
+                             int index,
+                             struct intel_encoder_context *encoder_context)
+
+{
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+    VAEncSequenceParameterBufferHEVC *pSequenceParameter = (VAEncSequenceParameterBufferHEVC *)encode_state->seq_param_ext->buffer;
+    VAEncSliceParameterBufferHEVC *pSliceParameter = (VAEncSliceParameterBufferHEVC *)encode_state->slice_params_ext[0]->buffer;
+    int is_intra = pSliceParameter->slice_type == SLICE_TYPE_I;
+    int width_in_mbs = (pSequenceParameter->pic_width_in_luma_samples + 15)/16;
+    int height_in_mbs = (pSequenceParameter->pic_height_in_luma_samples + 15)/16;
+
+
+    vme_context->vme_output.num_blocks = width_in_mbs * height_in_mbs;
+    vme_context->vme_output.pitch = 16; /* in bytes, always 16 */
+
+    if (is_intra)
+        vme_context->vme_output.size_block = INTRA_VME_OUTPUT_IN_BYTES * 2;
+    else
+        vme_context->vme_output.size_block = INTRA_VME_OUTPUT_IN_BYTES * 24;
+    /*
+     * Inter MV: 32-byte Intra search + 16 IME info + 128 IME MV + 32 IME Ref
+     * + 16 FBR Info + 128 FBR MV + 32 FBR Ref.
+     * 16 * (2 + 2 * (1 + 8 + 2)) = 16 * 24.
+     */
+
+    vme_context->vme_output.bo = dri_bo_alloc(i965->intel.bufmgr,
+                                              "VME output buffer",
+                                              vme_context->vme_output.num_blocks * vme_context->vme_output.size_block,
+                                              0x1000);
+    assert(vme_context->vme_output.bo);
+    vme_context->vme_buffer_suface_setup(ctx,
+                                         &vme_context->gpe_context,
+                                         &vme_context->vme_output,
+                                         BINDING_TABLE_OFFSET(index),
+                                         SURFACE_STATE_OFFSET(index));
+}
+
+static void
+gen9_vme_hevc_output_vme_batchbuffer_setup(VADriverContextP ctx,
+                                      struct encode_state *encode_state,
+                                      int index,
+                                      struct intel_encoder_context *encoder_context)
+
+{
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+    VAEncSequenceParameterBufferHEVC *pSequenceParameter = (VAEncSequenceParameterBufferHEVC *)encode_state->seq_param_ext->buffer;
+    int width_in_mbs = (pSequenceParameter->pic_width_in_luma_samples + 15)/16;
+    int height_in_mbs = (pSequenceParameter->pic_height_in_luma_samples + 15)/16;
+
+    vme_context->vme_batchbuffer.num_blocks = width_in_mbs * height_in_mbs + 1;
+    vme_context->vme_batchbuffer.size_block = 64; /* 4 OWORDs */
+    vme_context->vme_batchbuffer.pitch = 16;
+    vme_context->vme_batchbuffer.bo = dri_bo_alloc(i965->intel.bufmgr,
+                                                   "VME batchbuffer",
+                                                   vme_context->vme_batchbuffer.num_blocks * vme_context->vme_batchbuffer.size_block,
+                                                   0x1000);
+}
+static VAStatus
+gen9_vme_hevc_surface_setup(VADriverContextP ctx,
+                       struct encode_state *encode_state,
+                       int is_intra,
+                       struct intel_encoder_context *encoder_context)
+{
+    struct object_surface *obj_surface;
+
+    /* Set up the surface states */
+    /* current picture for encoding */
+    obj_surface = encode_state->input_yuv_object;
+    gen9_vme_source_surface_state(ctx, 0, obj_surface, encoder_context);
+    gen9_vme_media_source_surface_state(ctx, 4, obj_surface, encoder_context);
+    gen9_vme_media_chroma_source_surface_state(ctx, 6, obj_surface, encoder_context);
+
+    if (!is_intra) {
+        VAEncSliceParameterBufferHEVC *slice_param = (VAEncSliceParameterBufferHEVC *)encode_state->slice_params_ext[0]->buffer;
+        int slice_type;
+
+        slice_type = slice_param->slice_type;
+        assert(slice_type != SLICE_TYPE_I && slice_type != SLICE_TYPE_SI);
+
+        /* to do HEVC */
+        intel_hevc_vme_reference_state(ctx, encode_state, encoder_context, 0, 1, gen9_vme_source_surface_state);
+
+        if (slice_type == SLICE_TYPE_B)
+            intel_hevc_vme_reference_state(ctx, encode_state, encoder_context, 1, 2, gen9_vme_source_surface_state);
+    }
+
+    /* VME output */
+    gen9_vme_hevc_output_buffer_setup(ctx, encode_state, 3, encoder_context);
+    gen9_vme_hevc_output_vme_batchbuffer_setup(ctx, encode_state, 5, encoder_context);
+
+    return VA_STATUS_SUCCESS;
+}
+static void
+gen9wa_vme_hevc_walker_fill_vme_batchbuffer(VADriverContextP ctx,
+                                     struct encode_state *encode_state,
+                                     int mb_width, int mb_height,
+                                     int kernel,
+                                     int transform_8x8_mode_flag,
+                                     struct intel_encoder_context *encoder_context)
+{
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+    int mb_row;
+    int s;
+    unsigned int *command_ptr;
+    VAEncSequenceParameterBufferHEVC *pSequenceParameter = (VAEncSequenceParameterBufferHEVC *)encode_state->seq_param_ext->buffer;
+    int log2_cu_size = pSequenceParameter->log2_min_luma_coding_block_size_minus3 + 3;
+    int log2_ctb_size = pSequenceParameter->log2_diff_max_min_luma_coding_block_size + log2_cu_size;
+    int ctb_size = 1 << log2_ctb_size;
+    int num_mb_in_ctb = (ctb_size + 15)/16;
+    num_mb_in_ctb = num_mb_in_ctb * num_mb_in_ctb;
+
+#define		USE_SCOREBOARD		(1 << 21)
+
+    dri_bo_map(vme_context->vme_batchbuffer.bo, 1);
+    command_ptr = vme_context->vme_batchbuffer.bo->virtual;
+
+    /* slice_segment_address must be aligned to picture_width_in_ctb */
+    for (s = 0; s < encode_state->num_slice_params_ext; s++) {
+        VAEncSliceParameterBufferHEVC *pSliceParameter = (VAEncSliceParameterBufferHEVC *)encode_state->slice_params_ext[s]->buffer;
+        int first_mb = pSliceParameter->slice_segment_address * num_mb_in_ctb;
+        int num_mb = pSliceParameter->num_ctu_in_slice * num_mb_in_ctb;
+        unsigned int mb_intra_ub, score_dep;
+        int x_outer, y_outer, x_inner, y_inner;
+        int xtemp_outer = 0;
+
+        x_outer = first_mb % mb_width;
+        y_outer = first_mb / mb_width;
+        mb_row = y_outer;
+
+        for (; x_outer < (mb_width -2 ) && !loop_in_bounds(x_outer, y_outer, first_mb, num_mb, mb_width, mb_height); ) {
+            x_inner = x_outer;
+            y_inner = y_outer;
+            for (; !loop_in_bounds(x_inner, y_inner, first_mb, num_mb, mb_width, mb_height);) {
+                mb_intra_ub = 0;
+                score_dep = 0;
+                if (x_inner != 0) {
+                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
+                    score_dep |= MB_SCOREBOARD_A;
+                }
+                if (y_inner != mb_row) {
+                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
+                    score_dep |= MB_SCOREBOARD_B;
+                    if (x_inner != 0)
+                        mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
+                    if (x_inner != (mb_width -1)) {
+                        mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
+                        score_dep |= MB_SCOREBOARD_C;
+                    }
+                }
+
+                *command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
+                *command_ptr++ = kernel;
+                *command_ptr++ = USE_SCOREBOARD;
+                /* Indirect data */
+                *command_ptr++ = 0;
+                /* the (X, Y) term of scoreboard */
+                *command_ptr++ = ((y_inner << 16) | x_inner);
+                *command_ptr++ = score_dep;
+                /*inline data */
+                *command_ptr++ = (mb_width << 16 | y_inner << 8 | x_inner);
+                *command_ptr++ = ((1 << 18) | (1 << 16) | transform_8x8_mode_flag | (mb_intra_ub << 8));
+                *command_ptr++ = CMD_MEDIA_STATE_FLUSH;
+                *command_ptr++ = 0;
+
+                x_inner -= 2;
+                y_inner += 1;
+            }
+            x_outer += 1;
+        }
+
+        xtemp_outer = mb_width - 2;
+        if (xtemp_outer < 0)
+            xtemp_outer = 0;
+        x_outer = xtemp_outer;
+        y_outer = first_mb / mb_width;
+        for (;!loop_in_bounds(x_outer, y_outer, first_mb, num_mb, mb_width, mb_height); ) {
+            y_inner = y_outer;
+            x_inner = x_outer;
+            for (; !loop_in_bounds(x_inner, y_inner, first_mb, num_mb, mb_width, mb_height);) {
+                mb_intra_ub = 0;
+                score_dep = 0;
+                if (x_inner != 0) {
+                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
+                    score_dep |= MB_SCOREBOARD_A;
+                }
+                if (y_inner != mb_row) {
+                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
+                    score_dep |= MB_SCOREBOARD_B;
+                    if (x_inner != 0)
+                        mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
+
+                    if (x_inner != (mb_width -1)) {
+                        mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
+                        score_dep |= MB_SCOREBOARD_C;
+                    }
+                }
+
+                *command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
+                *command_ptr++ = kernel;
+                *command_ptr++ = USE_SCOREBOARD;
+                /* Indirect data */
+                *command_ptr++ = 0;
+                /* the (X, Y) term of scoreboard */
+                *command_ptr++ = ((y_inner << 16) | x_inner);
+                *command_ptr++ = score_dep;
+                /*inline data */
+                *command_ptr++ = (mb_width << 16 | y_inner << 8 | x_inner);
+                *command_ptr++ = ((1 << 18) | (1 << 16) | transform_8x8_mode_flag | (mb_intra_ub << 8));
+
+                *command_ptr++ = CMD_MEDIA_STATE_FLUSH;
+                *command_ptr++ = 0;
+                x_inner -= 2;
+                y_inner += 1;
+            }
+            x_outer++;
+            if (x_outer >= mb_width) {
+                y_outer += 1;
+                x_outer = xtemp_outer;
+            }
+        }
+    }
+
+    *command_ptr++ = MI_BATCH_BUFFER_END;
+    *command_ptr++ = 0;
+
+    dri_bo_unmap(vme_context->vme_batchbuffer.bo);
+}
+
+static void
+gen9_vme_hevc_fill_vme_batchbuffer(VADriverContextP ctx,
+                              struct encode_state *encode_state,
+                              int mb_width, int mb_height,
+                              int kernel,
+                              int transform_8x8_mode_flag,
+                              struct intel_encoder_context *encoder_context)
+{
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+    int mb_x = 0, mb_y = 0;
+    int i, s;
+    unsigned int *command_ptr;
+    VAEncSequenceParameterBufferHEVC *pSequenceParameter = (VAEncSequenceParameterBufferHEVC *)encode_state->seq_param_ext->buffer;
+    int log2_cu_size = pSequenceParameter->log2_min_luma_coding_block_size_minus3 + 3;
+    int log2_ctb_size = pSequenceParameter->log2_diff_max_min_luma_coding_block_size + log2_cu_size;
+
+    int ctb_size = 1 << log2_ctb_size;
+    int num_mb_in_ctb = (ctb_size + 15)/16;
+    num_mb_in_ctb = num_mb_in_ctb * num_mb_in_ctb;
+
+    dri_bo_map(vme_context->vme_batchbuffer.bo, 1);
+    command_ptr = vme_context->vme_batchbuffer.bo->virtual;
+
+    for (s = 0; s < encode_state->num_slice_params_ext; s++) {
+        VAEncSliceParameterBufferHEVC *pSliceParameter = (VAEncSliceParameterBufferHEVC *)encode_state->slice_params_ext[s]->buffer;
+        int slice_mb_begin = pSliceParameter->slice_segment_address * num_mb_in_ctb;
+        int slice_mb_number = pSliceParameter->num_ctu_in_slice * num_mb_in_ctb;
+
+        unsigned int mb_intra_ub;
+        int slice_mb_x = slice_mb_begin % mb_width;
+        for (i = 0; i < slice_mb_number;  ) {
+            int mb_count = i + slice_mb_begin;
+            mb_x = mb_count % mb_width;
+            mb_y = mb_count / mb_width;
+            mb_intra_ub = 0;
+
+            if (mb_x != 0) {
+                mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
+            }
+            if (mb_y != 0) {
+                mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
+                if (mb_x != 0)
+                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
+                if (mb_x != (mb_width -1))
+                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
+            }
+            if (i < mb_width) {
+                if (i == 0)
+                    mb_intra_ub &= ~(INTRA_PRED_AVAIL_FLAG_AE);
+                mb_intra_ub &= ~(INTRA_PRED_AVAIL_FLAG_BCD_MASK);
+                if ((i == (mb_width - 1)) && slice_mb_x) {
+                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
+                }
+            }
+
+            if ((i == mb_width) && slice_mb_x) {
+                mb_intra_ub &= ~(INTRA_PRED_AVAIL_FLAG_D);
+            }
+
+            *command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
+            *command_ptr++ = kernel;
+            *command_ptr++ = 0;
+            *command_ptr++ = 0;
+            *command_ptr++ = 0;
+            *command_ptr++ = 0;
+
+            /*inline data */
+            *command_ptr++ = (mb_width << 16 | mb_y << 8 | mb_x);
+            *command_ptr++ = ( (1 << 16) | transform_8x8_mode_flag | (mb_intra_ub << 8));
+
+            *command_ptr++ = CMD_MEDIA_STATE_FLUSH;
+            *command_ptr++ = 0;
+            i += 1;
+        }
+    }
+
+    *command_ptr++ = MI_BATCH_BUFFER_END;
+    *command_ptr++ = 0;
+
+    dri_bo_unmap(vme_context->vme_batchbuffer.bo);
+}
+
+static void gen9_vme_hevc_pipeline_programing(VADriverContextP ctx,
+                                         struct encode_state *encode_state,
+                                         struct intel_encoder_context *encoder_context)
+{
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+    struct intel_batchbuffer *batch = encoder_context->base.batch;
+    VAEncSliceParameterBufferHEVC *pSliceParameter = (VAEncSliceParameterBufferHEVC *)encode_state->slice_params_ext[0]->buffer;
+    VAEncSequenceParameterBufferHEVC *pSequenceParameter = (VAEncSequenceParameterBufferHEVC *)encode_state->seq_param_ext->buffer;
+    int width_in_mbs = (pSequenceParameter->pic_width_in_luma_samples + 15)/16;
+    int height_in_mbs = (pSequenceParameter->pic_height_in_luma_samples + 15)/16;
+    int kernel_shader;
+    bool allow_hwscore = true;
+    int s;
+
+    int log2_cu_size = pSequenceParameter->log2_min_luma_coding_block_size_minus3 + 3;
+    int log2_ctb_size = pSequenceParameter->log2_diff_max_min_luma_coding_block_size + log2_cu_size;
+
+    int ctb_size = 1 << log2_ctb_size;
+    int num_mb_in_ctb = (ctb_size + 15)/16;
+    int transform_8x8_mode_flag = 1;
+    num_mb_in_ctb = num_mb_in_ctb * num_mb_in_ctb;
+
+    for (s = 0; s < encode_state->num_slice_params_ext; s++) {
+        pSliceParameter = (VAEncSliceParameterBufferHEVC *)encode_state->slice_params_ext[s]->buffer;
+        int slice_mb_begin = pSliceParameter->slice_segment_address * num_mb_in_ctb;
+        if ((slice_mb_begin % width_in_mbs)) {
+            allow_hwscore = false;
+            break;
+        }
+    }
+
+    if (pSliceParameter->slice_type == SLICE_TYPE_I) {
+        kernel_shader = VME_INTRA_SHADER;
+    } else if (pSliceParameter->slice_type == SLICE_TYPE_P) {
+        kernel_shader = VME_INTER_SHADER;
+    } else {
+        kernel_shader = VME_BINTER_SHADER;
+        if (!allow_hwscore)
+            kernel_shader = VME_INTER_SHADER;
+    }
+    if (allow_hwscore)
+        gen9wa_vme_hevc_walker_fill_vme_batchbuffer(ctx,
+                                               encode_state,
+                                               width_in_mbs, height_in_mbs,
+                                               kernel_shader,
+                                               transform_8x8_mode_flag,
+                                               encoder_context);
+    else
+        gen9_vme_hevc_fill_vme_batchbuffer(ctx,
+                                      encode_state,
+                                      width_in_mbs, height_in_mbs,
+                                      kernel_shader,
+                                      transform_8x8_mode_flag,
+                                      encoder_context);
+
+    intel_batchbuffer_start_atomic(batch, 0x1000);
+    gen9_gpe_pipeline_setup(ctx, &vme_context->gpe_context, batch);
+    BEGIN_BATCH(batch, 3);
+    OUT_BATCH(batch, MI_BATCH_BUFFER_START | (1 << 8) | (1 << 0));
+    OUT_RELOC(batch,
+              vme_context->vme_batchbuffer.bo,
+              I915_GEM_DOMAIN_COMMAND, 0,
+              0);
+    OUT_BATCH(batch, 0);
+    ADVANCE_BATCH(batch);
+
+    gen9_gpe_pipeline_end(ctx, &vme_context->gpe_context, batch);
+
+    intel_batchbuffer_end_atomic(batch);
+}
+
+static VAStatus gen9_vme_hevc_prepare(VADriverContextP ctx,
+                                 struct encode_state *encode_state,
+                                 struct intel_encoder_context *encoder_context)
+{
+    VAStatus vaStatus = VA_STATUS_SUCCESS;
+    VAEncSliceParameterBufferHEVC *pSliceParameter = (VAEncSliceParameterBufferHEVC *)encode_state->slice_params_ext[0]->buffer;
+    int is_intra = pSliceParameter->slice_type == SLICE_TYPE_I;
+    VAEncSequenceParameterBufferHEVC *pSequenceParameter = (VAEncSequenceParameterBufferHEVC *)encode_state->seq_param_ext->buffer;
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+
+    /* here use the avc level for hevc vme */
+    if (!vme_context->hevc_level ||
+        (vme_context->hevc_level != pSequenceParameter->general_level_idc)) {
+        vme_context->hevc_level = pSequenceParameter->general_level_idc;
+    }
+
+    intel_vme_hevc_update_mbmv_cost(ctx, encode_state, encoder_context);
+
+    /* Set up all the memory objects */
+    gen9_vme_hevc_surface_setup(ctx, encode_state, is_intra, encoder_context);
+    gen9_vme_interface_setup(ctx, encode_state, encoder_context);
+    //gen9_vme_vme_state_setup(ctx, encode_state, is_intra, encoder_context);
+    gen9_vme_constant_setup(ctx, encode_state, encoder_context);
+
+    /* Programming the media pipeline */
+    gen9_vme_hevc_pipeline_programing(ctx, encode_state, encoder_context);
+
+    return vaStatus;
+}
+
+
+static VAStatus
+gen9_vme_hevc_pipeline(VADriverContextP ctx,
+                  VAProfile profile,
+                  struct encode_state *encode_state,
+                  struct intel_encoder_context *encoder_context)
+{
+    gen9_vme_media_init(ctx, encoder_context);
+    gen9_vme_hevc_prepare(ctx, encode_state, encoder_context);
+    gen9_vme_run(ctx, encode_state, encoder_context);
+    gen9_vme_stop(ctx, encode_state, encoder_context);
+
+    return VA_STATUS_SUCCESS;
+}
+
+
 static void
 gen9_vme_context_destroy(void *context)
 {
@@ -1147,8 +1625,8 @@ gen9_vme_context_destroy(void *context)
     vme_context->vme_batchbuffer.bo = NULL;
 
     if (vme_context->vme_state_message) {
-	free(vme_context->vme_state_message);
-	vme_context->vme_state_message = NULL;
+        free(vme_context->vme_state_message);
+        vme_context->vme_state_message = NULL;
     }
 
     free(vme_context);
@@ -1172,6 +1650,12 @@ Bool gen9_vme_context_init(VADriverContextP ctx, struct intel_encoder_context *e
         vme_kernel_list = gen9_vme_mpeg2_kernels;
         encoder_context->vme_pipeline = gen9_vme_mpeg2_pipeline;
         i965_kernel_num = sizeof(gen9_vme_mpeg2_kernels) / sizeof(struct i965_kernel);
+        break;
+
+   case CODEC_HEVC:
+        vme_kernel_list = gen9_vme_hevc_kernels;
+        encoder_context->vme_pipeline = gen9_vme_hevc_pipeline;
+        i965_kernel_num = sizeof(gen9_vme_hevc_kernels) / sizeof(struct i965_kernel);
 
         break;
 
-- 
1.9.1



More information about the Libva mailing list