[Libva] [PATCH v2 08/12] VP8 HWEnc: Add BSW VP8 HWEnc support

Zhong Li zhong.li at intel.com
Tue Jan 13 21:03:39 PST 2015


Signed-off-by: Zhong Li <zhong.li at intel.com>
---
 src/gen8_mfc.c                           | 911 ++++++++++++++++++++++++++++++-
 src/gen8_vme.c                           | 264 +++++++--
 src/i965_device_info.c                   |   1 +
 src/shaders/vme/Makefile.am              |   4 +-
 src/shaders/vme/vp8_inter_frame_gen8.asm | 739 +++++++++++++++++++++++++
 src/shaders/vme/vp8_inter_frame_gen8.g8a |   2 +
 src/shaders/vme/vp8_inter_frame_gen8.g8b | 299 ++++++++++
 src/shaders/vme/vp8_intra_frame_gen8.asm | 200 +++++++
 src/shaders/vme/vp8_intra_frame_gen8.g8a |   2 +
 src/shaders/vme/vp8_intra_frame_gen8.g8b |  73 +++
 src/vp8_probs.h                          |  16 +-
 11 files changed, 2439 insertions(+), 72 deletions(-)
 create mode 100644 src/shaders/vme/vp8_inter_frame_gen8.asm
 create mode 100644 src/shaders/vme/vp8_inter_frame_gen8.g8a
 create mode 100644 src/shaders/vme/vp8_inter_frame_gen8.g8b
 create mode 100644 src/shaders/vme/vp8_intra_frame_gen8.asm
 create mode 100644 src/shaders/vme/vp8_intra_frame_gen8.g8a
 create mode 100644 src/shaders/vme/vp8_intra_frame_gen8.g8b

diff --git a/src/gen8_mfc.c b/src/gen8_mfc.c
index b50616d..9a227ac 100644
--- a/src/gen8_mfc.c
+++ b/src/gen8_mfc.c
@@ -43,6 +43,7 @@
 #include "gen6_vme.h"
 #include "intel_media.h"
 #include <va/va_enc_jpeg.h>
+#include "vp8_probs.h"
 
 #define SURFACE_STATE_PADDED_SIZE               SURFACE_STATE_PADDED_SIZE_GEN8
 #define SURFACE_STATE_OFFSET(index)             (SURFACE_STATE_PADDED_SIZE * index)
@@ -131,6 +132,7 @@ static struct i965_kernel gen8_mfc_kernels[] = {
 #define		INTER_16X8		0x01
 #define		INTER_8X16		0x02
 #define		SUBMB_SHAPE_MASK	0x00FF00
+#define		INTER_16X16		0x00
 
 #define		INTER_MV8		(4 << 20)
 #define		INTER_MV32		(6 << 20)
@@ -146,7 +148,8 @@ gen8_mfc_pipe_mode_select(VADriverContextP ctx,
 
     assert(standard_select == MFX_FORMAT_MPEG2 ||
            standard_select == MFX_FORMAT_AVC   ||
-           standard_select == MFX_FORMAT_JPEG);
+           standard_select == MFX_FORMAT_JPEG  ||
+           standard_select == MFX_FORMAT_VP8);
 
     BEGIN_BCS_BATCH(batch, 5);
 
@@ -157,6 +160,7 @@ gen8_mfc_pipe_mode_select(VADriverContextP ctx,
                   (0 << 10) | /* Stream-Out Enable */
                   ((!!mfc_context->post_deblocking_output.bo) << 9)  | /* Post Deblocking Output */
                   ((!!mfc_context->pre_deblocking_output.bo) << 8)  | /* Pre Deblocking Output */
+                  (0 << 6)  | /* frame statistics stream-out enable*/
                   (0 << 5)  | /* not in stitch mode */
                   (1 << 4)  | /* encoding mode */
                   (standard_select << 0));  /* standard select: avc or mpeg2 or jpeg*/
@@ -221,9 +225,18 @@ gen8_mfc_ind_obj_base_addr_state(VADriverContextP ctx,
     OUT_BCS_BATCH(batch, 0);
     OUT_BCS_BATCH(batch, 0);
     OUT_BCS_BATCH(batch, 0);
+
     /* the DW4-5 is the MFX upper bound */
-    OUT_BCS_BATCH(batch, 0);
-    OUT_BCS_BATCH(batch, 0);
+    if (encoder_context->codec == CODEC_VP8) {
+        OUT_BCS_RELOC(batch,
+                mfc_context->mfc_indirect_pak_bse_object.bo,
+                I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                mfc_context->mfc_indirect_pak_bse_object.end_offset);
+        OUT_BCS_BATCH(batch, 0);
+    } else {
+        OUT_BCS_BATCH(batch, 0);
+        OUT_BCS_BATCH(batch, 0);
+    }
 
     if(encoder_context->codec != CODEC_JPEG) {
         vme_size = vme_context->vme_output.size_block * vme_context->vme_output.num_blocks;
@@ -3203,6 +3216,871 @@ gen8_mfc_jpeg_encode_picture(VADriverContextP ctx,
     return VA_STATUS_SUCCESS;
 }
 
+static void vp8_enc_state_init(struct gen6_mfc_context *mfc_context,
+                               VAEncPictureParameterBufferVP8 *pic_param,
+                               VAQMatrixBufferVP8 *q_matrix)
+{
+    /* Reset mfc_context->vp8_state for a new frame: header update positions, probabilities and the coefficient-probability buffer. */
+    int is_key_frame = !pic_param->pic_flags.bits.frame_type; /* frame_type == 0 is treated as a key frame */
+    unsigned char *coeff_probs_stream_in_buffer;
+    
+    mfc_context->vp8_state.frame_header_lf_update_pos = 0;
+    mfc_context->vp8_state.frame_header_qindex_update_pos = 0;
+    mfc_context->vp8_state.frame_header_token_update_pos = 0;
+    mfc_context->vp8_state.frame_header_bin_mv_upate_pos = 0;
+
+    mfc_context->vp8_state.prob_skip_false = 255; /* placeholder only: unconditionally overwritten from vp8_base_skip_false_prob below */
+    memset(mfc_context->vp8_state.mb_segment_tree_probs, 0, sizeof(mfc_context->vp8_state.mb_segment_tree_probs));
+    memcpy(mfc_context->vp8_state.mv_probs, vp8_default_mv_context, sizeof(mfc_context->vp8_state.mv_probs));
+    /* Mode probabilities come from the key-frame tables or from the non-key-frame tables. */
+    if (is_key_frame) {
+        memcpy(mfc_context->vp8_state.y_mode_probs, vp8_kf_ymode_prob, sizeof(mfc_context->vp8_state.y_mode_probs));
+        memcpy(mfc_context->vp8_state.uv_mode_probs, vp8_kf_uv_mode_prob, sizeof(mfc_context->vp8_state.uv_mode_probs));
+
+        mfc_context->vp8_state.prob_intra = 255;
+        mfc_context->vp8_state.prob_last = 128;
+        mfc_context->vp8_state.prob_gf = 128;
+    } else {
+        memcpy(mfc_context->vp8_state.y_mode_probs, vp8_ymode_prob, sizeof(mfc_context->vp8_state.y_mode_probs));
+        memcpy(mfc_context->vp8_state.uv_mode_probs, vp8_uv_mode_prob, sizeof(mfc_context->vp8_state.uv_mode_probs));
+
+        mfc_context->vp8_state.prob_intra = 63;
+        mfc_context->vp8_state.prob_last = 128;
+        mfc_context->vp8_state.prob_gf = 128;
+    }
+    /* NOTE(review): quantization_index[0] indexes the table unchecked - confirm it can never exceed the table size. */
+    mfc_context->vp8_state.prob_skip_false = vp8_base_skip_false_prob[q_matrix->quantization_index[0]];
+    /* Upload the default coefficient probabilities into the stream-in buffer object. */
+    dri_bo_map(mfc_context->vp8_state.coeff_probs_stream_in_bo, 1);
+    coeff_probs_stream_in_buffer = (unsigned char *)mfc_context->vp8_state.coeff_probs_stream_in_bo->virtual;
+    assert(coeff_probs_stream_in_buffer);
+    memcpy(coeff_probs_stream_in_buffer, vp8_default_coef_probs, sizeof(vp8_default_coef_probs));
+    dri_bo_unmap(mfc_context->vp8_state.coeff_probs_stream_in_bo);
+}
+
+static void vp8_enc_state_update(struct gen6_mfc_context *mfc_context,
+                                 VAQMatrixBufferVP8 *q_matrix)
+{
+    /* Placeholder hook called from gen8_mfc_vp8_pic_state(); the body is empty, so no state is changed yet. */
+    /* TODO: some other probabilities need to be updated here. */
+}
+
+extern void binarize_vp8_frame_header(VAEncSequenceParameterBufferVP8 *seq_param,
+                           VAEncPictureParameterBufferVP8 *pic_param,
+                           VAQMatrixBufferVP8 *q_matrix,
+                           struct gen6_mfc_context *mfc_context); /* declaration only; the definition is not in this part of the file */
+
+static void vp8_enc_frame_header_binarize(struct encode_state *encode_state, 
+                                          struct gen6_mfc_context *mfc_context)
+{
+    VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
+    VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
+    VAQMatrixBufferVP8 *q_matrix = (VAQMatrixBufferVP8 *)encode_state->q_matrix->buffer;
+    unsigned char *frame_header_buffer;
+
+    binarize_vp8_frame_header(seq_param, pic_param, q_matrix, mfc_context); /* presumably fills vp8_state.vp8_frame_header and frame_header_bit_count - confirm */
+    /* Copy the binarized header, rounded up to whole bytes, into frame_header_bo. */
+    dri_bo_map(mfc_context->vp8_state.frame_header_bo, 1);
+    frame_header_buffer = (unsigned char *)mfc_context->vp8_state.frame_header_bo->virtual;
+    assert(frame_header_buffer);
+    memcpy(frame_header_buffer, mfc_context->vp8_state.vp8_frame_header, (mfc_context->vp8_state.frame_header_bit_count + 7) / 8);
+    dri_bo_unmap(mfc_context->vp8_state.frame_header_bo);
+}
+
+#define MAX_VP8_FRAME_HEADER_SIZE              0x2000 /* allocation size of vp8_state.frame_header_bo */
+#define VP8_TOKEN_STATISTICS_BUFFER_SIZE       0x2000 /* allocation size of vp8_state.token_statistics_bo */
+
+static void gen8_mfc_vp8_init(VADriverContextP ctx,
+                          struct encode_state *encode_state,
+                          struct intel_encoder_context *encoder_context)
+{
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    dri_bo *bo;
+    int i;
+    int width_in_mbs = 0;
+    int height_in_mbs = 0;
+    int slice_batchbuffer_size;
+    /* Per-frame MFC setup for VP8: drop the previous buffer references, reallocate scratch and VP8 buffers, reset VP8 state and binarize the frame header. */
+    VAEncSequenceParameterBufferVP8 *pSequenceParameter = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
+    VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
+    VAQMatrixBufferVP8 *q_matrix = (VAQMatrixBufferVP8 *)encode_state->q_matrix->buffer;
+    /* Frame size in 16x16 macroblocks. The width must be derived from frame_width: every buffer below is sized from width_in_mbs. */
+    width_in_mbs = ALIGN(pSequenceParameter->frame_width, 16) / 16;
+    height_in_mbs = ALIGN(pSequenceParameter->frame_height, 16) / 16;
+
+    slice_batchbuffer_size = 64 * width_in_mbs * height_in_mbs + 4096 +
+        (SLICE_HEADER + SLICE_TAIL);
+
+    /*Encode common setup for MFC*/
+    dri_bo_unreference(mfc_context->post_deblocking_output.bo);
+    mfc_context->post_deblocking_output.bo = NULL;
+
+    dri_bo_unreference(mfc_context->pre_deblocking_output.bo);
+    mfc_context->pre_deblocking_output.bo = NULL;
+
+    dri_bo_unreference(mfc_context->uncompressed_picture_source.bo);
+    mfc_context->uncompressed_picture_source.bo = NULL;
+
+    dri_bo_unreference(mfc_context->mfc_indirect_pak_bse_object.bo);
+    mfc_context->mfc_indirect_pak_bse_object.bo = NULL;
+
+    for (i = 0; i < NUM_MFC_DMV_BUFFERS; i++){
+        if ( mfc_context->direct_mv_buffers[i].bo != NULL)
+            dri_bo_unreference(mfc_context->direct_mv_buffers[i].bo);
+        mfc_context->direct_mv_buffers[i].bo = NULL;
+    }
+
+    for (i = 0; i < MAX_MFC_REFERENCE_SURFACES; i++){
+        if (mfc_context->reference_surfaces[i].bo != NULL)
+            dri_bo_unreference(mfc_context->reference_surfaces[i].bo);
+        mfc_context->reference_surfaces[i].bo = NULL;
+    }
+
+    dri_bo_unreference(mfc_context->intra_row_store_scratch_buffer.bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr,
+                      "Buffer",
+                      width_in_mbs * 64,
+                      64);
+    assert(bo);
+    mfc_context->intra_row_store_scratch_buffer.bo = bo;
+
+    dri_bo_unreference(mfc_context->macroblock_status_buffer.bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr,
+                      "Buffer",
+                      width_in_mbs * height_in_mbs * 16,
+                      64);
+    assert(bo);
+    mfc_context->macroblock_status_buffer.bo = bo;
+
+    dri_bo_unreference(mfc_context->deblocking_filter_row_store_scratch_buffer.bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr,
+                      "Buffer",
+                      4 * width_in_mbs * 64,  /* 4 * width_in_mbs * 64 */
+                      64);
+    assert(bo);
+    mfc_context->deblocking_filter_row_store_scratch_buffer.bo = bo;
+
+    dri_bo_unreference(mfc_context->bsd_mpc_row_store_scratch_buffer.bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr,
+                      "Buffer",
+                      2 * width_in_mbs * 64, /* 2 * width_in_mbs * 64 */
+                      0x1000);
+    assert(bo);
+    mfc_context->bsd_mpc_row_store_scratch_buffer.bo = bo;
+
+    dri_bo_unreference(mfc_context->mfc_batchbuffer_surface.bo);
+    mfc_context->mfc_batchbuffer_surface.bo = NULL;
+
+    dri_bo_unreference(mfc_context->aux_batchbuffer_surface.bo);
+    mfc_context->aux_batchbuffer_surface.bo = NULL;
+
+    if (mfc_context->aux_batchbuffer)
+        intel_batchbuffer_free(mfc_context->aux_batchbuffer);
+
+    mfc_context->aux_batchbuffer = intel_batchbuffer_new(&i965->intel, I915_EXEC_BSD, slice_batchbuffer_size);
+    mfc_context->aux_batchbuffer_surface.bo = mfc_context->aux_batchbuffer->buffer;
+    dri_bo_reference(mfc_context->aux_batchbuffer_surface.bo);
+    mfc_context->aux_batchbuffer_surface.pitch = 16;
+    mfc_context->aux_batchbuffer_surface.num_blocks = mfc_context->aux_batchbuffer->size / 16;
+    mfc_context->aux_batchbuffer_surface.size_block = 16;
+
+    i965_gpe_context_init(ctx, &mfc_context->gpe_context);
+
+    /* alloc vp8 encoding buffers*/
+    dri_bo_unreference(mfc_context->vp8_state.frame_header_bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr,
+                      "Buffer",
+                      MAX_VP8_FRAME_HEADER_SIZE,
+                      0x1000);
+    assert(bo);
+    mfc_context->vp8_state.frame_header_bo = bo;
+
+    mfc_context->vp8_state.intermediate_buffer_max_size = width_in_mbs * height_in_mbs * 256 * 9; /* nine equal slices of 256 bytes per MB */
+    for(i = 0; i < 8; i++) {
+        mfc_context->vp8_state.intermediate_partition_offset[i] = width_in_mbs * height_in_mbs * 256 * (i + 1);
+    }
+    dri_bo_unreference(mfc_context->vp8_state.intermediate_bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr,
+                      "Buffer",
+                      mfc_context->vp8_state.intermediate_buffer_max_size,
+                      0x1000);
+    assert(bo);
+    mfc_context->vp8_state.intermediate_bo = bo;
+
+    dri_bo_unreference(mfc_context->vp8_state.stream_out_bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr,
+                      "Buffer",
+                      width_in_mbs * height_in_mbs * 16,
+                      0x1000);
+    assert(bo);
+    mfc_context->vp8_state.stream_out_bo = bo;
+
+    dri_bo_unreference(mfc_context->vp8_state.coeff_probs_stream_in_bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr,
+                      "Buffer",
+                      sizeof(vp8_default_coef_probs),
+                      0x1000);
+    assert(bo);
+    mfc_context->vp8_state.coeff_probs_stream_in_bo = bo;
+
+    dri_bo_unreference(mfc_context->vp8_state.token_statistics_bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr,
+                      "Buffer",
+                      VP8_TOKEN_STATISTICS_BUFFER_SIZE,
+                      0x1000);
+    assert(bo);
+    mfc_context->vp8_state.token_statistics_bo = bo;
+
+    dri_bo_unreference(mfc_context->vp8_state.mpc_row_store_bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr,
+                      "Buffer",
+                      width_in_mbs * 16 * 64,
+                      0x1000);
+    assert(bo);
+    mfc_context->vp8_state.mpc_row_store_bo = bo;
+    /* The VP8 state reset must follow the allocations above: it maps coeff_probs_stream_in_bo and frame_header_bo. */
+    vp8_enc_state_init(mfc_context, pic_param, q_matrix);
+    vp8_enc_frame_header_binarize(encode_state, mfc_context);
+}
+
+static VAStatus
+intel_mfc_vp8_prepare(VADriverContextP ctx,
+                        struct encode_state *encode_state,
+                        struct intel_encoder_context *encoder_context)
+{
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    struct object_surface *obj_surface;
+    struct object_buffer *obj_buffer;
+    struct i965_coded_buffer_segment *coded_buffer_segment;
+    VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
+    VAStatus vaStatus = VA_STATUS_SUCCESS;
+    dri_bo *bo;
+    int i;
+    /* Bind this frame's surfaces and coded buffer into mfc_context, taking a reference on each bo; always returns VA_STATUS_SUCCESS. */
+    /* reconstructed surface */
+    obj_surface = encode_state->reconstructed_object;
+    i965_check_alloc_surface_bo(ctx, obj_surface, 1, VA_FOURCC('N','V','1','2'), SUBSAMPLE_YUV420);
+    if (pic_param->loop_filter_level[0] == 0) { /* level 0: reconstructed picture goes to the pre-deblocking output */
+        mfc_context->pre_deblocking_output.bo = obj_surface->bo;
+        dri_bo_reference(mfc_context->pre_deblocking_output.bo);
+    } else {
+        mfc_context->post_deblocking_output.bo = obj_surface->bo;
+        dri_bo_reference(mfc_context->post_deblocking_output.bo);
+    }
+
+    mfc_context->surface_state.width = obj_surface->orig_width;
+    mfc_context->surface_state.height = obj_surface->orig_height;
+    mfc_context->surface_state.w_pitch = obj_surface->width;
+    mfc_context->surface_state.h_pitch = obj_surface->height;
+
+    /* set vp8 reference frames */
+    for (i = 0; i < ARRAY_ELEMS(mfc_context->reference_surfaces); i++) {
+        obj_surface = encode_state->reference_objects[i]; /* NOTE(review): assumes reference_objects has at least as many entries as reference_surfaces - confirm */
+
+        if (obj_surface && obj_surface->bo) {
+            mfc_context->reference_surfaces[i].bo = obj_surface->bo;
+            dri_bo_reference(mfc_context->reference_surfaces[i].bo);
+        } else {
+            mfc_context->reference_surfaces[i].bo = NULL;
+        }
+    }
+
+    /* input YUV surface */
+    obj_surface = encode_state->input_yuv_object; /* NOTE(review): dereferenced without a NULL check - presumably validated by the caller */
+    mfc_context->uncompressed_picture_source.bo = obj_surface->bo;
+    dri_bo_reference(mfc_context->uncompressed_picture_source.bo);
+
+    /* coded buffer */
+    obj_buffer = encode_state->coded_buf_object;
+    bo = obj_buffer->buffer_store->bo;
+    mfc_context->mfc_indirect_pak_bse_object.bo = bo;
+    mfc_context->mfc_indirect_pak_bse_object.offset = I965_CODEDBUFFER_HEADER_SIZE;
+    mfc_context->mfc_indirect_pak_bse_object.end_offset = ALIGN(obj_buffer->size_element - 0x1000, 0x1000);
+    dri_bo_reference(mfc_context->mfc_indirect_pak_bse_object.bo);
+    /* The final VP8 frame is written into the same coded buffer, right after the coded-buffer header. */
+    dri_bo_unreference(mfc_context->vp8_state.final_frame_bo);
+    mfc_context->vp8_state.final_frame_bo = mfc_context->mfc_indirect_pak_bse_object.bo;
+    mfc_context->vp8_state.final_frame_byte_offset = I965_CODEDBUFFER_HEADER_SIZE;
+    dri_bo_reference(mfc_context->vp8_state.final_frame_bo);
+
+    /* set the internal flag to 0 to indicate the coded size is unknown */
+    dri_bo_map(bo, 1);
+    coded_buffer_segment = (struct i965_coded_buffer_segment *)bo->virtual; /* NOTE(review): the mapping result is not checked before use */
+    coded_buffer_segment->mapped = 0;
+    coded_buffer_segment->codec = encoder_context->codec;
+    dri_bo_unmap(bo);
+
+    return vaStatus;
+}
+
+static void
+gen8_mfc_vp8_encoder_cfg(VADriverContextP ctx, 
+                         struct encode_state *encode_state,
+                         struct intel_encoder_context *encoder_context)
+{
+    struct intel_batchbuffer *batch = encoder_context->base.batch;
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
+    VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
+    /* Emit the 30-DWORD MFX_VP8_ENCODER_CFG command into the encoder's batch buffer. */
+    BEGIN_BCS_BATCH(batch, 30);
+    OUT_BCS_BATCH(batch, MFX_VP8_ENCODER_CFG | (30 - 2)); /* SKL should be 31-2 ? */
+
+    OUT_BCS_BATCH(batch,
+                  0 << 9 | /* compressed bitstream output disable */
+                  1 << 7 | /* disable per-segment delta qindex and loop filter in RC */
+                  1 << 6 | /* RC initial pass */
+                  0 << 4 | /* update segment feature data flag */
+                  1 << 3 | /* bitstream statistics output enable */
+                  1 << 2 | /* token statistics output enable */
+                  0 << 1 | /* final bitstream output disable */
+                  0 << 0); /*DW1*/
+    
+    OUT_BCS_BATCH(batch, 0); /*DW2*/
+
+    OUT_BCS_BATCH(batch, 
+                  0xfff << 16 | /* max intra mb bit count limit */
+                  0xfff << 0  /* max inter mb bit count limit */
+                  ); /*DW3*/
+
+    OUT_BCS_BATCH(batch, 0); /*DW4*/
+    OUT_BCS_BATCH(batch, 0); /*DW5*/
+    OUT_BCS_BATCH(batch, 0); /*DW6*/
+    OUT_BCS_BATCH(batch, 0); /*DW7*/
+    OUT_BCS_BATCH(batch, 0); /*DW8*/
+    OUT_BCS_BATCH(batch, 0); /*DW9*/
+    OUT_BCS_BATCH(batch, 0); /*DW10*/
+    OUT_BCS_BATCH(batch, 0); /*DW11*/
+    OUT_BCS_BATCH(batch, 0); /*DW12*/
+    OUT_BCS_BATCH(batch, 0); /*DW13*/
+    OUT_BCS_BATCH(batch, 0); /*DW14*/
+    OUT_BCS_BATCH(batch, 0); /*DW15*/
+    OUT_BCS_BATCH(batch, 0); /*DW16*/
+    OUT_BCS_BATCH(batch, 0); /*DW17*/
+    OUT_BCS_BATCH(batch, 0); /*DW18*/
+    OUT_BCS_BATCH(batch, 0); /*DW19*/
+    OUT_BCS_BATCH(batch, 0); /*DW20*/
+    OUT_BCS_BATCH(batch, 0); /*DW21*/
+
+    OUT_BCS_BATCH(batch, 
+                 pic_param->pic_flags.bits.show_frame << 23 |
+                 pic_param->pic_flags.bits.version << 20
+                 ); /*DW22*/
+
+    OUT_BCS_BATCH(batch,
+                 (seq_param->frame_height_scale << 14 | seq_param->frame_height) << 16 |
+                 (seq_param->frame_width_scale << 14 | seq_param->frame_width) << 0
+                 ); /*DW23: scale in bits 15:14 and size in bits 13:0 of each half-word */
+
+    /*DW24*/
+    OUT_BCS_BATCH(batch, mfc_context->vp8_state.frame_header_bit_count); /* frame header bit count */
+
+    /*DW25*/
+    OUT_BCS_BATCH(batch, mfc_context->vp8_state.frame_header_qindex_update_pos); /* frame header bin buffer qindex update pointer */
+
+    /*DW26*/
+    OUT_BCS_BATCH(batch, mfc_context->vp8_state.frame_header_lf_update_pos); /* frame header bin buffer loop filter update pointer*/
+
+    /*DW27*/
+    OUT_BCS_BATCH(batch, mfc_context->vp8_state.frame_header_token_update_pos); /* frame header bin buffer token update pointer */
+
+    /*DW28*/
+    OUT_BCS_BATCH(batch, mfc_context->vp8_state.frame_header_bin_mv_upate_pos); /*frame header bin buffer mv update pointer */
+
+    /*DW29*/
+    OUT_BCS_BATCH(batch, 0);
+
+    ADVANCE_BCS_BATCH(batch);
+}
+
+static void
+gen8_mfc_vp8_pic_state(VADriverContextP ctx,
+                       struct encode_state *encode_state,
+                       struct intel_encoder_context *encoder_context)
+{
+    struct intel_batchbuffer *batch = encoder_context->base.batch;
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
+    VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
+    VAQMatrixBufferVP8 *q_matrix = (VAQMatrixBufferVP8 *)encode_state->q_matrix->buffer;
+    int i, j, log2num;
+    /* Emit the 38-DWORD MFX_VP8_PIC_STATE command built from the picture parameters, the quantization matrix and vp8_state. */
+    assert(pic_param->pic_flags.bits.num_token_partitions > 0);
+    assert(pic_param->pic_flags.bits.num_token_partitions < 9);
+    log2num = (int)log2(pic_param->pic_flags.bits.num_token_partitions); /* NOTE(review): floating-point log2(); needs <math.h> - confirm it is included */
+
+    /*update mode and token probs*/
+    vp8_enc_state_update(mfc_context, q_matrix);
+ 
+    BEGIN_BCS_BATCH(batch, 38);
+    OUT_BCS_BATCH(batch, MFX_VP8_PIC_STATE | (38 - 2));
+    OUT_BCS_BATCH(batch,
+                  (ALIGN(seq_param->frame_height, 16) / 16 - 1) << 16 |
+                  (ALIGN(seq_param->frame_width, 16) / 16 - 1) << 0); /* DW1: frame height/width in macroblocks, minus one */
+ 
+    OUT_BCS_BATCH(batch,
+                  log2num << 24 |
+                  pic_param->sharpness_level << 16 |
+                  pic_param->pic_flags.bits.sign_bias_alternate << 13 |
+                  pic_param->pic_flags.bits.sign_bias_golden << 12 |
+                  pic_param->pic_flags.bits.loop_filter_adj_enable << 11 |
+                  pic_param->pic_flags.bits.mb_no_coeff_skip << 10 |
+                  pic_param->pic_flags.bits.update_mb_segmentation_map << 9 |
+                  pic_param->pic_flags.bits.segmentation_enabled << 8 |
+                  !pic_param->pic_flags.bits.frame_type << 5 | /* 0 indicate an intra frame in VP8 stream/spec($9.1)*/
+                  (pic_param->pic_flags.bits.version / 2) << 4 |
+                  (pic_param->pic_flags.bits.version == 3) << 1 | /* full pixel mode for version 3 */
+                  !!pic_param->pic_flags.bits.version << 0); /* version 0: 6 tap */
+ 
+    OUT_BCS_BATCH(batch,
+                  pic_param->loop_filter_level[3] << 24 |
+                  pic_param->loop_filter_level[2] << 16 |
+                  pic_param->loop_filter_level[1] <<  8 |
+                  pic_param->loop_filter_level[0] <<  0);
+
+    OUT_BCS_BATCH(batch,
+                  q_matrix->quantization_index[3] << 24 |
+                  q_matrix->quantization_index[2] << 16 |
+                  q_matrix->quantization_index[1] <<  8 |
+                  q_matrix->quantization_index[0] << 0);
+    /* Each quantization index delta is packed as a sign bit (bit 15 of the 16-bit value) placed 4 bits above its magnitude. */
+    OUT_BCS_BATCH(batch,
+                 ((unsigned short)(q_matrix->quantization_index_delta[4]) >> 15) << 28 | 
+                 abs(q_matrix->quantization_index_delta[4]) << 24 |
+                 ((unsigned short)(q_matrix->quantization_index_delta[3]) >> 15) << 20 | 
+                 abs(q_matrix->quantization_index_delta[3]) << 16 |
+                 ((unsigned short)(q_matrix->quantization_index_delta[2]) >> 15) << 12 | 
+                 abs(q_matrix->quantization_index_delta[2]) << 8 |
+                 ((unsigned short)(q_matrix->quantization_index_delta[1]) >> 15) << 4 | 
+                 abs(q_matrix->quantization_index_delta[1]) << 0);
+
+    OUT_BCS_BATCH(batch,
+                 ((unsigned short)(q_matrix->quantization_index_delta[0]) >> 15) << 4 |
+                 abs(q_matrix->quantization_index_delta[0]) << 0);
+    
+    OUT_BCS_BATCH(batch,
+                 pic_param->clamp_qindex_high << 8 |
+                 pic_param->clamp_qindex_low << 0);
+
+    for (i = 8; i < 19; i++) { /* DW8-DW18: eleven DWORDs of all ones */
+         OUT_BCS_BATCH(batch, 0xffffffff);
+    }
+
+    OUT_BCS_BATCH(batch,
+                  mfc_context->vp8_state.mb_segment_tree_probs[2] << 16 |
+                  mfc_context->vp8_state.mb_segment_tree_probs[1] <<  8 |
+                  mfc_context->vp8_state.mb_segment_tree_probs[0] <<  0);
+
+    OUT_BCS_BATCH(batch,
+                  mfc_context->vp8_state.prob_skip_false << 24 |
+                  mfc_context->vp8_state.prob_intra      << 16 |
+                  mfc_context->vp8_state.prob_last       <<  8 |
+                  mfc_context->vp8_state.prob_gf         <<  0);
+
+    OUT_BCS_BATCH(batch,
+                  mfc_context->vp8_state.y_mode_probs[3] << 24 |
+                  mfc_context->vp8_state.y_mode_probs[2] << 16 |
+                  mfc_context->vp8_state.y_mode_probs[1] <<  8 |
+                  mfc_context->vp8_state.y_mode_probs[0] <<  0);
+
+    OUT_BCS_BATCH(batch,
+                  mfc_context->vp8_state.uv_mode_probs[2] << 16 |
+                  mfc_context->vp8_state.uv_mode_probs[1] <<  8 |
+                  mfc_context->vp8_state.uv_mode_probs[0] <<  0);
+    
+    /* MV update value, DW23-DW32: four probabilities per DWORD, five DWORDs per row; index 19 is emitted as 0 (presumably each row has 19 entries - confirm) */
+    for (i = 0; i < 2; i++) {
+        for (j = 0; j < 20; j += 4) {
+            OUT_BCS_BATCH(batch,
+                          (j + 3 == 19 ? 0 : mfc_context->vp8_state.mv_probs[i][j + 3]) << 24 |
+                          mfc_context->vp8_state.mv_probs[i][j + 2] << 16 |
+                          mfc_context->vp8_state.mv_probs[i][j + 1] <<  8 |
+                          mfc_context->vp8_state.mv_probs[i][j + 0] <<  0);
+        }
+    }
+
+    OUT_BCS_BATCH(batch,
+                  (pic_param->ref_lf_delta[3] & 0x7f) << 24 |
+                  (pic_param->ref_lf_delta[2] & 0x7f) << 16 |
+                  (pic_param->ref_lf_delta[1] & 0x7f) <<  8 |
+                  (pic_param->ref_lf_delta[0] & 0x7f) <<  0);
+
+    OUT_BCS_BATCH(batch,
+                  (pic_param->mode_lf_delta[3] & 0x7f) << 24 |
+                  (pic_param->mode_lf_delta[2] & 0x7f) << 16 |
+                  (pic_param->mode_lf_delta[1] & 0x7f) <<  8 |
+                  (pic_param->mode_lf_delta[0] & 0x7f) <<  0);
+
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+
+    ADVANCE_BCS_BATCH(batch);
+}
+
+#define OUT_VP8_BUFFER(bo, offset) do { /* emits 3 DWs: bo relocation (or 0), then two zero DWs; uses the caller's local 'batch' */ \
+    if (bo)                                                             \
+        OUT_BCS_RELOC(batch, (bo),                                      \
+                      I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, \
+                      (offset));                                        \
+    else                                                                \
+        OUT_BCS_BATCH(batch, 0);                                        \
+    OUT_BCS_BATCH(batch, 0);                                            \
+    OUT_BCS_BATCH(batch, 0);                                            \
+} while (0) /* single statement: safe under an unbraced if/else */
+
+static void 
+gen8_mfc_vp8_bsp_buf_base_addr_state(VADriverContextP ctx, 
+                                     struct encode_state *encode_state,
+                                     struct intel_encoder_context *encoder_context)
+{
+    struct intel_batchbuffer *batch = encoder_context->base.batch;
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    /* Emit the 32-DWORD MFX_VP8_BSP_BUF_BASE_ADDR_STATE command; each OUT_VP8_BUFFER contributes three DWORDs. */
+    BEGIN_BCS_BATCH(batch, 32);
+    OUT_BCS_BATCH(batch, MFX_VP8_BSP_BUF_BASE_ADDR_STATE | (32 - 2));
+
+    OUT_VP8_BUFFER(mfc_context->vp8_state.frame_header_bo, 0);
+    /* Intermediate buffer: base address, then the eight partition offsets and the maximum size set up in gen8_mfc_vp8_init(). */
+    OUT_VP8_BUFFER(mfc_context->vp8_state.intermediate_bo, 0);
+    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[0]);
+    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[1]);
+    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[2]);
+    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[3]);
+    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[4]);
+    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[5]);
+    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[6]);
+    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[7]);
+    OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_buffer_max_size);
+    /* Final frame output starts just past the coded-buffer header. */
+    OUT_VP8_BUFFER(mfc_context->vp8_state.final_frame_bo, I965_CODEDBUFFER_HEADER_SIZE);
+    OUT_BCS_BATCH(batch, 0);
+
+    OUT_VP8_BUFFER(mfc_context->vp8_state.stream_out_bo, 0);
+    OUT_VP8_BUFFER(mfc_context->vp8_state.coeff_probs_stream_in_bo, 0);
+    OUT_VP8_BUFFER(mfc_context->vp8_state.token_statistics_bo, 0);
+    OUT_VP8_BUFFER(mfc_context->vp8_state.mpc_row_store_bo, 0);
+
+    ADVANCE_BCS_BATCH(batch);
+}
+
+static void
+gen8_mfc_vp8_pipeline_picture_programing(VADriverContextP ctx,
+                                           struct encode_state *encode_state,
+                                           struct intel_encoder_context *encoder_context)
+{
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    /* Emit the picture-level command sequence for a VP8 frame: generic MFX state first, then the VP8-specific commands. */
+    mfc_context->pipe_mode_select(ctx, MFX_FORMAT_VP8, encoder_context);
+    mfc_context->set_surface_state(ctx, encoder_context);
+    mfc_context->ind_obj_base_addr_state(ctx, encoder_context);
+    gen8_mfc_pipe_buf_addr_state(ctx, encoder_context);
+    gen8_mfc_bsp_buf_base_addr_state(ctx, encoder_context);
+    gen8_mfc_vp8_bsp_buf_base_addr_state(ctx, encode_state, encoder_context);
+    gen8_mfc_vp8_pic_state(ctx, encode_state,encoder_context);
+    gen8_mfc_vp8_encoder_cfg(ctx, encode_state, encoder_context);
+}
+
+static const unsigned char
+vp8_intra_mb_mode_map[VME_MB_INTRA_MODE_COUNT] = {
+    PAK_V_PRED,
+    PAK_H_PRED,
+    PAK_DC_PRED,
+    PAK_TM_PRED
+}; /* looked up with (vme_pred_mode & 0x3) in gen8_mfc_vp8_intra_mb_mode_map() to obtain the PAK mode value */
+
+static const unsigned char
+vp8_intra_block_mode_map[VME_B_INTRA_MODE_COUNT] = {
+    PAK_B_VE_PRED,
+    PAK_B_HE_PRED,
+    PAK_B_DC_PRED,
+    PAK_B_LD_PRED,
+    PAK_B_RD_PRED,
+    PAK_B_VR_PRED,
+    PAK_B_HD_PRED,
+    PAK_B_VL_PRED,
+    PAK_B_HU_PRED
+}; /* looked up with each 4-bit VME sub-block mode in gen8_mfc_vp8_intra_mb_mode_map() to obtain the PAK mode value */
+
+static int inline gen8_mfc_vp8_intra_mb_mode_map(unsigned int vme_pred_mode, int is_luma_4x4)
+{
+    /* Translate VME intra prediction mode(s) into the PAK encoding via the two lookup tables. */
+    unsigned int blk, vme_mode, packed = 0;
+    /* Whole-macroblock case: only the low two bits select the mode. */
+    if (!is_luma_4x4) {
+        return vp8_intra_mb_mode_map[vme_pred_mode & 0x3];
+    }
+
+    /* 4x4 case: the word carries eight 4-bit modes, translated nibble by nibble. */
+    for (blk = 0; blk < 8; blk++) {
+        vme_mode = (vme_pred_mode >> (4 * blk)) & 0xf;
+        assert(vme_mode < VME_B_INTRA_MODE_COUNT);
+        packed |= (unsigned int)vp8_intra_block_mode_map[vme_mode] << (4 * blk);
+    }
+
+    return packed;
+}
+static void
+gen8_mfc_vp8_pak_object_intra(VADriverContextP ctx, 
+                              struct intel_encoder_context *encoder_context,
+                              unsigned int *msg,
+                              int x, int y,
+                              struct intel_batchbuffer *batch)
+{
+    unsigned int vme_intra_mb_mode, vme_chroma_pred_mode;
+    unsigned int pak_intra_mb_mode, pak_chroma_pred_mode;
+    unsigned int vme_luma_pred_mode[2], pak_luma_pred_mode[2];
+    /* Emit one 7-DWORD MFX_VP8_PAK_OBJECT for an intra macroblock at (x, y), decoded from the VME output in msg[0..3]. */
+    if (batch == NULL)
+        batch = encoder_context->base.batch;
+
+    vme_intra_mb_mode = ((msg[0] & 0x30) >> 4);
+    assert((vme_intra_mb_mode == 0) || (vme_intra_mb_mode == 2)); //vp8 only support intra_16x16 and intra_4x4
+    pak_intra_mb_mode = (vme_intra_mb_mode >> 1); /* 0 -> 0 (16x16), 2 -> 1 (4x4) */
+
+    vme_luma_pred_mode[0] = msg[1];
+    vme_luma_pred_mode[1] = msg[2];
+    vme_chroma_pred_mode = msg[3] & 0x3;
+    /* pak_intra_mb_mode doubles as the is_luma_4x4 flag of the mapping helper. */
+    pak_luma_pred_mode[0] = gen8_mfc_vp8_intra_mb_mode_map(vme_luma_pred_mode[0], pak_intra_mb_mode);
+    pak_luma_pred_mode[1] = gen8_mfc_vp8_intra_mb_mode_map(vme_luma_pred_mode[1], pak_intra_mb_mode);
+    pak_chroma_pred_mode = gen8_mfc_vp8_intra_mb_mode_map(vme_chroma_pred_mode, 0);
+
+    BEGIN_BCS_BATCH(batch, 7);
+
+    OUT_BCS_BATCH(batch, MFX_VP8_PAK_OBJECT | (7 - 2));
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch,
+                  (0 << 20) |                    /* mv format: intra mb */
+                  (0 << 18) |                    /* Segment ID */
+                  (0 << 17) |                    /* disable coeff clamp */
+                  (1 << 13) |                    /* intra mb flag */
+                  (0 << 11) | 		         /* refer picture select: last frame */
+                  (pak_intra_mb_mode << 8) |     /* mb type */
+                  (pak_chroma_pred_mode << 4) |  /* mb uv mode */
+                  (0 << 2) |                     /* skip mb flag: disable */
+                  0);
+
+    OUT_BCS_BATCH(batch, (y << 16) | x);
+    OUT_BCS_BATCH(batch, pak_luma_pred_mode[0]);
+    OUT_BCS_BATCH(batch, pak_luma_pred_mode[1]);
+
+    ADVANCE_BCS_BATCH(batch);
+}
+
+static void
+gen8_mfc_vp8_pak_object_inter(VADriverContextP ctx, 
+                              struct intel_encoder_context *encoder_context,
+                              unsigned int *msg,
+                              int offset,
+                              int x, int y,
+                              struct intel_batchbuffer *batch)
+{
+    int i;
+    /* Emit one 7-dword MFX_VP8_PAK_OBJECT for an inter MB at (x, y). Side effect: overwrites msg[0..15] with the replicated motion vector. */
+    if (batch == NULL)
+        batch = encoder_context->base.batch;
+
+    /* only inter_16x16 is supported for now */
+    assert((msg[AVC_INTER_MSG_OFFSET] & INTER_MODE_MASK) == INTER_16X16);
+    /* For inter_16x16 all 16 MVs are identical. Copy the MV to the start of this MB's VME record
+     * so that the offset handed to the hardware is 64-byte aligned; bit 0 of each 16-bit half is cleared. */
+    msg[0] = (msg[AVC_INTER_MV_OFFSET/4] & 0xfffefffe);
+    for (i = 1; i < 16; i++) {
+        msg[i] = msg[0];
+    }
+    
+    BEGIN_BCS_BATCH(batch, 7);
+
+    OUT_BCS_BATCH(batch, MFX_VP8_PAK_OBJECT | (7 - 2));
+    OUT_BCS_BATCH(batch,
+                  (0 << 29) |           /* enable inline mv data: disable */
+                  64);                  /* presumably the indirect MV data size in bytes (16 dwords filled above) - TODO confirm */
+    OUT_BCS_BATCH(batch,
+                  offset);              /* offset of the record rewritten above, as supplied by the caller */
+    OUT_BCS_BATCH(batch,
+                  (4 << 20) |           /* mv format: inter */
+                  (0 << 18) |           /* Segment ID */
+                  (0 << 17) |           /* coeff clamp: disable */
+                  (0 << 13) |		/* intra mb flag: inter mb */
+                  (0 << 11) | 		/* refer picture select: last frame */
+                  (0 << 8) |            /* mb type: 16x16 */
+                  (0 << 4) |		/* mb uv mode: dc_pred */
+                  (0 << 2) |		/* skip mb flag: disable */
+                  0);
+
+    OUT_BCS_BATCH(batch, (y << 16) | x); /* MB position: y in the high half, x in the low half */
+
+    /* new mv; NOTE(review): the meaning of the constant 0x8 is not visible here - confirm against the PAK object layout */
+    OUT_BCS_BATCH(batch, 0x8);
+    OUT_BCS_BATCH(batch, 0x8);
+
+    ADVANCE_BCS_BATCH(batch);
+}
+
+static void
+gen8_mfc_vp8_pak_pipeline(VADriverContextP ctx,
+                          struct encode_state *encode_state,
+                          struct intel_encoder_context *encoder_context,
+                          struct intel_batchbuffer *slice_batch)
+{
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+    VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
+    VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
+    int width_in_mbs = ALIGN(seq_param->frame_width, 16) / 16;
+    int height_in_mbs = ALIGN(seq_param->frame_height, 16) / 16;
+    unsigned int *msg = NULL;
+    unsigned char *msg_ptr = NULL;
+    unsigned int i, offset, is_intra_frame;
+    /* Walk the VME output in raster order and emit one intra or inter PAK object per macroblock into slice_batch. */
+    is_intra_frame = !pic_param->pic_flags.bits.frame_type; /* frame_type == 0 is treated as an intra frame */
+
+    dri_bo_map(vme_context->vme_output.bo , 1); /* flag 1: presumably write-enable, needed because the inter path rewrites records in place; NOTE(review): result is not checked */
+    msg_ptr = (unsigned char *)vme_context->vme_output.bo->virtual;
+
+    for( i = 0; i < width_in_mbs * height_in_mbs; i++) {
+        int h_pos = i % width_in_mbs;
+        int v_pos = i / width_in_mbs;
+        msg = (unsigned int *) (msg_ptr + i * vme_context->vme_output.size_block); /* one size_block-byte record per MB */
+        
+        if (is_intra_frame) {
+            gen8_mfc_vp8_pak_object_intra(ctx,
+                    encoder_context,
+                    msg,
+                    h_pos, v_pos,
+                    slice_batch);
+        } else {
+            int inter_rdo, intra_rdo;
+            inter_rdo = msg[AVC_INTER_RDO_OFFSET] & AVC_RDO_MASK; /* both costs are read before the inter path overwrites the record */
+            intra_rdo = msg[AVC_INTRA_RDO_OFFSET] & AVC_RDO_MASK;
+
+            if (intra_rdo < inter_rdo) { /* intra only when strictly cheaper; ties go to inter */
+                gen8_mfc_vp8_pak_object_intra(ctx,
+                        encoder_context,
+                        msg,
+                        h_pos, v_pos,
+                        slice_batch);
+            } else {
+                offset = i * vme_context->vme_output.size_block; /* byte offset of this MB's record inside the VME output bo */
+                gen8_mfc_vp8_pak_object_inter(ctx,
+                        encoder_context,
+                        msg,
+                        offset,
+                        h_pos, v_pos,
+                        slice_batch);
+            }
+        }
+    }
+
+    dri_bo_unmap(vme_context->vme_output.bo);
+}
+
+/*
+ * Fill the aux batch buffer with the per-MB VP8 PAK object commands, terminate it and return its bo; the caller receives one reference and must drop it.
+ */
+static dri_bo *
+gen8_mfc_vp8_software_batchbuffer(VADriverContextP ctx,
+                                          struct encode_state *encode_state,
+                                          struct intel_encoder_context *encoder_context)
+{
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    struct intel_batchbuffer *batch;
+    dri_bo *batch_bo;
+
+    batch = mfc_context->aux_batchbuffer;
+    batch_bo = batch->buffer;
+
+    gen8_mfc_vp8_pak_pipeline(ctx, encode_state, encoder_context, batch);
+
+    intel_batchbuffer_align(batch, 8);
+
+    BEGIN_BCS_BATCH(batch, 2);
+    OUT_BCS_BATCH(batch, 0); /* one zero dword ahead of the terminator */
+    OUT_BCS_BATCH(batch, MI_BATCH_BUFFER_END);
+    ADVANCE_BCS_BATCH(batch);
+
+    dri_bo_reference(batch_bo); /* take our own reference before the batch wrapper is freed */
+    intel_batchbuffer_free(batch);
+    mfc_context->aux_batchbuffer = NULL; /* NOTE(review): the aux batch is consumed here; presumably recreated per frame elsewhere - confirm */
+
+    return batch_bo;
+}
+
+static void
+gen8_mfc_vp8_pipeline_programing(VADriverContextP ctx,
+                                   struct encode_state *encode_state,
+                                   struct intel_encoder_context *encoder_context)
+{
+    struct intel_batchbuffer *batch = encoder_context->base.batch;
+    dri_bo *slice_batch_bo;
+    /* Build the per-MB PAK batch, then program the main BCS batch: picture-level state followed by a jump into the per-MB batch. */
+    slice_batch_bo = gen8_mfc_vp8_software_batchbuffer(ctx, encode_state, encoder_context);
+
+    // begin programming
+    intel_batchbuffer_start_atomic_bcs(batch, 0x4000);
+    intel_batchbuffer_emit_mi_flush(batch);
+
+    // picture level programming
+    gen8_mfc_vp8_pipeline_picture_programing(ctx, encode_state, encoder_context);
+
+    BEGIN_BCS_BATCH(batch, 4);
+    OUT_BCS_BATCH(batch, MI_BATCH_BUFFER_START | (1 << 8) | (1 << 0)); /* start the per-MB batch built above */
+    OUT_BCS_RELOC(batch,
+                  slice_batch_bo,
+                  I915_GEM_DOMAIN_COMMAND, 0,
+                  0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+    ADVANCE_BCS_BATCH(batch);
+
+    // end programming
+    intel_batchbuffer_end_atomic(batch);
+
+    dri_bo_unreference(slice_batch_bo); /* drop the reference handed over by gen8_mfc_vp8_software_batchbuffer() */
+}
+
+static void gen8_mfc_calc_vp8_coded_buffer_size(VADriverContextP ctx,
+                          struct encode_state *encode_state,
+                          struct intel_encoder_context *encoder_context)
+{
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
+    unsigned char is_intra_frame = !pic_param->pic_flags.bits.frame_type;
+    unsigned int *vp8_encoding_status, first_partition_bytes, token_partition_bytes, vp8_coded_bytes;
+    /* Derive the coded frame size from the HW token statistics and store it in the coded buffer segment header. */
+    dri_bo_map(mfc_context->vp8_state.token_statistics_bo, 0);
+
+    vp8_encoding_status = (unsigned int *)mfc_context->vp8_state.token_statistics_bo->virtual;
+    first_partition_bytes = (*vp8_encoding_status + 7) / 8; /* dword 0, rounded up to whole bytes (presumably a bit count) */
+    token_partition_bytes = (*(unsigned int *)(vp8_encoding_status + 9) + 7) / 8; /* dword 9; NOTE(review): only one token partition is accounted for */
+    /* num_token_partitions is a log2 code, so the size table has ((1 << n) - 1) 3-byte entries; the previous (n - 1) * 3 underflowed to -3 for n == 0. */
+    /* coded_bytes = first partition + token partition + uncompressed data chunk (3 bytes, 7 more on intra frames) + partition-size table */
+    vp8_coded_bytes = first_partition_bytes + token_partition_bytes + (3 + 7 * !!is_intra_frame) + ((1u << pic_param->pic_flags.bits.num_token_partitions) - 1) * 3;
+
+    dri_bo_unmap(mfc_context->vp8_state.token_statistics_bo);
+
+    dri_bo_map(mfc_context->vp8_state.final_frame_bo, 1); /* write-enabled mapping: the segment header is updated below */
+    struct i965_coded_buffer_segment *coded_buffer_segment = (struct i965_coded_buffer_segment *)(mfc_context->vp8_state.final_frame_bo->virtual);
+    coded_buffer_segment->base.size = vp8_coded_bytes;
+    dri_bo_unmap(mfc_context->vp8_state.final_frame_bo);
+}
+
+static VAStatus
+gen8_mfc_vp8_encode_picture(VADriverContextP ctx,
+                              struct encode_state *encode_state,
+                              struct intel_encoder_context *encoder_context)
+{ /* Encode one VP8 picture: init, prepare, program and run the BCS pipeline, then record the coded size. */
+    gen8_mfc_vp8_init(ctx, encode_state, encoder_context);
+    intel_mfc_vp8_prepare(ctx, encode_state, encoder_context);
+    /* Program the BCS pipeline */
+    gen8_mfc_vp8_pipeline_programing(ctx, encode_state, encoder_context);
+    gen8_mfc_run(ctx, encode_state, encoder_context);
+    gen8_mfc_calc_vp8_coded_buffer_size(ctx, encode_state, encoder_context);
+
+    return VA_STATUS_SUCCESS; /* NOTE(review): no step above reports failure, so this is unconditional */
+}
 
 static void
 gen8_mfc_context_destroy(void *context)
@@ -3258,6 +4136,27 @@ gen8_mfc_context_destroy(void *context)
 
     mfc_context->aux_batchbuffer = NULL;
 
+    dri_bo_unreference(mfc_context->vp8_state.coeff_probs_stream_in_bo);
+    mfc_context->vp8_state.coeff_probs_stream_in_bo = NULL;
+
+    dri_bo_unreference(mfc_context->vp8_state.final_frame_bo);
+    mfc_context->vp8_state.final_frame_bo = NULL;
+
+    dri_bo_unreference(mfc_context->vp8_state.frame_header_bo);
+    mfc_context->vp8_state.frame_header_bo = NULL;
+
+    dri_bo_unreference(mfc_context->vp8_state.intermediate_bo);
+    mfc_context->vp8_state.intermediate_bo = NULL;
+
+    dri_bo_unreference(mfc_context->vp8_state.mpc_row_store_bo);
+    mfc_context->vp8_state.mpc_row_store_bo = NULL;
+
+    dri_bo_unreference(mfc_context->vp8_state.stream_out_bo);
+    mfc_context->vp8_state.stream_out_bo = NULL;
+
+    dri_bo_unreference(mfc_context->vp8_state.token_statistics_bo);
+    mfc_context->vp8_state.token_statistics_bo = NULL;
+
     free(mfc_context);
 }
 
@@ -3287,7 +4186,11 @@ static VAStatus gen8_mfc_pipeline(VADriverContextP ctx,
         jpeg_init_default_qmatrix(ctx, encoder_context);
         vaStatus = gen8_mfc_jpeg_encode_picture(ctx, encode_state, encoder_context);
         break;
-        
+ 
+    case VAProfileVP8Version0_3:
+        vaStatus = gen8_mfc_vp8_encode_picture(ctx, encode_state, encoder_context);
+        break;
+ 
     default:
         vaStatus = VA_STATUS_ERROR_UNSUPPORTED_PROFILE;
         break;
diff --git a/src/gen8_vme.c b/src/gen8_vme.c
index 29d4b5a..ace3288 100644
--- a/src/gen8_vme.c
+++ b/src/gen8_vme.c
@@ -120,6 +120,31 @@ static struct i965_kernel gen8_vme_mpeg2_kernels[] = {
     },
 };
 
+static const uint32_t gen8_vme_vp8_intra_frame[][4] = { /* rows of 4 dwords taken from the generated .g8b file */
+#include "shaders/vme/vp8_intra_frame_gen8.g8b"
+};
+
+static const uint32_t gen8_vme_vp8_inter_frame[][4] = { /* rows of 4 dwords taken from the generated .g8b file */
+#include "shaders/vme/vp8_inter_frame_gen8.g8b"
+};
+
+static struct i965_kernel gen8_vme_vp8_kernels[] = { /* VP8 kernel list selected for CODEC_VP8 in gen8_vme_context_init() */
+    {
+        "VME Intra Frame",
+        VME_INTRA_SHADER, /*index*/
+        gen8_vme_vp8_intra_frame,
+        sizeof(gen8_vme_vp8_intra_frame),
+        NULL
+    },
+    {
+        "VME inter Frame",
+        VME_INTER_SHADER, /*index*/
+        gen8_vme_vp8_inter_frame,
+        sizeof(gen8_vme_vp8_inter_frame),
+        NULL
+    },
+};
+
 /* only used for VME source surface state */
 static void 
 gen8_vme_source_surface_state(VADriverContextP ctx,
@@ -170,16 +195,14 @@ static void
 gen8_vme_output_buffer_setup(VADriverContextP ctx,
                              struct encode_state *encode_state,
                              int index,
-                             struct intel_encoder_context *encoder_context)
+                             struct intel_encoder_context *encoder_context,
+                             int is_intra,
+                             int width_in_mbs,
+                             int height_in_mbs)
 
 {
     struct i965_driver_data *i965 = i965_driver_data(ctx);
     struct gen6_vme_context *vme_context = encoder_context->vme_context;
-    VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
-    VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
-    int is_intra = pSliceParameter->slice_type == SLICE_TYPE_I;
-    int width_in_mbs = pSequenceParameter->picture_width_in_mbs;
-    int height_in_mbs = pSequenceParameter->picture_height_in_mbs;
 
     vme_context->vme_output.num_blocks = width_in_mbs * height_in_mbs;
     vme_context->vme_output.pitch = 16; /* in bytes, always 16 */
@@ -194,7 +217,7 @@ gen8_vme_output_buffer_setup(VADriverContextP ctx,
      * 16 * (2 + 2 * (1 + 8 + 2))= 16 * 24.
      */
 
-    vme_context->vme_output.bo = dri_bo_alloc(i965->intel.bufmgr, 
+    vme_context->vme_output.bo = dri_bo_alloc(i965->intel.bufmgr,
                                               "VME output buffer",
                                               vme_context->vme_output.num_blocks * vme_context->vme_output.size_block,
                                               0x1000);
@@ -207,32 +230,57 @@ gen8_vme_output_buffer_setup(VADriverContextP ctx,
 }
 
 static void
+gen8_vme_avc_output_buffer_setup(VADriverContextP ctx,
+                             struct encode_state *encode_state,
+                             int index,
+                             struct intel_encoder_context *encoder_context)
+{
+    VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
+    VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer; /* only the first slice is consulted */
+    int is_intra = pSliceParameter->slice_type == SLICE_TYPE_I;
+    int width_in_mbs = pSequenceParameter->picture_width_in_mbs;
+    int height_in_mbs = pSequenceParameter->picture_height_in_mbs;
+    /* H.264 wrapper: read the MB dimensions and intra flag from the AVC parameters and delegate to the codec-neutral setup. */
+    gen8_vme_output_buffer_setup(ctx, encode_state, index, encoder_context, is_intra, width_in_mbs, height_in_mbs);
+
+}
+
+static void
 gen8_vme_output_vme_batchbuffer_setup(VADriverContextP ctx,
                                       struct encode_state *encode_state,
                                       int index,
-                                      struct intel_encoder_context *encoder_context)
-
+                                      struct intel_encoder_context *encoder_context,
+                                      int width_in_mbs,
+                                      int height_in_mbs)
 {
     struct i965_driver_data *i965 = i965_driver_data(ctx);
     struct gen6_vme_context *vme_context = encoder_context->vme_context;
-    VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
-    int width_in_mbs = pSequenceParameter->picture_width_in_mbs;
-    int height_in_mbs = pSequenceParameter->picture_height_in_mbs;
 
     vme_context->vme_batchbuffer.num_blocks = width_in_mbs * height_in_mbs + 1;
     vme_context->vme_batchbuffer.size_block = 64; /* 4 OWORDs */
     vme_context->vme_batchbuffer.pitch = 16;
-    vme_context->vme_batchbuffer.bo = dri_bo_alloc(i965->intel.bufmgr, 
+    vme_context->vme_batchbuffer.bo = dri_bo_alloc(i965->intel.bufmgr,
                                                    "VME batchbuffer",
                                                    vme_context->vme_batchbuffer.num_blocks * vme_context->vme_batchbuffer.size_block,
                                                    0x1000);
-	/*
     vme_context->vme_buffer_suface_setup(ctx,
                                          &vme_context->gpe_context,
                                          &vme_context->vme_batchbuffer,
                                          BINDING_TABLE_OFFSET(index),
                                          SURFACE_STATE_OFFSET(index));
-	*/
+}
+
+static void
+gen8_vme_avc_output_vme_batchbuffer_setup(VADriverContextP ctx,
+                                      struct encode_state *encode_state,
+                                      int index,
+                                      struct intel_encoder_context *encoder_context)
+{
+    VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
+    int width_in_mbs = pSequenceParameter->picture_width_in_mbs;
+    int height_in_mbs = pSequenceParameter->picture_height_in_mbs;
+    /* H.264 wrapper: read the MB dimensions from the AVC sequence parameters and delegate to the codec-neutral setup. */
+    gen8_vme_output_vme_batchbuffer_setup(ctx, encode_state, index, encoder_context, width_in_mbs, height_in_mbs);
+}
 
 static VAStatus
@@ -264,8 +312,8 @@ gen8_vme_surface_setup(VADriverContextP ctx,
     }
 
     /* VME output */
-    gen8_vme_output_buffer_setup(ctx, encode_state, 3, encoder_context);
-    gen8_vme_output_vme_batchbuffer_setup(ctx, encode_state, 5, encoder_context);
+    gen8_vme_avc_output_buffer_setup(ctx, encode_state, 3, encoder_context);
+    gen8_vme_avc_output_vme_batchbuffer_setup(ctx, encode_state, 5, encoder_context);
 
     return VA_STATUS_SUCCESS;
 }
@@ -724,37 +772,12 @@ gen8_vme_mpeg2_output_buffer_setup(VADriverContextP ctx,
                                    int index,
                                    int is_intra,
                                    struct intel_encoder_context *encoder_context)
-
 {
-    struct i965_driver_data *i965 = i965_driver_data(ctx);
-    struct gen6_vme_context *vme_context = encoder_context->vme_context;
     VAEncSequenceParameterBufferMPEG2 *seq_param = (VAEncSequenceParameterBufferMPEG2 *)encode_state->seq_param_ext->buffer;
     int width_in_mbs = ALIGN(seq_param->picture_width, 16) / 16;
     int height_in_mbs = ALIGN(seq_param->picture_height, 16) / 16;
 
-    vme_context->vme_output.num_blocks = width_in_mbs * height_in_mbs;
-    vme_context->vme_output.pitch = 16; /* in bytes, always 16 */
-
-    if (is_intra)
-        vme_context->vme_output.size_block = INTRA_VME_OUTPUT_IN_BYTES * 2;
-    else
-        vme_context->vme_output.size_block = INTRA_VME_OUTPUT_IN_BYTES * 24;
-    /*
-     * Inter MV . 32-byte Intra search + 16 IME info + 128 IME MV + 32 IME Ref
-     * + 16 FBR Info + 128 FBR MV + 32 FBR Ref.
-     * 16 * (2 + 2 * (1 + 8 + 2))= 16 * 24.
-     */
-
-    vme_context->vme_output.bo = dri_bo_alloc(i965->intel.bufmgr, 
-                                              "VME output buffer",
-                                              vme_context->vme_output.num_blocks * vme_context->vme_output.size_block,
-                                              0x1000);
-    assert(vme_context->vme_output.bo);
-    vme_context->vme_buffer_suface_setup(ctx,
-                                         &vme_context->gpe_context,
-                                         &vme_context->vme_output,
-                                         BINDING_TABLE_OFFSET(index),
-                                         SURFACE_STATE_OFFSET(index));
+    gen8_vme_output_buffer_setup(ctx, encode_state, index, encoder_context, is_intra, width_in_mbs, height_in_mbs);
 }
 
 static void
@@ -762,26 +785,12 @@ gen8_vme_mpeg2_output_vme_batchbuffer_setup(VADriverContextP ctx,
                                             struct encode_state *encode_state,
                                             int index,
                                             struct intel_encoder_context *encoder_context)
-
 {
-    struct i965_driver_data *i965 = i965_driver_data(ctx);
-    struct gen6_vme_context *vme_context = encoder_context->vme_context;
     VAEncSequenceParameterBufferMPEG2 *seq_param = (VAEncSequenceParameterBufferMPEG2 *)encode_state->seq_param_ext->buffer;
     int width_in_mbs = ALIGN(seq_param->picture_width, 16) / 16;
     int height_in_mbs = ALIGN(seq_param->picture_height, 16) / 16;
 
-    vme_context->vme_batchbuffer.num_blocks = width_in_mbs * height_in_mbs + 1;
-    vme_context->vme_batchbuffer.size_block = 64; /* 4 OWORDs */
-    vme_context->vme_batchbuffer.pitch = 16;
-    vme_context->vme_batchbuffer.bo = dri_bo_alloc(i965->intel.bufmgr, 
-                                                   "VME batchbuffer",
-                                                   vme_context->vme_batchbuffer.num_blocks * vme_context->vme_batchbuffer.size_block,
-                                                   0x1000);
-    vme_context->vme_buffer_suface_setup(ctx,
-                                         &vme_context->gpe_context,
-                                         &vme_context->vme_batchbuffer,
-                                         BINDING_TABLE_OFFSET(index),
-                                         SURFACE_STATE_OFFSET(index));
+    gen8_vme_output_vme_batchbuffer_setup(ctx, encode_state, index, encoder_context, width_in_mbs, height_in_mbs);
 }
 
 static VAStatus
@@ -1130,6 +1139,139 @@ gen8_vme_mpeg2_pipeline(VADriverContextP ctx,
 }
 
 static void
+gen8_vme_vp8_output_buffer_setup(VADriverContextP ctx,
+                                   struct encode_state *encode_state,
+                                   int index,
+                                   int is_intra,
+                                   struct intel_encoder_context *encoder_context)
+{
+    VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
+    int width_in_mbs = ALIGN(seq_param->frame_width, 16) / 16;
+    int height_in_mbs = ALIGN(seq_param->frame_height, 16) / 16;
+    /* VP8 wrapper: derive the MB dimensions from the frame size (aligned to 16) and delegate to the codec-neutral setup. */
+    gen8_vme_output_buffer_setup(ctx, encode_state, index, encoder_context, is_intra, width_in_mbs, height_in_mbs);
+}
+
+static void
+gen8_vme_vp8_output_vme_batchbuffer_setup(VADriverContextP ctx,
+                                            struct encode_state *encode_state,
+                                            int index,
+                                            struct intel_encoder_context *encoder_context)
+{
+    VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
+    int width_in_mbs = ALIGN(seq_param->frame_width, 16) / 16;
+    int height_in_mbs = ALIGN(seq_param->frame_height, 16) / 16;
+    /* VP8 wrapper: derive the MB dimensions from the frame size (aligned to 16) and delegate to the codec-neutral setup. */
+    gen8_vme_output_vme_batchbuffer_setup(ctx, encode_state, index, encoder_context, width_in_mbs, height_in_mbs);
+}
+
+static VAStatus
+gen8_vme_vp8_surface_setup(VADriverContextP ctx,
+                             struct encode_state *encode_state,
+                             int is_intra,
+                             struct intel_encoder_context *encoder_context)
+{
+    struct object_surface *obj_surface;
+    /* Bind the source picture (indices 0, 4, 6), up to two references for inter frames (1, 2) and the VME output buffers (3, 5). */
+    /* Set up the surface states */
+    /* current picture for encoding */
+    obj_surface = encode_state->input_yuv_object;
+    gen8_vme_source_surface_state(ctx, 0, obj_surface, encoder_context);
+    gen8_vme_media_source_surface_state(ctx, 4, obj_surface, encoder_context);
+    gen8_vme_media_chroma_source_surface_state(ctx, 6, obj_surface, encoder_context);
+
+    if (!is_intra) {
+        /* reference 0: skipped when absent, same guard as reference 1 (it used to be dereferenced unchecked) */
+        obj_surface = encode_state->reference_objects[0];
+
+        if (obj_surface && obj_surface->bo != NULL)
+            gen8_vme_source_surface_state(ctx, 1, obj_surface, encoder_context);
+
+        /* reference 1 */
+        obj_surface = encode_state->reference_objects[1];
+
+        if (obj_surface && obj_surface->bo != NULL)
+            gen8_vme_source_surface_state(ctx, 2, obj_surface, encoder_context);
+    }
+
+    /* VME output */
+    gen8_vme_vp8_output_buffer_setup(ctx, encode_state, 3, is_intra, encoder_context);
+    gen8_vme_vp8_output_vme_batchbuffer_setup(ctx, encode_state, 5, encoder_context);
+
+    return VA_STATUS_SUCCESS;
+}
+
+static void
+gen8_vme_vp8_pipeline_programing(VADriverContextP ctx,
+                                   struct encode_state *encode_state,
+                                   int is_intra,
+                                   struct intel_encoder_context *encoder_context)
+{
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+    struct intel_batchbuffer *batch = encoder_context->base.batch;
+    VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
+    int width_in_mbs = ALIGN(seq_param->frame_width, 16) / 16;
+    int height_in_mbs = ALIGN(seq_param->frame_height, 16) / 16;
+    int kernel_shader = (is_intra ? VME_INTRA_SHADER : VME_INTER_SHADER);
+    /* Fill the VME batch for every MB (reusing the MPEG-2 walker helper), then start that batch from the main batch. */
+    gen8wa_vme_mpeg2_walker_fill_vme_batchbuffer(ctx,
+                                                 encode_state,
+                                                 width_in_mbs, height_in_mbs,
+                                                 kernel_shader,
+                                                 encoder_context);
+
+    intel_batchbuffer_start_atomic(batch, 0x1000);
+    gen8_gpe_pipeline_setup(ctx, &vme_context->gpe_context, batch);
+    BEGIN_BATCH(batch, 4);
+    OUT_BATCH(batch, MI_BATCH_BUFFER_START | (1 << 8) | (1 << 0)); /* start the VME batch filled above */
+    OUT_RELOC(batch,
+              vme_context->vme_batchbuffer.bo,
+              I915_GEM_DOMAIN_COMMAND, 0,
+              0);
+    OUT_BATCH(batch, 0);
+    OUT_BATCH(batch, 0);
+    ADVANCE_BATCH(batch);
+
+    intel_batchbuffer_end_atomic(batch);
+}
+
+static VAStatus gen8_vme_vp8_prepare(VADriverContextP ctx,
+                                 struct encode_state *encode_state,
+                                 struct intel_encoder_context *encoder_context)
+{
+    VAStatus vaStatus = VA_STATUS_SUCCESS;
+    VAEncPictureParameterBufferVP8 *pPicParameter = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
+    int is_intra = !pPicParameter->pic_flags.bits.frame_type; /* frame_type == 0 is treated as an intra frame */
+    /* Prepare one VP8 VME pass: refresh the cost table, set up surfaces/interface/constants, then program the media pipeline. */
+    /* update vp8 mbmv cost */
+    intel_vme_vp8_update_mbmv_cost(ctx, encode_state, encoder_context);
+
+    /* Set up all the memory objects; NOTE(review): the VAStatus returned by the surface setup is ignored */
+    gen8_vme_vp8_surface_setup(ctx, encode_state, is_intra, encoder_context);
+    gen8_vme_interface_setup(ctx, encode_state, encoder_context);
+    gen8_vme_constant_setup(ctx, encode_state, encoder_context);
+
+    /* Program the media pipeline */
+    gen8_vme_vp8_pipeline_programing(ctx, encode_state, is_intra, encoder_context);
+
+    return vaStatus; /* never reassigned above, so always VA_STATUS_SUCCESS */
+}
+
+static VAStatus
+gen8_vme_vp8_pipeline(VADriverContextP ctx,
+                        VAProfile profile,
+                        struct encode_state *encode_state,
+                        struct intel_encoder_context *encoder_context)
+{ /* VP8 entry point installed as encoder_context->vme_pipeline: init, prepare, run, stop. The profile argument is unused. */
+    gen8_vme_media_init(ctx, encoder_context);
+    gen8_vme_vp8_prepare(ctx, encode_state, encoder_context); /* NOTE(review): returned VAStatus is ignored */
+    gen8_vme_run(ctx, encode_state, encoder_context);
+    gen8_vme_stop(ctx, encode_state, encoder_context);
+
+    return VA_STATUS_SUCCESS;
+}
+
+static void
 gen8_vme_context_destroy(void *context)
 {
     struct gen6_vme_context *vme_context = context;
@@ -1180,6 +1322,12 @@ Bool gen8_vme_context_init(VADriverContextP ctx, struct intel_encoder_context *e
         encoder_context->vme_context_destroy = NULL;
         break;
 
+    case CODEC_VP8:
+        vme_kernel_list = gen8_vme_vp8_kernels;
+        encoder_context->vme_pipeline = gen8_vme_vp8_pipeline;
+        i965_kernel_num = sizeof(gen8_vme_vp8_kernels) / sizeof(struct i965_kernel);
+        break;
+
     default:
         /* never get here */
         assert(0);
diff --git a/src/i965_device_info.c b/src/i965_device_info.c
index e63f509..a7e2546 100755
--- a/src/i965_device_info.c
+++ b/src/i965_device_info.c
@@ -297,6 +297,7 @@ static struct hw_codec_info chv_hw_codec_info = {
     .has_di_motion_adptive = 1,
     .has_di_motion_compensated = 1,
     .has_vp8_decoding = 1,
+    .has_vp8_encoding = 1,
     .has_h264_mvc_encoding = 1,
 
     .num_filters = 5,
diff --git a/src/shaders/vme/Makefile.am b/src/shaders/vme/Makefile.am
index 0883c16..4543e35 100644
--- a/src/shaders/vme/Makefile.am
+++ b/src/shaders/vme/Makefile.am
@@ -20,8 +20,8 @@ INTEL_GEN75_INC	= batchbuffer.inc vme75.inc vme75_mpeg2.inc
 INTEL_GEN75_ASM	= $(INTEL_G75A:%.g75a=%.gen75.asm)
 
 
-INTEL_G8B	= intra_frame_gen8.g8b inter_frame_gen8.g8b inter_bframe_gen8.g8b mpeg2_inter_gen8.g8b
-INTEL_G8A	= intra_frame_gen8.g8a inter_frame_gen8.g8a inter_bframe_gen8.g8a mpeg2_inter_gen8.g8a
+INTEL_G8B	= intra_frame_gen8.g8b inter_frame_gen8.g8b inter_bframe_gen8.g8b mpeg2_inter_gen8.g8b vp8_intra_frame_gen8.g8b vp8_inter_frame_gen8.g8b
+INTEL_G8A	= intra_frame_gen8.g8a inter_frame_gen8.g8a inter_bframe_gen8.g8a mpeg2_inter_gen8.g8a vp8_intra_frame_gen8.g8a vp8_inter_frame_gen8.g8a
 INTEL_GEN8_INC	= vme8.inc vme75_mpeg2.inc
 INTEL_GEN8_ASM	= $(INTEL_G8A:%.g8a=%.gen8.asm)
 
diff --git a/src/shaders/vme/vp8_inter_frame_gen8.asm b/src/shaders/vme/vp8_inter_frame_gen8.asm
new file mode 100644
index 0000000..d660810
--- /dev/null
+++ b/src/shaders/vme/vp8_inter_frame_gen8.asm
@@ -0,0 +1,739 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Zhao Yakui <yakui.zhao at intel.com>
+ *    Xiang Haihao <haihao.xiang at intel.com>
+ *    Li Zhong <zhong.li at intel.com>
+ *
+ */
+/* Subroutine linkage: SAVE_RET stores ip + 32 in RETURN_REG, RETURN loads ip back from RETURN_REG. NOTE(review): 32 presumably skips SAVE_RET plus the jmpi that follows it (2 x 16-byte instructions) - confirm. */
+#define SAVE_RET	add (1) RETURN_REG<1>:ud   ip:ud	32:ud
+#define	RETURN		mov (1)	ip:ud	RETURN_REG<0,1,0>:ud
+
+/*
+ * __INTER_START
+ */
+__INTER_START:
+mov  (16) tmp_reg0.0<1>:UD      0x0:UD {align1};
+mov  (16) tmp_reg2.0<1>:UD      0x0:UD {align1};
+mov  (16) tmp_reg4.0<1>:UD      0x0:UD {align1} ;
+mov  (16) tmp_reg6.0<1>:UD      0x0:UD {align1} ;
+
+shl  (2) read0_header.0<1>:D    orig_xy_ub<2,2,1>:UB 4:UW {align1};    /* (x, y) * 16 */
+add  (1) read0_header.0<1>:D    read0_header.0<0,1,0>:D -8:W {align1};     /* X offset */
+add  (1) read0_header.4<1>:D    read0_header.4<0,1,0>:D -1:W {align1};     /* Y offset */ 
+mov  (1) read0_header.8<1>:UD   BLOCK_32X1 {align1};
+mov  (1) read0_header.20<1>:UB  thread_id_ub {align1};                  /* dispatch id */
+
+shl  (2) read1_header.0<1>:D    orig_xy_ub<2,2,1>:UB 4:UW {align1};    /* (x, y) * 16 */
+add  (1) read1_header.0<1>:D    read1_header.0<0,1,0>:D -4:W {align1};     /* X offset */
+mov  (1) read1_header.8<1>:UD   BLOCK_4X16 {align1};
+mov  (1) read1_header.20<1>:UB  thread_id_ub {align1};                  /* dispatch id */
+        
+shl  (2) vme_m0.8<1>:UW         orig_xy_ub<2,2,1>:UB 4:UW {align1};    /* (x, y) * 16 */
+mov  (1) vme_m0.20<1>:UB        thread_id_ub {align1};                  /* dispatch id */
+
+mul  (1) obw_m0.8<1>:UD         w_in_mb_uw<0,1,0>:UW orig_y_ub<0,1,0>:UB {align1};
+add  (1) obw_m0.8<1>:UD         obw_m0.8<0,1,0>:UD orig_x_ub<0,1,0>:UB {align1};
+mul  (1) obw_m0.8<1>:UD         obw_m0.8<0,1,0>:UD 24:UD {align1};
+mov  (1) obw_m0.20<1>:UB        thread_id_ub {align1};                  /* dispatch id */
+        
+/*
+ * Media Read Message -- fetch Luma neighbor edge pixels 
+ */
+/* ROW */
+mov  (8) msg_reg0.0<1>:UD       read0_header.0<8,8,1>:UD {align1};        
+send (8) msg_ind INEP_ROW<1>:UB null read(BIND_IDX_INEP, 0, 0, 4) mlen 1 rlen 1 {align1};
+
+/* COL */
+mov  (8) msg_reg0.0<1>:UD       read1_header.0<8,8,1>:UD {align1};                
+send (8) msg_ind INEP_COL0<1>:UB null read(BIND_IDX_INEP, 0, 0, 4) mlen 1 rlen 2 {align1};
+        
+/*
+ * Media Read Message -- fetch Chroma neighbor edge pixels 
+ */
+/* ROW */
+shl  (2) read0_header.0<1>:D    orig_xy_ub<2,2,1>:UB 3:UW {align1};    /* x * 16 , y * 8 */
+mul  (1) read0_header.0<1>:D    read0_header.0<0,1,0>:D  2:W {align1};
+add  (1) read0_header.0<1>:D    read0_header.0<0,1,0>:D -8:W {align1};     /* X offset */
+add  (1) read0_header.4<1>:D    read0_header.4<0,1,0>:D -1:W {align1};     /* Y offset */ 
+mov  (8) msg_reg0.0<1>:UD       read0_header.0<8,8,1>:UD {align1};        
+send (8) msg_ind CHROMA_ROW<1>:UB null read(BIND_IDX_CBCR, 0, 0, 4) mlen 1 rlen 1 {align1};
+
+/* COL */
+shl  (2) read1_header.0<1>:D    orig_xy_ub<2,2,1>:UB 3:UW {align1};    /* x * 16, y * 8 */
+mul  (1) read1_header.0<1>:D    read1_header.0<0,1,0>:D  2:W {align1};
+add  (1) read1_header.0<1>:D    read1_header.0<0,1,0>:D -4:W {align1};     /* X offset */
+mov  (1) read1_header.8<1>:UD   BLOCK_8X4 {align1};
+mov  (8) msg_reg0.0<1>:UD       read1_header.0<8,8,1>:UD {align1};                
+send (8) msg_ind CHROMA_COL<1>:UB null read(BIND_IDX_CBCR, 0, 0, 4) mlen 1 rlen 1 {align1};
+
+mov  (8) mb_mvp_ref.0<1>:ud	0:ud		{align1};
+mov  (8) mb_ref_win.0<1>:ud	0:ud		{align1};
+and.z.f0.0 (1)		null:uw	mb_hwdep<0,1,0>:uw		0x04:uw   {align1};
+(f0.0) jmpi (1) __mb_hwdep_end;
+/* read back the data for MB A */
+/* the layout of MB result is: rx.0(Available). rx.4(MVa), rX.8(MVb), rX.16(Pred_L0 flag),
+*  rX.18 (Pred_L1 flag), rX.20(Forward reference ID), rX.22(Backward reference ID)
+*/
+mov  (8) mba_result.0<1>:ud	0x0:ud		{align1};
+mov  (8) mbb_result.0<1>:ud	0x0:ud		{align1};
+mov  (8) mbc_result.0<1>:ud	0x0:ud		{align1};
+mba_start:
+mov  (8) mb_msg0.0<1>:ud	0:ud		{align1};
+and.z.f0.0 (1)		null:uw	input_mb_intra_ub<0,1,0>:ub	INTRA_PRED_AVAIL_FLAG_AE:uw   {align1};
+/* MB A doesn't exist. Zero MV. mba_flag is zero and ref ID = -1 */
+(f0.0)  mov  (2)    	mba_result.20<1>:w	-1:w	{align1};
+(f0.0)  jmpi (1)	mbb_start;
+mov  (1) mba_result.0<1>:d	MB_AVAIL		{align1};	
+mov  (2) tmp_reg0.0<1>:UW	orig_xy_ub<2,2,1>:UB	{align1};
+add  (1) tmp_reg0.0<1>:w	tmp_reg0.0<0,1,0>:w	-1:w	{align1};
+mul  (1) mb_msg0.8<1>:UD       w_in_mb_uw<0,1,0>:UW tmp_reg0.2<0,1,0>:UW {align1};
+add  (1) mb_msg0.8<1>:UD       mb_msg0.8<0,1,0>:UD   tmp_reg0.0<0,1,0>:uw {align1};
+mul  (1) mb_msg0.8<1>:UD       mb_msg0.8<0,1,0>:UD 24:UD {align1};
+mov  (1) mb_msg0.20<1>:UB        thread_id_ub {align1};                  /* dispatch id */
+
+/* bind index 3, read 4 oword (64bytes), msg type: 0(OWord Block Read) */
+send (16)
+        mb_ind
+        mb_wb.0<1>:ud
+	NULL
+        data_port(
+                OBR_CACHE_TYPE,
+                OBR_MESSAGE_TYPE,
+                OBR_CONTROL_4,
+                OBR_BIND_IDX,
+                OBR_WRITE_COMMIT_CATEGORY,
+                OBR_HEADER_PRESENT
+        )
+        mlen 1
+        rlen 2
+        {align1};
+
+/* TODO: RefID is required after multi-references are added */
+cmp.l.f0.0 (1)		null:w	mb_intra_wb.16<0,1,0>:uw	mb_inter_wb.8<0,1,0>:uw {align1};
+(f0.0)   mov (2)	mba_result.20<1>:w			-1:w	{align1};
+(f0.0)   jmpi	(1)	mbb_start;
+
+add   (1) mb_msg0.8<1>:UD	mb_msg0.8<0,1,0>:ud	3:ud {align1};
+/* Read MV for MB A */
+/* bind index 3, read 8 oword (128bytes), msg type: 0(OWord Block Read) */
+send (16)
+        mb_ind
+        mb_mv0.0<1>:ud
+	NULL
+        data_port(
+                OBR_CACHE_TYPE,
+                OBR_MESSAGE_TYPE,
+                OBR_CONTROL_8,
+                OBR_BIND_IDX,
+                OBR_WRITE_COMMIT_CATEGORY,
+                OBR_HEADER_PRESENT
+        )
+        mlen 1
+        rlen 4
+        {align1};
+/* TODO: RefID is required after multi-references are added */
+/* MV */
+mov	   (2)		mba_result.4<1>:ud		mb_mv1.8<2,2,1>:ud	{align1};
+mov	   (1)		mba_result.16<1>:w		MB_PRED_FLAG		{align1};
+
+mbb_start:
+mov  (8) mb_msg0.0<1>:ud	0:ud		{align1};
+and.z.f0.0 (1)		null:uw	input_mb_intra_ub<0,1,0>:ub	INTRA_PRED_AVAIL_FLAG_B:uw   {align1};
+/* MB B doesn't exist. Zero MV. mbb_flag is zero and ref ID = -1 */
+/* If MB B doesn't exist, neither MB C nor D exists */
+(f0.0)  mov  (2)    	mbb_result.20<1>:w	-1:w		{align1};
+(f0.0)  mov  (2)    	mbc_result.20<1>:w	-1:w		{align1};
+(f0.0)  jmpi (1)	mb_mvp_start;
+mov  (1) mbb_result.0<1>:d	MB_AVAIL		{align1};	
+mov  (2) tmp_reg0.0<1>:UW	orig_xy_ub<2,2,1>:UB	{align1};
+add  (1) tmp_reg0.2<1>:w	tmp_reg0.2<0,1,0>:w	-1:w	{align1};
+mul  (1) mb_msg0.8<1>:UD       w_in_mb_uw<0,1,0>:UW tmp_reg0.2<0,1,0>:UW {align1};
+add  (1) mb_msg0.8<1>:UD       mb_msg0.8<0,1,0>:UD   tmp_reg0.0<0,1,0>:uw {align1};
+mul  (1) mb_msg0.8<1>:UD       mb_msg0.8<0,1,0>:UD 24:UD {align1};
+mov  (1) mb_msg0.20<1>:UB        thread_id_ub {align1};                  /* dispatch id */
+
+/* bind index 3, read 4 oword (64bytes), msg type: 0(OWord Block Read) */
+send (16)
+        mb_ind
+        mb_wb.0<1>:ud
+	NULL
+        data_port(
+                OBR_CACHE_TYPE,
+                OBR_MESSAGE_TYPE,
+                OBR_CONTROL_4,
+                OBR_BIND_IDX,
+                OBR_WRITE_COMMIT_CATEGORY,
+                OBR_HEADER_PRESENT
+        )
+        mlen 1
+        rlen 2
+        {align1};
+
+/* TODO: RefID is required after multi-references are added */
+cmp.l.f0.0 (1)		null:w	mb_intra_wb.16<0,1,0>:uw	mb_inter_wb.8<0,1,0>:uw {align1};
+(f0.0)   mov (2)	mbb_result.20<1>:w			-1:w	{align1};
+(f0.0)   jmpi	(1)	mbc_start;
+add   (1) mb_msg0.8<1>:UD	mb_msg0.8<0,1,0>:ud	3:ud {align1};
+/* Read MV for MB B */
+/* bind index 3, read 8 oword (128bytes), msg type: 0(OWord Block Read) */
+send (16)
+        mb_ind
+        mb_mv0.0<1>:ud
+	NULL
+        data_port(
+                OBR_CACHE_TYPE,
+                OBR_MESSAGE_TYPE,
+                OBR_CONTROL_8,
+                OBR_BIND_IDX,
+                OBR_WRITE_COMMIT_CATEGORY,
+                OBR_HEADER_PRESENT
+        )
+        mlen 1
+        rlen 4
+        {align1};
+/* TODO: RefID is required after multi-references are added */
+mov	   (2)		mbb_result.4<1>:ud		mb_mv2.16<2,2,1>:ud	{align1};
+mov	   (1)		mbb_result.16<1>:w		MB_PRED_FLAG		{align1};
+
+mbc_start:
+mov  (8) mb_msg0.0<1>:ud	0:ud		{align1};
+and.z.f0.0 (1)		null:uw	input_mb_intra_ub<0,1,0>:ub	INTRA_PRED_AVAIL_FLAG_C:uw   {align1};
+/* MB C doesn't exist. Zero MV. mbc_flag is zero */
+/* Based on the h264 spec, MB D is used in place of MB C if MB C doesn't exist */
+(f0.0)  jmpi (1)	mbd_start;
+mov  (1) mbc_result.0<1>:d	MB_AVAIL		{align1};	
+mov  (2) tmp_reg0.0<1>:UW	orig_xy_ub<2,2,1>:UB	{align1};
+add  (1) tmp_reg0.2<1>:w	tmp_reg0.2<0,1,0>:w	-1:w	{align1};
+add  (1) tmp_reg0.0<1>:w	tmp_reg0.0<0,1,0>:w	1:w	{align1};
+mul  (1) mb_msg0.8<1>:UD       w_in_mb_uw<0,1,0>:UW tmp_reg0.2<0,1,0>:UW {align1};
+add  (1) mb_msg0.8<1>:UD       mb_msg0.8<0,1,0>:UD   tmp_reg0.0<0,1,0>:uw {align1};
+mul  (1) mb_msg0.8<1>:UD       mb_msg0.8<0,1,0>:UD 24:UD {align1};
+mov  (1) mb_msg0.20<1>:UB        thread_id_ub {align1};                  /* dispatch id */
+
+/* bind index 3, read 4 oword (64bytes), msg type: 0(OWord Block Read) */
+send (16)
+        mb_ind
+        mb_wb.0<1>:ud
+	NULL
+        data_port(
+                OBR_CACHE_TYPE,
+                OBR_MESSAGE_TYPE,
+                OBR_CONTROL_4,
+                OBR_BIND_IDX,
+                OBR_WRITE_COMMIT_CATEGORY,
+                OBR_HEADER_PRESENT
+        )
+        mlen 1
+        rlen 2
+        {align1};
+
+/* TODO: RefID is required after multi-references are added */
+cmp.l.f0.0 (1)		null:w	mb_intra_wb.16<0,1,0>:uw	mb_inter_wb.8<0,1,0>:uw {align1};
+(f0.0)   mov (2)	mbc_result.20<1>:w			-1:w	{align1};
+(f0.0)   jmpi	(1)	mb_mvp_start;
+add   (1) mb_msg0.8<1>:UD	mb_msg0.8<0,1,0>:ud	3:ud {align1};
+/* Read MV for MB C */
+/* bind index 3, read 8 oword (128bytes), msg type: 0(OWord Block Read) */
+send (16)
+        mb_ind
+        mb_mv0.0<1>:ud
+	NULL
+        data_port(
+                OBR_CACHE_TYPE,
+                OBR_MESSAGE_TYPE,
+                OBR_CONTROL_8,
+                OBR_BIND_IDX,
+                OBR_WRITE_COMMIT_CATEGORY,
+                OBR_HEADER_PRESENT
+        )
+        mlen 1
+        rlen 4
+        {align1};
+/* TODO: RefID is required after multi-references are added */
+/* Forward MV */
+mov	   (2)		mbc_result.4<1>:ud		mb_mv2.16<2,2,1>:ud	{align1};
+mov	   (1)		mbc_result.16<1>:w		MB_PRED_FLAG		{align1};
+
+jmpi   (1)    mb_mvp_start;
+mbd_start:
+mov  (8) mb_msg0.0<1>:ud	0:ud		{align1};
+and.z.f0.0 (1)		null:uw	input_mb_intra_ub<0,1,0>:ub	INTRA_PRED_AVAIL_FLAG_D:uw   {align1};
+(f0.0)  mov (2)	mbc_result.20<1>:w	-1:w	{align1};
+(f0.0)  jmpi (1)	mb_mvp_start;
+mov  (1) mbc_result.0<1>:d	MB_AVAIL		{align1};	
+mov  (2) tmp_reg0.0<1>:UW	orig_xy_ub<2,2,1>:UB	{align1};
+add  (2) tmp_reg0.0<1>:w	tmp_reg0.0<2,2,1>:w	-1:w	{align1};
+mul  (1) mb_msg0.8<1>:UD       w_in_mb_uw<0,1,0>:UW tmp_reg0.2<0,1,0>:UW {align1};
+add  (1) mb_msg0.8<1>:UD       mb_msg0.8<0,1,0>:UD   tmp_reg0.0<0,1,0>:uw {align1};
+mul  (1) mb_msg0.8<1>:UD       mb_msg0.8<0,1,0>:UD 24:UD {align1};
+mov  (1) mb_msg0.20<1>:UB        thread_id_ub {align1};                  /* dispatch id */
+
+/* bind index 3, read 4 oword (64bytes), msg type: 0(OWord Block Read) */
+send (16)
+        mb_ind
+        mb_wb.0<1>:ud
+	NULL
+        data_port(
+                OBR_CACHE_TYPE,
+                OBR_MESSAGE_TYPE,
+                OBR_CONTROL_4,
+                OBR_BIND_IDX,
+                OBR_WRITE_COMMIT_CATEGORY,
+                OBR_HEADER_PRESENT
+        )
+        mlen 1
+        rlen 2
+        {align1};
+
+cmp.l.f0.0 (1)		null:w	mb_intra_wb.16<0,1,0>:uw	mb_inter_wb.8<0,1,0>:uw {align1};
+(f0.0)   mov (2)	mbc_result.20<1>:w			-1:w	{align1};
+(f0.0)   jmpi	(1)	mb_mvp_start;
+
+add   (1) mb_msg0.8<1>:UD	mb_msg0.8<0,1,0>:ud	3:ud {align1};
+/* Read MV for MB D */
+/* bind index 3, read 8 oword (128bytes), msg type: 0(OWord Block Read) */
+send (16)
+        mb_ind
+        mb_mv0.0<1>:ub
+	NULL
+        data_port(
+                OBR_CACHE_TYPE,
+                OBR_MESSAGE_TYPE,
+                OBR_CONTROL_8,
+                OBR_BIND_IDX,
+                OBR_WRITE_COMMIT_CATEGORY,
+                OBR_HEADER_PRESENT
+        )
+        mlen 1
+        rlen 4
+        {align1};
+
+/* TODO: RefID is required after multi-references are added */
+
+/* Forward MV */
+mov	   (2)		mbc_result.4<1>:ud		mb_mv3.24<2,2,1>:ud	{align1};
+mov	   (1)		mbc_result.16<1>:w		MB_PRED_FLAG		{align1};
+	
+mb_mvp_start:
+/*TODO: Add the skip prediction */
+/* Check whether both MB B and C are unavailable */
+add	(1)	tmp_reg0.0<1>:d		mbb_result.0<0,1,0>:d	mbc_result.0<0,1,0>:d	{align1};
+cmp.z.f0.0 (1)	null:d			tmp_reg0.0<0,1,0>:d	0:d	{align1};
+(-f0.0)	jmpi (1)	mb_median_start;
+cmp.nz.f0.0 (1)	null:d	mba_result.0<0,1,0>:d		0:d		{align1};
+(f0.0)	mov	(1)	mbb_result.4<1>:ud		mba_result.4<0,1,0>:ud	{align1};	
+(f0.0)	mov	(1)	mbc_result.4<1>:ud		mba_result.4<0,1,0>:ud	{align1};	
+(f0.0)	mov	(1)	mbb_result.20<1>:uw		mba_result.20<0,1,0>:uw	{align1};	
+(f0.0)	mov	(1)	mbc_result.20<1>:uw		mba_result.20<0,1,0>:uw	{align1};	
+(f0.0)  mov     (1)	mb_mvp_ref.0<1>:ud		mba_result.4<0,1,0>:ud	{align1};
+(-f0.0) mov	(1)	mb_mvp_ref.0<1>:ud		0:ud			{align1};
+jmpi	(1)	__mb_hwdep_end;
+	
+mb_median_start:
+/* check whether only one neighbour MB has the same ref ID as the current MB */
+mov (8)	tmp_reg0.0<1>:ud		0:ud		{align1};
+cmp.z.f0.0	(1)	null:d	mba_result.20<0,1,0>:w	0:w	{align1};
+(f0.0)	add	(1)	tmp_reg0.0<1>:w		tmp_reg0.0<0,1,0>:w	1:w	{align1};
+(f0.0)	mov	(1)	tmp_reg0.4<1>:ud	mba_result.4<0,1,0>:ud	{align1};
+cmp.z.f0.0	(1)	null:d	mbb_result.20<0,1,0>:w	0:w	{align1};
+(f0.0)	add	(1)	tmp_reg0.0<1>:w		tmp_reg0.0<0,1,0>:w	1:w	{align1};
+(f0.0)	mov	(1)	tmp_reg0.4<1>:ud	mbb_result.4<0,1,0>:ud	{align1};
+cmp.z.f0.0	(1)	null:d	mbc_result.20<0,1,0>:w	0:w	{align1};
+(f0.0)	add	(1)	tmp_reg0.0<1>:w		tmp_reg0.0<0,1,0>:w	1:w	{align1};
+(f0.0)	mov	(1)	tmp_reg0.4<1>:ud	mbc_result.4<0,1,0>:ud	{align1};
+cmp.e.f0.0	(1)	null:d	tmp_reg0.0<0,1,0>:w	 1:w	{align1};
+(f0.0)	mov	(1)     mb_mvp_ref.0<1>:ud	tmp_reg0.4<0,1,0>:ud	{align1};
+(f0.0)	jmpi (1)  __mb_hwdep_end;
+
+mov	(1)	INPUT_ARG0.0<1>:w	mba_result.4<0,1,0>:w	{align1};
+mov	(1)	INPUT_ARG0.4<1>:w	mbb_result.4<0,1,0>:w	{align1};
+mov	(1)	INPUT_ARG0.8<1>:w	mbc_result.4<0,1,0>:w	{align1};
+SAVE_RET	{align1};
+ jmpi	(1)	word_imedian;
+mov	(1)	mb_mvp_ref.0<1>:w		RET_ARG<0,1,0>:w	{align1};
+mov	(1)	INPUT_ARG0.0<1>:w	mba_result.6<0,1,0>:w	{align1};
+mov	(1)	INPUT_ARG0.4<1>:w	mbb_result.6<0,1,0>:w	{align1};
+mov	(1)	INPUT_ARG0.8<1>:w	mbc_result.6<0,1,0>:w	{align1};
+SAVE_RET	{align1};
+jmpi	(1)	word_imedian; 
+mov	(1)	mb_mvp_ref.2<1>:w		RET_ARG<0,1,0>:w	{align1};
+
+__mb_hwdep_end:
+asr	(2)	mb_ref_win.0<1>:w	mb_mvp_ref.0<2,2,1>:w	2:w	{align1};
+add	(2)	mb_ref_win.8<1>:w	mb_ref_win.0<2,2,1>:w	3:w	{align1};
+and	(2)	mb_ref_win.16<1>:uw	mb_ref_win.8<2,2,1>:uw	0xFFFC:uw {align1};
+/* m2, get the MV/Mb cost passed from constant buffer when
+spawning thread by MEDIA_OBJECT */       
+mov (8) vme_m2<1>:UD            r1.0<8,8,1>:UD {align1};
+
+mov (8) vme_msg_2<1>:UD		vme_m2.0<8,8,1>:UD {align1};
+
+/* m3 FWD/BWD cost center*/
+mov (8) vme_msg_3<1>:UD		0x0:UD {align1};	        
+
+/* m4 skip center*/
+mov (8) vme_msg_4<1>:UD		0x0:UD {align1};	       
+
+/* m5 */ 
+mov  (1) INEP_ROW.0<1>:UD       0x0:UD {align1};
+and  (1) INEP_ROW.4<1>:UD       INEP_ROW.4<0,1,0>:UD            0xFF000000:UD {align1};
+mov  (8) vme_msg_5<1>:UD         INEP_ROW.0<8,8,1>:UD {align1};
+
+
+/* Use the Luma mode */
+mov  (1) tmp_reg0.0<1>:UW	LUMA_INTRA_MODE:UW {align1};
+mov  (1) vme_msg_5.5<1>:UB	tmp_reg0.0<0,1,0>:UB {align1};
+
+/* m6 */        
+mov  (8) vme_msg_6<1>:UD         0x0:UD {align1};
+mov (16) vme_msg_6.0<1>:UB       INEP_COL0.3<32,8,4>:UB {align1};
+mov  (1) vme_msg_6.16<1>:UD      INTRA_PREDICTORE_MODE {align1};
+
+/* the penalty for Intra mode */
+mov  (1) vme_msg_6.28<1>:UD	0x010101:UD {align1};
+mov  (1) vme_msg_6.20<1>:UW      CHROMA_ROW.6<0,1,0>:UW {align1};
+
+
+/* m7 */
+
+mov  (4) vme_msg_7.16<1>:UD      CHROMA_ROW.8<4,4,1>:UD {align1};
+mov  (8) vme_msg_7.0<1>:UW       CHROMA_COL.2<16,8,2>:UW {align1};
+
+/*
+ * SIC VME message
+ */
+
+/* m1 */
+mov  (1) intra_flag<1>:UW       0x0:UW {align1};
+mov  (1) intra_part_mask_ub<1>:UB  LUMA_INTRA_8x8_DISABLE {align1}; /* vp8 don't support intra_8x8 mode*/
+
+/* assign MB intra struct from the thread payload*/
+mov (1) mb_intra_struct_ub<1>:UB input_mb_intra_ub<0,1,0>:UB {align1}; 
+
+/* Disable DC HAAR component when calculating HAAR SATD block */
+mov  (1) tmp_reg0.0<1>:UW	DC_HARR_DISABLE:UW		{align1};
+mov  (1) vme_m1.30<1>:UB	tmp_reg0.0<0,1,0>:UB  {align1};
+mov  (8) vme_msg_1<1>:UD        vme_m1.0<8,8,1>:UD {align1};
+
+/* m0 */        
+mov  (1) vme_m0.12<1>:UD        INTRA_SAD_HAAR:UD {align1};    /* 16x16 Source, Intra_harr */
+mov  (8) vme_msg_0.0<1>:UD      vme_m0.0<8,8,1>:UD {align1};
+
+/* after verification it will be passed by using payload */
+send (8)
+        vme_msg_ind
+        vme_wb<1>:UD
+        null
+        cre(
+                BIND_IDX_VME,
+                VME_SIC_MESSAGE_TYPE
+        )
+        mlen sic_vme_msg_length
+        rlen vme_wb_length
+        {align1};
+/*
+ * Oword Block Write message
+ */
+mov  (8) msg_reg0.0<1>:UD       obw_m0<8,8,1>:UD {align1};
+        
+mov  (1) msg_reg1.0<1>:UD       vme_wb.0<0,1,0>:UD      {align1};
+mov  (1) msg_reg1.4<1>:UD       vme_wb.16<0,1,0>:UD     {align1};
+mov  (1) msg_reg1.8<1>:UD       vme_wb.20<0,1,0>:UD     {align1};
+mov  (1) msg_reg1.12<1>:UD      vme_wb.24<0,1,0>:UD     {align1};
+
+/* Distortion, Intra (17-16), */
+mov  (1) msg_reg1.16<1>:UW      vme_wb.12<0,1,0>:UW     {align1};
+
+mov  (1) msg_reg1.20<1>:UD      vme_wb.8<0,1,0>:UD     {align1};
+/* VME clock counts */
+mov  (1) msg_reg1.24<1>:UD      vme_wb.28<0,1,0>:UD     {align1};
+
+mov  (1) msg_reg1.28<1>:UD      obw_m0.8<0,1,0>:UD     {align1};
+
+/* bind index 3, write 2 oword (32bytes), msg type: 8(OWord Block Write) */
+send (16)
+        msg_ind
+        obw_wb
+        null
+        data_port(
+                OBW_CACHE_TYPE,
+                OBW_MESSAGE_TYPE,
+                OBW_CONTROL_2,
+                OBW_BIND_IDX,
+                OBW_WRITE_COMMIT_CATEGORY,
+                OBW_HEADER_PRESENT
+        )
+        mlen 2
+        rlen obw_wb_length
+        {align1};
+
+/* IME search */
+mov  (1) vme_m0.12<1>:UD        SEARCH_CTRL_SINGLE + VP8_INTER_PART_MASK + INTER_SAD_HAAR:UD {align1};    /* 16x16 Source, harr */
+mov  (1) vme_m0.22<1>:UW        REF_REGION_SIZE {align1};         /* Reference Width&Height, 48x40 */
+
+mov  (1) vme_m0.0<1>:UD		vme_m0.8<0,1,0>:UD      {align1};
+
+add  (1) vme_m0.0<1>:W          vme_m0.0<0,1,0>:W -16:W {align1};		/* Reference = (x-16,y-12)-(x+32,y+28) */
+add  (1) vme_m0.2<1>:W          vme_m0.2<0,1,0>:W -12:W {align1};
+
+mov  (1) vme_m0.0<1>:W		-16:W			{align1};
+mov  (1) vme_m0.2<1>:W		-12:W			{align1};
+
+mov  (1) vme_m0.4<1>:UD		vme_m0.0<0,1,0>:UD	{align1};
+
+and.z.f0.0 (1)		null:uw	input_mb_intra_ub<0,1,0>:ub	INTRA_PRED_AVAIL_FLAG_AE:uw   {align1};
+(f0.0)	add 	(1)	vme_m0.0<1>:w	vme_m0.0<0,1,0>:w	12:w	{align1};
+and.z.f0.0 (1)		null:uw	input_mb_intra_ub<0,1,0>:ub	INTRA_PRED_AVAIL_FLAG_B:uw   {align1};
+(f0.0)	add 	(1)	vme_m0.2<1>:w	vme_m0.2<0,1,0>:w	8:w	{align1};
+ 
+add  (2) vme_m0.0<1>:w		vme_m0.0<2,2,1>:w	mb_ref_win.16<2,2,1>:w	{align1};
+add  (2) vme_m0.4<1>:w		vme_m0.4<2,2,1>:w	mb_ref_win.16<2,2,1>:w	{align1};
+mov  (8) vme_msg_0.0<1>:UD      vme_m0.0<8,8,1>:UD {align1};
+
+mov  (1) vme_m1.0<1>:UD         ADAPTIVE_SEARCH_ENABLE:ud {align1} ;
+/* the Max MV number is passed by constant buffer */
+mov  (1) vme_m1.4<1>:UB         r4.28<0,1,0>:UB {align1};          
+mov  (1) vme_m1.8<1>:UD         START_CENTER + SEARCH_PATH_LEN:UD {align1};
+mov  (8) vme_msg_1.0<1>:UD      vme_m1.0<8,8,1>:UD {align1};
+
+/* Setup the Cost center */
+/* currently four 8x8 share the same cost center */
+mov  (4) vme_m3.0<2>:ud		mb_mvp_ref.0<0,1,0>:ud	{align1};
+mov  (4) vme_m3.4<2>:ud		mb_mvp_ref.0<0,1,0>:ud	{align1};
+
+mov (8) vme_msg_3<1>:UD		vme_m3.0<8,8,1>:UD {align1};
+mov (8) vme_msg_2<1>:UD		vme_m2.0<8,8,1>:UD {align1};
+
+/* M4/M5 search path */
+mov  (1) vme_msg_4.0<1>:UD	0x01010101:UD {align1};
+mov  (1) vme_msg_4.4<1>:UD	0x10010101:UD {align1};
+mov  (1) vme_msg_4.8<1>:UD	0x0F0F0F0F:UD {align1};
+mov  (1) vme_msg_4.12<1>:UD	0x100F0F0F:UD {align1};
+mov  (1) vme_msg_4.16<1>:UD	0x01010101:UD {align1};
+mov  (1) vme_msg_4.20<1>:UD	0x10010101:UD {align1};
+mov  (1) vme_msg_4.24<1>:UD	0x0F0F0F0F:UD {align1};
+mov  (1) vme_msg_4.28<1>:UD	0x100F0F0F:UD {align1};
+
+mov  (1) vme_msg_5.0<1>:UD	0x01010101:UD {align1};
+mov  (1) vme_msg_5.4<1>:UD	0x10010101:UD {align1};
+mov  (1) vme_msg_5.8<1>:UD	0x0F0F0F0F:UD {align1};
+mov  (1) vme_msg_5.12<1>:UD	0x000F0F0F:UD {align1};
+
+mov  (4) vme_msg_5.16<1>:UD	0x0:UD {align1};
+
+send (8)
+        vme_msg_ind
+        vme_wb<1>:UD
+        null
+        vme(
+                BIND_IDX_VME,
+                0,
+                0,
+                VME_IME_MESSAGE_TYPE
+        )
+        mlen ime_vme_msg_length
+        rlen vme_wb_length {align1};
+
+/* Set Macroblock-shape/mode for FBR */
+
+mov  (1) vme_m2.20<1>:UD	0x0:UD {align1};
+mov  (1) vme_m2.21<1>:UB	vme_wb.25<0,1,0>:UB	{align1};
+mov  (1) vme_m2.22<1>:UB	vme_wb.26<0,1,0>:UB	{align1};
+
+and  (1) tmp_reg0.0<1>:UW	vme_wb.0<0,1,0>:UW	0x03:UW {align1};
+mov  (1) vme_m2.20<1>:UB	tmp_reg0.0<0,1,0>:UB    {align1};
+
+/* Send FBR message into CRE */
+
+mov  (8) vme_msg_4.0<1>:UD       vme_wb1.0<8,8,1>:UD {align1};
+mov  (8) vme_msg_5.0<1>:ud       vme_wb2.0<8,8,1>:ud {align1};
+mov  (8) vme_msg_6.0<1>:ud       vme_wb3.0<8,8,1>:ud {align1};
+mov  (8) vme_msg_7.0<1>:ud       vme_wb4.0<8,8,1>:ud {align1};                
+
+mov  (1) vme_m0.12<1>:UD	INTER_SAD_HAAR + SUB_PEL_MODE_QUARTER + FBR_BME_DISABLE:UD {align1};    /* 16x16 Source, 1/4 pixel, harr, BME disable */
+mov  (8) vme_msg_0.0<1>:UD	vme_m0.0<8,8,1>:UD  {align1};
+mov  (8) vme_msg_1.0<1>:UD	vme_m1.0<8,8,1>:UD  {align1};
+
+mov  (8) vme_msg_2.0<1>:UD		vme_m2.0<8,8,1>:UD	{align1};
+mov  (8) vme_msg_3.0<1>:UD		vme_m3.0<8,8,1>:UD	{align1};
+
+/* after verification it will be passed by using payload */
+send (8)
+        vme_msg_ind
+        vme_wb<1>:UD
+        null
+        cre(
+                BIND_IDX_VME,
+                VME_FBR_MESSAGE_TYPE
+        )
+        mlen fbr_vme_msg_length
+        rlen vme_wb_length
+        {align1};
+
+add  (1) obw_m0.8<1>:UD         obw_m0.8<0,1,0>:UD 0x02:UD {align1};
+mov  (8) msg_reg0.0<1>:UD       obw_m0<8,8,1>:UD {align1};
+/* write FME info */
+mov  (1) msg_reg1.0<1>:UD       vme_wb.0<0,1,0>:UD      {align1};
+
+mov  (1) msg_reg1.4<1>:UD       vme_wb.24<0,1,0>:UD     {align1};
+/* Inter distortion of FME */
+mov  (1) msg_reg1.8<1>:UD       vme_wb.8<0,1,0>:UD     {align1};
+
+mov  (1) msg_reg1.12<1>:UD	vme_m2.20<0,1,0>:UD {align1};
+
+/* bind index 3, write 1 oword (16bytes), msg type: 8(OWord Block Write) */
+send (16)
+        msg_ind
+        obw_wb
+        null
+        data_port(
+                OBW_CACHE_TYPE,
+                OBW_MESSAGE_TYPE,
+                OBW_CONTROL_0,
+                OBW_BIND_IDX,
+                OBW_WRITE_COMMIT_CATEGORY,
+                OBW_HEADER_PRESENT
+        )
+        mlen 2
+        rlen obw_wb_length
+        {align1};
+
+/* Write FME/BME MV */
+add  (1) obw_m0.8<1>:UD         obw_m0.8<0,1,0>:UD 0x01:UD {align1};
+mov  (8) msg_reg0.0<1>:UD       obw_m0.0<8,8,1>:UD {align1};
+
+
+mov  (8) msg_reg1.0<1>:UD       vme_wb1.0<8,8,1>:UD {align1};
+mov  (8) msg_reg2.0<1>:ud       vme_wb2.0<8,8,1>:ud {align1};
+mov  (8) msg_reg3.0<1>:ud       vme_wb3.0<8,8,1>:ud {align1};
+mov  (8) msg_reg4.0<1>:ud       vme_wb4.0<8,8,1>:ud {align1};                
+/* bind index 3, write  8 oword (128 bytes), msg type: 8(OWord Block Write) */
+send (16)
+        msg_ind
+        obw_wb
+        null
+        data_port(
+                OBW_CACHE_TYPE,
+                OBW_MESSAGE_TYPE,
+                OBW_CONTROL_8,
+                OBW_BIND_IDX,
+                OBW_WRITE_COMMIT_CATEGORY,
+                OBW_HEADER_PRESENT
+        )
+        mlen 5
+        rlen obw_wb_length
+        {align1};
+
+/* Write FME/BME RefID */
+add  (1) obw_m0.8<1>:UD         obw_m0.8<0,1,0>:UD 0x08:UD {align1};
+mov  (8) msg_reg0.0<1>:UD       obw_m0<8,8,1>:UD {align1};
+
+mov  (8) msg_reg1.0<1>:UD	vme_wb6.0<8,8,1>:UD {align1};
+
+/* bind index 3, write 2 oword (32bytes), msg type: 8(OWord Block Write) */
+send (16)
+        msg_ind
+        obw_wb
+        null
+        data_port(
+                OBW_CACHE_TYPE,
+                OBW_MESSAGE_TYPE,
+                OBW_CONTROL_2,
+                OBW_BIND_IDX,
+                OBW_WRITE_COMMIT_CATEGORY,
+                OBW_HEADER_PRESENT
+        )
+        mlen 2
+        rlen obw_wb_length
+        {align1};
+
+/* Issue message fence so that the previous write message is committed */
+send (16)
+        mb_ind
+        mb_wb.0<1>:ud
+	NULL
+        data_port(
+                OBR_CACHE_TYPE,
+                OBR_MESSAGE_FENCE,
+                OBR_MF_COMMIT,
+                OBR_BIND_IDX,
+                OBR_WRITE_COMMIT_CATEGORY,
+                OBR_HEADER_PRESENT
+        )
+        mlen 1
+        rlen 1
+        {align1};
+
+__EXIT: 
+/*
+ * kill thread
+ */        
+mov  (8) ts_msg_reg0<1>:UD         r0<8,8,1>:UD {align1};
+send (16) ts_msg_ind acc0<1>UW null thread_spawner(0, 0, 1) mlen 1 rlen 0 {align1 EOT};
+
+
+	nop		;
+	nop		;
+/* word_imin: RET_ARG = min of the three word (:w) inputs INPUT_ARG0.0, INPUT_ARG0.4 and INPUT_ARG0.8. Uses TEMP_VAR0.0 and flag f0.0; leaves through RETURN, so the caller must run SAVE_RET first. */
+word_imin:
+	cmp.le.f0.0 (1)		null:w		INPUT_ARG0.0<0,1,0>:w	INPUT_ARG0.4<0,1,0>:w {align1}; /* f0.0 = (arg0 <= arg1) */
+	(f0.0) mov  (1)		TEMP_VAR0.0<1>:w INPUT_ARG0.0<0,1,0>:w			  {align1}; /* TEMP_VAR0.0 = smaller of arg0/arg1 */
+	(-f0.0) mov (1)		TEMP_VAR0.0<1>:w INPUT_ARG0.4<0,1,0>:w			  {align1};
+	cmp.le.f0.0 (1)		null:w		TEMP_VAR0.0<0,1,0>:w	INPUT_ARG0.8<0,1,0>:w {align1}; /* f0.0 = (TEMP_VAR0.0 <= arg2) */
+	(f0.0) mov  (1)		RET_ARG<1>:w TEMP_VAR0.0<0,1,0>:w			  {align1}; /* RET_ARG = smaller of TEMP_VAR0.0/arg2 */
+	(-f0.0) mov (1)		RET_ARG<1>:w INPUT_ARG0.8<0,1,0>:w			  {align1};
+	RETURN		{align1};	
+	
+/* word_imax: RET_ARG = max of the three word (:w) inputs INPUT_ARG0.0, INPUT_ARG0.4 and INPUT_ARG0.8. Uses TEMP_VAR0.0 and flag f0.0; leaves through RETURN, so the caller must run SAVE_RET first. */
+word_imax:
+	cmp.ge.f0.0 (1)		null:w		INPUT_ARG0.0<0,1,0>:w	INPUT_ARG0.4<0,1,0>:w {align1}; /* f0.0 = (arg0 >= arg1) */
+	(f0.0) mov  (1)		TEMP_VAR0.0<1>:w INPUT_ARG0.0<0,1,0>:w			  {align1}; /* TEMP_VAR0.0 = larger of arg0/arg1 */
+	(-f0.0) mov (1)		TEMP_VAR0.0<1>:w INPUT_ARG0.4<0,1,0>:w			  {align1};
+	cmp.ge.f0.0 (1)		null:w		TEMP_VAR0.0<0,1,0>:w	INPUT_ARG0.8<0,1,0>:w {align1}; /* f0.0 = (TEMP_VAR0.0 >= arg2) */
+	(f0.0) mov  (1)		RET_ARG<1>:w TEMP_VAR0.0<0,1,0>:w			  {align1}; /* RET_ARG = larger of TEMP_VAR0.0/arg2 */
+	(-f0.0) mov (1)		RET_ARG<1>:w INPUT_ARG0.8<0,1,0>:w			  {align1};
+	RETURN		{align1};	
+/* word_imedian: RET_ARG = median of the three word (:w) inputs a = INPUT_ARG0.0, b = INPUT_ARG0.4, c = INPUT_ARG0.8. Uses flag f0.0; leaves through RETURN, so the caller must run SAVE_RET first. */
+word_imedian:
+	cmp.ge.f0.0 (1) null:w INPUT_ARG0.0<0,1,0>:w INPUT_ARG0.4<0,1,0>:w {align1}; /* a >= b ? */
+	(f0.0)	jmpi (1) cmp_a_ge_b;
+	cmp.ge.f0.0 (1) null:w INPUT_ARG0.0<0,1,0>:w INPUT_ARG0.8<0,1,0>:w {align1}; /* here a < b: a >= c ? */
+	(f0.0) mov (1) RET_ARG<1>:w INPUT_ARG0.0<0,1,0>:w {align1}; /* c <= a < b: median is a */
+	(f0.0) jmpi (1) cmp_end;
+	cmp.ge.f0.0 (1) null:w INPUT_ARG0.4<0,1,0>:w INPUT_ARG0.8<0,1,0>:w {align1}; /* a is the smallest: b >= c ? */
+	(f0.0) mov (1) RET_ARG<1>:w INPUT_ARG0.8<0,1,0>:w {align1}; /* a < c <= b: median is c */
+	(-f0.0) mov (1) RET_ARG<1>:w INPUT_ARG0.4<0,1,0>:w {align1}; /* a < b < c: median is b */
+	jmpi (1) cmp_end;
+cmp_a_ge_b:
+	cmp.ge.f0.0 (1) null:w INPUT_ARG0.4<0,1,0>:w INPUT_ARG0.8<0,1,0>:w {align1}; /* here a >= b: b >= c ? */
+	(f0.0) mov (1) RET_ARG<1>:w INPUT_ARG0.4<0,1,0>:w {align1}; /* a >= b >= c: median is b */
+	(f0.0) jmpi (1) cmp_end;
+	cmp.ge.f0.0 (1) null:w INPUT_ARG0.0<0,1,0>:w INPUT_ARG0.8<0,1,0>:w {align1}; /* b is the smallest: a >= c ? */
+	(f0.0) mov (1) RET_ARG<1>:w INPUT_ARG0.8<0,1,0>:w {align1}; /* b < c <= a: median is c */
+	(-f0.0) mov (1) RET_ARG<1>:w INPUT_ARG0.0<0,1,0>:w {align1}; /* b <= a < c: median is a */
+cmp_end:
+ 	RETURN	{align1};
+
diff --git a/src/shaders/vme/vp8_inter_frame_gen8.g8a b/src/shaders/vme/vp8_inter_frame_gen8.g8a
new file mode 100644
index 0000000..3b72c1c
--- /dev/null
+++ b/src/shaders/vme/vp8_inter_frame_gen8.g8a
@@ -0,0 +1,2 @@
+#include "vme8.inc"
+#include "vp8_inter_frame_gen8.asm"
diff --git a/src/shaders/vme/vp8_inter_frame_gen8.g8b b/src/shaders/vme/vp8_inter_frame_gen8.g8b
new file mode 100644
index 0000000..6377aae
--- /dev/null
+++ b/src/shaders/vme/vp8_inter_frame_gen8.g8b
@@ -0,0 +1,299 @@
+   { 0x00800001, 0x24000608, 0x00000000, 0x00000000 },
+   { 0x00800001, 0x24400608, 0x00000000, 0x00000000 },
+   { 0x00800001, 0x24800608, 0x00000000, 0x00000000 },
+   { 0x00800001, 0x24c00608, 0x00000000, 0x00000000 },
+   { 0x00200009, 0x24002228, 0x164500a0, 0x00040004 },
+   { 0x00000040, 0x24000a28, 0x1e000400, 0xfff8fff8 },
+   { 0x00000040, 0x24040a28, 0x1e000404, 0xffffffff },
+   { 0x00000001, 0x24080e08, 0x08000000, 0x0000001f },
+   { 0x00000001, 0x24142288, 0x00000014, 0x00000000 },
+   { 0x00200009, 0x24202228, 0x164500a0, 0x00040004 },
+   { 0x00000040, 0x24200a28, 0x1e000420, 0xfffcfffc },
+   { 0x00000001, 0x24280e08, 0x08000000, 0x000f0003 },
+   { 0x00000001, 0x24342288, 0x00000014, 0x00000000 },
+   { 0x00200009, 0x24482248, 0x164500a0, 0x00040004 },
+   { 0x00000001, 0x24542288, 0x00000014, 0x00000000 },
+   { 0x00000041, 0x24881208, 0x220000a2, 0x000000a1 },
+   { 0x00000040, 0x24880208, 0x22000488, 0x000000a0 },
+   { 0x00000041, 0x24880208, 0x06000488, 0x00000018 },
+   { 0x00000001, 0x24942288, 0x00000014, 0x00000000 },
+   { 0x00600001, 0x28000208, 0x008d0400, 0x00000000 },
+   { 0x04600031, 0x23800a88, 0x0e000800, 0x02190004 },
+   { 0x00600001, 0x28000208, 0x008d0420, 0x00000000 },
+   { 0x04600031, 0x23a00a88, 0x0e000800, 0x02290004 },
+   { 0x00200009, 0x24002228, 0x164500a0, 0x00030003 },
+   { 0x00000041, 0x24000a28, 0x1e000400, 0x00020002 },
+   { 0x00000040, 0x24000a28, 0x1e000400, 0xfff8fff8 },
+   { 0x00000040, 0x24040a28, 0x1e000404, 0xffffffff },
+   { 0x00600001, 0x28000208, 0x008d0400, 0x00000000 },
+   { 0x04600031, 0x26000a88, 0x0e000800, 0x02190006 },
+   { 0x00200009, 0x24202228, 0x164500a0, 0x00030003 },
+   { 0x00000041, 0x24200a28, 0x1e000420, 0x00020002 },
+   { 0x00000040, 0x24200a28, 0x1e000420, 0xfffcfffc },
+   { 0x00000001, 0x24280e08, 0x08000000, 0x00070003 },
+   { 0x00600001, 0x28000208, 0x008d0420, 0x00000000 },
+   { 0x04600031, 0x26200a88, 0x0e000800, 0x02190006 },
+   { 0x00600001, 0x2ac00608, 0x00000000, 0x00000000 },
+   { 0x00600001, 0x2a800608, 0x00000000, 0x00000000 },
+   { 0x01000005, 0x20001240, 0x160000a6, 0x00040004 },
+   { 0x00010020, 0x34000000, 0x0e001400, 0x00000750 },
+   { 0x00600001, 0x2ae00608, 0x00000000, 0x00000000 },
+   { 0x00600001, 0x2b000608, 0x00000000, 0x00000000 },
+   { 0x00600001, 0x2b200608, 0x00000000, 0x00000000 },
+   { 0x00600001, 0x2b400608, 0x00000000, 0x00000000 },
+   { 0x01000005, 0x20002240, 0x160000a5, 0x00600060 },
+   { 0x00210001, 0x2af41e68, 0x18000000, 0xffffffff },
+   { 0x00010020, 0x34000000, 0x0e001400, 0x000000f0 },
+   { 0x00000001, 0x2ae00e28, 0x08000000, 0x00000001 },
+   { 0x00200001, 0x24002248, 0x004500a0, 0x00000000 },
+   { 0x00000040, 0x24001a68, 0x1e000400, 0xffffffff },
+   { 0x00000041, 0x2b481208, 0x120000a2, 0x00000402 },
+   { 0x00000040, 0x2b480208, 0x12000b48, 0x00000400 },
+   { 0x00000041, 0x2b480208, 0x06000b48, 0x00000018 },
+   { 0x00000001, 0x2b542288, 0x00000014, 0x00000000 },
+   { 0x0a800031, 0x2b600a08, 0x0e000b40, 0x02280303 },
+   { 0x05000010, 0x20001260, 0x12000b70, 0x00000b88 },
+   { 0x00210001, 0x2af41e68, 0x18000000, 0xffffffff },
+   { 0x00010020, 0x34000000, 0x0e001400, 0x00000040 },
+   { 0x00000040, 0x2b480208, 0x06000b48, 0x00000003 },
+   { 0x0a800031, 0x2ba00a08, 0x0e000b40, 0x02480403 },
+   { 0x00200001, 0x2ae40208, 0x00450bc8, 0x00000000 },
+   { 0x00000001, 0x2af01e68, 0x18000000, 0x00010001 },
+   { 0x00600001, 0x2b400608, 0x00000000, 0x00000000 },
+   { 0x01000005, 0x20002240, 0x160000a5, 0x00100010 },
+   { 0x00210001, 0x2b141e68, 0x18000000, 0xffffffff },
+   { 0x00210001, 0x2b341e68, 0x18000000, 0xffffffff },
+   { 0x00010020, 0x34000000, 0x0e001400, 0x00000360 },
+   { 0x00000001, 0x2b000e28, 0x08000000, 0x00000001 },
+   { 0x00200001, 0x24002248, 0x004500a0, 0x00000000 },
+   { 0x00000040, 0x24021a68, 0x1e000402, 0xffffffff },
+   { 0x00000041, 0x2b481208, 0x120000a2, 0x00000402 },
+   { 0x00000040, 0x2b480208, 0x12000b48, 0x00000400 },
+   { 0x00000041, 0x2b480208, 0x06000b48, 0x00000018 },
+   { 0x00000001, 0x2b542288, 0x00000014, 0x00000000 },
+   { 0x0a800031, 0x2b600a08, 0x0e000b40, 0x02280303 },
+   { 0x05000010, 0x20001260, 0x12000b70, 0x00000b88 },
+   { 0x00210001, 0x2b141e68, 0x18000000, 0xffffffff },
+   { 0x00010020, 0x34000000, 0x0e001400, 0x00000040 },
+   { 0x00000040, 0x2b480208, 0x06000b48, 0x00000003 },
+   { 0x0a800031, 0x2ba00a08, 0x0e000b40, 0x02480403 },
+   { 0x00200001, 0x2b040208, 0x00450bf0, 0x00000000 },
+   { 0x00000001, 0x2b101e68, 0x18000000, 0x00010001 },
+   { 0x00600001, 0x2b400608, 0x00000000, 0x00000000 },
+   { 0x01000005, 0x20002240, 0x160000a5, 0x00080008 },
+   { 0x00010020, 0x34000000, 0x0e001400, 0x00000110 },
+   { 0x00000001, 0x2b200e28, 0x08000000, 0x00000001 },
+   { 0x00200001, 0x24002248, 0x004500a0, 0x00000000 },
+   { 0x00000040, 0x24021a68, 0x1e000402, 0xffffffff },
+   { 0x00000040, 0x24001a68, 0x1e000400, 0x00010001 },
+   { 0x00000041, 0x2b481208, 0x120000a2, 0x00000402 },
+   { 0x00000040, 0x2b480208, 0x12000b48, 0x00000400 },
+   { 0x00000041, 0x2b480208, 0x06000b48, 0x00000018 },
+   { 0x00000001, 0x2b542288, 0x00000014, 0x00000000 },
+   { 0x0a800031, 0x2b600a08, 0x0e000b40, 0x02280303 },
+   { 0x05000010, 0x20001260, 0x12000b70, 0x00000b88 },
+   { 0x00210001, 0x2b341e68, 0x18000000, 0xffffffff },
+   { 0x00010020, 0x34000000, 0x0e001400, 0x00000180 },
+   { 0x00000040, 0x2b480208, 0x06000b48, 0x00000003 },
+   { 0x0a800031, 0x2ba00a08, 0x0e000b40, 0x02480403 },
+   { 0x00200001, 0x2b240208, 0x00450bf0, 0x00000000 },
+   { 0x00000001, 0x2b301e68, 0x18000000, 0x00010001 },
+   { 0x00000020, 0x34000000, 0x0e001400, 0x00000130 },
+   { 0x00600001, 0x2b400608, 0x00000000, 0x00000000 },
+   { 0x01000005, 0x20002240, 0x160000a5, 0x00040004 },
+   { 0x00210001, 0x2b341e68, 0x18000000, 0xffffffff },
+   { 0x00010020, 0x34000000, 0x0e001400, 0x000000f0 },
+   { 0x00000001, 0x2b200e28, 0x08000000, 0x00000001 },
+   { 0x00200001, 0x24002248, 0x004500a0, 0x00000000 },
+   { 0x00200040, 0x24001a68, 0x1e450400, 0xffffffff },
+   { 0x00000041, 0x2b481208, 0x120000a2, 0x00000402 },
+   { 0x00000040, 0x2b480208, 0x12000b48, 0x00000400 },
+   { 0x00000041, 0x2b480208, 0x06000b48, 0x00000018 },
+   { 0x00000001, 0x2b542288, 0x00000014, 0x00000000 },
+   { 0x0a800031, 0x2b600a08, 0x0e000b40, 0x02280303 },
+   { 0x05000010, 0x20001260, 0x12000b70, 0x00000b88 },
+   { 0x00210001, 0x2b341e68, 0x18000000, 0xffffffff },
+   { 0x00010020, 0x34000000, 0x0e001400, 0x00000040 },
+   { 0x00000040, 0x2b480208, 0x06000b48, 0x00000003 },
+   { 0x0a800031, 0x2ba00a88, 0x0e000b40, 0x02480403 },
+   { 0x00200001, 0x2b240208, 0x00450c18, 0x00000000 },
+   { 0x00000001, 0x2b301e68, 0x18000000, 0x00010001 },
+   { 0x00000040, 0x24000a28, 0x0a000b00, 0x00000b20 },
+   { 0x01000010, 0x20000a20, 0x0e000400, 0x00000000 },
+   { 0x00110020, 0x34000000, 0x0e001400, 0x00000080 },
+   { 0x02000010, 0x20000a20, 0x0e000ae0, 0x00000000 },
+   { 0x00010001, 0x2b040208, 0x00000ae4, 0x00000000 },
+   { 0x00010001, 0x2b240208, 0x00000ae4, 0x00000000 },
+   { 0x00010001, 0x2b141248, 0x00000af4, 0x00000000 },
+   { 0x00010001, 0x2b341248, 0x00000af4, 0x00000000 },
+   { 0x00010001, 0x2ac00208, 0x00000ae4, 0x00000000 },
+   { 0x00110001, 0x2ac00608, 0x00000000, 0x00000000 },
+   { 0x00000020, 0x34000000, 0x0e001400, 0x00000190 },
+   { 0x00600001, 0x24000608, 0x00000000, 0x00000000 },
+   { 0x01000010, 0x20001a20, 0x1e000af4, 0x00000000 },
+   { 0x00010040, 0x24001a68, 0x1e000400, 0x00010001 },
+   { 0x00010001, 0x24040208, 0x00000ae4, 0x00000000 },
+   { 0x01000010, 0x20001a20, 0x1e000b14, 0x00000000 },
+   { 0x00010040, 0x24001a68, 0x1e000400, 0x00010001 },
+   { 0x00010001, 0x24040208, 0x00000b04, 0x00000000 },
+   { 0x01000010, 0x20001a20, 0x1e000b34, 0x00000000 },
+   { 0x00010040, 0x24001a68, 0x1e000400, 0x00010001 },
+   { 0x00010001, 0x24040208, 0x00000b24, 0x00000000 },
+   { 0x01000010, 0x20001a20, 0x1e000400, 0x00010001 },
+   { 0x00010001, 0x2ac00208, 0x00000404, 0x00000000 },
+   { 0x00010020, 0x34000000, 0x0e001400, 0x000000c0 },
+   { 0x00000001, 0x2fa01a68, 0x00000ae4, 0x00000000 },
+   { 0x00000001, 0x2fa41a68, 0x00000b04, 0x00000000 },
+   { 0x00000001, 0x2fa81a68, 0x00000b24, 0x00000000 },
+   { 0x00000040, 0x2fe00008, 0x06001400, 0x00000020 },
+   { 0x00000020, 0x34000000, 0x0e001400, 0x00000860 },
+   { 0x00000001, 0x2ac01a68, 0x00000fe4, 0x00000000 },
+   { 0x00000001, 0x2fa01a68, 0x00000ae6, 0x00000000 },
+   { 0x00000001, 0x2fa41a68, 0x00000b06, 0x00000000 },
+   { 0x00000001, 0x2fa81a68, 0x00000b26, 0x00000000 },
+   { 0x00000040, 0x2fe00008, 0x06001400, 0x00000020 },
+   { 0x00000020, 0x34000000, 0x0e001400, 0x00000800 },
+   { 0x00000001, 0x2ac21a68, 0x00000fe4, 0x00000000 },
+   { 0x0020000c, 0x2a801a68, 0x1e450ac0, 0x00020002 },
+   { 0x00200040, 0x2a881a68, 0x1e450a80, 0x00030003 },
+   { 0x00200005, 0x2a901248, 0x16450a88, 0xfffcfffc },
+   { 0x00600001, 0x25600208, 0x008d0020, 0x00000000 },
+   { 0x00600001, 0x28400208, 0x008d0560, 0x00000000 },
+   { 0x00600001, 0x28600608, 0x00000000, 0x00000000 },
+   { 0x00600001, 0x28800608, 0x00000000, 0x00000000 },
+   { 0x00000001, 0x23800608, 0x00000000, 0x00000000 },
+   { 0x00000005, 0x23840208, 0x06000384, 0xff000000 },
+   { 0x00600001, 0x28a00208, 0x008d0380, 0x00000000 },
+   { 0x00000001, 0x24001648, 0x10000000, 0x00010001 },
+   { 0x00000001, 0x28a52288, 0x00000400, 0x00000000 },
+   { 0x00600001, 0x28c00608, 0x00000000, 0x00000000 },
+   { 0x00800001, 0x28c02288, 0x00cf03a3, 0x00000000 },
+   { 0x00000001, 0x28d00608, 0x00000000, 0x11111111 },
+   { 0x00000001, 0x28dc0608, 0x00000000, 0x00010101 },
+   { 0x00000001, 0x28d41248, 0x00000606, 0x00000000 },
+   { 0x00400001, 0x28f00208, 0x00690608, 0x00000000 },
+   { 0x00600001, 0x28e01248, 0x00ae0622, 0x00000000 },
+   { 0x00000001, 0x247c1648, 0x10000000, 0x00000000 },
+   { 0x00000001, 0x247c0e88, 0x08000000, 0x00000002 },
+   { 0x00000001, 0x247d2288, 0x000000a5, 0x00000000 },
+   { 0x00000001, 0x24001648, 0x10000000, 0x00200020 },
+   { 0x00000001, 0x247e2288, 0x00000400, 0x00000000 },
+   { 0x00600001, 0x28200208, 0x008d0460, 0x00000000 },
+   { 0x00000001, 0x244c0608, 0x00000000, 0x00800000 },
+   { 0x00600001, 0x28000208, 0x008d0440, 0x00000000 },
+   { 0x0d600031, 0x21800a08, 0x0e000800, 0x10782000 },
+   { 0x00600001, 0x28000208, 0x008d0480, 0x00000000 },
+   { 0x00000001, 0x28200208, 0x00000180, 0x00000000 },
+   { 0x00000001, 0x28240208, 0x00000190, 0x00000000 },
+   { 0x00000001, 0x28280208, 0x00000194, 0x00000000 },
+   { 0x00000001, 0x282c0208, 0x00000198, 0x00000000 },
+   { 0x00000001, 0x28301248, 0x0000018c, 0x00000000 },
+   { 0x00000001, 0x28340208, 0x00000188, 0x00000000 },
+   { 0x00000001, 0x28380208, 0x0000019c, 0x00000000 },
+   { 0x00000001, 0x283c0208, 0x00000488, 0x00000000 },
+   { 0x0a800031, 0x20000a60, 0x0e000800, 0x040a0203 },
+   { 0x00000001, 0x244c0608, 0x00000000, 0x7e200000 },
+   { 0x00000001, 0x24561648, 0x10000000, 0x28302830 },
+   { 0x00000001, 0x24400208, 0x00000448, 0x00000000 },
+   { 0x00000040, 0x24401a68, 0x1e000440, 0xfff0fff0 },
+   { 0x00000040, 0x24421a68, 0x1e000442, 0xfff4fff4 },
+   { 0x00000001, 0x24401e68, 0x18000000, 0xfff0fff0 },
+   { 0x00000001, 0x24421e68, 0x18000000, 0xfff4fff4 },
+   { 0x00000001, 0x24440208, 0x00000440, 0x00000000 },
+   { 0x01000005, 0x20002240, 0x160000a5, 0x00600060 },
+   { 0x00010040, 0x24401a68, 0x1e000440, 0x000c000c },
+   { 0x01000005, 0x20002240, 0x160000a5, 0x00100010 },
+   { 0x00010040, 0x24421a68, 0x1e000442, 0x00080008 },
+   { 0x00200040, 0x24401a68, 0x1a450440, 0x00450a90 },
+   { 0x00200040, 0x24441a68, 0x1a450444, 0x00450a90 },
+   { 0x00600001, 0x28000208, 0x008d0440, 0x00000000 },
+   { 0x00000001, 0x24600608, 0x00000000, 0x00000002 },
+   { 0x00000001, 0x24642288, 0x0000009c, 0x00000000 },
+   { 0x00000001, 0x24680608, 0x00000000, 0x30003030 },
+   { 0x00600001, 0x28200208, 0x008d0460, 0x00000000 },
+   { 0x00400001, 0x45800208, 0x00000ac0, 0x00000000 },
+   { 0x00400001, 0x45840208, 0x00000ac0, 0x00000000 },
+   { 0x00600001, 0x28600208, 0x008d0580, 0x00000000 },
+   { 0x00600001, 0x28400208, 0x008d0560, 0x00000000 },
+   { 0x00000001, 0x28800608, 0x00000000, 0x01010101 },
+   { 0x00000001, 0x28840608, 0x00000000, 0x10010101 },
+   { 0x00000001, 0x28880608, 0x00000000, 0x0f0f0f0f },
+   { 0x00000001, 0x288c0608, 0x00000000, 0x100f0f0f },
+   { 0x00000001, 0x28900608, 0x00000000, 0x01010101 },
+   { 0x00000001, 0x28940608, 0x00000000, 0x10010101 },
+   { 0x00000001, 0x28980608, 0x00000000, 0x0f0f0f0f },
+   { 0x00000001, 0x289c0608, 0x00000000, 0x100f0f0f },
+   { 0x00000001, 0x28a00608, 0x00000000, 0x01010101 },
+   { 0x00000001, 0x28a40608, 0x00000000, 0x10010101 },
+   { 0x00000001, 0x28a80608, 0x00000000, 0x0f0f0f0f },
+   { 0x00000001, 0x28ac0608, 0x00000000, 0x000f0f0f },
+   { 0x00400001, 0x28b00608, 0x00000000, 0x00000000 },
+   { 0x08600031, 0x21800a08, 0x0e000800, 0x0c784000 },
+   { 0x00000001, 0x25740608, 0x00000000, 0x00000000 },
+   { 0x00000001, 0x25752288, 0x00000199, 0x00000000 },
+   { 0x00000001, 0x25762288, 0x0000019a, 0x00000000 },
+   { 0x00000005, 0x24001248, 0x16000180, 0x00030003 },
+   { 0x00000001, 0x25742288, 0x00000400, 0x00000000 },
+   { 0x00600001, 0x28800208, 0x008d01a0, 0x00000000 },
+   { 0x00600001, 0x28a00208, 0x008d01c0, 0x00000000 },
+   { 0x00600001, 0x28c00208, 0x008d01e0, 0x00000000 },
+   { 0x00600001, 0x28e00208, 0x008d0200, 0x00000000 },
+   { 0x00000001, 0x244c0608, 0x00000000, 0x00243000 },
+   { 0x00600001, 0x28000208, 0x008d0440, 0x00000000 },
+   { 0x00600001, 0x28200208, 0x008d0460, 0x00000000 },
+   { 0x00600001, 0x28400208, 0x008d0560, 0x00000000 },
+   { 0x00600001, 0x28600208, 0x008d0580, 0x00000000 },
+   { 0x0d600031, 0x21800a08, 0x0e000800, 0x10786000 },
+   { 0x00000040, 0x24880208, 0x06000488, 0x00000002 },
+   { 0x00600001, 0x28000208, 0x008d0480, 0x00000000 },
+   { 0x00000001, 0x28200208, 0x00000180, 0x00000000 },
+   { 0x00000001, 0x28240208, 0x00000198, 0x00000000 },
+   { 0x00000001, 0x28280208, 0x00000188, 0x00000000 },
+   { 0x00000001, 0x282c0208, 0x00000574, 0x00000000 },
+   { 0x0a800031, 0x20000a60, 0x0e000800, 0x040a0003 },
+   { 0x00000040, 0x24880208, 0x06000488, 0x00000001 },
+   { 0x00600001, 0x28000208, 0x008d0480, 0x00000000 },
+   { 0x00600001, 0x28200208, 0x008d01a0, 0x00000000 },
+   { 0x00600001, 0x28400208, 0x008d01c0, 0x00000000 },
+   { 0x00600001, 0x28600208, 0x008d01e0, 0x00000000 },
+   { 0x00600001, 0x28800208, 0x008d0200, 0x00000000 },
+   { 0x0a800031, 0x20000a60, 0x0e000800, 0x0a0a0403 },
+   { 0x00000040, 0x24880208, 0x06000488, 0x00000008 },
+   { 0x00600001, 0x28000208, 0x008d0480, 0x00000000 },
+   { 0x00600001, 0x28200208, 0x008d0240, 0x00000000 },
+   { 0x0a800031, 0x20000a60, 0x0e000800, 0x040a0203 },
+   { 0x0a800031, 0x2b600a08, 0x0e000b40, 0x0219e003 },
+   { 0x00600001, 0x2e000208, 0x008d0000, 0x00000000 },
+   { 0x07800031, 0x24000a40, 0x0e000e00, 0x82000010 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+   { 0x06000010, 0x20001a60, 0x1a000fa0, 0x00000fa4 },
+   { 0x00010001, 0x2f601a68, 0x00000fa0, 0x00000000 },
+   { 0x00110001, 0x2f601a68, 0x00000fa4, 0x00000000 },
+   { 0x06000010, 0x20001a60, 0x1a000f60, 0x00000fa8 },
+   { 0x00010001, 0x2fe41a68, 0x00000f60, 0x00000000 },
+   { 0x00110001, 0x2fe41a68, 0x00000fa8, 0x00000000 },
+   { 0x00000001, 0x34000200, 0x00000fe0, 0x00000000 },
+   { 0x04000010, 0x20001a60, 0x1a000fa0, 0x00000fa4 },
+   { 0x00010001, 0x2f601a68, 0x00000fa0, 0x00000000 },
+   { 0x00110001, 0x2f601a68, 0x00000fa4, 0x00000000 },
+   { 0x04000010, 0x20001a60, 0x1a000f60, 0x00000fa8 },
+   { 0x00010001, 0x2fe41a68, 0x00000f60, 0x00000000 },
+   { 0x00110001, 0x2fe41a68, 0x00000fa8, 0x00000000 },
+   { 0x00000001, 0x34000200, 0x00000fe0, 0x00000000 },
+   { 0x04000010, 0x20001a60, 0x1a000fa0, 0x00000fa4 },
+   { 0x00010020, 0x34000000, 0x0e001400, 0x00000070 },
+   { 0x04000010, 0x20001a60, 0x1a000fa0, 0x00000fa8 },
+   { 0x00010001, 0x2fe41a68, 0x00000fa0, 0x00000000 },
+   { 0x00010020, 0x34000000, 0x0e001400, 0x000000a0 },
+   { 0x04000010, 0x20001a60, 0x1a000fa4, 0x00000fa8 },
+   { 0x00010001, 0x2fe41a68, 0x00000fa8, 0x00000000 },
+   { 0x00110001, 0x2fe41a68, 0x00000fa4, 0x00000000 },
+   { 0x00000020, 0x34000000, 0x0e001400, 0x00000060 },
+   { 0x04000010, 0x20001a60, 0x1a000fa4, 0x00000fa8 },
+   { 0x00010001, 0x2fe41a68, 0x00000fa4, 0x00000000 },
+   { 0x00010020, 0x34000000, 0x0e001400, 0x00000030 },
+   { 0x04000010, 0x20001a60, 0x1a000fa0, 0x00000fa8 },
+   { 0x00010001, 0x2fe41a68, 0x00000fa8, 0x00000000 },
+   { 0x00110001, 0x2fe41a68, 0x00000fa0, 0x00000000 },
+   { 0x00000001, 0x34000200, 0x00000fe0, 0x00000000 },
diff --git a/src/shaders/vme/vp8_intra_frame_gen8.asm b/src/shaders/vme/vp8_intra_frame_gen8.asm
new file mode 100644
index 0000000..f1e7891
--- /dev/null
+++ b/src/shaders/vme/vp8_intra_frame_gen8.asm
@@ -0,0 +1,200 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Zhao Yakui <yakui.zhao at intel.com>
+ *    Xiang Haihao <haihao.xiang at intel.com>
+ *    Li Zhong <zhong.li at intel.com>
+ *
+ */
+
+/*
+ * __INTRA_START: zero the temporaries, then fill in the read, VME and output-write headers
+ */
+__INTRA_START:
+mov  (16) tmp_reg0.0<1>:UD      0x0:UD {align1};
+mov  (16) tmp_reg2.0<1>:UD      0x0:UD {align1};
+mov  (16) tmp_reg4.0<1>:UD      0x0:UD {align1} ;
+mov  (16) tmp_reg6.0<1>:UD      0x0:UD {align1} ;
+
+shl  (2) read0_header.0<1>:D    orig_xy_ub<2,2,1>:UB 4:UW {align1};    /* (x, y) * 16 */
+add  (1) read0_header.0<1>:D    read0_header.0<0,1,0>:D -8:W {align1};     /* X offset */
+add  (1) read0_header.4<1>:D    read0_header.4<0,1,0>:D -1:W {align1};     /* Y offset */ 
+mov  (1) read0_header.8<1>:UD   BLOCK_32X1 {align1};                    /* block size, 32x1 going by the macro name */
+mov  (1) read0_header.20<1>:UB  thread_id_ub {align1};                  /* dispatch id */
+
+shl  (2) read1_header.0<1>:D    orig_xy_ub<2,2,1>:UB 4:UW {align1};    /* (x, y) * 16 */
+add  (1) read1_header.0<1>:D    read1_header.0<0,1,0>:D -4:W {align1};     /* X offset */
+mov  (1) read1_header.8<1>:UD   BLOCK_4X16 {align1};                    /* block size, 4x16 going by the macro name */
+mov  (1) read1_header.20<1>:UB  thread_id_ub {align1};                  /* dispatch id */
+        
+shl  (2) vme_m0.8<1>:UW         orig_xy_ub<2,2,1>:UB 4:UW {align1};    /* (x, y) * 16 */
+mov  (1) vme_m0.20<1>:UB        thread_id_ub {align1};                  /* dispatch id */
+
+mul  (1) obw_m0.8<1>:UD         w_in_mb_uw<0,1,0>:UW orig_y_ub<0,1,0>:UB {align1};	/* w_in_mb * y */
+add  (1) obw_m0.8<1>:UD         obw_m0.8<0,1,0>:UD orig_x_ub<0,1,0>:UB {align1};	/* + x: macroblock number in raster order */
+mul  (1) obw_m0.8<1>:UD         obw_m0.8<0,1,0>:UD 0x02:UD {align1};	/* * 2: output offset; presumably in owords, two per macroblock -- confirm */
+mov  (1) obw_m0.20<1>:UB        thread_id_ub {align1};                  /* dispatch id */
+        
+/*
+ * Media Read Message -- fetch Luma neighbor edge pixels
+ */
+/* ROW: read through read0_header (X offset -8, Y offset -1) into INEP_ROW */
+mov  (8) msg_reg0.0<1>:UD       read0_header.0<8,8,1>:UD {align1};        
+send (8) msg_ind INEP_ROW<1>:UB null read(BIND_IDX_INEP, 0, 0, 4) mlen 1 rlen 1 {align1};
+
+/* COL: read through read1_header (X offset -4) into INEP_COL0, two registers returned */
+mov  (8) msg_reg0.0<1>:UD       read1_header.0<8,8,1>:UD {align1};                
+send (8) msg_ind INEP_COL0<1>:UB null read(BIND_IDX_INEP, 0, 0, 4) mlen 1 rlen 2 {align1};
+        
+/*
+ * Media Read Message -- fetch Chroma neighbor edge pixels
+ */
+/* ROW */
+shl  (2) read0_header.0<1>:D    orig_xy_ub<2,2,1>:UB 3:UW {align1};    /* (x, y) * 8 */
+mul  (1) read0_header.0<1>:D    read0_header.0<0,1,0>:D  2:W {align1};     /* x * 16 (presumably because Cb and Cr are interleaved -- confirm) */
+add  (1) read0_header.0<1>:D    read0_header.0<0,1,0>:D -8:W {align1};     /* X offset */
+add  (1) read0_header.4<1>:D    read0_header.4<0,1,0>:D -1:W {align1};     /* Y offset */ 
+mov  (8) msg_reg0.0<1>:UD       read0_header.0<8,8,1>:UD {align1};        
+send (8) msg_ind CHROMA_ROW<1>:UB null read(BIND_IDX_CBCR, 0, 0, 4) mlen 1 rlen 1 {align1};
+
+/* COL */
+shl  (2) read1_header.0<1>:D    orig_xy_ub<2,2,1>:UB 3:UW {align1};    /* (x, y) * 8 */
+mul  (1) read1_header.0<1>:D    read1_header.0<0,1,0>:D  2:W {align1};     /* x * 16 */
+add  (1) read1_header.0<1>:D    read1_header.0<0,1,0>:D -4:W {align1};     /* X offset */
+mov  (1) read1_header.8<1>:UD   BLOCK_8X4 {align1};                     /* block size, 8x4 going by the macro name */
+mov  (8) msg_reg0.0<1>:UD       read1_header.0<8,8,1>:UD {align1};                
+send (8) msg_ind CHROMA_COL<1>:UB null read(BIND_IDX_CBCR, 0, 0, 4) mlen 1 rlen 1 {align1};
+
+/* m2: the MV/MB cost, copied from r1; it is passed in through the constant buffer
+when the EU thread is created by MEDIA_OBJECT */
+mov (8) vme_msg_2<1>:UD         r1.0<8,8,1>:UD {align1};
+
+/* m3: used for the FWD/BWD cost center; all zero here */
+mov (8) vme_msg_3<1>:UD		0x0:UD {align1};	        
+
+/* m4: all zero */
+mov (8) vme_msg_4<1>:ud		0x0:ud	{align1};
+
+/* m5: INEP_ROW with dword 0 cleared and dword 1 masked down to its top byte */
+mov  (1) INEP_ROW.0<1>:UD       0x0:UD {align1};
+and  (1) INEP_ROW.4<1>:UD       INEP_ROW.4<0,1,0>:UD            0xFF000000:UD {align1};
+mov  (8) vme_msg_5<1>:UD        INEP_ROW.0<8,8,1>:UD {align1};
+
+mov  (1) tmp_reg0.0<1>:UB	INTRA_PLANAR_MODE_MASK {align1}; /* VP8 does not support the planar intra mode */
+mov  (1) tmp_reg0.1<1>:UB	LUMA_CHROMA_MODE {align1}; /* Intra type: Luma + Chroma */
+
+/* Intra mode mask (byte 0) and Intra compute type (byte 1), stored together as one word */
+mov  (1) vme_msg_5.4<1>:UW	tmp_reg0.0<0,1,0>:UW {align1};
+
+/* m6: zeroed, then filled from INEP_COL0, the predictor mode, the penalty and CHROMA_ROW */
+mov  (8) vme_msg_6<1>:UD         0x0:UD {align1};
+mov (16) vme_msg_6.0<1>:UB       INEP_COL0.3<32,8,4>:UB {align1};	/* 16 bytes: byte 3 of every 4-byte group of INEP_COL0 */
+mov  (1) vme_msg_6.16<1>:UD      INTRA_PREDICTORE_MODE {align1};
+
+/* the penalty for Intra mode */
+mov  (1) vme_msg_6.28<1>:UD	0x010101:UD {align1};
+mov  (1) vme_msg_6.20<1>:UW      CHROMA_ROW.6<0,1,0>:UW {align1};
+
+
+/* m7: chroma neighbors, taken from CHROMA_COL (low half) and CHROMA_ROW (high half) */
+
+mov  (4) vme_msg_7.16<1>:UD      CHROMA_ROW.8<4,4,1>:UD {align1};
+mov  (8) vme_msg_7.0<1>:UW       CHROMA_COL.2<16,8,2>:UW {align1};	/* word 1 of every 2-word pair of CHROMA_COL */
+
+/*
+ * VME message: finish m1 and m0, then send the message and collect the writeback in vme_wb
+ */
+
+/* m1 */
+mov  (1) intra_flag<1>:UW       0x0:UW {align1};
+mov  (1) intra_part_mask_ub<1>:UB  LUMA_INTRA_8x8_DISABLE {align1}; /* VP8 does not support the intra_8x8 mode */
+
+/* assign the MB intra struct from the thread payload */
+mov (1) mb_intra_struct_ub<1>:UB input_mb_intra_ub<0,1,0>:UB {align1}; 
+                           
+/* Disable the DC HAAR component when calculating the HAAR SATD block */
+mov  (1) tmp_reg0.0<1>:UW	DC_HARR_DISABLE:UW		{align1};
+mov  (1) vme_m1.30<1>:UB	tmp_reg0.0<0,1,0>:UB  {align1};
+
+mov  (8) vme_msg_1<1>:UD        vme_m1.0<8,8,1>:UD {align1};
+
+/* m0 */        
+add  (1) vme_m0.12<1>:UD        vme_m0.12<0,1,0>:ud	INTRA_SAD_HAAR:UD {align1};/* 16x16 Source, Intra HAAR */
+mov  (1) vme_m0.15<1>:UB        SUB_PART_8x4_DISABLE + SUB_PART_4x8_DISABLE {align1}; /* VP8 does not support 8x4 and 4x8 partitions */
+mov  (8) vme_msg_0<1>:UD        vme_m0.0<8,8,1>:UD {align1};
+
+/* Send the VME message. Original note: after verification it will be passed by using payload */
+send (8)
+        vme_msg_ind
+        vme_wb<1>:UD
+        null
+        cre(
+                BIND_IDX_VME,
+                VME_SIC_MESSAGE_TYPE
+        )
+        mlen sic_vme_msg_length
+        rlen vme_wb_length
+        {align1};
+/*
+ * Oword Block Write message: msg_reg0 is the header (obw_m0), msg_reg1 is a repacked subset of vme_wb
+ */
+mov  (8) msg_reg0.0<1>:UD       obw_m0<8,8,1>:UD {align1};
+        
+mov  (1) msg_reg1.0<1>:UD       vme_wb.0<0,1,0>:UD      {align1};
+mov  (1) msg_reg1.4<1>:UD       vme_wb.16<0,1,0>:UD     {align1};
+mov  (1) msg_reg1.8<1>:UD       vme_wb.20<0,1,0>:UD     {align1};
+mov  (1) msg_reg1.12<1>:UD      vme_wb.24<0,1,0>:UD     {align1};
+
+/* Distortion, Intra (17-16); NOTE(review): confirm what the 17-16 range refers to */
+mov  (1) msg_reg1.16<1>:UW      vme_wb.12<0,1,0>:UW     {align1};
+
+mov  (1) msg_reg1.20<1>:UD      vme_wb.8<0,1,0>:UD     {align1};
+/* VME clock counts */
+mov  (1) msg_reg1.24<1>:UD      vme_wb.28<0,1,0>:UD     {align1};
+
+mov  (1) msg_reg1.28<1>:UD      obw_m0.8<0,1,0>:UD     {align1};	/* the output offset held in obw_m0.8 */
+
+/* bind index 3, write 2 owords (32 bytes), msg type: 8 (OWord Block Write) */
+send (16)
+        msg_ind
+        obw_wb
+        null
+        data_port(
+                OBW_CACHE_TYPE,
+                OBW_MESSAGE_TYPE,
+                OBW_CONTROL_2,
+                OBW_BIND_IDX,
+                OBW_WRITE_COMMIT_CATEGORY,
+                OBW_HEADER_PRESENT
+        )
+        mlen 2
+        rlen obw_wb_length
+        {align1};
+
+__EXIT: 
+/*
+ * kill thread: copy r0 into the message register and send it to the thread spawner with EOT set
+ */        
+mov  (8) ts_msg_reg0<1>:UD         r0<8,8,1>:UD {align1};
+send (16) ts_msg_ind acc0<1>UW null thread_spawner(0, 0, 1) mlen 1 rlen 0 {align1 EOT};
diff --git a/src/shaders/vme/vp8_intra_frame_gen8.g8a b/src/shaders/vme/vp8_intra_frame_gen8.g8a
new file mode 100644
index 0000000..a445b1e
--- /dev/null
+++ b/src/shaders/vme/vp8_intra_frame_gen8.g8a
@@ -0,0 +1,2 @@
+#include "vme8.inc"
+#include "vp8_intra_frame_gen8.asm"
diff --git a/src/shaders/vme/vp8_intra_frame_gen8.g8b b/src/shaders/vme/vp8_intra_frame_gen8.g8b
new file mode 100644
index 0000000..4dca617
--- /dev/null
+++ b/src/shaders/vme/vp8_intra_frame_gen8.g8b
@@ -0,0 +1,73 @@
+   { 0x00800001, 0x24000608, 0x00000000, 0x00000000 },
+   { 0x00800001, 0x24400608, 0x00000000, 0x00000000 },
+   { 0x00800001, 0x24800608, 0x00000000, 0x00000000 },
+   { 0x00800001, 0x24c00608, 0x00000000, 0x00000000 },
+   { 0x00200009, 0x24002228, 0x164500a0, 0x00040004 },
+   { 0x00000040, 0x24000a28, 0x1e000400, 0xfff8fff8 },
+   { 0x00000040, 0x24040a28, 0x1e000404, 0xffffffff },
+   { 0x00000001, 0x24080e08, 0x08000000, 0x0000001f },
+   { 0x00000001, 0x24142288, 0x00000014, 0x00000000 },
+   { 0x00200009, 0x24202228, 0x164500a0, 0x00040004 },
+   { 0x00000040, 0x24200a28, 0x1e000420, 0xfffcfffc },
+   { 0x00000001, 0x24280e08, 0x08000000, 0x000f0003 },
+   { 0x00000001, 0x24342288, 0x00000014, 0x00000000 },
+   { 0x00200009, 0x24482248, 0x164500a0, 0x00040004 },
+   { 0x00000001, 0x24542288, 0x00000014, 0x00000000 },
+   { 0x00000041, 0x24881208, 0x220000a2, 0x000000a1 },
+   { 0x00000040, 0x24880208, 0x22000488, 0x000000a0 },
+   { 0x00000041, 0x24880208, 0x06000488, 0x00000002 },
+   { 0x00000001, 0x24942288, 0x00000014, 0x00000000 },
+   { 0x00600001, 0x28000208, 0x008d0400, 0x00000000 },
+   { 0x04600031, 0x23800a88, 0x0e000800, 0x02190004 },
+   { 0x00600001, 0x28000208, 0x008d0420, 0x00000000 },
+   { 0x04600031, 0x23a00a88, 0x0e000800, 0x02290004 },
+   { 0x00200009, 0x24002228, 0x164500a0, 0x00030003 },
+   { 0x00000041, 0x24000a28, 0x1e000400, 0x00020002 },
+   { 0x00000040, 0x24000a28, 0x1e000400, 0xfff8fff8 },
+   { 0x00000040, 0x24040a28, 0x1e000404, 0xffffffff },
+   { 0x00600001, 0x28000208, 0x008d0400, 0x00000000 },
+   { 0x04600031, 0x26000a88, 0x0e000800, 0x02190006 },
+   { 0x00200009, 0x24202228, 0x164500a0, 0x00030003 },
+   { 0x00000041, 0x24200a28, 0x1e000420, 0x00020002 },
+   { 0x00000040, 0x24200a28, 0x1e000420, 0xfffcfffc },
+   { 0x00000001, 0x24280e08, 0x08000000, 0x00070003 },
+   { 0x00600001, 0x28000208, 0x008d0420, 0x00000000 },
+   { 0x04600031, 0x26200a88, 0x0e000800, 0x02190006 },
+   { 0x00600001, 0x28400208, 0x008d0020, 0x00000000 },
+   { 0x00600001, 0x28600608, 0x00000000, 0x00000000 },
+   { 0x00600001, 0x28800608, 0x00000000, 0x00000000 },
+   { 0x00000001, 0x23800608, 0x00000000, 0x00000000 },
+   { 0x00000005, 0x23840208, 0x06000384, 0xff000000 },
+   { 0x00600001, 0x28a00208, 0x008d0380, 0x00000000 },
+   { 0x00000001, 0x24000688, 0x00000000, 0x10001000 },
+   { 0x00000001, 0x24010e88, 0x08000000, 0x00000000 },
+   { 0x00000001, 0x28a41248, 0x00000400, 0x00000000 },
+   { 0x00600001, 0x28c00608, 0x00000000, 0x00000000 },
+   { 0x00800001, 0x28c02288, 0x00cf03a3, 0x00000000 },
+   { 0x00000001, 0x28d00608, 0x00000000, 0x11111111 },
+   { 0x00000001, 0x28dc0608, 0x00000000, 0x00010101 },
+   { 0x00000001, 0x28d41248, 0x00000606, 0x00000000 },
+   { 0x00400001, 0x28f00208, 0x00690608, 0x00000000 },
+   { 0x00600001, 0x28e01248, 0x00ae0622, 0x00000000 },
+   { 0x00000001, 0x247c1648, 0x10000000, 0x00000000 },
+   { 0x00000001, 0x247c0e88, 0x08000000, 0x00000002 },
+   { 0x00000001, 0x247d2288, 0x000000a5, 0x00000000 },
+   { 0x00000001, 0x24001648, 0x10000000, 0x00200020 },
+   { 0x00000001, 0x247e2288, 0x00000400, 0x00000000 },
+   { 0x00600001, 0x28200208, 0x008d0460, 0x00000000 },
+   { 0x00000040, 0x244c0208, 0x0600044c, 0x00800000 },
+   { 0x00000001, 0x244f0e88, 0x08000000, 0x00000030 },
+   { 0x00600001, 0x28000208, 0x008d0440, 0x00000000 },
+   { 0x0d600031, 0x21800a08, 0x0e000800, 0x10782000 },
+   { 0x00600001, 0x28000208, 0x008d0480, 0x00000000 },
+   { 0x00000001, 0x28200208, 0x00000180, 0x00000000 },
+   { 0x00000001, 0x28240208, 0x00000190, 0x00000000 },
+   { 0x00000001, 0x28280208, 0x00000194, 0x00000000 },
+   { 0x00000001, 0x282c0208, 0x00000198, 0x00000000 },
+   { 0x00000001, 0x28301248, 0x0000018c, 0x00000000 },
+   { 0x00000001, 0x28340208, 0x00000188, 0x00000000 },
+   { 0x00000001, 0x28380208, 0x0000019c, 0x00000000 },
+   { 0x00000001, 0x283c0208, 0x00000488, 0x00000000 },
+   { 0x0a800031, 0x20000a60, 0x0e000800, 0x040a0203 },
+   { 0x00600001, 0x2e000208, 0x008d0000, 0x00000000 },
+   { 0x07800031, 0x24000a40, 0x0e000e00, 0x82000010 },
diff --git a/src/vp8_probs.h b/src/vp8_probs.h
index 8dd4290..e864b68 100644
--- a/src/vp8_probs.h
+++ b/src/vp8_probs.h
@@ -42,27 +42,27 @@
 #ifndef VP8_PROBS_H
 #define VP8_PROBS_H
 
-const unsigned char vp8_ymode_prob[4] =
+static const unsigned char vp8_ymode_prob[4] =
 {
     112, 86, 140, 37
 };
 
-const unsigned char vp8_kf_ymode_prob[4] =
+static const unsigned char vp8_kf_ymode_prob[4] =
 {
     145, 156, 163, 128
 };
 
-const unsigned char vp8_uv_mode_prob[3] =
+static const unsigned char vp8_uv_mode_prob[3] =
 {
     162, 101, 204
 };
 
-static  const unsigned char vp8_kf_uv_mode_prob[3] =
+static const unsigned char vp8_kf_uv_mode_prob[3] =
 {
     142, 114, 183
 };
 
-const unsigned char vp8_base_skip_false_prob[128] =
+static const unsigned char vp8_base_skip_false_prob[128] =
 {
     255, 255, 255, 255, 255, 255, 255, 255,
     255, 255, 255, 255, 255, 255, 255, 255,
@@ -82,7 +82,7 @@ const unsigned char vp8_base_skip_false_prob[128] =
     30,  28,  26,  24,  22,  20,  18, 16,
 };
 
-const unsigned char vp8_mv_update_probs[2][19] =
+static const unsigned char vp8_mv_update_probs[2][19] =
 {
     {
         237,
@@ -98,7 +98,7 @@ const unsigned char vp8_mv_update_probs[2][19] =
     }
 };
 
-const unsigned char vp8_default_mv_context[2][19] =
+static const unsigned char vp8_default_mv_context[2][19] =
 {
     {
         162,                                        /* is short */
@@ -116,7 +116,7 @@ const unsigned char vp8_default_mv_context[2][19] =
     }
 };
 
-const unsigned char vp8_default_coef_probs[4][8][3][11] =
+static const unsigned char vp8_default_coef_probs[4][8][3][11] =
 {
     { /* Block Type ( 0 ) */
         { /* Coeff Band ( 0 )*/
-- 
1.9.1



More information about the Libva mailing list