[Libva] [PATCH v2 08/12] VP8 HWEnc: Add BSW VP8 HWEnc support
Zhong Li
zhong.li at intel.com
Tue Jan 13 21:03:39 PST 2015
Signed-off-by: Zhong Li <zhong.li at intel.com>
---
src/gen8_mfc.c | 911 ++++++++++++++++++++++++++++++-
src/gen8_vme.c | 264 +++++++--
src/i965_device_info.c | 1 +
src/shaders/vme/Makefile.am | 4 +-
src/shaders/vme/vp8_inter_frame_gen8.asm | 739 +++++++++++++++++++++++++
src/shaders/vme/vp8_inter_frame_gen8.g8a | 2 +
src/shaders/vme/vp8_inter_frame_gen8.g8b | 299 ++++++++++
src/shaders/vme/vp8_intra_frame_gen8.asm | 200 +++++++
src/shaders/vme/vp8_intra_frame_gen8.g8a | 2 +
src/shaders/vme/vp8_intra_frame_gen8.g8b | 73 +++
src/vp8_probs.h | 16 +-
11 files changed, 2439 insertions(+), 72 deletions(-)
create mode 100644 src/shaders/vme/vp8_inter_frame_gen8.asm
create mode 100644 src/shaders/vme/vp8_inter_frame_gen8.g8a
create mode 100644 src/shaders/vme/vp8_inter_frame_gen8.g8b
create mode 100644 src/shaders/vme/vp8_intra_frame_gen8.asm
create mode 100644 src/shaders/vme/vp8_intra_frame_gen8.g8a
create mode 100644 src/shaders/vme/vp8_intra_frame_gen8.g8b
diff --git a/src/gen8_mfc.c b/src/gen8_mfc.c
index b50616d..9a227ac 100644
--- a/src/gen8_mfc.c
+++ b/src/gen8_mfc.c
@@ -43,6 +43,7 @@
#include "gen6_vme.h"
#include "intel_media.h"
#include <va/va_enc_jpeg.h>
+#include "vp8_probs.h"
#define SURFACE_STATE_PADDED_SIZE SURFACE_STATE_PADDED_SIZE_GEN8
#define SURFACE_STATE_OFFSET(index) (SURFACE_STATE_PADDED_SIZE * index)
@@ -131,6 +132,7 @@ static struct i965_kernel gen8_mfc_kernels[] = {
#define INTER_16X8 0x01
#define INTER_8X16 0x02
#define SUBMB_SHAPE_MASK 0x00FF00
+#define INTER_16X16 0x00
#define INTER_MV8 (4 << 20)
#define INTER_MV32 (6 << 20)
@@ -146,7 +148,8 @@ gen8_mfc_pipe_mode_select(VADriverContextP ctx,
assert(standard_select == MFX_FORMAT_MPEG2 ||
standard_select == MFX_FORMAT_AVC ||
- standard_select == MFX_FORMAT_JPEG);
+ standard_select == MFX_FORMAT_JPEG ||
+ standard_select == MFX_FORMAT_VP8);
BEGIN_BCS_BATCH(batch, 5);
@@ -157,6 +160,7 @@ gen8_mfc_pipe_mode_select(VADriverContextP ctx,
(0 << 10) | /* Stream-Out Enable */
((!!mfc_context->post_deblocking_output.bo) << 9) | /* Post Deblocking Output */
((!!mfc_context->pre_deblocking_output.bo) << 8) | /* Pre Deblocking Output */
+ (0 << 6) | /* frame statistics stream-out enable */
(0 << 5) | /* not in stitch mode */
(1 << 4) | /* encoding mode */
(standard_select << 0)); /* standard select: avc or mpeg2 or jpeg*/
@@ -221,9 +225,18 @@ gen8_mfc_ind_obj_base_addr_state(VADriverContextP ctx,
OUT_BCS_BATCH(batch, 0);
OUT_BCS_BATCH(batch, 0);
OUT_BCS_BATCH(batch, 0);
+
/* the DW4-5 is the MFX upper bound */
- OUT_BCS_BATCH(batch, 0);
- OUT_BCS_BATCH(batch, 0);
+ if (encoder_context->codec == CODEC_VP8) {
+ OUT_BCS_RELOC(batch,
+ mfc_context->mfc_indirect_pak_bse_object.bo,
+ I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+ mfc_context->mfc_indirect_pak_bse_object.end_offset);
+ OUT_BCS_BATCH(batch, 0);
+ } else {
+ OUT_BCS_BATCH(batch, 0);
+ OUT_BCS_BATCH(batch, 0);
+ }
if(encoder_context->codec != CODEC_JPEG) {
vme_size = vme_context->vme_output.size_block * vme_context->vme_output.num_blocks;
@@ -3203,6 +3216,871 @@ gen8_mfc_jpeg_encode_picture(VADriverContextP ctx,
return VA_STATUS_SUCCESS;
}
+static void vp8_enc_state_init(struct gen6_mfc_context *mfc_context,
+ VAEncPictureParameterBufferVP8 *pic_param,
+ VAQMatrixBufferVP8 *q_matrix)
+{
+
+ int is_key_frame = !pic_param->pic_flags.bits.frame_type;
+ unsigned char *coeff_probs_stream_in_buffer;
+
+ mfc_context->vp8_state.frame_header_lf_update_pos = 0;
+ mfc_context->vp8_state.frame_header_qindex_update_pos = 0;
+ mfc_context->vp8_state.frame_header_token_update_pos = 0;
+ mfc_context->vp8_state.frame_header_bin_mv_upate_pos = 0;
+
+ mfc_context->vp8_state.prob_skip_false = 255;
+ memset(mfc_context->vp8_state.mb_segment_tree_probs, 0, sizeof(mfc_context->vp8_state.mb_segment_tree_probs));
+ memcpy(mfc_context->vp8_state.mv_probs, vp8_default_mv_context, sizeof(mfc_context->vp8_state.mv_probs));
+
+ if (is_key_frame) {
+ memcpy(mfc_context->vp8_state.y_mode_probs, vp8_kf_ymode_prob, sizeof(mfc_context->vp8_state.y_mode_probs));
+ memcpy(mfc_context->vp8_state.uv_mode_probs, vp8_kf_uv_mode_prob, sizeof(mfc_context->vp8_state.uv_mode_probs));
+
+ mfc_context->vp8_state.prob_intra = 255;
+ mfc_context->vp8_state.prob_last = 128;
+ mfc_context->vp8_state.prob_gf = 128;
+ } else {
+ memcpy(mfc_context->vp8_state.y_mode_probs, vp8_ymode_prob, sizeof(mfc_context->vp8_state.y_mode_probs));
+ memcpy(mfc_context->vp8_state.uv_mode_probs, vp8_uv_mode_prob, sizeof(mfc_context->vp8_state.uv_mode_probs));
+
+ mfc_context->vp8_state.prob_intra = 63;
+ mfc_context->vp8_state.prob_last = 128;
+ mfc_context->vp8_state.prob_gf = 128;
+ }
+
+ mfc_context->vp8_state.prob_skip_false = vp8_base_skip_false_prob[q_matrix->quantization_index[0]];
+
+ dri_bo_map(mfc_context->vp8_state.coeff_probs_stream_in_bo, 1);
+ coeff_probs_stream_in_buffer = (unsigned char *)mfc_context->vp8_state.coeff_probs_stream_in_bo->virtual;
+ assert(coeff_probs_stream_in_buffer);
+ memcpy(coeff_probs_stream_in_buffer, vp8_default_coef_probs, sizeof(vp8_default_coef_probs));
+ dri_bo_unmap(mfc_context->vp8_state.coeff_probs_stream_in_bo);
+}
+
+static void vp8_enc_state_update(struct gen6_mfc_context *mfc_context,
+ VAQMatrixBufferVP8 *q_matrix)
+{
+
+ /*some other probabilities need to be updated*/
+}
+
+extern void binarize_vp8_frame_header(VAEncSequenceParameterBufferVP8 *seq_param,
+ VAEncPictureParameterBufferVP8 *pic_param,
+ VAQMatrixBufferVP8 *q_matrix,
+ struct gen6_mfc_context *mfc_context);
+
+static void vp8_enc_frame_header_binarize(struct encode_state *encode_state,
+ struct gen6_mfc_context *mfc_context)
+{
+ VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
+ VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
+ VAQMatrixBufferVP8 *q_matrix = (VAQMatrixBufferVP8 *)encode_state->q_matrix->buffer;
+ unsigned char *frame_header_buffer;
+
+ binarize_vp8_frame_header(seq_param, pic_param, q_matrix, mfc_context);
+
+ dri_bo_map(mfc_context->vp8_state.frame_header_bo, 1);
+ frame_header_buffer = (unsigned char *)mfc_context->vp8_state.frame_header_bo->virtual;
+ assert(frame_header_buffer);
+ memcpy(frame_header_buffer, mfc_context->vp8_state.vp8_frame_header, (mfc_context->vp8_state.frame_header_bit_count + 7) / 8);
+ dri_bo_unmap(mfc_context->vp8_state.frame_header_bo);
+}
+
+#define MAX_VP8_FRAME_HEADER_SIZE 0x2000
+#define VP8_TOKEN_STATISTICS_BUFFER_SIZE 0x2000
+
+static void gen8_mfc_vp8_init(VADriverContextP ctx,
+ struct encode_state *encode_state,
+ struct intel_encoder_context *encoder_context)
+{
+ struct i965_driver_data *i965 = i965_driver_data(ctx);
+ struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+ dri_bo *bo;
+ int i;
+ int width_in_mbs = 0;
+ int height_in_mbs = 0;
+ int slice_batchbuffer_size;
+
+ VAEncSequenceParameterBufferVP8 *pSequenceParameter = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
+ VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
+ VAQMatrixBufferVP8 *q_matrix = (VAQMatrixBufferVP8 *)encode_state->q_matrix->buffer;
+
+ width_in_mbs = ALIGN(pSequenceParameter->frame_width, 16) / 16;
+ height_in_mbs = ALIGN(pSequenceParameter->frame_height, 16) / 16;
+
+ slice_batchbuffer_size = 64 * width_in_mbs * height_in_mbs + 4096 +
+ (SLICE_HEADER + SLICE_TAIL);
+
+ /*Encode common setup for MFC*/
+ dri_bo_unreference(mfc_context->post_deblocking_output.bo);
+ mfc_context->post_deblocking_output.bo = NULL;
+
+ dri_bo_unreference(mfc_context->pre_deblocking_output.bo);
+ mfc_context->pre_deblocking_output.bo = NULL;
+
+ dri_bo_unreference(mfc_context->uncompressed_picture_source.bo);
+ mfc_context->uncompressed_picture_source.bo = NULL;
+
+ dri_bo_unreference(mfc_context->mfc_indirect_pak_bse_object.bo);
+ mfc_context->mfc_indirect_pak_bse_object.bo = NULL;
+
+ for (i = 0; i < NUM_MFC_DMV_BUFFERS; i++){
+ if ( mfc_context->direct_mv_buffers[i].bo != NULL)
+ dri_bo_unreference(mfc_context->direct_mv_buffers[i].bo);
+ mfc_context->direct_mv_buffers[i].bo = NULL;
+ }
+
+ for (i = 0; i < MAX_MFC_REFERENCE_SURFACES; i++){
+ if (mfc_context->reference_surfaces[i].bo != NULL)
+ dri_bo_unreference(mfc_context->reference_surfaces[i].bo);
+ mfc_context->reference_surfaces[i].bo = NULL;
+ }
+
+ dri_bo_unreference(mfc_context->intra_row_store_scratch_buffer.bo);
+ bo = dri_bo_alloc(i965->intel.bufmgr,
+ "Buffer",
+ width_in_mbs * 64,
+ 64);
+ assert(bo);
+ mfc_context->intra_row_store_scratch_buffer.bo = bo;
+
+ dri_bo_unreference(mfc_context->macroblock_status_buffer.bo);
+ bo = dri_bo_alloc(i965->intel.bufmgr,
+ "Buffer",
+ width_in_mbs * height_in_mbs * 16,
+ 64);
+ assert(bo);
+ mfc_context->macroblock_status_buffer.bo = bo;
+
+ dri_bo_unreference(mfc_context->deblocking_filter_row_store_scratch_buffer.bo);
+ bo = dri_bo_alloc(i965->intel.bufmgr,
+ "Buffer",
+ 4 * width_in_mbs * 64, /* 4 * width_in_mbs * 64 */
+ 64);
+ assert(bo);
+ mfc_context->deblocking_filter_row_store_scratch_buffer.bo = bo;
+
+ dri_bo_unreference(mfc_context->bsd_mpc_row_store_scratch_buffer.bo);
+ bo = dri_bo_alloc(i965->intel.bufmgr,
+ "Buffer",
+ 2 * width_in_mbs * 64, /* 2 * width_in_mbs * 64 */
+ 0x1000);
+ assert(bo);
+ mfc_context->bsd_mpc_row_store_scratch_buffer.bo = bo;
+
+ dri_bo_unreference(mfc_context->mfc_batchbuffer_surface.bo);
+ mfc_context->mfc_batchbuffer_surface.bo = NULL;
+
+ dri_bo_unreference(mfc_context->aux_batchbuffer_surface.bo);
+ mfc_context->aux_batchbuffer_surface.bo = NULL;
+
+ if (mfc_context->aux_batchbuffer)
+ intel_batchbuffer_free(mfc_context->aux_batchbuffer);
+
+ mfc_context->aux_batchbuffer = intel_batchbuffer_new(&i965->intel, I915_EXEC_BSD, slice_batchbuffer_size);
+ mfc_context->aux_batchbuffer_surface.bo = mfc_context->aux_batchbuffer->buffer;
+ dri_bo_reference(mfc_context->aux_batchbuffer_surface.bo);
+ mfc_context->aux_batchbuffer_surface.pitch = 16;
+ mfc_context->aux_batchbuffer_surface.num_blocks = mfc_context->aux_batchbuffer->size / 16;
+ mfc_context->aux_batchbuffer_surface.size_block = 16;
+
+ i965_gpe_context_init(ctx, &mfc_context->gpe_context);
+
+ /* alloc vp8 encoding buffers*/
+ dri_bo_unreference(mfc_context->vp8_state.frame_header_bo);
+ bo = dri_bo_alloc(i965->intel.bufmgr,
+ "Buffer",
+ MAX_VP8_FRAME_HEADER_SIZE,
+ 0x1000);
+ assert(bo);
+ mfc_context->vp8_state.frame_header_bo = bo;
+
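+ /* The intermediate buffer appears to be sized for 9 chunks of 256 bytes per MB
+ * (one first/mode partition plus up to 8 token partitions); the partition offsets
+ * below step by the same width_in_mbs * height_in_mbs * 256 stride. */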
+ mfc_context->vp8_state.intermediate_buffer_max_size = width_in_mbs * height_in_mbs * 256 * 9;
+ for(i = 0; i < 8; i++) {
+ mfc_context->vp8_state.intermediate_partition_offset[i] = width_in_mbs * height_in_mbs * 256 * (i + 1);
+ }
+ dri_bo_unreference(mfc_context->vp8_state.intermediate_bo);
+ bo = dri_bo_alloc(i965->intel.bufmgr,
+ "Buffer",
+ mfc_context->vp8_state.intermediate_buffer_max_size,
+ 0x1000);
+ assert(bo);
+ mfc_context->vp8_state.intermediate_bo = bo;
+
+ dri_bo_unreference(mfc_context->vp8_state.stream_out_bo);
+ bo = dri_bo_alloc(i965->intel.bufmgr,
+ "Buffer",
+ width_in_mbs * height_in_mbs * 16,
+ 0x1000);
+ assert(bo);
+ mfc_context->vp8_state.stream_out_bo = bo;
+
+ dri_bo_unreference(mfc_context->vp8_state.coeff_probs_stream_in_bo);
+ bo = dri_bo_alloc(i965->intel.bufmgr,
+ "Buffer",
+ sizeof(vp8_default_coef_probs),
+ 0x1000);
+ assert(bo);
+ mfc_context->vp8_state.coeff_probs_stream_in_bo = bo;
+
+ dri_bo_unreference(mfc_context->vp8_state.token_statistics_bo);
+ bo = dri_bo_alloc(i965->intel.bufmgr,
+ "Buffer",
+ VP8_TOKEN_STATISTICS_BUFFER_SIZE,
+ 0x1000);
+ assert(bo);
+ mfc_context->vp8_state.token_statistics_bo = bo;
+
+ dri_bo_unreference(mfc_context->vp8_state.mpc_row_store_bo);
+ bo = dri_bo_alloc(i965->intel.bufmgr,
+ "Buffer",
+ width_in_mbs * 16 * 64,
+ 0x1000);
+ assert(bo);
+ mfc_context->vp8_state.mpc_row_store_bo = bo;
+
+ vp8_enc_state_init(mfc_context, pic_param, q_matrix);
+ vp8_enc_frame_header_binarize(encode_state, mfc_context);
+}
+
+static VAStatus
+intel_mfc_vp8_prepare(VADriverContextP ctx,
+ struct encode_state *encode_state,
+ struct intel_encoder_context *encoder_context)
+{
+ struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+ struct object_surface *obj_surface;
+ struct object_buffer *obj_buffer;
+ struct i965_coded_buffer_segment *coded_buffer_segment;
+ VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
+ VAStatus vaStatus = VA_STATUS_SUCCESS;
+ dri_bo *bo;
+ int i;
+
+ /* reconstructed surface */
+ obj_surface = encode_state->reconstructed_object;
+ i965_check_alloc_surface_bo(ctx, obj_surface, 1, VA_FOURCC('N','V','1','2'), SUBSAMPLE_YUV420);
+ if (pic_param->loop_filter_level[0] == 0) {
+ mfc_context->pre_deblocking_output.bo = obj_surface->bo;
+ dri_bo_reference(mfc_context->pre_deblocking_output.bo);
+ } else {
+ mfc_context->post_deblocking_output.bo = obj_surface->bo;
+ dri_bo_reference(mfc_context->post_deblocking_output.bo);
+ }
+
+ mfc_context->surface_state.width = obj_surface->orig_width;
+ mfc_context->surface_state.height = obj_surface->orig_height;
+ mfc_context->surface_state.w_pitch = obj_surface->width;
+ mfc_context->surface_state.h_pitch = obj_surface->height;
+
+ /* set vp8 reference frames */
+ for (i = 0; i < ARRAY_ELEMS(mfc_context->reference_surfaces); i++) {
+ obj_surface = encode_state->reference_objects[i];
+
+ if (obj_surface && obj_surface->bo) {
+ mfc_context->reference_surfaces[i].bo = obj_surface->bo;
+ dri_bo_reference(mfc_context->reference_surfaces[i].bo);
+ } else {
+ mfc_context->reference_surfaces[i].bo = NULL;
+ }
+ }
+
+ /* input YUV surface */
+ obj_surface = encode_state->input_yuv_object;
+ mfc_context->uncompressed_picture_source.bo = obj_surface->bo;
+ dri_bo_reference(mfc_context->uncompressed_picture_source.bo);
+
+ /* coded buffer */
+ obj_buffer = encode_state->coded_buf_object;
+ bo = obj_buffer->buffer_store->bo;
+ mfc_context->mfc_indirect_pak_bse_object.bo = bo;
+ mfc_context->mfc_indirect_pak_bse_object.offset = I965_CODEDBUFFER_HEADER_SIZE;
+ mfc_context->mfc_indirect_pak_bse_object.end_offset = ALIGN(obj_buffer->size_element - 0x1000, 0x1000);
+ dri_bo_reference(mfc_context->mfc_indirect_pak_bse_object.bo);
+
+ dri_bo_unreference(mfc_context->vp8_state.final_frame_bo);
+ mfc_context->vp8_state.final_frame_bo = mfc_context->mfc_indirect_pak_bse_object.bo;
+ mfc_context->vp8_state.final_frame_byte_offset = I965_CODEDBUFFER_HEADER_SIZE;
+ dri_bo_reference(mfc_context->vp8_state.final_frame_bo);
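+ /* the PAK writes the assembled VP8 frame into the coded buffer, right after the
+ * driver's i965_coded_buffer_segment header */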
+
+ /* set the internal flag to 0 to indicate the coded size is unknown */
+ dri_bo_map(bo, 1);
+ coded_buffer_segment = (struct i965_coded_buffer_segment *)bo->virtual;
+ coded_buffer_segment->mapped = 0;
+ coded_buffer_segment->codec = encoder_context->codec;
+ dri_bo_unmap(bo);
+
+ return vaStatus;
+}
+
+static void
+gen8_mfc_vp8_encoder_cfg(VADriverContextP ctx,
+ struct encode_state *encode_state,
+ struct intel_encoder_context *encoder_context)
+{
+ struct intel_batchbuffer *batch = encoder_context->base.batch;
+ struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+ VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
+ VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
+
+ BEGIN_BCS_BATCH(batch, 30);
+ OUT_BCS_BATCH(batch, MFX_VP8_ENCODER_CFG | (30 - 2)); /* SKL should be 31-2 ? */
+
+ OUT_BCS_BATCH(batch,
+ 0 << 9 | /* compressed bitstream output disable */
+ 1 << 7 | /* disable per-segment delta qindex and loop filter in RC */
+ 1 << 6 | /* RC initial pass */
+ 0 << 4 | /* update segment feature data flag */
+ 1 << 3 | /* bitstream statistics output enable */
+ 1 << 2 | /* token statistics output enable */
+ 0 << 1 | /* final bitstream output disable */
+ 0 << 0); /*DW1*/
+
+ OUT_BCS_BATCH(batch, 0); /*DW2*/
+
+ OUT_BCS_BATCH(batch,
+ 0xfff << 16 | /* max intra mb bit count limit */
+ 0xfff << 0 /* max inter mb bit count limit */
+ ); /*DW3*/
+
+ OUT_BCS_BATCH(batch, 0); /*DW4*/
+ OUT_BCS_BATCH(batch, 0); /*DW5*/
+ OUT_BCS_BATCH(batch, 0); /*DW6*/
+ OUT_BCS_BATCH(batch, 0); /*DW7*/
+ OUT_BCS_BATCH(batch, 0); /*DW8*/
+ OUT_BCS_BATCH(batch, 0); /*DW9*/
+ OUT_BCS_BATCH(batch, 0); /*DW10*/
+ OUT_BCS_BATCH(batch, 0); /*DW11*/
+ OUT_BCS_BATCH(batch, 0); /*DW12*/
+ OUT_BCS_BATCH(batch, 0); /*DW13*/
+ OUT_BCS_BATCH(batch, 0); /*DW14*/
+ OUT_BCS_BATCH(batch, 0); /*DW15*/
+ OUT_BCS_BATCH(batch, 0); /*DW16*/
+ OUT_BCS_BATCH(batch, 0); /*DW17*/
+ OUT_BCS_BATCH(batch, 0); /*DW18*/
+ OUT_BCS_BATCH(batch, 0); /*DW19*/
+ OUT_BCS_BATCH(batch, 0); /*DW20*/
+ OUT_BCS_BATCH(batch, 0); /*DW21*/
+
+ OUT_BCS_BATCH(batch,
+ pic_param->pic_flags.bits.show_frame << 23 |
+ pic_param->pic_flags.bits.version << 20
+ ); /*DW22*/
+
+ OUT_BCS_BATCH(batch,
+ (seq_param->frame_height_scale << 14 | seq_param->frame_height) << 16 |
+ (seq_param->frame_width_scale << 14 | seq_param->frame_width) << 0
+ ); /*DW23*/
+
+ /*DW24*/
+ OUT_BCS_BATCH(batch, mfc_context->vp8_state.frame_header_bit_count); /* frame header bit count */
+
+ /*DW25*/
+ OUT_BCS_BATCH(batch, mfc_context->vp8_state.frame_header_qindex_update_pos); /* frame header bin buffer qindex update pointer */
+
+ /*DW26*/
+ OUT_BCS_BATCH(batch, mfc_context->vp8_state.frame_header_lf_update_pos); /* frame header bin buffer loop filter update pointer*/
+
+ /*DW27*/
+ OUT_BCS_BATCH(batch, mfc_context->vp8_state.frame_header_token_update_pos); /* frame header bin buffer token update pointer */
+
+ /*DW28*/
+ OUT_BCS_BATCH(batch, mfc_context->vp8_state.frame_header_bin_mv_upate_pos); /*frame header bin buffer mv update pointer */
+
+ /*DW29*/
+ OUT_BCS_BATCH(batch, 0);
+
+ ADVANCE_BCS_BATCH(batch);
+}
+
+static void
+gen8_mfc_vp8_pic_state(VADriverContextP ctx,
+ struct encode_state *encode_state,
+ struct intel_encoder_context *encoder_context)
+{
+ struct intel_batchbuffer *batch = encoder_context->base.batch;
+ struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+ VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
+ VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
+ VAQMatrixBufferVP8 *q_matrix = (VAQMatrixBufferVP8 *)encode_state->q_matrix->buffer;
+ int i, j, log2num;
+
+ assert(pic_param->pic_flags.bits.num_token_partitions > 0);
+ assert(pic_param->pic_flags.bits.num_token_partitions < 9);
+ log2num = (int)log2(pic_param->pic_flags.bits.num_token_partitions);
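+ /* The (int) truncation assumes num_token_partitions is a power of two (1/2/4/8),
+ * which is what the VP8 frame header can express (2-bit log2). */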
+
+ /*update mode and token probs*/
+ vp8_enc_state_update(mfc_context, q_matrix);
+
+ BEGIN_BCS_BATCH(batch, 38);
+ OUT_BCS_BATCH(batch, MFX_VP8_PIC_STATE | (38 - 2));
+ OUT_BCS_BATCH(batch,
+ (ALIGN(seq_param->frame_height, 16) / 16 - 1) << 16 |
+ (ALIGN(seq_param->frame_width, 16) / 16 - 1) << 0);
+
+ OUT_BCS_BATCH(batch,
+ log2num << 24 |
+ pic_param->sharpness_level << 16 |
+ pic_param->pic_flags.bits.sign_bias_alternate << 13 |
+ pic_param->pic_flags.bits.sign_bias_golden << 12 |
+ pic_param->pic_flags.bits.loop_filter_adj_enable << 11 |
+ pic_param->pic_flags.bits.mb_no_coeff_skip << 10 |
+ pic_param->pic_flags.bits.update_mb_segmentation_map << 9 |
+ pic_param->pic_flags.bits.segmentation_enabled << 8 |
+ !pic_param->pic_flags.bits.frame_type << 5 | /* 0 indicates an intra frame in the VP8 stream/spec (section 9.1) */
+ (pic_param->pic_flags.bits.version / 2) << 4 |
+ (pic_param->pic_flags.bits.version == 3) << 1 | /* full pixel mode for version 3 */
+ !!pic_param->pic_flags.bits.version << 0); /* version 0: 6 tap */
+
+ OUT_BCS_BATCH(batch,
+ pic_param->loop_filter_level[3] << 24 |
+ pic_param->loop_filter_level[2] << 16 |
+ pic_param->loop_filter_level[1] << 8 |
+ pic_param->loop_filter_level[0] << 0);
+
+ OUT_BCS_BATCH(batch,
+ q_matrix->quantization_index[3] << 24 |
+ q_matrix->quantization_index[2] << 16 |
+ q_matrix->quantization_index[1] << 8 |
+ q_matrix->quantization_index[0] << 0);
+
+ OUT_BCS_BATCH(batch,
+ ((unsigned short)(q_matrix->quantization_index_delta[4]) >> 15) << 28 |
+ abs(q_matrix->quantization_index_delta[4]) << 24 |
+ ((unsigned short)(q_matrix->quantization_index_delta[3]) >> 15) << 20 |
+ abs(q_matrix->quantization_index_delta[3]) << 16 |
+ ((unsigned short)(q_matrix->quantization_index_delta[2]) >> 15) << 12 |
+ abs(q_matrix->quantization_index_delta[2]) << 8 |
+ ((unsigned short)(q_matrix->quantization_index_delta[1]) >> 15) << 4 |
+ abs(q_matrix->quantization_index_delta[1]) << 0);
+
+ OUT_BCS_BATCH(batch,
+ ((unsigned short)(q_matrix->quantization_index_delta[0]) >> 15) << 4 |
+ abs(q_matrix->quantization_index_delta[0]) << 0);
+
+ OUT_BCS_BATCH(batch,
+ pic_param->clamp_qindex_high << 8 |
+ pic_param->clamp_qindex_low << 0);
+
+ for (i = 8; i < 19; i++) {
+ OUT_BCS_BATCH(batch, 0xffffffff);
+ }
+
+ OUT_BCS_BATCH(batch,
+ mfc_context->vp8_state.mb_segment_tree_probs[2] << 16 |
+ mfc_context->vp8_state.mb_segment_tree_probs[1] << 8 |
+ mfc_context->vp8_state.mb_segment_tree_probs[0] << 0);
+
+ OUT_BCS_BATCH(batch,
+ mfc_context->vp8_state.prob_skip_false << 24 |
+ mfc_context->vp8_state.prob_intra << 16 |
+ mfc_context->vp8_state.prob_last << 8 |
+ mfc_context->vp8_state.prob_gf << 0);
+
+ OUT_BCS_BATCH(batch,
+ mfc_context->vp8_state.y_mode_probs[3] << 24 |
+ mfc_context->vp8_state.y_mode_probs[2] << 16 |
+ mfc_context->vp8_state.y_mode_probs[1] << 8 |
+ mfc_context->vp8_state.y_mode_probs[0] << 0);
+
+ OUT_BCS_BATCH(batch,
+ mfc_context->vp8_state.uv_mode_probs[2] << 16 |
+ mfc_context->vp8_state.uv_mode_probs[1] << 8 |
+ mfc_context->vp8_state.uv_mode_probs[0] << 0);
+
+ /* MV update value, DW23-DW32 */
+ for (i = 0; i < 2; i++) {
+ for (j = 0; j < 20; j += 4) {
+ OUT_BCS_BATCH(batch,
+ (j + 3 == 19 ? 0 : mfc_context->vp8_state.mv_probs[i][j + 3]) << 24 |
+ mfc_context->vp8_state.mv_probs[i][j + 2] << 16 |
+ mfc_context->vp8_state.mv_probs[i][j + 1] << 8 |
+ mfc_context->vp8_state.mv_probs[i][j + 0] << 0);
+ }
+ }
+
+ OUT_BCS_BATCH(batch,
+ (pic_param->ref_lf_delta[3] & 0x7f) << 24 |
+ (pic_param->ref_lf_delta[2] & 0x7f) << 16 |
+ (pic_param->ref_lf_delta[1] & 0x7f) << 8 |
+ (pic_param->ref_lf_delta[0] & 0x7f) << 0);
+
+ OUT_BCS_BATCH(batch,
+ (pic_param->mode_lf_delta[3] & 0x7f) << 24 |
+ (pic_param->mode_lf_delta[2] & 0x7f) << 16 |
+ (pic_param->mode_lf_delta[1] & 0x7f) << 8 |
+ (pic_param->mode_lf_delta[0] & 0x7f) << 0);
+
+ OUT_BCS_BATCH(batch, 0);
+ OUT_BCS_BATCH(batch, 0);
+ OUT_BCS_BATCH(batch, 0);
+
+ ADVANCE_BCS_BATCH(batch);
+}
+
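+/* OUT_VP8_BUFFER() emits 3 DWs per buffer entry: the relocated lower address (or 0
+ * when the buffer is absent) followed by two zero DWs, presumably the upper address
+ * bits and the memory attribute DW expected by MFX_VP8_BSP_BUF_BASE_ADDR_STATE. */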
+#define OUT_VP8_BUFFER(bo, offset) \
+ if (bo) \
+ OUT_BCS_RELOC(batch, \
+ bo, \
+ I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, \
+ offset); \
+ else \
+ OUT_BCS_BATCH(batch, 0); \
+ OUT_BCS_BATCH(batch, 0); \
+ OUT_BCS_BATCH(batch, 0);
+
+static void
+gen8_mfc_vp8_bsp_buf_base_addr_state(VADriverContextP ctx,
+ struct encode_state *encode_state,
+ struct intel_encoder_context *encoder_context)
+{
+ struct intel_batchbuffer *batch = encoder_context->base.batch;
+ struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+
+ BEGIN_BCS_BATCH(batch, 32);
+ OUT_BCS_BATCH(batch, MFX_VP8_BSP_BUF_BASE_ADDR_STATE | (32 - 2));
+
+ OUT_VP8_BUFFER(mfc_context->vp8_state.frame_header_bo, 0);
+
+ OUT_VP8_BUFFER(mfc_context->vp8_state.intermediate_bo, 0);
+ OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[0]);
+ OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[1]);
+ OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[2]);
+ OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[3]);
+ OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[4]);
+ OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[5]);
+ OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[6]);
+ OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_partition_offset[7]);
+ OUT_BCS_BATCH(batch, mfc_context->vp8_state.intermediate_buffer_max_size);
+
+ OUT_VP8_BUFFER(mfc_context->vp8_state.final_frame_bo, I965_CODEDBUFFER_HEADER_SIZE);
+ OUT_BCS_BATCH(batch, 0);
+
+ OUT_VP8_BUFFER(mfc_context->vp8_state.stream_out_bo, 0);
+ OUT_VP8_BUFFER(mfc_context->vp8_state.coeff_probs_stream_in_bo, 0);
+ OUT_VP8_BUFFER(mfc_context->vp8_state.token_statistics_bo, 0);
+ OUT_VP8_BUFFER(mfc_context->vp8_state.mpc_row_store_bo, 0);
+
+ ADVANCE_BCS_BATCH(batch);
+}
+
+static void
+gen8_mfc_vp8_pipeline_picture_programing(VADriverContextP ctx,
+ struct encode_state *encode_state,
+ struct intel_encoder_context *encoder_context)
+{
+ struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+
+ mfc_context->pipe_mode_select(ctx, MFX_FORMAT_VP8, encoder_context);
+ mfc_context->set_surface_state(ctx, encoder_context);
+ mfc_context->ind_obj_base_addr_state(ctx, encoder_context);
+ gen8_mfc_pipe_buf_addr_state(ctx, encoder_context);
+ gen8_mfc_bsp_buf_base_addr_state(ctx, encoder_context);
+ gen8_mfc_vp8_bsp_buf_base_addr_state(ctx, encode_state, encoder_context);
+ gen8_mfc_vp8_pic_state(ctx, encode_state,encoder_context);
+ gen8_mfc_vp8_encoder_cfg(ctx, encode_state, encoder_context);
+}
+
+static const unsigned char
+vp8_intra_mb_mode_map[VME_MB_INTRA_MODE_COUNT] = {
+ PAK_V_PRED,
+ PAK_H_PRED,
+ PAK_DC_PRED,
+ PAK_TM_PRED
+};
+
+static const unsigned char
+vp8_intra_block_mode_map[VME_B_INTRA_MODE_COUNT] = {
+ PAK_B_VE_PRED,
+ PAK_B_HE_PRED,
+ PAK_B_DC_PRED,
+ PAK_B_LD_PRED,
+ PAK_B_RD_PRED,
+ PAK_B_VR_PRED,
+ PAK_B_HD_PRED,
+ PAK_B_VL_PRED,
+ PAK_B_HU_PRED
+};
+
+static int inline gen8_mfc_vp8_intra_mb_mode_map(unsigned int vme_pred_mode, int is_luma_4x4)
+{
+ unsigned int i, pak_pred_mode = 0;
+ unsigned int vme_sub_blocks_pred_mode[8], pak_sub_blocks_pred_mode[8]; /* intra modes of 8 sub-blocks */
+
+ if (!is_luma_4x4) {
+ pak_pred_mode = vp8_intra_mb_mode_map[vme_pred_mode & 0x3];
+ } else {
+ for (i = 0; i < 8; i++) {
+ vme_sub_blocks_pred_mode[i] = ((vme_pred_mode >> (4 * i)) & 0xf);
+ assert(vme_sub_blocks_pred_mode[i] < VME_B_INTRA_MODE_COUNT);
+ pak_sub_blocks_pred_mode[i] = vp8_intra_block_mode_map[vme_sub_blocks_pred_mode[i]];
+ pak_pred_mode |= (pak_sub_blocks_pred_mode[i] << (4 * i));
+ }
+ }
+
+ return pak_pred_mode;
+}
+static void
+gen8_mfc_vp8_pak_object_intra(VADriverContextP ctx,
+ struct intel_encoder_context *encoder_context,
+ unsigned int *msg,
+ int x, int y,
+ struct intel_batchbuffer *batch)
+{
+ unsigned int vme_intra_mb_mode, vme_chroma_pred_mode;
+ unsigned int pak_intra_mb_mode, pak_chroma_pred_mode;
+ unsigned int vme_luma_pred_mode[2], pak_luma_pred_mode[2];
+
+ if (batch == NULL)
+ batch = encoder_context->base.batch;
+
+ vme_intra_mb_mode = ((msg[0] & 0x30) >> 4);
+ assert((vme_intra_mb_mode == 0) || (vme_intra_mb_mode == 2)); // vp8 only supports intra_16x16 and intra_4x4
+ pak_intra_mb_mode = (vme_intra_mb_mode >> 1);
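+ /* VME intra MB mode 0 (intra_16x16) maps to PAK mb type 0, mode 2 (intra_4x4) to PAK mb type 1 */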
+
+ vme_luma_pred_mode[0] = msg[1];
+ vme_luma_pred_mode[1] = msg[2];
+ vme_chroma_pred_mode = msg[3] & 0x3;
+
+ pak_luma_pred_mode[0] = gen8_mfc_vp8_intra_mb_mode_map(vme_luma_pred_mode[0], pak_intra_mb_mode);
+ pak_luma_pred_mode[1] = gen8_mfc_vp8_intra_mb_mode_map(vme_luma_pred_mode[1], pak_intra_mb_mode);
+ pak_chroma_pred_mode = gen8_mfc_vp8_intra_mb_mode_map(vme_chroma_pred_mode, 0);
+
+ BEGIN_BCS_BATCH(batch, 7);
+
+ OUT_BCS_BATCH(batch, MFX_VP8_PAK_OBJECT | (7 - 2));
+ OUT_BCS_BATCH(batch, 0);
+ OUT_BCS_BATCH(batch, 0);
+ OUT_BCS_BATCH(batch,
+ (0 << 20) | /* mv format: intra mb */
+ (0 << 18) | /* Segment ID */
+ (0 << 17) | /* disable coeff clamp */
+ (1 << 13) | /* intra mb flag */
+ (0 << 11) | /* refer picture select: last frame */
+ (pak_intra_mb_mode << 8) | /* mb type */
+ (pak_chroma_pred_mode << 4) | /* mb uv mode */
+ (0 << 2) | /* skip mb flag: disable */
+ 0);
+
+ OUT_BCS_BATCH(batch, (y << 16) | x);
+ OUT_BCS_BATCH(batch, pak_luma_pred_mode[0]);
+ OUT_BCS_BATCH(batch, pak_luma_pred_mode[1]);
+
+ ADVANCE_BCS_BATCH(batch);
+}
+
+static void
+gen8_mfc_vp8_pak_object_inter(VADriverContextP ctx,
+ struct intel_encoder_context *encoder_context,
+ unsigned int *msg,
+ int offset,
+ int x, int y,
+ struct intel_batchbuffer *batch)
+{
+ int i;
+
+ if (batch == NULL)
+ batch = encoder_context->base.batch;
+
+ /* only inter_16x16 is supported for now */
+ assert((msg[AVC_INTER_MSG_OFFSET] & INTER_MODE_MASK) == INTER_16X16);
+ /* for inter_16x16 all 16 MVs should be the same;
+ * move the MV to the start of the VME MB record so that the offset is 64-byte aligned */
+ msg[0] = (msg[AVC_INTER_MV_OFFSET/4] & 0xfffefffe);
+ for (i = 1; i < 16; i++) {
+ msg[i] = msg[0];
+ }
+
+ BEGIN_BCS_BATCH(batch, 7);
+
+ OUT_BCS_BATCH(batch, MFX_VP8_PAK_OBJECT | (7 - 2));
+ OUT_BCS_BATCH(batch,
+ (0 << 29) | /* inline mv data: disabled */
+ 64);
+ OUT_BCS_BATCH(batch,
+ offset);
+ OUT_BCS_BATCH(batch,
+ (4 << 20) | /* mv format: inter */
+ (0 << 18) | /* Segment ID */
+ (0 << 17) | /* coeff clamp: disable */
+ (0 << 13) | /* intra mb flag: inter mb */
+ (0 << 11) | /* refer picture select: last frame */
+ (0 << 8) | /* mb type: 16x16 */
+ (0 << 4) | /* mb uv mode: dc_pred */
+ (0 << 2) | /* skip mb flag: disable */
+ 0);
+
+ OUT_BCS_BATCH(batch, (y << 16) | x);
+
+ /*new mv*/
+ OUT_BCS_BATCH(batch, 0x8);
+ OUT_BCS_BATCH(batch, 0x8);
+
+ ADVANCE_BCS_BATCH(batch);
+}
+
+static void
+gen8_mfc_vp8_pak_pipeline(VADriverContextP ctx,
+ struct encode_state *encode_state,
+ struct intel_encoder_context *encoder_context,
+ struct intel_batchbuffer *slice_batch)
+{
+ struct gen6_vme_context *vme_context = encoder_context->vme_context;
+ VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
+ VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
+ int width_in_mbs = ALIGN(seq_param->frame_width, 16) / 16;
+ int height_in_mbs = ALIGN(seq_param->frame_height, 16) / 16;
+ unsigned int *msg = NULL;
+ unsigned char *msg_ptr = NULL;
+ unsigned int i, offset, is_intra_frame;
+
+ is_intra_frame = !pic_param->pic_flags.bits.frame_type;
+
+ dri_bo_map(vme_context->vme_output.bo , 1);
+ msg_ptr = (unsigned char *)vme_context->vme_output.bo->virtual;
+
+ for( i = 0; i < width_in_mbs * height_in_mbs; i++) {
+ int h_pos = i % width_in_mbs;
+ int v_pos = i / width_in_mbs;
+ msg = (unsigned int *) (msg_ptr + i * vme_context->vme_output.size_block);
+
+ if (is_intra_frame) {
+ gen8_mfc_vp8_pak_object_intra(ctx,
+ encoder_context,
+ msg,
+ h_pos, v_pos,
+ slice_batch);
+ } else {
+ int inter_rdo, intra_rdo;
+ inter_rdo = msg[AVC_INTER_RDO_OFFSET] & AVC_RDO_MASK;
+ intra_rdo = msg[AVC_INTRA_RDO_OFFSET] & AVC_RDO_MASK;
+
+ if (intra_rdo < inter_rdo) {
+ gen8_mfc_vp8_pak_object_intra(ctx,
+ encoder_context,
+ msg,
+ h_pos, v_pos,
+ slice_batch);
+ } else {
+ offset = i * vme_context->vme_output.size_block;
+ gen8_mfc_vp8_pak_object_inter(ctx,
+ encoder_context,
+ msg,
+ offset,
+ h_pos, v_pos,
+ slice_batch);
+ }
+ }
+ }
+
+ dri_bo_unmap(vme_context->vme_output.bo);
+}
+
+/*
+ * A batch buffer for vp8 pak object commands
+ */
+static dri_bo *
+gen8_mfc_vp8_software_batchbuffer(VADriverContextP ctx,
+ struct encode_state *encode_state,
+ struct intel_encoder_context *encoder_context)
+{
+ struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+ struct intel_batchbuffer *batch;
+ dri_bo *batch_bo;
+
+ batch = mfc_context->aux_batchbuffer;
+ batch_bo = batch->buffer;
+
+ gen8_mfc_vp8_pak_pipeline(ctx, encode_state, encoder_context, batch);
+
+ intel_batchbuffer_align(batch, 8);
+
+ BEGIN_BCS_BATCH(batch, 2);
+ OUT_BCS_BATCH(batch, 0);
+ OUT_BCS_BATCH(batch, MI_BATCH_BUFFER_END);
+ ADVANCE_BCS_BATCH(batch);
+
+ dri_bo_reference(batch_bo);
+ intel_batchbuffer_free(batch);
+ mfc_context->aux_batchbuffer = NULL;
+
+ return batch_bo;
+}
+
+static void
+gen8_mfc_vp8_pipeline_programing(VADriverContextP ctx,
+ struct encode_state *encode_state,
+ struct intel_encoder_context *encoder_context)
+{
+ struct intel_batchbuffer *batch = encoder_context->base.batch;
+ dri_bo *slice_batch_bo;
+
+ slice_batch_bo = gen8_mfc_vp8_software_batchbuffer(ctx, encode_state, encoder_context);
+
+ // begin programming
+ intel_batchbuffer_start_atomic_bcs(batch, 0x4000);
+ intel_batchbuffer_emit_mi_flush(batch);
+
+ // picture level programming
+ gen8_mfc_vp8_pipeline_picture_programing(ctx, encode_state, encoder_context);
+
+ BEGIN_BCS_BATCH(batch, 4);
+ OUT_BCS_BATCH(batch, MI_BATCH_BUFFER_START | (1 << 8) | (1 << 0));
+ OUT_BCS_RELOC(batch,
+ slice_batch_bo,
+ I915_GEM_DOMAIN_COMMAND, 0,
+ 0);
+ OUT_BCS_BATCH(batch, 0);
+ OUT_BCS_BATCH(batch, 0);
+ ADVANCE_BCS_BATCH(batch);
+
+ // end programming
+ intel_batchbuffer_end_atomic(batch);
+
+ dri_bo_unreference(slice_batch_bo);
+}
+
+static void gen8_mfc_calc_vp8_coded_buffer_size(VADriverContextP ctx,
+ struct encode_state *encode_state,
+ struct intel_encoder_context *encoder_context)
+{
+ struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+ VAEncPictureParameterBufferVP8 *pic_param = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
+ unsigned char is_intra_frame = !pic_param->pic_flags.bits.frame_type;
+ unsigned int *vp8_encoding_status, first_partition_bytes, token_partition_bytes, vp8_coded_bytes;
+
+ dri_bo_map(mfc_context->vp8_state.token_statistics_bo, 0);
+
+ vp8_encoding_status = (unsigned int *)mfc_context->vp8_state.token_statistics_bo->virtual;
+ first_partition_bytes = (*vp8_encoding_status + 7) / 8;
+ token_partition_bytes = (*(unsigned int *)(vp8_encoding_status + 9) + 7) / 8;
+
+ /* coded_bytes includes the P0~P8 partition bytes + the uncompressed data chunk bytes + the partition_size field bytes in the bitstream */
+ vp8_coded_bytes = first_partition_bytes + token_partition_bytes + (3 + 7 * !!is_intra_frame) + (pic_param->pic_flags.bits.num_token_partitions - 1) * 3;
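+ /* e.g. (illustrative) a key frame with 2 token partitions: P0 bytes + token bytes
+ * + 10 uncompressed data bytes (3-byte frame tag + 7-byte key frame header)
+ * + 3 bytes for the first token partition's size field */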
+
+ dri_bo_unmap(mfc_context->vp8_state.token_statistics_bo);
+
+ dri_bo_map(mfc_context->vp8_state.final_frame_bo, 0);
+ struct i965_coded_buffer_segment *coded_buffer_segment = (struct i965_coded_buffer_segment *)(mfc_context->vp8_state.final_frame_bo->virtual);
+ coded_buffer_segment->base.size = vp8_coded_bytes;
+ dri_bo_unmap(mfc_context->vp8_state.final_frame_bo);
+}
+
+static VAStatus
+gen8_mfc_vp8_encode_picture(VADriverContextP ctx,
+ struct encode_state *encode_state,
+ struct intel_encoder_context *encoder_context)
+{
+ gen8_mfc_vp8_init(ctx, encode_state, encoder_context);
+ intel_mfc_vp8_prepare(ctx, encode_state, encoder_context);
+ /* Program the BCS pipeline */
+ gen8_mfc_vp8_pipeline_programing(ctx, encode_state, encoder_context);
+ gen8_mfc_run(ctx, encode_state, encoder_context);
+ gen8_mfc_calc_vp8_coded_buffer_size(ctx, encode_state, encoder_context);
+
+ return VA_STATUS_SUCCESS;
+}
static void
gen8_mfc_context_destroy(void *context)
@@ -3258,6 +4136,27 @@ gen8_mfc_context_destroy(void *context)
mfc_context->aux_batchbuffer = NULL;
+ dri_bo_unreference(mfc_context->vp8_state.coeff_probs_stream_in_bo);
+ mfc_context->vp8_state.coeff_probs_stream_in_bo = NULL;
+
+ dri_bo_unreference(mfc_context->vp8_state.final_frame_bo);
+ mfc_context->vp8_state.final_frame_bo = NULL;
+
+ dri_bo_unreference(mfc_context->vp8_state.frame_header_bo);
+ mfc_context->vp8_state.frame_header_bo = NULL;
+
+ dri_bo_unreference(mfc_context->vp8_state.intermediate_bo);
+ mfc_context->vp8_state.intermediate_bo = NULL;
+
+ dri_bo_unreference(mfc_context->vp8_state.mpc_row_store_bo);
+ mfc_context->vp8_state.mpc_row_store_bo = NULL;
+
+ dri_bo_unreference(mfc_context->vp8_state.stream_out_bo);
+ mfc_context->vp8_state.stream_out_bo = NULL;
+
+ dri_bo_unreference(mfc_context->vp8_state.token_statistics_bo);
+ mfc_context->vp8_state.token_statistics_bo = NULL;
+
free(mfc_context);
}
@@ -3287,7 +4186,11 @@ static VAStatus gen8_mfc_pipeline(VADriverContextP ctx,
jpeg_init_default_qmatrix(ctx, encoder_context);
vaStatus = gen8_mfc_jpeg_encode_picture(ctx, encode_state, encoder_context);
break;
-
+
+ case VAProfileVP8Version0_3:
+ vaStatus = gen8_mfc_vp8_encode_picture(ctx, encode_state, encoder_context);
+ break;
+
default:
vaStatus = VA_STATUS_ERROR_UNSUPPORTED_PROFILE;
break;
diff --git a/src/gen8_vme.c b/src/gen8_vme.c
index 29d4b5a..ace3288 100644
--- a/src/gen8_vme.c
+++ b/src/gen8_vme.c
@@ -120,6 +120,31 @@ static struct i965_kernel gen8_vme_mpeg2_kernels[] = {
},
};
+static const uint32_t gen8_vme_vp8_intra_frame[][4] = {
+#include "shaders/vme/vp8_intra_frame_gen8.g8b"
+};
+
+static const uint32_t gen8_vme_vp8_inter_frame[][4] = {
+#include "shaders/vme/vp8_inter_frame_gen8.g8b"
+};
+
+static struct i965_kernel gen8_vme_vp8_kernels[] = {
+ {
+ "VME Intra Frame",
+ VME_INTRA_SHADER, /*index*/
+ gen8_vme_vp8_intra_frame,
+ sizeof(gen8_vme_vp8_intra_frame),
+ NULL
+ },
+ {
+ "VME inter Frame",
+ VME_INTER_SHADER,
+ gen8_vme_vp8_inter_frame,
+ sizeof(gen8_vme_vp8_inter_frame),
+ NULL
+ },
+};
+
/* only used for VME source surface state */
static void
gen8_vme_source_surface_state(VADriverContextP ctx,
@@ -170,16 +195,14 @@ static void
gen8_vme_output_buffer_setup(VADriverContextP ctx,
struct encode_state *encode_state,
int index,
- struct intel_encoder_context *encoder_context)
+ struct intel_encoder_context *encoder_context,
+ int is_intra,
+ int width_in_mbs,
+ int height_in_mbs)
{
struct i965_driver_data *i965 = i965_driver_data(ctx);
struct gen6_vme_context *vme_context = encoder_context->vme_context;
- VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
- VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
- int is_intra = pSliceParameter->slice_type == SLICE_TYPE_I;
- int width_in_mbs = pSequenceParameter->picture_width_in_mbs;
- int height_in_mbs = pSequenceParameter->picture_height_in_mbs;
vme_context->vme_output.num_blocks = width_in_mbs * height_in_mbs;
vme_context->vme_output.pitch = 16; /* in bytes, always 16 */
@@ -194,7 +217,7 @@ gen8_vme_output_buffer_setup(VADriverContextP ctx,
* 16 * (2 + 2 * (1 + 8 + 2))= 16 * 24.
*/
- vme_context->vme_output.bo = dri_bo_alloc(i965->intel.bufmgr,
+ vme_context->vme_output.bo = dri_bo_alloc(i965->intel.bufmgr,
"VME output buffer",
vme_context->vme_output.num_blocks * vme_context->vme_output.size_block,
0x1000);
@@ -207,32 +230,57 @@ gen8_vme_output_buffer_setup(VADriverContextP ctx,
}
static void
+gen8_vme_avc_output_buffer_setup(VADriverContextP ctx,
+ struct encode_state *encode_state,
+ int index,
+ struct intel_encoder_context *encoder_context)
+{
+ VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
+ VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
+ int is_intra = pSliceParameter->slice_type == SLICE_TYPE_I;
+ int width_in_mbs = pSequenceParameter->picture_width_in_mbs;
+ int height_in_mbs = pSequenceParameter->picture_height_in_mbs;
+
+ gen8_vme_output_buffer_setup(ctx, encode_state, index, encoder_context, is_intra, width_in_mbs, height_in_mbs);
+
+}
+
+static void
gen8_vme_output_vme_batchbuffer_setup(VADriverContextP ctx,
struct encode_state *encode_state,
int index,
- struct intel_encoder_context *encoder_context)
-
+ struct intel_encoder_context *encoder_context,
+ int width_in_mbs,
+ int height_in_mbs)
{
struct i965_driver_data *i965 = i965_driver_data(ctx);
struct gen6_vme_context *vme_context = encoder_context->vme_context;
- VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
- int width_in_mbs = pSequenceParameter->picture_width_in_mbs;
- int height_in_mbs = pSequenceParameter->picture_height_in_mbs;
vme_context->vme_batchbuffer.num_blocks = width_in_mbs * height_in_mbs + 1;
vme_context->vme_batchbuffer.size_block = 64; /* 4 OWORDs */
vme_context->vme_batchbuffer.pitch = 16;
- vme_context->vme_batchbuffer.bo = dri_bo_alloc(i965->intel.bufmgr,
+ vme_context->vme_batchbuffer.bo = dri_bo_alloc(i965->intel.bufmgr,
"VME batchbuffer",
vme_context->vme_batchbuffer.num_blocks * vme_context->vme_batchbuffer.size_block,
0x1000);
- /*
vme_context->vme_buffer_suface_setup(ctx,
&vme_context->gpe_context,
&vme_context->vme_batchbuffer,
BINDING_TABLE_OFFSET(index),
SURFACE_STATE_OFFSET(index));
- */
+}
+
+static void
+gen8_vme_avc_output_vme_batchbuffer_setup(VADriverContextP ctx,
+ struct encode_state *encode_state,
+ int index,
+ struct intel_encoder_context *encoder_context)
+{
+ VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
+ int width_in_mbs = pSequenceParameter->picture_width_in_mbs;
+ int height_in_mbs = pSequenceParameter->picture_height_in_mbs;
+
+ gen8_vme_output_vme_batchbuffer_setup(ctx, encode_state, index, encoder_context, width_in_mbs, height_in_mbs);
}
static VAStatus
@@ -264,8 +312,8 @@ gen8_vme_surface_setup(VADriverContextP ctx,
}
/* VME output */
- gen8_vme_output_buffer_setup(ctx, encode_state, 3, encoder_context);
- gen8_vme_output_vme_batchbuffer_setup(ctx, encode_state, 5, encoder_context);
+ gen8_vme_avc_output_buffer_setup(ctx, encode_state, 3, encoder_context);
+ gen8_vme_avc_output_vme_batchbuffer_setup(ctx, encode_state, 5, encoder_context);
return VA_STATUS_SUCCESS;
}
@@ -724,37 +772,12 @@ gen8_vme_mpeg2_output_buffer_setup(VADriverContextP ctx,
int index,
int is_intra,
struct intel_encoder_context *encoder_context)
-
{
- struct i965_driver_data *i965 = i965_driver_data(ctx);
- struct gen6_vme_context *vme_context = encoder_context->vme_context;
VAEncSequenceParameterBufferMPEG2 *seq_param = (VAEncSequenceParameterBufferMPEG2 *)encode_state->seq_param_ext->buffer;
int width_in_mbs = ALIGN(seq_param->picture_width, 16) / 16;
int height_in_mbs = ALIGN(seq_param->picture_height, 16) / 16;
- vme_context->vme_output.num_blocks = width_in_mbs * height_in_mbs;
- vme_context->vme_output.pitch = 16; /* in bytes, always 16 */
-
- if (is_intra)
- vme_context->vme_output.size_block = INTRA_VME_OUTPUT_IN_BYTES * 2;
- else
- vme_context->vme_output.size_block = INTRA_VME_OUTPUT_IN_BYTES * 24;
- /*
- * Inter MV . 32-byte Intra search + 16 IME info + 128 IME MV + 32 IME Ref
- * + 16 FBR Info + 128 FBR MV + 32 FBR Ref.
- * 16 * (2 + 2 * (1 + 8 + 2))= 16 * 24.
- */
-
- vme_context->vme_output.bo = dri_bo_alloc(i965->intel.bufmgr,
- "VME output buffer",
- vme_context->vme_output.num_blocks * vme_context->vme_output.size_block,
- 0x1000);
- assert(vme_context->vme_output.bo);
- vme_context->vme_buffer_suface_setup(ctx,
- &vme_context->gpe_context,
- &vme_context->vme_output,
- BINDING_TABLE_OFFSET(index),
- SURFACE_STATE_OFFSET(index));
+ gen8_vme_output_buffer_setup(ctx, encode_state, index, encoder_context, is_intra, width_in_mbs, height_in_mbs);
}
static void
@@ -762,26 +785,12 @@ gen8_vme_mpeg2_output_vme_batchbuffer_setup(VADriverContextP ctx,
struct encode_state *encode_state,
int index,
struct intel_encoder_context *encoder_context)
-
{
- struct i965_driver_data *i965 = i965_driver_data(ctx);
- struct gen6_vme_context *vme_context = encoder_context->vme_context;
VAEncSequenceParameterBufferMPEG2 *seq_param = (VAEncSequenceParameterBufferMPEG2 *)encode_state->seq_param_ext->buffer;
int width_in_mbs = ALIGN(seq_param->picture_width, 16) / 16;
int height_in_mbs = ALIGN(seq_param->picture_height, 16) / 16;
- vme_context->vme_batchbuffer.num_blocks = width_in_mbs * height_in_mbs + 1;
- vme_context->vme_batchbuffer.size_block = 64; /* 4 OWORDs */
- vme_context->vme_batchbuffer.pitch = 16;
- vme_context->vme_batchbuffer.bo = dri_bo_alloc(i965->intel.bufmgr,
- "VME batchbuffer",
- vme_context->vme_batchbuffer.num_blocks * vme_context->vme_batchbuffer.size_block,
- 0x1000);
- vme_context->vme_buffer_suface_setup(ctx,
- &vme_context->gpe_context,
- &vme_context->vme_batchbuffer,
- BINDING_TABLE_OFFSET(index),
- SURFACE_STATE_OFFSET(index));
+ gen8_vme_output_vme_batchbuffer_setup(ctx, encode_state, index, encoder_context, width_in_mbs, height_in_mbs);
}
static VAStatus
@@ -1130,6 +1139,139 @@ gen8_vme_mpeg2_pipeline(VADriverContextP ctx,
}
static void
+gen8_vme_vp8_output_buffer_setup(VADriverContextP ctx,
+ struct encode_state *encode_state,
+ int index,
+ int is_intra,
+ struct intel_encoder_context *encoder_context)
+{
+ VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
+ int width_in_mbs = ALIGN(seq_param->frame_width, 16) / 16;
+ int height_in_mbs = ALIGN(seq_param->frame_height, 16) / 16;
+
+ gen8_vme_output_buffer_setup(ctx, encode_state, index, encoder_context, is_intra, width_in_mbs, height_in_mbs);
+}
+
+static void
+gen8_vme_vp8_output_vme_batchbuffer_setup(VADriverContextP ctx,
+ struct encode_state *encode_state,
+ int index,
+ struct intel_encoder_context *encoder_context)
+{
+ VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
+ int width_in_mbs = ALIGN(seq_param->frame_width, 16) / 16;
+ int height_in_mbs = ALIGN(seq_param->frame_height, 16) / 16;
+
+ gen8_vme_output_vme_batchbuffer_setup(ctx, encode_state, index, encoder_context, width_in_mbs, height_in_mbs);
+}
+
+static VAStatus
+gen8_vme_vp8_surface_setup(VADriverContextP ctx,
+ struct encode_state *encode_state,
+ int is_intra,
+ struct intel_encoder_context *encoder_context)
+{
+ struct object_surface *obj_surface;
+
+ /* Set up surface states */
+ /* current picture for encoding */
+ obj_surface = encode_state->input_yuv_object;
+ gen8_vme_source_surface_state(ctx, 0, obj_surface, encoder_context);
+ gen8_vme_media_source_surface_state(ctx, 4, obj_surface, encoder_context);
+ gen8_vme_media_chroma_source_surface_state(ctx, 6, obj_surface, encoder_context);
+
+ if (!is_intra) {
+ /* reference 0 */
+ obj_surface = encode_state->reference_objects[0];
+
+ if (obj_surface->bo != NULL)
+ gen8_vme_source_surface_state(ctx, 1, obj_surface, encoder_context);
+
+ /* reference 1 */
+ obj_surface = encode_state->reference_objects[1];
+
+ if (obj_surface && obj_surface->bo != NULL)
+ gen8_vme_source_surface_state(ctx, 2, obj_surface, encoder_context);
+ }
+
+ /* VME output */
+ gen8_vme_vp8_output_buffer_setup(ctx, encode_state, 3, is_intra, encoder_context);
+ gen8_vme_vp8_output_vme_batchbuffer_setup(ctx, encode_state, 5, encoder_context);
+
+ return VA_STATUS_SUCCESS;
+}
+
+static void
+gen8_vme_vp8_pipeline_programing(VADriverContextP ctx,
+ struct encode_state *encode_state,
+ int is_intra,
+ struct intel_encoder_context *encoder_context)
+{
+ struct gen6_vme_context *vme_context = encoder_context->vme_context;
+ struct intel_batchbuffer *batch = encoder_context->base.batch;
+ VAEncSequenceParameterBufferVP8 *seq_param = (VAEncSequenceParameterBufferVP8 *)encode_state->seq_param_ext->buffer;
+ int width_in_mbs = ALIGN(seq_param->frame_width, 16) / 16;
+ int height_in_mbs = ALIGN(seq_param->frame_height, 16) / 16;
+ int kernel_shader = (is_intra ? VME_INTRA_SHADER : VME_INTER_SHADER);
+
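+ /* VP8 reuses the MPEG-2 walker routine to fill the per-MB VME batchbuffer */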
+ gen8wa_vme_mpeg2_walker_fill_vme_batchbuffer(ctx,
+ encode_state,
+ width_in_mbs, height_in_mbs,
+ kernel_shader,
+ encoder_context);
+
+ intel_batchbuffer_start_atomic(batch, 0x1000);
+ gen8_gpe_pipeline_setup(ctx, &vme_context->gpe_context, batch);
+ BEGIN_BATCH(batch, 4);
+ OUT_BATCH(batch, MI_BATCH_BUFFER_START | (1 << 8) | (1 << 0));
+ OUT_RELOC(batch,
+ vme_context->vme_batchbuffer.bo,
+ I915_GEM_DOMAIN_COMMAND, 0,
+ 0);
+ OUT_BATCH(batch, 0);
+ OUT_BATCH(batch, 0);
+ ADVANCE_BATCH(batch);
+
+ intel_batchbuffer_end_atomic(batch);
+}
+
+static VAStatus gen8_vme_vp8_prepare(VADriverContextP ctx,
+ struct encode_state *encode_state,
+ struct intel_encoder_context *encoder_context)
+{
+ VAStatus vaStatus = VA_STATUS_SUCCESS;
+ VAEncPictureParameterBufferVP8 *pPicParameter = (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
+ int is_intra = !pPicParameter->pic_flags.bits.frame_type;
+
+ /* update vp8 mbmv cost */
+ intel_vme_vp8_update_mbmv_cost(ctx, encode_state, encoder_context);
+
+ /* Set up all the memory objects */
+ gen8_vme_vp8_surface_setup(ctx, encode_state, is_intra, encoder_context);
+ gen8_vme_interface_setup(ctx, encode_state, encoder_context);
+ gen8_vme_constant_setup(ctx, encode_state, encoder_context);
+
+ /* Program the media pipeline */
+ gen8_vme_vp8_pipeline_programing(ctx, encode_state, is_intra, encoder_context);
+
+ return vaStatus;
+}
+
+static VAStatus
+gen8_vme_vp8_pipeline(VADriverContextP ctx,
+ VAProfile profile,
+ struct encode_state *encode_state,
+ struct intel_encoder_context *encoder_context)
+{
+ gen8_vme_media_init(ctx, encoder_context);
+ gen8_vme_vp8_prepare(ctx, encode_state, encoder_context);
+ gen8_vme_run(ctx, encode_state, encoder_context);
+ gen8_vme_stop(ctx, encode_state, encoder_context);
+
+ return VA_STATUS_SUCCESS;
+}
+
+static void
gen8_vme_context_destroy(void *context)
{
struct gen6_vme_context *vme_context = context;
@@ -1180,6 +1322,12 @@ Bool gen8_vme_context_init(VADriverContextP ctx, struct intel_encoder_context *e
encoder_context->vme_context_destroy = NULL;
break;
+ case CODEC_VP8:
+ vme_kernel_list = gen8_vme_vp8_kernels;
+ encoder_context->vme_pipeline = gen8_vme_vp8_pipeline;
+ i965_kernel_num = sizeof(gen8_vme_vp8_kernels) / sizeof(struct i965_kernel);
+ break;
+
default:
/* never get here */
assert(0);
diff --git a/src/i965_device_info.c b/src/i965_device_info.c
index e63f509..a7e2546 100755
--- a/src/i965_device_info.c
+++ b/src/i965_device_info.c
@@ -297,6 +297,7 @@ static struct hw_codec_info chv_hw_codec_info = {
.has_di_motion_adptive = 1,
.has_di_motion_compensated = 1,
.has_vp8_decoding = 1,
+ .has_vp8_encoding = 1,
.has_h264_mvc_encoding = 1,
.num_filters = 5,
diff --git a/src/shaders/vme/Makefile.am b/src/shaders/vme/Makefile.am
index 0883c16..4543e35 100644
--- a/src/shaders/vme/Makefile.am
+++ b/src/shaders/vme/Makefile.am
@@ -20,8 +20,8 @@ INTEL_GEN75_INC = batchbuffer.inc vme75.inc vme75_mpeg2.inc
INTEL_GEN75_ASM = $(INTEL_G75A:%.g75a=%.gen75.asm)
-INTEL_G8B = intra_frame_gen8.g8b inter_frame_gen8.g8b inter_bframe_gen8.g8b mpeg2_inter_gen8.g8b
-INTEL_G8A = intra_frame_gen8.g8a inter_frame_gen8.g8a inter_bframe_gen8.g8a mpeg2_inter_gen8.g8a
+INTEL_G8B = intra_frame_gen8.g8b inter_frame_gen8.g8b inter_bframe_gen8.g8b mpeg2_inter_gen8.g8b vp8_intra_frame_gen8.g8b vp8_inter_frame_gen8.g8b
+INTEL_G8A = intra_frame_gen8.g8a inter_frame_gen8.g8a inter_bframe_gen8.g8a mpeg2_inter_gen8.g8a vp8_intra_frame_gen8.g8a vp8_inter_frame_gen8.g8a
INTEL_GEN8_INC = vme8.inc vme75_mpeg2.inc
INTEL_GEN8_ASM = $(INTEL_G8A:%.g8a=%.gen8.asm)
diff --git a/src/shaders/vme/vp8_inter_frame_gen8.asm b/src/shaders/vme/vp8_inter_frame_gen8.asm
new file mode 100644
index 0000000..d660810
--- /dev/null
+++ b/src/shaders/vme/vp8_inter_frame_gen8.asm
@@ -0,0 +1,739 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Zhao Yakui <yakui.zhao at intel.com>
+ * Xiang Haihao <haihao.xiang at intel.com>
+ * Li Zhong <zhong.li at intel.com>
+ *
+ */
+
+#define SAVE_RET add (1) RETURN_REG<1>:ud ip:ud 32:ud
+#define RETURN mov (1) ip:ud RETURN_REG<0,1,0>:ud
+
+/*
+ * __START
+ */
+__INTER_START:
+mov (16) tmp_reg0.0<1>:UD 0x0:UD {align1};
+mov (16) tmp_reg2.0<1>:UD 0x0:UD {align1};
+mov (16) tmp_reg4.0<1>:UD 0x0:UD {align1} ;
+mov (16) tmp_reg6.0<1>:UD 0x0:UD {align1} ;
+
+shl (2) read0_header.0<1>:D orig_xy_ub<2,2,1>:UB 4:UW {align1}; /* (x, y) * 16 */
+add (1) read0_header.0<1>:D read0_header.0<0,1,0>:D -8:W {align1}; /* X offset */
+add (1) read0_header.4<1>:D read0_header.4<0,1,0>:D -1:W {align1}; /* Y offset */
+mov (1) read0_header.8<1>:UD BLOCK_32X1 {align1};
+mov (1) read0_header.20<1>:UB thread_id_ub {align1}; /* dispatch id */
+
+shl (2) read1_header.0<1>:D orig_xy_ub<2,2,1>:UB 4:UW {align1}; /* (x, y) * 16 */
+add (1) read1_header.0<1>:D read1_header.0<0,1,0>:D -4:W {align1}; /* X offset */
+mov (1) read1_header.8<1>:UD BLOCK_4X16 {align1};
+mov (1) read1_header.20<1>:UB thread_id_ub {align1}; /* dispatch id */
+
+shl (2) vme_m0.8<1>:UW orig_xy_ub<2,2,1>:UB 4:UW {align1}; /* (x, y) * 16 */
+mov (1) vme_m0.20<1>:UB thread_id_ub {align1}; /* dispatch id */
+
+mul (1) obw_m0.8<1>:UD w_in_mb_uw<0,1,0>:UW orig_y_ub<0,1,0>:UB {align1};
+add (1) obw_m0.8<1>:UD obw_m0.8<0,1,0>:UD orig_x_ub<0,1,0>:UB {align1};
+mul (1) obw_m0.8<1>:UD obw_m0.8<0,1,0>:UD 24:UD {align1};
+mov (1) obw_m0.20<1>:UB thread_id_ub {align1}; /* dispatch id */
+
+/*
+ * Media Read Message -- fetch Luma neighbor edge pixels
+ */
+/* ROW */
+mov (8) msg_reg0.0<1>:UD read0_header.0<8,8,1>:UD {align1};
+send (8) msg_ind INEP_ROW<1>:UB null read(BIND_IDX_INEP, 0, 0, 4) mlen 1 rlen 1 {align1};
+
+/* COL */
+mov (8) msg_reg0.0<1>:UD read1_header.0<8,8,1>:UD {align1};
+send (8) msg_ind INEP_COL0<1>:UB null read(BIND_IDX_INEP, 0, 0, 4) mlen 1 rlen 2 {align1};
+
+/*
+ * Media Read Message -- fetch Chroma neighbor edge pixels
+ */
+/* ROW */
+shl (2) read0_header.0<1>:D orig_xy_ub<2,2,1>:UB 3:UW {align1}; /* x * 16 , y * 8 */
+mul (1) read0_header.0<1>:D read0_header.0<0,1,0>:D 2:W {align1};
+add (1) read0_header.0<1>:D read0_header.0<0,1,0>:D -8:W {align1}; /* X offset */
+add (1) read0_header.4<1>:D read0_header.4<0,1,0>:D -1:W {align1}; /* Y offset */
+mov (8) msg_reg0.0<1>:UD read0_header.0<8,8,1>:UD {align1};
+send (8) msg_ind CHROMA_ROW<1>:UB null read(BIND_IDX_CBCR, 0, 0, 4) mlen 1 rlen 1 {align1};
+
+/* COL */
+shl (2) read1_header.0<1>:D orig_xy_ub<2,2,1>:UB 3:UW {align1}; /* x * 16, y * 8 */
+mul (1) read1_header.0<1>:D read1_header.0<0,1,0>:D 2:W {align1};
+add (1) read1_header.0<1>:D read1_header.0<0,1,0>:D -4:W {align1}; /* X offset */
+mov (1) read1_header.8<1>:UD BLOCK_8X4 {align1};
+mov (8) msg_reg0.0<1>:UD read1_header.0<8,8,1>:UD {align1};
+send (8) msg_ind CHROMA_COL<1>:UB null read(BIND_IDX_CBCR, 0, 0, 4) mlen 1 rlen 1 {align1};
+
+mov (8) mb_mvp_ref.0<1>:ud 0:ud {align1};
+mov (8) mb_ref_win.0<1>:ud 0:ud {align1};
+and.z.f0.0 (1) null:uw mb_hwdep<0,1,0>:uw 0x04:uw {align1};
+(f0.0) jmpi (1) __mb_hwdep_end;
+/* read back the data for MB A */
+/* the layout of the MB result is: rX.0 (available), rX.4 (MVa), rX.8 (MVb), rX.16 (Pred_L0 flag),
+* rX.18 (Pred_L1 flag), rX.20 (forward reference ID), rX.22 (backward reference ID)
+*/
+mov (8) mba_result.0<1>:ud 0x0:ud {align1};
+mov (8) mbb_result.0<1>:ud 0x0:ud {align1};
+mov (8) mbc_result.0<1>:ud 0x0:ud {align1};
+mba_start:
+mov (8) mb_msg0.0<1>:ud 0:ud {align1};
+and.z.f0.0 (1) null:uw input_mb_intra_ub<0,1,0>:ub INTRA_PRED_AVAIL_FLAG_AE:uw {align1};
+/* MB A doesn't exist. Zero MV. mba_flag is zero and ref ID = -1 */
+(f0.0) mov (2) mba_result.20<1>:w -1:w {align1};
+(f0.0) jmpi (1) mbb_start;
+mov (1) mba_result.0<1>:d MB_AVAIL {align1};
+mov (2) tmp_reg0.0<1>:UW orig_xy_ub<2,2,1>:UB {align1};
+add (1) tmp_reg0.0<1>:w tmp_reg0.0<0,1,0>:w -1:w {align1};
+mul (1) mb_msg0.8<1>:UD w_in_mb_uw<0,1,0>:UW tmp_reg0.2<0,1,0>:UW {align1};
+add (1) mb_msg0.8<1>:UD mb_msg0.8<0,1,0>:UD tmp_reg0.0<0,1,0>:uw {align1};
+mul (1) mb_msg0.8<1>:UD mb_msg0.8<0,1,0>:UD 24:UD {align1};
+mov (1) mb_msg0.20<1>:UB thread_id_ub {align1}; /* dispatch id */
+
+/* bind index 3, read 4 oword (64bytes), msg type: 0(OWord Block Read) */
+send (16)
+ mb_ind
+ mb_wb.0<1>:ud
+ NULL
+ data_port(
+ OBR_CACHE_TYPE,
+ OBR_MESSAGE_TYPE,
+ OBR_CONTROL_4,
+ OBR_BIND_IDX,
+ OBR_WRITE_COMMIT_CATEGORY,
+ OBR_HEADER_PRESENT
+ )
+ mlen 1
+ rlen 2
+ {align1};
+
+/* TODO: RefID is required after multi-references are added */
+cmp.l.f0.0 (1) null:w mb_intra_wb.16<0,1,0>:uw mb_inter_wb.8<0,1,0>:uw {align1};
+(f0.0) mov (2) mba_result.20<1>:w -1:w {align1};
+(f0.0) jmpi (1) mbb_start;
+
+add (1) mb_msg0.8<1>:UD mb_msg0.8<0,1,0>:ud 3:ud {align1};
+/* Read MV for MB A */
+/* bind index 3, read 8 oword (128bytes), msg type: 0(OWord Block Read) */
+send (16)
+ mb_ind
+ mb_mv0.0<1>:ud
+ NULL
+ data_port(
+ OBR_CACHE_TYPE,
+ OBR_MESSAGE_TYPE,
+ OBR_CONTROL_8,
+ OBR_BIND_IDX,
+ OBR_WRITE_COMMIT_CATEGORY,
+ OBR_HEADER_PRESENT
+ )
+ mlen 1
+ rlen 4
+ {align1};
+/* TODO: RefID is required after multi-references are added */
+/* MV */
+mov (2) mba_result.4<1>:ud mb_mv1.8<2,2,1>:ud {align1};
+mov (1) mba_result.16<1>:w MB_PRED_FLAG {align1};
+
+mbb_start:
+mov (8) mb_msg0.0<1>:ud 0:ud {align1};
+and.z.f0.0 (1) null:uw input_mb_intra_ub<0,1,0>:ub INTRA_PRED_AVAIL_FLAG_B:uw {align1};
+/* MB B doesn't exist. Zero MV. The MB B available flag is zero */
+/* If MB B doesn't exist, neither MB C nor MB D exists */
+(f0.0) mov (2) mbb_result.20<1>:w -1:w {align1};
+(f0.0) mov (2) mbc_result.20<1>:w -1:w {align1};
+(f0.0) jmpi (1) mb_mvp_start;
+mov (1) mbb_result.0<1>:d MB_AVAIL {align1};
+mov (2) tmp_reg0.0<1>:UW orig_xy_ub<2,2,1>:UB {align1};
+add (1) tmp_reg0.2<1>:w tmp_reg0.2<0,1,0>:w -1:w {align1};
+mul (1) mb_msg0.8<1>:UD w_in_mb_uw<0,1,0>:UW tmp_reg0.2<0,1,0>:UW {align1};
+add (1) mb_msg0.8<1>:UD mb_msg0.8<0,1,0>:UD tmp_reg0.0<0,1,0>:uw {align1};
+mul (1) mb_msg0.8<1>:UD mb_msg0.8<0,1,0>:UD 24:UD {align1};
+mov (1) mb_msg0.20<1>:UB thread_id_ub {align1}; /* dispatch id */
+
+/* bind index 3, read 4 oword (64bytes), msg type: 0(OWord Block Read) */
+send (16)
+ mb_ind
+ mb_wb.0<1>:ud
+ NULL
+ data_port(
+ OBR_CACHE_TYPE,
+ OBR_MESSAGE_TYPE,
+ OBR_CONTROL_4,
+ OBR_BIND_IDX,
+ OBR_WRITE_COMMIT_CATEGORY,
+ OBR_HEADER_PRESENT
+ )
+ mlen 1
+ rlen 2
+ {align1};
+
+/* TODO: RefID is required after multi-references are added */
+cmp.l.f0.0 (1) null:w mb_intra_wb.16<0,1,0>:uw mb_inter_wb.8<0,1,0>:uw {align1};
+(f0.0) mov (2) mbb_result.20<1>:w -1:w {align1};
+(f0.0) jmpi (1) mbc_start;
+add (1) mb_msg0.8<1>:UD mb_msg0.8<0,1,0>:ud 3:ud {align1};
+/* Read MV for MB B */
+/* bind index 3, read 8 oword (128bytes), msg type: 0(OWord Block Read) */
+send (16)
+ mb_ind
+ mb_mv0.0<1>:ud
+ NULL
+ data_port(
+ OBR_CACHE_TYPE,
+ OBR_MESSAGE_TYPE,
+ OBR_CONTROL_8,
+ OBR_BIND_IDX,
+ OBR_WRITE_COMMIT_CATEGORY,
+ OBR_HEADER_PRESENT
+ )
+ mlen 1
+ rlen 4
+ {align1};
+/* TODO: RefID is required after multi-references are added */
+mov (2) mbb_result.4<1>:ud mb_mv2.16<2,2,1>:ud {align1};
+mov (1) mbb_result.16<1>:w MB_PRED_FLAG {align1};
+
+mbc_start:
+mov (8) mb_msg0.0<1>:ud 0:ud {align1};
+and.z.f0.0 (1) null:uw input_mb_intra_ub<0,1,0>:ub INTRA_PRED_AVAIL_FLAG_C:uw {align1};
+/* MB C doesn't exist. Zero MV. The MB C available flag is zero */
+/* Per the H.264 spec, MB D is used in place of MB C when MB C doesn't exist */
+(f0.0) jmpi (1) mbd_start;
+mov (1) mbc_result.0<1>:d MB_AVAIL {align1};
+mov (2) tmp_reg0.0<1>:UW orig_xy_ub<2,2,1>:UB {align1};
+add (1) tmp_reg0.2<1>:w tmp_reg0.2<0,1,0>:w -1:w {align1};
+add (1) tmp_reg0.0<1>:w tmp_reg0.0<0,1,0>:w 1:w {align1};
+mul (1) mb_msg0.8<1>:UD w_in_mb_uw<0,1,0>:UW tmp_reg0.2<0,1,0>:UW {align1};
+add (1) mb_msg0.8<1>:UD mb_msg0.8<0,1,0>:UD tmp_reg0.0<0,1,0>:uw {align1};
+mul (1) mb_msg0.8<1>:UD mb_msg0.8<0,1,0>:UD 24:UD {align1};
+mov (1) mb_msg0.20<1>:UB thread_id_ub {align1}; /* dispatch id */
+
+/* bind index 3, read 4 oword (64bytes), msg type: 0(OWord Block Read) */
+send (16)
+ mb_ind
+ mb_wb.0<1>:ud
+ NULL
+ data_port(
+ OBR_CACHE_TYPE,
+ OBR_MESSAGE_TYPE,
+ OBR_CONTROL_4,
+ OBR_BIND_IDX,
+ OBR_WRITE_COMMIT_CATEGORY,
+ OBR_HEADER_PRESENT
+ )
+ mlen 1
+ rlen 2
+ {align1};
+
+/* TODO: RefID is required after multi-references are added */
+cmp.l.f0.0 (1) null:w mb_intra_wb.16<0,1,0>:uw mb_inter_wb.8<0,1,0>:uw {align1};
+(f0.0) mov (2) mbc_result.20<1>:w -1:w {align1};
+(f0.0) jmpi (1) mb_mvp_start;
+add (1) mb_msg0.8<1>:UD mb_msg0.8<0,1,0>:ud 3:ud {align1};
+/* Read MV for MB C */
+/* bind index 3, read 8 oword (128bytes), msg type: 0(OWord Block Read) */
+send (16)
+ mb_ind
+ mb_mv0.0<1>:ud
+ NULL
+ data_port(
+ OBR_CACHE_TYPE,
+ OBR_MESSAGE_TYPE,
+ OBR_CONTROL_8,
+ OBR_BIND_IDX,
+ OBR_WRITE_COMMIT_CATEGORY,
+ OBR_HEADER_PRESENT
+ )
+ mlen 1
+ rlen 4
+ {align1};
+/* TODO: RefID is required after multi-references are added */
+/* Forward MV */
+mov (2) mbc_result.4<1>:ud mb_mv2.16<2,2,1>:ud {align1};
+mov (1) mbc_result.16<1>:w MB_PRED_FLAG {align1};
+
+jmpi (1) mb_mvp_start;
+mbd_start:
+mov (8) mb_msg0.0<1>:ud 0:ud {align1};
+and.z.f0.0 (1) null:uw input_mb_intra_ub<0,1,0>:ub INTRA_PRED_AVAIL_FLAG_D:uw {align1};
+(f0.0) mov (2) mbc_result.20<1>:w -1:w {align1};
+(f0.0) jmpi (1) mb_mvp_start;
+mov (1) mbc_result.0<1>:d MB_AVAIL {align1};
+mov (2) tmp_reg0.0<1>:UW orig_xy_ub<2,2,1>:UB {align1};
+add (2) tmp_reg0.0<1>:w tmp_reg0.0<2,2,1>:w -1:w {align1};
+mul (1) mb_msg0.8<1>:UD w_in_mb_uw<0,1,0>:UW tmp_reg0.2<0,1,0>:UW {align1};
+add (1) mb_msg0.8<1>:UD mb_msg0.8<0,1,0>:UD tmp_reg0.0<0,1,0>:uw {align1};
+mul (1) mb_msg0.8<1>:UD mb_msg0.8<0,1,0>:UD 24:UD {align1};
+mov (1) mb_msg0.20<1>:UB thread_id_ub {align1}; /* dispatch id */
+
+/* bind index 3, read 4 oword (64bytes), msg type: 0(OWord Block Read) */
+send (16)
+ mb_ind
+ mb_wb.0<1>:ud
+ NULL
+ data_port(
+ OBR_CACHE_TYPE,
+ OBR_MESSAGE_TYPE,
+ OBR_CONTROL_4,
+ OBR_BIND_IDX,
+ OBR_WRITE_COMMIT_CATEGORY,
+ OBR_HEADER_PRESENT
+ )
+ mlen 1
+ rlen 2
+ {align1};
+
+cmp.l.f0.0 (1) null:w mb_intra_wb.16<0,1,0>:uw mb_inter_wb.8<0,1,0>:uw {align1};
+(f0.0) mov (2) mbc_result.20<1>:w -1:w {align1};
+(f0.0) jmpi (1) mb_mvp_start;
+
+add (1) mb_msg0.8<1>:UD mb_msg0.8<0,1,0>:ud 3:ud {align1};
+/* Read MV for MB D */
+/* bind index 3, read 8 oword (128bytes), msg type: 0(OWord Block Read) */
+send (16)
+ mb_ind
+ mb_mv0.0<1>:ub
+ NULL
+ data_port(
+ OBR_CACHE_TYPE,
+ OBR_MESSAGE_TYPE,
+ OBR_CONTROL_8,
+ OBR_BIND_IDX,
+ OBR_WRITE_COMMIT_CATEGORY,
+ OBR_HEADER_PRESENT
+ )
+ mlen 1
+ rlen 4
+ {align1};
+
+/* TODO: RefID is required after multi-references are added */
+
+/* Forward MV */
+mov (2) mbc_result.4<1>:ud mb_mv3.24<2,2,1>:ud {align1};
+mov (1) mbc_result.16<1>:w MB_PRED_FLAG {align1};
+
+mb_mvp_start:
+/* TODO: Add the skip prediction */
+/* Check whether both MB B and MB C are unavailable */
+add (1) tmp_reg0.0<1>:d mbb_result.0<0,1,0>:d mbc_result.0<0,1,0>:d {align1};
+cmp.z.f0.0 (1) null:d tmp_reg0.0<0,1,0>:d 0:d {align1};
+(-f0.0) jmpi (1) mb_median_start;
+cmp.nz.f0.0 (1) null:d mba_result.0<0,1,0>:d 0:d {align1};
+(f0.0) mov (1) mbb_result.4<1>:ud mba_result.4<0,1,0>:ud {align1};
+(f0.0) mov (1) mbc_result.4<1>:ud mba_result.4<0,1,0>:ud {align1};
+(f0.0) mov (1) mbb_result.20<1>:uw mba_result.20<0,1,0>:uw {align1};
+(f0.0) mov (1) mbc_result.20<1>:uw mba_result.20<0,1,0>:uw {align1};
+(f0.0) mov (1) mb_mvp_ref.0<1>:ud mba_result.4<0,1,0>:ud {align1};
+(-f0.0) mov (1) mb_mvp_ref.0<1>:ud 0:ud {align1};
+jmpi (1) __mb_hwdep_end;
+
+mb_median_start:
+/* check whether exactly one neighbour MB has the same ref ID as the current MB */
+mov (8) tmp_reg0.0<1>:ud 0:ud {align1};
+cmp.z.f0.0 (1) null:d mba_result.20<0,1,0>:w 0:w {align1};
+(f0.0) add (1) tmp_reg0.0<1>:w tmp_reg0.0<0,1,0>:w 1:w {align1};
+(f0.0) mov (1) tmp_reg0.4<1>:ud mba_result.4<0,1,0>:ud {align1};
+cmp.z.f0.0 (1) null:d mbb_result.20<0,1,0>:w 0:w {align1};
+(f0.0) add (1) tmp_reg0.0<1>:w tmp_reg0.0<0,1,0>:w 1:w {align1};
+(f0.0) mov (1) tmp_reg0.4<1>:ud mbb_result.4<0,1,0>:ud {align1};
+cmp.z.f0.0 (1) null:d mbc_result.20<0,1,0>:w 0:w {align1};
+(f0.0) add (1) tmp_reg0.0<1>:w tmp_reg0.0<0,1,0>:w 1:w {align1};
+(f0.0) mov (1) tmp_reg0.4<1>:ud mbc_result.4<0,1,0>:ud {align1};
+cmp.e.f0.0 (1) null:d tmp_reg0.0<0,1,0>:w 1:w {align1};
+(f0.0) mov (1) mb_mvp_ref.0<1>:ud tmp_reg0.4<0,1,0>:ud {align1};
+(f0.0) jmpi (1) __mb_hwdep_end;
+
+mov (1) INPUT_ARG0.0<1>:w mba_result.4<0,1,0>:w {align1};
+mov (1) INPUT_ARG0.4<1>:w mbb_result.4<0,1,0>:w {align1};
+mov (1) INPUT_ARG0.8<1>:w mbc_result.4<0,1,0>:w {align1};
+SAVE_RET {align1};
+ jmpi (1) word_imedian;
+mov (1) mb_mvp_ref.0<1>:w RET_ARG<0,1,0>:w {align1};
+mov (1) INPUT_ARG0.0<1>:w mba_result.6<0,1,0>:w {align1};
+mov (1) INPUT_ARG0.4<1>:w mbb_result.6<0,1,0>:w {align1};
+mov (1) INPUT_ARG0.8<1>:w mbc_result.6<0,1,0>:w {align1};
+SAVE_RET {align1};
+jmpi (1) word_imedian;
+mov (1) mb_mvp_ref.2<1>:w RET_ARG<0,1,0>:w {align1};
+
+__mb_hwdep_end:
+asr (2) mb_ref_win.0<1>:w mb_mvp_ref.0<2,2,1>:w 2:w {align1};
+add (2) mb_ref_win.8<1>:w mb_ref_win.0<2,2,1>:w 3:w {align1};
+and (2) mb_ref_win.16<1>:uw mb_ref_win.8<2,2,1>:uw 0xFFFC:uw {align1};
+/* m2: get the MV/MB cost passed in the constant buffer when the
+ * thread is spawned by MEDIA_OBJECT */
+mov (8) vme_m2<1>:UD r1.0<8,8,1>:UD {align1};
+
+mov (8) vme_msg_2<1>:UD vme_m2.0<8,8,1>:UD {align1};
+
+/* m3 FWD/BWD cost center*/
+mov (8) vme_msg_3<1>:UD 0x0:UD {align1};
+
+/* m4 skip center*/
+mov (8) vme_msg_4<1>:UD 0x0:UD {align1};
+
+/* m5 */
+mov (1) INEP_ROW.0<1>:UD 0x0:UD {align1};
+and (1) INEP_ROW.4<1>:UD INEP_ROW.4<0,1,0>:UD 0xFF000000:UD {align1};
+mov (8) vme_msg_5<1>:UD INEP_ROW.0<8,8,1>:UD {align1};
+
+
+/* Use the Luma mode */
+mov (1) tmp_reg0.0<1>:UW LUMA_INTRA_MODE:UW {align1};
+mov (1) vme_msg_5.5<1>:UB tmp_reg0.0<0,1,0>:UB {align1};
+
+/* m6 */
+mov (8) vme_msg_6<1>:UD 0x0:UD {align1};
+mov (16) vme_msg_6.0<1>:UB INEP_COL0.3<32,8,4>:UB {align1};
+mov (1) vme_msg_6.16<1>:UD INTRA_PREDICTORE_MODE {align1};
+
+/* the penalty for Intra mode */
+mov (1) vme_msg_6.28<1>:UD 0x010101:UD {align1};
+mov (1) vme_msg_6.20<1>:UW CHROMA_ROW.6<0,1,0>:UW {align1};
+
+
+/* m7 */
+
+mov (4) vme_msg_7.16<1>:UD CHROMA_ROW.8<4,4,1>:UD {align1};
+mov (8) vme_msg_7.0<1>:UW CHROMA_COL.2<16,8,2>:UW {align1};
+
+/*
+ * SIC VME message
+ */
+
+/* m1 */
+mov (1) intra_flag<1>:UW 0x0:UW {align1};
+mov (1) intra_part_mask_ub<1>:UB LUMA_INTRA_8x8_DISABLE {align1}; /* VP8 doesn't support the intra 8x8 mode */
+
+/* assign MB intra struct from the thread payload*/
+mov (1) mb_intra_struct_ub<1>:UB input_mb_intra_ub<0,1,0>:UB {align1};
+
+/* Disable the DC Haar component when calculating the Haar SATD block */
+mov (1) tmp_reg0.0<1>:UW DC_HARR_DISABLE:UW {align1};
+mov (1) vme_m1.30<1>:UB tmp_reg0.0<0,1,0>:UB {align1};
+mov (8) vme_msg_1<1>:UD vme_m1.0<8,8,1>:UD {align1};
+
+/* m0 */
+mov (1) vme_m0.12<1>:UD INTRA_SAD_HAAR:UD {align1}; /* 16x16 Source, Intra_harr */
+mov (8) vme_msg_0.0<1>:UD vme_m0.0<8,8,1>:UD {align1};
+
+/* after verification this will be passed via the payload */
+send (8)
+ vme_msg_ind
+ vme_wb<1>:UD
+ null
+ cre(
+ BIND_IDX_VME,
+ VME_SIC_MESSAGE_TYPE
+ )
+ mlen sic_vme_msg_length
+ rlen vme_wb_length
+ {align1};
+/*
+ * Oword Block Write message
+ */
+mov (8) msg_reg0.0<1>:UD obw_m0<8,8,1>:UD {align1};
+
+mov (1) msg_reg1.0<1>:UD vme_wb.0<0,1,0>:UD {align1};
+mov (1) msg_reg1.4<1>:UD vme_wb.16<0,1,0>:UD {align1};
+mov (1) msg_reg1.8<1>:UD vme_wb.20<0,1,0>:UD {align1};
+mov (1) msg_reg1.12<1>:UD vme_wb.24<0,1,0>:UD {align1};
+
+/* Distortion, Intra (17-16), */
+mov (1) msg_reg1.16<1>:UW vme_wb.12<0,1,0>:UW {align1};
+
+mov (1) msg_reg1.20<1>:UD vme_wb.8<0,1,0>:UD {align1};
+/* VME clock counts */
+mov (1) msg_reg1.24<1>:UD vme_wb.28<0,1,0>:UD {align1};
+
+mov (1) msg_reg1.28<1>:UD obw_m0.8<0,1,0>:UD {align1};
+
+/* bind index 3, write 2 oword (32bytes), msg type: 8(OWord Block Write) */
+send (16)
+ msg_ind
+ obw_wb
+ null
+ data_port(
+ OBW_CACHE_TYPE,
+ OBW_MESSAGE_TYPE,
+ OBW_CONTROL_2,
+ OBW_BIND_IDX,
+ OBW_WRITE_COMMIT_CATEGORY,
+ OBW_HEADER_PRESENT
+ )
+ mlen 2
+ rlen obw_wb_length
+ {align1};
+
+/* IME search */
+mov (1) vme_m0.12<1>:UD SEARCH_CTRL_SINGLE + VP8_INTER_PART_MASK + INTER_SAD_HAAR:UD {align1}; /* 16x16 Source, harr */
+mov (1) vme_m0.22<1>:UW REF_REGION_SIZE {align1}; /* Reference Width&Height, 48x40 */
+
+mov (1) vme_m0.0<1>:UD vme_m0.8<0,1,0>:UD {align1};
+
+add (1) vme_m0.0<1>:W vme_m0.0<0,1,0>:W -16:W {align1}; /* Reference = (x-16,y-12)-(x+32,y+28) */
+add (1) vme_m0.2<1>:W vme_m0.2<0,1,0>:W -12:W {align1};
+
+mov (1) vme_m0.0<1>:W -16:W {align1};
+mov (1) vme_m0.2<1>:W -12:W {align1};
+
+mov (1) vme_m0.4<1>:UD vme_m0.0<0,1,0>:UD {align1};
+
+and.z.f0.0 (1) null:uw input_mb_intra_ub<0,1,0>:ub INTRA_PRED_AVAIL_FLAG_AE:uw {align1};
+(f0.0) add (1) vme_m0.0<1>:w vme_m0.0<0,1,0>:w 12:w {align1};
+and.z.f0.0 (1) null:uw input_mb_intra_ub<0,1,0>:ub INTRA_PRED_AVAIL_FLAG_B:uw {align1};
+(f0.0) add (1) vme_m0.2<1>:w vme_m0.2<0,1,0>:w 8:w {align1};
+
+add (2) vme_m0.0<1>:w vme_m0.0<2,2,1>:w mb_ref_win.16<2,2,1>:w {align1};
+add (2) vme_m0.4<1>:w vme_m0.4<2,2,1>:w mb_ref_win.16<2,2,1>:w {align1};
+mov (8) vme_msg_0.0<1>:UD vme_m0.0<8,8,1>:UD {align1};
+
+mov (1) vme_m1.0<1>:UD ADAPTIVE_SEARCH_ENABLE:ud {align1} ;
+/* the max MV number is passed via the constant buffer */
+mov (1) vme_m1.4<1>:UB r4.28<0,1,0>:UB {align1};
+mov (1) vme_m1.8<1>:UD START_CENTER + SEARCH_PATH_LEN:UD {align1};
+mov (8) vme_msg_1.0<1>:UD vme_m1.0<8,8,1>:UD {align1};
+
+/* Set up the cost center */
+/* currently the four 8x8 blocks share the same cost center */
+mov (4) vme_m3.0<2>:ud mb_mvp_ref.0<0,1,0>:ud {align1};
+mov (4) vme_m3.4<2>:ud mb_mvp_ref.0<0,1,0>:ud {align1};
+
+mov (8) vme_msg_3<1>:UD vme_m3.0<8,8,1>:UD {align1};
+mov (8) vme_msg_2<1>:UD vme_m2.0<8,8,1>:UD {align1};
+
+/* M4/M5 search path */
+mov (1) vme_msg_4.0<1>:UD 0x01010101:UD {align1};
+mov (1) vme_msg_4.4<1>:UD 0x10010101:UD {align1};
+mov (1) vme_msg_4.8<1>:UD 0x0F0F0F0F:UD {align1};
+mov (1) vme_msg_4.12<1>:UD 0x100F0F0F:UD {align1};
+mov (1) vme_msg_4.16<1>:UD 0x01010101:UD {align1};
+mov (1) vme_msg_4.20<1>:UD 0x10010101:UD {align1};
+mov (1) vme_msg_4.24<1>:UD 0x0F0F0F0F:UD {align1};
+mov (1) vme_msg_4.28<1>:UD 0x100F0F0F:UD {align1};
+
+mov (1) vme_msg_5.0<1>:UD 0x01010101:UD {align1};
+mov (1) vme_msg_5.4<1>:UD 0x10010101:UD {align1};
+mov (1) vme_msg_5.8<1>:UD 0x0F0F0F0F:UD {align1};
+mov (1) vme_msg_5.12<1>:UD 0x000F0F0F:UD {align1};
+
+mov (4) vme_msg_5.16<1>:UD 0x0:UD {align1};
+
+send (8)
+ vme_msg_ind
+ vme_wb<1>:UD
+ null
+ vme(
+ BIND_IDX_VME,
+ 0,
+ 0,
+ VME_IME_MESSAGE_TYPE
+ )
+ mlen ime_vme_msg_length
+ rlen vme_wb_length {align1};
+
+/* Set Macroblock-shape/mode for FBR */
+
+mov (1) vme_m2.20<1>:UD 0x0:UD {align1};
+mov (1) vme_m2.21<1>:UB vme_wb.25<0,1,0>:UB {align1};
+mov (1) vme_m2.22<1>:UB vme_wb.26<0,1,0>:UB {align1};
+
+and (1) tmp_reg0.0<1>:UW vme_wb.0<0,1,0>:UW 0x03:UW {align1};
+mov (1) vme_m2.20<1>:UB tmp_reg0.0<0,1,0>:UB {align1};
+
+/* Send FBR message into CRE */
+
+mov (8) vme_msg_4.0<1>:UD vme_wb1.0<8,8,1>:UD {align1};
+mov (8) vme_msg_5.0<1>:ud vme_wb2.0<8,8,1>:ud {align1};
+mov (8) vme_msg_6.0<1>:ud vme_wb3.0<8,8,1>:ud {align1};
+mov (8) vme_msg_7.0<1>:ud vme_wb4.0<8,8,1>:ud {align1};
+
+mov (1) vme_m0.12<1>:UD INTER_SAD_HAAR + SUB_PEL_MODE_QUARTER + FBR_BME_DISABLE:UD {align1}; /* 16x16 Source, 1/4 pixel, harr, BME disable */
+mov (8) vme_msg_0.0<1>:UD vme_m0.0<8,8,1>:UD {align1};
+mov (8) vme_msg_1.0<1>:UD vme_m1.0<8,8,1>:UD {align1};
+
+mov (8) vme_msg_2.0<1>:UD vme_m2.0<8,8,1>:UD {align1};
+mov (8) vme_msg_3.0<1>:UD vme_m3.0<8,8,1>:UD {align1};
+
+/* after verification this will be passed via the payload */
+send (8)
+ vme_msg_ind
+ vme_wb<1>:UD
+ null
+ cre(
+ BIND_IDX_VME,
+ VME_FBR_MESSAGE_TYPE
+ )
+ mlen fbr_vme_msg_length
+ rlen vme_wb_length
+ {align1};
+
+add (1) obw_m0.8<1>:UD obw_m0.8<0,1,0>:UD 0x02:UD {align1};
+mov (8) msg_reg0.0<1>:UD obw_m0<8,8,1>:UD {align1};
+/* write FME info */
+mov (1) msg_reg1.0<1>:UD vme_wb.0<0,1,0>:UD {align1};
+
+mov (1) msg_reg1.4<1>:UD vme_wb.24<0,1,0>:UD {align1};
+/* Inter distortion of FME */
+mov (1) msg_reg1.8<1>:UD vme_wb.8<0,1,0>:UD {align1};
+
+mov (1) msg_reg1.12<1>:UD vme_m2.20<0,1,0>:UD {align1};
+
+/* bind index 3, write oword (16bytes), msg type: 8(OWord Block Write) */
+send (16)
+ msg_ind
+ obw_wb
+ null
+ data_port(
+ OBW_CACHE_TYPE,
+ OBW_MESSAGE_TYPE,
+ OBW_CONTROL_0,
+ OBW_BIND_IDX,
+ OBW_WRITE_COMMIT_CATEGORY,
+ OBW_HEADER_PRESENT
+ )
+ mlen 2
+ rlen obw_wb_length
+ {align1};
+
+/* Write FME/BME MV */
+add (1) obw_m0.8<1>:UD obw_m0.8<0,1,0>:UD 0x01:UD {align1};
+mov (8) msg_reg0.0<1>:UD obw_m0.0<8,8,1>:UD {align1};
+
+
+mov (8) msg_reg1.0<1>:UD vme_wb1.0<8,8,1>:UD {align1};
+mov (8) msg_reg2.0<1>:ud vme_wb2.0<8,8,1>:ud {align1};
+mov (8) msg_reg3.0<1>:ud vme_wb3.0<8,8,1>:ud {align1};
+mov (8) msg_reg4.0<1>:ud vme_wb4.0<8,8,1>:ud {align1};
+/* bind index 3, write 8 oword (128 bytes), msg type: 8(OWord Block Write) */
+send (16)
+ msg_ind
+ obw_wb
+ null
+ data_port(
+ OBW_CACHE_TYPE,
+ OBW_MESSAGE_TYPE,
+ OBW_CONTROL_8,
+ OBW_BIND_IDX,
+ OBW_WRITE_COMMIT_CATEGORY,
+ OBW_HEADER_PRESENT
+ )
+ mlen 5
+ rlen obw_wb_length
+ {align1};
+
+/* Write FME/BME RefID */
+add (1) obw_m0.8<1>:UD obw_m0.8<0,1,0>:UD 0x08:UD {align1};
+mov (8) msg_reg0.0<1>:UD obw_m0<8,8,1>:UD {align1};
+
+mov (8) msg_reg1.0<1>:UD vme_wb6.0<8,8,1>:UD {align1};
+
+/* bind index 3, write 2 oword (32bytes), msg type: 8(OWord Block Write) */
+send (16)
+ msg_ind
+ obw_wb
+ null
+ data_port(
+ OBW_CACHE_TYPE,
+ OBW_MESSAGE_TYPE,
+ OBW_CONTROL_2,
+ OBW_BIND_IDX,
+ OBW_WRITE_COMMIT_CATEGORY,
+ OBW_HEADER_PRESENT
+ )
+ mlen 2
+ rlen obw_wb_length
+ {align1};
+
+/* Issue message fence so that the previous write message is committed */
+send (16)
+ mb_ind
+ mb_wb.0<1>:ud
+ NULL
+ data_port(
+ OBR_CACHE_TYPE,
+ OBR_MESSAGE_FENCE,
+ OBR_MF_COMMIT,
+ OBR_BIND_IDX,
+ OBR_WRITE_COMMIT_CATEGORY,
+ OBR_HEADER_PRESENT
+ )
+ mlen 1
+ rlen 1
+ {align1};
+
+__EXIT:
+/*
+ * kill thread
+ */
+mov (8) ts_msg_reg0<1>:UD r0<8,8,1>:UD {align1};
+send (16) ts_msg_ind acc0<1>UW null thread_spawner(0, 0, 1) mlen 1 rlen 0 {align1 EOT};
+
+
+ nop ;
+ nop ;
+/* Compare three word values to get the minimum */
+word_imin:
+ cmp.le.f0.0 (1) null:w INPUT_ARG0.0<0,1,0>:w INPUT_ARG0.4<0,1,0>:w {align1};
+ (f0.0) mov (1) TEMP_VAR0.0<1>:w INPUT_ARG0.0<0,1,0>:w {align1};
+ (-f0.0) mov (1) TEMP_VAR0.0<1>:w INPUT_ARG0.4<0,1,0>:w {align1};
+ cmp.le.f0.0 (1) null:w TEMP_VAR0.0<0,1,0>:w INPUT_ARG0.8<0,1,0>:w {align1};
+ (f0.0) mov (1) RET_ARG<1>:w TEMP_VAR0.0<0,1,0>:w {align1};
+ (-f0.0) mov (1) RET_ARG<1>:w INPUT_ARG0.8<0,1,0>:w {align1};
+ RETURN {align1};
+
+/* Compare three word values to get the maximum */
+word_imax:
+ cmp.ge.f0.0 (1) null:w INPUT_ARG0.0<0,1,0>:w INPUT_ARG0.4<0,1,0>:w {align1};
+ (f0.0) mov (1) TEMP_VAR0.0<1>:w INPUT_ARG0.0<0,1,0>:w {align1};
+ (-f0.0) mov (1) TEMP_VAR0.0<1>:w INPUT_ARG0.4<0,1,0>:w {align1};
+ cmp.ge.f0.0 (1) null:w TEMP_VAR0.0<0,1,0>:w INPUT_ARG0.8<0,1,0>:w {align1};
+ (f0.0) mov (1) RET_ARG<1>:w TEMP_VAR0.0<0,1,0>:w {align1};
+ (-f0.0) mov (1) RET_ARG<1>:w INPUT_ARG0.8<0,1,0>:w {align1};
+ RETURN {align1};
+
+word_imedian:
+ cmp.ge.f0.0 (1) null:w INPUT_ARG0.0<0,1,0>:w INPUT_ARG0.4<0,1,0>:w {align1};
+ (f0.0) jmpi (1) cmp_a_ge_b;
+ cmp.ge.f0.0 (1) null:w INPUT_ARG0.0<0,1,0>:w INPUT_ARG0.8<0,1,0>:w {align1};
+ (f0.0) mov (1) RET_ARG<1>:w INPUT_ARG0.0<0,1,0>:w {align1};
+ (f0.0) jmpi (1) cmp_end;
+ cmp.ge.f0.0 (1) null:w INPUT_ARG0.4<0,1,0>:w INPUT_ARG0.8<0,1,0>:w {align1};
+ (f0.0) mov (1) RET_ARG<1>:w INPUT_ARG0.8<0,1,0>:w {align1};
+ (-f0.0) mov (1) RET_ARG<1>:w INPUT_ARG0.4<0,1,0>:w {align1};
+ jmpi (1) cmp_end;
+cmp_a_ge_b:
+ cmp.ge.f0.0 (1) null:w INPUT_ARG0.4<0,1,0>:w INPUT_ARG0.8<0,1,0>:w {align1};
+ (f0.0) mov (1) RET_ARG<1>:w INPUT_ARG0.4<0,1,0>:w {align1};
+ (f0.0) jmpi (1) cmp_end;
+ cmp.ge.f0.0 (1) null:w INPUT_ARG0.0<0,1,0>:w INPUT_ARG0.8<0,1,0>:w {align1};
+ (f0.0) mov (1) RET_ARG<1>:w INPUT_ARG0.8<0,1,0>:w {align1};
+ (-f0.0) mov (1) RET_ARG<1>:w INPUT_ARG0.0<0,1,0>:w {align1};
+cmp_end:
+ RETURN {align1};
+
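For reference, the MV-predictor selection implemented above by the mba_start/mbb_start/mbc_start blocks feeding mb_mvp_start, together with the word_imedian helper, boils down to the following C sketch. The names and the ref-mismatch convention (-1 when a neighbor is missing or intra-coded) are only a reading of the assembly, not driver API.

struct neighbor {
    int   available;     /* mb?_result.0 */
    short ref_mismatch;  /* mb?_result.20, -1 when unusable */
    short mv_x, mv_y;    /* mb?_result.4 / mb?_result.6 */
};

static short median3(short a, short b, short c)
{
    if (a >= b) {
        if (b >= c)
            return b;                /* a >= b >= c */
        return (a >= c) ? c : a;     /* a >= b, b < c */
    }
    if (a >= c)
        return a;                    /* b > a >= c */
    return (b >= c) ? c : b;         /* a < b, a < c */
}

static void pick_mvp(const struct neighbor *a, const struct neighbor *b,
                     const struct neighbor *c, short mvp[2])
{
    const struct neighbor *n[3], *only = NULL;
    int i, matches = 0;

    n[0] = a; n[1] = b; n[2] = c;
    mvp[0] = mvp[1] = 0;

    /* Neither MB B nor MB C available: fall back to MB A's MV (or zero). */
    if (!b->available && !c->available) {
        if (a->available) {
            mvp[0] = a->mv_x;
            mvp[1] = a->mv_y;
        }
        return;
    }

    /* Exactly one neighbor shares the current reference: use its MV. */
    for (i = 0; i < 3; i++) {
        if (n[i]->ref_mismatch == 0) {
            matches++;
            only = n[i];
        }
    }
    if (matches == 1) {
        mvp[0] = only->mv_x;
        mvp[1] = only->mv_y;
        return;
    }

    /* Otherwise take the component-wise median of the three MVs. */
    mvp[0] = median3(a->mv_x, b->mv_x, c->mv_x);
    mvp[1] = median3(a->mv_y, b->mv_y, c->mv_y);
}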
diff --git a/src/shaders/vme/vp8_inter_frame_gen8.g8a b/src/shaders/vme/vp8_inter_frame_gen8.g8a
new file mode 100644
index 0000000..3b72c1c
--- /dev/null
+++ b/src/shaders/vme/vp8_inter_frame_gen8.g8a
@@ -0,0 +1,2 @@
+#include "vme8.inc"
+#include "vp8_inter_frame_gen8.asm"
diff --git a/src/shaders/vme/vp8_inter_frame_gen8.g8b b/src/shaders/vme/vp8_inter_frame_gen8.g8b
new file mode 100644
index 0000000..6377aae
--- /dev/null
+++ b/src/shaders/vme/vp8_inter_frame_gen8.g8b
@@ -0,0 +1,299 @@
+ { 0x00800001, 0x24000608, 0x00000000, 0x00000000 },
+ { 0x00800001, 0x24400608, 0x00000000, 0x00000000 },
+ { 0x00800001, 0x24800608, 0x00000000, 0x00000000 },
+ { 0x00800001, 0x24c00608, 0x00000000, 0x00000000 },
+ { 0x00200009, 0x24002228, 0x164500a0, 0x00040004 },
+ { 0x00000040, 0x24000a28, 0x1e000400, 0xfff8fff8 },
+ { 0x00000040, 0x24040a28, 0x1e000404, 0xffffffff },
+ { 0x00000001, 0x24080e08, 0x08000000, 0x0000001f },
+ { 0x00000001, 0x24142288, 0x00000014, 0x00000000 },
+ { 0x00200009, 0x24202228, 0x164500a0, 0x00040004 },
+ { 0x00000040, 0x24200a28, 0x1e000420, 0xfffcfffc },
+ { 0x00000001, 0x24280e08, 0x08000000, 0x000f0003 },
+ { 0x00000001, 0x24342288, 0x00000014, 0x00000000 },
+ { 0x00200009, 0x24482248, 0x164500a0, 0x00040004 },
+ { 0x00000001, 0x24542288, 0x00000014, 0x00000000 },
+ { 0x00000041, 0x24881208, 0x220000a2, 0x000000a1 },
+ { 0x00000040, 0x24880208, 0x22000488, 0x000000a0 },
+ { 0x00000041, 0x24880208, 0x06000488, 0x00000018 },
+ { 0x00000001, 0x24942288, 0x00000014, 0x00000000 },
+ { 0x00600001, 0x28000208, 0x008d0400, 0x00000000 },
+ { 0x04600031, 0x23800a88, 0x0e000800, 0x02190004 },
+ { 0x00600001, 0x28000208, 0x008d0420, 0x00000000 },
+ { 0x04600031, 0x23a00a88, 0x0e000800, 0x02290004 },
+ { 0x00200009, 0x24002228, 0x164500a0, 0x00030003 },
+ { 0x00000041, 0x24000a28, 0x1e000400, 0x00020002 },
+ { 0x00000040, 0x24000a28, 0x1e000400, 0xfff8fff8 },
+ { 0x00000040, 0x24040a28, 0x1e000404, 0xffffffff },
+ { 0x00600001, 0x28000208, 0x008d0400, 0x00000000 },
+ { 0x04600031, 0x26000a88, 0x0e000800, 0x02190006 },
+ { 0x00200009, 0x24202228, 0x164500a0, 0x00030003 },
+ { 0x00000041, 0x24200a28, 0x1e000420, 0x00020002 },
+ { 0x00000040, 0x24200a28, 0x1e000420, 0xfffcfffc },
+ { 0x00000001, 0x24280e08, 0x08000000, 0x00070003 },
+ { 0x00600001, 0x28000208, 0x008d0420, 0x00000000 },
+ { 0x04600031, 0x26200a88, 0x0e000800, 0x02190006 },
+ { 0x00600001, 0x2ac00608, 0x00000000, 0x00000000 },
+ { 0x00600001, 0x2a800608, 0x00000000, 0x00000000 },
+ { 0x01000005, 0x20001240, 0x160000a6, 0x00040004 },
+ { 0x00010020, 0x34000000, 0x0e001400, 0x00000750 },
+ { 0x00600001, 0x2ae00608, 0x00000000, 0x00000000 },
+ { 0x00600001, 0x2b000608, 0x00000000, 0x00000000 },
+ { 0x00600001, 0x2b200608, 0x00000000, 0x00000000 },
+ { 0x00600001, 0x2b400608, 0x00000000, 0x00000000 },
+ { 0x01000005, 0x20002240, 0x160000a5, 0x00600060 },
+ { 0x00210001, 0x2af41e68, 0x18000000, 0xffffffff },
+ { 0x00010020, 0x34000000, 0x0e001400, 0x000000f0 },
+ { 0x00000001, 0x2ae00e28, 0x08000000, 0x00000001 },
+ { 0x00200001, 0x24002248, 0x004500a0, 0x00000000 },
+ { 0x00000040, 0x24001a68, 0x1e000400, 0xffffffff },
+ { 0x00000041, 0x2b481208, 0x120000a2, 0x00000402 },
+ { 0x00000040, 0x2b480208, 0x12000b48, 0x00000400 },
+ { 0x00000041, 0x2b480208, 0x06000b48, 0x00000018 },
+ { 0x00000001, 0x2b542288, 0x00000014, 0x00000000 },
+ { 0x0a800031, 0x2b600a08, 0x0e000b40, 0x02280303 },
+ { 0x05000010, 0x20001260, 0x12000b70, 0x00000b88 },
+ { 0x00210001, 0x2af41e68, 0x18000000, 0xffffffff },
+ { 0x00010020, 0x34000000, 0x0e001400, 0x00000040 },
+ { 0x00000040, 0x2b480208, 0x06000b48, 0x00000003 },
+ { 0x0a800031, 0x2ba00a08, 0x0e000b40, 0x02480403 },
+ { 0x00200001, 0x2ae40208, 0x00450bc8, 0x00000000 },
+ { 0x00000001, 0x2af01e68, 0x18000000, 0x00010001 },
+ { 0x00600001, 0x2b400608, 0x00000000, 0x00000000 },
+ { 0x01000005, 0x20002240, 0x160000a5, 0x00100010 },
+ { 0x00210001, 0x2b141e68, 0x18000000, 0xffffffff },
+ { 0x00210001, 0x2b341e68, 0x18000000, 0xffffffff },
+ { 0x00010020, 0x34000000, 0x0e001400, 0x00000360 },
+ { 0x00000001, 0x2b000e28, 0x08000000, 0x00000001 },
+ { 0x00200001, 0x24002248, 0x004500a0, 0x00000000 },
+ { 0x00000040, 0x24021a68, 0x1e000402, 0xffffffff },
+ { 0x00000041, 0x2b481208, 0x120000a2, 0x00000402 },
+ { 0x00000040, 0x2b480208, 0x12000b48, 0x00000400 },
+ { 0x00000041, 0x2b480208, 0x06000b48, 0x00000018 },
+ { 0x00000001, 0x2b542288, 0x00000014, 0x00000000 },
+ { 0x0a800031, 0x2b600a08, 0x0e000b40, 0x02280303 },
+ { 0x05000010, 0x20001260, 0x12000b70, 0x00000b88 },
+ { 0x00210001, 0x2b141e68, 0x18000000, 0xffffffff },
+ { 0x00010020, 0x34000000, 0x0e001400, 0x00000040 },
+ { 0x00000040, 0x2b480208, 0x06000b48, 0x00000003 },
+ { 0x0a800031, 0x2ba00a08, 0x0e000b40, 0x02480403 },
+ { 0x00200001, 0x2b040208, 0x00450bf0, 0x00000000 },
+ { 0x00000001, 0x2b101e68, 0x18000000, 0x00010001 },
+ { 0x00600001, 0x2b400608, 0x00000000, 0x00000000 },
+ { 0x01000005, 0x20002240, 0x160000a5, 0x00080008 },
+ { 0x00010020, 0x34000000, 0x0e001400, 0x00000110 },
+ { 0x00000001, 0x2b200e28, 0x08000000, 0x00000001 },
+ { 0x00200001, 0x24002248, 0x004500a0, 0x00000000 },
+ { 0x00000040, 0x24021a68, 0x1e000402, 0xffffffff },
+ { 0x00000040, 0x24001a68, 0x1e000400, 0x00010001 },
+ { 0x00000041, 0x2b481208, 0x120000a2, 0x00000402 },
+ { 0x00000040, 0x2b480208, 0x12000b48, 0x00000400 },
+ { 0x00000041, 0x2b480208, 0x06000b48, 0x00000018 },
+ { 0x00000001, 0x2b542288, 0x00000014, 0x00000000 },
+ { 0x0a800031, 0x2b600a08, 0x0e000b40, 0x02280303 },
+ { 0x05000010, 0x20001260, 0x12000b70, 0x00000b88 },
+ { 0x00210001, 0x2b341e68, 0x18000000, 0xffffffff },
+ { 0x00010020, 0x34000000, 0x0e001400, 0x00000180 },
+ { 0x00000040, 0x2b480208, 0x06000b48, 0x00000003 },
+ { 0x0a800031, 0x2ba00a08, 0x0e000b40, 0x02480403 },
+ { 0x00200001, 0x2b240208, 0x00450bf0, 0x00000000 },
+ { 0x00000001, 0x2b301e68, 0x18000000, 0x00010001 },
+ { 0x00000020, 0x34000000, 0x0e001400, 0x00000130 },
+ { 0x00600001, 0x2b400608, 0x00000000, 0x00000000 },
+ { 0x01000005, 0x20002240, 0x160000a5, 0x00040004 },
+ { 0x00210001, 0x2b341e68, 0x18000000, 0xffffffff },
+ { 0x00010020, 0x34000000, 0x0e001400, 0x000000f0 },
+ { 0x00000001, 0x2b200e28, 0x08000000, 0x00000001 },
+ { 0x00200001, 0x24002248, 0x004500a0, 0x00000000 },
+ { 0x00200040, 0x24001a68, 0x1e450400, 0xffffffff },
+ { 0x00000041, 0x2b481208, 0x120000a2, 0x00000402 },
+ { 0x00000040, 0x2b480208, 0x12000b48, 0x00000400 },
+ { 0x00000041, 0x2b480208, 0x06000b48, 0x00000018 },
+ { 0x00000001, 0x2b542288, 0x00000014, 0x00000000 },
+ { 0x0a800031, 0x2b600a08, 0x0e000b40, 0x02280303 },
+ { 0x05000010, 0x20001260, 0x12000b70, 0x00000b88 },
+ { 0x00210001, 0x2b341e68, 0x18000000, 0xffffffff },
+ { 0x00010020, 0x34000000, 0x0e001400, 0x00000040 },
+ { 0x00000040, 0x2b480208, 0x06000b48, 0x00000003 },
+ { 0x0a800031, 0x2ba00a88, 0x0e000b40, 0x02480403 },
+ { 0x00200001, 0x2b240208, 0x00450c18, 0x00000000 },
+ { 0x00000001, 0x2b301e68, 0x18000000, 0x00010001 },
+ { 0x00000040, 0x24000a28, 0x0a000b00, 0x00000b20 },
+ { 0x01000010, 0x20000a20, 0x0e000400, 0x00000000 },
+ { 0x00110020, 0x34000000, 0x0e001400, 0x00000080 },
+ { 0x02000010, 0x20000a20, 0x0e000ae0, 0x00000000 },
+ { 0x00010001, 0x2b040208, 0x00000ae4, 0x00000000 },
+ { 0x00010001, 0x2b240208, 0x00000ae4, 0x00000000 },
+ { 0x00010001, 0x2b141248, 0x00000af4, 0x00000000 },
+ { 0x00010001, 0x2b341248, 0x00000af4, 0x00000000 },
+ { 0x00010001, 0x2ac00208, 0x00000ae4, 0x00000000 },
+ { 0x00110001, 0x2ac00608, 0x00000000, 0x00000000 },
+ { 0x00000020, 0x34000000, 0x0e001400, 0x00000190 },
+ { 0x00600001, 0x24000608, 0x00000000, 0x00000000 },
+ { 0x01000010, 0x20001a20, 0x1e000af4, 0x00000000 },
+ { 0x00010040, 0x24001a68, 0x1e000400, 0x00010001 },
+ { 0x00010001, 0x24040208, 0x00000ae4, 0x00000000 },
+ { 0x01000010, 0x20001a20, 0x1e000b14, 0x00000000 },
+ { 0x00010040, 0x24001a68, 0x1e000400, 0x00010001 },
+ { 0x00010001, 0x24040208, 0x00000b04, 0x00000000 },
+ { 0x01000010, 0x20001a20, 0x1e000b34, 0x00000000 },
+ { 0x00010040, 0x24001a68, 0x1e000400, 0x00010001 },
+ { 0x00010001, 0x24040208, 0x00000b24, 0x00000000 },
+ { 0x01000010, 0x20001a20, 0x1e000400, 0x00010001 },
+ { 0x00010001, 0x2ac00208, 0x00000404, 0x00000000 },
+ { 0x00010020, 0x34000000, 0x0e001400, 0x000000c0 },
+ { 0x00000001, 0x2fa01a68, 0x00000ae4, 0x00000000 },
+ { 0x00000001, 0x2fa41a68, 0x00000b04, 0x00000000 },
+ { 0x00000001, 0x2fa81a68, 0x00000b24, 0x00000000 },
+ { 0x00000040, 0x2fe00008, 0x06001400, 0x00000020 },
+ { 0x00000020, 0x34000000, 0x0e001400, 0x00000860 },
+ { 0x00000001, 0x2ac01a68, 0x00000fe4, 0x00000000 },
+ { 0x00000001, 0x2fa01a68, 0x00000ae6, 0x00000000 },
+ { 0x00000001, 0x2fa41a68, 0x00000b06, 0x00000000 },
+ { 0x00000001, 0x2fa81a68, 0x00000b26, 0x00000000 },
+ { 0x00000040, 0x2fe00008, 0x06001400, 0x00000020 },
+ { 0x00000020, 0x34000000, 0x0e001400, 0x00000800 },
+ { 0x00000001, 0x2ac21a68, 0x00000fe4, 0x00000000 },
+ { 0x0020000c, 0x2a801a68, 0x1e450ac0, 0x00020002 },
+ { 0x00200040, 0x2a881a68, 0x1e450a80, 0x00030003 },
+ { 0x00200005, 0x2a901248, 0x16450a88, 0xfffcfffc },
+ { 0x00600001, 0x25600208, 0x008d0020, 0x00000000 },
+ { 0x00600001, 0x28400208, 0x008d0560, 0x00000000 },
+ { 0x00600001, 0x28600608, 0x00000000, 0x00000000 },
+ { 0x00600001, 0x28800608, 0x00000000, 0x00000000 },
+ { 0x00000001, 0x23800608, 0x00000000, 0x00000000 },
+ { 0x00000005, 0x23840208, 0x06000384, 0xff000000 },
+ { 0x00600001, 0x28a00208, 0x008d0380, 0x00000000 },
+ { 0x00000001, 0x24001648, 0x10000000, 0x00010001 },
+ { 0x00000001, 0x28a52288, 0x00000400, 0x00000000 },
+ { 0x00600001, 0x28c00608, 0x00000000, 0x00000000 },
+ { 0x00800001, 0x28c02288, 0x00cf03a3, 0x00000000 },
+ { 0x00000001, 0x28d00608, 0x00000000, 0x11111111 },
+ { 0x00000001, 0x28dc0608, 0x00000000, 0x00010101 },
+ { 0x00000001, 0x28d41248, 0x00000606, 0x00000000 },
+ { 0x00400001, 0x28f00208, 0x00690608, 0x00000000 },
+ { 0x00600001, 0x28e01248, 0x00ae0622, 0x00000000 },
+ { 0x00000001, 0x247c1648, 0x10000000, 0x00000000 },
+ { 0x00000001, 0x247c0e88, 0x08000000, 0x00000002 },
+ { 0x00000001, 0x247d2288, 0x000000a5, 0x00000000 },
+ { 0x00000001, 0x24001648, 0x10000000, 0x00200020 },
+ { 0x00000001, 0x247e2288, 0x00000400, 0x00000000 },
+ { 0x00600001, 0x28200208, 0x008d0460, 0x00000000 },
+ { 0x00000001, 0x244c0608, 0x00000000, 0x00800000 },
+ { 0x00600001, 0x28000208, 0x008d0440, 0x00000000 },
+ { 0x0d600031, 0x21800a08, 0x0e000800, 0x10782000 },
+ { 0x00600001, 0x28000208, 0x008d0480, 0x00000000 },
+ { 0x00000001, 0x28200208, 0x00000180, 0x00000000 },
+ { 0x00000001, 0x28240208, 0x00000190, 0x00000000 },
+ { 0x00000001, 0x28280208, 0x00000194, 0x00000000 },
+ { 0x00000001, 0x282c0208, 0x00000198, 0x00000000 },
+ { 0x00000001, 0x28301248, 0x0000018c, 0x00000000 },
+ { 0x00000001, 0x28340208, 0x00000188, 0x00000000 },
+ { 0x00000001, 0x28380208, 0x0000019c, 0x00000000 },
+ { 0x00000001, 0x283c0208, 0x00000488, 0x00000000 },
+ { 0x0a800031, 0x20000a60, 0x0e000800, 0x040a0203 },
+ { 0x00000001, 0x244c0608, 0x00000000, 0x7e200000 },
+ { 0x00000001, 0x24561648, 0x10000000, 0x28302830 },
+ { 0x00000001, 0x24400208, 0x00000448, 0x00000000 },
+ { 0x00000040, 0x24401a68, 0x1e000440, 0xfff0fff0 },
+ { 0x00000040, 0x24421a68, 0x1e000442, 0xfff4fff4 },
+ { 0x00000001, 0x24401e68, 0x18000000, 0xfff0fff0 },
+ { 0x00000001, 0x24421e68, 0x18000000, 0xfff4fff4 },
+ { 0x00000001, 0x24440208, 0x00000440, 0x00000000 },
+ { 0x01000005, 0x20002240, 0x160000a5, 0x00600060 },
+ { 0x00010040, 0x24401a68, 0x1e000440, 0x000c000c },
+ { 0x01000005, 0x20002240, 0x160000a5, 0x00100010 },
+ { 0x00010040, 0x24421a68, 0x1e000442, 0x00080008 },
+ { 0x00200040, 0x24401a68, 0x1a450440, 0x00450a90 },
+ { 0x00200040, 0x24441a68, 0x1a450444, 0x00450a90 },
+ { 0x00600001, 0x28000208, 0x008d0440, 0x00000000 },
+ { 0x00000001, 0x24600608, 0x00000000, 0x00000002 },
+ { 0x00000001, 0x24642288, 0x0000009c, 0x00000000 },
+ { 0x00000001, 0x24680608, 0x00000000, 0x30003030 },
+ { 0x00600001, 0x28200208, 0x008d0460, 0x00000000 },
+ { 0x00400001, 0x45800208, 0x00000ac0, 0x00000000 },
+ { 0x00400001, 0x45840208, 0x00000ac0, 0x00000000 },
+ { 0x00600001, 0x28600208, 0x008d0580, 0x00000000 },
+ { 0x00600001, 0x28400208, 0x008d0560, 0x00000000 },
+ { 0x00000001, 0x28800608, 0x00000000, 0x01010101 },
+ { 0x00000001, 0x28840608, 0x00000000, 0x10010101 },
+ { 0x00000001, 0x28880608, 0x00000000, 0x0f0f0f0f },
+ { 0x00000001, 0x288c0608, 0x00000000, 0x100f0f0f },
+ { 0x00000001, 0x28900608, 0x00000000, 0x01010101 },
+ { 0x00000001, 0x28940608, 0x00000000, 0x10010101 },
+ { 0x00000001, 0x28980608, 0x00000000, 0x0f0f0f0f },
+ { 0x00000001, 0x289c0608, 0x00000000, 0x100f0f0f },
+ { 0x00000001, 0x28a00608, 0x00000000, 0x01010101 },
+ { 0x00000001, 0x28a40608, 0x00000000, 0x10010101 },
+ { 0x00000001, 0x28a80608, 0x00000000, 0x0f0f0f0f },
+ { 0x00000001, 0x28ac0608, 0x00000000, 0x000f0f0f },
+ { 0x00400001, 0x28b00608, 0x00000000, 0x00000000 },
+ { 0x08600031, 0x21800a08, 0x0e000800, 0x0c784000 },
+ { 0x00000001, 0x25740608, 0x00000000, 0x00000000 },
+ { 0x00000001, 0x25752288, 0x00000199, 0x00000000 },
+ { 0x00000001, 0x25762288, 0x0000019a, 0x00000000 },
+ { 0x00000005, 0x24001248, 0x16000180, 0x00030003 },
+ { 0x00000001, 0x25742288, 0x00000400, 0x00000000 },
+ { 0x00600001, 0x28800208, 0x008d01a0, 0x00000000 },
+ { 0x00600001, 0x28a00208, 0x008d01c0, 0x00000000 },
+ { 0x00600001, 0x28c00208, 0x008d01e0, 0x00000000 },
+ { 0x00600001, 0x28e00208, 0x008d0200, 0x00000000 },
+ { 0x00000001, 0x244c0608, 0x00000000, 0x00243000 },
+ { 0x00600001, 0x28000208, 0x008d0440, 0x00000000 },
+ { 0x00600001, 0x28200208, 0x008d0460, 0x00000000 },
+ { 0x00600001, 0x28400208, 0x008d0560, 0x00000000 },
+ { 0x00600001, 0x28600208, 0x008d0580, 0x00000000 },
+ { 0x0d600031, 0x21800a08, 0x0e000800, 0x10786000 },
+ { 0x00000040, 0x24880208, 0x06000488, 0x00000002 },
+ { 0x00600001, 0x28000208, 0x008d0480, 0x00000000 },
+ { 0x00000001, 0x28200208, 0x00000180, 0x00000000 },
+ { 0x00000001, 0x28240208, 0x00000198, 0x00000000 },
+ { 0x00000001, 0x28280208, 0x00000188, 0x00000000 },
+ { 0x00000001, 0x282c0208, 0x00000574, 0x00000000 },
+ { 0x0a800031, 0x20000a60, 0x0e000800, 0x040a0003 },
+ { 0x00000040, 0x24880208, 0x06000488, 0x00000001 },
+ { 0x00600001, 0x28000208, 0x008d0480, 0x00000000 },
+ { 0x00600001, 0x28200208, 0x008d01a0, 0x00000000 },
+ { 0x00600001, 0x28400208, 0x008d01c0, 0x00000000 },
+ { 0x00600001, 0x28600208, 0x008d01e0, 0x00000000 },
+ { 0x00600001, 0x28800208, 0x008d0200, 0x00000000 },
+ { 0x0a800031, 0x20000a60, 0x0e000800, 0x0a0a0403 },
+ { 0x00000040, 0x24880208, 0x06000488, 0x00000008 },
+ { 0x00600001, 0x28000208, 0x008d0480, 0x00000000 },
+ { 0x00600001, 0x28200208, 0x008d0240, 0x00000000 },
+ { 0x0a800031, 0x20000a60, 0x0e000800, 0x040a0203 },
+ { 0x0a800031, 0x2b600a08, 0x0e000b40, 0x0219e003 },
+ { 0x00600001, 0x2e000208, 0x008d0000, 0x00000000 },
+ { 0x07800031, 0x24000a40, 0x0e000e00, 0x82000010 },
+ { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+ { 0x0000007e, 0x00000000, 0x00000000, 0x00000000 },
+ { 0x06000010, 0x20001a60, 0x1a000fa0, 0x00000fa4 },
+ { 0x00010001, 0x2f601a68, 0x00000fa0, 0x00000000 },
+ { 0x00110001, 0x2f601a68, 0x00000fa4, 0x00000000 },
+ { 0x06000010, 0x20001a60, 0x1a000f60, 0x00000fa8 },
+ { 0x00010001, 0x2fe41a68, 0x00000f60, 0x00000000 },
+ { 0x00110001, 0x2fe41a68, 0x00000fa8, 0x00000000 },
+ { 0x00000001, 0x34000200, 0x00000fe0, 0x00000000 },
+ { 0x04000010, 0x20001a60, 0x1a000fa0, 0x00000fa4 },
+ { 0x00010001, 0x2f601a68, 0x00000fa0, 0x00000000 },
+ { 0x00110001, 0x2f601a68, 0x00000fa4, 0x00000000 },
+ { 0x04000010, 0x20001a60, 0x1a000f60, 0x00000fa8 },
+ { 0x00010001, 0x2fe41a68, 0x00000f60, 0x00000000 },
+ { 0x00110001, 0x2fe41a68, 0x00000fa8, 0x00000000 },
+ { 0x00000001, 0x34000200, 0x00000fe0, 0x00000000 },
+ { 0x04000010, 0x20001a60, 0x1a000fa0, 0x00000fa4 },
+ { 0x00010020, 0x34000000, 0x0e001400, 0x00000070 },
+ { 0x04000010, 0x20001a60, 0x1a000fa0, 0x00000fa8 },
+ { 0x00010001, 0x2fe41a68, 0x00000fa0, 0x00000000 },
+ { 0x00010020, 0x34000000, 0x0e001400, 0x000000a0 },
+ { 0x04000010, 0x20001a60, 0x1a000fa4, 0x00000fa8 },
+ { 0x00010001, 0x2fe41a68, 0x00000fa8, 0x00000000 },
+ { 0x00110001, 0x2fe41a68, 0x00000fa4, 0x00000000 },
+ { 0x00000020, 0x34000000, 0x0e001400, 0x00000060 },
+ { 0x04000010, 0x20001a60, 0x1a000fa4, 0x00000fa8 },
+ { 0x00010001, 0x2fe41a68, 0x00000fa4, 0x00000000 },
+ { 0x00010020, 0x34000000, 0x0e001400, 0x00000030 },
+ { 0x04000010, 0x20001a60, 0x1a000fa0, 0x00000fa8 },
+ { 0x00010001, 0x2fe41a68, 0x00000fa8, 0x00000000 },
+ { 0x00110001, 0x2fe41a68, 0x00000fa0, 0x00000000 },
+ { 0x00000001, 0x34000200, 0x00000fe0, 0x00000000 },
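The .g8b file is the assembled form of the .asm kernel, one instruction per row encoded as four DWords, formatted so it can be included textually into an array initializer. A minimal sketch of the usual include pattern (array name and include path here are illustrative, not taken from this patch):

static const unsigned int vp8_inter_frame_kernel_gen8[][4] = {
#include "shaders/vme/vp8_inter_frame_gen8.g8b"
};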
diff --git a/src/shaders/vme/vp8_intra_frame_gen8.asm b/src/shaders/vme/vp8_intra_frame_gen8.asm
new file mode 100644
index 0000000..f1e7891
--- /dev/null
+++ b/src/shaders/vme/vp8_intra_frame_gen8.asm
@@ -0,0 +1,200 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Zhao Yakui <yakui.zhao at intel.com>
+ * Xiang Haihao <haihao.xiang at intel.com>
+ * Li Zhong <zhong.li at intel.com>
+ *
+ */
+
+/*
+ * __START
+ */
+__INTRA_START:
+mov (16) tmp_reg0.0<1>:UD 0x0:UD {align1};
+mov (16) tmp_reg2.0<1>:UD 0x0:UD {align1};
+mov (16) tmp_reg4.0<1>:UD 0x0:UD {align1} ;
+mov (16) tmp_reg6.0<1>:UD 0x0:UD {align1} ;
+
+shl (2) read0_header.0<1>:D orig_xy_ub<2,2,1>:UB 4:UW {align1}; /* (x, y) * 16 */
+add (1) read0_header.0<1>:D read0_header.0<0,1,0>:D -8:W {align1}; /* X offset */
+add (1) read0_header.4<1>:D read0_header.4<0,1,0>:D -1:W {align1}; /* Y offset */
+mov (1) read0_header.8<1>:UD BLOCK_32X1 {align1};
+mov (1) read0_header.20<1>:UB thread_id_ub {align1}; /* dispatch id */
+
+shl (2) read1_header.0<1>:D orig_xy_ub<2,2,1>:UB 4:UW {align1}; /* (x, y) * 16 */
+add (1) read1_header.0<1>:D read1_header.0<0,1,0>:D -4:W {align1}; /* X offset */
+mov (1) read1_header.8<1>:UD BLOCK_4X16 {align1};
+mov (1) read1_header.20<1>:UB thread_id_ub {align1}; /* dispatch id */
+
+shl (2) vme_m0.8<1>:UW orig_xy_ub<2,2,1>:UB 4:UW {align1}; /* (x, y) * 16 */
+mov (1) vme_m0.20<1>:UB thread_id_ub {align1}; /* dispatch id */
+
+mul (1) obw_m0.8<1>:UD w_in_mb_uw<0,1,0>:UW orig_y_ub<0,1,0>:UB {align1};
+add (1) obw_m0.8<1>:UD obw_m0.8<0,1,0>:UD orig_x_ub<0,1,0>:UB {align1};
+mul (1) obw_m0.8<1>:UD obw_m0.8<0,1,0>:UD 0x02:UD {align1};
+mov (1) obw_m0.20<1>:UB thread_id_ub {align1}; /* dispatch id */
+
+/*
+ * Media Read Message -- fetch Luma neighbor edge pixels
+ */
+/* ROW */
+mov (8) msg_reg0.0<1>:UD read0_header.0<8,8,1>:UD {align1};
+send (8) msg_ind INEP_ROW<1>:UB null read(BIND_IDX_INEP, 0, 0, 4) mlen 1 rlen 1 {align1};
+
+/* COL */
+mov (8) msg_reg0.0<1>:UD read1_header.0<8,8,1>:UD {align1};
+send (8) msg_ind INEP_COL0<1>:UB null read(BIND_IDX_INEP, 0, 0, 4) mlen 1 rlen 2 {align1};
+
+/*
+ * Media Read Message -- fetch Chroma neighbor edge pixels
+ */
+/* ROW */
+shl (2) read0_header.0<1>:D orig_xy_ub<2,2,1>:UB 3:UW {align1}; /* x * 16 , y * 8 */
+mul (1) read0_header.0<1>:D read0_header.0<0,1,0>:D 2:W {align1};
+add (1) read0_header.0<1>:D read0_header.0<0,1,0>:D -8:W {align1}; /* X offset */
+add (1) read0_header.4<1>:D read0_header.4<0,1,0>:D -1:W {align1}; /* Y offset */
+mov (8) msg_reg0.0<1>:UD read0_header.0<8,8,1>:UD {align1};
+send (8) msg_ind CHROMA_ROW<1>:UB null read(BIND_IDX_CBCR, 0, 0, 4) mlen 1 rlen 1 {align1};
+
+/* COL */
+shl (2) read1_header.0<1>:D orig_xy_ub<2,2,1>:UB 3:UW {align1}; /* x * 16, y * 8 */
+mul (1) read1_header.0<1>:D read1_header.0<0,1,0>:D 2:W {align1};
+add (1) read1_header.0<1>:D read1_header.0<0,1,0>:D -4:W {align1}; /* X offset */
+mov (1) read1_header.8<1>:UD BLOCK_8X4 {align1};
+mov (8) msg_reg0.0<1>:UD read1_header.0<8,8,1>:UD {align1};
+send (8) msg_ind CHROMA_COL<1>:UB null read(BIND_IDX_CBCR, 0, 0, 4) mlen 1 rlen 1 {align1};
+
+/* m2: get the MV/MB cost passed in the constant buffer when the
+ * EU thread is created by MEDIA_OBJECT */
+mov (8) vme_msg_2<1>:UD r1.0<8,8,1>:UD {align1};
+
+/* m3: FWD/BWD cost center */
+mov (8) vme_msg_3<1>:UD 0x0:UD {align1};
+
+/* m4.*/
+mov (8) vme_msg_4<1>:ud 0x0:ud {align1};
+
+/* m5 */
+mov (1) INEP_ROW.0<1>:UD 0x0:UD {align1};
+and (1) INEP_ROW.4<1>:UD INEP_ROW.4<0,1,0>:UD 0xFF000000:UD {align1};
+mov (8) vme_msg_5<1>:UD INEP_ROW.0<8,8,1>:UD {align1};
+
+mov (1) tmp_reg0.0<1>:UB INTRA_PLANAR_MODE_MASK {align1}; /* VP8 doesn't support the planar intra mode */
+mov (1) tmp_reg0.1<1>:UB LUMA_CHROMA_MODE {align1}; /* Intra type: Luma + Chroma */
+
+/* Intra mode mask && Intra compute type */
+mov (1) vme_msg_5.4<1>:UW tmp_reg0.0<0,1,0>:UW {align1};
+
+/* m6 */
+mov (8) vme_msg_6<1>:UD 0x0:UD {align1};
+mov (16) vme_msg_6.0<1>:UB INEP_COL0.3<32,8,4>:UB {align1};
+mov (1) vme_msg_6.16<1>:UD INTRA_PREDICTORE_MODE {align1};
+
+/* the penalty for Intra mode */
+mov (1) vme_msg_6.28<1>:UD 0x010101:UD {align1};
+mov (1) vme_msg_6.20<1>:UW CHROMA_ROW.6<0,1,0>:UW {align1};
+
+
+/* m7 */
+
+mov (4) vme_msg_7.16<1>:UD CHROMA_ROW.8<4,4,1>:UD {align1};
+mov (8) vme_msg_7.0<1>:UW CHROMA_COL.2<16,8,2>:UW {align1};
+
+/*
+ * VME message
+ */
+
+/* m1 */
+mov (1) intra_flag<1>:UW 0x0:UW {align1};
+mov (1) intra_part_mask_ub<1>:UB LUMA_INTRA_8x8_DISABLE {align1}; /* VP8 doesn't support the intra 8x8 mode */
+
+/* assign MB intra struct from the thread payload*/
+mov (1) mb_intra_struct_ub<1>:UB input_mb_intra_ub<0,1,0>:UB {align1};
+
+/* Disable the DC Haar component when calculating the Haar SATD block */
+mov (1) tmp_reg0.0<1>:UW DC_HARR_DISABLE:UW {align1};
+mov (1) vme_m1.30<1>:UB tmp_reg0.0<0,1,0>:UB {align1};
+
+mov (8) vme_msg_1<1>:UD vme_m1.0<8,8,1>:UD {align1};
+
+/* m0 */
+add (1) vme_m0.12<1>:UD vme_m0.12<0,1,0>:ud INTRA_SAD_HAAR:UD {align1};/* 16x16 Source, Intra_harr */
+mov (1) vme_m0.15<1>:UB SUB_PART_8x4_DISABLE + SUB_PART_4x8_DISABLE {align1}; /* VP8 doesn't support 8x4 and 4x8 partitions */
+mov (8) vme_msg_0<1>:UD vme_m0.0<8,8,1>:UD {align1};
+
+/* after verification this will be passed via the payload */
+send (8)
+ vme_msg_ind
+ vme_wb<1>:UD
+ null
+ cre(
+ BIND_IDX_VME,
+ VME_SIC_MESSAGE_TYPE
+ )
+ mlen sic_vme_msg_length
+ rlen vme_wb_length
+ {align1};
+/*
+ * Oword Block Write message
+ */
+mov (8) msg_reg0.0<1>:UD obw_m0<8,8,1>:UD {align1};
+
+mov (1) msg_reg1.0<1>:UD vme_wb.0<0,1,0>:UD {align1};
+mov (1) msg_reg1.4<1>:UD vme_wb.16<0,1,0>:UD {align1};
+mov (1) msg_reg1.8<1>:UD vme_wb.20<0,1,0>:UD {align1};
+mov (1) msg_reg1.12<1>:UD vme_wb.24<0,1,0>:UD {align1};
+
+/* Distortion, Intra (17-16), */
+mov (1) msg_reg1.16<1>:UW vme_wb.12<0,1,0>:UW {align1};
+
+mov (1) msg_reg1.20<1>:UD vme_wb.8<0,1,0>:UD {align1};
+/* VME clock counts */
+mov (1) msg_reg1.24<1>:UD vme_wb.28<0,1,0>:UD {align1};
+
+mov (1) msg_reg1.28<1>:UD obw_m0.8<0,1,0>:UD {align1};
+
+/* bind index 3, write 2 oword (32bytes), msg type: 8(OWord Block Write) */
+send (16)
+ msg_ind
+ obw_wb
+ null
+ data_port(
+ OBW_CACHE_TYPE,
+ OBW_MESSAGE_TYPE,
+ OBW_CONTROL_2,
+ OBW_BIND_IDX,
+ OBW_WRITE_COMMIT_CATEGORY,
+ OBW_HEADER_PRESENT
+ )
+ mlen 2
+ rlen obw_wb_length
+ {align1};
+
+__EXIT:
+/*
+ * kill thread
+ */
+mov (8) ts_msg_reg0<1>:UD r0<8,8,1>:UD {align1};
+send (16) ts_msg_ind acc0<1>UW null thread_spawner(0, 0, 1) mlen 1 rlen 0 {align1 EOT};
diff --git a/src/shaders/vme/vp8_intra_frame_gen8.g8a b/src/shaders/vme/vp8_intra_frame_gen8.g8a
new file mode 100644
index 0000000..a445b1e
--- /dev/null
+++ b/src/shaders/vme/vp8_intra_frame_gen8.g8a
@@ -0,0 +1,2 @@
+#include "vme8.inc"
+#include "vp8_intra_frame_gen8.asm"
diff --git a/src/shaders/vme/vp8_intra_frame_gen8.g8b b/src/shaders/vme/vp8_intra_frame_gen8.g8b
new file mode 100644
index 0000000..4dca617
--- /dev/null
+++ b/src/shaders/vme/vp8_intra_frame_gen8.g8b
@@ -0,0 +1,73 @@
+ { 0x00800001, 0x24000608, 0x00000000, 0x00000000 },
+ { 0x00800001, 0x24400608, 0x00000000, 0x00000000 },
+ { 0x00800001, 0x24800608, 0x00000000, 0x00000000 },
+ { 0x00800001, 0x24c00608, 0x00000000, 0x00000000 },
+ { 0x00200009, 0x24002228, 0x164500a0, 0x00040004 },
+ { 0x00000040, 0x24000a28, 0x1e000400, 0xfff8fff8 },
+ { 0x00000040, 0x24040a28, 0x1e000404, 0xffffffff },
+ { 0x00000001, 0x24080e08, 0x08000000, 0x0000001f },
+ { 0x00000001, 0x24142288, 0x00000014, 0x00000000 },
+ { 0x00200009, 0x24202228, 0x164500a0, 0x00040004 },
+ { 0x00000040, 0x24200a28, 0x1e000420, 0xfffcfffc },
+ { 0x00000001, 0x24280e08, 0x08000000, 0x000f0003 },
+ { 0x00000001, 0x24342288, 0x00000014, 0x00000000 },
+ { 0x00200009, 0x24482248, 0x164500a0, 0x00040004 },
+ { 0x00000001, 0x24542288, 0x00000014, 0x00000000 },
+ { 0x00000041, 0x24881208, 0x220000a2, 0x000000a1 },
+ { 0x00000040, 0x24880208, 0x22000488, 0x000000a0 },
+ { 0x00000041, 0x24880208, 0x06000488, 0x00000002 },
+ { 0x00000001, 0x24942288, 0x00000014, 0x00000000 },
+ { 0x00600001, 0x28000208, 0x008d0400, 0x00000000 },
+ { 0x04600031, 0x23800a88, 0x0e000800, 0x02190004 },
+ { 0x00600001, 0x28000208, 0x008d0420, 0x00000000 },
+ { 0x04600031, 0x23a00a88, 0x0e000800, 0x02290004 },
+ { 0x00200009, 0x24002228, 0x164500a0, 0x00030003 },
+ { 0x00000041, 0x24000a28, 0x1e000400, 0x00020002 },
+ { 0x00000040, 0x24000a28, 0x1e000400, 0xfff8fff8 },
+ { 0x00000040, 0x24040a28, 0x1e000404, 0xffffffff },
+ { 0x00600001, 0x28000208, 0x008d0400, 0x00000000 },
+ { 0x04600031, 0x26000a88, 0x0e000800, 0x02190006 },
+ { 0x00200009, 0x24202228, 0x164500a0, 0x00030003 },
+ { 0x00000041, 0x24200a28, 0x1e000420, 0x00020002 },
+ { 0x00000040, 0x24200a28, 0x1e000420, 0xfffcfffc },
+ { 0x00000001, 0x24280e08, 0x08000000, 0x00070003 },
+ { 0x00600001, 0x28000208, 0x008d0420, 0x00000000 },
+ { 0x04600031, 0x26200a88, 0x0e000800, 0x02190006 },
+ { 0x00600001, 0x28400208, 0x008d0020, 0x00000000 },
+ { 0x00600001, 0x28600608, 0x00000000, 0x00000000 },
+ { 0x00600001, 0x28800608, 0x00000000, 0x00000000 },
+ { 0x00000001, 0x23800608, 0x00000000, 0x00000000 },
+ { 0x00000005, 0x23840208, 0x06000384, 0xff000000 },
+ { 0x00600001, 0x28a00208, 0x008d0380, 0x00000000 },
+ { 0x00000001, 0x24000688, 0x00000000, 0x10001000 },
+ { 0x00000001, 0x24010e88, 0x08000000, 0x00000000 },
+ { 0x00000001, 0x28a41248, 0x00000400, 0x00000000 },
+ { 0x00600001, 0x28c00608, 0x00000000, 0x00000000 },
+ { 0x00800001, 0x28c02288, 0x00cf03a3, 0x00000000 },
+ { 0x00000001, 0x28d00608, 0x00000000, 0x11111111 },
+ { 0x00000001, 0x28dc0608, 0x00000000, 0x00010101 },
+ { 0x00000001, 0x28d41248, 0x00000606, 0x00000000 },
+ { 0x00400001, 0x28f00208, 0x00690608, 0x00000000 },
+ { 0x00600001, 0x28e01248, 0x00ae0622, 0x00000000 },
+ { 0x00000001, 0x247c1648, 0x10000000, 0x00000000 },
+ { 0x00000001, 0x247c0e88, 0x08000000, 0x00000002 },
+ { 0x00000001, 0x247d2288, 0x000000a5, 0x00000000 },
+ { 0x00000001, 0x24001648, 0x10000000, 0x00200020 },
+ { 0x00000001, 0x247e2288, 0x00000400, 0x00000000 },
+ { 0x00600001, 0x28200208, 0x008d0460, 0x00000000 },
+ { 0x00000040, 0x244c0208, 0x0600044c, 0x00800000 },
+ { 0x00000001, 0x244f0e88, 0x08000000, 0x00000030 },
+ { 0x00600001, 0x28000208, 0x008d0440, 0x00000000 },
+ { 0x0d600031, 0x21800a08, 0x0e000800, 0x10782000 },
+ { 0x00600001, 0x28000208, 0x008d0480, 0x00000000 },
+ { 0x00000001, 0x28200208, 0x00000180, 0x00000000 },
+ { 0x00000001, 0x28240208, 0x00000190, 0x00000000 },
+ { 0x00000001, 0x28280208, 0x00000194, 0x00000000 },
+ { 0x00000001, 0x282c0208, 0x00000198, 0x00000000 },
+ { 0x00000001, 0x28301248, 0x0000018c, 0x00000000 },
+ { 0x00000001, 0x28340208, 0x00000188, 0x00000000 },
+ { 0x00000001, 0x28380208, 0x0000019c, 0x00000000 },
+ { 0x00000001, 0x283c0208, 0x00000488, 0x00000000 },
+ { 0x0a800031, 0x20000a60, 0x0e000800, 0x040a0203 },
+ { 0x00600001, 0x2e000208, 0x008d0000, 0x00000000 },
+ { 0x07800031, 0x24000a40, 0x0e000e00, 0x82000010 },
diff --git a/src/vp8_probs.h b/src/vp8_probs.h
index 8dd4290..e864b68 100644
--- a/src/vp8_probs.h
+++ b/src/vp8_probs.h
@@ -42,27 +42,27 @@
#ifndef VP8_PROBS_H
#define VP8_PROBS_H
-const unsigned char vp8_ymode_prob[4] =
+static const unsigned char vp8_ymode_prob[4] =
{
112, 86, 140, 37
};
-const unsigned char vp8_kf_ymode_prob[4] =
+static const unsigned char vp8_kf_ymode_prob[4] =
{
145, 156, 163, 128
};
-const unsigned char vp8_uv_mode_prob[3] =
+static const unsigned char vp8_uv_mode_prob[3] =
{
162, 101, 204
};
-static const unsigned char vp8_kf_uv_mode_prob[3] =
+static const unsigned char vp8_kf_uv_mode_prob[3] =
{
142, 114, 183
};
-const unsigned char vp8_base_skip_false_prob[128] =
+static const unsigned char vp8_base_skip_false_prob[128] =
{
255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255,
@@ -82,7 +82,7 @@ const unsigned char vp8_base_skip_false_prob[128] =
30, 28, 26, 24, 22, 20, 18, 16,
};
-const unsigned char vp8_mv_update_probs[2][19] =
+static const unsigned char vp8_mv_update_probs[2][19] =
{
{
237,
@@ -98,7 +98,7 @@ const unsigned char vp8_mv_update_probs[2][19] =
}
};
-const unsigned char vp8_default_mv_context[2][19] =
+static const unsigned char vp8_default_mv_context[2][19] =
{
{
162, /* is short */
@@ -116,7 +116,7 @@ const unsigned char vp8_default_mv_context[2][19] =
}
};
-const unsigned char vp8_default_coef_probs[4][8][3][11] =
+static const unsigned char vp8_default_coef_probs[4][8][3][11] =
{
{ /* Block Type ( 0 ) */
{ /* Coeff Band ( 0 )*/
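The vp8_probs.h hunks above add static linkage to the probability tables. In C, an initialized const array at file scope in a header otherwise becomes an external definition in every translation unit that includes it, and linking two such objects fails with multiple-definition errors; with static const, each includer simply gets its own private copy. A minimal illustration (the function name is made up for the example):

#include "vp8_probs.h"

unsigned char first_vp8_ymode_prob(void)
{
    /* every .c file including the header now sees its own
       static const copy of vp8_ymode_prob */
    return vp8_ymode_prob[0];
}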
--
1.9.1