Merge branch 'master' into staging
author Xiang, Haihao <haihao.xiang@intel.com>
Wed, 3 Apr 2013 00:58:13 +0000 (08:58 +0800)
committer Xiang, Haihao <haihao.xiang@intel.com>
Wed, 3 Apr 2013 01:15:56 +0000 (09:15 +0800)
Conflicts:
NEWS
configure.ac
src/Makefile.am
src/gen6_mfc.c
src/gen6_mfd.c
src/gen6_vme.c
src/gen6_vme.h
src/gen75_mfc.c
src/gen75_mfd.c
src/gen75_vme.c
src/gen75_vpp_vebox.c
src/gen75_vpp_vebox.h
src/gen7_mfd.c
src/i965_avc_bsd.c
src/i965_decoder.h
src/i965_decoder_utils.c
src/i965_defines.h
src/i965_drv_video.c
src/i965_drv_video.h
src/i965_encoder.c
src/i965_encoder.h
src/i965_output_dri.c
src/i965_post_processing.c
src/i965_post_processing.h
src/i965_render.c
src/i965_structs.h
src/intel_driver.c
src/object_heap.c
src/shaders/post_processing/Common/AYUV_Load_16x8.asm
src/shaders/post_processing/Common/AYUV_Load_16x8.inc
src/shaders/post_processing/Common/Init_All_Regs.asm
src/shaders/post_processing/Makefile.am
src/shaders/post_processing/gen5_6/Common/AYUV_Load_16x8.asm
src/shaders/post_processing/gen5_6/Common/AYUV_Load_16x8.inc
src/shaders/post_processing/gen5_6/Common/Init_All_Regs.asm
src/shaders/post_processing/gen5_6/Common/NV12_Load_8x4.asm
src/shaders/post_processing/gen5_6/Common/RGBX_Load_16x8.asm
src/shaders/post_processing/gen5_6/Common/RGBX_Load_16x8.inc
src/shaders/post_processing/gen5_6/Makefile.am
src/shaders/post_processing/gen5_6/nv12_avs_nv12.g4b.gen5
src/shaders/post_processing/gen5_6/nv12_avs_nv12.g6b
src/shaders/post_processing/gen5_6/nv12_dn_nv12.g4b.gen5
src/shaders/post_processing/gen5_6/nv12_dn_nv12.g6b
src/shaders/post_processing/gen5_6/nv12_dndi_nv12.g4b.gen5
src/shaders/post_processing/gen5_6/nv12_dndi_nv12.g6b
src/shaders/post_processing/gen5_6/nv12_load_save_nv12.g4b.gen5
src/shaders/post_processing/gen5_6/nv12_load_save_nv12.g6b
src/shaders/post_processing/gen5_6/nv12_load_save_pa.g4b.gen5
src/shaders/post_processing/gen5_6/nv12_load_save_pa.g6b
src/shaders/post_processing/gen5_6/nv12_load_save_pl3.g4b.gen5
src/shaders/post_processing/gen5_6/nv12_load_save_pl3.g6b
src/shaders/post_processing/gen5_6/pa_load_save_nv12.g4b.gen5
src/shaders/post_processing/gen5_6/pa_load_save_nv12.g6b
src/shaders/post_processing/gen5_6/pa_load_save_pl3.g4b.gen5
src/shaders/post_processing/gen5_6/pa_load_save_pl3.g6b
src/shaders/post_processing/gen5_6/pl3_load_save_nv12.g4b.gen5
src/shaders/post_processing/gen5_6/pl3_load_save_nv12.g6b
src/shaders/post_processing/gen5_6/pl3_load_save_pa.g4b.gen5
src/shaders/post_processing/gen5_6/pl3_load_save_pa.g6b
src/shaders/post_processing/gen5_6/pl3_load_save_pl3.g4b.gen5
src/shaders/post_processing/gen5_6/pl3_load_save_pl3.g6b
src/shaders/post_processing/gen7/EOT.g4a
src/shaders/post_processing/gen7/Makefile.am
src/shaders/post_processing/gen7/PA_AVS_Buf_0.g4a
src/shaders/post_processing/gen7/PA_AVS_Buf_1.g4a
src/shaders/post_processing/gen7/PA_AVS_Buf_2.g4a
src/shaders/post_processing/gen7/PA_AVS_Buf_3.g4a
src/shaders/post_processing/gen7/PL2_AVS_Buf_0.g4a
src/shaders/post_processing/gen7/PL2_AVS_Buf_1.g4a
src/shaders/post_processing/gen7/PL2_AVS_Buf_2.g4a
src/shaders/post_processing/gen7/PL2_AVS_Buf_3.g4a
src/shaders/post_processing/gen7/PL3_AVS_Buf_0.g4a
src/shaders/post_processing/gen7/PL3_AVS_Buf_1.g4a
src/shaders/post_processing/gen7/PL3_AVS_Buf_2.g4a
src/shaders/post_processing/gen7/PL3_AVS_Buf_3.g4a
src/shaders/post_processing/gen7/Save_AVS_NV12.g4a
src/shaders/post_processing/gen7/Save_AVS_PA.g4a
src/shaders/post_processing/gen7/Save_AVS_PL3.g4a
src/shaders/post_processing/gen7/Save_AVS_RGB.g4a
src/shaders/post_processing/gen7/Set_AVS_Buf_0123_BGRA.g4a
src/shaders/post_processing/gen7/Set_AVS_Buf_0123_PL2.g4a
src/shaders/post_processing/gen7/Set_AVS_Buf_0123_PL3.g4a
src/shaders/post_processing/gen7/Set_AVS_Buf_0123_VUYA.g4a
src/shaders/post_processing/gen7/Set_AVS_Buf_0123_VYUA.g4a
src/shaders/post_processing/gen7/Set_Layer_0.g4a
src/shaders/post_processing/gen7/VP_Setup.g4a
src/shaders/vme/Makefile.am
src/shaders/vme/inter_frame_haswell.asm
src/shaders/vme/inter_frame_haswell.g75b
src/shaders/vme/intra_frame_haswell.asm
src/shaders/vme/intra_frame_haswell.g75b
src/shaders/vme/vme75.inc
src/shaders/vme/vme7_mpeg2.inc

29 files changed:
1  2 
configure.ac
src/Makefile.am
src/gen6_mfc.c
src/gen6_mfd.c
src/gen6_vme.c
src/gen6_vme.h
src/gen75_mfc.c
src/gen75_mfd.c
src/gen75_vme.c
src/gen75_vpp_vebox.c
src/gen75_vpp_vebox.h
src/gen7_mfd.c
src/i965_avc_bsd.c
src/i965_decoder.h
src/i965_decoder_utils.c
src/i965_defines.h
src/i965_drv_video.c
src/i965_drv_video.h
src/i965_output_dri.c
src/i965_post_processing.c
src/i965_post_processing.h
src/i965_render.c
src/intel_driver.c
src/intel_driver.h
src/shaders/post_processing/gen5_6/Makefile.am
src/shaders/post_processing/gen7/Makefile.am
src/shaders/vme/inter_frame_haswell.asm
src/shaders/vme/intra_frame_haswell.asm
src/sysdeps.h

diff --cc configure.ac
@@@ -10,8 -10,8 +10,8 @@@ m4_append([intel_driver_version], intel
  ])
  
  # libva minimum version requirement
- m4_define([libva_version], [0.34])
 -m4_define([libva_package_version], [1.0.14])
 -m4_define([va_api_version], [0.32.0])
++m4_define([va_api_version], [0.34])
 +m4_define([libva_package_version], [1.2.0])
  
  # libdrm minimum version requirement
  m4_define([libdrm_version], [2.4.23])
diff --cc src/Makefile.am
index 3b020b6,4573557..3299733
mode 100755,100644..100755
@@@ -48,18 -48,13 +48,18 @@@ driver_libs = 
  source_c = \
        dso_utils.c             \
        gen6_mfc.c              \
 +      gen6_mfc_common.c       \
        gen6_mfd.c              \
        gen6_vme.c              \
 +      gen7_vme.c              \
 +      gen7_mfc.c              \
        gen7_mfd.c              \
        gen75_mfd.c             \
--      gen75_vme.c             \
        gen75_mfc.c             \
-       gen75_vpp_vebox.c       \
-       gen75_vpp_gpe.c         \
 +      gen75_picture_process.c \
++      gen75_vme.c             \
++      gen75_vpp_gpe.c         \
+       gen75_vpp_vebox.c       \
        i965_avc_bsd.c          \
        i965_avc_hw_scoreboard.c\
        i965_avc_ildb.c         \
@@@ -87,9 -79,7 +87,9 @@@ source_h = 
        gen6_mfd.h              \
        gen6_vme.h              \
        gen7_mfd.h              \
-       gen75_vpp_vebox.h       \
-       gen75_vpp_gpe.h         \
 +      gen75_picture_process.h \
++      gen75_vpp_gpe.h         \
+       gen75_vpp_vebox.h       \
        i965_avc_bsd.h          \
        i965_avc_hw_scoreboard.h\
        i965_avc_ildb.h         \
        intel_batchbuffer_dump.h\
        intel_compiler.h        \
        intel_driver.h          \
++      intel_media.h           \
        intel_memman.h          \
        object_heap.h           \
-       intel_media.h           \
+       sysdeps.h               \
+       va_backend_compat.h     \
        $(NULL)
  
  i965_drv_video_la_LTLIBRARIES = i965_drv_video.la
diff --cc src/gen6_mfc.c
@@@ -449,11 -472,15 +449,12 @@@ gen6_mfc_avc_slice_state(VADriverContex
  
      ADVANCE_BCS_BATCH(batch);
  }
 -static void gen6_mfc_avc_qm_state(VADriverContextP ctx,
 -                                  struct gen6_encoder_context *gen6_encoder_context,
 -                                  struct intel_batchbuffer *batch)
++
 +static void gen6_mfc_avc_qm_state(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
  {
 +    struct intel_batchbuffer *batch = encoder_context->base.batch;
      int i;
  
 -    if (batch == NULL)
 -        batch = gen6_encoder_context->base.batch;
 -
      BEGIN_BCS_BATCH(batch, 58);
  
      OUT_BCS_BATCH(batch, MFX_AVC_QM_STATE | 56);
diff --cc src/gen6_mfd.c
   *
   */
  
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
- #include <assert.h>
 -#ifndef HAVE_GEN_AVC_SURFACE
 -#define HAVE_GEN_AVC_SURFACE 1
 -#endif
--
+ #include "sysdeps.h"
  #include "intel_batchbuffer.h"
  #include "intel_driver.h"
  #include "i965_defines.h"
  #include "i965_drv_video.h"
  #include "i965_decoder_utils.h"
diff --cc src/gen6_vme.c
@@@ -508,35 -862,76 +508,35 @@@ static void gen6_vme_media_init(VADrive
  
  static void gen6_vme_pipeline_programing(VADriverContextP ctx, 
                                           struct encode_state *encode_state,
 -                                         struct gen6_encoder_context *gen6_encoder_context)
 +                                         struct intel_encoder_context *encoder_context)
  {
 -    struct i965_driver_data *i965 = i965_driver_data(ctx);
 -    struct intel_batchbuffer *main_batch = gen6_encoder_context->base.batch;
 -    VAEncSliceParameterBuffer *pSliceParameter = (VAEncSliceParameterBuffer *)encode_state->slice_params[0]->buffer;
 -    VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param->buffer;
 -    int is_intra = pSliceParameter->slice_flags.bits.is_intra;
 +    struct gen6_vme_context *vme_context = encoder_context->vme_context;
 +    struct intel_batchbuffer *batch = encoder_context->base.batch;
 +    VAEncPictureParameterBufferH264 *pPicParameter = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
 +    VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
 +    VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
 +    int is_intra = pSliceParameter->slice_type == SLICE_TYPE_I;
      int width_in_mbs = pSequenceParameter->picture_width_in_mbs;
      int height_in_mbs = pSequenceParameter->picture_height_in_mbs;
 -    int emit_new_state = 1, object_len_in_bytes;
 -    int x, y;
 -    struct intel_batchbuffer *batch = intel_batchbuffer_new(&i965->intel, I915_EXEC_RENDER, width_in_mbs * height_in_mbs * 8 * 4 + 0x200);
 -
 -    intel_batchbuffer_start_atomic(batch, width_in_mbs * height_in_mbs * 8 * 4 + 0x100);
  
 -    for(y = 0; y < height_in_mbs; y++){
 -        for(x = 0; x < width_in_mbs; x++){    
 -
 -            if (emit_new_state) {
 -                /*Step1: MI_FLUSH/PIPE_CONTROL*/
 -                intel_batchbuffer_emit_mi_flush(batch);
 -
 -                /*Step2: State command PIPELINE_SELECT*/
 -                gen6_vme_pipeline_select(ctx, gen6_encoder_context, batch);
 -
 -                /*Step3: State commands configuring pipeline states*/
 -                gen6_vme_state_base_address(ctx, gen6_encoder_context, batch);
 -                gen6_vme_vfe_state(ctx, gen6_encoder_context, batch);
 -                gen6_vme_curbe_load(ctx, gen6_encoder_context, batch);
 -                gen6_vme_idrt(ctx, gen6_encoder_context, batch);
 -
 -                emit_new_state = 0;
 -            }
 -
 -            /*Step4: Primitive commands*/
 -            object_len_in_bytes = gen6_vme_media_object(ctx, encode_state, x, y, is_intra ? VME_INTRA_SHADER : VME_INTER_SHADER, gen6_encoder_context, batch);
 -
 -            if (intel_batchbuffer_check_free_space(batch, object_len_in_bytes) == 0) {
 -                assert(0);
 -                intel_batchbuffer_end_atomic(batch);  
 -                intel_batchbuffer_flush(batch);
 -                emit_new_state = 1;
 -                intel_batchbuffer_start_atomic(batch, 0x1000);
 -            }
 -        }
 -    }
 -
 -    intel_batchbuffer_align(batch, 8);
 +    gen6_vme_fill_vme_batchbuffer(ctx, 
 +                                  encode_state,
 +                                  width_in_mbs, height_in_mbs,
 +                                  is_intra ? AVC_VME_INTRA_SHADER : AVC_VME_INTER_SHADER, 
 +                                  pPicParameter->pic_fields.bits.transform_8x8_mode_flag,
 +                                  encoder_context);
  
 +    intel_batchbuffer_start_atomic(batch, 0x1000);
 +    gen6_gpe_pipeline_setup(ctx, &vme_context->gpe_context, batch);
      BEGIN_BATCH(batch, 2);
 -    OUT_BATCH(batch, 0);
 -    OUT_BATCH(batch, MI_BATCH_BUFFER_END);
 +    OUT_BATCH(batch, MI_BATCH_BUFFER_START | (2 << 6));
 +    OUT_RELOC(batch,
 +              vme_context->vme_batchbuffer.bo,
 +              I915_GEM_DOMAIN_COMMAND, 0, 
 +              0);
      ADVANCE_BATCH(batch);
  
-     intel_batchbuffer_end_atomic(batch);      
+     intel_batchbuffer_end_atomic(batch);
 -
 -    /* chain to the main batch buffer */
 -    intel_batchbuffer_start_atomic(main_batch, 0x100);
 -    intel_batchbuffer_emit_mi_flush(main_batch);
 -    BEGIN_BATCH(main_batch, 2);
 -    OUT_BATCH(main_batch, MI_BATCH_BUFFER_START | (2 << 6));
 -    OUT_RELOC(main_batch,
 -              batch->buffer,
 -              I915_GEM_DOMAIN_COMMAND, 0,
 -              0);
 -    ADVANCE_BATCH(main_batch);
 -    intel_batchbuffer_end_atomic(main_batch);
 -
 -    // end programing             
 -    intel_batchbuffer_free(batch);
  }
  
  static VAStatus gen6_vme_prepare(VADriverContextP ctx, 
diff --cc src/gen6_vme.h
  #include <i915_drm.h>
  #include <intel_bufmgr.h>
  
 +#include "i965_gpe_utils.h"
  
  #define INTRA_VME_OUTPUT_IN_BYTES       16      /* in bytes */
 -#define MAX_INTERFACE_DESC_GEN6      32
 -#define MAX_MEDIA_SURFACES_GEN6      34
 +#define INTRA_VME_OUTPUT_IN_DWS         (INTRA_VME_OUTPUT_IN_BYTES / 4)
 +#define INTER_VME_OUTPUT_IN_BYTES       160     /* the first 128 bytes for MVs and the last 32 bytes for other info */
 +#define INTER_VME_OUTPUT_IN_DWS         (INTER_VME_OUTPUT_IN_BYTES / 4)
  
- #define MAX_INTERFACE_DESC_GEN6      MAX_GPE_KERNELS
- #define MAX_MEDIA_SURFACES_GEN6      34
 -#define GEN6_VME_KERNEL_NUMBER          2
++#define MAX_INTERFACE_DESC_GEN6         MAX_GPE_KERNELS
++#define MAX_MEDIA_SURFACES_GEN6         34
 +
 +#define GEN6_VME_KERNEL_NUMBER          3
  
  struct encode_state;
 -struct gen6_encoder_context;
 +struct intel_encoder_context;
  
  struct gen6_vme_context
  {
diff --cc src/gen75_mfc.c
@@@ -1,5 -1,5 +1,5 @@@
  /*
-  * Copyright © 2012 Intel Corporation
 - * Copyright © 2010-2011 Intel Corporation
++ * Copyright © 2010-2012 Intel Corporation
   *
   * Permission is hereby granted, free of charge, to any person obtaining a
   * copy of this software and associated documentation files (the
  #define B0_STEP_REV           2
  #define IS_STEPPING_BPLUS(i965)       ((i965->intel.revision) >= B0_STEP_REV)
  
 +static const uint32_t gen75_mfc_batchbuffer_avc_intra[][4] = {
 +#include "shaders/utils/mfc_batchbuffer_avc_intra.g7b"
 +};
 +
 +static const uint32_t gen75_mfc_batchbuffer_avc_inter[][4] = {
 +#include "shaders/utils/mfc_batchbuffer_avc_inter.g7b"
 +};
 +
 +static struct i965_kernel gen75_mfc_kernels[] = {
 +    {
 +        "MFC AVC INTRA BATCHBUFFER ",
 +        MFC_BATCHBUFFER_AVC_INTRA,
 +        gen75_mfc_batchbuffer_avc_intra,
 +        sizeof(gen75_mfc_batchbuffer_avc_intra),
 +        NULL
 +    },
 +
 +    {
 +        "MFC AVC INTER BATCHBUFFER ",
 +        MFC_BATCHBUFFER_AVC_INTER,
 +        gen75_mfc_batchbuffer_avc_inter,
 +        sizeof(gen75_mfc_batchbuffer_avc_inter),
 +        NULL
 +    },
 +};
 +
 +#define               INTER_MODE_MASK         0x03
 +#define               INTER_8X8               0x03
 +#define               INTER_16X8              0x01
 +#define               INTER_8X16              0x02
 +#define               SUBMB_SHAPE_MASK        0x00FF00
 +
 +#define               INTER_MV8               (4 << 20)
 +#define               INTER_MV32              (6 << 20)
 +
 +
  static void
  gen75_mfc_pipe_mode_select(VADriverContextP ctx,
 -                           int standard_select,
 -                           struct gen6_encoder_context *gen6_encoder_context,
 -                           struct intel_batchbuffer *batch)
 +                          int standard_select,
 +                          struct intel_encoder_context *encoder_context)
  {
 -    if (batch == NULL)
 -        batch = gen6_encoder_context->base.batch;
 -
 +    struct intel_batchbuffer *batch = encoder_context->base.batch;
 +    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
      assert(standard_select == MFX_FORMAT_MPEG2 ||
             standard_select == MFX_FORMAT_AVC);
  
@@@ -209,17 -324,22 +208,18 @@@ gen75_mfc_ind_obj_base_addr_state_bplus
  }
  
  static void
 -gen75_mfc_ind_obj_base_addr_state(VADriverContextP ctx,
 -                                  struct gen6_encoder_context *gen6_encoder_context,
 -                                  struct intel_batchbuffer *batch)
 +gen75_mfc_ind_obj_base_addr_state(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
  {
 -    struct gen6_vme_context *vme_context = &gen6_encoder_context->vme_context;
 -    struct gen6_mfc_context *mfc_context = &gen6_encoder_context->mfc_context;
 +    struct intel_batchbuffer *batch = encoder_context->base.batch;
 +    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
 +    struct gen6_vme_context *vme_context = encoder_context->vme_context;
      struct i965_driver_data *i965 = i965_driver_data(ctx);
  
 -    if (batch == NULL)
 -        batch = gen6_encoder_context->base.batch;
 -
      if (IS_STEPPING_BPLUS(i965)) {
 -      gen75_mfc_ind_obj_base_addr_state_bplus(ctx, gen6_encoder_context, batch);
 +      gen75_mfc_ind_obj_base_addr_state_bplus(ctx, encoder_context);
        return;
      }
      BEGIN_BCS_BATCH(batch, 11);
  
      OUT_BCS_BATCH(batch, MFX_IND_OBJ_BASE_ADDR_STATE | (11 - 2));
@@@ -256,16 -405,43 +256,16 @@@ gen75_mfc_avc_img_state(VADriverContext
      int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
      int height_in_mbs = (mfc_context->surface_state.height + 15) / 16;
  
 -    if (batch == NULL)
 -        batch = gen6_encoder_context->base.batch;
 -
      BEGIN_BCS_BATCH(batch, 16);
 +
      OUT_BCS_BATCH(batch, MFX_AVC_IMG_STATE | (16 - 2));
-       /*DW1. MB setting of frame */
++    /*DW1. MB setting of frame */
      OUT_BCS_BATCH(batch,
                    ((width_in_mbs * height_in_mbs) & 0xFFFF));
      OUT_BCS_BATCH(batch, 
                    ((height_in_mbs - 1) << 16) | 
                    ((width_in_mbs - 1) << 0));
-       /* DW3 QP setting */
++    /* DW3 QP setting */
      OUT_BCS_BATCH(batch, 
                    (0 << 24) | /* Second Chroma QP Offset */
                    (0 << 16) | /* Chroma QP Offset */
@@@ -2519,91 -1179,5 +2519,90 @@@ gen75_mfc_context_destroy(void *context
      dri_bo_unreference(mfc_context->bsd_mpc_row_store_scratch_buffer.bo);
      mfc_context->bsd_mpc_row_store_scratch_buffer.bo = NULL;
  
 +    for (i = 0; i < MAX_MFC_REFERENCE_SURFACES; i++){
 +        dri_bo_unreference(mfc_context->reference_surfaces[i].bo);
 +        mfc_context->reference_surfaces[i].bo = NULL;  
 +    }
 +
 +    i965_gpe_context_destroy(&mfc_context->gpe_context);
 +
 +    dri_bo_unreference(mfc_context->mfc_batchbuffer_surface.bo);
 +    mfc_context->mfc_batchbuffer_surface.bo = NULL;
 +
 +    dri_bo_unreference(mfc_context->aux_batchbuffer_surface.bo);
 +    mfc_context->aux_batchbuffer_surface.bo = NULL;
 +
 +    if (mfc_context->aux_batchbuffer)
 +        intel_batchbuffer_free(mfc_context->aux_batchbuffer);
 +
 +    mfc_context->aux_batchbuffer = NULL;
 +
 +    free(mfc_context);
 +}
 +
 +static VAStatus gen75_mfc_pipeline(VADriverContextP ctx,
 +                  VAProfile profile,
 +                  struct encode_state *encode_state,
 +                  struct intel_encoder_context *encoder_context)
 +{
 +    VAStatus vaStatus;
 +
 +    switch (profile) {
 +    case VAProfileH264Baseline:
 +    case VAProfileH264Main:
 +    case VAProfileH264High:
 +        vaStatus = gen75_mfc_avc_encode_picture(ctx, encode_state, encoder_context);
 +        break;
 +
 +        /* FIXME: add for other profile */
 +    case VAProfileMPEG2Simple:
 +    case VAProfileMPEG2Main:
 +        vaStatus = gen75_mfc_mpeg2_encode_picture(ctx, encode_state, encoder_context);
 +        break;
 +
 +    default:
 +        vaStatus = VA_STATUS_ERROR_UNSUPPORTED_PROFILE;
 +        break;
 +    }
 +
 +    return vaStatus;
 +}
 +
 +Bool gen75_mfc_context_init(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
 +{
 +    struct gen6_mfc_context *mfc_context = calloc(1, sizeof(struct gen6_mfc_context));
 +
 +    mfc_context->gpe_context.surface_state_binding_table.length = (SURFACE_STATE_PADDED_SIZE + sizeof(unsigned int)) * MAX_MEDIA_SURFACES_GEN6;
 +
 +    mfc_context->gpe_context.idrt.max_entries = MAX_GPE_KERNELS;
 +    mfc_context->gpe_context.idrt.entry_size = sizeof(struct gen6_interface_descriptor_data);
 +
 +    mfc_context->gpe_context.curbe.length = 32 * 4;
 +
 +    mfc_context->gpe_context.vfe_state.max_num_threads = 60 - 1;
 +    mfc_context->gpe_context.vfe_state.num_urb_entries = 16;
 +    mfc_context->gpe_context.vfe_state.gpgpu_mode = 0;
 +    mfc_context->gpe_context.vfe_state.urb_entry_size = 59 - 1;
 +    mfc_context->gpe_context.vfe_state.curbe_allocation_size = 37 - 1;
 +
 +    i965_gpe_load_kernels(ctx,
 +                          &mfc_context->gpe_context,
 +                          gen75_mfc_kernels,
 +                          NUM_MFC_KERNEL);
 +
 +    mfc_context->pipe_mode_select = gen75_mfc_pipe_mode_select;
 +    mfc_context->set_surface_state = gen75_mfc_surface_state;
 +    mfc_context->ind_obj_base_addr_state = gen75_mfc_ind_obj_base_addr_state;
 +    mfc_context->avc_img_state = gen75_mfc_avc_img_state;
 +    mfc_context->avc_qm_state = gen75_mfc_avc_qm_state;
 +    mfc_context->avc_fqm_state = gen75_mfc_avc_fqm_state;
 +    mfc_context->insert_object = gen75_mfc_avc_insert_object;
 +    mfc_context->buffer_suface_setup = gen7_gpe_buffer_suface_setup;
 +
 +    encoder_context->mfc_context = mfc_context;
 +    encoder_context->mfc_context_destroy = gen75_mfc_context_destroy;
 +    encoder_context->mfc_pipeline = gen75_mfc_pipeline;
 +    encoder_context->mfc_brc_prepare = intel_mfc_brc_prepare;
 +
      return True;
  }
diff --cc src/gen75_mfd.c
   *
   * Authors:
   *    Xiang Haihao <haihao.xiang@intel.com>
-  *    Zhao  Yakui  <yakui.zhao@intel.com>
 - *    Zhao Yakui <yakui.zhao@intel.com>
++ *    Zhao Yakui  <yakui.zhao@intel.com>
   *
   */
++#include "sysdeps.h"
  
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
- #include <assert.h>
 -#ifndef HAVE_GEN_AVC_SURFACE
 -#define HAVE_GEN_AVC_SURFACE 1
 -#endif
 +#include <va/va_dec_jpeg.h>
  
 -#include <stdio.h>
 -#include <stdlib.h>
 -#include <string.h>
 -#include <assert.h>
 -
 -#include "config.h"
  #include "intel_batchbuffer.h"
  #include "intel_driver.h"
--
  #include "i965_defines.h"
  #include "i965_drv_video.h"
  #include "i965_decoder_utils.h"
--
  #include "gen7_mfd.h"
 +#include "intel_media.h"
  
  #define B0_STEP_REV           2
  #define IS_STEPPING_BPLUS(i965)       ((i965->intel.revision) >= B0_STEP_REV)
@@@ -275,12 -396,12 +270,13 @@@ gen75_mfd_pipe_buf_addr_state(VADriverC
      struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
      struct i965_driver_data *i965 = i965_driver_data(ctx);
      int i;
-       if (IS_STEPPING_BPLUS(i965)) {
-               gen75_mfd_pipe_buf_addr_state_bplus(ctx, decode_state,
-                               standard_select, gen7_mfd_context);
-               return;
-       }
 +
 -      gen75_mfd_pipe_buf_addr_state_bplus(ctx, decode_state,
 -                      standard_select, gen7_mfd_context);
 -      return;
+     if (IS_STEPPING_BPLUS(i965)) {
++        gen75_mfd_pipe_buf_addr_state_bplus(ctx, decode_state,
++                                            standard_select, gen7_mfd_context);
++        return;
+     }
      BEGIN_BCS_BATCH(batch, 25);
      OUT_BCS_BATCH(batch, MFX_PIPE_BUF_ADDR_STATE | (25 - 2));
      if (gen7_mfd_context->pre_deblocking_output.valid)
@@@ -392,14 -512,14 +388,14 @@@ gen75_mfd_ind_obj_base_addr_state(VADri
                                   struct gen7_mfd_context *gen7_mfd_context)
  {
      struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
      struct i965_driver_data *i965 = i965_driver_data(ctx);
  
-       if (IS_STEPPING_BPLUS(i965)) {
-               gen75_mfd_ind_obj_base_addr_state_bplus(ctx, slice_data_bo,
-                                       standard_select, gen7_mfd_context);
-               return;
-       }
+     if (IS_STEPPING_BPLUS(i965)) {
 -      gen75_mfd_ind_obj_base_addr_state_bplus(ctx, slice_data_bo,
 -                              standard_select, gen7_mfd_context);
 -      return;
++        gen75_mfd_ind_obj_base_addr_state_bplus(ctx, slice_data_bo,
++                                                standard_select, gen7_mfd_context);
++        return;
+     }
      BEGIN_BCS_BATCH(batch, 11);
      OUT_BCS_BATCH(batch, MFX_IND_OBJ_BASE_ADDR_STATE | (11 - 2));
      OUT_BCS_RELOC(batch, slice_data_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); /* MFX Indirect Bitstream Object Base Address */
@@@ -466,12 -588,12 +463,12 @@@ gen75_mfd_bsp_buf_base_addr_state(VADri
      struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
      struct i965_driver_data *i965 = i965_driver_data(ctx);
  
-       if (IS_STEPPING_BPLUS(i965)) {
-               gen75_mfd_bsp_buf_base_addr_state_bplus(ctx, decode_state,
-                                       standard_select, gen7_mfd_context);
-               return;
-       }
+     if (IS_STEPPING_BPLUS(i965)) {
 -      gen75_mfd_bsp_buf_base_addr_state_bplus(ctx, decode_state,
 -                              standard_select, gen7_mfd_context);
 -      return;
 -     }
 - 
++        gen75_mfd_bsp_buf_base_addr_state_bplus(ctx, decode_state,
++                                                standard_select, gen7_mfd_context);
++        return;
++    }
 +
      BEGIN_BCS_BATCH(batch, 4);
      OUT_BCS_BATCH(batch, MFX_BSP_BUF_BASE_ADDR_STATE | (4 - 2));
  
@@@ -557,7 -654,8 +529,9 @@@ gen75_mfd_avc_img_state(VADriverContext
  
      assert(decode_state->pic_param && decode_state->pic_param->buffer);
      pic_param = (VAPictureParameterBufferH264 *)decode_state->pic_param->buffer;
 +
+     assert(!(pic_param->CurrPic.flags & VA_PICTURE_H264_INVALID));
      if (pic_param->CurrPic.flags & VA_PICTURE_H264_TOP_FIELD)
          img_struct = 1;
      else if (pic_param->CurrPic.flags & VA_PICTURE_H264_BOTTOM_FIELD)
@@@ -708,7 -810,7 +682,8 @@@ gen75_mfd_avc_directmode_state_bplus(VA
              OUT_BCS_BATCH(batch, 0);
          }
      }
--      OUT_BCS_BATCH(batch, 0);
++
++    OUT_BCS_BATCH(batch, 0);
  
      /* the current decoding frame/field */
      va_pic = &pic_param->CurrPic;
                    I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                    0);
  
--      OUT_BCS_BATCH(batch, 0);
--      OUT_BCS_BATCH(batch, 0);
++    OUT_BCS_BATCH(batch, 0);
++    OUT_BCS_BATCH(batch, 0);
  
      /* POC List */
      for (i = 0; i < ARRAY_ELEMS(gen7_mfd_context->reference_surface); i++) {
@@@ -774,12 -873,11 +749,12 @@@ gen75_mfd_avc_directmode_state(VADriver
      VAPictureH264 *va_pic;
      int i, j;
  
-       if (IS_STEPPING_BPLUS(i965)) {
-             gen75_mfd_avc_directmode_state_bplus(ctx, decode_state, pic_param, slice_param,
-                       gen7_mfd_context);
+     if (IS_STEPPING_BPLUS(i965)) {
 -      gen75_mfd_avc_directmode_state_bplus(ctx, pic_param, slice_param,
 -              gen7_mfd_context);
 -      return;
++        gen75_mfd_avc_directmode_state_bplus(ctx, decode_state, pic_param, slice_param,
++                                             gen7_mfd_context);
 +
-               return;
-       }
++        return;
+     }
  
      BEGIN_BCS_BATCH(batch, 69);
      OUT_BCS_BATCH(batch, MFX_AVC_DIRECTMODE_STATE | (69 - 2));
@@@ -2237,8 -2371,8 +2212,7 @@@ gen75_mfd_jpeg_decode_init(VADriverCont
      }
  
      /* Current decoded picture */
 -    obj_surface = SURFACE(decode_state->current_render_target);
 -    assert(obj_surface);
 +    obj_surface = decode_state->render_object;
      i965_check_alloc_surface_bo(ctx, obj_surface, 1, VA_FOURCC('I','M','C','1'), subsampling);
  
      dri_bo_unreference(gen7_mfd_context->pre_deblocking_output.bo);
@@@ -2367,8 -2501,6 +2341,10 @@@ gen75_mfd_jpeg_huff_table_state(VADrive
  
      for (index = 0; index < num_tables; index++) {
          int id = va_to_gen7_jpeg_hufftable[index];
++
 +        if (!huffman_table->load_huffman_table[index])
 +            continue;
++
          BEGIN_BCS_BATCH(batch, 53);
          OUT_BCS_BATCH(batch, MFX_JPEG_HUFF_TABLE_STATE | (53 - 2));
          OUT_BCS_BATCH(batch, id);
@@@ -2691,10 -2819,10 +2668,11 @@@ gen75_jpeg_wa_pipe_buf_addr_state(VADri
      dri_bo *intra_bo;
      int i;
  
-       if (IS_STEPPING_BPLUS(i965)) {
-               gen75_jpeg_wa_pipe_buf_addr_state_bplus(ctx, gen7_mfd_context);
-               return;
-       }
+     if (IS_STEPPING_BPLUS(i965)) {
 -      gen75_jpeg_wa_pipe_buf_addr_state_bplus(ctx, gen7_mfd_context);
 -      return;
++        gen75_jpeg_wa_pipe_buf_addr_state_bplus(ctx, gen7_mfd_context);
++        return;
+     }
++
      intra_bo = dri_bo_alloc(i965->intel.bufmgr,
                              "intra row store",
                              128 * 64,
diff --cc src/gen75_vme.c
@@@ -1,5 -1,5 +1,5 @@@
  /*
-  * Copyright © 2012 Intel Corporation
 - * Copyright © 2010-2011 Intel Corporation
++ * Copyright © 2010-2012 Intel Corporation
   *
   * Permission is hereby granted, free of charge, to any person obtaining a
   * copy of this software and associated documentation files (the
   *
   * Authors:
   *    Zhao Yakui <yakui.zhao@intel.com>
 - *    Xiang HaiHao <haihao.xiang@intel.com>
 + *    Xiang Haihao <haihao.xiang@intel.com>
+  *
   */
  
--#include <stdio.h>
--#include <stdlib.h>
- #include <stdbool.h>
--#include <string.h>
--#include <assert.h>
++#include "sysdeps.h"
  
  #include "intel_batchbuffer.h"
  #include "intel_driver.h"
@@@ -473,412 -416,263 +470,411 @@@ static VAStatus gen75_vme_vme_state_set
  
      for (i=5; i < 8; i++) {
        vme_state_message[i] = 0;
 -     }
 +    }
  
 -    return VA_STATUS_SUCCESS;
 -}
 +    switch (encoder_context->profile) {
 +    case VAProfileH264Baseline:
 +    case VAProfileH264Main:
 +    case VAProfileH264High:
 +        gen75_vme_state_setup_fixup(ctx, encode_state, encoder_context, vme_state_message);
  
 -static void gen75_vme_pipeline_select(VADriverContextP ctx,
 -                                      struct gen6_encoder_context *gen6_encoder_context,
 -                                      struct intel_batchbuffer *batch)
 -{
 -    if (batch == NULL)
 -        batch = gen6_encoder_context->base.batch;
 +        break;
  
 -    BEGIN_BATCH(batch, 1);
 -    OUT_BATCH(batch, CMD_PIPELINE_SELECT | PIPELINE_SELECT_MEDIA);
 -    ADVANCE_BATCH(batch);
 +    default:
 +        /* no fixup */
 +        break;
 +    }
 +
 +    return VA_STATUS_SUCCESS;
  }
  
 -static void gen75_vme_state_base_address(VADriverContextP ctx,
 -                                         struct gen6_encoder_context *gen6_encoder_context,
 -                                         struct intel_batchbuffer *batch)
 +static void
 +gen75_vme_fill_vme_batchbuffer(VADriverContextP ctx, 
 +                               struct encode_state *encode_state,
 +                               int mb_width, int mb_height,
 +                               int kernel,
 +                               int transform_8x8_mode_flag,
 +                               struct intel_encoder_context *encoder_context)
  {
 -    struct gen6_vme_context *vme_context = &gen6_encoder_context->vme_context;
 +    struct gen6_vme_context *vme_context = encoder_context->vme_context;
 +    int mb_x = 0, mb_y = 0;
 +    int i, s;
 +    unsigned int *command_ptr;
 +
 +    dri_bo_map(vme_context->vme_batchbuffer.bo, 1);
 +    command_ptr = vme_context->vme_batchbuffer.bo->virtual;
 +
 +    for (s = 0; s < encode_state->num_slice_params_ext; s++) {
 +        VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[s]->buffer; 
 +        int slice_mb_begin = pSliceParameter->macroblock_address;
 +        int slice_mb_number = pSliceParameter->num_macroblocks;
 +        unsigned int mb_intra_ub;
 +      int slice_mb_x = pSliceParameter->macroblock_address % mb_width; 
 +        for (i = 0; i < slice_mb_number;  ) {
 +            int mb_count = i + slice_mb_begin;    
 +            mb_x = mb_count % mb_width;
 +            mb_y = mb_count / mb_width;
 +          mb_intra_ub = 0;
 +          if (mb_x != 0) {
 +              mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
 +          }
 +          if (mb_y != 0) {
 +              mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
 +              if (mb_x != 0)
 +                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
 +              if (mb_x != (mb_width -1))
 +                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
 +          }
 +          if (i < mb_width) {
 +              if (i == 0)
 +                    mb_intra_ub &= ~(INTRA_PRED_AVAIL_FLAG_AE);
 +              mb_intra_ub &= ~(INTRA_PRED_AVAIL_FLAG_BCD_MASK);
 +              if ((i == (mb_width - 1)) && slice_mb_x) {
 +                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
 +              }
 +          }
 +              
 +          if ((i == mb_width) && slice_mb_x) {
 +              mb_intra_ub &= ~(INTRA_PRED_AVAIL_FLAG_D);
 +          }
 +            *command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
 +            *command_ptr++ = kernel;
 +            *command_ptr++ = 0;
 +            *command_ptr++ = 0;
 +            *command_ptr++ = 0;
 +            *command_ptr++ = 0;
 +   
 +            /*inline data */
 +            *command_ptr++ = (mb_width << 16 | mb_y << 8 | mb_x);
 +            *command_ptr++ = ( (1 << 16) | transform_8x8_mode_flag | (mb_intra_ub << 8));
  
 -    if (batch == NULL)
 -        batch = gen6_encoder_context->base.batch;
 +            i += 1;
 +        } 
 +    }
  
 -    BEGIN_BATCH(batch, 10);
 +    *command_ptr++ = 0;
 +    *command_ptr++ = MI_BATCH_BUFFER_END;
  
 -    OUT_BATCH(batch, CMD_STATE_BASE_ADDRESS | 8);
 +    dri_bo_unmap(vme_context->vme_batchbuffer.bo);
 +}
  
 -    OUT_BATCH(batch, 0 | BASE_ADDRESS_MODIFY);                                //General State Base Address
 -    OUT_RELOC(batch, vme_context->surface_state_binding_table.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, BASE_ADDRESS_MODIFY); /* Surface state base address */
 -    OUT_BATCH(batch, 0 | BASE_ADDRESS_MODIFY);                                //Dynamic State Base Address
 -    OUT_BATCH(batch, 0 | BASE_ADDRESS_MODIFY);                                //Indirect Object Base Address
 -    OUT_BATCH(batch, 0 | BASE_ADDRESS_MODIFY);                                //Instruction Base Address
 +static void gen75_vme_media_init(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
 +{
 +    struct gen6_vme_context *vme_context = encoder_context->vme_context;
  
 -    OUT_BATCH(batch, 0xFFFFF000 | BASE_ADDRESS_MODIFY);               //General State Access Upper Bound      
 -    OUT_BATCH(batch, 0xFFFFF000 | BASE_ADDRESS_MODIFY);               //Dynamic State Access Upper Bound
 -    OUT_BATCH(batch, 0xFFFFF000 | BASE_ADDRESS_MODIFY);               //Indirect Object Access Upper Bound
 -    OUT_BATCH(batch, 0xFFFFF000 | BASE_ADDRESS_MODIFY);               //Instruction Access Upper Bound
 +    i965_gpe_context_init(ctx, &vme_context->gpe_context);
  
 -    /*
 -      OUT_BATCH(batch, 0 | BASE_ADDRESS_MODIFY);                              //LLC Coherent Base Address
 -      OUT_BATCH(batch, 0xFFFFF000 | BASE_ADDRESS_MODIFY );            //LLC Coherent Upper Bound
 -    */
 +    /* VME output buffer */
 +    dri_bo_unreference(vme_context->vme_output.bo);
 +    vme_context->vme_output.bo = NULL;
  
 -    ADVANCE_BATCH(batch);
 +    dri_bo_unreference(vme_context->vme_batchbuffer.bo);
 +    vme_context->vme_batchbuffer.bo = NULL;
 +
 +    /* VME state */
 +    dri_bo_unreference(vme_context->vme_state.bo);
 +    vme_context->vme_state.bo = NULL;
  }
  
 -static void gen75_vme_vfe_state(VADriverContextP ctx,
 -                                struct gen6_encoder_context *gen6_encoder_context,
 -                                struct intel_batchbuffer *batch)
 +static void gen75_vme_pipeline_programing(VADriverContextP ctx, 
 +                                          struct encode_state *encode_state,
 +                                          struct intel_encoder_context *encoder_context)
  {
 -    struct gen6_vme_context *vme_context = &gen6_encoder_context->vme_context;
 -
 -    if (batch == NULL)
 -        batch = gen6_encoder_context->base.batch;
 -
 -    BEGIN_BATCH(batch, 8);
 -
 -    OUT_BATCH(batch, CMD_MEDIA_VFE_STATE | 6);                                        /*Gen6 CMD_MEDIA_STATE_POINTERS = CMD_MEDIA_STATE */
 -    OUT_BATCH(batch, 0);                                                                                              /*Scratch Space Base Pointer and Space*/
 -    OUT_BATCH(batch, (vme_context->vfe_state.max_num_threads << 16) 
 -              | (vme_context->vfe_state.num_urb_entries << 8) 
 -              | (vme_context->vfe_state.gpgpu_mode << 2) );   /*Maximum Number of Threads , Number of URB Entries, MEDIA Mode*/
 -    OUT_BATCH(batch, 0);                                                                                              /*Debug: Object ID*/
 -    OUT_BATCH(batch, (vme_context->vfe_state.urb_entry_size << 16) 
 -              | vme_context->vfe_state.curbe_allocation_size);                                /*URB Entry Allocation Size , CURBE Allocation Size*/
 -    OUT_BATCH(batch, 0);                                                                                      /*Disable Scoreboard*/
 -    OUT_BATCH(batch, 0);                                                                                      /*Disable Scoreboard*/
 -    OUT_BATCH(batch, 0);                                                                                      /*Disable Scoreboard*/
 -      
 +    struct gen6_vme_context *vme_context = encoder_context->vme_context;
 +    struct intel_batchbuffer *batch = encoder_context->base.batch;
 +    VAEncPictureParameterBufferH264 *pPicParameter = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
 +    VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
 +    VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
 +    int width_in_mbs = pSequenceParameter->picture_width_in_mbs;
 +    int height_in_mbs = pSequenceParameter->picture_height_in_mbs;
 +    int kernel_shader;
 +    bool allow_hwscore = true;
 +    int s;
 +
 +    for (s = 0; s < encode_state->num_slice_params_ext; s++) {
 +        pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[s]->buffer; 
 +        if ((pSliceParameter->macroblock_address % width_in_mbs)) {
 +              allow_hwscore = false;
 +              break;
 +      }
 +    }
 +    if ((pSliceParameter->slice_type == SLICE_TYPE_I) ||
 +      (pSliceParameter->slice_type == SLICE_TYPE_I)) {
 +      kernel_shader = VME_INTRA_SHADER;
 +   } else if ((pSliceParameter->slice_type == SLICE_TYPE_P) ||
 +      (pSliceParameter->slice_type == SLICE_TYPE_SP)) {
 +      kernel_shader = VME_INTER_SHADER;
 +   } else {
 +      kernel_shader = VME_BINTER_SHADER;
 +      if (!allow_hwscore)
 +           kernel_shader = VME_INTER_SHADER;
 +   }
 +    if (allow_hwscore)
 +      gen7_vme_walker_fill_vme_batchbuffer(ctx, 
 +                                  encode_state,
 +                                  width_in_mbs, height_in_mbs,
 +                                  kernel_shader,
 +                                  pPicParameter->pic_fields.bits.transform_8x8_mode_flag,
 +                                  encoder_context);
 +    else
 +      gen75_vme_fill_vme_batchbuffer(ctx, 
 +                                   encode_state,
 +                                   width_in_mbs, height_in_mbs,
 +                                   kernel_shader,
 +                                   pPicParameter->pic_fields.bits.transform_8x8_mode_flag,
 +                                   encoder_context);
 +
 +    intel_batchbuffer_start_atomic(batch, 0x1000);
 +    gen6_gpe_pipeline_setup(ctx, &vme_context->gpe_context, batch);
 +    BEGIN_BATCH(batch, 2);
 +    OUT_BATCH(batch, MI_BATCH_BUFFER_START | (2 << 6));
 +    OUT_RELOC(batch,
 +              vme_context->vme_batchbuffer.bo,
 +              I915_GEM_DOMAIN_COMMAND, 0, 
 +              0);
      ADVANCE_BATCH(batch);
  
 +    intel_batchbuffer_end_atomic(batch);      
  }
  
 -static void gen75_vme_curbe_load(VADriverContextP ctx,
 -                                 struct gen6_encoder_context *gen6_encoder_context,
 -                                 struct intel_batchbuffer *batch)
 +static VAStatus gen75_vme_prepare(VADriverContextP ctx, 
 +                                  struct encode_state *encode_state,
 +                                  struct intel_encoder_context *encoder_context)
  {
 -    struct gen6_vme_context *vme_context = &gen6_encoder_context->vme_context;
 -
 -    if (batch == NULL)
 -        batch = gen6_encoder_context->base.batch;
 -
 -    BEGIN_BATCH(batch, 4);
 -
 -    OUT_BATCH(batch, CMD_MEDIA_CURBE_LOAD | 2);
 -    OUT_BATCH(batch, 0);
 +    VAStatus vaStatus = VA_STATUS_SUCCESS;
 +    VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
 +    int is_intra = pSliceParameter->slice_type == SLICE_TYPE_I;
 +    VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
 +    struct gen6_vme_context *vme_context = encoder_context->vme_context;
 +
 +    if (!vme_context->h264_level ||
 +        (vme_context->h264_level != pSequenceParameter->level_idc)) {
 +      vme_context->h264_level = pSequenceParameter->level_idc;        
 +    } 
 +
 +    intel_vme_update_mbmv_cost(ctx, encode_state, encoder_context);
 +      
 +    /*Setup all the memory object*/
 +    gen75_vme_surface_setup(ctx, encode_state, is_intra, encoder_context);
 +    gen75_vme_interface_setup(ctx, encode_state, encoder_context);
 +    //gen75_vme_vme_state_setup(ctx, encode_state, is_intra, encoder_context);
 +    gen75_vme_constant_setup(ctx, encode_state, encoder_context);
  
 -    OUT_BATCH(batch, CURBE_TOTAL_DATA_LENGTH);
 -    OUT_RELOC(batch, vme_context->curbe.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
 +    /*Programing media pipeline*/
 +    gen75_vme_pipeline_programing(ctx, encode_state, encoder_context);
  
 -    ADVANCE_BATCH(batch);
 +    return vaStatus;
  }
  
 -static void gen75_vme_idrt(VADriverContextP ctx,
 -                           struct gen6_encoder_context *gen6_encoder_context,
 -                           struct intel_batchbuffer *batch)
 +static VAStatus gen75_vme_run(VADriverContextP ctx, 
 +                              struct encode_state *encode_state,
 +                              struct intel_encoder_context *encoder_context)
  {
 -    struct gen6_vme_context *vme_context = &gen6_encoder_context->vme_context;
 -
 -    if (batch == NULL)
 -        batch = gen6_encoder_context->base.batch;
 -
 -    BEGIN_BATCH(batch, 4);
 +    struct intel_batchbuffer *batch = encoder_context->base.batch;
  
 -    OUT_BATCH(batch, CMD_MEDIA_INTERFACE_LOAD | 2);   
 -    OUT_BATCH(batch, 0);
 -    OUT_BATCH(batch, GEN6_VME_KERNEL_NUMBER * sizeof(struct gen6_interface_descriptor_data));
 -    OUT_RELOC(batch, vme_context->idrt.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
 +    intel_batchbuffer_flush(batch);
  
 -    ADVANCE_BATCH(batch);
 +    return VA_STATUS_SUCCESS;
  }
  
 -static int gen75_vme_media_object(VADriverContextP ctx, 
 -                                  struct encode_state *encode_state,
 -                                  int mb_x, int mb_y,
 -                                  int kernel, unsigned int mb_intra_ub,
 -                                  struct gen6_encoder_context *gen6_encoder_context,
 -                                  struct intel_batchbuffer *batch)
 +static VAStatus gen75_vme_stop(VADriverContextP ctx, 
 +                               struct encode_state *encode_state,
 +                               struct intel_encoder_context *encoder_context)
  {
 -    struct i965_driver_data *i965 = i965_driver_data(ctx);
 -    struct object_surface *obj_surface = SURFACE(encode_state->current_render_target);
 -    int mb_width = ALIGN(obj_surface->orig_width, 16) / 16;
 -    int len_in_dowrds = 8;
 -
 -    if (batch == NULL)
 -        batch = gen6_encoder_context->base.batch;
 -
 -    BEGIN_BATCH(batch, len_in_dowrds);
 -    
 -    OUT_BATCH(batch, CMD_MEDIA_OBJECT | (len_in_dowrds - 2));
 -    OUT_BATCH(batch, kernel);         /*Interface Descriptor Offset*/ 
 -    OUT_BATCH(batch, 0);
 -    OUT_BATCH(batch, 0);
 -    OUT_BATCH(batch, 0);
 -    OUT_BATCH(batch, 0);
 -   
 -    /*inline data */
 -    OUT_BATCH(batch, mb_width << 16 | mb_y << 8 | mb_x);                      /*M0.0 Refrence0 X,Y, not used in Intra*/
 +    return VA_STATUS_SUCCESS;
 +}
  
 -      OUT_BATCH(batch, ((mb_intra_ub << 8) | 0));
 -    ADVANCE_BATCH(batch);
 +static VAStatus
 +gen75_vme_pipeline(VADriverContextP ctx,
 +                   VAProfile profile,
 +                   struct encode_state *encode_state,
 +                   struct intel_encoder_context *encoder_context)
 +{
 +    gen75_vme_media_init(ctx, encoder_context);
 +    gen75_vme_prepare(ctx, encode_state, encoder_context);
 +    gen75_vme_run(ctx, encode_state, encoder_context);
 +    gen75_vme_stop(ctx, encode_state, encoder_context);
  
 -    return len_in_dowrds * 4;
 +    return VA_STATUS_SUCCESS;
  }
  
 -static void gen75_vme_media_init(VADriverContextP ctx, struct gen6_encoder_context *gen6_encoder_context)
 +static void
 +gen75_vme_mpeg2_output_buffer_setup(VADriverContextP ctx,
 +                                    struct encode_state *encode_state,
 +                                    int index,
 +                                    int is_intra,
 +                                    struct intel_encoder_context *encoder_context)
 +
  {
      struct i965_driver_data *i965 = i965_driver_data(ctx);
 -    struct gen6_vme_context *vme_context = &gen6_encoder_context->vme_context;
 -    dri_bo *bo;
 -
 -    /* constant buffer */
 -    dri_bo_unreference(vme_context->curbe.bo);
 -    bo = dri_bo_alloc(i965->intel.bufmgr,
 -                      "Buffer",
 -                      CURBE_TOTAL_DATA_LENGTH, 64);
 -    assert(bo);
 -    vme_context->curbe.bo = bo;
 -
 -    dri_bo_unreference(vme_context->surface_state_binding_table.bo);
 -    bo = dri_bo_alloc(i965->intel.bufmgr,
 -                      "surface state & binding table",
 -                      (SURFACE_STATE_PADDED_SIZE + sizeof(unsigned int)) * MAX_MEDIA_SURFACES_GEN6,
 -                      4096);
 -    assert(bo);
 -    vme_context->surface_state_binding_table.bo = bo;
 -
 -    /* interface descriptor remapping table */
 -    dri_bo_unreference(vme_context->idrt.bo);
 -    bo = dri_bo_alloc(i965->intel.bufmgr, 
 -                      "Buffer", 
 -                      MAX_INTERFACE_DESC_GEN6 * sizeof(struct gen6_interface_descriptor_data), 16);
 -    assert(bo);
 -    vme_context->idrt.bo = bo;
 -
 -    /* VME output buffer */
 -    dri_bo_unreference(vme_context->vme_output.bo);
 -    vme_context->vme_output.bo = NULL;
 +    struct gen6_vme_context *vme_context = encoder_context->vme_context;
 +    VAEncSequenceParameterBufferMPEG2 *seq_param = (VAEncSequenceParameterBufferMPEG2 *)encode_state->seq_param_ext->buffer;
 +    int width_in_mbs = ALIGN(seq_param->picture_width, 16) / 16;
 +    int height_in_mbs = ALIGN(seq_param->picture_height, 16) / 16;
  
 -    /* VME state */
 -    dri_bo_unreference(vme_context->vme_state.bo);
 -    vme_context->vme_state.bo = NULL;
 +    vme_context->vme_output.num_blocks = width_in_mbs * height_in_mbs;
 +    vme_context->vme_output.pitch = 16; /* in bytes, always 16 */
  
 -    vme_context->vfe_state.max_num_threads = 60 - 1;
 -    vme_context->vfe_state.num_urb_entries = 16;
 -    vme_context->vfe_state.gpgpu_mode = 0;
 -    vme_context->vfe_state.urb_entry_size = 59 - 1;
 -    vme_context->vfe_state.curbe_allocation_size = CURBE_ALLOCATION_SIZE - 1;
 +    if (is_intra)
 +        vme_context->vme_output.size_block = INTRA_VME_OUTPUT_IN_BYTES * 2;
 +    else
 +        vme_context->vme_output.size_block = INTRA_VME_OUTPUT_IN_BYTES * 24;
 +    /*
 +     * Inter MV . 32-byte Intra search + 16 IME info + 128 IME MV + 32 IME Ref
 +     * + 16 FBR Info + 128 FBR MV + 32 FBR Ref.
 +     * 16 * (2 + 2 * (1 + 8 + 2))= 16 * 24.
 +     */
 +
 +    vme_context->vme_output.bo = dri_bo_alloc(i965->intel.bufmgr, 
 +                                              "VME output buffer",
 +                                              vme_context->vme_output.num_blocks * vme_context->vme_output.size_block,
 +                                              0x1000);
 +    assert(vme_context->vme_output.bo);
 +    vme_context->vme_buffer_suface_setup(ctx,
 +                                         &vme_context->gpe_context,
 +                                         &vme_context->vme_output,
 +                                         BINDING_TABLE_OFFSET(index),
 +                                         SURFACE_STATE_OFFSET(index));
  }
  
 -#define               INTRA_PRED_AVAIL_FLAG_AE        0x60
 -#define               INTRA_PRED_AVAIL_FLAG_B         0x10
 -#define               INTRA_PRED_AVAIL_FLAG_C         0x8
 -#define               INTRA_PRED_AVAIL_FLAG_D         0x4
 -#define               INTRA_PRED_AVAIL_FLAG_BCD_MASK  0x1C
 +static void
 +gen75_vme_mpeg2_output_vme_batchbuffer_setup(VADriverContextP ctx,
 +                                             struct encode_state *encode_state,
 +                                             int index,
 +                                             struct intel_encoder_context *encoder_context)
  
 -static void gen75_vme_pipeline_programing(VADriverContextP ctx, 
 -                                         struct encode_state *encode_state,
 -                                         struct gen6_encoder_context *gen6_encoder_context)
  {
      struct i965_driver_data *i965 = i965_driver_data(ctx);
 -    struct intel_batchbuffer *main_batch = gen6_encoder_context->base.batch;
 -    VAEncSliceParameterBuffer *pSliceParameter = (VAEncSliceParameterBuffer *)encode_state->slice_params[0]->buffer;
 -    VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param->buffer;
 -    int is_intra = pSliceParameter->slice_flags.bits.is_intra;
 -    int width_in_mbs = pSequenceParameter->picture_width_in_mbs;
 -    int height_in_mbs = pSequenceParameter->picture_height_in_mbs;
 -    int emit_new_state = 1, object_len_in_bytes;
 -    int x, y;
 -    unsigned int mb_intra_ub; 
 -    struct intel_batchbuffer *batch = intel_batchbuffer_new(&i965->intel, I915_EXEC_RENDER, width_in_mbs * height_in_mbs * 8 * 4 + 0x200);
 -
 -    intel_batchbuffer_start_atomic(batch, width_in_mbs * height_in_mbs * 8 * 4 + 0x100);
 +    struct gen6_vme_context *vme_context = encoder_context->vme_context;
 +    VAEncSequenceParameterBufferMPEG2 *seq_param = (VAEncSequenceParameterBufferMPEG2 *)encode_state->seq_param_ext->buffer;
 +    int width_in_mbs = ALIGN(seq_param->picture_width, 16) / 16;
 +    int height_in_mbs = ALIGN(seq_param->picture_height, 16) / 16;
 +
 +    vme_context->vme_batchbuffer.num_blocks = width_in_mbs * height_in_mbs + 1;
 +    vme_context->vme_batchbuffer.size_block = 64; /* 4 OWORDs */
 +    vme_context->vme_batchbuffer.pitch = 16;
 +    vme_context->vme_batchbuffer.bo = dri_bo_alloc(i965->intel.bufmgr, 
 +                                                   "VME batchbuffer",
 +                                                   vme_context->vme_batchbuffer.num_blocks * vme_context->vme_batchbuffer.size_block,
 +                                                   0x1000);
 +    vme_context->vme_buffer_suface_setup(ctx,
 +                                         &vme_context->gpe_context,
 +                                         &vme_context->vme_batchbuffer,
 +                                         BINDING_TABLE_OFFSET(index),
 +                                         SURFACE_STATE_OFFSET(index));
 +}
  
 -    for(y = 0; y < height_in_mbs; y++){
 -        for(x = 0; x < width_in_mbs; x++){    
 -          mb_intra_ub = 0;
 -          if (x != 0) {
 -              mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
 -          }
 -          if (y != 0) {
 -              mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
 -              if (x != 0)
 -                      mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
 -              if (x != (width_in_mbs -1))
 -                      mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
 -          }
 +static VAStatus
 +gen75_vme_mpeg2_surface_setup(VADriverContextP ctx, 
 +                              struct encode_state *encode_state,
 +                              int is_intra,
 +                              struct intel_encoder_context *encoder_context)
 +{
 +    struct object_surface *obj_surface;
  
 -            if (emit_new_state) {
 -                /*Step1: MI_FLUSH/PIPE_CONTROL*/
 -                intel_batchbuffer_emit_mi_flush(batch);
 +    /*Setup surfaces state*/
 +    /* current picture for encoding */
 +    obj_surface = encode_state->input_yuv_object;
 +    gen75_vme_source_surface_state(ctx, 0, obj_surface, encoder_context);
 +    gen75_vme_media_source_surface_state(ctx, 4, obj_surface, encoder_context);
 +    gen75_vme_media_chroma_source_surface_state(ctx, 6, obj_surface, encoder_context);
  
 -                /*Step2: State command PIPELINE_SELECT*/
 -                gen75_vme_pipeline_select(ctx, gen6_encoder_context, batch);
 +    if (!is_intra) {
 +        /* reference 0 */
 +        obj_surface = encode_state->reference_objects[0];
 +        if (obj_surface->bo != NULL)
 +            gen75_vme_source_surface_state(ctx, 1, obj_surface, encoder_context);
 +
 +        /* reference 1 */
 +        obj_surface = encode_state->reference_objects[1];
 +        if (obj_surface && obj_surface->bo != NULL) 
 +            gen75_vme_source_surface_state(ctx, 2, obj_surface, encoder_context);
 +    }
  
 -                /*Step3: State commands configuring pipeline states*/
 -                gen75_vme_state_base_address(ctx, gen6_encoder_context, batch);
 -                gen75_vme_vfe_state(ctx, gen6_encoder_context, batch);
 -                gen75_vme_curbe_load(ctx, gen6_encoder_context, batch);
 -                gen75_vme_idrt(ctx, gen6_encoder_context, batch);
 +    /* VME output */
 +    gen75_vme_mpeg2_output_buffer_setup(ctx, encode_state, 3, is_intra, encoder_context);
 +    gen75_vme_mpeg2_output_vme_batchbuffer_setup(ctx, encode_state, 5, encoder_context);
  
 -                emit_new_state = 0;
 -            }
 +    return VA_STATUS_SUCCESS;
 +}
  
 -            /*Step4: Primitive commands*/
 -            object_len_in_bytes = gen75_vme_media_object(ctx, encode_state, x, y, is_intra ? VME_INTRA_SHADER : VME_INTER_SHADER, mb_intra_ub, gen6_encoder_context, batch);
 +static void
 +gen75_vme_mpeg2_fill_vme_batchbuffer(VADriverContextP ctx, 
 +                                     struct encode_state *encode_state,
 +                                     int mb_width, int mb_height,
 +                                     int kernel,
 +                                     int transform_8x8_mode_flag,
 +                                     struct intel_encoder_context *encoder_context)
 +{
 +    struct gen6_vme_context *vme_context = encoder_context->vme_context;
 +    int mb_x = 0, mb_y = 0;
 +    int i, s, j;
 +    unsigned int *command_ptr;
 +
 +
 +    dri_bo_map(vme_context->vme_batchbuffer.bo, 1);
 +    command_ptr = vme_context->vme_batchbuffer.bo->virtual;
 +
 +    for (s = 0; s < encode_state->num_slice_params_ext; s++) {
 +        VAEncSliceParameterBufferMPEG2 *slice_param = (VAEncSliceParameterBufferMPEG2 *)encode_state->slice_params_ext[s]->buffer;
 +
 +        for (j = 0; j < encode_state->slice_params_ext[s]->num_elements; j++) {
 +            int slice_mb_begin = slice_param->macroblock_address;
 +            int slice_mb_number = slice_param->num_macroblocks;
 +            unsigned int mb_intra_ub;
 +            int slice_mb_x = slice_param->macroblock_address % mb_width;
 +
 +            for (i = 0; i < slice_mb_number;) {
 +                int mb_count = i + slice_mb_begin;    
 +
 +                mb_x = mb_count % mb_width;
 +                mb_y = mb_count / mb_width;
 +                mb_intra_ub = 0;
 +
 +                if (mb_x != 0) {
 +                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
 +                }
 +
 +                if (mb_y != 0) {
 +                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
 +
 +                    if (mb_x != 0)
 +                        mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
 +
 +                    if (mb_x != (mb_width -1))
 +                        mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
 +                }
 +
 +                if (i < mb_width) {
 +                    if (i == 0)
 +                        mb_intra_ub &= ~(INTRA_PRED_AVAIL_FLAG_AE);
 +
 +                    mb_intra_ub &= ~(INTRA_PRED_AVAIL_FLAG_BCD_MASK);
 +
 +                    if ((i == (mb_width - 1)) && slice_mb_x) {
 +                        mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
 +                    }
 +                }
 +              
 +                if ((i == mb_width) && slice_mb_x) {
 +                    mb_intra_ub &= ~(INTRA_PRED_AVAIL_FLAG_D);
 +                }
 +
 +                *command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
 +                *command_ptr++ = kernel;
 +                *command_ptr++ = 0;
 +                *command_ptr++ = 0;
 +                *command_ptr++ = 0;
 +                *command_ptr++ = 0;
 +   
 +                /*inline data */
 +                *command_ptr++ = (mb_width << 16 | mb_y << 8 | mb_x);
 +                *command_ptr++ = ( (1 << 16) | transform_8x8_mode_flag | (mb_intra_ub << 8));
  
 -            if (intel_batchbuffer_check_free_space(batch, object_len_in_bytes) == 0) {
 -                assert(0);
 -                intel_batchbuffer_end_atomic(batch);  
 -                intel_batchbuffer_flush(batch);
 -                emit_new_state = 1;
 -                intel_batchbuffer_start_atomic(batch, 0x1000);
 +                i += 1;
              }
 +
 +            slice_param++;
          }
      }
  
@@@ -286,10 -228,10 +286,9 @@@ void hsw_veb_iecp_ace_table(VADriverCon
  void hsw_veb_iecp_tcc_table(VADriverContextP ctx, struct intel_vebox_context *proc_ctx)
  {
      unsigned int *p_table = (unsigned int*)(proc_ctx->iecp_state_table.ptr + 168);
 -    /*
 -      VAProcFilterParameterBuffer * tcc_param =
 -              (VAProcFilterParameterBuffer *) proc_ctx->filter_iecp_tcc;
 -   */
 +//    VAProcFilterParameterBuffer * tcc_param =
 +//            (VAProcFilterParameterBuffer *) proc_ctx->filter_iecp_tcc;
 +
     if(!(proc_ctx->filters_mask & VPP_IECP_TCC)){ 
          memset(p_table, 0, 11 * 4);
      }else{
@@@ -136,10 -120,7 +136,9 @@@ struct intel_vebox_contex
      void * filter_iecp_tcc;
      void * filter_iecp_amp;
      void * filter_iecp_csc;
 -    */
 +
 +    unsigned int  filter_iecp_amp_num_elements;
 +    unsigned char format_convert_flags;
  };
  
  VAStatus gen75_vebox_process_picture(VADriverContextP ctx,
diff --cc src/gen7_mfd.c
   *
   */
  
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
- #include <assert.h>
 -#ifndef HAVE_GEN_AVC_SURFACE
 -#define HAVE_GEN_AVC_SURFACE 1
 -#endif
 -
+ #include "sysdeps.h"
++
 +#include <va/va_dec_jpeg.h>
 +
  #include "intel_batchbuffer.h"
  #include "intel_driver.h"
  #include "i965_defines.h"
  #include "i965_drv_video.h"
  #include "i965_decoder_utils.h"
   *    Xiang Haihao <haihao.xiang@intel.com>
   *
   */
--#include <stdio.h>
--#include <stdlib.h>
--#include <string.h>
--#include <assert.h>
 -
 -#ifndef HAVE_GEN_AVC_SURFACE
 -#define HAVE_GEN_AVC_SURFACE 1
 -#endif
++#include "sysdeps.h"
  
  #include "intel_batchbuffer.h"
  #include "intel_driver.h"
@@@ -46,8 -45,47 +46,7 @@@ struct gen_buffer 
      int         valid;
  };
  
 -#if HAVE_GEN_AVC_SURFACE
 -
 -static pthread_mutex_t free_avc_surface_lock = PTHREAD_MUTEX_INITIALIZER;
 -
 -typedef struct gen_avc_surface GenAvcSurface;
 -struct gen_avc_surface
 -{
 -    dri_bo *dmv_top;
 -    dri_bo *dmv_bottom;
 -    int dmv_bottom_flag;
 -};
 -
 -static void 
 -gen_free_avc_surface(void **data)
 -{
 -    GenAvcSurface *avc_surface;
 -
 -    pthread_mutex_lock(&free_avc_surface_lock);
 -
 -    avc_surface = *data;
 -
 -    if (!avc_surface) {
 -        pthread_mutex_unlock(&free_avc_surface_lock);
 -        return;
 -    }
 -
 -
 -    dri_bo_unreference(avc_surface->dmv_top);
 -    avc_surface->dmv_top = NULL;
 -    dri_bo_unreference(avc_surface->dmv_bottom);
 -    avc_surface->dmv_bottom = NULL;
 -
 -    free(avc_surface);
 -    *data = NULL;
 -
 -    pthread_mutex_unlock(&free_avc_surface_lock);
 -}
 -
 -#endif
--
 -extern struct hw_context *
 -gen75_dec_hw_context_init(VADriverContextP ctx, VAProfile profile);
 +struct hw_context *
 +gen75_dec_hw_context_init(VADriverContextP ctx, struct object_config *obj_config);
  
  #endif /* I965_DECODER_H */
   * DEALINGS IN THE SOFTWARE.
   */
  
- #include <assert.h>
- #include <stddef.h>
- #include <string.h>
+ #include "sysdeps.h"
++
  #include <alloca.h>
++
  #include "intel_batchbuffer.h"
 -#include "i965_decoder_utils.h"
  #include "i965_drv_video.h"
 +#include "i965_decoder_utils.h"
  #include "i965_defines.h"
  
  /* Set reference surface if backing store exists */
@@@ -216,8 -200,9 +216,10 @@@ avc_get_first_mb_bit_offset_with_epb
      uint8_t *buf;
      int ret;
  
-     buf_size  = slice_param->slice_data_bit_offset / 8;
-     data_size = slice_param->slice_data_size - slice_param->slice_data_offset;
+     header_size = slice_param->slice_data_bit_offset / 8;
+     data_size   = slice_param->slice_data_size - slice_param->slice_data_offset;
+     buf_size    = (header_size * 3 + 1) / 2; // Max possible header size (x1.5)
++
      if (buf_size > data_size)
          buf_size = data_size;
  
      );
      assert(ret == 0);
  
-     for (i = 2, n = 0; i < buf_size; i++) {
-         if (!buf[i - 2] && !buf[i - 1] && buf[i] == 3)
-             i += 2, n++;
+     for (i = 2, j = 2, n = 0; i < buf_size && j < header_size; i++, j++) {
+         if (buf[i] == 0x03 && buf[i - 1] == 0x00 && buf[i - 2] == 0x00)
+             i += 2, j++, n++;
      }
++
      out_slice_data_bit_offset = in_slice_data_bit_offset + n * 8;
  
      if (mode_flag == ENTROPY_CABAC)
index d692433,144505d..c7163ae
mode 100755,100644..100755
index 19c298d,297ab7c..deee9c8
mode 100755,100644..100755
@@@ -2362,10 -1884,10 +2354,12 @@@ i965_CreateImage(VADriverContextP ctx
          image->component_order[1]  = 'G';
          image->component_order[2]  = 'B';
          break;
+     case VA_FOURCC('A','R','G','B'):
+     case VA_FOURCC('A','B','G','R'):
      case VA_FOURCC('B','G','R','A'):
      case VA_FOURCC('R','G','B','A'):
 +    case VA_FOURCC('B','G','R','X'):
 +    case VA_FOURCC('R','G','B','X'):
          image->num_planes = 1;
          image->pitches[0] = width * 4;
          image->offsets[0] = 0;
@@@ -303,8 -253,6 +303,7 @@@ struct i965_driver_dat
      VADisplayAttribute *display_attributes;
      unsigned int num_display_attributes;
      VADisplayAttribute *rotation_attrib;
-     
 +    VAContextID current_context_id;
  
      /* VA/DRI (X11) specific data */
      struct va_dri_output *dri_output;
@@@ -346,35 -294,7 +345,34 @@@ i965_check_alloc_surface_bo(VADriverCon
                              unsigned int fourcc,
                              unsigned int subsampling);
  
 +int
 +va_enc_packed_type_to_idx(int packed_type);
 +
 +/* reserve 2 bytes for internal use */
 +#define CODED_H264      0
 +#define CODED_MPEG2     1
 +
 +#define H264_DELIMITER0 0x00
 +#define H264_DELIMITER1 0x00
 +#define H264_DELIMITER2 0x00
 +#define H264_DELIMITER3 0x00
 +#define H264_DELIMITER4 0x00
 +
 +#define MPEG2_DELIMITER0        0x00
 +#define MPEG2_DELIMITER1        0x00
 +#define MPEG2_DELIMITER2        0x00
 +#define MPEG2_DELIMITER3        0x00
 +#define MPEG2_DELIMITER4        0xb0
 +
 +/*
 + * Driver-side wrapper around a VACodedBufferSegment: the public segment
 + * comes first, followed by two single-byte driver fields.
 + */
 +struct i965_coded_buffer_segment
 +{
 +    VACodedBufferSegment base;  /* public libva segment descriptor */
 +    unsigned char mapped;       /* NOTE(review): presumably set once the buffer has been mapped - confirm against i965_MapBuffer */
 +    unsigned char codec;        /* NOTE(review): presumably one of the CODED_* values - confirm */
 +};
 +
 +#define I965_CODEDBUFFER_HEADER_SIZE   ALIGN(sizeof(struct i965_coded_buffer_segment), 64)
  
  extern VAStatus i965_MapBuffer(VADriverContextP ctx,
                VABufferID buf_id,       /* in */
                void **pbuf);            /* out */
   * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
   */
  
- #include "config.h"
- #include <stdlib.h>
- #include <string.h>
- #include <assert.h>
+ #include "sysdeps.h"
++
  #include <va/va_dricommon.h>
++
  #include "i965_drv_video.h"
  #include "i965_output_dri.h"
  #include "dso_utils.h"
@@@ -307,43 -294,7 +307,42 @@@ static struct pp_module pp_modules_gen5
      
          pp_plx_load_save_plx_initialize,
      },
 - 
 +
 +    {
 +        {
 +            "PA_PL3 module",
 +            PP_PA_LOAD_SAVE_PL3,
 +            pp_pa_load_save_pl3_gen5,
 +            sizeof(pp_pa_load_save_pl3_gen5),
 +            NULL,
 +        },
 +    
 +        pp_plx_load_save_plx_initialize,
 +    },
 +
 +    {
 +        {
 +            "RGBX_NV12 module",
 +            PP_RGBX_LOAD_SAVE_NV12,
 +            pp_rgbx_load_save_nv12_gen5,
 +            sizeof(pp_rgbx_load_save_nv12_gen5),
 +            NULL,
 +        },
 +    
 +        pp_plx_load_save_plx_initialize,
 +    },
 +            
 +    {
 +        {
 +            "NV12_RGBX module",
 +            PP_NV12_LOAD_SAVE_RGBX,
 +            pp_nv12_load_save_rgbx_gen5,
 +            sizeof(pp_nv12_load_save_rgbx_gen5),
 +            NULL,
 +        },
 +    
 +        pp_plx_load_save_plx_initialize,
 +    },
-                     
  };
  
  static const uint32_t pp_null_gen6[][4] = {
@@@ -458,7 -397,7 +457,7 @@@ static struct pp_module pp_modules_gen6
      {
          {
              "PL3_PL3",
--            PP_PL3_LOAD_SAVE_N12,
++            PP_PL3_LOAD_SAVE_PL3,
              pp_pl3_load_save_pl3_gen6,
              sizeof(pp_pl3_load_save_pl3_gen6),
              NULL,
      
          pp_plx_load_save_plx_initialize,
      },
-     
++
      {
          {
              "PL3_PA module",
      
          pp_plx_load_save_plx_initialize,
      },
-     
++
      {
          {
              "PA_NV12 module",
@@@ -718,7 -607,7 +717,7 @@@ static struct pp_module pp_modules_gen7
      {
          {
              "PL3_PL3",
--            PP_PL3_LOAD_SAVE_N12,
++            PP_PL3_LOAD_SAVE_PL3,
              pp_pl3_load_save_pl3_gen7,
              sizeof(pp_pl3_load_save_pl3_gen7),
              NULL,
@@@ -953,7 -797,7 +952,7 @@@ static struct pp_module pp_modules_gen7
      {
          {
              "PL3_PL3",
--            PP_PL3_LOAD_SAVE_N12,
++            PP_PL3_LOAD_SAVE_PL3,
              pp_pl3_load_save_pl3_gen75,
              sizeof(pp_pl3_load_save_pl3_gen75),
              NULL,
  
          gen7_pp_nv12_dn_initialize,
      },
++
      {
          {
              "NV12_PA module",
@@@ -3644,7 -3176,7 +3644,6 @@@ gen7_pp_nv12_dndi_initialize(VADriverCo
      sampler_dndi[index].dw5.sdi_fallback_mode_2_constant = 0;
      sampler_dndi[index].dw5.sdi_fallback_mode_1_t2_constant = 0;
      sampler_dndi[index].dw5.sdi_fallback_mode_1_t1_constant = 0;
--
      sampler_dndi[index].dw6.dn_enable = 0;
      sampler_dndi[index].dw6.di_enable = 1;
      sampler_dndi[index].dw6.di_partial = 0;
@@@ -3924,14 -3444,14 +3923,14 @@@ gen7_pp_nv12_dn_initialize(VADriverCont
  
  static VAStatus
  ironlake_pp_initialize(
--    VADriverContextP   ctx,
++    VADriverContextP ctx,
      struct i965_post_processing_context *pp_context,
      const struct i965_surface *src_surface,
      const VARectangle *src_rect,
      struct i965_surface *dst_surface,
      const VARectangle *dst_rect,
--    int                pp_index,
 -    void  *filter_param
++    int pp_index,
 +    void *filter_param
  )
  {
      VAStatus va_status;
@@@ -4055,14 -3575,14 +4054,14 @@@ ironlake_post_processing
  
  static VAStatus
  gen6_pp_initialize(
--    VADriverContextP   ctx,
++    VADriverContextP ctx,
      struct i965_post_processing_context *pp_context,
      const struct i965_surface *src_surface,
      const VARectangle *src_rect,
      struct i965_surface *dst_surface,
      const VARectangle *dst_rect,
--    int                pp_index,
 -    void * filter_param
++    int pp_index,
 +    void *filter_param
  )
  {
      VAStatus va_status;
                                            filter_param);
      else
          va_status = VA_STATUS_ERROR_UNIMPLEMENTED;
 - 
 +
      calculate_boundary_block_mask(pp_context, dst_rect);
-     
      return va_status;
  }
  
@@@ -4458,14 -3978,14 +4457,14 @@@ gen6_pp_pipeline_setup(VADriverContext
  
  static VAStatus
  gen6_post_processing(
--    VADriverContextP   ctx,
++    VADriverContextP ctx,
      struct i965_post_processing_context *pp_context,
      const struct i965_surface *src_surface,
      const VARectangle *src_rect,
      struct i965_surface *dst_surface,
      const VARectangle *dst_rect,
--    int                pp_index,
-     void * filter_param
++    int pp_index,
+     void *filter_param
  )
  {
      VAStatus va_status;
@@@ -4498,15 -4044,19 +4497,15 @@@ i965_post_processing_internal
      void *filter_param
  )
  {
-     struct i965_driver_data *i965 = i965_driver_data(ctx);
      VAStatus va_status;
+     struct i965_driver_data *i965 = i965_driver_data(ctx);
  
 -    if(IS_HASWELL(i965->intel.device_id) && 
 -        pp_index == PP_NV12_DNDI){
 -        va_status = gen75_post_processing(ctx, pp_context, src_surface, src_rect, dst_surface, dst_rect, pp_index, filter_param);
 -    }else if (IS_GEN6(i965->intel.device_id) ||
 -              IS_GEN7(i965->intel.device_id)){
 +    if (IS_GEN6(i965->intel.device_id) ||
 +        IS_GEN7(i965->intel.device_id))
          va_status = gen6_post_processing(ctx, pp_context, src_surface, src_rect, dst_surface, dst_rect, pp_index, filter_param);
 -    }else{
 +    else
          va_status = ironlake_post_processing(ctx, pp_context, src_surface, src_rect, dst_surface, dst_rect, pp_index, filter_param);
 -    }
 -
 +    
      return va_status;
  }
  
@@@ -4765,11 -4267,11 +4764,11 @@@ i965_post_processing
  }       
  
  static VAStatus
 -i965_image_pl3_processing(VADriverContextP ctx,
 -                          const struct i965_surface *src_surface,
 -                          const VARectangle *src_rect,
 -                          struct i965_surface *dst_surface,
 -                          const VARectangle *dst_rect)
 +i965_image_pl1_rgbx_processing(VADriverContextP ctx,
-                           const struct i965_surface *src_surface,
-                           const VARectangle *src_rect,
-                           struct i965_surface *dst_surface,
-                           const VARectangle *dst_rect)
++                               const struct i965_surface *src_surface,
++                               const VARectangle *src_rect,
++                               struct i965_surface *dst_surface,
++                               const VARectangle *dst_rect)
  {
      struct i965_driver_data *i965 = i965_driver_data(ctx);
      struct i965_post_processing_context *pp_context = i965->pp_context;
@@@ -4853,50 -4326,35 +4852,50 @@@ i965_image_pl2_processing(VADriverConte
      struct i965_driver_data *i965 = i965_driver_data(ctx);
      struct i965_post_processing_context *pp_context = i965->pp_context;
      int fourcc = pp_get_surface_fourcc(ctx, dst_surface);
 +    VAStatus vaStatus = VA_STATUS_ERROR_UNIMPLEMENTED;
  
      if (fourcc == VA_FOURCC('N', 'V', '1', '2')) {
 -        i965_post_processing_internal(ctx, i965->pp_context,
 -                                      src_surface,
 -                                      src_rect,
 -                                      dst_surface,
 -                                      dst_rect,
 -                                      PP_NV12_LOAD_SAVE_N12,
 -                                      NULL);
 +        vaStatus = i965_post_processing_internal(ctx, i965->pp_context,
 +                                                 src_surface,
 +                                                 src_rect,
 +                                                 dst_surface,
 +                                                 dst_rect,
 +                                                 PP_NV12_LOAD_SAVE_N12,
 +                                                 NULL);
      } else if (fourcc == VA_FOURCC('I', 'M', 'C', '1') || 
 -               fourcc == VA_FOURCC('I', 'M', 'C', '3') ||
 +               fourcc == VA_FOURCC('I', 'M', 'C', '3') || 
                 fourcc == VA_FOURCC('Y', 'V', '1', '2') ||
                 fourcc == VA_FOURCC('I', '4', '2', '0') ) {
 -        i965_post_processing_internal(ctx, i965->pp_context,
 -                                      src_surface,
 -                                      src_rect,
 -                                      dst_surface,
 -                                      dst_rect,
 -                                      PP_NV12_LOAD_SAVE_PL3,
 -                                      NULL);
 +        vaStatus = i965_post_processing_internal(ctx, i965->pp_context,
 +                                                 src_surface,
 +                                                 src_rect,
 +                                                 dst_surface,
 +                                                 dst_rect,
 +                                                 PP_NV12_LOAD_SAVE_PL3,
 +                                                 NULL);
      } else if (fourcc == VA_FOURCC('Y', 'U', 'Y', '2') ||
                 fourcc == VA_FOURCC('U', 'Y', 'V', 'Y')) {
 -        i965_post_processing_internal(ctx, i965->pp_context,
 -                                      src_surface,
 -                                      src_rect,
 -                                      dst_surface,
 -                                      dst_rect,
 -                                      PP_NV12_LOAD_SAVE_PA,
 -                                      NULL);
 +        vaStatus = i965_post_processing_internal(ctx, i965->pp_context,
 +                                                 src_surface,
 +                                                 src_rect,
 +                                                 dst_surface,
 +                                                 dst_rect,
 +                                                 PP_NV12_LOAD_SAVE_PA,
 +                                                     NULL);
 +    } else if (fourcc == VA_FOURCC('B', 'G', 'R', 'X') || 
 +               fourcc == VA_FOURCC('B', 'G', 'R', 'A') ||
 +               fourcc == VA_FOURCC('R', 'G', 'B', 'X') ||
 +               fourcc == VA_FOURCC('R', 'G', 'B', 'A') ) {
 +        vaStatus = i965_post_processing_internal(ctx, i965->pp_context,
-                                       src_surface,
-                                       src_rect,
-                                       dst_surface,
-                                       dst_rect,
-                                       PP_NV12_LOAD_SAVE_RGBX,
-                                       NULL);
++                                                 src_surface,
++                                                 src_rect,
++                                                 dst_surface,
++                                                 dst_rect,
++                                                 PP_NV12_LOAD_SAVE_RGBX,
++                                                 NULL);
 +    } else {
 +        assert(0);
 +        return VA_STATUS_ERROR_UNKNOWN;
      }
  
      intel_batchbuffer_flush(pp_context->batch);
@@@ -4923,18 -4381,8 +4922,16 @@@ i965_image_pl1_processing(VADriverConte
                                        dst_rect,
                                        PP_PA_LOAD_SAVE_NV12,
                                        NULL);
--    }
-     else if (fourcc == VA_FOURCC_YV12) {
 -    else {
++    } else if (fourcc == VA_FOURCC_YV12) {
 +        i965_post_processing_internal(ctx, i965->pp_context,
 +                                      src_surface,
 +                                      src_rect,
 +                                      dst_surface,
 +                                      dst_rect,
 +                                      PP_PA_LOAD_SAVE_PL3,
 +                                      NULL);
 +
-     }
-     else {
++    } else {
          return VA_STATUS_ERROR_UNKNOWN;
      }
  
@@@ -5312,107 -4588,7 +5309,109 @@@ i965_proc_picture(VADriverContextP ctx
          }
      }
  
 -    return True;
 +    obj_surface = SURFACE(proc_state->current_render_target);
 +    assert(obj_surface);
 +    
 +    if (!obj_surface)
 +        goto error;
 +
 +    int csc_needed = 0;
 +    if (obj_surface->fourcc && obj_surface->fourcc !=  VA_FOURCC('N','V','1','2')){
 +        csc_needed = 1;
 +        out_surface_id = VA_INVALID_ID;
 +        status = i965_CreateSurfaces(ctx,
 +                                     obj_surface->orig_width,
 +                                     obj_surface->orig_height,
 +                                     VA_RT_FORMAT_YUV420, 
 +                                     1,
 +                                     &out_surface_id);
 +        assert(status == VA_STATUS_SUCCESS);
 +        tmp_surfaces[num_tmp_surfaces++] = out_surface_id;
 +        struct object_surface *csc_surface = SURFACE(out_surface_id);
 +        assert(csc_surface);
 +        i965_check_alloc_surface_bo(ctx, csc_surface, !!tiling, VA_FOURCC('N','V','1','2'), SUBSAMPLE_YUV420);
 +        dst_surface.base = (struct object_base *)csc_surface;
 +    } else {
 +        i965_check_alloc_surface_bo(ctx, obj_surface, !!tiling, VA_FOURCC('N','V','1','2'), SUBSAMPLE_YUV420);
 +        dst_surface.base = (struct object_base *)obj_surface;
 +    }
 +
 +    dst_surface.type = I965_SURFACE_TYPE_SURFACE;
 +    i965_vpp_clear_surface(ctx, &proc_context->pp_context, obj_surface, pipeline_param->output_background_color); 
 +
 +    // load/save doesn't support different origin offset for src and dst surface
 +    if (src_rect.width == dst_rect.width &&
 +        src_rect.height == dst_rect.height &&
 +        src_rect.x == dst_rect.x &&
 +        src_rect.y == dst_rect.y) {
 +        i965_post_processing_internal(ctx, &proc_context->pp_context,
 +                                      &src_surface,
 +                                      &src_rect,
 +                                      &dst_surface,
 +                                      &dst_rect,
 +                                      PP_NV12_LOAD_SAVE_N12,
 +                                      NULL);
 +    } else {
 +
 +        i965_post_processing_internal(ctx, &proc_context->pp_context,
 +                                      &src_surface,
 +                                      &src_rect,
 +                                      &dst_surface,
 +                                      &dst_rect,
 +                                      (pipeline_param->filter_flags & VA_FILTER_SCALING_MASK) == VA_FILTER_SCALING_NL_ANAMORPHIC ?
 +                                      PP_NV12_AVS : PP_NV12_SCALING,
 +                                      NULL);
 +    }
 +
 +    if (csc_needed) {
 +        src_surface.base = dst_surface.base;
 +        src_surface.type = dst_surface.type;
 +        src_surface.flags = dst_surface.flags;
 +        dst_surface.base = (struct object_base *)obj_surface;
 +        dst_surface.type = I965_SURFACE_TYPE_SURFACE;
 +        i965_image_processing(ctx, &src_surface, &dst_rect, &dst_surface, &dst_rect);
 +    }
 +    
 +    if (num_tmp_surfaces)
 +        i965_DestroySurfaces(ctx,
 +                             tmp_surfaces,
 +                             num_tmp_surfaces);
 +
 +    intel_batchbuffer_flush(hw_context->batch);
 +
 +    return VA_STATUS_SUCCESS;
 +
 +error:
 +    if (num_tmp_surfaces)
 +        i965_DestroySurfaces(ctx,
 +                             tmp_surfaces,
 +                             num_tmp_surfaces);
 +
 +    return VA_STATUS_ERROR_INVALID_PARAMETER;
 +}
 +
 +/*
 + * Tear down a post-processing hardware context.
 + *
 + * hw_context is treated as a struct i965_proc_context.  The embedded
 + * pp_context is finalized and the batch buffer is freed before the
 + * context allocation itself is released.
 + */
 +static void
 +i965_proc_context_destroy(void *hw_context)
 +{
 +    struct i965_proc_context *proc_context = (struct i965_proc_context *)hw_context;
 +
 +    i965_post_processing_context_finalize(&proc_context->pp_context);
 +    intel_batchbuffer_free(proc_context->base.batch);
 +    free(proc_context);
 +}
 +
 +struct hw_context *
 +i965_proc_context_init(VADriverContextP ctx, struct object_config *obj_config)
 +{
 +    struct intel_driver_data *intel = intel_driver_data(ctx);
 +    struct i965_proc_context *proc_context = calloc(1, sizeof(struct i965_proc_context));
 +
 +    proc_context->base.destroy = i965_proc_context_destroy;
 +    proc_context->base.run = i965_proc_picture;
 +    proc_context->base.batch = intel_batchbuffer_new(intel, I915_EXEC_RENDER, 0);
 +    i965_post_processing_context_init(ctx, &proc_context->pp_context, proc_context->base.batch);
 +
 +    return (struct hw_context *)proc_context;
  }
  #ifndef __I965_POST_PROCESSING_H__
  #define __I965_POST_PROCESSING_H__
  
- #define MAX_PP_SURFACES 48
 -#define MAX_PP_SURFACES  48
++#define MAX_PP_SURFACES                 48
  
 -#define I965_PP_FLAG_TOP_FIELD                 1
 -#define I965_PP_FLAG_BOTTOM_FIELD              2
 -
 -#define I965_PP_FLAG_AVS                       4
 -#define I965_PP_FLAG_DEINTERLACING             8
 +#define I965_PP_FLAG_TOP_FIELD          1
 +#define I965_PP_FLAG_BOTTOM_FIELD       2
 +#define I965_PP_FLAG_MCDI               4
 +#define I965_PP_FLAG_AVS                8
  
  enum
  {
@@@ -882,12 -885,14 +882,13 @@@ i965_render_src_surfaces_state
  
  static void
  i965_subpic_render_src_surfaces_state(VADriverContextP ctx,
 -                              VASurfaceID surface)
 +                                      struct object_surface *obj_surface)
  {
 -    struct i965_driver_data *i965 = i965_driver_data(ctx);  
 -    struct object_surface *obj_surface = SURFACE(surface);
      dri_bo *subpic_region;
      unsigned int index = obj_surface->subpic_render_idx;
 -    struct object_subpic *obj_subpic = SUBPIC(obj_surface->subpic[index]);
 -    struct object_image *obj_image = IMAGE(obj_subpic->image);
 +    struct object_subpic *obj_subpic = obj_surface->obj_subpic[index];
 +    struct object_image *obj_image = obj_subpic->obj_image;
++
      assert(obj_surface);
      assert(obj_surface->bo);
      subpic_region = obj_image->bo;
   *
   */
  
- #include <assert.h>
+ #include "sysdeps.h"
 +
 +#include <va/va_drmcommon.h>
 +
  #include "intel_batchbuffer.h"
  #include "intel_memman.h"
  #include "intel_driver.h"
@@@ -75,16 -72,16 +75,16 @@@ intel_driver_init(VADriverContextP ctx
      int has_exec2, has_bsd, has_blt;
  
      assert(drm_state);
-     assert(drm_state->auth_type == VA_DRM_AUTH_DRI1 ||
-            drm_state->auth_type == VA_DRM_AUTH_DRI2 ||
-            drm_state->auth_type == VA_DRM_AUTH_CUSTOM);
+     assert(VA_CHECK_DRM_AUTH_TYPE(ctx, VA_DRM_AUTH_DRI1) ||
+            VA_CHECK_DRM_AUTH_TYPE(ctx, VA_DRM_AUTH_DRI2) ||
+            VA_CHECK_DRM_AUTH_TYPE(ctx, VA_DRM_AUTH_CUSTOM));
  
      intel->fd = drm_state->fd;
-     intel->dri2Enabled = (drm_state->auth_type == VA_DRM_AUTH_DRI2 ||
-                           drm_state->auth_type == VA_DRM_AUTH_CUSTOM);
+     intel->dri2Enabled = (VA_CHECK_DRM_AUTH_TYPE(ctx, VA_DRM_AUTH_DRI2) ||
+                           VA_CHECK_DRM_AUTH_TYPE(ctx, VA_DRM_AUTH_CUSTOM));
  
      if (!intel->dri2Enabled) {
 -        return False;
 +        return false;
      }
  
      intel->locked = 0;
Simple merge
@@@ -12,33 -12,29 +12,33 @@@ INTEL_PP_G4B_GEN5 = 
        nv12_dn_nv12.g4b.gen5                   \
        nv12_dndi_nv12.g4b.gen5                 \
        nv12_load_save_nv12.g4b.gen5            \
++      nv12_load_save_pa.g4b.gen5              \
        nv12_load_save_pl3.g4b.gen5             \
++      nv12_load_save_rgbx.g4b.gen5            \
        nv12_scaling_nv12.g4b.gen5              \
++      pa_load_save_nv12.g4b.gen5              \
++      pa_load_save_pl3.g4b.gen5               \
        pl3_load_save_nv12.g4b.gen5             \
 -      pl3_load_save_pl3.g4b.gen5              \
+       pl3_load_save_pa.g4b.gen5               \
 -      nv12_load_save_pa.g4b.gen5              \
 -      pa_load_save_nv12.g4b.gen5              \
 -      pa_load_save_pl3.g4b.gen5               \
 -      $(NULL)
 +      pl3_load_save_pl3.g4b.gen5              \
-       nv12_load_save_pa.g4b.gen5                              \
-       pl3_load_save_pa.g4b.gen5                               \
-       pa_load_save_nv12.g4b.gen5                              \
-       pa_load_save_pl3.g4b.gen5                               \
-       rgbx_load_save_nv12.g4b.gen5                            \
-       nv12_load_save_rgbx.g4b.gen5                            \
++      rgbx_load_save_nv12.g4b.gen5            \
 +      $(NULL)
  
  INTEL_PP_G6B = \
        nv12_avs_nv12.g6b                       \
        nv12_dn_nv12.g6b                        \
        nv12_dndi_nv12.g6b                      \
        nv12_load_save_nv12.g6b                 \
++      nv12_load_save_pa.g6b                   \
        nv12_load_save_pl3.g6b                  \
++      nv12_load_save_rgbx.g6b                 \
        nv12_scaling_nv12.g6b                   \
++      pa_load_save_nv12.g6b                   \
++      pa_load_save_pl3.g6b                    \
        pl3_load_save_nv12.g6b                  \
        pl3_load_save_pl3.g6b                   \
-       nv12_load_save_pa.g6b                           \
-       pl3_load_save_pa.g6b                            \
-       pa_load_save_nv12.g6b                           \
-       pa_load_save_pl3.g6b                            \
-       rgbx_load_save_nv12.g6b                         \
-       nv12_load_save_rgbx.g6b                         \
+       pl3_load_save_pa.g6b                    \
 -      nv12_load_save_pa.g6b                   \
 -      pa_load_save_nv12.g6b                   \
 -      pa_load_save_pl3.g6b                    \
++      rgbx_load_save_nv12.g6b                 \
        $(NULL)
  
  INTEL_PP_ASM = \
        nv12_dn_nv12.asm                        \
        nv12_dndi_nv12.asm                      \
        nv12_load_save_nv12.asm                 \
++      nv12_load_save_pa.asm                   \
        nv12_load_save_pl3.asm                  \
++      nv12_load_save_rgbx.asm                 \
        nv12_scaling_nv12.asm                   \
++      pa_load_save_nv12.asm                   \
++      pa_load_save_pl3.asm                    \
        pl3_load_save_nv12.asm                  \
        pl3_load_save_pl3.asm                   \
-       nv12_load_save_pa.asm                           \
-       pl3_load_save_pa.asm                            \
-       pa_load_save_nv12.asm                           \
-       pa_load_save_pl3.asm                            \
-       rgbx_load_save_nv12.asm                         \
-       nv12_load_save_rgbx.asm                         \
+       pl3_load_save_pa.asm                    \
 -      nv12_load_save_pa.asm                   \
 -      pa_load_save_nv12.asm                   \
 -      pa_load_save_pl3.asm                    \
 -      $(NULL)
++      rgbx_load_save_nv12.asm                 \
 +      $(NULL)
  
  INTEL_PP_ASM += \
        Common/AYUV_Load_16x8.asm                       \
        Common/RGB16x8_Save_RGB16.asm                   \
        Common/RGB16x8_Save_Y416.asm                    \
        Common/RGB_Pack.asm                             \
-       Common/RGBX_Save_YUV_Fix.asm            \
-       Common/RGBX_Save_YUV_Float.asm          \
 +      Common/RGBX_Load_16x8.asm                       \
 +      Common/RGBX_to_YUV_Coef.asm                     \
-       Common/YUVX_Save_RGBX_Fix.asm       \
-       Common/YUVX_Save_RGBX_Float.asm     \
++      Common/RGBX_Save_YUV_Fix.asm                    \
++      Common/RGBX_Save_YUV_Float.asm                  \
 +      Common/YUV_to_RGBX_Coef.asm                     \
++      Common/YUVX_Save_RGBX_Fix.asm                   \
++      Common/YUVX_Save_RGBX_Float.asm                 \
        Common/SetupVPKernel.asm                        \
        Common/readSampler16x1.asm                      \
        Core_Kernels/AVS_SetupFirstBlock.asm            \
@@@ -7,11 -7,9 +7,11 @@@ INTEL_PP_G7B = 
        pl2_to_pa.g7b           \
        pl2_to_pl2.g7b          \
        pl2_to_pl3.g7b          \
++      pl2_to_rgbx.g7b         \
        pl3_to_pa.g7b           \
        pl3_to_pl2.g7b          \
        pl3_to_pl3.g7b          \
-       pl2_to_rgbx.g7b         \
-       rgbx_to_nv12.g7b                \
++      rgbx_to_nv12.g7b        \
        $(NULL)
  
  INTEL_PP_G4A = \
        PL3_DN_PL3.g4a                  \
        PL_DI_422CP.g4a                 \
        PL_DI_PA.g4a                    \
++      RGB_to_YUV.g4a                  \
        Save_AVS_PA.g4a                 \
        Save_AVS_PL3.g4a                \
        Save_AVS_NV12.g4a               \
        Save_AVS_RGB.g4a                \
-       YUV_to_RGB.g4a                  \
-       RGB_to_YUV.g4a                  \
 +      Save_AVS_RGBX.g4a               \
        Set_AVS_Buf_0123_BGRA.g4a       \
        Set_AVS_Buf_0123_PL2.g4a        \
        Set_AVS_Buf_0123_PL3.g4a        \
@@@ -65,6 -60,6 +64,7 @@@
        Set_AVS_Buf_0123_VYUA.g4a       \
        Set_Layer_0.g4a                 \
        VP_Setup.g4a                    \
++      YUV_to_RGB.g4a                  \
        $(NULL)
  
  INTEL_PP_ASM = $(INTEL_PP_G7B:%.g7b=%.asm)
@@@ -57,320 -54,6 +57,321 @@@ send (8) msg_ind INEP_ROW<1>:UB null re
  mov  (8) msg_reg0.0<1>:UD       read1_header.0<8,8,1>:UD {align1};                
  send (8) msg_ind INEP_COL0<1>:UB null read(BIND_IDX_INEP, 0, 0, 4) mlen 1 rlen 2 {align1};
          
 +/*
 + * Media Read Message -- fetch Chroma neighbor edge pixels 
 + */
 +/* ROW */
 +shl  (2) read0_header.0<1>:D    orig_xy_ub<2,2,1>:UB 3:UW {align1};    /* x * 16 , y * 8 */
 +mul  (1) read0_header.0<1>:D    read0_header.0<0,1,0>:D  2:W {align1};
 +add  (1) read0_header.0<1>:D    read0_header.0<0,1,0>:D -8:W {align1};     /* X offset */
 +add  (1) read0_header.4<1>:D    read0_header.4<0,1,0>:D -1:W {align1};     /* Y offset */ 
 +mov  (8) msg_reg0.0<1>:UD       read0_header.0<8,8,1>:UD {align1};        
 +send (8) msg_ind CHROMA_ROW<1>:UB null read(BIND_IDX_CBCR, 0, 0, 4) mlen 1 rlen 1 {align1};
 +
 +/* COL */
 +shl  (2) read1_header.0<1>:D    orig_xy_ub<2,2,1>:UB 3:UW {align1};    /* x * 16, y * 8 */
 +mul  (1) read1_header.0<1>:D    read1_header.0<0,1,0>:D  2:W {align1};
 +add  (1) read1_header.0<1>:D    read1_header.0<0,1,0>:D -4:W {align1};     /* X offset */
 +mov  (1) read1_header.8<1>:UD   BLOCK_8X4 {align1};
 +mov  (8) msg_reg0.0<1>:UD       read1_header.0<8,8,1>:UD {align1};                
 +send (8) msg_ind CHROMA_COL<1>:UB null read(BIND_IDX_CBCR, 0, 0, 4) mlen 1 rlen 1 {align1};
 +
 +mov  (8) mb_mvp_ref.0<1>:ud   0:ud            {align1};
 +mov  (8) mb_ref_win.0<1>:ud   0:ud            {align1};
 +and.z.f0.0 (1)                null:uw mb_hwdep<0,1,0>:uw              0x04:uw   {align1};
 +(f0.0) jmpi (1) __mb_hwdep_end;
 +/* read back the data for MB A */
 +/* the layout of MB result is: rX.0 (Available), rX.4 (MVa), rX.8 (MVb), rX.16 (Pred_L0 flag),
 +*  rX.18 (Pred_L1 flag), rX.20 (Forward reference ID), rX.22 (Backward reference ID)
 +*/
 +mov  (8) mba_result.0<1>:ud   0x0:ud          {align1};
 +mov  (8) mbb_result.0<1>:ud   0x0:ud          {align1};
 +mov  (8) mbc_result.0<1>:ud   0x0:ud          {align1};
 +mba_start:
 +mov  (8) mb_msg0.0<1>:ud      0:ud            {align1};
 +and.z.f0.0 (1)                null:uw input_mb_intra_ub<0,1,0>:ub     INTRA_PRED_AVAIL_FLAG_AE:uw   {align1};
 +/* MB A doesn't exist. Zero MV. mba_flag is zero and ref ID = -1 */
 +(f0.0)  mov  (2)      mba_result.20<1>:w      -1:w    {align1};
 +(f0.0)  jmpi (1)      mbb_start;
 +mov  (1) mba_result.0<1>:d    MB_AVAIL                {align1};       
 +mov  (2) tmp_reg0.0<1>:UW     orig_xy_ub<2,2,1>:UB    {align1};
 +add  (1) tmp_reg0.0<1>:w      tmp_reg0.0<0,1,0>:w     -1:w    {align1};
 +mul  (1) mb_msg0.8<1>:UD       w_in_mb_uw<0,1,0>:UW tmp_reg0.2<0,1,0>:UW {align1};
 +add  (1) mb_msg0.8<1>:UD       mb_msg0.8<0,1,0>:UD   tmp_reg0.0<0,1,0>:uw {align1};
 +mul  (1) mb_msg0.8<1>:UD       mb_msg0.8<0,1,0>:UD 24:UD {align1};
 +mov  (1) mb_msg0.20<1>:UB        thread_id_ub {align1};                  /* dispatch id */
 +
 +/* bind index 3, read 4 oword (64bytes), msg type: 0(OWord Block Read) */
 +send (16)
 +        mb_ind
 +        mb_wb.0<1>:ud
 +      NULL
 +        data_port(
 +                OBR_CACHE_TYPE,
 +                OBR_MESSAGE_TYPE,
 +                OBR_CONTROL_4,
 +                OBR_BIND_IDX,
 +                OBR_WRITE_COMMIT_CATEGORY,
 +                OBR_HEADER_PRESENT
 +        )
 +        mlen 1
 +        rlen 2
 +        {align1};
 +
 +/* TODO: RefID is required after multi-references are added */
 +cmp.l.f0.0 (1)                null:w  mb_intra_wb.16<0,1,0>:uw        mb_inter_wb.8<0,1,0>:uw {align1};
 +(f0.0)   mov (2)      mba_result.20<1>:w                      -1:w    {align1};
 +(f0.0)   jmpi (1)     mbb_start;
 +
 +add   (1) mb_msg0.8<1>:UD     mb_msg0.8<0,1,0>:ud     3:ud {align1};
 +/* Read MV for MB A */
 +/* bind index 3, read 8 oword (128bytes), msg type: 0(OWord Block Read) */
 +send (16)
 +        mb_ind
 +        mb_mv0.0<1>:ud
 +      NULL
 +        data_port(
 +                OBR_CACHE_TYPE,
 +                OBR_MESSAGE_TYPE,
 +                OBR_CONTROL_8,
 +                OBR_BIND_IDX,
 +                OBR_WRITE_COMMIT_CATEGORY,
 +                OBR_HEADER_PRESENT
 +        )
 +        mlen 1
 +        rlen 4
 +        {align1};
 +/* TODO: RefID is required after multi-references are added */
 +/* MV */
 +mov      (2)          mba_result.4<1>:ud              mb_mv1.8<2,2,1>:ud      {align1};
 +mov      (1)          mba_result.16<1>:w              MB_PRED_FLAG            {align1};
 +
 +mbb_start:
 +mov  (8) mb_msg0.0<1>:ud      0:ud            {align1};
 +and.z.f0.0 (1)                null:uw input_mb_intra_ub<0,1,0>:ub     INTRA_PRED_AVAIL_FLAG_B:uw   {align1};
 +/* MB B doesn't exist. Zero MV. The MB B available flag stays zero and the ref IDs are set to -1 */
 +/* If MB B doesn't exist, neither MB C nor D exists */
 +(f0.0)  mov  (2)      mbb_result.20<1>:w      -1:w            {align1};
 +(f0.0)  mov  (2)      mbc_result.20<1>:w      -1:w            {align1};
 +(f0.0)  jmpi (1)      mb_mvp_start;
 +mov  (1) mbb_result.0<1>:d    MB_AVAIL                {align1};       
 +mov  (2) tmp_reg0.0<1>:UW     orig_xy_ub<2,2,1>:UB    {align1};
 +add  (1) tmp_reg0.2<1>:w      tmp_reg0.2<0,1,0>:w     -1:w    {align1};
 +mul  (1) mb_msg0.8<1>:UD       w_in_mb_uw<0,1,0>:UW tmp_reg0.2<0,1,0>:UW {align1};
 +add  (1) mb_msg0.8<1>:UD       mb_msg0.8<0,1,0>:UD   tmp_reg0.0<0,1,0>:uw {align1};
 +mul  (1) mb_msg0.8<1>:UD       mb_msg0.8<0,1,0>:UD 24:UD {align1};
 +mov  (1) mb_msg0.20<1>:UB        thread_id_ub {align1};                  /* dispatch id */
 +
 +/* bind index 3, read 4 oword (64bytes), msg type: 0(OWord Block Read) */
 +send (16)
 +        mb_ind
 +        mb_wb.0<1>:ud
 +      NULL
 +        data_port(
 +                OBR_CACHE_TYPE,
 +                OBR_MESSAGE_TYPE,
 +                OBR_CONTROL_4,
 +                OBR_BIND_IDX,
 +                OBR_WRITE_COMMIT_CATEGORY,
 +                OBR_HEADER_PRESENT
 +        )
 +        mlen 1
 +        rlen 2
 +        {align1};
 +
 +/* TODO: RefID is required after multi-references are added */
 +cmp.l.f0.0 (1)                null:w  mb_intra_wb.16<0,1,0>:uw        mb_inter_wb.8<0,1,0>:uw {align1};
 +(f0.0)   mov (2)      mbb_result.20<1>:w                      -1:w    {align1};
 +(f0.0)   jmpi (1)     mbc_start;
 +add   (1) mb_msg0.8<1>:UD     mb_msg0.8<0,1,0>:ud     3:ud {align1};
 +/* Read MV for MB B */
 +/* bind index 3, read 8 oword (128bytes), msg type: 0(OWord Block Read) */
 +send (16)
 +        mb_ind
 +        mb_mv0.0<1>:ud
 +      NULL
 +        data_port(
 +                OBR_CACHE_TYPE,
 +                OBR_MESSAGE_TYPE,
 +                OBR_CONTROL_8,
 +                OBR_BIND_IDX,
 +                OBR_WRITE_COMMIT_CATEGORY,
 +                OBR_HEADER_PRESENT
 +        )
 +        mlen 1
 +        rlen 4
 +        {align1};
 +/* TODO: RefID is required after multi-references are added */
 +mov      (2)          mbb_result.4<1>:ud              mb_mv2.16<2,2,1>:ud     {align1};
 +mov      (1)          mbb_result.16<1>:w              MB_PRED_FLAG            {align1};
 +
 +mbc_start:
 +mov  (8) mb_msg0.0<1>:ud      0:ud            {align1};
 +and.z.f0.0 (1)                null:uw input_mb_intra_ub<0,1,0>:ub     INTRA_PRED_AVAIL_FLAG_C:uw   {align1};
 +/* MB C doesn't exist. Zero MV. The MB C available flag stays zero */
 +/* Based on the H.264 spec, MB D is used in place of MB C if MB C doesn't exist */
 +(f0.0)  jmpi (1)      mbd_start;
 +mov  (1) mbc_result.0<1>:d    MB_AVAIL                {align1};       
 +mov  (2) tmp_reg0.0<1>:UW     orig_xy_ub<2,2,1>:UB    {align1};
 +add  (1) tmp_reg0.2<1>:w      tmp_reg0.2<0,1,0>:w     -1:w    {align1};
 +add  (1) tmp_reg0.0<1>:w      tmp_reg0.0<0,1,0>:w     1:w     {align1};
 +mul  (1) mb_msg0.8<1>:UD       w_in_mb_uw<0,1,0>:UW tmp_reg0.2<0,1,0>:UW {align1};
 +add  (1) mb_msg0.8<1>:UD       mb_msg0.8<0,1,0>:UD   tmp_reg0.0<0,1,0>:uw {align1};
 +mul  (1) mb_msg0.8<1>:UD       mb_msg0.8<0,1,0>:UD 24:UD {align1};
 +mov  (1) mb_msg0.20<1>:UB        thread_id_ub {align1};                  /* dispatch id */
 +
 +/* bind index 3, read 4 oword (64bytes), msg type: 0(OWord Block Read) */
 +send (16)
 +        mb_ind
 +        mb_wb.0<1>:ud
 +      NULL
 +        data_port(
 +                OBR_CACHE_TYPE,
 +                OBR_MESSAGE_TYPE,
 +                OBR_CONTROL_4,
 +                OBR_BIND_IDX,
 +                OBR_WRITE_COMMIT_CATEGORY,
 +                OBR_HEADER_PRESENT
 +        )
 +        mlen 1
 +        rlen 2
 +        {align1};
 +
 +/* TODO: RefID is required after multi-references are added */
 +cmp.l.f0.0 (1)                null:w  mb_intra_wb.16<0,1,0>:uw        mb_inter_wb.8<0,1,0>:uw {align1};
 +(f0.0)   mov (2)      mbc_result.20<1>:w                      -1:w    {align1};
 +(f0.0)   jmpi (1)     mb_mvp_start;
 +add   (1) mb_msg0.8<1>:UD     mb_msg0.8<0,1,0>:ud     3:ud {align1};
 +/* Read MV for MB C */
 +/* bind index 3, read 8 oword (128bytes), msg type: 0(OWord Block Read) */
 +send (16)
 +        mb_ind
 +        mb_mv0.0<1>:ud
 +      NULL
 +        data_port(
 +                OBR_CACHE_TYPE,
 +                OBR_MESSAGE_TYPE,
 +                OBR_CONTROL_8,
 +                OBR_BIND_IDX,
 +                OBR_WRITE_COMMIT_CATEGORY,
 +                OBR_HEADER_PRESENT
 +        )
 +        mlen 1
 +        rlen 4
 +        {align1};
 +/* TODO: RefID is required after multi-references are added */
 +/* Forward MV */
 +mov      (2)          mbc_result.4<1>:ud              mb_mv2.16<2,2,1>:ud     {align1};
 +mov      (1)          mbc_result.16<1>:w              MB_PRED_FLAG            {align1};
 +
 +jmpi   (1)    mb_mvp_start;
 +mbd_start:
 +mov  (8) mb_msg0.0<1>:ud      0:ud            {align1};
 +and.z.f0.0 (1)                null:uw input_mb_intra_ub<0,1,0>:ub     INTRA_PRED_AVAIL_FLAG_D:uw   {align1};
 +(f0.0)  jmpi (1)      mb_mvp_start;
 +mov  (1) mbc_result.0<1>:d    MB_AVAIL                {align1};       
 +mov  (2) tmp_reg0.0<1>:UW     orig_xy_ub<2,2,1>:UB    {align1};
 +add  (2) tmp_reg0.0<1>:w      tmp_reg0.0<2,2,1>:w     -1:w    {align1};
 +mul  (1) mb_msg0.8<1>:UD       w_in_mb_uw<0,1,0>:UW tmp_reg0.2<0,1,0>:UW {align1};
 +add  (1) mb_msg0.8<1>:UD       mb_msg0.8<0,1,0>:UD   tmp_reg0.0<0,1,0>:uw {align1};
 +mul  (1) mb_msg0.8<1>:UD       mb_msg0.8<0,1,0>:UD 24:UD {align1};
 +mov  (1) mb_msg0.20<1>:UB        thread_id_ub {align1};                  /* dispatch id */
 +
 +/* bind index 3, read 4 oword (64bytes), msg type: 0(OWord Block Read) */
 +send (16)
 +        mb_ind
 +        mb_wb.0<1>:ud
 +      NULL
 +        data_port(
 +                OBR_CACHE_TYPE,
 +                OBR_MESSAGE_TYPE,
 +                OBR_CONTROL_4,
 +                OBR_BIND_IDX,
 +                OBR_WRITE_COMMIT_CATEGORY,
 +                OBR_HEADER_PRESENT
 +        )
 +        mlen 1
 +        rlen 2
 +        {align1};
 +
 +cmp.l.f0.0 (1)                null:w  mb_intra_wb.16<0,1,0>:uw        mb_inter_wb.8<0,1,0>:uw {align1};
 +(f0.0)   mov (2)      mbc_result.20<1>:w                      -1:w    {align1};
 +(f0.0)   jmpi (1)     mb_mvp_start;
 +
 +add   (1) mb_msg0.8<1>:UD     mb_msg0.8<0,1,0>:ud     3:ud {align1};
 +/* Read MV for MB D */
 +/* bind index 3, read 8 oword (128bytes), msg type: 0(OWord Block Read) */
 +send (16)
 +        mb_ind
 +        mb_mv0.0<1>:ub
 +      NULL
 +        data_port(
 +                OBR_CACHE_TYPE,
 +                OBR_MESSAGE_TYPE,
 +                OBR_CONTROL_8,
 +                OBR_BIND_IDX,
 +                OBR_WRITE_COMMIT_CATEGORY,
 +                OBR_HEADER_PRESENT
 +        )
 +        mlen 1
 +        rlen 4
 +        {align1};
 +
 +/* TODO: RefID is required after multi-references are added */
 +
 +/* Forward MV */
 +mov      (2)          mbc_result.4<1>:ud              mb_mv3.24<2,2,1>:ud     {align1};
 +mov      (1)          mbc_result.16<1>:w              MB_PRED_FLAG            {align1};
 +      
 +mb_mvp_start:
 +/*TODO: Add the skip prediction */
 +/* Check whether both MB B and C are unavailable */
 +add   (1)     tmp_reg0.0<1>:d         mbb_result.0<0,1,0>:d   mbc_result.0<0,1,0>:d   {align1};
 +cmp.z.f0.0 (1)        null:d                  tmp_reg0.0<0,1,0>:d     0:d     {align1};
 +(-f0.0)       jmpi (1)        mb_median_start;
 +cmp.nz.f0.0 (1)       null:d  mba_result.0<0,1,0>:d           1:d             {align1};
 +(f0.0)        mov     (1)     mbb_result.4<1>:ud              mba_result.4<0,1,0>:ud  {align1};       
 +(f0.0)        mov     (1)     mbc_result.4<1>:ud              mba_result.4<0,1,0>:ud  {align1};       
 +(f0.0)        mov     (1)     mbb_result.20<1>:uw             mba_result.20<0,1,0>:uw {align1};       
 +(f0.0)        mov     (1)     mbc_result.20<1>:uw             mba_result.20<0,1,0>:uw {align1};       
 +(f0.0)  mov     (1)   mb_mvp_ref.0<1>:ud              mba_result.4<0,1,0>:ud  {align1};
 +(-f0.0) mov   (1)     mb_mvp_ref.0<1>:ud              0:ud                    {align1};
 +jmpi  (1)     __mb_hwdep_end;
 +      
 +mb_median_start:
 +/* check whether only one neighbour MB has the same ref ID as the current MB */
 +mov (8)       tmp_reg0.0<1>:ud                0:ud            {align1};
 +cmp.z.f0.0    (1)     null:d  mba_result.20<1>:w      0:w     {align1};
 +(f0.0)        add     (1)     tmp_reg0.0<1>:w         tmp_reg0.0<1>:w 1:w     {align1};
 +(f0.0)        mov     (1)     tmp_reg0.4<1>:ud        mba_result.4<0,1,0>:ud  {align1};
 +cmp.z.f0.0    (1)     null:d  mbb_result.20<1>:w      0:w     {align1};
 +(f0.0)        add     (1)     tmp_reg0.0<1>:w         tmp_reg0.0<1>:w 1:w     {align1};
 +(f0.0)        mov     (1)     tmp_reg0.4<1>:ud        mbb_result.4<0,1,0>:ud  {align1};
 +cmp.z.f0.0    (1)     null:d  mbc_result.20<1>:w      0:w     {align1};
 +(f0.0)        add     (1)     tmp_reg0.0<1>:w         tmp_reg0.0<1>:w 1:w     {align1};
 +(f0.0)        mov     (1)     tmp_reg0.4<1>:ud        mbc_result.4<0,1,0>:ud  {align1};
 +cmp.e.f0.0    (1)     null:d  tmp_reg0.0<1>:w  1:w    {align1};
 +(f0.0)        mov     (1)     mb_mvp_ref.0<1>:ud      tmp_reg0.4<0,1,0>:ud    {align1};
 +(f0.0)        jmpi (1)  __mb_hwdep_end;
 +
 +mov   (1)     INPUT_ARG0.0<1>:w       mba_result.4<0,1,0>:w   {align1};
 +mov   (1)     INPUT_ARG0.4<1>:w       mbb_result.4<0,1,0>:w   {align1};
 +mov   (1)     INPUT_ARG0.8<1>:w       mbc_result.4<0,1,0>:w   {align1};
 +SAVE_RET      {align1};
 + jmpi (1)     word_imedian;
 +mov   (1)     mb_mvp_ref.0<1>:w               RET_ARG<0,1,0>:w        {align1};
 +mov   (1)     INPUT_ARG0.0<1>:w       mba_result.6<0,1,0>:w   {align1};
 +mov   (1)     INPUT_ARG0.4<1>:w       mbb_result.6<0,1,0>:w   {align1};
 +mov   (1)     INPUT_ARG0.8<1>:w       mbc_result.6<0,1,0>:w   {align1};
 +SAVE_RET      {align1};
 +jmpi  (1)     word_imedian; 
 +mov   (1)     mb_mvp_ref.2<1>:w               RET_ARG<0,1,0>:w        {align1};
 +
 +__mb_hwdep_end:
 +asr   (2)     mb_ref_win.0<1>:w       mb_mvp_ref.0<2,2,1>:w   2:w     {align1};
 +add   (2)     mb_ref_win.8<1>:w       mb_ref_win.0<2,2,1>:w   3:w     {align1};
 +and   (2)     mb_ref_win.16<1>:uw     mb_ref_win.8<2,2,1>:uw  0xFFFC:uw {align1};
++        
  /* m2, get the MV/Mb cost passed from constant buffer when
  spawning thread by MEDIA_OBJECT */       
  mov (8) vme_m2<1>:UD            r1.0<8,8,1>:UD {align1};
@@@ -396,9 -78,8 +397,8 @@@ mov  (1) vme_msg_5.20<1>:UW      CHROMA
  
  
  /* m6 */
--
 -mov (8) vme_msg_6<1>:UD               0x0:UD {align1};                
 +mov  (4) vme_msg_6.16<1>:UD      CHROMA_ROW.8<4,4,1>:UD {align1};
 +mov  (8) vme_msg_6.0<1>:UW       CHROMA_COL.2<16,8,2>:UW {align1};
  
  /*
   * SIC VME message
@@@ -673,47 -403,3 +673,45 @@@ __EXIT
   */        
  mov  (8) ts_msg_reg0<1>:UD         r0<8,8,1>:UD {align1};
  send (16) ts_msg_ind acc0<1>UW null thread_spawner(0, 0, 1) mlen 1 rlen 0 {align1 EOT};
 +
 +      nop             ;
 +      nop             ;
 +/* Compare three word values and return the minimum */
 +word_imin:
 +      cmp.le.f0.0 (1)         null:w          INPUT_ARG0.0<0,1,0>:w   INPUT_ARG0.4<0,1,0>:w {align1};
 +      (f0.0) mov  (1)         TEMP_VAR0.0<1>:w INPUT_ARG0.0<0,1,0>:w                    {align1};
 +      (-f0.0) mov (1)         TEMP_VAR0.0<1>:w INPUT_ARG0.4<0,1,0>:w                    {align1};
 +      cmp.le.f0.0 (1)         null:w          TEMP_VAR0.0<0,1,0>:w    INPUT_ARG0.8<0,1,0>:w {align1};
 +      (f0.0) mov  (1)         RET_ARG<1>:w TEMP_VAR0.0<0,1,0>:w                         {align1};
 +      (-f0.0) mov (1)         RET_ARG<1>:w INPUT_ARG0.8<0,1,0>:w                        {align1};
 +      RETURN          {align1};       
 +      
 +/* Compare three word values and return the maximum */
 +word_imax:
 +      cmp.ge.f0.0 (1)         null:w          INPUT_ARG0.0<0,1,0>:w   INPUT_ARG0.4<0,1,0>:w {align1};
 +      (f0.0) mov  (1)         TEMP_VAR0.0<1>:w INPUT_ARG0.0<0,1,0>:w                    {align1};
 +      (-f0.0) mov (1)         TEMP_VAR0.0<1>:w INPUT_ARG0.4<0,1,0>:w                    {align1};
 +      cmp.ge.f0.0 (1)         null:w          TEMP_VAR0.0<0,1,0>:w    INPUT_ARG0.8<0,1,0>:w {align1};
 +      (f0.0) mov  (1)         RET_ARG<1>:w TEMP_VAR0.0<0,1,0>:w                         {align1};
 +      (-f0.0) mov (1)         RET_ARG<1>:w INPUT_ARG0.8<0,1,0>:w                        {align1};
 +      RETURN          {align1};       
 +      
 +word_imedian:
 +      cmp.ge.f0.0 (1) null:w INPUT_ARG0.0<0,1,0>:w INPUT_ARG0.4<0,1,0>:w {align1};
 +      (f0.0)  jmpi (1) cmp_a_ge_b;
 +      cmp.ge.f0.0 (1) null:w INPUT_ARG0.0<0,1,0>:w INPUT_ARG0.8<0,1,0>:w {align1};
 +      (f0.0) mov (1) RET_ARG<1>:w INPUT_ARG0.0<0,1,0>:w {align1};
 +      (f0.0) jmpi (1) cmp_end;
 +      cmp.ge.f0.0 (1) null:w INPUT_ARG0.4<0,1,0>:w INPUT_ARG0.8<0,1,0>:w {align1};
 +      (f0.0) mov (1) RET_ARG<1>:w INPUT_ARG0.8<0,1,0>:w {align1};
 +      (-f0.0) mov (1) RET_ARG<1>:w INPUT_ARG0.4<0,1,0>:w {align1};
 +      jmpi (1) cmp_end;
 +cmp_a_ge_b:
 +      cmp.ge.f0.0 (1) null:w INPUT_ARG0.4<0,1,0>:w INPUT_ARG0.8<0,1,0>:w {align1};
 +      (f0.0) mov (1) RET_ARG<1>:w INPUT_ARG0.4<0,1,0>:w {align1};
 +      (f0.0) jmpi (1) cmp_end;
 +      cmp.ge.f0.0 (1) null:w INPUT_ARG0.0<0,1,0>:w INPUT_ARG0.8<0,1,0>:w {align1};
 +      (f0.0) mov (1) RET_ARG<1>:w INPUT_ARG0.8<0,1,0>:w {align1};
 +      (-f0.0) mov (1) RET_ARG<1>:w INPUT_ARG0.0<0,1,0>:w {align1};
 +cmp_end:
 +      RETURN  {align1};
@@@ -116,7 -95,7 +116,6 @@@ and.z.f0.0 (1) null<1>:UW transform_8x8
  
  /* assign MB intra struct from the thread payload*/
  mov (1) mb_intra_struct_ub<1>:UB input_mb_intra_ub<0,1,0>:UB {align1}; 
-                            
 - 
  /* Disable DC HAAR component when calculating HAAR SATD block */
  mov  (1) tmp_reg0.0<1>:UW     DC_HARR_DISABLE:UW              {align1};
  mov  (1) vme_m1.30<1>:UB      tmp_reg0.0<0,1,0>:UB  {align1};
diff --cc src/sysdeps.h
index 0000000,a713d20..71bfb4d
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,39 +1,47 @@@
+ /*
+  * Copyright (C) 2012 Intel Corporation. All Rights Reserved.
+  *
+  * Permission is hereby granted, free of charge, to any person obtaining a
+  * copy of this software and associated documentation files (the
+  * "Software"), to deal in the Software without restriction, including
+  * without limitation the rights to use, copy, modify, merge, publish,
+  * distribute, sub license, and/or sell copies of the Software, and to
+  * permit persons to whom the Software is furnished to do so, subject to
+  * the following conditions:
+  * 
+  * The above copyright notice and this permission notice (including the
+  * next paragraph) shall be included in all copies or substantial portions
+  * of the Software.
+  * 
+  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+  * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+  */
+ #ifndef SYSDEPS_H
+ #define SYSDEPS_H
++#ifdef ANDROID
++
++# include "config_android.h"
++
++#else
++
+ #ifdef HAVE_CONFIG_H
+ # include "config.h"
+ #endif
++#endif /* ANDROID */
++
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <stdbool.h>
+ #include <string.h>
+ #include <stdint.h>
+ #include <assert.h>
+ #endif /* SYSDEPS_H */