Add the separated media encoding/decoding files for BDW
authorZhao Yakui <yakui.zhao@intel.com>
Mon, 7 Jan 2013 05:18:47 +0000 (13:18 +0800)
committerXiang, Haihao <haihao.xiang@intel.com>
Thu, 27 Feb 2014 01:51:39 +0000 (09:51 +0800)
Many media-related changes were introduced between Haswell and BDW, so
separate media encoding/decoding files are added for BDW. This avoids
complex backward-compatibility logic in the Haswell code.

Signed-off-by: Zhao Yakui <yakui.zhao@intel.com>
[Haihao: directly use object instead of id]
Signed-off-by: Xiang, Haihao <haihao.xiang@intel.com>
src/Makefile.am
src/gen6_mfc.h
src/gen6_vme.h
src/gen8_mfc.c [new file with mode: 0644]
src/gen8_mfd.c [new file with mode: 0644]
src/gen8_vme.c [new file with mode: 0644]
src/i965_decoder.h
src/i965_drv_video.c
src/i965_encoder.c
src/i965_encoder.h

index edf8f4e..5b2ac59 100755 (executable)
@@ -56,6 +56,9 @@ source_c = \
        gen7_mfd.c              \
        gen75_mfd.c             \
        gen75_mfc.c             \
+       gen8_mfc.c              \
+       gen8_mfd.c              \
+       gen8_vme.c              \
        gen75_picture_process.c \
        gen75_vme.c             \
        gen75_vpp_gpe.c         \
index 6a5777f..d55cff6 100644 (file)
@@ -269,10 +269,12 @@ extern VAStatus intel_mfc_avc_prepare(VADriverContextP ctx,
 
 extern int intel_avc_enc_slice_type_fixup(int type);
 
-
 extern void
 intel_mfc_avc_ref_idx_state(VADriverContextP ctx,
                             struct encode_state *encode_state,
                             struct intel_encoder_context *encoder_context);
 
+extern
+Bool gen8_mfc_context_init(VADriverContextP ctx, struct intel_encoder_context *encoder_context);
+
 #endif /* _GEN6_MFC_BCS_H_ */
index 939a4a3..d461982 100644 (file)
@@ -174,4 +174,5 @@ intel_avc_vme_reference_state(VADriverContextP ctx,
                                   struct object_surface *obj_surface,
                                   struct intel_encoder_context *encoder_context));
 
+extern Bool gen8_vme_context_init(VADriverContextP ctx, struct intel_encoder_context *encoder_context);
 #endif /* _GEN6_VME_H_ */
diff --git a/src/gen8_mfc.c b/src/gen8_mfc.c
new file mode 100644 (file)
index 0000000..5f50e0a
--- /dev/null
@@ -0,0 +1,2469 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Zhao Yakui <yakui.zhao@intel.com>
+ *    Xiang Haihao <haihao.xiang@intel.com>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <assert.h>
+
+#include "intel_batchbuffer.h"
+#include "i965_defines.h"
+#include "i965_structs.h"
+#include "i965_drv_video.h"
+#include "i965_encoder.h"
+#include "i965_encoder_utils.h"
+#include "gen6_mfc.h"
+#include "gen6_vme.h"
+#include "intel_media.h"
+
+/* NOTE(review): the consumer of this switch is outside this chunk - confirm its meaning. */
+#define MFC_SOFTWARE_HASWELL   1
+
+/* Lowest intel.revision value that IS_STEPPING_BPLUS() accepts. */
+#define B0_STEP_REV            2
+/*
+ * True when the driver data's intel.revision is at least B0_STEP_REV.
+ * The argument is parenthesized so any pointer-valued expression can be passed.
+ */
+#define IS_STEPPING_BPLUS(i965)        (((i965)->intel.revision) >= B0_STEP_REV)
+/*
+ * Kernel binaries included from pre-assembled .g7b files; each row is four
+ * 32-bit words. The names indicate the intra and inter variants of the AVC
+ * batch-buffer kernels.
+ */
+static const uint32_t gen8_mfc_batchbuffer_avc_intra[][4] = {
+#include "shaders/utils/mfc_batchbuffer_avc_intra.g7b"
+};
+
+static const uint32_t gen8_mfc_batchbuffer_avc_inter[][4] = {
+#include "shaders/utils/mfc_batchbuffer_avc_inter.g7b"
+};
+
+static struct i965_kernel gen8_mfc_kernels[] = { /* one entry per binary above */
+    {
+        "MFC AVC INTRA BATCHBUFFER ",
+        MFC_BATCHBUFFER_AVC_INTRA,
+        gen8_mfc_batchbuffer_avc_intra,
+        sizeof(gen8_mfc_batchbuffer_avc_intra), /* size in bytes */
+        NULL
+    },
+
+    {
+        "MFC AVC INTER BATCHBUFFER ",
+        MFC_BATCHBUFFER_AVC_INTER,
+        gen8_mfc_batchbuffer_avc_inter,
+        sizeof(gen8_mfc_batchbuffer_avc_inter), /* size in bytes */
+        NULL
+    },
+};
+/*
+ * Masks and field values whose names refer to inter macroblock modes,
+ * sub-macroblock shapes and MV counts.
+ * NOTE(review): the code using them is outside this chunk - confirm which
+ * word each one applies to.
+ */
+#define                INTER_MODE_MASK         0x03
+#define                INTER_8X8               0x03
+#define                INTER_16X8              0x01
+#define                INTER_8X16              0x02
+#define                SUBMB_SHAPE_MASK        0x00FF00
+
+#define                INTER_MV8               (4 << 20)
+#define                INTER_MV32              (6 << 20)
+
+
+/*
+ * Emit the 5-dword MFX_PIPE_MODE_SELECT command for encoding.
+ *
+ * standard_select must be MFX_FORMAT_MPEG2 or MFX_FORMAT_AVC (asserted).
+ * The pre-/post-deblocking output bits in DW1 are set exactly when the
+ * corresponding buffer object in the MFC context is non-NULL.
+ */
+static void
+gen8_mfc_pipe_mode_select(VADriverContextP ctx,
+                          int standard_select,
+                          struct intel_encoder_context *encoder_context)
+{
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    struct intel_batchbuffer *batch = encoder_context->base.batch;
+    const int post_deblock_on = (mfc_context->post_deblocking_output.bo != NULL);
+    const int pre_deblock_on = (mfc_context->pre_deblocking_output.bo != NULL);
+
+    assert(standard_select == MFX_FORMAT_AVC ||
+           standard_select == MFX_FORMAT_MPEG2);
+
+    BEGIN_BCS_BATCH(batch, 5);
+
+    /* DW0: command header */
+    OUT_BCS_BATCH(batch, MFX_PIPE_MODE_SELECT | (5 - 2));
+
+    /* DW1: mode selection */
+    OUT_BCS_BATCH(batch,
+                  (standard_select << 0) |      /* standard select: avc or mpeg2 */
+                  (1 << 4) |                    /* encoding mode */
+                  (0 << 5) |                    /* not in stitch mode */
+                  (pre_deblock_on << 8) |       /* Pre Deblocking Output */
+                  (post_deblock_on << 9) |      /* Post Deblocking Output */
+                  (0 << 10) |                   /* Stream-Out Enable */
+                  (MFD_MODE_VLD << 15) |        /* VLD mode */
+                  (MFX_LONG_MODE << 17));       /* Must be long format for encoder */
+
+    /*
+     * DW2: every field is left at zero - NOA bus expansion (bit 7),
+     * slice-level clock gating (bit 6), NOA clock gating (bit 5), and the
+     * terminate-on-error controls for AVC motion/POC table (bit 4),
+     * mbdata (bit 3) and CABAC/CAVLC decode (bit 2) errors.
+     */
+    OUT_BCS_BATCH(batch, 0);
+
+    /* DW3-4: unused */
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Emit the 6-dword MFX_SURFACE_STATE command from the dimensions and
+ * pitches stored in mfc_context->surface_state.
+ */
+static void
+gen8_mfc_surface_state(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
+{
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    struct intel_batchbuffer *batch = encoder_context->base.batch;
+
+    BEGIN_BCS_BATCH(batch, 6);
+
+    /* DW0: header, DW1: zero */
+    OUT_BCS_BATCH(batch, MFX_SURFACE_STATE | (6 - 2));
+    OUT_BCS_BATCH(batch, 0);
+
+    /* DW2: surface width and height, each stored minus one */
+    OUT_BCS_BATCH(batch,
+                  ((mfc_context->surface_state.width - 1) << 4) |
+                  ((mfc_context->surface_state.height - 1) << 18));
+
+    /* DW3: format, pitch and tiling */
+    OUT_BCS_BATCH(batch,
+                  (I965_TILEWALK_YMAJOR << 0) |     /* tile walk, TILEWALK_YMAJOR */
+                  (1 << 1) |                        /* must be tiled */
+                  (0 << 2) |                        /* must be 0 for interleave U/V */
+                  ((mfc_context->surface_state.w_pitch - 1) << 3) | /* pitch */
+                  (0 << 22) |                       /* surface object control state, FIXME??? */
+                  (1 << 27) |                       /* must be 1 for interleave U/V, hardware requirement */
+                  (MFX_SURFACE_PLANAR_420_8 << 28)); /* 420 planar YUV surface */
+
+    /* DW4: y offset for U(cb); the upper half must be 0 for interleave U/V */
+    OUT_BCS_BATCH(batch,
+                  (mfc_context->surface_state.h_pitch) |
+                  (0 << 16));
+
+    /* DW5: zero */
+    OUT_BCS_BATCH(batch, 0);
+
+    ADVANCE_BCS_BATCH(batch);
+}
+/*
+ * Emit the 26-dword MFX_IND_OBJ_BASE_ADDR_STATE command.
+ *
+ * Two buffer objects are relocated into the command: the VME output bo in
+ * the indirect MV object group, and the MFC indirect PAK-BSE object bo,
+ * which is relocated twice - once at offset 0 and once at its end_offset.
+ * Apart from the 0x80000000 value in the MV object group, every other
+ * dword is written as zero.
+ */
+static void
+gen8_mfc_ind_obj_base_addr_state(VADriverContextP ctx,
+                               struct intel_encoder_context *encoder_context)
+{
+    struct intel_batchbuffer *batch = encoder_context->base.batch;
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+
+    BEGIN_BCS_BATCH(batch, 26);
+
+    OUT_BCS_BATCH(batch, MFX_IND_OBJ_BASE_ADDR_STATE | (26 - 2));
+       /* DW1-3: MFX indirect bitstream offset (left zero) */
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+       /* DW4-5: MFX upper bound (left zero) */
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+
+    /* DW6-10: MFX Indirect MV Object Base Address, taken from the VME output bo */
+    OUT_BCS_RELOC(batch, vme_context->vme_output.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0x80000000); /* must set, up to 2G */
+    OUT_BCS_BATCH(batch, 0);
+
+     /* DW11-15: MFX IT-COFF. Not used on encoder */
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+
+     /* DW16-20: MFX indirect DBLK. Not used on encoder */   
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+
+    /* DW21-25: MFC Indirect PAK-BSE Object Base Address for Encoder */       
+    OUT_BCS_RELOC(batch,
+                  mfc_context->mfc_indirect_pak_bse_object.bo,
+                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                  0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+       
+    OUT_BCS_RELOC(batch, /* same bo again, this time at end_offset */
+                  mfc_context->mfc_indirect_pak_bse_object.bo,
+                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                  mfc_context->mfc_indirect_pak_bse_object.end_offset);
+    OUT_BCS_BATCH(batch, 0);
+
+    ADVANCE_BCS_BATCH(batch);
+}
+/*
+ * Emit the 16-dword MFX_AVC_IMG_STATE command.
+ *
+ * Frame size in macroblocks is derived from mfc_context->surface_state
+ * (width/height rounded up to multiples of 16). The weighted prediction,
+ * entropy coding and 8x8 transform flags come from the H.264 picture
+ * parameter buffer in encode_state; every other field is a fixed value.
+ */
+static void
+gen8_mfc_avc_img_state(VADriverContextP ctx, struct encode_state *encode_state,  
+                       struct intel_encoder_context *encoder_context)
+{
+    struct intel_batchbuffer *batch = encoder_context->base.batch;
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    VAEncPictureParameterBufferH264 *pPicParameter = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
+
+    int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
+    int height_in_mbs = (mfc_context->surface_state.height + 15) / 16;
+
+    BEGIN_BCS_BATCH(batch, 16);
+
+    OUT_BCS_BATCH(batch, MFX_AVC_IMG_STATE | (16 - 2));
+       /* DW1: macroblock count of the frame, truncated to 16 bits */
+    OUT_BCS_BATCH(batch,
+                  ((width_in_mbs * height_in_mbs) & 0xFFFF));
+    OUT_BCS_BATCH(batch, 
+                  ((height_in_mbs - 1) << 16) | 
+                  ((width_in_mbs - 1) << 0));
+       /* DW3: QP setting */
+    OUT_BCS_BATCH(batch, 
+                  (0 << 24) |  /* Second Chroma QP Offset */
+                  (0 << 16) |  /* Chroma QP Offset */
+                  (0 << 14) |   /* Max-bit conformance Intra flag */
+                  (0 << 13) |   /* Max Macroblock size conformance Inter flag */
+                  (pPicParameter->pic_fields.bits.weighted_pred_flag << 12) |   /* Weighted_Pred_Flag */
+                  (pPicParameter->pic_fields.bits.weighted_bipred_idc << 10) |  /* Weighted_BiPred_Idc */
+                  (0 << 8)  |   /* FIXME: Image Structure */
+                  (0 << 0) );   /* Current Decoded Image Frame Store ID, reserved in Encode mode */
+    OUT_BCS_BATCH(batch,
+                  (0 << 16) |   /* Minimum Frame size */
+                  (0 << 15) |   /* Disable reading of Macroblock Status Buffer */
+                  (0 << 14) |   /* Load BitStream Pointer only once, 1 slice 1 frame */
+                  (0 << 13) |   /* CABAC 0 word insertion test enable */
+                  (1 << 12) |   /* MVUnpackedEnable, compliant to DXVA */
+                  (1 << 10) |   /* Chroma Format IDC, 4:2:0 */
+                  (0 << 8)  |   /* FIXME: MbMvFormatFlag */
+                  (pPicParameter->pic_fields.bits.entropy_coding_mode_flag << 7)  |   /* 0: CAVLC encoding mode, 1: CABAC */
+                  (0 << 6)  |   /* Only valid for VLD decoding mode */
+                  (0 << 5)  |   /* Constrained Intra Prediction Flag, from PPS */
+                  (0 << 4)  |   /* Direct 8x8 inference flag */
+                  (pPicParameter->pic_fields.bits.transform_8x8_mode_flag << 3)  |   /* 8x8 or 4x4 IDCT Transform Mode Flag */
+                  (1 << 2)  |   /* Frame MB only flag */
+                  (0 << 1)  |   /* MBAFF mode is inactive */
+                  (0 << 0));    /* Field picture flag */
+       /* DW5: Trellis quantization */
+    OUT_BCS_BATCH(batch, 0);    /* Mainly about MB rate control and debug, just ignoring */
+    OUT_BCS_BATCH(batch,        /* Inter and Intra Conformance Max size limit */
+                  (0xBB8 << 16) |       /* InterMbMaxSz */
+                  (0xEE8) );            /* IntraMbMaxSz */
+    OUT_BCS_BATCH(batch, 0);            /* Reserved */
+       /* DW8-9: QP delta */
+    OUT_BCS_BATCH(batch, 0);            /* Slice QP Delta for bitrate control */
+    OUT_BCS_BATCH(batch, 0);            /* Slice QP Delta for bitrate control */
+       /* DW10-11: bit setting for MB */  
+    OUT_BCS_BATCH(batch, 0x8C000000);
+    OUT_BCS_BATCH(batch, 0x00010000);
+       /* DW12-13 */
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0x02010100);
+       /* DW14-15: for short format */
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Emit one 18-dword MFX_QM_STATE command carrying a quantizer matrix.
+ *
+ * qm_type   - matrix selector, written to DW1
+ * qm        - source matrix data, qm_length dwords long
+ * qm_length - number of valid dwords in qm; must not exceed 16 (asserted)
+ *
+ * The payload is always 16 dwords. The staging buffer is zero-initialized
+ * so that, when qm_length is below 16, the unused tail is zero instead of
+ * indeterminate stack contents.
+ */
+static void
+gen8_mfc_qm_state(VADriverContextP ctx,
+                  int qm_type,
+                  unsigned int *qm,
+                  int qm_length,
+                  struct intel_encoder_context *encoder_context)
+{
+    struct intel_batchbuffer *batch = encoder_context->base.batch;
+    unsigned int qm_buffer[16] = { 0 };
+
+    assert(qm_length <= 16);
+    assert(sizeof(*qm) == 4);
+    memcpy(qm_buffer, qm, qm_length * 4);
+
+    BEGIN_BCS_BATCH(batch, 18);
+    OUT_BCS_BATCH(batch, MFX_QM_STATE | (18 - 2));
+    OUT_BCS_BATCH(batch, qm_type << 0);
+    intel_batchbuffer_data(batch, qm_buffer, 16 * 4);
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Program the four AVC quantizer matrices with the same flat matrix (every
+ * dword 0x10101010): 12 dwords for each 4x4 matrix type and 16 dwords for
+ * each 8x8 matrix type.
+ */
+static void
+gen8_mfc_avc_qm_state(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
+{
+    unsigned int flat_matrix[16];
+    int n;
+
+    for (n = 0; n < 16; n++) {
+        flat_matrix[n] = 0x10101010;
+    }
+
+    /* 4x4 matrices */
+    gen8_mfc_qm_state(ctx, MFX_QM_AVC_4X4_INTRA_MATRIX, flat_matrix, 12, encoder_context);
+    gen8_mfc_qm_state(ctx, MFX_QM_AVC_4X4_INTER_MATRIX, flat_matrix, 12, encoder_context);
+
+    /* 8x8 matrices */
+    gen8_mfc_qm_state(ctx, MFX_QM_AVC_8x8_INTRA_MATRIX, flat_matrix, 16, encoder_context);
+    gen8_mfc_qm_state(ctx, MFX_QM_AVC_8x8_INTER_MATRIX, flat_matrix, 16, encoder_context);
+}
+
+/*
+ * Emit one 34-dword MFX_FQM_STATE command carrying a forward quantizer
+ * matrix.
+ *
+ * fqm_type   - matrix selector, written to DW1
+ * fqm        - source matrix data, fqm_length dwords long
+ * fqm_length - number of valid dwords in fqm; must not exceed 32 (asserted)
+ *
+ * The payload is always 32 dwords. The staging buffer is zero-initialized
+ * so that, when fqm_length is below 32, the unused tail is zero instead of
+ * indeterminate stack contents.
+ */
+static void
+gen8_mfc_fqm_state(VADriverContextP ctx,
+                   int fqm_type,
+                   unsigned int *fqm,
+                   int fqm_length,
+                   struct intel_encoder_context *encoder_context)
+{
+    struct intel_batchbuffer *batch = encoder_context->base.batch;
+    unsigned int fqm_buffer[32] = { 0 };
+
+    assert(fqm_length <= 32);
+    assert(sizeof(*fqm) == 4);
+    memcpy(fqm_buffer, fqm, fqm_length * 4);
+
+    BEGIN_BCS_BATCH(batch, 34);
+    OUT_BCS_BATCH(batch, MFX_FQM_STATE | (34 - 2));
+    OUT_BCS_BATCH(batch, fqm_type << 0);
+    intel_batchbuffer_data(batch, fqm_buffer, 32 * 4);
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Program the four AVC forward quantizer matrices with the same flat
+ * matrix (every dword 0x10001000): 24 dwords for each 4x4 matrix type and
+ * 32 dwords for each 8x8 matrix type.
+ */
+static void
+gen8_mfc_avc_fqm_state(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
+{
+    unsigned int flat_matrix[32];
+    int n;
+
+    for (n = 0; n < 32; n++) {
+        flat_matrix[n] = 0x10001000;
+    }
+
+    /* 4x4 matrices */
+    gen8_mfc_fqm_state(ctx, MFX_QM_AVC_4X4_INTRA_MATRIX, flat_matrix, 24, encoder_context);
+    gen8_mfc_fqm_state(ctx, MFX_QM_AVC_4X4_INTER_MATRIX, flat_matrix, 24, encoder_context);
+
+    /* 8x8 matrices */
+    gen8_mfc_fqm_state(ctx, MFX_QM_AVC_8x8_INTRA_MATRIX, flat_matrix, 32, encoder_context);
+    gen8_mfc_fqm_state(ctx, MFX_QM_AVC_8x8_INTER_MATRIX, flat_matrix, 32, encoder_context);
+}
+/*
+ * Emit an MFX_INSERT_OBJECT command followed by lenght_in_dws dwords of
+ * payload copied from insert_data.
+ *
+ * batch may be NULL, in which case the encoder context's base batch is
+ * used. The remaining integer arguments are packed into DW1 at the bit
+ * positions shown below; the three flag arguments are normalized to 0/1.
+ */
+static void
+gen8_mfc_avc_insert_object(VADriverContextP ctx, struct intel_encoder_context *encoder_context,
+                           unsigned int *insert_data, int lenght_in_dws, int data_bits_in_last_dw,
+                           int skip_emul_byte_count, int is_last_header, int is_end_of_slice, int emulation_flag,
+                           struct intel_batchbuffer *batch)
+{
+    if (batch == NULL)
+        batch = encoder_context->base.batch;
+
+    BEGIN_BCS_BATCH(batch, lenght_in_dws + 2);
+
+    OUT_BCS_BATCH(batch, MFX_INSERT_OBJECT | (lenght_in_dws + 2 - 2));
+    OUT_BCS_BATCH(batch,
+                  (0 << 16) |   /* always start at offset 0 */
+                  (data_bits_in_last_dw << 8) | /* bits 8 and up */
+                  (skip_emul_byte_count << 4) | /* bits 4 and up */
+                  (!!emulation_flag << 3) | /* bit 3 */
+                  ((!!is_last_header) << 2) | /* bit 2 */
+                  ((!!is_end_of_slice) << 1) | /* bit 1 */
+                  (0 << 0));    /* FIXME: ??? */
+    intel_batchbuffer_data(batch, insert_data, lenght_in_dws * 4);
+
+    ADVANCE_BCS_BATCH(batch);
+}
+
+
+/*
+ * Reset the MFC context for a new encode and (re)allocate scratch buffers.
+ *
+ * The frame size in macroblocks is read from the sequence parameter buffer:
+ * directly for H.264, or computed from the 16-aligned picture size for
+ * MPEG-2 (any other codec trips the assert).
+ *
+ * Steps:
+ *  - drop references to the per-frame output, source, PAK-BSE, direct MV
+ *    and reference surface buffer objects and clear the pointers;
+ *  - reallocate the intra row store, macroblock status, deblocking filter
+ *    row store and BSD/MPC row store scratch buffers, sized from the
+ *    macroblock dimensions;
+ *  - replace the auxiliary batch buffer and describe it as a surface of
+ *    16-byte blocks;
+ *  - initialize the GPE context.
+ */
+static void gen8_mfc_init(VADriverContextP ctx,
+                       struct encode_state *encode_state,
+                       struct intel_encoder_context *encoder_context)
+{
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    dri_bo *bo;
+    int i;
+    int width_in_mbs = 0;
+    int height_in_mbs = 0;
+
+    if (encoder_context->codec == CODEC_H264) {
+        VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
+        width_in_mbs = pSequenceParameter->picture_width_in_mbs;
+        height_in_mbs = pSequenceParameter->picture_height_in_mbs;
+    } else {
+        VAEncSequenceParameterBufferMPEG2 *pSequenceParameter = (VAEncSequenceParameterBufferMPEG2 *)encode_state->seq_param_ext->buffer;
+
+        assert(encoder_context->codec == CODEC_MPEG2);
+
+        width_in_mbs = ALIGN(pSequenceParameter->picture_width, 16) / 16;
+        height_in_mbs = ALIGN(pSequenceParameter->picture_height, 16) / 16;
+    }
+
+    /*Encode common setup for MFC*/
+    dri_bo_unreference(mfc_context->post_deblocking_output.bo);
+    mfc_context->post_deblocking_output.bo = NULL;
+
+    dri_bo_unreference(mfc_context->pre_deblocking_output.bo);
+    mfc_context->pre_deblocking_output.bo = NULL;
+
+    dri_bo_unreference(mfc_context->uncompressed_picture_source.bo);
+    mfc_context->uncompressed_picture_source.bo = NULL;
+
+    dri_bo_unreference(mfc_context->mfc_indirect_pak_bse_object.bo); 
+    mfc_context->mfc_indirect_pak_bse_object.bo = NULL;
+
+    /*
+     * The guard previously ended in a stray ';', which turned it into an
+     * empty statement; it now actually protects the unreference, matching
+     * the reference-surface loop below.
+     */
+    for (i = 0; i < NUM_MFC_DMV_BUFFERS; i++){
+        if (mfc_context->direct_mv_buffers[i].bo != NULL)
+            dri_bo_unreference(mfc_context->direct_mv_buffers[i].bo);
+        mfc_context->direct_mv_buffers[i].bo = NULL;
+    }
+
+    for (i = 0; i < MAX_MFC_REFERENCE_SURFACES; i++){
+        if (mfc_context->reference_surfaces[i].bo != NULL)
+            dri_bo_unreference(mfc_context->reference_surfaces[i].bo);
+        mfc_context->reference_surfaces[i].bo = NULL;  
+    }
+
+    /* Scratch buffers: release the old one, then allocate for this frame size. */
+    dri_bo_unreference(mfc_context->intra_row_store_scratch_buffer.bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr,
+                      "Buffer",
+                      width_in_mbs * 64,
+                      64);
+    assert(bo);
+    mfc_context->intra_row_store_scratch_buffer.bo = bo;
+
+    dri_bo_unreference(mfc_context->macroblock_status_buffer.bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr,
+                      "Buffer",
+                      width_in_mbs * height_in_mbs * 16,
+                      64);
+    assert(bo);
+    mfc_context->macroblock_status_buffer.bo = bo;
+
+    dri_bo_unreference(mfc_context->deblocking_filter_row_store_scratch_buffer.bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr,
+                      "Buffer",
+                      4 * width_in_mbs * 64,  /* 4 * width_in_mbs * 64 */
+                      64);
+    assert(bo);
+    mfc_context->deblocking_filter_row_store_scratch_buffer.bo = bo;
+
+    dri_bo_unreference(mfc_context->bsd_mpc_row_store_scratch_buffer.bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr,
+                      "Buffer",
+                      2 * width_in_mbs * 64, /* 2 * width_in_mbs * 64 */
+                      0x1000);
+    assert(bo);
+    mfc_context->bsd_mpc_row_store_scratch_buffer.bo = bo;
+
+    dri_bo_unreference(mfc_context->mfc_batchbuffer_surface.bo);
+    mfc_context->mfc_batchbuffer_surface.bo = NULL;
+
+    dri_bo_unreference(mfc_context->aux_batchbuffer_surface.bo);
+    mfc_context->aux_batchbuffer_surface.bo = NULL;
+
+    if (mfc_context->aux_batchbuffer)
+        intel_batchbuffer_free(mfc_context->aux_batchbuffer);
+
+    /* Fresh auxiliary batch buffer; the surface takes its own reference on the bo. */
+    mfc_context->aux_batchbuffer = intel_batchbuffer_new(&i965->intel, I915_EXEC_BSD, 0);
+    mfc_context->aux_batchbuffer_surface.bo = mfc_context->aux_batchbuffer->buffer;
+    dri_bo_reference(mfc_context->aux_batchbuffer_surface.bo);
+    mfc_context->aux_batchbuffer_surface.pitch = 16;
+    mfc_context->aux_batchbuffer_surface.num_blocks = mfc_context->aux_batchbuffer->size / 16;
+    mfc_context->aux_batchbuffer_surface.size_block = 16;
+
+    i965_gpe_context_init(ctx, &mfc_context->gpe_context);
+}
+/*
+ * Emit the 61-dword MFX_PIPE_BUF_ADDR_STATE command.
+ *
+ * Each buffer group starts with a relocation to the corresponding buffer
+ * object from the MFC context. The pre-/post-deblocking outputs and the
+ * reference surfaces emit a zero dword instead when their bo is NULL; the
+ * other buffers are relocated unconditionally. The two ILDB groups are
+ * written as zero.
+ */
+static void
+gen8_mfc_pipe_buf_addr_state(VADriverContextP ctx,
+                               struct intel_encoder_context *encoder_context)
+{
+    struct intel_batchbuffer *batch = encoder_context->base.batch;
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    int i;
+
+    BEGIN_BCS_BATCH(batch, 61);
+
+    OUT_BCS_BATCH(batch, MFX_PIPE_BUF_ADDR_STATE | (61 - 2));
+
+    /* DW1-3: pre-deblocking output */
+    if (mfc_context->pre_deblocking_output.bo)
+        OUT_BCS_RELOC(batch, mfc_context->pre_deblocking_output.bo,
+                      I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                      0);
+    else
+        OUT_BCS_BATCH(batch, 0);                                                                                       /* pre output addr   */
+
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+     /* DW4-6: post-deblocking output */
+
+    if (mfc_context->post_deblocking_output.bo)
+        OUT_BCS_RELOC(batch, mfc_context->post_deblocking_output.bo,
+                      I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                      0);                                                                                      /* post output addr  */ 
+    else
+        OUT_BCS_BATCH(batch, 0);
+    
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+
+     /* DW7-9: uncompressed source picture */
+    OUT_BCS_RELOC(batch, mfc_context->uncompressed_picture_source.bo,
+                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                  0); /* uncompressed data */
+
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+
+     /* DW10-12: MB status (the same bo is relocated again at DW52) */
+    OUT_BCS_RELOC(batch, mfc_context->macroblock_status_buffer.bo,
+                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                  0); /* StreamOut data*/
+    
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+
+     /* DW13-15: intra row store scratch buffer */
+    OUT_BCS_RELOC(batch, mfc_context->intra_row_store_scratch_buffer.bo,
+                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                  0);  
+
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+
+     /* DW16-18: deblocking filter row store scratch buffer */
+    OUT_BCS_RELOC(batch, mfc_context->deblocking_filter_row_store_scratch_buffer.bo,
+                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                  0);
+
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+
+    /* DW19-50: reference pictures, two dwords per entry (address or 0, then 0) */
+    for (i = 0; i < ARRAY_ELEMS(mfc_context->reference_surfaces); i++) {
+        if ( mfc_context->reference_surfaces[i].bo != NULL) {
+            OUT_BCS_RELOC(batch, mfc_context->reference_surfaces[i].bo,
+                          I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                          0);                  
+        } else {
+            OUT_BCS_BATCH(batch, 0);
+        }
+
+       OUT_BCS_BATCH(batch, 0);
+    }
+
+    OUT_BCS_BATCH(batch, 0); /* DW51 */
+
+       /* DW52-54: MB status buffer */
+    OUT_BCS_RELOC(batch, mfc_context->macroblock_status_buffer.bo,
+                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                  0);                                                                                  /* Macroblock status buffer*/
+       
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+
+    /* DW55-57: ILDB buffer (unused, zero) */
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+
+    /* DW58-60: second ILDB buffer (unused, zero) */
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+
+    ADVANCE_BCS_BATCH(batch);
+}
+/*
+ * Emit the 71-dword MFX_AVC_DIRECTMODE_STATE command.
+ *
+ * The loop visits every second direct MV buffer below index
+ * NUM_MFC_DMV_BUFFERS - 2 and emits two dwords for each (a relocation plus
+ * zero, or two zeros when the bo is NULL). The buffer at index
+ * NUM_MFC_DMV_BUFFERS - 2 is then relocated unconditionally, followed by a
+ * 32-entry list holding i/2 and two trailing zero dwords.
+ */
+static void
+gen8_mfc_avc_directmode_state(VADriverContextP ctx,
+                               struct intel_encoder_context *encoder_context)
+{
+    struct intel_batchbuffer *batch = encoder_context->base.batch;
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+
+    int i;
+
+    BEGIN_BCS_BATCH(batch, 71);
+
+    OUT_BCS_BATCH(batch, MFX_AVC_DIRECTMODE_STATE | (71 - 2));
+
+    /* Reference frames and Current frames */
+    /* DW1-32: direct MV buffers for the references */
+    for(i = 0; i < NUM_MFC_DMV_BUFFERS - 2; i += 2) {
+        if ( mfc_context->direct_mv_buffers[i].bo != NULL) { 
+            OUT_BCS_RELOC(batch, mfc_context->direct_mv_buffers[i].bo,
+                          I915_GEM_DOMAIN_INSTRUCTION, 0,
+                          0);
+            OUT_BCS_BATCH(batch, 0);
+        } else {
+            OUT_BCS_BATCH(batch, 0);
+            OUT_BCS_BATCH(batch, 0);
+        }
+    }
+    
+    OUT_BCS_BATCH(batch, 0); /* DW33 */
+
+    /* DW34-36: the MV buffer for the current picture (original wording: "current reference") */
+    OUT_BCS_RELOC(batch, mfc_context->direct_mv_buffers[NUM_MFC_DMV_BUFFERS - 2].bo,
+                  I915_GEM_DOMAIN_INSTRUCTION, 0,
+                  0);
+
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+
+    /* "POL list" in the original comment; presumably the POC list - TODO confirm */
+    for(i = 0; i < 32; i++) {
+        OUT_BCS_BATCH(batch, i/2);
+    }
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Emit one 10-dword MFX_AVC_REF_IDX_STATE command: header, list selector,
+ * the first entry dword, then seven dwords of 0x80808080.
+ */
+static void
+gen8_mfc_emit_ref_idx_list(struct intel_batchbuffer *batch,
+                           unsigned int list_select,
+                           unsigned int first_entries)
+{
+    int remaining;
+
+    BEGIN_BCS_BATCH(batch, 10);
+    OUT_BCS_BATCH(batch, MFX_AVC_REF_IDX_STATE | 8);
+    OUT_BCS_BATCH(batch, list_select);
+    OUT_BCS_BATCH(batch, first_entries);
+    for (remaining = 7; remaining > 0; remaining--) {
+        OUT_BCS_BATCH(batch, 0x80808080);
+    }
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Program both AVC reference index lists with a single reference each:
+ * list L0 first (selector 0), then list L1 (selector 1).
+ */
+static void
+gen8_mfc_avc_ref_idx_state(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
+{
+    struct intel_batchbuffer *batch = encoder_context->base.batch;
+
+    /* Select L0, only 1 reference */
+    gen8_mfc_emit_ref_idx_list(batch, 0, 0x80808020);
+
+    /* Select L1, only 1 reference */
+    gen8_mfc_emit_ref_idx_list(batch, 1, 0x80808022);
+}
+
+
+/*
+ * Emit the 10-dword MFX_BSP_BUF_BASE_ADDR_STATE command. Only the BSD/MPC
+ * row store scratch buffer is relocated; every other dword is zero.
+ */
+static void
+gen8_mfc_bsp_buf_base_addr_state(VADriverContextP ctx,
+                               struct intel_encoder_context *encoder_context)
+{
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    struct intel_batchbuffer *batch = encoder_context->base.batch;
+    int dw;
+
+    BEGIN_BCS_BATCH(batch, 10);
+
+    /* DW0: header */
+    OUT_BCS_BATCH(batch, MFX_BSP_BUF_BASE_ADDR_STATE | (10 - 2));
+
+    /* DW1: BSD/MPC row store scratch buffer */
+    OUT_BCS_RELOC(batch, mfc_context->bsd_mpc_row_store_scratch_buffer.bo,
+                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                  0);
+
+    /*
+     * DW2-3 complete the first group; DW4-6 (MPR Row Store Scratch Buffer
+     * Base Address) and DW7-9 (Bitplane Read Buffer Base Address) are all
+     * written as zero.
+     */
+    for (dw = 2; dw <= 9; dw++) {
+        OUT_BCS_BATCH(batch, 0);
+    }
+
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Emit the per-picture AVC state commands in a fixed order. Some steps
+ * dispatch through function pointers stored in the MFC context; the others
+ * call the gen8 functions in this file directly.
+ */
+static void gen8_mfc_avc_pipeline_picture_programing( VADriverContextP ctx,
+                                      struct encode_state *encode_state,
+                                      struct intel_encoder_context *encoder_context)
+{
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+
+    mfc_context->pipe_mode_select(ctx, MFX_FORMAT_AVC, encoder_context);
+    mfc_context->set_surface_state(ctx, encoder_context);
+    mfc_context->ind_obj_base_addr_state(ctx, encoder_context);
+    gen8_mfc_pipe_buf_addr_state(ctx, encoder_context);
+    gen8_mfc_bsp_buf_base_addr_state(ctx, encoder_context);
+    mfc_context->avc_img_state(ctx, encode_state, encoder_context);
+    mfc_context->avc_qm_state(ctx, encoder_context);
+    mfc_context->avc_fqm_state(ctx, encoder_context);
+    gen8_mfc_avc_directmode_state(ctx, encoder_context); 
+    gen8_mfc_avc_ref_idx_state(ctx, encoder_context);
+}
+
+
+/*
+ * Run the pipeline by flushing the encoder's base batch buffer.
+ * Always returns VA_STATUS_SUCCESS.
+ */
+static VAStatus gen8_mfc_run(VADriverContextP ctx, 
+                             struct encode_state *encode_state,
+                             struct intel_encoder_context *encoder_context)
+{
+    /* Flushing the batch is what starts the pipeline. */
+    intel_batchbuffer_flush(encoder_context->base.batch);
+
+    return VA_STATUS_SUCCESS;
+}
+
+
+/*
+ * Read back the size of the encoded output.
+ *
+ * Maps the coded buffer named by the H.264 picture parameters, stores its
+ * first segment's size in bits (size * 8) in *encoded_bits_size, and
+ * unmaps the buffer again.
+ *
+ * Returns VA_STATUS_SUCCESS, or the i965_MapBuffer() status if mapping
+ * fails; in that case *encoded_bits_size is left untouched. The assert is
+ * kept for debug builds; the explicit check prevents release builds from
+ * dereferencing an unset segment pointer.
+ */
+static VAStatus
+gen8_mfc_stop(VADriverContextP ctx, 
+              struct encode_state *encode_state,
+              struct intel_encoder_context *encoder_context,
+              int *encoded_bits_size)
+{
+    VAStatus vaStatus = VA_STATUS_ERROR_UNKNOWN;
+    VAEncPictureParameterBufferH264 *pPicParameter = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
+    VACodedBufferSegment *coded_buffer_segment = NULL;
+    
+    vaStatus = i965_MapBuffer(ctx, pPicParameter->coded_buf, (void **)&coded_buffer_segment);
+    assert(vaStatus == VA_STATUS_SUCCESS);
+    if (vaStatus != VA_STATUS_SUCCESS)
+        return vaStatus;
+
+    *encoded_bits_size = coded_buffer_segment->size * 8;
+    i965_UnmapBuffer(ctx, pPicParameter->coded_buf);
+
+    return VA_STATUS_SUCCESS;
+}
+
+
+/*
+ * Emit the 11-dword MFX_AVC_SLICE_STATE command for one slice.
+ *
+ * pic_param and slice_param supply the weighted-prediction, deblocking and
+ * CABAC fields; qp is programmed as the slice quantization parameter.  The
+ * slice position is derived from slice_param->macroblock_address and
+ * slice_param->num_macroblocks using the picture size stored in
+ * mfc_context->surface_state.  When batch is NULL the command is written to
+ * encoder_context->base.batch.
+ *
+ * rate_control_enable is accepted but not programmed: the
+ * RateControlCounterEnable and RC Panic Enable bits are hard-wired to 0.
+ */
+static void
+gen8_mfc_avc_slice_state(VADriverContextP ctx,
+                         VAEncPictureParameterBufferH264 *pic_param,
+                         VAEncSliceParameterBufferH264 *slice_param,
+                         struct encode_state *encode_state,
+                         struct intel_encoder_context *encoder_context,
+                         int rate_control_enable,
+                         int qp,
+                         struct intel_batchbuffer *batch)
+{
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
+    int height_in_mbs = (mfc_context->surface_state.height + 15) / 16;
+    int beginmb = slice_param->macroblock_address;
+    int endmb = beginmb + slice_param->num_macroblocks;
+    int beginx = beginmb % width_in_mbs;
+    int beginy = beginmb / width_in_mbs;
+    int nextx =  endmb % width_in_mbs;
+    int nexty = endmb / width_in_mbs;
+    int slice_type = intel_avc_enc_slice_type_fixup(slice_param->slice_type);
+    int last_slice = (endmb == (width_in_mbs * height_in_mbs));
+    int maxQpN, maxQpP;
+    unsigned char correct[6], grow, shrink;
+    int i;
+    /* Unsigned: this value is shifted into bits 31:30 below. */
+    unsigned int weighted_pred_idc = 0;
+    unsigned int luma_log2_weight_denom = slice_param->luma_log2_weight_denom;
+    unsigned int chroma_log2_weight_denom = slice_param->chroma_log2_weight_denom;
+    int bslice = 0;
+
+    if (batch == NULL)
+        batch = encoder_context->base.batch;
+
+    if (slice_type == SLICE_TYPE_P) {
+        weighted_pred_idc = pic_param->pic_fields.bits.weighted_pred_flag;
+    } else if (slice_type == SLICE_TYPE_B) {
+        weighted_pred_idc = pic_param->pic_fields.bits.weighted_bipred_idc;
+        bslice = 1;
+
+        if (weighted_pred_idc == 2) {
+            /* 8.4.3 - Derivation process for prediction weights (8-279) */
+            luma_log2_weight_denom = 5;
+            chroma_log2_weight_denom = 5;
+        }
+    }
+
+    /* Per-slice-type rate control parameters. */
+    maxQpN = mfc_context->bit_rate_control_context[slice_type].MaxQpNegModifier;
+    maxQpP = mfc_context->bit_rate_control_context[slice_type].MaxQpPosModifier;
+
+    for (i = 0; i < 6; i++)
+        correct[i] = mfc_context->bit_rate_control_context[slice_type].Correct[i];
+
+    /* Init value in the low nibble, resistance in the high nibble. */
+    grow = mfc_context->bit_rate_control_context[slice_type].GrowInit + 
+        (mfc_context->bit_rate_control_context[slice_type].GrowResistance << 4);
+    shrink = mfc_context->bit_rate_control_context[slice_type].ShrinkInit + 
+        (mfc_context->bit_rate_control_context[slice_type].ShrinkResistance << 4);
+
+    BEGIN_BCS_BATCH(batch, 11);
+
+    OUT_BCS_BATCH(batch, MFX_AVC_SLICE_STATE | (11 - 2) );
+    OUT_BCS_BATCH(batch, slice_type);                  /*Slice Type: I:P:B Slice*/
+
+    if (slice_type == SLICE_TYPE_I) {
+        OUT_BCS_BATCH(batch, 0);                       /*no reference frames and pred_weight_table*/
+    } else {
+        OUT_BCS_BATCH(batch,
+                      (1 << 16) | (bslice << 24) |                     /*1 reference frame*/
+                      (chroma_log2_weight_denom << 8) |
+                      (luma_log2_weight_denom << 0));
+    }
+
+    OUT_BCS_BATCH(batch, 
+                  (weighted_pred_idc << 30) |
+                  (slice_param->direct_spatial_mv_pred_flag<<29) |             /*Direct Prediction Type*/
+                  (slice_param->disable_deblocking_filter_idc << 27) |
+                  (slice_param->cabac_init_idc << 24) |
+                  (qp<<16) |                   /*Slice Quantization Parameter*/
+                  ((slice_param->slice_beta_offset_div2 & 0xf) << 8) |
+                  ((slice_param->slice_alpha_c0_offset_div2 & 0xf) << 0));
+    OUT_BCS_BATCH(batch,
+                  ((unsigned int)beginy << 24) |       /*First MB X&Y , the begin position of current slice*/
+                  (beginx << 16) |
+                  slice_param->macroblock_address );
+    OUT_BCS_BATCH(batch, (nexty << 16) | nextx);                       /*Next slice first MB X&Y*/
+    OUT_BCS_BATCH(batch, 
+                  (0/*rate_control_enable*/ << 31) |           /*in CBR mode RateControlCounterEnable = enable*/
+                  (1 << 30) |          /*ResetRateControlCounter*/
+                  (0 << 28) |          /*RC Trigger Mode = Always Rate Control*/
+                  (4 << 24) |     /*RC Stable Tolerance, middle level*/
+                  (0/*rate_control_enable*/ << 23) |     /*RC Panic Enable*/                 
+                  (0 << 22) |     /*QP mode, don't modify CBP*/
+                  (0 << 21) |     /*MB Type Direct Conversion Enabled*/ 
+                  (0 << 20) |     /*MB Type Skip Conversion Enabled*/ 
+                  (last_slice << 19) |     /*IsLastSlice*/
+                  (0 << 18) |  /*BitstreamOutputFlag Compressed BitStream Output Disable Flag 0:enable 1:disable*/
+                  (1 << 17) |      /*HeaderPresentFlag*/       
+                  (1 << 16) |      /*SliceData PresentFlag*/
+                  (1 << 15) |      /*TailPresentFlag*/
+                  (1 << 13) |      /*RBSP NAL TYPE*/   
+                  (0 << 12) );    /*CabacZeroWordInsertionEnable*/
+    OUT_BCS_BATCH(batch, mfc_context->mfc_indirect_pak_bse_object.offset);
+    OUT_BCS_BATCH(batch,
+                  (maxQpN << 24) |     /*Target QP - 24 is lowest QP*/ 
+                  (maxQpP << 16) |     /*Target QP + 20 is highest QP*/
+                  (shrink << 8)  |
+                  (grow << 0));   
+    OUT_BCS_BATCH(batch,
+                  (correct[5] << 20) |
+                  (correct[4] << 16) |
+                  (correct[3] << 12) |
+                  (correct[2] << 8) |
+                  (correct[1] << 4) |
+                  (correct[0] << 0));
+    OUT_BCS_BATCH(batch, 0);
+
+    ADVANCE_BCS_BATCH(batch);
+}
+
+
+#ifdef MFC_SOFTWARE_HASWELL
+
+/*
+ * Emit one 12-dword MFC_AVC_PAK_OBJECT command for an intra macroblock.
+ *
+ * x, y        macroblock position, packed into dword 4
+ * end_mb      non-zero for the last macroblock of the slice (bit 26 of dword 6)
+ * qp          quantization parameter written into dword 6
+ * msg         VME output record for this macroblock; msg[0]..msg[3] are read
+ * batch       target batch buffer, or NULL for encoder_context->base.batch
+ *
+ * Returns the command length in dwords.
+ */
+static int
+gen8_mfc_avc_pak_object_intra(VADriverContextP ctx, int x, int y, int end_mb,
+                              int qp,unsigned int *msg,
+                              struct intel_encoder_context *encoder_context,
+                              unsigned char target_mb_size, unsigned char max_mb_size,
+                              struct intel_batchbuffer *batch)
+{
+    int len_in_dwords = 12;
+    unsigned int intra_msg;
+#define                INTRA_MSG_FLAG          (1 << 13)
+#define                INTRA_MBTYPE_MASK       (0x1F0000)
+    if (batch == NULL)
+        batch = encoder_context->base.batch;
+
+    BEGIN_BCS_BATCH(batch, len_in_dwords);
+
+    /*
+     * Keep bits 7:0 and 15:14 of the VME word, set the intra flag and move
+     * the macroblock type from bits 20:16 down to bits 12:8.
+     */
+    intra_msg = msg[0] & 0xC0FF;
+    intra_msg |= INTRA_MSG_FLAG;
+    intra_msg |= ((msg[0] & INTRA_MBTYPE_MASK) >> 8);
+    OUT_BCS_BATCH(batch, MFC_AVC_PAK_OBJECT | (len_in_dwords - 2));
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 
+                  (0 << 24) |          /* PackedMvNum, Debug*/
+                  (0 << 20) |          /* No motion vector */
+                  (1 << 19) |          /* CbpDcY */
+                  (1 << 18) |          /* CbpDcU */
+                  (1 << 17) |          /* CbpDcV */
+                  intra_msg);
+
+    /* Unsigned constant: 0xFFFF << 16 does not fit in a signed int. */
+    OUT_BCS_BATCH(batch, (0xFFFFu << 16) | (y << 8) | x);              /* Code Block Pattern for Y*/
+    OUT_BCS_BATCH(batch, 0x000F000F);                                                  /* Code Block Pattern */                
+    OUT_BCS_BATCH(batch, (0 << 27) | (end_mb << 26) | qp);     /* Last MB */
+
+    /*Stuff for Intra MB*/
+    OUT_BCS_BATCH(batch, msg[1]);                      /* Prediction mode words, copied from the VME output */
+    OUT_BCS_BATCH(batch, msg[2]);      
+    OUT_BCS_BATCH(batch, msg[3]&0xFF); 
+    
+    /*MaxSizeInWord and TargetSizeInWord*/
+    OUT_BCS_BATCH(batch, ((unsigned int)max_mb_size << 24) |
+                  ((unsigned int)target_mb_size << 16) );
+
+    OUT_BCS_BATCH(batch, 0);
+
+    ADVANCE_BCS_BATCH(batch);
+
+    return len_in_dwords;
+}
+
+/*
+ * Emit one 12-dword MFC_AVC_PAK_OBJECT command for an inter macroblock.
+ *
+ * x, y        macroblock position, packed into dword 4
+ * end_mb      non-zero for the last macroblock of the slice (bit 26 of dword 6)
+ * qp          quantization parameter written into dword 6
+ * msg         inter part of the VME output record; the motion vectors that
+ *             follow it are rearranged IN PLACE before the command is built
+ * offset      written verbatim as dword 2 of the command
+ * slice_type  only referenced by the disabled (#if 0) variant below
+ * batch       target batch buffer, or NULL for encoder_context->base.batch
+ *
+ * Returns the command length in dwords.
+ */
+static int
+gen8_mfc_avc_pak_object_inter(VADriverContextP ctx, int x, int y, int end_mb, int qp,
+                              unsigned int *msg, unsigned int offset,
+                              struct intel_encoder_context *encoder_context,
+                              unsigned char target_mb_size,unsigned char max_mb_size, int slice_type,
+                              struct intel_batchbuffer *batch)
+{
+    int len_in_dwords = 12;
+    unsigned int inter_msg = 0;
+
+    if (batch == NULL)
+        batch = encoder_context->base.batch;
+    {
+#define MSG_MV_OFFSET  4
+        unsigned int *mv_ptr;
+        mv_ptr = msg + MSG_MV_OFFSET;
+        /* MV of VME output is based on 16 sub-blocks. So it is necessary
+         * to convert them to be compatible with the format of AVC_PAK
+         * command.
+         */
+        if ((msg[0] & INTER_MODE_MASK) == INTER_8X16) {
+            /* MV[0] and MV[2] are replicated */
+            mv_ptr[4] = mv_ptr[0];
+            mv_ptr[5] = mv_ptr[1];
+            mv_ptr[2] = mv_ptr[8];
+            mv_ptr[3] = mv_ptr[9];
+            mv_ptr[6] = mv_ptr[8];
+            mv_ptr[7] = mv_ptr[9];
+        } else if ((msg[0] & INTER_MODE_MASK) == INTER_16X8) {
+            /* MV[0] and MV[1] are replicated */
+            mv_ptr[2] = mv_ptr[0];
+            mv_ptr[3] = mv_ptr[1];
+            mv_ptr[4] = mv_ptr[16];
+            mv_ptr[5] = mv_ptr[17];
+            mv_ptr[6] = mv_ptr[24];
+            mv_ptr[7] = mv_ptr[25];
+        } else if (((msg[0] & INTER_MODE_MASK) == INTER_8X8) &&
+                   !(msg[1] & SUBMB_SHAPE_MASK)) {
+            /* Don't touch MV[0] or MV[1] */
+            mv_ptr[2] = mv_ptr[8];
+            mv_ptr[3] = mv_ptr[9];
+            mv_ptr[4] = mv_ptr[16];
+            mv_ptr[5] = mv_ptr[17];
+            mv_ptr[6] = mv_ptr[24];
+            mv_ptr[7] = mv_ptr[25];
+        }
+    }
+
+    BEGIN_BCS_BATCH(batch, len_in_dwords);
+
+    OUT_BCS_BATCH(batch, MFC_AVC_PAK_OBJECT | (len_in_dwords - 2));
+
+    /* MV quantity: 32, or 128 for 8x8 with a non-zero sub-macroblock shape */
+    inter_msg = 32;
+    if ((msg[0] & INTER_MODE_MASK) == INTER_8X8) {
+        if (msg[1] & SUBMB_SHAPE_MASK)
+            inter_msg = 128;
+    }
+    OUT_BCS_BATCH(batch, inter_msg);
+    OUT_BCS_BATCH(batch, offset);
+
+    /* Mode word: selected VME bits plus the MV-count flag and bits 19:17. */
+    inter_msg = msg[0] & (0x1F00FFFF);
+    inter_msg |= INTER_MV8;
+    inter_msg |= ((1 << 19) | (1 << 18) | (1 << 17));
+    if (((msg[0] & INTER_MODE_MASK) == INTER_8X8) &&
+        (msg[1] & SUBMB_SHAPE_MASK)) {
+        inter_msg |= INTER_MV32;
+    }
+
+    OUT_BCS_BATCH(batch, inter_msg);
+
+    /* Unsigned constant: 0xFFFF << 16 does not fit in a signed int. */
+    OUT_BCS_BATCH(batch, (0xFFFFu << 16) | (y << 8) | x);     /* Code Block Pattern for Y*/
+    OUT_BCS_BATCH(batch, 0x000F000F);                         /* Code Block Pattern */  
+#if 0 
+    if ( slice_type == SLICE_TYPE_B) {
+        OUT_BCS_BATCH(batch, (0xF<<28) | (end_mb << 26) | qp); /* Last MB */
+    } else {
+        OUT_BCS_BATCH(batch, (end_mb << 26) | qp);     /* Last MB */
+    }
+#else
+    OUT_BCS_BATCH(batch, (end_mb << 26) | qp); /* Last MB */
+#endif
+
+    inter_msg = msg[1] >> 8;
+    /*Stuff for Inter MB*/
+    OUT_BCS_BATCH(batch, inter_msg);        
+    OUT_BCS_BATCH(batch, 0x0);    
+    OUT_BCS_BATCH(batch, 0x0);        
+
+    /*MaxSizeInWord and TargetSizeInWord*/
+    OUT_BCS_BATCH(batch, ((unsigned int)max_mb_size << 24) |
+                  ((unsigned int)target_mb_size << 16) );
+
+    OUT_BCS_BATCH(batch, 0x0);    
+
+    ADVANCE_BCS_BATCH(batch);
+
+    return len_in_dwords;
+}
+
+/* Dword index of the intra RDO cost inside one VME output record. */
+#define                AVC_INTRA_RDO_OFFSET    4
+/* Dword index of the inter RDO cost inside one VME output record. */
+#define                AVC_INTER_RDO_OFFSET    10
+/* Dwords to skip from the start of a record to reach its inter message. */
+#define                AVC_INTER_MSG_OFFSET    8       
+/* Added to a record's byte offset to form the MV offset passed to the inter PAK object. */
+#define                AVC_INTER_MV_OFFSET             48
+/* The RDO costs are compared using only their low 16 bits. */
+#define                AVC_RDO_MASK            0xFFFF
+
+/*
+ * Write every command needed to encode slice number slice_index into
+ * slice_batch: the slice state, the packed headers (first slice only), the
+ * slice header, one PAK object per macroblock and the tail data.
+ *
+ * In CBR mode the QP comes from the rate control context of the slice type
+ * and pSliceParameter->slice_qp_delta is rewritten to match it.
+ *
+ * For non-intra slices each macroblock is coded as intra when its intra RDO
+ * cost in the VME output is lower than its inter RDO cost, otherwise as inter.
+ */
+static void 
+gen8_mfc_avc_pipeline_slice_programing(VADriverContextP ctx,
+                                       struct encode_state *encode_state,
+                                       struct intel_encoder_context *encoder_context,
+                                       int slice_index,
+                                       struct intel_batchbuffer *slice_batch)
+{
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+    VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
+    VAEncPictureParameterBufferH264 *pPicParameter = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
+    VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[slice_index]->buffer; 
+    unsigned char *msg_ptr = NULL;
+    int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
+    int height_in_mbs = (mfc_context->surface_state.height + 15) / 16;
+    /* Macroblock range [first_mb, end_mb) covered by this slice. */
+    int first_mb = pSliceParameter->macroblock_address;
+    int end_mb = first_mb + pSliceParameter->num_macroblocks;
+    int last_slice = (end_mb == (width_in_mbs * height_in_mbs));
+    int i,x,y;
+    int qp = pPicParameter->pic_init_qp + pSliceParameter->slice_qp_delta;
+    unsigned int rate_control_mode = encoder_context->rate_control_mode;
+    unsigned char *slice_header = NULL;
+    int slice_header_length_in_bits = 0;
+    unsigned int tail_data[] = { 0x0, 0x0 };
+    int slice_type = intel_avc_enc_slice_type_fixup(pSliceParameter->slice_type);
+    int is_intra = slice_type == SLICE_TYPE_I;
+
+    if (rate_control_mode == VA_RC_CBR) {
+        qp = mfc_context->bit_rate_control_context[slice_type].QpPrimeY;
+        pSliceParameter->slice_qp_delta = qp - pPicParameter->pic_init_qp;
+    }
+
+    /* only support for 8-bit pixel bit-depth */
+    assert(pSequenceParameter->bit_depth_luma_minus8 == 0);
+    assert(pSequenceParameter->bit_depth_chroma_minus8 == 0);
+    assert(pPicParameter->pic_init_qp >= 0 && pPicParameter->pic_init_qp < 52);
+    assert(qp >= 0 && qp < 52);
+
+    gen8_mfc_avc_slice_state(ctx, 
+                             pPicParameter,
+                             pSliceParameter,
+                             encode_state, encoder_context,
+                             (rate_control_mode == VA_RC_CBR), qp, slice_batch);
+
+    if ( slice_index == 0) 
+        intel_mfc_avc_pipeline_header_programing(ctx, encode_state, encoder_context, slice_batch);
+
+    slice_header_length_in_bits = build_avc_slice_header(pSequenceParameter, pPicParameter, pSliceParameter, &slice_header);
+
+    // slice header
+    mfc_context->insert_object(ctx, encoder_context,
+                               (unsigned int *)slice_header, ALIGN(slice_header_length_in_bits, 32) >> 5, slice_header_length_in_bits & 0x1f,
+                               5,  /* first 5 bytes are start code + nal unit type */
+                               1, 0, 1, slice_batch);
+
+    dri_bo_map(vme_context->vme_output.bo , 1);
+    msg_ptr = (unsigned char *)vme_context->vme_output.bo->virtual;
+    assert(msg_ptr);
+
+    for (i = first_mb; i < end_mb; i++) {
+        int last_mb = (i == (end_mb - 1));
+        /* VME output record of macroblock i. */
+        unsigned int *msg = (unsigned int *) (msg_ptr + i * vme_context->vme_output.size_block);
+
+        x = i % width_in_mbs;
+        y = i / width_in_mbs;
+
+        if (is_intra) {
+            gen8_mfc_avc_pak_object_intra(ctx, x, y, last_mb, qp, msg, encoder_context, 0, 0, slice_batch);
+        } else {
+            int inter_rdo = msg[AVC_INTER_RDO_OFFSET] & AVC_RDO_MASK;
+            int intra_rdo = msg[AVC_INTRA_RDO_OFFSET] & AVC_RDO_MASK;
+            unsigned int offset = i * vme_context->vme_output.size_block + AVC_INTER_MV_OFFSET;
+
+            if (intra_rdo < inter_rdo) { 
+                gen8_mfc_avc_pak_object_intra(ctx, x, y, last_mb, qp, msg, encoder_context, 0, 0, slice_batch);
+            } else {
+                msg += AVC_INTER_MSG_OFFSET;
+                gen8_mfc_avc_pak_object_inter(ctx, x, y, last_mb, qp, msg, offset, encoder_context, 0, 0, slice_type, slice_batch);
+            }
+        }
+    }
+   
+    dri_bo_unmap(vme_context->vme_output.bo);
+
+    /* Tail data: two dwords after the last slice of the picture, one otherwise. */
+    if ( last_slice ) {    
+        mfc_context->insert_object(ctx, encoder_context,
+                                   tail_data, 2, 8,
+                                   2, 1, 1, 0, slice_batch);
+    } else {
+        mfc_context->insert_object(ctx, encoder_context,
+                                   tail_data, 1, 8,
+                                   1, 1, 1, 0, slice_batch);
+    }
+
+    free(slice_header);
+
+}
+
+/*
+ * Build the slice-level batch buffer on the CPU.
+ *
+ * A temporary BSD batch of picture_width_in_mbs * picture_height_in_mbs * 64
+ * bytes is created, every slice is programmed into it and it is closed with
+ * MI_BATCH_BUFFER_END.  The temporary batch object is then freed; the
+ * underlying buffer object is returned with one extra reference taken here.
+ */
+static dri_bo *
+gen8_mfc_avc_software_batchbuffer(VADriverContextP ctx,
+                                  struct encode_state *encode_state,
+                                  struct intel_encoder_context *encoder_context)
+{
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    VAEncSequenceParameterBufferH264 *seq_param =
+        (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
+    int mbs_per_row = seq_param->picture_width_in_mbs;
+    int mb_rows = seq_param->picture_height_in_mbs;
+    int bytes = mbs_per_row * mb_rows * 64;
+    struct intel_batchbuffer *sw_batch;
+    dri_bo *result;
+    int slice;
+
+    sw_batch = intel_batchbuffer_new(&i965->intel, I915_EXEC_BSD, bytes);
+    result = sw_batch->buffer;
+
+    for (slice = 0; slice < encode_state->num_slice_params_ext; slice++) {
+        gen8_mfc_avc_pipeline_slice_programing(ctx, encode_state, encoder_context, slice, sw_batch);
+    }
+
+    intel_batchbuffer_align(sw_batch, 8);
+
+    /* Terminate the buffer. */
+    BEGIN_BCS_BATCH(sw_batch, 2);
+    OUT_BCS_BATCH(sw_batch, 0);
+    OUT_BCS_BATCH(sw_batch, MI_BATCH_BUFFER_END);
+    ADVANCE_BCS_BATCH(sw_batch);
+
+    /* Take our own reference before the batch wrapper is freed. */
+    dri_bo_reference(result);
+    intel_batchbuffer_free(sw_batch);
+
+    return result;
+}
+
+#else
+
+/*
+ * Bind the two input surfaces of the batch buffer kernel: the VME output
+ * (BIND_IDX_VME_OUTPUT) and the auxiliary batch buffer surface
+ * (BIND_IDX_MFC_SLICE_HEADER).  Both buffer objects must already exist.
+ */
+static void
+gen8_mfc_batchbuffer_surfaces_input(VADriverContextP ctx,
+                                    struct encode_state *encode_state,
+                                    struct intel_encoder_context *encoder_context)
+{
+    struct gen6_mfc_context *mfc = encoder_context->mfc_context;
+    struct gen6_vme_context *vme = encoder_context->vme_context;
+
+    /* VME output */
+    assert(vme->vme_output.bo);
+    mfc->buffer_suface_setup(ctx, &mfc->gpe_context, &vme->vme_output,
+                             BINDING_TABLE_OFFSET(BIND_IDX_VME_OUTPUT),
+                             SURFACE_STATE_OFFSET(BIND_IDX_VME_OUTPUT));
+
+    /* auxiliary batch buffer */
+    assert(mfc->aux_batchbuffer_surface.bo);
+    mfc->buffer_suface_setup(ctx, &mfc->gpe_context, &mfc->aux_batchbuffer_surface,
+                             BINDING_TABLE_OFFSET(BIND_IDX_MFC_SLICE_HEADER),
+                             SURFACE_STATE_OFFSET(BIND_IDX_MFC_SLICE_HEADER));
+}
+
+/*
+ * Allocate and bind the output surface of the batch buffer kernel
+ * (BIND_IDX_MFC_BATCHBUFFER).
+ *
+ * The surface holds one block per macroblock of the picture plus eight
+ * blocks per slice and one extra block; each block is
+ * 16 * CMD_LEN_IN_OWORD bytes.
+ */
+static void
+gen8_mfc_batchbuffer_surfaces_output(VADriverContextP ctx,
+                                     struct encode_state *encode_state,
+                                     struct intel_encoder_context *encoder_context)
+
+{
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
+    int width_in_mbs = pSequenceParameter->picture_width_in_mbs;
+    int height_in_mbs = pSequenceParameter->picture_height_in_mbs;
+    mfc_context->mfc_batchbuffer_surface.num_blocks = width_in_mbs * height_in_mbs + encode_state->num_slice_params_ext * 8 + 1;
+    mfc_context->mfc_batchbuffer_surface.size_block = 16 * CMD_LEN_IN_OWORD; /* CMD_LEN_IN_OWORD OWORDs of 16 bytes */
+    mfc_context->mfc_batchbuffer_surface.pitch = 16;
+    mfc_context->mfc_batchbuffer_surface.bo = dri_bo_alloc(i965->intel.bufmgr, 
+                                                           "MFC batchbuffer",
+                                                           mfc_context->mfc_batchbuffer_surface.num_blocks * mfc_context->mfc_batchbuffer_surface.size_block,
+                                                           0x1000);
+    /* Same precondition the input surfaces are checked for before binding. */
+    assert(mfc_context->mfc_batchbuffer_surface.bo);
+    mfc_context->buffer_suface_setup(ctx,
+                                     &mfc_context->gpe_context,
+                                     &mfc_context->mfc_batchbuffer_surface,
+                                     BINDING_TABLE_OFFSET(BIND_IDX_MFC_BATCHBUFFER),
+                                     SURFACE_STATE_OFFSET(BIND_IDX_MFC_BATCHBUFFER));
+}
+
+/* Bind the kernel's surfaces: the inputs first, then the output. */
+static void
+gen8_mfc_batchbuffer_surfaces_setup(VADriverContextP ctx,
+                                    struct encode_state *encode_state,
+                                    struct intel_encoder_context *encoder_context)
+{
+    /* inputs */
+    gen8_mfc_batchbuffer_surfaces_input(ctx, encode_state, encoder_context);
+
+    /* output */
+    gen8_mfc_batchbuffer_surfaces_output(ctx, encode_state, encoder_context);
+}
+
+/*
+ * Fill the interface descriptor table of the MFC GPE context: one 32-byte
+ * descriptor per kernel, each followed by a relocation so that its kernel
+ * start pointer tracks the kernel's buffer object.
+ */
+static void
+gen8_mfc_batchbuffer_idrt_setup(VADriverContextP ctx,
+                                struct encode_state *encode_state,
+                                struct intel_encoder_context *encoder_context)
+{
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    dri_bo *idrt_bo = mfc_context->gpe_context.idrt.bo;
+    struct gen6_interface_descriptor_data *table;
+    int k;
+
+    dri_bo_map(idrt_bo, 1);
+    assert(idrt_bo->virtual);
+    table = idrt_bo->virtual;
+
+    for (k = 0; k < mfc_context->gpe_context.num_kernels; k++) {
+        struct i965_kernel *kern = &mfc_context->gpe_context.kernels[k];
+        struct gen6_interface_descriptor_data *entry = &table[k];
+
+        assert(sizeof(*entry) == 32);
+
+        /* Start from an all-zero descriptor, then fill in the used fields. */
+        memset(entry, 0, sizeof(*entry));
+        entry->desc0.kernel_start_pointer = (kern->bo->offset >> 6);
+        entry->desc2.sampler_count = 0;
+        entry->desc2.sampler_state_pointer = 0;
+        entry->desc3.binding_table_entry_count = 2;
+        entry->desc3.binding_table_pointer = (BINDING_TABLE_OFFSET(0) >> 5);
+        entry->desc4.constant_urb_entry_read_offset = 0;
+        entry->desc4.constant_urb_entry_read_length = 4;
+
+        /* Relocate desc0 of this entry against the kernel buffer object. */
+        dri_bo_emit_reloc(idrt_bo,
+                          I915_GEM_DOMAIN_INSTRUCTION, 0,
+                          0,
+                          k * sizeof(*entry) + offsetof(struct gen6_interface_descriptor_data, desc0),
+                          kern->bo);
+    }
+
+    dri_bo_unmap(idrt_bo);
+}
+
+/*
+ * Placeholder for the constant buffer setup of the batch buffer kernel.
+ * Nothing is programmed here at present.
+ */
+static void
+gen8_mfc_batchbuffer_constant_setup(VADriverContextP ctx,
+                                    struct encode_state *encode_state,
+                                    struct intel_encoder_context *encoder_context)
+{
+    /* Intentionally empty: the context is looked up but not used. */
+    (void)encoder_context->mfc_context;
+}
+
+/*
+ * Emit one 12-dword CMD_MEDIA_OBJECT for the kernel selected by index.
+ * The first six dwords are the command header (index in dword 1, the rest
+ * zero); the last six dwords are inline data built from the remaining
+ * arguments.
+ */
+static void
+gen8_mfc_batchbuffer_emit_object_command(struct intel_batchbuffer *batch,
+                                         int index,
+                                         int head_offset,
+                                         int batchbuffer_offset,
+                                         int head_size,
+                                         int tail_size,
+                                         int number_mb_cmds,
+                                         int first_object,
+                                         int last_object,
+                                         int last_slice,
+                                         int mb_x,
+                                         int mb_y,
+                                         int width_in_mbs,
+                                         int qp)
+{
+    BEGIN_BATCH(batch, 12);
+
+    /* command header */
+    OUT_BATCH(batch, CMD_MEDIA_OBJECT | (12 - 2));
+    OUT_BATCH(batch, index);
+    OUT_BATCH(batch, 0);
+    OUT_BATCH(batch, 0);
+    OUT_BATCH(batch, 0);
+    OUT_BATCH(batch, 0);
+
+    /* inline data: offsets */
+    OUT_BATCH(batch, head_offset);
+    OUT_BATCH(batch, batchbuffer_offset);
+
+    /* inline data: head size in the high half, tail size in the low half */
+    OUT_BATCH(batch, (head_size << 16) | tail_size);
+
+    /* inline data: MB command count plus first/last object and last slice flags */
+    OUT_BATCH(batch,
+              (number_mb_cmds << 16) |
+              (first_object << 2) |
+              (last_object << 1) |
+              last_slice);
+
+    /* inline data: starting macroblock position */
+    OUT_BATCH(batch, (mb_y << 8) | mb_x);
+
+    /* inline data: QP and picture width in macroblocks */
+    OUT_BATCH(batch, (qp << 16) | width_in_mbs);
+
+    ADVANCE_BATCH(batch);
+}
+
+/*
+ * Split one slice into media object commands of at most 128 macroblocks each
+ * and emit them on encoder_context->base.batch.
+ *
+ * The first command carries first_object = 1 and the final one
+ * last_object = 1.  After each full-sized command batchbuffer_offset is
+ * advanced by the macroblock commands it covers (number_mb_cmds *
+ * CMD_LEN_IN_OWORD); head_size is added to both offsets after the first
+ * command and tail_size after the last one.  A final, shorter command is
+ * emitted for the remaining macroblocks when the slice size is not a
+ * multiple of 128.
+ *
+ * The intra or inter kernel is chosen from the normalised slice type, so
+ * the equivalent slice_type encodings handled by
+ * intel_avc_enc_slice_type_fixup() select the same kernel.
+ */
+static void
+gen8_mfc_avc_batchbuffer_slice_command(VADriverContextP ctx,
+                                       struct intel_encoder_context *encoder_context,
+                                       VAEncSliceParameterBufferH264 *slice_param,
+                                       int head_offset,
+                                       unsigned short head_size,
+                                       unsigned short tail_size,
+                                       int batchbuffer_offset,
+                                       int qp,
+                                       int last_slice)
+{
+    struct intel_batchbuffer *batch = encoder_context->base.batch;
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
+    int total_mbs = slice_param->num_macroblocks;
+    int number_mb_cmds = 128;
+    int starting_mb = 0;
+    int last_object = 0;
+    int first_object = 1;
+    int i;
+    int mb_x, mb_y;
+    int slice_type = intel_avc_enc_slice_type_fixup(slice_param->slice_type);
+    int index = (slice_type == SLICE_TYPE_I) ? MFC_BATCHBUFFER_AVC_INTRA : MFC_BATCHBUFFER_AVC_INTER;
+
+    /* Full-sized chunks of number_mb_cmds macroblocks. */
+    for (i = 0; i < total_mbs / number_mb_cmds; i++) {
+        last_object = (total_mbs - starting_mb) == number_mb_cmds;
+        mb_x = (slice_param->macroblock_address + starting_mb) % width_in_mbs;
+        mb_y = (slice_param->macroblock_address + starting_mb) / width_in_mbs;
+        assert(mb_x <= 255 && mb_y <= 255);
+
+        starting_mb += number_mb_cmds;
+
+        gen8_mfc_batchbuffer_emit_object_command(batch,
+                                                 index,
+                                                 head_offset,
+                                                 batchbuffer_offset,
+                                                 head_size,
+                                                 tail_size,
+                                                 number_mb_cmds,
+                                                 first_object,
+                                                 last_object,
+                                                 last_slice,
+                                                 mb_x,
+                                                 mb_y,
+                                                 width_in_mbs,
+                                                 qp);
+
+        if (first_object) {
+            head_offset += head_size;
+            batchbuffer_offset += head_size;
+        }
+
+        if (last_object) {
+            head_offset += tail_size;
+            batchbuffer_offset += tail_size;
+        }
+
+        batchbuffer_offset += number_mb_cmds * CMD_LEN_IN_OWORD;
+
+        first_object = 0;
+    }
+
+    /* Remainder: the slice did not end exactly on a chunk boundary. */
+    if (!last_object) {
+        last_object = 1;
+        number_mb_cmds = total_mbs % number_mb_cmds;
+        mb_x = (slice_param->macroblock_address + starting_mb) % width_in_mbs;
+        mb_y = (slice_param->macroblock_address + starting_mb) / width_in_mbs;
+        assert(mb_x <= 255 && mb_y <= 255);
+        starting_mb += number_mb_cmds;
+
+        gen8_mfc_batchbuffer_emit_object_command(batch,
+                                                 index,
+                                                 head_offset,
+                                                 batchbuffer_offset,
+                                                 head_size,
+                                                 tail_size,
+                                                 number_mb_cmds,
+                                                 first_object,
+                                                 last_object,
+                                                 last_slice,
+                                                 mb_x,
+                                                 mb_y,
+                                                 width_in_mbs,
+                                                 qp);
+    }
+}
+                          
+/*
+ * Program slice number slice_index for the hardware batch buffer path.
+ *
+ * The slice state, the packed headers (first slice only) and the slice
+ * header are written to the auxiliary batch buffer as the "head", followed
+ * by the tail data as the "tail"; each part is padded to a 16-byte boundary.
+ * The media object commands that make the kernel generate the per-macroblock
+ * commands are then emitted.
+ *
+ * Returns the size used by this slice in Owords (16 bytes): head + tail +
+ * num_macroblocks * CMD_LEN_IN_OWORD.
+ */
+static int
+gen8_mfc_avc_batchbuffer_slice(VADriverContextP ctx,
+                               struct encode_state *encode_state,
+                               struct intel_encoder_context *encoder_context,
+                               int slice_index,
+                               int batchbuffer_offset)
+{
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    struct intel_batchbuffer *aux = mfc_context->aux_batchbuffer;
+    VAEncSequenceParameterBufferH264 *seq = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
+    VAEncPictureParameterBufferH264 *pic = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
+    VAEncSliceParameterBufferH264 *slc = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[slice_index]->buffer;
+    int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
+    int height_in_mbs = (mfc_context->surface_state.height + 15) / 16;
+    int last_slice = (slc->macroblock_address + slc->num_macroblocks) == (width_in_mbs * height_in_mbs);
+    int qp = pic->pic_init_qp + slc->slice_qp_delta;
+    unsigned int rate_control_mode = encoder_context->rate_control_mode;
+    unsigned char *header_bits = NULL;
+    int header_len_in_bits = 0;
+    unsigned int tail_data[] = { 0x0, 0x0 };
+    /* Two tail dwords after the last slice of the picture, one otherwise. */
+    int tail_dwords = last_slice ? 2 : 1;
+    long head_offset;
+    int mark = intel_batchbuffer_used_size(aux), used;
+    unsigned short head_size, tail_size;
+    int slice_type = intel_avc_enc_slice_type_fixup(slc->slice_type);
+
+    /* In CBR mode the rate controller dictates the QP. */
+    if (rate_control_mode == VA_RC_CBR) {
+        qp = mfc_context->bit_rate_control_context[slice_type].QpPrimeY;
+        slc->slice_qp_delta = qp - pic->pic_init_qp;
+    }
+
+    /* only support for 8-bit pixel bit-depth */
+    assert(seq->bit_depth_luma_minus8 == 0);
+    assert(seq->bit_depth_chroma_minus8 == 0);
+    assert(pic->pic_init_qp >= 0 && pic->pic_init_qp < 52);
+    assert(qp >= 0 && qp < 52);
+
+    /* head: slice state, optional packed headers, slice header */
+    head_offset = mark / 16;
+    gen8_mfc_avc_slice_state(ctx, pic, slc, encode_state, encoder_context,
+                             (rate_control_mode == VA_RC_CBR), qp, aux);
+
+    if (slice_index == 0)
+        intel_mfc_avc_pipeline_header_programing(ctx, encode_state, encoder_context, aux);
+
+    header_len_in_bits = build_avc_slice_header(seq, pic, slc, &header_bits);
+
+    mfc_context->insert_object(ctx, encoder_context,
+                               (unsigned int *)header_bits,
+                               ALIGN(header_len_in_bits, 32) >> 5,
+                               header_len_in_bits & 0x1f,
+                               5,  /* first 5 bytes are start code + nal unit type */
+                               1, 0, 1,
+                               aux);
+    free(header_bits);
+
+    intel_batchbuffer_align(aux, 16); /* aligned by an Oword */
+    used = intel_batchbuffer_used_size(aux);
+    head_size = (used - mark) / 16;
+    mark = used;
+
+    /* tail */
+    mfc_context->insert_object(ctx, encoder_context,
+                               tail_data,
+                               tail_dwords,
+                               8,
+                               tail_dwords,
+                               1, 1, 0,
+                               aux);
+
+    intel_batchbuffer_align(aux, 16); /* aligned by an Oword */
+    used = intel_batchbuffer_used_size(aux);
+    tail_size = (used - mark) / 16;
+
+    gen8_mfc_avc_batchbuffer_slice_command(ctx, encoder_context, slc,
+                                           head_offset, head_size, tail_size,
+                                           batchbuffer_offset, qp, last_slice);
+
+    return head_size + tail_size + slc->num_macroblocks * CMD_LEN_IN_OWORD;
+}
+
+/*
+ * Run the batchbuffer-building pipeline for every slice of the frame.
+ *
+ * Inside one atomic section on encoder_context->base.batch the GPE pipeline
+ * is set up and gen8_mfc_avc_batchbuffer_slice() is called once per slice
+ * parameter buffer.  Each call receives the sum of the sizes returned by the
+ * preceding calls as its batchbuffer offset.  The batch is flushed at the end.
+ */
+static void
+gen8_mfc_avc_batchbuffer_pipeline(VADriverContextP ctx,
+                                  struct encode_state *encode_state,
+                                  struct intel_encoder_context *encoder_context)
+{
+    struct intel_batchbuffer *batch = encoder_context->base.batch;
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    int slice_index = 0;
+    int running_offset = 0;     /* sum of the sizes returned so far */
+
+    intel_batchbuffer_start_atomic(batch, 0x4000);
+    gen6_gpe_pipeline_setup(ctx, &mfc_context->gpe_context, batch);
+
+    while (slice_index < encode_state->num_slice_params_ext) {
+        int slice_size = gen8_mfc_avc_batchbuffer_slice(ctx, encode_state, encoder_context,
+                                                        slice_index, running_offset);
+
+        running_offset += slice_size;
+        slice_index++;
+    }
+
+    intel_batchbuffer_end_atomic(batch);
+    intel_batchbuffer_flush(batch);
+}
+
+/*
+ * Build the AVC slice batch buffer.
+ *
+ * Runs, in this order, the surface setup, the IDRT setup and the constant
+ * setup, and finally the batchbuffer pipeline that processes every slice.
+ */
+static void
+gen8_mfc_build_avc_batchbuffer(VADriverContextP ctx, 
+                               struct encode_state *encode_state,
+                               struct intel_encoder_context *encoder_context)
+{
+    gen8_mfc_batchbuffer_surfaces_setup(ctx, encode_state, encoder_context);
+    gen8_mfc_batchbuffer_idrt_setup(ctx, encode_state, encoder_context);
+    gen8_mfc_batchbuffer_constant_setup(ctx, encode_state, encoder_context);
+    gen8_mfc_avc_batchbuffer_pipeline(ctx, encode_state, encoder_context);
+}
+
+/*
+ * Build the AVC slice batch buffer with gen8_mfc_build_avc_batchbuffer() and
+ * return mfc_batchbuffer_surface.bo after taking one extra reference on it.
+ * The reference is handed to the caller.
+ */
+static dri_bo *
+gen8_mfc_avc_hardware_batchbuffer(VADriverContextP ctx,
+                                  struct encode_state *encode_state,
+                                  struct intel_encoder_context *encoder_context)
+{
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    dri_bo *slice_bo;
+
+    gen8_mfc_build_avc_batchbuffer(ctx, encode_state, encoder_context);
+
+    /* read the bo only after the build step, as the build may have set it up */
+    slice_bo = mfc_context->mfc_batchbuffer_surface.bo;
+    dri_bo_reference(slice_bo);
+
+    return slice_bo;
+}
+
+#endif
+
+/*
+ * Program the BCS batch for one AVC picture.
+ *
+ * Bails out (after printing a message and asserting) when
+ * intel_mfc_interlace_check() reports interlaced content.  Otherwise a slice
+ * batch buffer is obtained from either the software or the hardware builder,
+ * selected at compile time by MFC_SOFTWARE_HASWELL, the picture-level state
+ * is emitted, and an MI_BATCH_BUFFER_START pointing at the slice batch buffer
+ * is appended.  The reference returned by the builder is dropped before
+ * returning.
+ */
+static void
+gen8_mfc_avc_pipeline_programing(VADriverContextP ctx,
+                                 struct encode_state *encode_state,
+                                 struct intel_encoder_context *encoder_context)
+{
+    struct intel_batchbuffer *batch = encoder_context->base.batch;
+    dri_bo *slice_batch_bo;
+
+    if ( intel_mfc_interlace_check(ctx, encode_state, encoder_context) ) {
+        fprintf(stderr, "Current VA driver don't support interlace mode!\n");
+        assert(0);
+        return; 
+    }
+
+    /* both builders return a bo that this function must unreference */
+#ifdef MFC_SOFTWARE_HASWELL
+    slice_batch_bo = gen8_mfc_avc_software_batchbuffer(ctx, encode_state, encoder_context);
+#else
+    slice_batch_bo = gen8_mfc_avc_hardware_batchbuffer(ctx, encode_state, encoder_context);
+#endif
+
+    // begin programing
+    intel_batchbuffer_start_atomic_bcs(batch, 0x4000); 
+    intel_batchbuffer_emit_mi_flush(batch);
+    
+    // picture level programing
+    gen8_mfc_avc_pipeline_picture_programing(ctx, encode_state, encoder_context);
+
+    /* jump into the slice batch buffer; the address is filled in by relocation */
+    BEGIN_BCS_BATCH(batch, 2);
+    OUT_BCS_BATCH(batch, MI_BATCH_BUFFER_START | (1 << 8));
+    OUT_BCS_RELOC(batch,
+                  slice_batch_bo,
+                  I915_GEM_DOMAIN_COMMAND, 0, 
+                  0);
+    ADVANCE_BCS_BATCH(batch);
+
+    // end programing
+    intel_batchbuffer_end_atomic(batch);
+
+    dri_bo_unreference(slice_batch_bo);
+}
+
+
+/*
+ * Encode one AVC picture.
+ *
+ * Each pass initialises the MFC state, prepares the surfaces, programs the
+ * BCS pipeline and runs it.  Without CBR rate control a single pass is done.
+ * With CBR, the produced frame size is fed to intel_mfc_brc_postpack():
+ *   - BRC_NO_HRD_VIOLATION: the HRD context is updated and encoding ends;
+ *   - BRC_OVERFLOW_WITH_MIN_QP / BRC_UNDERFLOW_WITH_MAX_QP: the violation is
+ *     reported on stderr once (tracked by hrd.violation_noted) and encoding
+ *     ends;
+ *   - any other status: the whole pass is repeated.
+ *
+ * Always returns VA_STATUS_SUCCESS.
+ */
+static VAStatus
+gen8_mfc_avc_encode_picture(VADriverContextP ctx, 
+                            struct encode_state *encode_state,
+                            struct intel_encoder_context *encoder_context)
+{
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    unsigned int rate_control_mode = encoder_context->rate_control_mode;
+    int frame_bits;
+    int brc_status;
+
+    for (;;) {
+        gen8_mfc_init(ctx, encode_state, encoder_context);
+        intel_mfc_avc_prepare(ctx, encode_state, encoder_context);
+        /* fill the BCS pipeline, then kick it off */
+        gen8_mfc_avc_pipeline_programing(ctx, encode_state, encoder_context);
+        gen8_mfc_run(ctx, encode_state, encoder_context);
+
+        /* only CBR inspects the result and may ask for another pass */
+        if (rate_control_mode != VA_RC_CBR)
+            break;
+
+        gen8_mfc_stop(ctx, encode_state, encoder_context, &frame_bits);
+        brc_status = intel_mfc_brc_postpack(encode_state, mfc_context, frame_bits);
+
+        if (brc_status == BRC_NO_HRD_VIOLATION) {
+            intel_mfc_hrd_context_update(encode_state, mfc_context);
+            break;
+        }
+
+        if (brc_status == BRC_OVERFLOW_WITH_MIN_QP || brc_status == BRC_UNDERFLOW_WITH_MAX_QP) {
+            if (!mfc_context->hrd.violation_noted) {
+                fprintf(stderr, "Unrepairable %s!\n", (brc_status == BRC_OVERFLOW_WITH_MIN_QP)? "overflow": "underflow");
+                mfc_context->hrd.violation_noted = 1;
+            }
+            break;
+        }
+
+        /* any other status: run another pass */
+    }
+
+    return VA_STATUS_SUCCESS;
+}
+
+/*
+ * MPEG-2
+ */
+
+/*
+ * Value written into the picture type field of MFX_MPEG2_PIC_STATE,
+ * indexed by the picture_type of the VA MPEG-2 picture parameter buffer
+ * (see gen8_mfc_mpeg2_pic_state()).  Only three entries exist, so the index
+ * must be 0, 1 or 2.
+ */
+static const int
+va_to_gen8_mpeg2_picture_type[3] = {
+    1,  /* I */
+    2,  /* P */
+    3   /* B */
+};
+
+/*
+ * Emit the 13-DWORD MFX_MPEG2_PIC_STATE command on the encoder's batch.
+ *
+ * The f_code values and the picture coding extension bits come from the
+ * MPEG-2 picture parameter buffer in encode_state; the frame size in
+ * macroblocks is derived from mfc_context->surface_state.
+ *
+ * Fields placed in the top bits of a DWORD are built with unsigned
+ * arithmetic: shifting a signed int so that a bit reaches position 31
+ * (e.g. f_code 15 << 28, or 1 << 31) is undefined behaviour in C.
+ */
+static void
+gen8_mfc_mpeg2_pic_state(VADriverContextP ctx,
+                          struct intel_encoder_context *encoder_context,
+                          struct encode_state *encode_state)
+{
+    struct intel_batchbuffer *batch = encoder_context->base.batch;
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    VAEncPictureParameterBufferMPEG2 *pic_param;
+    int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
+    int height_in_mbs = (mfc_context->surface_state.height + 15) / 16;
+
+    assert(encode_state->pic_param_ext && encode_state->pic_param_ext->buffer);
+    pic_param = (VAEncPictureParameterBufferMPEG2 *)encode_state->pic_param_ext->buffer;
+
+    /* picture_type indexes va_to_gen8_mpeg2_picture_type[]: keep it in range */
+    assert((unsigned int)pic_param->picture_type <
+           sizeof(va_to_gen8_mpeg2_picture_type) / sizeof(va_to_gen8_mpeg2_picture_type[0]));
+
+    BEGIN_BCS_BATCH(batch, 13);
+    OUT_BCS_BATCH(batch, MFX_MPEG2_PIC_STATE | (13 - 2));
+    OUT_BCS_BATCH(batch,
+                  ((unsigned int)pic_param->f_code[1][1] & 0xf) << 28 | /* f_code[1][1] */
+                  ((unsigned int)pic_param->f_code[1][0] & 0xf) << 24 | /* f_code[1][0] */
+                  ((unsigned int)pic_param->f_code[0][1] & 0xf) << 20 | /* f_code[0][1] */
+                  ((unsigned int)pic_param->f_code[0][0] & 0xf) << 16 | /* f_code[0][0] */
+                  pic_param->picture_coding_extension.bits.intra_dc_precision << 14 |
+                  pic_param->picture_coding_extension.bits.picture_structure << 12 |
+                  pic_param->picture_coding_extension.bits.top_field_first << 11 |
+                  pic_param->picture_coding_extension.bits.frame_pred_frame_dct << 10 |
+                  pic_param->picture_coding_extension.bits.concealment_motion_vectors << 9 |
+                  pic_param->picture_coding_extension.bits.q_scale_type << 8 |
+                  pic_param->picture_coding_extension.bits.intra_vlc_format << 7 |
+                  pic_param->picture_coding_extension.bits.alternate_scan << 6);
+    OUT_BCS_BATCH(batch,
+                  0 << 14 |     /* LoadSlicePointerFlag, 0 means only loading bitstream pointer once */
+                  va_to_gen8_mpeg2_picture_type[pic_param->picture_type] << 9 |
+                  0);
+    OUT_BCS_BATCH(batch,
+                  1u << 31 |    /* slice concealment */
+                  (height_in_mbs - 1) << 16 |
+                  (width_in_mbs - 1));
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch,
+                  0xFFF << 16 | /* InterMBMaxSize */
+                  0xFFF << 0 |  /* IntraMBMaxSize */
+                  0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Program the MPEG-2 intra and non-intra quantiser matrices.
+ *
+ * Both matrices are fixed 8x8 byte tables; each is passed to
+ * gen8_mfc_qm_state() as 16 unsigned ints (64 bytes).
+ * NOTE(review): the intra table looks like the MPEG-2 default intra
+ * quantiser matrix - confirm against ISO/IEC 13818-2.
+ */
+static void
+gen8_mfc_mpeg2_qm_state(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
+{
+    unsigned char intra_qm[64] = {
+         8, 16, 19, 22, 26, 27, 29, 34,
+        16, 16, 22, 24, 27, 29, 34, 37,
+        19, 22, 26, 27, 29, 34, 34, 38,
+        22, 22, 26, 27, 29, 34, 37, 40,
+        22, 26, 27, 29, 32, 35, 40, 48,
+        26, 27, 29, 32, 35, 40, 48, 58,
+        26, 27, 29, 34, 38, 46, 56, 69,
+        27, 29, 35, 38, 46, 56, 69, 83
+    };
+
+    /* flat matrix: every coefficient uses the value 16 */
+    unsigned char non_intra_qm[64] = {
+        16, 16, 16, 16, 16, 16, 16, 16,
+        16, 16, 16, 16, 16, 16, 16, 16,
+        16, 16, 16, 16, 16, 16, 16, 16,
+        16, 16, 16, 16, 16, 16, 16, 16,
+        16, 16, 16, 16, 16, 16, 16, 16,
+        16, 16, 16, 16, 16, 16, 16, 16,
+        16, 16, 16, 16, 16, 16, 16, 16,
+        16, 16, 16, 16, 16, 16, 16, 16
+    };
+
+    gen8_mfc_qm_state(ctx, MFX_QM_MPEG_INTRA_QUANTIZER_MATRIX, (unsigned int *)intra_qm, 16, encoder_context);
+    gen8_mfc_qm_state(ctx, MFX_QM_MPEG_NON_INTRA_QUANTIZER_MATRIX, (unsigned int *)non_intra_qm, 16,encoder_context);
+}
+
+/*
+ * Program the MPEG-2 forward quantiser matrices (FQM).
+ *
+ * Every FQM entry is 65536 / q, where q is the matching quantiser matrix
+ * value.  The intra table is the transpose of intra_qm[] from
+ * gen8_mfc_mpeg2_qm_state(): row r of intra_fqm holds column r of intra_qm.
+ * The non-intra table is flat: 0x1000 == 65536 / 16.
+ *
+ * Each table holds 64 unsigned shorts and is passed to gen8_mfc_fqm_state()
+ * as 32 unsigned ints.
+ *
+ * Three intra entries (row 3 column 4, row 4 columns 2 and 3) used 0x13 (19)
+ * although the corresponding intra_qm value is 29; they are 0x1d here so the
+ * forward matrix agrees with the quantiser matrix programmed in QM state.
+ */
+static void
+gen8_mfc_mpeg2_fqm_state(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
+{
+    unsigned short intra_fqm[64] = {
+         65536/0x8, 65536/0x10, 65536/0x13, 65536/0x16, 65536/0x16, 65536/0x1a, 65536/0x1a, 65536/0x1b,
+         65536/0x10, 65536/0x10, 65536/0x16, 65536/0x16, 65536/0x1a, 65536/0x1b, 65536/0x1b, 65536/0x1d,
+         65536/0x13, 65536/0x16, 65536/0x1a, 65536/0x1a, 65536/0x1b, 65536/0x1d, 65536/0x1d, 65536/0x23,
+         65536/0x16, 65536/0x18, 65536/0x1b, 65536/0x1b, 65536/0x1d, 65536/0x20, 65536/0x22, 65536/0x26,
+         65536/0x1a, 65536/0x1b, 65536/0x1d, 65536/0x1d, 65536/0x20, 65536/0x23, 65536/0x26, 65536/0x2e,
+         65536/0x1b, 65536/0x1d, 65536/0x22, 65536/0x22, 65536/0x23, 65536/0x28, 65536/0x2e, 65536/0x38,
+         65536/0x1d, 65536/0x22, 65536/0x22, 65536/0x25, 65536/0x28, 65536/0x30, 65536/0x38, 65536/0x45,
+         65536/0x22, 65536/0x25, 65536/0x26, 65536/0x28, 65536/0x30, 65536/0x3a, 65536/0x45, 65536/0x53,
+    };
+
+    unsigned short non_intra_fqm[64] = {
+        0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
+        0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
+        0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
+        0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
+        0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
+        0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
+        0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
+        0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000, 0x1000,
+    };
+
+    gen8_mfc_fqm_state(ctx, MFX_QM_MPEG_INTRA_QUANTIZER_MATRIX, (unsigned int *)intra_fqm, 32, encoder_context);
+    gen8_mfc_fqm_state(ctx, MFX_QM_MPEG_NON_INTRA_QUANTIZER_MATRIX, (unsigned int *)non_intra_fqm, 32, encoder_context);
+}
+
+/*
+ * Emit the 8-DWORD MFC_MPEG2_SLICEGROUP_STATE command.
+ *
+ * (x, y) is the start position of this slice group and (next_x, next_y) the
+ * start position of the following one; each is packed into one byte of
+ * DWORD 2.  When batch is NULL the encoder's own batch is used.
+ *
+ * is_fisrt_slice_group is accepted but not used: the header-insert bit
+ * (bit 17) is set for every slice group.
+ */
+static void
+gen8_mfc_mpeg2_slicegroup_state(VADriverContextP ctx,
+                                 struct intel_encoder_context *encoder_context,
+                                 int x, int y,
+                                 int next_x, int next_y,
+                                 int is_fisrt_slice_group,
+                                 int is_last_slice_group,
+                                 int intra_slice,
+                                 int qp,
+                                 struct intel_batchbuffer *batch)
+{
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    const int last_group = is_last_slice_group ? 1 : 0;
+    const int intra = intra_slice ? 1 : 0;
+    int flags_dw;
+    int position_dw;
+
+    if (!batch)
+        batch = encoder_context->base.batch;
+
+    flags_dw = (0 << 31) |              /* MbRateCtrlFlag */
+               (last_group << 19) |     /* IsLastSliceGrp */
+               (1 << 17) |              /* Insert Header before the first slice group data */
+               (1 << 16) |              /* SliceData PresentFlag: always 1 */
+               (1 << 15) |              /* TailPresentFlag: always 1 */
+               (0 << 14) |              /* FirstSliceHdrDisabled: slice header for each slice */
+               (intra << 13) |          /* IntraSlice */
+               (intra << 12);           /* IntraSliceFlag */
+
+    position_dw = (next_y << 24) |
+                  (next_x << 16) |
+                  (y << 8) |
+                  (x << 0);
+
+    BEGIN_BCS_BATCH(batch, 8);
+
+    OUT_BCS_BATCH(batch, MFC_MPEG2_SLICEGROUP_STATE | (8 - 2));
+    OUT_BCS_BATCH(batch, flags_dw);
+    OUT_BCS_BATCH(batch, position_dw);
+    OUT_BCS_BATCH(batch, qp);   /* FIXME: SliceGroupQp */
+    /* with LoadSlicePointerFlag == 0 the bitstream pointer is only loaded
+     * once, for the first slice of a frame
+     */
+    OUT_BCS_BATCH(batch, mfc_context->mfc_indirect_pak_bse_object.offset);
+    OUT_BCS_BATCH(batch, 0);    /* FIXME: */
+    OUT_BCS_BATCH(batch, 0);    /* FIXME: CorrectPoints */
+    OUT_BCS_BATCH(batch, 0);    /* FIXME: CVxxx */
+
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Emit one 9-DWORD MFC_MPEG2_PAK_OBJECT command for an intra macroblock at
+ * (x, y).  All four motion vector DWORDs are written as zero.
+ *
+ * When batch is NULL the encoder's own batch is used.
+ * Returns the command length in DWORDs.
+ *
+ * Fields that reach bit 31 (max_size_in_word << 24, last_mb_in_slice << 31)
+ * are shifted as unsigned values; doing so on a signed int overflows, which
+ * is undefined behaviour in C.  The other shifts are cast as well so that
+ * each DWORD is assembled in a single type.
+ */
+static int
+gen8_mfc_mpeg2_pak_object_intra(VADriverContextP ctx,
+                                 struct intel_encoder_context *encoder_context,
+                                 int x, int y,
+                                 int first_mb_in_slice,
+                                 int last_mb_in_slice,
+                                 int first_mb_in_slice_group,
+                                 int last_mb_in_slice_group,
+                                 int mb_type,
+                                 int qp_scale_code,
+                                 int coded_block_pattern,
+                                 unsigned char target_size_in_word,
+                                 unsigned char max_size_in_word,
+                                 struct intel_batchbuffer *batch)
+{
+    int len_in_dwords = 9;
+
+    if (batch == NULL)
+        batch = encoder_context->base.batch;
+
+    BEGIN_BCS_BATCH(batch, len_in_dwords);
+
+    OUT_BCS_BATCH(batch, MFC_MPEG2_PAK_OBJECT | (len_in_dwords - 2));
+    OUT_BCS_BATCH(batch,
+                  0 << 24 |     /* PackedMvNum */
+                  0 << 20 |     /* MvFormat */
+                  7 << 17 |     /* CbpDcY/CbpDcU/CbpDcV */
+                  0 << 15 |     /* TransformFlag: frame DCT */
+                  0 << 14 |     /* FieldMbFlag */
+                  1 << 13 |     /* IntraMbFlag */
+                  mb_type << 8 |   /* MbType: Intra */
+                  0 << 2 |      /* SkipMbFlag */
+                  0 << 0 |      /* InterMbMode */
+                  0);
+    OUT_BCS_BATCH(batch, y << 16 | x);
+    OUT_BCS_BATCH(batch,
+                  (unsigned int)max_size_in_word << 24 |
+                  (unsigned int)target_size_in_word << 16 |
+                  (unsigned int)coded_block_pattern << 6 |      /* CBP */
+                  0);
+    OUT_BCS_BATCH(batch,
+                  (unsigned int)last_mb_in_slice << 31 |
+                  (unsigned int)first_mb_in_slice << 30 |
+                  0 << 27 |     /* EnableCoeffClamp */
+                  (unsigned int)last_mb_in_slice_group << 26 |
+                  0 << 25 |     /* MbSkipConvDisable */
+                  (unsigned int)first_mb_in_slice_group << 24 |
+                  0 << 16 |     /* MvFieldSelect */
+                  (unsigned int)qp_scale_code << 0 |
+                  0);
+    OUT_BCS_BATCH(batch, 0);    /* MV[0][0] */
+    OUT_BCS_BATCH(batch, 0);    /* MV[1][0] */
+    OUT_BCS_BATCH(batch, 0);    /* MV[0][1] */
+    OUT_BCS_BATCH(batch, 0);    /* MV[1][1] */
+
+    ADVANCE_BCS_BATCH(batch);
+
+    return len_in_dwords;
+}
+
+#define MPEG2_INTER_MV_OFFSET   12 
+
+/*
+ * Motion vector limits, indexed by f_code.  Entry 0 is a placeholder:
+ * mpeg2_motion_vector() only looks up f_code values 1..9.
+ */
+static struct _mv_ranges
+{
+    int low;    /* in the unit of 1/2 pixel */
+    int high;   /* in the unit of 1/2 pixel */
+} mv_ranges[] = {
+    {     0,    0 },
+    {   -16,   15 },
+    {   -32,   31 },
+    {   -64,   63 },
+    {  -128,  127 },
+    {  -256,  255 },
+    {  -512,  511 },
+    { -1024, 1023 },
+    { -2048, 2047 },
+    { -4096, 4095 }
+};
+
+/*
+ * Sanitise one motion vector component.
+ *
+ * mv is the component in half-pixel units, pos the macroblock index along
+ * the same axis and display_max the picture size in pixels along that axis.
+ * If the displaced 16-pixel macroblock would start before 0 or end past
+ * display_max, the component is replaced by 0.  For f_code in 1..9 the
+ * result is then clamped to mv_ranges[f_code]; any other f_code leaves it
+ * unclamped.
+ */
+static int
+mpeg2_motion_vector(int mv, int pos, int display_max, int f_code)
+{
+    const int mb_begin = pos * 16 * 2;          /* half-pixel units */
+    const int mb_end = (pos + 1) * 16 * 2;
+    int result = mv;
+
+    if (mv + mb_begin < 0 || mv + mb_end > display_max * 2)
+        result = 0;
+
+    if (f_code <= 0 || f_code >= 10)
+        return result;
+
+    if (result < mv_ranges[f_code].low)
+        result = mv_ranges[f_code].low;
+
+    if (result > mv_ranges[f_code].high)
+        result = mv_ranges[f_code].high;
+
+    return result;
+}
+
+/*
+ * Emit one 9-DWORD MFC_MPEG2_PAK_OBJECT command for an inter macroblock at
+ * (x, y).
+ *
+ * msg points at the per-macroblock output record; it is read as an array of
+ * shorts and the four components starting at index MPEG2_INTER_MV_OFFSET are
+ * halved and passed through mpeg2_motion_vector() before being emitted as
+ * MV[0][0] and MV[1][0].
+ *
+ * f_code is indexed [direction][component], with component 0 horizontal and
+ * 1 vertical, so the horizontal components are limited with f_code[s][0]
+ * and the vertical ones with f_code[s][1].
+ *
+ * When batch is NULL the encoder's own batch is used.
+ * Returns the command length in DWORDs.
+ *
+ * Values that reach the top bits of a DWORD are assembled with unsigned
+ * arithmetic: left-shifting a negative vector component, or shifting a
+ * signed int so that a bit reaches position 31, is undefined behaviour in C.
+ */
+static int
+gen8_mfc_mpeg2_pak_object_inter(VADriverContextP ctx,
+                                 struct encode_state *encode_state,
+                                 struct intel_encoder_context *encoder_context,
+                                 unsigned int *msg,
+                                 int width_in_mbs, int height_in_mbs,
+                                 int x, int y,
+                                 int first_mb_in_slice,
+                                 int last_mb_in_slice,
+                                 int first_mb_in_slice_group,
+                                 int last_mb_in_slice_group,
+                                 int qp_scale_code,
+                                 unsigned char target_size_in_word,
+                                 unsigned char max_size_in_word,
+                                 struct intel_batchbuffer *batch)
+{
+    VAEncPictureParameterBufferMPEG2 *pic_param = (VAEncPictureParameterBufferMPEG2 *)encode_state->pic_param_ext->buffer;
+    int len_in_dwords = 9;
+    short *mvptr, mvx0, mvy0, mvx1, mvy1;
+    
+    if (batch == NULL)
+        batch = encoder_context->base.batch;
+
+    mvptr = (short *)msg;
+    mvx0 = mpeg2_motion_vector(mvptr[MPEG2_INTER_MV_OFFSET + 0] / 2, x, width_in_mbs * 16, pic_param->f_code[0][0]);
+    mvy0 = mpeg2_motion_vector(mvptr[MPEG2_INTER_MV_OFFSET + 1] / 2, y, height_in_mbs * 16, pic_param->f_code[0][1]);
+    mvx1 = mpeg2_motion_vector(mvptr[MPEG2_INTER_MV_OFFSET + 2] / 2, x, width_in_mbs * 16, pic_param->f_code[1][0]);
+    mvy1 = mpeg2_motion_vector(mvptr[MPEG2_INTER_MV_OFFSET + 3] / 2, y, height_in_mbs * 16, pic_param->f_code[1][1]);
+
+    BEGIN_BCS_BATCH(batch, len_in_dwords);
+
+    OUT_BCS_BATCH(batch, MFC_MPEG2_PAK_OBJECT | (len_in_dwords - 2));
+    OUT_BCS_BATCH(batch,
+                  2 << 24 |     /* PackedMvNum */
+                  7 << 20 |     /* MvFormat */
+                  7 << 17 |     /* CbpDcY/CbpDcU/CbpDcV */
+                  0 << 15 |     /* TransformFlag: frame DCT */
+                  0 << 14 |     /* FieldMbFlag */
+                  0 << 13 |     /* IntraMbFlag */
+                  1 << 8 |      /* MbType: Frame-based */
+                  0 << 2 |      /* SkipMbFlag */
+                  0 << 0 |      /* InterMbMode */
+                  0);
+    OUT_BCS_BATCH(batch, y << 16 | x);
+    OUT_BCS_BATCH(batch,
+                  (unsigned int)max_size_in_word << 24 |
+                  (unsigned int)target_size_in_word << 16 |
+                  0x3f << 6 |   /* CBP */
+                  0);
+    OUT_BCS_BATCH(batch,
+                  (unsigned int)last_mb_in_slice << 31 |
+                  (unsigned int)first_mb_in_slice << 30 |
+                  0 << 27 |     /* EnableCoeffClamp */
+                  (unsigned int)last_mb_in_slice_group << 26 |
+                  0 << 25 |     /* MbSkipConvDisable */
+                  (unsigned int)first_mb_in_slice_group << 24 |
+                  0 << 16 |     /* MvFieldSelect */
+                  (unsigned int)qp_scale_code << 0 |
+                  0);
+
+    /* x component in the low half, y component in the high half */
+    OUT_BCS_BATCH(batch, (unsigned int)(mvx0 & 0xFFFF) | (unsigned int)(mvy0 & 0xFFFF) << 16);    /* MV[0][0] */
+    OUT_BCS_BATCH(batch, (unsigned int)(mvx1 & 0xFFFF) | (unsigned int)(mvy1 & 0xFFFF) << 16);    /* MV[1][0] */
+    OUT_BCS_BATCH(batch, 0);    /* MV[0][1] */
+    OUT_BCS_BATCH(batch, 0);    /* MV[1][1] */
+
+    ADVANCE_BCS_BATCH(batch);
+
+    return len_in_dwords;
+}
+
+/*
+ * Insert the packed header stored at slot idx of encode_state, if the
+ * application supplied one.  The bit length is taken from the matching
+ * packed header parameter buffer, which must be present.
+ */
+static void
+gen8_mfc_mpeg2_insert_packed_header(VADriverContextP ctx,
+                                    struct encode_state *encode_state,
+                                    struct intel_encoder_context *encoder_context,
+                                    int idx,
+                                    struct intel_batchbuffer *slice_batch)
+{
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    VAEncPackedHeaderParameterBuffer *param;
+    unsigned int *header_data;
+    unsigned int length_in_bits;
+
+    if (!encode_state->packed_header_data[idx])
+        return;
+
+    header_data = (unsigned int *)encode_state->packed_header_data[idx]->buffer;
+
+    assert(encode_state->packed_header_param[idx]);
+    param = (VAEncPackedHeaderParameterBuffer *)encode_state->packed_header_param[idx]->buffer;
+    length_in_bits = param->bit_length;
+
+    mfc_context->insert_object(ctx,
+                               encoder_context,
+                               header_data,
+                               ALIGN(length_in_bits, 32) >> 5,
+                               length_in_bits & 0x1f,
+                               5,   /* FIXME: check it */
+                               0,
+                               0,
+                               0,   /* Needn't insert emulation bytes for MPEG-2 */
+                               slice_batch);
+}
+
+/*
+ * Insert the application-provided packed MPEG-2 headers into slice_batch:
+ * first the one registered as VAEncPackedHeaderMPEG2_SPS, then the one
+ * registered as VAEncPackedHeaderMPEG2_PPS.  A header that was not supplied
+ * is skipped.
+ */
+static void
+intel_mfc_mpeg2_pipeline_header_programing(VADriverContextP ctx,
+                                           struct encode_state *encode_state,
+                                           struct intel_encoder_context *encoder_context,
+                                           struct intel_batchbuffer *slice_batch)
+{
+    gen8_mfc_mpeg2_insert_packed_header(ctx, encode_state, encoder_context,
+                                        va_enc_packed_type_to_idx(VAEncPackedHeaderMPEG2_SPS),
+                                        slice_batch);
+    gen8_mfc_mpeg2_insert_packed_header(ctx, encode_state, encoder_context,
+                                        va_enc_packed_type_to_idx(VAEncPackedHeaderMPEG2_PPS),
+                                        slice_batch);
+}
+
+/*
+ * Emit every command of one MPEG-2 slice group into slice_batch.
+ *
+ * A slice group is one slice parameter buffer (slice_params_ext[slice_index])
+ * holding num_elements slices.  The function emits, in order:
+ *   1. the slice group state, using the start position of this group and of
+ *      the next one (next_slice_group_param; NULL means this is the last
+ *      group of the picture),
+ *   2. for slice_index 0 only, the packed headers,
+ *   3. a single zero byte,
+ *   4. one PAK object per macroblock of every slice in the group,
+ *   5. tail data: the picture end delimiter for the last group, otherwise a
+ *      single zero byte.
+ *
+ * The VME output buffer stays mapped while the PAK objects are generated.
+ */
+static void 
+gen8_mfc_mpeg2_pipeline_slice_group(VADriverContextP ctx,
+                                     struct encode_state *encode_state,
+                                     struct intel_encoder_context *encoder_context,
+                                     int slice_index,
+                                     VAEncSliceParameterBufferMPEG2 *next_slice_group_param,
+                                     struct intel_batchbuffer *slice_batch)
+{
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    VAEncSequenceParameterBufferMPEG2 *seq_param = (VAEncSequenceParameterBufferMPEG2 *)encode_state->seq_param_ext->buffer;
+    VAEncSliceParameterBufferMPEG2 *slice_param = NULL;
+    unsigned char tail_delimiter[] = {MPEG2_DELIMITER0, MPEG2_DELIMITER1, MPEG2_DELIMITER2, MPEG2_DELIMITER3, MPEG2_DELIMITER4, 0, 0, 0};
+    unsigned char section_delimiter[] = {0x0, 0x0, 0x0, 0x0};
+    int width_in_mbs = ALIGN(seq_param->picture_width, 16) / 16;
+    int height_in_mbs = ALIGN(seq_param->picture_height, 16) / 16;
+    int i, j;
+    int h_start_pos, v_start_pos, h_next_start_pos, v_next_start_pos;
+    unsigned int *msg = NULL;
+    unsigned char *msg_ptr = NULL;
+
+    slice_param = (VAEncSliceParameterBufferMPEG2 *)encode_state->slice_params_ext[slice_index]->buffer;
+    h_start_pos = slice_param->macroblock_address % width_in_mbs;
+    v_start_pos = slice_param->macroblock_address / width_in_mbs;
+    /* the first slice of the group must not run past the end of its MB row */
+    assert(h_start_pos + slice_param->num_macroblocks <= width_in_mbs);
+
+    dri_bo_map(vme_context->vme_output.bo , 0);
+    msg_ptr = (unsigned char *)vme_context->vme_output.bo->virtual;
+
+    /* without a following group, the "next" position is the row just below the picture */
+    if (next_slice_group_param) {
+        h_next_start_pos = next_slice_group_param->macroblock_address % width_in_mbs;
+        v_next_start_pos = next_slice_group_param->macroblock_address / width_in_mbs;
+    } else {
+        h_next_start_pos = 0;
+        v_next_start_pos = height_in_mbs;
+    }
+
+    gen8_mfc_mpeg2_slicegroup_state(ctx,
+                                     encoder_context,
+                                     h_start_pos,
+                                     v_start_pos,
+                                     h_next_start_pos,
+                                     v_next_start_pos,
+                                     slice_index == 0,
+                                     next_slice_group_param == NULL,
+                                     slice_param->is_intra_slice,
+                                     slice_param->quantiser_scale_code,
+                                     slice_batch);
+
+    if (slice_index == 0) 
+        intel_mfc_mpeg2_pipeline_header_programing(ctx, encode_state, encoder_context, slice_batch);
+
+    /* Insert '00' to make sure the header is valid */
+    mfc_context->insert_object(ctx,
+                               encoder_context,
+                               (unsigned int*)section_delimiter,
+                               1,
+                               8,   /* 8bits in the last DWORD */
+                               1,   /* 1 byte */
+                               1,
+                               0,
+                               0,
+                               slice_batch);
+
+    /* slice_param walks the elements of this buffer; see slice_param++ below */
+    for (i = 0; i < encode_state->slice_params_ext[slice_index]->num_elements; i++) {
+        /* PAK for each macroblocks */
+        for (j = 0; j < slice_param->num_macroblocks; j++) {
+            int h_pos = (slice_param->macroblock_address + j) % width_in_mbs;
+            int v_pos = (slice_param->macroblock_address + j) / width_in_mbs;
+            int first_mb_in_slice = (j == 0);
+            int last_mb_in_slice = (j == slice_param->num_macroblocks - 1);
+            int first_mb_in_slice_group = (i == 0 && j == 0);
+            int last_mb_in_slice_group = (i == encode_state->slice_params_ext[slice_index]->num_elements - 1 &&
+                                          j == slice_param->num_macroblocks - 1);
+
+            /* VME output record of this macroblock; only the inter path reads it */
+            msg = (unsigned int *)(msg_ptr + (slice_param->macroblock_address + j) * vme_context->vme_output.size_block);
+
+            if (slice_param->is_intra_slice) {
+                gen8_mfc_mpeg2_pak_object_intra(ctx,
+                                                 encoder_context,
+                                                 h_pos, v_pos,
+                                                 first_mb_in_slice,
+                                                 last_mb_in_slice,
+                                                 first_mb_in_slice_group,
+                                                 last_mb_in_slice_group,
+                                                 0x1a,
+                                                 slice_param->quantiser_scale_code,
+                                                 0x3f,
+                                                 0,
+                                                 0xff,
+                                                 slice_batch);
+            } else {
+                gen8_mfc_mpeg2_pak_object_inter(ctx,
+                                                 encode_state,
+                                                 encoder_context,
+                                                 msg,
+                                                 width_in_mbs, height_in_mbs,
+                                                 h_pos, v_pos,
+                                                 first_mb_in_slice,
+                                                 last_mb_in_slice,
+                                                 first_mb_in_slice_group,
+                                                 last_mb_in_slice_group,
+                                                 slice_param->quantiser_scale_code,
+                                                 0,
+                                                 0xff,
+                                                 slice_batch);
+            }
+        }
+
+        slice_param++;
+    }
+
+    dri_bo_unmap(vme_context->vme_output.bo);
+
+    /* tail data */
+    if (next_slice_group_param == NULL) { /* end of a picture */
+        mfc_context->insert_object(ctx,
+                                   encoder_context,
+                                   (unsigned int *)tail_delimiter,
+                                   2,
+                                   8,   /* 8bits in the last DWORD */
+                                   5,   /* 5 bytes */
+                                   1,
+                                   1,
+                                   0,
+                                   slice_batch);
+    } else {        /* end of a slice group */
+        mfc_context->insert_object(ctx,
+                                   encoder_context,
+                                   (unsigned int *)section_delimiter,
+                                   1,
+                                   8,   /* 8bits in the last DWORD */
+                                   1,   /* 1 byte */
+                                   1,
+                                   1,
+                                   0,
+                                   slice_batch);
+    }
+}
+
+/*
+ * Build one batch buffer holding the commands of every MPEG-2 slice group
+ * (slice group state, inserted objects and PAK objects), terminated by
+ * MI_BATCH_BUFFER_END.
+ *
+ * The buffer is sized at 64 bytes per macroblock of the picture.  The
+ * temporary intel_batchbuffer wrapper is freed before returning; the
+ * returned bo carries one reference that the caller has to release.
+ */
+static dri_bo *
+gen8_mfc_mpeg2_software_slice_batchbuffer(VADriverContextP ctx,
+                                           struct encode_state *encode_state,
+                                           struct intel_encoder_context *encoder_context)
+{
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    VAEncSequenceParameterBufferMPEG2 *seq_param = (VAEncSequenceParameterBufferMPEG2 *)encode_state->seq_param_ext->buffer;
+    const int mbs_per_row = ALIGN(seq_param->picture_width, 16) / 16;
+    const int mb_rows = ALIGN(seq_param->picture_height, 16) / 16;
+    const int last_group = encode_state->num_slice_params_ext - 1;
+    struct intel_batchbuffer *batch;
+    dri_bo *result_bo;
+    int group;
+
+    batch = intel_batchbuffer_new(&i965->intel, I915_EXEC_BSD, mbs_per_row * mb_rows * 64);
+    result_bo = batch->buffer;
+
+    for (group = 0; group < encode_state->num_slice_params_ext; group++) {
+        /* the last group has no successor */
+        VAEncSliceParameterBufferMPEG2 *next_group_param =
+            (group == last_group) ?
+            NULL :
+            (VAEncSliceParameterBufferMPEG2 *)encode_state->slice_params_ext[group + 1]->buffer;
+
+        gen8_mfc_mpeg2_pipeline_slice_group(ctx, encode_state, encoder_context, group, next_group_param, batch);
+    }
+
+    intel_batchbuffer_align(batch, 8);
+
+    BEGIN_BCS_BATCH(batch, 2);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, MI_BATCH_BUFFER_END);
+    ADVANCE_BCS_BATCH(batch);
+
+    /* keep the bo alive past the release of its wrapper */
+    dri_bo_reference(result_bo);
+    intel_batchbuffer_free(batch);
+
+    return result_bo;
+}
+
+/*
+ * Issue the picture-level state for an MPEG-2 encode: first the three
+ * states dispatched through the mfc_context hooks, then the gen8 buffer
+ * address states and the MPEG-2 picture/quantiser states.
+ */
+static void
+gen8_mfc_mpeg2_pipeline_picture_programing(VADriverContextP ctx,
+                                            struct encode_state *encode_state,
+                                            struct intel_encoder_context *encoder_context)
+{
+    struct gen6_mfc_context *mfc = encoder_context->mfc_context;
+
+    /* states reached through the per-context function pointers */
+    mfc->pipe_mode_select(ctx, MFX_FORMAT_MPEG2, encoder_context);
+    mfc->set_surface_state(ctx, encoder_context);
+    mfc->ind_obj_base_addr_state(ctx, encoder_context);
+
+    /* buffer address states */
+    gen8_mfc_pipe_buf_addr_state(ctx, encoder_context);
+    gen8_mfc_bsp_buf_base_addr_state(ctx, encoder_context);
+
+    /* MPEG-2 specific picture and quantiser matrix states */
+    gen8_mfc_mpeg2_pic_state(ctx, encoder_context, encode_state);
+    gen8_mfc_mpeg2_qm_state(ctx, encoder_context);
+    gen8_mfc_mpeg2_fqm_state(ctx, encoder_context);
+}
+
+/*
+ * Program the encoder's main BCS batch for one MPEG-2 picture: build the
+ * slice batch buffer, emit the picture-level state, then chain to the slice
+ * batch with MI_BATCH_BUFFER_START.
+ */
+static void
+gen8_mfc_mpeg2_pipeline_programing(VADriverContextP ctx,
+                                    struct encode_state *encode_state,
+                                    struct intel_encoder_context *encoder_context)
+{
+    struct intel_batchbuffer *batch = encoder_context->base.batch;
+    dri_bo *slice_bo;
+
+    /* the slice commands live in their own, already referenced bo */
+    slice_bo = gen8_mfc_mpeg2_software_slice_batchbuffer(ctx, encode_state, encoder_context);
+
+    /* open the atomic section */
+    intel_batchbuffer_start_atomic_bcs(batch, 0x4000);
+    intel_batchbuffer_emit_mi_flush(batch);
+
+    /* picture-level state */
+    gen8_mfc_mpeg2_pipeline_picture_programing(ctx, encode_state, encoder_context);
+
+    /* jump into the slice batch buffer */
+    BEGIN_BCS_BATCH(batch, 2);
+    OUT_BCS_BATCH(batch, MI_BATCH_BUFFER_START | (1 << 8));
+    OUT_BCS_RELOC(batch,
+                  slice_bo,
+                  I915_GEM_DOMAIN_COMMAND, 0,
+                  0);
+    ADVANCE_BCS_BATCH(batch);
+
+    /* close the atomic section */
+    intel_batchbuffer_end_atomic(batch);
+
+    /* drop the reference handed back by the slice batch builder */
+    dri_bo_unreference(slice_bo);
+}
+
+/*
+ * Bind the buffer objects needed to encode one MPEG-2 picture into the MFC
+ * context: reconstructed surface, reference surfaces, input surface and
+ * coded buffer.  Every bo stored in the context gets its own reference.
+ *
+ * Always returns VA_STATUS_SUCCESS.
+ */
+static VAStatus
+intel_mfc_mpeg2_prepare(VADriverContextP ctx,
+                        struct encode_state *encode_state,
+                        struct intel_encoder_context *encoder_context)
+{
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    struct object_surface *surface;
+    struct object_buffer *coded_buf;
+    struct i965_coded_buffer_segment *segment;
+    dri_bo *coded_bo;
+    int slot;
+
+    /* Reconstructed picture: used as the pre-deblocking output. */
+    surface = encode_state->reconstructed_object;
+    i965_check_alloc_surface_bo(ctx, surface, 1, VA_FOURCC('N','V','1','2'), SUBSAMPLE_YUV420);
+    mfc_context->pre_deblocking_output.bo = surface->bo;
+    dri_bo_reference(mfc_context->pre_deblocking_output.bo);
+    mfc_context->surface_state.width = surface->orig_width;
+    mfc_context->surface_state.height = surface->orig_height;
+    mfc_context->surface_state.w_pitch = surface->width;
+    mfc_context->surface_state.h_pitch = surface->height;
+
+    /* Forward reference: slot 0, left empty when no surface/bo is given. */
+    surface = encode_state->reference_objects[0];
+    mfc_context->reference_surfaces[0].bo = (surface && surface->bo) ? surface->bo : NULL;
+
+    if (mfc_context->reference_surfaces[0].bo)
+        dri_bo_reference(mfc_context->reference_surfaces[0].bo);
+
+    /* Backward reference: slot 1, falling back to the forward reference. */
+    surface = encode_state->reference_objects[1];
+
+    if (surface && surface->bo)
+        mfc_context->reference_surfaces[1].bo = surface->bo;
+    else
+        mfc_context->reference_surfaces[1].bo = mfc_context->reference_surfaces[0].bo;
+
+    if (mfc_context->reference_surfaces[1].bo)
+        dri_bo_reference(mfc_context->reference_surfaces[1].bo);
+
+    /* Remaining slots mirror slot 0 (even index) or slot 1 (odd index). */
+    for (slot = 2; slot < ARRAY_ELEMS(mfc_context->reference_surfaces); slot++) {
+        dri_bo *mirrored = mfc_context->reference_surfaces[slot & 1].bo;
+
+        mfc_context->reference_surfaces[slot].bo = mirrored;
+
+        if (mirrored)
+            dri_bo_reference(mirrored);
+    }
+
+    /* Source picture to be encoded. */
+    surface = encode_state->input_yuv_object;
+    mfc_context->uncompressed_picture_source.bo = surface->bo;
+    dri_bo_reference(mfc_context->uncompressed_picture_source.bo);
+
+    /* Coded buffer: the bitstream is written after the driver's header. */
+    coded_buf = encode_state->coded_buf_object;
+    coded_bo = coded_buf->buffer_store->bo;
+    mfc_context->mfc_indirect_pak_bse_object.bo = coded_bo;
+    mfc_context->mfc_indirect_pak_bse_object.offset = I965_CODEDBUFFER_HEADER_SIZE;
+    mfc_context->mfc_indirect_pak_bse_object.end_offset = ALIGN(coded_buf->size_element - 0x1000, 0x1000);
+    dri_bo_reference(mfc_context->mfc_indirect_pak_bse_object.bo);
+
+    /* Clear the "mapped" flag in the header: the coded size is not known yet. */
+    dri_bo_map(coded_bo, 1);
+    segment = (struct i965_coded_buffer_segment *)coded_bo->virtual;
+    segment->mapped = 0;
+    segment->codec = encoder_context->codec;
+    dri_bo_unmap(coded_bo);
+
+    return VA_STATUS_SUCCESS;
+}
+
+/*
+ * Encode one MPEG-2 picture: initialise the MFC context, bind the buffers,
+ * program the BCS pipeline and run it.  Always reports VA_STATUS_SUCCESS.
+ */
+static VAStatus
+gen8_mfc_mpeg2_encode_picture(VADriverContextP ctx,
+                               struct encode_state *encode_state,
+                               struct intel_encoder_context *encoder_context)
+{
+    VAStatus status = VA_STATUS_SUCCESS;
+
+    /* per-picture context setup and buffer binding */
+    gen8_mfc_init(ctx, encode_state, encoder_context);
+    intel_mfc_mpeg2_prepare(ctx, encode_state, encoder_context);
+
+    /* fill the BCS batch, then execute it */
+    gen8_mfc_mpeg2_pipeline_programing(ctx, encode_state, encoder_context);
+    gen8_mfc_run(ctx, encode_state, encoder_context);
+
+    return status;
+}
+
+/*
+ * Tear down an MFC encoder context: drop the reference held on every buffer
+ * object stored in it, destroy the GPE context, free the auxiliary batch
+ * buffer if there is one, and finally free the context itself.
+ *
+ * `context` is the struct gen6_mfc_context installed by
+ * gen8_mfc_context_init().
+ */
+static void
+gen8_mfc_context_destroy(void *context)
+{
+    struct gen6_mfc_context *mfc_context = context;
+    int n;
+
+    /* picture level input/output buffers */
+    dri_bo_unreference(mfc_context->post_deblocking_output.bo);
+    mfc_context->post_deblocking_output.bo = NULL;
+    dri_bo_unreference(mfc_context->pre_deblocking_output.bo);
+    mfc_context->pre_deblocking_output.bo = NULL;
+    dri_bo_unreference(mfc_context->uncompressed_picture_source.bo);
+    mfc_context->uncompressed_picture_source.bo = NULL;
+    dri_bo_unreference(mfc_context->mfc_indirect_pak_bse_object.bo);
+    mfc_context->mfc_indirect_pak_bse_object.bo = NULL;
+
+    /* direct MV buffers */
+    for (n = 0; n < NUM_MFC_DMV_BUFFERS; n++) {
+        dri_bo_unreference(mfc_context->direct_mv_buffers[n].bo);
+        mfc_context->direct_mv_buffers[n].bo = NULL;
+    }
+
+    /* scratch and status buffers */
+    dri_bo_unreference(mfc_context->intra_row_store_scratch_buffer.bo);
+    mfc_context->intra_row_store_scratch_buffer.bo = NULL;
+    dri_bo_unreference(mfc_context->macroblock_status_buffer.bo);
+    mfc_context->macroblock_status_buffer.bo = NULL;
+    dri_bo_unreference(mfc_context->deblocking_filter_row_store_scratch_buffer.bo);
+    mfc_context->deblocking_filter_row_store_scratch_buffer.bo = NULL;
+    dri_bo_unreference(mfc_context->bsd_mpc_row_store_scratch_buffer.bo);
+    mfc_context->bsd_mpc_row_store_scratch_buffer.bo = NULL;
+
+    /* reference pictures */
+    for (n = 0; n < MAX_MFC_REFERENCE_SURFACES; n++) {
+        dri_bo_unreference(mfc_context->reference_surfaces[n].bo);
+        mfc_context->reference_surfaces[n].bo = NULL;
+    }
+
+    i965_gpe_context_destroy(&mfc_context->gpe_context);
+
+    /* batch buffer surfaces */
+    dri_bo_unreference(mfc_context->mfc_batchbuffer_surface.bo);
+    mfc_context->mfc_batchbuffer_surface.bo = NULL;
+    dri_bo_unreference(mfc_context->aux_batchbuffer_surface.bo);
+    mfc_context->aux_batchbuffer_surface.bo = NULL;
+
+    /* the auxiliary batch buffer is optional */
+    if (mfc_context->aux_batchbuffer != NULL) {
+        intel_batchbuffer_free(mfc_context->aux_batchbuffer);
+    }
+    mfc_context->aux_batchbuffer = NULL;
+
+    free(mfc_context);
+}
+
+/*
+ * Encoder entry point: dispatch one picture to the codec-specific encode
+ * routine selected by `profile`.
+ *
+ * Returns the status of the selected routine, or
+ * VA_STATUS_ERROR_UNSUPPORTED_PROFILE for any profile not handled here.
+ */
+static VAStatus gen8_mfc_pipeline(VADriverContextP ctx,
+                  VAProfile profile,
+                  struct encode_state *encode_state,
+                  struct intel_encoder_context *encoder_context)
+{
+    switch (profile) {
+    case VAProfileH264Baseline:
+    case VAProfileH264Main:
+    case VAProfileH264High:
+        return gen8_mfc_avc_encode_picture(ctx, encode_state, encoder_context);
+
+    case VAProfileMPEG2Simple:
+    case VAProfileMPEG2Main:
+        return gen8_mfc_mpeg2_encode_picture(ctx, encode_state, encoder_context);
+
+        /* FIXME: add for other profile */
+    default:
+        return VA_STATUS_ERROR_UNSUPPORTED_PROFILE;
+    }
+}
+
+/*
+ * Allocate and initialise the gen8 MFC (encoder) context and attach it to
+ * `encoder_context`.
+ *
+ * The context is zero-allocated, its GPE parameters are filled in, the MFC
+ * kernels are loaded and the per-generation state hooks are installed.
+ * On success encoder_context->mfc_context, mfc_context_destroy,
+ * mfc_pipeline and mfc_brc_prepare are set.
+ *
+ * Returns True on success, False if the context could not be allocated
+ * (in which case encoder_context is left untouched).
+ */
+Bool gen8_mfc_context_init(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
+{
+    struct gen6_mfc_context *mfc_context = calloc(1, sizeof(*mfc_context));
+
+    /* calloc() may fail; do not dereference a NULL context below */
+    if (!mfc_context)
+        return False;
+
+    mfc_context->gpe_context.surface_state_binding_table.length = (SURFACE_STATE_PADDED_SIZE + sizeof(unsigned int)) * MAX_MEDIA_SURFACES_GEN6;
+
+    mfc_context->gpe_context.idrt.max_entries = MAX_GPE_KERNELS;
+    mfc_context->gpe_context.idrt.entry_size = sizeof(struct gen6_interface_descriptor_data);
+
+    mfc_context->gpe_context.curbe.length = 32 * 4;
+
+    mfc_context->gpe_context.vfe_state.max_num_threads = 60 - 1;
+    mfc_context->gpe_context.vfe_state.num_urb_entries = 16;
+    mfc_context->gpe_context.vfe_state.gpgpu_mode = 0;
+    mfc_context->gpe_context.vfe_state.urb_entry_size = 59 - 1;
+    mfc_context->gpe_context.vfe_state.curbe_allocation_size = 37 - 1;
+
+    i965_gpe_load_kernels(ctx,
+                          &mfc_context->gpe_context,
+                          gen8_mfc_kernels,
+                          NUM_MFC_KERNEL);
+
+    /* state emitters called through the context by the shared encoder code */
+    mfc_context->pipe_mode_select = gen8_mfc_pipe_mode_select;
+    mfc_context->set_surface_state = gen8_mfc_surface_state;
+    mfc_context->ind_obj_base_addr_state = gen8_mfc_ind_obj_base_addr_state;
+    mfc_context->avc_img_state = gen8_mfc_avc_img_state;
+    mfc_context->avc_qm_state = gen8_mfc_avc_qm_state;
+    mfc_context->avc_fqm_state = gen8_mfc_avc_fqm_state;
+    mfc_context->insert_object = gen8_mfc_avc_insert_object;
+    mfc_context->buffer_suface_setup = gen7_gpe_buffer_suface_setup;
+
+    /* publish the context and its entry points */
+    encoder_context->mfc_context = mfc_context;
+    encoder_context->mfc_context_destroy = gen8_mfc_context_destroy;
+    encoder_context->mfc_pipeline = gen8_mfc_pipeline;
+    encoder_context->mfc_brc_prepare = intel_mfc_brc_prepare;
+
+    return True;
+}
diff --git a/src/gen8_mfd.c b/src/gen8_mfd.c
new file mode 100644 (file)
index 0000000..c351e4b
--- /dev/null
@@ -0,0 +1,2834 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Xiang Haihao <haihao.xiang@intel.com>
+ *    Zhao  Yakui  <yakui.zhao@intel.com>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <va/va_dec_jpeg.h>
+
+#include "intel_batchbuffer.h"
+#include "intel_driver.h"
+
+#include "i965_defines.h"
+#include "i965_drv_video.h"
+#include "i965_decoder_utils.h"
+
+#include "gen7_mfd.h"
+#include "intel_media.h"
+
+/* Revision number of the B0 stepping; IS_STEPPING_BPLUS() is true from it on. */
+#define B0_STEP_REV            2
+/* The argument is parenthesized so any pointer expression can be passed. */
+#define IS_STEPPING_BPLUS(i965)        (((i965)->intel.revision) >= B0_STEP_REV)
+
+/*
+ * 8x8 zigzag scan table: entry k is the raster (row-major) index, within an
+ * 8x8 block, of the k-th coefficient in zigzag order.
+ * NOTE(review): presumably used to reorder quantiser matrices supplied in
+ * zigzag order - confirm against the users further down the file.
+ */
+static const uint32_t zigzag_direct[64] = {
+    0,   1,  8, 16,  9,  2,  3, 10,
+    17, 24, 32, 25, 18, 11,  4,  5,
+    12, 19, 26, 33, 40, 48, 41, 34,
+    27, 20, 13,  6,  7, 14, 21, 28,
+    35, 42, 49, 56, 57, 50, 43, 36,
+    29, 22, 15, 23, 30, 37, 44, 51,
+    58, 59, 52, 45, 38, 31, 39, 46,
+    53, 60, 61, 54, 47, 55, 62, 63
+};
+
+/*
+ * Make sure `obj_surface` carries the per-surface AVC private data and the
+ * direct-MV buffer(s) needed to decode into it.
+ *
+ * The GenAvcSurface is allocated on first use and stored in
+ * obj_surface->private_data, with gen_free_avc_surface installed as its
+ * destructor.  The top direct-MV buffer (128 bytes per macroblock of the
+ * frame) is always allocated; the bottom one only when the picture is a
+ * field picture without direct_8x8_inference.
+ *
+ * If the private data cannot be allocated the function returns with
+ * obj_surface->private_data still NULL.
+ */
+static void
+gen8_mfd_init_avc_surface(VADriverContextP ctx, 
+                          VAPictureParameterBufferH264 *pic_param,
+                          struct object_surface *obj_surface)
+{
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    GenAvcSurface *gen7_avc_surface = obj_surface->private_data;
+    int width_in_mbs, height_in_mbs;
+
+    obj_surface->free_private_data = gen_free_avc_surface;
+    width_in_mbs = pic_param->picture_width_in_mbs_minus1 + 1;
+    height_in_mbs = pic_param->picture_height_in_mbs_minus1 + 1; /* frame height */
+
+    if (!gen7_avc_surface) {
+        gen7_avc_surface = calloc(1, sizeof(*gen7_avc_surface));
+        assert(gen7_avc_surface);
+
+        /* with assertions compiled out, bail out instead of writing through NULL */
+        if (!gen7_avc_surface)
+            return;
+
+        assert((obj_surface->size & 0x3f) == 0);
+        obj_surface->private_data = gen7_avc_surface;
+    }
+
+    gen7_avc_surface->dmv_bottom_flag = (pic_param->pic_fields.bits.field_pic_flag &&
+                                         !pic_param->seq_fields.bits.direct_8x8_inference_flag);
+
+    if (gen7_avc_surface->dmv_top == NULL) {
+        gen7_avc_surface->dmv_top = dri_bo_alloc(i965->intel.bufmgr,
+                                                 "direct mv w/r buffer",
+                                                 width_in_mbs * height_in_mbs * 128,
+                                                 0x1000);
+        assert(gen7_avc_surface->dmv_top);
+    }
+
+    if (gen7_avc_surface->dmv_bottom_flag &&
+        gen7_avc_surface->dmv_bottom == NULL) {
+        gen7_avc_surface->dmv_bottom = dri_bo_alloc(i965->intel.bufmgr,
+                                                    "direct mv w/r buffer",
+                                                    width_in_mbs * height_in_mbs * 128,
+                                                    0x1000);
+        assert(gen7_avc_surface->dmv_bottom);
+    }
+}
+
+/*
+ * Emit MFX_PIPE_MODE_SELECT (5 DWORDs) configuring the MFX engine for
+ * long-format VLD decoding of the codec given by `standard_select`.
+ * The pre/post deblocking output enables come from the `valid` flags of the
+ * corresponding buffers in the decoder context.
+ */
+static void
+gen8_mfd_pipe_mode_select(VADriverContextP ctx,
+                          struct decode_state *decode_state,
+                          int standard_select,
+                          struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    unsigned int mode_dw;
+    unsigned int error_dw;
+
+    assert(standard_select == MFX_FORMAT_MPEG2 ||
+           standard_select == MFX_FORMAT_AVC ||
+           standard_select == MFX_FORMAT_VC1 ||
+           standard_select == MFX_FORMAT_JPEG);
+
+    /* DW1: decode mode and output selection */
+    mode_dw = (MFX_LONG_MODE << 17) | /* Currently only support long format */
+              (MFD_MODE_VLD << 15) | /* VLD mode */
+              (0 << 10) | /* disable Stream-Out */
+              (gen7_mfd_context->post_deblocking_output.valid << 9)  | /* Post Deblocking Output */
+              (gen7_mfd_context->pre_deblocking_output.valid << 8)  | /* Pre Deblocking Output */
+              (0 << 5)  | /* not in stitch mode */
+              (MFX_CODEC_DECODE << 4)  | /* decoding mode */
+              (standard_select << 0);
+
+    /* DW2: no early termination on any AVC error class */
+    error_dw = (0 << 4)  | /* terminate if AVC motion and POC table error occurs */
+               (0 << 3)  | /* terminate if AVC mbdata error occurs */
+               (0 << 2)  | /* terminate if AVC CABAC/CAVLC decode error occurs */
+               (0 << 1)  |
+               (0 << 0);
+
+    BEGIN_BCS_BATCH(batch, 5);
+    OUT_BCS_BATCH(batch, MFX_PIPE_MODE_SELECT | (5 - 2));
+    OUT_BCS_BATCH(batch, mode_dw);
+    OUT_BCS_BATCH(batch, error_dw);
+    OUT_BCS_BATCH(batch, 0); /* pic status/error report id */
+    OUT_BCS_BATCH(batch, 0); /* reserved */
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Emit MFX_SURFACE_STATE (6 DWORDs) describing the render target taken from
+ * decode_state->render_object: dimensions, pitch, tiling and the Y offsets
+ * of the chroma planes.  Chroma is programmed as interleaved for every
+ * codec except JPEG.
+ */
+static void
+gen8_mfd_surface_state(VADriverContextP ctx,
+                       struct decode_state *decode_state,
+                       int standard_select,
+                       struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    struct object_surface *obj_surface = decode_state->render_object;
+    unsigned int cb_y_offset;
+    unsigned int cr_y_offset;
+    unsigned int size_dw;
+    unsigned int format_dw;
+
+    assert(obj_surface);
+
+    cb_y_offset = obj_surface->y_cb_offset;
+    cr_y_offset = obj_surface->y_cr_offset;
+
+    /* DW2: picture height and width, both minus one */
+    size_dw = ((obj_surface->orig_height - 1) << 18) |
+              ((obj_surface->orig_width - 1) << 4);
+
+    /* DW3: format, chroma layout, pitch and tiling */
+    format_dw = (MFX_SURFACE_PLANAR_420_8 << 28) | /* 420 planar YUV surface */
+                ((standard_select != MFX_FORMAT_JPEG) << 27) | /* interleave chroma, set to 0 for JPEG */
+                (0 << 22) | /* surface object control state, ignored */
+                ((obj_surface->width - 1) << 3) | /* pitch */
+                (0 << 2)  | /* must be 0 */
+                (1 << 1)  | /* must be tiled */
+                (I965_TILEWALK_YMAJOR << 0);  /* tile walk, must be 1 */
+
+    BEGIN_BCS_BATCH(batch, 6);
+    OUT_BCS_BATCH(batch, MFX_SURFACE_STATE | (6 - 2));
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, size_dw);
+    OUT_BCS_BATCH(batch, format_dw);
+    OUT_BCS_BATCH(batch,
+                  (0 << 16) | /* X offset for U(Cb), must be 0 */
+                  (cb_y_offset << 0)); /* Y offset for U(Cb) */
+    OUT_BCS_BATCH(batch,
+                  (0 << 16) | /* X offset for V(Cr), must be 0 */
+                  (cr_y_offset << 0)); /* Y offset for V(Cr), must be 0 for video codec, non-zero for JPEG */
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Emit MFX_PIPE_BUF_ADDR_STATE (61 DWORDs) for decoding.
+ *
+ * Each buffer occupies a group of DWORDs of which only the first carries an
+ * address; the remaining DWORDs of every group are written as 0 here.  A
+ * buffer whose `valid` flag is clear gets a 0 address instead of a
+ * relocation.  `decode_state` and `standard_select` are not used.
+ */
+static void
+gen8_mfd_pipe_buf_addr_state(VADriverContextP ctx,
+                             struct decode_state *decode_state,
+                             int standard_select,
+                             struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    int i;
+
+    BEGIN_BCS_BATCH(batch, 61);
+    OUT_BCS_BATCH(batch, MFX_PIPE_BUF_ADDR_STATE | (61 - 2));
+       /* Pre-deblocking output: DW 1-3 */
+    if (gen7_mfd_context->pre_deblocking_output.valid)
+        OUT_BCS_RELOC(batch, gen7_mfd_context->pre_deblocking_output.bo,
+                      I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                      0);
+    else
+        OUT_BCS_BATCH(batch, 0);
+
+       OUT_BCS_BATCH(batch, 0);
+       OUT_BCS_BATCH(batch, 0);
+       /* Post-deblocking output: DW 4-6 */
+    if (gen7_mfd_context->post_deblocking_output.valid)
+        OUT_BCS_RELOC(batch, gen7_mfd_context->post_deblocking_output.bo,
+                      I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                      0);
+    else
+        OUT_BCS_BATCH(batch, 0);
+
+       OUT_BCS_BATCH(batch, 0);
+       OUT_BCS_BATCH(batch, 0);
+
+       /* Uncompressed video and stream-out: DW 7-12, all zero when decoding */
+    OUT_BCS_BATCH(batch, 0); /* ignore for decoding */
+    OUT_BCS_BATCH(batch, 0); /* ignore for decoding */
+       OUT_BCS_BATCH(batch, 0);
+       OUT_BCS_BATCH(batch, 0);
+       OUT_BCS_BATCH(batch, 0);
+       OUT_BCS_BATCH(batch, 0);
+
+       /* Intra row-store scratch buffer: DW 13-15 */
+    if (gen7_mfd_context->intra_row_store_scratch_buffer.valid)
+        OUT_BCS_RELOC(batch, gen7_mfd_context->intra_row_store_scratch_buffer.bo,
+                      I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                      0);
+    else
+        OUT_BCS_BATCH(batch, 0);
+
+       OUT_BCS_BATCH(batch, 0);
+       OUT_BCS_BATCH(batch, 0);
+       /* Deblocking-filter row-store scratch buffer: DW 16-18 */
+    if (gen7_mfd_context->deblocking_filter_row_store_scratch_buffer.valid)
+        OUT_BCS_RELOC(batch, gen7_mfd_context->deblocking_filter_row_store_scratch_buffer.bo,
+                      I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                      0);
+    else
+        OUT_BCS_BATCH(batch, 0);
+       OUT_BCS_BATCH(batch, 0);
+       OUT_BCS_BATCH(batch, 0);
+
+    /*
+     * Reference pictures: DW 19..50, two DWORDs per entry (address, then 0).
+     * NOTE(review): the 19..50 range implies reference_surface[] has 16
+     * entries - confirm against the struct definition.
+     */
+    for (i = 0; i < ARRAY_ELEMS(gen7_mfd_context->reference_surface); i++) {
+        struct object_surface *obj_surface;
+
+        /* only slots with a valid id, a surface object and a backing bo get an address */
+        if (gen7_mfd_context->reference_surface[i].surface_id != VA_INVALID_ID &&
+            gen7_mfd_context->reference_surface[i].obj_surface &&
+            gen7_mfd_context->reference_surface[i].obj_surface->bo) {
+            obj_surface = gen7_mfd_context->reference_surface[i].obj_surface;
+
+            OUT_BCS_RELOC(batch, obj_surface->bo,
+                          I915_GEM_DOMAIN_INSTRUCTION, 0,
+                          0);
+        } else {
+            OUT_BCS_BATCH(batch, 0);
+        }
+        
+        OUT_BCS_BATCH(batch, 0);
+    }
+    
+    /* Reference picture properties: DW 51 */
+    OUT_BCS_BATCH(batch, 0);  
+       
+    /* Macroblock status and ILDB buffers: DW 52-57, not programmed here */
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+
+    /* Second macroblock status buffer: DW 58-60, not programmed here */
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Emit MFX_IND_OBJ_BASE_ADDR_STATE (26 DWORDs) for decoding.
+ *
+ * Only the indirect bitstream object is programmed: its base is
+ * `slice_data_bo` and its upper bound DWORD is 0x80000000.  The indirect MV,
+ * IT_COFF, IT_DBLK and PAK_BSE groups (DW 6-25) are all written as zero.
+ */
+static void
+gen8_mfd_ind_obj_base_addr_state(VADriverContextP ctx,
+                                 dri_bo *slice_data_bo,
+                                 int standard_select,
+                                 struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    int dw;
+
+    BEGIN_BCS_BATCH(batch, 26);
+    OUT_BCS_BATCH(batch, MFX_IND_OBJ_BASE_ADDR_STATE | (26 - 2));
+
+    /* DW 1-3: MFX indirect bitstream object base address */
+    OUT_BCS_RELOC(batch, slice_data_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+
+    /* DW 4-5: upper bound, must be set, up to 2G */
+    OUT_BCS_BATCH(batch, 0x80000000);
+    OUT_BCS_BATCH(batch, 0);
+
+    /*
+     * DW 6-25: indirect MV (6-10), IT_COFF (11-15), IT_DBLK (16-20) and the
+     * encoder-only PAK_BSE object (21-25) are left at zero.
+     */
+    for (dw = 6; dw <= 25; dw++) {
+        OUT_BCS_BATCH(batch, 0);
+    }
+
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Emit MFX_BSP_BUF_BASE_ADDR_STATE (10 DWORDs): three groups of three
+ * DWORDs for the BSD/MPC row store, the MPR row store and the bitplane read
+ * buffer.  A group's address DWORD is a relocation when the buffer's
+ * `valid` flag is set and 0 otherwise; the other two DWORDs are always 0.
+ */
+static void
+gen8_mfd_bsp_buf_base_addr_state(VADriverContextP ctx,
+                                 struct decode_state *decode_state,
+                                 int standard_select,
+                                 struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+
+    BEGIN_BCS_BATCH(batch, 10);
+    OUT_BCS_BATCH(batch, MFX_BSP_BUF_BASE_ADDR_STATE | (10 - 2));
+
+    /* DW 1-3: BSD/MPC row store scratch buffer */
+    if (!gen7_mfd_context->bsd_mpc_row_store_scratch_buffer.valid) {
+        OUT_BCS_BATCH(batch, 0);
+    } else {
+        OUT_BCS_RELOC(batch, gen7_mfd_context->bsd_mpc_row_store_scratch_buffer.bo,
+                      I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                      0);
+    }
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+
+    /* DW 4-6: MPR row store scratch buffer */
+    if (!gen7_mfd_context->mpr_row_store_scratch_buffer.valid) {
+        OUT_BCS_BATCH(batch, 0);
+    } else {
+        OUT_BCS_RELOC(batch, gen7_mfd_context->mpr_row_store_scratch_buffer.bo,
+                      I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                      0);
+    }
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+
+    /* DW 7-9: bitplane read buffer (no write domain requested) */
+    if (!gen7_mfd_context->bitplane_read_buffer.valid) {
+        OUT_BCS_BATCH(batch, 0);
+    } else {
+        OUT_BCS_RELOC(batch, gen7_mfd_context->bitplane_read_buffer.bo,
+                      I915_GEM_DOMAIN_INSTRUCTION, 0,
+                      0);
+    }
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Emit one MFX_QM_STATE command (18 DWORDs) loading a quantiser matrix.
+ *
+ * qm_type   - matrix selector written to DW1.
+ * qm        - matrix bytes to load.
+ * qm_length - number of bytes at `qm`; must not exceed 64.
+ *
+ * The command always carries a 64-byte payload.  When `qm_length` is
+ * shorter than that, the remainder of the payload is zero-filled, so no
+ * uninitialised stack bytes are ever copied into the batch buffer.
+ */
+static void
+gen8_mfd_qm_state(VADriverContextP ctx,
+                  int qm_type,
+                  unsigned char *qm,
+                  int qm_length,
+                  struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    unsigned int qm_buffer[16] = { 0 }; /* zero padding for short matrices */
+
+    assert(qm_length <= 16 * 4);
+    memcpy(qm_buffer, qm, qm_length);
+
+    BEGIN_BCS_BATCH(batch, 18);
+    OUT_BCS_BATCH(batch, MFX_QM_STATE | (18 - 2));
+    OUT_BCS_BATCH(batch, qm_type << 0);
+    intel_batchbuffer_data(batch, qm_buffer, 16 * 4);
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Emit MFX_AVC_IMG_STATE (17 DWORDs) from the H.264 picture parameters in
+ * decode_state->pic_param.
+ *
+ * The picture structure is derived from CurrPic.flags (1 = top field,
+ * 3 = bottom field, 0 = frame) and cross-checked against the field/frame
+ * flags with assertions.  Only 4:0:0 and 4:2:0 content is accepted.
+ * DW 5-16 are written as zero.
+ */
+static void
+gen8_mfd_avc_img_state(VADriverContextP ctx,
+                       struct decode_state *decode_state,
+                       struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    VAPictureParameterBufferH264 *pic_param;
+    unsigned int width_in_mbs, height_in_mbs;
+    unsigned int pred_dw, flags_dw;
+    int img_struct;
+    int mbaff_frame_flag;
+    int dw;
+
+    assert(decode_state->pic_param && decode_state->pic_param->buffer);
+    pic_param = (VAPictureParameterBufferH264 *)decode_state->pic_param->buffer;
+    assert(!(pic_param->CurrPic.flags & VA_PICTURE_H264_INVALID));
+
+    /* picture structure: top field wins over bottom field, default is frame */
+    img_struct = 0;
+
+    if (pic_param->CurrPic.flags & VA_PICTURE_H264_TOP_FIELD)
+        img_struct = 1;
+    else if (pic_param->CurrPic.flags & VA_PICTURE_H264_BOTTOM_FIELD)
+        img_struct = 3;
+
+    /* a field structure must come with field_pic_flag, a frame without it */
+    if ((img_struct & 0x1) == 0x1) {
+        assert(pic_param->pic_fields.bits.field_pic_flag == 0x1);
+    } else {
+        assert(pic_param->pic_fields.bits.field_pic_flag == 0x0);
+    }
+
+    if (pic_param->seq_fields.bits.frame_mbs_only_flag) { /* a frame containing only frame macroblocks */
+        assert(pic_param->seq_fields.bits.mb_adaptive_frame_field_flag == 0);
+        assert(pic_param->pic_fields.bits.field_pic_flag == 0);
+    } else {
+        assert(pic_param->seq_fields.bits.direct_8x8_inference_flag == 1); /* see H.264 spec */
+    }
+
+    /* MBAFF applies to frame pictures of an MBAFF-enabled sequence only */
+    mbaff_frame_flag = (pic_param->seq_fields.bits.mb_adaptive_frame_field_flag &&
+                        !pic_param->pic_fields.bits.field_pic_flag);
+
+    width_in_mbs = pic_param->picture_width_in_mbs_minus1 + 1;
+    height_in_mbs = pic_param->picture_height_in_mbs_minus1 + 1; /* frame height */
+
+    /* MFX unit doesn't support 4:2:2 and 4:4:4 picture */
+    assert(pic_param->seq_fields.bits.chroma_format_idc == 0 || /* monochrome picture */
+           pic_param->seq_fields.bits.chroma_format_idc == 1);  /* 4:2:0 */
+    assert(pic_param->seq_fields.bits.residual_colour_transform_flag == 0); /* only available for 4:4:4 */
+
+    /* DW3: chroma QP offsets, weighted prediction and picture structure */
+    pred_dw = ((pic_param->second_chroma_qp_index_offset & 0x1f) << 24) |
+              ((pic_param->chroma_qp_index_offset & 0x1f) << 16) |
+              (0 << 14) | /* Max-bit conformance Intra flag ??? FIXME */
+              (0 << 13) | /* Max Macroblock size conformance Inter flag ??? FIXME */
+              (pic_param->pic_fields.bits.weighted_pred_flag << 12) | /* differ from GEN6 */
+              (pic_param->pic_fields.bits.weighted_bipred_idc << 10) |
+              (img_struct << 8);
+
+    /* DW4: sequence/picture coding flags */
+    flags_dw = (pic_param->seq_fields.bits.chroma_format_idc << 10) |
+               (pic_param->pic_fields.bits.entropy_coding_mode_flag << 7) |
+               ((!pic_param->pic_fields.bits.reference_pic_flag) << 6) |
+               (pic_param->pic_fields.bits.constrained_intra_pred_flag << 5) |
+               (pic_param->seq_fields.bits.direct_8x8_inference_flag << 4) |
+               (pic_param->pic_fields.bits.transform_8x8_mode_flag << 3) |
+               (pic_param->seq_fields.bits.frame_mbs_only_flag << 2) |
+               (mbaff_frame_flag << 1) |
+               (pic_param->pic_fields.bits.field_pic_flag << 0);
+
+    BEGIN_BCS_BATCH(batch, 17);
+    OUT_BCS_BATCH(batch, MFX_AVC_IMG_STATE | (17 - 2));
+    OUT_BCS_BATCH(batch,
+                  width_in_mbs * height_in_mbs);
+    OUT_BCS_BATCH(batch,
+                  ((height_in_mbs - 1) << 16) |
+                  ((width_in_mbs - 1) << 0));
+    OUT_BCS_BATCH(batch, pred_dw);
+    OUT_BCS_BATCH(batch, flags_dw);
+
+    /* DW 5-16 are not used for decoding here and stay zero */
+    for (dw = 5; dw <= 16; dw++) {
+        OUT_BCS_BATCH(batch, 0);
+    }
+
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Load the H.264 scaling lists through MFX_QM_STATE commands.
+ *
+ * The lists come from the application's IQ matrix buffer when one is
+ * present, otherwise from the copy kept in the decoder context.  The 4x4
+ * intra and inter matrices (three 16-byte lists each) are always sent; the
+ * two 8x8 matrices only when transform_8x8_mode_flag is set.
+ */
+static void
+gen8_mfd_avc_qm_state(VADriverContextP ctx,
+                      struct decode_state *decode_state,
+                      struct gen7_mfd_context *gen7_mfd_context)
+{
+    VAPictureParameterBufferH264 *pic_param;
+    VAIQMatrixBufferH264 *iq_matrix = &gen7_mfd_context->iq_matrix.h264;
+
+    /* prefer the matrices supplied with this picture */
+    if (decode_state->iq_matrix && decode_state->iq_matrix->buffer)
+        iq_matrix = (VAIQMatrixBufferH264 *)decode_state->iq_matrix->buffer;
+
+    assert(decode_state->pic_param && decode_state->pic_param->buffer);
+    pic_param = (VAPictureParameterBufferH264 *)decode_state->pic_param->buffer;
+
+    /* 4x4: lists 0..2 are sent as the intra matrix, lists 3..5 as the inter matrix */
+    gen8_mfd_qm_state(ctx, MFX_QM_AVC_4X4_INTRA_MATRIX, &iq_matrix->ScalingList4x4[0][0], 3 * 16, gen7_mfd_context);
+    gen8_mfd_qm_state(ctx, MFX_QM_AVC_4X4_INTER_MATRIX, &iq_matrix->ScalingList4x4[3][0], 3 * 16, gen7_mfd_context);
+
+    if (!pic_param->pic_fields.bits.transform_8x8_mode_flag)
+        return;
+
+    /* 8x8: list 0 is sent as the intra matrix, list 1 as the inter matrix */
+    gen8_mfd_qm_state(ctx, MFX_QM_AVC_8x8_INTRA_MATRIX, &iq_matrix->ScalingList8x8[0][0], 64, gen7_mfd_context);
+    gen8_mfd_qm_state(ctx, MFX_QM_AVC_8x8_INTER_MATRIX, &iq_matrix->ScalingList8x8[1][0], 64, gen7_mfd_context);
+}
+
+/*
+ * Emit MFD_AVC_PICID_STATE (10 DWORDs) with picture ID remapping disabled:
+ * DW1 is 1 and the eight DWORDs that follow are zero.
+ */
+static void
+gen8_mfd_avc_picid_state(VADriverContextP ctx,
+                      struct decode_state *decode_state,
+                      struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    int dw;
+
+    BEGIN_BCS_BATCH(batch, 10);
+    OUT_BCS_BATCH(batch, MFD_AVC_PICID_STATE | (10 - 2));
+    OUT_BCS_BATCH(batch, 1); /* disable Picture ID Remapping */
+
+    /* DW 2-9: left at zero */
+    for (dw = 2; dw <= 9; dw++) {
+        OUT_BCS_BATCH(batch, 0);
+    }
+
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Emit MFX_AVC_DIRECTMODE_STATE (71 dwords).
+ *
+ * Layout as written below: one 2-dword entry per reference frame-store
+ * slot (a relocation to the surface's dmv_top buffer plus a zero dword, or
+ * two zeros when the slot is unusable), one zero dword, the dmv_top
+ * relocation of the picture being decoded followed by two zero dwords,
+ * then a Top/BottomFieldOrderCnt pair for every slot and finally for the
+ * current picture.
+ *
+ * ctx and slice_param are accepted but not used here.
+ */
+static void
+gen8_mfd_avc_directmode_state(VADriverContextP ctx,
+                              struct decode_state *decode_state,
+                              VAPictureParameterBufferH264 *pic_param,
+                              VASliceParameterBufferH264 *slice_param,
+                              struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    struct object_surface *cur_surface = decode_state->render_object;
+    GenAvcSurface *avc_priv;
+    VAPictureH264 *pic = &pic_param->CurrPic;
+    int slot, k;
+
+    BEGIN_BCS_BATCH(batch, 71);
+    OUT_BCS_BATCH(batch, MFX_AVC_DIRECTMODE_STATE | (71 - 2));
+
+    /* direct MV buffers of the reference frame-store slots */
+    for (slot = 0; slot < ARRAY_ELEMS(gen7_mfd_context->reference_surface); slot++) {
+        struct object_surface *ref_surface = NULL;
+
+        if (gen7_mfd_context->reference_surface[slot].surface_id != VA_INVALID_ID)
+            ref_surface = gen7_mfd_context->reference_surface[slot].obj_surface;
+
+        if (ref_surface == NULL || ref_surface->private_data == NULL) {
+            /* empty slot, or a surface without AVC private data */
+            OUT_BCS_BATCH(batch, 0);
+            OUT_BCS_BATCH(batch, 0);
+            continue;
+        }
+
+        avc_priv = ref_surface->private_data;
+        OUT_BCS_RELOC(batch, avc_priv->dmv_top,
+                      I915_GEM_DOMAIN_INSTRUCTION, 0,
+                      0);
+        OUT_BCS_BATCH(batch, 0);
+    }
+
+    OUT_BCS_BATCH(batch, 0);
+
+    /* direct MV buffer of the picture being decoded (also a write target) */
+    assert(cur_surface->bo && cur_surface->private_data);
+    avc_priv = cur_surface->private_data;
+
+    OUT_BCS_RELOC(batch, avc_priv->dmv_top,
+                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                  0);
+
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+
+    /* POC list: one Top/Bottom pair per frame-store slot */
+    for (slot = 0; slot < ARRAY_ELEMS(gen7_mfd_context->reference_surface); slot++) {
+        int found = 0;
+
+        if (gen7_mfd_context->reference_surface[slot].surface_id == VA_INVALID_ID) {
+            OUT_BCS_BATCH(batch, 0);
+            OUT_BCS_BATCH(batch, 0);
+            continue;
+        }
+
+        assert(gen7_mfd_context->reference_surface[slot].obj_surface != NULL);
+
+        /* look up the valid ReferenceFrames entry that names this surface */
+        for (k = 0; k < ARRAY_ELEMS(pic_param->ReferenceFrames); k++) {
+            pic = &pic_param->ReferenceFrames[k];
+
+            if (!(pic->flags & VA_PICTURE_H264_INVALID) &&
+                pic->picture_id == gen7_mfd_context->reference_surface[slot].surface_id) {
+                found = 1;
+                break;
+            }
+        }
+
+        assert(found == 1);
+        assert(!(pic->flags & VA_PICTURE_H264_INVALID));
+
+        OUT_BCS_BATCH(batch, pic->TopFieldOrderCnt);
+        OUT_BCS_BATCH(batch, pic->BottomFieldOrderCnt);
+    }
+
+    /* POC pair of the current picture closes the command */
+    OUT_BCS_BATCH(batch, pic_param->CurrPic.TopFieldOrderCnt);
+    OUT_BCS_BATCH(batch, pic_param->CurrPic.BottomFieldOrderCnt);
+
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Emit MFX_AVC_SLICE_STATE (11 dwords) for one slice.
+ *
+ * SI and SP slices are programmed as I and P respectively.  The start
+ * position of this slice and of the following one are derived from
+ * first_mb_in_slice (shifted left by one for MBAFF pictures).  When
+ * next_slice_param is NULL the slice is flagged as the last one and the
+ * "next" position is column 0 of the row just past the picture (half the
+ * picture height for field pictures).
+ */
+static void
+gen8_mfd_avc_slice_state(VADriverContextP ctx,
+                         VAPictureParameterBufferH264 *pic_param,
+                         VASliceParameterBufferH264 *slice_param,
+                         VASliceParameterBufferH264 *next_slice_param,
+                         struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    const int mbaff_picture = (!pic_param->pic_fields.bits.field_pic_flag &&
+                               pic_param->seq_fields.bits.mb_adaptive_frame_field_flag);
+    const int width_in_mbs = pic_param->picture_width_in_mbs_minus1 + 1;
+    const int height_in_mbs = pic_param->picture_height_in_mbs_minus1 + 1;
+    const int is_last_slice = (next_slice_param == NULL);
+    int slice_type, num_ref_idx_l0, num_ref_idx_l1;
+    int first_mb, first_mb_next;
+    int slice_x, slice_y, next_x, next_y;
+    int slice_qp;
+
+    /* normalise the slice type and derive the active reference counts */
+    if (slice_param->slice_type == SLICE_TYPE_I ||
+        slice_param->slice_type == SLICE_TYPE_SI) {
+        slice_type = SLICE_TYPE_I;
+        assert(slice_param->num_ref_idx_l0_active_minus1 == 0);
+        assert(slice_param->num_ref_idx_l1_active_minus1 == 0);
+        num_ref_idx_l0 = 0;
+        num_ref_idx_l1 = 0;
+    } else if (slice_param->slice_type == SLICE_TYPE_P ||
+               slice_param->slice_type == SLICE_TYPE_SP) {
+        slice_type = SLICE_TYPE_P;
+        assert(slice_param->num_ref_idx_l1_active_minus1 == 0);
+        num_ref_idx_l0 = slice_param->num_ref_idx_l0_active_minus1 + 1;
+        num_ref_idx_l1 = 0;
+    } else {
+        assert(slice_param->slice_type == SLICE_TYPE_B);
+        slice_type = SLICE_TYPE_B;
+        num_ref_idx_l0 = slice_param->num_ref_idx_l0_active_minus1 + 1;
+        num_ref_idx_l1 = slice_param->num_ref_idx_l1_active_minus1 + 1;
+    }
+
+    /* where this slice starts */
+    first_mb = slice_param->first_mb_in_slice << mbaff_picture;
+    slice_x = first_mb % width_in_mbs;
+    slice_y = first_mb / width_in_mbs;
+
+    /* where the next slice starts, or the end of the picture */
+    if (is_last_slice) {
+        next_x = 0;
+        next_y = height_in_mbs / (1 + !!pic_param->pic_fields.bits.field_pic_flag);
+    } else {
+        first_mb_next = next_slice_param->first_mb_in_slice << mbaff_picture;
+        next_x = first_mb_next % width_in_mbs;
+        next_y = first_mb_next / width_in_mbs;
+    }
+
+    slice_qp = pic_param->pic_init_qp_minus26 + 26 + slice_param->slice_qp_delta;
+
+    BEGIN_BCS_BATCH(batch, 11); /* FIXME: is it 10??? */
+    OUT_BCS_BATCH(batch, MFX_AVC_SLICE_STATE | (11 - 2));
+    OUT_BCS_BATCH(batch, slice_type);
+    OUT_BCS_BATCH(batch,
+                  (num_ref_idx_l1 << 24) |
+                  (num_ref_idx_l0 << 16) |
+                  (slice_param->chroma_log2_weight_denom << 8) |
+                  (slice_param->luma_log2_weight_denom << 0));
+    OUT_BCS_BATCH(batch,
+                  (slice_param->direct_spatial_mv_pred_flag << 29) |
+                  (slice_param->disable_deblocking_filter_idc << 27) |
+                  (slice_param->cabac_init_idc << 24) |
+                  (slice_qp << 16) |
+                  ((slice_param->slice_beta_offset_div2 & 0xf) << 8) |
+                  ((slice_param->slice_alpha_c0_offset_div2 & 0xf) << 0));
+    OUT_BCS_BATCH(batch,
+                  (slice_y << 24) |
+                  (slice_x << 16) |
+                  (first_mb << 0));
+    OUT_BCS_BATCH(batch,
+                  (next_y << 16) |
+                  (next_x << 0));
+    OUT_BCS_BATCH(batch, is_last_slice << 19); /* last slice flag */
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Thin wrapper: hands the slice parameters and this context's reference
+ * frame-store array to gen6_send_avc_ref_idx_state() on the context's
+ * batch buffer.  ctx and pic_param are not used.
+ */
+static inline void
+gen8_mfd_avc_ref_idx_state(VADriverContextP ctx,
+                           VAPictureParameterBufferH264 *pic_param,
+                           VASliceParameterBufferH264 *slice_param,
+                           struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+
+    gen6_send_avc_ref_idx_state(batch,
+                                slice_param,
+                                gen7_mfd_context->reference_surface);
+}
+
+/*
+ * Emit MFX_AVC_WEIGHTOFFSET_STATE (98 dwords each) for the slice.
+ *
+ * No command is sent unless explicit weighting applies: one table (L0)
+ * for P/SP slices with weighted_pred_flag == 1, two tables (L0 then L1)
+ * for B slices with weighted_bipred_idc == 1.  Each table holds 32 entries
+ * of six shorts: luma weight, luma offset, then weight/offset for each of
+ * the two chroma components.
+ */
+static void
+gen8_mfd_avc_weightoffset_state(VADriverContextP ctx,
+                                VAPictureParameterBufferH264 *pic_param,
+                                VASliceParameterBufferH264 *slice_param,
+                                struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    short weightoffsets[32 * 6];
+    int num_tables = 0;
+    int list, ref;
+
+    if (pic_param->pic_fields.bits.weighted_pred_flag == 1 &&
+        (slice_param->slice_type == SLICE_TYPE_P ||
+         slice_param->slice_type == SLICE_TYPE_SP))
+        num_tables = 1;
+
+    if (pic_param->pic_fields.bits.weighted_bipred_idc == 1 &&
+        slice_param->slice_type == SLICE_TYPE_B)
+        num_tables = 2;
+
+    for (list = 0; list < num_tables; list++) {
+        short *entry = weightoffsets;
+
+        BEGIN_BCS_BATCH(batch, 98);
+        OUT_BCS_BATCH(batch, MFX_AVC_WEIGHTOFFSET_STATE | (98 - 2));
+        OUT_BCS_BATCH(batch, list); /* 0: list 0 table, 1: list 1 table */
+
+        /* interleave the six per-reference values into one packed table */
+        for (ref = 0; ref < 32; ref++) {
+            if (list == 0) {
+                *entry++ = slice_param->luma_weight_l0[ref];
+                *entry++ = slice_param->luma_offset_l0[ref];
+                *entry++ = slice_param->chroma_weight_l0[ref][0];
+                *entry++ = slice_param->chroma_offset_l0[ref][0];
+                *entry++ = slice_param->chroma_weight_l0[ref][1];
+                *entry++ = slice_param->chroma_offset_l0[ref][1];
+            } else {
+                *entry++ = slice_param->luma_weight_l1[ref];
+                *entry++ = slice_param->luma_offset_l1[ref];
+                *entry++ = slice_param->chroma_weight_l1[ref][0];
+                *entry++ = slice_param->chroma_offset_l1[ref][0];
+                *entry++ = slice_param->chroma_weight_l1[ref][1];
+                *entry++ = slice_param->chroma_offset_l1[ref][1];
+            }
+        }
+
+        intel_batchbuffer_data(batch, weightoffsets, sizeof(weightoffsets));
+        ADVANCE_BCS_BATCH(batch);
+    }
+}
+
+/*
+ * Emit MFD_AVC_BSD_OBJECT (6 dwords) to start decoding one slice.
+ *
+ * The bit offset of the first macroblock comes from
+ * avc_get_first_mb_bit_offset(); it is programmed as a byte part
+ * (bits 16 and up) and a residual bit part (bits 0..2).  Bit 3 marks the
+ * last slice, i.e. next_slice_param == NULL.
+ */
+static void
+gen8_mfd_avc_bsd_object(VADriverContextP ctx,
+                        VAPictureParameterBufferH264 *pic_param,
+                        VASliceParameterBufferH264 *slice_param,
+                        dri_bo *slice_data_bo,
+                        VASliceParameterBufferH264 *next_slice_param,
+                        struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    int first_mb_bit_offset, first_mb_byte_offset, is_last_slice;
+
+    first_mb_bit_offset =
+        avc_get_first_mb_bit_offset(slice_data_bo,
+                                    slice_param,
+                                    pic_param->pic_fields.bits.entropy_coding_mode_flag);
+    first_mb_byte_offset = first_mb_bit_offset >> 3;
+    is_last_slice = (next_slice_param == NULL);
+
+    /* the input bitstream format on GEN7 differs from GEN6 */
+    BEGIN_BCS_BATCH(batch, 6);
+    OUT_BCS_BATCH(batch, MFD_AVC_BSD_OBJECT | (6 - 2));
+    OUT_BCS_BATCH(batch, slice_param->slice_data_size);
+    OUT_BCS_BATCH(batch, slice_param->slice_data_offset);
+    OUT_BCS_BATCH(batch, 0); /* every field of this dword is left at zero */
+    OUT_BCS_BATCH(batch,
+                  (first_mb_byte_offset << 16) |
+                  (1 << 7) |
+                  (is_last_slice << 3) | /* LastSlice Flag */
+                  (first_mb_bit_offset & 0x7));
+    OUT_BCS_BATCH(batch, 0);
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * One-time H.264 setup for a decoder context: fills the context's own
+ * H.264 IQ matrix through avc_gen_default_iq_matrix() (flat scaling
+ * lists).  ctx is not used.
+ */
+static inline void
+gen8_mfd_avc_context_init(
+    VADriverContextP         ctx,
+    struct gen7_mfd_context *gen7_mfd_context
+)
+{
+    VAIQMatrixBufferH264 *default_matrix = &gen7_mfd_context->iq_matrix.h264;
+
+    avc_gen_default_iq_matrix(default_matrix);
+}
+
+/*
+ * Per-picture preparation for H.264 decoding.
+ *
+ * - Scans the slice parameters; in-loop deblocking is considered enabled
+ *   as soon as one slice has disable_deblocking_filter_idc != 1.
+ * - Refreshes the reference frame-store indices.
+ * - Makes sure the render target has an NV12 buffer, pre-fills its chroma
+ *   plane with 0x80 for monochrome streams (chroma_format_idc == 0) and
+ *   initialises its AVC private data.
+ * - Points both the pre- and post-deblocking outputs at the render target
+ *   and marks exactly one of them valid, depending on deblocking.
+ * - Reallocates the four row-store scratch buffers, sized from the picture
+ *   width in macroblocks; the bitplane read buffer is marked unused.
+ */
+static void
+gen8_mfd_avc_decode_init(VADriverContextP ctx,
+                         struct decode_state *decode_state,
+                         struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    VAPictureParameterBufferH264 *pic_param;
+    VASliceParameterBufferH264 *slice_param;
+    struct object_surface *obj_surface;
+    unsigned int width_in_mbs, height_in_mbs;
+    int buf_idx, elem, enable_avc_ildb = 0;
+
+    /* stop scanning at the first slice that keeps the deblocking filter on */
+    for (buf_idx = 0; buf_idx < decode_state->num_slice_params && !enable_avc_ildb; buf_idx++) {
+        assert(decode_state->slice_params && decode_state->slice_params[buf_idx]->buffer);
+        slice_param = (VASliceParameterBufferH264 *)decode_state->slice_params[buf_idx]->buffer;
+
+        for (elem = 0; elem < decode_state->slice_params[buf_idx]->num_elements; elem++, slice_param++) {
+            assert(slice_param->slice_data_flag == VA_SLICE_DATA_FLAG_ALL);
+            assert((slice_param->slice_type == SLICE_TYPE_I) ||
+                   (slice_param->slice_type == SLICE_TYPE_SI) ||
+                   (slice_param->slice_type == SLICE_TYPE_P) ||
+                   (slice_param->slice_type == SLICE_TYPE_SP) ||
+                   (slice_param->slice_type == SLICE_TYPE_B));
+
+            if (slice_param->disable_deblocking_filter_idc != 1) {
+                enable_avc_ildb = 1;
+                break;
+            }
+        }
+    }
+
+    assert(decode_state->pic_param && decode_state->pic_param->buffer);
+    pic_param = (VAPictureParameterBufferH264 *)decode_state->pic_param->buffer;
+    intel_update_avc_frame_store_index(ctx, decode_state, pic_param, gen7_mfd_context->reference_surface);
+
+    width_in_mbs = pic_param->picture_width_in_mbs_minus1 + 1;
+    height_in_mbs = pic_param->picture_height_in_mbs_minus1 + 1;
+    assert(width_in_mbs > 0 && width_in_mbs <= 256); /* 4K */
+    assert(height_in_mbs > 0 && height_in_mbs <= 256);
+
+    /* render target: refresh the reference flag and ensure NV12 storage */
+    obj_surface = decode_state->render_object;
+    obj_surface->flags &= ~SURFACE_REF_DIS_MASK;
+    obj_surface->flags |= (pic_param->pic_fields.bits.reference_pic_flag ? SURFACE_REFERENCED : 0);
+    i965_check_alloc_surface_bo(ctx, obj_surface, 1, VA_FOURCC('N','V','1','2'), SUBSAMPLE_YUV420);
+
+    /* YUV400: give the chroma plane, which follows the luma plane, a fixed 0x80 fill */
+    if (pic_param->seq_fields.bits.chroma_format_idc == 0) {
+        unsigned int luma_size = obj_surface->width * obj_surface->height;
+        unsigned int chroma_size = obj_surface->width * obj_surface->height / 2;
+
+        drm_intel_gem_bo_map_gtt(obj_surface->bo);
+        memset(obj_surface->bo->virtual + luma_size, 0x80, chroma_size);
+        drm_intel_gem_bo_unmap_gtt(obj_surface->bo);
+    }
+
+    gen8_mfd_init_avc_surface(ctx, pic_param, obj_surface);
+
+    /* both outputs alias the render target; only one is flagged valid */
+    dri_bo_unreference(gen7_mfd_context->post_deblocking_output.bo);
+    gen7_mfd_context->post_deblocking_output.bo = obj_surface->bo;
+    dri_bo_reference(gen7_mfd_context->post_deblocking_output.bo);
+    gen7_mfd_context->post_deblocking_output.valid = enable_avc_ildb;
+
+    dri_bo_unreference(gen7_mfd_context->pre_deblocking_output.bo);
+    gen7_mfd_context->pre_deblocking_output.bo = obj_surface->bo;
+    dri_bo_reference(gen7_mfd_context->pre_deblocking_output.bo);
+    gen7_mfd_context->pre_deblocking_output.valid = !enable_avc_ildb;
+
+    /* row-store scratch buffers are dropped and reallocated per picture */
+    dri_bo_unreference(gen7_mfd_context->intra_row_store_scratch_buffer.bo);
+    gen7_mfd_context->intra_row_store_scratch_buffer.bo =
+        dri_bo_alloc(i965->intel.bufmgr,
+                     "intra row store",
+                     width_in_mbs * 64,
+                     0x1000);
+    assert(gen7_mfd_context->intra_row_store_scratch_buffer.bo);
+    gen7_mfd_context->intra_row_store_scratch_buffer.valid = 1;
+
+    dri_bo_unreference(gen7_mfd_context->deblocking_filter_row_store_scratch_buffer.bo);
+    gen7_mfd_context->deblocking_filter_row_store_scratch_buffer.bo =
+        dri_bo_alloc(i965->intel.bufmgr,
+                     "deblocking filter row store",
+                     width_in_mbs * 64 * 4,
+                     0x1000);
+    assert(gen7_mfd_context->deblocking_filter_row_store_scratch_buffer.bo);
+    gen7_mfd_context->deblocking_filter_row_store_scratch_buffer.valid = 1;
+
+    dri_bo_unreference(gen7_mfd_context->bsd_mpc_row_store_scratch_buffer.bo);
+    gen7_mfd_context->bsd_mpc_row_store_scratch_buffer.bo =
+        dri_bo_alloc(i965->intel.bufmgr,
+                     "bsd mpc row store",
+                     width_in_mbs * 64 * 2,
+                     0x1000);
+    assert(gen7_mfd_context->bsd_mpc_row_store_scratch_buffer.bo);
+    gen7_mfd_context->bsd_mpc_row_store_scratch_buffer.valid = 1;
+
+    dri_bo_unreference(gen7_mfd_context->mpr_row_store_scratch_buffer.bo);
+    gen7_mfd_context->mpr_row_store_scratch_buffer.bo =
+        dri_bo_alloc(i965->intel.bufmgr,
+                     "mpr row store",
+                     width_in_mbs * 64 * 2,
+                     0x1000);
+    assert(gen7_mfd_context->mpr_row_store_scratch_buffer.bo);
+    gen7_mfd_context->mpr_row_store_scratch_buffer.valid = 1;
+
+    gen7_mfd_context->bitplane_read_buffer.valid = 0;
+}
+
+/*
+ * Build and submit the BCS batch that decodes one H.264 picture.
+ *
+ * After gen8_mfd_avc_decode_init() the picture-level state commands are
+ * emitted once; then, for every slice of every slice parameter buffer,
+ * the direct-mode, ref-idx, weight/offset and slice state commands are
+ * emitted followed by the BSD object.  Each slice is told about its
+ * successor (the next element of the same buffer, else the first element
+ * of the next buffer, else NULL for the very last slice).
+ */
+static void
+gen8_mfd_avc_decode_picture(VADriverContextP ctx,
+                            struct decode_state *decode_state,
+                            struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    VAPictureParameterBufferH264 *pic_param;
+    VASliceParameterBufferH264 *cur_slice, *following_slice, *next_buffer_first_slice;
+    dri_bo *slice_data_bo;
+    int buf_idx, elem;
+
+    assert(decode_state->pic_param && decode_state->pic_param->buffer);
+    pic_param = (VAPictureParameterBufferH264 *)decode_state->pic_param->buffer;
+    gen8_mfd_avc_decode_init(ctx, decode_state, gen7_mfd_context);
+
+    /* picture-level state */
+    intel_batchbuffer_start_atomic_bcs(batch, 0x1000);
+    intel_batchbuffer_emit_mi_flush(batch);
+    gen8_mfd_pipe_mode_select(ctx, decode_state, MFX_FORMAT_AVC, gen7_mfd_context);
+    gen8_mfd_surface_state(ctx, decode_state, MFX_FORMAT_AVC, gen7_mfd_context);
+    gen8_mfd_pipe_buf_addr_state(ctx, decode_state, MFX_FORMAT_AVC, gen7_mfd_context);
+    gen8_mfd_bsp_buf_base_addr_state(ctx, decode_state, MFX_FORMAT_AVC, gen7_mfd_context);
+    gen8_mfd_avc_qm_state(ctx, decode_state, gen7_mfd_context);
+    gen8_mfd_avc_img_state(ctx, decode_state, gen7_mfd_context);
+    gen8_mfd_avc_picid_state(ctx, decode_state, gen7_mfd_context);
+
+    for (buf_idx = 0; buf_idx < decode_state->num_slice_params; buf_idx++) {
+        assert(decode_state->slice_params && decode_state->slice_params[buf_idx]->buffer);
+        cur_slice = (VASliceParameterBufferH264 *)decode_state->slice_params[buf_idx]->buffer;
+        slice_data_bo = decode_state->slice_datas[buf_idx]->bo;
+
+        /* the bitstream of this buffer becomes the indirect object base */
+        gen8_mfd_ind_obj_base_addr_state(ctx, slice_data_bo, MFX_FORMAT_AVC, gen7_mfd_context);
+
+        next_buffer_first_slice = NULL;
+        if (buf_idx != decode_state->num_slice_params - 1)
+            next_buffer_first_slice =
+                (VASliceParameterBufferH264 *)decode_state->slice_params[buf_idx + 1]->buffer;
+
+        for (elem = 0; elem < decode_state->slice_params[buf_idx]->num_elements; elem++, cur_slice++) {
+            assert(cur_slice->slice_data_flag == VA_SLICE_DATA_FLAG_ALL);
+            assert((cur_slice->slice_type == SLICE_TYPE_I) ||
+                   (cur_slice->slice_type == SLICE_TYPE_SI) ||
+                   (cur_slice->slice_type == SLICE_TYPE_P) ||
+                   (cur_slice->slice_type == SLICE_TYPE_SP) ||
+                   (cur_slice->slice_type == SLICE_TYPE_B));
+
+            following_slice = (elem < decode_state->slice_params[buf_idx]->num_elements - 1) ?
+                cur_slice + 1 : next_buffer_first_slice;
+
+            gen8_mfd_avc_directmode_state(ctx, decode_state, pic_param, cur_slice, gen7_mfd_context);
+            gen8_mfd_avc_ref_idx_state(ctx, pic_param, cur_slice, gen7_mfd_context);
+            gen8_mfd_avc_weightoffset_state(ctx, pic_param, cur_slice, gen7_mfd_context);
+            gen8_mfd_avc_slice_state(ctx, pic_param, cur_slice, following_slice, gen7_mfd_context);
+            gen8_mfd_avc_bsd_object(ctx, pic_param, cur_slice, slice_data_bo, following_slice, gen7_mfd_context);
+        }
+    }
+
+    intel_batchbuffer_end_atomic(batch);
+    intel_batchbuffer_flush(batch);
+}
+
+/*
+ * Per-picture preparation for MPEG-2 decoding.
+ *
+ * Updates the reference surfaces, makes sure the render target has an
+ * NV12 buffer and uses it as the pre-deblocking output, and reallocates
+ * the BSD/MPC row-store scratch buffer (96 bytes per macroblock column).
+ * Every other output/scratch buffer of the context is marked unused.
+ */
+static void
+gen8_mfd_mpeg2_decode_init(VADriverContextP ctx,
+                           struct decode_state *decode_state,
+                           struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    VAPictureParameterBufferMPEG2 *pic_param;
+    struct object_surface *render_target;
+    unsigned int width_in_mbs;
+
+    assert(decode_state->pic_param && decode_state->pic_param->buffer);
+    pic_param = (VAPictureParameterBufferMPEG2 *)decode_state->pic_param->buffer;
+    width_in_mbs = ALIGN(pic_param->horizontal_size, 16) / 16;
+
+    mpeg2_set_reference_surfaces(ctx,
+                                 gen7_mfd_context->reference_surface,
+                                 decode_state,
+                                 pic_param);
+
+    /* the picture being decoded */
+    render_target = decode_state->render_object;
+    i965_check_alloc_surface_bo(ctx, render_target, 1, VA_FOURCC('N','V','1','2'), SUBSAMPLE_YUV420);
+
+    dri_bo_unreference(gen7_mfd_context->pre_deblocking_output.bo);
+    gen7_mfd_context->pre_deblocking_output.bo = render_target->bo;
+    dri_bo_reference(gen7_mfd_context->pre_deblocking_output.bo);
+    gen7_mfd_context->pre_deblocking_output.valid = 1;
+
+    /* the only scratch buffer set up for MPEG-2 */
+    dri_bo_unreference(gen7_mfd_context->bsd_mpc_row_store_scratch_buffer.bo);
+    gen7_mfd_context->bsd_mpc_row_store_scratch_buffer.bo =
+        dri_bo_alloc(i965->intel.bufmgr,
+                     "bsd mpc row store",
+                     width_in_mbs * 96,
+                     0x1000);
+    assert(gen7_mfd_context->bsd_mpc_row_store_scratch_buffer.bo);
+    gen7_mfd_context->bsd_mpc_row_store_scratch_buffer.valid = 1;
+
+    /* everything else is left unused for this codec */
+    gen7_mfd_context->post_deblocking_output.valid = 0;
+    gen7_mfd_context->intra_row_store_scratch_buffer.valid = 0;
+    gen7_mfd_context->deblocking_filter_row_store_scratch_buffer.valid = 0;
+    gen7_mfd_context->mpr_row_store_scratch_buffer.valid = 0;
+    gen7_mfd_context->bitplane_read_buffer.valid = 0;
+}
+
+/*
+ * Emit MFX_MPEG2_PIC_STATE (13 dwords).
+ *
+ * Dword 1 packs the four f_code nibbles and the picture coding extension
+ * flags, dword 2 the picture coding type, dword 3 the picture size in
+ * macroblocks minus one together with the slice concealment disable bit
+ * (always set here).  The remaining nine dwords are zero.
+ */
+static void
+gen8_mfd_mpeg2_pic_state(VADriverContextP ctx,
+                         struct decode_state *decode_state,
+                         struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    VAPictureParameterBufferMPEG2 *pic_param;
+    unsigned int slice_concealment_disable_bit = 1;
+    int f_code_00, f_code_01, f_code_10, f_code_11;
+    int mb_rows_minus1, mb_cols_minus1;
+    int pad;
+
+    assert(decode_state->pic_param && decode_state->pic_param->buffer);
+    pic_param = (VAPictureParameterBufferMPEG2 *)decode_state->pic_param->buffer;
+
+    /* f_code holds four 4-bit values, f_code[0][0] in the topmost nibble */
+    f_code_00 = (pic_param->f_code >> 12) & 0xf;
+    f_code_01 = (pic_param->f_code >> 8) & 0xf;
+    f_code_10 = (pic_param->f_code >> 4) & 0xf;
+    f_code_11 = pic_param->f_code & 0xf;
+
+    mb_rows_minus1 = (ALIGN(pic_param->vertical_size, 16) / 16) - 1;
+    mb_cols_minus1 = (ALIGN(pic_param->horizontal_size, 16) / 16) - 1;
+
+    BEGIN_BCS_BATCH(batch, 13);
+    OUT_BCS_BATCH(batch, MFX_MPEG2_PIC_STATE | (13 - 2));
+    OUT_BCS_BATCH(batch,
+                  f_code_11 << 28 | /* f_code[1][1] */
+                  f_code_10 << 24 | /* f_code[1][0] */
+                  f_code_01 << 20 | /* f_code[0][1] */
+                  f_code_00 << 16 | /* f_code[0][0] */
+                  pic_param->picture_coding_extension.bits.intra_dc_precision << 14 |
+                  pic_param->picture_coding_extension.bits.picture_structure << 12 |
+                  pic_param->picture_coding_extension.bits.top_field_first << 11 |
+                  pic_param->picture_coding_extension.bits.frame_pred_frame_dct << 10 |
+                  pic_param->picture_coding_extension.bits.concealment_motion_vectors << 9 |
+                  pic_param->picture_coding_extension.bits.q_scale_type << 8 |
+                  pic_param->picture_coding_extension.bits.intra_vlc_format << 7 |
+                  pic_param->picture_coding_extension.bits.alternate_scan << 6);
+    OUT_BCS_BATCH(batch,
+                  pic_param->picture_coding_type << 9);
+    OUT_BCS_BATCH(batch,
+                  (slice_concealment_disable_bit << 31) |
+                  mb_rows_minus1 << 16 |
+                  mb_cols_minus1);
+
+    /* dwords 4..12 are zero */
+    for (pad = 4; pad < 13; pad++)
+        OUT_BCS_BATCH(batch, 0);
+
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Keep the context's cached MPEG-2 quantiser matrices up to date and
+ * program them into the hardware.
+ *
+ * When the application supplied an IQ matrix buffer, each cached matrix is
+ * refreshed if its cached load flag is still -1 or the buffer asks for a
+ * load; the coefficients are stored at the positions given by
+ * zigzag_direct[].  Afterwards every matrix whose cached load flag is
+ * non-zero is sent with gen8_mfd_qm_state(), intra first.
+ */
+static void
+gen8_mfd_mpeg2_qm_state(VADriverContextP ctx,
+                        struct decode_state *decode_state,
+                        struct gen7_mfd_context *gen7_mfd_context)
+{
+    VAIQMatrixBufferMPEG2 * const cached = &gen7_mfd_context->iq_matrix.mpeg2;
+    int k;
+
+    /* Step 1: merge the application's matrices into the cached copy */
+    if (decode_state->iq_matrix && decode_state->iq_matrix->buffer) {
+        VAIQMatrixBufferMPEG2 * const incoming =
+            (VAIQMatrixBufferMPEG2 *)decode_state->iq_matrix->buffer;
+
+        if (cached->load_intra_quantiser_matrix == -1 ||
+            incoming->load_intra_quantiser_matrix) {
+            cached->load_intra_quantiser_matrix =
+                incoming->load_intra_quantiser_matrix;
+
+            if (incoming->load_intra_quantiser_matrix) {
+                for (k = 0; k < 64; k++) {
+                    cached->intra_quantiser_matrix[zigzag_direct[k]] =
+                        incoming->intra_quantiser_matrix[k];
+                }
+            }
+        }
+
+        if (cached->load_non_intra_quantiser_matrix == -1 ||
+            incoming->load_non_intra_quantiser_matrix) {
+            cached->load_non_intra_quantiser_matrix =
+                incoming->load_non_intra_quantiser_matrix;
+
+            if (incoming->load_non_intra_quantiser_matrix) {
+                for (k = 0; k < 64; k++) {
+                    cached->non_intra_quantiser_matrix[zigzag_direct[k]] =
+                        incoming->non_intra_quantiser_matrix[k];
+                }
+            }
+        }
+    }
+
+    /* Step 2: send whichever cached matrices are flagged for loading */
+    if (cached->load_intra_quantiser_matrix)
+        gen8_mfd_qm_state(ctx, MFX_QM_MPEG_INTRA_QUANTIZER_MATRIX,
+                          cached->intra_quantiser_matrix, 64, gen7_mfd_context);
+
+    if (cached->load_non_intra_quantiser_matrix)
+        gen8_mfd_qm_state(ctx, MFX_QM_MPEG_NON_INTRA_QUANTIZER_MATRIX,
+                          cached->non_intra_quantiser_matrix, 64, gen7_mfd_context);
+}
+
+/*
+ * Emit MFD_MPEG2_BSD_OBJECT (5 dwords) for one slice.
+ *
+ * The macroblock count is the distance, in macroblocks, between the start
+ * of this slice and the start of the next one; for the last slice
+ * (next_slice_param == NULL) the end position is column 0 of the row just
+ * past the picture (half the rows for field pictures).  Slice vertical
+ * positions are halved only for field pictures when the context's
+ * wa_mpeg2_slice_vertical_position workaround value is positive.
+ */
+static void
+gen8_mfd_mpeg2_bsd_object(VADriverContextP ctx,
+                          VAPictureParameterBufferMPEG2 *pic_param,
+                          VASliceParameterBufferMPEG2 *slice_param,
+                          VASliceParameterBufferMPEG2 *next_slice_param,
+                          struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    unsigned int width_in_mbs = ALIGN(pic_param->horizontal_size, 16) / 16;
+    const int is_last_slice = (next_slice_param == NULL);
+    int is_field_pic, is_field_pic_wa;
+    int start_row, start_col, end_row, end_col;
+    int mb_count;
+
+    is_field_pic =
+        (pic_param->picture_coding_extension.bits.picture_structure == MPEG_TOP_FIELD ||
+         pic_param->picture_coding_extension.bits.picture_structure == MPEG_BOTTOM_FIELD);
+    is_field_pic_wa = is_field_pic &&
+        gen7_mfd_context->wa_mpeg2_slice_vertical_position > 0;
+
+    start_row = slice_param->slice_vertical_position / (1 + is_field_pic_wa);
+    start_col = slice_param->slice_horizontal_position;
+
+    if (is_last_slice) {
+        /* runs to the end of the picture (or field) */
+        end_row = ALIGN(pic_param->vertical_size, 16) / 16 / (1 + is_field_pic);
+        end_col = 0;
+    } else {
+        end_row = next_slice_param->slice_vertical_position / (1 + is_field_pic_wa);
+        end_col = next_slice_param->slice_horizontal_position;
+    }
+
+    mb_count = (end_row * width_in_mbs + end_col) - (start_row * width_in_mbs + start_col);
+
+    BEGIN_BCS_BATCH(batch, 5);
+    OUT_BCS_BATCH(batch, MFD_MPEG2_BSD_OBJECT | (5 - 2));
+    /* size and offset skip the whole bytes that precede the first macroblock */
+    OUT_BCS_BATCH(batch,
+                  slice_param->slice_data_size - (slice_param->macroblock_offset >> 3));
+    OUT_BCS_BATCH(batch,
+                  slice_param->slice_data_offset + (slice_param->macroblock_offset >> 3));
+    OUT_BCS_BATCH(batch,
+                  start_col << 24 |
+                  start_row << 16 |
+                  mb_count << 8 |
+                  is_last_slice << 5 |
+                  is_last_slice << 3 |
+                  (slice_param->macroblock_offset & 0x7));
+    OUT_BCS_BATCH(batch,
+                  (slice_param->quantiser_scale_code << 24) |
+                  (end_row << 8 | end_col));
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Build and submit the BCS batch that decodes one MPEG-2 picture.
+ *
+ * Picture-level state is emitted once.  The slice vertical position
+ * workaround is evaluated only while the context's cached value is still
+ * negative.  Then one BSD object is emitted per slice; each slice is told
+ * about its successor (next element of the same buffer, else the first
+ * element of the next buffer, else NULL for the very last slice).
+ */
+static void
+gen8_mfd_mpeg2_decode_picture(VADriverContextP ctx,
+                              struct decode_state *decode_state,
+                              struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    VAPictureParameterBufferMPEG2 *pic_param;
+    VASliceParameterBufferMPEG2 *cur_slice, *following_slice, *next_buffer_first_slice;
+    dri_bo *slice_data_bo;
+    int buf_idx, elem;
+
+    assert(decode_state->pic_param && decode_state->pic_param->buffer);
+    pic_param = (VAPictureParameterBufferMPEG2 *)decode_state->pic_param->buffer;
+
+    gen8_mfd_mpeg2_decode_init(ctx, decode_state, gen7_mfd_context);
+
+    /* picture-level state */
+    intel_batchbuffer_start_atomic_bcs(batch, 0x1000);
+    intel_batchbuffer_emit_mi_flush(batch);
+    gen8_mfd_pipe_mode_select(ctx, decode_state, MFX_FORMAT_MPEG2, gen7_mfd_context);
+    gen8_mfd_surface_state(ctx, decode_state, MFX_FORMAT_MPEG2, gen7_mfd_context);
+    gen8_mfd_pipe_buf_addr_state(ctx, decode_state, MFX_FORMAT_MPEG2, gen7_mfd_context);
+    gen8_mfd_bsp_buf_base_addr_state(ctx, decode_state, MFX_FORMAT_MPEG2, gen7_mfd_context);
+    gen8_mfd_mpeg2_pic_state(ctx, decode_state, gen7_mfd_context);
+    gen8_mfd_mpeg2_qm_state(ctx, decode_state, gen7_mfd_context);
+
+    /* a negative value means the workaround has not been evaluated yet */
+    if (gen7_mfd_context->wa_mpeg2_slice_vertical_position < 0) {
+        gen7_mfd_context->wa_mpeg2_slice_vertical_position =
+            mpeg2_wa_slice_vertical_position(decode_state, pic_param);
+    }
+
+    for (buf_idx = 0; buf_idx < decode_state->num_slice_params; buf_idx++) {
+        assert(decode_state->slice_params && decode_state->slice_params[buf_idx]->buffer);
+        cur_slice = (VASliceParameterBufferMPEG2 *)decode_state->slice_params[buf_idx]->buffer;
+        slice_data_bo = decode_state->slice_datas[buf_idx]->bo;
+
+        /* the bitstream of this buffer becomes the indirect object base */
+        gen8_mfd_ind_obj_base_addr_state(ctx, slice_data_bo, MFX_FORMAT_MPEG2, gen7_mfd_context);
+
+        next_buffer_first_slice = NULL;
+        if (buf_idx != decode_state->num_slice_params - 1)
+            next_buffer_first_slice =
+                (VASliceParameterBufferMPEG2 *)decode_state->slice_params[buf_idx + 1]->buffer;
+
+        for (elem = 0; elem < decode_state->slice_params[buf_idx]->num_elements; elem++, cur_slice++) {
+            assert(cur_slice->slice_data_flag == VA_SLICE_DATA_FLAG_ALL);
+
+            following_slice = (elem < decode_state->slice_params[buf_idx]->num_elements - 1) ?
+                cur_slice + 1 : next_buffer_first_slice;
+
+            gen8_mfd_mpeg2_bsd_object(ctx, pic_param, cur_slice, following_slice, gen7_mfd_context);
+        }
+    }
+
+    intel_batchbuffer_end_atomic(batch);
+    intel_batchbuffer_flush(batch);
+}
+
+static const int va_to_gen7_vc1_pic_type[5] = { /* presumably indexed by the VA-API VC-1 picture type; lookup site not in this section */
+    GEN7_VC1_I_PICTURE,
+    GEN7_VC1_P_PICTURE,
+    GEN7_VC1_B_PICTURE,
+    GEN7_VC1_BI_PICTURE,
+    GEN7_VC1_P_PICTURE, /* the fifth entry reuses the P picture code */
+};
+
+static const int va_to_gen7_vc1_mv[4] = { /* presumably indexed by the VA-API motion vector mode -- TODO confirm at the use site */
+    1, /* 1-MV */
+    2, /* 1-MV half-pel */
+    3, /* 1-MV half-pel bilinear */
+    0, /* Mixed MV */
+};
+
+static const int b_picture_scale_factor[21] = { /* 21 scale factors; presumably indexed by the B-picture fraction code -- TODO confirm */
+    128, 85,  170, 64,  192,
+    51,  102, 153, 204, 43,
+    215, 37,  74,  111, 148,
+    185, 222, 32,  96,  160, 
+    224,
+};
+
+static const int va_to_gen7_vc1_condover[3] = { /* presumably maps the VA-API conditional overlap value to the hardware code -- TODO confirm */
+    0,
+    2,
+    3
+};
+
+static const int va_to_gen7_vc1_profile[4] = { /* presumably indexed by the VA-API VC-1 profile value; index 2 maps to the reserved code */
+    GEN7_VC1_SIMPLE_PROFILE,
+    GEN7_VC1_MAIN_PROFILE,
+    GEN7_VC1_RESERVED_PROFILE,
+    GEN7_VC1_ADVANCED_PROFILE
+};
+
+/*
+ * Release the VC-1 private data referenced by *data: drops the reference
+ * on its dmv buffer object, frees the structure and resets *data to NULL.
+ * Does nothing when *data is already NULL.  Installed as a surface's
+ * free_private_data hook by gen8_mfd_init_vc1_surface().
+ */
+static void
+gen8_mfd_free_vc1_surface(void **data)
+{
+    struct gen7_vc1_surface *vc1_priv = *data;
+
+    if (vc1_priv != NULL) {
+        dri_bo_unreference(vc1_priv->dmv);
+        free(vc1_priv);
+        *data = NULL;
+    }
+}
+
+/*
+ * Attach (or refresh) the VC-1 specific private data of a surface.
+ *
+ * Installs gen8_mfd_free_vc1_surface as the private-data destructor,
+ * allocates the zero-initialized private structure on first use, records
+ * the picture type of the picture being decoded, and allocates the
+ * direct-MV buffer (64 bytes per macroblock) if the surface does not
+ * have one yet.
+ *
+ * If the private structure cannot be allocated the function returns
+ * early and leaves obj_surface->private_data as NULL.
+ */
+static void
+gen8_mfd_init_vc1_surface(VADriverContextP ctx,
+                          VAPictureParameterBufferVC1 *pic_param,
+                          struct object_surface *obj_surface)
+{
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    struct gen7_vc1_surface *gen7_vc1_surface = obj_surface->private_data;
+    int width_in_mbs = ALIGN(pic_param->coded_width, 16) / 16;
+    int height_in_mbs = ALIGN(pic_param->coded_height, 16) / 16;
+
+    obj_surface->free_private_data = gen8_mfd_free_vc1_surface;
+
+    if (!gen7_vc1_surface) {
+        gen7_vc1_surface = calloc(1, sizeof(*gen7_vc1_surface));
+
+        /* Out of memory: do not dereference a NULL private structure. */
+        if (!gen7_vc1_surface)
+            return;
+
+        assert((obj_surface->size & 0x3f) == 0);
+        obj_surface->private_data = gen7_vc1_surface;
+    }
+
+    gen7_vc1_surface->picture_type = pic_param->picture_fields.bits.picture_type;
+
+    if (gen7_vc1_surface->dmv == NULL) {
+        gen7_vc1_surface->dmv = dri_bo_alloc(i965->intel.bufmgr,
+                                             "direct mv w/r buffer",
+                                             width_in_mbs * height_in_mbs * 64,
+                                             0x1000);
+    }
+}
+
+/*
+ * Set up the per-picture buffers used to decode one VC-1 picture.
+ *
+ * - updates the reference frame store from the picture parameters;
+ * - makes sure the render target has an NV12 buffer and VC-1 private data;
+ * - points both the pre- and post-deblocking outputs at the render
+ *   target, marking exactly one of them valid depending on the
+ *   loopfilter flag;
+ * - (re)allocates the intra, deblocking-filter and BSD/MPC row-store
+ *   scratch buffers, sized from the picture width in macroblocks;
+ * - when any bitplane is present, repacks the application's bitplane
+ *   buffer into a freshly allocated buffer object.
+ */
+static void
+gen8_mfd_vc1_decode_init(VADriverContextP ctx,
+                         struct decode_state *decode_state,
+                         struct gen7_mfd_context *gen7_mfd_context)
+{
+    VAPictureParameterBufferVC1 *pic_param;
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    struct object_surface *obj_surface;
+    dri_bo *bo;
+    int width_in_mbs;
+    int picture_type;
+
+    assert(decode_state->pic_param && decode_state->pic_param->buffer);
+    pic_param = (VAPictureParameterBufferVC1 *)decode_state->pic_param->buffer;
+    width_in_mbs = ALIGN(pic_param->coded_width, 16) / 16;
+    picture_type = pic_param->picture_fields.bits.picture_type;
+    intel_update_vc1_frame_store_index(ctx,
+                                       decode_state,
+                                       pic_param,
+                                       gen7_mfd_context->reference_surface);
+
+    /* Current decoded picture */
+    obj_surface = decode_state->render_object;
+    i965_check_alloc_surface_bo(ctx, obj_surface, 1, VA_FOURCC('N','V','1','2'), SUBSAMPLE_YUV420);
+    gen8_mfd_init_vc1_surface(ctx, pic_param, obj_surface);
+
+    /* Both outputs reference the render target; only one is flagged valid. */
+    dri_bo_unreference(gen7_mfd_context->post_deblocking_output.bo);
+    gen7_mfd_context->post_deblocking_output.bo = obj_surface->bo;
+    dri_bo_reference(gen7_mfd_context->post_deblocking_output.bo);
+    gen7_mfd_context->post_deblocking_output.valid = pic_param->entrypoint_fields.bits.loopfilter;
+
+    dri_bo_unreference(gen7_mfd_context->pre_deblocking_output.bo);
+    gen7_mfd_context->pre_deblocking_output.bo = obj_surface->bo;
+    dri_bo_reference(gen7_mfd_context->pre_deblocking_output.bo);
+    gen7_mfd_context->pre_deblocking_output.valid = !pic_param->entrypoint_fields.bits.loopfilter;
+
+    dri_bo_unreference(gen7_mfd_context->intra_row_store_scratch_buffer.bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr,
+                      "intra row store",
+                      width_in_mbs * 64,
+                      0x1000);
+    assert(bo);
+    gen7_mfd_context->intra_row_store_scratch_buffer.bo = bo;
+    gen7_mfd_context->intra_row_store_scratch_buffer.valid = 1;
+
+    dri_bo_unreference(gen7_mfd_context->deblocking_filter_row_store_scratch_buffer.bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr,
+                      "deblocking filter row store",
+                      width_in_mbs * 6 * 64,
+                      0x1000);
+    assert(bo);
+    gen7_mfd_context->deblocking_filter_row_store_scratch_buffer.bo = bo;
+    gen7_mfd_context->deblocking_filter_row_store_scratch_buffer.valid = 1;
+
+    dri_bo_unreference(gen7_mfd_context->bsd_mpc_row_store_scratch_buffer.bo);
+    bo = dri_bo_alloc(i965->intel.bufmgr,
+                      "bsd mpc row store",
+                      width_in_mbs * 96,
+                      0x1000);
+    assert(bo);
+    gen7_mfd_context->bsd_mpc_row_store_scratch_buffer.bo = bo;
+    gen7_mfd_context->bsd_mpc_row_store_scratch_buffer.valid = 1;
+
+    gen7_mfd_context->mpr_row_store_scratch_buffer.valid = 0;
+
+    gen7_mfd_context->bitplane_read_buffer.valid = !!pic_param->bitplane_present.value;
+    dri_bo_unreference(gen7_mfd_context->bitplane_read_buffer.bo);
+
+    if (gen7_mfd_context->bitplane_read_buffer.valid) {
+        int height_in_mbs = ALIGN(pic_param->coded_height, 16) / 16;
+        /* Two 4-bit macroblock entries per destination byte. */
+        int bitplane_width = ALIGN(width_in_mbs, 2) / 2;
+        int src_w, src_h;
+        uint8_t *src = NULL, *dst = NULL;
+
+        /* Check the buffer store itself before looking inside it. */
+        assert(decode_state->bit_plane && decode_state->bit_plane->buffer);
+        src = decode_state->bit_plane->buffer;
+
+        bo = dri_bo_alloc(i965->intel.bufmgr,
+                          "VC-1 Bitplane",
+                          bitplane_width * height_in_mbs,
+                          0x1000);
+        assert(bo);
+        gen7_mfd_context->bitplane_read_buffer.bo = bo;
+
+        dri_bo_map(bo, True);
+        assert(bo->virtual);
+        dst = bo->virtual;
+
+        /*
+         * Source layout: one nibble per macroblock, packed contiguously
+         * across rows, even macroblock index in the high nibble.
+         * Destination layout: each row starts on a byte boundary
+         * (bitplane_width bytes per row), even column in the low nibble.
+         */
+        for (src_h = 0; src_h < height_in_mbs; src_h++) {
+            for(src_w = 0; src_w < width_in_mbs; src_w++) {
+                int src_index, dst_index;
+                int src_shift;
+                uint8_t src_value;
+
+                src_index = (src_h * width_in_mbs + src_w) / 2;
+                src_shift = !((src_h * width_in_mbs + src_w) & 1) * 4;
+                src_value = ((src[src_index] >> src_shift) & 0xf);
+
+                if (picture_type == GEN7_VC1_SKIPPED_PICTURE){
+                    src_value |= 0x2;
+                }
+
+                /*
+                 * Shift the previous content down and insert the new
+                 * nibble on top; after two writes the byte holds
+                 * (odd column << 4) | even column.
+                 */
+                dst_index = src_w / 2;
+                dst[dst_index] = ((dst[dst_index] >> 4) | (src_value << 4));
+            }
+
+            /* Odd row width: move the lone last nibble to the low half. */
+            if (src_w & 1)
+                dst[src_w / 2] >>= 4;
+
+            dst += bitplane_width;
+        }
+
+        dri_bo_unmap(bo);
+    } else
+        gen7_mfd_context->bitplane_read_buffer.bo = NULL;
+}
+
+/*
+ * Emit the 6-dword MFD_VC1_LONG_PIC_STATE command for the current
+ * picture, derived from the VA VC-1 picture parameters.
+ *
+ * Side effect: for pictures that are neither I nor BI, when
+ * variable-sized transform coding is disabled the transform fields of
+ * the picture parameter buffer are overwritten so that the 8x8
+ * transform is selected.
+ *
+ * Interlaced frame/field coding is not supported and trips an assert.
+ */
+static void
+gen8_mfd_vc1_pic_state(VADriverContextP ctx,
+                       struct decode_state *decode_state,
+                       struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    VAPictureParameterBufferVC1 *pic_param;
+    struct object_surface *obj_surface;
+    int alt_pquant_config = 0, alt_pquant_edge_mask = 0, alt_pq;
+    int dquant, dquantfrm, dqprofile, dqdbedge, dqsbedge, dqbilevel;
+    int unified_mv_mode;
+    int ref_field_pic_polarity = 0;
+    int scale_factor = 0;
+    int trans_ac_y = 0;
+    int dmv_surface_valid = 0;
+    int brfd = 0;
+    int fcm = 0;
+    int picture_type;
+    int profile;
+    int overlap;
+    int interpolation_mode = 0;
+
+    assert(decode_state->pic_param && decode_state->pic_param->buffer);
+    pic_param = (VAPictureParameterBufferVC1 *)decode_state->pic_param->buffer;
+
+    profile = va_to_gen7_vc1_profile[pic_param->sequence_fields.bits.profile];
+    dquant = pic_param->pic_quantizer_fields.bits.dquant;
+    dquantfrm = pic_param->pic_quantizer_fields.bits.dq_frame;
+    dqprofile = pic_param->pic_quantizer_fields.bits.dq_profile;
+    dqdbedge = pic_param->pic_quantizer_fields.bits.dq_db_edge;
+    dqsbedge = pic_param->pic_quantizer_fields.bits.dq_sb_edge;
+    dqbilevel = pic_param->pic_quantizer_fields.bits.dq_binary_level;
+    alt_pq = pic_param->pic_quantizer_fields.bits.alt_pic_quantizer;
+
+    /* Derive the alternative quantizer configuration and edge mask. */
+    if (dquant == 0) {
+        alt_pquant_config = 0;
+        alt_pquant_edge_mask = 0;
+    } else if (dquant == 2) {
+        alt_pquant_config = 1;
+        alt_pquant_edge_mask = 0xf;
+    } else {
+        assert(dquant == 1);
+        if (dquantfrm == 0) {
+            alt_pquant_config = 0;
+            alt_pquant_edge_mask = 0;
+            alt_pq = 0;
+        } else {
+            assert(dquantfrm == 1);
+            alt_pquant_config = 1;
+
+            switch (dqprofile) {
+            case 3:
+                if (dqbilevel == 0) {
+                    alt_pquant_config = 2;
+                    alt_pquant_edge_mask = 0;
+                } else {
+                    assert(dqbilevel == 1);
+                    alt_pquant_config = 3;
+                    alt_pquant_edge_mask = 0;
+                }
+                break;
+
+            case 0:
+                alt_pquant_edge_mask = 0xf;
+                break;
+
+            case 1:
+                if (dqdbedge == 3)
+                    alt_pquant_edge_mask = 0x9;
+                else
+                    alt_pquant_edge_mask = (0x3 << dqdbedge);
+
+                break;
+
+            case 2:
+                alt_pquant_edge_mask = (0x1 << dqsbedge);
+                break;
+
+            default:
+                assert(0);
+            }
+        }
+    }
+
+    /* With intensity compensation the real MV mode lives in mv_mode2. */
+    if (pic_param->mv_fields.bits.mv_mode == VAMvModeIntensityCompensation) {
+        assert(pic_param->mv_fields.bits.mv_mode2 < 4);
+        unified_mv_mode = va_to_gen7_vc1_mv[pic_param->mv_fields.bits.mv_mode2];
+    } else {
+        assert(pic_param->mv_fields.bits.mv_mode < 4);
+        unified_mv_mode = va_to_gen7_vc1_mv[pic_param->mv_fields.bits.mv_mode];
+    }
+
+    if (pic_param->sequence_fields.bits.interlace == 1 &&
+        pic_param->picture_fields.bits.frame_coding_mode != 0) { /* frame-interlace or field-interlace */
+        /* FIXME: calculate reference field picture polarity */
+        assert(0);
+        ref_field_pic_polarity = 0;
+    }
+
+    if (pic_param->b_picture_fraction < 21)
+        scale_factor = b_picture_scale_factor[pic_param->b_picture_fraction];
+
+    /* Validate the index before it is used for the table lookup. */
+    assert(pic_param->picture_fields.bits.picture_type < 5);
+    picture_type = va_to_gen7_vc1_pic_type[pic_param->picture_fields.bits.picture_type];
+
+    if (profile == GEN7_VC1_ADVANCED_PROFILE &&
+        picture_type == GEN7_VC1_I_PICTURE)
+        picture_type = GEN7_VC1_BI_PICTURE;
+
+    if (picture_type == GEN7_VC1_I_PICTURE || picture_type == GEN7_VC1_BI_PICTURE) /* I picture */
+        trans_ac_y = pic_param->transform_fields.bits.transform_ac_codingset_idx2;
+    else {
+        trans_ac_y = pic_param->transform_fields.bits.transform_ac_codingset_idx1;
+
+        /*
+         * 8.3.6.2.1 Transform Type Selection
+         * If variable-sized transform coding is not enabled,
+         * then the 8x8 transform shall be used for all blocks.
+         * it is also MFX_VC1_PIC_STATE requirement.
+         */
+        if (pic_param->transform_fields.bits.variable_sized_transform_flag == 0) {
+            pic_param->transform_fields.bits.mb_level_transform_type_flag   = 1;
+            pic_param->transform_fields.bits.frame_level_transform_type     = 0;
+        }
+    }
+
+    /*
+     * For B pictures the direct-MV surface is only usable when the
+     * second reference has VC-1 private data and was not an I/BI picture.
+     */
+    if (picture_type == GEN7_VC1_B_PICTURE) {
+        struct gen7_vc1_surface *gen7_vc1_surface = NULL;
+
+        obj_surface = decode_state->reference_objects[1];
+
+        if (obj_surface)
+            gen7_vc1_surface = obj_surface->private_data;
+
+        if (!gen7_vc1_surface ||
+            (va_to_gen7_vc1_pic_type[gen7_vc1_surface->picture_type] == GEN7_VC1_I_PICTURE ||
+             va_to_gen7_vc1_pic_type[gen7_vc1_surface->picture_type] == GEN7_VC1_BI_PICTURE))
+            dmv_surface_valid = 0;
+        else
+            dmv_surface_valid = 1;
+    }
+
+    assert(pic_param->picture_fields.bits.frame_coding_mode < 3);
+
+    if (pic_param->picture_fields.bits.frame_coding_mode < 2)
+        fcm = pic_param->picture_fields.bits.frame_coding_mode;
+    else {
+        if (pic_param->picture_fields.bits.top_field_first)
+            fcm = 2;
+        else
+            fcm = 3;
+    }
+
+    /*
+     * NOTE(review): brfd is computed here but is not written to the
+     * batch below, which emits reference_distance instead.
+     */
+    if (pic_param->picture_fields.bits.picture_type == GEN7_VC1_B_PICTURE) { /* B picture */
+        brfd = pic_param->reference_fields.bits.reference_distance;
+        brfd = (scale_factor * brfd) >> 8;
+        brfd = pic_param->reference_fields.bits.reference_distance - brfd - 1;
+
+        if (brfd < 0)
+            brfd = 0;
+    }
+
+    /*
+     * conditional_overlap_flag indexes va_to_gen7_vc1_condover[] both in
+     * the overlap decision below and in the batch, so check it first.
+     */
+    assert(pic_param->conditional_overlap_flag < 3);
+    assert(pic_param->mv_fields.bits.mv_table < 4); /* FIXME: interlace mode */
+
+    overlap = 0;
+    if (profile != GEN7_VC1_ADVANCED_PROFILE){
+        if (pic_param->pic_quantizer_fields.bits.pic_quantizer_scale >= 9 &&
+            pic_param->picture_fields.bits.picture_type != GEN7_VC1_B_PICTURE) {
+            overlap = 1;
+        }
+    }else {
+        if (pic_param->picture_fields.bits.picture_type == GEN7_VC1_P_PICTURE &&
+             pic_param->pic_quantizer_fields.bits.pic_quantizer_scale >= 9){
+              overlap = 1;
+        }
+        if (pic_param->picture_fields.bits.picture_type == GEN7_VC1_I_PICTURE ||
+            pic_param->picture_fields.bits.picture_type == GEN7_VC1_BI_PICTURE){
+             if (pic_param->pic_quantizer_fields.bits.pic_quantizer_scale >= 9){
+                overlap = 1;
+             } else if (va_to_gen7_vc1_condover[pic_param->conditional_overlap_flag] == 2 ||
+                        va_to_gen7_vc1_condover[pic_param->conditional_overlap_flag] == 3) {
+                 overlap = 1;
+             }
+        }
+    }
+
+    if (pic_param->mv_fields.bits.mv_mode == VAMvMode1MvHalfPelBilinear ||
+        (pic_param->mv_fields.bits.mv_mode == VAMvModeIntensityCompensation &&
+         pic_param->mv_fields.bits.mv_mode2 == VAMvMode1MvHalfPelBilinear))
+        interpolation_mode = 9; /* Half-pel bilinear */
+    else if (pic_param->mv_fields.bits.mv_mode == VAMvMode1MvHalfPel ||
+             (pic_param->mv_fields.bits.mv_mode == VAMvModeIntensityCompensation &&
+              pic_param->mv_fields.bits.mv_mode2 == VAMvMode1MvHalfPel))
+        interpolation_mode = 1; /* Half-pel bicubic */
+    else
+        interpolation_mode = 0; /* Quarter-pel bicubic */
+
+    BEGIN_BCS_BATCH(batch, 6);
+    OUT_BCS_BATCH(batch, MFD_VC1_LONG_PIC_STATE | (6 - 2));
+    OUT_BCS_BATCH(batch,
+                  (((ALIGN(pic_param->coded_height, 16) / 16) - 1) << 16) |
+                  ((ALIGN(pic_param->coded_width, 16) / 16) - 1));
+    OUT_BCS_BATCH(batch,
+                  ((ALIGN(pic_param->coded_width, 16) / 16 + 1) / 2 - 1) << 24 |
+                  dmv_surface_valid << 15 |
+                  (pic_param->pic_quantizer_fields.bits.quantizer == 0) << 14 | /* implicit quantizer */
+                  pic_param->rounding_control << 13 |
+                  pic_param->sequence_fields.bits.syncmarker << 12 |
+                  interpolation_mode << 8 |
+                  0 << 7 | /* FIXME: scale up or down ??? */
+                  pic_param->range_reduction_frame << 6 |
+                  pic_param->entrypoint_fields.bits.loopfilter << 5 |
+                  overlap << 4 |
+                  !pic_param->picture_fields.bits.is_first_field << 3 |
+                  (pic_param->sequence_fields.bits.profile == 3) << 0);
+    OUT_BCS_BATCH(batch,
+                  va_to_gen7_vc1_condover[pic_param->conditional_overlap_flag] << 29 |
+                  picture_type << 26 |
+                  fcm << 24 |
+                  alt_pq << 16 |
+                  pic_param->pic_quantizer_fields.bits.pic_quantizer_scale << 8 |
+                  scale_factor << 0);
+    OUT_BCS_BATCH(batch,
+                  unified_mv_mode << 28 |
+                  pic_param->mv_fields.bits.four_mv_switch << 27 |
+                  pic_param->fast_uvmc_flag << 26 |
+                  ref_field_pic_polarity << 25 |
+                  pic_param->reference_fields.bits.num_reference_pictures << 24 |
+                  pic_param->reference_fields.bits.reference_distance << 20 |
+                  pic_param->reference_fields.bits.reference_distance << 16 | /* FIXME: ??? */
+                  pic_param->mv_fields.bits.extended_dmv_range << 10 |
+                  pic_param->mv_fields.bits.extended_mv_range << 8 |
+                  alt_pquant_edge_mask << 4 |
+                  alt_pquant_config << 2 |
+                  pic_param->pic_quantizer_fields.bits.half_qp << 1 |
+                  pic_param->pic_quantizer_fields.bits.pic_quantizer_type << 0);
+    OUT_BCS_BATCH(batch,
+                  !!pic_param->bitplane_present.value << 31 |
+                  !pic_param->bitplane_present.flags.bp_forward_mb << 30 |
+                  !pic_param->bitplane_present.flags.bp_mv_type_mb << 29 |
+                  !pic_param->bitplane_present.flags.bp_skip_mb << 28 |
+                  !pic_param->bitplane_present.flags.bp_direct_mb << 27 |
+                  !pic_param->bitplane_present.flags.bp_overflags << 26 |
+                  !pic_param->bitplane_present.flags.bp_ac_pred << 25 |
+                  !pic_param->bitplane_present.flags.bp_field_tx << 24 |
+                  pic_param->mv_fields.bits.mv_table << 20 |
+                  pic_param->mv_fields.bits.four_mv_block_pattern_table << 18 |
+                  pic_param->mv_fields.bits.two_mv_block_pattern_table << 16 |
+                  pic_param->transform_fields.bits.frame_level_transform_type << 12 |
+                  pic_param->transform_fields.bits.mb_level_transform_type_flag << 11 |
+                  pic_param->mb_mode_table << 8 |
+                  trans_ac_y << 6 |
+                  pic_param->transform_fields.bits.transform_ac_codingset_idx1 << 4 |
+                  pic_param->transform_fields.bits.intra_transform_dc_table << 3 |
+                  pic_param->cbp_table << 0);
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Emit the 6-dword MFX_VC1_PRED_PIPE_STATE command. The intensity
+ * compensation bits are set when mv_mode is
+ * VAMvModeIntensityCompensation, and the luma shift/scale values are
+ * taken directly from the picture parameters; the remaining dwords are
+ * written as zero.
+ */
+static void
+gen8_mfd_vc1_pred_pipe_state(VADriverContextP ctx,
+                             struct decode_state *decode_state,
+                             struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    VAPictureParameterBufferVC1 *pic_param;
+    int intensitycomp_single;
+
+    assert(decode_state->pic_param && decode_state->pic_param->buffer);
+    pic_param = (VAPictureParameterBufferVC1 *)decode_state->pic_param->buffer;
+    intensitycomp_single = (pic_param->mv_fields.bits.mv_mode == VAMvModeIntensityCompensation);
+
+    BEGIN_BCS_BATCH(batch, 6);
+    OUT_BCS_BATCH(batch, MFX_VC1_PRED_PIPE_STATE | (6 - 2));
+    OUT_BCS_BATCH(batch,
+                  0 << 14 | /* FIXME: double ??? */
+                  0 << 12 |
+                  intensitycomp_single << 10 |
+                  intensitycomp_single << 8 |
+                  0 << 4 | /* FIXME: interlace mode */
+                  0);
+    OUT_BCS_BATCH(batch,
+                  pic_param->luma_shift << 16 |
+                  pic_param->luma_scale << 0); /* FIXME: Luma Scaling */
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Emit the 7-dword MFX_VC1_DIRECTMODE_STATE command.
+ *
+ * The write address is the direct-MV buffer of the render target and
+ * the read address is the direct-MV buffer of reference_objects[1].
+ * Either address is emitted as 0 when the surface or its private data
+ * is missing (or its dmv pointer is NULL).
+ */
+static void
+gen8_mfd_vc1_directmode_state(VADriverContextP ctx,
+                              struct decode_state *decode_state,
+                              struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    struct object_surface *cur_surface = decode_state->render_object;
+    struct object_surface *ref_surface = decode_state->reference_objects[1];
+    dri_bo *write_bo = NULL;
+    dri_bo *read_bo = NULL;
+
+    if (cur_surface && cur_surface->private_data) {
+        struct gen7_vc1_surface *cur_priv = cur_surface->private_data;
+
+        write_bo = cur_priv->dmv;
+    }
+
+    if (ref_surface && ref_surface->private_data) {
+        struct gen7_vc1_surface *ref_priv = ref_surface->private_data;
+
+        read_bo = ref_priv->dmv;
+    }
+
+    BEGIN_BCS_BATCH(batch, 7);
+    OUT_BCS_BATCH(batch, MFX_VC1_DIRECTMODE_STATE | (7 - 2));
+
+    /* Direct-MV write buffer (current picture) */
+    if (write_bo) {
+        OUT_BCS_RELOC(batch, write_bo,
+                      I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                      0);
+    } else {
+        OUT_BCS_BATCH(batch, 0);
+    }
+
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+
+    /* Direct-MV read buffer (second reference) */
+    if (read_bo) {
+        OUT_BCS_RELOC(batch, read_bo,
+                      I915_GEM_DOMAIN_INSTRUCTION, 0,
+                      0);
+    } else {
+        OUT_BCS_BATCH(batch, 0);
+    }
+
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Convert a macroblock bit offset into an offset within the raw slice
+ * data pointed to by buf.
+ *
+ * For any profile other than 3 the input offset is returned unchanged.
+ * For profile 3 the whole bytes of the header are walked: every time
+ * the byte pattern 00 00 03 followed by a byte smaller than 4 is found,
+ * three raw bytes are consumed for two header bytes, so the returned
+ * offset grows by one byte per such pattern. The sub-byte part of the
+ * input offset is preserved.
+ * NOTE(review): the 00 00 03 pattern is presumably the VC-1 emulation
+ * prevention sequence -- confirm against the VC-1 specification.
+ *
+ * The scan reads up to buf[pos + 3]; the caller must provide enough
+ * readable bytes.
+ */
+static int
+gen8_mfd_vc1_get_macroblock_bit_offset(uint8_t *buf, int in_slice_data_bit_offset, int profile)
+{
+    int header_bytes = in_slice_data_bit_offset / 8;
+    int counted = 0;
+    int pos = 0;
+
+    if (profile != 3)
+        return in_slice_data_bit_offset;
+
+    while (counted < header_bytes) {
+        if (buf[pos] == 0 && buf[pos + 1] == 0 && buf[pos + 2] == 3 && buf[pos + 3] < 4) {
+            counted += 1;
+            pos += 2;
+        }
+
+        counted++;
+        pos++;
+    }
+
+    return 8 * pos + in_slice_data_bit_offset % 8;
+}
+
+/*
+ * Emit one 5-dword MFD_VC1_BSD_OBJECT command for a slice.
+ *
+ * The slice data buffer is mapped briefly so the macroblock bit offset
+ * can be adjusted by gen8_mfd_vc1_get_macroblock_bit_offset(). The
+ * whole-byte part of that offset is removed from the data size and
+ * added to the data offset; the remaining 0..7 bits go into the last
+ * dword. The slice ends at the vertical position of next_slice_param,
+ * or at the picture height in macroblocks when there is no next slice.
+ */
+static void
+gen8_mfd_vc1_bsd_object(VADriverContextP ctx,
+                        VAPictureParameterBufferVC1 *pic_param,
+                        VASliceParameterBufferVC1 *slice_param,
+                        VASliceParameterBufferVC1 *next_slice_param,
+                        dri_bo *slice_data_bo,
+                        struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    int end_vert_pos = ALIGN(pic_param->coded_height, 16) / 16;
+    int mb_bit_offset;
+    uint8_t *raw_slice = NULL;
+
+    dri_bo_map(slice_data_bo, 0);
+    raw_slice = (uint8_t *)(slice_data_bo->virtual + slice_param->slice_data_offset);
+    mb_bit_offset = gen8_mfd_vc1_get_macroblock_bit_offset(raw_slice,
+                                                           slice_param->macroblock_offset,
+                                                           pic_param->sequence_fields.bits.profile);
+    dri_bo_unmap(slice_data_bo);
+
+    /* A following slice, if any, bounds this one. */
+    if (next_slice_param)
+        end_vert_pos = next_slice_param->slice_vertical_position;
+
+    BEGIN_BCS_BATCH(batch, 5);
+    OUT_BCS_BATCH(batch, MFD_VC1_BSD_OBJECT | (5 - 2));
+    OUT_BCS_BATCH(batch,
+                  slice_param->slice_data_size - (mb_bit_offset >> 3));
+    OUT_BCS_BATCH(batch,
+                  slice_param->slice_data_offset + (mb_bit_offset >> 3));
+    OUT_BCS_BATCH(batch,
+                  slice_param->slice_vertical_position << 16 |
+                  end_vert_pos << 0);
+    OUT_BCS_BATCH(batch,
+                  (mb_bit_offset & 0x7));
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Decode one VC-1 picture: prepare the buffers, emit the common MFX
+ * state and the VC-1 specific state into an atomic BCS batch, then
+ * emit one BSD object per slice and flush the batch.
+ *
+ * Slices are walked group by group (one slice parameter buffer per
+ * group); each slice is told about its successor, which for the last
+ * slice of a group is the first slice of the next group, or NULL at
+ * the very end.
+ */
+static void
+gen8_mfd_vc1_decode_picture(VADriverContextP ctx,
+                            struct decode_state *decode_state,
+                            struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    VAPictureParameterBufferVC1 *pic_param;
+    VASliceParameterBufferVC1 *cur_slice, *following_slice, *next_group_first;
+    dri_bo *slice_data_bo;
+    int group, idx;
+
+    assert(decode_state->pic_param && decode_state->pic_param->buffer);
+    pic_param = (VAPictureParameterBufferVC1 *)decode_state->pic_param->buffer;
+
+    gen8_mfd_vc1_decode_init(ctx, decode_state, gen7_mfd_context);
+    intel_batchbuffer_start_atomic_bcs(batch, 0x1000);
+    intel_batchbuffer_emit_mi_flush(batch);
+
+    /* Codec independent pipeline state */
+    gen8_mfd_pipe_mode_select(ctx, decode_state, MFX_FORMAT_VC1, gen7_mfd_context);
+    gen8_mfd_surface_state(ctx, decode_state, MFX_FORMAT_VC1, gen7_mfd_context);
+    gen8_mfd_pipe_buf_addr_state(ctx, decode_state, MFX_FORMAT_VC1, gen7_mfd_context);
+    gen8_mfd_bsp_buf_base_addr_state(ctx, decode_state, MFX_FORMAT_VC1, gen7_mfd_context);
+
+    /* VC-1 picture level state */
+    gen8_mfd_vc1_pic_state(ctx, decode_state, gen7_mfd_context);
+    gen8_mfd_vc1_pred_pipe_state(ctx, decode_state, gen7_mfd_context);
+    gen8_mfd_vc1_directmode_state(ctx, decode_state, gen7_mfd_context);
+
+    for (group = 0; group < decode_state->num_slice_params; group++) {
+        assert(decode_state->slice_params && decode_state->slice_params[group]->buffer);
+        cur_slice = (VASliceParameterBufferVC1 *)decode_state->slice_params[group]->buffer;
+        slice_data_bo = decode_state->slice_datas[group]->bo;
+        gen8_mfd_ind_obj_base_addr_state(ctx, slice_data_bo, MFX_FORMAT_VC1, gen7_mfd_context);
+
+        next_group_first = NULL;
+        if (group != decode_state->num_slice_params - 1)
+            next_group_first = (VASliceParameterBufferVC1 *)decode_state->slice_params[group + 1]->buffer;
+
+        for (idx = 0; idx < decode_state->slice_params[group]->num_elements; idx++, cur_slice++) {
+            assert(cur_slice->slice_data_flag == VA_SLICE_DATA_FLAG_ALL);
+
+            following_slice = next_group_first;
+            if (idx < decode_state->slice_params[group]->num_elements - 1)
+                following_slice = cur_slice + 1;
+
+            gen8_mfd_vc1_bsd_object(ctx, pic_param, cur_slice, following_slice, slice_data_bo, gen7_mfd_context);
+        }
+    }
+
+    intel_batchbuffer_end_atomic(batch);
+    intel_batchbuffer_flush(batch);
+}
+
+/*
+ * Prepare the buffers used to decode one baseline JPEG picture.
+ *
+ * The chroma subsampling is derived from the number of components and
+ * their horizontal/vertical sampling factors; unsupported combinations
+ * trip an assert. The render target gets an IMC1 buffer with that
+ * subsampling and becomes the (only valid) pre-deblocking output; all
+ * other auxiliary buffers are marked invalid.
+ */
+static void
+gen8_mfd_jpeg_decode_init(VADriverContextP ctx,
+                          struct decode_state *decode_state,
+                          struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct object_surface *obj_surface;
+    VAPictureParameterBufferJPEGBaseline *pic_param;
+    int subsampling = SUBSAMPLE_YUV420;
+
+    assert(decode_state->pic_param && decode_state->pic_param->buffer);
+    pic_param = (VAPictureParameterBufferJPEGBaseline *)decode_state->pic_param->buffer;
+
+    if (pic_param->num_components == 1)
+        subsampling = SUBSAMPLE_YUV400;
+    else if (pic_param->num_components == 3) {
+        int h1 = pic_param->components[0].h_sampling_factor;
+        int h2 = pic_param->components[1].h_sampling_factor;
+        int h3 = pic_param->components[2].h_sampling_factor;
+        int v1 = pic_param->components[0].v_sampling_factor;
+        int v2 = pic_param->components[1].v_sampling_factor;
+        int v3 = pic_param->components[2].v_sampling_factor;
+
+        if (h1 == 2 && h2 == 1 && h3 == 1 &&
+            v1 == 2 && v2 == 1 && v3 == 1)
+            subsampling = SUBSAMPLE_YUV420;
+        else if (h1 == 2 && h2 == 1 && h3 == 1 &&
+                 v1 == 1 && v2 == 1 && v3 == 1)
+            subsampling = SUBSAMPLE_YUV422H;
+        else if (h1 == 1 && h2 == 1 && h3 == 1 &&
+                 v1 == 1 && v2 == 1 && v3 == 1)
+            subsampling = SUBSAMPLE_YUV444;
+        else if (h1 == 4 && h2 == 1 && h3 == 1 &&
+                 v1 == 1 && v2 == 1 && v3 == 1)
+            subsampling = SUBSAMPLE_YUV411;
+        else if (h1 == 1 && h2 == 1 && h3 == 1 &&
+                 v1 == 2 && v2 == 1 && v3 == 1)
+            subsampling = SUBSAMPLE_YUV422V;
+        else if (h1 == 2 && h2 == 1 && h3 == 1 &&
+                 v1 == 2 && v2 == 2 && v3 == 2)
+            subsampling = SUBSAMPLE_YUV422H;
+        /* All three horizontal factors must be 2 here, including h1. */
+        else if (h1 == 2 && h2 == 2 && h3 == 2 &&
+                 v1 == 2 && v2 == 1 && v3 == 1)
+            subsampling = SUBSAMPLE_YUV422V;
+        else
+            assert(0);
+    } else {
+        assert(0);
+    }
+
+    /* Current decoded picture */
+    obj_surface = decode_state->render_object;
+    i965_check_alloc_surface_bo(ctx, obj_surface, 1, VA_FOURCC('I','M','C','1'), subsampling);
+
+    dri_bo_unreference(gen7_mfd_context->pre_deblocking_output.bo);
+    gen7_mfd_context->pre_deblocking_output.bo = obj_surface->bo;
+    dri_bo_reference(gen7_mfd_context->pre_deblocking_output.bo);
+    gen7_mfd_context->pre_deblocking_output.valid = 1;
+
+    /* None of the remaining buffers is used for JPEG decoding here. */
+    gen7_mfd_context->post_deblocking_output.bo = NULL;
+    gen7_mfd_context->post_deblocking_output.valid = 0;
+
+    gen7_mfd_context->intra_row_store_scratch_buffer.bo = NULL;
+    gen7_mfd_context->intra_row_store_scratch_buffer.valid = 0;
+
+    gen7_mfd_context->deblocking_filter_row_store_scratch_buffer.bo = NULL;
+    gen7_mfd_context->deblocking_filter_row_store_scratch_buffer.valid = 0;
+
+    gen7_mfd_context->bsd_mpc_row_store_scratch_buffer.bo = NULL;
+    gen7_mfd_context->bsd_mpc_row_store_scratch_buffer.valid = 0;
+
+    gen7_mfd_context->mpr_row_store_scratch_buffer.bo = NULL;
+    gen7_mfd_context->mpr_row_store_scratch_buffer.valid = 0;
+
+    gen7_mfd_context->bitplane_read_buffer.bo = NULL;
+    gen7_mfd_context->bitplane_read_buffer.valid = 0;
+}
+
+/*
+ * Rotation values for the JPEG picture state, indexed by rotation step
+ * (0, 90, 180, 270 degrees in that order).
+ */
+static const int va_to_gen7_jpeg_rotation[4] = {
+    [0] = GEN7_JPEG_ROTATION_0,
+    [1] = GEN7_JPEG_ROTATION_90,
+    [2] = GEN7_JPEG_ROTATION_180,
+    [3] = GEN7_JPEG_ROTATION_270,
+};
+
+/*
+ * Emit the 3-dword MFX_JPEG_PIC_STATE command.
+ *
+ * The chroma type is derived from the component count and sampling
+ * factors of the picture parameters (unsupported three-component
+ * combinations trip an assert). The frame size is programmed in 8x8
+ * blocks, rounded up to a granularity that depends on the chroma type.
+ * No rotation is applied.
+ */
+static void
+gen8_mfd_jpeg_pic_state(VADriverContextP ctx,
+                        struct decode_state *decode_state,
+                        struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    VAPictureParameterBufferJPEGBaseline *pic_param;
+    int chroma_type = GEN7_YUV420;
+    int frame_width_in_blks;
+    int frame_height_in_blks;
+
+    assert(decode_state->pic_param && decode_state->pic_param->buffer);
+    pic_param = (VAPictureParameterBufferJPEGBaseline *)decode_state->pic_param->buffer;
+
+    if (pic_param->num_components == 1)
+        chroma_type = GEN7_YUV400;
+    else if (pic_param->num_components == 3) {
+        int h1 = pic_param->components[0].h_sampling_factor;
+        int h2 = pic_param->components[1].h_sampling_factor;
+        int h3 = pic_param->components[2].h_sampling_factor;
+        int v1 = pic_param->components[0].v_sampling_factor;
+        int v2 = pic_param->components[1].v_sampling_factor;
+        int v3 = pic_param->components[2].v_sampling_factor;
+
+        if (h1 == 2 && h2 == 1 && h3 == 1 &&
+            v1 == 2 && v2 == 1 && v3 == 1)
+            chroma_type = GEN7_YUV420;
+        else if (h1 == 2 && h2 == 1 && h3 == 1 &&
+                 v1 == 1 && v2 == 1 && v3 == 1)
+            chroma_type = GEN7_YUV422H_2Y;
+        else if (h1 == 1 && h2 == 1 && h3 == 1 &&
+                 v1 == 1 && v2 == 1 && v3 == 1)
+            chroma_type = GEN7_YUV444;
+        else if (h1 == 4 && h2 == 1 && h3 == 1 &&
+                 v1 == 1 && v2 == 1 && v3 == 1)
+            chroma_type = GEN7_YUV411;
+        else if (h1 == 1 && h2 == 1 && h3 == 1 &&
+                 v1 == 2 && v2 == 1 && v3 == 1)
+            chroma_type = GEN7_YUV422V_2Y;
+        else if (h1 == 2 && h2 == 1 && h3 == 1 &&
+                 v1 == 2 && v2 == 2 && v3 == 2)
+            chroma_type = GEN7_YUV422H_4Y;
+        /* All three horizontal factors must be 2 here, including h1. */
+        else if (h1 == 2 && h2 == 2 && h3 == 2 &&
+                 v1 == 2 && v2 == 1 && v3 == 1)
+            chroma_type = GEN7_YUV422V_4Y;
+        else
+            assert(0);
+    }
+
+    /* Round the frame size up to whole 8-, 32- or 16-pixel units. */
+    if (chroma_type == GEN7_YUV400 ||
+        chroma_type == GEN7_YUV444 ||
+        chroma_type == GEN7_YUV422V_2Y) {
+        frame_width_in_blks = ((pic_param->picture_width + 7) / 8);
+        frame_height_in_blks = ((pic_param->picture_height + 7) / 8);
+    } else if (chroma_type == GEN7_YUV411) {
+        frame_width_in_blks = ((pic_param->picture_width + 31) / 32) * 4;
+        frame_height_in_blks = ((pic_param->picture_height + 31) / 32) * 4;
+    } else {
+        frame_width_in_blks = ((pic_param->picture_width + 15) / 16) * 2;
+        frame_height_in_blks = ((pic_param->picture_height + 15) / 16) * 2;
+    }
+
+    BEGIN_BCS_BATCH(batch, 3);
+    OUT_BCS_BATCH(batch, MFX_JPEG_PIC_STATE | (3 - 2));
+    OUT_BCS_BATCH(batch,
+                  (va_to_gen7_jpeg_rotation[0] << 4) |    /* without rotation */
+                  (chroma_type << 0));
+    OUT_BCS_BATCH(batch,
+                  ((frame_height_in_blks - 1) << 16) |   /* FrameHeightInBlks */
+                  ((frame_width_in_blks - 1) << 0));    /* FrameWidthInBlks */
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Translates a VA Huffman table index (0 or 1) into the table id that is
+ * written as the second dword of MFX_JPEG_HUFF_TABLE_STATE.
+ */
+static const int va_to_gen7_jpeg_hufftable[2] = {
+    [0] = MFX_HUFFTABLE_ID_Y,
+    [1] = MFX_HUFFTABLE_ID_UV,
+};
+
+/*
+ * Emit one MFX_JPEG_HUFF_TABLE_STATE command for each of the first
+ * num_tables Huffman tables that the application flagged for loading.
+ *
+ * Nothing is emitted when no Huffman table buffer is attached to
+ * decode_state.  num_tables is clamped to the number of entries in
+ * va_to_gen7_jpeg_hufftable, so an out-of-range request can never index
+ * past the id map (the caller only guards this with an assert).
+ */
+static void
+gen8_mfd_jpeg_huff_table_state(VADriverContextP ctx,
+                               struct decode_state *decode_state,
+                               struct gen7_mfd_context *gen7_mfd_context,
+                               int num_tables)
+{
+    VAHuffmanTableBufferJPEGBaseline *huffman_table;
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    int index;
+
+    if (!decode_state->huffman_table || !decode_state->huffman_table->buffer)
+        return;
+
+    huffman_table = (VAHuffmanTableBufferJPEGBaseline *)decode_state->huffman_table->buffer;
+
+    /* Never walk past the end of the VA-index -> hardware-id map. */
+    if (num_tables > (int)ARRAY_ELEMS(va_to_gen7_jpeg_hufftable))
+        num_tables = ARRAY_ELEMS(va_to_gen7_jpeg_hufftable);
+
+    for (index = 0; index < num_tables; index++) {
+        int id = va_to_gen7_jpeg_hufftable[index];
+
+        /* Skip tables the application did not ask to (re)load. */
+        if (!huffman_table->load_huffman_table[index])
+            continue;
+
+        /*
+         * 53 dwords: 2 header dwords followed by 12 + 12 + 16 + 164 = 204
+         * bytes of table payload (assuming intel_batchbuffer_data() takes
+         * a byte count -- TODO confirm).
+         */
+        BEGIN_BCS_BATCH(batch, 53);
+        OUT_BCS_BATCH(batch, MFX_JPEG_HUFF_TABLE_STATE | (53 - 2));
+        OUT_BCS_BATCH(batch, id);
+        intel_batchbuffer_data(batch, huffman_table->huffman_table[index].num_dc_codes, 12);
+        intel_batchbuffer_data(batch, huffman_table->huffman_table[index].dc_values, 12);
+        intel_batchbuffer_data(batch, huffman_table->huffman_table[index].num_ac_codes, 16);
+        intel_batchbuffer_data(batch, huffman_table->huffman_table[index].ac_values, 164);
+        ADVANCE_BCS_BATCH(batch);
+    }
+}
+
+/*
+ * Translates a 1-based component number (component_id relative to the
+ * first component, plus one) into the quantiser matrix type handed to
+ * gen8_mfd_qm_state().  Slot 0 is a placeholder and is never looked up by
+ * gen8_mfd_jpeg_qm_state(), which rejects ids below 1.
+ */
+static const int va_to_gen7_jpeg_qm[5] = {
+    [0] = -1,
+    [1] = MFX_QM_JPEG_LUMA_Y_QUANTIZER_MATRIX,
+    [2] = MFX_QM_JPEG_CHROMA_CB_QUANTIZER_MATRIX,
+    [3] = MFX_QM_JPEG_CHROMA_CR_QUANTIZER_MATRIX,
+    [4] = MFX_QM_JPEG_ALPHA_QUANTIZER_MATRIX,
+};
+
+/*
+ * Program the quantiser matrix of every picture component whose table the
+ * application flagged for loading.
+ *
+ * Returns without emitting anything when no IQ matrix buffer is attached.
+ * A component is skipped when its id (relative to the first component)
+ * falls outside 1..4, when its quantiser_table_selector does not address
+ * an existing table, or when the selected table is not flagged for
+ * loading.
+ */
+static void
+gen8_mfd_jpeg_qm_state(VADriverContextP ctx,
+                       struct decode_state *decode_state,
+                       struct gen7_mfd_context *gen7_mfd_context)
+{
+    VAPictureParameterBufferJPEGBaseline *pic_param;
+    VAIQMatrixBufferJPEGBaseline *iq_matrix;
+    int index;
+
+    if (!decode_state->iq_matrix || !decode_state->iq_matrix->buffer)
+        return;
+
+    iq_matrix = (VAIQMatrixBufferJPEGBaseline *)decode_state->iq_matrix->buffer;
+    pic_param = (VAPictureParameterBufferJPEGBaseline *)decode_state->pic_param->buffer;
+
+    assert(pic_param->num_components <= 3);
+
+    for (index = 0; index < pic_param->num_components; index++) {
+        int id = pic_param->components[index].component_id - pic_param->components[0].component_id + 1;
+        unsigned int selector = pic_param->components[index].quantiser_table_selector;
+        int qm_type;
+        unsigned char *qm;
+        unsigned char raster_qm[64];
+        int j;
+
+        if (id > 4 || id < 1)
+            continue;
+
+        /*
+         * The selector comes straight from the application; refuse values
+         * that would index past the quantiser tables.
+         */
+        if (selector >= ARRAY_ELEMS(iq_matrix->load_quantiser_table))
+            continue;
+
+        if (!iq_matrix->load_quantiser_table[selector])
+            continue;
+
+        qm = iq_matrix->quantiser_table[selector];
+        qm_type = va_to_gen7_jpeg_qm[id];
+
+        /* Reorder the table through zigzag_direct[] before handing it to the hardware. */
+        for (j = 0; j < 64; j++)
+            raster_qm[zigzag_direct[j]] = qm[j];
+
+        gen8_mfd_qm_state(ctx, qm_type, raster_qm, 64, gen7_mfd_context);
+    }
+}
+
+/*
+ * Emit the MFD_JPEG_BSD_OBJECT command for one scan.
+ *
+ * The scan component mask gets bit 0/1/2 set for each scan component whose
+ * selector equals the first picture component id plus 0/1/2; any other
+ * selector trips an assert.  next_slice_param and slice_data_bo are
+ * accepted but not used here.
+ */
+static void
+gen8_mfd_jpeg_bsd_object(VADriverContextP ctx,
+                         VAPictureParameterBufferJPEGBaseline *pic_param,
+                         VASliceParameterBufferJPEGBaseline *slice_param,
+                         VASliceParameterBufferJPEGBaseline *next_slice_param,
+                         dri_bo *slice_data_bo,
+                         struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    int scan_component_mask = 0;
+    int n;
+
+    assert(slice_param->num_components > 0);
+    assert(slice_param->num_components < 4);
+    assert(slice_param->num_components <= pic_param->num_components);
+
+    for (n = 0; n < slice_param->num_components; n++) {
+        /* Position of this scan component relative to the first picture component. */
+        int rel = slice_param->components[n].component_selector -
+                  pic_param->components[0].component_id;
+
+        if (rel >= 0 && rel <= 2)
+            scan_component_mask |= (1 << rel);
+        else
+            assert(0);
+    }
+
+    BEGIN_BCS_BATCH(batch, 6);
+    OUT_BCS_BATCH(batch, MFD_JPEG_BSD_OBJECT | (6 - 2));
+    OUT_BCS_BATCH(batch, slice_param->slice_data_size);
+    OUT_BCS_BATCH(batch, slice_param->slice_data_offset);
+    OUT_BCS_BATCH(batch,
+                  (slice_param->slice_horizontal_position << 16) |
+                  (slice_param->slice_vertical_position << 0));
+    OUT_BCS_BATCH(batch,
+                  ((slice_param->num_components != 1) << 30) |  /* interleaved when more than one component */
+                  (scan_component_mask << 27) |                 /* scan components */
+                  (0 << 26) |                                   /* disable interrupt allowed */
+                  (slice_param->num_mcus << 0));                /* MCU count */
+    OUT_BCS_BATCH(batch,
+                  (slice_param->restart_interval << 0));        /* RestartInterval */
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/* Workaround for JPEG decoding on Ivybridge */
+
+/*
+ * Forward declarations of the driver's surface destroy/create entry points.
+ * gen8_jpeg_wa_init() calls both to (re)create the scratch surface used by
+ * the JPEG workaround.  They are declared here instead of being pulled in
+ * from a header (presumably they are defined in i965_drv_video.c -- confirm).
+ */
+VAStatus 
+i965_DestroySurfaces(VADriverContextP ctx,
+                     VASurfaceID *surface_list,
+                     int num_surfaces);
+VAStatus 
+i965_CreateSurfaces(VADriverContextP ctx,
+                    int width,
+                    int height,
+                    int format,
+                    int num_surfaces,
+                    VASurfaceID *surfaces);
+
+/*
+ * Canned clip consumed by the JPEG workaround (gen8_jpeg_wa_*):
+ *   width/height    - size of the scratch surface created for it
+ *   data/data_size  - bitstream bytes uploaded to the workaround slice
+ *                     data buffer, and how many of them are valid
+ *   data_bit_offset - bit offset into data, split into byte and bit parts
+ *                     by gen8_jpeg_wa_avc_bsd_object()
+ *   qp              - value placed in the slice state by
+ *                     gen8_jpeg_wa_avc_slice_state()
+ */
+static struct {
+    int width;
+    int height;
+    unsigned char data[32];
+    int data_size;
+    int data_bit_offset;
+    int qp;
+} gen7_jpeg_wa_clip = {
+    .width = 16,
+    .height = 16,
+    .data = {
+        0x65, 0xb8, 0x40, 0x32, 0x13, 0xfd, 0x06, 0x6c,
+        0xfc, 0x0a, 0x50, 0x71, 0x5c, 0x00
+    },
+    .data_size = 14,
+    .data_bit_offset = 40,
+    .qp = 28,
+};
+
+/*
+ * Prepare the resources needed by the JPEG workaround.
+ *
+ * Any scratch surface left from a previous call is destroyed and a new one
+ * of gen7_jpeg_wa_clip.width x height is created and backed with NV12
+ * storage.  The slice data buffer holding the canned bitstream is
+ * allocated and filled only once and then reused.
+ */
+static void
+gen8_jpeg_wa_init(VADriverContextP ctx,
+                  struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    struct object_surface *wa_surface;
+    VAStatus va_status;
+
+    /* Drop the surface created by an earlier invocation, if any. */
+    if (gen7_mfd_context->jpeg_wa_surface_id != VA_INVALID_SURFACE)
+        i965_DestroySurfaces(ctx, &gen7_mfd_context->jpeg_wa_surface_id, 1);
+
+    va_status = i965_CreateSurfaces(ctx,
+                                    gen7_jpeg_wa_clip.width,
+                                    gen7_jpeg_wa_clip.height,
+                                    VA_RT_FORMAT_YUV420,
+                                    1,
+                                    &gen7_mfd_context->jpeg_wa_surface_id);
+    assert(va_status == VA_STATUS_SUCCESS);
+
+    wa_surface = SURFACE(gen7_mfd_context->jpeg_wa_surface_id);
+    assert(wa_surface);
+    i965_check_alloc_surface_bo(ctx, wa_surface, 1, VA_FOURCC('N', 'V', '1', '2'), SUBSAMPLE_YUV420);
+    gen7_mfd_context->jpeg_wa_surface_object = wa_surface;
+
+    /* The bitstream buffer survives across calls; set it up only once. */
+    if (gen7_mfd_context->jpeg_wa_slice_data_bo)
+        return;
+
+    gen7_mfd_context->jpeg_wa_slice_data_bo = dri_bo_alloc(i965->intel.bufmgr,
+                                                           "JPEG WA data",
+                                                           0x1000,
+                                                           0x1000);
+    dri_bo_subdata(gen7_mfd_context->jpeg_wa_slice_data_bo,
+                   0,
+                   gen7_jpeg_wa_clip.data_size,
+                   gen7_jpeg_wa_clip.data);
+}
+
+/*
+ * Emit MFX_PIPE_MODE_SELECT for the workaround pass: AVC, VLD decode,
+ * long format, pre-deblocking output only.
+ */
+static void
+gen8_jpeg_wa_pipe_mode_select(VADriverContextP ctx,
+                              struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    const unsigned int mode_dw =
+        (MFX_LONG_MODE << 17) |     /* Currently only support long format */
+        (MFD_MODE_VLD << 15) |      /* VLD mode */
+        (0 << 10) |                 /* disable Stream-Out */
+        (0 << 9)  |                 /* Post Deblocking Output */
+        (1 << 8)  |                 /* Pre Deblocking Output */
+        (0 << 5)  |                 /* not in stitch mode */
+        (MFX_CODEC_DECODE << 4)  |  /* decoding mode */
+        (MFX_FORMAT_AVC << 0);
+    const unsigned int error_dw =
+        (0 << 4)  | /* terminate if AVC motion and POC table error occurs */
+        (0 << 3)  | /* terminate if AVC mbdata error occurs */
+        (0 << 2)  | /* terminate if AVC CABAC/CAVLC decode error occurs */
+        (0 << 1)  |
+        (0 << 0);
+
+    BEGIN_BCS_BATCH(batch, 5);
+    OUT_BCS_BATCH(batch, MFX_PIPE_MODE_SELECT | (5 - 2));
+    OUT_BCS_BATCH(batch, mode_dw);
+    OUT_BCS_BATCH(batch, error_dw);
+    OUT_BCS_BATCH(batch, 0); /* pic status/error report id */
+    OUT_BCS_BATCH(batch, 0); /* reserved */
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Emit MFX_SURFACE_STATE describing the workaround scratch surface
+ * (planar 4:2:0 with interleaved chroma, Y-major tiled).
+ */
+static void
+gen8_jpeg_wa_surface_state(VADriverContextP ctx,
+                           struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct object_surface *wa_surface = gen7_mfd_context->jpeg_wa_surface_object;
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    const unsigned int size_dw =
+        ((wa_surface->orig_width - 1) << 18) |
+        ((wa_surface->orig_height - 1) << 4);
+    const unsigned int format_dw =
+        (MFX_SURFACE_PLANAR_420_8 << 28) | /* 420 planar YUV surface */
+        (1 << 27) |                        /* interleave chroma, set to 0 for JPEG */
+        (0 << 22) |                        /* surface object control state, ignored */
+        ((wa_surface->width - 1) << 3) |   /* pitch */
+        (0 << 2)  |                        /* must be 0 */
+        (1 << 1)  |                        /* must be tiled */
+        (I965_TILEWALK_YMAJOR << 0);       /* tile walk, must be 1 */
+    const unsigned int cb_offset_dw =
+        (0 << 16) |                        /* X offset for U(Cb), must be 0 */
+        (wa_surface->y_cb_offset << 0);    /* Y offset for U(Cb) */
+    const unsigned int cr_offset_dw =
+        (0 << 16) |                        /* X offset for V(Cr), must be 0 */
+        (0 << 0);                          /* Y offset for V(Cr), must be 0 for video codec, non-zero for JPEG */
+
+    BEGIN_BCS_BATCH(batch, 6);
+    OUT_BCS_BATCH(batch, MFX_SURFACE_STATE | (6 - 2));
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, size_dw);
+    OUT_BCS_BATCH(batch, format_dw);
+    OUT_BCS_BATCH(batch, cb_offset_dw);
+    OUT_BCS_BATCH(batch, cr_offset_dw);
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Emit the 61-dword MFX_PIPE_BUF_ADDR_STATE for the workaround pass.
+ *
+ * Only two addresses are programmed: the workaround scratch surface (DW 1)
+ * and a temporary intra row store buffer (DW 13).  Every other dword is
+ * written as zero.  The temporary buffer is allocated here and our
+ * reference to it is dropped again once the command has been emitted.
+ */
+static void
+gen8_jpeg_wa_pipe_buf_addr_state(VADriverContextP ctx,
+                                 struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    struct object_surface *obj_surface = gen7_mfd_context->jpeg_wa_surface_object;
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    dri_bo *intra_bo;
+    int i;
+
+    /* 128 * 64 = 8192 bytes, requested with 0x1000 alignment */
+    intra_bo = dri_bo_alloc(i965->intel.bufmgr,
+                            "intra row store",
+                            128 * 64,
+                            0x1000);
+
+    BEGIN_BCS_BATCH(batch, 61);
+    OUT_BCS_BATCH(batch, MFX_PIPE_BUF_ADDR_STATE | (61 - 2));
+    /*
+     * DW 1-3: the scratch surface (presumably the pre-deblocking output,
+     * matching the pre-deblocking bit set in the pipe mode select -- confirm)
+     */
+    OUT_BCS_RELOC(batch,
+                  obj_surface->bo,
+                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                  0);
+       OUT_BCS_BATCH(batch, 0);
+       OUT_BCS_BATCH(batch, 0);
+    
+
+    /* DW 4-6 */
+    OUT_BCS_BATCH(batch, 0); /* post deblocking */
+       OUT_BCS_BATCH(batch, 0);
+       OUT_BCS_BATCH(batch, 0);
+
+    /* DW 7-12: uncompressed video & stream out */
+    OUT_BCS_BATCH(batch, 0); /* ignore for decoding */
+    OUT_BCS_BATCH(batch, 0); /* ignore for decoding */
+       OUT_BCS_BATCH(batch, 0);
+       OUT_BCS_BATCH(batch, 0);
+       OUT_BCS_BATCH(batch, 0);
+       OUT_BCS_BATCH(batch, 0);
+
+    /* DW 13-15: intra row store scratch */
+    OUT_BCS_RELOC(batch,
+                  intra_bo,
+                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                  0);
+       OUT_BCS_BATCH(batch, 0);
+       OUT_BCS_BATCH(batch, 0);
+
+    /* DW 16-18: deblocking filter */
+    OUT_BCS_BATCH(batch, 0);
+       OUT_BCS_BATCH(batch, 0);
+       OUT_BCS_BATCH(batch, 0);
+
+    /* DW 19-50: two dwords per reference frame slot, all left empty */
+    for (i = 0; i < MAX_GEN_REFERENCE_FRAMES; i++) {
+        OUT_BCS_BATCH(batch, 0);
+        OUT_BCS_BATCH(batch, 0);
+    }
+    /* DW 51 */
+    OUT_BCS_BATCH(batch, 0);
+
+    /* DW 52-54: mb status address */
+    OUT_BCS_BATCH(batch, 0);
+       OUT_BCS_BATCH(batch, 0);
+       OUT_BCS_BATCH(batch, 0);
+    /* DW 55-60: ILDB & second ILDB address */
+    OUT_BCS_BATCH(batch, 0);
+       OUT_BCS_BATCH(batch, 0);
+       OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+       OUT_BCS_BATCH(batch, 0);
+       OUT_BCS_BATCH(batch, 0);
+
+    ADVANCE_BCS_BATCH(batch);
+
+    /*
+     * Drop our reference; presumably the relocation above keeps the buffer
+     * alive until the batch has executed -- confirm.
+     */
+    dri_bo_unreference(intra_bo);
+}
+
+/*
+ * Emit the 10-dword MFX_BSP_BUF_BASE_ADDR_STATE for the workaround pass,
+ * pointing it at two temporary row store buffers allocated here.  Our
+ * references to both buffers are dropped after the command is emitted.
+ */
+static void
+gen8_jpeg_wa_bsp_buf_base_addr_state(VADriverContextP ctx,
+                                     struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    dri_bo *bsd_mpc_row_store;
+    dri_bo *mpr_row_store;
+    int pad;
+
+    bsd_mpc_row_store = dri_bo_alloc(i965->intel.bufmgr,
+                                     "bsd mpc row store",
+                                     11520, /* 1.5 * 120 * 64 */
+                                     0x1000);
+
+    mpr_row_store = dri_bo_alloc(i965->intel.bufmgr,
+                                 "mpr row store",
+                                 7680, /* 1.0 * 120 * 64 */
+                                 0x1000);
+
+    BEGIN_BCS_BATCH(batch, 10);
+    OUT_BCS_BATCH(batch, MFX_BSP_BUF_BASE_ADDR_STATE | (10 - 2));
+
+    /* DW 1-3: BSD/MPC row store */
+    OUT_BCS_RELOC(batch, bsd_mpc_row_store,
+                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                  0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+
+    /* DW 4-6: MPR row store */
+    OUT_BCS_RELOC(batch, mpr_row_store,
+                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+                  0);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0);
+
+    /* DW 7-9: left as zero */
+    for (pad = 0; pad < 3; pad++)
+        OUT_BCS_BATCH(batch, 0);
+
+    ADVANCE_BCS_BATCH(batch);
+
+    dri_bo_unreference(bsd_mpc_row_store);
+    dri_bo_unreference(mpr_row_store);
+}
+
+/*
+ * Quantiser matrix step of the workaround pass.  The body is empty: no
+ * commands are emitted here, but gen8_mfd_jpeg_wa() still calls it as one
+ * step of its sequence.
+ */
+static void
+gen8_jpeg_wa_avc_qm_state(VADriverContextP ctx,
+                          struct gen7_mfd_context *gen7_mfd_context)
+{
+
+}
+
+/*
+ * Emit the 16-dword MFX_AVC_IMG_STATE for the workaround pass: a picture
+ * of one macroblock by one macroblock, with the 4:2:0 and CABAC bits set.
+ */
+static void
+gen8_jpeg_wa_avc_img_state(VADriverContextP ctx,
+                           struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    const unsigned int mbs_wide = 1;
+    const unsigned int mbs_high = 1;
+    const int picture_structure = 0;
+    const int mbaff = 0;
+    int pad;
+
+    BEGIN_BCS_BATCH(batch, 16);
+    OUT_BCS_BATCH(batch, MFX_AVC_IMG_STATE | (16 - 2));
+    OUT_BCS_BATCH(batch, mbs_wide * mbs_high);
+    OUT_BCS_BATCH(batch,
+                  ((mbs_high - 1) << 16) |
+                  ((mbs_wide - 1) << 0));
+    OUT_BCS_BATCH(batch,
+                  (0 << 24) |
+                  (0 << 16) |
+                  (0 << 14) |
+                  (0 << 13) |
+                  (0 << 12) | /* differ from GEN6 */
+                  (0 << 10) |
+                  (picture_structure << 8));
+    OUT_BCS_BATCH(batch,
+                  (1 << 10) | /* 4:2:0 */
+                  (1 << 7) |  /* CABAC */
+                  (0 << 6) |
+                  (0 << 5) |
+                  (0 << 4) |
+                  (0 << 3) |
+                  (1 << 2) |
+                  (mbaff << 1) |
+                  (0 << 0));
+
+    /* DW 5-15 are all zero */
+    for (pad = 0; pad < 11; pad++)
+        OUT_BCS_BATCH(batch, 0);
+
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Emit the 71-dword MFX_AVC_DIRECTMODE_STATE for the workaround pass.
+ * Every payload dword is zero: no reference surfaces, no current-frame
+ * buffer and an all-zero POC list.
+ */
+static void
+gen8_jpeg_wa_avc_directmode_state(VADriverContextP ctx,
+                                  struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    int n;
+
+    BEGIN_BCS_BATCH(batch, 71);
+    OUT_BCS_BATCH(batch, MFX_AVC_DIRECTMODE_STATE | (71 - 2));
+
+    /* reference surfaces 0..15: a top and a bottom dword for each */
+    for (n = 0; n < 2 * MAX_GEN_REFERENCE_FRAMES; n++)
+        OUT_BCS_BATCH(batch, 0);
+
+    OUT_BCS_BATCH(batch, 0);
+
+    /* the current decoding frame/field */
+    for (n = 0; n < 3; n++)
+        OUT_BCS_BATCH(batch, 0);
+
+    /* POC List: two dwords per reference slot */
+    for (n = 0; n < 2 * MAX_GEN_REFERENCE_FRAMES; n++)
+        OUT_BCS_BATCH(batch, 0);
+
+    for (n = 0; n < 2; n++)
+        OUT_BCS_BATCH(batch, 0);
+
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Emit the 11-dword MFX_IND_OBJ_BASE_ADDR_STATE for the workaround pass,
+ * pointing the indirect object base at the buffer that holds the canned
+ * bitstream.
+ */
+static void
+gen8_jpeg_wa_ind_obj_base_addr_state(VADriverContextP ctx,
+                                     struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    int n;
+
+    BEGIN_BCS_BATCH(batch, 11);
+    OUT_BCS_BATCH(batch, MFX_IND_OBJ_BASE_ADDR_STATE | (11 - 2));
+    OUT_BCS_RELOC(batch,
+                  gen7_mfd_context->jpeg_wa_slice_data_bo,
+                  I915_GEM_DOMAIN_INSTRUCTION, 0,
+                  0);
+    OUT_BCS_BATCH(batch, 0x80000000); /* must set, up to 2G */
+
+    /* DW 3-10: remaining dwords are zero (ignored for VLD mode) */
+    for (n = 0; n < 8; n++)
+        OUT_BCS_BATCH(batch, 0);
+
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Emit the 6-dword MFD_AVC_BSD_OBJECT that decodes the canned bitstream.
+ * gen7_jpeg_wa_clip.data_bit_offset is split into a byte offset (bits
+ * 16 and up of DW 4) and a residual bit offset (low three bits).
+ */
+static void
+gen8_jpeg_wa_avc_bsd_object(VADriverContextP ctx,
+                            struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    const int first_mb_byte_offset = gen7_jpeg_wa_clip.data_bit_offset >> 3;
+    const int first_mb_bit_offset = gen7_jpeg_wa_clip.data_bit_offset & 0x7;
+
+    /* the input bitstream format on GEN7 differs from GEN6 */
+    BEGIN_BCS_BATCH(batch, 6);
+    OUT_BCS_BATCH(batch, MFD_AVC_BSD_OBJECT | (6 - 2));
+    OUT_BCS_BATCH(batch, gen7_jpeg_wa_clip.data_size);
+    OUT_BCS_BATCH(batch, 0);
+    OUT_BCS_BATCH(batch, 0); /* every field of this dword is zero */
+    OUT_BCS_BATCH(batch,
+                  (first_mb_byte_offset << 16) |
+                  (0 << 5)  |
+                  (0 << 4)  |
+                  (1 << 3)  | /* LastSlice Flag */
+                  first_mb_bit_offset);
+    OUT_BCS_BATCH(batch, 0);
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Emit the 11-dword MFX_AVC_SLICE_STATE for the workaround pass: a single
+ * I slice starting at macroblock (0, 0), deblocking disabled, QP taken
+ * from gen7_jpeg_wa_clip, marked as the last slice.
+ */
+static void
+gen8_jpeg_wa_avc_slice_state(VADriverContextP ctx,
+                             struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    const int cur_mb_x = 0, cur_mb_y = 0;
+    const int next_mb_x = 0, next_mb_y = 1;
+    const int ref_count_l0 = 0, ref_count_l1 = 0;
+    const int first_mb = 0;
+    int pad;
+
+    BEGIN_BCS_BATCH(batch, 11);
+    OUT_BCS_BATCH(batch, MFX_AVC_SLICE_STATE | (11 - 2));
+    OUT_BCS_BATCH(batch, SLICE_TYPE_I);
+    OUT_BCS_BATCH(batch,
+                  (ref_count_l1 << 24) |
+                  (ref_count_l0 << 16) |
+                  (0 << 8) |
+                  (0 << 0));
+    OUT_BCS_BATCH(batch,
+                  (0 << 29) |
+                  (1 << 27) |   /* disable Deblocking */
+                  (0 << 24) |
+                  (gen7_jpeg_wa_clip.qp << 16) |
+                  (0 << 8) |
+                  (0 << 0));
+    OUT_BCS_BATCH(batch,
+                  (cur_mb_y << 24) |
+                  (cur_mb_x << 16) |
+                  (first_mb << 0));
+    OUT_BCS_BATCH(batch,
+                  (next_mb_y << 16) |
+                  (next_mb_x << 0));
+    OUT_BCS_BATCH(batch, (1 << 19)); /* last slice flag */
+
+    /* DW 7-10 are zero */
+    for (pad = 0; pad < 4; pad++)
+        OUT_BCS_BATCH(batch, 0);
+
+    ADVANCE_BCS_BATCH(batch);
+}
+
+/*
+ * Queue the complete JPEG workaround sequence: (re)create the workaround
+ * resources, flush, then program the pipe for AVC and decode the canned
+ * gen7_jpeg_wa_clip bitstream into the scratch surface.
+ *
+ * gen8_mfd_jpeg_decode_picture() runs this ahead of the real JPEG
+ * commands.  NOTE(review): a nearby comment in this file describes the
+ * workaround as being for Ivybridge; whether gen8 still needs it is not
+ * visible from here.
+ */
+static void
+gen8_mfd_jpeg_wa(VADriverContextP ctx,
+                 struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    /* surface + bitstream buffer first: the state commands below reference them */
+    gen8_jpeg_wa_init(ctx, gen7_mfd_context);
+    intel_batchbuffer_emit_mi_flush(batch);
+    gen8_jpeg_wa_pipe_mode_select(ctx, gen7_mfd_context);
+    gen8_jpeg_wa_surface_state(ctx, gen7_mfd_context);
+    gen8_jpeg_wa_pipe_buf_addr_state(ctx, gen7_mfd_context);
+    gen8_jpeg_wa_bsp_buf_base_addr_state(ctx, gen7_mfd_context);
+    gen8_jpeg_wa_avc_qm_state(ctx, gen7_mfd_context);
+    gen8_jpeg_wa_avc_img_state(ctx, gen7_mfd_context);
+    gen8_jpeg_wa_ind_obj_base_addr_state(ctx, gen7_mfd_context);
+
+    /* per-slice commands; the BSD object goes last */
+    gen8_jpeg_wa_avc_directmode_state(ctx, gen7_mfd_context);
+    gen8_jpeg_wa_avc_slice_state(ctx, gen7_mfd_context);
+    gen8_jpeg_wa_avc_bsd_object(ctx, gen7_mfd_context);
+}
+
+/*
+ * Build and submit the batch that decodes one baseline JPEG picture.
+ *
+ * Order of work:
+ *   1. per-picture setup, the JPEG workaround pass, then the JPEG pipe,
+ *      surface, buffer address, picture and quantiser state;
+ *   2. a first walk over all slice parameters to find the highest
+ *      DC/AC Huffman table selector in use, which decides how many
+ *      Huffman tables get programmed;
+ *   3. a second walk that emits one BSD object per slice parameter.
+ *
+ * Only slices carrying their complete data (VA_SLICE_DATA_FLAG_ALL) and
+ * table selectors 0 and 1 are supported; both are enforced with asserts.
+ */
+void
+gen8_mfd_jpeg_decode_picture(VADriverContextP ctx,
+                             struct decode_state *decode_state,
+                             struct gen7_mfd_context *gen7_mfd_context)
+{
+    struct intel_batchbuffer *batch = gen7_mfd_context->base.batch;
+    VAPictureParameterBufferJPEGBaseline *pic_param;
+    VASliceParameterBufferJPEGBaseline *slice_param, *next_slice_param, *next_slice_group_param;
+    dri_bo *slice_data_bo;
+    int i, j, max_selector = 0;
+
+    assert(decode_state->pic_param && decode_state->pic_param->buffer);
+    pic_param = (VAPictureParameterBufferJPEGBaseline *)decode_state->pic_param->buffer;
+
+    /* Currently only support Baseline DCT */
+    gen8_mfd_jpeg_decode_init(ctx, decode_state, gen7_mfd_context);
+    intel_batchbuffer_start_atomic_bcs(batch, 0x1000);
+    gen8_mfd_jpeg_wa(ctx, gen7_mfd_context);
+    intel_batchbuffer_emit_mi_flush(batch);
+    gen8_mfd_pipe_mode_select(ctx, decode_state, MFX_FORMAT_JPEG, gen7_mfd_context);
+    gen8_mfd_surface_state(ctx, decode_state, MFX_FORMAT_JPEG, gen7_mfd_context);
+    gen8_mfd_pipe_buf_addr_state(ctx, decode_state, MFX_FORMAT_JPEG, gen7_mfd_context);
+    gen8_mfd_jpeg_pic_state(ctx, decode_state, gen7_mfd_context);
+    gen8_mfd_jpeg_qm_state(ctx, decode_state, gen7_mfd_context);
+
+    /*
+     * First walk: find the highest Huffman table selector used by any scan.
+     * NOTE(review): this walk also emits the indirect object base address
+     * for every slice buffer, which the second walk emits again; it is kept
+     * so the command stream stays exactly as before.
+     */
+    for (j = 0; j < decode_state->num_slice_params; j++) {
+        assert(decode_state->slice_params && decode_state->slice_params[j]->buffer);
+        slice_param = (VASliceParameterBufferJPEGBaseline *)decode_state->slice_params[j]->buffer;
+        slice_data_bo = decode_state->slice_datas[j]->bo;
+        gen8_mfd_ind_obj_base_addr_state(ctx, slice_data_bo, MFX_FORMAT_JPEG, gen7_mfd_context);
+
+        for (i = 0; i < decode_state->slice_params[j]->num_elements; i++) {
+            int component;
+
+            assert(slice_param->slice_data_flag == VA_SLICE_DATA_FLAG_ALL);
+
+            for (component = 0; component < slice_param->num_components; component++) {
+                if (max_selector < slice_param->components[component].dc_table_selector)
+                    max_selector = slice_param->components[component].dc_table_selector;
+
+                if (max_selector < slice_param->components[component].ac_table_selector)
+                    max_selector = slice_param->components[component].ac_table_selector;
+            }
+
+            slice_param++;
+        }
+    }
+
+    assert(max_selector < 2);
+    gen8_mfd_jpeg_huff_table_state(ctx, decode_state, gen7_mfd_context, max_selector + 1);
+
+    /* Second walk: emit one BSD object per slice parameter. */
+    for (j = 0; j < decode_state->num_slice_params; j++) {
+        assert(decode_state->slice_params && decode_state->slice_params[j]->buffer);
+        slice_param = (VASliceParameterBufferJPEGBaseline *)decode_state->slice_params[j]->buffer;
+        slice_data_bo = decode_state->slice_datas[j]->bo;
+        gen8_mfd_ind_obj_base_addr_state(ctx, slice_data_bo, MFX_FORMAT_JPEG, gen7_mfd_context);
+
+        /* First slice parameter of the following buffer, if there is one. */
+        if (j == decode_state->num_slice_params - 1)
+            next_slice_group_param = NULL;
+        else
+            next_slice_group_param = (VASliceParameterBufferJPEGBaseline *)decode_state->slice_params[j + 1]->buffer;
+
+        for (i = 0; i < decode_state->slice_params[j]->num_elements; i++) {
+            assert(slice_param->slice_data_flag == VA_SLICE_DATA_FLAG_ALL);
+
+            if (i < decode_state->slice_params[j]->num_elements - 1)
+                next_slice_param = slice_param + 1;
+            else
+                next_slice_param = next_slice_group_param;
+
+            gen8_mfd_jpeg_bsd_object(ctx, pic_param, slice_param, next_slice_param, slice_data_bo, gen7_mfd_context);
+            slice_param++;
+        }
+    }
+
+    intel_batchbuffer_end_atomic(batch);
+    intel_batchbuffer_flush(batch);
+}
+
+/*
+ * Decode entry point installed as base.run by gen8_dec_hw_context_init().
+ *
+ * Validates the input with intel_decoder_sanity_check_input() and returns
+ * its status unchanged on failure.  Otherwise it dispatches on the profile
+ * to the matching codec routine and returns VA_STATUS_SUCCESS.  A profile
+ * that matches no case trips an assert.
+ */
+static VAStatus
+gen8_mfd_decode_picture(VADriverContextP ctx,
+                        VAProfile profile,
+                        union codec_state *codec_state,
+                        struct hw_context *hw_context)
+{
+    struct gen7_mfd_context *mfd = (struct gen7_mfd_context *)hw_context;
+    struct decode_state *state = &codec_state->decode;
+    VAStatus check;
+
+    assert(mfd);
+
+    check = intel_decoder_sanity_check_input(ctx, profile, state);
+    if (check != VA_STATUS_SUCCESS)
+        return check;
+
+    /* Reset the MPEG-2 slice position workaround state before every picture. */
+    mfd->wa_mpeg2_slice_vertical_position = -1;
+
+    switch (profile) {
+    case VAProfileMPEG2Simple:
+    case VAProfileMPEG2Main:
+        gen8_mfd_mpeg2_decode_picture(ctx, state, mfd);
+        break;
+
+    case VAProfileH264Baseline:
+    case VAProfileH264Main:
+    case VAProfileH264High:
+        gen8_mfd_avc_decode_picture(ctx, state, mfd);
+        break;
+
+    case VAProfileVC1Simple:
+    case VAProfileVC1Main:
+    case VAProfileVC1Advanced:
+        gen8_mfd_vc1_decode_picture(ctx, state, mfd);
+        break;
+
+    case VAProfileJPEGBaseline:
+        gen8_mfd_jpeg_decode_picture(ctx, state, mfd);
+        break;
+
+    default:
+        assert(0);
+        break;
+    }
+
+    return VA_STATUS_SUCCESS;
+}
+
+/*
+ * Destroy hook installed as base.destroy by gen8_dec_hw_context_init():
+ * drops the context's buffer references, frees its batch buffer and then
+ * the context itself.
+ *
+ * NOTE(review): the JPEG workaround surface (jpeg_wa_surface_id) is not
+ * released here; no driver context is available in this hook to call
+ * i965_DestroySurfaces() with.
+ */
+static void
+gen8_mfd_context_destroy(void *hw_context)
+{
+    struct gen7_mfd_context *mfd = (struct gen7_mfd_context *)hw_context;
+    dri_bo **const owned[] = {
+        &mfd->post_deblocking_output.bo,
+        &mfd->pre_deblocking_output.bo,
+        &mfd->intra_row_store_scratch_buffer.bo,
+        &mfd->deblocking_filter_row_store_scratch_buffer.bo,
+        &mfd->bsd_mpc_row_store_scratch_buffer.bo,
+        &mfd->mpr_row_store_scratch_buffer.bo,
+        &mfd->bitplane_read_buffer.bo,
+    };
+    unsigned int n;
+
+    /* Release each buffer and clear its slot, in declaration order. */
+    for (n = 0; n < ARRAY_ELEMS(owned); n++) {
+        dri_bo_unreference(*owned[n]);
+        *owned[n] = NULL;
+    }
+
+    dri_bo_unreference(mfd->jpeg_wa_slice_data_bo);
+
+    intel_batchbuffer_free(mfd->base.batch);
+    free(mfd);
+}
+
+/*
+ * MPEG-2 specific part of context creation: set all four
+ * load_*_quantiser_matrix flags of the cached MPEG-2 IQ matrix to -1
+ * (presumably meaning "nothing received yet" -- confirm against the
+ * MPEG-2 decode path).
+ */
+static void
+gen8_mfd_mpeg2_context_init(VADriverContextP ctx,
+                            struct gen7_mfd_context *gen7_mfd_context)
+{
+    /* luma matrices */
+    gen7_mfd_context->iq_matrix.mpeg2.load_non_intra_quantiser_matrix = -1;
+    gen7_mfd_context->iq_matrix.mpeg2.load_intra_quantiser_matrix = -1;
+
+    /* chroma matrices */
+    gen7_mfd_context->iq_matrix.mpeg2.load_chroma_non_intra_quantiser_matrix = -1;
+    gen7_mfd_context->iq_matrix.mpeg2.load_chroma_intra_quantiser_matrix = -1;
+}
+
+/*
+ * Allocate and initialise a gen8 decoder hardware context.
+ *
+ * Installs the destroy and run hooks, creates the batch buffer, marks all
+ * reference surface slots and the JPEG workaround surface as unused, and
+ * runs the codec specific initialisation for MPEG-2 and H.264 profiles.
+ *
+ * Returns the new context cast to struct hw_context *, or NULL when the
+ * context cannot be allocated.
+ */
+struct hw_context *
+gen8_dec_hw_context_init(VADriverContextP ctx, struct object_config *obj_config)
+{
+    struct intel_driver_data *intel = intel_driver_data(ctx);
+    struct gen7_mfd_context *gen7_mfd_context = calloc(1, sizeof(*gen7_mfd_context));
+    int i;
+
+    /* Out of memory: report failure instead of dereferencing NULL below. */
+    if (!gen7_mfd_context)
+        return NULL;
+
+    gen7_mfd_context->base.destroy = gen8_mfd_context_destroy;
+    gen7_mfd_context->base.run = gen8_mfd_decode_picture;
+    gen7_mfd_context->base.batch = intel_batchbuffer_new(intel, I915_EXEC_RENDER, 0);
+
+    for (i = 0; i < ARRAY_ELEMS(gen7_mfd_context->reference_surface); i++) {
+        gen7_mfd_context->reference_surface[i].surface_id = VA_INVALID_ID;
+        gen7_mfd_context->reference_surface[i].frame_store_id = -1;
+    }
+
+    /* No workaround surface yet; gen8_jpeg_wa_init() creates it on demand. */
+    gen7_mfd_context->jpeg_wa_surface_id = VA_INVALID_SURFACE;
+
+    switch (obj_config->profile) {
+    case VAProfileMPEG2Simple:
+    case VAProfileMPEG2Main:
+        gen8_mfd_mpeg2_context_init(ctx, gen7_mfd_context);
+        break;
+
+    case VAProfileH264Baseline:
+    case VAProfileH264Main:
+    case VAProfileH264High:
+        gen8_mfd_avc_context_init(ctx, gen7_mfd_context);
+        break;
+    default:
+        break;
+    }
+    return (struct hw_context *)gen7_mfd_context;
+}
diff --git a/src/gen8_vme.c b/src/gen8_vme.c
new file mode 100644 (file)
index 0000000..3fe1605
--- /dev/null
@@ -0,0 +1,1035 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Zhao Yakui <yakui.zhao@intel.com>
+ *    Xiang Haihao <haihao.xiang@intel.com>
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <assert.h>
+
+#include "intel_batchbuffer.h"
+#include "intel_driver.h"
+
+#include "i965_defines.h"
+#include "i965_drv_video.h"
+#include "i965_encoder.h"
+#include "gen6_vme.h"
+#include "gen6_mfc.h"
+
+/*
+ * Surface-state / binding-table layout.
+ *
+ * One surface-state slot is the larger of the 32-byte aligned GEN6 and
+ * GEN7 surface-state structures.  The binding table starts right after
+ * MAX_MEDIA_SURFACES_GEN6 such slots and holds one unsigned int per
+ * entry.  The macro arguments are parenthesised so that expressions such
+ * as "i + 1" can be passed safely.
+ */
+#define SURFACE_STATE_PADDED_SIZE_0_GEN7        ALIGN(sizeof(struct gen7_surface_state), 32)
+#define SURFACE_STATE_PADDED_SIZE_1_GEN7        ALIGN(sizeof(struct gen7_surface_state2), 32)
+#define SURFACE_STATE_PADDED_SIZE_GEN7          MAX(SURFACE_STATE_PADDED_SIZE_0_GEN7, SURFACE_STATE_PADDED_SIZE_1_GEN7)
+
+#define SURFACE_STATE_PADDED_SIZE_0_GEN6        ALIGN(sizeof(struct i965_surface_state), 32)
+#define SURFACE_STATE_PADDED_SIZE_1_GEN6        ALIGN(sizeof(struct i965_surface_state2), 32)
+#define SURFACE_STATE_PADDED_SIZE_GEN6          MAX(SURFACE_STATE_PADDED_SIZE_0_GEN6, SURFACE_STATE_PADDED_SIZE_1_GEN6)
+
+#define SURFACE_STATE_PADDED_SIZE               MAX(SURFACE_STATE_PADDED_SIZE_GEN6, SURFACE_STATE_PADDED_SIZE_GEN7)
+#define SURFACE_STATE_OFFSET(index)             (SURFACE_STATE_PADDED_SIZE * (index))
+#define BINDING_TABLE_OFFSET(index)             (SURFACE_STATE_OFFSET(MAX_MEDIA_SURFACES_GEN6) + sizeof(unsigned int) * (index))
+
+/* Kernel indices; each value equals the kernel's position in gen8_vme_kernels[]. */
+#define VME_INTRA_SHADER        0
+#define VME_INTER_SHADER        1
+#define VME_BINTER_SHADER       3
+#define VME_BATCHBUFFER         2
+
+#define CURBE_ALLOCATION_SIZE   37              /* in 256-bit */
+#define CURBE_TOTAL_DATA_LENGTH (4 * 32)        /* in byte, it should be less than or equal to CURBE_ALLOCATION_SIZE * 32 */
+#define CURBE_URB_ENTRY_LENGTH  4               /* in 256-bit, it should be less than or equal to CURBE_TOTAL_DATA_LENGTH / 32 */
+
+/* Length of the VME state message (presumably counted in dwords -- confirm at the allocation site). */
+#define VME_MSG_LENGTH          32
+  
+/*
+ * Pre-built kernel binaries for H.264 VME.  The instruction words are
+ * pulled in from the .g75b files under shaders/vme/ (the files named for
+ * Haswell are reused here); each row is one group of four 32-bit words.
+ */
+static const uint32_t gen8_vme_intra_frame[][4] = {
+#include "shaders/vme/intra_frame_haswell.g75b"
+};
+
+static const uint32_t gen8_vme_inter_frame[][4] = {
+#include "shaders/vme/inter_frame_haswell.g75b"
+};
+
+static const uint32_t gen8_vme_inter_bframe[][4] = {
+#include "shaders/vme/inter_bframe_haswell.g75b"
+};
+
+static const uint32_t gen8_vme_batchbuffer[][4] = {
+#include "shaders/vme/batchbuffer.g75b"
+};
+
+/*
+ * H.264 VME kernel table.  Each entry gives a name, the VME_* index, the
+ * binary and its size in bytes, and a trailing NULL (presumably the
+ * buffer object filled in when the kernel is loaded -- confirm against
+ * struct i965_kernel).  The entries are listed so that the array position
+ * equals the VME_* index: intra (0), inter (1), batchbuffer (2),
+ * B-frame inter (3).
+ */
+static struct i965_kernel gen8_vme_kernels[] = {
+    {
+        "VME Intra Frame",
+        VME_INTRA_SHADER, /*index*/
+        gen8_vme_intra_frame,                  
+        sizeof(gen8_vme_intra_frame),          
+        NULL
+    },
+    {
+        "VME inter Frame",
+        VME_INTER_SHADER,
+        gen8_vme_inter_frame,
+        sizeof(gen8_vme_inter_frame),
+        NULL
+    },
+    {
+        "VME BATCHBUFFER",
+        VME_BATCHBUFFER,
+        gen8_vme_batchbuffer,
+        sizeof(gen8_vme_batchbuffer),
+        NULL
+    },
+    {
+        "VME inter BFrame",
+        VME_BINTER_SHADER,
+        gen8_vme_inter_bframe,
+        sizeof(gen8_vme_inter_bframe),
+        NULL
+    }
+};
+
+/*
+ * Pre-built kernel binaries for MPEG-2 VME.  The intra and batchbuffer
+ * kernels come from the same .g75b files as the H.264 ones; only the
+ * inter kernel (mpeg2_inter_haswell.g75b) is MPEG-2 specific.
+ */
+static const uint32_t gen8_vme_mpeg2_intra_frame[][4] = {
+#include "shaders/vme/intra_frame_haswell.g75b"
+};
+
+static const uint32_t gen8_vme_mpeg2_inter_frame[][4] = {
+#include "shaders/vme/mpeg2_inter_haswell.g75b"
+};
+
+static const uint32_t gen8_vme_mpeg2_batchbuffer[][4] = {
+#include "shaders/vme/batchbuffer.g75b"
+};
+
+/*
+ * MPEG-2 VME kernel table: intra (0), inter (1) and batchbuffer (2).
+ * There is no B-frame inter kernel in this table.
+ */
+static struct i965_kernel gen8_vme_mpeg2_kernels[] = {
+    {
+        "VME Intra Frame",
+        VME_INTRA_SHADER, /*index*/
+        gen8_vme_mpeg2_intra_frame,                    
+        sizeof(gen8_vme_mpeg2_intra_frame),            
+        NULL
+    },
+    {
+        "VME inter Frame",
+        VME_INTER_SHADER,
+        gen8_vme_mpeg2_inter_frame,
+        sizeof(gen8_vme_mpeg2_inter_frame),
+        NULL
+    },
+    {
+        "VME BATCHBUFFER",
+        VME_BATCHBUFFER,
+        gen8_vme_mpeg2_batchbuffer,
+        sizeof(gen8_vme_mpeg2_batchbuffer),
+        NULL
+    },
+};
+
+/*
+ * Bind obj_surface as a VME source surface in slot "index".
+ *
+ * The work is delegated to the vme_surface2_setup hook of the VME
+ * context, which receives the binding-table and surface-state offsets
+ * derived from the slot number.  Only used for VME source surfaces.
+ */
+static void
+gen8_vme_source_surface_state(VADriverContextP ctx,
+                              int index,
+                              struct object_surface *obj_surface,
+                              struct intel_encoder_context *encoder_context)
+{
+    struct gen6_vme_context *vme = encoder_context->vme_context;
+
+    vme->vme_surface2_setup(ctx, &vme->gpe_context, obj_surface,
+                            BINDING_TABLE_OFFSET(index),
+                            SURFACE_STATE_OFFSET(index));
+}
+
+/*
+ * Bind obj_surface as a media read/write surface in slot "index" by
+ * calling the vme_media_rw_surface_setup hook of the VME context.
+ */
+static void
+gen8_vme_media_source_surface_state(VADriverContextP ctx,
+                                    int index,
+                                    struct object_surface *obj_surface,
+                                    struct intel_encoder_context *encoder_context)
+{
+    struct gen6_vme_context *vme = encoder_context->vme_context;
+
+    vme->vme_media_rw_surface_setup(ctx, &vme->gpe_context, obj_surface,
+                                    BINDING_TABLE_OFFSET(index),
+                                    SURFACE_STATE_OFFSET(index));
+}
+
+/*
+ * Bind the chroma view of obj_surface in slot "index" by calling the
+ * vme_media_chroma_surface_setup hook of the VME context.
+ */
+static void
+gen8_vme_media_chroma_source_surface_state(VADriverContextP ctx,
+                                           int index,
+                                           struct object_surface *obj_surface,
+                                           struct intel_encoder_context *encoder_context)
+{
+    struct gen6_vme_context *vme = encoder_context->vme_context;
+
+    vme->vme_media_chroma_surface_setup(ctx, &vme->gpe_context, obj_surface,
+                                        BINDING_TABLE_OFFSET(index),
+                                        SURFACE_STATE_OFFSET(index));
+}
+
+/*
+ * Allocate the H.264 VME output buffer and bind it in slot "index".
+ *
+ * The buffer holds one record per macroblock of the picture (dimensions
+ * taken from the sequence parameters).  The record size depends on
+ * whether the first slice is an I slice:
+ *   intra: INTRA_VME_OUTPUT_IN_BYTES * 2
+ *   inter: INTRA_VME_OUTPUT_IN_BYTES * 24
+ *          (32-byte intra search + 16 IME info + 128 IME MV + 32 IME ref
+ *           + 16 FBR info + 128 FBR MV + 32 FBR ref
+ *           = 16 * (2 + 2 * (1 + 8 + 2)) = 16 * 24)
+ */
+static void
+gen8_vme_output_buffer_setup(VADriverContextP ctx,
+                             struct encode_state *encode_state,
+                             int index,
+                             struct intel_encoder_context *encoder_context)
+{
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    struct gen6_vme_context *vme = encoder_context->vme_context;
+    VAEncSequenceParameterBufferH264 *seq =
+        (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
+    VAEncSliceParameterBufferH264 *first_slice =
+        (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
+    int mbs_wide = seq->picture_width_in_mbs;
+    int mbs_high = seq->picture_height_in_mbs;
+
+    vme->vme_output.num_blocks = mbs_wide * mbs_high;
+    vme->vme_output.pitch = 16; /* in bytes, always 16 */
+    vme->vme_output.size_block = (first_slice->slice_type == SLICE_TYPE_I) ?
+        INTRA_VME_OUTPUT_IN_BYTES * 2 :
+        INTRA_VME_OUTPUT_IN_BYTES * 24;
+
+    vme->vme_output.bo = dri_bo_alloc(i965->intel.bufmgr,
+                                      "VME output buffer",
+                                      vme->vme_output.num_blocks * vme->vme_output.size_block,
+                                      0x1000);
+    assert(vme->vme_output.bo);
+
+    vme->vme_buffer_suface_setup(ctx, &vme->gpe_context, &vme->vme_output,
+                                 BINDING_TABLE_OFFSET(index),
+                                 SURFACE_STATE_OFFSET(index));
+}
+
+/*
+ * Allocate the second-level VME batch buffer (H.264) and bind it in
+ * slot "index".
+ *
+ * The buffer has one 64-byte block per macroblock of the picture plus
+ * one extra block.  The allocation is asserted, matching
+ * gen8_vme_output_buffer_setup(), because the buffer is mapped and
+ * written unconditionally when the batch is filled.
+ */
+static void
+gen8_vme_output_vme_batchbuffer_setup(VADriverContextP ctx,
+                                       struct encode_state *encode_state,
+                                       int index,
+                                       struct intel_encoder_context *encoder_context)
+
+{
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+    VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
+    int width_in_mbs = pSequenceParameter->picture_width_in_mbs;
+    int height_in_mbs = pSequenceParameter->picture_height_in_mbs;
+
+    vme_context->vme_batchbuffer.num_blocks = width_in_mbs * height_in_mbs + 1;
+    vme_context->vme_batchbuffer.size_block = 64; /* 4 OWORDs */
+    vme_context->vme_batchbuffer.pitch = 16;
+    vme_context->vme_batchbuffer.bo = dri_bo_alloc(i965->intel.bufmgr,
+                                                   "VME batchbuffer",
+                                                   vme_context->vme_batchbuffer.num_blocks * vme_context->vme_batchbuffer.size_block,
+                                                   0x1000);
+    assert(vme_context->vme_batchbuffer.bo);
+    vme_context->vme_buffer_suface_setup(ctx,
+                                         &vme_context->gpe_context,
+                                         &vme_context->vme_batchbuffer,
+                                         BINDING_TABLE_OFFSET(index),
+                                         SURFACE_STATE_OFFSET(index));
+}
+
+/*
+ * Set up every surface used by the H.264 VME kernels.
+ *
+ * Slot assignment:
+ *   0    input picture (VME source)
+ *   1, 2 reference pictures 0 and 1 (inter only, skipped when absent)
+ *   3    VME output buffer
+ *   4    input picture (media read/write)
+ *   5    second-level VME batch buffer
+ *   6    input picture chroma
+ *
+ * Always returns VA_STATUS_SUCCESS.
+ */
+static VAStatus
+gen8_vme_surface_setup(VADriverContextP ctx,
+                       struct encode_state *encode_state,
+                       int is_intra,
+                       struct intel_encoder_context *encoder_context)
+{
+    struct object_surface *input = encode_state->input_yuv_object;
+    int ref;
+
+    /* current picture for encoding */
+    gen8_vme_source_surface_state(ctx, 0, input, encoder_context);
+    gen8_vme_media_source_surface_state(ctx, 4, input, encoder_context);
+    gen8_vme_media_chroma_source_surface_state(ctx, 6, input, encoder_context);
+
+    if (!is_intra) {
+        /* reference N lives in slot N + 1 */
+        for (ref = 0; ref < 2; ref++) {
+            struct object_surface *ref_surface = encode_state->reference_objects[ref];
+
+            if (ref_surface && ref_surface->bo)
+                gen8_vme_source_surface_state(ctx, ref + 1, ref_surface, encoder_context);
+        }
+    }
+
+    /* VME output */
+    gen8_vme_output_buffer_setup(ctx, encode_state, 3, encoder_context);
+    gen8_vme_output_vme_batchbuffer_setup(ctx, encode_state, 5, encoder_context);
+
+    return VA_STATUS_SUCCESS;
+}
+
+/*
+ * Fill the interface descriptor table (gpe_context.idrt.bo) with one
+ * descriptor per loaded VME kernel.
+ *
+ * For each of the vme_kernel_sum kernels the descriptor is zeroed, then
+ * the kernel start pointer, the binding table pointer and the CURBE read
+ * length are programmed, and a relocation is emitted so that the kernel
+ * address in desc0 follows the kernel buffer object.
+ *
+ * Always returns VA_STATUS_SUCCESS.
+ */
+static VAStatus gen8_vme_interface_setup(VADriverContextP ctx, 
+                                          struct encode_state *encode_state,
+                                          struct intel_encoder_context *encoder_context)
+{
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+    struct gen6_interface_descriptor_data *desc;   
+    int i;
+    dri_bo *bo;
+
+    /* Map the descriptor table for writing. */
+    bo = vme_context->gpe_context.idrt.bo;
+    dri_bo_map(bo, 1);
+    assert(bo->virtual);
+    desc = bo->virtual;
+
+    for (i = 0; i < vme_context->vme_kernel_sum; i++) {
+        struct i965_kernel *kernel;
+        kernel = &vme_context->gpe_context.kernels[i];
+        assert(sizeof(*desc) == 32);
+        /*Setup the descriptor table*/
+        memset(desc, 0, sizeof(*desc));
+        /* The pointer field takes the kernel offset shifted right by 6. */
+        desc->desc0.kernel_start_pointer = (kernel->bo->offset >> 6);
+        desc->desc2.sampler_count = 0; /* FIXME: */
+        desc->desc2.sampler_state_pointer = 0;
+        desc->desc3.binding_table_entry_count = 1; /* FIXME: */
+        /* All kernels share the binding table that starts at entry 0. */
+        desc->desc3.binding_table_pointer = (BINDING_TABLE_OFFSET(0) >> 5);
+        desc->desc4.constant_urb_entry_read_offset = 0;
+        desc->desc4.constant_urb_entry_read_length = CURBE_URB_ENTRY_LENGTH;
+               
+        /*kernel start*/
+        /* Relocate desc0 of descriptor i against the kernel's buffer object. */
+        dri_bo_emit_reloc(bo,  
+                          I915_GEM_DOMAIN_INSTRUCTION, 0,
+                          0,
+                          i * sizeof(*desc) + offsetof(struct gen6_interface_descriptor_data, desc0),
+                          kernel->bo);
+        desc++;
+    }
+    dri_bo_unmap(bo);
+
+    return VA_STATUS_SUCCESS;
+}
+
+/*
+ * Upload the VME state message to the CURBE buffer.
+ *
+ * Dword 31 of the state message is first set to a motion-vector count
+ * chosen from the codec:
+ *   H.264, level >= 31 : 8
+ *   H.264, level == 30 : 16
+ *   MPEG-2             : 2
+ *   anything else      : 32
+ * The first 128 bytes of the message are then copied into the mapped
+ * CURBE buffer object.  The search path is fixed and is built directly
+ * in the GPU shader, so only the cost table travels through the
+ * constant buffer.
+ *
+ * Always returns VA_STATUS_SUCCESS.
+ */
+static VAStatus gen8_vme_constant_setup(VADriverContextP ctx,
+                                        struct encode_state *encode_state,
+                                        struct intel_encoder_context *encoder_context)
+{
+    struct gen6_vme_context *vme = encoder_context->vme_context;
+    unsigned int *state_msg = (unsigned int *)vme->vme_state_message;
+    dri_bo *curbe_bo = vme->gpe_context.curbe.bo;
+    unsigned char *curbe_data;
+    int mv_count = 32;
+
+    if (encoder_context->codec == CODEC_H264) {
+        if (vme->h264_level >= 31)
+            mv_count = 8;
+        else if (vme->h264_level >= 30)
+            mv_count = 16;
+    } else if (encoder_context->codec == CODEC_MPEG2) {
+        mv_count = 2;
+    }
+
+    state_msg[31] = mv_count;
+
+    dri_bo_map(curbe_bo, 1);
+    assert(curbe_bo->virtual);
+    curbe_data = curbe_bo->virtual;
+
+    /* VME MV/Mb cost table is passed by using const buffer */
+    memcpy(curbe_data, (char *)vme->vme_state_message, 128);
+
+    dri_bo_unmap(curbe_bo);
+
+    return VA_STATUS_SUCCESS;
+}
+
+/*
+ * Intra macroblock mode cost word, one entry per QP value 0..51.
+ * gen8_vme_state_setup_fixup() indexes this table with the slice QP and
+ * stores the selected word in dword 0 of the VME state message.
+ */
+static const unsigned int intra_mb_mode_cost_table[] = {
+    0x31110001, // for qp0
+    0x09110001, // for qp1
+    0x15030001, // for qp2
+    0x0b030001, // for qp3
+    0x0d030011, // for qp4
+    0x17210011, // for qp5
+    0x41210011, // for qp6
+    0x19210011, // for qp7
+    0x25050003, // for qp8
+    0x1b130003, // for qp9
+    0x1d130003, // for qp10
+    0x27070021, // for qp11
+    0x51310021, // for qp12
+    0x29090021, // for qp13
+    0x35150005, // for qp14
+    0x2b0b0013, // for qp15
+    0x2d0d0013, // for qp16
+    0x37170007, // for qp17
+    0x61410031, // for qp18
+    0x39190009, // for qp19
+    0x45250015, // for qp20
+    0x3b1b000b, // for qp21
+    0x3d1d000d, // for qp22
+    0x47270017, // for qp23
+    0x71510041, // for qp24 ! center for qp=0..30
+    0x49290019, // for qp25
+    0x55350025, // for qp26
+    0x4b2b001b, // for qp27
+    0x4d2d001d, // for qp28
+    0x57370027, // for qp29
+    0x81610051, // for qp30
+    0x57270017, // for qp31
+    0x81510041, // for qp32 ! center for qp=31..51
+    0x59290019, // for qp33
+    0x65350025, // for qp34
+    0x5b2b001b, // for qp35
+    0x5d2d001d, // for qp36
+    0x67370027, // for qp37
+    0x91610051, // for qp38
+    0x69390029, // for qp39
+    0x75450035, // for qp40
+    0x6b3b002b, // for qp41
+    0x6d3d002d, // for qp42
+    0x77470037, // for qp43
+    0xa1710061, // for qp44
+    0x79490039, // for qp45
+    0x85550045, // for qp46
+    0x7b4b003b, // for qp47
+    0x7d4d003d, // for qp48
+    0x87570047, // for qp49
+    0xb1810071, // for qp50
+    0x89590049  // for qp51
+};
+
+/*
+ * H.264 fixup of the VME state message: for I and SI slices, replace
+ * dword 0 with the intra mode cost for the slice QP.
+ *
+ * The QP is pic_init_qp + slice_qp_delta in CQP mode, otherwise the
+ * QpPrimeY computed by bit-rate control.  It is clamped to the range
+ * covered by intra_mb_mode_cost_table so that out-of-range application
+ * parameters cannot cause a read outside the table.
+ *
+ * The bit-rate control context is looked up with SLICE_TYPE_I for both
+ * I and SI slices: only intra slices reach that point, and indexing with
+ * the raw SI slice type would select a different (presumably
+ * non-existent) entry -- NOTE(review): confirm against the declaration
+ * of bit_rate_control_context.
+ *
+ * Non-intra slices leave the message untouched.
+ */
+static void gen8_vme_state_setup_fixup(VADriverContextP ctx,
+                                        struct encode_state *encode_state,
+                                        struct intel_encoder_context *encoder_context,
+                                        unsigned int *vme_state_message)
+{
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    VAEncPictureParameterBufferH264 *pic_param = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
+    VAEncSliceParameterBufferH264 *slice_param = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
+    const int max_qp = (int)(sizeof(intra_mb_mode_cost_table) / sizeof(intra_mb_mode_cost_table[0])) - 1;
+    int qp;
+
+    if (slice_param->slice_type != SLICE_TYPE_I &&
+        slice_param->slice_type != SLICE_TYPE_SI)
+        return;
+
+    if (encoder_context->rate_control_mode == VA_RC_CQP)
+        qp = pic_param->pic_init_qp + slice_param->slice_qp_delta;
+    else
+        qp = mfc_context->bit_rate_control_context[SLICE_TYPE_I].QpPrimeY;
+
+    /* Keep the table index inside [0, max_qp]. */
+    if (qp < 0)
+        qp = 0;
+    else if (qp > max_qp)
+        qp = max_qp;
+
+    vme_state_message[0] = intra_mb_mode_cost_table[qp];
+}
+
+/*
+ * Write the default MV/MB cost values into dwords 0..7 of the VME state
+ * message and, for H.264, let gen8_vme_state_setup_fixup() adjust
+ * dword 0 for intra slices.  Other codecs get no fixup.
+ *
+ * Always returns VA_STATUS_SUCCESS.
+ */
+static VAStatus gen8_vme_vme_state_setup(VADriverContextP ctx,
+                                         struct encode_state *encode_state,
+                                         int is_intra,
+                                         struct intel_encoder_context *encoder_context)
+{
+    /* default contents of the first eight dwords of the message */
+    static const unsigned int default_costs[8] = {
+        0x4a4a4a4a, 0x4a4a4a4a, 0x4a4a4a4a, 0x22120200,
+        0x62524232, 0,          0,          0
+    };
+    struct gen6_vme_context *vme = encoder_context->vme_context;
+    unsigned int *msg;
+    int dw;
+
+    assert(vme->vme_state_message);
+    msg = (unsigned int *)vme->vme_state_message;
+
+    for (dw = 0; dw < 8; dw++)
+        msg[dw] = default_costs[dw];
+
+    if (encoder_context->codec == CODEC_H264)
+        gen8_vme_state_setup_fixup(ctx, encode_state, encoder_context, msg);
+
+    return VA_STATUS_SUCCESS;
+}
+
+
+/*
+ * Fill the second-level VME batch buffer with one MEDIA_OBJECT command
+ * per macroblock, slice by slice, in macroblock address order.
+ *
+ * Each command is 8 dwords: the header, the kernel index, four zero
+ * dwords and two dwords of inline data carrying the macroblock position,
+ * the picture width in macroblocks, transform_8x8_mode_flag and the
+ * neighbour-availability mask.  The buffer is terminated with a zero
+ * dword followed by MI_BATCH_BUFFER_END.
+ *
+ * Only the first element of each slice parameter buffer is read.
+ * mb_height is not used.
+ */
+static void
+gen8_vme_fill_vme_batchbuffer(VADriverContextP ctx, 
+                               struct encode_state *encode_state,
+                               int mb_width, int mb_height,
+                               int kernel,
+                               int transform_8x8_mode_flag,
+                               struct intel_encoder_context *encoder_context)
+{
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+    int mb_x = 0, mb_y = 0;
+    int i, s;
+    unsigned int *command_ptr;
+
+    dri_bo_map(vme_context->vme_batchbuffer.bo, 1);
+    command_ptr = vme_context->vme_batchbuffer.bo->virtual;
+
+    for (s = 0; s < encode_state->num_slice_params_ext; s++) {
+        VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[s]->buffer; 
+        int slice_mb_begin = pSliceParameter->macroblock_address;
+        int slice_mb_number = pSliceParameter->num_macroblocks;
+        unsigned int mb_intra_ub;
+       /* column of the slice's first macroblock; non-zero when the slice starts mid-row */
+       int slice_mb_x = pSliceParameter->macroblock_address % mb_width; 
+        for (i = 0; i < slice_mb_number;  ) {
+            int mb_count = i + slice_mb_begin;    
+            mb_x = mb_count % mb_width;
+            mb_y = mb_count / mb_width;
+           /* Start from availability implied by the position in the picture. */
+           mb_intra_ub = 0;
+           if (mb_x != 0) {
+               mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
+           }
+           if (mb_y != 0) {
+               mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
+               if (mb_x != 0)
+                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
+               if (mb_x != (mb_width -1))
+                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
+           }
+           /*
+            * Then restrict it to the slice: for the first mb_width
+            * macroblocks of the slice the B/C/D flags are cleared, the
+            * very first macroblock also loses AE, and C is set again for
+            * macroblock mb_width - 1 when the slice starts mid-row.
+            */
+           if (i < mb_width) {
+               if (i == 0)
+                    mb_intra_ub &= ~(INTRA_PRED_AVAIL_FLAG_AE);
+               mb_intra_ub &= ~(INTRA_PRED_AVAIL_FLAG_BCD_MASK);
+               if ((i == (mb_width - 1)) && slice_mb_x) {
+                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
+               }
+           }
+               
+           /* Macroblock number mb_width of a slice that starts mid-row loses D. */
+           if ((i == mb_width) && slice_mb_x) {
+               mb_intra_ub &= ~(INTRA_PRED_AVAIL_FLAG_D);
+           }
+            *command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
+            *command_ptr++ = kernel;
+            *command_ptr++ = 0;
+            *command_ptr++ = 0;
+            *command_ptr++ = 0;
+            *command_ptr++ = 0;
+   
+            /*inline data */
+            *command_ptr++ = (mb_width << 16 | mb_y << 8 | mb_x);
+            *command_ptr++ = ( (1 << 16) | transform_8x8_mode_flag | (mb_intra_ub << 8));
+
+            i += 1;
+        } 
+    }
+
+    /* Terminate the second-level batch. */
+    *command_ptr++ = 0;
+    *command_ptr++ = MI_BATCH_BUFFER_END;
+
+    dri_bo_unmap(vme_context->vme_batchbuffer.bo);
+}
+
+/*
+ * Prepare the VME context for a new frame: re-initialise the GPE
+ * context and drop the references to the per-frame buffer objects
+ * (VME output, second-level batch buffer and VME state), leaving the
+ * pointers NULL.
+ */
+static void
+gen8_vme_media_init(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
+{
+    struct gen6_vme_context *vme = encoder_context->vme_context;
+
+    i965_gpe_context_init(ctx, &vme->gpe_context);
+
+    /* VME output buffer */
+    dri_bo_unreference(vme->vme_output.bo);
+    vme->vme_output.bo = NULL;
+
+    /* second-level batch buffer */
+    dri_bo_unreference(vme->vme_batchbuffer.bo);
+    vme->vme_batchbuffer.bo = NULL;
+
+    /* VME state */
+    dri_bo_unreference(vme->vme_state.bo);
+    vme->vme_state.bo = NULL;
+}
+
+/*
+ * Build the H.264 VME command stream for one frame.
+ *
+ * 1. Decide whether the walker-based fill may be used: it is allowed
+ *    only when every slice starts at the beginning of a macroblock row.
+ * 2. Pick the kernel from the slice type: intra for I/SI, inter for
+ *    P/SP, B-frame inter otherwise.  Without the walker-based fill the
+ *    plain inter kernel is used for B slices as well.
+ * 3. Fill the second-level batch buffer with the chosen fill routine.
+ * 4. Emit the GPE pipeline setup and an MI_BATCH_BUFFER_START that
+ *    chains to the second-level batch buffer.
+ *
+ * NOTE(review): the slice type is read from the slice at which the scan
+ * in step 1 stopped (the last slice when the scan runs to completion),
+ * not necessarily slice 0.
+ */
+static void gen8_vme_pipeline_programing(VADriverContextP ctx,
+                                          struct encode_state *encode_state,
+                                          struct intel_encoder_context *encoder_context)
+{
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+    struct intel_batchbuffer *batch = encoder_context->base.batch;
+    VAEncPictureParameterBufferH264 *pPicParameter = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
+    VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
+    VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
+    int width_in_mbs = pSequenceParameter->picture_width_in_mbs;
+    int height_in_mbs = pSequenceParameter->picture_height_in_mbs;
+    int kernel_shader;
+    bool allow_hwscore = true;
+    int s;
+
+    for (s = 0; s < encode_state->num_slice_params_ext; s++) {
+        pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[s]->buffer;
+        if ((pSliceParameter->macroblock_address % width_in_mbs)) {
+            /* slice starts in the middle of a row */
+            allow_hwscore = false;
+            break;
+        }
+    }
+
+    /* SI slices are intra coded too and must use the intra kernel. */
+    if ((pSliceParameter->slice_type == SLICE_TYPE_I) ||
+        (pSliceParameter->slice_type == SLICE_TYPE_SI)) {
+        kernel_shader = VME_INTRA_SHADER;
+    } else if ((pSliceParameter->slice_type == SLICE_TYPE_P) ||
+               (pSliceParameter->slice_type == SLICE_TYPE_SP)) {
+        kernel_shader = VME_INTER_SHADER;
+    } else {
+        kernel_shader = VME_BINTER_SHADER;
+        if (!allow_hwscore)
+            kernel_shader = VME_INTER_SHADER;
+    }
+
+    if (allow_hwscore)
+        gen7_vme_walker_fill_vme_batchbuffer(ctx,
+                                             encode_state,
+                                             width_in_mbs, height_in_mbs,
+                                             kernel_shader,
+                                             pPicParameter->pic_fields.bits.transform_8x8_mode_flag,
+                                             encoder_context);
+    else
+        gen8_vme_fill_vme_batchbuffer(ctx,
+                                      encode_state,
+                                      width_in_mbs, height_in_mbs,
+                                      kernel_shader,
+                                      pPicParameter->pic_fields.bits.transform_8x8_mode_flag,
+                                      encoder_context);
+
+    intel_batchbuffer_start_atomic(batch, 0x1000);
+    gen6_gpe_pipeline_setup(ctx, &vme_context->gpe_context, batch);
+    /*
+     * Chain to the second-level batch buffer.
+     * NOTE(review): this is the two-dword form of MI_BATCH_BUFFER_START;
+     * confirm it matches the command layout expected on this hardware.
+     */
+    BEGIN_BATCH(batch, 2);
+    OUT_BATCH(batch, MI_BATCH_BUFFER_START | (2 << 6));
+    OUT_RELOC(batch,
+              vme_context->vme_batchbuffer.bo,
+              I915_GEM_DOMAIN_COMMAND, 0,
+              0);
+    ADVANCE_BATCH(batch);
+
+    intel_batchbuffer_end_atomic(batch);
+}
+
+/*
+ * Prepare everything the H.264 VME stage needs for one frame: record
+ * the stream level, refresh the MB/MV cost table, set up the surfaces,
+ * the interface descriptors and the constant buffer, then program the
+ * media pipeline.
+ *
+ * Whether the frame is treated as intra is decided from the type of the
+ * first slice only.  gen8_vme_vme_state_setup() is deliberately not
+ * called from here.
+ *
+ * Always returns VA_STATUS_SUCCESS.
+ */
+static VAStatus gen8_vme_prepare(VADriverContextP ctx,
+                                 struct encode_state *encode_state,
+                                 struct intel_encoder_context *encoder_context)
+{
+    struct gen6_vme_context *vme = encoder_context->vme_context;
+    VAEncSequenceParameterBufferH264 *seq =
+        (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
+    VAEncSliceParameterBufferH264 *first_slice =
+        (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
+    int is_intra = first_slice->slice_type == SLICE_TYPE_I;
+
+    /* Track the level of the current sequence. */
+    vme->h264_level = seq->level_idc;
+
+    intel_vme_update_mbmv_cost(ctx, encode_state, encoder_context);
+
+    /*Setup all the memory object*/
+    gen8_vme_surface_setup(ctx, encode_state, is_intra, encoder_context);
+    gen8_vme_interface_setup(ctx, encode_state, encoder_context);
+    gen8_vme_constant_setup(ctx, encode_state, encoder_context);
+
+    /*Programing media pipeline*/
+    gen8_vme_pipeline_programing(ctx, encode_state, encoder_context);
+
+    return VA_STATUS_SUCCESS;
+}
+
+/*
+ * Submit the VME work by flushing the encoder's batch buffer.
+ * Always returns VA_STATUS_SUCCESS.
+ */
+static VAStatus gen8_vme_run(VADriverContextP ctx,
+                             struct encode_state *encode_state,
+                             struct intel_encoder_context *encoder_context)
+{
+    intel_batchbuffer_flush(encoder_context->base.batch);
+
+    return VA_STATUS_SUCCESS;
+}
+
+/*
+ * Final step of the VME pipeline.  Nothing has to be done here; the
+ * function only reports success.
+ */
+static VAStatus gen8_vme_stop(VADriverContextP ctx,
+                              struct encode_state *encode_state,
+                              struct intel_encoder_context *encoder_context)
+{
+    (void)ctx;
+    (void)encode_state;
+    (void)encoder_context;
+
+    return VA_STATUS_SUCCESS;
+}
+
+/*
+ * Run the complete H.264 VME stage for one frame: init, prepare, run
+ * and stop, in that order.  The status codes of the individual steps
+ * are not inspected; VA_STATUS_SUCCESS is always returned.  The profile
+ * argument is not used.
+ */
+static VAStatus
+gen8_vme_pipeline(VADriverContextP ctx,
+                  VAProfile profile,
+                  struct encode_state *encode_state,
+                  struct intel_encoder_context *encoder_context)
+{
+    gen8_vme_media_init(ctx, encoder_context);
+
+    (void)gen8_vme_prepare(ctx, encode_state, encoder_context);
+    (void)gen8_vme_run(ctx, encode_state, encoder_context);
+    (void)gen8_vme_stop(ctx, encode_state, encoder_context);
+
+    return VA_STATUS_SUCCESS;
+}
+
+/*
+ * Allocate the MPEG-2 VME output buffer and bind it in slot "index".
+ *
+ * The picture size in macroblocks is derived from the pixel dimensions
+ * in the sequence parameters, rounded up to a multiple of 16.  The
+ * buffer holds one record per macroblock:
+ *   intra: INTRA_VME_OUTPUT_IN_BYTES * 2
+ *   inter: INTRA_VME_OUTPUT_IN_BYTES * 24
+ *          (32-byte intra search + 16 IME info + 128 IME MV + 32 IME ref
+ *           + 16 FBR info + 128 FBR MV + 32 FBR ref
+ *           = 16 * (2 + 2 * (1 + 8 + 2)) = 16 * 24)
+ */
+static void
+gen8_vme_mpeg2_output_buffer_setup(VADriverContextP ctx,
+                                   struct encode_state *encode_state,
+                                   int index,
+                                   int is_intra,
+                                   struct intel_encoder_context *encoder_context)
+{
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    struct gen6_vme_context *vme = encoder_context->vme_context;
+    VAEncSequenceParameterBufferMPEG2 *seq =
+        (VAEncSequenceParameterBufferMPEG2 *)encode_state->seq_param_ext->buffer;
+    int mbs_wide = ALIGN(seq->picture_width, 16) / 16;
+    int mbs_high = ALIGN(seq->picture_height, 16) / 16;
+
+    vme->vme_output.num_blocks = mbs_wide * mbs_high;
+    vme->vme_output.pitch = 16; /* in bytes, always 16 */
+    vme->vme_output.size_block = is_intra ?
+        INTRA_VME_OUTPUT_IN_BYTES * 2 :
+        INTRA_VME_OUTPUT_IN_BYTES * 24;
+
+    vme->vme_output.bo = dri_bo_alloc(i965->intel.bufmgr,
+                                      "VME output buffer",
+                                      vme->vme_output.num_blocks * vme->vme_output.size_block,
+                                      0x1000);
+    assert(vme->vme_output.bo);
+
+    vme->vme_buffer_suface_setup(ctx, &vme->gpe_context, &vme->vme_output,
+                                 BINDING_TABLE_OFFSET(index),
+                                 SURFACE_STATE_OFFSET(index));
+}
+
+/*
+ * Allocate the second-level VME batch buffer (MPEG-2) and bind it in
+ * slot "index".
+ *
+ * The buffer has one 64-byte block per macroblock of the picture plus
+ * one extra block; the picture size in macroblocks is derived from the
+ * pixel dimensions rounded up to a multiple of 16.  The allocation is
+ * asserted, matching gen8_vme_mpeg2_output_buffer_setup(), because the
+ * buffer is mapped and written unconditionally when the batch is filled.
+ */
+static void
+gen8_vme_mpeg2_output_vme_batchbuffer_setup(VADriverContextP ctx,
+                                             struct encode_state *encode_state,
+                                             int index,
+                                             struct intel_encoder_context *encoder_context)
+
+{
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+    VAEncSequenceParameterBufferMPEG2 *seq_param = (VAEncSequenceParameterBufferMPEG2 *)encode_state->seq_param_ext->buffer;
+    int width_in_mbs = ALIGN(seq_param->picture_width, 16) / 16;
+    int height_in_mbs = ALIGN(seq_param->picture_height, 16) / 16;
+
+    vme_context->vme_batchbuffer.num_blocks = width_in_mbs * height_in_mbs + 1;
+    vme_context->vme_batchbuffer.size_block = 64; /* 4 OWORDs */
+    vme_context->vme_batchbuffer.pitch = 16;
+    vme_context->vme_batchbuffer.bo = dri_bo_alloc(i965->intel.bufmgr,
+                                                   "VME batchbuffer",
+                                                   vme_context->vme_batchbuffer.num_blocks * vme_context->vme_batchbuffer.size_block,
+                                                   0x1000);
+    assert(vme_context->vme_batchbuffer.bo);
+    vme_context->vme_buffer_suface_setup(ctx,
+                                         &vme_context->gpe_context,
+                                         &vme_context->vme_batchbuffer,
+                                         BINDING_TABLE_OFFSET(index),
+                                         SURFACE_STATE_OFFSET(index));
+}
+
+/*
+ * Set up every surface used by the MPEG-2 VME kernels.
+ *
+ * Slot assignment:
+ *   0    input picture (VME source)
+ *   1, 2 reference pictures 0 and 1 (inter only, skipped when absent)
+ *   3    VME output buffer
+ *   4    input picture (media read/write)
+ *   5    second-level VME batch buffer
+ *   6    input picture chroma
+ *
+ * Both reference entries are checked for NULL before their buffer
+ * object is inspected, as the H.264 surface setup does.
+ *
+ * Always returns VA_STATUS_SUCCESS.
+ */
+static VAStatus
+gen8_vme_mpeg2_surface_setup(VADriverContextP ctx,
+                              struct encode_state *encode_state,
+                              int is_intra,
+                              struct intel_encoder_context *encoder_context)
+{
+    struct object_surface *obj_surface;
+
+    /*Setup surfaces state*/
+    /* current picture for encoding */
+    obj_surface = encode_state->input_yuv_object;
+    gen8_vme_source_surface_state(ctx, 0, obj_surface, encoder_context);
+    gen8_vme_media_source_surface_state(ctx, 4, obj_surface, encoder_context);
+    gen8_vme_media_chroma_source_surface_state(ctx, 6, obj_surface, encoder_context);
+
+    if (!is_intra) {
+        /* reference 0 */
+        obj_surface = encode_state->reference_objects[0];
+
+        if (obj_surface && obj_surface->bo != NULL)
+            gen8_vme_source_surface_state(ctx, 1, obj_surface, encoder_context);
+
+        /* reference 1 */
+        obj_surface = encode_state->reference_objects[1];
+
+        if (obj_surface && obj_surface->bo != NULL)
+            gen8_vme_source_surface_state(ctx, 2, obj_surface, encoder_context);
+    }
+
+    /* VME output */
+    gen8_vme_mpeg2_output_buffer_setup(ctx, encode_state, 3, is_intra, encoder_context);
+    gen8_vme_mpeg2_output_vme_batchbuffer_setup(ctx, encode_state, 5, encoder_context);
+
+    return VA_STATUS_SUCCESS;
+}
+
+/*
+ * Write the VME batch buffer for one MPEG-2 picture.
+ *
+ * For each macroblock of each slice-parameter element an 8-dword
+ * MEDIA_OBJECT command is appended to vme_batchbuffer.bo:
+ *   dword 0    : CMD_MEDIA_OBJECT | (8 - 2)
+ *   dword 1    : kernel
+ *   dword 2..5 : zero
+ *   dword 6    : mb_width << 16 | mb_y << 8 | mb_x       (inline data)
+ *   dword 7    : 1 << 16 | mb_intra_ub << 8 | transform_8x8_mode_flag
+ * After the last macroblock a zero dword and MI_BATCH_BUFFER_END are
+ * written.
+ *
+ * mb_intra_ub is a mask of INTRA_PRED_AVAIL_FLAG_* bits.  It is first
+ * derived from the macroblock position inside the picture and then
+ * adjusted according to the macroblock position inside its slice.
+ * NOTE(review): the flags presumably describe which neighbouring
+ * macroblocks may be used for prediction - confirm against the kernel.
+ *
+ * mb_height is currently unused.
+ */
+static void
+gen8_vme_mpeg2_fill_vme_batchbuffer(VADriverContextP ctx,
+                                     struct encode_state *encode_state,
+                                     int mb_width, int mb_height,
+                                     int kernel,
+                                     int transform_8x8_mode_flag,
+                                     struct intel_encoder_context *encoder_context)
+{
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+    int mb_x = 0, mb_y = 0;
+    int i, s, j;
+    unsigned int *command_ptr;
+
+    dri_bo_map(vme_context->vme_batchbuffer.bo, 1);
+    /* A failed mapping would otherwise surface as a NULL store below. */
+    assert(vme_context->vme_batchbuffer.bo->virtual);
+    command_ptr = vme_context->vme_batchbuffer.bo->virtual;
+
+    for (s = 0; s < encode_state->num_slice_params_ext; s++) {
+        VAEncSliceParameterBufferMPEG2 *slice_param = (VAEncSliceParameterBufferMPEG2 *)encode_state->slice_params_ext[s]->buffer;
+
+        for (j = 0; j < encode_state->slice_params_ext[s]->num_elements; j++) {
+            int slice_mb_begin = slice_param->macroblock_address;
+            int slice_mb_number = slice_param->num_macroblocks;
+            unsigned int mb_intra_ub;
+            /* column at which this slice starts; non-zero means it starts mid-row */
+            int slice_mb_x = slice_param->macroblock_address % mb_width;
+
+            for (i = 0; i < slice_mb_number; i++) {
+                int mb_count = i + slice_mb_begin;
+
+                mb_x = mb_count % mb_width;
+                mb_y = mb_count / mb_width;
+                mb_intra_ub = 0;
+
+                /* Step 1: flags from the position inside the picture. */
+                if (mb_x != 0) {
+                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
+                }
+
+                if (mb_y != 0) {
+                    mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
+
+                    if (mb_x != 0)
+                        mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
+
+                    if (mb_x != (mb_width - 1))
+                        mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
+                }
+
+                /*
+                 * Step 2: corrections for the first mb_width macroblocks
+                 * of the slice.  The very first one loses AE, all of them
+                 * lose B/C/D, and C is restored for macroblock
+                 * (mb_width - 1) when the slice starts mid-row.
+                 */
+                if (i < mb_width) {
+                    if (i == 0)
+                        mb_intra_ub &= ~(INTRA_PRED_AVAIL_FLAG_AE);
+
+                    mb_intra_ub &= ~(INTRA_PRED_AVAIL_FLAG_BCD_MASK);
+
+                    if ((i == (mb_width - 1)) && slice_mb_x) {
+                        mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
+                    }
+                }
+
+                /* Macroblock number mb_width of a slice starting mid-row loses D. */
+                if ((i == mb_width) && slice_mb_x) {
+                    mb_intra_ub &= ~(INTRA_PRED_AVAIL_FLAG_D);
+                }
+
+                *command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
+                *command_ptr++ = kernel;
+                *command_ptr++ = 0;
+                *command_ptr++ = 0;
+                *command_ptr++ = 0;
+                *command_ptr++ = 0;
+
+                /* inline data */
+                *command_ptr++ = (mb_width << 16 | mb_y << 8 | mb_x);
+                *command_ptr++ = ((1 << 16) | transform_8x8_mode_flag | (mb_intra_ub << 8));
+            }
+
+            slice_param++;
+        }
+    }
+
+    /* Terminate the buffer: one zero dword, then the end command. */
+    *command_ptr++ = 0;
+    *command_ptr++ = MI_BATCH_BUFFER_END;
+
+    dri_bo_unmap(vme_context->vme_batchbuffer.bo);
+}
+
+/*
+ * Program the media pipeline for one MPEG-2 picture.
+ *
+ * The VME batch buffer is filled first, using the intra or inter kernel
+ * depending on is_intra.  The encoder batch then receives the GPE
+ * pipeline setup followed by an MI_BATCH_BUFFER_START command that
+ * references the VME batch buffer.
+ */
+static void
+gen8_vme_mpeg2_pipeline_programing(VADriverContextP ctx,
+                                    struct encode_state *encode_state,
+                                    int is_intra,
+                                    struct intel_encoder_context *encoder_context)
+{
+    VAEncSequenceParameterBufferMPEG2 *seq_param =
+        (VAEncSequenceParameterBufferMPEG2 *)encode_state->seq_param_ext->buffer;
+    struct intel_batchbuffer *batch = encoder_context->base.batch;
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+    /* picture size in macroblocks, rounded up to whole 16x16 blocks */
+    int mbs_per_row = ALIGN(seq_param->picture_width, 16) / 16;
+    int mb_rows = ALIGN(seq_param->picture_height, 16) / 16;
+    int kernel_index;
+
+    if (is_intra)
+        kernel_index = VME_INTRA_SHADER;
+    else
+        kernel_index = VME_INTER_SHADER;
+
+    gen8_vme_mpeg2_fill_vme_batchbuffer(ctx, encode_state,
+                                         mbs_per_row, mb_rows,
+                                         kernel_index,
+                                         0, /* transform_8x8_mode_flag */
+                                         encoder_context);
+
+    intel_batchbuffer_start_atomic(batch, 0x1000);
+
+    gen6_gpe_pipeline_setup(ctx, &vme_context->gpe_context, batch);
+
+    /* Hand execution over to the VME batch buffer filled above. */
+    BEGIN_BATCH(batch, 2);
+    OUT_BATCH(batch, MI_BATCH_BUFFER_START | (2 << 6));
+    OUT_RELOC(batch,
+              vme_context->vme_batchbuffer.bo,
+              I915_GEM_DOMAIN_COMMAND, 0,
+              0);
+    ADVANCE_BATCH(batch);
+
+    intel_batchbuffer_end_atomic(batch);
+}
+
+/*
+ * Prepare all VME state for one MPEG-2 picture: surfaces, interface
+ * descriptors, VME state, constants and finally the pipeline commands.
+ *
+ * The intra/inter decision is taken from the first slice parameter
+ * buffer only.
+ *
+ * Always returns VA_STATUS_SUCCESS.
+ */
+static VAStatus
+gen8_vme_mpeg2_prepare(VADriverContextP ctx,
+                        struct encode_state *encode_state,
+                        struct intel_encoder_context *encoder_context)
+{
+    VAEncSliceParameterBufferMPEG2 *first_slice =
+        (VAEncSliceParameterBufferMPEG2 *)encode_state->slice_params_ext[0]->buffer;
+    const int is_intra = first_slice->is_intra_slice;
+
+    /* Set up all the memory objects */
+    gen8_vme_mpeg2_surface_setup(ctx, encode_state, is_intra, encoder_context);
+    gen8_vme_interface_setup(ctx, encode_state, encoder_context);
+    gen8_vme_vme_state_setup(ctx, encode_state, is_intra, encoder_context);
+    gen8_vme_constant_setup(ctx, encode_state, encoder_context);
+
+    /* Program the media pipeline */
+    gen8_vme_mpeg2_pipeline_programing(ctx, encode_state, is_intra, encoder_context);
+
+    return VA_STATUS_SUCCESS;
+}
+
+/*
+ * VME pipeline entry point for MPEG-2 encoding; installed as
+ * encoder_context->vme_pipeline by gen8_vme_context_init().
+ *
+ * Runs the stages in order: media init, prepare, run, stop.  When the
+ * prepare stage reports an error that status is returned and the run
+ * and stop stages are skipped; otherwise VA_STATUS_SUCCESS is
+ * returned.
+ *
+ * profile is not used.
+ */
+static VAStatus
+gen8_vme_mpeg2_pipeline(VADriverContextP ctx,
+                         VAProfile profile,
+                         struct encode_state *encode_state,
+                         struct intel_encoder_context *encoder_context)
+{
+    VAStatus status;
+
+    gen8_vme_media_init(ctx, encoder_context);
+
+    /* Do not run a pipeline whose preparation failed. */
+    status = gen8_vme_mpeg2_prepare(ctx, encode_state, encoder_context);
+    if (status != VA_STATUS_SUCCESS)
+        return status;
+
+    gen8_vme_run(ctx, encode_state, encoder_context);
+    gen8_vme_stop(ctx, encode_state, encoder_context);
+
+    return VA_STATUS_SUCCESS;
+}
+
+/*
+ * Release a VME context; installed as
+ * encoder_context->vme_context_destroy by gen8_vme_context_init().
+ *
+ * Destroys the GPE context, drops the references on the output, state
+ * and batch buffer objects, frees the VME state message and finally
+ * frees the context structure itself.
+ */
+static void
+gen8_vme_context_destroy(void *context)
+{
+    struct gen6_vme_context *vme = context;
+
+    i965_gpe_context_destroy(&vme->gpe_context);
+
+    /* buffer objects owned by the context */
+    dri_bo_unreference(vme->vme_output.bo);
+    vme->vme_output.bo = NULL;
+
+    dri_bo_unreference(vme->vme_state.bo);
+    vme->vme_state.bo = NULL;
+
+    dri_bo_unreference(vme->vme_batchbuffer.bo);
+    vme->vme_batchbuffer.bo = NULL;
+
+    /* free(NULL) is a no-op, so no guard is required here */
+    free(vme->vme_state_message);
+    vme->vme_state_message = NULL;
+
+    free(vme);
+}
+
+/*
+ * Create the Gen8 VME context for an encoder and install it, together
+ * with its destroy callback and the codec-specific pipeline entry
+ * point, on encoder_context.
+ *
+ * Supported codecs are CODEC_H264 and CODEC_MPEG2; each selects its own
+ * kernel list and pipeline function.
+ *
+ * Returns True on success.  Returns False, leaving
+ * encoder_context->vme_context untouched, when a memory allocation
+ * fails or the codec is not supported.
+ */
+Bool gen8_vme_context_init(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
+{
+    struct gen6_vme_context *vme_context;
+    struct i965_kernel *vme_kernel_list = NULL;
+    int i965_kernel_num = 0;
+
+    /* Allocate everything first so failure needs no partial teardown. */
+    vme_context = calloc(1, sizeof(*vme_context));
+    if (!vme_context)
+        return False;
+
+    vme_context->vme_state_message = malloc(VME_MSG_LENGTH * sizeof(int));
+    if (!vme_context->vme_state_message) {
+        free(vme_context);
+        return False;
+    }
+
+    switch (encoder_context->codec) {
+    case CODEC_H264:
+        vme_kernel_list = gen8_vme_kernels;
+        encoder_context->vme_pipeline = gen8_vme_pipeline;
+        i965_kernel_num = sizeof(gen8_vme_kernels) / sizeof(struct i965_kernel);
+        break;
+
+    case CODEC_MPEG2:
+        vme_kernel_list = gen8_vme_mpeg2_kernels;
+        encoder_context->vme_pipeline = gen8_vme_mpeg2_pipeline;
+        i965_kernel_num = sizeof(gen8_vme_mpeg2_kernels) / sizeof(struct i965_kernel);
+        break;
+
+    default:
+        /* never get here */
+        assert(0);
+
+        /* With assertions disabled, fail cleanly instead of loading no kernels. */
+        free(vme_context->vme_state_message);
+        free(vme_context);
+        return False;
+    }
+
+    vme_context->vme_kernel_sum = i965_kernel_num;
+    vme_context->gpe_context.surface_state_binding_table.length = (SURFACE_STATE_PADDED_SIZE + sizeof(unsigned int)) * MAX_MEDIA_SURFACES_GEN6;
+
+    vme_context->gpe_context.idrt.max_entries = MAX_INTERFACE_DESC_GEN6;
+    vme_context->gpe_context.idrt.entry_size = sizeof(struct gen6_interface_descriptor_data);
+
+    vme_context->gpe_context.curbe.length = CURBE_TOTAL_DATA_LENGTH;
+
+    vme_context->gpe_context.vfe_state.max_num_threads = 60 - 1;
+    vme_context->gpe_context.vfe_state.num_urb_entries = 16;
+    vme_context->gpe_context.vfe_state.gpgpu_mode = 0;
+    vme_context->gpe_context.vfe_state.urb_entry_size = 59 - 1;
+    vme_context->gpe_context.vfe_state.curbe_allocation_size = CURBE_ALLOCATION_SIZE - 1;
+
+    gen7_vme_scoreboard_init(ctx, vme_context);
+
+    i965_gpe_load_kernels(ctx,
+                          &vme_context->gpe_context,
+                          vme_kernel_list,
+                          i965_kernel_num);
+
+    /* surface-setup helpers used by the gen8 VME code */
+    vme_context->vme_surface2_setup = gen7_gpe_surface2_setup;
+    vme_context->vme_media_rw_surface_setup = gen7_gpe_media_rw_surface_setup;
+    vme_context->vme_buffer_suface_setup = gen7_gpe_buffer_suface_setup;
+    vme_context->vme_media_chroma_surface_setup = gen75_gpe_media_chroma_surface_setup;
+
+    /* Publish the fully initialised context. */
+    encoder_context->vme_context = vme_context;
+    encoder_context->vme_context_destroy = gen8_vme_context_destroy;
+
+    return True;
+}
index 4f7d2cc..c7d49d7 100644 (file)
@@ -49,4 +49,6 @@ struct gen_buffer {
 struct hw_context *
 gen75_dec_hw_context_init(VADriverContextP ctx, struct object_config *obj_config);
 
+extern struct hw_context *
+gen8_dec_hw_context_init(VADriverContextP ctx, struct object_config *obj_config);
 #endif /* I965_DECODER_H */
index c9ed624..c6c0591 100755 (executable)
@@ -320,8 +320,8 @@ static struct hw_codec_info gen75_hw_codec_info = {
 
 /* TODO: Add the separate call back function for Gen8 */
 static struct hw_codec_info gen8_hw_codec_info = {
-    .dec_hw_context_init = gen75_dec_hw_context_init,
-    .enc_hw_context_init = gen75_enc_hw_context_init,
+    .dec_hw_context_init = gen8_dec_hw_context_init,
+    .enc_hw_context_init = gen8_enc_hw_context_init,
     .proc_hw_context_init = gen75_proc_context_init,
     .max_width = 4096,
     .max_height = 4096,
index 1e46a1a..9bf133f 100644 (file)
@@ -401,3 +401,10 @@ gen75_enc_hw_context_init(VADriverContextP ctx, struct object_config *obj_config
 {
     return intel_enc_hw_context_init(ctx, obj_config, gen75_vme_context_init, gen75_mfc_context_init);
 }
+
+/*
+ * Gen8 encoder context constructor: forwards to
+ * intel_enc_hw_context_init() with the Gen8 VME and MFC context
+ * initialisers and returns whatever that call returns.
+ */
+struct hw_context *
+gen8_enc_hw_context_init(VADriverContextP ctx, struct object_config *obj_config)
+{
+    return intel_enc_hw_context_init(ctx, obj_config, gen8_vme_context_init, gen8_mfc_context_init);
+}
+
index 29bd702..71396d6 100644 (file)
@@ -62,6 +62,8 @@ struct intel_encoder_context
 extern struct hw_context *
 gen75_enc_hw_context_init(VADriverContextP ctx, struct object_config *obj_config);
 
+extern struct hw_context *
+gen8_enc_hw_context_init(VADriverContextP ctx, struct object_config *obj_config);
 #endif /* _I965_ENCODER_H_ */