H264_Encoding: Add the support of packed slice header to be flexible
[platform/upstream/libva-intel-driver.git] / src / gen6_mfc_common.c
index 83f2792..e500feb 100644 (file)
 #include "i965_encoder_utils.h"
 #include "gen6_mfc.h"
 #include "gen6_vme.h"
+#include "intel_media.h"
 
-
-#define BRC_CLIP(x, min, max) \
-{ \
-    x = ((x > (max)) ? (max) : ((x < (min)) ? (min) : x)); \
-}
+#define BRC_CLIP(x, min, max)                                   \
+    {                                                           \
+        x = ((x > (max)) ? (max) : ((x < (min)) ? (min) : x));  \
+    }
 
 #define BRC_P_B_QP_DIFF 4
 #define BRC_I_P_QP_DIFF 2
 
 #define BRC_PI_0_5 1.5707963267948966192313216916398
 
+#ifndef HAVE_LOG2F
+#define log2f(x) (logf(x)/(float)M_LN2)
+#endif
+
+/*
+ * Reduce an H.264 slice type to one of SLICE_TYPE_P, SLICE_TYPE_I or
+ * SLICE_TYPE_B: SP is returned as P and SI as I.  Every other value is
+ * returned as B, and a warning is raised when that value was not B already.
+ */
+int intel_avc_enc_slice_type_fixup(int slice_type)
+{
+    if (slice_type == SLICE_TYPE_P || slice_type == SLICE_TYPE_SP)
+        return SLICE_TYPE_P;
+
+    if (slice_type == SLICE_TYPE_I || slice_type == SLICE_TYPE_SI)
+        return SLICE_TYPE_I;
+
+    /* Anything left is handled as a B slice; complain if it is not one. */
+    if (slice_type != SLICE_TYPE_B)
+        WARN_ONCE("Invalid slice type for H.264 encoding!\n");
+
+    return SLICE_TYPE_B;
+}
+
 static void
 intel_mfc_bit_rate_control_context_init(struct encode_state *encode_state, 
-                                       struct gen6_mfc_context *mfc_context)
+                                        struct gen6_mfc_context *mfc_context)
 {
     VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
-    int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
-    int height_in_mbs = (mfc_context->surface_state.height + 15) / 16;
+    int width_in_mbs = pSequenceParameter->picture_width_in_mbs;
+    int height_in_mbs = pSequenceParameter->picture_height_in_mbs;
     float fps =  pSequenceParameter->time_scale * 0.5 / pSequenceParameter->num_units_in_tick ;
     int inter_mb_size = pSequenceParameter->bits_per_second * 1.0 / (fps+4.0) / width_in_mbs / height_in_mbs;
     int intra_mb_size = inter_mb_size * 5.0;
@@ -108,7 +130,7 @@ intel_mfc_bit_rate_control_context_init(struct encode_state *encode_state,
 }
 
 static void intel_mfc_brc_init(struct encode_state *encode_state,
-                  struct intel_encoder_context* encoder_context)
+                               struct intel_encoder_context* encoder_context)
 {
     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
     VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
@@ -144,7 +166,7 @@ static void intel_mfc_brc_init(struct encode_state *encode_state,
     mfc_context->hrd.buffer_size = (double)pParameterHRD->buffer_size;
     mfc_context->hrd.current_buffer_fullness =
         (double)(pParameterHRD->initial_buffer_fullness < mfc_context->hrd.buffer_size)?
-            pParameterHRD->initial_buffer_fullness: mfc_context->hrd.buffer_size/2.;
+        pParameterHRD->initial_buffer_fullness: mfc_context->hrd.buffer_size/2.;
     mfc_context->hrd.target_buffer_fullness = (double)mfc_context->hrd.buffer_size/2.;
     mfc_context->hrd.buffer_capacity = (double)mfc_context->hrd.buffer_size/qp1_size;
     mfc_context->hrd.violation_noted = 0;
@@ -166,8 +188,8 @@ static void intel_mfc_brc_init(struct encode_state *encode_state,
 }
 
 int intel_mfc_update_hrd(struct encode_state *encode_state,
-                               struct gen6_mfc_context *mfc_context,
-                               int frame_bits)
+                         struct gen6_mfc_context *mfc_context,
+                         int frame_bits)
 {
     double prev_bf = mfc_context->hrd.current_buffer_fullness;
 
@@ -191,12 +213,12 @@ int intel_mfc_update_hrd(struct encode_state *encode_state,
 }
 
 int intel_mfc_brc_postpack(struct encode_state *encode_state,
-                                 struct gen6_mfc_context *mfc_context,
-                                 int frame_bits)
+                           struct gen6_mfc_context *mfc_context,
+                           int frame_bits)
 {
     gen6_brc_status sts = BRC_NO_HRD_VIOLATION;
     VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer; 
-    int slicetype = pSliceParameter->slice_type;
+    int slicetype = intel_avc_enc_slice_type_fixup(pSliceParameter->slice_type);
     int qpi = mfc_context->bit_rate_control_context[SLICE_TYPE_I].QpPrimeY;
     int qpp = mfc_context->bit_rate_control_context[SLICE_TYPE_P].QpPrimeY;
     int qpb = mfc_context->bit_rate_control_context[SLICE_TYPE_B].QpPrimeY;
@@ -212,11 +234,6 @@ int intel_mfc_brc_postpack(struct encode_state *encode_state,
     double x, y;
     double frame_size_alpha;
 
-    if (slicetype == SLICE_TYPE_SP)
-        slicetype = SLICE_TYPE_P;
-    else if (slicetype == SLICE_TYPE_SI)
-        slicetype = SLICE_TYPE_I;
-
     qp = mfc_context->bit_rate_control_context[slicetype].QpPrimeY;
 
     target_frame_size = mfc_context->brc.target_frame_size[slicetype];
@@ -226,7 +243,7 @@ int intel_mfc_brc_postpack(struct encode_state *encode_state,
         frame_size_alpha = (double)mfc_context->brc.gop_nums[slicetype];
     if (frame_size_alpha > 30) frame_size_alpha = 30;
     frame_size_next = target_frame_size + (double)(target_frame_size - frame_bits) /
-                                          (double)(frame_size_alpha + 1.);
+        (double)(frame_size_alpha + 1.);
 
     /* frame_size_next: avoiding negative number and too small value */
     if ((double)frame_size_next < (double)(target_frame_size * 0.25))
@@ -316,7 +333,7 @@ int intel_mfc_brc_postpack(struct encode_state *encode_state,
 }
 
 static void intel_mfc_hrd_context_init(struct encode_state *encode_state,
-                          struct intel_encoder_context *encoder_context)
+                                       struct intel_encoder_context *encoder_context)
 {
     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
     VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
@@ -340,14 +357,14 @@ static void intel_mfc_hrd_context_init(struct encode_state *encode_state,
 
 void 
 intel_mfc_hrd_context_update(struct encode_state *encode_state, 
-                          struct gen6_mfc_context *mfc_context) 
+                             struct gen6_mfc_context *mfc_context)
 {
     mfc_context->vui_hrd.i_frame_number++;
 }
 
 int intel_mfc_interlace_check(VADriverContextP ctx,
-                   struct encode_state *encode_state,
-                   struct intel_encoder_context *encoder_context) 
+                              struct encode_state *encode_state,
+                              struct intel_encoder_context *encoder_context)
 {
     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
     VAEncSliceParameterBufferH264 *pSliceParameter;
@@ -367,32 +384,136 @@ int intel_mfc_interlace_check(VADriverContextP ctx,
     return 1;
 }
 
+/*
+ * Check whether the parameters related with CBR are updated and decide whether
+ * it needs to reinitialize the configuration related with CBR.
+ * Currently it will check the following parameters:
+ *      bits_per_second
+ *      frame_rate
+ *      gop_configuration(intra_period, ip_period, intra_idr_period)
+ *
+ * Returns false when the rate control mode is not CBR or when every checked
+ * parameter equals the copy saved in mfc_context->brc.  Otherwise the saved
+ * copies are refreshed from the sequence parameter buffer and true is
+ * returned.
+ */
+static bool intel_mfc_brc_updated_check(struct encode_state *encode_state,
+                           struct intel_encoder_context *encoder_context)
+{
+    unsigned int rate_control_mode = encoder_context->rate_control_mode;
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    double cur_fps, cur_bitrate;
+    VAEncSequenceParameterBufferH264 *pSequenceParameter;
+
+    if (rate_control_mode != VA_RC_CBR) {
+        return false;
+    }
+
+    pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
+
+    cur_bitrate = pSequenceParameter->bits_per_second;
+    cur_fps = (double)pSequenceParameter->time_scale /
+                (2 * (double)pSequenceParameter->num_units_in_tick);
+
+    /*
+     * All three GOP fields have to be compared: ip_period is saved below,
+     * so it must take part in the "unchanged" test as well.
+     */
+    if ((cur_bitrate == mfc_context->brc.saved_bps) &&
+        (cur_fps == mfc_context->brc.saved_fps) &&
+        (pSequenceParameter->intra_period == mfc_context->brc.saved_intra_period) &&
+        (pSequenceParameter->intra_idr_period == mfc_context->brc.saved_idr_period) &&
+        (pSequenceParameter->ip_period == mfc_context->brc.saved_ip_period)) {
+        /* the parameters related with CBR are not updated */
+        return false;
+    }
+
+    mfc_context->brc.saved_ip_period = pSequenceParameter->ip_period;
+    mfc_context->brc.saved_intra_period = pSequenceParameter->intra_period;
+    mfc_context->brc.saved_idr_period = pSequenceParameter->intra_idr_period;
+    mfc_context->brc.saved_fps = cur_fps;
+    mfc_context->brc.saved_bps = cur_bitrate;
+    return true;
+}
+
 void intel_mfc_brc_prepare(struct encode_state *encode_state,
-                          struct intel_encoder_context *encoder_context)
+                           struct intel_encoder_context *encoder_context)
 {
     unsigned int rate_control_mode = encoder_context->rate_control_mode;
     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
 
     if (rate_control_mode == VA_RC_CBR) {
+        bool brc_updated;
+        assert(encoder_context->codec != CODEC_MPEG2);
+
+        brc_updated = intel_mfc_brc_updated_check(encode_state, encoder_context);
+
         /*Programing bit rate control */
-        if ( mfc_context->bit_rate_control_context[SLICE_TYPE_I].MaxSizeInWord == 0 ) {
+        if ((mfc_context->bit_rate_control_context[SLICE_TYPE_I].MaxSizeInWord == 0) ||
+             brc_updated) {
             intel_mfc_bit_rate_control_context_init(encode_state, mfc_context);
             intel_mfc_brc_init(encode_state, encoder_context);
         }
 
         /*Programing HRD control */
-        if ( mfc_context->vui_hrd.i_cpb_size_value == 0 )
+        if ((mfc_context->vui_hrd.i_cpb_size_value == 0) || brc_updated )
             intel_mfc_hrd_context_init(encode_state, encoder_context);    
     }
 }
 
+/*
+ * Count the bytes at the front of a packed header buffer that come before
+ * the NAL payload: every byte ahead of the start code, the start code
+ * itself (00 00 01 or 00 00 00 01) and the NAL unit header (one byte, or
+ * four bytes for NAL unit types 14, 20 and 21).
+ *
+ * buf         - packed header data
+ * bits_length - length of the data in bits; it is rounded up to a
+ *               multiple of 32 bits before the buffer is scanned
+ *
+ * Returns the byte count, or 0 (after a warning) when no start code is
+ * found.  A count above HW_MAX_SKIP_LENGTH is still returned, but a
+ * warning is raised for it.
+ */
+static int intel_avc_find_skipemulcnt(unsigned char *buf, int bits_length)
+{
+#define NAL_UNIT_TYPE_MASK 0x1f
+#define HW_MAX_SKIP_LENGTH 15
+
+    const int total_bytes = ALIGN(bits_length, 32) >> 3;
+    int pos = 0;
+    int prefix_len = 0;     /* becomes 3 or 4 once a start code is located */
+    int nal_unit_type;
+    int skip_cnt;
+
+    /* Walk forward until a 3-byte or 4-byte start code begins at pos. */
+    while (pos < total_bytes - 4) {
+        if (buf[pos] == 0 && buf[pos + 1] == 0) {
+            if (buf[pos + 2] == 1) {
+                prefix_len = 3;
+                break;
+            }
+            if (buf[pos + 2] == 0 && buf[pos + 3] == 1) {
+                prefix_len = 4;
+                break;
+            }
+        }
+        pos++;
+    }
+
+    if (prefix_len == 0) {
+        /* Only a warning is raised; the caller inserts the data anyway. */
+        WARN_ONCE("Invalid packed header data. "
+                   "Can't find the 000001 start_prefix code\n");
+        return 0;
+    }
+
+    /* Bytes ahead of the start code plus the start code itself. */
+    skip_cnt = pos + prefix_len;
+
+    /* The first NAL unit header byte is always counted. */
+    nal_unit_type = buf[skip_cnt] & NAL_UNIT_TYPE_MASK;
+    skip_cnt++;
+
+    switch (nal_unit_type) {
+    case 14:
+    case 20:
+    case 21:
+        /* three more NAL unit header bytes are counted for MVC/SVC */
+        skip_cnt += 3;
+        break;
+    default:
+        break;
+    }
+
+    if (skip_cnt > HW_MAX_SKIP_LENGTH) {
+        WARN_ONCE("Too many leading zeros are padded for packed data. "
+                   "It is beyond the HW range.!!!\n");
+    }
+
+    return skip_cnt;
+}
+
 void intel_mfc_avc_pipeline_header_programing(VADriverContextP ctx,
-                                                    struct encode_state *encode_state,
-                                                    struct intel_encoder_context *encoder_context,
-                                                    struct intel_batchbuffer *slice_batch)
+                                              struct encode_state *encode_state,
+                                              struct intel_encoder_context *encoder_context,
+                                              struct intel_batchbuffer *slice_batch)
 {
     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
     int idx = va_enc_packed_type_to_idx(VAEncPackedHeaderH264_SPS);
+    unsigned int rate_control_mode = encoder_context->rate_control_mode;
+    unsigned int skip_emul_byte_cnt;
 
     if (encode_state->packed_header_data[idx]) {
         VAEncPackedHeaderParameterBuffer *param = NULL;
@@ -403,12 +524,13 @@ void intel_mfc_avc_pipeline_header_programing(VADriverContextP ctx,
         param = (VAEncPackedHeaderParameterBuffer *)encode_state->packed_header_param[idx]->buffer;
         length_in_bits = param->bit_length;
 
+        skip_emul_byte_cnt = intel_avc_find_skipemulcnt((unsigned char *)header_data, length_in_bits);
         mfc_context->insert_object(ctx,
                                    encoder_context,
                                    header_data,
                                    ALIGN(length_in_bits, 32) >> 5,
                                    length_in_bits & 0x1f,
-                                   5,   /* FIXME: check it */
+                                   skip_emul_byte_cnt,
                                    0,
                                    0,
                                    !param->has_emulation_bytes,
@@ -426,12 +548,14 @@ void intel_mfc_avc_pipeline_header_programing(VADriverContextP ctx,
         param = (VAEncPackedHeaderParameterBuffer *)encode_state->packed_header_param[idx]->buffer;
         length_in_bits = param->bit_length;
 
+        skip_emul_byte_cnt = intel_avc_find_skipemulcnt((unsigned char *)header_data, length_in_bits);
+
         mfc_context->insert_object(ctx,
                                    encoder_context,
                                    header_data,
                                    ALIGN(length_in_bits, 32) >> 5,
                                    length_in_bits & 0x1f,
-                                   5, /* FIXME: check it */
+                                   skip_emul_byte_cnt,
                                    0,
                                    0,
                                    !param->has_emulation_bytes,
@@ -449,16 +573,1048 @@ void intel_mfc_avc_pipeline_header_programing(VADriverContextP ctx,
         param = (VAEncPackedHeaderParameterBuffer *)encode_state->packed_header_param[idx]->buffer;
         length_in_bits = param->bit_length;
 
+        skip_emul_byte_cnt = intel_avc_find_skipemulcnt((unsigned char *)header_data, length_in_bits);
         mfc_context->insert_object(ctx,
                                    encoder_context,
                                    header_data,
                                    ALIGN(length_in_bits, 32) >> 5,
                                    length_in_bits & 0x1f,
-                                   5, /* FIXME: check it */
+                                   skip_emul_byte_cnt,
                                    0,
                                    0,
                                    !param->has_emulation_bytes,
                                    slice_batch);
+    } else if (rate_control_mode == VA_RC_CBR) {
+        // this is the first AU
+        struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+
+        unsigned char *sei_data = NULL;
+    
+        int length_in_bits = build_avc_sei_buffer_timing(
+            mfc_context->vui_hrd.i_initial_cpb_removal_delay_length,
+            mfc_context->vui_hrd.i_initial_cpb_removal_delay,
+            0,
+            mfc_context->vui_hrd.i_cpb_removal_delay_length,                                                       mfc_context->vui_hrd.i_cpb_removal_delay * mfc_context->vui_hrd.i_frame_number,
+            mfc_context->vui_hrd.i_dpb_output_delay_length,
+            0,
+            &sei_data);
+        mfc_context->insert_object(ctx,
+                                   encoder_context,
+                                   (unsigned int *)sei_data,
+                                   ALIGN(length_in_bits, 32) >> 5,
+                                   length_in_bits & 0x1f,
+                                   4,   
+                                   0,   
+                                   0,   
+                                   1,
+                                   slice_batch);  
+        free(sei_data);
+    }
+}
+
+/*
+ * Bind every buffer the MFC needs for encoding the current AVC picture
+ * into mfc_context:
+ *   - the reconstructed surface (as post- or pre-deblocking output,
+ *     depending on whether any slice leaves the deblocking filter on)
+ *     together with its pair of direct-MV buffers,
+ *   - each reference surface and its pair of direct-MV buffers,
+ *   - the input picture,
+ *   - the coded buffer, used as the indirect PAK-BSE object.
+ * Direct-MV buffers are allocated on first use and attached to the
+ * surface through private_data.  The header of the coded buffer is reset
+ * (mapped = 0, codec = encoder_context->codec).
+ *
+ * Returns VA_STATUS_SUCCESS, or VA_STATUS_ERROR_ALLOCATION_FAILED when the
+ * per-surface GenAvcSurface bookkeeping structure cannot be allocated.
+ */
+VAStatus intel_mfc_avc_prepare(VADriverContextP ctx,
+                               struct encode_state *encode_state,
+                               struct intel_encoder_context *encoder_context)
+{
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    struct object_surface *obj_surface;
+    struct object_buffer *obj_buffer;
+    GenAvcSurface *gen6_avc_surface;
+    dri_bo *bo;
+    VAStatus vaStatus = VA_STATUS_SUCCESS;
+    int i, j, enable_avc_ildb = 0;
+    VAEncSliceParameterBufferH264 *slice_param;
+    struct i965_coded_buffer_segment *coded_buffer_segment;
+    VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
+    int width_in_mbs = pSequenceParameter->picture_width_in_mbs;
+    int height_in_mbs = pSequenceParameter->picture_height_in_mbs;
+
+    if (IS_GEN6(i965->intel.device_info)) {
+        /* On the SNB it should be fixed to 128 for the DMV buffer */
+        width_in_mbs = 128;
+    }
+
+    /* In-loop deblocking is enabled as soon as one slice does not disable it. */
+    for (j = 0; j < encode_state->num_slice_params_ext && enable_avc_ildb == 0; j++) {
+        assert(encode_state->slice_params_ext && encode_state->slice_params_ext[j]->buffer);
+        slice_param = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[j]->buffer;
+
+        for (i = 0; i < encode_state->slice_params_ext[j]->num_elements; i++) {
+            assert((slice_param->slice_type == SLICE_TYPE_I) ||
+                   (slice_param->slice_type == SLICE_TYPE_SI) ||
+                   (slice_param->slice_type == SLICE_TYPE_P) ||
+                   (slice_param->slice_type == SLICE_TYPE_SP) ||
+                   (slice_param->slice_type == SLICE_TYPE_B));
+
+            if (slice_param->disable_deblocking_filter_idc != 1) {
+                enable_avc_ildb = 1;
+                break;
+            }
+
+            slice_param++;
+        }
+    }
+
+    /*Setup all the input&output object*/
+
+    /* Setup current frame and current direct mv buffer*/
+    obj_surface = encode_state->reconstructed_object;
+    i965_check_alloc_surface_bo(ctx, obj_surface, 1, VA_FOURCC_NV12, SUBSAMPLE_YUV420);
+
+    if (obj_surface->private_data == NULL) {
+        gen6_avc_surface = calloc(1, sizeof(GenAvcSurface));
+        /* The structure is written right below, so a failed allocation must not go unnoticed. */
+        if (gen6_avc_surface == NULL)
+            return VA_STATUS_ERROR_ALLOCATION_FAILED;
+
+        gen6_avc_surface->dmv_top =
+            dri_bo_alloc(i965->intel.bufmgr,
+                         "Buffer",
+                         68 * width_in_mbs * height_in_mbs,
+                         64);
+        gen6_avc_surface->dmv_bottom =
+            dri_bo_alloc(i965->intel.bufmgr,
+                         "Buffer",
+                         68 * width_in_mbs * height_in_mbs,
+                         64);
+        assert(gen6_avc_surface->dmv_top);
+        assert(gen6_avc_surface->dmv_bottom);
+        obj_surface->private_data = (void *)gen6_avc_surface;
+        obj_surface->free_private_data = (void *)gen_free_avc_surface;
+    }
+    gen6_avc_surface = (GenAvcSurface *) obj_surface->private_data;
+    /* The current frame uses the last two direct-MV slots. */
+    mfc_context->direct_mv_buffers[NUM_MFC_DMV_BUFFERS - 2].bo = gen6_avc_surface->dmv_top;
+    mfc_context->direct_mv_buffers[NUM_MFC_DMV_BUFFERS - 1].bo = gen6_avc_surface->dmv_bottom;
+    dri_bo_reference(gen6_avc_surface->dmv_top);
+    dri_bo_reference(gen6_avc_surface->dmv_bottom);
+
+    if (enable_avc_ildb) {
+        mfc_context->post_deblocking_output.bo = obj_surface->bo;
+        dri_bo_reference(mfc_context->post_deblocking_output.bo);
+    } else {
+        mfc_context->pre_deblocking_output.bo = obj_surface->bo;
+        dri_bo_reference(mfc_context->pre_deblocking_output.bo);
+    }
+
+    mfc_context->surface_state.width = obj_surface->orig_width;
+    mfc_context->surface_state.height = obj_surface->orig_height;
+    mfc_context->surface_state.w_pitch = obj_surface->width;
+    mfc_context->surface_state.h_pitch = obj_surface->height;
+
+    /* Setup reference frames and direct mv buffers*/
+    for (i = 0; i < MAX_MFC_REFERENCE_SURFACES; i++) {
+        obj_surface = encode_state->reference_objects[i];
+
+        /* The reference list ends at the first missing surface. */
+        if (obj_surface == NULL || obj_surface->bo == NULL)
+            break;
+
+        mfc_context->reference_surfaces[i].bo = obj_surface->bo;
+        dri_bo_reference(obj_surface->bo);
+
+        /* Check DMV buffer */
+        if (obj_surface->private_data == NULL) {
+            gen6_avc_surface = calloc(1, sizeof(GenAvcSurface));
+            /*
+             * NOTE(review): references already stored in mfc_context stay
+             * there on this error path; presumably they are dropped when
+             * the context is re-initialised or destroyed -- confirm.
+             */
+            if (gen6_avc_surface == NULL)
+                return VA_STATUS_ERROR_ALLOCATION_FAILED;
+
+            gen6_avc_surface->dmv_top =
+                dri_bo_alloc(i965->intel.bufmgr,
+                             "Buffer",
+                             68 * width_in_mbs * height_in_mbs,
+                             64);
+            gen6_avc_surface->dmv_bottom =
+                dri_bo_alloc(i965->intel.bufmgr,
+                             "Buffer",
+                             68 * width_in_mbs * height_in_mbs,
+                             64);
+            assert(gen6_avc_surface->dmv_top);
+            assert(gen6_avc_surface->dmv_bottom);
+            obj_surface->private_data = gen6_avc_surface;
+            obj_surface->free_private_data = gen_free_avc_surface;
+        }
+
+        gen6_avc_surface = (GenAvcSurface *) obj_surface->private_data;
+        /* Setup DMV buffer */
+        mfc_context->direct_mv_buffers[i*2].bo = gen6_avc_surface->dmv_top;
+        mfc_context->direct_mv_buffers[i*2+1].bo = gen6_avc_surface->dmv_bottom;
+        dri_bo_reference(gen6_avc_surface->dmv_top);
+        dri_bo_reference(gen6_avc_surface->dmv_bottom);
+    }
+
+    mfc_context->uncompressed_picture_source.bo = encode_state->input_yuv_object->bo;
+    dri_bo_reference(mfc_context->uncompressed_picture_source.bo);
+
+    obj_buffer = encode_state->coded_buf_object;
+    bo = obj_buffer->buffer_store->bo;
+    mfc_context->mfc_indirect_pak_bse_object.bo = bo;
+    mfc_context->mfc_indirect_pak_bse_object.offset = I965_CODEDBUFFER_HEADER_SIZE;
+    mfc_context->mfc_indirect_pak_bse_object.end_offset = ALIGN(obj_buffer->size_element - 0x1000, 0x1000);
+    dri_bo_reference(mfc_context->mfc_indirect_pak_bse_object.bo);
+
+    /* Reset the segment header kept at the start of the coded buffer. */
+    dri_bo_map(bo, 1);
+    coded_buffer_segment = (struct i965_coded_buffer_segment *)bo->virtual;
+    coded_buffer_segment->mapped = 0;
+    coded_buffer_segment->codec = encoder_context->codec;
+    dri_bo_unmap(bo);
+
+    return vaStatus;
+}
+/*
+ * Each LUT entry is a pair of 4-bit fields, (shift, base), which stands
+ * for the cost "base << shift", i.e. 2^k * x = value with k = shift and
+ * x = base.
+ * So it is necessary to convert one cost into the nearest LUT format.
+ * The derivation is:
+ *    write value = 2^n * (1 + deltaX), where n = floor(log2(value)) and
+ *    0 <= deltaX < 1, so that
+ *    2^k * x = 2^n * (1 + deltaX)
+ *    k + log2(x) = n + log2(1 + deltaX)
+ *    log2(x) = n - k + log2(1 + deltaX)
+ *    As x is in the range of [1, 15]
+ *      4 > n - k + log2(1 + deltaX) >= 0
+ *      =>    n + log2(1 + deltaX)  >= k > n - 4  + log2(1 + deltaX)
+ *    Then we can derive the candidate values of k and pick the nearest
+ *    LUT format among them.
+ */
+/*
+ * Convert a cost into the 8-bit LUT encoding (shift << 4) | base whose
+ * decoded value "base << shift" is nearest to it.
+ *
+ * value - cost to encode; values <= 0 encode as 0
+ * max   - upper limit, itself given in LUT encoding; when the encoded
+ *         result decodes to more than max does, max is returned instead
+ *
+ * Returns the LUT encoding, or max when the cost exceeds the limit.
+ */
+int intel_format_lutvalue(int value, int max)
+{
+    int ret;
+    int logvalue, temp1, temp2;
+
+    if (value <= 0)
+        return 0;
+
+    /*
+     * 15 << 15 is the largest cost a (shift, base) pair of two 4-bit
+     * fields can express.  Anything above it can only be clamped; going
+     * through the search below would need a shift above 15, which does
+     * not fit in the shift nibble and would produce a bogus encoding.
+     */
+    if (value > (15 << 15))
+        return max;
+
+    logvalue = (int)(log2f((float)value));
+    if (logvalue < 4) {
+        /* value < 16: it fits in the base nibble with shift 0 */
+        ret = value;
+    } else {
+        int error, temp_value, base, j, temp_err;
+        error = value;
+        ret = -1;
+        /* Try the four shifts that can bring the base into [1, 15]. */
+        for (j = logvalue - 4 + 1; j <= logvalue; j++) {
+            /* j >= 1 here because logvalue >= 4; round to the nearest base */
+            base = (value + (1 << (j - 1)) - 1) >> j;
+            if (base >= 16)
+                continue;
+
+            temp_value = base << j;
+            temp_err = abs(value - temp_value);
+            if (temp_err < error) {
+                error = temp_err;
+                ret = (j << 4) | base;
+                if (temp_err == 0)
+                    break;
+            }
+        }
+    }
+
+    /* Decode both encodings and clamp the result to max. */
+    temp1 = (ret & 0xf) << ((ret & 0xf0) >> 4);
+    temp2 = (max & 0xf) << ((max & 0xf0) >> 4);
+    if (temp1 > temp2)
+        ret = max;
+    return ret;
+}
+
+
+#define                QP_MAX                  52
+
+
+/*
+ * Lambda for a given QP: 2^(qp / 6 - 2) rounded to the nearest integer,
+ * with the exponent clamped so that it is never negative.
+ */
+static float intel_lambda_qp(int qp)
+{
+    float exponent = (float)qp / 6 - 2;
+
+    if (exponent < 0)
+        exponent = 0;
+
+    return roundf(powf(2, exponent));
+}
+
+
+/*
+ * Fill the mode-cost and motion-vector-cost entries of
+ * vme_context->vme_state_message for the current picture.
+ *
+ * The QP is pic_init_qp + slice_qp_delta of the first slice in CQP mode,
+ * otherwise the QpPrimeY of the rate-control context for the slice type.
+ * Every cost is lambda (see intel_lambda_qp) times a per-mode weight,
+ * converted with intel_format_lutvalue.  Nothing is written when
+ * vme_state_message is NULL.
+ */
+void intel_vme_update_mbmv_cost(VADriverContextP ctx,
+                                struct encode_state *encode_state,
+                                struct intel_encoder_context *encoder_context)
+{
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+    VAEncPictureParameterBufferH264 *pic_param = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
+    VAEncSliceParameterBufferH264 *slice_param = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
+    int qp, m_cost, j, mv_count;
+    uint8_t *vme_state_message = (uint8_t *)(vme_context->vme_state_message);
+    float   lambda, m_costf;
+
+    int slice_type = intel_avc_enc_slice_type_fixup(slice_param->slice_type);
+
+    if (encoder_context->rate_control_mode == VA_RC_CQP)
+        qp = pic_param->pic_init_qp + slice_param->slice_qp_delta;
+    else
+        qp = mfc_context->bit_rate_control_context[slice_type].QpPrimeY;
+
+    if (vme_state_message == NULL)
+        return;
+    assert(qp <= QP_MAX);
+    lambda = intel_lambda_qp(qp);
+    if (slice_type == SLICE_TYPE_I) {
+        vme_state_message[MODE_INTRA_16X16] = 0;
+        m_cost = lambda * 4;
+        vme_state_message[MODE_INTRA_8X8] = intel_format_lutvalue(m_cost, 0x8f);
+        m_cost = lambda * 16;
+        vme_state_message[MODE_INTRA_4X4] = intel_format_lutvalue(m_cost, 0x8f);
+        m_cost = lambda * 3;
+        vme_state_message[MODE_INTRA_NONPRED] = intel_format_lutvalue(m_cost, 0x6f);
+    } else {
+        /* Motion vector costs: entry 0 is free, entries 1 and 2 follow. */
+        m_cost = 0;
+        vme_state_message[MODE_INTER_MV0] = intel_format_lutvalue(m_cost, 0x6f);
+        for (j = 1; j < 3; j++) {
+            m_costf = (log2f((float)(j + 1)) + 1.718f) * lambda;
+            m_cost = (int)m_costf;
+            vme_state_message[MODE_INTER_MV0 + j] = intel_format_lutvalue(m_cost, 0x6f);
+        }
+        /* The remaining entries are computed for j = 4, 8, 16, 32 and 64. */
+        mv_count = 3;
+        for (j = 4; j <= 64; j *= 2) {
+            m_costf = (log2f((float)(j + 1)) + 1.718f) * lambda;
+            m_cost = (int)m_costf;
+            vme_state_message[MODE_INTER_MV0 + mv_count] = intel_format_lutvalue(m_cost, 0x6f);
+            mv_count++;
+        }
+
+        /* For qp <= 25 fixed mode costs are used instead of lambda-scaled ones. */
+        if (qp <= 25) {
+            vme_state_message[MODE_INTRA_16X16] = 0x4a;
+            vme_state_message[MODE_INTRA_8X8] = 0x4a;
+            vme_state_message[MODE_INTRA_4X4] = 0x4a;
+            vme_state_message[MODE_INTRA_NONPRED] = 0x4a;
+            vme_state_message[MODE_INTER_16X16] = 0x4a;
+            vme_state_message[MODE_INTER_16X8] = 0x4a;
+            vme_state_message[MODE_INTER_8X8] = 0x4a;
+            vme_state_message[MODE_INTER_8X4] = 0x4a;
+            vme_state_message[MODE_INTER_4X4] = 0x4a;
+            vme_state_message[MODE_INTER_BWD] = 0x2a;
+            return;
+        }
+        /*
+         * The result has to land in m_cost: that is the variable encoded
+         * on the next line.  Storing it in m_costf would leave m_cost
+         * holding the last motion vector cost computed above.
+         */
+        m_cost = lambda * 10;
+        vme_state_message[MODE_INTRA_16X16] = intel_format_lutvalue(m_cost, 0x8f);
+        m_cost = lambda * 14;
+        vme_state_message[MODE_INTRA_8X8] = intel_format_lutvalue(m_cost, 0x8f);
+        m_cost = lambda * 24;
+        vme_state_message[MODE_INTRA_4X4] = intel_format_lutvalue(m_cost, 0x8f);
+        m_costf = lambda * 3.5;
+        m_cost = m_costf;
+        vme_state_message[MODE_INTRA_NONPRED] = intel_format_lutvalue(m_cost, 0x6f);
+        if (slice_type == SLICE_TYPE_P) {
+            m_costf = lambda * 2.5;
+            m_cost = m_costf;
+            vme_state_message[MODE_INTER_16X16] = intel_format_lutvalue(m_cost, 0x8f);
+            m_costf = lambda * 4;
+            m_cost = m_costf;
+            vme_state_message[MODE_INTER_16X8] = intel_format_lutvalue(m_cost, 0x8f);
+            m_costf = lambda * 1.5;
+            m_cost = m_costf;
+            vme_state_message[MODE_INTER_8X8] = intel_format_lutvalue(m_cost, 0x6f);
+            m_costf = lambda * 3;
+            m_cost = m_costf;
+            vme_state_message[MODE_INTER_8X4] = intel_format_lutvalue(m_cost, 0x6f);
+            m_costf = lambda * 5;
+            m_cost = m_costf;
+            vme_state_message[MODE_INTER_4X4] = intel_format_lutvalue(m_cost, 0x6f);
+            /* BWD is not used in P-frame */
+            vme_state_message[MODE_INTER_BWD] = 0;
+        } else {
+            m_costf = lambda * 2.5;
+            m_cost = m_costf;
+            vme_state_message[MODE_INTER_16X16] = intel_format_lutvalue(m_cost, 0x8f);
+            m_costf = lambda * 5.5;
+            m_cost = m_costf;
+            vme_state_message[MODE_INTER_16X8] = intel_format_lutvalue(m_cost, 0x8f);
+            m_costf = lambda * 3.5;
+            m_cost = m_costf;
+            vme_state_message[MODE_INTER_8X8] = intel_format_lutvalue(m_cost, 0x6f);
+            m_costf = lambda * 5.0;
+            m_cost = m_costf;
+            vme_state_message[MODE_INTER_8X4] = intel_format_lutvalue(m_cost, 0x6f);
+            m_costf = lambda * 6.5;
+            m_cost = m_costf;
+            vme_state_message[MODE_INTER_4X4] = intel_format_lutvalue(m_cost, 0x6f);
+            m_costf = lambda * 1.5;
+            m_cost = m_costf;
+            vme_state_message[MODE_INTER_BWD] = intel_format_lutvalue(m_cost, 0x6f);
+        }
+    }
+}
+
+
+#define                MB_SCOREBOARD_A         (1 << 0)
+#define                MB_SCOREBOARD_B         (1 << 1)
+#define                MB_SCOREBOARD_C         (1 << 2)
+/*
+ * Program the VFE scoreboard fields of vme_context->gpe_context: a
+ * stalling scoreboard is enabled with the A, B and C dependency bits in
+ * its mask, the three dependency offsets are filled in and vfe_desc7 is
+ * cleared.
+ */
+void
+gen7_vme_scoreboard_init(VADriverContextP ctx, struct gen6_vme_context *vme_context)
+{
+    /* In VME prediction the current mb depends on the neighbour
+     * A/B/C macroblock. So the left/up/up-right dependency should
+     * be considered.
+     */
+
+    /* dependency 0: delta (-1, 0) */
+    vme_context->gpe_context.vfe_desc6.scoreboard1.delta_x0 = -1;
+    vme_context->gpe_context.vfe_desc6.scoreboard1.delta_y0 = 0;
+
+    /* dependency 1: delta (0, -1) */
+    vme_context->gpe_context.vfe_desc6.scoreboard1.delta_x1 = 0;
+    vme_context->gpe_context.vfe_desc6.scoreboard1.delta_y1 = -1;
+
+    /* dependency 2: delta (1, -1) */
+    vme_context->gpe_context.vfe_desc6.scoreboard1.delta_x2 = 1;
+    vme_context->gpe_context.vfe_desc6.scoreboard1.delta_y2 = -1;
+
+    vme_context->gpe_context.vfe_desc7.dword = 0;
+
+    /* Turn the scoreboard on for all three dependencies. */
+    vme_context->gpe_context.vfe_desc5.scoreboard0.mask =
+        MB_SCOREBOARD_A | MB_SCOREBOARD_B | MB_SCOREBOARD_C;
+    vme_context->gpe_context.vfe_desc5.scoreboard0.type = SCOREBOARD_STALLING;
+    vme_context->gpe_context.vfe_desc5.scoreboard0.enable = 1;
+}
+
+/*
+ * Check whether the macroblock at (x_index, y_index) is out of bounds.
+ *
+ * The macroblock is in bounds when it lies inside the
+ * mb_width x mb_height picture and its raster index falls into
+ * [first_mb, first_mb + num_mb).
+ *
+ * Returns 0 when the macroblock is in bounds and -1 otherwise, so
+ * callers test it as "!loop_in_bounds(...)".
+ */
+static inline int loop_in_bounds(int x_index, int y_index, int first_mb, int num_mb, int mb_width, int mb_height)
+{
+    int mb_index;
+    if (x_index < 0 || x_index >= mb_width)
+        return -1;
+    if (y_index < 0 || y_index >= mb_height)
+        return -1;
+
+    mb_index = y_index * mb_width + x_index;
+    /* first_mb + num_mb is one past the last macroblock of the range, hence ">=". */
+    if (mb_index < first_mb || mb_index >= (first_mb + num_mb))
+        return -1;
+    return 0;
+}
+
+/*
+ * Fill the VME batchbuffer with one 8-dword CMD_MEDIA_OBJECT command per
+ * macroblock of every H.264 slice, then terminate the buffer.
+ *
+ * Macroblocks are emitted along diagonals (two columns left and one row
+ * down per step). Each command carries the scoreboard dependency mask
+ * (score_dep) and the intra-prediction neighbour availability flags
+ * (mb_intra_ub) computed for that macroblock.
+ *
+ * mb_width / mb_height:    picture size in macroblocks.
+ * kernel:                  value written to the second dword of each command.
+ * transform_8x8_mode_flag: OR-ed into the last inline-data dword.
+ *
+ * NOTE(review): nothing in this function checks command_ptr against the
+ * size of the batchbuffer bo; presumably the caller sizes it - confirm.
+ */
+void
+gen7_vme_walker_fill_vme_batchbuffer(VADriverContextP ctx, 
+                                     struct encode_state *encode_state,
+                                     int mb_width, int mb_height,
+                                     int kernel,
+                                     int transform_8x8_mode_flag,
+                                     struct intel_encoder_context *encoder_context)
+{
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+    int mb_row;
+    int s;
+    unsigned int *command_ptr;
+
+#define                USE_SCOREBOARD          (1 << 21)
+    /* presumably mapped for writing (second argument 1) - confirm */
+    dri_bo_map(vme_context->vme_batchbuffer.bo, 1);
+    command_ptr = vme_context->vme_batchbuffer.bo->virtual;
+
+    for (s = 0; s < encode_state->num_slice_params_ext; s++) {
+       VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[s]->buffer;
+       int first_mb = pSliceParameter->macroblock_address;
+       int num_mb = pSliceParameter->num_macroblocks;
+       unsigned int mb_intra_ub, score_dep;
+       int x_outer, y_outer, x_inner, y_inner;
+       int xtemp_outer = 0;
+
+       x_outer = first_mb % mb_width;
+       y_outer = first_mb / mb_width;
+       /* first row of the slice: macroblocks on it get no B/C/D flags below */
+       mb_row = y_outer;
+                                
+       /* Pass 1: start one diagonal at each column below mb_width - 2 on
+        * the slice's first row.
+        */
+       for (; x_outer < (mb_width -2 ) && !loop_in_bounds(x_outer, y_outer, first_mb, num_mb, mb_width, mb_height); ) {
+           x_inner = x_outer;
+           y_inner = y_outer;
+           /* walk the diagonal until it leaves the picture or the slice */
+           for (; !loop_in_bounds(x_inner, y_inner, first_mb, num_mb, mb_width, mb_height);) {
+               mb_intra_ub = 0;
+               score_dep = 0;
+               /* a left neighbour exists everywhere except in column 0 */
+               if (x_inner != 0) {
+                   mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
+                   score_dep |= MB_SCOREBOARD_A; 
+               }
+               /* upper neighbours are only used below the slice's first row */
+               if (y_inner != mb_row) {
+                   mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
+                   score_dep |= MB_SCOREBOARD_B;
+                   if (x_inner != 0)
+                       mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
+                   if (x_inner != (mb_width -1)) {
+                       mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
+                       score_dep |= MB_SCOREBOARD_C;
+                    }
+               }
+                                                       
+               /* 8 dwords per macroblock; length field is (8 - 2) */
+               *command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
+               *command_ptr++ = kernel;
+               *command_ptr++ = USE_SCOREBOARD;
+               /* Indirect data */
+               *command_ptr++ = 0;
+               /* the (X, Y) term of scoreboard */
+               *command_ptr++ = ((y_inner << 16) | x_inner);
+               *command_ptr++ = score_dep;
+               /*inline data */
+               /* x and y are packed 8 bits apart here, so this presumably
+                * assumes fewer than 256 macroblocks per dimension - confirm
+                */
+               *command_ptr++ = (mb_width << 16 | y_inner << 8 | x_inner);
+               *command_ptr++ = ((1 << 18) | (1 << 16) | transform_8x8_mode_flag | (mb_intra_ub << 8));
+               /* next macroblock on the diagonal */
+               x_inner -= 2;
+               y_inner += 1;
+           }
+           x_outer += 1;
+       }
+
+       /* Pass 2: start diagonals from column max(mb_width - 2, 0); once
+        * x_outer runs past the last column, move one row down and restart
+        * from that column.
+        */
+       xtemp_outer = mb_width - 2;
+       if (xtemp_outer < 0)
+            xtemp_outer = 0;
+       x_outer = xtemp_outer;
+       y_outer = first_mb / mb_width;
+       for (;!loop_in_bounds(x_outer, y_outer, first_mb, num_mb, mb_width, mb_height); ) { 
+           y_inner = y_outer;
+           x_inner = x_outer;
+           for (; !loop_in_bounds(x_inner, y_inner, first_mb, num_mb, mb_width, mb_height);) {
+               mb_intra_ub = 0;
+               score_dep = 0;
+               if (x_inner != 0) {
+                   mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
+                   score_dep |= MB_SCOREBOARD_A; 
+               }
+               if (y_inner != mb_row) {
+                   mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
+                   score_dep |= MB_SCOREBOARD_B;
+                   if (x_inner != 0)
+                       mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
+
+                   if (x_inner != (mb_width -1)) {
+                       mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
+                       score_dep |= MB_SCOREBOARD_C;
+                    }
+               }
+
+               *command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
+               *command_ptr++ = kernel;
+               *command_ptr++ = USE_SCOREBOARD;
+               /* Indirect data */
+               *command_ptr++ = 0;
+               /* the (X, Y) term of scoreboard */
+               *command_ptr++ = ((y_inner << 16) | x_inner);
+               *command_ptr++ = score_dep;
+               /*inline data */
+               *command_ptr++ = (mb_width << 16 | y_inner << 8 | x_inner);
+               *command_ptr++ = ((1 << 18) | (1 << 16) | transform_8x8_mode_flag | (mb_intra_ub << 8));
+
+               x_inner -= 2;
+               y_inner += 1;
+           }
+           x_outer++;
+           if (x_outer >= mb_width) {
+               y_outer += 1;
+               x_outer = xtemp_outer;
+           }           
+       }
+    }
+
+    /* a zero dword followed by MI_BATCH_BUFFER_END closes the buffer */
+    *command_ptr++ = 0;
+    *command_ptr++ = MI_BATCH_BUFFER_END;
+
+    dri_bo_unmap(vme_context->vme_batchbuffer.bo);
+}
+
+/*
+ * Pack one reference-list entry byte from a VA picture and the index of its
+ * frame store:
+ *   bit 6  - long-term reference flag
+ *   bit 5  - set when the top-field and bottom-field flags are equal
+ *            (both set or both clear)
+ *   bit 1+ - frame_store_id
+ *   bit 0  - set when only the bottom-field flag is present
+ * The result is truncated to 8 bits by the return type.
+ */
+static uint8_t
+intel_get_ref_idx_state_1(VAPictureH264 *va_pic, unsigned int frame_store_id)
+{
+    unsigned int long_term = 0;
+    unsigned int top = 0;
+    unsigned int bottom = 0;
+    unsigned int entry;
+
+    if (va_pic->flags & VA_PICTURE_H264_LONG_TERM_REFERENCE)
+        long_term = 1;
+    if (va_pic->flags & VA_PICTURE_H264_TOP_FIELD)
+        top = 1;
+    if (va_pic->flags & VA_PICTURE_H264_BOTTOM_FIELD)
+        bottom = 1;
+
+    entry = long_term << 6;
+    if (top == bottom)
+        entry |= 1u << 5;
+    entry |= frame_store_id << 1;
+    if (!top && bottom)
+        entry |= 1u;
+
+    return entry;
+}
+
+/*
+ * Emit the two MFX_AVC_REF_IDX_STATE commands (list 0, then list 1) into
+ * the BCS batch.
+ *
+ * Every entry defaults to 0x80. For P and B slices the list-0 entry at the
+ * reference index chosen by the VME stage (low byte of ref_index_in_mb[0])
+ * is replaced by the state byte of the used reference; B slices do the same
+ * for list 1. The reference must be found among the first 16
+ * encode_state->reference_objects; otherwise a warning is printed and the
+ * default entry is kept.
+ *
+ * The slice type is taken from the first slice parameter buffer only.
+ */
+void
+intel_mfc_avc_ref_idx_state(VADriverContextP ctx,
+                            struct encode_state *encode_state,
+                            struct intel_encoder_context *encoder_context)
+{
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+    struct intel_batchbuffer *batch = encoder_context->base.batch;
+    int slice_type;
+    struct object_surface *obj_surface;
+    unsigned int fref_entry, bref_entry;
+    int frame_index, i;
+    VAEncSliceParameterBufferH264 *slice_param = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
+
+    fref_entry = 0x80808080;
+    bref_entry = 0x80808080;
+    slice_type = intel_avc_enc_slice_type_fixup(slice_param->slice_type);
+
+    if (slice_type == SLICE_TYPE_P || slice_type == SLICE_TYPE_B) {
+        int ref_idx_l0 = (vme_context->ref_index_in_mb[0] & 0xff);
+
+        /* only the four entries of the first dword are programmable here */
+        if (ref_idx_l0 > 3) {
+            WARN_ONCE("ref_idx_l0 is out of range\n");
+            ref_idx_l0 = 0;
+        }
+
+        obj_surface = vme_context->used_reference_objects[0];
+        frame_index = -1;
+        for (i = 0; i < 16; i++) {
+            if (obj_surface &&
+                obj_surface == encode_state->reference_objects[i]) {
+                frame_index = i;
+                break;
+            }
+        }
+        if (frame_index == -1) {
+            WARN_ONCE("RefPicList0 is not found in DPB!\n");
+        } else {
+            int ref_idx_l0_shift = ref_idx_l0 * 8;
+            /* unsigned operands: shifting a signed 0xFF by 24 would
+             * overflow int, which is undefined behaviour
+             */
+            fref_entry &= ~(0xFFu << ref_idx_l0_shift);
+            fref_entry |= ((unsigned int)intel_get_ref_idx_state_1(vme_context->used_references[0], frame_index) << ref_idx_l0_shift);
+        }
+    }
+
+    if (slice_type == SLICE_TYPE_B) {
+        int ref_idx_l1 = (vme_context->ref_index_in_mb[1] & 0xff);
+
+        if (ref_idx_l1 > 3) {
+            WARN_ONCE("ref_idx_l1 is out of range\n");
+            ref_idx_l1 = 0;
+        }
+
+        obj_surface = vme_context->used_reference_objects[1];
+        frame_index = -1;
+        for (i = 0; i < 16; i++) {
+            if (obj_surface &&
+                obj_surface == encode_state->reference_objects[i]) {
+                frame_index = i;
+                break;
+            }
+        }
+        if (frame_index == -1) {
+            WARN_ONCE("RefPicList1 is not found in DPB!\n");
+        } else {
+            int ref_idx_l1_shift = ref_idx_l1 * 8;
+            bref_entry &= ~(0xFFu << ref_idx_l1_shift);
+            bref_entry |= ((unsigned int)intel_get_ref_idx_state_1(vme_context->used_references[1], frame_index) << ref_idx_l1_shift);
+        }
+    }
+
+    /* 10 dwords: header, list select, 8 dwords of entries */
+    BEGIN_BCS_BATCH(batch, 10);
+    OUT_BCS_BATCH(batch, MFX_AVC_REF_IDX_STATE | 8);
+    OUT_BCS_BATCH(batch, 0);                  //Select L0
+    OUT_BCS_BATCH(batch, fref_entry);         //Only 1 reference
+    for(i = 0; i < 7; i++) {
+        OUT_BCS_BATCH(batch, 0x80808080);
+    }
+    ADVANCE_BCS_BATCH(batch);
+
+    BEGIN_BCS_BATCH(batch, 10);
+    OUT_BCS_BATCH(batch, MFX_AVC_REF_IDX_STATE | 8);
+    OUT_BCS_BATCH(batch, 1);                  //Select L1
+    OUT_BCS_BATCH(batch, bref_entry);         //Only 1 reference
+    for(i = 0; i < 7; i++) {
+        OUT_BCS_BATCH(batch, 0x80808080);
+    }
+    ADVANCE_BCS_BATCH(batch);
+}
+
+
+/*
+ * Fill the VME state message for MPEG-2 encoding: the motion-vector range
+ * for the configured level, the picture size in macroblocks and, for
+ * non-intra pictures, the mode/MV cost table derived from the quantiser
+ * scale code of the first slice.
+ */
+void intel_vme_mpeg2_state_setup(VADriverContextP ctx,
+                                 struct encode_state *encode_state,
+                                 struct intel_encoder_context *encoder_context)
+{
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+    uint32_t *vme_state_message = (uint32_t *)(vme_context->vme_state_message);
+    VAEncSequenceParameterBufferMPEG2 *seq_param = (VAEncSequenceParameterBufferMPEG2 *)encode_state->seq_param_ext->buffer;
+    int width_in_mbs = ALIGN(seq_param->picture_width, 16) / 16;
+    int height_in_mbs = ALIGN(seq_param->picture_height, 16) / 16;
+    VAEncSliceParameterBufferMPEG2 *slice_param =
+        (VAEncSliceParameterBufferMPEG2 *)encode_state->slice_params_ext[0]->buffer;
+    VAEncPictureParameterBufferMPEG2 *pic_param = NULL;
+    uint32_t mv_x, mv_y;
+
+    /* The low-level range doubles as the fallback for an unknown level. */
+    mv_x = 512;
+    mv_y = 64;
+    if (vme_context->mpeg2_level == MPEG2_LEVEL_LOW) {
+        /* keep the defaults */
+    } else if (vme_context->mpeg2_level == MPEG2_LEVEL_MAIN) {
+        mv_x = 1024;
+        mv_y = 128;
+    } else if (vme_context->mpeg2_level == MPEG2_LEVEL_HIGH) {
+        mv_x = 2048;
+        mv_y = 128;
+    } else {
+        WARN_ONCE("Incorrect Mpeg2 level setting!\n");
+    }
+
+    pic_param = (VAEncPictureParameterBufferMPEG2 *)encode_state->pic_param_ext->buffer;
+    if (pic_param->picture_type != VAEncPictureTypeIntra) {
+        float lambda = intel_lambda_qp(slice_param->quantiser_scale_code);
+        float cost_f;
+        int cost, distance, slot;
+
+        /* No Intra prediction. So it is zero */
+        vme_state_message[MODE_INTRA_8X8] = 0;
+        vme_state_message[MODE_INTRA_4X4] = 0;
+        vme_state_message[MODE_INTER_MV0] = 0;
+
+        /* MV cost slots 1..7 cover the distances 1, 2, 4, ..., 64. */
+        for (slot = 1, distance = 1; distance <= 64; distance *= 2, slot++) {
+            cost_f = (log2f((float)(distance + 1)) + 1.718f) * lambda;
+            cost = (int)cost_f;
+            vme_state_message[MODE_INTER_MV0 + slot] = intel_format_lutvalue(cost, 0x6f);
+        }
+
+        /* Only the 16x16 search is performed, so the partitioned modes
+         * (16x8, 8x8, ...) carry no cost.
+         */
+        cost = (int)lambda;
+        vme_state_message[MODE_INTRA_16X16] = intel_format_lutvalue(cost, 0x8f);
+        vme_state_message[MODE_INTER_16X16] = intel_format_lutvalue(cost, 0x8f);
+
+        vme_state_message[MODE_INTER_16X8] = 0;
+        vme_state_message[MODE_INTER_8X8] = 0;
+        vme_state_message[MODE_INTER_8X4] = 0;
+        vme_state_message[MODE_INTER_4X4] = 0;
+        vme_state_message[MODE_INTER_BWD] = intel_format_lutvalue(cost, 0x6f);
+    }
+
+    vme_state_message[MPEG2_MV_RANGE] = (mv_y << 16) | (mv_x);
+    vme_state_message[MPEG2_PIC_WIDTH_HEIGHT] = (height_in_mbs << 16) |
+        width_in_mbs;
+}
+
+/*
+ * Fill the VME batchbuffer with one 8-dword CMD_MEDIA_OBJECT command per
+ * macroblock of an MPEG-2 picture, then terminate the buffer.
+ *
+ * The whole picture is walked as a single range (first_mb = 0,
+ * num_mb = mb_width * mb_height) along diagonals: two columns left and one
+ * row down per step. Each command carries the scoreboard dependency mask
+ * (score_dep) and the neighbour availability flags (mb_intra_ub) of its
+ * macroblock.
+ *
+ * encode_state is not referenced by this function.
+ *
+ * NOTE(review): nothing here checks command_ptr against the size of the
+ * batchbuffer bo; presumably the caller sizes it - confirm.
+ */
+void
+gen7_vme_mpeg2_walker_fill_vme_batchbuffer(VADriverContextP ctx, 
+                                           struct encode_state *encode_state,
+                                           int mb_width, int mb_height,
+                                           int kernel,
+                                           struct intel_encoder_context *encoder_context)
+{
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+    unsigned int *command_ptr;
+
+#define                MPEG2_SCOREBOARD                (1 << 21)
+
+    /* presumably mapped for writing (second argument 1) - confirm */
+    dri_bo_map(vme_context->vme_batchbuffer.bo, 1);
+    command_ptr = vme_context->vme_batchbuffer.bo->virtual;
+
+    {
+       unsigned int mb_intra_ub, score_dep;
+       int x_outer, y_outer, x_inner, y_inner;
+       int xtemp_outer = 0;
+       int first_mb = 0;
+       int num_mb = mb_width * mb_height;
+
+       x_outer = 0;
+       y_outer = 0;
+       
+                                
+       /* Pass 1: start one diagonal at each column below mb_width - 2 on
+        * the top row.
+        */
+       for (; x_outer < (mb_width -2 ) && !loop_in_bounds(x_outer, y_outer, first_mb, num_mb, mb_width, mb_height); ) {
+           x_inner = x_outer;
+           y_inner = y_outer;
+           /* walk the diagonal until it leaves the picture */
+           for (; !loop_in_bounds(x_inner, y_inner, first_mb, num_mb, mb_width, mb_height);) {
+               mb_intra_ub = 0;
+               score_dep = 0;
+               /* a left neighbour exists everywhere except in column 0 */
+               if (x_inner != 0) {
+                   mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
+                   score_dep |= MB_SCOREBOARD_A; 
+               }
+               /* upper neighbours exist everywhere except on row 0 */
+               if (y_inner != 0) {
+                   mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
+                   score_dep |= MB_SCOREBOARD_B;
+
+                   if (x_inner != 0)
+                       mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
+
+                   if (x_inner != (mb_width -1)) {
+                       mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
+                       score_dep |= MB_SCOREBOARD_C;
+                   }
+               }
+                                                       
+               /* 8 dwords per macroblock; length field is (8 - 2) */
+               *command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
+               *command_ptr++ = kernel;
+               *command_ptr++ = MPEG2_SCOREBOARD;
+               /* Indirect data */
+               *command_ptr++ = 0;
+               /* the (X, Y) term of scoreboard */
+               *command_ptr++ = ((y_inner << 16) | x_inner);
+               *command_ptr++ = score_dep;
+               /*inline data */
+               /* x and y are packed 8 bits apart here, so this presumably
+                * assumes fewer than 256 macroblocks per dimension - confirm
+                */
+               *command_ptr++ = (mb_width << 16 | y_inner << 8 | x_inner);
+               *command_ptr++ = ((1 << 18) | (1 << 16) | (mb_intra_ub << 8));
+               /* next macroblock on the diagonal */
+               x_inner -= 2;
+               y_inner += 1;
+           }
+           x_outer += 1;
+       }
+
+       /* Pass 2: start diagonals from column max(mb_width - 2, 0); once
+        * x_outer runs past the last column, move one row down and restart
+        * from that column.
+        */
+       xtemp_outer = mb_width - 2;
+       if (xtemp_outer < 0)
+            xtemp_outer = 0;
+       x_outer = xtemp_outer;
+       y_outer = 0;
+       for (;!loop_in_bounds(x_outer, y_outer, first_mb, num_mb, mb_width, mb_height); ) { 
+           y_inner = y_outer;
+           x_inner = x_outer;
+           for (; !loop_in_bounds(x_inner, y_inner, first_mb, num_mb, mb_width, mb_height);) {
+               mb_intra_ub = 0;
+               score_dep = 0;
+               if (x_inner != 0) {
+                   mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
+                   score_dep |= MB_SCOREBOARD_A; 
+               }
+               if (y_inner != 0) {
+                   mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
+                   score_dep |= MB_SCOREBOARD_B;
+
+                   if (x_inner != 0)
+                       mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
+
+                   if (x_inner != (mb_width -1)) {
+                       mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
+                       score_dep |= MB_SCOREBOARD_C;
+                   }
+               }
+
+               *command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
+               *command_ptr++ = kernel;
+               *command_ptr++ = MPEG2_SCOREBOARD;
+               /* Indirect data */
+               *command_ptr++ = 0;
+               /* the (X, Y) term of scoreboard */
+               *command_ptr++ = ((y_inner << 16) | x_inner);
+               *command_ptr++ = score_dep;
+               /*inline data */
+               *command_ptr++ = (mb_width << 16 | y_inner << 8 | x_inner);
+               *command_ptr++ = ((1 << 18) | (1 << 16) | (mb_intra_ub << 8));
+
+               x_inner -= 2;
+               y_inner += 1;
+           }
+           x_outer++;
+           if (x_outer >= mb_width) {
+               y_outer += 1;
+               x_outer = xtemp_outer;
+           }           
+       }
+    }
+
+    /* a zero dword followed by MI_BATCH_BUFFER_END closes the buffer */
+    *command_ptr++ = 0;
+    *command_ptr++ = MI_BATCH_BUFFER_END;
+
+    dri_bo_unmap(vme_context->vme_batchbuffer.bo);
+    return;
+}
+
+/*
+ * Find the entry of ref_list that is temporally closest to curr_pic in one
+ * direction, comparing TopFieldOrderCnt values.
+ *
+ * dir == 0 selects entries with a smaller order count than curr_pic,
+ * dir != 0 entries with a larger one. Scanning stops at the first invalid
+ * entry. Returns the index of the closest entry, or -1 when none qualifies.
+ */
+static int
+avc_temporal_find_surface(VAPictureH264 *curr_pic,
+                          VAPictureH264 *ref_list,
+                          int num_pictures,
+                          int dir)
+{
+    int best_index = -1;
+    int best_distance = 0x7FFFFFFF;
+    int idx;
+
+    for (idx = 0; idx < num_pictures; idx++) {
+        VAPictureH264 *candidate = &ref_list[idx];
+        int distance;
+
+        /* the first unusable entry ends the list */
+        if (candidate->flags & VA_PICTURE_H264_INVALID)
+            break;
+        if (candidate->picture_id == VA_INVALID_SURFACE)
+            break;
+
+        distance = curr_pic->TopFieldOrderCnt - candidate->TopFieldOrderCnt;
+        if (dir)
+            distance = -distance;
+
+        /* wrong direction, same picture, or not closer than the best so far */
+        if (distance <= 0 || distance >= best_distance)
+            continue;
+
+        best_distance = distance;
+        best_index = idx;
     }
+
+    return best_index;
 }
 
+/*
+ * Select the reference surface the VME stage uses for one reference list
+ * and bind it through the vme_source_surface_state callback.
+ *
+ * list_index:    0 for RefPicList0, otherwise RefPicList1.
+ * surface_index: passed unchanged to vme_source_surface_state.
+ *
+ * With a single active reference, entry 0 of the list is used; if it has
+ * no usable surface, encode_state->reference_objects[list_index] is used
+ * instead. With several active references the temporally closest entry is
+ * chosen by avc_temporal_find_surface().
+ *
+ * On success used_reference_objects[], used_references[] and
+ * ref_index_in_mb[] of the VME context are updated for list_index; when no
+ * usable surface exists they are reset to NULL / NULL / 0.
+ * The list lengths come from the picture parameters and the lists from the
+ * first slice parameter buffer.
+ */
+void
+intel_avc_vme_reference_state(VADriverContextP ctx,
+                              struct encode_state *encode_state,
+                              struct intel_encoder_context *encoder_context,
+                              int list_index,
+                              int surface_index,
+                              void (* vme_source_surface_state)(
+                                  VADriverContextP ctx,
+                                  int index,
+                                  struct object_surface *obj_surface,
+                                  struct intel_encoder_context *encoder_context))
+{
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+    struct object_surface *obj_surface = NULL;
+    /* presumably referenced by the SURFACE() macro - confirm */
+    struct i965_driver_data *i965 = i965_driver_data(ctx);
+    VASurfaceID ref_surface_id = VA_INVALID_SURFACE;
+    VAEncPictureParameterBufferH264 *pic_param = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
+    VAEncSliceParameterBufferH264 *slice_param = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
+    int max_num_references;
+    VAPictureH264 *curr_pic;
+    VAPictureH264 *ref_list;
+    int ref_idx;
+
+    if (list_index == 0) {
+        max_num_references = pic_param->num_ref_idx_l0_active_minus1 + 1;
+        ref_list = slice_param->RefPicList0;
+    } else {
+        max_num_references = pic_param->num_ref_idx_l1_active_minus1 + 1;
+        ref_list = slice_param->RefPicList1;
+    }
+
+    if (max_num_references == 1) {
+        if (list_index == 0) {
+            ref_surface_id = slice_param->RefPicList0[0].picture_id;
+            vme_context->used_references[0] = &slice_param->RefPicList0[0];
+        } else {
+            ref_surface_id = slice_param->RefPicList1[0].picture_id;
+            vme_context->used_references[1] = &slice_param->RefPicList1[0];
+        }
+
+        if (ref_surface_id != VA_INVALID_SURFACE)
+            obj_surface = SURFACE(ref_surface_id);
+
+        /* fall back to the DPB entry when the list entry has no storage */
+        if (!obj_surface ||
+            !obj_surface->bo) {
+            obj_surface = encode_state->reference_objects[list_index];
+            vme_context->used_references[list_index] = &pic_param->ReferenceFrames[list_index];
+        }
+
+        ref_idx = 0;
+    } else {
+        curr_pic = &pic_param->CurrPic;
+
+        /* select the reference frame in temporal space */
+        ref_idx = avc_temporal_find_surface(curr_pic, ref_list, max_num_references, list_index == 1);
+
+        /* avc_temporal_find_surface() returns -1 when no entry qualifies;
+         * ref_list must not be indexed with it. obj_surface then stays NULL
+         * and the reset path below is taken.
+         */
+        if (ref_idx >= 0) {
+            ref_surface_id = ref_list[ref_idx].picture_id;
+
+            if (ref_surface_id != VA_INVALID_SURFACE) /* otherwise warning later */
+                obj_surface = SURFACE(ref_surface_id);
+
+            vme_context->used_references[list_index] = &ref_list[ref_idx];
+        }
+
+        vme_context->used_reference_objects[list_index] = obj_surface;
+    }
+
+    if (obj_surface &&
+        obj_surface->bo) {
+        assert(ref_idx >= 0);
+        vme_context->used_reference_objects[list_index] = obj_surface;
+        vme_source_surface_state(ctx, surface_index, obj_surface, encoder_context);
+        /* the same reference index is replicated into all four bytes */
+        vme_context->ref_index_in_mb[list_index] = (ref_idx << 24 |
+                                                    ref_idx << 16 |
+                                                    ref_idx <<  8 |
+                                                    ref_idx);
+    } else {
+        vme_context->used_reference_objects[list_index] = NULL;
+        vme_context->used_references[list_index] = NULL;
+        vme_context->ref_index_in_mb[list_index] = 0;
+    }
+}
+
+/*
+ * Insert the packed data belonging to one slice into slice_batch through
+ * mfc_context->insert_object().
+ *
+ * First every packed raw-data buffer attached to the slice is inserted,
+ * except buffers of type VAEncPackedHeaderSlice. The slice header is then
+ * inserted last: the application-supplied packed slice header when
+ * encode_state->slice_header_index[slice_index] is non-zero, otherwise a
+ * header generated by build_avc_slice_header().
+ *
+ * slice_index: index of the slice within the current picture's slice
+ *              parameter / packed-data bookkeeping arrays.
+ */
+void intel_avc_slice_insert_packed_data(VADriverContextP ctx,
+                                        struct encode_state *encode_state,
+                                        struct intel_encoder_context *encoder_context,
+                                        int slice_index,
+                                        struct intel_batchbuffer *slice_batch)
+{
+    int count, i, start_index;
+    unsigned int length_in_bits;
+    VAEncPackedHeaderParameterBuffer *param = NULL;
+    unsigned int *header_data = NULL;
+    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
+    int slice_header_index;
+
+    /* a stored index of 0 means no packed slice header was supplied */
+    if (encode_state->slice_header_index[slice_index] == 0)
+        slice_header_index = -1;
+    else
+        slice_header_index = (encode_state->slice_header_index[slice_index] & SLICE_PACKED_DATA_INDEX_MASK);
+
+    count = encode_state->slice_rawdata_count[slice_index];
+    start_index = (encode_state->slice_rawdata_index[slice_index] & SLICE_PACKED_DATA_INDEX_MASK);
+
+    for (i = 0; i < count; i++) {
+        unsigned int skip_emul_byte_cnt;
+
+        header_data = (unsigned int *)encode_state->packed_header_data_ext[start_index + i]->buffer;
+
+        param = (VAEncPackedHeaderParameterBuffer *)
+                    (encode_state->packed_header_params_ext[start_index + i]->buffer);
+
+        /* skip the slice header packed data type as it is lastly inserted */
+        if (param->type == VAEncPackedHeaderSlice)
+            continue;
+
+        length_in_bits = param->bit_length;
+
+        skip_emul_byte_cnt = intel_avc_find_skipemulcnt((unsigned char *)header_data, length_in_bits);
+
+        /* as the slice header is still required, the last header flag is set to
+         * zero.
+         */
+        mfc_context->insert_object(ctx,
+                                   encoder_context,
+                                   header_data,
+                                   ALIGN(length_in_bits, 32) >> 5,   /* length in dwords */
+                                   length_in_bits & 0x1f,            /* bits used in the last dword */
+                                   skip_emul_byte_cnt,
+                                   0,
+                                   0,
+                                   !param->has_emulation_bytes,
+                                   slice_batch);
+    }
+
+    if (slice_header_index == -1) {
+        unsigned char *slice_header = NULL;
+        int slice_header_length_in_bits = 0;
+        VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
+        VAEncPictureParameterBufferH264 *pPicParameter = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
+        VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[slice_index]->buffer;
+
+        /* No slice header data is passed. And the driver needs to generate it */
+        /* For the Normal H264 */
+        slice_header_length_in_bits = build_avc_slice_header(pSequenceParameter,
+                                                             pPicParameter,
+                                                             pSliceParameter,
+                                                             &slice_header);
+        mfc_context->insert_object(ctx, encoder_context,
+                                   (unsigned int *)slice_header,
+                                   ALIGN(slice_header_length_in_bits, 32) >> 5,
+                                   slice_header_length_in_bits & 0x1f,
+                                   5,  /* first 5 bytes are start code + nal unit type */
+                                   1, 0, 1, slice_batch);
+
+        /* the generated header buffer is released here after insertion */
+        free(slice_header);
+    } else {
+        unsigned int skip_emul_byte_cnt;
+
+        /* Use the recorded packed slice header index. After the loop above
+         * i equals count, so start_index + i would address whatever buffer
+         * follows this slice's raw data rather than the slice header.
+         */
+        header_data = (unsigned int *)encode_state->packed_header_data_ext[slice_header_index]->buffer;
+
+        param = (VAEncPackedHeaderParameterBuffer *)
+                    (encode_state->packed_header_params_ext[slice_header_index]->buffer);
+        length_in_bits = param->bit_length;
+
+        /* as the slice header is the last header data for one slice,
+         * the last header flag is set to one.
+         */
+        skip_emul_byte_cnt = intel_avc_find_skipemulcnt((unsigned char *)header_data, length_in_bits);
+
+        mfc_context->insert_object(ctx,
+                                   encoder_context,
+                                   header_data,
+                                   ALIGN(length_in_bits, 32) >> 5,
+                                   length_in_bits & 0x1f,
+                                   skip_emul_byte_cnt,
+                                   1,
+                                   0,
+                                   !param->has_emulation_bytes,
+                                   slice_batch);
+    }
+
+    return;
+}