Follow the spec to make BDW encoding media pipeline command support 48-bit addressing...
[platform/upstream/libva-intel-driver.git] / src / gen8_vme.c
index 45c01f4..5369b31 100644 (file)
@@ -123,9 +123,9 @@ static struct i965_kernel gen8_vme_mpeg2_kernels[] = {
 /* only used for VME source surface state */
 static void 
 gen8_vme_source_surface_state(VADriverContextP ctx,
-                               int index,
-                               struct object_surface *obj_surface,
-                               struct intel_encoder_context *encoder_context)
+                              int index,
+                              struct object_surface *obj_surface,
+                              struct intel_encoder_context *encoder_context)
 {
     struct gen6_vme_context *vme_context = encoder_context->vme_context;
 
@@ -138,9 +138,9 @@ gen8_vme_source_surface_state(VADriverContextP ctx,
 
 static void
 gen8_vme_media_source_surface_state(VADriverContextP ctx,
-                                     int index,
-                                     struct object_surface *obj_surface,
-                                     struct intel_encoder_context *encoder_context)
+                                    int index,
+                                    struct object_surface *obj_surface,
+                                    struct intel_encoder_context *encoder_context)
 {
     struct gen6_vme_context *vme_context = encoder_context->vme_context;
 
@@ -153,9 +153,9 @@ gen8_vme_media_source_surface_state(VADriverContextP ctx,
 
 static void
 gen8_vme_media_chroma_source_surface_state(VADriverContextP ctx,
-                                            int index,
-                                            struct object_surface *obj_surface,
-                                            struct intel_encoder_context *encoder_context)
+                                           int index,
+                                           struct object_surface *obj_surface,
+                                           struct intel_encoder_context *encoder_context)
 {
     struct gen6_vme_context *vme_context = encoder_context->vme_context;
 
@@ -168,9 +168,9 @@ gen8_vme_media_chroma_source_surface_state(VADriverContextP ctx,
 
 static void
 gen8_vme_output_buffer_setup(VADriverContextP ctx,
-                              struct encode_state *encode_state,
-                              int index,
-                              struct intel_encoder_context *encoder_context)
+                             struct encode_state *encode_state,
+                             int index,
+                             struct intel_encoder_context *encoder_context)
 
 {
     struct i965_driver_data *i965 = i965_driver_data(ctx);
@@ -208,9 +208,9 @@ gen8_vme_output_buffer_setup(VADriverContextP ctx,
 
 static void
 gen8_vme_output_vme_batchbuffer_setup(VADriverContextP ctx,
-                                       struct encode_state *encode_state,
-                                       int index,
-                                       struct intel_encoder_context *encoder_context)
+                                      struct encode_state *encode_state,
+                                      int index,
+                                      struct intel_encoder_context *encoder_context)
 
 {
     struct i965_driver_data *i965 = i965_driver_data(ctx);
@@ -226,21 +226,22 @@ gen8_vme_output_vme_batchbuffer_setup(VADriverContextP ctx,
                                                    "VME batchbuffer",
                                                    vme_context->vme_batchbuffer.num_blocks * vme_context->vme_batchbuffer.size_block,
                                                    0x1000);
+       /*
     vme_context->vme_buffer_suface_setup(ctx,
                                          &vme_context->gpe_context,
                                          &vme_context->vme_batchbuffer,
                                          BINDING_TABLE_OFFSET(index),
                                          SURFACE_STATE_OFFSET(index));
+       */
 }
 
 static VAStatus
 gen8_vme_surface_setup(VADriverContextP ctx, 
-                        struct encode_state *encode_state,
-                        int is_intra,
-                        struct intel_encoder_context *encoder_context)
+                       struct encode_state *encode_state,
+                       int is_intra,
+                       struct intel_encoder_context *encoder_context)
 {
     struct object_surface *obj_surface;
-    struct i965_driver_data *i965 = i965_driver_data(ctx);
 
     /*Setup surfaces state*/
     /* current picture for encoding */
@@ -252,43 +253,14 @@ gen8_vme_surface_setup(VADriverContextP ctx,
     if (!is_intra) {
        VAEncSliceParameterBufferH264 *slice_param = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
        int slice_type;
-       struct object_surface *slice_obj_surface;
-       int ref_surface_id;
 
        slice_type = intel_avc_enc_slice_type_fixup(slice_param->slice_type);
+       assert(slice_type != SLICE_TYPE_I && slice_type != SLICE_TYPE_SI);
 
-       if (slice_type == SLICE_TYPE_P || slice_type == SLICE_TYPE_B) {
-               slice_obj_surface = NULL;
-               ref_surface_id = slice_param->RefPicList0[0].picture_id;
-               if (ref_surface_id != 0 && ref_surface_id != VA_INVALID_SURFACE) {
-                       slice_obj_surface = SURFACE(ref_surface_id);
-               }
-               if (slice_obj_surface && slice_obj_surface->bo) {
-                       obj_surface = slice_obj_surface;
-               } else {
-                       obj_surface = encode_state->reference_objects[0];
-               }
-               /* reference 0 */
-               if (obj_surface && obj_surface->bo)
-                       gen8_vme_source_surface_state(ctx, 1, obj_surface, encoder_context);
-       }
-       if (slice_type == SLICE_TYPE_B) {
-               /* reference 1 */
-               slice_obj_surface = NULL;
-               ref_surface_id = slice_param->RefPicList1[0].picture_id;
-               if (ref_surface_id != 0 && ref_surface_id != VA_INVALID_SURFACE) {
-                       slice_obj_surface = SURFACE(ref_surface_id);
-               }
-               if (slice_obj_surface && slice_obj_surface->bo) {
-                       obj_surface = slice_obj_surface;
-               } else {
-                       obj_surface = encode_state->reference_objects[0];
-               }
+       intel_avc_vme_reference_state(ctx, encode_state, encoder_context, 0, 1, gen8_vme_source_surface_state);
 
-               obj_surface = encode_state->reference_objects[1];
-               if (obj_surface && obj_surface->bo)
-                       gen8_vme_source_surface_state(ctx, 2, obj_surface, encoder_context);
-       }
+       if (slice_type == SLICE_TYPE_B)
+            intel_avc_vme_reference_state(ctx, encode_state, encoder_context, 1, 2, gen8_vme_source_surface_state);
     }
 
     /* VME output */
@@ -299,18 +271,21 @@ gen8_vme_surface_setup(VADriverContextP ctx,
 }
 
 static VAStatus gen8_vme_interface_setup(VADriverContextP ctx, 
-                                          struct encode_state *encode_state,
-                                          struct intel_encoder_context *encoder_context)
+                                         struct encode_state *encode_state,
+                                         struct intel_encoder_context *encoder_context)
 {
     struct gen6_vme_context *vme_context = encoder_context->vme_context;
     struct gen8_interface_descriptor_data *desc;   
     int i;
     dri_bo *bo;
+    unsigned char *desc_ptr;
 
-    bo = vme_context->gpe_context.idrt.bo;
+    bo = vme_context->gpe_context.dynamic_state.bo;
     dri_bo_map(bo, 1);
     assert(bo->virtual);
-    desc = bo->virtual;
+    desc_ptr = (unsigned char *)bo->virtual + vme_context->gpe_context.idrt_offset;
+
+    desc = (struct gen8_interface_descriptor_data *)desc_ptr;
 
     for (i = 0; i < vme_context->vme_kernel_sum; i++) {
         struct i965_kernel *kernel;
@@ -318,31 +293,26 @@ static VAStatus gen8_vme_interface_setup(VADriverContextP ctx,
         assert(sizeof(*desc) == 32);
         /*Setup the descritor table*/
         memset(desc, 0, sizeof(*desc));
-        desc->desc0.kernel_start_pointer = (kernel->bo->offset >> 6);
+        desc->desc0.kernel_start_pointer = kernel->kernel_offset >> 6;
         desc->desc3.sampler_count = 0; /* FIXME: */
         desc->desc3.sampler_state_pointer = 0;
         desc->desc4.binding_table_entry_count = 1; /* FIXME: */
         desc->desc4.binding_table_pointer = (BINDING_TABLE_OFFSET(0) >> 5);
         desc->desc5.constant_urb_entry_read_offset = 0;
         desc->desc5.constant_urb_entry_read_length = CURBE_URB_ENTRY_LENGTH;
-               
+
                
-        /*kernel start*/
-        dri_bo_emit_reloc(bo,  
-                          I915_GEM_DOMAIN_INSTRUCTION, 0,
-                          0,
-                          i * sizeof(*desc) + offsetof(struct gen8_interface_descriptor_data, desc0),
-                          kernel->bo);
         desc++;
     }
+
     dri_bo_unmap(bo);
 
     return VA_STATUS_SUCCESS;
 }
 
 static VAStatus gen8_vme_constant_setup(VADriverContextP ctx, 
-                                         struct encode_state *encode_state,
-                                         struct intel_encoder_context *encoder_context)
+                                        struct encode_state *encode_state,
+                                        struct intel_encoder_context *encoder_context)
 {
     struct gen6_vme_context *vme_context = encoder_context->vme_context;
     unsigned char *constant_buffer;
@@ -364,9 +334,10 @@ static VAStatus gen8_vme_constant_setup(VADriverContextP ctx,
 
     vme_state_message[31] = mv_num;
 
-    dri_bo_map(vme_context->gpe_context.curbe.bo, 1);
-    assert(vme_context->gpe_context.curbe.bo->virtual);
-    constant_buffer = vme_context->gpe_context.curbe.bo->virtual;
+    dri_bo_map(vme_context->gpe_context.dynamic_state.bo, 1);
+    assert(vme_context->gpe_context.dynamic_state.bo->virtual);
+    constant_buffer = (unsigned char *)vme_context->gpe_context.dynamic_state.bo->virtual +
+                                         vme_context->gpe_context.curbe_offset;
 
     /* VME MV/Mb cost table is passed by using const buffer */
     /* Now it uses the fixed search path. So it is constructed directly
@@ -374,19 +345,166 @@ static VAStatus gen8_vme_constant_setup(VADriverContextP ctx,
      */
     memcpy(constant_buffer, (char *)vme_context->vme_state_message, 128);
        
-    dri_bo_unmap(vme_context->gpe_context.curbe.bo);
+    dri_bo_unmap(vme_context->gpe_context.dynamic_state.bo);
 
     return VA_STATUS_SUCCESS;
 }
 
+#define                MB_SCOREBOARD_A         (1 << 0)
+#define                MB_SCOREBOARD_B         (1 << 1)
+#define                MB_SCOREBOARD_C         (1 << 2)
+
+/* Returns 0 when MB (x_index, y_index) is inside the picture and inside the [first_mb, first_mb + num_mb] index window, -1 otherwise; 0 means "in bounds", so callers negate the result */
+static inline int loop_in_bounds(int x_index, int y_index, int first_mb, int num_mb, int mb_width, int mb_height)
+{
+    int mb_index;
+    if (x_index < 0 || x_index >= mb_width) /* column outside the picture */
+        return -1;
+    if (y_index < 0 || y_index >= mb_height) /* row outside the picture */
+        return -1;
+    /* NOTE(review): the '>' test below still accepts index first_mb + num_mb; possible off-by-one, confirm whether '>=' was meant */
+    mb_index = y_index * mb_width + x_index; /* raster-scan index of the MB */
+    if (mb_index < first_mb || mb_index > (first_mb + num_mb))
+        return -1;
+    return 0;
+}
+
+static void
+gen8wa_vme_walker_fill_vme_batchbuffer(VADriverContextP ctx, 
+                                     struct encode_state *encode_state,
+                                     int mb_width, int mb_height,
+                                     int kernel,
+                                     int transform_8x8_mode_flag,
+                                     struct intel_encoder_context *encoder_context)
+{ /* For every slice, write a MEDIA_OBJECT + MEDIA_STATE_FLUSH pair per visited MB into vme_batchbuffer, then end the batch */
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+    int mb_row; /* first MB row of the slice currently being walked */
+    int s; /* slice index */
+    unsigned int *command_ptr; /* write cursor into the mapped batch buffer, advanced one dword per store */
+    /* USE_SCOREBOARD (bit 21) is stored as the third dword of every MEDIA_OBJECT emitted below */
+#define                USE_SCOREBOARD          (1 << 21)
+    dri_bo_map(vme_context->vme_batchbuffer.bo, 1);
+    command_ptr = vme_context->vme_batchbuffer.bo->virtual;
+    /* NOTE(review): neither the dri_bo_map() result nor bo->virtual is checked before command_ptr is written through */
+    for (s = 0; s < encode_state->num_slice_params_ext; s++) {
+       VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[s]->buffer;
+       int first_mb = pSliceParameter->macroblock_address;
+       int num_mb = pSliceParameter->num_macroblocks;
+       unsigned int mb_intra_ub, score_dep; /* per-MB intra availability flags and scoreboard dependency bits */
+       int x_outer, y_outer, x_inner, y_inner; /* outer = start of a walk, inner = current MB on that walk */
+       int xtemp_outer = 0;
+       /* Begin at the slice's first MB; mb_row keeps its row so the row above is only referenced inside this slice */
+       x_outer = first_mb % mb_width;
+       y_outer = first_mb / mb_width;
+       mb_row = y_outer;
+       /* Phase 1: start one walk from each column of the starting row while x_outer < mb_width - 2 */
+       for (; x_outer < (mb_width -2 ) && !loop_in_bounds(x_outer, y_outer, first_mb, num_mb, mb_width, mb_height); ) {
+           x_inner = x_outer;
+           y_inner = y_outer;
+           for (; !loop_in_bounds(x_inner, y_inner, first_mb, num_mb, mb_width, mb_height);) {
+               mb_intra_ub = 0;
+               score_dep = 0;
+               if (x_inner != 0) { /* not in column 0 */
+                   mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
+                   score_dep |= MB_SCOREBOARD_A; 
+               }
+               if (y_inner != mb_row) { /* not in the slice's first row */
+                   mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
+                   score_dep |= MB_SCOREBOARD_B;
+                   if (x_inner != 0)
+                       mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
+                   if (x_inner != (mb_width -1)) { /* not in the last column */
+                       mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
+                       score_dep |= MB_SCOREBOARD_C;
+                    }
+               }
+               /* MEDIA_OBJECT, 8 dwords: header, kernel, scoreboard flag, indirect data, (X, Y), dependency bits, 2 inline dwords */
+               *command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
+               *command_ptr++ = kernel;
+               *command_ptr++ = USE_SCOREBOARD;
+               /* Indirect data */
+               *command_ptr++ = 0;
+               /* the (X, Y) term of scoreboard */
+               *command_ptr++ = ((y_inner << 16) | x_inner);
+               *command_ptr++ = score_dep;
+               /*inline data */
+               *command_ptr++ = (mb_width << 16 | y_inner << 8 | x_inner);
+               *command_ptr++ = ((1 << 18) | (1 << 16) | transform_8x8_mode_flag | (mb_intra_ub << 8));
+                *command_ptr++ = CMD_MEDIA_STATE_FLUSH; /* 2-dword flush written after every MEDIA_OBJECT */
+                *command_ptr++ = 0;
+               /* next MB of this walk: two columns to the left, one row down */
+               x_inner -= 2;
+               y_inner += 1;
+           }
+           x_outer += 1;
+       }
+       /* Phase 2: walks starting in columns xtemp_outer .. mb_width - 1, restarting one row lower each time the columns run out */
+       xtemp_outer = mb_width - 2;
+       if (xtemp_outer < 0) /* picture narrower than two MBs: clamp the start column to 0 */
+            xtemp_outer = 0;
+       x_outer = xtemp_outer;
+       y_outer = first_mb / mb_width;
+       for (;!loop_in_bounds(x_outer, y_outer, first_mb, num_mb, mb_width, mb_height); ) { 
+           y_inner = y_outer;
+           x_inner = x_outer;
+           for (; !loop_in_bounds(x_inner, y_inner, first_mb, num_mb, mb_width, mb_height);) {
+               mb_intra_ub = 0;
+               score_dep = 0;
+               if (x_inner != 0) { /* not in column 0 */
+                   mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
+                   score_dep |= MB_SCOREBOARD_A; 
+               }
+               if (y_inner != mb_row) { /* not in the slice's first row */
+                   mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
+                   score_dep |= MB_SCOREBOARD_B;
+                   if (x_inner != 0)
+                       mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
+
+                   if (x_inner != (mb_width -1)) { /* not in the last column */
+                       mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
+                       score_dep |= MB_SCOREBOARD_C;
+                    }
+               }
+               /* same 8-dword MEDIA_OBJECT layout as in phase 1 */
+               *command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
+               *command_ptr++ = kernel;
+               *command_ptr++ = USE_SCOREBOARD;
+               /* Indirect data */
+               *command_ptr++ = 0;
+               /* the (X, Y) term of scoreboard */
+               *command_ptr++ = ((y_inner << 16) | x_inner);
+               *command_ptr++ = score_dep;
+               /*inline data */
+               *command_ptr++ = (mb_width << 16 | y_inner << 8 | x_inner);
+               *command_ptr++ = ((1 << 18) | (1 << 16) | transform_8x8_mode_flag | (mb_intra_ub << 8));
+
+                *command_ptr++ = CMD_MEDIA_STATE_FLUSH; /* 2-dword flush written after every MEDIA_OBJECT */
+                *command_ptr++ = 0;
+               x_inner -= 2;
+               y_inner += 1;
+           }
+           x_outer++;
+           if (x_outer >= mb_width) { /* past the last column: restart from xtemp_outer one row lower */
+               y_outer += 1;
+               x_outer = xtemp_outer;
+           }           
+       }
+    }
+
+    *command_ptr++ = MI_BATCH_BUFFER_END; /* terminate the batch, followed by one zero dword */
+    *command_ptr++ = 0;
+
+    dri_bo_unmap(vme_context->vme_batchbuffer.bo);
+}
 
 static void
 gen8_vme_fill_vme_batchbuffer(VADriverContextP ctx, 
-                               struct encode_state *encode_state,
-                               int mb_width, int mb_height,
-                               int kernel,
-                               int transform_8x8_mode_flag,
-                               struct intel_encoder_context *encoder_context)
+                              struct encode_state *encode_state,
+                              int mb_width, int mb_height,
+                              int kernel,
+                              int transform_8x8_mode_flag,
+                              struct intel_encoder_context *encoder_context)
 {
     struct gen6_vme_context *vme_context = encoder_context->vme_context;
     int mb_x = 0, mb_y = 0;
@@ -440,12 +558,14 @@ gen8_vme_fill_vme_batchbuffer(VADriverContextP ctx,
             *command_ptr++ = (mb_width << 16 | mb_y << 8 | mb_x);
             *command_ptr++ = ( (1 << 16) | transform_8x8_mode_flag | (mb_intra_ub << 8));
 
+            *command_ptr++ = CMD_MEDIA_STATE_FLUSH;
+            *command_ptr++ = 0;
             i += 1;
         } 
     }
 
-    *command_ptr++ = 0;
     *command_ptr++ = MI_BATCH_BUFFER_END;
+    *command_ptr++ = 0;
 
     dri_bo_unmap(vme_context->vme_batchbuffer.bo);
 }
@@ -454,7 +574,7 @@ static void gen8_vme_media_init(VADriverContextP ctx, struct intel_encoder_conte
 {
     struct gen6_vme_context *vme_context = encoder_context->vme_context;
 
-    i965_gpe_context_init(ctx, &vme_context->gpe_context);
+    gen8_gpe_context_init(ctx, &vme_context->gpe_context);
 
     /* VME output buffer */
     dri_bo_unreference(vme_context->vme_output.bo);
@@ -469,8 +589,8 @@ static void gen8_vme_media_init(VADriverContextP ctx, struct intel_encoder_conte
 }
 
 static void gen8_vme_pipeline_programing(VADriverContextP ctx, 
-                                          struct encode_state *encode_state,
-                                          struct intel_encoder_context *encoder_context)
+                                         struct encode_state *encode_state,
+                                         struct intel_encoder_context *encoder_context)
 {
     struct gen6_vme_context *vme_context = encoder_context->vme_context;
     struct intel_batchbuffer *batch = encoder_context->base.batch;
@@ -486,35 +606,36 @@ static void gen8_vme_pipeline_programing(VADriverContextP ctx,
     for (s = 0; s < encode_state->num_slice_params_ext; s++) {
         pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[s]->buffer; 
         if ((pSliceParameter->macroblock_address % width_in_mbs)) {
-               allow_hwscore = false;
-               break;
+            allow_hwscore = false;
+            break;
        }
     }
+
     if ((pSliceParameter->slice_type == SLICE_TYPE_I) ||
        (pSliceParameter->slice_type == SLICE_TYPE_I)) {
        kernel_shader = VME_INTRA_SHADER;
-   } else if ((pSliceParameter->slice_type == SLICE_TYPE_P) ||
-       (pSliceParameter->slice_type == SLICE_TYPE_SP)) {
+    } else if ((pSliceParameter->slice_type == SLICE_TYPE_P) ||
+               (pSliceParameter->slice_type == SLICE_TYPE_SP)) {
        kernel_shader = VME_INTER_SHADER;
-   } else {
+    } else {
        kernel_shader = VME_BINTER_SHADER;
        if (!allow_hwscore)
-            kernel_shader = VME_INTER_SHADER;
-   }
+            kernel_shader = VME_INTER_SHADER;
+    }
     if (allow_hwscore)
-       gen7_vme_walker_fill_vme_batchbuffer(ctx, 
-                                  encode_state,
-                                  width_in_mbs, height_in_mbs,
-                                  kernel_shader,
-                                  pPicParameter->pic_fields.bits.transform_8x8_mode_flag,
-                                  encoder_context);
+       gen8wa_vme_walker_fill_vme_batchbuffer(ctx, 
+                                             encode_state,
+                                             width_in_mbs, height_in_mbs,
+                                             kernel_shader,
+                                             pPicParameter->pic_fields.bits.transform_8x8_mode_flag,
+                                             encoder_context);
     else
        gen8_vme_fill_vme_batchbuffer(ctx, 
-                                   encode_state,
-                                   width_in_mbs, height_in_mbs,
-                                   kernel_shader,
-                                   pPicParameter->pic_fields.bits.transform_8x8_mode_flag,
-                                   encoder_context);
+                                      encode_state,
+                                      width_in_mbs, height_in_mbs,
+                                      kernel_shader,
+                                      pPicParameter->pic_fields.bits.transform_8x8_mode_flag,
+                                      encoder_context);
 
     intel_batchbuffer_start_atomic(batch, 0x1000);
     gen8_gpe_pipeline_setup(ctx, &vme_context->gpe_context, batch);
@@ -531,8 +652,8 @@ static void gen8_vme_pipeline_programing(VADriverContextP ctx,
 }
 
 static VAStatus gen8_vme_prepare(VADriverContextP ctx, 
-                                  struct encode_state *encode_state,
-                                  struct intel_encoder_context *encoder_context)
+                                 struct encode_state *encode_state,
+                                 struct intel_encoder_context *encoder_context)
 {
     VAStatus vaStatus = VA_STATUS_SUCCESS;
     VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
@@ -560,8 +681,8 @@ static VAStatus gen8_vme_prepare(VADriverContextP ctx,
 }
 
 static VAStatus gen8_vme_run(VADriverContextP ctx, 
-                              struct encode_state *encode_state,
-                              struct intel_encoder_context *encoder_context)
+                             struct encode_state *encode_state,
+                             struct intel_encoder_context *encoder_context)
 {
     struct intel_batchbuffer *batch = encoder_context->base.batch;
 
@@ -571,17 +692,17 @@ static VAStatus gen8_vme_run(VADriverContextP ctx,
 }
 
 static VAStatus gen8_vme_stop(VADriverContextP ctx, 
-                               struct encode_state *encode_state,
-                               struct intel_encoder_context *encoder_context)
+                              struct encode_state *encode_state,
+                              struct intel_encoder_context *encoder_context)
 {
     return VA_STATUS_SUCCESS;
 }
 
 static VAStatus
 gen8_vme_pipeline(VADriverContextP ctx,
-                   VAProfile profile,
-                   struct encode_state *encode_state,
-                   struct intel_encoder_context *encoder_context)
+                  VAProfile profile,
+                  struct encode_state *encode_state,
+                  struct intel_encoder_context *encoder_context)
 {
     gen8_vme_media_init(ctx, encoder_context);
     gen8_vme_prepare(ctx, encode_state, encoder_context);
@@ -593,10 +714,10 @@ gen8_vme_pipeline(VADriverContextP ctx,
 
 static void
 gen8_vme_mpeg2_output_buffer_setup(VADriverContextP ctx,
-                                    struct encode_state *encode_state,
-                                    int index,
-                                    int is_intra,
-                                    struct intel_encoder_context *encoder_context)
+                                   struct encode_state *encode_state,
+                                   int index,
+                                   int is_intra,
+                                   struct intel_encoder_context *encoder_context)
 
 {
     struct i965_driver_data *i965 = i965_driver_data(ctx);
@@ -632,9 +753,9 @@ gen8_vme_mpeg2_output_buffer_setup(VADriverContextP ctx,
 
 static void
 gen8_vme_mpeg2_output_vme_batchbuffer_setup(VADriverContextP ctx,
-                                             struct encode_state *encode_state,
-                                             int index,
-                                             struct intel_encoder_context *encoder_context)
+                                            struct encode_state *encode_state,
+                                            int index,
+                                            struct intel_encoder_context *encoder_context)
 
 {
     struct i965_driver_data *i965 = i965_driver_data(ctx);
@@ -659,9 +780,9 @@ gen8_vme_mpeg2_output_vme_batchbuffer_setup(VADriverContextP ctx,
 
 static VAStatus
 gen8_vme_mpeg2_surface_setup(VADriverContextP ctx, 
-                              struct encode_state *encode_state,
-                              int is_intra,
-                              struct intel_encoder_context *encoder_context)
+                             struct encode_state *encode_state,
+                             int is_intra,
+                             struct intel_encoder_context *encoder_context)
 {
     struct object_surface *obj_surface;
 
@@ -694,12 +815,141 @@ gen8_vme_mpeg2_surface_setup(VADriverContextP ctx,
 }
 
 static void
+gen8wa_vme_mpeg2_walker_fill_vme_batchbuffer(VADriverContextP ctx, 
+                                           struct encode_state *encode_state,
+                                           int mb_width, int mb_height,
+                                           int kernel,
+                                           struct intel_encoder_context *encoder_context)
+{ /* Whole-picture variant: write a MEDIA_OBJECT + MEDIA_STATE_FLUSH pair per visited MB into vme_batchbuffer; encode_state is not read here */
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+    unsigned int *command_ptr; /* write cursor into the mapped batch buffer, advanced one dword per store */
+    /* MPEG2_SCOREBOARD (bit 21) is stored as the third dword of every MEDIA_OBJECT emitted below */
+#define                MPEG2_SCOREBOARD                (1 << 21)
+
+    dri_bo_map(vme_context->vme_batchbuffer.bo, 1);
+    command_ptr = vme_context->vme_batchbuffer.bo->virtual;
+    /* NOTE(review): neither the dri_bo_map() result nor bo->virtual is checked before command_ptr is written through */
+    {
+       unsigned int mb_intra_ub, score_dep; /* per-MB intra availability flags and scoreboard dependency bits */
+       int x_outer, y_outer, x_inner, y_inner; /* outer = start of a walk, inner = current MB on that walk */
+       int xtemp_outer = 0;
+       int first_mb = 0; /* the index window passed to loop_in_bounds() starts at MB 0 ... */
+       int num_mb = mb_width * mb_height; /* ... and spans mb_width * mb_height MBs */
+
+       x_outer = 0;
+       y_outer = 0;
+       
+       /* Phase 1: start one walk from each column of row 0 while x_outer < mb_width - 2 */
+       for (; x_outer < (mb_width -2 ) && !loop_in_bounds(x_outer, y_outer, first_mb, num_mb, mb_width, mb_height); ) {
+           x_inner = x_outer;
+           y_inner = y_outer;
+           for (; !loop_in_bounds(x_inner, y_inner, first_mb, num_mb, mb_width, mb_height);) {
+               mb_intra_ub = 0;
+               score_dep = 0;
+               if (x_inner != 0) { /* not in column 0 */
+                   mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
+                   score_dep |= MB_SCOREBOARD_A; 
+               }
+               if (y_inner != 0) { /* not in row 0 */
+                   mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
+                   score_dep |= MB_SCOREBOARD_B;
+
+                   if (x_inner != 0)
+                       mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
+
+                   if (x_inner != (mb_width -1)) { /* not in the last column */
+                       mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
+                       score_dep |= MB_SCOREBOARD_C;
+                   }
+               }
+               /* MEDIA_OBJECT, 8 dwords: header, kernel, scoreboard flag, indirect data, (X, Y), dependency bits, 2 inline dwords */
+               *command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
+               *command_ptr++ = kernel;
+               *command_ptr++ = MPEG2_SCOREBOARD;
+               /* Indirect data */
+               *command_ptr++ = 0;
+               /* the (X, Y) term of scoreboard */
+               *command_ptr++ = ((y_inner << 16) | x_inner);
+               *command_ptr++ = score_dep;
+               /*inline data */
+               *command_ptr++ = (mb_width << 16 | y_inner << 8 | x_inner);
+               *command_ptr++ = ((1 << 18) | (1 << 16) | (mb_intra_ub << 8));
+                *command_ptr++ = CMD_MEDIA_STATE_FLUSH; /* 2-dword flush written after every MEDIA_OBJECT */
+                *command_ptr++ = 0;
+               /* next MB of this walk: two columns to the left, one row down */
+               x_inner -= 2;
+               y_inner += 1;
+           }
+           x_outer += 1;
+       }
+       /* Phase 2: walks starting in columns xtemp_outer .. mb_width - 1, restarting one row lower each time the columns run out */
+       xtemp_outer = mb_width - 2;
+       if (xtemp_outer < 0) /* picture narrower than two MBs: clamp the start column to 0 */
+            xtemp_outer = 0;
+       x_outer = xtemp_outer;
+       y_outer = 0;
+       for (;!loop_in_bounds(x_outer, y_outer, first_mb, num_mb, mb_width, mb_height); ) { 
+           y_inner = y_outer;
+           x_inner = x_outer;
+           for (; !loop_in_bounds(x_inner, y_inner, first_mb, num_mb, mb_width, mb_height);) {
+               mb_intra_ub = 0;
+               score_dep = 0;
+               if (x_inner != 0) { /* not in column 0 */
+                   mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
+                   score_dep |= MB_SCOREBOARD_A; 
+               }
+               if (y_inner != 0) { /* not in row 0 */
+                   mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
+                   score_dep |= MB_SCOREBOARD_B;
+
+                   if (x_inner != 0)
+                       mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
+
+                   if (x_inner != (mb_width -1)) { /* not in the last column */
+                       mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
+                       score_dep |= MB_SCOREBOARD_C;
+                   }
+               }
+               /* same 8-dword MEDIA_OBJECT layout as in phase 1 */
+               *command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
+               *command_ptr++ = kernel;
+               *command_ptr++ = MPEG2_SCOREBOARD;
+               /* Indirect data */
+               *command_ptr++ = 0;
+               /* the (X, Y) term of scoreboard */
+               *command_ptr++ = ((y_inner << 16) | x_inner);
+               *command_ptr++ = score_dep;
+               /*inline data */
+               *command_ptr++ = (mb_width << 16 | y_inner << 8 | x_inner);
+               *command_ptr++ = ((1 << 18) | (1 << 16) | (mb_intra_ub << 8));
+
+                *command_ptr++ = CMD_MEDIA_STATE_FLUSH; /* 2-dword flush written after every MEDIA_OBJECT */
+                *command_ptr++ = 0;
+               x_inner -= 2;
+               y_inner += 1;
+           }
+           x_outer++;
+           if (x_outer >= mb_width) { /* past the last column: restart from xtemp_outer one row lower */
+               y_outer += 1;
+               x_outer = xtemp_outer;
+           }           
+       }
+    }
+
+    *command_ptr++ = MI_BATCH_BUFFER_END; /* terminate the batch, followed by one zero dword */
+    *command_ptr++ = 0;
+
+    dri_bo_unmap(vme_context->vme_batchbuffer.bo);
+    return;
+}
+
+static void
 gen8_vme_mpeg2_fill_vme_batchbuffer(VADriverContextP ctx, 
-                                     struct encode_state *encode_state,
-                                     int mb_width, int mb_height,
-                                     int kernel,
-                                     int transform_8x8_mode_flag,
-                                     struct intel_encoder_context *encoder_context)
+                                    struct encode_state *encode_state,
+                                    int mb_width, int mb_height,
+                                    int kernel,
+                                    int transform_8x8_mode_flag,
+                                    struct intel_encoder_context *encoder_context)
 {
     struct gen6_vme_context *vme_context = encoder_context->vme_context;
     int mb_x = 0, mb_y = 0;
@@ -750,6 +1000,8 @@ gen8_vme_mpeg2_fill_vme_batchbuffer(VADriverContextP ctx,
                 *command_ptr++ = (mb_width << 16 | mb_y << 8 | mb_x);
                 *command_ptr++ = ( (1 << 16) | transform_8x8_mode_flag | (mb_intra_ub << 8));
 
+                *command_ptr++ = CMD_MEDIA_STATE_FLUSH;
+                *command_ptr++ = 0;
                 i += 1;
             }
 
@@ -757,17 +1009,17 @@ gen8_vme_mpeg2_fill_vme_batchbuffer(VADriverContextP ctx,
         }
     }
 
-    *command_ptr++ = 0;
     *command_ptr++ = MI_BATCH_BUFFER_END;
+    *command_ptr++ = 0;
 
     dri_bo_unmap(vme_context->vme_batchbuffer.bo);
 }
 
 static void
 gen8_vme_mpeg2_pipeline_programing(VADriverContextP ctx, 
-                                    struct encode_state *encode_state,
-                                    int is_intra,
-                                    struct intel_encoder_context *encoder_context)
+                                   struct encode_state *encode_state,
+                                   int is_intra,
+                                   struct intel_encoder_context *encoder_context)
 {
     struct gen6_vme_context *vme_context = encoder_context->vme_context;
     struct intel_batchbuffer *batch = encoder_context->base.batch;
@@ -800,27 +1052,29 @@ gen8_vme_mpeg2_pipeline_programing(VADriverContextP ctx,
     }
 
     if (allow_hwscore) 
-       gen7_vme_mpeg2_walker_fill_vme_batchbuffer(ctx,
-                                         encode_state,
-                                         width_in_mbs, height_in_mbs,
-                                         kernel_shader,
-                                         encoder_context);
+       gen8wa_vme_mpeg2_walker_fill_vme_batchbuffer(ctx,
+                                                   encode_state,
+                                                   width_in_mbs, height_in_mbs,
+                                                   kernel_shader,
+                                                   encoder_context);
     else
        gen8_vme_mpeg2_fill_vme_batchbuffer(ctx, 
-                                         encode_state,
-                                         width_in_mbs, height_in_mbs,
-                                         is_intra ? VME_INTRA_SHADER : VME_INTER_SHADER,
-                                         0,
-                                         encoder_context);
+                                            encode_state,
+                                            width_in_mbs, height_in_mbs,
+                                            is_intra ? VME_INTRA_SHADER : VME_INTER_SHADER,
+                                            0,
+                                            encoder_context);
 
     intel_batchbuffer_start_atomic(batch, 0x1000);
     gen8_gpe_pipeline_setup(ctx, &vme_context->gpe_context, batch);
-    BEGIN_BATCH(batch, 2);
-    OUT_BATCH(batch, MI_BATCH_BUFFER_START | (2 << 6));
+    BEGIN_BATCH(batch, 4);
+    OUT_BATCH(batch, MI_BATCH_BUFFER_START | (1 << 8) | (1 << 0));
     OUT_RELOC(batch,
               vme_context->vme_batchbuffer.bo,
               I915_GEM_DOMAIN_COMMAND, 0, 
               0);
+    OUT_BATCH(batch, 0);
+    OUT_BATCH(batch, 0);
     ADVANCE_BATCH(batch);
 
     intel_batchbuffer_end_atomic(batch);       
@@ -828,8 +1082,8 @@ gen8_vme_mpeg2_pipeline_programing(VADriverContextP ctx,
 
 static VAStatus 
 gen8_vme_mpeg2_prepare(VADriverContextP ctx, 
-                        struct encode_state *encode_state,
-                        struct intel_encoder_context *encoder_context)
+                       struct encode_state *encode_state,
+                       struct intel_encoder_context *encoder_context)
 {
     VAStatus vaStatus = VA_STATUS_SUCCESS;
     VAEncSliceParameterBufferMPEG2 *slice_param = (VAEncSliceParameterBufferMPEG2 *)encode_state->slice_params_ext[0]->buffer;
@@ -837,7 +1091,7 @@ gen8_vme_mpeg2_prepare(VADriverContextP ctx,
     struct gen6_vme_context *vme_context = encoder_context->vme_context;
 
     if ((!vme_context->mpeg2_level) ||
-               (vme_context->mpeg2_level != (seq_param->sequence_extension.bits.profile_and_level_indication & MPEG2_LEVEL_MASK))) {
+        (vme_context->mpeg2_level != (seq_param->sequence_extension.bits.profile_and_level_indication & MPEG2_LEVEL_MASK))) {
        vme_context->mpeg2_level = seq_param->sequence_extension.bits.profile_and_level_indication & MPEG2_LEVEL_MASK;
     }
 
@@ -857,9 +1111,9 @@ gen8_vme_mpeg2_prepare(VADriverContextP ctx,
 
 static VAStatus
 gen8_vme_mpeg2_pipeline(VADriverContextP ctx,
-                         VAProfile profile,
-                         struct encode_state *encode_state,
-                         struct intel_encoder_context *encoder_context)
+                        VAProfile profile,
+                        struct encode_state *encode_state,
+                        struct intel_encoder_context *encoder_context)
 {
     gen8_vme_media_init(ctx, encoder_context);
     gen8_vme_mpeg2_prepare(ctx, encode_state, encoder_context);
@@ -874,7 +1128,7 @@ gen8_vme_context_destroy(void *context)
 {
     struct gen6_vme_context *vme_context = context;
 
-    i965_gpe_context_destroy(&vme_context->gpe_context);
+    gen8_gpe_context_destroy(&vme_context->gpe_context);
 
     dri_bo_unreference(vme_context->vme_output.bo);
     vme_context->vme_output.bo = NULL;
@@ -897,7 +1151,7 @@ Bool gen8_vme_context_init(VADriverContextP ctx, struct intel_encoder_context *e
 {
     struct gen6_vme_context *vme_context = calloc(1, sizeof(struct gen6_vme_context));
     struct i965_kernel *vme_kernel_list = NULL;
-       int i965_kernel_num;
+    int i965_kernel_num;
 
     switch (encoder_context->codec) {
     case CODEC_H264:
@@ -922,10 +1176,10 @@ Bool gen8_vme_context_init(VADriverContextP ctx, struct intel_encoder_context *e
     vme_context->vme_kernel_sum = i965_kernel_num;
     vme_context->gpe_context.surface_state_binding_table.length = (SURFACE_STATE_PADDED_SIZE + sizeof(unsigned int)) * MAX_MEDIA_SURFACES_GEN6;
 
-    vme_context->gpe_context.idrt.max_entries = MAX_INTERFACE_DESC_GEN6;
-    vme_context->gpe_context.idrt.entry_size = sizeof(struct gen8_interface_descriptor_data);
+    vme_context->gpe_context.idrt_size = sizeof(struct gen8_interface_descriptor_data) * MAX_INTERFACE_DESC_GEN6;
+    vme_context->gpe_context.curbe_size = CURBE_TOTAL_DATA_LENGTH;
+    vme_context->gpe_context.sampler_size = 0;
 
-    vme_context->gpe_context.curbe.length = CURBE_TOTAL_DATA_LENGTH;
 
     vme_context->gpe_context.vfe_state.max_num_threads = 60 - 1;
     vme_context->gpe_context.vfe_state.num_urb_entries = 16;
@@ -935,7 +1189,7 @@ Bool gen8_vme_context_init(VADriverContextP ctx, struct intel_encoder_context *e
 
     gen7_vme_scoreboard_init(ctx, vme_context);
 
-    i965_gpe_load_kernels(ctx,
+    gen8_gpe_load_kernels(ctx,
                           &vme_context->gpe_context,
                           vme_kernel_list,
                           i965_kernel_num);