MEDIA_OBJECT uses hardware scoreboard during VME prediction on Haswell
authorZhao Yakui <yakui.zhao@intel.com>
Mon, 24 Dec 2012 07:08:25 +0000 (15:08 +0800)
committerXiang, Haihao <haihao.xiang@intel.com>
Thu, 17 Jan 2013 05:08:39 +0000 (13:08 +0800)
To get the precise VME prediction the current macroblock will depend on its
neighbour mb(left, top, up-right). So the hardware scoreboard is used
when submitting the MEDIA_OBJECT command. This is to do the preparation
of adding MV prediction in VME prediction.

Signed-off-by: Zhao Yakui <yakui.zhao@intel.com>
src/gen75_vme.c
src/i965_gpe_utils.c
src/i965_gpe_utils.h

index dc5730e..6bf8777 100644 (file)
@@ -28,6 +28,7 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <stdbool.h>
 #include <string.h>
 #include <assert.h>
 
 
 #define VME_MSG_LENGTH         32
   
+#define                MB_SCOREBOARD_A         (1 << 0)
+#define                MB_SCOREBOARD_B         (1 << 1)
+#define                MB_SCOREBOARD_C         (1 << 2)
+
 static const uint32_t gen75_vme_intra_frame[][4] = {
 #include "shaders/vme/intra_frame_haswell.g75b"
 };
@@ -481,6 +486,12 @@ static VAStatus gen75_vme_vme_state_setup(VADriverContextP ctx,
     return VA_STATUS_SUCCESS;
 }
 
+#define                INTRA_PRED_AVAIL_FLAG_AE        0x60
+#define                INTRA_PRED_AVAIL_FLAG_B         0x10
+#define                INTRA_PRED_AVAIL_FLAG_C         0x8
+#define                INTRA_PRED_AVAIL_FLAG_D         0x4
+#define                INTRA_PRED_AVAIL_FLAG_BCD_MASK  0x1C
+
 static void
 gen75_vme_fill_vme_batchbuffer(VADriverContextP ctx, 
                                struct encode_state *encode_state,
@@ -494,12 +505,6 @@ gen75_vme_fill_vme_batchbuffer(VADriverContextP ctx,
     int i, s;
     unsigned int *command_ptr;
 
-#define                INTRA_PRED_AVAIL_FLAG_AE        0x60
-#define                INTRA_PRED_AVAIL_FLAG_B         0x10
-#define                INTRA_PRED_AVAIL_FLAG_C         0x8
-#define                INTRA_PRED_AVAIL_FLAG_D         0x4
-#define                INTRA_PRED_AVAIL_FLAG_BCD_MASK  0x1C
-
     dri_bo_map(vme_context->vme_batchbuffer.bo, 1);
     command_ptr = vme_context->vme_batchbuffer.bo->virtual;
 
@@ -557,6 +562,77 @@ gen75_vme_fill_vme_batchbuffer(VADriverContextP ctx,
     dri_bo_unmap(vme_context->vme_batchbuffer.bo);
 }
 
+
+static void
+gen75_vme_walker_fill_vme_batchbuffer(VADriverContextP ctx, 
+                              struct encode_state *encode_state,
+                              int mb_width, int mb_height,
+                              int kernel,
+                              int transform_8x8_mode_flag,
+                              struct intel_encoder_context *encoder_context)
+{
+    struct gen6_vme_context *vme_context = encoder_context->vme_context;
+    int mb_x = 0, mb_y = 0;
+    int mb_row;
+    int i, s;
+    unsigned int *command_ptr;
+    int temp;
+
+
+#define                USE_SCOREBOARD          (1 << 21)
+    dri_bo_map(vme_context->vme_batchbuffer.bo, 1);
+    command_ptr = vme_context->vme_batchbuffer.bo->virtual;
+
+    for (s = 0; s < encode_state->num_slice_params_ext; s++) {
+        VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[s]->buffer; 
+        int slice_mb_begin = pSliceParameter->macroblock_address;
+        int slice_mb_number = pSliceParameter->num_macroblocks;
+        unsigned int mb_intra_ub, score_dep;
+       int slice_mb_x = pSliceParameter->macroblock_address % mb_width;
+       mb_row = slice_mb_begin / mb_width; 
+        for (i = 0; i < slice_mb_number;  ) {
+            int mb_count = i + slice_mb_begin;    
+            mb_x = mb_count % mb_width;
+            mb_y = mb_count / mb_width;
+           mb_intra_ub = 0;
+           score_dep = 0;
+           if (mb_x != 0) {
+               mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
+               score_dep |= MB_SCOREBOARD_A;
+           }
+           if (mb_y != mb_row) {
+               mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
+               score_dep |= MB_SCOREBOARD_B;
+               if (mb_x != 0)
+                       mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
+               if (mb_x != (mb_width -1)) {
+                       mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
+                       score_dep |= MB_SCOREBOARD_C;
+               }
+           }
+
+               *command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
+               *command_ptr++ = kernel;
+               *command_ptr++ = USE_SCOREBOARD;
+               *command_ptr++ = 0;
+               /* the (X, Y) term of scoreboard */
+               *command_ptr++ = ((mb_y << 16) | mb_x);
+               *command_ptr++ = score_dep;
+               /*inline data */
+               *command_ptr++ = (mb_width << 16 | mb_y << 8 | mb_x);
+               *command_ptr++ = ( (1 << 16) | transform_8x8_mode_flag | (mb_intra_ub << 8));
+
+            i += 1;
+        } 
+    }
+
+    *command_ptr++ = 0;
+    *command_ptr++ = MI_BATCH_BUFFER_END;
+
+    dri_bo_unmap(vme_context->vme_batchbuffer.bo);
+}
+
 static void gen75_vme_media_init(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
 {
     struct i965_driver_data *i965 = i965_driver_data(ctx);
@@ -589,8 +665,26 @@ static void gen75_vme_pipeline_programing(VADriverContextP ctx,
     int is_intra = pSliceParameter->slice_type == SLICE_TYPE_I;
     int width_in_mbs = pSequenceParameter->picture_width_in_mbs;
     int height_in_mbs = pSequenceParameter->picture_height_in_mbs;
+    bool allow_hwscore = true;
+    int s;
+
+    for (s = 0; s < encode_state->num_slice_params_ext; s++) {
+        pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[s]->buffer; 
+        if ((pSliceParameter->macroblock_address % width_in_mbs)) {
+               allow_hwscore = false;
+               break;
+       }
+    }
 
-    gen75_vme_fill_vme_batchbuffer(ctx, 
+    if (allow_hwscore)
+       gen75_vme_walker_fill_vme_batchbuffer(ctx, 
+                                  encode_state,
+                                  width_in_mbs, height_in_mbs,
+                                  is_intra ? VME_INTRA_SHADER : VME_INTER_SHADER,
+                                  pPicParameter->pic_fields.bits.transform_8x8_mode_flag,
+                                  encoder_context);
+    else
+       gen75_vme_fill_vme_batchbuffer(ctx, 
                                    encode_state,
                                    width_in_mbs, height_in_mbs,
                                    is_intra ? VME_INTRA_SHADER : VME_INTER_SHADER,
@@ -997,6 +1091,25 @@ Bool gen75_vme_context_init(VADriverContextP ctx, struct intel_encoder_context *
     vme_context->gpe_context.vfe_state.urb_entry_size = 59 - 1;
     vme_context->gpe_context.vfe_state.curbe_allocation_size = CURBE_ALLOCATION_SIZE - 1;
 
+       vme_context->gpe_context.vfe_desc5.scoreboard0.enable = 1;
+       vme_context->gpe_context.vfe_desc5.scoreboard0.type = SCOREBOARD_STALLING;
+       vme_context->gpe_context.vfe_desc5.scoreboard0.mask = (MB_SCOREBOARD_A |
+                                                               MB_SCOREBOARD_B |
+                                                               MB_SCOREBOARD_C);
+
+       /* In VME prediction the current mb depends on the neighbour 
+        * A/B/C macroblock. So the left/up/up-right dependency should
+        * be considered.
+        */
+       vme_context->gpe_context.vfe_desc6.scoreboard1.delta_x0 = -1;
+       vme_context->gpe_context.vfe_desc6.scoreboard1.delta_y0 = 0;
+       vme_context->gpe_context.vfe_desc6.scoreboard1.delta_x1 = 0;
+       vme_context->gpe_context.vfe_desc6.scoreboard1.delta_y1 = -1;
+       vme_context->gpe_context.vfe_desc6.scoreboard1.delta_x2 = 1;
+       vme_context->gpe_context.vfe_desc6.scoreboard1.delta_y2 = -1;
+       
+       vme_context->gpe_context.vfe_desc7.dword = 0;
+
     i965_gpe_load_kernels(ctx,
                           &vme_context->gpe_context,
                           vme_kernel_list,
index 546e5ba..9e569b2 100644 (file)
@@ -87,9 +87,10 @@ gen6_gpe_vfe_state(VADriverContextP ctx,
     OUT_BATCH(batch,
               gpe_context->vfe_state.urb_entry_size << 16 |     /* URB Entry Allocation Size */
               gpe_context->vfe_state.curbe_allocation_size);    /* CURBE Allocation Size */
-    OUT_BATCH(batch, 0);                                        /* Disable Scoreboard */
-    OUT_BATCH(batch, 0);                                        /* Disable Scoreboard */
-    OUT_BATCH(batch, 0);                                        /* Disable Scoreboard */
+    /* the vfe_desc5/6/7 will decide whether the scoreboard is used. */
+    OUT_BATCH(batch, gpe_context->vfe_desc5.dword);                                        
+    OUT_BATCH(batch, gpe_context->vfe_desc6.dword);                                       
+    OUT_BATCH(batch, gpe_context->vfe_desc7.dword);                                       
        
     ADVANCE_BATCH(batch);
 
index 3ebb3cb..72d7de8 100644 (file)
@@ -70,6 +70,47 @@ struct i965_gpe_context
         unsigned int urb_entry_size : 16;
         unsigned int curbe_allocation_size : 16;
     } vfe_state;
+  
+    /* vfe_desc5/6/7 is used to determine whether the HW scoreboard is used.
+     * If scoreboard is not used, don't touch them
+     */ 
+    union { 
+       unsigned int dword;
+       struct {
+               unsigned int mask:8;
+               unsigned int pad:22;
+               unsigned int type:1;
+               unsigned int enable:1;
+       } scoreboard0; 
+    }vfe_desc5;
+
+    union {
+       unsigned int dword;
+       struct {
+               int delta_x0:4;
+               int delta_y0:4;
+               int delta_x1:4;
+               int delta_y1:4;
+               int delta_x2:4;
+               int delta_y2:4;
+               int delta_x3:4;
+               int delta_y3:4;
+       } scoreboard1;
+     } vfe_desc6;
+
+    union {
+       unsigned int dword;
+       struct {
+               int delta_x4:4;
+               int delta_y4:4;
+               int delta_x5:4;
+               int delta_y5:4;
+               int delta_x6:4;
+               int delta_y6:4;
+               int delta_x7:4;
+               int delta_y7:4;
+       } scoreboard2;
+     } vfe_desc7;
 
     unsigned int num_kernels;
     struct i965_kernel kernels[MAX_GPE_KERNELS];