From: Zhao Yakui Date: Mon, 24 Dec 2012 07:08:25 +0000 (+0800) Subject: MEDIA_OBJECT uses hardware scoreboard during VME prediction on Haswell X-Git-Tag: submit/upstream/20130321.072122~68 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=f2b5f3f038efff0c4161bf6df9ae27797b031b25;p=profile%2Fivi%2Fvaapi-intel-driver.git MEDIA_OBJECT uses hardware scoreboard during VME prediction on Haswell To get a precise VME prediction, the current macroblock depends on its neighbouring macroblocks (left, top, top-right), so the hardware scoreboard is used when submitting the MEDIA_OBJECT command. This prepares for adding MV prediction to the VME prediction. Signed-off-by: Zhao Yakui --- diff --git a/src/gen75_vme.c b/src/gen75_vme.c index dc5730e..6bf8777 100644 --- a/src/gen75_vme.c +++ b/src/gen75_vme.c @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -62,6 +63,10 @@ #define VME_MSG_LENGTH 32 +#define MB_SCOREBOARD_A (1 << 0) +#define MB_SCOREBOARD_B (1 << 1) +#define MB_SCOREBOARD_C (1 << 2) + static const uint32_t gen75_vme_intra_frame[][4] = { #include "shaders/vme/intra_frame_haswell.g75b" }; @@ -481,6 +486,12 @@ static VAStatus gen75_vme_vme_state_setup(VADriverContextP ctx, return VA_STATUS_SUCCESS; } +#define INTRA_PRED_AVAIL_FLAG_AE 0x60 +#define INTRA_PRED_AVAIL_FLAG_B 0x10 +#define INTRA_PRED_AVAIL_FLAG_C 0x8 +#define INTRA_PRED_AVAIL_FLAG_D 0x4 +#define INTRA_PRED_AVAIL_FLAG_BCD_MASK 0x1C + static void gen75_vme_fill_vme_batchbuffer(VADriverContextP ctx, struct encode_state *encode_state, @@ -494,12 +505,6 @@ gen75_vme_fill_vme_batchbuffer(VADriverContextP ctx, int i, s; unsigned int *command_ptr; -#define INTRA_PRED_AVAIL_FLAG_AE 0x60 -#define INTRA_PRED_AVAIL_FLAG_B 0x10 -#define INTRA_PRED_AVAIL_FLAG_C 0x8 -#define INTRA_PRED_AVAIL_FLAG_D 0x4 -#define INTRA_PRED_AVAIL_FLAG_BCD_MASK 0x1C - dri_bo_map(vme_context->vme_batchbuffer.bo, 1); command_ptr = vme_context->vme_batchbuffer.bo->virtual; @@ -557,6 +562,77 @@ 
gen75_vme_fill_vme_batchbuffer(VADriverContextP ctx, dri_bo_unmap(vme_context->vme_batchbuffer.bo); } + +static void +gen75_vme_walker_fill_vme_batchbuffer(VADriverContextP ctx, + struct encode_state *encode_state, + int mb_width, int mb_height, + int kernel, + int transform_8x8_mode_flag, + struct intel_encoder_context *encoder_context) +{ + struct gen6_vme_context *vme_context = encoder_context->vme_context; + int mb_x = 0, mb_y = 0; + int mb_row; + int i, s; + unsigned int *command_ptr; + int temp; + + +#define USE_SCOREBOARD (1 << 21) + + dri_bo_map(vme_context->vme_batchbuffer.bo, 1); + command_ptr = vme_context->vme_batchbuffer.bo->virtual; + + for (s = 0; s < encode_state->num_slice_params_ext; s++) { + VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[s]->buffer; + int slice_mb_begin = pSliceParameter->macroblock_address; + int slice_mb_number = pSliceParameter->num_macroblocks; + unsigned int mb_intra_ub, score_dep; + int slice_mb_x = pSliceParameter->macroblock_address % mb_width; + mb_row = slice_mb_begin / mb_width; + for (i = 0; i < slice_mb_number; ) { + int mb_count = i + slice_mb_begin; + mb_x = mb_count % mb_width; + mb_y = mb_count / mb_width; + mb_intra_ub = 0; + score_dep = 0; + if (mb_x != 0) { + mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE; + score_dep |= MB_SCOREBOARD_A; + } + if (mb_y != mb_row) { + mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B; + score_dep |= MB_SCOREBOARD_B; + if (mb_x != 0) + mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D; + if (mb_x != (mb_width -1)) { + mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C; + score_dep |= MB_SCOREBOARD_C; + } + } + + *command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2)); + *command_ptr++ = kernel; + *command_ptr++ = USE_SCOREBOARD; + *command_ptr++ = 0; + /* the (X, Y) term of scoreboard */ + *command_ptr++ = ((mb_y << 16) | mb_x); + *command_ptr++ = score_dep; + /*inline data */ + *command_ptr++ = (mb_width << 16 | mb_y << 8 | mb_x); + *command_ptr++ = ( (1 << 16) | 
transform_8x8_mode_flag | (mb_intra_ub << 8)); + + i += 1; + } + } + + *command_ptr++ = 0; + *command_ptr++ = MI_BATCH_BUFFER_END; + + dri_bo_unmap(vme_context->vme_batchbuffer.bo); +} + static void gen75_vme_media_init(VADriverContextP ctx, struct intel_encoder_context *encoder_context) { struct i965_driver_data *i965 = i965_driver_data(ctx); @@ -589,8 +665,26 @@ static void gen75_vme_pipeline_programing(VADriverContextP ctx, int is_intra = pSliceParameter->slice_type == SLICE_TYPE_I; int width_in_mbs = pSequenceParameter->picture_width_in_mbs; int height_in_mbs = pSequenceParameter->picture_height_in_mbs; + bool allow_hwscore = true; + int s; + + for (s = 0; s < encode_state->num_slice_params_ext; s++) { + pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[s]->buffer; + if ((pSliceParameter->macroblock_address % width_in_mbs)) { + allow_hwscore = false; + break; + } + } - gen75_vme_fill_vme_batchbuffer(ctx, + if (allow_hwscore) + gen75_vme_walker_fill_vme_batchbuffer(ctx, + encode_state, + width_in_mbs, height_in_mbs, + is_intra ? VME_INTRA_SHADER : VME_INTER_SHADER, + pPicParameter->pic_fields.bits.transform_8x8_mode_flag, + encoder_context); + else + gen75_vme_fill_vme_batchbuffer(ctx, encode_state, width_in_mbs, height_in_mbs, is_intra ? VME_INTRA_SHADER : VME_INTER_SHADER, @@ -997,6 +1091,25 @@ Bool gen75_vme_context_init(VADriverContextP ctx, struct intel_encoder_context * vme_context->gpe_context.vfe_state.urb_entry_size = 59 - 1; vme_context->gpe_context.vfe_state.curbe_allocation_size = CURBE_ALLOCATION_SIZE - 1; + vme_context->gpe_context.vfe_desc5.scoreboard0.enable = 1; + vme_context->gpe_context.vfe_desc5.scoreboard0.type = SCOREBOARD_STALLING; + vme_context->gpe_context.vfe_desc5.scoreboard0.mask = (MB_SCOREBOARD_A | + MB_SCOREBOARD_B | + MB_SCOREBOARD_C); + + /* In VME prediction the current mb depends on the neighbour + * A/B/C macroblock. So the left/up/up-right dependency should + * be considered. 
+ */ + vme_context->gpe_context.vfe_desc6.scoreboard1.delta_x0 = -1; + vme_context->gpe_context.vfe_desc6.scoreboard1.delta_y0 = 0; + vme_context->gpe_context.vfe_desc6.scoreboard1.delta_x1 = 0; + vme_context->gpe_context.vfe_desc6.scoreboard1.delta_y1 = -1; + vme_context->gpe_context.vfe_desc6.scoreboard1.delta_x2 = 1; + vme_context->gpe_context.vfe_desc6.scoreboard1.delta_y2 = -1; + + vme_context->gpe_context.vfe_desc7.dword = 0; + i965_gpe_load_kernels(ctx, &vme_context->gpe_context, vme_kernel_list, diff --git a/src/i965_gpe_utils.c b/src/i965_gpe_utils.c index 546e5ba..9e569b2 100644 --- a/src/i965_gpe_utils.c +++ b/src/i965_gpe_utils.c @@ -87,9 +87,10 @@ gen6_gpe_vfe_state(VADriverContextP ctx, OUT_BATCH(batch, gpe_context->vfe_state.urb_entry_size << 16 | /* URB Entry Allocation Size */ gpe_context->vfe_state.curbe_allocation_size); /* CURBE Allocation Size */ - OUT_BATCH(batch, 0); /* Disable Scoreboard */ - OUT_BATCH(batch, 0); /* Disable Scoreboard */ - OUT_BATCH(batch, 0); /* Disable Scoreboard */ + /* the vfe_desc5/6/7 will decide whether the scoreboard is used. */ + OUT_BATCH(batch, gpe_context->vfe_desc5.dword); + OUT_BATCH(batch, gpe_context->vfe_desc6.dword); + OUT_BATCH(batch, gpe_context->vfe_desc7.dword); ADVANCE_BATCH(batch); diff --git a/src/i965_gpe_utils.h b/src/i965_gpe_utils.h index 3ebb3cb..72d7de8 100644 --- a/src/i965_gpe_utils.h +++ b/src/i965_gpe_utils.h @@ -70,6 +70,47 @@ struct i965_gpe_context unsigned int urb_entry_size : 16; unsigned int curbe_allocation_size : 16; } vfe_state; + + /* vfe_desc5/6/7 is used to determine whether the HW scoreboard is used. 
+ * If scoreboard is not used, don't touch them + */ + union { + unsigned int dword; + struct { + unsigned int mask:8; + unsigned int pad:22; + unsigned int type:1; + unsigned int enable:1; + } scoreboard0; + }vfe_desc5; + + union { + unsigned int dword; + struct { + int delta_x0:4; + int delta_y0:4; + int delta_x1:4; + int delta_y1:4; + int delta_x2:4; + int delta_y2:4; + int delta_x3:4; + int delta_y3:4; + } scoreboard1; + } vfe_desc6; + + union { + unsigned int dword; + struct { + int delta_x4:4; + int delta_y4:4; + int delta_x5:4; + int delta_y5:4; + int delta_x6:4; + int delta_y6:4; + int delta_x7:4; + int delta_y7:4; + } scoreboard2; + } vfe_desc7; unsigned int num_kernels; struct i965_kernel kernels[MAX_GPE_KERNELS];