Feed MFC PAK with correct MV of VME output on haswell
[platform/upstream/libva-intel-driver.git] / src / gen75_mfc.c
1 /*
2  * Copyright © 2012 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the
6  * "Software"), to deal in the Software without restriction, including
7  * without limitation the rights to use, copy, modify, merge, publish,
8  * distribute, sub license, and/or sell copies of the Software, and to
9  * permit persons to whom the Software is furnished to do so, subject to
10  * the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the
13  * next paragraph) shall be included in all copies or substantial portions
14  * of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
19  * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
20  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors:
25  *    Zhao Yakui <yakui.zhao@intel.com>
26  *    Xiang Haihao <haihao.xiang@intel.com>
27  *
28  */
29
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <math.h>
34 #include <assert.h>
35
36 #include "intel_batchbuffer.h"
37 #include "i965_defines.h"
38 #include "i965_structs.h"
39 #include "i965_drv_video.h"
40 #include "i965_encoder.h"
41 #include "i965_encoder_utils.h"
42 #include "gen6_mfc.h"
43 #include "gen6_vme.h"
44 #include "intel_media.h"
45
46 #define MFC_SOFTWARE_HASWELL    1
47
48 #define B0_STEP_REV             2
49 #define IS_STEPPING_BPLUS(i965) ((i965->intel.revision) >= B0_STEP_REV)
50
/* VME->PAK batch-buffer construction kernels, shipped as pre-compiled
 * Gen7.5 ISA blobs (.g7b) and included directly as dword data. */
static const uint32_t gen75_mfc_batchbuffer_avc_intra[][4] = {
#include "shaders/utils/mfc_batchbuffer_avc_intra.g7b"
};

static const uint32_t gen75_mfc_batchbuffer_avc_inter[][4] = {
#include "shaders/utils/mfc_batchbuffer_avc_inter.g7b"
};

/* Kernel descriptors used when the MFC batch buffer is built on the GPU
 * (one kernel for intra frames, one for inter frames). */
static struct i965_kernel gen75_mfc_kernels[] = {
    {
        "MFC AVC INTRA BATCHBUFFER ",
        MFC_BATCHBUFFER_AVC_INTRA,
        gen75_mfc_batchbuffer_avc_intra,
        sizeof(gen75_mfc_batchbuffer_avc_intra),
        NULL
    },

    {
        "MFC AVC INTER BATCHBUFFER ",
        MFC_BATCHBUFFER_AVC_INTER,
        gen75_mfc_batchbuffer_avc_inter,
        sizeof(gen75_mfc_batchbuffer_avc_inter),
        NULL
    },
};
76
77 #define         INTER_MODE_MASK         0x03
78 #define         INTER_8X8               0x03
79 #define         INTER_16X8              0x01
80 #define         INTER_8X16              0x02
81 #define         SUBMB_SHAPE_MASK        0x00FF00
82
83 #define         INTER_MV8               (4 << 20)
84 #define         INTER_MV32              (6 << 20)
85
86
87 static void
88 gen75_mfc_pipe_mode_select(VADriverContextP ctx,
89                           int standard_select,
90                           struct intel_encoder_context *encoder_context)
91 {
92     struct intel_batchbuffer *batch = encoder_context->base.batch;
93     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
94
95     assert(standard_select == MFX_FORMAT_MPEG2 ||
96            standard_select == MFX_FORMAT_AVC);
97
98     BEGIN_BCS_BATCH(batch, 5);
99
100     OUT_BCS_BATCH(batch, MFX_PIPE_MODE_SELECT | (5 - 2));
101     OUT_BCS_BATCH(batch,
102                   (MFX_LONG_MODE << 17) | /* Must be long format for encoder */
103                   (MFD_MODE_VLD << 15) | /* VLD mode */
104                   (1 << 10) | /* Stream-Out Enable */
105                   ((!!mfc_context->post_deblocking_output.bo) << 9)  | /* Post Deblocking Output */
106                   ((!!mfc_context->pre_deblocking_output.bo) << 8)  | /* Pre Deblocking Output */
107                   (0 << 8)  | /* Pre Deblocking Output */
108                   (0 << 5)  | /* not in stitch mode */
109                   (1 << 4)  | /* encoding mode */
110                   (standard_select << 0));  /* standard select: avc or mpeg2 */
111     OUT_BCS_BATCH(batch,
112                   (0 << 7)  | /* expand NOA bus flag */
113                   (0 << 6)  | /* disable slice-level clock gating */
114                   (0 << 5)  | /* disable clock gating for NOA */
115                   (0 << 4)  | /* terminate if AVC motion and POC table error occurs */
116                   (0 << 3)  | /* terminate if AVC mbdata error occurs */
117                   (0 << 2)  | /* terminate if AVC CABAC/CAVLC decode error occurs */
118                   (0 << 1)  |
119                   (0 << 0));
120     OUT_BCS_BATCH(batch, 0);
121     OUT_BCS_BATCH(batch, 0);
122
123     ADVANCE_BCS_BATCH(batch);
124 }
125
126 static void
127 gen75_mfc_surface_state(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
128 {
129     struct intel_batchbuffer *batch = encoder_context->base.batch;
130     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
131
132     BEGIN_BCS_BATCH(batch, 6);
133
134     OUT_BCS_BATCH(batch, MFX_SURFACE_STATE | (6 - 2));
135     OUT_BCS_BATCH(batch, 0);
136     OUT_BCS_BATCH(batch,
137                   ((mfc_context->surface_state.height - 1) << 18) |
138                   ((mfc_context->surface_state.width - 1) << 4));
139     OUT_BCS_BATCH(batch,
140                   (MFX_SURFACE_PLANAR_420_8 << 28) | /* 420 planar YUV surface */
141                   (1 << 27) | /* must be 1 for interleave U/V, hardware requirement */
142                   (0 << 22) | /* surface object control state, FIXME??? */
143                   ((mfc_context->surface_state.w_pitch - 1) << 3) | /* pitch */
144                   (0 << 2)  | /* must be 0 for interleave U/V */
145                   (1 << 1)  | /* must be tiled */
146                   (I965_TILEWALK_YMAJOR << 0));  /* tile walk, TILEWALK_YMAJOR */
147     OUT_BCS_BATCH(batch,
148                   (0 << 16) |                                                           /* must be 0 for interleave U/V */
149                   (mfc_context->surface_state.h_pitch));                /* y offset for U(cb) */
150     OUT_BCS_BATCH(batch, 0);
151
152     ADVANCE_BCS_BATCH(batch);
153 }
154
/*
 * Emit MFX_IND_OBJ_BASE_ADDR_STATE using the 26-dword layout required by
 * B+ stepping Haswell parts.
 *
 * This is where the PAK is fed the VME output: DW6 points the indirect
 * MV-object base at vme_context->vme_output.bo, so the motion vectors the
 * VME produced are what the PAK consumes.  DW21-25 point the indirect
 * PAK-BSE object at the buffer that receives the compressed bitstream.
 * The bitstream-decode, IT-COFF and DBLK sections are zeroed — unused
 * when encoding.
 */
static void
gen75_mfc_ind_obj_base_addr_state_bplus(VADriverContextP ctx,
                                struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    struct gen6_vme_context *vme_context = encoder_context->vme_context;

    BEGIN_BCS_BATCH(batch, 26);

    OUT_BCS_BATCH(batch, MFX_IND_OBJ_BASE_ADDR_STATE | (26 - 2));
        /* the DW1-3 is for the MFX indirect bitstream offset (decode only) */
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
        /* the DW4-5 is the MFX upper bound */
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    /* the DW6-10 is for MFX Indirect MV Object Base Address: VME output */
    OUT_BCS_RELOC(batch, vme_context->vme_output.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0x80000000); /* must set, up to 2G */
    OUT_BCS_BATCH(batch, 0);

     /* the DW11-15 is for MFX IT-COFF. Not used on encoder */
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

     /* the DW16-20 is for MFX indirect DBLK. Not used on encoder */
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    /* the DW21-25 is for MFC Indirect PAK-BSE Object Base Address for Encoder*/
    OUT_BCS_RELOC(batch,
                  mfc_context->mfc_indirect_pak_bse_object.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    /* upper bound: end of the PAK-BSE buffer */
    OUT_BCS_RELOC(batch,
                  mfc_context->mfc_indirect_pak_bse_object.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  mfc_context->mfc_indirect_pak_bse_object.end_offset);
    OUT_BCS_BATCH(batch, 0);

    ADVANCE_BCS_BATCH(batch);
}
211
/*
 * Emit MFX_IND_OBJ_BASE_ADDR_STATE.  On B+ stepping hardware the command
 * grew to 26 dwords, so dispatch to the _bplus variant there; otherwise
 * use the original 11-dword layout.  Either way the indirect MV object
 * base is the VME output buffer and the PAK-BSE object is the bitstream
 * destination.
 */
static void
gen75_mfc_ind_obj_base_addr_state(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    struct gen6_vme_context *vme_context = encoder_context->vme_context;
    struct i965_driver_data *i965 = i965_driver_data(ctx);

    if (IS_STEPPING_BPLUS(i965)) {
        gen75_mfc_ind_obj_base_addr_state_bplus(ctx, encoder_context);
        return;
    }
    BEGIN_BCS_BATCH(batch, 11);

    OUT_BCS_BATCH(batch, MFX_IND_OBJ_BASE_ADDR_STATE | (11 - 2));
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    /* MFX Indirect MV Object Base Address: the VME output */
    OUT_BCS_RELOC(batch, vme_context->vme_output.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
    OUT_BCS_BATCH(batch, 0x80000000); /* must set, up to 2G */
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    /*MFC Indirect PAK-BSE Object Base Address for Encoder*/
    OUT_BCS_RELOC(batch,
                  mfc_context->mfc_indirect_pak_bse_object.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  0);
    OUT_BCS_RELOC(batch,
                  mfc_context->mfc_indirect_pak_bse_object.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  mfc_context->mfc_indirect_pak_bse_object.end_offset);

    ADVANCE_BCS_BATCH(batch);
}
248
/*
 * Emit MFX_AVC_IMG_STATE: per-picture AVC encoding parameters.
 *
 * Picture dimensions come from mfc_context->surface_state; entropy mode,
 * weighted prediction and 8x8 transform flags come straight from the
 * application-supplied VAEncPictureParameterBufferH264.  The remaining
 * fields are fixed encoder defaults (progressive frame, 4:2:0, unpacked
 * MVs, no MB-level rate-control overrides).
 */
static void
gen75_mfc_avc_img_state(VADriverContextP ctx, struct encode_state *encode_state,
                       struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    VAEncPictureParameterBufferH264 *pPicParameter = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;

    int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
    int height_in_mbs = (mfc_context->surface_state.height + 15) / 16;

    BEGIN_BCS_BATCH(batch, 16);

    OUT_BCS_BATCH(batch, MFX_AVC_IMG_STATE | (16 - 2));
    OUT_BCS_BATCH(batch,
                  ((width_in_mbs * height_in_mbs) & 0xFFFF));
    OUT_BCS_BATCH(batch,
                  ((height_in_mbs - 1) << 16) |
                  ((width_in_mbs - 1) << 0));
    OUT_BCS_BATCH(batch,
                  (0 << 24) |   /* Second Chroma QP Offset */
                  (0 << 16) |   /* Chroma QP Offset */
                  (0 << 14) |   /* Max-bit conformance Intra flag */
                  (0 << 13) |   /* Max Macroblock size conformance Inter flag */
                  (pPicParameter->pic_fields.bits.weighted_pred_flag << 12) |   /*Weighted_Pred_Flag */
                  (pPicParameter->pic_fields.bits.weighted_bipred_idc << 10) |  /* Weighted_BiPred_Idc */
                  (0 << 8)  |   /* FIXME: Image Structure */
                  (0 << 0) );   /* Current Decoded Image Frame Store ID, reserved in Encode mode */
    OUT_BCS_BATCH(batch,
                  (0 << 16) |   /* Minimum Frame size */
                  (0 << 15) |   /* Disable reading of Macroblock Status Buffer */
                  (0 << 14) |   /* Load BitStream Pointer only once, 1 slice 1 frame */
                  (0 << 13) |   /* CABAC 0 word insertion test enable */
                  (1 << 12) |   /* MVUnpackedEnable,compliant to DXVA */
                  (1 << 10) |   /* Chroma Format IDC, 4:2:0 */
                  (0 << 8)  |   /* FIXME: MbMvFormatFlag */
                  (pPicParameter->pic_fields.bits.entropy_coding_mode_flag << 7)  |   /*0:CAVLC encoding mode,1:CABAC*/
                  (0 << 6)  |   /* Only valid for VLD decoding mode */
                  (0 << 5)  |   /* Constrained Intra Prediction Flag, from PPS */
                  (0 << 4)  |   /* Direct 8x8 inference flag */
                  (pPicParameter->pic_fields.bits.transform_8x8_mode_flag << 3)  |   /*8x8 or 4x4 IDCT Transform Mode Flag*/
                  (1 << 2)  |   /* Frame MB only flag */
                  (0 << 1)  |   /* MBAFF mode is in active */
                  (0 << 0));    /* Field picture flag */
    OUT_BCS_BATCH(batch, 0);    /* Mainly about MB rate control and debug, just ignoring */
    OUT_BCS_BATCH(batch,        /* Inter and Intra Conformance Max size limit */
                  (0xBB8 << 16) |       /* InterMbMaxSz */
                  (0xEE8) );            /* IntraMbMaxSz */
    OUT_BCS_BATCH(batch, 0);            /* Reserved */
    OUT_BCS_BATCH(batch, 0);            /* Slice QP Delta for bitrate control */
    OUT_BCS_BATCH(batch, 0);            /* Slice QP Delta for bitrate control */
    OUT_BCS_BATCH(batch, 0x8C000000);   /* magic trellis/RC dword — value kept as-is, meaning unverified */
    OUT_BCS_BATCH(batch, 0x00010000);   /* magic dword — value kept as-is, meaning unverified */
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    ADVANCE_BCS_BATCH(batch);
}
309
310 static void
311 gen75_mfc_qm_state(VADriverContextP ctx,
312                   int qm_type,
313                   unsigned int *qm,
314                   int qm_length,
315                   struct intel_encoder_context *encoder_context)
316 {
317     struct intel_batchbuffer *batch = encoder_context->base.batch;
318     unsigned int qm_buffer[16];
319
320     assert(qm_length <= 16);
321     assert(sizeof(*qm) == 4);
322     memcpy(qm_buffer, qm, qm_length * 4);
323
324     BEGIN_BCS_BATCH(batch, 18);
325     OUT_BCS_BATCH(batch, MFX_QM_STATE | (18 - 2));
326     OUT_BCS_BATCH(batch, qm_type << 0);
327     intel_batchbuffer_data(batch, qm_buffer, 16 * 4);
328     ADVANCE_BCS_BATCH(batch);
329 }
330
331 static void
332 gen75_mfc_avc_qm_state(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
333 {
334     unsigned int qm[16] = {
335         0x10101010, 0x10101010, 0x10101010, 0x10101010,
336         0x10101010, 0x10101010, 0x10101010, 0x10101010,
337         0x10101010, 0x10101010, 0x10101010, 0x10101010,
338         0x10101010, 0x10101010, 0x10101010, 0x10101010
339     };
340
341     gen75_mfc_qm_state(ctx, MFX_QM_AVC_4X4_INTRA_MATRIX, qm, 12, encoder_context);
342     gen75_mfc_qm_state(ctx, MFX_QM_AVC_4X4_INTER_MATRIX, qm, 12, encoder_context);
343     gen75_mfc_qm_state(ctx, MFX_QM_AVC_8x8_INTRA_MATRIX, qm, 16, encoder_context);
344     gen75_mfc_qm_state(ctx, MFX_QM_AVC_8x8_INTER_MATRIX, qm, 16, encoder_context);
345 }
346
347 static void
348 gen75_mfc_fqm_state(VADriverContextP ctx,
349                    int fqm_type,
350                    unsigned int *fqm,
351                    int fqm_length,
352                    struct intel_encoder_context *encoder_context)
353 {
354     struct intel_batchbuffer *batch = encoder_context->base.batch;
355     unsigned int fqm_buffer[32];
356
357     assert(fqm_length <= 32);
358     assert(sizeof(*fqm) == 4);
359     memcpy(fqm_buffer, fqm, fqm_length * 4);
360
361     BEGIN_BCS_BATCH(batch, 34);
362     OUT_BCS_BATCH(batch, MFX_FQM_STATE | (34 - 2));
363     OUT_BCS_BATCH(batch, fqm_type << 0);
364     intel_batchbuffer_data(batch, fqm_buffer, 32 * 4);
365     ADVANCE_BCS_BATCH(batch);
366 }
367
368 static void
369 gen75_mfc_avc_fqm_state(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
370 {
371     unsigned int qm[32] = {
372         0x10001000, 0x10001000, 0x10001000, 0x10001000,
373         0x10001000, 0x10001000, 0x10001000, 0x10001000,
374         0x10001000, 0x10001000, 0x10001000, 0x10001000,
375         0x10001000, 0x10001000, 0x10001000, 0x10001000,
376         0x10001000, 0x10001000, 0x10001000, 0x10001000,
377         0x10001000, 0x10001000, 0x10001000, 0x10001000,
378         0x10001000, 0x10001000, 0x10001000, 0x10001000,
379         0x10001000, 0x10001000, 0x10001000, 0x10001000
380     };
381
382     gen75_mfc_fqm_state(ctx, MFX_QM_AVC_4X4_INTRA_MATRIX, qm, 24, encoder_context);
383     gen75_mfc_fqm_state(ctx, MFX_QM_AVC_4X4_INTER_MATRIX, qm, 24, encoder_context);
384     gen75_mfc_fqm_state(ctx, MFX_QM_AVC_8x8_INTRA_MATRIX, qm, 32, encoder_context);
385     gen75_mfc_fqm_state(ctx, MFX_QM_AVC_8x8_INTER_MATRIX, qm, 32, encoder_context);
386 }
387
388 static void
389 gen75_mfc_avc_insert_object(VADriverContextP ctx, struct intel_encoder_context *encoder_context,
390                            unsigned int *insert_data, int lenght_in_dws, int data_bits_in_last_dw,
391                            int skip_emul_byte_count, int is_last_header, int is_end_of_slice, int emulation_flag,
392                            struct intel_batchbuffer *batch)
393 {
394     if (batch == NULL)
395         batch = encoder_context->base.batch;
396
397     BEGIN_BCS_BATCH(batch, lenght_in_dws + 2);
398
399     OUT_BCS_BATCH(batch, MFX_INSERT_OBJECT | (lenght_in_dws + 2 - 2));
400     OUT_BCS_BATCH(batch,
401                   (0 << 16) |   /* always start at offset 0 */
402                   (data_bits_in_last_dw << 8) |
403                   (skip_emul_byte_count << 4) |
404                   (!!emulation_flag << 3) |
405                   ((!!is_last_header) << 2) |
406                   ((!!is_end_of_slice) << 1) |
407                   (0 << 0));    /* FIXME: ??? */
408     intel_batchbuffer_data(batch, insert_data, lenght_in_dws * 4);
409
410     ADVANCE_BCS_BATCH(batch);
411 }
412
413
414 static void gen75_mfc_init(VADriverContextP ctx,
415                         struct encode_state *encode_state,
416                         struct intel_encoder_context *encoder_context)
417 {
418     struct i965_driver_data *i965 = i965_driver_data(ctx);
419     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
420     dri_bo *bo;
421     int i;
422     VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
423     int width_in_mbs = pSequenceParameter->picture_width_in_mbs;
424     int height_in_mbs = pSequenceParameter->picture_height_in_mbs;
425
426     /*Encode common setup for MFC*/
427     dri_bo_unreference(mfc_context->post_deblocking_output.bo);
428     mfc_context->post_deblocking_output.bo = NULL;
429
430     dri_bo_unreference(mfc_context->pre_deblocking_output.bo);
431     mfc_context->pre_deblocking_output.bo = NULL;
432
433     dri_bo_unreference(mfc_context->uncompressed_picture_source.bo);
434     mfc_context->uncompressed_picture_source.bo = NULL;
435
436     dri_bo_unreference(mfc_context->mfc_indirect_pak_bse_object.bo); 
437     mfc_context->mfc_indirect_pak_bse_object.bo = NULL;
438
439     for (i = 0; i < NUM_MFC_DMV_BUFFERS; i++){
440         if ( mfc_context->direct_mv_buffers[i].bo != NULL);
441         dri_bo_unreference(mfc_context->direct_mv_buffers[i].bo);
442         mfc_context->direct_mv_buffers[i].bo = NULL;
443     }
444
445     for (i = 0; i < MAX_MFC_REFERENCE_SURFACES; i++){
446         if (mfc_context->reference_surfaces[i].bo != NULL)
447             dri_bo_unreference(mfc_context->reference_surfaces[i].bo);
448         mfc_context->reference_surfaces[i].bo = NULL;  
449     }
450
451     dri_bo_unreference(mfc_context->intra_row_store_scratch_buffer.bo);
452     bo = dri_bo_alloc(i965->intel.bufmgr,
453                       "Buffer",
454                       width_in_mbs * 64,
455                       64);
456     assert(bo);
457     mfc_context->intra_row_store_scratch_buffer.bo = bo;
458
459     dri_bo_unreference(mfc_context->macroblock_status_buffer.bo);
460     bo = dri_bo_alloc(i965->intel.bufmgr,
461                       "Buffer",
462                       width_in_mbs * height_in_mbs * 16,
463                       64);
464     assert(bo);
465     mfc_context->macroblock_status_buffer.bo = bo;
466
467     dri_bo_unreference(mfc_context->deblocking_filter_row_store_scratch_buffer.bo);
468     bo = dri_bo_alloc(i965->intel.bufmgr,
469                       "Buffer",
470                       4 * width_in_mbs * 64,  /* 4 * width_in_mbs * 64 */
471                       64);
472     assert(bo);
473     mfc_context->deblocking_filter_row_store_scratch_buffer.bo = bo;
474
475     dri_bo_unreference(mfc_context->bsd_mpc_row_store_scratch_buffer.bo);
476     bo = dri_bo_alloc(i965->intel.bufmgr,
477                       "Buffer",
478                       2 * width_in_mbs * 64, /* 2 * width_in_mbs * 64 */
479                       0x1000);
480     assert(bo);
481     mfc_context->bsd_mpc_row_store_scratch_buffer.bo = bo;
482
483     dri_bo_unreference(mfc_context->mfc_batchbuffer_surface.bo);
484     mfc_context->mfc_batchbuffer_surface.bo = NULL;
485
486     dri_bo_unreference(mfc_context->aux_batchbuffer_surface.bo);
487     mfc_context->aux_batchbuffer_surface.bo = NULL;
488
489     if (mfc_context->aux_batchbuffer)
490         intel_batchbuffer_free(mfc_context->aux_batchbuffer);
491
492     mfc_context->aux_batchbuffer = intel_batchbuffer_new(&i965->intel, I915_EXEC_BSD, 0);
493     mfc_context->aux_batchbuffer_surface.bo = mfc_context->aux_batchbuffer->buffer;
494     dri_bo_reference(mfc_context->aux_batchbuffer_surface.bo);
495     mfc_context->aux_batchbuffer_surface.pitch = 16;
496     mfc_context->aux_batchbuffer_surface.num_blocks = mfc_context->aux_batchbuffer->size / 16;
497     mfc_context->aux_batchbuffer_surface.size_block = 16;
498
499     i965_gpe_context_init(ctx, &mfc_context->gpe_context);
500 }
501
/*
 * Emit MFX_PIPE_BUF_ADDR_STATE using the 61-dword B+ stepping layout:
 * each buffer address occupies a dword pair (address + upper bits/zero)
 * plus padding.  Supplies the pre/post-deblocking outputs, the source
 * picture, the MB status / stream-out buffer, the row-store scratch
 * buffers and the reference picture list.
 *
 * NOTE(review): in the pre/post-deblocking sections only the FIRST
 * OUT_BCS_BATCH after `else` belongs to the conditional; the following
 * indented zero dwords are emitted unconditionally as the remaining
 * dwords of the pair.  The indentation is misleading but the behavior
 * is correct.
 */
static void
gen75_mfc_pipe_buf_addr_state_bplus(VADriverContextP ctx,
                                struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    int i;

    BEGIN_BCS_BATCH(batch, 61);

    OUT_BCS_BATCH(batch, MFX_PIPE_BUF_ADDR_STATE | (61 - 2));

    /* the DW1-3 is for pre_deblocking */
    if (mfc_context->pre_deblocking_output.bo)
        OUT_BCS_RELOC(batch, mfc_context->pre_deblocking_output.bo,
                      I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                      0);
    else
        OUT_BCS_BATCH(batch, 0);                                                                                        /* pre output addr   */

        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);
     /* the DW4-6 is for the post_deblocking */

    if (mfc_context->post_deblocking_output.bo)
        OUT_BCS_RELOC(batch, mfc_context->post_deblocking_output.bo,
                      I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                      0);                                                                                       /* post output addr  */ 
    else
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);

     /* the DW7-9 is for the uncompressed_picture */
    OUT_BCS_RELOC(batch, mfc_context->uncompressed_picture_source.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  0); /* uncompressed data */

        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);

     /* the DW10-12 is for the mb status */
    OUT_BCS_RELOC(batch, mfc_context->macroblock_status_buffer.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  0); /* StreamOut data*/
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);

     /* the DW13-15 is for the intra_row_store_scratch */
    OUT_BCS_RELOC(batch, mfc_context->intra_row_store_scratch_buffer.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  0);   
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);

     /* the DW16-18 is for the deblocking filter */
    OUT_BCS_RELOC(batch, mfc_context->deblocking_filter_row_store_scratch_buffer.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  0);
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);

    /* the DW 19-50 is for Reference pictures*/
    for (i = 0; i < ARRAY_ELEMS(mfc_context->reference_surfaces); i++) {
        if ( mfc_context->reference_surfaces[i].bo != NULL) {
            OUT_BCS_RELOC(batch, mfc_context->reference_surfaces[i].bo,
                          I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                          0);                   
        } else {
            OUT_BCS_BATCH(batch, 0);
        }
        OUT_BCS_BATCH(batch, 0);
    }
        OUT_BCS_BATCH(batch, 0);

        /* The DW 52-54 is for the MB status buffer */
    OUT_BCS_RELOC(batch, mfc_context->macroblock_status_buffer.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  0);                                                                                   /* Macroblock status buffer*/
        
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);

        /* the DW 55-57 is the ILDB buffer */
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);

        /* the DW 58-60 is the second ILDB buffer */
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);
    ADVANCE_BCS_BATCH(batch);
}
596
/*
 * Emit MFX_PIPE_BUF_ADDR_STATE.  B+ stepping parts use the 61-dword
 * layout (dispatched to the _bplus variant); earlier steppings use the
 * original 25-dword layout with one dword per buffer address.
 */
static void
gen75_mfc_pipe_buf_addr_state(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    struct i965_driver_data *i965 = i965_driver_data(ctx);
    int i;

    if (IS_STEPPING_BPLUS(i965)) {
        gen75_mfc_pipe_buf_addr_state_bplus(ctx, encoder_context);
        return;
    }

    BEGIN_BCS_BATCH(batch, 25);

    OUT_BCS_BATCH(batch, MFX_PIPE_BUF_ADDR_STATE | (25 - 2));

    if (mfc_context->pre_deblocking_output.bo)
        OUT_BCS_RELOC(batch, mfc_context->pre_deblocking_output.bo,
                      I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                      0);
    else
        OUT_BCS_BATCH(batch, 0);                                                                                        /* pre output addr   */

    if (mfc_context->post_deblocking_output.bo)
        OUT_BCS_RELOC(batch, mfc_context->post_deblocking_output.bo,
                      I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                      0);                                                                                       /* post output addr  */ 
    else
        OUT_BCS_BATCH(batch, 0);

    OUT_BCS_RELOC(batch, mfc_context->uncompressed_picture_source.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  0);                                                                                   /* uncompressed data */
    OUT_BCS_RELOC(batch, mfc_context->macroblock_status_buffer.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  0);                                                                                   /* StreamOut data*/
    OUT_BCS_RELOC(batch, mfc_context->intra_row_store_scratch_buffer.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  0);   
    OUT_BCS_RELOC(batch, mfc_context->deblocking_filter_row_store_scratch_buffer.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  0);
    /* 7..22 Reference pictures*/
    for (i = 0; i < ARRAY_ELEMS(mfc_context->reference_surfaces); i++) {
        if ( mfc_context->reference_surfaces[i].bo != NULL) {
            OUT_BCS_RELOC(batch, mfc_context->reference_surfaces[i].bo,
                          I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                          0);                   
        } else {
            OUT_BCS_BATCH(batch, 0);
        }
    }
    OUT_BCS_RELOC(batch, mfc_context->macroblock_status_buffer.bo,
                  I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  0);                                                                                   /* Macroblock status buffer*/

        OUT_BCS_BATCH(batch, 0);

    ADVANCE_BCS_BATCH(batch);
}
658
/*
 * Emit MFX_AVC_DIRECTMODE_STATE using the 71-dword B+ stepping layout:
 * direct-MV buffers for the reference frames (dword pairs, DW1-32), the
 * MV write buffer for the current frame (DW34-36), then the 32-entry
 * POC list.  The POC values are synthesized as i/2 (frame POC counted
 * in frames, two list entries per frame) rather than taken from the
 * picture parameters — presumably adequate for the P/B patterns this
 * encoder emits; verify against the slice setup if reordering is added.
 */
static void
gen75_mfc_avc_directmode_state_bplus(VADriverContextP ctx,
                                struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;

    int i;

    BEGIN_BCS_BATCH(batch, 71);

    OUT_BCS_BATCH(batch, MFX_AVC_DIRECTMODE_STATE | (71 - 2));

    /* Reference frames and Current frames */
    /* the DW1-32 is for the direct MV for reference */
    for(i = 0; i < NUM_MFC_DMV_BUFFERS - 2; i += 2) {
        if ( mfc_context->direct_mv_buffers[i].bo != NULL) { 
            OUT_BCS_RELOC(batch, mfc_context->direct_mv_buffers[i].bo,
                          I915_GEM_DOMAIN_INSTRUCTION, 0,
                          0);
            OUT_BCS_BATCH(batch, 0);
        } else {
            OUT_BCS_BATCH(batch, 0);
            OUT_BCS_BATCH(batch, 0);
        }
    }
        OUT_BCS_BATCH(batch, 0);

        /* the DW34-36 is the MV for the current reference */
        OUT_BCS_RELOC(batch, mfc_context->direct_mv_buffers[NUM_MFC_DMV_BUFFERS - 2].bo,
                          I915_GEM_DOMAIN_INSTRUCTION, 0,
                          0);

        OUT_BCS_BATCH(batch, 0);
        OUT_BCS_BATCH(batch, 0);

    /* POC list */
    for(i = 0; i < 32; i++) {
        OUT_BCS_BATCH(batch, i/2);
    }
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    ADVANCE_BCS_BATCH(batch);
}
704
/*
 * Emit MFX_AVC_DIRECTMODE_STATE.  B+ stepping parts use the 71-dword
 * layout (dispatched to the _bplus variant); earlier steppings use the
 * 69-dword layout with one dword per direct-MV buffer followed by the
 * same synthesized 32-entry POC list (i/2 per entry).
 */
static void
gen75_mfc_avc_directmode_state(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    struct i965_driver_data *i965 = i965_driver_data(ctx);
    int i;

    if (IS_STEPPING_BPLUS(i965)) {
        gen75_mfc_avc_directmode_state_bplus(ctx, encoder_context);
        return;
    }

    BEGIN_BCS_BATCH(batch, 69);

    OUT_BCS_BATCH(batch, MFX_AVC_DIRECTMODE_STATE | (69 - 2));

    /* Reference frames and Current frames */
    for(i = 0; i < NUM_MFC_DMV_BUFFERS; i++) {
        if ( mfc_context->direct_mv_buffers[i].bo != NULL) { 
            OUT_BCS_RELOC(batch, mfc_context->direct_mv_buffers[i].bo,
                          I915_GEM_DOMAIN_INSTRUCTION, 0,
                          0);
        } else {
            OUT_BCS_BATCH(batch, 0);
        }
    }

    /* POC list */
    for(i = 0; i < 32; i++) {
        OUT_BCS_BATCH(batch, i/2);
    }
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);

    ADVANCE_BCS_BATCH(batch);
}
742
743 static void
744 gen75_mfc_avc_ref_idx_state(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
745 {
746     struct intel_batchbuffer *batch = encoder_context->base.batch;
747     int i;
748
749     BEGIN_BCS_BATCH(batch, 10);
750     OUT_BCS_BATCH(batch, MFX_AVC_REF_IDX_STATE | 8); 
751     OUT_BCS_BATCH(batch, 0);                  //Select L0
752     OUT_BCS_BATCH(batch, 0x80808020);         //Only 1 reference
753     for(i = 0; i < 7; i++) {
754         OUT_BCS_BATCH(batch, 0x80808080);
755     }   
756     ADVANCE_BCS_BATCH(batch);
757
758     BEGIN_BCS_BATCH(batch, 10);
759     OUT_BCS_BATCH(batch, MFX_AVC_REF_IDX_STATE | 8); 
760     OUT_BCS_BATCH(batch, 1);                  //Select L1
761     OUT_BCS_BATCH(batch, 0x80808022);         //Only 1 reference
762     for(i = 0; i < 7; i++) {
763         OUT_BCS_BATCH(batch, 0x80808080);
764     }   
765     ADVANCE_BCS_BATCH(batch);
766 }
767
768
769 static void
770 gen75_mfc_bsp_buf_base_addr_state_bplus(VADriverContextP ctx,
771                                 struct intel_encoder_context *encoder_context)
772 {
773     struct intel_batchbuffer *batch = encoder_context->base.batch;
774     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
775
776     BEGIN_BCS_BATCH(batch, 10);
777
778     OUT_BCS_BATCH(batch, MFX_BSP_BUF_BASE_ADDR_STATE | (10 - 2));
779     OUT_BCS_RELOC(batch, mfc_context->bsd_mpc_row_store_scratch_buffer.bo,
780                   I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
781                   0);
782     OUT_BCS_BATCH(batch, 0);
783     OUT_BCS_BATCH(batch, 0);
784         
785         /* the DW4-6 is for MPR Row Store Scratch Buffer Base Address */
786     OUT_BCS_BATCH(batch, 0);
787     OUT_BCS_BATCH(batch, 0);
788     OUT_BCS_BATCH(batch, 0);
789
790         /* the DW7-9 is for Bitplane Read Buffer Base Address */
791     OUT_BCS_BATCH(batch, 0);
792     OUT_BCS_BATCH(batch, 0);
793     OUT_BCS_BATCH(batch, 0);
794
795     ADVANCE_BCS_BATCH(batch);
796 }
797
798 static void
799 gen75_mfc_bsp_buf_base_addr_state(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
800 {
801     struct intel_batchbuffer *batch = encoder_context->base.batch;
802     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
803     struct i965_driver_data *i965 = i965_driver_data(ctx);
804
805     if (IS_STEPPING_BPLUS(i965)) {
806         gen75_mfc_bsp_buf_base_addr_state_bplus(ctx, encoder_context);
807         return;
808     }
809
810     BEGIN_BCS_BATCH(batch, 4);
811
812     OUT_BCS_BATCH(batch, MFX_BSP_BUF_BASE_ADDR_STATE | (4 - 2));
813     OUT_BCS_RELOC(batch, mfc_context->bsd_mpc_row_store_scratch_buffer.bo,
814                   I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
815                   0);
816     OUT_BCS_BATCH(batch, 0);
817     OUT_BCS_BATCH(batch, 0);
818
819     ADVANCE_BCS_BATCH(batch);
820 }
821
822
/*
 * Emit all per-picture MFX state commands for an AVC encode.
 * The commands are ordered as the PAK pipeline expects: pipe mode,
 * surfaces and buffer addresses first, then image/quantizer state,
 * then direct-mode and reference-index state.
 */
static void gen75_mfc_avc_pipeline_picture_programing( VADriverContextP ctx,
                                      struct encode_state *encode_state,
                                      struct intel_encoder_context *encoder_context)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;

    mfc_context->pipe_mode_select(ctx, MFX_FORMAT_AVC, encoder_context);
    mfc_context->set_surface_state(ctx, encoder_context);
    mfc_context->ind_obj_base_addr_state(ctx, encoder_context);
    gen75_mfc_pipe_buf_addr_state(ctx, encoder_context);
    gen75_mfc_bsp_buf_base_addr_state(ctx, encoder_context);
    mfc_context->avc_img_state(ctx, encode_state, encoder_context);
    mfc_context->avc_qm_state(ctx, encoder_context);
    mfc_context->avc_fqm_state(ctx, encoder_context);
    gen75_mfc_avc_directmode_state(ctx, encoder_context);
    gen75_mfc_avc_ref_idx_state(ctx, encoder_context);
}
840
841
842 static VAStatus gen75_mfc_run(VADriverContextP ctx, 
843                              struct encode_state *encode_state,
844                              struct intel_encoder_context *encoder_context)
845 {
846     struct intel_batchbuffer *batch = encoder_context->base.batch;
847
848     intel_batchbuffer_flush(batch);             //run the pipeline
849
850     return VA_STATUS_SUCCESS;
851 }
852
853
854 static VAStatus
855 gen75_mfc_stop(VADriverContextP ctx, 
856               struct encode_state *encode_state,
857               struct intel_encoder_context *encoder_context,
858               int *encoded_bits_size)
859 {
860     VAStatus vaStatus = VA_STATUS_ERROR_UNKNOWN;
861     VAEncPictureParameterBufferH264 *pPicParameter = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
862     VACodedBufferSegment *coded_buffer_segment;
863     
864     vaStatus = i965_MapBuffer(ctx, pPicParameter->coded_buf, (void **)&coded_buffer_segment);
865     assert(vaStatus == VA_STATUS_SUCCESS);
866     *encoded_bits_size = coded_buffer_segment->size * 8;
867     i965_UnmapBuffer(ctx, pPicParameter->coded_buf);
868
869     return VA_STATUS_SUCCESS;
870 }
871
872
/*
 * Emit the 11-dword MFX_AVC_SLICE_STATE command for one slice.
 *
 * Packs slice type, prediction-weight denominators, QP and deblocking
 * offsets, slice MB extents, and the bit-rate-control parameters
 * (grow/shrink/correct) taken from the per-slice-type RC context.
 *
 * @param rate_control_enable  currently not fed into the RC enable bits
 *                             (they are hard-coded to 0 below)
 * @param qp                   slice quantization parameter
 * @param batch                target batchbuffer; NULL means the default
 *                             batch of the encoder context
 */
static void
gen75_mfc_avc_slice_state(VADriverContextP ctx,
                         VAEncPictureParameterBufferH264 *pic_param,
                         VAEncSliceParameterBufferH264 *slice_param,
                         struct encode_state *encode_state,
                         struct intel_encoder_context *encoder_context,
                         int rate_control_enable,
                         int qp,
                         struct intel_batchbuffer *batch)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
    int height_in_mbs = (mfc_context->surface_state.height + 15) / 16;
    int beginmb = slice_param->macroblock_address;
    int endmb = beginmb + slice_param->num_macroblocks;
    int beginx = beginmb % width_in_mbs;
    int beginy = beginmb / width_in_mbs;
    int nextx =  endmb % width_in_mbs;
    int nexty = endmb / width_in_mbs;
    int slice_type = slice_param->slice_type;
    int last_slice = (endmb == (width_in_mbs * height_in_mbs));
    int bit_rate_control_target, maxQpN, maxQpP;
    unsigned char correct[6], grow, shrink;
    int i;
    int weighted_pred_idc = 0;
    unsigned int luma_log2_weight_denom = slice_param->luma_log2_weight_denom;
    unsigned int chroma_log2_weight_denom = slice_param->chroma_log2_weight_denom;

    if (batch == NULL)
        batch = encoder_context->base.batch;

    /* SP/SI slices share the RC context of P/I slices respectively */
    bit_rate_control_target = slice_type;
    if (slice_type == SLICE_TYPE_SP)
        bit_rate_control_target = SLICE_TYPE_P;
    else if (slice_type == SLICE_TYPE_SI)
        bit_rate_control_target = SLICE_TYPE_I;

    if (slice_type == SLICE_TYPE_P) {
        weighted_pred_idc = pic_param->pic_fields.bits.weighted_pred_flag;
    } else if (slice_type == SLICE_TYPE_B) {
        weighted_pred_idc = pic_param->pic_fields.bits.weighted_bipred_idc;

        if (weighted_pred_idc == 2) {
            /* 8.4.3 - Derivation process for prediction weights (8-279) */
            luma_log2_weight_denom = 5;
            chroma_log2_weight_denom = 5;
        }
    }

    maxQpN = mfc_context->bit_rate_control_context[bit_rate_control_target].MaxQpNegModifier;
    maxQpP = mfc_context->bit_rate_control_context[bit_rate_control_target].MaxQpPosModifier;

    for (i = 0; i < 6; i++)
        correct[i] = mfc_context->bit_rate_control_context[bit_rate_control_target].Correct[i];

    /* grow/shrink pack init value (low nibble) and resistance (high nibble) */
    grow = mfc_context->bit_rate_control_context[bit_rate_control_target].GrowInit + 
        (mfc_context->bit_rate_control_context[bit_rate_control_target].GrowResistance << 4);
    shrink = mfc_context->bit_rate_control_context[bit_rate_control_target].ShrinkInit + 
        (mfc_context->bit_rate_control_context[bit_rate_control_target].ShrinkResistance << 4);

    BEGIN_BCS_BATCH(batch, 11);;

    OUT_BCS_BATCH(batch, MFX_AVC_SLICE_STATE | (11 - 2) );
    OUT_BCS_BATCH(batch, slice_type);                   /*Slice Type: I:P:B Slice*/

    if (slice_type == SLICE_TYPE_I) {
        OUT_BCS_BATCH(batch, 0);                        /*no reference frames and pred_weight_table*/
    } else {
        OUT_BCS_BATCH(batch,
                      (1 << 16) |                       /*1 reference frame*/
                      (chroma_log2_weight_denom << 8) |
                      (luma_log2_weight_denom << 0));
    }

    OUT_BCS_BATCH(batch, 
                  (weighted_pred_idc << 30) |
                  (slice_param->direct_spatial_mv_pred_flag<<29) |             /*Direct Prediction Type*/
                  (slice_param->disable_deblocking_filter_idc << 27) |
                  (slice_param->cabac_init_idc << 24) |
                  (qp<<16) |                    /*Slice Quantization Parameter*/
                  ((slice_param->slice_beta_offset_div2 & 0xf) << 8) |
                  ((slice_param->slice_alpha_c0_offset_div2 & 0xf) << 0));
    OUT_BCS_BATCH(batch,
                  (beginy << 24) |                      /*First MB X&Y , the begin postion of current slice*/
                  (beginx << 16) |
                  slice_param->macroblock_address );
    OUT_BCS_BATCH(batch, (nexty << 16) | nextx);                       /*Next slice first MB X&Y*/
    OUT_BCS_BATCH(batch, 
                  (0/*rate_control_enable*/ << 31) |            /*in CBR mode RateControlCounterEnable = enable*/
                  (1 << 30) |           /*ResetRateControlCounter*/
                  (0 << 28) |           /*RC Triggle Mode = Always Rate Control*/
                  (4 << 24) |     /*RC Stable Tolerance, middle level*/
                  (0/*rate_control_enable*/ << 23) |     /*RC Panic Enable*/                 
                  (0 << 22) |     /*QP mode, don't modfiy CBP*/
                  (0 << 21) |     /*MB Type Direct Conversion Enabled*/ 
                  (0 << 20) |     /*MB Type Skip Conversion Enabled*/ 
                  (last_slice << 19) |     /*IsLastSlice*/
                  (0 << 18) |   /*BitstreamOutputFlag Compressed BitStream Output Disable Flag 0:enable 1:disable*/
                  (1 << 17) |       /*HeaderPresentFlag*/       
                  (1 << 16) |       /*SliceData PresentFlag*/
                  (1 << 15) |       /*TailPresentFlag*/
                  (1 << 13) |       /*RBSP NAL TYPE*/   
                  (0 << 12) );    /*CabacZeroWordInsertionEnable*/
    OUT_BCS_BATCH(batch, mfc_context->mfc_indirect_pak_bse_object.offset);
    OUT_BCS_BATCH(batch,
                  (maxQpN << 24) |     /*Target QP - 24 is lowest QP*/ 
                  (maxQpP << 16) |     /*Target QP + 20 is highest QP*/
                  (shrink << 8)  |
                  (grow << 0));   
    OUT_BCS_BATCH(batch,
                  (correct[5] << 20) |
                  (correct[4] << 16) |
                  (correct[3] << 12) |
                  (correct[2] << 8) |
                  (correct[1] << 4) |
                  (correct[0] << 0));
    OUT_BCS_BATCH(batch, 0);

    ADVANCE_BCS_BATCH(batch);
}
993
994
995 #ifdef MFC_SOFTWARE_HASWELL
996
/*
 * Emit one 12-dword MFC_AVC_PAK_OBJECT for an intra macroblock.
 *
 * msg points at the VME output record for this MB; msg[0] carries the
 * MB mode bits, msg[1..3] the intra prediction modes.  The MB type
 * field (bits 16-20 of msg[0]) is repositioned into the PAK command's
 * expected location — NOTE(review): exact destination bit layout taken
 * from the mask/shift below, confirm against the MFX PRM.
 *
 * Returns the command length in dwords so the caller can track batch usage.
 */
static int
gen75_mfc_avc_pak_object_intra(VADriverContextP ctx, int x, int y, int end_mb,
                                int qp,unsigned int *msg,
                              struct intel_encoder_context *encoder_context,
                              unsigned char target_mb_size, unsigned char max_mb_size,
                              struct intel_batchbuffer *batch)
{
    int len_in_dwords = 12;
    unsigned int intra_msg;
#define         INTRA_MSG_FLAG          (1 << 13)
#define         INTRA_MBTYPE_MASK       (0x1F0000)
    /* NULL batch means "use the encoder's default batchbuffer" */
    if (batch == NULL)
        batch = encoder_context->base.batch;

    BEGIN_BCS_BATCH(batch, len_in_dwords);

    /* keep low mode bits, mark as intra, move the MB type down 8 bits */
    intra_msg = msg[0] & 0xC0FF;
    intra_msg |= INTRA_MSG_FLAG;
    intra_msg |= ((msg[0] & INTRA_MBTYPE_MASK) >> 8);
    OUT_BCS_BATCH(batch, MFC_AVC_PAK_OBJECT | (len_in_dwords - 2));
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 0);
    OUT_BCS_BATCH(batch, 
                  (0 << 24) |           /* PackedMvNum, Debug*/
                  (0 << 20) |           /* No motion vector */
                  (1 << 19) |           /* CbpDcY */
                  (1 << 18) |           /* CbpDcU */
                  (1 << 17) |           /* CbpDcV */
                  intra_msg);

    OUT_BCS_BATCH(batch, (0xFFFF << 16) | (y << 8) | x);                /* Code Block Pattern for Y*/
    OUT_BCS_BATCH(batch, 0x000F000F);                                                   /* Code Block Pattern */                
    OUT_BCS_BATCH(batch, (0 << 27) | (end_mb << 26) | qp);      /* Last MB */

    /*Stuff for Intra MB*/
    OUT_BCS_BATCH(batch, msg[1]);                       /* We using Intra16x16 no 4x4 predmode*/        
    OUT_BCS_BATCH(batch, msg[2]);       
    OUT_BCS_BATCH(batch, msg[3]&0xFF);  
    
    /*MaxSizeInWord and TargetSzieInWord*/
    OUT_BCS_BATCH(batch, (max_mb_size << 24) |
                  (target_mb_size << 16) );

    OUT_BCS_BATCH(batch, 0);

    ADVANCE_BCS_BATCH(batch);

    return len_in_dwords;
}
1046
/*
 * Emit one 12-dword MFC_AVC_PAK_OBJECT for an inter macroblock.
 *
 * Before emitting the command, the motion vectors in the VME output
 * record (starting MSG_MV_OFFSET dwords into msg, two dwords per MV
 * pair) are rearranged in place so the first 8 dwords match the MV
 * layout the PAK command expects for the MB's partitioning mode
 * (8x16 / 16x8 / 8x8-without-subshapes).  16x16 and 8x8-with-subshapes
 * records are left untouched.
 *
 * @param offset  byte offset of this MB's MV data inside the indirect
 *                MV buffer, written into DW2 of the command
 * @param batch   target batchbuffer; NULL selects the encoder's default
 *
 * Returns the command length in dwords.
 */
static int
gen75_mfc_avc_pak_object_inter(VADriverContextP ctx, int x, int y, int end_mb, int qp,
                              unsigned int *msg, unsigned int offset,
                              struct intel_encoder_context *encoder_context,
                              unsigned char target_mb_size,unsigned char max_mb_size, int slice_type,
                              struct intel_batchbuffer *batch)
{
    int len_in_dwords = 12;
        unsigned int inter_msg = 0;
    if (batch == NULL)
        batch = encoder_context->base.batch;
    {
#define MSG_MV_OFFSET   4
        unsigned int *mv_ptr;
        mv_ptr = msg + MSG_MV_OFFSET;
        /* MV of VME output is based on 16 sub-blocks. So it is necessary
         * to convert them to be compatible with the format of AVC_PAK
         * command.
         */
        if ((msg[0] & INTER_MODE_MASK) == INTER_8X16) {
                /* MV[0] and MV[2] are replicated */
                mv_ptr[4] = mv_ptr[0];
                mv_ptr[5] = mv_ptr[1];
                mv_ptr[2] = mv_ptr[8];
                mv_ptr[3] = mv_ptr[9];
                mv_ptr[6] = mv_ptr[8]; 
                mv_ptr[7] = mv_ptr[9]; 
        } else if ((msg[0] & INTER_MODE_MASK) == INTER_16X8) {
                /* MV[0] and MV[1] are replicated */
                mv_ptr[2] = mv_ptr[0];  
                mv_ptr[3] = mv_ptr[1];
                mv_ptr[4] = mv_ptr[16]; 
                mv_ptr[5] = mv_ptr[17]; 
                mv_ptr[6] = mv_ptr[24];
                mv_ptr[7] = mv_ptr[25];
        } else if (((msg[0] & INTER_MODE_MASK) == INTER_8X8) &&
                        !(msg[1] & SUBMB_SHAPE_MASK)) {
                /* Don't touch MV[0] or MV[1] */
                mv_ptr[2] = mv_ptr[8];
                mv_ptr[3] = mv_ptr[9];
                mv_ptr[4] = mv_ptr[16];
                mv_ptr[5] = mv_ptr[17];
                mv_ptr[6] = mv_ptr[24];
                mv_ptr[7] = mv_ptr[25];
        }
    }

    BEGIN_BCS_BATCH(batch, len_in_dwords);

    OUT_BCS_BATCH(batch, MFC_AVC_PAK_OBJECT | (len_in_dwords - 2));

        /* 32 MVs by default; 128 when an 8x8 MB carries sub-partitions */
        inter_msg = 32;
        /* MV quantity */
        if ((msg[0] & INTER_MODE_MASK) == INTER_8X8) {
                if (msg[1] & SUBMB_SHAPE_MASK)
                        inter_msg = 128;
        }
    OUT_BCS_BATCH(batch, inter_msg);         /* 32 MV*/
    OUT_BCS_BATCH(batch, offset);
        /* DW3: MB mode bits from VME, CbpDc Y/U/V set, MV count flags */
        inter_msg = msg[0] & (0x1F00FFFF);
        inter_msg |= INTER_MV8;
        inter_msg |= ((1 << 19) | (1 << 18) | (1 << 17));
        if (((msg[0] & INTER_MODE_MASK) == INTER_8X8) &&
                        (msg[1] & SUBMB_SHAPE_MASK)) {
                inter_msg |= INTER_MV32;
        }

    OUT_BCS_BATCH(batch, inter_msg);

    OUT_BCS_BATCH(batch, (0xFFFF<<16) | (y << 8) | x);        /* Code Block Pattern for Y*/
    OUT_BCS_BATCH(batch, 0x000F000F);                         /* Code Block Pattern */  
#if 0 
    if ( slice_type == SLICE_TYPE_B) {
        OUT_BCS_BATCH(batch, (0xF<<28) | (end_mb << 26) | qp);  /* Last MB */
    } else {
        OUT_BCS_BATCH(batch, (end_mb << 26) | qp);      /* Last MB */
    }
#else
    OUT_BCS_BATCH(batch, (end_mb << 26) | qp);  /* Last MB */
#endif

        inter_msg = msg[1] >> 8;
    /*Stuff for Inter MB*/
    OUT_BCS_BATCH(batch, inter_msg);        
    OUT_BCS_BATCH(batch, 0x0);    
    OUT_BCS_BATCH(batch, 0x0);        

    /*MaxSizeInWord and TargetSzieInWord*/
    OUT_BCS_BATCH(batch, (max_mb_size << 24) |
                  (target_mb_size << 16) );

    OUT_BCS_BATCH(batch, 0x0);    

    ADVANCE_BCS_BATCH(batch);

    return len_in_dwords;
}
1144
1145 #define         INTRA_RDO_OFFSET        4
1146 #define         INTER_RDO_OFFSET        54
1147 #define         INTER_MSG_OFFSET        52
1148 #define         INTER_MV_OFFSET         224
1149 #define         RDO_MASK                0xFFFF
1150
1151 static void 
1152 gen75_mfc_avc_pipeline_slice_programing(VADriverContextP ctx,
1153                                        struct encode_state *encode_state,
1154                                        struct intel_encoder_context *encoder_context,
1155                                        int slice_index,
1156                                        struct intel_batchbuffer *slice_batch)
1157 {
1158     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
1159     struct gen6_vme_context *vme_context = encoder_context->vme_context;
1160     VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
1161     VAEncPictureParameterBufferH264 *pPicParameter = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
1162     VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[slice_index]->buffer; 
1163     unsigned int *msg = NULL, offset = 0;
1164     unsigned char *msg_ptr = NULL;
1165     int is_intra = pSliceParameter->slice_type == SLICE_TYPE_I;
1166     int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
1167     int height_in_mbs = (mfc_context->surface_state.height + 15) / 16;
1168     int last_slice = (pSliceParameter->macroblock_address + pSliceParameter->num_macroblocks) == (width_in_mbs * height_in_mbs);
1169     int i,x,y;
1170     int qp = pPicParameter->pic_init_qp + pSliceParameter->slice_qp_delta;
1171     unsigned int rate_control_mode = encoder_context->rate_control_mode;
1172     unsigned char *slice_header = NULL;
1173     int slice_header_length_in_bits = 0;
1174     unsigned int tail_data[] = { 0x0, 0x0 };
1175     int slice_type = pSliceParameter->slice_type;
1176
1177
1178     if (rate_control_mode == VA_RC_CBR) {
1179         qp = mfc_context->bit_rate_control_context[slice_type].QpPrimeY;
1180         pSliceParameter->slice_qp_delta = qp - pPicParameter->pic_init_qp;
1181     }
1182
1183     /* only support for 8-bit pixel bit-depth */
1184     assert(pSequenceParameter->bit_depth_luma_minus8 == 0);
1185     assert(pSequenceParameter->bit_depth_chroma_minus8 == 0);
1186     assert(pPicParameter->pic_init_qp >= 0 && pPicParameter->pic_init_qp < 52);
1187     assert(qp >= 0 && qp < 52);
1188
1189     gen75_mfc_avc_slice_state(ctx, 
1190                              pPicParameter,
1191                              pSliceParameter,
1192                              encode_state, encoder_context,
1193                              (rate_control_mode == VA_RC_CBR), qp, slice_batch);
1194
1195     if ( slice_index == 0) 
1196         intel_mfc_avc_pipeline_header_programing(ctx, encode_state, encoder_context, slice_batch);
1197
1198     slice_header_length_in_bits = build_avc_slice_header(pSequenceParameter, pPicParameter, pSliceParameter, &slice_header);
1199
1200     // slice hander
1201     mfc_context->insert_object(ctx, encoder_context,
1202                                (unsigned int *)slice_header, ALIGN(slice_header_length_in_bits, 32) >> 5, slice_header_length_in_bits & 0x1f,
1203                                5,  /* first 5 bytes are start code + nal unit type */
1204                                1, 0, 1, slice_batch);
1205
1206     dri_bo_map(vme_context->vme_output.bo , 1);
1207     msg_ptr = (unsigned char *)vme_context->vme_output.bo->virtual;
1208
1209     if (is_intra) {
1210         msg = (unsigned int *) (msg_ptr + pSliceParameter->macroblock_address * vme_context->vme_output.size_block);
1211     } else {
1212         msg = (unsigned int *) (msg_ptr + pSliceParameter->macroblock_address * vme_context->vme_output.size_block);
1213     }
1214    
1215     for (i = pSliceParameter->macroblock_address; 
1216          i < pSliceParameter->macroblock_address + pSliceParameter->num_macroblocks; i++) {
1217         int last_mb = (i == (pSliceParameter->macroblock_address + pSliceParameter->num_macroblocks - 1) );
1218         x = i % width_in_mbs;
1219         y = i / width_in_mbs;
1220         msg = (unsigned int *) (msg_ptr + i * vme_context->vme_output.size_block);
1221
1222         if (is_intra) {
1223             assert(msg);
1224             gen75_mfc_avc_pak_object_intra(ctx, x, y, last_mb, qp, msg, encoder_context, 0, 0, slice_batch);
1225         } else {
1226             int inter_rdo, intra_rdo;
1227             inter_rdo = msg[INTER_RDO_OFFSET] & RDO_MASK;
1228             intra_rdo = msg[INTRA_RDO_OFFSET] & RDO_MASK;
1229             offset = i * vme_context->vme_output.size_block + INTER_MV_OFFSET;
1230             if (intra_rdo < inter_rdo) { 
1231                 gen75_mfc_avc_pak_object_intra(ctx, x, y, last_mb, qp, msg, encoder_context, 0, 0, slice_batch);
1232             } else {
1233                 msg += INTER_MSG_OFFSET;
1234                 gen75_mfc_avc_pak_object_inter(ctx, x, y, last_mb, qp, msg, offset, encoder_context, 0, 0, pSliceParameter->slice_type, slice_batch);
1235             }
1236         }
1237     }
1238    
1239     dri_bo_unmap(vme_context->vme_output.bo);
1240
1241     if ( last_slice ) {    
1242         mfc_context->insert_object(ctx, encoder_context,
1243                                    tail_data, 2, 8,
1244                                    2, 1, 1, 0, slice_batch);
1245     } else {
1246         mfc_context->insert_object(ctx, encoder_context,
1247                                    tail_data, 1, 8,
1248                                    1, 1, 1, 0, slice_batch);
1249     }
1250
1251     free(slice_header);
1252
1253 }
1254
1255 static dri_bo *
1256 gen75_mfc_avc_software_batchbuffer(VADriverContextP ctx,
1257                                   struct encode_state *encode_state,
1258                                   struct intel_encoder_context *encoder_context)
1259 {
1260     struct i965_driver_data *i965 = i965_driver_data(ctx);
1261     struct intel_batchbuffer *batch;
1262     dri_bo *batch_bo;
1263     int i;
1264     int buffer_size;
1265     VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
1266     int width_in_mbs = pSequenceParameter->picture_width_in_mbs;
1267     int height_in_mbs = pSequenceParameter->picture_height_in_mbs;
1268
1269     buffer_size = width_in_mbs * height_in_mbs * 64;
1270     batch = intel_batchbuffer_new(&i965->intel, I915_EXEC_BSD, buffer_size);
1271     batch_bo = batch->buffer;
1272     for (i = 0; i < encode_state->num_slice_params_ext; i++) {
1273         gen75_mfc_avc_pipeline_slice_programing(ctx, encode_state, encoder_context, i, batch);
1274     }
1275
1276     intel_batchbuffer_align(batch, 8);
1277     
1278     BEGIN_BCS_BATCH(batch, 2);
1279     OUT_BCS_BATCH(batch, 0);
1280     OUT_BCS_BATCH(batch, MI_BATCH_BUFFER_END);
1281     ADVANCE_BCS_BATCH(batch);
1282
1283     dri_bo_reference(batch_bo);
1284     intel_batchbuffer_free(batch);
1285
1286     return batch_bo;
1287 }
1288
1289 #else
1290
1291 static void
1292 gen75_mfc_batchbuffer_surfaces_input(VADriverContextP ctx,
1293                                     struct encode_state *encode_state,
1294                                     struct intel_encoder_context *encoder_context)
1295
1296 {
1297     struct gen6_vme_context *vme_context = encoder_context->vme_context;
1298     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
1299
1300     assert(vme_context->vme_output.bo);
1301     mfc_context->buffer_suface_setup(ctx,
1302                                      &mfc_context->gpe_context,
1303                                      &vme_context->vme_output,
1304                                      BINDING_TABLE_OFFSET(BIND_IDX_VME_OUTPUT),
1305                                      SURFACE_STATE_OFFSET(BIND_IDX_VME_OUTPUT));
1306     assert(mfc_context->aux_batchbuffer_surface.bo);
1307     mfc_context->buffer_suface_setup(ctx,
1308                                      &mfc_context->gpe_context,
1309                                      &mfc_context->aux_batchbuffer_surface,
1310                                      BINDING_TABLE_OFFSET(BIND_IDX_MFC_SLICE_HEADER),
1311                                      SURFACE_STATE_OFFSET(BIND_IDX_MFC_SLICE_HEADER));
1312 }
1313
1314 static void
1315 gen75_mfc_batchbuffer_surfaces_output(VADriverContextP ctx,
1316                                      struct encode_state *encode_state,
1317                                      struct intel_encoder_context *encoder_context)
1318
1319 {
1320     struct i965_driver_data *i965 = i965_driver_data(ctx);
1321     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
1322     VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
1323     int width_in_mbs = pSequenceParameter->picture_width_in_mbs;
1324     int height_in_mbs = pSequenceParameter->picture_height_in_mbs;
1325     mfc_context->mfc_batchbuffer_surface.num_blocks = width_in_mbs * height_in_mbs + encode_state->num_slice_params_ext * 8 + 1;
1326     mfc_context->mfc_batchbuffer_surface.size_block = 16 * CMD_LEN_IN_OWORD; /* 3 OWORDs */
1327     mfc_context->mfc_batchbuffer_surface.pitch = 16;
1328     mfc_context->mfc_batchbuffer_surface.bo = dri_bo_alloc(i965->intel.bufmgr, 
1329                                                            "MFC batchbuffer",
1330                                                            mfc_context->mfc_batchbuffer_surface.num_blocks * mfc_context->mfc_batchbuffer_surface.size_block,
1331                                                            0x1000);
1332     mfc_context->buffer_suface_setup(ctx,
1333                                      &mfc_context->gpe_context,
1334                                      &mfc_context->mfc_batchbuffer_surface,
1335                                      BINDING_TABLE_OFFSET(BIND_IDX_MFC_BATCHBUFFER),
1336                                      SURFACE_STATE_OFFSET(BIND_IDX_MFC_BATCHBUFFER));
1337 }
1338
/* Bind all surfaces (VME input side + batchbuffer output side) needed by
 * the MFC batchbuffer generation kernel. */
static void
gen75_mfc_batchbuffer_surfaces_setup(VADriverContextP ctx, 
                                    struct encode_state *encode_state,
                                    struct intel_encoder_context *encoder_context)
{
    gen75_mfc_batchbuffer_surfaces_input(ctx, encode_state, encoder_context);
    gen75_mfc_batchbuffer_surfaces_output(ctx, encode_state, encoder_context);
}
1347
/*
 * Fill the interface descriptor remap table (IDRT) with one 32-byte
 * descriptor per kernel of the MFC GPE context, and emit a relocation
 * so each descriptor's kernel start pointer tracks the kernel bo.
 */
static void
gen75_mfc_batchbuffer_idrt_setup(VADriverContextP ctx, 
                                struct encode_state *encode_state,
                                struct intel_encoder_context *encoder_context)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    struct gen6_interface_descriptor_data *desc;   
    int i;
    dri_bo *bo;

    bo = mfc_context->gpe_context.idrt.bo;
    dri_bo_map(bo, 1);
    assert(bo->virtual);
    desc = bo->virtual;

    for (i = 0; i < mfc_context->gpe_context.num_kernels; i++) {
        struct i965_kernel *kernel;

        kernel = &mfc_context->gpe_context.kernels[i];
        assert(sizeof(*desc) == 32);

        /*Setup the descritor table*/
        memset(desc, 0, sizeof(*desc));
        desc->desc0.kernel_start_pointer = (kernel->bo->offset >> 6);
        desc->desc2.sampler_count = 0;
        desc->desc2.sampler_state_pointer = 0;
        desc->desc3.binding_table_entry_count = 2;
        desc->desc3.binding_table_pointer = (BINDING_TABLE_OFFSET(0) >> 5);
        desc->desc4.constant_urb_entry_read_offset = 0;
        desc->desc4.constant_urb_entry_read_length = 4;
                
        /* patch the kernel start pointer when the kernel bo is relocated */
        dri_bo_emit_reloc(bo,   
                          I915_GEM_DOMAIN_INSTRUCTION, 0,
                          0,
                          i * sizeof(*desc) + offsetof(struct gen6_interface_descriptor_data, desc0),
                          kernel->bo);
        desc++;
    }

    dri_bo_unmap(bo);
}
1390
1391 static void
1392 gen75_mfc_batchbuffer_constant_setup(VADriverContextP ctx, 
1393                                     struct encode_state *encode_state,
1394                                     struct intel_encoder_context *encoder_context)
1395 {
1396     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
1397     
1398     (void)mfc_context;
1399 }
1400
/*
 * Emit one CMD_MEDIA_OBJECT (12 dwords) that dispatches the MFC
 * batchbuffer kernel for a group of macroblocks.  The six inline-data
 * dwords pack, in order: the slice-header offset, the destination
 * batchbuffer offset, head/tail sizes, the MB-command count plus
 * first/last-object and last-slice flags, the starting MB coordinates,
 * and the QP together with the picture width in MBs.
 */
static void
gen75_mfc_batchbuffer_emit_object_command(struct intel_batchbuffer *batch,
                                         int index,
                                         int head_offset,
                                         int batchbuffer_offset,
                                         int head_size,
                                         int tail_size,
                                         int number_mb_cmds,
                                         int first_object,
                                         int last_object,
                                         int last_slice,
                                         int mb_x,
                                         int mb_y,
                                         int width_in_mbs,
                                         int qp)
{
    BEGIN_BATCH(batch, 12);
    
    OUT_BATCH(batch, CMD_MEDIA_OBJECT | (12 - 2));
    OUT_BATCH(batch, index);        /* interface descriptor index (kernel select) */
    OUT_BATCH(batch, 0);
    OUT_BATCH(batch, 0);
    OUT_BATCH(batch, 0);
    OUT_BATCH(batch, 0);
   
    /*inline data */
    OUT_BATCH(batch, head_offset);
    OUT_BATCH(batch, batchbuffer_offset);
    OUT_BATCH(batch, 
              head_size << 16 |
              tail_size);
    OUT_BATCH(batch,
              number_mb_cmds << 16 |
              first_object << 2 |
              last_object << 1 |
              last_slice);
    OUT_BATCH(batch,
              mb_y << 8 |
              mb_x);
    OUT_BATCH(batch,
              qp << 16 |
              width_in_mbs);

    ADVANCE_BATCH(batch);
}
1446
/*
 * Split one slice into MEDIA_OBJECT commands of at most 128 macroblocks
 * each and emit them.  The first chunk carries the slice header
 * (head_size), the last chunk carries the slice tail (tail_size); the
 * output offset advances by the MB command payload after every chunk.
 */
static void
gen75_mfc_avc_batchbuffer_slice_command(VADriverContextP ctx,
                                       struct intel_encoder_context *encoder_context,
                                       VAEncSliceParameterBufferH264 *slice_param,
                                       int head_offset,
                                       unsigned short head_size,
                                       unsigned short tail_size,
                                       int batchbuffer_offset,
                                       int qp,
                                       int last_slice)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
    int total_mbs = slice_param->num_macroblocks;
    int number_mb_cmds = 128;   /* MBs handled per MEDIA_OBJECT */
    int starting_mb = 0;
    int last_object = 0;
    int first_object = 1;
    int i;
    int mb_x, mb_y;
    /* pick the intra or inter batchbuffer kernel by slice type */
    int index = (slice_param->slice_type == SLICE_TYPE_I) ? MFC_BATCHBUFFER_AVC_INTRA : MFC_BATCHBUFFER_AVC_INTER;

    /* full 128-MB chunks */
    for (i = 0; i < total_mbs / number_mb_cmds; i++) {
        /* this chunk is also the last one iff the slice length is an
         * exact multiple of 128 and we are on the final iteration */
        last_object = (total_mbs - starting_mb) == number_mb_cmds;
        mb_x = (slice_param->macroblock_address + starting_mb) % width_in_mbs;
        mb_y = (slice_param->macroblock_address + starting_mb) / width_in_mbs;
        assert(mb_x <= 255 && mb_y <= 255);

        starting_mb += number_mb_cmds;

        gen75_mfc_batchbuffer_emit_object_command(batch,
                                                 index,
                                                 head_offset,
                                                 batchbuffer_offset,
                                                 head_size,
                                                 tail_size,
                                                 number_mb_cmds,
                                                 first_object,
                                                 last_object,
                                                 last_slice,
                                                 mb_x,
                                                 mb_y,
                                                 width_in_mbs,
                                                 qp);

        /* the first chunk emits the slice header: skip past it */
        if (first_object) {
            head_offset += head_size;
            batchbuffer_offset += head_size;
        }

        /* the last chunk emits the slice tail: skip past it too */
        if (last_object) {
            head_offset += tail_size;
            batchbuffer_offset += tail_size;
        }

        /* account for the per-MB PAK commands written by the kernel */
        batchbuffer_offset += number_mb_cmds * CMD_LEN_IN_OWORD;

        first_object = 0;
    }

    /* remainder chunk (< 128 MBs), if the loop above did not already
     * emit the final object */
    if (!last_object) {
        last_object = 1;
        number_mb_cmds = total_mbs % number_mb_cmds;
        mb_x = (slice_param->macroblock_address + starting_mb) % width_in_mbs;
        mb_y = (slice_param->macroblock_address + starting_mb) / width_in_mbs;
        assert(mb_x <= 255 && mb_y <= 255);
        starting_mb += number_mb_cmds;

        gen75_mfc_batchbuffer_emit_object_command(batch,
                                                 index,
                                                 head_offset,
                                                 batchbuffer_offset,
                                                 head_size,
                                                 tail_size,
                                                 number_mb_cmds,
                                                 first_object,
                                                 last_object,
                                                 last_slice,
                                                 mb_x,
                                                 mb_y,
                                                 width_in_mbs,
                                                 qp);
    }
}
1532                           
1533 /*
1534  * return size in Owords (16bytes)
1535  */         
static int
gen75_mfc_avc_batchbuffer_slice(VADriverContextP ctx,
                               struct encode_state *encode_state,
                               struct intel_encoder_context *encoder_context,
                               int slice_index,
                               int batchbuffer_offset)
{
    /* Build the CPU-side portion of one slice (slice state, headers,
     * tail padding) in the aux batchbuffer, then queue the MEDIA_OBJECTs
     * that make the hardware generate the per-MB commands between them.
     * Returns the slice's total size in OWords. */
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    struct intel_batchbuffer *slice_batch = mfc_context->aux_batchbuffer;
    VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
    VAEncPictureParameterBufferH264 *pPicParameter = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
    VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[slice_index]->buffer; 
    int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
    int height_in_mbs = (mfc_context->surface_state.height + 15) / 16;
    /* a slice is the last one when it ends exactly at the frame's MB count */
    int last_slice = (pSliceParameter->macroblock_address + pSliceParameter->num_macroblocks) == (width_in_mbs * height_in_mbs);
    int qp = pPicParameter->pic_init_qp + pSliceParameter->slice_qp_delta;
    unsigned int rate_control_mode = encoder_context->rate_control_mode;
    unsigned char *slice_header = NULL;
    int slice_header_length_in_bits = 0;
    unsigned int tail_data[] = { 0x0, 0x0 };
    long head_offset;
    int old_used = intel_batchbuffer_used_size(slice_batch), used;
    unsigned short head_size, tail_size;
    int slice_type = pSliceParameter->slice_type;

    if (rate_control_mode == VA_RC_CBR) {
        /* CBR: override the QP with the BRC's choice and keep the slice
         * header's qp_delta consistent with it */
        qp = mfc_context->bit_rate_control_context[slice_type].QpPrimeY;
        pSliceParameter->slice_qp_delta = qp - pPicParameter->pic_init_qp;
    }

    /* only support for 8-bit pixel bit-depth */
    assert(pSequenceParameter->bit_depth_luma_minus8 == 0);
    assert(pSequenceParameter->bit_depth_chroma_minus8 == 0);
    assert(pPicParameter->pic_init_qp >= 0 && pPicParameter->pic_init_qp < 52);
    assert(qp >= 0 && qp < 52);

    head_offset = old_used / 16;    /* OWord offset of this slice's head */
    gen75_mfc_avc_slice_state(ctx,
                             pPicParameter,
                             pSliceParameter,
                             encode_state,
                             encoder_context,
                             (rate_control_mode == VA_RC_CBR),
                             qp,
                             slice_batch);

    /* SPS/PPS and other stream headers go in front of the first slice only */
    if (slice_index == 0)
        intel_mfc_avc_pipeline_header_programing(ctx, encode_state, encoder_context, slice_batch);

    slice_header_length_in_bits = build_avc_slice_header(pSequenceParameter, pPicParameter, pSliceParameter, &slice_header);

    // slice header
    mfc_context->insert_object(ctx,
                               encoder_context,
                               (unsigned int *)slice_header,
                               ALIGN(slice_header_length_in_bits, 32) >> 5,
                               slice_header_length_in_bits & 0x1f,
                               5,  /* first 5 bytes are start code + nal unit type */
                               1,
                               0,
                               1,
                               slice_batch);
    free(slice_header);     /* build_avc_slice_header allocates it */

    intel_batchbuffer_align(slice_batch, 16); /* aligned by an Oword */
    used = intel_batchbuffer_used_size(slice_batch);
    head_size = (used - old_used) / 16;
    old_used = used;

    /* tail */
    if (last_slice) {    
        /* end of stream: insert trailing-bits/EOS tail data */
        mfc_context->insert_object(ctx,
                                   encoder_context,
                                   tail_data,
                                   2,
                                   8,
                                   2,
                                   1,
                                   1,
                                   0,
                                   slice_batch);
    } else {
        /* end of slice only */
        mfc_context->insert_object(ctx,
                                   encoder_context,
                                   tail_data,
                                   1,
                                   8,
                                   1,
                                   1,
                                   1,
                                   0,
                                   slice_batch);
    }

    intel_batchbuffer_align(slice_batch, 16); /* aligned by an Oword */
    used = intel_batchbuffer_used_size(slice_batch);
    tail_size = (used - old_used) / 16;

   
    gen75_mfc_avc_batchbuffer_slice_command(ctx,
                                           encoder_context,
                                           pSliceParameter,
                                           head_offset,
                                           head_size,
                                           tail_size,
                                           batchbuffer_offset,
                                           qp,
                                           last_slice);

    /* head + tail + hardware-generated per-MB commands, in OWords */
    return head_size + tail_size + pSliceParameter->num_macroblocks * CMD_LEN_IN_OWORD;
}
1647
1648 static void
1649 gen75_mfc_avc_batchbuffer_pipeline(VADriverContextP ctx,
1650                                   struct encode_state *encode_state,
1651                                   struct intel_encoder_context *encoder_context)
1652 {
1653     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
1654     struct intel_batchbuffer *batch = encoder_context->base.batch;
1655     int i, size, offset = 0;
1656     intel_batchbuffer_start_atomic(batch, 0x4000); 
1657     gen6_gpe_pipeline_setup(ctx, &mfc_context->gpe_context, batch);
1658
1659     for ( i = 0; i < encode_state->num_slice_params_ext; i++) {
1660         size = gen75_mfc_avc_batchbuffer_slice(ctx, encode_state, encoder_context, i, offset);
1661         offset += size;
1662     }
1663
1664     intel_batchbuffer_end_atomic(batch);
1665     intel_batchbuffer_flush(batch);
1666 }
1667
/*
 * Build the MFC slice batchbuffer with GPE kernels: bind the surfaces,
 * fill the interface descriptor table, set up constants (no-op here),
 * then run the media pipeline that generates the PAK commands.
 */
static void
gen75_mfc_build_avc_batchbuffer(VADriverContextP ctx, 
                               struct encode_state *encode_state,
                               struct intel_encoder_context *encoder_context)
{
    gen75_mfc_batchbuffer_surfaces_setup(ctx, encode_state, encoder_context);
    gen75_mfc_batchbuffer_idrt_setup(ctx, encode_state, encoder_context);
    gen75_mfc_batchbuffer_constant_setup(ctx, encode_state, encoder_context);
    gen75_mfc_avc_batchbuffer_pipeline(ctx, encode_state, encoder_context);
}
1678
1679 static dri_bo *
1680 gen75_mfc_avc_hardware_batchbuffer(VADriverContextP ctx,
1681                                   struct encode_state *encode_state,
1682                                   struct intel_encoder_context *encoder_context)
1683 {
1684     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
1685
1686     gen75_mfc_build_avc_batchbuffer(ctx, encode_state, encoder_context);
1687     dri_bo_reference(mfc_context->mfc_batchbuffer_surface.bo);
1688
1689     return mfc_context->mfc_batchbuffer_surface.bo;
1690 }
1691
1692 #endif
1693
/*
 * Program the full MFC PAK pipeline for one frame on the BCS ring:
 * picture-level state first, then a jump into the slice batchbuffer
 * (built either in software or by GPU kernels, depending on
 * MFC_SOFTWARE_HASWELL).
 */
static void
gen75_mfc_avc_pipeline_programing(VADriverContextP ctx,
                                 struct encode_state *encode_state,
                                 struct intel_encoder_context *encoder_context)
{
    struct intel_batchbuffer *batch = encoder_context->base.batch;
    dri_bo *slice_batch_bo;

    /* interlaced content is not supported by this encoder */
    if ( intel_mfc_interlace_check(ctx, encode_state, encoder_context) ) {
        fprintf(stderr, "Current VA driver don't support interlace mode!\n");
        assert(0);
        return; 
    }

#ifdef MFC_SOFTWARE_HASWELL
    slice_batch_bo = gen75_mfc_avc_software_batchbuffer(ctx, encode_state, encoder_context);
#else
    slice_batch_bo = gen75_mfc_avc_hardware_batchbuffer(ctx, encode_state, encoder_context);
#endif

    // begin programing
    intel_batchbuffer_start_atomic_bcs(batch, 0x4000); 
    intel_batchbuffer_emit_mi_flush(batch);
    
    // picture level programing
    gen75_mfc_avc_pipeline_picture_programing(ctx, encode_state, encoder_context);

    /* chain into the slice batchbuffer; bit 8 selects the batch buffer
     * address space (NOTE(review): presumably the PPGTT/second-level
     * selector — confirm against the Intel PRM for MI_BATCH_BUFFER_START) */
    BEGIN_BCS_BATCH(batch, 2);
    OUT_BCS_BATCH(batch, MI_BATCH_BUFFER_START | (1 << 8));
    OUT_BCS_RELOC(batch,
                  slice_batch_bo,
                  I915_GEM_DOMAIN_COMMAND, 0, 
                  0);
    ADVANCE_BCS_BATCH(batch);

    // end programing
    intel_batchbuffer_end_atomic(batch);

    /* drop the reference returned by the batchbuffer builder */
    dri_bo_unreference(slice_batch_bo);
}
1734
1735
/*
 * Top-level AVC encode of one picture.
 *
 * CQP mode runs the pipeline exactly once.  CBR mode re-encodes in a
 * loop: after each PAK pass the produced bit count is handed to the BRC
 * (intel_mfc_brc_postpack); encoding retries until the HRD model is
 * satisfied, or gives up when the violation cannot be repaired even at
 * the QP limits.
 */
static VAStatus
gen75_mfc_avc_encode_picture(VADriverContextP ctx, 
                            struct encode_state *encode_state,
                            struct intel_encoder_context *encoder_context)
{
    struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
    unsigned int rate_control_mode = encoder_context->rate_control_mode;
    int current_frame_bits_size;
    int sts;
 
    for (;;) {
        gen75_mfc_init(ctx, encode_state, encoder_context);
        intel_mfc_avc_prepare(ctx, encode_state, encoder_context);
        /*Programing bcs pipeline*/
        gen75_mfc_avc_pipeline_programing(ctx, encode_state, encoder_context);  //filling the pipeline
        gen75_mfc_run(ctx, encode_state, encoder_context);
        if (rate_control_mode == VA_RC_CBR /*|| rate_control_mode == VA_RC_VBR*/) {
            /* wait for PAK completion and read back the coded size */
            gen75_mfc_stop(ctx, encode_state, encoder_context, &current_frame_bits_size);
            sts = intel_mfc_brc_postpack(encode_state, mfc_context, current_frame_bits_size);
            if (sts == BRC_NO_HRD_VIOLATION) {
                intel_mfc_hrd_context_update(encode_state, mfc_context);
                break;
            }
            else if (sts == BRC_OVERFLOW_WITH_MIN_QP || sts == BRC_UNDERFLOW_WITH_MAX_QP) {
                /* QP already at its limit; warn once and accept the frame */
                if (!mfc_context->hrd.violation_noted) {
                    fprintf(stderr, "Unrepairable %s!\n", (sts == BRC_OVERFLOW_WITH_MIN_QP)? "overflow": "underflow");
                    mfc_context->hrd.violation_noted = 1;
                }
                return VA_STATUS_SUCCESS;
            }
            /* otherwise: BRC adjusted QP, loop and re-encode the frame */
        } else {
            break;
        }
    }

    return VA_STATUS_SUCCESS;
}
1773
1774
1775 static void
1776 gen75_mfc_context_destroy(void *context)
1777 {
1778     struct gen6_mfc_context *mfc_context = context;
1779     int i;
1780
1781     dri_bo_unreference(mfc_context->post_deblocking_output.bo);
1782     mfc_context->post_deblocking_output.bo = NULL;
1783
1784     dri_bo_unreference(mfc_context->pre_deblocking_output.bo);
1785     mfc_context->pre_deblocking_output.bo = NULL;
1786
1787     dri_bo_unreference(mfc_context->uncompressed_picture_source.bo);
1788     mfc_context->uncompressed_picture_source.bo = NULL;
1789
1790     dri_bo_unreference(mfc_context->mfc_indirect_pak_bse_object.bo); 
1791     mfc_context->mfc_indirect_pak_bse_object.bo = NULL;
1792
1793     for (i = 0; i < NUM_MFC_DMV_BUFFERS; i++){
1794         dri_bo_unreference(mfc_context->direct_mv_buffers[i].bo);
1795         mfc_context->direct_mv_buffers[i].bo = NULL;
1796     }
1797
1798     dri_bo_unreference(mfc_context->intra_row_store_scratch_buffer.bo);
1799     mfc_context->intra_row_store_scratch_buffer.bo = NULL;
1800
1801     dri_bo_unreference(mfc_context->macroblock_status_buffer.bo);
1802     mfc_context->macroblock_status_buffer.bo = NULL;
1803
1804     dri_bo_unreference(mfc_context->deblocking_filter_row_store_scratch_buffer.bo);
1805     mfc_context->deblocking_filter_row_store_scratch_buffer.bo = NULL;
1806
1807     dri_bo_unreference(mfc_context->bsd_mpc_row_store_scratch_buffer.bo);
1808     mfc_context->bsd_mpc_row_store_scratch_buffer.bo = NULL;
1809
1810
1811     for (i = 0; i < MAX_MFC_REFERENCE_SURFACES; i++){
1812         dri_bo_unreference(mfc_context->reference_surfaces[i].bo);
1813         mfc_context->reference_surfaces[i].bo = NULL;  
1814     }
1815
1816     i965_gpe_context_destroy(&mfc_context->gpe_context);
1817
1818     dri_bo_unreference(mfc_context->mfc_batchbuffer_surface.bo);
1819     mfc_context->mfc_batchbuffer_surface.bo = NULL;
1820
1821     dri_bo_unreference(mfc_context->aux_batchbuffer_surface.bo);
1822     mfc_context->aux_batchbuffer_surface.bo = NULL;
1823
1824     if (mfc_context->aux_batchbuffer)
1825         intel_batchbuffer_free(mfc_context->aux_batchbuffer);
1826
1827     mfc_context->aux_batchbuffer = NULL;
1828
1829     free(mfc_context);
1830 }
1831
1832 static VAStatus gen75_mfc_pipeline(VADriverContextP ctx,
1833                   VAProfile profile,
1834                   struct encode_state *encode_state,
1835                   struct intel_encoder_context *encoder_context)
1836 {
1837     VAStatus vaStatus;
1838
1839     switch (profile) {
1840     case VAProfileH264Baseline:
1841     case VAProfileH264Main:
1842     case VAProfileH264High:
1843         vaStatus = gen75_mfc_avc_encode_picture(ctx, encode_state, encoder_context);
1844         break;
1845
1846         /* FIXME: add for other profile */
1847     default:
1848         vaStatus = VA_STATUS_ERROR_UNSUPPORTED_PROFILE;
1849         break;
1850     }
1851
1852     return vaStatus;
1853 }
1854
1855 Bool gen75_mfc_context_init(VADriverContextP ctx, struct intel_encoder_context *encoder_context)
1856 {
1857     struct gen6_mfc_context *mfc_context = calloc(1, sizeof(struct gen6_mfc_context));
1858
1859     mfc_context->gpe_context.surface_state_binding_table.length = (SURFACE_STATE_PADDED_SIZE + sizeof(unsigned int)) * MAX_MEDIA_SURFACES_GEN6;
1860
1861     mfc_context->gpe_context.idrt.max_entries = MAX_GPE_KERNELS;
1862     mfc_context->gpe_context.idrt.entry_size = sizeof(struct gen6_interface_descriptor_data);
1863
1864     mfc_context->gpe_context.curbe.length = 32 * 4;
1865
1866     mfc_context->gpe_context.vfe_state.max_num_threads = 60 - 1;
1867     mfc_context->gpe_context.vfe_state.num_urb_entries = 16;
1868     mfc_context->gpe_context.vfe_state.gpgpu_mode = 0;
1869     mfc_context->gpe_context.vfe_state.urb_entry_size = 59 - 1;
1870     mfc_context->gpe_context.vfe_state.curbe_allocation_size = 37 - 1;
1871
1872     i965_gpe_load_kernels(ctx,
1873                           &mfc_context->gpe_context,
1874                           gen75_mfc_kernels,
1875                           NUM_MFC_KERNEL);
1876
1877     mfc_context->pipe_mode_select = gen75_mfc_pipe_mode_select;
1878     mfc_context->set_surface_state = gen75_mfc_surface_state;
1879     mfc_context->ind_obj_base_addr_state = gen75_mfc_ind_obj_base_addr_state;
1880     mfc_context->avc_img_state = gen75_mfc_avc_img_state;
1881     mfc_context->avc_qm_state = gen75_mfc_avc_qm_state;
1882     mfc_context->avc_fqm_state = gen75_mfc_avc_fqm_state;
1883     mfc_context->insert_object = gen75_mfc_avc_insert_object;
1884     mfc_context->buffer_suface_setup = gen7_gpe_buffer_suface_setup;
1885
1886     encoder_context->mfc_context = mfc_context;
1887     encoder_context->mfc_context_destroy = gen75_mfc_context_destroy;
1888     encoder_context->mfc_pipeline = gen75_mfc_pipeline;
1889     encoder_context->mfc_brc_prepare = intel_mfc_brc_prepare;
1890
1891     return True;
1892 }