Use the common scoreboard code on Ivy/Haswell to remove the duplicated code
[profile/ivi/vaapi-intel-driver.git] / src / gen6_mfc_common.c
1 /*
2  * Copyright © 2012 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the
6  * "Software"), to deal in the Software without restriction, including
7  * without limitation the rights to use, copy, modify, merge, publish,
8  * distribute, sub license, and/or sell copies of the Software, and to
9  * permit persons to whom the Software is furnished to do so, subject to
10  * the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the
13  * next paragraph) shall be included in all copies or substantial portions
14  * of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
19  * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
20  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors:
25  *    Xiang Haihao <haihao.xiang@intel.com>
26  *    Zhao Yakui <yakui.zhao@intel.com>
27  *
28  */
29
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <assert.h>
34 #include <math.h>
35
36 #include "intel_batchbuffer.h"
37 #include "i965_defines.h"
38 #include "i965_structs.h"
39 #include "i965_drv_video.h"
40 #include "i965_encoder.h"
41 #include "i965_encoder_utils.h"
42 #include "gen6_mfc.h"
43 #include "gen6_vme.h"
44 #include "intel_media.h"
45
/*
 * Clamp the lvalue x into the closed range [min, max], in place.
 *
 * Wrapped in do { } while (0) so that the macro behaves as a single
 * statement (e.g. as the body of an if/else without braces).
 * Note: x, min and max may each be evaluated more than once, so do not
 * pass expressions with side effects.
 */
#define BRC_CLIP(x, min, max) \
do { \
    (x) = (((x) > (max)) ? (max) : (((x) < (min)) ? (min) : (x))); \
} while (0)

/* QP offsets kept between the different slice types by the rate control */
#define BRC_P_B_QP_DIFF 4
#define BRC_I_P_QP_DIFF 2
#define BRC_I_B_QP_DIFF (BRC_I_P_QP_DIFF + BRC_P_B_QP_DIFF)

#define BRC_PWEIGHT 0.6  /* target size of a P frame relative to an I frame */
#define BRC_BWEIGHT 0.25 /* target size of a B frame relative to an I frame */

#define BRC_QP_MAX_CHANGE 5 /* maximum qp modification per update */
/*
 * NOTE(review): the three tuning constants below are not referenced by the
 * code visible around here; their original comment was left unfinished
 * ("weight for"). Confirm their intent before relying on them.
 */
#define BRC_CY 0.1
#define BRC_CX_UNDERFLOW 5.
#define BRC_CX_OVERFLOW -4.

#define BRC_PI_0_5 1.5707963267948966192313216916398 /* pi / 2 */
64
65 static void
66 intel_mfc_bit_rate_control_context_init(struct encode_state *encode_state, 
67                                        struct gen6_mfc_context *mfc_context)
68 {
69     VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
70     int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
71     int height_in_mbs = (mfc_context->surface_state.height + 15) / 16;
72     float fps =  pSequenceParameter->time_scale * 0.5 / pSequenceParameter->num_units_in_tick ;
73     int inter_mb_size = pSequenceParameter->bits_per_second * 1.0 / (fps+4.0) / width_in_mbs / height_in_mbs;
74     int intra_mb_size = inter_mb_size * 5.0;
75     int i;
76
77     mfc_context->bit_rate_control_context[SLICE_TYPE_I].target_mb_size = intra_mb_size;
78     mfc_context->bit_rate_control_context[SLICE_TYPE_I].target_frame_size = intra_mb_size * width_in_mbs * height_in_mbs;
79     mfc_context->bit_rate_control_context[SLICE_TYPE_P].target_mb_size = inter_mb_size;
80     mfc_context->bit_rate_control_context[SLICE_TYPE_P].target_frame_size = inter_mb_size * width_in_mbs * height_in_mbs;
81     mfc_context->bit_rate_control_context[SLICE_TYPE_B].target_mb_size = inter_mb_size;
82     mfc_context->bit_rate_control_context[SLICE_TYPE_B].target_frame_size = inter_mb_size * width_in_mbs * height_in_mbs;
83
84     for(i = 0 ; i < 3; i++) {
85         mfc_context->bit_rate_control_context[i].QpPrimeY = 26;
86         mfc_context->bit_rate_control_context[i].MaxQpNegModifier = 6;
87         mfc_context->bit_rate_control_context[i].MaxQpPosModifier = 6;
88         mfc_context->bit_rate_control_context[i].GrowInit = 6;
89         mfc_context->bit_rate_control_context[i].GrowResistance = 4;
90         mfc_context->bit_rate_control_context[i].ShrinkInit = 6;
91         mfc_context->bit_rate_control_context[i].ShrinkResistance = 4;
92         
93         mfc_context->bit_rate_control_context[i].Correct[0] = 8;
94         mfc_context->bit_rate_control_context[i].Correct[1] = 4;
95         mfc_context->bit_rate_control_context[i].Correct[2] = 2;
96         mfc_context->bit_rate_control_context[i].Correct[3] = 2;
97         mfc_context->bit_rate_control_context[i].Correct[4] = 4;
98         mfc_context->bit_rate_control_context[i].Correct[5] = 8;
99     }
100     
101     mfc_context->bit_rate_control_context[SLICE_TYPE_I].TargetSizeInWord = (intra_mb_size + 16)/ 16;
102     mfc_context->bit_rate_control_context[SLICE_TYPE_P].TargetSizeInWord = (inter_mb_size + 16)/ 16;
103     mfc_context->bit_rate_control_context[SLICE_TYPE_B].TargetSizeInWord = (inter_mb_size + 16)/ 16;
104
105     mfc_context->bit_rate_control_context[SLICE_TYPE_I].MaxSizeInWord = mfc_context->bit_rate_control_context[SLICE_TYPE_I].TargetSizeInWord * 1.5;
106     mfc_context->bit_rate_control_context[SLICE_TYPE_P].MaxSizeInWord = mfc_context->bit_rate_control_context[SLICE_TYPE_P].TargetSizeInWord * 1.5;
107     mfc_context->bit_rate_control_context[SLICE_TYPE_B].MaxSizeInWord = mfc_context->bit_rate_control_context[SLICE_TYPE_B].TargetSizeInWord * 1.5;
108 }
109
110 static void intel_mfc_brc_init(struct encode_state *encode_state,
111                   struct intel_encoder_context* encoder_context)
112 {
113     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
114     VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
115     VAEncMiscParameterBuffer* pMiscParamHRD = (VAEncMiscParameterBuffer*)encode_state->misc_param[VAEncMiscParameterTypeHRD]->buffer;
116     VAEncMiscParameterHRD* pParameterHRD = (VAEncMiscParameterHRD*)pMiscParamHRD->data;
117     double bitrate = pSequenceParameter->bits_per_second;
118     double framerate = (double)pSequenceParameter->time_scale /(2 * (double)pSequenceParameter->num_units_in_tick);
119     int inum = 1, pnum = 0, bnum = 0; /* Gop structure: number of I, P, B frames in the Gop. */
120     int intra_period = pSequenceParameter->intra_period;
121     int ip_period = pSequenceParameter->ip_period;
122     double qp1_size = 0.1 * 8 * 3 * (pSequenceParameter->picture_width_in_mbs<<4) * (pSequenceParameter->picture_height_in_mbs<<4)/2;
123     double qp51_size = 0.001 * 8 * 3 * (pSequenceParameter->picture_width_in_mbs<<4) * (pSequenceParameter->picture_height_in_mbs<<4)/2;
124     double bpf;
125
126     if (pSequenceParameter->ip_period) {
127         pnum = (intra_period + ip_period - 1)/ip_period - 1;
128         bnum = intra_period - inum - pnum;
129     }
130
131     mfc_context->brc.mode = encoder_context->rate_control_mode;
132
133     mfc_context->brc.target_frame_size[SLICE_TYPE_I] = (int)((double)((bitrate * intra_period)/framerate) /
134                                                              (double)(inum + BRC_PWEIGHT * pnum + BRC_BWEIGHT * bnum));
135     mfc_context->brc.target_frame_size[SLICE_TYPE_P] = BRC_PWEIGHT * mfc_context->brc.target_frame_size[SLICE_TYPE_I];
136     mfc_context->brc.target_frame_size[SLICE_TYPE_B] = BRC_BWEIGHT * mfc_context->brc.target_frame_size[SLICE_TYPE_I];
137
138     mfc_context->brc.gop_nums[SLICE_TYPE_I] = inum;
139     mfc_context->brc.gop_nums[SLICE_TYPE_P] = pnum;
140     mfc_context->brc.gop_nums[SLICE_TYPE_B] = bnum;
141
142     bpf = mfc_context->brc.bits_per_frame = bitrate/framerate;
143
144     mfc_context->hrd.buffer_size = (double)pParameterHRD->buffer_size;
145     mfc_context->hrd.current_buffer_fullness =
146         (double)(pParameterHRD->initial_buffer_fullness < mfc_context->hrd.buffer_size)?
147             pParameterHRD->initial_buffer_fullness: mfc_context->hrd.buffer_size/2.;
148     mfc_context->hrd.target_buffer_fullness = (double)mfc_context->hrd.buffer_size/2.;
149     mfc_context->hrd.buffer_capacity = (double)mfc_context->hrd.buffer_size/qp1_size;
150     mfc_context->hrd.violation_noted = 0;
151
152     if ((bpf > qp51_size) && (bpf < qp1_size)) {
153         mfc_context->bit_rate_control_context[SLICE_TYPE_P].QpPrimeY = 51 - 50*(bpf - qp51_size)/(qp1_size - qp51_size);
154     }
155     else if (bpf >= qp1_size)
156         mfc_context->bit_rate_control_context[SLICE_TYPE_P].QpPrimeY = 1;
157     else if (bpf <= qp51_size)
158         mfc_context->bit_rate_control_context[SLICE_TYPE_P].QpPrimeY = 51;
159
160     mfc_context->bit_rate_control_context[SLICE_TYPE_I].QpPrimeY = mfc_context->bit_rate_control_context[SLICE_TYPE_P].QpPrimeY;
161     mfc_context->bit_rate_control_context[SLICE_TYPE_B].QpPrimeY = mfc_context->bit_rate_control_context[SLICE_TYPE_I].QpPrimeY;
162
163     BRC_CLIP(mfc_context->bit_rate_control_context[SLICE_TYPE_I].QpPrimeY, 1, 51);
164     BRC_CLIP(mfc_context->bit_rate_control_context[SLICE_TYPE_P].QpPrimeY, 1, 51);
165     BRC_CLIP(mfc_context->bit_rate_control_context[SLICE_TYPE_B].QpPrimeY, 1, 51);
166 }
167
168 int intel_mfc_update_hrd(struct encode_state *encode_state,
169                                struct gen6_mfc_context *mfc_context,
170                                int frame_bits)
171 {
172     double prev_bf = mfc_context->hrd.current_buffer_fullness;
173
174     mfc_context->hrd.current_buffer_fullness -= frame_bits;
175
176     if (mfc_context->hrd.buffer_size > 0 && mfc_context->hrd.current_buffer_fullness <= 0.) {
177         mfc_context->hrd.current_buffer_fullness = prev_bf;
178         return BRC_UNDERFLOW;
179     }
180     
181     mfc_context->hrd.current_buffer_fullness += mfc_context->brc.bits_per_frame;
182     if (mfc_context->hrd.buffer_size > 0 && mfc_context->hrd.current_buffer_fullness > mfc_context->hrd.buffer_size) {
183         if (mfc_context->brc.mode == VA_RC_VBR)
184             mfc_context->hrd.current_buffer_fullness = mfc_context->hrd.buffer_size;
185         else {
186             mfc_context->hrd.current_buffer_fullness = prev_bf;
187             return BRC_OVERFLOW;
188         }
189     }
190     return BRC_NO_HRD_VIOLATION;
191 }
192
193 int intel_mfc_brc_postpack(struct encode_state *encode_state,
194                                  struct gen6_mfc_context *mfc_context,
195                                  int frame_bits)
196 {
197     gen6_brc_status sts = BRC_NO_HRD_VIOLATION;
198     VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer; 
199     int slicetype = pSliceParameter->slice_type;
200     int qpi = mfc_context->bit_rate_control_context[SLICE_TYPE_I].QpPrimeY;
201     int qpp = mfc_context->bit_rate_control_context[SLICE_TYPE_P].QpPrimeY;
202     int qpb = mfc_context->bit_rate_control_context[SLICE_TYPE_B].QpPrimeY;
203     int qp; // quantizer of previously encoded slice of current type
204     int qpn; // predicted quantizer for next frame of current type in integer format
205     double qpf; // predicted quantizer for next frame of current type in float format
206     double delta_qp; // QP correction
207     int target_frame_size, frame_size_next;
208     /* Notes:
209      *  x - how far we are from HRD buffer borders
210      *  y - how far we are from target HRD buffer fullness
211      */
212     double x, y;
213     double frame_size_alpha;
214
215     if (slicetype == SLICE_TYPE_SP)
216         slicetype = SLICE_TYPE_P;
217     else if (slicetype == SLICE_TYPE_SI)
218         slicetype = SLICE_TYPE_I;
219
220     qp = mfc_context->bit_rate_control_context[slicetype].QpPrimeY;
221
222     target_frame_size = mfc_context->brc.target_frame_size[slicetype];
223     if (mfc_context->hrd.buffer_capacity < 5)
224         frame_size_alpha = 0;
225     else
226         frame_size_alpha = (double)mfc_context->brc.gop_nums[slicetype];
227     if (frame_size_alpha > 30) frame_size_alpha = 30;
228     frame_size_next = target_frame_size + (double)(target_frame_size - frame_bits) /
229                                           (double)(frame_size_alpha + 1.);
230
231     /* frame_size_next: avoiding negative number and too small value */
232     if ((double)frame_size_next < (double)(target_frame_size * 0.25))
233         frame_size_next = (int)((double)target_frame_size * 0.25);
234
235     qpf = (double)qp * target_frame_size / frame_size_next;
236     qpn = (int)(qpf + 0.5);
237
238     if (qpn == qp) {
239         /* setting qpn we round qpf making mistakes: now we are trying to compensate this */
240         mfc_context->brc.qpf_rounding_accumulator += qpf - qpn;
241         if (mfc_context->brc.qpf_rounding_accumulator > 1.0) {
242             qpn++;
243             mfc_context->brc.qpf_rounding_accumulator = 0.;
244         } else if (mfc_context->brc.qpf_rounding_accumulator < -1.0) {
245             qpn--;
246             mfc_context->brc.qpf_rounding_accumulator = 0.;
247         }
248     }
249     /* making sure that QP is not changing too fast */
250     if ((qpn - qp) > BRC_QP_MAX_CHANGE) qpn = qp + BRC_QP_MAX_CHANGE;
251     else if ((qpn - qp) < -BRC_QP_MAX_CHANGE) qpn = qp - BRC_QP_MAX_CHANGE;
252     /* making sure that with QP predictions we did do not leave QPs range */
253     BRC_CLIP(qpn, 1, 51);
254
255     /* checking wthether HRD compliance is still met */
256     sts = intel_mfc_update_hrd(encode_state, mfc_context, frame_bits);
257
258     /* calculating QP delta as some function*/
259     x = mfc_context->hrd.target_buffer_fullness - mfc_context->hrd.current_buffer_fullness;
260     if (x > 0) {
261         x /= mfc_context->hrd.target_buffer_fullness;
262         y = mfc_context->hrd.current_buffer_fullness;
263     }
264     else {
265         x /= (mfc_context->hrd.buffer_size - mfc_context->hrd.target_buffer_fullness);
266         y = mfc_context->hrd.buffer_size - mfc_context->hrd.current_buffer_fullness;
267     }
268     if (y < 0.01) y = 0.01;
269     if (x > 1) x = 1;
270     else if (x < -1) x = -1;
271
272     delta_qp = BRC_QP_MAX_CHANGE*exp(-1/y)*sin(BRC_PI_0_5 * x);
273     qpn = (int)(qpn + delta_qp + 0.5);
274
275     /* making sure that with QP predictions we did do not leave QPs range */
276     BRC_CLIP(qpn, 1, 51);
277
278     if (sts == BRC_NO_HRD_VIOLATION) { // no HRD violation
279         /* correcting QPs of slices of other types */
280         if (slicetype == SLICE_TYPE_P) {
281             if (abs(qpn + BRC_P_B_QP_DIFF - qpb) > 2)
282                 mfc_context->bit_rate_control_context[SLICE_TYPE_B].QpPrimeY += (qpn + BRC_P_B_QP_DIFF - qpb) >> 1;
283             if (abs(qpn - BRC_I_P_QP_DIFF - qpi) > 2)
284                 mfc_context->bit_rate_control_context[SLICE_TYPE_I].QpPrimeY += (qpn - BRC_I_P_QP_DIFF - qpi) >> 1;
285         } else if (slicetype == SLICE_TYPE_I) {
286             if (abs(qpn + BRC_I_B_QP_DIFF - qpb) > 4)
287                 mfc_context->bit_rate_control_context[SLICE_TYPE_B].QpPrimeY += (qpn + BRC_I_B_QP_DIFF - qpb) >> 2;
288             if (abs(qpn + BRC_I_P_QP_DIFF - qpp) > 2)
289                 mfc_context->bit_rate_control_context[SLICE_TYPE_P].QpPrimeY += (qpn + BRC_I_P_QP_DIFF - qpp) >> 2;
290         } else { // SLICE_TYPE_B
291             if (abs(qpn - BRC_P_B_QP_DIFF - qpp) > 2)
292                 mfc_context->bit_rate_control_context[SLICE_TYPE_P].QpPrimeY += (qpn - BRC_P_B_QP_DIFF - qpp) >> 1;
293             if (abs(qpn - BRC_I_B_QP_DIFF - qpi) > 4)
294                 mfc_context->bit_rate_control_context[SLICE_TYPE_I].QpPrimeY += (qpn - BRC_I_B_QP_DIFF - qpi) >> 2;
295         }
296         BRC_CLIP(mfc_context->bit_rate_control_context[SLICE_TYPE_I].QpPrimeY, 1, 51);
297         BRC_CLIP(mfc_context->bit_rate_control_context[SLICE_TYPE_P].QpPrimeY, 1, 51);
298         BRC_CLIP(mfc_context->bit_rate_control_context[SLICE_TYPE_B].QpPrimeY, 1, 51);
299     } else if (sts == BRC_UNDERFLOW) { // underflow
300         if (qpn <= qp) qpn = qp + 1;
301         if (qpn > 51) {
302             qpn = 51;
303             sts = BRC_UNDERFLOW_WITH_MAX_QP; //underflow with maxQP
304         }
305     } else if (sts == BRC_OVERFLOW) {
306         if (qpn >= qp) qpn = qp - 1;
307         if (qpn < 1) { // < 0 (?) overflow with minQP
308             qpn = 1;
309             sts = BRC_OVERFLOW_WITH_MIN_QP; // bit stuffing to be done
310         }
311     }
312
313     mfc_context->bit_rate_control_context[slicetype].QpPrimeY = qpn;
314
315     return sts;
316 }
317
318 static void intel_mfc_hrd_context_init(struct encode_state *encode_state,
319                           struct intel_encoder_context *encoder_context)
320 {
321     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
322     VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
323     unsigned int rate_control_mode = encoder_context->rate_control_mode;
324     int target_bit_rate = pSequenceParameter->bits_per_second;
325     
326     // current we only support CBR mode.
327     if (rate_control_mode == VA_RC_CBR) {
328         mfc_context->vui_hrd.i_bit_rate_value = target_bit_rate >> 10;
329         mfc_context->vui_hrd.i_cpb_size_value = (target_bit_rate * 8) >> 10;
330         mfc_context->vui_hrd.i_initial_cpb_removal_delay = mfc_context->vui_hrd.i_cpb_size_value * 0.5 * 1024 / target_bit_rate * 90000;
331         mfc_context->vui_hrd.i_cpb_removal_delay = 2;
332         mfc_context->vui_hrd.i_frame_number = 0;
333
334         mfc_context->vui_hrd.i_initial_cpb_removal_delay_length = 24; 
335         mfc_context->vui_hrd.i_cpb_removal_delay_length = 24;
336         mfc_context->vui_hrd.i_dpb_output_delay_length = 24;
337     }
338
339 }
340
341 void 
342 intel_mfc_hrd_context_update(struct encode_state *encode_state, 
343                           struct gen6_mfc_context *mfc_context) 
344 {
345     mfc_context->vui_hrd.i_frame_number++;
346 }
347
348 int intel_mfc_interlace_check(VADriverContextP ctx,
349                    struct encode_state *encode_state,
350                    struct intel_encoder_context *encoder_context) 
351 {
352     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
353     VAEncSliceParameterBufferH264 *pSliceParameter;
354     int i;
355     int mbCount = 0;
356     int width_in_mbs = (mfc_context->surface_state.width + 15) / 16;
357     int height_in_mbs = (mfc_context->surface_state.height + 15) / 16;
358   
359     for (i = 0; i < encode_state->num_slice_params_ext; i++) {
360         pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[i]->buffer; 
361         mbCount += pSliceParameter->num_macroblocks; 
362     }
363     
364     if ( mbCount == ( width_in_mbs * height_in_mbs ) )
365         return 0;
366
367     return 1;
368 }
369
370 void intel_mfc_brc_prepare(struct encode_state *encode_state,
371                           struct intel_encoder_context *encoder_context)
372 {
373     unsigned int rate_control_mode = encoder_context->rate_control_mode;
374     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
375
376     if (rate_control_mode == VA_RC_CBR) {
377         /*Programing bit rate control */
378         if ( mfc_context->bit_rate_control_context[SLICE_TYPE_I].MaxSizeInWord == 0 ) {
379             intel_mfc_bit_rate_control_context_init(encode_state, mfc_context);
380             intel_mfc_brc_init(encode_state, encoder_context);
381         }
382
383         /*Programing HRD control */
384         if ( mfc_context->vui_hrd.i_cpb_size_value == 0 )
385             intel_mfc_hrd_context_init(encode_state, encoder_context);    
386     }
387 }
388
389 void intel_mfc_avc_pipeline_header_programing(VADriverContextP ctx,
390                                                     struct encode_state *encode_state,
391                                                     struct intel_encoder_context *encoder_context,
392                                                     struct intel_batchbuffer *slice_batch)
393 {
394     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
395     int idx = va_enc_packed_type_to_idx(VAEncPackedHeaderH264_SPS);
396     unsigned int rate_control_mode = encoder_context->rate_control_mode;
397
398     if (encode_state->packed_header_data[idx]) {
399         VAEncPackedHeaderParameterBuffer *param = NULL;
400         unsigned int *header_data = (unsigned int *)encode_state->packed_header_data[idx]->buffer;
401         unsigned int length_in_bits;
402
403         assert(encode_state->packed_header_param[idx]);
404         param = (VAEncPackedHeaderParameterBuffer *)encode_state->packed_header_param[idx]->buffer;
405         length_in_bits = param->bit_length;
406
407         mfc_context->insert_object(ctx,
408                                    encoder_context,
409                                    header_data,
410                                    ALIGN(length_in_bits, 32) >> 5,
411                                    length_in_bits & 0x1f,
412                                    5,   /* FIXME: check it */
413                                    0,
414                                    0,
415                                    !param->has_emulation_bytes,
416                                    slice_batch);
417     }
418
419     idx = va_enc_packed_type_to_idx(VAEncPackedHeaderH264_PPS);
420
421     if (encode_state->packed_header_data[idx]) {
422         VAEncPackedHeaderParameterBuffer *param = NULL;
423         unsigned int *header_data = (unsigned int *)encode_state->packed_header_data[idx]->buffer;
424         unsigned int length_in_bits;
425
426         assert(encode_state->packed_header_param[idx]);
427         param = (VAEncPackedHeaderParameterBuffer *)encode_state->packed_header_param[idx]->buffer;
428         length_in_bits = param->bit_length;
429
430         mfc_context->insert_object(ctx,
431                                    encoder_context,
432                                    header_data,
433                                    ALIGN(length_in_bits, 32) >> 5,
434                                    length_in_bits & 0x1f,
435                                    5, /* FIXME: check it */
436                                    0,
437                                    0,
438                                    !param->has_emulation_bytes,
439                                    slice_batch);
440     }
441     
442     idx = va_enc_packed_type_to_idx(VAEncPackedHeaderH264_SEI);
443
444     if (encode_state->packed_header_data[idx]) {
445         VAEncPackedHeaderParameterBuffer *param = NULL;
446         unsigned int *header_data = (unsigned int *)encode_state->packed_header_data[idx]->buffer;
447         unsigned int length_in_bits;
448
449         assert(encode_state->packed_header_param[idx]);
450         param = (VAEncPackedHeaderParameterBuffer *)encode_state->packed_header_param[idx]->buffer;
451         length_in_bits = param->bit_length;
452
453         mfc_context->insert_object(ctx,
454                                    encoder_context,
455                                    header_data,
456                                    ALIGN(length_in_bits, 32) >> 5,
457                                    length_in_bits & 0x1f,
458                                    5, /* FIXME: check it */
459                                    0,
460                                    0,
461                                    !param->has_emulation_bytes,
462                                    slice_batch);
463     } else if (rate_control_mode == VA_RC_CBR) {
464         // this is frist AU
465         struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
466
467         unsigned char *sei_data = NULL;
468     
469         int length_in_bits = build_avc_sei_buffer_timing(
470                         mfc_context->vui_hrd.i_initial_cpb_removal_delay_length,
471                         mfc_context->vui_hrd.i_initial_cpb_removal_delay,
472                         0,
473                         mfc_context->vui_hrd.i_cpb_removal_delay_length,                                                       mfc_context->vui_hrd.i_cpb_removal_delay * mfc_context->vui_hrd.i_frame_number,
474                         mfc_context->vui_hrd.i_dpb_output_delay_length,
475                         0,
476                         &sei_data);
477         mfc_context->insert_object(ctx,
478                                    encoder_context,
479                                    (unsigned int *)sei_data,
480                                    ALIGN(length_in_bits, 32) >> 5,
481                                    length_in_bits & 0x1f,
482                                    4,   
483                                    0,   
484                                    0,   
485                                    1,
486                                    slice_batch);  
487         free(sei_data);
488     }
489 }
490
491 VAStatus intel_mfc_avc_prepare(VADriverContextP ctx, 
492                                      struct encode_state *encode_state,
493                                      struct intel_encoder_context *encoder_context)
494 {
495     struct i965_driver_data *i965 = i965_driver_data(ctx);
496     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
497     struct object_surface *obj_surface; 
498     struct object_buffer *obj_buffer;
499     GenAvcSurface *gen6_avc_surface;
500     dri_bo *bo;
501     VAEncPictureParameterBufferH264 *pPicParameter = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
502     VAStatus vaStatus = VA_STATUS_SUCCESS;
503     int i, j, enable_avc_ildb = 0;
504     VAEncSliceParameterBufferH264 *slice_param;
505     struct i965_coded_buffer_segment *coded_buffer_segment;
506     VAEncSequenceParameterBufferH264 *pSequenceParameter = (VAEncSequenceParameterBufferH264 *)encode_state->seq_param_ext->buffer;
507     int width_in_mbs = pSequenceParameter->picture_width_in_mbs;
508     int height_in_mbs = pSequenceParameter->picture_height_in_mbs;
509
510     if (IS_GEN6(i965->intel.device_id)) {
511         /* On the SNB it should be fixed to 128 for the DMV buffer */
512         width_in_mbs = 128;
513     }
514
515     for (j = 0; j < encode_state->num_slice_params_ext && enable_avc_ildb == 0; j++) {
516         assert(encode_state->slice_params_ext && encode_state->slice_params_ext[j]->buffer);
517         slice_param = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[j]->buffer;
518
519         for (i = 0; i < encode_state->slice_params_ext[j]->num_elements; i++) {
520             assert((slice_param->slice_type == SLICE_TYPE_I) ||
521                    (slice_param->slice_type == SLICE_TYPE_SI) ||
522                    (slice_param->slice_type == SLICE_TYPE_P) ||
523                    (slice_param->slice_type == SLICE_TYPE_SP) ||
524                    (slice_param->slice_type == SLICE_TYPE_B));
525
526             if (slice_param->disable_deblocking_filter_idc != 1) {
527                 enable_avc_ildb = 1;
528                 break;
529             }
530
531             slice_param++;
532         }
533     }
534
535     /*Setup all the input&output object*/
536
537     /* Setup current frame and current direct mv buffer*/
538     obj_surface = SURFACE(pPicParameter->CurrPic.picture_id);
539     assert(obj_surface);
540     i965_check_alloc_surface_bo(ctx, obj_surface, 1, VA_FOURCC('N','V','1','2'), SUBSAMPLE_YUV420);
541
542     if ( obj_surface->private_data == NULL) {
543         gen6_avc_surface = calloc(sizeof(GenAvcSurface), 1);
544         gen6_avc_surface->dmv_top = 
545             dri_bo_alloc(i965->intel.bufmgr,
546                          "Buffer",
547                          68 * width_in_mbs * height_in_mbs, 
548                          64);
549         gen6_avc_surface->dmv_bottom = 
550             dri_bo_alloc(i965->intel.bufmgr,
551                          "Buffer",
552                          68 * width_in_mbs * height_in_mbs, 
553                          64);
554         assert(gen6_avc_surface->dmv_top);
555         assert(gen6_avc_surface->dmv_bottom);
556         obj_surface->private_data = (void *)gen6_avc_surface;
557         obj_surface->free_private_data = (void *)gen_free_avc_surface; 
558     }
559     gen6_avc_surface = (GenAvcSurface *) obj_surface->private_data;
560     mfc_context->direct_mv_buffers[NUM_MFC_DMV_BUFFERS - 2].bo = gen6_avc_surface->dmv_top;
561     mfc_context->direct_mv_buffers[NUM_MFC_DMV_BUFFERS - 1].bo = gen6_avc_surface->dmv_bottom;
562     dri_bo_reference(gen6_avc_surface->dmv_top);
563     dri_bo_reference(gen6_avc_surface->dmv_bottom);
564
565     if (enable_avc_ildb) {
566         mfc_context->post_deblocking_output.bo = obj_surface->bo;
567         dri_bo_reference(mfc_context->post_deblocking_output.bo);
568     } else {
569         mfc_context->pre_deblocking_output.bo = obj_surface->bo;
570         dri_bo_reference(mfc_context->pre_deblocking_output.bo);
571     }
572
573     mfc_context->surface_state.width = obj_surface->orig_width;
574     mfc_context->surface_state.height = obj_surface->orig_height;
575     mfc_context->surface_state.w_pitch = obj_surface->width;
576     mfc_context->surface_state.h_pitch = obj_surface->height;
577     
578     /* Setup reference frames and direct mv buffers*/
579     for(i = 0; i < MAX_MFC_REFERENCE_SURFACES; i++) {
580         if ( pPicParameter->ReferenceFrames[i].picture_id != VA_INVALID_ID ) { 
581             obj_surface = SURFACE(pPicParameter->ReferenceFrames[i].picture_id);
582             assert(obj_surface);
583             if (obj_surface->bo != NULL) {
584                 mfc_context->reference_surfaces[i].bo = obj_surface->bo;
585                 dri_bo_reference(obj_surface->bo);
586             }
587             /* Check DMV buffer */
588             if ( obj_surface->private_data == NULL) {
589                 
590                 gen6_avc_surface = calloc(sizeof(GenAvcSurface), 1);
591                 gen6_avc_surface->dmv_top = 
592                     dri_bo_alloc(i965->intel.bufmgr,
593                                  "Buffer",
594                                  68 * width_in_mbs * height_in_mbs, 
595                                  64);
596                 gen6_avc_surface->dmv_bottom = 
597                     dri_bo_alloc(i965->intel.bufmgr,
598                                  "Buffer",
599                                  68 * width_in_mbs * height_in_mbs, 
600                                  64);
601                 assert(gen6_avc_surface->dmv_top);
602                 assert(gen6_avc_surface->dmv_bottom);
603                 obj_surface->private_data = gen6_avc_surface;
604                 obj_surface->free_private_data = gen_free_avc_surface; 
605             }
606     
607             gen6_avc_surface = (GenAvcSurface *) obj_surface->private_data;
608             /* Setup DMV buffer */
609             mfc_context->direct_mv_buffers[i*2].bo = gen6_avc_surface->dmv_top;
610             mfc_context->direct_mv_buffers[i*2+1].bo = gen6_avc_surface->dmv_bottom; 
611             dri_bo_reference(gen6_avc_surface->dmv_top);
612             dri_bo_reference(gen6_avc_surface->dmv_bottom);
613         } else {
614             break;
615         }
616     }
617         
618     obj_surface = SURFACE(encoder_context->input_yuv_surface);
619     assert(obj_surface && obj_surface->bo);
620     mfc_context->uncompressed_picture_source.bo = obj_surface->bo;
621     dri_bo_reference(mfc_context->uncompressed_picture_source.bo);
622
623     obj_buffer = BUFFER (pPicParameter->coded_buf); /* FIXME: fix this later */
624     bo = obj_buffer->buffer_store->bo;
625     assert(bo);
626     mfc_context->mfc_indirect_pak_bse_object.bo = bo;
627     mfc_context->mfc_indirect_pak_bse_object.offset = I965_CODEDBUFFER_HEADER_SIZE;
628     mfc_context->mfc_indirect_pak_bse_object.end_offset = ALIGN(obj_buffer->size_element - 0x1000, 0x1000);
629     dri_bo_reference(mfc_context->mfc_indirect_pak_bse_object.bo);
630     
631     dri_bo_map(bo, 1);
632     coded_buffer_segment = (struct i965_coded_buffer_segment *)bo->virtual;
633     coded_buffer_segment->mapped = 0;
634     coded_buffer_segment->codec = CODED_H264;
635     dri_bo_unmap(bo);
636
637     return vaStatus;
638 }
/*
 * The LUT uses a pair of 4-bit units: a (shift, base) structure, with
 *      value = base << shift        (i.e. 2^K * X = value).
 * So it is necessary to convert a cost into the nearest LUT format.
 * The derivation is:
 *    2^K * X = 2^n * (1 + deltaX)
 *    K + log2(X) = n + log2(1 + deltaX)
 *    log2(X) = n - K + log2(1 + deltaX)
 *    As X is in the range of [1, 15]
 *      4 > n - K + log2(1 + deltaX) >= 0
 *      =>    n + log2(1 + deltaX)  >= K > n - 4  + log2(1 + deltaX)
 *    Then we can derive the corresponding K and get the nearest LUT format.
 *
 * value: the cost to be encoded. Values <= 0 are encoded as 0.
 * max:   the largest LUT code (same (shift << 4) | base layout) that may be
 *        returned; it is returned whenever the encoded cost would decode to
 *        something larger than what max decodes to.
 *
 * Returns the LUT code ((shift << 4) | base), always within [0, 0xff]
 * provided that max is.
 */
int intel_format_lutvalue(int value, int max)
{
        /* The shift field is only 4 bits wide. */
        const int max_shift = 15;
        int ret;
        int logvalue, remaining, temp1, temp2;

        if (value <= 0)
                return 0;

        /*
         * floor(log2(value)) computed with integers: exact for every input,
         * unlike a float log2 which may round up just below a power of two.
         */
        logvalue = 0;
        for (remaining = value; remaining > 1; remaining >>= 1)
                logvalue++;

        if (logvalue < 4) {
                /* value fits in the 4-bit base with shift 0 */
                ret = value;
        } else {
                int error, temp_value, base, j, last_shift, temp_err;

                error = value;
                ret = -1;
                last_shift = (logvalue < max_shift) ? logvalue : max_shift;

                /* j >= 1 here because logvalue >= 4 */
                for (j = logvalue - 4 + 1; j <= last_shift; j++) {
                        /* round to the nearest multiple of 2^j (ties go down) */
                        base = (value + (1 << (j - 1)) - 1) >> j;
                        if (base >= 16)
                                continue;

                        temp_value = base << j;
                        temp_err = abs(value - temp_value);
                        if (temp_err < error) {
                                error = temp_err;
                                ret = (j << 4) | base;
                                if (temp_err == 0)
                                        break;
                        }
                }

                /*
                 * No (shift, base) pair with a 4-bit shift can represent the
                 * value: it is larger than any LUT code, so saturate to max.
                 */
                if (ret < 0)
                        return max;
        }

        /* Clamp against max by comparing the decoded magnitudes. */
        temp1 = (ret & 0xf) << ((ret & 0xf0) >> 4);
        temp2 = (max & 0xf) << ((max & 0xf0) >> 4);
        if (temp1 > temp2)
                ret = max;
        return ret;
}
694
695
696 #define         QP_MAX                  52
697
698
699 static float intel_lambda_qp(int qp)
700 {
701         float value, lambdaf;
702         value = qp;
703         value = value / 6 - 2;
704         if (value < 0)
705                 value = 0;
706         lambdaf = roundf(powf(2, value));
707         return lambdaf; 
708 }
709
710
711 void intel_vme_update_mbmv_cost(VADriverContextP ctx,
712                                        struct encode_state *encode_state,
713                                        struct intel_encoder_context *encoder_context)
714 {
715     struct gen6_mfc_context *mfc_context = encoder_context->mfc_context;
716     struct gen6_vme_context *vme_context = encoder_context->vme_context;
717     VAEncPictureParameterBufferH264 *pic_param = (VAEncPictureParameterBufferH264 *)encode_state->pic_param_ext->buffer;
718     VAEncSliceParameterBufferH264 *slice_param = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[0]->buffer;
719     int qp, m_cost, j, mv_count;
720     uint8_t *vme_state_message = (uint8_t *)(vme_context->vme_state_message);
721     float   lambda, m_costf;
722
723     if (encoder_context->rate_control_mode == VA_RC_CQP)
724         qp = pic_param->pic_init_qp + slice_param->slice_qp_delta;
725     else
726         qp = mfc_context->bit_rate_control_context[slice_param->slice_type].QpPrimeY;
727   
728     if (vme_state_message == NULL)
729         return;
730  
731     assert(qp <= QP_MAX); 
732     lambda = intel_lambda_qp(qp);
733     if ((slice_param->slice_type == SLICE_TYPE_I) ||
734                 (slice_param->slice_type == SLICE_TYPE_SI)) {
735         vme_state_message[MODE_INTRA_16X16] = 0;
736         m_cost = lambda * 4;
737         vme_state_message[MODE_INTRA_8X8] = intel_format_lutvalue(m_cost, 0x8f);
738         m_cost = lambda * 16; 
739         vme_state_message[MODE_INTRA_4X4] = intel_format_lutvalue(m_cost, 0x8f);
740         m_cost = lambda * 3;
741         vme_state_message[MODE_INTRA_NONPRED] = intel_format_lutvalue(m_cost, 0x6f);
742     } else {
743         m_cost = 0;
744         vme_state_message[MODE_INTER_MV0] = intel_format_lutvalue(m_cost, 0x6f);
745         for (j = 1; j < 3; j++) {
746                 m_costf = (log2f((float)(j + 1)) + 1.718f) * lambda;
747                 m_cost = (int)m_costf;
748                 vme_state_message[MODE_INTER_MV0 + j] = intel_format_lutvalue(m_cost, 0x6f);
749         }
750         mv_count = 3;
751         for (j = 4; j <= 64; j *= 2) {
752                 m_costf = (log2f((float)(j + 1)) + 1.718f) * lambda;
753                 m_cost = (int)m_costf;
754                 vme_state_message[MODE_INTER_MV0 + mv_count] = intel_format_lutvalue(m_cost, 0x6f);
755                 mv_count++;
756         }
757
758         if (qp <= 25) {
759                 vme_state_message[MODE_INTRA_16X16] = 0x4a;
760                 vme_state_message[MODE_INTRA_8X8] = 0x4a;
761                 vme_state_message[MODE_INTRA_4X4] = 0x4a;
762                 vme_state_message[MODE_INTRA_NONPRED] = 0x4a;
763                 vme_state_message[MODE_INTER_16X16] = 0x4a;
764                 vme_state_message[MODE_INTER_16X8] = 0x4a;
765                 vme_state_message[MODE_INTER_8X8] = 0x4a;
766                 vme_state_message[MODE_INTER_8X4] = 0x4a;
767                 vme_state_message[MODE_INTER_4X4] = 0x4a;
768                 vme_state_message[MODE_INTER_BWD] = 0x2a;
769                 return; 
770         }
771         m_costf = lambda * 10;
772         vme_state_message[MODE_INTRA_16X16] = intel_format_lutvalue(m_cost, 0x8f);
773         m_cost = lambda * 14;
774         vme_state_message[MODE_INTRA_8X8] = intel_format_lutvalue(m_cost, 0x8f);
775         m_cost = lambda * 24; 
776         vme_state_message[MODE_INTRA_4X4] = intel_format_lutvalue(m_cost, 0x8f);
777         m_costf = lambda * 3.5;
778         m_cost = m_costf;
779         vme_state_message[MODE_INTRA_NONPRED] = intel_format_lutvalue(m_cost, 0x6f);
780         if ((slice_param->slice_type == SLICE_TYPE_P) ||
781                 (slice_param->slice_type == SLICE_TYPE_SP)) {
782                 m_costf = lambda * 2.5;
783                 m_cost = m_costf;
784                 vme_state_message[MODE_INTER_16X16] = intel_format_lutvalue(m_cost, 0x8f);
785                 m_costf = lambda * 4;
786                 m_cost = m_costf;
787                 vme_state_message[MODE_INTER_16X8] = intel_format_lutvalue(m_cost, 0x8f);
788                 m_costf = lambda * 1.5;
789                 m_cost = m_costf;
790                 vme_state_message[MODE_INTER_8X8] = intel_format_lutvalue(m_cost, 0x6f);
791                 m_costf = lambda * 3;
792                 m_cost = m_costf;
793                 vme_state_message[MODE_INTER_8X4] = intel_format_lutvalue(m_cost, 0x6f);
794                 m_costf = lambda * 5;
795                 m_cost = m_costf;
796                 vme_state_message[MODE_INTER_4X4] = intel_format_lutvalue(m_cost, 0x6f);
797                 /* BWD is not used in P-frame */
798                 vme_state_message[MODE_INTER_BWD] = 0;
799         } else {
800                 m_costf = lambda * 2.5;
801                 m_cost = m_costf;
802                 vme_state_message[MODE_INTER_16X16] = intel_format_lutvalue(m_cost, 0x8f);
803                 m_costf = lambda * 5.5;
804                 m_cost = m_costf;
805                 vme_state_message[MODE_INTER_16X8] = intel_format_lutvalue(m_cost, 0x8f);
806                 m_costf = lambda * 3.5;
807                 m_cost = m_costf;
808                 vme_state_message[MODE_INTER_8X8] = intel_format_lutvalue(m_cost, 0x6f);
809                 m_costf = lambda * 5.0;
810                 m_cost = m_costf;
811                 vme_state_message[MODE_INTER_8X4] = intel_format_lutvalue(m_cost, 0x6f);
812                 m_costf = lambda * 6.5;
813                 m_cost = m_costf;
814                 vme_state_message[MODE_INTER_4X4] = intel_format_lutvalue(m_cost, 0x6f);
815                 m_costf = lambda * 1.5;
816                 m_cost = m_costf;
817                 vme_state_message[MODE_INTER_BWD] = intel_format_lutvalue(m_cost, 0x6f);
818         }
819     }
820 }
821
822
823 #define         MB_SCOREBOARD_A         (1 << 0)
824 #define         MB_SCOREBOARD_B         (1 << 1)
825 #define         MB_SCOREBOARD_C         (1 << 2)
826 void 
827 gen7_vme_scoreboard_init(VADriverContextP ctx, struct gen6_vme_context *vme_context)
828 {
829     vme_context->gpe_context.vfe_desc5.scoreboard0.enable = 1;
830     vme_context->gpe_context.vfe_desc5.scoreboard0.type = SCOREBOARD_STALLING;
831     vme_context->gpe_context.vfe_desc5.scoreboard0.mask = (MB_SCOREBOARD_A |
832                                                                 MB_SCOREBOARD_B |
833                                                                 MB_SCOREBOARD_C);
834
835     /* In VME prediction the current mb depends on the neighbour 
836      * A/B/C macroblock. So the left/up/up-right dependency should
837      * be considered.
838      */
839     vme_context->gpe_context.vfe_desc6.scoreboard1.delta_x0 = -1;
840     vme_context->gpe_context.vfe_desc6.scoreboard1.delta_y0 = 0;
841     vme_context->gpe_context.vfe_desc6.scoreboard1.delta_x1 = 0;
842     vme_context->gpe_context.vfe_desc6.scoreboard1.delta_y1 = -1;
843     vme_context->gpe_context.vfe_desc6.scoreboard1.delta_x2 = 1;
844     vme_context->gpe_context.vfe_desc6.scoreboard1.delta_y2 = -1;
845         
846     vme_context->gpe_context.vfe_desc7.dword = 0;
847     return;
848 }
849
/*
 * Check whether the macroblock at (x_index, y_index) lies inside the picture
 * and inside the slice covering macroblocks [first_mb, first_mb + num_mb).
 *
 * Returns 0 when the macroblock is in bounds and -1 when it is out of bounds.
 */
static inline int loop_in_bounds(int x_index, int y_index, int first_mb, int num_mb, int mb_width, int mb_height)
{
        int mb_index;
        if (x_index < 0 || x_index >= mb_width)
                return -1;
        if (y_index < 0 || y_index >= mb_height)
                return -1;
        
        mb_index = y_index * mb_width + x_index;
        /* first_mb + num_mb is one past the last macroblock of the slice */
        if (mb_index < first_mb || mb_index >= (first_mb + num_mb))
                return -1;
        return 0;
}
864
865 void
866 gen7_vme_walker_fill_vme_batchbuffer(VADriverContextP ctx, 
867                               struct encode_state *encode_state,
868                               int mb_width, int mb_height,
869                               int kernel,
870                               int transform_8x8_mode_flag,
871                               struct intel_encoder_context *encoder_context)
872 {
873     struct gen6_vme_context *vme_context = encoder_context->vme_context;
874     int mb_x = 0, mb_y = 0;
875     int mb_row;
876     int s;
877     unsigned int *command_ptr;
878     int temp;
879
880
881 #define         USE_SCOREBOARD          (1 << 21)
882  
883     dri_bo_map(vme_context->vme_batchbuffer.bo, 1);
884     command_ptr = vme_context->vme_batchbuffer.bo->virtual;
885
886     for (s = 0; s < encode_state->num_slice_params_ext; s++) {
887         VAEncSliceParameterBufferH264 *pSliceParameter = (VAEncSliceParameterBufferH264 *)encode_state->slice_params_ext[s]->buffer;
888         int first_mb = pSliceParameter->macroblock_address;
889         int num_mb = pSliceParameter->num_macroblocks;
890         unsigned int mb_intra_ub, score_dep;
891         int x_outer, y_outer, x_inner, y_inner;
892
893         x_outer = first_mb % mb_width;
894         y_outer = first_mb / mb_width;
895         mb_row = y_outer;
896                                  
897         for (; x_outer < (mb_width -2 ) && !loop_in_bounds(x_outer, y_outer, first_mb, num_mb, mb_width, mb_height); ) {
898             x_inner = x_outer;
899             y_inner = y_outer;
900             for (; !loop_in_bounds(x_inner, y_inner, first_mb, num_mb, mb_width, mb_height);) {
901                 mb_intra_ub = 0;
902                 score_dep = 0;
903                 if (x_inner != 0) {
904                     mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
905                     score_dep |= MB_SCOREBOARD_A; 
906                 }
907                 if (y_inner != mb_row) {
908                     mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
909                     score_dep |= MB_SCOREBOARD_B;
910                     if (x_inner != 0)
911                         mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
912                     if (x_inner != (mb_width -1)) {
913                         mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
914                         score_dep |= MB_SCOREBOARD_C;
915                      }
916                 }
917                                                         
918                 *command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
919                 *command_ptr++ = kernel;
920                 *command_ptr++ = USE_SCOREBOARD;
921                 /* Indirect data */
922                 *command_ptr++ = 0;
923                 /* the (X, Y) term of scoreboard */
924                 *command_ptr++ = ((y_inner << 16) | x_inner);
925                 *command_ptr++ = score_dep;
926                 /*inline data */
927                 *command_ptr++ = (mb_width << 16 | y_inner << 8 | x_inner);
928                 *command_ptr++ = ((1 << 18) | (1 << 16) | transform_8x8_mode_flag | (mb_intra_ub << 8));
929                 x_inner -= 2;
930                 y_inner += 1;
931             }
932             x_outer += 1;
933         }
934
935         x_outer = mb_width - 2;
936         y_outer = first_mb / mb_width;
937         temp = 0;
938         for (;!loop_in_bounds(x_outer, y_outer, first_mb, num_mb, mb_width, mb_height); ) { 
939             y_inner = y_outer;
940             x_inner = x_outer;
941             for (; !loop_in_bounds(x_inner, y_inner, first_mb, num_mb, mb_width, mb_height);) {
942                 mb_intra_ub = 0;
943                 score_dep = 0;
944                 if (x_inner != 0) {
945                     mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_AE;
946                     score_dep |= MB_SCOREBOARD_A; 
947                 }
948                 if (y_inner != mb_row) {
949                     mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_B;
950                     score_dep |= MB_SCOREBOARD_B;
951                     if (x_inner != 0)
952                         mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_D;
953
954                     if (x_inner != (mb_width -1)) {
955                         mb_intra_ub |= INTRA_PRED_AVAIL_FLAG_C;
956                         score_dep |= MB_SCOREBOARD_C;
957                      }
958                 }
959
960                 *command_ptr++ = (CMD_MEDIA_OBJECT | (8 - 2));
961                 *command_ptr++ = kernel;
962                 *command_ptr++ = USE_SCOREBOARD;
963                 /* Indirect data */
964                 *command_ptr++ = 0;
965                 /* the (X, Y) term of scoreboard */
966                 *command_ptr++ = ((y_inner << 16) | x_inner);
967                 *command_ptr++ = score_dep;
968                 /*inline data */
969                 *command_ptr++ = (mb_width << 16 | y_inner << 8 | x_inner);
970                 *command_ptr++ = ((1 << 18) | (1 << 16) | transform_8x8_mode_flag | (mb_intra_ub << 8));
971
972                 x_inner -= 2;
973                 y_inner += 1;
974             }
975             temp++;
976             if (temp == 2) {
977                 y_outer += 1;
978                 temp = 0;
979                 x_outer = mb_width - 2;
980             } else {
981                 x_outer++;
982             }   
983         }
984     }
985
986     *command_ptr++ = 0;
987     *command_ptr++ = MI_BATCH_BUFFER_END;
988
989     dri_bo_unmap(vme_context->vme_batchbuffer.bo);
990 }
991