/*
 * AltiVec acceleration for colorspace conversion
 *
 * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
Convert I420 YV12 to RGB in various formats.
  It rejects images that are not in a 420 format,
  it rejects images whose width is not a multiple of 16,
  it rejects images whose height is not a multiple of 2.
Rejected images fall back to the C code.

Lots of optimizations to be done here.

1. Need to fix saturation code. I just couldn't get it to fly with packs
   and adds, so we currently use max/min to clip.

2. The inefficient use of chroma loading needs a bit of brushing up.

3. Analysis of pipeline stalls needs to be done. Use Shark to identify
   pipeline stalls.


MODIFIED to calculate coeffs from the currently selected color space.
MODIFIED core to be a macro where you specify the output format.
ADDED UYVY conversion which is never called due to something in swscale.
CORRECTED algorithm selection to be strict on input formats.
ADDED runtime detection of AltiVec.

ADDED altivec_yuv2packedX vertical scale + RGB converter

March 27, 2004
PERFORMANCE ANALYSIS

The C version uses 25% of the processor, or ~250 MIPS, for D1 rawvideo
used as the test sequence.
The AltiVec version uses 10% of the processor, or ~100 MIPS, for the
same sequence.

720 * 480 * 30  ~10 Mpixel/s

so we have roughly 10 clocks per pixel. This is too high; something has
to be wrong.

OPTIMIZED clip code to use vec_max and vec_packs, removing the
need for vec_min.

OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
the input video frame: it was just decompressed, so it probably resides in the
L1 cache. However, we are creating the output video stream. This needs to use
the DSTST instruction to optimize for the cache. We couple this with the fact
that we are not going to visit the input buffer again, so we mark it Least
Recently Used. This shaves 25% of the processor cycles off.

Now memcpy is the largest MIPS consumer in the system, probably due
to the inefficient X11 stuff.

GL libraries seem to be very slow on this machine (a 1.33 GHz PowerBook
running Jaguar); this is not the case for my 1 GHz PowerBook. I thought it
might be a versioning issue, however I have libGL.1.2.dylib on both
machines. (We need to figure this out now.)

GL2 libraries now work with the patch for RGB32.

NOTE: the quartz vo driver's ARGB32_to_RGB24 conversion consumes 30% of the
processor.

Integrated luma prescaling for saturation/contrast/brightness adjustment.
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <assert.h>
#include "config.h"
#include "libswscale/rgb2rgb.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"

#undef PROFILE_THE_BEAST
#undef INC_SCALING

typedef unsigned char ubyte;
typedef signed char   sbyte;


/* RGB interleaver: 16 planar pels, 8-bit samples per channel, held in the
   homogeneous vector registers x0, x1, x2, are interleaved with the
   following technique:

      o0 = vec_mergeh (x0,x1);
      o1 = vec_perm (o0, x2, perm_rgb_0);
      o2 = vec_perm (o0, x2, perm_rgb_1);
      o3 = vec_mergel (x0,x1);
      o4 = vec_perm (o3,o2,perm_rgb_2);
      o5 = vec_perm (o3,o2,perm_rgb_3);

  perm_rgb_0:   o0(RG).h v1(B) --> o1*
              0   1  2   3   4
             rgbr|gbrg|brgb|rgbr
             0010 0100 1001 0010
             0102 3145 2673 894A

  perm_rgb_1:   o0(RG).h v1(B) --> o2
              0   1  2   3   4
             gbrg|brgb|bbbb|bbbb
             0100 1001 1111 1111
             B5CD 6EF7 89AB CDEF

  perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
              0   1  2   3   4
             gbrg|brgb|rgbr|gbrg
             1111 1111 0010 0100
             89AB CDEF 0182 3945

  perm_rgb_3:   o3(RG).l o2(rgbB.l) --> o5*
              0   1  2   3   4
             brgb|rgbr|gbrg|brgb
             1001 0010 0100 1001
             a67b 89cA BdCD eEFf

*/
static
const vector unsigned char
  perm_rgb_0 = {0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
                0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a},
  perm_rgb_1 = {0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
                0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f},
  perm_rgb_2 = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
                0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05},
  perm_rgb_3 = {0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
                0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f};
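
/* vec_perm() indexes into the 32-byte concatenation of its two sources:
 * indices 0x00-0x0f select bytes from the first operand and 0x10-0x1f from
 * the second, which is how the B samples (and, in the second pass, the
 * partially interleaved bytes in o2) are spliced in between the merged
 * R/G pairs. */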

#define vec_merge3(x2,x1,x0,y0,y1,y2)       \
do {                                        \
    __typeof__(x0) o0,o2,o3;                \
        o0 = vec_mergeh (x0,x1);            \
        y0 = vec_perm (o0, x2, perm_rgb_0); \
        o2 = vec_perm (o0, x2, perm_rgb_1); \
        o3 = vec_mergel (x0,x1);            \
        y1 = vec_perm (o3,o2,perm_rgb_2);   \
        y2 = vec_perm (o3,o2,perm_rgb_3);   \
} while(0)

#define vec_mstbgr24(x0,x1,x2,ptr)      \
do {                                    \
    __typeof__(x0) _0,_1,_2;            \
    vec_merge3 (x0,x1,x2,_0,_1,_2);     \
    vec_st (_0, 0, ptr++);              \
    vec_st (_1, 0, ptr++);              \
    vec_st (_2, 0, ptr++);              \
}  while (0)

#define vec_mstrgb24(x0,x1,x2,ptr)      \
do {                                    \
    __typeof__(x0) _0,_1,_2;            \
    vec_merge3 (x2,x1,x0,_0,_1,_2);     \
    vec_st (_0, 0, ptr++);              \
    vec_st (_1, 0, ptr++);              \
    vec_st (_2, 0, ptr++);              \
}  while (0)

/* pack the pixels in rgb0 format
   msb R
   lsb 0
*/
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                       \
do {                                                                          \
    T _0,_1,_2,_3;                                                            \
    _0 = vec_mergeh (x0,x1);                                                  \
    _1 = vec_mergeh (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 0*16, (T *)ptr);                                              \
    vec_st (_3, 1*16, (T *)ptr);                                              \
    _0 = vec_mergel (x0,x1);                                                  \
    _1 = vec_mergel (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 2*16, (T *)ptr);                                              \
    vec_st (_3, 3*16, (T *)ptr);                                              \
    ptr += 4;                                                                 \
}  while (0)
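
/* Each invocation of vec_mstrgb32 therefore emits four 16-byte vectors, i.e.
 * 64 bytes or 16 four-byte pixels, and advances ptr by four vectors; the
 * 24-bit store macros above emit three vectors (48 bytes) for the same 16
 * pixels. */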

/*

  | 1     0       1.4021   | | Y |
  | 1    -0.3441 -0.7142   |x| Cb|
  | 1     1.7718  0        | | Cr|


  Y:      [-128 127]
  Cb/Cr : [-128 127]

  Typical YUV conversion works on Y in 0-255; this version has been
  optimized for JPEG decoding.

*/
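
/* A minimal scalar sketch of the fixed-point arithmetic performed per sample
 * by cvtyuvtoRGB() below, using the coefficients that
 * ff_yuv2rgb_init_tables_altivec() stores in the context (CY, OY, CRV, CBU,
 * CGU, CGV, and the shift CSHIFT = 2).  vec_mradds(a,b,c) computes
 * ((a*b + 0x4000) >> 15) + c with 16-bit saturation; the saturation steps are
 * omitted here, so this illustrates the data flow rather than being a
 * bit-exact replacement.  The function and parameter names are illustrative
 * only and are not used elsewhere in this file. */
static inline void scalar_yuv2rgb_ref(int y, int u, int v,
                                      int cy, int oy, int crv, int cbu,
                                      int cgu, int cgv,
                                      int *r, int *g, int *b)
{
    int luma;

    luma = ((y * cy + 0x4000) >> 15) + oy;     /* contrast/brightness adjusted luma */
    u -= 128;                                  /* chroma samples are biased by 128  */
    v -= 128;

    *b = (((u << 2) * cbu + 0x4000) >> 15) + luma;
    *r = (((v << 2) * crv + 0x4000) >> 15) + luma;
    *g = ((v * cgv + 0x4000) >> 15)
       + ((u * cgu + 0x4000) >> 15) + luma;
}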




#define vec_unh(x) \
    (vector signed short) \
        vec_perm(x,(__typeof__(x)){0}, \
                 ((vector unsigned char){0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
                                         0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07}))
#define vec_unl(x) \
    (vector signed short) \
        vec_perm(x,(__typeof__(x)){0}, \
                 ((vector unsigned char){0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
                                         0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F}))

#define vec_clip_s16(x) \
    vec_max (vec_min (x, ((vector signed short){235,235,235,235,235,235,235,235})), \
                         ((vector signed short){ 16, 16, 16, 16, 16, 16, 16, 16}))

#define vec_packclp(x,y) \
    (vector unsigned char)vec_packs \
        ((vector unsigned short)vec_max (x,((vector signed short) {0})), \
         (vector unsigned short)vec_max (y,((vector signed short) {0})))
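
/* For reference: vec_clip_s16() clamps each 16-bit lane to the nominal video
 * range [16,235] (used on the vertically scaled Y/U/V in
 * ff_yuv2packedX_altivec), while vec_packclp() clamps negatives to zero and
 * packs two vectors of signed shorts into one vector of unsigned bytes with
 * unsigned saturation, i.e. roughly out[i] = av_clip_uint8(in[i]). */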

//#define out_pixels(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,a,a,ptr)


static inline void cvtyuvtoRGB (SwsContext *c,
                                vector signed short Y, vector signed short U, vector signed short V,
                                vector signed short *R, vector signed short *G, vector signed short *B)
{
    vector signed   short vx,ux,uvx;

    Y = vec_mradds (Y, c->CY, c->OY);
    U  = vec_sub (U,(vector signed short)
                    vec_splat((vector signed short){128},0));
    V  = vec_sub (V,(vector signed short)
                    vec_splat((vector signed short){128},0));

    //   ux  = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
    ux = vec_sl (U, c->CSHIFT);
    *B = vec_mradds (ux, c->CBU, Y);

    // vx  = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
    vx = vec_sl (V, c->CSHIFT);
    *R = vec_mradds (vx, c->CRV, Y);

    // uvx = ((CGU*u) + (CGV*v))>>15;
    uvx = vec_mradds (U, c->CGU, Y);
    *G  = vec_mradds (V, c->CGV, uvx);
}


/*
  ------------------------------------------------------------------------------
  CS converters
  ------------------------------------------------------------------------------
*/


#define DEFCSP420_CVT(name,out_pixels)                                  \
static int altivec_##name (SwsContext *c,                               \
                           unsigned char **in, int *instrides,          \
                           int srcSliceY,        int srcSliceH,         \
                           unsigned char **oplanes, int *outstrides)    \
{                                                                       \
    int w = c->srcW;                                                    \
    int h = srcSliceH;                                                  \
    int i,j;                                                            \
    int instrides_scl[3];                                               \
    vector unsigned char y0,y1;                                         \
                                                                        \
    vector signed char  u,v;                                            \
                                                                        \
    vector signed short Y0,Y1,Y2,Y3;                                    \
    vector signed short U,V;                                            \
    vector signed short vx,ux,uvx;                                      \
    vector signed short vx0,ux0,uvx0;                                   \
    vector signed short vx1,ux1,uvx1;                                   \
    vector signed short R0,G0,B0;                                       \
    vector signed short R1,G1,B1;                                       \
    vector unsigned char R,G,B;                                         \
                                                                        \
    vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP;                  \
    vector unsigned char align_perm;                                    \
                                                                        \
    vector signed short                                                 \
        lCY  = c->CY,                                                   \
        lOY  = c->OY,                                                   \
        lCRV = c->CRV,                                                  \
        lCBU = c->CBU,                                                  \
        lCGU = c->CGU,                                                  \
        lCGV = c->CGV;                                                  \
                                                                        \
    vector unsigned short lCSHIFT = c->CSHIFT;                          \
                                                                        \
    ubyte *y1i   = in[0];                                               \
    ubyte *y2i   = in[0]+instrides[0];                                  \
    ubyte *ui    = in[1];                                               \
    ubyte *vi    = in[2];                                               \
                                                                        \
    vector unsigned char *oute                                          \
        = (vector unsigned char *)                                      \
            (oplanes[0]+srcSliceY*outstrides[0]);                       \
    vector unsigned char *outo                                          \
        = (vector unsigned char *)                                      \
            (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);         \
                                                                        \
                                                                        \
    instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */ \
    instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */    \
    instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */    \
                                                                        \
                                                                        \
    for (i=0;i<h/2;i++) {                                               \
        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);          \
        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);          \
                                                                        \
        for (j=0;j<w/16;j++) {                                          \
                                                                        \
            y1ivP = (vector unsigned char *)y1i;                        \
            y2ivP = (vector unsigned char *)y2i;                        \
            uivP  = (vector unsigned char *)ui;                         \
            vivP  = (vector unsigned char *)vi;                         \
                                                                        \
            align_perm = vec_lvsl (0, y1i);                             \
            y0 = (vector unsigned char)                                 \
                 vec_perm (y1ivP[0], y1ivP[1], align_perm);             \
                                                                        \
            align_perm = vec_lvsl (0, y2i);                             \
            y1 = (vector unsigned char)                                 \
                 vec_perm (y2ivP[0], y2ivP[1], align_perm);             \
                                                                        \
            align_perm = vec_lvsl (0, ui);                              \
            u = (vector signed char)                                    \
                vec_perm (uivP[0], uivP[1], align_perm);                \
                                                                        \
            align_perm = vec_lvsl (0, vi);                              \
            v = (vector signed char)                                    \
                vec_perm (vivP[0], vivP[1], align_perm);                \
                                                                        \
            u  = (vector signed char)                                   \
                 vec_sub (u,(vector signed char)                        \
                          vec_splat((vector signed char){128},0));      \
            v  = (vector signed char)                                   \
                 vec_sub (v,(vector signed char)                        \
                          vec_splat((vector signed char){128},0));      \
                                                                        \
            U  = vec_unpackh (u);                                       \
            V  = vec_unpackh (v);                                       \
                                                                        \
                                                                        \
            Y0 = vec_unh (y0);                                          \
            Y1 = vec_unl (y0);                                          \
            Y2 = vec_unh (y1);                                          \
            Y3 = vec_unl (y1);                                          \
                                                                        \
            Y0 = vec_mradds (Y0, lCY, lOY);                             \
            Y1 = vec_mradds (Y1, lCY, lOY);                             \
            Y2 = vec_mradds (Y2, lCY, lOY);                             \
            Y3 = vec_mradds (Y3, lCY, lOY);                             \
                                                                        \
            /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                  \
            ux = vec_sl (U, lCSHIFT);                                   \
            ux = vec_mradds (ux, lCBU, (vector signed short){0});       \
            ux0  = vec_mergeh (ux,ux);                                  \
            ux1  = vec_mergel (ux,ux);                                  \
                                                                        \
            /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */            \
            vx = vec_sl (V, lCSHIFT);                                   \
            vx = vec_mradds (vx, lCRV, (vector signed short){0});       \
            vx0  = vec_mergeh (vx,vx);                                  \
            vx1  = vec_mergel (vx,vx);                                  \
                                                                        \
            /* uvx = ((CGU*u) + (CGV*v))>>15 */                         \
            uvx = vec_mradds (U, lCGU, (vector signed short){0});       \
            uvx = vec_mradds (V, lCGV, uvx);                            \
            uvx0 = vec_mergeh (uvx,uvx);                                \
            uvx1 = vec_mergel (uvx,uvx);                                \
                                                                        \
            R0 = vec_add (Y0,vx0);                                      \
            G0 = vec_add (Y0,uvx0);                                     \
            B0 = vec_add (Y0,ux0);                                      \
            R1 = vec_add (Y1,vx1);                                      \
            G1 = vec_add (Y1,uvx1);                                     \
            B1 = vec_add (Y1,ux1);                                      \
                                                                        \
            R  = vec_packclp (R0,R1);                                   \
            G  = vec_packclp (G0,G1);                                   \
            B  = vec_packclp (B0,B1);                                   \
                                                                        \
            out_pixels(R,G,B,oute);                                     \
                                                                        \
            R0 = vec_add (Y2,vx0);                                      \
            G0 = vec_add (Y2,uvx0);                                     \
            B0 = vec_add (Y2,ux0);                                      \
            R1 = vec_add (Y3,vx1);                                      \
            G1 = vec_add (Y3,uvx1);                                     \
            B1 = vec_add (Y3,ux1);                                      \
            R  = vec_packclp (R0,R1);                                   \
            G  = vec_packclp (G0,G1);                                   \
            B  = vec_packclp (B0,B1);                                   \
                                                                        \
                                                                        \
            out_pixels(R,G,B,outo);                                     \
                                                                        \
            y1i  += 16;                                                 \
            y2i  += 16;                                                 \
            ui   += 8;                                                  \
            vi   += 8;                                                  \
                                                                        \
        }                                                               \
                                                                        \
        outo  += (outstrides[0])>>4;                                    \
        oute  += (outstrides[0])>>4;                                    \
                                                                        \
        ui    += instrides_scl[1];                                      \
        vi    += instrides_scl[2];                                      \
        y1i   += instrides_scl[0];                                      \
        y2i   += instrides_scl[0];                                      \
    }                                                                   \
    return srcSliceH;                                                   \
}
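
/* Each function generated by DEFCSP420_CVT consumes two source rows per outer
 * iteration: every inner iteration reads 16 luma samples from each of the two
 * rows plus 8 shared U and 8 shared V samples, and produces two rows of 16
 * output pixels through the out_pixels store macro chosen below. */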


#define out_abgr(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),c,b,a,ptr)
#define out_bgra(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),c,b,a,((__typeof__ (a)){255}),ptr)
#define out_rgba(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),a,b,c,((__typeof__ (a)){255}),ptr)
#define out_argb(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,b,c,ptr)
#define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
#define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)

DEFCSP420_CVT (yuv2_abgr, out_abgr)
#if 1
DEFCSP420_CVT (yuv2_bgra, out_bgra)
#else
static int altivec_yuv2_bgra32 (SwsContext *c,
                                unsigned char **in, int *instrides,
                                int srcSliceY,        int srcSliceH,
                                unsigned char **oplanes, int *outstrides)
{
    int w = c->srcW;
    int h = srcSliceH;
    int i,j;
    int instrides_scl[3];
    vector unsigned char y0,y1;

    vector signed char  u,v;

    vector signed short Y0,Y1,Y2,Y3;
    vector signed short U,V;
    vector signed short vx,ux,uvx;
    vector signed short vx0,ux0,uvx0;
    vector signed short vx1,ux1,uvx1;
    vector signed short R0,G0,B0;
    vector signed short R1,G1,B1;
    vector unsigned char R,G,B;

    vector unsigned char *uivP, *vivP;
    vector unsigned char align_perm;

    vector signed short
        lCY  = c->CY,
        lOY  = c->OY,
        lCRV = c->CRV,
        lCBU = c->CBU,
        lCGU = c->CGU,
        lCGV = c->CGV;

    vector unsigned short lCSHIFT = c->CSHIFT;

    ubyte *y1i   = in[0];
    ubyte *y2i   = in[0]+w;
    ubyte *ui    = in[1];
    ubyte *vi    = in[2];

    vector unsigned char *oute
        = (vector unsigned char *)
          (oplanes[0]+srcSliceY*outstrides[0]);
    vector unsigned char *outo
        = (vector unsigned char *)
          (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);


    instrides_scl[0] = instrides[0];
    instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */
    instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */


    for (i=0;i<h/2;i++) {
        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);
        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);

        for (j=0;j<w/16;j++) {

            y0 = vec_ldl (0,y1i);
            y1 = vec_ldl (0,y2i);
            uivP = (vector unsigned char *)ui;
            vivP = (vector unsigned char *)vi;

            align_perm = vec_lvsl (0, ui);
            u  = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);

            align_perm = vec_lvsl (0, vi);
            v  = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
            u  = (vector signed char)
                 vec_sub (u,(vector signed char)
                          vec_splat((vector signed char){128},0));

            v  = (vector signed char)
                 vec_sub (v, (vector signed char)
                          vec_splat((vector signed char){128},0));

            U  = vec_unpackh (u);
            V  = vec_unpackh (v);


            Y0 = vec_unh (y0);
            Y1 = vec_unl (y0);
            Y2 = vec_unh (y1);
            Y3 = vec_unl (y1);

            Y0 = vec_mradds (Y0, lCY, lOY);
            Y1 = vec_mradds (Y1, lCY, lOY);
            Y2 = vec_mradds (Y2, lCY, lOY);
            Y3 = vec_mradds (Y3, lCY, lOY);

            /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */
            ux = vec_sl (U, lCSHIFT);
            ux = vec_mradds (ux, lCBU, (vector signed short){0});
            ux0  = vec_mergeh (ux,ux);
            ux1  = vec_mergel (ux,ux);

            /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */
            vx = vec_sl (V, lCSHIFT);
            vx = vec_mradds (vx, lCRV, (vector signed short){0});
            vx0  = vec_mergeh (vx,vx);
            vx1  = vec_mergel (vx,vx);
            /* uvx = ((CGU*u) + (CGV*v))>>15 */
            uvx = vec_mradds (U, lCGU, (vector signed short){0});
            uvx = vec_mradds (V, lCGV, uvx);
            uvx0 = vec_mergeh (uvx,uvx);
            uvx1 = vec_mergel (uvx,uvx);
            R0 = vec_add (Y0,vx0);
            G0 = vec_add (Y0,uvx0);
            B0 = vec_add (Y0,ux0);
            R1 = vec_add (Y1,vx1);
            G1 = vec_add (Y1,uvx1);
            B1 = vec_add (Y1,ux1);
            R  = vec_packclp (R0,R1);
            G  = vec_packclp (G0,G1);
            B  = vec_packclp (B0,B1);

            out_argb(R,G,B,oute);
            R0 = vec_add (Y2,vx0);
            G0 = vec_add (Y2,uvx0);
            B0 = vec_add (Y2,ux0);
            R1 = vec_add (Y3,vx1);
            G1 = vec_add (Y3,uvx1);
            B1 = vec_add (Y3,ux1);
            R  = vec_packclp (R0,R1);
            G  = vec_packclp (G0,G1);
            B  = vec_packclp (B0,B1);

            out_argb(R,G,B,outo);
            y1i  += 16;
            y2i  += 16;
            ui   += 8;
            vi   += 8;

        }

        outo  += (outstrides[0])>>4;
        oute  += (outstrides[0])>>4;

        ui    += instrides_scl[1];
        vi    += instrides_scl[2];
        y1i   += instrides_scl[0];
        y2i   += instrides_scl[0];
    }
    return srcSliceH;
}

#endif


DEFCSP420_CVT (yuv2_rgba, out_rgba)
DEFCSP420_CVT (yuv2_argb, out_argb)
DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)


// uyvy|uyvy|uyvy|uyvy
// 0123 4567 89ab cdef
static
const vector unsigned char
    demux_u = {0x10,0x00,0x10,0x00,
               0x10,0x04,0x10,0x04,
               0x10,0x08,0x10,0x08,
               0x10,0x0c,0x10,0x0c},
    demux_v = {0x10,0x02,0x10,0x02,
               0x10,0x06,0x10,0x06,
               0x10,0x0A,0x10,0x0A,
               0x10,0x0E,0x10,0x0E},
    demux_y = {0x10,0x01,0x10,0x03,
               0x10,0x05,0x10,0x07,
               0x10,0x09,0x10,0x0B,
               0x10,0x0D,0x10,0x0F};
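
/* The demux tables above turn one 16-byte UYVY vector into three vectors of
 * eight 16-bit samples: index 0x10 selects a byte from the zero vector passed
 * as the second vec_perm operand, so each selected U, V or Y byte ends up in
 * the low half of a 16-bit lane with a zero high byte, i.e. the packed 8-bit
 * samples are zero-extended to 16 bits. */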

/*
  this is so I can play live CCIR raw video
*/
static int altivec_uyvy_rgb32 (SwsContext *c,
                               unsigned char **in, int *instrides,
                               int srcSliceY,        int srcSliceH,
                               unsigned char **oplanes, int *outstrides)
{
    int w = c->srcW;
    int h = srcSliceH;
    int i,j;
    vector unsigned char uyvy;
    vector signed   short Y,U,V;
    vector signed   short R0,G0,B0,R1,G1,B1;
    vector unsigned char  R,G,B;
    vector unsigned char *out;
    ubyte *img;

    img = in[0];
    out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);

    for (i=0;i<h;i++) {
        for (j=0;j<w/16;j++) {
            uyvy = vec_ld (0, img);
            U = (vector signed short)
                vec_perm (uyvy, (vector unsigned char){0}, demux_u);

            V = (vector signed short)
                vec_perm (uyvy, (vector unsigned char){0}, demux_v);

            Y = (vector signed short)
                vec_perm (uyvy, (vector unsigned char){0}, demux_y);

            cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);

            uyvy = vec_ld (16, img);
            U = (vector signed short)
                vec_perm (uyvy, (vector unsigned char){0}, demux_u);

            V = (vector signed short)
                vec_perm (uyvy, (vector unsigned char){0}, demux_v);

            Y = (vector signed short)
                vec_perm (uyvy, (vector unsigned char){0}, demux_y);

            cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);

            R  = vec_packclp (R0,R1);
            G  = vec_packclp (G0,G1);
            B  = vec_packclp (B0,B1);

            //      vec_mstbgr24 (R,G,B, out);
            out_rgba (R,G,B,out);

            img += 32;
        }
    }
    return srcSliceH;
}



/* OK, currently the acceleration routine only supports
   inputs whose width is a multiple of 16
   and whose height is a multiple of 2.

   For anything else we just fall back to the C code.
*/
SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c)
{
    if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
        return NULL;

    /*
      This seems not to matter too much; I tried a bunch of videos with
      abnormal widths and MPlayer crashes elsewhere, e.g.
          mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
      goes boom with an X11 bad match.
    */
    if ((c->srcW & 0xf) != 0)    return NULL;

    switch (c->srcFormat) {
    case PIX_FMT_YUV410P:
    case PIX_FMT_YUV420P:
    /*case IMGFMT_CLPL:        ??? */
    case PIX_FMT_GRAY8:
    case PIX_FMT_NV12:
    case PIX_FMT_NV21:
        if ((c->srcH & 0x1) != 0)
            return NULL;

        switch(c->dstFormat) {
        case PIX_FMT_RGB24:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
            return altivec_yuv2_rgb24;
        case PIX_FMT_BGR24:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
            return altivec_yuv2_bgr24;
        case PIX_FMT_ARGB:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
            return altivec_yuv2_argb;
        case PIX_FMT_ABGR:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
            return altivec_yuv2_abgr;
        case PIX_FMT_RGBA:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
            return altivec_yuv2_rgba;
        case PIX_FMT_BGRA:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
            return altivec_yuv2_bgra;
        default: return NULL;
        }
        break;

    case PIX_FMT_UYVY422:
        switch(c->dstFormat) {
        case PIX_FMT_BGR32:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
            return altivec_uyvy_rgb32;
        default: return NULL;
        }
        break;

    }
    return NULL;
}
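
/* Usage sketch (hypothetical caller, for illustration only): the generic
 * initialisation code is expected to try this routine first and fall back to
 * the C converters when it returns NULL, along the lines of
 *
 *     SwsFunc f = ff_yuv2rgb_init_altivec(c);
 *     if (!f)
 *         f = select_c_yuv2rgb(c);   // placeholder name for the C fallback
 */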

void ff_yuv2rgb_init_tables_altivec(SwsContext *c, const int inv_table[4], int brightness, int contrast, int saturation)
{
    union {
        DECLARE_ALIGNED(16, signed short, tmp)[8];
        vector signed short vec;
    } buf;

    buf.tmp[0] =  ((0xffffLL) * contrast>>8)>>9;                        //cy
    buf.tmp[1] =  -256*brightness;                                      //oy
    buf.tmp[2] =  (inv_table[0]>>3) *(contrast>>16)*(saturation>>16);   //crv
    buf.tmp[3] =  (inv_table[1]>>3) *(contrast>>16)*(saturation>>16);   //cbu
    buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16));  //cgu
    buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16));  //cgv


    c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
    c->CY   = vec_splat ((vector signed short)buf.vec, 0);
    c->OY   = vec_splat ((vector signed short)buf.vec, 1);
    c->CRV  = vec_splat ((vector signed short)buf.vec, 2);
    c->CBU  = vec_splat ((vector signed short)buf.vec, 3);
    c->CGU  = vec_splat ((vector signed short)buf.vec, 4);
    c->CGV  = vec_splat ((vector signed short)buf.vec, 5);
    return;
}


void
ff_yuv2packedX_altivec(SwsContext *c,
                       const int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                       const int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                       uint8_t *dest, int dstW, int dstY)
{
    int i,j;
    vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
    vector signed short R0,G0,B0,R1,G1,B1;

    vector unsigned char R,G,B;
    vector unsigned char *out,*nout;

    vector signed short   RND = vec_splat_s16(1<<3);
    vector unsigned short SCL = vec_splat_u16(4);
    DECLARE_ALIGNED(16, unsigned long, scratch)[16];

    vector signed short *YCoeffs, *CCoeffs;

    YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
    CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;

    out = (vector unsigned char *)dest;

    for (i=0; i<dstW; i+=16) {
        Y0 = RND;
        Y1 = RND;
        /* extract 16 coeffs from lumSrc */
        for (j=0; j<lumFilterSize; j++) {
            X0 = vec_ld (0,  &lumSrc[j][i]);
            X1 = vec_ld (16, &lumSrc[j][i]);
            Y0 = vec_mradds (X0, YCoeffs[j], Y0);
            Y1 = vec_mradds (X1, YCoeffs[j], Y1);
        }

        U = RND;
        V = RND;
        /* extract 8 coeffs from U,V */
        for (j=0; j<chrFilterSize; j++) {
            X  = vec_ld (0, &chrSrc[j][i/2]);
            U  = vec_mradds (X, CCoeffs[j], U);
            X  = vec_ld (0, &chrSrc[j][i/2+2048]);
            V  = vec_mradds (X, CCoeffs[j], V);
        }

        /* scale and clip signals */
        Y0 = vec_sra (Y0, SCL);
        Y1 = vec_sra (Y1, SCL);
        U  = vec_sra (U,  SCL);
        V  = vec_sra (V,  SCL);

        Y0 = vec_clip_s16 (Y0);
        Y1 = vec_clip_s16 (Y1);
        U  = vec_clip_s16 (U);
        V  = vec_clip_s16 (V);

        /* now we have
           Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
           U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7

           Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
           U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
           V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
        */

        U0 = vec_mergeh (U,U);
        V0 = vec_mergeh (V,V);

        U1 = vec_mergel (U,U);
        V1 = vec_mergel (V,V);

        cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
        cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);

        R  = vec_packclp (R0,R1);
        G  = vec_packclp (G0,G1);
        B  = vec_packclp (B0,B1);

        switch(c->dstFormat) {
        case PIX_FMT_ABGR:  out_abgr  (R,G,B,out); break;
        case PIX_FMT_BGRA:  out_bgra  (R,G,B,out); break;
        case PIX_FMT_RGBA:  out_rgba  (R,G,B,out); break;
        case PIX_FMT_ARGB:  out_argb  (R,G,B,out); break;
        case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
        case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
        default:
            {
                /* If this is reached, the caller should have called yuv2packedXinC
                   instead. */
                static int printed_error_message;
                if (!printed_error_message) {
                    av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
                           sws_format_name(c->dstFormat));
                    printed_error_message=1;
                }
                return;
            }
        }
    }

    if (i < dstW) {
        i -= 16;

        Y0 = RND;
        Y1 = RND;
        /* extract 16 coeffs from lumSrc */
        for (j=0; j<lumFilterSize; j++) {
            X0 = vec_ld (0,  &lumSrc[j][i]);
            X1 = vec_ld (16, &lumSrc[j][i]);
            Y0 = vec_mradds (X0, YCoeffs[j], Y0);
            Y1 = vec_mradds (X1, YCoeffs[j], Y1);
        }

        U = RND;
        V = RND;
        /* extract 8 coeffs from U,V */
        for (j=0; j<chrFilterSize; j++) {
            X  = vec_ld (0, &chrSrc[j][i/2]);
            U  = vec_mradds (X, CCoeffs[j], U);
            X  = vec_ld (0, &chrSrc[j][i/2+2048]);
            V  = vec_mradds (X, CCoeffs[j], V);
        }

        /* scale and clip signals */
        Y0 = vec_sra (Y0, SCL);
        Y1 = vec_sra (Y1, SCL);
        U  = vec_sra (U,  SCL);
        V  = vec_sra (V,  SCL);

        Y0 = vec_clip_s16 (Y0);
        Y1 = vec_clip_s16 (Y1);
        U  = vec_clip_s16 (U);
        V  = vec_clip_s16 (V);

        /* now we have
           Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
           U = u0 u1 u2 u3 u4 u5 u6 u7     V = v0 v1 v2 v3 v4 v5 v6 v7

           Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
           U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
           V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
        */

        U0 = vec_mergeh (U,U);
        V0 = vec_mergeh (V,V);

        U1 = vec_mergel (U,U);
        V1 = vec_mergel (V,V);

        cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
        cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);

        R  = vec_packclp (R0,R1);
        G  = vec_packclp (G0,G1);
        B  = vec_packclp (B0,B1);

        nout = (vector unsigned char *)scratch;
        switch(c->dstFormat) {
        case PIX_FMT_ABGR:  out_abgr  (R,G,B,nout); break;
        case PIX_FMT_BGRA:  out_bgra  (R,G,B,nout); break;
        case PIX_FMT_RGBA:  out_rgba  (R,G,B,nout); break;
        case PIX_FMT_ARGB:  out_argb  (R,G,B,nout); break;
        case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
        case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
        default:
            /* Unreachable, I think. */
            av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
                   sws_format_name(c->dstFormat));
            return;
        }

        memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
    }

}