/*
 * AltiVec acceleration for colorspace conversion
 *
 * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
Convert I420 YV12 to RGB in various formats.
  It rejects images that are not in a 420 format,
  it rejects images whose width is not a multiple of 16,
  it rejects images whose height is not a multiple of 2.
Rejected images fall back to the C code.

Lots of optimizations to be done here.

1. Need to fix saturation code. I just couldn't get it to fly with packs
   and adds, so we currently use max/min to clip.

2. The inefficient use of chroma loading needs a bit of brushing up.

3. Analysis of pipeline stalls needs to be done. Use Shark to identify
   pipeline stalls.


MODIFIED to calculate coeffs from the currently selected color space.
MODIFIED core to be a macro where you specify the output format.
ADDED UYVY conversion which is never called due to something in swscale.
CORRECTED algorithm selection to be strict on input formats.
ADDED runtime detection of AltiVec.

ADDED altivec_yuv2packedX vertical scale + RGB converter

March 27, 2004
PERFORMANCE ANALYSIS

The C version uses 25% of the processor, or ~250 MIPS, for D1 rawvideo
used as the test sequence.
The AltiVec version uses 10% of the processor, or ~100 MIPS, for the
same sequence.

720 * 480 * 30  ~10 Mpixel/s

so we have roughly 10 clocks per pixel. This is too high; something has
to be wrong.

OPTIMIZED clip code to use vec_max and vec_packs, removing the
need for vec_min.

OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
the input video frame: it was just decompressed, so it probably resides in the
L1 cache. However, we are creating the output video stream. This needs to use
the DSTST instruction to optimize for the cache. We couple this with the fact
that we are not going to visit the input buffer again, so we mark it Least
Recently Used. This shaves 25% of the processor cycles off.

Now memcpy is the largest MIPS consumer in the system, probably due
to the inefficient X11 stuff.

GL libraries seem to be very slow on this machine (a 1.33 GHz PowerBook
running Jaguar); this is not the case for my 1 GHz PowerBook. I thought it
might be a versioning issue, however I have libGL.1.2.dylib on both
machines. (We need to figure this out now.)

GL2 libraries now work with the patch for RGB32.

NOTE: the quartz vo driver's ARGB32_to_RGB24 conversion consumes 30% of the
processor.

Integrated luma prescaling for saturation/contrast/brightness adjustment.
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <assert.h>
#include "config.h"
#include "libswscale/rgb2rgb.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"

#undef PROFILE_THE_BEAST
#undef INC_SCALING

typedef unsigned char ubyte;
typedef signed char   sbyte;


/* RGB interleaver: 16 planar pels, 8-bit samples per channel, held in the
   homogeneous vector registers x0, x1, x2, are interleaved with the
   following technique:

      o0 = vec_mergeh (x0,x1);
      o1 = vec_perm (o0, x2, perm_rgb_0);
      o2 = vec_perm (o0, x2, perm_rgb_1);
      o3 = vec_mergel (x0,x1);
      o4 = vec_perm (o3,o2,perm_rgb_2);
      o5 = vec_perm (o3,o2,perm_rgb_3);

  perm_rgb_0:   o0(RG).h v1(B) --> o1*
              0   1  2   3   4
             rgbr|gbrg|brgb|rgbr
             0010 0100 1001 0010
             0102 3145 2673 894A

  perm_rgb_1:   o0(RG).h v1(B) --> o2
              0   1  2   3   4
             gbrg|brgb|bbbb|bbbb
             0100 1001 1111 1111
             B5CD 6EF7 89AB CDEF

  perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
              0   1  2   3   4
             gbrg|brgb|rgbr|gbrg
             1111 1111 0010 0100
             89AB CDEF 0182 3945

  perm_rgb_3:   o3(RG).l o2(rgbB.l) --> o5*
              0   1  2   3   4
             brgb|rgbr|gbrg|brgb
             1001 0010 0100 1001
             a67b 89cA BdCD eEFf

*/
static
const vector unsigned char
  perm_rgb_0 = {0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
                0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a},
  perm_rgb_1 = {0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
                0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f},
  perm_rgb_2 = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
                0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05},
  perm_rgb_3 = {0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
                0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f};
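
/* vec_perm() indexes into the 32-byte concatenation of its two sources:
 * indices 0x00-0x0f select bytes from the first operand and 0x10-0x1f from
 * the second, which is how the B samples (and, in the second pass, the
 * partially interleaved bytes in o2) are spliced in between the merged
 * R/G pairs. */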

#define vec_merge3(x2,x1,x0,y0,y1,y2)       \
do {                                        \
    __typeof__(x0) o0,o2,o3;                \
        o0 = vec_mergeh (x0,x1);            \
        y0 = vec_perm (o0, x2, perm_rgb_0); \
        o2 = vec_perm (o0, x2, perm_rgb_1); \
        o3 = vec_mergel (x0,x1);            \
        y1 = vec_perm (o3,o2,perm_rgb_2);   \
        y2 = vec_perm (o3,o2,perm_rgb_3);   \
} while(0)

#define vec_mstbgr24(x0,x1,x2,ptr)      \
do {                                    \
    __typeof__(x0) _0,_1,_2;            \
    vec_merge3 (x0,x1,x2,_0,_1,_2);     \
    vec_st (_0, 0, ptr++);              \
    vec_st (_1, 0, ptr++);              \
    vec_st (_2, 0, ptr++);              \
}  while (0)

#define vec_mstrgb24(x0,x1,x2,ptr)      \
do {                                    \
    __typeof__(x0) _0,_1,_2;            \
    vec_merge3 (x2,x1,x0,_0,_1,_2);     \
    vec_st (_0, 0, ptr++);              \
    vec_st (_1, 0, ptr++);              \
    vec_st (_2, 0, ptr++);              \
}  while (0)

/* pack the pixels in rgb0 format
   msb R
   lsb 0
*/
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                       \
do {                                                                          \
    T _0,_1,_2,_3;                                                            \
    _0 = vec_mergeh (x0,x1);                                                  \
    _1 = vec_mergeh (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 0*16, (T *)ptr);                                              \
    vec_st (_3, 1*16, (T *)ptr);                                              \
    _0 = vec_mergel (x0,x1);                                                  \
    _1 = vec_mergel (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 2*16, (T *)ptr);                                              \
    vec_st (_3, 3*16, (T *)ptr);                                              \
    ptr += 4;                                                                 \
}  while (0)
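
/* Each invocation of vec_mstrgb32 therefore emits four 16-byte vectors, i.e.
 * 64 bytes or 16 four-byte pixels, and advances ptr by four vectors; the
 * 24-bit store macros above emit three vectors (48 bytes) for the same 16
 * pixels. */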

/*

  | 1     0       1.4021   | | Y |
  | 1    -0.3441 -0.7142   |x| Cb|
  | 1     1.7718  0        | | Cr|


  Y:      [-128 127]
  Cb/Cr : [-128 127]

  Typical YUV conversion works on Y in 0-255; this version has been
  optimized for JPEG decoding.

*/
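
/* A minimal scalar sketch of the fixed-point arithmetic performed per sample
 * by cvtyuvtoRGB() below, using the coefficients that
 * ff_yuv2rgb_init_tables_altivec() stores in the context (CY, OY, CRV, CBU,
 * CGU, CGV, and the shift CSHIFT = 2).  vec_mradds(a,b,c) computes
 * ((a*b + 0x4000) >> 15) + c with 16-bit saturation; the saturation steps are
 * omitted here, so this illustrates the data flow rather than being a
 * bit-exact replacement.  The function and parameter names are illustrative
 * only and are not used elsewhere in this file. */
static inline void scalar_yuv2rgb_ref(int y, int u, int v,
                                      int cy, int oy, int crv, int cbu,
                                      int cgu, int cgv,
                                      int *r, int *g, int *b)
{
    int luma;

    luma = ((y * cy + 0x4000) >> 15) + oy;     /* contrast/brightness adjusted luma */
    u -= 128;                                  /* chroma samples are biased by 128  */
    v -= 128;

    *b = (((u << 2) * cbu + 0x4000) >> 15) + luma;
    *r = (((v << 2) * crv + 0x4000) >> 15) + luma;
    *g = ((v * cgv + 0x4000) >> 15)
       + ((u * cgu + 0x4000) >> 15) + luma;
}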




#define vec_unh(x) \
    (vector signed short) \
        vec_perm(x,(__typeof__(x)){0}, \
                 ((vector unsigned char){0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
                                         0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07}))
#define vec_unl(x) \
    (vector signed short) \
        vec_perm(x,(__typeof__(x)){0}, \
                 ((vector unsigned char){0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
                                         0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F}))

#define vec_clip_s16(x) \
    vec_max (vec_min (x, ((vector signed short){235,235,235,235,235,235,235,235})), \
                         ((vector signed short){ 16, 16, 16, 16, 16, 16, 16, 16}))

#define vec_packclp(x,y) \
    (vector unsigned char)vec_packs \
        ((vector unsigned short)vec_max (x,((vector signed short) {0})), \
         (vector unsigned short)vec_max (y,((vector signed short) {0})))
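
/* For reference: vec_clip_s16() clamps each 16-bit lane to the nominal video
 * range [16,235] (used on the vertically scaled Y/U/V in
 * ff_yuv2packedX_altivec), while vec_packclp() clamps negatives to zero and
 * packs two vectors of signed shorts into one vector of unsigned bytes with
 * unsigned saturation, i.e. roughly out[i] = av_clip_uint8(in[i]). */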

//#define out_pixels(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,a,a,ptr)


static inline void cvtyuvtoRGB (SwsContext *c,
                                vector signed short Y, vector signed short U, vector signed short V,
                                vector signed short *R, vector signed short *G, vector signed short *B)
{
    vector signed   short vx,ux,uvx;

    Y = vec_mradds (Y, c->CY, c->OY);
    U  = vec_sub (U,(vector signed short)
                    vec_splat((vector signed short){128},0));
    V  = vec_sub (V,(vector signed short)
                    vec_splat((vector signed short){128},0));

    //   ux  = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
    ux = vec_sl (U, c->CSHIFT);
    *B = vec_mradds (ux, c->CBU, Y);

    // vx  = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
    vx = vec_sl (V, c->CSHIFT);
    *R = vec_mradds (vx, c->CRV, Y);

    // uvx = ((CGU*u) + (CGV*v))>>15;
    uvx = vec_mradds (U, c->CGU, Y);
    *G  = vec_mradds (V, c->CGV, uvx);
}


/*
  ------------------------------------------------------------------------------
  CS converters
  ------------------------------------------------------------------------------
*/


#define DEFCSP420_CVT(name,out_pixels)                                  \
static int altivec_##name (SwsContext *c,                               \
                           unsigned char **in, int *instrides,          \
                           int srcSliceY,        int srcSliceH,         \
                           unsigned char **oplanes, int *outstrides)    \
{                                                                       \
    int w = c->srcW;                                                    \
    int h = srcSliceH;                                                  \
    int i,j;                                                            \
    int instrides_scl[3];                                               \
    vector unsigned char y0,y1;                                         \
                                                                        \
    vector signed char  u,v;                                            \
                                                                        \
    vector signed short Y0,Y1,Y2,Y3;                                    \
    vector signed short U,V;                                            \
    vector signed short vx,ux,uvx;                                      \
    vector signed short vx0,ux0,uvx0;                                   \
    vector signed short vx1,ux1,uvx1;                                   \
    vector signed short R0,G0,B0;                                       \
    vector signed short R1,G1,B1;                                       \
    vector unsigned char R,G,B;                                         \
                                                                        \
    vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP;                  \
    vector unsigned char align_perm;                                    \
                                                                        \
    vector signed short                                                 \
        lCY  = c->CY,                                                   \
        lOY  = c->OY,                                                   \
        lCRV = c->CRV,                                                  \
        lCBU = c->CBU,                                                  \
        lCGU = c->CGU,                                                  \
        lCGV = c->CGV;                                                  \
                                                                        \
    vector unsigned short lCSHIFT = c->CSHIFT;                          \
                                                                        \
    ubyte *y1i   = in[0];                                               \
    ubyte *y2i   = in[0]+instrides[0];                                  \
    ubyte *ui    = in[1];                                               \
    ubyte *vi    = in[2];                                               \
                                                                        \
    vector unsigned char *oute                                          \
        = (vector unsigned char *)                                      \
            (oplanes[0]+srcSliceY*outstrides[0]);                       \
    vector unsigned char *outo                                          \
        = (vector unsigned char *)                                      \
            (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);         \
                                                                        \
                                                                        \
    instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */ \
    instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */    \
    instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */    \
                                                                        \
                                                                        \
    for (i=0;i<h/2;i++) {                                               \
        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);          \
        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);          \
                                                                        \
        for (j=0;j<w/16;j++) {                                          \
                                                                        \
            y1ivP = (vector unsigned char *)y1i;                        \
            y2ivP = (vector unsigned char *)y2i;                        \
            uivP  = (vector unsigned char *)ui;                         \
            vivP  = (vector unsigned char *)vi;                         \
                                                                        \
            align_perm = vec_lvsl (0, y1i);                             \
            y0 = (vector unsigned char)                                 \
                 vec_perm (y1ivP[0], y1ivP[1], align_perm);             \
                                                                        \
            align_perm = vec_lvsl (0, y2i);                             \
            y1 = (vector unsigned char)                                 \
                 vec_perm (y2ivP[0], y2ivP[1], align_perm);             \
                                                                        \
            align_perm = vec_lvsl (0, ui);                              \
            u = (vector signed char)                                    \
                vec_perm (uivP[0], uivP[1], align_perm);                \
                                                                        \
            align_perm = vec_lvsl (0, vi);                              \
            v = (vector signed char)                                    \
                vec_perm (vivP[0], vivP[1], align_perm);                \
                                                                        \
            u  = (vector signed char)                                   \
                 vec_sub (u,(vector signed char)                        \
                          vec_splat((vector signed char){128},0));      \
            v  = (vector signed char)                                   \
                 vec_sub (v,(vector signed char)                        \
                          vec_splat((vector signed char){128},0));      \
                                                                        \
            U  = vec_unpackh (u);                                       \
            V  = vec_unpackh (v);                                       \
                                                                        \
                                                                        \
            Y0 = vec_unh (y0);                                          \
            Y1 = vec_unl (y0);                                          \
            Y2 = vec_unh (y1);                                          \
            Y3 = vec_unl (y1);                                          \
                                                                        \
            Y0 = vec_mradds (Y0, lCY, lOY);                             \
            Y1 = vec_mradds (Y1, lCY, lOY);                             \
            Y2 = vec_mradds (Y2, lCY, lOY);                             \
            Y3 = vec_mradds (Y3, lCY, lOY);                             \
                                                                        \
            /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                  \
            ux = vec_sl (U, lCSHIFT);                                   \
            ux = vec_mradds (ux, lCBU, (vector signed short){0});       \
            ux0  = vec_mergeh (ux,ux);                                  \
            ux1  = vec_mergel (ux,ux);                                  \
                                                                        \
            /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */            \
            vx = vec_sl (V, lCSHIFT);                                   \
            vx = vec_mradds (vx, lCRV, (vector signed short){0});       \
            vx0  = vec_mergeh (vx,vx);                                  \
            vx1  = vec_mergel (vx,vx);                                  \
                                                                        \
            /* uvx = ((CGU*u) + (CGV*v))>>15 */                         \
            uvx = vec_mradds (U, lCGU, (vector signed short){0});       \
            uvx = vec_mradds (V, lCGV, uvx);                            \
            uvx0 = vec_mergeh (uvx,uvx);                                \
            uvx1 = vec_mergel (uvx,uvx);                                \
                                                                        \
            R0 = vec_add (Y0,vx0);                                      \
            G0 = vec_add (Y0,uvx0);                                     \
            B0 = vec_add (Y0,ux0);                                      \
            R1 = vec_add (Y1,vx1);                                      \
            G1 = vec_add (Y1,uvx1);                                     \
            B1 = vec_add (Y1,ux1);                                      \
                                                                        \
            R  = vec_packclp (R0,R1);                                   \
            G  = vec_packclp (G0,G1);                                   \
            B  = vec_packclp (B0,B1);                                   \
                                                                        \
            out_pixels(R,G,B,oute);                                     \
                                                                        \
            R0 = vec_add (Y2,vx0);                                      \
            G0 = vec_add (Y2,uvx0);                                     \
            B0 = vec_add (Y2,ux0);                                      \
            R1 = vec_add (Y3,vx1);                                      \
            G1 = vec_add (Y3,uvx1);                                     \
            B1 = vec_add (Y3,ux1);                                      \
            R  = vec_packclp (R0,R1);                                   \
            G  = vec_packclp (G0,G1);                                   \
            B  = vec_packclp (B0,B1);                                   \
                                                                        \
                                                                        \
            out_pixels(R,G,B,outo);                                     \
                                                                        \
            y1i  += 16;                                                 \
            y2i  += 16;                                                 \
            ui   += 8;                                                  \
            vi   += 8;                                                  \
                                                                        \
        }                                                               \
                                                                        \
        outo  += (outstrides[0])>>4;                                    \
        oute  += (outstrides[0])>>4;                                    \
                                                                        \
        ui    += instrides_scl[1];                                      \
        vi    += instrides_scl[2];                                      \
        y1i   += instrides_scl[0];                                      \
        y2i   += instrides_scl[0];                                      \
    }                                                                   \
    return srcSliceH;                                                   \
}
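
/* Each function generated by DEFCSP420_CVT consumes two source rows per outer
 * iteration: every inner iteration reads 16 luma samples from each of the two
 * rows plus 8 shared U and 8 shared V samples, and produces two rows of 16
 * output pixels through the out_pixels store macro chosen below. */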


#define out_abgr(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),c,b,a,ptr)
#define out_bgra(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),c,b,a,((__typeof__ (a)){255}),ptr)
#define out_rgba(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),a,b,c,((__typeof__ (a)){255}),ptr)
#define out_argb(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,b,c,ptr)
#define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
#define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)

DEFCSP420_CVT (yuv2_abgr, out_abgr)
#if 1
DEFCSP420_CVT (yuv2_bgra, out_bgra)
#else
static int altivec_yuv2_bgra32 (SwsContext *c,
                                unsigned char **in, int *instrides,
                                int srcSliceY,        int srcSliceH,
                                unsigned char **oplanes, int *outstrides)
{
    int w = c->srcW;
    int h = srcSliceH;
    int i,j;
    int instrides_scl[3];
    vector unsigned char y0,y1;

    vector signed char  u,v;

    vector signed short Y0,Y1,Y2,Y3;
    vector signed short U,V;
    vector signed short vx,ux,uvx;
    vector signed short vx0,ux0,uvx0;
    vector signed short vx1,ux1,uvx1;
    vector signed short R0,G0,B0;
    vector signed short R1,G1,B1;
    vector unsigned char R,G,B;

    vector unsigned char *uivP, *vivP;
    vector unsigned char align_perm;

    vector signed short
        lCY  = c->CY,
        lOY  = c->OY,
        lCRV = c->CRV,
        lCBU = c->CBU,
        lCGU = c->CGU,
        lCGV = c->CGV;

    vector unsigned short lCSHIFT = c->CSHIFT;

    ubyte *y1i   = in[0];
    ubyte *y2i   = in[0]+w;
    ubyte *ui    = in[1];
    ubyte *vi    = in[2];

    vector unsigned char *oute
        = (vector unsigned char *)
          (oplanes[0]+srcSliceY*outstrides[0]);
    vector unsigned char *outo
        = (vector unsigned char *)
          (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);


    instrides_scl[0] = instrides[0];
    instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */
    instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */


    for (i=0;i<h/2;i++) {
        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);
        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);

        for (j=0;j<w/16;j++) {

            y0 = vec_ldl (0,y1i);
            y1 = vec_ldl (0,y2i);
            uivP = (vector unsigned char *)ui;
            vivP = (vector unsigned char *)vi;

            align_perm = vec_lvsl (0, ui);
            u  = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);

            align_perm = vec_lvsl (0, vi);
            v  = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
            u  = (vector signed char)
                 vec_sub (u,(vector signed char)
                          vec_splat((vector signed char){128},0));

            v  = (vector signed char)
                 vec_sub (v, (vector signed char)
                          vec_splat((vector signed char){128},0));

            U  = vec_unpackh (u);
            V  = vec_unpackh (v);


            Y0 = vec_unh (y0);
            Y1 = vec_unl (y0);
            Y2 = vec_unh (y1);
            Y3 = vec_unl (y1);

            Y0 = vec_mradds (Y0, lCY, lOY);
            Y1 = vec_mradds (Y1, lCY, lOY);
            Y2 = vec_mradds (Y2, lCY, lOY);
            Y3 = vec_mradds (Y3, lCY, lOY);

            /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */
            ux = vec_sl (U, lCSHIFT);
            ux = vec_mradds (ux, lCBU, (vector signed short){0});
            ux0  = vec_mergeh (ux,ux);
            ux1  = vec_mergel (ux,ux);

            /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */
            vx = vec_sl (V, lCSHIFT);
            vx = vec_mradds (vx, lCRV, (vector signed short){0});
            vx0  = vec_mergeh (vx,vx);
            vx1  = vec_mergel (vx,vx);
            /* uvx = ((CGU*u) + (CGV*v))>>15 */
            uvx = vec_mradds (U, lCGU, (vector signed short){0});
            uvx = vec_mradds (V, lCGV, uvx);
            uvx0 = vec_mergeh (uvx,uvx);
            uvx1 = vec_mergel (uvx,uvx);
            R0 = vec_add (Y0,vx0);
            G0 = vec_add (Y0,uvx0);
            B0 = vec_add (Y0,ux0);
            R1 = vec_add (Y1,vx1);
            G1 = vec_add (Y1,uvx1);
            B1 = vec_add (Y1,ux1);
            R  = vec_packclp (R0,R1);
            G  = vec_packclp (G0,G1);
            B  = vec_packclp (B0,B1);

            out_argb(R,G,B,oute);
            R0 = vec_add (Y2,vx0);
            G0 = vec_add (Y2,uvx0);
            B0 = vec_add (Y2,ux0);
            R1 = vec_add (Y3,vx1);
            G1 = vec_add (Y3,uvx1);
            B1 = vec_add (Y3,ux1);
            R  = vec_packclp (R0,R1);
            G  = vec_packclp (G0,G1);
            B  = vec_packclp (B0,B1);

            out_argb(R,G,B,outo);
            y1i  += 16;
            y2i  += 16;
            ui   += 8;
            vi   += 8;

        }

        outo  += (outstrides[0])>>4;
        oute  += (outstrides[0])>>4;

        ui    += instrides_scl[1];
        vi    += instrides_scl[2];
        y1i   += instrides_scl[0];
        y2i   += instrides_scl[0];
    }
    return srcSliceH;
}

#endif


DEFCSP420_CVT (yuv2_rgba, out_rgba)
DEFCSP420_CVT (yuv2_argb, out_argb)
DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)


// uyvy|uyvy|uyvy|uyvy
// 0123 4567 89ab cdef
static
const vector unsigned char
    demux_u = {0x10,0x00,0x10,0x00,
               0x10,0x04,0x10,0x04,
               0x10,0x08,0x10,0x08,
               0x10,0x0c,0x10,0x0c},
    demux_v = {0x10,0x02,0x10,0x02,
               0x10,0x06,0x10,0x06,
               0x10,0x0A,0x10,0x0A,
               0x10,0x0E,0x10,0x0E},
    demux_y = {0x10,0x01,0x10,0x03,
               0x10,0x05,0x10,0x07,
               0x10,0x09,0x10,0x0B,
               0x10,0x0D,0x10,0x0F};
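
/* The demux tables above turn one 16-byte UYVY vector into three vectors of
 * eight 16-bit samples: index 0x10 selects a byte from the zero vector passed
 * as the second vec_perm operand, so each selected U, V or Y byte ends up in
 * the low half of a 16-bit lane with a zero high byte, i.e. the packed 8-bit
 * samples are zero-extended to 16 bits. */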

/*
  this is so I can play live CCIR raw video
*/
static int altivec_uyvy_rgb32 (SwsContext *c,
                               unsigned char **in, int *instrides,
                               int srcSliceY,        int srcSliceH,
                               unsigned char **oplanes, int *outstrides)
{
    int w = c->srcW;
    int h = srcSliceH;
    int i,j;
    vector unsigned char uyvy;
    vector signed   short Y,U,V;
    vector signed   short R0,G0,B0,R1,G1,B1;
    vector unsigned char  R,G,B;
    vector unsigned char *out;
    ubyte *img;

    img = in[0];
    out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);

    for (i=0;i<h;i++) {
        for (j=0;j<w/16;j++) {
            uyvy = vec_ld (0, img);
            U = (vector signed short)
                vec_perm (uyvy, (vector unsigned char){0}, demux_u);

            V = (vector signed short)
                vec_perm (uyvy, (vector unsigned char){0}, demux_v);

            Y = (vector signed short)
                vec_perm (uyvy, (vector unsigned char){0}, demux_y);

            cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);

            uyvy = vec_ld (16, img);
            U = (vector signed short)
                vec_perm (uyvy, (vector unsigned char){0}, demux_u);

            V = (vector signed short)
                vec_perm (uyvy, (vector unsigned char){0}, demux_v);

            Y = (vector signed short)
                vec_perm (uyvy, (vector unsigned char){0}, demux_y);

            cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);

            R  = vec_packclp (R0,R1);
            G  = vec_packclp (G0,G1);
            B  = vec_packclp (B0,B1);

            //      vec_mstbgr24 (R,G,B, out);
            out_rgba (R,G,B,out);

            img += 32;
        }
    }
    return srcSliceH;
}



/* OK, currently the acceleration routine only supports
   inputs whose width is a multiple of 16
   and whose height is a multiple of 2.

   For anything else we just fall back to the C code.
*/
SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c)
{
    if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
        return NULL;

    /*
      This seems not to matter too much; I tried a bunch of videos with
      abnormal widths and MPlayer crashes elsewhere, e.g.
          mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
      goes boom with an X11 bad match.
    */
    if ((c->srcW & 0xf) != 0)    return NULL;

    switch (c->srcFormat) {
    case PIX_FMT_YUV410P:
    case PIX_FMT_YUV420P:
    /*case IMGFMT_CLPL:        ??? */
    case PIX_FMT_GRAY8:
    case PIX_FMT_NV12:
    case PIX_FMT_NV21:
        if ((c->srcH & 0x1) != 0)
            return NULL;

        switch(c->dstFormat) {
        case PIX_FMT_RGB24:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
            return altivec_yuv2_rgb24;
        case PIX_FMT_BGR24:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
            return altivec_yuv2_bgr24;
        case PIX_FMT_ARGB:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
            return altivec_yuv2_argb;
        case PIX_FMT_ABGR:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
            return altivec_yuv2_abgr;
        case PIX_FMT_RGBA:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
            return altivec_yuv2_rgba;
        case PIX_FMT_BGRA:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
            return altivec_yuv2_bgra;
        default: return NULL;
        }
        break;

    case PIX_FMT_UYVY422:
        switch(c->dstFormat) {
        case PIX_FMT_BGR32:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
            return altivec_uyvy_rgb32;
        default: return NULL;
        }
        break;

    }
    return NULL;
}
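
/* Usage sketch (hypothetical caller, for illustration only): the generic
 * initialisation code is expected to try this routine first and fall back to
 * the C converters when it returns NULL, along the lines of
 *
 *     SwsFunc f = ff_yuv2rgb_init_altivec(c);
 *     if (!f)
 *         f = select_c_yuv2rgb(c);   // placeholder name for the C fallback
 */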

void ff_yuv2rgb_init_tables_altivec(SwsContext *c, const int inv_table[4], int brightness, int contrast, int saturation)
{
    union {
        DECLARE_ALIGNED(16, signed short, tmp)[8];
        vector signed short vec;
    } buf;

    buf.tmp[0] =  ((0xffffLL) * contrast>>8)>>9;                        //cy
    buf.tmp[1] =  -256*brightness;                                      //oy
    buf.tmp[2] =  (inv_table[0]>>3) *(contrast>>16)*(saturation>>16);   //crv
    buf.tmp[3] =  (inv_table[1]>>3) *(contrast>>16)*(saturation>>16);   //cbu
    buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16));  //cgu
    buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16));  //cgv


    c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
    c->CY   = vec_splat ((vector signed short)buf.vec, 0);
    c->OY   = vec_splat ((vector signed short)buf.vec, 1);
    c->CRV  = vec_splat ((vector signed short)buf.vec, 2);
    c->CBU  = vec_splat ((vector signed short)buf.vec, 3);
    c->CGU  = vec_splat ((vector signed short)buf.vec, 4);
    c->CGV  = vec_splat ((vector signed short)buf.vec, 5);
    return;
}


void
ff_yuv2packedX_altivec(SwsContext *c,
                       const int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                       const int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                       uint8_t *dest, int dstW, int dstY)
{
    int i,j;
    vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
    vector signed short R0,G0,B0,R1,G1,B1;

    vector unsigned char R,G,B;
    vector unsigned char *out,*nout;

    vector signed short   RND = vec_splat_s16(1<<3);
    vector unsigned short SCL = vec_splat_u16(4);
    DECLARE_ALIGNED(16, unsigned long, scratch)[16];

    vector signed short *YCoeffs, *CCoeffs;

    YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
    CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;

    out = (vector unsigned char *)dest;

    for (i=0; i<dstW; i+=16) {
        Y0 = RND;
        Y1 = RND;
        /* extract 16 coeffs from lumSrc */
        for (j=0; j<lumFilterSize; j++) {
            X0 = vec_ld (0,  &lumSrc[j][i]);
            X1 = vec_ld (16, &lumSrc[j][i]);
            Y0 = vec_mradds (X0, YCoeffs[j], Y0);
            Y1 = vec_mradds (X1, YCoeffs[j], Y1);
        }

        U = RND;
        V = RND;
        /* extract 8 coeffs from U,V */
        for (j=0; j<chrFilterSize; j++) {
            X  = vec_ld (0, &chrSrc[j][i/2]);
            U  = vec_mradds (X, CCoeffs[j], U);
            X  = vec_ld (0, &chrSrc[j][i/2+2048]);
            V  = vec_mradds (X, CCoeffs[j], V);
        }

        /* scale and clip signals */
        Y0 = vec_sra (Y0, SCL);
        Y1 = vec_sra (Y1, SCL);
        U  = vec_sra (U,  SCL);
        V  = vec_sra (V,  SCL);

        Y0 = vec_clip_s16 (Y0);
        Y1 = vec_clip_s16 (Y1);
        U  = vec_clip_s16 (U);
        V  = vec_clip_s16 (V);

        /* now we have
           Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
           U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7

           Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
           U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
           V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
        */

        U0 = vec_mergeh (U,U);
        V0 = vec_mergeh (V,V);

        U1 = vec_mergel (U,U);
        V1 = vec_mergel (V,V);

        cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
        cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);

        R  = vec_packclp (R0,R1);
        G  = vec_packclp (G0,G1);
        B  = vec_packclp (B0,B1);

        switch(c->dstFormat) {
        case PIX_FMT_ABGR:  out_abgr  (R,G,B,out); break;
        case PIX_FMT_BGRA:  out_bgra  (R,G,B,out); break;
        case PIX_FMT_RGBA:  out_rgba  (R,G,B,out); break;
        case PIX_FMT_ARGB:  out_argb  (R,G,B,out); break;
        case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
        case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
        default:
            {
                /* If this is reached, the caller should have called yuv2packedXinC
                   instead. */
                static int printed_error_message;
                if (!printed_error_message) {
                    av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
                           sws_format_name(c->dstFormat));
                    printed_error_message=1;
                }
                return;
            }
        }
    }

    if (i < dstW) {
        i -= 16;

        Y0 = RND;
        Y1 = RND;
        /* extract 16 coeffs from lumSrc */
        for (j=0; j<lumFilterSize; j++) {
            X0 = vec_ld (0,  &lumSrc[j][i]);
            X1 = vec_ld (16, &lumSrc[j][i]);
            Y0 = vec_mradds (X0, YCoeffs[j], Y0);
            Y1 = vec_mradds (X1, YCoeffs[j], Y1);
        }

        U = RND;
        V = RND;
        /* extract 8 coeffs from U,V */
        for (j=0; j<chrFilterSize; j++) {
            X  = vec_ld (0, &chrSrc[j][i/2]);
            U  = vec_mradds (X, CCoeffs[j], U);
            X  = vec_ld (0, &chrSrc[j][i/2+2048]);
            V  = vec_mradds (X, CCoeffs[j], V);
        }

        /* scale and clip signals */
        Y0 = vec_sra (Y0, SCL);
        Y1 = vec_sra (Y1, SCL);
        U  = vec_sra (U,  SCL);
        V  = vec_sra (V,  SCL);

        Y0 = vec_clip_s16 (Y0);
        Y1 = vec_clip_s16 (Y1);
        U  = vec_clip_s16 (U);
        V  = vec_clip_s16 (V);

        /* now we have
           Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
           U = u0 u1 u2 u3 u4 u5 u6 u7     V = v0 v1 v2 v3 v4 v5 v6 v7

           Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
           U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
           V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
        */

        U0 = vec_mergeh (U,U);
        V0 = vec_mergeh (V,V);

        U1 = vec_mergel (U,U);
        V1 = vec_mergel (V,V);

        cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
        cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);

        R  = vec_packclp (R0,R1);
        G  = vec_packclp (G0,G1);
        B  = vec_packclp (B0,B1);

        nout = (vector unsigned char *)scratch;
        switch(c->dstFormat) {
        case PIX_FMT_ABGR:  out_abgr  (R,G,B,nout); break;
        case PIX_FMT_BGRA:  out_bgra  (R,G,B,nout); break;
        case PIX_FMT_RGBA:  out_rgba  (R,G,B,nout); break;
        case PIX_FMT_ARGB:  out_argb  (R,G,B,nout); break;
        case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
        case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
        default:
            /* Unreachable, I think. */
            av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
                   sws_format_name(c->dstFormat));
            return;
        }

        memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
    }

}