gst/goom/xmmx.c

   1 #ifdef HAVE_CONFIG_H
   2 #include "config.h"
   3 #endif
   4
   5 #include "goom_config.h"
   6
   7 #ifdef HAVE_MMX
   8
/* Define STRICT_COMPAT to get exactly the same result as the C function
 * (a tad slower), although the difference is hardly noticeable.
 */
  12 // #define STRICT_COMPAT
  13
  14 #define BUFFPOINTNB 16
  15 #define BUFFPOINTMASK 0xffff
  16 #define BUFFINCR 0xff
  17
  18 #define sqrtperte 16
/* use: a % sqrtperte <=> a & PERTEMASK */
  20 #define PERTEMASK 0xf
/* use: a / sqrtperte <=> a >> PERTEDEC */
  22 #define PERTEDEC 4
  23
  24
  25 /*#define MMX_TRACE*/
  26 #include "mmx.h"
  27 /*#include "xmmx.h"*/
  28 #include "goom_graphic.h"
  29
  30 int
  31 xmmx_supported (void)
  32 {
  33   return (mm_support () & 0x8) >> 3;
  34 }
  35
  36 void
  37 zoom_filter_xmmx (int prevX, int prevY,
  38     Pixel * expix1, Pixel * expix2,
  39     int *lbruS, int *lbruD, int buffratio, int precalCoef[16][16])
  40 {
  41   int bufsize = prevX * prevY;  /* taille du buffer */
  42   volatile int loop;            /* variable de boucle */
  43
  44   mmx_t *brutS = (mmx_t *) lbruS;       /* buffer de transformation source */
  45   mmx_t *brutD = (mmx_t *) lbruD;       /* buffer de transformation dest */
  46
  47   volatile mmx_t prevXY;
  48   volatile mmx_t ratiox;
  49
  50   /*      volatile mmx_t interpix; */
  51
  52   expix1[0].val = expix1[prevX - 1].val = expix1[prevX * prevY - 1].val =
  53       expix1[prevX * prevY - prevX].val = 0;
  54
  55   prevXY.ud[0] = (prevX - 1) << PERTEDEC;
  56   prevXY.ud[1] = (prevY - 1) << PERTEDEC;
  57
  58   ratiox.d[0] = buffratio;
  59   ratiox.d[1] = buffratio;
  60
  61   asm volatile ("\n\t movq  %[ratio], %%mm6" "\n\t pslld $16,      %%mm6"       /* mm6 = [rat16=buffratio<<16 | rat16=buffratio<<16] */
  62       "\n\t pxor  %%mm7,    %%mm7"      /* mm7 = 0 */
  63       ::[ratio] "m" (ratiox));
  64
  65   loop = 0;
  66
  67   /*
  68    * NOTE : mm6 et mm7 ne sont pas modifies dans la boucle.
  69    */
  70   while (loop < bufsize) {
  71     /* Thread #1
  72      * pre :  mm6 = [rat16|rat16]
  73      * post : mm0 = S + ((D-S)*rat16 format [X|Y]
  74      * modified = mm0,mm1,mm2
  75      */
  76
  77     asm volatile ("#1 \n\t movq 0(%[brutS]), %%mm0" "#1 \n\t movq 0(%[brutD]), %%mm1" "#1 \n\t psubd   %%mm0, %%mm1"    /* mm1 = D - S */
  78         "#1 \n\t movq    %%mm1, %%mm2"  /* mm2 = D - S */
  79         "#1 \n\t pslld     $16, %%mm1" "#1 \n\t pmullw  %%mm6, %%mm2" "#1 \n\t pmulhuw %%mm6, %%mm1" "#1 \n\t pslld   $16,   %%mm0" "#1 \n\t paddd   %%mm2, %%mm1"      /* mm1 = (D - S) * buffratio >> 16 */
  80         "#1 \n\t paddd   %%mm1, %%mm0"  /* mm0 = S + mm1 */
  81         "#1 \n\t psrld   $16,   %%mm0"::[brutS] "r" (&brutS[loop]),
  82         [brutD] "r" (&brutD[loop])
  83         );                      /* mm0 = S */
  84
  85     /*
  86      * pre : mm0 : position vector on screen
  87      *       prevXY : coordinate of the lower-right point on screen
  88      * post : clipped mm0
  89      * modified : mm0,mm1,mm2
  90      */
  91     asm volatile
  92         ("#1 \n\t movq %[prevXY], %%mm1" "#1 \n\t pcmpgtd %%mm0,  %%mm1"
  93         /* mm0 en X contient (idem pour Y) :
  94          *   1111 si prevXY > px
  95          *   0000 si prevXY <= px */
  96 #ifdef STRICT_COMPAT
  97         "#1 \n\t movq      %%mm1, %%mm2"
  98         "#1 \n\t punpckhdq %%mm2, %%mm2"
  99         "#1 \n\t punpckldq %%mm1, %%mm1" "#1 \n\t pand      %%mm2, %%mm0"
 100 #endif
 101         "#1 \n\t pand %%mm1, %%mm0"     /* on met a zero la partie qui deborde */
 102         ::[prevXY] "m" (prevXY));
 103
 104     /* Thread #2
 105      * pre :  mm0 : clipped position on screen
 106      *
 107      * post : mm3 : coefs for this position
 108      *        mm1 : X vector [0|X]
 109      *
 110      * modif : eax,esi
 111      */
 112     __asm__ __volatile__ ("#2 \n\t movd %%mm0,%%esi"
 113         "#2 \n\t movq %%mm0,%%mm1"
 114         "#2 \n\t andl $15,%%esi"
 115         "#2 \n\t psrlq $32,%%mm1"
 116         "#2 \n\t shll $6,%%esi"
 117         "#2 \n\t movd %%mm1,%%eax"
 118         "#2 \n\t addl %[precalCoef],%%esi"
 119         "#2 \n\t andl $15,%%eax"
 120         "#2 \n\t movd (%%esi,%%eax,4),%%mm3"::[precalCoef]
 121         "g" (precalCoef):"eax", "esi");
 122
 123     /*
 124      * extraction des coefficients... (Thread #3)
 125      *
 126      * pre : coef dans mm3
 127      *
 128      * post : coef extraits dans mm3 (c1 & c2)
 129      *                        et mm4 (c3 & c4)
 130      *
 131      * modif : mm5
 132      */
 133
 134     /* (Thread #4)
 135      * pre : mm0 : Y pos [*|Y]
 136      *       mm1 : X pos [*|X]
 137      *
 138      * post : mm0 : expix1[position]
 139      *        mm2 : expix1[position+largeur]
 140      *
 141      * modif : eax, esi
 142      */
 143     __asm__ __volatile__ ("#2 \n\t psrld $4, %%mm0" "#2 \n\t psrld $4, %%mm1"   /* PERTEDEC = $4 */
 144         "#4 \n\t movd %%mm1,%%eax"
 145         "#3 \n\t movq %%mm3,%%mm5"
 146         "#4 \n\t mull %[prevX]"
 147         "#4 \n\t movd %%mm0,%%esi"
 148         "#3 \n\t punpcklbw %%mm5, %%mm3"
 149         "#4 \n\t addl %%esi, %%eax"
 150         "#3 \n\t movq %%mm3, %%mm4"
 151         "#3 \n\t movq %%mm3, %%mm5"
 152         "#4 \n\t movl %[expix1], %%esi"
 153         "#3 \n\t punpcklbw %%mm5, %%mm3"
 154         "#4 \n\t movq (%%esi,%%eax,4),%%mm0"
 155         "#3 \n\t punpckhbw %%mm5, %%mm4"
 156         "#4 \n\t addl %[prevX],%%eax"
 157         "#4 \n\t movq (%%esi,%%eax,4),%%mm2"::[expix1] "g" (expix1)
 158         ,[prevX] "g" (prevX)
 159         :"eax", "esi");
 160
 161     /*
 162      * pre :       mm0 : expix1[position]
 163      *             mm2 : expix1[position+largeur]
 164      *       mm3 & mm4 : coefs
 165      */
 166
 167     /* recopie des deux premiers pixels dans mm0 et mm1 */
 168     movq_r2r (mm0, mm1);        /* b1-v1-r1-a1-b2-v2-r2-a2 */
 169
 170     /* depackage du premier pixel */
 171     punpcklbw_r2r (mm7, mm0);   /* 00-b2-00-v2-00-r2-00-a2 */
 172
 173     /* extraction des coefficients... */
 174
 175     movq_r2r (mm3, mm5);        /* c2-c2-c2-c2-c1-c1-c1-c1 */
 176
 177     /*^en parrallele^ *//* depackage du 2ieme pixel */
 178     /*^ */ punpckhbw_r2r (mm7, mm1);
 179     /* 00-b1-00-v1-00-r1-00-a1 */
 180
 181     punpcklbw_r2r (mm7, mm5);   /* 00-c1-00-c1-00-c1-00-c1 */
 182     punpckhbw_r2r (mm7, mm3);   /* 00-c2-00-c2-00-c2-00-c2 */
 183
 184     /* multiplication des pixels par les coefficients */
 185     pmullw_r2r (mm5, mm0);      /* c1*b2-c1*v2-c1*r2-c1*a2 */
 186     pmullw_r2r (mm3, mm1);      /* c2*b1-c2*v1-c2*r1-c2*a1 */
 187     paddw_r2r (mm1, mm0);
 188
 189     /* ...extraction des 2 derniers coefficients */
 190     movq_r2r (mm4, mm5);        /* c4-c4-c4-c4-c3-c3-c3-c3 */
 191     punpcklbw_r2r (mm7, mm4);   /* 00-c3-00-c3-00-c3-00-c3 */
 192     punpckhbw_r2r (mm7, mm5);   /* 00-c4-00-c4-00-c4-00-c4 */
 193
 194     /* recuperation des 2 derniers pixels */
 195     movq_r2r (mm2, mm1);
 196
 197     /* depackage des pixels */
 198     punpcklbw_r2r (mm7, mm1);
 199     punpckhbw_r2r (mm7, mm2);
 200
 201     /* multiplication pas les coeffs */
 202     pmullw_r2r (mm4, mm1);
 203     pmullw_r2r (mm5, mm2);
 204
 205     /* ajout des valeurs obtenues � la valeur finale */
 206     paddw_r2r (mm1, mm0);
 207     paddw_r2r (mm2, mm0);
 208
 209     /* division par 256 = 16+16+16+16, puis repackage du pixel final */
 210     psrlw_i2r (8, mm0);
 211     packuswb_r2r (mm7, mm0);
 212
 213     movd_r2m (mm0, expix2[loop]);
 214
 215     ++loop;
 216   }
 217   /* this was femms, which is AMD 3dnow */
 218   __asm__ __volatile__ ("emms\n");
 219 }
 220
 221 #define DRAWMETHOD_PLUS_XMMX(_out,_backbuf,_col) \
 222 { \
 223         movd_m2r(_backbuf, mm0); \
 224         paddusb_m2r(_col, mm0); \
 225         movd_r2m(mm0, _out); \
 226 }
 227
 228 #define DRAWMETHOD DRAWMETHOD_PLUS_XMMX(*p,*p,col)
 229
 230 void
 231 draw_line_xmmx (Pixel * data, int x1, int y1, int x2, int y2, int col,
 232     int screenx, int screeny)
 233 {
 234   int x, y, dx, dy, yy, xx;
 235   Pixel *p;
 236
 237   if ((y1 < 0) || (y2 < 0) || (x1 < 0) || (x2 < 0) || (y1 >= screeny)
 238       || (y2 >= screeny) || (x1 >= screenx) || (x2 >= screenx))
 239     goto end_of_line;
 240
 241   dx = x2 - x1;
 242   dy = y2 - y1;
 243   if (x1 >= x2) {
 244     int tmp;
 245
 246     tmp = x1;
 247     x1 = x2;
 248     x2 = tmp;
 249     tmp = y1;
 250     y1 = y2;
 251     y2 = tmp;
 252     dx = x2 - x1;
 253     dy = y2 - y1;
 254   }
 255
 256   /* vertical line */
 257   if (dx == 0) {
 258     if (y1 < y2) {
 259       p = &(data[(screenx * y1) + x1]);
 260       for (y = y1; y <= y2; y++) {
 261         DRAWMETHOD;
 262         p += screenx;
 263       }
 264     } else {
 265       p = &(data[(screenx * y2) + x1]);
 266       for (y = y2; y <= y1; y++) {
 267         DRAWMETHOD;
 268         p += screenx;
 269       }
 270     }
 271     goto end_of_line;
 272   }
 273   /* horizontal line */
 274   if (dy == 0) {
 275     if (x1 < x2) {
 276       p = &(data[(screenx * y1) + x1]);
 277       for (x = x1; x <= x2; x++) {
 278         DRAWMETHOD;
 279         p++;
 280       }
 281       goto end_of_line;
 282     } else {
 283       p = &(data[(screenx * y1) + x2]);
 284       for (x = x2; x <= x1; x++) {
 285         DRAWMETHOD;
 286         p++;
 287       }
 288       goto end_of_line;
 289     }
 290   }
 291   /* 1    */
 292   /*  \   */
 293   /*   \  */
 294   /*    2 */
 295   if (y2 > y1) {
 296     /* steep */
 297     if (dy > dx) {
 298       dx = ((dx << 16) / dy);
 299       x = x1 << 16;
 300       for (y = y1; y <= y2; y++) {
 301         xx = x >> 16;
 302         p = &(data[(screenx * y) + xx]);
 303         DRAWMETHOD;
 304         if (xx < (screenx - 1)) {
 305           p++;
 306           /* DRAWMETHOD; */
 307         }
 308         x += dx;
 309       }
 310       goto end_of_line;
 311     }
 312     /* shallow */
 313     else {
 314       dy = ((dy << 16) / dx);
 315       y = y1 << 16;
 316       for (x = x1; x <= x2; x++) {
 317         yy = y >> 16;
 318         p = &(data[(screenx * yy) + x]);
 319         DRAWMETHOD;
 320         if (yy < (screeny - 1)) {
 321           p += screeny;
 322           /* DRAWMETHOD; */
 323         }
 324         y += dy;
 325       }
 326     }
 327   }
 328   /*    2 */
 329   /*   /  */
 330   /*  /   */
 331   /* 1    */
 332   else {
 333     /* steep */
 334     if (-dy > dx) {
 335       dx = ((dx << 16) / -dy);
 336       x = (x1 + 1) << 16;
 337       for (y = y1; y >= y2; y--) {
 338         xx = x >> 16;
 339         p = &(data[(screenx * y) + xx]);
 340         DRAWMETHOD;
 341         if (xx < (screenx - 1)) {
 342           p--;
 343           /* DRAWMETHOD; */
 344         }
 345         x += dx;
 346       }
 347       goto end_of_line;
 348     }
 349     /* shallow */
 350     else {
 351       dy = ((dy << 16) / dx);
 352       y = y1 << 16;
 353       for (x = x1; x <= x2; x++) {
 354         yy = y >> 16;
 355         p = &(data[(screenx * yy) + x]);
 356         DRAWMETHOD;
 357         if (yy < (screeny - 1)) {
 358           p += screeny;
 359           /* DRAWMETHOD; */
 360         }
 361         y += dy;
 362       }
 363       goto end_of_line;
 364     }
 365   }
 366 end_of_line:
 367   /* this was femms, which is AMD 3dnow */
 368   __asm__ __volatile__ ("emms\n");
 369 }
 370 #else
 371 int
 372 xmmx_supported (void)
 373 {
 374   return (0);
 375 }
 376 #endif