gst/goom/xmmx.c

   1 /*      xmmx.c
   2
   3         eXtended MultiMedia eXtensions GCC interface library for IA32.
   4
   5         To use this library, simply include this header file
   6         and compile with GCC.  You MUST have inlining enabled
   7         in order for xmmx_ok() to work; this can be done by
   8         simply using -O on the GCC command line.
   9
  10         Compiling with -DXMMX_TRACE will cause detailed trace
  11         output to be sent to stderr for each mmx operation.
  12         This adds lots of code, and obviously slows execution to
  13         a crawl, but can be very useful for debugging.
  14
  15         THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY
  16         EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT
  17         LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY
  18         AND FITNESS FOR ANY PARTICULAR PURPOSE.
  19
  20         1999 by R. Fisher
  21         Based on libmmx, 1997-99 by H. Dietz and R. Fisher
  22
  23  Notes:
  24         It appears that the latest gas has the pand problem fixed, therefore
  25           I'll undefine BROKEN_PAND by default.
  26 */
  27 #ifdef HAVE_CONFIG_H
  28 #include "config.h"
  29 #endif
  30
  31 #include "goom_config.h"
  32
  33 #ifdef HAVE_MMX
  34
/* Define STRICT_COMPAT to get exactly the same result as the C function
 * (a tiny bit slower); the difference is hardly noticeable.
 */
// #define STRICT_COMPAT

#define BUFFPOINTNB 16
#define BUFFPOINTMASK 0xffff
#define BUFFINCR 0xff

#define sqrtperte 16
/* a % sqrtperte is equivalent to a & PERTEMASK */
#define PERTEMASK 0xf
/* a / sqrtperte is equivalent to a >> PERTEDEC */
#define PERTEDEC 4
  49
  50
  51 /*#define MMX_TRACE*/
  52 #include "mmx.h"
  53 /*#include "xmmx.h"*/
  54 #include "goom_graphic.h"
  55
  56 int
  57 xmmx_supported (void)
  58 {
  59   return (mm_support () & 0x8) >> 3;
  60 }
  61
/*
 * Extended-MMX implementation of the zoom filter.
 *
 * For each of the prevX * prevY output pixels the loop
 *   1. interpolates a source position between the matching entries of the
 *      two transformation buffers: S + (((D - S) * buffratio) >> 16), done
 *      on both 32-bit components at once;
 *   2. zeroes every position component that is not strictly below its
 *      limit ((prevX - 1) << PERTEDEC for the low component,
 *      (prevY - 1) << PERTEDEC for the high one);
 *   3. loads one packed 32-bit coefficient word from precalCoef, indexed by
 *      the low four bits of each position component;
 *   4. reads two adjacent pixels of expix1 at the position (components
 *      shifted right by PERTEDEC) and two more one row (prevX pixels)
 *      further, weights each of the four by one byte of the coefficient
 *      word, sums them, shifts right by 8 and stores the pixel in expix2.
 *
 * prevX, prevY : image width and height in pixels
 * expix1       : source image; its four corner pixels are overwritten with 0
 * expix2       : destination image, one pixel written per loop iteration
 * lbruS, lbruD : source and destination transformation buffers, accessed as
 *                one 64-bit element (two 32-bit components) per pixel
 * buffratio    : interpolation weight applied to (D - S) before the >> 16
 * precalCoef   : 16x16 table of packed coefficient words
 *
 * NOTE(review): the assembly uses %eax/%esi as address registers and moves
 * pointers with 32-bit instructions, and hard-codes 4-byte table entries and
 * pixels, so it looks IA-32 only -- confirm the build guards.
 * NOTE(review): MMX register contents are handed from one asm statement to
 * the next without being declared as operands or clobbers; the statement
 * order below must not be changed.
 */
void
zoom_filter_xmmx (int prevX, int prevY,
    Pixel * expix1, Pixel * expix2,
    int *lbruS, int *lbruD, int buffratio, int precalCoef[16][16])
{
  int bufsize = prevX * prevY;  /* size of the buffers, in pixels */
  volatile int loop;            /* loop variable: index of the current pixel */

  mmx_t *brutS = (mmx_t *) lbruS;       /* source transformation buffer */
  mmx_t *brutD = (mmx_t *) lbruD;       /* destination transformation buffer */

  volatile mmx_t prevXY;
  volatile mmx_t ratiox;

  /* Zero the four corner pixels of the source image. */
  expix1[0].val = expix1[prevX - 1].val = expix1[prevX * prevY - 1].val =
      expix1[prevX * prevY - prevX].val = 0;

  /* Per-component position limits, in the same fixed-point format as the
   * positions (PERTEDEC fractional bits). */
  prevXY.ud[0] = (prevX - 1) << PERTEDEC;
  prevXY.ud[1] = (prevY - 1) << PERTEDEC;

  /* The same ratio in both 32-bit halves. */
  ratiox.d[0] = buffratio;
  ratiox.d[1] = buffratio;

  asm volatile ("\n\t movq  %[ratio], %%mm6" "\n\t pslld $16,      %%mm6"       /* mm6 = [buffratio << 16 | buffratio << 16] */
      "\n\t pxor  %%mm7,    %%mm7"      /* mm7 = 0 */
      ::[ratio] "m" (ratiox));

  loop = 0;

  /*
   * NOTE: mm6 and mm7 are not modified inside the loop.
   */
  while (loop < bufsize) {
    /* Thread #1
     * pre :  mm6 = [rat16|rat16]
     * post : mm0 = S + (((D - S) * ratio) >> 16), one value per 32-bit half
     * modified = mm0,mm1,mm2
     */

    asm volatile ("#1 \n\t movq 0(%[brutS]), %%mm0" "#1 \n\t movq 0(%[brutD]), %%mm1" "#1 \n\t psubd   %%mm0, %%mm1"    /* mm1 = D - S */
        "#1 \n\t movq    %%mm1, %%mm2"  /* mm2 = D - S */
        "#1 \n\t pslld     $16, %%mm1" "#1 \n\t pmullw  %%mm6, %%mm2" "#1 \n\t pmulhuw %%mm6, %%mm1" "#1 \n\t pslld   $16,   %%mm0" "#1 \n\t paddd   %%mm2, %%mm1"      /* mm1 = ((D - S) * buffratio >> 16), kept in the upper 16 bits */
        "#1 \n\t paddd   %%mm1, %%mm0"  /* mm0 = (S << 16) + mm1 */
        "#1 \n\t psrld   $16,   %%mm0"::[brutS] "r" (&brutS[loop]),
        [brutD] "r" (&brutD[loop])
        );                      /* mm0 = interpolated position */

    /*
     * pre : mm0 : position vector on screen
     *       prevXY : per-component limits (see above)
     * post : clipped mm0
     * modified : mm0,mm1,mm2
     */
    asm volatile
        ("#1 \n\t movq %[prevXY], %%mm1" "#1 \n\t pcmpgtd %%mm0,  %%mm1"
        /* mm1 now holds, for each component:
         *   all ones if prevXY > position
         *   zero     if prevXY <= position */
#ifdef STRICT_COMPAT
        "#1 \n\t movq      %%mm1, %%mm2"
        "#1 \n\t punpckhdq %%mm2, %%mm2"
        "#1 \n\t punpckldq %%mm1, %%mm1" "#1 \n\t pand      %%mm2, %%mm0"
#endif
        "#1 \n\t pand %%mm1, %%mm0"     /* zero the part that overflows */
        ::[prevXY] "m" (prevXY));

    /* Thread #2
     * pre :  mm0 : clipped position on screen
     *
     * post : mm3 : packed coefficients for this position, i.e. the entry
     *              precalCoef[low component & 15][high component & 15]
     *        mm1 : high component of the position, moved to the low dword
     *
     * modif : eax,esi
     */
    __asm__ __volatile__ ("#2 \n\t movd %%mm0,%%esi"
        "#2 \n\t movq %%mm0,%%mm1"
        "#2 \n\t andl $15,%%esi"
        "#2 \n\t psrlq $32,%%mm1"
        "#2 \n\t shll $6,%%esi"
        "#2 \n\t movd %%mm1,%%eax"
        "#2 \n\t addl %[precalCoef],%%esi"
        "#2 \n\t andl $15,%%eax"
        "#2 \n\t movd (%%esi,%%eax,4),%%mm3"::[precalCoef]
        "g" (precalCoef):"eax", "esi");

    /*
     * Coefficient extraction (Thread #3), interleaved with Thread #4 below.
     *
     * pre : packed coefficients in mm3
     *
     * post : each coefficient byte replicated four times:
     *        c1 and c2 in mm3, c3 and c4 in mm4
     *
     * modif : mm5
     */

    /* (Thread #4)
     * pre : mm0 : position, low component in the low dword
     *       mm1 : position, high component in the low dword
     *
     * post : mm0 : the two pixels at expix1[position]
     *        mm2 : the two pixels at expix1[position + prevX]
     *        where position = (high >> 4) * prevX + (low >> 4)
     *
     * modif : eax, esi, edx (mull writes edx:eax)
     *
     * NOTE(review): [prevX] uses the "g" constraint, which also allows an
     * immediate operand, while mull only takes a register or memory operand;
     * "rm" looks like the safer constraint -- confirm.
     */
    __asm__ __volatile__ ("#2 \n\t psrld $4, %%mm0" "#2 \n\t psrld $4, %%mm1"   /* PERTEDEC = $4 */
        "#4 \n\t movd %%mm1,%%eax"
        "#3 \n\t movq %%mm3,%%mm5"
        "#4 \n\t mull %[prevX]"
        "#4 \n\t movd %%mm0,%%esi"
        "#3 \n\t punpcklbw %%mm5, %%mm3"
        "#4 \n\t addl %%esi, %%eax"
        "#3 \n\t movq %%mm3, %%mm4"
        "#3 \n\t movq %%mm3, %%mm5"
        "#4 \n\t movl %[expix1], %%esi"
        "#3 \n\t punpcklbw %%mm5, %%mm3"
        "#4 \n\t movq (%%esi,%%eax,4),%%mm0"
        "#3 \n\t punpckhbw %%mm5, %%mm4"
        "#4 \n\t addl %[prevX],%%eax"
        "#4 \n\t movq (%%esi,%%eax,4),%%mm2"::[expix1] "g" (expix1)
        ,[prevX] "g" (prevX)
        :"eax", "esi", "edx");

    /*
     * pre :       mm0 : two pixels at expix1[position]
     *             mm2 : two pixels at expix1[position + prevX]
     *       mm3 & mm4 : coefficients
     */

    /* copy the first two pixels from mm0 into mm1 */
    movq_r2r (mm0, mm1);        /* b1-g1-r1-a1-b2-g2-r2-a2 */

    /* unpack the first pixel to 16-bit lanes */
    punpcklbw_r2r (mm7, mm0);   /* 00-b2-00-g2-00-r2-00-a2 */

    /* coefficient extraction... */

    movq_r2r (mm3, mm5);        /* c2-c2-c2-c2-c1-c1-c1-c1 */

    /* (in parallel) unpack the second pixel to 16-bit lanes */
    punpckhbw_r2r (mm7, mm1);
    /* 00-b1-00-g1-00-r1-00-a1 */

    punpcklbw_r2r (mm7, mm5);   /* 00-c1-00-c1-00-c1-00-c1 */
    punpckhbw_r2r (mm7, mm3);   /* 00-c2-00-c2-00-c2-00-c2 */

    /* multiply the pixels by their coefficients */
    pmullw_r2r (mm5, mm0);      /* c1*b2-c1*g2-c1*r2-c1*a2 */
    pmullw_r2r (mm3, mm1);      /* c2*b1-c2*g1-c2*r1-c2*a1 */
    paddw_r2r (mm1, mm0);

    /* ...extraction of the last two coefficients */
    movq_r2r (mm4, mm5);        /* c4-c4-c4-c4-c3-c3-c3-c3 */
    punpcklbw_r2r (mm7, mm4);   /* 00-c3-00-c3-00-c3-00-c3 */
    punpckhbw_r2r (mm7, mm5);   /* 00-c4-00-c4-00-c4-00-c4 */

    /* fetch the last two pixels */
    movq_r2r (mm2, mm1);

    /* unpack those pixels to 16-bit lanes */
    punpcklbw_r2r (mm7, mm1);
    punpckhbw_r2r (mm7, mm2);

    /* multiply by the coefficients */
    pmullw_r2r (mm4, mm1);
    pmullw_r2r (mm5, mm2);

    /* add the products to the running total */
    paddw_r2r (mm1, mm0);
    paddw_r2r (mm2, mm0);

    /* divide by 256 (shift right by 8), then repack the final pixel;
     * presumably the four coefficients sum to 256 -- TODO confirm */
    psrlw_i2r (8, mm0);
    packuswb_r2r (mm7, mm0);

    movd_r2m (mm0, expix2[loop]);

    ++loop;
  }
  /* this was femms, which is AMD 3dnow */
  __asm__ __volatile__ ("emms\n");
}
 246
 247 #define DRAWMETHOD_PLUS_XMMX(_out,_backbuf,_col) \
 248 { \
 249         movd_m2r(_backbuf, mm0); \
 250         paddusb_m2r(_col, mm0); \
 251         movd_r2m(mm0, _out); \
 252 }
 253
 254 #define DRAWMETHOD DRAWMETHOD_PLUS_XMMX(*p,*p,col)
 255
 256 void
 257 draw_line_xmmx (Pixel * data, int x1, int y1, int x2, int y2, int col,
 258     int screenx, int screeny)
 259 {
 260   int x, y, dx, dy, yy, xx;
 261   Pixel *p;
 262
 263   if ((y1 < 0) || (y2 < 0) || (x1 < 0) || (x2 < 0) || (y1 >= screeny)
 264       || (y2 >= screeny) || (x1 >= screenx) || (x2 >= screenx))
 265     goto end_of_line;
 266
 267   dx = x2 - x1;
 268   dy = y2 - y1;
 269   if (x1 >= x2) {
 270     int tmp;
 271
 272     tmp = x1;
 273     x1 = x2;
 274     x2 = tmp;
 275     tmp = y1;
 276     y1 = y2;
 277     y2 = tmp;
 278     dx = x2 - x1;
 279     dy = y2 - y1;
 280   }
 281
 282   /* vertical line */
 283   if (dx == 0) {
 284     if (y1 < y2) {
 285       p = &(data[(screenx * y1) + x1]);
 286       for (y = y1; y <= y2; y++) {
 287         DRAWMETHOD;
 288         p += screenx;
 289       }
 290     } else {
 291       p = &(data[(screenx * y2) + x1]);
 292       for (y = y2; y <= y1; y++) {
 293         DRAWMETHOD;
 294         p += screenx;
 295       }
 296     }
 297     goto end_of_line;
 298   }
 299   /* horizontal line */
 300   if (dy == 0) {
 301     if (x1 < x2) {
 302       p = &(data[(screenx * y1) + x1]);
 303       for (x = x1; x <= x2; x++) {
 304         DRAWMETHOD;
 305         p++;
 306       }
 307       goto end_of_line;
 308     } else {
 309       p = &(data[(screenx * y1) + x2]);
 310       for (x = x2; x <= x1; x++) {
 311         DRAWMETHOD;
 312         p++;
 313       }
 314       goto end_of_line;
 315     }
 316   }
 317   /* 1    */
 318   /*  \   */
 319   /*   \  */
 320   /*    2 */
 321   if (y2 > y1) {
 322     /* steep */
 323     if (dy > dx) {
 324       dx = ((dx << 16) / dy);
 325       x = x1 << 16;
 326       for (y = y1; y <= y2; y++) {
 327         xx = x >> 16;
 328         p = &(data[(screenx * y) + xx]);
 329         DRAWMETHOD;
 330         if (xx < (screenx - 1)) {
 331           p++;
 332           /* DRAWMETHOD; */
 333         }
 334         x += dx;
 335       }
 336       goto end_of_line;
 337     }
 338     /* shallow */
 339     else {
 340       dy = ((dy << 16) / dx);
 341       y = y1 << 16;
 342       for (x = x1; x <= x2; x++) {
 343         yy = y >> 16;
 344         p = &(data[(screenx * yy) + x]);
 345         DRAWMETHOD;
 346         if (yy < (screeny - 1)) {
 347           p += screeny;
 348           /* DRAWMETHOD; */
 349         }
 350         y += dy;
 351       }
 352     }
 353   }
 354   /*    2 */
 355   /*   /  */
 356   /*  /   */
 357   /* 1    */
 358   else {
 359     /* steep */
 360     if (-dy > dx) {
 361       dx = ((dx << 16) / -dy);
 362       x = (x1 + 1) << 16;
 363       for (y = y1; y >= y2; y--) {
 364         xx = x >> 16;
 365         p = &(data[(screenx * y) + xx]);
 366         DRAWMETHOD;
 367         if (xx < (screenx - 1)) {
 368           p--;
 369           /* DRAWMETHOD; */
 370         }
 371         x += dx;
 372       }
 373       goto end_of_line;
 374     }
 375     /* shallow */
 376     else {
 377       dy = ((dy << 16) / dx);
 378       y = y1 << 16;
 379       for (x = x1; x <= x2; x++) {
 380         yy = y >> 16;
 381         p = &(data[(screenx * yy) + x]);
 382         DRAWMETHOD;
 383         if (yy < (screeny - 1)) {
 384           p += screeny;
 385           /* DRAWMETHOD; */
 386         }
 387         y += dy;
 388       }
 389       goto end_of_line;
 390     }
 391   }
 392 end_of_line:
 393   /* this was femms, which is AMD 3dnow */
 394   __asm__ __volatile__ ("emms\n");
 395 }
 396 #else
 397 int
 398 xmmx_supported (void)
 399 {
 400   return (0);
 401 }
 402 #endif