3 #if defined (HAVE_CPU_I386) || defined (HAVE_CPU_X86_64)
6 #define BUFFPOINTMASK 0xffff
10 #include "goom_graphic.h"
13 // computing a % sqrtperte is done as a & pertemask
15 // computing a / sqrtperte is done as a >> PERTEDEC
21 return (mm_support () & 0x1);
/*
 * zoom_filter_mmx: for each of the prevX * prevY output pixels, interpolate a
 * source coordinate between brutS and brutD, then write to expix2 a weighted
 * blend of source pixels read from expix1 (weights come from precalCoef).
 *
 * prevX, prevY : buffer dimensions; prevX is also used as the row stride.
 * expix1       : source pixel buffer (read by the asm through operand %3).
 * expix2       : destination pixel buffer (expix2[loop] is asm output %0).
 * brutS, brutD : coordinate tables, two ints per pixel (index 2*loop and
 *                2*loop + 1), used as the x and y components respectively.
 * buffratio    : interpolation factor between brutS and brutD, scaled by
 *                2^BUFFPOINTNB.
 * precalCoef   : table indexed by the low (PERTEMASK) bits of px and py; each
 *                entry is moved into an MMX register and split into four
 *                byte coefficients c1..c4.
 */
25 zoom_filter_mmx (int prevX, int prevY,
26 Pixel * expix1, Pixel * expix2,
27 int *brutS, int *brutD, int buffratio, int precalCoef[16][16])
/* Upper limits for px / py, expressed in the same PERTEDEC-shifted units. */
29 unsigned int ax = (prevX - 1) << PERTEDEC, ay = (prevY - 1) << PERTEDEC;
31 int bufsize = prevX * prevY;
/* mm7 is cleared once and used below as the zero operand for unpacking. */
34 __asm__ __volatile__ ("pxor %mm7,%mm7");
36 for (loop = 0; loop < bufsize; loop++) {
/* myPos indexes the first component of the pair, myPos2 the second. */
42 int myPos = loop << 1, myPos2 = myPos + 1;
43 int brutSmypos = brutS[myPos];
/* px = S + ((D - S) * buffratio >> BUFFPOINTNB): interpolated first component. */
45 px = brutSmypos + (((brutD[myPos] -
46 brutSmypos) * buffratio) >> BUFFPOINTNB);
47 brutSmypos = brutS[myPos2];
/* Same interpolation for the second component. */
48 py = brutSmypos + (((brutD[myPos2] -
49 brutSmypos) * buffratio) >> BUFFPOINTNB);
/* Coordinate at or beyond the last row/column; the handling of this case is
 * on lines not shown next to this test.
 * NOTE(review): ax/ay are unsigned, so if px/py are signed ints a negative
 * value would also compare as out of range -- confirm the declared types. */
51 if ((py >= ay) || (px >= ax)) {
/* Integer part of (px, py) converted to a linear pixel index. */
54 pos = ((px >> PERTEDEC) + prevX * (py >> PERTEDEC));
/* Fractional part of (px, py) selects the packed blend coefficients. */
56 coeffs = precalCoef[px & PERTEMASK][py & PERTEMASK];
/* Operands: %0 = expix2[loop], %1 = pos, %2 = coeffs, %3 = expix1. */
59 __asm__ __volatile__ ("movd %2, %%mm6 \n\t"
60 /* fetch the first two pixels into mm0 and mm1 */
61 "movq (%3,%1,4), %%mm0 \n\t" /* b1-v1-r1-a1-b2-v2-r2-a2 */
62 "movq %%mm0, %%mm1 \n\t" /* b1-v1-r1-a1-b2-v2-r2-a2 */
63 /* unpack the first pixel */
64 "punpcklbw %%mm7, %%mm0 \n\t" /* 00-b2-00-v2-00-r2-00-a2 */
65 "movq %%mm6, %%mm5 \n\t" /* ??-??-??-??-c4-c3-c2-c1 */
66 /* unpack the second pixel */
67 "punpckhbw %%mm7, %%mm1 \n\t" /* 00-b1-00-v1-00-r1-00-a1 */
68 /* extract the coefficients... */
69 "punpcklbw %%mm5, %%mm6 \n\t" /* c4-c4-c3-c3-c2-c2-c1-c1 */
70 "movq %%mm6, %%mm4 \n\t" /* c4-c4-c3-c3-c2-c2-c1-c1 */
71 "movq %%mm6, %%mm5 \n\t" /* c4-c4-c3-c3-c2-c2-c1-c1 */
72 "punpcklbw %%mm5, %%mm6 \n\t" /* c2-c2-c2-c2-c1-c1-c1-c1 */
73 "punpckhbw %%mm5, %%mm4 \n\t" /* c4-c4-c4-c4-c3-c3-c3-c3 */
74 "movq %%mm6, %%mm3 \n\t" /* c2-c2-c2-c2-c1-c1-c1-c1 */
75 "punpcklbw %%mm7, %%mm6 \n\t" /* 00-c1-00-c1-00-c1-00-c1 */
76 "punpckhbw %%mm7, %%mm3 \n\t" /* 00-c2-00-c2-00-c2-00-c2 */
77 /* multiply the pixels by the coefficients */
78 "pmullw %%mm6, %%mm0 \n\t" /* c1*b2-c1*v2-c1*r2-c1*a2 */
79 "pmullw %%mm3, %%mm1 \n\t" /* c2*b1-c2*v1-c2*r1-c2*a1 */
80 "paddw %%mm1, %%mm0 \n\t"
81 /* ...extract the last 2 coefficients */
82 "movq %%mm4, %%mm5 \n\t" /* c4-c4-c4-c4-c3-c3-c3-c3 */
83 "punpcklbw %%mm7, %%mm4 \n\t" /* 00-c3-00-c3-00-c3-00-c3 */
84 "punpckhbw %%mm7, %%mm5 \n\t" /* 00-c4-00-c4-00-c4-00-c4 */
85 /* add the line length to the pixel index (operand %1) */
/* NOTE(review): 8(%ebp) is presumably the first stack argument (prevX) under
 * the 32-bit frame-pointer layout; that would not hold on x86-64 (which the
 * enclosing #if also enables) or when the frame pointer is omitted. This
 * instruction also writes to %1, which is declared as an input operand.
 * Confirm both against the compiler's extended-asm rules. */
86 "addl 8(%%ebp),%1 \n\t"
87 /* fetch the last 2 pixels */
88 "movq (%3,%1,4), %%mm1 \n\t" "movq %%mm1, %%mm2 \n\t"
89 /* unpack the pixels */
90 "punpcklbw %%mm7, %%mm1 \n\t" "punpckhbw %%mm7, %%mm2 \n\t"
91 /* multiply by the coefficients */
92 "pmullw %%mm4, %%mm1 \n\t" "pmullw %%mm5, %%mm2 \n\t"
93 /* add the resulting values to the final value */
94 "paddw %%mm1, %%mm0 \n\t" "paddw %%mm2, %%mm0 \n\t"
95 /* divide by 256 = 16+16+16+16, then repack the final pixel */
96 "psrlw $8, %%mm0 \n\t"
97 "packuswb %%mm7, %%mm0 \n\t" "movd %%mm0,%0 \n\t":"=g" (expix2[loop])
98 :"r" (pos), "r" (coeffs), "r" (expix1)
/*
 * DRAWMETHOD_PLUS_MMX(_out, _backbuf, _col): plots one pixel through register
 * mm0 using three helper macros defined elsewhere: movd_m2r(_backbuf, mm0),
 * paddusb_m2r(_col, mm0), then movd_r2m(mm0, _out).
 * NOTE(review): presumably load / unsigned saturating per-byte add / store,
 * i.e. _out = saturate(_backbuf + _col) per channel -- confirm against the
 * helper definitions.
 *
 * DRAWMETHOD applies it with *p as both the pixel read and the pixel written,
 * and col as the value added.
 */
106 #define DRAWMETHOD_PLUS_MMX(_out,_backbuf,_col) \
108 movd_m2r(_backbuf, mm0); \
109 paddusb_m2r(_col, mm0); \
110 movd_r2m(mm0, _out); \
113 #define DRAWMETHOD DRAWMETHOD_PLUS_MMX(*p,*p,col)
/*
 * draw_line_mmx: draws the segment (x1, y1)-(x2, y2) into the pixel buffer
 * `data`, addressing pixels as data[screenx * row + column].
 *
 * data             : destination pixel buffer, screenx pixels per row.
 * x1, y1, x2, y2   : segment endpoints in pixel coordinates.
 * col              : colour value handed to DRAWMETHOD for each plotted pixel
 *                    (presumably; the plotting statements are not adjacent to
 *                    the lines documented here).
 * screenx, screeny : buffer width and height, used for the bounds test and as
 *                    the row stride.
 */
116 draw_line_mmx (Pixel * data, int x1, int y1, int x2, int y2, int col,
117 int screenx, int screeny)
119 int x, y, dx, dy, yy, xx;
/* Bounds test: true when either endpoint lies outside
 * [0, screenx) x [0, screeny). */
122 if ((y1 < 0) || (y2 < 0) || (x1 < 0) || (x2 < 0) || (y1 >= screeny)
123 || (y2 >= screeny) || (x1 >= screenx) || (x2 >= screenx))
/* Fixed column x1, rows y1..y2 (presumably the vertical-line case). */
144 p = &(data[(screenx * y1) + x1]);
145 for (y = y1; y <= y2; y++) {
/* Same column, endpoints in the other order: rows y2..y1. */
150 p = &(data[(screenx * y2) + x1]);
151 for (y = y2; y <= y1; y++) {
158 /* horizontal line */
/* Fixed row y1, columns x1..x2. */
161 p = &(data[(screenx * y1) + x1]);
162 for (x = x1; x <= x2; x++) {
/* Same row, endpoints in the other order: columns x2..x1. */
168 p = &(data[(screenx * y1) + x2]);
169 for (x = x2; x <= x1; x++) {
/* Slope as a fixed-point value with 16 fractional bits: x step per row. */
183 dx = ((dx << 16) / dy);
/* Walk the rows top to bottom; xx is the column for the current row. */
185 for (y = y1; y <= y2; y++) {
187 p = &(data[(screenx * y) + xx]);
/* Guard on the last column, presumably before touching the pixel at xx + 1. */
189 if (xx < (screenx - 1)) {
/* Slope as a fixed-point value with 16 fractional bits: y step per column. */
199 dy = ((dy << 16) / dx);
/* Walk the columns left to right; yy is the row for the current column. */
201 for (x = x1; x <= x2; x++) {
203 p = &(data[(screenx * yy) + x]);
/* Guard on the last row, presumably before touching the pixel at yy + 1. */
205 if (yy < (screeny - 1)) {
/* Rows are walked in decreasing order here, so the divisor is -dy. */
220 dx = ((dx << 16) / -dy);
222 for (y = y1; y >= y2; y--) {
224 p = &(data[(screenx * y) + xx]);
226 if (xx < (screenx - 1)) {
/* Column-major walk again, with a 16-bit fixed-point y step per column. */
236 dy = ((dy << 16) / dx);
238 for (x = x1; x <= x2; x++) {
240 p = &(data[(screenx * yy) + x]);
242 if (yy < (screeny - 1)) {
/* The MMX state reset (emms) is left disabled here. */
253 /* __asm__ __volatile__ ("emms"); */
256 #endif /* HAVE_CPU_I386 || HAVE_CPU_X86_64 */