5 #include "goom_config.h"
9 /* Define this to get exactly the same result as the C function
10 * (a tiny bit slower)... but the difference is barely noticeable.
12 // #define STRICT_COMPAT
14 #define BUFFPOINTNB 16
15 #define BUFFPOINTMASK 0xffff
19 /* a % sqrtperte is computed as a & pertemask */
21 /* a / sqrtperte is computed as a >> PERTEDEC */
28 #include "goom_graphic.h"
33 return (mm_support () & 0x8) >> 3;
/*
 * zoom_filter_xmmx: zoom filter written with MMX inline assembly.
 *
 * The four corner pixels of expix1 are cleared first.  Then, for each index
 * `loop` below prevX * prevY, the visible code:
 *   1. interpolates a position between brutS[loop] and brutD[loop] with
 *      buffratio: S + (((D - S) * buffratio) >> 16), packed as [X|Y];
 *   2. zeroes the whole position unless both X and Y are below the limits
 *      stored in prevXY ((prevX - 1) << PERTEDEC, (prevY - 1) << PERTEDEC);
 *   3. loads one packed coefficient word from precalCoef[X & 15][Y & 15];
 *   4. reads two adjacent pixels from expix1 at (Y >> 4) * prevX + (X >> 4)
 *      and two more pixels prevX entries further on;
 *   5. multiplies each of the four pixels by its coefficient byte, sums the
 *      products, packs the result and stores it in expix2[loop].
 * The MMX state is released with emms at the end.
 *
 * NOTE(review): the initialisation and increment of `loop`, and parts of the
 * asm operand/clobber lists, are not visible in this excerpt -- confirm
 * against the full file.
 * NOTE(review): pointers are moved through 32-bit registers (%esi/%eax), so
 * this looks usable on 32-bit x86 only -- confirm.
 */
37 zoom_filter_xmmx (int prevX, int prevY,
38 Pixel * expix1, Pixel * expix2,
39 int *lbruS, int *lbruD, int buffratio, int precalCoef[16][16])
41 int bufsize = prevX * prevY; /* buffer size (prevX * prevY) */
42 volatile int loop; /* loop variable */
44 mmx_t *brutS = (mmx_t *) lbruS; /* source transformation buffer */
45 mmx_t *brutD = (mmx_t *) lbruD; /* destination transformation buffer */
47 volatile mmx_t prevXY;
48 volatile mmx_t ratiox;
50 /* volatile mmx_t interpix; */
/* clear the four corner pixels of the source image */
52 expix1[0].val = expix1[prevX - 1].val = expix1[prevX * prevY - 1].val =
53 expix1[prevX * prevY - prevX].val = 0;
/* clipping limits, shifted left by PERTEDEC like the positions they are compared with */
55 prevXY.ud[0] = (prevX - 1) << PERTEDEC;
56 prevXY.ud[1] = (prevY - 1) << PERTEDEC;
/* the same ratio in both halves so X and Y are scaled together */
58 ratiox.d[0] = buffratio;
59 ratiox.d[1] = buffratio;
61 asm volatile ("\n\t movq %[ratio], %%mm6" "\n\t pslld $16, %%mm6" /* mm6 = [rat16=buffratio<<16 | rat16=buffratio<<16] */
62 "\n\t pxor %%mm7, %%mm7" /* mm7 = 0 */
63 ::[ratio] "m" (ratiox));
68 * NOTE : mm6 et mm7 ne sont pas modifies dans la boucle.
70 while (loop < bufsize) {
72 * pre : mm6 = [rat16|rat16]
73 * post : mm0 = S + ((D-S)*rat16 format [X|Y]
74 * modified = mm0,mm1,mm2
77 asm volatile ("#1 \n\t movq 0(%[brutS]), %%mm0" "#1 \n\t movq 0(%[brutD]), %%mm1" "#1 \n\t psubd %%mm0, %%mm1" /* mm1 = D - S */
78 "#1 \n\t movq %%mm1, %%mm2" /* mm2 = D - S */
79 "#1 \n\t pslld $16, %%mm1" "#1 \n\t pmullw %%mm6, %%mm2" "#1 \n\t pmulhuw %%mm6, %%mm1" "#1 \n\t pslld $16, %%mm0" "#1 \n\t paddd %%mm2, %%mm1" /* mm1 = (D - S) * buffratio >> 16 */
80 "#1 \n\t paddd %%mm1, %%mm0" /* mm0 = S + mm1 */
81 "#1 \n\t psrld $16, %%mm0"::[brutS] "r" (&brutS[loop]),
82 [brutD] "r" (&brutD[loop])
86 * pre : mm0 : position vector on screen
87 * prevXY : coordinate of the lower-right point on screen
89 * modified : mm0,mm1,mm2
92 ("#1 \n\t movq %[prevXY], %%mm1" "#1 \n\t pcmpgtd %%mm0, %%mm1"
93 /* in X, mm0 contains (same for Y):
95 * 0000 if prevXY <= px */
97 "#1 \n\t movq %%mm1, %%mm2"
98 "#1 \n\t punpckhdq %%mm2, %%mm2"
99 "#1 \n\t punpckldq %%mm1, %%mm1" "#1 \n\t pand %%mm2, %%mm0"
101 "#1 \n\t pand %%mm1, %%mm0" /* zero out the part that overflows */
102 ::[prevXY] "m" (prevXY));
105 * pre : mm0 : clipped position on screen
107 * post : mm3 : coefs for this position
108 * mm1 : X vector [0|X]
112 __asm__ __volatile__ ("#2 \n\t movd %%mm0,%%esi"
113 "#2 \n\t movq %%mm0,%%mm1"
114 "#2 \n\t andl $15,%%esi"
115 "#2 \n\t psrlq $32,%%mm1"
116 "#2 \n\t shll $6,%%esi"
117 "#2 \n\t movd %%mm1,%%eax"
118 "#2 \n\t addl %[precalCoef],%%esi"
119 "#2 \n\t andl $15,%%eax"
120 "#2 \n\t movd (%%esi,%%eax,4),%%mm3"::[precalCoef]
121 "g" (precalCoef):"eax", "esi");
124 * extraction des coefficients... (Thread #3)
126 * pre : coef dans mm3
128 * post : coef extraits dans mm3 (c1 & c2)
135 * pre : mm0 : Y pos [*|Y]
138 * post : mm0 : expix1[position]
139 * mm2 : expix1[position+largeur]
143 __asm__ __volatile__ ("#2 \n\t psrld $4, %%mm0" "#2 \n\t psrld $4, %%mm1" /* PERTEDEC = $4 */
144 "#4 \n\t movd %%mm1,%%eax"
145 "#3 \n\t movq %%mm3,%%mm5"
146 "#4 \n\t mull %[prevX]"
147 "#4 \n\t movd %%mm0,%%esi"
148 "#3 \n\t punpcklbw %%mm5, %%mm3"
149 "#4 \n\t addl %%esi, %%eax"
150 "#3 \n\t movq %%mm3, %%mm4"
151 "#3 \n\t movq %%mm3, %%mm5"
152 "#4 \n\t movl %[expix1], %%esi"
153 "#3 \n\t punpcklbw %%mm5, %%mm3"
154 "#4 \n\t movq (%%esi,%%eax,4),%%mm0"
155 "#3 \n\t punpckhbw %%mm5, %%mm4"
156 "#4 \n\t addl %[prevX],%%eax"
157 "#4 \n\t movq (%%esi,%%eax,4),%%mm2"::[expix1] "g" (expix1)
162 * pre : mm0 : expix1[position]
163 * mm2 : expix1[position+largeur]
167 /* copy the first two pixels into mm0 and mm1 */
168 movq_r2r (mm0, mm1); /* b1-v1-r1-a1-b2-v2-r2-a2 */
170 /* unpack the first pixel */
171 punpcklbw_r2r (mm7, mm0); /* 00-b2-00-v2-00-r2-00-a2 */
173 /* extract the coefficients... */
175 movq_r2r (mm3, mm5); /* c2-c2-c2-c2-c1-c1-c1-c1 */
177 /*^in parallel^ *//* unpack the 2nd pixel */
178 /*^ */ punpckhbw_r2r (mm7, mm1);
179 /* 00-b1-00-v1-00-r1-00-a1 */
181 punpcklbw_r2r (mm7, mm5); /* 00-c1-00-c1-00-c1-00-c1 */
182 punpckhbw_r2r (mm7, mm3); /* 00-c2-00-c2-00-c2-00-c2 */
184 /* multiply the pixels by the coefficients */
185 pmullw_r2r (mm5, mm0); /* c1*b2-c1*v2-c1*r2-c1*a2 */
186 pmullw_r2r (mm3, mm1); /* c2*b1-c2*v1-c2*r1-c2*a1 */
187 paddw_r2r (mm1, mm0);
189 /* ...extract the last 2 coefficients */
190 movq_r2r (mm4, mm5); /* c4-c4-c4-c4-c3-c3-c3-c3 */
191 punpcklbw_r2r (mm7, mm4); /* 00-c3-00-c3-00-c3-00-c3 */
192 punpckhbw_r2r (mm7, mm5); /* 00-c4-00-c4-00-c4-00-c4 */
194 /* fetch the last 2 pixels */
197 /* unpack the pixels */
198 punpcklbw_r2r (mm7, mm1);
199 punpckhbw_r2r (mm7, mm2);
201 /* multiply by the coefficients */
202 pmullw_r2r (mm4, mm1);
203 pmullw_r2r (mm5, mm2);
205 /* add the resulting values to the final value */
206 paddw_r2r (mm1, mm0);
207 paddw_r2r (mm2, mm0);
209 /* divide by 256 = 16+16+16+16, then repack the final pixel */
211 packuswb_r2r (mm7, mm0);
213 movd_r2m (mm0, expix2[loop]); /* store the blended pixel in the output buffer */
217 /* this was femms, which is AMD 3dnow */
218 __asm__ __volatile__ ("emms\n");
/*
 * DRAWMETHOD_PLUS_XMMX: loads _backbuf into mm0, adds _col to it with
 * paddusb, and stores mm0 into _out.  DRAWMETHOD applies it in place to
 * the pixel at *p using col.
 * NOTE(review): paddusb_m2r is presumably the mmx.h wrapper for the MMX
 * unsigned saturating byte add -- confirm against mmx.h.
 */
221 #define DRAWMETHOD_PLUS_XMMX(_out,_backbuf,_col) \
223 movd_m2r(_backbuf, mm0); \
224 paddusb_m2r(_col, mm0); \
225 movd_r2m(mm0, _out); \
228 #define DRAWMETHOD DRAWMETHOD_PLUS_XMMX(*p,*p,col)
/*
 * draw_line_xmmx: draws a line from (x1, y1) to (x2, y2) into `data`, a
 * pixel buffer addressed as data[screenx * y + x].
 *
 * The visible code tests both end points against [0, screenx) x [0, screeny),
 * walks a single column or a single row for the axis-aligned cases, and
 * otherwise steps along one axis using a slope scaled by 1 << 16.  It ends
 * with emms.
 *
 * NOTE(review): the action taken when the bounds test fails, the branch
 * conditions, the computation of xx / yy and the loop bodies are not visible
 * in this excerpt -- confirm against the full file.
 */
231 draw_line_xmmx (Pixel * data, int x1, int y1, int x2, int y2, int col,
232 int screenx, int screeny)
234 int x, y, dx, dy, yy, xx;
/* true when either end point lies outside the screenx by screeny area */
237 if ((y1 < 0) || (y2 < 0) || (x1 < 0) || (x2 < 0) || (y1 >= screeny)
238 || (y2 >= screeny) || (x1 >= screenx) || (x2 >= screenx))
/* column x1, walked from row y1 up to row y2 */
259 p = &(data[(screenx * y1) + x1]);
260 for (y = y1; y <= y2; y++) {
/* same column, starting from row y2 and walking up to row y1 */
265 p = &(data[(screenx * y2) + x1]);
266 for (y = y2; y <= y1; y++) {
273 /* horizontal line */
276 p = &(data[(screenx * y1) + x1]);
277 for (x = x1; x <= x2; x++) {
283 p = &(data[(screenx * y1) + x2]);
284 for (x = x2; x <= x1; x++) {
298 dx = ((dx << 16) / dy); /* slope scaled by 1 << 16; NOTE(review): dx << 16 can overflow int for large dx */
300 for (y = y1; y <= y2; y++) {
302 p = &(data[(screenx * y) + xx]);
304 if (xx < (screenx - 1)) {
314 dy = ((dy << 16) / dx); /* slope scaled by 1 << 16, stepping along x */
316 for (x = x1; x <= x2; x++) {
318 p = &(data[(screenx * yy) + x]);
320 if (yy < (screeny - 1)) {
335 dx = ((dx << 16) / -dy); /* divisor negated; this branch walks y downward */
337 for (y = y1; y >= y2; y--) {
339 p = &(data[(screenx * y) + xx]);
341 if (xx < (screenx - 1)) {
351 dy = ((dy << 16) / dx); /* slope scaled by 1 << 16, stepping along x */
353 for (x = x1; x <= x2; x++) {
355 p = &(data[(screenx * yy) + x]);
357 if (yy < (screeny - 1)) {
367 /* this was femms, which is AMD 3dnow */
368 __asm__ __volatile__ ("emms\n");
372 xmmx_supported (void)