2 #include <liboil/liboilfunction.h>
3 #include <liboil/liboilclasses.h>
/*
 * mas10_u8_mmx: multiply-and-sum filter over unsigned bytes using MMX,
 * writing four output bytes per pass of the assembly below.
 * The scalar statements give the per-output formula:
 *   CLAMP((sum of s1_np9[j]*s2_10[j] + s3_2[0]) >> s3_2[1], 0, 255)
 *
 *   d       destination bytes
 *   s1_np9  source bytes
 *   s2_10   filter taps (16-bit)
 *   s3_2    s3_2[0] is the additive offset, s3_2[1] the right-shift count
 *   n       count argument; how it is consumed is not visible in this excerpt
 *
 * NOTE(review): the vector path accumulates in 16-bit lanes (pmullw/paddw),
 * so it can only agree with the scalar form while sums fit in 16 bits.
 */
7 mas10_u8_mmx (uint8_t *d, const uint8_t *s1_np9, const int16_t *s2_10,
8 const int16_t *s3_2, int n)
/* Scalar form: accumulate tap products, then add offset, shift and clamp. */
16 x += s1_np9[j] * s2_10[j];
18 *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
26 __asm__ __volatile__("\n"
/* mm7 = 0: used to widen bytes to words and as the lower clamp bound. */
27 " pxor %%mm7, %%mm7\n"
/* mm6 = 32 bits read at s3_2; its low word is s3_2[0], the offset. */
29 " movd (%[s3_2]), %%mm6\n"
/* mm5 = s3_2[1] zero-extended: the shift count. */
31 " movzwl 2(%[s3_2]), %%ecx\n"
32 " movd %%ecx, %%mm5\n"
/* Seed the accumulator mm2 with the offset broadcast to all four words. */
36 " pshufw $0x00, %%mm6, %%mm2\n"
39 " movd " #x "(%[s1_np9]), %%mm0\n" \
40 " punpcklbw %%mm7, %%mm0\n" \
41 " movq 2*" #x "(%[s2_10]), %%mm1\n" \
42 " pshufw $0x00, %%mm1, %%mm1\n" \
43 " pmullw %%mm1, %%mm0\n" \
44 " paddw %%mm0, %%mm2\n"
/*
 * The six lines above are one tap step parameterised by x: load four source
 * bytes at offset x, widen them to words, broadcast tap x, multiply (low 16
 * bits kept) and add into mm2.
 * NOTE(review): the movq reads 8 bytes starting at tap x, i.e. three taps
 * beyond x; confirm the taps buffer is large enough for the highest x used.
 */
/* Shift right arithmetically, clamp below at 0, saturate to bytes, store 4. */
58 " psraw %%mm5, %%mm2\n"
59 " pmaxsw %%mm7, %%mm2\n"
60 " packuswb %%mm2, %%mm2\n"
61 " movd %%mm2, 0(%[d])\n"
/* Advance the source pointer past the four positions just processed. */
63 " add $4, %[s1_np9]\n"
/* s1_np9 is read and written by the asm; s2_10 is an input register. */
68 [s1_np9] "+r" (s1_np9),
70 : [s2_10] "r" (s2_10),
/* Declares mas10_u8_mmx as an implementation of mas10_u8_l15, flagged MMX|MMXEXT. */
74 OIL_DEFINE_IMPL_FULL (mas10_u8_mmx, mas10_u8_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
/*
 * mas10_u8_mmx_2: variant of the MMX multiply-and-sum filter that first
 * expands every tap into four identical 16-bit copies, so the inner steps
 * can multiply straight from memory instead of broadcasting in a register.
 * Per-output formula (from the scalar statements):
 *   CLAMP((sum of s1_np9[j]*s2_10[j] + s3_2[0]) >> s3_2[1], 0, 255)
 *
 *   d       destination bytes (four written per pass of the asm below)
 *   s1_np9  source bytes
 *   s2_10   filter taps (16-bit)
 *   s3_2    s3_2[0] is the additive offset, s3_2[1] the right-shift count
 *   n       count argument; how it is consumed is not visible in this excerpt
 */
78 mas10_u8_mmx_2 (uint8_t *d, const uint8_t *s1_np9, const int16_t *s2_10,
79 const int16_t *s3_2, int n)
/* Scalar form: accumulate tap products, then add offset, shift and clamp. */
91 x += s1_np9[j] * s2_10[j];
93 *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
/*
 * Replicate tap j into four consecutive 16-bit slots.
 * NOTE(review): ptr presumably points at the buffer passed to the asm as
 * [coeff]; its declaration is not visible here - confirm.
 */
100 ptr[4*j + 0] = s2_10[j];
101 ptr[4*j + 1] = s2_10[j];
102 ptr[4*j + 2] = s2_10[j];
103 ptr[4*j + 3] = s2_10[j];
108 __asm__ __volatile__("\n"
/* mm7 = 0: used to widen bytes to words and as the lower clamp bound. */
109 " pxor %%mm7, %%mm7\n"
/* mm6 = 32 bits read at s3_2; its low word is s3_2[0], the offset. */
111 " movd (%[s3_2]), %%mm6\n"
/* mm5 = s3_2[1] zero-extended: the shift count. */
113 " movzwl 2(%[s3_2]), %%ecx\n"
114 " movd %%ecx, %%mm5\n"
/* Seed the accumulator mm2 with the offset broadcast to all four words. */
118 " pshufw $0x00, %%mm6, %%mm2\n"
121 " movd " #x "(%[s1_np9]), %%mm0\n" \
122 " punpcklbw %%mm7, %%mm0\n" \
123 " pmullw 8*" #x "(%[coeff]), %%mm0\n" \
124 " paddw %%mm0, %%mm2\n"
/*
 * The four lines above are one tap step parameterised by x: load four source
 * bytes at offset x, widen them to words, multiply by the 8-byte entry x of
 * the coefficient table and add into mm2.
 */
/* Shift right arithmetically, clamp below at 0, saturate to bytes, store 4. */
138 " psraw %%mm5, %%mm2\n"
139 " pmaxsw %%mm7, %%mm2\n"
140 " packuswb %%mm2, %%mm2\n"
141 " movd %%mm2, 0(%[d])\n"
/* Advance the source pointer past the four positions just processed. */
143 " add $4, %[s1_np9]\n"
/* s1_np9 is both read and written by the asm. */
148 [s1_np9] "+r" (s1_np9),
/* Declares mas10_u8_mmx_2 as an implementation of mas10_u8_l15, flagged MMX|MMXEXT. */
154 OIL_DEFINE_IMPL_FULL (mas10_u8_mmx_2, mas10_u8_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
/*
 * mas10_u8_mmx_3: MMX multiply-and-sum filter that keeps taps 0..7 resident
 * in mm3/mm4 and unrolls the tap steps, writing four output bytes per pass.
 * Per-output formula (from the scalar statements):
 *   CLAMP((sum of s1_np9[j]*s2_10[j] + s3_2[0]) >> s3_2[1], 0, 255)
 *
 *   d       destination bytes
 *   s1_np9  source bytes
 *   s2_10   filter taps (16-bit)
 *   s3_2    s3_2[0] is the additive offset, s3_2[1] the right-shift count
 *   n       count argument; how it is consumed is not visible in this excerpt
 *
 * NOTE(review): accumulation is in 16-bit lanes (pmullw/paddw), so results
 * agree with the scalar form only while sums fit in 16 bits.
 */
158 mas10_u8_mmx_3 (uint8_t *d, const uint8_t *s1_np9, const int16_t *s2_10,
159 const int16_t *s3_2, int n)
/* Scalar form: accumulate tap products, then add offset, shift and clamp. */
167 x += s1_np9[j] * s2_10[j];
169 *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
177 __asm__ __volatile__("\n"
/* mm7 = 0: used to widen bytes to words and as the lower clamp bound. */
178 " pxor %%mm7, %%mm7\n"
/* mm6 = 32 bits read at s3_2; its low word is s3_2[0], the offset. */
180 " movd (%[s3_2]), %%mm6\n"
/* mm5 = s3_2[1] zero-extended: the shift count. */
182 " movzwl 2(%[s3_2]), %%ecx\n"
183 " movd %%ecx, %%mm5\n"
/* mm3 = taps 0..3, mm4 = taps 4..7. */
185 " movq 0(%[s2_10]), %%mm3\n"
186 " movq 8(%[s2_10]), %%mm4\n"
/* Seed the accumulator mm2 with the offset broadcast to all four words. */
190 " pshufw $0x00, %%mm6, %%mm2\n"
193 " movd " #x "(%[s1_np9]), %%mm0\n" \
194 " punpcklbw %%mm7, %%mm0\n" \
195 " movq 2*" #x "(%[s2_10]), %%mm1\n" \
196 " pshufw $0x00, %%mm1, %%mm1\n" \
197 " pmullw %%mm1, %%mm0\n" \
198 " paddw %%mm0, %%mm2\n"
/*
 * The six lines above are a tap step parameterised by x that fetches its tap
 * from memory. The unrolled steps below cover taps 0..7 from mm3/mm4.
 * NOTE(review): taps 8 and 9 are not among the unrolled steps visible here;
 * presumably they go through the parameterised step - confirm.
 */
/* Tap 0: source bytes at offset 0 times word 0 of mm3. */
201 " movd 0(%[s1_np9]), %%mm0\n"
202 " punpcklbw %%mm7, %%mm0\n"
203 " pshufw $0x00, %%mm3, %%mm1\n"
204 " pmullw %%mm1, %%mm0\n"
205 " paddw %%mm0, %%mm2\n"
/* Tap 1: selector 0x55*k repeats index k in all four fields, broadcasting word k. */
208 " movd 1(%[s1_np9]), %%mm0\n"
209 " punpcklbw %%mm7, %%mm0\n"
210 " pshufw $0x55*1, %%mm3, %%mm1\n"
211 " pmullw %%mm1, %%mm0\n"
212 " paddw %%mm0, %%mm2\n"
/* Tap 2: word 2 of mm3. */
215 " movd 2(%[s1_np9]), %%mm0\n"
216 " punpcklbw %%mm7, %%mm0\n"
217 " pshufw $0x55*2, %%mm3, %%mm1\n"
218 " pmullw %%mm1, %%mm0\n"
219 " paddw %%mm0, %%mm2\n"
/* Tap 3: word 3 of mm3. */
222 " movd 3(%[s1_np9]), %%mm0\n"
223 " punpcklbw %%mm7, %%mm0\n"
224 " pshufw $0x55*3, %%mm3, %%mm1\n"
225 " pmullw %%mm1, %%mm0\n"
226 " paddw %%mm0, %%mm2\n"
/* Tap 4: word 0 of mm4. */
229 " movd 4(%[s1_np9]), %%mm0\n"
230 " punpcklbw %%mm7, %%mm0\n"
231 " pshufw $0x00, %%mm4, %%mm1\n"
232 " pmullw %%mm1, %%mm0\n"
233 " paddw %%mm0, %%mm2\n"
/* Tap 5: word 1 of mm4. */
236 " movd 5(%[s1_np9]), %%mm0\n"
237 " punpcklbw %%mm7, %%mm0\n"
238 " pshufw $0x55*1, %%mm4, %%mm1\n"
239 " pmullw %%mm1, %%mm0\n"
240 " paddw %%mm0, %%mm2\n"
/* Tap 6: word 2 of mm4. */
243 " movd 6(%[s1_np9]), %%mm0\n"
244 " punpcklbw %%mm7, %%mm0\n"
245 " pshufw $0x55*2, %%mm4, %%mm1\n"
246 " pmullw %%mm1, %%mm0\n"
247 " paddw %%mm0, %%mm2\n"
/* Tap 7: word 3 of mm4. */
250 " movd 7(%[s1_np9]), %%mm0\n"
251 " punpcklbw %%mm7, %%mm0\n"
252 " pshufw $0x55*3, %%mm4, %%mm1\n"
253 " pmullw %%mm1, %%mm0\n"
254 " paddw %%mm0, %%mm2\n"
/* Shift right arithmetically, clamp below at 0, saturate to bytes, store 4. */
260 " psraw %%mm5, %%mm2\n"
261 " pmaxsw %%mm7, %%mm2\n"
262 " packuswb %%mm2, %%mm2\n"
263 " movd %%mm2, 0(%[d])\n"
/* Advance the source pointer past the four positions just processed. */
265 " add $4, %[s1_np9]\n"
/* s1_np9 is read and written by the asm; s2_10 is an input register. */
270 [s1_np9] "+r" (s1_np9),
272 : [s2_10] "r" (s2_10),
/* Declares mas10_u8_mmx_3 as an implementation of mas10_u8_l15, flagged MMX|MMXEXT. */
276 OIL_DEFINE_IMPL_FULL (mas10_u8_mmx_3, mas10_u8_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
/*
 * mas10_u8_mmx_4: MMX multiply-and-sum filter that produces ONE output byte
 * per pass, using pmaddwd so products are summed in 32-bit lanes.
 *
 *   d       destination bytes (one written per pass of the asm below)
 *   s1_np9  source bytes
 *   s2_10   filter taps (16-bit)
 *   s3_2    s3_2[0] is the additive offset, s3_2[1] the right-shift count
 *   n       count argument; how it is consumed is not visible in this excerpt
 */
279 mas10_u8_mmx_4 (uint8_t *d, const uint8_t *s1_np9, const int16_t *s2_10,
280 const int16_t *s3_2, int n)
283 __asm__ __volatile__("\n"
/* mm7 = 0: used to widen bytes to words and as the lower clamp bound. */
284 " pxor %%mm7, %%mm7\n"
/*
 * mm6 = s3_2[0] zero-extended to 32 bits (the offset).
 * NOTE(review): movzwl zero-extends, so a negative s3_2[0] would be added as
 * a large positive value by the later paddd - confirm offsets are never negative.
 */
286 " movzwl 0(%[s3_2]), %%ecx\n"
287 " movd %%ecx, %%mm6\n"
/* mm5 = s3_2[1] zero-extended: the shift count. */
289 " movzwl 2(%[s3_2]), %%ecx\n"
290 " movd %%ecx, %%mm5\n"
/* Source bytes 0..3 widened to words, multiplied pairwise with taps 0..3. */
293 " movd 0(%[s1_np9]), %%mm0\n"
294 " punpcklbw %%mm7, %%mm0\n"
295 " pmaddwd 0(%[s2_10]), %%mm0\n"
/* Source bytes 4..7 with taps 4..7. */
297 " movd 4(%[s1_np9]), %%mm1\n"
298 " punpcklbw %%mm7, %%mm1\n"
299 " pmaddwd 8(%[s2_10]), %%mm1\n"
/*
 * Source bytes 8..11 with the 8 bytes of taps starting at s2_10[8].
 * Only the low dword of this product (positions 8 and 9) is folded into the
 * result below; the high dword is never added to the low one.
 * NOTE(review): the loads still touch s1_np9[10..11] and s2_10[10..11] -
 * confirm the callers' buffers allow that.
 */
301 " movd 8(%[s1_np9]), %%mm2\n"
302 " punpcklbw %%mm7, %%mm2\n"
303 " pmaddwd 16(%[s2_10]), %%mm2\n"
/* Fold: add the two 8-tap halves, then high dword onto low dword. */
305 " paddd %%mm1, %%mm0\n"
306 " movq %%mm0, %%mm1\n"
307 " psrlq $32, %%mm0\n"
308 " paddd %%mm1, %%mm0\n"
/* Add the products for positions 8/9 and the offset. */
309 " paddd %%mm2, %%mm0\n"
310 " paddd %%mm6, %%mm0\n"
/*
 * 32-bit arithmetic shift, then clamp and saturate via 16-bit operations.
 * NOTE(review): pmaxsw/packuswb look only at 16-bit words, so the clamp is
 * exact only when the shifted value fits in a signed 16-bit word.
 */
312 " psrad %%mm5, %%mm0\n"
313 " pmaxsw %%mm7, %%mm0\n"
314 " packuswb %%mm0, %%mm0\n"
/* Store just the lowest byte of the result. */
315 " movd %%mm0, %%ecx\n"
316 " movb %%cl,0(%[d])\n"
/* Advance the source pointer by one position. */
319 " add $1, %[s1_np9]\n"
/* s1_np9 is read and written by the asm; s2_10 is an input register. */
324 [s1_np9] "+r" (s1_np9),
326 : [s2_10] "r" (s2_10),
/* Declares mas10_u8_mmx_4 as an implementation of mas10_u8, flagged MMX|MMXEXT. */
330 OIL_DEFINE_IMPL_FULL (mas10_u8_mmx_4, mas10_u8, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
/*
 * mas8_u8_mmx_3: 8-tap MMX multiply-and-sum filter with all taps held in
 * mm3/mm4 and the tap steps fully unrolled; four output bytes per pass.
 * Per-output formula (from the scalar statements):
 *   CLAMP((sum of s1_np7[j]*s2_8[j] + s3_2[0]) >> s3_2[1], 0, 255)
 *
 *   d       destination bytes
 *   s1_np7  source bytes
 *   s2_8    filter taps (16-bit)
 *   s3_2    s3_2[0] is the additive offset, s3_2[1] the right-shift count
 *   n       count argument; how it is consumed is not visible in this excerpt
 *
 * NOTE(review): accumulation is in 16-bit lanes (pmullw/paddw), so results
 * agree with the scalar form only while sums fit in 16 bits.
 */
334 mas8_u8_mmx_3 (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
335 const int16_t *s3_2, int n)
/* Scalar form: accumulate tap products, then add offset, shift and clamp. */
343 x += s1_np7[j] * s2_8[j];
345 *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
353 __asm__ __volatile__("\n"
/* mm7 = 0: used to widen bytes to words and as the lower clamp bound. */
354 " pxor %%mm7, %%mm7\n"
/* mm6 = 32 bits read at s3_2; its low word is s3_2[0], the offset. */
356 " movd (%[s3_2]), %%mm6\n"
/* mm5 = s3_2[1] zero-extended: the shift count. */
358 " movzwl 2(%[s3_2]), %%ecx\n"
359 " movd %%ecx, %%mm5\n"
/* mm3 = taps 0..3, mm4 = taps 4..7. */
361 " movq 0(%[s2_8]), %%mm3\n"
362 " movq 8(%[s2_8]), %%mm4\n"
/* Seed the accumulator mm2 with the offset broadcast to all four words. */
366 " pshufw $0x00, %%mm6, %%mm2\n"
/* Tap 0: four source bytes at offset 0, widened, times word 0 of mm3. */
368 " movd 0(%[s1_np7]), %%mm0\n"
369 " punpcklbw %%mm7, %%mm0\n"
370 " pshufw $0x00, %%mm3, %%mm1\n"
371 " pmullw %%mm1, %%mm0\n"
372 " paddw %%mm0, %%mm2\n"
/* Tap 1: selector 0x55*k repeats index k in all four fields, broadcasting word k. */
374 " movd 1(%[s1_np7]), %%mm0\n"
375 " punpcklbw %%mm7, %%mm0\n"
376 " pshufw $0x55*1, %%mm3, %%mm1\n"
377 " pmullw %%mm1, %%mm0\n"
378 " paddw %%mm0, %%mm2\n"
/* Tap 2: word 2 of mm3. */
380 " movd 2(%[s1_np7]), %%mm0\n"
381 " punpcklbw %%mm7, %%mm0\n"
382 " pshufw $0x55*2, %%mm3, %%mm1\n"
383 " pmullw %%mm1, %%mm0\n"
384 " paddw %%mm0, %%mm2\n"
/* Tap 3: word 3 of mm3. */
386 " movd 3(%[s1_np7]), %%mm0\n"
387 " punpcklbw %%mm7, %%mm0\n"
388 " pshufw $0x55*3, %%mm3, %%mm1\n"
389 " pmullw %%mm1, %%mm0\n"
390 " paddw %%mm0, %%mm2\n"
/* Tap 4: word 0 of mm4. */
392 " movd 4(%[s1_np7]), %%mm0\n"
393 " punpcklbw %%mm7, %%mm0\n"
394 " pshufw $0x00, %%mm4, %%mm1\n"
395 " pmullw %%mm1, %%mm0\n"
396 " paddw %%mm0, %%mm2\n"
/* Tap 5: word 1 of mm4. */
398 " movd 5(%[s1_np7]), %%mm0\n"
399 " punpcklbw %%mm7, %%mm0\n"
400 " pshufw $0x55*1, %%mm4, %%mm1\n"
401 " pmullw %%mm1, %%mm0\n"
402 " paddw %%mm0, %%mm2\n"
/* Tap 6: word 2 of mm4. */
404 " movd 6(%[s1_np7]), %%mm0\n"
405 " punpcklbw %%mm7, %%mm0\n"
406 " pshufw $0x55*2, %%mm4, %%mm1\n"
407 " pmullw %%mm1, %%mm0\n"
408 " paddw %%mm0, %%mm2\n"
/* Tap 7: word 3 of mm4. */
410 " movd 7(%[s1_np7]), %%mm0\n"
411 " punpcklbw %%mm7, %%mm0\n"
412 " pshufw $0x55*3, %%mm4, %%mm1\n"
413 " pmullw %%mm1, %%mm0\n"
414 " paddw %%mm0, %%mm2\n"
/* Shift right arithmetically, clamp below at 0, saturate to bytes, store 4. */
416 " psraw %%mm5, %%mm2\n"
417 " pmaxsw %%mm7, %%mm2\n"
418 " packuswb %%mm2, %%mm2\n"
419 " movd %%mm2, 0(%[d])\n"
/* Advance the source pointer past the four positions just processed. */
421 " add $4, %[s1_np7]\n"
/* s1_np7 is both read and written by the asm. */
426 [s1_np7] "+r" (s1_np7),
/* Declares mas8_u8_mmx_3 as an implementation of mas8_u8_l15, flagged MMX|MMXEXT. */
432 OIL_DEFINE_IMPL_FULL (mas8_u8_mmx_3, mas8_u8_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
/*
 * mas8_u8_mmx_4: 8-tap MMX multiply-and-sum filter that produces ONE output
 * byte per pass, using pmaddwd so products are summed in 32-bit lanes.
 *
 *   d       destination bytes (one written per pass of the asm below)
 *   s1_np7  source bytes
 *   s2_8    filter taps (16-bit)
 *   s3_2    s3_2[0] is the additive offset, s3_2[1] the right-shift count
 *   n       count argument; how it is consumed is not visible in this excerpt
 */
435 mas8_u8_mmx_4 (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
436 const int16_t *s3_2, int n)
439 __asm__ __volatile__("\n"
/* mm7 = 0: used to widen bytes to words and as the lower clamp bound. */
440 " pxor %%mm7, %%mm7\n"
/*
 * mm6 = s3_2[0] zero-extended to 32 bits (the offset).
 * NOTE(review): zero-extension mishandles a negative offset - confirm
 * offsets are never negative.
 */
442 " movzwl 0(%[s3_2]), %%ecx\n"
443 " movd %%ecx, %%mm6\n"
/* mm5 = s3_2[1] zero-extended: the shift count. */
445 " movzwl 2(%[s3_2]), %%ecx\n"
446 " movd %%ecx, %%mm5\n"
/* Source bytes 0..3 widened to words, multiplied pairwise with taps 0..3. */
449 " movd 0(%[s1_np7]), %%mm0\n"
450 " punpcklbw %%mm7, %%mm0\n"
451 " pmaddwd 0(%[s2_8]), %%mm0\n"
/* Source bytes 4..7 with taps 4..7. */
453 " movd 4(%[s1_np7]), %%mm1\n"
454 " punpcklbw %%mm7, %%mm1\n"
455 " pmaddwd 8(%[s2_8]), %%mm1\n"
/* Fold the partial sums: add halves, then high dword onto low dword. */
457 " paddd %%mm1, %%mm0\n"
458 " movq %%mm0, %%mm1\n"
459 " psrlq $32, %%mm0\n"
460 " paddd %%mm1, %%mm0\n"
/* Add the offset. */
461 " paddd %%mm6, %%mm0\n"
/*
 * 32-bit arithmetic shift, then clamp and saturate via 16-bit operations.
 * NOTE(review): the clamp is exact only when the shifted value fits in a
 * signed 16-bit word.
 */
463 " psrad %%mm5, %%mm0\n"
464 " pmaxsw %%mm7, %%mm0\n"
465 " packuswb %%mm0, %%mm0\n"
/* Store just the lowest byte of the result. */
466 " movd %%mm0, %%ecx\n"
467 " movb %%cl,0(%[d])\n"
/* Advance the source pointer by one position. */
470 " add $1, %[s1_np7]\n"
/* s1_np7 is both read and written by the asm. */
475 [s1_np7] "+r" (s1_np7),
/* Declares mas8_u8_mmx_4 as an implementation of mas8_u8, flagged MMX|MMXEXT. */
481 OIL_DEFINE_IMPL_FULL (mas8_u8_mmx_4, mas8_u8, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
/*
 * mas8_u8_sym_mmx_3: 8-tap MMX filter for symmetric taps. Source positions
 * k and 7-k are added first, so only four tap steps are needed; four output
 * bytes are written per pass.
 * Per-output formula (from the scalar statements):
 *   CLAMP((sum of s1_np7[j]*s2_8[j] + s3_2[0]) >> s3_2[1], 0, 255)
 *
 *   d       destination bytes
 *   s1_np7  source bytes
 *   s2_8    filter taps (16-bit)
 *   s3_2    s3_2[0] is the additive offset, s3_2[1] the right-shift count
 *   n       count argument; how it is consumed is not visible in this excerpt
 *
 * NOTE(review): the outermost pair (positions 0 and 7) is subtracted from the
 * accumulator instead of being multiplied by s2_8[0], which equals the scalar
 * form only when s2_8[0] is -1 - confirm callers guarantee that.
 */
484 mas8_u8_sym_mmx_3 (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
485 const int16_t *s3_2, int n)
/* Scalar form: accumulate tap products, then add offset, shift and clamp. */
493 x += s1_np7[j] * s2_8[j];
495 *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
503 __asm__ __volatile__("\n"
/* mm7 = 0: used to widen bytes to words and as the lower clamp bound. */
504 " pxor %%mm7, %%mm7\n"
/* mm6 = 32 bits read at s3_2; its low word is s3_2[0], the offset. */
506 " movd (%[s3_2]), %%mm6\n"
/* mm5 = s3_2[1] zero-extended: the shift count. */
508 " movzwl 2(%[s3_2]), %%ecx\n"
509 " movd %%ecx, %%mm5\n"
/* mm3 = taps 0..3, mm4 = taps 4..7 (mm4 is not referenced again in the visible lines). */
511 " movq 0(%[s2_8]), %%mm3\n"
512 " movq 8(%[s2_8]), %%mm4\n"
/* Seed the accumulator mm2 with the offset broadcast to all four words. */
517 " pshufw $0x00, %%mm6, %%mm2\n"
/* Pair (0,7): summed, then subtracted from the accumulator (no multiply). */
519 " movd 0(%[s1_np7]), %%mm0\n"
520 " punpcklbw %%mm7, %%mm0\n"
521 " movd 7(%[s1_np7]), %%mm1\n"
522 " punpcklbw %%mm7, %%mm1\n"
523 " paddw %%mm1, %%mm0\n"
524 //" pshufw $0x00, %%mm3, %%mm1\n"
525 //" pmullw %%mm1, %%mm0\n"
526 //" paddw %%mm0, %%mm2\n"
527 " psubw %%mm0, %%mm2\n"
/* Pair (1,6): summed, times word 1 of mm3. */
529 " movd 1(%[s1_np7]), %%mm0\n"
530 " punpcklbw %%mm7, %%mm0\n"
531 " movd 6(%[s1_np7]), %%mm1\n"
532 " punpcklbw %%mm7, %%mm1\n"
533 " paddw %%mm1, %%mm0\n"
534 " pshufw $0x55*1, %%mm3, %%mm1\n"
535 " pmullw %%mm1, %%mm0\n"
536 " paddw %%mm0, %%mm2\n"
/* Pair (2,5): summed, times word 2 of mm3. */
538 " movd 2(%[s1_np7]), %%mm0\n"
539 " punpcklbw %%mm7, %%mm0\n"
540 " movd 5(%[s1_np7]), %%mm1\n"
541 " punpcklbw %%mm7, %%mm1\n"
542 " paddw %%mm1, %%mm0\n"
543 " pshufw $0x55*2, %%mm3, %%mm1\n"
544 " pmullw %%mm1, %%mm0\n"
545 " paddw %%mm0, %%mm2\n"
/* Pair (3,4): summed, times word 3 of mm3. */
547 " movd 3(%[s1_np7]), %%mm0\n"
548 " punpcklbw %%mm7, %%mm0\n"
549 " movd 4(%[s1_np7]), %%mm1\n"
550 " punpcklbw %%mm7, %%mm1\n"
551 " paddw %%mm1, %%mm0\n"
552 " pshufw $0x55*3, %%mm3, %%mm1\n"
553 " pmullw %%mm1, %%mm0\n"
554 " paddw %%mm0, %%mm2\n"
/* Shift right arithmetically, clamp below at 0, saturate to bytes, store 4. */
556 " psraw %%mm5, %%mm2\n"
557 " pmaxsw %%mm7, %%mm2\n"
558 " packuswb %%mm2, %%mm2\n"
559 " movd %%mm2, 0(%[d])\n"
/* Advance the source pointer past the four positions just processed. */
561 " add $4, %[s1_np7]\n"
/* s1_np7 is both read and written by the asm. */
566 [s1_np7] "+r" (s1_np7),
/* Declares mas8_u8_sym_mmx_3 as an implementation of mas8_u8_sym_l15, flagged MMX|MMXEXT. */
572 OIL_DEFINE_IMPL_FULL (mas8_u8_sym_mmx_3, mas8_u8_sym_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
/*
 * mas8_u8_sym_mmx_41: 8-tap MMX filter for symmetric taps that pre-broadcasts
 * taps 0..3 into a memory table, then multiplies each summed pixel pair
 * (positions k and 7-k) directly from that table. Four output bytes per pass.
 * Per-output formula (from the scalar statements):
 *   CLAMP((sum of s1_np7[j]*s2_8[j] + s3_2[0]) >> s3_2[1], 0, 255)
 *
 *   d       destination bytes
 *   s1_np7  source bytes
 *   s2_8    filter taps (16-bit); only taps 0..3 are read by the visible asm
 *   s3_2    s3_2[0] is the additive offset, s3_2[1] the right-shift count
 *   n       count argument; how it is consumed is not visible in this excerpt
 *
 * NOTE(review): the second asm statement uses mm5/mm6/mm7 as set by the first
 * one, so it relies on MMX registers surviving between the two statements.
 */
575 mas8_u8_sym_mmx_41 (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
576 const int16_t *s3_2, int n)
/* Scalar form: accumulate tap products, then add offset, shift and clamp. */
585 x += s1_np7[j] * s2_8[j];
587 *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
/* First asm statement: set up constants and the broadcast tap table. */
595 __asm__ __volatile__("\n"
/* mm7 = 0: used to widen bytes to words and as the lower clamp bound. */
596 " pxor %%mm7, %%mm7\n"
/* mm6 = 32 bits read at s3_2; its low word is s3_2[0], the offset. */
598 " movd (%[s3_2]), %%mm6\n"
/* mm5 = s3_2[1] zero-extended: the shift count. */
600 " movzwl 2(%[s3_2]), %%ecx\n"
601 " movd %%ecx, %%mm5\n"
/* Write tap k, replicated to four words, at byte offset 8*k of [coeff]. */
603 " movq 0(%[s2_8]), %%mm3\n"
604 " pshufw $0x55*0, %%mm3, %%mm1\n"
605 " movq %%mm1, 0(%[coeff])\n"
606 " pshufw $0x55*1, %%mm3, %%mm1\n"
607 " movq %%mm1, 8(%[coeff])\n"
608 " pshufw $0x55*2, %%mm3, %%mm1\n"
609 " movq %%mm1, 16(%[coeff])\n"
610 " pshufw $0x55*3, %%mm3, %%mm1\n"
611 " movq %%mm1, 24(%[coeff])\n"
/* Second asm statement: the filtering work itself. */
618 __asm__ __volatile__("\n"
/* Seed the accumulator mm2 with the offset broadcast to all four words. */
622 " pshufw $0x00, %%mm6, %%mm2\n"
/* Pair (0,7): summed, times table entry 0. */
624 " movd 0(%[s1_np7]), %%mm0\n"
625 " punpcklbw %%mm7, %%mm0\n"
626 " movd 7(%[s1_np7]), %%mm1\n"
627 " punpcklbw %%mm7, %%mm1\n"
628 " paddw %%mm1, %%mm0\n"
629 " pmullw 0(%[coeff]), %%mm0\n"
630 " paddw %%mm0, %%mm2\n"
/* Pair (1,6): summed, times table entry 1. */
632 " movd 1(%[s1_np7]), %%mm0\n"
633 " punpcklbw %%mm7, %%mm0\n"
634 " movd 6(%[s1_np7]), %%mm1\n"
635 " punpcklbw %%mm7, %%mm1\n"
636 " paddw %%mm1, %%mm0\n"
637 " pmullw 8(%[coeff]), %%mm0\n"
638 " paddw %%mm0, %%mm2\n"
/* Pair (2,5): summed, times table entry 2. */
640 " movd 2(%[s1_np7]), %%mm0\n"
641 " punpcklbw %%mm7, %%mm0\n"
642 " movd 5(%[s1_np7]), %%mm1\n"
643 " punpcklbw %%mm7, %%mm1\n"
644 " paddw %%mm1, %%mm0\n"
645 " pmullw 16(%[coeff]), %%mm0\n"
646 " paddw %%mm0, %%mm2\n"
/* Pair (3,4): summed, times table entry 3. */
648 " movd 3(%[s1_np7]), %%mm0\n"
649 " punpcklbw %%mm7, %%mm0\n"
650 " movd 4(%[s1_np7]), %%mm1\n"
651 " punpcklbw %%mm7, %%mm1\n"
652 " paddw %%mm1, %%mm0\n"
653 " pmullw 24(%[coeff]), %%mm0\n"
654 " paddw %%mm0, %%mm2\n"
/* Shift right arithmetically, clamp below at 0, saturate to bytes, store 4. */
656 " psraw %%mm5, %%mm2\n"
657 " pmaxsw %%mm7, %%mm2\n"
658 " packuswb %%mm2, %%mm2\n"
659 " movd %%mm2, 0(%[d])\n"
/* Advance the source pointer past the four positions just processed. */
661 " add $4, %[s1_np7]\n"
/* s1_np7 is both read and written by the asm. */
666 [s1_np7] "+r" (s1_np7),
/* Declares mas8_u8_sym_mmx_41 as an implementation of mas8_u8_sym_l15, flagged MMX|MMXEXT. */
672 OIL_DEFINE_IMPL_FULL (mas8_u8_sym_mmx_41, mas8_u8_sym_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
/*
 * pshufw selector immediates, as string fragments: 0xe4 keeps the four words
 * in place (fields 3,2,1,0) and 0x1b reverses them (fields 0,1,2,3).
 * NOTE(review): the visible asm spells these immediates literally rather than
 * using the macros - confirm whether they are still referenced.
 */
675 #define PSHUFW_3210 "0xe4"
676 #define PSHUFW_0123 "0x1b"
/*
 * mas8_u8_sym_mmx_5: 8-tap MMX filter for symmetric taps that produces ONE
 * output byte per pass, summing products in 32-bit lanes with pmaddwd.
 *
 *   d       destination bytes (one written per pass of the asm below)
 *   s1_np7  source bytes
 *   s2_8    filter taps (16-bit)
 *   s3_2    s3_2[0] is the additive offset, s3_2[1] the right-shift count
 *   n       count argument; how it is consumed is not visible in this excerpt
 *
 * NOTE(review): two different product sequences appear back to back below and
 * the embedded line numbers skip between them; they are presumably the two
 * arms of a preprocessor conditional whose directives are not visible here.
 * Confirm against the full file before changing either one.
 */
679 mas8_u8_sym_mmx_5 (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
680 const int16_t *s3_2, int n)
683 __asm__ __volatile__("\n"
/* mm7 = 0: used to widen bytes to words and as the lower clamp bound. */
684 " pxor %%mm7, %%mm7\n"
/* mm6 = s3_2[0] zero-extended, then copied into both 32-bit halves. */
686 " movzwl 0(%[s3_2]), %%ecx\n"
687 " movd %%ecx, %%mm6\n"
688 " pshufw $0x44, %%mm6, %%mm6\n" // 01 00 01 00
/* mm5 = s3_2[1] zero-extended: the shift count. */
690 " movzwl 2(%[s3_2]), %%ecx\n"
691 " movd %%ecx, %%mm5\n"
/* Source bytes 0..3 widened to words. */
697 " movd 0(%[s1_np7]), %%mm0\n"
698 " punpcklbw %%mm7, %%mm0\n"
/*
 * Sequence A (symmetric): bytes 4..7 widened and word-reversed so that
 * position 7-k lines up with position k, added, then one pmaddwd with taps 0..3.
 */
700 " movd 4(%[s1_np7]), %%mm1\n"
701 " punpcklbw %%mm7, %%mm1\n"
702 " pshufw $0x1b, %%mm1, %%mm1\n" // 00 01 10 11
703 " paddw %%mm1, %%mm0\n"
704 " pmaddwd 0(%[s2_8]), %%mm0\n"
/*
 * Sequence B (general): separate pmaddwd for bytes 0..3 with taps 0..3 and
 * bytes 4..7 with taps 4..7, then add the two.
 */
706 " pmaddwd 0(%[s2_8]), %%mm0\n"
708 " movd 4(%[s1_np7]), %%mm1\n"
709 " punpcklbw %%mm7, %%mm1\n"
710 " pmaddwd 8(%[s2_8]), %%mm1\n"
711 " paddd %%mm1, %%mm0\n"
/* Fold the high dword onto the low dword, then add the offset. */
714 " pshufw $0xee, %%mm0, %%mm1\n" // 11 10 11 10
715 " paddd %%mm1, %%mm0\n"
716 " paddd %%mm6, %%mm0\n"
/* 32-bit arithmetic shift, clamp at 0 and saturate via 16-bit operations. */
718 " psrad %%mm5, %%mm0\n"
719 " pmaxsw %%mm7, %%mm0\n"
720 " packuswb %%mm0, %%mm0\n"
/* Store just the lowest byte of the result. */
721 " movd %%mm0, %%ecx\n"
722 " movb %%cl,0(%[d])\n"
/* Advance the source pointer by one position. */
725 " add $1, %[s1_np7]\n"
/* s1_np7 is both read and written by the asm. */
732 [s1_np7] "+r" (s1_np7),
/* Declares mas8_u8_sym_mmx_5 as an implementation of mas8_u8_sym_l15, flagged MMX|MMXEXT. */
738 OIL_DEFINE_IMPL_FULL (mas8_u8_sym_mmx_5, mas8_u8_sym_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
/*
 * Compiled only when HAVE_SSSE3_ASM is defined (the matching #endif is not
 * visible in this excerpt).
 *
 * mas8_u8_sym_mmx_6: 8-tap filter that packs the taps to signed bytes and
 * uses SSSE3 pmaddubsw on eight source bytes at once; ONE output byte per pass.
 *
 *   d       destination bytes (one written per pass of the asm below)
 *   s1_np7  source bytes
 *   s2_8    filter taps (16-bit), saturated to signed 8-bit by packsswb
 *   s3_2    s3_2[0] is the additive offset, s3_2[1] the right-shift count
 *   n       count argument; how it is consumed is not visible in this excerpt
 *
 * NOTE(review): two horizontal-sum sequences (pshufw/paddw and phaddw) appear
 * back to back and the embedded line numbers skip between them; presumably
 * they are alternatives selected by a preprocessor conditional not visible here.
 */
740 #ifdef HAVE_SSSE3_ASM
742 mas8_u8_sym_mmx_6 (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
743 const int16_t *s3_2, int n)
750 //coeff[i] = s2_8[i];
756 __asm__ __volatile__("\n"
/* mm7 = 0: used as the lower clamp bound. */
757 " pxor %%mm7, %%mm7\n"
/* mm6 = s3_2[0] zero-extended, then copied into both 32-bit halves. */
759 " movzwl 0(%[s3_2]), %%ecx\n"
760 " movd %%ecx, %%mm6\n"
761 " pshufw $0x44, %%mm6, %%mm6\n" // 01 00 01 00
/* mm5 = s3_2[1] zero-extended: the shift count. */
763 " movzwl 2(%[s3_2]), %%ecx\n"
764 " movd %%ecx, %%mm5\n"
/* mm4 = all eight taps packed to signed bytes (with signed saturation). */
766 " movq 0(%[s2_8]), %%mm4\n"
767 " packsswb 8(%[s2_8]), %%mm4\n"
/*
 * Load eight source bytes; pmaddubsw multiplies them (as unsigned) by the
 * signed tap bytes and adds adjacent products into four saturated words.
 */
770 " movq 0(%[s1_np7]), %%mm0\n"
771 " pmaddubsw %%mm4, %%mm0\n"
/* Horizontal sum, shuffle form: fold upper two words down, then word 1 onto word 0. */
774 " pshufw $0xee, %%mm0, %%mm1\n" // 11 10 11 10
775 " paddw %%mm1, %%mm0\n"
776 " pshufw $0x55, %%mm0, %%mm1\n" // 01 01 01 01
777 " paddw %%mm1, %%mm0\n"
/* Horizontal sum, phaddw form. */
779 " phaddw %%mm0, %%mm0\n"
780 " phaddw %%mm0, %%mm0\n"
/* Add offset, shift, clamp below at 0, saturate to bytes. */
783 " paddw %%mm6, %%mm0\n"
784 " psraw %%mm5, %%mm0\n"
785 " pmaxsw %%mm7, %%mm0\n"
786 " packuswb %%mm0, %%mm0\n"
/* Store just the lowest byte of the result. */
787 " movd %%mm0, %%ecx\n"
788 " movb %%cl,0(%[d])\n"
/* Advance the source pointer by one position. */
791 " add $1, %[s1_np7]\n"
/* s1_np7 is both read and written by the asm. */
797 [s1_np7] "+r" (s1_np7),
/* Declares mas8_u8_sym_mmx_6 as an implementation of mas8_u8_sym_l15, flagged MMX|MMXEXT|SSSE3. */
803 OIL_DEFINE_IMPL_FULL (mas8_u8_sym_mmx_6, mas8_u8_sym_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT|OIL_IMPL_FLAG_SSSE3);
/*
 * Compiled only when ENABLE_BROKEN_IMPLS is defined (the matching #endif is
 * not visible in this excerpt).
 *
 * mas8_u8_supersym_mmx: multiply-free 8-tap filter built from adds and
 * subtracts of summed pixel pairs; four output bytes per pass.
 * In the visible lines the taps loaded into mm3/mm4 are overwritten by pixel
 * data before being used, so s2_8 does not influence the result.
 *
 *   d       destination bytes
 *   s1_np7  source bytes
 *   s2_8    filter taps (loaded, but see above)
 *   s3_2    s3_2[0] is the additive offset, s3_2[1] the right-shift count
 *   n       count argument; how it is consumed is not visible in this excerpt
 *
 * NOTE(review): the embedded line numbers show gaps inside the arithmetic
 * section, so steps may be missing from this excerpt - consult the full file
 * before reasoning about the exact result.
 */
806 #ifdef ENABLE_BROKEN_IMPLS
807 /* This only works for the taps array: -1, 3, -7, 21, 21, -7, 3, -1 */
809 mas8_u8_supersym_mmx (uint8_t *d, const uint8_t *s1_np7, const int16_t *s2_8,
810 const int16_t *s3_2, int n)
/* Scalar form: accumulate tap products, then add offset, shift and clamp. */
818 x += s1_np7[j] * s2_8[j];
820 *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
827 __asm__ __volatile__("\n"
/* mm7 = 0: used to widen bytes to words and as the lower clamp bound. */
828 " pxor %%mm7, %%mm7\n"
/* mm6 = offset s3_2[0] broadcast to all four words. */
830 " movd (%[s3_2]), %%mm6\n"
831 " pshufw $0x00, %%mm6, %%mm6\n"
/* mm5 = s3_2[1] zero-extended: the shift count. */
833 " movzwl 2(%[s3_2]), %%ecx\n"
834 " movd %%ecx, %%mm5\n"
/* Taps loaded into mm3/mm4; both registers are reused for data further down. */
836 " movq 0(%[s2_8]), %%mm3\n"
837 " movq 8(%[s2_8]), %%mm4\n"
/* mm0 = positions 0 and 7 summed. */
840 " movd 0(%[s1_np7]), %%mm0\n"
841 " punpcklbw %%mm7, %%mm0\n"
842 " movd 7(%[s1_np7]), %%mm1\n"
843 " punpcklbw %%mm7, %%mm1\n"
844 " paddw %%mm1, %%mm0\n"
/* mm2 = positions 1 and 6 summed (mm3 is overwritten here). */
846 " movd 1(%[s1_np7]), %%mm2\n"
847 " punpcklbw %%mm7, %%mm2\n"
848 " movd 6(%[s1_np7]), %%mm3\n"
849 " punpcklbw %%mm7, %%mm3\n"
850 " paddw %%mm3, %%mm2\n"
/* Combine the outer two pair sums by add/subtract; keep the result in mm4. */
852 " paddw %%mm2, %%mm0\n"
854 " psubw %%mm0, %%mm2\n"
855 " movq %%mm2, %%mm4\n"
/* mm0 = positions 2 and 5 summed. */
857 " movd 2(%[s1_np7]), %%mm0\n"
858 " punpcklbw %%mm7, %%mm0\n"
859 " movd 5(%[s1_np7]), %%mm1\n"
860 " punpcklbw %%mm7, %%mm1\n"
861 " paddw %%mm1, %%mm0\n"
/* mm2 = positions 3 and 4 summed. */
863 " movd 3(%[s1_np7]), %%mm2\n"
864 " punpcklbw %%mm7, %%mm2\n"
865 " movd 4(%[s1_np7]), %%mm3\n"
866 " punpcklbw %%mm7, %%mm3\n"
867 " paddw %%mm3, %%mm2\n"
/* Combine the inner two pair sums, then merge with the saved outer result. */
869 " paddw %%mm2, %%mm0\n"
871 " psubw %%mm0, %%mm2\n"
873 " psubw %%mm2, %%mm4\n"
875 " paddw %%mm4, %%mm2\n"
/* Add the offset. */
877 " paddw %%mm6, %%mm2\n"
/* Shift right arithmetically, clamp below at 0, saturate to bytes, store 4. */
879 " psraw %%mm5, %%mm2\n"
880 " pmaxsw %%mm7, %%mm2\n"
881 " packuswb %%mm2, %%mm2\n"
882 " movd %%mm2, 0(%[d])\n"
/* Advance the source pointer past the four positions just processed. */
884 " add $4, %[s1_np7]\n"
/* s1_np7 is both read and written by the asm. */
889 [s1_np7] "+r" (s1_np7),
/* Declares mas8_u8_supersym_mmx as an implementation of mas8_u8_sym_l15, flagged MMX|MMXEXT. */
895 OIL_DEFINE_IMPL_FULL (mas8_u8_supersym_mmx, mas8_u8_sym_l15, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
/*
 * mas12_addc_rshift_decim2_u8_mmx_4: 12-tap MMX multiply-and-sum with added
 * constant and right shift, stepping the source by two bytes per output byte.
 * Products are summed in 32-bit lanes via pmaddwd; ONE output byte per pass.
 *
 *   d          destination bytes (one written per pass of the asm below)
 *   s1_2xnp11  source bytes
 *   s2_12      filter taps (16-bit)
 *   s3_2       s3_2[0] is the additive offset, s3_2[1] the right-shift count
 *   n          count argument; how it is consumed is not visible in this excerpt
 */
899 mas12_addc_rshift_decim2_u8_mmx_4 (uint8_t *d, const uint8_t *s1_2xnp11,
900 const int16_t *s2_12, const int16_t *s3_2, int n)
903 __asm__ __volatile__("\n"
/* mm7 = 0: used to widen bytes to words and as the lower clamp bound. */
904 " pxor %%mm7, %%mm7\n"
/*
 * mm6 = s3_2[0] zero-extended to 32 bits (the offset).
 * NOTE(review): zero-extension mishandles a negative offset - confirm
 * offsets are never negative.
 */
906 " movzwl 0(%[s3_2]), %%ecx\n"
907 " movd %%ecx, %%mm6\n"
/* mm5 = s3_2[1] zero-extended: the shift count. */
909 " movzwl 2(%[s3_2]), %%ecx\n"
910 " movd %%ecx, %%mm5\n"
/* Source bytes 0..3 widened to words, multiplied pairwise with taps 0..3. */
913 " movd 0(%[s1_2xnp11]), %%mm0\n"
914 " punpcklbw %%mm7, %%mm0\n"
915 " pmaddwd 0(%[s2_12]), %%mm0\n"
/* Source bytes 4..7 with taps 4..7, accumulated. */
917 " movd 4(%[s1_2xnp11]), %%mm1\n"
918 " punpcklbw %%mm7, %%mm1\n"
919 " pmaddwd 8(%[s2_12]), %%mm1\n"
920 " paddd %%mm1, %%mm0\n"
/* Source bytes 8..11 with taps 8..11, accumulated. */
922 " movd 8(%[s1_2xnp11]), %%mm1\n"
923 " punpcklbw %%mm7, %%mm1\n"
924 " pmaddwd 16(%[s2_12]), %%mm1\n"
925 " paddd %%mm1, %%mm0\n"
/* Fold the high dword onto the low dword, then add the offset. */
927 " movq %%mm0, %%mm1\n"
928 " psrlq $32, %%mm0\n"
929 " paddd %%mm1, %%mm0\n"
930 " paddd %%mm6, %%mm0\n"
/*
 * 32-bit arithmetic shift, then clamp and saturate via 16-bit operations.
 * NOTE(review): the clamp is exact only when the shifted value fits in a
 * signed 16-bit word.
 */
932 " psrad %%mm5, %%mm0\n"
933 " pmaxsw %%mm7, %%mm0\n"
934 " packuswb %%mm0, %%mm0\n"
/* Store just the lowest byte of the result. */
935 " movd %%mm0, %%ecx\n"
936 " movb %%cl,0(%[d])\n"
/* Advance the source pointer by two bytes per output. */
939 " add $2, %[s1_2xnp11]\n"
/* s1_2xnp11 is read and written by the asm; s2_12 is an input register. */
944 [s1_2xnp11] "+r" (s1_2xnp11),
946 : [s2_12] "r" (s2_12),
/* Declares this function as an implementation of mas12_addc_rshift_decim2_u8, flagged MMX|MMXEXT. */
950 OIL_DEFINE_IMPL_FULL (mas12_addc_rshift_decim2_u8_mmx_4,
951 mas12_addc_rshift_decim2_u8, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
/*
 * mas8_addc_rshift_decim2_u8_mmx_4: 8-tap MMX multiply-and-sum with added
 * constant and right shift, stepping the source by two bytes per output byte.
 * Products are summed in 32-bit lanes via pmaddwd; ONE output byte per pass.
 *
 *   d         destination bytes (one written per pass of the asm below)
 *   s1_2xnp9  source bytes
 *   s2_8      filter taps (16-bit)
 *   s3_2      s3_2[0] is the additive offset, s3_2[1] the right-shift count
 *   n         count argument; how it is consumed is not visible in this excerpt
 */
955 mas8_addc_rshift_decim2_u8_mmx_4 (uint8_t *d, const uint8_t *s1_2xnp9,
956 const int16_t *s2_8, const int16_t *s3_2, int n)
959 __asm__ __volatile__("\n"
/* mm7 = 0: used to widen bytes to words and as the lower clamp bound. */
960 " pxor %%mm7, %%mm7\n"
/*
 * mm6 = s3_2[0] zero-extended to 32 bits (the offset).
 * NOTE(review): zero-extension mishandles a negative offset - confirm
 * offsets are never negative.
 */
962 " movzwl 0(%[s3_2]), %%ecx\n"
963 " movd %%ecx, %%mm6\n"
/* mm5 = s3_2[1] zero-extended: the shift count. */
965 " movzwl 2(%[s3_2]), %%ecx\n"
966 " movd %%ecx, %%mm5\n"
/* Source bytes 0..3 widened to words, multiplied pairwise with taps 0..3. */
969 " movd 0(%[s1_2xnp9]), %%mm0\n"
970 " punpcklbw %%mm7, %%mm0\n"
971 " pmaddwd 0(%[s2_8]), %%mm0\n"
/* Source bytes 4..7 with taps 4..7, accumulated. */
973 " movd 4(%[s1_2xnp9]), %%mm1\n"
974 " punpcklbw %%mm7, %%mm1\n"
975 " pmaddwd 8(%[s2_8]), %%mm1\n"
976 " paddd %%mm1, %%mm0\n"
/* Fold the high dword onto the low dword, then add the offset. */
978 " movq %%mm0, %%mm1\n"
979 " psrlq $32, %%mm0\n"
980 " paddd %%mm1, %%mm0\n"
981 " paddd %%mm6, %%mm0\n"
/*
 * 32-bit arithmetic shift, then clamp and saturate via 16-bit operations.
 * NOTE(review): the clamp is exact only when the shifted value fits in a
 * signed 16-bit word.
 */
983 " psrad %%mm5, %%mm0\n"
984 " pmaxsw %%mm7, %%mm0\n"
985 " packuswb %%mm0, %%mm0\n"
/* Store just the lowest byte of the result. */
986 " movd %%mm0, %%ecx\n"
987 " movb %%cl,0(%[d])\n"
/* Advance the source pointer by two bytes per output. */
990 " add $2, %[s1_2xnp9]\n"
/* s1_2xnp9 is both read and written by the asm. */
995 [s1_2xnp9] "+r" (s1_2xnp9),
/* Declares this function as an implementation of mas8_addc_rshift_decim2_u8, flagged MMX|MMXEXT. */
1001 OIL_DEFINE_IMPL_FULL (mas8_addc_rshift_decim2_u8_mmx_4,
1002 mas8_addc_rshift_decim2_u8, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);
/*
 * mas8_across_u8_mmx_3: 8-tap MMX multiply-and-sum taken ACROSS rows: tap i
 * is applied to the bytes found i*ss1 bytes after the start of s1_nx8.
 * Four output bytes are written per pass of the second asm statement.
 * Per-output formula (from the scalar statements):
 *   CLAMP((sum of OIL_GET(s1_nx8, i*ss1, uint8_t)*s2_8[i] + s3_2[0]) >> s3_2[1], 0, 255)
 *
 *   d       destination bytes
 *   s1_nx8  first source row
 *   ss1     byte stride between successive source rows
 *   s2_8    filter taps (16-bit)
 *   s3_2    s3_2[0] is the additive offset, s3_2[1] the right-shift count
 *   n       count argument; how it is consumed is not visible in this excerpt
 *
 * NOTE(review): the second asm statement relies on mm3..mm7 as set by the
 * first one, and accumulation is in 16-bit lanes (pmullw/paddw).
 */
1007 mas8_across_u8_mmx_3 (uint8_t *d, const uint8_t *s1_nx8, int ss1,
1008 const int16_t *s2_8, const int16_t *s3_2, int n)
/* Scalar form: accumulate tap products down the rows, then offset, shift, clamp. */
1016 x += OIL_GET(s1_nx8, i*ss1, uint8_t)*s2_8[i];
1018 *d = CLAMP((x + s3_2[0])>>s3_2[1],0,255);
/* First asm statement: load the constants. */
1026 __asm__ __volatile__("\n"
/* mm7 = 0: used to widen bytes to words and as the lower clamp bound. */
1027 " pxor %%mm7, %%mm7\n"
/* mm6 = 32 bits read at s3_2; its low word is s3_2[0], the offset. */
1029 " movd (%[s3_2]), %%mm6\n"
/* mm5 = s3_2[1] zero-extended: the shift count. */
1031 " movzwl 2(%[s3_2]), %%ecx\n"
1032 " movd %%ecx, %%mm5\n"
/* mm3 = taps 0..3, mm4 = taps 4..7. */
1034 " movq 0(%[s2_8]), %%mm3\n"
1035 " movq 8(%[s2_8]), %%mm4\n"
1037 : [s2_8] "r" (s2_8),
/* p walks down the rows starting from the first one. */
1042 const uint8_t *p = s1_nx8;
/* Second asm statement: the filtering work itself. */
1043 __asm__ __volatile__("\n"
/* Seed the accumulator mm2 with the offset broadcast to all four words. */
1046 " pshufw $0x00, %%mm6, %%mm2\n"
/* Row 0: load four bytes, step p to the next row, multiply by tap 0. */
1048 " movd 0(%[p]), %%mm0\n"
1049 " add %[ss1], %[p]\n"
1050 " punpcklbw %%mm7, %%mm0\n"
1051 " pshufw $0x00, %%mm3, %%mm1\n"
1052 " pmullw %%mm1, %%mm0\n"
1053 " paddw %%mm0, %%mm2\n"
/* Row 1: tap 1 (word 1 of mm3). */
1055 " movd 0(%[p]), %%mm0\n"
1056 " add %[ss1], %[p]\n"
1057 " punpcklbw %%mm7, %%mm0\n"
1058 " pshufw $0x55*1, %%mm3, %%mm1\n"
1059 " pmullw %%mm1, %%mm0\n"
1060 " paddw %%mm0, %%mm2\n"
/* Row 2: tap 2 (word 2 of mm3). */
1062 " movd 0(%[p]), %%mm0\n"
1063 " add %[ss1], %[p]\n"
1064 " punpcklbw %%mm7, %%mm0\n"
1065 " pshufw $0x55*2, %%mm3, %%mm1\n"
1066 " pmullw %%mm1, %%mm0\n"
1067 " paddw %%mm0, %%mm2\n"
/* Row 3: tap 3 (word 3 of mm3). */
1069 " movd 0(%[p]), %%mm0\n"
1070 " add %[ss1], %[p]\n"
1071 " punpcklbw %%mm7, %%mm0\n"
1072 " pshufw $0x55*3, %%mm3, %%mm1\n"
1073 " pmullw %%mm1, %%mm0\n"
1074 " paddw %%mm0, %%mm2\n"
/* Row 4: tap 4 (word 0 of mm4). */
1076 " movd 0(%[p]), %%mm0\n"
1077 " add %[ss1], %[p]\n"
1078 " punpcklbw %%mm7, %%mm0\n"
1079 " pshufw $0x00, %%mm4, %%mm1\n"
1080 " pmullw %%mm1, %%mm0\n"
1081 " paddw %%mm0, %%mm2\n"
/* Row 5: tap 5 (word 1 of mm4). */
1083 " movd 0(%[p]), %%mm0\n"
1084 " add %[ss1], %[p]\n"
1085 " punpcklbw %%mm7, %%mm0\n"
1086 " pshufw $0x55*1, %%mm4, %%mm1\n"
1087 " pmullw %%mm1, %%mm0\n"
1088 " paddw %%mm0, %%mm2\n"
/* Row 6: tap 6 (word 2 of mm4). */
1090 " movd 0(%[p]), %%mm0\n"
1091 " add %[ss1], %[p]\n"
1092 " punpcklbw %%mm7, %%mm0\n"
1093 " pshufw $0x55*2, %%mm4, %%mm1\n"
1094 " pmullw %%mm1, %%mm0\n"
1095 " paddw %%mm0, %%mm2\n"
/* Row 7: tap 7 (word 3 of mm4). */
1097 " movd 0(%[p]), %%mm0\n"
1098 " add %[ss1], %[p]\n"
1099 " punpcklbw %%mm7, %%mm0\n"
1100 " pshufw $0x55*3, %%mm4, %%mm1\n"
1101 " pmullw %%mm1, %%mm0\n"
1102 " paddw %%mm0, %%mm2\n"
/* Shift right arithmetically, clamp below at 0, saturate to bytes, store 4. */
1104 " psraw %%mm5, %%mm2\n"
1105 " pmaxsw %%mm7, %%mm2\n"
1106 " packuswb %%mm2, %%mm2\n"
1107 " movd %%mm2, 0(%[d])\n"
/* ss1 is cast to long before being handed to the asm, where it is added to p. */
1109 : [d] "r" (d), [ss1] "r" ((long)ss1));
/* Leave MMX state so that later x87 floating-point code can run. */
1115 asm volatile ("emms");
/* Declares mas8_across_u8_mmx_3 as an implementation of mas8_across_u8, flagged MMX|MMXEXT. */
1117 OIL_DEFINE_IMPL_FULL (mas8_across_u8_mmx_3, mas8_across_u8, OIL_IMPL_FLAG_MMX|OIL_IMPL_FLAG_MMXEXT);