/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
 *          André Tupinambá (andrelrt@gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */
#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */

#include "pixman-sse2.h"

/* Force inlining; MSVC and GCC spell the attribute differently */
#ifdef _MSC_VER
#define inline __forceinline
#else
# define inline __inline__ __attribute__ ((__always_inline__))
#endif
/* -------------------------------------------------------------------------------------------------
 * Locals
 */
static __m64 xMask0080;
static __m64 xMask00ff;
static __m64 xMask0101;
static __m64 xMaskAlpha;

static __m64 xMask565rgb;
static __m64 xMask565Unpack;

static __m128i Mask0080;
static __m128i Mask00ff;
static __m128i Mask0101;
static __m128i Maskffff;
static __m128i Maskff000000;
static __m128i MaskAlpha;

static __m128i Mask565r;
static __m128i Mask565g1, Mask565g2;
static __m128i Mask565b;
static __m128i MaskRed;
static __m128i MaskGreen;
static __m128i MaskBlue;

static __m128i Mask565FixRB;
static __m128i Mask565FixG;
/* -------------------------------------------------------------------------------------------------
 * SSE2 Inlines
 */
static inline __m128i
unpack_32_1x128 (uint32_t data)
{
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
}

static inline void
unpack_128_2x128 (__m128i data, __m128i* dataLo, __m128i* dataHi)
{
    *dataLo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    *dataHi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}

static inline __m128i
unpack565to8888 (__m128i lo)
{
    __m128i r, g, b, rb, t;

    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), MaskRed);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), MaskGreen);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), MaskBlue);

    rb = _mm_or_si128 (r, b);
    t  = _mm_and_si128 (rb, Mask565FixRB);
    t  = _mm_srli_epi32 (t, 5);
    rb = _mm_or_si128 (rb, t);

    t = _mm_and_si128 (g, Mask565FixG);
    t = _mm_srli_epi32 (t, 6);
    g = _mm_or_si128 (g, t);

    return _mm_or_si128 (rb, g);
}
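
/* Worked example for the fix-up above: a 5-bit channel value 0x1f lands in
 * the top bits of its 8-bit lane (0xf8); OR-ing in its top three bits
 * shifted down (0x07) replicates them into the low bits, giving 0xff.  The
 * 6-bit green case works the same with a 2-bit fix-up, so 565-white expands
 * to full 8888-white rather than 0xf8fcf8.
 */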
static inline void
unpack565_128_4x128 (__m128i data, __m128i* data0, __m128i* data1, __m128i* data2, __m128i* data3)
{
    __m128i lo, hi;

    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

    lo = unpack565to8888 (lo);
    hi = unpack565to8888 (hi);

    unpack_128_2x128 (lo, data0, data1);
    unpack_128_2x128 (hi, data2, data3);
}

static inline uint16_t
pack565_32_16 (uint32_t pixel)
{
    return (uint16_t) (((pixel >> 8) & 0xf800) | ((pixel >> 5) & 0x07e0) | ((pixel >> 3) & 0x001f));
}
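
/* Example: 0x00ff0000 (pure red in x8r8g8b8) packs to 0xf800, 0x0000ff00 to
 * 0x07e0 and 0x000000ff to 0x001f; the low bits of each channel are simply
 * truncated.
 */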
static inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    return _mm_packus_epi16 (lo, hi);
}

static inline __m128i
pack565_2x128_128 (__m128i lo, __m128i hi)
{
    __m128i data;
    __m128i r, g1, g2, b;

    data = pack_2x128_128 (lo, hi);

    r  = _mm_and_si128 (data, Mask565r);
    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), Mask565g1);
    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), Mask565g2);
    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), Mask565b);

    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}

static inline __m128i
pack565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
{
    return _mm_packus_epi16 (pack565_2x128_128 (*xmm0, *xmm1),
                             pack565_2x128_128 (*xmm2, *xmm3));
}

static inline uint32_t
packAlpha (__m128i x)
{
    return _mm_cvtsi128_si32 (_mm_packus_epi16 (_mm_packus_epi16 (_mm_srli_epi32 (x, 24),
                                                                  _mm_setzero_si128 ()),
                                                _mm_setzero_si128 ()));
}

static inline __m128i
expandPixel_32_1x128 (uint32_t data)
{
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE(1, 0, 1, 0));
}
static inline __m128i
expandAlpha_1x128 (__m128i data)
{
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data, _MM_SHUFFLE(3, 3, 3, 3)),
                                _MM_SHUFFLE(3, 3, 3, 3));
}

static inline void
expandAlpha_2x128 (__m128i dataLo, __m128i dataHi, __m128i* alphaLo, __m128i* alphaHi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(3, 3, 3, 3));
    hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(3, 3, 3, 3));
    *alphaLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(3, 3, 3, 3));
    *alphaHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(3, 3, 3, 3));
}

static inline void
expandAlphaRev_2x128 (__m128i dataLo, __m128i dataHi, __m128i* alphaLo, __m128i* alphaHi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(0, 0, 0, 0));
    hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(0, 0, 0, 0));
    *alphaLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(0, 0, 0, 0));
    *alphaHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(0, 0, 0, 0));
}
static inline void
pixMultiply_2x128 (__m128i* dataLo, __m128i* dataHi, __m128i* alphaLo, __m128i* alphaHi, __m128i* retLo, __m128i* retHi)
{
    __m128i lo, hi;

    lo = _mm_mullo_epi16 (*dataLo, *alphaLo);
    hi = _mm_mullo_epi16 (*dataHi, *alphaHi);
    lo = _mm_adds_epu16 (lo, Mask0080);
    hi = _mm_adds_epu16 (hi, Mask0080);
    *retLo = _mm_mulhi_epu16 (lo, Mask0101);
    *retHi = _mm_mulhi_epu16 (hi, Mask0101);
}
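
/* The three steps above compute x*a/255 per 16-bit lane without a division:
 * t = x*a + 0x80, then mulhi (t, 0x0101) = (t + (t >> 8)) >> 8, the classic
 * round-to-nearest byte multiply.  E.g. 0xff * 0xff gives t = 0xfe81, and
 * (0xfe81 * 0x0101) >> 16 = 0xff as expected.
 */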
static inline void
pixAddMultiply_2x128 (__m128i* srcLo, __m128i* srcHi, __m128i* alphaDstLo, __m128i* alphaDstHi,
                      __m128i* dstLo, __m128i* dstHi, __m128i* alphaSrcLo, __m128i* alphaSrcHi,
                      __m128i* retLo, __m128i* retHi)
{
    __m128i lo, hi;
    __m128i mulLo, mulHi;

    lo = _mm_mullo_epi16 (*srcLo, *alphaDstLo);
    hi = _mm_mullo_epi16 (*srcHi, *alphaDstHi);
    mulLo = _mm_mullo_epi16 (*dstLo, *alphaSrcLo);
    mulHi = _mm_mullo_epi16 (*dstHi, *alphaSrcHi);
    lo = _mm_adds_epu16 (lo, Mask0080);
    hi = _mm_adds_epu16 (hi, Mask0080);
    lo = _mm_adds_epu16 (lo, mulLo);
    hi = _mm_adds_epu16 (hi, mulHi);
    *retLo = _mm_mulhi_epu16 (lo, Mask0101);
    *retHi = _mm_mulhi_epu16 (hi, Mask0101);
}
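
/* Per 16-bit lane this computes src*alphaDst + dst*alphaSrc + 0x80 with
 * saturating adds, then divides by 255 via the mulhi-0x0101 trick above.
 * It is the building block for the ATOP/XOR style operators further down.
 */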
static inline void
negate_2x128 (__m128i dataLo, __m128i dataHi, __m128i* negLo, __m128i* negHi)
{
    *negLo = _mm_xor_si128 (dataLo, Mask00ff);
    *negHi = _mm_xor_si128 (dataHi, Mask00ff);
}

static inline void
invertColors_2x128 (__m128i dataLo, __m128i dataHi, __m128i* invLo, __m128i* invHi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(3, 0, 1, 2));
    hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(3, 0, 1, 2));
    *invLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(3, 0, 1, 2));
    *invHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(3, 0, 1, 2));
}
static inline void
over_2x128 (__m128i* srcLo, __m128i* srcHi, __m128i* alphaLo, __m128i* alphaHi, __m128i* dstLo, __m128i* dstHi)
{
    __m128i t1, t2;

    negate_2x128 (*alphaLo, *alphaHi, &t1, &t2);

    pixMultiply_2x128 (dstLo, dstHi, &t1, &t2, dstLo, dstHi);

    *dstLo = _mm_adds_epu8 (*srcLo, *dstLo);
    *dstHi = _mm_adds_epu8 (*srcHi, *dstHi);
}

static inline void
overRevNonPre_2x128 (__m128i srcLo, __m128i srcHi, __m128i* dstLo, __m128i* dstHi)
{
    __m128i lo, hi;
    __m128i alphaLo, alphaHi;

    expandAlpha_2x128 (srcLo, srcHi, &alphaLo, &alphaHi);

    lo = _mm_or_si128 (alphaLo, MaskAlpha);
    hi = _mm_or_si128 (alphaHi, MaskAlpha);

    invertColors_2x128 (srcLo, srcHi, &srcLo, &srcHi);

    pixMultiply_2x128 (&srcLo, &srcHi, &lo, &hi, &lo, &hi);

    over_2x128 (&lo, &hi, &alphaLo, &alphaHi, dstLo, dstHi);
}
static inline void
inOver_2x128 (__m128i* srcLo, __m128i* srcHi, __m128i* alphaLo, __m128i* alphaHi,
              __m128i* maskLo, __m128i* maskHi, __m128i* dstLo, __m128i* dstHi)
{
    __m128i sLo, sHi;
    __m128i aLo, aHi;

    pixMultiply_2x128 (srcLo, srcHi, maskLo, maskHi, &sLo, &sHi);
    pixMultiply_2x128 (alphaLo, alphaHi, maskLo, maskHi, &aLo, &aHi);

    over_2x128 (&sLo, &sHi, &aLo, &aHi, dstLo, dstHi);
}
static inline void
cachePrefetch (__m128i* addr)
{
    _mm_prefetch (addr, _MM_HINT_T0);
}

static inline void
cachePrefetchNext (__m128i* addr)
{
    _mm_prefetch (addr + 4, _MM_HINT_T0); /* 4 x 16 bytes = 64 bytes ahead */
}
/* load 4 pixels from a 16-byte aligned address */
static inline __m128i
load128Aligned (__m128i* src)
{
    return _mm_load_si128 (src);
}

/* load 4 pixels from an unaligned address */
static inline __m128i
load128Unaligned (__m128i* src)
{
    return _mm_loadu_si128 (src);
}

/* save 4 pixels using Write Combining memory on a 16-byte aligned address */
static inline void
save128WriteCombining (__m128i* dst, __m128i data)
{
    _mm_stream_si128 (dst, data);
}

/* save 4 pixels on a 16-byte aligned address */
static inline void
save128Aligned (__m128i* dst, __m128i data)
{
    _mm_store_si128 (dst, data);
}

/* save 4 pixels on an unaligned address */
static inline void
save128Unaligned (__m128i* dst, __m128i data)
{
    _mm_storeu_si128 (dst, data);
}
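
/* Note on the store flavors: _mm_stream_si128 is a non-temporal
 * (write-combining) store that bypasses the cache, which pays off for large
 * fills that will not be read back soon, while _mm_store_si128 is the better
 * choice when the destination stays hot in cache.  Both require 16-byte
 * alignment; _mm_storeu_si128 trades speed for no alignment requirement.
 */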
/* -------------------------------------------------------------------------------------------------
 * MMX inlines
 */
static inline __m64
unpack_32_1x64 (uint32_t data)
{
    return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64 ());
}

static inline __m64
expandAlpha_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE(3, 3, 3, 3));
}

static inline __m64
expandAlphaRev_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE(0, 0, 0, 0));
}

static inline __m64
expandPixel_8_1x64 (uint8_t data)
{
    return _mm_shuffle_pi16 (unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE(0, 0, 0, 0));
}

static inline __m64
pixMultiply_1x64 (__m64 data, __m64 alpha)
{
    return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
                                          xMask0080),
                           xMask0101);
}

static inline __m64
pixAddMultiply_1x64 (__m64* src, __m64* alphaDst, __m64* dst, __m64* alphaSrc)
{
    return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (*src, *alphaDst),
                                                         xMask0080),
                                          _mm_mullo_pi16 (*dst, *alphaSrc)),
                           xMask0101);
}

static inline __m64
negate_1x64 (__m64 data)
{
    return _mm_xor_si64 (data, xMask00ff);
}

static inline __m64
invertColors_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE(3, 0, 1, 2));
}

static inline __m64
over_1x64 (__m64 src, __m64 alpha, __m64 dst)
{
    return _mm_adds_pu8 (src, pixMultiply_1x64 (dst, negate_1x64 (alpha)));
}

static inline __m64
inOver_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
{
    return over_1x64 (pixMultiply_1x64 (*src, *mask),
                      pixMultiply_1x64 (*alpha, *mask),
                      *dst);
}

static inline __m64
overRevNonPre_1x64 (__m64 src, __m64 dst)
{
    __m64 alpha = expandAlpha_1x64 (src);

    return over_1x64 (pixMultiply_1x64 (invertColors_1x64 (src),
                                        _mm_or_si64 (alpha, xMaskAlpha)),
                      alpha,
                      dst);
}

static inline uint32_t
pack_1x64_32 (__m64 data)
{
    return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
}
/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
 * 00RR00GG00BB
 *
 * --- Expanding 565 in the low word ---
 *
 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
 * m = m & (01f0003f001f);
 * m = m * (008404100840);
 * m = m >> 8;
 *
 * Note the trick here - the top word is shifted by another nibble to
 * avoid it bumping into the middle word
 */
static inline __m64
expand565_16_1x64 (uint16_t pixel)
{
    __m64 p;
    __m64 t1, t2;

    p = _mm_cvtsi32_si64 ((uint32_t) pixel);

    t1 = _mm_slli_si64 (p, 36 - 11);
    t2 = _mm_slli_si64 (p, 16 - 5);

    p = _mm_or_si64 (t1, p);
    p = _mm_or_si64 (t2, p);
    p = _mm_and_si64 (p, xMask565rgb);
    p = _mm_mullo_pi16 (p, xMask565Unpack);

    return _mm_srli_pi16 (p, 8);
}
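
/* Worked example of the multiply trick: the 5-bit red value r, isolated at
 * bits 4-8 of its 16-bit word by the mask, times 0x0084 equals
 * (r << 11) | (r << 6), i.e. the channel duplicated at two offsets; the
 * final >> 8 leaves (r << 3) | (r >> 2), exactly the replicate-top-bits
 * expansion used by unpack565to8888 above.
 */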
/* -------------------------------------------------------------------------------------------------
 * Compose Core transformations
 */
static inline uint32_t
coreCombineOverUPixelsse2 (uint32_t src, uint32_t dst)
{
    __m64 ms;

    ms = unpack_32_1x64 (src);
    return pack_1x64_32 (over_1x64 (ms, expandAlpha_1x64 (ms), unpack_32_1x64 (dst)));
}
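
/* Scalar Porter-Duff OVER fallback used for the unaligned head/tail pixels:
 * result = src + dst * (255 - srcAlpha) / 255, with everything done in the
 * 16-bit-per-channel domain.
 */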
static inline void
coreCombineOverUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    uint32_t s, d;
    uint32_t pa;

    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmAlphaLo, xmmAlphaHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineOverUPixelsse2 (s, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        /* I'm loading unaligned because I'm not sure about the address alignment. */
        xmmSrcHi = load128Unaligned ((__m128i*) ps);

        /* Check the alpha channel */
        pa = packAlpha (xmmSrcHi);

        if (pa == 0xffffffff)
        {
            save128Aligned ((__m128i*)pd, xmmSrcHi);
        }
        else
        {
            xmmDstHi = load128Aligned ((__m128i*) pd);

            unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
            unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

            expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);

            over_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);

            /* rebuild the 4 pixel data and save */
            save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
        }

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineOverUPixelsse2 (s, d);
        w--;
    }
}
static inline void
coreCombineOverReverseUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    uint32_t s, d;

    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmAlphaLo, xmmAlphaHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineOverUPixelsse2 (d, s);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        /* I'm loading unaligned because I'm not sure about the address alignment. */
        xmmSrcHi = load128Unaligned ((__m128i*) ps);
        xmmDstHi = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);

        over_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmSrcLo, &xmmSrcHi);

        /* rebuild the 4 pixel data and save */
        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmSrcLo, xmmSrcHi));

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineOverUPixelsse2 (d, s);
        w--;
    }
}
static inline uint32_t
coreCombineInUPixelsse2 (uint32_t src, uint32_t dst)
{
    uint32_t maska = src >> 24;

    if (maska == 0)
    {
        return 0;
    }
    else if (maska != 0xff)
    {
        return pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (dst),
                                               expandAlpha_1x64 (unpack_32_1x64 (src))));
    }

    return dst;
}
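
/* Scalar IN fallback: result = dst_param * alpha(src_param) / 255.  Note the
 * callers below pass (d, s) or (s, d) to get IN and IN_REVERSE respectively
 * out of the same helper.
 */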
static inline void
coreCombineInUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    uint32_t s, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && ((unsigned long) pd & 15))
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineInUPixelsse2 (d, s);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmDstHi = load128Aligned ((__m128i*) pd);
        xmmSrcHi = load128Unaligned ((__m128i*) ps);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineInUPixelsse2 (d, s);
        w--;
    }
}
static inline void
coreCombineReverseInUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    uint32_t s, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && ((unsigned long) pd & 15))
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineInUPixelsse2 (s, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmDstHi = load128Aligned ((__m128i*) pd);
        xmmSrcHi = load128Unaligned ((__m128i*) ps);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineInUPixelsse2 (s, d);
        w--;
    }
}
static inline void
coreCombineReverseOutUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && ((unsigned long) pd & 15))
    {
        uint32_t s = *ps++;
        uint32_t d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
                                                negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (s)))));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        __m128i xmmSrcLo, xmmSrcHi;
        __m128i xmmDstLo, xmmDstHi;

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmSrcHi = load128Unaligned ((__m128i*) ps);
        xmmDstHi = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        negate_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);

        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        uint32_t s = *ps++;
        uint32_t d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
                                                negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (s)))));
        w--;
    }
}
static inline void
coreCombineOutUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && ((unsigned long) pd & 15))
    {
        uint32_t s = *ps++;
        uint32_t d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s),
                                                negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        __m128i xmmSrcLo, xmmSrcHi;
        __m128i xmmDstLo, xmmDstHi;

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmSrcHi = load128Unaligned ((__m128i*) ps);
        xmmDstHi = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);
        negate_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        uint32_t s = *ps++;
        uint32_t d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s),
                                                negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
        w--;
    }
}
static inline uint32_t
coreCombineAtopUPixelsse2 (uint32_t src, uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 sa = negate_1x64 (expandAlpha_1x64 (s));
    __m64 da = expandAlpha_1x64 (d);

    return pack_1x64_32 (pixAddMultiply_1x64 (&s, &da, &d, &sa));
}
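
/* Scalar ATOP fallback:
 * result = src * dstAlpha / 255 + dst * (255 - srcAlpha) / 255,
 * evaluated in one pass by pixAddMultiply_1x64.
 */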
static inline void
coreCombineAtopUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    uint32_t s, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && ((unsigned long) pd & 15))
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineAtopUPixelsse2 (s, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmSrcHi = load128Unaligned ((__m128i*) ps);
        xmmDstHi = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        negate_2x128 (xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);

        pixAddMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
                              &xmmDstLo, &xmmDstHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi,
                              &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineAtopUPixelsse2 (s, d);
        w--;
    }
}
static inline uint32_t
coreCombineReverseAtopUPixelsse2 (uint32_t src, uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 sa = expandAlpha_1x64 (s);
    __m64 da = negate_1x64 (expandAlpha_1x64 (d));

    return pack_1x64_32 (pixAddMultiply_1x64 (&s, &da, &d, &sa));
}
static inline void
coreCombineReverseAtopUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    uint32_t s, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && ((unsigned long) pd & 15))
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineReverseAtopUPixelsse2 (s, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmSrcHi = load128Unaligned ((__m128i*) ps);
        xmmDstHi = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixAddMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
                              &xmmDstLo, &xmmDstHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi,
                              &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineReverseAtopUPixelsse2 (s, d);
        w--;
    }
}
static inline uint32_t
coreCombineXorUPixelsse2 (uint32_t src, uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 negD = negate_1x64 (expandAlpha_1x64 (d));
    __m64 negS = negate_1x64 (expandAlpha_1x64 (s));

    return pack_1x64_32 (pixAddMultiply_1x64 (&s, &negD, &d, &negS));
}
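
/* Scalar XOR fallback:
 * result = src * (255 - dstAlpha) / 255 + dst * (255 - srcAlpha) / 255.
 */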
static inline void
coreCombineXorUsse2 (uint32_t* dst, const uint32_t* src, int width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;

    __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
    __m128i xmmDst, xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && ((unsigned long) pd & 15))
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineXorUPixelsse2 (s, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmSrc = load128Unaligned ((__m128i*) ps);
        xmmDst = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        negate_2x128 (xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixAddMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
                              &xmmDstLo, &xmmDstHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi,
                              &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineXorUPixelsse2 (s, d);
        w--;
    }
}
static inline void
coreCombineAddUsse2 (uint32_t* dst, const uint32_t* src, int width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        d = *pd;

        *pd++ = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        save128Aligned ((__m128i*)pd,
                        _mm_adds_epu8 (load128Unaligned ((__m128i*)ps),
                                       load128Aligned ((__m128i*)pd)));

        pd += 4;
        ps += 4;
        w -= 4;
    }

    while (w--)
    {
        s = *ps++;
        d = *pd;

        *pd++ = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
    }
}
static inline uint32_t
coreCombineSaturateUPixelsse2 (uint32_t src, uint32_t dst)
{
    __m64 ms = unpack_32_1x64 (src);
    __m64 md = unpack_32_1x64 (dst);
    uint32_t sa = src >> 24;
    uint32_t da = ~dst >> 24;

    if (sa > da)
    {
        ms = pixMultiply_1x64 (ms, expandAlpha_1x64 (unpack_32_1x64 (FbIntDiv(da, sa) << 24)));
    }

    return pack_1x64_32 (_mm_adds_pu16 (md, ms));
}
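
/* Scalar SATURATE fallback: when the source alpha exceeds the free space in
 * the destination (~dstAlpha), the source is first scaled by ~dstAlpha/srcAlpha
 * so that the subsequent saturating add cannot overflow a channel.
 */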
static inline void
coreCombineSaturateUsse2 (uint32_t *pd, const uint32_t *ps, int w)
{
    uint32_t s, d;

    uint32_t packCmp;
    __m128i xmmSrc, xmmDst;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineSaturateUPixelsse2 (s, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmDst = load128Aligned ((__m128i*)pd);
        xmmSrc = load128Unaligned ((__m128i*)ps);

        packCmp = _mm_movemask_epi8 (_mm_cmpgt_epi32 (_mm_srli_epi32 (xmmSrc, 24),
                                                      _mm_srli_epi32 (_mm_xor_si128 (xmmDst, Maskff000000), 24)));

        /* if some source alpha is greater than the respective ~dst alpha */
        if (packCmp)
        {
            s = *ps++;
            d = *pd;
            *pd++ = coreCombineSaturateUPixelsse2 (s, d);

            s = *ps++;
            d = *pd;
            *pd++ = coreCombineSaturateUPixelsse2 (s, d);

            s = *ps++;
            d = *pd;
            *pd++ = coreCombineSaturateUPixelsse2 (s, d);

            s = *ps++;
            d = *pd;
            *pd++ = coreCombineSaturateUPixelsse2 (s, d);
        }
        else
        {
            save128Aligned ((__m128i*)pd, _mm_adds_epu8 (xmmDst, xmmSrc));

            pd += 4;
            ps += 4;
        }

        w -= 4;
    }

    while (w--)
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineSaturateUPixelsse2 (s, d);
    }
}
static inline void
coreCombineSrcCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
{
    uint32_t s, m;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmMaskLo, xmmMaskHi;
    __m128i xmmDstLo, xmmDstHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
        w--;
    }
}
static inline uint32_t
coreCombineOverCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 expAlpha = expandAlpha_1x64 (s);
    __m64 unpkMask = unpack_32_1x64 (mask);
    __m64 unpkDst = unpack_32_1x64 (dst);

    return pack_1x64_32 (inOver_1x64 (&s, &expAlpha, &unpkMask, &unpkDst));
}
static inline void
coreCombineOverCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineOverCPixelsse2 (s, m, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);

        inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineOverCPixelsse2 (s, m, d);
        w--;
    }
}
static inline uint32_t
coreCombineOverReverseCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
{
    __m64 d = unpack_32_1x64 (dst);

    return pack_1x64_32 (over_1x64 (d, expandAlpha_1x64 (d),
                                    pixMultiply_1x64 (unpack_32_1x64 (src), unpack_32_1x64 (mask))));
}
static inline void
coreCombineOverReverseCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineOverReverseCPixelsse2 (s, m, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        over_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMaskLo, &xmmMaskHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmMaskLo, xmmMaskHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineOverReverseCPixelsse2 (s, m, d);
        w--;
    }
}
static inline void
coreCombineInCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
                                                expandAlpha_1x64 (unpack_32_1x64 (d))));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);

        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
                                                expandAlpha_1x64 (unpack_32_1x64 (d))));
        w--;
    }
}
static inline void
coreCombineInReverseCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
                                                pixMultiply_1x64 (unpack_32_1x64 (m),
                                                                  expandAlpha_1x64 (unpack_32_1x64 (s)))));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
        pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaLo, &xmmAlphaHi, &xmmAlphaLo, &xmmAlphaHi);

        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
                                                pixMultiply_1x64 (unpack_32_1x64 (m),
                                                                  expandAlpha_1x64 (unpack_32_1x64 (s)))));
        w--;
    }
}
static inline void
coreCombineOutCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
                                                negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
        negate_2x128 (xmmAlphaLo, xmmAlphaHi, &xmmAlphaLo, &xmmAlphaHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);
        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
                                                negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
        w--;
    }
}
static inline void
coreCombineOutReverseCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
                                                negate_1x64 (pixMultiply_1x64 (unpack_32_1x64 (m),
                                                                               expandAlpha_1x64 (unpack_32_1x64 (s))))));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);

        pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMaskLo, &xmmMaskHi);

        negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
                                                negate_1x64 (pixMultiply_1x64 (unpack_32_1x64 (m),
                                                                               expandAlpha_1x64 (unpack_32_1x64 (s))))));
        w--;
    }
}
static inline uint32_t
coreCombineAtopCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
{
    __m64 m = unpack_32_1x64 (mask);
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);
    __m64 sa = expandAlpha_1x64 (s);
    __m64 da = expandAlpha_1x64 (d);

    s = pixMultiply_1x64 (s, m);
    m = negate_1x64 (pixMultiply_1x64 (m, sa));

    return pack_1x64_32 (pixAddMultiply_1x64 (&d, &m, &s, &da));
}
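
/* Component-alpha ATOP: each channel uses its own mask value, so the
 * effective source is s*m and the effective source alpha is m*sa; the
 * result is d * (255 - m*sa) / 255 + (s*m) * da / 255.
 */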
static inline void
coreCombineAtopCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineAtopCPixelsse2 (s, m, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
        pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi);

        negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        pixAddMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi,
                              &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
                              &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineAtopCPixelsse2 (s, m, d);
        w--;
    }
}
static inline uint32_t
coreCombineReverseAtopCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
{
    __m64 m = unpack_32_1x64 (mask);
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 da = negate_1x64 (expandAlpha_1x64 (d));
    __m64 sa = expandAlpha_1x64 (s);

    s = pixMultiply_1x64 (s, m);
    m = pixMultiply_1x64 (m, sa);

    return pack_1x64_32 (pixAddMultiply_1x64 (&d, &m, &s, &da));
}
static inline void
coreCombineReverseAtopCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineReverseAtopCPixelsse2 (s, m, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
        pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi);

        negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixAddMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi,
                              &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
                              &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineReverseAtopCPixelsse2 (s, m, d);
        w--;
    }
}
static inline uint32_t
coreCombineXorCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
{
    __m64 a = unpack_32_1x64 (mask);
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 alphaDst = negate_1x64 (pixMultiply_1x64 (a, expandAlpha_1x64 (s)));
    __m64 dest = pixMultiply_1x64 (s, a);
    __m64 alphaSrc = negate_1x64 (expandAlpha_1x64 (d));

    return pack_1x64_32 (pixAddMultiply_1x64 (&d,
                                              &alphaDst,
                                              &dest,
                                              &alphaSrc));
}
static inline void
coreCombineXorCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineXorCPixelsse2 (s, m, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
        pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi);

        negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
        negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        pixAddMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi,
                              &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
                              &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineXorCPixelsse2 (s, m, d);
        w--;
    }
}
static inline void
coreCombineAddCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (_mm_adds_pu8 (pixMultiply_1x64 (unpack_32_1x64 (s),
                                                              unpack_32_1x64 (m)),
                                            unpack_32_1x64 (d)));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);
        xmmDstHi = load128Aligned ((__m128i*)pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (_mm_adds_epu8 (xmmSrcLo, xmmDstLo),
                                                      _mm_adds_epu8 (xmmSrcHi, xmmDstHi)));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (_mm_adds_pu8 (pixMultiply_1x64 (unpack_32_1x64 (s),
                                                              unpack_32_1x64 (m)),
                                            unpack_32_1x64 (d)));
        w--;
    }
}
/* -------------------------------------------------------------------------------------------------
 * fbComposeSetupSSE2
 */

static inline __m64
createMask_16_64 (uint16_t mask)
{
    return _mm_set1_pi16 (mask);
}

static inline __m128i
createMask_16_128 (uint16_t mask)
{
    return _mm_set1_epi16 (mask);
}

static inline __m64
createMask_2x32_64 (uint32_t mask0, uint32_t mask1)
{
    return _mm_set_pi32 (mask0, mask1);
}

static inline __m128i
createMask_2x32_128 (uint32_t mask0, uint32_t mask1)
{
    return _mm_set_epi32 (mask0, mask1, mask0, mask1);
}
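
/* Note the argument order: mask0 fills the more-significant half, so
 * createMask_2x32_128 (hi32, lo32) replicates the 64-bit pattern hi32:lo32
 * into both halves of the SSE2 register, and createMask_2x32_64 builds the
 * same single 64-bit pattern for the MMX helpers.
 */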
/* SSE2 code patch for fbcompose.c */

static FASTCALL void
sse2CombineMaskU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineReverseInUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineOverU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineOverUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineOverReverseU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineOverReverseUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineInU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineInUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineInReverseU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineReverseInUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineOutU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineOutUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineOutReverseU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineReverseOutUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineAtopU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineAtopUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineAtopReverseU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineReverseAtopUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineXorU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineXorUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineAddU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineAddUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineSaturateU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineSaturateUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineSrcC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineSrcCsse2 (dst, src, mask, width);
}

static FASTCALL void
sse2CombineOverC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineOverCsse2 (dst, src, mask, width);
}

static FASTCALL void
sse2CombineOverReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineOverReverseCsse2 (dst, src, mask, width);
}

static FASTCALL void
sse2CombineInC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineInCsse2 (dst, src, mask, width);
}

static FASTCALL void
sse2CombineInReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineInReverseCsse2 (dst, src, mask, width);
}

static FASTCALL void
sse2CombineOutC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineOutCsse2 (dst, src, mask, width);
}

static FASTCALL void
sse2CombineOutReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineOutReverseCsse2 (dst, src, mask, width);
}

static FASTCALL void
sse2CombineAtopC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineAtopCsse2 (dst, src, mask, width);
}

static FASTCALL void
sse2CombineAtopReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineReverseAtopCsse2 (dst, src, mask, width);
}

static FASTCALL void
sse2CombineXorC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineXorCsse2 (dst, src, mask, width);
}

static FASTCALL void
sse2CombineAddC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineAddCsse2 (dst, src, mask, width);
}
void
fbComposeSetupSSE2(void)
{
    static pixman_bool_t initialized = FALSE;

    if (initialized)
        return;

    /* check if we have SSE2 support and initialize accordingly */
    if (pixman_have_sse2())
    {
        /* SSE2 constants */
        Mask565r  = createMask_2x32_128 (0x00f80000, 0x00f80000);
        Mask565g1 = createMask_2x32_128 (0x00070000, 0x00070000);
        Mask565g2 = createMask_2x32_128 (0x000000e0, 0x000000e0);
        Mask565b  = createMask_2x32_128 (0x0000001f, 0x0000001f);
        MaskRed   = createMask_2x32_128 (0x00f80000, 0x00f80000);
        MaskGreen = createMask_2x32_128 (0x0000fc00, 0x0000fc00);
        MaskBlue  = createMask_2x32_128 (0x000000f8, 0x000000f8);
        Mask565FixRB = createMask_2x32_128 (0x00e000e0, 0x00e000e0);
        Mask565FixG  = createMask_2x32_128 (0x0000c000, 0x0000c000);
        Mask0080 = createMask_16_128 (0x0080);
        Mask00ff = createMask_16_128 (0x00ff);
        Mask0101 = createMask_16_128 (0x0101);
        Maskffff = createMask_16_128 (0xffff);
        Maskff000000 = createMask_2x32_128 (0xff000000, 0xff000000);
        MaskAlpha = createMask_2x32_128 (0x00ff0000, 0x00000000);

        /* MMX constants */
        xMask565rgb    = createMask_2x32_64 (0x000001f0, 0x003f001f);
        xMask565Unpack = createMask_2x32_64 (0x00000084, 0x04100840);

        xMask0080 = createMask_16_64 (0x0080);
        xMask00ff = createMask_16_64 (0x00ff);
        xMask0101 = createMask_16_64 (0x0101);
        xMaskAlpha = createMask_2x32_64 (0x00ff0000, 0x00000000);

        /* SSE code patch for fbcompose.c */
        pixman_composeFunctions.combineU[PIXMAN_OP_OVER] = sse2CombineOverU;
        pixman_composeFunctions.combineU[PIXMAN_OP_OVER_REVERSE] = sse2CombineOverReverseU;
        pixman_composeFunctions.combineU[PIXMAN_OP_IN] = sse2CombineInU;
        pixman_composeFunctions.combineU[PIXMAN_OP_IN_REVERSE] = sse2CombineInReverseU;
        pixman_composeFunctions.combineU[PIXMAN_OP_OUT] = sse2CombineOutU;
        pixman_composeFunctions.combineU[PIXMAN_OP_OUT_REVERSE] = sse2CombineOutReverseU;
        pixman_composeFunctions.combineU[PIXMAN_OP_ATOP] = sse2CombineAtopU;
        pixman_composeFunctions.combineU[PIXMAN_OP_ATOP_REVERSE] = sse2CombineAtopReverseU;
        pixman_composeFunctions.combineU[PIXMAN_OP_XOR] = sse2CombineXorU;
        pixman_composeFunctions.combineU[PIXMAN_OP_ADD] = sse2CombineAddU;
        pixman_composeFunctions.combineU[PIXMAN_OP_SATURATE] = sse2CombineSaturateU;

        pixman_composeFunctions.combineC[PIXMAN_OP_SRC] = sse2CombineSrcC;
        pixman_composeFunctions.combineC[PIXMAN_OP_OVER] = sse2CombineOverC;
        pixman_composeFunctions.combineC[PIXMAN_OP_OVER_REVERSE] = sse2CombineOverReverseC;
        pixman_composeFunctions.combineC[PIXMAN_OP_IN] = sse2CombineInC;
        pixman_composeFunctions.combineC[PIXMAN_OP_IN_REVERSE] = sse2CombineInReverseC;
        pixman_composeFunctions.combineC[PIXMAN_OP_OUT] = sse2CombineOutC;
        pixman_composeFunctions.combineC[PIXMAN_OP_OUT_REVERSE] = sse2CombineOutReverseC;
        pixman_composeFunctions.combineC[PIXMAN_OP_ATOP] = sse2CombineAtopC;
        pixman_composeFunctions.combineC[PIXMAN_OP_ATOP_REVERSE] = sse2CombineAtopReverseC;
        pixman_composeFunctions.combineC[PIXMAN_OP_XOR] = sse2CombineXorC;
        pixman_composeFunctions.combineC[PIXMAN_OP_ADD] = sse2CombineAddC;

        pixman_composeFunctions.combineMaskU = sse2CombineMaskU;
    }

    initialized = TRUE;
}
/* -------------------------------------------------------------------------------------------------
 * fbCompositeSolid_nx8888
 */

void
fbCompositeSolid_nx8888sse2 (pixman_op_t op,
                             pixman_image_t * pSrc,
                             pixman_image_t * pMask,
                             pixman_image_t * pDst,
                             int16_t xSrc,
                             int16_t ySrc,
                             int16_t xMask,
                             int16_t yMask,
                             int16_t xDst,
                             int16_t yDst,
                             uint16_t width,
                             uint16_t height)
{
    uint32_t src;
    uint32_t *dstLine, *dst, d;
    int dstStride, w;

    __m128i xmmSrc, xmmAlpha;
    __m128i xmmDst, xmmDstLo, xmmDstHi;

    fbComposeGetSolid(pSrc, src, pDst->bits.format);

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);

    xmmSrc = expandPixel_32_1x128 (src);
    xmmAlpha = expandAlpha_1x128 (xmmSrc);

    while (height--)
    {
        dst = dstLine;

        /* call prefetch hint to optimize cache load */
        cachePrefetch ((__m128i*)dst);

        dstLine += dstStride;
        w = width;

        while (w && (unsigned long)dst & 15)
        {
            d = *dst;
            *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
                                              _mm_movepi64_pi64 (xmmAlpha),
                                              unpack_32_1x64 (d)));
            w--;
        }

        cachePrefetch ((__m128i*)dst);

        while (w >= 4)
        {
            /* fill cache line with next memory */
            cachePrefetchNext ((__m128i*)dst);

            xmmDst = load128Aligned ((__m128i*)dst);

            unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

            over_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmDstLo, &xmmDstHi);

            /* rebuild the 4 pixel data and save */
            save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

            dst += 4;
            w -= 4;
        }

        while (w)
        {
            d = *dst;
            *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
                                              _mm_movepi64_pi64 (xmmAlpha),
                                              unpack_32_1x64 (d)));
            w--;
        }
    }
}
2455 /* -------------------------------------------------------------------------------------------------
2456 * fbCompositeSolid_nx0565
2459 fbCompositeSolid_nx0565sse2 (pixman_op_t op,
2460 pixman_image_t * pSrc,
2461 pixman_image_t * pMask,
2462 pixman_image_t * pDst,
2473 uint16_t *dstLine, *dst, d;
2476 __m128i xmmSrc, xmmAlpha;
2477 __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;
2479 fbComposeGetSolid(pSrc, src, pDst->bits.format);
2484 fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
2486 xmmSrc = expandPixel_32_1x128 (src);
2487 xmmAlpha = expandAlpha_1x128 (xmmSrc);
2493 /* call prefetch hint to optimize cache load*/
2494 cachePrefetch ((__m128i*)dst);
2496 dstLine += dstStride;
2499 while (w && (unsigned long)dst & 15)
2503 *dst++ = pack565_32_16 (pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
2504 _mm_movepi64_pi64 (xmmAlpha),
2505 expand565_16_1x64 (d))));
2509 /* call prefetch hint to optimize cache load*/
2510 cachePrefetch ((__m128i*)dst);
2514 /* fill cache line with next memory */
2515 cachePrefetchNext ((__m128i*)dst);
2517 xmmDst = load128Aligned ((__m128i*)dst);
2519 unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
2521 over_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmDst0, &xmmDst1);
2522 over_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmDst2, &xmmDst3);
2524 xmmDst = pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
2525 save128Aligned ((__m128i*)dst, xmmDst);
2534 *dst++ = pack565_32_16 (pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
2535 _mm_movepi64_pi64 (xmmAlpha),
2536 expand565_16_1x64 (d))));
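/* Hedged scalar equivalents of expand565_16_1x64 and pack565_32_16
 * (helper names illustrative): unpacking replicates the top bits of
 * each field into the low bits, so 0x1f expands to 0xff, matching the
 * SSE2 path; packing simply truncates back to 5/6/5 bits. */
static inline uint32_t
expand565Scalar (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f, g = (p >> 5) & 0x3f, b = p & 0x1f;

    r = (r << 3) | (r >> 2);
    g = (g << 2) | (g >> 4);
    b = (b << 3) | (b >> 2);

    return 0xff000000 | (r << 16) | (g << 8) | b;
}

static inline uint16_t
pack565Scalar (uint32_t p)
{
    return (uint16_t)((((p >> 16) & 0xf8) << 8) |
                      (((p >> 8) & 0xfc) << 3) |
                      ((p & 0xf8) >> 3));
}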
2543 /* -------------------------------------------------------------------------------------------------
2544 * fbCompositeSolidMask_nx8888x8888C
2548 fbCompositeSolidMask_nx8888x8888Csse2 (pixman_op_t op,
2549 pixman_image_t * pSrc,
2550 pixman_image_t * pMask,
2551 pixman_image_t * pDst,
2562 uint32_t *dstLine, d;
2563 uint32_t *maskLine, m;
2565 int dstStride, maskStride;
2567 __m128i xmmSrc, xmmAlpha;
2568 __m128i xmmDst, xmmDstLo, xmmDstHi;
2569 __m128i xmmMask, xmmMaskLo, xmmMaskHi;
2571 __m64 mmxSrc, mmxAlpha, mmxMask, mmxDst;
2573 fbComposeGetSolid(pSrc, src, pDst->bits.format);
2579 fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
2580 fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1);
2582 xmmSrc = _mm_unpacklo_epi8 (createMask_2x32_128 (src, src), _mm_setzero_si128 ());
2583 xmmAlpha = expandAlpha_1x128 (xmmSrc);
2584 mmxSrc = _mm_movepi64_pi64 (xmmSrc);
2585 mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);
2590 uint32_t *pm = (uint32_t *)maskLine;
2591 uint32_t *pd = (uint32_t *)dstLine;
2593 dstLine += dstStride;
2594 maskLine += maskStride;
2596 /* call prefetch hint to optimize cache load*/
2597 cachePrefetch ((__m128i*)pd);
2598 cachePrefetch ((__m128i*)pm);
2600 while (w && (unsigned long)pd & 15)
2607 mmxMask = unpack_32_1x64 (m);
2608 mmxDst = unpack_32_1x64 (d);
2610 *pd = pack_1x64_32 (inOver_1x64 (&mmxSrc,
2620 /* call prefetch hint to optimize cache load*/
2621 cachePrefetch ((__m128i*)pd);
2622 cachePrefetch ((__m128i*)pm);
2626 /* fill cache line with next memory */
2627 cachePrefetchNext ((__m128i*)pd);
2628 cachePrefetchNext ((__m128i*)pm);
2630 xmmMask = load128Unaligned ((__m128i*)pm);
2632 packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128()));
2634 /* if all bits of the mask are zero, packCmp is equal to 0xffff */
2635 if (packCmp != 0xffff)
2637 xmmDst = load128Aligned ((__m128i*)pd);
2639 unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
2640 unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
2642 inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);
2644 save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
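    /* (hedged note) the test above relies on _mm_cmpeq_epi32 producing
     * an all-ones lane for each all-zero mask pixel; _mm_movemask_epi8
     * gathers the 16 byte sign bits, so a fully transparent 4-pixel
     * mask yields exactly 0xffff and the load/blend/store is skipped. */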
2659 mmxMask = unpack_32_1x64 (m);
2660 mmxDst = unpack_32_1x64 (d);
2662 *pd = pack_1x64_32 (inOver_1x64 (&mmxSrc,
2677 /* -------------------------------------------------------------------------------------------------
2678 * fbCompositeSrc_8888x8x8888
2682 fbCompositeSrc_8888x8x8888sse2 (pixman_op_t op,
2683 pixman_image_t * pSrc,
2684 pixman_image_t * pMask,
2685 pixman_image_t * pDst,
2695 uint32_t *dstLine, *dst;
2696 uint32_t *srcLine, *src;
2699 int dstStride, srcStride;
2702 __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
2703 __m128i xmmDst, xmmDstLo, xmmDstHi;
2704 __m128i xmmAlphaLo, xmmAlphaHi;
2706 fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
2707 fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
2708 fbComposeGetSolid (pMask, mask, pDst->bits.format);
2710 xmmMask = createMask_16_128 (mask >> 24);
2715 dstLine += dstStride;
2717 srcLine += srcStride;
2720 /* call prefetch hint to optimize cache load*/
2721 cachePrefetch ((__m128i*)dst);
2722 cachePrefetch ((__m128i*)src);
2724 while (w && (unsigned long)dst & 15)
2726 uint32_t s = *src++;
2729 __m64 ms = unpack_32_1x64 (s);
2730 __m64 alpha = expandAlpha_1x64 (ms);
2731 __m64 dest = _mm_movepi64_pi64 (xmmMask);
2732 __m64 alphaDst = unpack_32_1x64 (d);
2734 *dst++ = pack_1x64_32 (inOver_1x64 (&ms,
2742 /* call prefetch hint to optimize cache load*/
2743 cachePrefetch ((__m128i*)dst);
2744 cachePrefetch ((__m128i*)src);
2748 /* fill cache line with next memory */
2749 cachePrefetchNext ((__m128i*)dst);
2750 cachePrefetchNext ((__m128i*)src);
2752 xmmSrc = load128Unaligned ((__m128i*)src);
2753 xmmDst = load128Aligned ((__m128i*)dst);
2755 unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
2756 unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
2757 expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
2759 inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMask, &xmmMask, &xmmDstLo, &xmmDstHi);
2761 save128Aligned( (__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
2770 uint32_t s = *src++;
2773 __m64 ms = unpack_32_1x64 (s);
2774 __m64 alpha = expandAlpha_1x64 (ms);
2775 __m64 mask = _mm_movepi64_pi64 (xmmMask);
2776 __m64 dest = unpack_32_1x64 (d);
2778 *dst++ = pack_1x64_32 (inOver_1x64 (&ms,
2790 /* -------------------------------------------------------------------------------------------------
2791 * fbCompositeSrc_x888xnx8888
2794 fbCompositeSrc_x888xnx8888sse2 (pixman_op_t op,
2795 pixman_image_t * pSrc,
2796 pixman_image_t * pMask,
2797 pixman_image_t * pDst,
2807 uint32_t *dstLine, *dst;
2808 uint32_t *srcLine, *src;
2810 int dstStride, srcStride;
2813 __m128i xmmMask, xmmAlpha;
2814 __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
2815 __m128i xmmDst, xmmDstLo, xmmDstHi;
2817 fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
2818 fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
2819 fbComposeGetSolid (pMask, mask, pDst->bits.format);
2821 xmmMask = createMask_16_128 (mask >> 24);
2822 xmmAlpha = Mask00ff;
2827 dstLine += dstStride;
2829 srcLine += srcStride;
2832 /* call prefetch hint to optimize cache load*/
2833 cachePrefetch ((__m128i*)dst);
2834 cachePrefetch ((__m128i*)src);
2836 while (w && (unsigned long)dst & 15)
2838 uint32_t s = (*src++) | 0xff000000;
2841 __m64 src = unpack_32_1x64 (s);
2842 __m64 alpha = _mm_movepi64_pi64 (xmmAlpha);
2843 __m64 mask = _mm_movepi64_pi64 (xmmMask);
2844 __m64 dest = unpack_32_1x64 (d);
2846 *dst++ = pack_1x64_32 (inOver_1x64 (&src,
2854 /* call prefetch hint to optimize cache load*/
2855 cachePrefetch ((__m128i*)dst);
2856 cachePrefetch ((__m128i*)src);
2860 /* fill cache line with next memory */
2861 cachePrefetchNext ((__m128i*)dst);
2862 cachePrefetchNext ((__m128i*)src);
2864 xmmSrc = _mm_or_si128 (load128Unaligned ((__m128i*)src), Maskff000000);
2865 xmmDst = load128Aligned ((__m128i*)dst);
2867 unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
2868 unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
2870 inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlpha, &xmmAlpha, &xmmMask, &xmmMask, &xmmDstLo, &xmmDstHi);
2872 save128Aligned( (__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
2882 uint32_t s = (*src++) | 0xff000000;
2885 __m64 src = unpack_32_1x64 (s);
2886 __m64 alpha = _mm_movepi64_pi64 (xmmAlpha);
2887 __m64 mask = _mm_movepi64_pi64 (xmmMask);
2888 __m64 dest = unpack_32_1x64 (d);
2890 *dst++ = pack_1x64_32 (inOver_1x64 (&src,
2902 /* -------------------------------------------------------------------------------------------------
2903 * fbCompositeSrc_8888x8888
2906 fbCompositeSrc_8888x8888sse2 (pixman_op_t op,
2907 pixman_image_t * pSrc,
2908 pixman_image_t * pMask,
2909 pixman_image_t * pDst,
2919 int dstStride, srcStride;
2920 uint32_t *dstLine, *dst;
2921 uint32_t *srcLine, *src;
2923 fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
2924 fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
2931 coreCombineOverUsse2 (dst, src, width);
2939 /* -------------------------------------------------------------------------------------------------
2940 * fbCompositeSrc_8888x0565
2942 static inline uint16_t
2943 fbCompositeSrc_8888x0565pixel (uint32_t src, uint16_t dst)
2947 ms = unpack_32_1x64 (src);
2948 return pack565_32_16( pack_1x64_32 (over_1x64 (ms,
2949 expandAlpha_1x64 (ms),
2950 expand565_16_1x64 (dst))));
2954 fbCompositeSrc_8888x0565sse2 (pixman_op_t op,
2955 pixman_image_t * pSrc,
2956 pixman_image_t * pMask,
2957 pixman_image_t * pDst,
2967 uint16_t *dstLine, *dst, d;
2968 uint32_t *srcLine, *src, s;
2969 int dstStride, srcStride;
2972 __m128i xmmAlphaLo, xmmAlphaHi;
2973 __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
2974 __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;
2976 fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
2977 fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
2982 * This code is copied from the MMX version, FIXME included.
2983 * If it's a problem there, it's probably a problem here too.
2985 assert (pSrc->pDrawable == pMask->pDrawable);
2993 /* call prefetch hint to optimize cache load*/
2994 cachePrefetch ((__m128i*)src);
2995 cachePrefetch ((__m128i*)dst);
2997 dstLine += dstStride;
2998 srcLine += srcStride;
3001 /* Align dst on a 16-byte boundary */
3003 ((unsigned long)dst & 15))
3008 *dst++ = fbCompositeSrc_8888x0565pixel (s, d);
3012 /* call prefetch hint to optimize cache load*/
3013 cachePrefetch ((__m128i*)src);
3014 cachePrefetch ((__m128i*)dst);
3016 /* It's an 8-pixel loop */
3019 /* fill cache line with next memory */
3020 cachePrefetchNext ((__m128i*)src);
3021 cachePrefetchNext ((__m128i*)dst);
3023 /* I'm loading unaligned because I'm not sure about the address alignment. */
3024 xmmSrc = load128Unaligned ((__m128i*) src);
3025 xmmDst = load128Aligned ((__m128i*) dst);
3028 unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
3029 unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
3030 expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
3032 /* I'm loading the next 4 pixels early to overlap the memory read. */
3033 xmmSrc = load128Unaligned ((__m128i*) (src+4));
3035 over_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDst0, &xmmDst1);
3038 unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
3039 expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
3041 over_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDst2, &xmmDst3);
3043 save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));
3055 *dst++ = fbCompositeSrc_8888x0565pixel (s, d);
3062 /* -------------------------------------------------------------------------------------------------
3063 * fbCompositeSolidMask_nx8x8888
3067 fbCompositeSolidMask_nx8x8888sse2 (pixman_op_t op,
3068 pixman_image_t * pSrc,
3069 pixman_image_t * pMask,
3070 pixman_image_t * pDst,
3081 uint32_t *dstLine, *dst;
3082 uint8_t *maskLine, *mask;
3083 int dstStride, maskStride;
3087 __m128i xmmSrc, xmmAlpha, xmmDef;
3088 __m128i xmmDst, xmmDstLo, xmmDstHi;
3089 __m128i xmmMask, xmmMaskLo, xmmMaskHi;
3091 __m64 mmxSrc, mmxAlpha, mmxMask, mmxDest;
3093 fbComposeGetSolid(pSrc, src, pDst->bits.format);
3099 fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
3100 fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
3102 xmmDef = createMask_2x32_128 (src, src);
3103 xmmSrc = expandPixel_32_1x128 (src);
3104 xmmAlpha = expandAlpha_1x128 (xmmSrc);
3105 mmxSrc = _mm_movepi64_pi64 (xmmSrc);
3106 mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);
3111 dstLine += dstStride;
3113 maskLine += maskStride;
3116 /* call prefetch hint to optimize cache load*/
3117 cachePrefetch ((__m128i*)mask);
3118 cachePrefetch ((__m128i*)dst);
3120 while (w && (unsigned long)dst & 15)
3122 uint8_t m = *mask++;
3127 mmxMask = expandPixel_8_1x64 (m);
3128 mmxDest = unpack_32_1x64 (d);
3130 *dst = pack_1x64_32 (inOver_1x64 (&mmxSrc,
3140 /* call prefetch hint to optimize cache load*/
3141 cachePrefetch ((__m128i*)mask);
3142 cachePrefetch ((__m128i*)dst);
3146 /* fill cache line with next memory */
3147 cachePrefetchNext ((__m128i*)mask);
3148 cachePrefetchNext ((__m128i*)dst);
3150 m = *((uint32_t*)mask);
3152 if (srca == 0xff && m == 0xffffffff)
3154 save128Aligned ((__m128i*)dst, xmmDef);
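    /* (hedged note) fast path: srca == 0xff and m == 0xffffffff mean
     * every pixel gets the opaque solid color, so the replicated color
     * in xmmDef is stored directly and the blend below is skipped. */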
3158 xmmDst = load128Aligned ((__m128i*) dst);
3159 xmmMask = unpack_32_1x128 (m);
3160 xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());
3163 unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
3164 unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
3166 expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
3168 inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);
3170 save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
3180 uint8_t m = *mask++;
3185 mmxMask = expandPixel_8_1x64 (m);
3186 mmxDest = unpack_32_1x64 (d);
3188 *dst = pack_1x64_32 (inOver_1x64 (&mmxSrc,
3202 /* -------------------------------------------------------------------------------------------------
3203 * pixmanFillsse2
3207 pixmanFillsse2 (uint32_t *bits,
3216 uint32_t byte_width;
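    /* (hedged note) the checks below restrict this fast fill to 16bpp
     * and 32bpp surfaces; a 16bpp caller must pass 'data' with the
     * pixel replicated in both halves, e.g.
     *     data = ((uint32_t) pix565 << 16) | pix565;
     * so that each 32-bit (and 128-bit) store writes whole pixels. */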
3221 if (bpp == 16 && (data >> 16 != (data & 0xffff)))
3224 if (bpp != 16 && bpp != 32)
3229 stride = stride * (int) sizeof (uint32_t) / 2;
3230 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3231 byte_width = 2 * width;
3236 stride = stride * (int) sizeof (uint32_t) / 4;
3237 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3238 byte_width = 4 * width;
3242 cachePrefetch ((__m128i*)byte_line);
3243 xmmDef = createMask_2x32_128 (data, data);
3248 uint8_t *d = byte_line;
3249 byte_line += stride;
3253 cachePrefetchNext ((__m128i*)d);
3255 while (w >= 2 && ((unsigned long)d & 3))
3257 *(uint16_t *)d = data;
3262 while (w >= 4 && ((unsigned long)d & 15))
3264 *(uint32_t *)d = data;
3270 cachePrefetchNext ((__m128i*)d);
3274 cachePrefetch (((__m128i*)d) + 12);
3276 save128Aligned ((__m128i*)(d), xmmDef);
3277 save128Aligned ((__m128i*)(d+16), xmmDef);
3278 save128Aligned ((__m128i*)(d+32), xmmDef);
3279 save128Aligned ((__m128i*)(d+48), xmmDef);
3280 save128Aligned ((__m128i*)(d+64), xmmDef);
3281 save128Aligned ((__m128i*)(d+80), xmmDef);
3282 save128Aligned ((__m128i*)(d+96), xmmDef);
3283 save128Aligned ((__m128i*)(d+112), xmmDef);
3291 cachePrefetch (((__m128i*)d) + 8);
3293 save128Aligned ((__m128i*)(d), xmmDef);
3294 save128Aligned ((__m128i*)(d+16), xmmDef);
3295 save128Aligned ((__m128i*)(d+32), xmmDef);
3296 save128Aligned ((__m128i*)(d+48), xmmDef);
3302 cachePrefetchNext ((__m128i*)d);
3306 save128Aligned ((__m128i*)(d), xmmDef);
3307 save128Aligned ((__m128i*)(d+16), xmmDef);
3315 save128Aligned ((__m128i*)(d), xmmDef);
3321 cachePrefetchNext ((__m128i*)d);
3325 *(uint32_t *)d = data;
3333 *(uint16_t *)d = data;
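/* A hedged usage sketch (field names taken from the calls elsewhere in
 * this file; the variable, coordinates and color are illustrative):
 * filling a 100x50 region of a 32bpp surface at (10, 20) with opaque
 * red would be
 *
 *     pixmanFillsse2 (image->bits.bits, image->bits.rowstride, 32,
 *                     10, 20, 100, 50, 0xffff0000);
 */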
3344 fbCompositeSolidMaskSrc_nx8x8888sse2 (pixman_op_t op,
3345 pixman_image_t * pSrc,
3346 pixman_image_t * pMask,
3347 pixman_image_t * pDst,
3358 uint32_t *dstLine, *dst;
3359 uint8_t *maskLine, *mask;
3360 int dstStride, maskStride;
3364 __m128i xmmSrc, xmmDef;
3365 __m128i xmmMask, xmmMaskLo, xmmMaskHi;
3367 fbComposeGetSolid(pSrc, src, pDst->bits.format);
3372 pixmanFillsse2 (pDst->bits.bits, pDst->bits.rowstride,
3373 PIXMAN_FORMAT_BPP (pDst->bits.format),
3374 xDst, yDst, width, height, 0);
3378 fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
3379 fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
3381 xmmDef = createMask_2x32_128 (src, src);
3382 xmmSrc = expandPixel_32_1x128 (src);
3387 dstLine += dstStride;
3389 maskLine += maskStride;
3392 /* call prefetch hint to optimize cache load*/
3393 cachePrefetch ((__m128i*)mask);
3394 cachePrefetch ((__m128i*)dst);
3396 while (w && (unsigned long)dst & 15)
3398 uint8_t m = *mask++;
3402 *dst = pack_1x64_32 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmSrc), expandPixel_8_1x64 (m)));
3413 /* call prefetch hint to optimize cache load*/
3414 cachePrefetch ((__m128i*)mask);
3415 cachePrefetch ((__m128i*)dst);
3419 /* fill cache line with next memory */
3420 cachePrefetchNext ((__m128i*)mask);
3421 cachePrefetchNext ((__m128i*)dst);
3423 m = *((uint32_t*)mask);
3425 if (srca == 0xff && m == 0xffffffff)
3427 save128Aligned ((__m128i*)dst, xmmDef);
3431 xmmMask = unpack_32_1x128 (m);
3432 xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());
3435 unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
3437 expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
3439 pixMultiply_2x128 (&xmmSrc, &xmmSrc, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
3441 save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmMaskLo, xmmMaskHi));
3445 save128Aligned ((__m128i*)dst, _mm_setzero_si128());
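    /* (hedged note) the two mask extremes of this SRC-style path need
     * no arithmetic: an all-0xff mask stored the replicated color
     * (xmmDef) above, and an all-zero mask stores zeros here. */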
3455 uint8_t m = *mask++;
3459 *dst = pack_1x64_32 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmSrc), expandPixel_8_1x64 (m)));
3474 /* -------------------------------------------------------------------------------------------------
3475 * fbCompositeSolidMask_nx8x0565
3479 fbCompositeSolidMask_nx8x0565sse2 (pixman_op_t op,
3480 pixman_image_t * pSrc,
3481 pixman_image_t * pMask,
3482 pixman_image_t * pDst,
3493 uint16_t *dstLine, *dst, d;
3494 uint8_t *maskLine, *mask;
3495 int dstStride, maskStride;
3498 __m64 mmxSrc, mmxAlpha, mmxMask, mmxDest;
3500 __m128i xmmSrc, xmmAlpha;
3501 __m128i xmmMask, xmmMaskLo, xmmMaskHi;
3502 __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;
3504 fbComposeGetSolid(pSrc, src, pDst->bits.format);
3510 fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
3511 fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
3513 xmmSrc = expandPixel_32_1x128 (src);
3514 xmmAlpha = expandAlpha_1x128 (xmmSrc);
3515 mmxSrc = _mm_movepi64_pi64 (xmmSrc);
3516 mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);
3521 dstLine += dstStride;
3523 maskLine += maskStride;
3526 /* call prefetch hint to optimize cache load*/
3527 cachePrefetch ((__m128i*)mask);
3528 cachePrefetch ((__m128i*)dst);
3530 while (w && (unsigned long)dst & 15)
3537 mmxMask = expandAlphaRev_1x64 (unpack_32_1x64 (m));
3538 mmxDest = expand565_16_1x64 (d);
3540 *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,
3550 /* call prefetch hint to optimize cache load*/
3551 cachePrefetch ((__m128i*)mask);
3552 cachePrefetch ((__m128i*)dst);
3556 /* fill cache line with next memory */
3557 cachePrefetchNext ((__m128i*)mask);
3558 cachePrefetchNext ((__m128i*)dst);
3560 xmmDst = load128Aligned ((__m128i*) dst);
3561 unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
3563 m = *((uint32_t*)mask);
3568 xmmMask = unpack_32_1x128 (m);
3569 xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());
3572 unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
3574 expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
3575 inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst0, &xmmDst1);
3578 m = *((uint32_t*)mask);
3583 xmmMask = unpack_32_1x128 (m);
3584 xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());
3587 unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
3589 expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
3590 inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst2, &xmmDst3);
3593 save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));
3606 mmxMask = expandAlphaRev_1x64 (unpack_32_1x64 (m));
3607 mmxDest = expand565_16_1x64 (d);
3609 *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,
3623 /* -------------------------------------------------------------------------------------------------
3624 * fbCompositeSrc_8888RevNPx0565
3628 fbCompositeSrc_8888RevNPx0565sse2 (pixman_op_t op,
3629 pixman_image_t * pSrc,
3630 pixman_image_t * pMask,
3631 pixman_image_t * pDst,
3641 uint16_t *dstLine, *dst, d;
3642 uint32_t *srcLine, *src, s;
3643 int dstStride, srcStride;
3648 __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
3649 __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;
3651 fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
3652 fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
3657 * This code is copied from the MMX version, FIXME included.
3658 * If it's a problem there, it's probably a problem here too.
3660 assert (pSrc->pDrawable == pMask->pDrawable);
3666 dstLine += dstStride;
3668 srcLine += srcStride;
3671 /* call prefetch hint to optimize cache load*/
3672 cachePrefetch ((__m128i*)src);
3673 cachePrefetch ((__m128i*)dst);
3675 while (w && (unsigned long)dst & 15)
3680 ms = unpack_32_1x64 (s);
3682 *dst++ = pack565_32_16 (pack_1x64_32 (overRevNonPre_1x64(ms, expand565_16_1x64 (d))));
3686 /* call prefetch hint to optimize cache load*/
3687 cachePrefetch ((__m128i*)src);
3688 cachePrefetch ((__m128i*)dst);
3692 /* fill cache line with next memory */
3693 cachePrefetchNext ((__m128i*)src);
3694 cachePrefetchNext ((__m128i*)dst);
3697 xmmSrc = load128Unaligned((__m128i*)src);
3698 xmmDst = load128Aligned ((__m128i*)dst);
3700 packCmp = packAlpha (xmmSrc);
3702 unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
3703 unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
3705 /* preload next round*/
3706 xmmSrc = load128Unaligned((__m128i*)(src+4));
3709 if (packCmp == 0xffffffff)
3711 invertColors_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst0, &xmmDst1);
3715 overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst0, &xmmDst1);
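    /* (hedged note) packCmp gathers the four source alpha bytes via
     * packAlpha, so 0xffffffff means all four pixels are opaque and
     * OVER reduces to the color-swapped copy in the 'if' branch; the
     * full overRevNonPre blend is only paid in the 'else'. */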
3719 packCmp = packAlpha (xmmSrc);
3721 unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
3723 if (packCmp == 0xffffffff)
3725 invertColors_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst2, &xmmDst3);
3729 overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst2, &xmmDst3);
3732 save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));
3744 ms = unpack_32_1x64 (s);
3746 *dst++ = pack565_32_16 (pack_1x64_32 (overRevNonPre_1x64(ms, expand565_16_1x64 (d))));
3754 /* "8888RevNP" is GdkPixbuf's format: ABGR, non-premultiplied */
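/* A hedged scalar sketch of the per-pixel preparation the RevNP paths
 * perform before the ordinary OVER: swap R and B (ABGR -> ARGB) and
 * premultiply the color channels by alpha, with the usual rounded
 * x*a/255 multiply (helper name illustrative): */
static inline uint32_t
revNPToPremultipliedScalar (uint32_t s)
{
    uint32_t a = s >> 24;
    uint32_t b = (s >> 16) & 0xff;
    uint32_t g = (s >> 8) & 0xff;
    uint32_t r = s & 0xff;

    r = r * a + 0x80;  r = (r + (r >> 8)) >> 8;
    g = g * a + 0x80;  g = (g + (g >> 8)) >> 8;
    b = b * a + 0x80;  b = (b + (b >> 8)) >> 8;

    return (a << 24) | (r << 16) | (g << 8) | b;
}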
3756 /* -------------------------------------------------------------------------------------------------
3757 * fbCompositeSrc_8888RevNPx8888
3761 fbCompositeSrc_8888RevNPx8888sse2 (pixman_op_t op,
3762 pixman_image_t * pSrc,
3763 pixman_image_t * pMask,
3764 pixman_image_t * pDst,
3774 uint32_t *dstLine, *dst, d;
3775 uint32_t *srcLine, *src, s;
3776 int dstStride, srcStride;
3780 __m128i xmmSrcLo, xmmSrcHi;
3781 __m128i xmmDstLo, xmmDstHi;
3783 fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
3784 fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
3789 * This code is copied from the MMX version, FIXME included.
3790 * If it's a problem there, it's probably a problem here too.
3792 assert (pSrc->pDrawable == pMask->pDrawable);
3798 dstLine += dstStride;
3800 srcLine += srcStride;
3803 /* call prefetch hint to optimize cache load*/
3804 cachePrefetch ((__m128i*)src);
3805 cachePrefetch ((__m128i*)dst);
3807 while (w && (unsigned long)dst & 15)
3812 *dst++ = pack_1x64_32 (overRevNonPre_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
3817 /* call prefetch hint to optimize cache load*/
3818 cachePrefetch ((__m128i*)src);
3819 cachePrefetch ((__m128i*)dst);
3823 /* fill cache line with next memory */
3824 cachePrefetchNext ((__m128i*)src);
3825 cachePrefetchNext ((__m128i*)dst);
3827 xmmSrcHi = load128Unaligned((__m128i*)src);
3829 packCmp = packAlpha (xmmSrcHi);
3831 unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
3833 if (packCmp == 0xffffffff)
3835 invertColors_2x128( xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi);
3837 save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
3841 xmmDstHi = load128Aligned ((__m128i*)dst);
3843 unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
3845 overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi);
3847 save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
3860 *dst++ = pack_1x64_32 (overRevNonPre_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
3869 /* -------------------------------------------------------------------------------------------------
3870 * fbCompositeSolidMask_nx8888x0565C
3874 fbCompositeSolidMask_nx8888x0565Csse2 (pixman_op_t op,
3875 pixman_image_t * pSrc,
3876 pixman_image_t * pMask,
3877 pixman_image_t * pDst,
3888 uint16_t *dstLine, *dst, d;
3889 uint32_t *maskLine, *mask, m;
3890 int dstStride, maskStride;
3894 __m128i xmmSrc, xmmAlpha;
3895 __m128i xmmMask, xmmMaskLo, xmmMaskHi;
3896 __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;
3898 __m64 mmxSrc, mmxAlpha, mmxMask, mmxDest;
3900 fbComposeGetSolid(pSrc, src, pDst->bits.format);
3906 fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
3907 fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1);
3909 xmmSrc = expandPixel_32_1x128 (src);
3910 xmmAlpha = expandAlpha_1x128 (xmmSrc);
3911 mmxSrc = _mm_movepi64_pi64 (xmmSrc);
3912 mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);
3919 maskLine += maskStride;
3920 dstLine += dstStride;
3922 /* call prefetch hint to optimize cache load*/
3923 cachePrefetch ((__m128i*)mask);
3924 cachePrefetch ((__m128i*)dst);
3926 while (w && ((unsigned long)dst & 15))
3928 m = *(uint32_t *) mask;
3933 mmxMask = unpack_32_1x64 (m);
3934 mmxDest = expand565_16_1x64 (d);
3936 *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,
3947 /* call prefetch hint to optimize cache load*/
3948 cachePrefetch ((__m128i*)mask);
3949 cachePrefetch ((__m128i*)dst);
3953 /* fill cache line with next memory */
3954 cachePrefetchNext ((__m128i*)mask);
3955 cachePrefetchNext ((__m128i*)dst);
3958 xmmMask = load128Unaligned((__m128i*)mask);
3959 xmmDst = load128Aligned((__m128i*)dst);
3961 packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128()));
3963 unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
3964 unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
3966 /* preload next round*/
3967 xmmMask = load128Unaligned((__m128i*)(mask+4));
3970 if (packCmp != 0xffff)
3972 inOver_2x128(&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst0, &xmmDst1);
3976 packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128()));
3978 unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
3980 if (packCmp != 0xffff)
3982 inOver_2x128(&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst2, &xmmDst3);
3985 save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));
3994 m = *(uint32_t *) mask;
3999 mmxMask = unpack_32_1x64 (m);
4000 mmxDest = expand565_16_1x64 (d);
4002 *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,
4017 /* -------------------------------------------------------------------------------------------------
4018 * fbCompositeIn_nx8x8
4022 fbCompositeIn_nx8x8sse2 (pixman_op_t op,
4023 pixman_image_t * pSrc,
4024 pixman_image_t * pMask,
4025 pixman_image_t * pDst,
4035 uint8_t *dstLine, *dst;
4036 uint8_t *maskLine, *mask;
4037 int dstStride, maskStride;
4043 __m128i xmmMask, xmmMaskLo, xmmMaskHi;
4044 __m128i xmmDst, xmmDstLo, xmmDstHi;
4046 fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
4047 fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
4049 fbComposeGetSolid(pSrc, src, pDst->bits.format);
4055 xmmAlpha = expandAlpha_1x128 (expandPixel_32_1x128 (src));
4060 dstLine += dstStride;
4062 maskLine += maskStride;
4065 /* call prefetch hint to optimize cache load*/
4066 cachePrefetch ((__m128i*)mask);
4067 cachePrefetch ((__m128i*)dst);
4069 while (w && ((unsigned long)dst & 15))
4071 m = (uint32_t) *mask++;
4072 d = (uint32_t) *dst;
4074 *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
4075 unpack_32_1x64 (d)));
4079 /* call prefetch hint to optimize cache load*/
4080 cachePrefetch ((__m128i*)mask);
4081 cachePrefetch ((__m128i*)dst);
4085 /* fill cache line with next memory */
4086 cachePrefetchNext ((__m128i*)mask);
4087 cachePrefetchNext ((__m128i*)dst);
4089 xmmMask = load128Unaligned((__m128i*)mask);
4090 xmmDst = load128Aligned((__m128i*)dst);
4092 unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
4093 unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
4095 pixMultiply_2x128 (&xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
4096 pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);
4098 save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
4107 m = (uint32_t) *mask++;
4108 d = (uint32_t) *dst;
4110 *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
4111 unpack_32_1x64 (d)));
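/* A hedged scalar sketch of the double multiply the IN path computes
 * above (helper name illustrative): one rounded x*y/255 step applied
 * twice, i.e. dst = mulDiv255 (mulDiv255 (srcAlpha, m), dst). */
static inline uint8_t
mulDiv255Scalar (uint8_t x, uint8_t y)
{
    uint32_t t = (uint32_t)x * y + 0x80;

    return (uint8_t)((t + (t >> 8)) >> 8);
}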
4119 /* -------------------------------------------------------------------------------------------------
4120 * fbCompositeIn_8x8
4124 fbCompositeIn_8x8sse2 (pixman_op_t op,
4125 pixman_image_t * pSrc,
4126 pixman_image_t * pMask,
4127 pixman_image_t * pDst,
4137 uint8_t *dstLine, *dst;
4138 uint8_t *srcLine, *src;
4139 int srcStride, dstStride;
4143 __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
4144 __m128i xmmDst, xmmDstLo, xmmDstHi;
4146 fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
4147 fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);
4152 dstLine += dstStride;
4154 srcLine += srcStride;
4157 /* call prefetch hint to optimize cache load*/
4158 cachePrefetch ((__m128i*)src);
4159 cachePrefetch ((__m128i*)dst);
4161 while (w && ((unsigned long)dst & 15))
4163 s = (uint32_t) *src++;
4164 d = (uint32_t) *dst;
4166 *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s),unpack_32_1x64 (d)));
4170 /* call prefetch hint to optimize cache load*/
4171 cachePrefetch ((__m128i*)src);
4172 cachePrefetch ((__m128i*)dst);
4176 /* fill cache line with next memory */
4177 cachePrefetchNext ((__m128i*)src);
4178 cachePrefetchNext ((__m128i*)dst);
4180 xmmSrc = load128Unaligned((__m128i*)src);
4181 xmmDst = load128Aligned((__m128i*)dst);
4183 unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
4184 unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
4186 pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);
4188 save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
4197 s = (uint32_t) *src++;
4198 d = (uint32_t) *dst;
4200 *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s),unpack_32_1x64 (d)));
4208 /* -------------------------------------------------------------------------------------------------
4209 * fbCompositeSrcAdd_8888x8x8
4213 fbCompositeSrcAdd_8888x8x8sse2 (pixman_op_t op,
4214 pixman_image_t * pSrc,
4215 pixman_image_t * pMask,
4216 pixman_image_t * pDst,
4226 uint8_t *dstLine, *dst;
4227 uint8_t *maskLine, *mask;
4228 int dstStride, maskStride;
4235 __m128i xmmMask, xmmMaskLo, xmmMaskHi;
4236 __m128i xmmDst, xmmDstLo, xmmDstHi;
4238 fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
4239 fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
4241 fbComposeGetSolid(pSrc, src, pDst->bits.format);
4247 xmmAlpha = expandAlpha_1x128 (expandPixel_32_1x128 (src));
4252 dstLine += dstStride;
4254 maskLine += maskStride;
4257 /* call prefetch hint to optimize cache load*/
4258 cachePrefetch ((__m128i*)mask);
4259 cachePrefetch ((__m128i*)dst);
4261 while (w && ((unsigned long)dst & 15))
4263 m = (uint32_t) *mask++;
4264 d = (uint32_t) *dst;
4266 *dst++ = (uint8_t) pack_1x64_32 (_mm_adds_pu16 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
4267 unpack_32_1x64 (d)));
4271 /* call prefetch hint to optimize cache load*/
4272 cachePrefetch ((__m128i*)mask);
4273 cachePrefetch ((__m128i*)dst);
4277 /* fill cache line with next memory */
4278 cachePrefetchNext ((__m128i*)mask);
4279 cachePrefetchNext ((__m128i*)dst);
4281 xmmMask = load128Unaligned((__m128i*)mask);
4282 xmmDst = load128Aligned((__m128i*)dst);
4284 unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
4285 unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
4287 pixMultiply_2x128 (&xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
4289 xmmDstLo = _mm_adds_epu16 (xmmMaskLo, xmmDstLo);
4290 xmmDstHi = _mm_adds_epu16 (xmmMaskHi, xmmDstHi);
4292 save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
4301 m = (uint32_t) *mask++;
4302 d = (uint32_t) *dst;
4304 *dst++ = (uint8_t) pack_1x64_32 (_mm_adds_pu16 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
4305 unpack_32_1x64 (d)));
4313 /* -------------------------------------------------------------------------------------------------
4314 * fbCompositeSrcAdd_8000x8000
4318 fbCompositeSrcAdd_8000x8000sse2 (pixman_op_t op,
4319 pixman_image_t * pSrc,
4320 pixman_image_t * pMask,
4321 pixman_image_t * pDst,
4331 uint8_t *dstLine, *dst;
4332 uint8_t *srcLine, *src;
4333 int dstStride, srcStride;
4337 fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);
4338 fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
4345 /* call prefetch hint to optimize cache load*/
4346 cachePrefetch ((__m128i*)src);
4347 cachePrefetch ((__m128i*)dst);
4349 dstLine += dstStride;
4350 srcLine += srcStride;
4354 while (w && (unsigned long)dst & 3)
4356 t = (*dst) + (*src++);
4357 *dst++ = t | (0 - (t >> 8));
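    /* (hedged note) branch-free 8-bit saturating add: t is at most
     * 510, so (t >> 8) is 1 exactly on overflow and (0 - (t >> 8))
     * becomes all ones, clamping the stored byte to 0xff. */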
4361 coreCombineAddUsse2 ((uint32_t*)dst, (uint32_t*)src, w >> 2);
4371 t = (*dst) + (*src++);
4372 *dst++ = t | (0 - (t >> 8));
4380 /* -------------------------------------------------------------------------------------------------
4381 * fbCompositeSrcAdd_8888x8888
4384 fbCompositeSrcAdd_8888x8888sse2 (pixman_op_t op,
4385 pixman_image_t * pSrc,
4386 pixman_image_t * pMask,
4387 pixman_image_t * pDst,
4397 uint32_t *dstLine, *dst;
4398 uint32_t *srcLine, *src;
4399 int dstStride, srcStride;
4401 fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
4402 fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
4407 dstLine += dstStride;
4409 srcLine += srcStride;
4411 coreCombineAddUsse2 (dst, src, width);
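/* A hedged sketch of the per-block work coreCombineAddUsse2 is assumed
 * to do (the scalar tail in fbCompositeSrcAdd_8000x8000 implements the
 * same clamp): a single byte-wise saturating add per 4-pixel block. */
static inline __m128i
addSaturate4Pixels (__m128i dst, __m128i src)
{
    return _mm_adds_epu8 (dst, src);
}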
4417 /* -------------------------------------------------------------------------------------------------
4418 * fbCompositeCopyAreasse2
4422 pixmanBltsse2 (uint32_t *src_bits,
4428 int src_x, int src_y,
4429 int dst_x, int dst_y,
4430 int width, int height)
4432 uint8_t * src_bytes;
4433 uint8_t * dst_bytes;
4436 if (src_bpp != dst_bpp)
4441 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
4442 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
4443 src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
4444 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
4445 byte_width = 2 * width;
4449 else if (src_bpp == 32)
4451 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
4452 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
4453 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
4454 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
4455 byte_width = 4 * width;
4464 cachePrefetch ((__m128i*)src_bytes);
4465 cachePrefetch ((__m128i*)dst_bytes);
4470 uint8_t *s = src_bytes;
4471 uint8_t *d = dst_bytes;
4472 src_bytes += src_stride;
4473 dst_bytes += dst_stride;
4476 cachePrefetchNext ((__m128i*)s);
4477 cachePrefetchNext ((__m128i*)d);
4479 while (w >= 2 && ((unsigned long)d & 3))
4481 *(uint16_t *)d = *(uint16_t *)s;
4487 while (w >= 4 && ((unsigned long)d & 15))
4489 *(uint32_t *)d = *(uint32_t *)s;
4496 cachePrefetchNext ((__m128i*)s);
4497 cachePrefetchNext ((__m128i*)d);
4501 __m128i xmm0, xmm1, xmm2, xmm3;
4503 /* 128 bytes ahead */
4504 cachePrefetch (((__m128i*)s) + 8);
4505 cachePrefetch (((__m128i*)d) + 8);
4507 xmm0 = load128Unaligned ((__m128i*)(s));
4508 xmm1 = load128Unaligned ((__m128i*)(s+16));
4509 xmm2 = load128Unaligned ((__m128i*)(s+32));
4510 xmm3 = load128Unaligned ((__m128i*)(s+48));
4512 save128Aligned ((__m128i*)(d), xmm0);
4513 save128Aligned ((__m128i*)(d+16), xmm1);
4514 save128Aligned ((__m128i*)(d+32), xmm2);
4515 save128Aligned ((__m128i*)(d+48), xmm3);
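            /* (hedged note) only dst was aligned by the head loops
             * above, so the loads use load128Unaligned while the
             * stores can take the aligned save128Aligned path. */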
4522 cachePrefetchNext ((__m128i*)s);
4523 cachePrefetchNext ((__m128i*)d);
4527 save128Aligned ((__m128i*)d, load128Unaligned ((__m128i*)s) );
4534 cachePrefetchNext ((__m128i*)s);
4535 cachePrefetchNext ((__m128i*)d);
4539 *(uint32_t *)d = *(uint32_t *)s;
4548 *(uint16_t *)d = *(uint16_t *)s;
4561 fbCompositeCopyAreasse2 (pixman_op_t op,
4562 pixman_image_t * pSrc,
4563 pixman_image_t * pMask,
4564 pixman_image_t * pDst,
4574 pixmanBltsse2 (pSrc->bits.bits,
4576 pSrc->bits.rowstride,
4577 pDst->bits.rowstride,
4578 PIXMAN_FORMAT_BPP (pSrc->bits.format),
4579 PIXMAN_FORMAT_BPP (pDst->bits.format),
4580 xSrc, ySrc, xDst, yDst, width, height);
4584 /* This code is buggy in the MMX version, and the bug was carried over to this SSE2 version */
4586 fbCompositeOver_x888x8x8888sse2 (pixman_op_t op,
4587 pixman_image_t * pSrc,
4588 pixman_image_t * pMask,
4589 pixman_image_t * pDst,
4599 uint32_t *src, *srcLine, s;
4600 uint32_t *dst, *dstLine, d;
4601 uint8_t *mask, *maskLine;
4603 int srcStride, maskStride, dstStride;
4606 __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
4607 __m128i xmmDst, xmmDstLo, xmmDstHi;
4608 __m128i xmmMask, xmmMaskLo, xmmMaskHi;
4610 fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
4611 fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
4612 fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
4617 srcLine += srcStride;
4619 dstLine += dstStride;
4621 maskLine += maskStride;
4625 /* call prefetch hint to optimize cache load*/
4626 cachePrefetch ((__m128i*)src);
4627 cachePrefetch ((__m128i*)dst);
4628 cachePrefetch ((__m128i*)mask);
4630 while (w && (unsigned long)dst & 15)
4632 s = 0xff000000 | *src++;
4633 m = (uint32_t) *mask++;
4636 __m64 ms = unpack_32_1x64 (s);
4640 ms = inOver_1x64 (ms,
4642 expandAlphaRev_1x64 (unpack_32_1x64 (m)),
4643 unpack_32_1x64 (d));
4646 *dst++ = pack_1x64_32 (ms);
4650 /* call prefetch hint to optimize cache load*/
4651 cachePrefetch ((__m128i*)src);
4652 cachePrefetch ((__m128i*)dst);
4653 cachePrefetch ((__m128i*)mask);
4657 /* fill cache line with next memory */
4658 cachePrefetchNext ((__m128i*)src);
4659 cachePrefetchNext ((__m128i*)dst);
4660 cachePrefetchNext ((__m128i*)mask);
4662 m = *(uint32_t*) mask;
4663 xmmSrc = _mm_or_si128 (load128Unaligned ((__m128i*)src), Maskff000000);
4665 if (m == 0xffffffff)
4667 save128Aligned ((__m128i*)dst, xmmSrc);
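    /* (hedged note) the source was forced opaque (ORed with
     * Maskff000000), so an all-0xff mask turns OVER into a plain copy
     * and the unpack/blend below is skipped. */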
4671 xmmDst = load128Aligned ((__m128i*)dst);
4673 xmmMask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
4675 unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
4676 unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
4677 unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
4679 expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
4681 inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &Mask00ff, &Mask00ff, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);
4683 save128Aligned( (__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
4694 m = (uint32_t) *mask++;
4698 s = 0xff000000 | *src;
4708 *dst = pack_1x64_32 (inOver_1x64 (unpack_32_1x64 (s),
4710 expandAlphaRev_1x64 (unpack_32_1x64 (m)),
4711 unpack_32_1x64 (d)));
4726 #endif /* USE_SSE2 */