/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
 *          André Tupinambá (andrelrt@gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */
#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */

#include "pixman-sse.h"

#ifdef _MSC_VER
#define inline __forceinline
#endif

#ifdef __GNUC__
#   define inline __inline__ __attribute__ ((__always_inline__))
#endif
/* -------------------------------------------------------------------------------------------------
 * Locals
 */

static __m64 xMask0080;
static __m64 xMask00ff;
static __m64 xMask0101;
static __m64 xMaskAlpha;

static __m64 xMask565rgb;
static __m64 xMask565Unpack;

static __m128i Mask0080;
static __m128i Mask00ff;
static __m128i Mask0101;
static __m128i Maskffff;
static __m128i Maskff000000;
static __m128i MaskAlpha;

static __m128i Mask565r;
static __m128i Mask565g1, Mask565g2;
static __m128i Mask565b;
static __m128i MaskRed;
static __m128i MaskGreen;
static __m128i MaskBlue;
/* -------------------------------------------------------------------------------------------------
 * SSE2 Inlines
 */
static inline __m128i
unpack_32_1x128 (uint32_t data)
{
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
}

static inline void
unpack_128_2x128 (__m128i data, __m128i* dataLo, __m128i* dataHi)
{
    *dataLo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    *dataHi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}

static inline void
unpack565_128_4x128 (__m128i data, __m128i* data0, __m128i* data1, __m128i* data2, __m128i* data3)
{
    __m128i lo, hi, r, g, b;

    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), MaskRed);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), MaskGreen);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), MaskBlue);

    lo = _mm_or_si128 (_mm_or_si128 (r, g), b);

    r = _mm_and_si128 (_mm_slli_epi32 (hi, 8), MaskRed);
    g = _mm_and_si128 (_mm_slli_epi32 (hi, 5), MaskGreen);
    b = _mm_and_si128 (_mm_slli_epi32 (hi, 3), MaskBlue);

    hi = _mm_or_si128 (_mm_or_si128 (r, g), b);

    unpack_128_2x128 (lo, data0, data1);
    unpack_128_2x128 (hi, data2, data3);
}
static inline uint16_t
pack565_32_16 (uint32_t pixel)
{
    return (uint16_t) (((pixel>>8) & 0xf800) | ((pixel>>5) & 0x07e0) | ((pixel>>3) & 0x001f));
}

static inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    return _mm_packus_epi16 (lo, hi);
}

static inline __m128i
pack565_2x128_128 (__m128i lo, __m128i hi)
{
    __m128i data;
    __m128i r, g1, g2, b;

    data = pack_2x128_128 ( lo, hi );

    r  = _mm_and_si128 (data , Mask565r);
    g1 = _mm_and_si128 (_mm_slli_epi32 (data , 3), Mask565g1);
    g2 = _mm_and_si128 (_mm_srli_epi32 (data , 5), Mask565g2);
    b  = _mm_and_si128 (_mm_srli_epi32 (data , 3), Mask565b);

    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}

static inline __m128i
pack565_4x128_128 (__m128i xmm0, __m128i xmm1, __m128i xmm2, __m128i xmm3)
{
    __m128i lo, hi;

    lo = _mm_packus_epi16 (pack565_2x128_128 ( xmm0, xmm1 ), _mm_setzero_si128 ());
    hi = _mm_packus_epi16 (_mm_setzero_si128 (), pack565_2x128_128 ( xmm2, xmm3 ));

    return _mm_or_si128 (lo, hi);
}

static inline uint32_t
packAlpha (__m128i x)
{
    return _mm_cvtsi128_si32 (_mm_packus_epi16 (_mm_packus_epi16 (_mm_srli_epi32 (x, 24),
                                                                  _mm_setzero_si128 ()),
                                                _mm_setzero_si128 ()));
}

static inline __m128i
expandPixel_32_1x128 (uint32_t data)
{
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE(1, 0, 1, 0));
}
static inline __m128i
expandAlpha_1x128 (__m128i data)
{
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
}

static inline void
expandAlpha_2x128 (__m128i dataLo, __m128i dataHi, __m128i* alphaLo, __m128i* alphaHi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(3, 3, 3, 3));
    hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(3, 3, 3, 3));
    *alphaLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(3, 3, 3, 3));
    *alphaHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(3, 3, 3, 3));
}

static inline void
expandAlphaRev_2x128 (__m128i dataLo, __m128i dataHi, __m128i* alphaLo, __m128i* alphaHi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(0, 0, 0, 0));
    hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(0, 0, 0, 0));
    *alphaLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(0, 0, 0, 0));
    *alphaHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(0, 0, 0, 0));
}
static inline void
pixMultiply_2x128 (__m128i dataLo, __m128i dataHi, __m128i alphaLo, __m128i alphaHi, __m128i* retLo, __m128i* retHi)
{
    __m128i lo, hi;

    lo = _mm_mullo_epi16 (dataLo, alphaLo);
    hi = _mm_mullo_epi16 (dataHi, alphaHi);
    lo = _mm_adds_epu16 (lo, Mask0080);
    hi = _mm_adds_epu16 (hi, Mask0080);
    *retLo = _mm_mulhi_epu16 (lo, Mask0101);
    *retHi = _mm_mulhi_epu16 (hi, Mask0101);
}
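
/*
 * Note: pixMultiply_2x128 relies on the classic rounded divide-by-255
 * approximation: for x in [0, 0xff * 0xff], ((x + 0x80) * 0x0101) >> 16
 * yields the rounded value of x / 255. Illustrative example:
 * x = 0xff * 0xff = 0xfe01; 0xfe01 + 0x80 = 0xfe81; and
 * (0xfe81 * 0x0101) >> 16 = 0xff, as expected for 255 * 255 / 255.
 * The _mm_mulhi_epu16 with Mask0101 performs the "* 0x0101 >> 16" step.
 */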
static inline void
pixAddMultiply_2x128 (__m128i srcLo, __m128i srcHi, __m128i alphaDstLo, __m128i alphaDstHi,
                      __m128i dstLo, __m128i dstHi, __m128i alphaSrcLo, __m128i alphaSrcHi,
                      __m128i* retLo, __m128i* retHi)
{
    __m128i lo, hi;
    __m128i mulLo, mulHi;

    lo = _mm_mullo_epi16 (srcLo, alphaDstLo);
    hi = _mm_mullo_epi16 (srcHi, alphaDstHi);
    mulLo = _mm_mullo_epi16 (dstLo, alphaSrcLo);
    mulHi = _mm_mullo_epi16 (dstHi, alphaSrcHi);
    lo = _mm_adds_epu16 (lo, Mask0080);
    hi = _mm_adds_epu16 (hi, Mask0080);
    lo = _mm_adds_epu16 (lo, mulLo);
    hi = _mm_adds_epu16 (hi, mulHi);
    *retLo = _mm_mulhi_epu16 (lo, Mask0101);
    *retHi = _mm_mulhi_epu16 (hi, Mask0101);
}
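
/*
 * pixAddMultiply computes (src * alphaDst + dst * alphaSrc) / 255 per
 * channel with a single rounding step, using the same 0x80/0x0101 trick
 * as pixMultiply. The saturating adds (_mm_adds_epu16) clamp the sum so
 * an overflowing intermediate cannot wrap around.
 */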
static inline void
negate_2x128 (__m128i dataLo, __m128i dataHi, __m128i* negLo, __m128i* negHi)
{
    *negLo = _mm_xor_si128 (dataLo, Mask00ff);
    *negHi = _mm_xor_si128 (dataHi, Mask00ff);
}

static inline void
invertColors_2x128 (__m128i dataLo, __m128i dataHi, __m128i* invLo, __m128i* invHi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(3, 0, 1, 2));
    hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(3, 0, 1, 2));
    *invLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(3, 0, 1, 2));
    *invHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(3, 0, 1, 2));
}
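
/*
 * The over_2x128 helper below implements the Porter-Duff OVER operator on
 * two registers of unpacked pixels:
 *
 *     dst = src + dst * (255 - srcAlpha) / 255
 *
 * where the caller passes the (already expanded) source alpha; negate_2x128
 * supplies the (255 - srcAlpha) term.
 */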
static inline void
over_2x128 (__m128i srcLo, __m128i srcHi, __m128i alphaLo, __m128i alphaHi, __m128i* dstLo, __m128i* dstHi)
{
    negate_2x128 (alphaLo, alphaHi, &alphaLo, &alphaHi);

    pixMultiply_2x128 (*dstLo, *dstHi, alphaLo, alphaHi, dstLo, dstHi);

    *dstLo = _mm_adds_epu8 (srcLo, *dstLo);
    *dstHi = _mm_adds_epu8 (srcHi, *dstHi);
}

static inline void
overRevNonPre_2x128 (__m128i srcLo, __m128i srcHi, __m128i* dstLo, __m128i* dstHi)
{
    __m128i lo, hi;
    __m128i alphaLo, alphaHi;

    expandAlpha_2x128 (srcLo, srcHi, &alphaLo, &alphaHi);

    lo = _mm_or_si128 (alphaLo, MaskAlpha);
    hi = _mm_or_si128 (alphaHi, MaskAlpha);

    invertColors_2x128 (srcLo, srcHi, &srcLo, &srcHi);

    pixMultiply_2x128 (srcLo, srcHi, lo, hi, &lo, &hi);

    over_2x128 (lo, hi, alphaLo, alphaHi, dstLo, dstHi);
}

static inline void
inOver_2x128 (__m128i srcLo, __m128i srcHi, __m128i alphaLo, __m128i alphaHi,
              __m128i maskLo, __m128i maskHi, __m128i* dstLo, __m128i* dstHi)
{
    __m128i sLo, sHi;
    __m128i aLo, aHi;

    pixMultiply_2x128 (  srcLo,   srcHi, maskLo, maskHi, &sLo, &sHi);
    pixMultiply_2x128 (alphaLo, alphaHi, maskLo, maskHi, &aLo, &aHi);

    over_2x128 (sLo, sHi, aLo, aHi, dstLo, dstHi);
}
static inline void
cachePrefetch (__m128i* addr)
{
    _mm_prefetch (addr, _MM_HINT_T0);
}

static inline void
cachePrefetchNext (__m128i* addr)
{
    _mm_prefetch (addr + 4, _MM_HINT_T0); /* 64 bytes ahead */
}

/* load 4 pixels from a 16-byte boundary aligned address */
static inline __m128i
load128Aligned (__m128i* src)
{
    return _mm_load_si128 (src);
}

/* load 4 pixels from an unaligned address */
static inline __m128i
load128Unaligned (__m128i* src)
{
    return _mm_loadu_si128 (src);
}

/* save 4 pixels using Write Combining memory on a 16-byte boundary aligned address */
static inline void
save128WriteCombining (__m128i* dst, __m128i data)
{
    _mm_stream_si128 (dst, data);
}
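
/*
 * Note: _mm_stream_si128 is a non-temporal store. It writes through the
 * write-combining buffers and bypasses the cache, which helps when the
 * destination is written once and not read back soon afterwards.
 */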
/* save 4 pixels on a 16-byte boundary aligned address */
static inline void
save128Aligned (__m128i* dst, __m128i data)
{
    _mm_store_si128 (dst, data);
}

/* save 4 pixels on an unaligned address */
static inline void
save128Unaligned (__m128i* dst, __m128i data)
{
    _mm_storeu_si128 (dst, data);
}
/* -------------------------------------------------------------------------------------------------
 * MMX inlines
 */

static inline __m64
unpack_32_1x64 (uint32_t data)
{
    return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64 ());
}

static inline __m64
expandAlpha_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE(3, 3, 3, 3));
}

static inline __m64
expandAlphaRev_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE(0, 0, 0, 0));
}

static inline __m64
expandPixel_8_1x64 (uint8_t data)
{
    return _mm_shuffle_pi16 (unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE(0, 0, 0, 0));
}

static inline __m64
pixMultiply_1x64 (__m64 data, __m64 alpha)
{
    return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
                                          xMask0080),
                           xMask0101);
}

static inline __m64
pixAddMultiply_1x64 (__m64 src, __m64 alphaDst, __m64 dst, __m64 alphaSrc)
{
    return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (src, alphaDst),
                                                         xMask0080),
                                          _mm_mullo_pi16 (dst, alphaSrc)),
                           xMask0101);
}

static inline __m64
negate_1x64 (__m64 data)
{
    return _mm_xor_si64 (data, xMask00ff);
}

static inline __m64
invertColors_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE(3, 0, 1, 2));
}

static inline __m64
over_1x64 (__m64 src, __m64 alpha, __m64 dst)
{
    return _mm_adds_pu8 (src, pixMultiply_1x64 (dst, negate_1x64 (alpha)));
}

static inline __m64
inOver_1x64 (__m64 src, __m64 alpha, __m64 mask, __m64 dst)
{
    return over_1x64 (pixMultiply_1x64 (src, mask),
                      pixMultiply_1x64 (alpha, mask),
                      dst);
}

static inline __m64
overRevNonPre_1x64 (__m64 src, __m64 dst)
{
    __m64 alpha = expandAlpha_1x64 (src);

    return over_1x64 (pixMultiply_1x64 (invertColors_1x64 (src),
                                        _mm_or_si64 (alpha, xMaskAlpha)),
                      alpha,
                      dst);
}

static inline uint32_t
pack_1x64_32 (__m64 data)
{
    return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
}
/* Expand 16 bits positioned at @pos (0-3) of an mmx register into 00RR00GG00BB.
 *
 * --- Expanding 565 in the low word ---
 *
 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
 * m = m & (01f0003f001f);
 * m = m * (008404100840);
 * m = m >> 8;
 *
 * Note the trick here - the top word is shifted by another nibble to
 * avoid it bumping into the middle word
 */
static inline __m64
expand565_16_1x64 (uint16_t pixel)
{
    __m64 p;
    __m64 t1, t2;

    p = _mm_cvtsi32_si64 ((uint32_t) pixel);

    t1 = _mm_slli_si64 (p, 36 - 11);
    t2 = _mm_slli_si64 (p, 16 - 5);

    p = _mm_or_si64 (t1, p);
    p = _mm_or_si64 (t2, p);
    p = _mm_and_si64 (p, xMask565rgb);
    p = _mm_mullo_pi16 (p, xMask565Unpack);

    return _mm_srli_pi16 (p, 8);
}
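
/*
 * Worked example (illustrative): for pure red, pixel = 0xf800, the 5-bit
 * red field 0x1f lands in the 16-bit red word after the shifts and the
 * mask; multiplying that word by 0x0084 and shifting right by 8
 * replicates the top bits into the low ones, so 0x1f expands to 0xff,
 * i.e. full-intensity red in 8 bits.
 */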
/* -------------------------------------------------------------------------------------------------
 * Compose Core transformations
 */
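
/*
 * Naming convention below: the *U ("unified") combiners take a single
 * alpha value per pixel, while the *C ("component") combiners take a
 * separate mask value per color channel.
 */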
static inline uint32_t
coreCombineOverUPixelsse2 (uint32_t src, uint32_t dst)
{
    __m64 ms;

    ms = unpack_32_1x64 (src);
    return pack_1x64_32 (over_1x64 (ms, expandAlpha_1x64 (ms), unpack_32_1x64 (dst)));
}
static inline void
coreCombineOverUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    uint32_t s, d;
    uint32_t pa;

    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmAlphaLo, xmmAlphaHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    /* Align dst on a 16-byte boundary */
    while (w &&
           ((unsigned long)pd & 15))
    {
        d = *pd;
        s = *ps++;

        *pd++ = coreCombineOverUPixelsse2 (s, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        /* I'm loading unaligned because I'm not sure about the address alignment. */
        xmmSrcHi = load128Unaligned ((__m128i*) ps);

        /* Check the alpha channel */
        pa = packAlpha (xmmSrcHi);

        if (pa == 0xffffffff)
        {
            save128Aligned ((__m128i*)pd, xmmSrcHi);
        }
        else if (pa)
        {
            xmmDstHi = load128Aligned ((__m128i*) pd);

            unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
            unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

            expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);

            over_2x128 (xmmSrcLo, xmmSrcHi, xmmAlphaLo, xmmAlphaHi, &xmmDstLo, &xmmDstHi);

            /* rebuild the 4 pixel data and save */
            save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
        }

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        d = *pd;
        s = *ps++;

        *pd++ = coreCombineOverUPixelsse2 (s, d);
        w--;
    }
}
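
/*
 * All of the combiners in this file share the three-phase loop shape seen
 * above: a scalar head that runs until pd reaches a 16-byte boundary, a
 * vector body that processes 4 pixels per iteration with aligned
 * destination accesses, and a scalar tail for the remaining (w < 4)
 * pixels.
 */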
static inline void
coreCombineOverReverseUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    uint32_t s, d;

    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmAlphaLo, xmmAlphaHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    /* Align dst on a 16-byte boundary */
    while (w &&
           ((unsigned long)pd & 15))
    {
        d = *pd;
        s = *ps++;

        *pd++ = coreCombineOverUPixelsse2 (d, s);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        /* I'm loading unaligned because I'm not sure about the address alignment. */
        xmmSrcHi = load128Unaligned ((__m128i*) ps);
        xmmDstHi = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);

        over_2x128 (xmmDstLo, xmmDstHi, xmmAlphaLo, xmmAlphaHi, &xmmSrcLo, &xmmSrcHi);

        /* rebuild the 4 pixel data and save */
        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmSrcLo, xmmSrcHi));

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        d = *pd;
        s = *ps++;

        *pd++ = coreCombineOverUPixelsse2 (d, s);
        w--;
    }
}
static inline uint32_t
coreCombineInUPixelsse2 (uint32_t src, uint32_t dst)
{
    uint32_t maska = src >> 24;

    if (maska == 0)
        return 0;
    else if (maska != 0xff)
        return pack_1x64_32(pixMultiply_1x64 (unpack_32_1x64 (dst), expandAlpha_1x64 (unpack_32_1x64 (src))));

    return dst;
}
static inline void
coreCombineInUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    uint32_t s, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && ((unsigned long) pd & 15))
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineInUPixelsse2 (d, s);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmDstHi = load128Aligned ((__m128i*) pd);
        xmmSrcHi = load128Unaligned ((__m128i*) ps);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineInUPixelsse2 (d, s);
        w--;
    }
}
static inline void
coreCombineReverseInUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    uint32_t s, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && ((unsigned long) pd & 15))
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineInUPixelsse2 (s, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmDstHi = load128Aligned ((__m128i*) pd);
        xmmSrcHi = load128Unaligned ((__m128i*) ps);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        pixMultiply_2x128 (xmmDstLo, xmmDstHi, xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineInUPixelsse2 (s, d);
        w--;
    }
}
static inline void
coreCombineReverseOutUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    uint32_t s, d;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && ((unsigned long) pd & 15))
    {
        s = *ps++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (s)))));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        __m128i xmmSrcLo, xmmSrcHi;
        __m128i xmmDstLo, xmmDstHi;

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmSrcHi = load128Unaligned ((__m128i*) ps);
        xmmDstHi = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        negate_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);

        pixMultiply_2x128 (xmmDstLo, xmmDstHi, xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (s)))));
        w--;
    }
}
static inline void
coreCombineOutUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    uint32_t s, d;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && ((unsigned long) pd & 15))
    {
        s = *ps++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        __m128i xmmSrcLo, xmmSrcHi;
        __m128i xmmDstLo, xmmDstHi;

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmSrcHi = load128Unaligned ((__m128i*) ps);
        xmmDstHi = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);
        negate_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);

        pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
        w--;
    }
}
static inline uint32_t
coreCombineAtopUPixelsse2 (uint32_t src, uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 sa = negate_1x64 (expandAlpha_1x64 (s));
    __m64 da = expandAlpha_1x64 (d);

    return pack_1x64_32 (pixAddMultiply_1x64 (s, da, d, sa));
}
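
/*
 * ATOP: result = (src * dstAlpha + dst * (255 - srcAlpha)) / 255, which
 * is exactly the shape pixAddMultiply was built for; the reverse and xor
 * variants below differ only in which of the two alphas are negated.
 */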
static inline void
coreCombineAtopUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    uint32_t s, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && ((unsigned long) pd & 15))
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineAtopUPixelsse2 (s, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmSrcHi = load128Unaligned ((__m128i*) ps);
        xmmDstHi = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        negate_2x128 (xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);

        pixAddMultiply_2x128 ( xmmSrcLo, xmmSrcHi, xmmAlphaDstLo, xmmAlphaDstHi,
                               xmmDstLo, xmmDstHi, xmmAlphaSrcLo, xmmAlphaSrcHi,
                               &xmmDstLo, &xmmDstHi );

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineAtopUPixelsse2 (s, d);
        w--;
    }
}
static inline uint32_t
coreCombineReverseAtopUPixelsse2 (uint32_t src, uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 sa = expandAlpha_1x64 (s);
    __m64 da = negate_1x64 (expandAlpha_1x64 (d));

    return pack_1x64_32 (pixAddMultiply_1x64 (s, da, d, sa));
}
static inline void
coreCombineReverseAtopUsse2 (uint32_t* pd, const uint32_t* ps, int w)
{
    uint32_t s, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && ((unsigned long) pd & 15))
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineReverseAtopUPixelsse2 (s, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmSrcHi = load128Unaligned ((__m128i*) ps);
        xmmDstHi = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixAddMultiply_2x128 ( xmmSrcLo, xmmSrcHi, xmmAlphaDstLo, xmmAlphaDstHi,
                               xmmDstLo, xmmDstHi, xmmAlphaSrcLo, xmmAlphaSrcHi,
                               &xmmDstLo, &xmmDstHi );

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineReverseAtopUPixelsse2 (s, d);
        w--;
    }
}
static inline uint32_t
coreCombineXorUPixelsse2 (uint32_t src, uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    return pack_1x64_32 (pixAddMultiply_1x64 (s, negate_1x64 (expandAlpha_1x64 (d)), d, negate_1x64 (expandAlpha_1x64 (s))));
}
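
/*
 * XOR: result = (src * (255 - dstAlpha) + dst * (255 - srcAlpha)) / 255;
 * both alphas are negated, so only the regions where exactly one of the
 * two images covers the pixel survive.
 */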
static inline void
coreCombineXorUsse2 (uint32_t* dst, const uint32_t* src, int width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;

    __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
    __m128i xmmDst, xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && ((unsigned long) pd & 15))
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineXorUPixelsse2 (s, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmSrc = load128Unaligned ((__m128i*) ps);
        xmmDst = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        negate_2x128 (xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixAddMultiply_2x128 ( xmmSrcLo, xmmSrcHi, xmmAlphaDstLo, xmmAlphaDstHi,
                               xmmDstLo, xmmDstHi, xmmAlphaSrcLo, xmmAlphaSrcHi,
                               &xmmDstLo, &xmmDstHi );

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;

        *pd++ = coreCombineXorUPixelsse2 (s, d);
        w--;
    }
}
static inline void
coreCombineAddUsse2 (uint32_t* dst, const uint32_t* src, int width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        d = *pd;
        *pd++ = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        save128Aligned( (__m128i*)pd,
                        _mm_adds_epu8( load128Unaligned((__m128i*)ps),
                                       load128Aligned ((__m128i*)pd)) );

        pd += 4;
        ps += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;
        *pd++ = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
        w--;
    }
}
static inline uint32_t
coreCombineSaturateUPixelsse2 (uint32_t src, uint32_t dst)
{
    __m64 ms = unpack_32_1x64 (src);
    __m64 md = unpack_32_1x64 (dst);
    uint32_t sa = src >> 24;
    uint32_t da = ~dst >> 24;

    if (sa > da)
    {
        ms = pixMultiply_1x64 (ms, expandAlpha_1x64 (unpack_32_1x64 (FbIntDiv(da, sa) << 24)));
    }

    return pack_1x64_32 (_mm_adds_pu16 (md, ms));
}
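
/*
 * SATURATE adds src to dst but first scales src so that the resulting
 * alpha cannot exceed 255: when srcAlpha > 255 - dstAlpha, src is
 * multiplied by the ratio (255 - dstAlpha) / srcAlpha (FbIntDiv computes
 * the 8-bit quotient).
 */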
static inline void
coreCombineSaturateUsse2 (uint32_t *pd, const uint32_t *ps, int w)
{
    uint32_t s, d;
    uint32_t packCmp;

    __m128i xmmSrc, xmmDst;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        d = *pd;
        *pd++ = coreCombineSaturateUPixelsse2 (s, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);

        xmmDst = load128Aligned ((__m128i*)pd);
        xmmSrc = load128Unaligned((__m128i*)ps);

        packCmp = _mm_movemask_epi8 (_mm_cmpgt_epi32 (_mm_srli_epi32 (xmmSrc, 24),
                                                      _mm_srli_epi32 (_mm_xor_si128 (xmmDst, Maskff000000), 24)));

        /* if some src alpha is greater than the respective ~dst alpha */
        if (packCmp)
        {
            s = *ps++;
            d = *pd;
            *pd++ = coreCombineSaturateUPixelsse2 (s, d);

            s = *ps++;
            d = *pd;
            *pd++ = coreCombineSaturateUPixelsse2 (s, d);

            s = *ps++;
            d = *pd;
            *pd++ = coreCombineSaturateUPixelsse2 (s, d);

            s = *ps++;
            d = *pd;
            *pd++ = coreCombineSaturateUPixelsse2 (s, d);
        }
        else
        {
            save128Aligned ((__m128i*)pd, _mm_adds_epu8 (xmmDst, xmmSrc));

            pd += 4;
            ps += 4;
        }

        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        d = *pd;
        *pd++ = coreCombineSaturateUPixelsse2 (s, d);
        w--;
    }
}
static inline void
coreCombineSrcCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
{
    uint32_t s, m;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmMaskLo, xmmMaskHi;
    __m128i xmmDstLo, xmmDstHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmMaskLo, xmmMaskHi, &xmmDstLo, &xmmDstHi);

        save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
        w--;
    }
}
static inline uint32_t
coreCombineOverCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);

    return pack_1x64_32 (inOver_1x64 (s, expandAlpha_1x64 (s), unpack_32_1x64 (mask), unpack_32_1x64 (dst)));
}

static inline void
coreCombineOverCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineOverCPixelsse2 (s, m, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);

        inOver_2x128 (xmmSrcLo, xmmSrcHi, xmmAlphaLo, xmmAlphaHi, xmmMaskLo, xmmMaskHi, &xmmDstLo, &xmmDstHi);

        save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineOverCPixelsse2 (s, m, d);
        w--;
    }
}
static inline uint32_t
coreCombineOverReverseCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
{
    __m64 d = unpack_32_1x64 (dst);

    return pack_1x64_32(over_1x64 (d, expandAlpha_1x64 (d), pixMultiply_1x64 (unpack_32_1x64 (src), unpack_32_1x64 (mask))));
}

static inline void
coreCombineOverReverseCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineOverReverseCPixelsse2 (s, m, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
        pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        over_2x128 (xmmDstLo, xmmDstHi, xmmAlphaLo, xmmAlphaHi, &xmmMaskLo, &xmmMaskHi);

        save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmMaskLo, xmmMaskHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineOverReverseCPixelsse2 (s, m, d);
        w--;
    }
}
static inline void
coreCombineInCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
                                                expandAlpha_1x64 (unpack_32_1x64 (d))));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
        pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmMaskLo, xmmMaskHi, &xmmDstLo, &xmmDstHi);

        pixMultiply_2x128 (xmmDstLo, xmmDstHi, xmmAlphaLo, xmmAlphaHi, &xmmDstLo, &xmmDstHi);

        save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
                                                expandAlpha_1x64 (unpack_32_1x64 (d))));
        w--;
    }
}
static inline void
coreCombineInReverseCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
                                                pixMultiply_1x64 (unpack_32_1x64 (m),
                                                                  expandAlpha_1x64 (unpack_32_1x64 (s)))));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
        pixMultiply_2x128 (xmmMaskLo, xmmMaskHi, xmmAlphaLo, xmmAlphaHi, &xmmAlphaLo, &xmmAlphaHi);

        pixMultiply_2x128 (xmmDstLo, xmmDstHi, xmmAlphaLo, xmmAlphaHi, &xmmDstLo, &xmmDstHi);

        save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
                                                pixMultiply_1x64 (unpack_32_1x64 (m),
                                                                  expandAlpha_1x64 (unpack_32_1x64 (s)))));
        w--;
    }
}
static inline void
coreCombineOutCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
                                                negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
        negate_2x128 (xmmAlphaLo, xmmAlphaHi, &xmmAlphaLo, &xmmAlphaHi);

        pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmMaskLo, xmmMaskHi, &xmmDstLo, &xmmDstHi);
        pixMultiply_2x128 (xmmDstLo, xmmDstHi, xmmAlphaLo, xmmAlphaHi, &xmmDstLo, &xmmDstHi);

        save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
                                                negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
        w--;
    }
}
static inline void
coreCombineOutReverseCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
                                                negate_1x64 (pixMultiply_1x64 (unpack_32_1x64 (m),
                                                                               expandAlpha_1x64 (unpack_32_1x64 (s))))));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);

        pixMultiply_2x128 (xmmMaskLo, xmmMaskHi, xmmAlphaLo, xmmAlphaHi, &xmmMaskLo, &xmmMaskHi);

        negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        pixMultiply_2x128 (xmmDstLo, xmmDstHi, xmmMaskLo, xmmMaskHi, &xmmDstLo, &xmmDstHi);

        save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
                                                negate_1x64 (pixMultiply_1x64 (unpack_32_1x64 (m),
                                                                               expandAlpha_1x64 (unpack_32_1x64 (s))))));
        w--;
    }
}
static inline uint32_t
coreCombineAtopCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
{
    __m64 m = unpack_32_1x64 (mask);
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);
    __m64 sa = expandAlpha_1x64 (s);
    __m64 da = expandAlpha_1x64 (d);

    s = pixMultiply_1x64 (s, m);
    m = negate_1x64 (pixMultiply_1x64 (m, sa));

    return pack_1x64_32 (pixAddMultiply_1x64 (d, m, s, da));
}

static inline void
coreCombineAtopCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineAtopCPixelsse2 (s, m, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmMaskLo, xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
        pixMultiply_2x128 (xmmMaskLo, xmmMaskHi, xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi);

        negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        pixAddMultiply_2x128 (xmmDstLo, xmmDstHi, xmmMaskLo, xmmMaskHi,
                              xmmSrcLo, xmmSrcHi, xmmAlphaDstLo, xmmAlphaDstHi,
                              &xmmDstLo, &xmmDstHi);

        save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineAtopCPixelsse2 (s, m, d);
        w--;
    }
}
static inline uint32_t
coreCombineReverseAtopCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
{
    __m64 m = unpack_32_1x64 (mask);
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 da = negate_1x64 (expandAlpha_1x64 (d));
    __m64 sa = expandAlpha_1x64 (s);

    s = pixMultiply_1x64 (s, m);
    m = pixMultiply_1x64 (m, sa);

    return pack_1x64_32 (pixAddMultiply_1x64 (d, m, s, da));
}

static inline void
coreCombineReverseAtopCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineReverseAtopCPixelsse2 (s, m, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmMaskLo, xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
        pixMultiply_2x128 (xmmMaskLo, xmmMaskHi, xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi);

        negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixAddMultiply_2x128 (xmmDstLo, xmmDstHi, xmmMaskLo, xmmMaskHi,
                              xmmSrcLo, xmmSrcHi, xmmAlphaDstLo, xmmAlphaDstHi,
                              &xmmDstLo, &xmmDstHi);

        save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineReverseAtopCPixelsse2 (s, m, d);
        w--;
    }
}
static inline uint32_t
coreCombineXorCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
{
    __m64 a = unpack_32_1x64 (mask);
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    return pack_1x64_32 (pixAddMultiply_1x64 (d,
                                              negate_1x64 (pixMultiply_1x64 (a, expandAlpha_1x64 (s))),
                                              pixMultiply_1x64 (s, a),
                                              negate_1x64 (expandAlpha_1x64 (d))));
}

static inline void
coreCombineXorCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineXorCPixelsse2 (s, m, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmMaskLo, xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
        pixMultiply_2x128 (xmmMaskLo, xmmMaskHi, xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi);

        negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
        negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        pixAddMultiply_2x128 (xmmDstLo, xmmDstHi, xmmMaskLo, xmmMaskHi,
                              xmmSrcLo, xmmSrcHi, xmmAlphaDstLo, xmmAlphaDstHi,
                              &xmmDstLo, &xmmDstHi);

        save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineXorCPixelsse2 (s, m, d);
        w--;
    }
}
static inline void
coreCombineAddCsse2 (uint32_t *pd, uint32_t *ps, uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (_mm_adds_pu8 (pixMultiply_1x64 (unpack_32_1x64 (s),
                                                              unpack_32_1x64 (m)),
                                            unpack_32_1x64 (d)));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);
        xmmDstHi = load128Aligned ((__m128i*)pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmMaskLo, xmmMaskHi, &xmmSrcLo, &xmmSrcHi);

        save128Aligned( (__m128i*)pd, pack_2x128_128 (_mm_adds_epu8 (xmmSrcLo, xmmDstLo),
                                                      _mm_adds_epu8 (xmmSrcHi, xmmDstHi)));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (_mm_adds_pu8 (pixMultiply_1x64 (unpack_32_1x64 (s),
                                                              unpack_32_1x64 (m)),
                                            unpack_32_1x64 (d)));
        w--;
    }
}
/* ------------------------------------------------------------------------------------------------- */

static inline __m64
createMask_16_64 (uint16_t mask)
{
    return _mm_set1_pi16 (mask);
}

static inline __m128i
createMask_16_128 (uint16_t mask)
{
    return _mm_set1_epi16 (mask);
}

static inline __m64
createMask_2x32_64 (uint32_t mask0, uint32_t mask1)
{
    return _mm_set_pi32 (mask0, mask1);
}

static inline __m128i
createMask_2x32_128 (uint32_t mask0, uint32_t mask1)
{
    return _mm_set_epi32 (mask0, mask1, mask0, mask1);
}
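
/*
 * For example, createMask_16_128 (0x00ff) yields the eight-lane constant
 * used as Mask00ff, and createMask_2x32_128 (0xff000000, 0xff000000)
 * fills all four 32-bit lanes with 0xff000000 (Maskff000000); both are
 * built once in fbComposeSetupSSE() below.
 */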
/* SSE2 code patch for fbcompose.c */

static FASTCALL void
sse2CombineMaskU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineReverseInUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineOverU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineOverUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineOverReverseU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineOverReverseUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineInU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineInUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineInReverseU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineReverseInUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineOutU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineOutUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineOutReverseU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineReverseOutUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineAtopU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineAtopUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineAtopReverseU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineReverseAtopUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineXorU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineXorUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineAddU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineAddUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineSaturateU (uint32_t *dst, const uint32_t *src, int width)
{
    coreCombineSaturateUsse2 (dst, src, width);
}

static FASTCALL void
sse2CombineSrcC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineSrcCsse2 (dst, src, mask, width);
}

static FASTCALL void
sse2CombineOverC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineOverCsse2 (dst, src, mask, width);
}

static FASTCALL void
sse2CombineOverReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineOverReverseCsse2 (dst, src, mask, width);
}

static FASTCALL void
sse2CombineInC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineInCsse2 (dst, src, mask, width);
}

static FASTCALL void
sse2CombineInReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineInReverseCsse2 (dst, src, mask, width);
}

static FASTCALL void
sse2CombineOutC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineOutCsse2 (dst, src, mask, width);
}

static FASTCALL void
sse2CombineOutReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineOutReverseCsse2 (dst, src, mask, width);
}

static FASTCALL void
sse2CombineAtopC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineAtopCsse2 (dst, src, mask, width);
}

static FASTCALL void
sse2CombineAtopReverseC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineReverseAtopCsse2 (dst, src, mask, width);
}

static FASTCALL void
sse2CombineXorC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineXorCsse2 (dst, src, mask, width);
}

static FASTCALL void
sse2CombineAddC (uint32_t *dst, uint32_t *src, uint32_t *mask, int width)
{
    coreCombineAddCsse2 (dst, src, mask, width);
}
void
fbComposeSetupSSE(void)
{
    static pixman_bool_t initialized = FALSE;

    if (initialized)
        return;

    /* check if we have SSE2 support and initialize accordingly */
    if (pixman_have_sse())
    {
        /* SSE2 constants */
        Mask565r  = createMask_2x32_128 (0x00f80000, 0x00f80000);
        Mask565g1 = createMask_2x32_128 (0x00070000, 0x00070000);
        Mask565g2 = createMask_2x32_128 (0x000000e0, 0x000000e0);
        Mask565b  = createMask_2x32_128 (0x0000001f, 0x0000001f);
        MaskRed   = createMask_2x32_128 (0x00f80000, 0x00f80000);
        MaskGreen = createMask_2x32_128 (0x0000fc00, 0x0000fc00);
        MaskBlue  = createMask_2x32_128 (0x000000f8, 0x000000f8);

        Mask0080 = createMask_16_128 (0x0080);
        Mask00ff = createMask_16_128 (0x00ff);
        Mask0101 = createMask_16_128 (0x0101);
        Maskffff = createMask_16_128 (0xffff);
        Maskff000000 = createMask_2x32_128 (0xff000000, 0xff000000);
        MaskAlpha = createMask_2x32_128 (0x00ff0000, 0x00000000);

        /* MMX constants */
        xMask565rgb = createMask_2x32_64 (0x000001f0, 0x003f001f);
        xMask565Unpack = createMask_2x32_64 (0x00000084, 0x04100840);

        xMask0080 = createMask_16_64 (0x0080);
        xMask00ff = createMask_16_64 (0x00ff);
        xMask0101 = createMask_16_64 (0x0101);
        xMaskAlpha = createMask_2x32_64 (0x00ff0000, 0x00000000);

        /* SSE code patch for fbcompose.c */
        pixman_composeFunctions.combineU[PIXMAN_OP_OVER] = sse2CombineOverU;
        pixman_composeFunctions.combineU[PIXMAN_OP_OVER_REVERSE] = sse2CombineOverReverseU;
        pixman_composeFunctions.combineU[PIXMAN_OP_IN] = sse2CombineInU;
        pixman_composeFunctions.combineU[PIXMAN_OP_IN_REVERSE] = sse2CombineInReverseU;
        pixman_composeFunctions.combineU[PIXMAN_OP_OUT] = sse2CombineOutU;
        pixman_composeFunctions.combineU[PIXMAN_OP_OUT_REVERSE] = sse2CombineOutReverseU;
        pixman_composeFunctions.combineU[PIXMAN_OP_ATOP] = sse2CombineAtopU;
        pixman_composeFunctions.combineU[PIXMAN_OP_ATOP_REVERSE] = sse2CombineAtopReverseU;
        pixman_composeFunctions.combineU[PIXMAN_OP_XOR] = sse2CombineXorU;
        pixman_composeFunctions.combineU[PIXMAN_OP_ADD] = sse2CombineAddU;
        pixman_composeFunctions.combineU[PIXMAN_OP_SATURATE] = sse2CombineSaturateU;

        pixman_composeFunctions.combineC[PIXMAN_OP_SRC] = sse2CombineSrcC;
        pixman_composeFunctions.combineC[PIXMAN_OP_OVER] = sse2CombineOverC;
        pixman_composeFunctions.combineC[PIXMAN_OP_OVER_REVERSE] = sse2CombineOverReverseC;
        pixman_composeFunctions.combineC[PIXMAN_OP_IN] = sse2CombineInC;
        pixman_composeFunctions.combineC[PIXMAN_OP_IN_REVERSE] = sse2CombineInReverseC;
        pixman_composeFunctions.combineC[PIXMAN_OP_OUT] = sse2CombineOutC;
        pixman_composeFunctions.combineC[PIXMAN_OP_OUT_REVERSE] = sse2CombineOutReverseC;
        pixman_composeFunctions.combineC[PIXMAN_OP_ATOP] = sse2CombineAtopC;
        pixman_composeFunctions.combineC[PIXMAN_OP_ATOP_REVERSE] = sse2CombineAtopReverseC;
        pixman_composeFunctions.combineC[PIXMAN_OP_XOR] = sse2CombineXorC;
        pixman_composeFunctions.combineC[PIXMAN_OP_ADD] = sse2CombineAddC;

        pixman_composeFunctions.combineMaskU = sse2CombineMaskU;
    }

    initialized = TRUE;
}
/* -------------------------------------------------------------------------------------------------
 * fbCompositeSolid_nx8888
 */

void
fbCompositeSolid_nx8888sse2 (pixman_op_t op,
                             pixman_image_t * pSrc,
                             pixman_image_t * pMask,
                             pixman_image_t * pDst,
                             int16_t xSrc,
                             int16_t ySrc,
                             int16_t xMask,
                             int16_t yMask,
                             int16_t xDst,
                             int16_t yDst,
                             uint16_t width,
                             uint16_t height)
{
    uint32_t src;
    uint32_t *dstLine, *dst, d;
    uint16_t w;
    int dstStride;

    __m128i xmmSrc, xmmAlpha;
    __m128i xmmDst, xmmDstLo, xmmDstHi;

    fbComposeGetSolid(pSrc, src, pDst->bits.format);

    if (src >> 24 == 0)
        return;

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);

    xmmSrc = expandPixel_32_1x128 (src);
    xmmAlpha = expandAlpha_1x128 (xmmSrc);

    while (height--)
    {
        dst = dstLine;

        /* call prefetch hint to optimize cache load */
        cachePrefetch ((__m128i*)dst);

        dstLine += dstStride;
        w = width;

        while (w && (unsigned long)dst & 15)
        {
            d = *dst;
            *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
                                              _mm_movepi64_pi64 (xmmAlpha),
                                              unpack_32_1x64 (d)));
            w--;
        }

        cachePrefetch ((__m128i*)dst);

        while (w >= 4)
        {
            /* fill cache line with next memory */
            cachePrefetchNext ((__m128i*)dst);

            xmmDst = load128Aligned ((__m128i*)dst);

            unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

            over_2x128 (xmmSrc, xmmSrc, xmmAlpha, xmmAlpha, &xmmDstLo, &xmmDstHi);

            /* rebuild the 4 pixel data and save */
            save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

            dst += 4;
            w -= 4;
        }

        while (w)
        {
            d = *dst;
            *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
                                              _mm_movepi64_pi64 (xmmAlpha),
                                              unpack_32_1x64 (d)));
            w--;
        }
    }
}
2433 /* -------------------------------------------------------------------------------------------------
2434 * fbCompositeSolid_nx0565
2437 fbCompositeSolid_nx0565sse2 (pixman_op_t op,
2438 pixman_image_t * pSrc,
2439 pixman_image_t * pMask,
2440 pixman_image_t * pDst,
2451 uint16_t *dstLine, *dst, d;
2454 __m128i xmmSrc, xmmAlpha;
2455 __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;
2457 fbComposeGetSolid(pSrc, src, pDst->bits.format);
2462 fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
2464 xmmSrc = expandPixel_32_1x128 (src);
2465 xmmAlpha = expandAlpha_1x128 (xmmSrc);
2471 /* call prefetch hint to optimize cache load*/
2472 cachePrefetch ((__m128i*)dst);
2474 dstLine += dstStride;
2477 while (w && (unsigned long)dst & 15)
2480 *dst++ = pack565_32_16 (pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
2481 _mm_movepi64_pi64 (xmmAlpha),
2482 expand565_16_1x64 (d))));
2486 /* call prefetch hint to optimize cache load*/
2487 cachePrefetch ((__m128i*)dst);
2491 /* fill cache line with next memory */
2492 cachePrefetchNext ((__m128i*)dst);
2494 xmmDst = load128Aligned ((__m128i*)dst);
2496 unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
2498 over_2x128 (xmmSrc, xmmSrc, xmmAlpha, xmmAlpha, &xmmDst0, &xmmDst1);
2499 over_2x128 (xmmSrc, xmmSrc, xmmAlpha, xmmAlpha, &xmmDst2, &xmmDst3);
2501 xmmDst = pack565_4x128_128 (xmmDst0, xmmDst1, xmmDst2, xmmDst3);
2503 save128Aligned ((__m128i*)dst, xmmDst);
2512 *dst++ = pack565_32_16 (pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
2513 _mm_movepi64_pi64 (xmmAlpha),
2514 expand565_16_1x64 (d))));
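
/* A scalar sketch of the 565 -> 8888 expansion that expand565_16_1x64
 * performs before the blend. The bit-replication step is an assumption
 * about how such expansions are usually done, and exampleExpand565 is a
 * hypothetical name. */
static inline uint32_t
exampleExpand565 (uint16_t p)
{
    uint32_t r = (p & 0xf800) >> 8;   /* 5 red bits into the top of 8 */
    uint32_t g = (p & 0x07e0) >> 3;   /* 6 green bits */
    uint32_t b = (p & 0x001f) << 3;   /* 5 blue bits */

    /* replicate the top bits into the low bits so 0x1f maps to 0xff */
    r |= r >> 5;
    g |= g >> 6;
    b |= b >> 5;

    return (r << 16) | (g << 8) | b;
}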
2521 /* -------------------------------------------------------------------------------------------------
2522 * fbCompositeSolidMask_nx8888x8888C
2526 fbCompositeSolidMask_nx8888x8888Csse2 (pixman_op_t op,
2527 pixman_image_t * pSrc,
2528 pixman_image_t * pMask,
2529 pixman_image_t * pDst,
2540 uint32_t *dstLine, d;
2541 uint32_t *maskLine, m;
2543 int dstStride, maskStride;
2545 __m128i xmmSrc, xmmAlpha;
2546 __m128i xmmDst, xmmDstLo, xmmDstHi;
2547 __m128i xmmMask, xmmMaskLo, xmmMaskHi;
2549 fbComposeGetSolid(pSrc, src, pDst->bits.format);
2555 fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
2556 fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1);
2558 xmmSrc = _mm_unpacklo_epi8 (createMask_2x32_128 (src, src), _mm_setzero_si128 ());
2559 xmmAlpha = expandAlpha_1x128 (xmmSrc);
2564 uint32_t *pm = (uint32_t *)maskLine;
2565 uint32_t *pd = (uint32_t *)dstLine;
2567 dstLine += dstStride;
2568 maskLine += maskStride;
2570 /* call prefetch hint to optimize cache load*/
2571 cachePrefetch ((__m128i*)pd);
2572 cachePrefetch ((__m128i*)pm);
2574 while (w && (unsigned long)pd & 15)
2582 *pd = pack_1x64_32 (inOver_1x64 (_mm_movepi64_pi64 (xmmSrc),
2583 _mm_movepi64_pi64 (xmmAlpha),
2585 unpack_32_1x64 (d)));
2592 /* call prefetch hint to optimize cache load*/
2593 cachePrefetch ((__m128i*)pd);
2594 cachePrefetch ((__m128i*)pm);
2598 /* fill cache line with next memory */
2599 cachePrefetchNext ((__m128i*)pd);
2600 cachePrefetchNext ((__m128i*)pm);
2602 xmmMask = load128Unaligned ((__m128i*)pm);
2604 packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128()));
2606         /* if all bits of the mask are zero, packCmp is equal to 0xffff */
2607 if (packCmp != 0xffff)
2609 xmmDst = load128Aligned ((__m128i*)pd);
2611 unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
2612 unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
2614 inOver_2x128 (xmmSrc, xmmSrc, xmmAlpha, xmmAlpha, xmmMaskLo, xmmMaskHi, &xmmDstLo, &xmmDstHi);
2616 save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
2632 *pd = pack_1x64_32 (inOver_1x64 (_mm_movepi64_pi64 (xmmSrc),
2633 _mm_movepi64_pi64 (xmmAlpha),
2635 unpack_32_1x64 (d)));
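
/* A sketch of the transparency test used above: _mm_cmpeq_epi32 sets a
 * 32-bit lane to all-ones when the mask pixel is zero, and
 * _mm_movemask_epi8 gathers one bit per byte, so 0xffff means all four
 * mask pixels are zero and the blend/store can be skipped.
 * exampleIsZero4 is a hypothetical name. */
static inline int
exampleIsZero4 (__m128i x)
{
    return _mm_movemask_epi8 (_mm_cmpeq_epi32 (x, _mm_setzero_si128 ())) == 0xffff;
}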
2647 /* -------------------------------------------------------------------------------------------------
2648 * fbCompositeSrc_8888x8x8888
2652 fbCompositeSrc_8888x8x8888sse2 (pixman_op_t op,
2653 pixman_image_t * pSrc,
2654 pixman_image_t * pMask,
2655 pixman_image_t * pDst,
2665 uint32_t *dstLine, *dst;
2666 uint32_t *srcLine, *src;
2669 int dstStride, srcStride;
2672 __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
2673 __m128i xmmDst, xmmDstLo, xmmDstHi;
2674 __m128i xmmAlphaLo, xmmAlphaHi;
2676 fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
2677 fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
2678 fbComposeGetSolid (pMask, mask, pDst->bits.format);
2680 xmmMask = createMask_16_128 (mask >> 24);
2685 dstLine += dstStride;
2687 srcLine += srcStride;
2690 /* call prefetch hint to optimize cache load*/
2691 cachePrefetch ((__m128i*)dst);
2692 cachePrefetch ((__m128i*)src);
2694 while (w && (unsigned long)dst & 15)
2696 uint32_t s = *src++;
2699 __m64 ms = unpack_32_1x64 (s);
2701 *dst++ = pack_1x64_32 (inOver_1x64 (ms,
2702 expandAlpha_1x64 (ms),
2703 _mm_movepi64_pi64 (xmmMask),
2704 unpack_32_1x64 (d)));
2709 /* call prefetch hint to optimize cache load*/
2710 cachePrefetch ((__m128i*)dst);
2711 cachePrefetch ((__m128i*)src);
2715 /* fill cache line with next memory */
2716 cachePrefetchNext ((__m128i*)dst);
2717 cachePrefetchNext ((__m128i*)src);
2719 xmmSrc = load128Unaligned ((__m128i*)src);
2720 xmmDst = load128Aligned ((__m128i*)dst);
2722 unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
2723 unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
2724 expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
2726 inOver_2x128 (xmmSrcLo, xmmSrcHi, xmmAlphaLo, xmmAlphaHi, xmmMask, xmmMask, &xmmDstLo, &xmmDstHi);
2728 save128Aligned( (__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
2737 uint32_t s = *src++;
2740 __m64 ms = unpack_32_1x64 (s);
2742 *dst++ = pack_1x64_32 (inOver_1x64 (ms,
2743 expandAlpha_1x64 (ms),
2744 _mm_movepi64_pi64 (xmmMask),
2745 unpack_32_1x64 (d)));
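
/* A scalar sketch of the per-channel math behind pixMultiply/inOver:
 * multiply two 8-bit values and divide by 255 with the usual add-0x80,
 * add-high-byte rounding trick. exampleMulDiv255 is a hypothetical name;
 * that the helpers above round exactly this way is an assumption. */
static inline uint8_t
exampleMulDiv255 (uint8_t x, uint8_t a)
{
    uint32_t t = (uint32_t) x * a + 0x80;
    return (uint8_t) ((t + (t >> 8)) >> 8);
}
/* e.g. exampleMulDiv255 (0xff, 0x80) == 0x80 and
 *      exampleMulDiv255 (0xff, 0xff) == 0xff */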
2754 /* -------------------------------------------------------------------------------------------------
2755 * fbCompositeSrc_x888xnx8888
2758 fbCompositeSrc_x888xnx8888sse2 (pixman_op_t op,
2759 pixman_image_t * pSrc,
2760 pixman_image_t * pMask,
2761 pixman_image_t * pDst,
2771 uint32_t *dstLine, *dst;
2772 uint32_t *srcLine, *src;
2774 int dstStride, srcStride;
2777 __m128i xmmMask, xmmAlpha;
2778 __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
2779 __m128i xmmDst, xmmDstLo, xmmDstHi;
2781 fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
2782 fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
2783 fbComposeGetSolid (pMask, mask, pDst->bits.format);
2785 xmmMask = createMask_16_128 (mask >> 24);
2786 xmmAlpha = Mask00ff;
2791 dstLine += dstStride;
2793 srcLine += srcStride;
2796 /* call prefetch hint to optimize cache load*/
2797 cachePrefetch ((__m128i*)dst);
2798 cachePrefetch ((__m128i*)src);
2800 while (w && (unsigned long)dst & 15)
2802 uint32_t s = (*src++) | 0xff000000;
2805 *dst++ = pack_1x64_32 (inOver_1x64 (unpack_32_1x64 (s),
2806 _mm_movepi64_pi64 (xmmAlpha),
2807 _mm_movepi64_pi64 (xmmMask),
2808 unpack_32_1x64 (d)));
2813 /* call prefetch hint to optimize cache load*/
2814 cachePrefetch ((__m128i*)dst);
2815 cachePrefetch ((__m128i*)src);
2819 /* fill cache line with next memory */
2820 cachePrefetchNext ((__m128i*)dst);
2821 cachePrefetchNext ((__m128i*)src);
2823 xmmSrc = _mm_or_si128 (load128Unaligned ((__m128i*)src), Maskff000000);
2824 xmmDst = load128Aligned ((__m128i*)dst);
2826 unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
2827 unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
2829 inOver_2x128 (xmmSrcLo, xmmSrcHi, xmmAlpha, xmmAlpha, xmmMask, xmmMask, &xmmDstLo, &xmmDstHi);
2831 save128Aligned( (__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
2841 uint32_t s = (*src++) | 0xff000000;
2844 *dst++ = pack_1x64_32 (inOver_1x64 (unpack_32_1x64 (s),
2845 _mm_movepi64_pi64 (xmmAlpha),
2846 _mm_movepi64_pi64 (xmmMask),
2847 unpack_32_1x64 (d)));
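
/* The x888 source has undefined alpha bits, so they are forced to 0xff
 * before compositing. A vector sketch of the (s | 0xff000000) step used
 * above; exampleSetAlphaFF is a hypothetical name. */
static inline __m128i
exampleSetAlphaFF (__m128i s)
{
    return _mm_or_si128 (s, _mm_set1_epi32 ((int) 0xff000000));
}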
2856 /* -------------------------------------------------------------------------------------------------
2857 * fbCompositeSrc_8888x8888
2860 fbCompositeSrc_8888x8888sse2 (pixman_op_t op,
2861 pixman_image_t * pSrc,
2862 pixman_image_t * pMask,
2863 pixman_image_t * pDst,
2873 int dstStride, srcStride;
2874 uint32_t *dstLine, *dst;
2875 uint32_t *srcLine, *src;
2877 fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
2878 fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
2885 coreCombineOverUsse2 (dst, src, width);
2893 /* -------------------------------------------------------------------------------------------------
2894 * fbCompositeSrc_8888x0565
2896 static inline uint16_t
2897 fbCompositeSrc_8888x0565pixel (uint32_t src, uint16_t dst)
2901 ms = unpack_32_1x64 (src);
2902 return pack565_32_16( pack_1x64_32 (over_1x64 (ms,
2903 expandAlpha_1x64 (ms),
2904 expand565_16_1x64 (dst))));
2908 fbCompositeSrc_8888x0565sse2 (pixman_op_t op,
2909 pixman_image_t * pSrc,
2910 pixman_image_t * pMask,
2911 pixman_image_t * pDst,
2921 uint16_t *dstLine, *dst, d;
2922 uint32_t *srcLine, *src, s;
2923 int dstStride, srcStride;
2926 __m128i xmmAlphaLo, xmmAlphaHi;
2927 __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
2928 __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;
2930 fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
2931 fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
2936      * This code was copied from the MMX version, keeping the FIXME.
2937      * If it's a problem there, it's probably a problem here.
2939 assert (pSrc->pDrawable == pMask->pDrawable);
2947 /* call prefetch hint to optimize cache load*/
2948 cachePrefetch ((__m128i*)src);
2949 cachePrefetch ((__m128i*)dst);
2951 dstLine += dstStride;
2952 srcLine += srcStride;
2955 /* Align dst on a 16-byte boundary */
2957     while (w && ((unsigned long)dst & 15))
2962 *dst++ = fbCompositeSrc_8888x0565pixel (s, d);
2966 /* call prefetch hint to optimize cache load*/
2967 cachePrefetch ((__m128i*)src);
2968 cachePrefetch ((__m128i*)dst);
2970     /* It's an 8-pixel loop */
2973 /* fill cache line with next memory */
2974 cachePrefetchNext ((__m128i*)src);
2975 cachePrefetchNext ((__m128i*)dst);
2977 /* I'm loading unaligned because I'm not sure about the address alignment. */
2978 xmmSrc = load128Unaligned ((__m128i*) src);
2979 xmmDst = load128Aligned ((__m128i*) dst);
2982 unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
2983 unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
2984 expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
2986         /* I'm loading the next 4 pixels from memory early to optimize the memory read. */
2987 xmmSrc = load128Unaligned ((__m128i*) (src+4));
2989 over_2x128 (xmmSrcLo, xmmSrcHi, xmmAlphaLo, xmmAlphaHi, &xmmDst0, &xmmDst1);
2992 unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
2993 expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
2995 over_2x128 (xmmSrcLo, xmmSrcHi, xmmAlphaLo, xmmAlphaHi, &xmmDst2, &xmmDst3);
2997 save128Aligned ((__m128i*)dst, pack565_4x128_128 (xmmDst0, xmmDst1, xmmDst2, xmmDst3));
3009 *dst++ = fbCompositeSrc_8888x0565pixel (s, d);
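
/* A scalar sketch of the premultiplied OVER used by over_1x64/over_2x128:
 * dst = src + dst * (255 - alpha(src)) / 255 per channel, with a
 * saturating add as _mm_adds_epu8 would do. exampleOver is a hypothetical
 * name, and the rounding follows the divide-by-255 sketch shown earlier
 * (an assumption, not a guarantee). */
static inline uint32_t
exampleOver (uint32_t src, uint32_t dst)
{
    uint32_t ia = 255 - (src >> 24);   /* inverse source alpha */
    uint32_t result = 0;
    int shift;

    for (shift = 0; shift < 32; shift += 8)
    {
        uint32_t d = (dst >> shift) & 0xff;
        uint32_t t = d * ia + 0x80;
        uint32_t v = ((src >> shift) & 0xff) + ((t + (t >> 8)) >> 8);

        if (v > 0xff)   /* saturate, as _mm_adds_epu8 does */
            v = 0xff;

        result |= v << shift;
    }
    return result;
}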
3016 /* -------------------------------------------------------------------------------------------------
3017 * fbCompositeSolidMask_nx8x8888
3021 fbCompositeSolidMask_nx8x8888sse2 (pixman_op_t op,
3022 pixman_image_t * pSrc,
3023 pixman_image_t * pMask,
3024 pixman_image_t * pDst,
3035 uint32_t *dstLine, *dst;
3036 uint8_t *maskLine, *mask;
3037 int dstStride, maskStride;
3041 __m128i xmmSrc, xmmAlpha, xmmDef;
3042 __m128i xmmDst, xmmDstLo, xmmDstHi;
3043 __m128i xmmMask, xmmMaskLo, xmmMaskHi;
3045 fbComposeGetSolid(pSrc, src, pDst->bits.format);
3051 fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
3052 fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
3054 xmmDef = createMask_2x32_128 (src, src);
3055 xmmSrc = expandPixel_32_1x128 (src);
3056 xmmAlpha = expandAlpha_1x128 (xmmSrc);
3061 dstLine += dstStride;
3063 maskLine += maskStride;
3066 /* call prefetch hint to optimize cache load*/
3067 cachePrefetch ((__m128i*)mask);
3068 cachePrefetch ((__m128i*)dst);
3070 while (w && (unsigned long)dst & 15)
3072 uint8_t m = *mask++;
3078 *dst = pack_1x64_32 (inOver_1x64 (_mm_movepi64_pi64 (xmmSrc),
3079 _mm_movepi64_pi64 (xmmAlpha),
3080 expandPixel_8_1x64 (m),
3081 unpack_32_1x64 (d)));
3088 /* call prefetch hint to optimize cache load*/
3089 cachePrefetch ((__m128i*)mask);
3090 cachePrefetch ((__m128i*)dst);
3094 /* fill cache line with next memory */
3095 cachePrefetchNext ((__m128i*)mask);
3096 cachePrefetchNext ((__m128i*)dst);
3098 m = *((uint32_t*)mask);
3100 if (srca == 0xff && m == 0xffffffff)
3102 save128Aligned ((__m128i*)dst, xmmDef);
3106 xmmDst = load128Aligned ((__m128i*) dst);
3107 xmmMask = unpack_32_1x128 (m);
3108 xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());
3111 unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
3112 unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
3114 expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
3116 inOver_2x128 (xmmSrc, xmmSrc, xmmAlpha, xmmAlpha, xmmMaskLo, xmmMaskHi, &xmmDstLo, &xmmDstHi);
3118 save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
3128 uint8_t m = *mask++;
3134 *dst = pack_1x64_32 (inOver_1x64 (_mm_movepi64_pi64 (xmmSrc),
3135 _mm_movepi64_pi64 (xmmAlpha),
3136 expandPixel_8_1x64 (m),
3137 unpack_32_1x64 (d)));
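
/* A sketch of how a single 8-bit mask value is widened for the blend:
 * expandPixel_8_1x64 presumably replicates m into every 16-bit lane so
 * one multiply applies it to all channels at once (an assumption about
 * its behavior). exampleExpandMask8 is a hypothetical name; this 128-bit
 * variant covers two unpacked pixels. */
static inline __m128i
exampleExpandMask8 (uint8_t m)
{
    return _mm_set1_epi16 (m);   /* [m, m, m, m, m, m, m, m] */
}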
3148 /* -------------------------------------------------------------------------------------------------
3149  * pixmanFillsse2
3153 pixmanFillsse2 (uint32_t *bits,
3162 uint32_t byte_width;
3167 if (bpp == 16 && (data >> 16 != (data & 0xffff)))
3170 if (bpp != 16 && bpp != 32)
3175 stride = stride * (int) sizeof (uint32_t) / 2;
3176 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3177 byte_width = 2 * width;
3182 stride = stride * (int) sizeof (uint32_t) / 4;
3183 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3184 byte_width = 4 * width;
3188 cachePrefetch ((__m128i*)byte_line);
3189 xmmDef = createMask_2x32_128 (data, data);
3194 uint8_t *d = byte_line;
3195 byte_line += stride;
3199 cachePrefetchNext ((__m128i*)d);
3201 while (w >= 2 && ((unsigned long)d & 3))
3203 *(uint16_t *)d = data;
3208 while (w >= 4 && ((unsigned long)d & 15))
3210 *(uint32_t *)d = data;
3216 cachePrefetchNext ((__m128i*)d);
3220 cachePrefetch (((__m128i*)d) + 12);
3222 save128Aligned ((__m128i*)(d), xmmDef);
3223 save128Aligned ((__m128i*)(d+16), xmmDef);
3224 save128Aligned ((__m128i*)(d+32), xmmDef);
3225 save128Aligned ((__m128i*)(d+48), xmmDef);
3226 save128Aligned ((__m128i*)(d+64), xmmDef);
3227 save128Aligned ((__m128i*)(d+80), xmmDef);
3228 save128Aligned ((__m128i*)(d+96), xmmDef);
3229 save128Aligned ((__m128i*)(d+112), xmmDef);
3237 cachePrefetch (((__m128i*)d) + 8);
3239 save128Aligned ((__m128i*)(d), xmmDef);
3240 save128Aligned ((__m128i*)(d+16), xmmDef);
3241 save128Aligned ((__m128i*)(d+32), xmmDef);
3242 save128Aligned ((__m128i*)(d+48), xmmDef);
3248 cachePrefetchNext ((__m128i*)d);
3252 save128Aligned ((__m128i*)(d), xmmDef);
3253 save128Aligned ((__m128i*)(d+16), xmmDef);
3261 save128Aligned ((__m128i*)(d), xmmDef);
3267 cachePrefetchNext ((__m128i*)d);
3271 *(uint32_t *)d = data;
3279 *(uint16_t *)d = data;
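
/* A minimal sketch of the fill kernel above: replicate the 32-bit fill
 * value into one XMM register and issue aligned 16-byte stores. Assumes
 * d is already 16-byte aligned, as the head loops above guarantee;
 * exampleFill16 is a hypothetical name. */
static inline void
exampleFill16 (uint8_t *d, uint32_t data, int bytes)
{
    __m128i v = _mm_set1_epi32 ((int) data);

    while (bytes >= 16)
    {
        _mm_store_si128 ((__m128i*) d, v);
        d += 16;
        bytes -= 16;
    }
}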
3290 fbCompositeSolidMaskSrc_nx8x8888sse2 (pixman_op_t op,
3291 pixman_image_t * pSrc,
3292 pixman_image_t * pMask,
3293 pixman_image_t * pDst,
3304 uint32_t *dstLine, *dst;
3305 uint8_t *maskLine, *mask;
3306 int dstStride, maskStride;
3310 __m128i xmmSrc, xmmDef;
3311 __m128i xmmMask, xmmMaskLo, xmmMaskHi;
3313 fbComposeGetSolid(pSrc, src, pDst->bits.format);
3318 pixmanFillsse2 (pDst->bits.bits, pDst->bits.rowstride,
3319 PIXMAN_FORMAT_BPP (pDst->bits.format),
3320 xDst, yDst, width, height, 0);
3324 fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
3325 fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
3327 xmmDef = createMask_2x32_128 (src, src);
3328 xmmSrc = expandPixel_32_1x128 (src);
3333 dstLine += dstStride;
3335 maskLine += maskStride;
3338 /* call prefetch hint to optimize cache load*/
3339 cachePrefetch ((__m128i*)mask);
3340 cachePrefetch ((__m128i*)dst);
3342 while (w && (unsigned long)dst & 15)
3344 uint8_t m = *mask++;
3348 *dst = pack_1x64_32 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmSrc), expandPixel_8_1x64 (m)));
3359 /* call prefetch hint to optimize cache load*/
3360 cachePrefetch ((__m128i*)mask);
3361 cachePrefetch ((__m128i*)dst);
3365 /* fill cache line with next memory */
3366 cachePrefetchNext ((__m128i*)mask);
3367 cachePrefetchNext ((__m128i*)dst);
3369 m = *((uint32_t*)mask);
3371 if (srca == 0xff && m == 0xffffffff)
3373 save128Aligned ((__m128i*)dst, xmmDef);
3377 xmmMask = unpack_32_1x128 (m);
3378 xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());
3381 unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
3383 expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
3385 pixMultiply_2x128 (xmmSrc, xmmSrc, xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
3387 save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmMaskLo, xmmMaskHi));
3391 save128Aligned ((__m128i*)dst, _mm_setzero_si128());
3401 uint8_t m = *mask++;
3405 *dst = pack_1x64_32 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmSrc), expandPixel_8_1x64 (m)));
3420 /* -------------------------------------------------------------------------------------------------
3421 * fbCompositeSolidMask_nx8x0565
3425 fbCompositeSolidMask_nx8x0565sse2 (pixman_op_t op,
3426 pixman_image_t * pSrc,
3427 pixman_image_t * pMask,
3428 pixman_image_t * pDst,
3439 uint16_t *dstLine, *dst, d;
3440 uint8_t *maskLine, *mask;
3441 int dstStride, maskStride;
3445 __m128i xmmSrc, xmmAlpha;
3446 __m128i xmmMask, xmmMaskLo, xmmMaskHi;
3447 __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;
3449 fbComposeGetSolid(pSrc, src, pDst->bits.format);
3455 fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
3456 fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
3458 xmmSrc = expandPixel_32_1x128 (src);
3459 xmmAlpha = expandAlpha_1x128 (xmmSrc);
3464 dstLine += dstStride;
3466 maskLine += maskStride;
3469 /* call prefetch hint to optimize cache load*/
3470 cachePrefetch ((__m128i*)mask);
3471 cachePrefetch ((__m128i*)dst);
3473 while (w && (unsigned long)dst & 15)
3481 *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (_mm_movepi64_pi64 (xmmSrc),
3482 _mm_movepi64_pi64 (xmmAlpha),
3483 expandAlphaRev_1x64 (unpack_32_1x64 (m)),
3484 expand565_16_1x64 (d))));
3491 /* call prefetch hint to optimize cache load*/
3492 cachePrefetch ((__m128i*)mask);
3493 cachePrefetch ((__m128i*)dst);
3497 /* fill cache line with next memory */
3498 cachePrefetchNext ((__m128i*)mask);
3499 cachePrefetchNext ((__m128i*)dst);
3501 xmmDst = load128Aligned ((__m128i*) dst);
3502 unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
3504 m = *((uint32_t*)mask);
3509 xmmMask = unpack_32_1x128 (m);
3510 xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());
3513 unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
3515 expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
3516 inOver_2x128 (xmmSrc, xmmSrc, xmmAlpha, xmmAlpha, xmmMaskLo, xmmMaskHi, &xmmDst0, &xmmDst1);
3519 m = *((uint32_t*)mask);
3524 xmmMask = unpack_32_1x128 (m);
3525 xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());
3528 unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
3530 expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
3531 inOver_2x128 (xmmSrc, xmmSrc, xmmAlpha, xmmAlpha, xmmMaskLo, xmmMaskHi, &xmmDst2, &xmmDst3);
3534 save128Aligned ((__m128i*)dst, pack565_4x128_128 (xmmDst0, xmmDst1, xmmDst2, xmmDst3));
3548 *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (_mm_movepi64_pi64 (xmmSrc),
3549 _mm_movepi64_pi64 (xmmAlpha),
3550 expandAlphaRev_1x64 (unpack_32_1x64 (m)),
3551 expand565_16_1x64 (d))));
3562 /* -------------------------------------------------------------------------------------------------
3563 * fbCompositeSrc_8888RevNPx0565
3567 fbCompositeSrc_8888RevNPx0565sse2 (pixman_op_t op,
3568 pixman_image_t * pSrc,
3569 pixman_image_t * pMask,
3570 pixman_image_t * pDst,
3580 uint16_t *dstLine, *dst, d;
3581 uint32_t *srcLine, *src, s;
3582 int dstStride, srcStride;
3587 __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
3588 __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;
3590 fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
3591 fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
3596      * This code was copied from the MMX version, keeping the FIXME.
3597      * If it's a problem there, it's probably a problem here.
3599 assert (pSrc->pDrawable == pMask->pDrawable);
3605 dstLine += dstStride;
3607 srcLine += srcStride;
3610 /* call prefetch hint to optimize cache load*/
3611 cachePrefetch ((__m128i*)src);
3612 cachePrefetch ((__m128i*)dst);
3614 while (w && (unsigned long)dst & 15)
3619 ms = unpack_32_1x64 (s);
3621 *dst++ = pack565_32_16 (pack_1x64_32 (overRevNonPre_1x64(ms, expand565_16_1x64 (d))));
3625 /* call prefetch hint to optimize cache load*/
3626 cachePrefetch ((__m128i*)src);
3627 cachePrefetch ((__m128i*)dst);
3631 /* fill cache line with next memory */
3632 cachePrefetchNext ((__m128i*)src);
3633 cachePrefetchNext ((__m128i*)dst);
3636 xmmSrc = load128Unaligned((__m128i*)src);
3637 xmmDst = load128Aligned ((__m128i*)dst);
3639 packCmp = packAlpha (xmmSrc);
3641 unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
3642 unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
3644 /* preload next round*/
3645 xmmSrc = load128Unaligned((__m128i*)(src+4));
3648 if (packCmp == 0xffffffff)
3650 invertColors_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst0, &xmmDst1);
3654 overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst0, &xmmDst1);
3658 packCmp = packAlpha (xmmSrc);
3660 unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
3662 if (packCmp == 0xffffffff)
3664 invertColors_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst2, &xmmDst3);
3668 overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst2, &xmmDst3);
3671 save128Aligned ((__m128i*)dst, pack565_4x128_128 (xmmDst0, xmmDst1, xmmDst2, xmmDst3));
3683 ms = unpack_32_1x64 (s);
3685 *dst++ = pack565_32_16 (pack_1x64_32 (overRevNonPre_1x64(ms, expand565_16_1x64 (d))));
3693 /* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */
3695 /* -------------------------------------------------------------------------------------------------
3696 * fbCompositeSrc_8888RevNPx8888
3700 fbCompositeSrc_8888RevNPx8888sse2 (pixman_op_t op,
3701 pixman_image_t * pSrc,
3702 pixman_image_t * pMask,
3703 pixman_image_t * pDst,
3713 uint32_t *dstLine, *dst, d;
3714 uint32_t *srcLine, *src, s;
3715 int dstStride, srcStride;
3719 __m128i xmmSrcLo, xmmSrcHi;
3720 __m128i xmmDstLo, xmmDstHi;
3722 fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
3723 fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
3728      * This code was copied from the MMX version, keeping the FIXME.
3729      * If it's a problem there, it's probably a problem here.
3731 assert (pSrc->pDrawable == pMask->pDrawable);
3737 dstLine += dstStride;
3739 srcLine += srcStride;
3742 /* call prefetch hint to optimize cache load*/
3743 cachePrefetch ((__m128i*)src);
3744 cachePrefetch ((__m128i*)dst);
3746 while (w && (unsigned long)dst & 15)
3751 *dst++ = pack_1x64_32 (overRevNonPre_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
3756 /* call prefetch hint to optimize cache load*/
3757 cachePrefetch ((__m128i*)src);
3758 cachePrefetch ((__m128i*)dst);
3762 /* fill cache line with next memory */
3763 cachePrefetchNext ((__m128i*)src);
3764 cachePrefetchNext ((__m128i*)dst);
3766 xmmSrcHi = load128Unaligned((__m128i*)src);
3768 packCmp = packAlpha (xmmSrcHi);
3770 unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
3772 if (packCmp == 0xffffffff)
3774 invertColors_2x128( xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi);
3776 save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
3780 xmmDstHi = load128Aligned ((__m128i*)dst);
3782 unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
3784 overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi);
3786 save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
3799 *dst++ = pack_1x64_32 (overRevNonPre_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
3808 /* -------------------------------------------------------------------------------------------------
3809 * fbCompositeSolidMask_nx8888x0565C
3813 fbCompositeSolidMask_nx8888x0565Csse2 (pixman_op_t op,
3814 pixman_image_t * pSrc,
3815 pixman_image_t * pMask,
3816 pixman_image_t * pDst,
3827 uint16_t *dstLine, *dst, d;
3828 uint32_t *maskLine, *mask, m;
3829 int dstStride, maskStride;
3833 __m128i xmmSrc, xmmAlpha;
3834 __m128i xmmMask, xmmMaskLo, xmmMaskHi;
3835 __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;
3837 fbComposeGetSolid(pSrc, src, pDst->bits.format);
3843 fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
3844 fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1);
3846 xmmSrc = expandPixel_32_1x128 (src);
3847 xmmAlpha = expandAlpha_1x128 (xmmSrc);
3854 maskLine += maskStride;
3855 dstLine += dstStride;
3857 /* call prefetch hint to optimize cache load*/
3858 cachePrefetch ((__m128i*)mask);
3859 cachePrefetch ((__m128i*)dst);
3861 while (w && ((unsigned long)dst & 15))
3863 m = *(uint32_t *) mask;
3869 *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (_mm_movepi64_pi64 (xmmSrc),
3870 _mm_movepi64_pi64 (xmmAlpha),
3872 expand565_16_1x64 (d))));
3880 /* call prefetch hint to optimize cache load*/
3881 cachePrefetch ((__m128i*)mask);
3882 cachePrefetch ((__m128i*)dst);
3886 /* fill cache line with next memory */
3887 cachePrefetchNext ((__m128i*)mask);
3888 cachePrefetchNext ((__m128i*)dst);
3891 xmmMask = load128Unaligned((__m128i*)mask);
3892 xmmDst = load128Aligned((__m128i*)dst);
3894 packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128()));
3896 unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
3897 unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
3899 /* preload next round*/
3900 xmmMask = load128Unaligned((__m128i*)(mask+4));
3903 if (packCmp != 0xffff)
3905 inOver_2x128(xmmSrc, xmmSrc, xmmAlpha, xmmAlpha, xmmMaskLo, xmmMaskHi, &xmmDst0, &xmmDst1);
3909 packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128()));
3911 unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
3913 if (packCmp != 0xffff)
3915 inOver_2x128(xmmSrc, xmmSrc, xmmAlpha, xmmAlpha, xmmMaskLo, xmmMaskHi, &xmmDst2, &xmmDst3);
3918 save128Aligned ((__m128i*)dst, pack565_4x128_128 (xmmDst0, xmmDst1, xmmDst2, xmmDst3));
3927 m = *(uint32_t *) mask;
3933 *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (_mm_movepi64_pi64 (xmmSrc),
3934 _mm_movepi64_pi64 (xmmAlpha),
3936 expand565_16_1x64 (d))));
3948 /* -------------------------------------------------------------------------------------------------
3949 * fbCompositeIn_nx8x8
3953 fbCompositeIn_nx8x8sse2 (pixman_op_t op,
3954 pixman_image_t * pSrc,
3955 pixman_image_t * pMask,
3956 pixman_image_t * pDst,
3966 uint8_t *dstLine, *dst;
3967 uint8_t *maskLine, *mask;
3968 int dstStride, maskStride;
3974 __m128i xmmMask, xmmMaskLo, xmmMaskHi;
3975 __m128i xmmDst, xmmDstLo, xmmDstHi;
3977 fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
3978 fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
3980 fbComposeGetSolid(pSrc, src, pDst->bits.format);
3986 xmmAlpha = expandAlpha_1x128 (expandPixel_32_1x128 (src));
3991 dstLine += dstStride;
3993 maskLine += maskStride;
3996 /* call prefetch hint to optimize cache load*/
3997 cachePrefetch ((__m128i*)mask);
3998 cachePrefetch ((__m128i*)dst);
4000 while (w && ((unsigned long)dst & 15))
4002 m = (uint32_t) *mask++;
4003 d = (uint32_t) *dst;
4005 *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
4006 unpack_32_1x64 (d)));
4010 /* call prefetch hint to optimize cache load*/
4011 cachePrefetch ((__m128i*)mask);
4012 cachePrefetch ((__m128i*)dst);
4016 /* fill cache line with next memory */
4017 cachePrefetchNext ((__m128i*)mask);
4018 cachePrefetchNext ((__m128i*)dst);
4020 xmmMask = load128Unaligned((__m128i*)mask);
4021 xmmDst = load128Aligned((__m128i*)dst);
4023 unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
4024 unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
4026 pixMultiply_2x128 (xmmAlpha, xmmAlpha, xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
4027 pixMultiply_2x128 (xmmMaskLo, xmmMaskHi, xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);
4029 save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
4038 m = (uint32_t) *mask++;
4039 d = (uint32_t) *dst;
4041 *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
4042 unpack_32_1x64 (d)));
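
/* The a8 IN above is two chained multiplies per byte:
 * dst = ((srca * m) / 255 * d) / 255. A scalar sketch using the
 * hypothetical exampleMulDiv255 helper sketched earlier: */
static inline uint8_t
exampleInA8 (uint8_t srca, uint8_t m, uint8_t d)
{
    return exampleMulDiv255 (exampleMulDiv255 (srca, m), d);
}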
4050 /* -------------------------------------------------------------------------------------------------
4051  * fbCompositeIn_8x8
4055 fbCompositeIn_8x8sse2 (pixman_op_t op,
4056 pixman_image_t * pSrc,
4057 pixman_image_t * pMask,
4058 pixman_image_t * pDst,
4068 uint8_t *dstLine, *dst;
4069 uint8_t *srcLine, *src;
4070 int srcStride, dstStride;
4074 __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
4075 __m128i xmmDst, xmmDstLo, xmmDstHi;
4077 fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
4078 fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);
4083 dstLine += dstStride;
4085 srcLine += srcStride;
4088 /* call prefetch hint to optimize cache load*/
4089 cachePrefetch ((__m128i*)src);
4090 cachePrefetch ((__m128i*)dst);
4092 while (w && ((unsigned long)dst & 15))
4094 s = (uint32_t) *src++;
4095 d = (uint32_t) *dst;
4097 *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s),unpack_32_1x64 (d)));
4101 /* call prefetch hint to optimize cache load*/
4102 cachePrefetch ((__m128i*)src);
4103 cachePrefetch ((__m128i*)dst);
4107 /* fill cache line with next memory */
4108 cachePrefetchNext ((__m128i*)src);
4109 cachePrefetchNext ((__m128i*)dst);
4111 xmmSrc = load128Unaligned((__m128i*)src);
4112 xmmDst = load128Aligned((__m128i*)dst);
4114 unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
4115 unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
4117 pixMultiply_2x128 (xmmSrcLo, xmmSrcHi, xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);
4119 save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
4128 s = (uint32_t) *src++;
4129 d = (uint32_t) *dst;
4131 *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s),unpack_32_1x64 (d)));
4139 /* -------------------------------------------------------------------------------------------------
4140 * fbCompositeSrcAdd_8888x8x8
4144 fbCompositeSrcAdd_8888x8x8sse2 (pixman_op_t op,
4145 pixman_image_t * pSrc,
4146 pixman_image_t * pMask,
4147 pixman_image_t * pDst,
4157 uint8_t *dstLine, *dst;
4158 uint8_t *maskLine, *mask;
4159 int dstStride, maskStride;
4166 __m128i xmmMask, xmmMaskLo, xmmMaskHi;
4167 __m128i xmmDst, xmmDstLo, xmmDstHi;
4169 fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
4170 fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
4172 fbComposeGetSolid(pSrc, src, pDst->bits.format);
4178 xmmAlpha = expandAlpha_1x128 (expandPixel_32_1x128 (src));
4183 dstLine += dstStride;
4185 maskLine += maskStride;
4188 /* call prefetch hint to optimize cache load*/
4189 cachePrefetch ((__m128i*)mask);
4190 cachePrefetch ((__m128i*)dst);
4192 while (w && ((unsigned long)dst & 15))
4194 m = (uint32_t) *mask++;
4195 d = (uint32_t) *dst;
4197 *dst++ = (uint8_t) pack_1x64_32 (_mm_adds_pu16 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
4198 unpack_32_1x64 (d)));
4202 /* call prefetch hint to optimize cache load*/
4203 cachePrefetch ((__m128i*)mask);
4204 cachePrefetch ((__m128i*)dst);
4208 /* fill cache line with next memory */
4209 cachePrefetchNext ((__m128i*)mask);
4210 cachePrefetchNext ((__m128i*)dst);
4212 xmmMask = load128Unaligned((__m128i*)mask);
4213 xmmDst = load128Aligned((__m128i*)dst);
4215 unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
4216 unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
4218 pixMultiply_2x128 (xmmAlpha, xmmAlpha, xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
4220 xmmDstLo = _mm_adds_epu16 (xmmMaskLo, xmmDstLo);
4221 xmmDstHi = _mm_adds_epu16 (xmmMaskHi, xmmDstHi);
4223 save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
4232 m = (uint32_t) *mask++;
4233 d = (uint32_t) *dst;
4235 *dst++ = (uint8_t) pack_1x64_32 (_mm_adds_pu16 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
4236 unpack_32_1x64 (d)));
4244 /* -------------------------------------------------------------------------------------------------
4245 * fbCompositeSrcAdd_8000x8000
4249 fbCompositeSrcAdd_8000x8000sse2 (pixman_op_t op,
4250 pixman_image_t * pSrc,
4251 pixman_image_t * pMask,
4252 pixman_image_t * pDst,
4262 uint8_t *dstLine, *dst;
4263 uint8_t *srcLine, *src;
4264 int dstStride, srcStride;
4268 fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);
4269 fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
4276 /* call prefetch hint to optimize cache load*/
4277 cachePrefetch ((__m128i*)src);
4278 cachePrefetch ((__m128i*)dst);
4280 dstLine += dstStride;
4281 srcLine += srcStride;
4285 while (w && (unsigned long)dst & 3)
4287 t = (*dst) + (*src++);
4288 *dst++ = t | (0 - (t >> 8));
4292 coreCombineAddUsse2 ((uint32_t*)dst, (uint32_t*)src, w >> 2);
4302 t = (*dst) + (*src++);
4303 *dst++ = t | (0 - (t >> 8));
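
/* The scalar head and tail above saturate without a branch: when the
 * 8-bit sum overflows, (t >> 8) is 1 and (0 - (t >> 8)) becomes all-ones,
 * flooding the result to 0xff. A worked sketch; exampleSatAdd8 is a
 * hypothetical name. */
static inline uint8_t
exampleSatAdd8 (uint8_t a, uint8_t b)
{
    uint16_t t = (uint16_t) a + b;
    return (uint8_t) (t | (0 - (t >> 8)));
}
/* e.g. 200 + 100: t = 300 (0x12c), t >> 8 = 1, so the result floods to 0xff */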
4311 /* -------------------------------------------------------------------------------------------------
4312 * fbCompositeSrcAdd_8888x8888
4315 fbCompositeSrcAdd_8888x8888sse2 (pixman_op_t op,
4316 pixman_image_t * pSrc,
4317 pixman_image_t * pMask,
4318 pixman_image_t * pDst,
4328 uint32_t *dstLine, *dst;
4329 uint32_t *srcLine, *src;
4330 int dstStride, srcStride;
4332 fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
4333 fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
4338 dstLine += dstStride;
4340 srcLine += srcStride;
4342 coreCombineAddUsse2 (dst, src, width);
4348 /* -------------------------------------------------------------------------------------------------
4349 * fbCompositeCopyAreasse2
4353 pixmanBltsse2 (uint32_t *src_bits,
4359 int src_x, int src_y,
4360 int dst_x, int dst_y,
4361 int width, int height)
4363 uint8_t * src_bytes;
4364 uint8_t * dst_bytes;
4367 if (src_bpp != dst_bpp)
4372 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
4373 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
4374 src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
4375 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
4376 byte_width = 2 * width;
4380 else if (src_bpp == 32)
4382 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
4383 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
4384 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
4385 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
4386 byte_width = 4 * width;
4395 cachePrefetch ((__m128i*)src_bytes);
4396 cachePrefetch ((__m128i*)dst_bytes);
4401 uint8_t *s = src_bytes;
4402 uint8_t *d = dst_bytes;
4403 src_bytes += src_stride;
4404 dst_bytes += dst_stride;
4407 cachePrefetchNext ((__m128i*)s);
4408 cachePrefetchNext ((__m128i*)d);
4410 while (w >= 2 && ((unsigned long)d & 3))
4412 *(uint16_t *)d = *(uint16_t *)s;
4418 while (w >= 4 && ((unsigned long)d & 15))
4420 *(uint32_t *)d = *(uint32_t *)s;
4427 cachePrefetchNext ((__m128i*)s);
4428 cachePrefetchNext ((__m128i*)d);
4432 __m128i xmm0, xmm1, xmm2, xmm3;
4434 /* 128 bytes ahead */
4435 cachePrefetch (((__m128i*)s) + 8);
4436 cachePrefetch (((__m128i*)d) + 8);
4438 xmm0 = load128Unaligned ((__m128i*)(s));
4439 xmm1 = load128Unaligned ((__m128i*)(s+16));
4440 xmm2 = load128Unaligned ((__m128i*)(s+32));
4441 xmm3 = load128Unaligned ((__m128i*)(s+48));
4443 save128Aligned ((__m128i*)(d), xmm0);
4444 save128Aligned ((__m128i*)(d+16), xmm1);
4445 save128Aligned ((__m128i*)(d+32), xmm2);
4446 save128Aligned ((__m128i*)(d+48), xmm3);
4453 cachePrefetchNext ((__m128i*)s);
4454 cachePrefetchNext ((__m128i*)d);
4458 save128Aligned ((__m128i*)d, load128Unaligned ((__m128i*)s) );
4465 cachePrefetchNext ((__m128i*)s);
4466 cachePrefetchNext ((__m128i*)d);
4470 *(uint32_t *)d = *(uint32_t *)s;
4479 *(uint16_t *)d = *(uint16_t *)s;
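
/* A minimal sketch of the copy kernel above: unaligned 16-byte loads from
 * the source paired with aligned stores to the destination, which the head
 * loops above made 16-byte aligned. exampleCopy16 is a hypothetical name. */
static inline void
exampleCopy16 (uint8_t *d, const uint8_t *s, int bytes)
{
    while (bytes >= 16)
    {
        _mm_store_si128 ((__m128i*) d, _mm_loadu_si128 ((const __m128i*) s));
        d += 16;
        s += 16;
        bytes -= 16;
    }
}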
4492 fbCompositeCopyAreasse2 (pixman_op_t op,
4493 pixman_image_t * pSrc,
4494 pixman_image_t * pMask,
4495 pixman_image_t * pDst,
4505 pixmanBltsse2 (pSrc->bits.bits,
4507 pSrc->bits.rowstride,
4508 pDst->bits.rowstride,
4509 PIXMAN_FORMAT_BPP (pSrc->bits.format),
4510 PIXMAN_FORMAT_BPP (pDst->bits.format),
4511 xSrc, ySrc, xDst, yDst, width, height);
4515 /* This code is buggy in the MMX version, and the bug has been carried over to this SSE2 version */
4517 fbCompositeOver_x888x8x8888sse2 (pixman_op_t op,
4518 pixman_image_t * pSrc,
4519 pixman_image_t * pMask,
4520 pixman_image_t * pDst,
4530 uint32_t *src, *srcLine, s;
4531 uint32_t *dst, *dstLine, d;
4532 uint8_t *mask, *maskLine;
4534 int srcStride, maskStride, dstStride;
4537 __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
4538 __m128i xmmDst, xmmDstLo, xmmDstHi;
4539 __m128i xmmMask, xmmMaskLo, xmmMaskHi;
4541 fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
4542 fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
4543 fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
4548 srcLine += srcStride;
4550 dstLine += dstStride;
4552 maskLine += maskStride;
4556 /* call prefetch hint to optimize cache load*/
4557 cachePrefetch ((__m128i*)src);
4558 cachePrefetch ((__m128i*)dst);
4559 cachePrefetch ((__m128i*)mask);
4561 while (w && (unsigned long)dst & 15)
4563 s = 0xff000000 | *src++;
4564 m = (uint32_t) *mask++;
4567 __m64 ms = unpack_32_1x64 (s);
4571 ms = inOver_1x64 (ms,
4573 expandAlphaRev_1x64 (unpack_32_1x64 (m)),
4574 unpack_32_1x64 (d));
4577 *dst++ = pack_1x64_32 (ms);
4581 /* call prefetch hint to optimize cache load*/
4582 cachePrefetch ((__m128i*)src);
4583 cachePrefetch ((__m128i*)dst);
4584 cachePrefetch ((__m128i*)mask);
4588 /* fill cache line with next memory */
4589 cachePrefetchNext ((__m128i*)src);
4590 cachePrefetchNext ((__m128i*)dst);
4591 cachePrefetchNext ((__m128i*)mask);
4593 m = *(uint32_t*) mask;
4594 xmmSrc = _mm_or_si128 (load128Unaligned ((__m128i*)src), Maskff000000);
4596 if (m == 0xffffffff)
4598 save128Aligned ((__m128i*)dst, xmmSrc);
4602 xmmDst = load128Aligned ((__m128i*)dst);
4604 xmmMask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
4606 unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
4607 unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
4608 unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
4610 expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
4612 inOver_2x128 (xmmSrcLo, xmmSrcHi, Mask00ff, Mask00ff, xmmMaskLo, xmmMaskHi, &xmmDstLo, &xmmDstHi);
4614 save128Aligned( (__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
4625 m = (uint32_t) *mask++;
4629 s = 0xff000000 | *src;
4639 *dst = pack_1x64_32 (inOver_1x64 (unpack_32_1x64 (s),
4641 expandAlphaRev_1x64 (unpack_32_1x64 (m)),
4642 unpack_32_1x64 (d)));
4657 #endif /* USE_SSE2 */