2 * Copyright © 2008 Rodrigo Kumpera
3 * Copyright © 2008 André Tupinambá
5 * Permission to use, copy, modify, distribute, and sell this software and its
6 * documentation for any purpose is hereby granted without fee, provided that
7 * the above copyright notice appear in all copies and that both that
8 * copyright notice and this permission notice appear in supporting
9 * documentation, and that the name of Red Hat not be used in advertising or
10 * publicity pertaining to distribution of the software without specific,
11 * written prior permission. Red Hat makes no representations about the
12 * suitability of this software for any purpose. It is provided "as is"
13 * without express or implied warranty.
15 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
24 * Author: Rodrigo Kumpera (kumpera@gmail.com)
25 * André Tupinambá (andrelrt@gmail.com)
27 * Based on work by Owen Taylor and Søren Sandmann
34 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
35 #include <emmintrin.h> /* for SSE2 intrinsics */
36 #include "pixman-private.h"
37 #include "pixman-combine32.h"
41 /* -------------------------------------------------------------------------------------------------
/* 64-bit MMX mask constants.  Initialized elsewhere (not visible in this
 * excerpt) before any fast path runs -- TODO confirm at the init site. */
static __m64 xMask0080;        /* presumably 0x0080 per 16-bit lane: rounding bias for (x*a)/255 -- confirm */
static __m64 xMask00ff;        /* presumably 0x00ff per lane: XOR mask used by negate_1x64 -- confirm */
static __m64 xMask0101;        /* presumably 0x0101 per lane: mulhi trick completing the /255 -- confirm */
static __m64 xMaskAlpha;       /* ORed into the alpha lane by overRevNonPre_1x64 */
static __m64 xMask565rgb;      /* field mask used by expand565_16_1x64 */
static __m64 xMask565Unpack;   /* per-lane multipliers used by expand565_16_1x64 */
/* 128-bit SSE2 counterparts of the masks above. */
static __m128i Mask0080;
static __m128i Mask00ff;
static __m128i Mask0101;
static __m128i Maskffff;
static __m128i Maskff000000;   /* used by coreCombineSaturateUsse2 to flip dst alpha bytes */
static __m128i MaskAlpha;      /* ORed into alpha lanes by overRevNonPre_2x128 */
static __m128i Mask565r;       /* r/g/b field masks used when packing down to 565 */
static __m128i Mask565g1, Mask565g2;
static __m128i Mask565b;
static __m128i MaskRed;        /* 8888 field masks used when expanding from 565 */
static __m128i MaskGreen;
static __m128i MaskBlue;
static __m128i Mask565FixRB;   /* bit-replication helpers for 565 -> 888 expansion */
static __m128i Mask565FixG;
70 /* -------------------------------------------------------------------------------------------------
73 static force_inline __m128i
74 unpack_32_1x128 (uint32_t data)
76 return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128());
79 static force_inline void
80 unpack_128_2x128 (__m128i data, __m128i* dataLo, __m128i* dataHi)
82 *dataLo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
83 *dataHi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
86 static force_inline __m128i
87 unpack565to8888 (__m128i lo)
89 __m128i r, g, b, rb, t;
91 r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), MaskRed);
92 g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), MaskGreen);
93 b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), MaskBlue);
95 rb = _mm_or_si128 (r, b);
96 t = _mm_and_si128 (rb, Mask565FixRB);
97 t = _mm_srli_epi32 (t, 5);
98 rb = _mm_or_si128 (rb, t);
100 t = _mm_and_si128 (g, Mask565FixG);
101 t = _mm_srli_epi32 (t, 6);
102 g = _mm_or_si128 (g, t);
104 return _mm_or_si128 (rb, g);
107 static force_inline void
108 unpack565_128_4x128 (__m128i data, __m128i* data0, __m128i* data1, __m128i* data2, __m128i* data3)
112 lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
113 hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
115 lo = unpack565to8888 (lo);
116 hi = unpack565to8888 (hi);
118 unpack_128_2x128 (lo, data0, data1);
119 unpack_128_2x128 (hi, data2, data3);
122 static force_inline uint16_t
123 pack565_32_16 (uint32_t pixel)
125 return (uint16_t) (((pixel>>8) & 0xf800) | ((pixel>>5) & 0x07e0) | ((pixel>>3) & 0x001f));
128 static force_inline __m128i
129 pack_2x128_128 (__m128i lo, __m128i hi)
131 return _mm_packus_epi16 (lo, hi);
134 static force_inline __m128i
135 pack565_2x128_128 (__m128i lo, __m128i hi)
138 __m128i r, g1, g2, b;
140 data = pack_2x128_128 ( lo, hi );
142 r = _mm_and_si128 (data , Mask565r);
143 g1 = _mm_and_si128 (_mm_slli_epi32 (data , 3), Mask565g1);
144 g2 = _mm_and_si128 (_mm_srli_epi32 (data , 5), Mask565g2);
145 b = _mm_and_si128 (_mm_srli_epi32 (data , 3), Mask565b);
147 return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
150 static force_inline __m128i
151 pack565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
153 return _mm_packus_epi16 (pack565_2x128_128 (*xmm0, *xmm1), pack565_2x128_128 (*xmm2, *xmm3));
156 static force_inline int
159 __m128i ffs = _mm_cmpeq_epi8 (x, x);
160 return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
163 static force_inline int
166 return _mm_movemask_epi8 (_mm_cmpeq_epi8 (x, _mm_setzero_si128())) == 0xffff;
169 static force_inline int
170 isTransparent (__m128i x)
172 return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, _mm_setzero_si128())) & 0x8888) == 0x8888;
175 static force_inline __m128i
176 expandPixel_32_1x128 (uint32_t data)
178 return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE(1, 0, 1, 0));
181 static force_inline __m128i
182 expandAlpha_1x128 (__m128i data)
184 return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
187 static force_inline void
188 expandAlpha_2x128 (__m128i dataLo, __m128i dataHi, __m128i* alphaLo, __m128i* alphaHi)
192 lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(3, 3, 3, 3));
193 hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(3, 3, 3, 3));
194 *alphaLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(3, 3, 3, 3));
195 *alphaHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(3, 3, 3, 3));
198 static force_inline void
199 expandAlphaRev_2x128 (__m128i dataLo, __m128i dataHi, __m128i* alphaLo, __m128i* alphaHi)
203 lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(0, 0, 0, 0));
204 hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(0, 0, 0, 0));
205 *alphaLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(0, 0, 0, 0));
206 *alphaHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(0, 0, 0, 0));
209 static force_inline void
210 pixMultiply_2x128 (__m128i* dataLo, __m128i* dataHi, __m128i* alphaLo, __m128i* alphaHi, __m128i* retLo, __m128i* retHi)
214 lo = _mm_mullo_epi16 (*dataLo, *alphaLo);
215 hi = _mm_mullo_epi16 (*dataHi, *alphaHi);
216 lo = _mm_adds_epu16 (lo, Mask0080);
217 hi = _mm_adds_epu16 (hi, Mask0080);
218 *retLo = _mm_mulhi_epu16 (lo, Mask0101);
219 *retHi = _mm_mulhi_epu16 (hi, Mask0101);
222 static force_inline void
223 pixAddMultiply_2x128 (__m128i* srcLo, __m128i* srcHi, __m128i* alphaDstLo, __m128i* alphaDstHi,
224 __m128i* dstLo, __m128i* dstHi, __m128i* alphaSrcLo, __m128i* alphaSrcHi,
225 __m128i* retLo, __m128i* retHi)
228 __m128i mulLo, mulHi;
230 lo = _mm_mullo_epi16 (*srcLo, *alphaDstLo);
231 hi = _mm_mullo_epi16 (*srcHi, *alphaDstHi);
232 mulLo = _mm_mullo_epi16 (*dstLo, *alphaSrcLo);
233 mulHi = _mm_mullo_epi16 (*dstHi, *alphaSrcHi);
234 lo = _mm_adds_epu16 (lo, Mask0080);
235 hi = _mm_adds_epu16 (hi, Mask0080);
236 lo = _mm_adds_epu16 (lo, mulLo);
237 hi = _mm_adds_epu16 (hi, mulHi);
238 *retLo = _mm_mulhi_epu16 (lo, Mask0101);
239 *retHi = _mm_mulhi_epu16 (hi, Mask0101);
242 static force_inline void
243 negate_2x128 (__m128i dataLo, __m128i dataHi, __m128i* negLo, __m128i* negHi)
245 *negLo = _mm_xor_si128 (dataLo, Mask00ff);
246 *negHi = _mm_xor_si128 (dataHi, Mask00ff);
249 static force_inline void
250 invertColors_2x128 (__m128i dataLo, __m128i dataHi, __m128i* invLo, __m128i* invHi)
254 lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(3, 0, 1, 2));
255 hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(3, 0, 1, 2));
256 *invLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(3, 0, 1, 2));
257 *invHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(3, 0, 1, 2));
260 static force_inline void
261 over_2x128 (__m128i* srcLo, __m128i* srcHi, __m128i* alphaLo, __m128i* alphaHi, __m128i* dstLo, __m128i* dstHi)
265 negate_2x128 (*alphaLo, *alphaHi, &t1, &t2);
267 pixMultiply_2x128 (dstLo, dstHi, &t1, &t2, dstLo, dstHi);
269 *dstLo = _mm_adds_epu8 (*srcLo, *dstLo);
270 *dstHi = _mm_adds_epu8 (*srcHi, *dstHi);
273 static force_inline void
274 overRevNonPre_2x128 (__m128i srcLo, __m128i srcHi, __m128i* dstLo, __m128i* dstHi)
277 __m128i alphaLo, alphaHi;
279 expandAlpha_2x128 (srcLo, srcHi, &alphaLo, &alphaHi);
281 lo = _mm_or_si128 (alphaLo, MaskAlpha);
282 hi = _mm_or_si128 (alphaHi, MaskAlpha);
284 invertColors_2x128 (srcLo, srcHi, &srcLo, &srcHi);
286 pixMultiply_2x128 (&srcLo, &srcHi, &lo, &hi, &lo, &hi);
288 over_2x128 (&lo, &hi, &alphaLo, &alphaHi, dstLo, dstHi);
291 static force_inline void
292 inOver_2x128 (__m128i* srcLo, __m128i* srcHi, __m128i* alphaLo, __m128i* alphaHi,
293 __m128i* maskLo, __m128i* maskHi, __m128i* dstLo, __m128i* dstHi)
298 pixMultiply_2x128 ( srcLo, srcHi, maskLo, maskHi, &sLo, &sHi);
299 pixMultiply_2x128 (alphaLo, alphaHi, maskLo, maskHi, &aLo, &aHi);
301 over_2x128 (&sLo, &sHi, &aLo, &aHi, dstLo, dstHi);
304 static force_inline void
305 cachePrefetch (__m128i* addr)
307 _mm_prefetch (addr, _MM_HINT_T0);
310 static force_inline void
311 cachePrefetchNext (__m128i* addr)
313 _mm_prefetch (addr + 4, _MM_HINT_T0); // 64 bytes ahead
316 /* load 4 pixels from a 16-byte boundary aligned address */
317 static force_inline __m128i
318 load128Aligned (__m128i* src)
320 return _mm_load_si128 (src);
323 /* load 4 pixels from a unaligned address */
324 static force_inline __m128i
325 load128Unaligned (const __m128i* src)
327 return _mm_loadu_si128 (src);
330 /* save 4 pixels using Write Combining memory on a 16-byte boundary aligned address */
331 static force_inline void
332 save128WriteCombining (__m128i* dst, __m128i data)
334 _mm_stream_si128 (dst, data);
337 /* save 4 pixels on a 16-byte boundary aligned address */
338 static force_inline void
339 save128Aligned (__m128i* dst, __m128i data)
341 _mm_store_si128 (dst, data);
344 /* save 4 pixels on a unaligned address */
345 static force_inline void
346 save128Unaligned (__m128i* dst, __m128i data)
348 _mm_storeu_si128 (dst, data);
351 /* -------------------------------------------------------------------------------------------------
355 static force_inline __m64
356 unpack_32_1x64 (uint32_t data)
358 return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64());
361 static force_inline __m64
362 expandAlpha_1x64 (__m64 data)
364 return _mm_shuffle_pi16 (data, _MM_SHUFFLE(3, 3, 3, 3));
367 static force_inline __m64
368 expandAlphaRev_1x64 (__m64 data)
370 return _mm_shuffle_pi16 (data, _MM_SHUFFLE(0, 0, 0, 0));
373 static force_inline __m64
374 expandPixel_8_1x64 (uint8_t data)
376 return _mm_shuffle_pi16 (unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE(0, 0, 0, 0));
379 static force_inline __m64
380 pixMultiply_1x64 (__m64 data, __m64 alpha)
382 return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
387 static force_inline __m64
388 pixAddMultiply_1x64 (__m64* src, __m64* alphaDst, __m64* dst, __m64* alphaSrc)
390 return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (*src, *alphaDst),
392 _mm_mullo_pi16 (*dst, *alphaSrc)),
396 static force_inline __m64
397 negate_1x64 (__m64 data)
399 return _mm_xor_si64 (data, xMask00ff);
402 static force_inline __m64
403 invertColors_1x64 (__m64 data)
405 return _mm_shuffle_pi16 (data, _MM_SHUFFLE(3, 0, 1, 2));
408 static force_inline __m64
409 over_1x64 (__m64 src, __m64 alpha, __m64 dst)
411 return _mm_adds_pu8 (src, pixMultiply_1x64 (dst, negate_1x64 (alpha)));
414 static force_inline __m64
415 inOver_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
417 return over_1x64 (pixMultiply_1x64 (*src, *mask),
418 pixMultiply_1x64 (*alpha, *mask),
422 static force_inline __m64
423 overRevNonPre_1x64 (__m64 src, __m64 dst)
425 __m64 alpha = expandAlpha_1x64 (src);
427 return over_1x64 (pixMultiply_1x64 (invertColors_1x64 (src),
428 _mm_or_si64 (alpha, xMaskAlpha)),
433 static force_inline uint32_t
434 pack_1x64_32( __m64 data )
436 return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64()));
439 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
443 * --- Expanding 565 in the low word ---
445 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
 * m = m & (0x01f0003f001f);
 * m = m * (0x008404100840);
450 * Note the trick here - the top word is shifted by another nibble to
451 * avoid it bumping into the middle word
453 static force_inline __m64
454 expand565_16_1x64 (uint16_t pixel)
459 p = _mm_cvtsi32_si64 ((uint32_t) pixel);
461 t1 = _mm_slli_si64 (p, 36 - 11);
462 t2 = _mm_slli_si64 (p, 16 - 5);
464 p = _mm_or_si64 (t1, p);
465 p = _mm_or_si64 (t2, p);
466 p = _mm_and_si64 (p, xMask565rgb);
467 p = _mm_mullo_pi16 (p, xMask565Unpack);
469 return _mm_srli_pi16 (p, 8);
472 /* -------------------------------------------------------------------------------------------------
473 * Compose Core transformations
475 static force_inline uint32_t
476 coreCombineOverUPixelsse2 (uint32_t src, uint32_t dst)
489 ms = unpack_32_1x64 (src);
490 return pack_1x64_32 (over_1x64 (ms, expandAlpha_1x64 (ms), unpack_32_1x64 (dst)));
496 static force_inline uint32_t
497 combine1 (const uint32_t *ps, const uint32_t *pm)
505 mm = unpack_32_1x64 (*pm);
506 mm = expandAlpha_1x64 (mm);
508 ms = unpack_32_1x64 (s);
509 ms = pixMultiply_1x64 (ms, mm);
511 s = pack_1x64_32 (ms);
517 static force_inline __m128i
518 combine4 (const __m128i *ps, const __m128i *pm)
520 __m128i xmmSrcLo, xmmSrcHi;
521 __m128i xmmMskLo, xmmMskHi;
526 xmmMskLo = load128Unaligned (pm);
528 if (isTransparent (xmmMskLo))
529 return _mm_setzero_si128 ();
532 s = load128Unaligned (ps);
536 unpack_128_2x128 (s, &xmmSrcLo, &xmmSrcHi);
537 unpack_128_2x128 (xmmMskLo, &xmmMskLo, &xmmMskHi);
539 expandAlpha_2x128 (xmmMskLo, xmmMskHi, &xmmMskLo, &xmmMskHi);
541 pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMskLo, &xmmMskHi, &xmmSrcLo, &xmmSrcHi);
543 s = pack_2x128_128 (xmmSrcLo, xmmSrcHi);
static force_inline void
coreCombineOverUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
/* Porter-Duff OVER of w source pixels onto pd (mask pm optional, may be
 * NULL).  Pattern: scalar pixels until pd is 16-byte aligned, then 4-pixel
 * SSE2 batches, then a scalar tail.
 * NOTE(review): structural lines (braces, loop headers, pointer advances)
 * are missing from this excerpt; the code below is preserved as found. */
__m128i xmmDstLo, xmmDstHi;
__m128i xmmSrcLo, xmmSrcHi;
__m128i xmmAlphaLo, xmmAlphaHi;
/* call prefetch hint to optimize cache load*/
cachePrefetch ((__m128i*)ps);
cachePrefetch ((__m128i*)pd);
cachePrefetch ((__m128i*)pm);
/* Align dst on a 16-byte boundary */
((unsigned long)pd & 15))
s = combine1 (ps, pm);
*pd++ = coreCombineOverUPixelsse2 (s, d);
/* call prefetch hint to optimize cache load*/
cachePrefetch ((__m128i*)ps);
cachePrefetch ((__m128i*)pd);
cachePrefetch ((__m128i*)pm);
/* fill cache line with next memory */
cachePrefetchNext ((__m128i*)ps);
cachePrefetchNext ((__m128i*)pd);
cachePrefetchNext ((__m128i*)pm);
/* I'm loading unaligned because I'm not sure about the address alignment. */
xmmSrcHi = combine4 ((__m128i*)ps, (__m128i*)pm);
/* fully opaque batch: plain copy, no blend needed */
if (isOpaque (xmmSrcHi))
save128Aligned ((__m128i*)pd, xmmSrcHi);
/* fully transparent batch: destination already correct */
else if (!isZero (xmmSrcHi))
xmmDstHi = load128Aligned ((__m128i*) pd);
unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
over_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);
/* rebuid the 4 pixel data and save*/
save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
/* scalar tail for the remaining (w % 4) pixels */
s = combine1 (ps, pm);
*pd++ = coreCombineOverUPixelsse2 (s, d);
static force_inline void
coreCombineOverReverseUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
/* OVER_REVERSE: dst composited OVER src -- note the swapped arguments to
 * the per-pixel helper and to over_2x128, which use dst's alpha.
 * NOTE(review): structural lines (braces, loop headers, pointer advances)
 * are missing from this excerpt; the code below is preserved as found. */
__m128i xmmDstLo, xmmDstHi;
__m128i xmmSrcLo, xmmSrcHi;
__m128i xmmAlphaLo, xmmAlphaHi;
/* call prefetch hint to optimize cache load*/
cachePrefetch ((__m128i*)ps);
cachePrefetch ((__m128i*)pd);
cachePrefetch ((__m128i*)pm);
/* Align dst on a 16-byte boundary */
((unsigned long)pd & 15))
s = combine1 (ps, pm);
/* dst is the first operand: dst OVER src */
*pd++ = coreCombineOverUPixelsse2 (d, s);
/* call prefetch hint to optimize cache load*/
cachePrefetch ((__m128i*)ps);
cachePrefetch ((__m128i*)pd);
cachePrefetch ((__m128i*)pm);
/* fill cache line with next memory */
cachePrefetchNext ((__m128i*)ps);
cachePrefetchNext ((__m128i*)pd);
cachePrefetchNext ((__m128i*)pm);
/* I'm loading unaligned because I'm not sure about the address alignment. */
xmmSrcHi = combine4 ((__m128i*)ps, (__m128i*)pm);
xmmDstHi = load128Aligned ((__m128i*) pd);
unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
/* the blend uses the destination's alpha */
expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
over_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmSrcLo, &xmmSrcHi);
/* rebuid the 4 pixel data and save*/
save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmSrcLo, xmmSrcHi));
/* scalar tail */
s = combine1 (ps, pm);
*pd++ = coreCombineOverUPixelsse2 (d, s);
705 static force_inline uint32_t
706 coreCombineInUPixelsse2 (uint32_t src, uint32_t dst)
708 uint32_t maska = src >> 24;
714 else if (maska != 0xff)
716 return pack_1x64_32(pixMultiply_1x64 (unpack_32_1x64 (dst), expandAlpha_1x64 (unpack_32_1x64 (src))));
static force_inline void
coreCombineInUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
/* Porter-Duff IN over w pixels: result = src * dst.alpha (see the
 * pixMultiply call of src by the expanded dst alpha).  Mask pm optional.
 * NOTE(review): structural lines are missing from this excerpt; the code
 * below is preserved as found. */
__m128i xmmSrcLo, xmmSrcHi;
__m128i xmmDstLo, xmmDstHi;
/* call prefetch hint to optimize cache load*/
cachePrefetch ((__m128i*)ps);
cachePrefetch ((__m128i*)pd);
cachePrefetch ((__m128i*)pm);
while (w && ((unsigned long) pd & 15))
s = combine1 (ps, pm);
/* helper computes second-arg * first-arg.alpha, i.e. s * d.alpha */
*pd++ = coreCombineInUPixelsse2 (d, s);
/* call prefetch hint to optimize cache load*/
cachePrefetch ((__m128i*)ps);
cachePrefetch ((__m128i*)pd);
cachePrefetch ((__m128i*)pm);
/* fill cache line with next memory */
cachePrefetchNext ((__m128i*)ps);
cachePrefetchNext ((__m128i*)pd);
cachePrefetchNext ((__m128i*)pm);
xmmDstHi = load128Aligned ((__m128i*) pd);
xmmSrcHi = combine4 ((__m128i*) ps, (__m128i*) pm);
unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
/* dst registers now hold dst.alpha broadcast per component */
expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);
unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);
save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
/* scalar tail */
s = combine1 (ps, pm);
*pd++ = coreCombineInUPixelsse2 (d, s);
static force_inline void
coreCombineReverseInUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
/* IN_REVERSE over w pixels: result = dst * src.alpha (the roles of src and
 * dst are swapped relative to coreCombineInUsse2).  Mask pm optional.
 * NOTE(review): structural lines are missing from this excerpt; the code
 * below is preserved as found. */
__m128i xmmSrcLo, xmmSrcHi;
__m128i xmmDstLo, xmmDstHi;
/* call prefetch hint to optimize cache load*/
cachePrefetch ((__m128i*)ps);
cachePrefetch ((__m128i*)pd);
cachePrefetch ((__m128i*)pm);
while (w && ((unsigned long) pd & 15))
s = combine1 (ps, pm);
/* helper computes second-arg * first-arg.alpha, i.e. d * s.alpha */
*pd++ = coreCombineInUPixelsse2 (s, d);
/* call prefetch hint to optimize cache load*/
cachePrefetch ((__m128i*)ps);
cachePrefetch ((__m128i*)pd);
cachePrefetch ((__m128i*)pm);
/* fill cache line with next memory */
cachePrefetchNext ((__m128i*)ps);
cachePrefetchNext ((__m128i*)pd);
cachePrefetchNext ((__m128i*)pm);
xmmDstHi = load128Aligned ((__m128i*) pd);
xmmSrcHi = combine4 ((__m128i*) ps, (__m128i*)pm);
unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
/* src registers now hold src.alpha broadcast per component */
expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi);
save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
/* scalar tail */
s = combine1 (ps, pm);
*pd++ = coreCombineInUPixelsse2 (s, d);
static force_inline void
coreCombineReverseOutUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
/* OUT_REVERSE over w pixels: result = dst * (255 - src.alpha).
 * Mask pm optional.
 * NOTE(review): structural lines are missing from this excerpt; the code
 * below is preserved as found. */
/* call prefetch hint to optimize cache load*/
cachePrefetch ((__m128i*)ps);
cachePrefetch ((__m128i*)pd);
cachePrefetch ((__m128i*)pm);
while (w && ((unsigned long) pd & 15))
uint32_t s = combine1 (ps, pm);
/* d * (255 - s.alpha) */
*pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (s)))));
/* call prefetch hint to optimize cache load*/
cachePrefetch ((__m128i*)ps);
cachePrefetch ((__m128i*)pd);
cachePrefetch ((__m128i*)pm);
__m128i xmmSrcLo, xmmSrcHi;
__m128i xmmDstLo, xmmDstHi;
/* fill cache line with next memory */
cachePrefetchNext ((__m128i*)ps);
cachePrefetchNext ((__m128i*)pd);
cachePrefetchNext ((__m128i*)pm);
xmmSrcHi = combine4 ((__m128i*)ps, (__m128i*)pm);
xmmDstHi = load128Aligned ((__m128i*) pd);
unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
/* src registers become (255 - src.alpha) broadcast per component */
expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
negate_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi);
save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
/* scalar tail */
uint32_t s = combine1 (ps, pm);
*pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (s)))));
static force_inline void
coreCombineOutUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
/* OUT over w pixels: result = src * (255 - dst.alpha).  Mask pm optional.
 * NOTE(review): structural lines are missing from this excerpt; the code
 * below is preserved as found. */
/* call prefetch hint to optimize cache load*/
cachePrefetch ((__m128i*)ps);
cachePrefetch ((__m128i*)pd);
cachePrefetch ((__m128i*)pm);
while (w && (unsigned long)pd & 15)
uint32_t s = combine1 (ps, pm);
/* s * (255 - d.alpha) */
*pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
/* call prefetch hint to optimize cache load*/
cachePrefetch ((__m128i*)ps);
cachePrefetch ((__m128i*)pd);
cachePrefetch ((__m128i*)pm);
__m128i xmmSrcLo, xmmSrcHi;
__m128i xmmDstLo, xmmDstHi;
/* fill cache line with next memory */
cachePrefetchNext ((__m128i*)ps);
cachePrefetchNext ((__m128i*)pd);
cachePrefetchNext ((__m128i*)pm);
xmmSrcHi = combine4 ((__m128i*) ps, (__m128i*)pm);
xmmDstHi = load128Aligned ((__m128i*) pd);
unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
/* dst registers become (255 - dst.alpha) broadcast per component */
expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);
negate_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);
pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);
save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
/* scalar tail */
uint32_t s = combine1 (ps, pm);
*pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
994 static force_inline uint32_t
995 coreCombineAtopUPixelsse2 (uint32_t src, uint32_t dst)
997 __m64 s = unpack_32_1x64 (src);
998 __m64 d = unpack_32_1x64 (dst);
1000 __m64 sa = negate_1x64 (expandAlpha_1x64 (s));
1001 __m64 da = expandAlpha_1x64 (d);
1003 return pack_1x64_32 (pixAddMultiply_1x64 (&s, &da, &d, &sa));
static force_inline void
coreCombineAtopUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
/* ATOP over w pixels: src * dst.alpha + dst * (255 - src.alpha).
 * Mask pm optional.
 * NOTE(review): structural lines are missing from this excerpt; the code
 * below is preserved as found. */
__m128i xmmSrcLo, xmmSrcHi;
__m128i xmmDstLo, xmmDstHi;
__m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
__m128i xmmAlphaDstLo, xmmAlphaDstHi;
/* call prefetch hint to optimize cache load*/
cachePrefetch ((__m128i*)ps);
cachePrefetch ((__m128i*)pd);
cachePrefetch ((__m128i*)pm);
while (w && ((unsigned long) pd & 15))
s = combine1 (ps, pm);
*pd++ = coreCombineAtopUPixelsse2 (s, d);
/* call prefetch hint to optimize cache load*/
cachePrefetch ((__m128i*)ps);
cachePrefetch ((__m128i*)pd);
cachePrefetch ((__m128i*)pm);
/* fill cache line with next memory */
cachePrefetchNext ((__m128i*)ps);
cachePrefetchNext ((__m128i*)pd);
cachePrefetchNext ((__m128i*)pm);
xmmSrcHi = combine4 ((__m128i*)ps, (__m128i*)pm);
xmmDstHi = load128Aligned ((__m128i*) pd);
unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
/* only the source alpha is complemented for ATOP */
negate_2x128 (xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
pixAddMultiply_2x128 ( &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
&xmmDstLo, &xmmDstHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi,
&xmmDstLo, &xmmDstHi );
save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
/* scalar tail */
s = combine1 (ps, pm);
*pd++ = coreCombineAtopUPixelsse2 (s, d);
1082 static force_inline uint32_t
1083 coreCombineReverseAtopUPixelsse2 (uint32_t src, uint32_t dst)
1085 __m64 s = unpack_32_1x64 (src);
1086 __m64 d = unpack_32_1x64 (dst);
1088 __m64 sa = expandAlpha_1x64 (s);
1089 __m64 da = negate_1x64 (expandAlpha_1x64 (d));
1091 return pack_1x64_32 (pixAddMultiply_1x64 (&s, &da, &d, &sa));
static force_inline void
coreCombineReverseAtopUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
/* ATOP_REVERSE over w pixels: src * (255 - dst.alpha) + dst * src.alpha.
 * Mask pm optional.
 * NOTE(review): structural lines are missing from this excerpt; the code
 * below is preserved as found. */
__m128i xmmSrcLo, xmmSrcHi;
__m128i xmmDstLo, xmmDstHi;
__m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
__m128i xmmAlphaDstLo, xmmAlphaDstHi;
/* call prefetch hint to optimize cache load*/
cachePrefetch ((__m128i*)ps);
cachePrefetch ((__m128i*)pd);
cachePrefetch ((__m128i*)pm);
while (w && ((unsigned long) pd & 15))
s = combine1 (ps, pm);
*pd++ = coreCombineReverseAtopUPixelsse2 (s, d);
/* call prefetch hint to optimize cache load*/
cachePrefetch ((__m128i*)ps);
cachePrefetch ((__m128i*)pd);
cachePrefetch ((__m128i*)pm);
/* fill cache line with next memory */
cachePrefetchNext ((__m128i*)ps);
cachePrefetchNext ((__m128i*)pd);
cachePrefetchNext ((__m128i*)pm);
xmmSrcHi = combine4 ((__m128i*)ps, (__m128i*)pm);
xmmDstHi = load128Aligned ((__m128i*) pd);
unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
/* only the destination alpha is complemented for ATOP_REVERSE */
negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
pixAddMultiply_2x128 ( &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
&xmmDstLo, &xmmDstHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi,
&xmmDstLo, &xmmDstHi );
save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
/* scalar tail */
s = combine1 (ps, pm);
*pd++ = coreCombineReverseAtopUPixelsse2 (s, d);
1170 static force_inline uint32_t
1171 coreCombineXorUPixelsse2 (uint32_t src, uint32_t dst)
1173 __m64 s = unpack_32_1x64 (src);
1174 __m64 d = unpack_32_1x64 (dst);
1176 __m64 negD = negate_1x64 (expandAlpha_1x64 (d));
1177 __m64 negS = negate_1x64 (expandAlpha_1x64 (s));
1179 return pack_1x64_32 (pixAddMultiply_1x64 (&s, &negD, &d, &negS));
static force_inline void
coreCombineXorUsse2 (uint32_t* dst, const uint32_t* src, const uint32_t *mask, int width)
/* XOR over width pixels: src*(255 - dst.alpha) + dst*(255 - src.alpha).
 * Mask optional (may be NULL).
 * NOTE(review): structural lines (braces, loop headers, 'pd'/'w'
 * declarations, pointer advances) are missing from this excerpt; the code
 * below is preserved as found. */
const uint32_t* ps = src;
const uint32_t* pm = mask;
__m128i xmmSrc, xmmSrcLo, xmmSrcHi;
__m128i xmmDst, xmmDstLo, xmmDstHi;
__m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
__m128i xmmAlphaDstLo, xmmAlphaDstHi;
/* call prefetch hint to optimize cache load*/
cachePrefetch ((__m128i*)ps);
cachePrefetch ((__m128i*)pd);
cachePrefetch ((__m128i*)pm);
while (w && ((unsigned long) pd & 15))
s = combine1 (ps, pm);
*pd++ = coreCombineXorUPixelsse2 (s, d);
/* call prefetch hint to optimize cache load*/
cachePrefetch ((__m128i*)ps);
cachePrefetch ((__m128i*)pd);
cachePrefetch ((__m128i*)pm);
/* fill cache line with next memory */
cachePrefetchNext ((__m128i*)ps);
cachePrefetchNext ((__m128i*)pd);
cachePrefetchNext ((__m128i*)pm);
xmmSrc = combine4 ((__m128i*) ps, (__m128i*) pm);
xmmDst = load128Aligned ((__m128i*) pd);
unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
/* both alphas are complemented for XOR */
negate_2x128 (xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
pixAddMultiply_2x128 ( &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
&xmmDstLo, &xmmDstHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi,
&xmmDstLo, &xmmDstHi );
save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
/* scalar tail */
s = combine1 (ps, pm);
*pd++ = coreCombineXorUPixelsse2 (s, d);
static force_inline void
coreCombineAddUsse2 (uint32_t* dst, const uint32_t* src, const uint32_t* mask, int width)
/* ADD over width pixels: saturating per-byte src + dst.  Mask optional.
 * NOTE(review): structural lines (braces, loop headers, 'pd'/'w'
 * declarations, pointer advances) are missing from this excerpt; the code
 * below is preserved as found. */
const uint32_t* ps = src;
const uint32_t* pm = mask;
/* call prefetch hint to optimize cache load*/
cachePrefetch ((__m128i*)ps);
cachePrefetch ((__m128i*)pd);
cachePrefetch ((__m128i*)pm);
while (w && (unsigned long)pd & 15)
s = combine1 (ps, pm);
/* saturating byte-wise add of one pixel pair via MMX */
*pd++ = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
/* call prefetch hint to optimize cache load*/
cachePrefetch ((__m128i*)ps);
cachePrefetch ((__m128i*)pd);
cachePrefetch ((__m128i*)pm);
/* fill cache line with next memory */
cachePrefetchNext ((__m128i*)ps);
cachePrefetchNext ((__m128i*)pd);
cachePrefetchNext ((__m128i*)pm);
s = combine4((__m128i*)ps,(__m128i*)pm);
save128Aligned( (__m128i*)pd,
_mm_adds_epu8( s, load128Aligned ((__m128i*)pd)) );
/* scalar tail */
s = combine1 (ps, pm);
*pd++ = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
static force_inline uint32_t
coreCombineSaturateUPixelsse2 (uint32_t src, uint32_t dst)
/* SATURATE for one pixel: add src onto dst, first scaling src down by
 * da/sa when the source alpha exceeds dst's remaining headroom.
 * NOTE(review): the guard comparing sa with da (and the braces) is missing
 * from this excerpt; code is preserved as found.  IntDiv is defined
 * elsewhere in the file. */
__m64 ms = unpack_32_1x64 (src);
__m64 md = unpack_32_1x64 (dst);
uint32_t sa = src >> 24;
uint32_t da = ~dst >> 24;   /* remaining alpha headroom in dst */
ms = pixMultiply_1x64 (ms, expandAlpha_1x64 (unpack_32_1x64 (IntDiv(da, sa) << 24)));
return pack_1x64_32 (_mm_adds_pu16 (md, ms));
static force_inline void
coreCombineSaturateUsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
/* SATURATE over w pixels.  The 4-pixel path first tests (via movemask on a
 * per-pixel compare) whether any source alpha exceeds the destination's
 * remaining headroom; if so it falls back to four scalar pixels, otherwise
 * a plain saturating add suffices.
 * NOTE(review): structural lines are missing from this excerpt; the code
 * below is preserved as found. */
__m128i xmmSrc, xmmDst;
/* call prefetch hint to optimize cache load*/
cachePrefetch ((__m128i*)ps);
cachePrefetch ((__m128i*)pd);
cachePrefetch ((__m128i*)pm);
while (w && (unsigned long)pd & 15)
s = combine1 (ps, pm);
*pd++ = coreCombineSaturateUPixelsse2 (s, d);
/* call prefetch hint to optimize cache load*/
cachePrefetch ((__m128i*)ps);
cachePrefetch ((__m128i*)pd);
cachePrefetch ((__m128i*)pm);
/* fill cache line with next memory */
cachePrefetchNext ((__m128i*)ps);
cachePrefetchNext ((__m128i*)pd);
cachePrefetchNext ((__m128i*)pm);
xmmDst = load128Aligned ((__m128i*)pd);
xmmSrc = combine4 ((__m128i*)ps, (__m128i*)pm);
/* compare each src alpha against ~dst alpha (headroom) */
packCmp = _mm_movemask_epi8 (_mm_cmpgt_epi32 (_mm_srli_epi32 (xmmSrc, 24),
_mm_srli_epi32 (_mm_xor_si128 (xmmDst, Maskff000000), 24)));
/* if some alpha src is grater than respective ~alpha dst */
s = combine1 (ps++, pm);
*pd++ = coreCombineSaturateUPixelsse2 (s, d);
s = combine1 (ps++, pm);
*pd++ = coreCombineSaturateUPixelsse2 (s, d);
s = combine1 (ps++, pm);
*pd++ = coreCombineSaturateUPixelsse2 (s, d);
s = combine1 (ps++, pm);
*pd++ = coreCombineSaturateUPixelsse2 (s, d);
/* no overflow possible: saturating add handles all four at once */
save128Aligned ((__m128i*)pd, _mm_adds_epu8 (xmmDst, xmmSrc));
/* scalar tail */
s = combine1 (ps, pm);
*pd++ = coreCombineSaturateUPixelsse2 (s, d);
static force_inline void
coreCombineSrcCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
/* Component-alpha SRC: each destination pixel becomes src * mask, per
 * channel (mask pm is required here -- it is loaded unconditionally).
 * NOTE(review): structural lines (braces, loop headers, 's'/'m'
 * declarations, pointer advances) are missing from this excerpt; the code
 * below is preserved as found. */
__m128i xmmSrcLo, xmmSrcHi;
__m128i xmmMaskLo, xmmMaskHi;
__m128i xmmDstLo, xmmDstHi;
/* call prefetch hint to optimize cache load*/
cachePrefetch ((__m128i*)ps);
cachePrefetch ((__m128i*)pd);
cachePrefetch ((__m128i*)pm);
while (w && (unsigned long)pd & 15)
*pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
/* call prefetch hint to optimize cache load*/
cachePrefetch ((__m128i*)ps);
cachePrefetch ((__m128i*)pd);
cachePrefetch ((__m128i*)pm);
/* fill cache line with next memory */
cachePrefetchNext ((__m128i*)ps);
cachePrefetchNext ((__m128i*)pd);
cachePrefetchNext ((__m128i*)pm);
xmmSrcHi = load128Unaligned ((__m128i*)ps);
xmmMaskHi = load128Unaligned ((__m128i*)pm);
unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);
save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
/* scalar tail */
*pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
/*
 * Single-pixel component-alpha OVER: in/over of src (with its expanded
 * alpha) against mask and dst, using 64-bit (one-pixel) helpers.
 */
1492 static force_inline uint32_t
1493 coreCombineOverCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
1495 __m64 s = unpack_32_1x64 (src);
1496 __m64 expAlpha = expandAlpha_1x64 (s);
1497 __m64 unpkMask = unpack_32_1x64 (mask);
1498 __m64 unpkDst = unpack_32_1x64 (dst);
1500 return pack_1x64_32 (inOver_1x64 (&s, &expAlpha, &unpkMask, &unpkDst));
/*
 * SSE2 component-alpha OVER combiner for `w` pixels.
 * Same head / 4-pixel-aligned middle / tail structure as the other
 * combiners; the vector path expands src alpha and applies inOver_2x128.
 * NOTE(review): extract omits loop headers and s/m/d loads.
 */
1503 static force_inline void
1504 coreCombineOverCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
1508 __m128i xmmAlphaLo, xmmAlphaHi;
1509 __m128i xmmSrcLo, xmmSrcHi;
1510 __m128i xmmDstLo, xmmDstHi;
1511 __m128i xmmMaskLo, xmmMaskHi;
1513 /* call prefetch hint to optimize cache load*/
1514 cachePrefetch ((__m128i*)ps);
1515 cachePrefetch ((__m128i*)pd);
1516 cachePrefetch ((__m128i*)pm);
/* head: single pixels until pd is 16-byte aligned */
1518 while (w && (unsigned long)pd & 15)
1524 *pd++ = coreCombineOverCPixelsse2 (s, m, d);
1528 /* call prefetch hint to optimize cache load*/
1529 cachePrefetch ((__m128i*)ps);
1530 cachePrefetch ((__m128i*)pd);
1531 cachePrefetch ((__m128i*)pm);
1535 /* fill cache line with next memory */
1536 cachePrefetchNext ((__m128i*)ps);
1537 cachePrefetchNext ((__m128i*)pd);
1538 cachePrefetchNext ((__m128i*)pm);
1540 xmmDstHi = load128Aligned ((__m128i*)pd);
1541 xmmSrcHi = load128Unaligned ((__m128i*)ps);
1542 xmmMaskHi = load128Unaligned ((__m128i*)pm);
1544 unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
1545 unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
1546 unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
1548 expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
1550 inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);
1552 save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
/* tail: remaining (< 4) pixels */
1566 *pd++ = coreCombineOverCPixelsse2 (s, m, d);
/*
 * Single-pixel component-alpha OVER_REVERSE:
 * result = dst OVER (src * mask), using dst's expanded alpha.
 */
1571 static force_inline uint32_t
1572 coreCombineOverReverseCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
1574 __m64 d = unpack_32_1x64 (dst);
1576 return pack_1x64_32(over_1x64 (d, expandAlpha_1x64 (d), pixMultiply_1x64 (unpack_32_1x64 (src), unpack_32_1x64 (mask))));
/*
 * SSE2 component-alpha OVER_REVERSE combiner for `w` pixels.
 * Vector path: expand dst alpha, multiply src*mask into the mask
 * registers, then dst OVER that product.
 * NOTE(review): extract omits loop headers and s/m/d loads.
 */
1579 static force_inline void
1580 coreCombineOverReverseCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
1584 __m128i xmmAlphaLo, xmmAlphaHi;
1585 __m128i xmmSrcLo, xmmSrcHi;
1586 __m128i xmmDstLo, xmmDstHi;
1587 __m128i xmmMaskLo, xmmMaskHi;
1589 /* call prefetch hint to optimize cache load*/
1590 cachePrefetch ((__m128i*)ps);
1591 cachePrefetch ((__m128i*)pd);
1592 cachePrefetch ((__m128i*)pm);
/* head: single pixels until pd is 16-byte aligned */
1594 while (w && (unsigned long)pd & 15)
1600 *pd++ = coreCombineOverReverseCPixelsse2 (s, m, d);
1604 /* call prefetch hint to optimize cache load*/
1605 cachePrefetch ((__m128i*)ps);
1606 cachePrefetch ((__m128i*)pd);
1607 cachePrefetch ((__m128i*)pm);
1611 /* fill cache line with next memory */
1612 cachePrefetchNext ((__m128i*)ps);
1613 cachePrefetchNext ((__m128i*)pd);
1614 cachePrefetchNext ((__m128i*)pm);
1616 xmmDstHi = load128Aligned ((__m128i*)pd);
1617 xmmSrcHi = load128Unaligned ((__m128i*)ps);
1618 xmmMaskHi = load128Unaligned ((__m128i*)pm);
1620 unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
1621 unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
1622 unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
1624 expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
/* src*mask is computed in place into the mask registers */
1625 pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
1627 over_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMaskLo, &xmmMaskHi);
1629 save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmMaskLo, xmmMaskHi));
/* tail: remaining (< 4) pixels */
1643 *pd++ = coreCombineOverReverseCPixelsse2 (s, m, d);
/*
 * SSE2 component-alpha IN combiner: dest = (src * mask) * dst.alpha.
 * NOTE(review): extract omits loop headers and s/m/d loads.
 */
1648 static force_inline void
1649 coreCombineInCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
1653 __m128i xmmAlphaLo, xmmAlphaHi;
1654 __m128i xmmSrcLo, xmmSrcHi;
1655 __m128i xmmDstLo, xmmDstHi;
1656 __m128i xmmMaskLo, xmmMaskHi;
1658 /* call prefetch hint to optimize cache load*/
1659 cachePrefetch ((__m128i*)ps);
1660 cachePrefetch ((__m128i*)pd);
1661 cachePrefetch ((__m128i*)pm);
/* head: single pixels until pd is 16-byte aligned */
1663 while (w && (unsigned long)pd & 15)
1669 *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
1670 expandAlpha_1x64 (unpack_32_1x64 (d))))
;
1674 /* call prefetch hint to optimize cache load*/
1675 cachePrefetch ((__m128i*)ps);
1676 cachePrefetch ((__m128i*)pd);
1677 cachePrefetch ((__m128i*)pm);
1681 /* fill cache line with next memory */
1682 cachePrefetchNext ((__m128i*)ps);
1683 cachePrefetchNext ((__m128i*)pd);
1684 cachePrefetchNext ((__m128i*)pm);
1686 xmmDstHi = load128Aligned ((__m128i*)pd);
1687 xmmSrcHi = load128Unaligned ((__m128i*)ps);
1688 xmmMaskHi = load128Unaligned ((__m128i*)pm);
1690 unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
1691 unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
1692 unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
/* dst alpha is expanded before dst is overwritten by src*mask */
1694 expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
1695 pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);
1697 pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);
1699 save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
/* tail: remaining (< 4) pixels */
1713 *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
1714 expandAlpha_1x64 (unpack_32_1x64 (d))))
;
/*
 * SSE2 component-alpha IN_REVERSE combiner:
 * dest = dst * (mask * src.alpha).
 * NOTE(review): extract omits loop headers and s/m/d loads.
 */
1719 static force_inline void
1720 coreCombineInReverseCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
1724 __m128i xmmAlphaLo, xmmAlphaHi;
1725 __m128i xmmSrcLo, xmmSrcHi;
1726 __m128i xmmDstLo, xmmDstHi;
1727 __m128i xmmMaskLo, xmmMaskHi;
1729 /* call prefetch hint to optimize cache load*/
1730 cachePrefetch ((__m128i*)ps);
1731 cachePrefetch ((__m128i*)pd);
1732 cachePrefetch ((__m128i*)pm);
/* head: single pixels until pd is 16-byte aligned */
1734 while (w && (unsigned long)pd & 15)
1740 *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
1741 pixMultiply_1x64 (unpack_32_1x64 (m),
1742 expandAlpha_1x64 (unpack_32_1x64 (s)))));
1746 /* call prefetch hint to optimize cache load*/
1747 cachePrefetch ((__m128i*)ps);
1748 cachePrefetch ((__m128i*)pd);
1749 cachePrefetch ((__m128i*)pm);
1753 /* fill cache line with next memory */
1754 cachePrefetchNext ((__m128i*)ps);
1755 cachePrefetchNext ((__m128i*)pd);
1756 cachePrefetchNext ((__m128i*)pm);
1758 xmmDstHi = load128Aligned ((__m128i*)pd);
1759 xmmSrcHi = load128Unaligned ((__m128i*)ps);
1760 xmmMaskHi = load128Unaligned ((__m128i*)pm);
1762 unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
1763 unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
1764 unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
/* alpha registers end up holding mask * src.alpha */
1766 expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
1767 pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaLo, &xmmAlphaHi, &xmmAlphaLo, &xmmAlphaHi);
1769 pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);
1771 save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
/* tail: remaining (< 4) pixels */
1785 *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
1786 pixMultiply_1x64 (unpack_32_1x64 (m),
1787 expandAlpha_1x64 (unpack_32_1x64 (s)))));
/*
 * SSE2 component-alpha OUT combiner:
 * dest = (src * mask) * (1 - dst.alpha).
 * NOTE(review): extract omits loop headers and s/m/d loads.
 */
1792 static force_inline void
1793 coreCombineOutCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
1797 __m128i xmmAlphaLo, xmmAlphaHi;
1798 __m128i xmmSrcLo, xmmSrcHi;
1799 __m128i xmmDstLo, xmmDstHi;
1800 __m128i xmmMaskLo, xmmMaskHi;
1802 /* call prefetch hint to optimize cache load*/
1803 cachePrefetch ((__m128i*)ps);
1804 cachePrefetch ((__m128i*)pd);
1805 cachePrefetch ((__m128i*)pm);
/* head: single pixels until pd is 16-byte aligned */
1807 while (w && (unsigned long)pd & 15)
1813 *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
1814 negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
1818 /* call prefetch hint to optimize cache load*/
1819 cachePrefetch ((__m128i*)ps);
1820 cachePrefetch ((__m128i*)pd);
1821 cachePrefetch ((__m128i*)pm);
1825 /* fill cache line with next memory */
1826 cachePrefetchNext ((__m128i*)ps);
1827 cachePrefetchNext ((__m128i*)pd);
1828 cachePrefetchNext ((__m128i*)pm);
1830 xmmDstHi = load128Aligned ((__m128i*)pd);
1831 xmmSrcHi = load128Unaligned ((__m128i*)ps);
1832 xmmMaskHi = load128Unaligned ((__m128i*)pm);
1834 unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
1835 unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
1836 unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
/* alpha registers hold ~dst.alpha (complement) */
1838 expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
1839 negate_2x128 (xmmAlphaLo, xmmAlphaHi, &xmmAlphaLo, &xmmAlphaHi);
1841 pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);
1842 pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);
1844 save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
/* tail: remaining (< 4) pixels */
1858 *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
1859 negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
/*
 * SSE2 component-alpha OUT_REVERSE combiner:
 * dest = dst * (1 - mask * src.alpha).
 * NOTE(review): extract omits loop headers and s/m/d loads.
 */
1864 static force_inline void
1865 coreCombineOutReverseCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
1869 __m128i xmmAlphaLo, xmmAlphaHi;
1870 __m128i xmmSrcLo, xmmSrcHi;
1871 __m128i xmmDstLo, xmmDstHi;
1872 __m128i xmmMaskLo, xmmMaskHi;
1874 /* call prefetch hint to optimize cache load*/
1875 cachePrefetch ((__m128i*)ps);
1876 cachePrefetch ((__m128i*)pd);
1877 cachePrefetch ((__m128i*)pm);
/* head: single pixels until pd is 16-byte aligned */
1879 while (w && (unsigned long)pd & 15)
1885 *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
1886 negate_1x64 (pixMultiply_1x64 (unpack_32_1x64 (m),
1887 expandAlpha_1x64 (unpack_32_1x64 (s))))));
1891 /* call prefetch hint to optimize cache load*/
1892 cachePrefetch ((__m128i*)ps);
1893 cachePrefetch ((__m128i*)pd);
1894 cachePrefetch ((__m128i*)pm);
1898 /* fill cache line with next memory */
1899 cachePrefetchNext ((__m128i*)ps);
1900 cachePrefetchNext ((__m128i*)pd);
1901 cachePrefetchNext ((__m128i*)pm);
1903 xmmDstHi = load128Aligned ((__m128i*)pd);
1904 xmmSrcHi = load128Unaligned ((__m128i*)ps);
1905 xmmMaskHi = load128Unaligned ((__m128i*)pm);
1907 unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
1908 unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
1909 unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
1911 expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
/* mask registers become ~(mask * src.alpha) */
1913 pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMaskLo, &xmmMaskHi);
1915 negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
1917 pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);
1919 save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
/* tail: remaining (< 4) pixels */
1933 *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
1934 negate_1x64 (pixMultiply_1x64 (unpack_32_1x64 (m),
1935 expandAlpha_1x64 (unpack_32_1x64 (s))))));
/*
 * Single-pixel component-alpha ATOP:
 * result = d * ~(m * sa) + (s * m) * da  (via pixAddMultiply).
 */
1940 static force_inline uint32_t
1941 coreCombineAtopCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
1943 __m64 m = unpack_32_1x64 (mask);
1944 __m64 s = unpack_32_1x64 (src);
1945 __m64 d = unpack_32_1x64 (dst);
1946 __m64 sa = expandAlpha_1x64 (s);
1947 __m64 da = expandAlpha_1x64 (d);
/* s becomes src*mask; m becomes ~(mask*src.alpha) */
1949 s = pixMultiply_1x64 (s, m);
1950 m = negate_1x64 (pixMultiply_1x64 (m, sa));
1952 return pack_1x64_32 (pixAddMultiply_1x64 (&d, &m, &s, &da));
/*
 * SSE2 component-alpha ATOP combiner for `w` pixels; vector path mirrors
 * coreCombineAtopCPixelsse2 using 2x128 helpers.
 * NOTE(review): extract omits loop headers and s/m/d loads.
 */
1955 static force_inline void
1956 coreCombineAtopCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
1960 __m128i xmmSrcLo, xmmSrcHi;
1961 __m128i xmmDstLo, xmmDstHi;
1962 __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
1963 __m128i xmmAlphaDstLo, xmmAlphaDstHi;
1964 __m128i xmmMaskLo, xmmMaskHi;
1966 /* call prefetch hint to optimize cache load*/
1967 cachePrefetch ((__m128i*)ps);
1968 cachePrefetch ((__m128i*)pd);
1969 cachePrefetch ((__m128i*)pm);
/* head: single pixels until pd is 16-byte aligned */
1971 while (w && (unsigned long)pd & 15)
1977 *pd++ = coreCombineAtopCPixelsse2 (s, m, d);
1981 /* call prefetch hint to optimize cache load*/
1982 cachePrefetch ((__m128i*)ps);
1983 cachePrefetch ((__m128i*)pd);
1984 cachePrefetch ((__m128i*)pm);
1988 /* fill cache line with next memory */
1989 cachePrefetchNext ((__m128i*)ps);
1990 cachePrefetchNext ((__m128i*)pd);
1991 cachePrefetchNext ((__m128i*)pm);
1993 xmmDstHi = load128Aligned ((__m128i*)pd);
1994 xmmSrcHi = load128Unaligned ((__m128i*)ps);
1995 xmmMaskHi = load128Unaligned ((__m128i*)pm);
1997 unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
1998 unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
1999 unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
2001 expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
2002 expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
/* src := src*mask; mask := ~(mask*src.alpha) */
2004 pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
2005 pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi);
2007 negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
2009 pixAddMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi,
2010 &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
2011 &xmmDstLo, &xmmDstHi);
2013 save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
/* tail: remaining (< 4) pixels */
2027 *pd++ = coreCombineAtopCPixelsse2 (s, m, d);
/*
 * Single-pixel component-alpha ATOP_REVERSE:
 * result = d * (m * sa) + (s * m) * ~da  (via pixAddMultiply).
 */
2032 static force_inline uint32_t
2033 coreCombineReverseAtopCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
2035 __m64 m = unpack_32_1x64 (mask);
2036 __m64 s = unpack_32_1x64 (src);
2037 __m64 d = unpack_32_1x64 (dst);
2039 __m64 da = negate_1x64 (expandAlpha_1x64 (d));
2040 __m64 sa = expandAlpha_1x64 (s);
/* s becomes src*mask; m becomes mask*src.alpha */
2042 s = pixMultiply_1x64 (s, m);
2043 m = pixMultiply_1x64 (m, sa);
2045 return pack_1x64_32 (pixAddMultiply_1x64 (&d, &m, &s, &da));
/*
 * SSE2 component-alpha ATOP_REVERSE combiner for `w` pixels; vector path
 * mirrors coreCombineReverseAtopCPixelsse2 using 2x128 helpers.
 * NOTE(review): extract omits loop headers and s/m/d loads.
 */
2048 static force_inline void
2049 coreCombineReverseAtopCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
2053 __m128i xmmSrcLo, xmmSrcHi;
2054 __m128i xmmDstLo, xmmDstHi;
2055 __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
2056 __m128i xmmAlphaDstLo, xmmAlphaDstHi;
2057 __m128i xmmMaskLo, xmmMaskHi;
2059 /* call prefetch hint to optimize cache load*/
2060 cachePrefetch ((__m128i*)ps);
2061 cachePrefetch ((__m128i*)pd);
2062 cachePrefetch ((__m128i*)pm);
/* head: single pixels until pd is 16-byte aligned */
2064 while (w && (unsigned long)pd & 15)
2070 *pd++ = coreCombineReverseAtopCPixelsse2 (s, m, d);
2074 /* call prefetch hint to optimize cache load*/
2075 cachePrefetch ((__m128i*)ps);
2076 cachePrefetch ((__m128i*)pd);
2077 cachePrefetch ((__m128i*)pm);
2081 /* fill cache line with next memory */
2082 cachePrefetchNext ((__m128i*)ps);
2083 cachePrefetchNext ((__m128i*)pd);
2084 cachePrefetchNext ((__m128i*)pm);
2086 xmmDstHi = load128Aligned ((__m128i*)pd);
2087 xmmSrcHi = load128Unaligned ((__m128i*)ps);
2088 xmmMaskHi = load128Unaligned ((__m128i*)pm);
2090 unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
2091 unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
2092 unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
2094 expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
2095 expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
/* src := src*mask; mask := mask*src.alpha; dst alpha negated */
2097 pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
2098 pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi);
2100 negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
2102 pixAddMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi,
2103 &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
2104 &xmmDstLo, &xmmDstHi);
2106 save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
/* tail: remaining (< 4) pixels */
2120 *pd++ = coreCombineReverseAtopCPixelsse2 (s, m, d);
/*
 * Single-pixel component-alpha XOR:
 * result = d * ~(mask*src.alpha) + (src*mask) * ~dst.alpha.
 * NOTE(review): the pixAddMultiply_1x64 argument list is truncated in
 * this extract; the remaining arguments are on lines not visible here.
 */
2125 static force_inline uint32_t
2126 coreCombineXorCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
2128 __m64 a = unpack_32_1x64 (mask);
2129 __m64 s = unpack_32_1x64 (src);
2130 __m64 d = unpack_32_1x64 (dst);
2132 __m64 alphaDst = negate_1x64 (pixMultiply_1x64 (a, expandAlpha_1x64 (s)));
2133 __m64 dest = pixMultiply_1x64 (s, a);
2134 __m64 alphaSrc = negate_1x64 (expandAlpha_1x64 (d));
2136 return pack_1x64_32 (pixAddMultiply_1x64 (&d,
/*
 * SSE2 component-alpha XOR combiner for `w` pixels; vector path mirrors
 * coreCombineXorCPixelsse2 using 2x128 helpers (both products negated).
 * NOTE(review): extract omits loop headers and s/m/d loads.
 */
2142 static force_inline void
2143 coreCombineXorCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
2147 __m128i xmmSrcLo, xmmSrcHi;
2148 __m128i xmmDstLo, xmmDstHi;
2149 __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
2150 __m128i xmmAlphaDstLo, xmmAlphaDstHi;
2151 __m128i xmmMaskLo, xmmMaskHi;
2153 /* call prefetch hint to optimize cache load*/
2154 cachePrefetch ((__m128i*)ps);
2155 cachePrefetch ((__m128i*)pd);
2156 cachePrefetch ((__m128i*)pm);
/* head: single pixels until pd is 16-byte aligned */
2158 while (w && (unsigned long)pd & 15)
2164 *pd++ = coreCombineXorCPixelsse2 (s, m, d);
2168 /* call prefetch hint to optimize cache load*/
2169 cachePrefetch ((__m128i*)ps);
2170 cachePrefetch ((__m128i*)pd);
2171 cachePrefetch ((__m128i*)pm);
2175 /* fill cache line with next memory */
2176 cachePrefetchNext ((__m128i*)ps);
2177 cachePrefetchNext ((__m128i*)pd);
2178 cachePrefetchNext ((__m128i*)pm);
2180 xmmDstHi = load128Aligned ((__m128i*)pd);
2181 xmmSrcHi = load128Unaligned ((__m128i*)ps);
2182 xmmMaskHi = load128Unaligned ((__m128i*)pm);
2184 unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
2185 unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
2186 unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
2188 expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
2189 expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
/* src := src*mask; mask := ~(mask*src.alpha); dst alpha := ~dst.alpha */
2191 pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
2192 pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi);
2194 negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
2195 negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
2197 pixAddMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi,
2198 &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
2199 &xmmDstLo, &xmmDstHi);
2201 save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
/* tail: remaining (< 4) pixels */
2215 *pd++ = coreCombineXorCPixelsse2 (s, m, d);
/*
 * SSE2 component-alpha ADD combiner:
 * dest = saturate(dst + src * mask), per byte.
 * NOTE(review): extract omits loop headers and s/m/d loads.
 */
2220 static force_inline void
2221 coreCombineAddCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
2225 __m128i xmmSrcLo, xmmSrcHi;
2226 __m128i xmmDstLo, xmmDstHi;
2227 __m128i xmmMaskLo, xmmMaskHi;
2229 /* call prefetch hint to optimize cache load*/
2230 cachePrefetch ((__m128i*)ps);
2231 cachePrefetch ((__m128i*)pd);
2232 cachePrefetch ((__m128i*)pm);
/* head: single pixels until pd is 16-byte aligned */
2234 while (w && (unsigned long)pd & 15)
2240 *pd++ = pack_1x64_32 (_mm_adds_pu8 (pixMultiply_1x64 (unpack_32_1x64 (s),
2241 unpack_32_1x64 (m)),
2242 unpack_32_1x64 (d)));
2246 /* call prefetch hint to optimize cache load*/
2247 cachePrefetch ((__m128i*)ps);
2248 cachePrefetch ((__m128i*)pd);
2249 cachePrefetch ((__m128i*)pm);
2253 /* fill cache line with next memory */
2254 cachePrefetchNext ((__m128i*)ps);
2255 cachePrefetchNext ((__m128i*)pd);
2256 cachePrefetchNext ((__m128i*)pm);
2258 xmmSrcHi = load128Unaligned ((__m128i*)ps);
2259 xmmMaskHi = load128Unaligned ((__m128i*)pm);
2260 xmmDstHi = load128Aligned ((__m128i*)pd);
2262 unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
2263 unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
2264 unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
2266 pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
/* saturating unsigned byte add of the masked source onto dest */
2268 save128Aligned( (__m128i*)pd, pack_2x128_128 (_mm_adds_epu8 (xmmSrcLo, xmmDstLo),
2269 _mm_adds_epu8 (xmmSrcHi, xmmDstHi)));
/* tail: remaining (< 4) pixels */
2283 *pd++ = pack_1x64_32 (_mm_adds_pu8 (pixMultiply_1x64 (unpack_32_1x64 (s),
2284 unpack_32_1x64 (m)),
2285 unpack_32_1x64 (d)));
2290 /* -------------------------------------------------------------------------------------------------
2291 * fbComposeSetupSSE2
/* Broadcast a 16-bit value into all four lanes of a 64-bit MMX vector. */
2293 static force_inline __m64
2294 createMask_16_64 (uint16_t mask)
2296 return _mm_set1_pi16 (mask);
/* Broadcast a 16-bit value into all eight lanes of a 128-bit vector. */
2299 static force_inline __m128i
2300 createMask_16_128 (uint16_t mask)
2302 return _mm_set1_epi16 (mask);
/* Build a 64-bit vector: mask0 in the high 32 bits, mask1 in the low. */
2305 static force_inline __m64
2306 createMask_2x32_64 (uint32_t mask0, uint32_t mask1)
2308 return _mm_set_pi32 (mask0, mask1);
/* Build a 128-bit vector repeating the (mask0, mask1) pair twice. */
2311 static force_inline __m128i
2312 createMask_2x32_128 (uint32_t mask0, uint32_t mask1)
2314 return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2317 /* SSE2 code patch for fbcompose.c */
/*
 * Thin wrappers exposing the coreCombine*sse2 workers through pixman's
 * combiner function-pointer signature (imp and op are intentionally
 * unused — the operation is baked into each wrapper).
 * NOTE(review): return types, braces, and any trailing _mm_empty()
 * calls are on lines not visible in this extract.
 */
2320 sse2CombineOverU (pixman_implementation_t *imp, pixman_op_t op,
2321 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2323 coreCombineOverUsse2 (dst, src, mask, width);
2328 sse2CombineOverReverseU (pixman_implementation_t *imp, pixman_op_t op,
2329 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2331 coreCombineOverReverseUsse2 (dst, src, mask, width);
2336 sse2CombineInU (pixman_implementation_t *imp, pixman_op_t op,
2337 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2339 coreCombineInUsse2 (dst, src, mask, width);
2344 sse2CombineInReverseU (pixman_implementation_t *imp, pixman_op_t op,
2345 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2347 coreCombineReverseInUsse2 (dst, src, mask, width);
2352 sse2CombineOutU (pixman_implementation_t *imp, pixman_op_t op,
2353 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2355 coreCombineOutUsse2 (dst, src, mask, width);
2360 sse2CombineOutReverseU (pixman_implementation_t *imp, pixman_op_t op,
2361 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2363 coreCombineReverseOutUsse2 (dst, src, mask, width);
2368 sse2CombineAtopU (pixman_implementation_t *imp, pixman_op_t op,
2369 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2371 coreCombineAtopUsse2 (dst, src, mask, width);
2376 sse2CombineAtopReverseU (pixman_implementation_t *imp, pixman_op_t op,
2377 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2379 coreCombineReverseAtopUsse2 (dst, src, mask, width);
2384 sse2CombineXorU (pixman_implementation_t *imp, pixman_op_t op,
2385 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2387 coreCombineXorUsse2 (dst, src, mask, width);
2392 sse2CombineAddU (pixman_implementation_t *imp, pixman_op_t op,
2393 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2395 coreCombineAddUsse2 (dst, src, mask, width);
2400 sse2CombineSaturateU (pixman_implementation_t *imp, pixman_op_t op,
2401 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2403 coreCombineSaturateUsse2 (dst, src, mask, width);
2408 sse2CombineSrcC (pixman_implementation_t *imp, pixman_op_t op,
2409 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2411 coreCombineSrcCsse2 (dst, src, mask, width);
2416 sse2CombineOverC (pixman_implementation_t *imp, pixman_op_t op,
2417 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2419 coreCombineOverCsse2 (dst, src, mask, width);
2424 sse2CombineOverReverseC (pixman_implementation_t *imp, pixman_op_t op,
2425 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2427 coreCombineOverReverseCsse2 (dst, src, mask, width);
2432 sse2CombineInC (pixman_implementation_t *imp, pixman_op_t op,
2433 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2435 coreCombineInCsse2 (dst, src, mask, width);
2440 sse2CombineInReverseC (pixman_implementation_t *imp, pixman_op_t op,
2441 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2443 coreCombineInReverseCsse2 (dst, src, mask, width);
2448 sse2CombineOutC (pixman_implementation_t *imp, pixman_op_t op,
2449 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2451 coreCombineOutCsse2 (dst, src, mask, width);
2456 sse2CombineOutReverseC (pixman_implementation_t *imp, pixman_op_t op,
2457 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2459 coreCombineOutReverseCsse2 (dst, src, mask, width);
2464 sse2CombineAtopC (pixman_implementation_t *imp, pixman_op_t op,
2465 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2467 coreCombineAtopCsse2 (dst, src, mask, width);
2472 sse2CombineAtopReverseC (pixman_implementation_t *imp, pixman_op_t op,
2473 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2475 coreCombineReverseAtopCsse2 (dst, src, mask, width);
2480 sse2CombineXorC (pixman_implementation_t *imp, pixman_op_t op,
2481 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2483 coreCombineXorCsse2 (dst, src, mask, width);
2488 sse2CombineAddC (pixman_implementation_t *imp, pixman_op_t op,
2489 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2491 coreCombineAddCsse2 (dst, src, mask, width);
2495 /* -------------------------------------------------------------------------------------------------
2496 * fast_CompositeOver_n_8888
/*
 * Composite a solid color OVER an a8r8g8b8 destination.
 * The solid source and its expanded alpha are hoisted out of the loop;
 * each scanline does head/4-pixel/tail as in the combiners above.
 * NOTE(review): the parameter list, early-exit on zero src, and
 * scanline loop header are on lines not visible in this extract.
 */
2500 sse2_CompositeOver_n_8888 (pixman_implementation_t *imp,
2502 pixman_image_t * src_image,
2503 pixman_image_t * mask_image,
2504 pixman_image_t * dst_image,
2515 uint32_t *dstLine, *dst, d;
2518 __m128i xmmSrc, xmmAlpha;
2519 __m128i xmmDst, xmmDstLo, xmmDstHi;
2521 src = _pixman_image_get_solid(src_image, dst_image->bits.format);
2526 fbComposeGetStart (dst_image, xDst, yDst, uint32_t, dstStride, dstLine, 1);
/* loop-invariant: broadcast solid source pixel and its alpha */
2528 xmmSrc = expandPixel_32_1x128 (src);
2529 xmmAlpha = expandAlpha_1x128 (xmmSrc);
2535 /* call prefetch hint to optimize cache load*/
2536 cachePrefetch ((__m128i*)dst);
2538 dstLine += dstStride;
/* head: single pixels until dst is 16-byte aligned */
2541 while (w && (unsigned long)dst & 15)
2544 *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
2545 _mm_movepi64_pi64 (xmmAlpha),
2546 unpack_32_1x64 (d)));
2550 cachePrefetch ((__m128i*)dst);
2554 /* fill cache line with next memory */
2555 cachePrefetchNext ((__m128i*)dst);
2557 xmmDst = load128Aligned ((__m128i*)dst);
2559 unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
2561 over_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmDstLo, &xmmDstHi);
2563 /* rebuild the 4 pixel data and save*/
2564 save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
/* tail: remaining (< 4) pixels */
2573 *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
2574 _mm_movepi64_pi64 (xmmAlpha),
2575 unpack_32_1x64 (d)));
2583 /* -------------------------------------------------------------------------------------------------
2584 * fast_CompositeOver_n_0565
/*
 * Composite a solid color OVER an r5g6b5 destination.
 * Destination pixels are expanded 565 -> 8888, blended with over_2x128,
 * then repacked to 565; the aligned path handles 8 pixels (128 bits of
 * 16-bit pixels) per iteration via four 2-pixel register pairs.
 * NOTE(review): parameter list and loop headers are on lines not
 * visible in this extract.
 */
2587 sse2_CompositeOver_n_0565 (pixman_implementation_t *imp,
2589 pixman_image_t * src_image,
2590 pixman_image_t * mask_image,
2591 pixman_image_t * dst_image,
2602 uint16_t *dstLine, *dst, d;
2605 __m128i xmmSrc, xmmAlpha;
2606 __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;
2608 src = _pixman_image_get_solid(src_image, dst_image->bits.format);
2613 fbComposeGetStart (dst_image, xDst, yDst, uint16_t, dstStride, dstLine, 1);
/* loop-invariant: broadcast solid source pixel and its alpha */
2615 xmmSrc = expandPixel_32_1x128 (src);
2616 xmmAlpha = expandAlpha_1x128 (xmmSrc);
2622 /* call prefetch hint to optimize cache load*/
2623 cachePrefetch ((__m128i*)dst);
2625 dstLine += dstStride;
/* head: single pixels until dst is 16-byte aligned */
2628 while (w && (unsigned long)dst & 15)
2632 *dst++ = pack565_32_16 (pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
2633 _mm_movepi64_pi64 (xmmAlpha),
2634 expand565_16_1x64 (d))));
2638 /* call prefetch hint to optimize cache load*/
2639 cachePrefetch ((__m128i*)dst);
2643 /* fill cache line with next memory */
2644 cachePrefetchNext ((__m128i*)dst);
2646 xmmDst = load128Aligned ((__m128i*)dst);
2648 unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
2650 over_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmDst0, &xmmDst1);
2651 over_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmDst2, &xmmDst3);
2653 xmmDst = pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
2654 save128Aligned ((__m128i*)dst, xmmDst);
/* tail: remaining pixels, one at a time */
2663 *dst++ = pack565_32_16 (pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
2664 _mm_movepi64_pi64 (xmmAlpha),
2665 expand565_16_1x64 (d))));
2672 /* -------------------------------------------------------------------------------------------------
2673 * fast_CompositeOver_n_8888_8888_ca
/*
 * Composite a solid color OVER an a8r8g8b8 destination through an
 * a8r8g8b8 component-alpha mask.  Keeps the solid source and its alpha
 * in both SSE2 (xmm) and MMX-width (mmx) registers, and skips 4-pixel
 * groups whose mask is entirely zero.
 * NOTE(review): parameter list, loop headers, the m/d loads, and the
 * remaining inOver_1x64 arguments are on lines not visible here.
 */
2677 sse2_CompositeOver_n_8888_8888_ca (pixman_implementation_t *imp,
2679 pixman_image_t * src_image,
2680 pixman_image_t * mask_image,
2681 pixman_image_t * dst_image,
2692 uint32_t *dstLine, d;
2693 uint32_t *maskLine, m;
2695 int dstStride, maskStride;
2697 __m128i xmmSrc, xmmAlpha;
2698 __m128i xmmDst, xmmDstLo, xmmDstHi;
2699 __m128i xmmMask, xmmMaskLo, xmmMaskHi;
2701 __m64 mmxSrc, mmxAlpha, mmxMask, mmxDst;
2703 src = _pixman_image_get_solid(src_image, dst_image->bits.format);
2708 fbComposeGetStart (dst_image, xDst, yDst, uint32_t, dstStride, dstLine, 1);
2709 fbComposeGetStart (mask_image, xMask, yMask, uint32_t, maskStride, maskLine, 1);
/* loop-invariant: unpack the solid pixel and its alpha once */
2711 xmmSrc = _mm_unpacklo_epi8 (createMask_2x32_128 (src, src), _mm_setzero_si128 ());
2712 xmmAlpha = expandAlpha_1x128 (xmmSrc);
2713 mmxSrc = _mm_movepi64_pi64 (xmmSrc);
2714 mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);
2719 const uint32_t *pm = (uint32_t *)maskLine;
2720 uint32_t *pd = (uint32_t *)dstLine;
2722 dstLine += dstStride;
2723 maskLine += maskStride;
2725 /* call prefetch hint to optimize cache load*/
2726 cachePrefetch ((__m128i*)pd);
2727 cachePrefetch ((__m128i*)pm);
/* head: single pixels until pd is 16-byte aligned */
2729 while (w && (unsigned long)pd & 15)
2736 mmxMask = unpack_32_1x64 (m);
2737 mmxDst = unpack_32_1x64 (d);
2739 *pd = pack_1x64_32 (inOver_1x64 (&mmxSrc,
2749 /* call prefetch hint to optimize cache load*/
2750 cachePrefetch ((__m128i*)pd);
2751 cachePrefetch ((__m128i*)pm);
2755 /* fill cache line with next memory */
2756 cachePrefetchNext ((__m128i*)pd);
2757 cachePrefetchNext ((__m128i*)pm);
2759 xmmMask = load128Unaligned ((__m128i*)pm);
2761 packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128()));
2763 /* if all bits in mask are zero, packCmp is equal to 0xffff */
2764 if (packCmp != 0xffff)
2766 xmmDst = load128Aligned ((__m128i*)pd);
2768 unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
2769 unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
2771 inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);
2773 save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
/* tail: remaining (< 4) pixels */
2788 mmxMask = unpack_32_1x64 (m);
2789 mmxDst = unpack_32_1x64 (d);
2791 *pd = pack_1x64_32 (inOver_1x64 (&mmxSrc,
2806 /* -------------------------------------------------------------------------------------------------
2807 * fast_composite_over_8888_n_8888
/*
 * Composite an a8r8g8b8 source OVER an a8r8g8b8 destination, modulated
 * by the alpha of a solid mask (mask >> 24 broadcast into xmmMask).
 * NOTE(review): parameter list, the xmmMask declaration, scanline loop
 * header, d loads, and the trailing inOver_1x64 arguments are on lines
 * not visible in this extract.
 */
2811 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
2813 pixman_image_t * src_image,
2814 pixman_image_t * mask_image,
2815 pixman_image_t * dst_image,
2825 uint32_t *dstLine, *dst;
2826 uint32_t *srcLine, *src;
2829 int dstStride, srcStride;
2832 __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
2833 __m128i xmmDst, xmmDstLo, xmmDstHi;
2834 __m128i xmmAlphaLo, xmmAlphaHi;
2836 fbComposeGetStart (dst_image, xDst, yDst, uint32_t, dstStride, dstLine, 1);
2837 fbComposeGetStart (src_image, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
2838 mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
/* only the mask's alpha byte is used; replicate it across lanes */
2840 xmmMask = createMask_16_128 (mask >> 24);
2845 dstLine += dstStride;
2847 srcLine += srcStride;
2850 /* call prefetch hint to optimize cache load*/
2851 cachePrefetch ((__m128i*)dst);
2852 cachePrefetch ((__m128i*)src);
/* head: single pixels until dst is 16-byte aligned */
2854 while (w && (unsigned long)dst & 15)
2856 uint32_t s = *src++;
2859 __m64 ms = unpack_32_1x64 (s);
2860 __m64 alpha = expandAlpha_1x64 (ms);
2861 __m64 dest = _mm_movepi64_pi64 (xmmMask);
2862 __m64 alphaDst = unpack_32_1x64 (d);
2864 *dst++ = pack_1x64_32 (inOver_1x64 (&ms,
2872 /* call prefetch hint to optimize cache load*/
2873 cachePrefetch ((__m128i*)dst);
2874 cachePrefetch ((__m128i*)src);
2878 /* fill cache line with next memory */
2879 cachePrefetchNext ((__m128i*)dst);
2880 cachePrefetchNext ((__m128i*)src);
2882 xmmSrc = load128Unaligned ((__m128i*)src);
2883 xmmDst = load128Aligned ((__m128i*)dst);
2885 unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
2886 unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
2887 expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
2889 inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMask, &xmmMask, &xmmDstLo, &xmmDstHi);
2891 save128Aligned( (__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
/* tail: remaining (< 4) pixels */
2900 uint32_t s = *src++;
2903 __m64 ms = unpack_32_1x64 (s);
2904 __m64 alpha = expandAlpha_1x64 (ms);
2905 __m64 mask = _mm_movepi64_pi64 (xmmMask);
2906 __m64 dest = unpack_32_1x64 (d);
2908 *dst++ = pack_1x64_32 (inOver_1x64 (&ms,
2920 /* -------------------------------------------------------------------------------------------------
2921 * fast_Composite_over_x888_n_8888
/*
 * OVER-composites an x8r8g8b8 source (no alpha channel) onto an
 * a8r8g8b8 destination with a solid mask.  Each source pixel is forced
 * opaque by OR-ing in 0xff000000, so the per-pixel alpha used is the
 * constant Mask00ff / xmmAlpha.
 */
2924 sse2_Composite_over_x888_n_8888 (pixman_implementation_t *imp,
2926 pixman_image_t * src_image,
2927 pixman_image_t * mask_image,
2928 pixman_image_t * dst_image,
2938 uint32_t *dstLine, *dst;
2939 uint32_t *srcLine, *src;
2941 int dstStride, srcStride;
2944 __m128i xmmMask, xmmAlpha;
2945 __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
2946 __m128i xmmDst, xmmDstLo, xmmDstHi;
2948 fbComposeGetStart (dst_image, xDst, yDst, uint32_t, dstStride, dstLine, 1);
2949 fbComposeGetStart (src_image, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
2950 mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
/* Broadcast the solid mask's alpha; alpha of the (opaque) source is 0xff. */
2952 xmmMask = createMask_16_128 (mask >> 24);
2953 xmmAlpha = Mask00ff;
2958 dstLine += dstStride;
2960 srcLine += srcStride;
2963 /* call prefetch hint to optimize cache load */
2964 cachePrefetch ((__m128i*)dst);
2965 cachePrefetch ((__m128i*)src);
/* Head loop: single pixels until dst is 16-byte aligned. */
2967 while (w && (unsigned long)dst & 15)
2969 uint32_t s = (*src++) | 0xff000000;
2972 __m64 src = unpack_32_1x64 (s);
2973 __m64 alpha = _mm_movepi64_pi64 (xmmAlpha);
2974 __m64 mask = _mm_movepi64_pi64 (xmmMask);
2975 __m64 dest = unpack_32_1x64 (d);
2977 *dst++ = pack_1x64_32 (inOver_1x64 (&src,
2985 /* call prefetch hint to optimize cache load */
2986 cachePrefetch ((__m128i*)dst);
2987 cachePrefetch ((__m128i*)src);
2991 /* fill cache line with next memory */
2992 cachePrefetchNext ((__m128i*)dst);
2993 cachePrefetchNext ((__m128i*)dst);
2995 xmmSrc = _mm_or_si128 (load128Unaligned ((__m128i*)src), Maskff000000);
2996 xmmDst = load128Aligned ((__m128i*)dst);
2998 unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
2999 unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
3001 inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlpha, &xmmAlpha, &xmmMask, &xmmMask, &xmmDstLo, &xmmDstHi);
3003 save128Aligned( (__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
/* Tail loop: leftover pixels, one at a time. */
3013 uint32_t s = (*src++) | 0xff000000;
3016 __m64 src = unpack_32_1x64 (s);
3017 __m64 alpha = _mm_movepi64_pi64 (xmmAlpha);
3018 __m64 mask = _mm_movepi64_pi64 (xmmMask);
3019 __m64 dest = unpack_32_1x64 (d);
3021 *dst++ = pack_1x64_32 (inOver_1x64 (&src,
3033 /* -------------------------------------------------------------------------------------------------
3034 * fast_composite_over_8888_8888
/*
 * OVER-composites an a8r8g8b8 source onto an a8r8g8b8 destination with
 * no mask; each row delegates to the shared SSE2 OVER combiner.
 */
3037 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3039 pixman_image_t * src_image,
3040 pixman_image_t * mask_image,
3041 pixman_image_t * dst_image,
3051 int dstStride, srcStride;
3052 uint32_t *dstLine, *dst;
3053 uint32_t *srcLine, *src;
3055 fbComposeGetStart (dst_image, xDst, yDst, uint32_t, dstStride, dstLine, 1);
3056 fbComposeGetStart (src_image, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
/* Per-row: combine 'width' pixels with the generic unified OVER combiner. */
3063 coreCombineOverUsse2 (dst, src, NULL, width);
3071 /* -------------------------------------------------------------------------------------------------
3072 * fast_composite_over_8888_0565
/*
 * Composites a single a8r8g8b8 source pixel OVER a single r5g6b5
 * destination pixel: the 16-bit dst is expanded to 8888, blended with
 * OVER using the source's alpha, and packed back to 565.
 */
3074 static force_inline uint16_t
3075 fast_composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3079 ms = unpack_32_1x64 (src);
3080 return pack565_32_16( pack_1x64_32 (over_1x64 (ms,
3081 expandAlpha_1x64 (ms),
3082 expand565_16_1x64 (dst))));
/*
 * OVER-composites an a8r8g8b8 source onto an r5g6b5 destination.
 * The SSE2 main loop handles 8 pixels per iteration: the 565 dst is
 * unpacked into four 2-pixel 8888 halves, blended, and repacked.
 */
3086 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3088 pixman_image_t * src_image,
3089 pixman_image_t * mask_image,
3090 pixman_image_t * dst_image,
3100 uint16_t *dstLine, *dst, d;
3101 uint32_t *srcLine, *src, s;
3102 int dstStride, srcStride;
3105 __m128i xmmAlphaLo, xmmAlphaHi;
3106 __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
3107 __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;
3109 fbComposeGetStart (dst_image, xDst, yDst, uint16_t, dstStride, dstLine, 1);
3110 fbComposeGetStart (src_image, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
3115 * I copy the code from the MMX version and keep the FIXME.
3116 * If it's a problem there, it is probably a problem here too.
3118 assert (src_image->pDrawable == mask_image->pDrawable);
3126 /* call prefetch hint to optimize cache load */
3127 cachePrefetch ((__m128i*)src);
3128 cachePrefetch ((__m128i*)dst);
3130 dstLine += dstStride;
3131 srcLine += srcStride;
3134 /* Align dst on a 16-byte boundary */
3136 ((unsigned long)dst & 15))
3141 *dst++ = fast_composite_over_8888_0565pixel (s, d);
3145 /* call prefetch hint to optimize cache load */
3146 cachePrefetch ((__m128i*)src);
3147 cachePrefetch ((__m128i*)dst);
3149 /* It's an 8-pixel loop */
3152 /* fill cache line with next memory */
3153 cachePrefetchNext ((__m128i*)src);
3154 cachePrefetchNext ((__m128i*)dst);
3156 /* I'm loading unaligned because I'm not sure about the address alignment. */
3157 xmmSrc = load128Unaligned ((__m128i*) src);
3158 xmmDst = load128Aligned ((__m128i*) dst);
/* Unpack 8 x 565 dst pixels into four 8888 register halves. */
3161 unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
3162 unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
3163 expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
3165 /* I'm loading the next 4 pixels from memory beforehand to optimize the memory read. */
3166 xmmSrc = load128Unaligned ((__m128i*) (src+4));
3168 over_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDst0, &xmmDst1);
/* Second half: blend pixels 4..7. */
3171 unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
3172 expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
3174 over_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDst2, &xmmDst3);
3176 save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));
/* Tail: leftover pixels, one at a time via the scalar helper. */
3188 *dst++ = fast_composite_over_8888_0565pixel (s, d);
3195 /* -------------------------------------------------------------------------------------------------
3196 * fast_CompositeOver_n_8_8888
/*
 * OVER-composites a solid source onto an a8r8g8b8 destination through
 * an a8 mask.  The solid source and its alpha are expanded once; the
 * SSE2 loop reads 4 mask bytes at a time and short-cuts to a plain
 * store when the source is opaque and all 4 mask bytes are 0xff.
 */
3200 sse2_CompositeOver_n_8_8888 (pixman_implementation_t *imp,
3202 pixman_image_t * src_image,
3203 pixman_image_t * mask_image,
3204 pixman_image_t * dst_image,
3215 uint32_t *dstLine, *dst;
3216 uint8_t *maskLine, *mask;
3217 int dstStride, maskStride;
3221 __m128i xmmSrc, xmmAlpha, xmmDef;
3222 __m128i xmmDst, xmmDstLo, xmmDstHi;
3223 __m128i xmmMask, xmmMaskLo, xmmMaskHi;
3225 __m64 mmxSrc, mmxAlpha, mmxMask, mmxDest;
3227 src = _pixman_image_get_solid(src_image, dst_image->bits.format);
3233 fbComposeGetStart (dst_image, xDst, yDst, uint32_t, dstStride, dstLine, 1);
3234 fbComposeGetStart (mask_image, xMask, yMask, uint8_t, maskStride, maskLine, 1);
/* Precompute the solid source in both 128-bit packed and expanded forms. */
3236 xmmDef = createMask_2x32_128 (src, src);
3237 xmmSrc = expandPixel_32_1x128 (src);
3238 xmmAlpha = expandAlpha_1x128 (xmmSrc);
3239 mmxSrc = _mm_movepi64_pi64 (xmmSrc);
3240 mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);
3245 dstLine += dstStride;
3247 maskLine += maskStride;
3250 /* call prefetch hint to optimize cache load */
3251 cachePrefetch ((__m128i*)mask);
3252 cachePrefetch ((__m128i*)dst);
/* Head loop: single pixels until dst is 16-byte aligned. */
3254 while (w && (unsigned long)dst & 15)
3256 uint8_t m = *mask++;
3261 mmxMask = expandPixel_8_1x64 (m);
3262 mmxDest = unpack_32_1x64 (d);
3264 *dst = pack_1x64_32 (inOver_1x64 (&mmxSrc,
3274 /* call prefetch hint to optimize cache load */
3275 cachePrefetch ((__m128i*)mask);
3276 cachePrefetch ((__m128i*)dst);
3280 /* fill cache line with next memory */
3281 cachePrefetchNext ((__m128i*)mask);
3282 cachePrefetchNext ((__m128i*)dst);
/* Read 4 mask bytes at once. */
3284 m = *((uint32_t*)mask);
3286 if (srca == 0xff && m == 0xffffffff)
/* Fully opaque source + fully-on mask: just store the solid color. */
3288 save128Aligned ((__m128i*)dst, xmmDef);
3292 xmmDst = load128Aligned ((__m128i*) dst);
3293 xmmMask = unpack_32_1x128 (m);
3294 xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());
3297 unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
3298 unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
3300 expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
3302 inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);
3304 save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
/* Tail loop: remaining pixels one at a time. */
3314 uint8_t m = *mask++;
3319 mmxMask = expandPixel_8_1x64 (m);
3320 mmxDest = unpack_32_1x64 (d);
3322 *dst = pack_1x64_32 (inOver_1x64 (&mmxSrc,
3336 /* -------------------------------------------------------------------------------------------------
3337 * fast_CompositeOver_n_8_8888
/*
 * Fills a rectangle of a 16bpp or 32bpp buffer with a constant value
 * using 128-bit stores.  For 16bpp the 32-bit 'data' must contain the
 * same value in both halves (checked below); other depths are rejected.
 * Per row: scalar writes until 16-byte alignment, then bulk aligned
 * stores (128-byte and 64-byte unrolled passes), then scalar tails.
 */
3341 pixmanFillsse2 (uint32_t *bits,
3350 uint32_t byte_width;
/* 16bpp fill is only valid when both 16-bit halves of 'data' match. */
3355 if (bpp == 16 && (data >> 16 != (data & 0xffff)))
3358 if (bpp != 16 && bpp != 32)
3363 stride = stride * (int) sizeof (uint32_t) / 2;
3364 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3365 byte_width = 2 * width;
3370 stride = stride * (int) sizeof (uint32_t) / 4;
3371 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3372 byte_width = 4 * width;
3376 cachePrefetch ((__m128i*)byte_line);
3377 xmmDef = createMask_2x32_128 (data, data);
3382 uint8_t *d = byte_line;
3383 byte_line += stride;
3387 cachePrefetchNext ((__m128i*)d);
/* Align to 4 bytes with 16-bit stores, then to 16 bytes with 32-bit stores. */
3389 while (w >= 2 && ((unsigned long)d & 3))
3391 *(uint16_t *)d = data;
3396 while (w >= 4 && ((unsigned long)d & 15))
3398 *(uint32_t *)d = data;
3404 cachePrefetchNext ((__m128i*)d);
/* Bulk pass: 128 bytes (8 x 16) per iteration. */
3408 cachePrefetch (((__m128i*)d) + 12);
3410 save128Aligned ((__m128i*)(d), xmmDef);
3411 save128Aligned ((__m128i*)(d+16), xmmDef);
3412 save128Aligned ((__m128i*)(d+32), xmmDef);
3413 save128Aligned ((__m128i*)(d+48), xmmDef);
3414 save128Aligned ((__m128i*)(d+64), xmmDef);
3415 save128Aligned ((__m128i*)(d+80), xmmDef);
3416 save128Aligned ((__m128i*)(d+96), xmmDef);
3417 save128Aligned ((__m128i*)(d+112), xmmDef);
/* Then 64 bytes per iteration. */
3425 cachePrefetch (((__m128i*)d) + 8);
3427 save128Aligned ((__m128i*)(d), xmmDef);
3428 save128Aligned ((__m128i*)(d+16), xmmDef);
3429 save128Aligned ((__m128i*)(d+32), xmmDef);
3430 save128Aligned ((__m128i*)(d+48), xmmDef);
3436 cachePrefetchNext ((__m128i*)d);
/* 32-byte and 16-byte remainders. */
3440 save128Aligned ((__m128i*)(d), xmmDef);
3441 save128Aligned ((__m128i*)(d+16), xmmDef);
3449 save128Aligned ((__m128i*)(d), xmmDef);
3455 cachePrefetchNext ((__m128i*)d);
/* Scalar tail: leftover 32-bit then 16-bit units. */
3459 *(uint32_t *)d = data;
3467 *(uint16_t *)d = data;
/*
 * SRC-composites a solid source into an a8r8g8b8 destination through an
 * a8 mask: dst = src * mask (no blend with old dst).  When the solid
 * source is fully transparent the whole rectangle is filled with 0 via
 * pixmanFillsse2.  In the 4-pixel loop: all-0xff mask stores the solid
 * color directly, an all-0 mask stores zeros, otherwise src is
 * multiplied by the expanded mask.
 */
3478 sse2_CompositeSrc_n_8_8888 (pixman_implementation_t *imp,
3480 pixman_image_t * src_image,
3481 pixman_image_t * mask_image,
3482 pixman_image_t * dst_image,
3493 uint32_t *dstLine, *dst;
3494 uint8_t *maskLine, *mask;
3495 int dstStride, maskStride;
3499 __m128i xmmSrc, xmmDef;
3500 __m128i xmmMask, xmmMaskLo, xmmMaskHi;
3502 src = _pixman_image_get_solid(src_image, dst_image->bits.format);
/* Transparent solid source: SRC result is all-zero, use the fast fill. */
3507 pixmanFillsse2 (dst_image->bits.bits, dst_image->bits.rowstride,
3508 PIXMAN_FORMAT_BPP (dst_image->bits.format),
3509 xDst, yDst, width, height, 0);
3513 fbComposeGetStart (dst_image, xDst, yDst, uint32_t, dstStride, dstLine, 1);
3514 fbComposeGetStart (mask_image, xMask, yMask, uint8_t, maskStride, maskLine, 1);
3516 xmmDef = createMask_2x32_128 (src, src);
3517 xmmSrc = expandPixel_32_1x128 (src);
3522 dstLine += dstStride;
3524 maskLine += maskStride;
3527 /* call prefetch hint to optimize cache load */
3528 cachePrefetch ((__m128i*)mask);
3529 cachePrefetch ((__m128i*)dst);
/* Head loop: single pixels until dst is 16-byte aligned. */
3531 while (w && (unsigned long)dst & 15)
3533 uint8_t m = *mask++;
3537 *dst = pack_1x64_32 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmSrc), expandPixel_8_1x64 (m)));
3548 /* call prefetch hint to optimize cache load */
3549 cachePrefetch ((__m128i*)mask);
3550 cachePrefetch ((__m128i*)dst);
3554 /* fill cache line with next memory */
3555 cachePrefetchNext ((__m128i*)mask);
3556 cachePrefetchNext ((__m128i*)dst);
/* Read 4 mask bytes at once. */
3558 m = *((uint32_t*)mask);
3560 if (srca == 0xff && m == 0xffffffff)
3562 save128Aligned ((__m128i*)dst, xmmDef);
3566 xmmMask = unpack_32_1x128 (m);
3567 xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());
3570 unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
3572 expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
3574 pixMultiply_2x128 (&xmmSrc, &xmmSrc, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
3576 save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmMaskLo, xmmMaskHi));
/* All-zero mask word: the SRC result is zero. */
3580 save128Aligned ((__m128i*)dst, _mm_setzero_si128());
/* Tail loop: remaining pixels one at a time. */
3590 uint8_t m = *mask++;
3594 *dst = pack_1x64_32 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmSrc), expandPixel_8_1x64 (m)));
3609 /* -------------------------------------------------------------------------------------------------
3610 * fast_CompositeOver_n_8_0565
/*
 * OVER-composites a solid source onto an r5g6b5 destination through an
 * a8 mask.  The 8-pixel SSE2 loop unpacks the 565 dst into four 8888
 * halves and blends them in two 4-pixel steps, each driven by a 32-bit
 * read of 4 mask bytes.
 */
3614 sse2_CompositeOver_n_8_0565 (pixman_implementation_t *imp,
3616 pixman_image_t * src_image,
3617 pixman_image_t * mask_image,
3618 pixman_image_t * dst_image,
3629 uint16_t *dstLine, *dst, d;
3630 uint8_t *maskLine, *mask;
3631 int dstStride, maskStride;
3634 __m64 mmxSrc, mmxAlpha, mmxMask, mmxDest;
3636 __m128i xmmSrc, xmmAlpha;
3637 __m128i xmmMask, xmmMaskLo, xmmMaskHi;
3638 __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;
3640 src = _pixman_image_get_solid(src_image, dst_image->bits.format);
3646 fbComposeGetStart (dst_image, xDst, yDst, uint16_t, dstStride, dstLine, 1);
3647 fbComposeGetStart (mask_image, xMask, yMask, uint8_t, maskStride, maskLine, 1);
/* Expand the solid source and its alpha once, in both SSE2 and MMX forms. */
3649 xmmSrc = expandPixel_32_1x128 (src);
3650 xmmAlpha = expandAlpha_1x128 (xmmSrc);
3651 mmxSrc = _mm_movepi64_pi64 (xmmSrc);
3652 mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);
3657 dstLine += dstStride;
3659 maskLine += maskStride;
3662 /* call prefetch hint to optimize cache load */
3663 cachePrefetch ((__m128i*)mask);
3664 cachePrefetch ((__m128i*)dst);
/* Head loop: single pixels until dst is 16-byte aligned. */
3666 while (w && (unsigned long)dst & 15)
3673 mmxMask = expandAlphaRev_1x64 (unpack_32_1x64 (m));
3674 mmxDest = expand565_16_1x64 (d);
3676 *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,
3686 /* call prefetch hint to optimize cache load */
3687 cachePrefetch ((__m128i*)mask);
3688 cachePrefetch ((__m128i*)dst);
3692 /* fill cache line with next memory */
3693 cachePrefetchNext ((__m128i*)mask);
3694 cachePrefetchNext ((__m128i*)dst);
3696 xmmDst = load128Aligned ((__m128i*) dst);
3697 unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
/* First 4 mask bytes drive pixels 0..3. */
3699 m = *((uint32_t*)mask);
3704 xmmMask = unpack_32_1x128 (m);
3705 xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());
3708 unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
3710 expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
3711 inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst0, &xmmDst1);
/* Next 4 mask bytes drive pixels 4..7. */
3714 m = *((uint32_t*)mask);
3719 xmmMask = unpack_32_1x128 (m);
3720 xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());
3723 unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
3725 expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
3726 inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst2, &xmmDst3);
3729 save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));
/* Tail loop: remaining pixels one at a time. */
3742 mmxMask = expandAlphaRev_1x64 (unpack_32_1x64 (m));
3743 mmxDest = expand565_16_1x64 (d);
3745 *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,
3759 /* -------------------------------------------------------------------------------------------------
3760 * fast_Composite_over_pixbuf_0565
/*
 * OVER-composites a non-premultiplied ABGR (GdkPixbuf) source onto an
 * r5g6b5 destination using the "over, reverse, non-premultiplied"
 * operator.  The 8-pixel loop tests each 4-pixel source group for
 * all-opaque (color swap only) and all-zero (skip) fast paths.
 */
3764 sse2_Composite_over_pixbuf_0565 (pixman_implementation_t *imp,
3766 pixman_image_t * src_image,
3767 pixman_image_t * mask_image,
3768 pixman_image_t * dst_image,
3778 uint16_t *dstLine, *dst, d;
3779 uint32_t *srcLine, *src, s;
3780 int dstStride, srcStride;
3782 uint32_t opaque, zero;
3785 __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
3786 __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;
3788 fbComposeGetStart (dst_image, xDst, yDst, uint16_t, dstStride, dstLine, 1);
3789 fbComposeGetStart (src_image, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
3794 * I copy the code from the MMX version and keep the FIXME.
3795 * If it's a problem there, it is probably a problem here too.
3797 assert (src_image->pDrawable == mask_image->pDrawable);
3803 dstLine += dstStride;
3805 srcLine += srcStride;
3808 /* call prefetch hint to optimize cache load */
3809 cachePrefetch ((__m128i*)src);
3810 cachePrefetch ((__m128i*)dst);
/* Head loop: single pixels until dst is 16-byte aligned. */
3812 while (w && (unsigned long)dst & 15)
3817 ms = unpack_32_1x64 (s);
3819 *dst++ = pack565_32_16 (pack_1x64_32 (overRevNonPre_1x64(ms, expand565_16_1x64 (d))));
3823 /* call prefetch hint to optimize cache load */
3824 cachePrefetch ((__m128i*)src);
3825 cachePrefetch ((__m128i*)dst);
3829 /* fill cache line with next memory */
3830 cachePrefetchNext ((__m128i*)src);
3831 cachePrefetchNext ((__m128i*)dst);
3834 xmmSrc = load128Unaligned((__m128i*)src);
3835 xmmDst = load128Aligned ((__m128i*)dst);
/* Classify the first 4 source pixels for the fast paths below. */
3837 opaque = isOpaque (xmmSrc);
3838 zero = isZero (xmmSrc);
3840 unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
3841 unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
3843 /* preload next round */
3844 xmmSrc = load128Unaligned((__m128i*)(src+4));
/* All-opaque group: only the R<->B channel swap is needed. */
3848 invertColors_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst0, &xmmDst1);
3852 overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst0, &xmmDst1);
/* Second 4-pixel group. */
3856 opaque = isOpaque (xmmSrc);
3857 zero = isZero (xmmSrc);
3859 unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
3863 invertColors_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst2, &xmmDst3);
3867 overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst2, &xmmDst3);
3870 save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));
/* Tail loop: remaining pixels one at a time. */
3882 ms = unpack_32_1x64 (s);
3884 *dst++ = pack565_32_16 (pack_1x64_32 (overRevNonPre_1x64(ms, expand565_16_1x64 (d))));
3892 /* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */
3894 /* -------------------------------------------------------------------------------------------------
3895 * fast_Composite_over_pixbuf_8888
/*
 * OVER-composites a non-premultiplied ABGR (GdkPixbuf) source onto an
 * a8r8g8b8 destination.  An all-opaque 4-pixel source group only needs
 * the R<->B swap; an all-zero group is skipped; otherwise the full
 * reverse non-premultiplied OVER is applied.
 */
3899 sse2_Composite_over_pixbuf_8888 (pixman_implementation_t *imp,
3901 pixman_image_t * src_image,
3902 pixman_image_t * mask_image,
3903 pixman_image_t * dst_image,
3913 uint32_t *dstLine, *dst, d;
3914 uint32_t *srcLine, *src, s;
3915 int dstStride, srcStride;
3917 uint32_t opaque, zero;
3919 __m128i xmmSrcLo, xmmSrcHi;
3920 __m128i xmmDstLo, xmmDstHi;
3922 fbComposeGetStart (dst_image, xDst, yDst, uint32_t, dstStride, dstLine, 1);
3923 fbComposeGetStart (src_image, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
3928 * I copy the code from the MMX version and keep the FIXME.
3929 * If it's a problem there, it is probably a problem here too.
3931 assert (src_image->pDrawable == mask_image->pDrawable);
3937 dstLine += dstStride;
3939 srcLine += srcStride;
3942 /* call prefetch hint to optimize cache load */
3943 cachePrefetch ((__m128i*)src);
3944 cachePrefetch ((__m128i*)dst);
/* Head loop: single pixels until dst is 16-byte aligned. */
3946 while (w && (unsigned long)dst & 15)
3951 *dst++ = pack_1x64_32 (overRevNonPre_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
3956 /* call prefetch hint to optimize cache load */
3957 cachePrefetch ((__m128i*)src);
3958 cachePrefetch ((__m128i*)dst);
3962 /* fill cache line with next memory */
3963 cachePrefetchNext ((__m128i*)src);
3964 cachePrefetchNext ((__m128i*)dst);
3966 xmmSrcHi = load128Unaligned((__m128i*)src);
3968 opaque = isOpaque (xmmSrcHi);
3969 zero = isZero (xmmSrcHi);
3971 unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
/* All-opaque group: only the R<->B channel swap is needed, no blend. */
3975 invertColors_2x128( xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi);
3977 save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
/* General case: load dst and apply reverse non-premultiplied OVER. */
3981 xmmDstHi = load128Aligned ((__m128i*)dst);
3983 unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
3985 overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi);
3987 save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
/* Tail loop: remaining pixels one at a time. */
4000 *dst++ = pack_1x64_32 (overRevNonPre_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
4009 /* -------------------------------------------------------------------------------------------------
4010 * fast_CompositeOver_n_8888_0565_ca
/*
 * Component-alpha OVER of a solid source onto an r5g6b5 destination
 * with an a8r8g8b8 mask (per-channel alpha).  The 8-pixel loop checks
 * each 4-pixel mask group against zero (via movemask of a compare) and
 * skips the blend when the whole group is zero (packCmp == 0xffff).
 */
4014 sse2_CompositeOver_n_8888_0565_ca (pixman_implementation_t *imp,
4016 pixman_image_t * src_image,
4017 pixman_image_t * mask_image,
4018 pixman_image_t * dst_image,
4029 uint16_t *dstLine, *dst, d;
4030 uint32_t *maskLine, *mask, m;
4031 int dstStride, maskStride;
4035 __m128i xmmSrc, xmmAlpha;
4036 __m128i xmmMask, xmmMaskLo, xmmMaskHi;
4037 __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;
4039 __m64 mmxSrc, mmxAlpha, mmxMask, mmxDest;
4041 src = _pixman_image_get_solid(src_image, dst_image->bits.format);
4046 fbComposeGetStart (dst_image, xDst, yDst, uint16_t, dstStride, dstLine, 1);
4047 fbComposeGetStart (mask_image, xMask, yMask, uint32_t, maskStride, maskLine, 1);
4049 xmmSrc = expandPixel_32_1x128 (src);
4050 xmmAlpha = expandAlpha_1x128 (xmmSrc);
4051 mmxSrc = _mm_movepi64_pi64 (xmmSrc);
4052 mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);
4059 maskLine += maskStride;
4060 dstLine += dstStride;
4062 /* call prefetch hint to optimize cache load */
4063 cachePrefetch ((__m128i*)mask);
4064 cachePrefetch ((__m128i*)dst);
/* Head loop: single pixels until dst is 16-byte aligned. */
4066 while (w && ((unsigned long)dst & 15))
4068 m = *(uint32_t *) mask;
4073 mmxMask = unpack_32_1x64 (m);
4074 mmxDest = expand565_16_1x64 (d);
4076 *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,
4087 /* call prefetch hint to optimize cache load */
4088 cachePrefetch ((__m128i*)mask);
4089 cachePrefetch ((__m128i*)dst);
4093 /* fill cache line with next memory */
4094 cachePrefetchNext ((__m128i*)mask);
4095 cachePrefetchNext ((__m128i*)dst);
4098 xmmMask = load128Unaligned((__m128i*)mask);
4099 xmmDst = load128Aligned((__m128i*)dst);
/* packCmp == 0xffff means all 4 mask pixels are zero -> skip blend. */
4101 packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128()));
4103 unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
4104 unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
4106 /* preload next round */
4107 xmmMask = load128Unaligned((__m128i*)(mask+4));
4108 /* preload next round */
4110 if (packCmp != 0xffff)
4112 inOver_2x128(&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst0, &xmmDst1);
/* Second 4-pixel mask group. */
4116 packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128()));
4118 unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
4120 if (packCmp != 0xffff)
4122 inOver_2x128(&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst2, &xmmDst3);
4125 save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));
/* Tail loop: remaining pixels one at a time. */
4134 m = *(uint32_t *) mask;
4139 mmxMask = unpack_32_1x64 (m);
4140 mmxDest = expand565_16_1x64 (d);
4142 *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,
4157 /* -------------------------------------------------------------------------------------------------
4158 * fast_CompositeIn_n_8_8
/*
 * IN operator with a solid source, an a8 mask and an a8 destination:
 * dst = src.alpha * mask * dst.  The source alpha is expanded once
 * into xmmAlpha; the SSE2 loop processes 16 bytes per iteration with
 * two multiplies (alpha*mask, then result*dst).
 */
4162 sse2_CompositeIn_n_8_8 (pixman_implementation_t *imp,
4164 pixman_image_t * src_image,
4165 pixman_image_t * mask_image,
4166 pixman_image_t * dst_image,
4176 uint8_t *dstLine, *dst;
4177 uint8_t *maskLine, *mask;
4178 int dstStride, maskStride;
4184 __m128i xmmMask, xmmMaskLo, xmmMaskHi;
4185 __m128i xmmDst, xmmDstLo, xmmDstHi;
4187 fbComposeGetStart (dst_image, xDst, yDst, uint8_t, dstStride, dstLine, 1);
4188 fbComposeGetStart (mask_image, xMask, yMask, uint8_t, maskStride, maskLine, 1);
4190 src = _pixman_image_get_solid(src_image, dst_image->bits.format);
4196 xmmAlpha = expandAlpha_1x128 (expandPixel_32_1x128 (src));
4201 dstLine += dstStride;
4203 maskLine += maskStride;
4206 /* call prefetch hint to optimize cache load */
4207 cachePrefetch ((__m128i*)mask);
4208 cachePrefetch ((__m128i*)dst);
/* Head loop: single bytes until dst is 16-byte aligned. */
4210 while (w && ((unsigned long)dst & 15))
4212 m = (uint32_t) *mask++;
4213 d = (uint32_t) *dst;
4215 *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
4216 unpack_32_1x64 (d)));
4220 /* call prefetch hint to optimize cache load */
4221 cachePrefetch ((__m128i*)mask);
4222 cachePrefetch ((__m128i*)dst);
4226 /* fill cache line with next memory */
4227 cachePrefetchNext ((__m128i*)mask);
4228 cachePrefetchNext ((__m128i*)dst);
4230 xmmMask = load128Unaligned((__m128i*)mask);
4231 xmmDst = load128Aligned((__m128i*)dst);
4233 unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
4234 unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
/* dst = (srcAlpha * mask) * dst, 16 bytes at a time. */
4236 pixMultiply_2x128 (&xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
4237 pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);
4239 save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
/* Tail loop: remaining bytes one at a time. */
4248 m = (uint32_t) *mask++;
4249 d = (uint32_t) *dst;
4251 *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
4252 unpack_32_1x64 (d)));
4260 /* -------------------------------------------------------------------------------------------------
4261 * fast_CompositeIn_8_8
/*
 * IN operator with an a8 source and an a8 destination, no mask:
 * dst = src * dst, 16 bytes per SSE2 iteration.
 */
4265 sse2_CompositeIn_8_8 (pixman_implementation_t *imp,
4267 pixman_image_t * src_image,
4268 pixman_image_t * mask_image,
4269 pixman_image_t * dst_image,
4279 uint8_t *dstLine, *dst;
4280 uint8_t *srcLine, *src;
4281 int srcStride, dstStride;
4285 __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
4286 __m128i xmmDst, xmmDstLo, xmmDstHi;
4288 fbComposeGetStart (dst_image, xDst, yDst, uint8_t, dstStride, dstLine, 1);
4289 fbComposeGetStart (src_image, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);
4294 dstLine += dstStride;
4296 srcLine += srcStride;
4299 /* call prefetch hint to optimize cache load */
4300 cachePrefetch ((__m128i*)src);
4301 cachePrefetch ((__m128i*)dst);
/* Head loop: single bytes until dst is 16-byte aligned. */
4303 while (w && ((unsigned long)dst & 15))
4305 s = (uint32_t) *src++;
4306 d = (uint32_t) *dst;
4308 *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s),unpack_32_1x64 (d)));
4312 /* call prefetch hint to optimize cache load */
4313 cachePrefetch ((__m128i*)src);
4314 cachePrefetch ((__m128i*)dst);
4318 /* fill cache line with next memory */
4319 cachePrefetchNext ((__m128i*)src);
4320 cachePrefetchNext ((__m128i*)dst);
4322 xmmSrc = load128Unaligned((__m128i*)src);
4323 xmmDst = load128Aligned((__m128i*)dst);
4325 unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
4326 unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
4328 pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);
4330 save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
/* Tail loop: remaining bytes one at a time. */
4339 s = (uint32_t) *src++;
4340 d = (uint32_t) *dst;
4342 *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s),unpack_32_1x64 (d)));
4350 /* -------------------------------------------------------------------------------------------------
4351 * fast_CompositeAdd_8888_8_8
/*
 * ADD operator with a solid source, an a8 mask and an a8 destination:
 * dst = saturate(src.alpha * mask + dst), using saturating unsigned
 * 16-bit adds in the SSE2 loop (16 bytes per iteration).
 */
4355 sse2_CompositeAdd_8888_8_8 (pixman_implementation_t *imp,
4357 pixman_image_t * src_image,
4358 pixman_image_t * mask_image,
4359 pixman_image_t * dst_image,
4369 uint8_t *dstLine, *dst;
4370 uint8_t *maskLine, *mask;
4371 int dstStride, maskStride;
4378 __m128i xmmMask, xmmMaskLo, xmmMaskHi;
4379 __m128i xmmDst, xmmDstLo, xmmDstHi;
4381 fbComposeGetStart (dst_image, xDst, yDst, uint8_t, dstStride, dstLine, 1);
4382 fbComposeGetStart (mask_image, xMask, yMask, uint8_t, maskStride, maskLine, 1);
4384 src = _pixman_image_get_solid(src_image, dst_image->bits.format);
4390 xmmAlpha = expandAlpha_1x128 (expandPixel_32_1x128 (src));
4395 dstLine += dstStride;
4397 maskLine += maskStride;
4400 /* call prefetch hint to optimize cache load */
4401 cachePrefetch ((__m128i*)mask);
4402 cachePrefetch ((__m128i*)dst);
/* Head loop: single bytes until dst is 16-byte aligned. */
4404 while (w && ((unsigned long)dst & 15))
4406 m = (uint32_t) *mask++;
4407 d = (uint32_t) *dst;
4409 *dst++ = (uint8_t) pack_1x64_32 (_mm_adds_pu16 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
4410 unpack_32_1x64 (d)));
4414 /* call prefetch hint to optimize cache load */
4415 cachePrefetch ((__m128i*)mask);
4416 cachePrefetch ((__m128i*)dst);
4420 /* fill cache line with next memory */
4421 cachePrefetchNext ((__m128i*)mask);
4422 cachePrefetchNext ((__m128i*)dst);
4424 xmmMask = load128Unaligned((__m128i*)mask);
4425 xmmDst = load128Aligned((__m128i*)dst);
4427 unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
4428 unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
4430 pixMultiply_2x128 (&xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
/* Saturating add keeps each 8-bit result clamped at 0xff. */
4432 xmmDstLo = _mm_adds_epu16 (xmmMaskLo, xmmDstLo);
4433 xmmDstHi = _mm_adds_epu16 (xmmMaskHi, xmmDstHi);
4435 save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
/* Tail loop: remaining bytes one at a time. */
4444 m = (uint32_t) *mask++;
4445 d = (uint32_t) *dst;
4447 *dst++ = (uint8_t) pack_1x64_32 (_mm_adds_pu16 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
4448 unpack_32_1x64 (d)));
4456 /* -------------------------------------------------------------------------------------------------
4457 * fast_CompositeAdd_8000_8000
/*
 * ADD operator with a8 source and a8 destination, no mask.  A scalar
 * head loop aligns dst to 4 bytes with saturating byte adds
 * (t | (0 - (t >> 8)) clamps t at 0xff), then the bulk is delegated to
 * the unified ADD combiner in 32-bit units (w >> 2), then a scalar tail.
 */
4461 sse2_CompositeAdd_8000_8000 (pixman_implementation_t *imp,
4463 pixman_image_t * src_image,
4464 pixman_image_t * mask_image,
4465 pixman_image_t * dst_image,
4475 uint8_t *dstLine, *dst;
4476 uint8_t *srcLine, *src;
4477 int dstStride, srcStride;
4481 fbComposeGetStart (src_image, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);
4482 fbComposeGetStart (dst_image, xDst, yDst, uint8_t, dstStride, dstLine, 1);
4489 /* call prefetch hint to optimize cache load */
4490 cachePrefetch ((__m128i*)src);
4491 cachePrefetch ((__m128i*)dst);
4493 dstLine += dstStride;
4494 srcLine += srcStride;
/* Head loop: saturating scalar adds until dst is 4-byte aligned. */
4498 while (w && (unsigned long)dst & 3)
4500 t = (*dst) + (*src++);
4501 *dst++ = t | (0 - (t >> 8));
4505 coreCombineAddUsse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
/* Tail loop: leftover (< 4) bytes. */
4515 t = (*dst) + (*src++);
4516 *dst++ = t | (0 - (t >> 8));
4524 /* -------------------------------------------------------------------------------------------------
4525 * fast_CompositeAdd_8888_8888
/*
 * ADD operator with a8r8g8b8 source and destination, no mask; each row
 * delegates to the shared SSE2 ADD combiner.
 */
4528 sse2_CompositeAdd_8888_8888 (pixman_implementation_t *imp,
4530 pixman_image_t * src_image,
4531 pixman_image_t * mask_image,
4532 pixman_image_t * dst_image,
4542 uint32_t *dstLine, *dst;
4543 uint32_t *srcLine, *src;
4544 int dstStride, srcStride;
4546 fbComposeGetStart (src_image, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
4547 fbComposeGetStart (dst_image, xDst, yDst, uint32_t, dstStride, dstLine, 1);
4552 dstLine += dstStride;
4554 srcLine += srcStride;
4556 coreCombineAddUsse2 (dst, src, NULL, width);
4562 /* -------------------------------------------------------------------------------------------------
4563 * sse2_CompositeCopyArea
/*
 * Blits a rectangle between two buffers of identical bpp (16 or 32
 * only; other combinations are rejected).  Per row: scalar copies
 * until dst is 16-byte aligned, a 64-byte unrolled SSE2 copy
 * (unaligned loads, aligned stores), then 16-byte, 4-byte and 2-byte
 * tails.  Returns a pixman_bool_t indicating whether the blit was done.
 */
4566 static pixman_bool_t
4567 pixmanBltsse2 (uint32_t *src_bits,
4573 int src_x, int src_y,
4574 int dst_x, int dst_y,
4575 int width, int height)
4577 uint8_t * src_bytes;
4578 uint8_t * dst_bytes;
4581 if (src_bpp != dst_bpp)
4586 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
4587 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
4588 src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
4589 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
4590 byte_width = 2 * width;
4594 else if (src_bpp == 32)
4596 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
4597 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
4598 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
4599 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
4600 byte_width = 4 * width;
4609 cachePrefetch ((__m128i*)src_bytes);
4610 cachePrefetch ((__m128i*)dst_bytes);
4615 uint8_t *s = src_bytes;
4616 uint8_t *d = dst_bytes;
4617 src_bytes += src_stride;
4618 dst_bytes += dst_stride;
4621 cachePrefetchNext ((__m128i*)s);
4622 cachePrefetchNext ((__m128i*)d);
/* Align dst: 2-byte copies to 4-byte alignment, then 4-byte copies to 16. */
4624 while (w >= 2 && ((unsigned long)d & 3))
4626 *(uint16_t *)d = *(uint16_t *)s;
4632 while (w >= 4 && ((unsigned long)d & 15))
4634 *(uint32_t *)d = *(uint32_t *)s;
4641 cachePrefetchNext ((__m128i*)s);
4642 cachePrefetchNext ((__m128i*)d);
/* Bulk copy: 64 bytes per iteration; src may be unaligned, dst is aligned. */
4646 __m128i xmm0, xmm1, xmm2, xmm3;
4648 /* 128 bytes ahead */
4649 cachePrefetch (((__m128i*)s) + 8);
4650 cachePrefetch (((__m128i*)d) + 8);
4652 xmm0 = load128Unaligned ((__m128i*)(s));
4653 xmm1 = load128Unaligned ((__m128i*)(s+16));
4654 xmm2 = load128Unaligned ((__m128i*)(s+32));
4655 xmm3 = load128Unaligned ((__m128i*)(s+48));
4657 save128Aligned ((__m128i*)(d), xmm0);
4658 save128Aligned ((__m128i*)(d+16), xmm1);
4659 save128Aligned ((__m128i*)(d+32), xmm2);
4660 save128Aligned ((__m128i*)(d+48), xmm3);
4667 cachePrefetchNext ((__m128i*)s);
4668 cachePrefetchNext ((__m128i*)d);
/* 16-byte remainder. */
4672 save128Aligned ((__m128i*)d, load128Unaligned ((__m128i*)s) );
4679 cachePrefetchNext ((__m128i*)s);
4680 cachePrefetchNext ((__m128i*)d);
/* Scalar tails: 4-byte then 2-byte copies. */
4684 *(uint32_t *)d = *(uint32_t *)s;
4693 *(uint16_t *)d = *(uint16_t *)s;
/* Straight copy composite (OP_SRC, and OVER with an opaque source):
 * forwards the whole rectangle to the SSE2 blitter with each image's
 * raw bits, rowstride (in uint32_t units) and bits-per-pixel. */
4706 sse2_CompositeCopyArea (pixman_implementation_t *imp,
4708 pixman_image_t * src_image,
4709 pixman_image_t * mask_image,
4710 pixman_image_t * dst_image,
4720 pixmanBltsse2 (src_image->bits.bits,
4721 dst_image->bits.bits,
4722 src_image->bits.rowstride,
4723 dst_image->bits.rowstride,
4724 PIXMAN_FORMAT_BPP (src_image->bits.format),
4725 PIXMAN_FORMAT_BPP (dst_image->bits.format),
4726 xSrc, ySrc, xDst, yDst, width, height);
4730 /* NOTE: this code was buggy in the MMX version; the bug has been carried over into this SSE2 version */
/* OVER of an x888 source (alpha channel ignored and forced opaque with
 * 0xff000000) through an a8 mask onto an 8888 destination.  Processes a
 * scalar head until the destination is 16-byte aligned, then four pixels
 * per iteration with SSE2, then a scalar tail (parts of the loop framing
 * are elided in this excerpt). */
4732 sse2_CompositeOver_x888_8_8888 (pixman_implementation_t *imp,
4734 pixman_image_t * src_image,
4735 pixman_image_t * mask_image,
4736 pixman_image_t * dst_image,
4746 uint32_t *src, *srcLine, s;
4747 uint32_t *dst, *dstLine, d;
4748 uint8_t *mask, *maskLine;
4750 int srcStride, maskStride, dstStride;
4753 __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
4754 __m128i xmmDst, xmmDstLo, xmmDstHi;
4755 __m128i xmmMask, xmmMaskLo, xmmMaskHi;
4757 fbComposeGetStart (dst_image, xDst, yDst, uint32_t, dstStride, dstLine, 1);
4758 fbComposeGetStart (mask_image, xMask, yMask, uint8_t, maskStride, maskLine, 1);
4759 fbComposeGetStart (src_image, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
4764 srcLine += srcStride;
4766 dstLine += dstStride;
4768 maskLine += maskStride;
4772 /* call prefetch hint to optimize cache load*/
4773 cachePrefetch ((__m128i*)src);
4774 cachePrefetch ((__m128i*)dst);
4775 cachePrefetch ((__m128i*)mask);
/* Head: one pixel at a time (via the 64-bit MMX-width helpers) until the
 * destination pointer reaches 16-byte alignment. */
4777 while (w && (unsigned long)dst & 15)
4779 s = 0xff000000 | *src++;
4780 m = (uint32_t) *mask++;
4783 __m64 ms = unpack_32_1x64 (s);
/* in/over: src IN mask, then OVER dst. */
4787 ms = inOver_1x64 (ms,
4789 expandAlphaRev_1x64 (unpack_32_1x64 (m)),
4790 unpack_32_1x64 (d));
4793 *dst++ = pack_1x64_32 (ms);
4797 /* call prefetch hint to optimize cache load*/
4798 cachePrefetch ((__m128i*)src);
4799 cachePrefetch ((__m128i*)dst);
4800 cachePrefetch ((__m128i*)mask);
4804 /* fill cache line with next memory */
4805 cachePrefetchNext ((__m128i*)src);
4806 cachePrefetchNext ((__m128i*)dst);
4807 cachePrefetchNext ((__m128i*)mask);
/* Main loop: read four mask bytes at once; force the four source pixels
 * opaque so a fully-set mask reduces OVER to a plain store. */
4809 m = *(uint32_t*) mask;
4810 xmmSrc = _mm_or_si128 (load128Unaligned ((__m128i*)src), Maskff000000);
4812 if (m == 0xffffffff)
4814 save128Aligned ((__m128i*)dst, xmmSrc);
/* Partially transparent mask: unpack src/mask/dst to 16 bits per
 * channel, broadcast each mask byte across its pixel's channels, and
 * blend with the (src IN mask) OVER dst helper. */
4818 xmmDst = load128Aligned ((__m128i*)dst);
4820 xmmMask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
4822 unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
4823 unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
4824 unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
4826 expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
/* Source alpha is constant 0xff, so Mask00ff stands in for the expanded
 * source-alpha vectors. */
4828 inOver_2x128 (xmmSrcLo, xmmSrcHi, Mask00ff, Mask00ff, xmmMaskLo, xmmMaskHi, &xmmDstLo, &xmmDstHi);
4830 save128Aligned( (__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));
/* Tail: remaining (< 4) pixels, same single-pixel path as the head. */
4841 m = (uint32_t) *mask++;
4845 s = 0xff000000 | *src;
4855 *dst = pack_1x64_32 (inOver_1x64 (unpack_32_1x64 (s),
4857 expandAlphaRev_1x64 (unpack_32_1x64 (m)),
4858 unpack_32_1x64 (d)));
/* Dispatch table mapping (op, src format, mask format, dst format) tuples
 * to their specialized SSE2 compositors.  Scanned in order by
 * _pixman_run_fast_path; PIXMAN_solid / PIXMAN_null are wildcard entries
 * for solid-fill sources and absent masks.  (The table's braces and
 * terminating sentinel entry are elided in this excerpt.) */
4873 static const pixman_fast_path_t sse2_fast_paths[] =
4875 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, sse2_CompositeOver_n_8_0565, 0 },
4876 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, sse2_CompositeOver_n_8_0565, 0 },
4877 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_CompositeOver_n_8888, 0 },
4878 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_CompositeOver_n_8888, 0 },
4879 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_r5g6b5, sse2_CompositeOver_n_0565, 0 },
4880 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_over_8888_8888, 0 },
4881 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_over_8888_8888, 0 },
4882 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_over_8888_8888, 0 },
4883 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_over_8888_8888, 0 },
4884 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_over_8888_0565, 0 },
4885 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, sse2_composite_over_8888_0565, 0 },
4886 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_CompositeOver_n_8_8888, 0 },
4887 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_CompositeOver_n_8_8888, 0 },
4888 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_CompositeOver_n_8_8888, 0 },
4889 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_CompositeOver_n_8_8888, 0 },
4891 /* FIXME: this code was buggy in the MMX version and the bug has been carried over into this SSE2 version */
4892 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_CompositeOver_x888_8_8888, 0 },
4893 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_CompositeOver_x888_8_8888, 0 },
4894 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_CompositeOver_x888_8_8888, 0 },
/* NOTE(review): the a8r8g8b8 dst below looks inconsistent with the
 * surrounding BGR entries — a8b8g8r8 would be expected here; verify. */
4895 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_CompositeOver_x888_8_8888, 0 },
4897 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_Composite_over_x888_n_8888, NEED_SOLID_MASK },
4898 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_Composite_over_x888_n_8888, NEED_SOLID_MASK },
4899 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_Composite_over_x888_n_8888, NEED_SOLID_MASK },
4900 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_Composite_over_x888_n_8888, NEED_SOLID_MASK },
4901 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
4902 { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
4903 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
4904 { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
4905 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_CompositeOver_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
4906 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_CompositeOver_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
4907 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_CompositeOver_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
4908 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_CompositeOver_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
4909 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_CompositeOver_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
4910 { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_CompositeOver_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
/* Pixbuf entries: src/dst deliberately have opposite channel orders;
 * the pixbuf path performs the RGB<->BGR swizzle itself. */
4911 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_Composite_over_pixbuf_8888, NEED_PIXBUF },
4912 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_a8r8g8b8, sse2_Composite_over_pixbuf_8888, NEED_PIXBUF },
4913 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_Composite_over_pixbuf_8888, NEED_PIXBUF },
4914 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_x8r8g8b8, sse2_Composite_over_pixbuf_8888, NEED_PIXBUF },
4915 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_a8b8g8r8, sse2_Composite_over_pixbuf_8888, NEED_PIXBUF },
4916 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_Composite_over_pixbuf_8888, NEED_PIXBUF },
4917 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_x8b8g8r8, sse2_Composite_over_pixbuf_8888, NEED_PIXBUF },
4918 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_Composite_over_pixbuf_8888, NEED_PIXBUF },
4919 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_Composite_over_pixbuf_0565, NEED_PIXBUF },
4920 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_r5g6b5, sse2_Composite_over_pixbuf_0565, NEED_PIXBUF },
4921 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_b5g6r5, sse2_Composite_over_pixbuf_0565, NEED_PIXBUF },
4922 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_Composite_over_pixbuf_0565, NEED_PIXBUF },
/* x888-over-x888 with no mask degenerates to a plain copy. */
4923 { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_CompositeCopyArea, 0 },
4924 { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_CompositeCopyArea, 0 },
4926 { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_CompositeAdd_8000_8000, 0 },
4927 { PIXMAN_OP_ADD, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_CompositeAdd_8888_8888, 0 },
4928 { PIXMAN_OP_ADD, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_CompositeAdd_8888_8888, 0 },
4929 { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_CompositeAdd_8888_8_8, 0 },
4931 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_CompositeSrc_n_8_8888, 0 },
4932 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_CompositeSrc_n_8_8888, 0 },
4933 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_CompositeSrc_n_8_8888, 0 },
4934 { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_CompositeSrc_n_8_8888, 0 },
4935 { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_CompositeCopyArea, 0 },
4936 { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_CompositeCopyArea, 0 },
4937 { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_CompositeCopyArea, 0 },
4938 { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_CompositeCopyArea, 0 },
4939 { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_CompositeCopyArea, 0 },
4940 { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_CompositeCopyArea, 0 },
4941 { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, sse2_CompositeCopyArea, 0 },
4942 { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, sse2_CompositeCopyArea, 0 },
4944 { PIXMAN_OP_IN, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_CompositeIn_8_8, 0 },
4945 { PIXMAN_OP_IN, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_CompositeIn_n_8_8, 0 },
4951 * Work around GCC bug causing crashes in Mozilla with SSE2
4953 * When using -msse, gcc generates movdqa instructions assuming that
4954 * the stack is 16 byte aligned. Unfortunately some applications, such
4955 * as Mozilla and Mono, end up aligning the stack to 4 bytes, which
4956 * causes the movdqa instructions to fail.
4958 * The __force_align_arg_pointer__ makes gcc generate a prologue that
4959 * realigns the stack pointer to 16 bytes.
4961 * On x86-64 this is not necessary because the standard ABI already
4962 * calls for a 16 byte aligned stack.
4964 * See https://bugs.freedesktop.org/show_bug.cgi?id=15693
/* Realign the stack to 16 bytes on 32-bit GCC; see the bug note above
 * (movdqa requires 16-byte-aligned stack slots). */
4966 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
4967 __attribute__((__force_align_arg_pointer__))
/* Composite entry point for the SSE2 implementation: try the fast-path
 * table first; on a miss, fall through to the delegate implementation
 * (tail of the body is elided in this excerpt). */
4970 sse2_composite (pixman_implementation_t *imp,
4972 pixman_image_t *src,
4973 pixman_image_t *mask,
4974 pixman_image_t *dest,
4984 if (_pixman_run_fast_path (sse2_fast_paths, imp,
4985 op, src, mask, dest,
4994 _pixman_implementation_composite (imp->delegate, op,
/* Same 32-bit GCC stack-realignment workaround as above. */
5002 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5003 __attribute__((__force_align_arg_pointer__))
/* blt hook: attempt the SSE2 blitter; if it declines (unsupported depth),
 * delegate to the next implementation in the chain. */
5005 static pixman_bool_t
5006 sse2_blt (pixman_implementation_t *imp,
5013 int src_x, int src_y,
5014 int dst_x, int dst_y,
5015 int width, int height)
5017 if (!pixmanBltsse2 (
5018 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5019 src_x, src_y, dst_x, dst_y, width, height))
5022 return _pixman_implementation_blt (
5024 src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
5025 src_x, src_y, dst_x, dst_y, width, height);
/* Same 32-bit GCC stack-realignment workaround as above. */
5031 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
5032 __attribute__((__force_align_arg_pointer__))
/* fill hook: attempt the SSE2 solid fill; if it declines, delegate. */
5034 static pixman_bool_t
5035 sse2_fill (pixman_implementation_t *imp,
5045 if (!pixmanFillsse2 (bits, stride, bpp, x, y, width, height, xor))
5047 return _pixman_implementation_fill (
5048 imp->delegate, bits, stride, bpp, x, y, width, height, xor);
/* Construct the SSE2 implementation, delegating anything it cannot handle
 * to the MMX implementation.  Initializes the module-level SSE2/MMX
 * constant masks (they must be set before any compositor runs), then
 * installs the combiner and composite/blt/fill entry points.  (The final
 * `return imp;` and closing brace are elided in this excerpt.) */
5054 pixman_implementation_t *
5055 _pixman_implementation_create_sse2 (void)
5057 pixman_implementation_t *mmx = _pixman_implementation_create_mmx ();
5058 pixman_implementation_t *imp = _pixman_implementation_create (mmx);
5060 /* SSE2 constants */
/* r5g6b5 channel extraction/packing masks (values positioned within the
 * 8888 intermediate layout). */
5061 Mask565r = createMask_2x32_128 (0x00f80000, 0x00f80000);
5062 Mask565g1 = createMask_2x32_128 (0x00070000, 0x00070000);
5063 Mask565g2 = createMask_2x32_128 (0x000000e0, 0x000000e0);
5064 Mask565b = createMask_2x32_128 (0x0000001f, 0x0000001f);
5065 MaskRed = createMask_2x32_128 (0x00f80000, 0x00f80000);
5066 MaskGreen = createMask_2x32_128 (0x0000fc00, 0x0000fc00);
5067 MaskBlue = createMask_2x32_128 (0x000000f8, 0x000000f8);
/* Correction masks used when expanding 5/6-bit channels to 8 bits. */
5068 Mask565FixRB = createMask_2x32_128 (0x00e000e0, 0x00e000e0);
5069 Mask565FixG = createMask_2x32_128 (0x0000c000, 0x0000c000);
/* Per-16-bit-lane constants for the x/255 rounding idiom
 * ((x + 0x80) * 0x101 >> 16) and for channel saturation. */
5070 Mask0080 = createMask_16_128 (0x0080);
5071 Mask00ff = createMask_16_128 (0x00ff);
5072 Mask0101 = createMask_16_128 (0x0101);
5073 Maskffff = createMask_16_128 (0xffff);
5074 Maskff000000 = createMask_2x32_128 (0xff000000, 0xff000000);
5075 MaskAlpha = createMask_2x32_128 (0x00ff0000, 0x00000000);
/* MMX-width (64-bit) counterparts used by the single-pixel helpers. */
5078 xMask565rgb = createMask_2x32_64 (0x000001f0, 0x003f001f);
5079 xMask565Unpack = createMask_2x32_64 (0x00000084, 0x04100840);
5081 xMask0080 = createMask_16_64 (0x0080);
5082 xMask00ff = createMask_16_64 (0x00ff);
5083 xMask0101 = createMask_16_64 (0x0101);
5084 xMaskAlpha = createMask_2x32_64 (0x00ff0000, 0x00000000);
5088 /* Set up function pointers */
5090 /* SSE code patch for fbcompose.c */
/* Unified (non-component-alpha) combiners. */
5091 imp->combine_32[PIXMAN_OP_OVER] = sse2CombineOverU;
5092 imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2CombineOverReverseU;
5093 imp->combine_32[PIXMAN_OP_IN] = sse2CombineInU;
5094 imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2CombineInReverseU;
5095 imp->combine_32[PIXMAN_OP_OUT] = sse2CombineOutU;
5096 imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2CombineOutReverseU;
5097 imp->combine_32[PIXMAN_OP_ATOP] = sse2CombineAtopU;
5098 imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2CombineAtopReverseU;
5099 imp->combine_32[PIXMAN_OP_XOR] = sse2CombineXorU;
5100 imp->combine_32[PIXMAN_OP_ADD] = sse2CombineAddU;
5102 imp->combine_32[PIXMAN_OP_SATURATE] = sse2CombineSaturateU;
/* Component-alpha combiners. */
5104 imp->combine_32_ca[PIXMAN_OP_SRC] = sse2CombineSrcC;
5105 imp->combine_32_ca[PIXMAN_OP_OVER] = sse2CombineOverC;
5106 imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2CombineOverReverseC;
5107 imp->combine_32_ca[PIXMAN_OP_IN] = sse2CombineInC;
5108 imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2CombineInReverseC;
5109 imp->combine_32_ca[PIXMAN_OP_OUT] = sse2CombineOutC;
5110 imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2CombineOutReverseC;
5111 imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2CombineAtopC;
5112 imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2CombineAtopReverseC;
5113 imp->combine_32_ca[PIXMAN_OP_XOR] = sse2CombineXorC;
5114 imp->combine_32_ca[PIXMAN_OP_ADD] = sse2CombineAddC;
/* High-level entry points. */
5116 imp->composite = sse2_composite;
5117 imp->blt = sse2_blt;
5118 imp->fill = sse2_fill;
5123 #endif /* USE_SSE2 */