/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission. Red Hat makes no representations about the
 * suitability of this software for any purpose. It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author: Rodrigo Kumpera (kumpera@gmail.com)
 *         André Tupinambá (andrelrt@gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */
#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */

#include "pixman-private.h"
#include "pixman-combine32.h"
/* -------------------------------------------------------------------------------------------------
 * Locals
 */

static __m64 mask_x0080;
static __m64 mask_x00ff;
static __m64 mask_x0101;
static __m64 mask_xAlpha;

static __m64 mask_x565rgb;
static __m64 mask_x565Unpack;

static __m128i Mask0080;
static __m128i Mask00ff;
static __m128i Mask0101;
static __m128i Maskffff;
static __m128i Maskff000000;
static __m128i MaskAlpha;

static __m128i Mask565r;
static __m128i Mask565g1, Mask565g2;
static __m128i Mask565b;
static __m128i MaskRed;
static __m128i MaskGreen;
static __m128i MaskBlue;

static __m128i Mask565FixRB;
static __m128i Mask565FixG;
/* -------------------------------------------------------------------------------------------------
 * SSE2 Inlines
 */
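/*
 * Naming convention used throughout (summary added for readability):
 * pixels are processed in a 16-bits-per-channel intermediate format, so
 * one a8r8g8b8 pixel occupies the four low 16-bit words of a register.
 * "_1x128" helpers operate on one such unpacked pixel in an SSE2
 * register, "_2x128" helpers on a lo/hi pair of registers holding four
 * pixels, and the "_1x64" MMX helpers further below handle the single
 * pixels in the unaligned head and tail of each loop.
 */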
static force_inline __m128i
unpack_32_1x128 (uint32_t data)
{
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
}

static force_inline void
unpack_128_2x128 (__m128i data, __m128i* dataLo, __m128i* dataHi)
{
    *dataLo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    *dataHi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}
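/*
 * How the 565 expansion below works (note added for clarity): each field
 * is shifted to the top of its 8-bit slot, then its own high bits are
 * shifted down and OR'ed back in to fill the vacated low bits, so that
 * 5-bit 0x1f expands to 0xff rather than 0xf8. Mask565FixRB and
 * Mask565FixG select exactly those replicated bits.
 */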
static force_inline __m128i
unpack565to8888 (__m128i lo)
{
    __m128i r, g, b, rb, t;

    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), MaskRed);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), MaskGreen);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), MaskBlue);

    rb = _mm_or_si128 (r, b);
    t  = _mm_and_si128 (rb, Mask565FixRB);
    t  = _mm_srli_epi32 (t, 5);
    rb = _mm_or_si128 (rb, t);

    t = _mm_and_si128 (g, Mask565FixG);
    t = _mm_srli_epi32 (t, 6);
    g = _mm_or_si128 (g, t);

    return _mm_or_si128 (rb, g);
}

static force_inline void
unpack565_128_4x128 (__m128i data, __m128i* data0, __m128i* data1, __m128i* data2, __m128i* data3)
{
    __m128i lo, hi;

    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

    lo = unpack565to8888 (lo);
    hi = unpack565to8888 (hi);

    unpack_128_2x128 (lo, data0, data1);
    unpack_128_2x128 (hi, data2, data3);
}

static force_inline uint16_t
pack565_32_16 (uint32_t pixel)
{
    return (uint16_t) (((pixel >> 8) & 0xf800) | ((pixel >> 5) & 0x07e0) | ((pixel >> 3) & 0x001f));
}

static force_inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    return _mm_packus_epi16 (lo, hi);
}

static force_inline __m128i
pack565_2x128_128 (__m128i lo, __m128i hi)
{
    __m128i data;
    __m128i r, g1, g2, b;

    data = pack_2x128_128 (lo, hi);

    r  = _mm_and_si128 (data, Mask565r);
    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), Mask565g1);
    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), Mask565g2);
    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), Mask565b);

    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}

static force_inline __m128i
pack565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
{
    return _mm_packus_epi16 (pack565_2x128_128 (*xmm0, *xmm1), pack565_2x128_128 (*xmm2, *xmm3));
}
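/*
 * The predicates below use _mm_movemask_epi8, which gathers the top bit
 * of all 16 bytes into one integer. For 32-bit pixels, masking with
 * 0x8888 keeps only byte 3 of each pixel - the alpha byte - so one
 * movemask tests the alpha of four pixels at once.
 */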
static force_inline int
isOpaque (__m128i x)
{
    __m128i ffs = _mm_cmpeq_epi8 (x, x);

    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
}

static force_inline int
isZero (__m128i x)
{
    return _mm_movemask_epi8 (_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
}

static force_inline int
isTransparent (__m128i x)
{
    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
}

static force_inline __m128i
expandPixel_32_1x128 (uint32_t data)
{
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
}

static force_inline __m128i
expandAlpha_1x128 (__m128i data)
{
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 3, 3, 3)), _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expandAlpha_2x128 (__m128i dataLo, __m128i dataHi, __m128i* alphaLo, __m128i* alphaHi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE (3, 3, 3, 3));
    hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE (3, 3, 3, 3));
    *alphaLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
    *alphaHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expandAlphaRev_2x128 (__m128i dataLo, __m128i dataHi, __m128i* alphaLo, __m128i* alphaHi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE (0, 0, 0, 0));
    hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE (0, 0, 0, 0));
    *alphaLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
    *alphaHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
}
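/*
 * pixMultiply computes round (a * b / 255) in 16-bit lanes. With
 * t = a * b, the expression (t + 0x80) * 0x0101 >> 16 equals
 * round (t / 255) exactly. Worked example: a = b = 0xff gives
 * t = 0xfe01; t + 0x80 = 0xfe81; 0xfe81 * 0x0101 = 0xff7f81;
 * >> 16 = 0xff, i.e. 255 * 255 / 255 = 255. Mask0080 holds 0x0080 in
 * every lane and Mask0101 holds 0x0101.
 */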
static force_inline void
pixMultiply_2x128 (__m128i* dataLo, __m128i* dataHi, __m128i* alphaLo, __m128i* alphaHi, __m128i* retLo, __m128i* retHi)
{
    __m128i lo, hi;

    lo = _mm_mullo_epi16 (*dataLo, *alphaLo);
    hi = _mm_mullo_epi16 (*dataHi, *alphaHi);
    lo = _mm_adds_epu16 (lo, Mask0080);
    hi = _mm_adds_epu16 (hi, Mask0080);
    *retLo = _mm_mulhi_epu16 (lo, Mask0101);
    *retHi = _mm_mulhi_epu16 (hi, Mask0101);
}

static force_inline void
pixAddMultiply_2x128 (__m128i* srcLo, __m128i* srcHi, __m128i* alphaDstLo, __m128i* alphaDstHi,
                      __m128i* dstLo, __m128i* dstHi, __m128i* alphaSrcLo, __m128i* alphaSrcHi,
                      __m128i* retLo, __m128i* retHi)
{
    __m128i lo, hi;
    __m128i mulLo, mulHi;

    lo = _mm_mullo_epi16 (*srcLo, *alphaDstLo);
    hi = _mm_mullo_epi16 (*srcHi, *alphaDstHi);
    mulLo = _mm_mullo_epi16 (*dstLo, *alphaSrcLo);
    mulHi = _mm_mullo_epi16 (*dstHi, *alphaSrcHi);
    lo = _mm_adds_epu16 (lo, Mask0080);
    hi = _mm_adds_epu16 (hi, Mask0080);
    lo = _mm_adds_epu16 (lo, mulLo);
    hi = _mm_adds_epu16 (hi, mulHi);
    *retLo = _mm_mulhi_epu16 (lo, Mask0101);
    *retHi = _mm_mulhi_epu16 (hi, Mask0101);
}

static force_inline void
negate_2x128 (__m128i dataLo, __m128i dataHi, __m128i* negLo, __m128i* negHi)
{
    *negLo = _mm_xor_si128 (dataLo, Mask00ff);
    *negHi = _mm_xor_si128 (dataHi, Mask00ff);
}

static force_inline void
invertColors_2x128 (__m128i dataLo, __m128i dataHi, __m128i* invLo, __m128i* invHi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE (3, 0, 1, 2));
    hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE (3, 0, 1, 2));
    *invLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
    *invHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline void
over_2x128 (__m128i* srcLo, __m128i* srcHi, __m128i* alphaLo, __m128i* alphaHi, __m128i* dstLo, __m128i* dstHi)
{
    __m128i t1, t2;

    negate_2x128 (*alphaLo, *alphaHi, &t1, &t2);

    pixMultiply_2x128 (dstLo, dstHi, &t1, &t2, dstLo, dstHi);

    *dstLo = _mm_adds_epu8 (*srcLo, *dstLo);
    *dstHi = _mm_adds_epu8 (*srcHi, *dstHi);
}
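/*
 * over_2x128 is the premultiplied OVER operator,
 * dst = src + (1 - src.alpha) * dst: negate_2x128 forms (0xff - alpha),
 * pixMultiply scales the destination, and the final per-byte add
 * saturates.
 */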
static force_inline void
overRevNonPre_2x128 (__m128i srcLo, __m128i srcHi, __m128i* dstLo, __m128i* dstHi)
{
    __m128i lo, hi;
    __m128i alphaLo, alphaHi;

    expandAlpha_2x128 (srcLo, srcHi, &alphaLo, &alphaHi);

    lo = _mm_or_si128 (alphaLo, MaskAlpha);
    hi = _mm_or_si128 (alphaHi, MaskAlpha);

    invertColors_2x128 (srcLo, srcHi, &srcLo, &srcHi);

    pixMultiply_2x128 (&srcLo, &srcHi, &lo, &hi, &lo, &hi);

    over_2x128 (&lo, &hi, &alphaLo, &alphaHi, dstLo, dstHi);
}

static force_inline void
inOver_2x128 (__m128i* srcLo, __m128i* srcHi, __m128i* alphaLo, __m128i* alphaHi,
              __m128i* maskLo, __m128i* maskHi, __m128i* dstLo, __m128i* dstHi)
{
    __m128i sLo, sHi;
    __m128i aLo, aHi;

    pixMultiply_2x128 (srcLo, srcHi, maskLo, maskHi, &sLo, &sHi);
    pixMultiply_2x128 (alphaLo, alphaHi, maskLo, maskHi, &aLo, &aHi);

    over_2x128 (&sLo, &sHi, &aLo, &aHi, dstLo, dstHi);
}

static force_inline void
cachePrefetch (__m128i* addr)
{
    _mm_prefetch (addr, _MM_HINT_T0);
}

static force_inline void
cachePrefetchNext (__m128i* addr)
{
    _mm_prefetch (addr + 4, _MM_HINT_T0); /* 64 bytes ahead */
}

/* load 4 pixels from a 16-byte aligned address */
static force_inline __m128i
load128Aligned (__m128i* src)
{
    return _mm_load_si128 (src);
}

/* load 4 pixels from an unaligned address */
static force_inline __m128i
load128Unaligned (const __m128i* src)
{
    return _mm_loadu_si128 (src);
}

/* save 4 pixels using Write Combining memory on a 16-byte aligned address */
static force_inline void
save128WriteCombining (__m128i* dst, __m128i data)
{
    _mm_stream_si128 (dst, data);
}
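/*
 * Note: _mm_stream_si128 is a non-temporal store; the written pixels
 * bypass the cache, which pays off only when the destination is not
 * read back soon afterwards.
 */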
/* save 4 pixels on a 16-byte aligned address */
static force_inline void
save128Aligned (__m128i* dst, __m128i data)
{
    _mm_store_si128 (dst, data);
}

/* save 4 pixels on an unaligned address */
static force_inline void
save128Unaligned (__m128i* dst, __m128i data)
{
    _mm_storeu_si128 (dst, data);
}
/* -------------------------------------------------------------------------------------------------
 * MMX inlines
 */
static force_inline __m64
unpack_32_1x64 (uint32_t data)
{
    return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64 ());
}

static force_inline __m64
expandAlpha_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline __m64
expandAlphaRev_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m64
expandPixel_8_1x64 (uint8_t data)
{
    return _mm_shuffle_pi16 (unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m64
pixMultiply_1x64 (__m64 data, __m64 alpha)
{
    return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
                                          mask_x0080),
                           mask_x0101);
}

static force_inline __m64
pixAddMultiply_1x64 (__m64* src, __m64* alphaDst, __m64* dst, __m64* alphaSrc)
{
    return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (*src, *alphaDst),
                                                         mask_x0080),
                                          _mm_mullo_pi16 (*dst, *alphaSrc)),
                           mask_x0101);
}

static force_inline __m64
negate_1x64 (__m64 data)
{
    return _mm_xor_si64 (data, mask_x00ff);
}

static force_inline __m64
invertColors_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline __m64
over_1x64 (__m64 src, __m64 alpha, __m64 dst)
{
    return _mm_adds_pu8 (src, pixMultiply_1x64 (dst, negate_1x64 (alpha)));
}

static force_inline __m64
inOver_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
{
    return over_1x64 (pixMultiply_1x64 (*src, *mask),
                      pixMultiply_1x64 (*alpha, *mask),
                      *dst);
}

static force_inline __m64
overRevNonPre_1x64 (__m64 src, __m64 dst)
{
    __m64 alpha = expandAlpha_1x64 (src);

    return over_1x64 (pixMultiply_1x64 (invertColors_1x64 (src),
                                        _mm_or_si64 (alpha, mask_xAlpha)),
                      alpha,
                      dst);
}

static force_inline uint32_t
pack_1x64_32 (__m64 data)
{
    return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
}
/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
 *
 *    00RR00GG00BB
 *
 * --- Expanding 565 in the low word ---
 *
 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
 * m = m & (01f0003f001f);
 * m = m * (008404100840);
 * m = m >> 8;
 *
 * Note the trick here - the top word is shifted by another nibble to
 * avoid it bumping into the middle word
 */
static force_inline __m64
expand565_16_1x64 (uint16_t pixel)
{
    __m64 p, t1, t2;

    p = _mm_cvtsi32_si64 ((uint32_t) pixel);

    t1 = _mm_slli_si64 (p, 36 - 11);
    t2 = _mm_slli_si64 (p, 16 - 5);

    p = _mm_or_si64 (t1, p);
    p = _mm_or_si64 (t2, p);
    p = _mm_and_si64 (p, mask_x565rgb);
    p = _mm_mullo_pi16 (p, mask_x565Unpack);

    return _mm_srli_pi16 (p, 8);
}
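/*
 * Worked example (added): pixel = 0xf800, pure red. After the shifts and
 * the AND with mask_x565rgb, the red field sits in the third word as
 * 0x01f0; multiplying by 0x0084 gives 0xffc0, and the final >> 8 yields
 * 0xff - 5-bit 31 expands to 8-bit 255.
 */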
/* -------------------------------------------------------------------------------------------------
 * Compose Core transformations
 */
static force_inline uint32_t
coreCombineOverUPixelsse2 (uint32_t src, uint32_t dst)
{
    __m64 ms;

    ms = unpack_32_1x64 (src);
    return pack_1x64_32 (over_1x64 (ms, expandAlpha_1x64 (ms), unpack_32_1x64 (dst)));
}
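/*
 * combine1/combine4 feed every *U combiner below: they fetch one (or
 * four) source pixels and, when a mask pointer is supplied, multiply the
 * source by the mask's expanded alpha, so the operators themselves never
 * have to look at the mask again.
 */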
static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
    uint32_t s = *ps;
    if (pm)
    {
        __m64 ms, mm;
        mm = unpack_32_1x64 (*pm);
        mm = expandAlpha_1x64 (mm);
        ms = unpack_32_1x64 (s);
        ms = pixMultiply_1x64 (ms, mm);
        s = pack_1x64_32 (ms);
    }
    return s;
}

static force_inline __m128i
combine4 (const __m128i *ps, const __m128i *pm)
{
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmMskLo, xmmMskHi;
    __m128i s;
    if (pm)
    {
        xmmMskLo = load128Unaligned (pm);
        if (isTransparent (xmmMskLo))
            return _mm_setzero_si128 ();
    }
    s = load128Unaligned (ps);
    if (pm)
    {
        unpack_128_2x128 (s, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMskLo, &xmmMskLo, &xmmMskHi);
        expandAlpha_2x128 (xmmMskLo, xmmMskHi, &xmmMskLo, &xmmMskHi);
        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMskLo, &xmmMskHi, &xmmSrcLo, &xmmSrcHi);
        s = pack_2x128_128 (xmmSrcLo, xmmSrcHi);
    }
    return s;
}
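/*
 * Every coreCombine*Usse2 routine follows the same skeleton, shown in
 * full in coreCombineOverUsse2 below: a scalar head loop runs until pd
 * reaches a 16-byte boundary, a vector loop then processes four pixels
 * per iteration with prefetching, and a scalar tail loop finishes the
 * remainder. Only the per-pixel operator changes from routine to routine.
 */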
static force_inline void
coreCombineOverUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
{
    uint32_t s, d;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmAlphaLo, xmmAlphaHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);
        *pd++ = coreCombineOverUPixelsse2 (s, d);
        ps++;
        if (pm)
            pm++;
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        /* I'm loading unaligned because I'm not sure about the address alignment. */
        xmmSrcHi = combine4 ((__m128i*)ps, (__m128i*)pm);

        if (isOpaque (xmmSrcHi))
        {
            save128Aligned ((__m128i*)pd, xmmSrcHi);
        }
        else if (!isZero (xmmSrcHi))
        {
            xmmDstHi = load128Aligned ((__m128i*) pd);

            unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
            unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

            expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);

            over_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);

            /* rebuild the 4 pixel data and save */
            save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
        }

        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;
        w -= 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);
        *pd++ = coreCombineOverUPixelsse2 (s, d);
        ps++;
        if (pm)
            pm++;
        w--;
    }
}
static force_inline void
coreCombineOverReverseUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmAlphaLo, xmmAlphaHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    /* Align dst on a 16-byte boundary */
    while (w && ((unsigned long)pd & 15))
        s = combine1 (ps, pm);
        *pd++ = coreCombineOverUPixelsse2 (d, s);

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        /* I'm loading unaligned because I'm not sure about the address alignment. */
        xmmSrcHi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmmDstHi = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);

        over_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmSrcLo, &xmmSrcHi);

        /* rebuild the 4 pixel data and save */
        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmSrcLo, xmmSrcHi));

    s = combine1 (ps, pm);
    *pd++ = coreCombineOverUPixelsse2 (d, s);
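/*
 * IN: coreCombineInUPixelsse2 (a, b) computes b * a.alpha, with
 * shortcuts for a.alpha == 0 (result 0) and a.alpha == 0xff (result b).
 * coreCombineInUsse2 calls it as (d, s) to get src IN dst, and
 * coreCombineReverseInUsse2 as (s, d) for the reverse operator.
 */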
static force_inline uint32_t
coreCombineInUPixelsse2 (uint32_t src, uint32_t dst)
{
    uint32_t maska = src >> 24;

    if (maska == 0)
    {
        return 0;
    }
    else if (maska != 0xff)
    {
        return pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (dst), expandAlpha_1x64 (unpack_32_1x64 (src))));
    }

    return dst;
}

static force_inline void
coreCombineInUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && ((unsigned long)pd & 15))
        s = combine1 (ps, pm);
        *pd++ = coreCombineInUPixelsse2 (d, s);

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*) pd);
        xmmSrcHi = combine4 ((__m128i*) ps, (__m128i*) pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

    s = combine1 (ps, pm);
    *pd++ = coreCombineInUPixelsse2 (d, s);

static force_inline void
coreCombineReverseInUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && ((unsigned long)pd & 15))
        s = combine1 (ps, pm);
        *pd++ = coreCombineInUPixelsse2 (s, d);

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*) pd);
        xmmSrcHi = combine4 ((__m128i*) ps, (__m128i*)pm);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

    s = combine1 (ps, pm);
    *pd++ = coreCombineInUPixelsse2 (s, d);
static force_inline void
coreCombineReverseOutUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && ((unsigned long)pd & 15))
        uint32_t s = combine1 (ps, pm);
        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (s)))));

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

        __m128i xmmSrcLo, xmmSrcHi;
        __m128i xmmDstLo, xmmDstHi;

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmSrcHi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmmDstHi = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        negate_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);

        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

    uint32_t s = combine1 (ps, pm);
    *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (s)))));

static force_inline void
coreCombineOutUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && ((unsigned long)pd & 15))
        uint32_t s = combine1 (ps, pm);
        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

        __m128i xmmSrcLo, xmmSrcHi;
        __m128i xmmDstLo, xmmDstHi;

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmSrcHi = combine4 ((__m128i*) ps, (__m128i*)pm);
        xmmDstHi = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);
        negate_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

    uint32_t s = combine1 (ps, pm);
    *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
static force_inline uint32_t
coreCombineAtopUPixelsse2 (uint32_t src, uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 sa = negate_1x64 (expandAlpha_1x64 (s));
    __m64 da = expandAlpha_1x64 (d);

    return pack_1x64_32 (pixAddMultiply_1x64 (&s, &da, &d, &sa));
}

static force_inline void
coreCombineAtopUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && ((unsigned long)pd & 15))
        s = combine1 (ps, pm);
        *pd++ = coreCombineAtopUPixelsse2 (s, d);

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmSrcHi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmmDstHi = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        negate_2x128 (xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);

        pixAddMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
                              &xmmDstLo, &xmmDstHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi,
                              &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

    s = combine1 (ps, pm);
    *pd++ = coreCombineAtopUPixelsse2 (s, d);

static force_inline uint32_t
coreCombineReverseAtopUPixelsse2 (uint32_t src, uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 sa = expandAlpha_1x64 (s);
    __m64 da = negate_1x64 (expandAlpha_1x64 (d));

    return pack_1x64_32 (pixAddMultiply_1x64 (&s, &da, &d, &sa));
}

static force_inline void
coreCombineReverseAtopUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && ((unsigned long)pd & 15))
        s = combine1 (ps, pm);
        *pd++ = coreCombineReverseAtopUPixelsse2 (s, d);

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmSrcHi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmmDstHi = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixAddMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
                              &xmmDstLo, &xmmDstHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi,
                              &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

    s = combine1 (ps, pm);
    *pd++ = coreCombineReverseAtopUPixelsse2 (s, d);
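/*
 * XOR: result = src * (1 - dst.alpha) + dst * (1 - src.alpha); both
 * alphas are negated and the two products are summed in a single
 * pixAddMultiply pass.
 */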
static force_inline uint32_t
coreCombineXorUPixelsse2 (uint32_t src, uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 negD = negate_1x64 (expandAlpha_1x64 (d));
    __m64 negS = negate_1x64 (expandAlpha_1x64 (s));

    return pack_1x64_32 (pixAddMultiply_1x64 (&s, &negD, &d, &negS));
}

static force_inline void
coreCombineXorUsse2 (uint32_t* dst, const uint32_t* src, const uint32_t *mask, int width)
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
    __m128i xmmDst, xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && ((unsigned long)pd & 15))
        s = combine1 (ps, pm);
        *pd++ = coreCombineXorUPixelsse2 (s, d);

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmSrc = combine4 ((__m128i*) ps, (__m128i*) pm);
        xmmDst = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        negate_2x128 (xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixAddMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
                              &xmmDstLo, &xmmDstHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi,
                              &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

    s = combine1 (ps, pm);
    *pd++ = coreCombineXorUPixelsse2 (s, d);
static force_inline void
coreCombineAddUsse2 (uint32_t* dst, const uint32_t* src, const uint32_t* mask, int width)
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
        s = combine1 (ps, pm);
        *pd++ = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        s = combine4 ((__m128i*)ps, (__m128i*)pm);

        save128Aligned ((__m128i*)pd,
                        _mm_adds_epu8 (s, load128Aligned ((__m128i*)pd)));

    s = combine1 (ps, pm);
    *pd++ = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
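/*
 * SATURATE: like ADD, except that a source pixel whose alpha exceeds the
 * room left in the destination (~dst.alpha) is first scaled by
 * DIV_UN8 (da, sa) so the sum cannot overflow. The vector loop compares
 * all four alphas at once and only falls back to the scalar helper when
 * at least one pixel needs the correction.
 */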
static force_inline uint32_t
coreCombineSaturateUPixelsse2 (uint32_t src, uint32_t dst)
{
    __m64 ms = unpack_32_1x64 (src);
    __m64 md = unpack_32_1x64 (dst);
    uint32_t sa = src >> 24;
    uint32_t da = ~dst >> 24;

    if (sa > da)
        ms = pixMultiply_1x64 (ms, expandAlpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24)));

    return pack_1x64_32 (_mm_adds_pu16 (md, ms));
}

static force_inline void
coreCombineSaturateUsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
    uint32_t s, d;
    uint32_t packCmp;

    __m128i xmmSrc, xmmDst;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
        s = combine1 (ps, pm);
        *pd++ = coreCombineSaturateUPixelsse2 (s, d);

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDst = load128Aligned ((__m128i*)pd);
        xmmSrc = combine4 ((__m128i*)ps, (__m128i*)pm);

        packCmp = _mm_movemask_epi8 (_mm_cmpgt_epi32 (_mm_srli_epi32 (xmmSrc, 24),
                                                      _mm_srli_epi32 (_mm_xor_si128 (xmmDst, Maskff000000), 24)));

        /* if some source alpha is greater than the corresponding ~destination alpha */
        if (packCmp)
        {
            s = combine1 (ps++, pm);
            *pd++ = coreCombineSaturateUPixelsse2 (s, d);

            s = combine1 (ps++, pm);
            *pd++ = coreCombineSaturateUPixelsse2 (s, d);

            s = combine1 (ps++, pm);
            *pd++ = coreCombineSaturateUPixelsse2 (s, d);

            s = combine1 (ps++, pm);
            *pd++ = coreCombineSaturateUPixelsse2 (s, d);
        }
        else
        {
            save128Aligned ((__m128i*)pd, _mm_adds_epu8 (xmmDst, xmmSrc));
        }

    s = combine1 (ps, pm);
    *pd++ = coreCombineSaturateUPixelsse2 (s, d);
static force_inline void
coreCombineSrcCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmMaskLo, xmmMaskHi;
    __m128i xmmDstLo, xmmDstHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

    *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));

static force_inline uint32_t
coreCombineOverCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 expAlpha = expandAlpha_1x64 (s);
    __m64 unpkMask = unpack_32_1x64 (mask);
    __m64 unpkDst = unpack_32_1x64 (dst);

    return pack_1x64_32 (inOver_1x64 (&s, &expAlpha, &unpkMask, &unpkDst));
}

static force_inline void
coreCombineOverCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
        *pd++ = coreCombineOverCPixelsse2 (s, m, d);

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);

        inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

    *pd++ = coreCombineOverCPixelsse2 (s, m, d);
static force_inline uint32_t
coreCombineOverReverseCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
{
    __m64 d = unpack_32_1x64 (dst);

    return pack_1x64_32 (over_1x64 (d, expandAlpha_1x64 (d), pixMultiply_1x64 (unpack_32_1x64 (src), unpack_32_1x64 (mask))));
}

static force_inline void
coreCombineOverReverseCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
        *pd++ = coreCombineOverReverseCPixelsse2 (s, m, d);

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        over_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMaskLo, &xmmMaskHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmMaskLo, xmmMaskHi));

    *pd++ = coreCombineOverReverseCPixelsse2 (s, m, d);

static force_inline void
coreCombineInCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
        *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
                                                expandAlpha_1x64 (unpack_32_1x64 (d))));

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);

        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

    *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
                                            expandAlpha_1x64 (unpack_32_1x64 (d))));

static force_inline void
coreCombineInReverseCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
                                                pixMultiply_1x64 (unpack_32_1x64 (m),
                                                                  expandAlpha_1x64 (unpack_32_1x64 (s)))));

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
        pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaLo, &xmmAlphaHi, &xmmAlphaLo, &xmmAlphaHi);

        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

    *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
                                            pixMultiply_1x64 (unpack_32_1x64 (m),
                                                              expandAlpha_1x64 (unpack_32_1x64 (s)))));
static force_inline void
coreCombineOutCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
        *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
                                                negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
        negate_2x128 (xmmAlphaLo, xmmAlphaHi, &xmmAlphaLo, &xmmAlphaHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);
        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

    *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
                                            negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));

static force_inline void
coreCombineOutReverseCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
                                                negate_1x64 (pixMultiply_1x64 (unpack_32_1x64 (m),
                                                                               expandAlpha_1x64 (unpack_32_1x64 (s))))));

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);

        pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMaskLo, &xmmMaskHi);

        negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

    *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
                                            negate_1x64 (pixMultiply_1x64 (unpack_32_1x64 (m),
                                                                           expandAlpha_1x64 (unpack_32_1x64 (s))))));
static force_inline uint32_t
coreCombineAtopCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
{
    __m64 m = unpack_32_1x64 (mask);
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);
    __m64 sa = expandAlpha_1x64 (s);
    __m64 da = expandAlpha_1x64 (d);

    s = pixMultiply_1x64 (s, m);
    m = negate_1x64 (pixMultiply_1x64 (m, sa));

    return pack_1x64_32 (pixAddMultiply_1x64 (&d, &m, &s, &da));
}

static force_inline void
coreCombineAtopCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
        *pd++ = coreCombineAtopCPixelsse2 (s, m, d);

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
        pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi);

        negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        pixAddMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi,
                              &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
                              &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

    *pd++ = coreCombineAtopCPixelsse2 (s, m, d);

static force_inline uint32_t
coreCombineReverseAtopCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
{
    __m64 m = unpack_32_1x64 (mask);
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 da = negate_1x64 (expandAlpha_1x64 (d));
    __m64 sa = expandAlpha_1x64 (s);

    s = pixMultiply_1x64 (s, m);
    m = pixMultiply_1x64 (m, sa);

    return pack_1x64_32 (pixAddMultiply_1x64 (&d, &m, &s, &da));
}

static force_inline void
coreCombineReverseAtopCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
        *pd++ = coreCombineReverseAtopCPixelsse2 (s, m, d);

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
        pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi);

        negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixAddMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi,
                              &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
                              &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

    *pd++ = coreCombineReverseAtopCPixelsse2 (s, m, d);

static force_inline uint32_t
coreCombineXorCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
{
    __m64 a = unpack_32_1x64 (mask);
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 alphaDst = negate_1x64 (pixMultiply_1x64 (a, expandAlpha_1x64 (s)));
    __m64 dest = pixMultiply_1x64 (s, a);
    __m64 alphaSrc = negate_1x64 (expandAlpha_1x64 (d));

    return pack_1x64_32 (pixAddMultiply_1x64 (&d,
                                              &alphaDst,
                                              &dest,
                                              &alphaSrc));
}
static force_inline void
coreCombineXorCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
        *pd++ = coreCombineXorCPixelsse2 (s, m, d);

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
        pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi);

        negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
        negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        pixAddMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi,
                              &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
                              &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

    *pd++ = coreCombineXorCPixelsse2 (s, m, d);

static force_inline void
coreCombineAddCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
        *pd++ = pack_1x64_32 (_mm_adds_pu8 (pixMultiply_1x64 (unpack_32_1x64 (s),
                                                              unpack_32_1x64 (m)),
                                            unpack_32_1x64 (d)));

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);
        xmmDstHi = load128Aligned ((__m128i*)pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (_mm_adds_epu8 (xmmSrcLo, xmmDstLo),
                                                      _mm_adds_epu8 (xmmSrcHi, xmmDstHi)));

    *pd++ = pack_1x64_32 (_mm_adds_pu8 (pixMultiply_1x64 (unpack_32_1x64 (s),
                                                          unpack_32_1x64 (m)),
                                        unpack_32_1x64 (d)));
/* -------------------------------------------------------------------------------------------------
 * fbComposeSetupSSE2
 */
static force_inline __m64
createMask_16_64 (uint16_t mask)
{
    return _mm_set1_pi16 (mask);
}

static force_inline __m128i
createMask_16_128 (uint16_t mask)
{
    return _mm_set1_epi16 (mask);
}

static force_inline __m64
createMask_2x32_64 (uint32_t mask0, uint32_t mask1)
{
    return _mm_set_pi32 (mask0, mask1);
}

static force_inline __m128i
createMask_2x32_128 (uint32_t mask0, uint32_t mask1)
{
    return _mm_set_epi32 (mask0, mask1, mask0, mask1);
}
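/*
 * Note the argument order: _mm_set_epi32 and _mm_set_pi32 list elements
 * from most to least significant, so mask0 ends up in the high 32 bits
 * of each 64-bit half and mask1 in the low 32 bits.
 */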
2317 /* SSE2 code patch for fbcompose.c */
2320 sse2CombineOverU (pixman_implementation_t *imp, pixman_op_t op,
2321 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
2323 coreCombineOverUsse2 (dst, src, mask, width);
sse2CombineOverReverseU (pixman_implementation_t *imp, pixman_op_t op,
                         uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
    coreCombineOverReverseUsse2 (dst, src, mask, width);

sse2CombineInU (pixman_implementation_t *imp, pixman_op_t op,
                uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
    coreCombineInUsse2 (dst, src, mask, width);

sse2CombineInReverseU (pixman_implementation_t *imp, pixman_op_t op,
                       uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
    coreCombineReverseInUsse2 (dst, src, mask, width);

sse2CombineOutU (pixman_implementation_t *imp, pixman_op_t op,
                 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
    coreCombineOutUsse2 (dst, src, mask, width);

sse2CombineOutReverseU (pixman_implementation_t *imp, pixman_op_t op,
                        uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
    coreCombineReverseOutUsse2 (dst, src, mask, width);

sse2CombineAtopU (pixman_implementation_t *imp, pixman_op_t op,
                  uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
    coreCombineAtopUsse2 (dst, src, mask, width);

sse2CombineAtopReverseU (pixman_implementation_t *imp, pixman_op_t op,
                         uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
    coreCombineReverseAtopUsse2 (dst, src, mask, width);

sse2CombineXorU (pixman_implementation_t *imp, pixman_op_t op,
                 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
    coreCombineXorUsse2 (dst, src, mask, width);

sse2CombineAddU (pixman_implementation_t *imp, pixman_op_t op,
                 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
    coreCombineAddUsse2 (dst, src, mask, width);

sse2CombineSaturateU (pixman_implementation_t *imp, pixman_op_t op,
                      uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
    coreCombineSaturateUsse2 (dst, src, mask, width);

sse2CombineSrcC (pixman_implementation_t *imp, pixman_op_t op,
                 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
    coreCombineSrcCsse2 (dst, src, mask, width);

sse2CombineOverC (pixman_implementation_t *imp, pixman_op_t op,
                  uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
    coreCombineOverCsse2 (dst, src, mask, width);

sse2CombineOverReverseC (pixman_implementation_t *imp, pixman_op_t op,
                         uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
    coreCombineOverReverseCsse2 (dst, src, mask, width);

sse2CombineInC (pixman_implementation_t *imp, pixman_op_t op,
                uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
    coreCombineInCsse2 (dst, src, mask, width);

sse2CombineInReverseC (pixman_implementation_t *imp, pixman_op_t op,
                       uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
    coreCombineInReverseCsse2 (dst, src, mask, width);

sse2CombineOutC (pixman_implementation_t *imp, pixman_op_t op,
                 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
    coreCombineOutCsse2 (dst, src, mask, width);

sse2CombineOutReverseC (pixman_implementation_t *imp, pixman_op_t op,
                        uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
    coreCombineOutReverseCsse2 (dst, src, mask, width);

sse2CombineAtopC (pixman_implementation_t *imp, pixman_op_t op,
                  uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
    coreCombineAtopCsse2 (dst, src, mask, width);

sse2CombineAtopReverseC (pixman_implementation_t *imp, pixman_op_t op,
                         uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
    coreCombineReverseAtopCsse2 (dst, src, mask, width);

sse2CombineXorC (pixman_implementation_t *imp, pixman_op_t op,
                 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
    coreCombineXorCsse2 (dst, src, mask, width);

sse2CombineAddC (pixman_implementation_t *imp, pixman_op_t op,
                 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
    coreCombineAddCsse2 (dst, src, mask, width);
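/* The wrappers above adapt the SSE2 core combiners to pixman's combine_32
 * function-pointer signature; imp and op go unused because each slot is
 * bound to exactly one operator in _pixman_implementation_create_sse2
 * below. */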
/* -------------------------------------------------------------------------------------------------
 * sse2_CompositeOver_n_8888
 */
sse2_CompositeOver_n_8888 (pixman_implementation_t *imp,
                           pixman_image_t * src_image,
                           pixman_image_t * mask_image,
                           pixman_image_t * dst_image,
    uint32_t *dstLine, *dst, d;

    __m128i xmmSrc, xmmAlpha;
    __m128i xmmDst, xmmDstLo, xmmDstHi;

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dstStride, dstLine, 1);

    xmmSrc = expandPixel_32_1x128 (src);
    xmmAlpha = expandAlpha_1x128 (xmmSrc);
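    /* A solid source lets src and its alpha be hoisted out of the loops.
     * Per channel, over_1x64/over_2x128 below compute the (premultiplied)
     * Porter-Duff OVER:
     *
     *     dst = src + mul_un8 (dst, 255 - src_alpha)
     *
     * where mul_un8 is the usual exact byte multiply. A scalar sketch (not
     * code from this file -- the SIMD helpers do the same math with the
     * Mask0080/Mask0101 constants):
     *
     *     static inline uint8_t mul_un8 (uint8_t a, uint8_t b)
     *     {
     *         uint16_t t = a * b + 0x80;
     *         return (t + (t >> 8)) >> 8;   // exact a*b/255, rounded
     *     }
     */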
        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)dst);

        dstLine += dstStride;

        while (w && (unsigned long)dst & 15)
            *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
                                              _mm_movepi64_pi64 (xmmAlpha),
                                              unpack_32_1x64 (d)));

        cachePrefetch ((__m128i*)dst);

            /* prefetch the next cache line */
            cachePrefetchNext ((__m128i*)dst);

            xmmDst = load128Aligned ((__m128i*)dst);

            unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

            over_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmDstLo, &xmmDstHi);

            /* rebuild the 4 pixels and save */
            save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

            *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
                                              _mm_movepi64_pi64 (xmmAlpha),
                                              unpack_32_1x64 (d)));
/* -------------------------------------------------------------------------------------------------
 * sse2_CompositeOver_n_0565
 */
sse2_CompositeOver_n_0565 (pixman_implementation_t *imp,
                           pixman_image_t * src_image,
                           pixman_image_t * mask_image,
                           pixman_image_t * dst_image,
    uint16_t *dstLine, *dst, d;

    __m128i xmmSrc, xmmAlpha;
    __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dstStride, dstLine, 1);

    xmmSrc = expandPixel_32_1x128 (src);
    xmmAlpha = expandAlpha_1x128 (xmmSrc);
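    /* r5g6b5 pixels cannot be blended directly: one aligned 16-byte load
     * below covers eight 565 pixels, which unpack565_128_4x128 expands into
     * four registers of two 8888-unpacked pixels each; the blend then runs
     * at 8-bit precision and pack565_4x128_128 repacks the result. */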
        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)dst);

        dstLine += dstStride;

        while (w && (unsigned long)dst & 15)
            *dst++ = pack565_32_16 (pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
                                                             _mm_movepi64_pi64 (xmmAlpha),
                                                             expand565_16_1x64 (d))));

        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)dst);

            /* prefetch the next cache line */
            cachePrefetchNext ((__m128i*)dst);

            xmmDst = load128Aligned ((__m128i*)dst);

            unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);

            over_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmDst0, &xmmDst1);
            over_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmDst2, &xmmDst3);

            xmmDst = pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
            save128Aligned ((__m128i*)dst, xmmDst);

            *dst++ = pack565_32_16 (pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
                                                             _mm_movepi64_pi64 (xmmAlpha),
                                                             expand565_16_1x64 (d))));
/* -------------------------------------------------------------------------------------------------
 * sse2_CompositeOver_n_8888_8888_ca
 */
sse2_CompositeOver_n_8888_8888_ca (pixman_implementation_t *imp,
                                   pixman_image_t * src_image,
                                   pixman_image_t * mask_image,
                                   pixman_image_t * dst_image,
    uint32_t *dstLine, d;
    uint32_t *maskLine, m;

    int dstStride, maskStride;

    __m128i xmmSrc, xmmAlpha;
    __m128i xmmDst, xmmDstLo, xmmDstHi;
    __m128i xmmMask, xmmMaskLo, xmmMaskHi;

    __m64 mmxSrc, mmxAlpha, mmxMask, mmxDest;

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dstStride, dstLine, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, maskStride, maskLine, 1);

    xmmSrc = _mm_unpacklo_epi8 (createMask_2x32_128 (src, src), _mm_setzero_si128 ());
    xmmAlpha = expandAlpha_1x128 (xmmSrc);
    mmxSrc = _mm_movepi64_pi64 (xmmSrc);
    mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);
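    /* Component alpha: the a8r8g8b8 mask carries a separate alpha per
     * channel, so inOver_1x64/inOver_2x128 compute, channel by channel,
     *
     *     dst = src * mask + dst * (1 - src_alpha * mask)
     *
     * with values scaled to [0, 255] and the usual rounded divide by 255. */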
        const uint32_t *pm = (uint32_t *)maskLine;
        uint32_t *pd = (uint32_t *)dstLine;

        dstLine += dstStride;
        maskLine += maskStride;

        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)pd);
        cachePrefetch ((__m128i*)pm);

        while (w && (unsigned long)pd & 15)
            mmxMask = unpack_32_1x64 (m);
            mmxDest = unpack_32_1x64 (d);

            *pd = pack_1x64_32 (inOver_1x64 (&mmxSrc,

        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)pd);
        cachePrefetch ((__m128i*)pm);

            /* prefetch the next cache line */
            cachePrefetchNext ((__m128i*)pd);
            cachePrefetchNext ((__m128i*)pm);

            xmmMask = load128Unaligned ((__m128i*)pm);

            packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128()));

            /* if the mask is zero for all four pixels, packCmp equals 0xffff
             * and the write can be skipped */
            if (packCmp != 0xffff)
                xmmDst = load128Aligned ((__m128i*)pd);

                unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
                unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

                inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);

                save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

            mmxMask = unpack_32_1x64 (m);
            mmxDest = unpack_32_1x64 (d);

            *pd = pack_1x64_32 (inOver_1x64 (&mmxSrc,
/* -------------------------------------------------------------------------------------------------
 * sse2_composite_over_8888_n_8888
 */
sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
                                 pixman_image_t * src_image,
                                 pixman_image_t * mask_image,
                                 pixman_image_t * dst_image,
    uint32_t *dstLine, *dst;
    uint32_t *srcLine, *src;

    int dstStride, srcStride;

    __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
    __m128i xmmDst, xmmDstLo, xmmDstHi;
    __m128i xmmAlphaLo, xmmAlphaHi;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dstStride, dstLine, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, srcStride, srcLine, 1);
    mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);

    xmmMask = createMask_16_128 (mask >> 24);

        dstLine += dstStride;
        srcLine += srcStride;

        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)dst);
        cachePrefetch ((__m128i*)src);

        while (w && (unsigned long)dst & 15)
            uint32_t s = *src++;

            __m64 ms = unpack_32_1x64 (s);
            __m64 alpha = expandAlpha_1x64 (ms);
            __m64 mask = _mm_movepi64_pi64 (xmmMask);
            __m64 dest = unpack_32_1x64 (d);

            *dst++ = pack_1x64_32 (inOver_1x64 (&ms,

        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)dst);
        cachePrefetch ((__m128i*)src);

            /* prefetch the next cache line */
            cachePrefetchNext ((__m128i*)dst);
            cachePrefetchNext ((__m128i*)src);

            xmmSrc = load128Unaligned ((__m128i*)src);
            xmmDst = load128Aligned ((__m128i*)dst);

            unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
            unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
            expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);

            inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMask, &xmmMask, &xmmDstLo, &xmmDstHi);

            save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

            uint32_t s = *src++;

            __m64 ms = unpack_32_1x64 (s);
            __m64 alpha = expandAlpha_1x64 (ms);
            __m64 mask = _mm_movepi64_pi64 (xmmMask);
            __m64 dest = unpack_32_1x64 (d);

            *dst++ = pack_1x64_32 (inOver_1x64 (&ms,
/* -------------------------------------------------------------------------------------------------
 * sse2_Composite_over_x888_n_8888
 */
sse2_Composite_over_x888_n_8888 (pixman_implementation_t *imp,
                                 pixman_image_t * src_image,
                                 pixman_image_t * mask_image,
                                 pixman_image_t * dst_image,
    uint32_t *dstLine, *dst;
    uint32_t *srcLine, *src;

    int dstStride, srcStride;

    __m128i xmmMask, xmmAlpha;
    __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
    __m128i xmmDst, xmmDstLo, xmmDstHi;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dstStride, dstLine, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, srcStride, srcLine, 1);
    mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);

    xmmMask = createMask_16_128 (mask >> 24);
    xmmAlpha = Mask00ff;

        dstLine += dstStride;
        srcLine += srcStride;

        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)dst);
        cachePrefetch ((__m128i*)src);

        while (w && (unsigned long)dst & 15)
            uint32_t s = (*src++) | 0xff000000;

            __m64 ms = unpack_32_1x64 (s);
            __m64 alpha = _mm_movepi64_pi64 (xmmAlpha);
            __m64 mask = _mm_movepi64_pi64 (xmmMask);
            __m64 dest = unpack_32_1x64 (d);

            *dst++ = pack_1x64_32 (inOver_1x64 (&ms,

        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)dst);
        cachePrefetch ((__m128i*)src);

            /* prefetch the next cache line */
            cachePrefetchNext ((__m128i*)dst);
            cachePrefetchNext ((__m128i*)src);

            xmmSrc = _mm_or_si128 (load128Unaligned ((__m128i*)src), Maskff000000);
            xmmDst = load128Aligned ((__m128i*)dst);

            unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
            unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

            inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlpha, &xmmAlpha, &xmmMask, &xmmMask, &xmmDstLo, &xmmDstHi);

            save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

            uint32_t s = (*src++) | 0xff000000;

            __m64 ms = unpack_32_1x64 (s);
            __m64 alpha = _mm_movepi64_pi64 (xmmAlpha);
            __m64 mask = _mm_movepi64_pi64 (xmmMask);
            __m64 dest = unpack_32_1x64 (d);

            *dst++ = pack_1x64_32 (inOver_1x64 (&ms,
/* -------------------------------------------------------------------------------------------------
 * sse2_composite_over_8888_8888
 */
sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
                               pixman_image_t * src_image,
                               pixman_image_t * mask_image,
                               pixman_image_t * dst_image,
    int dstStride, srcStride;
    uint32_t *dstLine, *dst;
    uint32_t *srcLine, *src;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dstStride, dstLine, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, srcStride, srcLine, 1);

        coreCombineOverUsse2 (dst, src, NULL, width);
/* -------------------------------------------------------------------------------------------------
 * sse2_composite_over_8888_0565
 */
static force_inline uint16_t
fast_composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
    ms = unpack_32_1x64 (src);
    return pack565_32_16 (pack_1x64_32 (over_1x64 (ms,
                                                   expandAlpha_1x64 (ms),
                                                   expand565_16_1x64 (dst))));

sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
                               pixman_image_t * src_image,
                               pixman_image_t * mask_image,
                               pixman_image_t * dst_image,
    uint16_t *dstLine, *dst, d;
    uint32_t *srcLine, *src, s;
    int dstStride, srcStride;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
    __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dstStride, dstLine, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, srcStride, srcLine, 1);

    /* This code was copied from the MMX version, FIXME included.
     * If it is a problem there, it is probably a problem here too.
     */
    assert (src_image->drawable == mask_image->drawable);

        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)src);
        cachePrefetch ((__m128i*)dst);

        dstLine += dstStride;
        srcLine += srcStride;

        /* Align dst on a 16-byte boundary */
               ((unsigned long)dst & 15))
            *dst++ = fast_composite_over_8888_0565pixel (s, d);

        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)src);
        cachePrefetch ((__m128i*)dst);

        /* This is an 8-pixel loop */
            /* prefetch the next cache line */
            cachePrefetchNext ((__m128i*)src);
            cachePrefetchNext ((__m128i*)dst);

            /* Load unaligned: the source address is not guaranteed to be 16-byte aligned. */
            xmmSrc = load128Unaligned ((__m128i*) src);
            xmmDst = load128Aligned ((__m128i*) dst);

            unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
            unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
            expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);

            /* Load the next 4 pixels early to overlap the memory read. */
            xmmSrc = load128Unaligned ((__m128i*) (src+4));

            over_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDst0, &xmmDst1);

            unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
            expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);

            over_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDst2, &xmmDst3);

            save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));

            *dst++ = fast_composite_over_8888_0565pixel (s, d);
/* -------------------------------------------------------------------------------------------------
 * sse2_CompositeOver_n_8_8888
 */
sse2_CompositeOver_n_8_8888 (pixman_implementation_t *imp,
                             pixman_image_t * src_image,
                             pixman_image_t * mask_image,
                             pixman_image_t * dst_image,
    uint32_t *dstLine, *dst;
    uint8_t *maskLine, *mask;
    int dstStride, maskStride;

    __m128i xmmSrc, xmmAlpha, xmmDef;
    __m128i xmmDst, xmmDstLo, xmmDstHi;
    __m128i xmmMask, xmmMaskLo, xmmMaskHi;

    __m64 mmxSrc, mmxAlpha, mmxMask, mmxDest;

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dstStride, dstLine, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, maskStride, maskLine, 1);

    xmmDef = createMask_2x32_128 (src, src);
    xmmSrc = expandPixel_32_1x128 (src);
    xmmAlpha = expandAlpha_1x128 (xmmSrc);
    mmxSrc = _mm_movepi64_pi64 (xmmSrc);
    mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);

        dstLine += dstStride;
        maskLine += maskStride;

        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)mask);
        cachePrefetch ((__m128i*)dst);

        while (w && (unsigned long)dst & 15)
            uint8_t m = *mask++;

            mmxMask = expandPixel_8_1x64 (m);
            mmxDest = unpack_32_1x64 (d);

            *dst = pack_1x64_32 (inOver_1x64 (&mmxSrc,

        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)mask);
        cachePrefetch ((__m128i*)dst);

            /* prefetch the next cache line */
            cachePrefetchNext ((__m128i*)mask);
            cachePrefetchNext ((__m128i*)dst);

            m = *((uint32_t*)mask);
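            /* An opaque solid source under a fully-set 4-pixel mask writes
             * the expanded source directly; otherwise do the full blend. */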
            if (srca == 0xff && m == 0xffffffff)
                save128Aligned ((__m128i*)dst, xmmDef);

                xmmDst = load128Aligned ((__m128i*) dst);
                xmmMask = unpack_32_1x128 (m);
                xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());

                unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
                unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);

                expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

                inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);

                save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

            uint8_t m = *mask++;

            mmxMask = expandPixel_8_1x64 (m);
            mmxDest = unpack_32_1x64 (d);

            *dst = pack_1x64_32 (inOver_1x64 (&mmxSrc,
/* -------------------------------------------------------------------------------------------------
 * pixmanFillsse2
 */
pixmanFillsse2 (uint32_t *bits,
    uint32_t byte_width;

    if (bpp == 16 && (data >> 16 != (data & 0xffff)))
    if (bpp != 16 && bpp != 32)

        stride = stride * (int) sizeof (uint32_t) / 2;
        byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
        byte_width = 2 * width;

        stride = stride * (int) sizeof (uint32_t) / 4;
        byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
        byte_width = 4 * width;

    cachePrefetch ((__m128i*)byte_line);
    xmmDef = createMask_2x32_128 (data, data);
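    /* Each scanline is filled in three stages: scalar 16- and 32-bit stores
     * until d reaches a 16-byte boundary, then aligned bursts of 128, 64 and
     * 32 bytes (with prefetch), then scalar stores for the tail. Note that
     * the rowstride arrives in uint32_t units, hence the rescaling above. */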
        uint8_t *d = byte_line;
        byte_line += stride;

        cachePrefetchNext ((__m128i*)d);

        while (w >= 2 && ((unsigned long)d & 3))
            *(uint16_t *)d = data;

        while (w >= 4 && ((unsigned long)d & 15))
            *(uint32_t *)d = data;

            cachePrefetchNext ((__m128i*)d);

            cachePrefetch (((__m128i*)d) + 12);

            save128Aligned ((__m128i*)(d), xmmDef);
            save128Aligned ((__m128i*)(d+16), xmmDef);
            save128Aligned ((__m128i*)(d+32), xmmDef);
            save128Aligned ((__m128i*)(d+48), xmmDef);
            save128Aligned ((__m128i*)(d+64), xmmDef);
            save128Aligned ((__m128i*)(d+80), xmmDef);
            save128Aligned ((__m128i*)(d+96), xmmDef);
            save128Aligned ((__m128i*)(d+112), xmmDef);

            cachePrefetch (((__m128i*)d) + 8);

            save128Aligned ((__m128i*)(d), xmmDef);
            save128Aligned ((__m128i*)(d+16), xmmDef);
            save128Aligned ((__m128i*)(d+32), xmmDef);
            save128Aligned ((__m128i*)(d+48), xmmDef);

            cachePrefetchNext ((__m128i*)d);

            save128Aligned ((__m128i*)(d), xmmDef);
            save128Aligned ((__m128i*)(d+16), xmmDef);

            save128Aligned ((__m128i*)(d), xmmDef);

            cachePrefetchNext ((__m128i*)d);

            *(uint32_t *)d = data;

            *(uint16_t *)d = data;
sse2_CompositeSrc_n_8_8888 (pixman_implementation_t *imp,
                            pixman_image_t * src_image,
                            pixman_image_t * mask_image,
                            pixman_image_t * dst_image,
    uint32_t *dstLine, *dst;
    uint8_t *maskLine, *mask;
    int dstStride, maskStride;

    __m128i xmmSrc, xmmDef;
    __m128i xmmMask, xmmMaskLo, xmmMaskHi;

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

        pixmanFillsse2 (dst_image->bits.bits, dst_image->bits.rowstride,
                        PIXMAN_FORMAT_BPP (dst_image->bits.format),
                        dest_x, dest_y, width, height, 0);

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dstStride, dstLine, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, maskStride, maskLine, 1);

    xmmDef = createMask_2x32_128 (src, src);
    xmmSrc = expandPixel_32_1x128 (src);

        dstLine += dstStride;
        maskLine += maskStride;

        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)mask);
        cachePrefetch ((__m128i*)dst);

        while (w && (unsigned long)dst & 15)
            uint8_t m = *mask++;

            *dst = pack_1x64_32 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmSrc), expandPixel_8_1x64 (m)));

        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)mask);
        cachePrefetch ((__m128i*)dst);

            /* prefetch the next cache line */
            cachePrefetchNext ((__m128i*)mask);
            cachePrefetchNext ((__m128i*)dst);

            m = *((uint32_t*)mask);

            if (srca == 0xff && m == 0xffffffff)
                save128Aligned ((__m128i*)dst, xmmDef);

                xmmMask = unpack_32_1x128 (m);
                xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());

                unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);

                expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

                pixMultiply_2x128 (&xmmSrc, &xmmSrc, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

                save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmMaskLo, xmmMaskHi));

                save128Aligned ((__m128i*)dst, _mm_setzero_si128());

            uint8_t m = *mask++;

            *dst = pack_1x64_32 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmSrc), expandPixel_8_1x64 (m)));
/* -------------------------------------------------------------------------------------------------
 * sse2_CompositeOver_n_8_0565
 */
sse2_CompositeOver_n_8_0565 (pixman_implementation_t *imp,
                             pixman_image_t * src_image,
                             pixman_image_t * mask_image,
                             pixman_image_t * dst_image,
    uint16_t *dstLine, *dst, d;
    uint8_t *maskLine, *mask;
    int dstStride, maskStride;

    __m64 mmxSrc, mmxAlpha, mmxMask, mmxDest;

    __m128i xmmSrc, xmmAlpha;
    __m128i xmmMask, xmmMaskLo, xmmMaskHi;
    __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dstStride, dstLine, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, maskStride, maskLine, 1);

    xmmSrc = expandPixel_32_1x128 (src);
    xmmAlpha = expandAlpha_1x128 (xmmSrc);
    mmxSrc = _mm_movepi64_pi64 (xmmSrc);
    mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);

        dstLine += dstStride;
        maskLine += maskStride;

        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)mask);
        cachePrefetch ((__m128i*)dst);

        while (w && (unsigned long)dst & 15)
            mmxMask = expandAlphaRev_1x64 (unpack_32_1x64 (m));
            mmxDest = expand565_16_1x64 (d);

            *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,

        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)mask);
        cachePrefetch ((__m128i*)dst);

            /* prefetch the next cache line */
            cachePrefetchNext ((__m128i*)mask);
            cachePrefetchNext ((__m128i*)dst);

            xmmDst = load128Aligned ((__m128i*) dst);
            unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);

            m = *((uint32_t*)mask);

                xmmMask = unpack_32_1x128 (m);
                xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());

                unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);

                expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
                inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst0, &xmmDst1);

            m = *((uint32_t*)mask);

                xmmMask = unpack_32_1x128 (m);
                xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128());

                unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);

                expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
                inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst2, &xmmDst3);

            save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));

            mmxMask = expandAlphaRev_1x64 (unpack_32_1x64 (m));
            mmxDest = expand565_16_1x64 (d);

            *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,
/* -------------------------------------------------------------------------------------------------
 * sse2_Composite_over_pixbuf_0565
 */
sse2_Composite_over_pixbuf_0565 (pixman_implementation_t *imp,
                                 pixman_image_t * src_image,
                                 pixman_image_t * mask_image,
                                 pixman_image_t * dst_image,
    uint16_t *dstLine, *dst, d;
    uint32_t *srcLine, *src, s;
    int dstStride, srcStride;

    uint32_t opaque, zero;

    __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
    __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dstStride, dstLine, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, srcStride, srcLine, 1);

    /* This code was copied from the MMX version, FIXME included.
     * If it is a problem there, it is probably a problem here too.
     */
    assert (src_image->drawable == mask_image->drawable);
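    /* The pixbuf paths composite a non-premultiplied source whose alpha
     * arrives as the mask (same drawable, per the assert above); judging by
     * the NEED_PIXBUF fast-path entries, overRevNonPre premultiplies and
     * swaps R/B before the OVER, while isOpaque/isZero below steer each
     * 4-pixel group to a cheaper path when possible. */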
        dstLine += dstStride;
        srcLine += srcStride;

        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)src);
        cachePrefetch ((__m128i*)dst);

        while (w && (unsigned long)dst & 15)
            ms = unpack_32_1x64 (s);

            *dst++ = pack565_32_16 (pack_1x64_32 (overRevNonPre_1x64 (ms, expand565_16_1x64 (d))));

        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)src);
        cachePrefetch ((__m128i*)dst);

            /* prefetch the next cache line */
            cachePrefetchNext ((__m128i*)src);
            cachePrefetchNext ((__m128i*)dst);

            xmmSrc = load128Unaligned ((__m128i*)src);
            xmmDst = load128Aligned ((__m128i*)dst);

            opaque = isOpaque (xmmSrc);
            zero = isZero (xmmSrc);

            unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
            unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);

            /* preload the next round */
            xmmSrc = load128Unaligned ((__m128i*)(src+4));

                invertColors_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst0, &xmmDst1);

                overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst0, &xmmDst1);

            opaque = isOpaque (xmmSrc);
            zero = isZero (xmmSrc);

            unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);

                invertColors_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst2, &xmmDst3);

                overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst2, &xmmDst3);

            save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));

            ms = unpack_32_1x64 (s);

            *dst++ = pack565_32_16 (pack_1x64_32 (overRevNonPre_1x64 (ms, expand565_16_1x64 (d))));
/* -------------------------------------------------------------------------------------------------
 * sse2_Composite_over_pixbuf_8888
 */
sse2_Composite_over_pixbuf_8888 (pixman_implementation_t *imp,
                                 pixman_image_t * src_image,
                                 pixman_image_t * mask_image,
                                 pixman_image_t * dst_image,
    uint32_t *dstLine, *dst, d;
    uint32_t *srcLine, *src, s;
    int dstStride, srcStride;

    uint32_t opaque, zero;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dstStride, dstLine, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, srcStride, srcLine, 1);

    /* This code was copied from the MMX version, FIXME included.
     * If it is a problem there, it is probably a problem here too.
     */
    assert (src_image->drawable == mask_image->drawable);

        dstLine += dstStride;
        srcLine += srcStride;

        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)src);
        cachePrefetch ((__m128i*)dst);

        while (w && (unsigned long)dst & 15)
            *dst++ = pack_1x64_32 (overRevNonPre_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));

        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)src);
        cachePrefetch ((__m128i*)dst);

            /* prefetch the next cache line */
            cachePrefetchNext ((__m128i*)src);
            cachePrefetchNext ((__m128i*)dst);

            xmmSrcHi = load128Unaligned ((__m128i*)src);

            opaque = isOpaque (xmmSrcHi);
            zero = isZero (xmmSrcHi);

            unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);

                invertColors_2x128 (xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi);

                save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

                xmmDstHi = load128Aligned ((__m128i*)dst);

                unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

                overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi);

                save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

            *dst++ = pack_1x64_32 (overRevNonPre_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
/* -------------------------------------------------------------------------------------------------
 * sse2_CompositeOver_n_8888_0565_ca
 */
sse2_CompositeOver_n_8888_0565_ca (pixman_implementation_t *imp,
                                   pixman_image_t * src_image,
                                   pixman_image_t * mask_image,
                                   pixman_image_t * dst_image,
    uint16_t *dstLine, *dst, d;
    uint32_t *maskLine, *mask, m;
    int dstStride, maskStride;

    __m128i xmmSrc, xmmAlpha;
    __m128i xmmMask, xmmMaskLo, xmmMaskHi;
    __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;

    __m64 mmxSrc, mmxAlpha, mmxMask, mmxDest;

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dstStride, dstLine, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, maskStride, maskLine, 1);

    xmmSrc = expandPixel_32_1x128 (src);
    xmmAlpha = expandAlpha_1x128 (xmmSrc);
    mmxSrc = _mm_movepi64_pi64 (xmmSrc);
    mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);

        maskLine += maskStride;
        dstLine += dstStride;

        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)mask);
        cachePrefetch ((__m128i*)dst);

        while (w && ((unsigned long)dst & 15))
            m = *(uint32_t *) mask;

            mmxMask = unpack_32_1x64 (m);
            mmxDest = expand565_16_1x64 (d);

            *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,

        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)mask);
        cachePrefetch ((__m128i*)dst);

            /* prefetch the next cache line */
            cachePrefetchNext ((__m128i*)mask);
            cachePrefetchNext ((__m128i*)dst);

            xmmMask = load128Unaligned ((__m128i*)mask);
            xmmDst = load128Aligned ((__m128i*)dst);

            packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128()));

            unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
            unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);

            /* preload the next round */
            xmmMask = load128Unaligned ((__m128i*)(mask+4));

            if (packCmp != 0xffff)
                inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst0, &xmmDst1);

            packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128()));

            unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);

            if (packCmp != 0xffff)
                inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst2, &xmmDst3);

            save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));

            m = *(uint32_t *) mask;

            mmxMask = unpack_32_1x64 (m);
            mmxDest = expand565_16_1x64 (d);

            *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,
/* -------------------------------------------------------------------------------------------------
 * sse2_CompositeIn_n_8_8
 */
sse2_CompositeIn_n_8_8 (pixman_implementation_t *imp,
                        pixman_image_t * src_image,
                        pixman_image_t * mask_image,
                        pixman_image_t * dst_image,
    uint8_t *dstLine, *dst;
    uint8_t *maskLine, *mask;
    int dstStride, maskStride;

    __m128i xmmMask, xmmMaskLo, xmmMaskHi;
    __m128i xmmDst, xmmDstLo, xmmDstHi;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dstStride, dstLine, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, maskStride, maskLine, 1);

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    xmmAlpha = expandAlpha_1x128 (expandPixel_32_1x128 (src));

        dstLine += dstStride;
        maskLine += maskStride;

        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)mask);
        cachePrefetch ((__m128i*)dst);

        while (w && ((unsigned long)dst & 15))
            m = (uint32_t) *mask++;
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
                                                               unpack_32_1x64 (d)));

        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)mask);
        cachePrefetch ((__m128i*)dst);

            /* prefetch the next cache line */
            cachePrefetchNext ((__m128i*)mask);
            cachePrefetchNext ((__m128i*)dst);

            xmmMask = load128Unaligned ((__m128i*)mask);
            xmmDst = load128Aligned ((__m128i*)dst);

            unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
            unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

            pixMultiply_2x128 (&xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
            pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);

            save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

            m = (uint32_t) *mask++;
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
                                                               unpack_32_1x64 (d)));
/* -------------------------------------------------------------------------------------------------
 * sse2_CompositeIn_8_8
 */
sse2_CompositeIn_8_8 (pixman_implementation_t *imp,
                      pixman_image_t * src_image,
                      pixman_image_t * mask_image,
                      pixman_image_t * dst_image,
    uint8_t *dstLine, *dst;
    uint8_t *srcLine, *src;
    int srcStride, dstStride;

    __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
    __m128i xmmDst, xmmDstLo, xmmDstHi;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dstStride, dstLine, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, srcStride, srcLine, 1);

        dstLine += dstStride;
        srcLine += srcStride;

        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)src);
        cachePrefetch ((__m128i*)dst);

        while (w && ((unsigned long)dst & 15))
            s = (uint32_t) *src++;
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));

        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)src);
        cachePrefetch ((__m128i*)dst);

            /* prefetch the next cache line */
            cachePrefetchNext ((__m128i*)src);
            cachePrefetchNext ((__m128i*)dst);

            xmmSrc = load128Unaligned ((__m128i*)src);
            xmmDst = load128Aligned ((__m128i*)dst);

            unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
            unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

            pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);

            save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

            s = (uint32_t) *src++;
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));
/* -------------------------------------------------------------------------------------------------
 * sse2_CompositeAdd_8888_8_8
 */
sse2_CompositeAdd_8888_8_8 (pixman_implementation_t *imp,
                            pixman_image_t * src_image,
                            pixman_image_t * mask_image,
                            pixman_image_t * dst_image,
    uint8_t *dstLine, *dst;
    uint8_t *maskLine, *mask;
    int dstStride, maskStride;

    __m128i xmmMask, xmmMaskLo, xmmMaskHi;
    __m128i xmmDst, xmmDstLo, xmmDstHi;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dstStride, dstLine, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, maskStride, maskLine, 1);

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    xmmAlpha = expandAlpha_1x128 (expandPixel_32_1x128 (src));

        dstLine += dstStride;
        maskLine += maskStride;

        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)mask);
        cachePrefetch ((__m128i*)dst);

        while (w && ((unsigned long)dst & 15))
            m = (uint32_t) *mask++;
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x64_32 (_mm_adds_pu16 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
                                                            unpack_32_1x64 (d)));

        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)mask);
        cachePrefetch ((__m128i*)dst);

            /* prefetch the next cache line */
            cachePrefetchNext ((__m128i*)mask);
            cachePrefetchNext ((__m128i*)dst);

            xmmMask = load128Unaligned ((__m128i*)mask);
            xmmDst = load128Aligned ((__m128i*)dst);

            unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
            unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

            pixMultiply_2x128 (&xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

            xmmDstLo = _mm_adds_epu16 (xmmMaskLo, xmmDstLo);
            xmmDstHi = _mm_adds_epu16 (xmmMaskHi, xmmDstHi);

            save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

            m = (uint32_t) *mask++;
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x64_32 (_mm_adds_pu16 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
                                                            unpack_32_1x64 (d)));
/* -------------------------------------------------------------------------------------------------
 * sse2_CompositeAdd_8000_8000
 */
sse2_CompositeAdd_8000_8000 (pixman_implementation_t *imp,
                             pixman_image_t * src_image,
                             pixman_image_t * mask_image,
                             pixman_image_t * dst_image,
    uint8_t *dstLine, *dst;
    uint8_t *srcLine, *src;
    int dstStride, srcStride;

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, srcStride, srcLine, 1);
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dstStride, dstLine, 1);

        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)src);
        cachePrefetch ((__m128i*)dst);

        dstLine += dstStride;
        srcLine += srcStride;
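        /* Scalar head and tail below use a branch-free saturating add: if t
         * overflows 8 bits, (0 - (t >> 8)) becomes 0xff and the OR clamps. */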
        while (w && (unsigned long)dst & 3)
            t = (*dst) + (*src++);
            *dst++ = t | (0 - (t >> 8));
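        /* The byte-wise saturating add is layout-agnostic, so the bulk of the
         * row is handed to the 8888 add combiner, four a8 pixels per uint32_t: */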
        coreCombineAddUsse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);

            t = (*dst) + (*src++);
            *dst++ = t | (0 - (t >> 8));
/* -------------------------------------------------------------------------------------------------
 * sse2_CompositeAdd_8888_8888
 */
sse2_CompositeAdd_8888_8888 (pixman_implementation_t *imp,
                             pixman_image_t * src_image,
                             pixman_image_t * mask_image,
                             pixman_image_t * dst_image,
    uint32_t *dstLine, *dst;
    uint32_t *srcLine, *src;
    int dstStride, srcStride;

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, srcStride, srcLine, 1);
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dstStride, dstLine, 1);

        dstLine += dstStride;
        srcLine += srcStride;

        coreCombineAddUsse2 (dst, src, NULL, width);
/* -------------------------------------------------------------------------------------------------
 * sse2_CompositeCopyArea
 */
static pixman_bool_t
pixmanBltsse2 (uint32_t *src_bits,
               int src_x, int src_y,
               int dst_x, int dst_y,
               int width, int height)
    uint8_t * src_bytes;
    uint8_t * dst_bytes;

    if (src_bpp != dst_bpp)

        src_stride = src_stride * (int) sizeof (uint32_t) / 2;
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
        src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
        dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
        byte_width = 2 * width;

    else if (src_bpp == 32)
        src_stride = src_stride * (int) sizeof (uint32_t) / 4;
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
        src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
        dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
        byte_width = 4 * width;

    cachePrefetch ((__m128i*)src_bytes);
    cachePrefetch ((__m128i*)dst_bytes);

        uint8_t *s = src_bytes;
        uint8_t *d = dst_bytes;
        src_bytes += src_stride;
        dst_bytes += dst_stride;

        cachePrefetchNext ((__m128i*)s);
        cachePrefetchNext ((__m128i*)d);

        while (w >= 2 && ((unsigned long)d & 3))
            *(uint16_t *)d = *(uint16_t *)s;

        while (w >= 4 && ((unsigned long)d & 15))
            *(uint32_t *)d = *(uint32_t *)s;

            cachePrefetchNext ((__m128i*)s);
            cachePrefetchNext ((__m128i*)d);

            __m128i xmm0, xmm1, xmm2, xmm3;

            /* 128 bytes ahead */
            cachePrefetch (((__m128i*)s) + 8);
            cachePrefetch (((__m128i*)d) + 8);

            xmm0 = load128Unaligned ((__m128i*)(s));
            xmm1 = load128Unaligned ((__m128i*)(s+16));
            xmm2 = load128Unaligned ((__m128i*)(s+32));
            xmm3 = load128Unaligned ((__m128i*)(s+48));

            save128Aligned ((__m128i*)(d), xmm0);
            save128Aligned ((__m128i*)(d+16), xmm1);
            save128Aligned ((__m128i*)(d+32), xmm2);
            save128Aligned ((__m128i*)(d+48), xmm3);

            cachePrefetchNext ((__m128i*)s);
            cachePrefetchNext ((__m128i*)d);

            save128Aligned ((__m128i*)d, load128Unaligned ((__m128i*)s));

            cachePrefetchNext ((__m128i*)s);
            cachePrefetchNext ((__m128i*)d);

            *(uint32_t *)d = *(uint32_t *)s;

            *(uint16_t *)d = *(uint16_t *)s;

sse2_CompositeCopyArea (pixman_implementation_t *imp,
                        pixman_image_t * src_image,
                        pixman_image_t * mask_image,
                        pixman_image_t * dst_image,
    pixmanBltsse2 (src_image->bits.bits,
                   dst_image->bits.bits,
                   src_image->bits.rowstride,
                   dst_image->bits.rowstride,
                   PIXMAN_FORMAT_BPP (src_image->bits.format),
                   PIXMAN_FORMAT_BPP (dst_image->bits.format),
                   src_x, src_y, dest_x, dest_y, width, height);
/* This code was buggy in the MMX version, and the bug has been carried over to this SSE2 version */
sse2_CompositeOver_x888_8_8888 (pixman_implementation_t *imp,
                                pixman_image_t * src_image,
                                pixman_image_t * mask_image,
                                pixman_image_t * dst_image,
    uint32_t *src, *srcLine, s;
    uint32_t *dst, *dstLine, d;
    uint8_t *mask, *maskLine;

    int srcStride, maskStride, dstStride;

    __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
    __m128i xmmDst, xmmDstLo, xmmDstHi;
    __m128i xmmMask, xmmMaskLo, xmmMaskHi;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dstStride, dstLine, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, maskStride, maskLine, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, srcStride, srcLine, 1);

        srcLine += srcStride;
        dstLine += dstStride;
        maskLine += maskStride;

        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)src);
        cachePrefetch ((__m128i*)dst);
        cachePrefetch ((__m128i*)mask);

        while (w && (unsigned long)dst & 15)
            s = 0xff000000 | *src++;
            m = (uint32_t) *mask++;

            __m64 ms = unpack_32_1x64 (s);

            ms = inOver_1x64 (ms,
                              expandAlphaRev_1x64 (unpack_32_1x64 (m)),
                              unpack_32_1x64 (d));

            *dst++ = pack_1x64_32 (ms);

        /* issue a prefetch hint to optimize the cache load */
        cachePrefetch ((__m128i*)src);
        cachePrefetch ((__m128i*)dst);
        cachePrefetch ((__m128i*)mask);

            /* prefetch the next cache line */
            cachePrefetchNext ((__m128i*)src);
            cachePrefetchNext ((__m128i*)dst);
            cachePrefetchNext ((__m128i*)mask);

            m = *(uint32_t*) mask;
            xmmSrc = _mm_or_si128 (load128Unaligned ((__m128i*)src), Maskff000000);

            if (m == 0xffffffff)
                save128Aligned ((__m128i*)dst, xmmSrc);

                xmmDst = load128Aligned ((__m128i*)dst);

                xmmMask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());

                unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
                unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
                unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

                expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

                inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &Mask00ff, &Mask00ff, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);

                save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

            m = (uint32_t) *mask++;

                s = 0xff000000 | *src;

                *dst = pack_1x64_32 (inOver_1x64 (unpack_32_1x64 (s),
                                                  expandAlphaRev_1x64 (unpack_32_1x64 (m)),
                                                  unpack_32_1x64 (d)));
static const pixman_fast_path_t sse2_fast_paths[] =
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, sse2_CompositeOver_n_8_0565, 0 },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, sse2_CompositeOver_n_8_0565, 0 },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_CompositeOver_n_8888, 0 },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_CompositeOver_n_8888, 0 },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_r5g6b5, sse2_CompositeOver_n_0565, 0 },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_over_8888_8888, 0 },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_over_8888_8888, 0 },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_over_8888_8888, 0 },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_over_8888_8888, 0 },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_over_8888_0565, 0 },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, sse2_composite_over_8888_0565, 0 },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_CompositeOver_n_8_8888, 0 },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_CompositeOver_n_8_8888, 0 },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_CompositeOver_n_8_8888, 0 },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_CompositeOver_n_8_8888, 0 },

    /* FIXME: This code was buggy in the MMX version, and the bug has been carried over to this SSE2 version */
    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_CompositeOver_x888_8_8888, 0 },
    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_CompositeOver_x888_8_8888, 0 },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_CompositeOver_x888_8_8888, 0 },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_CompositeOver_x888_8_8888, 0 },

    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_Composite_over_x888_n_8888, NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_Composite_over_x888_n_8888, NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_Composite_over_x888_n_8888, NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_Composite_over_x888_n_8888, NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_CompositeOver_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_CompositeOver_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_CompositeOver_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_CompositeOver_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_CompositeOver_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_CompositeOver_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_Composite_over_pixbuf_8888, NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_a8r8g8b8, sse2_Composite_over_pixbuf_8888, NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_Composite_over_pixbuf_8888, NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_x8r8g8b8, sse2_Composite_over_pixbuf_8888, NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_a8b8g8r8, sse2_Composite_over_pixbuf_8888, NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_Composite_over_pixbuf_8888, NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_x8b8g8r8, sse2_Composite_over_pixbuf_8888, NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_Composite_over_pixbuf_8888, NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_Composite_over_pixbuf_0565, NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_r5g6b5, sse2_Composite_over_pixbuf_0565, NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_b5g6r5, sse2_Composite_over_pixbuf_0565, NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_Composite_over_pixbuf_0565, NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_CompositeCopyArea, 0 },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_CompositeCopyArea, 0 },

    { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_CompositeAdd_8000_8000, 0 },
    { PIXMAN_OP_ADD, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_CompositeAdd_8888_8888, 0 },
    { PIXMAN_OP_ADD, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_CompositeAdd_8888_8888, 0 },
    { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_CompositeAdd_8888_8_8, 0 },

    { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_CompositeSrc_n_8_8888, 0 },
    { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_CompositeSrc_n_8_8888, 0 },
    { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_CompositeSrc_n_8_8888, 0 },
    { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_CompositeSrc_n_8_8888, 0 },
    { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_CompositeCopyArea, 0 },
    { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_CompositeCopyArea, 0 },
    { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_CompositeCopyArea, 0 },
    { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_CompositeCopyArea, 0 },
    { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_CompositeCopyArea, 0 },
    { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_CompositeCopyArea, 0 },
    { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, sse2_CompositeCopyArea, 0 },
    { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, sse2_CompositeCopyArea, 0 },

    { PIXMAN_OP_IN, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_CompositeIn_8_8, 0 },
    { PIXMAN_OP_IN, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_CompositeIn_n_8_8, 0 },
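    /* _pixman_run_fast_path scans this table in order and takes the first
     * entry whose operator, formats and flags all match, so overlapping rows
     * (e.g. the x888_8_8888 entries vs. the NEED_SOLID_MASK x888_n_8888
     * entries above) rely on their relative order. */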
/*
 * Work around GCC bug causing crashes in Mozilla with SSE2
 *
 * When using -msse, gcc generates movdqa instructions assuming that
 * the stack is 16 byte aligned. Unfortunately some applications, such
 * as Mozilla and Mono, end up aligning the stack to 4 bytes, which
 * causes the movdqa instructions to fail.
 *
 * The __force_align_arg_pointer__ makes gcc generate a prologue that
 * realigns the stack pointer to 16 bytes.
 *
 * On x86-64 this is not necessary because the standard ABI already
 * calls for a 16 byte aligned stack.
 *
 * See https://bugs.freedesktop.org/show_bug.cgi?id=15693
 */
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
sse2_composite (pixman_implementation_t *imp,
                pixman_image_t *src,
                pixman_image_t *mask,
                pixman_image_t *dest,
    if (_pixman_run_fast_path (sse2_fast_paths, imp,
                               op, src, mask, dest,

    _pixman_implementation_composite (imp->delegate, op,
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
static pixman_bool_t
sse2_blt (pixman_implementation_t *imp,
          int src_x, int src_y,
          int dst_x, int dst_y,
          int width, int height)
    if (!pixmanBltsse2 (
            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
            src_x, src_y, dst_x, dst_y, width, height))

    return _pixman_implementation_blt (
        src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
        src_x, src_y, dst_x, dst_y, width, height);

#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
static pixman_bool_t
sse2_fill (pixman_implementation_t *imp,
    if (!pixmanFillsse2 (bits, stride, bpp, x, y, width, height, xor))

    return _pixman_implementation_fill (
        imp->delegate, bits, stride, bpp, x, y, width, height, xor);

pixman_implementation_t *
_pixman_implementation_create_sse2 (void)
    pixman_implementation_t *mmx = _pixman_implementation_create_mmx ();
    pixman_implementation_t *imp = _pixman_implementation_create (mmx);
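    /* Delegation chain: anything this implementation does not handle falls
     * through to the MMX delegate and from there to the general code;
     * sse2_composite, sse2_blt and sse2_fill above each try their fast path
     * first and defer to imp->delegate otherwise. */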
    /* SSE2 constants */
    Mask565r = createMask_2x32_128 (0x00f80000, 0x00f80000);
    Mask565g1 = createMask_2x32_128 (0x00070000, 0x00070000);
    Mask565g2 = createMask_2x32_128 (0x000000e0, 0x000000e0);
    Mask565b = createMask_2x32_128 (0x0000001f, 0x0000001f);
    MaskRed = createMask_2x32_128 (0x00f80000, 0x00f80000);
    MaskGreen = createMask_2x32_128 (0x0000fc00, 0x0000fc00);
    MaskBlue = createMask_2x32_128 (0x000000f8, 0x000000f8);
    Mask565FixRB = createMask_2x32_128 (0x00e000e0, 0x00e000e0);
    Mask565FixG = createMask_2x32_128 (0x0000c000, 0x0000c000);
    Mask0080 = createMask_16_128 (0x0080);
    Mask00ff = createMask_16_128 (0x00ff);
    Mask0101 = createMask_16_128 (0x0101);
    Maskffff = createMask_16_128 (0xffff);
    Maskff000000 = createMask_2x32_128 (0xff000000, 0xff000000);
    MaskAlpha = createMask_2x32_128 (0x00ff0000, 0x00000000);
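    /* createMask_16_128 broadcasts one 16-bit value across all eight lanes,
     * while createMask_2x32_128 repeats a pair of 32-bit words. As elsewhere
     * in pixman, Mask0080 and Mask0101 serve the exact divide by 255 in the
     * multiply helpers: add 0x0080, then take the high half of an unsigned
     * 16-bit multiply by 0x0101, which yields round (x / 255). */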
    mask_x565rgb = createMask_2x32_64 (0x000001f0, 0x003f001f);
    mask_x565Unpack = createMask_2x32_64 (0x00000084, 0x04100840);

    mask_x0080 = createMask_16_64 (0x0080);
    mask_x00ff = createMask_16_64 (0x00ff);
    mask_x0101 = createMask_16_64 (0x0101);
    mask_xAlpha = createMask_2x32_64 (0x00ff0000, 0x00000000);

    /* Set up function pointers */

    /* SSE2 combiners, replacing the general code paths from fbcompose.c */
    imp->combine_32[PIXMAN_OP_OVER] = sse2CombineOverU;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2CombineOverReverseU;
    imp->combine_32[PIXMAN_OP_IN] = sse2CombineInU;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2CombineInReverseU;
    imp->combine_32[PIXMAN_OP_OUT] = sse2CombineOutU;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2CombineOutReverseU;
    imp->combine_32[PIXMAN_OP_ATOP] = sse2CombineAtopU;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2CombineAtopReverseU;
    imp->combine_32[PIXMAN_OP_XOR] = sse2CombineXorU;
    imp->combine_32[PIXMAN_OP_ADD] = sse2CombineAddU;

    imp->combine_32[PIXMAN_OP_SATURATE] = sse2CombineSaturateU;

    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2CombineSrcC;
    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2CombineOverC;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2CombineOverReverseC;
    imp->combine_32_ca[PIXMAN_OP_IN] = sse2CombineInC;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2CombineInReverseC;
    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2CombineOutC;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2CombineOutReverseC;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2CombineAtopC;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2CombineAtopReverseC;
    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2CombineXorC;
    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2CombineAddC;

    imp->composite = sse2_composite;
    imp->blt = sse2_blt;
    imp->fill = sse2_fill;

#endif /* USE_SSE2 */