/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission. Red Hat makes no representations about the
 * suitability of this software for any purpose. It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
 *          André Tupinambá (andrelrt@gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */

#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */
#include "pixman-private.h"

/* -------------------------------------------------------------------------------------------------
 * Locals
 */

static __m64 xMask0080;
static __m64 xMask00ff;
static __m64 xMask0101;
static __m64 xMaskAlpha;

static __m64 xMask565rgb;
static __m64 xMask565Unpack;

static __m128i Mask0080;
static __m128i Mask00ff;
static __m128i Mask0101;
static __m128i Maskffff;
static __m128i Maskff000000;
static __m128i MaskAlpha;

static __m128i Mask565r;
static __m128i Mask565g1, Mask565g2;
static __m128i Mask565b;
static __m128i MaskRed;
static __m128i MaskGreen;
static __m128i MaskBlue;

static __m128i Mask565FixRB;
static __m128i Mask565FixG;

/* -------------------------------------------------------------------------------------------------
 * Helpers
 */

static force_inline __m128i
unpack_32_1x128 (uint32_t data)
{
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
}

static force_inline void
unpack_128_2x128 (__m128i data, __m128i* dataLo, __m128i* dataHi)
{
    *dataLo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    *dataHi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}

static force_inline __m128i
unpack565to8888 (__m128i lo)
{
    __m128i r, g, b, rb, t;

    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), MaskRed);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), MaskGreen);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), MaskBlue);

    rb = _mm_or_si128 (r, b);
    t = _mm_and_si128 (rb, Mask565FixRB);
    t = _mm_srli_epi32 (t, 5);
    rb = _mm_or_si128 (rb, t);

    t = _mm_and_si128 (g, Mask565FixG);
    t = _mm_srli_epi32 (t, 6);
    g = _mm_or_si128 (g, t);

    return _mm_or_si128 (rb, g);
}

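/* Worked example (added note, not in the original sources): a full red 565
 * pixel 0xF800 becomes r = (0xF800 << 8) & MaskRed = 0x00F80000, leaving the
 * low three bits of the 8-bit channel empty.  Mask565FixRB presumably keeps
 * the top three bits of each red/blue byte, so t = 0x00E00000 >> 5 =
 * 0x00070000 and rb | t = 0x00FF0000: 5-bit 31 expands to exactly 255.
 * Green carries six bits, so only its top two bits need replicating, hence
 * the separate >> 6 fix-up. */
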
static force_inline void
unpack565_128_4x128 (__m128i data, __m128i* data0, __m128i* data1, __m128i* data2, __m128i* data3)
{
    __m128i lo, hi;

    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

    lo = unpack565to8888 (lo);
    hi = unpack565to8888 (hi);

    unpack_128_2x128 (lo, data0, data1);
    unpack_128_2x128 (hi, data2, data3);
}

static force_inline uint16_t
pack565_32_16 (uint32_t pixel)
{
    return (uint16_t) (((pixel >> 8) & 0xf800) | ((pixel >> 5) & 0x07e0) | ((pixel >> 3) & 0x001f));
}

static force_inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    return _mm_packus_epi16 (lo, hi);
}

static force_inline __m128i
pack565_2x128_128 (__m128i lo, __m128i hi)
{
    __m128i data;
    __m128i r, g1, g2, b;

    data = pack_2x128_128 (lo, hi);

    r  = _mm_and_si128 (data, Mask565r);
    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), Mask565g1);
    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), Mask565g2);
    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), Mask565b);

    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}

static force_inline __m128i
pack565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
{
    return _mm_packus_epi16 (pack565_2x128_128 (*xmm0, *xmm1), pack565_2x128_128 (*xmm2, *xmm3));
}

static force_inline int
isOpaque (__m128i x)
{
    __m128i ffs = _mm_cmpeq_epi8 (x, x);
    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
}

static force_inline int
isZero (__m128i x)
{
    return _mm_movemask_epi8 (_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
}

static force_inline int
isTransparent (__m128i x)
{
    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
}

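/* Note on the three tests above (added explanation): _mm_movemask_epi8
 * returns one bit per byte, so in a register holding four ARGB pixels the
 * alpha bytes land on mask bits 3, 7, 11 and 15 -- the 0x8888 pattern.
 * isOpaque checks that every alpha byte equals 0xff, isZero that all 128
 * bits are zero, and isTransparent that every alpha byte is zero. */
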
static force_inline __m128i
expandPixel_32_1x128 (uint32_t data)
{
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE(1, 0, 1, 0));
}

static force_inline __m128i
expandAlpha_1x128 (__m128i data)
{
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
}

static force_inline void
expandAlpha_2x128 (__m128i dataLo, __m128i dataHi, __m128i* alphaLo, __m128i* alphaHi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(3, 3, 3, 3));
    hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(3, 3, 3, 3));
    *alphaLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(3, 3, 3, 3));
    *alphaHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(3, 3, 3, 3));
}

static force_inline void
expandAlphaRev_2x128 (__m128i dataLo, __m128i dataHi, __m128i* alphaLo, __m128i* alphaHi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(0, 0, 0, 0));
    hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(0, 0, 0, 0));
    *alphaLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(0, 0, 0, 0));
    *alphaHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(0, 0, 0, 0));
}

static force_inline void
pixMultiply_2x128 (__m128i* dataLo, __m128i* dataHi, __m128i* alphaLo, __m128i* alphaHi, __m128i* retLo, __m128i* retHi)
{
    __m128i lo, hi;

    lo = _mm_mullo_epi16 (*dataLo, *alphaLo);
    hi = _mm_mullo_epi16 (*dataHi, *alphaHi);
    lo = _mm_adds_epu16 (lo, Mask0080);
    hi = _mm_adds_epu16 (hi, Mask0080);
    *retLo = _mm_mulhi_epu16 (lo, Mask0101);
    *retHi = _mm_mulhi_epu16 (hi, Mask0101);
}

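/* Added note: the mullo/adds/mulhi sequence above is the exact rounded
 * division by 255.  With t = x * a + 0x80, multiplying by 0x0101 and keeping
 * the high 16 bits computes (t + (t >> 8)) >> 8, which is x * a / 255
 * rounded to nearest for all 8-bit x and a.  A compiled-out scalar sketch of
 * one lane (illustrative only, not part of the original file):
 */
#if 0
static uint8_t
pixMultiplyScalarRef (uint8_t x, uint8_t a)
{
    uint32_t t = (uint32_t) x * a + 0x80;    /* bias so the truncation rounds  */
    return (uint8_t) ((t + (t >> 8)) >> 8);  /* same as (t * 0x0101) >> 16     */
}
#endif
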
static force_inline void
pixAddMultiply_2x128 (__m128i* srcLo, __m128i* srcHi, __m128i* alphaDstLo, __m128i* alphaDstHi,
                      __m128i* dstLo, __m128i* dstHi, __m128i* alphaSrcLo, __m128i* alphaSrcHi,
                      __m128i* retLo, __m128i* retHi)
{
    __m128i lo, hi;
    __m128i mulLo, mulHi;

    lo = _mm_mullo_epi16 (*srcLo, *alphaDstLo);
    hi = _mm_mullo_epi16 (*srcHi, *alphaDstHi);
    mulLo = _mm_mullo_epi16 (*dstLo, *alphaSrcLo);
    mulHi = _mm_mullo_epi16 (*dstHi, *alphaSrcHi);
    lo = _mm_adds_epu16 (lo, Mask0080);
    hi = _mm_adds_epu16 (hi, Mask0080);
    lo = _mm_adds_epu16 (lo, mulLo);
    hi = _mm_adds_epu16 (hi, mulHi);
    *retLo = _mm_mulhi_epu16 (lo, Mask0101);
    *retHi = _mm_mulhi_epu16 (hi, Mask0101);
}

static force_inline void
negate_2x128 (__m128i dataLo, __m128i dataHi, __m128i* negLo, __m128i* negHi)
{
    *negLo = _mm_xor_si128 (dataLo, Mask00ff);
    *negHi = _mm_xor_si128 (dataHi, Mask00ff);
}

static force_inline void
invertColors_2x128 (__m128i dataLo, __m128i dataHi, __m128i* invLo, __m128i* invHi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(3, 0, 1, 2));
    hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(3, 0, 1, 2));
    *invLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(3, 0, 1, 2));
    *invHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(3, 0, 1, 2));
}

static force_inline void
over_2x128 (__m128i* srcLo, __m128i* srcHi, __m128i* alphaLo, __m128i* alphaHi, __m128i* dstLo, __m128i* dstHi)
{
    __m128i t1, t2;

    negate_2x128 (*alphaLo, *alphaHi, &t1, &t2);

    pixMultiply_2x128 (dstLo, dstHi, &t1, &t2, dstLo, dstHi);

    *dstLo = _mm_adds_epu8 (*srcLo, *dstLo);
    *dstHi = _mm_adds_epu8 (*srcHi, *dstHi);
}

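/* For reference (added note), the scalar form of the OVER step performed
 * above on each 8-bit channel is dst = src + dst * (255 - alpha) / 255,
 * with the multiply rounded as in pixMultiplyScalarRef and the final add
 * saturating, matching _mm_adds_epu8 for premultiplied pixels. */
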
static force_inline void
overRevNonPre_2x128 (__m128i srcLo, __m128i srcHi, __m128i* dstLo, __m128i* dstHi)
{
    __m128i lo, hi;
    __m128i alphaLo, alphaHi;

    expandAlpha_2x128 (srcLo, srcHi, &alphaLo, &alphaHi);

    lo = _mm_or_si128 (alphaLo, MaskAlpha);
    hi = _mm_or_si128 (alphaHi, MaskAlpha);

    invertColors_2x128 (srcLo, srcHi, &srcLo, &srcHi);

    pixMultiply_2x128 (&srcLo, &srcHi, &lo, &hi, &lo, &hi);

    over_2x128 (&lo, &hi, &alphaLo, &alphaHi, dstLo, dstHi);
}

static force_inline void
inOver_2x128 (__m128i* srcLo, __m128i* srcHi, __m128i* alphaLo, __m128i* alphaHi,
              __m128i* maskLo, __m128i* maskHi, __m128i* dstLo, __m128i* dstHi)
{
    __m128i sLo, sHi;
    __m128i aLo, aHi;

    pixMultiply_2x128 (srcLo, srcHi, maskLo, maskHi, &sLo, &sHi);
    pixMultiply_2x128 (alphaLo, alphaHi, maskLo, maskHi, &aLo, &aHi);

    over_2x128 (&sLo, &sHi, &aLo, &aHi, dstLo, dstHi);
}

static force_inline void
cachePrefetch (__m128i* addr)
{
    _mm_prefetch ((const char*) addr, _MM_HINT_T0);
}

static force_inline void
cachePrefetchNext (__m128i* addr)
{
    _mm_prefetch ((const char*) (addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
}

/* load 4 pixels from a 16-byte boundary aligned address */
static force_inline __m128i
load128Aligned (__m128i* src)
{
    return _mm_load_si128 (src);
}

/* load 4 pixels from an unaligned address */
static force_inline __m128i
load128Unaligned (const __m128i* src)
{
    return _mm_loadu_si128 (src);
}

/* save 4 pixels using Write Combining memory on a 16-byte boundary aligned address */
static force_inline void
save128WriteCombining (__m128i* dst, __m128i data)
{
    _mm_stream_si128 (dst, data);
}

/* save 4 pixels on a 16-byte boundary aligned address */
static force_inline void
save128Aligned (__m128i* dst, __m128i data)
{
    _mm_store_si128 (dst, data);
}

/* save 4 pixels on an unaligned address */
static force_inline void
save128Unaligned (__m128i* dst, __m128i data)
{
    _mm_storeu_si128 (dst, data);
}

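/* Store-selection note (added): save128WriteCombining uses the non-temporal
 * MOVNTDQ store, which bypasses the cache hierarchy -- appropriate when a
 * destination scanline will not be re-read soon.  save128Aligned is the
 * ordinary cached store used by the combiners below, which read dst back. */
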
/* -------------------------------------------------------------------------------------------------
 * MMX inlines
 */

static force_inline __m64
unpack_32_1x64 (uint32_t data)
{
    return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64 ());
}

static force_inline __m64
expandAlpha_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE(3, 3, 3, 3));
}

static force_inline __m64
expandAlphaRev_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE(0, 0, 0, 0));
}

static force_inline __m64
expandPixel_8_1x64 (uint8_t data)
{
    return _mm_shuffle_pi16 (unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE(0, 0, 0, 0));
}

static force_inline __m64
pixMultiply_1x64 (__m64 data, __m64 alpha)
{
    return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha),
                                          xMask0080),
                           xMask0101);
}

static force_inline __m64
pixAddMultiply_1x64 (__m64* src, __m64* alphaDst, __m64* dst, __m64* alphaSrc)
{
    return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (*src, *alphaDst),
                                                         xMask0080),
                                          _mm_mullo_pi16 (*dst, *alphaSrc)),
                           xMask0101);
}

static force_inline __m64
negate_1x64 (__m64 data)
{
    return _mm_xor_si64 (data, xMask00ff);
}

static force_inline __m64
invertColors_1x64 (__m64 data)
{
    return _mm_shuffle_pi16 (data, _MM_SHUFFLE(3, 0, 1, 2));
}

static force_inline __m64
over_1x64 (__m64 src, __m64 alpha, __m64 dst)
{
    return _mm_adds_pu8 (src, pixMultiply_1x64 (dst, negate_1x64 (alpha)));
}

static force_inline __m64
inOver_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst)
{
    return over_1x64 (pixMultiply_1x64 (*src, *mask),
                      pixMultiply_1x64 (*alpha, *mask),
                      *dst);
}

static force_inline __m64
overRevNonPre_1x64 (__m64 src, __m64 dst)
{
    __m64 alpha = expandAlpha_1x64 (src);

    return over_1x64 (pixMultiply_1x64 (invertColors_1x64 (src),
                                        _mm_or_si64 (alpha, xMaskAlpha)),
                      alpha,
                      dst);
}

static force_inline uint32_t
pack_1x64_32 (__m64 data)
{
    return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ()));
}

/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
 * 00RR00GG00BB
 *
 * --- Expanding 565 in the low word ---
 *
 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
 * m = m & (01f0003f001f);
 * m = m * (008404100840);
 * m = m >> 8;
 *
 * Note the trick here - the top word is shifted by another nibble to
 * avoid it bumping into the middle word
 */
static force_inline __m64
expand565_16_1x64 (uint16_t pixel)
{
    __m64 p;
    __m64 t1, t2;

    p = _mm_cvtsi32_si64 ((uint32_t) pixel);

    t1 = _mm_slli_si64 (p, 36 - 11);
    t2 = _mm_slli_si64 (p, 16 - 5);

    p = _mm_or_si64 (t1, p);
    p = _mm_or_si64 (t2, p);
    p = _mm_and_si64 (p, xMask565rgb);
    p = _mm_mullo_pi16 (p, xMask565Unpack);

    return _mm_srli_pi16 (p, 8);
}

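/* Worked example (added note): for blue, bits 0-4 survive the mask as a
 * 5-bit value v, and v * 0x0840 = (v << 11) | (v << 6) places two copies of
 * v side by side in the 16-bit word; the final >> 8 keeps the high byte,
 * which is v expanded to 8 bits ((v << 3) | (v >> 2)), so 31 becomes 255
 * exactly.  Red works the same way because its extra nibble of pre-shift is
 * folded into the smaller multiplier 0x0084, and green's 6-bit field uses
 * 0x0410 likewise. */
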
/* -------------------------------------------------------------------------------------------------
 * Compose Core transformations
 */
static force_inline uint32_t
coreCombineOverUPixelsse2 (uint32_t src, uint32_t dst)
{
    uint8_t a;
    __m64 ms;

    a = src >> 24;

    if (a == 0xff)
    {
        return src;
    }
    else if (src)
    {
        ms = unpack_32_1x64 (src);
        return pack_1x64_32 (over_1x64 (ms, expandAlpha_1x64 (ms), unpack_32_1x64 (dst)));
    }

    return dst;
}

static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
    uint32_t s = *ps;

    if (pm)
    {
        __m64 ms, mm;

        mm = unpack_32_1x64 (*pm);
        mm = expandAlpha_1x64 (mm);

        ms = unpack_32_1x64 (s);
        ms = pixMultiply_1x64 (ms, mm);

        s = pack_1x64_32 (ms);
    }

    return s;
}

static force_inline __m128i
combine4 (const __m128i *ps, const __m128i *pm)
{
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmMskLo, xmmMskHi;
    __m128i s;

    if (pm)
    {
        xmmMskLo = load128Unaligned (pm);

        if (isTransparent (xmmMskLo))
            return _mm_setzero_si128 ();
    }

    s = load128Unaligned (ps);

    if (pm)
    {
        unpack_128_2x128 (s, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMskLo, &xmmMskLo, &xmmMskHi);

        expandAlpha_2x128 (xmmMskLo, xmmMskHi, &xmmMskLo, &xmmMskHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMskLo, &xmmMskHi, &xmmSrcLo, &xmmSrcHi);

        s = pack_2x128_128 (xmmSrcLo, xmmSrcHi);
    }

    return s;
}

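/* combine1/combine4 note (added): when a mask pointer is supplied, the
 * source is pre-multiplied by the mask's expanded alpha before the operator
 * runs, so every coreCombine*Usse2 below can treat the masked and unmasked
 * cases uniformly; combine4 also early-outs with zero when all four mask
 * alphas are zero. */
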
static force_inline void
coreCombineOverUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
{
    uint32_t s, d;

    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmAlphaLo, xmmAlphaHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    /* Align dst on a 16-byte boundary */
    while (w &&
           ((unsigned long)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = coreCombineOverUPixelsse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        /* I'm loading unaligned because I'm not sure about the address alignment. */
        xmmSrcHi = combine4 ((__m128i*)ps, (__m128i*)pm);

        if (isOpaque (xmmSrcHi))
        {
            save128Aligned ((__m128i*)pd, xmmSrcHi);
        }
        else if (!isZero (xmmSrcHi))
        {
            xmmDstHi = load128Aligned ((__m128i*) pd);

            unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
            unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

            expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);

            over_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);

            /* rebuild the 4 pixel data and save */
            save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));
        }

        w -= 4;
        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = coreCombineOverUPixelsse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

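/* Loop-structure note (added): every combiner below follows the same three
 * phases -- a scalar head that runs until pd reaches a 16-byte boundary, a
 * vector body that processes four pixels per iteration with aligned stores,
 * and a scalar tail for the remaining w % 4 pixels.  Only dst is aligned;
 * src and mask are always read with unaligned loads. */
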
static force_inline void
coreCombineOverReverseUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
{
    uint32_t s, d;

    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmAlphaLo, xmmAlphaHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    /* Align dst on a 16-byte boundary */
    while (w &&
           ((unsigned long)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = coreCombineOverUPixelsse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        /* I'm loading unaligned because I'm not sure about the address alignment. */
        xmmSrcHi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmmDstHi = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);

        over_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmSrcLo, &xmmSrcHi);

        /* rebuild the 4 pixel data and save */
        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmSrcLo, xmmSrcHi));

        w -= 4;
        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = coreCombineOverUPixelsse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
coreCombineInUPixelsse2 (uint32_t src, uint32_t dst)
{
    uint32_t maska = src >> 24;

    if (maska == 0)
    {
        return 0;
    }
    else if (maska != 0xff)
    {
        return pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (dst),
                                               expandAlpha_1x64 (unpack_32_1x64 (src))));
    }

    return dst;
}

static force_inline void
coreCombineInUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
{
    uint32_t s, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = coreCombineInUPixelsse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*) pd);
        xmmSrcHi = combine4 ((__m128i*) ps, (__m128i*) pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = coreCombineInUPixelsse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline void
coreCombineReverseInUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
{
    uint32_t s, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = coreCombineInUPixelsse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*) pd);
        xmmSrcHi = combine4 ((__m128i*) ps, (__m128i*)pm);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = coreCombineInUPixelsse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline void
coreCombineReverseOutUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
{
    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && ((unsigned long) pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
                                                negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (s)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        __m128i xmmSrcLo, xmmSrcHi;
        __m128i xmmDstLo, xmmDstHi;

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmSrcHi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmmDstHi = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        negate_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi);

        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;
        w -= 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
                                                negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (s)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline void
coreCombineOutUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
{
    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && ((unsigned long) pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s),
                                                negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        __m128i xmmSrcLo, xmmSrcHi;
        __m128i xmmDstLo, xmmDstHi;

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmSrcHi = combine4 ((__m128i*) ps, (__m128i*)pm);
        xmmDstHi = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);
        negate_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;
        w -= 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s),
                                                negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
coreCombineAtopUPixelsse2 (uint32_t src, uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 sa = negate_1x64 (expandAlpha_1x64 (s));
    __m64 da = expandAlpha_1x64 (d);

    return pack_1x64_32 (pixAddMultiply_1x64 (&s, &da, &d, &sa));
}

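/* ATOP in scalar form (added note): result = (src * alpha(dst) +
 * dst * (255 - alpha(src))) / 255 per channel, with the usual rounding;
 * pixAddMultiply_1x64 evaluates both products and the saturating sum in a
 * single pass. */
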
static force_inline void
coreCombineAtopUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
{
    uint32_t s, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = coreCombineAtopUPixelsse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmSrcHi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmmDstHi = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        negate_2x128 (xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);

        pixAddMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
                              &xmmDstLo, &xmmDstHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi,
                              &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = coreCombineAtopUPixelsse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
coreCombineReverseAtopUPixelsse2 (uint32_t src, uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 sa = expandAlpha_1x64 (s);
    __m64 da = negate_1x64 (expandAlpha_1x64 (d));

    return pack_1x64_32 (pixAddMultiply_1x64 (&s, &da, &d, &sa));
}

static force_inline void
coreCombineReverseAtopUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w)
{
    uint32_t s, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = coreCombineReverseAtopUPixelsse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmSrcHi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmmDstHi = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixAddMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
                              &xmmDstLo, &xmmDstHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi,
                              &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = coreCombineReverseAtopUPixelsse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
coreCombineXorUPixelsse2 (uint32_t src, uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 negD = negate_1x64 (expandAlpha_1x64 (d));
    __m64 negS = negate_1x64 (expandAlpha_1x64 (s));

    return pack_1x64_32 (pixAddMultiply_1x64 (&s, &negD, &d, &negS));
}

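/* XOR in scalar form (added note): result = (src * (255 - alpha(dst)) +
 * dst * (255 - alpha(src))) / 255; each component keeps only the part of
 * itself not covered by the other. */
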
static force_inline void
coreCombineXorUsse2 (uint32_t* dst, const uint32_t* src, const uint32_t *mask, int width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
    __m128i xmmDst, xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && ((unsigned long) pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = coreCombineXorUPixelsse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmSrc = combine4 ((__m128i*) ps, (__m128i*) pm);
        xmmDst = load128Aligned ((__m128i*) pd);

        unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        negate_2x128 (xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixAddMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
                              &xmmDstLo, &xmmDstHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi,
                              &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = coreCombineXorUPixelsse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline void
coreCombineAddUsse2 (uint32_t* dst, const uint32_t* src, const uint32_t* mask, int width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        if (pm)
            pm++;
        *pd++ = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        __m128i s;

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        s = combine4 ((__m128i*)ps, (__m128i*)pm);

        save128Aligned ((__m128i*)pd,
                        _mm_adds_epu8 (s, load128Aligned ((__m128i*)pd)));

        pd += 4;
        ps += 4;
        if (pm)
            pm += 4;
        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        *pd++ = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d)));
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
coreCombineSaturateUPixelsse2 (uint32_t src, uint32_t dst)
{
    __m64 ms = unpack_32_1x64 (src);
    __m64 md = unpack_32_1x64 (dst);
    uint32_t sa = src >> 24;
    uint32_t da = ~dst >> 24;

    if (sa > da)
    {
        ms = pixMultiply_1x64 (ms, expandAlpha_1x64 (unpack_32_1x64 (FbIntDiv(da, sa) << 24)));
    }

    return pack_1x64_32 (_mm_adds_pu16 (md, ms));
}

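/* Saturate note (added): da = ~dst >> 24 is the destination's remaining
 * alpha headroom; when the source alpha exceeds it, the source is scaled
 * down by da/sa before the saturating add.  FbIntDiv comes from the fb
 * compatibility headers; it is assumed here to be the usual
 * (a * 255) / b integer division. */
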
static force_inline void
coreCombineSaturateUsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
{
    uint32_t s, d;

    uint32_t packCmp;
    __m128i xmmSrc, xmmDst;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = coreCombineSaturateUPixelsse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDst = load128Aligned ((__m128i*)pd);
        xmmSrc = combine4 ((__m128i*)ps, (__m128i*)pm);

        packCmp = _mm_movemask_epi8 (_mm_cmpgt_epi32 (_mm_srli_epi32 (xmmSrc, 24),
                                                      _mm_srli_epi32 (_mm_xor_si128 (xmmDst, Maskff000000), 24)));

        /* if some alpha src is greater than respective ~alpha dst */
        if (packCmp)
        {
            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = coreCombineSaturateUPixelsse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = coreCombineSaturateUPixelsse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = coreCombineSaturateUPixelsse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = coreCombineSaturateUPixelsse2 (s, d);
            if (pm)
                pm++;
        }
        else
        {
            save128Aligned ((__m128i*)pd, _mm_adds_epu8 (xmmDst, xmmSrc));

            pd += 4;
            ps += 4;
            if (pm)
                pm += 4;
        }

        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = coreCombineSaturateUPixelsse2 (s, d);
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline void
coreCombineSrcCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
{
    uint32_t s, m;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmMaskLo, xmmMaskHi;
    __m128i xmmDstLo, xmmDstHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)));
        w--;
    }
}

static force_inline uint32_t
coreCombineOverCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
{
    __m64 s = unpack_32_1x64 (src);
    __m64 expAlpha = expandAlpha_1x64 (s);
    __m64 unpkMask = unpack_32_1x64 (mask);
    __m64 unpkDst = unpack_32_1x64 (dst);

    return pack_1x64_32 (inOver_1x64 (&s, &expAlpha, &unpkMask, &unpkDst));
}

static force_inline void
coreCombineOverCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineOverCPixelsse2 (s, m, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);

        inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineOverCPixelsse2 (s, m, d);
        w--;
    }
}

static force_inline uint32_t
coreCombineOverReverseCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
{
    __m64 d = unpack_32_1x64 (dst);

    return pack_1x64_32 (over_1x64 (d, expandAlpha_1x64 (d),
                                    pixMultiply_1x64 (unpack_32_1x64 (src), unpack_32_1x64 (mask))));
}

static force_inline void
coreCombineOverReverseCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineOverReverseCPixelsse2 (s, m, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        over_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMaskLo, &xmmMaskHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmMaskLo, xmmMaskHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineOverReverseCPixelsse2 (s, m, d);
        w--;
    }
}

static force_inline void
coreCombineInCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
                                                expandAlpha_1x64 (unpack_32_1x64 (d))));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);

        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
                                                expandAlpha_1x64 (unpack_32_1x64 (d))));
        w--;
    }
}

static force_inline void
coreCombineInReverseCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
                                                pixMultiply_1x64 (unpack_32_1x64 (m),
                                                                  expandAlpha_1x64 (unpack_32_1x64 (s)))));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);
        pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaLo, &xmmAlphaHi, &xmmAlphaLo, &xmmAlphaHi);

        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
                                                pixMultiply_1x64 (unpack_32_1x64 (m),
                                                                  expandAlpha_1x64 (unpack_32_1x64 (s)))));
        w--;
    }
}

static force_inline void
coreCombineOutCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
                                                negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi);
        negate_2x128 (xmmAlphaLo, xmmAlphaHi, &xmmAlphaLo, &xmmAlphaHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);
        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)),
                                                negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d)))));
        w--;
    }
}

static force_inline void
coreCombineOutReverseCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
                                                negate_1x64 (pixMultiply_1x64 (unpack_32_1x64 (m),
                                                                               expandAlpha_1x64 (unpack_32_1x64 (s))))));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);

        pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMaskLo, &xmmMaskHi);

        negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d),
                                                negate_1x64 (pixMultiply_1x64 (unpack_32_1x64 (m),
                                                                               expandAlpha_1x64 (unpack_32_1x64 (s))))));
        w--;
    }
}

static force_inline uint32_t
coreCombineAtopCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
{
    __m64 m = unpack_32_1x64 (mask);
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);
    __m64 sa = expandAlpha_1x64 (s);
    __m64 da = expandAlpha_1x64 (d);

    s = pixMultiply_1x64 (s, m);
    m = negate_1x64 (pixMultiply_1x64 (m, sa));

    return pack_1x64_32 (pixAddMultiply_1x64 (&d, &m, &s, &da));
}

static force_inline void
coreCombineAtopCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineAtopCPixelsse2 (s, m, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
        pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi);

        negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        pixAddMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi,
                              &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
                              &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineAtopCPixelsse2 (s, m, d);
        w--;
    }
}

static force_inline uint32_t
coreCombineReverseAtopCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
{
    __m64 m = unpack_32_1x64 (mask);
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 da = negate_1x64 (expandAlpha_1x64 (d));
    __m64 sa = expandAlpha_1x64 (s);

    s = pixMultiply_1x64 (s, m);
    m = pixMultiply_1x64 (m, sa);

    return pack_1x64_32 (pixAddMultiply_1x64 (&d, &m, &s, &da));
}

static force_inline void
coreCombineReverseAtopCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineReverseAtopCPixelsse2 (s, m, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
        pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi);

        negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixAddMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi,
                              &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
                              &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineReverseAtopCPixelsse2 (s, m, d);
        w--;
    }
}

static force_inline uint32_t
coreCombineXorCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst)
{
    __m64 a = unpack_32_1x64 (mask);
    __m64 s = unpack_32_1x64 (src);
    __m64 d = unpack_32_1x64 (dst);

    __m64 alphaDst = negate_1x64 (pixMultiply_1x64 (a, expandAlpha_1x64 (s)));
    __m64 dest = pixMultiply_1x64 (s, a);
    __m64 alphaSrc = negate_1x64 (expandAlpha_1x64 (d));

    return pack_1x64_32 (pixAddMultiply_1x64 (&d,
                                              &alphaDst,
                                              &dest,
                                              &alphaSrc));
}

static force_inline void
coreCombineXorCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmAlphaSrcLo, xmmAlphaSrcHi;
    __m128i xmmAlphaDstLo, xmmAlphaDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineXorCPixelsse2 (s, m, d);
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmDstHi = load128Aligned ((__m128i*)pd);
        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);
        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi);
        expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);
        pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi);

        negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi);
        negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        pixAddMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi,
                              &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi,
                              &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = coreCombineXorCPixelsse2 (s, m, d);
        w--;
    }
}

static force_inline void
coreCombineAddCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w)
{
    uint32_t s, m, d;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;
    __m128i xmmMaskLo, xmmMaskHi;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (_mm_adds_pu8 (pixMultiply_1x64 (unpack_32_1x64 (s),
                                                              unpack_32_1x64 (m)),
                                            unpack_32_1x64 (d)));
        w--;
    }

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)ps);
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w >= 4)
    {
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)ps);
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmSrcHi = load128Unaligned ((__m128i*)ps);
        xmmMaskHi = load128Unaligned ((__m128i*)pm);
        xmmDstHi = load128Aligned ((__m128i*)pd);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
        unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi);

        save128Aligned ((__m128i*)pd, pack_2x128_128 (_mm_adds_epu8 (xmmSrcLo, xmmDstLo),
                                                      _mm_adds_epu8 (xmmSrcHi, xmmDstHi)));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x64_32 (_mm_adds_pu8 (pixMultiply_1x64 (unpack_32_1x64 (s),
                                                              unpack_32_1x64 (m)),
                                            unpack_32_1x64 (d)));
        w--;
    }
}

/* -------------------------------------------------------------------------------------------------
 * fbComposeSetupSSE2
 */
static force_inline __m64
createMask_16_64 (uint16_t mask)
{
    return _mm_set1_pi16 (mask);
}

static force_inline __m128i
createMask_16_128 (uint16_t mask)
{
    return _mm_set1_epi16 (mask);
}

static force_inline __m64
createMask_2x32_64 (uint32_t mask0, uint32_t mask1)
{
    return _mm_set_pi32 (mask0, mask1);
}

static force_inline __m128i
createMask_2x32_128 (uint32_t mask0, uint32_t mask1)
{
    return _mm_set_epi32 (mask0, mask1, mask0, mask1);
}

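/* Illustrative sketch (added, compiled out): the masks declared at the top
 * of this file are presumably initialized with these helpers during SSE2
 * setup.  The values below follow from how the masks are used in the
 * arithmetic above (0x0080/0x0101 for the rounded /255, 0x00ff for negate,
 * 0xff000000 for the alpha byte); the real setup code is not part of this
 * section. */
#if 0
static void
initMasksSketch (void)
{
    Mask0080 = createMask_16_128 (0x0080);
    Mask00ff = createMask_16_128 (0x00ff);
    Mask0101 = createMask_16_128 (0x0101);
    Maskff000000 = createMask_2x32_128 (0xff000000, 0xff000000);

    xMask0080 = createMask_16_64 (0x0080);
    xMask00ff = createMask_16_64 (0x00ff);
    xMask0101 = createMask_16_64 (0x0101);
}
#endif
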
/* SSE2 code patch for fbcompose.c */

static void
sse2CombineOverU (pixman_implementation_t *imp, pixman_op_t op,
                  uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
{
    coreCombineOverUsse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2CombineOverReverseU (pixman_implementation_t *imp, pixman_op_t op,
                         uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
{
    coreCombineOverReverseUsse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2CombineInU (pixman_implementation_t *imp, pixman_op_t op,
                uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
{
    coreCombineInUsse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2CombineInReverseU (pixman_implementation_t *imp, pixman_op_t op,
                       uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
{
    coreCombineReverseInUsse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2CombineOutU (pixman_implementation_t *imp, pixman_op_t op,
                 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
{
    coreCombineOutUsse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2CombineOutReverseU (pixman_implementation_t *imp, pixman_op_t op,
                        uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
{
    coreCombineReverseOutUsse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2CombineAtopU (pixman_implementation_t *imp, pixman_op_t op,
                  uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
{
    coreCombineAtopUsse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2CombineAtopReverseU (pixman_implementation_t *imp, pixman_op_t op,
                         uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
{
    coreCombineReverseAtopUsse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2CombineXorU (pixman_implementation_t *imp, pixman_op_t op,
                 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
{
    coreCombineXorUsse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2CombineAddU (pixman_implementation_t *imp, pixman_op_t op,
                 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
{
    coreCombineAddUsse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2CombineSaturateU (pixman_implementation_t *imp, pixman_op_t op,
                      uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
{
    coreCombineSaturateUsse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2CombineSrcC (pixman_implementation_t *imp, pixman_op_t op,
                 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
{
    coreCombineSrcCsse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2CombineOverC (pixman_implementation_t *imp, pixman_op_t op,
                  uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
{
    coreCombineOverCsse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2CombineOverReverseC (pixman_implementation_t *imp, pixman_op_t op,
                         uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
{
    coreCombineOverReverseCsse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2CombineInC (pixman_implementation_t *imp, pixman_op_t op,
                uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
{
    coreCombineInCsse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2CombineInReverseC (pixman_implementation_t *imp, pixman_op_t op,
                       uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
{
    coreCombineInReverseCsse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2CombineOutC (pixman_implementation_t *imp, pixman_op_t op,
                 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
{
    coreCombineOutCsse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2CombineOutReverseC (pixman_implementation_t *imp, pixman_op_t op,
                        uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
{
    coreCombineOutReverseCsse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2CombineAtopC (pixman_implementation_t *imp, pixman_op_t op,
                  uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
{
    coreCombineAtopCsse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2CombineAtopReverseC (pixman_implementation_t *imp, pixman_op_t op,
                         uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
{
    coreCombineReverseAtopCsse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2CombineXorC (pixman_implementation_t *imp, pixman_op_t op,
                 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
{
    coreCombineXorCsse2 (dst, src, mask, width);
    _mm_empty ();
}

static void
sse2CombineAddC (pixman_implementation_t *imp, pixman_op_t op,
                 uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width)
{
    coreCombineAddCsse2 (dst, src, mask, width);
    _mm_empty ();
}

/* -------------------------------------------------------------------------------------------------
 * fbCompositeSolid_nx8888
 */
static void
fbCompositeSolid_nx8888sse2 (pixman_implementation_t *imp, pixman_op_t op,
                             pixman_image_t *pSrc, pixman_image_t *pMask, pixman_image_t *pDst,
                             int16_t xSrc, int16_t ySrc, int16_t xMask, int16_t yMask,
                             int16_t xDst, int16_t yDst, uint16_t width, uint16_t height)
    uint32_t *dstLine, *dst, d;

    __m128i xmmSrc, xmmAlpha;
    __m128i xmmDst, xmmDstLo, xmmDstHi;

    src = _pixman_image_get_solid (pSrc, pDst->bits.format);

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);

    xmmSrc = expandPixel_32_1x128 (src);
    xmmAlpha = expandAlpha_1x128 (xmmSrc);

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)dst);

    dstLine += dstStride;
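    /* Composite one pixel at a time until dst is 16-byte aligned, so that the
     * main loop below can use aligned 128-bit loads and stores. */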
    while (w && (unsigned long)dst & 15)
        *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
                                          _mm_movepi64_pi64 (xmmAlpha),
                                          unpack_32_1x64 (d)));

    cachePrefetch ((__m128i*)dst);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)dst);

        xmmDst = load128Aligned ((__m128i*)dst);

        unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

        over_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmDstLo, &xmmDstHi);

        /* rebuild the 4 pixel data and save */
        save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

    *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
                                      _mm_movepi64_pi64 (xmmAlpha),
                                      unpack_32_1x64 (d)));

/* -------------------------------------------------------------------------------------------------
 * fbCompositeSolid_nx0565
 */
static void
fbCompositeSolid_nx0565sse2 (pixman_implementation_t *imp, pixman_op_t op,
                             pixman_image_t *pSrc, pixman_image_t *pMask, pixman_image_t *pDst,
                             int16_t xSrc, int16_t ySrc, int16_t xMask, int16_t yMask,
                             int16_t xDst, int16_t yDst, uint16_t width, uint16_t height)
    uint16_t *dstLine, *dst, d;

    __m128i xmmSrc, xmmAlpha;
    __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;

    src = _pixman_image_get_solid (pSrc, pDst->bits.format);

    fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);

    xmmSrc = expandPixel_32_1x128 (src);
    xmmAlpha = expandAlpha_1x128 (xmmSrc);

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)dst);

    dstLine += dstStride;

    while (w && (unsigned long)dst & 15)
        *dst++ = pack565_32_16 (pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
                                                         _mm_movepi64_pi64 (xmmAlpha),
                                                         expand565_16_1x64 (d))));

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)dst);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)dst);

        xmmDst = load128Aligned ((__m128i*)dst);

        unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);

        over_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmDst0, &xmmDst1);
        over_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmDst2, &xmmDst3);

        xmmDst = pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
        save128Aligned ((__m128i*)dst, xmmDst);

    *dst++ = pack565_32_16 (pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc),
                                                     _mm_movepi64_pi64 (xmmAlpha),
                                                     expand565_16_1x64 (d))));

/* -------------------------------------------------------------------------------------------------
 * fbCompositeSolidMask_nx8888x8888C
 */
static void
fbCompositeSolidMask_nx8888x8888Csse2 (pixman_implementation_t *imp, pixman_op_t op,
                                       pixman_image_t *pSrc, pixman_image_t *pMask, pixman_image_t *pDst,
                                       int16_t xSrc, int16_t ySrc, int16_t xMask, int16_t yMask,
                                       int16_t xDst, int16_t yDst, uint16_t width, uint16_t height)
    uint32_t *dstLine, d;
    uint32_t *maskLine, m;

    int dstStride, maskStride;

    __m128i xmmSrc, xmmAlpha;
    __m128i xmmDst, xmmDstLo, xmmDstHi;
    __m128i xmmMask, xmmMaskLo, xmmMaskHi;

    __m64 mmxSrc, mmxAlpha, mmxMask, mmxDst;

    src = _pixman_image_get_solid (pSrc, pDst->bits.format);

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1);

    xmmSrc = _mm_unpacklo_epi8 (createMask_2x32_128 (src, src), _mm_setzero_si128 ());
    xmmAlpha = expandAlpha_1x128 (xmmSrc);
    mmxSrc = _mm_movepi64_pi64 (xmmSrc);
    mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);

    const uint32_t *pm = (uint32_t *)maskLine;
    uint32_t *pd = (uint32_t *)dstLine;

    dstLine += dstStride;
    maskLine += maskStride;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

    while (w && (unsigned long)pd & 15)
        mmxMask = unpack_32_1x64 (m);
        mmxDst = unpack_32_1x64 (d);

        *pd = pack_1x64_32 (inOver_1x64 (&mmxSrc,
                                         &mmxAlpha,
                                         &mmxMask,
                                         &mmxDst));

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)pd);
    cachePrefetch ((__m128i*)pm);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)pd);
        cachePrefetchNext ((__m128i*)pm);

        xmmMask = load128Unaligned ((__m128i*)pm);

        packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128 ()));

        /* if all bits in mask are zero, packCmp is equal to 0xffff */
        if (packCmp != 0xffff)
            xmmDst = load128Aligned ((__m128i*)pd);

            unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
            unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

            inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);

            save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi));

        mmxMask = unpack_32_1x64 (m);
        mmxDst = unpack_32_1x64 (d);

        *pd = pack_1x64_32 (inOver_1x64 (&mmxSrc,
                                         &mmxAlpha,
                                         &mmxMask,
                                         &mmxDst));

/* -------------------------------------------------------------------------------------------------
 * fbCompositeSrc_8888x8x8888
 */
static void
fbCompositeSrc_8888x8x8888sse2 (pixman_implementation_t *imp, pixman_op_t op,
                                pixman_image_t *pSrc, pixman_image_t *pMask, pixman_image_t *pDst,
                                int16_t xSrc, int16_t ySrc, int16_t xMask, int16_t yMask,
                                int16_t xDst, int16_t yDst, uint16_t width, uint16_t height)
    uint32_t *dstLine, *dst;
    uint32_t *srcLine, *src;

    int dstStride, srcStride;

    __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
    __m128i xmmDst, xmmDstLo, xmmDstHi;
    __m128i xmmAlphaLo, xmmAlphaHi;

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
    mask = _pixman_image_get_solid (pMask, pDst->bits.format);

    xmmMask = createMask_16_128 (mask >> 24);

    dstLine += dstStride;
    srcLine += srcStride;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)dst);
    cachePrefetch ((__m128i*)src);

    while (w && (unsigned long)dst & 15)
        uint32_t s = *src++;

        __m64 ms = unpack_32_1x64 (s);
        __m64 alpha = expandAlpha_1x64 (ms);
        __m64 mask = _mm_movepi64_pi64 (xmmMask);
        __m64 dest = unpack_32_1x64 (d);

        *dst++ = pack_1x64_32 (inOver_1x64 (&ms,
                                            &alpha,
                                            &mask,
                                            &dest));

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)dst);
    cachePrefetch ((__m128i*)src);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)dst);
        cachePrefetchNext ((__m128i*)src);

        xmmSrc = load128Unaligned ((__m128i*)src);
        xmmDst = load128Aligned ((__m128i*)dst);

        unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);

        inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMask, &xmmMask, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

        uint32_t s = *src++;

        __m64 ms = unpack_32_1x64 (s);
        __m64 alpha = expandAlpha_1x64 (ms);
        __m64 mask = _mm_movepi64_pi64 (xmmMask);
        __m64 dest = unpack_32_1x64 (d);

        *dst++ = pack_1x64_32 (inOver_1x64 (&ms,
                                            &alpha,
                                            &mask,
                                            &dest));

/* -------------------------------------------------------------------------------------------------
 * fbCompositeSrc_x888xnx8888
 */
static void
fbCompositeSrc_x888xnx8888sse2 (pixman_implementation_t *imp, pixman_op_t op,
                                pixman_image_t *pSrc, pixman_image_t *pMask, pixman_image_t *pDst,
                                int16_t xSrc, int16_t ySrc, int16_t xMask, int16_t yMask,
                                int16_t xDst, int16_t yDst, uint16_t width, uint16_t height)
    uint32_t *dstLine, *dst;
    uint32_t *srcLine, *src;

    int dstStride, srcStride;

    __m128i xmmMask, xmmAlpha;
    __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
    __m128i xmmDst, xmmDstLo, xmmDstHi;

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
    mask = _pixman_image_get_solid (pMask, pDst->bits.format);

    xmmMask = createMask_16_128 (mask >> 24);
    xmmAlpha = Mask00ff;

    dstLine += dstStride;
    srcLine += srcStride;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)dst);
    cachePrefetch ((__m128i*)src);

    while (w && (unsigned long)dst & 15)
        uint32_t s = (*src++) | 0xff000000;

        __m64 src = unpack_32_1x64 (s);
        __m64 alpha = _mm_movepi64_pi64 (xmmAlpha);
        __m64 mask = _mm_movepi64_pi64 (xmmMask);
        __m64 dest = unpack_32_1x64 (d);

        *dst++ = pack_1x64_32 (inOver_1x64 (&src,
                                            &alpha,
                                            &mask,
                                            &dest));

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)dst);
    cachePrefetch ((__m128i*)src);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)dst);
        cachePrefetchNext ((__m128i*)src);

        xmmSrc = _mm_or_si128 (load128Unaligned ((__m128i*)src), Maskff000000);
        xmmDst = load128Aligned ((__m128i*)dst);

        unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

        inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlpha, &xmmAlpha, &xmmMask, &xmmMask, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

        uint32_t s = (*src++) | 0xff000000;

        __m64 src = unpack_32_1x64 (s);
        __m64 alpha = _mm_movepi64_pi64 (xmmAlpha);
        __m64 mask = _mm_movepi64_pi64 (xmmMask);
        __m64 dest = unpack_32_1x64 (d);

        *dst++ = pack_1x64_32 (inOver_1x64 (&src,
                                            &alpha,
                                            &mask,
                                            &dest));

/* -------------------------------------------------------------------------------------------------
 * fbCompositeSrc_8888x8888
 */
static void
fbCompositeSrc_8888x8888sse2 (pixman_implementation_t *imp, pixman_op_t op,
                              pixman_image_t *pSrc, pixman_image_t *pMask, pixman_image_t *pDst,
                              int16_t xSrc, int16_t ySrc, int16_t xMask, int16_t yMask,
                              int16_t xDst, int16_t yDst, uint16_t width, uint16_t height)
    int dstStride, srcStride;
    uint32_t *dstLine, *dst;
    uint32_t *srcLine, *src;

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

    coreCombineOverUsse2 (dst, src, NULL, width);

/* -------------------------------------------------------------------------------------------------
 * fbCompositeSrc_8888x0565
 */
static force_inline uint16_t
fbCompositeSrc_8888x0565pixel (uint32_t src, uint16_t dst)
{
    __m64 ms;

    ms = unpack_32_1x64 (src);
    return pack565_32_16 (pack_1x64_32 (over_1x64 (ms,
                                                   expandAlpha_1x64 (ms),
                                                   expand565_16_1x64 (dst))));
}

static void
fbCompositeSrc_8888x0565sse2 (pixman_implementation_t *imp, pixman_op_t op,
                              pixman_image_t *pSrc, pixman_image_t *pMask, pixman_image_t *pDst,
                              int16_t xSrc, int16_t ySrc, int16_t xMask, int16_t yMask,
                              int16_t xDst, int16_t yDst, uint16_t width, uint16_t height)
    uint16_t *dstLine, *dst, d;
    uint32_t *srcLine, *src, s;
    int dstStride, srcStride;

    __m128i xmmAlphaLo, xmmAlphaHi;
    __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
    __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;

    fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

    /* FIXME: this code was copied from the MMX version, FIXME included.
     * If it's a problem there, it's probably a problem here too.
     */
    assert (pSrc->pDrawable == pMask->pDrawable);

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)src);
    cachePrefetch ((__m128i*)dst);

    dstLine += dstStride;
    srcLine += srcStride;

    /* Align dst on a 16-byte boundary */
    while (w &&
           ((unsigned long)dst & 15))
        *dst++ = fbCompositeSrc_8888x0565pixel (s, d);

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)src);
    cachePrefetch ((__m128i*)dst);

    /* It's an 8-pixel loop */
        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)src);
        cachePrefetchNext ((__m128i*)dst);

        /* I'm loading unaligned because I'm not sure about the address alignment. */
        xmmSrc = load128Unaligned ((__m128i*) src);
        xmmDst = load128Aligned ((__m128i*) dst);

        unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
        unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);

        /* I'm loading the next 4 pixels from memory early, to optimize the memory read. */
        xmmSrc = load128Unaligned ((__m128i*) (src+4));

        over_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDst0, &xmmDst1);

        unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
        expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi);

        over_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDst2, &xmmDst3);

        save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));

    *dst++ = fbCompositeSrc_8888x0565pixel (s, d);

/* -------------------------------------------------------------------------------------------------
 * fbCompositeSolidMask_nx8x8888
 */
static void
fbCompositeSolidMask_nx8x8888sse2 (pixman_implementation_t *imp, pixman_op_t op,
                                   pixman_image_t *pSrc, pixman_image_t *pMask, pixman_image_t *pDst,
                                   int16_t xSrc, int16_t ySrc, int16_t xMask, int16_t yMask,
                                   int16_t xDst, int16_t yDst, uint16_t width, uint16_t height)
    uint32_t *dstLine, *dst;
    uint8_t *maskLine, *mask;
    int dstStride, maskStride;

    __m128i xmmSrc, xmmAlpha, xmmDef;
    __m128i xmmDst, xmmDstLo, xmmDstHi;
    __m128i xmmMask, xmmMaskLo, xmmMaskHi;

    __m64 mmxSrc, mmxAlpha, mmxMask, mmxDest;

    src = _pixman_image_get_solid (pSrc, pDst->bits.format);

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);

    xmmDef = createMask_2x32_128 (src, src);
    xmmSrc = expandPixel_32_1x128 (src);
    xmmAlpha = expandAlpha_1x128 (xmmSrc);
    mmxSrc = _mm_movepi64_pi64 (xmmSrc);
    mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);

    dstLine += dstStride;
    maskLine += maskStride;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)mask);
    cachePrefetch ((__m128i*)dst);

    while (w && (unsigned long)dst & 15)
        uint8_t m = *mask++;

        mmxMask = expandPixel_8_1x64 (m);
        mmxDest = unpack_32_1x64 (d);

        *dst = pack_1x64_32 (inOver_1x64 (&mmxSrc,
                                          &mmxAlpha,
                                          &mmxMask,
                                          &mmxDest));

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)mask);
    cachePrefetch ((__m128i*)dst);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)mask);
        cachePrefetchNext ((__m128i*)dst);

        m = *((uint32_t*)mask);
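        /* Four a8 mask pixels are read at once; when the source is fully
         * opaque and all four mask values are 0xff, the solid source can be
         * stored directly without blending. */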
        if (srca == 0xff && m == 0xffffffff)
            save128Aligned ((__m128i*)dst, xmmDef);

            xmmDst = load128Aligned ((__m128i*) dst);
            xmmMask = unpack_32_1x128 (m);
            xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128 ());

            unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);
            unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);

            expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

            inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);

            save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

        uint8_t m = *mask++;

        mmxMask = expandPixel_8_1x64 (m);
        mmxDest = unpack_32_1x64 (d);

        *dst = pack_1x64_32 (inOver_1x64 (&mmxSrc,
                                          &mmxAlpha,
                                          &mmxMask,
                                          &mmxDest));

/* -------------------------------------------------------------------------------------------------
 * fbCompositeSolidMask_nx8x8888
 */
static pixman_bool_t
pixmanFillsse2 (uint32_t *bits, int stride, int bpp,
                int x, int y, int width, int height, uint32_t data)
    uint32_t byte_width;
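    /* The fill value is written 32 (and 128) bits at a time, so for 16 bpp
     * the two 16-bit halves of 'data' must hold the same pixel value. */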
    if (bpp == 16 && (data >> 16 != (data & 0xffff)))
        return FALSE;

    if (bpp != 16 && bpp != 32)
        return FALSE;

    if (bpp == 16)
    {
        stride = stride * (int) sizeof (uint32_t) / 2;
        byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
        byte_width = 2 * width;
    }
    else
    {
        stride = stride * (int) sizeof (uint32_t) / 4;
        byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
        byte_width = 4 * width;
    }

    cachePrefetch ((__m128i*)byte_line);
    xmmDef = createMask_2x32_128 (data, data);

        uint8_t *d = byte_line;
        byte_line += stride;

        cachePrefetchNext ((__m128i*)d);

        while (w >= 2 && ((unsigned long)d & 3))
            *(uint16_t *)d = data;

        while (w >= 4 && ((unsigned long)d & 15))
            *(uint32_t *)d = data;

        cachePrefetchNext ((__m128i*)d);

            cachePrefetch (((__m128i*)d) + 12);
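            /* Store 128 bytes (eight XMM registers) per iteration */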
            save128Aligned ((__m128i*)(d), xmmDef);
            save128Aligned ((__m128i*)(d+16), xmmDef);
            save128Aligned ((__m128i*)(d+32), xmmDef);
            save128Aligned ((__m128i*)(d+48), xmmDef);
            save128Aligned ((__m128i*)(d+64), xmmDef);
            save128Aligned ((__m128i*)(d+80), xmmDef);
            save128Aligned ((__m128i*)(d+96), xmmDef);
            save128Aligned ((__m128i*)(d+112), xmmDef);

            cachePrefetch (((__m128i*)d) + 8);

            save128Aligned ((__m128i*)(d), xmmDef);
            save128Aligned ((__m128i*)(d+16), xmmDef);
            save128Aligned ((__m128i*)(d+32), xmmDef);
            save128Aligned ((__m128i*)(d+48), xmmDef);

        cachePrefetchNext ((__m128i*)d);

            save128Aligned ((__m128i*)(d), xmmDef);
            save128Aligned ((__m128i*)(d+16), xmmDef);

            save128Aligned ((__m128i*)(d), xmmDef);

        cachePrefetchNext ((__m128i*)d);

            *(uint32_t *)d = data;

        *(uint16_t *)d = data;

/* -------------------------------------------------------------------------------------------------
 * fbCompositeSolidMaskSrc_nx8x8888
 */
static void
fbCompositeSolidMaskSrc_nx8x8888sse2 (pixman_implementation_t *imp, pixman_op_t op,
                                      pixman_image_t *pSrc, pixman_image_t *pMask, pixman_image_t *pDst,
                                      int16_t xSrc, int16_t ySrc, int16_t xMask, int16_t yMask,
                                      int16_t xDst, int16_t yDst, uint16_t width, uint16_t height)
    uint32_t *dstLine, *dst;
    uint8_t *maskLine, *mask;
    int dstStride, maskStride;

    __m128i xmmSrc, xmmDef;
    __m128i xmmMask, xmmMaskLo, xmmMaskHi;

    src = _pixman_image_get_solid (pSrc, pDst->bits.format);
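    /* A completely transparent solid source under OP_SRC just clears the
     * destination, so the whole area can be filled with zero. */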
        pixmanFillsse2 (pDst->bits.bits, pDst->bits.rowstride,
                        PIXMAN_FORMAT_BPP (pDst->bits.format),
                        xDst, yDst, width, height, 0);

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);

    xmmDef = createMask_2x32_128 (src, src);
    xmmSrc = expandPixel_32_1x128 (src);

    dstLine += dstStride;
    maskLine += maskStride;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)mask);
    cachePrefetch ((__m128i*)dst);

    while (w && (unsigned long)dst & 15)
        uint8_t m = *mask++;

        *dst = pack_1x64_32 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmSrc), expandPixel_8_1x64 (m)));

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)mask);
    cachePrefetch ((__m128i*)dst);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)mask);
        cachePrefetchNext ((__m128i*)dst);

        m = *((uint32_t*)mask);

        if (srca == 0xff && m == 0xffffffff)
            save128Aligned ((__m128i*)dst, xmmDef);

            xmmMask = unpack_32_1x128 (m);
            xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128 ());

            unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);

            expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

            pixMultiply_2x128 (&xmmSrc, &xmmSrc, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

            save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmMaskLo, xmmMaskHi));

            save128Aligned ((__m128i*)dst, _mm_setzero_si128 ());

        uint8_t m = *mask++;

        *dst = pack_1x64_32 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmSrc), expandPixel_8_1x64 (m)));

/* -------------------------------------------------------------------------------------------------
 * fbCompositeSolidMask_nx8x0565
 */
static void
fbCompositeSolidMask_nx8x0565sse2 (pixman_implementation_t *imp, pixman_op_t op,
                                   pixman_image_t *pSrc, pixman_image_t *pMask, pixman_image_t *pDst,
                                   int16_t xSrc, int16_t ySrc, int16_t xMask, int16_t yMask,
                                   int16_t xDst, int16_t yDst, uint16_t width, uint16_t height)
    uint16_t *dstLine, *dst, d;
    uint8_t *maskLine, *mask;
    int dstStride, maskStride;

    __m64 mmxSrc, mmxAlpha, mmxMask, mmxDest;

    __m128i xmmSrc, xmmAlpha;
    __m128i xmmMask, xmmMaskLo, xmmMaskHi;
    __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;

    src = _pixman_image_get_solid (pSrc, pDst->bits.format);

    fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
    fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);

    xmmSrc = expandPixel_32_1x128 (src);
    xmmAlpha = expandAlpha_1x128 (xmmSrc);
    mmxSrc = _mm_movepi64_pi64 (xmmSrc);
    mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);

    dstLine += dstStride;
    maskLine += maskStride;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)mask);
    cachePrefetch ((__m128i*)dst);

    while (w && (unsigned long)dst & 15)
        mmxMask = expandAlphaRev_1x64 (unpack_32_1x64 (m));
        mmxDest = expand565_16_1x64 (d);

        *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,
                                                         &mmxAlpha,
                                                         &mmxMask,
                                                         &mmxDest)));

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)mask);
    cachePrefetch ((__m128i*)dst);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)mask);
        cachePrefetchNext ((__m128i*)dst);

        xmmDst = load128Aligned ((__m128i*) dst);
        unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);

        m = *((uint32_t*)mask);

            xmmMask = unpack_32_1x128 (m);
            xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128 ());

            unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);

            expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
            inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst0, &xmmDst1);

        m = *((uint32_t*)mask);

            xmmMask = unpack_32_1x128 (m);
            xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128 ());

            unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);

            expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
            inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst2, &xmmDst3);

        save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));

        mmxMask = expandAlphaRev_1x64 (unpack_32_1x64 (m));
        mmxDest = expand565_16_1x64 (d);

        *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,
                                                         &mmxAlpha,
                                                         &mmxMask,
                                                         &mmxDest)));

/* -------------------------------------------------------------------------------------------------
 * fbCompositeSrc_8888RevNPx0565
 */
static void
fbCompositeSrc_8888RevNPx0565sse2 (pixman_implementation_t *imp, pixman_op_t op,
                                   pixman_image_t *pSrc, pixman_image_t *pMask, pixman_image_t *pDst,
                                   int16_t xSrc, int16_t ySrc, int16_t xMask, int16_t yMask,
                                   int16_t xDst, int16_t yDst, uint16_t width, uint16_t height)
    uint16_t *dstLine, *dst, d;
    uint32_t *srcLine, *src, s;
    int dstStride, srcStride;

    uint32_t opaque, zero;

    __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
    __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;

    fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

    /* FIXME: this code was copied from the MMX version, FIXME included.
     * If it's a problem there, it's probably a problem here too.
     */
    assert (pSrc->pDrawable == pMask->pDrawable);

    dstLine += dstStride;
    srcLine += srcStride;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)src);
    cachePrefetch ((__m128i*)dst);

    while (w && (unsigned long)dst & 15)
        ms = unpack_32_1x64 (s);

        *dst++ = pack565_32_16 (pack_1x64_32 (overRevNonPre_1x64 (ms, expand565_16_1x64 (d))));

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)src);
    cachePrefetch ((__m128i*)dst);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)src);
        cachePrefetchNext ((__m128i*)dst);

        xmmSrc = load128Unaligned ((__m128i*)src);
        xmmDst = load128Aligned ((__m128i*)dst);

        opaque = isOpaque (xmmSrc);
        zero = isZero (xmmSrc);
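        /* isOpaque/isZero test all four source pixels at once: a fully opaque
         * group can skip the blend (only the channel swap is needed), and a
         * fully transparent group leaves the destination untouched. */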
        unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
        unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);

        /* preload next round */
        xmmSrc = load128Unaligned ((__m128i*)(src+4));

            invertColors_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst0, &xmmDst1);

            overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst0, &xmmDst1);

        opaque = isOpaque (xmmSrc);
        zero = isZero (xmmSrc);

        unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);

            invertColors_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst2, &xmmDst3);

            overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst2, &xmmDst3);

        save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));

        ms = unpack_32_1x64 (s);

        *dst++ = pack565_32_16 (pack_1x64_32 (overRevNonPre_1x64 (ms, expand565_16_1x64 (d))));

/* "8888RevNP" is GdkPixbuf's format: ABGR, non-premultiplied */

/* -------------------------------------------------------------------------------------------------
 * fbCompositeSrc_8888RevNPx8888
 */
static void
fbCompositeSrc_8888RevNPx8888sse2 (pixman_implementation_t *imp, pixman_op_t op,
                                   pixman_image_t *pSrc, pixman_image_t *pMask, pixman_image_t *pDst,
                                   int16_t xSrc, int16_t ySrc, int16_t xMask, int16_t yMask,
                                   int16_t xDst, int16_t yDst, uint16_t width, uint16_t height)
    uint32_t *dstLine, *dst, d;
    uint32_t *srcLine, *src, s;
    int dstStride, srcStride;

    uint32_t opaque, zero;

    __m128i xmmSrcLo, xmmSrcHi;
    __m128i xmmDstLo, xmmDstHi;

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

    /* FIXME: this code was copied from the MMX version, FIXME included.
     * If it's a problem there, it's probably a problem here too.
     */
    assert (pSrc->pDrawable == pMask->pDrawable);

    dstLine += dstStride;
    srcLine += srcStride;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)src);
    cachePrefetch ((__m128i*)dst);

    while (w && (unsigned long)dst & 15)
        *dst++ = pack_1x64_32 (overRevNonPre_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)src);
    cachePrefetch ((__m128i*)dst);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)src);
        cachePrefetchNext ((__m128i*)dst);

        xmmSrcHi = load128Unaligned ((__m128i*)src);

        opaque = isOpaque (xmmSrcHi);
        zero = isZero (xmmSrcHi);

        unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi);

            invertColors_2x128 (xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi);

            save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

            xmmDstHi = load128Aligned ((__m128i*)dst);

            unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi);

            overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi);

            save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

        *dst++ = pack_1x64_32 (overRevNonPre_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));

/* -------------------------------------------------------------------------------------------------
 * fbCompositeSolidMask_nx8888x0565C
 */
static void
fbCompositeSolidMask_nx8888x0565Csse2 (pixman_implementation_t *imp, pixman_op_t op,
                                       pixman_image_t *pSrc, pixman_image_t *pMask, pixman_image_t *pDst,
                                       int16_t xSrc, int16_t ySrc, int16_t xMask, int16_t yMask,
                                       int16_t xDst, int16_t yDst, uint16_t width, uint16_t height)
    uint16_t *dstLine, *dst, d;
    uint32_t *maskLine, *mask, m;
    int dstStride, maskStride;

    __m128i xmmSrc, xmmAlpha;
    __m128i xmmMask, xmmMaskLo, xmmMaskHi;
    __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3;

    __m64 mmxSrc, mmxAlpha, mmxMask, mmxDest;

    src = _pixman_image_get_solid (pSrc, pDst->bits.format);

    fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1);
    fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1);

    xmmSrc = expandPixel_32_1x128 (src);
    xmmAlpha = expandAlpha_1x128 (xmmSrc);
    mmxSrc = _mm_movepi64_pi64 (xmmSrc);
    mmxAlpha = _mm_movepi64_pi64 (xmmAlpha);

    maskLine += maskStride;
    dstLine += dstStride;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)mask);
    cachePrefetch ((__m128i*)dst);

    while (w && ((unsigned long)dst & 15))
        m = *(uint32_t *) mask;

        mmxMask = unpack_32_1x64 (m);
        mmxDest = expand565_16_1x64 (d);

        *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,
                                                         &mmxAlpha,
                                                         &mmxMask,
                                                         &mmxDest)));

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)mask);
    cachePrefetch ((__m128i*)dst);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)mask);
        cachePrefetchNext ((__m128i*)dst);

        xmmMask = load128Unaligned ((__m128i*)mask);
        xmmDst = load128Aligned ((__m128i*)dst);

        packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128 ()));
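        /* packCmp is 0xffff when all four mask pixels are zero; such groups
         * leave the destination untouched. */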
        unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3);
        unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);

        /* preload next round */
        xmmMask = load128Unaligned ((__m128i*)(mask+4));

        if (packCmp != 0xffff)
            inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst0, &xmmDst1);

        packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128 ()));

        unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);

        if (packCmp != 0xffff)
            inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst2, &xmmDst3);

        save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3));

        m = *(uint32_t *) mask;

        mmxMask = unpack_32_1x64 (m);
        mmxDest = expand565_16_1x64 (d);

        *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc,
                                                         &mmxAlpha,
                                                         &mmxMask,
                                                         &mmxDest)));

/* -------------------------------------------------------------------------------------------------
 * fbCompositeIn_nx8x8
 */
static void
fbCompositeIn_nx8x8sse2 (pixman_implementation_t *imp, pixman_op_t op,
                         pixman_image_t *pSrc, pixman_image_t *pMask, pixman_image_t *pDst,
                         int16_t xSrc, int16_t ySrc, int16_t xMask, int16_t yMask,
                         int16_t xDst, int16_t yDst, uint16_t width, uint16_t height)
    uint8_t *dstLine, *dst;
    uint8_t *maskLine, *mask;
    int dstStride, maskStride;

    __m128i xmmMask, xmmMaskLo, xmmMaskHi;
    __m128i xmmDst, xmmDstLo, xmmDstHi;

    fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
    fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);

    src = _pixman_image_get_solid (pSrc, pDst->bits.format);

    xmmAlpha = expandAlpha_1x128 (expandPixel_32_1x128 (src));

    dstLine += dstStride;
    maskLine += maskStride;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)mask);
    cachePrefetch ((__m128i*)dst);

    while (w && ((unsigned long)dst & 15))
        m = (uint32_t) *mask++;
        d = (uint32_t) *dst;

        *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
                                                           unpack_32_1x64 (d)));

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)mask);
    cachePrefetch ((__m128i*)dst);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)mask);
        cachePrefetchNext ((__m128i*)dst);

        xmmMask = load128Unaligned ((__m128i*)mask);
        xmmDst = load128Aligned ((__m128i*)dst);

        unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
        unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

        pixMultiply_2x128 (&xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);
        pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

        m = (uint32_t) *mask++;
        d = (uint32_t) *dst;

        *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
                                                           unpack_32_1x64 (d)));

/* -------------------------------------------------------------------------------------------------
 * fbCompositeIn_8x8
 */
static void
fbCompositeIn_8x8sse2 (pixman_implementation_t *imp, pixman_op_t op,
                       pixman_image_t *pSrc, pixman_image_t *pMask, pixman_image_t *pDst,
                       int16_t xSrc, int16_t ySrc, int16_t xMask, int16_t yMask,
                       int16_t xDst, int16_t yDst, uint16_t width, uint16_t height)
    uint8_t *dstLine, *dst;
    uint8_t *srcLine, *src;
    int srcStride, dstStride;

    __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
    __m128i xmmDst, xmmDstLo, xmmDstHi;

    fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);

    dstLine += dstStride;
    srcLine += srcStride;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)src);
    cachePrefetch ((__m128i*)dst);

    while (w && ((unsigned long)dst & 15))
        s = (uint32_t) *src++;
        d = (uint32_t) *dst;

        *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)src);
    cachePrefetch ((__m128i*)dst);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)src);
        cachePrefetchNext ((__m128i*)dst);

        xmmSrc = load128Unaligned ((__m128i*)src);
        xmmDst = load128Aligned ((__m128i*)dst);

        unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
        unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

        pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi);

        save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

        s = (uint32_t) *src++;
        d = (uint32_t) *dst;

        *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d)));

/* -------------------------------------------------------------------------------------------------
 * fbCompositeSrcAdd_8888x8x8
 */
static void
fbCompositeSrcAdd_8888x8x8sse2 (pixman_implementation_t *imp, pixman_op_t op,
                                pixman_image_t *pSrc, pixman_image_t *pMask, pixman_image_t *pDst,
                                int16_t xSrc, int16_t ySrc, int16_t xMask, int16_t yMask,
                                int16_t xDst, int16_t yDst, uint16_t width, uint16_t height)
    uint8_t *dstLine, *dst;
    uint8_t *maskLine, *mask;
    int dstStride, maskStride;

    __m128i xmmMask, xmmMaskLo, xmmMaskHi;
    __m128i xmmDst, xmmDstLo, xmmDstHi;

    fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);
    fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);

    src = _pixman_image_get_solid (pSrc, pDst->bits.format);

    xmmAlpha = expandAlpha_1x128 (expandPixel_32_1x128 (src));

    dstLine += dstStride;
    maskLine += maskStride;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)mask);
    cachePrefetch ((__m128i*)dst);

    while (w && ((unsigned long)dst & 15))
        m = (uint32_t) *mask++;
        d = (uint32_t) *dst;

        *dst++ = (uint8_t) pack_1x64_32 (_mm_adds_pu16 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
                                                        unpack_32_1x64 (d)));

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)mask);
    cachePrefetch ((__m128i*)dst);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)mask);
        cachePrefetchNext ((__m128i*)dst);

        xmmMask = load128Unaligned ((__m128i*)mask);
        xmmDst = load128Aligned ((__m128i*)dst);

        unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
        unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

        pixMultiply_2x128 (&xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

        xmmDstLo = _mm_adds_epu16 (xmmMaskLo, xmmDstLo);
        xmmDstHi = _mm_adds_epu16 (xmmMaskHi, xmmDstHi);

        save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

        m = (uint32_t) *mask++;
        d = (uint32_t) *dst;

        *dst++ = (uint8_t) pack_1x64_32 (_mm_adds_pu16 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)),
                                                        unpack_32_1x64 (d)));

/* -------------------------------------------------------------------------------------------------
 * fbCompositeSrcAdd_8000x8000
 */
static void
fbCompositeSrcAdd_8000x8000sse2 (pixman_implementation_t *imp, pixman_op_t op,
                                 pixman_image_t *pSrc, pixman_image_t *pMask, pixman_image_t *pDst,
                                 int16_t xSrc, int16_t ySrc, int16_t xMask, int16_t yMask,
                                 int16_t xDst, int16_t yDst, uint16_t width, uint16_t height)
    uint8_t *dstLine, *dst;
    uint8_t *srcLine, *src;
    int dstStride, srcStride;

    fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1);
    fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1);

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)src);
    cachePrefetch ((__m128i*)dst);

    dstLine += dstStride;
    srcLine += srcStride;

    while (w && (unsigned long)dst & 3)
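        /* Branchless saturating add: t is a 9-bit sum, so (t >> 8) is 1
         * exactly when it overflows 8 bits, and (0 - (t >> 8)) is then 0xff,
         * clamping the stored value to 255. */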
        t = (*dst) + (*src++);
        *dst++ = t | (0 - (t >> 8));

    coreCombineAddUsse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);

        t = (*dst) + (*src++);
        *dst++ = t | (0 - (t >> 8));

/* -------------------------------------------------------------------------------------------------
 * fbCompositeSrcAdd_8888x8888
 */
static void
fbCompositeSrcAdd_8888x8888sse2 (pixman_implementation_t *imp, pixman_op_t op,
                                 pixman_image_t *pSrc, pixman_image_t *pMask, pixman_image_t *pDst,
                                 int16_t xSrc, int16_t ySrc, int16_t xMask, int16_t yMask,
                                 int16_t xDst, int16_t yDst, uint16_t width, uint16_t height)
    uint32_t *dstLine, *dst;
    uint32_t *srcLine, *src;
    int dstStride, srcStride;

    fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);
    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);

    dstLine += dstStride;
    srcLine += srcStride;

    coreCombineAddUsse2 (dst, src, NULL, width);

/* -------------------------------------------------------------------------------------------------
 * fbCompositeCopyAreasse2
 */
static pixman_bool_t
pixmanBltsse2 (uint32_t *src_bits, uint32_t *dst_bits,
               int src_stride, int dst_stride,
               int src_bpp, int dst_bpp,
               int src_x, int src_y,
               int dst_x, int dst_y,
               int width, int height)
    uint8_t * src_bytes;
    uint8_t * dst_bytes;

    if (src_bpp != dst_bpp)
        return FALSE;

    if (src_bpp == 16)
    {
        src_stride = src_stride * (int) sizeof (uint32_t) / 2;
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
        src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
        dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
        byte_width = 2 * width;
    }
    else if (src_bpp == 32)
    {
        src_stride = src_stride * (int) sizeof (uint32_t) / 4;
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
        src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
        dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
        byte_width = 4 * width;
    }
    else
        return FALSE;

    cachePrefetch ((__m128i*)src_bytes);
    cachePrefetch ((__m128i*)dst_bytes);

        uint8_t *s = src_bytes;
        uint8_t *d = dst_bytes;
        src_bytes += src_stride;
        dst_bytes += dst_stride;

        cachePrefetchNext ((__m128i*)s);
        cachePrefetchNext ((__m128i*)d);

        while (w >= 2 && ((unsigned long)d & 3))
            *(uint16_t *)d = *(uint16_t *)s;

        while (w >= 4 && ((unsigned long)d & 15))
            *(uint32_t *)d = *(uint32_t *)s;

        cachePrefetchNext ((__m128i*)s);
        cachePrefetchNext ((__m128i*)d);
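        /* Copy 64 bytes (four XMM registers) per iteration; the source may
         * be unaligned, the destination was aligned by the loops above. */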
            __m128i xmm0, xmm1, xmm2, xmm3;

            /* 128 bytes ahead */
            cachePrefetch (((__m128i*)s) + 8);
            cachePrefetch (((__m128i*)d) + 8);

            xmm0 = load128Unaligned ((__m128i*)(s));
            xmm1 = load128Unaligned ((__m128i*)(s+16));
            xmm2 = load128Unaligned ((__m128i*)(s+32));
            xmm3 = load128Unaligned ((__m128i*)(s+48));

            save128Aligned ((__m128i*)(d), xmm0);
            save128Aligned ((__m128i*)(d+16), xmm1);
            save128Aligned ((__m128i*)(d+32), xmm2);
            save128Aligned ((__m128i*)(d+48), xmm3);

        cachePrefetchNext ((__m128i*)s);
        cachePrefetchNext ((__m128i*)d);

            save128Aligned ((__m128i*)d, load128Unaligned ((__m128i*)s));

        cachePrefetchNext ((__m128i*)s);
        cachePrefetchNext ((__m128i*)d);

            *(uint32_t *)d = *(uint32_t *)s;

        *(uint16_t *)d = *(uint16_t *)s;

static void
fbCompositeCopyAreasse2 (pixman_implementation_t *imp, pixman_op_t op,
                         pixman_image_t *pSrc, pixman_image_t *pMask, pixman_image_t *pDst,
                         int16_t xSrc, int16_t ySrc, int16_t xMask, int16_t yMask,
                         int16_t xDst, int16_t yDst, uint16_t width, uint16_t height)
    pixmanBltsse2 (pSrc->bits.bits,
                   pDst->bits.bits,
                   pSrc->bits.rowstride,
                   pDst->bits.rowstride,
                   PIXMAN_FORMAT_BPP (pSrc->bits.format),
                   PIXMAN_FORMAT_BPP (pDst->bits.format),
                   xSrc, ySrc, xDst, yDst, width, height);

/* FIXME: this code is buggy in the MMX version, and the bug was carried over into this SSE2 version */
static void
fbCompositeOver_x888x8x8888sse2 (pixman_implementation_t *imp, pixman_op_t op,
                                 pixman_image_t *pSrc, pixman_image_t *pMask, pixman_image_t *pDst,
                                 int16_t xSrc, int16_t ySrc, int16_t xMask, int16_t yMask,
                                 int16_t xDst, int16_t yDst, uint16_t width, uint16_t height)
    uint32_t *src, *srcLine, s;
    uint32_t *dst, *dstLine, d;
    uint8_t *mask, *maskLine;

    int srcStride, maskStride, dstStride;

    __m128i xmmSrc, xmmSrcLo, xmmSrcHi;
    __m128i xmmDst, xmmDstLo, xmmDstHi;
    __m128i xmmMask, xmmMaskLo, xmmMaskHi;

    fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1);
    fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1);

    srcLine += srcStride;
    dstLine += dstStride;
    maskLine += maskStride;

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)src);
    cachePrefetch ((__m128i*)dst);
    cachePrefetch ((__m128i*)mask);

    while (w && (unsigned long)dst & 15)
        s = 0xff000000 | *src++;
        m = (uint32_t) *mask++;

        __m64 ms = unpack_32_1x64 (s);

        ms = inOver_1x64 (ms,
                          expandAlphaRev_1x64 (unpack_32_1x64 (m)),
                          unpack_32_1x64 (d));

        *dst++ = pack_1x64_32 (ms);

    /* call prefetch hint to optimize cache load */
    cachePrefetch ((__m128i*)src);
    cachePrefetch ((__m128i*)dst);
    cachePrefetch ((__m128i*)mask);

        /* fill cache line with next memory */
        cachePrefetchNext ((__m128i*)src);
        cachePrefetchNext ((__m128i*)dst);
        cachePrefetchNext ((__m128i*)mask);

        m = *(uint32_t*) mask;
        xmmSrc = _mm_or_si128 (load128Unaligned ((__m128i*)src), Maskff000000);

        if (m == 0xffffffff)
            save128Aligned ((__m128i*)dst, xmmSrc);

            xmmDst = load128Aligned ((__m128i*)dst);

            xmmMask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128 ());

            unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi);
            unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi);
            unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi);

            expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi);

            inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &Mask00ff, &Mask00ff, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi);

            save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi));

        m = (uint32_t) *mask++;

            s = 0xff000000 | *src;

            *dst = pack_1x64_32 (inOver_1x64 (unpack_32_1x64 (s),
                                              expandAlphaRev_1x64 (unpack_32_1x64 (m)),
                                              unpack_32_1x64 (d)));
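/* The fast-path table: operator, source format, mask format, destination
 * format, the composite function handling that combination, and flags.
 * sse2_composite () below scans this table via _pixman_run_fast_path ()
 * before falling back to the delegate implementation. */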
static const pixman_fast_path_t sse2_fast_paths[] =
{
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, fbCompositeSolidMask_nx8x0565sse2, 0 },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, fbCompositeSolidMask_nx8x0565sse2, 0 },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_a8r8g8b8, fbCompositeSolid_nx8888sse2, 0 },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_x8r8g8b8, fbCompositeSolid_nx8888sse2, 0 },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSolid_nx0565sse2, 0 },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8888sse2, 0 },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8888sse2, 0 },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, fbCompositeSrc_8888x8888sse2, 0 },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, fbCompositeSrc_8888x8888sse2, 0 },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSrc_8888x0565sse2, 0 },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, fbCompositeSrc_8888x0565sse2, 0 },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSolidMask_nx8x8888sse2, 0 },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSolidMask_nx8x8888sse2, 0 },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, fbCompositeSolidMask_nx8x8888sse2, 0 },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, fbCompositeSolidMask_nx8x8888sse2, 0 },

    /* FIXME: this code is buggy in the MMX version, and the bug was carried over into this SSE2 version */
    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeOver_x888x8x8888sse2, 0 },
    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeOver_x888x8x8888sse2, 0 },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, fbCompositeOver_x888x8x8888sse2, 0 },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeOver_x888x8x8888sse2, 0 },

    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSrc_x888xnx8888sse2, NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSrc_x888xnx8888sse2, NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, fbCompositeSrc_x888xnx8888sse2, NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, fbCompositeSrc_x888xnx8888sse2, NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8x8888sse2, NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8x8888sse2, NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, fbCompositeSrc_8888x8x8888sse2, NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, fbCompositeSrc_8888x8x8888sse2, NEED_SOLID_MASK },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, fbCompositeSolidMask_nx8888x8888Csse2, NEED_COMPONENT_ALPHA },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, fbCompositeSolidMask_nx8888x8888Csse2, NEED_COMPONENT_ALPHA },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, fbCompositeSolidMask_nx8888x8888Csse2, NEED_COMPONENT_ALPHA },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, fbCompositeSolidMask_nx8888x8888Csse2, NEED_COMPONENT_ALPHA },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, fbCompositeSolidMask_nx8888x0565Csse2, NEED_COMPONENT_ALPHA },
    { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, fbCompositeSolidMask_nx8888x0565Csse2, NEED_COMPONENT_ALPHA },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, fbCompositeSrc_8888RevNPx8888sse2, NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_a8r8g8b8, fbCompositeSrc_8888RevNPx8888sse2, NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, fbCompositeSrc_8888RevNPx8888sse2, NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_x8r8g8b8, fbCompositeSrc_8888RevNPx8888sse2, NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_a8b8g8r8, fbCompositeSrc_8888RevNPx8888sse2, NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, fbCompositeSrc_8888RevNPx8888sse2, NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_x8b8g8r8, fbCompositeSrc_8888RevNPx8888sse2, NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, fbCompositeSrc_8888RevNPx8888sse2, NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, fbCompositeSrc_8888RevNPx0565sse2, NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_r5g6b5, fbCompositeSrc_8888RevNPx0565sse2, NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_b5g6r5, fbCompositeSrc_8888RevNPx0565sse2, NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, fbCompositeSrc_8888RevNPx0565sse2, NEED_PIXBUF },
    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, fbCompositeCopyAreasse2, 0 },
    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, fbCompositeCopyAreasse2, 0 },

    { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, fbCompositeSrcAdd_8000x8000sse2, 0 },
    { PIXMAN_OP_ADD, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, fbCompositeSrcAdd_8888x8888sse2, 0 },
    { PIXMAN_OP_ADD, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, fbCompositeSrcAdd_8888x8888sse2, 0 },
    { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, fbCompositeSrcAdd_8888x8x8sse2, 0 },

    { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSolidMaskSrc_nx8x8888sse2, 0 },
    { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSolidMaskSrc_nx8x8888sse2, 0 },
    { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, fbCompositeSolidMaskSrc_nx8x8888sse2, 0 },
    { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, fbCompositeSolidMaskSrc_nx8x8888sse2, 0 },
    { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, fbCompositeCopyAreasse2, 0 },
    { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, fbCompositeCopyAreasse2, 0 },
    { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, fbCompositeCopyAreasse2, 0 },
    { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, fbCompositeCopyAreasse2, 0 },
    { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, fbCompositeCopyAreasse2, 0 },
    { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, fbCompositeCopyAreasse2, 0 },
    { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeCopyAreasse2, 0 },
    { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, fbCompositeCopyAreasse2, 0 },

    { PIXMAN_OP_IN, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, fbCompositeIn_8x8sse2, 0 },
    { PIXMAN_OP_IN, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, fbCompositeIn_nx8x8sse2, 0 },
};
/*
 * Work around GCC bug causing crashes in Mozilla with SSE2
 *
 * When using -msse, gcc generates movdqa instructions assuming that
 * the stack is 16 byte aligned. Unfortunately some applications, such
 * as Mozilla and Mono, end up aligning the stack to 4 bytes, which
 * causes the movdqa instructions to fail.
 *
 * The __force_align_arg_pointer__ makes gcc generate a prologue that
 * realigns the stack pointer to 16 bytes.
 *
 * On x86-64 this is not necessary because the standard ABI already
 * calls for a 16 byte aligned stack.
 *
 * See https://bugs.freedesktop.org/show_bug.cgi?id=15693
 */
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
static void
sse2_composite (pixman_implementation_t *imp, pixman_op_t op,
                pixman_image_t *src, pixman_image_t *mask, pixman_image_t *dest,
                int32_t src_x, int32_t src_y, int32_t mask_x, int32_t mask_y,
                int32_t dest_x, int32_t dest_y, int32_t width, int32_t height)
{
    if (_pixman_run_fast_path (sse2_fast_paths, imp,
                               op, src, mask, dest,
                               src_x, src_y, mask_x, mask_y,
                               dest_x, dest_y, width, height))
    {
        return;
    }

    _pixman_implementation_composite (imp->delegate, op,
                                      src, mask, dest,
                                      src_x, src_y, mask_x, mask_y,
                                      dest_x, dest_y, width, height);
}

#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
static pixman_bool_t
sse2_blt (pixman_implementation_t *imp,
          uint32_t *src_bits, uint32_t *dst_bits,
          int src_stride, int dst_stride,
          int src_bpp, int dst_bpp,
          int src_x, int src_y,
          int dst_x, int dst_y,
          int width, int height)
{
    if (!pixmanBltsse2 (
            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
            src_x, src_y, dst_x, dst_y, width, height))
    {
        return _pixman_implementation_blt (
            imp->delegate,
            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
            src_x, src_y, dst_x, dst_y, width, height);
    }

    return TRUE;
}

#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
static pixman_bool_t
sse2_fill (pixman_implementation_t *imp,
           uint32_t *bits, int stride, int bpp,
           int x, int y, int width, int height, uint32_t xor)
{
    if (!pixmanFillsse2 (bits, stride, bpp, x, y, width, height, xor))
    {
        return _pixman_implementation_fill (
            imp->delegate, bits, stride, bpp, x, y, width, height, xor);
    }

    return TRUE;
}

pixman_implementation_t *
_pixman_implementation_create_sse2 (void)
{
    pixman_implementation_t *mmx = _pixman_implementation_create_mmx ();
    pixman_implementation_t *imp = _pixman_implementation_create (mmx);

    /* SSE2 constants */
    Mask565r = createMask_2x32_128 (0x00f80000, 0x00f80000);
    Mask565g1 = createMask_2x32_128 (0x00070000, 0x00070000);
    Mask565g2 = createMask_2x32_128 (0x000000e0, 0x000000e0);
    Mask565b = createMask_2x32_128 (0x0000001f, 0x0000001f);
    MaskRed = createMask_2x32_128 (0x00f80000, 0x00f80000);
    MaskGreen = createMask_2x32_128 (0x0000fc00, 0x0000fc00);
    MaskBlue = createMask_2x32_128 (0x000000f8, 0x000000f8);
    Mask565FixRB = createMask_2x32_128 (0x00e000e0, 0x00e000e0);
    Mask565FixG = createMask_2x32_128 (0x0000c000, 0x0000c000);
    Mask0080 = createMask_16_128 (0x0080);
    Mask00ff = createMask_16_128 (0x00ff);
    Mask0101 = createMask_16_128 (0x0101);
    Maskffff = createMask_16_128 (0xffff);
    Maskff000000 = createMask_2x32_128 (0xff000000, 0xff000000);
    MaskAlpha = createMask_2x32_128 (0x00ff0000, 0x00000000);
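    /* MMX constants */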
    xMask565rgb = createMask_2x32_64 (0x000001f0, 0x003f001f);
    xMask565Unpack = createMask_2x32_64 (0x00000084, 0x04100840);

    xMask0080 = createMask_16_64 (0x0080);
    xMask00ff = createMask_16_64 (0x00ff);
    xMask0101 = createMask_16_64 (0x0101);
    xMaskAlpha = createMask_2x32_64 (0x00ff0000, 0x00000000);

    /* Set up function pointers */

    /* SSE code patch for fbcompose.c */
    imp->combine_32[PIXMAN_OP_OVER] = sse2CombineOverU;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2CombineOverReverseU;
    imp->combine_32[PIXMAN_OP_IN] = sse2CombineInU;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2CombineInReverseU;
    imp->combine_32[PIXMAN_OP_OUT] = sse2CombineOutU;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2CombineOutReverseU;
    imp->combine_32[PIXMAN_OP_ATOP] = sse2CombineAtopU;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2CombineAtopReverseU;
    imp->combine_32[PIXMAN_OP_XOR] = sse2CombineXorU;
    imp->combine_32[PIXMAN_OP_ADD] = sse2CombineAddU;

    imp->combine_32[PIXMAN_OP_SATURATE] = sse2CombineSaturateU;

    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2CombineSrcC;
    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2CombineOverC;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2CombineOverReverseC;
    imp->combine_32_ca[PIXMAN_OP_IN] = sse2CombineInC;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2CombineInReverseC;
    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2CombineOutC;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2CombineOutReverseC;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2CombineAtopC;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2CombineAtopReverseC;
    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2CombineXorC;
    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2CombineAddC;

    imp->composite = sse2_composite;
    imp->blt = sse2_blt;
    imp->fill = sse2_fill;

    return imp;
}

#endif /* USE_SSE2 */