Replace 'i < len-1 && func(i+1)' by 'i+1 < len && func(i+1)'
[profile/ivi/qtbase.git] / src / gui / painting / qdrawhelper_ssse3.cpp
1 /****************************************************************************
2 **
3 ** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
4 ** All rights reserved.
5 ** Contact: Nokia Corporation (qt-info@nokia.com)
6 **
7 ** This file is part of the QtGui module of the Qt Toolkit.
8 **
9 ** $QT_BEGIN_LICENSE:LGPL$
10 ** GNU Lesser General Public License Usage
11 ** This file may be used under the terms of the GNU Lesser General Public
12 ** License version 2.1 as published by the Free Software Foundation and
13 ** appearing in the file LICENSE.LGPL included in the packaging of this
14 ** file. Please review the following information to ensure the GNU Lesser
15 ** General Public License version 2.1 requirements will be met:
16 ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
17 **
18 ** In addition, as a special exception, Nokia gives you certain additional
19 ** rights. These rights are described in the Nokia Qt LGPL Exception
20 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
21 **
22 ** GNU General Public License Usage
23 ** Alternatively, this file may be used under the terms of the GNU General
24 ** Public License version 3.0 as published by the Free Software Foundation
25 ** and appearing in the file LICENSE.GPL included in the packaging of this
26 ** file. Please review the following information to ensure the GNU General
27 ** Public License version 3.0 requirements will be met:
28 ** http://www.gnu.org/copyleft/gpl.html.
29 **
30 ** Other Usage
31 ** Alternatively, this file may be used in accordance with the terms and
32 ** conditions contained in a signed written agreement between you and Nokia.
33 **
34 **
35 **
36 **
37 **
38 ** $QT_END_LICENSE$
39 **
40 ****************************************************************************/
41
42 #include <private/qdrawhelper_x86_p.h>
43
44 #ifdef QT_HAVE_SSSE3
45
46 #include <private/qdrawingprimitive_sse2_p.h>
47
48 QT_BEGIN_NAMESPACE
49
50 inline static void blend_pixel(quint32 &dst, const quint32 src)
51 {
52     if (src >= 0xff000000)
53         dst = src;
54     else if (src != 0)
55         dst = src + BYTE_MUL(dst, qAlpha(~src));
56 }
57
58
/* The palignr instruction takes its shift as an immediate operand, so the loop has to be
   generated separately for each possible shift (4, 8, 12). Checking the alignment inside
   the loop is unfortunately way too slow.

   This macro is meant to be expanded inside BLEND_SOURCE_OVER_ARGB32_SSSE3 only: besides its
   two parameters it picks up, by name, x, src, dst, minusOffsetToAlignSrcOn16Bytes,
   srcVectorPrevLoaded, alphaShuffleMask, alphaMask, nullVector, one, colorMask and half
   from the place where it is expanded.

   Every iteration performs an aligned load of
   src[x - minusOffsetToAlignSrcOn16Bytes + 4 .. x - minusOffsetToAlignSrcOn16Bytes + 7].
   The loop condition keeps that whole vector below `length`, so no pixel past the end of
   the source line is read. Pixels that are not processed here are left to the scalar
   loop that follows the expansion.
 */
#define BLENDING_LOOP(palignrOffset, length)\
    for (; x - minusOffsetToAlignSrcOn16Bytes < (length) - 7; x += 4) { \
        const __m128i srcVectorLastLoaded = _mm_load_si128((const __m128i *)&src[x - minusOffsetToAlignSrcOn16Bytes + 4]);\
        /* Stitch the four pixels starting at src[x] out of the two aligned vectors. */ \
        const __m128i srcVector = _mm_alignr_epi8(srcVectorLastLoaded, srcVectorPrevLoaded, palignrOffset); \
        const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask); \
        if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) { \
            /* All four pixels are opaque: plain copy. */ \
            _mm_store_si128((__m128i *)&dst[x], srcVector); \
        } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) { \
            /* At least one pixel has a non-zero alpha: result = s + d * (1-alpha). */ \
            __m128i alphaChannel = _mm_shuffle_epi8(srcVector, alphaShuffleMask); \
            alphaChannel = _mm_sub_epi16(one, alphaChannel); \
            const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); \
            __m128i destMultipliedByOneMinusAlpha; \
            BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); \
            const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha); \
            _mm_store_si128((__m128i *)&dst[x], result); \
        } \
        srcVectorPrevLoaded = srcVectorLastLoaded;\
    }
80
81
// Blend one line of `length` pixels of src over dst (no constant alpha is applied here).
// nullVector, half, one, colorMask, alphaMask are constant across the whole image/texture, and should be defined as:
//const __m128i nullVector = _mm_set1_epi32(0);
//const __m128i half = _mm_set1_epi16(0x80);
//const __m128i one = _mm_set1_epi16(0xff);
//const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
//const __m128i alphaMask = _mm_set1_epi32(0xff000000);
//
// The computation being done is:
// result = s + d * (1-alpha)
// with shortcuts if fully opaque or fully transparent.
//
// Steps:
//  1. a scalar prologue (ALIGNMENT_PROLOGUE_16BYTES) until dst is 16-byte aligned;
//  2. if src is then 16-byte aligned as well, an aligned vector loop over 4 pixels at a time;
//     otherwise, when at least 8 pixels remain, BLENDING_LOOP with the palignr shift that
//     matches the misalignment of src;
//  3. a scalar loop for whatever pixels are left.
//
// NOTE: BLENDING_LOOP refers to src, dst, nullVector, half, one, colorMask and alphaMask by
// these exact names, not through this macro's parameters. The arguments passed here must
// therefore be plain identifiers spelled like the parameters.
// NOTE: in the unaligned case the first vector is loaded from
// &src[x - minusOffsetToAlignSrcOn16Bytes], i.e. up to 3 pixels before src[x].
#define BLEND_SOURCE_OVER_ARGB32_SSSE3(dst, src, length, nullVector, half, one, colorMask, alphaMask) { \
    int x = 0; \
\
    /* First, get dst aligned. */ \
    ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) { \
        blend_pixel(dst[x], src[x]); \
    } \
\
    /* Number of 4-byte pixels (0..3) by which &src[x] lies past a 16-byte boundary. */ \
    const int minusOffsetToAlignSrcOn16Bytes = (reinterpret_cast<quintptr>(&(src[x])) >> 2) & 0x3;\
\
    if (!minusOffsetToAlignSrcOn16Bytes) {\
        /* src is aligned, usual algorithm but with aligned operations.\
           See the SSE2 version for more documentation on the algorithm itself. */\
        const __m128i alphaShuffleMask = _mm_set_epi8(0xff,15,0xff,15,0xff,11,0xff,11,0xff,7,0xff,7,0xff,3,0xff,3);\
        for (; x < (length) - 3; x += 4) { \
            const __m128i srcVector = _mm_load_si128((const __m128i *)&src[x]); \
            const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask); \
            if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) { \
                _mm_store_si128((__m128i *)&dst[x], srcVector); \
            } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) { \
                __m128i alphaChannel = _mm_shuffle_epi8(srcVector, alphaShuffleMask); \
                alphaChannel = _mm_sub_epi16(one, alphaChannel); \
                const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); \
                __m128i destMultipliedByOneMinusAlpha; \
                BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); \
                const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha); \
                _mm_store_si128((__m128i *)&dst[x], result); \
            } \
        } /* end for() */\
    } else if (((length) - x) >= 8) {\
        /* We use two vectors to extract the src: prevLoaded for the first pixels, lastLoaded for the current pixels. */\
        __m128i srcVectorPrevLoaded = _mm_load_si128((const __m128i *)&src[x - minusOffsetToAlignSrcOn16Bytes]);\
        /* palignr shifts by bytes: 4 bytes per pixel of misalignment. */\
        const int palignrOffset = minusOffsetToAlignSrcOn16Bytes << 2;\
\
        const __m128i alphaShuffleMask = _mm_set_epi8(0xff,15,0xff,15,0xff,11,0xff,11,0xff,7,0xff,7,0xff,3,0xff,3);\
        switch (palignrOffset) {\
        case 4:\
            BLENDING_LOOP(4, length)\
            break;\
        case 8:\
            BLENDING_LOOP(8, length)\
            break;\
        case 12:\
            BLENDING_LOOP(12, length)\
            break;\
        default:\
            /* Not reachable: the offset is non-zero and masked with 0x3 above. */\
            break;\
        }\
    }\
    /* Scalar epilogue for the pixels the vector loops did not cover. */\
    for (; x < (length); ++x) \
        blend_pixel(dst[x], src[x]); \
}
143
144 void qt_blend_argb32_on_argb32_ssse3(uchar *destPixels, int dbpl,
145                                      const uchar *srcPixels, int sbpl,
146                                      int w, int h,
147                                      int const_alpha)
148 {
149     const quint32 *src = (const quint32 *) srcPixels;
150     quint32 *dst = (quint32 *) destPixels;
151     if (const_alpha == 256) {
152         const __m128i alphaMask = _mm_set1_epi32(0xff000000);
153         const __m128i nullVector = _mm_setzero_si128();
154         const __m128i half = _mm_set1_epi16(0x80);
155         const __m128i one = _mm_set1_epi16(0xff);
156         const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
157
158         for (int y = 0; y < h; ++y) {
159             BLEND_SOURCE_OVER_ARGB32_SSSE3(dst, src, w, nullVector, half, one, colorMask, alphaMask);
160             dst = (quint32 *)(((uchar *) dst) + dbpl);
161             src = (const quint32 *)(((const uchar *) src) + sbpl);
162         }
163     } else if (const_alpha != 0) {
164         // dest = (s + d * sia) * ca + d * cia
165         //      = s * ca + d * (sia * ca + cia)
166         //      = s * ca + d * (1 - sa*ca)
167         const_alpha = (const_alpha * 255) >> 8;
168         const __m128i nullVector = _mm_setzero_si128();
169         const __m128i half = _mm_set1_epi16(0x80);
170         const __m128i one = _mm_set1_epi16(0xff);
171         const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
172         const __m128i constAlphaVector = _mm_set1_epi16(const_alpha);
173         for (int y = 0; y < h; ++y) {
174             BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, w, nullVector, half, one, colorMask, constAlphaVector)
175             dst = (quint32 *)(((uchar *) dst) + dbpl);
176             src = (const quint32 *)(((const uchar *) src) + sbpl);
177         }
178     }
179 }
180
181 QT_END_NAMESPACE
182
183 #endif // QT_HAVE_SSSE3