1 /****************************************************************************
3 ** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
4 ** All rights reserved.
5 ** Contact: Nokia Corporation (qt-info@nokia.com)
7 ** This file is part of the QtGui module of the Qt Toolkit.
9 ** $QT_BEGIN_LICENSE:LGPL$
10 ** GNU Lesser General Public License Usage
11 ** This file may be used under the terms of the GNU Lesser General Public
12 ** License version 2.1 as published by the Free Software Foundation and
13 ** appearing in the file LICENSE.LGPL included in the packaging of this
14 ** file. Please review the following information to ensure the GNU Lesser
15 ** General Public License version 2.1 requirements will be met:
16 ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
18 ** In addition, as a special exception, Nokia gives you certain additional
19 ** rights. These rights are described in the Nokia Qt LGPL Exception
20 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
22 ** GNU General Public License Usage
23 ** Alternatively, this file may be used under the terms of the GNU General
24 ** Public License version 3.0 as published by the Free Software Foundation
25 ** and appearing in the file LICENSE.GPL included in the packaging of this
26 ** file. Please review the following information to ensure the GNU General
27 ** Public License version 3.0 requirements will be met:
28 ** http://www.gnu.org/copyleft/gpl.html.
31 ** Alternatively, this file may be used in accordance with the terms and
32 ** conditions contained in a signed written agreement between you and Nokia.
40 ****************************************************************************/
42 #include <private/qdrawhelper_x86_p.h>
46 #include <private/qdrawingprimitive_sse2_p.h>
50 inline static void blend_pixel(quint32 &dst, const quint32 src)
52 if (src >= 0xff000000)
55 dst = src + BYTE_MUL(dst, qAlpha(~src));
/* The instruction palignr uses direct (immediate) arguments, so we have to generate the code for the
   different shifts (4, 8, 12). Checking the alignment inside the loop is unfortunately way too slow.

   BLENDING_LOOP blends four pixels per iteration while x < length - 3, reading src through aligned
   16-byte loads even though &src[x] itself is not 16-byte aligned.

   The macro is not self-contained. It refers BY NAME to the following identifiers, which must exist
   at the expansion site: x, src, dst, minusOffsetToAlignSrcOn16Bytes, srcVectorPrevLoaded (mutable),
   alphaMask, nullVector, alphaShuffleMask, one, colorMask and half.
   palignrOffset must be a compile-time constant equal to minusOffsetToAlignSrcOn16Bytes * 4.
 */
#define BLENDING_LOOP(palignrOffset, length)\
    for (; x < (length) - 3; x += 4) { \
        /* Aligned load of the 16-byte block that follows the one held in srcVectorPrevLoaded. */ \
        const __m128i srcVectorLastLoaded = _mm_load_si128((__m128i *)&src[x - minusOffsetToAlignSrcOn16Bytes + 4]);\
        /* Stitch both blocks together and shift so that the vector starts at src[x]. */ \
        const __m128i srcVector = _mm_alignr_epi8(srcVectorLastLoaded, srcVectorPrevLoaded, palignrOffset); \
        const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask); \
        if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) { \
            /* The alpha bits of all four pixels equal alphaMask (fully opaque): plain copy. */ \
            _mm_store_si128((__m128i *)&dst[x], srcVector); \
        } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) { \
            /* At least one pixel has a non-zero alpha: result = src + dst * (one - alpha). */ \
            __m128i alphaChannel = _mm_shuffle_epi8(srcVector, alphaShuffleMask); \
            alphaChannel = _mm_sub_epi16(one, alphaChannel); \
            const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); \
            __m128i destMultipliedByOneMinusAlpha; \
            BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); \
            const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha); \
            _mm_store_si128((__m128i *)&dst[x], result); \
        } \
        /* Must run on every iteration, whatever branch was taken: the block just loaded is the \
           low half of the next iteration's source vector. */ \
        srcVectorPrevLoaded = srcVectorLastLoaded;\
    }
// Blend one row of src over dst. No constant alpha is applied by this macro.
// nullVector, half, one, colorMask and alphaMask are constant across the whole image/texture,
// and should be defined as:
//const __m128i nullVector = _mm_set1_epi32(0);
//const __m128i half = _mm_set1_epi16(0x80);
//const __m128i one = _mm_set1_epi16(0xff);
//const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
//const __m128i alphaMask = _mm_set1_epi32(0xff000000);
//
// The computation being done is:
// result = s + d * (1-alpha)
// with shortcuts if fully opaque or fully transparent.
//
// Structure of the expansion:
//  1. scalar prologue until dst is 16-byte aligned (ALIGNMENT_PROLOGUE_16BYTES, defined elsewhere);
//  2. vector loop, 4 pixels at a time: aligned loads when src happens to be aligned as well,
//     otherwise BLENDING_LOOP with the palignr shift matching the src misalignment;
//  3. scalar epilogue for the remaining pixels.
//
// NOTE: BLENDING_LOOP refers to the identifiers `src` and `dst` literally, so the pointers passed
// as the dst/src arguments must be variables with exactly those names.
#define BLEND_SOURCE_OVER_ARGB32_SSSE3(dst, src, length, nullVector, half, one, colorMask, alphaMask) { \
    int x = 0; \
\
    /* First, get dst aligned. */ \
    ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) { \
        blend_pixel(dst[x], src[x]); \
    } \
\
    /* Number of pixels (0..3) by which &src[x] lies past the previous 16-byte boundary. */ \
    const int minusOffsetToAlignSrcOn16Bytes = (reinterpret_cast<quintptr>(&(src[x])) >> 2) & 0x3;\
\
    if (!minusOffsetToAlignSrcOn16Bytes) {\
        /* src is aligned, usual algorithm but with aligned operations.\
           See the SSE2 version for more documentation on the algorithm itself. */\
        const __m128i alphaShuffleMask = _mm_set_epi8(0xff,15,0xff,15,0xff,11,0xff,11,0xff,7,0xff,7,0xff,3,0xff,3);\
        for (; x < (length) - 3; x += 4) { \
            const __m128i srcVector = _mm_load_si128((__m128i *)&src[x]); \
            const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask); \
            if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) { \
                _mm_store_si128((__m128i *)&dst[x], srcVector); \
            } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) { \
                __m128i alphaChannel = _mm_shuffle_epi8(srcVector, alphaShuffleMask); \
                alphaChannel = _mm_sub_epi16(one, alphaChannel); \
                const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); \
                __m128i destMultipliedByOneMinusAlpha; \
                BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); \
                const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha); \
                _mm_store_si128((__m128i *)&dst[x], result); \
            } \
        } /* end for() */\
    } else if (((length) - x) >= 8) {\
        /* We use two vectors to extract the src: prevLoaded for the first pixels, lastLoaded for the current pixels. */\
        __m128i srcVectorPrevLoaded = _mm_load_si128((__m128i *)&src[x - minusOffsetToAlignSrcOn16Bytes]);\
        const int palignrOffset = minusOffsetToAlignSrcOn16Bytes << 2;\
\
        const __m128i alphaShuffleMask = _mm_set_epi8(0xff,15,0xff,15,0xff,11,0xff,11,0xff,7,0xff,7,0xff,3,0xff,3);\
        /* palignr needs an immediate shift count, hence one loop instance per possible offset. */\
        switch (palignrOffset) {\
        case 4:\
            BLENDING_LOOP(4, length)\
            break;\
        case 8:\
            BLENDING_LOOP(8, length)\
            break;\
        case 12:\
            BLENDING_LOOP(12, length)\
            break;\
        }\
    }\
    /* Scalar epilogue: whatever the vector loops did not cover. */ \
    for (; x < (length); ++x) \
        blend_pixel(dst[x], src[x]); \
}
144 void qt_blend_argb32_on_argb32_ssse3(uchar *destPixels, int dbpl,
145 const uchar *srcPixels, int sbpl,
149 const quint32 *src = (const quint32 *) srcPixels;
150 quint32 *dst = (quint32 *) destPixels;
151 if (const_alpha == 256) {
152 const __m128i alphaMask = _mm_set1_epi32(0xff000000);
153 const __m128i nullVector = _mm_setzero_si128();
154 const __m128i half = _mm_set1_epi16(0x80);
155 const __m128i one = _mm_set1_epi16(0xff);
156 const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
158 for (int y = 0; y < h; ++y) {
159 BLEND_SOURCE_OVER_ARGB32_SSSE3(dst, src, w, nullVector, half, one, colorMask, alphaMask);
160 dst = (quint32 *)(((uchar *) dst) + dbpl);
161 src = (const quint32 *)(((const uchar *) src) + sbpl);
163 } else if (const_alpha != 0) {
164 // dest = (s + d * sia) * ca + d * cia
165 // = s * ca + d * (sia * ca + cia)
166 // = s * ca + d * (1 - sa*ca)
167 const_alpha = (const_alpha * 255) >> 8;
168 const __m128i nullVector = _mm_setzero_si128();
169 const __m128i half = _mm_set1_epi16(0x80);
170 const __m128i one = _mm_set1_epi16(0xff);
171 const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
172 const __m128i constAlphaVector = _mm_set1_epi16(const_alpha);
173 for (int y = 0; y < h; ++y) {
174 BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, w, nullVector, half, one, colorMask, constAlphaVector)
175 dst = (quint32 *)(((uchar *) dst) + dbpl);
176 src = (const quint32 *)(((const uchar *) src) + sbpl);
183 #endif // QT_HAVE_SSSE3