Improve drawing scaled image with raster using SSE2
authorOlivier Goffart <ogoffart@kde.org>
Mon, 24 Oct 2011 06:02:10 +0000 (08:02 +0200)
committerQt by Nokia <qt-info@nokia.com>
Mon, 24 Oct 2011 11:56:59 +0000 (13:56 +0200)
That codepath is taken in qml when an Image specify with and hight and
is not smooth, and the image contains alpha contents

The changes in qdrawingprimitive_sse2_p.h just put some code from the
BLEND_SOURCE_OVER_ARGB32_SSE2 macro into a sub macro to allow its reuse

The code that is not SSE2 in qt_scale_image_argb32_on_argb32_sse2 comes
from the qt_scale_image_argb32_on_argb32 in qblendfunctions.cpp

Change-Id: I071a040af4514fb21777dead9f7c5baf16071d59
Reviewed-by: Samuel Rødal <samuel.rodal@nokia.com>
src/gui/painting/qdrawhelper.cpp
src/gui/painting/qdrawhelper_sse2.cpp
src/gui/painting/qdrawingprimitive_sse2_p.h

index c4c5846..88c237b 100644 (file)
@@ -7050,6 +7050,15 @@ void qInitDrawhelperAsm()
         qDrawHelper[QImage::Format_ARGB32].bitmapBlit = qt_bitmapblit32_sse2;
         qDrawHelper[QImage::Format_ARGB32_Premultiplied].bitmapBlit = qt_bitmapblit32_sse2;
         qDrawHelper[QImage::Format_RGB16].bitmapBlit = qt_bitmapblit16_sse2;
+
+        extern void qt_scale_image_argb32_on_argb32_sse2(uchar *destPixels, int dbpl,
+                                                         const uchar *srcPixels, int sbpl,
+                                                         const QRectF &targetRect,
+                                                         const QRectF &sourceRect,
+                                                         const QRect &clip,
+                                                         int const_alpha);
+        qScaleFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_ARGB32_Premultiplied] = qt_scale_image_argb32_on_argb32_sse2;
+        qScaleFunctions[QImage::Format_RGB32][QImage::Format_ARGB32_Premultiplied] = qt_scale_image_argb32_on_argb32_sse2;
 #endif
 #ifdef QT_HAVE_SSE
     } else if (features & SSE) {
index 340cd71..3bbdae0 100644 (file)
@@ -538,6 +538,122 @@ const uint * QT_FASTCALL qt_fetch_radial_gradient_sse2(uint *buffer, const Opera
     return qt_fetch_radial_gradient_template<QRadialFetchSimd<QSimdSse2> >(buffer, op, data, y, x, length);
 }
 
+void qt_scale_image_argb32_on_argb32_sse2(uchar *destPixels, int dbpl,
+                                          const uchar *srcPixels, int sbpl,
+                                          const QRectF &targetRect,
+                                          const QRectF &sourceRect,
+                                          const QRect &clip,
+                                          int const_alpha)
+{
+    if (const_alpha != 256) {
+        // from qblendfunctions.cpp
+        extern void qt_scale_image_argb32_on_argb32(uchar *destPixels, int dbpl,
+                                               const uchar *srcPixels, int sbpl,
+                                               const QRectF &targetRect,
+                                               const QRectF &sourceRect,
+                                               const QRect &clip,
+                                               int const_alpha);
+        return qt_scale_image_argb32_on_argb32(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip, const_alpha);
+    }
+
+    qreal sx = targetRect.width() / (qreal) sourceRect.width();
+    qreal sy = targetRect.height() / (qreal) sourceRect.height();
+
+    int ix = 0x00010000 / sx;
+    int iy = 0x00010000 / sy;
+
+    int cx1 = clip.x();
+    int cx2 = clip.x() + clip.width();
+    int cy1 = clip.top();
+    int cy2 = clip.y() + clip.height();
+
+    int tx1 = qRound(targetRect.left());
+    int tx2 = qRound(targetRect.right());
+    int ty1 = qRound(targetRect.top());
+    int ty2 = qRound(targetRect.bottom());
+
+    if (tx2 < tx1)
+        qSwap(tx2, tx1);
+    if (ty2 < ty1)
+        qSwap(ty2, ty1);
+
+    if (tx1 < cx1)
+        tx1 = cx1;
+    if (tx2 >= cx2)
+        tx2 = cx2;
+
+    if (tx1 >= tx2)
+        return;
+
+    if (ty1 < cy1)
+        ty1 = cy1;
+    if (ty2 >= cy2)
+       ty2 = cy2;
+    if (ty1 >= ty2)
+        return;
+
+    int h = ty2 - ty1;
+    int w = tx2 - tx1;
+
+    quint32 basex;
+    quint32 srcy;
+
+    if (sx < 0) {
+        int dstx = qFloor((tx1 + qreal(0.5) - targetRect.right()) * ix) + 1;
+        basex = quint32(sourceRect.right() * 65536) + dstx;
+    } else {
+        int dstx = qCeil((tx1 + qreal(0.5) - targetRect.left()) * ix) - 1;
+        basex = quint32(sourceRect.left() * 65536) + dstx;
+    }
+    if (sy < 0) {
+        int dsty = qFloor((ty1 + qreal(0.5) - targetRect.bottom()) * iy) + 1;
+        srcy = quint32(sourceRect.bottom() * 65536) + dsty;
+    } else {
+        int dsty = qCeil((ty1 + qreal(0.5) - targetRect.top()) * iy) - 1;
+        srcy = quint32(sourceRect.top() * 65536) + dsty;
+    }
+
+    quint32 *dst = ((quint32 *) (destPixels + ty1 * dbpl)) + tx1;
+
+    const __m128i nullVector = _mm_set1_epi32(0);
+    const __m128i half = _mm_set1_epi16(0x80);
+    const __m128i one = _mm_set1_epi16(0xff);
+    const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
+    const __m128i alphaMask = _mm_set1_epi32(0xff000000);
+    const __m128i ixVector = _mm_set1_epi32(4*ix);
+
+
+    while (h--) {
+        const uint *src = (const quint32 *) (srcPixels + (srcy >> 16) * sbpl);
+        int srcx = basex;
+        int x = 0;
+
+        ALIGNMENT_PROLOGUE_16BYTES(dst, x, w) {
+            uint s = src[(srcx + x*ix) >> 16];
+            dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
+        }
+
+        __m128i srcxVector = _mm_set_epi32(srcx, srcx + ix, srcx + ix + ix, srcx + ix + ix + ix);
+
+        for (; x<w - 3; x += 4) {
+            union Vect_buffer { __m128i vect; quint32 i[4]; };
+            Vect_buffer addr;
+            addr.vect = _mm_srli_epi32(srcxVector, 16);
+            srcxVector = _mm_add_epi32(srcxVector, ixVector);
+
+            const __m128i srcVector = _mm_set_epi32(src[addr.i[0]], src[addr.i[1]], src[addr.i[2]], src[addr.i[3]]);
+            BLEND_SOURCE_OVER_ARGB32_SSE2_helper(dst, srcVector, nullVector, half, one, colorMask, alphaMask);
+        }
+
+        for (; x<w; x++) {
+            uint s = src[(srcx + x*ix) >> 16];
+            dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
+        }
+        dst = (quint32 *)(((uchar *) dst) + dbpl);
+        srcy += iy;
+    }
+}
+
 
 QT_END_NAMESPACE
 
index 4c66d90..dad8f6c 100644 (file)
@@ -128,6 +128,33 @@ QT_BEGIN_NAMESPACE
     result = _mm_or_si128(finalAG, finalRB); \
 }
 
+// same as BLEND_SOURCE_OVER_ARGB32_SSE2, but for one vector srcVector
+#define BLEND_SOURCE_OVER_ARGB32_SSE2_helper(dst, srcVector, nullVector, half, one, colorMask, alphaMask) { \
+        const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask); \
+        if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) { \
+            /* all opaque */ \
+            _mm_store_si128((__m128i *)&dst[x], srcVector); \
+        } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) { \
+            /* not fully transparent */ \
+            /* extract the alpha channel on 2 x 16 bits */ \
+            /* so we have room for the multiplication */ \
+            /* each 32 bits will be in the form 0x00AA00AA */ \
+            /* with A being the 1 - alpha */ \
+            __m128i alphaChannel = _mm_srli_epi32(srcVector, 24); \
+            alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16)); \
+            alphaChannel = _mm_sub_epi16(one, alphaChannel); \
+ \
+            const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); \
+            __m128i destMultipliedByOneMinusAlpha; \
+            BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); \
+ \
+            /* result = s + d * (1-alpha) */\
+            const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha); \
+            _mm_store_si128((__m128i *)&dst[x], result); \
+        } \
+    }
+
+
 // Basically blend src over dst with the const alpha defined as constAlphaVector.
 // nullVector, half, one, colorMask are constant across the whole image/texture, and should be defined as:
 //const __m128i nullVector = _mm_set1_epi32(0);
@@ -153,28 +180,7 @@ QT_BEGIN_NAMESPACE
 \
     for (; x < length-3; x += 4) { \
         const __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]); \
-        const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask); \
-        if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) { \
-            /* all opaque */ \
-            _mm_store_si128((__m128i *)&dst[x], srcVector); \
-        } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) { \
-            /* not fully transparent */ \
-            /* extract the alpha channel on 2 x 16 bits */ \
-            /* so we have room for the multiplication */ \
-            /* each 32 bits will be in the form 0x00AA00AA */ \
-            /* with A being the 1 - alpha */ \
-            __m128i alphaChannel = _mm_srli_epi32(srcVector, 24); \
-            alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16)); \
-            alphaChannel = _mm_sub_epi16(one, alphaChannel); \
- \
-            const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); \
-            __m128i destMultipliedByOneMinusAlpha; \
-            BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); \
- \
-            /* result = s + d * (1-alpha) */\
-            const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha); \
-            _mm_store_si128((__m128i *)&dst[x], result); \
-        } \
+        BLEND_SOURCE_OVER_ARGB32_SSE2_helper(dst, srcVector, nullVector, half, one, colorMask, alphaMask) \
     } \
     for (; x < length; ++x) { \
         uint s = src[x]; \