MIPS DSP composition functions optimizations.
author Damir Tatalovic <dtatalovic@mips.com>
Fri, 22 Jun 2012 16:13:02 +0000 (18:13 +0200)
committer Qt by Nokia <qt-info@nokia.com>
Mon, 2 Jul 2012 23:55:44 +0000 (01:55 +0200)
List of optimized routines:
- comp_func_DestinationOver
- comp_func_SourceIn
- comp_func_DestinationIn
- comp_func_DestinationOut
- comp_func_SourceAtop
- comp_func_DestinationAtop
- comp_func_XOR
- comp_func_SourceOut
- comp_func_solid_SourceOver
- comp_func_solid_DestinationOver
- comp_func_solid_SourceIn
- comp_func_solid_DestinationIn
- comp_func_solid_SourceAtop
- comp_func_solid_DestinationAtop
- comp_func_solid_XOR
- comp_func_solid_SourceOut

The previously optimized routines qt_blend_argb32_on_argb32_mips_dsp and
comp_func_Source_mips_dsp have been redesigned and rewritten.

The overall improvement, measured with the tst_bench_blendbench benchmark
from tests/benchmarks/gui/image/blendbench/, is 27%.

Change-Id: I6ab09b17cac10f4aded59787074ab4c89e72ccac
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
Reviewed-by: Samuel Rødal <samuel.rodal@nokia.com>
src/gui/painting/qdrawhelper.cpp
src/gui/painting/qdrawhelper_mips_dsp.cpp
src/gui/painting/qdrawhelper_mips_dsp_asm.S
src/gui/painting/qdrawhelper_mips_dsp_p.h
src/gui/painting/qdrawhelper_mips_dspr2_asm.S
src/gui/painting/qt_mips_asm_dsp.h

index 08975da..985ef68 100644 (file)
@@ -5971,6 +5971,23 @@ void qInitDrawhelperAsm()
 #if defined(QT_COMPILER_SUPPORTS_MIPS_DSP)
         functionForMode_C[QPainter::CompositionMode_SourceOver] = comp_func_SourceOver_asm_mips_dsp;
         functionForMode_C[QPainter::CompositionMode_Source] = comp_func_Source_mips_dsp;
+        functionForMode_C[QPainter::CompositionMode_DestinationOver] = comp_func_DestinationOver_mips_dsp;
+        functionForMode_C[QPainter::CompositionMode_SourceIn] = comp_func_SourceIn_mips_dsp;
+        functionForMode_C[QPainter::CompositionMode_DestinationIn] = comp_func_DestinationIn_mips_dsp;
+        functionForMode_C[QPainter::CompositionMode_DestinationOut] = comp_func_DestinationOut_mips_dsp;
+        functionForMode_C[QPainter::CompositionMode_SourceAtop] = comp_func_SourceAtop_mips_dsp;
+        functionForMode_C[QPainter::CompositionMode_DestinationAtop] = comp_func_DestinationAtop_mips_dsp;
+        functionForMode_C[QPainter::CompositionMode_Xor] = comp_func_XOR_mips_dsp;
+        functionForMode_C[QPainter::CompositionMode_SourceOut] = comp_func_SourceOut_mips_dsp;
+
+        functionForModeSolid_C[QPainter::CompositionMode_SourceOver] = comp_func_solid_SourceOver_mips_dsp;
+        functionForModeSolid_C[QPainter::CompositionMode_DestinationOver] = comp_func_solid_DestinationOver_mips_dsp;
+        functionForModeSolid_C[QPainter::CompositionMode_SourceIn] = comp_func_solid_SourceIn_mips_dsp;
+        functionForModeSolid_C[QPainter::CompositionMode_DestinationIn] = comp_func_solid_DestinationIn_mips_dsp;
+        functionForModeSolid_C[QPainter::CompositionMode_SourceAtop] = comp_func_solid_SourceAtop_mips_dsp;
+        functionForModeSolid_C[QPainter::CompositionMode_DestinationAtop] = comp_func_solid_DestinationAtop_mips_dsp;
+        functionForModeSolid_C[QPainter::CompositionMode_Xor] = comp_func_solid_XOR_mips_dsp;
+        functionForModeSolid_C[QPainter::CompositionMode_SourceOut] = comp_func_solid_SourceOut_mips_dsp;
 
         qt_memfill32 = qt_memfill32_asm_mips_dsp;
 
index b33329c..ec1d7d2 100644 (file)
 
 QT_BEGIN_NAMESPACE
 
-#if defined(QT_COMPILER_SUPPORTS_MIPS_DSP)
-
-extern "C" uint INTERPOLATE_PIXEL_255_asm_mips_dsp(uint x, uint a, uint y, uint b);
-
-extern "C"  uint BYTE_MUL_asm_mips_dsp(uint x, uint a);
-
-extern "C" uint * destfetchARGB32_asm_mips_dsp(uint *buffer, const uint *data, int length);
-
-extern "C" uint * qt_destStoreARGB32_asm_mips_dsp(uint *buffer, const uint *data, int length);
-
-#if defined(QT_COMPILER_SUPPORTS_MIPS_DSPR2)
-
-extern "C" uint INTERPOLATE_PIXEL_255_asm_mips_dspr2(uint x, uint a, uint y, uint b);
-
-extern "C" uint BYTE_MUL_asm_mips_dspr2(uint x, uint a);
-
-#endif // QT_COMPILER_SUPPORTS_MIPS_DSPR2
-
 void qt_blend_argb32_on_argb32_mips_dsp(uchar *destPixels, int dbpl,
                                       const uchar *srcPixels, int sbpl,
                                       int w, int h,
@@ -80,32 +62,21 @@ void qt_blend_argb32_on_argb32_mips_dsp(uchar *destPixels, int dbpl,
     uint *dst = (uint *) destPixels;
     if (const_alpha == 256) {
         for (int y=0; y<h; ++y) {
-            for (int x=0; x<w; ++x) {
-                uint s = src[x];
-                if (s >= 0xff000000)
-                    dst[x] = s;
-                else if (s != 0)
-#if !defined(QT_COMPILER_SUPPORTS_MIPS_DSPR2)
-                    dst[x] = s + BYTE_MUL_asm_mips_dsp(dst[x], qAlpha(~s));
-#else
-                    dst[x] = s + BYTE_MUL_asm_mips_dspr2(dst[x], qAlpha(~s));
-#endif
-            }
+            qt_blend_argb32_on_argb32_const_alpha_256_mips_dsp_asm(dst, src, w);
             dst = (quint32 *)(((uchar *) dst) + dbpl);
             src = (const quint32 *)(((const uchar *) src) + sbpl);
         }
     } else if (const_alpha != 0) {
         const_alpha = (const_alpha * 255) >> 8;
         for (int y=0; y<h; ++y) {
-            for (int x=0; x<w; ++x) {
-#if !defined(QT_COMPILER_SUPPORTS_MIPS_DSPR2)
-                uint s = BYTE_MUL_asm_mips_dsp(src[x], const_alpha);
-                dst[x] = s + BYTE_MUL_asm_mips_dsp(dst[x], qAlpha(~s));
-#else
-                uint s = BYTE_MUL_asm_mips_dspr2(src[x], const_alpha);
-                dst[x] = s + BYTE_MUL_asm_mips_dspr2(dst[x], qAlpha(~s));
-#endif
+            /* Blend one row of w pixels. The x2 routine is handed an even count, so an
+             * odd leading pixel is blended in C; dst/src themselves stay at the row start. */
+            const int odd = (w%2 > 0) ? 1 : 0;
+            if (odd) {
+                uint s = BYTE_MUL(src[0], const_alpha);
+                dst[0] = s + BYTE_MUL(dst[0], qAlpha(~s));
             }
+            qt_blend_argb32_on_argb32_mips_dsp_asm_x2(dst + odd, src + odd, w - odd, const_alpha);
             dst = (quint32 *)(((uchar *) dst) + dbpl);
             src = (const quint32 *)(((const uchar *) src) + sbpl);
         }
@@ -145,13 +116,13 @@ void comp_func_Source_mips_dsp(uint *dest, const uint *src, int length, uint con
         ::memcpy(dest, src, length * sizeof(uint));
     } else {
         int ialpha = 255 - const_alpha;
-        for (int i = 0; i < length; ++i) {
-#if !defined(QT_COMPILER_SUPPORTS_MIPS_DSPR2)
-            dest[i] = INTERPOLATE_PIXEL_255_asm_mips_dsp(src[i], const_alpha, dest[i], ialpha);
-#else
-            dest[i] = INTERPOLATE_PIXEL_255_asm_mips_dspr2(src[i], const_alpha, dest[i], ialpha);
-#endif
+        if (length%2 > 0) {
+            dest[0] = INTERPOLATE_PIXEL_255(src[0], const_alpha, dest[0], ialpha);
+            length--;
+            dest++;
+            src++;
         }
+        comp_func_Source_dsp_asm_x2(dest, src, length, const_alpha);
     }
 }
 
@@ -171,6 +142,285 @@ void QT_FASTCALL qt_destStoreARGB32_mips_dsp(QRasterBuffer *rasterBuffer, int x,
     qt_destStoreARGB32_asm_mips_dsp(data, buffer, length);
 }
 
-#endif // QT_COMPILER_SUPPORTS_MIPS_DSP
+// Solid-colour SourceOver: every pixel becomes
+//     color + BYTE_MUL(pixel, qAlpha(~color))
+// where color has first been scaled by const_alpha when that is not 255.
+// An odd leading pixel is blended here so that the _x2 assembly routine is
+// always handed an even length.
+void QT_FASTCALL comp_func_solid_SourceOver_mips_dsp(uint *dest, int length, uint color, uint const_alpha)
+{
+    const uint premul = (const_alpha == 255) ? color : BYTE_MUL(color, const_alpha);
+    const uint inverseAlpha = qAlpha(~premul);
+    const bool oddCount = (length % 2) > 0;
+
+    if (oddCount) {
+        *dest = premul + BYTE_MUL(*dest, inverseAlpha);
+        ++dest;
+        --length;
+    }
+    comp_func_solid_Source_dsp_asm_x2(dest, length, premul, inverseAlpha);
+}
+
+// Solid-colour DestinationOver: every pixel becomes
+//     pixel + BYTE_MUL(color, qAlpha(~pixel))
+// with color scaled by const_alpha beforehand when const_alpha is not 255.
+// The odd leading pixel (if any) is handled in C; the rest goes to the _x2
+// assembly routine.
+void QT_FASTCALL comp_func_solid_DestinationOver_mips_dsp(uint *dest, int length, uint color, uint const_alpha)
+{
+    const uint premul = (const_alpha == 255) ? color : BYTE_MUL(color, const_alpha);
+
+    if ((length % 2) > 0) {
+        const uint pixel = *dest;
+        *dest = pixel + BYTE_MUL(premul, qAlpha(~pixel));
+        ++dest;
+        --length;
+    }
+    comp_func_solid_DestinationOver_dsp_asm_x2(dest, length, premul);
+}
+
+// DestinationOver with a source span: every pixel becomes
+//     pixel + BYTE_MUL(s, qAlpha(~pixel))
+// where s is the source pixel, scaled by const_alpha when that is not 255.
+// Only the odd leading pixel is computed here; the remaining even-length
+// span is passed to the _x2 assembly routine with the unmodified const_alpha.
+void QT_FASTCALL comp_func_DestinationOver_mips_dsp(uint *dest, const uint *src, int length, uint const_alpha)
+{
+    if ((length % 2) > 0) {
+        const uint pixel = *dest;
+        uint source = *src;
+        if (const_alpha != 255)
+            source = BYTE_MUL(source, const_alpha);
+        *dest = pixel + BYTE_MUL(source, qAlpha(~pixel));
+        ++dest;
+        ++src;
+        --length;
+    }
+    comp_func_DestinationOver_dsp_asm_x2(dest, src, length, const_alpha);
+}
+
+// Solid-colour SourceIn.
+//   const_alpha == 255: pixel = BYTE_MUL(color, qAlpha(pixel))
+//   otherwise:          pixel = INTERPOLATE_PIXEL_255(BYTE_MUL(color, const_alpha),
+//                                                     qAlpha(pixel), pixel, 255 - const_alpha)
+// The odd leading pixel is handled here; the _x2 assembly routine receives the
+// original (unscaled) color together with const_alpha for the rest.
+void QT_FASTCALL comp_func_solid_SourceIn_mips_dsp(uint *dest, int length, uint color, uint const_alpha)
+{
+    const bool oddCount = (length % 2) > 0;
+
+    if (oddCount) {
+        const uint pixel = *dest;
+        const uint pixelAlpha = qAlpha(pixel);
+        if (const_alpha != 255) {
+            const uint scaled = BYTE_MUL(color, const_alpha);
+            const uint cia = 255 - const_alpha;
+            *dest = INTERPOLATE_PIXEL_255(scaled, pixelAlpha, pixel, cia);
+        } else {
+            *dest = BYTE_MUL(color, pixelAlpha);
+        }
+        ++dest;
+        --length;
+    }
+    comp_func_solid_SourceIn_dsp_asm_x2(dest, length, color, const_alpha);
+}
+
+// SourceIn with a source span.
+//   const_alpha == 255: pixel = BYTE_MUL(s, qAlpha(pixel))
+//   otherwise:          pixel = INTERPOLATE_PIXEL_255(BYTE_MUL(s, const_alpha),
+//                                                     qAlpha(pixel), pixel, 255 - const_alpha)
+// where s is the matching source pixel. The odd leading pixel is handled
+// here; the even remainder goes to the _x2 assembly routine.
+void QT_FASTCALL comp_func_SourceIn_mips_dsp(uint *dest, const uint *src, int length, uint const_alpha)
+{
+    if ((length % 2) > 0) {
+        const uint pixel = *dest;
+        const uint pixelAlpha = qAlpha(pixel);
+        if (const_alpha != 255) {
+            const uint cia = 255 - const_alpha;
+            const uint scaled = BYTE_MUL(*src, const_alpha);
+            *dest = INTERPOLATE_PIXEL_255(scaled, pixelAlpha, pixel, cia);
+        } else {
+            *dest = BYTE_MUL(*src, pixelAlpha);
+        }
+        ++dest;
+        ++src;
+        --length;
+    }
+    comp_func_SourceIn_dsp_asm_x2(dest, src, length, const_alpha);
+}
+
+// Solid-colour DestinationIn: every pixel is scaled with BYTE_MUL by one
+// precomputed factor. The factor is qAlpha(color); when const_alpha is not
+// 255 it becomes BYTE_MUL(qAlpha(color), const_alpha) + 255 - const_alpha.
+// The odd leading pixel is scaled here, the rest by the _x2 assembly routine.
+void QT_FASTCALL comp_func_solid_DestinationIn_mips_dsp(uint *dest, int length, uint color, uint const_alpha)
+{
+    uint factor = qAlpha(color);
+    if (const_alpha != 255)
+        factor = BYTE_MUL(factor, const_alpha) + 255 - const_alpha;
+
+    const bool oddCount = (length % 2) > 0;
+    if (oddCount) {
+        *dest = BYTE_MUL(*dest, factor);
+        ++dest;
+        --length;
+    }
+    comp_func_solid_DestinationIn_dsp_asm_x2(dest, length, factor);
+}
+
+// DestinationIn with a source span: every pixel is scaled with BYTE_MUL by a
+// per-pixel factor derived from the source alpha.
+//   const_alpha == 255: factor = qAlpha(s)
+//   otherwise:          factor = BYTE_MUL(qAlpha(s), const_alpha) + (255 - const_alpha)
+// The odd leading pixel is handled here; the even remainder goes to the _x2
+// assembly routine.
+void QT_FASTCALL comp_func_DestinationIn_mips_dsp(uint *dest, const uint *src, int length, uint const_alpha)
+{
+    if ((length % 2) > 0) {
+        uint factor = qAlpha(*src);
+        if (const_alpha != 255) {
+            const int cia = 255 - const_alpha;
+            factor = BYTE_MUL(factor, const_alpha) + cia;
+        }
+        *dest = BYTE_MUL(*dest, factor);
+        ++dest;
+        ++src;
+        --length;
+    }
+    comp_func_DestinationIn_dsp_asm_x2(dest, src, length, const_alpha);
+}
+
+// Solid-colour DestinationOut: every pixel is scaled with BYTE_MUL by one
+// precomputed factor. The factor is qAlpha(~color); when const_alpha is not
+// 255 it becomes BYTE_MUL(qAlpha(~color), const_alpha) + 255 - const_alpha.
+// Once the factor is known the per-pixel work is the same as in
+// comp_func_solid_DestinationIn_mips_dsp, so that mode's _x2 assembly
+// routine is used for the even remainder.
+void QT_FASTCALL comp_func_solid_DestinationOut_mips_dsp(uint *dest, int length, uint color, uint const_alpha)
+{
+    uint factor = qAlpha(~color);
+    if (const_alpha != 255)
+        factor = BYTE_MUL(factor, const_alpha) + 255 - const_alpha;
+
+    const bool oddCount = (length % 2) > 0;
+    if (oddCount) {
+        *dest = BYTE_MUL(*dest, factor);
+        ++dest;
+        --length;
+    }
+    comp_func_solid_DestinationIn_dsp_asm_x2(dest, length, factor);
+}
+
+// DestinationOut with a source span: every pixel is scaled with BYTE_MUL by a
+// per-pixel factor derived from the inverted source alpha.
+//   const_alpha == 255: factor = qAlpha(~s)
+//   otherwise:          factor = BYTE_MUL(qAlpha(~s), const_alpha) + (255 - const_alpha)
+// The odd leading pixel is handled here; the even remainder goes to the _x2
+// assembly routine.
+void QT_FASTCALL comp_func_DestinationOut_mips_dsp(uint *dest, const uint *src, int length, uint const_alpha)
+{
+    if ((length % 2) > 0) {
+        uint factor = qAlpha(~*src);
+        if (const_alpha != 255) {
+            const int cia = 255 - const_alpha;
+            factor = BYTE_MUL(factor, const_alpha) + cia;
+        }
+        *dest = BYTE_MUL(*dest, factor);
+        ++dest;
+        ++src;
+        --length;
+    }
+    comp_func_DestinationOut_dsp_asm_x2(dest, src, length, const_alpha);
+}
+
+// Solid-colour SourceAtop: every pixel becomes
+//     INTERPOLATE_PIXEL_255(color, qAlpha(pixel), pixel, qAlpha(~color))
+// where color has first been scaled by const_alpha when that is not 255.
+// The odd leading pixel is handled here; the _x2 assembly routine gets the
+// scaled color and its inverse alpha for the even remainder.
+void QT_FASTCALL comp_func_solid_SourceAtop_mips_dsp(uint *dest, int length, uint color, uint const_alpha)
+{
+    const uint premul = (const_alpha == 255) ? color : BYTE_MUL(color, const_alpha);
+    const uint inverseAlpha = qAlpha(~premul);
+
+    if ((length % 2) > 0) {
+        const uint pixel = *dest;
+        *dest = INTERPOLATE_PIXEL_255(premul, qAlpha(pixel), pixel, inverseAlpha);
+        ++dest;
+        --length;
+    }
+    comp_func_solid_SourceAtop_dsp_asm_x2(dest, length, premul, inverseAlpha);
+}
+
+// SourceAtop with a source span: every pixel becomes
+//     INTERPOLATE_PIXEL_255(s, qAlpha(pixel), pixel, qAlpha(~s))
+// where s is the source pixel, scaled by const_alpha when that is not 255.
+// The odd leading pixel is handled here; the even remainder goes to the _x2
+// assembly routine.
+void QT_FASTCALL comp_func_SourceAtop_mips_dsp(uint *dest, const uint *src, int length, uint const_alpha)
+{
+    if ((length % 2) > 0) {
+        uint source = *src;
+        if (const_alpha != 255)
+            source = BYTE_MUL(source, const_alpha);
+        const uint pixel = *dest;
+        *dest = INTERPOLATE_PIXEL_255(source, qAlpha(pixel), pixel, qAlpha(~source));
+        ++dest;
+        ++src;
+        --length;
+    }
+    comp_func_SourceAtop_dsp_asm_x2(dest, src, length, const_alpha);
+}
+
+
+// Solid-colour DestinationAtop: every pixel becomes
+//     INTERPOLATE_PIXEL_255(pixel, a, color, qAlpha(~pixel))
+// With const_alpha == 255, a is qAlpha(color). Otherwise color is first
+// scaled by const_alpha and a is qAlpha(scaled color) + 255 - const_alpha.
+// The odd leading pixel is handled here; the _x2 assembly routine gets the
+// (possibly scaled) color and a for the even remainder.
+void QT_FASTCALL comp_func_solid_DestinationAtop_mips_dsp(uint *dest, int length, uint color, uint const_alpha)
+{
+    uint premul = color;
+    uint factor = qAlpha(color);
+    if (const_alpha != 255) {
+        premul = BYTE_MUL(color, const_alpha);
+        factor = qAlpha(premul) + 255 - const_alpha;
+    }
+
+    if ((length % 2) > 0) {
+        const uint pixel = *dest;
+        *dest = INTERPOLATE_PIXEL_255(pixel, factor, premul, qAlpha(~pixel));
+        ++dest;
+        --length;
+    }
+    comp_func_solid_DestinationAtop_dsp_asm_x2(dest, length, premul, factor);
+}
+
+// DestinationAtop with a source span: every pixel becomes
+//     INTERPOLATE_PIXEL_255(pixel, a, s, qAlpha(~pixel))
+// With const_alpha == 255, s is the source pixel and a is qAlpha(s).
+// Otherwise s is the source pixel scaled by const_alpha and a is
+// qAlpha(s) + (255 - const_alpha).
+// The odd leading pixel is handled here; the even remainder goes to the _x2
+// assembly routine.
+void QT_FASTCALL comp_func_DestinationAtop_mips_dsp(uint *dest, const uint *src, int length, uint const_alpha)
+{
+    if ((length % 2) > 0) {
+        const uint pixel = *dest;
+        uint source = *src;
+        uint factor;
+        if (const_alpha != 255) {
+            const int cia = 255 - const_alpha;
+            source = BYTE_MUL(source, const_alpha);
+            factor = qAlpha(source) + cia;
+        } else {
+            factor = qAlpha(source);
+        }
+        *dest = INTERPOLATE_PIXEL_255(pixel, factor, source, qAlpha(~pixel));
+        ++dest;
+        ++src;
+        --length;
+    }
+    comp_func_DestinationAtop_dsp_asm_x2(dest, src, length, const_alpha);
+}
+
+// Solid-colour XOR: every pixel becomes
+//     INTERPOLATE_PIXEL_255(color, qAlpha(~pixel), pixel, qAlpha(~color))
+// where color has first been scaled by const_alpha when that is not 255.
+// The odd leading pixel is handled here; the _x2 assembly routine gets the
+// scaled color and its inverse alpha for the even remainder.
+void QT_FASTCALL comp_func_solid_XOR_mips_dsp(uint *dest, int length, uint color, uint const_alpha)
+{
+    const uint premul = (const_alpha == 255) ? color : BYTE_MUL(color, const_alpha);
+    const uint inverseAlpha = qAlpha(~premul);
+
+    if ((length % 2) > 0) {
+        const uint pixel = *dest;
+        *dest = INTERPOLATE_PIXEL_255(premul, qAlpha(~pixel), pixel, inverseAlpha);
+        ++dest;
+        --length;
+    }
+    comp_func_solid_XOR_dsp_asm_x2(dest, length, premul, inverseAlpha);
+}
+
+// XOR with a source span: every pixel becomes
+//     INTERPOLATE_PIXEL_255(s, qAlpha(~pixel), pixel, qAlpha(~s))
+// where s is the source pixel, scaled by const_alpha when that is not 255.
+// The odd leading pixel is handled here; the even remainder goes to the _x2
+// assembly routine.
+void QT_FASTCALL comp_func_XOR_mips_dsp(uint *dest, const uint *src, int length, uint const_alpha)
+{
+    if ((length % 2) > 0) {
+        const uint pixel = *dest;
+        uint source = *src;
+        if (const_alpha != 255)
+            source = BYTE_MUL(source, const_alpha);
+        *dest = INTERPOLATE_PIXEL_255(source, qAlpha(~pixel), pixel, qAlpha(~source));
+        ++dest;
+        ++src;
+        --length;
+    }
+    comp_func_XOR_dsp_asm_x2(dest, src, length, const_alpha);
+}
+
+// Solid-colour SourceOut.
+//   const_alpha == 255: pixel = BYTE_MUL(color, qAlpha(~pixel))
+//   otherwise:          pixel = INTERPOLATE_PIXEL_255(BYTE_MUL(color, const_alpha),
+//                                                     qAlpha(~pixel), pixel, 255 - const_alpha)
+// The odd leading pixel is handled here; the _x2 assembly routine receives the
+// original (unscaled) color together with const_alpha for the rest.
+void QT_FASTCALL comp_func_solid_SourceOut_mips_dsp(uint *dest, int length, uint color, uint const_alpha)
+{
+    const bool oddCount = (length % 2) > 0;
+
+    if (oddCount) {
+        const uint pixel = *dest;
+        const uint inversePixelAlpha = qAlpha(~pixel);
+        if (const_alpha != 255) {
+            const uint scaled = BYTE_MUL(color, const_alpha);
+            const int cia = 255 - const_alpha;
+            *dest = INTERPOLATE_PIXEL_255(scaled, inversePixelAlpha, pixel, cia);
+        } else {
+            *dest = BYTE_MUL(color, inversePixelAlpha);
+        }
+        ++dest;
+        --length;
+    }
+    comp_func_solid_SourceOut_dsp_asm_x2(dest, length, color, const_alpha);
+}
+
+// SourceOut with a source span.
+//   const_alpha == 255: pixel = BYTE_MUL(s, qAlpha(~pixel))
+//   otherwise:          pixel = INTERPOLATE_PIXEL_255(BYTE_MUL(s, const_alpha),
+//                                                     qAlpha(~pixel), pixel, 255 - const_alpha)
+// where s is the matching source pixel. The odd leading pixel is handled
+// here; the even remainder goes to the _x2 assembly routine.
+void QT_FASTCALL comp_func_SourceOut_mips_dsp(uint *dest, const uint *src, int length, uint const_alpha)
+{
+    if ((length % 2) > 0) {
+        const uint pixel = *dest;
+        const uint inversePixelAlpha = qAlpha(~pixel);
+        if (const_alpha != 255) {
+            const int cia = 255 - const_alpha;
+            const uint scaled = BYTE_MUL(*src, const_alpha);
+            *dest = INTERPOLATE_PIXEL_255(scaled, inversePixelAlpha, pixel, cia);
+        } else {
+            *dest = BYTE_MUL(*src, inversePixelAlpha);
+        }
+        ++dest;
+        ++src;
+        --length;
+    }
+    comp_func_SourceOut_dsp_asm_x2(dest, src, length, const_alpha);
+}
+
 
 QT_END_NAMESPACE
index f426905..58cc176 100644 (file)
 
 #include "qt_mips_asm_dsp.h"
 
-LEAF_MIPS_DSP(INTERPOLATE_PIXEL_255_asm_mips_dsp)
-/*
- * a0 - uint x (First value to multiply)
- * a1 - uint a (Multiplicator byte for first value)
- * a2 - uint y (Second value to multiply)
- * a3 - uint b (Multiplicator byte for second value)
- */
-
-    .set reorder
-    li                t4, 8388736
-    preceu.ph.qbra    t0, a0        /* (x & 0xff00ff) */
-    mul               t0, t0, a1    /* (x & 0xff00ff) * a */
-    preceu.ph.qbra    t1, a2        /* (y & 0xff00ff) */
-    mul               t1, t1, a3    /* (y & 0xff00ff) * b */
-    addu              t0, t0, t1    /* (x & 0xff00ff) * a +
-                                     * (y & 0xff00ff) * b
-                                     */
-    preceu.ph.qbla    t1, t0        /* (t >> 8) & 0xff00ff */
-    addu              t0, t0, t1    /* t + ((t >> 8) & 0xff00ff */
-    addu              t0, t0, t4    /* t + ((t >> 8) & 0xff00ff) + 0x800080 */
-    preceu.ph.qbla    t0, t0        /* t >> 8 and t&=0xff00ff */
-    preceu.ph.qbla    t2, a0        /* (x>>8) & 0xff00ff */
-    mul               t2, t2, a1    /* ((x>>8) & 0xff00ff) * a */
-    preceu.ph.qbla    t3, a2        /* ((y>>8) & 0xff00ff) */
-    mul               t3, t3, a3    /* ((y>>8) & 0xff00ff) * b */
-    addu              t2, t2, t3    /* ((x>>8) & 0xff00ff) * a +
-                                     * ((y >> 8) & 0xff00ff) * b
-                                     */
-    preceu.ph.qbla    t3, t2        /* (x>>8) & 0xff00ff */
-    addu              t2, t2, t3    /* (x>>8) & 0xff00ff) + 0x800080 */
-    addu              t2, t2, t4    /* x + ((x>>8) & 0xff00ff) + 0x800080 */
-    and               t2, t2, 0xff00ff00
-    or                t1, t0, t2
-    move              v0, t1
-    j                 ra
-
-END(INTERPOLATE_PIXEL_255_asm_mips_dsp)
-
-LEAF_MIPS_DSP(BYTE_MUL_asm_mips_dsp)
-/*
- * a0 - uint x (Value to multiply)
- * a1 - uint a (Multiplicator byte)
- */
-
-    .set reorder
-    replv.ph          a1, a1         /* a1 = 0x00a00a */
-    li                t4, 8388736    /* t4 = 0x800080 */
-    muleu_s.ph.qbl    t0, a0, a1
-    muleu_s.ph.qbr    t2, a0, a1
-    preceu.ph.qbla    t1, t0
-    addu              t0, t0, t1
-    addu              t0, t0, t4
-    preceu.ph.qbla    t3, t2
-    addu              t2, t2, t3
-    addu              t2, t2, t4
-    precrq.qb.ph      t4, t0, t2
-    move              v0, t4
-    j                 ra
-
-END(BYTE_MUL_asm_mips_dsp)
-
 LEAF_MIPS_DSP(destfetchARGB32_asm_mips_dsp)
 /*
  * a0 - buffer address (dst)
@@ -349,7 +288,7 @@ LEAF_MIPS_DSP(comp_func_SourceOver_asm_mips_dsp)
 
 END(comp_func_SourceOver_asm_mips_dsp)
 
-LEAF_MIPS_DSP(qt_destStoreARGB32_asm_mips_dsp)
+LEAF_MIPS_DSPR2(qt_destStoreARGB32_asm_mips_dsp)
 /*
  * a0 - uint * data
  * a1 - const uint *buffer
@@ -422,3 +361,1243 @@ LEAF_MIPS_DSP(qt_destStoreARGB32_asm_mips_dsp)
      nop
 
 END(qt_destStoreARGB32_asm_mips_dsp)
+
+LEAF_MIPS_DSP(comp_func_solid_Source_dsp_asm_x2)
+/*
+ * Walks dest two pixels per iteration and stores, for each pixel,
+ * color + (pixel scaled by ialpha). The scaling is done by the BYTE_MUL_x2
+ * macro, which is not defined in the visible part of this file; presumably
+ * it mirrors the C helper BYTE_MUL for two pixels at once - confirm against
+ * qt_mips_asm_dsp.h.
+ * The counter a1 is decremented by 2 and the loop ends when it reaches
+ * zero, so a non-zero length has to be even.
+ *
+ * a0 - uint *dest (read and written in place)
+ * a1 - int length
+ * a2 - uint color
+ * a3 - uint ialpha
+ */
+
+    beqz              a1, 2f        /* nothing to do for length == 0 */
+     nop
+    replv.ph          a3, a3        /* replicate ialpha into both halfwords */
+    li                t9, 8388736    /* t9 = 0x800080 */
+1:
+    lw                t0, 0(a0)
+    lw                t1, 4(a0)
+    or                t2, t0, t1    /* both dest pixels zero: skip the multiply, the result is just color */
+    beqz              t2, 12f
+     addiu             a1, -2
+
+    BYTE_MUL_x2 t0, t1, t6, t7, a3, a3, t9, t2, t3, t4, t5, 0
+/* label 11 is not the target of any branch in this routine */
+11:
+    addu              t2, a2, t6
+    addu              t3, a2, t7
+    sw                t2, 0(a0)
+    sw                t3, 4(a0)
+    bnez              a1, 1b
+     addiu             a0, 8
+/*
+ * NOTE(review): unlike the other branches here, this one has no explicit
+ * delay-slot instruction. If the assembler is not reordering, the addu at
+ * label 12 runs in the slot; it only writes the scratch register t2, so it
+ * looks harmless - confirm.
+ */
+    b                 2f
+12:
+    addu              t2, a2, t0
+    addu              t3, a2, t1
+    sw                t2, 0(a0)
+    sw                t3, 4(a0)
+    bnez              a1, 1b
+     addiu             a0, 8
+2:
+    jr                ra
+     nop
+
+END(comp_func_solid_Source_dsp_asm_x2)
+
+LEAF_MIPS_DSP(comp_func_solid_DestinationOver_dsp_asm_x2)
+/*
+ * Walks dest two pixels per iteration and stores, for each pixel d,
+ * d + (color scaled by the top byte of ~d). The scaling is done by the
+ * BYTE_MUL_x2 macro, which is not defined in the visible part of this file.
+ * Returns immediately when length or color is zero. The counter a1 is
+ * decremented by 2 and the loop ends when it reaches zero, so a non-zero
+ * length has to be even.
+ *
+ * a0 - uint *dest
+ * a1 - int length
+ * a2 - uint color
+ *
+ * a3 is not an input; it is used as the second result register of
+ * BYTE_MUL_x2.
+ * NOTE(review): s0/s1 are saved and restored but are not referenced by name
+ * anywhere else in this routine - the save may be unnecessary unless
+ * BYTE_MUL_x2 touches them; confirm.
+ */
+
+    addiu             sp, sp, -8
+    sw                s0, 0(sp)
+    sw                s1, 4(sp)
+    beqz              a1, 2f
+     nop
+    beqz              a2, 2f
+     nop
+    li                t9, 8388736    /* t9 = 0x800080 */
+
+1:
+    lw                t0, 0(a0)
+    lw                t1, 4(a0)
+    not               t2, t0
+    not               t3, t1
+    srl               t4, t2, 24    /* t4 = top byte of ~dest 1 */
+    srl               t5, t3, 24    /* t5 = top byte of ~dest 2 */
+    or                t2, t4, t5    /* both inverse alphas zero: store both dest pixels back unchanged */
+    beqz              t2, 11f
+     addiu             a1, -2
+    replv.ph          t2, t4
+    replv.ph          t3, t5
+
+    BYTE_MUL_x2 a2, a2, t8, a3, t2, t3, t9, t4, t5, t6, t7
+
+    addu              t0, t0, t8
+    addu              t1, t1, a3
+11:
+    sw                t0, 0(a0)
+    sw                t1, 4(a0)
+    bnez              a1, 1b
+     addiu             a0, 8
+
+2:
+    lw                s0, 0(sp)
+    lw                s1, 4(sp)
+    addiu             sp, sp, 8
+    jr                ra
+     nop
+
+END(comp_func_solid_DestinationOver_dsp_asm_x2)
+
+LEAF_MIPS_DSP(comp_func_DestinationOver_dsp_asm_x2)
+/*
+ * Walks dest and src two pixels per iteration and stores, for each pixel d,
+ * d + (s scaled by the top byte of ~d). s is the source pixel when
+ * const_alpha is 255 and the source pixel scaled by const_alpha otherwise.
+ * The scaling is done by the BYTE_MUL_x2 macro, which is not defined in the
+ * visible part of this file.
+ * The counter a2 is decremented by 2 and the loops end when it reaches zero,
+ * so a non-zero length has to be even.
+ *
+ * a0 - uint *dest
+ * a1 - uint *src
+ * a2 - int length
+ * a3 - uint const_alpha
+ *
+ * AT is used as a general-purpose register (hence .set noat); s0/s1 are
+ * saved on the stack and restored before returning.
+ */
+
+    .set              noat
+    addiu             sp, sp, -8
+    sw                s0, 0(sp)
+    sw                s1, 4(sp)
+    beqz              a2, 3f
+     nop
+    li                t9, 8388736    /* t9 = 0x800080 */
+    li                t0, 0xff
+    beq               a3, t0, 2f
+     nop
+
+/* part where const_alpha != 255 */
+1:
+    replv.ph          a3, a3
+11:
+    lw                t0, 0(a1)     # src_1
+    lw                t1, 4(a1)     # src_2
+    addiu             a2, -2
+
+    BYTE_MUL_x2 t0, t1, t8, AT, a3, a3, t9, t4, t5, t6, t7, 0
+                                    # t8 = s1
+                                    # AT = s2
+    lw                t0, 0(a0)     # dest_1
+    lw                t1, 4(a0)     # dest_2
+    addiu             a1, 8
+    not               t2, t0
+    not               t3, t1
+    srl               t4, t2, 24
+    srl               t5, t3, 24
+    replv.ph          t2, t4        # qAlpha(~d) 1
+    replv.ph          t3, t5        # qAlpha(~d) 2
+
+    BYTE_MUL_x2 t8, AT, s0, s1, t2, t3, t9, t4, t5, t6, t7
+
+    addu              t0, t0, s0
+    addu              t1, t1, s1
+    sw                t0, 0(a0)
+    sw                t1, 4(a0)
+    bnez              a2, 11b
+     addiu             a0, 8
+    b                 3f
+     nop
+
+/* part where const_alpha = 255 */
+2:
+    lw                t0, 0(a0)     # dest 1
+    lw                t1, 4(a0)     # dest 2
+    lw                s0, 0(a1)     # src 1
+    lw                s1, 4(a1)     # src 2
+    not               t2, t0
+    not               t3, t1
+    srl               t4, t2, 24
+    srl               t5, t3, 24
+    replv.ph          t2, t4        # qAlpha(~d) 1
+    replv.ph          t3, t5        # qAlpha(~d) 2
+    addiu             a1, 8
+    addiu             a2, -2
+
+    BYTE_MUL_x2 s0, s1, t8, AT, t2, t3, t9, t4, t5, t6, t7
+
+    addu              t0, t0, t8
+    addu              t1, t1, AT
+    sw                t0, 0(a0)
+    sw                t1, 4(a0)
+    bnez              a2, 2b
+     addiu             a0, 8
+
+3:
+    lw                s0, 0(sp)
+    lw                s1, 4(sp)
+    addiu             sp, sp, 8
+    jr                ra
+     nop
+    .set              at
+
+END(comp_func_DestinationOver_dsp_asm_x2)
+
+LEAF_MIPS_DSP(comp_func_solid_SourceIn_dsp_asm_x2)
+/*
+ * Walks dest two pixels per iteration.
+ *   const_alpha == 255: each pixel d is replaced by color scaled by the top
+ *                       byte of d (BYTE_MUL_x2).
+ *   const_alpha != 255: color is first scaled by const_alpha, then each
+ *                       pixel is replaced by the INTERPOLATE_PIXEL_255 macro
+ *                       applied to (color, top byte of d, d, 255 - const_alpha).
+ * BYTE_MUL, BYTE_MUL_x2 and INTERPOLATE_PIXEL_255 are macros that are not
+ * defined in the visible part of this file; presumably they mirror the C
+ * helpers of the same names - confirm against qt_mips_asm_dsp.h.
+ * The counter a1 is decremented by 2 and the loops end when it reaches zero,
+ * so a non-zero length has to be even.
+ *
+ * a0 - uint *dest
+ * a1 - int length
+ * a2 - uint color
+ * a3 - uint const_alpha
+ *
+ * AT is used as a general-purpose register (hence .set noat); s0-s2 are
+ * saved on the stack and restored before returning.
+ */
+
+    .set              noat
+    addiu             sp, -12
+    sw                s0, 0(sp)
+    sw                s1, 4(sp)
+    sw                s2, 8(sp)
+    beqz              a1, 3f
+     nop
+    li                t9, 8388736             /* t9 = 0x800080 (rounding_factor) */
+    lui               t8, 0xff00
+    li                t0, 0xff
+    beq               a3, t0, 2f
+     ori               t8, t8, 0xff00         /* t8 = 0xff00ff00 (andi_factor) */
+
+/* part where const_alpha != 255 */
+1:
+    replv.ph          t0, a3
+    li                t5, 0xff
+    BYTE_MUL a2, a2, t0, t9, t1, t2, t3, t4    /* a2 = color ( = BYTE_MUL(color, const_alpha)); */
+    subu              t1, t5, a3               /* t1 = cia = 255 - const_alpha */
+11:
+    lw                t2, 0(a0)                /* t2 = d */
+    lw                s0, 4(a0)                /* s0 = second d */
+    addiu             a1, -2
+    srl               t3, t2, 24               /* t3 = qAlpha(d) */
+    srl               s2, s0, 24               /* s2 = qAlpha(second d) */
+
+    INTERPOLATE_PIXEL_255 a2, t3, t2, t1, AT, t9, t8, t4, t5, t6, t7
+    INTERPOLATE_PIXEL_255 a2, s2, s0, t1, s1, t9, t8, t4, t5, t6, t7
+
+    sw                AT, 0(a0)
+    sw                s1, 4(a0)
+    bnez              a1, 11b
+     addiu            a0, 8
+    b                 3f
+     nop
+
+/* part where const_alpha = 255 */
+/* t8 is reused as a result register below; the 0xff00ff00 mask is only passed to INTERPOLATE_PIXEL_255 above */
+2:
+    lw                t0, 0(a0)                /* dest 1 */
+    lw                t1, 4(a0)                /* dest 2 */
+    srl               t4, t0, 24
+    srl               t5, t1, 24
+    replv.ph          t2, t4
+    replv.ph          t3, t5
+    addiu             a1, -2
+
+    BYTE_MUL_x2 a2, a2, t8, AT, t2, t3, t9, t4, t5, t6, t7
+
+    sw                t8, 0(a0)
+    sw                AT, 4(a0)
+    bnez              a1, 2b
+     addiu             a0, 8
+
+3:
+    lw                s0, 0(sp)
+    lw                s1, 4(sp)
+    lw                s2, 8(sp)
+    addiu             sp, 12
+    jr                ra
+     nop
+    .set              at
+
+END(comp_func_solid_SourceIn_dsp_asm_x2)
+
+LEAF_MIPS_DSP(comp_func_SourceIn_dsp_asm_x2)
+/*
+ * Walks dest and src two pixels per iteration.
+ *   const_alpha == 255: each pixel d is replaced by the source pixel scaled
+ *                       by the top byte of d (BYTE_MUL_x2).
+ *   const_alpha != 255: the source pixel is first scaled by const_alpha,
+ *                       then d is replaced by the INTERPOLATE_PIXEL_255
+ *                       macro applied to (s, top byte of d, d, 255 - const_alpha).
+ * BYTE_MUL_x2 and INTERPOLATE_PIXEL_255 are macros that are not defined in
+ * the visible part of this file; presumably they mirror the C helpers
+ * BYTE_MUL and INTERPOLATE_PIXEL_255 - confirm against qt_mips_asm_dsp.h.
+ * The counter a2 is decremented by 2 and the loops end when it reaches zero,
+ * so a non-zero length has to be even.
+ *
+ * a0 - uint *dest
+ * a1 - const uint *src
+ * a2 - int length
+ * a3 - uint const_alpha
+ *
+ * AT is used as a general-purpose register (hence .set noat); s0-s3 are
+ * saved on the stack and restored before returning.
+ */
+
+    .set              noat
+    addiu             sp, -16
+    sw                s0, 0(sp)
+    sw                s1, 4(sp)
+    sw                s2, 8(sp)
+    sw                s3, 12(sp)
+    beqz              a2, 3f
+     nop
+    li                t9, 8388736             /* t9 = 0x800080 (rounding_factor) */
+    lui               t8, 0xff00
+    li                t0, 0xff
+    beq               a3, t0, 2f
+     ori               t8, t8, 0xff00         /* t8 = 0xff00ff00 (andi_factor) */
+
+/* part where const_alpha != 255 */
+1:
+    li                t5, 0xff
+    subu              t7, t5, a3               /* t7 = cia = 255 - const_alpha */
+    replv.ph          a3, a3
+11:
+    lw                t0, 0(a1)                /* t0 = src 1 */
+    lw                t1, 4(a1)                /* t1 = src 2 */
+    addiu             a2, -2
+
+    BYTE_MUL_x2 t0, t1, AT, s0, a3, a3, t9, t3, t4, t5, t6, 0
+
+    lw                t0, 0(a0)                /* t0 = dest 1 */
+    lw                t1, 4(a0)                /* t1 = dest 2 */
+    addiu             a1, 8
+
+    srl               t2, t0, 24               /* t2 = qAlpha(d) 1 */
+    srl               t3, t1, 24               /* t3 = qAlpha(d) 2 */
+
+    INTERPOLATE_PIXEL_255 AT, t2, t0, t7, s1, t9, t8, t4, t5, t6, s3
+    INTERPOLATE_PIXEL_255 s0, t3, t1, t7, s2, t9, t8, t4, t5, t6, s3
+
+    sw                s1, 0(a0)
+    sw                s2, 4(a0)
+    bnez              a2, 11b
+     addiu            a0, 8
+    b                 3f
+     nop
+
+/* part where const_alpha = 255 */
+/* t8 is reused as a result register below; the 0xff00ff00 mask is only passed to INTERPOLATE_PIXEL_255 above */
+2:
+    lw                t2, 0(a0)                /* dest 1 */
+    lw                t3, 4(a0)                /* dest 2 */
+    lw                t0, 0(a1)                /* src 1 */
+    lw                t1, 4(a1)                /* src 2 */
+    srl               t4, t2, 24
+    srl               t5, t3, 24
+    replv.ph          t2, t4                   /* t2 = qAlpha(dest 1), replicated */
+    replv.ph          t3, t5                   /* t3 = qAlpha(dest 2), replicated */
+    addiu             a2, -2
+
+    BYTE_MUL_x2 t0, t1, t8, AT, t2, t3, t9, t4, t5, t6, t7
+
+    addiu             a1, 8
+    sw                t8, 0(a0)
+    sw                AT, 4(a0)
+    bnez              a2, 2b
+     addiu             a0, 8
+
+3:
+    lw                s0, 0(sp)
+    lw                s1, 4(sp)
+    lw                s2, 8(sp)
+    lw                s3, 12(sp)
+    addiu             sp, 16
+    jr                ra
+     nop
+    .set              at
+
+END(comp_func_SourceIn_dsp_asm_x2)
+
+LEAF_MIPS_DSP(comp_func_solid_DestinationIn_dsp_asm_x2)
+/*
+ * Walks dest two pixels per iteration and replaces each pixel by the pixel
+ * scaled by a. The scaling is done by the BYTE_MUL_x2 macro, which is not
+ * defined in the visible part of this file.
+ * The counter a1 is decremented by 2 and the loop ends when it reaches
+ * zero, so a non-zero length has to be even.
+ * The C++ wrapper comp_func_solid_DestinationOut_mips_dsp calls this routine
+ * as well, with its own precomputed a.
+ *
+ * a0 - uint *dest
+ * a1 - int length
+ * a2 - uint a
+ *
+ * AT is used as a general-purpose register (hence .set noat).
+ */
+
+    .set              noat
+    beqz              a1, 2f
+     nop
+    li                t9, 8388736             /* t9 = 0x800080 (rounding_factor) */
+    replv.ph          a2, a2                  /* replicate a into both halfwords */
+1:
+    lw                t0, 0(a0)               /* dest 1 */
+    lw                t1, 4(a0)               /* dest 2 */
+    addiu             a1, -2
+
+    BYTE_MUL_x2 t0, t1, t8, AT, a2, a2, t9, t4, t5, t6, t7, 0
+
+    sw                t8, 0(a0)
+    sw                AT, 4(a0)
+    bnez              a1, 1b
+     addiu            a0, 8
+2:
+    jr                ra
+     nop
+    .set              at
+
+END(comp_func_solid_DestinationIn_dsp_asm_x2)
+
+LEAF_MIPS_DSP(comp_func_DestinationIn_dsp_asm_x2)
+/*
+ * Walks dest and src two pixels per iteration and replaces each pixel d by
+ * d scaled by a per-pixel factor.
+ *   const_alpha == 255: factor = top byte of the source pixel
+ *   const_alpha != 255: factor = (top byte of the source pixel scaled by
+ *                       const_alpha) + (255 - const_alpha)
+ * The scaling is done by the BYTE_MUL_x2 macro, which is not defined in the
+ * visible part of this file.
+ * The counter a2 is decremented by 2 and the loops end when it reaches zero,
+ * so a non-zero length has to be even.
+ *
+ * a0 - uint *dest
+ * a1 - const uint *src
+ * a2 - int length
+ * a3 - uint const_alpha
+ *
+ * s0/s1 are saved on the stack and restored before returning.
+ */
+
+    addiu             sp, -8
+    sw                s0, 0(sp)
+    sw                s1, 4(sp)
+    beqz              a2, 3f
+     nop
+    li                t9, 8388736             /* t9 = 0x800080 (rounding_factor) */
+    li                t0, 0xff
+    beq               a3, t0, 2f
+     nop
+
+/* part where const_alpha != 255 */
+1:
+    li                t5, 0xff
+    subu              t8, t5, a3               /* t8 = cia = 255 - const_alpha */
+    replv.ph          a3, a3
+11:
+    lw                t0, 0(a1)                /* t0 = src 1 */
+    lw                t1, 4(a1)                /* t1 = src 2 */
+    addiu             a2, -2
+    srl               t0, t0, 24               /* t0 = qAlpha(src 1) */
+    srl               t1, t1, 24               /* t1 = qAlpha(src 2) */
+
+    BYTE_MUL_x2 t0, t1, s1, t7, a3, a3, t9, t3, t4, t5, t6, 0
+
+    lw                t0, 0(a0)                /* t0 = dest 1 */
+    lw                t1, 4(a0)                /* t1 = dest 2 */
+    addu              s1, s1, t8               /* a 1 */
+    addu              t7, t7, t8               /* a 2 */
+    replv.ph          t2, s1
+    replv.ph          t3, t7
+
+    BYTE_MUL_x2 t0, t1, s1, t7, t2, t3, t9, t4, t5, t6, s0
+
+    addiu             a1, 8
+    sw                s1, 0(a0)
+    sw                t7, 4(a0)
+    bnez              a2, 11b
+     addiu            a0, 8
+    b                 3f
+     nop
+
+/* part where const_alpha = 255 */
+2:
+    lw                t2, 0(a1)                /* src 1 */
+    lw                t3, 4(a1)                /* src 2 */
+    lw                t0, 0(a0)                /* dest 1 */
+    lw                t1, 4(a0)                /* dest 2 */
+    srl               t4, t2, 24
+    srl               t5, t3, 24
+    replv.ph          t2, t4                   /* t2 = qAlpha(src 1) */
+    replv.ph          t3, t5                   /* t3 = qAlpha(src 2) */
+    addiu             a2, -2
+
+    BYTE_MUL_x2 t0, t1, t8, s1, t2, t3, t9, t4, t5, t6, t7
+
+    addiu             a1, 8
+    sw                t8, 0(a0)
+    sw                s1, 4(a0)
+    bnez              a2, 2b
+     addiu             a0, 8
+
+3:
+    lw                s0, 0(sp)
+    lw                s1, 4(sp)
+    addiu             sp, 8
+    jr                ra
+     nop
+
+END(comp_func_DestinationIn_dsp_asm_x2)
+
+LEAF_MIPS_DSP(comp_func_DestinationOut_dsp_asm_x2)
+/*
+ * DestinationOut composition, two pixels per loop iteration.
+ *
+ *   const_alpha == 255:  dest = BYTE_MUL(dest, qAlpha(~src))
+ *   otherwise:           cia  = 255 - const_alpha
+ *                        a    = BYTE_MUL(qAlpha(~src), const_alpha) + cia
+ *                        dest = BYTE_MUL(dest, a)
+ *
+ * a0 - uint *dest
+ * a1 - const uint *src
+ * a2 - int length
+ * a3 - uint const_alpha
+ *
+ * NOTE(review): both loops subtract 2 from length and leave only when it
+ * is exactly 0, so length is presumably required to be even - confirm
+ * that the caller deals with an odd trailing pixel.
+ */
+
+    .set              noat                     /* AT is used as a data register below */
+    addiu             sp, -4                   /* one stack slot, for s0 */
+    sw                s0, 0(sp)
+    beqz              a2, 3f                   /* length == 0: nothing to do */
+     nop
+    li                t9, 8388736             /* t9 = 0x800080 (rounding_factor) */
+    li                t0, 0xff
+    beq               a3, t0, 2f               /* const_alpha == 255: short path */
+     nop
+
+/* part where const_alpha != 255 */
+1:
+    li                t5, 0xff
+    subu              t8, t5, a3               /* t8 = cia = 255 - const_alpha */
+    replv.ph          a3, a3                   /* replicate const_alpha for BYTE_MUL_x2 */
+11:
+    lw                t0, 0(a1)                /* t0 = src 1 */
+    lw                t1, 4(a1)                /* t1 = src 2 */
+    not               t0, t0
+    not               t1, t1
+    addiu             a2, -2                   /* two pixels consumed per iteration */
+    srl               t0, t0, 24               /* t0 = qAlpha(~src 1) */
+    srl               t1, t1, 24               /* t1 = qAlpha(~src 2) */
+
+    BYTE_MUL_x2       t0, t1, AT, t7, a3, a3, t9, t3, t4, t5, t6, 0 /* in_const = 0: a3 kept, t0/t1 overwritten */
+
+    lw                t0, 0(a0)                /* t0 = dest 1 */
+    lw                t1, 4(a0)                /* t1 = dest 2 */
+    addu              AT, AT, t8               /* a 1 */
+    addu              t7, t7, t8               /* a 2 */
+    replv.ph          t2, AT                   /* replicate a 1 as multiplier */
+    replv.ph          t3, t7                   /* replicate a 2 as multiplier */
+
+    BYTE_MUL_x2 t0, t1, AT, t7, t2, t3, t9, t4, t5, t6, s0 /* AT, t7 = BYTE_MUL(dest, a); s0 is scratch */
+
+    addiu             a1, 8
+    sw                AT, 0(a0)
+    sw                t7, 4(a0)
+    bnez              a2, 11b
+     addiu            a0, 8                    /* delay slot: advance dest */
+    b                 3f
+     nop
+
+/* part where const_alpha = 255 */
+2:
+    lw                t2, 0(a1)                /* src 1 */
+    lw                t3, 4(a1)                /* src 2 */
+    not               t2, t2
+    not               t3, t3
+    lw                t0, 0(a0)                /* dest 1 */
+    lw                t1, 4(a0)                /* dest 2 */
+    srl               t4, t2, 24
+    srl               t5, t3, 24
+    replv.ph          t2, t4                   /* t2 = replicated qAlpha(~src 1) */
+    replv.ph          t3, t5                   /* t3 = replicated qAlpha(~src 2) */
+    addiu             a2, -2
+
+    BYTE_MUL_x2 t0, t1, t8, AT, t2, t3, t9, t4, t5, t6, t7 /* t8, AT = BYTE_MUL(dest, qAlpha(~src)) */
+
+    addiu             a1, 8
+    sw                t8, 0(a0)
+    sw                AT, 4(a0)
+    bnez              a2, 2b
+     addiu             a0, 8                   /* delay slot: advance dest */
+
+3:
+    lw                s0, 0(sp)                /* restore callee-saved register */
+    addiu             sp, 4
+    jr                ra
+     nop
+    .set              at
+
+END(comp_func_DestinationOut_dsp_asm_x2)
+
+LEAF_MIPS_DSP(comp_func_solid_SourceAtop_dsp_asm_x2)
+/*
+ * Solid-colour SourceAtop composition, two pixels per loop iteration:
+ *
+ *   dest = INTERPOLATE_PIXEL_255(color, qAlpha(dest), dest, sia)
+ *
+ * a0 - uint *dest
+ * a1 - int length
+ * a2 - uint color
+ * a3 - uint sia
+ *
+ * NOTE(review): the loop subtracts 2 from length and leaves only when it
+ * is exactly 0, so length is presumably required to be even - confirm
+ * that the caller deals with an odd trailing pixel.
+ */
+
+    .set              noat                     /* AT is used as a data register below */
+    addiu             sp, -4                   /* one stack slot, for s0 (immediate form, as in the epilogue) */
+    sw                s0, 0(sp)
+    beqz              a1, 2f                   /* length == 0: nothing to do */
+     nop
+    li                t9, 8388736              /* t9 = 0x800080 (rounding_factor) */
+    lui               t8, 0xff00
+    ori               t8, t8, 0xff00           /* t8 = 0xff00ff00 (andi_factor) */
+1:
+    lw                t0, 0(a0)                /* t0 = dest 1 */
+    lw                t1, 4(a0)                /* t1 = dest 2 */
+    addiu             a1, -2                   /* two pixels consumed per iteration */
+    srl               t2, t0, 24               /* t2 = qAlpha(dest 1) */
+    srl               t3, t1, 24               /* t3 = qAlpha(dest 2) */
+
+    INTERPOLATE_PIXEL_255 a2, t2, t0, a3, AT, t9, t8, t4, t5, t6, t7 /* AT = new dest 1 */
+    INTERPOLATE_PIXEL_255 a2, t3, t1, a3, s0, t9, t8, t4, t5, t6, t7 /* s0 = new dest 2 */
+
+    sw                AT, 0(a0)
+    sw                s0, 4(a0)
+    bnez              a1, 1b
+     addiu            a0, 8                    /* delay slot: advance dest */
+2:
+    lw                s0, 0(sp)                /* restore callee-saved register */
+    addiu             sp, 4
+    jr                ra
+     nop
+    .set              at
+
+END(comp_func_solid_SourceAtop_dsp_asm_x2)
+
+LEAF_MIPS_DSP(comp_func_SourceAtop_dsp_asm_x2)
+/*
+ * SourceAtop composition, two pixels per loop iteration:
+ *
+ *   s    = src                           (const_alpha == 255)
+ *   s    = BYTE_MUL(src, const_alpha)    (otherwise)
+ *   dest = INTERPOLATE_PIXEL_255(s, qAlpha(dest), dest, qAlpha(~s))
+ *
+ * a0 - uint *dest
+ * a1 - const uint *src
+ * a2 - int length
+ * a3 - uint const_alpha
+ *
+ * s0 holds the second pixel; s1-s4 are handed to INTERPOLATE_PIXEL_255
+ * in its last four argument positions.
+ *
+ * NOTE(review): both loops subtract 2 from length and leave only when it
+ * is exactly 0, so length is presumably required to be even - confirm
+ * that the caller deals with an odd trailing pixel.
+ */
+
+    .set              noat                     /* AT is used as a data register below */
+    addiu             sp, -20                  /* five stack slots: s0-s4 */
+    sw                s0, 0(sp)
+    sw                s1, 4(sp)
+    sw                s2, 8(sp)
+    sw                s3, 12(sp)
+    sw                s4, 16(sp)
+    beqz              a2, 3f                   /* length == 0: nothing to do */
+     nop
+    li                t9, 8388736             /* t9 = 0x800080 (rounding_factor) */
+    lui               t8, 0xff00
+    li                t0, 0xff
+    beq               a3, t0, 2f               /* const_alpha == 255: skip the BYTE_MUL on src */
+     ori               t8, t8, 0xff00         /* t8 = 0xff00ff00 (andi_factor) */
+
+/* part where const_alpha != 255 */
+1:
+    replv.ph          a3, a3                   /* replicate const_alpha for BYTE_MUL_x2 */
+11:
+    lw                AT, 0(a1)                /* src 1 */
+    lw                s0, 4(a1)                /* src 2 */
+
+    BYTE_MUL_x2 AT, s0, t0, t1, a3, a3, t9, t3, t4, t5, t6, 0
+                                               /* t0, t1 = s = BYTE_MUL(src, const_alpha);
+                                                * in_const = 0: a3 kept, AT/s0 overwritten */
+
+    lw                t2, 0(a0)                /* t2 = dest 1 */
+    lw                t3, 4(a0)                /* t3 = dest 2 */
+
+    srl               t4, t2, 24               /* t4 = qAlpha(dest 1) */
+    srl               t5, t3, 24               /* t5 = qAlpha(dest 2) */
+    not               t6, t0
+    not               t7, t1
+    srl               t6, t6, 24               /* t6 = qAlpha(~s) */
+    srl               t7, t7, 24
+    addiu             a2, -2                   /* two pixels consumed per iteration */
+
+    INTERPOLATE_PIXEL_255 t0, t4, t2, t6, AT, t9, t8, s1, s2, s3, s4 /* AT = new dest 1 */
+    INTERPOLATE_PIXEL_255 t1, t5, t3, t7, s0, t9, t8, s1, s2, s3, s4 /* s0 = new dest 2 */
+
+    addiu             a1, 8
+    sw                AT, 0(a0)
+    sw                s0, 4(a0)
+    bnez              a2, 11b
+     addiu             a0, 8                   /* delay slot: advance dest */
+    b                 3f
+     nop
+
+/* part where const_alpha = 255 */
+2:
+    lw                t2, 0(a0)                /* dest 1 */
+    lw                t3, 4(a0)                /* dest 2 */
+    lw                t0, 0(a1)                /* src 1 */
+    lw                t1, 4(a1)                /* src 2 */
+    srl               t4, t2, 24               /* t4 = qAlpha(dest 1) */
+    srl               t5, t3, 24               /* t5 = qAlpha(dest 2) */
+    not               t6, t0
+    not               t7, t1
+    srl               t6, t6, 24               /* t6 = qAlpha(~src 1) */
+    srl               t7, t7, 24               /* t7 = qAlpha(~src 2) */
+    addiu             a2, -2
+
+    INTERPOLATE_PIXEL_255 t0, t4, t2, t6, AT, t9, t8, s1, s2, s3, s4 /* AT = new dest 1 */
+    INTERPOLATE_PIXEL_255 t1, t5, t3, t7, s0, t9, t8, s1, s2, s3, s4 /* s0 = new dest 2 */
+
+    addiu             a1, 8
+    sw                AT, 0(a0)
+    sw                s0, 4(a0)
+    bnez              a2, 2b
+     addiu             a0, 8                   /* delay slot: advance dest */
+
+3:
+    lw                s0, 0(sp)                /* restore callee-saved registers */
+    lw                s1, 4(sp)
+    lw                s2, 8(sp)
+    lw                s3, 12(sp)
+    lw                s4, 16(sp)
+    addiu             sp, 20
+    jr                 ra
+     nop
+    .set              at
+
+END(comp_func_SourceAtop_dsp_asm_x2)
+
+LEAF_MIPS_DSP(comp_func_solid_DestinationAtop_dsp_asm_x2)
+/*
+ * Solid-colour DestinationAtop composition, two pixels per loop
+ * iteration:
+ *
+ *   dest = INTERPOLATE_PIXEL_255(dest, a, color, qAlpha(~dest))
+ *
+ * a0 - uint *dest
+ * a1 - int length
+ * a2 - uint color
+ * a3 - uint a
+ *
+ * color and a are used exactly as passed in; any const_alpha scaling is
+ * presumably done by the caller - confirm against the C++ wrapper.
+ *
+ * NOTE(review): the loop subtracts 2 from length and leaves only when it
+ * is exactly 0, so length is presumably required to be even - confirm
+ * that the caller deals with an odd trailing pixel.
+ */
+
+    .set              noat                     /* AT is used as a data register below */
+    addiu             sp, -4                   /* one stack slot, for s0 */
+    sw                s0, 0(sp)
+    beqz              a1, 2f                   /* length == 0: nothing to do */
+     nop
+    li                t9, 8388736              /* t9 = 0x800080 (rounding_factor) */
+    lui               t8, 0xff00
+    ori               t8, t8, 0xff00           /* t8 = 0xff00ff00 (andi_factor) */
+1:
+    lw                t0, 0(a0)                /* t0 = dest 1 */
+    lw                t1, 4(a0)                /* t1 = dest 2 */
+    addiu             a1, -2                   /* two pixels consumed per iteration */
+    not               t2, t0
+    not               t3, t1
+    srl               t2, t2, 24               /* t2 = qAlpha(~(dest 1)) */
+    srl               t3, t3, 24               /* t3 = qAlpha(~(dest 2)) */
+
+    INTERPOLATE_PIXEL_255 t0, a3, a2, t2, AT, t9, t8, t4, t5, t6, t7 /* AT = new dest 1 */
+    INTERPOLATE_PIXEL_255 t1, a3, a2, t3, s0, t9, t8, t4, t5, t6, t7 /* s0 = new dest 2 */
+
+    sw                AT, 0(a0)
+    sw                s0, 4(a0)
+    bnez              a1, 1b
+     addiu            a0, 8                    /* delay slot: advance dest */
+2:
+    lw                s0, 0(sp)                /* restore callee-saved register */
+    addiu              sp, 4
+    jr                ra
+     nop
+    .set              at
+
+END(comp_func_solid_DestinationAtop_dsp_asm_x2)
+
+LEAF_MIPS_DSP(comp_func_DestinationAtop_dsp_asm_x2)
+/*
+ * DestinationAtop composition, two pixels per loop iteration.
+ *
+ *   const_alpha == 255:
+ *     dest = INTERPOLATE_PIXEL_255(dest, qAlpha(src), src, qAlpha(~dest))
+ *   otherwise:
+ *     cia  = 255 - const_alpha
+ *     s    = BYTE_MUL(src, const_alpha)
+ *     a    = qAlpha(s) + cia
+ *     dest = INTERPOLATE_PIXEL_255(dest, a, s, qAlpha(~dest))
+ *
+ * a0 - uint *dest
+ * a1 - const uint *src
+ * a2 - int length
+ * a3 - uint const_alpha
+ *
+ * NOTE(review): both loops subtract 2 from length and leave only when it
+ * is exactly 0, so length is presumably required to be even - confirm
+ * that the caller deals with an odd trailing pixel.
+ */
+
+    .set              noat                     /* AT is used as a data register below */
+    addiu             sp, -24                  /* six stack slots: s0-s5 */
+    sw                s0, 0(sp)
+    sw                s1, 4(sp)
+    sw                s2, 8(sp)
+    sw                s3, 12(sp)
+    sw                s4, 16(sp)
+    sw                s5, 20(sp)
+    beqz              a2, 3f                   /* length == 0: nothing to do */
+     nop
+    li                t9, 8388736             /* t9 = 0x800080 (rounding_factor) */
+    lui               t8, 0xff00
+    li                t0, 0xff
+    beq               a3, t0, 2f               /* const_alpha == 255: short path */
+     ori               t8, t8, 0xff00         /* t8 = 0xff00ff00 (andi_factor) */
+
+/* part where const_alpha != 255 */
+1:
+    li                s5, 0xff
+    subu              s5, s5, a3               /* s5 = cia = 255 - const_alpha */
+    replv.ph          a3, a3                   /* replicate const_alpha for BYTE_MUL_x2 */
+11:
+    lw                AT, 0(a1)                /* src 1 */
+    lw                s0, 4(a1)                /* src 2 */
+
+    BYTE_MUL_x2 AT, s0, t0, t1, a3, a3, t9, t3, t4, t5, t6, 0
+                                               /* t0, t1 = s = BYTE_MUL(src, const_alpha);
+                                                * in_const = 0: a3 kept, AT/s0 overwritten */
+
+    lw                t2, 0(a0)                /* t2 = dest 1 */
+    lw                t3, 4(a0)                /* t3 = dest 2 */
+
+    not               t4, t2
+    not               t5, t3
+    srl               t4, t4, 24               /* t4 = qAlpha(~(dest 1)) */
+    srl               t5, t5, 24               /* t5 = qAlpha(~(dest 2)) */
+    srl               t6, t0, 24
+    srl               t7, t1, 24
+    addu              t6, t6, s5               /* t6 = a = qAlpha(s1) + cia */
+    addu              t7, t7, s5               /* t7 = a = qAlpha(s2) + cia */
+    addiu             a2, -2                   /* two pixels consumed per iteration */
+
+    INTERPOLATE_PIXEL_255 t2, t6, t0, t4, AT, t9, t8, s1, s2, s3, s4 /* AT = new dest 1 */
+    INTERPOLATE_PIXEL_255 t3, t7, t1, t5, s0, t9, t8, s1, s2, s3, s4 /* s0 = new dest 2 */
+
+    addiu             a1, 8
+    sw                AT, 0(a0)
+    sw                s0, 4(a0)
+    bnez              a2, 11b
+     addiu             a0, 8                   /* delay slot: advance dest */
+    b                 3f
+     nop
+
+/* part where const_alpha = 255 */
+2:
+    lw                t2, 0(a0)                /* d1 */
+    lw                t3, 4(a0)                /* d2 */
+    lw                t0, 0(a1)                /* s1 */
+    lw                t1, 4(a1)                /* s2 */
+    srl               t4, t0, 24               /* t4 = qAlpha(s1) */
+    srl               t5, t1, 24               /* t5 = qAlpha(s2) */
+    not               t6, t2
+    not               t7, t3
+    srl               t6, t6, 24               /* qAlpha(~d1) */
+    srl               t7, t7, 24               /* qAlpha(~d2) */
+    addiu             a2, -2
+
+    INTERPOLATE_PIXEL_255 t2, t4, t0, t6, AT, t9, t8, s1, s2, s3, s4 /* AT = new dest 1 */
+    INTERPOLATE_PIXEL_255 t3, t5, t1, t7, s0, t9, t8, s1, s2, s3, s4 /* s0 = new dest 2 */
+
+    addiu             a1, 8
+    sw                AT, 0(a0)
+    sw                s0, 4(a0)
+    bnez              a2, 2b
+     addiu             a0, 8                   /* delay slot: advance dest */
+
+3:
+    lw                s0, 0(sp)                /* restore callee-saved registers */
+    lw                s1, 4(sp)
+    lw                s2, 8(sp)
+    lw                s3, 12(sp)
+    lw                s4, 16(sp)
+    lw                s5, 20(sp)
+    addiu             sp, 24
+    jr                ra
+     nop
+    .set              at
+
+END(comp_func_DestinationAtop_dsp_asm_x2)
+
+LEAF_MIPS_DSP(comp_func_solid_XOR_dsp_asm_x2)
+/*
+ * Solid-colour XOR composition, two pixels per loop iteration:
+ *
+ *   dest = INTERPOLATE_PIXEL_255(color, qAlpha(~dest), dest, sia)
+ *
+ * a0 - uint *dest
+ * a1 - int length
+ * a2 - uint color
+ * a3 - uint sia
+ *
+ * NOTE(review): the loop subtracts 2 from length and leaves only when it
+ * is exactly 0, so length is presumably required to be even - confirm
+ * that the caller deals with an odd trailing pixel.
+ */
+
+    .set              noat                     /* AT is used as a data register below */
+    addiu             sp, -4                   /* one stack slot, for s0 (immediate form, as in the sibling routines) */
+    sw                s0, 0(sp)
+    beqz              a1, 2f                   /* length == 0: nothing to do */
+     nop
+    li                t9, 8388736              /* t9 = 0x800080 (rounding_factor) */
+    lui               t8, 0xff00
+    ori               t8, t8, 0xff00           /* t8 = 0xff00ff00 (andi_factor) */
+1:
+    lw                t0, 0(a0)                /* t0 = dest 1 */
+    lw                t1, 4(a0)                /* t1 = dest 2 */
+    addiu             a1, -2                   /* two pixels consumed per iteration */
+    not               t2, t0
+    not               t3, t1
+    srl               t2, t2, 24               /* t2 = qAlpha(~(dest 1)) */
+    srl               t3, t3, 24               /* t3 = qAlpha(~(dest 2)) */
+
+    INTERPOLATE_PIXEL_255 a2, t2, t0, a3, AT, t9, t8, t4, t5, t6, t7 /* AT = new dest 1 */
+    INTERPOLATE_PIXEL_255 a2, t3, t1, a3, s0, t9, t8, t4, t5, t6, t7 /* s0 = new dest 2 */
+
+    sw                AT, 0(a0)
+    sw                s0, 4(a0)
+    bnez              a1, 1b
+     addiu            a0, 8                    /* delay slot: advance dest */
+2:
+    lw                s0, 0(sp)                /* restore callee-saved register */
+    addiu             sp, 4
+    jr                ra
+     nop
+    .set              at
+
+END(comp_func_solid_XOR_dsp_asm_x2)
+
+LEAF_MIPS_DSP(comp_func_XOR_dsp_asm_x2)
+/*
+ * XOR composition, two pixels per loop iteration:
+ *
+ *   s    = src                           (const_alpha == 255)
+ *   s    = BYTE_MUL(src, const_alpha)    (otherwise)
+ *   dest = INTERPOLATE_PIXEL_255(s, qAlpha(~dest), dest, qAlpha(~s))
+ *
+ * a0 - uint *dest
+ * a1 - const uint *src
+ * a2 - int length
+ * a3 - uint const_alpha
+ *
+ * NOTE(review): both loops subtract 2 from length and leave only when it
+ * is exactly 0, so length is presumably required to be even - confirm
+ * that the caller deals with an odd trailing pixel.
+ */
+
+    .set              noat                     /* AT is used as a data register below */
+    addiu             sp, -20                  /* five stack slots: s0-s4 */
+    sw                s0, 0(sp)
+    sw                s1, 4(sp)
+    sw                s2, 8(sp)
+    sw                s3, 12(sp)
+    sw                s4, 16(sp)
+    beqz              a2, 3f                   /* length == 0: nothing to do */
+     nop
+    li                t9, 8388736             /* t9 = 0x800080 (rounding_factor) */
+    lui               t8, 0xff00
+    li                t0, 0xff
+    beq               a3, t0, 2f               /* const_alpha == 255: skip the BYTE_MUL on src */
+     ori               t8, t8, 0xff00         /* t8 = 0xff00ff00 (andi_factor) */
+
+/* part where const_alpha != 255 */
+1:
+    replv.ph          a3, a3                   /* replicate const_alpha for BYTE_MUL_x2 */
+11:
+    lw                AT, 0(a1)                /* src 1 */
+    lw                s0, 4(a1)                /* src 2 */
+
+    BYTE_MUL_x2 AT, s0, t0, t1, a3, a3, t9, t3, t4, t5, t6, 0
+                                               /* t0 = s1 */
+                                               /* t1 = s2 */
+
+    lw                t2, 0(a0)                /* t2 = dest 1 */
+    lw                t3, 4(a0)                /* t3 = dest 2 */
+
+    not               t4, t2
+    not               t5, t3
+    srl               t4, t4, 24               /* t4 = qAlpha(~(dest 1)) */
+    srl               t5, t5, 24               /* t5 = qAlpha(~(dest 2)) */
+    not               t6, t0
+    not               t7, t1
+    srl               t6, t6, 24               /* t6 = qAlpha(~s) */
+    srl               t7, t7, 24
+    addiu             a2, -2                   /* two pixels consumed per iteration */
+
+    INTERPOLATE_PIXEL_255 t0, t4, t2, t6, AT, t9, t8, s1, s2, s3, s4 /* AT = new dest 1 */
+    INTERPOLATE_PIXEL_255 t1, t5, t3, t7, s0, t9, t8, s1, s2, s3, s4 /* s0 = new dest 2 */
+
+    addiu             a1, 8
+    sw                AT, 0(a0)
+    sw                s0, 4(a0)
+    bnez              a2, 11b
+     addiu             a0, 8                   /* delay slot: advance dest */
+    b                 3f
+     nop
+
+/* part where const_alpha = 255 */
+2:
+    lw                t2, 0(a0)                /* d1 */
+    lw                t3, 4(a0)                /* d2 */
+    lw                t0, 0(a1)                /* s1 */
+    lw                t1, 4(a1)                /* s2 */
+    not               t4, t0
+    not               t5, t1
+    srl               t4, t4, 24               /* t4 = qAlpha(~s1) */
+    srl               t5, t5, 24               /* t5 = qAlpha(~s2) */
+    not               t6, t2
+    not               t7, t3
+    srl               t6, t6, 24               /* qAlpha(~d1) */
+    srl               t7, t7, 24               /* qAlpha(~d2) */
+    addiu             a2, -2
+
+    INTERPOLATE_PIXEL_255 t0, t6, t2, t4, AT, t9, t8, s1, s2, s3, s4 /* AT = new dest 1 */
+    INTERPOLATE_PIXEL_255 t1, t7, t3, t5, s0, t9, t8, s1, s2, s3, s4 /* s0 = new dest 2 */
+
+    addiu             a1, 8
+    sw                AT, 0(a0)
+    sw                s0, 4(a0)
+    bnez              a2, 2b
+     addiu             a0, 8                   /* delay slot: advance dest */
+
+3:
+    lw                s0, 0(sp)                /* restore callee-saved registers */
+    lw                s1, 4(sp)
+    lw                s2, 8(sp)
+    lw                s3, 12(sp)
+    lw                s4, 16(sp)
+    addiu             sp, 20
+    jr                ra
+     nop
+    .set              at
+
+END(comp_func_XOR_dsp_asm_x2)
+
+LEAF_MIPS_DSP(comp_func_solid_SourceOut_dsp_asm_x2)
+/*
+ * Solid-colour SourceOut composition, two pixels per loop iteration.
+ *
+ *   const_alpha == 255:  dest  = BYTE_MUL(color, qAlpha(~dest))
+ *   otherwise:           color = BYTE_MUL(color, const_alpha)
+ *                        cia   = 255 - const_alpha
+ *                        dest  = INTERPOLATE_PIXEL_255(color, qAlpha(~dest),
+ *                                                      dest, cia)
+ *
+ * a0 - uint *dest
+ * a1 - int length
+ * a2 - uint color
+ * a3 - uint const_alpha
+ *
+ * NOTE(review): both loops subtract 2 from length and leave only when it
+ * is exactly 0, so length is presumably required to be even - confirm
+ * that the caller deals with an odd trailing pixel.
+ */
+
+    .set              noat                     /* AT is used as a data register below */
+    addiu             sp, -12                  /* three stack slots: s0-s2 */
+    sw                s0, 0(sp)
+    sw                s1, 4(sp)
+    sw                s2, 8(sp)
+    beqz              a1, 3f                   /* length == 0: nothing to do */
+     nop
+    li                t9, 8388736             /* t9 = 0x800080 (rounding_factor) */
+    lui               t8, 0xff00
+    li                t0, 0xff
+    beq               a3, t0, 2f               /* const_alpha == 255: short path */
+     ori               t8, t8, 0xff00         /* t8 = 0xff00ff00 (andi_factor) */
+
+/* part where const_alpha != 255 */
+1:
+    replv.ph          t0, a3                   /* t0 = replicated const_alpha */
+    li                t5, 0xff
+    BYTE_MUL a2, a2, t0, t9, t1, t2, t3, t4    /* a2 = color ( = BYTE_MUL(color, const_alpha)); */
+    subu              t1, t5, a3               /* t1 = cia = 255 - const_alpha */
+11:
+    lw                t2, 0(a0)                /* t2 = d1 */
+    lw                s0, 4(a0)                /* s0 = d2 */
+    addiu             a1, -2                   /* two pixels consumed per iteration */
+    not               t3, t2
+    not               s2, s0
+    srl               t3, t3, 24               /* t3 = qAlpha(~d1) */
+    srl               s2, s2, 24               /* s2 = qAlpha(~d2) */
+
+    INTERPOLATE_PIXEL_255 a2, t3, t2, t1, AT, t9, t8, t4, t5, t6, t7 /* AT = new d1 */
+    INTERPOLATE_PIXEL_255 a2, s2, s0, t1, s1, t9, t8, t4, t5, t6, t7 /* s1 = new d2 */
+
+    sw                AT, 0(a0)
+    sw                s1, 4(a0)
+    bnez              a1, 11b
+     addiu            a0, 8                    /* delay slot: advance dest */
+    b                 3f
+     nop
+
+/* part where const_alpha = 255 */
+2:
+    lw                t0, 0(a0)                /* dest 1 */
+    lw                t1, 4(a0)                /* dest 2 */
+    not               t4, t0
+    not               t5, t1
+    srl               t4, t4, 24               /* t4 = qAlpha(~d1) */
+    srl               t5, t5, 24               /* t5 = qAlpha(~d2) */
+    replv.ph          t2, t4                   /* multipliers are rebuilt every iteration */
+    replv.ph          t3, t5
+    addiu             a1, -2
+
+    BYTE_MUL_x2 a2, a2, t8, AT, t2, t3, t9, t4, t5, t6, t7 /* default in_const: a2 kept; t8 reused as output */
+
+    sw                t8, 0(a0)
+    sw                AT, 4(a0)
+    bnez              a1, 2b
+     addiu             a0, 8                   /* delay slot: advance dest */
+
+3:
+    lw                s0, 0(sp)                /* restore callee-saved registers */
+    lw                s1, 4(sp)
+    lw                s2, 8(sp)
+    addiu             sp, 12
+    jr                ra
+     nop
+    .set              at
+
+END(comp_func_solid_SourceOut_dsp_asm_x2)
+
+LEAF_MIPS_DSP(comp_func_SourceOut_dsp_asm_x2)
+/*
+ * SourceOut composition, two pixels per loop iteration.
+ *
+ *   const_alpha == 255:  dest = BYTE_MUL(src, qAlpha(~dest))
+ *   otherwise:           cia  = 255 - const_alpha
+ *                        s    = BYTE_MUL(src, const_alpha)
+ *                        dest = INTERPOLATE_PIXEL_255(s, qAlpha(~dest),
+ *                                                     dest, cia)
+ *
+ * a0 - uint *dest
+ * a1 - const uint *src
+ * a2 - int length
+ * a3 - uint const_alpha
+ *
+ * NOTE(review): both loops subtract 2 from length and leave only when it
+ * is exactly 0, so length is presumably required to be even - confirm
+ * that the caller deals with an odd trailing pixel.
+ */
+
+    .set              noat                     /* AT is used as a data register below */
+    addiu             sp, -16                  /* four stack slots: s0-s3 */
+    sw                s0, 0(sp)
+    sw                s1, 4(sp)
+    sw                s2, 8(sp)
+    sw                s3, 12(sp)
+    beqz              a2, 3f                   /* length == 0: nothing to do */
+     nop
+    li                t9, 8388736             /* t9 = 0x800080 (rounding_factor) */
+    lui               t8, 0xff00
+    li                t0, 0xff
+    beq               a3, t0, 2f               /* const_alpha == 255: short path */
+     ori               t8, t8, 0xff00         /* t8 = 0xff00ff00 (andi_factor) */
+
+/* part where const_alpha != 255 */
+1:
+    li                t5, 0xff
+    subu              t7, t5, a3               /* t7 = cia = 255 - const_alpha */
+    replv.ph          a3, a3                   /* replicate const_alpha for BYTE_MUL_x2 */
+11:
+    lw                t0, 0(a1)                /* t0 = src 1 */
+    lw                t1, 4(a1)                /* t1 = src 2 */
+    addiu             a2, -2                   /* two pixels consumed per iteration */
+
+    BYTE_MUL_x2 t0, t1, AT, s0, a3, a3, t9, t3, t4, t5, t6, 0 /* AT, s0 = s; in_const = 0: a3 kept */
+
+    lw                t0, 0(a0)                /* t0 = dest 1 */
+    lw                t1, 4(a0)                /* t1 = dest 2 */
+    addiu             a1, 8
+
+    not               t2, t0
+    not               t3, t1
+    srl               t2, t2, 24               /* t2 = qAlpha(~d1) */
+    srl               t3, t3, 24               /* t3 = qAlpha(~d2) */
+
+    INTERPOLATE_PIXEL_255 AT, t2, t0, t7, s1, t9, t8, t4, t5, t6, s3 /* s1 = new d1 */
+    INTERPOLATE_PIXEL_255 s0, t3, t1, t7, s2, t9, t8, t4, t5, t6, s3 /* s2 = new d2 */
+
+    sw                s1, 0(a0)
+    sw                s2, 4(a0)
+    bnez              a2, 11b
+     addiu            a0, 8                    /* delay slot: advance dest */
+    b                 3f
+     nop
+
+/* part where const_alpha = 255 */
+2:
+    lw                t2, 0(a0)                /* dest 1 */
+    lw                t3, 4(a0)                /* dest 2 */
+    lw                t0, 0(a1)                /* src 1 */
+    lw                t1, 4(a1)                /* src 2 */
+    not               t4, t2
+    not               t5, t3
+    srl               t4, t4, 24               /* qAlpha(~d1) */
+    srl               t5, t5, 24               /* qAlpha(~d2) */
+    replv.ph          t2, t4                   /* multipliers are rebuilt every iteration */
+    replv.ph          t3, t5
+    addiu             a2, -2
+
+    BYTE_MUL_x2 t0, t1, t8, AT, t2, t3, t9, t4, t5, t6, t7 /* t8, AT = BYTE_MUL(src, qAlpha(~dest)) */
+
+    addiu             a1, 8
+    sw                t8, 0(a0)
+    sw                AT, 4(a0)
+    bnez              a2, 2b
+     addiu             a0, 8                   /* delay slot: advance dest */
+
+3:
+    lw                s0, 0(sp)                /* restore callee-saved registers */
+    lw                s1, 4(sp)
+    lw                s2, 8(sp)
+    lw                s3, 12(sp)
+    addiu             sp, 16
+    jr                 ra
+     nop
+    .set              at
+
+END(comp_func_SourceOut_dsp_asm_x2)
+
+LEAF_MIPS_DSP(comp_func_Source_dsp_asm_x2)
+/*
+ * Source composition with constant alpha, two pixels per loop iteration:
+ *
+ *   ialpha = 255 - const_alpha
+ *   dest   = INTERPOLATE_PIXEL_255(src, const_alpha, dest, ialpha)
+ *
+ * a0 - uint *dest
+ * a1 - const uint *src
+ * a2 - int length
+ * a3 - uint const_alpha
+ *
+ * There is no separate const_alpha == 255 path in this routine.
+ *
+ * NOTE(review): the loop subtracts 2 from length and leaves only when it
+ * is exactly 0, so length is presumably required to be even - confirm
+ * that the caller deals with an odd trailing pixel.
+ */
+
+    .set              noat                     /* AT is used as a data register below */
+    addiu             sp, -8                   /* two stack slots: s0, s1 */
+    sw                s0, 0(sp)
+    sw                s1, 4(sp)
+    beqz              a2, 2f                   /* length == 0: nothing to do */
+     nop
+    li                t9, 8388736              /* t9 = 0x800080 (rounding_factor) */
+    lui               t8, 0xff00
+    ori               t8, t8, 0xff00           /* t8 = 0xff00ff00 (andi_factor) */
+    li                t7, 0xff
+    subu              t7, t7, a3               /* t7 = ialpha */
+1:
+    lw                t0, 0(a0)                /* t0 = dest 1 */
+    lw                t1, 4(a0)                /* t1 = dest 2 */
+    lw                t2, 0(a1)                /* t2 = src 1 */
+    lw                t3, 4(a1)                /* t3 = src 2 */
+    addiu             a2, -2                   /* two pixels consumed per iteration */
+    addiu             a1, 8
+
+    INTERPOLATE_PIXEL_255 t2, a3, t0, t7, AT, t9, t8, t4, t5, t6, s1 /* AT = new dest 1 */
+    INTERPOLATE_PIXEL_255 t3, a3, t1, t7, s0, t9, t8, t4, t5, t6, s1 /* s0 = new dest 2 */
+
+    sw                AT, 0(a0)
+    sw                s0, 4(a0)
+    bnez              a2, 1b
+     addiu            a0, 8                    /* delay slot: advance dest */
+2:
+    lw                s0, 0(sp)                /* restore callee-saved registers */
+    lw                s1, 4(sp)
+    addiu             sp, 8
+    jr                ra
+     nop
+    .set              at
+
+END(comp_func_Source_dsp_asm_x2)
+
+LEAF_MIPS_DSP(qt_blend_argb32_on_argb32_mips_dsp_asm_x2)
+/*
+ * ARGB32-on-ARGB32 blend with constant alpha, two pixels per loop
+ * iteration:
+ *
+ *   s    = BYTE_MUL(src, const_alpha)
+ *   dest = s + BYTE_MUL(dest, qAlpha(~s))
+ *
+ * a0 - uint *dest
+ * a1 - const uint *src
+ * a2 - int length
+ * a3 - uint const_alpha
+ *
+ * NOTE(review): the loop subtracts 2 from length and leaves only when it
+ * is exactly 0, so length is presumably required to be even - confirm
+ * that the caller deals with an odd trailing pixel.
+ */
+
+    .set              noat                     /* AT is used as a data register below */
+    addiu             sp, -12                  /* three stack slots: s0-s2 */
+    sw                s0, 0(sp)
+    sw                s1, 4(sp)
+    sw                s2, 8(sp)
+    beqz              a2, 2f                   /* length == 0: nothing to do */
+     nop
+    replv.ph          a3, a3                   /* replicate const_alpha for BYTE_MUL_x2 */
+    li                t9, 8388736             /* t9 = 0x800080 (rounding_factor) */
+
+1:
+    lw                t0, 0(a1)                /* t0 = src 1 */
+    lw                t1, 4(a1)                /* t1 = src 2 */
+    addiu             a2, -2                   /* two pixels consumed per iteration */
+
+    BYTE_MUL_x2       t0, t1, AT, t7, a3, a3, t9, t3, t4, t5, t6, 0 /* AT, t7 = s; in_const = 0: a3 kept */
+
+    lw                t0, 0(a0)                /* t0 = dest 1 */
+    lw                t1, 4(a0)                /* t1 = dest 2 */
+    not               s1, AT
+    not               s2, t7
+    srl               s1, s1, 24               /* s1 = qAlpha(~s1) */
+    srl               s2, s2, 24               /* s2 = qAlpha(~s2) */
+    replv.ph          s1, s1                   /* replicate as multipliers */
+    replv.ph          s2, s2
+
+    BYTE_MUL_x2 t0, t1, t2, t3, s1, s2, t9, t4, t5, t6, s0 /* t2, t3 = BYTE_MUL(dest, qAlpha(~s)) */
+
+    addiu             a1, 8
+    addu              AT, AT, t2               /* dest 1 = s1 + scaled dest 1 */
+    addu              t7, t7, t3               /* dest 2 = s2 + scaled dest 2 */
+    sw                AT, 0(a0)
+    sw                t7, 4(a0)
+    bnez              a2, 1b
+     addiu            a0, 8                    /* delay slot: advance dest */
+
+2:
+    lw                s0, 0(sp)                /* restore callee-saved registers */
+    lw                s1, 4(sp)
+    lw                s2, 8(sp)
+    addiu             sp, 12
+    jr                ra
+     nop
+    .set              at
+
+END(qt_blend_argb32_on_argb32_mips_dsp_asm_x2)
+
+LEAF_MIPS_DSP(qt_blend_argb32_on_argb32_const_alpha_256_mips_dsp_asm)
+/*
+ * ARGB32-on-ARGB32 blend without a constant alpha, one pixel per
+ * iteration:
+ *
+ *   qAlpha(~s) == 0 (opaque src):  dst[i] = s
+ *   s == 0:                        dst[i] is left untouched
+ *   otherwise:                     dst[i] = s + BYTE_MUL(dst[i], qAlpha(~s))
+ *                                  (the BYTE_MUL is skipped when dst[i] == 0)
+ *
+ * a0 - uint *dest
+ * a1 - const uint *src
+ * a2 - int length
+ */
+
+    beqz              a2, 5f         /* length == 0: nothing to do */
+     nop
+    li                t7, 8388736    /* t7 = 0x800080 */
+    b                 2f
+     nop
+1:                                   /* src pixel was 0: skip this dest pixel */
+    addiu             a0, a0, 4
+    addiu             a2, a2, -1
+    beqz              a2, 5f
+     nop
+2:
+    lw                t0, 0(a1)      /* t0 = s = src[i] */
+    addiu             a1, a1, 4
+    nor               t1, t0, zero
+    srl               t1, t1, 24     /* t1 = qAlpha(~s) */
+    bnez              t1, 3f         /* src not fully opaque: blend */
+     nop
+    sw                t0, 0(a0)      /* dst[i] = src[i] */
+    addiu             a2, a2, -1
+    bnez              a2, 2b
+     addiu            a0, a0, 4
+    b 5f
+     nop
+3:
+    beqz              t0, 1b
+     replv.ph          t6, t1        /* | 0 | qAlpha(~s) | 0 | qAlpha(~s) | */
+
+    lw                t4, 0(a0)
+    addiu             a2, a2, -1
+    beqz              t4, 31f        /* dst[i] == 0: result is just s */
+     move             t8, zero
+
+    /* NOTE(review): t4 is passed both as the input pixel and as the last
+     * scratch register; confirm against the BYTE_MUL macro definition
+     * that this aliasing is safe. */
+    BYTE_MUL t4, t8, t6, t7, t1, t2, t3, t4
+31:
+    addu              t8, t0, t8    /* dst[i] =
+                                     * s + BYTE_MUL(dst[i],~qAlpha(s)) */
+    sw                t8, 0(a0)
+    bnez              a2, 2b
+     addiu            a0, a0, 4
+    /* loop finished: fall through to the return below */
+5:
+    jr                ra
+     nop
+
+END(qt_blend_argb32_on_argb32_const_alpha_256_mips_dsp_asm)
index ed84848..818b27c 100644 (file)
@@ -52,6 +52,48 @@ extern "C" void qt_memfill32_asm_mips_dsp(quint32 *dest, quint32 value, int coun
 
 extern "C" void comp_func_SourceOver_asm_mips_dsp(uint *dest, const uint *src, int length, uint const_alpha);
 
+/*
+ * Prototypes of the hand-written MIPS DSP assembly routines.
+ * The *_dsp_asm_x2 kernels whose bodies appear in the accompanying
+ * assembly handle two pixels per loop iteration and leave their loop only
+ * when length reaches exactly 0.
+ * NOTE(review): presumably the C++ wrappers must pass an even length and
+ * handle an odd trailing pixel themselves - confirm in the wrappers.
+ */
+extern "C" void comp_func_solid_DestinationOver_dsp_asm_x2(uint *dest, int length, uint color);
+
+extern "C" void comp_func_solid_Source_dsp_asm_x2(uint *dest, int length, uint color, uint const_alpha);
+
+extern "C" void comp_func_DestinationOver_dsp_asm_x2(uint *dest, const uint *src, int length, uint const_alpha);
+
+extern "C" void comp_func_solid_SourceIn_dsp_asm_x2(uint *dest, int length, uint color, uint const_alpha);
+
+extern "C" void comp_func_SourceIn_dsp_asm_x2(uint *dest, const uint *src, int length, uint const_alpha);
+
+extern "C" void comp_func_solid_DestinationIn_dsp_asm_x2(uint *dest, int length, uint a);
+
+extern "C" void comp_func_DestinationIn_dsp_asm_x2(uint *dest, const uint *src, int length, uint const_alpha);
+
+extern "C" void comp_func_DestinationOut_dsp_asm_x2(uint *dest, const uint *src, int length, uint const_alpha);
+
+/* The assembly documents the fourth argument of this routine as "sia", not const_alpha. */
+extern "C" void comp_func_solid_SourceAtop_dsp_asm_x2(uint *dest, int length, uint color, uint const_alpha);
+
+extern "C" void comp_func_SourceAtop_dsp_asm_x2(uint *dest, const uint *src, int length, uint const_alpha);
+
+/* The assembly documents the fourth argument of this routine as "a", not const_alpha. */
+extern "C" void comp_func_solid_DestinationAtop_dsp_asm_x2(uint *dest, int length, uint color, uint const_alpha);
+
+extern "C" void comp_func_DestinationAtop_dsp_asm_x2(uint *dest, const uint *src, int length, uint const_alpha);
+
+/* The assembly documents the fourth argument of this routine as "sia", not const_alpha. */
+extern "C" void comp_func_solid_XOR_dsp_asm_x2(uint *dest, int length, uint color, uint const_alpha);
+
+extern "C" void comp_func_XOR_dsp_asm_x2(uint *dest, const uint *src, int length, uint const_alpha);
+
+extern "C" void comp_func_solid_SourceOut_dsp_asm_x2(uint *dest, int length, uint color, uint const_alpha);
+
+extern "C" void comp_func_SourceOut_dsp_asm_x2(uint *dest, const uint *src, int length, uint const_alpha);
+
+extern "C" void comp_func_Source_dsp_asm_x2(uint *dest, const uint *src, int length, uint const_alpha);
+
+extern "C" void qt_blend_argb32_on_argb32_mips_dsp_asm_x2(uint *dest, const uint *src, int length, uint const_alpha);
+
+/* One pixel per iteration; takes no const_alpha argument. */
+extern "C" void qt_blend_argb32_on_argb32_const_alpha_256_mips_dsp_asm(uint *dest, const uint *src, int length);
+
+extern "C" uint * destfetchARGB32_asm_mips_dsp(uint *buffer, const uint *data, int length);
+
+extern "C" uint * qt_destStoreARGB32_asm_mips_dsp(uint *buffer, const uint *data, int length);
+
 void qt_blend_argb32_on_argb32_mips_dsp(uchar *destPixels, int dbpl,
                                       const uchar *srcPixels, int sbpl,
                                       int w, int h,
@@ -71,6 +113,46 @@ uint * QT_FASTCALL qt_destFetchARGB32_mips_dsp(uint *buffer,
 void QT_FASTCALL qt_destStoreARGB32_mips_dsp(QRasterBuffer *rasterBuffer, int x, int y,
                                              const uint *buffer, int length);
 
+void QT_FASTCALL comp_func_solid_Source_mips_dsp(uint *dest, int length, uint color, uint const_alpha);
+
+void QT_FASTCALL comp_func_solid_SourceOver_mips_dsp(uint *dest, int length, uint color, uint const_alpha);
+
+void QT_FASTCALL comp_func_solid_DestinationOver_mips_dsp(uint *dest, int length, uint color, uint const_alpha);
+
+void QT_FASTCALL comp_func_solid_SourceOver_mips_dsp(uint *dest, int length, uint color, uint const_alpha);
+
+void QT_FASTCALL comp_func_solid_DestinationOver_mips_dsp(uint *dest, int length, uint color, uint const_alpha);
+
+void QT_FASTCALL comp_func_DestinationOver_mips_dsp(uint *dest, const uint *src, int length, uint const_alpha);
+
+void QT_FASTCALL comp_func_solid_SourceIn_mips_dsp(uint *dest, int length, uint color, uint const_alpha);
+
+void QT_FASTCALL comp_func_SourceIn_mips_dsp(uint *dest, const uint *src, int length, uint const_alpha);
+
+void QT_FASTCALL comp_func_solid_DestinationIn_mips_dsp(uint *dest, int length, uint color, uint const_alpha);
+
+void QT_FASTCALL comp_func_DestinationIn_mips_dsp(uint *dest, const uint *src, int length, uint const_alpha);
+
+void QT_FASTCALL comp_func_solid_DestinationOut_mips_dsp(uint *dest, int length, uint color, uint const_alpha);
+
+void QT_FASTCALL comp_func_DestinationOut_mips_dsp(uint *dest, const uint *src, int length, uint const_alpha);
+
+void QT_FASTCALL comp_func_solid_SourceAtop_mips_dsp(uint *dest, int length, uint color, uint const_alpha);
+
+void QT_FASTCALL comp_func_SourceAtop_mips_dsp(uint *dest, const uint *src, int length, uint const_alpha);
+
+void QT_FASTCALL comp_func_solid_DestinationAtop_mips_dsp(uint *dest, int length, uint color, uint const_alpha);
+
+void QT_FASTCALL comp_func_DestinationAtop_mips_dsp(uint *dest, const uint *src, int length, uint const_alpha);
+
+void QT_FASTCALL comp_func_solid_XOR_mips_dsp(uint *dest, int length, uint color, uint const_alpha);
+
+void QT_FASTCALL comp_func_solid_SourceOut_mips_dsp(uint *dest, int length, uint color, uint const_alpha);
+
+void QT_FASTCALL comp_func_SourceOut_mips_dsp(uint *dest, const uint *src, int length, uint const_alpha);
+
+void QT_FASTCALL comp_func_XOR_mips_dsp(uint *dest, const uint *src, int length, uint const_alpha);
+
 #endif // QT_COMPILER_SUPPORTS_MIPS_DSP
 
 
index 213fcf8..7e95410 100644 (file)
 
 #include "qt_mips_asm_dsp.h"
 
-LEAF_MIPS_DSPR2(INTERPOLATE_PIXEL_255_asm_mips_dspr2)
-/*
- * a0 - uint x (First value to multiply)
- * a1 - uint a (Multiplicator byte for first value)
- * a2 - uint y (Second value to multiply)
- * a3 - uint b (Multiplicator byte for second value)
- */
-
-    .set reorder
-    replv.ph          a1, a1
-    replv.ph          a3, a3
-    li                t8, 8388736
-    muleu_s.ph.qbl    t0, a0, a1
-    muleu_s.ph.qbl    t1, a2, a3
-    muleu_s.ph.qbr    t2, a0, a1
-    muleu_s.ph.qbr    t3, a2, a3
-    addu.ph           t4, t0, t1
-    addu.ph           t5, t2, t3
-    preceu.ph.qbla    t0, t4
-    addu              t1, t0, t8
-    addu              t1, t4, t1
-    preceu.ph.qbla    t6, t5
-    addu              t7, t6, t8
-    addu              t7, t5, t7
-    precrq.qb.ph      t2, t1, t7
-    move              v0, t2
-    j                 ra
-
-END(INTERPOLATE_PIXEL_255_asm_mips_dspr2)
-
-LEAF_MIPS_DSPR2(BYTE_MUL_asm_mips_dspr2)
-/*
- * a0 - uint x (Value to multiply)
- * a1 - uint a (Multiplicator byte)
- */
-
-    .set reorder
-    replv.ph          a1, a1              /* a1 = 0x00a00a */
-    li                t4, 8388736         /* t4 = 0x800080 */
-    muleu_s.ph.qbl    t0, a0, a1
-    muleu_s.ph.qbr    t2, a0, a1
-    preceu.ph.qbla    t1, t0
-    addu              t0, t0, t1
-    addu              t0, t0, t4
-    preceu.ph.qbla    t3, t2
-    addu              t2, t2, t3
-    addu              t2, t2, t4
-    precrq.qb.ph      t4, t0, t2
-    move              v0, t4
-    j                 ra
-
-END(BYTE_MUL_asm_mips_dspr2)
-
 LEAF_MIPS_DSPR2(qConvertRgb16To32_asm_mips_dspr2)
 /*
  * a0 - dst (a8r8g8b8)
index bcde706..088831f 100644 (file)
@@ -110,4 +110,140 @@ LEAF_MIPS32R2(symbol)                                   \
                 .end    function;                       \
                 .size   function,.-function
 
+/*
+ * BYTE_MUL operation on two pixels at once: in_1 is scaled by
+ * the multiplicator byte a1 and in_2 by the multiplicator byte
+ * a2. For every byte c of a pixel and its multiplicator a the
+ * macro computes, with t = c * a:
+ *   (t + (t >> 8) + 0x80) >> 8
+ *
+ * The multiplicators are passed in repl_a1 and repl_a2, which
+ * should be prepared with:
+ *   replv.ph   repl_a1, a1
+ *   replv.ph   repl_a2, a2
+ * to become:
+ *   repl_a1 = | 00 | a1 | 00 | a1 |
+ *   repl_a2 = | 00 | a2 | 00 | a2 |
+ *
+ * rounding_factor must have the following value:
+ *   li    rounding_factor, 0x00800080
+ *
+ * out_1, out_2 - results for in_1 and in_2. Both are also used
+ *           as temporaries before the final packing, so they
+ *           must be distinct from scratch(n), rounding_factor
+ *           and the registers clobbered by the selected
+ *           in_const mode
+ *
+ * scratch(n) - temporary registers (all four are clobbered)
+ *
+ * in_const: 1 -> (default) in_1 and in_2 remain unchanged
+ *           after usage; repl_a1 and repl_a2 are used as
+ *           temporaries and are clobbered
+ *           0 -> (or anything different than 1) repl_a1 and
+ *           repl_a2 remain unchanged after usage; in_1 and
+ *           in_2 are used as temporaries and are clobbered
+ */
+.macro BYTE_MUL_x2 in_1, in_2, out_1, out_2                 \
+                   repl_a1, repl_a2, rounding_factor,       \
+                   scratch1, scratch2, scratch3, scratch4,  \
+                   in_const = 1
+    muleu_s.ph.qbl    \scratch1, \in_1,     \repl_a1    /* two upper bytes of in_1, each times a1 */
+    muleu_s.ph.qbr    \scratch2, \in_1,     \repl_a1    /* two lower bytes of in_1, each times a1 */
+    muleu_s.ph.qbl    \scratch3, \in_2,     \repl_a2    /* two upper bytes of in_2, each times a2 */
+    muleu_s.ph.qbr    \scratch4, \in_2,     \repl_a2    /* two lower bytes of in_2, each times a2 */
+
+.if \in_const == 1
+    preceu.ph.qbla    \repl_a1,  \scratch1    /* t >> 8 of each product; repl_a1 reused as temporary */
+    preceu.ph.qbla    \repl_a2,  \scratch2    /* repl_a2 reused as temporary */
+    preceu.ph.qbla    \out_1,    \scratch3    /* out_1 reused as temporary */
+    preceu.ph.qbla    \out_2,    \scratch4    /* out_2 reused as temporary */
+
+    addu              \scratch1,  \repl_a1, \scratch1    /* t + (t >> 8) for in_1 */
+    addu              \scratch2,  \repl_a2, \scratch2
+.else
+    preceu.ph.qbla    \in_1,      \scratch1    /* t >> 8 of each product; in_1 reused as temporary */
+    preceu.ph.qbla    \in_2,      \scratch2    /* in_2 reused as temporary */
+    preceu.ph.qbla    \out_1,     \scratch3    /* out_1 reused as temporary */
+    preceu.ph.qbla    \out_2,     \scratch4    /* out_2 reused as temporary */
+
+    addu              \scratch1,  \in_1,    \scratch1    /* t + (t >> 8) for in_1 */
+    addu              \scratch2,  \in_2,    \scratch2
+.endif
+
+    addu              \out_1,     \out_1,   \scratch3    /* t + (t >> 8) for in_2 */
+    addu              \out_2,     \out_2,   \scratch4
+
+    addu              \scratch1,  \scratch1, \rounding_factor    /* add 0x80 to every halfword */
+    addu              \scratch2,  \scratch2, \rounding_factor
+    addu              \scratch3,  \out_1,    \rounding_factor
+    addu              \scratch4,  \out_2,    \rounding_factor
+
+    precrq.qb.ph      \out_1,     \scratch1, \scratch2    /* keep the high byte of each halfword: result for in_1 */
+    precrq.qb.ph      \out_2,     \scratch3, \scratch4    /* result for in_2 */
+
+.endm
+
+/*
+ * BYTE_MUL operation on one pixel (in_1) with the
+ * multiplicator byte a1. For every byte c of the pixel
+ * the macro computes, with t = c * a1:
+ *   (t + (t >> 8) + 0x80) >> 8
+ * and stores the four resulting bytes in out_1.
+ *
+ * The multiplicator is passed in repl_a1, which should be
+ * prepared with:
+ *   replv.ph   repl_a1, a1
+ * to become:
+ *   repl_a1 = | 00 | a1 | 00 | a1 |
+ *
+ * rounding_factor must have the following value:
+ *   li    rounding_factor, 0x00800080
+ *
+ * scratch(n) - temporary registers (all four are clobbered)
+ *
+ * in_1, repl_a1 and rounding_factor are only read; out_1 is
+ * written by the last instruction only.
+ */
+.macro BYTE_MUL in_1, out_1,                            \
+                repl_a1, rounding_factor,               \
+                scratch1, scratch2, scratch3, scratch4
+    muleu_s.ph.qbl    \scratch1, \in_1,     \repl_a1    /* two upper bytes of in_1, each times a1 */
+    muleu_s.ph.qbr    \scratch2, \in_1,     \repl_a1    /* two lower bytes of in_1, each times a1 */
+
+    preceu.ph.qbla    \scratch3, \scratch1    /* t >> 8 of each upper product */
+    preceu.ph.qbla    \scratch4, \scratch2    /* t >> 8 of each lower product */
+
+    addu              \scratch1, \scratch1, \scratch3    /* t + (t >> 8) */
+    addu              \scratch1, \scratch1, \rounding_factor    /* add 0x80 to every halfword */
+
+    addu              \scratch2, \scratch2, \scratch4    /* t + (t >> 8) */
+    addu              \scratch2, \scratch2, \rounding_factor    /* add 0x80 to every halfword */
+
+    precrq.qb.ph      \out_1,    \scratch1, \scratch2    /* keep the high byte of each halfword */
+
+.endm
+
+/*
+ * macro for INTERPOLATE_PIXEL_255 operation
+ *
+ * For every byte position the macro computes, with
+ * t = c1 * mul_1 + c2 * mul_2 (c1 and c2 being the bytes of
+ * in_1 and in_2 at that position):
+ *   (t + (t >> 8) + 0x80) >> 8
+ *
+ * in_1 - First value to multiply
+ * mul_1 - Multiplicator byte for first value
+ * in_2 - Second value to multiply
+ * mul_2 - Multiplicator byte for second value
+ * out_1 - Result; written by the last instruction only
+ *
+ * mul_1 and mul_2 are used as operands of a plain mul, so
+ * they must hold the multiplicator byte itself, not a value
+ * replicated with replv.ph.
+ *
+ * rounding_factor and andi_factor should be prepared
+ * as:
+ *     li     rounding_factor, 0x00800080
+ *     li     andi_factor,     0xff00ff00
+ * scratch(n) - temporary registers (all four are clobbered)
+ */
+.macro INTERPOLATE_PIXEL_255 in_1, mul_1,                            \
+                             in_2, mul_2,                            \
+                             out_1,                                  \
+                             rounding_factor, andi_factor            \
+                             scratch1, scratch2, scratch3, scratch4
+# x part
+    preceu.ph.qbra    \scratch1, \in_1    /* bytes 2 and 0 of in_1 expanded to halfwords */
+    preceu.ph.qbra    \scratch2, \in_2    /* bytes 2 and 0 of in_2 expanded to halfwords */
+    mul               \scratch1, \scratch1, \mul_1
+    mul               \scratch2, \scratch2, \mul_2
+# x>>8 part
+    preceu.ph.qbla    \scratch3, \in_1    /* bytes 3 and 1 of in_1 expanded to halfwords */
+    preceu.ph.qbla    \scratch4, \in_2    /* bytes 3 and 1 of in_2 expanded to halfwords */
+    mul               \scratch3, \scratch3, \mul_1
+    mul               \scratch4, \scratch4, \mul_2
+# x part
+    addu              \scratch1, \scratch1, \scratch2    /* t = sum of both products */
+    preceu.ph.qbla    \scratch2, \scratch1    /* t >> 8 of each halfword */
+    addu              \scratch1, \scratch1, \scratch2    /* t + (t >> 8) */
+    addu              \scratch1, \scratch1, \rounding_factor    /* add 0x80 to every halfword */
+    preceu.ph.qbla    \scratch1, \scratch1    /* move each high byte down to byte positions 2 and 0 */
+# x>>8 part
+    addu              \scratch3, \scratch3, \scratch4    /* t = sum of both products */
+    preceu.ph.qbla    \scratch4, \scratch3    /* t >> 8 of each halfword */
+    addu              \scratch3, \scratch3, \scratch4    /* t + (t >> 8) */
+    addu              \scratch3, \scratch3, \rounding_factor    /* add 0x80 to every halfword */
+    and               \scratch3, \scratch3, \andi_factor    /* keep the high bytes in place at positions 3 and 1 */
+
+    or                \out_1,    \scratch1, \scratch3    /* merge bytes 2,0 with bytes 3,1 */
+.endm
+
 #endif //QT_MIPS_DSP_H__