Updated primitives API and tests.
author Armin Novak <armin.novak@thincast.com>
Wed, 13 Jul 2016 12:04:48 +0000 (14:04 +0200)
committer Armin Novak <armin.novak@thincast.com>
Thu, 6 Oct 2016 11:43:00 +0000 (13:43 +0200)
31 files changed:
include/freerdp/primitives.h
libfreerdp/codec/h264.c
libfreerdp/primitives/prim_YCoCg.c
libfreerdp/primitives/prim_YCoCg_opt.c
libfreerdp/primitives/prim_YUV.c
libfreerdp/primitives/prim_YUV_opt.c
libfreerdp/primitives/prim_add.c
libfreerdp/primitives/prim_add_opt.c
libfreerdp/primitives/prim_alphaComp.c
libfreerdp/primitives/prim_alphaComp_opt.c
libfreerdp/primitives/prim_colors.c
libfreerdp/primitives/prim_colors_opt.c
libfreerdp/primitives/prim_set.c
libfreerdp/primitives/prim_set_opt.c
libfreerdp/primitives/prim_shift.c
libfreerdp/primitives/prim_shift_opt.c
libfreerdp/primitives/prim_sign.c
libfreerdp/primitives/prim_sign_opt.c
libfreerdp/primitives/prim_templates.h
libfreerdp/primitives/primitives.c
libfreerdp/primitives/test/TestPrimitivesAdd.c
libfreerdp/primitives/test/TestPrimitivesAlphaComp.c
libfreerdp/primitives/test/TestPrimitivesAndOr.c
libfreerdp/primitives/test/TestPrimitivesColors.c
libfreerdp/primitives/test/TestPrimitivesCopy.c
libfreerdp/primitives/test/TestPrimitivesSet.c
libfreerdp/primitives/test/TestPrimitivesShift.c
libfreerdp/primitives/test/TestPrimitivesSign.c
libfreerdp/primitives/test/TestPrimitivesYCoCg.c
libfreerdp/primitives/test/TestPrimitivesYUV.c
libfreerdp/primitives/test/prim_test.h

index 287c892..8eb92f9 100644 (file)
@@ -81,62 +81,62 @@ typedef pstatus_t (*__copy_8u_AC4r_t)(
 typedef pstatus_t (*__set_8u_t)(
     BYTE val,
     BYTE* pDst,
-    INT32 len);
+    UINT32 len);
 typedef pstatus_t (*__set_32s_t)(
     INT32 val,
     INT32* pDst,
-    INT32 len);
+    UINT32 len);
 typedef pstatus_t (*__set_32u_t)(
     UINT32 val,
     UINT32* pDst,
-    INT32 len);
+    UINT32 len);
 typedef pstatus_t (*__zero_t)(
     void* pDst,
     size_t bytes);
 typedef pstatus_t (*__alphaComp_argb_t)(
-    const BYTE* pSrc1,  INT32 src1Step,
-    const BYTE* pSrc2,  INT32 src2Step,
-    BYTE* pDst,  INT32 dstStep,
-    INT32 width,  INT32 height);
+    const BYTE* pSrc1,  UINT32 src1Step,
+    const BYTE* pSrc2,  UINT32 src2Step,
+    BYTE* pDst,  UINT32 dstStep,
+    UINT32 width,  UINT32 height);
 typedef pstatus_t (*__add_16s_t)(
     const INT16* pSrc1,
     const INT16* pSrc2,
     INT16* pDst,
-    INT32 len);
+    UINT32 len);
 typedef pstatus_t (*__lShiftC_16s_t)(
     const INT16* pSrc,
-    INT32 val,
+    UINT32 val,
     INT16* pSrcDst,
-    INT32 len);
+    UINT32 len);
 typedef pstatus_t (*__lShiftC_16u_t)(
     const UINT16* pSrc,
-    INT32 val,
+    UINT32 val,
     UINT16* pSrcDst,
-    INT32 len);
+    UINT32 len);
 typedef pstatus_t (*__rShiftC_16s_t)(
     const INT16* pSrc,
-    INT32 val,
+    UINT32 val,
     INT16* pSrcDst,
-    INT32 len);
+    UINT32 len);
 typedef pstatus_t (*__rShiftC_16u_t)(
     const UINT16* pSrc,
-    INT32 val,
+    UINT32 val,
     UINT16* pSrcDst,
-    INT32 len);
+    UINT32 len);
 typedef pstatus_t (*__shiftC_16s_t)(
     const INT16* pSrc,
     INT32 val,
     INT16* pSrcDst,
-    INT32 len);
+    UINT32 len);
 typedef pstatus_t (*__shiftC_16u_t)(
     const UINT16* pSrc,
     INT32 val,
     UINT16* pSrcDst,
-    INT32 len);
+    UINT32 len);
 typedef pstatus_t (*__sign_16s_t)(
     const INT16* pSrc,
     INT16* pDst,
-    INT32 len);
+    UINT32 len);
 typedef pstatus_t (*__yCbCrToRGB_16s8u_P3AC4R_t)(
     const INT16* pSrc[3], INT32 srcStep,
     BYTE* pDst, INT32 dstStep, UINT32 DstFormat,
@@ -154,8 +154,8 @@ typedef pstatus_t (*__RGBToYCbCr_16s16s_P3P3_t)(
     INT16* pDst[3],  INT32 dstStep,
     const prim_size_t* roi);
 typedef pstatus_t (*__RGBToRGB_16s8u_P3AC4R_t)(
-    const INT16* pSrc[3],  INT32 srcStep,
-    BYTE* pDst,  INT32 dstStep, UINT32 DstFormat,
+    const INT16* const pSrc[3],  UINT32 srcStep,
+    BYTE* pDst,  UINT32 dstStep, UINT32 DstFormat,
     const prim_size_t* roi);
 typedef pstatus_t (*__YCoCgToRGB_8u_AC4R_t)(
     const BYTE* pSrc, INT32 srcStep,
@@ -177,11 +177,11 @@ typedef pstatus_t (*__YUV444ToRGB_8u_P3AC4R_t)(
     BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
     const prim_size_t* roi);
 typedef pstatus_t (*__RGBToYUV420_8u_P3AC4R_t)(
-    const BYTE* pSrc, UINT32 srcStep,
+    const BYTE* pSrc, UINT32 SrcFormat, UINT32 srcStep,
     BYTE* pDst[3], UINT32 dstStep[3],
     const prim_size_t* roi);
 typedef pstatus_t (*__RGBToYUV444_8u_P3AC4R_t)(
-    const BYTE* pSrc, UINT32 srcStep,
+    const BYTE* pSrc, UINT32 SrcFormat, UINT32 srcStep,
     BYTE* pDst[3], UINT32 dstStep[3],
     const prim_size_t* roi);
 typedef pstatus_t (*__YUV420CombineToYUV444_t)(
index 2686e92..c6b16e2 100644 (file)
@@ -1567,7 +1567,7 @@ INT32 avc420_compress(H264_CONTEXT* h264, BYTE* pSrcData, DWORD SrcFormat,
        roi.width = nSrcWidth;
        roi.height = nSrcHeight;
 
-       prims->RGBToYUV420_8u_P3AC4R(pSrcData, nSrcStep, pYUVData, iStride, &roi);
+       prims->RGBToYUV420_8u_P3AC4R(pSrcData, SrcFormat, nSrcStep, pYUVData, iStride, &roi);
 
        status = h264->subsystem->Compress(h264, ppDstData, pDstSize, 0);
 
index 0b264d2..518f248 100644 (file)
 
 /* ------------------------------------------------------------------------- */
 static pstatus_t general_YCoCgToRGB_8u_AC4R(
-    const BYTE* pSrc, INT32 srcStep,
-    BYTE* pDst, UINT32 DstFormat, INT32 dstStep,
-    UINT32 width, UINT32 height,
-    UINT8 shift,
-    BOOL withAlpha)
+               const BYTE* pSrc, INT32 srcStep,
+               BYTE* pDst, UINT32 DstFormat, INT32 dstStep,
+               UINT32 width, UINT32 height,
+               UINT8 shift,
+               BOOL withAlpha)
 {
        BYTE A;
        UINT32 x, y;
@@ -66,8 +66,11 @@ static pstatus_t general_YCoCgToRGB_8u_AC4R(
                        R  = T + Co;
                        G  = Y + Cg;
                        B  = T - Co;
-                       color = GetColor(DstFormat, MINMAX(R, 0, 255), MINMAX(G, 0, 255), MINMAX(B, 0,
-                                        255), A);
+
+                       color = GetColor(DstFormat,
+                                        MINMAX(R, 0, 255), MINMAX(G, 0, 255),
+                                        MINMAX(B, 0, 255), A);
+                       WriteColor(dptr, DstFormat, color);
                        dptr += GetBytesPerPixel(DstFormat);
                }
 
index 93cddb5..75ba935 100644 (file)
@@ -40,8 +40,8 @@ static primitives_t* generic = NULL;
 #ifdef WITH_SSE2
 /* ------------------------------------------------------------------------- */
 static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(
-    const BYTE* pSrc, INT32 srcStep,
-    BYTE* pDst, INT32 dstStep,
+    const BYTE* pSrc, UINT32 srcStep,
+    BYTE* pDst, UINT32 DstFormat, UINT32 dstStep,
     UINT32 width, UINT32 height,
     UINT8 shift,
     BOOL withAlpha)
@@ -70,8 +70,8 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(
        {
                /* Too small, or we'll never hit a 16-byte boundary.  Punt. */
                return generic->YCoCgToRGB_8u_AC4R(
-                          pSrc, srcStep, pDst, dstStep,
-                          width, height, shift, withAlpha, TRUE);
+                          pSrc, srcStep, pDst, DstFormat, dstStep,
+                          width, height, shift, withAlpha);
        }
 
        for (h = 0; h < height; h++)
@@ -82,12 +82,16 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(
                /* Get to a 16-byte destination boundary. */
                if ((ULONG_PTR) dptr & 0x0f)
                {
+                       pstatus_t status;
                        int startup = (16 - ((ULONG_PTR) dptr & 0x0f)) / 4;
 
                        if (startup > width) startup = width;
 
-                       generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep,
-                                                   startup, 1, shift, withAlpha, TRUE);
+                       status = generic->YCoCgToRGB_8u_AC4R(
+                                        sptr, srcStep, dptr, DstFormat, dstStep,
+                                        startup, 1, shift, withAlpha);
+                       if (status != PRIMITIVES_SUCCESS)
+                               return status;
                        sptr += startup * sizeof(UINT32);
                        dptr += startup * sizeof(UINT32);
                        w -= startup;
@@ -195,8 +199,13 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(
                /* Handle any remainder pixels. */
                if (w > 0)
                {
-                       generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep,
-                                                   w, 1, shift, withAlpha, TRUE);
+                       pstatus_t status;
+                       status = generic->YCoCgToRGB_8u_AC4R(
+                                        sptr, srcStep, dptr, DstFormat, dstStep,
+                                        w, 1, shift, withAlpha);
+                       if (status != PRIMITIVES_SUCCESS)
+                               return status;
+
                        sptr += w * sizeof(UINT32);
                        dptr += w * sizeof(UINT32);
                }
@@ -210,8 +219,8 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(
 
 /* ------------------------------------------------------------------------- */
 static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
-    const BYTE* pSrc, INT32 srcStep,
-    BYTE* pDst, INT32 dstStep,
+    const BYTE* pSrc, UINT32 srcStep,
+    BYTE* pDst, UINT32 DstFormat, UINT32 dstStep,
     UINT32 width, UINT32 height,
     UINT8 shift,
     BOOL withAlpha)
@@ -240,9 +249,8 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
        {
                /* Too small, or we'll never hit a 16-byte boundary.  Punt. */
                return generic->YCoCgToRGB_8u_AC4R(
-                          pSrc, srcStep,
-                          pDst, dstStep, width, height, shift,
-                          withAlpha, FALSE);
+                                       pSrc, srcStep, pDst, DstFormat, dstStep,
+                                       width, height, shift, withAlpha);
        }
 
        for (h = 0; h < height; h++)
@@ -253,12 +261,17 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
                /* Get to a 16-byte destination boundary. */
                if ((ULONG_PTR) dptr & 0x0f)
                {
+                       pstatus_t status;
                        int startup = (16 - ((ULONG_PTR) dptr & 0x0f)) / 4;
 
                        if (startup > width) startup = width;
 
-                       generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep,
-                                                   startup, 1, shift, withAlpha, FALSE);
+                       status = generic->YCoCgToRGB_8u_AC4R(
+                                               sptr, srcStep, dptr, DstFormat,
+                                               dstStep, startup, 1, shift, withAlpha);
+                       if (status != PRIMITIVES_SUCCESS)
+                               return status;
+
                        sptr += startup * sizeof(UINT32);
                        dptr += startup * sizeof(UINT32);
                        w -= startup;
@@ -370,8 +383,13 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
                /* Handle any remainder pixels. */
                if (w > 0)
                {
-                       generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep,
-                                                   w, 1, shift, withAlpha, FALSE);
+                       pstatus_t status;
+                       status = generic->YCoCgToRGB_8u_AC4R(
+                                        sptr, srcStep, dptr, DstFormat, dstStep,
+                                        w, 1, shift, withAlpha);
+                       if (status != PRIMITIVES_SUCCESS)
+                               return status;
+
                        sptr += w * sizeof(UINT32);
                        dptr += w * sizeof(UINT32);
                }
@@ -388,21 +406,29 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
 /* ------------------------------------------------------------------------- */
 static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(
     const BYTE* pSrc, INT32 srcStep,
-    BYTE* pDst, INT32 dstStep,
+    BYTE* pDst, UINT32 DstFormat, INT32 dstStep,
     UINT32 width, UINT32 height,
     UINT8 shift,
-    BOOL withAlpha,
-    BOOL invert)
+    BOOL withAlpha)
 {
-       if (invert)
-       {
-               return ssse3_YCoCgRToRGB_8u_AC4R_invert(pSrc, srcStep, pDst, dstStep,
-                                                       width, height, shift, withAlpha);
-       }
-       else
+       // TODO: Need to implement proper color conversion!!!
+       return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat,
+                                          dstStep, width, height, shift, withAlpha);
+
+       switch(DstFormat)
        {
-               return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(pSrc, srcStep, pDst, dstStep,
-                       width, height, shift, withAlpha);
+       case PIXEL_FORMAT_BGRX32:
+       case PIXEL_FORMAT_BGRA32:
+               return ssse3_YCoCgRToRGB_8u_AC4R_invert(
+                                       pSrc, srcStep, pDst, DstFormat, dstStep,
+                                       width, height, shift, withAlpha);
+       case PIXEL_FORMAT_RGBX32:
+       case PIXEL_FORMAT_RGBA32:
+               return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
+                                       pSrc, srcStep, pDst, DstFormat, dstStep,
+                                       width, height, shift, withAlpha);
+       default:
+               return -1;
        }
 }
 #endif /* WITH_SSE2 */
index bb32f34..5318020 100644 (file)
@@ -232,9 +232,9 @@ static pstatus_t general_YUV444SplitToYUV420(
                {
                        /* Filter */
                        const INT32 u = pSrcU[2 * x] + pSrcU[2 * x + 1] + pSrcU1[2 * x]
-                                       + pSrcU1[2 * x + 1];
+                                       + pSrcU1[2 * x + 1];
                        const INT32 v = pSrcV[2 * x] + pSrcV[2 * x + 1] + pSrcV1[2 * x]
-                                       + pSrcV1[2 * x + 1];
+                                       + pSrcV1[2 * x + 1];
                        pU[x] = CLIP(u / 4L);
                        pV[x] = CLIP(v / 4L);
                }
@@ -331,7 +331,7 @@ static INLINE BYTE* writePixel(BYTE* dst, UINT32 format, BYTE Y, BYTE U, BYTE V)
        const BYTE r = YUV2R(Y, U, V);
        const BYTE g = YUV2G(Y, U, V);
        const BYTE b = YUV2B(Y, U, V);
-       UINT32 color = GetColor(format, r, g, b, 0);
+       UINT32 color = GetColor(format, r, g, b, 0xFF);
        WriteColor(dst, format, color);
        return dst + GetBytesPerPixel(format);
 }
@@ -500,9 +500,10 @@ static INLINE BYTE RGB2V(INT32 R, INT32 G, INT32 B)
 }
 
 static pstatus_t general_RGBToYUV444_8u_P3AC4R(
-    const BYTE* pSrc, const UINT32 srcStep,
+    const BYTE* pSrc, UINT32 SrcFormat, const UINT32 srcStep,
     BYTE* pDst[3], UINT32 dstStep[3], const prim_size_t* roi)
 {
+       const UINT32 bpp = GetBytesPerPixel(SrcFormat);
        UINT32 x, y;
        UINT32 nWidth, nHeight;
        nWidth = roi->width;
@@ -517,9 +518,10 @@ static pstatus_t general_RGBToYUV444_8u_P3AC4R(
 
                for (x = 0; x < nWidth; x++)
                {
-                       const BYTE B = pRGB[4 * x + 0];
-                       const BYTE G = pRGB[4 * x + 1];
-                       const BYTE R = pRGB[4 * x + 2];
+                       BYTE B, G, R;
+                       const UINT32 color = ReadColor(&pRGB[x * bpp], SrcFormat);
+                       SplitColor(color, SrcFormat, &R, &G, &B, NULL, NULL);
+
                        pY[x] = RGB2Y(R, G, B);
                        pU[x] = RGB2U(R, G, B);
                        pV[x] = RGB2V(R, G, B);
@@ -530,9 +532,10 @@ static pstatus_t general_RGBToYUV444_8u_P3AC4R(
 }
 
 static pstatus_t general_RGBToYUV420_8u_P3AC4R(
-    const BYTE* pSrc, UINT32 srcStep,
+    const BYTE* pSrc, UINT32 SrcFormat, UINT32 srcStep,
     BYTE* pDst[3], UINT32 dstStep[3], const prim_size_t* roi)
 {
+       const UINT32 bpp = GetBytesPerPixel(SrcFormat);
        UINT32 x, y;
        UINT32 halfWidth;
        UINT32 halfHeight;
@@ -555,39 +558,50 @@ static pstatus_t general_RGBToYUV420_8u_P3AC4R(
 
                for (x = 0; x < halfWidth; x++)
                {
-                       INT32 R, G, B;
+                       UINT32 color;
                        INT32 Ra, Ga, Ba;
                        const UINT32 val2x = (x * 2);
                        const UINT32 val2x1 = val2x + 1;
+                       BYTE B, G, R;
+
                        /* 1st pixel */
-                       Ba = B = pRGB[val2x * 4 + 0];
-                       Ga = G = pRGB[val2x * 4 + 1];
-                       Ra = R = pRGB[val2x * 4 + 2];
+                       color = ReadColor(&pRGB[val2x * bpp], SrcFormat);
+                       SplitColor(color, SrcFormat, &R, &G, &B, NULL, NULL);
+
+                       Ba = B;
+                       Ga = G;
+                       Ra = R;
                        pY[val2x] = RGB2Y(R, G, B);
 
                        if (val2x1 < nWidth)
                        {
                                /* 2nd pixel */
-                               Ba += B = pRGB[val2x * 4 + 4];
-                               Ga += G = pRGB[val2x * 4 + 5];
-                               Ra += R = pRGB[val2x * 4 + 6];
+                               color = ReadColor(&pRGB[val2x1 * bpp], SrcFormat);
+                               SplitColor(color, SrcFormat, &R, &G, &B, NULL, NULL);
+                               Ba += B;
+                               Ga += G;
+                               Ra += R;
                                pY[val2x1] = RGB2Y(R, G, B);
                        }
 
                        if (val2y1 < nHeight)
                        {
                                /* 3rd pixel */
-                               Ba += B = pRGB1[val2x * 4 + 0];
-                               Ga += G = pRGB1[val2x * 4 + 1];
-                               Ra += R = pRGB1[val2x * 4 + 2];
+                               color = ReadColor(&pRGB1[val2x * bpp], SrcFormat);
+                               SplitColor(color, SrcFormat, &R, &G, &B, NULL, NULL);
+                               Ba += B;
+                               Ga += G;
+                               Ra += R;
                                pY1[val2x] = RGB2Y(R, G, B);
 
                                if (val2x1 < nWidth)
                                {
                                        /* 4th pixel */
-                                       Ba += B = pRGB1[val2x * 4 + 4];
-                                       Ga += G = pRGB1[val2x * 4 + 5];
-                                       Ra += R = pRGB1[val2x * 4 + 6];
+                                       color = ReadColor(&pRGB1[val2x1 * bpp], SrcFormat);
+                                       SplitColor(color, SrcFormat, &R, &G, &B, NULL, NULL);
+                                       Ba += B;
+                                       Ga += G;
+                                       Ra += R;
                                        pY1[val2x1] = RGB2Y(R, G, B);
                                }
                        }
index 36c31d6..b8d2172 100644 (file)
@@ -35,6 +35,11 @@ static pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(
        UINT32 i, nWidth, nHeight, VaddDst, VaddY, VaddU, VaddV;
        __m128i r0, r1, r2, r3, r4, r5, r6, r7;
        __m128i* buffer;
+
+       // TODO: Need to implement proper color conversion!!!!!
+       return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep,
+                                             DstFormat, roi);
+
        /* last_line: if the last (U,V doubled) line should be skipped, set to 10B
         * last_column: if it's the last column in a line, set to 10B (for handling line-endings not multiple by four) */
        buffer = _aligned_malloc(4 * 16, 16);
index 5984402..cc63268 100644 (file)
@@ -30,7 +30,7 @@ static pstatus_t general_add_16s(
     const INT16* pSrc1,
     const INT16* pSrc2,
     INT16* pDst,
-    INT32 len)
+    UINT32 len)
 {
        while (len--)
        {
index d49d634..c6c7504 100644 (file)
@@ -40,7 +40,7 @@ static primitives_t* generic = NULL;
 # if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
 /* ------------------------------------------------------------------------- */
 SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s,
-                 _mm_adds_epi16, generic->add_16s(sptr1++, sptr2++, dptr++, 1))
+                _mm_adds_epi16, generic->add_16s(sptr1++, sptr2++, dptr++, 1))
 # endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
 #endif
 
index a306b9c..cc2f75e 100644 (file)
 
 /* ------------------------------------------------------------------------- */
 static pstatus_t general_alphaComp_argb(
-    const BYTE* pSrc1,  INT32 src1Step,
-    const BYTE* pSrc2,  INT32 src2Step,
-    BYTE* pDst,  INT32 dstStep,
-    INT32 width,  INT32 height)
+    const BYTE* pSrc1,  UINT32 src1Step,
+    const BYTE* pSrc2,  UINT32 src2Step,
+    BYTE* pDst,  UINT32 dstStep,
+    UINT32 width,  UINT32 height)
 {
-       const UINT32* sptr1 = (const UINT32*) pSrc1;
-       const UINT32* sptr2 = (const UINT32*) pSrc2;
-       UINT32* dptr = (UINT32*) pDst;
-       int linebytes = width * sizeof(UINT32);
-       int src1Jump = (src1Step - linebytes) / sizeof(UINT32);
-       int src2Jump = (src2Step - linebytes) / sizeof(UINT32);
-       int dstJump  = (dstStep  - linebytes) / sizeof(UINT32);
-       int y;
+       UINT32 y;
 
        for (y = 0; y < height; y++)
        {
-               int x;
+               const UINT32* sptr1 = (const UINT32*) (pSrc1 + y * src1Step);
+               const UINT32* sptr2 = (const UINT32*) (pSrc2 + y * src2Step);
+               UINT32* dptr = (UINT32*) (pDst + y * dstStep);
+               UINT32 x;
 
                for (x = 0; x < width; x++)
                {
@@ -92,10 +88,6 @@ static pstatus_t general_alphaComp_argb(
                                *dptr++ = rb | ag;
                        }
                }
-
-               sptr1 += src1Jump;
-               sptr2 += src2Jump;
-               dptr  += dstJump;
        }
 
        return PRIMITIVES_SUCCESS;
index d0f9744..94daf27 100644 (file)
@@ -46,10 +46,10 @@ static primitives_t* generic = NULL;
 #if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
 
 pstatus_t sse2_alphaComp_argb(
-    const BYTE* pSrc1,  INT32 src1Step,
-    const BYTE* pSrc2,  INT32 src2Step,
-    BYTE* pDst,  INT32 dstStep,
-    INT32 width,  INT32 height)
+    const BYTE* pSrc1,  UINT32 src1Step,
+    const BYTE* pSrc2,  UINT32 src2Step,
+    BYTE* pDst,  UINT32 dstStep,
+    UINT32 width,  UINT32 height)
 {
        const UINT32* sptr1 = (const UINT32*) pSrc1;
        const UINT32* sptr2 = (const UINT32*) pSrc2;
@@ -62,7 +62,7 @@ pstatus_t sse2_alphaComp_argb(
        if (width < 4)     /* pointless if too small */
        {
                return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step,
-                                              pDst, dstStep, width, height);
+                                              pDst, dstStep, width, height);
        }
 
        dptr = (UINT32*) pDst;
@@ -108,9 +108,13 @@ pstatus_t sse2_alphaComp_argb(
 
                if (leadIn)
                {
-                       generic->alphaComp_argb((const BYTE*) sptr1,
-                                               src1Step, (const BYTE*) sptr2, src2Step,
-                                               (BYTE*) dptr, dstStep, leadIn, 1);
+                       pstatus_t status;
+                       status = generic->alphaComp_argb((const BYTE*) sptr1,
+                                               src1Step, (const BYTE*) sptr2, src2Step,
+                                               (BYTE*) dptr, dstStep, leadIn, 1);
+                       if (status != PRIMITIVES_SUCCESS)
+                               return status;
+
                        sptr1 += leadIn;
                        sptr2 += leadIn;
                        dptr  += leadIn;
@@ -181,9 +185,13 @@ pstatus_t sse2_alphaComp_argb(
                /* Finish off the remainder. */
                if (pixels)
                {
-                       generic->alphaComp_argb((const BYTE*) sptr1, src1Step,
-                                               (const BYTE*) sptr2, src2Step,
-                                               (BYTE*) dptr, dstStep, pixels, 1);
+                       pstatus_t status;
+                       status = generic->alphaComp_argb((const BYTE*) sptr1, src1Step,
+                                               (const BYTE*) sptr2, src2Step,
+                                               (BYTE*) dptr, dstStep, pixels, 1);
+                       if (status != PRIMITIVES_SUCCESS)
+                               return status;
+
                        sptr1 += pixels;
                        sptr2 += pixels;
                        dptr  += pixels;
@@ -212,7 +220,7 @@ static pstatus_t ipp_alphaComp_argb(
        sz.width  = width;
        sz.height = height;
        return ippiAlphaComp_8u_AC4R(pSrc1, src1Step, pSrc2, src2Step,
-                                    pDst, dstStep, sz, ippAlphaOver);
+                                    pDst, dstStep, sz, ippAlphaOver);
 }
 #endif
 
index 30bc131..b2dce1d 100644 (file)
@@ -262,7 +262,7 @@ static pstatus_t general_RGBToYCbCr_16s16s_P3P3(
 
        for (y = 0; y < roi->height; y++)
        {
-               int x;
+               UINT32 x;
 
                for (x = 0; x < roi->width; ++x)
                {
@@ -305,10 +305,10 @@ static pstatus_t general_RGBToYCbCr_16s16s_P3P3(
 
 /* ------------------------------------------------------------------------- */
 static pstatus_t general_RGBToRGB_16s8u_P3AC4R(
-    const INT16* pSrc[3],      /* 16-bit R,G, and B arrays */
-    INT32 srcStep,                     /* bytes between rows in source data */
+    const INT16* const pSrc[3],        /* 16-bit R,G, and B arrays */
+    UINT32 srcStep,                    /* bytes between rows in source data */
     BYTE* pDst,                        /* 32-bit interleaved ARGB (ABGR?) data */
-    INT32 dstStep,                     /* bytes between rows in dest data */
+    UINT32 dstStep,                    /* bytes between rows in dest data */
     UINT32 DstFormat,
     const prim_size_t* roi)    /* region of interest */
 {
index ac5c662..f349fca 100644 (file)
@@ -91,7 +91,7 @@ static pstatus_t sse2_yCbCrToRGB_16s16s_P3P3(
        {
                /* We can't maintain 16-byte alignment. */
                return generic->yCbCrToRGB_16s16s_P3P3(pSrc, srcStep,
-                                                      pDst, dstStep, roi);
+                                                      pDst, dstStep, roi);
        }
 
        zero = _mm_setzero_si128();
@@ -228,7 +228,7 @@ static pstatus_t sse2_RGBToYCbCr_16s16s_P3P3(
        {
                /* We can't maintain 16-byte alignment. */
                return generic->RGBToYCbCr_16s16s_P3P3(pSrc, srcStep,
-                                                      pDst, dstStep, roi);
+                                                      pDst, dstStep, roi);
        }
 
        min = _mm_set1_epi16(-128 * 32);
@@ -357,10 +357,10 @@ static pstatus_t sse2_RGBToYCbCr_16s16s_P3P3(
        _mm_set1_epi32(0xFFFFFFFFU)
 
 pstatus_t sse2_RGBToRGB_16s8u_P3AC4R(
-    const INT16* pSrc[3],      /* 16-bit R,G, and B arrays */
-    INT32 srcStep,                     /* bytes between rows in source data */
+    const INT16* const pSrc[3],        /* 16-bit R,G, and B arrays */
+    UINT32 srcStep,                    /* bytes between rows in source data */
     BYTE* pDst,                                /* 32-bit interleaved ARGB (ABGR?) data */
-    INT32 dstStep,                     /* bytes between rows in dest data */
+    UINT32 dstStep,                    /* bytes between rows in dest data */
     UINT32 DstFormat,
     const prim_size_t* roi)    /* region of interest */
 {
@@ -385,9 +385,13 @@ pstatus_t sse2_RGBToRGB_16s8u_P3AC4R(
            || (dstStep & 0x0f))
        {
                return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst,
-                                                     dstStep, DstFormat, roi);
+                                                     dstStep, DstFormat, roi);
        }
 
+       // TODO: Need to update SSE code to allow color conversion!!!
+       return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst,
+                                             dstStep, DstFormat, roi);
+
        out = (BYTE*) pDst;
        srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
        dstbump = (dstStep - (roi->width * sizeof(UINT32)));
index 3770e50..b9e9e0e 100644 (file)
@@ -29,7 +29,7 @@
 static pstatus_t general_set_8u(
     BYTE val,
     BYTE* pDst,
-    INT32 len)
+    UINT32 len)
 {
        memset((void*) pDst, (int) val, (size_t) len);
        return PRIMITIVES_SUCCESS;
@@ -48,7 +48,7 @@ static pstatus_t general_zero(
 static pstatus_t general_set_32s(
     INT32 val,
     INT32* pDst,
-    INT32 len)
+    UINT32 len)
 {
        INT32* dptr = (INT32*) pDst;
        size_t span, remaining;
@@ -85,7 +85,7 @@ static pstatus_t general_set_32s(
 static pstatus_t general_set_32u(
     UINT32 val,
     UINT32* pDst,
-    INT32 len)
+    UINT32 len)
 {
        UINT32* dptr = (UINT32*) pDst;
        size_t span, remaining;
index a5f11c3..97b49bc 100644 (file)
@@ -40,7 +40,7 @@ static primitives_t* generic = NULL;
 static pstatus_t sse2_set_8u(
     BYTE val,
     BYTE* pDst,
-    INT32 len)
+    UINT32 len)
 {
        BYTE byte, *dptr;
        __m128i xmm0;
@@ -126,7 +126,7 @@ static pstatus_t sse2_set_8u(
 static pstatus_t sse2_set_32u(
     UINT32 val,
     UINT32* pDst,
-    INT32 len)
+    UINT32 len)
 {
        const primitives_t* prim = primitives_get_generic();
        UINT32* dptr = (UINT32*) pDst;
@@ -218,7 +218,7 @@ static pstatus_t sse2_set_32u(
 static pstatus_t sse2_set_32s(
     INT32 val,
     INT32* pDst,
-    INT32 len)
+    UINT32 len)
 {
        UINT32 uval = *((UINT32*) &val);
        return sse2_set_32u(uval, (UINT32*) pDst, len);
index 20d9e2e..684a39c 100644 (file)
@@ -24,9 +24,9 @@
 /* ------------------------------------------------------------------------- */
 static pstatus_t general_lShiftC_16s(
     const INT16* pSrc,
-    INT32 val,
+    UINT32 val,
     INT16* pDst,
-    INT32 len)
+    UINT32 len)
 {
        if (val == 0) return PRIMITIVES_SUCCESS;
 
@@ -38,9 +38,9 @@ static pstatus_t general_lShiftC_16s(
 /* ------------------------------------------------------------------------- */
 static pstatus_t general_rShiftC_16s(
     const INT16* pSrc,
-    INT32 val,
+    UINT32 val,
     INT16* pDst,
-    INT32 len)
+    UINT32 len)
 {
        if (val == 0) return PRIMITIVES_SUCCESS;
 
@@ -52,9 +52,9 @@ static pstatus_t general_rShiftC_16s(
 /* ------------------------------------------------------------------------- */
 static pstatus_t general_lShiftC_16u(
     const UINT16* pSrc,
-    INT32 val,
+    UINT32 val,
     UINT16* pDst,
-    INT32 len)
+    UINT32 len)
 {
        if (val == 0) return PRIMITIVES_SUCCESS;
 
@@ -66,9 +66,9 @@ static pstatus_t general_lShiftC_16u(
 /* ------------------------------------------------------------------------- */
 static pstatus_t general_rShiftC_16u(
     const UINT16* pSrc,
-    INT32 val,
+    UINT32 val,
     UINT16* pDst,
-    INT32 len)
+    UINT32 len)
 {
        if (val == 0) return PRIMITIVES_SUCCESS;
 
@@ -82,7 +82,7 @@ static pstatus_t general_shiftC_16s(
     const INT16* pSrc,
     INT32 val,
     INT16* pDst,
-    INT32 len)
+    UINT32 len)
 {
        if (val == 0) return PRIMITIVES_SUCCESS;
 
@@ -95,7 +95,7 @@ static pstatus_t general_shiftC_16u(
     const UINT16* pSrc,
     INT32 val,
     UINT16* pDst,
-    INT32 len)
+    UINT32 len)
 {
        if (val == 0) return PRIMITIVES_SUCCESS;
 
index df0fe2b..617330f 100644 (file)
@@ -39,16 +39,16 @@ static primitives_t* generic = NULL;
 # if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
 /* ------------------------------------------------------------------------- */
 SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic->lShiftC_16s,
-                 _mm_slli_epi16, *dptr++ = *sptr++ << val)
+                _mm_slli_epi16, *dptr++ = *sptr++ << val) /* left shift of each 16-bit lane; scalar tail uses << */
 /* ------------------------------------------------------------------------- */
 SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, generic->rShiftC_16s,
-                 _mm_srai_epi16, *dptr++ = *sptr++ >> val)
+                _mm_srai_epi16, *dptr++ = *sptr++ >> val) /* srai: arithmetic (sign-propagating) right shift for signed lanes */
 /* ------------------------------------------------------------------------- */
 SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, generic->lShiftC_16u,
-                 _mm_slli_epi16, *dptr++ = *sptr++ << val)
+                _mm_slli_epi16, *dptr++ = *sptr++ << val) /* same intrinsic as the signed variant, UINT16 element type */
 /* ------------------------------------------------------------------------- */
 SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, generic->rShiftC_16u,
-                 _mm_srli_epi16, *dptr++ = *sptr++ >> val)
+                _mm_srli_epi16, *dptr++ = *sptr++ >> val) /* srli: logical (zero-filling) right shift for unsigned lanes */
 # endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
 #endif
 
index b64b297..26cf7c2 100644 (file)
@@ -28,7 +28,7 @@
 static pstatus_t general_sign_16s(
     const INT16* pSrc,
     INT16* pDst,
-    INT32 len)
+    UINT32 len)
 {
        while (len--)
        {
index e9b0373..4f01e97 100644 (file)
@@ -35,7 +35,7 @@ static primitives_t* generic = NULL;
 static pstatus_t ssse3_sign_16s(
     const INT16* pSrc,
     INT16* pDst,
-    INT32 len)
+    UINT32 len)
 {
        const INT16* sptr = (const INT16*) pSrc;
        INT16* dptr = (INT16*) pDst;
index b58266d..84d586f 100644 (file)
  * SCD = Source, Constant, Destination
  */
 /* Expands to `static pstatus_t _name_(pSrc, val, pDst, len)`, which applies
  * _op_(vector, val) to len elements of _type_ and writes them with aligned
  * 16-byte stores. _slowWay_ is the statement that processes one element via
  * sptr/dptr; _fallback_ is called for len < 16 or an element-misaligned pDst. */
 #define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
-       static pstatus_t _name_(const _type_ *pSrc, INT32 val, _type_ *pDst, INT32 len) \
-       { \
-               INT32 shifts; \
-               UINT32 offBeatMask; \
-               const _type_ *sptr = pSrc; \
-               _type_ *dptr = pDst; \
-               size_t count; \
-               if (len < 16)   /* pointless if too small */ \
-               { \
-                       return _fallback_(pSrc, val, pDst, len); \
-               } \
-               if      (sizeof(_type_) == 1) shifts = 1; \
-               else if (sizeof(_type_) == 2) shifts = 2; \
-               else if (sizeof(_type_) == 4) shifts = 3; \
-               else if (sizeof(_type_) == 8) shifts = 4; \
-               offBeatMask = (1 << (shifts - 1)) - 1; \
-               if ((ULONG_PTR) pDst & offBeatMask) \
-               { \
-                       /* Incrementing the pointer skips over 16-byte boundary. */ \
-                       return _fallback_(pSrc, val, pDst, len); \
-               } \
-               /* Get to the 16-byte boundary now. */ \
-               while ((ULONG_PTR) dptr & 0x0f) \
-               { \
-                       _slowWay_; \
-                       if (--len == 0) return PRIMITIVES_SUCCESS; \
-               } \
-               /* Use 8 128-bit SSE registers. */ \
-               count = len >> (8-shifts); \
-               len -= count << (8-shifts); \
-               if ((ULONG_PTR) sptr & 0x0f) \
-               { \
-                       while (count--) \
-                       { \
-                               __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
-                               xmm0 = _mm_lddqu_si128((__m128i *) sptr); \
-                               sptr += (16/sizeof(_type_)); \
-                               xmm1 = _mm_lddqu_si128((__m128i *) sptr); \
-                               sptr += (16/sizeof(_type_)); \
-                               xmm2 = _mm_lddqu_si128((__m128i *) sptr); \
-                               sptr += (16/sizeof(_type_)); \
-                               xmm3 = _mm_lddqu_si128((__m128i *) sptr); \
-                               sptr += (16/sizeof(_type_)); \
-                               xmm4 = _mm_lddqu_si128((__m128i *) sptr); \
-                               sptr += (16/sizeof(_type_)); \
-                               xmm5 = _mm_lddqu_si128((__m128i *) sptr); \
-                               sptr += (16/sizeof(_type_)); \
-                               xmm6 = _mm_lddqu_si128((__m128i *) sptr); \
-                               sptr += (16/sizeof(_type_)); \
-                               xmm7 = _mm_lddqu_si128((__m128i *) sptr); \
-                               sptr += (16/sizeof(_type_)); \
-                               xmm0 = _op_(xmm0, val); \
-                               xmm1 = _op_(xmm1, val); \
-                               xmm2 = _op_(xmm2, val); \
-                               xmm3 = _op_(xmm3, val); \
-                               xmm4 = _op_(xmm4, val); \
-                               xmm5 = _op_(xmm5, val); \
-                               xmm6 = _op_(xmm6, val); \
-                               xmm7 = _op_(xmm7, val); \
-                               _mm_store_si128((__m128i *) dptr, xmm0); \
-                               dptr += (16/sizeof(_type_)); \
-                               _mm_store_si128((__m128i *) dptr, xmm1); \
-                               dptr += (16/sizeof(_type_)); \
-                               _mm_store_si128((__m128i *) dptr, xmm2); \
-                               dptr += (16/sizeof(_type_)); \
-                               _mm_store_si128((__m128i *) dptr, xmm3); \
-                               dptr += (16/sizeof(_type_)); \
-                               _mm_store_si128((__m128i *) dptr, xmm4); \
-                               dptr += (16/sizeof(_type_)); \
-                               _mm_store_si128((__m128i *) dptr, xmm5); \
-                               dptr += (16/sizeof(_type_)); \
-                               _mm_store_si128((__m128i *) dptr, xmm6); \
-                               dptr += (16/sizeof(_type_)); \
-                               _mm_store_si128((__m128i *) dptr, xmm7); \
-                               dptr += (16/sizeof(_type_)); \
-                       } \
-               } \
-               else \
-               { \
-                       while (count--) \
-                       { \
-                               __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
-                               xmm0 = _mm_load_si128((__m128i *) sptr); \
-                               sptr += (16/sizeof(_type_)); \
-                               xmm1 = _mm_load_si128((__m128i *) sptr); \
-                               sptr += (16/sizeof(_type_)); \
-                               xmm2 = _mm_load_si128((__m128i *) sptr); \
-                               sptr += (16/sizeof(_type_)); \
-                               xmm3 = _mm_load_si128((__m128i *) sptr); \
-                               sptr += (16/sizeof(_type_)); \
-                               xmm4 = _mm_load_si128((__m128i *) sptr); \
-                               sptr += (16/sizeof(_type_)); \
-                               xmm5 = _mm_load_si128((__m128i *) sptr); \
-                               sptr += (16/sizeof(_type_)); \
-                               xmm6 = _mm_load_si128((__m128i *) sptr); \
-                               sptr += (16/sizeof(_type_)); \
-                               xmm7 = _mm_load_si128((__m128i *) sptr); \
-                               sptr += (16/sizeof(_type_)); \
-                               xmm0 = _op_(xmm0, val); \
-                               xmm1 = _op_(xmm1, val); \
-                               xmm2 = _op_(xmm2, val); \
-                               xmm3 = _op_(xmm3, val); \
-                               xmm4 = _op_(xmm4, val); \
-                               xmm5 = _op_(xmm5, val); \
-                               xmm6 = _op_(xmm6, val); \
-                               xmm7 = _op_(xmm7, val); \
-                               _mm_store_si128((__m128i *) dptr, xmm0); \
-                               dptr += (16/sizeof(_type_)); \
-                               _mm_store_si128((__m128i *) dptr, xmm1); \
-                               dptr += (16/sizeof(_type_)); \
-                               _mm_store_si128((__m128i *) dptr, xmm2); \
-                               dptr += (16/sizeof(_type_)); \
-                               _mm_store_si128((__m128i *) dptr, xmm3); \
-                               dptr += (16/sizeof(_type_)); \
-                               _mm_store_si128((__m128i *) dptr, xmm4); \
-                               dptr += (16/sizeof(_type_)); \
-                               _mm_store_si128((__m128i *) dptr, xmm5); \
-                               dptr += (16/sizeof(_type_)); \
-                               _mm_store_si128((__m128i *) dptr, xmm6); \
-                               dptr += (16/sizeof(_type_)); \
-                               _mm_store_si128((__m128i *) dptr, xmm7); \
-                               dptr += (16/sizeof(_type_)); \
-                       } \
-               } \
-               /* Use a single 128-bit SSE register. */ \
-               count = len >> (5-shifts); \
-               len -= count << (5-shifts); \
-               while (count--) \
-               { \
-                       __m128i xmm0 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \
-                       xmm0 = _op_(xmm0, val); \
-                       _mm_store_si128((__m128i *) dptr, xmm0); \
-                       dptr += (16/sizeof(_type_)); \
-               } \
-               /* Finish off the remainder. */ \
-               while (len--) { _slowWay_; } \
-               return PRIMITIVES_SUCCESS; \
+       static pstatus_t _name_(const _type_ *pSrc, UINT32 val, _type_ *pDst, UINT32 len) \
+    { \
+           INT32 shifts; /* log2(sizeof(_type_)) + 1; only assigned for sizes 1, 2, 4, 8 */ \
+           UINT32 offBeatMask; /* sizeof(_type_) - 1 */ \
+           const _type_ *sptr = pSrc; \
+           _type_ *dptr = pDst; \
+           size_t count; \
+           if (len < 16)   /* pointless if too small */ \
+        { \
+               return _fallback_(pSrc, val, pDst, len); \
+           } \
+           if      (sizeof(_type_) == 1) shifts = 1; \
+           else if (sizeof(_type_) == 2) shifts = 2; \
+           else if (sizeof(_type_) == 4) shifts = 3; \
+           else if (sizeof(_type_) == 8) shifts = 4; \
+           offBeatMask = (1 << (shifts - 1)) - 1; \
+           if ((ULONG_PTR) pDst & offBeatMask) \
+        { \
+               /* pDst is not element-aligned: stepping by whole elements would skip over every 16-byte boundary. */ \
+               return _fallback_(pSrc, val, pDst, len); \
+           } \
+           /* Get to the 16-byte boundary now. */ \
+           while ((ULONG_PTR) dptr & 0x0f) \
+        { \
+               _slowWay_; \
+               if (--len == 0) return PRIMITIVES_SUCCESS; \
+           } \
+           /* Use 8 128-bit SSE registers. */ \
+           count = len >> (8-shifts); /* 2^(8-shifts) elements = 128 bytes per iteration */ \
+           len -= count << (8-shifts); \
+           if ((ULONG_PTR) sptr & 0x0f) /* source not 16-byte aligned: unaligned loads, aligned stores */ \
+        { \
+               while (count--) \
+            { \
+                   __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
+                   xmm0 = _mm_lddqu_si128((__m128i *) sptr); \
+                   sptr += (16/sizeof(_type_)); \
+                   xmm1 = _mm_lddqu_si128((__m128i *) sptr); \
+                   sptr += (16/sizeof(_type_)); \
+                   xmm2 = _mm_lddqu_si128((__m128i *) sptr); \
+                   sptr += (16/sizeof(_type_)); \
+                   xmm3 = _mm_lddqu_si128((__m128i *) sptr); \
+                   sptr += (16/sizeof(_type_)); \
+                   xmm4 = _mm_lddqu_si128((__m128i *) sptr); \
+                   sptr += (16/sizeof(_type_)); \
+                   xmm5 = _mm_lddqu_si128((__m128i *) sptr); \
+                   sptr += (16/sizeof(_type_)); \
+                   xmm6 = _mm_lddqu_si128((__m128i *) sptr); \
+                   sptr += (16/sizeof(_type_)); \
+                   xmm7 = _mm_lddqu_si128((__m128i *) sptr); \
+                   sptr += (16/sizeof(_type_)); \
+                   xmm0 = _op_(xmm0, val); \
+                   xmm1 = _op_(xmm1, val); \
+                   xmm2 = _op_(xmm2, val); \
+                   xmm3 = _op_(xmm3, val); \
+                   xmm4 = _op_(xmm4, val); \
+                   xmm5 = _op_(xmm5, val); \
+                   xmm6 = _op_(xmm6, val); \
+                   xmm7 = _op_(xmm7, val); \
+                   _mm_store_si128((__m128i *) dptr, xmm0); \
+                   dptr += (16/sizeof(_type_)); \
+                   _mm_store_si128((__m128i *) dptr, xmm1); \
+                   dptr += (16/sizeof(_type_)); \
+                   _mm_store_si128((__m128i *) dptr, xmm2); \
+                   dptr += (16/sizeof(_type_)); \
+                   _mm_store_si128((__m128i *) dptr, xmm3); \
+                   dptr += (16/sizeof(_type_)); \
+                   _mm_store_si128((__m128i *) dptr, xmm4); \
+                   dptr += (16/sizeof(_type_)); \
+                   _mm_store_si128((__m128i *) dptr, xmm5); \
+                   dptr += (16/sizeof(_type_)); \
+                   _mm_store_si128((__m128i *) dptr, xmm6); \
+                   dptr += (16/sizeof(_type_)); \
+                   _mm_store_si128((__m128i *) dptr, xmm7); \
+                   dptr += (16/sizeof(_type_)); \
+               } \
+           } \
+           else \
+        { \
+               while (count--) \
+            { \
+                   __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
+                   xmm0 = _mm_load_si128((__m128i *) sptr); \
+                   sptr += (16/sizeof(_type_)); \
+                   xmm1 = _mm_load_si128((__m128i *) sptr); \
+                   sptr += (16/sizeof(_type_)); \
+                   xmm2 = _mm_load_si128((__m128i *) sptr); \
+                   sptr += (16/sizeof(_type_)); \
+                   xmm3 = _mm_load_si128((__m128i *) sptr); \
+                   sptr += (16/sizeof(_type_)); \
+                   xmm4 = _mm_load_si128((__m128i *) sptr); \
+                   sptr += (16/sizeof(_type_)); \
+                   xmm5 = _mm_load_si128((__m128i *) sptr); \
+                   sptr += (16/sizeof(_type_)); \
+                   xmm6 = _mm_load_si128((__m128i *) sptr); \
+                   sptr += (16/sizeof(_type_)); \
+                   xmm7 = _mm_load_si128((__m128i *) sptr); \
+                   sptr += (16/sizeof(_type_)); \
+                   xmm0 = _op_(xmm0, val); \
+                   xmm1 = _op_(xmm1, val); \
+                   xmm2 = _op_(xmm2, val); \
+                   xmm3 = _op_(xmm3, val); \
+                   xmm4 = _op_(xmm4, val); \
+                   xmm5 = _op_(xmm5, val); \
+                   xmm6 = _op_(xmm6, val); \
+                   xmm7 = _op_(xmm7, val); \
+                   _mm_store_si128((__m128i *) dptr, xmm0); \
+                   dptr += (16/sizeof(_type_)); \
+                   _mm_store_si128((__m128i *) dptr, xmm1); \
+                   dptr += (16/sizeof(_type_)); \
+                   _mm_store_si128((__m128i *) dptr, xmm2); \
+                   dptr += (16/sizeof(_type_)); \
+                   _mm_store_si128((__m128i *) dptr, xmm3); \
+                   dptr += (16/sizeof(_type_)); \
+                   _mm_store_si128((__m128i *) dptr, xmm4); \
+                   dptr += (16/sizeof(_type_)); \
+                   _mm_store_si128((__m128i *) dptr, xmm5); \
+                   dptr += (16/sizeof(_type_)); \
+                   _mm_store_si128((__m128i *) dptr, xmm6); \
+                   dptr += (16/sizeof(_type_)); \
+                   _mm_store_si128((__m128i *) dptr, xmm7); \
+                   dptr += (16/sizeof(_type_)); \
+               } \
+           } \
+           /* Use a single 128-bit SSE register. */ \
+           count = len >> (5-shifts); /* 2^(5-shifts) elements = one 16-byte vector */ \
+           len -= count << (5-shifts); \
+           while (count--) \
+        { \
+               __m128i xmm0 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \
+               xmm0 = _op_(xmm0, val); \
+               _mm_store_si128((__m128i *) dptr, xmm0); \
+               dptr += (16/sizeof(_type_)); \
+           } \
+           /* Finish off the remainder. */ \
+           while (len--) { _slowWay_; } \
+           return PRIMITIVES_SUCCESS; \
        }
 
 /* ----------------------------------------------------------------------------
  * SCD_PRE = Source, Constant (pre-loaded into an SSE register), Destination
  */
 /* Expands to `pstatus_t _name_(pSrc, val, pDst, len)`: val is broadcast once
  * with _mm_set1_epi32 and _op_(vector, broadcast) is applied to len elements
  * of _type_, written with aligned 16-byte stores. _slowWay_ processes one
  * element via sptr/dptr; _fallback_ handles len < 16 or an element-misaligned
  * pDst. NOTE(review): len is still INT32 here while SSE3_SCD_ROUTINE now takes
  * UINT32 - confirm this matches the function-pointer typedef it is assigned to. */
 #define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
        pstatus_t _name_(const _type_ *pSrc, _type_ val, _type_ *pDst, INT32 len) \
-       { \
-               int shifts; \
-               UINT32 offBeatMask; \
-               const _type_ *sptr = pSrc; \
-               _type_ *dptr = pDst; \
-               size_t count; \
-               __m128i xmm0; \
-               if (len < 16) /* pointless if too small */ \
-               { \
-                       return _fallback_(pSrc, val, pDst, len); \
-               } \
-               if      (sizeof(_type_) == 1) shifts = 1; \
-               else if (sizeof(_type_) == 2) shifts = 2; \
-               else if (sizeof(_type_) == 4) shifts = 3; \
-               else if (sizeof(_type_) == 8) shifts = 4; \
-               offBeatMask = (1 << (shifts - 1)) - 1; \
-               if ((ULONG_PTR) pDst & offBeatMask) \
-               { \
-                       /* Incrementing the pointer skips over 16-byte boundary. */ \
-                       return _fallback_(pSrc, val, pDst, len); \
-               } \
-               /* Get to the 16-byte boundary now. */ \
-               while ((ULONG_PTR) dptr & 0x0f) \
-               { \
-                       _slowWay_; \
-                       if (--len == 0) return PRIMITIVES_SUCCESS; \
-               } \
-               /* Use 4 128-bit SSE registers. */ \
-               count = len >> (7-shifts); \
-               len -= count << (7-shifts); \
-               xmm0 = _mm_set1_epi32(val); \
-               if ((ULONG_PTR) sptr & 0x0f) \
-               { \
-                       while (count--) \
-                       { \
-                               __m128i xmm1, xmm2, xmm3, xmm4; \
-                               xmm1 = _mm_lddqu_si128((__m128i *) sptr); \
-                               sptr += (16/sizeof(_type_)); \
-                               xmm2 = _mm_lddqu_si128((__m128i *) sptr); \
-                               sptr += (16/sizeof(_type_)); \
-                               xmm3 = _mm_lddqu_si128((__m128i *) sptr); \
-                               sptr += (16/sizeof(_type_)); \
-                               xmm4 = _mm_lddqu_si128((__m128i *) sptr); \
-                               sptr += (16/sizeof(_type_)); \
-                               xmm1 = _op_(xmm1, xmm0); \
-                               xmm2 = _op_(xmm2, xmm0); \
-                               xmm3 = _op_(xmm3, xmm0); \
-                               xmm4 = _op_(xmm4, xmm0); \
-                               _mm_store_si128((__m128i *) dptr, xmm1); \
-                               dptr += (16/sizeof(_type_)); \
-                               _mm_store_si128((__m128i *) dptr, xmm2); \
-                               dptr += (16/sizeof(_type_)); \
-                               _mm_store_si128((__m128i *) dptr, xmm3); \
-                               dptr += (16/sizeof(_type_)); \
-                               _mm_store_si128((__m128i *) dptr, xmm4); \
-                               dptr += (16/sizeof(_type_)); \
-                       } \
-               } \
-               else \
-               { \
-                       while (count--) \
-                       { \
-                               __m128i xmm1, xmm2, xmm3, xmm4; \
-                               xmm1 = _mm_load_si128((__m128i *) sptr); \
-                               sptr += (16/sizeof(_type_)); \
-                               xmm2 = _mm_load_si128((__m128i *) sptr); \
-                               sptr += (16/sizeof(_type_)); \
-                               xmm3 = _mm_load_si128((__m128i *) sptr); \
-                               sptr += (16/sizeof(_type_)); \
-                               xmm4 = _mm_load_si128((__m128i *) sptr); \
-                               sptr += (16/sizeof(_type_)); \
-                               xmm1 = _op_(xmm1, xmm0); \
-                               xmm2 = _op_(xmm2, xmm0); \
-                               xmm3 = _op_(xmm3, xmm0); \
-                               xmm4 = _op_(xmm4, xmm0); \
-                               _mm_store_si128((__m128i *) dptr, xmm1); \
-                               dptr += (16/sizeof(_type_)); \
-                               _mm_store_si128((__m128i *) dptr, xmm2); \
-                               dptr += (16/sizeof(_type_)); \
-                               _mm_store_si128((__m128i *) dptr, xmm3); \
-                               dptr += (16/sizeof(_type_)); \
-                               _mm_store_si128((__m128i *) dptr, xmm4); \
-                               dptr += (16/sizeof(_type_)); \
-                       } \
-               } \
-               /* Use a single 128-bit SSE register. */ \
-               count = len >> (5-shifts); \
-               len -= count << (5-shifts); \
-               while (count--) \
-               { \
-                       __m128i xmm1 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \
-                       xmm1 = _op_(xmm1, xmm0); \
-                       _mm_store_si128((__m128i *) dptr, xmm1); \
-                       dptr += (16/sizeof(_type_)); \
-               } \
-               /* Finish off the remainder. */ \
-               while (len--) { _slowWay_; } \
-               return PRIMITIVES_SUCCESS; \
+    { \
+           int shifts; /* log2(sizeof(_type_)) + 1; only assigned for sizes 1, 2, 4, 8 */ \
+           UINT32 offBeatMask; /* sizeof(_type_) - 1 */ \
+           const _type_ *sptr = pSrc; \
+           _type_ *dptr = pDst; \
+           size_t count; \
+           __m128i xmm0; \
+           if (len < 16) /* pointless if too small */ \
+        { \
+               return _fallback_(pSrc, val, pDst, len); \
+           } \
+           if      (sizeof(_type_) == 1) shifts = 1; \
+           else if (sizeof(_type_) == 2) shifts = 2; \
+           else if (sizeof(_type_) == 4) shifts = 3; \
+           else if (sizeof(_type_) == 8) shifts = 4; \
+           offBeatMask = (1 << (shifts - 1)) - 1; \
+           if ((ULONG_PTR) pDst & offBeatMask) \
+        { \
+               /* pDst is not element-aligned: stepping by whole elements would skip over every 16-byte boundary. */ \
+               return _fallback_(pSrc, val, pDst, len); \
+           } \
+           /* Get to the 16-byte boundary now. */ \
+           while ((ULONG_PTR) dptr & 0x0f) \
+        { \
+               _slowWay_; \
+               if (--len == 0) return PRIMITIVES_SUCCESS; \
+           } \
+           /* Use 4 128-bit SSE registers. */ \
+           count = len >> (7-shifts); /* 2^(7-shifts) elements = 64 bytes per iteration */ \
+           len -= count << (7-shifts); \
+           xmm0 = _mm_set1_epi32(val); /* val replicated into every 32-bit lane; reused by all vector loops below */ \
+           if ((ULONG_PTR) sptr & 0x0f) /* source not 16-byte aligned: unaligned loads, aligned stores */ \
+        { \
+               while (count--) \
+            { \
+                   __m128i xmm1, xmm2, xmm3, xmm4; \
+                   xmm1 = _mm_lddqu_si128((__m128i *) sptr); \
+                   sptr += (16/sizeof(_type_)); \
+                   xmm2 = _mm_lddqu_si128((__m128i *) sptr); \
+                   sptr += (16/sizeof(_type_)); \
+                   xmm3 = _mm_lddqu_si128((__m128i *) sptr); \
+                   sptr += (16/sizeof(_type_)); \
+                   xmm4 = _mm_lddqu_si128((__m128i *) sptr); \
+                   sptr += (16/sizeof(_type_)); \
+                   xmm1 = _op_(xmm1, xmm0); \
+                   xmm2 = _op_(xmm2, xmm0); \
+                   xmm3 = _op_(xmm3, xmm0); \
+                   xmm4 = _op_(xmm4, xmm0); \
+                   _mm_store_si128((__m128i *) dptr, xmm1); \
+                   dptr += (16/sizeof(_type_)); \
+                   _mm_store_si128((__m128i *) dptr, xmm2); \
+                   dptr += (16/sizeof(_type_)); \
+                   _mm_store_si128((__m128i *) dptr, xmm3); \
+                   dptr += (16/sizeof(_type_)); \
+                   _mm_store_si128((__m128i *) dptr, xmm4); \
+                   dptr += (16/sizeof(_type_)); \
+               } \
+           } \
+           else \
+        { \
+               while (count--) \
+            { \
+                   __m128i xmm1, xmm2, xmm3, xmm4; \
+                   xmm1 = _mm_load_si128((__m128i *) sptr); \
+                   sptr += (16/sizeof(_type_)); \
+                   xmm2 = _mm_load_si128((__m128i *) sptr); \
+                   sptr += (16/sizeof(_type_)); \
+                   xmm3 = _mm_load_si128((__m128i *) sptr); \
+                   sptr += (16/sizeof(_type_)); \
+                   xmm4 = _mm_load_si128((__m128i *) sptr); \
+                   sptr += (16/sizeof(_type_)); \
+                   xmm1 = _op_(xmm1, xmm0); \
+                   xmm2 = _op_(xmm2, xmm0); \
+                   xmm3 = _op_(xmm3, xmm0); \
+                   xmm4 = _op_(xmm4, xmm0); \
+                   _mm_store_si128((__m128i *) dptr, xmm1); \
+                   dptr += (16/sizeof(_type_)); \
+                   _mm_store_si128((__m128i *) dptr, xmm2); \
+                   dptr += (16/sizeof(_type_)); \
+                   _mm_store_si128((__m128i *) dptr, xmm3); \
+                   dptr += (16/sizeof(_type_)); \
+                   _mm_store_si128((__m128i *) dptr, xmm4); \
+                   dptr += (16/sizeof(_type_)); \
+               } \
+           } \
+           /* Use a single 128-bit SSE register. */ \
+           count = len >> (5-shifts); /* 2^(5-shifts) elements = one 16-byte vector */ \
+           len -= count << (5-shifts); \
+           while (count--) \
+        { \
+               __m128i xmm1 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \
+               xmm1 = _op_(xmm1, xmm0); \
+               _mm_store_si128((__m128i *) dptr, xmm1); \
+               dptr += (16/sizeof(_type_)); \
+           } \
+           /* Finish off the remainder. */ \
+           while (len--) { _slowWay_; } \
+           return PRIMITIVES_SUCCESS; \
        }
 
 /* ----------------------------------------------------------------------------
  * SSD = Source1, Source2, Destination
  */
 #define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
-       pstatus_t _name_(const _type_ *pSrc1, const _type_ *pSrc2, _type_ *pDst, INT32 len) \
-       { \
-               int shifts; \
-               UINT32 offBeatMask; \
-               const _type_ *sptr1 = pSrc1; \
-               const _type_ *sptr2 = pSrc2; \
-               _type_ *dptr = pDst; \
-               size_t count; \
-               if (len < 16) /* pointless if too small */ \
-               { \
-                       return _fallback_(pSrc1, pSrc2, pDst, len); \
-               } \
-               if      (sizeof(_type_) == 1) shifts = 1; \
-               else if (sizeof(_type_) == 2) shifts = 2; \
-               else if (sizeof(_type_) == 4) shifts = 3; \
-               else if (sizeof(_type_) == 8) shifts = 4; \
-               offBeatMask = (1 << (shifts - 1)) - 1; \
-               if ((ULONG_PTR) pDst & offBeatMask) \
-               { \
-                       /* Incrementing the pointer skips over 16-byte boundary. */ \
-                       return _fallback_(pSrc1, pSrc2, pDst, len); \
-               } \
-               /* Get to the 16-byte boundary now. */ \
-               while ((ULONG_PTR) dptr & 0x0f) \
-               { \
-                       _slowWay_; \
-                       if (--len == 0) return PRIMITIVES_SUCCESS; \
-               } \
-               /* Use 4 128-bit SSE registers. */ \
-               count = len >> (7-shifts); \
-               len -= count << (7-shifts); \
-               if (((ULONG_PTR) sptr1 & 0x0f) || ((ULONG_PTR) sptr2 & 0x0f)) \
-               { \
-                       /* Unaligned loads */ \
-                       while (count--) \
-                       { \
-                               __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
-                               xmm0 = _mm_lddqu_si128((__m128i *) sptr1); \
-                               sptr1 += (16/sizeof(_type_)); \
-                               xmm1 = _mm_lddqu_si128((__m128i *) sptr1); \
-                               sptr1 += (16/sizeof(_type_)); \
-                               xmm2 = _mm_lddqu_si128((__m128i *) sptr1); \
-                               sptr1 += (16/sizeof(_type_)); \
-                               xmm3 = _mm_lddqu_si128((__m128i *) sptr1); \
-                               sptr1 += (16/sizeof(_type_)); \
-                               xmm4 = _mm_lddqu_si128((__m128i *) sptr2); \
-                               sptr2 += (16/sizeof(_type_)); \
-                               xmm5 = _mm_lddqu_si128((__m128i *) sptr2); \
-                               sptr2 += (16/sizeof(_type_)); \
-                               xmm6 = _mm_lddqu_si128((__m128i *) sptr2); \
-                               sptr2 += (16/sizeof(_type_)); \
-                               xmm7 = _mm_lddqu_si128((__m128i *) sptr2); \
-                               sptr2 += (16/sizeof(_type_)); \
-                               xmm0 = _op_(xmm0, xmm4); \
-                               xmm1 = _op_(xmm1, xmm5); \
-                               xmm2 = _op_(xmm2, xmm6); \
-                               xmm3 = _op_(xmm3, xmm7); \
-                               _mm_store_si128((__m128i *) dptr, xmm0); \
-                               dptr += (16/sizeof(_type_)); \
-                               _mm_store_si128((__m128i *) dptr, xmm1); \
-                               dptr += (16/sizeof(_type_)); \
-                               _mm_store_si128((__m128i *) dptr, xmm2); \
-                               dptr += (16/sizeof(_type_)); \
-                               _mm_store_si128((__m128i *) dptr, xmm3); \
-                               dptr += (16/sizeof(_type_)); \
-                       } \
-               } \
-               else \
-               { \
-                       /* Aligned loads */ \
-                       while (count--) \
-                       { \
-                               __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
-                               xmm0 = _mm_load_si128((__m128i *) sptr1); \
-                               sptr1 += (16/sizeof(_type_)); \
-                               xmm1 = _mm_load_si128((__m128i *) sptr1); \
-                               sptr1 += (16/sizeof(_type_)); \
-                               xmm2 = _mm_load_si128((__m128i *) sptr1); \
-                               sptr1 += (16/sizeof(_type_)); \
-                               xmm3 = _mm_load_si128((__m128i *) sptr1); \
-                               sptr1 += (16/sizeof(_type_)); \
-                               xmm4 = _mm_load_si128((__m128i *) sptr2); \
-                               sptr2 += (16/sizeof(_type_)); \
-                               xmm5 = _mm_load_si128((__m128i *) sptr2); \
-                               sptr2 += (16/sizeof(_type_)); \
-                               xmm6 = _mm_load_si128((__m128i *) sptr2); \
-                               sptr2 += (16/sizeof(_type_)); \
-                               xmm7 = _mm_load_si128((__m128i *) sptr2); \
-                               sptr2 += (16/sizeof(_type_)); \
-                               xmm0 = _op_(xmm0, xmm4); \
-                               xmm1 = _op_(xmm1, xmm5); \
-                               xmm2 = _op_(xmm2, xmm6); \
-                               xmm3 = _op_(xmm3, xmm7); \
-                               _mm_store_si128((__m128i *) dptr, xmm0); \
-                               dptr += (16/sizeof(_type_)); \
-                               _mm_store_si128((__m128i *) dptr, xmm1); \
-                               dptr += (16/sizeof(_type_)); \
-                               _mm_store_si128((__m128i *) dptr, xmm2); \
-                               dptr += (16/sizeof(_type_)); \
-                               _mm_store_si128((__m128i *) dptr, xmm3); \
-                               dptr += (16/sizeof(_type_)); \
-                       } \
-               } \
-               /* Use a single 128-bit SSE register. */ \
-               count = len >> (5-shifts); \
-               len -= count << (5-shifts); \
-               while (count--) \
-               { \
-                       __m128i xmm0, xmm1; \
-                       xmm0 = LOAD_SI128(sptr1);  sptr1 += (16/sizeof(_type_)); \
-                       xmm1 = LOAD_SI128(sptr2);  sptr2 += (16/sizeof(_type_)); \
-                       xmm0 = _op_(xmm0, xmm1); \
-                       _mm_store_si128((__m128i *) dptr, xmm0); \
-                       dptr += (16/sizeof(_type_)); \
-               } \
-               /* Finish off the remainder. */ \
-               while (len--) { _slowWay_; } \
-               return PRIMITIVES_SUCCESS; \
+       pstatus_t _name_(const _type_ *pSrc1, const _type_ *pSrc2, _type_ *pDst, UINT32 len) \
+    { \
+           int shifts; \
+           UINT32 offBeatMask; \
+           const _type_ *sptr1 = pSrc1; \
+           const _type_ *sptr2 = pSrc2; \
+           _type_ *dptr = pDst; \
+           size_t count; \
+           if (len < 16) /* pointless if too small */ \
+        { \
+               return _fallback_(pSrc1, pSrc2, pDst, len); \
+           } \
+           if      (sizeof(_type_) == 1) shifts = 1; \
+           else if (sizeof(_type_) == 2) shifts = 2; \
+           else if (sizeof(_type_) == 4) shifts = 3; \
+           else if (sizeof(_type_) == 8) shifts = 4; \
+           offBeatMask = (1 << (shifts - 1)) - 1; \
+           if ((ULONG_PTR) pDst & offBeatMask) \
+        { \
+               /* Incrementing the pointer skips over 16-byte boundary. */ \
+               return _fallback_(pSrc1, pSrc2, pDst, len); \
+           } \
+           /* Get to the 16-byte boundary now. */ \
+           while ((ULONG_PTR) dptr & 0x0f) \
+        { \
+               pstatus_t status; \
+               status = _slowWay_; \
+               if (status != PRIMITIVES_SUCCESS) return status; \
+               if (--len == 0) return PRIMITIVES_SUCCESS; \
+           } \
+           /* Use 4 128-bit SSE registers. */ \
+           count = len >> (7-shifts); \
+           len -= count << (7-shifts); \
+           if (((ULONG_PTR) sptr1 & 0x0f) || ((ULONG_PTR) sptr2 & 0x0f)) \
+        { \
+               /* Unaligned loads */ \
+               while (count--) \
+            { \
+                   __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
+                   xmm0 = _mm_lddqu_si128((__m128i *) sptr1); \
+                   sptr1 += (16/sizeof(_type_)); \
+                   xmm1 = _mm_lddqu_si128((__m128i *) sptr1); \
+                   sptr1 += (16/sizeof(_type_)); \
+                   xmm2 = _mm_lddqu_si128((__m128i *) sptr1); \
+                   sptr1 += (16/sizeof(_type_)); \
+                   xmm3 = _mm_lddqu_si128((__m128i *) sptr1); \
+                   sptr1 += (16/sizeof(_type_)); \
+                   xmm4 = _mm_lddqu_si128((__m128i *) sptr2); \
+                   sptr2 += (16/sizeof(_type_)); \
+                   xmm5 = _mm_lddqu_si128((__m128i *) sptr2); \
+                   sptr2 += (16/sizeof(_type_)); \
+                   xmm6 = _mm_lddqu_si128((__m128i *) sptr2); \
+                   sptr2 += (16/sizeof(_type_)); \
+                   xmm7 = _mm_lddqu_si128((__m128i *) sptr2); \
+                   sptr2 += (16/sizeof(_type_)); \
+                   xmm0 = _op_(xmm0, xmm4); \
+                   xmm1 = _op_(xmm1, xmm5); \
+                   xmm2 = _op_(xmm2, xmm6); \
+                   xmm3 = _op_(xmm3, xmm7); \
+                   _mm_store_si128((__m128i *) dptr, xmm0); \
+                   dptr += (16/sizeof(_type_)); \
+                   _mm_store_si128((__m128i *) dptr, xmm1); \
+                   dptr += (16/sizeof(_type_)); \
+                   _mm_store_si128((__m128i *) dptr, xmm2); \
+                   dptr += (16/sizeof(_type_)); \
+                   _mm_store_si128((__m128i *) dptr, xmm3); \
+                   dptr += (16/sizeof(_type_)); \
+               } \
+           } \
+           else \
+        { \
+               /* Aligned loads */ \
+               while (count--) \
+            { \
+                   __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
+                   xmm0 = _mm_load_si128((__m128i *) sptr1); \
+                   sptr1 += (16/sizeof(_type_)); \
+                   xmm1 = _mm_load_si128((__m128i *) sptr1); \
+                   sptr1 += (16/sizeof(_type_)); \
+                   xmm2 = _mm_load_si128((__m128i *) sptr1); \
+                   sptr1 += (16/sizeof(_type_)); \
+                   xmm3 = _mm_load_si128((__m128i *) sptr1); \
+                   sptr1 += (16/sizeof(_type_)); \
+                   xmm4 = _mm_load_si128((__m128i *) sptr2); \
+                   sptr2 += (16/sizeof(_type_)); \
+                   xmm5 = _mm_load_si128((__m128i *) sptr2); \
+                   sptr2 += (16/sizeof(_type_)); \
+                   xmm6 = _mm_load_si128((__m128i *) sptr2); \
+                   sptr2 += (16/sizeof(_type_)); \
+                   xmm7 = _mm_load_si128((__m128i *) sptr2); \
+                   sptr2 += (16/sizeof(_type_)); \
+                   xmm0 = _op_(xmm0, xmm4); \
+                   xmm1 = _op_(xmm1, xmm5); \
+                   xmm2 = _op_(xmm2, xmm6); \
+                   xmm3 = _op_(xmm3, xmm7); \
+                   _mm_store_si128((__m128i *) dptr, xmm0); \
+                   dptr += (16/sizeof(_type_)); \
+                   _mm_store_si128((__m128i *) dptr, xmm1); \
+                   dptr += (16/sizeof(_type_)); \
+                   _mm_store_si128((__m128i *) dptr, xmm2); \
+                   dptr += (16/sizeof(_type_)); \
+                   _mm_store_si128((__m128i *) dptr, xmm3); \
+                   dptr += (16/sizeof(_type_)); \
+               } \
+           } \
+           /* Use a single 128-bit SSE register. */ \
+           count = len >> (5-shifts); \
+           len -= count << (5-shifts); \
+           while (count--) \
+        { \
+               __m128i xmm0, xmm1; \
+               xmm0 = LOAD_SI128(sptr1);  sptr1 += (16/sizeof(_type_)); \
+               xmm1 = LOAD_SI128(sptr2);  sptr2 += (16/sizeof(_type_)); \
+               xmm0 = _op_(xmm0, xmm1); \
+               _mm_store_si128((__m128i *) dptr, xmm0); \
+               dptr += (16/sizeof(_type_)); \
+           } \
+           /* Finish off the remainder. */ \
+           while (len--) { _slowWay_; } \
+           return PRIMITIVES_SUCCESS; \
        }
 
 #endif /* !__PRIM_TEMPLATES_H_INCLUDED__ */
index 3b803a6..987342a 100644 (file)
@@ -81,6 +81,6 @@ primitives_t* primitives_get_generic(void)
        if (!pPrimitivesGenericInitialized)
                primitives_init_generic();
 
-       return &pPrimitives;
+       return &pPrimitivesGeneric;
 }
 
index ade6f49..125752b 100644 (file)
@@ -26,7 +26,7 @@ static BOOL test_add16s_func(void)
        pstatus_t status;
 
        INT16 ALIGN(src1[FUNC_TEST_SIZE + 3]), ALIGN(src2[FUNC_TEST_SIZE + 3]),
-             ALIGN(d1[FUNC_TEST_SIZE + 3]), ALIGN(d2[FUNC_TEST_SIZE + 3]);
+                       ALIGN(d1[FUNC_TEST_SIZE + 3]), ALIGN(d2[FUNC_TEST_SIZE + 3]);
 
        char testStr[256];
        testStr[0] = '\0';
@@ -50,7 +50,7 @@ static BOOL test_add16s_func(void)
 static BOOL test_add16s_speed(void)
 {
        BYTE ALIGN(src1[MAX_TEST_SIZE + 3]), ALIGN(src2[MAX_TEST_SIZE + 3]),
-             ALIGN(dst[MAX_TEST_SIZE + 3]);
+                       ALIGN(dst[MAX_TEST_SIZE + 3]);
 
        if (!g_TestPrimitivesPerformance)
                return TRUE;
@@ -59,7 +59,8 @@ static BOOL test_add16s_speed(void)
        winpr_RAND(src2, sizeof(src2));
 
        if (!speed_test("add16s", "aligned", g_Iterations,
-                       generic->add_16s, optimized->add_16s,
+                       (speed_test_fkt)generic->add_16s,
+                       (speed_test_fkt)optimized->add_16s,
                        src1, src2, dst, FUNC_TEST_SIZE))
                return FALSE;
 
@@ -72,8 +73,11 @@ int TestPrimitivesAdd(int argc, char* argv[])
        if (!test_add16s_func())
                return -1;
 
-       if (!test_add16s_speed())
-               return -1;
+       if (g_TestPrimitivesPerformance)
+       {
+               if (!test_add16s_speed())
+                       return -1;
+       }
 
        return 0;
 }
index 9919c0c..c7f49cf 100644 (file)
@@ -33,8 +33,13 @@ static const int block_size[] = { 4, 64, 256 };
 #define GRN(_c_) (((_c_) & 0x0000FF00U) >> 8)
 #define BLU(_c_) ((_c_) & 0x000000FFU)
 #define TOLERANCE 1
-#define PIXEL(_addr_, _bytes_, _x_, _y_) \
-       ((UINT32 *) (((BYTE *) (_addr_)) + (_x_)*4 + (_y_)*(_bytes_)))
+static inline const UINT32* PIXEL(const BYTE* _addr_, UINT32 _bytes_, UINT32 _x_, UINT32 _y_) /* pointer to the 32-bit value at column _x_, row _y_ */
+{
+       const BYTE* addr = _addr_ + _x_ * sizeof(UINT32) + _y_ * _bytes_; /* _bytes_ is the byte offset between consecutive rows */
+
+       return (const UINT32*)addr;
+}
+
 #define SRC1_WIDTH 6
 #define SRC1_HEIGHT 6
 #define SRC2_WIDTH 7
@@ -46,8 +51,8 @@ static const int block_size[] = { 4, 64, 256 };
 
 /* ------------------------------------------------------------------------- */
 static UINT32 alpha_add(
-    UINT32 c1,
-    UINT32 c2)
+               UINT32 c1,
+               UINT32 c2)
 {
        UINT32 a1 = ALF(c1);
        UINT32 r1 = RED(c1);
@@ -66,8 +71,8 @@ static UINT32 alpha_add(
 
 /* ------------------------------------------------------------------------- */
 static UINT32 colordist(
-    UINT32 c1,
-    UINT32 c2)
+               UINT32 c1,
+               UINT32 c2)
 {
        int d, maxd = 0;
        d = ABS(ALF(c1) - ALF(c2));
@@ -90,10 +95,10 @@ static UINT32 colordist(
 }
 
 /* ------------------------------------------------------------------------- */
-static BOOL check(const BYTE* pSrc1,  INT32 src1Step,
-                 const BYTE* pSrc2,  INT32 src2Step,
-                 BYTE* pDst,  INT32 dstStep,
-                 INT32 width,  INT32 height)
+static BOOL check(const BYTE* pSrc1,  UINT32 src1Step,
+                 const BYTE* pSrc2,  UINT32 src2Step,
+                 BYTE* pDst,  UINT32 dstStep,
+                 UINT32 width,  UINT32 height)
 {
        UINT32 x, y;
        for (y = 0; y < height; ++y)
@@ -120,14 +125,14 @@ static BOOL check(const BYTE* pSrc1,  INT32 src1Step,
 static BOOL test_alphaComp_func(void)
 {
        pstatus_t status;
-       BYTE ALIGN(src1[SRC1_WIDTH * SRC1_HEIGHT]);
-       BYTE ALIGN(src2[SRC2_WIDTH * SRC2_HEIGHT]);
-       BYTE ALIGN(dst1[DST_WIDTH * DST_HEIGHT]);
-       char testStr[256];
+       BYTE ALIGN(src1[SRC1_WIDTH * SRC1_HEIGHT * 4]);
+       BYTE ALIGN(src2[SRC2_WIDTH * SRC2_HEIGHT * 4]);
+       BYTE ALIGN(dst1[DST_WIDTH * DST_HEIGHT * 4]);
        UINT32* ptr;
        UINT32 i;
-       testStr[0] = '\0';
+
        winpr_RAND((BYTE*)src1, sizeof(src1));
+
        /* Special-case the first two values */
        src1[0] &= 0x00FFFFFFU;
        src1[1] |= 0xFF000000U;
@@ -141,8 +146,8 @@ static BOOL test_alphaComp_func(void)
        memset(dst1, 0, sizeof(dst1));
 
        status = generic->alphaComp_argb(src1, 4 * SRC1_WIDTH,
-                              src2, 4 * SRC2_WIDTH,
-                              dst1, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT);
+                                        src2, 4 * SRC2_WIDTH,
+                                        dst1, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT);
        if (status != PRIMITIVES_SUCCESS)
                return FALSE;
 
@@ -152,8 +157,8 @@ static BOOL test_alphaComp_func(void)
                return FALSE;
 
        status = optimized->alphaComp_argb((const BYTE*) src1, 4 * SRC1_WIDTH,
-                              (const BYTE*) src2, 4 * SRC2_WIDTH,
-                              (BYTE*) dst1, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT);
+                                          (const BYTE*) src2, 4 * SRC2_WIDTH,
+                                          (BYTE*) dst1, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT);
        if (status != PRIMITIVES_SUCCESS)
                return FALSE;
 
@@ -188,7 +193,8 @@ static int test_alphaComp_speed(void)
        memset(dst1, 0, sizeof(dst1));
 
        if (!speed_test("add16s", "aligned", g_Iterations,
-                       generic->alphaComp_argb, optimized->alphaComp_argb,
+                       (speed_test_fkt)generic->alphaComp_argb,
+                       (speed_test_fkt)optimized->alphaComp_argb,
                        src1, 4 * SRC1_WIDTH,
                        src2, 4 * SRC2_WIDTH,
                        dst1, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT))
@@ -203,8 +209,11 @@ int TestPrimitivesAlphaComp(int argc, char* argv[])
        if (!test_alphaComp_func())
                return -1;
 
-       if (!test_alphaComp_speed())
-               return -1;
+       if (g_TestPrimitivesPerformance)
+       {
+               if (!test_alphaComp_speed())
+                       return -1;
+       }
 
        return 0;
 }
index bf16d26..5e51c57 100644 (file)
 #include "prim_test.h"
 
 #define FUNC_TEST_SIZE 65536
-static const int ANDOR_PRETEST_ITERATIONS = 100000;
-static const int TEST_TIME = 2.0;  // seconds
 
 #define VALUE (0xA5A5A5A5U)
 
 /* ========================================================================= */
-static BOOL test_and_32u_func(void)
+static BOOL test_and_32u_impl(const char* name, __andC_32u_t fkt, /* runs fkt once, then checks dst[i] == (src[i] & val) for all i < size */
+                             const UINT32* src, const UINT32 val,
+                             UINT32* dst, size_t size)
 {
-       UINT32 ALIGN(src[FUNC_TEST_SIZE + 3]), ALIGN(dst[FUNC_TEST_SIZE + 3]);
-       int failed = 0;
-       int i;
-       char testStr[256];
-       testStr[0] = '\0';
-       winpr_RAND(src, sizeof(src));
-       generic->andC_32u(src + 1, VALUE, dst + 1, FUNC_TEST_SIZE);
-       strcat(testStr, " general");
+       size_t i;
+       pstatus_t status = fkt(src, val, dst, size); /* name is only used to label the failure message below */
+       if (status != PRIMITIVES_SUCCESS)
+               return FALSE;
 
-       for (i = 1; i <= FUNC_TEST_SIZE; ++i)
+       for (i = 0; i < size; ++i)
        {
-               if (dst[i] != (src[i] & VALUE))
+               if (dst[i] != (src[i] & val))
                {
-                       printf("AND-general FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
-                              i, src[i], VALUE, src[i] & VALUE, dst[i]);
-                       ++failed;
+                       printf("AND %s FAIL[%lu] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
+                              name, (unsigned long) i, src[i], val, src[i] & val, dst[i]); /* i is size_t: cast so the argument matches %lu */
+                       return FALSE; /* stop at the first mismatch */
                }
        }
 
-#ifdef WITH_SSE2
-
-       if (IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
-       {
-               strcat(testStr, " SSE3");
-               /* Aligned */
-               memset(dst, 0, sizeof(dst));
-               sse3_andC_32u(src + 1, VALUE, dst + 1, FUNC_TEST_SIZE);
-
-               for (i = 1; i <= FUNC_TEST_SIZE; ++i)
-               {
-                       if (dst[i] != (src[i] & VALUE))
-                       {
-                               printf("AND-SSE-aligned FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
-                                      i, src[i], VALUE, src[i] & VALUE, dst[i]);
-                               ++failed;
-                       }
-               }
-
-               /* Unaligned */
-               memset(dst, 0, sizeof(dst));
-               sse3_andC_32u(src + 1, VALUE, dst + 2, FUNC_TEST_SIZE);
+       return TRUE;
+}
 
-               for (i = 1; i <= FUNC_TEST_SIZE; ++i)
-               {
-                       if (dst[i + 1] != (src[i] & VALUE))
-                       {
-                               printf("AND-SSE-unaligned FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
-                                      i, src[i], VALUE, src[i] & VALUE, dst[i + 1]);
-                               ++failed;
-                       }
-               }
-       }
+static BOOL test_and_32u_func(void) /* checks generic and optimized andC_32u against src & VALUE */
+{
+       UINT32 ALIGN(src[FUNC_TEST_SIZE + 3]), ALIGN(dst[FUNC_TEST_SIZE + 3]); /* 3 spare elements leave room for the +1/+2 offsets used below */
 
-#endif /* i386 */
+       winpr_RAND((BYTE*)src, sizeof(src)); /* fill src; winpr_RAND is presumably a random-byte generator (defined elsewhere) */
 
-       if (!failed) printf("All and_32u tests passed (%s).\n", testStr);
+       if (!test_and_32u_impl("generic->andC_32u aligned", generic->andC_32u,
+                              src + 1, VALUE, dst + 1, FUNC_TEST_SIZE)) /* "aligned": src and dst use the same element offset */
+               return FALSE;
+       if (!test_and_32u_impl("generic->andC_32u unaligned", generic->andC_32u,
+                              src + 1, VALUE, dst + 2, FUNC_TEST_SIZE)) /* "unaligned": dst is one element further than src */
+               return FALSE;
+       if (!test_and_32u_impl("optimized->andC_32u aligned", optimized->andC_32u,
+                              src + 1, VALUE, dst + 1, FUNC_TEST_SIZE))
+               return FALSE;
+       if (!test_and_32u_impl("optimized->andC_32u unaligned", optimized->andC_32u,
+                              src + 1, VALUE, dst + 2, FUNC_TEST_SIZE))
+               return FALSE;
 
-       return (failed > 0) ? FAILURE : SUCCESS;
+       return TRUE; /* all four variants matched */
 }
 
 /* ------------------------------------------------------------------------- */
 static BOOL test_and_32u_speed(void)
 {
        UINT32 ALIGN(src[MAX_TEST_SIZE + 3]), ALIGN(dst[MAX_TEST_SIZE + 3]);
-       winpr_RAND(src, sizeof(src));
-       andC_32u_speed_test("and32u", "aligned", src, NULL, VALUE, dst,
-                           test_sizes, NUM_TEST_SIZES, ANDOR_PRETEST_ITERATIONS, TEST_TIME);
-       andC_32u_speed_test("and32u", "unaligned", src + 1, NULL, VALUE, dst,
-                           test_sizes, NUM_TEST_SIZES, ANDOR_PRETEST_ITERATIONS, TEST_TIME);
-       return SUCCESS;
+
+       winpr_RAND((BYTE*)src, sizeof(src)); /* fill src; winpr_RAND is presumably a random-byte generator (defined elsewhere) */
+
+       if (!speed_test("andC_32u", "aligned", g_Iterations, /* speed_test is defined elsewhere; presumably it runs both implementations with the trailing arguments */
+                       (speed_test_fkt)generic->andC_32u,
+                       (speed_test_fkt)optimized->andC_32u,
+                       src + 1, VALUE, dst + 1, MAX_TEST_SIZE)) /* trailing arguments follow the andC_32u parameter order: src, val, dst, len */
+               return FALSE;
+       if (!speed_test("andC_32u", "unaligned", g_Iterations,
+                       (speed_test_fkt)generic->andC_32u,
+                       (speed_test_fkt)optimized->andC_32u,
+                       src + 1, VALUE, dst + 2, MAX_TEST_SIZE)) /* dst is one element further than src in this variant */
+               return FALSE;
+
+       return TRUE;
 }
 
 /* ========================================================================= */
 static BOOL check(const UINT32* src, const UINT32* dst, UINT32 size, UINT32 value)
 {
        UINT32 i;
-       UINT32 failed = 0;
 
-       for (i = 1; i <= size; ++i)
+       for (i = 0; i < size; ++i)
        {
                if (dst[i] != (src[i] | value))
                {
                        printf("OR-general general FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n",
                               i, src[i], value, src[i] | value, dst[i]);
-                       ++failed;
+                       return FALSE;
                }
        }
 
@@ -123,8 +111,7 @@ static BOOL test_or_32u_func(void)
 {
        pstatus_t status;
        UINT32 ALIGN(src[FUNC_TEST_SIZE + 3]), ALIGN(dst[FUNC_TEST_SIZE + 3]);
-       char testStr[256];
-       testStr[0] = '\0';
+
        winpr_RAND((BYTE*)src, sizeof(src));
 
        status = generic->orC_32u(src + 1, VALUE, dst + 1, FUNC_TEST_SIZE);
@@ -153,7 +140,8 @@ static BOOL test_or_32u_speed(void)
        winpr_RAND((BYTE*)src, sizeof(src));
 
        if (!speed_test("add16s", "aligned", g_Iterations,
-                       generic->orC_32u, optimized->orC_32u,
+                       (speed_test_fkt)generic->orC_32u,
+                       (speed_test_fkt)optimized->orC_32u,
                        src + 1, VALUE, dst + 1, FUNC_TEST_SIZE))
                return FALSE;
 
@@ -167,14 +155,16 @@ int TestPrimitivesAndOr(int argc, char* argv[])
        if (!test_and_32u_func())
                return -1;
 
-       if (!test_and_32u_speed())
-               return -1;
-
        if (!test_or_32u_func())
                return -1;
 
-       if (!test_or_32u_speed())
-               return -1;
+       if (g_TestPrimitivesPerformance)
+       {
+               if (!test_and_32u_speed())
+                       return -1;
+               if (!test_or_32u_speed())
+                       return -1;
+       }
 
        return 0;
 }
index e6c5209..18c0dec 100644 (file)
@@ -24,19 +24,16 @@ static const int YCBCR_TRIAL_ITERATIONS = 1000;
 static const float TEST_TIME = 4.0;
 
 /* ------------------------------------------------------------------------- */
-int test_RGBToRGB_16s8u_P3AC4R_func(void)
+static BOOL test_RGBToRGB_16s8u_P3AC4R_func(void)
 {
        INT16 ALIGN(r[4096]), ALIGN(g[4096]), ALIGN(b[4096]);
        UINT32 ALIGN(out1[4096]);
-#ifdef WITH_SSE2
        UINT32 ALIGN(out2[4096]);
-#endif
        int i;
-       int failed = 0;
-       char testStr[256];
+       BOOL failed = FALSE;
        INT16* ptrs[3];
        prim_size_t roi = { 64, 64 };
-       testStr[0] = '\0';
+
        winpr_RAND((BYTE*)r, sizeof(r));
        winpr_RAND((BYTE*)g, sizeof(g));
        winpr_RAND((BYTE*)b, sizeof(b));
@@ -52,56 +49,38 @@ int test_RGBToRGB_16s8u_P3AC4R_func(void)
        ptrs[0] = r;
        ptrs[1] = g;
        ptrs[2] = b;
-       generic->RGBToRGB_16s8u_P3AC4R((const INT16**) ptrs, 64 * 2,
-                                     (BYTE*) out1, 64 * 4, &roi);
-#ifdef WITH_SSE2
+       if (generic->RGBToRGB_16s8u_P3AC4R((const INT16**) ptrs, 64 * 2,
+                                     (BYTE*) out1, 64 * 4, PIXEL_FORMAT_RGBA32,
+                                      &roi) != PRIMITIVES_SUCCESS)
+               return FALSE;
 
-       if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
-       {
-               strcat(testStr, " SSE2");
-               sse2_RGBToRGB_16s8u_P3AC4R((const INT16**) ptrs, 64 * 2,
-                                          (BYTE*) out2, 64 * 4, &roi);
+       if (optimized->RGBToRGB_16s8u_P3AC4R((const INT16**) ptrs, 64 * 2,
+                                        (BYTE*) out2, 64 * 4, PIXEL_FORMAT_RGBA32,
+                                         &roi) != PRIMITIVES_SUCCESS)
+               return FALSE;
 
-               for (i = 0; i < 4096; ++i)
+       for (i = 0; i < 4096; ++i)
+       {
+               if (out1[i] != out2[i])
                {
-                       if (out1[i] != out2[i])
-                       {
-                               printf("RGBToRGB-SSE FAIL: out1[%d]=0x%08x out2[%d]=0x%08x\n",
-                                      i, out1[i], i, out2[i]);
-                               failed = 1;
-                       }
+                       printf("RGBToRGB-SSE FAIL: out1[%d]=0x%08x out2[%d]=0x%08x\n",
+                              i, out1[i], i, out2[i]);
+                       failed = TRUE;
                }
        }
 
-#endif /* i386 */
-
-       if (!failed) printf("All RGBToRGB_16s8u_P3AC4R tests passed (%s).\n", testStr);
-
-       return (failed > 0) ? FAILURE : SUCCESS;
+       return !failed;
 }
 
 /* ------------------------------------------------------------------------- */
-static const prim_size_t roi64x64 = { 64, 64 };
-STD_SPEED_TEST(
-    rgb_to_argb_speed, INT16*, UINT32, dst = dst,
-    TRUE, generic->RGBToRGB_16s8u_P3AC4R(
-       (const INT16**) src1, 64 * 2, (BYTE*) dst, 64 * 4, &roi64x64),
-#ifdef WITH_SSE2
-    TRUE, sse2_RGBToRGB_16s8u_P3AC4R(
-       (const INT16**) src1, 64 * 2, (BYTE*) dst, 64 * 4, &roi64x64),
-    PF_SSE2_INSTRUCTIONS_AVAILABLE, FALSE,
-#else
-    FALSE, PRIM_NOP, 0, FALSE,
-#endif
-    FALSE, dst = dst);
-
-int test_RGBToRGB_16s8u_P3AC4R_speed(void)
+static BOOL test_RGBToRGB_16s8u_P3AC4R_speed(void) /* passes generic and optimized RGBToRGB_16s8u_P3AC4R to speed_test on a 64x64 region */
 {
-       INT16 ALIGN(r[4096]), ALIGN(g[4096]), ALIGN(b[4096]);
-       UINT32 ALIGN(dst[4096]);
+       const prim_size_t roi64x64 = { 64, 64 };
+       INT16 ALIGN(r[4096+1]), ALIGN(g[4096+1]), ALIGN(b[4096+1]); /* one spare element for the +1 plane offsets below */
+       UINT32 ALIGN(dst[4096+1]); /* one spare element for the +1 byte destination offset below */
        int i;
        INT16* ptrs[3];
-       int size_array[] = { 64 };
+
        winpr_RAND((BYTE*)r, sizeof(r));
        winpr_RAND((BYTE*)g, sizeof(g));
        winpr_RAND((BYTE*)b, sizeof(b));
@@ -114,29 +93,38 @@ int test_RGBToRGB_16s8u_P3AC4R_speed(void)
                b[i] &= 0x00FFU;
        }
 
-       ptrs[0] = r;
-       ptrs[1] = g;
-       ptrs[2] = b;
-       rgb_to_argb_speed("RGBToARGB", "aligned",
-                         (const INT16**) ptrs, NULL, 0, dst,
-                         size_array, 1, RGB_TRIAL_ITERATIONS, TEST_TIME);
-       return SUCCESS;
+       ptrs[0] = r+1;
+       ptrs[1] = g+1;
+       ptrs[2] = b+1;
+
+       if (!speed_test("RGBToRGB_16s8u_P3AC4R", "aligned", g_Iterations, /* trailing arguments must mirror the primitive's full parameter list */
+                       (speed_test_fkt)generic->RGBToRGB_16s8u_P3AC4R,
+                       (speed_test_fkt)optimized->RGBToRGB_16s8u_P3AC4R,
+                       (const INT16**) ptrs, 64 * 2, (BYTE*) dst, 64 * 4, PIXEL_FORMAT_RGBA32, &roi64x64)) /* format goes between dstStep and roi, as in the functional test */
+               return FALSE;
+
+       if (!speed_test("RGBToRGB_16s8u_P3AC4R", "unaligned", g_Iterations,
+                       (speed_test_fkt)generic->RGBToRGB_16s8u_P3AC4R,
+                       (speed_test_fkt)optimized->RGBToRGB_16s8u_P3AC4R,
+                       (const INT16**) ptrs, 64 * 2, ((BYTE*) dst)+1, 64 * 4, PIXEL_FORMAT_RGBA32, &roi64x64)) /* same call with dst shifted by one byte */
+               return FALSE;
+
+       return TRUE;
 }
 
 /* ========================================================================= */
-int test_yCbCrToRGB_16s16s_P3P3_func(void)
+static BOOL test_yCbCrToRGB_16s16s_P3P3_func(void)
 {
+       pstatus_t status;
        INT16 ALIGN(y[4096]), ALIGN(cb[4096]), ALIGN(cr[4096]);
        INT16 ALIGN(r1[4096]), ALIGN(g1[4096]), ALIGN(b1[4096]);
        INT16 ALIGN(r2[4096]), ALIGN(g2[4096]), ALIGN(b2[4096]);
        int i;
-       int failed = 0;
-       char testStr[256];
        const INT16* in[3];
        INT16* out1[3];
        INT16* out2[3];
        prim_size_t roi = { 64, 64 };
-       testStr[0] = '\0';
+
        winpr_RAND((BYTE*)y, sizeof(y));
        winpr_RAND((BYTE*)cb, sizeof(cb));
        winpr_RAND((BYTE*)cr, sizeof(cr));
@@ -164,57 +152,40 @@ int test_yCbCrToRGB_16s16s_P3P3_func(void)
        out2[0] = r2;
        out2[1] = g2;
        out2[2] = b2;
-       generic->yCbCrToRGB_16s16s_P3P3(in, 64 * 2, out1, 64 * 2, &roi);
-#ifdef WITH_SSE2
 
-       if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
-       {
-               strcat(testStr, " SSE2");
-               sse2_yCbCrToRGB_16s16s_P3P3(in, 64 * 2, out2, 64 * 2, &roi);
+       status = generic->yCbCrToRGB_16s16s_P3P3(in, 64 * 2, out1, 64 * 2, &roi);
+       if (status != PRIMITIVES_SUCCESS)
+               return FALSE;
 
-               for (i = 0; i < 4096; ++i)
+       status = optimized->yCbCrToRGB_16s16s_P3P3(in, 64 * 2, out2, 64 * 2, &roi);
+       if (status != PRIMITIVES_SUCCESS)
+               return FALSE;
+
+       for (i = 0; i < 4096; ++i)
+       {
+               if ((ABS(r1[i] - r2[i]) > 1)
+                   || (ABS(g1[i] - g2[i]) > 1)
+                   || (ABS(b1[i] - b2[i]) > 1))
                {
-                       if ((ABS(r1[i] - r2[i]) > 1)
-                           || (ABS(g1[i] - g2[i]) > 1)
-                           || (ABS(b1[i] - b2[i]) > 1))
-                       {
-                               printf("YCbCrToRGB-SSE FAIL[%d]: %d,%d,%d vs %d,%d,%d\n", i,
-                                      r1[i], g1[i], b1[i], r2[i], g2[i], b2[i]);
-                               failed = 1;
-                       }
+                       printf("YCbCrToRGB-SSE FAIL[%d]: %d,%d,%d vs %d,%d,%d\n", i,
+                              r1[i], g1[i], b1[i], r2[i], g2[i], b2[i]);
+                       return FALSE;
                }
        }
 
-#endif /* i386 */
-
-       if (!failed) printf("All yCbCrToRGB_16s16s_P3P3 tests passed (%s).\n", testStr);
-
-       return (failed > 0) ? FAILURE : SUCCESS;
+       return TRUE;
 }
 
 /* ------------------------------------------------------------------------- */
-STD_SPEED_TEST(
-    ycbcr_to_rgb_speed, INT16*, INT16*, dst = dst,
-    TRUE, generic->yCbCrToRGB_16s16s_P3P3(src1, 64 * 2, dst, 64 * 2, &roi64x64),
-#ifdef WITH_SSE2
-    TRUE, sse2_yCbCrToRGB_16s16s_P3P3(src1, 64 * 2, dst, 64 * 2, &roi64x64),
-    PF_SSE2_INSTRUCTIONS_AVAILABLE, FALSE,
-#elif defined(WITH_NEON)
-    TRUE, neon_yCbCrToRGB_16s16s_P3P3(src1, 64 * 2, dst, 64 * 2, &roi64x64),
-    PF_ARM_NEON_INSTRUCTIONS_AVAILABLE, FALSE,
-#else
-    FALSE, PRIM_NOP, 0, FALSE,
-#endif
-    FALSE, dst = dst);
-
 static int test_yCbCrToRGB_16s16s_P3P3_speed(void)
 {
+       prim_size_t roi = { 64, 64 };
        INT16 ALIGN(y[4096]), ALIGN(cb[4096]), ALIGN(cr[4096]);
        INT16 ALIGN(r[4096]), ALIGN(g[4096]), ALIGN(b[4096]);
        int i;
        const INT16* input[3];
        INT16* output[3];
-       int size_array[] = { 64 };
+
        winpr_RAND((BYTE*)y, sizeof(y));
        winpr_RAND((BYTE*)cb, sizeof(cb));
        winpr_RAND((BYTE*)cr, sizeof(cr));
@@ -233,37 +204,35 @@ static int test_yCbCrToRGB_16s16s_P3P3_speed(void)
        output[0] = r;
        output[1] = g;
        output[2] = b;
-       ycbcr_to_rgb_speed("yCbCrToRGB", "aligned", input, NULL, NULL, output,
-                          size_array, 1, YCBCR_TRIAL_ITERATIONS, TEST_TIME);
-       return SUCCESS;
+
+       if (!speed_test("yCbCrToRGB_16s16s_P3P3", "aligned", g_Iterations,
+                       (speed_test_fkt)generic->yCbCrToRGB_16s16s_P3P3,
+                       (speed_test_fkt)optimized->yCbCrToRGB_16s16s_P3P3,
+                       input, 64 * 2, output, 64 * 2, &roi))
+               return FALSE;
+
+       return TRUE;
 }
 
 int TestPrimitivesColors(int argc, char* argv[])
 {
-       int status;
-       status = test_RGBToRGB_16s8u_P3AC4R_func();
+       prim_test_setup(FALSE);
 
-       if (status != SUCCESS)
+       if (!test_RGBToRGB_16s8u_P3AC4R_func())
                return 1;
 
        if (g_TestPrimitivesPerformance)
        {
-               status = test_RGBToRGB_16s8u_P3AC4R_speed();
-
-               if (status != SUCCESS)
+               if (!test_RGBToRGB_16s8u_P3AC4R_speed())
                        return 1;
        }
 
-       status = test_yCbCrToRGB_16s16s_P3P3_func();
-
-       if (status != SUCCESS)
+       if (!test_yCbCrToRGB_16s16s_P3P3_func())
                return 1;
 
        if (g_TestPrimitivesPerformance)
        {
-               status = test_yCbCrToRGB_16s16s_P3P3_speed();
-
-               if (status != SUCCESS)
+               if (!test_yCbCrToRGB_16s16s_P3P3_speed())
                        return 1;
        }
 
index 9220efe..917c702 100644 (file)
 #include <winpr/sysinfo.h>
 #include "prim_test.h"
 
-static const int MEMCPY_PRETEST_ITERATIONS = 1000000;
-static const int TEST_TIME = 1.0;  // seconds
 #define COPY_TESTSIZE (256*2+16*2+15+15)
 
 /* ------------------------------------------------------------------------- */
-static int test_copy8u_func(void)
+static BOOL test_copy8u_func(void)
 {
        primitives_t* prims = primitives_get();
        BYTE ALIGN(data[COPY_TESTSIZE + 15]);
        int i, soff;
-       int failed = 0;
-       char testStr[256];
        BYTE ALIGN(dest[COPY_TESTSIZE + 15]);
-       testStr[0] = '\0';
+
        winpr_RAND(data, sizeof(data));
-       strcat(testStr, " ptr");
 
        for (soff = 0; soff < 16; ++soff)
        {
@@ -47,7 +42,8 @@ static int test_copy8u_func(void)
                        for (length = 1; length <= COPY_TESTSIZE - doff; ++length)
                        {
                                memset(dest, 0, sizeof(dest));
-                               prims->copy_8u(data + soff, dest + doff, length);
+                               if (prims->copy_8u(data + soff, dest + doff, length) != PRIMITIVES_SUCCESS)
+                                       return FALSE;
 
                                for (i = 0; i < length; ++i)
                                {
@@ -57,48 +53,47 @@ static int test_copy8u_func(void)
                                                       "data[%d]=0x%02x\n",
                                                       doff, length, i + doff, dest[i + doff],
                                                       i + soff, data[i + soff]);
-                                               failed = 1;
+                                               return FALSE;
                                        }
                                }
                        }
                }
        }
 
-       if (!failed) printf("All copy8 tests passed (%s).\n", testStr);
-
-       return (failed > 0) ? FAILURE : SUCCESS;
+       return TRUE;
 }
 
 /* ------------------------------------------------------------------------- */
-STD_SPEED_TEST(copy8u_speed_test, BYTE, BYTE, dst = dst,
-              TRUE, memcpy(dst, src1, size),
-              FALSE, PRIM_NOP, 0, FALSE,
-              TRUE, ippsCopy_8u(src1, dst, size));
-
-int test_copy8u_speed(void)
+static BOOL test_copy8u_speed(void)
 {
        BYTE ALIGN(src[MAX_TEST_SIZE + 4]);
        BYTE ALIGN(dst[MAX_TEST_SIZE + 4]);
-       copy8u_speed_test("copy8u", "aligned", src, NULL, 0, dst,
-                         test_sizes, NUM_TEST_SIZES, MEMCPY_PRETEST_ITERATIONS, TEST_TIME);
-       copy8u_speed_test("copy8u", "unaligned", src + 1, NULL, 0, dst,
-                         test_sizes, NUM_TEST_SIZES, MEMCPY_PRETEST_ITERATIONS, TEST_TIME);
-       return SUCCESS;
+
+       if (!speed_test("copy_8u", "aligned", g_Iterations,
+                       (speed_test_fkt)generic->copy_8u,
+                       (speed_test_fkt)optimized->copy_8u,
+                       src, dst, MAX_TEST_SIZE))
+               return FALSE;
+
+       if (!speed_test("copy_8u", "unaligned", g_Iterations,
+                       (speed_test_fkt)generic->copy_8u,
+                       (speed_test_fkt)optimized->copy_8u,
+                       src+1, dst+1, MAX_TEST_SIZE))
+               return FALSE;
+
+       return TRUE;
 }
 
 int TestPrimitivesCopy(int argc, char* argv[])
 {
-       int status;
-       status = test_copy8u_func();
+       prim_test_setup(FALSE);
 
-       if (status != SUCCESS)
+       if (!test_copy8u_func())
                return 1;
 
        if (g_TestPrimitivesPerformance)
        {
-               status = test_copy8u_speed();
-
-               if (status != SUCCESS)
+               if (!test_copy8u_speed())
                        return 1;
        }
 
index 208e11e..219539b 100644 (file)
@@ -53,14 +53,14 @@ static BOOL test_set8u_func(void)
        {
                UINT32 len;
 
-               memset(dest, 0, sizeof(dest));
+               memset(dest, 3, sizeof(dest));
                for (len = 1; len < 48 - off; ++len)
                {
                        status = generic->set_8u(0xa5, dest + off, len);
                        if (status != PRIMITIVES_SUCCESS)
                                return FALSE;
 
-                       if (!check8(dest, len, off, 0xa8))
+                       if (!check8(dest, len, off, 0xa5))
                                return FALSE;
                }
        }
@@ -69,14 +69,14 @@ static BOOL test_set8u_func(void)
        {
                UINT32 len;
 
-               memset(dest, 0, sizeof(dest));
+               memset(dest, 3, sizeof(dest));
                for (len = 1; len < 48 - off; ++len)
                {
                        status = optimized->set_8u(0xa5, dest + off, len);
                        if (status != PRIMITIVES_SUCCESS)
                                return FALSE;
 
-                       if (!check8(dest, len, off, 0xa8))
+                       if (!check8(dest, len, off, 0xa5))
                                return FALSE;
                }
        }
@@ -95,8 +95,9 @@ static BOOL test_set8u_speed(void)
        {
                winpr_RAND(&value, sizeof(value));
                if (!speed_test("set_8u", "", g_Iterations,
-                               generic->set_8u, optimized->set_8u,
-                               value, dest + x, len))
+                               (speed_test_fkt)generic->set_8u,
+                               (speed_test_fkt)optimized->set_8u,
+                               value, dest + x, x))
                        return FALSE;
        }
 
@@ -232,8 +233,9 @@ static BOOL test_set32u_speed(void)
        {
                winpr_RAND(&value, sizeof(value));
                if (!speed_test("set_32u", "", g_Iterations,
-                               generic->set_32u, optimized->set_32u,
-                               value, dest + x, len))
+                               (speed_test_fkt)generic->set_32u,
+                               (speed_test_fkt)optimized->set_32u,
+                               value, dest + x, x))
                        return FALSE;
        }
 
@@ -251,8 +253,9 @@ static BOOL test_set32s_speed(void)
        {
                winpr_RAND(&value, sizeof(value));
                if (!speed_test("set_32s", "", g_Iterations,
-                               generic->set_32s, optimized->set_32s,
-                               value, dest + x, len))
+                               (speed_test_fkt)generic->set_32s,
+                               (speed_test_fkt)optimized->set_32s,
+                               value, dest + x, x))
                        return FALSE;
        }
 
@@ -265,21 +268,20 @@ int TestPrimitivesSet(int argc, char* argv[])
 
        if (!test_set8u_func())
                return -1;
-
-       if (!test_set8u_speed())
-               return -1;
-
        if (!test_set32s_func())
                return -1;
-
-       if (!test_set32s_speed())
-               return -1;
-
        if (!test_set32u_func())
                return -1;
 
-       if (!test_set32u_speed())
-               return -1;
+       if (g_TestPrimitivesPerformance)
+       {
+               if (!test_set8u_speed())
+                       return -1;
+               if (!test_set32s_speed())
+                       return -1;
+               if (!test_set32u_speed())
+                       return -1;
+       }
 
        return 0;
 }
index 4316cad..e2f9b67 100644 (file)
 #include "prim_test.h"
 
 #define FUNC_TEST_SIZE 65536
-static const int SHIFT_PRETEST_ITERATIONS = 50000;
-static const float TEST_TIME = 1.0;
-
-#ifdef WITH_SSE2
-#define SHIFT_TEST_FUNC(_name_, _type_, _str_, _f1_, _f2_) \
-       int _name_(void) \
-       { \
-               _type_ ALIGN(src[FUNC_TEST_SIZE+3]), \
-               ALIGN(d1[FUNC_TEST_SIZE+3]), ALIGN(d2[FUNC_TEST_SIZE+3]); \
-               int failed = 0; \
-               int i; \
-               char testStr[256]; \
-               testStr[0] = '\0'; \
-               get_random_data(src, sizeof(src)); \
-               _f1_(src+1, 3, d1+1, FUNC_TEST_SIZE); \
-               if (IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) \
-               { \
-                       strcat(testStr, " SSE3"); \
-                       /* Aligned */ \
-                       _f2_(src+1, 3, d2+1, FUNC_TEST_SIZE); \
-                       for (i=1; i<=FUNC_TEST_SIZE; ++i) \
-                       { \
-                               if (d1[i] != d2[i]) \
-                               {  \
-                                       printf("%s-SSE-aligned FAIL[%d]: 0x%x>>3=0x%x, got 0x%x\n", \
-                                              _str_, i, src[i], d1[i], d2[i]);  \
-                                       ++failed; \
-                               } \
-                       } \
-                       /* Unaligned */ \
-                       _f2_(src+1, 3, d2+2, FUNC_TEST_SIZE); \
-                       for (i=1; i<=FUNC_TEST_SIZE; ++i) \
-                       { \
-                               if (d1[i] != d2[i+1]) \
-                               {  \
-                                       printf("%s-SSE-unaligned FAIL[%d]: 0x%x>>3=0x%x, got 0x%x\n", \
-                                              _str_, i, src[i], d1[i], d2[i+1]);  \
-                                       ++failed; \
-                               } \
-                       } \
-               } \
-               if (!failed) printf("All %s tests passed (%s).\n", _str_, testStr); \
-               return (failed > 0) ? FAILURE : SUCCESS; \
-       }
-#else
-#define SHIFT_TEST_FUNC(_name_, _type_, _str_, _f1_, _f2_) \
-       int _name_(void) \
-       { \
-               return SUCCESS; \
-       }
-#endif /* i386 */
-
-SHIFT_TEST_FUNC(test_lShift_16s_func, INT16, "lshift_16s", general_lShiftC_16s,
-               sse2_lShiftC_16s)
-SHIFT_TEST_FUNC(test_lShift_16u_func, UINT16, "lshift_16u", general_lShiftC_16u,
-               sse2_lShiftC_16u)
-SHIFT_TEST_FUNC(test_rShift_16s_func, INT16, "rshift_16s", general_rShiftC_16s,
-               sse2_rShiftC_16s)
-SHIFT_TEST_FUNC(test_rShift_16u_func, UINT16, "rshift_16u", general_rShiftC_16u,
-               sse2_rShiftC_16u)
-
-/* ========================================================================= */
-STD_SPEED_TEST(speed_lShift_16s, INT16, INT16, dst = dst,
-              TRUE, general_lShiftC_16s(src1, constant, dst, size),
-#ifdef WITH_SSE2
-              TRUE, sse2_lShiftC_16s(src1, constant, dst, size),
-              PF_SSE2_INSTRUCTIONS_AVAILABLE, FALSE,
-#else
-              FALSE, PRIM_NOP, 0, FALSE,
-#endif
-              TRUE, ippsLShiftC_16s(src1, constant, dst, size));
-STD_SPEED_TEST(speed_lShift_16u, UINT16, UINT16, dst = dst,
-              TRUE, general_lShiftC_16u(src1, constant, dst, size),
-#ifdef WITH_SSE2
-              TRUE, sse2_lShiftC_16u(src1, constant, dst, size),
-              PF_SSE2_INSTRUCTIONS_AVAILABLE, FALSE,
-#else
-              FALSE, PRIM_NOP, 0, FALSE,
-#endif
-              TRUE, ippsLShiftC_16u(src1, constant, dst, size));
-STD_SPEED_TEST(speed_rShift_16s, INT16, INT16, dst = dst,
-              TRUE, general_rShiftC_16s(src1, constant, dst, size),
-#ifdef WITH_SSE2
-              TRUE, sse2_rShiftC_16s(src1, constant, dst, size),
-              PF_SSE2_INSTRUCTIONS_AVAILABLE, FALSE,
-#else
-              FALSE, PRIM_NOP, 0, FALSE,
-#endif
-              TRUE, ippsRShiftC_16s(src1, constant, dst, size));
-STD_SPEED_TEST(speed_rShift_16u, UINT16, UINT16, dst = dst,
-              TRUE, general_rShiftC_16u(src1, constant, dst, size),
-#ifdef WITH_SSE2
-              TRUE, sse2_rShiftC_16u(src1, constant, dst, size),
-              PF_SSE2_INSTRUCTIONS_AVAILABLE, FALSE,
-#else
-              FALSE, PRIM_NOP, 0, FALSE,
-#endif
-              TRUE, ippsRShiftC_16u(src1, constant, dst, size));
+
+/*
+ * Functional test for lShiftC_16s.
+ *
+ * Runs the generic and the optimized implementation on the same random
+ * input with the same random shift count, once for each destination
+ * offset, and requires both to return PRIMITIVES_SUCCESS and to produce
+ * identical output.
+ *
+ * Returns TRUE on success, FALSE on the first failure.
+ */
+static BOOL test_lShift_16s_func(void)
+{
+       pstatus_t status;
+       INT16 ALIGN(src[FUNC_TEST_SIZE+3]);
+       INT16 ALIGN(d1[FUNC_TEST_SIZE+3]);
+       INT16 ALIGN(d2[FUNC_TEST_SIZE+3]);
+       UINT32 val;
+       UINT32 off;
+       UINT32 i;
+
+       winpr_RAND((BYTE*)&val, sizeof(val));
+       winpr_RAND((BYTE*)src, sizeof(src));
+
+       /* Keep the shift count within the 16 bit element width (1..15).
+        * A C shift by 32 or more is undefined behaviour. */
+       val = (val % 15) + 1;
+
+       /* Destination offset 1 is the "aligned" case, offset 2 the "unaligned" one. */
+       for (off = 1; off <= 2; ++off)
+       {
+               status = generic->lShiftC_16s(src+1, val, d1+off, FUNC_TEST_SIZE);
+               if (status != PRIMITIVES_SUCCESS)
+                       return FALSE;
+               status = optimized->lShiftC_16s(src+1, val, d2+off, FUNC_TEST_SIZE);
+               if (status != PRIMITIVES_SUCCESS)
+                       return FALSE;
+
+               /* Both implementations must agree on every written element. */
+               for (i = 0; i < FUNC_TEST_SIZE; ++i)
+               {
+                       if (d1[off + i] != d2[off + i])
+                               return FALSE;
+               }
+       }
+
+       return TRUE;
+}
+
+/*
+ * Functional test for lShiftC_16u.
+ *
+ * Runs the generic and the optimized implementation on the same random
+ * input with the same random shift count, once for each destination
+ * offset, and requires both to return PRIMITIVES_SUCCESS and to produce
+ * identical output.
+ *
+ * Returns TRUE on success, FALSE on the first failure.
+ */
+static BOOL test_lShift_16u_func(void)
+{
+       pstatus_t status;
+       UINT16 ALIGN(src[FUNC_TEST_SIZE+3]);
+       UINT16 ALIGN(d1[FUNC_TEST_SIZE+3]);
+       UINT16 ALIGN(d2[FUNC_TEST_SIZE+3]);
+       UINT32 val;
+       UINT32 off;
+       UINT32 i;
+
+       winpr_RAND((BYTE*)&val, sizeof(val));
+       winpr_RAND((BYTE*)src, sizeof(src));
+
+       /* Keep the shift count within the 16 bit element width (1..15).
+        * A C shift by 32 or more is undefined behaviour. */
+       val = (val % 15) + 1;
+
+       /* Destination offset 1 is the "aligned" case, offset 2 the "unaligned" one. */
+       for (off = 1; off <= 2; ++off)
+       {
+               status = generic->lShiftC_16u(src+1, val, d1+off, FUNC_TEST_SIZE);
+               if (status != PRIMITIVES_SUCCESS)
+                       return FALSE;
+               status = optimized->lShiftC_16u(src+1, val, d2+off, FUNC_TEST_SIZE);
+               if (status != PRIMITIVES_SUCCESS)
+                       return FALSE;
+
+               /* Both implementations must agree on every written element. */
+               for (i = 0; i < FUNC_TEST_SIZE; ++i)
+               {
+                       if (d1[off + i] != d2[off + i])
+                               return FALSE;
+               }
+       }
+
+       return TRUE;
+}
+
+/*
+ * Functional test for rShiftC_16s.
+ *
+ * Runs the generic and the optimized implementation on the same random
+ * input with the same random shift count, once for each destination
+ * offset, and requires both to return PRIMITIVES_SUCCESS and to produce
+ * identical output.
+ *
+ * Returns TRUE on success, FALSE on the first failure.
+ */
+static BOOL test_rShift_16s_func(void)
+{
+       pstatus_t status;
+       INT16 ALIGN(src[FUNC_TEST_SIZE+3]);
+       INT16 ALIGN(d1[FUNC_TEST_SIZE+3]);
+       INT16 ALIGN(d2[FUNC_TEST_SIZE+3]);
+       UINT32 val;
+       UINT32 off;
+       UINT32 i;
+
+       winpr_RAND((BYTE*)&val, sizeof(val));
+       winpr_RAND((BYTE*)src, sizeof(src));
+
+       /* Keep the shift count within the 16 bit element width (1..15).
+        * A C shift by 32 or more is undefined behaviour. */
+       val = (val % 15) + 1;
+
+       /* Destination offset 1 is the "aligned" case, offset 2 the "unaligned" one. */
+       for (off = 1; off <= 2; ++off)
+       {
+               status = generic->rShiftC_16s(src+1, val, d1+off, FUNC_TEST_SIZE);
+               if (status != PRIMITIVES_SUCCESS)
+                       return FALSE;
+               status = optimized->rShiftC_16s(src+1, val, d2+off, FUNC_TEST_SIZE);
+               if (status != PRIMITIVES_SUCCESS)
+                       return FALSE;
+
+               /* Both implementations must agree on every written element. */
+               for (i = 0; i < FUNC_TEST_SIZE; ++i)
+               {
+                       if (d1[off + i] != d2[off + i])
+                               return FALSE;
+               }
+       }
+
+       return TRUE;
+}
+
+/*
+ * Functional test for rShiftC_16u.
+ *
+ * Runs the generic and the optimized implementation on the same random
+ * input with the same random shift count, once for each destination
+ * offset, and requires both to return PRIMITIVES_SUCCESS and to produce
+ * identical output.
+ *
+ * Returns TRUE on success, FALSE on the first failure.
+ */
+static BOOL test_rShift_16u_func(void)
+{
+       pstatus_t status;
+       UINT16 ALIGN(src[FUNC_TEST_SIZE+3]);
+       UINT16 ALIGN(d1[FUNC_TEST_SIZE+3]);
+       UINT16 ALIGN(d2[FUNC_TEST_SIZE+3]);
+       UINT32 val;
+       UINT32 off;
+       UINT32 i;
+
+       winpr_RAND((BYTE*)&val, sizeof(val));
+       winpr_RAND((BYTE*)src, sizeof(src));
+
+       /* Keep the shift count within the 16 bit element width (1..15).
+        * A C shift by 32 or more is undefined behaviour. */
+       val = (val % 15) + 1;
+
+       /* Destination offset 1 is the "aligned" case, offset 2 the "unaligned" one. */
+       for (off = 1; off <= 2; ++off)
+       {
+               status = generic->rShiftC_16u(src+1, val, d1+off, FUNC_TEST_SIZE);
+               if (status != PRIMITIVES_SUCCESS)
+                       return FALSE;
+               status = optimized->rShiftC_16u(src+1, val, d2+off, FUNC_TEST_SIZE);
+               if (status != PRIMITIVES_SUCCESS)
+                       return FALSE;
+
+               /* Both implementations must agree on every written element. */
+               for (i = 0; i < FUNC_TEST_SIZE; ++i)
+               {
+                       if (d1[off + i] != d2[off + i])
+                               return FALSE;
+               }
+       }
+
+       return TRUE;
+}
+
+/*
+ * Functional test for the signed-count wrapper shiftC_16s.
+ *
+ * For each destination offset the wrapper is called with a random
+ * positive shift count and with the negated count. The generic and the
+ * optimized implementation must both return PRIMITIVES_SUCCESS and
+ * produce identical output.
+ *
+ * Returns TRUE on success, FALSE on the first failure.
+ */
+static BOOL test_ShiftWrapper_16s_func(void)
+{
+       pstatus_t status;
+       INT16 ALIGN(src[FUNC_TEST_SIZE+3]);
+       INT16 ALIGN(d1[FUNC_TEST_SIZE+3]);
+       INT16 ALIGN(d2[FUNC_TEST_SIZE+3]);
+       UINT32 tmp;
+       UINT32 off;
+       UINT32 pass;
+       UINT32 i;
+       INT32 val;
+
+       winpr_RAND((BYTE*)&tmp, sizeof(tmp));
+       winpr_RAND((BYTE*)src, sizeof(src));
+
+       /* Keep the magnitude within the 16 bit element width (1..15).
+        * A C shift by 32 or more is undefined behaviour. */
+       val = (INT32)(tmp % 15) + 1;
+
+       /* Destination offset 1 is the "aligned" case, offset 2 the "unaligned" one. */
+       for (off = 1; off <= 2; ++off)
+       {
+               /* First the positive count, then the negated one. */
+               for (pass = 0; pass < 2; ++pass)
+               {
+                       const INT32 shift = (pass == 0) ? val : -val;
+
+                       status = generic->shiftC_16s(src+1, shift, d1+off, FUNC_TEST_SIZE);
+                       if (status != PRIMITIVES_SUCCESS)
+                               return FALSE;
+                       status = optimized->shiftC_16s(src+1, shift, d2+off, FUNC_TEST_SIZE);
+                       if (status != PRIMITIVES_SUCCESS)
+                               return FALSE;
+
+                       /* Both implementations must agree on every written element. */
+                       for (i = 0; i < FUNC_TEST_SIZE; ++i)
+                       {
+                               if (d1[off + i] != d2[off + i])
+                                       return FALSE;
+                       }
+               }
+       }
+
+       return TRUE;
+}
+
+/*
+ * Functional test for the signed-count wrapper shiftC_16u.
+ *
+ * For each destination offset the wrapper is called with a random
+ * positive shift count and with the negated count. The generic and the
+ * optimized implementation must both return PRIMITIVES_SUCCESS and
+ * produce identical output.
+ *
+ * Returns TRUE on success, FALSE on the first failure.
+ */
+static BOOL test_ShiftWrapper_16u_func(void)
+{
+       pstatus_t status;
+       UINT16 ALIGN(src[FUNC_TEST_SIZE+3]);
+       UINT16 ALIGN(d1[FUNC_TEST_SIZE+3]);
+       UINT16 ALIGN(d2[FUNC_TEST_SIZE+3]);
+       UINT32 tmp;
+       UINT32 off;
+       UINT32 pass;
+       UINT32 i;
+       INT32 val;
+
+       winpr_RAND((BYTE*)&tmp, sizeof(tmp));
+       winpr_RAND((BYTE*)src, sizeof(src));
+
+       /* Keep the magnitude within the 16 bit element width (1..15).
+        * A C shift by 32 or more is undefined behaviour. */
+       val = (INT32)(tmp % 15) + 1;
+
+       /* Destination offset 1 is the "aligned" case, offset 2 the "unaligned" one. */
+       for (off = 1; off <= 2; ++off)
+       {
+               /* First the positive count, then the negated one. */
+               for (pass = 0; pass < 2; ++pass)
+               {
+                       const INT32 shift = (pass == 0) ? val : -val;
+
+                       status = generic->shiftC_16u(src+1, shift, d1+off, FUNC_TEST_SIZE);
+                       if (status != PRIMITIVES_SUCCESS)
+                               return FALSE;
+                       status = optimized->shiftC_16u(src+1, shift, d2+off, FUNC_TEST_SIZE);
+                       if (status != PRIMITIVES_SUCCESS)
+                               return FALSE;
+
+                       /* Both implementations must agree on every written element. */
+                       for (i = 0; i < FUNC_TEST_SIZE; ++i)
+                       {
+                               if (d1[off + i] != d2[off + i])
+                                       return FALSE;
+                       }
+               }
+       }
+
+       return TRUE;
+}
 
 /* ------------------------------------------------------------------------- */
-int test_lShift_16s_speed(void)
+/*
+ * Hands the generic and optimized lShiftC_16s implementations to
+ * speed_test(), once with src and once with src + 1 as input.
+ * Returns FALSE as soon as speed_test() reports failure, TRUE otherwise.
+ */
+static BOOL test_lShift_16s_speed(void)
 {
-       INT16 ALIGN(src[MAX_TEST_SIZE + 1]), ALIGN(dst[MAX_TEST_SIZE + 1]);
-       winpr_RAND(src, sizeof(src));
-       speed_lShift_16s("lShift_16s", "aligned", src, NULL, 3, dst,
-                        test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
-       speed_lShift_16s("lShift_16s", "unaligned", src + 1, NULL, 3, dst,
-                        test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
-       return SUCCESS;
+       UINT32 val;
+       INT16 ALIGN(src[MAX_TEST_SIZE+1]), ALIGN(dst[MAX_TEST_SIZE+1]);
+
+       winpr_RAND((BYTE*)src, sizeof(src));
+       winpr_RAND((BYTE*)&val, sizeof(val));
+
+       /* Keep the shift count within the 16 bit element width (1..15).
+        * A C shift by 32 or more is undefined behaviour. */
+       val = (val % 15) + 1;
+
+       if (!speed_test("lShift_16s", "aligned", g_Iterations,
+                       (speed_test_fkt)generic->lShiftC_16s,
+                       (speed_test_fkt)optimized->lShiftC_16s, src, val,
+                       dst, MAX_TEST_SIZE))
+               return FALSE;
+
+       if (!speed_test("lShift_16s", "unaligned", g_Iterations,
+                       (speed_test_fkt)generic->lShiftC_16s,
+                       (speed_test_fkt)optimized->lShiftC_16s, src + 1, val,
+                       dst, MAX_TEST_SIZE))
+               return FALSE;
+
+       return TRUE;
 }
 
 /* ------------------------------------------------------------------------- */
-int test_lShift_16u_speed(void)
+/*
+ * Hands the generic and optimized lShiftC_16u implementations to
+ * speed_test(), once with src and once with src + 1 as input.
+ * Returns FALSE as soon as speed_test() reports failure, TRUE otherwise.
+ */
+static BOOL test_lShift_16u_speed(void)
 {
+       UINT32 val;
        UINT16 ALIGN(src[MAX_TEST_SIZE + 1]), ALIGN(dst[MAX_TEST_SIZE + 1]);
-       winpr_RAND(src, sizeof(src));
-       speed_lShift_16u("lShift_16u", "aligned", src, NULL, 3, dst,
-                        test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
-       speed_lShift_16u("lShift_16u", "unaligned", src + 1, NULL, 3, dst,
-                        test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
-       return SUCCESS;
+
+       winpr_RAND((BYTE*)&val, sizeof(val));
+       winpr_RAND((BYTE*)src, sizeof(src));
+
+       /* Keep the shift count within the 16 bit element width (1..15).
+        * A C shift by 32 or more is undefined behaviour. */
+       val = (val % 15) + 1;
+
+       if (!speed_test("lShift_16u", "aligned", g_Iterations,
+                       (speed_test_fkt)generic->lShiftC_16u,
+                       (speed_test_fkt)optimized->lShiftC_16u, src, val,
+                       dst, MAX_TEST_SIZE))
+               return FALSE;
+
+       if (!speed_test("lShift_16u", "unaligned", g_Iterations,
+                       (speed_test_fkt)generic->lShiftC_16u,
+                       (speed_test_fkt)optimized->lShiftC_16u, src + 1, val,
+                       dst, MAX_TEST_SIZE))
+               return FALSE;
+
+       return TRUE;
 }
 
 /* ------------------------------------------------------------------------- */
-int test_rShift_16s_speed(void)
+/*
+ * Hands the generic and optimized rShiftC_16s implementations to
+ * speed_test(), once with src and once with src + 1 as input.
+ * Returns FALSE as soon as speed_test() reports failure, TRUE otherwise.
+ */
+static BOOL test_rShift_16s_speed(void)
 {
-       INT16 ALIGN(src[MAX_TEST_SIZE + 1]), ALIGN(dst[MAX_TEST_SIZE + 1]);
-       winpr_RAND(src, sizeof(src));
-       speed_rShift_16s("rShift_16s", "aligned", src, NULL, 3, dst,
-                        test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
-       speed_rShift_16s("rShift_16s", "unaligned", src + 1, NULL, 3, dst,
-                        test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
-       return SUCCESS;
+       UINT32 val;
+       INT16 ALIGN(src[MAX_TEST_SIZE+1]), ALIGN(dst[MAX_TEST_SIZE+1]);
+
+       winpr_RAND((BYTE*)src, sizeof(src));
+       winpr_RAND((BYTE*)&val, sizeof(val));
+
+       /* Keep the shift count within the 16 bit element width (1..15).
+        * A C shift by 32 or more is undefined behaviour. */
+       val = (val % 15) + 1;
+
+       if (!speed_test("rShift_16s", "aligned", g_Iterations,
+                       (speed_test_fkt)generic->rShiftC_16s,
+                       (speed_test_fkt)optimized->rShiftC_16s, src, val,
+                       dst, MAX_TEST_SIZE))
+               return FALSE;
+
+       if (!speed_test("rShift_16s", "unaligned", g_Iterations,
+                       (speed_test_fkt)generic->rShiftC_16s,
+                       (speed_test_fkt)optimized->rShiftC_16s, src + 1, val,
+                       dst, MAX_TEST_SIZE))
+               return FALSE;
+
+       return TRUE;
 }
 
 /* ------------------------------------------------------------------------- */
-int test_rShift_16u_speed(void)
+/*
+ * Hands the generic and optimized rShiftC_16u implementations to
+ * speed_test(), once with src and once with src + 1 as input.
+ * Returns FALSE as soon as speed_test() reports failure, TRUE otherwise.
+ */
+static BOOL test_rShift_16u_speed(void)
 {
+       UINT32 val;
        UINT16 ALIGN(src[MAX_TEST_SIZE + 1]), ALIGN(dst[MAX_TEST_SIZE + 1]);
-       winpr_RAND(src, sizeof(src));
-       speed_rShift_16u("rShift_16u", "aligned", src, NULL, 3, dst,
-                        test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
-       speed_rShift_16u("rShift_16u", "unaligned", src + 1, NULL, 3, dst,
-                        test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME);
-       return SUCCESS;
+
+       winpr_RAND((BYTE*)&val, sizeof(val));
+       winpr_RAND((BYTE*)src, sizeof(src));
+
+       /* Keep the shift count within the 16 bit element width (1..15).
+        * A C shift by 32 or more is undefined behaviour. */
+       val = (val % 15) + 1;
+
+       if (!speed_test("rShift_16u", "aligned", g_Iterations,
+                       (speed_test_fkt)generic->rShiftC_16u,
+                       (speed_test_fkt)optimized->rShiftC_16u, src, val,
+                       dst, MAX_TEST_SIZE))
+               return FALSE;
+
+       if (!speed_test("rShift_16u", "unaligned", g_Iterations,
+                       (speed_test_fkt)generic->rShiftC_16u,
+                       (speed_test_fkt)optimized->rShiftC_16u, src + 1, val,
+                       dst, MAX_TEST_SIZE))
+               return FALSE;
+
+       return TRUE;
 }
 
 int TestPrimitivesShift(int argc, char* argv[])
 {
-       int status;
-       status = test_lShift_16s_func();
+       prim_test_setup(FALSE);
 
-       if (status != SUCCESS)
+       if (!test_lShift_16s_func())
                return 1;
 
        if (g_TestPrimitivesPerformance)
        {
-               status = test_lShift_16s_speed();
-
-               if (status != SUCCESS)
+               if (!test_lShift_16s_speed())
                        return 1;
        }
 
-       status = test_lShift_16u_func();
-
-       if (status != SUCCESS)
+       if (!test_lShift_16u_func())
                return 1;
 
        if (g_TestPrimitivesPerformance)
        {
-               status = test_lShift_16u_speed();
-
-               if (status != SUCCESS)
+               if (!test_lShift_16u_speed())
                        return 1;
        }
 
-       status = test_rShift_16s_func();
-
-       if (status != SUCCESS)
+       if (!test_rShift_16s_func())
                return 1;
 
        if (g_TestPrimitivesPerformance)
        {
-               status = test_rShift_16s_speed();
-
-               if (status != SUCCESS)
+               if (!test_rShift_16s_speed())
                        return 1;
        }
 
-       status = test_rShift_16u_func();
-
-       if (status != SUCCESS)
+       if (!test_rShift_16u_func())
                return 1;
 
        if (g_TestPrimitivesPerformance)
        {
-               status = test_rShift_16u_speed();
-
-               if (status != SUCCESS)
+               if (!test_rShift_16u_speed())
                        return 1;
        }
 
+       if (!test_ShiftWrapper_16s_func())
+               return 1;
+
+       if (!test_ShiftWrapper_16u_func())
+               return 1;
+
        return 0;
 }
index 0496283..8091068 100644 (file)
 #include <winpr/sysinfo.h>
 #include "prim_test.h"
 
-static const int SIGN_PRETEST_ITERATIONS = 100000;
-static const float TEST_TIME = 1.0;
+#define TEST_BUFFER_SIZE 65535
 
 /* ------------------------------------------------------------------------- */
-static int test_sign16s_func(void)
+/*
+ * Functional test for sign_16s.
+ *
+ * Runs the generic and the optimized implementation on the same random
+ * input at two destination offsets (+1 and +2) and requires both to
+ * return PRIMITIVES_SUCCESS and to leave identical buffers behind.
+ *
+ * Returns TRUE on success, FALSE on the first failure.
+ */
+static BOOL test_sign16s_func(void)
 {
-       INT16 ALIGN(src[65535]), ALIGN(d1[65535]);
-#ifdef WITH_SSE2
-       INT16 ALIGN(d2[65535]);
-       int i;
-#endif
-       int failed = 0;
-       char testStr[256];
-       /* Test when we can reach 16-byte alignment */
-       testStr[0] = '\0';
-       winpr_RAND(src, sizeof(src));
-       general_sign_16s(src + 1, d1 + 1, 65535);
-#ifdef WITH_SSE2
-
-       if (IsProcessorFeaturePresentEx(PF_EX_SSSE3))
-       {
-               strcat(testStr, " SSSE3");
-               ssse3_sign_16s(src + 1, d2 + 1, 65535);
-
-               for (i = 1; i < 65535; ++i)
-               {
-                       if (d1[i] != d2[i])
-                       {
-                               printf("SIGN16s-SSE-aligned FAIL[%d] of %d: want %d, got %d\n",
-                                      i, src[i], d1[i], d2[i]);
-                               ++failed;
-                       }
-               }
-       }
-
-#endif /* i386 */
-       /* Test when we cannot reach 16-byte alignment */
-       winpr_RAND(src, sizeof(src));
-       general_sign_16s(src + 1, d1 + 2, 65535);
-#ifdef WITH_SSE2
-
-       if (IsProcessorFeaturePresentEx(PF_EX_SSSE3))
-       {
-               ssse3_sign_16s(src + 1, d2 + 2, 65535);
-
-               for (i = 2; i < 65535; ++i)
-               {
-                       if (d1[i] != d2[i])
-                       {
-                               printf("SIGN16s-SSE-unaligned FAIL[%d] of %d: want %d, got %d\n",
-                                      i, src[i - 1], d1[i], d2[i]);
-                               ++failed;
-                       }
-               }
-       }
-
-#endif /* i386 */
-
-       if (!failed) printf("All sign16s tests passed (%s).\n", testStr);
-
-       return (failed > 0) ? FAILURE : SUCCESS;
+       pstatus_t status;
+       /* Two spare elements keep the +1 / +2 offsets below inside the buffers. */
+       INT16 ALIGN(src[TEST_BUFFER_SIZE + 2]);
+       INT16 ALIGN(d1[TEST_BUFFER_SIZE + 2]);
+       INT16 ALIGN(d2[TEST_BUFFER_SIZE + 2]);
+
+       winpr_RAND((BYTE*)src, sizeof(src));
+
+       /* The memcmp calls below cover the whole buffers, so the elements
+        * that are never written must start out identical. */
+       memset(d1, 0, sizeof(d1));
+       memset(d2, 0, sizeof(d2));
+
+       status = generic->sign_16s(src + 1, d1 + 1, TEST_BUFFER_SIZE);
+       if (status != PRIMITIVES_SUCCESS)
+               return FALSE;
+       status = optimized->sign_16s(src + 1, d2 + 1, TEST_BUFFER_SIZE);
+       if (status != PRIMITIVES_SUCCESS)
+               return FALSE;
+
+       if (memcmp(d1, d2, sizeof(d1)) != 0)
+               return FALSE;
+
+       status = generic->sign_16s(src + 1, d1 + 2, TEST_BUFFER_SIZE);
+       if (status != PRIMITIVES_SUCCESS)
+               return FALSE;
+       status = optimized->sign_16s(src + 1, d2 + 2, TEST_BUFFER_SIZE);
+       if (status != PRIMITIVES_SUCCESS)
+               return FALSE;
+
+       if (memcmp(d1, d2, sizeof(d1)) != 0)
+               return FALSE;
+
+       return TRUE;
 }
 
-/* ------------------------------------------------------------------------- */
-STD_SPEED_TEST(sign16s_speed_test, INT16, INT16, dst = dst,
-              TRUE, general_sign_16s(src1, dst, size),
-#ifdef WITH_SSE2
-              TRUE, ssse3_sign_16s(src1, dst, size), PF_EX_SSSE3, TRUE,
-#else
-              FALSE, PRIM_NOP, 0, FALSE,
-#endif
-              FALSE, dst = dst);
-
 static int test_sign16s_speed(void)
 {
        INT16 ALIGN(src[MAX_TEST_SIZE + 3]), ALIGN(dst[MAX_TEST_SIZE + 3]);
-       winpr_RAND(src, sizeof(src));
-       sign16s_speed_test("sign16s", "aligned", src, NULL, 0, dst,
-                          test_sizes, NUM_TEST_SIZES, SIGN_PRETEST_ITERATIONS, TEST_TIME);
-       sign16s_speed_test("sign16s", "unaligned", src + 1, NULL, 0, dst,
-                          test_sizes, NUM_TEST_SIZES, SIGN_PRETEST_ITERATIONS, TEST_TIME);
-       return SUCCESS;
+       winpr_RAND((BYTE*)src, sizeof(src));
+
+       if (!speed_test("sign16s", "aligned", g_Iterations,
+                       (speed_test_fkt)generic->sign_16s,
+                       (speed_test_fkt)optimized->sign_16s, src + 1, dst + 1,
+                       MAX_TEST_SIZE))
+               return FALSE;
+
+       if (!speed_test("sign16s", "unaligned", g_Iterations,
+                       (speed_test_fkt)generic->sign_16s,
+                       (speed_test_fkt)optimized->sign_16s, src + 1, dst + 2,
+                       MAX_TEST_SIZE))
+               return FALSE;
+
+       return TRUE;
 }
 
 int TestPrimitivesSign(int argc, char* argv[])
 {
-       int status;
-       status = test_sign16s_func();
+       prim_test_setup(FALSE);
 
-       if (status != SUCCESS)
+       if (!test_sign16s_func())
                return 1;
 
        if (g_TestPrimitivesPerformance)
        {
-               status = test_sign16s_speed();
-
-               if (status != SUCCESS)
+               if (!test_sign16s_speed())
                        return 1;
        }
 
index 2dd36e9..a1f5084 100644 (file)
 #include <winpr/sysinfo.h>
 #include "prim_test.h"
 
-static const int YCOCG_TRIAL_ITERATIONS = 20000;
-static const float TEST_TIME = 4.0;
-
 /* ------------------------------------------------------------------------- */
-int test_YCoCgRToRGB_8u_AC4R_func(void)
+/* Run the generic and the optimized YCoCgToRGB_8u_AC4R primitive on the same
+ * random input for every destination format in formats[] and compare the
+ * results element by element.
+ * Returns FALSE as soon as a primitive reports an error; mismatches are all
+ * printed and reported through the return value.
+ * NOTE(review): the "*_inv" buffers are produced with exactly the same
+ * arguments as the regular ones, so the second comparison repeats the first. */
+static BOOL test_YCoCgRToRGB_8u_AC4R_func(void)
 {
-#ifdef WITH_SSE2
-       int i;
+       BOOL result = TRUE;
+       pstatus_t status;
        INT32 ALIGN(out_sse[4098]), ALIGN(out_sse_inv[4098]);
-#endif
        INT32 ALIGN(in[4098]);
        INT32 ALIGN(out_c[4098]), ALIGN(out_c_inv[4098]);
-       char testStr[256];
-       BOOL failed = FALSE;
-       testStr[0] = '\0';
-       winpr_RAND(in, sizeof(in));
-       general_YCoCgToRGB_8u_AC4R((const BYTE*)(in + 1), 63 * 4,
-                                  (BYTE*) out_c, 63 * 4, 63, 61, 2, TRUE, FALSE);
-       general_YCoCgToRGB_8u_AC4R((const BYTE*)(in + 1), 63 * 4,
-                                  (BYTE*) out_c_inv, 63 * 4, 63, 61, 2, TRUE, TRUE);
-#ifdef WITH_SSE2
-
-       if (IsProcessorFeaturePresentEx(PF_EX_SSSE3))
+
+       UINT32 i, x;
+       const UINT32 formats[] = {
+               PIXEL_FORMAT_ARGB32,
+               PIXEL_FORMAT_ABGR32,
+               PIXEL_FORMAT_RGBA32,
+               PIXEL_FORMAT_RGBX32,
+               PIXEL_FORMAT_BGRA32,
+               PIXEL_FORMAT_BGRX32
+       };
+
+       winpr_RAND((BYTE*)in, sizeof(in));
+
+       for (x=0; x<sizeof(formats)/sizeof(formats[0]); x++)
        {
-               strcat(testStr, " SSSE3");
-               ssse3_YCoCgRToRGB_8u_AC4R((const BYTE*)(in + 1), 63 * 4,
-                                         (BYTE*) out_sse, 63 * 4, 63, 61, 2, TRUE, FALSE);
+               UINT32 format = formats[x];
+
+               status = generic->YCoCgToRGB_8u_AC4R(
+                                (const BYTE*)(in + 1), 63 * 4,
+                                (BYTE*) out_c, format, 63 * 4, 63, 61, 2, TRUE);
+               if (status != PRIMITIVES_SUCCESS)
+                       return FALSE;
+               status = generic->YCoCgToRGB_8u_AC4R(
+                                (const BYTE*)(in + 1), 63 * 4,
+                                (BYTE*) out_c_inv, format, 63 * 4, 63, 61, 2, TRUE);
+               if (status != PRIMITIVES_SUCCESS)
+                       return FALSE;
+
+               status = optimized->YCoCgToRGB_8u_AC4R(
+                                (const BYTE*)(in + 1), 63 * 4,
+                                (BYTE*) out_sse, format, 63 * 4, 63, 61, 2, TRUE);
+               if (status != PRIMITIVES_SUCCESS)
+                       return FALSE;
+               status = optimized->YCoCgToRGB_8u_AC4R(
+                                (const BYTE*)(in + 1), 63 * 4,
+                                (BYTE*) out_sse_inv, format, 63 * 4, 63, 61, 2, TRUE);
+               if (status != PRIMITIVES_SUCCESS)
+                       return FALSE;
 
                for (i = 0; i < 63 * 61; ++i)
                {
                        if (out_c[i] != out_sse[i])
                        {
-                               printf("YCoCgRToRGB-SSE FAIL[%d]: 0x%08x -> C 0x%08x vs SSE 0x%08x\n", i,
+                               printf("optimized->YCoCgRToRGB FAIL[%u]: 0x%08x -> C 0x%08x vs optimized 0x%08x\n", i,
                                       in[i + 1], out_c[i], out_sse[i]);
-                               failed = TRUE;
+                               result = FALSE;
                        }
                }
 
-               ssse3_YCoCgRToRGB_8u_AC4R((const BYTE*)(in + 1), 63 * 4,
-                                         (BYTE*) out_sse_inv, 63 * 4, 63, 61, 2, TRUE, TRUE);
-
                for (i = 0; i < 63 * 61; ++i)
                {
                        if (out_c_inv[i] != out_sse_inv[i])
                        {
-                               printf("YCoCgRToRGB-SSE inverted FAIL[%d]: 0x%08x -> C 0x%08x vs SSE 0x%08x\n",
+                               printf("optimized->YCoCgRToRGB inverted FAIL[%u]: 0x%08x -> C 0x%08x vs optimized 0x%08x\n",
                                       i,
                                       in[i + 1], out_c_inv[i], out_sse_inv[i]);
-                               failed = TRUE;
+                               result = FALSE;
                        }
                }
        }
-
-#endif /* i386 */
-
-       if (!failed) printf("All YCoCgRToRGB_8u_AC4R tests passed (%s).\n", testStr);
-
-       return (failed > 0) ? FAILURE : SUCCESS;
+       return result;
 }
 
-/* ------------------------------------------------------------------------- */
-STD_SPEED_TEST(
-    ycocg_to_rgb_speed, BYTE, BYTE, PRIM_NOP,
-    TRUE, general_YCoCgToRGB_8u_AC4R(src1, 64 * 4, dst, 64 * 4, 64, 64, 2, FALSE,
-                                    FALSE),
-#ifdef WITH_SSE2
-    TRUE, ssse3_YCoCgRToRGB_8u_AC4R(src1, 64 * 4, dst, 64 * 4, 64, 64, 2, FALSE,
-                                   FALSE),
-    PF_EX_SSSE3, TRUE,
-#else
-    FALSE, PRIM_NOP, 0, FALSE,
-#endif
-    FALSE, PRIM_NOP);
-
 static int test_YCoCgRToRGB_8u_AC4R_speed(void)
 {
        INT32 ALIGN(in[4096]);
        INT32 ALIGN(out[4096]);
-       int size_array[] = { 64 };
-       winpr_RAND(in, sizeof(in));
-       ycocg_to_rgb_speed("YCoCgToRGB", "aligned", (const BYTE*) in,
-                          0, 0, (BYTE*) out,
-                          size_array, 1, YCOCG_TRIAL_ITERATIONS, TEST_TIME);
-       return SUCCESS;
+
+       winpr_RAND((BYTE*)in, sizeof(in));
+
+       if (!speed_test("YCoCgToRGB_8u_AC4R", "aligned", g_Iterations,
+                       (speed_test_fkt)generic->YCoCgToRGB_8u_AC4R,
+                       (speed_test_fkt)optimized->YCoCgToRGB_8u_AC4R,
+                       in, 64 * 4, out, 64 * 4, 64, 64, 2, FALSE, FALSE))
+               return FALSE;
+
+       return TRUE;
 }
 
 int TestPrimitivesYCoCg(int argc, char* argv[])
 {
-       int status;
-       status = test_YCoCgRToRGB_8u_AC4R_func();
+       prim_test_setup(FALSE);
 
-       if (status != SUCCESS)
+       if (!test_YCoCgRToRGB_8u_AC4R_func())
                return 1;
 
        if (g_TestPrimitivesPerformance)
        {
-               status = test_YCoCgRToRGB_8u_AC4R_speed();
-
-               if (status != SUCCESS)
+               if (!test_YCoCgRToRGB_8u_AC4R_speed())
                        return 1;
        }
 
index 7281763..c7af894 100644 (file)
@@ -38,8 +38,8 @@ static void get_size(UINT32* width, UINT32* height)
        winpr_RAND((BYTE*)width, sizeof(*width));
        winpr_RAND((BYTE*)height, sizeof(*height));
        // TODO: Algorithm only works on even resolutions...
-       *width = (*width % 4000) << 1;
-       *height = (*height % 4000 << 1);
+       *width = (*width % 64) << 1;
+       *height = (*height % 64 << 1);
 }
 
 static BOOL check_padding(const BYTE* psrc, size_t size, size_t padding,
@@ -370,11 +370,13 @@ static BOOL TestPrimitiveYUV(BOOL use444)
 
        if (use444)
        {
-               if (prims->RGBToYUV444_8u_P3AC4R(rgb, stride, yuv, yuv_step,
+               if (prims->RGBToYUV444_8u_P3AC4R(rgb, PIXEL_FORMAT_BGRA32,
+                                                stride, yuv, yuv_step,
                                                 &roi) != PRIMITIVES_SUCCESS)
                        goto fail;
        }
-       else if (prims->RGBToYUV420_8u_P3AC4R(rgb, stride, yuv, yuv_step,
+       else if (prims->RGBToYUV420_8u_P3AC4R(rgb, PIXEL_FORMAT_BGRA32,
+                                             stride, yuv, yuv_step,
                                              &roi) != PRIMITIVES_SUCCESS)
                goto fail;
 
@@ -429,16 +431,16 @@ int TestPrimitivesYUV(int argc, char* argv[])
        UINT32 x;
        int rc = -1;
 
+       prim_test_setup(FALSE);
+
        for (x = 0; x < 10; x++)
        {
-               /* TODO: This test fails on value comparison,
-                * there seems to be some issue left with encoder / decoder pass.
-               if (!TestPrimitiveYUV(FALSE))
-                       goto end;
-                       */
                if (!TestPrimitiveYUV(TRUE))
                        goto end;
 
+               if (!TestPrimitiveYUV(FALSE))
+                       goto end;
+
                if (!TestPrimitiveYUVCombine())
                        goto end;
        }
index e061887..45519f3 100644 (file)
 extern int test_sizes[];
 #define NUM_TEST_SIZES 10
 
-#ifndef SUCCESS
-#define SUCCESS 0
-#endif
-#ifndef FAILURE
-#define FAILURE 1
-#endif
-
 extern BOOL g_TestPrimitivesPerformance;
 extern UINT32 g_Iterations;
 
@@ -58,8 +51,10 @@ extern primitives_t* optimized;
 
 void prim_test_setup(BOOL performance);
 
+typedef pstatus_t (*speed_test_fkt)();
+
 BOOL speed_test(const char* name, const char* dsc, UINT32 iterations,
-               pstatus_t (*generic)(), pstatus_t (*optimised)(),
+               speed_test_fkt generic, speed_test_fkt optimized,
                ...);
 
 #endif // !__PRIMTEST_H_INCLUDED__