From 99c418766c74c9b2206881d81612f4cbe2805542 Mon Sep 17 00:00:00 2001 From: Armin Novak Date: Wed, 13 Jul 2016 14:04:48 +0200 Subject: [PATCH] Updated primitives API and tests. --- include/freerdp/primitives.h | 46 +- libfreerdp/codec/h264.c | 2 +- libfreerdp/primitives/prim_YCoCg.c | 17 +- libfreerdp/primitives/prim_YCoCg_opt.c | 82 ++- libfreerdp/primitives/prim_YUV.c | 56 +- libfreerdp/primitives/prim_YUV_opt.c | 5 + libfreerdp/primitives/prim_add.c | 2 +- libfreerdp/primitives/prim_add_opt.c | 2 +- libfreerdp/primitives/prim_alphaComp.c | 26 +- libfreerdp/primitives/prim_alphaComp_opt.c | 32 +- libfreerdp/primitives/prim_colors.c | 8 +- libfreerdp/primitives/prim_colors_opt.c | 16 +- libfreerdp/primitives/prim_set.c | 6 +- libfreerdp/primitives/prim_set_opt.c | 6 +- libfreerdp/primitives/prim_shift.c | 20 +- libfreerdp/primitives/prim_shift_opt.c | 8 +- libfreerdp/primitives/prim_sign.c | 2 +- libfreerdp/primitives/prim_sign_opt.c | 2 +- libfreerdp/primitives/prim_templates.h | 708 +++++++++++---------- libfreerdp/primitives/primitives.c | 2 +- libfreerdp/primitives/test/TestPrimitivesAdd.c | 14 +- .../primitives/test/TestPrimitivesAlphaComp.c | 53 +- libfreerdp/primitives/test/TestPrimitivesAndOr.c | 126 ++-- libfreerdp/primitives/test/TestPrimitivesColors.c | 179 +++--- libfreerdp/primitives/test/TestPrimitivesCopy.c | 53 +- libfreerdp/primitives/test/TestPrimitivesSet.c | 42 +- libfreerdp/primitives/test/TestPrimitivesShift.c | 458 ++++++++----- libfreerdp/primitives/test/TestPrimitivesSign.c | 128 ++-- libfreerdp/primitives/test/TestPrimitivesYCoCg.c | 118 ++-- libfreerdp/primitives/test/TestPrimitivesYUV.c | 20 +- libfreerdp/primitives/test/prim_test.h | 11 +- 31 files changed, 1195 insertions(+), 1055 deletions(-) diff --git a/include/freerdp/primitives.h b/include/freerdp/primitives.h index 287c892..8eb92f9 100644 --- a/include/freerdp/primitives.h +++ b/include/freerdp/primitives.h @@ -81,62 +81,62 @@ typedef pstatus_t (*__copy_8u_AC4r_t)( typedef pstatus_t (*__set_8u_t)( BYTE val, BYTE* pDst, - INT32 len); + UINT32 len); typedef pstatus_t (*__set_32s_t)( INT32 val, INT32* pDst, - INT32 len); + UINT32 len); typedef pstatus_t (*__set_32u_t)( UINT32 val, UINT32* pDst, - INT32 len); + UINT32 len); typedef pstatus_t (*__zero_t)( void* pDst, size_t bytes); typedef pstatus_t (*__alphaComp_argb_t)( - const BYTE* pSrc1, INT32 src1Step, - const BYTE* pSrc2, INT32 src2Step, - BYTE* pDst, INT32 dstStep, - INT32 width, INT32 height); + const BYTE* pSrc1, UINT32 src1Step, + const BYTE* pSrc2, UINT32 src2Step, + BYTE* pDst, UINT32 dstStep, + UINT32 width, UINT32 height); typedef pstatus_t (*__add_16s_t)( const INT16* pSrc1, const INT16* pSrc2, INT16* pDst, - INT32 len); + UINT32 len); typedef pstatus_t (*__lShiftC_16s_t)( const INT16* pSrc, - INT32 val, + UINT32 val, INT16* pSrcDst, - INT32 len); + UINT32 len); typedef pstatus_t (*__lShiftC_16u_t)( const UINT16* pSrc, - INT32 val, + UINT32 val, UINT16* pSrcDst, - INT32 len); + UINT32 len); typedef pstatus_t (*__rShiftC_16s_t)( const INT16* pSrc, - INT32 val, + UINT32 val, INT16* pSrcDst, - INT32 len); + UINT32 len); typedef pstatus_t (*__rShiftC_16u_t)( const UINT16* pSrc, - INT32 val, + UINT32 val, UINT16* pSrcDst, - INT32 len); + UINT32 len); typedef pstatus_t (*__shiftC_16s_t)( const INT16* pSrc, INT32 val, INT16* pSrcDst, - INT32 len); + UINT32 len); typedef pstatus_t (*__shiftC_16u_t)( const UINT16* pSrc, INT32 val, UINT16* pSrcDst, - INT32 len); + UINT32 len); typedef pstatus_t (*__sign_16s_t)( const INT16* pSrc, INT16* pDst, - INT32 len); + UINT32 len); typedef pstatus_t (*__yCbCrToRGB_16s8u_P3AC4R_t)( const INT16* pSrc[3], INT32 srcStep, BYTE* pDst, INT32 dstStep, UINT32 DstFormat, @@ -154,8 +154,8 @@ typedef pstatus_t (*__RGBToYCbCr_16s16s_P3P3_t)( INT16* pDst[3], INT32 dstStep, const prim_size_t* roi); typedef pstatus_t (*__RGBToRGB_16s8u_P3AC4R_t)( - const INT16* pSrc[3], INT32 srcStep, - BYTE* pDst, INT32 dstStep, UINT32 DstFormat, + const INT16* const pSrc[3], UINT32 srcStep, + BYTE* pDst, UINT32 dstStep, UINT32 DstFormat, const prim_size_t* roi); typedef pstatus_t (*__YCoCgToRGB_8u_AC4R_t)( const BYTE* pSrc, INT32 srcStep, @@ -177,11 +177,11 @@ typedef pstatus_t (*__YUV444ToRGB_8u_P3AC4R_t)( BYTE* pDst, UINT32 dstStep, UINT32 DstFormat, const prim_size_t* roi); typedef pstatus_t (*__RGBToYUV420_8u_P3AC4R_t)( - const BYTE* pSrc, UINT32 srcStep, + const BYTE* pSrc, UINT32 SrcFormat, UINT32 srcStep, BYTE* pDst[3], UINT32 dstStep[3], const prim_size_t* roi); typedef pstatus_t (*__RGBToYUV444_8u_P3AC4R_t)( - const BYTE* pSrc, UINT32 srcStep, + const BYTE* pSrc, UINT32 SrcFormat, UINT32 srcStep, BYTE* pDst[3], UINT32 dstStep[3], const prim_size_t* roi); typedef pstatus_t (*__YUV420CombineToYUV444_t)( diff --git a/libfreerdp/codec/h264.c b/libfreerdp/codec/h264.c index 2686e92..c6b16e2 100644 --- a/libfreerdp/codec/h264.c +++ b/libfreerdp/codec/h264.c @@ -1567,7 +1567,7 @@ INT32 avc420_compress(H264_CONTEXT* h264, BYTE* pSrcData, DWORD SrcFormat, roi.width = nSrcWidth; roi.height = nSrcHeight; - prims->RGBToYUV420_8u_P3AC4R(pSrcData, nSrcStep, pYUVData, iStride, &roi); + prims->RGBToYUV420_8u_P3AC4R(pSrcData, SrcFormat, nSrcStep, pYUVData, iStride, &roi); status = h264->subsystem->Compress(h264, ppDstData, pDstSize, 0); diff --git a/libfreerdp/primitives/prim_YCoCg.c b/libfreerdp/primitives/prim_YCoCg.c index 0b264d2..518f248 100644 --- a/libfreerdp/primitives/prim_YCoCg.c +++ b/libfreerdp/primitives/prim_YCoCg.c @@ -33,11 +33,11 @@ /* ------------------------------------------------------------------------- */ static pstatus_t general_YCoCgToRGB_8u_AC4R( - const BYTE* pSrc, INT32 srcStep, - BYTE* pDst, UINT32 DstFormat, INT32 dstStep, - UINT32 width, UINT32 height, - UINT8 shift, - BOOL withAlpha) + const BYTE* pSrc, INT32 srcStep, + BYTE* pDst, UINT32 DstFormat, INT32 dstStep, + UINT32 width, UINT32 height, + UINT8 shift, + BOOL withAlpha) { BYTE A; UINT32 x, y; @@ -66,8 +66,11 @@ static pstatus_t general_YCoCgToRGB_8u_AC4R( R = T + Co; G = Y + Cg; B = T - Co; - color = GetColor(DstFormat, MINMAX(R, 0, 255), MINMAX(G, 0, 255), MINMAX(B, 0, - 255), A); + + color = GetColor(DstFormat, + MINMAX(R, 0, 255), MINMAX(G, 0, 255), + MINMAX(B, 0, 255), A); + WriteColor(dptr, DstFormat, color); dptr += GetBytesPerPixel(DstFormat); } diff --git a/libfreerdp/primitives/prim_YCoCg_opt.c b/libfreerdp/primitives/prim_YCoCg_opt.c index 93cddb5..75ba935 100644 --- a/libfreerdp/primitives/prim_YCoCg_opt.c +++ b/libfreerdp/primitives/prim_YCoCg_opt.c @@ -40,8 +40,8 @@ static primitives_t* generic = NULL; #ifdef WITH_SSE2 /* ------------------------------------------------------------------------- */ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert( - const BYTE* pSrc, INT32 srcStep, - BYTE* pDst, INT32 dstStep, + const BYTE* pSrc, UINT32 srcStep, + BYTE* pDst, UINT32 DstFormat, UINT32 dstStep, UINT32 width, UINT32 height, UINT8 shift, BOOL withAlpha) @@ -70,8 +70,8 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert( { /* Too small, or we'll never hit a 16-byte boundary. Punt. */ return generic->YCoCgToRGB_8u_AC4R( - pSrc, srcStep, pDst, dstStep, - width, height, shift, withAlpha, TRUE); + pSrc, srcStep, pDst, DstFormat, dstStep, + width, height, shift, withAlpha); } for (h = 0; h < height; h++) @@ -82,12 +82,16 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert( /* Get to a 16-byte destination boundary. */ if ((ULONG_PTR) dptr & 0x0f) { + pstatus_t status; int startup = (16 - ((ULONG_PTR) dptr & 0x0f)) / 4; if (startup > width) startup = width; - generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep, - startup, 1, shift, withAlpha, TRUE); + status = generic->YCoCgToRGB_8u_AC4R( + sptr, srcStep, dptr, DstFormat, dstStep, + startup, 1, shift, withAlpha); + if (status != PRIMITIVES_SUCCESS) + return status; sptr += startup * sizeof(UINT32); dptr += startup * sizeof(UINT32); w -= startup; @@ -195,8 +199,13 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert( /* Handle any remainder pixels. */ if (w > 0) { - generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep, - w, 1, shift, withAlpha, TRUE); + pstatus_t status; + status = generic->YCoCgToRGB_8u_AC4R( + sptr, srcStep, dptr, DstFormat, dstStep, + w, 1, shift, withAlpha); + if (status != PRIMITIVES_SUCCESS) + return status; + sptr += w * sizeof(UINT32); dptr += w * sizeof(UINT32); } @@ -210,8 +219,8 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert( /* ------------------------------------------------------------------------- */ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert( - const BYTE* pSrc, INT32 srcStep, - BYTE* pDst, INT32 dstStep, + const BYTE* pSrc, UINT32 srcStep, + BYTE* pDst, UINT32 DstFormat, UINT32 dstStep, UINT32 width, UINT32 height, UINT8 shift, BOOL withAlpha) @@ -240,9 +249,8 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert( { /* Too small, or we'll never hit a 16-byte boundary. Punt. */ return generic->YCoCgToRGB_8u_AC4R( - pSrc, srcStep, - pDst, dstStep, width, height, shift, - withAlpha, FALSE); + pSrc, srcStep, pDst, DstFormat, dstStep, + width, height, shift, withAlpha); } for (h = 0; h < height; h++) @@ -253,12 +261,17 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert( /* Get to a 16-byte destination boundary. */ if ((ULONG_PTR) dptr & 0x0f) { + pstatus_t status; int startup = (16 - ((ULONG_PTR) dptr & 0x0f)) / 4; if (startup > width) startup = width; - generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep, - startup, 1, shift, withAlpha, FALSE); + status = generic->YCoCgToRGB_8u_AC4R( + sptr, srcStep, dptr, DstFormat, + dstStep, startup, 1, shift, withAlpha); + if (status != PRIMITIVES_SUCCESS) + return status; + sptr += startup * sizeof(UINT32); dptr += startup * sizeof(UINT32); w -= startup; @@ -370,8 +383,13 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert( /* Handle any remainder pixels. */ if (w > 0) { - generic->YCoCgToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep, - w, 1, shift, withAlpha, FALSE); + pstatus_t status; + status = generic->YCoCgToRGB_8u_AC4R( + sptr, srcStep, dptr, DstFormat, dstStep, + w, 1, shift, withAlpha); + if (status != PRIMITIVES_SUCCESS) + return status; + sptr += w * sizeof(UINT32); dptr += w * sizeof(UINT32); } @@ -388,21 +406,29 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert( /* ------------------------------------------------------------------------- */ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R( const BYTE* pSrc, INT32 srcStep, - BYTE* pDst, INT32 dstStep, + BYTE* pDst, UINT32 DstFormat, INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift, - BOOL withAlpha, - BOOL invert) + BOOL withAlpha) { - if (invert) - { - return ssse3_YCoCgRToRGB_8u_AC4R_invert(pSrc, srcStep, pDst, dstStep, - width, height, shift, withAlpha); - } - else + // TODO: Need to implement proper color conversion!!! + return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, + dstStep, width, height, shift, withAlpha); + + switch(DstFormat) { - return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(pSrc, srcStep, pDst, dstStep, - width, height, shift, withAlpha); + case PIXEL_FORMAT_BGRX32: + case PIXEL_FORMAT_BGRA32: + return ssse3_YCoCgRToRGB_8u_AC4R_invert( + pSrc, srcStep, pDst, DstFormat, dstStep, + width, height, shift, withAlpha); + case PIXEL_FORMAT_RGBX32: + case PIXEL_FORMAT_RGBA32: + return ssse3_YCoCgRToRGB_8u_AC4R_no_invert( + pSrc, srcStep, pDst, DstFormat, dstStep, + width, height, shift, withAlpha); + default: + return -1; } } #endif /* WITH_SSE2 */ diff --git a/libfreerdp/primitives/prim_YUV.c b/libfreerdp/primitives/prim_YUV.c index bb32f34..5318020 100644 --- a/libfreerdp/primitives/prim_YUV.c +++ b/libfreerdp/primitives/prim_YUV.c @@ -232,9 +232,9 @@ static pstatus_t general_YUV444SplitToYUV420( { /* Filter */ const INT32 u = pSrcU[2 * x] + pSrcU[2 * x + 1] + pSrcU1[2 * x] - + pSrcU1[2 * x + 1]; + + pSrcU1[2 * x + 1]; const INT32 v = pSrcV[2 * x] + pSrcV[2 * x + 1] + pSrcV1[2 * x] - + pSrcV1[2 * x + 1]; + + pSrcV1[2 * x + 1]; pU[x] = CLIP(u / 4L); pV[x] = CLIP(v / 4L); } @@ -331,7 +331,7 @@ static INLINE BYTE* writePixel(BYTE* dst, UINT32 format, BYTE Y, BYTE U, BYTE V) const BYTE r = YUV2R(Y, U, V); const BYTE g = YUV2G(Y, U, V); const BYTE b = YUV2B(Y, U, V); - UINT32 color = GetColor(format, r, g, b, 0); + UINT32 color = GetColor(format, r, g, b, 0xFF); WriteColor(dst, format, color); return dst + GetBytesPerPixel(format); } @@ -500,9 +500,10 @@ static INLINE BYTE RGB2V(INT32 R, INT32 G, INT32 B) } static pstatus_t general_RGBToYUV444_8u_P3AC4R( - const BYTE* pSrc, const UINT32 srcStep, + const BYTE* pSrc, UINT32 SrcFormat, const UINT32 srcStep, BYTE* pDst[3], UINT32 dstStep[3], const prim_size_t* roi) { + const UINT32 bpp = GetBytesPerPixel(SrcFormat); UINT32 x, y; UINT32 nWidth, nHeight; nWidth = roi->width; @@ -517,9 +518,10 @@ static pstatus_t general_RGBToYUV444_8u_P3AC4R( for (x = 0; x < nWidth; x++) { - const BYTE B = pRGB[4 * x + 0]; - const BYTE G = pRGB[4 * x + 1]; - const BYTE R = pRGB[4 * x + 2]; + BYTE B, G, R; + const UINT32 color = ReadColor(&pRGB[x * bpp], SrcFormat); + SplitColor(color, SrcFormat, &R, &G, &B, NULL, NULL); + pY[x] = RGB2Y(R, G, B); pU[x] = RGB2U(R, G, B); pV[x] = RGB2V(R, G, B); @@ -530,9 +532,10 @@ static pstatus_t general_RGBToYUV444_8u_P3AC4R( } static pstatus_t general_RGBToYUV420_8u_P3AC4R( - const BYTE* pSrc, UINT32 srcStep, + const BYTE* pSrc, UINT32 SrcFormat, UINT32 srcStep, BYTE* pDst[3], UINT32 dstStep[3], const prim_size_t* roi) { + const UINT32 bpp = GetBytesPerPixel(SrcFormat); UINT32 x, y; UINT32 halfWidth; UINT32 halfHeight; @@ -555,39 +558,50 @@ static pstatus_t general_RGBToYUV420_8u_P3AC4R( for (x = 0; x < halfWidth; x++) { - INT32 R, G, B; + UINT32 color; INT32 Ra, Ga, Ba; const UINT32 val2x = (x * 2); const UINT32 val2x1 = val2x + 1; + BYTE B, G, R; + /* 1st pixel */ - Ba = B = pRGB[val2x * 4 + 0]; - Ga = G = pRGB[val2x * 4 + 1]; - Ra = R = pRGB[val2x * 4 + 2]; + color = ReadColor(&pRGB[val2x * bpp], SrcFormat); + SplitColor(color, SrcFormat, &R, &G, &B, NULL, NULL); + + Ba = B; + Ga = G; + Ra = R; pY[val2x] = RGB2Y(R, G, B); if (val2x1 < nWidth) { /* 2nd pixel */ - Ba += B = pRGB[val2x * 4 + 4]; - Ga += G = pRGB[val2x * 4 + 5]; - Ra += R = pRGB[val2x * 4 + 6]; + color = ReadColor(&pRGB[val2x1 * bpp], SrcFormat); + SplitColor(color, SrcFormat, &R, &G, &B, NULL, NULL); + Ba += B; + Ga += G; + Ra += R; pY[val2x1] = RGB2Y(R, G, B); } if (val2y1 < nHeight) { /* 3rd pixel */ - Ba += B = pRGB1[val2x * 4 + 0]; - Ga += G = pRGB1[val2x * 4 + 1]; - Ra += R = pRGB1[val2x * 4 + 2]; + color = ReadColor(&pRGB1[val2x * bpp], SrcFormat); + SplitColor(color, SrcFormat, &R, &G, &B, NULL, NULL); + Ba += B; + Ga += G; + Ra += R; pY1[val2x] = RGB2Y(R, G, B); if (val2x1 < nWidth) { /* 4th pixel */ - Ba += B = pRGB1[val2x * 4 + 4]; - Ga += G = pRGB1[val2x * 4 + 5]; - Ra += R = pRGB1[val2x * 4 + 6]; + color = ReadColor(&pRGB1[val2x1 * bpp], SrcFormat); + SplitColor(color, SrcFormat, &R, &G, &B, NULL, NULL); + Ba += B; + Ga += G; + Ra += R; pY1[val2x1] = RGB2Y(R, G, B); } } diff --git a/libfreerdp/primitives/prim_YUV_opt.c b/libfreerdp/primitives/prim_YUV_opt.c index 36c31d6..b8d2172 100644 --- a/libfreerdp/primitives/prim_YUV_opt.c +++ b/libfreerdp/primitives/prim_YUV_opt.c @@ -35,6 +35,11 @@ static pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R( UINT32 i, nWidth, nHeight, VaddDst, VaddY, VaddU, VaddV; __m128i r0, r1, r2, r3, r4, r5, r6, r7; __m128i* buffer; + + // TODO: Need to implement proper color conversion!!!!! + return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, + DstFormat, roi); + /* last_line: if the last (U,V doubled) line should be skipped, set to 10B * last_column: if it's the last column in a line, set to 10B (for handling line-endings not multiple by four) */ buffer = _aligned_malloc(4 * 16, 16); diff --git a/libfreerdp/primitives/prim_add.c b/libfreerdp/primitives/prim_add.c index 5984402..cc63268 100644 --- a/libfreerdp/primitives/prim_add.c +++ b/libfreerdp/primitives/prim_add.c @@ -30,7 +30,7 @@ static pstatus_t general_add_16s( const INT16* pSrc1, const INT16* pSrc2, INT16* pDst, - INT32 len) + UINT32 len) { while (len--) { diff --git a/libfreerdp/primitives/prim_add_opt.c b/libfreerdp/primitives/prim_add_opt.c index d49d634..c6c7504 100644 --- a/libfreerdp/primitives/prim_add_opt.c +++ b/libfreerdp/primitives/prim_add_opt.c @@ -40,7 +40,7 @@ static primitives_t* generic = NULL; # if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) /* ------------------------------------------------------------------------- */ SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s, - _mm_adds_epi16, generic->add_16s(sptr1++, sptr2++, dptr++, 1)) + _mm_adds_epi16, generic->add_16s(sptr1++, sptr2++, dptr++, 1)) # endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */ #endif diff --git a/libfreerdp/primitives/prim_alphaComp.c b/libfreerdp/primitives/prim_alphaComp.c index a306b9c..cc2f75e 100644 --- a/libfreerdp/primitives/prim_alphaComp.c +++ b/libfreerdp/primitives/prim_alphaComp.c @@ -36,23 +36,19 @@ /* ------------------------------------------------------------------------- */ static pstatus_t general_alphaComp_argb( - const BYTE* pSrc1, INT32 src1Step, - const BYTE* pSrc2, INT32 src2Step, - BYTE* pDst, INT32 dstStep, - INT32 width, INT32 height) + const BYTE* pSrc1, UINT32 src1Step, + const BYTE* pSrc2, UINT32 src2Step, + BYTE* pDst, UINT32 dstStep, + UINT32 width, UINT32 height) { - const UINT32* sptr1 = (const UINT32*) pSrc1; - const UINT32* sptr2 = (const UINT32*) pSrc2; - UINT32* dptr = (UINT32*) pDst; - int linebytes = width * sizeof(UINT32); - int src1Jump = (src1Step - linebytes) / sizeof(UINT32); - int src2Jump = (src2Step - linebytes) / sizeof(UINT32); - int dstJump = (dstStep - linebytes) / sizeof(UINT32); - int y; + UINT32 y; for (y = 0; y < height; y++) { - int x; + const UINT32* sptr1 = (const UINT32*) (pSrc1 + y * src1Step); + const UINT32* sptr2 = (const UINT32*) (pSrc2 + y * src2Step); + UINT32* dptr = (UINT32*) (pDst + y * dstStep); + UINT32 x; for (x = 0; x < width; x++) { @@ -92,10 +88,6 @@ static pstatus_t general_alphaComp_argb( *dptr++ = rb | ag; } } - - sptr1 += src1Jump; - sptr2 += src2Jump; - dptr += dstJump; } return PRIMITIVES_SUCCESS; diff --git a/libfreerdp/primitives/prim_alphaComp_opt.c b/libfreerdp/primitives/prim_alphaComp_opt.c index d0f9744..94daf27 100644 --- a/libfreerdp/primitives/prim_alphaComp_opt.c +++ b/libfreerdp/primitives/prim_alphaComp_opt.c @@ -46,10 +46,10 @@ static primitives_t* generic = NULL; #if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) pstatus_t sse2_alphaComp_argb( - const BYTE* pSrc1, INT32 src1Step, - const BYTE* pSrc2, INT32 src2Step, - BYTE* pDst, INT32 dstStep, - INT32 width, INT32 height) + const BYTE* pSrc1, UINT32 src1Step, + const BYTE* pSrc2, UINT32 src2Step, + BYTE* pDst, UINT32 dstStep, + UINT32 width, UINT32 height) { const UINT32* sptr1 = (const UINT32*) pSrc1; const UINT32* sptr2 = (const UINT32*) pSrc2; @@ -62,7 +62,7 @@ pstatus_t sse2_alphaComp_argb( if (width < 4) /* pointless if too small */ { return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step, - pDst, dstStep, width, height); + pDst, dstStep, width, height); } dptr = (UINT32*) pDst; @@ -108,9 +108,13 @@ pstatus_t sse2_alphaComp_argb( if (leadIn) { - generic->alphaComp_argb((const BYTE*) sptr1, - src1Step, (const BYTE*) sptr2, src2Step, - (BYTE*) dptr, dstStep, leadIn, 1); + pstatus_t status; + status = generic->alphaComp_argb((const BYTE*) sptr1, + src1Step, (const BYTE*) sptr2, src2Step, + (BYTE*) dptr, dstStep, leadIn, 1); + if (status != PRIMITIVES_SUCCESS) + return status; + sptr1 += leadIn; sptr2 += leadIn; dptr += leadIn; @@ -181,9 +185,13 @@ pstatus_t sse2_alphaComp_argb( /* Finish off the remainder. */ if (pixels) { - generic->alphaComp_argb((const BYTE*) sptr1, src1Step, - (const BYTE*) sptr2, src2Step, - (BYTE*) dptr, dstStep, pixels, 1); + pstatus_t status; + status = generic->alphaComp_argb((const BYTE*) sptr1, src1Step, + (const BYTE*) sptr2, src2Step, + (BYTE*) dptr, dstStep, pixels, 1); + if (status != PRIMITIVES_SUCCESS) + return status; + sptr1 += pixels; sptr2 += pixels; dptr += pixels; @@ -212,7 +220,7 @@ static pstatus_t ipp_alphaComp_argb( sz.width = width; sz.height = height; return ippiAlphaComp_8u_AC4R(pSrc1, src1Step, pSrc2, src2Step, - pDst, dstStep, sz, ippAlphaOver); + pDst, dstStep, sz, ippAlphaOver); } #endif diff --git a/libfreerdp/primitives/prim_colors.c b/libfreerdp/primitives/prim_colors.c index 30bc131..b2dce1d 100644 --- a/libfreerdp/primitives/prim_colors.c +++ b/libfreerdp/primitives/prim_colors.c @@ -262,7 +262,7 @@ static pstatus_t general_RGBToYCbCr_16s16s_P3P3( for (y = 0; y < roi->height; y++) { - int x; + UINT32 x; for (x = 0; x < roi->width; ++x) { @@ -305,10 +305,10 @@ static pstatus_t general_RGBToYCbCr_16s16s_P3P3( /* ------------------------------------------------------------------------- */ static pstatus_t general_RGBToRGB_16s8u_P3AC4R( - const INT16* pSrc[3], /* 16-bit R,G, and B arrays */ - INT32 srcStep, /* bytes between rows in source data */ + const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */ + UINT32 srcStep, /* bytes between rows in source data */ BYTE* pDst, /* 32-bit interleaved ARGB (ABGR?) data */ - INT32 dstStep, /* bytes between rows in dest data */ + UINT32 dstStep, /* bytes between rows in dest data */ UINT32 DstFormat, const prim_size_t* roi) /* region of interest */ { diff --git a/libfreerdp/primitives/prim_colors_opt.c b/libfreerdp/primitives/prim_colors_opt.c index ac5c662..f349fca 100644 --- a/libfreerdp/primitives/prim_colors_opt.c +++ b/libfreerdp/primitives/prim_colors_opt.c @@ -91,7 +91,7 @@ static pstatus_t sse2_yCbCrToRGB_16s16s_P3P3( { /* We can't maintain 16-byte alignment. */ return generic->yCbCrToRGB_16s16s_P3P3(pSrc, srcStep, - pDst, dstStep, roi); + pDst, dstStep, roi); } zero = _mm_setzero_si128(); @@ -228,7 +228,7 @@ static pstatus_t sse2_RGBToYCbCr_16s16s_P3P3( { /* We can't maintain 16-byte alignment. */ return generic->RGBToYCbCr_16s16s_P3P3(pSrc, srcStep, - pDst, dstStep, roi); + pDst, dstStep, roi); } min = _mm_set1_epi16(-128 * 32); @@ -357,10 +357,10 @@ static pstatus_t sse2_RGBToYCbCr_16s16s_P3P3( _mm_set1_epi32(0xFFFFFFFFU) pstatus_t sse2_RGBToRGB_16s8u_P3AC4R( - const INT16* pSrc[3], /* 16-bit R,G, and B arrays */ - INT32 srcStep, /* bytes between rows in source data */ + const INT16* const pSrc[3], /* 16-bit R,G, and B arrays */ + UINT32 srcStep, /* bytes between rows in source data */ BYTE* pDst, /* 32-bit interleaved ARGB (ABGR?) data */ - INT32 dstStep, /* bytes between rows in dest data */ + UINT32 dstStep, /* bytes between rows in dest data */ UINT32 DstFormat, const prim_size_t* roi) /* region of interest */ { @@ -385,9 +385,13 @@ pstatus_t sse2_RGBToRGB_16s8u_P3AC4R( || (dstStep & 0x0f)) { return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, - dstStep, DstFormat, roi); + dstStep, DstFormat, roi); } + // TODO: Need to update SSE code to allow color conversion!!! + return generic->RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, + dstStep, DstFormat, roi); + out = (BYTE*) pDst; srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16); dstbump = (dstStep - (roi->width * sizeof(UINT32))); diff --git a/libfreerdp/primitives/prim_set.c b/libfreerdp/primitives/prim_set.c index 3770e50..b9e9e0e 100644 --- a/libfreerdp/primitives/prim_set.c +++ b/libfreerdp/primitives/prim_set.c @@ -29,7 +29,7 @@ static pstatus_t general_set_8u( BYTE val, BYTE* pDst, - INT32 len) + UINT32 len) { memset((void*) pDst, (int) val, (size_t) len); return PRIMITIVES_SUCCESS; @@ -48,7 +48,7 @@ static pstatus_t general_zero( static pstatus_t general_set_32s( INT32 val, INT32* pDst, - INT32 len) + UINT32 len) { INT32* dptr = (INT32*) pDst; size_t span, remaining; @@ -85,7 +85,7 @@ static pstatus_t general_set_32s( static pstatus_t general_set_32u( UINT32 val, UINT32* pDst, - INT32 len) + UINT32 len) { UINT32* dptr = (UINT32*) pDst; size_t span, remaining; diff --git a/libfreerdp/primitives/prim_set_opt.c b/libfreerdp/primitives/prim_set_opt.c index a5f11c3..97b49bc 100644 --- a/libfreerdp/primitives/prim_set_opt.c +++ b/libfreerdp/primitives/prim_set_opt.c @@ -40,7 +40,7 @@ static primitives_t* generic = NULL; static pstatus_t sse2_set_8u( BYTE val, BYTE* pDst, - INT32 len) + UINT32 len) { BYTE byte, *dptr; __m128i xmm0; @@ -126,7 +126,7 @@ static pstatus_t sse2_set_8u( static pstatus_t sse2_set_32u( UINT32 val, UINT32* pDst, - INT32 len) + UINT32 len) { const primitives_t* prim = primitives_get_generic(); UINT32* dptr = (UINT32*) pDst; @@ -218,7 +218,7 @@ static pstatus_t sse2_set_32u( static pstatus_t sse2_set_32s( INT32 val, INT32* pDst, - INT32 len) + UINT32 len) { UINT32 uval = *((UINT32*) &val); return sse2_set_32u(uval, (UINT32*) pDst, len); diff --git a/libfreerdp/primitives/prim_shift.c b/libfreerdp/primitives/prim_shift.c index 20d9e2e..684a39c 100644 --- a/libfreerdp/primitives/prim_shift.c +++ b/libfreerdp/primitives/prim_shift.c @@ -24,9 +24,9 @@ /* ------------------------------------------------------------------------- */ static pstatus_t general_lShiftC_16s( const INT16* pSrc, - INT32 val, + UINT32 val, INT16* pDst, - INT32 len) + UINT32 len) { if (val == 0) return PRIMITIVES_SUCCESS; @@ -38,9 +38,9 @@ static pstatus_t general_lShiftC_16s( /* ------------------------------------------------------------------------- */ static pstatus_t general_rShiftC_16s( const INT16* pSrc, - INT32 val, + UINT32 val, INT16* pDst, - INT32 len) + UINT32 len) { if (val == 0) return PRIMITIVES_SUCCESS; @@ -52,9 +52,9 @@ static pstatus_t general_rShiftC_16s( /* ------------------------------------------------------------------------- */ static pstatus_t general_lShiftC_16u( const UINT16* pSrc, - INT32 val, + UINT32 val, UINT16* pDst, - INT32 len) + UINT32 len) { if (val == 0) return PRIMITIVES_SUCCESS; @@ -66,9 +66,9 @@ static pstatus_t general_lShiftC_16u( /* ------------------------------------------------------------------------- */ static pstatus_t general_rShiftC_16u( const UINT16* pSrc, - INT32 val, + UINT32 val, UINT16* pDst, - INT32 len) + UINT32 len) { if (val == 0) return PRIMITIVES_SUCCESS; @@ -82,7 +82,7 @@ static pstatus_t general_shiftC_16s( const INT16* pSrc, INT32 val, INT16* pDst, - INT32 len) + UINT32 len) { if (val == 0) return PRIMITIVES_SUCCESS; @@ -95,7 +95,7 @@ static pstatus_t general_shiftC_16u( const UINT16* pSrc, INT32 val, UINT16* pDst, - INT32 len) + UINT32 len) { if (val == 0) return PRIMITIVES_SUCCESS; diff --git a/libfreerdp/primitives/prim_shift_opt.c b/libfreerdp/primitives/prim_shift_opt.c index df0fe2b..617330f 100644 --- a/libfreerdp/primitives/prim_shift_opt.c +++ b/libfreerdp/primitives/prim_shift_opt.c @@ -39,16 +39,16 @@ static primitives_t* generic = NULL; # if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) /* ------------------------------------------------------------------------- */ SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic->lShiftC_16s, - _mm_slli_epi16, *dptr++ = *sptr++ << val) + _mm_slli_epi16, *dptr++ = *sptr++ << val) /* ------------------------------------------------------------------------- */ SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, generic->rShiftC_16s, - _mm_srai_epi16, *dptr++ = *sptr++ >> val) + _mm_srai_epi16, *dptr++ = *sptr++ >> val) /* ------------------------------------------------------------------------- */ SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, generic->lShiftC_16u, - _mm_slli_epi16, *dptr++ = *sptr++ << val) + _mm_slli_epi16, *dptr++ = *sptr++ << val) /* ------------------------------------------------------------------------- */ SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, generic->rShiftC_16u, - _mm_srli_epi16, *dptr++ = *sptr++ >> val) + _mm_srli_epi16, *dptr++ = *sptr++ >> val) # endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */ #endif diff --git a/libfreerdp/primitives/prim_sign.c b/libfreerdp/primitives/prim_sign.c index b64b297..26cf7c2 100644 --- a/libfreerdp/primitives/prim_sign.c +++ b/libfreerdp/primitives/prim_sign.c @@ -28,7 +28,7 @@ static pstatus_t general_sign_16s( const INT16* pSrc, INT16* pDst, - INT32 len) + UINT32 len) { while (len--) { diff --git a/libfreerdp/primitives/prim_sign_opt.c b/libfreerdp/primitives/prim_sign_opt.c index e9b0373..4f01e97 100644 --- a/libfreerdp/primitives/prim_sign_opt.c +++ b/libfreerdp/primitives/prim_sign_opt.c @@ -35,7 +35,7 @@ static primitives_t* generic = NULL; static pstatus_t ssse3_sign_16s( const INT16* pSrc, INT16* pDst, - INT32 len) + UINT32 len) { const INT16* sptr = (const INT16*) pSrc; INT16* dptr = (INT16*) pDst; diff --git a/libfreerdp/primitives/prim_templates.h b/libfreerdp/primitives/prim_templates.h index b58266d..84d586f 100644 --- a/libfreerdp/primitives/prim_templates.h +++ b/libfreerdp/primitives/prim_templates.h @@ -44,143 +44,143 @@ * SCD = Source, Constant, Destination */ #define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \ - static pstatus_t _name_(const _type_ *pSrc, INT32 val, _type_ *pDst, INT32 len) \ - { \ - INT32 shifts; \ - UINT32 offBeatMask; \ - const _type_ *sptr = pSrc; \ - _type_ *dptr = pDst; \ - size_t count; \ - if (len < 16) /* pointless if too small */ \ - { \ - return _fallback_(pSrc, val, pDst, len); \ - } \ - if (sizeof(_type_) == 1) shifts = 1; \ - else if (sizeof(_type_) == 2) shifts = 2; \ - else if (sizeof(_type_) == 4) shifts = 3; \ - else if (sizeof(_type_) == 8) shifts = 4; \ - offBeatMask = (1 << (shifts - 1)) - 1; \ - if ((ULONG_PTR) pDst & offBeatMask) \ - { \ - /* Incrementing the pointer skips over 16-byte boundary. */ \ - return _fallback_(pSrc, val, pDst, len); \ - } \ - /* Get to the 16-byte boundary now. */ \ - while ((ULONG_PTR) dptr & 0x0f) \ - { \ - _slowWay_; \ - if (--len == 0) return PRIMITIVES_SUCCESS; \ - } \ - /* Use 8 128-bit SSE registers. */ \ - count = len >> (8-shifts); \ - len -= count << (8-shifts); \ - if ((ULONG_PTR) sptr & 0x0f) \ - { \ - while (count--) \ - { \ - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \ - xmm0 = _mm_lddqu_si128((__m128i *) sptr); \ - sptr += (16/sizeof(_type_)); \ - xmm1 = _mm_lddqu_si128((__m128i *) sptr); \ - sptr += (16/sizeof(_type_)); \ - xmm2 = _mm_lddqu_si128((__m128i *) sptr); \ - sptr += (16/sizeof(_type_)); \ - xmm3 = _mm_lddqu_si128((__m128i *) sptr); \ - sptr += (16/sizeof(_type_)); \ - xmm4 = _mm_lddqu_si128((__m128i *) sptr); \ - sptr += (16/sizeof(_type_)); \ - xmm5 = _mm_lddqu_si128((__m128i *) sptr); \ - sptr += (16/sizeof(_type_)); \ - xmm6 = _mm_lddqu_si128((__m128i *) sptr); \ - sptr += (16/sizeof(_type_)); \ - xmm7 = _mm_lddqu_si128((__m128i *) sptr); \ - sptr += (16/sizeof(_type_)); \ - xmm0 = _op_(xmm0, val); \ - xmm1 = _op_(xmm1, val); \ - xmm2 = _op_(xmm2, val); \ - xmm3 = _op_(xmm3, val); \ - xmm4 = _op_(xmm4, val); \ - xmm5 = _op_(xmm5, val); \ - xmm6 = _op_(xmm6, val); \ - xmm7 = _op_(xmm7, val); \ - _mm_store_si128((__m128i *) dptr, xmm0); \ - dptr += (16/sizeof(_type_)); \ - _mm_store_si128((__m128i *) dptr, xmm1); \ - dptr += (16/sizeof(_type_)); \ - _mm_store_si128((__m128i *) dptr, xmm2); \ - dptr += (16/sizeof(_type_)); \ - _mm_store_si128((__m128i *) dptr, xmm3); \ - dptr += (16/sizeof(_type_)); \ - _mm_store_si128((__m128i *) dptr, xmm4); \ - dptr += (16/sizeof(_type_)); \ - _mm_store_si128((__m128i *) dptr, xmm5); \ - dptr += (16/sizeof(_type_)); \ - _mm_store_si128((__m128i *) dptr, xmm6); \ - dptr += (16/sizeof(_type_)); \ - _mm_store_si128((__m128i *) dptr, xmm7); \ - dptr += (16/sizeof(_type_)); \ - } \ - } \ - else \ - { \ - while (count--) \ - { \ - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \ - xmm0 = _mm_load_si128((__m128i *) sptr); \ - sptr += (16/sizeof(_type_)); \ - xmm1 = _mm_load_si128((__m128i *) sptr); \ - sptr += (16/sizeof(_type_)); \ - xmm2 = _mm_load_si128((__m128i *) sptr); \ - sptr += (16/sizeof(_type_)); \ - xmm3 = _mm_load_si128((__m128i *) sptr); \ - sptr += (16/sizeof(_type_)); \ - xmm4 = _mm_load_si128((__m128i *) sptr); \ - sptr += (16/sizeof(_type_)); \ - xmm5 = _mm_load_si128((__m128i *) sptr); \ - sptr += (16/sizeof(_type_)); \ - xmm6 = _mm_load_si128((__m128i *) sptr); \ - sptr += (16/sizeof(_type_)); \ - xmm7 = _mm_load_si128((__m128i *) sptr); \ - sptr += (16/sizeof(_type_)); \ - xmm0 = _op_(xmm0, val); \ - xmm1 = _op_(xmm1, val); \ - xmm2 = _op_(xmm2, val); \ - xmm3 = _op_(xmm3, val); \ - xmm4 = _op_(xmm4, val); \ - xmm5 = _op_(xmm5, val); \ - xmm6 = _op_(xmm6, val); \ - xmm7 = _op_(xmm7, val); \ - _mm_store_si128((__m128i *) dptr, xmm0); \ - dptr += (16/sizeof(_type_)); \ - _mm_store_si128((__m128i *) dptr, xmm1); \ - dptr += (16/sizeof(_type_)); \ - _mm_store_si128((__m128i *) dptr, xmm2); \ - dptr += (16/sizeof(_type_)); \ - _mm_store_si128((__m128i *) dptr, xmm3); \ - dptr += (16/sizeof(_type_)); \ - _mm_store_si128((__m128i *) dptr, xmm4); \ - dptr += (16/sizeof(_type_)); \ - _mm_store_si128((__m128i *) dptr, xmm5); \ - dptr += (16/sizeof(_type_)); \ - _mm_store_si128((__m128i *) dptr, xmm6); \ - dptr += (16/sizeof(_type_)); \ - _mm_store_si128((__m128i *) dptr, xmm7); \ - dptr += (16/sizeof(_type_)); \ - } \ - } \ - /* Use a single 128-bit SSE register. */ \ - count = len >> (5-shifts); \ - len -= count << (5-shifts); \ - while (count--) \ - { \ - __m128i xmm0 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \ - xmm0 = _op_(xmm0, val); \ - _mm_store_si128((__m128i *) dptr, xmm0); \ - dptr += (16/sizeof(_type_)); \ - } \ - /* Finish off the remainder. */ \ - while (len--) { _slowWay_; } \ - return PRIMITIVES_SUCCESS; \ + static pstatus_t _name_(const _type_ *pSrc, UINT32 val, _type_ *pDst, UINT32 len) \ + { \ + INT32 shifts; \ + UINT32 offBeatMask; \ + const _type_ *sptr = pSrc; \ + _type_ *dptr = pDst; \ + size_t count; \ + if (len < 16) /* pointless if too small */ \ + { \ + return _fallback_(pSrc, val, pDst, len); \ + } \ + if (sizeof(_type_) == 1) shifts = 1; \ + else if (sizeof(_type_) == 2) shifts = 2; \ + else if (sizeof(_type_) == 4) shifts = 3; \ + else if (sizeof(_type_) == 8) shifts = 4; \ + offBeatMask = (1 << (shifts - 1)) - 1; \ + if ((ULONG_PTR) pDst & offBeatMask) \ + { \ + /* Incrementing the pointer skips over 16-byte boundary. */ \ + return _fallback_(pSrc, val, pDst, len); \ + } \ + /* Get to the 16-byte boundary now. */ \ + while ((ULONG_PTR) dptr & 0x0f) \ + { \ + _slowWay_; \ + if (--len == 0) return PRIMITIVES_SUCCESS; \ + } \ + /* Use 8 128-bit SSE registers. */ \ + count = len >> (8-shifts); \ + len -= count << (8-shifts); \ + if ((ULONG_PTR) sptr & 0x0f) \ + { \ + while (count--) \ + { \ + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \ + xmm0 = _mm_lddqu_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm1 = _mm_lddqu_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm2 = _mm_lddqu_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm3 = _mm_lddqu_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm4 = _mm_lddqu_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm5 = _mm_lddqu_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm6 = _mm_lddqu_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm7 = _mm_lddqu_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm0 = _op_(xmm0, val); \ + xmm1 = _op_(xmm1, val); \ + xmm2 = _op_(xmm2, val); \ + xmm3 = _op_(xmm3, val); \ + xmm4 = _op_(xmm4, val); \ + xmm5 = _op_(xmm5, val); \ + xmm6 = _op_(xmm6, val); \ + xmm7 = _op_(xmm7, val); \ + _mm_store_si128((__m128i *) dptr, xmm0); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm1); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm2); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm3); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm4); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm5); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm6); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm7); \ + dptr += (16/sizeof(_type_)); \ + } \ + } \ + else \ + { \ + while (count--) \ + { \ + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \ + xmm0 = _mm_load_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm1 = _mm_load_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm2 = _mm_load_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm3 = _mm_load_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm4 = _mm_load_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm5 = _mm_load_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm6 = _mm_load_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm7 = _mm_load_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm0 = _op_(xmm0, val); \ + xmm1 = _op_(xmm1, val); \ + xmm2 = _op_(xmm2, val); \ + xmm3 = _op_(xmm3, val); \ + xmm4 = _op_(xmm4, val); \ + xmm5 = _op_(xmm5, val); \ + xmm6 = _op_(xmm6, val); \ + xmm7 = _op_(xmm7, val); \ + _mm_store_si128((__m128i *) dptr, xmm0); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm1); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm2); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm3); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm4); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm5); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm6); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm7); \ + dptr += (16/sizeof(_type_)); \ + } \ + } \ + /* Use a single 128-bit SSE register. */ \ + count = len >> (5-shifts); \ + len -= count << (5-shifts); \ + while (count--) \ + { \ + __m128i xmm0 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \ + xmm0 = _op_(xmm0, val); \ + _mm_store_si128((__m128i *) dptr, xmm0); \ + dptr += (16/sizeof(_type_)); \ + } \ + /* Finish off the remainder. */ \ + while (len--) { _slowWay_; } \ + return PRIMITIVES_SUCCESS; \ } /* ---------------------------------------------------------------------------- @@ -189,228 +189,230 @@ */ #define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \ pstatus_t _name_(const _type_ *pSrc, _type_ val, _type_ *pDst, INT32 len) \ - { \ - int shifts; \ - UINT32 offBeatMask; \ - const _type_ *sptr = pSrc; \ - _type_ *dptr = pDst; \ - size_t count; \ - __m128i xmm0; \ - if (len < 16) /* pointless if too small */ \ - { \ - return _fallback_(pSrc, val, pDst, len); \ - } \ - if (sizeof(_type_) == 1) shifts = 1; \ - else if (sizeof(_type_) == 2) shifts = 2; \ - else if (sizeof(_type_) == 4) shifts = 3; \ - else if (sizeof(_type_) == 8) shifts = 4; \ - offBeatMask = (1 << (shifts - 1)) - 1; \ - if ((ULONG_PTR) pDst & offBeatMask) \ - { \ - /* Incrementing the pointer skips over 16-byte boundary. */ \ - return _fallback_(pSrc, val, pDst, len); \ - } \ - /* Get to the 16-byte boundary now. */ \ - while ((ULONG_PTR) dptr & 0x0f) \ - { \ - _slowWay_; \ - if (--len == 0) return PRIMITIVES_SUCCESS; \ - } \ - /* Use 4 128-bit SSE registers. */ \ - count = len >> (7-shifts); \ - len -= count << (7-shifts); \ - xmm0 = _mm_set1_epi32(val); \ - if ((ULONG_PTR) sptr & 0x0f) \ - { \ - while (count--) \ - { \ - __m128i xmm1, xmm2, xmm3, xmm4; \ - xmm1 = _mm_lddqu_si128((__m128i *) sptr); \ - sptr += (16/sizeof(_type_)); \ - xmm2 = _mm_lddqu_si128((__m128i *) sptr); \ - sptr += (16/sizeof(_type_)); \ - xmm3 = _mm_lddqu_si128((__m128i *) sptr); \ - sptr += (16/sizeof(_type_)); \ - xmm4 = _mm_lddqu_si128((__m128i *) sptr); \ - sptr += (16/sizeof(_type_)); \ - xmm1 = _op_(xmm1, xmm0); \ - xmm2 = _op_(xmm2, xmm0); \ - xmm3 = _op_(xmm3, xmm0); \ - xmm4 = _op_(xmm4, xmm0); \ - _mm_store_si128((__m128i *) dptr, xmm1); \ - dptr += (16/sizeof(_type_)); \ - _mm_store_si128((__m128i *) dptr, xmm2); \ - dptr += (16/sizeof(_type_)); \ - _mm_store_si128((__m128i *) dptr, xmm3); \ - dptr += (16/sizeof(_type_)); \ - _mm_store_si128((__m128i *) dptr, xmm4); \ - dptr += (16/sizeof(_type_)); \ - } \ - } \ - else \ - { \ - while (count--) \ - { \ - __m128i xmm1, xmm2, xmm3, xmm4; \ - xmm1 = _mm_load_si128((__m128i *) sptr); \ - sptr += (16/sizeof(_type_)); \ - xmm2 = _mm_load_si128((__m128i *) sptr); \ - sptr += (16/sizeof(_type_)); \ - xmm3 = _mm_load_si128((__m128i *) sptr); \ - sptr += (16/sizeof(_type_)); \ - xmm4 = _mm_load_si128((__m128i *) sptr); \ - sptr += (16/sizeof(_type_)); \ - xmm1 = _op_(xmm1, xmm0); \ - xmm2 = _op_(xmm2, xmm0); \ - xmm3 = _op_(xmm3, xmm0); \ - xmm4 = _op_(xmm4, xmm0); \ - _mm_store_si128((__m128i *) dptr, xmm1); \ - dptr += (16/sizeof(_type_)); \ - _mm_store_si128((__m128i *) dptr, xmm2); \ - dptr += (16/sizeof(_type_)); \ - _mm_store_si128((__m128i *) dptr, xmm3); \ - dptr += (16/sizeof(_type_)); \ - _mm_store_si128((__m128i *) dptr, xmm4); \ - dptr += (16/sizeof(_type_)); \ - } \ - } \ - /* Use a single 128-bit SSE register. */ \ - count = len >> (5-shifts); \ - len -= count << (5-shifts); \ - while (count--) \ - { \ - __m128i xmm1 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \ - xmm1 = _op_(xmm1, xmm0); \ - _mm_store_si128((__m128i *) dptr, xmm1); \ - dptr += (16/sizeof(_type_)); \ - } \ - /* Finish off the remainder. */ \ - while (len--) { _slowWay_; } \ - return PRIMITIVES_SUCCESS; \ + { \ + int shifts; \ + UINT32 offBeatMask; \ + const _type_ *sptr = pSrc; \ + _type_ *dptr = pDst; \ + size_t count; \ + __m128i xmm0; \ + if (len < 16) /* pointless if too small */ \ + { \ + return _fallback_(pSrc, val, pDst, len); \ + } \ + if (sizeof(_type_) == 1) shifts = 1; \ + else if (sizeof(_type_) == 2) shifts = 2; \ + else if (sizeof(_type_) == 4) shifts = 3; \ + else if (sizeof(_type_) == 8) shifts = 4; \ + offBeatMask = (1 << (shifts - 1)) - 1; \ + if ((ULONG_PTR) pDst & offBeatMask) \ + { \ + /* Incrementing the pointer skips over 16-byte boundary. */ \ + return _fallback_(pSrc, val, pDst, len); \ + } \ + /* Get to the 16-byte boundary now. */ \ + while ((ULONG_PTR) dptr & 0x0f) \ + { \ + _slowWay_; \ + if (--len == 0) return PRIMITIVES_SUCCESS; \ + } \ + /* Use 4 128-bit SSE registers. */ \ + count = len >> (7-shifts); \ + len -= count << (7-shifts); \ + xmm0 = _mm_set1_epi32(val); \ + if ((ULONG_PTR) sptr & 0x0f) \ + { \ + while (count--) \ + { \ + __m128i xmm1, xmm2, xmm3, xmm4; \ + xmm1 = _mm_lddqu_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm2 = _mm_lddqu_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm3 = _mm_lddqu_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm4 = _mm_lddqu_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm1 = _op_(xmm1, xmm0); \ + xmm2 = _op_(xmm2, xmm0); \ + xmm3 = _op_(xmm3, xmm0); \ + xmm4 = _op_(xmm4, xmm0); \ + _mm_store_si128((__m128i *) dptr, xmm1); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm2); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm3); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm4); \ + dptr += (16/sizeof(_type_)); \ + } \ + } \ + else \ + { \ + while (count--) \ + { \ + __m128i xmm1, xmm2, xmm3, xmm4; \ + xmm1 = _mm_load_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm2 = _mm_load_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm3 = _mm_load_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm4 = _mm_load_si128((__m128i *) sptr); \ + sptr += (16/sizeof(_type_)); \ + xmm1 = _op_(xmm1, xmm0); \ + xmm2 = _op_(xmm2, xmm0); \ + xmm3 = _op_(xmm3, xmm0); \ + xmm4 = _op_(xmm4, xmm0); \ + _mm_store_si128((__m128i *) dptr, xmm1); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm2); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm3); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm4); \ + dptr += (16/sizeof(_type_)); \ + } \ + } \ + /* Use a single 128-bit SSE register. */ \ + count = len >> (5-shifts); \ + len -= count << (5-shifts); \ + while (count--) \ + { \ + __m128i xmm1 = LOAD_SI128(sptr); sptr += (16/sizeof(_type_)); \ + xmm1 = _op_(xmm1, xmm0); \ + _mm_store_si128((__m128i *) dptr, xmm1); \ + dptr += (16/sizeof(_type_)); \ + } \ + /* Finish off the remainder. */ \ + while (len--) { _slowWay_; } \ + return PRIMITIVES_SUCCESS; \ } /* ---------------------------------------------------------------------------- * SSD = Source1, Source2, Destination */ #define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \ - pstatus_t _name_(const _type_ *pSrc1, const _type_ *pSrc2, _type_ *pDst, INT32 len) \ - { \ - int shifts; \ - UINT32 offBeatMask; \ - const _type_ *sptr1 = pSrc1; \ - const _type_ *sptr2 = pSrc2; \ - _type_ *dptr = pDst; \ - size_t count; \ - if (len < 16) /* pointless if too small */ \ - { \ - return _fallback_(pSrc1, pSrc2, pDst, len); \ - } \ - if (sizeof(_type_) == 1) shifts = 1; \ - else if (sizeof(_type_) == 2) shifts = 2; \ - else if (sizeof(_type_) == 4) shifts = 3; \ - else if (sizeof(_type_) == 8) shifts = 4; \ - offBeatMask = (1 << (shifts - 1)) - 1; \ - if ((ULONG_PTR) pDst & offBeatMask) \ - { \ - /* Incrementing the pointer skips over 16-byte boundary. */ \ - return _fallback_(pSrc1, pSrc2, pDst, len); \ - } \ - /* Get to the 16-byte boundary now. */ \ - while ((ULONG_PTR) dptr & 0x0f) \ - { \ - _slowWay_; \ - if (--len == 0) return PRIMITIVES_SUCCESS; \ - } \ - /* Use 4 128-bit SSE registers. */ \ - count = len >> (7-shifts); \ - len -= count << (7-shifts); \ - if (((ULONG_PTR) sptr1 & 0x0f) || ((ULONG_PTR) sptr2 & 0x0f)) \ - { \ - /* Unaligned loads */ \ - while (count--) \ - { \ - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \ - xmm0 = _mm_lddqu_si128((__m128i *) sptr1); \ - sptr1 += (16/sizeof(_type_)); \ - xmm1 = _mm_lddqu_si128((__m128i *) sptr1); \ - sptr1 += (16/sizeof(_type_)); \ - xmm2 = _mm_lddqu_si128((__m128i *) sptr1); \ - sptr1 += (16/sizeof(_type_)); \ - xmm3 = _mm_lddqu_si128((__m128i *) sptr1); \ - sptr1 += (16/sizeof(_type_)); \ - xmm4 = _mm_lddqu_si128((__m128i *) sptr2); \ - sptr2 += (16/sizeof(_type_)); \ - xmm5 = _mm_lddqu_si128((__m128i *) sptr2); \ - sptr2 += (16/sizeof(_type_)); \ - xmm6 = _mm_lddqu_si128((__m128i *) sptr2); \ - sptr2 += (16/sizeof(_type_)); \ - xmm7 = _mm_lddqu_si128((__m128i *) sptr2); \ - sptr2 += (16/sizeof(_type_)); \ - xmm0 = _op_(xmm0, xmm4); \ - xmm1 = _op_(xmm1, xmm5); \ - xmm2 = _op_(xmm2, xmm6); \ - xmm3 = _op_(xmm3, xmm7); \ - _mm_store_si128((__m128i *) dptr, xmm0); \ - dptr += (16/sizeof(_type_)); \ - _mm_store_si128((__m128i *) dptr, xmm1); \ - dptr += (16/sizeof(_type_)); \ - _mm_store_si128((__m128i *) dptr, xmm2); \ - dptr += (16/sizeof(_type_)); \ - _mm_store_si128((__m128i *) dptr, xmm3); \ - dptr += (16/sizeof(_type_)); \ - } \ - } \ - else \ - { \ - /* Aligned loads */ \ - while (count--) \ - { \ - __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \ - xmm0 = _mm_load_si128((__m128i *) sptr1); \ - sptr1 += (16/sizeof(_type_)); \ - xmm1 = _mm_load_si128((__m128i *) sptr1); \ - sptr1 += (16/sizeof(_type_)); \ - xmm2 = _mm_load_si128((__m128i *) sptr1); \ - sptr1 += (16/sizeof(_type_)); \ - xmm3 = _mm_load_si128((__m128i *) sptr1); \ - sptr1 += (16/sizeof(_type_)); \ - xmm4 = _mm_load_si128((__m128i *) sptr2); \ - sptr2 += (16/sizeof(_type_)); \ - xmm5 = _mm_load_si128((__m128i *) sptr2); \ - sptr2 += (16/sizeof(_type_)); \ - xmm6 = _mm_load_si128((__m128i *) sptr2); \ - sptr2 += (16/sizeof(_type_)); \ - xmm7 = _mm_load_si128((__m128i *) sptr2); \ - sptr2 += (16/sizeof(_type_)); \ - xmm0 = _op_(xmm0, xmm4); \ - xmm1 = _op_(xmm1, xmm5); \ - xmm2 = _op_(xmm2, xmm6); \ - xmm3 = _op_(xmm3, xmm7); \ - _mm_store_si128((__m128i *) dptr, xmm0); \ - dptr += (16/sizeof(_type_)); \ - _mm_store_si128((__m128i *) dptr, xmm1); \ - dptr += (16/sizeof(_type_)); \ - _mm_store_si128((__m128i *) dptr, xmm2); \ - dptr += (16/sizeof(_type_)); \ - _mm_store_si128((__m128i *) dptr, xmm3); \ - dptr += (16/sizeof(_type_)); \ - } \ - } \ - /* Use a single 128-bit SSE register. */ \ - count = len >> (5-shifts); \ - len -= count << (5-shifts); \ - while (count--) \ - { \ - __m128i xmm0, xmm1; \ - xmm0 = LOAD_SI128(sptr1); sptr1 += (16/sizeof(_type_)); \ - xmm1 = LOAD_SI128(sptr2); sptr2 += (16/sizeof(_type_)); \ - xmm0 = _op_(xmm0, xmm1); \ - _mm_store_si128((__m128i *) dptr, xmm0); \ - dptr += (16/sizeof(_type_)); \ - } \ - /* Finish off the remainder. */ \ - while (len--) { _slowWay_; } \ - return PRIMITIVES_SUCCESS; \ + pstatus_t _name_(const _type_ *pSrc1, const _type_ *pSrc2, _type_ *pDst, UINT32 len) \ + { \ + int shifts; \ + UINT32 offBeatMask; \ + const _type_ *sptr1 = pSrc1; \ + const _type_ *sptr2 = pSrc2; \ + _type_ *dptr = pDst; \ + size_t count; \ + if (len < 16) /* pointless if too small */ \ + { \ + return _fallback_(pSrc1, pSrc2, pDst, len); \ + } \ + if (sizeof(_type_) == 1) shifts = 1; \ + else if (sizeof(_type_) == 2) shifts = 2; \ + else if (sizeof(_type_) == 4) shifts = 3; \ + else if (sizeof(_type_) == 8) shifts = 4; \ + offBeatMask = (1 << (shifts - 1)) - 1; \ + if ((ULONG_PTR) pDst & offBeatMask) \ + { \ + /* Incrementing the pointer skips over 16-byte boundary. */ \ + return _fallback_(pSrc1, pSrc2, pDst, len); \ + } \ + /* Get to the 16-byte boundary now. */ \ + while ((ULONG_PTR) dptr & 0x0f) \ + { \ + pstatus_t status; \ + status = _slowWay_; \ + if (status != PRIMITIVES_SUCCESS) return status; \ + if (--len == 0) return PRIMITIVES_SUCCESS; \ + } \ + /* Use 4 128-bit SSE registers. */ \ + count = len >> (7-shifts); \ + len -= count << (7-shifts); \ + if (((ULONG_PTR) sptr1 & 0x0f) || ((ULONG_PTR) sptr2 & 0x0f)) \ + { \ + /* Unaligned loads */ \ + while (count--) \ + { \ + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \ + xmm0 = _mm_lddqu_si128((__m128i *) sptr1); \ + sptr1 += (16/sizeof(_type_)); \ + xmm1 = _mm_lddqu_si128((__m128i *) sptr1); \ + sptr1 += (16/sizeof(_type_)); \ + xmm2 = _mm_lddqu_si128((__m128i *) sptr1); \ + sptr1 += (16/sizeof(_type_)); \ + xmm3 = _mm_lddqu_si128((__m128i *) sptr1); \ + sptr1 += (16/sizeof(_type_)); \ + xmm4 = _mm_lddqu_si128((__m128i *) sptr2); \ + sptr2 += (16/sizeof(_type_)); \ + xmm5 = _mm_lddqu_si128((__m128i *) sptr2); \ + sptr2 += (16/sizeof(_type_)); \ + xmm6 = _mm_lddqu_si128((__m128i *) sptr2); \ + sptr2 += (16/sizeof(_type_)); \ + xmm7 = _mm_lddqu_si128((__m128i *) sptr2); \ + sptr2 += (16/sizeof(_type_)); \ + xmm0 = _op_(xmm0, xmm4); \ + xmm1 = _op_(xmm1, xmm5); \ + xmm2 = _op_(xmm2, xmm6); \ + xmm3 = _op_(xmm3, xmm7); \ + _mm_store_si128((__m128i *) dptr, xmm0); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm1); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm2); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm3); \ + dptr += (16/sizeof(_type_)); \ + } \ + } \ + else \ + { \ + /* Aligned loads */ \ + while (count--) \ + { \ + __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \ + xmm0 = _mm_load_si128((__m128i *) sptr1); \ + sptr1 += (16/sizeof(_type_)); \ + xmm1 = _mm_load_si128((__m128i *) sptr1); \ + sptr1 += (16/sizeof(_type_)); \ + xmm2 = _mm_load_si128((__m128i *) sptr1); \ + sptr1 += (16/sizeof(_type_)); \ + xmm3 = _mm_load_si128((__m128i *) sptr1); \ + sptr1 += (16/sizeof(_type_)); \ + xmm4 = _mm_load_si128((__m128i *) sptr2); \ + sptr2 += (16/sizeof(_type_)); \ + xmm5 = _mm_load_si128((__m128i *) sptr2); \ + sptr2 += (16/sizeof(_type_)); \ + xmm6 = _mm_load_si128((__m128i *) sptr2); \ + sptr2 += (16/sizeof(_type_)); \ + xmm7 = _mm_load_si128((__m128i *) sptr2); \ + sptr2 += (16/sizeof(_type_)); \ + xmm0 = _op_(xmm0, xmm4); \ + xmm1 = _op_(xmm1, xmm5); \ + xmm2 = _op_(xmm2, xmm6); \ + xmm3 = _op_(xmm3, xmm7); \ + _mm_store_si128((__m128i *) dptr, xmm0); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm1); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm2); \ + dptr += (16/sizeof(_type_)); \ + _mm_store_si128((__m128i *) dptr, xmm3); \ + dptr += (16/sizeof(_type_)); \ + } \ + } \ + /* Use a single 128-bit SSE register. */ \ + count = len >> (5-shifts); \ + len -= count << (5-shifts); \ + while (count--) \ + { \ + __m128i xmm0, xmm1; \ + xmm0 = LOAD_SI128(sptr1); sptr1 += (16/sizeof(_type_)); \ + xmm1 = LOAD_SI128(sptr2); sptr2 += (16/sizeof(_type_)); \ + xmm0 = _op_(xmm0, xmm1); \ + _mm_store_si128((__m128i *) dptr, xmm0); \ + dptr += (16/sizeof(_type_)); \ + } \ + /* Finish off the remainder. */ \ + while (len--) { _slowWay_; } \ + return PRIMITIVES_SUCCESS; \ } #endif /* !__PRIM_TEMPLATES_H_INCLUDED__ */ diff --git a/libfreerdp/primitives/primitives.c b/libfreerdp/primitives/primitives.c index 3b803a6..987342a 100644 --- a/libfreerdp/primitives/primitives.c +++ b/libfreerdp/primitives/primitives.c @@ -81,6 +81,6 @@ primitives_t* primitives_get_generic(void) if (!pPrimitivesGenericInitialized) primitives_init_generic(); - return &pPrimitives; + return &pPrimitivesGeneric; } diff --git a/libfreerdp/primitives/test/TestPrimitivesAdd.c b/libfreerdp/primitives/test/TestPrimitivesAdd.c index ade6f49..125752b 100644 --- a/libfreerdp/primitives/test/TestPrimitivesAdd.c +++ b/libfreerdp/primitives/test/TestPrimitivesAdd.c @@ -26,7 +26,7 @@ static BOOL test_add16s_func(void) pstatus_t status; INT16 ALIGN(src1[FUNC_TEST_SIZE + 3]), ALIGN(src2[FUNC_TEST_SIZE + 3]), - ALIGN(d1[FUNC_TEST_SIZE + 3]), ALIGN(d2[FUNC_TEST_SIZE + 3]); + ALIGN(d1[FUNC_TEST_SIZE + 3]), ALIGN(d2[FUNC_TEST_SIZE + 3]); char testStr[256]; testStr[0] = '\0'; @@ -50,7 +50,7 @@ static BOOL test_add16s_func(void) static BOOL test_add16s_speed(void) { BYTE ALIGN(src1[MAX_TEST_SIZE + 3]), ALIGN(src2[MAX_TEST_SIZE + 3]), - ALIGN(dst[MAX_TEST_SIZE + 3]); + ALIGN(dst[MAX_TEST_SIZE + 3]); if (!g_TestPrimitivesPerformance) return TRUE; @@ -59,7 +59,8 @@ static BOOL test_add16s_speed(void) winpr_RAND(src2, sizeof(src2)); if (!speed_test("add16s", "aligned", g_Iterations, - generic->add_16s, optimized->add_16s, + (speed_test_fkt)generic->add_16s, + (speed_test_fkt)optimized->add_16s, src1, src2, dst, FUNC_TEST_SIZE)) return FALSE; @@ -72,8 +73,11 @@ int TestPrimitivesAdd(int argc, char* argv[]) if (!test_add16s_func()) return -1; - if (!test_add16s_speed()) - return -1; + if (g_TestPrimitivesPerformance) + { + if (!test_add16s_speed()) + return -1; + } return 0; } diff --git a/libfreerdp/primitives/test/TestPrimitivesAlphaComp.c b/libfreerdp/primitives/test/TestPrimitivesAlphaComp.c index 9919c0c..c7f49cf 100644 --- a/libfreerdp/primitives/test/TestPrimitivesAlphaComp.c +++ b/libfreerdp/primitives/test/TestPrimitivesAlphaComp.c @@ -33,8 +33,13 @@ static const int block_size[] = { 4, 64, 256 }; #define GRN(_c_) (((_c_) & 0x0000FF00U) >> 8) #define BLU(_c_) ((_c_) & 0x000000FFU) #define TOLERANCE 1 -#define PIXEL(_addr_, _bytes_, _x_, _y_) \ - ((UINT32 *) (((BYTE *) (_addr_)) + (_x_)*4 + (_y_)*(_bytes_))) +static inline const UINT32* PIXEL(const BYTE* _addr_, UINT32 _bytes_, UINT32 _x_, UINT32 _y_) +{ + const BYTE* addr = _addr_ + _x_ * sizeof(UINT32) + _y_ * _bytes_; + + return (const UINT32*)addr; +} + #define SRC1_WIDTH 6 #define SRC1_HEIGHT 6 #define SRC2_WIDTH 7 @@ -46,8 +51,8 @@ static const int block_size[] = { 4, 64, 256 }; /* ------------------------------------------------------------------------- */ static UINT32 alpha_add( - UINT32 c1, - UINT32 c2) + UINT32 c1, + UINT32 c2) { UINT32 a1 = ALF(c1); UINT32 r1 = RED(c1); @@ -66,8 +71,8 @@ static UINT32 alpha_add( /* ------------------------------------------------------------------------- */ static UINT32 colordist( - UINT32 c1, - UINT32 c2) + UINT32 c1, + UINT32 c2) { int d, maxd = 0; d = ABS(ALF(c1) - ALF(c2)); @@ -90,10 +95,10 @@ static UINT32 colordist( } /* ------------------------------------------------------------------------- */ -static BOOL check(const BYTE* pSrc1, INT32 src1Step, - const BYTE* pSrc2, INT32 src2Step, - BYTE* pDst, INT32 dstStep, - INT32 width, INT32 height) +static BOOL check(const BYTE* pSrc1, UINT32 src1Step, + const BYTE* pSrc2, UINT32 src2Step, + BYTE* pDst, UINT32 dstStep, + UINT32 width, UINT32 height) { UINT32 x, y; for (y = 0; y < height; ++y) @@ -120,14 +125,14 @@ static BOOL check(const BYTE* pSrc1, INT32 src1Step, static BOOL test_alphaComp_func(void) { pstatus_t status; - BYTE ALIGN(src1[SRC1_WIDTH * SRC1_HEIGHT]); - BYTE ALIGN(src2[SRC2_WIDTH * SRC2_HEIGHT]); - BYTE ALIGN(dst1[DST_WIDTH * DST_HEIGHT]); - char testStr[256]; + BYTE ALIGN(src1[SRC1_WIDTH * SRC1_HEIGHT * 4]); + BYTE ALIGN(src2[SRC2_WIDTH * SRC2_HEIGHT * 4]); + BYTE ALIGN(dst1[DST_WIDTH * DST_HEIGHT * 4]); UINT32* ptr; UINT32 i; - testStr[0] = '\0'; + winpr_RAND((BYTE*)src1, sizeof(src1)); + /* Special-case the first two values */ src1[0] &= 0x00FFFFFFU; src1[1] |= 0xFF000000U; @@ -141,8 +146,8 @@ static BOOL test_alphaComp_func(void) memset(dst1, 0, sizeof(dst1)); status = generic->alphaComp_argb(src1, 4 * SRC1_WIDTH, - src2, 4 * SRC2_WIDTH, - dst1, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT); + src2, 4 * SRC2_WIDTH, + dst1, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT); if (status != PRIMITIVES_SUCCESS) return FALSE; @@ -152,8 +157,8 @@ static BOOL test_alphaComp_func(void) return FALSE; status = optimized->alphaComp_argb((const BYTE*) src1, 4 * SRC1_WIDTH, - (const BYTE*) src2, 4 * SRC2_WIDTH, - (BYTE*) dst1, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT); + (const BYTE*) src2, 4 * SRC2_WIDTH, + (BYTE*) dst1, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT); if (status != PRIMITIVES_SUCCESS) return FALSE; @@ -188,7 +193,8 @@ static int test_alphaComp_speed(void) memset(dst1, 0, sizeof(dst1)); if (!speed_test("add16s", "aligned", g_Iterations, - generic->alphaComp_argb, optimized->alphaComp_argb, + (speed_test_fkt)generic->alphaComp_argb, + (speed_test_fkt)optimized->alphaComp_argb, src1, 4 * SRC1_WIDTH, src2, 4 * SRC2_WIDTH, dst1, 4 * DST_WIDTH, TEST_WIDTH, TEST_HEIGHT)) @@ -203,8 +209,11 @@ int TestPrimitivesAlphaComp(int argc, char* argv[]) if (!test_alphaComp_func()) return -1; - if (!test_alphaComp_speed()) - return -1; + if (g_TestPrimitivesPerformance) + { + if (!test_alphaComp_speed()) + return -1; + } return 0; } diff --git a/libfreerdp/primitives/test/TestPrimitivesAndOr.c b/libfreerdp/primitives/test/TestPrimitivesAndOr.c index bf16d26..5e51c57 100644 --- a/libfreerdp/primitives/test/TestPrimitivesAndOr.c +++ b/libfreerdp/primitives/test/TestPrimitivesAndOr.c @@ -20,99 +20,87 @@ #include "prim_test.h" #define FUNC_TEST_SIZE 65536 -static const int ANDOR_PRETEST_ITERATIONS = 100000; -static const int TEST_TIME = 2.0; // seconds #define VALUE (0xA5A5A5A5U) /* ========================================================================= */ -static BOOL test_and_32u_func(void) +static BOOL test_and_32u_impl(const char* name, __andC_32u_t fkt, + const UINT32* src, const UINT32 val, + UINT32* dst, size_t size) { - UINT32 ALIGN(src[FUNC_TEST_SIZE + 3]), ALIGN(dst[FUNC_TEST_SIZE + 3]); - int failed = 0; - int i; - char testStr[256]; - testStr[0] = '\0'; - winpr_RAND(src, sizeof(src)); - generic->andC_32u(src + 1, VALUE, dst + 1, FUNC_TEST_SIZE); - strcat(testStr, " general"); + size_t i; + pstatus_t status = fkt(src, val, dst, size); + if (status != PRIMITIVES_SUCCESS) + return FALSE; - for (i = 1; i <= FUNC_TEST_SIZE; ++i) + for (i = 0; i < size; ++i) { - if (dst[i] != (src[i] & VALUE)) + if (dst[i] != (src[i] & val)) { - printf("AND-general FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n", - i, src[i], VALUE, src[i] & VALUE, dst[i]); - ++failed; + printf("AND %s FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n", + name, i, src[i], val, src[i] & val, dst[i]); + return FALSE; } } -#ifdef WITH_SSE2 - - if (IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) - { - strcat(testStr, " SSE3"); - /* Aligned */ - memset(dst, 0, sizeof(dst)); - sse3_andC_32u(src + 1, VALUE, dst + 1, FUNC_TEST_SIZE); - - for (i = 1; i <= FUNC_TEST_SIZE; ++i) - { - if (dst[i] != (src[i] & VALUE)) - { - printf("AND-SSE-aligned FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n", - i, src[i], VALUE, src[i] & VALUE, dst[i]); - ++failed; - } - } - - /* Unaligned */ - memset(dst, 0, sizeof(dst)); - sse3_andC_32u(src + 1, VALUE, dst + 2, FUNC_TEST_SIZE); + return TRUE; +} - for (i = 1; i <= FUNC_TEST_SIZE; ++i) - { - if (dst[i + 1] != (src[i] & VALUE)) - { - printf("AND-SSE-unaligned FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n", - i, src[i], VALUE, src[i] & VALUE, dst[i + 1]); - ++failed; - } - } - } +static BOOL test_and_32u_func(void) +{ + UINT32 ALIGN(src[FUNC_TEST_SIZE + 3]), ALIGN(dst[FUNC_TEST_SIZE + 3]); -#endif /* i386 */ + winpr_RAND((BYTE*)src, sizeof(src)); - if (!failed) printf("All and_32u tests passed (%s).\n", testStr); + if (!test_and_32u_impl("generic->andC_32u aligned", generic->andC_32u, + src + 1, VALUE, dst + 1, FUNC_TEST_SIZE)) + return FALSE; + if (!test_and_32u_impl("generic->andC_32u unaligned", generic->andC_32u, + src + 1, VALUE, dst + 2, FUNC_TEST_SIZE)) + return FALSE; + if (!test_and_32u_impl("optimized->andC_32u aligned", optimized->andC_32u, + src + 1, VALUE, dst + 1, FUNC_TEST_SIZE)) + return FALSE; + if (!test_and_32u_impl("optimized->andC_32u unaligned", optimized->andC_32u, + src + 1, VALUE, dst + 2, FUNC_TEST_SIZE)) + return FALSE; - return (failed > 0) ? FAILURE : SUCCESS; + return TRUE; } /* ------------------------------------------------------------------------- */ static BOOL test_and_32u_speed(void) { UINT32 ALIGN(src[MAX_TEST_SIZE + 3]), ALIGN(dst[MAX_TEST_SIZE + 3]); - winpr_RAND(src, sizeof(src)); - andC_32u_speed_test("and32u", "aligned", src, NULL, VALUE, dst, - test_sizes, NUM_TEST_SIZES, ANDOR_PRETEST_ITERATIONS, TEST_TIME); - andC_32u_speed_test("and32u", "unaligned", src + 1, NULL, VALUE, dst, - test_sizes, NUM_TEST_SIZES, ANDOR_PRETEST_ITERATIONS, TEST_TIME); - return SUCCESS; + + winpr_RAND((BYTE*)src, sizeof(src)); + + if (!speed_test("andC_32u", "aligned", g_Iterations, + (speed_test_fkt)generic->andC_32u, + (speed_test_fkt)optimized->andC_32u, + src + 1, VALUE, dst + 1, MAX_TEST_SIZE)) + return FALSE; + if (!speed_test("andC_32u", "unaligned", g_Iterations, + (speed_test_fkt)generic->andC_32u, + (speed_test_fkt)optimized->andC_32u, + src + 1, VALUE, dst + 2, MAX_TEST_SIZE)) + return FALSE; + + return TRUE; } /* ========================================================================= */ static BOOL check(const UINT32* src, const UINT32* dst, UINT32 size, UINT32 value) { UINT32 i; - UINT32 failed = 0; - for (i = 1; i <= size; ++i) + for (i = 0; i < size; ++i) { if (dst[i] != (src[i] | value)) { printf("OR-general general FAIL[%d] 0x%08x&0x%08x=0x%08x, got 0x%08x\n", i, src[i], value, src[i] | value, dst[i]); - ++failed; + return FALSE; } } @@ -123,8 +111,7 @@ static BOOL test_or_32u_func(void) { pstatus_t status; UINT32 ALIGN(src[FUNC_TEST_SIZE + 3]), ALIGN(dst[FUNC_TEST_SIZE + 3]); - char testStr[256]; - testStr[0] = '\0'; + winpr_RAND((BYTE*)src, sizeof(src)); status = generic->orC_32u(src + 1, VALUE, dst + 1, FUNC_TEST_SIZE); @@ -153,7 +140,8 @@ static BOOL test_or_32u_speed(void) winpr_RAND((BYTE*)src, sizeof(src)); if (!speed_test("add16s", "aligned", g_Iterations, - generic->orC_32u, optimized->orC_32u, + (speed_test_fkt)generic->orC_32u, + (speed_test_fkt)optimized->orC_32u, src + 1, VALUE, dst + 1, FUNC_TEST_SIZE)) return FALSE; @@ -167,14 +155,16 @@ int TestPrimitivesAndOr(int argc, char* argv[]) if (!test_and_32u_func()) return -1; - if (!test_and_32u_speed()) - return -1; - if (!test_or_32u_func()) return -1; - if (!test_or_32u_speed()) - return -1; + if (g_TestPrimitivesPerformance) + { + if (!test_and_32u_speed()) + return -1; + if (!test_or_32u_speed()) + return -1; + } return 0; } diff --git a/libfreerdp/primitives/test/TestPrimitivesColors.c b/libfreerdp/primitives/test/TestPrimitivesColors.c index e6c5209..18c0dec 100644 --- a/libfreerdp/primitives/test/TestPrimitivesColors.c +++ b/libfreerdp/primitives/test/TestPrimitivesColors.c @@ -24,19 +24,16 @@ static const int YCBCR_TRIAL_ITERATIONS = 1000; static const float TEST_TIME = 4.0; /* ------------------------------------------------------------------------- */ -int test_RGBToRGB_16s8u_P3AC4R_func(void) +static BOOL test_RGBToRGB_16s8u_P3AC4R_func(void) { INT16 ALIGN(r[4096]), ALIGN(g[4096]), ALIGN(b[4096]); UINT32 ALIGN(out1[4096]); -#ifdef WITH_SSE2 UINT32 ALIGN(out2[4096]); -#endif int i; - int failed = 0; - char testStr[256]; + BOOL failed = FALSE; INT16* ptrs[3]; prim_size_t roi = { 64, 64 }; - testStr[0] = '\0'; + winpr_RAND((BYTE*)r, sizeof(r)); winpr_RAND((BYTE*)g, sizeof(g)); winpr_RAND((BYTE*)b, sizeof(b)); @@ -52,56 +49,38 @@ int test_RGBToRGB_16s8u_P3AC4R_func(void) ptrs[0] = r; ptrs[1] = g; ptrs[2] = b; - generic->RGBToRGB_16s8u_P3AC4R((const INT16**) ptrs, 64 * 2, - (BYTE*) out1, 64 * 4, &roi); -#ifdef WITH_SSE2 + if (generic->RGBToRGB_16s8u_P3AC4R((const INT16**) ptrs, 64 * 2, + (BYTE*) out1, 64 * 4, PIXEL_FORMAT_RGBA32, + &roi) != PRIMITIVES_SUCCESS) + return FALSE; - if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE)) - { - strcat(testStr, " SSE2"); - sse2_RGBToRGB_16s8u_P3AC4R((const INT16**) ptrs, 64 * 2, - (BYTE*) out2, 64 * 4, &roi); + if (optimized->RGBToRGB_16s8u_P3AC4R((const INT16**) ptrs, 64 * 2, + (BYTE*) out2, 64 * 4, PIXEL_FORMAT_RGBA32, + &roi) != PRIMITIVES_SUCCESS) + return FALSE; - for (i = 0; i < 4096; ++i) + for (i = 0; i < 4096; ++i) + { + if (out1[i] != out2[i]) { - if (out1[i] != out2[i]) - { - printf("RGBToRGB-SSE FAIL: out1[%d]=0x%08x out2[%d]=0x%08x\n", - i, out1[i], i, out2[i]); - failed = 1; - } + printf("RGBToRGB-SSE FAIL: out1[%d]=0x%08x out2[%d]=0x%08x\n", + i, out1[i], i, out2[i]); + failed = TRUE; } } -#endif /* i386 */ - - if (!failed) printf("All RGBToRGB_16s8u_P3AC4R tests passed (%s).\n", testStr); - - return (failed > 0) ? FAILURE : SUCCESS; + return !failed; } /* ------------------------------------------------------------------------- */ -static const prim_size_t roi64x64 = { 64, 64 }; -STD_SPEED_TEST( - rgb_to_argb_speed, INT16*, UINT32, dst = dst, - TRUE, generic->RGBToRGB_16s8u_P3AC4R( - (const INT16**) src1, 64 * 2, (BYTE*) dst, 64 * 4, &roi64x64), -#ifdef WITH_SSE2 - TRUE, sse2_RGBToRGB_16s8u_P3AC4R( - (const INT16**) src1, 64 * 2, (BYTE*) dst, 64 * 4, &roi64x64), - PF_SSE2_INSTRUCTIONS_AVAILABLE, FALSE, -#else - FALSE, PRIM_NOP, 0, FALSE, -#endif - FALSE, dst = dst); - -int test_RGBToRGB_16s8u_P3AC4R_speed(void) +static BOOL test_RGBToRGB_16s8u_P3AC4R_speed(void) { - INT16 ALIGN(r[4096]), ALIGN(g[4096]), ALIGN(b[4096]); - UINT32 ALIGN(dst[4096]); + const prim_size_t roi64x64 = { 64, 64 }; + INT16 ALIGN(r[4096+1]), ALIGN(g[4096+1]), ALIGN(b[4096+1]); + UINT32 ALIGN(dst[4096+1]); int i; INT16* ptrs[3]; - int size_array[] = { 64 }; + winpr_RAND((BYTE*)r, sizeof(r)); winpr_RAND((BYTE*)g, sizeof(g)); winpr_RAND((BYTE*)b, sizeof(b)); @@ -114,29 +93,38 @@ int test_RGBToRGB_16s8u_P3AC4R_speed(void) b[i] &= 0x00FFU; } - ptrs[0] = r; - ptrs[1] = g; - ptrs[2] = b; - rgb_to_argb_speed("RGBToARGB", "aligned", - (const INT16**) ptrs, NULL, 0, dst, - size_array, 1, RGB_TRIAL_ITERATIONS, TEST_TIME); - return SUCCESS; + ptrs[0] = r+1; + ptrs[1] = g+1; + ptrs[2] = b+1; + + if (!speed_test("RGBToRGB_16s8u_P3AC4R", "aligned", g_Iterations, + (speed_test_fkt)generic->RGBToRGB_16s8u_P3AC4R, + (speed_test_fkt)optimized->RGBToRGB_16s8u_P3AC4R, + (const INT16**) ptrs, 64 * 2, (BYTE*) dst, 64 * 4, &roi64x64)) + return FALSE; + + if (!speed_test("RGBToRGB_16s8u_P3AC4R", "unaligned", g_Iterations, + (speed_test_fkt)generic->RGBToRGB_16s8u_P3AC4R, + (speed_test_fkt)optimized->RGBToRGB_16s8u_P3AC4R, + (const INT16**) ptrs, 64 * 2, ((BYTE*) dst)+1, 64 * 4, &roi64x64)) + return FALSE; + + return TRUE; } /* ========================================================================= */ -int test_yCbCrToRGB_16s16s_P3P3_func(void) +static BOOL test_yCbCrToRGB_16s16s_P3P3_func(void) { + pstatus_t status; INT16 ALIGN(y[4096]), ALIGN(cb[4096]), ALIGN(cr[4096]); INT16 ALIGN(r1[4096]), ALIGN(g1[4096]), ALIGN(b1[4096]); INT16 ALIGN(r2[4096]), ALIGN(g2[4096]), ALIGN(b2[4096]); int i; - int failed = 0; - char testStr[256]; const INT16* in[3]; INT16* out1[3]; INT16* out2[3]; prim_size_t roi = { 64, 64 }; - testStr[0] = '\0'; + winpr_RAND((BYTE*)y, sizeof(y)); winpr_RAND((BYTE*)cb, sizeof(cb)); winpr_RAND((BYTE*)cr, sizeof(cr)); @@ -164,57 +152,40 @@ int test_yCbCrToRGB_16s16s_P3P3_func(void) out2[0] = r2; out2[1] = g2; out2[2] = b2; - generic->yCbCrToRGB_16s16s_P3P3(in, 64 * 2, out1, 64 * 2, &roi); -#ifdef WITH_SSE2 - if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE)) - { - strcat(testStr, " SSE2"); - sse2_yCbCrToRGB_16s16s_P3P3(in, 64 * 2, out2, 64 * 2, &roi); + status = generic->yCbCrToRGB_16s16s_P3P3(in, 64 * 2, out1, 64 * 2, &roi); + if (status != PRIMITIVES_SUCCESS) + return FALSE; - for (i = 0; i < 4096; ++i) + status = optimized->yCbCrToRGB_16s16s_P3P3(in, 64 * 2, out2, 64 * 2, &roi); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + for (i = 0; i < 4096; ++i) + { + if ((ABS(r1[i] - r2[i]) > 1) + || (ABS(g1[i] - g2[i]) > 1) + || (ABS(b1[i] - b2[i]) > 1)) { - if ((ABS(r1[i] - r2[i]) > 1) - || (ABS(g1[i] - g2[i]) > 1) - || (ABS(b1[i] - b2[i]) > 1)) - { - printf("YCbCrToRGB-SSE FAIL[%d]: %d,%d,%d vs %d,%d,%d\n", i, - r1[i], g1[i], b1[i], r2[i], g2[i], b2[i]); - failed = 1; - } + printf("YCbCrToRGB-SSE FAIL[%d]: %d,%d,%d vs %d,%d,%d\n", i, + r1[i], g1[i], b1[i], r2[i], g2[i], b2[i]); + return FALSE; } } -#endif /* i386 */ - - if (!failed) printf("All yCbCrToRGB_16s16s_P3P3 tests passed (%s).\n", testStr); - - return (failed > 0) ? FAILURE : SUCCESS; + return TRUE; } /* ------------------------------------------------------------------------- */ -STD_SPEED_TEST( - ycbcr_to_rgb_speed, INT16*, INT16*, dst = dst, - TRUE, generic->yCbCrToRGB_16s16s_P3P3(src1, 64 * 2, dst, 64 * 2, &roi64x64), -#ifdef WITH_SSE2 - TRUE, sse2_yCbCrToRGB_16s16s_P3P3(src1, 64 * 2, dst, 64 * 2, &roi64x64), - PF_SSE2_INSTRUCTIONS_AVAILABLE, FALSE, -#elif defined(WITH_NEON) - TRUE, neon_yCbCrToRGB_16s16s_P3P3(src1, 64 * 2, dst, 64 * 2, &roi64x64), - PF_ARM_NEON_INSTRUCTIONS_AVAILABLE, FALSE, -#else - FALSE, PRIM_NOP, 0, FALSE, -#endif - FALSE, dst = dst); - static int test_yCbCrToRGB_16s16s_P3P3_speed(void) { + prim_size_t roi = { 64, 64 }; INT16 ALIGN(y[4096]), ALIGN(cb[4096]), ALIGN(cr[4096]); INT16 ALIGN(r[4096]), ALIGN(g[4096]), ALIGN(b[4096]); int i; const INT16* input[3]; INT16* output[3]; - int size_array[] = { 64 }; + winpr_RAND((BYTE*)y, sizeof(y)); winpr_RAND((BYTE*)cb, sizeof(cb)); winpr_RAND((BYTE*)cr, sizeof(cr)); @@ -233,37 +204,35 @@ static int test_yCbCrToRGB_16s16s_P3P3_speed(void) output[0] = r; output[1] = g; output[2] = b; - ycbcr_to_rgb_speed("yCbCrToRGB", "aligned", input, NULL, NULL, output, - size_array, 1, YCBCR_TRIAL_ITERATIONS, TEST_TIME); - return SUCCESS; + + if (!speed_test("yCbCrToRGB_16s16s_P3P3", "aligned", g_Iterations, + (speed_test_fkt)generic->yCbCrToRGB_16s16s_P3P3, + (speed_test_fkt)optimized->yCbCrToRGB_16s16s_P3P3, + input, 64 * 2, output, 64 * 2, &roi)) + return FALSE; + + return TRUE; } int TestPrimitivesColors(int argc, char* argv[]) { - int status; - status = test_RGBToRGB_16s8u_P3AC4R_func(); + prim_test_setup(FALSE); - if (status != SUCCESS) + if (!test_RGBToRGB_16s8u_P3AC4R_func()) return 1; if (g_TestPrimitivesPerformance) { - status = test_RGBToRGB_16s8u_P3AC4R_speed(); - - if (status != SUCCESS) + if (!test_RGBToRGB_16s8u_P3AC4R_speed()) return 1; } - status = test_yCbCrToRGB_16s16s_P3P3_func(); - - if (status != SUCCESS) + if (!test_yCbCrToRGB_16s16s_P3P3_func()) return 1; if (g_TestPrimitivesPerformance) { - status = test_yCbCrToRGB_16s16s_P3P3_speed(); - - if (status != SUCCESS) + if (!test_yCbCrToRGB_16s16s_P3P3_speed()) return 1; } diff --git a/libfreerdp/primitives/test/TestPrimitivesCopy.c b/libfreerdp/primitives/test/TestPrimitivesCopy.c index 9220efe..917c702 100644 --- a/libfreerdp/primitives/test/TestPrimitivesCopy.c +++ b/libfreerdp/primitives/test/TestPrimitivesCopy.c @@ -19,22 +19,17 @@ #include #include "prim_test.h" -static const int MEMCPY_PRETEST_ITERATIONS = 1000000; -static const int TEST_TIME = 1.0; // seconds #define COPY_TESTSIZE (256*2+16*2+15+15) /* ------------------------------------------------------------------------- */ -static int test_copy8u_func(void) +static BOOL test_copy8u_func(void) { primitives_t* prims = primitives_get(); BYTE ALIGN(data[COPY_TESTSIZE + 15]); int i, soff; - int failed = 0; - char testStr[256]; BYTE ALIGN(dest[COPY_TESTSIZE + 15]); - testStr[0] = '\0'; + winpr_RAND(data, sizeof(data)); - strcat(testStr, " ptr"); for (soff = 0; soff < 16; ++soff) { @@ -47,7 +42,8 @@ static int test_copy8u_func(void) for (length = 1; length <= COPY_TESTSIZE - doff; ++length) { memset(dest, 0, sizeof(dest)); - prims->copy_8u(data + soff, dest + doff, length); + if (prims->copy_8u(data + soff, dest + doff, length) != PRIMITIVES_SUCCESS) + return FALSE; for (i = 0; i < length; ++i) { @@ -57,48 +53,47 @@ static int test_copy8u_func(void) "data[%d]=0x%02x\n", doff, length, i + doff, dest[i + doff], i + soff, data[i + soff]); - failed = 1; + return FALSE; } } } } } - if (!failed) printf("All copy8 tests passed (%s).\n", testStr); - - return (failed > 0) ? FAILURE : SUCCESS; + return TRUE; } /* ------------------------------------------------------------------------- */ -STD_SPEED_TEST(copy8u_speed_test, BYTE, BYTE, dst = dst, - TRUE, memcpy(dst, src1, size), - FALSE, PRIM_NOP, 0, FALSE, - TRUE, ippsCopy_8u(src1, dst, size)); - -int test_copy8u_speed(void) +static BOOL test_copy8u_speed(void) { BYTE ALIGN(src[MAX_TEST_SIZE + 4]); BYTE ALIGN(dst[MAX_TEST_SIZE + 4]); - copy8u_speed_test("copy8u", "aligned", src, NULL, 0, dst, - test_sizes, NUM_TEST_SIZES, MEMCPY_PRETEST_ITERATIONS, TEST_TIME); - copy8u_speed_test("copy8u", "unaligned", src + 1, NULL, 0, dst, - test_sizes, NUM_TEST_SIZES, MEMCPY_PRETEST_ITERATIONS, TEST_TIME); - return SUCCESS; + + if (!speed_test("copy_8u", "aligned", g_Iterations, + (speed_test_fkt)generic->copy_8u, + (speed_test_fkt)optimized->copy_8u, + src, dst, MAX_TEST_SIZE)) + return FALSE; + + if (!speed_test("copy_8u", "unaligned", g_Iterations, + (speed_test_fkt)generic->copy_8u, + (speed_test_fkt)optimized->copy_8u, + src+1, dst+1, MAX_TEST_SIZE)) + return FALSE; + + return TRUE; } int TestPrimitivesCopy(int argc, char* argv[]) { - int status; - status = test_copy8u_func(); + prim_test_setup(FALSE); - if (status != SUCCESS) + if (!test_copy8u_func()) return 1; if (g_TestPrimitivesPerformance) { - status = test_copy8u_speed(); - - if (status != SUCCESS) + if (!test_copy8u_speed()) return 1; } diff --git a/libfreerdp/primitives/test/TestPrimitivesSet.c b/libfreerdp/primitives/test/TestPrimitivesSet.c index 208e11e..219539b 100644 --- a/libfreerdp/primitives/test/TestPrimitivesSet.c +++ b/libfreerdp/primitives/test/TestPrimitivesSet.c @@ -53,14 +53,14 @@ static BOOL test_set8u_func(void) { UINT32 len; - memset(dest, 0, sizeof(dest)); + memset(dest, 3, sizeof(dest)); for (len = 1; len < 48 - off; ++len) { status = generic->set_8u(0xa5, dest + off, len); if (status != PRIMITIVES_SUCCESS) return FALSE; - if (!check8(dest, len, off, 0xa8)) + if (!check8(dest, len, off, 0xa5)) return FALSE; } } @@ -69,14 +69,14 @@ static BOOL test_set8u_func(void) { UINT32 len; - memset(dest, 0, sizeof(dest)); + memset(dest, 3, sizeof(dest)); for (len = 1; len < 48 - off; ++len) { status = optimized->set_8u(0xa5, dest + off, len); if (status != PRIMITIVES_SUCCESS) return FALSE; - if (!check8(dest, len, off, 0xa8)) + if (!check8(dest, len, off, 0xa5)) return FALSE; } } @@ -95,8 +95,9 @@ static BOOL test_set8u_speed(void) { winpr_RAND(&value, sizeof(value)); if (!speed_test("set_8u", "", g_Iterations, - generic->set_8u, optimized->set_8u, - value, dest + x, len)) + (speed_test_fkt)generic->set_8u, + (speed_test_fkt)optimized->set_8u, + value, dest + x, x)) return FALSE; } @@ -232,8 +233,9 @@ static BOOL test_set32u_speed(void) { winpr_RAND(&value, sizeof(value)); if (!speed_test("set_32u", "", g_Iterations, - generic->set_32u, optimized->set_32u, - value, dest + x, len)) + (speed_test_fkt)generic->set_32u, + (speed_test_fkt)optimized->set_32u, + value, dest + x, x)) return FALSE; } @@ -251,8 +253,9 @@ static BOOL test_set32s_speed(void) { winpr_RAND(&value, sizeof(value)); if (!speed_test("set_32s", "", g_Iterations, - generic->set_32s, optimized->set_32s, - value, dest + x, len)) + (speed_test_fkt)generic->set_32s, + (speed_test_fkt)optimized->set_32s, + value, dest + x, x)) return FALSE; } @@ -265,21 +268,20 @@ int TestPrimitivesSet(int argc, char* argv[]) if (!test_set8u_func()) return -1; - - if (!test_set8u_speed()) - return -1; - if (!test_set32s_func()) return -1; - - if (!test_set32s_speed()) - return -1; - if (!test_set32u_func()) return -1; - if (!test_set32u_speed()) - return -1; + if (g_TestPrimitivesPerformance) + { + if (!test_set8u_speed()) + return -1; + if (!test_set32s_speed()) + return -1; + if (!test_set32u_speed()) + return -1; + } return 0; } diff --git a/libfreerdp/primitives/test/TestPrimitivesShift.c b/libfreerdp/primitives/test/TestPrimitivesShift.c index 4316cad..e2f9b67 100644 --- a/libfreerdp/primitives/test/TestPrimitivesShift.c +++ b/libfreerdp/primitives/test/TestPrimitivesShift.c @@ -20,207 +20,361 @@ #include "prim_test.h" #define FUNC_TEST_SIZE 65536 -static const int SHIFT_PRETEST_ITERATIONS = 50000; -static const float TEST_TIME = 1.0; - -#ifdef WITH_SSE2 -#define SHIFT_TEST_FUNC(_name_, _type_, _str_, _f1_, _f2_) \ - int _name_(void) \ - { \ - _type_ ALIGN(src[FUNC_TEST_SIZE+3]), \ - ALIGN(d1[FUNC_TEST_SIZE+3]), ALIGN(d2[FUNC_TEST_SIZE+3]); \ - int failed = 0; \ - int i; \ - char testStr[256]; \ - testStr[0] = '\0'; \ - get_random_data(src, sizeof(src)); \ - _f1_(src+1, 3, d1+1, FUNC_TEST_SIZE); \ - if (IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) \ - { \ - strcat(testStr, " SSE3"); \ - /* Aligned */ \ - _f2_(src+1, 3, d2+1, FUNC_TEST_SIZE); \ - for (i=1; i<=FUNC_TEST_SIZE; ++i) \ - { \ - if (d1[i] != d2[i]) \ - { \ - printf("%s-SSE-aligned FAIL[%d]: 0x%x>>3=0x%x, got 0x%x\n", \ - _str_, i, src[i], d1[i], d2[i]); \ - ++failed; \ - } \ - } \ - /* Unaligned */ \ - _f2_(src+1, 3, d2+2, FUNC_TEST_SIZE); \ - for (i=1; i<=FUNC_TEST_SIZE; ++i) \ - { \ - if (d1[i] != d2[i+1]) \ - { \ - printf("%s-SSE-unaligned FAIL[%d]: 0x%x>>3=0x%x, got 0x%x\n", \ - _str_, i, src[i], d1[i], d2[i+1]); \ - ++failed; \ - } \ - } \ - } \ - if (!failed) printf("All %s tests passed (%s).\n", _str_, testStr); \ - return (failed > 0) ? FAILURE : SUCCESS; \ - } -#else -#define SHIFT_TEST_FUNC(_name_, _type_, _str_, _f1_, _f2_) \ - int _name_(void) \ - { \ - return SUCCESS; \ - } -#endif /* i386 */ - -SHIFT_TEST_FUNC(test_lShift_16s_func, INT16, "lshift_16s", general_lShiftC_16s, - sse2_lShiftC_16s) -SHIFT_TEST_FUNC(test_lShift_16u_func, UINT16, "lshift_16u", general_lShiftC_16u, - sse2_lShiftC_16u) -SHIFT_TEST_FUNC(test_rShift_16s_func, INT16, "rshift_16s", general_rShiftC_16s, - sse2_rShiftC_16s) -SHIFT_TEST_FUNC(test_rShift_16u_func, UINT16, "rshift_16u", general_rShiftC_16u, - sse2_rShiftC_16u) - -/* ========================================================================= */ -STD_SPEED_TEST(speed_lShift_16s, INT16, INT16, dst = dst, - TRUE, general_lShiftC_16s(src1, constant, dst, size), -#ifdef WITH_SSE2 - TRUE, sse2_lShiftC_16s(src1, constant, dst, size), - PF_SSE2_INSTRUCTIONS_AVAILABLE, FALSE, -#else - FALSE, PRIM_NOP, 0, FALSE, -#endif - TRUE, ippsLShiftC_16s(src1, constant, dst, size)); -STD_SPEED_TEST(speed_lShift_16u, UINT16, UINT16, dst = dst, - TRUE, general_lShiftC_16u(src1, constant, dst, size), -#ifdef WITH_SSE2 - TRUE, sse2_lShiftC_16u(src1, constant, dst, size), - PF_SSE2_INSTRUCTIONS_AVAILABLE, FALSE, -#else - FALSE, PRIM_NOP, 0, FALSE, -#endif - TRUE, ippsLShiftC_16u(src1, constant, dst, size)); -STD_SPEED_TEST(speed_rShift_16s, INT16, INT16, dst = dst, - TRUE, general_rShiftC_16s(src1, constant, dst, size), -#ifdef WITH_SSE2 - TRUE, sse2_rShiftC_16s(src1, constant, dst, size), - PF_SSE2_INSTRUCTIONS_AVAILABLE, FALSE, -#else - FALSE, PRIM_NOP, 0, FALSE, -#endif - TRUE, ippsRShiftC_16s(src1, constant, dst, size)); -STD_SPEED_TEST(speed_rShift_16u, UINT16, UINT16, dst = dst, - TRUE, general_rShiftC_16u(src1, constant, dst, size), -#ifdef WITH_SSE2 - TRUE, sse2_rShiftC_16u(src1, constant, dst, size), - PF_SSE2_INSTRUCTIONS_AVAILABLE, FALSE, -#else - FALSE, PRIM_NOP, 0, FALSE, -#endif - TRUE, ippsRShiftC_16u(src1, constant, dst, size)); + +static BOOL test_lShift_16s_func(void) +{ + pstatus_t status; + INT16 ALIGN(src[FUNC_TEST_SIZE+3]); + INT16 ALIGN(d1[FUNC_TEST_SIZE+3]); + UINT32 val; + + winpr_RAND((BYTE*)&val, sizeof(val)); + winpr_RAND((BYTE*)src, sizeof(src)); + + val = (val % (FUNC_TEST_SIZE - 1)) + 1; + + /* Aligned */ + status = generic->lShiftC_16s(src+1, val, d1+1, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + status = optimized->lShiftC_16s(src+1, val, d1+1, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + /* Unaligned */ + status = generic->lShiftC_16s(src+1, val, d1+2, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + status = optimized->lShiftC_16s(src+1, val, d1+2, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + return TRUE; +} + +static BOOL test_lShift_16u_func(void) +{ + pstatus_t status; + UINT16 ALIGN(src[FUNC_TEST_SIZE+3]); + UINT16 ALIGN(d1[FUNC_TEST_SIZE+3]); + UINT32 val; + + winpr_RAND((BYTE*)&val, sizeof(val)); + winpr_RAND((BYTE*)src, sizeof(src)); + + val = (val % (FUNC_TEST_SIZE - 1)) + 1; + + /* Aligned */ + status = generic->lShiftC_16u(src+1, val, d1+1, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + status = optimized->lShiftC_16u(src+1, val, d1+1, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + /* Unaligned */ + status = generic->lShiftC_16u(src+1, val, d1+2, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + status = optimized->lShiftC_16u(src+1, val, d1+2, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + return TRUE; +} + +static BOOL test_rShift_16s_func(void) +{ + pstatus_t status; + INT16 ALIGN(src[FUNC_TEST_SIZE+3]); + INT16 ALIGN(d1[FUNC_TEST_SIZE+3]); + UINT32 val; + + winpr_RAND((BYTE*)&val, sizeof(val)); + winpr_RAND((BYTE*)src, sizeof(src)); + + val = (val % (FUNC_TEST_SIZE - 1)) + 1; + + /* Aligned */ + status = generic->rShiftC_16s(src+1, val, d1+1, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + status = optimized->rShiftC_16s(src+1, val, d1+1, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + /* Unaligned */ + status = generic->rShiftC_16s(src+1, val, d1+2, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + status = optimized->rShiftC_16s(src+1, val, d1+2, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + return TRUE; +} + +static BOOL test_rShift_16u_func(void) +{ + pstatus_t status; + UINT16 ALIGN(src[FUNC_TEST_SIZE+3]); + UINT16 ALIGN(d1[FUNC_TEST_SIZE+3]); + UINT32 val; + + winpr_RAND((BYTE*)&val, sizeof(val)); + winpr_RAND((BYTE*)src, sizeof(src)); + + val = (val % (FUNC_TEST_SIZE - 1)) + 1; + + /* Aligned */ + status = generic->rShiftC_16u(src+1, val, d1+1, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + status = optimized->rShiftC_16u(src+1, val, d1+1, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + /* Unaligned */ + status = generic->rShiftC_16u(src+1, val, d1+2, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + status = optimized->rShiftC_16u(src+1, val, d1+2, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + return TRUE; +} + +static BOOL test_ShiftWrapper_16s_func(void) +{ + pstatus_t status; + INT16 ALIGN(src[FUNC_TEST_SIZE+3]); + INT16 ALIGN(d1[FUNC_TEST_SIZE+3]); + UINT32 tmp; + INT32 val; + + winpr_RAND((BYTE*)&tmp, sizeof(tmp)); + winpr_RAND((BYTE*)src, sizeof(src)); + + val = (tmp % (FUNC_TEST_SIZE - 1)) + 1; + + /* Aligned */ + status = generic->shiftC_16s(src+1, val, d1+1, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + status = optimized->shiftC_16s(src+1, val, d1+1, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + status = generic->shiftC_16s(src+1, -val, d1+1, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + status = optimized->shiftC_16s(src+1, -val, d1+1, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + /* Unaligned */ + status = generic->shiftC_16s(src+1, val, d1+2, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + status = optimized->shiftC_16s(src+1, val, d1+2, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + status = generic->shiftC_16s(src+1, -val, d1+2, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + status = optimized->shiftC_16s(src+1, -val, d1+2, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + return TRUE; +} + +static BOOL test_ShiftWrapper_16u_func(void) +{ + pstatus_t status; + UINT16 ALIGN(src[FUNC_TEST_SIZE+3]); + UINT16 ALIGN(d1[FUNC_TEST_SIZE+3]); + UINT32 tmp; + INT32 val; + + winpr_RAND((BYTE*)&tmp, sizeof(tmp)); + winpr_RAND((BYTE*)src, sizeof(src)); + + val = (tmp % (FUNC_TEST_SIZE - 1)) + 1; + + /* Aligned */ + status = generic->shiftC_16u(src+1, val, d1+1, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + status = optimized->shiftC_16u(src+1, val, d1+1, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + status = generic->shiftC_16u(src+1, -val, d1+1, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + status = optimized->shiftC_16u(src+1, -val, d1+1, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + /* Unaligned */ + status = generic->shiftC_16u(src+1, val, d1+2, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + status = optimized->shiftC_16u(src+1, val, d1+2, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + status = generic->shiftC_16u(src+1, -val, d1+2, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + status = optimized->shiftC_16u(src+1, -val, d1+2, FUNC_TEST_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + return TRUE; +} /* ------------------------------------------------------------------------- */ -int test_lShift_16s_speed(void) +static BOOL test_lShift_16s_speed(void) { - INT16 ALIGN(src[MAX_TEST_SIZE + 1]), ALIGN(dst[MAX_TEST_SIZE + 1]); - winpr_RAND(src, sizeof(src)); - speed_lShift_16s("lShift_16s", "aligned", src, NULL, 3, dst, - test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME); - speed_lShift_16s("lShift_16s", "unaligned", src + 1, NULL, 3, dst, - test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME); - return SUCCESS; + UINT32 val; + INT16 ALIGN(src[MAX_TEST_SIZE+1]), ALIGN(dst[MAX_TEST_SIZE+1]); + + winpr_RAND((BYTE*)src, sizeof(src)); + winpr_RAND((BYTE*)&val, sizeof(val)); + + if (!speed_test("lShift_16s", "aligned", g_Iterations, + (speed_test_fkt)generic->lShiftC_16s, + (speed_test_fkt)optimized->lShiftC_16s, src, val, + dst, MAX_TEST_SIZE)) + return FALSE; + + if (!speed_test("lShift_16s", "unaligned", g_Iterations, + (speed_test_fkt)generic->lShiftC_16s, + (speed_test_fkt)optimized->lShiftC_16s, src + 1, val, + dst, MAX_TEST_SIZE)) + return FALSE; + + return TRUE; } /* ------------------------------------------------------------------------- */ -int test_lShift_16u_speed(void) +static BOOL test_lShift_16u_speed(void) { + UINT32 val; UINT16 ALIGN(src[MAX_TEST_SIZE + 1]), ALIGN(dst[MAX_TEST_SIZE + 1]); - winpr_RAND(src, sizeof(src)); - speed_lShift_16u("lShift_16u", "aligned", src, NULL, 3, dst, - test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME); - speed_lShift_16u("lShift_16u", "unaligned", src + 1, NULL, 3, dst, - test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME); - return SUCCESS; + + winpr_RAND((BYTE*)&val, sizeof(val)); + winpr_RAND((BYTE*)src, sizeof(src)); + + if (!speed_test("lShift_16u", "aligned", g_Iterations, + (speed_test_fkt)generic->lShiftC_16u, + (speed_test_fkt)optimized->lShiftC_16u, src, val, + dst, MAX_TEST_SIZE)) + return FALSE; + + if (!speed_test("lShift_16u", "unaligned", g_Iterations, + (speed_test_fkt)generic->lShiftC_16u, + (speed_test_fkt)optimized->lShiftC_16u, src + 1, val, + dst, MAX_TEST_SIZE)) + return FALSE; + + return TRUE; } /* ------------------------------------------------------------------------- */ -int test_rShift_16s_speed(void) +static BOOL test_rShift_16s_speed(void) { - INT16 ALIGN(src[MAX_TEST_SIZE + 1]), ALIGN(dst[MAX_TEST_SIZE + 1]); - winpr_RAND(src, sizeof(src)); - speed_rShift_16s("rShift_16s", "aligned", src, NULL, 3, dst, - test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME); - speed_rShift_16s("rShift_16s", "unaligned", src + 1, NULL, 3, dst, - test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME); - return SUCCESS; + UINT32 val; + INT16 ALIGN(src[MAX_TEST_SIZE+1]), ALIGN(dst[MAX_TEST_SIZE+1]); + + winpr_RAND((BYTE*)src, sizeof(src)); + winpr_RAND((BYTE*)&val, sizeof(val)); + if (!speed_test("rShift_16s", "aligned", g_Iterations, + (speed_test_fkt)generic->rShiftC_16s, + (speed_test_fkt)optimized->rShiftC_16s, src, val, + dst, MAX_TEST_SIZE)) + return FALSE; + + if (!speed_test("rShift_16s", "unaligned", g_Iterations, + (speed_test_fkt)generic->rShiftC_16s, + (speed_test_fkt)optimized->rShiftC_16s, src + 1, val, + dst, MAX_TEST_SIZE)) + return FALSE; + + return TRUE; } /* ------------------------------------------------------------------------- */ -int test_rShift_16u_speed(void) +static BOOL test_rShift_16u_speed(void) { + UINT32 val; UINT16 ALIGN(src[MAX_TEST_SIZE + 1]), ALIGN(dst[MAX_TEST_SIZE + 1]); - winpr_RAND(src, sizeof(src)); - speed_rShift_16u("rShift_16u", "aligned", src, NULL, 3, dst, - test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME); - speed_rShift_16u("rShift_16u", "unaligned", src + 1, NULL, 3, dst, - test_sizes, NUM_TEST_SIZES, SHIFT_PRETEST_ITERATIONS, TEST_TIME); - return SUCCESS; + + winpr_RAND((BYTE*)&val, sizeof(val)); + winpr_RAND((BYTE*)src, sizeof(src)); + + if (!speed_test("rShift_16u", "aligned", g_Iterations, + (speed_test_fkt)generic->rShiftC_16u, + (speed_test_fkt)optimized->rShiftC_16u, src, val, + dst, MAX_TEST_SIZE)) + return FALSE; + + if (!speed_test("rShift_16u", "unaligned", g_Iterations, + (speed_test_fkt)generic->rShiftC_16u, + (speed_test_fkt)optimized->rShiftC_16u, src + 1, val, + dst, MAX_TEST_SIZE)) + return FALSE; + + return TRUE; } int TestPrimitivesShift(int argc, char* argv[]) { - int status; - status = test_lShift_16s_func(); + prim_test_setup(FALSE); - if (status != SUCCESS) + if (!test_lShift_16s_func()) return 1; if (g_TestPrimitivesPerformance) { - status = test_lShift_16s_speed(); - - if (status != SUCCESS) + if (!test_lShift_16s_speed()) return 1; } - status = test_lShift_16u_func(); - - if (status != SUCCESS) + if (!test_lShift_16u_func()) return 1; if (g_TestPrimitivesPerformance) { - status = test_lShift_16u_speed(); - - if (status != SUCCESS) + if (!test_lShift_16u_speed()) return 1; } - status = test_rShift_16s_func(); - - if (status != SUCCESS) + if (!test_rShift_16s_func()) return 1; if (g_TestPrimitivesPerformance) { - status = test_rShift_16s_speed(); - - if (status != SUCCESS) + if (!test_rShift_16s_speed()) return 1; } - status = test_rShift_16u_func(); - - if (status != SUCCESS) + if (!test_rShift_16u_func()) return 1; if (g_TestPrimitivesPerformance) { - status = test_rShift_16u_speed(); - - if (status != SUCCESS) + if (!test_rShift_16u_speed()) return 1; } + if (!test_ShiftWrapper_16s_func()) + return 1; + + if (!test_ShiftWrapper_16u_func()) + return 1; + return 0; } diff --git a/libfreerdp/primitives/test/TestPrimitivesSign.c b/libfreerdp/primitives/test/TestPrimitivesSign.c index 0496283..8091068 100644 --- a/libfreerdp/primitives/test/TestPrimitivesSign.c +++ b/libfreerdp/primitives/test/TestPrimitivesSign.c @@ -19,103 +19,71 @@ #include #include "prim_test.h" -static const int SIGN_PRETEST_ITERATIONS = 100000; -static const float TEST_TIME = 1.0; +#define TEST_BUFFER_SIZE 65535 /* ------------------------------------------------------------------------- */ -static int test_sign16s_func(void) +static BOOL test_sign16s_func(void) { - INT16 ALIGN(src[65535]), ALIGN(d1[65535]); -#ifdef WITH_SSE2 - INT16 ALIGN(d2[65535]); - int i; -#endif - int failed = 0; - char testStr[256]; - /* Test when we can reach 16-byte alignment */ - testStr[0] = '\0'; - winpr_RAND(src, sizeof(src)); - general_sign_16s(src + 1, d1 + 1, 65535); -#ifdef WITH_SSE2 - - if (IsProcessorFeaturePresentEx(PF_EX_SSSE3)) - { - strcat(testStr, " SSSE3"); - ssse3_sign_16s(src + 1, d2 + 1, 65535); - - for (i = 1; i < 65535; ++i) - { - if (d1[i] != d2[i]) - { - printf("SIGN16s-SSE-aligned FAIL[%d] of %d: want %d, got %d\n", - i, src[i], d1[i], d2[i]); - ++failed; - } - } - } - -#endif /* i386 */ - /* Test when we cannot reach 16-byte alignment */ - winpr_RAND(src, sizeof(src)); - general_sign_16s(src + 1, d1 + 2, 65535); -#ifdef WITH_SSE2 - - if (IsProcessorFeaturePresentEx(PF_EX_SSSE3)) - { - ssse3_sign_16s(src + 1, d2 + 2, 65535); - - for (i = 2; i < 65535; ++i) - { - if (d1[i] != d2[i]) - { - printf("SIGN16s-SSE-unaligned FAIL[%d] of %d: want %d, got %d\n", - i, src[i - 1], d1[i], d2[i]); - ++failed; - } - } - } - -#endif /* i386 */ - - if (!failed) printf("All sign16s tests passed (%s).\n", testStr); - - return (failed > 0) ? FAILURE : SUCCESS; + pstatus_t status; + INT16 ALIGN(src[TEST_BUFFER_SIZE]); + INT16 ALIGN(d1[TEST_BUFFER_SIZE]); + INT16 ALIGN(d2[TEST_BUFFER_SIZE]); + + winpr_RAND((BYTE*)src, sizeof(src)); + + status = generic->sign_16s(src + 1, d1 + 1, TEST_BUFFER_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + status = optimized->sign_16s(src + 1, d2 + 1, TEST_BUFFER_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + if (memcmp(d1, d2, sizeof(d1)) != 0) + return FALSE; + + status = generic->sign_16s(src + 1, d1 + 2, TEST_BUFFER_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + status = optimized->sign_16s(src + 1, d2 + 2, TEST_BUFFER_SIZE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + if (memcmp(d1, d2, sizeof(d1)) != 0) + return FALSE; + + return TRUE; } -/* ------------------------------------------------------------------------- */ -STD_SPEED_TEST(sign16s_speed_test, INT16, INT16, dst = dst, - TRUE, general_sign_16s(src1, dst, size), -#ifdef WITH_SSE2 - TRUE, ssse3_sign_16s(src1, dst, size), PF_EX_SSSE3, TRUE, -#else - FALSE, PRIM_NOP, 0, FALSE, -#endif - FALSE, dst = dst); - static int test_sign16s_speed(void) { INT16 ALIGN(src[MAX_TEST_SIZE + 3]), ALIGN(dst[MAX_TEST_SIZE + 3]); - winpr_RAND(src, sizeof(src)); - sign16s_speed_test("sign16s", "aligned", src, NULL, 0, dst, - test_sizes, NUM_TEST_SIZES, SIGN_PRETEST_ITERATIONS, TEST_TIME); - sign16s_speed_test("sign16s", "unaligned", src + 1, NULL, 0, dst, - test_sizes, NUM_TEST_SIZES, SIGN_PRETEST_ITERATIONS, TEST_TIME); - return SUCCESS; + winpr_RAND((BYTE*)src, sizeof(src)); + + if (!speed_test("sign16s", "aligned", g_Iterations, + (speed_test_fkt)generic->sign_16s, + (speed_test_fkt)optimized->sign_16s, src + 1, dst + 1, + MAX_TEST_SIZE)) + return FALSE; + + if (!speed_test("sign16s", "unaligned", g_Iterations, + (speed_test_fkt)generic->sign_16s, + (speed_test_fkt)optimized->sign_16s, src + 1, dst + 2, + MAX_TEST_SIZE)) + return FALSE; + + return TRUE; } int TestPrimitivesSign(int argc, char* argv[]) { - int status; - status = test_sign16s_func(); + prim_test_setup(FALSE); - if (status != SUCCESS) + if (!test_sign16s_func()) return 1; if (g_TestPrimitivesPerformance) { - status = test_sign16s_speed(); - - if (status != SUCCESS) + if (!test_sign16s_speed()) return 1; } diff --git a/libfreerdp/primitives/test/TestPrimitivesYCoCg.c b/libfreerdp/primitives/test/TestPrimitivesYCoCg.c index 2dd36e9..a1f5084 100644 --- a/libfreerdp/primitives/test/TestPrimitivesYCoCg.c +++ b/libfreerdp/primitives/test/TestPrimitivesYCoCg.c @@ -23,105 +23,103 @@ #include #include "prim_test.h" -static const int YCOCG_TRIAL_ITERATIONS = 20000; -static const float TEST_TIME = 4.0; - /* ------------------------------------------------------------------------- */ -int test_YCoCgRToRGB_8u_AC4R_func(void) +static BOOL test_YCoCgRToRGB_8u_AC4R_func(void) { -#ifdef WITH_SSE2 - int i; + BOOL result = TRUE; + pstatus_t status; INT32 ALIGN(out_sse[4098]), ALIGN(out_sse_inv[4098]); -#endif INT32 ALIGN(in[4098]); INT32 ALIGN(out_c[4098]), ALIGN(out_c_inv[4098]); - char testStr[256]; - BOOL failed = FALSE; - testStr[0] = '\0'; - winpr_RAND(in, sizeof(in)); - general_YCoCgToRGB_8u_AC4R((const BYTE*)(in + 1), 63 * 4, - (BYTE*) out_c, 63 * 4, 63, 61, 2, TRUE, FALSE); - general_YCoCgToRGB_8u_AC4R((const BYTE*)(in + 1), 63 * 4, - (BYTE*) out_c_inv, 63 * 4, 63, 61, 2, TRUE, TRUE); -#ifdef WITH_SSE2 - - if (IsProcessorFeaturePresentEx(PF_EX_SSSE3)) + + UINT32 i, x; + const UINT32 formats[] = { + PIXEL_FORMAT_ARGB32, + PIXEL_FORMAT_ABGR32, + PIXEL_FORMAT_RGBA32, + PIXEL_FORMAT_RGBX32, + PIXEL_FORMAT_BGRA32, + PIXEL_FORMAT_BGRX32 + }; + + winpr_RAND((BYTE*)in, sizeof(in)); + + for (x=0; xYCoCgToRGB_8u_AC4R( + (const BYTE*)(in + 1), 63 * 4, + (BYTE*) out_c, format, 63 * 4, 63, 61, 2, TRUE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + status = generic->YCoCgToRGB_8u_AC4R( + (const BYTE*)(in + 1), 63 * 4, + (BYTE*) out_c_inv, format, 63 * 4, 63, 61, 2, TRUE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + + status = optimized->YCoCgToRGB_8u_AC4R( + (const BYTE*)(in + 1), 63 * 4, + (BYTE*) out_sse, format, 63 * 4, 63, 61, 2, TRUE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; + status = optimized->YCoCgToRGB_8u_AC4R( + (const BYTE*)(in + 1), 63 * 4, + (BYTE*) out_sse_inv, format, 63 * 4, 63, 61, 2, TRUE); + if (status != PRIMITIVES_SUCCESS) + return FALSE; for (i = 0; i < 63 * 61; ++i) { if (out_c[i] != out_sse[i]) { - printf("YCoCgRToRGB-SSE FAIL[%d]: 0x%08x -> C 0x%08x vs SSE 0x%08x\n", i, + printf("optimized->YCoCgRToRGB FAIL[%d]: 0x%08x -> C 0x%08x vs optimized 0x%08x\n", i, in[i + 1], out_c[i], out_sse[i]); - failed = TRUE; + result = FALSE; } } - ssse3_YCoCgRToRGB_8u_AC4R((const BYTE*)(in + 1), 63 * 4, - (BYTE*) out_sse_inv, 63 * 4, 63, 61, 2, TRUE, TRUE); - for (i = 0; i < 63 * 61; ++i) { if (out_c_inv[i] != out_sse_inv[i]) { - printf("YCoCgRToRGB-SSE inverted FAIL[%d]: 0x%08x -> C 0x%08x vs SSE 0x%08x\n", + printf("optimized->YCoCgRToRGB inverted FAIL[%d]: 0x%08x -> C 0x%08x vs optimized 0x%08x\n", i, in[i + 1], out_c_inv[i], out_sse_inv[i]); - failed = TRUE; + result = FALSE; } } } - -#endif /* i386 */ - - if (!failed) printf("All YCoCgRToRGB_8u_AC4R tests passed (%s).\n", testStr); - - return (failed > 0) ? FAILURE : SUCCESS; + return result; } -/* ------------------------------------------------------------------------- */ -STD_SPEED_TEST( - ycocg_to_rgb_speed, BYTE, BYTE, PRIM_NOP, - TRUE, general_YCoCgToRGB_8u_AC4R(src1, 64 * 4, dst, 64 * 4, 64, 64, 2, FALSE, - FALSE), -#ifdef WITH_SSE2 - TRUE, ssse3_YCoCgRToRGB_8u_AC4R(src1, 64 * 4, dst, 64 * 4, 64, 64, 2, FALSE, - FALSE), - PF_EX_SSSE3, TRUE, -#else - FALSE, PRIM_NOP, 0, FALSE, -#endif - FALSE, PRIM_NOP); - static int test_YCoCgRToRGB_8u_AC4R_speed(void) { INT32 ALIGN(in[4096]); INT32 ALIGN(out[4096]); - int size_array[] = { 64 }; - winpr_RAND(in, sizeof(in)); - ycocg_to_rgb_speed("YCoCgToRGB", "aligned", (const BYTE*) in, - 0, 0, (BYTE*) out, - size_array, 1, YCOCG_TRIAL_ITERATIONS, TEST_TIME); - return SUCCESS; + + winpr_RAND((BYTE*)in, sizeof(in)); + + if (!speed_test("YCoCgToRGB_8u_AC4R", "aligned", g_Iterations, + (speed_test_fkt)generic->YCoCgToRGB_8u_AC4R, + (speed_test_fkt)optimized->YCoCgToRGB_8u_AC4R, + in, 64 * 4, out, 64 * 4, 64, 64, 2, FALSE, FALSE)) + return FALSE; + + return TRUE; } int TestPrimitivesYCoCg(int argc, char* argv[]) { - int status; - status = test_YCoCgRToRGB_8u_AC4R_func(); + prim_test_setup(FALSE); - if (status != SUCCESS) + if (!test_YCoCgRToRGB_8u_AC4R_func()) return 1; if (g_TestPrimitivesPerformance) { - status = test_YCoCgRToRGB_8u_AC4R_speed(); - - if (status != SUCCESS) + if (!test_YCoCgRToRGB_8u_AC4R_speed()) return 1; } diff --git a/libfreerdp/primitives/test/TestPrimitivesYUV.c b/libfreerdp/primitives/test/TestPrimitivesYUV.c index 7281763..c7af894 100644 --- a/libfreerdp/primitives/test/TestPrimitivesYUV.c +++ b/libfreerdp/primitives/test/TestPrimitivesYUV.c @@ -38,8 +38,8 @@ static void get_size(UINT32* width, UINT32* height) winpr_RAND((BYTE*)width, sizeof(*width)); winpr_RAND((BYTE*)height, sizeof(*height)); // TODO: Algorithm only works on even resolutions... - *width = (*width % 4000) << 1; - *height = (*height % 4000 << 1); + *width = (*width % 64) << 1; + *height = (*height % 64 << 1); } static BOOL check_padding(const BYTE* psrc, size_t size, size_t padding, @@ -370,11 +370,13 @@ static BOOL TestPrimitiveYUV(BOOL use444) if (use444) { - if (prims->RGBToYUV444_8u_P3AC4R(rgb, stride, yuv, yuv_step, + if (prims->RGBToYUV444_8u_P3AC4R(rgb, PIXEL_FORMAT_BGRA32, + stride, yuv, yuv_step, &roi) != PRIMITIVES_SUCCESS) goto fail; } - else if (prims->RGBToYUV420_8u_P3AC4R(rgb, stride, yuv, yuv_step, + else if (prims->RGBToYUV420_8u_P3AC4R(rgb, PIXEL_FORMAT_BGRA32, + stride, yuv, yuv_step, &roi) != PRIMITIVES_SUCCESS) goto fail; @@ -429,16 +431,16 @@ int TestPrimitivesYUV(int argc, char* argv[]) UINT32 x; int rc = -1; + prim_test_setup(FALSE); + for (x = 0; x < 10; x++) { - /* TODO: This test fails on value comparison, - * there seems to be some issue left with encoder / decoder pass. - if (!TestPrimitiveYUV(FALSE)) - goto end; - */ if (!TestPrimitiveYUV(TRUE)) goto end; + if (!TestPrimitiveYUV(FALSE)) + goto end; + if (!TestPrimitiveYUVCombine()) goto end; } diff --git a/libfreerdp/primitives/test/prim_test.h b/libfreerdp/primitives/test/prim_test.h index e061887..45519f3 100644 --- a/libfreerdp/primitives/test/prim_test.h +++ b/libfreerdp/primitives/test/prim_test.h @@ -43,13 +43,6 @@ extern int test_sizes[]; #define NUM_TEST_SIZES 10 -#ifndef SUCCESS -#define SUCCESS 0 -#endif -#ifndef FAILURE -#define FAILURE 1 -#endif - extern BOOL g_TestPrimitivesPerformance; extern UINT32 g_Iterations; @@ -58,8 +51,10 @@ extern primitives_t* optimized; void prim_test_setup(BOOL performance); +typedef pstatus_t (*speed_test_fkt)(); + BOOL speed_test(const char* name, const char* dsc, UINT32 iterations, - pstatus_t (*generic)(), pstatus_t (*optimised)(), + speed_test_fkt generic, speed_test_fkt optimized, ...); #endif // !__PRIMTEST_H_INCLUDED__ -- 2.7.4