add YCoCg->RGB and 16-to-32bit SSE
authorDaryl Poe <daryl.poe@hp.com>
Wed, 2 Jul 2014 20:30:04 +0000 (14:30 -0600)
committerDaryl Poe <daryl.poe@hp.com>
Wed, 2 Jul 2014 20:30:04 +0000 (14:30 -0600)
19 files changed:
include/freerdp/primitives.h
libfreerdp/codec/bitmap_decode.c
libfreerdp/codec/color.c
libfreerdp/codec/planar.c
libfreerdp/core/capabilities.c
libfreerdp/primitives/CMakeLists.txt
libfreerdp/primitives/prim_16to32bpp.c [new file with mode: 0644]
libfreerdp/primitives/prim_16to32bpp.h [new file with mode: 0644]
libfreerdp/primitives/prim_16to32bpp_opt.c [new file with mode: 0644]
libfreerdp/primitives/prim_YCoCg.c [new file with mode: 0644]
libfreerdp/primitives/prim_YCoCg.h [new file with mode: 0644]
libfreerdp/primitives/prim_YCoCg_opt.c [new file with mode: 0644]
libfreerdp/primitives/prim_internal.h
libfreerdp/primitives/primitives.c
libfreerdp/primitives/test/CMakeLists.txt
libfreerdp/primitives/test/prim_test.c
libfreerdp/primitives/test/prim_test.h
libfreerdp/primitives/test/test_16to32bpp.c [new file with mode: 0644]
libfreerdp/primitives/test/test_YCoCg.c [new file with mode: 0644]

index 24bac47..bb518c2 100644 (file)
@@ -148,6 +148,18 @@ typedef pstatus_t (*__RGBToRGB_16s8u_P3AC4R_t)(
        const INT16 *pSrc[3],  INT32 srcStep,
        BYTE *pDst,  INT32 dstStep,
        const prim_size_t *roi);
+typedef pstatus_t (*__YCoCgRToRGB_8u_AC4R_t)(
+       const BYTE *pSrc, INT32 srcStep,
+       BYTE *pDst, INT32 dstStep,
+       UINT32 width, UINT32 height,
+       UINT8 shift,
+       BOOL withAlpha,
+       BOOL invert);
+typedef pstatus_t (*__RGB565ToARGB_16u32u_C3C4_t)(
+       const UINT16* pSrc, INT32 srcStep,
+       UINT32* pDst, INT32 dstStep,
+       UINT32 width, UINT32 height,
+       BOOL alpha, BOOL invert);
 typedef pstatus_t (*__andC_32u_t)(
        const UINT32 *pSrc,
        UINT32 val,
@@ -190,6 +202,8 @@ typedef struct
        __yCbCrToRGB_16s16s_P3P3_t yCbCrToRGB_16s16s_P3P3;
        __RGBToYCbCr_16s16s_P3P3_t RGBToYCbCr_16s16s_P3P3;
        __RGBToRGB_16s8u_P3AC4R_t RGBToRGB_16s8u_P3AC4R;
+       __YCoCgRToRGB_8u_AC4R_t YCoCgRToRGB_8u_AC4R;
+       __RGB565ToARGB_16u32u_C3C4_t RGB565ToARGB_16u32u_C3C4;
 } primitives_t;
 
 #ifdef __cplusplus
index c5232d9..4fbb6e1 100644 (file)
@@ -82,43 +82,30 @@ static const BYTE g_MaskLiteRunLength = 0x0F;
  * Reads the supplied order header and extracts the compression
  * order code ID.
  */
-static UINT32 ExtractCodeId(BYTE bOrderHdr)
+static INLINE UINT32 ExtractCodeId(BYTE bOrderHdr)
 {
-       int code;
-
-       switch (bOrderHdr)
-       {
-               case MEGA_MEGA_BG_RUN:
-               case MEGA_MEGA_FG_RUN:
-               case MEGA_MEGA_SET_FG_RUN:
-               case MEGA_MEGA_DITHERED_RUN:
-               case MEGA_MEGA_COLOR_RUN:
-               case MEGA_MEGA_FGBG_IMAGE:
-               case MEGA_MEGA_SET_FGBG_IMAGE:
-               case MEGA_MEGA_COLOR_IMAGE:
-               case SPECIAL_FGBG_1:
-               case SPECIAL_FGBG_2:
-               case SPECIAL_WHITE:
-               case SPECIAL_BLACK:
-                       return bOrderHdr;
+       if ((bOrderHdr & 0xC0U) != 0xC0U) {
+               /* REGULAR orders
+                * (000x xxxx, 001x xxxx, 010x xxxx, 011x xxxx, 100x xxxx)
+                */
+               return bOrderHdr >> 5;
        }
-       code = bOrderHdr >> 5;
-       switch (code)
-       {
-               case REGULAR_BG_RUN:
-               case REGULAR_FG_RUN:
-               case REGULAR_COLOR_RUN:
-               case REGULAR_FGBG_IMAGE:
-               case REGULAR_COLOR_IMAGE:
-                       return code;
+       else if ((bOrderHdr & 0xF0U) == 0xF0U) {
+               /* MEGA and SPECIAL orders (0xF*) */
+               return bOrderHdr;
+       }
+       else {
+               /* LITE orders
+                * 1100 xxxx, 1101 xxxx, 1110 xxxx)
+                */
+               return bOrderHdr >> 4;
        }
-       return bOrderHdr >> 4;
 }
 
 /**
  * Extract the run length of a compression order.
  */
-static UINT32 ExtractRunLength(UINT32 code, BYTE* pbOrderHdr, UINT32* advance)
+static INLINE UINT32 ExtractRunLength(UINT32 code, BYTE* pbOrderHdr, UINT32* advance)
 {
        UINT32 runLength;
        UINT32 ladvance;
index 540d313..4e9e357 100644 (file)
@@ -29,6 +29,7 @@
 
 #include <freerdp/api.h>
 #include <freerdp/freerdp.h>
+#include <freerdp/primitives.h>
 #include <freerdp/codec/color.h>
 
 int freerdp_get_pixel(BYTE* data, int x, int y, int width, int height, int bpp)
@@ -602,73 +603,23 @@ BYTE* freerdp_image_convert_16bpp(BYTE* srcData, BYTE* dstData, int width, int h
        }
        else if (dstBpp == 32)
        {
-               int i;
-               UINT32 pixel;
-               UINT16* src16;
-               UINT32* dst32;
-               BYTE red, green, blue;
+               primitives_t* prims;
 
                if (dstData == NULL)
-                       dstData = (BYTE*) malloc(width * height * 4);
-
-               src16 = (UINT16*) srcData;
-               dst32 = (UINT32*) dstData;
-
-               if (clrconv->alpha)
-               {
-                       if (clrconv->invert)
-                       {
-                               for (i = width * height; i > 0; i--)
-                               {
-                                       pixel = *src16;
-                                       src16++;
-                                       GetBGR16(red, green, blue, pixel);
-                                       pixel = ARGB32(0xFF, red, green, blue);
-                                       *dst32 = pixel;
-                                       dst32++;
-                               }
-                       }
-                       else
-                       {
-                               for (i = width * height; i > 0; i--)
-                               {
-                                       pixel = *src16;
-                                       src16++;
-                                       GetBGR16(red, green, blue, pixel);
-                                       pixel = ABGR32(0xFF, red, green, blue);
-                                       *dst32 = pixel;
-                                       dst32++;
-                               }
-                       }
-               }
-               else
                {
-                       if (clrconv->invert)
+                       if (posix_memalign((void **) &dstData, 16,
+                                       width * height * sizeof(UINT32)) != 0)
                        {
-                               for (i = width * height; i > 0; i--)
-                               {
-                                       pixel = *src16;
-                                       src16++;
-                                       GetBGR16(red, green, blue, pixel);
-                                       pixel = RGB32(red, green, blue);
-                                       *dst32 = pixel;
-                                       dst32++;
-                               }
-                       }
-                       else
-                       {
-                               for (i = width * height; i > 0; i--)
-                               {
-                                       pixel = *src16;
-                                       src16++;
-                                       GetBGR16(red, green, blue, pixel);
-                                       pixel = BGR32(red, green, blue);
-                                       *dst32 = pixel;
-                                       dst32++;
-                               }
+                               return NULL;
                        }
                }
 
+               prims = primitives_get();
+               prims->RGB565ToARGB_16u32u_C3C4(
+                       (const UINT16*) srcData, width * sizeof(UINT16),
+                       (UINT32*) dstData, width * sizeof(UINT32),
+                       width, height, clrconv->alpha, clrconv->invert);
+
                return dstData;
        }
 
index 824376f..2777460 100644 (file)
@@ -25,6 +25,7 @@
 #include <winpr/print.h>
 
 #include <freerdp/codec/bitmap.h>
+#include <freerdp/primitives.h>
 
 #include "planar.h"
 
@@ -327,6 +328,29 @@ int planar_decompress(BITMAP_PLANAR_CONTEXT* planar, BYTE* pSrcData, UINT32 SrcS
                srcp++;
        }
 
+       if (FormatHeader & PLANAR_FORMAT_HEADER_CLL_MASK)
+       {
+               /* The data is in YCoCg colorspace rather than RGB. */
+               if (FormatHeader & PLANAR_FORMAT_HEADER_CS)
+               {
+                       static BOOL been_warned = FALSE;
+                       if (!been_warned)
+                               fprintf(stderr, "Chroma-Subsampling is not implemented.\n");
+                       been_warned = TRUE;
+               }
+               else
+               {
+                       BOOL alpha;
+                       int cll;
+
+                       alpha = (FormatHeader & PLANAR_FORMAT_HEADER_NA) ? FALSE : TRUE;
+                       cll = FormatHeader & PLANAR_FORMAT_HEADER_CLL_MASK;
+                       primitives_get()->YCoCgRToRGB_8u_AC4R(
+                               pDstData, nDstStep, pDstData, nDstStep,
+                               nWidth, nHeight, cll, alpha, FALSE);
+               }
+       }
+
        status = (SrcSize == (srcp - pSrcData)) ? 1 : -1;
 
        return status;
index f3e105c..b4e2fca 100644 (file)
@@ -361,7 +361,17 @@ void rdp_write_bitmap_capability_set(wStream* s, rdpSettings* settings)
        header = rdp_capability_set_start(s);
 
        drawingFlags |= DRAW_ALLOW_SKIP_ALPHA;
-       drawingFlags |= DRAW_ALLOW_COLOR_SUBSAMPLING;
+       /* While bitmap_decode.c now implements YCoCg, in turning it
+        * on we have found Microsoft is inconsistent on whether to invert R & B.
+        * And it's not only from one server to another; on Win7/2008R2, it appears
+        * to send the main content with a different inversion than the Windows
+        * button!  So... don't advertise that we support YCoCg and the server
+        * will not send it.  YCoCg is still needed for EGFX, but it at least
+        * appears consistent in its use.
+        */
+       /* drawingFlags |= DRAW_ALLOW_DYNAMIC_COLOR_FIDELITY; */
+       /* YCoCg with chroma subsampling is not implemented in bitmap_decode.c. */
+       /* drawingFlags |= DRAW_ALLOW_COLOR_SUBSAMPLING; */
 
        if (settings->RdpVersion > 5)
                preferredBitsPerPixel = settings->ColorDepth;
index a35b24d..a075072 100644 (file)
@@ -17,6 +17,7 @@ set(MODULE_NAME "freerdp-primitives")
 set(MODULE_PREFIX "FREERDP_PRIMITIVES")
 
 set(${MODULE_PREFIX}_SRCS
+       prim_16to32bpp.c
        prim_add.c
        prim_andor.c
        prim_alphaComp.c
@@ -25,17 +26,20 @@ set(${MODULE_PREFIX}_SRCS
        prim_set.c
        prim_shift.c
        prim_sign.c
+       prim_YCoCg.c
        primitives.c
        prim_internal.h)
 
 set(${MODULE_PREFIX}_OPT_SRCS
+       prim_16to32bpp_opt.c
        prim_add_opt.c
        prim_andor_opt.c
        prim_alphaComp_opt.c
        prim_colors_opt.c
        prim_set_opt.c
        prim_shift_opt.c
-       prim_sign_opt.c)
+       prim_sign_opt.c
+       prim_YCoCg_opt.c)
 
 add_definitions(-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE})
 
diff --git a/libfreerdp/primitives/prim_16to32bpp.c b/libfreerdp/primitives/prim_16to32bpp.c
new file mode 100644 (file)
index 0000000..a911055
--- /dev/null
@@ -0,0 +1,137 @@
+/* prim_16to32bpp.c
+ * 16-bit to 32-bit color conversion (widely used)
+ * vi:ts=4 sw=4:
+ *
+ * The general routine was leveraged from freerdp/codec/color.c.
+ *
+ * Copyright 2010 Marc-Andre Moreau <marcandre.moreau@gmail.com>
+ * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <freerdp/codec/color.h>
+
+#include "prim_internal.h"
+#include "prim_16to32bpp.h"
+
+/* ------------------------------------------------------------------------- */
+pstatus_t general_RGB565ToARGB_16u32u_C3C4(
+       const UINT16* pSrc, INT32 srcStep,
+       UINT32* pDst, INT32 dstStep,
+       UINT32 width, UINT32 height,
+       BOOL alpha, BOOL invert)
+{
+       const UINT16* src16;
+       UINT32* dst32;
+       int x,y;
+       int srcRowBump, dstRowBump;
+       BYTE red, green, blue;
+
+       src16 = pSrc;
+       dst32 = pDst;
+       srcRowBump = (srcStep - (width * sizeof(UINT16))) / sizeof(UINT16);
+       dstRowBump = (dstStep - (width * sizeof(UINT32))) / sizeof(UINT32);
+
+       /* Loops are separated so if-decisions are not made in the loop. */
+       if (alpha)
+       {
+               if (invert)
+               {
+                       for (y=0; y<height; y++)
+                       {
+                               for (x=0; x<width; x++)
+                               {
+                                       UINT32 pixel = (UINT32) *src16++;
+                                       GetBGR16(red, green, blue, pixel);
+                                       pixel = ARGB32(0xFF, red, green, blue);
+                                       *dst32++ = pixel;
+                               }
+                               src16 += srcRowBump;
+                               dst32 += dstRowBump;
+                       }
+               }
+               else
+               {
+                       for (y=0; y<height; y++)
+                       {
+                               for (x=0; x<width; x++)
+                               {
+                                       UINT32 pixel = (UINT32) *src16++;
+                                       GetBGR16(red, green, blue, pixel);
+                                       pixel = ABGR32(0xFF, red, green, blue);
+                                       *dst32++ = pixel;
+                               }
+                               src16 += srcRowBump;
+                               dst32 += dstRowBump;
+                       }
+               }
+       }
+       else
+       {
+               if (invert)
+               {
+                       for (y=0; y<height; y++)
+                       {
+                               for (x=0; x<width; x++)
+                               {
+                                       UINT32 pixel = (UINT32) *src16++;
+                                       GetBGR16(red, green, blue, pixel);
+                                       pixel = RGB32(red, green, blue);
+                                       *dst32++ = pixel;
+                               }
+                               src16 += srcRowBump;
+                               dst32 += dstRowBump;
+                       }
+               }
+               else
+               {
+                       for (y=0; y<height; y++)
+                       {
+                               for (x=0; x<width; x++)
+                               {
+                                       UINT32 pixel = (UINT32) *src16++;
+                                       GetBGR16(red, green, blue, pixel);
+                                       pixel = BGR32(red, green, blue);
+                                       *dst32++ = pixel;
+                               }
+                               src16 += srcRowBump;
+                               dst32 += dstRowBump;
+                       }
+               }
+       }
+
+       return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_16to32bpp(
+       primitives_t *prims)
+{
+       prims->RGB565ToARGB_16u32u_C3C4 = general_RGB565ToARGB_16u32u_C3C4;
+
+       primitives_init_16to32bpp_opt(prims);
+}
+
+/* ------------------------------------------------------------------------- */
+void primitives_deinit_16to32bpp(
+       primitives_t *prims)
+{
+       /* Nothing to do. */
+}
diff --git a/libfreerdp/primitives/prim_16to32bpp.h b/libfreerdp/primitives/prim_16to32bpp.h
new file mode 100644 (file)
index 0000000..80a3cd7
--- /dev/null
@@ -0,0 +1,36 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * 16-bit to 32-bit color conversions
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __GNUC__
+# pragma once
+#endif
+
+#ifndef __PRIM_16TO32BPP_H_INCLUDED__
+#define __PRIM_16TO32BPP_H_INCLUDED__
+
+#include <freerdp/primitives.h>
+
+extern pstatus_t general_RGB565ToARGB_16u32u_C3C4(
+       const UINT16* pSrc, INT32 srcStep,
+       UINT32* pDst, INT32 dstStep,
+       UINT32 width, UINT32 height,
+       BOOL alpha, BOOL invert);
+extern void primitives_init_16to32bpp_opt(primitives_t* prims);
+
+#endif /* !__PRIM_16TO32BPP_H_INCLUDED__ */
diff --git a/libfreerdp/primitives/prim_16to32bpp_opt.c b/libfreerdp/primitives/prim_16to32bpp_opt.c
new file mode 100644 (file)
index 0000000..a37e3e3
--- /dev/null
@@ -0,0 +1,280 @@
+/* prim_16to32bpp_opt.c
+ * 16-bit to 32-bit color conversion via SSE/Neon
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <winpr/sysinfo.h>
+
+#ifdef WITH_SSE2
+#include <emmintrin.h>
+#include <tmmintrin.h>
+/* #elif defined(WITH_NEON) */
+/* #include <arm_neon.h> */
+#endif /* WITH_SSE2 */
+
+#include "prim_internal.h"
+#include "prim_16to32bpp.h"
+
+#ifdef WITH_SSE2
+/* ------------------------------------------------------------------------- */
+/* Note:  _no_invert and _invert could be coded with variables as shift
+ * amounts and a single routine, but tests showed that was much slower.
+ */
+static pstatus_t sse3_RGB565ToARGB_16u32u_C3C4_no_invert(
+       const UINT16* pSrc, INT32 srcStep,
+       UINT32* pDst, INT32 dstStep,
+       UINT32 width, UINT32 height,
+       BOOL alpha)
+{
+       const BYTE *src = (const BYTE *) pSrc;
+       BYTE *dst = (BYTE *) pDst;
+       int h;
+       int srcRowBump = srcStep - (width * sizeof(UINT16));
+       int dstRowBump = dstStep - (width * sizeof(UINT32));
+       __m128i R0, R1, R2, R_FC00, R_0300, R_00F8, R_0007, R_alpha;
+
+       R_FC00 = _mm_set1_epi16(0xFC00);
+       R_0300 = _mm_set1_epi16(0x0300);
+       R_00F8 = _mm_set1_epi16(0x00F8);
+       R_0007 = _mm_set1_epi16(0x0007);
+       if (alpha) R_alpha = _mm_set1_epi32(0xFF00FF00U);
+       else R_alpha = _mm_set1_epi32(0x00000000U);
+
+       for (h=0; h<height; h++)
+       {
+               int w = width;
+
+               /* Get to 16-byte destination boundary for the dest. */
+               if ((ULONG_PTR) dst & 0x0f)
+               {
+                       int startup = (16 - ((ULONG_PTR) dst & 0x0f)) / 4;
+                       if (startup > width) startup = width;
+                       general_RGB565ToARGB_16u32u_C3C4((const UINT16*) src, srcStep,
+                               (UINT32*) dst, dstStep, startup, 1, alpha, FALSE);
+                       src += startup * sizeof(UINT16);
+                       dst += startup * sizeof(UINT32);
+                       w -= startup;
+               }
+
+               /* The main loop handles eight pixels at a time. */
+               while (w >= 8)
+               {
+                       /* If off-stride, use the slower load. */
+                       if ((ULONG_PTR) src & 0x0f)
+                               R0 = _mm_lddqu_si128((__m128i *) src);
+                       else
+                               R0 = _mm_load_si128((__m128i *) src);
+                       src += (128/8);
+
+                       /* Do the lower two colors, which end up in the lower two bytes. */
+                       /* G = ((P<<5) & 0xFC00) | ((P>>1) & 0x0300) */
+                       R2 = _mm_slli_epi16(R0, 5);
+                       R2 = _mm_and_si128(R_FC00, R2);
+
+                       R1 = _mm_srli_epi16(R0, 1);
+                       R1 = _mm_and_si128(R_0300, R1);
+                       R2 = _mm_or_si128(R1, R2);
+
+                       /* R = ((P<<3) & 0x00F8) | ((P>>2) & 0x0007) */
+                       R1 = _mm_slli_epi16(R0, 3);
+                       R1 = _mm_and_si128(R_00F8, R1);
+                       R2 = _mm_or_si128(R1, R2);
+
+                       R1 = _mm_srli_epi16(R0, 2);
+                       R1 = _mm_and_si128(R_0007, R1);
+                       R2 = _mm_or_si128(R1, R2);              /* R2 = lowers */
+
+                       /* Handle the upper color. */
+                       /* B = ((P<<8) & 0x00F8) | ((P<<13) & 0x0007) */
+                       R1 = _mm_srli_epi16(R0, 8);
+                       R1 = _mm_and_si128(R_00F8, R1);
+
+                       R0 = _mm_srli_epi16(R0, 13);
+                       R0 = _mm_and_si128(R_0007, R0);
+                       R1 = _mm_or_si128(R0, R1);              /* R1 = uppers */
+
+                       /* Add alpha (or zero) . */
+                       R1 = _mm_or_si128(R_alpha, R1); /* + alpha */
+
+                       /* Unpack to intermix the AB and GR pieces. */
+                       R0 = _mm_unpackhi_epi16(R2, R1);
+                       R2 = _mm_unpacklo_epi16(R2, R1);
+
+                       /* Store the results. */
+                       _mm_store_si128((__m128i *) dst, R2);  dst += (128/8);
+                       _mm_store_si128((__m128i *) dst, R0);  dst += (128/8);
+                       w -= 8;
+               }
+
+               /* Handle any remainder. */
+               if (w > 0)
+               {
+                       general_RGB565ToARGB_16u32u_C3C4((const UINT16*) src, srcStep,
+                               (UINT32*) dst, dstStep, w, 1, alpha, FALSE);
+                       src += w * sizeof(UINT16);
+                       dst += w * sizeof(UINT32);
+               }
+
+               /* Bump to the start of the next row. */
+               src += srcRowBump;
+               dst += dstRowBump;
+       }
+
+       return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+static pstatus_t sse3_RGB565ToARGB_16u32u_C3C4_invert(
+       const UINT16* pSrc, INT32 srcStep,
+       UINT32* pDst, INT32 dstStep,
+       UINT32 width, UINT32 height,
+       BOOL alpha)
+{
+       const BYTE *src = (const BYTE *) pSrc;
+       BYTE *dst = (BYTE *) pDst;
+       int h;
+       int srcRowBump = srcStep - (width * sizeof(UINT16));
+       int dstRowBump = dstStep - (width * sizeof(UINT32));
+       __m128i R0, R1, R2, R_FC00, R_0300, R_00F8, R_0007, R_alpha;
+
+       R_FC00 = _mm_set1_epi16(0xFC00);
+       R_0300 = _mm_set1_epi16(0x0300);
+       R_00F8 = _mm_set1_epi16(0x00F8);
+       R_0007 = _mm_set1_epi16(0x0007);
+       if (alpha) R_alpha = _mm_set1_epi32(0xFF00FF00U);
+       else R_alpha = _mm_set1_epi32(0x00000000U);
+
+       for (h=0; h<height; h++)
+       {
+               int w = width;
+
+               /* Get to 16-byte destination boundary for the dest. */
+               if ((ULONG_PTR) dst & 0x0f)
+               {
+                       int startup = (16 - ((ULONG_PTR) dst & 0x0f)) / 4;
+                       if (startup > width) startup = width;
+                       general_RGB565ToARGB_16u32u_C3C4((const UINT16*) src, srcStep,
+                               (UINT32*) dst, dstStep, startup, 1, alpha, TRUE);
+                       src += startup * sizeof(UINT16);
+                       dst += startup * sizeof(UINT32);
+                       w -= startup;
+               }
+
+               /* The main loop handles eight pixels at a time. */
+               while (w >= 8)
+               {
+                       /* Off-stride, slower load. */
+                       if ((ULONG_PTR) src & 0x0f) 
+                               R0 = _mm_lddqu_si128((__m128i *) src);
+                       else
+                               R0 = _mm_load_si128((__m128i *) src);
+                       src += (128/8);
+
+                       /* Do the lower two colors, which end up in the lower two bytes. */
+                       /* G = ((P<<5) & 0xFC00) | ((P>>1) & 0x0300) */
+                       R2 = _mm_slli_epi16(R0, 5);
+                       R2 = _mm_and_si128(R_FC00, R2);
+
+                       R1 = _mm_srli_epi16(R0, 1);
+                       R1 = _mm_and_si128(R_0300, R1);
+                       R2 = _mm_or_si128(R1, R2);
+
+                       /* B = ((P>>8) & 0x00F8) | ((P>>13) & 0x0007) */
+                       R1 = _mm_srli_epi16(R0, 8);
+                       R1 = _mm_and_si128(R_00F8, R1);
+                       R2 = _mm_or_si128(R1, R2);
+
+                       R1 = _mm_srli_epi16(R0, 13);
+                       R1 = _mm_and_si128(R_0007, R1);
+                       R2 = _mm_or_si128(R1, R2);              /* R2 = lowers */
+
+                       /* Handle the upper color. */
+                       /* R = ((P<<3) & 0x00F8) | ((P>>13) & 0x0007) */
+                       R1 = _mm_slli_epi16(R0, 3);
+                       R1 = _mm_and_si128(R_00F8, R1);
+
+                       R0 = _mm_srli_epi16(R0, 2);
+                       R0 = _mm_and_si128(R_0007, R0);
+                       R1 = _mm_or_si128(R0, R1);              /* R1 = uppers */
+
+                       /* Add alpha (or zero) . */
+                       R1 = _mm_or_si128(R_alpha, R1); /* + alpha */
+
+                       /* Unpack to intermix the AR and GB pieces. */
+                       R0 = _mm_unpackhi_epi16(R2, R1);
+                       R2 = _mm_unpacklo_epi16(R2, R1);
+
+                       /* Store the results. */
+                       _mm_store_si128((__m128i *) dst, R2);  dst += (128/8);
+                       _mm_store_si128((__m128i *) dst, R0);  dst += (128/8);
+                       w -= 8;
+               }
+
+               /* Handle any remainder. */
+               if (w > 0)
+               {
+                       general_RGB565ToARGB_16u32u_C3C4((const UINT16*) src, srcStep,
+                               (UINT32*) dst, dstStep, w, 1, alpha, TRUE);
+                       src += w * sizeof(UINT16);
+                       dst += w * sizeof(UINT32);
+               }
+
+               /* Bump to the start of the next row. */
+               src += srcRowBump;
+               dst += dstRowBump;
+       }
+
+       return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+pstatus_t sse3_RGB565ToARGB_16u32u_C3C4(
+       const UINT16* pSrc, INT32 srcStep,
+       UINT32* pDst, INT32 dstStep,
+       UINT32 width, UINT32 height,
+       BOOL alpha, BOOL invert)
+{
+       if (invert)
+       {
+               return sse3_RGB565ToARGB_16u32u_C3C4_invert(pSrc, srcStep,
+                       pDst, dstStep, width, height, alpha);
+       }
+       else
+       {
+               return sse3_RGB565ToARGB_16u32u_C3C4_no_invert(pSrc, srcStep,
+                       pDst, dstStep, width, height, alpha);
+       }
+}
+#endif /* WITH_SSE2 */
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_16to32bpp_opt(
+       primitives_t *prims)
+{
+#ifdef WITH_SSE2
+    if (IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
+       {
+               prims->RGB565ToARGB_16u32u_C3C4 = sse3_RGB565ToARGB_16u32u_C3C4;
+       }
+#endif
+}
diff --git a/libfreerdp/primitives/prim_YCoCg.c b/libfreerdp/primitives/prim_YCoCg.c
new file mode 100644 (file)
index 0000000..3e75056
--- /dev/null
@@ -0,0 +1,121 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * YCoCg<->RGB Color conversion operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+
+#include "prim_internal.h"
+#include "prim_YCoCg.h"
+
+#ifndef MINMAX
+#define MINMAX(_v_, _l_, _h_) \
+       ((_v_) < (_l_) ? (_l_) : ((_v_) > (_h_) ? (_h_) : (_v_)))
+#endif /* !MINMAX */
+
+/* ------------------------------------------------------------------------- */
+pstatus_t general_YCoCgRToRGB_8u_AC4R(
+       const BYTE *pSrc, INT32 srcStep,
+       BYTE *pDst, INT32 dstStep,
+       UINT32 width, UINT32 height,
+       UINT8 shift,
+       BOOL withAlpha,
+       BOOL invert)
+{
+       const BYTE *sptr = pSrc;
+       BYTE *dptr = pDst;
+       int cll = shift - 1;  /* -1 builds in the /2's */
+       int x,y;
+       int srcRowBump = srcStep - width*sizeof(UINT32);
+       int dstRowBump = dstStep - width*sizeof(UINT32);
+       if (invert)
+       {
+               for (y=0; y<height; y++)
+               {
+                       for (x=0; x<width; x++)
+                       {
+                               INT16 cg, co, y, t, r, g, b;
+                               BYTE a;
+
+                               /* Note: shifts must be done before sign-conversion. */
+                               cg = (INT16) ((INT8) ((*sptr++) << cll));
+                               co = (INT16) ((INT8) ((*sptr++) << cll));
+                               y = (INT16) (*sptr++);  /* UINT8->INT16 */
+                               a = *sptr++;
+                               if (!withAlpha) a = 0xFFU;
+                               t  = y - cg;
+                               r  = t + co;
+                               g  = y + cg;
+                               b  = t - co;
+                               *dptr++ = (BYTE) MINMAX(r, 0, 255);
+                               *dptr++ = (BYTE) MINMAX(g, 0, 255);
+                               *dptr++ = (BYTE) MINMAX(b, 0, 255);
+                               *dptr++ = a;
+                       }
+                       sptr += srcRowBump;
+                       dptr += dstRowBump;
+               }
+       }
+       else
+       {
+               for (y=0; y<height; y++)
+               {
+                       for (x=0; x<width; x++)
+                       {
+                               INT16 cg, co, y, t, r, g, b;
+                               BYTE a;
+
+                               /* Note: shifts must be done before sign-conversion. */
+                               cg = (INT16) ((INT8) ((*sptr++) << cll));
+                               co = (INT16) ((INT8) ((*sptr++) << cll));
+                               y = (INT16) (*sptr++);  /* UINT8->INT16 */
+                               a = *sptr++;
+                               if (!withAlpha) a = 0xFFU;
+                               t  = y - cg;
+                               r  = t + co;
+                               g  = y + cg;
+                               b  = t - co;
+                               *dptr++ = (BYTE) MINMAX(b, 0, 255);
+                               *dptr++ = (BYTE) MINMAX(g, 0, 255);
+                               *dptr++ = (BYTE) MINMAX(r, 0, 255);
+                               *dptr++ = a;
+                       }
+                       sptr += srcRowBump;
+                       dptr += dstRowBump;
+               }
+       }
+       return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_YCoCg(primitives_t* prims)
+{
+       prims->YCoCgRToRGB_8u_AC4R = general_YCoCgRToRGB_8u_AC4R;
+
+       primitives_init_YCoCg_opt(prims);
+}
+
+/* ------------------------------------------------------------------------- */
+void primitives_deinit_YCoCg(primitives_t* prims)
+{
+       /* Nothing to do. */
+}
diff --git a/libfreerdp/primitives/prim_YCoCg.h b/libfreerdp/primitives/prim_YCoCg.h
new file mode 100644 (file)
index 0000000..aa3929a
--- /dev/null
@@ -0,0 +1,31 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * YCoCg<->RGB color conversion operations.
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __GNUC__
+# pragma once
+#endif
+
+#ifndef __PRIM_YCOCG_H_INCLUDED__
+#define __PRIM_YCOCG_H_INCLUDED__
+
+pstatus_t general_YCoCgRToRGB_8u_AC4R(const BYTE *pSrc, INT32 srcStep, BYTE *pDst, INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift, BOOL withAlpha, BOOL invert);
+
+void primitives_init_YCoCg_opt(primitives_t* prims);
+
+#endif /* !__PRIM_YCOCG_H_INCLUDED__ */
diff --git a/libfreerdp/primitives/prim_YCoCg_opt.c b/libfreerdp/primitives/prim_YCoCg_opt.c
new file mode 100644 (file)
index 0000000..51fce1f
--- /dev/null
@@ -0,0 +1,399 @@
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Optimized YCoCg<->RGB conversion operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <winpr/sysinfo.h>
+
+#ifdef WITH_SSE2
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#elif defined(WITH_NEON)
+#include <arm_neon.h>
+#endif /* WITH_SSE2 else WITH_NEON */
+
+#include "prim_internal.h"
+#include "prim_templates.h"
+#include "prim_YCoCg.h"
+
+#ifdef WITH_SSE2
+
+/* ------------------------------------------------------------------------- */
+static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(
+       const BYTE *pSrc, INT32 srcStep,
+       BYTE *pDst, INT32 dstStep,
+       UINT32 width, UINT32 height,
+       UINT8 shift,
+       BOOL withAlpha)
+{
+       const BYTE *sptr = pSrc;
+       BYTE *dptr = (BYTE *) pDst;
+       int sRowBump = srcStep - width*sizeof(UINT32);
+       int dRowBump = dstStep - width*sizeof(UINT32);
+       int h;
+       /* Shift left by "shift" and divide by two is the same as shift
+        * left by "shift-1".
+        */
+       int dataShift = shift - 1;
+       BYTE mask = (BYTE) (0xFFU << dataShift);
+
+       /* Let's say the data is of the form:
+        * y0y0o0g0 a1y1o1g1 a2y2o2g2...
+        * Apply:
+        * |R|   | 1  1/2 -1/2 |   |y|
+        * |G| = | 1  0    1/2 | * |o|
+        * |B|   | 1 -1/2 -1/2 |   |g|
+        * where Y is 8-bit unsigned and o & g are 8-bit signed.
+        */
+
+       if ((width < 8) || (ULONG_PTR) dptr & 0x03)
+       {
+               /* Too small, or we'll never hit a 16-byte boundary.  Punt. */
+               return general_YCoCgRToRGB_8u_AC4R(pSrc, srcStep,
+                       pDst, dstStep, width, height, shift, withAlpha, TRUE);
+       }
+
+       for (h=0; h<height; h++)
+       {
+               int w = width;
+               BOOL onStride;
+
+               /* Get to a 16-byte destination boundary. */
+               if ((ULONG_PTR) dptr & 0x0f)
+               {
+                       int startup = (16 - ((ULONG_PTR) dptr & 0x0f)) / 4;
+                       if (startup > width) startup = width;
+                       general_YCoCgRToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep,
+                               startup, 1, shift, withAlpha, TRUE);
+                       sptr += startup * sizeof(UINT32);
+                       dptr += startup * sizeof(UINT32);
+                       w -= startup;
+               }
+
+               /* Each loop handles eight pixels at a time. */
+               onStride = (((ULONG_PTR) sptr & 0x0f) == 0) ? TRUE : FALSE;
+               while (w >= 8)
+               {
+                       __m128i R0, R1, R2, R3, R4, R5, R6, R7;
+                       if (onStride)
+                       {
+                               /* The faster path, 16-byte aligned load. */
+                               R0 = _mm_load_si128((__m128i *) sptr);  sptr += (128/8);
+                               R1 = _mm_load_si128((__m128i *) sptr);  sptr += (128/8);
+                       }
+                       else
+                       {
+                               /* Off-stride, slower LDDQU load. */
+                               R0 = _mm_lddqu_si128((__m128i *) sptr);  sptr += (128/8);
+                               R1 = _mm_lddqu_si128((__m128i *) sptr);  sptr += (128/8);
+                       }
+                               /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
+                               /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
+
+                       /* Shuffle to pack all the like types together. */
+                       R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
+                       R3 = _mm_shuffle_epi8(R0, R2);
+                       R4 = _mm_shuffle_epi8(R1, R2);
+                               /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
+                               /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
+                       R5 = _mm_unpackhi_epi32(R3, R4);
+                       R6 = _mm_unpacklo_epi32(R3, R4);
+                               /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
+                               /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
+                       /* Save alphas aside */
+                       if (withAlpha) R7 = _mm_unpackhi_epi64(R5, R5);
+                       else R7 = _mm_set1_epi32(0xFFFFFFFFU);
+                               /* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
+                       /* Expand Y's from 8-bit unsigned to 16-bit signed. */
+                       R1 = _mm_set1_epi32(0);
+                       R0 = _mm_unpacklo_epi8(R5, R1);
+                               /* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
+                       /* Shift Co's and Cg's by (shift-1).  -1 covers division by two.
+                        * Note: this must be done before sign-conversion.
+                        * Note also there is no slli_epi8, so we have to use a 16-bit
+                        * version and then mask.
+                        */
+                       R6 = _mm_slli_epi16(R6, dataShift);
+                       R1 = _mm_set1_epi8(mask);
+                       R6 = _mm_and_si128(R6, R1);
+                               /* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
+                       /* Expand Co's from 8-bit signed to 16-bit signed */
+                       R1 = _mm_unpackhi_epi8(R6, R6);
+                       R1 = _mm_srai_epi16(R1, 8);
+                               /* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
+                       /* Expand Cg's form 8-bit signed to 16-bit signed */
+                       R2 = _mm_unpacklo_epi8(R6, R6);
+                       R2 = _mm_srai_epi16(R2, 8);
+                               /* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
+                       /* Get Y - halfCg and save */
+                       R6 = _mm_subs_epi16(R0, R2);
+
+                       /* R = (Y-halfCg) + halfCo */
+                       R3 = _mm_adds_epi16(R6, R1);
+                               /* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
+                       /* G = Y + Cg(/2) */
+                       R4 = _mm_adds_epi16(R0, R2);
+                               /* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
+                       /* B = (Y-halfCg) - Co(/2) */
+                       R5 = _mm_subs_epi16(R6, R1);
+                               /* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
+
+                       /* Repack R's & B's.  */
+                       R0 = _mm_packus_epi16(R3, R5);
+                               /* R0 = R7R6R5R4 R3R2R1R0 B7B6B5B4 B3B2B1B0 */
+                       /* Repack G's. */
+                       R1 = _mm_packus_epi16(R4, R4);
+                               /* R1 = G7G6G6G4 G3G2G1G0 G7G6G6G4 G3G2G1G0 */
+                       /* And add the A's. */
+                       R1 = _mm_unpackhi_epi64(R1, R7);
+                               /* R1 = A7A6A6A4 A3A2A1A0 G7G6G6G4 G3G2G1G0 */
+
+                       /* Now do interleaving again. */
+                       R2 = _mm_unpacklo_epi8(R0, R1);
+                               /* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
+                       R3 = _mm_unpackhi_epi8(R0, R1);
+                               /* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
+                       R4 = _mm_unpacklo_epi16(R2, R3);
+                               /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
+                       R5 = _mm_unpackhi_epi16(R2, R3);
+                               /* R5 = A7R7G7B7 A6R6G6B6 A5R6G5B5 A4R4G4B4 */
+
+                       _mm_store_si128((__m128i *) dptr, R4);  dptr += (128/8);
+                       _mm_store_si128((__m128i *) dptr, R5);  dptr += (128/8);
+                       w -= 8;
+               }
+
+               /* Handle any remainder pixels. */
+               if (w > 0) {
+                       general_YCoCgRToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep,
+                               w, 1, shift, withAlpha, TRUE);
+                       sptr += w * sizeof(UINT32);
+                       dptr += w * sizeof(UINT32);
+               }
+
+               sptr += sRowBump;
+               dptr += dRowBump;
+       }
+       return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
+       const BYTE *pSrc, INT32 srcStep,
+       BYTE *pDst, INT32 dstStep,
+       UINT32 width, UINT32 height,
+       UINT8 shift,
+       BOOL withAlpha)
+{
+       const BYTE *sptr = pSrc;
+       BYTE *dptr = (BYTE *) pDst;
+       int sRowBump = srcStep - width*sizeof(UINT32);
+       int dRowBump = dstStep - width*sizeof(UINT32);
+       int h;
+       /* Shift left by "shift" and divide by two is the same as shift
+        * left by "shift-1".
+        */
+       int dataShift = shift - 1;
+       BYTE mask = (BYTE) (0xFFU << dataShift);
+
+       /* Let's say the data is of the form:
+        * y0y0o0g0 a1y1o1g1 a2y2o2g2...
+        * Apply:
+        * |R|   | 1  1/2 -1/2 |   |y|
+        * |G| = | 1  0    1/2 | * |o|
+        * |B|   | 1 -1/2 -1/2 |   |g|
+        * where Y is 8-bit unsigned and o & g are 8-bit signed.
+        */
+
+       if ((width < 8) || (ULONG_PTR) dptr & 0x03)
+       {
+               /* Too small, or we'll never hit a 16-byte boundary.  Punt. */
+               return general_YCoCgRToRGB_8u_AC4R(pSrc, srcStep,
+                       pDst, dstStep, width, height, shift, withAlpha, FALSE);
+       }
+
+       for (h=0; h<height; h++)
+       {
+               int w = width;
+               BOOL onStride;
+
+               /* Get to a 16-byte destination boundary. */
+               if ((ULONG_PTR) dptr & 0x0f)
+               {
+                       int startup = (16 - ((ULONG_PTR) dptr & 0x0f)) / 4;
+                       if (startup > width) startup = width;
+                       general_YCoCgRToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep,
+                               startup, 1, shift, withAlpha, FALSE);
+                       sptr += startup * sizeof(UINT32);
+                       dptr += startup * sizeof(UINT32);
+                       w -= startup;
+               }
+
+               /* Each loop handles eight pixels at a time. */
+               onStride = (((ULONG_PTR) sptr & 0x0f) == 0) ? TRUE : FALSE;
+               while (w >= 8)
+               {
+                       __m128i R0, R1, R2, R3, R4, R5, R6, R7;
+                       if (onStride)
+                       {
+                               /* The faster path, 16-byte aligned load. */
+                               R0 = _mm_load_si128((__m128i *) sptr);  sptr += (128/8);
+                               R1 = _mm_load_si128((__m128i *) sptr);  sptr += (128/8);
+                       }
+                       else
+                       {
+                               /* Off-stride, slower LDDQU load. */
+                               R0 = _mm_lddqu_si128((__m128i *) sptr);  sptr += (128/8);
+                               R1 = _mm_lddqu_si128((__m128i *) sptr);  sptr += (128/8);
+                       }
+                               /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
+                               /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
+
+                       /* Shuffle to pack all the like types together. */
+                       R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
+                       R3 = _mm_shuffle_epi8(R0, R2);
+                       R4 = _mm_shuffle_epi8(R1, R2);
+                               /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
+                               /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
+                       R5 = _mm_unpackhi_epi32(R3, R4);
+                       R6 = _mm_unpacklo_epi32(R3, R4);
+                               /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
+                               /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
+                       /* Save alphas aside */
+                       if (withAlpha) R7 = _mm_unpackhi_epi64(R5, R5);
+                       else R7 = _mm_set1_epi32(0xFFFFFFFFU);
+                               /* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
+                       /* Expand Y's from 8-bit unsigned to 16-bit signed. */
+                       R1 = _mm_set1_epi32(0);
+                       R0 = _mm_unpacklo_epi8(R5, R1);
+                               /* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
+                       /* Shift Co's and Cg's by (shift-1).  -1 covers division by two.
+                        * Note: this must be done before sign-conversion.
+                        * Note also there is no slli_epi8, so we have to use a 16-bit
+                        * version and then mask.
+                        */
+                       R6 = _mm_slli_epi16(R6, dataShift);
+                       R1 = _mm_set1_epi8(mask);
+                       R6 = _mm_and_si128(R6, R1);
+                               /* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
+                       /* Expand Co's from 8-bit signed to 16-bit signed */
+                       R1 = _mm_unpackhi_epi8(R6, R6);
+                       R1 = _mm_srai_epi16(R1, 8);
+                               /* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
+                       /* Expand Cg's form 8-bit signed to 16-bit signed */
+                       R2 = _mm_unpacklo_epi8(R6, R6);
+                       R2 = _mm_srai_epi16(R2, 8);
+                               /* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
+                       /* Get Y - halfCg and save */
+                       R6 = _mm_subs_epi16(R0, R2);
+
+                       /* R = (Y-halfCg) + halfCo */
+                       R3 = _mm_adds_epi16(R6, R1);
+                               /* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
+                       /* G = Y + Cg(/2) */
+                       R4 = _mm_adds_epi16(R0, R2);
+                               /* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
+                       /* B = (Y-halfCg) - Co(/2) */
+                       R5 = _mm_subs_epi16(R6, R1);
+                               /* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
+
+                       /* Repack R's & B's.  */
+                       /* This line is the only diff between inverted and non-inverted.
+                        * Unfortunately, it would be expensive to check "inverted" 
+                        * every time through this loop.
+                        */
+                       R0 = _mm_packus_epi16(R5, R3);
+                               /* R0 = B7B6B5B4 B3B2B1B0 R7R6R5R4 R3R2R1R0 */
+                       /* Repack G's. */
+                       R1 = _mm_packus_epi16(R4, R4);
+                               /* R1 = G7G6G6G4 G3G2G1G0 G7G6G6G4 G3G2G1G0 */
+                       /* And add the A's. */
+                       R1 = _mm_unpackhi_epi64(R1, R7);
+                               /* R1 = A7A6A6A4 A3A2A1A0 G7G6G6G4 G3G2G1G0 */
+
+                       /* Now do interleaving again. */
+                       R2 = _mm_unpacklo_epi8(R0, R1);
+                               /* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
+                       R3 = _mm_unpackhi_epi8(R0, R1);
+                               /* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
+                       R4 = _mm_unpacklo_epi16(R2, R3);
+                               /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
+                       R5 = _mm_unpackhi_epi16(R2, R3);
+                               /* R5 = A7R7G7B7 A6R6G6B6 A5R6G5B5 A4R4G4B4 */
+
+                       _mm_store_si128((__m128i *) dptr, R4);  dptr += (128/8);
+                       _mm_store_si128((__m128i *) dptr, R5);  dptr += (128/8);
+                       w -= 8;
+               }
+
+               /* Handle any remainder pixels. */
+               if (w > 0) {
+                       general_YCoCgRToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep,
+                               w, 1, shift, withAlpha, FALSE);
+                       sptr += w * sizeof(UINT32);
+                       dptr += w * sizeof(UINT32);
+               }
+
+               sptr += sRowBump;
+               dptr += dRowBump;
+       }
+       return PRIMITIVES_SUCCESS;
+}
+#endif /* WITH_SSE2 */
+
+#ifdef WITH_SSE2
+/* ------------------------------------------------------------------------- */
+pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(
+       const BYTE *pSrc, INT32 srcStep,
+       BYTE *pDst, INT32 dstStep,
+       UINT32 width, UINT32 height,
+       UINT8 shift,
+       BOOL withAlpha,
+       BOOL invert)
+{
+       if (invert) {
+               return ssse3_YCoCgRToRGB_8u_AC4R_invert(pSrc, srcStep, pDst, dstStep,
+                       width, height, shift, withAlpha);
+       }
+       else {
+               return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(pSrc, srcStep, pDst, dstStep,
+                       width, height, shift, withAlpha);
+       }
+}
+#endif /* WITH_SSE2 */
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_YCoCg_opt(primitives_t* prims)
+{
+/* While IPP acknowledges the existence of YCoCg-R, it doesn't currently
+ * include any routines to work with it, especially with variable shift
+ * width.
+ */
+#if defined(WITH_SSE2)
+       if (IsProcessorFeaturePresentEx(PF_EX_SSSE3)
+                       && IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
+       {
+               prims->YCoCgRToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R;
+       }
+#endif /* WITH_SSE2 */
+}
index 06418fb..e1a248c 100644 (file)
@@ -75,4 +75,14 @@ extern void primitives_init_colors(
 extern void primitives_deinit_colors(
        primitives_t *prims);
 
+extern void primitives_init_YCoCg(
+       primitives_t *prims);
+extern void primitives_deinit_YCoCg(
+       primitives_t *prims);
+
+extern void primitives_init_16to32bpp(
+       primitives_t *prims);
+extern void primitives_deinit_16to32bpp(
+       primitives_t *prims);
+
 #endif /* !__PRIM_INTERNAL_H_INCLUDED__ */
index 33764f1..dc8d038 100644 (file)
@@ -49,6 +49,8 @@ void primitives_init(void)
        primitives_init_shift(pPrimitives);
        primitives_init_sign(pPrimitives);
        primitives_init_colors(pPrimitives);
+       primitives_init_YCoCg(pPrimitives);
+       primitives_init_16to32bpp(pPrimitives);
 }
 
 /* ------------------------------------------------------------------------- */
@@ -75,6 +77,8 @@ void primitives_deinit(void)
        primitives_deinit_shift(pPrimitives);
        primitives_deinit_sign(pPrimitives);
        primitives_deinit_colors(pPrimitives);
+       primitives_deinit_YCoCg(pPrimitives);
+       primitives_deinit_16to32bpp(pPrimitives);
 
        free((void*) pPrimitives);
        pPrimitives = NULL;
index 6d6898f..972cee3 100644 (file)
@@ -23,6 +23,7 @@ set(MODULE_PREFIX "PRIMITIVES_LIBRARY_TEST")
 
 set(PRIMITIVE_TEST_CFILES
        prim_test.c
+       test_16to32bpp.c
        test_add.c
        test_alphaComp.c
        test_andor.c
@@ -31,6 +32,8 @@ set(PRIMITIVE_TEST_CFILES
        test_set.c
        test_shift.c
        test_sign.c
+       test_YCoCg.c
+       ../prim_16to32bpp.c
        ../prim_add.c
        ../prim_andor.c
        ../prim_alphaComp.c
@@ -39,6 +42,8 @@ set(PRIMITIVE_TEST_CFILES
        ../prim_set.c
        ../prim_shift.c
        ../prim_sign.c
+       ../prim_YCoCg.c
+       ../prim_16to32bpp_opt.c
        ../prim_add_opt.c
        ../prim_alphaComp_opt.c
        ../prim_andor_opt.c
@@ -46,6 +51,7 @@ set(PRIMITIVE_TEST_CFILES
        ../prim_set_opt.c
        ../prim_shift_opt.c
        ../prim_sign_opt.c
+       ../prim_YCoCg_opt.c
        ../primitives.c
     )
 
@@ -138,7 +144,9 @@ endif()
 
 set_property(SOURCE ${PRIMITIVE_TEST_CFILES} PROPERTY COMPILE_FLAGS ${OPTFLAGS})
 
-target_link_libraries(prim_test rt winpr-sysinfo)
+find_library(WINPR_SYSINFO NAMES winpr-sysinfo HINTS ../../../winpr/libwinpr/sysinfo)
+target_link_libraries(prim_test rt ${WINPR_SYSINFO})
+
 if(NOT TESTING_OUTPUT_DIRECTORY)
        set(TESTING_OUTPUT_DIRECTORY .)
 endif()
index 52d5209..e9824a1 100644 (file)
@@ -227,6 +227,8 @@ void _floatprint(
 #define TEST_ALPHA                     (1<<11)
 #define TEST_AND                       (1<<12)
 #define TEST_OR                                (1<<13)
+#define TEST_YCOCG                     (1<<14)
+#define TEST_16TO32                    (1<<15)
 
 /* Specific types of testing: */
 #define TEST_FUNCTIONALITY             (1<<0)
@@ -264,9 +266,11 @@ static const test_t testList[] =
        { "rgb",                TEST_RGB },
        { "color",              TEST_RGB },
        { "colors",             TEST_RGB },
+       { "ycocg",              TEST_YCOCG },
        { "alpha",              TEST_ALPHA },
        { "and",                TEST_AND },
-       { "or",                 TEST_OR }
+       { "or",                 TEST_OR },
+       { "16to32",             TEST_16TO32 }
 };
 
 #define NUMTESTS (sizeof(testList)/sizeof(test_t))
@@ -475,6 +479,29 @@ int main(int argc, char** argv)
                        results |= test_yCbCrToRGB_16s16s_P3P3_speed();
                }
        }
+       if (testSet & TEST_YCOCG)
+       {
+               if (testTypes & TEST_FUNCTIONALITY)
+               {
+                       results |= test_YCoCgRToRGB_8u_AC4R_func();
+               }
+               if (testTypes & TEST_PERFORMANCE)
+               {
+                       results |= test_YCoCgRToRGB_8u_AC4R_speed();
+               }
+       }
+       /* 16 to 32 BPP */
+       if (testSet & TEST_16TO32)
+       {
+               if (testTypes & TEST_FUNCTIONALITY)
+               {
+                       results |= test_RGB565ToARGB_16u32u_C3C4_func();
+               }
+               if (testTypes & TEST_PERFORMANCE)
+               {
+                       results |= test_RGB565ToARGB_16u32u_C3C4_speed();
+               }
+       }
        /* ALPHA COMPOSITION */
        if (testSet & TEST_ALPHA)
        {
index f32f468..e336e3e 100644 (file)
@@ -89,6 +89,11 @@ extern int test_RGBToRGB_16s8u_P3AC4R_func(void);
 extern int test_RGBToRGB_16s8u_P3AC4R_speed(void);
 extern int test_yCbCrToRGB_16s16s_P3P3_func(void);
 extern int test_yCbCrToRGB_16s16s_P3P3_speed(void);
+extern int test_YCoCgRToRGB_8u_AC4R_func(void);
+extern int test_YCoCgRToRGB_8u_AC4R_speed(void);
+
+extern int test_RGB565ToARGB_16u32u_C3C4_func(void);
+extern int test_RGB565ToARGB_16u32u_C3C4_speed(void);
 
 extern int test_alphaComp_func(void);
 extern int test_alphaComp_speed(void);
diff --git a/libfreerdp/primitives/test/test_16to32bpp.c b/libfreerdp/primitives/test/test_16to32bpp.c
new file mode 100644 (file)
index 0000000..dc0c603
--- /dev/null
@@ -0,0 +1,188 @@
+/* test_colors.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <assert.h>
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <winpr/sysinfo.h>
+#include "prim_test.h"
+
+static const int RGB_TRIAL_ITERATIONS = 1000;
+static const float TEST_TIME = 4.0;
+
+extern pstatus_t general_RGB565ToARGB_16u32u_C3C4(
+       const UINT16* pSrc, INT32 srcStep,
+       UINT32* pDst, INT32 dstStep,
+       UINT32 width, UINT32 height,
+       BOOL alpha, BOOL invert);
+extern pstatus_t sse3_RGB565ToARGB_16u32u_C3C4(
+       const UINT16* pSrc, INT32 srcStep,
+       UINT32* pDst, INT32 dstStep,
+       UINT32 width, UINT32 height,
+       BOOL alpha, BOOL invert);
+
+/* ------------------------------------------------------------------------- */
+static BOOL try_16To32(
+       const UINT16 *data16,
+       int sOffset,
+       int dOffset,
+       int width, int height)
+{
+       BOOL success;
+       const char *sAligned = "sAlign";
+       const char *sUnaligned = "s!Align";
+       const char *dAligned = "dAlign";
+       const char *dUnaligned = "d!Align";
+       const char *sAlignStr, *dAlignStr;
+       const UINT16 *src;
+       UINT32 ALIGN(outNN1[4096+3]), ALIGN(outAN1[4096+3]),
+                  ALIGN(outNI1[4096+3]), ALIGN(outAI1[4096+3]);
+       UINT32 ALIGN(outNN2[4096+3]), ALIGN(outAN2[4096+3]),
+                  ALIGN(outNI2[4096+3]), ALIGN(outAI2[4096+3]);
+
+       assert(sOffset < 4);
+       assert(dOffset < 4);
+       assert(width*height <= 4096);
+
+       success = TRUE;
+       src = data16 + sOffset;
+       sAlignStr = (sOffset == 0) ? sAligned : sUnaligned;
+       dAlignStr = (dOffset == 0) ? dAligned : dUnaligned;
+
+       general_RGB565ToARGB_16u32u_C3C4(src, width*sizeof(UINT16),
+               outNN1+dOffset, width*sizeof(UINT32), width, height, FALSE, FALSE);
+       general_RGB565ToARGB_16u32u_C3C4(src, width*sizeof(UINT16),
+               outAN1+dOffset, width*sizeof(UINT32), width, height, TRUE, FALSE);
+       general_RGB565ToARGB_16u32u_C3C4(src, width*sizeof(UINT16),
+               outNI1+dOffset, width*sizeof(UINT32), width, height, FALSE, TRUE);
+       general_RGB565ToARGB_16u32u_C3C4(src, width*sizeof(UINT16),
+               outAI1+dOffset, width*sizeof(UINT32), width, height, TRUE, TRUE);
+
+#ifdef WITH_SSE2
+       printf("  Testing 16-to-32bpp SSE3 version (%s, %s, %dx%d)\n",
+               sAlignStr, dAlignStr, width, height);
+       if (IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
+       {
+               int i;
+
+               sse3_RGB565ToARGB_16u32u_C3C4(src, width*sizeof(UINT16),
+                       outNN2+dOffset, width*sizeof(UINT32), width, height, FALSE, FALSE);
+               sse3_RGB565ToARGB_16u32u_C3C4(src, width*sizeof(UINT16),
+                       outAN2+dOffset, width*sizeof(UINT32), width, height, TRUE, FALSE);
+               sse3_RGB565ToARGB_16u32u_C3C4(src, width*sizeof(UINT16),
+                       outNI2+dOffset, width*sizeof(UINT32), width, height, FALSE, TRUE);
+               sse3_RGB565ToARGB_16u32u_C3C4(src, width*sizeof(UINT16),
+                       outAI2+dOffset, width*sizeof(UINT32), width, height, TRUE, TRUE);
+               for (i=0; i<width * height; i++)
+               {
+                       int s = i + sOffset;
+                       int d = i + dOffset;
+                       if (outNN1[d] != outNN2[d])
+                       {
+                               printf("16To32bpp-SSE FAIL (%s, %s, !alpha, !invert)"
+                                       " 0x%04x -> 0x%08x rather than 0x%08x\n",
+                                       sAlignStr, dAlignStr, data16[s], outNN2[d], outNN1[d]);
+                               success = FALSE;
+                       }
+                       if (outAN1[d] != outAN2[d])
+                       {
+                               printf("16To32bpp-SSE FAIL (%s, %s, alpha, !invert)"
+                                       " 0x%04x -> 0x%08x rather than 0x%08x\n",
+                                       sAlignStr, dAlignStr, data16[s], outAN2[d], outAN1[d]);
+                               success = FALSE;
+                       }
+                       if (outNI1[d] != outNI2[d])
+                       {
+                               printf("16To32bpp-SSE FAIL (%s, %s, !alpha, invert)"
+                                       " 0x%04x -> 0x%08x rather than 0x%08x\n",
+                                       sAlignStr, dAlignStr, data16[s], outNI2[d], outNI1[d]);
+                               success = FALSE;
+                       }
+                       if (outAI1[d] != outAI2[d])
+                       {
+                               printf("16To32bpp-SSE FAIL (%s, %s, alpha, invert)"
+                                       " 0x%04x -> 0x%08x rather than 0x%08x\n",
+                                       sAlignStr, dAlignStr, data16[s], outAI2[d], outNI1[d]);
+                               success = FALSE;
+                       }
+               }
+       }
+#endif /* WITH_SSE2 */
+
+       return success;
+}
+
+
+/* ------------------------------------------------------------------------- */
+int test_RGB565ToARGB_16u32u_C3C4_func(void)
+{
+       INT16 ALIGN(data16[4096+3]);
+       BOOL success;
+
+       success = TRUE;
+       get_random_data(data16, sizeof(data16));
+
+       /* Source aligned, dest aligned, 64x64 */
+       success &= try_16To32(data16, 0, 0, 64, 64);
+       /* Source !aligned, dest aligned, 64x64 */
+       success &= try_16To32(data16, 1, 0, 64, 64);
+       /* Source aligned, dest !aligned, 64x64 */
+       success &= try_16To32(data16, 0, 1, 64, 64);
+       /* Source !aligned, dest !aligned, 64x64 */
+       success &= try_16To32(data16, 1, 1, 64, 64);
+       /* Odd size */
+       success &= try_16To32(data16, 0, 0, 17, 53);
+
+       if (success) printf("All RGB565ToARGB_16u32u_C3C4 tests passed.\n");
+       return success ? SUCCESS : FAILURE;
+}
+
+/* ------------------------------------------------------------------------- */
+STD_SPEED_TEST(
+       test16to32_speed, UINT16, UINT32, PRIM_NOP,
+       TRUE, general_RGB565ToARGB_16u32u_C3C4(
+               (const UINT16 *) src1, 64*2, (UINT32 *) dst, 64*4, 
+               64,64, TRUE, TRUE),
+#ifdef WITH_SSE2
+       TRUE, sse3_RGB565ToARGB_16u32u_C3C4(
+               (const UINT16 *) src1, 64*2, (UINT32 *) dst, 64*4,
+               64,64, TRUE, TRUE),
+               PF_SSE3_INSTRUCTIONS_AVAILABLE, FALSE,
+#else
+       FALSE, PRIM_NOP, 0, FALSE,
+#endif
+       FALSE, PRIM_NOP);
+
+/* ------------------------------------------------------------------------- */
+int test_RGB565ToARGB_16u32u_C3C4_speed(void)
+{
+       UINT16 ALIGN(src[4096]);
+       UINT32 ALIGN(dst[4096]);
+       int i;
+       int size_array[] = { 64 };
+
+       get_random_data(src, sizeof(src));
+
+       test16to32_speed("16-to-32bpp", "aligned", 
+               (const UINT16 *) src, 0, 0, (UINT32 *) dst,
+               size_array, 1, RGB_TRIAL_ITERATIONS, TEST_TIME);
+       return SUCCESS;
+}
diff --git a/libfreerdp/primitives/test/test_YCoCg.c b/libfreerdp/primitives/test/test_YCoCg.c
new file mode 100644 (file)
index 0000000..f09acd5
--- /dev/null
@@ -0,0 +1,109 @@
+/* test_YCoCg.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <winpr/sysinfo.h>
+#include "prim_test.h"
+
+static const int YCOCG_TRIAL_ITERATIONS = 20000;
+static const float TEST_TIME = 4.0;
+
+extern pstatus_t general_YCoCgRToRGB_8u_AC4R(const BYTE *pSrc, INT32 srcStep,
+       BYTE *pDst, INT32 dstStep, UINT32 width, UINT32 height,
+       UINT8 shift, BOOL withAlpha, BOOL invert);
+extern pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(const BYTE *pSrc, INT32 srcStep,
+       BYTE *pDst, INT32 dstStep, UINT32 width, UINT32 height,
+       UINT8 shift, BOOL withAlpha, BOOL invert);
+
+/* ------------------------------------------------------------------------- */
+int test_YCoCgRToRGB_8u_AC4R_func(void)
+{
+       INT32 ALIGN(in[4098]);
+       INT32 ALIGN(out_c[4098]), ALIGN(out_c_inv[4098]);
+       INT32 ALIGN(out_sse[4098]), ALIGN(out_sse_inv[4098]);
+       char testStr[256];
+       BOOL failed = FALSE;
+       int i;
+
+       testStr[0] = '\0';
+       get_random_data(in, sizeof(in));
+
+       general_YCoCgRToRGB_8u_AC4R((const BYTE *) (in+1), 63*4,
+               (BYTE *) out_c, 63*4, 63, 61, 2, TRUE, FALSE);
+       general_YCoCgRToRGB_8u_AC4R((const BYTE *) (in+1), 63*4,
+               (BYTE *) out_c_inv, 63*4, 63, 61, 2, TRUE, TRUE);
+#ifdef WITH_SSE2
+       if (IsProcessorFeaturePresentEx(PF_EX_SSSE3))
+       {
+               strcat(testStr, " SSSE3");
+               ssse3_YCoCgRToRGB_8u_AC4R((const BYTE *) (in+1), 63*4,
+                       (BYTE *) out_sse, 63*4, 63, 61, 2, TRUE, FALSE);
+
+               for (i=0; i<63*61; ++i)
+               {
+                       if (out_c[i] != out_sse[i]) {
+                               printf("YCoCgRToRGB-SSE FAIL[%d]: 0x%08x -> C 0x%08x vs SSE 0x%08x\n", i,
+                                       in[i+1], out_c[i], out_sse[i]);
+                               failed = TRUE;
+                       }
+               }
+               ssse3_YCoCgRToRGB_8u_AC4R((const BYTE *) (in+1), 63*4,
+                       (BYTE *) out_sse_inv, 63*4, 63, 61, 2, TRUE, TRUE);
+               for (i=0; i<63*61; ++i)
+               {
+                       if (out_c_inv[i] != out_sse_inv[i]) {
+                               printf("YCoCgRToRGB-SSE inverted FAIL[%d]: 0x%08x -> C 0x%08x vs SSE 0x%08x\n", i,
+                                       in[i+1], out_c_inv[i], out_sse_inv[i]);
+                               failed = TRUE;
+                       }
+               }
+       }
+#endif /* i386 */
+       if (!failed) printf("All YCoCgRToRGB_8u_AC4R tests passed (%s).\n", testStr);
+       return (failed > 0) ? FAILURE : SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+STD_SPEED_TEST(
+       ycocg_to_rgb_speed, const BYTE, BYTE, PRIM_NOP,
+       TRUE, general_YCoCgRToRGB_8u_AC4R(src1, 64*4, dst, 64*4, 64, 64, 2, FALSE, FALSE),
+#ifdef WITH_SSE2
+       TRUE, ssse3_YCoCgRToRGB_8u_AC4R(src1, 64*4, dst, 64*4, 64, 64, 2, FALSE, FALSE),
+               PF_EX_SSSE3, TRUE,
+#else
+       FALSE, PRIM_NOP, 0, FALSE,
+#endif
+       FALSE, PRIM_NOP);
+
+int test_YCoCgRToRGB_8u_AC4R_speed(void)
+{
+       INT32 ALIGN(in[4096]);
+       INT32 ALIGN(out[4096]);
+       int i;
+       int size_array[] = { 64 };
+
+       get_random_data(in, sizeof(in));
+
+       ycocg_to_rgb_speed("YCoCgToRGB", "aligned", (const BYTE *) in,
+               0, 0, (BYTE *) out,
+               size_array, 1, YCOCG_TRIAL_ITERATIONS, TEST_TIME);
+       return SUCCESS;
+}