const INT16 *pSrc[3], INT32 srcStep,
BYTE *pDst, INT32 dstStep,
const prim_size_t *roi);
+typedef pstatus_t (*__YCoCgRToRGB_8u_AC4R_t)(
+ const BYTE *pSrc, INT32 srcStep,
+ BYTE *pDst, INT32 dstStep,
+ UINT32 width, UINT32 height,
+ UINT8 shift,
+ BOOL withAlpha,
+ BOOL invert);
+typedef pstatus_t (*__RGB565ToARGB_16u32u_C3C4_t)(
+ const UINT16* pSrc, INT32 srcStep,
+ UINT32* pDst, INT32 dstStep,
+ UINT32 width, UINT32 height,
+ BOOL alpha, BOOL invert);
typedef pstatus_t (*__andC_32u_t)(
const UINT32 *pSrc,
UINT32 val,
__yCbCrToRGB_16s16s_P3P3_t yCbCrToRGB_16s16s_P3P3;
__RGBToYCbCr_16s16s_P3P3_t RGBToYCbCr_16s16s_P3P3;
__RGBToRGB_16s8u_P3AC4R_t RGBToRGB_16s8u_P3AC4R;
+ __YCoCgRToRGB_8u_AC4R_t YCoCgRToRGB_8u_AC4R;
+ __RGB565ToARGB_16u32u_C3C4_t RGB565ToARGB_16u32u_C3C4;
} primitives_t;
#ifdef __cplusplus
* Reads the supplied order header and extracts the compression
* order code ID.
*/
-static UINT32 ExtractCodeId(BYTE bOrderHdr)
+static INLINE UINT32 ExtractCodeId(BYTE bOrderHdr)
{
- int code;
-
- switch (bOrderHdr)
- {
- case MEGA_MEGA_BG_RUN:
- case MEGA_MEGA_FG_RUN:
- case MEGA_MEGA_SET_FG_RUN:
- case MEGA_MEGA_DITHERED_RUN:
- case MEGA_MEGA_COLOR_RUN:
- case MEGA_MEGA_FGBG_IMAGE:
- case MEGA_MEGA_SET_FGBG_IMAGE:
- case MEGA_MEGA_COLOR_IMAGE:
- case SPECIAL_FGBG_1:
- case SPECIAL_FGBG_2:
- case SPECIAL_WHITE:
- case SPECIAL_BLACK:
- return bOrderHdr;
+ if ((bOrderHdr & 0xC0U) != 0xC0U) {
+ /* REGULAR orders
+ * (000x xxxx, 001x xxxx, 010x xxxx, 011x xxxx, 100x xxxx)
+ */
+ return bOrderHdr >> 5;
}
- code = bOrderHdr >> 5;
- switch (code)
- {
- case REGULAR_BG_RUN:
- case REGULAR_FG_RUN:
- case REGULAR_COLOR_RUN:
- case REGULAR_FGBG_IMAGE:
- case REGULAR_COLOR_IMAGE:
- return code;
+ else if ((bOrderHdr & 0xF0U) == 0xF0U) {
+ /* MEGA and SPECIAL orders (0xF*) */
+ return bOrderHdr;
+ }
+ else {
+ /* LITE orders
+ * 1100 xxxx, 1101 xxxx, 1110 xxxx)
+ */
+ return bOrderHdr >> 4;
}
- return bOrderHdr >> 4;
}
/**
* Extract the run length of a compression order.
*/
-static UINT32 ExtractRunLength(UINT32 code, BYTE* pbOrderHdr, UINT32* advance)
+static INLINE UINT32 ExtractRunLength(UINT32 code, BYTE* pbOrderHdr, UINT32* advance)
{
UINT32 runLength;
UINT32 ladvance;
#include <freerdp/api.h>
#include <freerdp/freerdp.h>
+#include <freerdp/primitives.h>
#include <freerdp/codec/color.h>
int freerdp_get_pixel(BYTE* data, int x, int y, int width, int height, int bpp)
}
else if (dstBpp == 32)
{
- int i;
- UINT32 pixel;
- UINT16* src16;
- UINT32* dst32;
- BYTE red, green, blue;
+ primitives_t* prims;
if (dstData == NULL)
- dstData = (BYTE*) malloc(width * height * 4);
-
- src16 = (UINT16*) srcData;
- dst32 = (UINT32*) dstData;
-
- if (clrconv->alpha)
- {
- if (clrconv->invert)
- {
- for (i = width * height; i > 0; i--)
- {
- pixel = *src16;
- src16++;
- GetBGR16(red, green, blue, pixel);
- pixel = ARGB32(0xFF, red, green, blue);
- *dst32 = pixel;
- dst32++;
- }
- }
- else
- {
- for (i = width * height; i > 0; i--)
- {
- pixel = *src16;
- src16++;
- GetBGR16(red, green, blue, pixel);
- pixel = ABGR32(0xFF, red, green, blue);
- *dst32 = pixel;
- dst32++;
- }
- }
- }
- else
{
- if (clrconv->invert)
+ if (posix_memalign((void **) &dstData, 16,
+ width * height * sizeof(UINT32)) != 0)
{
- for (i = width * height; i > 0; i--)
- {
- pixel = *src16;
- src16++;
- GetBGR16(red, green, blue, pixel);
- pixel = RGB32(red, green, blue);
- *dst32 = pixel;
- dst32++;
- }
- }
- else
- {
- for (i = width * height; i > 0; i--)
- {
- pixel = *src16;
- src16++;
- GetBGR16(red, green, blue, pixel);
- pixel = BGR32(red, green, blue);
- *dst32 = pixel;
- dst32++;
- }
+ return NULL;
}
}
+ prims = primitives_get();
+ prims->RGB565ToARGB_16u32u_C3C4(
+ (const UINT16*) srcData, width * sizeof(UINT16),
+ (UINT32*) dstData, width * sizeof(UINT32),
+ width, height, clrconv->alpha, clrconv->invert);
+
return dstData;
}
#include <winpr/print.h>
#include <freerdp/codec/bitmap.h>
+#include <freerdp/primitives.h>
#include "planar.h"
srcp++;
}
+ if (FormatHeader & PLANAR_FORMAT_HEADER_CLL_MASK)
+ {
+ /* The data is in YCoCg colorspace rather than RGB. */
+ if (FormatHeader & PLANAR_FORMAT_HEADER_CS)
+ {
+ static BOOL been_warned = FALSE;
+ if (!been_warned)
+ fprintf(stderr, "Chroma-Subsampling is not implemented.\n");
+ been_warned = TRUE;
+ }
+ else
+ {
+ BOOL alpha;
+ int cll;
+
+ alpha = (FormatHeader & PLANAR_FORMAT_HEADER_NA) ? FALSE : TRUE;
+ cll = FormatHeader & PLANAR_FORMAT_HEADER_CLL_MASK;
+ primitives_get()->YCoCgRToRGB_8u_AC4R(
+ pDstData, nDstStep, pDstData, nDstStep,
+ nWidth, nHeight, cll, alpha, FALSE);
+ }
+ }
+
status = (SrcSize == (srcp - pSrcData)) ? 1 : -1;
return status;
header = rdp_capability_set_start(s);
drawingFlags |= DRAW_ALLOW_SKIP_ALPHA;
- drawingFlags |= DRAW_ALLOW_COLOR_SUBSAMPLING;
+ /* While bitmap_decode.c now implements YCoCg, in turning it
+ * on we have found Microsoft is inconsistent on whether to invert R & B.
+ * And it's not only from one server to another; on Win7/2008R2, it appears
+ * to send the main content with a different inversion than the Windows
+ * button! So... don't advertise that we support YCoCg and the server
+ * will not send it. YCoCg is still needed for EGFX, but it at least
+ * appears consistent in its use.
+ */
+ /* drawingFlags |= DRAW_ALLOW_DYNAMIC_COLOR_FIDELITY; */
+ /* YCoCg with chroma subsampling is not implemented in bitmap_decode.c. */
+ /* drawingFlags |= DRAW_ALLOW_COLOR_SUBSAMPLING; */
if (settings->RdpVersion > 5)
preferredBitsPerPixel = settings->ColorDepth;
set(MODULE_PREFIX "FREERDP_PRIMITIVES")
set(${MODULE_PREFIX}_SRCS
+ prim_16to32bpp.c
prim_add.c
prim_andor.c
prim_alphaComp.c
prim_set.c
prim_shift.c
prim_sign.c
+ prim_YCoCg.c
primitives.c
prim_internal.h)
set(${MODULE_PREFIX}_OPT_SRCS
+ prim_16to32bpp_opt.c
prim_add_opt.c
prim_andor_opt.c
prim_alphaComp_opt.c
prim_colors_opt.c
prim_set_opt.c
prim_shift_opt.c
- prim_sign_opt.c)
+ prim_sign_opt.c
+ prim_YCoCg_opt.c)
add_definitions(-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE})
--- /dev/null
+/* prim_16to32bpp.c
+ * 16-bit to 32-bit color conversion (widely used)
+ * vi:ts=4 sw=4:
+ *
+ * The general routine was leveraged from freerdp/codec/color.c.
+ *
+ * Copyright 2010 Marc-Andre Moreau <marcandre.moreau@gmail.com>
+ * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <freerdp/codec/color.h>
+
+#include "prim_internal.h"
+#include "prim_16to32bpp.h"
+
+/* ------------------------------------------------------------------------- */
+pstatus_t general_RGB565ToARGB_16u32u_C3C4(
+ const UINT16* pSrc, INT32 srcStep,
+ UINT32* pDst, INT32 dstStep,
+ UINT32 width, UINT32 height,
+ BOOL alpha, BOOL invert)
+{
+ const UINT16* src16;
+ UINT32* dst32;
+ int x,y;
+ int srcRowBump, dstRowBump;
+ BYTE red, green, blue;
+
+ src16 = pSrc;
+ dst32 = pDst;
+ srcRowBump = (srcStep - (width * sizeof(UINT16))) / sizeof(UINT16);
+ dstRowBump = (dstStep - (width * sizeof(UINT32))) / sizeof(UINT32);
+
+ /* Loops are separated so if-decisions are not made in the loop. */
+ if (alpha)
+ {
+ if (invert)
+ {
+ for (y=0; y<height; y++)
+ {
+ for (x=0; x<width; x++)
+ {
+ UINT32 pixel = (UINT32) *src16++;
+ GetBGR16(red, green, blue, pixel);
+ pixel = ARGB32(0xFF, red, green, blue);
+ *dst32++ = pixel;
+ }
+ src16 += srcRowBump;
+ dst32 += dstRowBump;
+ }
+ }
+ else
+ {
+ for (y=0; y<height; y++)
+ {
+ for (x=0; x<width; x++)
+ {
+ UINT32 pixel = (UINT32) *src16++;
+ GetBGR16(red, green, blue, pixel);
+ pixel = ABGR32(0xFF, red, green, blue);
+ *dst32++ = pixel;
+ }
+ src16 += srcRowBump;
+ dst32 += dstRowBump;
+ }
+ }
+ }
+ else
+ {
+ if (invert)
+ {
+ for (y=0; y<height; y++)
+ {
+ for (x=0; x<width; x++)
+ {
+ UINT32 pixel = (UINT32) *src16++;
+ GetBGR16(red, green, blue, pixel);
+ pixel = RGB32(red, green, blue);
+ *dst32++ = pixel;
+ }
+ src16 += srcRowBump;
+ dst32 += dstRowBump;
+ }
+ }
+ else
+ {
+ for (y=0; y<height; y++)
+ {
+ for (x=0; x<width; x++)
+ {
+ UINT32 pixel = (UINT32) *src16++;
+ GetBGR16(red, green, blue, pixel);
+ pixel = BGR32(red, green, blue);
+ *dst32++ = pixel;
+ }
+ src16 += srcRowBump;
+ dst32 += dstRowBump;
+ }
+ }
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_16to32bpp(
+ primitives_t *prims)
+{
+ prims->RGB565ToARGB_16u32u_C3C4 = general_RGB565ToARGB_16u32u_C3C4;
+
+ primitives_init_16to32bpp_opt(prims);
+}
+
+/* ------------------------------------------------------------------------- */
+void primitives_deinit_16to32bpp(
+ primitives_t *prims)
+{
+ /* Nothing to do. */
+}
--- /dev/null
+/* FreeRDP: A Remote Desktop Protocol Client
+ * 16-bit to 32-bit color conversions
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __GNUC__
+# pragma once
+#endif
+
+#ifndef __PRIM_16TO32BPP_H_INCLUDED__
+#define __PRIM_16TO32BPP_H_INCLUDED__
+
+#include <freerdp/primitives.h>
+
+extern pstatus_t general_RGB565ToARGB_16u32u_C3C4(
+ const UINT16* pSrc, INT32 srcStep,
+ UINT32* pDst, INT32 dstStep,
+ UINT32 width, UINT32 height,
+ BOOL alpha, BOOL invert);
+extern void primitives_init_16to32bpp_opt(primitives_t* prims);
+
+#endif /* !__PRIM_16TO32BPP_H_INCLUDED__ */
--- /dev/null
+/* prim_16to32bpp_opt.c
+ * 16-bit to 32-bit color conversion via SSE/Neon
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <winpr/sysinfo.h>
+
+#ifdef WITH_SSE2
+#include <emmintrin.h>
+#include <tmmintrin.h>
+/* #elif defined(WITH_NEON) */
+/* #include <arm_neon.h> */
+#endif /* WITH_SSE2 */
+
+#include "prim_internal.h"
+#include "prim_16to32bpp.h"
+
+#ifdef WITH_SSE2
+/* ------------------------------------------------------------------------- */
+/* Note: _no_invert and _invert could be coded with variables as shift
+ * amounts and a single routine, but tests showed that was much slower.
+ */
+static pstatus_t sse3_RGB565ToARGB_16u32u_C3C4_no_invert(
+ const UINT16* pSrc, INT32 srcStep,
+ UINT32* pDst, INT32 dstStep,
+ UINT32 width, UINT32 height,
+ BOOL alpha)
+{
+ const BYTE *src = (const BYTE *) pSrc;
+ BYTE *dst = (BYTE *) pDst;
+ int h;
+ int srcRowBump = srcStep - (width * sizeof(UINT16));
+ int dstRowBump = dstStep - (width * sizeof(UINT32));
+ __m128i R0, R1, R2, R_FC00, R_0300, R_00F8, R_0007, R_alpha;
+
+ R_FC00 = _mm_set1_epi16(0xFC00);
+ R_0300 = _mm_set1_epi16(0x0300);
+ R_00F8 = _mm_set1_epi16(0x00F8);
+ R_0007 = _mm_set1_epi16(0x0007);
+ if (alpha) R_alpha = _mm_set1_epi32(0xFF00FF00U);
+ else R_alpha = _mm_set1_epi32(0x00000000U);
+
+ for (h=0; h<height; h++)
+ {
+ int w = width;
+
+ /* Get to 16-byte destination boundary for the dest. */
+ if ((ULONG_PTR) dst & 0x0f)
+ {
+ int startup = (16 - ((ULONG_PTR) dst & 0x0f)) / 4;
+ if (startup > width) startup = width;
+ general_RGB565ToARGB_16u32u_C3C4((const UINT16*) src, srcStep,
+ (UINT32*) dst, dstStep, startup, 1, alpha, FALSE);
+ src += startup * sizeof(UINT16);
+ dst += startup * sizeof(UINT32);
+ w -= startup;
+ }
+
+ /* The main loop handles eight pixels at a time. */
+ while (w >= 8)
+ {
+ /* If off-stride, use the slower load. */
+ if ((ULONG_PTR) src & 0x0f)
+ R0 = _mm_lddqu_si128((__m128i *) src);
+ else
+ R0 = _mm_load_si128((__m128i *) src);
+ src += (128/8);
+
+ /* Do the lower two colors, which end up in the lower two bytes. */
+ /* G = ((P<<5) & 0xFC00) | ((P>>1) & 0x0300) */
+ R2 = _mm_slli_epi16(R0, 5);
+ R2 = _mm_and_si128(R_FC00, R2);
+
+ R1 = _mm_srli_epi16(R0, 1);
+ R1 = _mm_and_si128(R_0300, R1);
+ R2 = _mm_or_si128(R1, R2);
+
+ /* R = ((P<<3) & 0x00F8) | ((P>>2) & 0x0007) */
+ R1 = _mm_slli_epi16(R0, 3);
+ R1 = _mm_and_si128(R_00F8, R1);
+ R2 = _mm_or_si128(R1, R2);
+
+ R1 = _mm_srli_epi16(R0, 2);
+ R1 = _mm_and_si128(R_0007, R1);
+ R2 = _mm_or_si128(R1, R2); /* R2 = lowers */
+
+ /* Handle the upper color. */
+ /* B = ((P<<8) & 0x00F8) | ((P<<13) & 0x0007) */
+ R1 = _mm_srli_epi16(R0, 8);
+ R1 = _mm_and_si128(R_00F8, R1);
+
+ R0 = _mm_srli_epi16(R0, 13);
+ R0 = _mm_and_si128(R_0007, R0);
+ R1 = _mm_or_si128(R0, R1); /* R1 = uppers */
+
+ /* Add alpha (or zero) . */
+ R1 = _mm_or_si128(R_alpha, R1); /* + alpha */
+
+ /* Unpack to intermix the AB and GR pieces. */
+ R0 = _mm_unpackhi_epi16(R2, R1);
+ R2 = _mm_unpacklo_epi16(R2, R1);
+
+ /* Store the results. */
+ _mm_store_si128((__m128i *) dst, R2); dst += (128/8);
+ _mm_store_si128((__m128i *) dst, R0); dst += (128/8);
+ w -= 8;
+ }
+
+ /* Handle any remainder. */
+ if (w > 0)
+ {
+ general_RGB565ToARGB_16u32u_C3C4((const UINT16*) src, srcStep,
+ (UINT32*) dst, dstStep, w, 1, alpha, FALSE);
+ src += w * sizeof(UINT16);
+ dst += w * sizeof(UINT32);
+ }
+
+ /* Bump to the start of the next row. */
+ src += srcRowBump;
+ dst += dstRowBump;
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+static pstatus_t sse3_RGB565ToARGB_16u32u_C3C4_invert(
+ const UINT16* pSrc, INT32 srcStep,
+ UINT32* pDst, INT32 dstStep,
+ UINT32 width, UINT32 height,
+ BOOL alpha)
+{
+ const BYTE *src = (const BYTE *) pSrc;
+ BYTE *dst = (BYTE *) pDst;
+ int h;
+ int srcRowBump = srcStep - (width * sizeof(UINT16));
+ int dstRowBump = dstStep - (width * sizeof(UINT32));
+ __m128i R0, R1, R2, R_FC00, R_0300, R_00F8, R_0007, R_alpha;
+
+ R_FC00 = _mm_set1_epi16(0xFC00);
+ R_0300 = _mm_set1_epi16(0x0300);
+ R_00F8 = _mm_set1_epi16(0x00F8);
+ R_0007 = _mm_set1_epi16(0x0007);
+ if (alpha) R_alpha = _mm_set1_epi32(0xFF00FF00U);
+ else R_alpha = _mm_set1_epi32(0x00000000U);
+
+ for (h=0; h<height; h++)
+ {
+ int w = width;
+
+ /* Get to 16-byte destination boundary for the dest. */
+ if ((ULONG_PTR) dst & 0x0f)
+ {
+ int startup = (16 - ((ULONG_PTR) dst & 0x0f)) / 4;
+ if (startup > width) startup = width;
+ general_RGB565ToARGB_16u32u_C3C4((const UINT16*) src, srcStep,
+ (UINT32*) dst, dstStep, startup, 1, alpha, TRUE);
+ src += startup * sizeof(UINT16);
+ dst += startup * sizeof(UINT32);
+ w -= startup;
+ }
+
+ /* The main loop handles eight pixels at a time. */
+ while (w >= 8)
+ {
+ /* Off-stride, slower load. */
+ if ((ULONG_PTR) src & 0x0f)
+ R0 = _mm_lddqu_si128((__m128i *) src);
+ else
+ R0 = _mm_load_si128((__m128i *) src);
+ src += (128/8);
+
+ /* Do the lower two colors, which end up in the lower two bytes. */
+ /* G = ((P<<5) & 0xFC00) | ((P>>1) & 0x0300) */
+ R2 = _mm_slli_epi16(R0, 5);
+ R2 = _mm_and_si128(R_FC00, R2);
+
+ R1 = _mm_srli_epi16(R0, 1);
+ R1 = _mm_and_si128(R_0300, R1);
+ R2 = _mm_or_si128(R1, R2);
+
+ /* B = ((P>>8) & 0x00F8) | ((P>>13) & 0x0007) */
+ R1 = _mm_srli_epi16(R0, 8);
+ R1 = _mm_and_si128(R_00F8, R1);
+ R2 = _mm_or_si128(R1, R2);
+
+ R1 = _mm_srli_epi16(R0, 13);
+ R1 = _mm_and_si128(R_0007, R1);
+ R2 = _mm_or_si128(R1, R2); /* R2 = lowers */
+
+ /* Handle the upper color. */
+ /* R = ((P<<3) & 0x00F8) | ((P>>13) & 0x0007) */
+ R1 = _mm_slli_epi16(R0, 3);
+ R1 = _mm_and_si128(R_00F8, R1);
+
+ R0 = _mm_srli_epi16(R0, 2);
+ R0 = _mm_and_si128(R_0007, R0);
+ R1 = _mm_or_si128(R0, R1); /* R1 = uppers */
+
+ /* Add alpha (or zero) . */
+ R1 = _mm_or_si128(R_alpha, R1); /* + alpha */
+
+ /* Unpack to intermix the AR and GB pieces. */
+ R0 = _mm_unpackhi_epi16(R2, R1);
+ R2 = _mm_unpacklo_epi16(R2, R1);
+
+ /* Store the results. */
+ _mm_store_si128((__m128i *) dst, R2); dst += (128/8);
+ _mm_store_si128((__m128i *) dst, R0); dst += (128/8);
+ w -= 8;
+ }
+
+ /* Handle any remainder. */
+ if (w > 0)
+ {
+ general_RGB565ToARGB_16u32u_C3C4((const UINT16*) src, srcStep,
+ (UINT32*) dst, dstStep, w, 1, alpha, TRUE);
+ src += w * sizeof(UINT16);
+ dst += w * sizeof(UINT32);
+ }
+
+ /* Bump to the start of the next row. */
+ src += srcRowBump;
+ dst += dstRowBump;
+ }
+
+ return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+pstatus_t sse3_RGB565ToARGB_16u32u_C3C4(
+ const UINT16* pSrc, INT32 srcStep,
+ UINT32* pDst, INT32 dstStep,
+ UINT32 width, UINT32 height,
+ BOOL alpha, BOOL invert)
+{
+ if (invert)
+ {
+ return sse3_RGB565ToARGB_16u32u_C3C4_invert(pSrc, srcStep,
+ pDst, dstStep, width, height, alpha);
+ }
+ else
+ {
+ return sse3_RGB565ToARGB_16u32u_C3C4_no_invert(pSrc, srcStep,
+ pDst, dstStep, width, height, alpha);
+ }
+}
+#endif /* WITH_SSE2 */
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_16to32bpp_opt(
+ primitives_t *prims)
+{
+#ifdef WITH_SSE2
+ if (IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
+ {
+ prims->RGB565ToARGB_16u32u_C3C4 = sse3_RGB565ToARGB_16u32u_C3C4;
+ }
+#endif
+}
--- /dev/null
+/* FreeRDP: A Remote Desktop Protocol Client
+ * YCoCg<->RGB Color conversion operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+
+#include "prim_internal.h"
+#include "prim_YCoCg.h"
+
+#ifndef MINMAX
+#define MINMAX(_v_, _l_, _h_) \
+ ((_v_) < (_l_) ? (_l_) : ((_v_) > (_h_) ? (_h_) : (_v_)))
+#endif /* !MINMAX */
+
+/* ------------------------------------------------------------------------- */
+pstatus_t general_YCoCgRToRGB_8u_AC4R(
+ const BYTE *pSrc, INT32 srcStep,
+ BYTE *pDst, INT32 dstStep,
+ UINT32 width, UINT32 height,
+ UINT8 shift,
+ BOOL withAlpha,
+ BOOL invert)
+{
+ const BYTE *sptr = pSrc;
+ BYTE *dptr = pDst;
+ int cll = shift - 1; /* -1 builds in the /2's */
+ int x,y;
+ int srcRowBump = srcStep - width*sizeof(UINT32);
+ int dstRowBump = dstStep - width*sizeof(UINT32);
+ if (invert)
+ {
+ for (y=0; y<height; y++)
+ {
+ for (x=0; x<width; x++)
+ {
+ INT16 cg, co, y, t, r, g, b;
+ BYTE a;
+
+ /* Note: shifts must be done before sign-conversion. */
+ cg = (INT16) ((INT8) ((*sptr++) << cll));
+ co = (INT16) ((INT8) ((*sptr++) << cll));
+ y = (INT16) (*sptr++); /* UINT8->INT16 */
+ a = *sptr++;
+ if (!withAlpha) a = 0xFFU;
+ t = y - cg;
+ r = t + co;
+ g = y + cg;
+ b = t - co;
+ *dptr++ = (BYTE) MINMAX(r, 0, 255);
+ *dptr++ = (BYTE) MINMAX(g, 0, 255);
+ *dptr++ = (BYTE) MINMAX(b, 0, 255);
+ *dptr++ = a;
+ }
+ sptr += srcRowBump;
+ dptr += dstRowBump;
+ }
+ }
+ else
+ {
+ for (y=0; y<height; y++)
+ {
+ for (x=0; x<width; x++)
+ {
+ INT16 cg, co, y, t, r, g, b;
+ BYTE a;
+
+ /* Note: shifts must be done before sign-conversion. */
+ cg = (INT16) ((INT8) ((*sptr++) << cll));
+ co = (INT16) ((INT8) ((*sptr++) << cll));
+ y = (INT16) (*sptr++); /* UINT8->INT16 */
+ a = *sptr++;
+ if (!withAlpha) a = 0xFFU;
+ t = y - cg;
+ r = t + co;
+ g = y + cg;
+ b = t - co;
+ *dptr++ = (BYTE) MINMAX(b, 0, 255);
+ *dptr++ = (BYTE) MINMAX(g, 0, 255);
+ *dptr++ = (BYTE) MINMAX(r, 0, 255);
+ *dptr++ = a;
+ }
+ sptr += srcRowBump;
+ dptr += dstRowBump;
+ }
+ }
+ return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_YCoCg(primitives_t* prims)
+{
+ prims->YCoCgRToRGB_8u_AC4R = general_YCoCgRToRGB_8u_AC4R;
+
+ primitives_init_YCoCg_opt(prims);
+}
+
+/* ------------------------------------------------------------------------- */
+void primitives_deinit_YCoCg(primitives_t* prims)
+{
+ /* Nothing to do. */
+}
--- /dev/null
+/* FreeRDP: A Remote Desktop Protocol Client
+ * YCoCg<->RGB color conversion operations.
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __GNUC__
+# pragma once
+#endif
+
+#ifndef __PRIM_YCOCG_H_INCLUDED__
+#define __PRIM_YCOCG_H_INCLUDED__
+
+pstatus_t general_YCoCgRToRGB_8u_AC4R(const BYTE *pSrc, INT32 srcStep, BYTE *pDst, INT32 dstStep, UINT32 width, UINT32 height, UINT8 shift, BOOL withAlpha, BOOL invert);
+
+void primitives_init_YCoCg_opt(primitives_t* prims);
+
+#endif /* !__PRIM_YCOCG_H_INCLUDED__ */
--- /dev/null
+/* FreeRDP: A Remote Desktop Protocol Client
+ * Optimized YCoCg<->RGB conversion operations.
+ * vi:ts=4 sw=4:
+ *
+ * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <freerdp/types.h>
+#include <freerdp/primitives.h>
+#include <winpr/sysinfo.h>
+
+#ifdef WITH_SSE2
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#elif defined(WITH_NEON)
+#include <arm_neon.h>
+#endif /* WITH_SSE2 else WITH_NEON */
+
+#include "prim_internal.h"
+#include "prim_templates.h"
+#include "prim_YCoCg.h"
+
+#ifdef WITH_SSE2
+
+/* ------------------------------------------------------------------------- */
+static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_invert(
+ const BYTE *pSrc, INT32 srcStep,
+ BYTE *pDst, INT32 dstStep,
+ UINT32 width, UINT32 height,
+ UINT8 shift,
+ BOOL withAlpha)
+{
+ const BYTE *sptr = pSrc;
+ BYTE *dptr = (BYTE *) pDst;
+ int sRowBump = srcStep - width*sizeof(UINT32);
+ int dRowBump = dstStep - width*sizeof(UINT32);
+ int h;
+ /* Shift left by "shift" and divide by two is the same as shift
+ * left by "shift-1".
+ */
+ int dataShift = shift - 1;
+ BYTE mask = (BYTE) (0xFFU << dataShift);
+
+ /* Let's say the data is of the form:
+ * y0y0o0g0 a1y1o1g1 a2y2o2g2...
+ * Apply:
+ * |R| | 1 1/2 -1/2 | |y|
+ * |G| = | 1 0 1/2 | * |o|
+ * |B| | 1 -1/2 -1/2 | |g|
+ * where Y is 8-bit unsigned and o & g are 8-bit signed.
+ */
+
+ if ((width < 8) || (ULONG_PTR) dptr & 0x03)
+ {
+ /* Too small, or we'll never hit a 16-byte boundary. Punt. */
+ return general_YCoCgRToRGB_8u_AC4R(pSrc, srcStep,
+ pDst, dstStep, width, height, shift, withAlpha, TRUE);
+ }
+
+ for (h=0; h<height; h++)
+ {
+ int w = width;
+ BOOL onStride;
+
+ /* Get to a 16-byte destination boundary. */
+ if ((ULONG_PTR) dptr & 0x0f)
+ {
+ int startup = (16 - ((ULONG_PTR) dptr & 0x0f)) / 4;
+ if (startup > width) startup = width;
+ general_YCoCgRToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep,
+ startup, 1, shift, withAlpha, TRUE);
+ sptr += startup * sizeof(UINT32);
+ dptr += startup * sizeof(UINT32);
+ w -= startup;
+ }
+
+ /* Each loop handles eight pixels at a time. */
+ onStride = (((ULONG_PTR) sptr & 0x0f) == 0) ? TRUE : FALSE;
+ while (w >= 8)
+ {
+ __m128i R0, R1, R2, R3, R4, R5, R6, R7;
+ if (onStride)
+ {
+ /* The faster path, 16-byte aligned load. */
+ R0 = _mm_load_si128((__m128i *) sptr); sptr += (128/8);
+ R1 = _mm_load_si128((__m128i *) sptr); sptr += (128/8);
+ }
+ else
+ {
+ /* Off-stride, slower LDDQU load. */
+ R0 = _mm_lddqu_si128((__m128i *) sptr); sptr += (128/8);
+ R1 = _mm_lddqu_si128((__m128i *) sptr); sptr += (128/8);
+ }
+ /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
+ /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
+
+ /* Shuffle to pack all the like types together. */
+ R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
+ R3 = _mm_shuffle_epi8(R0, R2);
+ R4 = _mm_shuffle_epi8(R1, R2);
+ /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
+ /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
+ R5 = _mm_unpackhi_epi32(R3, R4);
+ R6 = _mm_unpacklo_epi32(R3, R4);
+ /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
+ /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
+ /* Save alphas aside */
+ if (withAlpha) R7 = _mm_unpackhi_epi64(R5, R5);
+ else R7 = _mm_set1_epi32(0xFFFFFFFFU);
+ /* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
+ /* Expand Y's from 8-bit unsigned to 16-bit signed. */
+ R1 = _mm_set1_epi32(0);
+ R0 = _mm_unpacklo_epi8(R5, R1);
+ /* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
+ /* Shift Co's and Cg's by (shift-1). -1 covers division by two.
+ * Note: this must be done before sign-conversion.
+ * Note also there is no slli_epi8, so we have to use a 16-bit
+ * version and then mask.
+ */
+ R6 = _mm_slli_epi16(R6, dataShift);
+ R1 = _mm_set1_epi8(mask);
+ R6 = _mm_and_si128(R6, R1);
+ /* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
+ /* Expand Co's from 8-bit signed to 16-bit signed */
+ R1 = _mm_unpackhi_epi8(R6, R6);
+ R1 = _mm_srai_epi16(R1, 8);
+ /* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
+ /* Expand Cg's form 8-bit signed to 16-bit signed */
+ R2 = _mm_unpacklo_epi8(R6, R6);
+ R2 = _mm_srai_epi16(R2, 8);
+ /* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
+ /* Get Y - halfCg and save */
+ R6 = _mm_subs_epi16(R0, R2);
+
+ /* R = (Y-halfCg) + halfCo */
+ R3 = _mm_adds_epi16(R6, R1);
+ /* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
+ /* G = Y + Cg(/2) */
+ R4 = _mm_adds_epi16(R0, R2);
+ /* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
+ /* B = (Y-halfCg) - Co(/2) */
+ R5 = _mm_subs_epi16(R6, R1);
+ /* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
+
+ /* Repack R's & B's. */
+ R0 = _mm_packus_epi16(R3, R5);
+ /* R0 = R7R6R5R4 R3R2R1R0 B7B6B5B4 B3B2B1B0 */
+ /* Repack G's. */
+ R1 = _mm_packus_epi16(R4, R4);
+ /* R1 = G7G6G6G4 G3G2G1G0 G7G6G6G4 G3G2G1G0 */
+ /* And add the A's. */
+ R1 = _mm_unpackhi_epi64(R1, R7);
+ /* R1 = A7A6A6A4 A3A2A1A0 G7G6G6G4 G3G2G1G0 */
+
+ /* Now do interleaving again. */
+ R2 = _mm_unpacklo_epi8(R0, R1);
+ /* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
+ R3 = _mm_unpackhi_epi8(R0, R1);
+ /* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
+ R4 = _mm_unpacklo_epi16(R2, R3);
+ /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
+ R5 = _mm_unpackhi_epi16(R2, R3);
+ /* R5 = A7R7G7B7 A6R6G6B6 A5R6G5B5 A4R4G4B4 */
+
+ _mm_store_si128((__m128i *) dptr, R4); dptr += (128/8);
+ _mm_store_si128((__m128i *) dptr, R5); dptr += (128/8);
+ w -= 8;
+ }
+
+ /* Handle any remainder pixels. */
+ if (w > 0) {
+ general_YCoCgRToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep,
+ w, 1, shift, withAlpha, TRUE);
+ sptr += w * sizeof(UINT32);
+ dptr += w * sizeof(UINT32);
+ }
+
+ sptr += sRowBump;
+ dptr += dRowBump;
+ }
+ return PRIMITIVES_SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R_no_invert(
+ const BYTE *pSrc, INT32 srcStep,
+ BYTE *pDst, INT32 dstStep,
+ UINT32 width, UINT32 height,
+ UINT8 shift,
+ BOOL withAlpha)
+{
+ const BYTE *sptr = pSrc;
+ BYTE *dptr = (BYTE *) pDst;
+ int sRowBump = srcStep - width*sizeof(UINT32);
+ int dRowBump = dstStep - width*sizeof(UINT32);
+ int h;
+ /* Shift left by "shift" and divide by two is the same as shift
+ * left by "shift-1".
+ */
+ int dataShift = shift - 1;
+ BYTE mask = (BYTE) (0xFFU << dataShift);
+
+ /* Let's say the data is of the form:
+ * y0y0o0g0 a1y1o1g1 a2y2o2g2...
+ * Apply:
+ * |R| | 1 1/2 -1/2 | |y|
+ * |G| = | 1 0 1/2 | * |o|
+ * |B| | 1 -1/2 -1/2 | |g|
+ * where Y is 8-bit unsigned and o & g are 8-bit signed.
+ */
+
+ if ((width < 8) || (ULONG_PTR) dptr & 0x03)
+ {
+ /* Too small, or we'll never hit a 16-byte boundary. Punt. */
+ return general_YCoCgRToRGB_8u_AC4R(pSrc, srcStep,
+ pDst, dstStep, width, height, shift, withAlpha, FALSE);
+ }
+
+ for (h=0; h<height; h++)
+ {
+ int w = width;
+ BOOL onStride;
+
+ /* Get to a 16-byte destination boundary. */
+ if ((ULONG_PTR) dptr & 0x0f)
+ {
+ int startup = (16 - ((ULONG_PTR) dptr & 0x0f)) / 4;
+ if (startup > width) startup = width;
+ general_YCoCgRToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep,
+ startup, 1, shift, withAlpha, FALSE);
+ sptr += startup * sizeof(UINT32);
+ dptr += startup * sizeof(UINT32);
+ w -= startup;
+ }
+
+ /* Each loop handles eight pixels at a time. */
+ onStride = (((ULONG_PTR) sptr & 0x0f) == 0) ? TRUE : FALSE;
+ while (w >= 8)
+ {
+ __m128i R0, R1, R2, R3, R4, R5, R6, R7;
+ if (onStride)
+ {
+ /* The faster path, 16-byte aligned load. */
+ R0 = _mm_load_si128((__m128i *) sptr); sptr += (128/8);
+ R1 = _mm_load_si128((__m128i *) sptr); sptr += (128/8);
+ }
+ else
+ {
+ /* Off-stride, slower LDDQU load. */
+ R0 = _mm_lddqu_si128((__m128i *) sptr); sptr += (128/8);
+ R1 = _mm_lddqu_si128((__m128i *) sptr); sptr += (128/8);
+ }
+ /* R0 = a3y3o3g3 a2y2o2g2 a1y1o1g1 a0y0o0g0 */
+ /* R1 = a7y7o7g7 a6y6o6g6 a5y5o5g5 a4y4o4g4 */
+
+ /* Shuffle to pack all the like types together. */
+ R2 = _mm_set_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400);
+ R3 = _mm_shuffle_epi8(R0, R2);
+ R4 = _mm_shuffle_epi8(R1, R2);
+ /* R3 = a3a2a1a0 y3y2y1y0 o3o2o1o0 g3g2g1g0 */
+ /* R4 = a7a6a5a4 y7y6y5y4 o7o6o5o4 g7g6g5g4 */
+ R5 = _mm_unpackhi_epi32(R3, R4);
+ R6 = _mm_unpacklo_epi32(R3, R4);
+ /* R5 = a7a6a5a4 a3a2a1a0 y7y6y5y4 y3y2y1y0 */
+ /* R6 = o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
+ /* Save alphas aside */
+ if (withAlpha) R7 = _mm_unpackhi_epi64(R5, R5);
+ else R7 = _mm_set1_epi32(0xFFFFFFFFU);
+ /* R7 = a7a6a5a4 a3a2a1a0 a7a6a5a4 a3a2a1a0 */
+ /* Expand Y's from 8-bit unsigned to 16-bit signed. */
+ R1 = _mm_set1_epi32(0);
+ R0 = _mm_unpacklo_epi8(R5, R1);
+ /* R0 = 00y700y6 00y500y4 00y300y2 00y100y0 */
+ /* Shift Co's and Cg's by (shift-1). -1 covers division by two.
+ * Note: this must be done before sign-conversion.
+ * Note also there is no slli_epi8, so we have to use a 16-bit
+ * version and then mask.
+ */
+ R6 = _mm_slli_epi16(R6, dataShift);
+ R1 = _mm_set1_epi8(mask);
+ R6 = _mm_and_si128(R6, R1);
+ /* R6 = shifted o7o6o5o4 o3o2o1o0 g7g6g5g4 g3g2g1g0 */
+ /* Expand Co's from 8-bit signed to 16-bit signed */
+ R1 = _mm_unpackhi_epi8(R6, R6);
+ R1 = _mm_srai_epi16(R1, 8);
+ /* R1 = xxo7xxo6 xxo5xxo4 xxo3xxo2 xxo1xxo0 */
+ /* Expand Cg's form 8-bit signed to 16-bit signed */
+ R2 = _mm_unpacklo_epi8(R6, R6);
+ R2 = _mm_srai_epi16(R2, 8);
+ /* R2 = xxg7xxg6 xxg5xxg4 xxg3xxg2 xxg1xxg0 */
+ /* Get Y - halfCg and save */
+ R6 = _mm_subs_epi16(R0, R2);
+
+ /* R = (Y-halfCg) + halfCo */
+ R3 = _mm_adds_epi16(R6, R1);
+ /* R3 = xxR7xxR6 xxR5xxR4 xxR3xxR2 xxR1xxR0 */
+ /* G = Y + Cg(/2) */
+ R4 = _mm_adds_epi16(R0, R2);
+ /* R4 = xxG7xxG6 xxG5xxG4 xxG3xxG2 xxG1xxG0 */
+ /* B = (Y-halfCg) - Co(/2) */
+ R5 = _mm_subs_epi16(R6, R1);
+ /* R5 = xxB7xxB6 xxB5xxB4 xxB3xxB2 xxB1xxB0 */
+
+ /* Repack R's & B's. */
+ /* This line is the only diff between inverted and non-inverted.
+ * Unfortunately, it would be expensive to check "inverted"
+ * every time through this loop.
+ */
+ R0 = _mm_packus_epi16(R5, R3);
+ /* R0 = B7B6B5B4 B3B2B1B0 R7R6R5R4 R3R2R1R0 */
+ /* Repack G's. */
+ R1 = _mm_packus_epi16(R4, R4);
+ /* R1 = G7G6G6G4 G3G2G1G0 G7G6G6G4 G3G2G1G0 */
+ /* And add the A's. */
+ R1 = _mm_unpackhi_epi64(R1, R7);
+ /* R1 = A7A6A6A4 A3A2A1A0 G7G6G6G4 G3G2G1G0 */
+
+ /* Now do interleaving again. */
+ R2 = _mm_unpacklo_epi8(R0, R1);
+ /* R2 = G7B7G6B6 G5B5G4B4 G3B3G2B2 G1B1G0B0 */
+ R3 = _mm_unpackhi_epi8(R0, R1);
+ /* R3 = A7R7A6R6 A5R5A4R4 A3R3A2R2 A1R1A0R0 */
+ R4 = _mm_unpacklo_epi16(R2, R3);
+ /* R4 = A3R3G3B3 A2R2G2B2 A1R1G1B1 A0R0G0B0 */
+ R5 = _mm_unpackhi_epi16(R2, R3);
+ /* R5 = A7R7G7B7 A6R6G6B6 A5R6G5B5 A4R4G4B4 */
+
+ _mm_store_si128((__m128i *) dptr, R4); dptr += (128/8);
+ _mm_store_si128((__m128i *) dptr, R5); dptr += (128/8);
+ w -= 8;
+ }
+
+ /* Handle any remainder pixels. */
+ if (w > 0) {
+ general_YCoCgRToRGB_8u_AC4R(sptr, srcStep, dptr, dstStep,
+ w, 1, shift, withAlpha, FALSE);
+ sptr += w * sizeof(UINT32);
+ dptr += w * sizeof(UINT32);
+ }
+
+ sptr += sRowBump;
+ dptr += dRowBump;
+ }
+ return PRIMITIVES_SUCCESS;
+}
+#endif /* WITH_SSE2 */
+
+#ifdef WITH_SSE2
+/* ------------------------------------------------------------------------- */
+pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(
+ const BYTE *pSrc, INT32 srcStep,
+ BYTE *pDst, INT32 dstStep,
+ UINT32 width, UINT32 height,
+ UINT8 shift,
+ BOOL withAlpha,
+ BOOL invert)
+{
+ if (invert) {
+ return ssse3_YCoCgRToRGB_8u_AC4R_invert(pSrc, srcStep, pDst, dstStep,
+ width, height, shift, withAlpha);
+ }
+ else {
+ return ssse3_YCoCgRToRGB_8u_AC4R_no_invert(pSrc, srcStep, pDst, dstStep,
+ width, height, shift, withAlpha);
+ }
+}
+#endif /* WITH_SSE2 */
+
+/* ------------------------------------------------------------------------- */
+void primitives_init_YCoCg_opt(primitives_t* prims)
+{
+/* While IPP acknowledges the existence of YCoCg-R, it doesn't currently
+ * include any routines to work with it, especially with variable shift
+ * width.
+ */
+#if defined(WITH_SSE2)
+ if (IsProcessorFeaturePresentEx(PF_EX_SSSE3)
+ && IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
+ {
+ prims->YCoCgRToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R;
+ }
+#endif /* WITH_SSE2 */
+}
extern void primitives_deinit_colors(
primitives_t *prims);
+extern void primitives_init_YCoCg(
+ primitives_t *prims);
+extern void primitives_deinit_YCoCg(
+ primitives_t *prims);
+
+extern void primitives_init_16to32bpp(
+ primitives_t *prims);
+extern void primitives_deinit_16to32bpp(
+ primitives_t *prims);
+
#endif /* !__PRIM_INTERNAL_H_INCLUDED__ */
primitives_init_shift(pPrimitives);
primitives_init_sign(pPrimitives);
primitives_init_colors(pPrimitives);
+ primitives_init_YCoCg(pPrimitives);
+ primitives_init_16to32bpp(pPrimitives);
}
/* ------------------------------------------------------------------------- */
primitives_deinit_shift(pPrimitives);
primitives_deinit_sign(pPrimitives);
primitives_deinit_colors(pPrimitives);
+ primitives_deinit_YCoCg(pPrimitives);
+ primitives_deinit_16to32bpp(pPrimitives);
free((void*) pPrimitives);
pPrimitives = NULL;
set(PRIMITIVE_TEST_CFILES
prim_test.c
+ test_16to32bpp.c
test_add.c
test_alphaComp.c
test_andor.c
test_set.c
test_shift.c
test_sign.c
+ test_YCoCg.c
+ ../prim_16to32bpp.c
../prim_add.c
../prim_andor.c
../prim_alphaComp.c
../prim_set.c
../prim_shift.c
../prim_sign.c
+ ../prim_YCoCg.c
+ ../prim_16to32bpp_opt.c
../prim_add_opt.c
../prim_alphaComp_opt.c
../prim_andor_opt.c
../prim_set_opt.c
../prim_shift_opt.c
../prim_sign_opt.c
+ ../prim_YCoCg_opt.c
../primitives.c
)
set_property(SOURCE ${PRIMITIVE_TEST_CFILES} PROPERTY COMPILE_FLAGS ${OPTFLAGS})
-target_link_libraries(prim_test rt winpr-sysinfo)
+find_library(WINPR_SYSINFO NAMES winpr-sysinfo HINTS ../../../winpr/libwinpr/sysinfo)
+target_link_libraries(prim_test rt ${WINPR_SYSINFO})
+
if(NOT TESTING_OUTPUT_DIRECTORY)
set(TESTING_OUTPUT_DIRECTORY .)
endif()
#define TEST_ALPHA (1<<11)
#define TEST_AND (1<<12)
#define TEST_OR (1<<13)
+#define TEST_YCOCG (1<<14)
+#define TEST_16TO32 (1<<15)
/* Specific types of testing: */
#define TEST_FUNCTIONALITY (1<<0)
{ "rgb", TEST_RGB },
{ "color", TEST_RGB },
{ "colors", TEST_RGB },
+ { "ycocg", TEST_YCOCG },
{ "alpha", TEST_ALPHA },
{ "and", TEST_AND },
- { "or", TEST_OR }
+ { "or", TEST_OR },
+ { "16to32", TEST_16TO32 }
};
#define NUMTESTS (sizeof(testList)/sizeof(test_t))
results |= test_yCbCrToRGB_16s16s_P3P3_speed();
}
}
+ if (testSet & TEST_YCOCG)
+ {
+ if (testTypes & TEST_FUNCTIONALITY)
+ {
+ results |= test_YCoCgRToRGB_8u_AC4R_func();
+ }
+ if (testTypes & TEST_PERFORMANCE)
+ {
+ results |= test_YCoCgRToRGB_8u_AC4R_speed();
+ }
+ }
+ /* 16 to 32 BPP */
+ if (testSet & TEST_16TO32)
+ {
+ if (testTypes & TEST_FUNCTIONALITY)
+ {
+ results |= test_RGB565ToARGB_16u32u_C3C4_func();
+ }
+ if (testTypes & TEST_PERFORMANCE)
+ {
+ results |= test_RGB565ToARGB_16u32u_C3C4_speed();
+ }
+ }
/* ALPHA COMPOSITION */
if (testSet & TEST_ALPHA)
{
extern int test_RGBToRGB_16s8u_P3AC4R_speed(void);
extern int test_yCbCrToRGB_16s16s_P3P3_func(void);
extern int test_yCbCrToRGB_16s16s_P3P3_speed(void);
+extern int test_YCoCgRToRGB_8u_AC4R_func(void);
+extern int test_YCoCgRToRGB_8u_AC4R_speed(void);
+
+extern int test_RGB565ToARGB_16u32u_C3C4_func(void);
+extern int test_RGB565ToARGB_16u32u_C3C4_speed(void);
extern int test_alphaComp_func(void);
extern int test_alphaComp_speed(void);
--- /dev/null
+/* test_colors.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <assert.h>
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <winpr/sysinfo.h>
+#include "prim_test.h"
+
+static const int RGB_TRIAL_ITERATIONS = 1000;
+static const float TEST_TIME = 4.0;
+
+extern pstatus_t general_RGB565ToARGB_16u32u_C3C4(
+ const UINT16* pSrc, INT32 srcStep,
+ UINT32* pDst, INT32 dstStep,
+ UINT32 width, UINT32 height,
+ BOOL alpha, BOOL invert);
+extern pstatus_t sse3_RGB565ToARGB_16u32u_C3C4(
+ const UINT16* pSrc, INT32 srcStep,
+ UINT32* pDst, INT32 dstStep,
+ UINT32 width, UINT32 height,
+ BOOL alpha, BOOL invert);
+
+/* ------------------------------------------------------------------------- */
+static BOOL try_16To32(
+ const UINT16 *data16,
+ int sOffset,
+ int dOffset,
+ int width, int height)
+{
+ BOOL success;
+ const char *sAligned = "sAlign";
+ const char *sUnaligned = "s!Align";
+ const char *dAligned = "dAlign";
+ const char *dUnaligned = "d!Align";
+ const char *sAlignStr, *dAlignStr;
+ const UINT16 *src;
+ UINT32 ALIGN(outNN1[4096+3]), ALIGN(outAN1[4096+3]),
+ ALIGN(outNI1[4096+3]), ALIGN(outAI1[4096+3]);
+ UINT32 ALIGN(outNN2[4096+3]), ALIGN(outAN2[4096+3]),
+ ALIGN(outNI2[4096+3]), ALIGN(outAI2[4096+3]);
+
+ assert(sOffset < 4);
+ assert(dOffset < 4);
+ assert(width*height <= 4096);
+
+ success = TRUE;
+ src = data16 + sOffset;
+ sAlignStr = (sOffset == 0) ? sAligned : sUnaligned;
+ dAlignStr = (dOffset == 0) ? dAligned : dUnaligned;
+
+ general_RGB565ToARGB_16u32u_C3C4(src, width*sizeof(UINT16),
+ outNN1+dOffset, width*sizeof(UINT32), width, height, FALSE, FALSE);
+ general_RGB565ToARGB_16u32u_C3C4(src, width*sizeof(UINT16),
+ outAN1+dOffset, width*sizeof(UINT32), width, height, TRUE, FALSE);
+ general_RGB565ToARGB_16u32u_C3C4(src, width*sizeof(UINT16),
+ outNI1+dOffset, width*sizeof(UINT32), width, height, FALSE, TRUE);
+ general_RGB565ToARGB_16u32u_C3C4(src, width*sizeof(UINT16),
+ outAI1+dOffset, width*sizeof(UINT32), width, height, TRUE, TRUE);
+
+#ifdef WITH_SSE2
+ printf(" Testing 16-to-32bpp SSE3 version (%s, %s, %dx%d)\n",
+ sAlignStr, dAlignStr, width, height);
+ if (IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
+ {
+ int i;
+
+ sse3_RGB565ToARGB_16u32u_C3C4(src, width*sizeof(UINT16),
+ outNN2+dOffset, width*sizeof(UINT32), width, height, FALSE, FALSE);
+ sse3_RGB565ToARGB_16u32u_C3C4(src, width*sizeof(UINT16),
+ outAN2+dOffset, width*sizeof(UINT32), width, height, TRUE, FALSE);
+ sse3_RGB565ToARGB_16u32u_C3C4(src, width*sizeof(UINT16),
+ outNI2+dOffset, width*sizeof(UINT32), width, height, FALSE, TRUE);
+ sse3_RGB565ToARGB_16u32u_C3C4(src, width*sizeof(UINT16),
+ outAI2+dOffset, width*sizeof(UINT32), width, height, TRUE, TRUE);
+ for (i=0; i<width * height; i++)
+ {
+ int s = i + sOffset;
+ int d = i + dOffset;
+ if (outNN1[d] != outNN2[d])
+ {
+ printf("16To32bpp-SSE FAIL (%s, %s, !alpha, !invert)"
+ " 0x%04x -> 0x%08x rather than 0x%08x\n",
+ sAlignStr, dAlignStr, data16[s], outNN2[d], outNN1[d]);
+ success = FALSE;
+ }
+ if (outAN1[d] != outAN2[d])
+ {
+ printf("16To32bpp-SSE FAIL (%s, %s, alpha, !invert)"
+ " 0x%04x -> 0x%08x rather than 0x%08x\n",
+ sAlignStr, dAlignStr, data16[s], outAN2[d], outAN1[d]);
+ success = FALSE;
+ }
+ if (outNI1[d] != outNI2[d])
+ {
+ printf("16To32bpp-SSE FAIL (%s, %s, !alpha, invert)"
+ " 0x%04x -> 0x%08x rather than 0x%08x\n",
+ sAlignStr, dAlignStr, data16[s], outNI2[d], outNI1[d]);
+ success = FALSE;
+ }
+ if (outAI1[d] != outAI2[d])
+ {
+ printf("16To32bpp-SSE FAIL (%s, %s, alpha, invert)"
+ " 0x%04x -> 0x%08x rather than 0x%08x\n",
+ sAlignStr, dAlignStr, data16[s], outAI2[d], outNI1[d]);
+ success = FALSE;
+ }
+ }
+ }
+#endif /* WITH_SSE2 */
+
+ return success;
+}
+
+
+/* ------------------------------------------------------------------------- */
+int test_RGB565ToARGB_16u32u_C3C4_func(void)
+{
+ INT16 ALIGN(data16[4096+3]);
+ BOOL success;
+
+ success = TRUE;
+ get_random_data(data16, sizeof(data16));
+
+ /* Source aligned, dest aligned, 64x64 */
+ success &= try_16To32(data16, 0, 0, 64, 64);
+ /* Source !aligned, dest aligned, 64x64 */
+ success &= try_16To32(data16, 1, 0, 64, 64);
+ /* Source aligned, dest !aligned, 64x64 */
+ success &= try_16To32(data16, 0, 1, 64, 64);
+ /* Source !aligned, dest !aligned, 64x64 */
+ success &= try_16To32(data16, 1, 1, 64, 64);
+ /* Odd size */
+ success &= try_16To32(data16, 0, 0, 17, 53);
+
+ if (success) printf("All RGB565ToARGB_16u32u_C3C4 tests passed.\n");
+ return success ? SUCCESS : FAILURE;
+}
+
+/* ------------------------------------------------------------------------- */
+STD_SPEED_TEST(
+ test16to32_speed, UINT16, UINT32, PRIM_NOP,
+ TRUE, general_RGB565ToARGB_16u32u_C3C4(
+ (const UINT16 *) src1, 64*2, (UINT32 *) dst, 64*4,
+ 64,64, TRUE, TRUE),
+#ifdef WITH_SSE2
+ TRUE, sse3_RGB565ToARGB_16u32u_C3C4(
+ (const UINT16 *) src1, 64*2, (UINT32 *) dst, 64*4,
+ 64,64, TRUE, TRUE),
+ PF_SSE3_INSTRUCTIONS_AVAILABLE, FALSE,
+#else
+ FALSE, PRIM_NOP, 0, FALSE,
+#endif
+ FALSE, PRIM_NOP);
+
+/* ------------------------------------------------------------------------- */
+int test_RGB565ToARGB_16u32u_C3C4_speed(void)
+{
+ UINT16 ALIGN(src[4096]);
+ UINT32 ALIGN(dst[4096]);
+ int i;
+ int size_array[] = { 64 };
+
+ get_random_data(src, sizeof(src));
+
+ test16to32_speed("16-to-32bpp", "aligned",
+ (const UINT16 *) src, 0, 0, (UINT32 *) dst,
+ size_array, 1, RGB_TRIAL_ITERATIONS, TEST_TIME);
+ return SUCCESS;
+}
--- /dev/null
+/* test_YCoCg.c
+ * vi:ts=4 sw=4
+ *
+ * (c) Copyright 2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <winpr/sysinfo.h>
+#include "prim_test.h"
+
+static const int YCOCG_TRIAL_ITERATIONS = 20000;
+static const float TEST_TIME = 4.0;
+
+extern pstatus_t general_YCoCgRToRGB_8u_AC4R(const BYTE *pSrc, INT32 srcStep,
+ BYTE *pDst, INT32 dstStep, UINT32 width, UINT32 height,
+ UINT8 shift, BOOL withAlpha, BOOL invert);
+extern pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(const BYTE *pSrc, INT32 srcStep,
+ BYTE *pDst, INT32 dstStep, UINT32 width, UINT32 height,
+ UINT8 shift, BOOL withAlpha, BOOL invert);
+
+/* ------------------------------------------------------------------------- */
+int test_YCoCgRToRGB_8u_AC4R_func(void)
+{
+ INT32 ALIGN(in[4098]);
+ INT32 ALIGN(out_c[4098]), ALIGN(out_c_inv[4098]);
+ INT32 ALIGN(out_sse[4098]), ALIGN(out_sse_inv[4098]);
+ char testStr[256];
+ BOOL failed = FALSE;
+ int i;
+
+ testStr[0] = '\0';
+ get_random_data(in, sizeof(in));
+
+ general_YCoCgRToRGB_8u_AC4R((const BYTE *) (in+1), 63*4,
+ (BYTE *) out_c, 63*4, 63, 61, 2, TRUE, FALSE);
+ general_YCoCgRToRGB_8u_AC4R((const BYTE *) (in+1), 63*4,
+ (BYTE *) out_c_inv, 63*4, 63, 61, 2, TRUE, TRUE);
+#ifdef WITH_SSE2
+ if (IsProcessorFeaturePresentEx(PF_EX_SSSE3))
+ {
+ strcat(testStr, " SSSE3");
+ ssse3_YCoCgRToRGB_8u_AC4R((const BYTE *) (in+1), 63*4,
+ (BYTE *) out_sse, 63*4, 63, 61, 2, TRUE, FALSE);
+
+ for (i=0; i<63*61; ++i)
+ {
+ if (out_c[i] != out_sse[i]) {
+ printf("YCoCgRToRGB-SSE FAIL[%d]: 0x%08x -> C 0x%08x vs SSE 0x%08x\n", i,
+ in[i+1], out_c[i], out_sse[i]);
+ failed = TRUE;
+ }
+ }
+ ssse3_YCoCgRToRGB_8u_AC4R((const BYTE *) (in+1), 63*4,
+ (BYTE *) out_sse_inv, 63*4, 63, 61, 2, TRUE, TRUE);
+ for (i=0; i<63*61; ++i)
+ {
+ if (out_c_inv[i] != out_sse_inv[i]) {
+ printf("YCoCgRToRGB-SSE inverted FAIL[%d]: 0x%08x -> C 0x%08x vs SSE 0x%08x\n", i,
+ in[i+1], out_c_inv[i], out_sse_inv[i]);
+ failed = TRUE;
+ }
+ }
+ }
+#endif /* i386 */
+ if (!failed) printf("All YCoCgRToRGB_8u_AC4R tests passed (%s).\n", testStr);
+ return (failed > 0) ? FAILURE : SUCCESS;
+}
+
+/* ------------------------------------------------------------------------- */
+STD_SPEED_TEST(
+ ycocg_to_rgb_speed, const BYTE, BYTE, PRIM_NOP,
+ TRUE, general_YCoCgRToRGB_8u_AC4R(src1, 64*4, dst, 64*4, 64, 64, 2, FALSE, FALSE),
+#ifdef WITH_SSE2
+ TRUE, ssse3_YCoCgRToRGB_8u_AC4R(src1, 64*4, dst, 64*4, 64, 64, 2, FALSE, FALSE),
+ PF_EX_SSSE3, TRUE,
+#else
+ FALSE, PRIM_NOP, 0, FALSE,
+#endif
+ FALSE, PRIM_NOP);
+
+int test_YCoCgRToRGB_8u_AC4R_speed(void)
+{
+ INT32 ALIGN(in[4096]);
+ INT32 ALIGN(out[4096]);
+ int i;
+ int size_array[] = { 64 };
+
+ get_random_data(in, sizeof(in));
+
+ ycocg_to_rgb_speed("YCoCgToRGB", "aligned", (const BYTE *) in,
+ 0, 0, (BYTE *) out,
+ size_array, 1, YCOCG_TRIAL_ITERATIONS, TEST_TIME);
+ return SUCCESS;
+}