dstStep, width, height, shift, withAlpha);
}
}
+#elif defined(WITH_NEON)
+
+/*
+ * Convert an image of 4-byte source pixels laid out as {Cg, Co, Y, A} into a
+ * 4-byte-per-pixel destination using NEON.
+ *
+ * pSrc/srcStep   source buffer and its row stride in bytes
+ * pDst/dstStep   destination buffer and its row stride in bytes
+ * DstFormat      destination pixel format; only used to compute the row padding
+ * width/height   image size in pixels
+ * shift          chroma shift; (shift - 1) is applied to Cg and Co
+ * rPos..aPos     byte index (0..3) of each channel inside a destination pixel
+ * alpha          TRUE copies the source alpha byte, FALSE writes 0xFF
+ *
+ * Always returns PRIMITIVES_SUCCESS.
+ */
+static pstatus_t neon_YCoCgToRGB_8u_X(
+    const BYTE* pSrc, INT32 srcStep,
+    BYTE* pDst, UINT32 DstFormat, INT32 dstStep,
+    UINT32 width, UINT32 height,
+    UINT8 shift, BYTE rPos, BYTE gPos, BYTE bPos, BYTE aPos, BOOL alpha)
+{
+    const DWORD bytesPerPixel = GetBytesPerPixel(DstFormat);
+    const int8_t chromaShift = shift - 1; /* -1 builds in the /2's */
+    const UINT32 tailPixels = width % 8;          /* leftovers handled one by one */
+    const UINT32 simdPixels = width - tailPixels; /* handled 8 at a time */
+    const UINT32 srcSkip = srcStep - (width * 4);
+    const UINT32 dstSkip = dstStep - (width * bytesPerPixel);
+    const uint8x8_t opaque = vdup_n_u8(0xFF);
+    const int8x8_t shiftVec = vdup_n_s8(chromaShift);
+    const BYTE* in = pSrc;
+    BYTE* out = pDst;
+    UINT32 row;
+
+    for (row = 0; row < height; row++)
+    {
+        UINT32 col;
+
+        /* Vector part: de-interleave 8 pixels into four 8-lane channel vectors. */
+        for (col = 0; col < simdPixels; col += 8)
+        {
+            const uint8x8x4_t px = vld4_u8(in);
+            /* The shift has to happen on the unsigned bytes, before they are
+             * reinterpreted as signed and widened. */
+            const int8x8_t cg8 = vreinterpret_s8_u8(vshl_u8(px.val[0], shiftVec));
+            const int8x8_t co8 = vreinterpret_s8_u8(vshl_u8(px.val[1], shiftVec));
+            const int16x8_t cg = vmovl_s8(cg8);
+            const int16x8_t co = vmovl_s8(co8);
+            const int16x8_t luma = vreinterpretq_s16_u16(vmovl_u8(px.val[2])); /* UINT8 -> INT16 */
+            const int16x8_t tmp = vsubq_s16(luma, cg);
+            const int16x8_t red = vaddq_s16(tmp, co);
+            const int16x8_t green = vaddq_s16(luma, cg);
+            const int16x8_t blue = vsubq_s16(tmp, co);
+            uint8x8x4_t res;
+            /* Saturating narrow back to unsigned bytes. */
+            res.val[bPos] = vqmovun_s16(blue);
+            res.val[gPos] = vqmovun_s16(green);
+            res.val[rPos] = vqmovun_s16(red);
+
+            if (alpha)
+                res.val[aPos] = px.val[3];
+            else
+                res.val[aPos] = opaque;
+
+            vst4_u8(out, res);
+            in += sizeof(px);
+            out += sizeof(res);
+        }
+
+        /* Scalar part: same arithmetic for the remaining (width % 8) pixels. */
+        for (col = 0; col < tailPixels; col++)
+        {
+            /* Shift first, then truncate to a signed byte, then widen. */
+            const INT16 cg = (INT16)((INT8)(in[0] << chromaShift));
+            const INT16 co = (INT16)((INT8)(in[1] << chromaShift));
+            const INT16 luma = (INT16)in[2]; /* UINT8->INT16 */
+            const INT16 tmp = luma - cg;
+            const INT16 red = tmp + co;
+            const INT16 green = luma + cg;
+            const INT16 blue = tmp - co;
+            BYTE res[4];
+            res[bPos] = CLIP(blue);
+            res[gPos] = CLIP(green);
+            res[rPos] = CLIP(red);
+            res[aPos] = in[3];
+
+            if (!alpha)
+                res[aPos] = 0xFF;
+
+            out[0] = res[0];
+            out[1] = res[1];
+            out[2] = res[2];
+            out[3] = res[3];
+            in += 4;
+            out += 4;
+        }
+
+        /* Step over the row padding of both buffers. */
+        in += srcSkip;
+        out += dstSkip;
+    }
+
+    return PRIMITIVES_SUCCESS;
+}
+/*
+ * NEON implementation of YCoCgToRGB_8u_AC4R.
+ *
+ * Picks the byte position of each channel for the requested 32-bit
+ * destination format and forwards to neon_YCoCgToRGB_8u_X. Any other
+ * format is handed to the generic implementation unchanged.
+ */
+static pstatus_t neon_YCoCgToRGB_8u_AC4R(
+    const BYTE* pSrc, INT32 srcStep,
+    BYTE* pDst, UINT32 DstFormat, INT32 dstStep,
+    UINT32 width, UINT32 height,
+    UINT8 shift,
+    BOOL withAlpha)
+{
+    BYTE rPos;
+    BYTE gPos;
+    BYTE bPos;
+    BYTE aPos;
+
+    switch (DstFormat)
+    {
+        case PIXEL_FORMAT_BGRA32:
+        case PIXEL_FORMAT_BGRX32:
+            rPos = 2;
+            gPos = 1;
+            bPos = 0;
+            aPos = 3;
+            break;
+
+        case PIXEL_FORMAT_RGBA32:
+        case PIXEL_FORMAT_RGBX32:
+            rPos = 0;
+            gPos = 1;
+            bPos = 2;
+            aPos = 3;
+            break;
+
+        case PIXEL_FORMAT_ARGB32:
+        case PIXEL_FORMAT_XRGB32:
+            rPos = 1;
+            gPos = 2;
+            bPos = 3;
+            aPos = 0;
+            break;
+
+        case PIXEL_FORMAT_ABGR32:
+        case PIXEL_FORMAT_XBGR32:
+            rPos = 3;
+            gPos = 2;
+            bPos = 1;
+            aPos = 0;
+            break;
+
+        default:
+            /* No NEON path for this format. */
+            return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep,
+                                               width, height, shift, withAlpha);
+    }
+
+    return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height,
+                                shift, rPos, gPos, bPos, aPos, withAlpha);
+}
#endif /* WITH_SSE2 */
/* ------------------------------------------------------------------------- */
prims->YCoCgToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R;
}
+#elif defined(WITH_NEON)
+
+ if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
+ {
+ prims->YCoCgToRGB_8u_AC4R = neon_YCoCgToRGB_8u_AC4R;
+ }
+
#endif /* WITH_SSE2 */
}
#include <winpr/sysinfo.h>
#include "prim_test.h"
+#include <freerdp/utils/profiler.h>
/* ------------------------------------------------------------------------- */
-static BOOL test_YCoCgRToRGB_8u_AC4R_func(void)
+/*
+ * Compare the generic and the optimized YCoCgToRGB_8u_AC4R implementations.
+ *
+ * Allocates one source and two destination buffers of width * height 4-byte
+ * pixels, fills the whole source via winpr_RAND, and for every format in
+ * formats[] runs both implementations (shift 2, alpha enabled) under a
+ * profiler. The outputs are compared with memcmp; on a difference each pixel
+ * is re-read with ReadColor and every mismatch is printed.
+ *
+ * Returns TRUE only if all allocations succeeded, every call returned
+ * PRIMITIVES_SUCCESS and no pixel mismatch was found in any format.
+ */
+static BOOL test_YCoCgRToRGB_8u_AC4R_func(UINT32 width, UINT32 height)
{
- BOOL result = TRUE;
- pstatus_t status;
- INT32 ALIGN(out_sse[4098]), ALIGN(out_sse_inv[4098]);
- INT32 ALIGN(in[4098]);
- INT32 ALIGN(out_c[4098]), ALIGN(out_c_inv[4098]);
-
+ /* Start as failed so the allocation-failure path never reads an
+ * uninitialized value at the fail label. */
+ pstatus_t status = -1;
+ BYTE* out_sse = NULL;
+ BYTE* in = NULL;
+ BYTE* out_c = NULL;
UINT32 i, x;
- const UINT32 formats[] = {
+ const UINT32 srcStride = width * 4;
+ const UINT32 size = srcStride * height;
+ const UINT32 formats[] =
+ {
PIXEL_FORMAT_ARGB32,
PIXEL_FORMAT_ABGR32,
PIXEL_FORMAT_RGBA32,
PIXEL_FORMAT_BGRA32,
PIXEL_FORMAT_BGRX32
};
+ PROFILER_DEFINE(genericProf);
+ PROFILER_DEFINE(optProf);
+ PROFILER_CREATE(genericProf, "YCoCgRToRGB_8u_AC4R-GENERIC");
+ PROFILER_CREATE(optProf, "YCoCgRToRGB_8u_AC4R-OPT");
+ in = _aligned_malloc(size, 16);
+ out_c = _aligned_malloc(size, 16);
+ out_sse = _aligned_malloc(size, 16);
- winpr_RAND((BYTE*)in, sizeof(in));
+ if (!in || !out_c || !out_sse)
+ goto fail;
- for (x=0; x<sizeof(formats)/sizeof(formats[0]); x++)
- {
- UINT32 format = formats[x];
+ /* in is a pointer now: pass the allocation size, not sizeof(in). */
+ winpr_RAND(in, size);
+ for (x = 0; x < sizeof(formats) / sizeof(formats[0]); x++)
+ {
+ const UINT32 format = formats[x];
+ const UINT32 dstStride = width * GetBytesPerPixel(format);
+ const char* formatName = GetColorFormatName(format);
+ PROFILER_ENTER(genericProf);
status = generic->YCoCgToRGB_8u_AC4R(
- (const BYTE*)(in + 1), 63 * 4,
- (BYTE*) out_c, format, 63 * 4, 63, 61, 2, TRUE);
- if (status != PRIMITIVES_SUCCESS)
- return FALSE;
- status = generic->YCoCgToRGB_8u_AC4R(
- (const BYTE*)(in + 1), 63 * 4,
- (BYTE*) out_c_inv, format, 63 * 4, 63, 61, 2, TRUE);
- if (status != PRIMITIVES_SUCCESS)
- return FALSE;
+ in, srcStride,
+ out_c, format, dstStride, width, height, 2, TRUE);
+ PROFILER_EXIT(genericProf);
- status = optimized->YCoCgToRGB_8u_AC4R(
- (const BYTE*)(in + 1), 63 * 4,
- (BYTE*) out_sse, format, 63 * 4, 63, 61, 2, TRUE);
if (status != PRIMITIVES_SUCCESS)
- return FALSE;
+ goto fail;
+
+ PROFILER_ENTER(optProf);
status = optimized->YCoCgToRGB_8u_AC4R(
- (const BYTE*)(in + 1), 63 * 4,
- (BYTE*) out_sse_inv, format, 63 * 4, 63, 61, 2, TRUE);
+ in, srcStride,
+ out_sse, format, dstStride, width, height, 2, TRUE);
+ PROFILER_EXIT(optProf);
+
if (status != PRIMITIVES_SUCCESS)
- return FALSE;
+ goto fail;
- for (i = 0; i < 63 * 61; ++i)
+ if (memcmp(out_c, out_sse, dstStride * height) != 0)
{
- if (out_c[i] != out_sse[i])
+ for (i = 0; i < width * height; ++i)
{
- printf("optimized->YCoCgRToRGB FAIL[%"PRIu32"]: 0x%08"PRIx32" -> C 0x%08"PRIx32" vs optimized 0x%08"PRIx32"\n",
- i, in[i + 1], out_c[i], out_sse[i]);
- result = FALSE;
+ const UINT32 c = ReadColor(out_c + 4 * i, format);
+ const UINT32 sse = ReadColor(out_sse + 4 * i, format);
+
+ if (c != sse)
+ {
+ /* Show the four source bytes of pixel i (lowest address in the
+ * low byte) rather than a single unrelated byte. */
+ const BYTE* px = in + 4 * i;
+ const UINT32 src = ((UINT32)px[3] << 24) | ((UINT32)px[2] << 16) |
+ ((UINT32)px[1] << 8) | (UINT32)px[0];
+ printf("optimized->YCoCgRToRGB FAIL[%s] [%"PRIu32"]: 0x%08"PRIx32" -> C 0x%08"PRIx32" vs optimized 0x%08"PRIx32"\n",
+ formatName, i, src, c, sse);
+ status = -1;
+ }
}
}
- for (i = 0; i < 63 * 61; ++i)
- {
- if (out_c_inv[i] != out_sse_inv[i])
- {
- printf("optimized->YCoCgRToRGB inverted FAIL[%"PRIu32"]: 0x%08"PRIu32" -> C 0x%08"PRIx32" vs optimized 0x%08"PRIx32"\n",
- i, in[i + 1], out_c_inv[i], out_sse_inv[i]);
- result = FALSE;
- }
- }
+ /* Stop here on a mismatch: the next iteration would overwrite status
+ * and the failure would be lost. */
+ if (status != PRIMITIVES_SUCCESS)
+ goto fail;
+
+ PROFILER_PRINT(genericProf);
+ PROFILER_PRINT(optProf);
}
- return result;
-}
-
-static int test_YCoCgRToRGB_8u_AC4R_speed(void)
-{
- INT32 ALIGN(in[4096]);
- INT32 ALIGN(out[4096]);
- winpr_RAND((BYTE*)in, sizeof(in));
-
- if (!speed_test("YCoCgToRGB_8u_AC4R", "aligned", g_Iterations,
- (speed_test_fkt)generic->YCoCgToRGB_8u_AC4R,
- (speed_test_fkt)optimized->YCoCgToRGB_8u_AC4R,
- in, 64 * 4, out, 64 * 4, 64, 64, 2, FALSE, FALSE))
- return FALSE;
-
- return TRUE;
+fail:
+ PROFILER_FREE(genericProf);
+ PROFILER_FREE(optProf);
+ _aligned_free(in);
+ _aligned_free(out_c);
+ _aligned_free(out_sse);
+ return status == PRIMITIVES_SUCCESS;
}
+/*
+ * Test entry point: returns 0 when every YCoCg conversion check passes and
+ * 1 as soon as one of them fails.
+ */
int TestPrimitivesYCoCg(int argc, char* argv[])
{
prim_test_setup(FALSE);
- if (!test_YCoCgRToRGB_8u_AC4R_func())
+ /* Full HD: the width is a multiple of 8. */
+ if (!test_YCoCgRToRGB_8u_AC4R_func(1920, 1080))
return 1;
- if (g_TestPrimitivesPerformance)
- {
- if (!test_YCoCgRToRGB_8u_AC4R_speed())
- return 1;
- }
-
+ /* 63x61 (the size the previous test used): the width is not a multiple
+ * of 8, so the per-pixel remainder loop of the NEON path is run too. */
+ if (!test_YCoCgRToRGB_8u_AC4R_func(63, 61))
+ return 1;
+
return 0;
}