From: Armin Novak Date: Mon, 13 Feb 2017 15:09:28 +0000 (+0100) Subject: NEON optimized YCoCg conversion. X-Git-Tag: 2.0.0-beta1+android10~43^2~4 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=e60cc9bdd936dbaae58a39f7ca0c37c189c2b27c;p=platform%2Fupstream%2Ffreerdp.git NEON optimized YCoCg conversion. --- diff --git a/libfreerdp/primitives/prim_YCoCg_opt.c b/libfreerdp/primitives/prim_YCoCg_opt.c index f8bee4d..eb93a7b 100644 --- a/libfreerdp/primitives/prim_YCoCg_opt.c +++ b/libfreerdp/primitives/prim_YCoCg_opt.c @@ -435,6 +435,134 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R( dstStep, width, height, shift, withAlpha); } } +#elif defined(WITH_NEON) + /* NEON YCoCg -> RGB conversion for 4-byte pixels. Source bytes are read per pixel in the order Cg, Co, Y, A; results are stored at the byte positions rPos, gPos, bPos and aPos of each destination pixel. Each row is processed 8 pixels at a time with vld4_u8/vst4_u8, and the remaining width % 8 pixels are handled by a scalar loop. The alpha byte is copied from the source when alpha is set, otherwise 0xFF is written. Always returns PRIMITIVES_SUCCESS. */ +static pstatus_t neon_YCoCgToRGB_8u_X( + const BYTE* pSrc, INT32 srcStep, + BYTE* pDst, UINT32 DstFormat, INT32 dstStep, + UINT32 width, UINT32 height, + UINT8 shift, BYTE rPos, BYTE gPos, BYTE bPos, BYTE aPos, BOOL alpha) +{ + UINT32 y; + BYTE* dptr = pDst; + const BYTE* sptr = pSrc; + const DWORD formatSize = GetBytesPerPixel(DstFormat); + const int8_t cll = shift - 1; /* -1 builds in the /2's */ /* NOTE(review): cll is -1 when shift is 0; the scalar loop further down left-shifts by cll, and a negative shift count is undefined behavior in C -- confirm callers never pass shift == 0. */ + const UINT32 srcPad = srcStep - (width * 4); + const UINT32 dstPad = dstStep - (width * formatSize); + const UINT32 pad = width % 8; /* pixels per row left over after the 8-wide loop */ + const uint8x8_t aVal = vdup_n_u8(0xFF); + const int8x8_t cllv = vdup_n_s8(cll); + + for (y = 0; y < height; y++) + { + UINT32 x; + + for (x = 0; x < width - pad; x += 8) + { + /* Note: shifts must be done before sign-conversion. 
*/ + const uint8x8x4_t raw = vld4_u8(sptr); + const int8x8_t CgRaw = vreinterpret_s8_u8(vshl_u8(raw.val[0], cllv)); + const int8x8_t CoRaw = vreinterpret_s8_u8(vshl_u8(raw.val[1], cllv)); + const int16x8_t Cg = vmovl_s8(CgRaw); + const int16x8_t Co = vmovl_s8(CoRaw); + const int16x8_t Y = vreinterpretq_s16_u16(vmovl_u8(raw.val[2])); /* UINT8 -> INT16 */ + const int16x8_t T = vsubq_s16(Y, Cg); + const int16x8_t R = vaddq_s16(T, Co); + const int16x8_t G = vaddq_s16(Y, Cg); + const int16x8_t B = vsubq_s16(T, Co); + uint8x8x4_t bgrx; + bgrx.val[bPos] = vqmovun_s16(B); + bgrx.val[gPos] = vqmovun_s16(G); + bgrx.val[rPos] = vqmovun_s16(R); + + if (alpha) + bgrx.val[aPos] = raw.val[3]; + else + bgrx.val[aPos] = aVal; + + vst4_u8(dptr, bgrx); + sptr += sizeof(raw); + dptr += sizeof(bgrx); + } + /* Scalar loop for the pad pixels that do not fill a group of 8. */ + for (x = 0; x < pad; x++) + { + /* Note: shifts must be done before sign-conversion. */ + const INT16 Cg = (INT16)((INT8)((*sptr++) << cll)); + const INT16 Co = (INT16)((INT8)((*sptr++) << cll)); + const INT16 Y = (INT16)(*sptr++); /* UINT8->INT16 */ + const INT16 T = Y - Cg; + const INT16 R = T + Co; + const INT16 G = Y + Cg; + const INT16 B = T - Co; + BYTE bgra[4]; + bgra[bPos] = CLIP(B); + bgra[gPos] = CLIP(G); + bgra[rPos] = CLIP(R); + bgra[aPos] = *sptr++; + + if (!alpha) + bgra[aPos] = 0xFF; + + *dptr++ = bgra[0]; + *dptr++ = bgra[1]; + *dptr++ = bgra[2]; + *dptr++ = bgra[3]; + } + + sptr += srcPad; + dptr += dstPad; + } + + return PRIMITIVES_SUCCESS; +} /* Selects the R, G, B and A byte positions for DstFormat and forwards to neon_YCoCgToRGB_8u_X; formats not listed in the switch are passed to generic->YCoCgToRGB_8u_AC4R instead. */ +static pstatus_t neon_YCoCgToRGB_8u_AC4R( + const BYTE* pSrc, INT32 srcStep, + BYTE* pDst, UINT32 DstFormat, INT32 dstStep, + UINT32 width, UINT32 height, + UINT8 shift, + BOOL withAlpha) +{ + switch (DstFormat) + { + case PIXEL_FORMAT_BGRA32: + return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, shift, 2, 1, 0, + 3, withAlpha); + + case PIXEL_FORMAT_BGRX32: + return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, shift, 2, 1, 0, + 3, withAlpha); + + case 
PIXEL_FORMAT_RGBA32: + return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, shift, 0, 1, 2, + 3, withAlpha); + + case PIXEL_FORMAT_RGBX32: + return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, shift, 0, 1, 2, + 3, withAlpha); + + case PIXEL_FORMAT_ARGB32: + return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, shift, 1, 2, 3, + 0, withAlpha); + + case PIXEL_FORMAT_XRGB32: + return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, shift, 1, 2, 3, + 0, withAlpha); + + case PIXEL_FORMAT_ABGR32: + return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, shift, 3, 2, 1, + 0, withAlpha); + + case PIXEL_FORMAT_XBGR32: + return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, shift, 3, 2, 1, + 0, withAlpha); + + default: + return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, shift, + withAlpha); + } +} #endif /* WITH_SSE2 */ /* ------------------------------------------------------------------------- */ @@ -454,5 +582,12 @@ void primitives_init_YCoCg_opt(primitives_t* prims) prims->YCoCgToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R; } +#elif defined(WITH_NEON) + /* Use the NEON implementation only if IsProcessorFeaturePresent reports PF_ARM_NEON_INSTRUCTIONS_AVAILABLE. */ + if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE)) + { + prims->YCoCgToRGB_8u_AC4R = neon_YCoCgToRGB_8u_AC4R; + } + #endif /* WITH_SSE2 */ } diff --git a/libfreerdp/primitives/test/TestPrimitivesYCoCg.c b/libfreerdp/primitives/test/TestPrimitivesYCoCg.c index 47785f1..24371fb 100644 --- a/libfreerdp/primitives/test/TestPrimitivesYCoCg.c +++ b/libfreerdp/primitives/test/TestPrimitivesYCoCg.c @@ -22,18 +22,20 @@ #include #include "prim_test.h" +#include /* ------------------------------------------------------------------------- */ /* Functional test: runs the generic and the optimized YCoCgToRGB_8u_AC4R on the same input and reports pixels whose results differ. NOTE(review): part of this hunk (the new loop header and the start of the generic call) appears to have been lost when the page was extracted. */ -static BOOL test_YCoCgRToRGB_8u_AC4R_func(void) +static BOOL test_YCoCgRToRGB_8u_AC4R_func(UINT32 width, UINT32 height) { - BOOL result = TRUE; pstatus_t status; - 
INT32 ALIGN(out_sse[4098]), ALIGN(out_sse_inv[4098]); - INT32 ALIGN(in[4098]); - INT32 ALIGN(out_c[4098]), ALIGN(out_c_inv[4098]); - + BYTE* out_sse = NULL; + BYTE* in = NULL; + BYTE* out_c = NULL; UINT32 i, x; - const UINT32 formats[] = { + const UINT32 srcStride = width * 4; + const UINT32 size = srcStride * height; + const UINT32 formats[] = + { PIXEL_FORMAT_ARGB32, PIXEL_FORMAT_ABGR32, PIXEL_FORMAT_RGBA32, @@ -41,86 +43,77 @@ static BOOL test_YCoCgRToRGB_8u_AC4R_func(void) PIXEL_FORMAT_BGRA32, PIXEL_FORMAT_BGRX32 }; + PROFILER_DEFINE(genericProf); + PROFILER_DEFINE(optProf); + PROFILER_CREATE(genericProf, "YCoCgRToRGB_8u_AC4R-GENERIC"); + PROFILER_CREATE(optProf, "YCoCgRToRGB_8u_AC4R-OPT"); + in = _aligned_malloc(size, 16); + out_c = _aligned_malloc(size, 16); + out_sse = _aligned_malloc(size, 16); - winpr_RAND((BYTE*)in, sizeof(in)); + if (!in || !out_c || !out_sse) + goto fail; /* NOTE(review): status has not been assigned yet on this path, but the code after the fail label returns status == PRIMITIVES_SUCCESS -- confirm and initialize status. */ - for (x=0; xYCoCgToRGB_8u_AC4R( - (const BYTE*)(in + 1), 63 * 4, - (BYTE*) out_c, format, 63 * 4, 63, 61, 2, TRUE); - if (status != PRIMITIVES_SUCCESS) - return FALSE; - status = generic->YCoCgToRGB_8u_AC4R( - (const BYTE*)(in + 1), 63 * 4, - (BYTE*) out_c_inv, format, 63 * 4, 63, 61, 2, TRUE); - if (status != PRIMITIVES_SUCCESS) - return FALSE; + in, srcStride, + out_c, format, dstStride, width, height, 2, TRUE); + PROFILER_EXIT(genericProf); - status = optimized->YCoCgToRGB_8u_AC4R( - (const BYTE*)(in + 1), 63 * 4, - (BYTE*) out_sse, format, 63 * 4, 63, 61, 2, TRUE); if (status != PRIMITIVES_SUCCESS) - return FALSE; + goto fail; + + PROFILER_ENTER(optProf); status = optimized->YCoCgToRGB_8u_AC4R( - (const BYTE*)(in + 1), 63 * 4, - (BYTE*) out_sse_inv, format, 63 * 4, 63, 61, 2, TRUE); + in, srcStride, + out_sse, format, dstStride, width, height, 2, TRUE); + PROFILER_EXIT(optProf); + if (status != PRIMITIVES_SUCCESS) - return FALSE; + goto fail; - for (i = 0; i < 63 * 61; ++i) + if (memcmp(out_c, out_sse, dstStride * height) != 0) { - if (out_c[i] != out_sse[i]) + for (i = 0; i < 
width * height; ++i) { - printf("optimized->YCoCgRToRGB FAIL[%"PRIu32"]: 0x%08"PRIx32" -> C 0x%08"PRIx32" vs optimized 0x%08"PRIx32"\n", - i, in[i + 1], out_c[i], out_sse[i]); - result = FALSE; + const UINT32 c = ReadColor(out_c + 4 * i, format); + const UINT32 sse = ReadColor(out_sse + 4 * i, format); + + if (c != sse) + { + printf("optimized->YCoCgRToRGB FAIL[%s] [%"PRIu32"]: 0x%08"PRIx32" -> C 0x%08"PRIx32" vs optimized 0x%08"PRIx32"\n", + formatName, i, in[i + 1], c, sse); /* NOTE(review): in is a BYTE* in the new code, so in[i + 1] is the single byte at byte offset i + 1, not the source pixel for index i -- confirm intent. */ + status = -1; /* NOTE(review): if this sits inside the per-format loop, the next conversion call assigns status again and this failure could be lost -- confirm. */ + } } } - for (i = 0; i < 63 * 61; ++i) - { - if (out_c_inv[i] != out_sse_inv[i]) - { - printf("optimized->YCoCgRToRGB inverted FAIL[%"PRIu32"]: 0x%08"PRIu32" -> C 0x%08"PRIx32" vs optimized 0x%08"PRIx32"\n", - i, in[i + 1], out_c_inv[i], out_sse_inv[i]); - result = FALSE; - } - } + PROFILER_PRINT(genericProf); + PROFILER_PRINT(optProf); } - return result; -} - -static int test_YCoCgRToRGB_8u_AC4R_speed(void) -{ - INT32 ALIGN(in[4096]); - INT32 ALIGN(out[4096]); - winpr_RAND((BYTE*)in, sizeof(in)); - - if (!speed_test("YCoCgToRGB_8u_AC4R", "aligned", g_Iterations, - (speed_test_fkt)generic->YCoCgToRGB_8u_AC4R, - (speed_test_fkt)optimized->YCoCgToRGB_8u_AC4R, - in, 64 * 4, out, 64 * 4, 64, 64, 2, FALSE, FALSE)) - return FALSE; - - return TRUE; +fail: + PROFILER_FREE(genericProf); + PROFILER_FREE(optProf); + _aligned_free(in); + _aligned_free(out_c); + _aligned_free(out_sse); + return status == PRIMITIVES_SUCCESS; } /* Test entry point: after this patch it runs the functional test at 1920x1080 and returns 1 if that fails, 0 otherwise. */ int TestPrimitivesYCoCg(int argc, char* argv[]) { prim_test_setup(FALSE); - if (!test_YCoCgRToRGB_8u_AC4R_func()) + if (!test_YCoCgRToRGB_8u_AC4R_func(1920, 1080)) return 1; - if (g_TestPrimitivesPerformance) - { - if (!test_YCoCgRToRGB_8u_AC4R_speed()) - return 1; - } - return 0; }