NEON optimized YCoCg conversion.
author Armin Novak <armin.novak@thincast.com>
Mon, 13 Feb 2017 15:09:28 +0000 (16:09 +0100)
committer Armin Novak <armin.novak@thincast.com>
Mon, 13 Feb 2017 15:09:28 +0000 (16:09 +0100)
libfreerdp/primitives/prim_YCoCg_opt.c
libfreerdp/primitives/test/TestPrimitivesYCoCg.c

index f8bee4d..eb93a7b 100644 (file)
@@ -435,6 +435,134 @@ static pstatus_t ssse3_YCoCgRToRGB_8u_AC4R(
                                                           dstStep, width, height, shift, withAlpha);
        }
 }
+#elif defined(WITH_NEON)
+
+/*
+ * NEON implementation of the YCoCg -> RGB conversion; writes four bytes per
+ * destination pixel.
+ *
+ * Every source pixel is read as four bytes in the order Cg, Co, Y, alpha.
+ * Cg and Co are shifted left by (shift - 1) as unsigned bytes, reinterpreted as
+ * signed 8 bit values and widened to 16 bit. The colour is then computed as
+ *   T = Y - Cg,  R = T + Co,  G = Y + Cg,  B = T - Co
+ * and narrowed back to unsigned bytes (vqmovun_s16 in the vector loop, CLIP()
+ * in the scalar loop).
+ *
+ * rPos, gPos, bPos and aPos are the byte indices (0..3) of the respective
+ * channel inside one destination pixel. If alpha is FALSE the alpha byte is
+ * written as 0xFF, otherwise the source alpha byte is copied.
+ *
+ * Each row is processed eight pixels at a time; the remaining (width % 8)
+ * pixels are handled by the scalar loop. srcStep and dstStep are the row
+ * strides in bytes. Always returns PRIMITIVES_SUCCESS.
+ */
+static pstatus_t neon_YCoCgToRGB_8u_X(
+    const BYTE* pSrc, INT32 srcStep,
+    BYTE* pDst, UINT32 DstFormat, INT32 dstStep,
+    UINT32 width, UINT32 height,
+    UINT8 shift, BYTE rPos, BYTE gPos, BYTE bPos, BYTE aPos, BOOL alpha)
+{
+       UINT32 y;
+       BYTE* dptr = pDst;
+       const BYTE* sptr = pSrc;
+       const DWORD formatSize = GetBytesPerPixel(DstFormat);
+       const int8_t cll = shift - 1;  /* -1 builds in the /2's */
+       const UINT32 srcPad = srcStep - (width * 4); /* bytes between the end of a source row and the next row */
+       const UINT32 dstPad = dstStep - (width * formatSize); /* same for the destination rows */
+       const UINT32 pad = width % 8; /* pixels per row left over for the scalar loop */
+       const uint8x8_t aVal = vdup_n_u8(0xFF); /* alpha value used when the source alpha is ignored */
+       const int8x8_t cllv = vdup_n_s8(cll); /* per-lane shift count for vshl_u8 */
+
+       for (y = 0; y < height; y++)
+       {
+               UINT32 x;
+
+               /* Vector part: eight pixels (32 source bytes) per iteration. */
+               for (x = 0; x < width - pad; x += 8)
+               {
+                       /* Note: shifts must be done before sign-conversion. */
+                       const uint8x8x4_t raw = vld4_u8(sptr); /* de-interleave: val[0]=Cg, val[1]=Co, val[2]=Y, val[3]=alpha */
+                       const int8x8_t CgRaw = vreinterpret_s8_u8(vshl_u8(raw.val[0], cllv));
+                       const int8x8_t CoRaw = vreinterpret_s8_u8(vshl_u8(raw.val[1], cllv));
+                       const int16x8_t Cg = vmovl_s8(CgRaw);
+                       const int16x8_t Co = vmovl_s8(CoRaw);
+                       const int16x8_t Y = vreinterpretq_s16_u16(vmovl_u8(raw.val[2]));        /* UINT8 -> INT16 */
+                       const int16x8_t T  = vsubq_s16(Y, Cg);
+                       const int16x8_t R  = vaddq_s16(T, Co);
+                       const int16x8_t G  = vaddq_s16(Y, Cg);
+                       const int16x8_t B  = vsubq_s16(T, Co);
+                       uint8x8x4_t bgrx;
+                       bgrx.val[bPos] = vqmovun_s16(B); /* saturating narrow to unsigned 8 bit */
+                       bgrx.val[gPos] = vqmovun_s16(G);
+                       bgrx.val[rPos] = vqmovun_s16(R);
+
+                       if (alpha)
+                               bgrx.val[aPos] = raw.val[3];
+                       else
+                               bgrx.val[aPos] = aVal;
+
+                       vst4_u8(dptr, bgrx); /* re-interleave the four channel vectors into eight output pixels */
+                       sptr += sizeof(raw);
+                       dptr += sizeof(bgrx);
+               }
+
+               /* Scalar part: same arithmetic for the remaining pixels of the row. */
+               for (x = 0; x < pad; x++)
+               {
+                       /* Note: shifts must be done before sign-conversion. */
+                       const INT16 Cg = (INT16)((INT8)((*sptr++) << cll));
+                       const INT16 Co = (INT16)((INT8)((*sptr++) << cll));
+                       const INT16 Y = (INT16)(*sptr++);       /* UINT8->INT16 */
+                       const INT16 T  = Y - Cg;
+                       const INT16 R  = T + Co;
+                       const INT16 G  = Y + Cg;
+                       const INT16 B  = T - Co;
+                       BYTE bgra[4];
+                       bgra[bPos] = CLIP(B);
+                       bgra[gPos] = CLIP(G);
+                       bgra[rPos] = CLIP(R);
+                       bgra[aPos] = *sptr++; /* always consume the source alpha byte so sptr stays pixel aligned */
+
+                       if (!alpha)
+                               bgra[aPos] = 0xFF;
+
+                       *dptr++ = bgra[0];
+                       *dptr++ = bgra[1];
+                       *dptr++ = bgra[2];
+                       *dptr++ = bgra[3];
+               }
+
+               sptr += srcPad;
+               dptr += dstPad;
+       }
+
+       return PRIMITIVES_SUCCESS;
+}
+/*
+ * NEON entry point for YCoCgToRGB_8u_AC4R.
+ *
+ * Translates DstFormat into the byte positions of the red, green, blue and
+ * alpha channel inside one destination pixel and hands the conversion to
+ * neon_YCoCgToRGB_8u_X. Any format not listed below is delegated to the
+ * generic implementation. The return value is that of the function called.
+ */
+static pstatus_t neon_YCoCgToRGB_8u_AC4R(
+    const BYTE* pSrc, INT32 srcStep,
+    BYTE* pDst, UINT32 DstFormat, INT32 dstStep,
+    UINT32 width, UINT32 height,
+    UINT8 shift,
+    BOOL withAlpha)
+{
+       BYTE redIdx;
+       BYTE greenIdx;
+       BYTE blueIdx;
+       BYTE alphaIdx;
+
+       switch (DstFormat)
+       {
+               case PIXEL_FORMAT_BGRA32:
+               case PIXEL_FORMAT_BGRX32:
+                       redIdx = 2;
+                       greenIdx = 1;
+                       blueIdx = 0;
+                       alphaIdx = 3;
+                       break;
+
+               case PIXEL_FORMAT_RGBA32:
+               case PIXEL_FORMAT_RGBX32:
+                       redIdx = 0;
+                       greenIdx = 1;
+                       blueIdx = 2;
+                       alphaIdx = 3;
+                       break;
+
+               case PIXEL_FORMAT_ARGB32:
+               case PIXEL_FORMAT_XRGB32:
+                       redIdx = 1;
+                       greenIdx = 2;
+                       blueIdx = 3;
+                       alphaIdx = 0;
+                       break;
+
+               case PIXEL_FORMAT_ABGR32:
+               case PIXEL_FORMAT_XBGR32:
+                       redIdx = 3;
+                       greenIdx = 2;
+                       blueIdx = 1;
+                       alphaIdx = 0;
+                       break;
+
+               default:
+                       /* No NEON path for this layout. */
+                       return generic->YCoCgToRGB_8u_AC4R(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, shift,
+                                                          withAlpha);
+       }
+
+       return neon_YCoCgToRGB_8u_X(pSrc, srcStep, pDst, DstFormat, dstStep, width, height, shift,
+                                   redIdx, greenIdx, blueIdx, alphaIdx, withAlpha);
+}
 #endif /* WITH_SSE2 */
 
 /* ------------------------------------------------------------------------- */
@@ -454,5 +582,12 @@ void primitives_init_YCoCg_opt(primitives_t* prims)
                prims->YCoCgToRGB_8u_AC4R = ssse3_YCoCgRToRGB_8u_AC4R;
        }
 
+#elif defined(WITH_NEON)
+
+       if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
+       {
+               prims->YCoCgToRGB_8u_AC4R = neon_YCoCgToRGB_8u_AC4R;
+       }
+
 #endif /* WITH_SSE2 */
 }
index 47785f1..24371fb 100644 (file)
 
 #include <winpr/sysinfo.h>
 #include "prim_test.h"
+#include <freerdp/utils/profiler.h>
 
 /* ------------------------------------------------------------------------- */
-static BOOL test_YCoCgRToRGB_8u_AC4R_func(void)
+/*
+ * Convert one random width x height YCoCg image with the generic and the optimized
+ * YCoCgToRGB_8u_AC4R primitive for each destination format and compare the outputs.
+ * Returns TRUE only if every conversion succeeded and all results are identical;
+ * the first failing format aborts the test.
+ */
+static BOOL test_YCoCgRToRGB_8u_AC4R_func(UINT32 width, UINT32 height)
 {
-       BOOL result = TRUE;
-       pstatus_t status;
-       INT32 ALIGN(out_sse[4098]), ALIGN(out_sse_inv[4098]);
-       INT32 ALIGN(in[4098]);
-       INT32 ALIGN(out_c[4098]), ALIGN(out_c_inv[4098]);
-
+       pstatus_t status = -1; /* returned as failure if an allocation fails before any conversion ran */
+       BYTE* out_sse = NULL;
+       BYTE* in = NULL;
+       BYTE* out_c = NULL;
        UINT32 i, x;
-       const UINT32 formats[] = {
+       const UINT32 srcStride = width * 4;
+       const UINT32 size = srcStride * height;
+       const UINT32 formats[] =
+       {
                PIXEL_FORMAT_ARGB32,
                PIXEL_FORMAT_ABGR32,
                PIXEL_FORMAT_RGBA32,
@@ -41,86 +49,85 @@ static BOOL test_YCoCgRToRGB_8u_AC4R_func(void)
                PIXEL_FORMAT_BGRA32,
                PIXEL_FORMAT_BGRX32
        };
+       PROFILER_DEFINE(genericProf);
+       PROFILER_DEFINE(optProf);
+       PROFILER_CREATE(genericProf, "YCoCgRToRGB_8u_AC4R-GENERIC");
+       PROFILER_CREATE(optProf, "YCoCgRToRGB_8u_AC4R-OPT");
+       in = _aligned_malloc(size, 16);
+       out_c = _aligned_malloc(size, 16);
+       out_sse = _aligned_malloc(size, 16);
 
-       winpr_RAND((BYTE*)in, sizeof(in));
+       if (!in || !out_c || !out_sse)
+               goto fail;
 
-       for (x=0; x<sizeof(formats)/sizeof(formats[0]); x++)
-       {
-               UINT32 format = formats[x];
+       winpr_RAND(in, size); /* fill the whole buffer; sizeof(in) is only the size of the pointer */
 
+       for (x = 0; x < sizeof(formats) / sizeof(formats[0]); x++)
+       {
+               const UINT32 format = formats[x];
+               const UINT32 dstStride = width * GetBytesPerPixel(format);
+               const char* formatName = GetColorFormatName(format);
+               PROFILER_ENTER(genericProf);
                status = generic->YCoCgToRGB_8u_AC4R(
-                                (const BYTE*)(in + 1), 63 * 4,
-                                (BYTE*) out_c, format, 63 * 4, 63, 61, 2, TRUE);
-               if (status != PRIMITIVES_SUCCESS)
-                       return FALSE;
-               status = generic->YCoCgToRGB_8u_AC4R(
-                                (const BYTE*)(in + 1), 63 * 4,
-                                (BYTE*) out_c_inv, format, 63 * 4, 63, 61, 2, TRUE);
-               if (status != PRIMITIVES_SUCCESS)
-                       return FALSE;
+                            in, srcStride,
+                            out_c, format, dstStride, width, height, 2, TRUE);
+               PROFILER_EXIT(genericProf);
 
-               status = optimized->YCoCgToRGB_8u_AC4R(
-                                (const BYTE*)(in + 1), 63 * 4,
-                                (BYTE*) out_sse, format, 63 * 4, 63, 61, 2, TRUE);
                if (status != PRIMITIVES_SUCCESS)
-                       return FALSE;
+                       goto fail;
+
+               PROFILER_ENTER(optProf);
                status = optimized->YCoCgToRGB_8u_AC4R(
-                                (const BYTE*)(in + 1), 63 * 4,
-                                (BYTE*) out_sse_inv, format, 63 * 4, 63, 61, 2, TRUE);
+                            in, srcStride,
+                            out_sse, format, dstStride, width, height, 2, TRUE);
+               PROFILER_EXIT(optProf);
+
                if (status != PRIMITIVES_SUCCESS)
-                       return FALSE;
+                       goto fail;
 
-               for (i = 0; i < 63 * 61; ++i)
+               if (memcmp(out_c, out_sse, dstStride * height) != 0)
                {
-                       if (out_c[i] != out_sse[i])
+                       for (i = 0; i < width * height; ++i)
                        {
-                               printf("optimized->YCoCgRToRGB FAIL[%"PRIu32"]: 0x%08"PRIx32" -> C 0x%08"PRIx32" vs optimized 0x%08"PRIx32"\n",
-                                          i, in[i + 1], out_c[i], out_sse[i]);
-                               result = FALSE;
+                               const UINT32 c = ReadColor(out_c + 4 * i, format);
+                               const UINT32 sse = ReadColor(out_sse + 4 * i, format);
+
+                               if (c != sse)
+                               {
+                                       /* The four raw source bytes of pixel i, first byte in the low bits. */
+                                       const BYTE* s = in + 4 * i;
+                                       const UINT32 src = ((UINT32)s[3] << 24) | ((UINT32)s[2] << 16) |
+                                                          ((UINT32)s[1] << 8) | s[0];
+                                       printf("optimized->YCoCgRToRGB FAIL[%s] [%"PRIu32"]: 0x%08"PRIx32" -> C 0x%08"PRIx32" vs optimized 0x%08"PRIx32"\n",
+                                              formatName, i, src, c, sse);
+                                       status = -1;
+                               }
                        }
                }
 
-               for (i = 0; i < 63 * 61; ++i)
-               {
-                       if (out_c_inv[i] != out_sse_inv[i])
-                       {
-                               printf("optimized->YCoCgRToRGB inverted FAIL[%"PRIu32"]: 0x%08"PRIu32" -> C 0x%08"PRIx32" vs optimized 0x%08"PRIx32"\n",
-                                      i, in[i + 1], out_c_inv[i], out_sse_inv[i]);
-                               result = FALSE;
-                       }
-               }
+               PROFILER_PRINT(genericProf);
+               PROFILER_PRINT(optProf);
+
+               /* The next iteration overwrites status, so a mismatch has to end the test here. */
+               if (status != PRIMITIVES_SUCCESS)
+                       goto fail;
        }
-       return result;
-}
-
-static int test_YCoCgRToRGB_8u_AC4R_speed(void)
-{
-       INT32 ALIGN(in[4096]);
-       INT32 ALIGN(out[4096]);
 
-       winpr_RAND((BYTE*)in, sizeof(in));
-
-       if (!speed_test("YCoCgToRGB_8u_AC4R", "aligned", g_Iterations,
-                       (speed_test_fkt)generic->YCoCgToRGB_8u_AC4R,
-                       (speed_test_fkt)optimized->YCoCgToRGB_8u_AC4R,
-                       in, 64 * 4, out, 64 * 4, 64, 64, 2, FALSE, FALSE))
-               return FALSE;
-
-       return TRUE;
+fail:
+       PROFILER_FREE(genericProf);
+       PROFILER_FREE(optProf);
+       _aligned_free(in);
+       _aligned_free(out_c);
+       _aligned_free(out_sse);
+       return status == PRIMITIVES_SUCCESS;
 }
 
 int TestPrimitivesYCoCg(int argc, char* argv[])
 {
        prim_test_setup(FALSE);
 
-       if (!test_YCoCgRToRGB_8u_AC4R_func())
+       if (!test_YCoCgRToRGB_8u_AC4R_func(1920, 1080)) /* compare generic and optimized output on a 1920x1080 image */
                return 1;
 
-       if (g_TestPrimitivesPerformance)
-       {
-               if (!test_YCoCgRToRGB_8u_AC4R_speed())
-                       return 1;
-       }
-
        return 0;
 }