From: Armin Novak Date: Thu, 25 Jan 2018 08:43:48 +0000 (+0100) Subject: Split optimized YUV implementations to separate files. X-Git-Tag: 2.0.0-rc2~78^2~3 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=f27ee2e3ab886d0e21e4974e925551197d0a0504;p=platform%2Fupstream%2Ffreerdp.git Split optimized YUV implementations to separate files. --- diff --git a/libfreerdp/CMakeLists.txt b/libfreerdp/CMakeLists.txt index 9516d9f..ed29d7c 100644 --- a/libfreerdp/CMakeLists.txt +++ b/libfreerdp/CMakeLists.txt @@ -222,8 +222,17 @@ set(PRIMITIVES_SSE3_SRCS set(PRIMITIVES_SSSE3_SRCS primitives/prim_sign_opt.c - primitives/prim_YCoCg_opt.c - primitives/prim_YUV_opt.c) + primitives/prim_YCoCg_opt.c) + +if (WITH_SSE2) + set(PRIMITIVES_SSSE3_SRCS ${PRIMITIVES_SSSE3_SRCS} + primitives/prim_YUV_ssse3.c) +endif() + +if (WITH_NEON) + set(PRIMITIVES_SSSE3_SRCS ${PRIMITIVES_SSSE3_SRCS} + primitives/prim_YUV_neon.c) +endif() set(PRIMITIVES_OPT_SRCS ${PRIMITIVES_SSE2_SRCS} diff --git a/libfreerdp/primitives/prim_YUV_neon.c b/libfreerdp/primitives/prim_YUV_neon.c new file mode 100644 index 0000000..885b856 --- /dev/null +++ b/libfreerdp/primitives/prim_YUV_neon.c @@ -0,0 +1,771 @@ +/** + * FreeRDP: A Remote Desktop Protocol Implementation + * Optimized YUV/RGB conversion operations + * + * Copyright 2014 Thomas Erbesdobler + * Copyright 2016-2017 Armin Novak + * Copyright 2016-2017 Norbert Federa + * Copyright 2016-2017 Thincast Technologies GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include + +#include "prim_internal.h" + +#if !defined(WITH_NEON) +#error "This file must only be included if WITH_NEON is active!" +#endif + +#include + +static primitives_t* generic = NULL; + +static INLINE uint8x8_t neon_YUV2R(int32x4_t Ch, int32x4_t Cl, + int16x4_t Dh, int16x4_t Dl, + int16x4_t Eh, int16x4_t El) +{ + /* R = (256 * Y + 403 * (V - 128)) >> 8 */ + const int16x4_t c403 = vdup_n_s16(403); + const int32x4_t CEh = vmlal_s16(Ch, Eh, c403); + const int32x4_t CEl = vmlal_s16(Cl, El, c403); + const int32x4_t Rh = vrshrq_n_s32(CEh, 8); + const int32x4_t Rl = vrshrq_n_s32(CEl, 8); + const int16x8_t R = vcombine_s16(vqmovn_s32(Rl), vqmovn_s32(Rh)); + return vqmovun_s16(R); +} + +static INLINE uint8x8_t neon_YUV2G(int32x4_t Ch, int32x4_t Cl, + int16x4_t Dh, int16x4_t Dl, + int16x4_t Eh, int16x4_t El) +{ + /* G = (256L * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8 */ + const int16x4_t c48 = vdup_n_s16(48); + const int16x4_t c120 = vdup_n_s16(120); + const int32x4_t CDh = vmlsl_s16(Ch, Dh, c48); + const int32x4_t CDl = vmlsl_s16(Cl, Dl, c48); + const int32x4_t CDEh = vmlsl_s16(CDh, Eh, c120); + const int32x4_t CDEl = vmlsl_s16(CDl, El, c120); + const int32x4_t Gh = vrshrq_n_s32(CDEh, 8); + const int32x4_t Gl = vrshrq_n_s32(CDEl, 8); + const int16x8_t G = vcombine_s16(vqmovn_s32(Gl), vqmovn_s32(Gh)); + return vqmovun_s16(G); +} + +static INLINE uint8x8_t neon_YUV2B(int32x4_t Ch, int32x4_t Cl, + int16x4_t Dh, int16x4_t Dl, + int16x4_t Eh, int16x4_t El) +{ + /* B = (256L * Y + 475 * (U - 128)) >> 8*/ + const int16x4_t c475 = vdup_n_s16(475); + const int32x4_t CDh = vmlal_s16(Ch, Dh, c475); + const int32x4_t CDl = vmlal_s16(Ch, Dl, c475); + const int32x4_t Bh = vrshrq_n_s32(CDh, 8); + const int32x4_t Bl = vrshrq_n_s32(CDl, 8); + const int16x8_t B = vcombine_s16(vqmovn_s32(Bl), vqmovn_s32(Bh)); + return vqmovun_s16(B); +} + +static INLINE BYTE* neon_YuvToRgbPixel(BYTE* pRGB, int16x8_t Y, int16x8_t D, int16x8_t E, + const uint8_t rPos, const uint8_t gPos, const uint8_t bPos, const uint8_t aPos) +{ + uint8x8x4_t bgrx; + const int32x4_t Ch = vmulq_n_s32(vmovl_s16(vget_high_s16(Y)), 256); /* Y * 256 */ + const int32x4_t Cl = vmulq_n_s32(vmovl_s16(vget_low_s16(Y)), 256); /* Y * 256 */ + const int16x4_t Dh = vget_high_s16(D); + const int16x4_t Dl = vget_low_s16(D); + const int16x4_t Eh = vget_high_s16(E); + const int16x4_t El = vget_low_s16(E); + { + /* B = (256L * Y + 475 * (U - 128)) >> 8*/ + const int16x4_t c475 = vdup_n_s16(475); + const int32x4_t CDh = vmlal_s16(Ch, Dh, c475); + const int32x4_t CDl = vmlal_s16(Cl, Dl, c475); + const int32x4_t Bh = vrshrq_n_s32(CDh, 8); + const int32x4_t Bl = vrshrq_n_s32(CDl, 8); + const int16x8_t B = vcombine_s16(vqmovn_s32(Bl), vqmovn_s32(Bh)); + bgrx.val[bPos] = vqmovun_s16(B); + } + { + /* G = (256L * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8 */ + const int16x4_t c48 = vdup_n_s16(48); + const int16x4_t c120 = vdup_n_s16(120); + const int32x4_t CDh = vmlsl_s16(Ch, Dh, c48); + const int32x4_t CDl = vmlsl_s16(Cl, Dl, c48); + const int32x4_t CDEh = vmlsl_s16(CDh, Eh, c120); + const int32x4_t CDEl = vmlsl_s16(CDl, El, c120); + const int32x4_t Gh = vrshrq_n_s32(CDEh, 8); + const int32x4_t Gl = vrshrq_n_s32(CDEl, 8); + const int16x8_t G = vcombine_s16(vqmovn_s32(Gl), vqmovn_s32(Gh)); + bgrx.val[gPos] = vqmovun_s16(G); + } + { + /* R = (256 * Y + 403 * (V - 128)) >> 8 */ + const int16x4_t c403 = vdup_n_s16(403); + const int32x4_t CEh = vmlal_s16(Ch, Eh, c403); + const int32x4_t CEl = vmlal_s16(Cl, El, c403); + const int32x4_t Rh = vrshrq_n_s32(CEh, 8); + const int32x4_t Rl = vrshrq_n_s32(CEl, 8); + const int16x8_t R = vcombine_s16(vqmovn_s32(Rl), vqmovn_s32(Rh)); + bgrx.val[rPos] = vqmovun_s16(R); + } + { + /* A */ + bgrx.val[aPos] = vdup_n_u8(0xFF); + } + vst4_u8(pRGB, bgrx); + pRGB += 32; + return pRGB; +} + +static INLINE pstatus_t neon_YUV420ToX( + const BYTE* pSrc[3], const UINT32 srcStep[3], + BYTE* pDst, UINT32 dstStep, + const prim_size_t* roi, const uint8_t rPos, const uint8_t gPos, + const uint8_t bPos, const uint8_t aPos) +{ + UINT32 y; + const UINT32 nWidth = roi->width; + const UINT32 nHeight = roi->height; + const DWORD pad = nWidth % 16; + const UINT32 yPad = srcStep[0] - roi->width; + const UINT32 uPad = srcStep[1] - roi->width / 2; + const UINT32 vPad = srcStep[2] - roi->width / 2; + const UINT32 dPad = dstStep - roi->width * 4; + const int16x8_t c128 = vdupq_n_s16(128); + + for (y = 0; y < nHeight; y += 2) + { + const uint8_t* pY1 = pSrc[0] + y * srcStep[0]; + const uint8_t* pY2 = pY1 + srcStep[0]; + const uint8_t* pU = pSrc[1] + (y / 2) * srcStep[1]; + const uint8_t* pV = pSrc[2] + (y / 2) * srcStep[2]; + uint8_t* pRGB1 = pDst + y * dstStep; + uint8_t* pRGB2 = pRGB1 + dstStep; + UINT32 x; + const BOOL lastY = y >= nHeight - 1; + + for (x = 0; x < nWidth - pad;) + { + const uint8x8_t Uraw = vld1_u8(pU); + const uint8x8x2_t Uu = vzip_u8(Uraw, Uraw); + const int16x8_t U1 = vreinterpretq_s16_u16(vmovl_u8(Uu.val[0])); + const int16x8_t U2 = vreinterpretq_s16_u16(vmovl_u8(Uu.val[1])); + const uint8x8_t Vraw = vld1_u8(pV); + const uint8x8x2_t Vu = vzip_u8(Vraw, Vraw); + const int16x8_t V1 = vreinterpretq_s16_u16(vmovl_u8(Vu.val[0])); + const int16x8_t V2 = vreinterpretq_s16_u16(vmovl_u8(Vu.val[1])); + const int16x8_t D1 = vsubq_s16(U1, c128); + const int16x8_t E1 = vsubq_s16(V1, c128); + const int16x8_t D2 = vsubq_s16(U2, c128); + const int16x8_t E2 = vsubq_s16(V2, c128); + { + const uint8x8_t Y1u = vld1_u8(pY1); + const int16x8_t Y1 = vreinterpretq_s16_u16(vmovl_u8(Y1u)); + pRGB1 = neon_YuvToRgbPixel(pRGB1, Y1, D1, E1, rPos, gPos, bPos, aPos); + pY1 += 8; + x += 8; + } + { + const uint8x8_t Y1u = vld1_u8(pY1); + const int16x8_t Y1 = vreinterpretq_s16_u16(vmovl_u8(Y1u)); + pRGB1 = neon_YuvToRgbPixel(pRGB1, Y1, D2, E2, rPos, gPos, bPos, aPos); + pY1 += 8; + x += 8; + } + + if (!lastY) + { + { + const uint8x8_t Y2u = vld1_u8(pY2); + const int16x8_t Y2 = vreinterpretq_s16_u16(vmovl_u8(Y2u)); + pRGB2 = neon_YuvToRgbPixel(pRGB2, Y2, D1, E1, rPos, gPos, bPos, aPos); + pY2 += 8; + } + { + const uint8x8_t Y2u = vld1_u8(pY2); + const int16x8_t Y2 = vreinterpretq_s16_u16(vmovl_u8(Y2u)); + pRGB2 = neon_YuvToRgbPixel(pRGB2, Y2, D2, E2, rPos, gPos, bPos, aPos); + pY2 += 8; + } + } + + pU += 8; + pV += 8; + } + + for (; x < nWidth; x++) + { + const BYTE U = *pU; + const BYTE V = *pV; + { + const BYTE Y = *pY1++; + const BYTE r = YUV2R(Y, U, V); + const BYTE g = YUV2G(Y, U, V); + const BYTE b = YUV2B(Y, U, V); + pRGB1[aPos] = 0xFF; + pRGB1[rPos] = r; + pRGB1[gPos] = g; + pRGB1[bPos] = b; + pRGB1 += 4; + } + + if (!lastY) + { + const BYTE Y = *pY2++; + const BYTE r = YUV2R(Y, U, V); + const BYTE g = YUV2G(Y, U, V); + const BYTE b = YUV2B(Y, U, V); + pRGB2[aPos] = 0xFF; + pRGB2[rPos] = r; + pRGB2[gPos] = g; + pRGB2[bPos] = b; + pRGB2 += 4; + } + + if (x % 2) + { + pU++; + pV++; + } + } + + pRGB1 += dPad; + pRGB2 += dPad; + pY1 += yPad; + pY2 += yPad; + pU += uPad; + pV += vPad; + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t neon_YUV420ToRGB_8u_P3AC4R( + const BYTE* pSrc[3], const UINT32 srcStep[3], + BYTE* pDst, UINT32 dstStep, UINT32 DstFormat, + const prim_size_t* roi) +{ + switch (DstFormat) + { + case PIXEL_FORMAT_BGRA32: + case PIXEL_FORMAT_BGRX32: + return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3); + + case PIXEL_FORMAT_RGBA32: + case PIXEL_FORMAT_RGBX32: + return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3); + + case PIXEL_FORMAT_ARGB32: + case PIXEL_FORMAT_XRGB32: + return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0); + + case PIXEL_FORMAT_ABGR32: + case PIXEL_FORMAT_XBGR32: + return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0); + + default: + return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); + } +} + +static INLINE pstatus_t neon_YUV444ToX( + const BYTE* pSrc[3], const UINT32 srcStep[3], + BYTE* pDst, UINT32 dstStep, + const prim_size_t* roi, const uint8_t rPos, const uint8_t gPos, + const uint8_t bPos, const uint8_t aPos) +{ + UINT32 y; + const UINT32 nWidth = roi->width; + const UINT32 nHeight = roi->height; + const UINT32 yPad = srcStep[0] - roi->width; + const UINT32 uPad = srcStep[1] - roi->width; + const UINT32 vPad = srcStep[2] - roi->width; + const UINT32 dPad = dstStep - roi->width * 4; + const uint8_t* pY = pSrc[0]; + const uint8_t* pU = pSrc[1]; + const uint8_t* pV = pSrc[2]; + uint8_t* pRGB = pDst; + const int16x8_t c128 = vdupq_n_s16(128); + const DWORD pad = nWidth % 8; + + for (y = 0; y < nHeight; y++) + { + UINT32 x; + + for (x = 0; x < nWidth - pad; x += 8) + { + const uint8x8_t Yu = vld1_u8(pY); + const int16x8_t Y = vreinterpretq_s16_u16(vmovl_u8(Yu)); + const uint8x8_t Uu = vld1_u8(pU); + const int16x8_t U = vreinterpretq_s16_u16(vmovl_u8(Uu)); + const uint8x8_t Vu = vld1_u8(pV); + const int16x8_t V = vreinterpretq_s16_u16(vmovl_u8(Vu)); + /* Do the calculations on Y in 32bit width, the result of 255 * 256 does not fit + * a signed 16 bit value. */ + const int16x8_t D = vsubq_s16(U, c128); + const int16x8_t E = vsubq_s16(V, c128); + pRGB = neon_YuvToRgbPixel(pRGB, Y, D, E, rPos, gPos, bPos, aPos); + pY += 8; + pU += 8; + pV += 8; + } + + for (x = 0; x < pad; x++) + { + const BYTE Y = *pY++; + const BYTE U = *pU++; + const BYTE V = *pV++; + const BYTE r = YUV2R(Y, U, V); + const BYTE g = YUV2G(Y, U, V); + const BYTE b = YUV2B(Y, U, V); + pRGB[aPos] = 0xFF; + pRGB[rPos] = r; + pRGB[gPos] = g; + pRGB[bPos] = b; + pRGB += 4; + } + + pRGB += dPad; + pY += yPad; + pU += uPad; + pV += vPad; + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t neon_YUV444ToRGB_8u_P3AC4R( + const BYTE* pSrc[3], const UINT32 srcStep[3], + BYTE* pDst, UINT32 dstStep, UINT32 DstFormat, + const prim_size_t* roi) +{ + switch (DstFormat) + { + case PIXEL_FORMAT_BGRA32: + case PIXEL_FORMAT_BGRX32: + return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3); + + case PIXEL_FORMAT_RGBA32: + case PIXEL_FORMAT_RGBX32: + return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3); + + case PIXEL_FORMAT_ARGB32: + case PIXEL_FORMAT_XRGB32: + return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0); + + case PIXEL_FORMAT_ABGR32: + case PIXEL_FORMAT_XBGR32: + return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0); + + default: + return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); + } +} + +static pstatus_t neon_LumaToYUV444(const BYTE* pSrcRaw[3], const UINT32 srcStep[3], + BYTE* pDstRaw[3], const UINT32 dstStep[3], + const RECTANGLE_16* roi) +{ + UINT32 x, y; + const UINT32 nWidth = roi->right - roi->left; + const UINT32 nHeight = roi->bottom - roi->top; + const UINT32 halfWidth = (nWidth + 1) / 2; + const UINT32 halfHeight = (nHeight + 1) / 2; + const UINT32 evenY = 0; + const BYTE* pSrc[3] = + { + pSrcRaw[0] + roi->top* srcStep[0] + roi->left, + pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2, + pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 + }; + BYTE* pDst[3] = + { + pDstRaw[0] + roi->top* dstStep[0] + roi->left, + pDstRaw[1] + roi->top* dstStep[1] + roi->left, + pDstRaw[2] + roi->top* dstStep[2] + roi->left + }; + + /* Y data is already here... */ + /* B1 */ + for (y = 0; y < nHeight; y++) + { + const BYTE* Ym = pSrc[0] + srcStep[0] * y; + BYTE* pY = pDst[0] + dstStep[0] * y; + memcpy(pY, Ym, nWidth); + } + + /* The first half of U, V are already here part of this frame. */ + /* B2 and B3 */ + for (y = 0; y < halfHeight; y++) + { + const UINT32 val2y = (2 * y + evenY); + const BYTE* Um = pSrc[1] + srcStep[1] * y; + const BYTE* Vm = pSrc[2] + srcStep[2] * y; + BYTE* pU = pDst[1] + dstStep[1] * val2y; + BYTE* pV = pDst[2] + dstStep[2] * val2y; + BYTE* pU1 = pU + dstStep[1]; + BYTE* pV1 = pV + dstStep[2]; + + for (x = 0; x + 16 < halfWidth; x += 16) + { + { + const uint8x16_t u = vld1q_u8(Um); + uint8x16x2_t u2x; + u2x.val[0] = u; + u2x.val[1] = u; + vst2q_u8(pU, u2x); + vst2q_u8(pU1, u2x); + Um += 16; + pU += 32; + pU1 += 32; + } + { + const uint8x16_t v = vld1q_u8(Vm); + uint8x16x2_t v2x; + v2x.val[0] = v; + v2x.val[1] = v; + vst2q_u8(pV, v2x); + vst2q_u8(pV1, v2x); + Vm += 16; + pV += 32; + pV1 += 32; + } + } + + for (; x < halfWidth; x++) + { + const BYTE u = *Um++; + const BYTE v = *Vm++; + *pU++ = u; + *pU++ = u; + *pU1++ = u; + *pU1++ = u; + *pV++ = v; + *pV++ = v; + *pV1++ = v; + *pV1++ = v; + } + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t neon_ChromaFilter(BYTE* pDst[3], const UINT32 dstStep[3], + const RECTANGLE_16* roi) +{ + const UINT32 oddY = 1; + const UINT32 evenY = 0; + const UINT32 nWidth = roi->right - roi->left; + const UINT32 nHeight = roi->bottom - roi->top; + const UINT32 halfHeight = (nHeight + 1) / 2; + const UINT32 halfWidth = (nWidth + 1) / 2; + const UINT32 halfPad = halfWidth % 16; + UINT32 x, y; + + /* Filter */ + for (y = roi->top; y < halfHeight + roi->top; y++) + { + const UINT32 val2y = (y * 2 + evenY); + const UINT32 val2y1 = val2y + oddY; + BYTE* pU1 = pDst[1] + dstStep[1] * val2y1; + BYTE* pV1 = pDst[2] + dstStep[2] * val2y1; + BYTE* pU = pDst[1] + dstStep[1] * val2y; + BYTE* pV = pDst[2] + dstStep[2] * val2y; + + if (val2y1 > nHeight) + continue; + + for (x = roi->left / 2; x < halfWidth + roi->left / 2 - halfPad; x += 16) + { + { + /* U = (U2x,2y << 2) - U2x1,2y - U2x,2y1 - U2x1,2y1 */ + uint8x8x2_t u = vld2_u8(&pU[2 * x]); + const int16x8_t up = vreinterpretq_s16_u16(vshll_n_u8(u.val[0], 2)); /* Ux2,2y << 2 */ + const uint8x8x2_t u1 = vld2_u8(&pU1[2 * x]); + const uint16x8_t usub = vaddl_u8(u1.val[1], u1.val[0]); /* U2x,2y1 + U2x1,2y1 */ + const int16x8_t us = vreinterpretq_s16_u16(vaddw_u8(usub, + u.val[1])); /* U2x1,2y + U2x,2y1 + U2x1,2y1 */ + const int16x8_t un = vsubq_s16(up, us); + const uint8x8_t u8 = vqmovun_s16(un); /* CLIP(un) */ + u.val[0] = u8; + vst2_u8(&pU[2 * x], u); + } + { + /* V = (V2x,2y << 2) - V2x1,2y - V2x,2y1 - V2x1,2y1 */ + uint8x8x2_t v = vld2_u8(&pV[2 * x]); + const int16x8_t vp = vreinterpretq_s16_u16(vshll_n_u8(v.val[0], 2)); /* Vx2,2y << 2 */ + const uint8x8x2_t v1 = vld2_u8(&pV1[2 * x]); + const uint16x8_t vsub = vaddl_u8(v1.val[1], v1.val[0]); /* V2x,2y1 + V2x1,2y1 */ + const int16x8_t vs = vreinterpretq_s16_u16(vaddw_u8(vsub, + v.val[1])); /* V2x1,2y + V2x,2y1 + V2x1,2y1 */ + const int16x8_t vn = vsubq_s16(vp, vs); + const uint8x8_t v8 = vqmovun_s16(vn); /* CLIP(vn) */ + v.val[0] = v8; + vst2_u8(&pV[2 * x], v); + } + } + + for (; x < halfWidth + roi->left / 2; x++) + { + const UINT32 val2x = (x * 2); + const UINT32 val2x1 = val2x + 1; + const INT32 up = pU[val2x] * 4; + const INT32 vp = pV[val2x] * 4; + INT32 u2020; + INT32 v2020; + + if (val2x1 > nWidth) + continue; + + u2020 = up - pU[val2x1] - pU1[val2x] - pU1[val2x1]; + v2020 = vp - pV[val2x1] - pV1[val2x] - pV1[val2x1]; + pU[val2x] = CLIP(u2020); + pV[val2x] = CLIP(v2020); + } + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t neon_ChromaV1ToYUV444(const BYTE* pSrcRaw[3], const UINT32 srcStep[3], + BYTE* pDstRaw[3], const UINT32 dstStep[3], + const RECTANGLE_16* roi) +{ + const UINT32 mod = 16; + UINT32 uY = 0; + UINT32 vY = 0; + UINT32 x, y; + const UINT32 nWidth = roi->right - roi->left; + const UINT32 nHeight = roi->bottom - roi->top; + const UINT32 halfWidth = (nWidth) / 2; + const UINT32 halfHeight = (nHeight) / 2; + const UINT32 oddY = 1; + const UINT32 evenY = 0; + const UINT32 oddX = 1; + /* The auxilary frame is aligned to multiples of 16x16. + * We need the padded height for B4 and B5 conversion. */ + const UINT32 padHeigth = nHeight + 16 - nHeight % 16; + const UINT32 halfPad = halfWidth % 16; + const BYTE* pSrc[3] = + { + pSrcRaw[0] + roi->top* srcStep[0] + roi->left, + pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2, + pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 + }; + BYTE* pDst[3] = + { + pDstRaw[0] + roi->top* dstStep[0] + roi->left, + pDstRaw[1] + roi->top* dstStep[1] + roi->left, + pDstRaw[2] + roi->top* dstStep[2] + roi->left + }; + + /* The second half of U and V is a bit more tricky... */ + /* B4 and B5 */ + for (y = 0; y < padHeigth; y++) + { + const BYTE* Ya = pSrc[0] + srcStep[0] * y; + BYTE* pX; + + if ((y) % mod < (mod + 1) / 2) + { + const UINT32 pos = (2 * uY++ + oddY); + + if (pos >= nHeight) + continue; + + pX = pDst[1] + dstStep[1] * pos; + } + else + { + const UINT32 pos = (2 * vY++ + oddY); + + if (pos >= nHeight) + continue; + + pX = pDst[2] + dstStep[2] * pos; + } + + memcpy(pX, Ya, nWidth); + } + + /* B6 and B7 */ + for (y = 0; y < halfHeight; y++) + { + const UINT32 val2y = (y * 2 + evenY); + const BYTE* Ua = pSrc[1] + srcStep[1] * y; + const BYTE* Va = pSrc[2] + srcStep[2] * y; + BYTE* pU = pDst[1] + dstStep[1] * val2y; + BYTE* pV = pDst[2] + dstStep[2] * val2y; + + for (x = 0; x < halfWidth - halfPad; x += 16) + { + { + uint8x16x2_t u = vld2q_u8(&pU[2 * x]); + u.val[1] = vld1q_u8(&Ua[x]); + vst2q_u8(&pU[2 * x], u); + } + { + uint8x16x2_t v = vld2q_u8(&pV[2 * x]); + v.val[1] = vld1q_u8(&Va[x]); + vst2q_u8(&pV[2 * x], v); + } + } + + for (; x < halfWidth; x++) + { + const UINT32 val2x1 = (x * 2 + oddX); + pU[val2x1] = Ua[x]; + pV[val2x1] = Va[x]; + } + } + + /* Filter */ + return neon_ChromaFilter(pDst, dstStep, roi); +} + +static pstatus_t neon_ChromaV2ToYUV444(const BYTE* pSrc[3], const UINT32 srcStep[3], + UINT32 nTotalWidth, UINT32 nTotalHeight, + BYTE* pDst[3], const UINT32 dstStep[3], + const RECTANGLE_16* roi) +{ + UINT32 x, y; + const UINT32 nWidth = roi->right - roi->left; + const UINT32 nHeight = roi->bottom - roi->top; + const UINT32 halfWidth = (nWidth + 1) / 2; + const UINT32 halfPad = halfWidth % 16; + const UINT32 halfHeight = (nHeight + 1) / 2; + const UINT32 quaterWidth = (nWidth + 3) / 4; + const UINT32 quaterPad = quaterWidth % 16; + + /* B4 and B5: odd UV values for width/2, height */ + for (y = 0; y < nHeight; y++) + { + const UINT32 yTop = y + roi->top; + const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2; + const BYTE* pYaV = pYaU + nTotalWidth / 2; + BYTE* pU = pDst[1] + dstStep[1] * yTop + roi->left; + BYTE* pV = pDst[2] + dstStep[2] * yTop + roi->left; + + for (x = 0; x < halfWidth - halfPad; x += 16) + { + { + uint8x16x2_t u = vld2q_u8(&pU[2 * x]); + u.val[1] = vld1q_u8(&pYaU[x]); + vst2q_u8(&pU[2 * x], u); + } + { + uint8x16x2_t v = vld2q_u8(&pV[2 * x]); + v.val[1] = vld1q_u8(&pYaV[x]); + vst2q_u8(&pV[2 * x], v); + } + } + + for (; x < halfWidth; x++) + { + const UINT32 odd = 2 * x + 1; + pU[odd] = pYaU[x]; + pV[odd] = pYaV[x]; + } + } + + /* B6 - B9 */ + for (y = 0; y < halfHeight; y++) + { + const BYTE* pUaU = pSrc[1] + srcStep[1] * (y + roi->top / 2) + roi->left / 4; + const BYTE* pUaV = pUaU + nTotalWidth / 4; + const BYTE* pVaU = pSrc[2] + srcStep[2] * (y + roi->top / 2) + roi->left / 4; + const BYTE* pVaV = pVaU + nTotalWidth / 4; + BYTE* pU = pDst[1] + dstStep[1] * (2 * y + 1 + roi->top) + roi->left; + BYTE* pV = pDst[2] + dstStep[2] * (2 * y + 1 + roi->top) + roi->left; + + for (x = 0; x < quaterWidth - quaterPad; x += 16) + { + { + uint8x16x4_t u = vld4q_u8(&pU[4 * x]); + u.val[0] = vld1q_u8(&pUaU[x]); + u.val[2] = vld1q_u8(&pVaU[x]); + vst4q_u8(&pU[4 * x], u); + } + { + uint8x16x4_t v = vld4q_u8(&pV[4 * x]); + v.val[0] = vld1q_u8(&pUaV[x]); + v.val[2] = vld1q_u8(&pVaV[x]); + vst4q_u8(&pV[4 * x], v); + } + } + + for (; x < quaterWidth; x++) + { + pU[4 * x + 0] = pUaU[x]; + pV[4 * x + 0] = pUaV[x]; + pU[4 * x + 2] = pVaU[x]; + pV[4 * x + 2] = pVaV[x]; + } + } + + return neon_ChromaFilter(pDst, dstStep, roi); +} + +static pstatus_t neon_YUV420CombineToYUV444( + avc444_frame_type type, + const BYTE* pSrc[3], const UINT32 srcStep[3], + UINT32 nWidth, UINT32 nHeight, + BYTE* pDst[3], const UINT32 dstStep[3], + const RECTANGLE_16* roi) +{ + if (!pSrc || !pSrc[0] || !pSrc[1] || !pSrc[2]) + return -1; + + if (!pDst || !pDst[0] || !pDst[1] || !pDst[2]) + return -1; + + if (!roi) + return -1; + + switch (type) + { + case AVC444_LUMA: + return neon_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi); + + case AVC444_CHROMAv1: + return neon_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi); + + case AVC444_CHROMAv2: + return neon_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi); + + default: + return -1; + } +} + +void primitives_init_YUV_opt(primitives_t* prims) +{ + generic = primitives_get_generic(); + primitives_init_YUV(prims); + + if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE)) + { + prims->YUV420ToRGB_8u_P3AC4R = neon_YUV420ToRGB_8u_P3AC4R; + prims->YUV444ToRGB_8u_P3AC4R = neon_YUV444ToRGB_8u_P3AC4R; + prims->YUV420CombineToYUV444 = neon_YUV420CombineToYUV444; + } +} diff --git a/libfreerdp/primitives/prim_YUV_opt.c b/libfreerdp/primitives/prim_YUV_opt.c deleted file mode 100644 index 8229ae1..0000000 --- a/libfreerdp/primitives/prim_YUV_opt.c +++ /dev/null @@ -1,1828 +0,0 @@ -/** - * FreeRDP: A Remote Desktop Protocol Implementation - * Optimized YUV/RGB conversion operations - * - * Copyright 2014 Thomas Erbesdobler - * Copyright 2016-2017 Armin Novak - * Copyright 2016-2017 Norbert Federa - * Copyright 2016-2017 Thincast Technologies GmbH - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#include -#include -#include -#include - -#include "prim_internal.h" - -#ifdef WITH_SSE2 - -#include -#include -#elif defined(WITH_NEON) -#include -#endif /* WITH_SSE2 else WITH_NEON */ - -static primitives_t* generic = NULL; - -#ifdef WITH_SSE2 -/****************************************************************************/ -/* SSSE3 YUV420 -> RGB conversion */ -/****************************************************************************/ -static __m128i* ssse3_YUV444Pixel(__m128i* dst, __m128i Yraw, __m128i Uraw, __m128i Vraw, UINT8 pos) -{ - /* Visual Studio 2010 doesn't like _mm_set_epi32 in array initializer list */ - /* Note: This also applies to Visual Studio 2013 before Update 4 */ -#if !defined(_MSC_VER) || (_MSC_VER > 1600) - const __m128i mapY[] = - { - _mm_set_epi32(0x80800380, 0x80800280, 0x80800180, 0x80800080), - _mm_set_epi32(0x80800780, 0x80800680, 0x80800580, 0x80800480), - _mm_set_epi32(0x80800B80, 0x80800A80, 0x80800980, 0x80800880), - _mm_set_epi32(0x80800F80, 0x80800E80, 0x80800D80, 0x80800C80) - }; - const __m128i mapUV[] = - { - _mm_set_epi32(0x80038002, 0x80018000, 0x80808080, 0x80808080), - _mm_set_epi32(0x80078006, 0x80058004, 0x80808080, 0x80808080), - _mm_set_epi32(0x800B800A, 0x80098008, 0x80808080, 0x80808080), - _mm_set_epi32(0x800F800E, 0x800D800C, 0x80808080, 0x80808080) - }; - const __m128i mask[] = - { - _mm_set_epi32(0x80038080, 0x80028080, 0x80018080, 0x80008080), - _mm_set_epi32(0x80800380, 0x80800280, 0x80800180, 0x80800080), - _mm_set_epi32(0x80808003, 0x80808002, 0x80808001, 0x80808000) - }; -#else - /* Note: must be in little-endian format ! */ - const __m128i mapY[] = - { - { 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80, 0x80, 0x80, 0x03, 0x80, 0x80 }, - { 0x80, 0x04, 0x80, 0x80, 0x80, 0x05, 0x80, 0x80, 0x80, 0x06, 0x80, 0x80, 0x80, 0x07, 0x80, 0x80 }, - { 0x80, 0x08, 0x80, 0x80, 0x80, 0x09, 0x80, 0x80, 0x80, 0x0a, 0x80, 0x80, 0x80, 0x0b, 0x80, 0x80 }, - { 0x80, 0x0c, 0x80, 0x80, 0x80, 0x0d, 0x80, 0x80, 0x80, 0x0e, 0x80, 0x80, 0x80, 0x0f, 0x80, 0x80 } - - }; - const __m128i mapUV[] = - { - { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x80, 0x01, 0x80, 0x02, 0x80, 0x03, 0x80 }, - { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x04, 0x80, 0x05, 0x80, 0x06, 0x80, 0x07, 0x80 }, - { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x08, 0x80, 0x09, 0x80, 0x0a, 0x80, 0x0b, 0x80 }, - { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0c, 0x80, 0x0d, 0x80, 0x0e, 0x80, 0x0f, 0x80 } - }; - const __m128i mask[] = - { - { 0x80, 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80, 0x80, 0x80, 0x03, 0x80 }, - { 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80, 0x80, 0x80, 0x03, 0x80, 0x80 }, - { 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80, 0x80, 0x80, 0x03, 0x80, 0x80, 0x80 } - }; -#endif - const __m128i c128 = _mm_set1_epi16(128); - __m128i BGRX = _mm_set_epi32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000); - { - __m128i C, D, E; - /* Load Y values and expand to 32 bit */ - { - C = _mm_shuffle_epi8(Yraw, mapY[pos]); /* Reorder and multiply by 256 */ - } - /* Load U values and expand to 32 bit */ - { - const __m128i U = _mm_shuffle_epi8(Uraw, mapUV[pos]); /* Reorder dcba */ - D = _mm_sub_epi16(U, c128); /* D = U - 128 */ - } - /* Load V values and expand to 32 bit */ - { - const __m128i V = _mm_shuffle_epi8(Vraw, mapUV[pos]); /* Reorder dcba */ - E = _mm_sub_epi16(V, c128); /* E = V - 128 */ - } - /* Get the R value */ - { - const __m128i c403 = _mm_set1_epi16(403); - const __m128i e403 = _mm_unpackhi_epi16(_mm_mullo_epi16(E, c403), _mm_mulhi_epi16(E, c403)); - const __m128i Rs = _mm_add_epi32(C, e403); - const __m128i R32 = _mm_srai_epi32(Rs, 8); - const __m128i R16 = _mm_packs_epi32(R32, _mm_setzero_si128()); - const __m128i R = _mm_packus_epi16(R16, _mm_setzero_si128()); - const __m128i packed = _mm_shuffle_epi8(R, mask[0]); - BGRX = _mm_or_si128(BGRX, packed); - } - /* Get the G value */ - { - const __m128i c48 = _mm_set1_epi16(48); - const __m128i d48 = _mm_unpackhi_epi16(_mm_mullo_epi16(D, c48), _mm_mulhi_epi16(D, c48)); - const __m128i c120 = _mm_set1_epi16(120); - const __m128i e120 = _mm_unpackhi_epi16(_mm_mullo_epi16(E, c120), _mm_mulhi_epi16(E, c120)); - const __m128i de = _mm_add_epi32(d48, e120); - const __m128i Gs = _mm_sub_epi32(C, de); - const __m128i G32 = _mm_srai_epi32(Gs, 8); - const __m128i G16 = _mm_packs_epi32(G32, _mm_setzero_si128()); - const __m128i G = _mm_packus_epi16(G16, _mm_setzero_si128()); - const __m128i packed = _mm_shuffle_epi8(G, mask[1]); - BGRX = _mm_or_si128(BGRX, packed); - } - /* Get the B value */ - { - const __m128i c475 = _mm_set1_epi16(475); - const __m128i d475 = _mm_unpackhi_epi16(_mm_mullo_epi16(D, c475), _mm_mulhi_epi16(D, c475)); - const __m128i Bs = _mm_add_epi32(C, d475); - const __m128i B32 = _mm_srai_epi32(Bs, 8); - const __m128i B16 = _mm_packs_epi32(B32, _mm_setzero_si128()); - const __m128i B = _mm_packus_epi16(B16, _mm_setzero_si128()); - const __m128i packed = _mm_shuffle_epi8(B, mask[2]); - BGRX = _mm_or_si128(BGRX, packed); - } - } - _mm_storeu_si128(dst++, BGRX); - return dst; -} - -static pstatus_t ssse3_YUV420ToRGB_BGRX( - const BYTE** pSrc, const UINT32* srcStep, - BYTE* pDst, UINT32 dstStep, - const prim_size_t* roi) -{ - const UINT32 nWidth = roi->width; - const UINT32 nHeight = roi->height; - const UINT32 pad = roi->width % 16; - const __m128i duplicate = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); - UINT32 y; - - for (y = 0; y < nHeight; y++) - { - UINT32 x; - __m128i* dst = (__m128i*)(pDst + dstStep * y); - const BYTE* YData = pSrc[0] + y * srcStep[0]; - const BYTE* UData = pSrc[1] + (y / 2) * srcStep[1]; - const BYTE* VData = pSrc[2] + (y / 2) * srcStep[2]; - - for (x = 0; x < nWidth - pad; x += 16) - { - const __m128i Y = _mm_loadu_si128((__m128i*)YData); - const __m128i uRaw = _mm_loadu_si128((__m128i*)UData); - const __m128i vRaw = _mm_loadu_si128((__m128i*)VData); - const __m128i U = _mm_shuffle_epi8(uRaw, duplicate); - const __m128i V = _mm_shuffle_epi8(vRaw, duplicate); - YData += 16; - UData += 8; - VData += 8; - dst = ssse3_YUV444Pixel(dst, Y, U, V, 0); - dst = ssse3_YUV444Pixel(dst, Y, U, V, 1); - dst = ssse3_YUV444Pixel(dst, Y, U, V, 2); - dst = ssse3_YUV444Pixel(dst, Y, U, V, 3); - } - - for (x = 0; x < pad; x++) - { - const BYTE Y = *YData++; - const BYTE U = *UData; - const BYTE V = *VData; - const BYTE r = YUV2R(Y, U, V); - const BYTE g = YUV2G(Y, U, V); - const BYTE b = YUV2B(Y, U, V); - dst = (__m128i*)writePixelBGRX((BYTE*)dst, 4, PIXEL_FORMAT_BGRX32, r, g, b, 0xFF); - - if (x % 2) - { - UData++; - VData++; - } - } - } - - return PRIMITIVES_SUCCESS; -} - -static pstatus_t ssse3_YUV420ToRGB( - const BYTE** pSrc, const UINT32* srcStep, - BYTE* pDst, UINT32 dstStep, UINT32 DstFormat, - const prim_size_t* roi) -{ - switch (DstFormat) - { - case PIXEL_FORMAT_BGRX32: - case PIXEL_FORMAT_BGRA32: - return ssse3_YUV420ToRGB_BGRX(pSrc, srcStep, pDst, dstStep, roi); - - default: - return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); - } -} - -static pstatus_t ssse3_YUV444ToRGB_8u_P3AC4R_BGRX( - const BYTE** pSrc, const UINT32* srcStep, - BYTE* pDst, UINT32 dstStep, - const prim_size_t* roi) -{ - const UINT32 nWidth = roi->width; - const UINT32 nHeight = roi->height; - const UINT32 pad = roi->width % 16; - UINT32 y; - - for (y = 0; y < nHeight; y++) - { - UINT32 x; - __m128i* dst = (__m128i*)(pDst + dstStep * y); - const BYTE* YData = pSrc[0] + y * srcStep[0]; - const BYTE* UData = pSrc[1] + y * srcStep[1]; - const BYTE* VData = pSrc[2] + y * srcStep[2]; - - for (x = 0; x < nWidth - pad; x += 16) - { - __m128i Y = _mm_load_si128((__m128i*)YData); - __m128i U = _mm_load_si128((__m128i*)UData); - __m128i V = _mm_load_si128((__m128i*)VData); - YData += 16; - UData += 16; - VData += 16; - dst = ssse3_YUV444Pixel(dst, Y, U, V, 0); - dst = ssse3_YUV444Pixel(dst, Y, U, V, 1); - dst = ssse3_YUV444Pixel(dst, Y, U, V, 2); - dst = ssse3_YUV444Pixel(dst, Y, U, V, 3); - } - - for (x = 0; x < pad; x++) - { - const BYTE Y = *YData++; - const BYTE U = *UData++; - const BYTE V = *VData++; - const BYTE r = YUV2R(Y, U, V); - const BYTE g = YUV2G(Y, U, V); - const BYTE b = YUV2B(Y, U, V); - dst = (__m128i*)writePixelBGRX((BYTE*)dst, 4, PIXEL_FORMAT_BGRX32, r, g, b, 0xFF); - } - } - - return PRIMITIVES_SUCCESS; -} - -static pstatus_t ssse3_YUV444ToRGB_8u_P3AC4R(const BYTE** pSrc, const UINT32* srcStep, - BYTE* pDst, UINT32 dstStep, UINT32 DstFormat, - const prim_size_t* roi) -{ - if ((unsigned long)pSrc[0] % 16 || (unsigned long)pSrc[1] % 16 || (unsigned long)pSrc[2] % 16 || - srcStep[0] % 16 || srcStep[1] % 16 || srcStep[2] % 16) - return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); - - switch (DstFormat) - { - case PIXEL_FORMAT_BGRX32: - case PIXEL_FORMAT_BGRA32: - return ssse3_YUV444ToRGB_8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi); - - default: - return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); - } -} - -/****************************************************************************/ -/* SSSE3 RGB -> YUV420 conversion **/ -/****************************************************************************/ - - -/** - * Note (nfedera): - * The used forward transformation factors from RGB to YUV are based on the - * values specified in [Rec. ITU-R BT.709-6] Section 3: - * http://www.itu.int/rec/R-REC-BT.709-6-201506-I/en - * - * Y = 0.21260 * R + 0.71520 * G + 0.07220 * B + 0; - * U = -0.11457 * R - 0.38543 * G + 0.50000 * B + 128; - * V = 0.50000 * R - 0.45415 * G - 0.04585 * B + 128; - * - * The most accurate integer arithmetic approximation when using 8-bit signed - * integer factors with 16-bit signed integer intermediate results is: - * - * Y = ( ( 27 * R + 92 * G + 9 * B) >> 7 ); - * U = ( (-15 * R - 49 * G + 64 * B) >> 7 ) + 128; - * V = ( ( 64 * R - 58 * G - 6 * B) >> 7 ) + 128; - * - */ - -PRIM_ALIGN_128 static const BYTE bgrx_y_factors[] = -{ - 9, 92, 27, 0, 9, 92, 27, 0, 9, 92, 27, 0, 9, 92, 27, 0 -}; -PRIM_ALIGN_128 static const BYTE bgrx_u_factors[] = -{ - 64, -49, -15, 0, 64, -49, -15, 0, 64, -49, -15, 0, 64, -49, -15, 0 -}; -PRIM_ALIGN_128 static const BYTE bgrx_v_factors[] = -{ - -6, -58, 64, 0, -6, -58, 64, 0, -6, -58, 64, 0, -6, -58, 64, 0 -}; - -PRIM_ALIGN_128 static const BYTE const_buf_128b[] = -{ - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 -}; - -/* -TODO: -RGB[AX] can simply be supported using the following factors. And instead of loading the -globals directly the functions below could be passed pointers to the correct vectors -depending on the source picture format. - -PRIM_ALIGN_128 static const BYTE rgbx_y_factors[] = { - 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0 -}; -PRIM_ALIGN_128 static const BYTE rgbx_u_factors[] = { - -15, -49, 64, 0, -15, -49, 64, 0, -15, -49, 64, 0, -15, -49, 64, 0 -}; -PRIM_ALIGN_128 static const BYTE rgbx_v_factors[] = { - 64, -58, -6, 0, 64, -58, -6, 0, 64, -58, -6, 0, 64, -58, -6, 0 -}; -*/ - - -/* compute the luma (Y) component from a single rgb source line */ - -static INLINE void ssse3_RGBToYUV420_BGRX_Y( - const BYTE* src, BYTE* dst, UINT32 width) -{ - UINT32 x; - __m128i y_factors, x0, x1, x2, x3; - const __m128i* argb = (const __m128i*) src; - __m128i* ydst = (__m128i*) dst; - y_factors = _mm_load_si128((__m128i*)bgrx_y_factors); - - for (x = 0; x < width; x += 16) - { - /* store 16 rgba pixels in 4 128 bit registers */ - x0 = _mm_load_si128(argb++); // 1st 4 pixels - x1 = _mm_load_si128(argb++); // 2nd 4 pixels - x2 = _mm_load_si128(argb++); // 3rd 4 pixels - x3 = _mm_load_si128(argb++); // 4th 4 pixels - /* multiplications and subtotals */ - x0 = _mm_maddubs_epi16(x0, y_factors); - x1 = _mm_maddubs_epi16(x1, y_factors); - x2 = _mm_maddubs_epi16(x2, y_factors); - x3 = _mm_maddubs_epi16(x3, y_factors); - /* the total sums */ - x0 = _mm_hadd_epi16(x0, x1); - x2 = _mm_hadd_epi16(x2, x3); - /* shift the results */ - x0 = _mm_srli_epi16(x0, 7); - x2 = _mm_srli_epi16(x2, 7); - /* pack the 16 words into bytes */ - x0 = _mm_packus_epi16(x0, x2); - /* save to y plane */ - _mm_storeu_si128(ydst++, x0); - } -} - -/* compute the chrominance (UV) components from two rgb source lines */ - -static INLINE void ssse3_RGBToYUV420_BGRX_UV( - const BYTE* src1, const BYTE* src2, - BYTE* dst1, BYTE* dst2, UINT32 width) -{ - UINT32 x; - __m128i vector128, u_factors, v_factors, x0, x1, x2, x3, x4, x5; - const __m128i* rgb1 = (const __m128i*)src1; - const __m128i* rgb2 = (const __m128i*)src2; - __m64* udst = (__m64*)dst1; - __m64* vdst = (__m64*)dst2; - vector128 = _mm_load_si128((__m128i*)const_buf_128b); - u_factors = _mm_load_si128((__m128i*)bgrx_u_factors); - v_factors = _mm_load_si128((__m128i*)bgrx_v_factors); - - for (x = 0; x < width; x += 16) - { - /* subsample 16x2 pixels into 16x1 pixels */ - x0 = _mm_load_si128(rgb1++); - x4 = _mm_load_si128(rgb2++); - x0 = _mm_avg_epu8(x0, x4); - x1 = _mm_load_si128(rgb1++); - x4 = _mm_load_si128(rgb2++); - x1 = _mm_avg_epu8(x1, x4); - x2 = _mm_load_si128(rgb1++); - x4 = _mm_load_si128(rgb2++); - x2 = _mm_avg_epu8(x2, x4); - x3 = _mm_load_si128(rgb1++); - x4 = _mm_load_si128(rgb2++); - x3 = _mm_avg_epu8(x3, x4); - /* subsample these 16x1 pixels into 8x1 pixels */ - /** - * shuffle controls - * c = a[0],a[2],b[0],b[2] == 10 00 10 00 = 0x88 - * c = a[1],a[3],b[1],b[3] == 11 01 11 01 = 0xdd - */ - x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0x88)); - x0 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0xdd)); - x0 = _mm_avg_epu8(x0, x4); - x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0x88)); - x1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0xdd)); - x1 = _mm_avg_epu8(x1, x4); - /* multiplications and subtotals */ - x2 = _mm_maddubs_epi16(x0, u_factors); - x3 = _mm_maddubs_epi16(x1, u_factors); - x4 = _mm_maddubs_epi16(x0, v_factors); - x5 = _mm_maddubs_epi16(x1, v_factors); - /* the total sums */ - x0 = _mm_hadd_epi16(x2, x3); - x1 = _mm_hadd_epi16(x4, x5); - /* shift the results */ - x0 = _mm_srai_epi16(x0, 7); - x1 = _mm_srai_epi16(x1, 7); - /* pack the 16 words into bytes */ - x0 = _mm_packs_epi16(x0, x1); - /* add 128 */ - x0 = _mm_add_epi8(x0, vector128); - /* the lower 8 bytes go to the u plane */ - _mm_storel_pi(udst++, _mm_castsi128_ps(x0)); - /* the upper 8 bytes go to the v plane */ - _mm_storeh_pi(vdst++, _mm_castsi128_ps(x0)); - } -} - -static pstatus_t ssse3_RGBToYUV420_BGRX( - const BYTE* pSrc, UINT32 srcFormat, UINT32 srcStep, - BYTE* pDst[3], UINT32 dstStep[3], - const prim_size_t* roi) -{ - UINT32 y; - const BYTE* argb = pSrc; - BYTE* ydst = pDst[0]; - BYTE* udst = pDst[1]; - BYTE* vdst = pDst[2]; - - if (roi->height < 1 || roi->width < 1) - { - return !PRIMITIVES_SUCCESS; - } - - if (roi->width % 16 || (unsigned long)pSrc % 16 || srcStep % 16) - { - return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi); - } - - for (y = 0; y < roi->height - 1; y += 2) - { - const BYTE* line1 = argb; - const BYTE* line2 = argb + srcStep; - ssse3_RGBToYUV420_BGRX_UV(line1, line2, udst, vdst, roi->width); - ssse3_RGBToYUV420_BGRX_Y(line1, ydst, roi->width); - ssse3_RGBToYUV420_BGRX_Y(line2, ydst + dstStep[0], roi->width); - argb += 2 * srcStep; - ydst += 2 * dstStep[0]; - udst += 1 * dstStep[1]; - vdst += 1 * dstStep[2]; - } - - if (roi->height & 1) - { - /* pass the same last line of an odd height twice for UV */ - ssse3_RGBToYUV420_BGRX_UV(argb, argb, udst, vdst, roi->width); - ssse3_RGBToYUV420_BGRX_Y(argb, ydst, roi->width); - } - - return PRIMITIVES_SUCCESS; -} - -static pstatus_t ssse3_RGBToYUV420( - const BYTE* pSrc, UINT32 srcFormat, UINT32 srcStep, - BYTE* pDst[3], UINT32 dstStep[3], - const prim_size_t* roi) -{ - switch (srcFormat) - { - case PIXEL_FORMAT_BGRX32: - case PIXEL_FORMAT_BGRA32: - return ssse3_RGBToYUV420_BGRX(pSrc, srcFormat, srcStep, pDst, dstStep, roi); - - default: - return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi); - } -} - - -/****************************************************************************/ -/* SSSE3 RGB -> AVC444-YUV conversion **/ -/****************************************************************************/ - -static INLINE void ssse3_RGBToAVC444YUV_BGRX_ROW( - const BYTE* src, BYTE* ydst, BYTE* udst1, BYTE* udst2, BYTE* vdst1, BYTE* vdst2, BOOL isEvenRow, - UINT32 width) -{ - UINT32 x; - __m128i vector128, y_factors, u_factors, v_factors, smask; - __m128i x1, x2, x3, x4, y, y1, y2, u, u1, u2, v, v1, v2; - const __m128i* argb = (const __m128i*) src; - __m128i* py = (__m128i*) ydst; - __m64* pu1 = (__m64*) udst1; - __m64* pu2 = (__m64*) udst2; - __m64* pv1 = (__m64*) vdst1; - __m64* pv2 = (__m64*) vdst2; - y_factors = _mm_load_si128((__m128i*)bgrx_y_factors); - u_factors = _mm_load_si128((__m128i*)bgrx_u_factors); - v_factors = _mm_load_si128((__m128i*)bgrx_v_factors); - vector128 = _mm_load_si128((__m128i*)const_buf_128b); - smask = _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0); - - for (x = 0; x < width; x += 16) - { - /* store 16 rgba pixels in 4 128 bit registers */ - x1 = _mm_load_si128(argb++); // 1st 4 pixels - x2 = _mm_load_si128(argb++); // 2nd 4 pixels - x3 = _mm_load_si128(argb++); // 3rd 4 pixels - x4 = _mm_load_si128(argb++); // 4th 4 pixels - /* Y: multiplications with subtotals and horizontal sums */ - y1 = _mm_hadd_epi16(_mm_maddubs_epi16(x1, y_factors), _mm_maddubs_epi16(x2, y_factors)); - y2 = _mm_hadd_epi16(_mm_maddubs_epi16(x3, y_factors), _mm_maddubs_epi16(x4, y_factors)); - /* Y: shift the results (logical) */ - y1 = _mm_srli_epi16(y1, 7); - y2 = _mm_srli_epi16(y2, 7); - /* Y: pack (unsigned) 16 words into bytes */ - y = _mm_packus_epi16(y1, y2); - /* U: multiplications with subtotals and horizontal sums */ - u1 = _mm_hadd_epi16(_mm_maddubs_epi16(x1, u_factors), _mm_maddubs_epi16(x2, u_factors)); - u2 = _mm_hadd_epi16(_mm_maddubs_epi16(x3, u_factors), _mm_maddubs_epi16(x4, u_factors)); - /* U: shift the results (arithmetic) */ - u1 = _mm_srai_epi16(u1, 7); - u2 = _mm_srai_epi16(u2, 7); - /* U: pack (signed) 16 words into bytes */ - u = _mm_packs_epi16(u1, u2); - /* U: add 128 */ - u = _mm_add_epi8(u, vector128); - /* V: multiplications with subtotals and horizontal sums */ - v1 = _mm_hadd_epi16(_mm_maddubs_epi16(x1, v_factors), _mm_maddubs_epi16(x2, v_factors)); - v2 = _mm_hadd_epi16(_mm_maddubs_epi16(x3, v_factors), _mm_maddubs_epi16(x4, v_factors)); - /* V: shift the results (arithmetic) */ - v1 = _mm_srai_epi16(v1, 7); - v2 = _mm_srai_epi16(v2, 7); - /* V: pack (signed) 16 words into bytes */ - v = _mm_packs_epi16(v1, v2); - /* V: add 128 */ - v = _mm_add_epi8(v, vector128); - /* store y */ - _mm_storeu_si128(py++, y); - - /* store u and v */ - if (isEvenRow) - { - u = _mm_shuffle_epi8(u, smask); - v = _mm_shuffle_epi8(v, smask); - _mm_storel_pi(pu1++, _mm_castsi128_ps(u)); - _mm_storeh_pi(pu2++, _mm_castsi128_ps(u)); - _mm_storel_pi(pv1++, _mm_castsi128_ps(v)); - _mm_storeh_pi(pv2++, _mm_castsi128_ps(v)); - } - else - { - _mm_storel_pi(pu1, _mm_castsi128_ps(u)); - _mm_storeh_pi(pu2, _mm_castsi128_ps(u)); - _mm_storel_pi(pv1, _mm_castsi128_ps(v)); - _mm_storeh_pi(pv2, _mm_castsi128_ps(v)); - pu1 += 2; - pu2 += 2; - pv1 += 2; - pv2 += 2; - } - } -} - - -static pstatus_t ssse3_RGBToAVC444YUV_BGRX( - const BYTE* pSrc, UINT32 srcFormat, UINT32 srcStep, - BYTE* pDst1[3], const UINT32 dst1Step[3], - BYTE* pDst2[3], const UINT32 dst2Step[3], - const prim_size_t* roi) -{ - UINT32 y, numRows; - BOOL evenRow = TRUE; - BYTE* b1, *b2, *b3, *b4, *b5, *b6, *b7; - const BYTE* pMaxSrc = pSrc + (roi->height - 1) * srcStep; - - if (roi->height < 1 || roi->width < 1) - { - return !PRIMITIVES_SUCCESS; - } - - if (roi->width % 16 || (unsigned long)pSrc % 16 || srcStep % 16) - { - return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step, roi); - } - - numRows = (roi->height + 1) & ~1; - - for (y = 0; y < numRows; y++, evenRow = !evenRow) - { - const BYTE* src = y < roi->height ? pSrc + y * srcStep : pMaxSrc; - UINT32 i = y >> 1; - b1 = pDst1[0] + y * dst1Step[0]; - - if (evenRow) - { - b2 = pDst1[1] + i * dst1Step[1]; - b3 = pDst1[2] + i * dst1Step[2]; - b6 = pDst2[1] + i * dst2Step[1]; - b7 = pDst2[2] + i * dst2Step[2]; - ssse3_RGBToAVC444YUV_BGRX_ROW(src, b1, b2, b6, b3, b7, TRUE, roi->width); - } - else - { - b4 = pDst2[0] + dst2Step[0] * ((i & ~7) + i); - b5 = b4 + 8 * dst2Step[0]; - ssse3_RGBToAVC444YUV_BGRX_ROW(src, b1, b4, b4 + 8, b5, b5 + 8, FALSE, roi->width); - } - } - - return PRIMITIVES_SUCCESS; -} - - -static pstatus_t ssse3_RGBToAVC444YUV( - const BYTE* pSrc, UINT32 srcFormat, UINT32 srcStep, - BYTE* pDst1[3], const UINT32 dst1Step[3], - BYTE* pDst2[3], const UINT32 dst2Step[3], - const prim_size_t* roi) -{ - switch (srcFormat) - { - case PIXEL_FORMAT_BGRX32: - case PIXEL_FORMAT_BGRA32: - return ssse3_RGBToAVC444YUV_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step, roi); - - default: - return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step, roi); - } -} - -static pstatus_t ssse3_LumaToYUV444(const BYTE* pSrcRaw[3], const UINT32 srcStep[3], - BYTE* pDstRaw[3], const UINT32 dstStep[3], - const RECTANGLE_16* roi) -{ - UINT32 x, y; - const UINT32 nWidth = roi->right - roi->left; - const UINT32 nHeight = roi->bottom - roi->top; - const UINT32 halfWidth = (nWidth + 1) / 2; - const UINT32 halfPad = halfWidth % 16; - const UINT32 halfHeight = (nHeight + 1) / 2; - const UINT32 oddY = 1; - const UINT32 evenY = 0; - const UINT32 oddX = 1; - const UINT32 evenX = 0; - const BYTE* pSrc[3] = - { - pSrcRaw[0] + roi->top* srcStep[0] + roi->left, - pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2, - pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 - }; - BYTE* pDst[3] = - { - pDstRaw[0] + roi->top* dstStep[0] + roi->left, - pDstRaw[1] + roi->top* dstStep[1] + roi->left, - pDstRaw[2] + roi->top* dstStep[2] + roi->left - }; - - /* Y data is already here... */ - /* B1 */ - for (y = 0; y < nHeight; y++) - { - const BYTE* Ym = pSrc[0] + srcStep[0] * y; - BYTE* pY = pDst[0] + dstStep[0] * y; - memcpy(pY, Ym, nWidth); - } - - /* The first half of U, V are already here part of this frame. */ - /* B2 and B3 */ - for (y = 0; y < halfHeight; y++) - { - const UINT32 val2y = (2 * y + evenY); - const UINT32 val2y1 = val2y + oddY; - const BYTE* Um = pSrc[1] + srcStep[1] * y; - const BYTE* Vm = pSrc[2] + srcStep[2] * y; - BYTE* pU = pDst[1] + dstStep[1] * val2y; - BYTE* pV = pDst[2] + dstStep[2] * val2y; - BYTE* pU1 = pDst[1] + dstStep[1] * val2y1; - BYTE* pV1 = pDst[2] + dstStep[2] * val2y1; - - for (x = 0; x < halfWidth - halfPad; x += 16) - { - const __m128i unpackHigh = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); - const __m128i unpackLow = _mm_set_epi8(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8); - { - const __m128i u = _mm_loadu_si128((__m128i*)&Um[x]); - const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh); - const __m128i uLow = _mm_shuffle_epi8(u, unpackLow); - _mm_storeu_si128((__m128i*)&pU[2 * x], uHigh); - _mm_storeu_si128((__m128i*)&pU[2 * x + 16], uLow); - _mm_storeu_si128((__m128i*)&pU1[2 * x], uHigh); - _mm_storeu_si128((__m128i*)&pU1[2 * x + 16], uLow); - } - { - const __m128i u = _mm_loadu_si128((__m128i*)&Vm[x]); - const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh); - const __m128i uLow = _mm_shuffle_epi8(u, unpackLow); - _mm_storeu_si128((__m128i*)&pV[2 * x], uHigh); - _mm_storeu_si128((__m128i*)&pV[2 * x + 16], uLow); - _mm_storeu_si128((__m128i*)&pV1[2 * x], uHigh); - _mm_storeu_si128((__m128i*)&pV1[2 * x + 16], uLow); - } - } - - for (; x < halfWidth; x++) - { - const UINT32 val2x = 2 * x + evenX; - const UINT32 val2x1 = val2x + oddX; - pU[val2x] = Um[x]; - pV[val2x] = Vm[x]; - pU[val2x1] = Um[x]; - pV[val2x1] = Vm[x]; - pU1[val2x] = Um[x]; - pV1[val2x] = Vm[x]; - pU1[val2x1] = Um[x]; - pV1[val2x1] = Vm[x]; - } - } - - return PRIMITIVES_SUCCESS; -} - -static INLINE void ssse3_filter(BYTE* pSrcDst, const BYTE* pSrc2) -{ - const __m128i even = _mm_set_epi8(0x80, 14, 0x80, 12, 0x80, 10, 0x80, 8, 0x80, 6, 0x80, 4, 0x80, 2, - 0x80, 0); - const __m128i odd = _mm_set_epi8(0x80, 15, 0x80, 13, 0x80, 11, 0x80, 9, 0x80, 7, 0x80, 5, 0x80, 3, - 0x80, 1); - const __m128i interleave = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0); - const __m128i u = _mm_loadu_si128((__m128i*)pSrcDst); - const __m128i u1 = _mm_loadu_si128((__m128i*)pSrc2); - const __m128i uEven = _mm_shuffle_epi8(u, even); - const __m128i uEven4 = _mm_slli_epi16(uEven, 2); - const __m128i uOdd = _mm_shuffle_epi8(u, odd); - const __m128i u1Even = _mm_shuffle_epi8(u1, even); - const __m128i u1Odd = _mm_shuffle_epi8(u1, odd); - const __m128i tmp1 = _mm_add_epi16(uOdd, u1Even); - const __m128i tmp2 = _mm_add_epi16(tmp1, u1Odd); - const __m128i result = _mm_sub_epi16(uEven4, tmp2); - const __m128i packed = _mm_packus_epi16(result, uOdd); - const __m128i interleaved = _mm_shuffle_epi8(packed, interleave); - _mm_storeu_si128((__m128i*)pSrcDst, interleaved); -} - -static pstatus_t ssse3_ChromaFilter(BYTE* pDst[3], const UINT32 dstStep[3], - const RECTANGLE_16* roi) -{ - const UINT32 oddY = 1; - const UINT32 evenY = 0; - const UINT32 nWidth = roi->right - roi->left; - const UINT32 nHeight = roi->bottom - roi->top; - const UINT32 halfHeight = (nHeight + 1) / 2; - const UINT32 halfWidth = (nWidth + 1) / 2; - const UINT32 halfPad = halfWidth % 16; - UINT32 x, y; - - /* Filter */ - for (y = roi->top; y < halfHeight + roi->top; y++) - { - const UINT32 val2y = (y * 2 + evenY); - const UINT32 val2y1 = val2y + oddY; - BYTE* pU1 = pDst[1] + dstStep[1] * val2y1; - BYTE* pV1 = pDst[2] + dstStep[2] * val2y1; - BYTE* pU = pDst[1] + dstStep[1] * val2y; - BYTE* pV = pDst[2] + dstStep[2] * val2y; - - if (val2y1 > nHeight) - continue; - - for (x = roi->left; x < halfWidth + roi->left - halfPad; x += 16) - { - ssse3_filter(&pU[2 * x], &pU1[2 * x]); - ssse3_filter(&pV[2 * x], &pV1[2 * x]); - } - - for (; x < halfWidth + roi->left; x++) - { - const UINT32 val2x = (x * 2); - const UINT32 val2x1 = val2x + 1; - const INT32 up = pU[val2x] * 4; - const INT32 vp = pV[val2x] * 4; - INT32 u2020; - INT32 v2020; - - if (val2x1 > nWidth) - continue; - - u2020 = up - pU[val2x1] - pU1[val2x] - pU1[val2x1]; - v2020 = vp - pV[val2x1] - pV1[val2x] - pV1[val2x1]; - pU[val2x] = CLIP(u2020); - pV[val2x] = CLIP(v2020); - } - } - - return PRIMITIVES_SUCCESS; -} - -static pstatus_t ssse3_ChromaV1ToYUV444(const BYTE* pSrcRaw[3], const UINT32 srcStep[3], - BYTE* pDstRaw[3], const UINT32 dstStep[3], - const RECTANGLE_16* roi) -{ - const UINT32 mod = 16; - UINT32 uY = 0; - UINT32 vY = 0; - UINT32 x, y; - const UINT32 nWidth = roi->right - roi->left; - const UINT32 nHeight = roi->bottom - roi->top; - const UINT32 halfWidth = (nWidth + 1) / 2; - const UINT32 halfPad = halfWidth % 16; - const UINT32 halfHeight = (nHeight + 1) / 2; - const UINT32 oddY = 1; - const UINT32 evenY = 0; - const UINT32 oddX = 1; - /* The auxilary frame is aligned to multiples of 16x16. - * We need the padded height for B4 and B5 conversion. */ - const UINT32 padHeigth = nHeight + 16 - nHeight % 16; - const BYTE* pSrc[3] = - { - pSrcRaw[0] + roi->top* srcStep[0] + roi->left, - pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2, - pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 - }; - BYTE* pDst[3] = - { - pDstRaw[0] + roi->top* dstStep[0] + roi->left, - pDstRaw[1] + roi->top* dstStep[1] + roi->left, - pDstRaw[2] + roi->top* dstStep[2] + roi->left - }; - const __m128i zero = _mm_setzero_si128(); - const __m128i mask = _mm_set_epi8(0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, - 0x80); - - /* The second half of U and V is a bit more tricky... */ - /* B4 and B5 */ - for (y = 0; y < padHeigth; y++) - { - const BYTE* Ya = pSrc[0] + srcStep[0] * y; - BYTE* pX; - - if ((y) % mod < (mod + 1) / 2) - { - const UINT32 pos = (2 * uY++ + oddY); - - if (pos >= nHeight) - continue; - - pX = pDst[1] + dstStep[1] * pos; - } - else - { - const UINT32 pos = (2 * vY++ + oddY); - - if (pos >= nHeight) - continue; - - pX = pDst[2] + dstStep[2] * pos; - } - - memcpy(pX, Ya, nWidth); - } - - /* B6 and B7 */ - for (y = 0; y < halfHeight; y++) - { - const UINT32 val2y = (y * 2 + evenY); - const BYTE* Ua = pSrc[1] + srcStep[1] * y; - const BYTE* Va = pSrc[2] + srcStep[2] * y; - BYTE* pU = pDst[1] + dstStep[1] * val2y; - BYTE* pV = pDst[2] + dstStep[2] * val2y; - - for (x = 0; x < halfWidth - halfPad; x += 16) - { - { - const __m128i u = _mm_loadu_si128((__m128i*)&Ua[x]); - const __m128i u2 = _mm_unpackhi_epi8(u, zero); - const __m128i u1 = _mm_unpacklo_epi8(u, zero); - _mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]); - _mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]); - } - { - const __m128i u = _mm_loadu_si128((__m128i*)&Va[x]); - const __m128i u2 = _mm_unpackhi_epi8(u, zero); - const __m128i u1 = _mm_unpacklo_epi8(u, zero); - _mm_maskmoveu_si128(u1, mask, (char*)&pV[2 * x]); - _mm_maskmoveu_si128(u2, mask, (char*)&pV[2 * x + 16]); - } - } - - for (; x < halfWidth; x++) - { - const UINT32 val2x1 = (x * 2 + oddX); - pU[val2x1] = Ua[x]; - pV[val2x1] = Va[x]; - } - } - - /* Filter */ - return ssse3_ChromaFilter(pDst, dstStep, roi); -} - -static pstatus_t ssse3_ChromaV2ToYUV444(const BYTE* pSrc[3], const UINT32 srcStep[3], - UINT32 nTotalWidth, UINT32 nTotalHeight, - BYTE* pDst[3], const UINT32 dstStep[3], - const RECTANGLE_16* roi) -{ - UINT32 x, y; - const UINT32 nWidth = roi->right - roi->left; - const UINT32 nHeight = roi->bottom - roi->top; - const UINT32 halfWidth = (nWidth + 1) / 2; - const UINT32 halfPad = halfWidth % 16; - const UINT32 halfHeight = (nHeight + 1) / 2; - const UINT32 quaterWidth = (nWidth + 3) / 4; - const UINT32 quaterPad = quaterWidth % 16; - const __m128i zero = _mm_setzero_si128(); - const __m128i mask = _mm_set_epi8(0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, - 0x80, 0); - const __m128i mask2 = _mm_set_epi8(0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, - 0x80); - const __m128i shuffle1 = _mm_set_epi8(0x80, 15, 0x80, 14, 0x80, 13, 0x80, 12, 0x80, 11, 0x80, 10, - 0x80, 9, 0x80, 8); - const __m128i shuffle2 = _mm_set_epi8(0x80, 7, 0x80, 6, 0x80, 5, 0x80, 4, 0x80, 3, 0x80, 2, 0x80, 1, - 0x80, 0); - - /* B4 and B5: odd UV values for width/2, height */ - for (y = 0; y < nHeight; y++) - { - const UINT32 yTop = y + roi->top; - const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2; - const BYTE* pYaV = pYaU + nTotalWidth / 2; - BYTE* pU = pDst[1] + dstStep[1] * yTop + roi->left; - BYTE* pV = pDst[2] + dstStep[2] * yTop + roi->left; - - for (x = 0; x < halfWidth - halfPad; x += 16) - { - { - const __m128i u = _mm_loadu_si128((__m128i*)&pYaU[x]); - const __m128i u2 = _mm_unpackhi_epi8(zero, u); - const __m128i u1 = _mm_unpacklo_epi8(zero, u); - _mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]); - _mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]); - } - { - const __m128i v = _mm_loadu_si128((__m128i*)&pYaV[x]); - const __m128i v2 = _mm_unpackhi_epi8(zero, v); - const __m128i v1 = _mm_unpacklo_epi8(zero, v); - _mm_maskmoveu_si128(v1, mask, (char*)&pV[2 * x]); - _mm_maskmoveu_si128(v2, mask, (char*)&pV[2 * x + 16]); - } - } - - for (; x < halfWidth; x++) - { - const UINT32 odd = 2 * x + 1; - pU[odd] = pYaU[x]; - pV[odd] = pYaV[x]; - } - } - - /* B6 - B9 */ - for (y = 0; y < halfHeight; y++) - { - const BYTE* pUaU = pSrc[1] + srcStep[1] * (y + roi->top / 2) + roi->left / 4; - const BYTE* pUaV = pUaU + nTotalWidth / 4; - const BYTE* pVaU = pSrc[2] + srcStep[2] * (y + roi->top / 2) + roi->left / 4; - const BYTE* pVaV = pVaU + nTotalWidth / 4; - BYTE* pU = pDst[1] + dstStep[1] * (2 * y + 1 + roi->top) + roi->left; - BYTE* pV = pDst[2] + dstStep[2] * (2 * y + 1 + roi->top) + roi->left; - - for (x = 0; x < quaterWidth - quaterPad; x += 16) - { - { - const __m128i uU = _mm_loadu_si128((__m128i*)&pUaU[x]); - const __m128i uV = _mm_loadu_si128((__m128i*)&pVaU[x]); - const __m128i uHigh = _mm_unpackhi_epi8(uU, uV); - const __m128i uLow = _mm_unpacklo_epi8(uU, uV); - const __m128i u1 = _mm_shuffle_epi8(uLow, shuffle2); - const __m128i u2 = _mm_shuffle_epi8(uLow, shuffle1); - const __m128i u3 = _mm_shuffle_epi8(uHigh, shuffle2); - const __m128i u4 = _mm_shuffle_epi8(uHigh, shuffle1); - _mm_maskmoveu_si128(u1, mask2, (char*)&pU[4 * x + 0]); - _mm_maskmoveu_si128(u2, mask2, (char*)&pU[4 * x + 16]); - _mm_maskmoveu_si128(u3, mask2, (char*)&pU[4 * x + 32]); - _mm_maskmoveu_si128(u4, mask2, (char*)&pU[4 * x + 48]); - } - { - const __m128i vU = _mm_loadu_si128((__m128i*)&pUaV[x]); - const __m128i vV = _mm_loadu_si128((__m128i*)&pVaV[x]); - const __m128i vHigh = _mm_unpackhi_epi8(vU, vV); - const __m128i vLow = _mm_unpacklo_epi8(vU, vV); - const __m128i v1 = _mm_shuffle_epi8(vLow, shuffle2); - const __m128i v2 = _mm_shuffle_epi8(vLow, shuffle1); - const __m128i v3 = _mm_shuffle_epi8(vHigh, shuffle2); - const __m128i v4 = _mm_shuffle_epi8(vHigh, shuffle1); - _mm_maskmoveu_si128(v1, mask2, (char*)&pV[4 * x + 0]); - _mm_maskmoveu_si128(v2, mask2, (char*)&pV[4 * x + 16]); - _mm_maskmoveu_si128(v3, mask2, (char*)&pV[4 * x + 32]); - _mm_maskmoveu_si128(v4, mask2, (char*)&pV[4 * x + 48]); - } - } - - for (; x < quaterWidth; x++) - { - pU[4 * x + 0] = pUaU[x]; - pV[4 * x + 0] = pUaV[x]; - pU[4 * x + 2] = pVaU[x]; - pV[4 * x + 2] = pVaV[x]; - } - } - - return ssse3_ChromaFilter(pDst, dstStep, roi); -} - -static pstatus_t ssse3_YUV420CombineToYUV444( - avc444_frame_type type, - const BYTE* pSrc[3], const UINT32 srcStep[3], - UINT32 nWidth, UINT32 nHeight, - BYTE* pDst[3], const UINT32 dstStep[3], - const RECTANGLE_16* roi) -{ - if (!pSrc || !pSrc[0] || !pSrc[1] || !pSrc[2]) - return -1; - - if (!pDst || !pDst[0] || !pDst[1] || !pDst[2]) - return -1; - - if (!roi) - return -1; - - switch (type) - { - case AVC444_LUMA: - return ssse3_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi); - - case AVC444_CHROMAv1: - return ssse3_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi); - - case AVC444_CHROMAv2: - return ssse3_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi); - - default: - return -1; - } -} - -#elif defined(WITH_NEON) - -static INLINE uint8x8_t neon_YUV2R(int32x4_t Ch, int32x4_t Cl, - int16x4_t Dh, int16x4_t Dl, - int16x4_t Eh, int16x4_t El) -{ - /* R = (256 * Y + 403 * (V - 128)) >> 8 */ - const int16x4_t c403 = vdup_n_s16(403); - const int32x4_t CEh = vmlal_s16(Ch, Eh, c403); - const int32x4_t CEl = vmlal_s16(Cl, El, c403); - const int32x4_t Rh = vrshrq_n_s32(CEh, 8); - const int32x4_t Rl = vrshrq_n_s32(CEl, 8); - const int16x8_t R = vcombine_s16(vqmovn_s32(Rl), vqmovn_s32(Rh)); - return vqmovun_s16(R); -} - -static INLINE uint8x8_t neon_YUV2G(int32x4_t Ch, int32x4_t Cl, - int16x4_t Dh, int16x4_t Dl, - int16x4_t Eh, int16x4_t El) -{ - /* G = (256L * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8 */ - const int16x4_t c48 = vdup_n_s16(48); - const int16x4_t c120 = vdup_n_s16(120); - const int32x4_t CDh = vmlsl_s16(Ch, Dh, c48); - const int32x4_t CDl = vmlsl_s16(Cl, Dl, c48); - const int32x4_t CDEh = vmlsl_s16(CDh, Eh, c120); - const int32x4_t CDEl = vmlsl_s16(CDl, El, c120); - const int32x4_t Gh = vrshrq_n_s32(CDEh, 8); - const int32x4_t Gl = vrshrq_n_s32(CDEl, 8); - const int16x8_t G = vcombine_s16(vqmovn_s32(Gl), vqmovn_s32(Gh)); - return vqmovun_s16(G); -} - -static INLINE uint8x8_t neon_YUV2B(int32x4_t Ch, int32x4_t Cl, - int16x4_t Dh, int16x4_t Dl, - int16x4_t Eh, int16x4_t El) -{ - /* B = (256L * Y + 475 * (U - 128)) >> 8*/ - const int16x4_t c475 = vdup_n_s16(475); - const int32x4_t CDh = vmlal_s16(Ch, Dh, c475); - const int32x4_t CDl = vmlal_s16(Ch, Dl, c475); - const int32x4_t Bh = vrshrq_n_s32(CDh, 8); - const int32x4_t Bl = vrshrq_n_s32(CDl, 8); - const int16x8_t B = vcombine_s16(vqmovn_s32(Bl), vqmovn_s32(Bh)); - return vqmovun_s16(B); -} - -static INLINE BYTE* neon_YuvToRgbPixel(BYTE* pRGB, int16x8_t Y, int16x8_t D, int16x8_t E, - const uint8_t rPos, const uint8_t gPos, const uint8_t bPos, const uint8_t aPos) -{ - uint8x8x4_t bgrx; - const int32x4_t Ch = vmulq_n_s32(vmovl_s16(vget_high_s16(Y)), 256); /* Y * 256 */ - const int32x4_t Cl = vmulq_n_s32(vmovl_s16(vget_low_s16(Y)), 256); /* Y * 256 */ - const int16x4_t Dh = vget_high_s16(D); - const int16x4_t Dl = vget_low_s16(D); - const int16x4_t Eh = vget_high_s16(E); - const int16x4_t El = vget_low_s16(E); - { - /* B = (256L * Y + 475 * (U - 128)) >> 8*/ - const int16x4_t c475 = vdup_n_s16(475); - const int32x4_t CDh = vmlal_s16(Ch, Dh, c475); - const int32x4_t CDl = vmlal_s16(Cl, Dl, c475); - const int32x4_t Bh = vrshrq_n_s32(CDh, 8); - const int32x4_t Bl = vrshrq_n_s32(CDl, 8); - const int16x8_t B = vcombine_s16(vqmovn_s32(Bl), vqmovn_s32(Bh)); - bgrx.val[bPos] = vqmovun_s16(B); - } - { - /* G = (256L * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8 */ - const int16x4_t c48 = vdup_n_s16(48); - const int16x4_t c120 = vdup_n_s16(120); - const int32x4_t CDh = vmlsl_s16(Ch, Dh, c48); - const int32x4_t CDl = vmlsl_s16(Cl, Dl, c48); - const int32x4_t CDEh = vmlsl_s16(CDh, Eh, c120); - const int32x4_t CDEl = vmlsl_s16(CDl, El, c120); - const int32x4_t Gh = vrshrq_n_s32(CDEh, 8); - const int32x4_t Gl = vrshrq_n_s32(CDEl, 8); - const int16x8_t G = vcombine_s16(vqmovn_s32(Gl), vqmovn_s32(Gh)); - bgrx.val[gPos] = vqmovun_s16(G); - } - { - /* R = (256 * Y + 403 * (V - 128)) >> 8 */ - const int16x4_t c403 = vdup_n_s16(403); - const int32x4_t CEh = vmlal_s16(Ch, Eh, c403); - const int32x4_t CEl = vmlal_s16(Cl, El, c403); - const int32x4_t Rh = vrshrq_n_s32(CEh, 8); - const int32x4_t Rl = vrshrq_n_s32(CEl, 8); - const int16x8_t R = vcombine_s16(vqmovn_s32(Rl), vqmovn_s32(Rh)); - bgrx.val[rPos] = vqmovun_s16(R); - } - { - /* A */ - bgrx.val[aPos] = vdup_n_u8(0xFF); - } - vst4_u8(pRGB, bgrx); - pRGB += 32; - return pRGB; -} - -static INLINE pstatus_t neon_YUV420ToX( - const BYTE* pSrc[3], const UINT32 srcStep[3], - BYTE* pDst, UINT32 dstStep, - const prim_size_t* roi, const uint8_t rPos, const uint8_t gPos, - const uint8_t bPos, const uint8_t aPos) -{ - UINT32 y; - const UINT32 nWidth = roi->width; - const UINT32 nHeight = roi->height; - const DWORD pad = nWidth % 16; - const UINT32 yPad = srcStep[0] - roi->width; - const UINT32 uPad = srcStep[1] - roi->width / 2; - const UINT32 vPad = srcStep[2] - roi->width / 2; - const UINT32 dPad = dstStep - roi->width * 4; - const int16x8_t c128 = vdupq_n_s16(128); - - for (y = 0; y < nHeight; y += 2) - { - const uint8_t* pY1 = pSrc[0] + y * srcStep[0]; - const uint8_t* pY2 = pY1 + srcStep[0]; - const uint8_t* pU = pSrc[1] + (y / 2) * srcStep[1]; - const uint8_t* pV = pSrc[2] + (y / 2) * srcStep[2]; - uint8_t* pRGB1 = pDst + y * dstStep; - uint8_t* pRGB2 = pRGB1 + dstStep; - UINT32 x; - const BOOL lastY = y >= nHeight - 1; - - for (x = 0; x < nWidth - pad;) - { - const uint8x8_t Uraw = vld1_u8(pU); - const uint8x8x2_t Uu = vzip_u8(Uraw, Uraw); - const int16x8_t U1 = vreinterpretq_s16_u16(vmovl_u8(Uu.val[0])); - const int16x8_t U2 = vreinterpretq_s16_u16(vmovl_u8(Uu.val[1])); - const uint8x8_t Vraw = vld1_u8(pV); - const uint8x8x2_t Vu = vzip_u8(Vraw, Vraw); - const int16x8_t V1 = vreinterpretq_s16_u16(vmovl_u8(Vu.val[0])); - const int16x8_t V2 = vreinterpretq_s16_u16(vmovl_u8(Vu.val[1])); - const int16x8_t D1 = vsubq_s16(U1, c128); - const int16x8_t E1 = vsubq_s16(V1, c128); - const int16x8_t D2 = vsubq_s16(U2, c128); - const int16x8_t E2 = vsubq_s16(V2, c128); - { - const uint8x8_t Y1u = vld1_u8(pY1); - const int16x8_t Y1 = vreinterpretq_s16_u16(vmovl_u8(Y1u)); - pRGB1 = neon_YuvToRgbPixel(pRGB1, Y1, D1, E1, rPos, gPos, bPos, aPos); - pY1 += 8; - x += 8; - } - { - const uint8x8_t Y1u = vld1_u8(pY1); - const int16x8_t Y1 = vreinterpretq_s16_u16(vmovl_u8(Y1u)); - pRGB1 = neon_YuvToRgbPixel(pRGB1, Y1, D2, E2, rPos, gPos, bPos, aPos); - pY1 += 8; - x += 8; - } - - if (!lastY) - { - { - const uint8x8_t Y2u = vld1_u8(pY2); - const int16x8_t Y2 = vreinterpretq_s16_u16(vmovl_u8(Y2u)); - pRGB2 = neon_YuvToRgbPixel(pRGB2, Y2, D1, E1, rPos, gPos, bPos, aPos); - pY2 += 8; - } - { - const uint8x8_t Y2u = vld1_u8(pY2); - const int16x8_t Y2 = vreinterpretq_s16_u16(vmovl_u8(Y2u)); - pRGB2 = neon_YuvToRgbPixel(pRGB2, Y2, D2, E2, rPos, gPos, bPos, aPos); - pY2 += 8; - } - } - - pU += 8; - pV += 8; - } - - for (; x < nWidth; x++) - { - const BYTE U = *pU; - const BYTE V = *pV; - { - const BYTE Y = *pY1++; - const BYTE r = YUV2R(Y, U, V); - const BYTE g = YUV2G(Y, U, V); - const BYTE b = YUV2B(Y, U, V); - pRGB1[aPos] = 0xFF; - pRGB1[rPos] = r; - pRGB1[gPos] = g; - pRGB1[bPos] = b; - pRGB1 += 4; - } - - if (!lastY) - { - const BYTE Y = *pY2++; - const BYTE r = YUV2R(Y, U, V); - const BYTE g = YUV2G(Y, U, V); - const BYTE b = YUV2B(Y, U, V); - pRGB2[aPos] = 0xFF; - pRGB2[rPos] = r; - pRGB2[gPos] = g; - pRGB2[bPos] = b; - pRGB2 += 4; - } - - if (x % 2) - { - pU++; - pV++; - } - } - - pRGB1 += dPad; - pRGB2 += dPad; - pY1 += yPad; - pY2 += yPad; - pU += uPad; - pV += vPad; - } - - return PRIMITIVES_SUCCESS; -} - -static pstatus_t neon_YUV420ToRGB_8u_P3AC4R( - const BYTE* pSrc[3], const UINT32 srcStep[3], - BYTE* pDst, UINT32 dstStep, UINT32 DstFormat, - const prim_size_t* roi) -{ - switch (DstFormat) - { - case PIXEL_FORMAT_BGRA32: - case PIXEL_FORMAT_BGRX32: - return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3); - - case PIXEL_FORMAT_RGBA32: - case PIXEL_FORMAT_RGBX32: - return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3); - - case PIXEL_FORMAT_ARGB32: - case PIXEL_FORMAT_XRGB32: - return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0); - - case PIXEL_FORMAT_ABGR32: - case PIXEL_FORMAT_XBGR32: - return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0); - - default: - return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); - } -} - -static INLINE pstatus_t neon_YUV444ToX( - const BYTE* pSrc[3], const UINT32 srcStep[3], - BYTE* pDst, UINT32 dstStep, - const prim_size_t* roi, const uint8_t rPos, const uint8_t gPos, - const uint8_t bPos, const uint8_t aPos) -{ - UINT32 y; - const UINT32 nWidth = roi->width; - const UINT32 nHeight = roi->height; - const UINT32 yPad = srcStep[0] - roi->width; - const UINT32 uPad = srcStep[1] - roi->width; - const UINT32 vPad = srcStep[2] - roi->width; - const UINT32 dPad = dstStep - roi->width * 4; - const uint8_t* pY = pSrc[0]; - const uint8_t* pU = pSrc[1]; - const uint8_t* pV = pSrc[2]; - uint8_t* pRGB = pDst; - const int16x8_t c128 = vdupq_n_s16(128); - const DWORD pad = nWidth % 8; - - for (y = 0; y < nHeight; y++) - { - UINT32 x; - - for (x = 0; x < nWidth - pad; x += 8) - { - const uint8x8_t Yu = vld1_u8(pY); - const int16x8_t Y = vreinterpretq_s16_u16(vmovl_u8(Yu)); - const uint8x8_t Uu = vld1_u8(pU); - const int16x8_t U = vreinterpretq_s16_u16(vmovl_u8(Uu)); - const uint8x8_t Vu = vld1_u8(pV); - const int16x8_t V = vreinterpretq_s16_u16(vmovl_u8(Vu)); - /* Do the calculations on Y in 32bit width, the result of 255 * 256 does not fit - * a signed 16 bit value. */ - const int16x8_t D = vsubq_s16(U, c128); - const int16x8_t E = vsubq_s16(V, c128); - pRGB = neon_YuvToRgbPixel(pRGB, Y, D, E, rPos, gPos, bPos, aPos); - pY += 8; - pU += 8; - pV += 8; - } - - for (x = 0; x < pad; x++) - { - const BYTE Y = *pY++; - const BYTE U = *pU++; - const BYTE V = *pV++; - const BYTE r = YUV2R(Y, U, V); - const BYTE g = YUV2G(Y, U, V); - const BYTE b = YUV2B(Y, U, V); - pRGB[aPos] = 0xFF; - pRGB[rPos] = r; - pRGB[gPos] = g; - pRGB[bPos] = b; - pRGB += 4; - } - - pRGB += dPad; - pY += yPad; - pU += uPad; - pV += vPad; - } - - return PRIMITIVES_SUCCESS; -} - -static pstatus_t neon_YUV444ToRGB_8u_P3AC4R( - const BYTE* pSrc[3], const UINT32 srcStep[3], - BYTE* pDst, UINT32 dstStep, UINT32 DstFormat, - const prim_size_t* roi) -{ - switch (DstFormat) - { - case PIXEL_FORMAT_BGRA32: - case PIXEL_FORMAT_BGRX32: - return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3); - - case PIXEL_FORMAT_RGBA32: - case PIXEL_FORMAT_RGBX32: - return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3); - - case PIXEL_FORMAT_ARGB32: - case PIXEL_FORMAT_XRGB32: - return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0); - - case PIXEL_FORMAT_ABGR32: - case PIXEL_FORMAT_XBGR32: - return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0); - - default: - return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); - } -} - -static pstatus_t neon_LumaToYUV444(const BYTE* pSrcRaw[3], const UINT32 srcStep[3], - BYTE* pDstRaw[3], const UINT32 dstStep[3], - const RECTANGLE_16* roi) -{ - UINT32 x, y; - const UINT32 nWidth = roi->right - roi->left; - const UINT32 nHeight = roi->bottom - roi->top; - const UINT32 halfWidth = (nWidth + 1) / 2; - const UINT32 halfHeight = (nHeight + 1) / 2; - const UINT32 evenY = 0; - const BYTE* pSrc[3] = - { - pSrcRaw[0] + roi->top* srcStep[0] + roi->left, - pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2, - pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 - }; - BYTE* pDst[3] = - { - pDstRaw[0] + roi->top* dstStep[0] + roi->left, - pDstRaw[1] + roi->top* dstStep[1] + roi->left, - pDstRaw[2] + roi->top* dstStep[2] + roi->left - }; - - /* Y data is already here... */ - /* B1 */ - for (y = 0; y < nHeight; y++) - { - const BYTE* Ym = pSrc[0] + srcStep[0] * y; - BYTE* pY = pDst[0] + dstStep[0] * y; - memcpy(pY, Ym, nWidth); - } - - /* The first half of U, V are already here part of this frame. */ - /* B2 and B3 */ - for (y = 0; y < halfHeight; y++) - { - const UINT32 val2y = (2 * y + evenY); - const BYTE* Um = pSrc[1] + srcStep[1] * y; - const BYTE* Vm = pSrc[2] + srcStep[2] * y; - BYTE* pU = pDst[1] + dstStep[1] * val2y; - BYTE* pV = pDst[2] + dstStep[2] * val2y; - BYTE* pU1 = pU + dstStep[1]; - BYTE* pV1 = pV + dstStep[2]; - - for (x = 0; x + 16 < halfWidth; x += 16) - { - { - const uint8x16_t u = vld1q_u8(Um); - uint8x16x2_t u2x; - u2x.val[0] = u; - u2x.val[1] = u; - vst2q_u8(pU, u2x); - vst2q_u8(pU1, u2x); - Um += 16; - pU += 32; - pU1 += 32; - } - { - const uint8x16_t v = vld1q_u8(Vm); - uint8x16x2_t v2x; - v2x.val[0] = v; - v2x.val[1] = v; - vst2q_u8(pV, v2x); - vst2q_u8(pV1, v2x); - Vm += 16; - pV += 32; - pV1 += 32; - } - } - - for (; x < halfWidth; x++) - { - const BYTE u = *Um++; - const BYTE v = *Vm++; - *pU++ = u; - *pU++ = u; - *pU1++ = u; - *pU1++ = u; - *pV++ = v; - *pV++ = v; - *pV1++ = v; - *pV1++ = v; - } - } - - return PRIMITIVES_SUCCESS; -} - -static pstatus_t neon_ChromaFilter(BYTE* pDst[3], const UINT32 dstStep[3], - const RECTANGLE_16* roi) -{ - const UINT32 oddY = 1; - const UINT32 evenY = 0; - const UINT32 nWidth = roi->right - roi->left; - const UINT32 nHeight = roi->bottom - roi->top; - const UINT32 halfHeight = (nHeight + 1) / 2; - const UINT32 halfWidth = (nWidth + 1) / 2; - const UINT32 halfPad = halfWidth % 16; - UINT32 x, y; - - /* Filter */ - for (y = roi->top; y < halfHeight + roi->top; y++) - { - const UINT32 val2y = (y * 2 + evenY); - const UINT32 val2y1 = val2y + oddY; - BYTE* pU1 = pDst[1] + dstStep[1] * val2y1; - BYTE* pV1 = pDst[2] + dstStep[2] * val2y1; - BYTE* pU = pDst[1] + dstStep[1] * val2y; - BYTE* pV = pDst[2] + dstStep[2] * val2y; - - if (val2y1 > nHeight) - continue; - - for (x = roi->left / 2; x < halfWidth + roi->left / 2 - halfPad; x += 16) - { - { - /* U = (U2x,2y << 2) - U2x1,2y - U2x,2y1 - U2x1,2y1 */ - uint8x8x2_t u = vld2_u8(&pU[2 * x]); - const int16x8_t up = vreinterpretq_s16_u16(vshll_n_u8(u.val[0], 2)); /* Ux2,2y << 2 */ - const uint8x8x2_t u1 = vld2_u8(&pU1[2 * x]); - const uint16x8_t usub = vaddl_u8(u1.val[1], u1.val[0]); /* U2x,2y1 + U2x1,2y1 */ - const int16x8_t us = vreinterpretq_s16_u16(vaddw_u8(usub, - u.val[1])); /* U2x1,2y + U2x,2y1 + U2x1,2y1 */ - const int16x8_t un = vsubq_s16(up, us); - const uint8x8_t u8 = vqmovun_s16(un); /* CLIP(un) */ - u.val[0] = u8; - vst2_u8(&pU[2 * x], u); - } - { - /* V = (V2x,2y << 2) - V2x1,2y - V2x,2y1 - V2x1,2y1 */ - uint8x8x2_t v = vld2_u8(&pV[2 * x]); - const int16x8_t vp = vreinterpretq_s16_u16(vshll_n_u8(v.val[0], 2)); /* Vx2,2y << 2 */ - const uint8x8x2_t v1 = vld2_u8(&pV1[2 * x]); - const uint16x8_t vsub = vaddl_u8(v1.val[1], v1.val[0]); /* V2x,2y1 + V2x1,2y1 */ - const int16x8_t vs = vreinterpretq_s16_u16(vaddw_u8(vsub, - v.val[1])); /* V2x1,2y + V2x,2y1 + V2x1,2y1 */ - const int16x8_t vn = vsubq_s16(vp, vs); - const uint8x8_t v8 = vqmovun_s16(vn); /* CLIP(vn) */ - v.val[0] = v8; - vst2_u8(&pV[2 * x], v); - } - } - - for (; x < halfWidth + roi->left / 2; x++) - { - const UINT32 val2x = (x * 2); - const UINT32 val2x1 = val2x + 1; - const INT32 up = pU[val2x] * 4; - const INT32 vp = pV[val2x] * 4; - INT32 u2020; - INT32 v2020; - - if (val2x1 > nWidth) - continue; - - u2020 = up - pU[val2x1] - pU1[val2x] - pU1[val2x1]; - v2020 = vp - pV[val2x1] - pV1[val2x] - pV1[val2x1]; - pU[val2x] = CLIP(u2020); - pV[val2x] = CLIP(v2020); - } - } - - return PRIMITIVES_SUCCESS; -} - -static pstatus_t neon_ChromaV1ToYUV444(const BYTE* pSrcRaw[3], const UINT32 srcStep[3], - BYTE* pDstRaw[3], const UINT32 dstStep[3], - const RECTANGLE_16* roi) -{ - const UINT32 mod = 16; - UINT32 uY = 0; - UINT32 vY = 0; - UINT32 x, y; - const UINT32 nWidth = roi->right - roi->left; - const UINT32 nHeight = roi->bottom - roi->top; - const UINT32 halfWidth = (nWidth) / 2; - const UINT32 halfHeight = (nHeight) / 2; - const UINT32 oddY = 1; - const UINT32 evenY = 0; - const UINT32 oddX = 1; - /* The auxilary frame is aligned to multiples of 16x16. - * We need the padded height for B4 and B5 conversion. */ - const UINT32 padHeigth = nHeight + 16 - nHeight % 16; - const UINT32 halfPad = halfWidth % 16; - const BYTE* pSrc[3] = - { - pSrcRaw[0] + roi->top* srcStep[0] + roi->left, - pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2, - pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 - }; - BYTE* pDst[3] = - { - pDstRaw[0] + roi->top* dstStep[0] + roi->left, - pDstRaw[1] + roi->top* dstStep[1] + roi->left, - pDstRaw[2] + roi->top* dstStep[2] + roi->left - }; - - /* The second half of U and V is a bit more tricky... */ - /* B4 and B5 */ - for (y = 0; y < padHeigth; y++) - { - const BYTE* Ya = pSrc[0] + srcStep[0] * y; - BYTE* pX; - - if ((y) % mod < (mod + 1) / 2) - { - const UINT32 pos = (2 * uY++ + oddY); - - if (pos >= nHeight) - continue; - - pX = pDst[1] + dstStep[1] * pos; - } - else - { - const UINT32 pos = (2 * vY++ + oddY); - - if (pos >= nHeight) - continue; - - pX = pDst[2] + dstStep[2] * pos; - } - - memcpy(pX, Ya, nWidth); - } - - /* B6 and B7 */ - for (y = 0; y < halfHeight; y++) - { - const UINT32 val2y = (y * 2 + evenY); - const BYTE* Ua = pSrc[1] + srcStep[1] * y; - const BYTE* Va = pSrc[2] + srcStep[2] * y; - BYTE* pU = pDst[1] + dstStep[1] * val2y; - BYTE* pV = pDst[2] + dstStep[2] * val2y; - - for (x = 0; x < halfWidth - halfPad; x += 16) - { - { - uint8x16x2_t u = vld2q_u8(&pU[2 * x]); - u.val[1] = vld1q_u8(&Ua[x]); - vst2q_u8(&pU[2 * x], u); - } - { - uint8x16x2_t v = vld2q_u8(&pV[2 * x]); - v.val[1] = vld1q_u8(&Va[x]); - vst2q_u8(&pV[2 * x], v); - } - } - - for (; x < halfWidth; x++) - { - const UINT32 val2x1 = (x * 2 + oddX); - pU[val2x1] = Ua[x]; - pV[val2x1] = Va[x]; - } - } - - /* Filter */ - return neon_ChromaFilter(pDst, dstStep, roi); -} - -static pstatus_t neon_ChromaV2ToYUV444(const BYTE* pSrc[3], const UINT32 srcStep[3], - UINT32 nTotalWidth, UINT32 nTotalHeight, - BYTE* pDst[3], const UINT32 dstStep[3], - const RECTANGLE_16* roi) -{ - UINT32 x, y; - const UINT32 nWidth = roi->right - roi->left; - const UINT32 nHeight = roi->bottom - roi->top; - const UINT32 halfWidth = (nWidth + 1) / 2; - const UINT32 halfPad = halfWidth % 16; - const UINT32 halfHeight = (nHeight + 1) / 2; - const UINT32 quaterWidth = (nWidth + 3) / 4; - const UINT32 quaterPad = quaterWidth % 16; - - /* B4 and B5: odd UV values for width/2, height */ - for (y = 0; y < nHeight; y++) - { - const UINT32 yTop = y + roi->top; - const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2; - const BYTE* pYaV = pYaU + nTotalWidth / 2; - BYTE* pU = pDst[1] + dstStep[1] * yTop + roi->left; - BYTE* pV = pDst[2] + dstStep[2] * yTop + roi->left; - - for (x = 0; x < halfWidth - halfPad; x += 16) - { - { - uint8x16x2_t u = vld2q_u8(&pU[2 * x]); - u.val[1] = vld1q_u8(&pYaU[x]); - vst2q_u8(&pU[2 * x], u); - } - { - uint8x16x2_t v = vld2q_u8(&pV[2 * x]); - v.val[1] = vld1q_u8(&pYaV[x]); - vst2q_u8(&pV[2 * x], v); - } - } - - for (; x < halfWidth; x++) - { - const UINT32 odd = 2 * x + 1; - pU[odd] = pYaU[x]; - pV[odd] = pYaV[x]; - } - } - - /* B6 - B9 */ - for (y = 0; y < halfHeight; y++) - { - const BYTE* pUaU = pSrc[1] + srcStep[1] * (y + roi->top / 2) + roi->left / 4; - const BYTE* pUaV = pUaU + nTotalWidth / 4; - const BYTE* pVaU = pSrc[2] + srcStep[2] * (y + roi->top / 2) + roi->left / 4; - const BYTE* pVaV = pVaU + nTotalWidth / 4; - BYTE* pU = pDst[1] + dstStep[1] * (2 * y + 1 + roi->top) + roi->left; - BYTE* pV = pDst[2] + dstStep[2] * (2 * y + 1 + roi->top) + roi->left; - - for (x = 0; x < quaterWidth - quaterPad; x += 16) - { - { - uint8x16x4_t u = vld4q_u8(&pU[4 * x]); - u.val[0] = vld1q_u8(&pUaU[x]); - u.val[2] = vld1q_u8(&pVaU[x]); - vst4q_u8(&pU[4 * x], u); - } - { - uint8x16x4_t v = vld4q_u8(&pV[4 * x]); - v.val[0] = vld1q_u8(&pUaV[x]); - v.val[2] = vld1q_u8(&pVaV[x]); - vst4q_u8(&pV[4 * x], v); - } - } - - for (; x < quaterWidth; x++) - { - pU[4 * x + 0] = pUaU[x]; - pV[4 * x + 0] = pUaV[x]; - pU[4 * x + 2] = pVaU[x]; - pV[4 * x + 2] = pVaV[x]; - } - } - - return neon_ChromaFilter(pDst, dstStep, roi); -} - -static pstatus_t neon_YUV420CombineToYUV444( - avc444_frame_type type, - const BYTE* pSrc[3], const UINT32 srcStep[3], - UINT32 nWidth, UINT32 nHeight, - BYTE* pDst[3], const UINT32 dstStep[3], - const RECTANGLE_16* roi) -{ - if (!pSrc || !pSrc[0] || !pSrc[1] || !pSrc[2]) - return -1; - - if (!pDst || !pDst[0] || !pDst[1] || !pDst[2]) - return -1; - - if (!roi) - return -1; - - switch (type) - { - case AVC444_LUMA: - return neon_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi); - - case AVC444_CHROMAv1: - return neon_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi); - - case AVC444_CHROMAv2: - return neon_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi); - - default: - return -1; - } -} -#endif - -void primitives_init_YUV_opt(primitives_t* prims) -{ - generic = primitives_get_generic(); - primitives_init_YUV(prims); -#ifdef WITH_SSE2 - - if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) - && IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) - { - prims->RGBToYUV420_8u_P3AC4R = ssse3_RGBToYUV420; - prims->RGBToAVC444YUV = ssse3_RGBToAVC444YUV; - prims->YUV420ToRGB_8u_P3AC4R = ssse3_YUV420ToRGB; - prims->YUV444ToRGB_8u_P3AC4R = ssse3_YUV444ToRGB_8u_P3AC4R; - prims->YUV420CombineToYUV444 = ssse3_YUV420CombineToYUV444; - } - -#elif defined(WITH_NEON) - - if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE)) - { - prims->YUV420ToRGB_8u_P3AC4R = neon_YUV420ToRGB_8u_P3AC4R; - prims->YUV444ToRGB_8u_P3AC4R = neon_YUV444ToRGB_8u_P3AC4R; - prims->YUV420CombineToYUV444 = neon_YUV420CombineToYUV444; - } - -#endif -} diff --git a/libfreerdp/primitives/prim_YUV_ssse3.c b/libfreerdp/primitives/prim_YUV_ssse3.c new file mode 100644 index 0000000..7569585 --- /dev/null +++ b/libfreerdp/primitives/prim_YUV_ssse3.c @@ -0,0 +1,1816 @@ +/** + * FreeRDP: A Remote Desktop Protocol Implementation + * Optimized YUV/RGB conversion operations + * + * Copyright 2014 Thomas Erbesdobler + * Copyright 2016-2017 Armin Novak + * Copyright 2016-2017 Norbert Federa + * Copyright 2016-2017 Thincast Technologies GmbH + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include + +#include "prim_internal.h" + +#ifdef WITH_SSE2 + +#include +#include +#elif defined(WITH_NEON) +#include +#endif /* WITH_SSE2 else WITH_NEON */ + +static primitives_t* generic = NULL; + +#ifdef WITH_SSE2 +/****************************************************************************/ +/* SSSE3 YUV420 -> RGB conversion */ +/****************************************************************************/ +static __m128i* ssse3_YUV444Pixel(__m128i* dst, __m128i Yraw, __m128i Uraw, __m128i Vraw, UINT8 pos) +{ + /* Visual Studio 2010 doesn't like _mm_set_epi32 in array initializer list */ + /* Note: This also applies to Visual Studio 2013 before Update 4 */ +#if !defined(_MSC_VER) || (_MSC_VER > 1600) + const __m128i mapY[] = + { + _mm_set_epi32(0x80800380, 0x80800280, 0x80800180, 0x80800080), + _mm_set_epi32(0x80800780, 0x80800680, 0x80800580, 0x80800480), + _mm_set_epi32(0x80800B80, 0x80800A80, 0x80800980, 0x80800880), + _mm_set_epi32(0x80800F80, 0x80800E80, 0x80800D80, 0x80800C80) + }; + const __m128i mapUV[] = + { + _mm_set_epi32(0x80038002, 0x80018000, 0x80808080, 0x80808080), + _mm_set_epi32(0x80078006, 0x80058004, 0x80808080, 0x80808080), + _mm_set_epi32(0x800B800A, 0x80098008, 0x80808080, 0x80808080), + _mm_set_epi32(0x800F800E, 0x800D800C, 0x80808080, 0x80808080) + }; + const __m128i mask[] = + { + _mm_set_epi32(0x80038080, 0x80028080, 0x80018080, 0x80008080), + _mm_set_epi32(0x80800380, 0x80800280, 0x80800180, 0x80800080), + _mm_set_epi32(0x80808003, 0x80808002, 0x80808001, 0x80808000) + }; +#else + /* Note: must be in little-endian format ! */ + const __m128i mapY[] = + { + { 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80, 0x80, 0x80, 0x03, 0x80, 0x80 }, + { 0x80, 0x04, 0x80, 0x80, 0x80, 0x05, 0x80, 0x80, 0x80, 0x06, 0x80, 0x80, 0x80, 0x07, 0x80, 0x80 }, + { 0x80, 0x08, 0x80, 0x80, 0x80, 0x09, 0x80, 0x80, 0x80, 0x0a, 0x80, 0x80, 0x80, 0x0b, 0x80, 0x80 }, + { 0x80, 0x0c, 0x80, 0x80, 0x80, 0x0d, 0x80, 0x80, 0x80, 0x0e, 0x80, 0x80, 0x80, 0x0f, 0x80, 0x80 } + + }; + const __m128i mapUV[] = + { + { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x00, 0x80, 0x01, 0x80, 0x02, 0x80, 0x03, 0x80 }, + { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x04, 0x80, 0x05, 0x80, 0x06, 0x80, 0x07, 0x80 }, + { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x08, 0x80, 0x09, 0x80, 0x0a, 0x80, 0x0b, 0x80 }, + { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x0c, 0x80, 0x0d, 0x80, 0x0e, 0x80, 0x0f, 0x80 } + }; + const __m128i mask[] = + { + { 0x80, 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80, 0x80, 0x80, 0x03, 0x80 }, + { 0x80, 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80, 0x80, 0x80, 0x03, 0x80, 0x80 }, + { 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, 0x02, 0x80, 0x80, 0x80, 0x03, 0x80, 0x80, 0x80 } + }; +#endif + const __m128i c128 = _mm_set1_epi16(128); + __m128i BGRX = _mm_set_epi32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000); + { + __m128i C, D, E; + /* Load Y values and expand to 32 bit */ + { + C = _mm_shuffle_epi8(Yraw, mapY[pos]); /* Reorder and multiply by 256 */ + } + /* Load U values and expand to 32 bit */ + { + const __m128i U = _mm_shuffle_epi8(Uraw, mapUV[pos]); /* Reorder dcba */ + D = _mm_sub_epi16(U, c128); /* D = U - 128 */ + } + /* Load V values and expand to 32 bit */ + { + const __m128i V = _mm_shuffle_epi8(Vraw, mapUV[pos]); /* Reorder dcba */ + E = _mm_sub_epi16(V, c128); /* E = V - 128 */ + } + /* Get the R value */ + { + const __m128i c403 = _mm_set1_epi16(403); + const __m128i e403 = _mm_unpackhi_epi16(_mm_mullo_epi16(E, c403), _mm_mulhi_epi16(E, c403)); + const __m128i Rs = _mm_add_epi32(C, e403); + const __m128i R32 = _mm_srai_epi32(Rs, 8); + const __m128i R16 = _mm_packs_epi32(R32, _mm_setzero_si128()); + const __m128i R = _mm_packus_epi16(R16, _mm_setzero_si128()); + const __m128i packed = _mm_shuffle_epi8(R, mask[0]); + BGRX = _mm_or_si128(BGRX, packed); + } + /* Get the G value */ + { + const __m128i c48 = _mm_set1_epi16(48); + const __m128i d48 = _mm_unpackhi_epi16(_mm_mullo_epi16(D, c48), _mm_mulhi_epi16(D, c48)); + const __m128i c120 = _mm_set1_epi16(120); + const __m128i e120 = _mm_unpackhi_epi16(_mm_mullo_epi16(E, c120), _mm_mulhi_epi16(E, c120)); + const __m128i de = _mm_add_epi32(d48, e120); + const __m128i Gs = _mm_sub_epi32(C, de); + const __m128i G32 = _mm_srai_epi32(Gs, 8); + const __m128i G16 = _mm_packs_epi32(G32, _mm_setzero_si128()); + const __m128i G = _mm_packus_epi16(G16, _mm_setzero_si128()); + const __m128i packed = _mm_shuffle_epi8(G, mask[1]); + BGRX = _mm_or_si128(BGRX, packed); + } + /* Get the B value */ + { + const __m128i c475 = _mm_set1_epi16(475); + const __m128i d475 = _mm_unpackhi_epi16(_mm_mullo_epi16(D, c475), _mm_mulhi_epi16(D, c475)); + const __m128i Bs = _mm_add_epi32(C, d475); + const __m128i B32 = _mm_srai_epi32(Bs, 8); + const __m128i B16 = _mm_packs_epi32(B32, _mm_setzero_si128()); + const __m128i B = _mm_packus_epi16(B16, _mm_setzero_si128()); + const __m128i packed = _mm_shuffle_epi8(B, mask[2]); + BGRX = _mm_or_si128(BGRX, packed); + } + } + _mm_storeu_si128(dst++, BGRX); + return dst; +} + +static pstatus_t ssse3_YUV420ToRGB_BGRX( + const BYTE** pSrc, const UINT32* srcStep, + BYTE* pDst, UINT32 dstStep, + const prim_size_t* roi) +{ + const UINT32 nWidth = roi->width; + const UINT32 nHeight = roi->height; + const UINT32 pad = roi->width % 16; + const __m128i duplicate = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); + UINT32 y; + + for (y = 0; y < nHeight; y++) + { + UINT32 x; + __m128i* dst = (__m128i*)(pDst + dstStep * y); + const BYTE* YData = pSrc[0] + y * srcStep[0]; + const BYTE* UData = pSrc[1] + (y / 2) * srcStep[1]; + const BYTE* VData = pSrc[2] + (y / 2) * srcStep[2]; + + for (x = 0; x < nWidth - pad; x += 16) + { + const __m128i Y = _mm_loadu_si128((__m128i*)YData); + const __m128i uRaw = _mm_loadu_si128((__m128i*)UData); + const __m128i vRaw = _mm_loadu_si128((__m128i*)VData); + const __m128i U = _mm_shuffle_epi8(uRaw, duplicate); + const __m128i V = _mm_shuffle_epi8(vRaw, duplicate); + YData += 16; + UData += 8; + VData += 8; + dst = ssse3_YUV444Pixel(dst, Y, U, V, 0); + dst = ssse3_YUV444Pixel(dst, Y, U, V, 1); + dst = ssse3_YUV444Pixel(dst, Y, U, V, 2); + dst = ssse3_YUV444Pixel(dst, Y, U, V, 3); + } + + for (x = 0; x < pad; x++) + { + const BYTE Y = *YData++; + const BYTE U = *UData; + const BYTE V = *VData; + const BYTE r = YUV2R(Y, U, V); + const BYTE g = YUV2G(Y, U, V); + const BYTE b = YUV2B(Y, U, V); + dst = (__m128i*)writePixelBGRX((BYTE*)dst, 4, PIXEL_FORMAT_BGRX32, r, g, b, 0xFF); + + if (x % 2) + { + UData++; + VData++; + } + } + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t ssse3_YUV420ToRGB( + const BYTE** pSrc, const UINT32* srcStep, + BYTE* pDst, UINT32 dstStep, UINT32 DstFormat, + const prim_size_t* roi) +{ + switch (DstFormat) + { + case PIXEL_FORMAT_BGRX32: + case PIXEL_FORMAT_BGRA32: + return ssse3_YUV420ToRGB_BGRX(pSrc, srcStep, pDst, dstStep, roi); + + default: + return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); + } +} + +static pstatus_t ssse3_YUV444ToRGB_8u_P3AC4R_BGRX( + const BYTE** pSrc, const UINT32* srcStep, + BYTE* pDst, UINT32 dstStep, + const prim_size_t* roi) +{ + const UINT32 nWidth = roi->width; + const UINT32 nHeight = roi->height; + const UINT32 pad = roi->width % 16; + UINT32 y; + + for (y = 0; y < nHeight; y++) + { + UINT32 x; + __m128i* dst = (__m128i*)(pDst + dstStep * y); + const BYTE* YData = pSrc[0] + y * srcStep[0]; + const BYTE* UData = pSrc[1] + y * srcStep[1]; + const BYTE* VData = pSrc[2] + y * srcStep[2]; + + for (x = 0; x < nWidth - pad; x += 16) + { + __m128i Y = _mm_load_si128((__m128i*)YData); + __m128i U = _mm_load_si128((__m128i*)UData); + __m128i V = _mm_load_si128((__m128i*)VData); + YData += 16; + UData += 16; + VData += 16; + dst = ssse3_YUV444Pixel(dst, Y, U, V, 0); + dst = ssse3_YUV444Pixel(dst, Y, U, V, 1); + dst = ssse3_YUV444Pixel(dst, Y, U, V, 2); + dst = ssse3_YUV444Pixel(dst, Y, U, V, 3); + } + + for (x = 0; x < pad; x++) + { + const BYTE Y = *YData++; + const BYTE U = *UData++; + const BYTE V = *VData++; + const BYTE r = YUV2R(Y, U, V); + const BYTE g = YUV2G(Y, U, V); + const BYTE b = YUV2B(Y, U, V); + dst = (__m128i*)writePixelBGRX((BYTE*)dst, 4, PIXEL_FORMAT_BGRX32, r, g, b, 0xFF); + } + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t ssse3_YUV444ToRGB_8u_P3AC4R(const BYTE** pSrc, const UINT32* srcStep, + BYTE* pDst, UINT32 dstStep, UINT32 DstFormat, + const prim_size_t* roi) +{ + if ((unsigned long)pSrc[0] % 16 || (unsigned long)pSrc[1] % 16 || (unsigned long)pSrc[2] % 16 || + srcStep[0] % 16 || srcStep[1] % 16 || srcStep[2] % 16) + return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); + + switch (DstFormat) + { + case PIXEL_FORMAT_BGRX32: + case PIXEL_FORMAT_BGRA32: + return ssse3_YUV444ToRGB_8u_P3AC4R_BGRX(pSrc, srcStep, pDst, dstStep, roi); + + default: + return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); + } +} + +/****************************************************************************/ +/* SSSE3 RGB -> YUV420 conversion **/ +/****************************************************************************/ + + +/** + * Note (nfedera): + * The used forward transformation factors from RGB to YUV are based on the + * values specified in [Rec. ITU-R BT.709-6] Section 3: + * http://www.itu.int/rec/R-REC-BT.709-6-201506-I/en + * + * Y = 0.21260 * R + 0.71520 * G + 0.07220 * B + 0; + * U = -0.11457 * R - 0.38543 * G + 0.50000 * B + 128; + * V = 0.50000 * R - 0.45415 * G - 0.04585 * B + 128; + * + * The most accurate integer arithmetic approximation when using 8-bit signed + * integer factors with 16-bit signed integer intermediate results is: + * + * Y = ( ( 27 * R + 92 * G + 9 * B) >> 7 ); + * U = ( (-15 * R - 49 * G + 64 * B) >> 7 ) + 128; + * V = ( ( 64 * R - 58 * G - 6 * B) >> 7 ) + 128; + * + */ + +PRIM_ALIGN_128 static const BYTE bgrx_y_factors[] = +{ + 9, 92, 27, 0, 9, 92, 27, 0, 9, 92, 27, 0, 9, 92, 27, 0 +}; +PRIM_ALIGN_128 static const BYTE bgrx_u_factors[] = +{ + 64, -49, -15, 0, 64, -49, -15, 0, 64, -49, -15, 0, 64, -49, -15, 0 +}; +PRIM_ALIGN_128 static const BYTE bgrx_v_factors[] = +{ + -6, -58, 64, 0, -6, -58, 64, 0, -6, -58, 64, 0, -6, -58, 64, 0 +}; + +PRIM_ALIGN_128 static const BYTE const_buf_128b[] = +{ + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 +}; + +/* +TODO: +RGB[AX] can simply be supported using the following factors. And instead of loading the +globals directly the functions below could be passed pointers to the correct vectors +depending on the source picture format. + +PRIM_ALIGN_128 static const BYTE rgbx_y_factors[] = { + 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0, 27, 92, 9, 0 +}; +PRIM_ALIGN_128 static const BYTE rgbx_u_factors[] = { + -15, -49, 64, 0, -15, -49, 64, 0, -15, -49, 64, 0, -15, -49, 64, 0 +}; +PRIM_ALIGN_128 static const BYTE rgbx_v_factors[] = { + 64, -58, -6, 0, 64, -58, -6, 0, 64, -58, -6, 0, 64, -58, -6, 0 +}; +*/ + + +/* compute the luma (Y) component from a single rgb source line */ + +static INLINE void ssse3_RGBToYUV420_BGRX_Y( + const BYTE* src, BYTE* dst, UINT32 width) +{ + UINT32 x; + __m128i y_factors, x0, x1, x2, x3; + const __m128i* argb = (const __m128i*) src; + __m128i* ydst = (__m128i*) dst; + y_factors = _mm_load_si128((__m128i*)bgrx_y_factors); + + for (x = 0; x < width; x += 16) + { + /* store 16 rgba pixels in 4 128 bit registers */ + x0 = _mm_load_si128(argb++); // 1st 4 pixels + x1 = _mm_load_si128(argb++); // 2nd 4 pixels + x2 = _mm_load_si128(argb++); // 3rd 4 pixels + x3 = _mm_load_si128(argb++); // 4th 4 pixels + /* multiplications and subtotals */ + x0 = _mm_maddubs_epi16(x0, y_factors); + x1 = _mm_maddubs_epi16(x1, y_factors); + x2 = _mm_maddubs_epi16(x2, y_factors); + x3 = _mm_maddubs_epi16(x3, y_factors); + /* the total sums */ + x0 = _mm_hadd_epi16(x0, x1); + x2 = _mm_hadd_epi16(x2, x3); + /* shift the results */ + x0 = _mm_srli_epi16(x0, 7); + x2 = _mm_srli_epi16(x2, 7); + /* pack the 16 words into bytes */ + x0 = _mm_packus_epi16(x0, x2); + /* save to y plane */ + _mm_storeu_si128(ydst++, x0); + } +} + +/* compute the chrominance (UV) components from two rgb source lines */ + +static INLINE void ssse3_RGBToYUV420_BGRX_UV( + const BYTE* src1, const BYTE* src2, + BYTE* dst1, BYTE* dst2, UINT32 width) +{ + UINT32 x; + __m128i vector128, u_factors, v_factors, x0, x1, x2, x3, x4, x5; + const __m128i* rgb1 = (const __m128i*)src1; + const __m128i* rgb2 = (const __m128i*)src2; + __m64* udst = (__m64*)dst1; + __m64* vdst = (__m64*)dst2; + vector128 = _mm_load_si128((__m128i*)const_buf_128b); + u_factors = _mm_load_si128((__m128i*)bgrx_u_factors); + v_factors = _mm_load_si128((__m128i*)bgrx_v_factors); + + for (x = 0; x < width; x += 16) + { + /* subsample 16x2 pixels into 16x1 pixels */ + x0 = _mm_load_si128(rgb1++); + x4 = _mm_load_si128(rgb2++); + x0 = _mm_avg_epu8(x0, x4); + x1 = _mm_load_si128(rgb1++); + x4 = _mm_load_si128(rgb2++); + x1 = _mm_avg_epu8(x1, x4); + x2 = _mm_load_si128(rgb1++); + x4 = _mm_load_si128(rgb2++); + x2 = _mm_avg_epu8(x2, x4); + x3 = _mm_load_si128(rgb1++); + x4 = _mm_load_si128(rgb2++); + x3 = _mm_avg_epu8(x3, x4); + /* subsample these 16x1 pixels into 8x1 pixels */ + /** + * shuffle controls + * c = a[0],a[2],b[0],b[2] == 10 00 10 00 = 0x88 + * c = a[1],a[3],b[1],b[3] == 11 01 11 01 = 0xdd + */ + x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0x88)); + x0 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x0), _mm_castsi128_ps(x1), 0xdd)); + x0 = _mm_avg_epu8(x0, x4); + x4 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0x88)); + x1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x2), _mm_castsi128_ps(x3), 0xdd)); + x1 = _mm_avg_epu8(x1, x4); + /* multiplications and subtotals */ + x2 = _mm_maddubs_epi16(x0, u_factors); + x3 = _mm_maddubs_epi16(x1, u_factors); + x4 = _mm_maddubs_epi16(x0, v_factors); + x5 = _mm_maddubs_epi16(x1, v_factors); + /* the total sums */ + x0 = _mm_hadd_epi16(x2, x3); + x1 = _mm_hadd_epi16(x4, x5); + /* shift the results */ + x0 = _mm_srai_epi16(x0, 7); + x1 = _mm_srai_epi16(x1, 7); + /* pack the 16 words into bytes */ + x0 = _mm_packs_epi16(x0, x1); + /* add 128 */ + x0 = _mm_add_epi8(x0, vector128); + /* the lower 8 bytes go to the u plane */ + _mm_storel_pi(udst++, _mm_castsi128_ps(x0)); + /* the upper 8 bytes go to the v plane */ + _mm_storeh_pi(vdst++, _mm_castsi128_ps(x0)); + } +} + +static pstatus_t ssse3_RGBToYUV420_BGRX( + const BYTE* pSrc, UINT32 srcFormat, UINT32 srcStep, + BYTE* pDst[3], UINT32 dstStep[3], + const prim_size_t* roi) +{ + UINT32 y; + const BYTE* argb = pSrc; + BYTE* ydst = pDst[0]; + BYTE* udst = pDst[1]; + BYTE* vdst = pDst[2]; + + if (roi->height < 1 || roi->width < 1) + { + return !PRIMITIVES_SUCCESS; + } + + if (roi->width % 16 || (unsigned long)pSrc % 16 || srcStep % 16) + { + return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi); + } + + for (y = 0; y < roi->height - 1; y += 2) + { + const BYTE* line1 = argb; + const BYTE* line2 = argb + srcStep; + ssse3_RGBToYUV420_BGRX_UV(line1, line2, udst, vdst, roi->width); + ssse3_RGBToYUV420_BGRX_Y(line1, ydst, roi->width); + ssse3_RGBToYUV420_BGRX_Y(line2, ydst + dstStep[0], roi->width); + argb += 2 * srcStep; + ydst += 2 * dstStep[0]; + udst += 1 * dstStep[1]; + vdst += 1 * dstStep[2]; + } + + if (roi->height & 1) + { + /* pass the same last line of an odd height twice for UV */ + ssse3_RGBToYUV420_BGRX_UV(argb, argb, udst, vdst, roi->width); + ssse3_RGBToYUV420_BGRX_Y(argb, ydst, roi->width); + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t ssse3_RGBToYUV420( + const BYTE* pSrc, UINT32 srcFormat, UINT32 srcStep, + BYTE* pDst[3], UINT32 dstStep[3], + const prim_size_t* roi) +{ + switch (srcFormat) + { + case PIXEL_FORMAT_BGRX32: + case PIXEL_FORMAT_BGRA32: + return ssse3_RGBToYUV420_BGRX(pSrc, srcFormat, srcStep, pDst, dstStep, roi); + + default: + return generic->RGBToYUV420_8u_P3AC4R(pSrc, srcFormat, srcStep, pDst, dstStep, roi); + } +} + + +/****************************************************************************/ +/* SSSE3 RGB -> AVC444-YUV conversion **/ +/****************************************************************************/ + +static INLINE void ssse3_RGBToAVC444YUV_BGRX_ROW( + const BYTE* src, BYTE* ydst, BYTE* udst1, BYTE* udst2, BYTE* vdst1, BYTE* vdst2, BOOL isEvenRow, + UINT32 width) +{ + UINT32 x; + __m128i vector128, y_factors, u_factors, v_factors, smask; + __m128i x1, x2, x3, x4, y, y1, y2, u, u1, u2, v, v1, v2; + const __m128i* argb = (const __m128i*) src; + __m128i* py = (__m128i*) ydst; + __m64* pu1 = (__m64*) udst1; + __m64* pu2 = (__m64*) udst2; + __m64* pv1 = (__m64*) vdst1; + __m64* pv2 = (__m64*) vdst2; + y_factors = _mm_load_si128((__m128i*)bgrx_y_factors); + u_factors = _mm_load_si128((__m128i*)bgrx_u_factors); + v_factors = _mm_load_si128((__m128i*)bgrx_v_factors); + vector128 = _mm_load_si128((__m128i*)const_buf_128b); + smask = _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0); + + for (x = 0; x < width; x += 16) + { + /* store 16 rgba pixels in 4 128 bit registers */ + x1 = _mm_load_si128(argb++); // 1st 4 pixels + x2 = _mm_load_si128(argb++); // 2nd 4 pixels + x3 = _mm_load_si128(argb++); // 3rd 4 pixels + x4 = _mm_load_si128(argb++); // 4th 4 pixels + /* Y: multiplications with subtotals and horizontal sums */ + y1 = _mm_hadd_epi16(_mm_maddubs_epi16(x1, y_factors), _mm_maddubs_epi16(x2, y_factors)); + y2 = _mm_hadd_epi16(_mm_maddubs_epi16(x3, y_factors), _mm_maddubs_epi16(x4, y_factors)); + /* Y: shift the results (logical) */ + y1 = _mm_srli_epi16(y1, 7); + y2 = _mm_srli_epi16(y2, 7); + /* Y: pack (unsigned) 16 words into bytes */ + y = _mm_packus_epi16(y1, y2); + /* U: multiplications with subtotals and horizontal sums */ + u1 = _mm_hadd_epi16(_mm_maddubs_epi16(x1, u_factors), _mm_maddubs_epi16(x2, u_factors)); + u2 = _mm_hadd_epi16(_mm_maddubs_epi16(x3, u_factors), _mm_maddubs_epi16(x4, u_factors)); + /* U: shift the results (arithmetic) */ + u1 = _mm_srai_epi16(u1, 7); + u2 = _mm_srai_epi16(u2, 7); + /* U: pack (signed) 16 words into bytes */ + u = _mm_packs_epi16(u1, u2); + /* U: add 128 */ + u = _mm_add_epi8(u, vector128); + /* V: multiplications with subtotals and horizontal sums */ + v1 = _mm_hadd_epi16(_mm_maddubs_epi16(x1, v_factors), _mm_maddubs_epi16(x2, v_factors)); + v2 = _mm_hadd_epi16(_mm_maddubs_epi16(x3, v_factors), _mm_maddubs_epi16(x4, v_factors)); + /* V: shift the results (arithmetic) */ + v1 = _mm_srai_epi16(v1, 7); + v2 = _mm_srai_epi16(v2, 7); + /* V: pack (signed) 16 words into bytes */ + v = _mm_packs_epi16(v1, v2); + /* V: add 128 */ + v = _mm_add_epi8(v, vector128); + /* store y */ + _mm_storeu_si128(py++, y); + + /* store u and v */ + if (isEvenRow) + { + u = _mm_shuffle_epi8(u, smask); + v = _mm_shuffle_epi8(v, smask); + _mm_storel_pi(pu1++, _mm_castsi128_ps(u)); + _mm_storeh_pi(pu2++, _mm_castsi128_ps(u)); + _mm_storel_pi(pv1++, _mm_castsi128_ps(v)); + _mm_storeh_pi(pv2++, _mm_castsi128_ps(v)); + } + else + { + _mm_storel_pi(pu1, _mm_castsi128_ps(u)); + _mm_storeh_pi(pu2, _mm_castsi128_ps(u)); + _mm_storel_pi(pv1, _mm_castsi128_ps(v)); + _mm_storeh_pi(pv2, _mm_castsi128_ps(v)); + pu1 += 2; + pu2 += 2; + pv1 += 2; + pv2 += 2; + } + } +} + + +static pstatus_t ssse3_RGBToAVC444YUV_BGRX( + const BYTE* pSrc, UINT32 srcFormat, UINT32 srcStep, + BYTE* pDst1[3], const UINT32 dst1Step[3], + BYTE* pDst2[3], const UINT32 dst2Step[3], + const prim_size_t* roi) +{ + UINT32 y, numRows; + BOOL evenRow = TRUE; + BYTE* b1, *b2, *b3, *b4, *b5, *b6, *b7; + const BYTE* pMaxSrc = pSrc + (roi->height - 1) * srcStep; + + if (roi->height < 1 || roi->width < 1) + { + return !PRIMITIVES_SUCCESS; + } + + if (roi->width % 16 || (unsigned long)pSrc % 16 || srcStep % 16) + { + return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step, roi); + } + + numRows = (roi->height + 1) & ~1; + + for (y = 0; y < numRows; y++, evenRow = !evenRow) + { + const BYTE* src = y < roi->height ? pSrc + y * srcStep : pMaxSrc; + UINT32 i = y >> 1; + b1 = pDst1[0] + y * dst1Step[0]; + + if (evenRow) + { + b2 = pDst1[1] + i * dst1Step[1]; + b3 = pDst1[2] + i * dst1Step[2]; + b6 = pDst2[1] + i * dst2Step[1]; + b7 = pDst2[2] + i * dst2Step[2]; + ssse3_RGBToAVC444YUV_BGRX_ROW(src, b1, b2, b6, b3, b7, TRUE, roi->width); + } + else + { + b4 = pDst2[0] + dst2Step[0] * ((i & ~7) + i); + b5 = b4 + 8 * dst2Step[0]; + ssse3_RGBToAVC444YUV_BGRX_ROW(src, b1, b4, b4 + 8, b5, b5 + 8, FALSE, roi->width); + } + } + + return PRIMITIVES_SUCCESS; +} + + +static pstatus_t ssse3_RGBToAVC444YUV( + const BYTE* pSrc, UINT32 srcFormat, UINT32 srcStep, + BYTE* pDst1[3], const UINT32 dst1Step[3], + BYTE* pDst2[3], const UINT32 dst2Step[3], + const prim_size_t* roi) +{ + switch (srcFormat) + { + case PIXEL_FORMAT_BGRX32: + case PIXEL_FORMAT_BGRA32: + return ssse3_RGBToAVC444YUV_BGRX(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step, roi); + + default: + return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step, roi); + } +} + +static pstatus_t ssse3_LumaToYUV444(const BYTE* pSrcRaw[3], const UINT32 srcStep[3], + BYTE* pDstRaw[3], const UINT32 dstStep[3], + const RECTANGLE_16* roi) +{ + UINT32 x, y; + const UINT32 nWidth = roi->right - roi->left; + const UINT32 nHeight = roi->bottom - roi->top; + const UINT32 halfWidth = (nWidth + 1) / 2; + const UINT32 halfPad = halfWidth % 16; + const UINT32 halfHeight = (nHeight + 1) / 2; + const UINT32 oddY = 1; + const UINT32 evenY = 0; + const UINT32 oddX = 1; + const UINT32 evenX = 0; + const BYTE* pSrc[3] = + { + pSrcRaw[0] + roi->top* srcStep[0] + roi->left, + pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2, + pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 + }; + BYTE* pDst[3] = + { + pDstRaw[0] + roi->top* dstStep[0] + roi->left, + pDstRaw[1] + roi->top* dstStep[1] + roi->left, + pDstRaw[2] + roi->top* dstStep[2] + roi->left + }; + + /* Y data is already here... */ + /* B1 */ + for (y = 0; y < nHeight; y++) + { + const BYTE* Ym = pSrc[0] + srcStep[0] * y; + BYTE* pY = pDst[0] + dstStep[0] * y; + memcpy(pY, Ym, nWidth); + } + + /* The first half of U, V are already here part of this frame. */ + /* B2 and B3 */ + for (y = 0; y < halfHeight; y++) + { + const UINT32 val2y = (2 * y + evenY); + const UINT32 val2y1 = val2y + oddY; + const BYTE* Um = pSrc[1] + srcStep[1] * y; + const BYTE* Vm = pSrc[2] + srcStep[2] * y; + BYTE* pU = pDst[1] + dstStep[1] * val2y; + BYTE* pV = pDst[2] + dstStep[2] * val2y; + BYTE* pU1 = pDst[1] + dstStep[1] * val2y1; + BYTE* pV1 = pDst[2] + dstStep[2] * val2y1; + + for (x = 0; x < halfWidth - halfPad; x += 16) + { + const __m128i unpackHigh = _mm_set_epi8(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0); + const __m128i unpackLow = _mm_set_epi8(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8); + { + const __m128i u = _mm_loadu_si128((__m128i*)&Um[x]); + const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh); + const __m128i uLow = _mm_shuffle_epi8(u, unpackLow); + _mm_storeu_si128((__m128i*)&pU[2 * x], uHigh); + _mm_storeu_si128((__m128i*)&pU[2 * x + 16], uLow); + _mm_storeu_si128((__m128i*)&pU1[2 * x], uHigh); + _mm_storeu_si128((__m128i*)&pU1[2 * x + 16], uLow); + } + { + const __m128i u = _mm_loadu_si128((__m128i*)&Vm[x]); + const __m128i uHigh = _mm_shuffle_epi8(u, unpackHigh); + const __m128i uLow = _mm_shuffle_epi8(u, unpackLow); + _mm_storeu_si128((__m128i*)&pV[2 * x], uHigh); + _mm_storeu_si128((__m128i*)&pV[2 * x + 16], uLow); + _mm_storeu_si128((__m128i*)&pV1[2 * x], uHigh); + _mm_storeu_si128((__m128i*)&pV1[2 * x + 16], uLow); + } + } + + for (; x < halfWidth; x++) + { + const UINT32 val2x = 2 * x + evenX; + const UINT32 val2x1 = val2x + oddX; + pU[val2x] = Um[x]; + pV[val2x] = Vm[x]; + pU[val2x1] = Um[x]; + pV[val2x1] = Vm[x]; + pU1[val2x] = Um[x]; + pV1[val2x] = Vm[x]; + pU1[val2x1] = Um[x]; + pV1[val2x1] = Vm[x]; + } + } + + return PRIMITIVES_SUCCESS; +} + +static INLINE void ssse3_filter(BYTE* pSrcDst, const BYTE* pSrc2) +{ + const __m128i even = _mm_set_epi8(0x80, 14, 0x80, 12, 0x80, 10, 0x80, 8, 0x80, 6, 0x80, 4, 0x80, 2, + 0x80, 0); + const __m128i odd = _mm_set_epi8(0x80, 15, 0x80, 13, 0x80, 11, 0x80, 9, 0x80, 7, 0x80, 5, 0x80, 3, + 0x80, 1); + const __m128i interleave = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0); + const __m128i u = _mm_loadu_si128((__m128i*)pSrcDst); + const __m128i u1 = _mm_loadu_si128((__m128i*)pSrc2); + const __m128i uEven = _mm_shuffle_epi8(u, even); + const __m128i uEven4 = _mm_slli_epi16(uEven, 2); + const __m128i uOdd = _mm_shuffle_epi8(u, odd); + const __m128i u1Even = _mm_shuffle_epi8(u1, even); + const __m128i u1Odd = _mm_shuffle_epi8(u1, odd); + const __m128i tmp1 = _mm_add_epi16(uOdd, u1Even); + const __m128i tmp2 = _mm_add_epi16(tmp1, u1Odd); + const __m128i result = _mm_sub_epi16(uEven4, tmp2); + const __m128i packed = _mm_packus_epi16(result, uOdd); + const __m128i interleaved = _mm_shuffle_epi8(packed, interleave); + _mm_storeu_si128((__m128i*)pSrcDst, interleaved); +} + +static pstatus_t ssse3_ChromaFilter(BYTE* pDst[3], const UINT32 dstStep[3], + const RECTANGLE_16* roi) +{ + const UINT32 oddY = 1; + const UINT32 evenY = 0; + const UINT32 nWidth = roi->right - roi->left; + const UINT32 nHeight = roi->bottom - roi->top; + const UINT32 halfHeight = (nHeight + 1) / 2; + const UINT32 halfWidth = (nWidth + 1) / 2; + const UINT32 halfPad = halfWidth % 16; + UINT32 x, y; + + /* Filter */ + for (y = roi->top; y < halfHeight + roi->top; y++) + { + const UINT32 val2y = (y * 2 + evenY); + const UINT32 val2y1 = val2y + oddY; + BYTE* pU1 = pDst[1] + dstStep[1] * val2y1; + BYTE* pV1 = pDst[2] + dstStep[2] * val2y1; + BYTE* pU = pDst[1] + dstStep[1] * val2y; + BYTE* pV = pDst[2] + dstStep[2] * val2y; + + if (val2y1 > nHeight) + continue; + + for (x = roi->left; x < halfWidth + roi->left - halfPad; x += 16) + { + ssse3_filter(&pU[2 * x], &pU1[2 * x]); + ssse3_filter(&pV[2 * x], &pV1[2 * x]); + } + + for (; x < halfWidth + roi->left; x++) + { + const UINT32 val2x = (x * 2); + const UINT32 val2x1 = val2x + 1; + const INT32 up = pU[val2x] * 4; + const INT32 vp = pV[val2x] * 4; + INT32 u2020; + INT32 v2020; + + if (val2x1 > nWidth) + continue; + + u2020 = up - pU[val2x1] - pU1[val2x] - pU1[val2x1]; + v2020 = vp - pV[val2x1] - pV1[val2x] - pV1[val2x1]; + pU[val2x] = CLIP(u2020); + pV[val2x] = CLIP(v2020); + } + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t ssse3_ChromaV1ToYUV444(const BYTE* pSrcRaw[3], const UINT32 srcStep[3], + BYTE* pDstRaw[3], const UINT32 dstStep[3], + const RECTANGLE_16* roi) +{ + const UINT32 mod = 16; + UINT32 uY = 0; + UINT32 vY = 0; + UINT32 x, y; + const UINT32 nWidth = roi->right - roi->left; + const UINT32 nHeight = roi->bottom - roi->top; + const UINT32 halfWidth = (nWidth + 1) / 2; + const UINT32 halfPad = halfWidth % 16; + const UINT32 halfHeight = (nHeight + 1) / 2; + const UINT32 oddY = 1; + const UINT32 evenY = 0; + const UINT32 oddX = 1; + /* The auxilary frame is aligned to multiples of 16x16. + * We need the padded height for B4 and B5 conversion. */ + const UINT32 padHeigth = nHeight + 16 - nHeight % 16; + const BYTE* pSrc[3] = + { + pSrcRaw[0] + roi->top* srcStep[0] + roi->left, + pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2, + pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 + }; + BYTE* pDst[3] = + { + pDstRaw[0] + roi->top* dstStep[0] + roi->left, + pDstRaw[1] + roi->top* dstStep[1] + roi->left, + pDstRaw[2] + roi->top* dstStep[2] + roi->left + }; + const __m128i zero = _mm_setzero_si128(); + const __m128i mask = _mm_set_epi8(0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, + 0x80); + + /* The second half of U and V is a bit more tricky... */ + /* B4 and B5 */ + for (y = 0; y < padHeigth; y++) + { + const BYTE* Ya = pSrc[0] + srcStep[0] * y; + BYTE* pX; + + if ((y) % mod < (mod + 1) / 2) + { + const UINT32 pos = (2 * uY++ + oddY); + + if (pos >= nHeight) + continue; + + pX = pDst[1] + dstStep[1] * pos; + } + else + { + const UINT32 pos = (2 * vY++ + oddY); + + if (pos >= nHeight) + continue; + + pX = pDst[2] + dstStep[2] * pos; + } + + memcpy(pX, Ya, nWidth); + } + + /* B6 and B7 */ + for (y = 0; y < halfHeight; y++) + { + const UINT32 val2y = (y * 2 + evenY); + const BYTE* Ua = pSrc[1] + srcStep[1] * y; + const BYTE* Va = pSrc[2] + srcStep[2] * y; + BYTE* pU = pDst[1] + dstStep[1] * val2y; + BYTE* pV = pDst[2] + dstStep[2] * val2y; + + for (x = 0; x < halfWidth - halfPad; x += 16) + { + { + const __m128i u = _mm_loadu_si128((__m128i*)&Ua[x]); + const __m128i u2 = _mm_unpackhi_epi8(u, zero); + const __m128i u1 = _mm_unpacklo_epi8(u, zero); + _mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]); + _mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]); + } + { + const __m128i u = _mm_loadu_si128((__m128i*)&Va[x]); + const __m128i u2 = _mm_unpackhi_epi8(u, zero); + const __m128i u1 = _mm_unpacklo_epi8(u, zero); + _mm_maskmoveu_si128(u1, mask, (char*)&pV[2 * x]); + _mm_maskmoveu_si128(u2, mask, (char*)&pV[2 * x + 16]); + } + } + + for (; x < halfWidth; x++) + { + const UINT32 val2x1 = (x * 2 + oddX); + pU[val2x1] = Ua[x]; + pV[val2x1] = Va[x]; + } + } + + /* Filter */ + return ssse3_ChromaFilter(pDst, dstStep, roi); +} + +static pstatus_t ssse3_ChromaV2ToYUV444(const BYTE* pSrc[3], const UINT32 srcStep[3], + UINT32 nTotalWidth, UINT32 nTotalHeight, + BYTE* pDst[3], const UINT32 dstStep[3], + const RECTANGLE_16* roi) +{ + UINT32 x, y; + const UINT32 nWidth = roi->right - roi->left; + const UINT32 nHeight = roi->bottom - roi->top; + const UINT32 halfWidth = (nWidth + 1) / 2; + const UINT32 halfPad = halfWidth % 16; + const UINT32 halfHeight = (nHeight + 1) / 2; + const UINT32 quaterWidth = (nWidth + 3) / 4; + const UINT32 quaterPad = quaterWidth % 16; + const __m128i zero = _mm_setzero_si128(); + const __m128i mask = _mm_set_epi8(0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, + 0x80, 0); + const __m128i mask2 = _mm_set_epi8(0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80, 0, + 0x80); + const __m128i shuffle1 = _mm_set_epi8(0x80, 15, 0x80, 14, 0x80, 13, 0x80, 12, 0x80, 11, 0x80, 10, + 0x80, 9, 0x80, 8); + const __m128i shuffle2 = _mm_set_epi8(0x80, 7, 0x80, 6, 0x80, 5, 0x80, 4, 0x80, 3, 0x80, 2, 0x80, 1, + 0x80, 0); + + /* B4 and B5: odd UV values for width/2, height */ + for (y = 0; y < nHeight; y++) + { + const UINT32 yTop = y + roi->top; + const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2; + const BYTE* pYaV = pYaU + nTotalWidth / 2; + BYTE* pU = pDst[1] + dstStep[1] * yTop + roi->left; + BYTE* pV = pDst[2] + dstStep[2] * yTop + roi->left; + + for (x = 0; x < halfWidth - halfPad; x += 16) + { + { + const __m128i u = _mm_loadu_si128((__m128i*)&pYaU[x]); + const __m128i u2 = _mm_unpackhi_epi8(zero, u); + const __m128i u1 = _mm_unpacklo_epi8(zero, u); + _mm_maskmoveu_si128(u1, mask, (char*)&pU[2 * x]); + _mm_maskmoveu_si128(u2, mask, (char*)&pU[2 * x + 16]); + } + { + const __m128i v = _mm_loadu_si128((__m128i*)&pYaV[x]); + const __m128i v2 = _mm_unpackhi_epi8(zero, v); + const __m128i v1 = _mm_unpacklo_epi8(zero, v); + _mm_maskmoveu_si128(v1, mask, (char*)&pV[2 * x]); + _mm_maskmoveu_si128(v2, mask, (char*)&pV[2 * x + 16]); + } + } + + for (; x < halfWidth; x++) + { + const UINT32 odd = 2 * x + 1; + pU[odd] = pYaU[x]; + pV[odd] = pYaV[x]; + } + } + + /* B6 - B9 */ + for (y = 0; y < halfHeight; y++) + { + const BYTE* pUaU = pSrc[1] + srcStep[1] * (y + roi->top / 2) + roi->left / 4; + const BYTE* pUaV = pUaU + nTotalWidth / 4; + const BYTE* pVaU = pSrc[2] + srcStep[2] * (y + roi->top / 2) + roi->left / 4; + const BYTE* pVaV = pVaU + nTotalWidth / 4; + BYTE* pU = pDst[1] + dstStep[1] * (2 * y + 1 + roi->top) + roi->left; + BYTE* pV = pDst[2] + dstStep[2] * (2 * y + 1 + roi->top) + roi->left; + + for (x = 0; x < quaterWidth - quaterPad; x += 16) + { + { + const __m128i uU = _mm_loadu_si128((__m128i*)&pUaU[x]); + const __m128i uV = _mm_loadu_si128((__m128i*)&pVaU[x]); + const __m128i uHigh = _mm_unpackhi_epi8(uU, uV); + const __m128i uLow = _mm_unpacklo_epi8(uU, uV); + const __m128i u1 = _mm_shuffle_epi8(uLow, shuffle2); + const __m128i u2 = _mm_shuffle_epi8(uLow, shuffle1); + const __m128i u3 = _mm_shuffle_epi8(uHigh, shuffle2); + const __m128i u4 = _mm_shuffle_epi8(uHigh, shuffle1); + _mm_maskmoveu_si128(u1, mask2, (char*)&pU[4 * x + 0]); + _mm_maskmoveu_si128(u2, mask2, (char*)&pU[4 * x + 16]); + _mm_maskmoveu_si128(u3, mask2, (char*)&pU[4 * x + 32]); + _mm_maskmoveu_si128(u4, mask2, (char*)&pU[4 * x + 48]); + } + { + const __m128i vU = _mm_loadu_si128((__m128i*)&pUaV[x]); + const __m128i vV = _mm_loadu_si128((__m128i*)&pVaV[x]); + const __m128i vHigh = _mm_unpackhi_epi8(vU, vV); + const __m128i vLow = _mm_unpacklo_epi8(vU, vV); + const __m128i v1 = _mm_shuffle_epi8(vLow, shuffle2); + const __m128i v2 = _mm_shuffle_epi8(vLow, shuffle1); + const __m128i v3 = _mm_shuffle_epi8(vHigh, shuffle2); + const __m128i v4 = _mm_shuffle_epi8(vHigh, shuffle1); + _mm_maskmoveu_si128(v1, mask2, (char*)&pV[4 * x + 0]); + _mm_maskmoveu_si128(v2, mask2, (char*)&pV[4 * x + 16]); + _mm_maskmoveu_si128(v3, mask2, (char*)&pV[4 * x + 32]); + _mm_maskmoveu_si128(v4, mask2, (char*)&pV[4 * x + 48]); + } + } + + for (; x < quaterWidth; x++) + { + pU[4 * x + 0] = pUaU[x]; + pV[4 * x + 0] = pUaV[x]; + pU[4 * x + 2] = pVaU[x]; + pV[4 * x + 2] = pVaV[x]; + } + } + + return ssse3_ChromaFilter(pDst, dstStep, roi); +} + +static pstatus_t ssse3_YUV420CombineToYUV444( + avc444_frame_type type, + const BYTE* pSrc[3], const UINT32 srcStep[3], + UINT32 nWidth, UINT32 nHeight, + BYTE* pDst[3], const UINT32 dstStep[3], + const RECTANGLE_16* roi) +{ + if (!pSrc || !pSrc[0] || !pSrc[1] || !pSrc[2]) + return -1; + + if (!pDst || !pDst[0] || !pDst[1] || !pDst[2]) + return -1; + + if (!roi) + return -1; + + switch (type) + { + case AVC444_LUMA: + return ssse3_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi); + + case AVC444_CHROMAv1: + return ssse3_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi); + + case AVC444_CHROMAv2: + return ssse3_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi); + + default: + return -1; + } +} + +#elif defined(WITH_NEON) + +static INLINE uint8x8_t neon_YUV2R(int32x4_t Ch, int32x4_t Cl, + int16x4_t Dh, int16x4_t Dl, + int16x4_t Eh, int16x4_t El) +{ + /* R = (256 * Y + 403 * (V - 128)) >> 8 */ + const int16x4_t c403 = vdup_n_s16(403); + const int32x4_t CEh = vmlal_s16(Ch, Eh, c403); + const int32x4_t CEl = vmlal_s16(Cl, El, c403); + const int32x4_t Rh = vrshrq_n_s32(CEh, 8); + const int32x4_t Rl = vrshrq_n_s32(CEl, 8); + const int16x8_t R = vcombine_s16(vqmovn_s32(Rl), vqmovn_s32(Rh)); + return vqmovun_s16(R); +} + +static INLINE uint8x8_t neon_YUV2G(int32x4_t Ch, int32x4_t Cl, + int16x4_t Dh, int16x4_t Dl, + int16x4_t Eh, int16x4_t El) +{ + /* G = (256L * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8 */ + const int16x4_t c48 = vdup_n_s16(48); + const int16x4_t c120 = vdup_n_s16(120); + const int32x4_t CDh = vmlsl_s16(Ch, Dh, c48); + const int32x4_t CDl = vmlsl_s16(Cl, Dl, c48); + const int32x4_t CDEh = vmlsl_s16(CDh, Eh, c120); + const int32x4_t CDEl = vmlsl_s16(CDl, El, c120); + const int32x4_t Gh = vrshrq_n_s32(CDEh, 8); + const int32x4_t Gl = vrshrq_n_s32(CDEl, 8); + const int16x8_t G = vcombine_s16(vqmovn_s32(Gl), vqmovn_s32(Gh)); + return vqmovun_s16(G); +} + +static INLINE uint8x8_t neon_YUV2B(int32x4_t Ch, int32x4_t Cl, + int16x4_t Dh, int16x4_t Dl, + int16x4_t Eh, int16x4_t El) +{ + /* B = (256L * Y + 475 * (U - 128)) >> 8*/ + const int16x4_t c475 = vdup_n_s16(475); + const int32x4_t CDh = vmlal_s16(Ch, Dh, c475); + const int32x4_t CDl = vmlal_s16(Ch, Dl, c475); + const int32x4_t Bh = vrshrq_n_s32(CDh, 8); + const int32x4_t Bl = vrshrq_n_s32(CDl, 8); + const int16x8_t B = vcombine_s16(vqmovn_s32(Bl), vqmovn_s32(Bh)); + return vqmovun_s16(B); +} + +static INLINE BYTE* neon_YuvToRgbPixel(BYTE* pRGB, int16x8_t Y, int16x8_t D, int16x8_t E, + const uint8_t rPos, const uint8_t gPos, const uint8_t bPos, const uint8_t aPos) +{ + uint8x8x4_t bgrx; + const int32x4_t Ch = vmulq_n_s32(vmovl_s16(vget_high_s16(Y)), 256); /* Y * 256 */ + const int32x4_t Cl = vmulq_n_s32(vmovl_s16(vget_low_s16(Y)), 256); /* Y * 256 */ + const int16x4_t Dh = vget_high_s16(D); + const int16x4_t Dl = vget_low_s16(D); + const int16x4_t Eh = vget_high_s16(E); + const int16x4_t El = vget_low_s16(E); + { + /* B = (256L * Y + 475 * (U - 128)) >> 8*/ + const int16x4_t c475 = vdup_n_s16(475); + const int32x4_t CDh = vmlal_s16(Ch, Dh, c475); + const int32x4_t CDl = vmlal_s16(Cl, Dl, c475); + const int32x4_t Bh = vrshrq_n_s32(CDh, 8); + const int32x4_t Bl = vrshrq_n_s32(CDl, 8); + const int16x8_t B = vcombine_s16(vqmovn_s32(Bl), vqmovn_s32(Bh)); + bgrx.val[bPos] = vqmovun_s16(B); + } + { + /* G = (256L * Y - 48 * (U - 128) - 120 * (V - 128)) >> 8 */ + const int16x4_t c48 = vdup_n_s16(48); + const int16x4_t c120 = vdup_n_s16(120); + const int32x4_t CDh = vmlsl_s16(Ch, Dh, c48); + const int32x4_t CDl = vmlsl_s16(Cl, Dl, c48); + const int32x4_t CDEh = vmlsl_s16(CDh, Eh, c120); + const int32x4_t CDEl = vmlsl_s16(CDl, El, c120); + const int32x4_t Gh = vrshrq_n_s32(CDEh, 8); + const int32x4_t Gl = vrshrq_n_s32(CDEl, 8); + const int16x8_t G = vcombine_s16(vqmovn_s32(Gl), vqmovn_s32(Gh)); + bgrx.val[gPos] = vqmovun_s16(G); + } + { + /* R = (256 * Y + 403 * (V - 128)) >> 8 */ + const int16x4_t c403 = vdup_n_s16(403); + const int32x4_t CEh = vmlal_s16(Ch, Eh, c403); + const int32x4_t CEl = vmlal_s16(Cl, El, c403); + const int32x4_t Rh = vrshrq_n_s32(CEh, 8); + const int32x4_t Rl = vrshrq_n_s32(CEl, 8); + const int16x8_t R = vcombine_s16(vqmovn_s32(Rl), vqmovn_s32(Rh)); + bgrx.val[rPos] = vqmovun_s16(R); + } + { + /* A */ + bgrx.val[aPos] = vdup_n_u8(0xFF); + } + vst4_u8(pRGB, bgrx); + pRGB += 32; + return pRGB; +} + +static INLINE pstatus_t neon_YUV420ToX( + const BYTE* pSrc[3], const UINT32 srcStep[3], + BYTE* pDst, UINT32 dstStep, + const prim_size_t* roi, const uint8_t rPos, const uint8_t gPos, + const uint8_t bPos, const uint8_t aPos) +{ + UINT32 y; + const UINT32 nWidth = roi->width; + const UINT32 nHeight = roi->height; + const DWORD pad = nWidth % 16; + const UINT32 yPad = srcStep[0] - roi->width; + const UINT32 uPad = srcStep[1] - roi->width / 2; + const UINT32 vPad = srcStep[2] - roi->width / 2; + const UINT32 dPad = dstStep - roi->width * 4; + const int16x8_t c128 = vdupq_n_s16(128); + + for (y = 0; y < nHeight; y += 2) + { + const uint8_t* pY1 = pSrc[0] + y * srcStep[0]; + const uint8_t* pY2 = pY1 + srcStep[0]; + const uint8_t* pU = pSrc[1] + (y / 2) * srcStep[1]; + const uint8_t* pV = pSrc[2] + (y / 2) * srcStep[2]; + uint8_t* pRGB1 = pDst + y * dstStep; + uint8_t* pRGB2 = pRGB1 + dstStep; + UINT32 x; + const BOOL lastY = y >= nHeight - 1; + + for (x = 0; x < nWidth - pad;) + { + const uint8x8_t Uraw = vld1_u8(pU); + const uint8x8x2_t Uu = vzip_u8(Uraw, Uraw); + const int16x8_t U1 = vreinterpretq_s16_u16(vmovl_u8(Uu.val[0])); + const int16x8_t U2 = vreinterpretq_s16_u16(vmovl_u8(Uu.val[1])); + const uint8x8_t Vraw = vld1_u8(pV); + const uint8x8x2_t Vu = vzip_u8(Vraw, Vraw); + const int16x8_t V1 = vreinterpretq_s16_u16(vmovl_u8(Vu.val[0])); + const int16x8_t V2 = vreinterpretq_s16_u16(vmovl_u8(Vu.val[1])); + const int16x8_t D1 = vsubq_s16(U1, c128); + const int16x8_t E1 = vsubq_s16(V1, c128); + const int16x8_t D2 = vsubq_s16(U2, c128); + const int16x8_t E2 = vsubq_s16(V2, c128); + { + const uint8x8_t Y1u = vld1_u8(pY1); + const int16x8_t Y1 = vreinterpretq_s16_u16(vmovl_u8(Y1u)); + pRGB1 = neon_YuvToRgbPixel(pRGB1, Y1, D1, E1, rPos, gPos, bPos, aPos); + pY1 += 8; + x += 8; + } + { + const uint8x8_t Y1u = vld1_u8(pY1); + const int16x8_t Y1 = vreinterpretq_s16_u16(vmovl_u8(Y1u)); + pRGB1 = neon_YuvToRgbPixel(pRGB1, Y1, D2, E2, rPos, gPos, bPos, aPos); + pY1 += 8; + x += 8; + } + + if (!lastY) + { + { + const uint8x8_t Y2u = vld1_u8(pY2); + const int16x8_t Y2 = vreinterpretq_s16_u16(vmovl_u8(Y2u)); + pRGB2 = neon_YuvToRgbPixel(pRGB2, Y2, D1, E1, rPos, gPos, bPos, aPos); + pY2 += 8; + } + { + const uint8x8_t Y2u = vld1_u8(pY2); + const int16x8_t Y2 = vreinterpretq_s16_u16(vmovl_u8(Y2u)); + pRGB2 = neon_YuvToRgbPixel(pRGB2, Y2, D2, E2, rPos, gPos, bPos, aPos); + pY2 += 8; + } + } + + pU += 8; + pV += 8; + } + + for (; x < nWidth; x++) + { + const BYTE U = *pU; + const BYTE V = *pV; + { + const BYTE Y = *pY1++; + const BYTE r = YUV2R(Y, U, V); + const BYTE g = YUV2G(Y, U, V); + const BYTE b = YUV2B(Y, U, V); + pRGB1[aPos] = 0xFF; + pRGB1[rPos] = r; + pRGB1[gPos] = g; + pRGB1[bPos] = b; + pRGB1 += 4; + } + + if (!lastY) + { + const BYTE Y = *pY2++; + const BYTE r = YUV2R(Y, U, V); + const BYTE g = YUV2G(Y, U, V); + const BYTE b = YUV2B(Y, U, V); + pRGB2[aPos] = 0xFF; + pRGB2[rPos] = r; + pRGB2[gPos] = g; + pRGB2[bPos] = b; + pRGB2 += 4; + } + + if (x % 2) + { + pU++; + pV++; + } + } + + pRGB1 += dPad; + pRGB2 += dPad; + pY1 += yPad; + pY2 += yPad; + pU += uPad; + pV += vPad; + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t neon_YUV420ToRGB_8u_P3AC4R( + const BYTE* pSrc[3], const UINT32 srcStep[3], + BYTE* pDst, UINT32 dstStep, UINT32 DstFormat, + const prim_size_t* roi) +{ + switch (DstFormat) + { + case PIXEL_FORMAT_BGRA32: + case PIXEL_FORMAT_BGRX32: + return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3); + + case PIXEL_FORMAT_RGBA32: + case PIXEL_FORMAT_RGBX32: + return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3); + + case PIXEL_FORMAT_ARGB32: + case PIXEL_FORMAT_XRGB32: + return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0); + + case PIXEL_FORMAT_ABGR32: + case PIXEL_FORMAT_XBGR32: + return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0); + + default: + return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); + } +} + +static INLINE pstatus_t neon_YUV444ToX( + const BYTE* pSrc[3], const UINT32 srcStep[3], + BYTE* pDst, UINT32 dstStep, + const prim_size_t* roi, const uint8_t rPos, const uint8_t gPos, + const uint8_t bPos, const uint8_t aPos) +{ + UINT32 y; + const UINT32 nWidth = roi->width; + const UINT32 nHeight = roi->height; + const UINT32 yPad = srcStep[0] - roi->width; + const UINT32 uPad = srcStep[1] - roi->width; + const UINT32 vPad = srcStep[2] - roi->width; + const UINT32 dPad = dstStep - roi->width * 4; + const uint8_t* pY = pSrc[0]; + const uint8_t* pU = pSrc[1]; + const uint8_t* pV = pSrc[2]; + uint8_t* pRGB = pDst; + const int16x8_t c128 = vdupq_n_s16(128); + const DWORD pad = nWidth % 8; + + for (y = 0; y < nHeight; y++) + { + UINT32 x; + + for (x = 0; x < nWidth - pad; x += 8) + { + const uint8x8_t Yu = vld1_u8(pY); + const int16x8_t Y = vreinterpretq_s16_u16(vmovl_u8(Yu)); + const uint8x8_t Uu = vld1_u8(pU); + const int16x8_t U = vreinterpretq_s16_u16(vmovl_u8(Uu)); + const uint8x8_t Vu = vld1_u8(pV); + const int16x8_t V = vreinterpretq_s16_u16(vmovl_u8(Vu)); + /* Do the calculations on Y in 32bit width, the result of 255 * 256 does not fit + * a signed 16 bit value. */ + const int16x8_t D = vsubq_s16(U, c128); + const int16x8_t E = vsubq_s16(V, c128); + pRGB = neon_YuvToRgbPixel(pRGB, Y, D, E, rPos, gPos, bPos, aPos); + pY += 8; + pU += 8; + pV += 8; + } + + for (x = 0; x < pad; x++) + { + const BYTE Y = *pY++; + const BYTE U = *pU++; + const BYTE V = *pV++; + const BYTE r = YUV2R(Y, U, V); + const BYTE g = YUV2G(Y, U, V); + const BYTE b = YUV2B(Y, U, V); + pRGB[aPos] = 0xFF; + pRGB[rPos] = r; + pRGB[gPos] = g; + pRGB[bPos] = b; + pRGB += 4; + } + + pRGB += dPad; + pY += yPad; + pU += uPad; + pV += vPad; + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t neon_YUV444ToRGB_8u_P3AC4R( + const BYTE* pSrc[3], const UINT32 srcStep[3], + BYTE* pDst, UINT32 dstStep, UINT32 DstFormat, + const prim_size_t* roi) +{ + switch (DstFormat) + { + case PIXEL_FORMAT_BGRA32: + case PIXEL_FORMAT_BGRX32: + return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3); + + case PIXEL_FORMAT_RGBA32: + case PIXEL_FORMAT_RGBX32: + return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3); + + case PIXEL_FORMAT_ARGB32: + case PIXEL_FORMAT_XRGB32: + return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0); + + case PIXEL_FORMAT_ABGR32: + case PIXEL_FORMAT_XBGR32: + return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0); + + default: + return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi); + } +} + +static pstatus_t neon_LumaToYUV444(const BYTE* pSrcRaw[3], const UINT32 srcStep[3], + BYTE* pDstRaw[3], const UINT32 dstStep[3], + const RECTANGLE_16* roi) +{ + UINT32 x, y; + const UINT32 nWidth = roi->right - roi->left; + const UINT32 nHeight = roi->bottom - roi->top; + const UINT32 halfWidth = (nWidth + 1) / 2; + const UINT32 halfHeight = (nHeight + 1) / 2; + const UINT32 evenY = 0; + const BYTE* pSrc[3] = + { + pSrcRaw[0] + roi->top* srcStep[0] + roi->left, + pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2, + pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 + }; + BYTE* pDst[3] = + { + pDstRaw[0] + roi->top* dstStep[0] + roi->left, + pDstRaw[1] + roi->top* dstStep[1] + roi->left, + pDstRaw[2] + roi->top* dstStep[2] + roi->left + }; + + /* Y data is already here... */ + /* B1 */ + for (y = 0; y < nHeight; y++) + { + const BYTE* Ym = pSrc[0] + srcStep[0] * y; + BYTE* pY = pDst[0] + dstStep[0] * y; + memcpy(pY, Ym, nWidth); + } + + /* The first half of U, V are already here part of this frame. */ + /* B2 and B3 */ + for (y = 0; y < halfHeight; y++) + { + const UINT32 val2y = (2 * y + evenY); + const BYTE* Um = pSrc[1] + srcStep[1] * y; + const BYTE* Vm = pSrc[2] + srcStep[2] * y; + BYTE* pU = pDst[1] + dstStep[1] * val2y; + BYTE* pV = pDst[2] + dstStep[2] * val2y; + BYTE* pU1 = pU + dstStep[1]; + BYTE* pV1 = pV + dstStep[2]; + + for (x = 0; x + 16 < halfWidth; x += 16) + { + { + const uint8x16_t u = vld1q_u8(Um); + uint8x16x2_t u2x; + u2x.val[0] = u; + u2x.val[1] = u; + vst2q_u8(pU, u2x); + vst2q_u8(pU1, u2x); + Um += 16; + pU += 32; + pU1 += 32; + } + { + const uint8x16_t v = vld1q_u8(Vm); + uint8x16x2_t v2x; + v2x.val[0] = v; + v2x.val[1] = v; + vst2q_u8(pV, v2x); + vst2q_u8(pV1, v2x); + Vm += 16; + pV += 32; + pV1 += 32; + } + } + + for (; x < halfWidth; x++) + { + const BYTE u = *Um++; + const BYTE v = *Vm++; + *pU++ = u; + *pU++ = u; + *pU1++ = u; + *pU1++ = u; + *pV++ = v; + *pV++ = v; + *pV1++ = v; + *pV1++ = v; + } + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t neon_ChromaFilter(BYTE* pDst[3], const UINT32 dstStep[3], + const RECTANGLE_16* roi) +{ + const UINT32 oddY = 1; + const UINT32 evenY = 0; + const UINT32 nWidth = roi->right - roi->left; + const UINT32 nHeight = roi->bottom - roi->top; + const UINT32 halfHeight = (nHeight + 1) / 2; + const UINT32 halfWidth = (nWidth + 1) / 2; + const UINT32 halfPad = halfWidth % 16; + UINT32 x, y; + + /* Filter */ + for (y = roi->top; y < halfHeight + roi->top; y++) + { + const UINT32 val2y = (y * 2 + evenY); + const UINT32 val2y1 = val2y + oddY; + BYTE* pU1 = pDst[1] + dstStep[1] * val2y1; + BYTE* pV1 = pDst[2] + dstStep[2] * val2y1; + BYTE* pU = pDst[1] + dstStep[1] * val2y; + BYTE* pV = pDst[2] + dstStep[2] * val2y; + + if (val2y1 > nHeight) + continue; + + for (x = roi->left / 2; x < halfWidth + roi->left / 2 - halfPad; x += 16) + { + { + /* U = (U2x,2y << 2) - U2x1,2y - U2x,2y1 - U2x1,2y1 */ + uint8x8x2_t u = vld2_u8(&pU[2 * x]); + const int16x8_t up = vreinterpretq_s16_u16(vshll_n_u8(u.val[0], 2)); /* Ux2,2y << 2 */ + const uint8x8x2_t u1 = vld2_u8(&pU1[2 * x]); + const uint16x8_t usub = vaddl_u8(u1.val[1], u1.val[0]); /* U2x,2y1 + U2x1,2y1 */ + const int16x8_t us = vreinterpretq_s16_u16(vaddw_u8(usub, + u.val[1])); /* U2x1,2y + U2x,2y1 + U2x1,2y1 */ + const int16x8_t un = vsubq_s16(up, us); + const uint8x8_t u8 = vqmovun_s16(un); /* CLIP(un) */ + u.val[0] = u8; + vst2_u8(&pU[2 * x], u); + } + { + /* V = (V2x,2y << 2) - V2x1,2y - V2x,2y1 - V2x1,2y1 */ + uint8x8x2_t v = vld2_u8(&pV[2 * x]); + const int16x8_t vp = vreinterpretq_s16_u16(vshll_n_u8(v.val[0], 2)); /* Vx2,2y << 2 */ + const uint8x8x2_t v1 = vld2_u8(&pV1[2 * x]); + const uint16x8_t vsub = vaddl_u8(v1.val[1], v1.val[0]); /* V2x,2y1 + V2x1,2y1 */ + const int16x8_t vs = vreinterpretq_s16_u16(vaddw_u8(vsub, + v.val[1])); /* V2x1,2y + V2x,2y1 + V2x1,2y1 */ + const int16x8_t vn = vsubq_s16(vp, vs); + const uint8x8_t v8 = vqmovun_s16(vn); /* CLIP(vn) */ + v.val[0] = v8; + vst2_u8(&pV[2 * x], v); + } + } + + for (; x < halfWidth + roi->left / 2; x++) + { + const UINT32 val2x = (x * 2); + const UINT32 val2x1 = val2x + 1; + const INT32 up = pU[val2x] * 4; + const INT32 vp = pV[val2x] * 4; + INT32 u2020; + INT32 v2020; + + if (val2x1 > nWidth) + continue; + + u2020 = up - pU[val2x1] - pU1[val2x] - pU1[val2x1]; + v2020 = vp - pV[val2x1] - pV1[val2x] - pV1[val2x1]; + pU[val2x] = CLIP(u2020); + pV[val2x] = CLIP(v2020); + } + } + + return PRIMITIVES_SUCCESS; +} + +static pstatus_t neon_ChromaV1ToYUV444(const BYTE* pSrcRaw[3], const UINT32 srcStep[3], + BYTE* pDstRaw[3], const UINT32 dstStep[3], + const RECTANGLE_16* roi) +{ + const UINT32 mod = 16; + UINT32 uY = 0; + UINT32 vY = 0; + UINT32 x, y; + const UINT32 nWidth = roi->right - roi->left; + const UINT32 nHeight = roi->bottom - roi->top; + const UINT32 halfWidth = (nWidth) / 2; + const UINT32 halfHeight = (nHeight) / 2; + const UINT32 oddY = 1; + const UINT32 evenY = 0; + const UINT32 oddX = 1; + /* The auxilary frame is aligned to multiples of 16x16. + * We need the padded height for B4 and B5 conversion. */ + const UINT32 padHeigth = nHeight + 16 - nHeight % 16; + const UINT32 halfPad = halfWidth % 16; + const BYTE* pSrc[3] = + { + pSrcRaw[0] + roi->top* srcStep[0] + roi->left, + pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2, + pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 + }; + BYTE* pDst[3] = + { + pDstRaw[0] + roi->top* dstStep[0] + roi->left, + pDstRaw[1] + roi->top* dstStep[1] + roi->left, + pDstRaw[2] + roi->top* dstStep[2] + roi->left + }; + + /* The second half of U and V is a bit more tricky... */ + /* B4 and B5 */ + for (y = 0; y < padHeigth; y++) + { + const BYTE* Ya = pSrc[0] + srcStep[0] * y; + BYTE* pX; + + if ((y) % mod < (mod + 1) / 2) + { + const UINT32 pos = (2 * uY++ + oddY); + + if (pos >= nHeight) + continue; + + pX = pDst[1] + dstStep[1] * pos; + } + else + { + const UINT32 pos = (2 * vY++ + oddY); + + if (pos >= nHeight) + continue; + + pX = pDst[2] + dstStep[2] * pos; + } + + memcpy(pX, Ya, nWidth); + } + + /* B6 and B7 */ + for (y = 0; y < halfHeight; y++) + { + const UINT32 val2y = (y * 2 + evenY); + const BYTE* Ua = pSrc[1] + srcStep[1] * y; + const BYTE* Va = pSrc[2] + srcStep[2] * y; + BYTE* pU = pDst[1] + dstStep[1] * val2y; + BYTE* pV = pDst[2] + dstStep[2] * val2y; + + for (x = 0; x < halfWidth - halfPad; x += 16) + { + { + uint8x16x2_t u = vld2q_u8(&pU[2 * x]); + u.val[1] = vld1q_u8(&Ua[x]); + vst2q_u8(&pU[2 * x], u); + } + { + uint8x16x2_t v = vld2q_u8(&pV[2 * x]); + v.val[1] = vld1q_u8(&Va[x]); + vst2q_u8(&pV[2 * x], v); + } + } + + for (; x < halfWidth; x++) + { + const UINT32 val2x1 = (x * 2 + oddX); + pU[val2x1] = Ua[x]; + pV[val2x1] = Va[x]; + } + } + + /* Filter */ + return neon_ChromaFilter(pDst, dstStep, roi); +} + +static pstatus_t neon_ChromaV2ToYUV444(const BYTE* pSrc[3], const UINT32 srcStep[3], + UINT32 nTotalWidth, UINT32 nTotalHeight, + BYTE* pDst[3], const UINT32 dstStep[3], + const RECTANGLE_16* roi) +{ + UINT32 x, y; + const UINT32 nWidth = roi->right - roi->left; + const UINT32 nHeight = roi->bottom - roi->top; + const UINT32 halfWidth = (nWidth + 1) / 2; + const UINT32 halfPad = halfWidth % 16; + const UINT32 halfHeight = (nHeight + 1) / 2; + const UINT32 quaterWidth = (nWidth + 3) / 4; + const UINT32 quaterPad = quaterWidth % 16; + + /* B4 and B5: odd UV values for width/2, height */ + for (y = 0; y < nHeight; y++) + { + const UINT32 yTop = y + roi->top; + const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2; + const BYTE* pYaV = pYaU + nTotalWidth / 2; + BYTE* pU = pDst[1] + dstStep[1] * yTop + roi->left; + BYTE* pV = pDst[2] + dstStep[2] * yTop + roi->left; + + for (x = 0; x < halfWidth - halfPad; x += 16) + { + { + uint8x16x2_t u = vld2q_u8(&pU[2 * x]); + u.val[1] = vld1q_u8(&pYaU[x]); + vst2q_u8(&pU[2 * x], u); + } + { + uint8x16x2_t v = vld2q_u8(&pV[2 * x]); + v.val[1] = vld1q_u8(&pYaV[x]); + vst2q_u8(&pV[2 * x], v); + } + } + + for (; x < halfWidth; x++) + { + const UINT32 odd = 2 * x + 1; + pU[odd] = pYaU[x]; + pV[odd] = pYaV[x]; + } + } + + /* B6 - B9 */ + for (y = 0; y < halfHeight; y++) + { + const BYTE* pUaU = pSrc[1] + srcStep[1] * (y + roi->top / 2) + roi->left / 4; + const BYTE* pUaV = pUaU + nTotalWidth / 4; + const BYTE* pVaU = pSrc[2] + srcStep[2] * (y + roi->top / 2) + roi->left / 4; + const BYTE* pVaV = pVaU + nTotalWidth / 4; + BYTE* pU = pDst[1] + dstStep[1] * (2 * y + 1 + roi->top) + roi->left; + BYTE* pV = pDst[2] + dstStep[2] * (2 * y + 1 + roi->top) + roi->left; + + for (x = 0; x < quaterWidth - quaterPad; x += 16) + { + { + uint8x16x4_t u = vld4q_u8(&pU[4 * x]); + u.val[0] = vld1q_u8(&pUaU[x]); + u.val[2] = vld1q_u8(&pVaU[x]); + vst4q_u8(&pU[4 * x], u); + } + { + uint8x16x4_t v = vld4q_u8(&pV[4 * x]); + v.val[0] = vld1q_u8(&pUaV[x]); + v.val[2] = vld1q_u8(&pVaV[x]); + vst4q_u8(&pV[4 * x], v); + } + } + + for (; x < quaterWidth; x++) + { + pU[4 * x + 0] = pUaU[x]; + pV[4 * x + 0] = pUaV[x]; + pU[4 * x + 2] = pVaU[x]; + pV[4 * x + 2] = pVaV[x]; + } + } + + return neon_ChromaFilter(pDst, dstStep, roi); +} + +static pstatus_t neon_YUV420CombineToYUV444( + avc444_frame_type type, + const BYTE* pSrc[3], const UINT32 srcStep[3], + UINT32 nWidth, UINT32 nHeight, + BYTE* pDst[3], const UINT32 dstStep[3], + const RECTANGLE_16* roi) +{ + if (!pSrc || !pSrc[0] || !pSrc[1] || !pSrc[2]) + return -1; + + if (!pDst || !pDst[0] || !pDst[1] || !pDst[2]) + return -1; + + if (!roi) + return -1; + + switch (type) + { + case AVC444_LUMA: + return neon_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi); + + case AVC444_CHROMAv1: + return neon_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi); + + case AVC444_CHROMAv2: + return neon_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi); + + default: + return -1; + } +} +#endif + +void primitives_init_YUV_opt(primitives_t* prims) +{ + generic = primitives_get_generic(); + primitives_init_YUV(prims); + + if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) + && IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) + { + prims->RGBToYUV420_8u_P3AC4R = ssse3_RGBToYUV420; + prims->RGBToAVC444YUV = ssse3_RGBToAVC444YUV; + prims->YUV420ToRGB_8u_P3AC4R = ssse3_YUV420ToRGB; + prims->YUV444ToRGB_8u_P3AC4R = ssse3_YUV444ToRGB_8u_P3AC4R; + prims->YUV420CombineToYUV444 = ssse3_YUV420CombineToYUV444; + } +}