Added SSSE3 UV averaging to AVC444v1
author     Armin Novak <armin.novak@thincast.com>
           Wed, 31 Jan 2018 10:33:55 +0000 (11:33 +0100)
committer  Armin Novak <armin.novak@thincast.com>
           Wed, 7 Feb 2018 09:48:43 +0000 (10:48 +0100)
libfreerdp/primitives/prim_YUV_ssse3.c

index a1bae6c..184f2be 100644
@@ -515,83 +515,184 @@ static pstatus_t ssse3_RGBToYUV420(
 /* SSSE3 RGB -> AVC444-YUV conversion                                      **/
 /****************************************************************************/
 
-static INLINE void ssse3_RGBToAVC444YUV_BGRX_ROW(
-    const BYTE* src, BYTE* ydst, BYTE* udst1, BYTE* udst2, BYTE* vdst1, BYTE* vdst2, BOOL isEvenRow,
-    UINT32 width)
+static INLINE void ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
+    const BYTE* srcEven, const BYTE* srcOdd, BYTE* b1Even, BYTE* b1Odd, BYTE* b2,
+    BYTE* b3, BYTE* b4, BYTE* b5, BYTE* b6, BYTE* b7, UINT32 width)
 {
        UINT32 x;
-       __m128i vector128, y_factors, u_factors, v_factors, smask;
-       __m128i x1, x2, x3, x4, y, y1, y2, u, u1, u2, v, v1, v2;
-       const __m128i* argb = (const __m128i*) src;
-       __m128i* py = (__m128i*) ydst;
-       __m64* pu1 = (__m64*) udst1;
-       __m64* pu2 = (__m64*) udst2;
-       __m64* pv1 = (__m64*) vdst1;
-       __m64* pv2 = (__m64*) vdst2;
-       y_factors = _mm_load_si128((__m128i*)bgrx_y_factors);
-       u_factors = _mm_load_si128((__m128i*)bgrx_u_factors);
-       v_factors = _mm_load_si128((__m128i*)bgrx_v_factors);
-       vector128 = _mm_load_si128((__m128i*)const_buf_128b);
-       smask = _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
+       const __m128i* argbEven = (const __m128i*) srcEven;
+       const __m128i* argbOdd = (const __m128i*) srcOdd;
+       const __m128i y_factors = _mm_load_si128((__m128i*)bgrx_y_factors);
+       const __m128i u_factors = _mm_load_si128((__m128i*)bgrx_u_factors);
+       const __m128i v_factors = _mm_load_si128((__m128i*)bgrx_v_factors);
+       const __m128i vector128 = _mm_load_si128((__m128i*)const_buf_128b);
 
        for (x = 0; x < width; x += 16)
        {
                /* store 16 rgba pixels in 4 128 bit registers */
-               x1 = _mm_load_si128(argb++); // 1st 4 pixels
-               x2 = _mm_load_si128(argb++); // 2nd 4 pixels
-               x3 = _mm_load_si128(argb++); // 3rd 4 pixels
-               x4 = _mm_load_si128(argb++); // 4th 4 pixels
-               /* Y: multiplications with subtotals and horizontal sums */
-               y1 = _mm_hadd_epi16(_mm_maddubs_epi16(x1, y_factors), _mm_maddubs_epi16(x2, y_factors));
-               y2 = _mm_hadd_epi16(_mm_maddubs_epi16(x3, y_factors), _mm_maddubs_epi16(x4, y_factors));
-               /* Y: shift the results (logical) */
-               y1 = _mm_srli_epi16(y1, 7);
-               y2 = _mm_srli_epi16(y2, 7);
-               /* Y: pack (unsigned) 16 words into bytes */
-               y = _mm_packus_epi16(y1, y2);
-               /* U: multiplications with subtotals and horizontal sums */
-               u1 = _mm_hadd_epi16(_mm_maddubs_epi16(x1, u_factors), _mm_maddubs_epi16(x2, u_factors));
-               u2 = _mm_hadd_epi16(_mm_maddubs_epi16(x3, u_factors), _mm_maddubs_epi16(x4, u_factors));
-               /* U: shift the results (arithmetic) */
-               u1 = _mm_srai_epi16(u1, 7);
-               u2 = _mm_srai_epi16(u2, 7);
-               /* U: pack (signed) 16 words into bytes */
-               u = _mm_packs_epi16(u1, u2);
-               /* U: add 128 */
-               u = _mm_add_epi8(u, vector128);
-               /* V: multiplications with subtotals and horizontal sums */
-               v1 = _mm_hadd_epi16(_mm_maddubs_epi16(x1, v_factors), _mm_maddubs_epi16(x2, v_factors));
-               v2 = _mm_hadd_epi16(_mm_maddubs_epi16(x3, v_factors), _mm_maddubs_epi16(x4, v_factors));
-               /* V: shift the results (arithmetic) */
-               v1 = _mm_srai_epi16(v1, 7);
-               v2 = _mm_srai_epi16(v2, 7);
-               /* V: pack (signed) 16 words into bytes */
-               v = _mm_packs_epi16(v1, v2);
-               /* V: add 128 */
-               v = _mm_add_epi8(v, vector128);
-               /* store y */
-               _mm_storeu_si128(py++, y);
-
-               /* store u and v */
-               if (isEvenRow)
+               const __m128i xe1 = _mm_load_si128(argbEven++); // 1st 4 pixels
+               const __m128i xe2 = _mm_load_si128(argbEven++); // 2nd 4 pixels
+               const __m128i xe3 = _mm_load_si128(argbEven++); // 3rd 4 pixels
+               const __m128i xe4 = _mm_load_si128(argbEven++); // 4th 4 pixels
+               const __m128i xo1 = _mm_load_si128(argbOdd++); // 1st 4 pixels
+               const __m128i xo2 = _mm_load_si128(argbOdd++); // 2nd 4 pixels
+               const __m128i xo3 = _mm_load_si128(argbOdd++); // 3rd 4 pixels
+               const __m128i xo4 = _mm_load_si128(argbOdd++); // 4th 4 pixels
                {
-                       u = _mm_shuffle_epi8(u, smask);
-                       v = _mm_shuffle_epi8(v, smask);
-                       _mm_storel_pi(pu1++, _mm_castsi128_ps(u));
-                       _mm_storeh_pi(pu2++, _mm_castsi128_ps(u));
-                       _mm_storel_pi(pv1++, _mm_castsi128_ps(v));
-                       _mm_storeh_pi(pv2++, _mm_castsi128_ps(v));
+                       /* Y: multiplications with subtotals and horizontal sums */
+                       const __m128i ye1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, y_factors),
+                                                          _mm_maddubs_epi16(xe2, y_factors)), 7);
+                       const __m128i ye2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, y_factors),
+                                                          _mm_maddubs_epi16(xe4, y_factors)), 7);
+                       const __m128i ye = _mm_packus_epi16(ye1, ye2);
+                       const __m128i yo1 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, y_factors),
+                                                          _mm_maddubs_epi16(xo2, y_factors)), 7);
+                       const __m128i yo2 = _mm_srli_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, y_factors),
+                                                          _mm_maddubs_epi16(xo4, y_factors)), 7);
+                       const __m128i yo = _mm_packus_epi16(yo1, yo2);
+                       /* store y [b1] */
+                       _mm_storeu_si128((__m128i*)b1Even, ye);
+                       b1Even += 16;
+
+                       if (b1Odd)
+                       {
+                               _mm_storeu_si128((__m128i*)b1Odd, yo);
+                               b1Odd += 16;
+                       }
                }
-               else
                {
-                       _mm_storel_pi(pu1, _mm_castsi128_ps(u));
-                       _mm_storeh_pi(pu2, _mm_castsi128_ps(u));
-                       _mm_storel_pi(pv1, _mm_castsi128_ps(v));
-                       _mm_storeh_pi(pv2, _mm_castsi128_ps(v));
-                       pu1 += 2;
-                       pu2 += 2;
-                       pv1 += 2;
-                       pv2 += 2;
+                       /* We have now
+                        * 16 even U values in ue
+                        * 16 odd U values in uo
+                        *
+                        * We need to split these according to
+                        * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */
+                       __m128i ue, uo;
+                       {
+                               const __m128i ue1 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, u_factors),
+                                                                  _mm_maddubs_epi16(xe2, u_factors)), 7);
+                               const __m128i ue2 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, u_factors),
+                                                                  _mm_maddubs_epi16(xe4, u_factors)), 7);
+                               ue = _mm_add_epi8(_mm_packs_epi16(ue1, ue2), vector128);
+                       }
+
+                       if (b1Odd)
+                       {
+                               const __m128i uo1 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, u_factors),
+                                                                  _mm_maddubs_epi16(xo2, u_factors)), 7);
+                               const __m128i uo2 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, u_factors),
+                                                                  _mm_maddubs_epi16(xo4, u_factors)), 7);
+                               uo = _mm_add_epi8(_mm_packs_epi16(uo1, uo2), vector128);
+                       }
+
+                       /* Now we need the following storage distribution:
+                        * 2x   2y    -> b2
+                        * x    2y+1  -> b4
+                        * 2x+1 2y    -> b6 */
+                       if (b1Odd) /* b2 */
+                       {
+                               const __m128i ueh = _mm_unpackhi_epi8(ue, _mm_setzero_si128());
+                               const __m128i uoh = _mm_unpackhi_epi8(uo, _mm_setzero_si128());
+                               const __m128i hi = _mm_add_epi16(ueh, uoh);
+                               const __m128i uel = _mm_unpacklo_epi8(ue, _mm_setzero_si128());
+                               const __m128i uol = _mm_unpacklo_epi8(uo, _mm_setzero_si128());
+                               const __m128i lo = _mm_add_epi16(uel, uol);
+                               const __m128i added = _mm_hadd_epi16(lo, hi);
+                               const __m128i avg16 = _mm_srai_epi16(added, 2);
+                               const __m128i avg = _mm_packus_epi16(avg16, avg16);
+                               _mm_storel_epi64((__m128i*)b2, avg);
+                       }
+                       else
+                       {
+                               const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+                                                                 14, 12, 10, 8, 6, 4, 2, 0);
+                               const __m128i ud = _mm_shuffle_epi8(ue, mask);
+                               _mm_storel_epi64((__m128i*)b2, ud);
+                       }
+
+                       b2 += 8;
+
+                       if (b1Odd) /* b4 */
+                       {
+                               _mm_store_si128((__m128i*)b4, uo);
+                               b4 += 16;
+                       }
+
+                       {
+                               /* b6 */
+                               const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+                                                                 15, 13, 11, 9, 7, 5, 3, 1);
+                               const __m128i ude = _mm_shuffle_epi8(ue, mask);
+                               _mm_storel_epi64((__m128i*)b6, ude);
+                               b6 += 8;
+                       }
+               }
+               {
+                       /* We have now
+                        * 16 even V values in ve
+                        * 16 odd V values in vo
+                        *
+                        * We need to split these according to
+                        * 3.3.8.3.2 YUV420p Stream Combination for YUV444 mode */
+                       __m128i ve, vo;
+                       {
+                               const __m128i ve1 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe1, v_factors),
+                                                                  _mm_maddubs_epi16(xe2, v_factors)), 7);
+                               const __m128i ve2 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xe3, v_factors),
+                                                                  _mm_maddubs_epi16(xe4, v_factors)), 7);
+                               ve = _mm_add_epi8(_mm_packs_epi16(ve1, ve2), vector128);
+                       }
+
+                       if (b1Odd)
+                       {
+                               const __m128i vo1 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo1, v_factors),
+                                                                  _mm_maddubs_epi16(xo2, v_factors)), 7);
+                               const __m128i vo2 = _mm_srai_epi16(_mm_hadd_epi16(_mm_maddubs_epi16(xo3, v_factors),
+                                                                  _mm_maddubs_epi16(xo4, v_factors)), 7);
+                               vo = _mm_add_epi8(_mm_packs_epi16(vo1, vo2), vector128);
+                       }
+
+                       /* Now we need the following storage distribution:
+                        * 2x   2y    -> b3
+                        * x    2y+1  -> b5
+                        * 2x+1 2y    -> b7 */
+                       if (b1Odd) /* b3 */
+                       {
+                               const __m128i veh = _mm_unpackhi_epi8(ve, _mm_setzero_si128());
+                               const __m128i voh = _mm_unpackhi_epi8(vo, _mm_setzero_si128());
+                               const __m128i hi = _mm_add_epi16(veh, voh);
+                               const __m128i vel = _mm_unpacklo_epi8(ve, _mm_setzero_si128());
+                               const __m128i vol = _mm_unpacklo_epi8(vo, _mm_setzero_si128());
+                               const __m128i lo = _mm_add_epi16(vel, vol);
+                               const __m128i added = _mm_hadd_epi16(lo, hi);
+                               const __m128i avg16 = _mm_srai_epi16(added, 2);
+                               const __m128i avg = _mm_packus_epi16(avg16, avg16);
+                               _mm_storel_epi64((__m128i*)b3, avg);
+                       }
+                       else
+                       {
+                               const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+                                                                 14, 12, 10, 8, 6, 4, 2, 0);
+                               const __m128i vd = _mm_shuffle_epi8(ve, mask);
+                               _mm_storel_epi64((__m128i*)b3, vd);
+                       }
+
+                       b3 += 8;
+
+                       if (b1Odd) /* b5 */
+                       {
+                               _mm_store_si128((__m128i*)b5, vo);
+                               b5 += 16;
+                       }
+
+                       {
+                               /* b7 */
+                               const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+                                                                 15, 13, 11, 9, 7, 5, 3, 1);
+                               const __m128i vde = _mm_shuffle_epi8(ve, mask);
+                               _mm_storel_epi64((__m128i*)b7, vde);
+                               b7 += 8;
+                       }
                }
        }
 }
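
Note on the chroma split above: for each 2x2 pixel block the routine writes the truncating average of all four U (and V) samples to the main view (b2/b3), the full-resolution odd-row samples to the auxiliary luma bands (b4/b5), and the odd-column samples of the even row to the auxiliary chroma planes (b6/b7). Below is a minimal scalar sketch of the same U-plane logic; the helper names avg_2x2 and scalar_uv_split are hypothetical and not part of FreeRDP, the V plane is handled identically via b3/b5/b7, and the single-row tail case (b1Odd == NULL) is omitted.

    /* Illustrative scalar sketch only; avg_2x2 and scalar_uv_split are
     * hypothetical helpers, not FreeRDP functions. */
    #include <stdint.h>

    static uint8_t avg_2x2(uint8_t a, uint8_t b, uint8_t c, uint8_t d)
    {
        /* same truncating average as the SIMD path: widen, add, shift by 2 */
        return (uint8_t)(((uint16_t)a + b + c + d) >> 2);
    }

    static void scalar_uv_split(const uint8_t* uEven, const uint8_t* uOdd,
                                uint8_t* b2, uint8_t* b4, uint8_t* b6,
                                uint32_t width)
    {
        uint32_t x;

        for (x = 0; x < width; x += 2)
        {
            /* main view: average of the full 2x2 block -> b2 */
            b2[x / 2] = avg_2x2(uEven[x], uEven[x + 1], uOdd[x], uOdd[x + 1]);
            /* auxiliary view: the odd row keeps full resolution -> b4 */
            b4[x] = uOdd[x];
            b4[x + 1] = uOdd[x + 1];
            /* auxiliary view: odd columns of the even row -> b6 */
            b6[x / 2] = uEven[x + 1];
        }
    }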
@@ -603,9 +704,7 @@ static pstatus_t ssse3_RGBToAVC444YUV_BGRX(
     BYTE* pDst2[3], const UINT32 dst2Step[3],
     const prim_size_t* roi)
 {
-       UINT32 y, numRows;
-       BOOL evenRow = TRUE;
-       BYTE* b1, *b2, *b3, *b4, *b5, *b6, *b7;
+       UINT32 y;
        const BYTE* pMaxSrc = pSrc + (roi->height - 1) * srcStep;
 
        if (roi->height < 1 || roi->width < 1)
@@ -614,28 +713,23 @@ static pstatus_t ssse3_RGBToAVC444YUV_BGRX(
        if (roi->width % 16 || (unsigned long)pSrc % 16 || srcStep % 16)
                return generic->RGBToAVC444YUV(pSrc, srcFormat, srcStep, pDst1, dst1Step, pDst2, dst2Step, roi);
 
-       numRows = (roi->height + 1) & ~1;
-
-       for (y = 0; y < numRows; y++, evenRow = !evenRow)
+       for (y = 0; y < roi->height; y += 2)
        {
-               const BYTE* src = y < roi->height ? pSrc + y * srcStep : pMaxSrc;
-               UINT32 i = y >> 1;
-               b1  = pDst1[0] + y * dst1Step[0];
-
-               if (evenRow)
-               {
-                       b2 = pDst1[1] + i * dst1Step[1];
-                       b3 = pDst1[2] + i * dst1Step[2];
-                       b6 = pDst2[1] + i * dst2Step[1];
-                       b7 = pDst2[2] + i * dst2Step[2];
-                       ssse3_RGBToAVC444YUV_BGRX_ROW(src, b1, b2, b6, b3, b7, TRUE, roi->width);
-               }
-               else
-               {
-                       b4 = pDst2[0] + dst2Step[0] * ((i & ~7) + i);
-                       b5 = b4 + 8 * dst2Step[0];
-                       ssse3_RGBToAVC444YUV_BGRX_ROW(src, b1, b4, b4 + 8, b5, b5 + 8, FALSE, roi->width);
-               }
+               const BOOL last = (y >= (roi->height - 1));
+               const BYTE* srcEven = y < roi->height ? pSrc + y * srcStep : pMaxSrc;
+               const BYTE* srcOdd = !last ? pSrc + (y + 1) * srcStep : pMaxSrc;
+               const UINT32 i = y >> 1;
+               const UINT32 n = (i & ~7) + i;
+               BYTE* b1Even = pDst1[0] + y * dst1Step[0];
+               BYTE* b1Odd = !last ? (b1Even + dst1Step[0]) : NULL;
+               BYTE* b2 = pDst1[1] + (y / 2) * dst1Step[1];
+               BYTE* b3 = pDst1[2] + (y / 2) * dst1Step[2];
+               BYTE* b4 = pDst2[0] + dst2Step[0] * n;
+               BYTE* b5 = b4 + 8 * dst2Step[0];
+               BYTE* b6 = pDst2[1] + (y / 2) * dst2Step[1];
+               BYTE* b7 = pDst2[2] + (y / 2) * dst2Step[2];
+               ssse3_RGBToAVC444YUV_BGRX_DOUBLE_ROW(srcEven, srcOdd, b1Even, b1Odd, b2, b3, b4, b5, b6, b7,
+                                                    roi->width);
        }
 
        return PRIMITIVES_SUCCESS;
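
Note on the b4/b5 addressing above: for chroma row i = y / 2, the odd-row U samples land at row (i & ~7) + i of the auxiliary frame's Y plane and the matching V samples 8 rows below, giving alternating 8-row U/V bands (rows 0-7 U, 8-15 V, 16-23 U, and so on). A standalone sketch, assuming only that banding layout, that prints the mapping:

    /* Standalone illustration of the (i & ~7) + i band offsets used above. */
    #include <stdio.h>

    int main(void)
    {
        unsigned i;

        for (i = 0; i < 16; i++)
        {
            const unsigned n = (i & ~7u) + i; /* row of the U band (b4) */
            const unsigned v = n + 8;         /* matching V row (b5)    */
            printf("chroma row %2u -> U row %2u, V row %2u\n", i, n, v);
        }

        return 0;
    }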
@@ -775,8 +869,11 @@ static INLINE void ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
                                const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                                                  14, 10, 6, 2, 12, 8, 4, 0);
                                const __m128i ud = _mm_shuffle_epi8(uo, mask);
-                               _mm_stream_si32((int*)uChromaDst1, ((int*)&ud)[0]);
-                               _mm_stream_si32((int*)vChromaDst1, ((int*)&ud)[1]);
+                               int* uDst1 = (int*)uChromaDst1;
+                               int* vDst1 = (int*)vChromaDst1;
+                               const int* src = (const int*)&ud;
+                               _mm_stream_si32(uDst1, src[0]);
+                               _mm_stream_si32(vDst1, src[1]);
                                uChromaDst1 += 4;
                                vChromaDst1 += 4;
                        }
@@ -839,9 +936,12 @@ static INLINE void ssse3_RGBToAVC444YUVv2_BGRX_DOUBLE_ROW(
                        {
                                const __m128i mask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                                                  14, 10, 6, 2, 12, 8, 4, 0);
-                               __m128i vd = _mm_shuffle_epi8(vo, mask);
-                               _mm_stream_si32((int*)uChromaDst2, ((int*)&vd)[0]);
-                               _mm_stream_si32((int*)vChromaDst2, ((int*)&vd)[1]);
+                               const __m128i vd = _mm_shuffle_epi8(vo, mask);
+                               int* uDst2 = (int*)uChromaDst2;
+                               int* vDst2 = (int*)vChromaDst2;
+                               const int* src = (const int*)&vd;
+                               _mm_stream_si32(uDst2, src[0]);
+                               _mm_stream_si32(vDst2, src[1]);
                                uChromaDst2 += 4;
                                vChromaDst2 += 4;
                        }
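
Note on the streaming stores above: the change only introduces named const/non-const temporaries for the source and destination pointers; the 32-bit lanes are still read through a pointer into the __m128i. As an alternative sketch (an assumption, not what this patch does), the two lanes could instead be extracted with SSE2 intrinsics, avoiding the int-pointer view of the vector:

    /* Hedged alternative, not part of the patch: extract both 32-bit lanes
     * with SSE2 intrinsics before streaming them out. */
    #include <emmintrin.h>

    static void stream_two_lanes(int* uDst, int* vDst, __m128i packed)
    {
        const int lane0 = _mm_cvtsi128_si32(packed);                    /* bytes 0..3 */
        const int lane1 = _mm_cvtsi128_si32(_mm_srli_si128(packed, 4)); /* bytes 4..7 */
        _mm_stream_si32(uDst, lane0);
        _mm_stream_si32(vDst, lane1);
    }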