Streamline x86 u8 -> fixed15 math.
authorMike Klein <mtklein@chromium.org>
Fri, 20 Jan 2017 05:04:58 +0000 (00:04 -0500)
committerSkia Commit-Bot <skia-commit-bot@chromium.org>
Fri, 20 Jan 2017 16:06:35 +0000 (16:06 +0000)
We can use SSE's 16 bit mul-hi to get a very good approximation to the
ideal multiplier.  This lets us trim several instructions.

This removes the need for the constant 0x0001 and instead uses 0x8081.
I've reordered the constants so that 0x8000 comes first, which helps
trim an instruction here and there on ARM.

Change-Id: I3d490c802df39a89424230c4cfc491f52210c275
Reviewed-on: https://skia-review.googlesource.com/7282
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>

src/splicer/SkSplicer.cpp
src/splicer/SkSplicer_generated_lowp.h
src/splicer/SkSplicer_shared.h
src/splicer/SkSplicer_stages_lowp.cpp

index f55eb34..4ed45e0 100644 (file)
@@ -44,7 +44,7 @@ namespace {
         12.46f, 0.411192f, 0.689206f, -0.0988f, 0.0043f,   //   to_srgb
     };
     static const SkSplicer_constants_lowp kConstants_lowp = {
-        0x0001, 0x8000,
+        0x8000, 0x8081,
     };
 
     // We do this a lot, so it's nice to infer the correct size.  Works fine with arrays.
index 3ea4962..58e03c7 100644 (file)
@@ -29,8 +29,7 @@ static const unsigned int kSplice_plus__lowp[] = {
     0x6e670c63,                                 //  uqadd         v3.8h, v3.8h, v7.8h
 };
 static const unsigned int kSplice_srcover_lowp[] = {
-    0x91000868,                                 //  add           x8, x3, #0x2
-    0x4d40c510,                                 //  ld1r          {v16.8h}, [x8]
+    0x4d40c470,                                 //  ld1r          {v16.8h}, [x3]
     0x6e632e10,                                 //  uqsub         v16.8h, v16.8h, v3.8h
     0x6e70b491,                                 //  sqrdmulh      v17.8h, v4.8h, v16.8h
     0x4e241e12,                                 //  and           v18.16b, v16.16b, v4.16b
@@ -54,8 +53,7 @@ static const unsigned int kSplice_srcover_lowp[] = {
     0x6e630e83,                                 //  uqadd         v3.8h, v20.8h, v3.8h
 };
 static const unsigned int kSplice_dstover_lowp[] = {
-    0x91000868,                                 //  add           x8, x3, #0x2
-    0x4d40c510,                                 //  ld1r          {v16.8h}, [x8]
+    0x4d40c470,                                 //  ld1r          {v16.8h}, [x3]
     0x6e672e10,                                 //  uqsub         v16.8h, v16.8h, v7.8h
     0x6e70b411,                                 //  sqrdmulh      v17.8h, v0.8h, v16.8h
     0x4e201e12,                                 //  and           v18.16b, v16.16b, v0.16b
@@ -79,16 +77,14 @@ static const unsigned int kSplice_dstover_lowp[] = {
     0x6e670e87,                                 //  uqadd         v7.8h, v20.8h, v7.8h
 };
 static const unsigned int kSplice_clamp_1_lowp[] = {
-    0x91000868,                                 //  add           x8, x3, #0x2
-    0x4d40c510,                                 //  ld1r          {v16.8h}, [x8]
+    0x4d40c470,                                 //  ld1r          {v16.8h}, [x3]
     0x6e706c00,                                 //  umin          v0.8h, v0.8h, v16.8h
     0x6e706c21,                                 //  umin          v1.8h, v1.8h, v16.8h
     0x6e706c42,                                 //  umin          v2.8h, v2.8h, v16.8h
     0x6e706c63,                                 //  umin          v3.8h, v3.8h, v16.8h
 };
 static const unsigned int kSplice_clamp_a_lowp[] = {
-    0x91000868,                                 //  add           x8, x3, #0x2
-    0x4d40c510,                                 //  ld1r          {v16.8h}, [x8]
+    0x4d40c470,                                 //  ld1r          {v16.8h}, [x3]
     0x6e706c63,                                 //  umin          v3.8h, v3.8h, v16.8h
     0x6e636c00,                                 //  umin          v0.8h, v0.8h, v3.8h
     0x6e636c21,                                 //  umin          v1.8h, v1.8h, v3.8h
@@ -197,8 +193,7 @@ static const unsigned int kSplice_plus__lowp[] = {
     0xf3133017,                                 //  vqadd.u16     d3, d3, d7
 };
 static const unsigned int kSplice_srcover_lowp[] = {
-    0xe283c002,                                 //  add           ip, r3, #2
-    0xf4ec0c5f,                                 //  vld1.16       {d16[]}, [ip :16]
+    0xf4e30c5f,                                 //  vld1.16       {d16[]}, [r3 :16]
     0xf3500293,                                 //  vqsub.u16     d16, d16, d3
     0xf3541b20,                                 //  vqrdmulh.s16  d17, d4, d16
     0xf3552b20,                                 //  vqrdmulh.s16  d18, d5, d16
@@ -222,8 +217,7 @@ static const unsigned int kSplice_srcover_lowp[] = {
     0xf3143093,                                 //  vqadd.u16     d3, d20, d3
 };
 static const unsigned int kSplice_dstover_lowp[] = {
-    0xe283c002,                                 //  add           ip, r3, #2
-    0xf4ec0c5f,                                 //  vld1.16       {d16[]}, [ip :16]
+    0xf4e30c5f,                                 //  vld1.16       {d16[]}, [r3 :16]
     0xf3500297,                                 //  vqsub.u16     d16, d16, d7
     0xf3501b20,                                 //  vqrdmulh.s16  d17, d0, d16
     0xf3512b20,                                 //  vqrdmulh.s16  d18, d1, d16
@@ -247,16 +241,14 @@ static const unsigned int kSplice_dstover_lowp[] = {
     0xf3147097,                                 //  vqadd.u16     d7, d20, d7
 };
 static const unsigned int kSplice_clamp_1_lowp[] = {
-    0xe283c002,                                 //  add           ip, r3, #2
-    0xf4ec0c5f,                                 //  vld1.16       {d16[]}, [ip :16]
+    0xf4e30c5f,                                 //  vld1.16       {d16[]}, [r3 :16]
     0xf3100630,                                 //  vmin.u16      d0, d0, d16
     0xf3111630,                                 //  vmin.u16      d1, d1, d16
     0xf3122630,                                 //  vmin.u16      d2, d2, d16
     0xf3133630,                                 //  vmin.u16      d3, d3, d16
 };
 static const unsigned int kSplice_clamp_a_lowp[] = {
-    0xe283c002,                                 //  add           ip, r3, #2
-    0xf4ec0c5f,                                 //  vld1.16       {d16[]}, [ip :16]
+    0xf4e30c5f,                                 //  vld1.16       {d16[]}, [r3 :16]
     0xf3133630,                                 //  vmin.u16      d3, d3, d16
     0xf3100613,                                 //  vmin.u16      d0, d0, d3
     0xf3111613,                                 //  vmin.u16      d1, d1, d3
@@ -376,7 +368,7 @@ static const unsigned char kSplice_plus__lowp[] = {
     0xc5,0xe5,0xdd,0xdf,                        //  vpaddusw      %ymm7,%ymm3,%ymm3
 };
 static const unsigned char kSplice_srcover_lowp[] = {
-    0xc4,0x62,0x7d,0x79,0x41,0x02,              //  vpbroadcastw  0x2(%rcx),%ymm8
+    0xc4,0x62,0x7d,0x79,0x01,                   //  vpbroadcastw  (%rcx),%ymm8
     0xc5,0x3d,0xd9,0xc3,                        //  vpsubusw      %ymm3,%ymm8,%ymm8
     0xc4,0x42,0x5d,0x0b,0xc8,                   //  vpmulhrsw     %ymm8,%ymm4,%ymm9
     0xc4,0x42,0x7d,0x1d,0xc9,                   //  vpabsw        %ymm9,%ymm9
@@ -392,7 +384,7 @@ static const unsigned char kSplice_srcover_lowp[] = {
     0xc5,0xbd,0xdd,0xdb,                        //  vpaddusw      %ymm3,%ymm8,%ymm3
 };
 static const unsigned char kSplice_dstover_lowp[] = {
-    0xc4,0x62,0x7d,0x79,0x41,0x02,              //  vpbroadcastw  0x2(%rcx),%ymm8
+    0xc4,0x62,0x7d,0x79,0x01,                   //  vpbroadcastw  (%rcx),%ymm8
     0xc5,0x3d,0xd9,0xc7,                        //  vpsubusw      %ymm7,%ymm8,%ymm8
     0xc4,0x42,0x7d,0x0b,0xc8,                   //  vpmulhrsw     %ymm8,%ymm0,%ymm9
     0xc4,0x42,0x7d,0x1d,0xc9,                   //  vpabsw        %ymm9,%ymm9
@@ -408,14 +400,14 @@ static const unsigned char kSplice_dstover_lowp[] = {
     0xc5,0xbd,0xdd,0xff,                        //  vpaddusw      %ymm7,%ymm8,%ymm7
 };
 static const unsigned char kSplice_clamp_1_lowp[] = {
-    0xc4,0x62,0x7d,0x79,0x41,0x02,              //  vpbroadcastw  0x2(%rcx),%ymm8
+    0xc4,0x62,0x7d,0x79,0x01,                   //  vpbroadcastw  (%rcx),%ymm8
     0xc4,0xc2,0x7d,0x3a,0xc0,                   //  vpminuw       %ymm8,%ymm0,%ymm0
     0xc4,0xc2,0x75,0x3a,0xc8,                   //  vpminuw       %ymm8,%ymm1,%ymm1
     0xc4,0xc2,0x6d,0x3a,0xd0,                   //  vpminuw       %ymm8,%ymm2,%ymm2
     0xc4,0xc2,0x65,0x3a,0xd8,                   //  vpminuw       %ymm8,%ymm3,%ymm3
 };
 static const unsigned char kSplice_clamp_a_lowp[] = {
-    0xc4,0x62,0x7d,0x79,0x41,0x02,              //  vpbroadcastw  0x2(%rcx),%ymm8
+    0xc4,0x62,0x7d,0x79,0x01,                   //  vpbroadcastw  (%rcx),%ymm8
     0xc4,0xc2,0x65,0x3a,0xd8,                   //  vpminuw       %ymm8,%ymm3,%ymm3
     0xc4,0xe2,0x7d,0x3a,0xc3,                   //  vpminuw       %ymm3,%ymm0,%ymm0
     0xc4,0xe2,0x75,0x3a,0xcb,                   //  vpminuw       %ymm3,%ymm1,%ymm1
@@ -458,13 +450,9 @@ static const unsigned char kSplice_premul_lowp[] = {
 static const unsigned char kSplice_scale_u8_lowp[] = {
     0x48,0x8b,0x02,                             //  mov           (%rdx),%rax
     0xc4,0x62,0x7d,0x30,0x04,0x38,              //  vpmovzxbw     (%rax,%rdi,1),%ymm8
-    0xc4,0xc1,0x35,0x71,0xf0,0x07,              //  vpsllw        $0x7,%ymm8,%ymm9
-    0xc4,0xc1,0x2d,0x71,0xd0,0x01,              //  vpsrlw        $0x1,%ymm8,%ymm10
-    0xc4,0x41,0x35,0xdd,0xca,                   //  vpaddusw      %ymm10,%ymm9,%ymm9
-    0xc4,0x62,0x7d,0x79,0x11,                   //  vpbroadcastw  (%rcx),%ymm10
-    0xc4,0x41,0x3d,0xdd,0xc2,                   //  vpaddusw      %ymm10,%ymm8,%ymm8
-    0xc4,0xc1,0x3d,0x71,0xd0,0x08,              //  vpsrlw        $0x8,%ymm8,%ymm8
-    0xc4,0x41,0x35,0xdd,0xc0,                   //  vpaddusw      %ymm8,%ymm9,%ymm8
+    0xc4,0xc1,0x3d,0x71,0xf0,0x08,              //  vpsllw        $0x8,%ymm8,%ymm8
+    0xc4,0x62,0x7d,0x79,0x49,0x02,              //  vpbroadcastw  0x2(%rcx),%ymm9
+    0xc4,0x41,0x3d,0xe4,0xc1,                   //  vpmulhuw      %ymm9,%ymm8,%ymm8
     0xc4,0xc2,0x7d,0x0b,0xc0,                   //  vpmulhrsw     %ymm8,%ymm0,%ymm0
     0xc4,0xe2,0x7d,0x1d,0xc0,                   //  vpabsw        %ymm0,%ymm0
     0xc4,0xc2,0x75,0x0b,0xc8,                   //  vpmulhrsw     %ymm8,%ymm1,%ymm1
@@ -494,37 +482,21 @@ static const unsigned char kSplice_load_8888_lowp[] = {
     0xc5,0x39,0x68,0xc1,                        //  vpunpckhbw    %xmm1,%xmm8,%xmm8
     0xc5,0xe9,0x6c,0xc3,                        //  vpunpcklqdq   %xmm3,%xmm2,%xmm0
     0xc4,0xe2,0x7d,0x30,0xc0,                   //  vpmovzxbw     %xmm0,%ymm0
-    0xc5,0xf5,0x71,0xf0,0x07,                   //  vpsllw        $0x7,%ymm0,%ymm1
-    0xc5,0xad,0x71,0xd0,0x01,                   //  vpsrlw        $0x1,%ymm0,%ymm10
-    0xc4,0xc1,0x75,0xdd,0xca,                   //  vpaddusw      %ymm10,%ymm1,%ymm1
-    0xc4,0x62,0x7d,0x79,0x11,                   //  vpbroadcastw  (%rcx),%ymm10
-    0xc4,0xc1,0x7d,0xdd,0xc2,                   //  vpaddusw      %ymm10,%ymm0,%ymm0
-    0xc5,0xfd,0x71,0xd0,0x08,                   //  vpsrlw        $0x8,%ymm0,%ymm0
-    0xc5,0xf5,0xdd,0xc0,                        //  vpaddusw      %ymm0,%ymm1,%ymm0
+    0xc5,0xfd,0x71,0xf0,0x08,                   //  vpsllw        $0x8,%ymm0,%ymm0
+    0xc4,0x62,0x7d,0x79,0x51,0x02,              //  vpbroadcastw  0x2(%rcx),%ymm10
+    0xc4,0xc1,0x7d,0xe4,0xc2,                   //  vpmulhuw      %ymm10,%ymm0,%ymm0
     0xc5,0xe9,0x6d,0xcb,                        //  vpunpckhqdq   %xmm3,%xmm2,%xmm1
     0xc4,0xe2,0x7d,0x30,0xc9,                   //  vpmovzxbw     %xmm1,%ymm1
-    0xc5,0xed,0x71,0xf1,0x07,                   //  vpsllw        $0x7,%ymm1,%ymm2
-    0xc5,0xe5,0x71,0xd1,0x01,                   //  vpsrlw        $0x1,%ymm1,%ymm3
-    0xc5,0xed,0xdd,0xd3,                        //  vpaddusw      %ymm3,%ymm2,%ymm2
-    0xc4,0xc1,0x75,0xdd,0xca,                   //  vpaddusw      %ymm10,%ymm1,%ymm1
-    0xc5,0xf5,0x71,0xd1,0x08,                   //  vpsrlw        $0x8,%ymm1,%ymm1
-    0xc5,0xed,0xdd,0xc9,                        //  vpaddusw      %ymm1,%ymm2,%ymm1
+    0xc5,0xf5,0x71,0xf1,0x08,                   //  vpsllw        $0x8,%ymm1,%ymm1
+    0xc4,0xc1,0x75,0xe4,0xca,                   //  vpmulhuw      %ymm10,%ymm1,%ymm1
     0xc4,0xc1,0x31,0x6c,0xd0,                   //  vpunpcklqdq   %xmm8,%xmm9,%xmm2
     0xc4,0xe2,0x7d,0x30,0xd2,                   //  vpmovzxbw     %xmm2,%ymm2
-    0xc5,0xe5,0x71,0xf2,0x07,                   //  vpsllw        $0x7,%ymm2,%ymm3
-    0xc5,0xa5,0x71,0xd2,0x01,                   //  vpsrlw        $0x1,%ymm2,%ymm11
-    0xc4,0xc1,0x65,0xdd,0xdb,                   //  vpaddusw      %ymm11,%ymm3,%ymm3
-    0xc4,0xc1,0x6d,0xdd,0xd2,                   //  vpaddusw      %ymm10,%ymm2,%ymm2
-    0xc5,0xed,0x71,0xd2,0x08,                   //  vpsrlw        $0x8,%ymm2,%ymm2
-    0xc5,0xe5,0xdd,0xd2,                        //  vpaddusw      %ymm2,%ymm3,%ymm2
+    0xc5,0xed,0x71,0xf2,0x08,                   //  vpsllw        $0x8,%ymm2,%ymm2
+    0xc4,0xc1,0x6d,0xe4,0xd2,                   //  vpmulhuw      %ymm10,%ymm2,%ymm2
     0xc4,0xc1,0x31,0x6d,0xd8,                   //  vpunpckhqdq   %xmm8,%xmm9,%xmm3
     0xc4,0xe2,0x7d,0x30,0xdb,                   //  vpmovzxbw     %xmm3,%ymm3
-    0xc5,0xbd,0x71,0xf3,0x07,                   //  vpsllw        $0x7,%ymm3,%ymm8
-    0xc5,0xb5,0x71,0xd3,0x01,                   //  vpsrlw        $0x1,%ymm3,%ymm9
-    0xc4,0x41,0x3d,0xdd,0xc1,                   //  vpaddusw      %ymm9,%ymm8,%ymm8
-    0xc4,0xc1,0x65,0xdd,0xda,                   //  vpaddusw      %ymm10,%ymm3,%ymm3
-    0xc5,0xe5,0x71,0xd3,0x08,                   //  vpsrlw        $0x8,%ymm3,%ymm3
-    0xc5,0xbd,0xdd,0xdb,                        //  vpaddusw      %ymm3,%ymm8,%ymm3
+    0xc5,0xe5,0x71,0xf3,0x08,                   //  vpsllw        $0x8,%ymm3,%ymm3
+    0xc4,0xc1,0x65,0xe4,0xda,                   //  vpmulhuw      %ymm10,%ymm3,%ymm3
 };
 static const unsigned char kSplice_store_8888_lowp[] = {
     0x48,0x8b,0x02,                             //  mov           (%rdx),%rax
index 7f4db14..9f69aa7 100644 (file)
@@ -41,8 +41,8 @@ struct SkSplicer_constants {
 };
 
 struct SkSplicer_constants_lowp {
-    uint16_t _0x0001;      // 0x0001 ==     1 == epsilon
     uint16_t _1;           // 0x8000 == 32768 == 1.0
+    uint16_t _0x8081;      // 0x8081 == 32897, closest value to 32768 * (256/255).
 };
 
 #endif//SkSplicer_shared_DEFINED
index 38a2632..9e5ea4e 100644 (file)
@@ -124,9 +124,11 @@ using K = const SkSplicer_constants_lowp;
     static F max(F a, F b) { return _mm256_max_epu16(a,b); }
 
     static F from_u8(U8 u8, K* k) {
-        // Nothing too interesting here.  We follow the stock SkFixed15 formula.
+        // Ideally we'd multiply by 32768/255 = 128.50196...
+        // We can approximate that very cheaply as 256*32897/65536 = 128.50391...
+        // 0 and 255 map to 0 and 32768 correctly, and the max error is 1 (on about 1/4 of values).
         F u16 = _mm256_cvtepu8_epi16(u8);
-        return (u16 << 7) + (u16 >> 1) + ((u16+k->_0x0001)>>8);
+        return _mm256_mulhi_epu16(u16 << 8, F(k->_0x8081));
     }
 #endif