SkSplicer: fix dstover
author Mike Klein <mtklein@chromium.org>
Tue, 14 Feb 2017 14:41:17 +0000 (09:41 -0500)
committer Skia Commit-Bot <skia-commit-bot@chromium.org>
Tue, 14 Feb 2017 15:12:42 +0000 (15:12 +0000)
dstover was updating the {dr,dg,db,da} registers instead of {r,g,b,a}.

Change-Id: Ie2280abe80efb274fb422dc38a329c6dac12f846
Reviewed-on: https://skia-review.googlesource.com/8404
Reviewed-by: Mike Klein <mtklein@chromium.org>
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>

src/splicer/SkSplicer_generated.h
src/splicer/SkSplicer_stages.cpp

index a9d8891..4316364 100644 (file)
@@ -58,12 +58,20 @@ static const unsigned int aarch64_srcover[] = {
     0xd65f03c0,                                 //  return
 };
 static const unsigned int aarch64_dstover[] = {
-    0x4d40c870,                                 //  ld1r          {v16.4s}, [x3]
-    0x4ea7d610,                                 //  fsub          v16.4s, v16.4s, v7.4s
-    0x4e20ce04,                                 //  fmla          v4.4s, v16.4s, v0.4s
-    0x4e21ce05,                                 //  fmla          v5.4s, v16.4s, v1.4s
-    0x4e22ce06,                                 //  fmla          v6.4s, v16.4s, v2.4s
-    0x4e23ce07,                                 //  fmla          v7.4s, v16.4s, v3.4s
+    0x4d40c871,                                 //  ld1r          {v17.4s}, [x3]
+    0x4ea41c90,                                 //  mov           v16.16b, v4.16b
+    0x4ea61cd2,                                 //  mov           v18.16b, v6.16b
+    0x4ea71cf3,                                 //  mov           v19.16b, v7.16b
+    0x4ea7d634,                                 //  fsub          v20.4s, v17.4s, v7.4s
+    0x4ea51cb1,                                 //  mov           v17.16b, v5.16b
+    0x4e20ce90,                                 //  fmla          v16.4s, v20.4s, v0.4s
+    0x4e21ce91,                                 //  fmla          v17.4s, v20.4s, v1.4s
+    0x4e22ce92,                                 //  fmla          v18.4s, v20.4s, v2.4s
+    0x4e23ce93,                                 //  fmla          v19.4s, v20.4s, v3.4s
+    0x4eb01e00,                                 //  mov           v0.16b, v16.16b
+    0x4eb11e21,                                 //  mov           v1.16b, v17.16b
+    0x4eb21e42,                                 //  mov           v2.16b, v18.16b
+    0x4eb31e63,                                 //  mov           v3.16b, v19.16b
     0xd65f03c0,                                 //  return
 };
 static const unsigned int aarch64_clamp_0[] = {
@@ -494,11 +502,19 @@ static const unsigned int armv7_srcover[] = {
 };
 static const unsigned int armv7_dstover[] = {
     0xf4e30c9f,                                 //  vld1.32       {d16[]}, [r3 :32]
-    0xf2600d87,                                 //  vsub.f32      d16, d16, d7
-    0xf2004c30,                                 //  vfma.f32      d4, d0, d16
-    0xf2015c30,                                 //  vfma.f32      d5, d1, d16
-    0xf2026c30,                                 //  vfma.f32      d6, d2, d16
-    0xf2037c30,                                 //  vfma.f32      d7, d3, d16
+    0xf2651115,                                 //  vorr          d17, d5, d5
+    0xf2604d87,                                 //  vsub.f32      d20, d16, d7
+    0xf2640114,                                 //  vorr          d16, d4, d4
+    0xf2662116,                                 //  vorr          d18, d6, d6
+    0xf2673117,                                 //  vorr          d19, d7, d7
+    0xf2400c34,                                 //  vfma.f32      d16, d0, d20
+    0xf2411c34,                                 //  vfma.f32      d17, d1, d20
+    0xf2422c34,                                 //  vfma.f32      d18, d2, d20
+    0xf2433c34,                                 //  vfma.f32      d19, d3, d20
+    0xf22001b0,                                 //  vorr          d0, d16, d16
+    0xf22111b1,                                 //  vorr          d1, d17, d17
+    0xf22221b2,                                 //  vorr          d2, d18, d18
+    0xf22331b3,                                 //  vorr          d3, d19, d19
     0xe12fff1e,                                 //  return
 };
 static const unsigned int armv7_clamp_0[] = {
@@ -979,17 +995,14 @@ static const unsigned char sse2_dstover[] = {
     0xf3,0x44,0x0f,0x10,0x01,                   //  movss         (%rcx),%xmm8
     0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
     0x44,0x0f,0x5c,0xc7,                        //  subps         %xmm7,%xmm8
-    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
-    0x44,0x0f,0x59,0xc8,                        //  mulps         %xmm0,%xmm9
-    0x41,0x0f,0x58,0xe1,                        //  addps         %xmm9,%xmm4
-    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
-    0x44,0x0f,0x59,0xc9,                        //  mulps         %xmm1,%xmm9
-    0x41,0x0f,0x58,0xe9,                        //  addps         %xmm9,%xmm5
-    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
-    0x44,0x0f,0x59,0xca,                        //  mulps         %xmm2,%xmm9
-    0x41,0x0f,0x58,0xf1,                        //  addps         %xmm9,%xmm6
-    0x44,0x0f,0x59,0xc3,                        //  mulps         %xmm3,%xmm8
-    0x41,0x0f,0x58,0xf8,                        //  addps         %xmm8,%xmm7
+    0x41,0x0f,0x59,0xc0,                        //  mulps         %xmm8,%xmm0
+    0x0f,0x58,0xc4,                             //  addps         %xmm4,%xmm0
+    0x41,0x0f,0x59,0xc8,                        //  mulps         %xmm8,%xmm1
+    0x0f,0x58,0xcd,                             //  addps         %xmm5,%xmm1
+    0x41,0x0f,0x59,0xd0,                        //  mulps         %xmm8,%xmm2
+    0x0f,0x58,0xd6,                             //  addps         %xmm6,%xmm2
+    0x41,0x0f,0x59,0xd8,                        //  mulps         %xmm8,%xmm3
+    0x0f,0x58,0xdf,                             //  addps         %xmm7,%xmm3
     0xc3,                                       //  return
 };
 static const unsigned char sse2_clamp_0[] = {
@@ -1540,10 +1553,10 @@ static const unsigned char hsw_srcover[] = {
 static const unsigned char hsw_dstover[] = {
     0xc4,0x62,0x7d,0x18,0x01,                   //  vbroadcastss  (%rcx),%ymm8
     0xc5,0x3c,0x5c,0xc7,                        //  vsubps        %ymm7,%ymm8,%ymm8
-    0xc4,0xc2,0x7d,0xb8,0xe0,                   //  vfmadd231ps   %ymm8,%ymm0,%ymm4
-    0xc4,0xc2,0x75,0xb8,0xe8,                   //  vfmadd231ps   %ymm8,%ymm1,%ymm5
-    0xc4,0xc2,0x6d,0xb8,0xf0,                   //  vfmadd231ps   %ymm8,%ymm2,%ymm6
-    0xc4,0xc2,0x65,0xb8,0xf8,                   //  vfmadd231ps   %ymm8,%ymm3,%ymm7
+    0xc4,0xe2,0x3d,0xa8,0xc4,                   //  vfmadd213ps   %ymm4,%ymm8,%ymm0
+    0xc4,0xe2,0x3d,0xa8,0xcd,                   //  vfmadd213ps   %ymm5,%ymm8,%ymm1
+    0xc4,0xe2,0x3d,0xa8,0xd6,                   //  vfmadd213ps   %ymm6,%ymm8,%ymm2
+    0xc4,0xe2,0x3d,0xa8,0xdf,                   //  vfmadd213ps   %ymm7,%ymm8,%ymm3
     0xc3,                                       //  return
 };
 static const unsigned char hsw_clamp_0[] = {
index badaf78..03b7001 100644 (file)
@@ -201,7 +201,13 @@ STAGE(srcover) {
     b = fma(db, A, b);
     a = fma(da, A, a);
 }
-STAGE(dstover) { srcover_k(x,limit,ctx,k, dr,dg,db,da, r,g,b,a); }
+STAGE(dstover) {
+    auto DA = k->_1 - da;
+    r = fma(r, DA, dr);
+    g = fma(g, DA, dg);
+    b = fma(b, DA, db);
+    a = fma(a, DA, da);
+}
 
 STAGE(clamp_0) {
     r = max(r, 0);