SkJumper: everyone gets a start_pipeline().
Author:    Mike Klein <mtklein@chromium.org>
           Thu, 16 Feb 2017 15:17:00 +0000 (10:17 -0500)
Committer: Skia Commit-Bot <skia-commit-bot@chromium.org>
           Thu, 16 Feb 2017 18:38:49 +0000 (18:38 +0000)
Windows needs this as a shim to switch to the System V ABI and back.

Other platforms need it too, if only to make UBSAN happy about calling
functions through the correct function pointers.

One day maybe we can move the looping logic inside start_pipeline?

Change-Id: I47d9ef48752becc6c43fc052b12a540c157bcaaa
Reviewed-on: https://skia-review.googlesource.com/8542
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>

src/jumper/SkJumper.cpp
src/jumper/SkJumper_generated.h
src/jumper/SkJumper_stages.cpp

index 3b27ffe..2ed95a1 100644 (file)
@@ -21,11 +21,6 @@ static const SkJumper_constants kConstants = {
     0x77800000, 0x07800000, 0x04000400,                // fp16 <-> fp32
 };
 
-using JumperStage = void(size_t, void**, const SkJumper_constants*);
-// Jumper stages actually pass around 8 floating point vectors too.
-// They're designed to work when those vectors start unintialized,
-// so we don't need to mention them here.
-
 #define STAGES(M)     \
     M(seed_shader)    \
     M(constant_color) \
@@ -57,8 +52,12 @@ using JumperStage = void(size_t, void**, const SkJumper_constants*);
 
 // Declare the portable, single pixel stages that are linked into Skia from SkJumper_stages.o.
 extern "C" {
-    JumperStage sk_just_return;
-#define M(st) JumperStage sk_##st;
+    void sk_start_pipeline(size_t, void**, const SkJumper_constants*);
+
+    // We use void() as a convenient stand-in for the real stage function type.
+    // We never call these directly, so we don't really need to know their real types.
+    void sk_just_return(void);
+#define M(st) void sk_##st(void);
     STAGES(M)
 #undef M
 }
@@ -123,33 +122,51 @@ bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const {
     // We'll look for the best vector instruction set and stride we can use.
     size_t stride                                 = 0;
     void* (*lookup)(SkRasterPipeline::StockStage) = nullptr;
+    void* start_pipeline                          = nullptr;
     void* just_return                             = nullptr;
 
 #if defined(__aarch64__)
-    stride      = 4;
-    lookup      = aarch64_lookup;
-    just_return = (void*)aarch64_sk_just_return;
+    stride         = 4;
+    lookup         = aarch64_lookup;
+    start_pipeline = (void*)aarch64_sk_start_pipeline;
+    just_return    = (void*)aarch64_sk_just_return;
 
 #elif defined(__ARM_NEON__)
     if (SkCpu::Supports(SkCpu::NEON|SkCpu::NEON_FMA|SkCpu::VFP_FP16)) {
-        stride      = 2;
-        lookup      = armv7_lookup;
-        just_return = (void*)armv7_sk_just_return;
+        stride         = 2;
+        lookup         = armv7_lookup;
+        start_pipeline = (void*)armv7_sk_start_pipeline;
+        just_return    = (void*)armv7_sk_just_return;
     }
 
 #elif defined(__x86_64__) || defined(_M_X64)
-    stride      = 4;
-    lookup      = sse2_lookup;
-    just_return = (void*)sse2_sk_just_return;
+    stride         = 4;
+    lookup         = sse2_lookup;
+    start_pipeline = (void*)sse2_sk_start_pipeline;
+    just_return    = (void*)sse2_sk_just_return;
     if (SkCpu::Supports(SkCpu::SSE41)) {
-        stride      = 4;
-        lookup      = sse41_lookup;
-        just_return = (void*)sse41_sk_just_return;
+        stride         = 4;
+        lookup         = sse41_lookup;
+        start_pipeline = (void*)sse41_sk_start_pipeline;
+        just_return    = (void*)sse41_sk_just_return;
     }
     if (SkCpu::Supports(SkCpu::HSW)) {
-        stride      = 8;
-        lookup      = hsw_lookup;
-        just_return = (void*)hsw_sk_just_return;
+        stride         = 8;
+        lookup         = hsw_lookup;
+        start_pipeline = (void*)hsw_sk_start_pipeline;
+        just_return    = (void*)hsw_sk_just_return;
+    }
+#endif
+
+#if defined(_MSC_VER)
+    if (start_pipeline == (void*)sse2_sk_start_pipeline) {
+        start_pipeline =  (void*)sse2_sk_start_pipeline_ms;
+    }
+    if (start_pipeline == (void*)sse41_sk_start_pipeline) {
+        start_pipeline =  (void*)sse41_sk_start_pipeline_ms;
+    }
+    if (start_pipeline == (void*)hsw_sk_start_pipeline) {
+        start_pipeline =  (void*)hsw_sk_start_pipeline_ms;
     }
 #endif
 
@@ -170,10 +187,9 @@ bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const {
         }
         *ip = (void*)just_return;
 
-        ip = program.get();
-        auto start = (JumperStage*)*ip++;
+        auto start = (decltype(&sk_start_pipeline))start_pipeline;
         while (x + stride <= limit) {
-            start(x, ip, &kConstants);
+            start(x, program.get(), &kConstants);
             x += stride;
         }
     }
@@ -193,10 +209,9 @@ bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const {
         }
         *ip = (void*)sk_just_return;
 
-        ip = program.get();
-        auto start = (JumperStage*)*ip++;
+        auto start = sk_start_pipeline;
         while (x + stride <= limit) {
-            start(x, ip, &kConstants);
+            start(x, program.get(), &kConstants);
             x += stride;
         }
     }
index ac0dd48..cf25d67 100644 (file)
 // This file is generated semi-automatically with this command:
 //   $ src/jumper/build_stages.py
 
+static const unsigned int aarch64_sk_start_pipeline[] = {
+    0xf8408423,                                     //  ldr           x3, [x1],#8
+    0x6f00e400,                                     //  movi          v0.2d, #0x0
+    0x6f00e401,                                     //  movi          v1.2d, #0x0
+    0x6f00e402,                                     //  movi          v2.2d, #0x0
+    0x6f00e403,                                     //  movi          v3.2d, #0x0
+    0x6f00e404,                                     //  movi          v4.2d, #0x0
+    0x6f00e405,                                     //  movi          v5.2d, #0x0
+    0x6f00e406,                                     //  movi          v6.2d, #0x0
+    0x6f00e407,                                     //  movi          v7.2d, #0x0
+    0xd61f0060,                                     //  br            x3
+};
 static const unsigned int aarch64_sk_just_return[] = {
     0xd65f03c0,                                     //  ret
 };
@@ -507,6 +519,18 @@ static const unsigned int aarch64_sk_linear_gradient_2stops[] = {
     0x4eb01e00,                                     //  mov           v0.16b, v16.16b
     0xd61f0060,                                     //  br            x3
 };
+static const unsigned int armv7_sk_start_pipeline[] = {
+    0xe4913004,                                     //  ldr           r3, [r1], #4
+    0xf2800010,                                     //  vmov.i32      d0, #0
+    0xf2801010,                                     //  vmov.i32      d1, #0
+    0xf2802010,                                     //  vmov.i32      d2, #0
+    0xf2803010,                                     //  vmov.i32      d3, #0
+    0xf2804010,                                     //  vmov.i32      d4, #0
+    0xf2805010,                                     //  vmov.i32      d5, #0
+    0xf2806010,                                     //  vmov.i32      d6, #0
+    0xf2807010,                                     //  vmov.i32      d7, #0
+    0xe12fff13,                                     //  bx            r3
+};
 static const unsigned int armv7_sk_just_return[] = {
     0xe12fff1e,                                     //  bx            lr
 };
@@ -1066,11 +1090,64 @@ static const unsigned int armv7_sk_linear_gradient_2stops[] = {
     0xf22001b0,                                     //  vorr          d0, d16, d16
     0xe12fff1c,                                     //  bx            ip
 };
-static const unsigned char hsw_sk_just_return[] = {
-    0xc5,0xf8,0x77,                                 //  vzeroupper
+static const unsigned char hsw_sk_start_pipeline[] = {
+    0x48,0xad,                                      //  lods          %ds:(%rsi),%rax
+    0xc5,0xfc,0x57,0xc0,                            //  vxorps        %ymm0,%ymm0,%ymm0
+    0xc5,0xf4,0x57,0xc9,                            //  vxorps        %ymm1,%ymm1,%ymm1
+    0xc5,0xec,0x57,0xd2,                            //  vxorps        %ymm2,%ymm2,%ymm2
+    0xc5,0xe4,0x57,0xdb,                            //  vxorps        %ymm3,%ymm3,%ymm3
+    0xc5,0xdc,0x57,0xe4,                            //  vxorps        %ymm4,%ymm4,%ymm4
+    0xc5,0xd4,0x57,0xed,                            //  vxorps        %ymm5,%ymm5,%ymm5
+    0xc5,0xcc,0x57,0xf6,                            //  vxorps        %ymm6,%ymm6,%ymm6
+    0xc5,0xc4,0x57,0xff,                            //  vxorps        %ymm7,%ymm7,%ymm7
+    0xff,0xe0,                                      //  jmpq          *%rax
+};
+static const unsigned char hsw_sk_start_pipeline_ms[] = {
+    0x56,                                           //  push          %rsi
+    0x57,                                           //  push          %rdi
+    0x48,0x81,0xec,0xa8,0x00,0x00,0x00,             //  sub           $0xa8,%rsp
+    0xc5,0x78,0x29,0xbc,0x24,0x90,0x00,0x00,0x00,   //  vmovaps       %xmm15,0x90(%rsp)
+    0xc5,0x78,0x29,0xb4,0x24,0x80,0x00,0x00,0x00,   //  vmovaps       %xmm14,0x80(%rsp)
+    0xc5,0x78,0x29,0x6c,0x24,0x70,                  //  vmovaps       %xmm13,0x70(%rsp)
+    0xc5,0x78,0x29,0x64,0x24,0x60,                  //  vmovaps       %xmm12,0x60(%rsp)
+    0xc5,0x78,0x29,0x5c,0x24,0x50,                  //  vmovaps       %xmm11,0x50(%rsp)
+    0xc5,0x78,0x29,0x54,0x24,0x40,                  //  vmovaps       %xmm10,0x40(%rsp)
+    0xc5,0x78,0x29,0x4c,0x24,0x30,                  //  vmovaps       %xmm9,0x30(%rsp)
+    0xc5,0x78,0x29,0x44,0x24,0x20,                  //  vmovaps       %xmm8,0x20(%rsp)
+    0xc5,0xf8,0x29,0x7c,0x24,0x10,                  //  vmovaps       %xmm7,0x10(%rsp)
+    0xc5,0xf8,0x29,0x34,0x24,                       //  vmovaps       %xmm6,(%rsp)
+    0x48,0x89,0xd6,                                 //  mov           %rdx,%rsi
+    0x48,0xad,                                      //  lods          %ds:(%rsi),%rax
+    0xc5,0xfc,0x57,0xc0,                            //  vxorps        %ymm0,%ymm0,%ymm0
+    0xc5,0xf4,0x57,0xc9,                            //  vxorps        %ymm1,%ymm1,%ymm1
+    0xc5,0xec,0x57,0xd2,                            //  vxorps        %ymm2,%ymm2,%ymm2
+    0xc5,0xe4,0x57,0xdb,                            //  vxorps        %ymm3,%ymm3,%ymm3
+    0xc5,0xdc,0x57,0xe4,                            //  vxorps        %ymm4,%ymm4,%ymm4
+    0xc5,0xd4,0x57,0xed,                            //  vxorps        %ymm5,%ymm5,%ymm5
+    0xc5,0xcc,0x57,0xf6,                            //  vxorps        %ymm6,%ymm6,%ymm6
+    0xc5,0xc4,0x57,0xff,                            //  vxorps        %ymm7,%ymm7,%ymm7
+    0x48,0x89,0xcf,                                 //  mov           %rcx,%rdi
+    0x4c,0x89,0xc2,                                 //  mov           %r8,%rdx
+    0xff,0xd0,                                      //  callq         *%rax
+    0xc5,0xf8,0x28,0x34,0x24,                       //  vmovaps       (%rsp),%xmm6
+    0xc5,0xf8,0x28,0x7c,0x24,0x10,                  //  vmovaps       0x10(%rsp),%xmm7
+    0xc5,0x78,0x28,0x44,0x24,0x20,                  //  vmovaps       0x20(%rsp),%xmm8
+    0xc5,0x78,0x28,0x4c,0x24,0x30,                  //  vmovaps       0x30(%rsp),%xmm9
+    0xc5,0x78,0x28,0x54,0x24,0x40,                  //  vmovaps       0x40(%rsp),%xmm10
+    0xc5,0x78,0x28,0x5c,0x24,0x50,                  //  vmovaps       0x50(%rsp),%xmm11
+    0xc5,0x78,0x28,0x64,0x24,0x60,                  //  vmovaps       0x60(%rsp),%xmm12
+    0xc5,0x78,0x28,0x6c,0x24,0x70,                  //  vmovaps       0x70(%rsp),%xmm13
+    0xc5,0x78,0x28,0xb4,0x24,0x80,0x00,0x00,0x00,   //  vmovaps       0x80(%rsp),%xmm14
+    0xc5,0x78,0x28,0xbc,0x24,0x90,0x00,0x00,0x00,   //  vmovaps       0x90(%rsp),%xmm15
+    0x48,0x81,0xc4,0xa8,0x00,0x00,0x00,             //  add           $0xa8,%rsp
+    0x5f,                                           //  pop           %rdi
+    0x5e,                                           //  pop           %rsi
     0xc5,0xf8,0x77,                                 //  vzeroupper
     0xc3,                                           //  retq
 };
+static const unsigned char hsw_sk_just_return[] = {
+    0xc3,                                           //  retq
+};
 static const unsigned char hsw_sk_seed_shader[] = {
     0x48,0xad,                                      //  lods          %ds:(%rsi),%rax
     0xc5,0xf9,0x6e,0xc7,                            //  vmovd         %edi,%xmm0
@@ -1514,6 +1591,60 @@ static const unsigned char hsw_sk_linear_gradient_2stops[] = {
     0xc5,0x7c,0x29,0xc0,                            //  vmovaps       %ymm8,%ymm0
     0xff,0xe0,                                      //  jmpq          *%rax
 };
+static const unsigned char sse41_sk_start_pipeline[] = {
+    0x48,0xad,                                      //  lods          %ds:(%rsi),%rax
+    0x0f,0x57,0xc0,                                 //  xorps         %xmm0,%xmm0
+    0x0f,0x57,0xc9,                                 //  xorps         %xmm1,%xmm1
+    0x0f,0x57,0xd2,                                 //  xorps         %xmm2,%xmm2
+    0x0f,0x57,0xdb,                                 //  xorps         %xmm3,%xmm3
+    0x0f,0x57,0xe4,                                 //  xorps         %xmm4,%xmm4
+    0x0f,0x57,0xed,                                 //  xorps         %xmm5,%xmm5
+    0x0f,0x57,0xf6,                                 //  xorps         %xmm6,%xmm6
+    0x0f,0x57,0xff,                                 //  xorps         %xmm7,%xmm7
+    0xff,0xe0,                                      //  jmpq          *%rax
+};
+static const unsigned char sse41_sk_start_pipeline_ms[] = {
+    0x56,                                           //  push          %rsi
+    0x57,                                           //  push          %rdi
+    0x48,0x81,0xec,0xa8,0x00,0x00,0x00,             //  sub           $0xa8,%rsp
+    0x44,0x0f,0x29,0xbc,0x24,0x90,0x00,0x00,0x00,   //  movaps        %xmm15,0x90(%rsp)
+    0x44,0x0f,0x29,0xb4,0x24,0x80,0x00,0x00,0x00,   //  movaps        %xmm14,0x80(%rsp)
+    0x44,0x0f,0x29,0x6c,0x24,0x70,                  //  movaps        %xmm13,0x70(%rsp)
+    0x44,0x0f,0x29,0x64,0x24,0x60,                  //  movaps        %xmm12,0x60(%rsp)
+    0x44,0x0f,0x29,0x5c,0x24,0x50,                  //  movaps        %xmm11,0x50(%rsp)
+    0x44,0x0f,0x29,0x54,0x24,0x40,                  //  movaps        %xmm10,0x40(%rsp)
+    0x44,0x0f,0x29,0x4c,0x24,0x30,                  //  movaps        %xmm9,0x30(%rsp)
+    0x44,0x0f,0x29,0x44,0x24,0x20,                  //  movaps        %xmm8,0x20(%rsp)
+    0x0f,0x29,0x7c,0x24,0x10,                       //  movaps        %xmm7,0x10(%rsp)
+    0x0f,0x29,0x34,0x24,                            //  movaps        %xmm6,(%rsp)
+    0x48,0x89,0xd6,                                 //  mov           %rdx,%rsi
+    0x48,0xad,                                      //  lods          %ds:(%rsi),%rax
+    0x0f,0x57,0xc0,                                 //  xorps         %xmm0,%xmm0
+    0x0f,0x57,0xc9,                                 //  xorps         %xmm1,%xmm1
+    0x0f,0x57,0xd2,                                 //  xorps         %xmm2,%xmm2
+    0x0f,0x57,0xdb,                                 //  xorps         %xmm3,%xmm3
+    0x0f,0x57,0xe4,                                 //  xorps         %xmm4,%xmm4
+    0x0f,0x57,0xed,                                 //  xorps         %xmm5,%xmm5
+    0x0f,0x57,0xf6,                                 //  xorps         %xmm6,%xmm6
+    0x0f,0x57,0xff,                                 //  xorps         %xmm7,%xmm7
+    0x48,0x89,0xcf,                                 //  mov           %rcx,%rdi
+    0x4c,0x89,0xc2,                                 //  mov           %r8,%rdx
+    0xff,0xd0,                                      //  callq         *%rax
+    0x0f,0x28,0x34,0x24,                            //  movaps        (%rsp),%xmm6
+    0x0f,0x28,0x7c,0x24,0x10,                       //  movaps        0x10(%rsp),%xmm7
+    0x44,0x0f,0x28,0x44,0x24,0x20,                  //  movaps        0x20(%rsp),%xmm8
+    0x44,0x0f,0x28,0x4c,0x24,0x30,                  //  movaps        0x30(%rsp),%xmm9
+    0x44,0x0f,0x28,0x54,0x24,0x40,                  //  movaps        0x40(%rsp),%xmm10
+    0x44,0x0f,0x28,0x5c,0x24,0x50,                  //  movaps        0x50(%rsp),%xmm11
+    0x44,0x0f,0x28,0x64,0x24,0x60,                  //  movaps        0x60(%rsp),%xmm12
+    0x44,0x0f,0x28,0x6c,0x24,0x70,                  //  movaps        0x70(%rsp),%xmm13
+    0x44,0x0f,0x28,0xb4,0x24,0x80,0x00,0x00,0x00,   //  movaps        0x80(%rsp),%xmm14
+    0x44,0x0f,0x28,0xbc,0x24,0x90,0x00,0x00,0x00,   //  movaps        0x90(%rsp),%xmm15
+    0x48,0x81,0xc4,0xa8,0x00,0x00,0x00,             //  add           $0xa8,%rsp
+    0x5f,                                           //  pop           %rdi
+    0x5e,                                           //  pop           %rsi
+    0xc3,                                           //  retq
+};
 static const unsigned char sse41_sk_just_return[] = {
     0xc3,                                           //  retq
 };
@@ -2145,6 +2276,60 @@ static const unsigned char sse41_sk_linear_gradient_2stops[] = {
     0x41,0x0f,0x28,0xc0,                            //  movaps        %xmm8,%xmm0
     0xff,0xe0,                                      //  jmpq          *%rax
 };
+static const unsigned char sse2_sk_start_pipeline[] = {
+    0x48,0xad,                                      //  lods          %ds:(%rsi),%rax
+    0x0f,0x57,0xc0,                                 //  xorps         %xmm0,%xmm0
+    0x0f,0x57,0xc9,                                 //  xorps         %xmm1,%xmm1
+    0x0f,0x57,0xd2,                                 //  xorps         %xmm2,%xmm2
+    0x0f,0x57,0xdb,                                 //  xorps         %xmm3,%xmm3
+    0x0f,0x57,0xe4,                                 //  xorps         %xmm4,%xmm4
+    0x0f,0x57,0xed,                                 //  xorps         %xmm5,%xmm5
+    0x0f,0x57,0xf6,                                 //  xorps         %xmm6,%xmm6
+    0x0f,0x57,0xff,                                 //  xorps         %xmm7,%xmm7
+    0xff,0xe0,                                      //  jmpq          *%rax
+};
+static const unsigned char sse2_sk_start_pipeline_ms[] = {
+    0x56,                                           //  push          %rsi
+    0x57,                                           //  push          %rdi
+    0x48,0x81,0xec,0xa8,0x00,0x00,0x00,             //  sub           $0xa8,%rsp
+    0x44,0x0f,0x29,0xbc,0x24,0x90,0x00,0x00,0x00,   //  movaps        %xmm15,0x90(%rsp)
+    0x44,0x0f,0x29,0xb4,0x24,0x80,0x00,0x00,0x00,   //  movaps        %xmm14,0x80(%rsp)
+    0x44,0x0f,0x29,0x6c,0x24,0x70,                  //  movaps        %xmm13,0x70(%rsp)
+    0x44,0x0f,0x29,0x64,0x24,0x60,                  //  movaps        %xmm12,0x60(%rsp)
+    0x44,0x0f,0x29,0x5c,0x24,0x50,                  //  movaps        %xmm11,0x50(%rsp)
+    0x44,0x0f,0x29,0x54,0x24,0x40,                  //  movaps        %xmm10,0x40(%rsp)
+    0x44,0x0f,0x29,0x4c,0x24,0x30,                  //  movaps        %xmm9,0x30(%rsp)
+    0x44,0x0f,0x29,0x44,0x24,0x20,                  //  movaps        %xmm8,0x20(%rsp)
+    0x0f,0x29,0x7c,0x24,0x10,                       //  movaps        %xmm7,0x10(%rsp)
+    0x0f,0x29,0x34,0x24,                            //  movaps        %xmm6,(%rsp)
+    0x48,0x89,0xd6,                                 //  mov           %rdx,%rsi
+    0x48,0xad,                                      //  lods          %ds:(%rsi),%rax
+    0x0f,0x57,0xc0,                                 //  xorps         %xmm0,%xmm0
+    0x0f,0x57,0xc9,                                 //  xorps         %xmm1,%xmm1
+    0x0f,0x57,0xd2,                                 //  xorps         %xmm2,%xmm2
+    0x0f,0x57,0xdb,                                 //  xorps         %xmm3,%xmm3
+    0x0f,0x57,0xe4,                                 //  xorps         %xmm4,%xmm4
+    0x0f,0x57,0xed,                                 //  xorps         %xmm5,%xmm5
+    0x0f,0x57,0xf6,                                 //  xorps         %xmm6,%xmm6
+    0x0f,0x57,0xff,                                 //  xorps         %xmm7,%xmm7
+    0x48,0x89,0xcf,                                 //  mov           %rcx,%rdi
+    0x4c,0x89,0xc2,                                 //  mov           %r8,%rdx
+    0xff,0xd0,                                      //  callq         *%rax
+    0x0f,0x28,0x34,0x24,                            //  movaps        (%rsp),%xmm6
+    0x0f,0x28,0x7c,0x24,0x10,                       //  movaps        0x10(%rsp),%xmm7
+    0x44,0x0f,0x28,0x44,0x24,0x20,                  //  movaps        0x20(%rsp),%xmm8
+    0x44,0x0f,0x28,0x4c,0x24,0x30,                  //  movaps        0x30(%rsp),%xmm9
+    0x44,0x0f,0x28,0x54,0x24,0x40,                  //  movaps        0x40(%rsp),%xmm10
+    0x44,0x0f,0x28,0x5c,0x24,0x50,                  //  movaps        0x50(%rsp),%xmm11
+    0x44,0x0f,0x28,0x64,0x24,0x60,                  //  movaps        0x60(%rsp),%xmm12
+    0x44,0x0f,0x28,0x6c,0x24,0x70,                  //  movaps        0x70(%rsp),%xmm13
+    0x44,0x0f,0x28,0xb4,0x24,0x80,0x00,0x00,0x00,   //  movaps        0x80(%rsp),%xmm14
+    0x44,0x0f,0x28,0xbc,0x24,0x90,0x00,0x00,0x00,   //  movaps        0x90(%rsp),%xmm15
+    0x48,0x81,0xc4,0xa8,0x00,0x00,0x00,             //  add           $0xa8,%rsp
+    0x5f,                                           //  pop           %rdi
+    0x5e,                                           //  pop           %rsi
+    0xc3,                                           //  retq
+};
 static const unsigned char sse2_sk_just_return[] = {
     0xc3,                                           //  retq
 };
index fa771c0..0c31051 100644 (file)
@@ -200,13 +200,24 @@ static void* load_and_inc(void**& program) {
     static void name##_k(size_t& x, void* ctx, K* k,                          \
                          F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
 
-// A glue Stage to end the tail call chain, finally returning to the caller.
-extern "C" void sk_just_return(size_t, void**, K*, F,F,F,F, F,F,F,F) {
-#if defined(JUMPER) && defined(__AVX2__)
-    _mm256_zeroupper();
-#endif
+// Some glue stages that don't fit the normal pattern of stages.
+
+extern "C" void sk_start_pipeline(size_t x, void** program, K* k) {
+    auto next = (Stage*)load_and_inc(program);
+    F v{};   // TODO: faster uninitialized?
+    next(x,program,k, v,v,v,v, v,v,v,v);
 }
 
+#if defined(JUMPER) && defined(__x86_64__)
+    __attribute__((ms_abi))
+    extern "C" void sk_start_pipeline_ms(size_t x, void** program, K* k) {
+        sk_start_pipeline(x,program,k);
+    }
+#endif
+
+// Ends the chain of tail calls, returning back up to start_pipeline (and from there to the caller).
+extern "C" void sk_just_return(size_t, void**, K*, F,F,F,F, F,F,F,F) {}
+
 // We can now define Stages!
 
 // Some things to keep in mind while writing Stages: