some armv7 hacking
authorMike Klein <mtklein@chromium.org>
Thu, 12 Jan 2017 16:36:46 +0000 (11:36 -0500)
committerSkia Commit-Bot <skia-commit-bot@chromium.org>
Fri, 13 Jan 2017 17:25:15 +0000 (17:25 +0000)
We can splice these stages if we drop them down to 2 at a time.
Turns out this is significantly (2-3x) faster than the status quo.

    SkRasterPipeline_…
    …f16_compile 1x  …srgb_compile 2.06x  …f16_run 3.08x  …srgb_run 4.61x

Added a couple ways to detect (likely) the required VFPv4 support:
   - use hwcap when available (NDK ≥21, Android framework)
   - use cpu-features when not (NDK <21)

The code in SkSplicer_generated.h is ARM, not Thumb2.  SkSplicer seems
to be blx'ing into it, so that's great, and we bx lr out.  There's no
point in attempting to use Thumb2 in vector heavy code... it'll all be
4 byte anyway.

Follow ups:
   - vpush {d8-d9} before the loop, vpop {d8-d9} afterwards,
     skip these instructions when splicing;
   - (probably) drop jumping stages down to 2-at-a-time also.

Change-Id: If151394ec10e8cbd6a05e2d81808488d743bfe15
Reviewed-on: https://skia-review.googlesource.com/6940
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>

BUILD.gn
src/core/SkCpu.cpp
src/splicer/SkSplicer.cpp
src/splicer/SkSplicer_generated.h
src/splicer/SkSplicer_stages.cpp
src/splicer/build_stages.py

index 16eb634e6f850c836c9a1899811cb675d0d98f9d..ddc884b692188bc92109f741efbd11fe92cbedbb 100644 (file)
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -30,9 +30,7 @@ declare_args() {
   skia_enable_discrete_gpu = true
   skia_enable_gpu = true
   skia_enable_pdf = true
-  skia_enable_splicer =
-      is_skia_standalone && sanitize != "MSAN" &&
-      (is_linux || is_mac || is_win || (is_android && target_cpu == "arm64"))
+  skia_enable_splicer = is_skia_standalone && sanitize != "MSAN" && !is_ios
   skia_enable_tools = is_skia_standalone
   skia_enable_vulkan_debug_layers = is_skia_standalone && is_debug
   skia_vulkan_sdk = getenv("VULKAN_SDK")
@@ -648,7 +646,10 @@ component("skia") {
   }
 
   if (is_android) {
-    deps += [ "//third_party/expat" ]
+    deps += [
+      "//third_party/cpu-features",
+      "//third_party/expat",
+    ]
     sources += [ "src/ports/SkDebug_android.cpp" ]
     libs += [
       "EGL",
index 28bdf6936dd52e00dcf7024e465bfe1388a2dbd5..1ae6723983f85fce38ffb2c735fab8abe45f0f33 100644 (file)
@@ -8,6 +8,10 @@
 #include "SkCpu.h"
 #include "SkOnce.h"
 
+#if !defined(__has_include)
+    #define __has_include(x) 0
+#endif
+
 #if defined(SK_CPU_X86)
     #if defined(SK_BUILD_FOR_WIN32)
         #include <intrin.h>
         return features;
     }
 
+#elif defined(SK_CPU_ARM32) && defined(SK_BUILD_FOR_ANDROID) && \
+    __has_include(<asm/hwcap.h>) && __has_include(<sys/auxv.h>)
+    // asm/hwcap.h and sys/auxv.h won't be present on builds targeting NDK APIs before 21.
+    #include <asm/hwcap.h>
+    #include <sys/auxv.h>
+
+    static uint32_t read_cpu_features() {
+        uint32_t features = 0;
+        uint32_t hwcaps = getauxval(AT_HWCAP);
+        if (hwcaps & HWCAP_VFPv4) { features |= SkCpu::NEON|SkCpu::NEON_FMA|SkCpu::VFP_FP16; }
+        return features;
+    }
+
+#elif defined(SK_CPU_ARM32) && defined(SK_BUILD_FOR_ANDROID) && \
+    !defined(SK_BUILD_FOR_ANDROID_FRAMEWORK)
+    #include <cpu-features.h>
+
+    static uint32_t read_cpu_features() {
+        uint32_t features = 0;
+        uint64_t cpu_features = android_getCpuFeatures();
+        if (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON)     { features |= SkCpu::NEON; }
+        if (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON_FMA) { features |= SkCpu::NEON_FMA; }
+        if (cpu_features & ANDROID_CPU_ARM_FEATURE_VFP_FP16) { features |= SkCpu::VFP_FP16; }
+        return features;
+    }
+
 #else
     static uint32_t read_cpu_features() {
         return 0;
index b2745538d13293280150c2538ccd4fa13cd348f1..fcff0afbdeecdc638263a08e804f71e07a51f50d 100644 (file)
 //   $ ./iaca.sh -arch HSW -64 -mark 0 /tmp/dump.bin | less
 //
 // To disassemble an aarch64 dump,
-//   $ gobjdump -b binary -m aarch64 -D dump.bin
+//   $ gobjdump -b binary -D dump.bin -m aarch64
+//
+// To disassemble an armv7 dump,
+//   $ gobjdump -b binary -D dump.bin -m arm
 
 namespace {
 
@@ -66,6 +69,28 @@ namespace {
     static void ret(SkWStream* buf) {
         splice(buf, 0xd65f03c0);  // ret
     }
+#elif defined(__ARM_NEON__)
+    static constexpr int kStride = 2;
+    static void set_ctx(SkWStream* buf, void* ctx) {
+        uint16_t parts[2];
+        auto encode = [](uint16_t part) -> uint32_t {
+            return (part & 0xf000) << 4 | (part & 0xfff);
+        };
+        memcpy(parts, &ctx, 4);
+        splice(buf, 0xe3002000 | encode(parts[0]));  // mov  r2, <bottom 16 bits>
+        splice(buf, 0xe3402000 | encode(parts[1]));  // movt r2,    <top 16 bits>
+    }
+    static void loop(SkWStream* buf, int loop_start) {
+        splice(buf, 0xe2800002);  // add r0, r0, #2
+        splice(buf, 0xe1500001);  // cmp r0, r1
+        int off = loop_start - ((int)buf->bytesWritten() + 8 /*ARM is weird*/);
+        off /= 4;   // bytes -> instructions, still signed
+        off = (off & 0x00ffffff);
+        splice(buf,  0x3a000000 | off);  // bcc loop_start
+    }
+    static void ret(SkWStream* buf) {
+        splice(buf, 0xe12fff1e);  // bx lr
+    }
 #else
     static constexpr int kStride = 8;
     static void set_ctx(SkWStream* buf, void* ctx) {
@@ -132,7 +157,7 @@ namespace {
         };
         splice(buf, system_v_to_ms);
     }
-#elif !defined(__aarch64__) && defined(DUMP)
+#elif !defined(__aarch64__) && !defined(__ARM_NEON__) && defined(DUMP)
     // IACA start and end markers.
     static const uint8_t      ud2[] = { 0x0f, 0x0b };         // undefined... crashes when run
     static const uint8_t     nop3[] = { 0x64, 0x67, 0x90 };   // 3 byte no-op
@@ -222,8 +247,14 @@ namespace {
             fSpliced    = nullptr;
             // If we return early anywhere in here, !fSpliced means we'll use fBackup instead.
 
-        #if !defined(__aarch64__)
-            // To keep things simple, only one target supported: Haswell+ x86-64.
+        #if defined(__aarch64__)
+        #elif defined(__ARM_NEON__)
+            // Late generation ARMv7, e.g. Cortex A15 or Krait.
+            if (!SkCpu::Supports(SkCpu::NEON|SkCpu::NEON_FMA|SkCpu::VFP_FP16)) {
+                return;
+            }
+        #else
+            // To keep things simple, only one x86 target supported: Haswell+ x86-64.
             if (!SkCpu::Supports(SkCpu::HSW) || sizeof(void*) != 8) {
                 return;
             }
index 7baf782844df0c7857b9bffa55a5e8d14d7ee86e..df4bd0333158222ff8f8446d85cee235baad639e 100644 (file)
@@ -282,6 +282,306 @@ static const unsigned int kSplice_store_f16[] = {
     0x0c000510,                                 //  st4           {v16.4h-v19.4h}, [x8]
 };
 
+#elif defined(__ARM_NEON__)
+
+static const unsigned int kSplice_clear[] = {
+    0xf2800010,                                 //  vmov.i32      d0, #0
+    0xf2801010,                                 //  vmov.i32      d1, #0
+    0xf2802010,                                 //  vmov.i32      d2, #0
+    0xf2803010,                                 //  vmov.i32      d3, #0
+};
+static const unsigned int kSplice_plus[] = {
+    0xf2000d04,                                 //  vadd.f32      d0, d0, d4
+    0xf2011d05,                                 //  vadd.f32      d1, d1, d5
+    0xf2022d06,                                 //  vadd.f32      d2, d2, d6
+    0xf2033d07,                                 //  vadd.f32      d3, d3, d7
+};
+static const unsigned int kSplice_srcover[] = {
+    0xe283c004,                                 //  add           ip, r3, #4
+    0xf4ec0c9f,                                 //  vld1.32       {d16[]}, [ip :32]
+    0xf2600d83,                                 //  vsub.f32      d16, d16, d3
+    0xf2040c30,                                 //  vfma.f32      d0, d4, d16
+    0xf2051c30,                                 //  vfma.f32      d1, d5, d16
+    0xf2062c30,                                 //  vfma.f32      d2, d6, d16
+    0xf2063c30,                                 //  vfma.f32      d3, d6, d16
+};
+static const unsigned int kSplice_dstover[] = {
+    0xe283c004,                                 //  add           ip, r3, #4
+    0xf4ec0c9f,                                 //  vld1.32       {d16[]}, [ip :32]
+    0xf2600d87,                                 //  vsub.f32      d16, d16, d7
+    0xf2004c30,                                 //  vfma.f32      d4, d0, d16
+    0xf2015c30,                                 //  vfma.f32      d5, d1, d16
+    0xf2026c30,                                 //  vfma.f32      d6, d2, d16
+    0xf2027c30,                                 //  vfma.f32      d7, d2, d16
+};
+static const unsigned int kSplice_clamp_0[] = {
+    0xf2c00010,                                 //  vmov.i32      d16, #0
+    0xf2000f20,                                 //  vmax.f32      d0, d0, d16
+    0xf2011f20,                                 //  vmax.f32      d1, d1, d16
+    0xf2022f20,                                 //  vmax.f32      d2, d2, d16
+    0xf2033f20,                                 //  vmax.f32      d3, d3, d16
+};
+static const unsigned int kSplice_clamp_1[] = {
+    0xe283c004,                                 //  add           ip, r3, #4
+    0xf4ec0c9f,                                 //  vld1.32       {d16[]}, [ip :32]
+    0xf2200f20,                                 //  vmin.f32      d0, d0, d16
+    0xf2211f20,                                 //  vmin.f32      d1, d1, d16
+    0xf2222f20,                                 //  vmin.f32      d2, d2, d16
+    0xf2233f20,                                 //  vmin.f32      d3, d3, d16
+};
+static const unsigned int kSplice_clamp_a[] = {
+    0xe283c004,                                 //  add           ip, r3, #4
+    0xf4ec0c9f,                                 //  vld1.32       {d16[]}, [ip :32]
+    0xf2233f20,                                 //  vmin.f32      d3, d3, d16
+    0xf2200f03,                                 //  vmin.f32      d0, d0, d3
+    0xf2211f03,                                 //  vmin.f32      d1, d1, d3
+    0xf2222f03,                                 //  vmin.f32      d2, d2, d3
+};
+static const unsigned int kSplice_swap[] = {
+    0xeef00b43,                                 //  vmov.f64      d16, d3
+    0xeef01b42,                                 //  vmov.f64      d17, d2
+    0xeef02b41,                                 //  vmov.f64      d18, d1
+    0xeef03b40,                                 //  vmov.f64      d19, d0
+    0xeeb00b44,                                 //  vmov.f64      d0, d4
+    0xeeb01b45,                                 //  vmov.f64      d1, d5
+    0xeeb02b46,                                 //  vmov.f64      d2, d6
+    0xeeb03b47,                                 //  vmov.f64      d3, d7
+    0xeeb04b63,                                 //  vmov.f64      d4, d19
+    0xeeb05b62,                                 //  vmov.f64      d5, d18
+    0xeeb06b61,                                 //  vmov.f64      d6, d17
+    0xeeb07b60,                                 //  vmov.f64      d7, d16
+};
+static const unsigned int kSplice_move_src_dst[] = {
+    0xeeb04b40,                                 //  vmov.f64      d4, d0
+    0xeeb05b41,                                 //  vmov.f64      d5, d1
+    0xeeb06b42,                                 //  vmov.f64      d6, d2
+    0xeeb07b43,                                 //  vmov.f64      d7, d3
+};
+static const unsigned int kSplice_move_dst_src[] = {
+    0xeeb00b44,                                 //  vmov.f64      d0, d4
+    0xeeb01b45,                                 //  vmov.f64      d1, d5
+    0xeeb02b46,                                 //  vmov.f64      d2, d6
+    0xeeb03b47,                                 //  vmov.f64      d3, d7
+};
+static const unsigned int kSplice_premul[] = {
+    0xf3000d13,                                 //  vmul.f32      d0, d0, d3
+    0xf3011d13,                                 //  vmul.f32      d1, d1, d3
+    0xf3022d13,                                 //  vmul.f32      d2, d2, d3
+};
+static const unsigned int kSplice_unpremul[] = {
+    0xed2d8b04,                                 //  vpush         {d8-d9}
+    0xed938a01,                                 //  vldr          s16, [r3, #4]
+    0xf2c00010,                                 //  vmov.i32      d16, #0
+    0xf3f91503,                                 //  vceq.f32      d17, d3, #0
+    0xeec89a23,                                 //  vdiv.f32      s19, s16, s7
+    0xee889a03,                                 //  vdiv.f32      s18, s16, s6
+    0xf3501199,                                 //  vbsl          d17, d16, d9
+    0xf3010d90,                                 //  vmul.f32      d0, d17, d0
+    0xf3011d91,                                 //  vmul.f32      d1, d17, d1
+    0xf3012d92,                                 //  vmul.f32      d2, d17, d2
+    0xecbd8b04,                                 //  vpop          {d8-d9}
+};
+static const unsigned int kSplice_from_srgb[] = {
+    0xed2d8b02,                                 //  vpush         {d8}
+    0xe283c018,                                 //  add           ip, r3, #24
+    0xed938a07,                                 //  vldr          s16, [r3, #28]
+    0xf3402d10,                                 //  vmul.f32      d18, d0, d0
+    0xf4ec0c9f,                                 //  vld1.32       {d16[]}, [ip :32]
+    0xe283c014,                                 //  add           ip, r3, #20
+    0xf3413d11,                                 //  vmul.f32      d19, d1, d1
+    0xf4ec1c9f,                                 //  vld1.32       {d17[]}, [ip :32]
+    0xe283c020,                                 //  add           ip, r3, #32
+    0xf26141b1,                                 //  vorr          d20, d17, d17
+    0xf26171b1,                                 //  vorr          d23, d17, d17
+    0xf4ec8c9f,                                 //  vld1.32       {d24[]}, [ip :32]
+    0xf2404c30,                                 //  vfma.f32      d20, d0, d16
+    0xe283c010,                                 //  add           ip, r3, #16
+    0xf2417c30,                                 //  vfma.f32      d23, d1, d16
+    0xf2421c30,                                 //  vfma.f32      d17, d2, d16
+    0xf3425d12,                                 //  vmul.f32      d21, d2, d2
+    0xf2e16948,                                 //  vmul.f32      d22, d1, d8[0]
+    0xf2e00948,                                 //  vmul.f32      d16, d0, d8[0]
+    0xf2e29948,                                 //  vmul.f32      d25, d2, d8[0]
+    0xf3282e82,                                 //  vcgt.f32      d2, d24, d2
+    0xf3281e81,                                 //  vcgt.f32      d1, d24, d1
+    0xf3280e80,                                 //  vcgt.f32      d0, d24, d0
+    0xf4ec8c9f,                                 //  vld1.32       {d24[]}, [ip :32]
+    0xf268a1b8,                                 //  vorr          d26, d24, d24
+    0xf242acb4,                                 //  vfma.f32      d26, d18, d20
+    0xf26821b8,                                 //  vorr          d18, d24, d24
+    0xf2432cb7,                                 //  vfma.f32      d18, d19, d23
+    0xf2458cb1,                                 //  vfma.f32      d24, d21, d17
+    0xf31001ba,                                 //  vbsl          d0, d16, d26
+    0xf31611b2,                                 //  vbsl          d1, d22, d18
+    0xf31921b8,                                 //  vbsl          d2, d25, d24
+    0xecbd8b02,                                 //  vpop          {d8}
+};
+static const unsigned int kSplice_to_srgb[] = {
+    0xed2d8b02,                                 //  vpush         {d8}
+    0xf3fb0580,                                 //  vrsqrte.f32   d16, d0
+    0xe283c02c,                                 //  add           ip, r3, #44
+    0xf3fb1582,                                 //  vrsqrte.f32   d17, d2
+    0xed938a09,                                 //  vldr          s16, [r3, #36]
+    0xf3fb2581,                                 //  vrsqrte.f32   d18, d1
+    0xf3403db0,                                 //  vmul.f32      d19, d16, d16
+    0xf3414db1,                                 //  vmul.f32      d20, d17, d17
+    0xf3425db2,                                 //  vmul.f32      d21, d18, d18
+    0xf2603f33,                                 //  vrsqrts.f32   d19, d0, d19
+    0xf2624f34,                                 //  vrsqrts.f32   d20, d2, d20
+    0xf2615f35,                                 //  vrsqrts.f32   d21, d1, d21
+    0xf3400db3,                                 //  vmul.f32      d16, d16, d19
+    0xf3411db4,                                 //  vmul.f32      d17, d17, d20
+    0xf3422db5,                                 //  vmul.f32      d18, d18, d21
+    0xf3fb3520,                                 //  vrecpe.f32    d19, d16
+    0xf3fb4521,                                 //  vrecpe.f32    d20, d17
+    0xf3fb6522,                                 //  vrecpe.f32    d22, d18
+    0xf3fb55a1,                                 //  vrsqrte.f32   d21, d17
+    0xf3fb75a0,                                 //  vrsqrte.f32   d23, d16
+    0xf3fb85a2,                                 //  vrsqrte.f32   d24, d18
+    0xf2409fb3,                                 //  vrecps.f32    d25, d16, d19
+    0xf241afb4,                                 //  vrecps.f32    d26, d17, d20
+    0xf242bfb6,                                 //  vrecps.f32    d27, d18, d22
+    0xf345cdb5,                                 //  vmul.f32      d28, d21, d21
+    0xf347ddb7,                                 //  vmul.f32      d29, d23, d23
+    0xf348edb8,                                 //  vmul.f32      d30, d24, d24
+    0xf2611fbc,                                 //  vrsqrts.f32   d17, d17, d28
+    0xf2600fbd,                                 //  vrsqrts.f32   d16, d16, d29
+    0xf2622fbe,                                 //  vrsqrts.f32   d18, d18, d30
+    0xf3433db9,                                 //  vmul.f32      d19, d19, d25
+    0xf4ec9c9f,                                 //  vld1.32       {d25[]}, [ip :32]
+    0xe283c030,                                 //  add           ip, r3, #48
+    0xf3444dba,                                 //  vmul.f32      d20, d20, d26
+    0xf3466dbb,                                 //  vmul.f32      d22, d22, d27
+    0xf4ecac9f,                                 //  vld1.32       {d26[]}, [ip :32]
+    0xe283c028,                                 //  add           ip, r3, #40
+    0xf26ab1ba,                                 //  vorr          d27, d26, d26
+    0xf249bcb3,                                 //  vfma.f32      d27, d25, d19
+    0xf26a31ba,                                 //  vorr          d19, d26, d26
+    0xf2493cb4,                                 //  vfma.f32      d19, d25, d20
+    0xf4ec4c9f,                                 //  vld1.32       {d20[]}, [ip :32]
+    0xf249acb6,                                 //  vfma.f32      d26, d25, d22
+    0xe283c034,                                 //  add           ip, r3, #52
+    0xf3470db0,                                 //  vmul.f32      d16, d23, d16
+    0xf3482db2,                                 //  vmul.f32      d18, d24, d18
+    0xf3451db1,                                 //  vmul.f32      d17, d21, d17
+    0xf244bcb0,                                 //  vfma.f32      d27, d20, d16
+    0xf2e20948,                                 //  vmul.f32      d16, d2, d8[0]
+    0xf244acb2,                                 //  vfma.f32      d26, d20, d18
+    0xf2443cb1,                                 //  vfma.f32      d19, d20, d17
+    0xf4ec4c9f,                                 //  vld1.32       {d20[]}, [ip :32]
+    0xf2e11948,                                 //  vmul.f32      d17, d1, d8[0]
+    0xe283c004,                                 //  add           ip, r3, #4
+    0xf2e02948,                                 //  vmul.f32      d18, d0, d8[0]
+    0xf3241e81,                                 //  vcgt.f32      d1, d20, d1
+    0xf4ec5c9f,                                 //  vld1.32       {d21[]}, [ip :32]
+    0xf3240e80,                                 //  vcgt.f32      d0, d20, d0
+    0xf3242e82,                                 //  vcgt.f32      d2, d20, d2
+    0xf2654fab,                                 //  vmin.f32      d20, d21, d27
+    0xf2656faa,                                 //  vmin.f32      d22, d21, d26
+    0xf2653fa3,                                 //  vmin.f32      d19, d21, d19
+    0xf31201b4,                                 //  vbsl          d0, d18, d20
+    0xf31111b6,                                 //  vbsl          d1, d17, d22
+    0xf31021b3,                                 //  vbsl          d2, d16, d19
+    0xecbd8b02,                                 //  vpop          {d8}
+};
+static const unsigned int kSplice_scale_u8[] = {
+    0xed2d8b02,                                 //  vpush         {d8}
+    0xe24dd008,                                 //  sub           sp, sp, #8
+    0xe592c000,                                 //  ldr           ip, [r2]
+    0xe08cc000,                                 //  add           ip, ip, r0
+    0xe1dcc0b0,                                 //  ldrh          ip, [ip]
+    0xe1cdc0b4,                                 //  strh          ip, [sp, #4]
+    0xe28dc004,                                 //  add           ip, sp, #4
+    0xed938a03,                                 //  vldr          s16, [r3, #12]
+    0xf4ec041f,                                 //  vld1.16       {d16[0]}, [ip :16]
+    0xf3c80a30,                                 //  vmovl.u8      q8, d16
+    0xf3d00a30,                                 //  vmovl.u16     q8, d16
+    0xf3fb06a0,                                 //  vcvt.f32.u32  d16, d16
+    0xf2e009c8,                                 //  vmul.f32      d16, d16, d8[0]
+    0xf3000d90,                                 //  vmul.f32      d0, d16, d0
+    0xf3001d91,                                 //  vmul.f32      d1, d16, d1
+    0xf3002d92,                                 //  vmul.f32      d2, d16, d2
+    0xf3003d93,                                 //  vmul.f32      d3, d16, d3
+    0xe28dd008,                                 //  add           sp, sp, #8
+    0xecbd8b02,                                 //  vpop          {d8}
+};
+static const unsigned int kSplice_load_8888[] = {
+    0xe592c000,                                 //  ldr           ip, [r2]
+    0xf4e30c9f,                                 //  vld1.32       {d16[]}, [r3 :32]
+    0xe08cc100,                                 //  add           ip, ip, r0, lsl #2
+    0xed932a03,                                 //  vldr          s4, [r3, #12]
+    0xeddc1b00,                                 //  vldr          d17, [ip]
+    0xf24021b1,                                 //  vand          d18, d16, d17
+    0xf3f83031,                                 //  vshr.u32      d19, d17, #8
+    0xf3e84031,                                 //  vshr.u32      d20, d17, #24
+    0xf3f01031,                                 //  vshr.u32      d17, d17, #16
+    0xf24031b3,                                 //  vand          d19, d16, d19
+    0xf24001b1,                                 //  vand          d16, d16, d17
+    0xf3fb2622,                                 //  vcvt.f32.s32  d18, d18
+    0xf3fb4624,                                 //  vcvt.f32.s32  d20, d20
+    0xf3fb1623,                                 //  vcvt.f32.s32  d17, d19
+    0xf3fb0620,                                 //  vcvt.f32.s32  d16, d16
+    0xf2a209c2,                                 //  vmul.f32      d0, d18, d2[0]
+    0xf2a439c2,                                 //  vmul.f32      d3, d20, d2[0]
+    0xf2a119c2,                                 //  vmul.f32      d1, d17, d2[0]
+    0xf2a029c2,                                 //  vmul.f32      d2, d16, d2[0]
+};
+static const unsigned int kSplice_store_8888[] = {
+    0xe283c008,                                 //  add           ip, r3, #8
+    0xf2c3261f,                                 //  vmov.i32      d18, #1056964608
+    0xf2c3361f,                                 //  vmov.i32      d19, #1056964608
+    0xf4ec1c9f,                                 //  vld1.32       {d17[]}, [ip :32]
+    0xf2c3061f,                                 //  vmov.i32      d16, #1056964608
+    0xf2412c31,                                 //  vfma.f32      d18, d1, d17
+    0xf2423c31,                                 //  vfma.f32      d19, d2, d17
+    0xf2c3461f,                                 //  vmov.i32      d20, #1056964608
+    0xe592c000,                                 //  ldr           ip, [r2]
+    0xf2400c31,                                 //  vfma.f32      d16, d0, d17
+    0xf2434c31,                                 //  vfma.f32      d20, d3, d17
+    0xe08cc100,                                 //  add           ip, ip, r0, lsl #2
+    0xf3fb17a2,                                 //  vcvt.u32.f32  d17, d18
+    0xf3fb27a3,                                 //  vcvt.u32.f32  d18, d19
+    0xf3fb07a0,                                 //  vcvt.u32.f32  d16, d16
+    0xf3fb37a4,                                 //  vcvt.u32.f32  d19, d20
+    0xf2e81531,                                 //  vshl.s32      d17, d17, #8
+    0xf2f02532,                                 //  vshl.s32      d18, d18, #16
+    0xf26101b0,                                 //  vorr          d16, d17, d16
+    0xf2f81533,                                 //  vshl.s32      d17, d19, #24
+    0xf26001b2,                                 //  vorr          d16, d16, d18
+    0xf26001b1,                                 //  vorr          d16, d16, d17
+    0xedcc0b00,                                 //  vstr          d16, [ip]
+};
+static const unsigned int kSplice_load_f16[] = {
+    0xed2d8b04,                                 //  vpush         {d8-d9}
+    0xe592c000,                                 //  ldr           ip, [r2]
+    0xe08cc180,                                 //  add           ip, ip, r0, lsl #3
+    0xf46c084f,                                 //  vld2.16       {d16-d17}, [ip]
+    0xf3b62720,                                 //  vcvt.f32.f16  q1, d16
+    0xf3b68721,                                 //  vcvt.f32.f16  q4, d17
+    0xf2220112,                                 //  vorr          d0, d2, d2
+    0xeef00a43,                                 //  vmov.f32      s1, s6
+    0xf2281118,                                 //  vorr          d1, d8, d8
+    0xeeb03a62,                                 //  vmov.f32      s6, s5
+    0xeef01a49,                                 //  vmov.f32      s3, s18
+    0xeeb09a68,                                 //  vmov.f32      s18, s17
+    0xeeb02b43,                                 //  vmov.f64      d2, d3
+    0xeeb03b49,                                 //  vmov.f64      d3, d9
+    0xecbd8b04,                                 //  vpop          {d8-d9}
+};
+static const unsigned int kSplice_store_f16[] = {
+    0xeef00b41,                                 //  vmov.f64      d16, d1
+    0xf2631113,                                 //  vorr          d17, d3, d3
+    0xeef02b40,                                 //  vmov.f64      d18, d0
+    0xf2623112,                                 //  vorr          d19, d2, d2
+    0xf3fa00a1,                                 //  vtrn.32       d16, d17
+    0xf3f61620,                                 //  vcvt.f16.f32  d17, q8
+    0xf3fa20a3,                                 //  vtrn.32       d18, d19
+    0xe592c000,                                 //  ldr           ip, [r2]
+    0xf3f60622,                                 //  vcvt.f16.f32  d16, q9
+    0xe08cc180,                                 //  add           ip, ip, r0, lsl #3
+    0xf44c084f,                                 //  vst2.16       {d16-d17}, [ip]
+};
+
 #else
 
 static const unsigned char kSplice_clear[] = {
index e3a19ea5a8bc42a0730cc9f6309658e2f367d95c..c45f204e2274a011af1b50a48b4ffbf13dcfbe44 100644 (file)
     using U8  = uint8_t  __attribute__((ext_vector_type(4)));
 
     // We polyfill a few routines that Clang doesn't build into ext_vector_types.
-    AI static U32 round(F v)                           { return vcvtnq_u32_f32(v);       }
     AI static F   min(F a, F b)                        { return vminq_f32(a,b);          }
     AI static F   max(F a, F b)                        { return vmaxq_f32(a,b);          }
     AI static F   fma(F f, F m, F a)                   { return vfmaq_f32(a,f,m);        }
     AI static F   rcp  (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e  ) * e; }
     AI static F   rsqrt(F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; }
     AI static F   if_then_else(I32 c, F t, F e)        { return vbslq_f32((U32)c,t,e);   }
+    AI static U32 round(F v, F scale)                  { return vcvtnq_u32_f32(v*scale); }
+
+#elif defined(__ARM_NEON__)
+    #if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
+        #error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfp4, without -mthumb.
+    #endif
+    #include <arm_neon.h>
+
+    // We can pass {s0-s15} as arguments under AAPCS-VFP.  We'll slice that as 8 d-registers.
+    using F   = float    __attribute__((ext_vector_type(2)));
+    using I32 =  int32_t __attribute__((ext_vector_type(2)));
+    using U32 = uint32_t __attribute__((ext_vector_type(2)));
+    using U8  = uint8_t  __attribute__((ext_vector_type(2)));
+
+    AI static F   min(F a, F b)                        { return vmin_f32(a,b);          }
+    AI static F   max(F a, F b)                        { return vmax_f32(a,b);          }
+    AI static F   fma(F f, F m, F a)                   { return vfma_f32(a,f,m);        }
+    AI static F   rcp  (F v)  { auto e = vrecpe_f32 (v); return vrecps_f32 (v,e  ) * e; }
+    AI static F   rsqrt(F v)  { auto e = vrsqrte_f32(v); return vrsqrts_f32(v,e*e) * e; }
+    AI static F   if_then_else(I32 c, F t, F e)        { return vbsl_f32((U32)c,t,e);   }
+    AI static U32 round(F v, F scale)                  { return vcvt_u32_f32(fma(v,scale,0.5f)); }
+
 #else
     #if !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__)
         #error On x86, compile with -mavx2 -mfma -mf16c.
     using U32 = uint32_t __attribute__((ext_vector_type(8)));
     using U8  = uint8_t  __attribute__((ext_vector_type(8)));
 
-    AI static U32 round(F v)                    { return _mm256_cvtps_epi32(v); }
     AI static F   min(F a, F b)                 { return _mm256_min_ps  (a,b);  }
     AI static F   max(F a, F b)                 { return _mm256_max_ps  (a,b);  }
     AI static F   fma(F f, F m, F a)            { return _mm256_fmadd_ps(f,m,a);}
     AI static F   rcp  (F v)                    { return _mm256_rcp_ps     (v); }
     AI static F   rsqrt(F v)                    { return _mm256_rsqrt_ps   (v); }
     AI static F   if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
+    AI static U32 round(F v, F scale)           { return _mm256_cvtps_epi32(v*scale); }
 #endif
 
 AI static F   cast  (U32 v) { return __builtin_convertvector((I32)v, F);   }
@@ -58,7 +79,12 @@ AI static U32 expand(U8  v) { return __builtin_convertvector(     v, U32); }
 
 // We'll be compiling this file to an object file, then extracting parts of it into
 // SkSplicer_generated.h.  It's easier to do if the function names are not C++ mangled.
-#define C extern "C"
+// On ARMv7, use aapcs-vfp calling convention to pass as much data in registers as possible.
+#if defined(__ARM_NEON__)
+    #define C extern "C" __attribute__((pcs("aapcs-vfp")))
+#else
+    #define C extern "C"
+#endif
 
 // Stages all fit a common interface that allows SkSplicer to splice them together.
 using K = const SkSplicer_constants;
@@ -240,10 +266,10 @@ STAGE(load_8888) {
 STAGE(store_8888) {
     auto ptr = *(uint32_t**)ctx + x;
 
-    U32 px = round(r * k->_255)
-           | round(g * k->_255) <<  8
-           | round(b * k->_255) << 16
-           | round(a * k->_255) << 24;
+    U32 px = round(r, k->_255)
+           | round(g, k->_255) <<  8
+           | round(b, k->_255) << 16
+           | round(a, k->_255) << 24;
     memcpy(ptr, &px, sizeof(px));
 }
 
@@ -256,6 +282,14 @@ STAGE(load_f16) {
     g = vcvt_f32_f16(halfs.val[1]);
     b = vcvt_f32_f16(halfs.val[2]);
     a = vcvt_f32_f16(halfs.val[3]);
+#elif defined(__ARM_NEON__)
+    auto rb_ga = vld2_f16((const float16_t*)ptr);
+    auto rb = vcvt_f32_f16(rb_ga.val[0]),
+         ga = vcvt_f32_f16(rb_ga.val[1]);
+    r = {rb[0], rb[2]};
+    g = {ga[0], ga[2]};
+    b = {rb[1], rb[3]};
+    a = {ga[1], ga[3]};
 #else
     auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
          _23 = _mm_loadu_si128(((__m128i*)ptr) + 1),
@@ -290,6 +324,12 @@ STAGE(store_f16) {
         vcvt_f16_f32(a),
     }};
     vst4_f16((float16_t*)ptr, halfs);
+#elif defined(__ARM_NEON__)
+    float16x4x2_t rb_ga = {{
+        vcvt_f16_f32(float32x4_t{r[0], b[0], r[1], b[1]}),
+        vcvt_f16_f32(float32x4_t{g[0], a[0], g[1], a[1]}),
+    }};
+    vst2_f16((float16_t*)ptr, rb_ga);
 #else
     auto R = _mm256_cvtps_ph(r, _MM_FROUND_CUR_DIRECTION),
          G = _mm256_cvtps_ph(g, _MM_FROUND_CUR_DIRECTION),
index 1a9813e0ddab170b33bc503780e70e9d9904f472..f124c644888e24f52084da89dd6ac321d58a3771 100755 (executable)
@@ -25,21 +25,35 @@ subprocess.check_call(['clang++'] + cflags + aarch64 +
                       ['-c', 'src/splicer/SkSplicer_stages.cpp'] +
                       ['-o', 'aarch64.o'])
 
-def parse_object_file(dot_o, array_type, done):
-  for line in subprocess.check_output(['gobjdump', '-d', dot_o]).split('\n'):
+armv7 = [
+    '--target=arm-linux-androideabi',
+    '--sysroot=' +
+    '/Users/mtklein/brew/opt/android-ndk/platforms/android-18/arch-arm',
+    '-march=armv7-a',
+    '-mfpu=neon-vfpv4',
+]
+subprocess.check_call(['clang++'] + cflags + armv7 +
+                      ['-c', 'src/splicer/SkSplicer_stages.cpp'] +
+                      ['-o', 'armv7.o'])
+
+def parse_object_file(dot_o, array_type, done, target=None):
+  cmd = ['gobjdump', '-d', dot_o]
+  if target:
+    cmd += ['--target', target]
+  for line in subprocess.check_output(cmd).split('\n'):
     line = line.strip()
     if not line or line.startswith(dot_o) or line.startswith('Disassembly'):
       continue
 
     # E.g. 00000000000003a4 <_load_f16>:
-    m = re.match('''................ <_?(.*)>:''', line)
+    m = re.match('''[0-9a-f]+ <_?(.*)>:''', line)
     if m:
       print 'static const', array_type, 'kSplice_' + m.group(1) + '[] = {'
       continue
 
     columns = line.split('\t')
     code = columns[1]
-    if len(columns) == 4:
+    if len(columns) >= 4:
       inst = columns[2]
       args = columns[3]
     else:
@@ -74,6 +88,9 @@ print '''/*
 #if defined(__aarch64__)
 '''
 parse_object_file('aarch64.o', 'unsigned int', '14000000')
+print '\n#elif defined(__ARM_NEON__)\n'
+parse_object_file('armv7.o', 'unsigned int', 'eafffffe',
+                  target='elf32-littlearm')
 print '\n#else\n'
 parse_object_file('hsw.o', 'unsigned char', 'e9 00 00 00 00')
 print '\n#endif\n'