32-bit fast hash, tidy up murmur3 a bit

author mtklein <mtklein@chromium.org>

Tue, 16 Aug 2016 16:29:57 +0000 (09:29 -0700)

committer Commit bot <commit-bot@chromium.org>

Tue, 16 Aug 2016 16:29:57 +0000 (09:29 -0700)
author mtklein <mtklein@chromium.org>
Tue, 16 Aug 2016 16:29:57 +0000 (09:29 -0700)
committer Commit bot <commit-bot@chromium.org>
Tue, 16 Aug 2016 16:29:57 +0000 (09:29 -0700)
diff --git a/src/opts/SkChecksum_opts.h b/src/opts/SkChecksum_opts.h

index 346b16b..07fdfaa 100644 (file)
--- a/src/opts/SkChecksum_opts.h
+++ b/src/opts/SkChecksum_opts.h
@@ -16,18 +16,18 @@
  #endif
  
  // TODO: ARMv8 has optional CRC instructions similar to SSE 4.2
-// TODO: 32-bit x86 version: same sort of idea using only _mm_crc32_u32() and smaller
  
  namespace SK_OPTS_NS {
  
-#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 && (defined(__x86_64__) || defined(_M_X64))
-    template <typename T>
-    static inline T unaligned_load(const uint8_t* src) {
-        T val;
-        memcpy(&val, src, sizeof(val));
-        return val;
-    }
+template <typename T>
+static inline T unaligned_load(const uint8_t* src) {
+    T val;
+    memcpy(&val, src, sizeof(val));
+    return val;
+}
  
+#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42 && (defined(__x86_64__) || defined(_M_X64))
+    // This is not a CRC32.  It's Just A Hash that uses those instructions because they're fast.
      static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t seed) {
          auto data = (const uint8_t*)vdata;
  
@@ -82,21 +82,61 @@ namespace SK_OPTS_NS {
          return hash32;
      }
  
+#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE42
+    // 32-bit version of above, using _mm_crc32_u32() but not _mm_crc32_u64().
+    static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
+        auto data = (const uint8_t*)vdata;
+
+        if (bytes >= 12) {
+            // We'll create 3 independent hashes, each using _mm_crc32_u32()
+            // to hash 4 bytes per step.  Both 3 and independent are important:
+            // we can execute 3 of these instructions in parallel on a single core.
+            uint32_t a = hash,
+                     b = hash,
+                     c = hash;
+            size_t steps = bytes/12;
+            while (steps --> 0) {
+                a = _mm_crc32_u32(a, unaligned_load<uint32_t>(data+0));
+                b = _mm_crc32_u32(b, unaligned_load<uint32_t>(data+4));
+                c = _mm_crc32_u32(c, unaligned_load<uint32_t>(data+8));
+                data += 12;
+            }
+            bytes %= 12;
+            hash = a^b^c;
+        }
+
+        SkASSERT(bytes < 12);
+        if (bytes >= 8) {
+            hash = _mm_crc32_u32(hash, unaligned_load<uint32_t>(data));
+            bytes -= 4;
+            data  += 4;
+        }
+
+        SkASSERT(bytes < 8);
+        if (bytes & 4) {
+            hash = _mm_crc32_u32(hash, unaligned_load<uint32_t>(data));
+            data += 4;
+        }
+        if (bytes & 2) {
+            hash = _mm_crc32_u16(hash, unaligned_load<uint16_t>(data));
+            data += 2;
+        }
+        if (bytes & 1) {
+            hash = _mm_crc32_u8(hash, unaligned_load<uint8_t>(data));
+        }
+        return hash;
+    }
+
  #else
-    static uint32_t hash_fn(const void* data, size_t bytes, uint32_t seed) {
-        // This is Murmur3.
+    // This is Murmur3.
+    static uint32_t hash_fn(const void* vdata, size_t bytes, uint32_t hash) {
+        auto data = (const uint8_t*)vdata;
  
-        // Use may_alias to remind the compiler we're intentionally violating strict aliasing,
-        // and so not to apply strict-aliasing-based optimizations.
-        typedef uint32_t SK_ATTRIBUTE(may_alias) aliased_uint32_t;
-        typedef uint8_t SK_ATTRIBUTE(may_alias) aliased_uint8_t;
+        size_t original_bytes = bytes;
  
          // Handle 4 bytes at a time while possible.
-        const aliased_uint32_t* safe_data = (const aliased_uint32_t*)data;
-        const size_t words = bytes/4;
-        uint32_t hash = seed;
-        for (size_t i = 0; i < words; i++) {
-            uint32_t k = safe_data[i];
+        while (bytes >= 4) {
+            uint32_t k = unaligned_load<uint32_t>(data);
              k *= 0xcc9e2d51;
              k = (k << 15) | (k >> 17);
              k *= 0x1b873593;
@@ -105,22 +145,24 @@ namespace SK_OPTS_NS {
              hash = (hash << 13) | (hash >> 19);
              hash *= 5;
              hash += 0xe6546b64;
+
+            bytes -= 4;
+            data  += 4;
          }
  
          // Handle last 0-3 bytes.
-        const aliased_uint8_t* safe_tail = (const uint8_t*)(safe_data + words);
          uint32_t k = 0;
          switch (bytes & 3) {
-            case 3: k ^= safe_tail[2] << 16;
-            case 2: k ^= safe_tail[1] <<  8;
-            case 1: k ^= safe_tail[0] <<  0;
+            case 3: k ^= data[2] << 16;
+            case 2: k ^= data[1] <<  8;
+            case 1: k ^= data[0] <<  0;
                      k *= 0xcc9e2d51;
                      k = (k << 15) | (k >> 17);
                      k *= 0x1b873593;
                      hash ^= k;
          }
  
-        hash ^= bytes;
+        hash ^= original_bytes;
          return SkChecksum::Mix(hash);
      }
  #endif
author	mtklein <mtklein@chromium.org>
	Tue, 16 Aug 2016 16:29:57 +0000 (09:29 -0700)
committer	Commit bot <commit-bot@chromium.org>
	Tue, 16 Aug 2016 16:29:57 +0000 (09:29 -0700)