Updated AVX2 implementation of v_popcount for u8.

author Vitaly Tuzov <terfendail@mediana.jetos.com>

Tue, 14 May 2019 15:48:36 +0000 (18:48 +0300)

committer Vitaly Tuzov <terfendail@mediana.jetos.com>

Wed, 15 May 2019 16:39:25 +0000 (19:39 +0300)
author Vitaly Tuzov <terfendail@mediana.jetos.com>
Tue, 14 May 2019 15:48:36 +0000 (18:48 +0300)
committer Vitaly Tuzov <terfendail@mediana.jetos.com>
Wed, 15 May 2019 16:39:25 +0000 (19:39 +0300)
diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp

index 60827f462ce84e8391e10ab138882b70c7f99496..91e44834449456a41ae043671a1005c8dd676ded 100644 (file)
--- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
@@ -1188,14 +1188,11 @@ inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
  /** Popcount **/
  inline v_uint8x32 v_popcount(const v_uint8x32& a)
  {
-    __m256i m1 = _mm256_set1_epi32(0x55555555);
-    __m256i m2 = _mm256_set1_epi32(0x33333333);
-    __m256i m4 = _mm256_set1_epi32(0x0f0f0f0f);
-    __m256i p = a.val;
-    p = _mm256_add_epi32(_mm256_and_si256(_mm256_srli_epi32(p, 1), m1), _mm256_and_si256(p, m1));
-    p = _mm256_add_epi32(_mm256_and_si256(_mm256_srli_epi32(p, 2), m2), _mm256_and_si256(p, m2));
-    p = _mm256_add_epi32(_mm256_and_si256(_mm256_srli_epi32(p, 4), m4), _mm256_and_si256(p, m4));
-    return v_uint8x32(p);
+    __m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+                                             0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
+    __m256i _popcnt_mask = _mm256_set1_epi8(0x0F);
+    return v_uint8x32(_mm256_add_epi8(_mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(                  a.val    , _popcnt_mask)),
+                                      _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_mm256_srli_epi16(a.val, 4), _popcnt_mask))));
  }
  inline v_uint16x16 v_popcount(const v_uint16x16& a)
  {
@@ -1212,14 +1209,7 @@ inline v_uint32x8 v_popcount(const v_uint32x8& a)
  }
  inline v_uint64x4 v_popcount(const v_uint64x4& a)
  {
-    __m256i m1 = _mm256_set1_epi32(0x55555555);
-    __m256i m2 = _mm256_set1_epi32(0x33333333);
-    __m256i m4 = _mm256_set1_epi32(0x0f0f0f0f);
-    __m256i p = a.val;
-    p = _mm256_add_epi32(_mm256_and_si256(_mm256_srli_epi32(p, 1), m1), _mm256_and_si256(p, m1));
-    p = _mm256_add_epi32(_mm256_and_si256(_mm256_srli_epi32(p, 2), m2), _mm256_and_si256(p, m2));
-    p = _mm256_add_epi32(_mm256_and_si256(_mm256_srli_epi32(p, 4), m4), _mm256_and_si256(p, m4));
-    return v_uint64x4(_mm256_sad_epu8(p, _mm256_setzero_si256()));
+    return v_uint64x4(_mm256_sad_epu8(v_popcount(v_reinterpret_as_u8(a)).val, _mm256_setzero_si256()));
  }
  inline v_uint8x32 v_popcount(const v_int8x32& a)
  { return v_popcount(v_reinterpret_as_u8(a)); }
diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp

index beeaa1663e5589dc7989ee70cb664c584ddf28d1..7b7e97c561d5c4fd0f8f8b9e78730394219ce23f 100644 (file)
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -1580,14 +1580,7 @@ inline v_uint32x4 v_popcount(const v_uint32x4& a)
  }
  inline v_uint64x2 v_popcount(const v_uint64x2& a)
  {
-    __m128i m1 = _mm_set1_epi32(0x55555555);
-    __m128i m2 = _mm_set1_epi32(0x33333333);
-    __m128i m4 = _mm_set1_epi32(0x0f0f0f0f);
-    __m128i p = a.val;
-    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1));
-    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2));
-    p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4));
-    return v_uint64x2(_mm_sad_epu8(p, _mm_setzero_si128()));
+    return v_uint64x2(_mm_sad_epu8(v_popcount(v_reinterpret_as_u8(a)).val, _mm_setzero_si128()));
  }
  inline v_uint8x16 v_popcount(const v_int8x16& a)
  { return v_popcount(v_reinterpret_as_u8(a)); }
diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp

index 80e6fefd53a7ff2b0321e7e7326e95f886b5dcb9..1a118ae270628e5e9182d628e560536a4e06e96b 100644 (file)
--- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
@@ -766,8 +766,8 @@ inline scalartype v_reduce_##suffix(const _Tpvec& a)
  }
  OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_uint8x16, vec_uchar16, uchar, max, vec_max)
  OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_uint8x16, vec_uchar16, uchar, min, vec_min)
-OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int8x16, vec_char8, schar, max, vec_max)
-OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int8x16, vec_char8, schar, min, vec_min)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int8x16, vec_char16, schar, max, vec_max)
+OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int8x16, vec_char16, schar, min, vec_min)
  
  inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                   const v_float32x4& c, const v_float32x4& d)
diff --git a/modules/core/src/stat.simd.hpp b/modules/core/src/stat.simd.hpp

index 0da3f793806a18acbcf06df35860df786755da7a..34b784e12e7a260fab047161358d6986e63ea90f 100644 (file)
--- a/modules/core/src/stat.simd.hpp
+++ b/modules/core/src/stat.simd.hpp
@@ -32,28 +32,15 @@ int normHamming(const uchar* a, int n)
  
      int i = 0;
      int result = 0;
-#if CV_AVX2
-    {
-        __m256i _r0 = _mm256_setzero_si256();
-        __m256i _0 = _mm256_setzero_si256();
-        __m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
-                                                 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
-        __m256i _popcnt_mask = _mm256_set1_epi8(0x0F);
-
-        for(; i <= n - 32; i+= 32)
-        {
-            __m256i _a0 = _mm256_loadu_si256((const __m256i*)(a + i));
  
-            __m256i _popc0 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_a0, _popcnt_mask));
-            __m256i _popc1 = _mm256_shuffle_epi8(_popcnt_table,
-                             _mm256_and_si256(_mm256_srli_epi16(_a0, 4), _popcnt_mask));
-
-            _r0 = _mm256_add_epi32(_r0, _mm256_sad_epu8(_0, _mm256_add_epi8(_popc0, _popc1)));
-        }
-        _r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
-        result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
+#if CV_SIMD && CV_SIMD_WIDTH > 16
+    {
+        v_uint64 t = vx_setzero_u64();
+        for (; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
+            t += v_popcount(v_reinterpret_as_u64(vx_load(a + i)));
+        result = (int)v_reduce_sum(t);
      }
-#endif // CV_AVX2
+#endif
  
  #if CV_POPCNT
      {
@@ -68,16 +55,14 @@ int normHamming(const uchar* a, int n)
              result += CV_POPCNT_U32(*(uint*)(a + i));
          }
      }
-#endif // CV_POPCNT
-
-#if CV_SIMD
+#elif CV_SIMD
      {
-        v_uint64 t = vx_setzero_u64();
-        for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
-            t += v_popcount(v_reinterpret_as_u64(vx_load(a + i)));
+        v_uint64x2 t = v_setzero_u64();
+        for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
+            t += v_popcount(v_reinterpret_as_u64(v_load(a + i)));
          result += (int)v_reduce_sum(t);
      }
-#endif // CV_SIMD
+#endif
  #if CV_ENABLE_UNROLLED
      for(; i <= n - 4; i += 4)
      {
@@ -98,31 +83,15 @@ int normHamming(const uchar* a, const uchar* b, int n)
  
      int i = 0;
      int result = 0;
-#if CV_AVX2
-    {
-        __m256i _r0 = _mm256_setzero_si256();
-        __m256i _0 = _mm256_setzero_si256();
-        __m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
-                                                 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
-        __m256i _popcnt_mask = _mm256_set1_epi8(0x0F);
-
-        for(; i <= n - 32; i+= 32)
-        {
-            __m256i _a0 = _mm256_loadu_si256((const __m256i*)(a + i));
-            __m256i _b0 = _mm256_loadu_si256((const __m256i*)(b + i));
-
-            __m256i _xor = _mm256_xor_si256(_a0, _b0);
  
-            __m256i _popc0 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_xor, _popcnt_mask));
-            __m256i _popc1 = _mm256_shuffle_epi8(_popcnt_table,
-                             _mm256_and_si256(_mm256_srli_epi16(_xor, 4), _popcnt_mask));
-
-            _r0 = _mm256_add_epi32(_r0, _mm256_sad_epu8(_0, _mm256_add_epi8(_popc0, _popc1)));
-        }
-        _r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
-        result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
+#if CV_SIMD && CV_SIMD_WIDTH > 16
+    {
+        v_uint64 t = vx_setzero_u64();
+        for (; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
+            t += v_popcount(v_reinterpret_as_u64(vx_load(a + i) ^ vx_load(b + i)));
+        result += (int)v_reduce_sum(t);
      }
-#endif // CV_AVX2
+#endif
  
  #if CV_POPCNT
      {
@@ -137,16 +106,14 @@ int normHamming(const uchar* a, const uchar* b, int n)
              result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i));
          }
      }
-#endif // CV_POPCNT
-
-#if CV_SIMD
+#elif CV_SIMD
      {
-        v_uint64 t = vx_setzero_u64();
-        for(; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
+        v_uint64x2 t = v_setzero_u64();
+        for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
              t += v_popcount(v_reinterpret_as_u64(vx_load(a + i) ^ vx_load(b + i)));
          result += (int)v_reduce_sum(t);
      }
-#endif // CV_SIMD
+#endif
  #if CV_ENABLE_UNROLLED
      for(; i <= n - 4; i += 4)
      {
author	Vitaly Tuzov <terfendail@mediana.jetos.com>
	Tue, 14 May 2019 15:48:36 +0000 (18:48 +0300)
committer	Vitaly Tuzov <terfendail@mediana.jetos.com>
	Wed, 15 May 2019 16:39:25 +0000 (19:39 +0300)
modules/core/include/opencv2/core/hal/intrin_avx.hpp		patch \| blob \| history
modules/core/include/opencv2/core/hal/intrin_sse.hpp		patch \| blob \| history
modules/core/include/opencv2/core/hal/intrin_vsx.hpp		patch \| blob \| history
modules/core/src/stat.simd.hpp		patch \| blob \| history