{
int i = 0, result = 0;
#if CV_NEON
- if (CPU_HAS_NEON_FEATURE)
- {
- uint32x4_t bits = vmovq_n_u32(0);
- for (; i <= n - 16; i += 16) {
- uint8x16_t A_vec = vld1q_u8 (a + i);
- uint8x16_t bitsSet = vcntq_u8 (A_vec);
- uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
- uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
- bits = vaddq_u32(bits, bitSet4);
- }
- uint64x2_t bitSet2 = vpaddlq_u32 (bits);
- result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
- result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
- }
- else
-#endif
- for( ; i <= n - 4; i += 4 )
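+    // NEON: vcntq_u8 counts the set bits in each byte of a 16-byte chunk; the
+    // pairwise widening adds (vpaddlq) accumulate those counts into four 32-bit lanes.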
+ uint32x4_t bits = vmovq_n_u32(0);
+ for (; i <= n - 16; i += 16) {
+ uint8x16_t A_vec = vld1q_u8 (a + i);
+ uint8x16_t bitsSet = vcntq_u8 (A_vec);
+ uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
+ uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
+ bits = vaddq_u32(bits, bitSet4);
+ }
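+    // Final reduction: pairwise-add the four 32-bit partial sums into two 64-bit lanes,
+    // then sum their low halves.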
+ uint64x2_t bitSet2 = vpaddlq_u32 (bits);
+ result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
+ result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
+#else
+ for( ; i <= n - 4; i += 4 )
result += popCountTable[a[i]] + popCountTable[a[i+1]] +
popCountTable[a[i+2]] + popCountTable[a[i+3]];
+#endif
for( ; i < n; i++ )
result += popCountTable[a[i]];
return result;
{
int i = 0, result = 0;
#if CV_NEON
- if (CPU_HAS_NEON_FEATURE)
- {
- uint32x4_t bits = vmovq_n_u32(0);
- for (; i <= n - 16; i += 16) {
- uint8x16_t A_vec = vld1q_u8 (a + i);
- uint8x16_t B_vec = vld1q_u8 (b + i);
- uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
- uint8x16_t bitsSet = vcntq_u8 (AxorB);
- uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
- uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
- bits = vaddq_u32(bits, bitSet4);
- }
- uint64x2_t bitSet2 = vpaddlq_u32 (bits);
- result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
- result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
- }
- else
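+    // Same reduction as above, applied to the XOR of the two inputs (Hamming distance).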
+ uint32x4_t bits = vmovq_n_u32(0);
+ for (; i <= n - 16; i += 16) {
+ uint8x16_t A_vec = vld1q_u8 (a + i);
+ uint8x16_t B_vec = vld1q_u8 (b + i);
+ uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
+ uint8x16_t bitsSet = vcntq_u8 (AxorB);
+ uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
+ uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
+ bits = vaddq_u32(bits, bitSet4);
+ }
+ uint64x2_t bitSet2 = vpaddlq_u32 (bits);
+ result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
+ result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
+#else
+ for( ; i <= n - 4; i += 4 )
+ result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
+ popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
#endif
- for( ; i <= n - 4; i += 4 )
- result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
- popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
for( ; i < n; i++ )
result += popCountTable[a[i] ^ b[i]];
return result;
ASSERT_EQ(0, countNonZero(m1 - m2));
}
+
+TEST(Core_round, CvRound)
+{
+ ASSERT_EQ(2, cvRound(2.0));
+ ASSERT_EQ(2, cvRound(2.1));
+ ASSERT_EQ(-2, cvRound(-2.1));
+ ASSERT_EQ(3, cvRound(2.8));
+ ASSERT_EQ(-3, cvRound(-2.8));
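+    // Exact .5 values round to the nearest even integer: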
+ ASSERT_EQ(2, cvRound(2.5));
+ ASSERT_EQ(4, cvRound(3.5));
+ ASSERT_EQ(-2, cvRound(-2.5));
+ ASSERT_EQ(-4, cvRound(-3.5));
+}
\ No newline at end of file
ResultType operator()(Iterator1 a, Iterator2 b, size_t size, ResultType /*worst_dist*/ = -1) const
{
ResultType result = 0;
-#ifdef __GNUC__
#ifdef __ARM_NEON__
{
uint32x4_t bits = vmovq_n_u32(0);
result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),0);
result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2),2);
}
-#else
+#elif __GNUC__
{
            //for portability use unsigned long long and __builtin_popcountll (see the GCC docs for __builtin_popcountll)
typedef unsigned long long pop_t;
result += __builtin_popcountll(a_final ^ b_final);
}
}
-#endif //NEON
-#else
+#else // no NEON and no GCC builtins
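+    // Fallback: table-driven counting via HammingLUT; pop_t is still needed for the sizeof() in the call below.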
+ typedef unsigned long long pop_t;
HammingLUT lut;
result = lut(reinterpret_cast<const unsigned char*> (a),
reinterpret_cast<const unsigned char*> (b), size * sizeof(pop_t));