From 8d5a5d5ceb70b995914d3a7c0d89af31d9657af8 Mon Sep 17 00:00:00 2001
From: Vitaly Tuzov
Date: Tue, 17 Oct 2017 09:34:01 +0300
Subject: [PATCH] AVX2 optimizations for FAST corner tracking provided by
 Victoria Zhislina

---
 modules/features2d/src/fast.cpp | 175 ++++++++++++++++++++++++++++------------
 1 file changed, 125 insertions(+), 50 deletions(-)

diff --git a/modules/features2d/src/fast.cpp b/modules/features2d/src/fast.cpp
index e939f06..4e9afae 100644
--- a/modules/features2d/src/fast.cpp
+++ b/modules/features2d/src/fast.cpp
@@ -63,6 +63,14 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
     const int quarterPatternSize = patternSize/4;
     v_uint8x16 delta = v_setall_u8(0x80), t = v_setall_u8((char)threshold), K16 = v_setall_u8((char)K);
     bool hasSimd = hasSIMD128();
+#if CV_TRY_AVX2
+    __m256i delta256, t256, K16_256;
+    if (CV_CPU_HAS_SUPPORT_AVX2)
+    {
+        delta256 = _mm256_broadcastsi128_si256(delta.val), t256 = _mm256_broadcastsi128_si256(t.val), K16_256 = _mm256_broadcastsi128_si256(K16.val);
+    }
+#endif
+
 #endif
     int i, j, k, pixel[25];
     makeOffsets(pixel, (int)img.step, patternSize);
@@ -100,65 +108,132 @@ void FAST_t(InputArray _img, std::vector<KeyPoint>& keypoints, int threshold, bo
         {
             if( patternSize == 16 )
             {
-                for(; j < img.cols - 16 - 3; j += 16, ptr += 16)
+#if CV_TRY_AVX2
+                if (CV_CPU_HAS_SUPPORT_AVX2)
                 {
-                    v_uint8x16 v = v_load(ptr);
-                    v_int8x16 v0 = v_reinterpret_as_s8((v + t) ^ delta);
-                    v_int8x16 v1 = v_reinterpret_as_s8((v - t) ^ delta);
-
-                    v_int8x16 x0 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[0]), delta));
-                    v_int8x16 x1 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[quarterPatternSize]), delta));
-                    v_int8x16 x2 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[2*quarterPatternSize]), delta));
-                    v_int8x16 x3 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[3*quarterPatternSize]), delta));
-
-                    v_int8x16 m0, m1;
-                    m0 = (v0 < x0) & (v0 < x1);
-                    m1 = (x0 < v1) & (x1 < v1);
-                    m0 = m0 | ((v0 < x1) & (v0 < x2));
-                    m1 = m1 | ((x1 < v1) & (x2 < v1));
-                    m0 = m0 | ((v0 < x2) & (v0 < x3));
-                    m1 = m1 | ((x2 < v1) & (x3 < v1));
-                    m0 = m0 | ((v0 < x3) & (v0 < x0));
-                    m1 = m1 | ((x3 < v1) & (x0 < v1));
-                    m0 = m0 | m1;
-
-                    int mask = v_signmask(m0);
-                    if( mask == 0 )
-                        continue;
-                    if( (mask & 255) == 0 )
+                    for(; j < img.cols - 32 - 3; j += 32, ptr += 32)
                     {
-                        j -= 8;
-                        ptr -= 8;
-                        continue;
-                    }
+                        __m256i m0, m1;
+                        __m256i v0 = _mm256_loadu_si256((const __m256i*)ptr);
+
+                        __m256i v1 = _mm256_xor_si256(_mm256_subs_epu8(v0, t256), delta256);
+                        v0 = _mm256_xor_si256(_mm256_adds_epu8(v0, t256), delta256);
+
+                        __m256i x0 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[0])), delta256);
+                        __m256i x1 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[4])), delta256);
+                        __m256i x2 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[8])), delta256);
+                        __m256i x3 = _mm256_sub_epi8(_mm256_loadu_si256((const __m256i*)(ptr + pixel[12])), delta256);
+
+                        m0 = _mm256_and_si256(_mm256_cmpgt_epi8(x0, v0), _mm256_cmpgt_epi8(x1, v0));
+                        m1 = _mm256_and_si256(_mm256_cmpgt_epi8(v1, x0), _mm256_cmpgt_epi8(v1, x1));
+                        m0 = _mm256_or_si256(m0, _mm256_and_si256(_mm256_cmpgt_epi8(x1, v0), _mm256_cmpgt_epi8(x2, v0)));
+                        m1 = _mm256_or_si256(m1, _mm256_and_si256(_mm256_cmpgt_epi8(v1, x1), _mm256_cmpgt_epi8(v1, x2)));
+                        m0 = _mm256_or_si256(m0, _mm256_and_si256(_mm256_cmpgt_epi8(x2, v0), _mm256_cmpgt_epi8(x3, v0)));
+                        m1 = _mm256_or_si256(m1, _mm256_and_si256(_mm256_cmpgt_epi8(v1, x2), _mm256_cmpgt_epi8(v1, x3)));
+                        m0 = _mm256_or_si256(m0, _mm256_and_si256(_mm256_cmpgt_epi8(x3, v0), _mm256_cmpgt_epi8(x0, v0)));
+                        m1 = _mm256_or_si256(m1, _mm256_and_si256(_mm256_cmpgt_epi8(v1, x3), _mm256_cmpgt_epi8(v1, x0)));
+                        m0 = _mm256_or_si256(m0, m1);
+
+                        unsigned int mask = _mm256_movemask_epi8(m0); // unsigned is important: with 32 lanes, bit 31 may be set
+                        if (mask == 0) {
+                            continue;
+                        }
+                        if ((mask & 0xffff) == 0)
+                        {
+                            j -= 16;
+                            ptr -= 16;
+                            continue;
+                        }
 
-                    v_int8x16 c0 = v_setzero_s8();
-                    v_int8x16 c1 = v_setzero_s8();
-                    v_uint8x16 max0 = v_setzero_u8();
-                    v_uint8x16 max1 = v_setzero_u8();
-                    for( k = 0; k < N; k++ )
-                    {
-                        v_int8x16 x = v_reinterpret_as_s8(v_load((ptr + pixel[k])) ^ delta);
-                        m0 = v0 < x;
-                        m1 = x < v1;
+                        __m256i c0 = _mm256_setzero_si256(), c1 = c0, max0 = c0, max1 = c0;
+                        for (k = 0; k < N; k++)
+                        {
+                            __m256i x = _mm256_xor_si256(_mm256_loadu_si256((const __m256i*)(ptr + pixel[k])), delta256);
+                            m0 = _mm256_cmpgt_epi8(x, v0);
+                            m1 = _mm256_cmpgt_epi8(v1, x);
 
-                        c0 = v_sub_wrap(c0, m0) & m0;
-                        c1 = v_sub_wrap(c1, m1) & m1;
+                            c0 = _mm256_and_si256(_mm256_sub_epi8(c0, m0), m0);
+                            c1 = _mm256_and_si256(_mm256_sub_epi8(c1, m1), m1);
 
-                        max0 = v_max(max0, v_reinterpret_as_u8(c0));
-                        max1 = v_max(max1, v_reinterpret_as_u8(c1));
-                    }
+                            max0 = _mm256_max_epu8(max0, c0);
+                            max1 = _mm256_max_epu8(max1, c1);
+                        }
 
-                    max0 = v_max(max0, max1);
-                    int m = v_signmask(K16 < max0);
+                        max0 = _mm256_max_epu8(max0, max1);
+                        unsigned int m = _mm256_movemask_epi8(_mm256_cmpgt_epi8(max0, K16_256)); // unsigned: the m > 0 / m >>= 1 loop below must not sign-extend
 
-                    for( k = 0; m > 0 && k < 16; k++, m >>= 1 )
+                        for (k = 0; m > 0 && k < 32; k++, m >>= 1)
+                            if (m & 1)
+                            {
+                                cornerpos[ncorners++] = j + k;
+                                if (nonmax_suppression)
+                                    curr[j + k] = (uchar)cornerScore<patternSize>(ptr + k, pixel, threshold);
+                            }
+                    }
+                } // CV_CPU_HAS_SUPPORT_AVX2
+#endif
+                //vz if (j <= (img.cols - 27)) // it doesn't make sense to use vectors for fewer than 8 elements
+                {
+                    for (; j < img.cols - 16 - 3; j += 16, ptr += 16)
                     {
-                        if(m & 1)
+                        v_uint8x16 v = v_load(ptr);
+                        v_int8x16 v0 = v_reinterpret_as_s8((v + t) ^ delta);
+                        v_int8x16 v1 = v_reinterpret_as_s8((v - t) ^ delta);
+
+                        v_int8x16 x0 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[0]), delta));
+                        v_int8x16 x1 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[quarterPatternSize]), delta));
+                        v_int8x16 x2 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[2*quarterPatternSize]), delta));
+                        v_int8x16 x3 = v_reinterpret_as_s8(v_sub_wrap(v_load(ptr + pixel[3*quarterPatternSize]), delta));
+
+                        v_int8x16 m0, m1;
+                        m0 = (v0 < x0) & (v0 < x1);
+                        m1 = (x0 < v1) & (x1 < v1);
+                        m0 = m0 | ((v0 < x1) & (v0 < x2));
+                        m1 = m1 | ((x1 < v1) & (x2 < v1));
+                        m0 = m0 | ((v0 < x2) & (v0 < x3));
+                        m1 = m1 | ((x2 < v1) & (x3 < v1));
+                        m0 = m0 | ((v0 < x3) & (v0 < x0));
+                        m1 = m1 | ((x3 < v1) & (x0 < v1));
+                        m0 = m0 | m1;
+
+                        int mask = v_signmask(m0);
+                        if( mask == 0 )
+                            continue;
+                        if( (mask & 255) == 0 )
                         {
-                            cornerpos[ncorners++] = j+k;
-                            if(nonmax_suppression)
-                                curr[j+k] = (uchar)cornerScore<patternSize>(ptr+k, pixel, threshold);
+                            j -= 8;
+                            ptr -= 8;
+                            continue;
+                        }
+
+                        v_int8x16 c0 = v_setzero_s8();
+                        v_int8x16 c1 = v_setzero_s8();
+                        v_uint8x16 max0 = v_setzero_u8();
+                        v_uint8x16 max1 = v_setzero_u8();
+                        for( k = 0; k < N; k++ )
+                        {
+                            v_int8x16 x = v_reinterpret_as_s8(v_load((ptr + pixel[k])) ^ delta);
+                            m0 = v0 < x;
+                            m1 = x < v1;
+
+                            c0 = v_sub_wrap(c0, m0) & m0;
+                            c1 = v_sub_wrap(c1, m1) & m1;
+
+                            max0 = v_max(max0, v_reinterpret_as_u8(c0));
+                            max1 = v_max(max1, v_reinterpret_as_u8(c1));
+                        }
+
+                        max0 = v_max(max0, max1);
+                        int m = v_signmask(K16 < max0);
+
+                        for( k = 0; m > 0 && k < 16; k++, m >>= 1 )
+                        {
+                            if(m & 1)
+                            {
+                                cornerpos[ncorners++] = j+k;
+                                if(nonmax_suppression)
+                                    curr[j+k] = (uchar)cornerScore<patternSize>(ptr+k, pixel, threshold);
+                            }
                         }
                     }
                 }
--
2.7.4
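
Background note (not part of the patch): both the 128-bit path and the new 256-bit path rely on the same run-length idiom. The comparison masks are all-ones (0xFF) in every lane where a circle pixel is brighter than v + t (m0) or darker than v - t (m1). Since 0xFF acts as -1, `c = (c - m) & m` increments a per-lane counter on a hit and resets it to zero on a miss; taking the running `max` over N = patternSize + K + 1 iterations (fast.cpp fills pixel[16..24] with copies of pixel[0..8], so a run that crosses position 0 on the circle is still counted as contiguous) yields the longest arc, which must exceed K = patternSize/2. Below is a minimal standalone scalar sketch of one lane of this logic; the mask values are made up for the example.

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main()
{
    // One lane's comparison results around the circle: 0xFF where the circle
    // pixel is brighter than v + t (mirrors m0; m1 is the darker case).
    // Entries 16..24 repeat entries 0..8, as pixel[25] does in fast.cpp.
    uint8_t m[25] = {0};
    for (int k = 1; k <= 9; k++)
        m[k] = 0xFF;                      // a 9-pixel bright arc

    uint8_t c = 0, best = 0;
    for (int k = 0; k < 25; k++)
    {
        // 0xFF is -1 in two's complement, so c - m[k] adds 1 where the mask
        // is set; the AND clears the counter where it is not.
        c = (uint8_t)((c - m[k]) & m[k]);
        best = std::max(best, c);         // longest run seen so far
    }

    // K == patternSize/2 == 8: the K16 < max0 test requires a run of at
    // least 9 contiguous pixels, i.e. FAST-9 on the 16-pixel circle.
    std::printf("longest run = %d -> %s\n", best, best > 8 ? "corner" : "not a corner");
    return 0;
}
```

The XOR with delta (0x80) around the loads is the standard trick for comparing unsigned bytes with signed-compare instructions: flipping the top bit maps [0, 255] monotonically onto [-128, 127], so `_mm256_cmpgt_epi8` on the biased values preserves the original unsigned ordering. `v_sub_wrap`'s AVX2 counterpart is the plain wrapping `_mm256_sub_epi8`.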