From 615e7b2747c18d83aab1c2b4b4e412911f0ed917 Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Mon, 10 Dec 2012 13:29:08 +0400 Subject: [PATCH] added SIMD optimization of Edge-Aware Demosaicing in case of CV_8U --- modules/imgproc/src/demosaicing.cpp | 152 ++++++++++++++++++++++++++++++++---- modules/imgproc/test/test_color.cpp | 27 ++++--- 2 files changed, 151 insertions(+), 28 deletions(-) diff --git a/modules/imgproc/src/demosaicing.cpp b/modules/imgproc/src/demosaicing.cpp index 227fb41..f5cbde9 100644 --- a/modules/imgproc/src/demosaicing.cpp +++ b/modules/imgproc/src/demosaicing.cpp @@ -63,6 +63,11 @@ public: { return 0; } + + int bayer2RGB_EA(const T*, int, T*, int, int) const + { + return 0; + } }; #if CV_SSE2 @@ -129,6 +134,7 @@ public: G R G R | G R G R | G R G R | G R G R B G B G | B G B G | B G B G | B G B G */ + __m128i delta1 = _mm_set1_epi16(1), delta2 = _mm_set1_epi16(2); __m128i mask = _mm_set1_epi16(blue < 0 ? -1 : 0), z = _mm_setzero_si128(); __m128i masklo = _mm_set1_epi16(0x00ff); @@ -141,10 +147,11 @@ public: __m128i r2 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step*2)); __m128i b1 = _mm_add_epi16(_mm_and_si128(r0, masklo), _mm_and_si128(r2, masklo)); - __m128i b0 = _mm_add_epi16(b1, _mm_srli_si128(b1, 2)); - b1 = _mm_srli_si128(b1, 2); - b1 = _mm_srli_epi16(_mm_add_epi16(b1, delta1), 1); + __m128i nextb1 = _mm_srli_si128(b1, 2); + __m128i b0 = _mm_add_epi16(b1, nextb1); + b1 = _mm_srli_epi16(_mm_add_epi16(nextb1, delta1), 1); b0 = _mm_srli_epi16(_mm_add_epi16(b0, delta2), 2); + // b0 b2 ... b14 b1 b3 ... b15 b0 = _mm_packus_epi16(b0, b1); __m128i g0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_srli_epi16(r2, 8)); @@ -152,38 +159,42 @@ public: g0 = _mm_add_epi16(g0, _mm_add_epi16(g1, _mm_srli_si128(g1, 2))); g1 = _mm_srli_si128(g1, 2); g0 = _mm_srli_epi16(_mm_add_epi16(g0, delta2), 2); + // g0 g2 ... g14 g1 g3 ... g15 g0 = _mm_packus_epi16(g0, g1); r0 = _mm_srli_epi16(r1, 8); r1 = _mm_add_epi16(r0, _mm_srli_si128(r0, 2)); r1 = _mm_srli_epi16(_mm_add_epi16(r1, delta1), 1); + // r0 r2 ... r14 r1 r3 ... r15 r0 = _mm_packus_epi16(r0, r1); b1 = _mm_and_si128(_mm_xor_si128(b0, r0), mask); b0 = _mm_xor_si128(b0, b1); r0 = _mm_xor_si128(r0, b1); - // b1 g1 b1 g1 ... + // b1 g1 b3 g3 b5 g5... b1 = _mm_unpackhi_epi8(b0, g0); // b0 g0 b2 g2 b4 g4 .... b0 = _mm_unpacklo_epi8(b0, g0); - // r1 0 r3 0 ... + // r1 0 r3 0 r5 0 ... r1 = _mm_unpackhi_epi8(r0, z); // r0 0 r2 0 r4 0 ... r0 = _mm_unpacklo_epi8(r0, z); - // 0 b0 g0 r0 0 b2 g2 r2 0 ... + // 0 b0 g0 r0 0 b2 g2 r2 ... g0 = _mm_slli_si128(_mm_unpacklo_epi16(b0, r0), 1); - // 0 b8 g8 r8 0 b10 g10 r10 0 ... + // 0 b8 g8 r8 0 b10 g10 r10 ... g1 = _mm_slli_si128(_mm_unpackhi_epi16(b0, r0), 1); - // b1 g1 r1 0 b3 g3 r3 .... + // b1 g1 r1 0 b3 g3 r3 0 ... r0 = _mm_unpacklo_epi16(b1, r1); - // b9 g9 r9 0 ... + // b9 g9 r9 0 b11 g11 r11 0 ... r1 = _mm_unpackhi_epi16(b1, r1); + // 0 b0 g0 r0 b1 g1 r1 0 ... b0 = _mm_srli_si128(_mm_unpacklo_epi32(g0, r0), 1); + // 0 b4 g4 r4 b5 g5 r5 0 ... b1 = _mm_srli_si128(_mm_unpackhi_epi32(g0, r0), 1); _mm_storel_epi64((__m128i*)(dst-1+0), b0); @@ -191,7 +202,9 @@ public: _mm_storel_epi64((__m128i*)(dst-1+6*2), b1); _mm_storel_epi64((__m128i*)(dst-1+6*3), _mm_srli_si128(b1, 8)); + // 0 b8 g8 r8 b9 g9 r9 0 ... g0 = _mm_srli_si128(_mm_unpacklo_epi32(g1, r1), 1); + // 0 b12 g12 r12 b13 g13 r13 0 ... g1 = _mm_srli_si128(_mm_unpackhi_epi32(g1, r1), 1); _mm_storel_epi64((__m128i*)(dst-1+6*4), g0); @@ -203,6 +216,109 @@ public: return (int)(bayer - (bayer_end - width)); } + int bayer2RGB_EA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const + { + if (!use_simd) + return 0; + + const uchar* bayer_end = bayer + width; + __m128i masklow = _mm_set1_epi16(0x00ff); + __m128i delta1 = _mm_set1_epi16(1), delta2 = _mm_set1_epi16(2); + __m128i full = _mm_set1_epi16(-1), z = _mm_setzero_si128(); + __m128i mask = _mm_set1_epi16(blue > 0 ? -1 : 0); + + for ( ; bayer <= bayer_end - 18; bayer += 14, dst += 42) + { + /* + B G B G | B G B G | B G B G | B G B G + G R G R | G R G R | G R G R | G R G R + B G B G | B G B G | B G B G | B G B G + */ + + __m128i r0 = _mm_loadu_si128((const __m128i*)bayer); + __m128i r1 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step)); + __m128i r2 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step*2)); + + __m128i b1 = _mm_add_epi16(_mm_and_si128(r0, masklow), _mm_and_si128(r2, masklow)); + __m128i nextb1 = _mm_srli_si128(b1, 2); + __m128i b0 = _mm_add_epi16(b1, nextb1); + b1 = _mm_srli_epi16(_mm_add_epi16(nextb1, delta1), 1); + b0 = _mm_srli_epi16(_mm_add_epi16(b0, delta2), 2); + // b0 b2 ... b14 b1 b3 ... b15 + b0 = _mm_packus_epi16(b0, b1); + + // vertical sum + __m128i r0g = _mm_srli_epi16(r0, 8); + __m128i r2g = _mm_srli_epi16(r2, 8); + __m128i sumv = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(r0g, r2g), delta1), 1); + // gorizontal sum + __m128i g1 = _mm_and_si128(masklow, r1); + __m128i nextg1 = _mm_srli_si128(g1, 2); + __m128i sumg = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(g1, nextg1), delta1), 1); + + // gradients + __m128i gradv = _mm_adds_epi16(_mm_subs_epu16(r0g, r2g), _mm_subs_epu16(r2g, r0g)); + __m128i gradg = _mm_adds_epi16(_mm_subs_epu16(nextg1, g1), _mm_subs_epu16(g1, nextg1)); + __m128i gmask = _mm_cmpgt_epi16(gradg, gradv); + + __m128i g0 = _mm_add_epi16(_mm_and_si128(gmask, sumv), _mm_and_si128(sumg, _mm_xor_si128(gmask, full))); + // g0 g2 ... g14 g1 g3 ... + g0 = _mm_packus_epi16(g0, nextg1); + + r0 = _mm_srli_epi16(r1, 8); + r1 = _mm_add_epi16(r0, _mm_srli_si128(r0, 2)); + r1 = _mm_srli_epi16(_mm_add_epi16(r1, delta1), 1); + // r0 r2 ... r14 r1 r3 ... r15 + r0 = _mm_packus_epi16(r0, r1); + + b1 = _mm_and_si128(_mm_xor_si128(b0, r0), mask); + b0 = _mm_xor_si128(b0, b1); + r0 = _mm_xor_si128(r0, b1); + + // b1 g1 b3 g3 b5 g5... + b1 = _mm_unpackhi_epi8(b0, g0); + // b0 g0 b2 g2 b4 g4 .... + b0 = _mm_unpacklo_epi8(b0, g0); + + // r1 0 r3 0 r5 0 ... + r1 = _mm_unpackhi_epi8(r0, z); + // r0 0 r2 0 r4 0 ... + r0 = _mm_unpacklo_epi8(r0, z); + + // 0 b0 g0 r0 0 b2 g2 r2 ... + g0 = _mm_slli_si128(_mm_unpacklo_epi16(b0, r0), 1); + // 0 b8 g8 r8 0 b10 g10 r10 ... + g1 = _mm_slli_si128(_mm_unpackhi_epi16(b0, r0), 1); + + // b1 g1 r1 0 b3 g3 r3 0 ... + r0 = _mm_unpacklo_epi16(b1, r1); + // b9 g9 r9 0 b11 g11 r11 0 ... + r1 = _mm_unpackhi_epi16(b1, r1); + + // 0 b0 g0 r0 b1 g1 r1 0 ... + b0 = _mm_srli_si128(_mm_unpacklo_epi32(g0, r0), 1); + // 0 b4 g4 r4 b5 g5 r5 0 ... + b1 = _mm_srli_si128(_mm_unpackhi_epi32(g0, r0), 1); + + _mm_storel_epi64((__m128i*)(dst+0), b0); + _mm_storel_epi64((__m128i*)(dst+6*1), _mm_srli_si128(b0, 8)); + _mm_storel_epi64((__m128i*)(dst+6*2), b1); + _mm_storel_epi64((__m128i*)(dst+6*3), _mm_srli_si128(b1, 8)); + + // 0 b8 g8 r8 b9 g9 r9 0 ... + g0 = _mm_srli_si128(_mm_unpacklo_epi32(g1, r1), 1); + // 0 b12 g12 r12 b13 g13 r13 0 ... + g1 = _mm_srli_si128(_mm_unpackhi_epi32(g1, r1), 1); + + _mm_storel_epi64((__m128i*)(dst+6*4), g0); + _mm_storel_epi64((__m128i*)(dst+6*5), _mm_srli_si128(g0, 8)); + + _mm_storel_epi64((__m128i*)(dst+6*6), g1); + } + + return bayer - (bayer_end - width); + } + bool use_simd; }; #else @@ -1173,7 +1289,7 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code ) //////////////////////////////// Edge-Aware Demosaicing ////////////////////////////////// -template +template class Bayer2RGB_EdgeAware_T_Invoker : public cv::ParallelLoopBody { @@ -1191,6 +1307,7 @@ public: int dcn2 = dcn<<1; int start_with_green = Start_with_green, blue = Blue; int sstep = src.step / src.elemSize1(), dstep = dst.step / dst.elemSize1(); + SIMDInterpolator vecOp; const T* S = reinterpret_cast(src.data + (range.start + 1) * src.step) + 1; T* D = reinterpret_cast(dst.data + (range.start + 1) * dst.step) + dcn; @@ -1215,6 +1332,11 @@ public: ++x; } + int delta = vecOp.bayer2RGB_EA(S - sstep - 1, sstep, D, size.width, blue); + x += delta; + S += delta; + D += dcn * delta; + if (blue) for (; x < size.width; x += 2, S += 2, D += dcn2) { @@ -1227,7 +1349,7 @@ public: D[5] = (S[-sstep+1] + S[sstep+1] + 1) >> 1; } else - for (; x < size.width; x += 2, S += 2, D += dcn2 ) + for (; x < size.width; x += 2, S += 2, D += dcn2) { D[0] = (S[-sstep-1] + S[-sstep+1] + S[sstep-1] + S[sstep+1] + 2) >> 2; D[1] = (std::abs(S[-1] - S[1]) > std::abs(S[sstep] - S[-sstep]) ? (S[sstep] + S[-sstep] + 1) : (S[-1] + S[1] + 1)) >> 1; @@ -1267,7 +1389,7 @@ private: int Blue, Start_with_green; }; -template +template static void Bayer2RGB_EdgeAware_T(const Mat& src, Mat& dst, int code) { Size size = src.size(); @@ -1287,7 +1409,7 @@ static void Bayer2RGB_EdgeAware_T(const Mat& src, Mat& dst, int code) if (size.height > 0) { - Bayer2RGB_EdgeAware_T_Invoker invoker(src, dst, size, blue, start_with_green); + Bayer2RGB_EdgeAware_T_Invoker invoker(src, dst, size, blue, start_with_green); Range range(0, size.height); parallel_for_(range, invoker, dst.total()/static_cast(1<<16)); } @@ -1380,9 +1502,9 @@ void cv::demosaicing(InputArray _src, OutputArray _dst, int code, int dcn) dst = _dst.getMat(); if (depth == CV_8U) - Bayer2RGB_EdgeAware_T(src, dst, code); + Bayer2RGB_EdgeAware_T(src, dst, code); else if (depth == CV_16U) - Bayer2RGB_EdgeAware_T(src, dst, code); + Bayer2RGB_EdgeAware_T >(src, dst, code); else CV_Error(CV_StsUnsupportedFormat, "Bayer->RGB Edge-Aware demosaicing only currently supports 8u and 16u types"); diff --git a/modules/imgproc/test/test_color.cpp b/modules/imgproc/test/test_color.cpp index afb5ccc..33b5bcf 100644 --- a/modules/imgproc/test/test_color.cpp +++ b/modules/imgproc/test/test_color.cpp @@ -1991,8 +1991,8 @@ static void test_Bayer2RGB_EdgeAware_8u(const Mat& src, Mat& dst, int code) { // red D[0] = S[0]; - D[1] = (std::abs(S[-1] - S[1]) > std::abs(S[step] - S[-step]) ? (S[step] + S[-step]) : (S[-1] + S[1])) / 2; - D[2] = ((S[-step-1] + S[-step+1] + S[step-1] + S[step+1]) / 4); + D[1] = (std::abs(S[-1] - S[1]) > std::abs(S[step] - S[-step]) ? (S[step] + S[-step] + 1) : (S[-1] + S[1] + 1)) / 2; + D[2] = ((S[-step-1] + S[-step+1] + S[step-1] + S[step+1] + 2) / 4); if (!blue) std::swap(D[0], D[2]); } @@ -2002,8 +2002,8 @@ static void test_Bayer2RGB_EdgeAware_8u(const Mat& src, Mat& dst, int code) for (int x = 1; x < size.width; x += 2, S += 2, D += 2*dcn) { D[0] = S[0]; - D[1] = (std::abs(S[-1] - S[1]) > std::abs(S[step] - S[-step]) ? (S[step] + S[-step]) : (S[-1] + S[1])) / 2; - D[2] = ((S[-step-1] + S[-step+1] + S[step-1] + S[step+1]) / 4); + D[1] = (std::abs(S[-1] - S[1]) > std::abs(S[step] - S[-step]) ? (S[step] + S[-step] + 1) : (S[-1] + S[1] + 1)) / 2; + D[2] = ((S[-step-1] + S[-step+1] + S[step-1] + S[step+1] + 2) / 4); if (!blue) std::swap(D[0], D[2]); } @@ -2013,9 +2013,9 @@ static void test_Bayer2RGB_EdgeAware_8u(const Mat& src, Mat& dst, int code) for (int x = 2; x < size.width; x += 2, S += 2, D += 2*dcn) { - D[0] = (S[-1] + S[1]) / 2; + D[0] = (S[-1] + S[1] + 1) / 2; D[1] = S[0]; - D[2] = (S[-step] + S[step]) / 2; + D[2] = (S[-step] + S[step] + 1) / 2; if (!blue) std::swap(D[0], D[2]); } @@ -2051,7 +2051,9 @@ static void checkData(const Mat& actual, const Mat& reference, cvtest::TS* ts, c EXPECT_EQ(actual.depth(), reference.depth()); Size size = reference.size(); - size.width *= reference.channels(); + int dcn = reference.channels(); + size.width *= dcn; + for (int y = 0; y < size.height && next; ++y) { const T* A = reinterpret_cast(actual.data + actual.step * y); @@ -2064,10 +2066,15 @@ static void checkData(const Mat& actual, const Mat& reference, cvtest::TS* ts, c ts->printf(SUM, "\nReference value: %d\n", static_cast(R[x])); ts->printf(SUM, "Actual value: %d\n", static_cast(A[x])); ts->printf(SUM, "(y, x): (%d, %d)\n", y, x / reference.channels()); + ts->printf(SUM, "Channel pos: %d\n", x % reference.channels()); ts->printf(SUM, "Pattern: %s\n", type); ts->printf(SUM, "Bayer image type: %s", bayer_type); #undef SUM + Mat diff; + absdiff(actual, reference, diff); + EXPECT_EQ(countNonZero(diff.reshape(1) > 1), 0); + ts->set_failed_test_info(cvtest::TS::FAIL_BAD_ACCURACY); ts->set_gtest_status(); @@ -2173,12 +2180,6 @@ TEST(ImgProc_Bayer2RGBA, accuracy) Mat diff; absdiff(actual, reference, diff); - - cv::Rect r(0, ssize.height - 5, 7, 5); - std::cout << "Actual: " << std::endl << actual(r) << std::endl << std::endl; - std::cout << "Reference: " << std::endl << reference(r) << std::endl << std::endl; - std::cout << "Difference: " << std::endl << diff(r) << std::endl << std::endl; - EXPECT_EQ(countNonZero(diff.reshape(1) > 1), 0); ts->set_failed_test_info(cvtest::TS::FAIL_BAD_ACCURACY); -- 2.7.4