added SIMD optimization of Edge-Aware Demosaicing in case of CV_8U
authorIlya Lavrenov <ilya.lavrenov@itseez.com>
Mon, 10 Dec 2012 09:29:08 +0000 (13:29 +0400)
committerIlya Lavrenov <ilya.lavrenov@itseez.com>
Mon, 10 Dec 2012 09:29:08 +0000 (13:29 +0400)
modules/imgproc/src/demosaicing.cpp
modules/imgproc/test/test_color.cpp

index 227fb41..f5cbde9 100644 (file)
@@ -63,6 +63,11 @@ public:
     {
         return 0;
     }
+
+    int bayer2RGB_EA(const T*, int, T*, int, int) const
+    {
+        return 0;
+    }
 };
 
 #if CV_SSE2
@@ -129,6 +134,7 @@ public:
          G R G R | G R G R | G R G R | G R G R
          B G B G | B G B G | B G B G | B G B G
          */
+
         __m128i delta1 = _mm_set1_epi16(1), delta2 = _mm_set1_epi16(2);
         __m128i mask = _mm_set1_epi16(blue < 0 ? -1 : 0), z = _mm_setzero_si128();
         __m128i masklo = _mm_set1_epi16(0x00ff);
@@ -141,10 +147,11 @@ public:
             __m128i r2 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step*2));
 
             __m128i b1 = _mm_add_epi16(_mm_and_si128(r0, masklo), _mm_and_si128(r2, masklo));
-            __m128i b0 = _mm_add_epi16(b1, _mm_srli_si128(b1, 2));
-            b1 = _mm_srli_si128(b1, 2);
-            b1 = _mm_srli_epi16(_mm_add_epi16(b1, delta1), 1);
+            __m128i nextb1 = _mm_srli_si128(b1, 2);
+            __m128i b0 = _mm_add_epi16(b1, nextb1);
+            b1 = _mm_srli_epi16(_mm_add_epi16(nextb1, delta1), 1);
             b0 = _mm_srli_epi16(_mm_add_epi16(b0, delta2), 2);
+            // b0 b2 ... b14 b1 b3 ... b15
             b0 = _mm_packus_epi16(b0, b1);
 
             __m128i g0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_srli_epi16(r2, 8));
@@ -152,38 +159,42 @@ public:
             g0 = _mm_add_epi16(g0, _mm_add_epi16(g1, _mm_srli_si128(g1, 2)));
             g1 = _mm_srli_si128(g1, 2);
             g0 = _mm_srli_epi16(_mm_add_epi16(g0, delta2), 2);
+            // g0 g2 ... g14 g1 g3 ... g15
             g0 = _mm_packus_epi16(g0, g1);
 
             r0 = _mm_srli_epi16(r1, 8);
             r1 = _mm_add_epi16(r0, _mm_srli_si128(r0, 2));
             r1 = _mm_srli_epi16(_mm_add_epi16(r1, delta1), 1);
+            // r0 r2 ... r14 r1 r3 ... r15
             r0 = _mm_packus_epi16(r0, r1);
 
             b1 = _mm_and_si128(_mm_xor_si128(b0, r0), mask);
             b0 = _mm_xor_si128(b0, b1);
             r0 = _mm_xor_si128(r0, b1);
 
-            // b1 g1 b1 g1 ...
+            // b1 g1 b3 g3 b5 g5...
             b1 = _mm_unpackhi_epi8(b0, g0);
             // b0 g0 b2 g2 b4 g4 ....
             b0 = _mm_unpacklo_epi8(b0, g0);
 
-            // r1 0 r3 0 ...
+            // r1 0 r3 0 r5 0 ...
             r1 = _mm_unpackhi_epi8(r0, z);
             // r0 0 r2 0 r4 0 ...
             r0 = _mm_unpacklo_epi8(r0, z);
 
-            // 0 b0 g0 r0 0 b2 g2 r2 ...
+            // 0 b0 g0 r0 0 b2 g2 r2 ...
             g0 = _mm_slli_si128(_mm_unpacklo_epi16(b0, r0), 1);
-            // 0 b8 g8 r8 0 b10 g10 r10 ...
+            // 0 b8 g8 r8 0 b10 g10 r10 ...
             g1 = _mm_slli_si128(_mm_unpackhi_epi16(b0, r0), 1);
 
-            // b1 g1 r1 0 b3 g3 r3 ....
+            // b1 g1 r1 0 b3 g3 r3 ...
             r0 = _mm_unpacklo_epi16(b1, r1);
-            // b9 g9 r9 0 ...
+            // b9 g9 r9 0 b11 g11 r11 0 ...
             r1 = _mm_unpackhi_epi16(b1, r1);
 
+            // 0 b0 g0 r0 b1 g1 r1 0 ...
             b0 = _mm_srli_si128(_mm_unpacklo_epi32(g0, r0), 1);
+            // 0 b4 g4 r4 b5 g5 r5 0 ...
             b1 = _mm_srli_si128(_mm_unpackhi_epi32(g0, r0), 1);
 
             _mm_storel_epi64((__m128i*)(dst-1+0), b0);
@@ -191,7 +202,9 @@ public:
             _mm_storel_epi64((__m128i*)(dst-1+6*2), b1);
             _mm_storel_epi64((__m128i*)(dst-1+6*3), _mm_srli_si128(b1, 8));
 
+            // 0 b8 g8 r8 b9 g9 r9 0 ...
             g0 = _mm_srli_si128(_mm_unpacklo_epi32(g1, r1), 1);
+            // 0 b12 g12 r12 b13 g13 r13 0 ...
             g1 = _mm_srli_si128(_mm_unpackhi_epi32(g1, r1), 1);
 
             _mm_storel_epi64((__m128i*)(dst-1+6*4), g0);
@@ -203,6 +216,109 @@ public:
         return (int)(bayer - (bayer_end - width));
     }
 
+    int bayer2RGB_EA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const
+    {
+        if (!use_simd)
+            return 0;
+
+        const uchar* bayer_end = bayer + width;
+        __m128i masklow = _mm_set1_epi16(0x00ff);
+        __m128i delta1 = _mm_set1_epi16(1), delta2 = _mm_set1_epi16(2);
+        __m128i full = _mm_set1_epi16(-1), z = _mm_setzero_si128();
+        __m128i mask = _mm_set1_epi16(blue > 0 ? -1 : 0);
+
+        for ( ; bayer <= bayer_end - 18; bayer += 14, dst += 42)
+        {
+            /*
+             B G B G | B G B G | B G B G | B G B G
+             G R G R | G R G R | G R G R | G R G R
+             B G B G | B G B G | B G B G | B G B G
+             */
+
+            __m128i r0 = _mm_loadu_si128((const __m128i*)bayer);
+            __m128i r1 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step));
+            __m128i r2 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step*2));
+
+            __m128i b1 = _mm_add_epi16(_mm_and_si128(r0, masklow), _mm_and_si128(r2, masklow));
+            __m128i nextb1 = _mm_srli_si128(b1, 2);
+            __m128i b0 = _mm_add_epi16(b1, nextb1);
+            b1 = _mm_srli_epi16(_mm_add_epi16(nextb1, delta1), 1);
+            b0 = _mm_srli_epi16(_mm_add_epi16(b0, delta2), 2);
+            // b0 b2 ... b14 b1 b3 ... b15
+            b0 = _mm_packus_epi16(b0, b1);
+
+            // vertical sum
+            __m128i r0g = _mm_srli_epi16(r0, 8);
+            __m128i r2g = _mm_srli_epi16(r2, 8);
+            __m128i sumv = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(r0g, r2g), delta1), 1);
+            // gorizontal sum
+            __m128i g1 = _mm_and_si128(masklow, r1);
+            __m128i nextg1 = _mm_srli_si128(g1, 2);
+            __m128i sumg = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(g1, nextg1), delta1), 1);
+
+            // gradients
+            __m128i gradv = _mm_adds_epi16(_mm_subs_epu16(r0g, r2g), _mm_subs_epu16(r2g, r0g));
+            __m128i gradg = _mm_adds_epi16(_mm_subs_epu16(nextg1, g1), _mm_subs_epu16(g1, nextg1));
+            __m128i gmask = _mm_cmpgt_epi16(gradg, gradv);
+
+            __m128i g0 = _mm_add_epi16(_mm_and_si128(gmask, sumv), _mm_and_si128(sumg, _mm_xor_si128(gmask, full)));
+            // g0 g2 ... g14 g1 g3 ...
+            g0 = _mm_packus_epi16(g0, nextg1);
+
+            r0 = _mm_srli_epi16(r1, 8);
+            r1 = _mm_add_epi16(r0, _mm_srli_si128(r0, 2));
+            r1 = _mm_srli_epi16(_mm_add_epi16(r1, delta1), 1);
+            // r0 r2 ... r14 r1 r3 ... r15
+            r0 = _mm_packus_epi16(r0, r1);
+
+            b1 = _mm_and_si128(_mm_xor_si128(b0, r0), mask);
+            b0 = _mm_xor_si128(b0, b1);
+            r0 = _mm_xor_si128(r0, b1);
+
+            // b1 g1 b3 g3 b5 g5...
+            b1 = _mm_unpackhi_epi8(b0, g0);
+            // b0 g0 b2 g2 b4 g4 ....
+            b0 = _mm_unpacklo_epi8(b0, g0);
+
+            // r1 0 r3 0 r5 0 ...
+            r1 = _mm_unpackhi_epi8(r0, z);
+            // r0 0 r2 0 r4 0 ...
+            r0 = _mm_unpacklo_epi8(r0, z);
+
+            // 0 b0 g0 r0 0 b2 g2 r2 ...
+            g0 = _mm_slli_si128(_mm_unpacklo_epi16(b0, r0), 1);
+            // 0 b8 g8 r8 0 b10 g10 r10 ...
+            g1 = _mm_slli_si128(_mm_unpackhi_epi16(b0, r0), 1);
+
+            // b1 g1 r1 0 b3 g3 r3 0 ...
+            r0 = _mm_unpacklo_epi16(b1, r1);
+            // b9 g9 r9 0 b11 g11 r11 0 ...
+            r1 = _mm_unpackhi_epi16(b1, r1);
+
+            // 0 b0 g0 r0 b1 g1 r1 0 ...
+            b0 = _mm_srli_si128(_mm_unpacklo_epi32(g0, r0), 1);
+            // 0 b4 g4 r4 b5 g5 r5 0 ...
+            b1 = _mm_srli_si128(_mm_unpackhi_epi32(g0, r0), 1);
+
+            _mm_storel_epi64((__m128i*)(dst+0), b0);
+            _mm_storel_epi64((__m128i*)(dst+6*1), _mm_srli_si128(b0, 8));
+            _mm_storel_epi64((__m128i*)(dst+6*2), b1);
+            _mm_storel_epi64((__m128i*)(dst+6*3), _mm_srli_si128(b1, 8));
+
+            // 0 b8 g8 r8 b9 g9 r9 0 ...
+            g0 = _mm_srli_si128(_mm_unpacklo_epi32(g1, r1), 1);
+            // 0 b12 g12 r12 b13 g13 r13 0 ...
+            g1 = _mm_srli_si128(_mm_unpackhi_epi32(g1, r1), 1);
+
+            _mm_storel_epi64((__m128i*)(dst+6*4), g0);
+            _mm_storel_epi64((__m128i*)(dst+6*5), _mm_srli_si128(g0, 8));
+
+            _mm_storel_epi64((__m128i*)(dst+6*6), g1);
+        }
+
+        return bayer - (bayer_end - width);
+    }
+
     bool use_simd;
 };
 #else
@@ -1173,7 +1289,7 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )
 
 //////////////////////////////// Edge-Aware Demosaicing //////////////////////////////////
 
-template <typename T>
+template <typename T, typename SIMDInterpolator>
 class Bayer2RGB_EdgeAware_T_Invoker :
     public cv::ParallelLoopBody
 {
@@ -1191,6 +1307,7 @@ public:
         int dcn2 = dcn<<1;
         int start_with_green = Start_with_green, blue = Blue;
         int sstep = src.step / src.elemSize1(), dstep = dst.step / dst.elemSize1();
+        SIMDInterpolator vecOp;
 
         const T* S = reinterpret_cast<const T*>(src.data + (range.start + 1) * src.step) + 1;
         T* D = reinterpret_cast<T*>(dst.data + (range.start + 1) * dst.step) + dcn;
@@ -1215,6 +1332,11 @@ public:
                 ++x;
             }
 
+            int delta = vecOp.bayer2RGB_EA(S - sstep - 1, sstep, D, size.width, blue);
+            x += delta;
+            S += delta;
+            D += dcn * delta;
+
             if (blue)
                 for (; x < size.width; x += 2, S += 2, D += dcn2)
                 {
@@ -1227,7 +1349,7 @@ public:
                     D[5] = (S[-sstep+1] + S[sstep+1] + 1) >> 1;
                 }
             else
-                for (; x < size.width; x += 2, S += 2, D += dcn2    )
+                for (; x < size.width; x += 2, S += 2, D += dcn2)
                 {
                     D[0] = (S[-sstep-1] + S[-sstep+1] + S[sstep-1] + S[sstep+1] + 2) >> 2;
                     D[1] = (std::abs(S[-1] - S[1]) > std::abs(S[sstep] - S[-sstep]) ? (S[sstep] + S[-sstep] + 1) : (S[-1] + S[1] + 1)) >> 1;
@@ -1267,7 +1389,7 @@ private:
     int Blue, Start_with_green;
 };
 
-template <typename T>
+template <typename T, typename SIMDInterpolator>
 static void Bayer2RGB_EdgeAware_T(const Mat& src, Mat& dst, int code)
 {
     Size size = src.size();
@@ -1287,7 +1409,7 @@ static void Bayer2RGB_EdgeAware_T(const Mat& src, Mat& dst, int code)
 
     if (size.height > 0)
     {
-        Bayer2RGB_EdgeAware_T_Invoker<T> invoker(src, dst, size, blue, start_with_green);
+        Bayer2RGB_EdgeAware_T_Invoker<T, SIMDInterpolator> invoker(src, dst, size, blue, start_with_green);
         Range range(0, size.height);
         parallel_for_(range, invoker, dst.total()/static_cast<double>(1<<16));
     }
@@ -1380,9 +1502,9 @@ void cv::demosaicing(InputArray _src, OutputArray _dst, int code, int dcn)
         dst = _dst.getMat();
 
         if (depth == CV_8U)
-            Bayer2RGB_EdgeAware_T<uchar>(src, dst, code);
+            Bayer2RGB_EdgeAware_T<uchar, SIMDBayerInterpolator_8u>(src, dst, code);
         else if (depth == CV_16U)
-            Bayer2RGB_EdgeAware_T<ushort>(src, dst, code);
+            Bayer2RGB_EdgeAware_T<ushort, SIMDBayerStubInterpolator_<ushort> >(src, dst, code);
         else
             CV_Error(CV_StsUnsupportedFormat, "Bayer->RGB Edge-Aware demosaicing only currently supports 8u and 16u types");
 
index afb5ccc..33b5bcf 100644 (file)
@@ -1991,8 +1991,8 @@ static void test_Bayer2RGB_EdgeAware_8u(const Mat& src, Mat& dst, int code)
             {
                 // red
                 D[0] = S[0];
-                D[1] = (std::abs(S[-1] - S[1]) > std::abs(S[step] - S[-step]) ? (S[step] + S[-step]) : (S[-1] + S[1])) / 2;
-                D[2] = ((S[-step-1] + S[-step+1] + S[step-1] + S[step+1]) / 4);
+                D[1] = (std::abs(S[-1] - S[1]) > std::abs(S[step] - S[-step]) ? (S[step] + S[-step] + 1) : (S[-1] + S[1] + 1)) / 2;
+                D[2] = ((S[-step-1] + S[-step+1] + S[step-1] + S[step+1] + 2) / 4);
                 if (!blue)
                     std::swap(D[0], D[2]);
             }
@@ -2002,8 +2002,8 @@ static void test_Bayer2RGB_EdgeAware_8u(const Mat& src, Mat& dst, int code)
             for (int x = 1; x < size.width; x += 2, S += 2, D += 2*dcn)
             {
                 D[0] = S[0];
-                D[1] = (std::abs(S[-1] - S[1]) > std::abs(S[step] - S[-step]) ? (S[step] + S[-step]) : (S[-1] + S[1])) / 2;
-                D[2] = ((S[-step-1] + S[-step+1] + S[step-1] + S[step+1]) / 4);
+                D[1] = (std::abs(S[-1] - S[1]) > std::abs(S[step] - S[-step]) ? (S[step] + S[-step] + 1) : (S[-1] + S[1] + 1)) / 2;
+                D[2] = ((S[-step-1] + S[-step+1] + S[step-1] + S[step+1] + 2) / 4);
                 if (!blue)
                     std::swap(D[0], D[2]);
             }
@@ -2013,9 +2013,9 @@ static void test_Bayer2RGB_EdgeAware_8u(const Mat& src, Mat& dst, int code)
 
             for (int x = 2; x < size.width; x += 2, S += 2, D += 2*dcn)
             {
-                D[0] = (S[-1] + S[1]) / 2;
+                D[0] = (S[-1] + S[1] + 1) / 2;
                 D[1] = S[0];
-                D[2] = (S[-step] + S[step]) / 2;
+                D[2] = (S[-step] + S[step] + 1) / 2;
                 if (!blue)
                     std::swap(D[0], D[2]);
             }
@@ -2051,7 +2051,9 @@ static void checkData(const Mat& actual, const Mat& reference, cvtest::TS* ts, c
     EXPECT_EQ(actual.depth(), reference.depth());
 
     Size size = reference.size();
-    size.width *= reference.channels();
+    int dcn = reference.channels();
+    size.width *= dcn;
+
     for (int y = 0; y < size.height && next; ++y)
     {
         const T* A = reinterpret_cast<const T*>(actual.data + actual.step * y);
@@ -2064,10 +2066,15 @@ static void checkData(const Mat& actual, const Mat& reference, cvtest::TS* ts, c
                 ts->printf(SUM, "\nReference value: %d\n", static_cast<int>(R[x]));
                 ts->printf(SUM, "Actual value: %d\n", static_cast<int>(A[x]));
                 ts->printf(SUM, "(y, x): (%d, %d)\n", y, x / reference.channels());
+                ts->printf(SUM, "Channel pos: %d\n", x % reference.channels());
                 ts->printf(SUM, "Pattern: %s\n", type);
                 ts->printf(SUM, "Bayer image type: %s", bayer_type);
                 #undef SUM
 
+                Mat diff;
+                absdiff(actual, reference, diff);
+                EXPECT_EQ(countNonZero(diff.reshape(1) > 1), 0);
+
                 ts->set_failed_test_info(cvtest::TS::FAIL_BAD_ACCURACY);
                 ts->set_gtest_status();
 
@@ -2173,12 +2180,6 @@ TEST(ImgProc_Bayer2RGBA, accuracy)
 
                     Mat diff;
                     absdiff(actual, reference, diff);
-
-                    cv::Rect r(0, ssize.height - 5, 7, 5);
-                    std::cout << "Actual: " << std::endl << actual(r) << std::endl << std::endl;
-                    std::cout << "Reference: " << std::endl << reference(r) << std::endl << std::endl;
-                    std::cout << "Difference: " << std::endl << diff(r) << std::endl << std::endl;
-
                     EXPECT_EQ(countNonZero(diff.reshape(1) > 1), 0);
 
                     ts->set_failed_test_info(cvtest::TS::FAIL_BAD_ACCURACY);