From: Andrey Kamaev
Date: Wed, 28 Mar 2012 20:20:29 +0000 (+0000)
Subject: #1365 Fixed numerous bugs in Bayer2RGB_VNG_8u SSE optimization, added simple regressi...
X-Git-Tag: accepted/2.0/20130307.220821~944
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=09490188b3a1a62d4652528150bf6fd916ce4d09;p=profile%2Fivi%2Fopencv.git

#1365 Fixed numerous bugs in Bayer2RGB_VNG_8u SSE optimization, added simple regression test
---

diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp
index f1d60df..d345af3 100644
--- a/modules/imgproc/src/color.cpp
+++ b/modules/imgproc/src/color.cpp
@@ -2234,7 +2234,7 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )
         bool greenCell = greenCell0;
 
         i = 2;
-#if CV_SSE2
+#if CV_SSE2 
         int limit = !haveSSE ? N-2 : greenCell ? std::min(3, N-2) : 2;
 #else
         int limit = N - 2;
@@ -2401,202 +2401,218 @@ static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )
                 dstrow[blueIdx^2] = CV_CAST_8U(R);
                 greenCell = !greenCell;
             }
-            
+
 #if CV_SSE2
             if( !haveSSE )
                 break;
-            __m128i emask    = _mm_set1_epi32(0x0000ffff),
-                    omask    = _mm_set1_epi32(0xffff0000),
-                    all_ones = _mm_set1_epi16(1),
-                    z        = _mm_setzero_si128();
-            __m128 _0_5 = _mm_set1_ps(0.5f);
+            __m128i emask    = _mm_set1_epi32(0x0000ffff),
+                    omask    = _mm_set1_epi32(0xffff0000),
+                    z        = _mm_setzero_si128();
+            __m128 _0_5 = _mm_set1_ps(0.5f);
 
-            #define _mm_merge_epi16(a, b) \
-                _mm_or_si128(_mm_and_si128(a, emask), _mm_and_si128(b, omask))
-            #define _mm_cvtloepi16_ps(a) _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(a,a), 16))
-            #define _mm_cvthiepi16_ps(a) _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(a,a), 16))
+            #define _mm_merge_epi16(a, b) _mm_or_si128(_mm_and_si128(a, emask), _mm_and_si128(b, omask)) //(aA_aA_aA_aA) * (bB_bB_bB_bB) => (bA_bA_bA_bA)
+            #define _mm_cvtloepi16_ps(a)  _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(a,a), 16))   //(1,2,3,4,5,6,7,8) => (1f,2f,3f,4f)
+            #define _mm_cvthiepi16_ps(a)  _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(a,a), 16))   //(1,2,3,4,5,6,7,8) => (5f,6f,7f,8f)
+            #define _mm_loadl_u8_s16(ptr, offset)  _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)((ptr) + (offset))), z) //load 8 uchars to 8 shorts
 
             // process 8 pixels at once
             for( ; i <= N - 10; i += 8, srow += 8, brow0 += 8, brow1 += 8, brow2 += 8 )
             {
-                __m128i gradN, gradS, gradW, gradE, gradNE, gradSW, gradNW, gradSE;
-                gradN = _mm_adds_epu16(_mm_loadu_si128((__m128i*)brow0),
-                                       _mm_loadu_si128((__m128i*)brow1));
-                gradS = _mm_adds_epu16(_mm_loadu_si128((__m128i*)brow1),
-                                       _mm_loadu_si128((__m128i*)brow2));
-                gradW = _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow1+N-1)),
-                                       _mm_loadu_si128((__m128i*)(brow1+N)));
-                gradE = _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow1+N+1)),
-                                       _mm_loadu_si128((__m128i*)(brow1+N)));
+                //int gradN = brow0[0] + brow1[0];
+                __m128i gradN = _mm_adds_epi16(_mm_loadu_si128((__m128i*)brow0), _mm_loadu_si128((__m128i*)brow1));
+
+                //int gradS = brow1[0] + brow2[0];
+                __m128i gradS = _mm_adds_epi16(_mm_loadu_si128((__m128i*)brow1), _mm_loadu_si128((__m128i*)brow2));
+
+                //int gradW = brow1[N-1] + brow1[N];
+                __m128i gradW = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N-1)), _mm_loadu_si128((__m128i*)(brow1+N)));
+
+                //int gradE = brow1[N+1] + brow1[N];
+                __m128i gradE = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N+1)), _mm_loadu_si128((__m128i*)(brow1+N)));
 
-                __m128i minGrad, maxGrad, T;
-                minGrad = _mm_min_epi16(_mm_min_epi16(_mm_min_epi16(gradN, gradS), gradW), gradE);
-                maxGrad = _mm_max_epi16(_mm_max_epi16(_mm_max_epi16(gradN, gradS), gradW), gradE);
+                //int minGrad = std::min(std::min(std::min(gradN, gradS), gradW), gradE);
+                //int maxGrad = std::max(std::max(std::max(gradN, gradS), gradW), gradE);
+                __m128i minGrad = _mm_min_epi16(_mm_min_epi16(gradN, gradS), _mm_min_epi16(gradW, gradE));
+                __m128i maxGrad = _mm_max_epi16(_mm_max_epi16(gradN, gradS), _mm_max_epi16(gradW, gradE));
 
                 __m128i grad0, grad1;
-                grad0 = _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow0+N4+1)),
-                                       _mm_loadu_si128((__m128i*)(brow1+N4)));
-                grad1 = _mm_adds_epu16(_mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow0+N2)),
-                                                      _mm_loadu_si128((__m128i*)(brow0+N2+1))),
-                                       _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow1+N2)),
-                                                      _mm_loadu_si128((__m128i*)(brow1+N2+1))));
-                gradNE = _mm_srli_epi16(_mm_merge_epi16(grad0, grad1), 1);
+                //int gradNE = brow0[N4+1] + brow1[N4];
+                //int gradNE = brow0[N2] + brow0[N2+1] + brow1[N2] + brow1[N2+1];
+                grad0 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow0+N4+1)), _mm_loadu_si128((__m128i*)(brow1+N4)));
+                grad1 = _mm_adds_epi16( _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow0+N2)), _mm_loadu_si128((__m128i*)(brow0+N2+1))),
+                                        _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N2)), _mm_loadu_si128((__m128i*)(brow1+N2+1))));
+                __m128i gradNE = _mm_merge_epi16(grad0, grad1);
 
-                grad0 = _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow2+N4-1)),
-                                       _mm_loadu_si128((__m128i*)(brow1+N4)));
-                grad1 = _mm_adds_epu16(_mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow2+N2)),
-                                                      _mm_loadu_si128((__m128i*)(brow2+N2-1))),
-                                       _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow1+N2)),
-                                                      _mm_loadu_si128((__m128i*)(brow1+N2-1))));
-                gradSW = _mm_srli_epi16(_mm_merge_epi16(grad0, grad1), 1);
+                //int gradSW = brow1[N4] + brow2[N4-1];
+                //int gradSW = brow1[N2] + brow1[N2-1] + brow2[N2] + brow2[N2-1];
+                grad0 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow2+N4-1)), _mm_loadu_si128((__m128i*)(brow1+N4)));
+                grad1 = _mm_adds_epi16(_mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow2+N2)), _mm_loadu_si128((__m128i*)(brow2+N2-1))),
+                                       _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N2)), _mm_loadu_si128((__m128i*)(brow1+N2-1))));
+                __m128i gradSW = _mm_merge_epi16(grad0, grad1);
 
                 minGrad = _mm_min_epi16(_mm_min_epi16(minGrad, gradNE), gradSW);
                 maxGrad = _mm_max_epi16(_mm_max_epi16(maxGrad, gradNE), gradSW);
+
+                //int gradNW = brow0[N5-1] + brow1[N5];
+                //int gradNW = brow0[N3] + brow0[N3-1] + brow1[N3] + brow1[N3-1];
+                grad0 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow0+N5-1)), _mm_loadu_si128((__m128i*)(brow1+N5)));
+                grad1 = _mm_adds_epi16(_mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow0+N3)), _mm_loadu_si128((__m128i*)(brow0+N3-1))),
+                                       _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N3)), _mm_loadu_si128((__m128i*)(brow1+N3-1))));
+                __m128i gradNW = _mm_merge_epi16(grad0, grad1);
 
-                grad0 = _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow0+N5-1)),
-                                       _mm_loadu_si128((__m128i*)(brow1+N5)));
-                grad1 = _mm_adds_epu16(_mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow0+N3)),
-                                                      _mm_loadu_si128((__m128i*)(brow0+N3-1))),
-                                       _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow1+N3)),
-                                                      _mm_loadu_si128((__m128i*)(brow1+N3-1))));
-                gradNW = _mm_srli_epi16(_mm_merge_epi16(grad0, grad1), 1);
-
-                grad0 = _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow2+N5+1)),
-                                       _mm_loadu_si128((__m128i*)(brow1+N5)));
-                grad1 = _mm_adds_epu16(_mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow2+N3)),
-                                                      _mm_loadu_si128((__m128i*)(brow2+N3+1))),
-                                       _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow1+N3)),
-                                                      _mm_loadu_si128((__m128i*)(brow1+N3+1))));
-                gradSE = _mm_srli_epi16(_mm_merge_epi16(grad0, grad1), 1);
+                //int gradSE = brow1[N5] + brow2[N5+1];
+                //int gradSE = brow1[N3] + brow1[N3+1] + brow2[N3] + brow2[N3+1];
+                grad0 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow2+N5+1)), _mm_loadu_si128((__m128i*)(brow1+N5)));
+                grad1 = _mm_adds_epi16(_mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow2+N3)), _mm_loadu_si128((__m128i*)(brow2+N3+1))),
+                                       _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N3)), _mm_loadu_si128((__m128i*)(brow1+N3+1))));
+                __m128i gradSE = _mm_merge_epi16(grad0, grad1);
 
                 minGrad = _mm_min_epi16(_mm_min_epi16(minGrad, gradNW), gradSE);
                 maxGrad = _mm_max_epi16(_mm_max_epi16(maxGrad, gradNW), gradSE);
 
-                T = _mm_add_epi16(_mm_srli_epi16(maxGrad, 1), minGrad);
-                __m128i RGs = z, GRs = z, Bs = z, ng = z, mask;
-
-                __m128i t0, t1, x0, x1, x2, x3, x4, x5, x6, x7, x8,
-                        x9, x10, x11, x12, x13, x14, x15, x16;
-
-                x0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)srow), z);
-
-                x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep-1)), z);
-                x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep*2-1)), z);
-                x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep)), z);
-                x4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep*2+1)), z);
-                x5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep+1)), z);
-                x6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep+2)), z);
-                x7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+1)), z);
-                x8 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep+2)), z);
-                x9 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep+1)), z);
-                x10 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep*2+1)), z);
-                x11 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep)), z);
-                x12 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep*2-1)), z);
-                x13 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep-1)), z);
-                x14 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep-2)), z);
-                x15 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-1)), z);
-                x16 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep-2)), z);
-
-                // gradN
-                mask = _mm_cmpgt_epi16(T, gradN);
-                ng = _mm_sub_epi16(ng, mask);
-
-                t0 = _mm_slli_epi16(x3, 1);
-                t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep*2)), z), x0);
-
-                RGs = _mm_adds_epu16(RGs, _mm_and_si128(t1, mask));
-                GRs = _mm_adds_epu16(GRs, _mm_and_si128(_mm_merge_epi16(t0, _mm_adds_epu16(x2,x4)), mask));
-                Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epu16(x1,x5), t0), mask));
-
-                // gradNE
-                mask = _mm_cmpgt_epi16(T, gradNE);
-                ng = _mm_sub_epi16(ng, mask);
-
-                t0 = _mm_slli_epi16(x5, 1);
-                t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep*2+2)), z), x0);
+                //int T = minGrad + maxGrad/2;
+                __m128i T = _mm_adds_epi16(_mm_srli_epi16(maxGrad, 1), minGrad);
+
+                __m128i RGs = z, GRs = z, Bs = z, ng = z;
 
-                RGs = _mm_adds_epu16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
-                GRs = _mm_adds_epu16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow0+N6+1)),
-                                                                        _mm_adds_epu16(x4,x7)), mask));
-                Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(t0,_mm_adds_epu16(x3,x6)), mask));
+                __m128i x0  = _mm_loadl_u8_s16(srow, +0          );
+                __m128i x1  = _mm_loadl_u8_s16(srow, -1 - bstep  );
+                __m128i x2  = _mm_loadl_u8_s16(srow, -1 - bstep*2);
+                __m128i x3  = _mm_loadl_u8_s16(srow,    - bstep  );
+                __m128i x4  = _mm_loadl_u8_s16(srow, +1 - bstep*2);
+                __m128i x5  = _mm_loadl_u8_s16(srow, +1 - bstep  );
+                __m128i x6  = _mm_loadl_u8_s16(srow, +2 - bstep  );
+                __m128i x7  = _mm_loadl_u8_s16(srow, +1          );
+                __m128i x8  = _mm_loadl_u8_s16(srow, +2 + bstep  );
+                __m128i x9  = _mm_loadl_u8_s16(srow, +1 + bstep  );
+                __m128i x10 = _mm_loadl_u8_s16(srow, +1 + bstep*2);
+                __m128i x11 = _mm_loadl_u8_s16(srow,    + bstep  );
+                __m128i x12 = _mm_loadl_u8_s16(srow, -1 + bstep*2);
+                __m128i x13 = _mm_loadl_u8_s16(srow, -1 + bstep  );
+                __m128i x14 = _mm_loadl_u8_s16(srow, -2 + bstep  );
+                __m128i x15 = _mm_loadl_u8_s16(srow, -1          );
+                __m128i x16 = _mm_loadl_u8_s16(srow, -2 - bstep  );
+
+                __m128i t0, t1, mask;
 
-                // gradE
-                mask = _mm_cmpgt_epi16(T, gradE);
-                ng = _mm_sub_epi16(ng, mask);
+                // gradN ***********************************************
+                mask = _mm_cmpgt_epi16(T, gradN);                          // mask = T>gradN
+                ng = _mm_sub_epi16(ng, mask);                              // ng += (T>gradN)
 
-                t0 = _mm_slli_epi16(x7, 1);
-                t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+2)), z), x0);
+                t0 = _mm_slli_epi16(x3, 1);                                // srow[-bstep]*2
+                t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, -bstep*2), x0); // srow[-bstep*2] + srow[0]
 
-                RGs = _mm_adds_epu16(RGs, _mm_and_si128(t1, mask));
-                GRs = _mm_adds_epu16(GRs, _mm_and_si128(t0, mask));
-                Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epu16(x5,x9),
-                                                                      _mm_adds_epu16(x6,x8)), mask));
+                // RGs += (srow[-bstep*2] + srow[0]) * (T>gradN)
+                RGs = _mm_adds_epi16(RGs, _mm_and_si128(t1, mask));
+                // GRs += {srow[-bstep]*2; (srow[-bstep*2-1] + srow[-bstep*2+1])} * (T>gradN)
+                GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(t0, _mm_adds_epi16(x2,x4)), mask));
+                // Bs += {(srow[-bstep-1]+srow[-bstep+1]); srow[-bstep]*2 } * (T>gradN)
+                Bs = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epi16(x1,x5), t0), mask));
 
-                // gradSE
-                mask = _mm_cmpgt_epi16(T, gradSE);
-                ng = _mm_sub_epi16(ng, mask);
+                // gradNE **********************************************
+                mask = _mm_cmpgt_epi16(T, gradNE);                             // mask = T>gradNE
+                ng = _mm_sub_epi16(ng, mask);                                  // ng += (T>gradNE)
+
+                t0 = _mm_slli_epi16(x5, 1);                                    // srow[-bstep+1]*2
+                t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, -bstep*2+2), x0);   // srow[-bstep*2+2] + srow[0]
 
-                t0 = _mm_slli_epi16(x9, 1);
-                t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep*2+2)), z), x0);
+                // RGs += {(srow[-bstep*2+2] + srow[0]); srow[-bstep+1]*2} * (T>gradNE)
+                RGs = _mm_adds_epi16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
+                // GRs += {brow0[N6+1]; (srow[-bstep*2+1] + srow[1])} * (T>gradNE)
+                GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow0+N6+1)), _mm_adds_epi16(x4,x7)), mask));
+                // Bs += {srow[-bstep+1]*2; (srow[-bstep] + srow[-bstep+2])} * (T>gradNE)
+                Bs = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(t0,_mm_adds_epi16(x3,x6)), mask));
 
-                RGs = _mm_adds_epu16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
-                GRs = _mm_adds_epu16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow2+N6+1)),
-                                                                        _mm_adds_epu16(x7,x10)), mask));
-                Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(t0, _mm_adds_epu16(x8,x11)), mask));
+                // gradE ***********************************************
+                mask = _mm_cmpgt_epi16(T, gradE);                        // mask = T>gradE
+                ng = _mm_sub_epi16(ng, mask);                            // ng += (T>gradE)
 
-                // gradS
-                mask = _mm_cmpgt_epi16(T, gradS);
-                ng = _mm_sub_epi16(ng, mask);
+                t0 = _mm_slli_epi16(x7, 1);                              // srow[1]*2
+                t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, 2), x0);      // srow[2] + srow[0]
+
+                // RGs += (srow[2] + srow[0]) * (T>gradE)
+                RGs = _mm_adds_epi16(RGs, _mm_and_si128(t1, mask));
+                // GRs += (srow[1]*2) * (T>gradE)
+                GRs = _mm_adds_epi16(GRs, _mm_and_si128(t0, mask));
+                // Bs += {(srow[-bstep+1]+srow[bstep+1]); (srow[-bstep+2]+srow[bstep+2])} * (T>gradE)
+                Bs = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epi16(x5,x9), _mm_adds_epi16(x6,x8)), mask));
 
-                t0 = _mm_slli_epi16(x11, 1);
-                t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep*2)), z), x0);
+                // gradSE **********************************************
+                mask = _mm_cmpgt_epi16(T, gradSE);                           // mask = T>gradSE
+                ng = _mm_sub_epi16(ng, mask);                                // ng += (T>gradSE)
 
-                RGs = _mm_adds_epu16(RGs, _mm_and_si128(t1, mask));
-                GRs = _mm_adds_epu16(GRs, _mm_and_si128(_mm_merge_epi16(t0, _mm_adds_epu16(x10,x12)), mask));
-                Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epu16(x9,x13), t0), mask));
+                t0 = _mm_slli_epi16(x9, 1);                                  // srow[bstep+1]*2
+                t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, bstep*2+2), x0);  // srow[bstep*2+2] + srow[0]
+
+                // RGs += {(srow[bstep*2+2] + srow[0]); srow[bstep+1]*2} * (T>gradSE)
+                RGs = _mm_adds_epi16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
+                // GRs += {brow2[N6+1]; (srow[1]+srow[bstep*2+1])} * (T>gradSE)
+                GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow2+N6+1)), _mm_adds_epi16(x7,x10)), mask));
+                // Bs += {srow[-bstep+1]*2; (srow[bstep+2]+srow[bstep])} * (T>gradSE)
+                Bs = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_slli_epi16(x5, 1), _mm_adds_epi16(x8,x11)), mask));
 
-                // gradSW
-                mask = _mm_cmpgt_epi16(T, gradSW);
-                ng = _mm_sub_epi16(ng, mask);
+                // gradS ***********************************************
+                mask = _mm_cmpgt_epi16(T, gradS);                        // mask = T>gradS
+                ng = _mm_sub_epi16(ng, mask);                            // ng += (T>gradS)
 
-                t0 = _mm_slli_epi16(x13, 1);
-                t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep*2-2)), z), x0);
+                t0 = _mm_slli_epi16(x11, 1);                             // srow[bstep]*2
+                t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow,bstep*2), x0); // srow[bstep*2]+srow[0]
+
+                // RGs += (srow[bstep*2]+srow[0]) * (T>gradS)
+                RGs = _mm_adds_epi16(RGs, _mm_and_si128(t1, mask));
+                // GRs += {srow[bstep]*2; (srow[bstep*2+1]+srow[bstep*2-1])} * (T>gradS)
+                GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(t0, _mm_adds_epi16(x10,x12)), mask));
+                // Bs += {(srow[bstep+1]+srow[bstep-1]); srow[bstep]*2} * (T>gradS)
+                Bs = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epi16(x9,x13), t0), mask));
 
-                RGs = _mm_adds_epu16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
-                GRs = _mm_adds_epu16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow2+N6-1)),
-                                                                        _mm_adds_epu16(x12,x15)), mask));
-                Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(t0,_mm_adds_epu16(x11,x14)), mask));
+                // gradSW **********************************************
+                mask = _mm_cmpgt_epi16(T, gradSW);                           // mask = T>gradSW
+                ng = _mm_sub_epi16(ng, mask);                                // ng += (T>gradSW)
 
-                // gradW
-                mask = _mm_cmpgt_epi16(T, gradW);
-                ng = _mm_sub_epi16(ng, mask);
+                t0 = _mm_slli_epi16(x13, 1);                                 // srow[bstep-1]*2
+                t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, bstep*2-2), x0);  // srow[bstep*2-2]+srow[0]
+
+                // RGs += {(srow[bstep*2-2]+srow[0]); srow[bstep-1]*2} * (T>gradSW)
+                RGs = _mm_adds_epi16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
+                // GRs += {brow2[N6-1]; (srow[bstep*2-1]+srow[-1])} * (T>gradSW)
+                GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow2+N6-1)), _mm_adds_epi16(x12,x15)), mask));
+                // Bs += {srow[bstep-1]*2; (srow[bstep]+srow[bstep-2])} * (T>gradSW)
+                Bs = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(t0,_mm_adds_epi16(x11,x14)), mask));
 
-                t0 = _mm_slli_epi16(x15, 1);
-                t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-2)), z), x0);
+                // gradW ***********************************************
+                mask = _mm_cmpgt_epi16(T, gradW);                        // mask = T>gradW
+                ng = _mm_sub_epi16(ng, mask);                            // ng += (T>gradW)
 
-                RGs = _mm_adds_epu16(RGs, _mm_and_si128(t1, mask));
-                GRs = _mm_adds_epu16(GRs, _mm_and_si128(t0, mask));
-                Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epu16(x1,x13),
-                                                                      _mm_adds_epu16(x14,x16)), mask));
+                t0 = _mm_slli_epi16(x15, 1);                             // srow[-1]*2
+                t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, -2), x0);     // srow[-2]+srow[0]
+
+                // RGs += (srow[-2]+srow[0]) * (T>gradW)
+                RGs = _mm_adds_epi16(RGs, _mm_and_si128(t1, mask));
+                // GRs += (srow[-1]*2) * (T>gradW)
+                GRs = _mm_adds_epi16(GRs, _mm_and_si128(t0, mask));
+                // Bs += {(srow[-bstep-1]+srow[bstep-1]); (srow[bstep-2]+srow[-bstep-2])} * (T>gradW)
+                Bs = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epi16(x1,x13), _mm_adds_epi16(x14,x16)), mask));
 
-                // gradNW
-                mask = _mm_cmpgt_epi16(T, gradNW);
-                ng = _mm_max_epi16(_mm_sub_epi16(ng, mask), all_ones);
+                // gradNW **********************************************
+                mask = _mm_cmpgt_epi16(T, gradNW);                           // mask = T>gradNW
+                ng = _mm_sub_epi16(ng, mask);                                // ng += (T>gradNW)
+
+                t0 = _mm_slli_epi16(x1, 1);                                  // srow[-bstep-1]*2
+                t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow,-bstep*2-2), x0);  // srow[-bstep*2-2]+srow[0]
+
+                // RGs += {(srow[-bstep*2-2]+srow[0]); srow[-bstep-1]*2} * (T>gradNW)
+                RGs = _mm_adds_epi16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
+                // GRs += {brow0[N6-1]; (srow[-bstep*2-1]+srow[-1])} * (T>gradNW)
+                GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow0+N6-1)), _mm_adds_epi16(x2,x15)), mask));
+                // Bs += {srow[-bstep-1]*2; (srow[-bstep]+srow[-bstep-2])} * (T>gradNW)
+                Bs = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_slli_epi16(x5, 1),_mm_adds_epi16(x3,x16)), mask));
+
                 __m128 ngf0, ngf1;
                 ngf0 = _mm_div_ps(_0_5, _mm_cvtloepi16_ps(ng));
                 ngf1 = _mm_div_ps(_0_5, _mm_cvthiepi16_ps(ng));
 
-                t0 = _mm_slli_epi16(x1, 1);
-                t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep*2-2)), z), x0);
-
-                RGs = _mm_adds_epu16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
-                GRs = _mm_adds_epu16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow0+N6-1)),
-                                                                        _mm_adds_epu16(x2,x15)), mask));
-                Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(t0,_mm_adds_epu16(x3,x16)), mask));
-
                 // now interpolate r, g & b
                 t0 = _mm_sub_epi16(GRs, RGs);
                 t1 = _mm_sub_epi16(Bs, RGs);
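
The substance of the color.cpp change above is easy to lose in the churn. The old vector path summed gradients with unsigned saturating adds (_mm_adds_epu16) and pre-halved the merged diagonal gradients with _mm_srli_epi16(..., 1), yet thresholded everything with the signed comparison _mm_cmpgt_epi16, so sums above 0x7fff can compare as negative and the wrong directions get selected; it also clamped the direction count with _mm_max_epi16(..., all_ones). The patch switches every accumulation to the signed _mm_adds_epi16, drops the extra halving and the clamp, and annotates each vector statement with its scalar equivalent. The standalone sketch below is not code from the patch (main and the sample values are invented for illustration); it isolates the compare-mask idiom the loop is built on: each 16-bit lane of the mask is 0xffff (i.e. -1) where the threshold test passes, so subtracting the mask counts passing directions and AND-ing with it accumulates contributions branch-free.

    // Minimal sketch of the compare-mask idiom used per direction above.
    #include <emmintrin.h>
    #include <stdio.h>

    int main()
    {
        short T[8]    = { 10, 10, 10, 10, 10, 10, 10, 10 };
        short grad[8] = {  3, 12,  5, 20,  9, 10,  1, 15 };

        __m128i vT    = _mm_loadu_si128((const __m128i*)T);
        __m128i vGrad = _mm_loadu_si128((const __m128i*)grad);

        __m128i mask = _mm_cmpgt_epi16(vT, vGrad);               // -1 where T > grad, else 0
        __m128i ng   = _mm_sub_epi16(_mm_setzero_si128(), mask); // ng -= -1: per-lane pass count
        __m128i sum  = _mm_and_si128(vGrad, mask);               // grad where T > grad, else 0

        short ngv[8], sumv[8];
        _mm_storeu_si128((__m128i*)ngv, ng);
        _mm_storeu_si128((__m128i*)sumv, sum);
        for( int i = 0; i < 8; i++ )
            printf("lane %d: passes=%d accumulated=%d\n", i, ngv[i], sumv[i]);
        return 0;
    }

Note also why the compare must see signed data: _mm_cmpgt_epi16 treats lanes as signed shorts, which is exactly why the patch moves the gradient sums from _mm_adds_epu16 (which can saturate into the 0x8000..0xffff range) to _mm_adds_epi16.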
diff --git a/modules/imgproc/test/test_color.cpp b/modules/imgproc/test/test_color.cpp
index 8c1da99..b88b30e 100644
--- a/modules/imgproc/test/test_color.cpp
+++ b/modules/imgproc/test/test_color.cpp
@@ -1658,6 +1658,7 @@ void CV_ColorBayerTest::prepare_to_validation( int /*test_case_idx*/ )
         CV_Error(CV_StsUnsupportedFormat, "");
 }
 
+/////////////////////////////////////////////////////////////////////////////////////////////////
 
 TEST(Imgproc_ColorGray, accuracy) { CV_ColorGrayTest test; test.safe_run(); }
 
@@ -1669,3 +1670,24 @@ TEST(Imgproc_ColorLab, accuracy) { CV_ColorLabTest test; test.safe_run(); }
 TEST(Imgproc_ColorLuv, accuracy) { CV_ColorLuvTest test; test.safe_run(); }
 TEST(Imgproc_ColorRGB, accuracy) { CV_ColorRGBTest test; test.safe_run(); }
 TEST(Imgproc_ColorBayer, accuracy) { CV_ColorBayerTest test; test.safe_run(); }
+
+
+TEST(Imgproc_ColorBayerVNG, accuracy)
+{
+    cvtest::TS& ts = *cvtest::TS::ptr();
+
+    Mat given = imread(ts.get_data_path() + "/cvtcolor/bayerVNG_input.png", CV_LOAD_IMAGE_GRAYSCALE);
+    Mat gold = imread(ts.get_data_path() + "/cvtcolor/bayerVNG_gold.png", CV_LOAD_IMAGE_UNCHANGED);
+    Mat result;
+
+    cvtColor(given, result, CV_BayerBG2BGR_VNG, 3);
+
+    EXPECT_EQ(gold.type(), result.type());
+    EXPECT_EQ(gold.cols, result.cols);
+    EXPECT_EQ(gold.rows, result.rows);
+
+    Mat diff;
+    absdiff(gold, result, diff);
+
+    EXPECT_EQ(0, countNonZero(diff.reshape(1) > 1));
+}
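
The new regression test deliberately tolerates a one-level difference per channel between the SSE and plain C++ paths: absdiff takes the per-element difference, reshape(1) spreads the three channels into separate columns, the '> 1' comparison marks only errors above one intensity level, and countNonZero must then find nothing. A tiny self-contained illustration of that tolerance idiom follows (the 1x4 matrices are invented; only the absdiff/reshape/countNonZero chain mirrors the test):

    #include <opencv2/core/core.hpp>
    #include <stdio.h>
    using namespace cv;

    int main()
    {
        Mat gold   = (Mat_<uchar>(1, 4) << 10, 20, 30, 40);  // stand-in golden data
        Mat result = (Mat_<uchar>(1, 4) << 10, 21, 33, 40);  // stand-in computed data

        Mat diff;
        absdiff(gold, result, diff);    // per-element |gold - result| = {0, 1, 3, 0}
        Mat bad = diff.reshape(1) > 1;  // 255 only where the error exceeds 1
        printf("pixels off by more than one: %d\n", countNonZero(bad)); // prints 1
        return 0;
    }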