/****************************************************************************************\
                                   Bilateral Filtering
\****************************************************************************************/
-#undef CV_SSE3
-
namespace cv
{
{
int i, j, cn = dest->channels(), k;
Size size = dest->size();
- #if CV_SSE3
+ #if CV_SSE3
int CV_DECL_ALIGNED(16) buf[4];
float CV_DECL_ALIGNED(16) bufSum[4];
static const int CV_DECL_ALIGNED(16) bufSignMask[] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
bool haveSSE3 = checkHardwareSupport(CV_CPU_SSE3);
- #endif
+ #endif
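// Note on the scratch state above: `buf` and `bufSum` are 16-byte aligned so the
// aligned vector stores below (_mm_store_si128 / _mm_store_ps) are legal, and
// `bufSignMask` puts 0x80000000 in every lane so that _mm_andnot_ps(_signMask, x)
// clears each float's sign bit, i.e. computes |x| with no branch.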
for( i = range.start; i < range.end; i++ )
{
{
float sum = 0, wsum = 0;
int val0 = sptr[j];
- k = 0;
- #if CV_SSE3
- if( haveSSE3 )
- {
- __m128 _val0 = _mm_set1_ps(static_cast<float>(val0));
- const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
-
- for( ; k <= maxk - 4; k += 4 )
- {
-                __m128 _valF = _mm_set_ps(sptr[j + space_ofs[k+3]], sptr[j + space_ofs[k+2]],
-                                          sptr[j + space_ofs[k+1]], sptr[j + space_ofs[k]]);
+ k = 0;
+ #if CV_SSE3
+ if( haveSSE3 )
+ {
+ __m128 _val0 = _mm_set1_ps(static_cast<float>(val0));
+ const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
+
+ for( ; k <= maxk - 4; k += 4 )
+ {
+                __m128 _valF = _mm_set_ps(sptr[j + space_ofs[k+3]], sptr[j + space_ofs[k+2]],
+                                          sptr[j + space_ofs[k+1]], sptr[j + space_ofs[k]]);
-
- __m128 _val = _mm_andnot_ps(_signMask, _mm_sub_ps(_valF, _val0));
- _mm_store_si128((__m128i*)buf, _mm_cvtps_epi32(_val));
+
+ __m128 _val = _mm_andnot_ps(_signMask, _mm_sub_ps(_valF, _val0));
+ _mm_store_si128((__m128i*)buf, _mm_cvtps_epi32(_val));
-                __m128 _cw = _mm_set_ps(color_weight[buf[3]],color_weight[buf[2]],
-                                        color_weight[buf[1]],color_weight[buf[0]]);
+                __m128 _cw = _mm_set_ps(color_weight[buf[3]],color_weight[buf[2]],
+                                        color_weight[buf[1]],color_weight[buf[0]]);
- __m128 _sw = _mm_loadu_ps(space_weight+k);
- __m128 _w = _mm_mul_ps(_cw, _sw);
- _cw = _mm_mul_ps(_w, _valF);
-
- _sw = _mm_hadd_ps(_w, _cw);
- _sw = _mm_hadd_ps(_sw, _sw);
- _mm_storel_pi((__m64*)bufSum, _sw);
-
- sum += bufSum[1];
- wsum += bufSum[0];
- }
- }
- #endif
+ __m128 _sw = _mm_loadu_ps(space_weight+k);
+ __m128 _w = _mm_mul_ps(_cw, _sw);
+ _cw = _mm_mul_ps(_w, _valF);
+
+ _sw = _mm_hadd_ps(_w, _cw);
+ _sw = _mm_hadd_ps(_sw, _sw);
+ _mm_storel_pi((__m64*)bufSum, _sw);
+
+ sum += bufSum[1];
+ wsum += bufSum[0];
+ }
+ }
+ #endif
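// The 4-wide loop above is the vector form of the scalar tail that follows; a
// minimal scalar sketch of one tap, using the surrounding names:
//
//     int   val = sptr[j + space_ofs[k]];
//     float w   = space_weight[k] * color_weight[std::abs(val - val0)];
//     sum  += val * w;    // the hadd pair folds the four w*val lanes into bufSum[1]
//     wsum += w;          // ...and the four w lanes into bufSum[0]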
for( ; k < maxk; k++ )
{
int val = sptr[j + space_ofs[k]];
{
float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
int b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2];
- k = 0;
- #if CV_SSE3
- if( haveSSE3 )
- {
- const __m128 _b0 = _mm_set1_ps(static_cast<float>(b0));
- const __m128 _g0 = _mm_set1_ps(static_cast<float>(g0));
- const __m128 _r0 = _mm_set1_ps(static_cast<float>(r0));
- const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
+ k = 0;
+ #if CV_SSE3
+ if( haveSSE3 )
+ {
+ const __m128 _b0 = _mm_set1_ps(static_cast<float>(b0));
+ const __m128 _g0 = _mm_set1_ps(static_cast<float>(g0));
+ const __m128 _r0 = _mm_set1_ps(static_cast<float>(r0));
+ const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
for( ; k <= maxk - 4; k += 4 )
{
- const uchar* sptr_k = sptr + j + space_ofs[k];
- const uchar* sptr_k1 = sptr + j + space_ofs[k+1];
- const uchar* sptr_k2 = sptr + j + space_ofs[k+2];
- const uchar* sptr_k3 = sptr + j + space_ofs[k+3];
+ const uchar* sptr_k = sptr + j + space_ofs[k];
+ const uchar* sptr_k1 = sptr + j + space_ofs[k+1];
+ const uchar* sptr_k2 = sptr + j + space_ofs[k+2];
+ const uchar* sptr_k3 = sptr + j + space_ofs[k+3];
- __m128 _b = _mm_set_ps(sptr_k3[0],sptr_k2[0],sptr_k1[0],sptr_k[0]);
- __m128 _g = _mm_set_ps(sptr_k3[1],sptr_k2[1],sptr_k1[1],sptr_k[1]);
- __m128 _r = _mm_set_ps(sptr_k3[2],sptr_k2[2],sptr_k1[2],sptr_k[2]);
+ __m128 _b = _mm_set_ps(sptr_k3[0],sptr_k2[0],sptr_k1[0],sptr_k[0]);
+ __m128 _g = _mm_set_ps(sptr_k3[1],sptr_k2[1],sptr_k1[1],sptr_k[1]);
+ __m128 _r = _mm_set_ps(sptr_k3[2],sptr_k2[2],sptr_k1[2],sptr_k[2]);
- __m128 bt = _mm_andnot_ps(_signMask, _mm_sub_ps(_b,_b0));
- __m128 gt = _mm_andnot_ps(_signMask, _mm_sub_ps(_g,_g0));
- __m128 rt = _mm_andnot_ps(_signMask, _mm_sub_ps(_r,_r0));
+ __m128 bt = _mm_andnot_ps(_signMask, _mm_sub_ps(_b,_b0));
+ __m128 gt = _mm_andnot_ps(_signMask, _mm_sub_ps(_g,_g0));
+ __m128 rt = _mm_andnot_ps(_signMask, _mm_sub_ps(_r,_r0));
- bt =_mm_add_ps(rt, _mm_add_ps(bt, gt));
- _mm_store_si128((__m128i*)buf, _mm_cvtps_epi32(bt));
+ bt =_mm_add_ps(rt, _mm_add_ps(bt, gt));
+ _mm_store_si128((__m128i*)buf, _mm_cvtps_epi32(bt));
-                    __m128 _w = _mm_set_ps(color_weight[buf[3]],color_weight[buf[2]],
-                                           color_weight[buf[1]],color_weight[buf[0]]);
+                    __m128 _w = _mm_set_ps(color_weight[buf[3]],color_weight[buf[2]],
+                                           color_weight[buf[1]],color_weight[buf[0]]);
- __m128 _sw = _mm_loadu_ps(space_weight+k);
+ __m128 _sw = _mm_loadu_ps(space_weight+k);
- _w = _mm_mul_ps(_w,_sw);
- _b = _mm_mul_ps(_b, _w);
- _g = _mm_mul_ps(_g, _w);
- _r = _mm_mul_ps(_r, _w);
+ _w = _mm_mul_ps(_w,_sw);
+ _b = _mm_mul_ps(_b, _w);
+ _g = _mm_mul_ps(_g, _w);
+ _r = _mm_mul_ps(_r, _w);
- _w = _mm_hadd_ps(_w, _b);
- _g = _mm_hadd_ps(_g, _r);
+ _w = _mm_hadd_ps(_w, _b);
+ _g = _mm_hadd_ps(_g, _r);
- _w = _mm_hadd_ps(_w, _g);
- _mm_store_ps(bufSum, _w);
+ _w = _mm_hadd_ps(_w, _g);
+ _mm_store_ps(bufSum, _w);
- wsum += bufSum[0];
- sum_b += bufSum[1];
- sum_g += bufSum[2];
- sum_r += bufSum[3];
- }
- }
- #endif
+ wsum += bufSum[0];
+ sum_b += bufSum[1];
+ sum_g += bufSum[2];
+ sum_r += bufSum[3];
+ }
+ }
+ #endif
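// Reduction layout in the three-channel loop: _mm_hadd_ps(_w, _b) yields
// [w0+w1, w2+w3, b0+b1, b2+b3], _mm_hadd_ps(_g, _r) does the same for g/r, and
// the final hadd lands the four totals as [wsum, bsum, gsum, rsum] in one
// register, so a single aligned store recovers all four scalar accumulators.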
for( ; k < maxk; k++ )
{
// initialize space-related bilateral filter coefficients
for( i = -radius, maxk = 0; i <= radius; i++ )
- {
- j = -radius;
+ {
+ j = -radius;
for( ;j <= radius; j++ )
{
    double r = std::sqrt((double)i*i + (double)j*j);
    if( r > radius )
        continue;
    space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff);
    space_ofs[maxk++] = (int)(i*temp.step + j*cn);
}
- }
+ }
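// The loop above precomputes the (disc-shaped) spatial kernel once per filter
// call: space_weight[k] caches the Gaussian term exp(r*r*gauss_space_coeff) and
// space_ofs[k] caches the flattened neighbor offset i*temp.step + j*cn, so the
// per-pixel loops reach each kernel tap with a single indexed load.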
BilateralFilter_8u_Invoker body(dst, temp, radius, maxk, space_ofs, space_weight, color_weight);
parallel_for_(Range(0, size.height), body);
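// BilateralFilter_8u_Invoker derives from ParallelLoopBody, so parallel_for_
// splits [0, size.height) into row bands processed on worker threads; each band
// writes a disjoint set of dst rows and only reads the (border-padded) temp
// source image, so no synchronization is needed.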
{
int i, j, k;
Size size = dest->size();
- #if CV_SSE3
+ #if CV_SSE3
int CV_DECL_ALIGNED(16) idxBuf[4];
float CV_DECL_ALIGNED(16) bufSum32[4];
static const int CV_DECL_ALIGNED(16) bufSignMask[] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
bool haveSSE3 = checkHardwareSupport(CV_CPU_SSE3);
- #endif
+ #endif
for( i = range.start; i < range.end; i++ )
{
{
float sum = 0, wsum = 0;
float val0 = sptr[j];
- k = 0;
- #if CV_SSE3
- if( haveSSE3 )
- {
- const __m128 _val0 = _mm_set1_ps(sptr[j]);
- const __m128 _scale_index = _mm_set1_ps(scale_index);
- const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
-
- for( ; k <= maxk - 4 ; k += 4 )
- {
- __m128 _sw = _mm_loadu_ps(space_weight + k);
-                __m128 _val = _mm_set_ps(sptr[j + space_ofs[k+3]], sptr[j + space_ofs[k+2]],
-                                         sptr[j + space_ofs[k+1]], sptr[j + space_ofs[k]]);
+ k = 0;
+ #if CV_SSE3
+ if( haveSSE3 )
+ {
+ const __m128 _val0 = _mm_set1_ps(sptr[j]);
+ const __m128 _scale_index = _mm_set1_ps(scale_index);
+ const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
+
+ for( ; k <= maxk - 4 ; k += 4 )
+ {
+ __m128 _sw = _mm_loadu_ps(space_weight + k);
+                __m128 _val = _mm_set_ps(sptr[j + space_ofs[k+3]], sptr[j + space_ofs[k+2]],
+                                         sptr[j + space_ofs[k+1]], sptr[j + space_ofs[k]]);
- __m128 _alpha = _mm_mul_ps(_mm_andnot_ps( _signMask, _mm_sub_ps(_val,_val0)), _scale_index);
+ __m128 _alpha = _mm_mul_ps(_mm_andnot_ps( _signMask, _mm_sub_ps(_val,_val0)), _scale_index);
- __m128i _idx = _mm_cvtps_epi32(_alpha);
- _mm_store_si128((__m128i*)idxBuf, _idx);
- _alpha = _mm_sub_ps(_alpha, _mm_cvtepi32_ps(_idx));
+ __m128i _idx = _mm_cvtps_epi32(_alpha);
+ _mm_store_si128((__m128i*)idxBuf, _idx);
+ _alpha = _mm_sub_ps(_alpha, _mm_cvtepi32_ps(_idx));
-                __m128 _explut = _mm_set_ps(expLUT[idxBuf[3]], expLUT[idxBuf[2]],
-                                            expLUT[idxBuf[1]], expLUT[idxBuf[0]]);
+                __m128 _explut = _mm_set_ps(expLUT[idxBuf[3]], expLUT[idxBuf[2]],
+                                            expLUT[idxBuf[1]], expLUT[idxBuf[0]]);
-                __m128 _explut1 = _mm_set_ps(expLUT[idxBuf[3]+1], expLUT[idxBuf[2]+1],
-                                             expLUT[idxBuf[1]+1], expLUT[idxBuf[0]+1]);
+                __m128 _explut1 = _mm_set_ps(expLUT[idxBuf[3]+1], expLUT[idxBuf[2]+1],
+                                             expLUT[idxBuf[1]+1], expLUT[idxBuf[0]+1]);
-
- __m128 _w = _mm_mul_ps(_sw, _mm_add_ps(_explut, _mm_mul_ps(_alpha, _mm_sub_ps(_explut1, _explut))));
- _val = _mm_mul_ps(_w, _val);
+
+ __m128 _w = _mm_mul_ps(_sw, _mm_add_ps(_explut, _mm_mul_ps(_alpha, _mm_sub_ps(_explut1, _explut))));
+ _val = _mm_mul_ps(_w, _val);
- _sw = _mm_hadd_ps(_w, _val);
- _sw = _mm_hadd_ps(_sw, _sw);
- _mm_storel_pi((__m64*)bufSum32, _sw);
+ _sw = _mm_hadd_ps(_w, _val);
+ _sw = _mm_hadd_ps(_sw, _sw);
+ _mm_storel_pi((__m64*)bufSum32, _sw);
- sum += bufSum32[1];
- wsum += bufSum32[0];
- }
- }
- #endif
+ sum += bufSum32[1];
+ wsum += bufSum32[0];
+ }
+ }
+ #endif
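// Range-weight lookup in the float path: |val - val0| scaled by scale_index gives
// a fractional LUT position; the integer part indexes expLUT and the remaining
// fraction `alpha` linearly interpolates between adjacent entries, i.e. roughly
//
//     w = space_weight[k] * (expLUT[idx] + alpha * (expLUT[idx+1] - expLUT[idx]));
//
// Note that _mm_cvtps_epi32 rounds to nearest under the default MXCSR mode, so
// `alpha` can be slightly negative and the lerp then extrapolates just below
// expLUT[idx]; a small, bounded deviation from a floor-based scalar lookup.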
for( ; k < maxk; k++ )
{
{
float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
float b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2];
- k = 0;
- #if CV_SSE3
- if( haveSSE3 )
- {
- const __m128 _b0 = _mm_set1_ps(b0);
- const __m128 _g0 = _mm_set1_ps(g0);
- const __m128 _r0 = _mm_set1_ps(r0);
- const __m128 _scale_index = _mm_set1_ps(scale_index);
+ k = 0;
+ #if CV_SSE3
+ if( haveSSE3 )
+ {
+ const __m128 _b0 = _mm_set1_ps(b0);
+ const __m128 _g0 = _mm_set1_ps(g0);
+ const __m128 _r0 = _mm_set1_ps(r0);
+ const __m128 _scale_index = _mm_set1_ps(scale_index);
const __m128 _signMask = _mm_load_ps((const float*)bufSignMask);
-
- for( ; k <= maxk-4; k += 4 )
- {
- __m128 _sw = _mm_loadu_ps(space_weight + k);
-
- const float* sptr_k = sptr + j + space_ofs[k];
- const float* sptr_k1 = sptr + j + space_ofs[k+1];
- const float* sptr_k2 = sptr + j + space_ofs[k+2];
- const float* sptr_k3 = sptr + j + space_ofs[k+3];
-
- __m128 _b = _mm_set_ps(sptr_k3[0], sptr_k2[0], sptr_k1[0], sptr_k[0]);
- __m128 _g = _mm_set_ps(sptr_k3[1], sptr_k2[1], sptr_k1[1], sptr_k[1]);
- __m128 _r = _mm_set_ps(sptr_k3[2], sptr_k2[2], sptr_k1[2], sptr_k[2]);
-
- __m128 _bt = _mm_andnot_ps(_signMask,_mm_sub_ps(_b,_b0));
- __m128 _gt = _mm_andnot_ps(_signMask,_mm_sub_ps(_g,_g0));
- __m128 _rt = _mm_andnot_ps(_signMask,_mm_sub_ps(_r,_r0));
-
- __m128 _alpha = _mm_mul_ps(_scale_index, _mm_add_ps(_rt,_mm_add_ps(_bt, _gt)));
-
- __m128i _idx = _mm_cvtps_epi32(_alpha);
- _mm_store_si128((__m128i*)idxBuf, _idx);
- _alpha = _mm_sub_ps(_alpha, _mm_cvtepi32_ps(_idx));
-
- __m128 _explut = _mm_set_ps(expLUT[idxBuf[3]], expLUT[idxBuf[2]], expLUT[idxBuf[1]], expLUT[idxBuf[0]]);
- __m128 _explut1 = _mm_set_ps(expLUT[idxBuf[3]+1], expLUT[idxBuf[2]+1], expLUT[idxBuf[1]+1], expLUT[idxBuf[0]+1]);
-
- __m128 _w = _mm_mul_ps(_sw, _mm_add_ps(_explut, _mm_mul_ps(_alpha, _mm_sub_ps(_explut1, _explut))));
-
- _b = _mm_mul_ps(_b, _w);
- _g = _mm_mul_ps(_g, _w);
- _r = _mm_mul_ps(_r, _w);
-
- _w = _mm_hadd_ps(_w, _b);
- _g = _mm_hadd_ps(_g, _r);
-
- _w = _mm_hadd_ps(_w, _g);
- _mm_store_ps(bufSum32, _w);
-
- wsum += bufSum32[0];
- sum_b += bufSum32[1];
- sum_g += bufSum32[2];
- sum_r += bufSum32[3];
- }
-
- }
- #endif
-
+
+ for( ; k <= maxk-4; k += 4 )
+ {
+ __m128 _sw = _mm_loadu_ps(space_weight + k);
+
+ const float* sptr_k = sptr + j + space_ofs[k];
+ const float* sptr_k1 = sptr + j + space_ofs[k+1];
+ const float* sptr_k2 = sptr + j + space_ofs[k+2];
+ const float* sptr_k3 = sptr + j + space_ofs[k+3];
+
+ __m128 _b = _mm_set_ps(sptr_k3[0], sptr_k2[0], sptr_k1[0], sptr_k[0]);
+ __m128 _g = _mm_set_ps(sptr_k3[1], sptr_k2[1], sptr_k1[1], sptr_k[1]);
+ __m128 _r = _mm_set_ps(sptr_k3[2], sptr_k2[2], sptr_k1[2], sptr_k[2]);
+
+ __m128 _bt = _mm_andnot_ps(_signMask,_mm_sub_ps(_b,_b0));
+ __m128 _gt = _mm_andnot_ps(_signMask,_mm_sub_ps(_g,_g0));
+ __m128 _rt = _mm_andnot_ps(_signMask,_mm_sub_ps(_r,_r0));
+
+ __m128 _alpha = _mm_mul_ps(_scale_index, _mm_add_ps(_rt,_mm_add_ps(_bt, _gt)));
+
+ __m128i _idx = _mm_cvtps_epi32(_alpha);
+ _mm_store_si128((__m128i*)idxBuf, _idx);
+ _alpha = _mm_sub_ps(_alpha, _mm_cvtepi32_ps(_idx));
+
+ __m128 _explut = _mm_set_ps(expLUT[idxBuf[3]], expLUT[idxBuf[2]], expLUT[idxBuf[1]], expLUT[idxBuf[0]]);
+ __m128 _explut1 = _mm_set_ps(expLUT[idxBuf[3]+1], expLUT[idxBuf[2]+1], expLUT[idxBuf[1]+1], expLUT[idxBuf[0]+1]);
+
+ __m128 _w = _mm_mul_ps(_sw, _mm_add_ps(_explut, _mm_mul_ps(_alpha, _mm_sub_ps(_explut1, _explut))));
+
+ _b = _mm_mul_ps(_b, _w);
+ _g = _mm_mul_ps(_g, _w);
+ _r = _mm_mul_ps(_r, _w);
+
+ _w = _mm_hadd_ps(_w, _b);
+ _g = _mm_hadd_ps(_g, _r);
+
+ _w = _mm_hadd_ps(_w, _g);
+ _mm_store_ps(bufSum32, _w);
+
+ wsum += bufSum32[0];
+ sum_b += bufSum32[1];
+ sum_g += bufSum32[2];
+ sum_r += bufSum32[3];
+ }
+
+ }
+ #endif
+
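// SSE3 has no gather instruction, so the vector loop above fetches the four taps
// through four scalar pointers (sptr_k .. sptr_k3) and packs each channel with
// _mm_set_ps; the b/g/r deinterleave falls out of the same indexed loads
// (offsets 0, 1, 2 within each pixel).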
for(; k < maxk; k++ )
{
const float* sptr_k = sptr + j + space_ofs[k];