SUM = &sum[0];
if( sumCount == 0 )
{
- for( i = 0; i < width; i++ )
- SUM[i] = 0;
+ memset((void*)SUM, 0, width*sizeof(int));
+
for( ; sumCount < ksize - 1; sumCount++, src++ )
{
const int* Sp = (const int*)src[0];
i = 0;
#if CV_SSE2
if(haveSSE2)
{
- for( ; i < width-4; i+=4 )
+ for( ; i <= width-4; i+=4 )
{
__m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
__m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
_mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
}
}
+ #elif CV_NEON
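+ // NEON: accumulate four 32-bit column sums per iteration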
+ for( ; i <= width - 4; i+=4 )
+ vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
#endif
for( ; i < width; i++ )
SUM[i] += Sp[i];
}
}

@@ ... @@ ColumnSum<int, uchar>::operator()
for( ; count--; src++ )
{
const int* Sp = (const int*)src[0];
const int* Sm = (const int*)src[1-ksize];
uchar* D = (uchar*)dst;
if( haveScale )
{
i = 0;
#if CV_SSE2
if(haveSSE2)
{
const __m128 scale4 = _mm_set1_ps((float)_scale);
- for( ; i < width-8; i+=8 )
+ for( ; i <= width-8; i+=8 )
{
__m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
__m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4));
__m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)), _mm_loadu_si128((const __m128i*)(Sp+i)));
__m128i _s01 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)), _mm_loadu_si128((const __m128i*)(Sp+i+4)));
__m128i _s0T = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
__m128i _s0T1 = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s01)));
_s0T = _mm_packs_epi32(_s0T, _s0T1);
_mm_storel_epi64((__m128i*)(D+i), _mm_packus_epi16(_s0T, _s0T));
_mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
_mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1));
}
}
+ #elif CV_NEON
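+ // NEON: add the incoming row, scale and round to uint32, saturate-narrow
+ // 32 -> 16 -> 8 bits into 8 uchars, then subtract the outgoing row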
+ float32x4_t v_scale = vdupq_n_f32((float)_scale);
+ for( ; i <= width-8; i+=8 )
+ {
+ int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
+ int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
+
+ uint32x4_t v_s0d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
+ uint32x4_t v_s01d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));
+
+ uint16x8_t v_dst = vcombine_u16(vqmovn_u32(v_s0d), vqmovn_u32(v_s01d));
+ vst1_u8(D + i, vqmovn_u16(v_dst));
+
+ vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
+ vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
+ }
#endif
for( ; i < width; i++ )
{
int s0 = SUM[i] + Sp[i];
D[i] = saturate_cast<uchar>(s0*_scale);
SUM[i] = s0 - Sm[i];
}
}
else
{
i = 0;
#if CV_SSE2
if(haveSSE2)
{
- for( ; i < width-8; i+=8 )
+ for( ; i <= width-8; i+=8 )
{
__m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
__m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4));
__m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)), _mm_loadu_si128((const __m128i*)(Sp+i)));
__m128i _s01 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)), _mm_loadu_si128((const __m128i*)(Sp+i+4)));
__m128i _s0T = _mm_packs_epi32(_s0, _s01);
_mm_storel_epi64((__m128i*)(D+i), _mm_packus_epi16(_s0T, _s0T));
_mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
_mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1));
}
}
+ #elif CV_NEON
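+ // NEON: unscaled path; vqmovun_s32 saturates the signed sums to unsigned,
+ // clamping negatives to zero, before narrowing to uchar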
+ for( ; i <= width-8; i+=8 )
+ {
+ int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
+ int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
+
+ uint16x8_t v_dst = vcombine_u16(vqmovun_s32(v_s0), vqmovun_s32(v_s01));
+ vst1_u8(D + i, vqmovn_u16(v_dst));
+
+ vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
+ vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
+ }
#endif
for( ; i < width; i++ )
{
int s0 = SUM[i] + Sp[i];
D[i] = saturate_cast<uchar>(s0);
SUM[i] = s0 - Sm[i];
}

@@ ... @@ ColumnSum<int, short>::operator()
const int* Sp = (const int*)src[0];
i = 0;
#if CV_SSE2
if(haveSSE2)
{
- for( ; i < width-4; i+=4 )
+ for( ; i <= width-4; i+=4 )
{
__m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
__m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
_mm_storeu_si128((__m128i*)(SUM+i),_mm_add_epi32(_sum, _sp));
}
}
+ #elif CV_NEON
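+ // NEON: same 4-wide accumulation as in the uchar specialization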
+ for( ; i <= width - 4; i+=4 )
+ vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
#endif
for( ; i < width; i++ )
SUM[i] += Sp[i];
}

@@ ... @@ ColumnSum<int, short>::operator()
short* D = (short*)dst;
if( haveScale )
{
i = 0;
#if CV_SSE2
if(haveSSE2)
{
const __m128 scale4 = _mm_set1_ps((float)_scale);
- for( ; i < width-8; i+=8 )
+ for( ; i <= width-8; i+=8 )
{
__m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
__m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4));
__m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)), _mm_loadu_si128((const __m128i*)(Sp+i)));
__m128i _s01 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)), _mm_loadu_si128((const __m128i*)(Sp+i+4)));
__m128i _s0T = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
__m128i _s0T1 = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s01)));
_mm_storeu_si128((__m128i*)(D+i), _mm_packs_epi32(_s0T, _s0T1));
_mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
_mm_storeu_si128((__m128i*)(SUM+i+4), _mm_sub_epi32(_s01,_sm1));
}
}
+ #elif CV_NEON
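+ // NEON: scale and round the sums, then saturate-narrow to signed 16-bit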
+ float32x4_t v_scale = vdupq_n_f32((float)_scale);
+ for( ; i <= width-8; i+=8 )
+ {
+ int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
+ int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
+
+ int32x4_t v_s0d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
+ int32x4_t v_s01d = cv_vrndq_s32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));
+ vst1q_s16(D + i, vcombine_s16(vqmovn_s32(v_s0d), vqmovn_s32(v_s01d)));
+
+ vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
+ vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
+ }
#endif
for( ; i < width; i++ )
{
int s0 = SUM[i] + Sp[i];
D[i] = saturate_cast<short>(s0*_scale);
SUM[i] = s0 - Sm[i];
}
}
else
{
i = 0;
#if CV_SSE2
if(haveSSE2)
{
- for( ; i < width-8; i+=8 )
+ for( ; i <= width-8; i+=8 )
{
__m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
__m128i _sm1 = _mm_loadu_si128((const __m128i*)(Sm+i+4));
__m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)), _mm_loadu_si128((const __m128i*)(Sp+i)));
__m128i _s01 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i+4)), _mm_loadu_si128((const __m128i*)(Sp+i+4)));
_mm_storeu_si128((__m128i*)(D+i), _mm_packs_epi32(_s0, _s01));
_mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
_mm_storeu_si128((__m128i*)(SUM+i+4),_mm_sub_epi32(_s01,_sm1));
}
}
+ #elif CV_NEON
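+ // NEON: unscaled path; saturate-narrow the sums directly to signed 16-bit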
+ for( ; i <= width-8; i+=8 )
+ {
+ int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
+ int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
+
+ vst1q_s16(D + i, vcombine_s16(vqmovn_s32(v_s0), vqmovn_s32(v_s01)));
+
+ vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
+ vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
+ }
#endif
for( ; i < width; i++ )
{
int s0 = SUM[i] + Sp[i];
D[i] = saturate_cast<short>(s0);
SUM[i] = s0 - Sm[i];
}

@@ ... @@ ColumnSum<int, ushort>::operator()
const int* Sp = (const int*)src[0];
i = 0;
#if CV_SSE2
if(haveSSE2)
{
- for( ; i < width-4; i+=4 )
+ for( ; i <= width-4; i+=4 )
{
__m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
__m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
_mm_storeu_si128((__m128i*)(SUM+i), _mm_add_epi32(_sum, _sp));
}
}
+ #elif CV_NEON
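+ // NEON: 4-wide accumulation, as in the specializations above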
+ for( ; i <= width - 4; i+=4 )
+ vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
#endif
for( ; i < width; i++ )
SUM[i] += Sp[i];
@@ ... @@ ColumnSum<int, ushort>::operator()
ushort* D = (ushort*)dst;
if( haveScale )
{
i = 0;
#if CV_SSE2
if(haveSSE2)
{
const __m128 scale4 = _mm_set1_ps((float)_scale);
const __m128i delta0 = _mm_set1_epi32(0x8000);
const __m128i delta1 = _mm_set1_epi32(0x80008000);
for( ; i < width-4; i+=4 )
{
__m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
__m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)), _mm_loadu_si128((const __m128i*)(Sp+i)));
__m128i _res = _mm_cvtps_epi32(_mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
_res = _mm_sub_epi32(_res, delta0);
_res = _mm_add_epi16(_mm_packs_epi32(_res, _res), delta1);
_mm_storel_epi64((__m128i*)(D+i), _res);
_mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
}
}
+ #elif CV_NEON
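+ // NEON: scale and round to uint32, then saturate-narrow to unsigned 16-bit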
+ float32x4_t v_scale = vdupq_n_f32((float)_scale);
+ for( ; i <= width-8; i+=8 )
+ {
+ int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
+ int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
+
+ uint32x4_t v_s0d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
+ uint32x4_t v_s01d = cv_vrndq_u32_f32(vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));
+ vst1q_u16(D + i, vcombine_u16(vqmovn_u32(v_s0d), vqmovn_u32(v_s01d)));
+
+ vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
+ vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
+ }
#endif
for( ; i < width; i++ )
{
int s0 = SUM[i] + Sp[i];
D[i] = saturate_cast<ushort>(s0*_scale);
SUM[i] = s0 - Sm[i];
}
}
else
{
i = 0;
#if CV_SSE2
if(haveSSE2)
{
const __m128i delta0 = _mm_set1_epi32(0x8000);
const __m128i delta1 = _mm_set1_epi32(0x80008000);
for( ; i < width-4; i+=4 )
{
__m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
__m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)), _mm_loadu_si128((const __m128i*)(Sp+i)));
__m128i _res = _mm_sub_epi32(_s0, delta0);
_res = _mm_add_epi16(_mm_packs_epi32(_res, _res), delta1);
_mm_storel_epi64((__m128i*)(D+i), _res);
_mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
}
}
+ #elif CV_NEON
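+ // NEON: unscaled path; vqmovun_s32 saturates the signed sums to unsigned 16-bit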
+ for( ; i <= width-8; i+=8 )
+ {
+ int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
+ int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
+
+ vst1q_u16(D + i, vcombine_u16(vqmovun_s32(v_s0), vqmovun_s32(v_s01)));
+
+ vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
+ vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
+ }
#endif
for( ; i < width; i++ )
{
int s0 = SUM[i] + Sp[i];
D[i] = saturate_cast<ushort>(s0);
SUM[i] = s0 - Sm[i];
}

@@ ... @@ ColumnSum<int, ushort>
double scale;
int sumCount;
std::vector<int> sum;
};
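+
+// ColumnSum<int, float>: same sliding-window column sum as the specializations
+// above, but the result is converted (and optionally scaled) to float, so no
+// saturating pack to a narrower integer type is needed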
+template<>
+struct ColumnSum<int, float> :
+ public BaseColumnFilter
+{
+ ColumnSum( int _ksize, int _anchor, double _scale ) :
+ BaseColumnFilter()
+ {
+ ksize = _ksize;
+ anchor = _anchor;
+ scale = _scale;
+ sumCount = 0;
+ }
+
+ virtual void reset() { sumCount = 0; }
+
+ virtual void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
+ {
+ int i;
+ int* SUM;
+ bool haveScale = scale != 1;
+ double _scale = scale;
+
+ #if CV_SSE2
+ bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
+ #endif
+
+ if( width != (int)sum.size() )
+ {
+ sum.resize(width);
+ sumCount = 0;
+ }
+
+ SUM = &sum[0];
+ if( sumCount == 0 )
+ {
+ memset((void *)SUM, 0, sizeof(int) * width);
+
+ for( ; sumCount < ksize - 1; sumCount++, src++ )
+ {
+ const int* Sp = (const int*)src[0];
+ i = 0;
+
+ #if CV_SSE2
+ if(haveSSE2)
+ {
+ for( ; i < width-4; i+=4 )
+ {
+ __m128i _sum = _mm_loadu_si128((const __m128i*)(SUM+i));
+ __m128i _sp = _mm_loadu_si128((const __m128i*)(Sp+i));
+ _mm_storeu_si128((__m128i*)(SUM+i), _mm_add_epi32(_sum, _sp));
+ }
+ }
+ #elif CV_NEON
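+ // NEON: 4-wide accumulation of the incoming row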
+ for( ; i <= width - 4; i+=4 )
+ vst1q_s32(SUM + i, vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i)));
+ #endif
+
+ for( ; i < width; i++ )
+ SUM[i] += Sp[i];
+ }
+ }
+ else
+ {
+ CV_Assert( sumCount == ksize-1 );
+ src += ksize-1;
+ }
+
+ for( ; count--; src++ )
+ {
+ const int* Sp = (const int*)src[0];
+ const int* Sm = (const int*)src[1-ksize];
+ float* D = (float*)dst;
+ if( haveScale )
+ {
+ i = 0;
+
+ #if CV_SSE2
+ if(haveSSE2)
+ {
+ const __m128 scale4 = _mm_set1_ps((float)_scale);
+
+ for( ; i < width-4; i+=4)
+ {
+ __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
+ __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
+ _mm_loadu_si128((const __m128i*)(Sp+i)));
+
+ _mm_storeu_ps(D+i, _mm_mul_ps(scale4, _mm_cvtepi32_ps(_s0)));
+ _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
+ }
+ }
+ #elif CV_NEON
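+ // NEON: convert the sums to float and multiply by the normalization scale;
+ // float output needs no saturating pack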
+ float32x4_t v_scale = vdupq_n_f32((float)_scale);
+ for( ; i <= width-8; i+=8 )
+ {
+ int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
+ int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
+
+ vst1q_f32(D + i, vmulq_f32(vcvtq_f32_s32(v_s0), v_scale));
+ vst1q_f32(D + i + 4, vmulq_f32(vcvtq_f32_s32(v_s01), v_scale));
+
+ vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
+ vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
+ }
+ #endif
+
+ for( ; i < width; i++ )
+ {
+ int s0 = SUM[i] + Sp[i];
+ D[i] = (float)(s0*_scale);
+ SUM[i] = s0 - Sm[i];
+ }
+ }
+ else
+ {
+ i = 0;
+
+ #if CV_SSE2
+ if(haveSSE2)
+ {
+ for( ; i < width-4; i+=4)
+ {
+ __m128i _sm = _mm_loadu_si128((const __m128i*)(Sm+i));
+ __m128i _s0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(SUM+i)),
+ _mm_loadu_si128((const __m128i*)(Sp+i)));
+
+ _mm_storeu_ps(D+i, _mm_cvtepi32_ps(_s0));
+ _mm_storeu_si128((__m128i*)(SUM+i), _mm_sub_epi32(_s0,_sm));
+ }
+ }
+ #elif CV_NEON
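+ // NEON: unscaled path; just convert the sums to float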
+ for( ; i <= width-8; i+=8 )
+ {
+ int32x4_t v_s0 = vaddq_s32(vld1q_s32(SUM + i), vld1q_s32(Sp + i));
+ int32x4_t v_s01 = vaddq_s32(vld1q_s32(SUM + i + 4), vld1q_s32(Sp + i + 4));
+
+ vst1q_f32(D + i, vcvtq_f32_s32(v_s0));
+ vst1q_f32(D + i + 4, vcvtq_f32_s32(v_s01));
+
+ vst1q_s32(SUM + i, vsubq_s32(v_s0, vld1q_s32(Sm + i)));
+ vst1q_s32(SUM + i + 4, vsubq_s32(v_s01, vld1q_s32(Sm + i + 4)));
+ }
+ #endif
+
+ for( ; i < width; i++ )
+ {
+ int s0 = SUM[i] + Sp[i];
+ D[i] = (float)(s0);
+ SUM[i] = s0 - Sm[i];
+ }
+ }
+ dst += dststep;
+ }
+ }
+
+ double scale;
+ int sumCount;
+ std::vector<int> sum;
+};
+
#ifdef HAVE_OPENCL
#define DIVUP(total, grain) ((total + grain - 1) / (grain))