template <typename T, typename WT>
struct ResizeAreaFastNoVec
{
- ResizeAreaFastNoVec(int /*_scale_x*/, int /*_scale_y*/,
- int /*_cn*/, int /*_step*//*, const int**/ /*_ofs*/) { }
- int operator() (const T* /*S*/, T* /*D*/, int /*w*/) const { return 0; }
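+ // both constructor signatures are needed: (cn, step) when this struct is
+ // used as the SIMDVecOp of ResizeAreaFastVec, and (scale_x, scale_y, cn,
+ // step) when it is used directly as the VecOp of resizeAreaFast_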
+ ResizeAreaFastNoVec(int, int) { }
+ ResizeAreaFastNoVec(int, int, int, int) { }
+ int operator() (const T*, T*, int) const
+ { return 0; }
};
-template<typename T>
+#if CV_SSE2
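+// SSE2 implementation of the 2x2 area-fast kernel for 8-bit images: averages
+// each pair of input columns and each pair of input rows, and returns the
+// number of output elements produced, leaving the remainder to the scalar
+// loop in ResizeAreaFastVec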
+class ResizeAreaFastVec_SIMD_8u
+{
+public:
+ ResizeAreaFastVec_SIMD_8u(int _cn, int _step) :
+ cn(_cn), step(_step)
+ {
+ use_simd = checkHardwareSupport(CV_CPU_SSE2);
+ }
+
+ int operator() (const uchar* S, uchar* D, int w) const
+ {
+ if (!use_simd)
+ return 0;
+
+ int dx = 0;
+ const uchar* S0 = S;
+ const uchar* S1 = S0 + step;
+ __m128i masklow = _mm_set1_epi16(0x00ff);
+ __m128i zero = _mm_setzero_si128();
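+ // _mm_avg_epu8(a, b) computes (a + b + 1) >> 1, so averaging the two
+ // horizontal averages approximates the exact 2x2 mean
+ // (a + b + c + d + 2) >> 2 that the scalar path computes; because of the
+ // intermediate rounding the result can exceed the exact value by at most 1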
+
+ if (cn == 1)
+ {
+ for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
+ {
+ __m128i s0 = _mm_loadu_si128((const __m128i*)S0);
+ __m128i s1 = _mm_loadu_si128((const __m128i*)S1);
+
+ __m128i s = _mm_avg_epu8(s0, _mm_srli_si128(s0, 1));
+ s = _mm_avg_epu8(s, _mm_avg_epu8(s1, _mm_srli_si128(s1, 1)));
+
+ _mm_storel_epi64((__m128i*)D, _mm_packus_epi16(_mm_and_si128(s, masklow), zero));
+ }
+ }
+ else if (cn == 3)
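+ // cn == 3: each iteration emits 6 valid bytes but the two stores cover
+ // D[0..10]; the scratch bytes are rewritten by the next iteration or the
+ // scalar tail, so stop while D[dx + 10] is still inside the row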
+ for ( ; dx <= w - 11; dx += 6, S0 += 12, S1 += 12, D += 6)
+ {
+ __m128i s0 = _mm_loadu_si128((const __m128i*)S0);
+ __m128i s1 = _mm_loadu_si128((const __m128i*)S1);
+
+ __m128i s = _mm_avg_epu8(s0, _mm_srli_si128(s0, 3));
+ s = _mm_avg_epu8(s, _mm_avg_epu8(s1, _mm_srli_si128(s1, 3)));
+
+ _mm_storel_epi64((__m128i*)D, s);
+ _mm_storel_epi64((__m128i*)(D+3), _mm_srli_si128(s, 6));
+ }
+ else
+ {
+ CV_Assert(cn == 4);
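+ // cn == 4: the two stores cover D[0..11] for 8 valid bytes, so stop
+ // while D[dx + 11] is still inside the row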
+ for ( ; dx <= w - 12; dx += 8, S0 += 16, S1 += 16, D += 8)
+ {
+ __m128i s0 = _mm_loadu_si128((const __m128i*)S0);
+ __m128i s1 = _mm_loadu_si128((const __m128i*)S1);
+
+ __m128i s = _mm_avg_epu8(s0, _mm_srli_si128(s0, 4));
+ s = _mm_avg_epu8(s, _mm_avg_epu8(s1, _mm_srli_si128(s1, 4)));
+
+ _mm_storel_epi64((__m128i*)D, s);
+ _mm_storel_epi64((__m128i*)(D+4), _mm_srli_si128(s, 8));
+ }
+ }
+
+ return dx;
+ }
+
+private:
+ int cn;
+ int step;
+ bool use_simd;
+};
+
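+// same scheme for 16-bit images; note that step is the source row stride in
+// bytes (ResizeAreaFastVec computes nextS via (const uchar*)S + step), so it
+// must be applied through a uchar pointer rather than a ushort one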
+class ResizeAreaFastVec_SIMD_16u
+{
+public:
+ ResizeAreaFastVec_SIMD_16u(int _cn, int _step) :
+ cn(_cn), step(_step)
+ {
+ use_simd = checkHardwareSupport(CV_CPU_SSE2);
+ }
+
+ int operator() (const ushort* S, ushort* D, int w) const
+ {
+ if (!use_simd)
+ return 0;
+
+ int dx = 0;
+ const ushort* S0 = S;
+ const ushort* S1 = (const ushort*)((const uchar*)S0 + step);
+
+ if (cn == 1)
+ {
+ for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
+ {
+ __m128i s0 = _mm_loadu_si128((const __m128i*)S0);
+ __m128i s1 = _mm_loadu_si128((const __m128i*)S1);
+
+ __m128i s = _mm_avg_epu16(s0, _mm_srli_si128(s0, 2));
+ s = _mm_avg_epu16(s, _mm_avg_epu16(s1, _mm_srli_si128(s1, 2)));
+
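+ // SSE2 has no unsigned 32->16 packing and _mm_packs_epi32 would clip
+ // values above 0x7fff, so gather the valid results from the even 16-bit
+ // lanes with shuffles instead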
+ s = _mm_shufflelo_epi16(s, _MM_SHUFFLE(2, 0, 2, 0));
+ s = _mm_shufflehi_epi16(s, _MM_SHUFFLE(2, 0, 2, 0));
+ s = _mm_shuffle_epi32(s, _MM_SHUFFLE(3, 3, 2, 0));
+ _mm_storel_epi64((__m128i*)D, s);
+ }
+ }
+ else if (cn == 3)
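+ // cn == 3: the store writes 4 ushorts for 3 valid outputs; the bound
+ // dx < w - 3 already keeps the fourth (scratch) ushort inside the row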
+ for ( ; dx < w - 3; dx += 3, S0 += 6, S1 += 6, D += 3)
+ {
+ __m128i s0 = _mm_loadu_si128((const __m128i*)S0);
+ __m128i s1 = _mm_loadu_si128((const __m128i*)S1);
+
+ __m128i s = _mm_avg_epu16(s0, _mm_srli_si128(s0, 6));
+ s = _mm_avg_epu16(s, _mm_avg_epu16(s1, _mm_srli_si128(s1, 6)));
+
+ _mm_storel_epi64((__m128i*)D, s);
+ }
+ else
+ {
+ CV_Assert(cn == 4);
+ for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
+ {
+ __m128i s0 = _mm_loadu_si128((const __m128i*)S0);
+ __m128i s1 = _mm_loadu_si128((const __m128i*)S1);
+
+ __m128i s = _mm_avg_epu16(s0, _mm_srli_si128(s0, 8));
+ s = _mm_avg_epu16(s, _mm_avg_epu16(s1, _mm_srli_si128(s1, 8)));
+
+ _mm_storel_epi64((__m128i*)(D), s);
+ }
+ }
+
+ return dx;
+ }
+
+private:
+ int cn;
+ int step;
+ bool use_simd;
+};
+
+#else
+typedef ResizeAreaFastNoVec<uchar, uchar> ResizeAreaFastVec_SIMD_8u;
+typedef ResizeAreaFastNoVec<ushort, ushort> ResizeAreaFastVec_SIMD_16u;
+#endif
+
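+// Couples a SIMD helper with the generic scalar kernel: vecOp processes the
+// leading part of each output row and returns the index where it stopped;
+// the scalar loop below then finishes elements [dx, w). With
+// ResizeAreaFastNoVec (which returns 0) the scalar loop handles the whole row.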
+template<typename T, typename SIMDVecOp>
struct ResizeAreaFastVec
{
- ResizeAreaFastVec(int _scale_x, int _scale_y, int _cn, int _step/*, const int* _ofs*/) :
- scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step)/*, ofs(_ofs)*/
+ ResizeAreaFastVec(int _scale_x, int _scale_y, int _cn, int _step) :
+ scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step), vecOp(_cn, _step)
{
fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4);
}
return 0;
const T* nextS = (const T*)((const uchar*)S + step);
- int dx = 0;
+ int dx = vecOp(S, D, w);
if (cn == 1)
for( ; dx < w; ++dx )
}
else
{
- assert(cn == 4);
+ CV_Assert(cn == 4);
for( ; dx < w; dx += 4 )
{
int index = dx*2;
int cn;
bool fast_mode;
int step;
+ SIMDVecOp vecOp;
};
template <typename T, typename WT, typename VecOp>
static ResizeAreaFastFunc areafast_tab[] =
{
- resizeAreaFast_<uchar, int, ResizeAreaFastVec<uchar> >,
+ resizeAreaFast_<uchar, int, ResizeAreaFastVec<uchar, ResizeAreaFastVec_SIMD_8u> >,
0,
- resizeAreaFast_<ushort, float, ResizeAreaFastVec<ushort> >,
- resizeAreaFast_<short, float, ResizeAreaFastVec<short> >,
+ resizeAreaFast_<ushort, float, ResizeAreaFastVec<ushort, ResizeAreaFastVec_SIMD_16u> >,
+ resizeAreaFast_<short, float, ResizeAreaFastVec<short, ResizeAreaFastNoVec<short, float> > >,
0,
resizeAreaFast_<float, float, ResizeAreaFastNoVec<float, float> >,
resizeAreaFast_<double, double, ResizeAreaFastNoVec<double, double> >,
// when scale_x and scale_y both equal 2, INTER_LINEAR produces the same
// result as the fast INTER_AREA kernel, so take the faster path
if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 )
- {
interpolation = INTER_AREA;
- }
// true "area" interpolation is only implemented for the case (scale_x <= 1 && scale_y <= 1).
// In other cases it is emulated using some variant of bilinear interpolation