Merge pull request #12876 from savuor:color_rgb2rgb_wide
author Rostislav Vasilikhin <savuor@gmail.com>
Tue, 30 Oct 2018 15:36:23 +0000 (18:36 +0300)
committer Alexander Alekhin <alexander.a.alekhin@gmail.com>
Tue, 30 Oct 2018 15:36:23 +0000 (18:36 +0300)
* RGB2RGB initially rewritten

* NEON impl removed

* templated version added for ushort, float

* data copying allowed for RGB2RGB

* inplace processing fixed

* fields to local vars

* no zeroupper until it's fixed

* vx_cleanup() added back

modules/imgproc/src/color_rgb.cpp

index 91a6c34..b027e31 100644 (file)
@@ -12,183 +12,118 @@ namespace cv
 
 ////////////////// Various 3/4-channel to 3/4-channel RGB transformations /////////////////
 
-template<typename _Tp> struct RGB2RGB
-{
-    typedef _Tp channel_type;
+template<typename _Tp> struct v_type; // declared only: each supported element type gets a specialization with a nested typedef t
 
-    RGB2RGB(int _srccn, int _dstcn, int _blueIdx) : srccn(_srccn), dstcn(_dstcn), blueIdx(_blueIdx) {}
-    void operator()(const _Tp* src, _Tp* dst, int n) const
+template<> // v_type specialization for uchar elements
+struct v_type<uchar>{
+    typedef v_uint8 t; // vector type paired with uchar
+};
+
+template<> // v_type specialization for ushort elements
+struct v_type<ushort>{
+    typedef v_uint16 t; // vector type paired with ushort
+};
+
+template<> // v_type specialization for float elements
+struct v_type<float>{
+    typedef v_float32 t; // vector type paired with float
+};
+
+template<typename _Tp> struct v_set; // declared only: each supported element type gets a specialization with a static set()
+
+template<> // v_set specialization for uchar elements
+struct v_set<uchar>
+{
+    static inline v_type<uchar>::t set(uchar x) // builds a v_type<uchar>::t (v_uint8) from the scalar x
     {
-        int scn = srccn, dcn = dstcn, bidx = blueIdx;
-        if( dcn == 3 )
-        {
-            n *= 3;
-            for( int i = 0; i < n; i += 3, src += scn )
-            {
-                _Tp t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
-                dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2;
-            }
-        }
-        else if( scn == 3 )
-        {
-            n *= 3;
-            _Tp alpha = ColorChannel<_Tp>::max();
-            for( int i = 0; i < n; i += 3, dst += 4 )
-            {
-                _Tp t0 = src[i], t1 = src[i+1], t2 = src[i+2];
-                dst[bidx] = t0; dst[1] = t1; dst[bidx^2] = t2; dst[3] = alpha;
-            }
-        }
-        else
-        {
-            n *= 4;
-            for( int i = 0; i < n; i += 4 )
-            {
-                _Tp t0 = src[i], t1 = src[i+1], t2 = src[i+2], t3 = src[i+3];
-                dst[i+bidx] = t0; dst[i+1] = t1; dst[i+(bidx^2)] = t2; dst[i+3] = t3;
-            }
-        }
+        return vx_setall_u8(x); // presumably fills every lane with x -- TODO confirm against the intrinsics header
     }
+};
 
-    int srccn, dstcn, blueIdx;
+template<> // v_set specialization for ushort elements
+struct v_set<ushort>
+{
+    static inline v_type<ushort>::t set(ushort x) // builds a v_type<ushort>::t (v_uint16) from the scalar x
+    {
+        return vx_setall_u16(x); // presumably fills every lane with x -- TODO confirm against the intrinsics header
+    }
 };
 
-#if CV_NEON
+template<> // v_set specialization for float elements
+struct v_set<float>
+{
+    static inline v_type<float>::t set(float x) // builds a v_type<float>::t (v_float32) from the scalar x
+    {
+        return vx_setall_f32(x); // presumably fills every lane with x -- TODO confirm against the intrinsics header
+    }
+};
 
-template<> struct RGB2RGB<uchar>
+template<typename _Tp> // functor copying 3- or 4-channel pixels to 3- or 4-channel pixels, optionally exchanging channels 0 and 2
+struct RGB2RGB
 {
-    typedef uchar channel_type;
+    typedef _Tp channel_type; // element type of one channel
+    typedef typename v_type<_Tp>::t vt; // vector type matching _Tp, used by the CV_SIMD loop
 
     RGB2RGB(int _srccn, int _dstcn, int _blueIdx) :
         srccn(_srccn), dstcn(_dstcn), blueIdx(_blueIdx)
     {
-        v_alpha = vdupq_n_u8(ColorChannel<uchar>::max());
-        v_alpha2 = vget_low_u8(v_alpha);
+        CV_Assert(srccn == 3 || srccn == 4); // only 3- or 4-channel sources are accepted
+        CV_Assert(dstcn == 3 || dstcn == 4); // only 3- or 4-channel destinations are accepted
     }
 
-    void operator()(const uchar * src, uchar * dst, int n) const
+    void operator()(const _Tp* src, _Tp* dst, int n) const // converts n pixels: src advances scn elements per pixel, dst advances dcn
     {
-        int scn = srccn, dcn = dstcn, bidx = blueIdx, i = 0;
-        if (dcn == 3)
+        int scn = srccn, dcn = dstcn, bi = blueIdx; // fields copied into locals
+        int i = 0; // pixel index shared by the vector loop and the scalar tail
+        _Tp alphav = ColorChannel<_Tp>::max(); // value written to channel 3 when the source has only 3 channels
+
+#if CV_SIMD
+        const int vsize = vt::nlanes; // each vector iteration handles vsize pixels
+
+        for(; i < n-vsize+1; // runs while at least vsize pixels remain
+            i += vsize, src += vsize*scn, dst += vsize*dcn)
         {
-            n *= 3;
-            if (scn == 3)
+            vt a, b, c, d; // one vector per channel
+            if(scn == 4)
             {
-                for ( ; i <= n - 48; i += 48, src += 48 )
-                {
-                    uint8x16x3_t v_src = vld3q_u8(src), v_dst;
-                    v_dst.val[0] = v_src.val[bidx];
-                    v_dst.val[1] = v_src.val[1];
-                    v_dst.val[2] = v_src.val[bidx ^ 2];
-                    vst3q_u8(dst + i, v_dst);
-                }
-                for ( ; i <= n - 24; i += 24, src += 24 )
-                {
-                    uint8x8x3_t v_src = vld3_u8(src), v_dst;
-                    v_dst.val[0] = v_src.val[bidx];
-                    v_dst.val[1] = v_src.val[1];
-                    v_dst.val[2] = v_src.val[bidx ^ 2];
-                    vst3_u8(dst + i, v_dst);
-                }
-                for ( ; i < n; i += 3, src += 3 )
-                {
-                    uchar t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
-                    dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2;
-                }
+                v_load_deinterleave(src, a, b, c, d); // 4-channel source: the fourth channel is loaded into d
             }
             else
             {
-                for ( ; i <= n - 48; i += 48, src += 64 )
-                {
-                    uint8x16x4_t v_src = vld4q_u8(src);
-                    uint8x16x3_t v_dst;
-                    v_dst.val[0] = v_src.val[bidx];
-                    v_dst.val[1] = v_src.val[1];
-                    v_dst.val[2] = v_src.val[bidx ^ 2];
-                    vst3q_u8(dst + i, v_dst);
-                }
-                for ( ; i <= n - 24; i += 24, src += 32 )
-                {
-                    uint8x8x4_t v_src = vld4_u8(src);
-                    uint8x8x3_t v_dst;
-                    v_dst.val[0] = v_src.val[bidx];
-                    v_dst.val[1] = v_src.val[1];
-                    v_dst.val[2] = v_src.val[bidx ^ 2];
-                    vst3_u8(dst + i, v_dst);
-                }
-                for ( ; i < n; i += 3, src += 4 )
-                {
-                    uchar t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
-                    dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2;
-                }
-            }
-        }
-        else if (scn == 3)
-        {
-            n *= 3;
-            for ( ; i <= n - 48; i += 48, dst += 64 )
-            {
-                uint8x16x3_t v_src = vld3q_u8(src + i);
-                uint8x16x4_t v_dst;
-                v_dst.val[bidx] = v_src.val[0];
-                v_dst.val[1] = v_src.val[1];
-                v_dst.val[bidx ^ 2] = v_src.val[2];
-                v_dst.val[3] = v_alpha;
-                vst4q_u8(dst, v_dst);
+                v_load_deinterleave(src, a, b, c); // 3-channel source (scn is 3 here given the constructor assert)
+                d = v_set<_Tp>::set(alphav); // no fourth source channel, so d is built from alphav
             }
-            for ( ; i <= n - 24; i += 24, dst += 32 )
+            if(bi == 2) // blueIdx of 2 requests exchanging the first and third channels
+                swap(a, c);
+
+            if(dcn == 4)
             {
-                uint8x8x3_t v_src = vld3_u8(src + i);
-                uint8x8x4_t v_dst;
-                v_dst.val[bidx] = v_src.val[0];
-                v_dst.val[1] = v_src.val[1];
-                v_dst.val[bidx ^ 2] = v_src.val[2];
-                v_dst.val[3] = v_alpha2;
-                vst4_u8(dst, v_dst);
+                v_store_interleave(dst, a, b, c, d); // 4-channel destination: d is stored as the fourth channel
             }
-            uchar alpha = ColorChannel<uchar>::max();
-            for (; i < n; i += 3, dst += 4 )
+            else
             {
-                uchar t0 = src[i], t1 = src[i+1], t2 = src[i+2];
-                dst[bidx] = t0; dst[1] = t1; dst[bidx^2] = t2; dst[3] = alpha;
+                v_store_interleave(dst, a, b, c); // 3-channel destination: d is not stored
             }
         }
-        else
+        vx_cleanup(); // NOTE(review): called once after the vector loop; its effect is not visible in this diff
+#endif
+        for ( ; i < n; i++, src += scn, dst += dcn ) // scalar tail: pixels not handled above (all of them without CV_SIMD)
         {
-            n *= 4;
-            for ( ; i <= n - 64; i += 64 )
-            {
-                uint8x16x4_t v_src = vld4q_u8(src + i), v_dst;
-                v_dst.val[0] = v_src.val[bidx];
-                v_dst.val[1] = v_src.val[1];
-                v_dst.val[2] = v_src.val[bidx^2];
-                v_dst.val[3] = v_src.val[3];
-                vst4q_u8(dst + i, v_dst);
-            }
-            for ( ; i <= n - 32; i += 32 )
+            _Tp t0 = src[0], t1 = src[1], t2 = src[2]; // all three inputs are read before any store to dst
+            dst[bi  ] = t0; // src channel 0 goes to index bi
+            dst[1]         = t1; // middle channel keeps its position
+            dst[bi^2] = t2; // src channel 2 goes to index bi^2
+            if(dcn == 4)
             {
-                uint8x8x4_t v_src = vld4_u8(src + i), v_dst;
-                v_dst.val[0] = v_src.val[bidx];
-                v_dst.val[1] = v_src.val[1];
-                v_dst.val[2] = v_src.val[bidx^2];
-                v_dst.val[3] = v_src.val[3];
-                vst4_u8(dst + i, v_dst);
-            }
-            for ( ; i < n; i += 4)
-            {
-                uchar t0 = src[i], t1 = src[i+1], t2 = src[i+2], t3 = src[i+3];
-                dst[i+bidx] = t0; dst[i+1] = t1; dst[i+(bidx^2)] = t2; dst[i+3] = t3;
+                _Tp d = scn == 4 ? src[3] : alphav; // copy the source's fourth channel if present, otherwise use alphav
+                dst[3] = d;
             }
         }
     }
 
     int srccn, dstcn, blueIdx;
-
-    uint8x16_t v_alpha;
-    uint8x8_t v_alpha2;
 };
 
-#endif
 
 /////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB //////////